diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
index 5105b39855e..b0d27b026cf 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
@@ -181,6 +181,8 @@ def get_accuracy(model_name_or_path, tokenizer=None, eval_tasks="mmlu", limit=No
     parser.add_argument("--iters", default=200, type=int, help="iters for autoround.")
     parser.add_argument("--seqlen", default=2048, type=int, help="sequence length for autoround.")
     parser.add_argument("--nsamples", default=128, type=int, help="number of samples for autoround.")
+    parser.add_argument("--batch_size", default=8, type=int, help="batch size for autoround.")
+    parser.add_argument("--gradient_accumulate_steps", default=1, type=int, help="number of gradient accumulation steps for autoround.")
     parser.add_argument("--save", action="store_true", help="whether to save the quantized model")
     parser.add_argument("--export_path", type=str, default="saved_results", help="path to save the quantized model")
     parser.add_argument("--export_format", type=str, default="auto_round", help="format to save the quantized model")
@@ -267,6 +269,7 @@ def load_recipe_results(file_path):
     # preprocess
     if isinstance(args.target_bits, list) and len(args.target_bits) == 1:
         args.target_bits = args.target_bits[0]
+    batch_size_to_accumulate = args.batch_size // args.gradient_accumulate_steps
     config = AutoRoundConfig(
         tokenizer=tokenizer,
         iters=args.iters,
@@ -284,6 +287,8 @@ def load_recipe_results(file_path):
         output_dir=args.export_path,
         device_map=args.device_map,
         layer_config=layer_config if (args.use_recipe or args.quant_lm_head) else None,
+        gradient_accumulate_steps=args.gradient_accumulate_steps,
+        batch_size=batch_size_to_accumulate,
     )
     if isinstance(args.target_bits, list) and len(args.target_bits) > 1:
         args.tune_tasks = args.tasks if args.tune_tasks is None else args.tune_tasks
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
index e064465aee0..7ebf4ad2dad 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
@@ -5,6 +5,7 @@
 # Parse command line arguments
 KV_CACHE_DTYPE="auto"
 STATIC_ATTENTION_DTYPE="auto"
+GRADIENT_ACCUMULATE_STEPS=1
 while [[ $# -gt 0 ]]; do
     case $1 in
         --topology=*)
@@ -23,6 +24,10 @@ while [[ $# -gt 0 ]]; do
             OUTPUT_MODEL="${1#*=}"
             shift
             ;;
+        --gradient_accumulate_steps=*)
+            GRADIENT_ACCUMULATE_STEPS="${1#*=}"
+            shift
+            ;;
         --static_kv_dtype=*)
             KV_CACHE_DTYPE="${1#*=}"
             shift
             ;;
@@ -133,7 +138,7 @@ case "$TOPOLOGY" in
         ;;
     "mxfp4_mixed")
         echo "Running Llama 3.3 70B MXFP4 (Mixed with MXFP8) quantization..."
-        CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --target_bits 5.8 --options \"MXFP4\" \"MXFP8\" --shared_layers \"k_proj\" \"v_proj\" \"q_proj\" --shared_layers \"gate_proj\" \"up_proj\" --export_path \"$OUTPUT_MODEL\""
+        CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --target_bits 5.8 --options \"MXFP4\" \"MXFP8\" --shared_layers \"k_proj\" \"v_proj\" \"q_proj\" --shared_layers \"gate_proj\" \"up_proj\" --gradient_accumulate_steps \"$GRADIENT_ACCUMULATE_STEPS\" --export_path \"$OUTPUT_MODEL\""
         echo "Executing command: $CMD"
         python quantize.py \
             --model_name_or_path "$INPUT_MODEL" \
@@ -142,6 +147,7 @@ case "$TOPOLOGY" in
             --options "MXFP4" "MXFP8" \
             --shared_layers "k_proj" "v_proj" "q_proj" \
             --shared_layers "gate_proj" "up_proj" \
+            --gradient_accumulate_steps "$GRADIENT_ACCUMULATE_STEPS" \
             --export_path "$OUTPUT_MODEL"
         ;;
     *)