@@ -181,6 +181,8 @@ def get_accuracy(model_name_or_path, tokenizer=None, eval_tasks="mmlu", limit=No
parser.add_argument("--iters", default=200, type=int, help="iters for autoround.")
parser.add_argument("--seqlen", default=2048, type=int, help="sequence length for autoround.")
parser.add_argument("--nsamples", default=128, type=int, help="number of samples for autoround.")
parser.add_argument("--batch_size", default=8, type=int, help="batch size for autoround.")
parser.add_argument("--gradient_accumulate_steps", default=1, type=int, help="number of gradient accumulation steps for autoround.")
parser.add_argument("--save", action="store_true", help="whether to save the quantized model")
parser.add_argument("--export_path", type=str, default="saved_results", help="path to save the quantized model")
parser.add_argument("--export_format", type=str, default="auto_round", help="format to save the quantized model")
@@ -267,6 +269,7 @@ def load_recipe_results(file_path):
# preprocess
if isinstance(args.target_bits, list) and len(args.target_bits) == 1:
    args.target_bits = args.target_bits[0]
+batch_size_to_accumulate = args.batch_size // args.gradient_accumulate_steps
config = AutoRoundConfig(
    tokenizer=tokenizer,
    iters=args.iters,
@@ -284,6 +287,8 @@ def load_recipe_results(file_path):
    output_dir=args.export_path,
    device_map=args.device_map,
    layer_config=layer_config if (args.use_recipe or args.quant_lm_head) else None,
+    gradient_accumulate_steps=args.gradient_accumulate_steps,
+    batch_size=batch_size_to_accumulate,
)
if isinstance(args.target_bits, list) and len(args.target_bits) > 1:
    args.tune_tasks = args.tasks if args.tune_tasks is None else args.tune_tasks
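
One detail worth noting about the preprocessing step above: --batch_size is treated as the effective batch, and // truncates, so values that are not a multiple of gradient_accumulate_steps silently shrink the effective batch. A small sketch of the arithmetic, with a divisibility guard one might add (the guard is a suggestion, not part of the PR):

batch_size = 8                 # --batch_size (effective batch)
gradient_accumulate_steps = 4  # --gradient_accumulate_steps

# Suggested guard (not in the PR): 8 samples over 3 steps would truncate
# to 2 per step, i.e. an effective batch of 6 instead of 8.
assert batch_size % gradient_accumulate_steps == 0, \
    "batch_size should be divisible by gradient_accumulate_steps"

batch_size_to_accumulate = batch_size // gradient_accumulate_steps       # 2 per step
effective_batch = batch_size_to_accumulate * gradient_accumulate_steps   # 8
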
@@ -5,6 +5,7 @@
# Parse command line arguments
KV_CACHE_DTYPE="auto"
STATIC_ATTENTION_DTYPE="auto"
+GRADIENT_ACCUMULATE_STEPS=1
while [[ $# -gt 0 ]]; do
    case $1 in
        --topology=*)
@@ -23,6 +24,10 @@ while [[ $# -gt 0 ]]; do
            OUTPUT_MODEL="${1#*=}"
            shift
            ;;
+        --gradient_accumulate_steps=*)
+            GRADIENT_ACCUMULATE_STEPS="${1#*=}"
+            shift
+            ;;
        --static_kv_dtype=*)
            KV_CACHE_DTYPE="${1#*=}"
            shift
@@ -133,7 +138,7 @@ case "$TOPOLOGY" in
        ;;
    "mxfp4_mixed")
        echo "Running Llama 3.3 70B MXFP4 (Mixed with MXFP8) quantization..."
-        CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --target_bits 5.8 --options \"MXFP4\" \"MXFP8\" --shared_layers \"k_proj\" \"v_proj\" \"q_proj\" --shared_layers \"gate_proj\" \"up_proj\" --export_path \"$OUTPUT_MODEL\""
+        CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --target_bits 5.8 --options \"MXFP4\" \"MXFP8\" --shared_layers \"k_proj\" \"v_proj\" \"q_proj\" --shared_layers \"gate_proj\" \"up_proj\" --gradient_accumulate_steps \"$GRADIENT_ACCUMULATE_STEPS\" --export_path \"$OUTPUT_MODEL\""
        echo "Executing command: $CMD"
        python quantize.py \
            --model_name_or_path "$INPUT_MODEL" \
@@ -142,6 +147,7 @@ case "$TOPOLOGY" in
--options "MXFP4" "MXFP8" \
--shared_layers "k_proj" "v_proj" "q_proj" \
--shared_layers "gate_proj" "up_proj" \
--gradient_accumulate_steps "$GRADIENT_ACCUMULATE_STEPS" \
--export_path "$OUTPUT_MODEL"
;;
*)
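
With the option wired through, the wrapper forwards --gradient_accumulate_steps to quantize.py for the mxfp4_mixed topology. A hypothetical invocation via Python, as a sketch only: the wrapper's filename and its input/output-model flags are not visible in this diff, so run_quant.sh and the omitted flags are assumptions.

import subprocess

# Assumed script name; only flags actually visible in the diff are passed.
# The script's (required) input/output model flags are omitted here.
subprocess.run(
    [
        "bash", "run_quant.sh",
        "--topology=mxfp4_mixed",
        "--gradient_accumulate_steps=4",  # new flag; defaults to 1 when omitted
    ],
    check=True,
)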