diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
index 0ce8e608ffc..3997ada478f 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
@@ -88,6 +88,27 @@ OUTPUT_DIR="${MODEL_NAME}-tp${TP_SIZE}-eval"
 mkdir -p ${OUTPUT_DIR}
 
 
+VLLM_EXTRA_ARGS=""
+VLLM_SERVE_EXTRA_ARGS=""
+###############################################################################
+# For deepseek-v4, force the KV cache dtype to fp8
+###############################################################################
+if [[ "${MODEL_NAME,,}" == *"deepseek-v4"* ]]; then
+    KV_CACHE_DTYPE="fp8"
+    echo "Detected DeepSeek-V4 model, forcing KV cache dtype to FP8"
+    # For scheme mxfp4, enable the QDQ plugin and force the Marlin MoE MXFP4 path
+    if [[ "$SCHEME" == "mxfp4" ]]; then
+        echo "Forcing MXFP4 quantization for DeepSeek-V4 model"
+        export VLLM_MARLIN_MOE_QDQ_MODE=FORCE_MXFP4
+        export VLLM_QDQ=1
+        VLLM_EXTRA_ARGS="tokenizer_mode=deepseek_v4"
+        VLLM_SERVE_EXTRA_ARGS="--tokenizer-mode deepseek_v4"
+        echo "Set VLLM_MARLIN_MOE_QDQ_MODE=FORCE_MXFP4 and VLLM_QDQ=1 for MXFP4 quantization on DeepSeek-V4"
+    fi
+fi
+###############################################################################
+
+
 SERVER_PORT=8000
 max_length=8192
 max_gen_toks=2048
@@ -144,6 +165,8 @@ elif [[ "$SCHEME" == "mxfp8" ]]; then
 elif [[ "$SCHEME" == "fp8" ]]; then
     echo "Run original model."
     VLLM_USE_DEEP_GEMM=0
+elif [[ "$SCHEME" == "w4a8" ]]; then
+    echo "Run DeepSeek-V4 W4A8 model."
 else
-    echo "Error: Invalid quantization scheme (-s). Must be 'mxfp4', 'nvfp4' or 'mxfp8'."
+    echo "Error: Invalid quantization scheme (-s). Must be 'mxfp4', 'nvfp4', 'mxfp8', 'fp8' or 'w4a8'."
     usage
@@ -193,7 +216,7 @@ export VLLM_ENABLE_V1_MULTIPROCESSING=0
 # Function to run standard lm-eval tasks
 run_standard_eval() {
     lm_eval --model vllm \
-        --model_args "pretrained=${MODEL_PATH},tensor_parallel_size=${TP_SIZE},max_model_len=8192,max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=2048,enable_prefix_caching=False,kv_cache_dtype=${KV_CACHE_DTYPE}" \
+        --model_args "pretrained=${MODEL_PATH},tensor_parallel_size=${TP_SIZE},max_model_len=8192,max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=2048,enable_prefix_caching=False,kv_cache_dtype=${KV_CACHE_DTYPE}${VLLM_EXTRA_ARGS:+,${VLLM_EXTRA_ARGS}}" \
         --tasks $TASK_NAME \
         --batch_size $BATCH_SIZE \
         --log_samples \
@@ -213,6 +236,7 @@ start_vllm_server() {
         --dtype bfloat16 \
         --kv-cache-dtype ${KV_CACHE_DTYPE} \
         --disable-log-requests \
+        ${VLLM_SERVE_EXTRA_ARGS} \
         > ${OUTPUT_DIR}/vllm_server.log 2>&1 &
 
     VLLM_PID=$!
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/setup.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/setup.sh
index 0486f99bfdd..cf8a4e3215d 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/setup.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/setup.sh
@@ -9,4 +9,7 @@ VLLM_USE_PRECOMPILED=1 uv pip install --prerelease=allow . -v
 cd ..
 # Uninstall flash_attn to avoid conflicts
 pip uninstall flash_attn -y
-pip install lm_eval["ruler"]
\ No newline at end of file
+pip install "lm_eval[ruler]"
+# Install the vLLM QDQ plugin
+git clone https://github.com/yiliu30/vllm-qdq-plugin.git
+uv pip install vllm-qdq-plugin/ -v
\ No newline at end of file
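
Note on the shell mechanics (not part of the patch): a minimal, runnable bash sketch of the two expansions the run_evaluation.sh changes depend on; MODEL_NAME and the argument strings below are hypothetical stand-ins for the values the script computes.

#!/usr/bin/env bash

# ${MODEL_NAME,,} lowercases the value (bash 4+), so the DeepSeek-V4 branch
# matches however the checkpoint directory happens to be capitalized.
MODEL_NAME="DeepSeek-V4-Base"            # hypothetical model name
if [[ "${MODEL_NAME,,}" == *"deepseek-v4"* ]]; then
    echo "matched: ${MODEL_NAME,,}"      # -> matched: deepseek-v4-base
fi

# ${VAR:+,${VAR}} expands to ",<value>" only when VAR is set and non-empty,
# so the lm_eval model_args string picks up no stray comma when there is
# nothing extra to append.
VLLM_EXTRA_ARGS="tokenizer_mode=deepseek_v4"
echo "kv_cache_dtype=fp8${VLLM_EXTRA_ARGS:+,${VLLM_EXTRA_ARGS}}"
# -> kv_cache_dtype=fp8,tokenizer_mode=deepseek_v4
VLLM_EXTRA_ARGS=""
echo "kv_cache_dtype=fp8${VLLM_EXTRA_ARGS:+,${VLLM_EXTRA_ARGS}}"
# -> kv_cache_dtype=fp8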