37 commits
9dc8fd6
llama3 add export_format
chensuyue Apr 3, 2026
e93512d
update llama3 example
chensuyue Apr 16, 2026
2c77d52
support qwen export llmc model
chensuyue Apr 17, 2026
73588d1
support ds export llmc format
chensuyue Apr 17, 2026
a8f06b5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 17, 2026
7b3cbb5
update requirements
chensuyue Apr 17, 2026
5f81c87
Merge branch 'suyue/model' of https://github.com/intel/neural-compres…
chensuyue Apr 17, 2026
1c2a1be
bug fix
chensuyue Apr 17, 2026
1056ee2
add run prompt.py
chensuyue Apr 20, 2026
9df639e
Merge branch 'master' into suyue/model
chensuyue Apr 21, 2026
3769c48
Enable CT Path (#2450)
yiliu30 Apr 21, 2026
ec6b658
use single scripts for env setup
chensuyue Apr 21, 2026
a6eeef6
add bench tool install
chensuyue Apr 21, 2026
2679f39
bug fix
chensuyue Apr 21, 2026
cfce3fb
add deps
chensuyue Apr 21, 2026
8eff609
update setup.sh
chensuyue Apr 21, 2026
78f3826
set AR as default format
chensuyue Apr 22, 2026
f0d9bee
freeze torch version for AR test
chensuyue Apr 22, 2026
c3e3a01
fix install vllm
chensuyue Apr 22, 2026
a070082
add eval env
chensuyue Apr 22, 2026
ca6b48a
fix deepseek
chensuyue Apr 23, 2026
d65eaea
setup ds ruler inputs=64k
chensuyue Apr 23, 2026
852bc25
add setting for A100
chensuyue Apr 23, 2026
975148b
nvfp4 (#2453)
yiliu30 Apr 24, 2026
60d1833
accept env params
chensuyue Apr 27, 2026
27e1f11
Merge branch 'suyue/model' of https://github.com/intel/neural-compres…
chensuyue Apr 27, 2026
d72786c
update rope config
yiliu30 Apr 27, 2026
9ff9019
fix
yiliu30 Apr 27, 2026
7ea6d7b
fix
yiliu30 Apr 27, 2026
2c18c2f
fix
yiliu30 Apr 27, 2026
a513f3a
fix
yiliu30 Apr 27, 2026
59ecc83
unify install
chensuyue Apr 27, 2026
7393067
use attn backend
yiliu30 Apr 27, 2026
a57e5fe
support ds v4
yiliu30 May 7, 2026
725ca1d
merge main
yiliu30 May 7, 2026
38d0b4a
clean
yiliu30 May 7, 2026
e0553d2
Update run_evaluation.sh
yiliu30 May 8, 2026
@@ -88,6 +88,25 @@ OUTPUT_DIR="${MODEL_NAME}-tp${TP_SIZE}-eval"
mkdir -p ${OUTPUT_DIR}


VLLM_EXTRA_ARGS=""
###############################################################################
# For deepseek-v4 force kv_cache to fp8
###############################################################################
if [[ "${MODEL_NAME,,}" == *"deepseek-v4"* ]]; then
KV_CACHE_DTYPE="fp8"
echo "Detected DeepSeek-V4 model, forcing KV cache dtype to FP8"
# for scheme mxfp4,
if [[ "$SCHEME" == "mxfp4" ]]; then
echo "Forcing MXFP4 quantization for DeepSeek-V4 model"
export VLLM_MARLIN_MOE_QDQ_MODE=FORCE_MXFP4
export VLLM_QDQ=1
VLLM_EXTRA_ARGS="tokenizer_mode=deepseek_v4"
echo "Set vLLM_MARLIN_MOE_QDQ_MODE=FORCE_MXFP4 and VLLM_QDQ=1 for MXFP4 quantization on DeepSeek-V4"
fi
fi
###############################################################################


SERVER_PORT=8000
max_length=8192
max_gen_toks=2048
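A note on the DeepSeek-V4 detection above: "${MODEL_NAME,,}" is bash 4+ case conversion, which lowercases the value so the substring match is case-insensitive. A minimal sketch of the idiom (the model name is illustrative):

    MODEL_NAME="DeepSeek-V4-Base"
    if [[ "${MODEL_NAME,,}" == *"deepseek-v4"* ]]; then
        echo "matched"   # prints "matched" despite the mixed-case value
    fi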
@@ -144,6 +163,8 @@ elif [[ "$SCHEME" == "mxfp8" ]]; then
elif [[ "$SCHEME" == "fp8" ]]; then
echo "Run original model."
VLLM_USE_DEEP_GEMM=0
elif [[ "$SCHEME" == "w4a8" ]]; then
echo "Run DS V4 model."
else
echo "Error: Invalid quantization scheme (-s). Must be 'mxfp4', 'nvfp4' or 'mxfp8'."
usage
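Since the scheme dispatch above keys off the -s flag, a hypothetical invocation of the new W4A8 path would look like the following, assuming this file is the run_evaluation.sh updated in the commits above (any other required flags are not visible in this excerpt):

    bash run_evaluation.sh -s w4a8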
@@ -193,7 +214,7 @@ export VLLM_ENABLE_V1_MULTIPROCESSING=0
# Function to run standard lm-eval tasks
run_standard_eval() {
    lm_eval --model vllm \
        --model_args "pretrained=${MODEL_PATH},tensor_parallel_size=${TP_SIZE},max_model_len=8192,max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=2048,enable_prefix_caching=False,kv_cache_dtype=${KV_CACHE_DTYPE}${VLLM_EXTRA_ARGS:+,${VLLM_EXTRA_ARGS}}" \
        --tasks $TASK_NAME \
        --batch_size $BATCH_SIZE \
        --log_samples \
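The ${VLLM_EXTRA_ARGS:+,${VLLM_EXTRA_ARGS}} expansion in the model_args string appends a leading comma only when VLLM_EXTRA_ARGS is non-empty, so the argument list stays valid for models that set no extra args. A small sketch of the idiom:

    VLLM_EXTRA_ARGS=""
    echo "kv_cache_dtype=fp8${VLLM_EXTRA_ARGS:+,${VLLM_EXTRA_ARGS}}"
    # -> kv_cache_dtype=fp8

    VLLM_EXTRA_ARGS="tokenizer_mode=deepseek_v4"
    echo "kv_cache_dtype=fp8${VLLM_EXTRA_ARGS:+,${VLLM_EXTRA_ARGS}}"
    # -> kv_cache_dtype=fp8,tokenizer_mode=deepseek_v4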
@@ -213,6 +234,7 @@ start_vllm_server() {
        --dtype bfloat16 \
        --kv-cache-dtype ${KV_CACHE_DTYPE} \
        --disable-log-requests \
        ${VLLM_EXTRA_ARGS} \
        > ${OUTPUT_DIR}/vllm_server.log 2>&1 &

    VLLM_PID=$!
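After the server is backgrounded, evaluation cannot start until the endpoint is up. A minimal readiness-poll sketch, assuming the vLLM OpenAI server's /health endpoint and the SERVER_PORT defined earlier (the collapsed remainder of this hunk may already do the equivalent):

    # Poll until the server answers, or give up after ~10 minutes
    for _ in $(seq 1 120); do
        if curl -sf "http://localhost:${SERVER_PORT}/health" > /dev/null; then
            echo "vLLM server is ready"
            break
        fi
        sleep 5
    done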
@@ -9,4 +9,7 @@ VLLM_USE_PRECOMPILED=1 uv pip install --prerelease=allow . -v
cd ..
# Uninstall flash_attn to avoid conflicts
pip uninstall flash_attn -y
pip install "lm_eval[ruler]"
# QDQ plugin
git clone https://github.com/yiliu30/vllm-qdq-plugin.git
uv pip install vllm-qdq-plugin/ -v
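A quick post-install check can confirm the plugin is picked up; the module name vllm_qdq_plugin and the package name are assumptions inferred from the repository name above:

    # Hypothetical import name, inferred from the repo name
    python -c 'import vllm_qdq_plugin; print("QDQ plugin importable")' \
        || uv pip show vllm-qdq-plugin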