diff --git a/tests/examples/llm_ptq/test_deploy.py b/tests/examples/llm_ptq/test_deploy.py
old mode 100644
new mode 100755
index bdada9f8c15..d1f9fa1af46
--- a/tests/examples/llm_ptq/test_deploy.py
+++ b/tests/examples/llm_ptq/test_deploy.py
@@ -246,6 +246,12 @@ def test_llama(command):
             tensor_parallel_size=4,
             mini_sm=100,
         ),
+        *ModelDeployerList(
+            model_id="nvidia/Qwen3-VL-235B-A22B-Instruct-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
         *ModelDeployerList(
             model_id="nvidia/Qwen3-30B-A3B-NVFP4",
             backend=("trtllm", "vllm", "sglang"),
@@ -346,6 +352,13 @@ def test_mixtral(command):
             mini_sm=89,
             attn_backend="FLASHINFER",
         ),
+        *ModelDeployerList(
+            model_id="nvidia/Gemma-4-31B-IT-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=100,
+            attn_backend="FLASHINFER",
+        ),
     ],
     ids=idfn,
 )
@@ -416,6 +429,44 @@ def test_kimi(command):
     command.run()
 
 
+@pytest.mark.parametrize(
+    "command",
+    [
+        *ModelDeployerList(
+            model_id="nvidia/GLM-4.7-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/GLM-5-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+    ],
+    ids=idfn,
+)
+def test_glm(command):
+    command.run()
+
+
+@pytest.mark.parametrize(
+    "command",
+    [
+        *ModelDeployerList(
+            model_id="nvidia/MiniMax-M2.5-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+    ],
+    ids=idfn,
+)
+def test_minimax(command):
+    command.run()
+
+
 @pytest.mark.parametrize(
     "command",
     [
@@ -451,6 +502,20 @@ def test_kimi(command):
             mini_sm=89,
             attn_backend="FLASHINFER",
         ),
+        *ModelDeployerList(
+            model_id="nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+            attn_backend="FLASHINFER",
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=89,
+            attn_backend="FLASHINFER",
+        ),
     ],
     ids=idfn,
 )
@@ -497,6 +562,22 @@ def test_medusa(command):
             tensor_parallel_size=8,
             mini_sm=89,
         ),
+        *ModelDeployerList(
+            base_model="nvidia/Kimi-K2-Thinking-NVFP4",
+            model_id="nvidia/Kimi-K2-Thinking-Eagle3",
+            backend=("trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+            eagle3_one_model=False,
+        ),
+        *ModelDeployerList(
+            base_model="nvidia/Kimi-K2.5-NVFP4",
+            model_id="nvidia/Kimi-K2.5-Thinking-Eagle3",
+            backend=("trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+            eagle3_one_model=False,
+        ),
         *ModelDeployerList(
             base_model="Qwen/Qwen3-235B-A22B",
             model_id="nvidia/Qwen3-235B-A22B-Eagle3",