Merged (changes from 5 commits)
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -82,7 +82,7 @@ hf = [
     "peft>=0.17.0",
     "sentencepiece>=0.2.1", # Also implicitly used in test_unified_export_megatron, test_vllm_fakequant_megatron_export
     "tiktoken",
-    "transformers>=4.56", # Should match modelopt/torch/__init__.py and noxfile.py
+    "transformers>=4.56,<5.6", # Should match modelopt/torch/__init__.py and noxfile.py
"wonderwords",
]

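The updated specifier keeps the 4.56 floor and adds a <5.6 ceiling. As a minimal sketch of how a PEP 440 specifier like this evaluates, using the `packaging` library (the same machinery pip uses); the candidate versions below are illustrative, not from the PR:

```python
# Minimal sketch: evaluating the updated transformers pin with the
# `packaging` library. The candidate versions are illustrative only.
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet(">=4.56,<5.6")  # the new constraint from pyproject.toml

for candidate in ("4.55.0", "4.56.0", "4.57.1", "5.6.0"):
    allowed = Version(candidate) in spec
    print(f"transformers=={candidate}: {'allowed' if allowed else 'rejected'}")
```

An upper bound like this guards against breaking changes in newer transformers releases; per the inline comment, the same pin should stay in sync with modelopt/torch/__init__.py and noxfile.py.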
81 changes: 81 additions & 0 deletions tests/examples/llm_ptq/test_deploy.py
File mode changed: 100644 → 100755 (the test file is now executable)
@@ -246,6 +246,12 @@ def test_llama(command):
             tensor_parallel_size=4,
             mini_sm=100,
         ),
+        *ModelDeployerList(
+            model_id="nvidia/Qwen3-VL-235B-A22B-Instruct-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
         *ModelDeployerList(
             model_id="nvidia/Qwen3-30B-A3B-NVFP4",
             backend=("trtllm", "vllm", "sglang"),
@@ -346,6 +352,13 @@ def test_mixtral(command):
             mini_sm=89,
             attn_backend="FLASHINFER",
         ),
+        *ModelDeployerList(
+            model_id="nvidia/Gemma-4-31B-IT-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=100,
+            attn_backend="FLASHINFER",
+        ),
     ],
     ids=idfn,
 )
@@ -416,6 +429,44 @@ def test_kimi(command):
     command.run()
 
 
+@pytest.mark.parametrize(
+    "command",
+    [
+        *ModelDeployerList(
+            model_id="nvidia/GLM-4.7-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/GLM-5-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+    ],
+    ids=idfn,
+)
+def test_glm(command):
+    command.run()
+
+
+@pytest.mark.parametrize(
+    "command",
+    [
+        *ModelDeployerList(
+            model_id="nvidia/MiniMax-M2.5-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+    ],
+    ids=idfn,
+)
+def test_minimax(command):
+    command.run()
+
+
 @pytest.mark.parametrize(
     "command",
     [
@@ -451,6 +502,20 @@ def test_kimi(command):
             mini_sm=89,
             attn_backend="FLASHINFER",
         ),
+        *ModelDeployerList(
+            model_id="nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+            attn_backend="FLASHINFER",
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=89,
+            attn_backend="FLASHINFER",
+        ),
     ],
     ids=idfn,
 )
@@ -497,6 +562,22 @@ def test_medusa(command):
             tensor_parallel_size=8,
             mini_sm=89,
         ),
+        *ModelDeployerList(
+            base_model="nvidia/Kimi-K2-Thinking-NVFP4",
+            model_id="nvidia/Kimi-K2-Thinking-Eagle3",
+            backend=("trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+            eagle3_one_model=False,
+        ),
+        *ModelDeployerList(
+            base_model="nvidia/Kimi-K2.5-NVFP4",
+            model_id="nvidia/Kimi-K2.5-Thinking-Eagle3",
+            backend=("trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+            eagle3_one_model=False,
+        ),
         *ModelDeployerList(
             base_model="Qwen/Qwen3-235B-A22B",
             model_id="nvidia/Qwen3-235B-A22B-Eagle3",
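Every new case above follows one pattern: `*ModelDeployerList(...)` unpacks into one deploy command per entry in `backend`, and `ids=idfn` turns each command into a readable pytest ID. The real helper lives elsewhere in this test suite; the sketch below only illustrates plausible expansion mechanics, and everything beyond the keyword names visible in the diff (`DeployCommand`, the `run` body, the ID format, all defaults) is an assumption, not the repository's implementation:

```python
# Hypothetical sketch of a ModelDeployerList-style helper. It is iterable,
# yielding one command per requested backend, which is why the tests can
# splat it into a parametrize list. All names and defaults here are
# illustrative assumptions, not the repository's actual code.
from dataclasses import dataclass


@dataclass
class DeployCommand:
    model_id: str
    backend: str
    tensor_parallel_size: int = 1
    mini_sm: int = 0  # assumed: minimum GPU SM version required for the case
    base_model: str | None = None  # assumed: set for Eagle3 draft checkpoints
    attn_backend: str | None = None
    eagle3_one_model: bool = True

    def run(self) -> None:
        # Placeholder: the real test would launch the chosen backend and
        # deploy the checkpoint; here we only show what a command carries.
        print(f"deploy {self.model_id} on {self.backend} "
              f"(tp={self.tensor_parallel_size})")


class ModelDeployerList:
    """Expand one model spec into a DeployCommand per backend (sketch)."""

    def __init__(self, backend=("trtllm",), **kwargs):
        self.commands = [DeployCommand(backend=b, **kwargs) for b in backend]

    def __iter__(self):
        return iter(self.commands)


def idfn(command: DeployCommand) -> str:
    # Readable pytest ID, e.g. "GLM-5-NVFP4-vllm-tp8" (format assumed)
    return (f"{command.model_id.split('/')[-1]}-{command.backend}"
            f"-tp{command.tensor_parallel_size}")
```

Under this shape, a single spec with `backend=("trtllm", "vllm", "sglang")` contributes three parametrized cases, which is how the new GLM, MiniMax, Nemotron, and Kimi entries fan out across serving backends.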