From 8d2f99f0230b9a381c27e93c05497c7d9d102439 Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Tue, 21 Apr 2026 00:43:20 +0530 Subject: [PATCH 1/5] [Release-fix] Pin transformers<5.6 in release branch Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index fdd60b5193a..b129ae67093 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,7 +82,7 @@ hf = [ "peft>=0.17.0", "sentencepiece>=0.2.1", # Also implicitly used in test_unified_export_megatron, test_vllm_fakequant_megatron_export "tiktoken", - "transformers>=4.56", # Should match modelopt/torch/__init__.py and noxfile.py + "transformers>=4.56,<5.6", # Should match modelopt/torch/__init__.py and noxfile.py "wonderwords", ] From 33d87c9dd016779998dd694b17882e65475fb162 Mon Sep 17 00:00:00 2001 From: Sirui Wang Date: Sun, 26 Apr 2026 01:35:03 -0700 Subject: [PATCH 2/5] Add deploy cases for new NVIDIA models --- tests/examples/llm_ptq/test_deploy.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tests/examples/llm_ptq/test_deploy.py diff --git a/tests/examples/llm_ptq/test_deploy.py b/tests/examples/llm_ptq/test_deploy.py old mode 100644 new mode 100755 From c8cc7064d98f8987bd6dd35a0cf9509e15ccbb3e Mon Sep 17 00:00:00 2001 From: Sirui Wang Date: Sun, 26 Apr 2026 04:45:55 -0700 Subject: [PATCH 3/5] Add deploy cases for new NVIDIA models --- tests/examples/llm_ptq/test_deploy.py | 59 +++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/tests/examples/llm_ptq/test_deploy.py b/tests/examples/llm_ptq/test_deploy.py index bdada9f8c15..d65aeed5d67 100755 --- a/tests/examples/llm_ptq/test_deploy.py +++ b/tests/examples/llm_ptq/test_deploy.py @@ -346,6 +346,13 @@ def test_mixtral(command): mini_sm=89, attn_backend="FLASHINFER", ), + *ModelDeployerList( + model_id="nvidia/Gemma-4-31B-IT-NVFP4", + backend=("trtllm", "vllm", "sglang"), + tensor_parallel_size=1, + mini_sm=100, + attn_backend="FLASHINFER", + ), ], ids=idfn, ) @@ -416,6 +423,44 @@ def test_kimi(command): command.run() +@pytest.mark.parametrize( + "command", + [ + *ModelDeployerList( + model_id="nvidia/GLM-4.7-NVFP4", + backend=("trtllm", "vllm", "sglang"), + tensor_parallel_size=8, + mini_sm=100, + ), + *ModelDeployerList( + model_id="nvidia/GLM-5-NVFP4", + backend=("trtllm", "vllm", "sglang"), + tensor_parallel_size=8, + mini_sm=100, + ), + ], + ids=idfn, +) +def test_glm(command): + command.run() + + +@pytest.mark.parametrize( + "command", + [ + *ModelDeployerList( + model_id="nvidia/MiniMax-M2.5-NVFP4", + backend=("trtllm", "vllm", "sglang"), + tensor_parallel_size=8, + mini_sm=100, + ), + ], + ids=idfn, +) +def test_minimax(command): + command.run() + + @pytest.mark.parametrize( "command", [ @@ -451,6 +496,20 @@ def test_kimi(command): mini_sm=89, attn_backend="FLASHINFER", ), + *ModelDeployerList( + model_id="nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", + backend=("trtllm", "vllm", "sglang"), + tensor_parallel_size=8, + mini_sm=100, + attn_backend="FLASHINFER", + ), + *ModelDeployerList( + model_id="nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8", + backend=("trtllm", "vllm", "sglang"), + tensor_parallel_size=8, + mini_sm=89, + attn_backend="FLASHINFER", + ), ], ids=idfn, ) From 40e70456dd9ed5234ec7da985dadb172861199bd Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Apr 2026 00:28:25 -0700 Subject: [PATCH 4/5] Add deploy cases for Kimi Eagle3 and Qwen3-VL models Made-with: Cursor --- tests/examples/llm_ptq/test_deploy.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/examples/llm_ptq/test_deploy.py b/tests/examples/llm_ptq/test_deploy.py index d65aeed5d67..d1f9fa1af46 100755 --- a/tests/examples/llm_ptq/test_deploy.py +++ b/tests/examples/llm_ptq/test_deploy.py @@ -246,6 +246,12 @@ def test_llama(command): tensor_parallel_size=4, mini_sm=100, ), + *ModelDeployerList( + model_id="nvidia/Qwen3-VL-235B-A22B-Instruct-NVFP4", + backend=("trtllm", "vllm", "sglang"), + tensor_parallel_size=8, + mini_sm=100, + ), *ModelDeployerList( model_id="nvidia/Qwen3-30B-A3B-NVFP4", backend=("trtllm", "vllm", "sglang"), @@ -556,6 +562,22 @@ def test_medusa(command): tensor_parallel_size=8, mini_sm=89, ), + *ModelDeployerList( + base_model="nvidia/Kimi-K2-Thinking-NVFP4", + model_id="nvidia/Kimi-K2-Thinking-Eagle3", + backend=("trtllm", "sglang"), + tensor_parallel_size=8, + mini_sm=100, + eagle3_one_model=False, + ), + *ModelDeployerList( + base_model="nvidia/Kimi-K2.5-NVFP4", + model_id="nvidia/Kimi-K2.5-Thinking-Eagle3", + backend=("trtllm", "sglang"), + tensor_parallel_size=8, + mini_sm=100, + eagle3_one_model=False, + ), *ModelDeployerList( base_model="Qwen/Qwen3-235B-A22B", model_id="nvidia/Qwen3-235B-A22B-Eagle3", From 0f4503189b2dadd58b40f175dec69f3b006ec00e Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Tue, 5 May 2026 10:02:04 +0530 Subject: [PATCH 5/5] Apply suggestion from @kevalmorabia97 Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e5e6bc4c9ec..8e53eaf09d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,7 +82,7 @@ hf = [ "peft>=0.17.0", "sentencepiece>=0.2.1", # Also implicitly used in test_unified_export_megatron, test_vllm_fakequant_megatron_export "tiktoken", - "transformers>=4.56,<5.6", # Should match modelopt/torch/__init__.py and noxfile.py + "transformers>=4.56", # Should match modelopt/torch/__init__.py and noxfile.py "wonderwords", ]