From 8d2f99f0230b9a381c27e93c05497c7d9d102439 Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Tue, 21 Apr 2026 00:43:20 +0530
Subject: [PATCH 1/5] [Release-fix] Pin transformers<5.6 in release branch

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index fdd60b5193a..b129ae67093 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -82,7 +82,7 @@ hf = [
     "peft>=0.17.0",
     "sentencepiece>=0.2.1",                                                           # Also implicitly used in test_unified_export_megatron, test_vllm_fakequant_megatron_export
     "tiktoken",
-    "transformers>=4.56",                                                             # Should match modelopt/torch/__init__.py and noxfile.py
+    "transformers>=4.56,<5.6",                                                        # Should match modelopt/torch/__init__.py and noxfile.py
     "wonderwords",
 ]
 

From 33d87c9dd016779998dd694b17882e65475fb162 Mon Sep 17 00:00:00 2001
From: Sirui Wang <siruiw@r6515-0097.ipp1a1.colossus.nvidia.com>
Date: Sun, 26 Apr 2026 01:35:03 -0700
Subject: [PATCH 2/5] Mark test_deploy.py as executable (mode change only)

---
 tests/examples/llm_ptq/test_deploy.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 tests/examples/llm_ptq/test_deploy.py

diff --git a/tests/examples/llm_ptq/test_deploy.py b/tests/examples/llm_ptq/test_deploy.py
old mode 100644
new mode 100755

From c8cc7064d98f8987bd6dd35a0cf9509e15ccbb3e Mon Sep 17 00:00:00 2001
From: Sirui Wang <siruiw@r6515-0097.ipp1a1.colossus.nvidia.com>
Date: Sun, 26 Apr 2026 04:45:55 -0700
Subject: [PATCH 3/5] Add deploy cases for new NVIDIA models

---
 tests/examples/llm_ptq/test_deploy.py | 59 +++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/tests/examples/llm_ptq/test_deploy.py b/tests/examples/llm_ptq/test_deploy.py
index bdada9f8c15..d65aeed5d67 100755
--- a/tests/examples/llm_ptq/test_deploy.py
+++ b/tests/examples/llm_ptq/test_deploy.py
@@ -346,6 +346,13 @@ def test_mixtral(command):
             mini_sm=89,
             attn_backend="FLASHINFER",
         ),
+        *ModelDeployerList(
+            model_id="nvidia/Gemma-4-31B-IT-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=100,
+            attn_backend="FLASHINFER",
+        ),
     ],
     ids=idfn,
 )
@@ -416,6 +423,44 @@ def test_kimi(command):
     command.run()
 
 
+@pytest.mark.parametrize(
+    "command",
+    [
+        *ModelDeployerList(
+            model_id="nvidia/GLM-4.7-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/GLM-5-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+    ],
+    ids=idfn,
+)
+def test_glm(command):
+    command.run()
+
+
+@pytest.mark.parametrize(
+    "command",
+    [
+        *ModelDeployerList(
+            model_id="nvidia/MiniMax-M2.5-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+    ],
+    ids=idfn,
+)
+def test_minimax(command):
+    command.run()
+
+
 @pytest.mark.parametrize(
     "command",
     [
@@ -451,6 +496,20 @@ def test_kimi(command):
             mini_sm=89,
             attn_backend="FLASHINFER",
         ),
+        *ModelDeployerList(
+            model_id="nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+            attn_backend="FLASHINFER",
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=89,
+            attn_backend="FLASHINFER",
+        ),
     ],
     ids=idfn,
 )

From 40e70456dd9ed5234ec7da985dadb172861199bd Mon Sep 17 00:00:00 2001
From: root <root@umbriel-b200-078.ipp4a1.colossus.nvidia.com>
Date: Mon, 27 Apr 2026 00:28:25 -0700
Subject: [PATCH 4/5] Add deploy cases for Kimi Eagle3 and Qwen3-VL models

Made-with: Cursor
---
 tests/examples/llm_ptq/test_deploy.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/tests/examples/llm_ptq/test_deploy.py b/tests/examples/llm_ptq/test_deploy.py
index d65aeed5d67..d1f9fa1af46 100755
--- a/tests/examples/llm_ptq/test_deploy.py
+++ b/tests/examples/llm_ptq/test_deploy.py
@@ -246,6 +246,12 @@ def test_llama(command):
             tensor_parallel_size=4,
             mini_sm=100,
         ),
+        *ModelDeployerList(
+            model_id="nvidia/Qwen3-VL-235B-A22B-Instruct-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
         *ModelDeployerList(
             model_id="nvidia/Qwen3-30B-A3B-NVFP4",
             backend=("trtllm", "vllm", "sglang"),
@@ -556,6 +562,22 @@ def test_medusa(command):
             tensor_parallel_size=8,
             mini_sm=89,
         ),
+        *ModelDeployerList(
+            base_model="nvidia/Kimi-K2-Thinking-NVFP4",
+            model_id="nvidia/Kimi-K2-Thinking-Eagle3",
+            backend=("trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+            eagle3_one_model=False,
+        ),
+        *ModelDeployerList(
+            base_model="nvidia/Kimi-K2.5-NVFP4",
+            model_id="nvidia/Kimi-K2.5-Thinking-Eagle3",
+            backend=("trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+            eagle3_one_model=False,
+        ),
         *ModelDeployerList(
             base_model="Qwen/Qwen3-235B-A22B",
             model_id="nvidia/Qwen3-235B-A22B-Eagle3",

From 0f4503189b2dadd58b40f175dec69f3b006ec00e Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Tue, 5 May 2026 10:02:04 +0530
Subject: [PATCH 5/5] Drop transformers<5.6 upper bound (suggestion from @kevalmorabia97)

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index e5e6bc4c9ec..8e53eaf09d4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -82,7 +82,7 @@ hf = [
     "peft>=0.17.0",
     "sentencepiece>=0.2.1",                                                           # Also implicitly used in test_unified_export_megatron, test_vllm_fakequant_megatron_export
     "tiktoken",
-    "transformers>=4.56,<5.6",                                                        # Should match modelopt/torch/__init__.py and noxfile.py
+    "transformers>=4.56",                                                             # Should match modelopt/torch/__init__.py and noxfile.py
     "wonderwords",
 ]