Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
160 changes: 160 additions & 0 deletions examples/llm_ptq/example_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,147 @@ def _resolve_file(filename):
module.__dict__.pop("weight", None)


def _maybe_patch_transformers_nemotron_h_mixer_types() -> None:
    """Patch transformers' Nemotron-H implementation for ``-`` (MLP) blocks.

    transformers 5.5.x ships a Nemotron-H port that is incomplete in three places:

    1. ``NemotronHConfig._pattern_to_list`` maps ``M→mamba``, ``E→moe``, ``*→attention``
       but forgets ``-→mlp``, so merely loading the config of Nemotron-H-v2
       (whose ``hybrid_override_pattern`` contains ``-``) raises ``KeyError: '-'``.
    2. ``PreTrainedConfig.validate_layer_type`` checks ``layer_types`` (aliased to
       ``layers_block_type`` via ``attribute_map``) against a hard-coded
       ``ALLOWED_LAYER_TYPES`` tuple that doesn't include ``"mlp"``, so once (1) is
       fixed the validator rejects the config.
    3. ``MIXER_TYPES`` in ``modeling_nemotron_h`` registers ``mamba``/``attention``/``moe``
       but omits ``mlp`` even though ``NemotronHMLP`` is defined in the same module.
       ``NemotronHBlock`` instantiates mixers as ``cls(config, layer_idx=...)``, which
       ``NemotronHMLP.__init__`` doesn't accept, so we register a thin adapter.

    All patches are idempotent: each one is guarded by a membership check or a
    sentinel attribute, so calling this function repeatedly is a no-op after the
    first successful application. Missing-import paths degrade to doing nothing.
    """
    # Extend ALLOWED_LAYER_TYPES so `validate_layer_type` accepts "mlp".
    # Guarded with getattr/isinstance so a future transformers refactor that
    # renames or retypes the constant leaves us inert instead of crashing.
    try:
        cu = __import__("transformers.configuration_utils", fromlist=["ALLOWED_LAYER_TYPES"])
    except ImportError:
        cu = None
    if cu is not None:
        allowed = getattr(cu, "ALLOWED_LAYER_TYPES", None)
        if isinstance(allowed, tuple) and "mlp" not in allowed:
            cu.ALLOWED_LAYER_TYPES = (*allowed, "mlp")

    # 1) MIXER_TYPES (modeling)
    try:
        mod = __import__(
            "transformers.models.nemotron_h.modeling_nemotron_h",
            fromlist=["MIXER_TYPES", "NemotronHMLP"],
        )
    except ImportError:
        mod = None

    if mod is not None:
        mixer_types = getattr(mod, "MIXER_TYPES", None)
        nemotron_h_mlp = getattr(mod, "NemotronHMLP", None)
        if (
            isinstance(mixer_types, dict)
            and nemotron_h_mlp is not None
            and "mlp" not in mixer_types
        ):
            # ``nemotron_h_mlp`` is resolved at runtime, so use ``types.new_class`` rather
            # than a literal ``class`` statement (keeps mypy happy about dynamic bases).
            import types as _types

            # Adapter: accept (and drop) the ``layer_idx`` kwarg that NemotronHBlock
            # passes to every mixer class but NemotronHMLP.__init__ doesn't take.
            def _mlp_adapter_init(self, config, layer_idx=None, **kwargs):
                nemotron_h_mlp.__init__(self, config, **kwargs)

            _mlp_adapter_cls = _types.new_class(
                "_NemotronHMLPMixerAdapter",
                (nemotron_h_mlp,),
                {},
                lambda ns: ns.update({"__init__": _mlp_adapter_init}),
            )
            mixer_types["mlp"] = _mlp_adapter_cls

        # ``NemotronHModel.forward`` builds an inline ``block_type_to_mask`` dict that
        # only knows about ``{"mamba", "attention", "moe"}`` and KeyErrors on "mlp".
        # ``NemotronHBlock.forward`` routes "mlp"/"moe" through the same ``else`` branch
        # that ignores the attention mask, so aliasing the MLP block's ``block_type``
        # to ``"moe"`` after __init__ makes the mask lookup resolve to ``None`` without
        # affecting mixer dispatch (the mixer instance was already built from
        # ``layers_block_type[layer_idx] == "mlp"`` via MIXER_TYPES).
        nemotron_h_block = getattr(mod, "NemotronHBlock", None)
        if nemotron_h_block is not None and not getattr(
            nemotron_h_block, "_modelopt_mlp_mask_patched", False
        ):
            _orig_init = nemotron_h_block.__init__

            def _patched_init(self, config, layer_idx):
                _orig_init(self, config, layer_idx)
                if getattr(self, "block_type", None) == "mlp":
                    self.block_type = "moe"

            nemotron_h_block.__init__ = _patched_init
            # Sentinel attribute on the class makes this wrap-once (idempotent).
            nemotron_h_block._modelopt_mlp_mask_patched = True

    # 2) NemotronHConfig._pattern_to_list + validate_layers_block_type (configuration).
    try:
        cfg_mod = __import__(
            "transformers.models.nemotron_h.configuration_nemotron_h",
            fromlist=["NemotronHConfig"],
        )
    except ImportError:
        return
    cfg_cls = getattr(cfg_mod, "NemotronHConfig", None)
    if cfg_cls is None or getattr(cfg_cls, "_modelopt_mlp_patched", False):
        return

    _orig_pattern_to_list = cfg_cls._pattern_to_list

    def _patched_pattern_to_list(pattern: str) -> list:
        mapping = {"M": "mamba", "E": "moe", "*": "attention", "-": "mlp"}
        try:
            return [mapping[ch] for ch in pattern]
        except KeyError:
            # Fall back to the stock implementation for any char we didn't add —
            # this lets future transformers releases keep any additional mappings.
            return _orig_pattern_to_list(pattern)

    # Assign via ``staticmethod()`` so the attribute is unbound on the class (matches
    # the original definition) — using the ``@staticmethod`` decorator on a nested
    # function trips mypy's "staticmethod used with a non-method" check.
    cfg_cls._pattern_to_list = staticmethod(_patched_pattern_to_list)

    # Allow "mlp" alongside {"mamba", "attention", "moe"} in validate_layers_block_type.
    # huggingface_hub's @strict_dataclass collects class validators into
    # ``cls.__class_validators__`` at class-creation time, so we have to replace the
    # entry in that list (not just overwrite the method attribute).
    def _patched_validate_layers_block_type(self):
        if not isinstance(self.layers_block_type, list):
            raise ValueError(
                f"`layers_block_type` must be a list of strings. "
                f"Got type: {type(self.layers_block_type)}"
            )
        valid_types = {"mamba", "attention", "moe", "mlp"}
        if not all(block_type in valid_types for block_type in self.layers_block_type):
            invalid = set(self.layers_block_type) - valid_types
            raise ValueError(
                f"`layers_block_type` contains invalid types: {invalid}. "
                f"Must be one of: {valid_types}"
            )

    _patched_validate_layers_block_type.__name__ = "validate_layers_block_type"
    # NOTE(review): this function takes ``self`` but is installed as a staticmethod,
    # so a direct ``config.validate_layers_block_type()`` call would not bind the
    # instance. It presumably is only ever invoked through ``__class_validators__``,
    # which passes the instance explicitly — confirm against huggingface_hub's
    # strict-dataclass call convention.
    cfg_cls.validate_layers_block_type = staticmethod(_patched_validate_layers_block_type)
    class_validators = list(getattr(cfg_cls, "__class_validators__", []))
    for i, v in enumerate(class_validators):
        if getattr(v, "__name__", None) == "validate_layers_block_type":
            class_validators[i] = _patched_validate_layers_block_type
            break
    else:
        class_validators.append(_patched_validate_layers_block_type)
    cfg_cls.__class_validators__ = class_validators
    cfg_cls._modelopt_mlp_patched = True


def get_model(
ckpt_path,
device="cuda",
Expand All @@ -548,6 +689,9 @@ def get_model(
use_seq_device_map=False,
attn_implementation=None,
):
# Needs to run before AutoConfig.from_pretrained so the Nemotron-H config can parse
# the "-" (MLP) character in hybrid_override_pattern.
_maybe_patch_transformers_nemotron_h_mixer_types()
print(f"Initializing model from {ckpt_path}")

device_map = "auto"
Expand Down Expand Up @@ -706,6 +850,22 @@ def has_pack_quantized_config(config):
if device == "cuda" and not is_model_on_gpu(model):
print("Warning: Some parameters are not on a GPU. Calibration can be slow or hit OOM")

# Some model cards ship a generation_config.json that sets sampling hyperparameters
# (top_p, temperature) without ``do_sample=True`` (e.g. NVIDIA-Nemotron-3-Nano-4B-BF16).
# transformers 5.x strictly validates this on save_pretrained, so the export step
# fails with "GenerationConfig is invalid". Normalize by enabling do_sample whenever
# a sampling hyperparameter is set — this is only metadata, not behavior during
# calibration or export.
gen_cfg = getattr(model, "generation_config", None)
if gen_cfg is not None and not getattr(gen_cfg, "do_sample", False):
has_sampling_hyperparam = (
getattr(gen_cfg, "top_p", None) not in (None, 1.0)
or getattr(gen_cfg, "top_k", None) not in (None, 0, 50)
or getattr(gen_cfg, "temperature", None) not in (None, 1.0)
)
if has_sampling_hyperparam:
gen_cfg.do_sample = True
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

cd examples/llm_ptq && head -n 1 example_utils.py && wc -l example_utils.py

Repository: NVIDIA/Model-Optimizer

Length of output: 186


🏁 Script executed:

cd examples/llm_ptq && sed -n '830,880p' example_utils.py

Repository: NVIDIA/Model-Optimizer

Length of output: 2298


🏁 Script executed:

cd examples/llm_ptq && grep -n "def get_model" example_utils.py | head -5

Repository: NVIDIA/Model-Optimizer

Length of output: 84


🏁 Script executed:

cd examples/llm_ptq && rg "get_model\(" hf_ptq.py -A 5 -B 2

Repository: NVIDIA/Model-Optimizer

Length of output: 430


🏁 Script executed:

cd examples/llm_ptq && rg "\.generate\(" hf_ptq.py -B 3 -A 3

Repository: NVIDIA/Model-Optimizer

Length of output: 1010


🏁 Script executed:

cd examples/llm_ptq && sed -n '1,50p' hf_ptq.py | head -30

Repository: NVIDIA/Model-Optimizer

Length of output: 1060


🏁 Script executed:

cd examples/llm_ptq && grep -n "full_model" hf_ptq.py | head -20

Repository: NVIDIA/Model-Optimizer

Length of output: 1080


🏁 Script executed:

cd examples/llm_ptq && sed -n '300,350p' hf_ptq.py

Repository: NVIDIA/Model-Optimizer

Length of output: 1721


🏁 Script executed:

cd examples/llm_ptq && sed -n '334,345p' hf_ptq.py

Repository: NVIDIA/Model-Optimizer

Length of output: 465


🏁 Script executed:

cd examples/llm_ptq && sed -n '350,365p' hf_ptq.py

Repository: NVIDIA/Model-Optimizer

Length of output: 645


🏁 Script executed:

cd examples/llm_ptq && grep -n "generated_ids_before_ptq" hf_ptq.py

Repository: NVIDIA/Model-Optimizer

Length of output: 816


🏁 Script executed:

cd examples/llm_ptq && sed -n '257,270p' hf_ptq.py

Repository: NVIDIA/Model-Optimizer

Length of output: 653


🏁 Script executed:

cd examples/llm_ptq && sed -n '580,600p' hf_ptq.py

Repository: NVIDIA/Model-Optimizer

Length of output: 689


🏁 Script executed:

cd examples/llm_ptq && sed -n '880,925p' hf_ptq.py

Repository: NVIDIA/Model-Optimizer

Length of output: 1892


🏁 Script executed:

cd examples/llm_ptq && sed -n '1100,1125p' hf_ptq.py

Repository: NVIDIA/Model-Optimizer

Length of output: 1108


🏁 Script executed:

cd examples/llm_ptq && sed -n '970,1020p' hf_ptq.py

Repository: NVIDIA/Model-Optimizer

Length of output: 2331


🏁 Script executed:

cd examples/llm_ptq && sed -n '1180,1220p' hf_ptq.py

Repository: NVIDIA/Model-Optimizer

Length of output: 1472


Don't mutate the live generation_config in get_model().

The mutation persists on the returned model object, and both the before-PTQ and after-PTQ preview calls (full_model.generate() at lines 922 and 980 in hf_ptq.py) use that same model instance. For checkpoints with sampling hyperparameters, this makes the previews non-deterministic instead of deterministic, undermining PTQ smoke test comparisons. Normalize a copy during export instead.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@examples/llm_ptq/example_utils.py` around lines 853 - 867, The current code
mutates the live model.generation_config (gen_cfg) which makes the same model
instance used by get_model() non-deterministic; instead, create a copy of the
generation_config (e.g., via copy.deepcopy or by constructing a new
GenerationConfig from the dict) and modify the copy’s do_sample flag, leaving
model.generation_config unchanged; update the export/normalization logic around
gen_cfg to use this gen_cfg_copy (or a temporary variable) so
previews/full_model.generate() remain deterministic and only the exported
metadata contains the normalized setting.


return model


Expand Down
72 changes: 72 additions & 0 deletions examples/llm_ptq/hf_ptq.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ def _set_kv_cache_constant_amax(quant_cfg: list) -> None:
"int4_awq": mtq.INT4_AWQ_CFG,
"w4a8_awq": mtq.W4A8_AWQ_BETA_CFG,
"nvfp4": mtq.NVFP4_DEFAULT_CFG,
"nvfp4_wo": mtq.NVFP4_DEFAULT_WEIGHT_ONLY_CFG,
"nvfp4_awq": mtq.NVFP4_AWQ_LITE_CFG,
"nvfp4_mse": mtq.NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG,
"fp8_pb_wo": mtq.FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG,
Expand Down Expand Up @@ -593,6 +594,59 @@ def sparsity_main(
mts.export(full_model)


def _enable_lm_head_and_embedding_quantization(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we define this in the modelop_recipe if everything modelopt_recipes/models can be captured with our yaml recipe system?

quant_cfg: dict[str, Any],
weight_quantizer_cfg: dict[str, Any],
) -> None:
"""Re-enable quantization of ``lm_head`` and the input embedding table.

ModelOpt's default PTQ recipes exclude ``*lm_head*`` and never touch ``nn.Embedding``
because most LLM deployment runtimes keep those layers at full precision. For Nemotron-H
(and similar SSM+Attention hybrids) the embedding and lm_head are a large fraction of the
total parameters — quantizing them recovers most of the promised memory savings. This
helper appends two entries to the cfg list that override earlier ``*lm_head*`` disables
and explicitly target the embedding weight quantizer.

Args:
quant_cfg: the primary quant_cfg dict (``{"quant_cfg": [...], "algorithm": ...}``).
weight_quantizer_cfg: the weight-quantizer attribute dict to apply (e.g. ``_nvfp4_cfg``).
"""
# Ordering matters: these entries must come AFTER the _default_disabled_quantizer_cfg
# entries (which set *lm_head* → disabled) so they take effect.
quant_cfg["quant_cfg"].append(
{"quantizer_name": "*lm_head*weight_quantizer", "cfg": copy.deepcopy(weight_quantizer_cfg)}
)
# nn.Embedding quantizers only exist once `quant_embedding.py` registers the class.
# Nemotron-H's backbone attribute name differs between the remote-code ("backbone.embeddings")
# and transformers built-in ("model.embeddings") paths; both are weight-only vocab
# embeddings here. The broad "*embeddings*" wildcard covers both and does not match
# any other layer in a Nemotron-H model (no positional/rotary embeddings exist).
quant_cfg["quant_cfg"].append(
{
"quantizer_name": "*embeddings*weight_quantizer",
"cfg": copy.deepcopy(weight_quantizer_cfg),
}
)
# Also keep the standard HF "embed_tokens" naming in case future Nemotron-H variants
# rename the attribute.
quant_cfg["quant_cfg"].append(
{
"quantizer_name": "*embed_tokens*weight_quantizer",
"cfg": copy.deepcopy(weight_quantizer_cfg),
}
)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated


def _extract_weight_quantizer_cfg(quant_cfg: dict[str, Any]) -> dict[str, Any] | None:
"""Return the first ``*weight_quantizer`` cfg dict from an ordered quant_cfg list."""
for entry in quant_cfg.get("quant_cfg", []):
if entry.get("quantizer_name") == "*weight_quantizer" and isinstance(
entry.get("cfg"), dict
):
return entry["cfg"]
return None


def mono_quantize(
args: argparse.Namespace,
quant_cfg: dict[str, Any],
Expand Down Expand Up @@ -629,6 +683,24 @@ def mono_quantize(
) # Nemotron-Parse specific
print("Quantization will only be applied to the decoder (text generation) component")

# For Nemotron-H (Mamba-2 + MLP + Attention hybrid, e.g. NVIDIA-Nemotron-3-Nano-4B),
# extend quantization coverage to the lm_head and the input token embedding. On this
# architecture those two 131072x3136 tables account for ~21% of parameters, so leaving
# them at bf16 wastes most of the NVFP4 memory benefit.
if model_type == "nemotron_h":
weight_quantizer_cfg = _extract_weight_quantizer_cfg(quant_cfg)
if weight_quantizer_cfg is not None:
print(
"Nemotron-H detected: extending quantization to lm_head and input embedding "
"(backbone.embeddings)."
)
_enable_lm_head_and_embedding_quantization(quant_cfg, weight_quantizer_cfg)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated
else:
warnings.warn(
"Nemotron-H detected but quant_cfg has no wildcard '*weight_quantizer' entry; "
"skipping lm_head/embedding extension (model-specific or non-standard recipe)."
)

if not model_is_already_quantized or calibration_only:
# quantize the model

Expand Down
3 changes: 2 additions & 1 deletion modelopt/torch/export/unified_export_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
from .layer_utils import (
get_expert_linear_names,
get_experts_list,
is_embedding,
is_layernorm,
is_moe,
is_quantlinear,
Expand Down Expand Up @@ -650,7 +651,7 @@ def _process_quantized_modules(
# Skip QuantMoELinear - it's handled separately in _reconstruct_fused_moe_linear
if type(sub_module).__name__ == "QuantMoELinear":
continue
if is_quantlinear(sub_module):
if is_quantlinear(sub_module) or is_embedding(sub_module):
try:
with fsdp2_aware_weight_update(model, sub_module, reshard=False):
_export_quantized_weight(sub_module, dtype)
Expand Down
3 changes: 3 additions & 0 deletions modelopt/torch/quantization/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,8 @@ def _nvfp4_selective_quant_cfg(

NVFP4_DEFAULT_CFG = _nvfp4_selective_quant_cfg(["*"])

NVFP4_DEFAULT_WEIGHT_ONLY_CFG = _nvfp4_selective_quant_cfg(["*"], weight_only=True)

NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG = {
"quant_cfg": [
*_base_disable_all,
Expand Down Expand Up @@ -816,6 +818,7 @@ def _nvfp4_selective_quant_cfg(
"NVFP4_AWQ_FULL_CFG",
"NVFP4_AWQ_LITE_CFG",
"NVFP4_DEFAULT_CFG",
"NVFP4_DEFAULT_WEIGHT_ONLY_CFG",
"NVFP4_FP8_MHA_CONFIG",
"NVFP4_KV_CFG",
"NVFP4_KV_ROTATE_CFG",
Expand Down
1 change: 1 addition & 0 deletions modelopt/torch/quantization/nn/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from .modules.quant_activations import *
from .modules.quant_batchnorm import *
from .modules.quant_conv import *
from .modules.quant_embedding import *
from .modules.quant_instancenorm import *
from .modules.quant_linear import *
from .modules.quant_module import *
Expand Down
50 changes: 50 additions & 0 deletions modelopt/torch/quantization/nn/modules/quant_embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Quantized Embedding.

``nn.Embedding`` quantization is weight-only: only the lookup table (``weight``) is
fake-quantized. Embedding inputs are integer indices — their ``input_quantizer`` is
registered (so config entries like ``"*input_quantizer"`` can still target it) but is
disabled by default so integer tensors pass through untouched.
"""

import torch.nn as nn

from ... import tensor_quant
from .quant_module import QuantLinearConvBase, QuantModuleRegistry

__all__ = ["QuantEmbedding"]


@QuantModuleRegistry.register({nn.Embedding: "nn.Embedding"})
class _QuantEmbedding(QuantLinearConvBase):
    """Quantized base class for ``nn.Embedding``.

    Weight-only quantization. Input/output quantizers are created (so wildcard configs
    still resolve cleanly) but are disabled — an embedding's input is an index tensor.
    """

    # Default descriptor for the weight (lookup-table) fake-quantizer; recipe
    # quant_cfg entries targeting ``*weight_quantizer`` can still override it.
    default_quant_desc_weight = tensor_quant.QUANT_DESC_8BIT_LINEAR_WEIGHT_PER_ROW

    def _setup(self):
        # Let the base class create input/weight/output quantizers first.
        super()._setup()
        # Embedding inputs are integer indices; never fake-quantize them.
        self.input_quantizer.disable()
        # output_quantizer is already disabled by QuantInputBase._setup().


# Alias to follow the naming convention of QuantLinear.
QuantEmbedding = _QuantEmbedding
Loading