Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
160 changes: 160 additions & 0 deletions examples/llm_ptq/example_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,147 @@ def _resolve_file(filename):
module.__dict__.pop("weight", None)


def _maybe_patch_transformers_nemotron_h_mixer_types() -> None:
    """Patch transformers' Nemotron-H implementation for ``-`` (MLP) blocks.

    transformers 5.5.x ships a Nemotron-H port that is incomplete in three places:

    1. ``NemotronHConfig._pattern_to_list`` maps ``M→mamba``, ``E→moe``, ``*→attention``
       but forgets ``-→mlp``, so merely loading the config of Nemotron-H-v2
       (whose ``hybrid_override_pattern`` contains ``-``) raises ``KeyError: '-'``.
    2. ``PreTrainedConfig.validate_layer_type`` checks ``layer_types`` (aliased to
       ``layers_block_type`` via ``attribute_map``) against a hard-coded
       ``ALLOWED_LAYER_TYPES`` tuple that doesn't include ``"mlp"``, so once (1) is
       fixed the validator rejects the config.
    3. ``MIXER_TYPES`` in ``modeling_nemotron_h`` registers ``mamba``/``attention``/``moe``
       but omits ``mlp`` even though ``NemotronHMLP`` is defined in the same module.
       ``NemotronHBlock`` instantiates mixers as ``cls(config, layer_idx=...)``, which
       ``NemotronHMLP.__init__`` doesn't accept, so we register a thin adapter.

    All patches are idempotent: each one is guarded by a membership check or a
    sentinel attribute, so calling this function repeatedly is a no-op after the
    first successful application. Missing-import paths degrade to doing nothing.
    """
    # Extend ALLOWED_LAYER_TYPES so `validate_layer_type` accepts "mlp".
    # Guarded with getattr/isinstance so a future transformers refactor that
    # renames or retypes the constant leaves us inert instead of crashing.
    try:
        cu = __import__("transformers.configuration_utils", fromlist=["ALLOWED_LAYER_TYPES"])
    except ImportError:
        cu = None
    if cu is not None:
        allowed = getattr(cu, "ALLOWED_LAYER_TYPES", None)
        if isinstance(allowed, tuple) and "mlp" not in allowed:
            cu.ALLOWED_LAYER_TYPES = (*allowed, "mlp")

    # 1) MIXER_TYPES (modeling)
    try:
        mod = __import__(
            "transformers.models.nemotron_h.modeling_nemotron_h",
            fromlist=["MIXER_TYPES", "NemotronHMLP"],
        )
    except ImportError:
        mod = None

    if mod is not None:
        mixer_types = getattr(mod, "MIXER_TYPES", None)
        nemotron_h_mlp = getattr(mod, "NemotronHMLP", None)
        if (
            isinstance(mixer_types, dict)
            and nemotron_h_mlp is not None
            and "mlp" not in mixer_types
        ):
            # ``nemotron_h_mlp`` is resolved at runtime, so use ``types.new_class`` rather
            # than a literal ``class`` statement (keeps mypy happy about dynamic bases).
            import types as _types

            # Adapter: accept (and drop) the ``layer_idx`` kwarg that NemotronHBlock
            # passes to every mixer class but NemotronHMLP.__init__ doesn't take.
            def _mlp_adapter_init(self, config, layer_idx=None, **kwargs):
                nemotron_h_mlp.__init__(self, config, **kwargs)

            _mlp_adapter_cls = _types.new_class(
                "_NemotronHMLPMixerAdapter",
                (nemotron_h_mlp,),
                {},
                lambda ns: ns.update({"__init__": _mlp_adapter_init}),
            )
            mixer_types["mlp"] = _mlp_adapter_cls

        # ``NemotronHModel.forward`` builds an inline ``block_type_to_mask`` dict that
        # only knows about ``{"mamba", "attention", "moe"}`` and KeyErrors on "mlp".
        # ``NemotronHBlock.forward`` routes "mlp"/"moe" through the same ``else`` branch
        # that ignores the attention mask, so aliasing the MLP block's ``block_type``
        # to ``"moe"`` after __init__ makes the mask lookup resolve to ``None`` without
        # affecting mixer dispatch (the mixer instance was already built from
        # ``layers_block_type[layer_idx] == "mlp"`` via MIXER_TYPES).
        nemotron_h_block = getattr(mod, "NemotronHBlock", None)
        if nemotron_h_block is not None and not getattr(
            nemotron_h_block, "_modelopt_mlp_mask_patched", False
        ):
            _orig_init = nemotron_h_block.__init__

            def _patched_init(self, config, layer_idx):
                _orig_init(self, config, layer_idx)
                if getattr(self, "block_type", None) == "mlp":
                    self.block_type = "moe"

            nemotron_h_block.__init__ = _patched_init
            # Sentinel attribute on the class makes this wrap-once (idempotent).
            nemotron_h_block._modelopt_mlp_mask_patched = True

    # 2) NemotronHConfig._pattern_to_list + validate_layers_block_type (configuration).
    try:
        cfg_mod = __import__(
            "transformers.models.nemotron_h.configuration_nemotron_h",
            fromlist=["NemotronHConfig"],
        )
    except ImportError:
        return
    cfg_cls = getattr(cfg_mod, "NemotronHConfig", None)
    if cfg_cls is None or getattr(cfg_cls, "_modelopt_mlp_patched", False):
        return

    _orig_pattern_to_list = cfg_cls._pattern_to_list

    def _patched_pattern_to_list(pattern: str) -> list:
        mapping = {"M": "mamba", "E": "moe", "*": "attention", "-": "mlp"}
        try:
            return [mapping[ch] for ch in pattern]
        except KeyError:
            # Fall back to the stock implementation for any char we didn't add —
            # this lets future transformers releases keep any additional mappings.
            return _orig_pattern_to_list(pattern)

    # Assign via ``staticmethod()`` so the attribute is unbound on the class (matches
    # the original definition) — using the ``@staticmethod`` decorator on a nested
    # function trips mypy's "staticmethod used with a non-method" check.
    cfg_cls._pattern_to_list = staticmethod(_patched_pattern_to_list)

    # Allow "mlp" alongside {"mamba", "attention", "moe"} in validate_layers_block_type.
    # huggingface_hub's @strict_dataclass collects class validators into
    # ``cls.__class_validators__`` at class-creation time, so we have to replace the
    # entry in that list (not just overwrite the method attribute).
    def _patched_validate_layers_block_type(self):
        if not isinstance(self.layers_block_type, list):
            raise ValueError(
                f"`layers_block_type` must be a list of strings. "
                f"Got type: {type(self.layers_block_type)}"
            )
        valid_types = {"mamba", "attention", "moe", "mlp"}
        if not all(block_type in valid_types for block_type in self.layers_block_type):
            invalid = set(self.layers_block_type) - valid_types
            raise ValueError(
                f"`layers_block_type` contains invalid types: {invalid}. "
                f"Must be one of: {valid_types}"
            )

    _patched_validate_layers_block_type.__name__ = "validate_layers_block_type"
    # NOTE(review): this function takes ``self`` but is installed as a staticmethod,
    # so a direct ``config.validate_layers_block_type()`` call would not bind the
    # instance. It presumably is only ever invoked through ``__class_validators__``,
    # which passes the instance explicitly — confirm against huggingface_hub's
    # strict-dataclass call convention.
    cfg_cls.validate_layers_block_type = staticmethod(_patched_validate_layers_block_type)
    class_validators = list(getattr(cfg_cls, "__class_validators__", []))
    for i, v in enumerate(class_validators):
        if getattr(v, "__name__", None) == "validate_layers_block_type":
            class_validators[i] = _patched_validate_layers_block_type
            break
    else:
        class_validators.append(_patched_validate_layers_block_type)
    cfg_cls.__class_validators__ = class_validators
    cfg_cls._modelopt_mlp_patched = True


def get_model(
ckpt_path,
device="cuda",
Expand All @@ -548,6 +689,9 @@ def get_model(
use_seq_device_map=False,
attn_implementation=None,
):
# Needs to run before AutoConfig.from_pretrained so the Nemotron-H config can parse
# the "-" (MLP) character in hybrid_override_pattern.
_maybe_patch_transformers_nemotron_h_mixer_types()
print(f"Initializing model from {ckpt_path}")

device_map = "auto"
Expand Down Expand Up @@ -706,6 +850,22 @@ def has_pack_quantized_config(config):
if device == "cuda" and not is_model_on_gpu(model):
print("Warning: Some parameters are not on a GPU. Calibration can be slow or hit OOM")

# Some model cards ship a generation_config.json that sets sampling hyperparameters
# (top_p, temperature) without ``do_sample=True`` (e.g. NVIDIA-Nemotron-3-Nano-4B-BF16).
# transformers 5.x strictly validates this on save_pretrained, so the export step
# fails with "GenerationConfig is invalid". Normalize by enabling do_sample whenever
# a sampling hyperparameter is set — this is only metadata, not behavior during
# calibration or export.
gen_cfg = getattr(model, "generation_config", None)
if gen_cfg is not None and not getattr(gen_cfg, "do_sample", False):
has_sampling_hyperparam = (
getattr(gen_cfg, "top_p", None) not in (None, 1.0)
or getattr(gen_cfg, "top_k", None) not in (None, 0, 50)
or getattr(gen_cfg, "temperature", None) not in (None, 1.0)
)
if has_sampling_hyperparam:
gen_cfg.do_sample = True
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

cd examples/llm_ptq && head -n 1 example_utils.py && wc -l example_utils.py

Repository: NVIDIA/Model-Optimizer

Length of output: 186


🏁 Script executed:

cd examples/llm_ptq && sed -n '830,880p' example_utils.py

Repository: NVIDIA/Model-Optimizer

Length of output: 2298


🏁 Script executed:

cd examples/llm_ptq && grep -n "def get_model" example_utils.py | head -5

Repository: NVIDIA/Model-Optimizer

Length of output: 84


🏁 Script executed:

cd examples/llm_ptq && rg "get_model\(" hf_ptq.py -A 5 -B 2

Repository: NVIDIA/Model-Optimizer

Length of output: 430


🏁 Script executed:

cd examples/llm_ptq && rg "\.generate\(" hf_ptq.py -B 3 -A 3

Repository: NVIDIA/Model-Optimizer

Length of output: 1010


🏁 Script executed:

cd examples/llm_ptq && sed -n '1,50p' hf_ptq.py | head -30

Repository: NVIDIA/Model-Optimizer

Length of output: 1060


🏁 Script executed:

cd examples/llm_ptq && grep -n "full_model" hf_ptq.py | head -20

Repository: NVIDIA/Model-Optimizer

Length of output: 1080


🏁 Script executed:

cd examples/llm_ptq && sed -n '300,350p' hf_ptq.py

Repository: NVIDIA/Model-Optimizer

Length of output: 1721


🏁 Script executed:

cd examples/llm_ptq && sed -n '334,345p' hf_ptq.py

Repository: NVIDIA/Model-Optimizer

Length of output: 465


🏁 Script executed:

cd examples/llm_ptq && sed -n '350,365p' hf_ptq.py

Repository: NVIDIA/Model-Optimizer

Length of output: 645


🏁 Script executed:

cd examples/llm_ptq && grep -n "generated_ids_before_ptq" hf_ptq.py

Repository: NVIDIA/Model-Optimizer

Length of output: 816


🏁 Script executed:

cd examples/llm_ptq && sed -n '257,270p' hf_ptq.py

Repository: NVIDIA/Model-Optimizer

Length of output: 653


🏁 Script executed:

cd examples/llm_ptq && sed -n '580,600p' hf_ptq.py

Repository: NVIDIA/Model-Optimizer

Length of output: 689


🏁 Script executed:

cd examples/llm_ptq && sed -n '880,925p' hf_ptq.py

Repository: NVIDIA/Model-Optimizer

Length of output: 1892


🏁 Script executed:

cd examples/llm_ptq && sed -n '1100,1125p' hf_ptq.py

Repository: NVIDIA/Model-Optimizer

Length of output: 1108


🏁 Script executed:

cd examples/llm_ptq && sed -n '970,1020p' hf_ptq.py

Repository: NVIDIA/Model-Optimizer

Length of output: 2331


🏁 Script executed:

cd examples/llm_ptq && sed -n '1180,1220p' hf_ptq.py

Repository: NVIDIA/Model-Optimizer

Length of output: 1472


Don't mutate the live generation_config in get_model().

The mutation persists on the returned model object, and both the before-PTQ and after-PTQ preview calls (full_model.generate() at lines 922 and 980 in hf_ptq.py) use that same model instance. For checkpoints with sampling hyperparameters, this makes the previews non-deterministic instead of deterministic, undermining PTQ smoke test comparisons. Normalize a copy during export instead.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@examples/llm_ptq/example_utils.py` around lines 853 - 867, The current code
mutates the live model.generation_config (gen_cfg) which makes the same model
instance used by get_model() non-deterministic; instead, create a copy of the
generation_config (e.g., via copy.deepcopy or by constructing a new
GenerationConfig from the dict) and modify the copy’s do_sample flag, leaving
model.generation_config unchanged; update the export/normalization logic around
gen_cfg to use this gen_cfg_copy (or a temporary variable) so
previews/full_model.generate() remain deterministic and only the exported
metadata contains the normalized setting.


return model


Expand Down
72 changes: 72 additions & 0 deletions examples/llm_ptq/hf_ptq.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ def _set_kv_cache_constant_amax(quant_cfg: list) -> None:
"int4_awq": mtq.INT4_AWQ_CFG,
"w4a8_awq": mtq.W4A8_AWQ_BETA_CFG,
"nvfp4": mtq.NVFP4_DEFAULT_CFG,
"nvfp4_wo": mtq.NVFP4_DEFAULT_WEIGHT_ONLY_CFG,
"nvfp4_awq": mtq.NVFP4_AWQ_LITE_CFG,
"nvfp4_mse": mtq.NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG,
"fp8_pb_wo": mtq.FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG,
Expand Down Expand Up @@ -593,6 +594,59 @@ def sparsity_main(
mts.export(full_model)


def _enable_lm_head_and_embedding_quantization(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we define this in the modelop_recipe if everything modelopt_recipes/models can be captured with our yaml recipe system?

quant_cfg: dict[str, Any],
weight_quantizer_cfg: dict[str, Any],
) -> None:
"""Re-enable quantization of ``lm_head`` and the input embedding table.

ModelOpt's default PTQ recipes exclude ``*lm_head*`` and never touch ``nn.Embedding``
because most LLM deployment runtimes keep those layers at full precision. For Nemotron-H
(and similar SSM+Attention hybrids) the embedding and lm_head are a large fraction of the
total parameters — quantizing them recovers most of the promised memory savings. This
helper appends two entries to the cfg list that override earlier ``*lm_head*`` disables
and explicitly target the embedding weight quantizer.

Args:
quant_cfg: the primary quant_cfg dict (``{"quant_cfg": [...], "algorithm": ...}``).
weight_quantizer_cfg: the weight-quantizer attribute dict to apply (e.g. ``_nvfp4_cfg``).
"""
# Ordering matters: these entries must come AFTER the _default_disabled_quantizer_cfg
# entries (which set *lm_head* → disabled) so they take effect.
quant_cfg["quant_cfg"].append(
{"quantizer_name": "*lm_head*weight_quantizer", "cfg": copy.deepcopy(weight_quantizer_cfg)}
)
# nn.Embedding quantizers only exist once `quant_embedding.py` registers the class.
# Nemotron-H's backbone attribute name differs between the remote-code ("backbone.embeddings")
# and transformers built-in ("model.embeddings") paths; both are weight-only vocab
# embeddings here. The broad "*embeddings*" wildcard covers both and does not match
# any other layer in a Nemotron-H model (no positional/rotary embeddings exist).
quant_cfg["quant_cfg"].append(
{
"quantizer_name": "*embeddings*weight_quantizer",
"cfg": copy.deepcopy(weight_quantizer_cfg),
}
)
# Also keep the standard HF "embed_tokens" naming in case future Nemotron-H variants
# rename the attribute.
quant_cfg["quant_cfg"].append(
{
"quantizer_name": "*embed_tokens*weight_quantizer",
"cfg": copy.deepcopy(weight_quantizer_cfg),
}
)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated


def _extract_weight_quantizer_cfg(quant_cfg: dict[str, Any]) -> dict[str, Any] | None:
"""Return the first ``*weight_quantizer`` cfg dict from an ordered quant_cfg list."""
for entry in quant_cfg.get("quant_cfg", []):
if entry.get("quantizer_name") == "*weight_quantizer" and isinstance(
entry.get("cfg"), dict
):
return entry["cfg"]
return None


def mono_quantize(
args: argparse.Namespace,
quant_cfg: dict[str, Any],
Expand Down Expand Up @@ -629,6 +683,24 @@ def mono_quantize(
) # Nemotron-Parse specific
print("Quantization will only be applied to the decoder (text generation) component")

# For Nemotron-H (Mamba-2 + MLP + Attention hybrid, e.g. NVIDIA-Nemotron-3-Nano-4B),
# extend quantization coverage to the lm_head and the input token embedding. On this
# architecture those two 131072x3136 tables account for ~21% of parameters, so leaving
# them at bf16 wastes most of the NVFP4 memory benefit.
if model_type == "nemotron_h":
weight_quantizer_cfg = _extract_weight_quantizer_cfg(quant_cfg)
if weight_quantizer_cfg is not None:
print(
"Nemotron-H detected: extending quantization to lm_head and input embedding "
"(backbone.embeddings)."
)
_enable_lm_head_and_embedding_quantization(quant_cfg, weight_quantizer_cfg)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated
else:
warnings.warn(
"Nemotron-H detected but quant_cfg has no wildcard '*weight_quantizer' entry; "
"skipping lm_head/embedding extension (model-specific or non-standard recipe)."
)

if not model_is_already_quantized or calibration_only:
# quantize the model

Expand Down
3 changes: 2 additions & 1 deletion modelopt/torch/export/unified_export_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
from .layer_utils import (
get_expert_linear_names,
get_experts_list,
is_embedding,
is_layernorm,
is_moe,
is_quantlinear,
Expand Down Expand Up @@ -650,7 +651,7 @@ def _process_quantized_modules(
# Skip QuantMoELinear - it's handled separately in _reconstruct_fused_moe_linear
if type(sub_module).__name__ == "QuantMoELinear":
continue
if is_quantlinear(sub_module):
if is_quantlinear(sub_module) or is_embedding(sub_module):
try:
with fsdp2_aware_weight_update(model, sub_module, reshard=False):
_export_quantized_weight(sub_module, dtype)
Expand Down
3 changes: 3 additions & 0 deletions modelopt/torch/quantization/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,8 @@ def _nvfp4_selective_quant_cfg(

NVFP4_DEFAULT_CFG = _nvfp4_selective_quant_cfg(["*"])

NVFP4_DEFAULT_WEIGHT_ONLY_CFG = _nvfp4_selective_quant_cfg(["*"], weight_only=True)

NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG = {
"quant_cfg": [
*_base_disable_all,
Expand Down Expand Up @@ -816,6 +818,7 @@ def _nvfp4_selective_quant_cfg(
"NVFP4_AWQ_FULL_CFG",
"NVFP4_AWQ_LITE_CFG",
"NVFP4_DEFAULT_CFG",
"NVFP4_DEFAULT_WEIGHT_ONLY_CFG",
"NVFP4_FP8_MHA_CONFIG",
"NVFP4_KV_CFG",
"NVFP4_KV_ROTATE_CFG",
Expand Down
1 change: 1 addition & 0 deletions modelopt/torch/quantization/nn/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from .modules.quant_activations import *
from .modules.quant_batchnorm import *
from .modules.quant_conv import *
from .modules.quant_embedding import *
from .modules.quant_instancenorm import *
from .modules.quant_linear import *
from .modules.quant_module import *
Expand Down
50 changes: 50 additions & 0 deletions modelopt/torch/quantization/nn/modules/quant_embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Quantized Embedding.

``nn.Embedding`` quantization is weight-only: only the lookup table (``weight``) is
fake-quantized. Embedding inputs are integer indices — their ``input_quantizer`` is
registered (so config entries like ``"*input_quantizer"`` can still target it) but is
disabled by default so integer tensors pass through untouched.
"""

import torch.nn as nn

from ... import tensor_quant
from .quant_module import QuantLinearConvBase, QuantModuleRegistry

__all__ = ["QuantEmbedding"]


@QuantModuleRegistry.register({nn.Embedding: "nn.Embedding"})
class _QuantEmbedding(QuantLinearConvBase):
    """Quantized base class for ``nn.Embedding``.

    Weight-only quantization. Input/output quantizers are created (so wildcard configs
    still resolve cleanly) but are disabled — an embedding's input is an index tensor.
    """

    # Default descriptor for the weight (lookup-table) fake-quantizer; recipe
    # quant_cfg entries targeting ``*weight_quantizer`` can still override it.
    default_quant_desc_weight = tensor_quant.QUANT_DESC_8BIT_LINEAR_WEIGHT_PER_ROW

    def _setup(self):
        # Let the base class create input/weight/output quantizers first.
        super()._setup()
        # Embedding inputs are integer indices; never fake-quantize them.
        self.input_quantizer.disable()
        # output_quantizer is already disabled by QuantInputBase._setup().


# Alias to follow the naming convention of QuantLinear.
QuantEmbedding = _QuantEmbedding
Loading