diff --git a/.claude/skills/release-cherry-pick/SKILL.md b/.claude/skills/release-cherry-pick/SKILL.md new file mode 100644 index 0000000000..c0fe608b16 --- /dev/null +++ b/.claude/skills/release-cherry-pick/SKILL.md @@ -0,0 +1,89 @@ +--- +name: release-cherry-pick +description: Cherry-pick merged PRs labeled for a release branch into that branch, then open a PR and apply the cherry-pick-done label. Use when asked to "cherry-pick PRs for release/X.Y.Z", "pick PRs to release branch", or "cherry-pick labeled PRs". +--- + +# Cherry-pick PRs to a Release Branch + +Cherry-pick all merged `main` PRs labeled `cherry-pick-<version>` (but not `cherry-pick-done`) into the corresponding `release/<version>` branch, one by one in merge order. + +## Step 1 — Identify the target version + +Ask the user for the release version (e.g. `0.44.0`) if not already provided. + +Set `VERSION=<version>` for use in subsequent steps. + +## Step 2 — Fetch pending PRs + +Use the GitHub search API to list PRs that have the cherry-pick label but not cherry-pick-done, sorted by merge date ascending: + +```bash +gh api "search/issues?q=repo:NVIDIA/Model-Optimizer+is:pr+is:merged+base:main+label:cherry-pick-<version>+-label:cherry-pick-done&sort=updated&order=asc&per_page=50" \ + --jq '.items[] | [.number, .title, .pull_request.merged_at] | @tsv' \ + | sort -t$'\t' -k3 +``` + +Present the list to the user before proceeding. + +## Step 3 — Set up the release branch + +Check out `release/<version>`, creating it from the remote if it doesn't exist locally: + +```bash +git fetch origin release/<version> +git checkout release/<version> +``` + +## Step 4 — Get merge commit SHAs + +All PRs are squash-merged, so each has a single-parent commit. Retrieve the SHA for each PR: + +```bash +gh pr view <pr-number> --repo NVIDIA/Model-Optimizer --json mergeCommit --jq '.mergeCommit.oid' +``` + +## Step 5 — Cherry-pick in merge order + +Cherry-pick each commit with `-s` (DCO sign-off). GPG signing is handled automatically by the repo's git config. + +```bash +git cherry-pick -s <merge-commit-sha> +``` + +**On conflict:** Tell the user which PR caused the conflict and ask them to fix it, then continue: + +```bash +git cherry-pick --continue +``` + +## Step 6 — Create a PR to the release branch + +Push the cherry-picks to a new branch and open a PR targeting `release/<version>`. The PR title lists every cherry-picked PR number. The body uses `## Cherry-picked PRs` as the only heading with one `- #<PR number>` bullet per PR — no titles, no links, no extra text. + +```bash +git checkout -B cherry-picks/release-<version> +git push -u origin cherry-picks/release-<version> + +gh pr create \ + --title "[Cherry-pick] PRs #<num1> #<num2> ..." \ + --base release/<version> \ + --head cherry-picks/release-<version> \ + --body "$(cat <<'EOF' +## Cherry-picked PRs + +- #<num1> +- #<num2> +... 
+EOF +)" +``` + +## Step 7 — Apply cherry-pick-done label + +Add the `cherry-pick-done` label to every PR that was successfully cherry-picked: + +```bash +for pr in ...; do + gh pr edit $pr --repo NVIDIA/Model-Optimizer --add-label "cherry-pick-done" +done +``` diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml index 628aead7ee..ec378ff3ea 100644 --- a/.github/workflows/gpu_tests.yml +++ b/.github/workflows/gpu_tests.yml @@ -39,7 +39,7 @@ jobs: matrix: include: - example: gpu - timeout: 60 + timeout: 75 container_image: pytorch:26.01-py3 # tests/gpu/_extensions/test_onnx_extensions.py fails for newer containers until https://github.com/tbenthompson/cppimport/pull/98 - example: gpu_megatron diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 9832f0cc60..e0933babf6 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -99,6 +99,7 @@ jobs: - {nox_session: "unit-3.10(torch_211, tf_latest)", python_version: "3.10"} - {nox_session: "unit-3.11(torch_211, tf_latest)", python_version: "3.11"} - {nox_session: "unit-3.13(torch_211, tf_latest)", python_version: "3.13"} + - {nox_session: "unit-3.14(torch_211, tf_latest)", python_version: "3.14"} - {nox_session: "unit-3.12(torch_28, tf_latest)", python_version: "3.12"} - {nox_session: "unit-3.12(torch_29, tf_latest)", python_version: "3.12"} - {nox_session: "unit-3.12(torch_210, tf_latest)", python_version: "3.12"} diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 2b595515f5..7c08a9e0d3 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,7 @@ Changelog **New Features** - Support full Transformer Engine spec for Minitron pruning (``mcore_minitron``). Now we no longer need to use custom ModelOpt spec. Note that this does not affect the usage of the pruning workflow but makes pruning slightly faster and may result in slightly different pruned model because of different kernel and numerics. +- Add end-to-end tutorial for Minitron pruning + distillation + quantization + evaluation + vLLM deployment for Nemotron-Nano-9B-v2 → Pruned 7B along with data blend preparation steps (and ablation study). See `examples/pruning/minitron/README.md `_ for details. - Add Puzzletron - a new algorithm for heterogeneous pruning of LLM and VLM models. See `examples/puzzletron/README.md `_ for more details. - Added iterator interface using CalibrationDataReader in ONNX quantization workflow. - Add N:M sparse softmax support to the Triton flash attention kernel (``modelopt.torch.kernels.triton_fa``). See `examples/llm_sparsity/attention_sparsity/README.md `_ for usage. @@ -17,6 +18,7 @@ Changelog - [Early Testing] Add Claude Code PTQ skill (``.claude/skills/ptq/``) for agent-assisted post-training quantization. The skill guides the agent through environment detection, model support checking, format selection, and execution via the launcher or manual SLURM/Docker/bare GPU paths. Includes handling for unlisted models with custom module patching. This feature is in early testing — use with caution. - Add performant layerwise calibration for large models that don't fit on GPU (e.g. DeepSeek-R1, Kimi-K2). See `modelopt_recipes/general/ptq/nvfp4_experts_only-fp8_kv.yaml `_ for usage. Layerwise calibration also supports PTQ with intermediate progress saving — useful when long PTQ runs get hit with Slurm timeouts. See `modelopt_recipes/general/ptq/nvfp4_default-none_kv_gptq.yaml `_ for usage. 
- Add implicit GEMM CUDA kernel for Conv3D with fused NVFP4 fake quantization (``modelopt.torch.quantization.src.conv``). When NVFP4 quantization is applied to an ``nn.Conv3d`` layer via ModelOpt PTQ, the implicit GEMM path is used automatically instead of cuDNN. Uses BF16 WMMA tensor cores (SM80+) with FP32 accumulation and in-kernel FP4 (E2M1) activation quantization. Grouped convolution (``groups > 1``) falls back to the default cuDNN path. Inference only — training mode falls back to cuDNN with a warning. +- Add FP8 MHA quantization support for vision transformers. Adds an attention-aware ONNX post-processing pass (scale Mul / K-transpose move before Q, Q→DQ insertion on softmax output) in :class:`FP8QuantExporter `, per-instance nested-attention-wrapper skipping in the HF plugin, and ``nn.LayerNorm`` registration in ``QuantModuleRegistry`` so BMM input quantizers and LayerNorm output quantizers defined in FP8_DEFAULT_CFG are honored end-to-end. See `examples/torch_onnx/torch_quant_to_onnx.py `_ for the general timm-model quantize→ONNX workflow. **Backward Breaking Changes** @@ -29,6 +31,7 @@ Changelog - Fix Minitron pruning (``mcore_minitron``) for MoE models. Importance estimation hooks were incorrectly registered for MoE modules and NAS step was hanging before this. - Fix TRT support for remote autotuning in ONNX Autotune from 10.16+ to 10.15+ and fix TRT versioning check to the ``trtexec`` version instead of the TRT Python API when using ``trtexec`` backend. - Exclude MatMul/Gemm nodes with K or N < 16 from ONNX INT8 and FP8 quantization. Such small-dimension GEMMs cannot efficiently use INT8/FP8 Tensor Cores and the added Q/DQ layers cause perf regressions in TensorRT. Honors Gemm ``transB`` when deriving K. +- Fix ``nvfp4_awq`` export ``AssertionError: Modules have different quantization formats`` for MoE models (e.g. Qwen3-30B-A3B) when some experts are not exercised by the calibration data. ``awq_lite`` now applies a neutral all-ones ``pre_quant_scale`` to any expert that ends up disabled (no cache-pass tokens, NaN scales, or no search-pass tokens) so its format remains ``nvfp4_awq``, consistent with the rest of the MoE block. A warning is emitted whenever this fallback fires. **Misc** @@ -36,6 +39,7 @@ Changelog - Bump minimum required PyTorch version to 2.8. - [Experimental] Add support for transformers>=5.0, including generic PTQ and unified HF checkpoint export for fused MoE expert modules (Mixtral, Qwen2-MoE, Qwen3-MoE, Qwen3.5-MoE, DeepSeek-V3, Jamba, OLMoE, etc.). - Improve ``megatron_preprocess_data``: add ``--reasoning_content`` support for Nemotron v3 datasets, eliminate intermediate JSONL for HuggingFace datasets, return output file prefixes from the Python API, add gzip input support (``.jsonl.gz``), add ``--strip_newlines`` flag for plain-text pretraining data, add ``--hf_streaming`` for very large datasets (only consumed rows downloaded), and auto-shuffle when ``--hf_max_samples_per_split`` is set to avoid biased sampling. +- Add installation support for Python 3.14. Only basic unit tests are verified for now. Production usage still defaults to Python 3.12. Python 3.10 support will be dropped in the next release. 
0.43 (2026-04-16) ^^^^^^^^^^^^^^^^^ diff --git a/docs/source/getting_started/_installation_for_Linux.rst b/docs/source/getting_started/_installation_for_Linux.rst index a18b45ee7c..1c3f17fc0f 100644 --- a/docs/source/getting_started/_installation_for_Linux.rst +++ b/docs/source/getting_started/_installation_for_Linux.rst @@ -12,7 +12,7 @@ Latest Model Optimizer (``nvidia-modelopt``) currently has the following system +-------------------------+-----------------------------+ | Architecture | x86_64, aarch64 (SBSA) | +-------------------------+-----------------------------+ -| Python | >=3.10,<3.14 | +| Python | >=3.10,<3.15 | +-------------------------+-----------------------------+ | CUDA | 12.x, 13.x | +-------------------------+-----------------------------+ diff --git a/docs/source/getting_started/windows/_installation_standalone.rst b/docs/source/getting_started/windows/_installation_standalone.rst index 500b480e12..1fd1c3fca5 100644 --- a/docs/source/getting_started/windows/_installation_standalone.rst +++ b/docs/source/getting_started/windows/_installation_standalone.rst @@ -64,6 +64,22 @@ If you need to use any other EP for calibration, you can uninstall the existing By default, ModelOpt-Windows utilizes the `cupy-cuda12x `_ tool for GPU acceleration during the INT4 ONNX quantization process. This is compatible with CUDA 12.x. +If you are using CUDA 13.x, update CUDA-dependent packages manually: + +For official ONNX Runtime guidance, see `Nightly builds for CUDA 13.x `_. + +1. Uninstall ``cupy-cuda12x`` and install ``cupy-cuda13x``. +2. Uninstall ``onnxruntime-genai-cuda`` and ``onnxruntime-gpu``. +3. Install ONNX Runtime CUDA 13 nightly and the pre-release ``onnxruntime-genai-cuda`` package. + +.. code-block:: bash + + pip uninstall -y cupy-cuda12x onnxruntime-genai-cuda onnxruntime-gpu + pip install cupy-cuda13x + pip install coloredlogs flatbuffers numpy packaging protobuf sympy + pip install --pre --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ort-cuda-13-nightly/pypi/simple/ onnxruntime-gpu + pip install --pre onnxruntime-genai-cuda + **6. Verify Installation** Ensure the following steps are verified: diff --git a/examples/dataset/MEGATRON_DATA_PREP.md b/examples/dataset/MEGATRON_DATA_PREP.md new file mode 100644 index 0000000000..c3904d2a0f --- /dev/null +++ b/examples/dataset/MEGATRON_DATA_PREP.md @@ -0,0 +1,242 @@ +# Tokenizing for Megatron Frameworks + +| **Section** | **Description** | **Link** | +| :---: | :---: | :---: | +| From JSONL files | Tokenize local JSONL files | \[[Link](#from-jsonl-files)\] | +| From Hugging Face Hub | Stream or download HF datasets and tokenize | \[[Link](#from-hugging-face-hub)\] | +| `reasoning_content` for Post-Training v3 | Control how chain-of-thought traces are handled | \[[Link](#reasoning_content-for-post-training-v3-datasets)\] | +| Nemotron Pre/Post-Training Datasets | Ready-to-run commands for all Nemotron datasets | \[[Link](#ready-to-run-tokenization-commands)\] | + +The distillation and pre-training scripts in Megatron-Bridge or Megatron-LM expect data pre-tokenized in Megatron's binary indexed format (`.bin` / `.idx`). +Use the `megatron_preprocess_data` utility to tokenize any JSONL or Hugging Face dataset. +The tokenization scripts below print the list of output prefixes (e.g. `tokenized_qwen3/data1_text`) that you can use for the `data_paths` argument (with relative weights on different files) in Megatron training scripts. 
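For illustration, a weighted `data_paths` value is a flat list of weight/prefix pairs built from those printed prefixes. A minimal sketch (the prefixes are the example names from this paragraph and the weights are made up):

```bash
# Hypothetical blend: sample data1 and data2 with relative weights 30 and 70.
# Prefixes are exactly what the tokenization script prints (no .bin/.idx suffix).
DATA_PATHS="30 tokenized_qwen3/data1_text 70 tokenized_qwen3/data2_text"
```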
+ +**Important Notes:** + +- For Pretraining / raw-text data (`text` key) — use `--append_eod` so Megatron can tell where documents end when concatenating them into long sequences. +- For Post-training chat data (`messages` key) — omit `--append_eod`; the chat template already appends EOS at the end of each conversation. +- Set `--max_sequence_length 256_000` to avoid rare OOM errors if some text is very long. + +## From JSONL files + +```bash +python -m modelopt.torch.utils.plugins.megatron_preprocess_data \ + --jsonl_paths /path/to/data1.jsonl /path/to/data2.jsonl ... \ + --json_keys text \ + --tokenizer Qwen/Qwen3-0.6B \ + --output_dir tokenized_qwen3 \ + --workers 32 \ + --append_eod +``` + +```bash +python -m modelopt.torch.utils.plugins.megatron_preprocess_data \ + --jsonl_paths /path/to/sft_data.jsonl \ + --json_keys messages \ + --tokenizer Qwen/Qwen3-0.6B \ + --output_dir tokenized_qwen3 \ + --workers 32 +``` + +Instead of `--jsonl_paths`, pass `--input_dir /path/to/dir` to tokenize all JSONL files in a directory (`.jsonl` and `.jsonl.gz` are both supported). + +## From Hugging Face Hub + +To tokenize a dataset directly from Hugging Face Hub: + +```bash +python -m modelopt.torch.utils.plugins.megatron_preprocess_data \ + --hf_dataset nvidia/Nemotron-Pretraining-SFT-v1 \ + --hf_name Nemotron-SFT-Code \ + --hf_split train \ + --hf_max_samples_per_split 10_000_000 \ + --json_keys text \ + --tokenizer Qwen/Qwen3-0.6B \ + --output_dir tokenized_qwen3 \ + --workers 32 \ + --append_eod +``` + +Omit `--hf_name` to process all subsets, `--hf_split` for all splits, or `--hf_max_samples_per_split` for all samples. +To quickly test, use [nvidia/Nemotron-Pretraining-Dataset-sample](https://huggingface.co/datasets/nvidia/Nemotron-Pretraining-Dataset-sample). + +For very large datasets (tens of millions of documents), or datasets with complex nested message schemas (e.g. `tool_calls`, `function_call` fields) that cause Arrow type-cast errors in non-streaming mode, add `--hf_streaming` to avoid downloading the full dataset — only the rows actually consumed are fetched. Optionally pair with `--hf_max_samples_per_split ` to cap the row count; without it streaming still works but re-downloads on every run with no disk cache. + +> **Performance note:** Non-streaming mode downloads all Parquet shards once and caches them as Arrow files on disk. +> Re-runs read from cache and are much faster. +> Streaming re-downloads on every run with no cache, so it is slower for full-dataset processing. + +## `reasoning_content` for Post-Training v3 Datasets + +v3 datasets include a `reasoning_content` field in assistant messages (chain-of-thought separate from +the final answer). Use `--reasoning_content` to control how it is handled: + +| Value | Behaviour | +| --- | --- | +| `strip` (default) | Field is discarded before `apply_chat_template`. Safe for any tokenizer. | +| `inline` | Wrapped as `` and prepended to `content`. Preserves reasoning in a tokenizer-agnostic way. | +| `native` | Passed unchanged. Requires the tokenizer's chat template to handle the field (e.g. Qwen3). 
| + +```bash +python -m modelopt.torch.utils.plugins.megatron_preprocess_data \ + --hf_dataset nvidia/Nemotron-Math-v2 \ + --hf_split high_part00 \ + --json_keys messages \ + --tokenizer nvidia/NVIDIA-Nemotron-Nano-9B-v2 \ + --output_dir tokenized_nemotron_v2 \ + --workers 32 \ + --reasoning_content inline +``` + +--- + +## Ready-to-run tokenization commands + +Tokenization commands for all Nemotron Pre-Training and Post-Training datasets used in Megatron-Bridge distillation experiments. + +Two parameters vary by model — set them before running the commands below: + +```bash +TOKENIZER=nvidia/NVIDIA-Nemotron-Nano-9B-v2 # HuggingFace tokenizer (or local path) +OUTPUT_DIR=tokenized_nemotron_v2 # Output directory for tokenized files +``` + +> [!TIP] +> Token count for a `.bin` file = file size in bytes ÷ 4. This is also printed by the tokenization script on completion. + +> [!NOTE] +> Tokenizing each of the datasets below will take anywhere between 10 minutes to few hours. You can tokenize all in parallel to speed up the process. +> +> You may tokenize more datasets or skip some datasets depending on your needs. + +### Nemotron Pretraining dataset + +**[nvidia/Nemotron-Pretraining-SFT-v1](https://huggingface.co/datasets/nvidia/Nemotron-Pretraining-SFT-v1)** — raw text; omitting `--hf_name` tokenizes all 3 subsets (Code, General, MATH) in one command, producing a separate output file per subset named after each: + +```bash +python -m modelopt.torch.utils.plugins.megatron_preprocess_data \ + --hf_dataset nvidia/Nemotron-Pretraining-SFT-v1 \ + --hf_split train \ + --hf_streaming \ + --hf_max_samples_per_split 10_000_000 \ + --json_keys text \ + --tokenizer ${TOKENIZER} \ + --output_dir ${OUTPUT_DIR} \ + --workers 96 \ + --max_sequence_length 256_000 \ + --append_eod \ + --strip_newlines +``` + +--- + +### Nemotron Post-training v1 dataset + +**[nvidia/Nemotron-Post-Training-Dataset-v1](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v1)** — STEM subset, capped at 5M samples. v1 data does not contain reasoning traces: + +```bash +python -m modelopt.torch.utils.plugins.megatron_preprocess_data \ + --hf_dataset nvidia/Nemotron-Post-Training-Dataset-v1 \ + --hf_name default \ + --hf_split stem \ + --hf_streaming \ + --hf_max_samples_per_split 5_000_000 \ + --json_keys messages \ + --tokenizer ${TOKENIZER} \ + --output_dir ${OUTPUT_DIR} \ + --workers 96 \ + --max_sequence_length 256_000 +``` + +--- + +### Nemotron Post-training v3 collection + +Datasets below are from the [Nemotron Post-Training v3 collection](https://huggingface.co/collections/nvidia/nemotron-post-training-v3). All use `--reasoning_content inline` to preserve `` traces. The collection contains many more datasets — if you care about benchmarks not covered here (e.g. multilingual, agentic/tool use, SWE, safety), pick the relevant datasets from the collection and tokenize them the same way. 
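For example, a tool-calling dataset from the collection could be tokenized with the same pattern. This is an illustrative sketch only — the dataset name is taken from the capability table later in this repo, and its actual subsets/splits and storage format (HF dataset vs. raw JSONL) should be checked on the dataset card first:

```bash
# Illustrative only — verify the dataset's subsets/splits before running.
python -m modelopt.torch.utils.plugins.megatron_preprocess_data \
  --hf_dataset nvidia/Nemotron-SFT-Tool-Call-v1 \
  --json_keys messages \
  --tokenizer ${TOKENIZER} \
  --output_dir ${OUTPUT_DIR} \
  --workers 96 \
  --max_sequence_length 256_000 \
  --reasoning_content inline
```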
+ +**[nvidia/Nemotron-Math-v2](https://huggingface.co/datasets/nvidia/Nemotron-Math-v2)** — tokenize `high_part00` and `high_part01` separately: + +```bash +for SPLIT in high_part00 high_part01; do + python -m modelopt.torch.utils.plugins.megatron_preprocess_data \ + --hf_dataset nvidia/Nemotron-Math-v2 \ + --hf_split ${SPLIT} \ + --json_keys messages \ + --tokenizer ${TOKENIZER} \ + --output_dir ${OUTPUT_DIR} \ + --workers 96 \ + --max_sequence_length 256_000 \ + --reasoning_content inline +done +``` + +**[nvidia/Nemotron-SFT-Competitive-Programming-v2](https://huggingface.co/datasets/nvidia/Nemotron-SFT-Competitive-Programming-v2)** — stored as raw JSONL on HuggingFace, download before tokenizing: + +```bash +hf download nvidia/Nemotron-SFT-Competitive-Programming-v2 \ + --repo-type dataset \ + --local-dir datasets/Nemotron-SFT-Competitive-Programming-v2/ +for FILE in competitive_programming_python_00 competitive_programming_cpp_00; do + python -m modelopt.torch.utils.plugins.megatron_preprocess_data \ + --jsonl_paths datasets/Nemotron-SFT-Competitive-Programming-v2/data/${FILE}.jsonl \ + --json_keys messages \ + --tokenizer ${TOKENIZER} \ + --output_dir ${OUTPUT_DIR} \ + --workers 96 \ + --max_sequence_length 256_000 \ + --reasoning_content inline +done +``` + +**[nvidia/Nemotron-Science-v1](https://huggingface.co/datasets/nvidia/Nemotron-Science-v1)** — stored as raw JSONL on HuggingFace, download before tokenizing: + +```bash +hf download nvidia/Nemotron-Science-v1 \ + --repo-type dataset \ + --local-dir datasets/Nemotron-Science-v1/ +python -m modelopt.torch.utils.plugins.megatron_preprocess_data \ + --input_dir datasets/Nemotron-Science-v1/data/ \ + --json_keys messages \ + --tokenizer ${TOKENIZER} \ + --output_dir ${OUTPUT_DIR} \ + --workers 96 \ + --max_sequence_length 256_000 \ + --reasoning_content inline +``` + +**[nvidia/Nemotron-SFT-Instruction-Following-Chat-v2](https://huggingface.co/datasets/nvidia/Nemotron-SFT-Instruction-Following-Chat-v2)** — stored as raw JSONL on HuggingFace, download before tokenizing: + +```bash +hf download nvidia/Nemotron-SFT-Instruction-Following-Chat-v2 \ + --repo-type dataset \ + --local-dir datasets/Nemotron-SFT-Instruction-Following-Chat-v2/ +python -m modelopt.torch.utils.plugins.megatron_preprocess_data \ + --input_dir datasets/Nemotron-SFT-Instruction-Following-Chat-v2/data/ \ + --json_keys messages \ + --tokenizer ${TOKENIZER} \ + --output_dir ${OUTPUT_DIR} \ + --workers 96 \ + --max_sequence_length 256_000 \ + --reasoning_content inline +``` + +--- + +### Expected output + +After running all commands above, `${OUTPUT_DIR}/` should contain the following `.bin` / `.idx` file pairs: + +```text +nvidia--Nemotron-Pretraining-SFT-v1_Nemotron-SFT-Code_train_text_max10000000.{bin,idx} +nvidia--Nemotron-Pretraining-SFT-v1_Nemotron-SFT-General_train_text_max10000000.{bin,idx} +nvidia--Nemotron-Pretraining-SFT-v1_Nemotron-SFT-MATH_train_text_max10000000.{bin,idx} +nvidia--Nemotron-Post-Training-Dataset-v1_default_stem_messages_max5000000.{bin,idx} +nvidia--Nemotron-Math-v2_default_high_part00_messages.{bin,idx} +nvidia--Nemotron-Math-v2_default_high_part01_messages.{bin,idx} +competitive_programming_python_00_messages.{bin,idx} +competitive_programming_cpp_00_messages.{bin,idx} +MCQ_messages.{bin,idx} +RQA_messages.{bin,idx} +reasoning_off_messages.{bin,idx} +reasoning_on_messages.{bin,idx} +``` diff --git a/examples/dataset/README.md b/examples/dataset/README.md index 15cb21613c..d073237cf6 100644 --- a/examples/dataset/README.md +++ 
b/examples/dataset/README.md @@ -5,7 +5,7 @@ | **Section** | **Description** | **Link** | | :------------: | :------------: | :------------: | | Building Chat Datasets | Scripts to build conversation datasets from Nemotron and other HuggingFace sources | \[[Link](#building-chat-datasets)\] | -| Tokenizing for Megatron Frameworks | Convert JSONL or HF datasets to Megatron binary format for distillation and pre-training | \[[Link](#tokenizing-for-megatron-frameworks)\] | +| Tokenizing for Megatron Frameworks | Convert JSONL or HF datasets to Megatron binary format for distillation and pre-training | \[[Link](MEGATRON_DATA_PREP.md)\] | @@ -140,85 +140,7 @@ In `generate` mode, assistant turns are stripped so the row ends with a user tur ## Tokenizing for Megatron Frameworks -The distillation and pre-training scripts in Megatron-Bridge or Megatron-LM expect data pre-tokenized in Megatron's binary indexed format (`.bin` / `.idx`). -Use the `megatron_preprocess_data` utility to tokenize any JSONL or Hugging Face dataset. -The tokenization scripts below prints the list of output prefixes (e.g. `tokenized_qwen3/data1_text`) that you can use for the `data_paths` argument (with relative weights on different files) in Megatron training scripts. - -**Important Notes:** - -- For Pretraining / raw-text data (`text` key) — use `--append_eod` so Megatron can tell where documents end when concatenating them into long sequences. -- For Post-training chat data (`messages` key) — omit `--append_eod`; the chat template already appends EOS at the end of each conversation. -- Set `--max_sequence_length 256_000` to avoid rare OOM errors if some text is very long. - -### From JSONL files - -```bash -python -m modelopt.torch.utils.plugins.megatron_preprocess_data \ - --jsonl_paths /path/to/data1.jsonl /path/to/data2.jsonl ... \ - --json_keys text \ - --tokenizer Qwen/Qwen3-0.6B \ - --output_dir tokenized_qwen3 \ - --workers 32 \ - --append_eod -``` - -```bash -python -m modelopt.torch.utils.plugins.megatron_preprocess_data \ - --jsonl_paths /path/to/sft_data.jsonl \ - --json_keys messages \ - --tokenizer Qwen/Qwen3-0.6B \ - --output_dir tokenized_qwen3 \ - --workers 32 -``` - -Instead of `--jsonl_paths`, pass `--input_dir /path/to/dir` to tokenize all JSONL files in a directory (`.jsonl` and `.jsonl.gz` are both supported). - -### From Hugging Face Hub - -To tokenize a dataset directly from Hugging Face Hub: - -```bash -python -m modelopt.torch.utils.plugins.megatron_preprocess_data \ - --hf_dataset nvidia/Nemotron-Pretraining-SFT-v1 \ - --hf_name Nemotron-SFT-Code \ - --hf_split train \ - --hf_max_samples_per_split 10_000_000 \ - --json_keys text \ - --tokenizer Qwen/Qwen3-0.6B \ - --output_dir tokenized_qwen3 \ - --workers 32 \ - --append_eod -``` - -Omit `--hf_name` to process all subsets, `--hf_split` for all splits, or `--hf_max_samples_per_split` for all samples. -To quickly test, use [nvidia/Nemotron-Pretraining-Dataset-sample](https://huggingface.co/datasets/nvidia/Nemotron-Pretraining-Dataset-sample). - -For **very large datasets** (tens of millions of documents), add `--hf_streaming --hf_max_samples_per_split ` to avoid downloading the full dataset — only the rows actually consumed are fetched. - -> **Performance note:** Non-streaming mode downloads all Parquet shards once and caches them as Arrow files on disk. -> Re-runs read from cache and are much faster. -> Streaming re-downloads on every run with no cache, so it is slower for full-dataset processing. 
- -### Nemotron Post-Training v3 (`reasoning_content`) - -v3 datasets include a `reasoning_content` field in assistant messages (chain-of-thought separate from -the final answer). Use `--reasoning_content` to control how it is handled: - -| Value | Behaviour | -| --- | --- | -| `strip` (default) | Field is discarded before `apply_chat_template`. Safe for any tokenizer. | -| `inline` | Wrapped as `` and prepended to `content`. Preserves reasoning in a tokenizer-agnostic way. | -| `native` | Passed unchanged. Requires the tokenizer's chat template to handle the field (e.g. Qwen3). | - -```bash -python -m modelopt.torch.utils.plugins.megatron_preprocess_data \ - --hf_dataset nvidia/Nemotron-Post-Training-Dataset-v3 \ - --json_keys messages \ - --tokenizer Qwen/Qwen3-0.6B \ - --output_dir tokenized_qwen3 \ - --workers 32 \ - --reasoning_content inline -``` +See **[MEGATRON_DATA_PREP.md](MEGATRON_DATA_PREP.md)** for full documentation: general usage with JSONL and Hugging Face Hub datasets, handling of Nemotron Post-Training v3 `reasoning_content` fields, and ready-to-run tokenization commands for all Nemotron Pre/Post-Training datasets. ## Synthetic Test Dataset diff --git a/examples/gpt-oss/requirements.txt b/examples/gpt-oss/requirements.txt index d18f9eb539..f063bfb057 100644 --- a/examples/gpt-oss/requirements.txt +++ b/examples/gpt-oss/requirements.txt @@ -1,3 +1,3 @@ -kernels>=0.9.0 -trackio +kernels>=0.9.0,<0.13 +trackio<0.21 trl>=0.21.0 diff --git a/examples/megatron_bridge/README.md b/examples/megatron_bridge/README.md index 571a0c4988..1e384acfb1 100644 --- a/examples/megatron_bridge/README.md +++ b/examples/megatron_bridge/README.md @@ -47,7 +47,7 @@ hf auth login --token ``` > [!WARNING] -> Use `python -m pip` instead of `pip` to avoid conflicts with the system-wide installed packages in the NeMo containers. +> Use `python -m pip` instead of `pip` to avoid conflicts with the system-wide installed packages in the NeMo containers. You may also refer to this [doc](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/docker/common/README.md#installing-packages-inside-the-container) on how to correctly install packages in the NeMo containers without breaking existing torch installation. ## Pruning @@ -189,7 +189,7 @@ For more details, see the [Megatron-Bridge conversion README](https://github.com ### Distillation Results -See [results/puzzletron.md](results/puzzletron.md) for MMLU results demonstrating knowledge distillation on Puzzletron-compressed student models. +See [examples/pruning/](../pruning/README.md#tutorials--results) for distillation experiment results covering Minitron and Puzzletron pruning algorithms. 
## Post-Training Quantization diff --git a/examples/pruning/README.md b/examples/pruning/README.md index 9e84622269..294f00031d 100644 --- a/examples/pruning/README.md +++ b/examples/pruning/README.md @@ -20,6 +20,7 @@ This section focuses on applying Model Optimizer's state-of-the-art complementar | Support Matrix | View the support matrix to see available pruning algorithms and their compatibility with different models and frameworks | \[[Link](#support-matrix)\] | | | Examples | Examples of different pruning methods | \[[Link](#examples)\] | | | Pruning Guidelines | Guidelines for choosing how and how much to prune for best results | \[[Link](#pruning-guidelines)\] | | +| Tutorials / Results | End-to-end tutorials for Minitron and Puzzletron pruning | \[[Link](#tutorials--results)\] | | | Resources | Extra links to relevant resources | \[[Link](#resources)\] | @@ -186,16 +187,28 @@ If your model parameters are already sorted and you just want to prune the weigh ## Examples -### Minitron Pruning for Megatron-Bridge/ Megatron-LM Framework LLMs (e.g. Qwen 3, Nemotron Nano) +### Minitron Pruning for Megatron-Bridge/ Megatron-LM Framework LLMs (e.g. Qwen3, Nemotron 3 Nano) Check out the Minitron pruning example for [Megatron-Bridge Framework](../megatron_bridge/README.md#pruning) or [Megatron-LM Framework](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/post_training/modelopt#-pruning) which showcases the usage of the powerful Minitron pruning algorithm developed by NVIDIA Research for pruning LLMs like Llama-3.1-8B, Qwen3-8B, Nemotron-Nano-9B-v2, Nemotron-3-Nano-30B-A3B, etc. Both frameworks support importing from a Hugging Face pretrained checkpoint. -Some of the models pruned using Minitron method followed by distillation and post-training are: +Some of the official models pruned using the Minitron method followed by distillation and post-training are: - [Minitron Collection on Hugging Face](https://huggingface.co/collections/nvidia/minitron) - [NVIDIA-Nemotron-Nano-9B-v2](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2) +See [minitron/](minitron/README.md) for end-to-end tutorials and results. + +### Puzzletron Pruning for LLMs (e.g. Llama, Qwen, Nemotron) + +Check out the [Puzzletron README](../puzzletron/README.md) which showcases MIP-based NAS pruning that produces heterogeneous model architectures — varying FFN intermediate sizes per layer and selectively removing attention layers — to meet a target parameter count or memory budget. + +Supported models include Llama-3.1-8B-Instruct, Qwen3-8B, Qwen2.5-7B-Instruct, Nemotron-Nano-12B-v2, Mistral-Small-24B-Instruct-2501, and others via the [configs](../puzzletron/configs/) directory. See the [Puzzletron README](../puzzletron/README.md) for more details. + +After compression, use [Megatron-Bridge distillation](../megatron_bridge/README.md#distillation) to recover accuracy. + +See [puzzletron/](puzzletron/README.md) for distillation results on Puzzletron-compressed models. + ### FastNAS Pruning for PyTorch Computer Vision Models Check out the FastNAS pruning example usage in the [documentation](https://nvidia.github.io/Model-Optimizer/guides/3_pruning.html#pruning-and-subnet-search). @@ -279,16 +292,23 @@ After pruning, distillation is required to recover model accuracy. 
Below are rec | **Hyperparameter** | **Recommendation** | | :---: | :---: | | **Sequence Length** | 8192 (or 4096 if dataset has smaller sequences) | -| **Global Batch Size (GBS)** | 768 | +| **Global Batch Size (GBS)** | same as the original training or 768 if unsure | | **Micro Batch Size (MBS)** | As large as your GPU memory can accommodate | | **Learning Rate (LR)** | 1e-4 → 1e-5 (linear decay) for 30-50% pruning<br>• More compression → higher LR<br>• Less compression → lower LR<br>• As model gets larger → reduce LR to avoid divergence | | **Warmup Steps** | 100 | -| **Training Max Steps** | Num training tokens / (Seq len × GBS)<br>• Recommended: 80-100B tokens | +| **Training Max Steps** | Num training tokens / (Seq len × GBS)<br>• Recommended: 80-100B tokens for best results. | | **Data Composition** | • Standard models: 100% pre-training data<br>• Reasoning models: 70% reasoning data + 30% pre-training data | > [!TIP] > If you know the maximum learning rate used during the original training, a good rule of thumb for knowledge distillation is to use **1/5th of that maximum LR** when compressing by ~50%. + +## Tutorials / Results + +End-to-end distillation results with Megatron-Bridge after Minitron and Puzzletron pruning: + +- **[Minitron — Nemotron-Nano-9B-v2](minitron/NVIDIA-Nemotron-Nano-9B-v2/README.md)**: End-to-end tutorial of structured pruning for Nemotron-Nano-9B-v2 to 7B followed by knowledge distillation up to 80B tokens, quantization, and vLLM deployment. Achieves near-parity with the official 9B model across popular pretraining and reasoning benchmarks. +- **[Puzzletron — Qwen3-8B and Llama-3.1-8B-Instruct](puzzletron/Llama-3.1-8B-Instruct.md)**: MIP-based compression followed by short distillation runs on WikiText-103. Shows MMLU recovery and illustrates the importance of using larger datasets to avoid overfitting. + +## Resources + +- 📅 [Roadmap](https://github.com/NVIDIA/Model-Optimizer/issues/146) diff --git a/examples/pruning/minitron/NVIDIA-Nemotron-Nano-9B-v2/ABLATIONS.md b/examples/pruning/minitron/NVIDIA-Nemotron-Nano-9B-v2/ABLATIONS.md new file mode 100644 index 0000000000..1786e88fdd --- /dev/null +++ b/examples/pruning/minitron/NVIDIA-Nemotron-Nano-9B-v2/ABLATIONS.md @@ -0,0 +1,76 @@ +# Distillation Blend Ablations + +All experiments prune Nemotron-Nano-9B-v2 → 7B and distill with teacher = Nemotron-Nano-9B-v2 (official). The final chosen blend (**30pre_70post_v1v3**) is in [README.md](README.md). + +--- + +## Baseline: Pre-SFT-v1 Only (no post-training data) + +Pure Nemotron-Pretraining-SFT-v1 data only (no post-training reasoning traces). + +| Tokens | MMLU | MMLU Pro | GPQA Diamond | LCB v6 | AIME 2025 | Math 500 | IFEval | SciCode | +|---|---|---|---|---|---|---|---|---| +| 19B | 72.7 | 70.5 | 53.9 | 58.8 | 63.4 | 94.4 | 57.9 | 19.2 | +| 56B | 73.3 | 71.9 | 54.3 | 62.0 | 63.8 | 95.0 | 58.7 | 17.9 | + +**Notes:** Highest MMLU of any blend, but AIME stagnates and LCB lags. Pretraining data alone insufficient for reasoning benchmarks. + +--- + +## Baseline: Pure Post-Training Data (pt-v1v2) + +100% post-training data (no pretraining data), Nemotron-v1/v2 blend. + +| Tokens | MMLU | MMLU Pro | GPQA Diamond | LCB v6 | AIME 2025 | Math 500 | IFEval | SciCode | +|---|---|---|---|---|---|---|---|---| +| 2.5B | 71.0 | 69.3 | 52.6 | 54.8 | 58.2 | 94.1 | 51.7 | 14.4 | +| 5B | 70.8 | 70.7 | 53.6 | 57.2 | 63.8 | 94.1 | 50.5 | 14.2 | +| 20B | 69.8 | 71.7 | 54.7 | 57.5 | 64.7 | 94.6 | 41.9 | 13.4 | +| 40B | 70.0 | 71.7 | 53.2 | 57.4 | 67.6 | 95.2 | 43.3 | 16.2 | + +**Notes:** IFEval degrades badly at longer training (41.9 at 20B). LCB lags behind other blends. + +--- + +## 30% Pretraining / 70% Post-Training: v1v2 Blend + +30% Nemotron-Pretraining-SFT-v1 + 70% Nemotron-v1/v2 post-training data. + +| Tokens | MMLU | MMLU Pro | GPQA Diamond | LCB v6 | AIME 2025 | Math 500 | IFEval | SciCode | +|---|---|---|---|---|---|---|---|---| +| 2.5B | 71.9 | 68.9 | 49.8 | 56.4 | 55.3 | 93.3 | 58.2 | 14.6 | +| 5B | — | — | — | — | — | — | — | — | +| 20B | 71.6 | 71.2 | 52.7 | 58.0 | 65.1 | 94.0 | 55.7 | 14.2 | +| 40B | 72.7 | 71.1 | 54.0 | 59.7 | 65.5 | 95.2 | 53.8 | 19.2 | +| 60B | 73.0 | 71.9 | 55.9 | 60.0 | 67.8 | 95.4 | 56.4 | 21.7 | +| 80B | 73.4 | 72.7 | 54.7 | 61.8 | 70.7 | 95.3 | 57.8 | 19.9 | +| 100B | 73.5 | 72.8 | 56.4 | 62.4 | 71.9 | 95.8 | 59.1 | 19.4 | + +**Notes:** Best MMLU of the 30/70 blends (~1% above v3 blends). 
IFEval ~56–59 (lower than v3 blends). GPQA shows instability at longer runs. + +--- + +## 30% Pretraining / 70% Post-Training: v3 Blend + +Refined v3 blend: dropped exercism/text2sql, added Nemotron-Math-v2 part01, boosted Math to 30% total. + +| Tokens | MMLU | MMLU Pro | GPQA Diamond | LCB v6 | AIME 2025 | Math 500 | IFEval | SciCode | +|---|---|---|---|---|---|---|---|---| +| 2.5B | 70.5 | 69.0 | 51.2 | 59.1 | 62.9 | 94.3 | 62.2 | 11.6 | +| 5B | 71.0 | 69.8 | 53.0 | 59.4 | 65.0 | 94.4 | 66.8 | 20.3 | +| 20B | 71.2 | 70.8 | 53.3 | 60.0 | 69.1 | 95.3 | 63.8 | 22.6 | +| 40B | 71.0 | 71.7 | 54.0 | 62.3 | 71.3 | 95.3 | 66.8 | 17.9 | +| 60B | 72.0 | 72.3 | 56.3 | 62.0 | 71.6 | 95.6 | 65.5 | 21.5 | +| 80B | 72.3 | 73.0 | 53.9 | 63.0 | 72.4 | 96.2 | 65.5 | 21.3 | + +**Notes:** Better AIME and LCB than blend 1 at 40B+. GPQA still unstable (53.9 at 80B). MMLU ~1% below v1v2 blend. + +--- + +## Blend Design Notes + +**Why MMLU is ~1% lower with v3 blends:** The heavy reasoning-trace format (chain-of-thought, TIR) in v3 data suppresses general knowledge recall measured by MMLU. This is structural — v1v2 post-training data has a more knowledge-dense format. Upweighting Pretraining-SFT-v1 General (to 20%) partially mitigates this. Given that MMLU Pro is better with v3 blends, lower MMLU is acceptable. + +**Why GPQA is unstable in blend 1:** Science-v1 MCQ (497M tokens) and RQA (278M tokens) are repeated ~14× over 100B training steps, causing overfitting to MCQ format. Fix in v1v3: add Nemotron-Post-Training-Dataset-v1 STEM (~60B tokens, ~0.13 epochs at 80B) as primary science source; reduce Science-v1 to low weights (3+2) for format alignment only. + +**Why 80B is the recommended stopping point:** SciCode degrades or crashes at 100B (blend2: 1.6; AIME also degrades). Best overall profile is at 60–80B tokens. diff --git a/examples/pruning/minitron/NVIDIA-Nemotron-Nano-9B-v2/README.md b/examples/pruning/minitron/NVIDIA-Nemotron-Nano-9B-v2/README.md new file mode 100644 index 0000000000..620c5780a4 --- /dev/null +++ b/examples/pruning/minitron/NVIDIA-Nemotron-Nano-9B-v2/README.md @@ -0,0 +1,326 @@ +# Nemotron-Nano-9B-v2: Prune + Distill + Quantize + vLLM Deployment + +End-to-end optimization of [Nemotron-Nano-9B-v2](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2) demonstrating how ModelOpt techniques stack: Minitron structured pruning to 7B → Megatron-Bridge knowledge distillation to recover accuracy → FP8 quantization → vLLM deployment and throughput benchmarking. This document covers: + +1. **[Data Preparation](#1-data-preparation)** — tokenizing the training blend for distillation +2. **[Pruning](#2-pruning)** — Minitron structured pruning from 9B to 7B +3. **[Distillation](#3-distillation)** — recovering accuracy via Megatron-Bridge knowledge distillation (up to 80B tokens) +4. **[Evaluation](#4-evaluation)** — benchmarking with NeMo Evaluator across MMLU Pro, GPQA Diamond, AIME, and more +5. **[Quantization](#5-quantization)** — FP8 PTQ on the distilled checkpoint using ModelOpt's `examples/llm_ptq/hf_ptq.py` script +6. **[vLLM Inference Benchmarking](#6-vllm-inference-benchmarking)** — throughput comparison of BF16 vs FP8 on a single H100 + +**Environment:** Container `nvcr.io/nvidia/nemo:26.02`, ModelOpt 0.44.0. See the [Megatron-Bridge README](../../../megatron_bridge/README.md) for environment setup (including ModelOpt mount path) and container usage. 
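A minimal interactive launch of that container might look like the sketch below — the local checkout path is a placeholder, and `/opt/Model-Optimizer` is simply the mount point assumed by the commands later in this tutorial (see the Megatron-Bridge README for the exact recommended layout):

```bash
# Sketch: run the NeMo container with a Model-Optimizer checkout mounted
# at /opt/Model-Optimizer, the path used by the torchrun commands below.
docker run --gpus all -it --rm \
  -v /path/to/Model-Optimizer:/opt/Model-Optimizer \
  nvcr.io/nvidia/nemo:26.02 bash
```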
+ +## Results + +![Benchmark Recovery During Knowledge Distillation](figures/learning_curves.png) + +| Model | MMLU | MMLU Pro | GPQA Diamond | LiveCodeBench v6 | AIME 2025 | Math 500 | IFEval | SciCode (Subtask) | Average | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| Pruned 7B (no distillation) | 67.8 | 11.9 | 17.7 | 1.4 | 0.3 | 6.0 | 41.8 | 0.1 | 18.4 | +| Pruned 7B + distill 2.5B tokens (400 iters) | 70.7 | 68.4 | 52.7 | 57.0 | 63.0 | 93.7 | 63.2 | 11.6 | 60.0 | +| Pruned 7B + distill 20B tokens (3200 iters) | 71.3 | 71.7 | 54.8 | 62.0 | 69.1 | 95.2 | 63.8 | 20.9 | 63.6 | +| Pruned 7B + distill 40B tokens (6400 iters) | 71.1 | 71.6 | 53.7 | 60.9 | 70.4 | 95.6 | 68.0 | 21.1 | 64.1 | +| Pruned 7B + distill 60B tokens (9600 iters) | 72.1 | 72.1 | 54.9 | 61.6 | 70.3 | 95.4 | 64.7 | 24.1 | 64.4 | +| Pruned 7B + distill 80B tokens (12800 iters) | 72.2 | 73.0 | 56.9 | 62.6 | 72.0 | 95.8 | 66.2 | 22.2 | 65.1 | +| Nemotron-Nano-9B-v2 (official, pruned from 12B) | 74.7 | 74.9 | 56.1 | 64.4 | 73.2 | 95.9 | 65.8 | 21.9 | 65.9 | +| Nemotron-Nano-12B-v2 (official) | 78.5 | 77.9 | 58.2 | 66.6 | 76.1 | 96.9 | 67.9 | 28.4 | 68.8 | + +**Key observations:** + +- **All benchmarks recover dramatically within the first checkpoint (2.5B tokens).** The pruned-only model is essentially non-functional, but a single distillation run recovers most capabilities. +- **Math 500 and IFEval plateau quickly** — essentially saturated after 2.5B tokens, with minimal gains over the remaining training. +- **MMLU also largely plateaus** after the first checkpoint. +- **AIME, MMLU Pro, GPQA, and SciCode continue improving** throughout the full run and benefit meaningfully from longer training. +- **The 7B model at 80B tokens closes most of the gap to the official 9B**, and actually exceeds it on GPQA, IFEval, and SciCode. The table below compares the 7B→9B gap against the 9B→12B gap — both are ~25% compression — showing that the second pruning round recovers more efficiently: + +| Benchmark | 7B (80B tokens) vs 9B | 9B (official) vs 12B | +| --- | --- | --- | +| MMLU | −2.5 | −3.8 | +| MMLU Pro | −1.9 | −3.0 | +| GPQA Diamond | **+0.8** | −2.1 | +| LiveCodeBench v6 | −1.8 | −2.2 | +| AIME 2025 | −1.2 | −2.9 | +| Math 500 | −0.1 | −1.0 | +| IFEval | **+0.4** | −2.1 | +| SciCode (Subtask) | **+0.3** | −6.5 | +| Average | −0.8 | −2.9 | + +Distillation uses the **30% Pretraining (Code 5, General 20, MATH 5) + 70% Post-training v1/v3 (Math 30, Coding 20, Science 15, IF 5)** blend (see [Data Blend](#data-blend) below). Blend ablations are in [ABLATIONS.md](ABLATIONS.md). + +> [!NOTE] +> Exact numbers may vary depending on deployment and evaluation setup. All models above — including the official 9B and 12B — were evaluated with the same [nemo_evaluator.yaml](nemo_evaluator.yaml) for fair comparison. These numbers may differ from those reported on the official [Nemotron-Nano-9B-v2](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2) and [Nemotron-Nano-12B-v2](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2) HuggingFace model cards. + +> [!NOTE] +> The official Nemotron-Nano-9B-v2 model was itself produced by pruning Nemotron-Nano-12B-v2 using Minitron. See [arxiv:2508.14444](https://arxiv.org/abs/2508.14444) for details on the exact steps used there. + +--- + +## Steps to Reproduce + +### 1. Data Preparation + +See [examples/dataset/MEGATRON_DATA_PREP.md](../../../dataset/MEGATRON_DATA_PREP.md) for tokenization commands for all datasets used in this blend. 
+ +For this experiment: `TOKENIZER=nvidia/NVIDIA-Nemotron-Nano-9B-v2`, `OUTPUT_DIR=tokenized_nemotron_v2`. + +#### Data Blend + +**30% Pretraining (Code 5, General 20, MATH 5) + 70% Post-training v1/v3 (Math 30, Coding 20, Science 15, IF 5)** + +```bash +DATA_BLEND=" \ +5 tokenized_nemotron_v2/nvidia--Nemotron-Pretraining-SFT-v1_Nemotron-SFT-Code_train_text_max10000000 \ +20 tokenized_nemotron_v2/nvidia--Nemotron-Pretraining-SFT-v1_Nemotron-SFT-General_train_text_max10000000 \ +5 tokenized_nemotron_v2/nvidia--Nemotron-Pretraining-SFT-v1_Nemotron-SFT-MATH_train_text_max10000000 \ +15 tokenized_nemotron_v2/nvidia--Nemotron-Math-v2_default_high_part00_messages \ +15 tokenized_nemotron_v2/nvidia--Nemotron-Math-v2_default_high_part01_messages \ +15 tokenized_nemotron_v2/competitive_programming_python_00_messages \ +5 tokenized_nemotron_v2/competitive_programming_cpp_00_messages \ +10 tokenized_nemotron_v2/nvidia--Nemotron-Post-Training-Dataset-v1_default_stem_messages_max5000000 \ +3 tokenized_nemotron_v2/MCQ_messages \ +2 tokenized_nemotron_v2/RQA_messages \ +3 tokenized_nemotron_v2/reasoning_on_messages \ +2 tokenized_nemotron_v2/reasoning_off_messages \ +" +``` + +| Dataset | Tokens | Weight | Notes | +| --- | --- | --- | --- | +| Nemotron-Pretraining-SFT-v1 / Code (10M samples) | 7B | 5 | Pretraining code | +| Nemotron-Pretraining-SFT-v1 / General (10M samples) | 16B | 20 | Upweighted to better close MMLU gap | +| Nemotron-Pretraining-SFT-v1 / MATH (10M samples) | 12B | 5 | Pretraining math | +| Nemotron-Math-v2 / high_part00 | 9B | 15 | Hard math reasoning | +| Nemotron-Math-v2 / high_part01 | 11B | 15 | Hard math reasoning | +| Nemotron-SFT-Competitive-Programming-v2 / python_00 | 7B | 15 | Python reasoning traces | +| Nemotron-SFT-Competitive-Programming-v2 / cpp_00 | 7B | 5 | C++ reasoning traces | +| Nemotron-Post-Training-Dataset-v1 / stem (5M samples) | 20B | 10 | Broad STEM | +| Nemotron-Science-v1 / MCQ | 0.5B | 3 | GPQA MCQ format alignment | +| Nemotron-Science-v1 / RQA | 0.3B | 2 | GPQA format diversity | +| Nemotron-SFT-IF-Chat-v2 / reasoning_on | 2B | 3 | Instruction following (thinking on) | +| Nemotron-SFT-IF-Chat-v2 / reasoning_off | 1B | 2 | Instruction following (thinking off) | + +#### General Guidelines + +The optimal blend is 30% pretraining and 70% post-training data. Exact proportions may vary depending on the benchmarks you care about. The blend above was designed to maximize recovery on important benchmarks reported in the Nemotron-Nano-9B-v2 model card. The key design decisions were: + +- **30% pretraining data** closes the MMLU gap that arises from training exclusively on reasoning-heavy post-training data. The General split (20%) is upweighted specifically to recover general knowledge recall. +- **Math (30%)** is the largest post-training category because AIME and MMLU Pro respond strongly to more math reasoning tokens. Two `Nemotron-Math-v2` splits are used to avoid repetition at longer token budgets. +- **Science (15%)** uses `Nemotron-Post-Training-Dataset-v1 / stem` as the primary source for volume and GPQA stability, with small allocations to `Nemotron-Science-v1` MCQ/RQA subsets for format alignment with GPQA's multiple-choice structure. +- **Instruction following (5%)** saturates quickly — IFEval reaches 60+% within 2.5B tokens — so a small allocation is sufficient. + +This blend intentionally omits capabilities not targeted in this experiment (e.g. long context and multilingual benchmarks). 
Depending on what benchmarks matter for your use case, you can substitute or add datasets from the [Nemotron Post-Training v3 collection](https://huggingface.co/collections/nvidia/nemotron-post-training-v3), for example: + +| Capability | Relevant datasets | +| --- | --- | +| Multilingual | `Nemotron-SFT-Multilingual-v1` | +| Agentic / tool use | `Nemotron-SFT-Tool-Call-v1`, `Nemotron-SFT-Tool-Call-v2` | +| Software engineering (SWE) | `Nemotron-SFT-SWE-v1` | +| Safety / alignment | `Nemotron-SFT-Safety-v1` | +| Long context | `Nemotron-SFT-Long-Context-v1` | + +When adding new datasets, reduce weights of lower-priority categories proportionally to keep the total at 100%. + +--- + +### 2. Pruning + +Run on **1 node with 8x H100** (~1 hour) + +Non-default arguments: `--hparams_to_skip num_attention_heads` (default: none; attention heads pruning is harder to recover hence skipped), `--seq_length 8192` (default: 4096) since dataset has longer sequences. All other arguments use defaults i.e. we optimize for MMLU (10% subset, 0-shot) for the pruned model (without distillation). + +```bash +torchrun --nproc_per_node 8 /opt/Model-Optimizer/examples/megatron_bridge/prune_minitron.py \ + --pp_size 8 \ + --hf_model_name_or_path nvidia/NVIDIA-Nemotron-Nano-9B-v2 \ + --trust_remote_code \ + --prune_target_params 7e9 \ + --hparams_to_skip num_attention_heads \ + --seq_length 8192 \ + --output_hf_path /path/to/Nemotron-Nano-9B-v2-Pruned-7B +``` + +Important pruning logs: + +```text +Only considering atmost 40% for width and 20% for depth pruning hparams +Skipping hparams_to_skip=['num_attention_heads'] during search space generation... + Search space for num_layers: [46, 48, 50, 52, 54, 56] + Search space for hidden_size: [2816, 3072, 3328, 3584, 3840, 4096, 4352, 4480] + Search space for mamba_num_heads: [80, 88, 96, 104, 112, 120, 128] + Search space for mamba_head_dim: [56, 64, 72, 80] + Search space for ffn_hidden_size: [9728, 10240, 10752, 11264, 11776, 12288, 12800, 13312, 13824, 14336, 14848, 15360, 15680] + Total search space in consideration: 17472 + +Top 10 candidates with scores: +{'num_layers': 50, 'hidden_size': 4480, 'mamba_num_heads': 128, 'mamba_head_dim': 56, 'ffn_hidden_size': 15680} -> 7.00B params, 0.2019 score +{'num_layers': 56, 'hidden_size': 4096, 'mamba_num_heads': 96, 'mamba_head_dim': 80, 'ffn_hidden_size': 14336} -> 7.00B params, 0.4363 score +{'num_layers': 48, 'hidden_size': 4352, 'mamba_num_heads': 120, 'mamba_head_dim': 80, 'ffn_hidden_size': 13824} -> 7.00B params, 0.6789 score [BEST SUBNET] +{'num_layers': 56, 'hidden_size': 4352, 'mamba_num_heads': 112, 'mamba_head_dim': 80, 'ffn_hidden_size': 10240} -> 7.00B params, 0.5203 score +{'num_layers': 54, 'hidden_size': 4480, 'mamba_num_heads': 104, 'mamba_head_dim': 80, 'ffn_hidden_size': 11264} -> 7.00B params, 0.2615 score +{'num_layers': 46, 'hidden_size': 4480, 'mamba_num_heads': 128, 'mamba_head_dim': 72, 'ffn_hidden_size': 14848} -> 7.00B params, 0.6165 score +{'num_layers': 50, 'hidden_size': 4480, 'mamba_num_heads': 112, 'mamba_head_dim': 64, 'ffn_hidden_size': 15680} -> 7.00B params, 0.4214 score +{'num_layers': 54, 'hidden_size': 4096, 'mamba_num_heads': 112, 'mamba_head_dim': 80, 'ffn_hidden_size': 13312} -> 7.00B params, 0.5894 score +{'num_layers': 56, 'hidden_size': 4352, 'mamba_num_heads': 120, 'mamba_head_dim': 72, 'ffn_hidden_size': 10752} -> 7.00B params, 0.4688 score +{'num_layers': 52, 'hidden_size': 4352, 'mamba_num_heads': 120, 'mamba_head_dim': 72, 'ffn_hidden_size': 12800} -> 7.00B params, 0.5596 
score + +Dropping decoder layers [43, 44, 45, 46, 47, 48, 50, 52] from model. +Original hybrid_override_pattern: M-M-M-MM-M-M-M*-M-M-M*-M-M-M-M*-M-M-M-M*-M-MM-M-M-M-M-M- +Pruned hybrid_override_pattern: M-M-M-MM-M-M-M*-M-M-M*-M-M-M-M*-M-M-M-M*-MMMM-M- +``` + +> [!TIP] +> Here we skip the Knowledge Distillation (KD) step for candidates for simplicity. If you want to find a better pruned model, you can take the top K candidates' `export_config` from the logs above and then export all models separately and perform KD for ~2B tokens on each of them before selecting the best subnet based on your desired metrics. + +--- + +### 3. Distillation + +Non-default arguments: `--seq_length 8192` (default: 4096), `--mbs 4` (default: 1), `--train_iters 16000` (train up to ~100B tokens — can stop earlier and take intermediate checkpoints for smaller runs), `--lr_warmup_iters 100` (default: 50), `--eval_interval 400` (default: 100). All other arguments use defaults. + +Run on **96 nodes × 8x H100 (768 GPUs total)**. ~600 H100 GPU-hours per 1k steps (~6.3B tokens), i.e. ~45 min wall-clock per 1k steps. Full 80B token run (~13k steps) takes ~9k H100 GPU-hours (~10 hours wall-clock). + +> [!TIP] +> While we use 96 nodes here for faster training, you can also run with 1 node. If you don't want to do a full distillation run, you can stop earlier and take intermediate checkpoints as well. + +```bash +torchrun --nproc_per_node 8 /opt/Model-Optimizer/examples/megatron_bridge/distill_minitron.py \ + --teacher_hf_path nvidia/NVIDIA-Nemotron-Nano-9B-v2 \ + --student_hf_path /path/to/Nemotron-Nano-9B-v2-Pruned-7B \ + --trust_remote_code \ + --tp_size 8 \ + --pp_size 1 \ + --data_paths "${DATA_BLEND}" \ + --data_path_to_cache /path/to/cache \ + --seq_length 8192 \ + --mbs 4 \ + --gbs 768 \ + --train_iters 16000 \ + --lr 1e-4 \ + --min_lr 1e-5 \ + --lr_warmup_iters 100 \ + --eval_interval 400 \ + --eval_iters 32 \ + --log_interval 10 \ + --output_dir <output_dir> + +# Optional: Weights & Biases logging +# --wandb_project <project> \ +# --wandb_entity <entity> \ +# --wandb_exp_name <exp_name> +``` + +For multi-node Slurm runs, see the [Megatron-Bridge README](../../../megatron_bridge/README.md#slurm-usage) for details. + +Distillation saves checkpoints in Megatron distributed format under `<output_dir>/checkpoints/iter_XXXXXXX`. You can convert any intermediate checkpoint to HuggingFace format using the Megatron-Bridge conversion script (see [Megatron Bridge README](../../../megatron_bridge/README.md) for full details): + +```bash +python /opt/Megatron-Bridge/examples/conversion/convert_checkpoints.py export \ + --hf-model /path/to/Nemotron-Nano-9B-v2-Pruned-7B \ + --megatron-path <output_dir>/checkpoints/iter_<iteration> \ + --hf-path <output_dir>/checkpoints/hf_iter_<iteration> +``` + +--- + +### 4. Evaluation + +The eval config in [nemo_evaluator.yaml](nemo_evaluator.yaml) is for Slurm-based evaluation — it submits a vLLM serving job and runs evals against it. For local model execution and evaluation, refer to the [NeMo Evaluator documentation](https://docs.nvidia.com/nemo/evaluator/latest/) or this [blog](https://huggingface.co/blog/nvidia/nemotron-3-nano-evaluation-recipe). 
+ +Before running, update the following fields in the yaml: + +- `execution.hostname` — your Slurm login node hostname +- `execution.account` — your Slurm account +- `deployment.checkpoint_path` — Hugging Face checkpoint path (original, pruned or quantized) +- `evaluation.nemo_evaluator_config.config.params.extra.tokenizer` — same path as `checkpoint_path` + +> [!TIP] +> Uncomment `limit_samples` under any task to run a small subset and verify the end-to-end eval pipeline before launching full evals. + +```bash +pip install "nemo-evaluator-launcher[all]==0.1.90" + +# Set required environment variables: +export HF_TOKEN= +export SLURM_JOB_DIR= +export HF_HOME= +export VLLM_CACHE_ROOT= + +# Set additional unused but required environment variables: +export API_KEY=xxxxxx +export INFERENCE_API_KEY=xxxxxx +export OPENAI_CLIENT_ID=xxxxxx +export OPENAI_CLIENT_SECRET=xxxxxx + +nemo-evaluator-launcher run --config nemo_evaluator.yaml +``` + +**Tasks and exact metric names reported in the results table:** + +| Benchmark | Tool | Metric name | +| --- | --- | --- | +| MMLU | [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) (5-shot) | `mmlu` | +| MMLU Pro | NeMo Evaluator | `mmlu-pro_pass_at_1_symbolic_correct` | +| GPQA Diamond | NeMo Evaluator | `gpqa_pass_at_1_symbolic_correct` | +| LiveCodeBench v6 | NeMo Evaluator | `livecodebench_pass_at_1_accuracy` | +| AIME 2025 | NeMo Evaluator | `aime25_pass_at_1_symbolic_correct` | +| Math 500 | NeMo Evaluator | `AA_math_test_500_score_micro_avg_of_5` | +| IFEval | NeMo Evaluator | `ifeval_pass_at_1_average_score` | +| SciCode (Subtask) | NeMo Evaluator | `scicode_pass_at_1_subtask_accuracy` | + +**Key vLLM settings:** Tool calling is not enabled in these evals. + +For more details on NeMo Evaluator, see the [GitHub repo](https://github.com/NVIDIA-NeMo/evaluator) and [documentation](https://docs.nvidia.com/nemo/evaluator/latest/). + +### 5. Quantization + +ModelOpt allows stacking multiple optimization techniques. Here we stack FP8 quantization on top of the pruned and distilled model to get an even more optimized model. See [examples/llm_ptq/README.md](../../../llm_ptq/README.md) for the full PTQ documentation. + +Similar to the official [Nemotron-Nano-9B-v2-FP8](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2-FP8) model, if you want to quantize the pruned 7B model to FP8, the Mamba and MLP layers are quantized to FP8, while all 4 attention layers and the Conv1d components within the Mamba layers are kept in BF16 to avoid accuracy degradation. + +This is done with the `mtq.MAMBA_MOE_FP8_AGGRESSIVE_CFG` config defined in [`modelopt/torch/quantization/config.py`](../../../../modelopt/torch/quantization/config.py). To apply this, you need to modify `QUANT_CFG_CHOICES["fp8"]` in [`examples/llm_ptq/hf_ptq.py`](../../../llm_ptq/hf_ptq.py) to use `mtq.MAMBA_MOE_FP8_AGGRESSIVE_CFG`. You may also consider using `mtq.MAMBA_MOE_FP8_CONSERVATIVE_CFG` for more conservative quantization. + +> [!NOTE] +> You can also quantize to NVFP4 using `mtq.MAMBA_MOE_NVFP4_AGGRESSIVE_CFG` or `mtq.MAMBA_MOE_NVFP4_CONSERVATIVE_CFG`, which may require further distillation (QAD) to recover accuracy and Blackwell GPU for deployment. 
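To sketch what the optional NVFP4 path from the note above could look like — assuming the same script flags apply and that the `nvfp4` entry of `QUANT_CFG_CHOICES` is remapped to one of the `MAMBA_MOE_NVFP4_*_CFG` configs in the same way as described for FP8 (the export path name here is just an example):

```bash
# Illustrative NVFP4 variant; the main FP8 flow continues below.
# Requires the hf_ptq.py config remapping described above.
python /opt/Model-Optimizer/examples/llm_ptq/hf_ptq.py \
  --pyt_ckpt_path /checkpoints/hf_iter_12800 \
  --export_path /checkpoints/hf_iter_12800_nvfp4_aggressive \
  --qformat nvfp4 \
  --trust_remote_code
```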
+ +Calibrate and export the HF checkpoint from iteration 12800 to FP8 (takes 1-2 mins on 8x H100): + +```bash +python /opt/Model-Optimizer/examples/llm_ptq/hf_ptq.py \ + --pyt_ckpt_path /checkpoints/hf_iter_12800 \ + --export_path /checkpoints/hf_iter_12800_fp8_aggressive \ + --qformat fp8 \ + --trust_remote_code +``` + +The quantized checkpoint is directly deployable with [vLLM](https://github.com/vllm-project/vllm), [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) and [SGLang](https://github.com/sgl-project/sglang). + +> [!TIP] +> You can run the evaluation using the same `nemo_evaluator.yaml` file for the quantized checkpoint also! + +### 6. vLLM Inference Benchmarking + +Benchmark throughput using [vLLM](https://github.com/vllm-project/vllm) on a single H100 GPU. Run the command once for each HuggingFace checkpoint. vLLM automatically detects FP8 quantization from the embedded `quantization_config` in `config.json` and applies it with no extra flags needed. + +Results on a single H100 (ISL=32768, OSL=1024): + +```bash +vllm bench throughput \ + --model \ + --random-input-len 32768 \ + --random-output-len 1024 \ + --trust-remote-code \ + --mamba_ssm_cache_dtype float32 \ + --kv-cache-dtype fp8 \ + --load-format safetensors +``` + +| Checkpoint | Model loading memory | Output tokens/s | Speedup vs Nemotron-Nano-9B-v2 BF16 | +| --- | --- | --- | --- | +| Nemotron-Nano-12B-v2 (official) | 22.9 GiB | 585 | 0.74× | +| Nemotron-Nano-9B-v2 (official) | 16.6 GiB | 794 | 1.00× | +| Nemotron-Nano-9B-v2-FP8 (official) | 9.6 GiB | 1,012 | 1.27× | +| Nemotron-Nano-9B-v2-Pruned-7B | 13.1 GiB | 963 | 1.21× | +| Nemotron-Nano-9B-v2-Pruned-7B-FP8 | 7.8 GiB | 1,147 | 1.44× | + +In this case, FP8 delivers a ~20-30% throughput gain over BF16 at the same parameter count. The NemotronH hybrid architecture (Mamba + attention) moderates this gain relative to pure-transformer models, since Attention and Conv1d layers are not quantized. 
diff --git a/examples/pruning/minitron/NVIDIA-Nemotron-Nano-9B-v2/figures/learning_curves.png b/examples/pruning/minitron/NVIDIA-Nemotron-Nano-9B-v2/figures/learning_curves.png new file mode 100644 index 0000000000..40c507bd1b Binary files /dev/null and b/examples/pruning/minitron/NVIDIA-Nemotron-Nano-9B-v2/figures/learning_curves.png differ diff --git a/examples/pruning/minitron/NVIDIA-Nemotron-Nano-9B-v2/nemo_evaluator.yaml b/examples/pruning/minitron/NVIDIA-Nemotron-Nano-9B-v2/nemo_evaluator.yaml new file mode 100644 index 0000000000..256a4031be --- /dev/null +++ b/examples/pruning/minitron/NVIDIA-Nemotron-Nano-9B-v2/nemo_evaluator.yaml @@ -0,0 +1,194 @@ +# NeMo Evaluator Launcher config for Nemotron-Nano-9B-v2 and Pruned variants +# -------------------------------------------------------------------------- +# Before running, update the following fields in the yaml: +# - `execution.hostname` — your Slurm login node hostname +# - `execution.account` — your Slurm account +# - `deployment.checkpoint_path` — Hugging Face checkpoint path (original, pruned or quantized) +# - `evaluation.nemo_evaluator_config.config.params.extra.tokenizer` — same path as `checkpoint_path` +# +# Usage: +# pip install "nemo-evaluator-launcher[all]==0.1.90" +# +# # Set required environment variables: +# export HF_TOKEN= +# export SLURM_JOB_DIR= +# export HF_HOME= +# export VLLM_CACHE_ROOT= +# +# # Set additional unused but required environment variables: +# export API_KEY=xxxxxx +# export INFERENCE_API_KEY=xxxxxx +# export OPENAI_CLIENT_ID=xxxxxx +# export OPENAI_CLIENT_SECRET=xxxxxx +# +# nemo-evaluator-launcher run --config nemo_evaluator.yaml +# + +defaults: + - execution: slurm/default + - deployment: vllm + - _self_ + +execution: + type: slurm + hostname: + username: ${oc.env:USER} + account: + partition: batch + num_nodes: 1 + ntasks_per_node: 1 + gpus_per_node: 8 + gres: "gpu:8" + walltime: 04:00:00 + sbatch_comment: "{\"OccupiedIdleGPUsJobReaper\":{\"exemptIdleTimeMins\":\"1920\",\"reason\":\"benchmarking\",\"description\":\"Some evals need idle time\ + \ else gets cancelled\"}}" + subproject: nel + output_dir: ${oc.env:SLURM_JOB_DIR} + mode: sequential + + mounts: + mount_home: false + deployment: + n_tasks: 1 + batch_comment: "{\"OccupiedIdleGPUsJobReaper\":{\"exemptIdleTimeMins\":\"1920\",\"reason\":\"benchmarking\",\"description\":\"Required data validation\ + \ and evaluation\"}}" + +# Note: Only tp=1 works for Nano (Mamba-based architecture) +deployment: + # Update this to your Hugging Face checkpoint path (original, pruned or quantized) + checkpoint_path: + served_model_name: Nemotron-Nano-9B-v2 + port: 8000 + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + data_parallel_size: 8 + gpu_memory_utilization: 0.8 + extra_args: "--trust-remote-code --no-enable-prefix-caching --mamba_ssm_cache_dtype float32 --model-loader-extra-config '{\"enable_multithread_load\"\ + : true, \"num_threads\": 96}' --kv-cache-dtype fp8 " + env_vars: + VLLM_ATTENTION_BACKEND: FLASH_ATTN + endpoints: + chat: /v1/chat/completions + completions: /v1/completions + health: /health + multiple_instances: true + +evaluation: + nemo_evaluator_config: + target: + api_endpoint: + adapter_config: + use_system_prompt: true + use_reasoning: false + params_to_add: + chat_template_kwargs: + enable_thinking: true + skip_special_tokens: false + use_caching: true + tracking_requests_stats: true + log_failed_requests: true + use_request_logging: true + max_logged_requests: 10 + use_response_logging: true + max_logged_responses: 10 + 
config: + params: + parallelism: 64 + max_new_tokens: 32768 + temperature: 0.6 + top_p: 0.95 + request_timeout: 3600 + max_retries: 10 + extra: + tokenizer_backend: huggingface + # Update tokenizer path to match checkpoint_path above + tokenizer: + env_vars: + HF_TOKEN: HF_TOKEN + HF_HOME: HF_HOME + VLLM_CACHE_ROOT: VLLM_CACHE_ROOT + API_KEY: API_KEY + INFERENCE_API_KEY: INFERENCE_API_KEY + OPENAI_CLIENT_ID: OPENAI_CLIENT_ID + OPENAI_CLIENT_SECRET: OPENAI_CLIENT_SECRET + + tasks: + # 1. MMLU Pro + - name: ns_mmlu_pro + env_vars: + HF_TOKEN: HF_TOKEN + nemo_evaluator_config: + config: + params: + # limit_samples: 8 + extra: + num_repeats: 1 + args: "++prompt_config=eval/aai/mcq-10choices-boxed" + + # 2. GPQA Diamond + - name: ns_gpqa + env_vars: + HF_TOKEN: HF_TOKEN + nemo_evaluator_config: + config: + params: + # limit_samples: 8 + extra: + num_repeats: 8 + args: "++prompt_config=eval/aai/mcq-4choices" + + # 3. LiveCodeBench + - name: ns_livecodebench + env_vars: + HF_TOKEN: HF_TOKEN + nemo_evaluator_config: + config: + params: + # limit_samples: 8 + extra: + num_repeats: 8 + dataset_split: test_v6_2408_2505 + + # 4. AIME 2025 + - name: ns_aime2025 + env_vars: + HF_TOKEN: HF_TOKEN + nemo_evaluator_config: + config: + params: + # limit_samples: 8 + extra: + num_repeats: 64 + + # 5. MATH500 (Requires JUDGE_API_KEY) + # - name: AA_math_test_500 + # env_vars: + # HF_TOKEN: HF_TOKEN + # JUDGE_API_KEY: JUDGE_API_KEY + # nemo_evaluator_config: + # config: + # params: + # # limit_samples: 8 + # extra: + # n_samples: 5 + + # 6. IFEval + - name: ns_ifeval + env_vars: + HF_TOKEN: HF_TOKEN + # nemo_evaluator_config: + # config: + # params: + # limit_samples: 8 + + # 7. SciCode + - name: ns_scicode + env_vars: + HF_TOKEN: HF_TOKEN + nemo_evaluator_config: + config: + params: + # limit_samples: 8 + max_new_tokens: 8192 + extra: + num_repeats: 8 diff --git a/examples/pruning/minitron/README.md b/examples/pruning/minitron/README.md new file mode 100644 index 0000000000..8749c366a7 --- /dev/null +++ b/examples/pruning/minitron/README.md @@ -0,0 +1,11 @@ +# Minitron Pruning — End-to-End Tutorials + +End-to-end tutorials for [Minitron](https://arxiv.org/abs/2407.14679) structured pruning followed by knowledge distillation, quantization, evaluation,and vLLM deployment. + +Each subdirectory covers a specific source model and target size, including the full data blend, pruning config, distillation hyperparameters, evaluation results, and throughput benchmarks. + +## Related + +- [Minitron pruning instructions](../../megatron_bridge/README.md#pruning) and [Megatron-Bridge distillation instructions](../../megatron_bridge/README.md#distillation) +- [Megatron dataset tokenization](../../dataset/MEGATRON_DATA_PREP.md) +- [Puzzletron pruning algorithm](../../puzzletron/README.md) diff --git a/examples/megatron_bridge/results/puzzletron.md b/examples/pruning/puzzletron/Llama-3.1-8B-Instruct.md similarity index 100% rename from examples/megatron_bridge/results/puzzletron.md rename to examples/pruning/puzzletron/Llama-3.1-8B-Instruct.md diff --git a/examples/pruning/puzzletron/README.md b/examples/pruning/puzzletron/README.md new file mode 100644 index 0000000000..426ced00c4 --- /dev/null +++ b/examples/pruning/puzzletron/README.md @@ -0,0 +1,16 @@ +# Puzzletron Pruning — Distillation Results + +Distillation results for models compressed with [Puzzletron](../../puzzletron/README.md) MIP-based heterogeneous pruning, followed by Megatron-Bridge knowledge distillation. 
+ +## Results + +| Model | File | +| --- | --- | +| Llama-3.1-8B-Instruct and Qwen3-8B | [Llama-3.1-8B-Instruct.md](Llama-3.1-8B-Instruct.md) | + +## Related + +- [Puzzletron pruning example](../../puzzletron/README.md) +- [Megatron-Bridge distillation instructions](../../megatron_bridge/README.md#distillation) +- [Megatron dataset tokenization](../../dataset/MEGATRON_DATA_PREP.md) +- [Minitron pruning instructions](../../pruning/README.md#minitron) diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md index 8918307339..571b40ca49 100644 --- a/examples/puzzletron/README.md +++ b/examples/puzzletron/README.md @@ -341,6 +341,8 @@ To recover degradation in the quality of the compressed model, we can use knowle See [Megatron-Bridge distillation](../megatron_bridge/README.md#distillation) for instructions on using Megatron-Bridge for knowledge distillation. The distillation script supports both standard HuggingFace and Puzzletron AnyModel checkpoints. +For distillation results on Puzzletron-compressed models, see [examples/pruning/puzzletron/](../pruning/puzzletron/README.md). + ## Advanced Usage Modify `llama-3_1-8B_pruneffn_memory.yaml` file for advanced compression scenarios. diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/pruning/attn_pruning.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/pruning/attn_pruning.yaml index 01886607e4..53d7e4bd9c 100644 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/pruning/attn_pruning.yaml +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/pruning/attn_pruning.yaml @@ -1,8 +1,15 @@ defaults: - pruning_defaults +hook_class: ${get_object:modelopt.torch.prune.importance_hooks.base_hooks.IndependentKvHeadContributionHook} + activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/attn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} +pruning_mixin: + _target_: modelopt.torch.puzzletron.pruning.kv_heads_pruning_mixin.KVHeadsPruningMixIn + layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor.LlamaKVHeadsLayerDescriptor + activation_hooks_kwargs: method: independent_kv_head_contribution optimize_for: memory # IndependentKvHeadContributionHook implementation that consumes less memory diff --git a/examples/speculative_decoding/main.py b/examples/speculative_decoding/main.py index 31c73d0427..6a8855930b 100644 --- a/examples/speculative_decoding/main.py +++ b/examples/speculative_decoding/main.py @@ -350,7 +350,7 @@ def train(): print_rank_0("Loading dataset...") is_dflash = training_args.mode == "dflash" - if training_args.mode in ("eagle3", "dflash"): + if training_args.mode in ("eagle3", "medusa", "dflash"): data_module = make_speculative_data_module( tokenizer, data_args, diff --git a/modelopt/onnx/llm_export_utils/__init__.py b/modelopt/onnx/llm_export_utils/__init__.py new file mode 100644 index 0000000000..8ea066d865 --- /dev/null +++ b/modelopt/onnx/llm_export_utils/__init__.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Deprecated shim for the legacy ``modelopt.onnx.llm_export_utils`` package. + +The in-repo LLM ONNX export pipeline (formerly ``examples/torch_onnx/llm_export.py`` +plus this package) was removed in 0.44.0rc1 in favor of +`TensorRT-Edge-LLM `_, which provides +a more complete and actively maintained pipeline. + +This package is preserved only as a compatibility shim so external consumers that +still import ``modelopt.onnx.llm_export_utils`` (notably TensorRT-Edge-LLM 0.6.1 +and earlier) continue to work. It will be removed in a future release. + +New code should migrate to: + +* ``modelopt.onnx.export`` — quant exporters (``FP8QuantExporter``, ``NVFP4QuantExporter``, etc.) +* ``modelopt.onnx.graph_surgery`` — graph transforms (GQA replacement, BF16 conversion, etc.) +* `TensorRT-Edge-LLM `_ — end-to-end LLM export. +""" + +import warnings + +warnings.warn( + "modelopt.onnx.llm_export_utils is deprecated and will be removed in a future " + "release. Use modelopt.onnx.export and modelopt.onnx.graph_surgery, or migrate " + "to TensorRT-Edge-LLM (https://github.com/NVIDIA/TensorRT-Edge-LLM).", + DeprecationWarning, + stacklevel=2, +) diff --git a/modelopt/onnx/llm_export_utils/export_utils.py b/modelopt/onnx/llm_export_utils/export_utils.py new file mode 100644 index 0000000000..2016e872e2 --- /dev/null +++ b/modelopt/onnx/llm_export_utils/export_utils.py @@ -0,0 +1,162 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Utilities for exporting LLM models to ONNX.""" + +import json +import os +import time +from enum import Enum + +import torch +from transformers import AutoModelForCausalLM, DynamicCache + + +class RopeType(Enum): + """Rope type enum.""" + + K_NONE = 0 + K_ROPE_ROTATE_GPTJ = 1 + K_ROPE_ROTATE_NEOX = 2 + K_MROPE = 3 + + +class ModelLoader: + """A class to handle HuggingFace model loading and configuration.""" + + def __init__(self, hf_model_path: str, config_path: str): + """Initialize the ModelLoader.""" + self.config_path = config_path + self.hf_model_path = hf_model_path + self.model_type = self.get_model_type() + self.hf_model = None + self.rope_type = RopeType.K_ROPE_ROTATE_NEOX + + def get_model_type(self): + """Get model type from config file.""" + with open(self.config_path) as f: + return json.load(f).get("model_type") + + def load_model(self, trust_remote_code: bool = False) -> AutoModelForCausalLM: + """Load HuggingFace model based on model type.""" + print(f"Loading HF model from {self.hf_model_path} with model type {self.model_type}") + self.hf_model = AutoModelForCausalLM.from_pretrained( + self.hf_model_path, torch_dtype=torch.float16, trust_remote_code=trust_remote_code + ) + + return self.hf_model.eval().cuda() # type: ignore[attr-defined] + + def get_rope_type(self): + """Get rope type.""" + return self.rope_type + + +class WrapperModelForCausalLM(torch.nn.Module): + """Wrapper Model to ensure all models have the same I/O.""" + + def __init__(self, model): + """Initialize the WrapperModelForCausalLM.""" + super().__init__() + try: + self.model = model.model + except Exception: + self.model = model + self.lm_head = model.lm_head + self.config = model.config + + def forward(self, input_ids: torch.Tensor | None, past_key_values: tuple): + """Forward pass.""" + # Convert tuple cache to DynamicCache for models that require it (e.g., Qwen3) + cache = DynamicCache(config=self.config) + cache.key_cache = [kv[0] for kv in past_key_values] + cache.value_cache = [kv[1] for kv in past_key_values] + past_key_values = cache + + outputs = self.model(input_ids=input_ids, past_key_values=past_key_values, use_cache=True) + hidden_states = outputs[0] + past_key_values = outputs.past_key_values.to_legacy_cache() + logits = self.lm_head(hidden_states) + return logits, past_key_values + + +def llm_to_onnx(model, output_dir, extra_inputs={}, extra_dyn_axes={}): + """Export the WrapperModelForCausalLM to ONNX with fixed I/O names and shape definitions and save to `output_dir`. + + Parameters: + model: torch.Module + output_dir: str, the output_dir of the original ONNX. + extra_inputs: dict, append additional inputs after kv_cache. Usually for VL models + extra_dyn_axes: dict. 
Usually for VL models + """ + start_time = time.time() + config = model.config + num_layers = config.num_hidden_layers + num_attention_heads = config.num_attention_heads + num_key_value_heads = config.num_key_value_heads + hidden_size = config.hidden_size + hidden_size_per_layer = hidden_size // num_attention_heads + + dummy_bs = 1 + dummy_len = 10 + dummy_input_ids = torch.randint(100, (dummy_bs, dummy_len), dtype=torch.int64).cuda() + input_names = ["input_ids"] + output_names = ["logits"] + dynamic_axes = {"input_ids": {0: "batch_size", 1: "seq_len"}} + dummy_kv_cache = () + for i in range(num_layers): + dummy_k = torch.rand( + (dummy_bs, num_key_value_heads, dummy_len, hidden_size_per_layer), dtype=torch.float16 + ).cuda() + dummy_v = torch.rand( + (dummy_bs, num_key_value_heads, dummy_len, hidden_size_per_layer), dtype=torch.float16 + ).cuda() + dummy_kv_cache = (*dummy_kv_cache, (dummy_k, dummy_v)) + input_names.extend([f"past_key_values.{i}.key", f"past_key_values.{i}.value"]) + output_names.extend([f"present_key_values.{i}.key", f"present_key_values.{i}.value"]) + input_dynamic_axes = {0: "batch_size", 2: "past_len"} + dynamic_axes[f"past_key_values.{i}.key"] = input_dynamic_axes + dynamic_axes[f"past_key_values.{i}.value"] = input_dynamic_axes + + torch_to_onnx( + model, + (dummy_input_ids, {"past_key_values": dummy_kv_cache, **extra_inputs}), + output_dir, + "model.onnx", + input_names=input_names + list(extra_inputs.keys()), + output_names=output_names, + dynamic_axes=dynamic_axes | extra_dyn_axes, + ) + + end_time = time.time() + print( + f"Native ONNX Export from torch completed in {end_time - start_time}s. ONNX file is saved to {output_dir}." + ) + + +def torch_to_onnx(model, inputs, onnx_dir, onnx_name, input_names, output_names, dynamic_axes): + """Export the model to ONNX.""" + os.makedirs(onnx_dir, exist_ok=True) + with torch.inference_mode(): + torch.onnx.export( + model, + inputs, + f"{onnx_dir}/{onnx_name}", + input_names=input_names, + output_names=output_names, + dynamic_axes=dynamic_axes, + opset_version=19, + do_constant_folding=True, + dynamo=False, + ) diff --git a/modelopt/onnx/llm_export_utils/quantization_utils.py b/modelopt/onnx/llm_export_utils/quantization_utils.py new file mode 100644 index 0000000000..ac24c24a53 --- /dev/null +++ b/modelopt/onnx/llm_export_utils/quantization_utils.py @@ -0,0 +1,146 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Quantization utilities for LLM models.""" + +import copy +import time + +import modelopt.torch.quantization as mtq +from modelopt.torch.utils.dataset_utils import get_dataset_dataloader + + +def _quantize_model(model, quant_config, calib_dataloader=None): + """The calibration loop for the model can be setup using the modelopt API. + + Example usage: + from modelopt.torch.utils.dataset_utils import create_forward_loop + model = ... 
# Initialize the model + tokenizer = ... # Initialize the tokenizer + quant_cfg = ... # Setup quantization configuration + forward_loop = create_forward_loop(model=model, dataset_name="cnn_dailymail", tokenizer=tokenizer) + mtq.quantize(model, quant_cfg, forward_loop=forward_loop) + """ + + def calibrate_loop(model): + """Adjusts weights and scaling factors based on selected algorithms.""" + for idx, data in enumerate(calib_dataloader): + if idx % 10 == 0: + print(f"Calibrating batch {idx}...") + if isinstance(data, dict): + data = {k: v.to(model.device) for k, v in data.items()} + model(**data) + else: + data = data.to(model.device) + model(data) + + print("Starting quantization...") + start_time = time.time() + mtq.quantize(model, quant_config, forward_loop=calibrate_loop) + end_time = time.time() + print(f"Quantization finishes in {end_time - start_time}s.") + + return model + + +def get_quant_config(precision, lm_head_precision="fp16"): + """Get the quantization configuration.""" + if precision == "fp8": + quant_cfg = copy.deepcopy(mtq.FP8_DEFAULT_CFG) + + elif precision == "nvfp4": + quant_cfg = copy.deepcopy(mtq.NVFP4_DEFAULT_CFG) + + elif precision == "int4_awq": + quant_cfg = copy.deepcopy(mtq.INT4_AWQ_CFG) # type: ignore[arg-type] + + else: + raise ValueError(f"Unsupported precision: {precision}") + + quant_cfg_list: list = [ + e for e in quant_cfg["quant_cfg"] if isinstance(e, dict) and "quantizer_name" in e + ] + + if lm_head_precision == "fp8": + quant_cfg_list.append( + { + "quantizer_name": "*lm_head.input_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + } + ) + quant_cfg_list.append( + { + "quantizer_name": "*lm_head.weight_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + } + ) + elif lm_head_precision == "nvfp4": + quant_cfg_list.append( + { + "quantizer_name": "*lm_head.input_quantizer", + "cfg": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + }, + "enable": True, + } + ) + quant_cfg_list.append( + { + "quantizer_name": "*lm_head.weight_quantizer", + "cfg": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + }, + "enable": True, + } + ) + quant_cfg["quant_cfg"] = quant_cfg_list + return quant_cfg + + +def quantize( + model, tokenizer, precision, lm_head_precision="fp16", dataset_dir=None, calib_size=512 +): + """Quantize the PyTorch model to fp8 or int4_awq.""" + assert precision in [ + "fp8", + "int4_awq", + "nvfp4", + ], ( + f"Only fp8(W8A8), int4_awq(W4A16), nvfp4(W4A4) is supported. You passed an unsupported precision: {precision}." + ) + + assert lm_head_precision in ["fp16"], ( + f"Only fp16(unquantized) is supported for lm_head. You passed an unsupported precision: {lm_head_precision}." 
+ ) + + if tokenizer.pad_token != "": # nosec B105 + tokenizer.pad_token = tokenizer.eos_token + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + if not dataset_dir: + dataset_dir = "cnn_dailymail" + + batch_size = 1 + data_loader = get_dataset_dataloader( + dataset_name=dataset_dir, tokenizer=tokenizer, batch_size=batch_size, num_samples=calib_size + ) + quant_config = get_quant_config(precision, lm_head_precision) + quantized_model = _quantize_model(model, quant_config, data_loader) + mtq.print_quant_summary(quantized_model) + return quantized_model diff --git a/modelopt/onnx/llm_export_utils/surgeon_utils.py b/modelopt/onnx/llm_export_utils/surgeon_utils.py new file mode 100644 index 0000000000..2937f6ad0c --- /dev/null +++ b/modelopt/onnx/llm_export_utils/surgeon_utils.py @@ -0,0 +1,120 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities to surgeon ONNX graph after export.""" + +import re +import time + +import onnx +import onnx_graphsurgeon as gs +import torch +from onnx_graphsurgeon.ir.tensor import LazyValues + + +def clear_inputs(node: gs.Node | gs.Tensor): + """Clear all inputs for a node or tensor in ONNX.""" + for i in node.inputs: + i.outputs.clear() + node.inputs.clear() + return node + + +def clear_outputs(node: gs.Node | gs.Tensor): + """Clear all outputs for a node or tensor in ONNX.""" + for o in node.outputs: + o.inputs.clear() + node.outputs.clear() + return node + + +def extract_layer_id(name: str): + """Extract layer id from certain ONNX layer name. + + Parameters: + name: str + The name of ONNX layer. e.g. /model/layer.0/q_proj/... + + Returns: + The layer id for the layer as int. In the example above, it returns 0 + """ + match = re.search(r"layers\.(\d+)", name) + if match: + return int(match.group(1)) + raise Exception(f"{name} does not contain layer info!") + + +def no_none_elements(elements: list): + """Check if all elements in the list are not None.""" + return all(i is not None for i in elements) + + +def fold_fp8_qdq_to_dq(graph: gs.Graph): + """Convert FP32/FP16 weights of the given ONNX model to FP8 weights. + + Even though modelopt supports FP8 onnx export, the weights are represented in fp32 + QDQ. + The storage is therefore very bad. In this function, + Q nodes will get removed from the weights and have only DQ nodes with those converted FP8 + weights in the output model. + + Parameters: + graph: gs.Graph. + + Returns: + gs.Graph with only DQ nodes for weights and same QDQ nodes for activations. + """ + start_time = time.time() + print("Replacing all (fp32 weights + fp8 QDQ) with (fp8 weights + DQ)...") + # Fold constants is required since the scale is not constant yet. 
+ graph.cleanup().toposort().fold_constants().cleanup() + + for node in graph.nodes: + if node.op == "TRT_FP8QuantizeLinear": + # Should not remove input QDQ + if not isinstance(node.inputs[0], gs.Constant): + continue + + weights = node.inputs[0] + scale = node.inputs[1] + torch_weights = torch.from_numpy(weights.values) + torch_scale = torch.from_numpy(scale.values) + quantizer_name = scale.name.rsplit("/", 1)[0] + dq_op = node.outputs[0].outputs[0] + assert dq_op.op == "TRT_FP8DequantizeLinear", ( + f"QDQ does not occur in pairs. You reached {dq_op.op}" + ) + + # Replace it with Dequantize with FP8 weights. This is a WAR because numpy does not support fp8. + numpy_weights = ( + (torch_weights / torch_scale).to(torch.float8_e4m3fn).view(torch.uint8).numpy() + ) + tensor = onnx.TensorProto() + tensor.data_type = onnx.TensorProto.FLOAT8E4M3FN + tensor.dims.extend(numpy_weights.shape) + tensor.raw_data = numpy_weights.tobytes() + values = LazyValues(tensor) + onnx_weights_fp8 = gs.Constant(quantizer_name + "/fp8_weights", values) + + node.outputs.clear() + # DQ Op is separated out + dq_op.inputs[0] = onnx_weights_fp8 + dq_op.op = "DequantizeLinear" + dq_op.outputs[0].dtype = dq_op.inputs[1].dtype + + graph.cleanup().toposort() + end_time = time.time() + print(f"fp8 qdq replaced with only dq completed in {end_time - start_time}s.") + + return graph diff --git a/modelopt/onnx/quantization/autotune/benchmark.py b/modelopt/onnx/quantization/autotune/benchmark.py index f931ae6c11..df6dbc877d 100644 --- a/modelopt/onnx/quantization/autotune/benchmark.py +++ b/modelopt/onnx/quantization/autotune/benchmark.py @@ -220,7 +220,6 @@ def __init__( "Remote autotuning requires '--skipInference' to be set. Adding it to trtexec arguments." ) self.trtexec_args.append("--skipInference") - return except ImportError: self.logger.warning( "Remote autotuning is not supported with TensorRT version < 10.15. 
" diff --git a/modelopt/onnx/quantization/int4.py b/modelopt/onnx/quantization/int4.py index b17431fb9b..d680b47cfc 100644 --- a/modelopt/onnx/quantization/int4.py +++ b/modelopt/onnx/quantization/int4.py @@ -480,6 +480,23 @@ def _augment_graph( augmented_outputs.add(act_tensor.name) +def _remove_augmented_onnx(onnx_path: str, use_external_data_format: bool) -> None: + """Remove the augmented ONNX temp file and its external data companion (if any).""" + try: + os.remove(onnx_path) + except FileNotFoundError: + pass + except OSError as e: + logger.warning("Failed to remove augmented ONNX file: %s", e) + if use_external_data_format: + try: + os.remove(onnx_path + "_data") + except FileNotFoundError: + pass + except OSError as e: + logger.warning("Failed to remove augmented ONNX data file: %s", e) + + def _change_input_type( graph: onnx.GraphProto, input_name: str, gemm_io_type: onnx.TensorProto.DataType ): @@ -533,147 +550,152 @@ def _quantize_awq_clip( augmented_onnx_file, augmented_onnx_path = tempfile.mkstemp(suffix=".onnx") os.close(augmented_onnx_file) - save_onnx(augmented_model, augmented_onnx_path, use_external_data_format) - logger.info(f"Saving the model took {time.time() - t} seconds") - - # Creating inference session and preparing inputs for calibration - session = create_inference_session(augmented_onnx_path, calibration_eps, input_shapes_profile) - inputs = [] - for inp_d in data_reader: - inputs.append(inp_d) - assert isinstance(inp_d, dict) - layer_info = get_layer_info(onnx_model, nodes_to_exclude, block_size, **kwargs) - # Apply AWQ clip on selected weights - t = time.time() - alphas = {} - for i in tqdm(range(len(wa_pack)), desc="Running clip search..."): - act_tensor, weight_tensor, do_transpose, gemm_io_type, _ = wa_pack[i] - - # First capture all the activation values after calibration data sweep - output_dicts = {} - for inp_d in inputs: - np_inp_d = {name: numpy.asarray(tensor) for name, tensor in inp_d.items()} - output = session.run([act_tensor.name], np_inp_d) - out = np.asarray(output[0]) - output_dicts.setdefault(act_tensor.name, []).append(out) - - # Concatenating the activation tensors over all calib data - x = np.concatenate(output_dicts[act_tensor.name], axis=0) # n_token, ci - w = numpy_helper.to_array( - weight_tensor, base_dir=os.path.dirname(augmented_onnx_path) - ).copy() - if do_transpose: - w = w.T - w = np.asarray(w) - num_bits = get_num_bits(layer_info, weight_tensor.name) - # Updating the block size as for 8bit quantization, per-channel quantization is used. 
- block_size_updated = update_block_size(block_size, layer_info, weight_tensor.name, w=w) - awq_clip = AWQClipHelper(w, block_size_updated, **kwargs) - _clip_search(x, w, awq_clip, num_bits=num_bits, **kwargs) - alphas[weight_tensor.name] = awq_clip.best_alpha - - logger.info(f"Clip search for all weights took {time.time() - t} seconds") + session = None + try: + save_onnx(augmented_model, augmented_onnx_path, use_external_data_format) + logger.info(f"Saving the model took {time.time() - t} seconds") - del session + # Creating inference session and preparing inputs for calibration + session = create_inference_session( + augmented_onnx_path, calibration_eps, input_shapes_profile + ) + inputs = [] + for inp_d in data_reader: + inputs.append(inp_d) + assert isinstance(inp_d, dict) + layer_info = get_layer_info(onnx_model, nodes_to_exclude, block_size, **kwargs) + # Apply AWQ clip on selected weights + t = time.time() + alphas = {} + for i in tqdm(range(len(wa_pack)), desc="Running clip search..."): + act_tensor, weight_tensor, do_transpose, gemm_io_type, _ = wa_pack[i] - # Compute quantized weights and scales which are needed for DQ nodes - t = time.time() - for i in tqdm(range(len(wa_pack)), desc="Quantizing the weights..."): - act_tensor, weight_tensor, do_transpose, gemm_io_type, _ = wa_pack[i] - gemm_io_type = cast("onnx.TensorProto.DataType", gemm_io_type) + # First capture all the activation values after calibration data sweep + output_dicts = {} + for inp_d in inputs: + np_inp_d = {name: numpy.asarray(tensor) for name, tensor in inp_d.items()} + output = session.run([act_tensor.name], np_inp_d) + out = np.asarray(output[0]) + output_dicts.setdefault(act_tensor.name, []).append(out) - if force_fp16: - gemm_io_type = onnx.TensorProto.FLOAT16 + # Concatenating the activation tensors over all calib data + x = np.concatenate(output_dicts[act_tensor.name], axis=0) # n_token, ci + w = numpy_helper.to_array( + weight_tensor, base_dir=os.path.dirname(augmented_onnx_path) + ).copy() + if do_transpose: + w = w.T + w = np.asarray(w) + num_bits = get_num_bits(layer_info, weight_tensor.name) + # Updating the block size as for 8bit quantization, per-channel quantization is used. + block_size_updated = update_block_size(block_size, layer_info, weight_tensor.name, w=w) + awq_clip = AWQClipHelper(w, block_size_updated, **kwargs) + _clip_search(x, w, awq_clip, num_bits=num_bits, **kwargs) + alphas[weight_tensor.name] = awq_clip.best_alpha - w = numpy_helper.to_array( - weight_tensor, base_dir=os.path.dirname(augmented_onnx_path) - ).copy() - if do_transpose: - w = w.T - w = np.asarray(w) + logger.info(f"Clip search for all weights took {time.time() - t} seconds") - alpha = alphas.get(weight_tensor.name, 1) - num_bits = get_num_bits(layer_info, weight_tensor.name) - # Updating the block size as for 8bit quantization, per-channel quantization is used. 
- block_size_updated = update_block_size(block_size, layer_info, weight_tensor.name, w=w) - qw, scale, _ = quant_tensor(w, block_size_updated, alpha=alpha, num_bits=num_bits) - if has_cupy: - qw = np.asnumpy(qw) - scale = np.asnumpy(scale) - if do_transpose: - qw = qw.T - scale = scale.T - scales[weight_tensor.name] = scale.astype( - onnx.helper.tensor_dtype_to_np_dtype(gemm_io_type) - ) - gemm_weights_quantized[weight_tensor.name] = numpy.asarray(qw).astype(numpy.int8) + session = None - # Change the input activation type to the expected type, fp16 by default - # TODO: cast input C for Gemm - _change_input_type(onnx_model.graph, act_tensor.name, gemm_io_type) + # Compute quantized weights and scales which are needed for DQ nodes + t = time.time() + for i in tqdm(range(len(wa_pack)), desc="Quantizing the weights..."): + act_tensor, weight_tensor, do_transpose, gemm_io_type, _ = wa_pack[i] + gemm_io_type = cast("onnx.TensorProto.DataType", gemm_io_type) - logger.info(f"Quantizing actual weights took {time.time() - t} seconds") + if force_fp16: + gemm_io_type = onnx.TensorProto.FLOAT16 - graph_gs = gs.import_onnx(onnx_model) + w = numpy_helper.to_array( + weight_tensor, base_dir=os.path.dirname(augmented_onnx_path) + ).copy() + if do_transpose: + w = w.T + w = np.asarray(w) - gather_block_size = kwargs.get("gather_block_size", DEFAULT_GATHER_BLOCK_SIZE) - gather_quantize_axis = kwargs.get("gather_quantize_axis", DEFAULT_GATHER_QUANTIZE_AXIS) - gather_w_map = None - gather_s_map = None - if gather_quantize_axis is not None: - gather_w_map, gather_s_map, _ = _quantize_gather_nodes( - graph_gs, - nodes_to_exclude, - use_zero_point=False, - dq_only=True, - layer_info=layer_info, - ) + alpha = alphas.get(weight_tensor.name, 1) + num_bits = get_num_bits(layer_info, weight_tensor.name) + # Updating the block size as for 8bit quantization, per-channel quantization is used. 
+ block_size_updated = update_block_size(block_size, layer_info, weight_tensor.name, w=w) + qw, scale, _ = quant_tensor(w, block_size_updated, alpha=alpha, num_bits=num_bits) + if has_cupy: + qw = np.asnumpy(qw) + scale = np.asnumpy(scale) + if do_transpose: + qw = qw.T + scale = scale.T + scales[weight_tensor.name] = scale.astype( + onnx.helper.tensor_dtype_to_np_dtype(gemm_io_type) + ) + gemm_weights_quantized[weight_tensor.name] = numpy.asarray(qw).astype(numpy.int8) + + # Change the input activation type to the expected type, fp16 by default + # TODO: cast input C for Gemm + _change_input_type(onnx_model.graph, act_tensor.name, gemm_io_type) + + logger.info(f"Quantizing actual weights took {time.time() - t} seconds") + + graph_gs = gs.import_onnx(onnx_model) + + gather_block_size = kwargs.get("gather_block_size", DEFAULT_GATHER_BLOCK_SIZE) + gather_quantize_axis = kwargs.get("gather_quantize_axis", DEFAULT_GATHER_QUANTIZE_AXIS) + gather_w_map = None + gather_s_map = None + if gather_quantize_axis is not None: + gather_w_map, gather_s_map, _ = _quantize_gather_nodes( + graph_gs, + nodes_to_exclude, + use_zero_point=False, + dq_only=True, + layer_info=layer_info, + ) - t = time.time() - # Apply column-major optimization if flag is set - # Transposes the weights and scales in-place - use_column_major = kwargs.get("use_column_major", False) - if use_column_major: - qdq.apply_column_major_transformation(gemm_weights_quantized, scales) - dq_node_attributes = {"axis": 1, "block_size": block_size} - else: - dq_node_attributes = {"axis": 0, "block_size": block_size} - scales = reshape_scales_for_per_channel_nodes(scales, block_size, layer_info) - qdq.insert_dq_nodes( - graph_gs, - scales, - quantized_weights=gemm_weights_quantized, - attributes=dq_node_attributes, - layer_info=layer_info, - ) - # Add transpose nodes for column-major if needed - if use_column_major: - qdq.insert_transpose_nodes_for_column_major(graph_gs) - if gather_w_map is not None: - assert gather_s_map is not None, "scale-map not found for quantizable gather nodes" - gather_dq_node_attributes = {"axis": gather_quantize_axis, "block_size": gather_block_size} + t = time.time() + # Apply column-major optimization if flag is set + # Transposes the weights and scales in-place + use_column_major = kwargs.get("use_column_major", False) + if use_column_major: + qdq.apply_column_major_transformation(gemm_weights_quantized, scales) + dq_node_attributes = {"axis": 1, "block_size": block_size} + else: + dq_node_attributes = {"axis": 0, "block_size": block_size} + scales = reshape_scales_for_per_channel_nodes(scales, block_size, layer_info) qdq.insert_dq_nodes( graph_gs, - gather_s_map, - quantized_weights=gather_w_map, - attributes=gather_dq_node_attributes, + scales, + quantized_weights=gemm_weights_quantized, + attributes=dq_node_attributes, layer_info=layer_info, ) - logger.info(f"Inserting DQ nodes took {time.time() - t} seconds") - - logger.info("Exporting the quantized graph") - t = time.time() - model = gs.export_onnx(graph_gs) - # Set ir_version to 10, remove it once ORT supports ir_version 11 - model.ir_version = 10 - logger.info(f"Exporting took {time.time() - t} seconds") + # Add transpose nodes for column-major if needed + if use_column_major: + qdq.insert_transpose_nodes_for_column_major(graph_gs) + if gather_w_map is not None: + assert gather_s_map is not None, "scale-map not found for quantizable gather nodes" + gather_dq_node_attributes = { + "axis": gather_quantize_axis, + "block_size": gather_block_size, + } + 
qdq.insert_dq_nodes( + graph_gs, + gather_s_map, + quantized_weights=gather_w_map, + attributes=gather_dq_node_attributes, + layer_info=layer_info, + ) + logger.info(f"Inserting DQ nodes took {time.time() - t} seconds") - try: - os.remove(augmented_onnx_path) - if use_external_data_format: - os.remove(augmented_onnx_path + "_data") - except OSError: - logger.warn("Augmented ONNX model or external data file was not found") + logger.info("Exporting the quantized graph") + t = time.time() + model = gs.export_onnx(graph_gs) + # Set ir_version to 10, remove it once ORT supports ir_version 11 + model.ir_version = 10 + logger.info(f"Exporting took {time.time() - t} seconds") + finally: + if session is not None: + session = None + gc.collect() + _remove_augmented_onnx(augmented_onnx_path, use_external_data_format) return model @@ -1085,316 +1107,323 @@ def _quantize_awq_lite( augmented_onnx_file, augmented_onnx_path = tempfile.mkstemp(suffix=".onnx") os.close(augmented_onnx_file) - save_onnx(augmented_model, augmented_onnx_path, use_external_data_format) - logger.info(f"Saving the model took {time.time() - t} seconds") - - # Creating inference session and preparing inputs for calibration - session = create_inference_session(augmented_onnx_path, calibration_eps, input_shapes_profile) - inputs = [] - for inp_d in data_reader: - inputs.append(inp_d) - assert isinstance(inp_d, dict) - - gc.collect() - - output_data = [] - - if enable_fast_path_using_high_sysram: - logger.info("Fast-path-using-high-sysram is enabled\n") - - tensor_names_list = [] - for i in tqdm(range(len(wa_pack)), desc="Getting tensor names..."): - act_tensor, weight_tensor, do_transpose, gemm_io_type, _ = wa_pack[i] - tensor_names_list.append(act_tensor.name) + session = None + try: + save_onnx(augmented_model, augmented_onnx_path, use_external_data_format) + logger.info(f"Saving the model took {time.time() - t} seconds") - for i in tqdm(range(len(inputs)), desc="Caching activations..."): - inp_d = inputs[i] - np_inp_d = {name: numpy.asarray(tensor) for name, tensor in inp_d.items()} - output = session.run(tensor_names_list, np_inp_d) - output_data.append(output) + # Creating inference session and preparing inputs for calibration + session = create_inference_session( + augmented_onnx_path, calibration_eps, input_shapes_profile + ) + inputs = [] + for inp_d in data_reader: + inputs.append(inp_d) + assert isinstance(inp_d, dict) - del session - session = None gc.collect() - # Apply AWQ lite on selected weights - t = time.time() - awq_lite = [None] * len(wa_pack) - clip_alphas = {} - - msg = "..." - if enable_weight_clipping: - msg = " and clip-range search..." 
+ output_data = [] - act_to_wa_pack_map, act_to_quant_nodes_weight_shape_map = ( - get_act_to_weight_map_and_act_to_wa_pack_map(wa_pack) - ) - if run_per_subgraph: - # TODO - add support for handling awq_lite mixed precision for per-subgraph implementation - awq_lite = run_awq_scale_search_per_subgraph( - wa_pack, - act_to_wa_pack_map, - act_to_quant_nodes_weight_shape_map, - augmented_onnx_path, - block_size, - use_zero_point, - session, - awq_lite, - inputs, - msg, - **kwargs, - ) - else: - awq_lite, clip_alphas = run_awq_scale_search_per_node( - wa_pack, - augmented_onnx_path, - block_size, - use_zero_point, - session, - awq_lite, - inputs, - msg, - enable_weight_clipping, - enable_fast_path_using_high_sysram, - output_data, - clip_alphas, - layer_info, - **kwargs, - ) - assert len(awq_lite) == len(wa_pack) - for i in range(len(awq_lite)): - assert awq_lite[i] is not None - - if enable_weight_clipping: - assert len(clip_alphas.keys()) == len(wa_pack) - - logger.info("AWQ scale search" + msg.strip(".") + f" took {time.time() - t} seconds") + if enable_fast_path_using_high_sysram: + logger.info("Fast-path-using-high-sysram is enabled\n") - if session is not None: - del session - session = None - if has_cupy: - np.get_default_memory_pool().free_all_blocks() - del output_data - gc.collect() + tensor_names_list = [] + for i in tqdm(range(len(wa_pack)), desc="Getting tensor names..."): + act_tensor, weight_tensor, do_transpose, gemm_io_type, _ = wa_pack[i] + tensor_names_list.append(act_tensor.name) - # Compute quantized weights and scales which are needed for DQ nodes - t = time.time() - # Use a common mean scale for weights within a sub-graph - if fuse_nodes and not run_per_subgraph: - for wa_pack_idx_list in act_to_wa_pack_map.values(): - group_awq_scale = [ - awq_lite[wa_pack_idx].best_scale[:, np.newaxis] for wa_pack_idx in wa_pack_idx_list - ] - mean_awq_scale = np.concatenate(group_awq_scale, axis=1) - mean_awq_scale = mean_awq_scale.mean(axis=1) - for wa_pack_idx in wa_pack_idx_list: - awq_lite[wa_pack_idx].best_scale = mean_awq_scale + for i in tqdm(range(len(inputs)), desc="Caching activations..."): + inp_d = inputs[i] + np_inp_d = {name: numpy.asarray(tensor) for name, tensor in inp_d.items()} + output = session.run(tensor_names_list, np_inp_d) + output_data.append(output) - for i in tqdm(range(len(wa_pack)), desc="Quantizing the weights..."): - act_tensor, weight_tensor, do_transpose, gemm_io_type, _ = wa_pack[i] - gemm_io_type = cast("onnx.TensorProto.DataType", gemm_io_type) + del session + session = None + gc.collect() - if force_fp16: - gemm_io_type = onnx.TensorProto.FLOAT16 + # Apply AWQ lite on selected weights + t = time.time() + awq_lite = [None] * len(wa_pack) + clip_alphas = {} - w = numpy_helper.to_array( - weight_tensor, base_dir=os.path.dirname(augmented_onnx_path) - ).copy() - if do_transpose: - w = w.T - w = np.asarray(w) + msg = "..." + if enable_weight_clipping: + msg = " and clip-range search..." - w_scaled = w * awq_lite[i].best_scale[:, np.newaxis] - alpha = clip_alphas.get(weight_tensor.name, 1) - assert enable_weight_clipping or (alpha == 1), ( - "clip range enabled without enabling weight-clipping param" - ) - # Updating the block size as for 8bit quantization, per-channel quantization is used. 
- num_bits = get_num_bits(layer_info, weight_tensor.name) - block_size_updated = update_block_size( - block_size, layer_info, weight_tensor.name, w=w_scaled - ) - qw, scale, zp = quant_tensor( - w_scaled, - block_size_updated, - alpha=alpha, - use_zero_point=use_zero_point, - num_bits=num_bits, + act_to_wa_pack_map, act_to_quant_nodes_weight_shape_map = ( + get_act_to_weight_map_and_act_to_wa_pack_map(wa_pack) ) + if run_per_subgraph: + # TODO - add support for handling awq_lite mixed precision for per-subgraph implementation + awq_lite = run_awq_scale_search_per_subgraph( + wa_pack, + act_to_wa_pack_map, + act_to_quant_nodes_weight_shape_map, + augmented_onnx_path, + block_size, + use_zero_point, + session, + awq_lite, + inputs, + msg, + **kwargs, + ) + else: + awq_lite, clip_alphas = run_awq_scale_search_per_node( + wa_pack, + augmented_onnx_path, + block_size, + use_zero_point, + session, + awq_lite, + inputs, + msg, + enable_weight_clipping, + enable_fast_path_using_high_sysram, + output_data, + clip_alphas, + layer_info, + **kwargs, + ) + assert len(awq_lite) == len(wa_pack) + for i in range(len(awq_lite)): + assert awq_lite[i] is not None - assert use_zero_point is True or zp is None, "zp is not according to use-zero-point setting" - if do_transpose: - qw = qw.T - scale = scale.T - if zp is not None: - zp = zp.T - if has_cupy: - qw = np.asnumpy(qw) - scale = np.asnumpy(scale) - if zp is not None: - zp = np.asnumpy(zp) - scales[weight_tensor.name] = scale.astype( - onnx.helper.tensor_dtype_to_np_dtype(gemm_io_type) - ) - weight_dtype = numpy.int8 - if zp is not None: - zero_points[weight_tensor.name] = numpy.asarray(zp).astype(numpy.uint8) - weight_dtype = numpy.uint8 - gemm_weights_quantized[weight_tensor.name] = numpy.asarray(qw).astype(weight_dtype) - input_tensors[weight_tensor.name] = act_tensor.name - pqs_value = ( - awq_lite[i] - .best_scale[:, np.newaxis] - .astype(onnx.helper.tensor_dtype_to_np_dtype(gemm_io_type)) - ).T - if has_cupy: - pqs_value = np.asnumpy(pqs_value) - pre_quant_scale[weight_tensor.name] = pqs_value + if enable_weight_clipping: + assert len(clip_alphas.keys()) == len(wa_pack) - # Change the input activation type to the expected type, fp16 by default - # TODO: cast input C for Gemm - _change_input_type(onnx_model.graph, act_tensor.name, gemm_io_type) + logger.info("AWQ scale search" + msg.strip(".") + f" took {time.time() - t} seconds") - logger.info(f"Quantizing actual weights took {time.time() - t} seconds") + if session is not None: + session = None + if has_cupy: + np.get_default_memory_pool().free_all_blocks() + del output_data + gc.collect() - # Fuse Mul nodes with parent node if possible - if fuse_nodes: - logger.info("Fusing pre-quant scale Mul nodes with parent node") + # Compute quantized weights and scales which are needed for DQ nodes t = time.time() - updated_nodes = set() - name_to_node_map = {node.name: node for node in onnx_model.graph.node} - initializer_map = { - initializer.name: initializer for initializer in onnx_model.graph.initializer - } - for parent, child_nodes in parent_child_nodes_map.items(): - if parent == "root_0": - continue - parent = name_to_node_map[parent] - if parent.name in updated_nodes: - continue - # When fuse_nodes or run_per_subgraph is True, - # scales computed for each child_nodes will be same. 
- # Hence, picking pre_quant_scale corresponding to any child_nodes is acceptable - input_scale = np.asarray(pre_quant_scale[child_nodes[0].input[1]]) - weight_tensor_names = [node.input[1] for node in child_nodes] - if ( - is_fusible_scaling_op(parent.op_type) - and not all(initializer_map.get(inp) is None for inp in parent.input) - and len(input_name_to_nodes[child_nodes[0].input[0]]) == len(child_nodes) - ): - for inp in parent.input: - if initializer_map.get(inp) is not None: - tensor = initializer_map[inp] - old_dim = tensor.dims - tensor_array = numpy_helper.to_array( - tensor, - base_dir=os.path.dirname(augmented_onnx_path), - ) - new_tensor = np.asarray(tensor_array) / input_scale - new_tensor = new_tensor.reshape(old_dim) - new_tensor = numpy_helper.from_array(new_tensor.get(), tensor.name) - # replace initializer with new scaled array - tensor.CopyFrom(new_tensor) - for w_name in weight_tensor_names: - del pre_quant_scale[w_name] - updated_nodes.add(parent.name) - else: - scale_tensor = onnx.helper.make_tensor( - name=parent.output[0] + "_pre_quant_scale", - data_type=onnx.helper.np_dtype_to_tensor_dtype(input_scale.dtype), - dims=input_scale.shape, - vals=(1.0 / input_scale).flatten().tolist(), - ) - mul_op_name = parent.output[0] + "_pre_quant_scale_out" - mul_node = onnx.helper.make_node( - "Mul", - inputs=[child_nodes[0].input[0], scale_tensor.name], - outputs=[mul_op_name], - name=child_nodes[0].input[0] + "_pre_quant_scale_mul", - ) - for node in child_nodes: - node.input[0] = mul_node.output[0] - for w_name in weight_tensor_names: - del pre_quant_scale[w_name] - onnx_model.graph.initializer.append(scale_tensor) - onnx_model.graph.node.append(mul_node) - - logger.info(f"Fusing pre-quant scale Mul nodes took {time.time() - t} seconds") + # Use a common mean scale for weights within a sub-graph + if fuse_nodes and not run_per_subgraph: + for wa_pack_idx_list in act_to_wa_pack_map.values(): + group_awq_scale = [ + awq_lite[wa_pack_idx].best_scale[:, np.newaxis] + for wa_pack_idx in wa_pack_idx_list + ] + mean_awq_scale = np.concatenate(group_awq_scale, axis=1) + mean_awq_scale = mean_awq_scale.mean(axis=1) + for wa_pack_idx in wa_pack_idx_list: + awq_lite[wa_pack_idx].best_scale = mean_awq_scale + + for i in tqdm(range(len(wa_pack)), desc="Quantizing the weights..."): + act_tensor, weight_tensor, do_transpose, gemm_io_type, _ = wa_pack[i] + gemm_io_type = cast("onnx.TensorProto.DataType", gemm_io_type) + + if force_fp16: + gemm_io_type = onnx.TensorProto.FLOAT16 + + w = numpy_helper.to_array( + weight_tensor, base_dir=os.path.dirname(augmented_onnx_path) + ).copy() + if do_transpose: + w = w.T + w = np.asarray(w) + + w_scaled = w * awq_lite[i].best_scale[:, np.newaxis] + alpha = clip_alphas.get(weight_tensor.name, 1) + assert enable_weight_clipping or (alpha == 1), ( + "clip range enabled without enabling weight-clipping param" + ) + # Updating the block size as for 8bit quantization, per-channel quantization is used. 
+ num_bits = get_num_bits(layer_info, weight_tensor.name) + block_size_updated = update_block_size( + block_size, layer_info, weight_tensor.name, w=w_scaled + ) + qw, scale, zp = quant_tensor( + w_scaled, + block_size_updated, + alpha=alpha, + use_zero_point=use_zero_point, + num_bits=num_bits, + ) - logger.info( - "Inserting DQ nodes and input_pre_quant_scale node using quantized weights and scales" - ) + assert use_zero_point is True or zp is None, ( + "zp is not according to use-zero-point setting" + ) + if do_transpose: + qw = qw.T + scale = scale.T + if zp is not None: + zp = zp.T + if has_cupy: + qw = np.asnumpy(qw) + scale = np.asnumpy(scale) + if zp is not None: + zp = np.asnumpy(zp) + scales[weight_tensor.name] = scale.astype( + onnx.helper.tensor_dtype_to_np_dtype(gemm_io_type) + ) + weight_dtype = numpy.int8 + if zp is not None: + zero_points[weight_tensor.name] = numpy.asarray(zp).astype(numpy.uint8) + weight_dtype = numpy.uint8 + gemm_weights_quantized[weight_tensor.name] = numpy.asarray(qw).astype(weight_dtype) + input_tensors[weight_tensor.name] = act_tensor.name + pqs_value = ( + awq_lite[i] + .best_scale[:, np.newaxis] + .astype(onnx.helper.tensor_dtype_to_np_dtype(gemm_io_type)) + ).T + if has_cupy: + pqs_value = np.asnumpy(pqs_value) + pre_quant_scale[weight_tensor.name] = pqs_value + + # Change the input activation type to the expected type, fp16 by default + # TODO: cast input C for Gemm + _change_input_type(onnx_model.graph, act_tensor.name, gemm_io_type) + + logger.info(f"Quantizing actual weights took {time.time() - t} seconds") + + # Fuse Mul nodes with parent node if possible + if fuse_nodes: + logger.info("Fusing pre-quant scale Mul nodes with parent node") + t = time.time() + updated_nodes = set() + name_to_node_map = {node.name: node for node in onnx_model.graph.node} + initializer_map = { + initializer.name: initializer for initializer in onnx_model.graph.initializer + } + for parent, child_nodes in parent_child_nodes_map.items(): + if parent == "root_0": + continue + parent = name_to_node_map[parent] + if parent.name in updated_nodes: + continue + # When fuse_nodes or run_per_subgraph is True, + # scales computed for each child_nodes will be same. 
+ # Hence, picking pre_quant_scale corresponding to any child_nodes is acceptable + input_scale = np.asarray(pre_quant_scale[child_nodes[0].input[1]]) + weight_tensor_names = [node.input[1] for node in child_nodes] + if ( + is_fusible_scaling_op(parent.op_type) + and not all(initializer_map.get(inp) is None for inp in parent.input) + and len(input_name_to_nodes[child_nodes[0].input[0]]) == len(child_nodes) + ): + for inp in parent.input: + if initializer_map.get(inp) is not None: + tensor = initializer_map[inp] + old_dim = tensor.dims + tensor_array = numpy_helper.to_array( + tensor, + base_dir=os.path.dirname(augmented_onnx_path), + ) + new_tensor = np.asarray(tensor_array) / input_scale + new_tensor = new_tensor.reshape(old_dim) + new_tensor = numpy_helper.from_array(new_tensor.get(), tensor.name) + # replace initializer with new scaled array + tensor.CopyFrom(new_tensor) + for w_name in weight_tensor_names: + del pre_quant_scale[w_name] + updated_nodes.add(parent.name) + else: + scale_tensor = onnx.helper.make_tensor( + name=parent.output[0] + "_pre_quant_scale", + data_type=onnx.helper.np_dtype_to_tensor_dtype(input_scale.dtype), + dims=input_scale.shape, + vals=(1.0 / input_scale).flatten().tolist(), + ) + mul_op_name = parent.output[0] + "_pre_quant_scale_out" + mul_node = onnx.helper.make_node( + "Mul", + inputs=[child_nodes[0].input[0], scale_tensor.name], + outputs=[mul_op_name], + name=child_nodes[0].input[0] + "_pre_quant_scale_mul", + ) + for node in child_nodes: + node.input[0] = mul_node.output[0] + for w_name in weight_tensor_names: + del pre_quant_scale[w_name] + onnx_model.graph.initializer.append(scale_tensor) + onnx_model.graph.node.append(mul_node) - graph_gs = gs.import_onnx(onnx_model) + logger.info(f"Fusing pre-quant scale Mul nodes took {time.time() - t} seconds") - gather_block_size = kwargs.get("gather_block_size", DEFAULT_GATHER_BLOCK_SIZE) - gather_quantize_axis = kwargs.get("gather_quantize_axis", DEFAULT_GATHER_QUANTIZE_AXIS) - gather_w_map = None - gather_s_map = None - gather_zp_map = None - if gather_quantize_axis is not None: - gather_w_map, gather_s_map, gather_zp_map = _quantize_gather_nodes( - graph_gs, - nodes_to_exclude, - use_zero_point=use_zero_point, - dq_only=True, - layer_info=layer_info, + logger.info( + "Inserting DQ nodes and input_pre_quant_scale node using quantized weights and scales" ) - t = time.time() - # Apply column-major optimization if flag is set - # Transposes the weights and scales in-place - use_column_major = kwargs.get("use_column_major", False) - if use_column_major: - qdq.apply_column_major_transformation(gemm_weights_quantized, scales) - dq_node_attributes = {"axis": 1, "block_size": block_size} - else: - dq_node_attributes = {"axis": 0, "block_size": block_size} - scales = reshape_scales_for_per_channel_nodes(scales, block_size, layer_info) - qdq.insert_dq_nodes( - graph_gs, - scales, - quantized_weights=gemm_weights_quantized, - attributes=dq_node_attributes, - zero_points=zero_points if use_zero_point else None, - layer_info=layer_info, - ) - # Add transpose nodes for column-major if needed - if use_column_major: - qdq.insert_transpose_nodes_for_column_major(graph_gs) - if gather_w_map is not None: - assert gather_s_map is not None, "scale-map not found for quantizable gather nodes" - assert not use_zero_point or gather_zp_map, ( - "zero-point setting and zero-point map not in sync for quantizable gather nodes" - ) - gather_dq_node_attributes = {"axis": gather_quantize_axis, "block_size": gather_block_size} + graph_gs = 
gs.import_onnx(onnx_model) + + gather_block_size = kwargs.get("gather_block_size", DEFAULT_GATHER_BLOCK_SIZE) + gather_quantize_axis = kwargs.get("gather_quantize_axis", DEFAULT_GATHER_QUANTIZE_AXIS) + gather_w_map = None + gather_s_map = None + gather_zp_map = None + if gather_quantize_axis is not None: + gather_w_map, gather_s_map, gather_zp_map = _quantize_gather_nodes( + graph_gs, + nodes_to_exclude, + use_zero_point=use_zero_point, + dq_only=True, + layer_info=layer_info, + ) + + t = time.time() + # Apply column-major optimization if flag is set + # Transposes the weights and scales in-place + use_column_major = kwargs.get("use_column_major", False) + if use_column_major: + qdq.apply_column_major_transformation(gemm_weights_quantized, scales) + dq_node_attributes = {"axis": 1, "block_size": block_size} + else: + dq_node_attributes = {"axis": 0, "block_size": block_size} + scales = reshape_scales_for_per_channel_nodes(scales, block_size, layer_info) qdq.insert_dq_nodes( graph_gs, - gather_s_map, - quantized_weights=gather_w_map, - attributes=gather_dq_node_attributes, - zero_points=gather_zp_map if use_zero_point else None, + scales, + quantized_weights=gemm_weights_quantized, + attributes=dq_node_attributes, + zero_points=zero_points if use_zero_point else None, layer_info=layer_info, ) - if pre_quant_scale: - qdq.insert_pre_quant_scale_nodes(graph_gs, input_tensors, pre_quant_scale) - - logger.info(f"Inserting nodes took {time.time() - t} seconds") + # Add transpose nodes for column-major if needed + if use_column_major: + qdq.insert_transpose_nodes_for_column_major(graph_gs) + if gather_w_map is not None: + assert gather_s_map is not None, "scale-map not found for quantizable gather nodes" + assert not use_zero_point or gather_zp_map, ( + "zero-point setting and zero-point map not in sync for quantizable gather nodes" + ) + gather_dq_node_attributes = { + "axis": gather_quantize_axis, + "block_size": gather_block_size, + } + qdq.insert_dq_nodes( + graph_gs, + gather_s_map, + quantized_weights=gather_w_map, + attributes=gather_dq_node_attributes, + zero_points=gather_zp_map if use_zero_point else None, + layer_info=layer_info, + ) + if pre_quant_scale: + qdq.insert_pre_quant_scale_nodes(graph_gs, input_tensors, pre_quant_scale) - logger.info("Exporting the quantized graph") - t = time.time() - model = gs.export_onnx(graph_gs) - # Set ir_version to 10, remove it once ORT supports ir_version 11 - model.ir_version = 10 - logger.info(f"Exporting took {time.time() - t} seconds") + logger.info(f"Inserting nodes took {time.time() - t} seconds") - try: - os.remove(augmented_onnx_path) - if use_external_data_format: - os.remove(augmented_onnx_path + "_data") - except OSError: - logger.error("Augmented ONNX model or external data file was not found") + logger.info("Exporting the quantized graph") + t = time.time() + model = gs.export_onnx(graph_gs) + # Set ir_version to 10, remove it once ORT supports ir_version 11 + model.ir_version = 10 + logger.info(f"Exporting took {time.time() - t} seconds") + finally: + if session is not None: + session = None + gc.collect() + _remove_augmented_onnx(augmented_onnx_path, use_external_data_format) return model diff --git a/modelopt/onnx/quantization/ort_utils.py b/modelopt/onnx/quantization/ort_utils.py index 0ea465487a..2c5a0b7d2d 100755 --- a/modelopt/onnx/quantization/ort_utils.py +++ b/modelopt/onnx/quantization/ort_utils.py @@ -18,6 +18,7 @@ import glob import io import os +import pathlib import platform import re import shutil @@ -25,6 +26,7 @@ 
import sys from collections.abc import Sequence from contextlib import redirect_stderr, redirect_stdout +from importlib.metadata import PackageNotFoundError, distribution import onnxruntime as ort from onnxruntime.quantization.operators.qdq_base_operator import QDQOperatorBase @@ -126,6 +128,78 @@ def _check_for_tensorrt(min_version: str = "10.0"): ) +def _find_cudnn_bin_dir(): + """Locate the nvidia cudnn bin directory inside site-packages.""" + for pkg_name in ("nvidia-cudnn-cu12", "nvidia-cudnn-cu13"): + try: + dist = distribution(pkg_name) + except PackageNotFoundError: + continue + for f in dist.files or []: + if f.name.startswith("cudnn64_") and f.name.endswith(".dll"): + bin_dir = str(pathlib.Path(f.locate()).parent) + if os.path.isdir(bin_dir): + return bin_dir + return None + + +def _load_extra_cudnn_dlls(): + """Load any cuDNN DLLs from site-packages that ORT's preload_dlls() missed. + + TEMPORARY WORKAROUND: This function exists because ort.preload_dlls() has a + hardcoded list of cuDNN sub-libraries which may be incomplete for newer cuDNN + versions (e.g. cuDNN 9.21 added cudnn_engines_tensor_ir64_9.dll, cuDNN 9.20 + added cudnn_cnn64_9.dll). Once ort.preload_dlls() is fixed upstream to + dynamically discover all cuDNN DLLs, this function and its helper + (_find_cudnn_bin_dir) should be removed. + + This scans the nvidia-cudnn bin directory and loads any cudnn*.dll not already + loaded in the process. + """ + import ctypes + import ctypes.wintypes + + cudnn_bin_dir = _find_cudnn_bin_dir() + if not cudnn_bin_dir: + logger.debug( + "nvidia-cudnn bin directory not found in site-packages, skipping extra DLL load" + ) + return + + dll_files = sorted(glob.glob(os.path.join(cudnn_bin_dir, "cudnn*.dll"))) + if not dll_files: + logger.debug("No cudnn*.dll files found in %s", cudnn_bin_dir) + return + + get_module_handle_w = ctypes.windll.kernel32.GetModuleHandleW # type: ignore[attr-defined] + get_module_handle_w.argtypes = [ctypes.wintypes.LPCWSTR] + get_module_handle_w.restype = ctypes.wintypes.HMODULE + + loaded = [] + skipped = [] + failed = [] + for dll_path in dll_files: + dll_name = os.path.basename(dll_path) + if get_module_handle_w(dll_name): + skipped.append(dll_name) + continue + try: + ctypes.CDLL(dll_path) + loaded.append(dll_name) + except OSError as e: + failed.append(dll_name) + logger.warning(f"Failed to load {dll_name} from site-packages: {e}") + + if skipped: + logger.debug(f"Already loaded (skipped): {skipped}") + if loaded: + logger.info( + f"Loaded {len(loaded)} extra cuDNN DLLs that ort.preload_dlls() missed: {loaded}" + ) + if failed: + logger.warning(f"Failed to load {len(failed)} cuDNN DLLs: {failed}") + + def _check_for_libcudnn(): # TODO: handle multiple calls to this function logger.info("Checking for cuDNN library") @@ -150,10 +224,6 @@ def _check_for_libcudnn(): f"cuDNN not found in {env_variable}. " "Attempting onnxruntime.preload_dlls() to load from site-packages..." ) - # preload_dlls() does not raise on failure — it silently prints - # "Failed to load ..." messages. Capture its output and check - # whether the key cuDNN DLL actually loaded. 
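For reference, the discovery step `_find_cudnn_bin_dir` performs above can be reproduced standalone. This is a minimal sketch, not the shipped helper: the wheel name is one of the two names the helper already probes, and the snippet simply reports when no cuDNN wheel is installed.

```python
import pathlib
from importlib.metadata import PackageNotFoundError, distribution


def cudnn_bin_dir(pkg: str = "nvidia-cudnn-cu12") -> pathlib.Path | None:
    """Return the directory holding cudnn64_*.dll from the installed wheel, if any."""
    try:
        dist = distribution(pkg)
    except PackageNotFoundError:
        return None
    for f in dist.files or []:
        # dist.files lists every file the wheel installed; locate() resolves it on disk
        if f.name.startswith("cudnn64_") and f.name.endswith(".dll"):
            return pathlib.Path(f.locate()).parent
    return None


print(cudnn_bin_dir() or "cuDNN wheel not installed")
```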
- cudnn_dll = "cudnn" if platform.system() == "Windows" else "libcudnn_adv" captured = io.StringIO() try: with redirect_stdout(captured), redirect_stderr(captured): @@ -163,14 +233,17 @@ def _check_for_libcudnn(): preload_output = captured.getvalue() if preload_output: - logger.debug(f"preload_dlls() output:\n{preload_output}") + logger.warning(f"preload_dlls() output:\n{preload_output}") - if f"Failed to load {cudnn_dll}" in preload_output: + core_cudnn_dll = "cudnn64_9" if platform.system() == "Windows" else "libcudnn_adv" + if f"Failed to load {core_cudnn_dll}" in preload_output: logger.error( - f"onnxruntime.preload_dlls() was called but {cudnn_dll} failed to load. " + f"onnxruntime.preload_dlls() was called but {core_cudnn_dll} failed to load. " "cuDNN DLLs were NOT successfully loaded from site-packages." ) else: + if platform.system() == "Windows": + _load_extra_cudnn_dlls() logger.info( "onnxruntime.preload_dlls() succeeded — CUDA/cuDNN DLLs loaded" " from site-packages. Verify version compatibility at" diff --git a/modelopt/onnx/quantization/qdq_utils.py b/modelopt/onnx/quantization/qdq_utils.py index 0cb1a45f68..265bcf36b2 100644 --- a/modelopt/onnx/quantization/qdq_utils.py +++ b/modelopt/onnx/quantization/qdq_utils.py @@ -1011,14 +1011,37 @@ def replace_zero_scale_with_smallest_nonzero(onnx_model: onnx.ModelProto) -> onn """Replace zero scale values with smallest nonzero fp16 value in the ONNX model.""" graph = onnx_model.graph fp16_smallest_nonzero = np.float16(6e-08) - scale_nodes = [node.input[1] for node in graph.node if node.op_type == "QuantizeLinear"] + qdq_op_types = { + "QuantizeLinear", + "DequantizeLinear", + "TRT_INT4QuantizeLinear", + "TRT_INT4DequantizeLinear", + } + scale_tensor_names = { + node.input[1] + for node in graph.node + if node.op_type in qdq_op_types and len(node.input) >= 2 + } + # Scales stored as graph initializers (e.g. INT4_AWQ / TRT_INT4DequantizeLinear exports). + for init in graph.initializer: + if init.name in scale_tensor_names: + tensor = numpy_helper.to_array(init) + if tensor.dtype.kind == "f": + new_tensor = np.where(tensor == 0, fp16_smallest_nonzero, tensor).astype( + tensor.dtype + ) + init.CopyFrom(numpy_helper.from_array(new_tensor, init.name)) + # Scales emitted by Constant nodes (legacy QDQ export path). 
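Before the Constant-node pass below, a quick illustration of why zero scales must be patched at all: a consumer dequantizes as `scale * (q - zp)`, so a zero scale wipes out the entire quantized block, and the quantize direction divides by zero. The numbers here are toy values, not taken from any real checkpoint.

```python
import numpy as np

q = np.array([-8, 3, 7], dtype=np.int8)   # a quantized weight block
zero_scale = np.float16(0.0)
patched_scale = np.float16(6e-08)         # smallest nonzero fp16 value, as used above

print(zero_scale * q)      # [0. 0. 0.] -> every weight in the block collapses to zero
print(patched_scale * q)   # small but nonzero values survive dequantization
```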
for node in graph.node: - if node.op_type == "Constant" and node.output[0] in scale_nodes: + if node.op_type == "Constant" and node.output[0] in scale_tensor_names: for attr in node.attribute: if attr.name == "value": tensor = numpy_helper.to_array(attr.t) - new_tensor = np.where(tensor == 0, fp16_smallest_nonzero, tensor) - attr.t.CopyFrom(numpy_helper.from_array(new_tensor, attr.t.name)) + if tensor.dtype.kind == "f": + new_tensor = np.where(tensor == 0, fp16_smallest_nonzero, tensor).astype( + tensor.dtype + ) + attr.t.CopyFrom(numpy_helper.from_array(new_tensor, attr.t.name)) return onnx_model diff --git a/modelopt/torch/distill/plugins/megatron.py b/modelopt/torch/distill/plugins/megatron.py index dbfad6fb6b..9a98eee9c7 100644 --- a/modelopt/torch/distill/plugins/megatron.py +++ b/modelopt/torch/distill/plugins/megatron.py @@ -163,7 +163,7 @@ def setup_distillation_config( def _adjust_layer_index_for_pp(submodule_name, model_cfg): """Adjust any sequence-based layer indices found in a submodule name for Pipeline Parallelism.""" - match = re.search(r"(?<=\.)\d+(?=\.)", submodule_name) + match = re.search(r"(?<=\.)\d+(?=\.|$)", submodule_name) if not match: return submodule_name @@ -172,7 +172,7 @@ def _adjust_layer_index_for_pp(submodule_name, model_cfg): if new_layer_idx < 0: raise ValueError(f"Layer {submodule_name} does not fall on final PP rank.") - new_submodule_name = submodule_name.replace(match.group(0), str(new_layer_idx)) + new_submodule_name = submodule_name.replace(f".{match.group(0)}", f".{new_layer_idx}") if parallel_state.get_tensor_and_context_parallel_rank() == 0: logger.info( f'Distillation: Renamed layer "{submodule_name}" on final PP rank to "{new_submodule_name}"' diff --git a/modelopt/torch/export/plugins/megatron_importer.py b/modelopt/torch/export/plugins/megatron_importer.py index b1d37c1ad9..e485731b3d 100644 --- a/modelopt/torch/export/plugins/megatron_importer.py +++ b/modelopt/torch/export/plugins/megatron_importer.py @@ -39,6 +39,7 @@ has_mcore = False with import_plugin("megatron"): from megatron.core.parallel_state import ( + get_expert_model_parallel_rank, get_expert_tensor_parallel_world_size, get_tensor_model_parallel_world_size, ) @@ -294,9 +295,13 @@ def _grouped_mlp_merging( assert module.num_gemms == num_local_experts, ( "num_gemms must be equal to num_local_experts in TEGroupedMLP" ) - for expert_id in range(init_expert_id, init_expert_id + num_local_experts): - tensor = self._get_safetensor(prefix.format(expert_id) + ".weight") - state_dict[f"weight{expert_id}"] = tensor + # init_expert_id is the global index of this rank's first local expert. + # TEGroupedMLP stores weights as weight0..weight{num_local-1} locally, so we + # map global expert_id -> local slot (expert_id - init_expert_id). 
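The arithmetic behind this comment is simple but easy to invert, so here is a throwaway illustration of how a global expert index maps to a TEGroupedMLP local slot for one expert-parallel rank; the loop that follows applies exactly this mapping. The rank, expert counts, and key names are made-up examples.

```python
num_local_experts = 4   # experts owned by each EP rank
ep_rank = 2             # this rank's expert-model-parallel rank

init_expert_id = ep_rank * num_local_experts          # first global expert on this rank -> 8
for local_id in range(num_local_experts):
    global_expert_id = init_expert_id + local_id      # 8, 9, 10, 11
    # checkpoint key uses the global id, the local module slot uses local_id
    print(f"experts.{global_expert_id}.weight  ->  weight{local_id}")
```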
+ for local_id in range(num_local_experts): + global_expert_id = init_expert_id + local_id + tensor = self._get_safetensor(prefix.format(global_expert_id) + ".weight") + state_dict[f"weight{local_id}"] = tensor # TODO handle weight_scale module.load_state_dict(state_dict) @@ -653,10 +658,13 @@ def _import_transformer_layer(self, layer, layer_id, layer_pbar, is_mtp: bool = layer_pbar.set_description("Importing MoE grouped local experts") num_local_experts = experts.num_local_experts num_global_experts = experts.config.num_moe_experts - assert num_local_experts == num_global_experts, ( - "num_local_experts must be equal to num_global_experts during MoE import" + assert num_global_experts % num_local_experts == 0, ( + "num_global_experts must be divisible by num_local_experts " + "during MoE import" ) - init_index = 0 + # Each EP rank owns a contiguous slice of global experts: + # [ep_rank * num_local_experts, (ep_rank + 1) * num_local_experts). + init_index = get_expert_model_parallel_rank() * num_local_experts self.rules["experts.linear_fc1"]( experts.linear_fc1, diff --git a/modelopt/torch/export/plugins/vllm_fakequant_hf.py b/modelopt/torch/export/plugins/vllm_fakequant_hf.py index 42baad912b..ad0b88f2f7 100644 --- a/modelopt/torch/export/plugins/vllm_fakequant_hf.py +++ b/modelopt/torch/export/plugins/vllm_fakequant_hf.py @@ -47,14 +47,18 @@ "merge_amax_tensors_for_group", ] -# Matches ``…weight_quantizer``, ``…weight_quantizer.0``, ``…w13_weight_quantizer.0``, etc. -_WEIGHT_QUANTIZER_STATE_KEY = re.compile(r"(?:^|\.)(?:\w+_)?weight_quantizer(?:\.\d+)*$") +# Matches ``…weight_quantizer``, ``…weight_quantizer.0``, ``…w13_weight_quantizer.0``, +# and the plural fused-experts form ``…weight_quantizers.0`` (per-expert ModuleList). +_WEIGHT_QUANTIZER_STATE_KEY = re.compile(r"(?:^|\.)(?:\w+_)?weight_quantizers?(?:\.\d+)*$") def is_weight_quantizer_state_key(key: str) -> bool: - """Return True for weight-quantizer state keys, including SequentialQuantizer entries. + """Return True for weight-quantizer state keys. - Matches ``weight_quantizer``, ``w13_weight_quantizer``, ``weight_quantizer.0``, etc. + Includes ``SequentialQuantizer`` entries and fused-experts ``ModuleList`` + entries (``*_weight_quantizers.``). Matches ``weight_quantizer``, + ``w13_weight_quantizer``, ``weight_quantizer.0``, + ``gate_up_proj_weight_quantizers.0``, etc. """ return bool(_WEIGHT_QUANTIZER_STATE_KEY.search(key)) @@ -142,6 +146,56 @@ def disable_rotate(quantizer: TensorQuantizer): return False +def _fakequant_fused_experts_weights( + module: nn.Module, + module_name: str, + state_dict: dict | None, + fakequant_weights: set, + inplace: bool, +): + """Apply per-expert fake-quant to a ``_QuantFusedExperts`` module's 3-D weights. + + The base loop in :func:`_fakequant_module_weights` only handles singular + ``*_weight_quantizer`` attrs (one TensorQuantizer per weight). Fused-experts + modules expose ``*_weight_quantizers`` (``nn.ModuleList`` with one entry per + expert) that the base loop skips, leaving the fused 3-D weight unquantized + in the export and breaking weight-fold round-trips. 
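Stepping back to the `_adjust_layer_index_for_pp` fix in `modelopt/torch/distill/plugins/megatron.py` earlier in this diff: the two one-line edits address concrete failure modes, which a quick REPL check makes visible. The layer names below are illustrative only.

```python
import re

name = "decoder.layers.11"                      # index at the end of the name
print(re.search(r"(?<=\.)\d+(?=\.)", name))     # None -> old pattern misses trailing indices
print(re.search(r"(?<=\.)\d+(?=\.|$)", name))   # matches "11"

name = "decoder.layers.1.mlp.linear_fc1"        # the digit reappears inside "fc1"
print(name.replace("1", "0"))                   # decoder.layers.0.mlp.linear_fc0  (corrupted)
print(name.replace(".1", ".0"))                 # decoder.layers.0.mlp.linear_fc1  (intended)
```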
+ """ + for w_attr, q_attr in ( + ("gate_up_proj", "gate_up_proj_weight_quantizers"), + ("down_proj", "down_proj_weight_quantizers"), + ): + quantizers = getattr(module, q_attr, None) + if not isinstance(quantizers, nn.ModuleList): + continue + if not any( + isinstance(q, TensorQuantizer) and q.fake_quant and q.is_enabled for q in quantizers + ): + continue + sd_key = f"{module_name}.{w_attr}" if module_name else w_attr + if sd_key in fakequant_weights: + raise RuntimeError(f"Weight {sd_key} has already been fakequantized") + + if inplace: + w = getattr(module, w_attr) + for idx, q in enumerate(quantizers): + if not (isinstance(q, TensorQuantizer) and q.fake_quant and q.is_enabled): + continue + slice_ = w.data[idx] + slice_.copy_(q(slice_.float()).to(w.dtype)) + else: + if state_dict is None or sd_key not in state_dict: + continue + w_3d = state_dict[sd_key].clone() + for idx, q in enumerate(quantizers): + if not (isinstance(q, TensorQuantizer) and q.fake_quant and q.is_enabled): + continue + slice_ = w_3d[idx] + w_3d[idx] = q(slice_.float()).to(slice_.dtype) + state_dict[sd_key] = w_3d.cpu() + fakequant_weights.add(sd_key) + + def _fakequant_module_weights( module: nn.Module, module_name: str, @@ -159,6 +213,7 @@ def _fakequant_module_weights( """ if not isinstance(module, QuantModule): return + _fakequant_fused_experts_weights(module, module_name, state_dict, fakequant_weights, inplace) for attr_name, quantizer in module.named_children(): if not ( attr_name.endswith("weight_quantizer") diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index 4ceb51cd2c..76f304a478 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -42,6 +42,7 @@ QuantizerAttrNames, quantizer_attr_names, reduce_block_amax, + representative_weight_quantizer, weight_attr_names, ) from modelopt.torch.utils import clear_cuda_cache @@ -546,7 +547,7 @@ def _compute_kv_cache_dtype( def get_weight_block_size(module: nn.Module, weight_name: str = "weight") -> int: """Returns the weight block size.""" - weight_quantizer = getattr(module, quantizer_attr_names(weight_name).weight_quantizer, None) + weight_quantizer = representative_weight_quantizer(module, weight_name) if weight_quantizer is None: return 0 @@ -572,7 +573,11 @@ def get_quantization_format(module) -> str | None: """ def _get_quantization_from_layer(layer, quantizer_attr_names: QuantizerAttrNames): - weight_quantizer = getattr(layer, quantizer_attr_names.weight_quantizer, None) + # Singular form first, plural ModuleList fallback (fused-experts). + # Strip the "_weight_quantizer" suffix to recover the weight attr name. 
+ weight_attr = quantizer_attr_names.weight_quantizer + weight_name = weight_attr[: -len("_weight_quantizer")].rstrip("_") or "weight" + weight_quantizer = representative_weight_quantizer(layer, weight_name) input_quantizer = getattr(layer, quantizer_attr_names.input_quantizer, None) if weight_quantizer is None or not weight_quantizer.is_enabled: diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index af936a3002..ed6ed2fcf2 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -88,6 +88,7 @@ QUANTIZATION_W4A8_NVFP4_FP8, ) from .model_utils import get_language_model_from_vl, is_multimodal_model +from .moe_utils import _export_fused_experts from .plugins import SpeculativeDecodingExporter, has_spec_opt from .quant_utils import ( fuse_prequant_layernorm, @@ -642,11 +643,20 @@ def _process_quantized_modules( if is_modelopt_qlora and (hasattr(sub_module, "base_layer")): continue + # Preprocessing: restore unpacked weight so the export path can read + # the live quantizer state. Falls through to the export branches below. if hasattr(sub_module, "weight_packed") or ( "QuantFP8Linear" in type(sub_module).__name__ and sub_module.weight.element_size() <= 1 ): sub_module.unpack_weight() - if get_quantization_format(sub_module) != QUANTIZATION_NONE: + + if hasattr(sub_module, "gate_up_proj_weight_quantizers"): + # _QuantFusedExperts uses plural `gate_up_proj_weight_quantizers` (ModuleList), + # which get_quantization_format's singular-weight_quantizer check misses. Handle + # it explicitly before the format gate so fused-experts get split + quantized. + with fsdp2_aware_weight_update(model, sub_module, reshard=False): + _export_fused_experts(sub_module, dtype) + elif get_quantization_format(sub_module) != QUANTIZATION_NONE: # Skip QuantMoELinear - it's handled separately in _reconstruct_fused_moe_linear if type(sub_module).__name__ == "QuantMoELinear": continue @@ -677,13 +687,6 @@ def _process_quantized_modules( with fsdp2_aware_weight_update(model, sub_module, reshard=False): for weight_name in ["gate_up_proj", "down_proj"]: _export_quantized_weight(sub_module, dtype, weight_name) - elif hasattr(sub_module, "gate_up_proj_weight_quantizers"): - # Generic fused MoE experts (_QuantFusedExperts) with per-expert - # quantizer ModuleLists. Split into per-expert modules and export. - from modelopt.torch.export.moe_utils import _export_fused_experts - - with fsdp2_aware_weight_update(model, sub_module, reshard=False): - _export_fused_experts(sub_module, dtype) def _export_transformers_checkpoint( @@ -1186,12 +1189,25 @@ def export_hf_checkpoint( try: post_state_dict, hf_quant_config = _export_transformers_checkpoint(model, dtype) - if hf_quant_config is not None: + # Only treat the export as quantized when at least one quant_algo field is set. + # get_quant_config always returns a dict (even for sparsity-only or unmodified models), + # so emitting hf_quant_config.json unconditionally produces a file with + # "quant_algo": null that downstream loaders (e.g. TensorRT-LLM) reject as a + # malformed pre-quantized checkpoint. 
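The gating that follows is easiest to sanity-check on the shape of the config dict implied above (a top-level `quantization` key carrying `quant_algo` / `kv_cache_quant_algo`). The algorithm strings here are placeholders, not a claim about the exact values the exporter emits.

```python
def is_quantized_export(hf_quant_config: dict | None) -> bool:
    details = (hf_quant_config or {}).get("quantization", {})
    return (
        details.get("quant_algo") is not None
        or details.get("kv_cache_quant_algo") is not None
    )


print(is_quantized_export(None))                                              # False
print(is_quantized_export({"quantization": {"quant_algo": None}}))            # False -> no hf_quant_config.json
print(is_quantized_export({"quantization": {"quant_algo": "NVFP4"}}))         # True  -> file is written
print(is_quantized_export({"quantization": {"kv_cache_quant_algo": "FP8"}}))  # True
```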
+ quantization_details = (hf_quant_config or {}).get("quantization", {}) + is_quantized_export = ( + quantization_details.get("quant_algo") is not None + or quantization_details.get("kv_cache_quant_algo") is not None + ) + + if is_quantized_export: # Save hf_quant_config.json for backward compatibility with open(f"{export_dir}/hf_quant_config.json", "w") as file: json.dump(hf_quant_config, file, indent=4) hf_quant_config = convert_hf_quant_config_format(hf_quant_config) + else: + hf_quant_config = None # Remove hf_quantizer from model so post_state_dict can be exported. if getattr(model, "hf_quantizer", None) is not None: diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py index 62053e549c..24983a2489 100644 --- a/modelopt/torch/export/unified_export_megatron.py +++ b/modelopt/torch/export/unified_export_megatron.py @@ -72,6 +72,11 @@ with import_plugin("megatron"): from megatron.core.models.gpt import GPTModel from megatron.core.models.mamba import MambaModel + + try: + from megatron.core.models.hybrid.hybrid_model import HybridModel + except ImportError: + HybridModel = MambaModel from megatron.core.models.multimodal.llava_model import LLaVAModel from megatron.core.parallel_state import ( get_pipeline_model_parallel_rank, @@ -121,7 +126,7 @@ def __init__( moe_router_dtype: str | None = None, ): """Create a GPTModel exporter instance.""" - if not isinstance(model, (GPTModel, MambaModel, LLaVAModel)): + if not isinstance(model, (GPTModel, MambaModel, HybridModel, LLaVAModel)): raise ValueError("Input to GPTModelExport must be a megatron.core.models.GPTModel!") self._state_dict = OrderedDict() diff --git a/modelopt/torch/puzzletron/anymodel/model_descriptor/model_descriptor_factory.py b/modelopt/torch/puzzletron/anymodel/model_descriptor/model_descriptor_factory.py index 74aaf311bf..cff972a51e 100644 --- a/modelopt/torch/puzzletron/anymodel/model_descriptor/model_descriptor_factory.py +++ b/modelopt/torch/puzzletron/anymodel/model_descriptor/model_descriptor_factory.py @@ -33,7 +33,7 @@ "qwen3": "qwen3", "nemotron_h": "nemotron_h", "nemotron_h_v2": "nemotron_h_v2", - "gpt_oss_20b": "gpt_oss_20b", + "gpt_oss": "gpt_oss", } diff --git a/modelopt/torch/puzzletron/replacement_library/build_replacement_library.py b/modelopt/torch/puzzletron/replacement_library/build_replacement_library.py index b5d0c754f1..999ec6c690 100644 --- a/modelopt/torch/puzzletron/replacement_library/build_replacement_library.py +++ b/modelopt/torch/puzzletron/replacement_library/build_replacement_library.py @@ -509,13 +509,18 @@ def _build_layer_replacements_from_block_library(block_library_df: pd.DataFrame) weight_paths = [] for subblock_name in ["attention", "ffn"]: checkpoint_dir = row[f"{subblock_name}_checkpoint_dir"] - if checkpoint_dir is not None: - subblock_path = ( - Path(checkpoint_dir) - / SAFETENSORS_SUBBLOCKS_DIR_NAME - / f"block_{block_idx}_{subblock_name}.safetensors" - ) - weight_paths.append(subblock_path) + # pandas represents missing cells as float NaN (e.g. for no-op subblocks), + # so check for both None and NaN before constructing a Path. 
+ if checkpoint_dir is None or ( + isinstance(checkpoint_dir, float) and pd.isna(checkpoint_dir) + ): + continue + subblock_path = ( + Path(checkpoint_dir) + / SAFETENSORS_SUBBLOCKS_DIR_NAME + / f"block_{block_idx}_{subblock_name}.safetensors" + ) + weight_paths.append(subblock_path) weight_paths = sorted(set(weight_paths)) layer_replacement = { "parent_layer_indices": [block_idx], diff --git a/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py b/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py index 69b8e5e29d..1240d1c9b6 100644 --- a/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py +++ b/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py @@ -29,12 +29,14 @@ from typing import TYPE_CHECKING, Any, BinaryIO import torch +import torch.distributed as tdist import transformers from safetensors.torch import save_file as safe_save_file from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel from transformers.dynamic_module_utils import get_class_from_dynamic_module from transformers.utils import SAFE_WEIGHTS_INDEX_NAME +import modelopt.torch.utils.distributed as dist_utils from modelopt.torch.utils import json_dumps from ..block_config import maybe_cast_block_configs @@ -51,6 +53,7 @@ "load_model_config", "init_model_from_config", "save_checkpoint", + "save_checkpoint_from_shards", "save_subblocks", "save_model_config", ] @@ -200,6 +203,52 @@ def save_checkpoint( _save_checkpoint(model.config, model.state_dict(), checkpoint_dir, descriptor) +def save_checkpoint_from_shards( + model: PreTrainedModel, checkpoint_dir: Path | str, descriptor: "ModelDescriptor" +) -> None: + """ + Save a checkpoint when the model's weights are sharded across distributed ranks. + + Gathers each rank's partial state dictionary onto rank 0 and writes a complete checkpoint + (including the safetensors index and subblocks) from the merged weights. On a single-process + run, saves directly from the local state dict. Only rank 0 performs the filesystem write; + non-master ranks only participate in the gather. + + Parameters: + model (PreTrainedModel): The model instance whose local state_dict contains this rank's + shard of weights. + checkpoint_dir (Path | str): Destination directory for the checkpoint files. + descriptor (ModelDescriptor): Descriptor used to partition weights into subblocks and build + the safetensors index. 
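As a usage sketch, mirroring the call made in the new test added later in this diff: the model object, launch command, and output path below are placeholders, and each rank is assumed to already hold only its shard of the weights (e.g. via `load_and_shard_model`).

```python
# torchrun --nproc_per_node=2 save_sharded.py
from modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor import (
    LlamaModelDescriptor,
)
from modelopt.torch.puzzletron.tools.checkpoint_utils_hf import save_checkpoint_from_shards

model = ...  # this rank's sharded PreTrainedModel (placeholder)
save_checkpoint_from_shards(model, "out/checkpoint", LlamaModelDescriptor)
# rank 0 gathers all shards and writes config.json, the safetensors index and subblocks;
# the other ranks only participate in the gather and then wait at the barrier.
```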
+ """ + + local_sd = {k: v.cpu() for k, v in model.state_dict().items()} + if dist_utils.size() > 1: + save_err: str | None = None + if dist_utils.is_master(): + gathered: list[dict] = [None] * dist_utils.size() + tdist.gather_object(local_sd, gathered, dst=0) + full_sd: dict[str, torch.Tensor] = {} + for shard_sd in gathered: + if shard_sd is None: + continue + full_sd.update(shard_sd) + try: + _save_checkpoint(model.config, full_sd, checkpoint_dir, descriptor) + except Exception as e: + save_err = repr(e) + else: + tdist.gather_object(local_sd, dst=0) + err_box = [save_err] + tdist.broadcast_object_list(err_box, src=0) + # Barrier ensures all ranks wait until file I/O completes before continuing + dist_utils.barrier() + if err_box[0] is not None: + raise RuntimeError(f"Checkpoint save failed on rank 0: {err_box[0]}") + else: + _save_checkpoint(model.config, local_sd, checkpoint_dir, descriptor) + + def _save_checkpoint( model_config: PretrainedConfig, state_dict: dict[str, torch.Tensor], diff --git a/modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py b/modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py index d8471aee23..a46fba52d0 100644 --- a/modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py +++ b/modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py @@ -41,7 +41,7 @@ from ..utils.validate_runtime_pipeline import perform_pipeline_stitches from . import validate_model from .checkpoint_utils import copy_tokenizer -from .checkpoint_utils_hf import save_checkpoint +from .checkpoint_utils_hf import save_checkpoint_from_shards from .common import resolve_torch_dtype from .sharded_checkpoint_utils import load_and_shard_model from .validation_utils import ( @@ -189,7 +189,7 @@ def validate_puzzle_solutions(args: DictConfig) -> None: # TODO: Loo into internal Puzzleron code to see how to save as symlinks # save_checkpoint_as_symlinks is currently not supported pass - save_checkpoint(model, checkpoint_dir, descriptor) + save_checkpoint_from_shards(model, checkpoint_dir, descriptor) copy_tokenizer( args.tokenizer_name, diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 3f24ac09a4..794a669337 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -236,10 +236,18 @@ def find_quant_cfg_entry_by_path( _mamba_moe_disabled_quantizer_cfg: list[QuantizerCfgEntry] = [ {"quantizer_name": "*fc1_latent_proj*", "enable": False}, # Skip Latent MOE {"quantizer_name": "*fc2_latent_proj*", "enable": False}, # Skip Latent MOE - {"quantizer_name": "*q_proj*", "enable": False}, # Skip QKV Linear - {"quantizer_name": "*k_proj*", "enable": False}, # Skip QKV Linear - {"quantizer_name": "*v_proj*", "enable": False}, # Skip QKV Linear - {"quantizer_name": "*o_proj*", "enable": False}, # Skip QKV Output Projection + {"quantizer_name": "*q_proj*", "enable": False}, # Skip QKV Linear (HF naming) + {"quantizer_name": "*k_proj*", "enable": False}, # Skip QKV Linear (HF naming) + {"quantizer_name": "*v_proj*", "enable": False}, # Skip QKV Linear (HF naming) + {"quantizer_name": "*o_proj*", "enable": False}, # Skip QKV Output Projection (HF naming) + { + "quantizer_name": "*self_attention.linear_qkv*", + "enable": False, + }, # Skip QKV Linear (Mcore naming) + { + "quantizer_name": "*self_attention.linear_proj*", + "enable": False, + }, # Skip QKV Output Projection (Mcore naming) ] INT8_DEFAULT_CFG = { diff --git 
a/modelopt/torch/quantization/conversion.py b/modelopt/torch/quantization/conversion.py index 55f7fdf6fc..3f97f8380b 100644 --- a/modelopt/torch/quantization/conversion.py +++ b/modelopt/torch/quantization/conversion.py @@ -16,6 +16,7 @@ """Quantization conversion/restore utilities.""" import fnmatch +import re import warnings from collections.abc import Callable from contextlib import contextmanager @@ -286,6 +287,33 @@ def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType set_quantizer_attributes_full(quant_model, quantizer_name, attributes, parent_class) +_FUSED_EXPERTS_QUANTIZER_LIST_RE = re.compile( + r"(weight_quantizers?|input_quantizers?)\.\d+(?=$|\.)" +) + + +def _normalize_fused_experts_quantizer_name(name: str) -> str: + """Strip the per-expert index from per-expert quantizer ModuleList names. + + Fused-experts modules register per-expert weight/input quantizers in a + ``nn.ModuleList``; its children surface as dotted names like + ``...gate_up_proj_weight_quantizers.0`` (plural) or — if a variant uses + singular naming — ``...gate_up_proj_weight_quantizer.0``. Neither matches + the singular-suffix wildcards (``*weight_quantizer``) used in the stock + configs, so the experts stay at their defaults. + + Return a normalized name where either ``weight_quantizer[s]?.N`` or + ``input_quantizer[s]?.N`` collapses to the singular form without the index + so the standard wildcards match. + """ + + def _repl(m: re.Match) -> str: + base = m.group(1) + return base.removesuffix("s") + + return _FUSED_EXPERTS_QUANTIZER_LIST_RE.sub(_repl, name) + + def _match_quantizer( wildcard_or_filter_func: str | Callable, name: str, @@ -296,7 +324,11 @@ def _match_quantizer( if not isinstance(module, (TensorQuantizer, SequentialQuantizer)): return False if isinstance(wildcard_or_filter_func, str): - if not fnmatch.fnmatch(name, wildcard_or_filter_func): + normalized = _normalize_fused_experts_quantizer_name(name) + if not ( + fnmatch.fnmatch(name, wildcard_or_filter_func) + or (normalized != name and fnmatch.fnmatch(normalized, wildcard_or_filter_func)) + ): return False elif callable(wildcard_or_filter_func): if not wildcard_or_filter_func(name): diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py index 9b1cc5bc0c..0aec4411e0 100644 --- a/modelopt/torch/quantization/model_calib.py +++ b/modelopt/torch/quantization/model_calib.py @@ -1270,11 +1270,30 @@ def postprocess(module, name): for name, module in model.named_modules(): if hasattr(module, "awq_lite"): - if module.awq_lite.num_cache_steps == 0: - # Uncalibrated expert: max calibrate weights and apply neutral - # (all-ones) pre_quant_scale for export consistency. - # NOTE: ones_scale must be registered OUTSIDE enable_weight_access_and_writeback + # Flag modules whose search pass missed them despite cache hits, so + # they fall through to the neutral-scale path below. + if module.awq_lite.num_cache_steps > 0 and module.awq_lite.num_search_steps == 0: + module.awq_lite.is_enabled = False + warnings.warn( + "awq_lite: Calling `forward_loop(model)` the second time did not forward" + f" data through the {name}. Please provide a valid `forward_loop` function" + " that can be used to forward data through the model many times." + ) + + if not module.awq_lite.is_enabled: + # Expert is disabled — uncalibrated (no cache-pass tokens, set + # at the pre-search pass above), had NaN in act/weight scales, + # or saw no search-pass tokens. 
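Returning to `_normalize_fused_experts_quantizer_name` in `conversion.py` above, its effect on wildcard matching is easiest to see in isolation. The dotted module path below is an arbitrary example, and the snippet re-creates the helper's regex rather than importing it.

```python
import fnmatch
import re

_LIST_RE = re.compile(r"(weight_quantizers?|input_quantizers?)\.\d+(?=$|\.)")


def normalize(name: str) -> str:
    # collapse "...weight_quantizers.N" to the singular "...weight_quantizer"
    return _LIST_RE.sub(lambda m: m.group(1).removesuffix("s"), name)


name = "model.layers.0.mlp.experts.gate_up_proj_weight_quantizers.3"
print(fnmatch.fnmatch(name, "*weight_quantizer"))             # False -> stock config wouldn't match
print(normalize(name))                                        # ...gate_up_proj_weight_quantizer
print(fnmatch.fnmatch(normalize(name), "*weight_quantizer"))  # True
```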
Max-calibrate weights and apply + # a neutral (all-ones) pre_quant_scale so the exporter sees a + # consistent nvfp4_awq format across all expert linears in an + # MoE group. + # NOTE: ones-scale must be registered OUTSIDE enable_weight_access_and_writeback # because HF accelerate post_forward drops newly-registered submodule buffers. + warnings.warn( + f"awq_lite: Forcing pre_quant_scale=1 for {name} because the expert " + "was not properly exercised during calibration. This may degrade accuracy; " + "consider increasing calibration size or using a more diverse dataset." + ) with enable_weight_access_and_writeback(module, model, name_to_module): max_calibrate(module, lambda module: module.weight_quantizer(module.weight)) w_shape, w_dtype, w_device = ( @@ -1289,13 +1308,6 @@ def postprocess(module, name): device=w_device, ) else: - if module.awq_lite.num_search_steps == 0: - module.awq_lite.is_enabled = False - warnings.warn( - "awq_lite: Calling `forward_loop(model)` the second time did not forward" - f" data through the {name}. Please provide a valid `forward_loop` function" - " that can be used to forward data through the model many times." - ) with enable_weight_access_and_writeback(module, model, name_to_module): postprocess(module, name) diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py index 990d0c0348..48fba1e145 100644 --- a/modelopt/torch/quantization/plugins/huggingface.py +++ b/modelopt/torch/quantization/plugins/huggingface.py @@ -900,6 +900,33 @@ def forward(self, *args, **kwargs): self._down_proj_linear = False return super().forward(*args, **kwargs) + def fold_weight(self, keep_attrs: bool = False): + """Fold per-expert weight quantizers into the fused 3-D weights. + + The base ``fold_weight`` only handles singular ``*_weight_quantizer`` + attributes. Fused experts use ``nn.ModuleList`` of per-expert quantizers + (``gate_up_proj_weight_quantizers``, ``down_proj_weight_quantizers``), + which would otherwise be skipped, leaving ``_amax`` on every quantizer. + """ + for weight_name, quantizers_name in ( + ("gate_up_proj", "gate_up_proj_weight_quantizers"), + ("down_proj", "down_proj_weight_quantizers"), + ): + weight = getattr(self, weight_name, None) + quantizers = getattr(self, quantizers_name, None) + if weight is None or quantizers is None: + continue + for idx, q in enumerate(quantizers): + if not (isinstance(q, TensorQuantizer) and q.fake_quant): + continue + slice_ = weight.data[idx] + slice_.copy_(q(slice_.float()).to(weight.dtype)) + q.disable() + if not keep_attrs: + for attr_name in ("_pre_quant_scale", "_amax"): + if hasattr(q, attr_name): + delattr(q, attr_name) + class _QuantDbrxFFN(_QuantSparseSequentialMoe): @property @@ -1438,6 +1465,38 @@ def register_fused_experts_on_the_fly(model): QuantModuleRegistry.register({mod_type: f"hf.{mod_type.__name__}"})(_QuantFusedExperts) +def force_eager_experts_impl_on_the_fly(model): + """Force HF fused-experts modules onto the eager ``F.linear``-based forward. + + HF transformers 5.0+ decorates fused-experts forwards with + ``@use_experts_implementation``, which may dispatch to ``torch._grouped_mm`` + or ``torch.bmm`` backends. Those backends bypass ``F.linear`` and so bypass + ``_QuantFusedExperts``'s input/weight quantizer hooks — calibration silently + does nothing, no ``input_scale`` / ``amax`` is collected, and the exported + checkpoint produces garbage at inference. 
+ + Sets ``config._experts_implementation = "eager"`` on the model config (and + recursively on ``text_config`` / ``vision_config`` / ``audio_config`` / + ``speech_config``) whenever a fused-experts module is present. + """ + if not any(_is_fused_experts_module(m) for m in model.modules()): + return + + nested_cfg_attrs = ("text_config", "vision_config", "audio_config", "speech_config") + + def _force(cfg): + if cfg is None: + return + if hasattr(cfg, "_experts_implementation"): + cfg._experts_implementation = "eager" + for sub in nested_cfg_attrs: + if hasattr(cfg, sub): + _force(getattr(cfg, sub)) + + if hasattr(model, "config"): + _force(model.config) + + def _is_supported_hf_model(model): """Check if the model a valid model for transformers quantization specific support.""" supported_models = [transformers.PreTrainedModel] @@ -1665,6 +1724,7 @@ def _reconstruct_fused_moe_linear(model: nn.Module) -> None: register_dbrx_moe_on_the_fly, register_step3p5_moe_on_the_fly, register_fused_experts_on_the_fly, + force_eager_experts_impl_on_the_fly, register_sparse_moe_on_the_fly, register_hf_attentions_on_the_fly, convert_hf_parallel_linears_on_the_fly, diff --git a/modelopt/torch/quantization/utils/__init__.py b/modelopt/torch/quantization/utils/__init__.py index dfc23c42ee..dc6daa0084 100644 --- a/modelopt/torch/quantization/utils/__init__.py +++ b/modelopt/torch/quantization/utils/__init__.py @@ -30,6 +30,7 @@ "reduce_amax", "reduce_sum", "replace_function", + "representative_weight_quantizer", "update_quant_cfg_with_kv_cache_quant", "weight_attr_names", ] diff --git a/modelopt/torch/quantization/utils/core_utils.py b/modelopt/torch/quantization/utils/core_utils.py index 29661e18f5..1a177e04dc 100644 --- a/modelopt/torch/quantization/utils/core_utils.py +++ b/modelopt/torch/quantization/utils/core_utils.py @@ -202,27 +202,57 @@ def reduce_sum(input, axis=None, keepdims=True): return output -def weight_attr_names(module: nn.Module) -> "Generator[str, None, None]": - """Get the weight param attribute names in a converted module, non-recursive. +def representative_weight_quantizer(module: nn.Module, weight_name: str = "weight"): + """Return the representative weight quantizer for ``weight_name`` on ``module``. + + Handles two layouts: + + - singular ``_weight_quantizer`` — standard ``nn.Linear`` / ``_QuantLinear``. + - plural ``_weight_quantizers`` (``nn.ModuleList``) — fused-experts modules + (``_QuantFusedExperts``) hold one ``TensorQuantizer`` per expert. Per-expert + formats are identical, so the first element is representative. - We consider the following two cases for each weight param attribute: - - The standard weight attribute (e.g. nn.Linear). - - The custom `weight_attr_name`. (e.g. Llama4TextExperts has weight attributes `gate_up_proj` and `down_proj`) + Returns ``None`` if no matching quantizer is found. 
""" from ..nn import SequentialQuantizer, TensorQuantizer - # the standard weight and quantizer case - weight = getattr(module, "weight", None) - weight_quantizer = getattr(module, "weight_quantizer", None) - if weight is not None and isinstance(weight_quantizer, (TensorQuantizer, SequentialQuantizer)): - yield "weight" + singular = quantizer_attr_names(weight_name).weight_quantizer + q = getattr(module, singular, None) + if isinstance(q, (TensorQuantizer, SequentialQuantizer)): + return q - # other weight and quantizer case + plural = getattr(module, singular + "s", None) + if isinstance(plural, nn.ModuleList) and len(plural) > 0: + first = plural[0] + if isinstance(first, (TensorQuantizer, SequentialQuantizer)): + return first + return None + + +def weight_attr_names(module: nn.Module) -> "Generator[str, None, None]": + """Get the weight param attribute names in a converted module, non-recursive. + + Covers three layouts: + + - standard ``nn.Linear``: ``weight`` + ``weight_quantizer``. + - custom per-weight quantizer (e.g. ``Llama4TextExperts`` with ``gate_up_proj`` + + ``gate_up_proj_weight_quantizer``). + - fused-experts ``nn.ModuleList`` quantizers (``_QuantFusedExperts`` with + ``gate_up_proj`` + ``gate_up_proj_weight_quantizers`` plural list). + """ + # standard: "weight" + "weight_quantizer" (singular) or "weight_quantizers" (plural) + if getattr(module, "weight", None) is not None: + if representative_weight_quantizer(module, "weight") is not None: + yield "weight" + + # per-parameter custom attr names for name, _ in module.named_parameters(recurse=False): + if name == "weight": + continue weight = getattr(module, name, None) - weight_quantizer = getattr(module, f"{name}_weight_quantizer", None) - if isinstance(weight, nn.Parameter) and isinstance( - weight_quantizer, (TensorQuantizer, SequentialQuantizer) + if ( + isinstance(weight, nn.Parameter) + and representative_weight_quantizer(module, name) is not None ): yield name diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py index 01cb3abe88..73cb917f37 100644 --- a/modelopt/torch/utils/dataset_utils.py +++ b/modelopt/torch/utils/dataset_utils.py @@ -18,7 +18,8 @@ import copy import json import os -from collections.abc import Callable +from collections.abc import Callable, Iterator +from contextlib import contextmanager, suppress from pathlib import Path from typing import TYPE_CHECKING, Any from warnings import warn @@ -437,6 +438,36 @@ def get_supported_datasets() -> list[str]: return list(SUPPORTED_DATASET_CONFIG.keys()) +@contextmanager +def _disable_use_cache(model: torch.nn.Module) -> Iterator[None]: + """Set ``model.config.use_cache = False`` for the duration of the block. + + KV caching is unwanted during calibration / memory-probe forward passes: + it wastes memory, and for hybrid Mamba/attention models (e.g., NemotronH) + the cache state is mutated in-place and breaks correctness. Setting + ``use_cache`` unconditionally (rather than only when it was already + present) also sidesteps configs that never assign the attribute at all + — e.g., ``Step3p5Config`` from stepfun-ai/Step-3.5-Flash — where forward + code that reads ``self.config.use_cache`` would otherwise raise + ``AttributeError``. The prior value is restored on exit if one existed. 
+ """ + config = getattr(model, "config", None) + if config is None: + yield + return + had_attr = hasattr(config, "use_cache") + prev = config.use_cache if had_attr else None + config.use_cache = False + try: + yield + finally: + if had_attr: + config.use_cache = prev + else: + with suppress(AttributeError): + delattr(config, "use_cache") + + def get_max_batch_size( model: torch.nn.Module, max_sample_length: int = 512, @@ -467,42 +498,43 @@ def _get_free_gpu_mem(): torch.ones([1, max_sample_length], dtype=torch.int32, device=model.device) * 100 ) - # Calculate single batch inference with dummy input. - with torch.set_grad_enabled(enable_grad): - infer_method(sample_input_single_batch) - free_mem_after, max_allocated_after = _get_free_gpu_mem() + with _disable_use_cache(model): + # Calculate single batch inference with dummy input. + with torch.set_grad_enabled(enable_grad): + infer_method(sample_input_single_batch) + free_mem_after, max_allocated_after = _get_free_gpu_mem() - mem_diff_per_data_batch = ( - max( - (free_mem_before - free_mem_after), - (max_allocated_after - max_allocated_before), + mem_diff_per_data_batch = ( + max( + (free_mem_before - free_mem_after), + (max_allocated_after - max_allocated_before), + ) + * sample_memory_usage_ratio ) - * sample_memory_usage_ratio - ) - if mem_diff_per_data_batch <= 0: - print( - "Warning: No measurable memory usage found for a single batch. " - "Falling back to batch_size=1." + if mem_diff_per_data_batch <= 0: # pragma: no cover - GPU memory probe edge case + print( # pragma: no cover + "Warning: No measurable memory usage found for a single batch. " + "Falling back to batch_size=1." + ) + target_data_batch = 1 # pragma: no cover + else: + target_data_batch = max(int(free_mem_before / mem_diff_per_data_batch), 1) + target_input = sample_input_single_batch.expand( + [ + target_data_batch if index == 0 else dim + for index, dim in enumerate(sample_input_single_batch.shape) + ] ) - target_data_batch = 1 - else: - target_data_batch = max(int(free_mem_before / mem_diff_per_data_batch), 1) - target_input = sample_input_single_batch.expand( - [ - target_data_batch if index == 0 else dim - for index, dim in enumerate(sample_input_single_batch.shape) - ] - ) - # For some models on multi GPU, we observe the memory per batch is not a constant. - # So we just test the target batch size and make sure we do not go OOM. - while target_data_batch > 1: - with torch.set_grad_enabled(enable_grad): - try: - infer_method(target_input) - break - except torch.cuda.OutOfMemoryError: - target_data_batch = target_data_batch // 2 + # For some models on multi GPU, we observe the memory per batch is not a constant. + # So we just test the target batch size and make sure we do not go OOM. + while target_data_batch > 1: + with torch.set_grad_enabled(enable_grad): + try: + infer_method(target_input) + break + except torch.cuda.OutOfMemoryError: # pragma: no cover - GPU OOM retry path + target_data_batch = target_data_batch // 2 # pragma: no cover # Regulate the data batch target to be 1, 2, 4, 8, 12, ..., capped at 64 if target_data_batch < 2: @@ -601,28 +633,16 @@ def _forward_loop( dataloader: DataLoader containing the batched input data allowed_non_tensor_keys: Set of key names whose values may be non-tensor types """ - # Disable KV caching during calibration — it is unnecessary overhead and causes - # correctness issues with hybrid Mamba/attention models whose cache state is mutated - # in-place (e.g., NemotronH). 
- config = getattr(model, "config", None) - prev_use_cache = getattr(config, "use_cache", None) - if config is not None and prev_use_cache is not None: - config.use_cache = False + with _disable_use_cache(model), torch.no_grad(): + is_enc_dec = model_type_is_enc_dec(model) + infer_method = model.generate if is_enc_dec else model.forward + max_working_batch_size = None # Initialize max working batch size as None - try: - with torch.no_grad(): - is_enc_dec = model_type_is_enc_dec(model) - infer_method = model.generate if is_enc_dec else model.forward - max_working_batch_size = None # Initialize max working batch size as None - - for _, data in enumerate(tqdm(dataloader)): - # Process batch and update max working batch size - max_working_batch_size = _process_batch( - data, infer_method, max_working_batch_size, allowed_non_tensor_keys - ) - finally: - if config is not None and prev_use_cache is not None: - config.use_cache = prev_use_cache + for _, data in enumerate(tqdm(dataloader)): + # Process batch and update max working batch size + max_working_batch_size = _process_batch( + data, infer_method, max_working_batch_size, allowed_non_tensor_keys + ) def create_forward_loop( diff --git a/modelopt/torch/utils/plugins/megatron_preprocess_data.py b/modelopt/torch/utils/plugins/megatron_preprocess_data.py index 0c9a121f69..81dac1580b 100644 --- a/modelopt/torch/utils/plugins/megatron_preprocess_data.py +++ b/modelopt/torch/utils/plugins/megatron_preprocess_data.py @@ -78,8 +78,9 @@ --strip_newlines ``` -Note: ``--hf_streaming`` without ``--hf_max_samples_per_split`` falls back to non-streaming, -since streaming the full dataset is slower than the cached non-streaming path. +Note: streaming does not cache to disk, so re-runs re-download. For full-dataset streaming +without a sample cap this is slower than non-streaming mode, but it avoids Arrow schema +compatibility issues with complex nested message types. """ import argparse @@ -191,7 +192,14 @@ def encode(self, json_line: str): if tools: kwargs["tools"] = tools value = self._process_messages(value) - text = _Encoder.tokenizer.apply_chat_template(value, tokenize=False, **kwargs) + try: + text = _Encoder.tokenizer.apply_chat_template(value, tokenize=False, **kwargs) + except Exception as e: + print( + f"apply_chat_template failed: {e}\nData:\n{json.dumps(data, indent=2, default=str)}", + flush=True, + ) + raise # chat template already embeds all special tokens; don't add BOS again add_special_tokens = False else: @@ -452,8 +460,9 @@ def megatron_preprocess_data( hf_split: Hugging Face Hub dataset split. Defaults to None (all splits). hf_max_samples_per_split: Maximum number of rows to consume per split. hf_streaming: Load HuggingFace datasets in streaming mode. Only consumed rows are - downloaded — useful for very large pretraining datasets. Note: streaming does not - cache to disk, so re-runs re-download. Defaults to False. + downloaded — useful for very large pretraining datasets or datasets with complex + nested message schemas that cause Arrow type-cast errors in non-streaming mode. + Note: streaming does not cache to disk, so re-runs re-download. Defaults to False. output_dir: Path to directory to save binary output files. tokenizer_name_or_path: Name or path of the Hugging Face tokenizer to use. json_keys: Key or list of keys to extract from json. Defaults to ["text"]. @@ -485,10 +494,9 @@ def megatron_preprocess_data( warnings.warn( "--hf_streaming is set but --hf_max_samples_per_split is not. 
" "Streaming without a sample cap re-downloads the full dataset on every run with no " - "disk cache, which is slower than non-streaming mode. Falling back to streaming=False.", + "disk cache, which is slower than the cached non-streaming path.", stacklevel=2, ) - hf_streaming = False Path(output_dir).mkdir(parents=True, exist_ok=True) vocab_size = AutoTokenizer.from_pretrained(tokenizer_name_or_path).vocab_size diff --git a/modelopt/torch/utils/plugins/transformers_dataset.py b/modelopt/torch/utils/plugins/transformers_dataset.py index 56b1e4f07b..162bdbd8cf 100644 --- a/modelopt/torch/utils/plugins/transformers_dataset.py +++ b/modelopt/torch/utils/plugins/transformers_dataset.py @@ -181,6 +181,8 @@ def _post_process_tokenizer(self): def _post_process_chat_template(self): # [WAR]: For DeepSeek-V3/R1 tokenizer, we modify the chat_template such that the # tokens are preserved for supervised learning. + if self.tokenizer.chat_template is None: + return self.tokenizer.chat_template = self.tokenizer.chat_template.replace( REMOVE_THINK_CHAT_TEMPLATE, "" ) diff --git a/modelopt_recipes/general/ptq/nvfp4_experts_only-fp8_kv.yaml b/modelopt_recipes/general/ptq/nvfp4_experts_only-fp8_kv.yaml index 220d062232..7c55703963 100644 --- a/modelopt_recipes/general/ptq/nvfp4_experts_only-fp8_kv.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_experts_only-fp8_kv.yaml @@ -20,7 +20,9 @@ quantize: algorithm: method: max # Max calibration is fast and does not typically need checkpointing. - layerwise: true + # layerwise=false required for VLMs where the decoder layers are nested under + # `model.language_model.layers` (layerwise_calibrate can't find them otherwise). + layerwise: false quant_cfg: - quantizer_name: '*' enable: false diff --git a/noxfile.py b/noxfile.py index fcef3d3087..4b012c9bcc 100644 --- a/noxfile.py +++ b/noxfile.py @@ -52,7 +52,7 @@ def _cov_args(): # ─── CPU unit tests ─────────────────────────────────────────────────────────── -@nox.session(python=["3.10", "3.11", "3.12", "3.13"]) +@nox.session(python=["3.10", "3.11", "3.12", "3.13", "3.14"]) @nox.parametrize("tf_ver", [nox.param(k, id=k) for k in TRANSFORMERS_VERSIONS]) @nox.parametrize("torch_ver", [nox.param(k, id=k) for k in TORCH_VERSIONS]) def unit(session, torch_ver, tf_ver): diff --git a/pyproject.toml b/pyproject.toml index b129ae6709..a174c6218d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ description = "Nvidia Model Optimizer: a unified model optimization and deployme readme = { text = "Checkout https://github.com/nvidia/Model-Optimizer for more information.", content-type = "text/markdown" } license = "Apache-2.0" license-files = ["LICENSE_HEADER"] -requires-python = ">=3.10,<3.14" +requires-python = ">=3.10,<3.15" authors = [{ name = "NVIDIA Corporation" }] classifiers = [ "Programming Language :: Python :: 3", diff --git a/tests/gpu/torch/puzzletron/test_puzzletron.py b/tests/gpu/torch/puzzletron/test_puzzletron.py index a393e1e086..d44cbc71e9 100644 --- a/tests/gpu/torch/puzzletron/test_puzzletron.py +++ b/tests/gpu/torch/puzzletron/test_puzzletron.py @@ -25,11 +25,6 @@ from _test_utils.torch.puzzletron.utils import setup_test_model_and_data from packaging.version import Version -# The puzzletron pipeline imports mip unconditionally at module level. In NeMo containers -# the [puzzletron] extras are not pre-installed, so importing the test file fails with a -# deep ModuleNotFoundError. Skip early with an actionable message instead. 
-pytest.importorskip("mip", reason="pip install -e '.[puzzletron]' to install MIP solver") - import modelopt.torch.puzzletron as mtpz import modelopt.torch.utils.distributed as dist diff --git a/tests/gpu/torch/puzzletron/tools/test_save_ckpt_from_shards.py b/tests/gpu/torch/puzzletron/tools/test_save_ckpt_from_shards.py new file mode 100644 index 0000000000..a31c687cc1 --- /dev/null +++ b/tests/gpu/torch/puzzletron/tools/test_save_ckpt_from_shards.py @@ -0,0 +1,132 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for save_checkpoint_from_shards in checkpoint_utils_hf.""" + +import json +from functools import partial + +import pytest +import torch +from _test_utils.torch.distributed.utils import spawn_multiprocess_job +from _test_utils.torch.transformers_models import get_tiny_llama +from safetensors.torch import load_file as safe_load_file + +from modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor import ( + LlamaModelDescriptor, +) +from modelopt.torch.puzzletron.tools.checkpoint_utils_hf import ( + SAFE_WEIGHTS_INDEX_NAME, + SAFETENSORS_SUBBLOCKS_DIR_NAME, + save_checkpoint_from_shards, +) + + +class TestSaveCheckpointFromShardsSingleProcess: + """Tests that run without torch.distributed (world_size=1 path).""" + + def test_creates_config_index_and_subblocks(self, tmp_path): + model = get_tiny_llama() + expected_keys = set(model.state_dict().keys()) + save_checkpoint_from_shards(model, tmp_path, LlamaModelDescriptor) + + # test safetensors index file exists and contains weight map + index_path = tmp_path / SAFE_WEIGHTS_INDEX_NAME + assert index_path.exists(), "safetensors index file was not written" + index = json.loads(index_path.read_text()) + assert "weight_map" in index + assert set(index["weight_map"].keys()) == expected_keys + + # test subblocks directory exists and contains shard files + subblocks_dir = tmp_path / SAFETENSORS_SUBBLOCKS_DIR_NAME + assert subblocks_dir.is_dir(), "subblocks directory was not created" + assert len(list(subblocks_dir.glob("*.safetensors"))) > 0, ( + "no safetensors shard files were saved" + ) + + # test config.json saved + config_path = tmp_path / "config.json" + assert config_path.exists(), "config.json was not saved" + cfg = json.loads(config_path.read_text()) + assert cfg["num_hidden_layers"] == get_tiny_llama().config.num_hidden_layers + + # test subblock filenames follow descriptor groups + filenames = set(index["weight_map"].values()) + expected_substrings = {"embeddings", "lm_head", "block_0_ffn", "block_0_attention"} + for substr in expected_substrings: + assert any(substr in f for f in filenames), f"no shard filename contains '{substr}'" + + def test_tie_word_embeddings_excluded(self, tmp_path): + model = get_tiny_llama(tie_word_embeddings=True) + save_checkpoint_from_shards(model, tmp_path, LlamaModelDescriptor) + + index = json.loads((tmp_path / 
SAFE_WEIGHTS_INDEX_NAME).read_text()) + assert "lm_head.weight" not in index["weight_map"] + + reloaded_sd = {} + for shard in (tmp_path / SAFETENSORS_SUBBLOCKS_DIR_NAME).glob("*.safetensors"): + reloaded_sd.update(safe_load_file(str(shard))) + assert "lm_head.weight" not in reloaded_sd + + def test_saved_weights_match_original(self, tmp_path): + model = get_tiny_llama() + original_sd = {k: v.clone().cpu() for k, v in model.state_dict().items()} + save_checkpoint_from_shards(model, tmp_path, LlamaModelDescriptor) + + reloaded_sd = {} + for shard in (tmp_path / SAFETENSORS_SUBBLOCKS_DIR_NAME).glob("*.safetensors"): + reloaded_sd.update(safe_load_file(str(shard))) + + assert set(reloaded_sd.keys()) == set(original_sd.keys()) + for key in original_sd: + torch.testing.assert_close(reloaded_sd[key], original_sd[key]) + + +def _distributed_save_worker(rank, world_size, checkpoint_dir): + """Worker that shards a model's state dict across ranks and saves.""" + model = get_tiny_llama() + full_sd = model.state_dict() + keys = sorted(full_sd.keys()) + per_rank = len(keys) // world_size + start = rank * per_rank + end = start + per_rank if rank < world_size - 1 else len(keys) + shard_keys = keys[start:end] + + # Zero out keys not owned by this rank so gather reconstructs the full dict. + for k in keys: + if k not in shard_keys: + full_sd[k] = torch.zeros_like(full_sd[k]) + + model.load_state_dict(full_sd) + save_checkpoint_from_shards(model, checkpoint_dir, LlamaModelDescriptor) + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="need >=2 GPUs for multi-rank test") +class TestSaveCheckpointFromShardsMultiProcess: + """Tests that exercise the distributed gather path (world_size > 1).""" + + def test_distributed_save_creates_valid_checkpoint(self, tmp_path): + spawn_multiprocess_job(2, partial(_distributed_save_worker, checkpoint_dir=tmp_path)) + + index_path = tmp_path / SAFE_WEIGHTS_INDEX_NAME + assert index_path.exists() + index = json.loads(index_path.read_text()) + + model = get_tiny_llama() + expected_keys = set(model.state_dict().keys()) + assert set(index["weight_map"].keys()) == expected_keys + + shard_files = list((tmp_path / SAFETENSORS_SUBBLOCKS_DIR_NAME).glob("*.safetensors")) + assert len(shard_files) > 0 diff --git a/tests/unit/onnx/quantization/test_qdq_utils.py b/tests/unit/onnx/quantization/test_qdq_utils.py index 42aa317119..8af5f560dd 100644 --- a/tests/unit/onnx/quantization/test_qdq_utils.py +++ b/tests/unit/onnx/quantization/test_qdq_utils.py @@ -1021,3 +1021,90 @@ def test_column_major_gemm_trans_b_flip(self): print(f"transB flipped: 1 -> {trans_b_value}") print(f"Transpose nodes: {len(transpose_nodes)}") + + +def _build_model_with_zero_scale_initializer(dq_op_type: str): + """Build an ONNX model whose scale initializer feeds a (Quantize|Dequantize)Linear node. + + Mirrors the INT4_AWQ failure mode from NVBug 6110209: scales live in graph initializers + (not Constant nodes) and feed DequantizeLinear (default or trt:: domain) consumers. 
+ """ + weight_data = np.random.randint(-8, 8, size=(6, 8), dtype=np.int8) + weight_tensor = numpy_helper.from_array(weight_data, "weight") + + scale_data = np.array([1e-3, 0.0, 5e-4, 0.0, 0.0, 2e-3], dtype=np.float16).reshape(6, 1) + scale_tensor = numpy_helper.from_array(scale_data, "scale") + + input_tensor = helper.make_tensor_value_info("input", TensorProto.FLOAT16, [None, 6]) + dq_node = helper.make_node( + dq_op_type, inputs=["weight", "scale"], outputs=["dq_output"], name="weight_dq" + ) + matmul_node = helper.make_node( + "MatMul", inputs=["input", "dq_output"], outputs=["output"], name="matmul" + ) + graph = helper.make_graph( + nodes=[dq_node, matmul_node], + name="test_graph", + inputs=[input_tensor], + outputs=[helper.make_tensor_value_info("output", TensorProto.FLOAT16, [None, 8])], + initializer=[weight_tensor, scale_tensor], + ) + return helper.make_model(graph) + + +class TestReplaceZeroScaleWithSmallestNonzero: + """Regression tests for ``replace_zero_scale_with_smallest_nonzero`` (NVBug 6110209).""" + + @pytest.mark.parametrize("dq_op_type", ["DequantizeLinear", "TRT_INT4DequantizeLinear"]) + def test_zero_scale_initializer_fed_to_dq_is_patched(self, dq_op_type): + from modelopt.onnx.quantization.qdq_utils import replace_zero_scale_with_smallest_nonzero + + model = _build_model_with_zero_scale_initializer(dq_op_type) + scale_before = numpy_helper.to_array( + next(init for init in model.graph.initializer if init.name == "scale") + ) + assert (scale_before == 0).any(), "fixture must contain zeros to exercise the fix" + + patched = replace_zero_scale_with_smallest_nonzero(model) + + scale_after_init = next(init for init in patched.graph.initializer if init.name == "scale") + scale_after = numpy_helper.to_array(scale_after_init) + assert not (scale_after == 0).any() + assert (scale_after > 0).all() + assert scale_after_init.data_type == TensorProto.FLOAT16 + + def test_constant_node_scale_path_still_patched(self): + """Legacy Constant-node QDQ path must continue to be patched.""" + from modelopt.onnx.quantization.qdq_utils import replace_zero_scale_with_smallest_nonzero + + scale_data = np.array([1e-3, 0.0, 2e-3], dtype=np.float16) + scale_const = helper.make_node( + "Constant", + inputs=[], + outputs=["scale_out"], + value=numpy_helper.from_array(scale_data), + name="scale_constant", + ) + input_tensor = helper.make_tensor_value_info("input", TensorProto.FLOAT, [3]) + q_node = helper.make_node( + "QuantizeLinear", + inputs=["input", "scale_out"], + outputs=["q_output"], + name="q", + ) + graph = helper.make_graph( + nodes=[scale_const, q_node], + name="test_graph", + inputs=[input_tensor], + outputs=[helper.make_tensor_value_info("q_output", TensorProto.INT8, [3])], + initializer=[], + ) + model = helper.make_model(graph) + + patched = replace_zero_scale_with_smallest_nonzero(model) + + const = next(n for n in patched.graph.node if n.op_type == "Constant") + value_attr = next(a for a in const.attribute if a.name == "value") + scale_arr = numpy_helper.to_array(value_attr.t) + assert not (scale_arr == 0).any() + assert (scale_arr > 0).all() diff --git a/tests/unit/torch/quantization/plugins/test_fused_experts.py b/tests/unit/torch/quantization/plugins/test_fused_experts.py index 7e77bf1151..2943582774 100644 --- a/tests/unit/torch/quantization/plugins/test_fused_experts.py +++ b/tests/unit/torch/quantization/plugins/test_fused_experts.py @@ -22,11 +22,13 @@ pytest.importorskip("transformers") +from modelopt.torch.quantization.conversion import 
_normalize_fused_experts_quantizer_name from modelopt.torch.quantization.nn import QuantModuleRegistry from modelopt.torch.quantization.plugins.huggingface import ( _is_fused_experts_module, _is_sparse_sequaential_moe_block, _QuantFusedExperts, + force_eager_experts_impl_on_the_fly, register_fused_experts_on_the_fly, register_sparse_moe_on_the_fly, ) @@ -297,3 +299,316 @@ def test_export_creates_per_expert_submodules(self): if QuantModuleRegistry.get(expert_type) is not None: QuantModuleRegistry.unregister(expert_type) + + +# --------------------------------------------------------------------------- +# Tests for force_eager_experts_impl_on_the_fly +# --------------------------------------------------------------------------- +class _StubConfig: + """Minimal stand-in for HF PretrainedConfig with optional nested sub-configs.""" + + def __init__(self, impl=None, **nested): + if impl is not None: + self._experts_implementation = impl + for key, value in nested.items(): + setattr(self, key, value) + + +class _TinyMoEModelWithConfig(_TinyMoEModel): + def __init__(self, config): + super().__init__() + self.config = config + + +class _NonMoEModelWithConfig(nn.Module): + def __init__(self, config): + super().__init__() + self.linear = nn.Linear(HIDDEN_DIM, HIDDEN_DIM) + self.config = config + + +class TestForceEagerExpertsImpl: + def test_sets_eager_on_moe_model(self): + """Non-eager backend on an MoE model gets flipped to eager.""" + cfg = _StubConfig(impl="kernels") + model = _TinyMoEModelWithConfig(cfg) + force_eager_experts_impl_on_the_fly(model) + assert cfg._experts_implementation == "eager" + + def test_recurses_into_nested_configs(self): + """VLM-style nested text_config / vision_config are also flipped.""" + text_cfg = _StubConfig(impl="grouped_mm") + vision_cfg = _StubConfig(impl="bmm") + root_cfg = _StubConfig(text_config=text_cfg, vision_config=vision_cfg) + model = _TinyMoEModelWithConfig(root_cfg) + force_eager_experts_impl_on_the_fly(model) + assert text_cfg._experts_implementation == "eager" + assert vision_cfg._experts_implementation == "eager" + + def test_skips_model_without_fused_experts(self): + """Non-MoE models must not have their config silently mutated.""" + cfg = _StubConfig(impl="kernels") + model = _NonMoEModelWithConfig(cfg) + force_eager_experts_impl_on_the_fly(model) + assert cfg._experts_implementation == "kernels" + + def test_no_crash_when_config_missing(self): + """Model without a ``config`` attribute must not raise.""" + force_eager_experts_impl_on_the_fly(_TinyMoEModel()) # no-op, no error + + def test_no_crash_when_impl_attr_missing(self): + """Config without ``_experts_implementation`` must not raise.""" + cfg = _StubConfig() # no impl attr + model = _TinyMoEModelWithConfig(cfg) + force_eager_experts_impl_on_the_fly(model) + assert not hasattr(cfg, "_experts_implementation") + + def test_leaves_eager_value_unchanged(self): + cfg = _StubConfig(impl="eager") + model = _TinyMoEModelWithConfig(cfg) + force_eager_experts_impl_on_the_fly(model) + assert cfg._experts_implementation == "eager" + + +# --------------------------------------------------------------------------- +# End-to-end PTQ calibration test — guards the full fused-experts path: +# register_fused_experts_on_the_fly → _QuantFusedExperts.{_setup, forward} → +# plural ModuleList name normalization in conversion._match_quantizer → +# TensorQuantizer amax collection via the F.linear hook. +# If any link breaks, quantizer `amax` stays None and this test fails. 
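+# Illustrative mapping (the same one exercised by
+# TestNormalizeFusedExpertsQuantizerName at the bottom of this file):
+#   "moe.experts.gate_up_proj_weight_quantizers.3"
+#       -> "moe.experts.gate_up_proj_weight_quantizer"
+# i.e. an indexed entry of the plural ModuleList is normalized so it can match
+# the singular "*gate_up_proj_weight_quantizer" wildcard used in quant_cfg.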
+# --------------------------------------------------------------------------- +class TestFusedExpertsCalibration: + @staticmethod + def _cleanup_registry(mod_type): + if QuantModuleRegistry.get(mod_type) is not None: + QuantModuleRegistry.unregister(mod_type) + + def test_calibration_populates_all_expert_quantizers(self): + """After PTQ, every input/weight quantizer on the fused-experts module has amax set.""" + import modelopt.torch.quantization as mtq + + model = _TinyMoEModel() + expert_type = type(model.moe.experts) + self._cleanup_registry(expert_type) + + quant_cfg = { + "quant_cfg": [ + {"quantizer_name": "*", "enable": False}, + { + "quantizer_name": "*gate_up_proj_input_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + }, + { + "quantizer_name": "*down_proj_input_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + }, + { + "quantizer_name": "*gate_up_proj_weight_quantizer", + "cfg": {"num_bits": 8, "axis": 0}, + }, + { + "quantizer_name": "*down_proj_weight_quantizer", + "cfg": {"num_bits": 8, "axis": 0}, + }, + ], + "algorithm": "max", + } + + def forward_loop(m): + torch.manual_seed(0) + for _ in range(2): + x = torch.randn(1, 4, HIDDEN_DIM) + m(x) + + mtq.quantize(model, quant_cfg, forward_loop=forward_loop) + + experts = model.moe.experts + assert experts.gate_up_proj_input_quantizer.amax is not None, ( + "Shared gate_up_proj input quantizer was not calibrated — " + "F.linear hook likely bypassed by non-eager experts_implementation." + ) + assert experts.down_proj_input_quantizer.amax is not None, ( + "Shared down_proj input quantizer was not calibrated." + ) + for idx in range(NUM_EXPERTS): + assert experts.gate_up_proj_weight_quantizers[idx].amax is not None, ( + f"gate_up_proj_weight_quantizers[{idx}].amax is None — " + "plural ModuleList name normalization in _match_quantizer likely broken." + ) + assert experts.down_proj_weight_quantizers[idx].amax is not None, ( + f"down_proj_weight_quantizers[{idx}].amax is None." + ) + + self._cleanup_registry(expert_type) + + +# --------------------------------------------------------------------------- +# Tests for export enumeration — guards the bug where fused-experts were +# silently skipped by get_quant_config because their weight quantizers live +# on a plural nn.ModuleList instead of the singular *_weight_quantizer attr. +# Missed enumeration → experts don't appear in quantized_layers → +# quantization_formats has only 1 entry from the non-expert modules → +# quant_algo lands on that format instead of "MIXED_PRECISION". 
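+# Expected shape of the exported config in the healthy case (sketch only;
+# the exact layer keys and format strings come from get_quant_config, not
+# from this comment):
+#   cfg["quantization"]["quant_algo"] == "MIXED_PRECISION"
+#   cfg["quantization"]["quantized_layers"] has an entry for the fused-experts
+#   module ("moe.experts", FP8 per-tensor) and one for the dense Linear
+#   ("dense", FP8 per-channel, i.e. the FP8_PC_PT-style format noted below).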
+# --------------------------------------------------------------------------- +class _MixedPrecisionModel(nn.Module): + """A model with both a fused-experts block AND a standard Linear, so a + mixed-precision recipe should produce two distinct format groups.""" + + def __init__(self): + super().__init__() + self.moe = _SyntheticSparseMoeBlock() + self.dense = nn.Linear(HIDDEN_DIM, HIDDEN_DIM) + + def forward(self, x): + return self.dense(self.moe(x)) + + +class TestMixedPrecisionExport: + @staticmethod + def _cleanup_registry(mod_type): + if QuantModuleRegistry.get(mod_type) is not None: + QuantModuleRegistry.unregister(mod_type) + + def test_weight_attr_names_yields_fused_expert_params(self): + """weight_attr_names must yield gate_up_proj / down_proj on fused experts + even though their quantizers are a plural ModuleList, not singular.""" + from modelopt.torch.quantization.utils.core_utils import weight_attr_names + + model = _TinyMoEModel() + expert_type = type(model.moe.experts) + self._cleanup_registry(expert_type) + + register_fused_experts_on_the_fly(model) + converted = QuantModuleRegistry.convert(model.moe.experts) + + yielded = list(weight_attr_names(converted)) + assert set(yielded) == {"gate_up_proj", "down_proj"}, ( + f"Expected both fused weight attrs, got {yielded}. " + "Likely regression in representative_weight_quantizer plural fallback." + ) + + self._cleanup_registry(expert_type) + + def test_mixed_precision_config_export(self): + """Mixed-precision recipe (experts FP8 + dense Linear FP8 per-channel) should + show both modules in quantized_layers. Using two distinct formats would + trigger MIXED_PRECISION; using same-format still exercises enumeration.""" + import modelopt.torch.quantization as mtq + from modelopt.torch.export.quant_utils import get_quant_config + + model = _MixedPrecisionModel() + expert_type = type(model.moe.experts) + self._cleanup_registry(expert_type) + + # FP8 per-tensor for experts; FP8 per-channel for dense — two distinct + # format strings in quantization_formats, so quant_algo must become + # MIXED_PRECISION. + quant_cfg = { + "quant_cfg": [ + {"quantizer_name": "*", "enable": False}, + { + "quantizer_name": "*gate_up_proj_input_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + }, + { + "quantizer_name": "*down_proj_input_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + }, + { + "quantizer_name": "*gate_up_proj_weight_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + }, + { + "quantizer_name": "*down_proj_weight_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + }, + { + "quantizer_name": "*dense.input_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + }, + { + "quantizer_name": "*dense.weight_quantizer", + "cfg": {"num_bits": (4, 3), "axis": 0}, # per-channel → FP8_PC_PT + }, + ], + "algorithm": "max", + } + + def forward_loop(m): + torch.manual_seed(0) + for _ in range(2): + x = torch.randn(1, 4, HIDDEN_DIM) + m(x) + + mtq.quantize(model, quant_cfg, forward_loop=forward_loop) + + cfg = get_quant_config(model) + q = cfg["quantization"] + + # The fused-experts module MUST appear in quantized_layers. This is the + # central guard: regressions of weight_attr_names plural fallback would + # make experts disappear here. + layer_names = set(q.get("quantized_layers", {}).keys()) + assert any("moe.experts" in n for n in layer_names), ( + f"Fused-experts module missing from quantized_layers: {layer_names}. " + "weight_attr_names likely not yielding plural-ModuleList weight attrs." 
+ ) + assert any(n.endswith("dense") for n in layer_names), ( + f"Dense Linear missing from quantized_layers: {layer_names}." + ) + + # Two distinct formats → MIXED_PRECISION at top level. + assert q["quant_algo"] == "MIXED_PRECISION", ( + f"Expected MIXED_PRECISION (fused-experts FP8 per-tensor + dense " + f"FP8 per-channel), got quant_algo={q['quant_algo']}. " + f"quantized_layers={q.get('quantized_layers')}" + ) + + self._cleanup_registry(expert_type) + + +# --------------------------------------------------------------------------- +# Tests for the fused-experts quantizer-name normalizer used by +# conversion._match_quantizer. Covers both plural (actual _QuantFusedExperts +# layout) and singular (defensive: future variants may name the ModuleList +# without the trailing `s`) forms. +# --------------------------------------------------------------------------- +class TestNormalizeFusedExpertsQuantizerName: + def test_plural_weight_quantizers_stripped(self): + assert ( + _normalize_fused_experts_quantizer_name("moe.experts.gate_up_proj_weight_quantizers.7") + == "moe.experts.gate_up_proj_weight_quantizer" + ) + + def test_plural_input_quantizers_stripped(self): + assert ( + _normalize_fused_experts_quantizer_name("moe.experts.down_proj_input_quantizers.3") + == "moe.experts.down_proj_input_quantizer" + ) + + def test_singular_weight_quantizer_with_index_stripped(self): + """Defensive: handle variants that name the ModuleList singular.""" + assert ( + _normalize_fused_experts_quantizer_name("moe.experts.gate_up_proj_weight_quantizer.2") + == "moe.experts.gate_up_proj_weight_quantizer" + ) + + def test_singular_input_quantizer_with_index_stripped(self): + assert ( + _normalize_fused_experts_quantizer_name("moe.experts.down_proj_input_quantizer.0") + == "moe.experts.down_proj_input_quantizer" + ) + + def test_non_indexed_name_unchanged(self): + """Plain singular names (no index) must be passed through untouched.""" + assert ( + _normalize_fused_experts_quantizer_name("moe.experts.gate_up_proj_weight_quantizer") + == "moe.experts.gate_up_proj_weight_quantizer" + ) + + def test_unrelated_dotted_number_unchanged(self): + """Dotted numbers that aren't inside a quantizer-list context are left alone.""" + assert ( + _normalize_fused_experts_quantizer_name("moe.layers.3.gate.weight") + == "moe.layers.3.gate.weight" + ) diff --git a/tests/unit/torch/utils/test_dataset_utils.py b/tests/unit/torch/utils/test_dataset_utils.py index 9a89d53672..94a2a5a6aa 100644 --- a/tests/unit/torch/utils/test_dataset_utils.py +++ b/tests/unit/torch/utils/test_dataset_utils.py @@ -17,8 +17,14 @@ import pytest import torch +from torch.utils.data import DataLoader -from modelopt.torch.utils.dataset_utils import _process_batch, get_dataset_samples +from modelopt.torch.utils.dataset_utils import ( + _disable_use_cache, + _forward_loop, + _process_batch, + get_dataset_samples, +) def setup_test_data(): @@ -145,6 +151,86 @@ def mock_infer(**kwargs): _process_batch(batch_data, mock_infer, allowed_non_tensor_keys={"base_model_outputs"}) +class _Config: + """Minimal config stand-in; instances start with no `use_cache` attribute.""" + + +def test_disable_use_cache_no_config_attr(): + """Model without a `config` attribute: CM is a no-op and does not raise.""" + model = torch.nn.Linear(4, 4) + assert not hasattr(model, "config") + + with _disable_use_cache(model): + assert not hasattr(model, "config") + + assert not hasattr(model, "config") + + +@pytest.mark.parametrize("prev_value", [True, False]) +def 
test_disable_use_cache_with_existing_attr(prev_value): + """Config that already has `use_cache`: forced to False inside, restored on exit.""" + model = torch.nn.Linear(4, 4) + model.config = _Config() + model.config.use_cache = prev_value + + with _disable_use_cache(model): + assert model.config.use_cache is False + + assert model.config.use_cache is prev_value + + +def test_disable_use_cache_without_existing_attr(): + """Config that lacks `use_cache`: set to False inside, attribute removed on exit (no leak).""" + model = torch.nn.Linear(4, 4) + model.config = _Config() + assert not hasattr(model.config, "use_cache") + + with _disable_use_cache(model): + assert model.config.use_cache is False + + assert not hasattr(model.config, "use_cache") + + +def test_forward_loop_runs_under_disabled_use_cache(): + """`_forward_loop` runs forward on every batch and restores `use_cache` on exit.""" + seen_use_cache: list[bool] = [] + + class _Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.config = _Config() + self.config.use_cache = True + + def forward(self, **kwargs): + seen_use_cache.append(self.config.use_cache) + + model = _Model() + + def _collate(samples): + return {"input_ids": torch.stack([s["input_ids"] for s in samples])} + + data = [{"input_ids": torch.zeros(8, dtype=torch.long)} for _ in range(3)] + loader = DataLoader(data, batch_size=1, collate_fn=_collate) + + _forward_loop(model, loader) + + assert seen_use_cache == [False, False, False] + assert model.config.use_cache is True + + +def test_disable_use_cache_restores_on_exception(): + """Restore must run even if the with-block raises.""" + model = torch.nn.Linear(4, 4) + model.config = _Config() + model.config.use_cache = True + + with pytest.raises(RuntimeError, match="boom"), _disable_use_cache(model): + assert model.config.use_cache is False + raise RuntimeError("boom") + + assert model.config.use_cache is True + + @pytest.mark.parametrize("test_local_path", [True, False]) def test_get_dataset_samples_with_unsupported_minipile_dataset(tmp_path, test_local_path): pytest.importorskip("datasets")