diff --git a/3rdparty/Megatron-Bridge-workspace/Megatron-Bridge b/3rdparty/Megatron-Bridge-workspace/Megatron-Bridge index 7110a96427..95e5f38f87 160000 --- a/3rdparty/Megatron-Bridge-workspace/Megatron-Bridge +++ b/3rdparty/Megatron-Bridge-workspace/Megatron-Bridge @@ -1 +1 @@ -Subproject commit 7110a964272a5c74dcb6b680b691087e190c220c +Subproject commit 95e5f38f8727c4ab30830559c68939f35f4e52f6 diff --git a/3rdparty/Megatron-Bridge-workspace/setup.py b/3rdparty/Megatron-Bridge-workspace/setup.py index 397dde2c7d..871557a27c 100644 --- a/3rdparty/Megatron-Bridge-workspace/setup.py +++ b/3rdparty/Megatron-Bridge-workspace/setup.py @@ -56,7 +56,7 @@ "flash-linear-attention", "timm", "open-clip-torch>=3.2.0", - "mlflow>=3.5.0", + "mlflow>=3.9.0", "comet-ml>=3.50.0", "torch>=2.6.0", ] diff --git a/3rdparty/Megatron-LM-workspace/Megatron-LM b/3rdparty/Megatron-LM-workspace/Megatron-LM index 17a67b9a97..d30c3ae546 160000 --- a/3rdparty/Megatron-LM-workspace/Megatron-LM +++ b/3rdparty/Megatron-LM-workspace/Megatron-LM @@ -1 +1 @@ -Subproject commit 17a67b9a97fb11a75933fd7f76ad76e1ac98a53d +Subproject commit d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81 diff --git a/3rdparty/Megatron-LM-workspace/setup.py b/3rdparty/Megatron-LM-workspace/setup.py index d6339e726a..75b5831fb4 100644 --- a/3rdparty/Megatron-LM-workspace/setup.py +++ b/3rdparty/Megatron-LM-workspace/setup.py @@ -51,7 +51,7 @@ # TODO(https://github.com/NVIDIA-NeMo/RL/issues/2111): upgrade to core_cu13 when we move to CUDA 13 base container "transformer-engine[pytorch,core_cu12]", # VCS dependency - must match pyproject.toml [tool.uv.sources] - "nvidia-resiliency-ext @ git+https://github.com/NVIDIA/nvidia-resiliency-ext.git@63154570cea17f8805a7fd15cc3b8cc2919ba575", + "nvidia-resiliency-ext @ git+https://github.com/NVIDIA/nvidia-resiliency-ext.git@15a851565a4ce846c04431ecb0cf09903ab4837e", "tqdm", "einops~=0.8", "tensorstore~=0.1,!=0.1.46,!=0.1.72", diff --git a/nemo_rl/models/generation/vllm/vllm_backend.py b/nemo_rl/models/generation/vllm/vllm_backend.py index 9237788be1..05c9b837c9 100644 --- a/nemo_rl/models/generation/vllm/vllm_backend.py +++ b/nemo_rl/models/generation/vllm/vllm_backend.py @@ -37,6 +37,20 @@ ) +def fix_gpt_oss_export_transpose(key: str, weight: torch.Tensor) -> torch.Tensor: + """Apply GPT-OSS down_proj transpose fix to the weight. + + This is a workaround for the issue that the down_proj layout is not the same across different frameworks. + - HF needs [in, out] layout. + - Megatron needs [in, out] layout. + - vLLM needs [out, in] layout. + See https://github.com/NVIDIA-NeMo/Megatron-Bridge/pull/3271 for more details. + """ + if key.endswith("mlp.experts.down_proj"): + weight = weight.transpose(-2, -1).contiguous() + return weight + + class VllmInternalWorkerExtension: def init_collective( self, @@ -199,20 +213,30 @@ def update_weights_via_ipc_zmq(self) -> bool: shape, dtype = self.state_dict_info[key] # pyrefly if isinstance(shape, list): shape = torch.Size(shape) + + # Get the weight from the buffer size_in_bytes = dtype.itemsize * shape.numel() - weights.append( - ( - key, - buffer[offset : offset + size_in_bytes] - .view(dtype=dtype) - .view(shape), - ) + weight = ( + buffer[offset : offset + size_in_bytes] + .view(dtype=dtype) + .view(shape) ) + # apply gpt-oss transpose fix + if ( + "GptOssForCausalLM" + in self.model_runner.vllm_config.model_config.architectures + ): + weight = fix_gpt_oss_export_transpose(key, weight) + weights.append((key, weight)) + + # Move offset to the next weight aligned_size = calculate_aligned_size(size_in_bytes) offset += aligned_size + assert offset == used_bytes, ( "Offset is not equal to used bytes, usually indicate inaccurate info like keys or cached dtype in state_dict_info" ) + # Load weights into the model from nemo_rl.models.generation.vllm.quantization import fp8 @@ -276,6 +300,15 @@ def _load_model_weights(weights, model_runner): """ from nemo_rl.models.generation.vllm.quantization import fp8 + # apply gpt-oss transpose fix + if ( + "GptOssForCausalLM" + in self.model_runner.vllm_config.model_config.architectures + ): + for idx, (key, weight) in enumerate(weights): + weight = fix_gpt_oss_export_transpose(key, weight) + weights[idx] = (key, weight) + policy_weights, draft_weights = self._split_policy_and_draft_weights( weights ) diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py index 49d9007fcd..fc5c6c44fa 100644 --- a/nemo_rl/models/megatron/setup.py +++ b/nemo_rl/models/megatron/setup.py @@ -142,29 +142,6 @@ def destroy_parallel_state(): except ImportError: pass - # Reset the third global async_calls instance in base strategy module - try: - import megatron.core.dist_checkpointing.strategies.base as base_strategy - from megatron.core.dist_checkpointing.strategies.async_utils import ( - AsyncCallsQueue, - ) - - # Clean up and reset the global async_calls in base strategy - old_call_idx = getattr(base_strategy.async_calls, "call_idx", None) - num_unfinalized = base_strategy.async_calls.get_num_unfinalized_calls() - if num_unfinalized > 0: - print( - f"[WARNING] Resetting base strategy async_calls with {num_unfinalized} unfinalized calls" - ) - try: - base_strategy.async_calls.close() - except: - pass - base_strategy.async_calls = AsyncCallsQueue() - print(f"[DEBUG] Reset base strategy async_calls (old call_idx: {old_call_idx})") - except ImportError: - pass - def setup_distributed() -> None: """Handle NCCL settings, dtype mapping, and basic config setup.""" diff --git a/pyproject.toml b/pyproject.toml index 284d782030..4a3574dc10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -497,7 +497,7 @@ requires-dist = [ "flash-linear-attention", "timm", "open-clip-torch>=3.2.0", - "mlflow>=3.5.0", + "mlflow>=3.9.0", "comet-ml>=3.50.0", "torch>=2.6.0", ] diff --git a/uv.lock b/uv.lock index 9f32f86af7..55021a6873 100644 --- a/uv.lock +++ b/uv.lock @@ -146,7 +146,7 @@ requires-dist = ["torch", "packaging", "ninja", "causal-conv1d"] [[manifest.dependency-metadata]] name = "megatron-bridge" version = "0.0.0" -requires-dist = ["transformers>=5.0.0,<=5.3.0", "peft>=0.18.1", "datasets>=2.20.0", "accelerate", "diffusers>=0.36.0", "peft>=0.18.0", "einops", "imageio", "imageio-ffmpeg", "omegaconf>=2.3.0", "tensorboard>=2.19.0", "typing-extensions", "rich", "wandb>=0.25.0", "six>=1.17.0", "regex>=2024.11.6", "pyyaml>=6.0.2", "tqdm>=4.67.1", "hydra-core>1.3,<=1.3.2", "qwen-vl-utils", "transformer-engine[pytorch,core-cu12]", "mamba-ssm", "nvidia-resiliency-ext", "causal-conv1d", "flash-linear-attention", "timm", "open-clip-torch>=3.2.0", "mlflow>=3.5.0", "comet-ml>=3.50.0", "torch>=2.6.0"] +requires-dist = ["transformers>=5.0.0,<=5.3.0", "peft>=0.18.1", "datasets>=2.20.0", "accelerate", "diffusers>=0.36.0", "peft>=0.18.0", "einops", "imageio", "imageio-ffmpeg", "omegaconf>=2.3.0", "tensorboard>=2.19.0", "typing-extensions", "rich", "wandb>=0.25.0", "six>=1.17.0", "regex>=2024.11.6", "pyyaml>=6.0.2", "tqdm>=4.67.1", "hydra-core>1.3,<=1.3.2", "qwen-vl-utils", "transformer-engine[pytorch,core-cu12]", "mamba-ssm", "nvidia-resiliency-ext", "causal-conv1d", "flash-linear-attention", "timm", "open-clip-torch>=3.2.0", "mlflow>=3.9.0", "comet-ml>=3.50.0", "torch>=2.6.0"] [[manifest.dependency-metadata]] name = "nv-grouped-gemm" @@ -3309,7 +3309,7 @@ requires-dist = [ { name = "imageio" }, { name = "imageio-ffmpeg" }, { name = "mamba-ssm" }, - { name = "mlflow", specifier = ">=3.5.0" }, + { name = "mlflow", specifier = ">=3.9.0" }, { name = "nvidia-resiliency-ext" }, { name = "omegaconf", specifier = ">=2.3.0" }, { name = "open-clip-torch", specifier = ">=3.2.0" }, @@ -3381,7 +3381,7 @@ requires-dist = [ { name = "multi-storage-client", specifier = "~=0.27" }, { name = "numpy" }, { name = "nvidia-modelopt", extras = ["torch"], marker = "sys_platform != 'darwin'" }, - { name = "nvidia-resiliency-ext", git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git?rev=63154570cea17f8805a7fd15cc3b8cc2919ba575" }, + { name = "nvidia-resiliency-ext", git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git?rev=15a851565a4ce846c04431ecb0cf09903ab4837e" }, { name = "nvtx", specifier = "~=0.2" }, { name = "onnxscript" }, { name = "openai", extras = ["aiohttp"] }, @@ -4807,8 +4807,8 @@ wheels = [ [[package]] name = "nvidia-resiliency-ext" -version = "0.6.0" -source = { git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git?rev=63154570cea17f8805a7fd15cc3b8cc2919ba575#63154570cea17f8805a7fd15cc3b8cc2919ba575" } +version = "0.6.0.dev33+15a8515" +source = { git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git?rev=15a851565a4ce846c04431ecb0cf09903ab4837e#15a851565a4ce846c04431ecb0cf09903ab4837e" } dependencies = [ { name = "defusedxml" }, { name = "grpcio" }, @@ -4818,8 +4818,10 @@ dependencies = [ { name = "mcp" }, { name = "nvidia-ml-py" }, { name = "packaging" }, + { name = "protobuf" }, { name = "psutil" }, { name = "pyyaml" }, + { name = "setproctitle" }, { name = "torch", version = "2.10.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin' or (extra == 'extra-7-nemo-rl-automodel' and extra == 'extra-7-nemo-rl-fsdp') or (extra == 'extra-7-nemo-rl-automodel' and extra == 'extra-7-nemo-rl-mcore') or (extra == 'extra-7-nemo-rl-automodel' and extra == 'extra-7-nemo-rl-sglang') or (extra == 'extra-7-nemo-rl-automodel' and extra == 'extra-7-nemo-rl-vllm') or (extra == 'extra-7-nemo-rl-fsdp' and extra == 'extra-7-nemo-rl-sglang') or (extra == 'extra-7-nemo-rl-mcore' and extra == 'extra-7-nemo-rl-sglang') or (extra == 'extra-7-nemo-rl-mcore' and extra == 'extra-7-nemo-rl-vllm') or (extra == 'extra-7-nemo-rl-sglang' and extra == 'extra-7-nemo-rl-vllm')" }, { name = "torch", version = "2.10.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin' or (extra == 'extra-7-nemo-rl-automodel' and extra == 'extra-7-nemo-rl-fsdp') or (extra == 'extra-7-nemo-rl-automodel' and extra == 'extra-7-nemo-rl-mcore') or (extra == 'extra-7-nemo-rl-automodel' and extra == 'extra-7-nemo-rl-sglang') or (extra == 'extra-7-nemo-rl-automodel' and extra == 'extra-7-nemo-rl-vllm') or (extra == 'extra-7-nemo-rl-fsdp' and extra == 'extra-7-nemo-rl-sglang') or (extra == 'extra-7-nemo-rl-mcore' and extra == 'extra-7-nemo-rl-sglang') or (extra == 'extra-7-nemo-rl-mcore' and extra == 'extra-7-nemo-rl-vllm') or (extra == 'extra-7-nemo-rl-sglang' and extra == 'extra-7-nemo-rl-vllm')" }, ]