diff --git a/3rdparty/Megatron-Bridge-workspace/Megatron-Bridge b/3rdparty/Megatron-Bridge-workspace/Megatron-Bridge
index 7110a96427..95e5f38f87 160000
--- a/3rdparty/Megatron-Bridge-workspace/Megatron-Bridge
+++ b/3rdparty/Megatron-Bridge-workspace/Megatron-Bridge
@@ -1 +1 @@
-Subproject commit 7110a964272a5c74dcb6b680b691087e190c220c
+Subproject commit 95e5f38f8727c4ab30830559c68939f35f4e52f6
diff --git a/3rdparty/Megatron-Bridge-workspace/setup.py b/3rdparty/Megatron-Bridge-workspace/setup.py
index 397dde2c7d..871557a27c 100644
--- a/3rdparty/Megatron-Bridge-workspace/setup.py
+++ b/3rdparty/Megatron-Bridge-workspace/setup.py
@@ -56,7 +56,7 @@
     "flash-linear-attention",
     "timm",
     "open-clip-torch>=3.2.0",
-    "mlflow>=3.5.0",
+    "mlflow>=3.9.0",
     "comet-ml>=3.50.0",
     "torch>=2.6.0",
 ]
diff --git a/3rdparty/Megatron-LM-workspace/Megatron-LM b/3rdparty/Megatron-LM-workspace/Megatron-LM
index 17a67b9a97..d30c3ae546 160000
--- a/3rdparty/Megatron-LM-workspace/Megatron-LM
+++ b/3rdparty/Megatron-LM-workspace/Megatron-LM
@@ -1 +1 @@
-Subproject commit 17a67b9a97fb11a75933fd7f76ad76e1ac98a53d
+Subproject commit d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81
diff --git a/3rdparty/Megatron-LM-workspace/setup.py b/3rdparty/Megatron-LM-workspace/setup.py
index d6339e726a..75b5831fb4 100644
--- a/3rdparty/Megatron-LM-workspace/setup.py
+++ b/3rdparty/Megatron-LM-workspace/setup.py
@@ -51,7 +51,7 @@
     # TODO(https://github.com/NVIDIA-NeMo/RL/issues/2111): upgrade to core_cu13 when we move to CUDA 13 base container
     "transformer-engine[pytorch,core_cu12]",
     # VCS dependency - must match pyproject.toml [tool.uv.sources]
-    "nvidia-resiliency-ext @ git+https://github.com/NVIDIA/nvidia-resiliency-ext.git@63154570cea17f8805a7fd15cc3b8cc2919ba575",
+    "nvidia-resiliency-ext @ git+https://github.com/NVIDIA/nvidia-resiliency-ext.git@15a851565a4ce846c04431ecb0cf09903ab4837e",
     "tqdm",
     "einops~=0.8",
     "tensorstore~=0.1,!=0.1.46,!=0.1.72",
diff --git a/nemo_rl/models/generation/vllm/vllm_backend.py b/nemo_rl/models/generation/vllm/vllm_backend.py
index 9237788be1..05c9b837c9 100644
--- a/nemo_rl/models/generation/vllm/vllm_backend.py
+++ b/nemo_rl/models/generation/vllm/vllm_backend.py
@@ -37,6 +37,20 @@
     )
 
 
+def fix_gpt_oss_export_transpose(key: str, weight: torch.Tensor) -> torch.Tensor:
+    """Transpose GPT-OSS expert down_proj weights; return other weights unchanged.
+
+    Workaround for the down_proj layout differing across frameworks: HF and
+    Megatron need [in, out], while vLLM needs [out, in]. Only keys ending with
+    "mlp.experts.down_proj" are affected: their last two dims are swapped and
+    the result is made contiguous. Any other key returns `weight` as passed in.
+    See https://github.com/NVIDIA-NeMo/Megatron-Bridge/pull/3271 for more details.
+    """
+    if key.endswith("mlp.experts.down_proj"):
+        weight = weight.transpose(-2, -1).contiguous()
+    return weight
+
+
 class VllmInternalWorkerExtension:
     def init_collective(
         self,
@@ -199,20 +213,30 @@ def update_weights_via_ipc_zmq(self) -> bool:
                     shape, dtype = self.state_dict_info[key]  # pyrefly
                     if isinstance(shape, list):
                         shape = torch.Size(shape)
+
+                    # Get the weight from the buffer
                     size_in_bytes = dtype.itemsize * shape.numel()
-                    weights.append(
-                        (
-                            key,
-                            buffer[offset : offset + size_in_bytes]
-                            .view(dtype=dtype)
-                            .view(shape),
-                        )
+                    weight = (
+                        buffer[offset : offset + size_in_bytes]
+                        .view(dtype=dtype)
+                        .view(shape)
                     )
+                    # apply gpt-oss transpose fix
+                    if (
+                        "GptOssForCausalLM"
+                        in self.model_runner.vllm_config.model_config.architectures
+                    ):
+                        weight = fix_gpt_oss_export_transpose(key, weight)
+                    weights.append((key, weight))
+
+                    # Move offset to the next weight
                     aligned_size = calculate_aligned_size(size_in_bytes)
                     offset += aligned_size
+
                 assert offset == used_bytes, (
                     "Offset is not equal to used bytes, usually indicate inaccurate info like keys or cached dtype in state_dict_info"
                 )
+
                 # Load weights into the model
                 from nemo_rl.models.generation.vllm.quantization import fp8
 
@@ -276,6 +300,15 @@ def _load_model_weights(weights, model_runner):
             """
             from nemo_rl.models.generation.vllm.quantization import fp8
 
+            # apply gpt-oss transpose fix
+            if (
+                "GptOssForCausalLM"
+                in self.model_runner.vllm_config.model_config.architectures
+            ):
+                for idx, (key, weight) in enumerate(weights):
+                    weight = fix_gpt_oss_export_transpose(key, weight)
+                    weights[idx] = (key, weight)
+
             policy_weights, draft_weights = self._split_policy_and_draft_weights(
                 weights
             )
diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py
index 49d9007fcd..fc5c6c44fa 100644
--- a/nemo_rl/models/megatron/setup.py
+++ b/nemo_rl/models/megatron/setup.py
@@ -142,29 +142,6 @@ def destroy_parallel_state():
     except ImportError:
         pass
 
-    # Reset the third global async_calls instance in base strategy module
-    try:
-        import megatron.core.dist_checkpointing.strategies.base as base_strategy
-        from megatron.core.dist_checkpointing.strategies.async_utils import (
-            AsyncCallsQueue,
-        )
-
-        # Clean up and reset the global async_calls in base strategy
-        old_call_idx = getattr(base_strategy.async_calls, "call_idx", None)
-        num_unfinalized = base_strategy.async_calls.get_num_unfinalized_calls()
-        if num_unfinalized > 0:
-            print(
-                f"[WARNING] Resetting base strategy async_calls with {num_unfinalized} unfinalized calls"
-            )
-        try:
-            base_strategy.async_calls.close()
-        except:
-            pass
-        base_strategy.async_calls = AsyncCallsQueue()
-        print(f"[DEBUG] Reset base strategy async_calls (old call_idx: {old_call_idx})")
-    except ImportError:
-        pass
-
 
 def setup_distributed() -> None:
     """Handle NCCL settings, dtype mapping, and basic config setup."""
diff --git a/pyproject.toml b/pyproject.toml
index 284d782030..4a3574dc10 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -497,7 +497,7 @@ requires-dist = [
   "flash-linear-attention",
   "timm",
   "open-clip-torch>=3.2.0",
-  "mlflow>=3.5.0",
+  "mlflow>=3.9.0",
   "comet-ml>=3.50.0",
   "torch>=2.6.0",
 ]
diff --git a/uv.lock b/uv.lock
index 9f32f86af7..55021a6873 100644
--- a/uv.lock
+++ b/uv.lock
@@ -146,7 +146,7 @@ requires-dist = ["torch", "packaging", "ninja", "causal-conv1d"]
 [[manifest.dependency-metadata]]
 name = "megatron-bridge"
 version = "0.0.0"
-requires-dist = ["transformers>=5.0.0,<=5.3.0", "peft>=0.18.1", "datasets>=2.20.0", "accelerate", "diffusers>=0.36.0", "peft>=0.18.0", "einops", "imageio", "imageio-ffmpeg", "omegaconf>=2.3.0", "tensorboard>=2.19.0", "typing-extensions", "rich", "wandb>=0.25.0", "six>=1.17.0", "regex>=2024.11.6", "pyyaml>=6.0.2", "tqdm>=4.67.1", "hydra-core>1.3,<=1.3.2", "qwen-vl-utils", "transformer-engine[pytorch,core-cu12]", "mamba-ssm", "nvidia-resiliency-ext", "causal-conv1d", "flash-linear-attention", "timm", "open-clip-torch>=3.2.0", "mlflow>=3.5.0", "comet-ml>=3.50.0", "torch>=2.6.0"]
+requires-dist = ["transformers>=5.0.0,<=5.3.0", "peft>=0.18.1", "datasets>=2.20.0", "accelerate", "diffusers>=0.36.0", "peft>=0.18.0", "einops", "imageio", "imageio-ffmpeg", "omegaconf>=2.3.0", "tensorboard>=2.19.0", "typing-extensions", "rich", "wandb>=0.25.0", "six>=1.17.0", "regex>=2024.11.6", "pyyaml>=6.0.2", "tqdm>=4.67.1", "hydra-core>1.3,<=1.3.2", "qwen-vl-utils", "transformer-engine[pytorch,core-cu12]", "mamba-ssm", "nvidia-resiliency-ext", "causal-conv1d", "flash-linear-attention", "timm", "open-clip-torch>=3.2.0", "mlflow>=3.9.0", "comet-ml>=3.50.0", "torch>=2.6.0"]
 
 [[manifest.dependency-metadata]]
 name = "nv-grouped-gemm"
@@ -3309,7 +3309,7 @@ requires-dist = [
     { name = "imageio" },
     { name = "imageio-ffmpeg" },
     { name = "mamba-ssm" },
-    { name = "mlflow", specifier = ">=3.5.0" },
+    { name = "mlflow", specifier = ">=3.9.0" },
     { name = "nvidia-resiliency-ext" },
     { name = "omegaconf", specifier = ">=2.3.0" },
     { name = "open-clip-torch", specifier = ">=3.2.0" },
@@ -3381,7 +3381,7 @@ requires-dist = [
     { name = "multi-storage-client", specifier = "~=0.27" },
     { name = "numpy" },
     { name = "nvidia-modelopt", extras = ["torch"], marker = "sys_platform != 'darwin'" },
-    { name = "nvidia-resiliency-ext", git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git?rev=63154570cea17f8805a7fd15cc3b8cc2919ba575" },
+    { name = "nvidia-resiliency-ext", git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git?rev=15a851565a4ce846c04431ecb0cf09903ab4837e" },
     { name = "nvtx", specifier = "~=0.2" },
     { name = "onnxscript" },
     { name = "openai", extras = ["aiohttp"] },
@@ -4807,8 +4807,8 @@ wheels = [
 
 [[package]]
 name = "nvidia-resiliency-ext"
-version = "0.6.0"
-source = { git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git?rev=63154570cea17f8805a7fd15cc3b8cc2919ba575#63154570cea17f8805a7fd15cc3b8cc2919ba575" }
+version = "0.6.0.dev33+15a8515"
+source = { git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git?rev=15a851565a4ce846c04431ecb0cf09903ab4837e#15a851565a4ce846c04431ecb0cf09903ab4837e" }
 dependencies = [
     { name = "defusedxml" },
     { name = "grpcio" },
@@ -4818,8 +4818,10 @@ dependencies = [
     { name = "mcp" },
     { name = "nvidia-ml-py" },
     { name = "packaging" },
+    { name = "protobuf" },
     { name = "psutil" },
     { name = "pyyaml" },
+    { name = "setproctitle" },
     { name = "torch", version = "2.10.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin' or (extra == 'extra-7-nemo-rl-automodel' and extra == 'extra-7-nemo-rl-fsdp') or (extra == 'extra-7-nemo-rl-automodel' and extra == 'extra-7-nemo-rl-mcore') or (extra == 'extra-7-nemo-rl-automodel' and extra == 'extra-7-nemo-rl-sglang') or (extra == 'extra-7-nemo-rl-automodel' and extra == 'extra-7-nemo-rl-vllm') or (extra == 'extra-7-nemo-rl-fsdp' and extra == 'extra-7-nemo-rl-sglang') or (extra == 'extra-7-nemo-rl-mcore' and extra == 'extra-7-nemo-rl-sglang') or (extra == 'extra-7-nemo-rl-mcore' and extra == 'extra-7-nemo-rl-vllm') or (extra == 'extra-7-nemo-rl-sglang' and extra == 'extra-7-nemo-rl-vllm')" },
     { name = "torch", version = "2.10.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin' or (extra == 'extra-7-nemo-rl-automodel' and extra == 'extra-7-nemo-rl-fsdp') or (extra == 'extra-7-nemo-rl-automodel' and extra == 'extra-7-nemo-rl-mcore') or (extra == 'extra-7-nemo-rl-automodel' and extra == 'extra-7-nemo-rl-sglang') or (extra == 'extra-7-nemo-rl-automodel' and extra == 'extra-7-nemo-rl-vllm') or (extra == 'extra-7-nemo-rl-fsdp' and extra == 'extra-7-nemo-rl-sglang') or (extra == 'extra-7-nemo-rl-mcore' and extra == 'extra-7-nemo-rl-sglang') or (extra == 'extra-7-nemo-rl-mcore' and extra == 'extra-7-nemo-rl-vllm') or (extra == 'extra-7-nemo-rl-sglang' and extra == 'extra-7-nemo-rl-vllm')" },
 ]