From bdeba127c06485769557575ec4b6ea60641e9fe1 Mon Sep 17 00:00:00 2001
From: svc-bionemo <267129667+svc-bionemo@users.noreply.github.com>
Date: Mon, 30 Mar 2026 19:39:52 +0000
Subject: [PATCH 1/2] Remove BIONEMO-2840 sm120 fused attention workarounds

The THD implementation for fused attention on sm120 (Blackwell) is now
available in Transformer Engine, so these workarounds are no longer needed.

Removes:
- pytest.xfail guards for sm120 in the golden-value THD tests (6 files:
  five test_modeling_common.py copies and test_modeling_codonfm_te.py)
- monkeypatch.setenv("NVTE_FUSED_ATTN", "0") blocks in esm2 recipe tests
- Unused monkeypatch parameters and torch import where applicable

Signed-off-by: svc-bionemo <267129667+svc-bionemo@users.noreply.github.com>
---
 .../tests/common/test_modeling_common.py      |  2 -
 .../codonfm/tests/test_modeling_codonfm_te.py |  2 -
 .../esm2/tests/common/test_modeling_common.py |  2 -
 .../tests/common/test_modeling_common.py      |  2 -
 .../tests/common/test_modeling_common.py      |  2 -
 .../qwen/tests/common/test_modeling_common.py |  2 -
 .../esm2_native_te/tests/test_train.py        | 54 ++++---------------
 .../esm2_peft_te/tests/test_train_lora.py     |  8 +--
 8 files changed, 10 insertions(+), 64 deletions(-)

diff --git a/bionemo-recipes/models/codonfm/tests/common/test_modeling_common.py b/bionemo-recipes/models/codonfm/tests/common/test_modeling_common.py
index 45ff3021f4..90f674dc3c 100644
--- a/bionemo-recipes/models/codonfm/tests/common/test_modeling_common.py
+++ b/bionemo-recipes/models/codonfm/tests/common/test_modeling_common.py
@@ -724,8 +724,6 @@ def test_golden_values_thd(self, te_attn_backend):
 
         if te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 8:
             pytest.xfail("On Ada and Ampere, no THD implementation is available for fused attn.")
-        elif te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 12:
-            pytest.xfail("BIONEMO-2840: On sm120, the THD implementation is not available for fused attn.")
 
         input_data_bshd = self.get_test_input_data(format="bshd")
         input_data_thd = self.get_test_input_data(format="thd")
diff --git a/bionemo-recipes/models/codonfm/tests/test_modeling_codonfm_te.py b/bionemo-recipes/models/codonfm/tests/test_modeling_codonfm_te.py
index b46cc0d6b3..0b00e7b9c5 100644
--- a/bionemo-recipes/models/codonfm/tests/test_modeling_codonfm_te.py
+++ b/bionemo-recipes/models/codonfm/tests/test_modeling_codonfm_te.py
@@ -328,8 +328,6 @@ def test_golden_values_thd(self, te_attn_backend):
 
         if te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 8:
             pytest.xfail("On Ada and Ampere, no THD implementation is available for fused attn.")
-        elif te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 12:
-            pytest.xfail("BIONEMO-2840: On sm120, the THD implementation is not available for fused attn.")
 
         golden_dir = Path(__file__).parent
         golden_sd_path = golden_dir / "golden_state_dict.safetensors"
diff --git a/bionemo-recipes/models/esm2/tests/common/test_modeling_common.py b/bionemo-recipes/models/esm2/tests/common/test_modeling_common.py
index daa4dbd900..89b86f67e5 100644
--- a/bionemo-recipes/models/esm2/tests/common/test_modeling_common.py
+++ b/bionemo-recipes/models/esm2/tests/common/test_modeling_common.py
@@ -718,8 +718,6 @@ def test_golden_values_thd(self, te_attn_backend):
 
         if te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 8:
             pytest.xfail("On Ada and Ampere, no THD implementation is available for fused attn.")
-        elif te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 12:
-            pytest.xfail("BIONEMO-2840: On sm120, the THD implementation is not available for fused attn.")
 
         input_data_bshd = self.get_test_input_data(format="bshd")
         input_data_thd = self.get_test_input_data(format="thd")
diff --git a/bionemo-recipes/models/llama3/tests/common/test_modeling_common.py b/bionemo-recipes/models/llama3/tests/common/test_modeling_common.py
index 45ff3021f4..90f674dc3c 100644
--- a/bionemo-recipes/models/llama3/tests/common/test_modeling_common.py
+++ b/bionemo-recipes/models/llama3/tests/common/test_modeling_common.py
@@ -724,8 +724,6 @@ def test_golden_values_thd(self, te_attn_backend):
 
         if te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 8:
             pytest.xfail("On Ada and Ampere, no THD implementation is available for fused attn.")
-        elif te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 12:
-            pytest.xfail("BIONEMO-2840: On sm120, the THD implementation is not available for fused attn.")
 
         input_data_bshd = self.get_test_input_data(format="bshd")
         input_data_thd = self.get_test_input_data(format="thd")
diff --git a/bionemo-recipes/models/mixtral/tests/common/test_modeling_common.py b/bionemo-recipes/models/mixtral/tests/common/test_modeling_common.py
index 45ff3021f4..90f674dc3c 100644
--- a/bionemo-recipes/models/mixtral/tests/common/test_modeling_common.py
+++ b/bionemo-recipes/models/mixtral/tests/common/test_modeling_common.py
@@ -724,8 +724,6 @@ def test_golden_values_thd(self, te_attn_backend):
 
         if te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 8:
             pytest.xfail("On Ada and Ampere, no THD implementation is available for fused attn.")
-        elif te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 12:
-            pytest.xfail("BIONEMO-2840: On sm120, the THD implementation is not available for fused attn.")
 
         input_data_bshd = self.get_test_input_data(format="bshd")
         input_data_thd = self.get_test_input_data(format="thd")
diff --git a/bionemo-recipes/models/qwen/tests/common/test_modeling_common.py b/bionemo-recipes/models/qwen/tests/common/test_modeling_common.py
index 45ff3021f4..90f674dc3c 100644
--- a/bionemo-recipes/models/qwen/tests/common/test_modeling_common.py
+++ b/bionemo-recipes/models/qwen/tests/common/test_modeling_common.py
@@ -724,8 +724,6 @@ def test_golden_values_thd(self, te_attn_backend):
 
         if te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 8:
             pytest.xfail("On Ada and Ampere, no THD implementation is available for fused attn.")
-        elif te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 12:
-            pytest.xfail("BIONEMO-2840: On sm120, the THD implementation is not available for fused attn.")
 
         input_data_bshd = self.get_test_input_data(format="bshd")
         input_data_thd = self.get_test_input_data(format="thd")
diff --git a/bionemo-recipes/recipes/esm2_native_te/tests/test_train.py b/bionemo-recipes/recipes/esm2_native_te/tests/test_train.py
index 792ac7e804..aaf8d2c56b 100644
--- a/bionemo-recipes/recipes/esm2_native_te/tests/test_train.py
+++ b/bionemo-recipes/recipes/esm2_native_te/tests/test_train.py
@@ -364,12 +364,8 @@ def test_sanity_convergence_fsdp2_fp8_and_model_init(tmp_path, recipe_path):
     assert final_loss < 3.0, f"Final loss {final_loss} is too high"
 
 
-def test_sanity_convergence_fsdp2_thd(tmp_path, monkeypatch, recipe_path):
+def test_sanity_convergence_fsdp2_thd(tmp_path, recipe_path):
     """For FSDP2, we check that the script can run successfully with FP8 and check convergence."""
-    if torch.cuda.get_device_capability() == (12, 0):
-        # TODO(BIONEMO-2840): On sm120, we need to set NVTE_FUSED_ATTN to 0 since TE will choose fused attn by default,
-        # but it's missing this THD implementation.
-        monkeypatch.setenv("NVTE_FUSED_ATTN", "0")
 
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
         sanity_config = compose(
@@ -386,12 +382,8 @@ def test_sanity_convergence_fsdp2_thd(tmp_path, monkeypatch, recipe_path):
 
 
 @requires_fp8
-def test_sanity_convergence_fsdp2_thd_fp8(tmp_path, monkeypatch, recipe_path):
+def test_sanity_convergence_fsdp2_thd_fp8(tmp_path, recipe_path):
     """For FSDP2, we check that the script can run successfully with THD + FP8 and check convergence."""
-    if torch.cuda.get_device_capability() == (12, 0):
-        # TODO(BIONEMO-2840): On sm120, we need to set NVTE_FUSED_ATTN to 0 since TE will choose fused attn by default,
-        # but it's missing this THD implementation.
-        monkeypatch.setenv("NVTE_FUSED_ATTN", "0")
 
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
         sanity_config = compose(
@@ -408,11 +400,7 @@ def test_sanity_convergence_fsdp2_thd_fp8(tmp_path, monkeypatch, recipe_path):
     assert final_loss < 3.0, f"Final loss {final_loss} is too high"
 
 
-def test_sanity_ddp_thd(tmp_path, monkeypatch, recipe_path):
-    if torch.cuda.get_device_capability() == (12, 0):
-        # TODO(BIONEMO-2840): On sm120, we need to set NVTE_FUSED_ATTN to 0 since TE will choose fused attn by default,
-        # but it's missing this THD implementation.
-        monkeypatch.setenv("NVTE_FUSED_ATTN", "0")
+def test_sanity_ddp_thd(tmp_path, recipe_path):
 
     # For DDP, we only check that the script can run successfully with THD, not convergence.
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
@@ -429,11 +417,7 @@ def test_sanity_ddp_thd(tmp_path, monkeypatch, recipe_path):
     main_ddp(sanity_config)
 
 
-def test_sanity_mfsdp_thd(tmp_path, monkeypatch, recipe_path):
-    if torch.cuda.get_device_capability() == (12, 0):
-        # TODO(BIONEMO-2840): On sm120, we need to set NVTE_FUSED_ATTN to 0 since TE will choose fused attn by default,
-        # but it's missing this THD implementation.
-        monkeypatch.setenv("NVTE_FUSED_ATTN", "0")
+def test_sanity_mfsdp_thd(tmp_path, recipe_path):
 
     # For MFSDP, we only check that the script can run successfully with THD, not convergence.
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
@@ -451,11 +435,7 @@ def test_sanity_mfsdp_thd(tmp_path, monkeypatch, recipe_path):
 
 
 @requires_fp8
-def test_sanity_ddp_thd_fp8(tmp_path, monkeypatch, recipe_path):
-    if torch.cuda.get_device_capability() == (12, 0):
-        # TODO(BIONEMO-2840): On sm120, we need to set NVTE_FUSED_ATTN to 0 since TE will choose fused attn by default,
-        # but it's missing this THD implementation.
-        monkeypatch.setenv("NVTE_FUSED_ATTN", "0")
+def test_sanity_ddp_thd_fp8(tmp_path, recipe_path):
 
     # For DDP, we only check that the script can run successfully with THD, not convergence.
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
@@ -474,11 +454,7 @@ def test_sanity_ddp_thd_fp8(tmp_path, monkeypatch, recipe_path):
 
 
 @requires_fp8
-def test_sanity_mfsdp_thd_fp8(tmp_path, monkeypatch, recipe_path):
-    if torch.cuda.get_device_capability() == (12, 0):
-        # TODO(BIONEMO-2840): On sm120, we need to set NVTE_FUSED_ATTN to 0 since TE will choose fused attn by default,
-        # but it's missing this THD implementation.
-        monkeypatch.setenv("NVTE_FUSED_ATTN", "0")
+def test_sanity_mfsdp_thd_fp8(tmp_path, recipe_path):
 
     # For MFSDP, we only check that the script can run successfully with THD, not convergence.
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
@@ -571,11 +547,7 @@ def test_sanity_convergence_fsdp2_huggingface_model(tmp_path, recipe_path):
     assert final_loss < 3.0, f"Final loss {final_loss} is too high"
 
 
-def test_sanity_ddp_thd_token_packing(tmp_path, monkeypatch, recipe_path):
-    if torch.cuda.get_device_capability() == (12, 0):
-        # TODO(BIONEMO-2840): On sm120, we need to set NVTE_FUSED_ATTN to 0 since TE will choose fused attn by default,
-        # but it's missing this THD implementation.
-        monkeypatch.setenv("NVTE_FUSED_ATTN", "0")
+def test_sanity_ddp_thd_token_packing(tmp_path, recipe_path):
 
     # For DDP, we only check that the script can run successfully with THD, not convergence.
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
@@ -592,11 +564,7 @@ def test_sanity_ddp_thd_token_packing(tmp_path, monkeypatch, recipe_path):
     main_ddp(sanity_config)
 
 
-def test_sanity_mfsdp_thd_token_packing(tmp_path, monkeypatch, recipe_path):
-    if torch.cuda.get_device_capability() == (12, 0):
-        # TODO(BIONEMO-2840): On sm120, we need to set NVTE_FUSED_ATTN to 0 since TE will choose fused attn by default,
-        # but it's missing this THD implementation.
-        monkeypatch.setenv("NVTE_FUSED_ATTN", "0")
+def test_sanity_mfsdp_thd_token_packing(tmp_path, recipe_path):
 
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
         sanity_config = compose(
@@ -612,11 +580,7 @@ def test_sanity_mfsdp_thd_token_packing(tmp_path, monkeypatch, recipe_path):
     main_mfsdp(sanity_config)
 
 
-def test_sanity_fsdp2_thd_token_packing(tmp_path, monkeypatch, recipe_path):
-    if torch.cuda.get_device_capability() == (12, 0):
-        # TODO(BIONEMO-2840): On sm120, we need to set NVTE_FUSED_ATTN to 0 since TE will choose fused attn by default,
-        # but it's missing this THD implementation.
-        monkeypatch.setenv("NVTE_FUSED_ATTN", "0")
+def test_sanity_fsdp2_thd_token_packing(tmp_path, recipe_path):
 
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
         sanity_config = compose(
diff --git a/bionemo-recipes/recipes/esm2_peft_te/tests/test_train_lora.py b/bionemo-recipes/recipes/esm2_peft_te/tests/test_train_lora.py
index eb0d4960b7..f00d647c73 100644
--- a/bionemo-recipes/recipes/esm2_peft_te/tests/test_train_lora.py
+++ b/bionemo-recipes/recipes/esm2_peft_te/tests/test_train_lora.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import torch
 from hydra import compose, initialize_config_dir
 
 from train_lora_ddp import main as main_ddp
@@ -54,12 +53,7 @@ def test_sanity_convergence_ddp_non_streaming_dataset(tmp_path, recipe_path):
     assert final_loss < 3.0, f"Final loss {final_loss} is too high"
 
 
-def test_sanity_ddp_thd(tmp_path, monkeypatch, recipe_path):
-    if torch.cuda.get_device_capability() == (12, 0):
-        # TODO(BIONEMO-2840): On sm120, we need to set NVTE_FUSED_ATTN to 0 since TE will choose fused attn by default,
-        # but it's missing this THD implementation.
-        monkeypatch.setenv("NVTE_FUSED_ATTN", "0")
-
+def test_sanity_ddp_thd(tmp_path, recipe_path):
     # For DDP, we only check that the script can run successfully with THD, not convergence.
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
         sanity_config = compose(

From af434ba47f4ac22beb511bdcfb0aebd181e74de9 Mon Sep 17 00:00:00 2001
From: svc-bionemo <267129667+svc-bionemo@users.noreply.github.com>
Date: Mon, 6 Apr 2026 09:46:40 -0700
Subject: [PATCH 2/2] fix: remove extra blank lines after function defs in
 test_train.py

Pre-commit (ruff format) flagged 7 functions with a blank line between
the def signature and the first body line. This commit removes those
blank lines.

Signed-off-by: svc-bionemo <267129667+svc-bionemo@users.noreply.github.com>
---
 bionemo-recipes/recipes/esm2_native_te/tests/test_train.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/bionemo-recipes/recipes/esm2_native_te/tests/test_train.py b/bionemo-recipes/recipes/esm2_native_te/tests/test_train.py
index aaf8d2c56b..75ae5f05e7 100644
--- a/bionemo-recipes/recipes/esm2_native_te/tests/test_train.py
+++ b/bionemo-recipes/recipes/esm2_native_te/tests/test_train.py
@@ -401,7 +401,6 @@ def test_sanity_convergence_fsdp2_thd_fp8(tmp_path, recipe_path):
 
 
 def test_sanity_ddp_thd(tmp_path, recipe_path):
-
     # For DDP, we only check that the script can run successfully with THD, not convergence.
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
         sanity_config = compose(
@@ -418,7 +417,6 @@ def test_sanity_ddp_thd(tmp_path, recipe_path):
 
 
 def test_sanity_mfsdp_thd(tmp_path, recipe_path):
-
     # For MFSDP, we only check that the script can run successfully with THD, not convergence.
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
         sanity_config = compose(
@@ -436,7 +434,6 @@ def test_sanity_mfsdp_thd(tmp_path, recipe_path):
 
 @requires_fp8
 def test_sanity_ddp_thd_fp8(tmp_path, recipe_path):
-
     # For DDP, we only check that the script can run successfully with THD, not convergence.
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
         sanity_config = compose(
@@ -455,7 +452,6 @@ def test_sanity_ddp_thd_fp8(tmp_path, recipe_path):
 
 @requires_fp8
 def test_sanity_mfsdp_thd_fp8(tmp_path, recipe_path):
-
     # For MFSDP, we only check that the script can run successfully with THD, not convergence.
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
         sanity_config = compose(
@@ -548,7 +544,6 @@ def test_sanity_convergence_fsdp2_huggingface_model(tmp_path, recipe_path):
 
 
 def test_sanity_ddp_thd_token_packing(tmp_path, recipe_path):
-
     # For DDP, we only check that the script can run successfully with THD, not convergence.
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
         sanity_config = compose(
@@ -565,7 +560,6 @@ def test_sanity_ddp_thd_token_packing(tmp_path, recipe_path):
 
 
 def test_sanity_mfsdp_thd_token_packing(tmp_path, recipe_path):
-
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
         sanity_config = compose(
             config_name="L0_sanity",
@@ -581,7 +575,6 @@ def test_sanity_mfsdp_thd_token_packing(tmp_path, recipe_path):
 
 
 def test_sanity_fsdp2_thd_token_packing(tmp_path, recipe_path):
-
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
         sanity_config = compose(
             config_name="L0_sanity",