diff --git a/bionemo-recipes/models/codonfm/tests/common/test_modeling_common.py b/bionemo-recipes/models/codonfm/tests/common/test_modeling_common.py index 45ff3021f4..90f674dc3c 100644 --- a/bionemo-recipes/models/codonfm/tests/common/test_modeling_common.py +++ b/bionemo-recipes/models/codonfm/tests/common/test_modeling_common.py @@ -724,8 +724,6 @@ def test_golden_values_thd(self, te_attn_backend): if te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 8: pytest.xfail("On Ada and Ampere, no THD implementation is available for fused attn.") - elif te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 12: - pytest.xfail("BIONEMO-2840: On sm120, the THD implementation is not available for fused attn.") input_data_bshd = self.get_test_input_data(format="bshd") input_data_thd = self.get_test_input_data(format="thd") diff --git a/bionemo-recipes/models/codonfm/tests/test_modeling_codonfm_te.py b/bionemo-recipes/models/codonfm/tests/test_modeling_codonfm_te.py index b46cc0d6b3..0b00e7b9c5 100644 --- a/bionemo-recipes/models/codonfm/tests/test_modeling_codonfm_te.py +++ b/bionemo-recipes/models/codonfm/tests/test_modeling_codonfm_te.py @@ -328,8 +328,6 @@ def test_golden_values_thd(self, te_attn_backend): if te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 8: pytest.xfail("On Ada and Ampere, no THD implementation is available for fused attn.") - elif te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 12: - pytest.xfail("BIONEMO-2840: On sm120, the THD implementation is not available for fused attn.") golden_dir = Path(__file__).parent golden_sd_path = golden_dir / "golden_state_dict.safetensors" diff --git a/bionemo-recipes/models/esm2/tests/common/test_modeling_common.py b/bionemo-recipes/models/esm2/tests/common/test_modeling_common.py index daa4dbd900..89b86f67e5 100644 --- a/bionemo-recipes/models/esm2/tests/common/test_modeling_common.py +++ b/bionemo-recipes/models/esm2/tests/common/test_modeling_common.py @@ -718,8 +718,6 @@ def test_golden_values_thd(self, te_attn_backend): if te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 8: pytest.xfail("On Ada and Ampere, no THD implementation is available for fused attn.") - elif te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 12: - pytest.xfail("BIONEMO-2840: On sm120, the THD implementation is not available for fused attn.") input_data_bshd = self.get_test_input_data(format="bshd") input_data_thd = self.get_test_input_data(format="thd") diff --git a/bionemo-recipes/models/llama3/tests/common/test_modeling_common.py b/bionemo-recipes/models/llama3/tests/common/test_modeling_common.py index 45ff3021f4..90f674dc3c 100644 --- a/bionemo-recipes/models/llama3/tests/common/test_modeling_common.py +++ b/bionemo-recipes/models/llama3/tests/common/test_modeling_common.py @@ -724,8 +724,6 @@ def test_golden_values_thd(self, te_attn_backend): if te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 8: pytest.xfail("On Ada and Ampere, no THD implementation is available for fused attn.") - elif te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 12: - pytest.xfail("BIONEMO-2840: On sm120, the THD implementation is not available for fused attn.") input_data_bshd = self.get_test_input_data(format="bshd") input_data_thd = self.get_test_input_data(format="thd") diff --git a/bionemo-recipes/models/mixtral/tests/common/test_modeling_common.py b/bionemo-recipes/models/mixtral/tests/common/test_modeling_common.py index 45ff3021f4..90f674dc3c 100644 --- a/bionemo-recipes/models/mixtral/tests/common/test_modeling_common.py +++ b/bionemo-recipes/models/mixtral/tests/common/test_modeling_common.py @@ -724,8 +724,6 @@ def test_golden_values_thd(self, te_attn_backend): if te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 8: pytest.xfail("On Ada and Ampere, no THD implementation is available for fused attn.") - elif te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 12: - pytest.xfail("BIONEMO-2840: On sm120, the THD implementation is not available for fused attn.") input_data_bshd = self.get_test_input_data(format="bshd") input_data_thd = self.get_test_input_data(format="thd") diff --git a/bionemo-recipes/models/qwen/tests/common/test_modeling_common.py b/bionemo-recipes/models/qwen/tests/common/test_modeling_common.py index 45ff3021f4..90f674dc3c 100644 --- a/bionemo-recipes/models/qwen/tests/common/test_modeling_common.py +++ b/bionemo-recipes/models/qwen/tests/common/test_modeling_common.py @@ -724,8 +724,6 @@ def test_golden_values_thd(self, te_attn_backend): if te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 8: pytest.xfail("On Ada and Ampere, no THD implementation is available for fused attn.") - elif te_attn_backend == "fused_attn" and torch.cuda.get_device_capability()[0] == 12: - pytest.xfail("BIONEMO-2840: On sm120, the THD implementation is not available for fused attn.") input_data_bshd = self.get_test_input_data(format="bshd") input_data_thd = self.get_test_input_data(format="thd") diff --git a/bionemo-recipes/recipes/esm2_native_te/tests/test_train.py b/bionemo-recipes/recipes/esm2_native_te/tests/test_train.py index 792ac7e804..75ae5f05e7 100644 --- a/bionemo-recipes/recipes/esm2_native_te/tests/test_train.py +++ b/bionemo-recipes/recipes/esm2_native_te/tests/test_train.py @@ -364,12 +364,8 @@ def test_sanity_convergence_fsdp2_fp8_and_model_init(tmp_path, recipe_path): assert final_loss < 3.0, f"Final loss {final_loss} is too high" -def test_sanity_convergence_fsdp2_thd(tmp_path, monkeypatch, recipe_path): +def test_sanity_convergence_fsdp2_thd(tmp_path, recipe_path): """For FSDP2, we check that the script can run successfully with FP8 and check convergence.""" - if torch.cuda.get_device_capability() == (12, 0): - # TODO(BIONEMO-2840): On sm120, we need to set NVTE_FUSED_ATTN to 0 since TE will choose fused attn by default, - # but it's missing this THD implementation. - monkeypatch.setenv("NVTE_FUSED_ATTN", "0") with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"): sanity_config = compose( @@ -386,12 +382,8 @@ def test_sanity_convergence_fsdp2_thd(tmp_path, monkeypatch, recipe_path): @requires_fp8 -def test_sanity_convergence_fsdp2_thd_fp8(tmp_path, monkeypatch, recipe_path): +def test_sanity_convergence_fsdp2_thd_fp8(tmp_path, recipe_path): """For FSDP2, we check that the script can run successfully with THD + FP8 and check convergence.""" - if torch.cuda.get_device_capability() == (12, 0): - # TODO(BIONEMO-2840): On sm120, we need to set NVTE_FUSED_ATTN to 0 since TE will choose fused attn by default, - # but it's missing this THD implementation. - monkeypatch.setenv("NVTE_FUSED_ATTN", "0") with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"): sanity_config = compose( @@ -408,12 +400,7 @@ def test_sanity_convergence_fsdp2_thd_fp8(tmp_path, monkeypatch, recipe_path): assert final_loss < 3.0, f"Final loss {final_loss} is too high" -def test_sanity_ddp_thd(tmp_path, monkeypatch, recipe_path): - if torch.cuda.get_device_capability() == (12, 0): - # TODO(BIONEMO-2840): On sm120, we need to set NVTE_FUSED_ATTN to 0 since TE will choose fused attn by default, - # but it's missing this THD implementation. - monkeypatch.setenv("NVTE_FUSED_ATTN", "0") - +def test_sanity_ddp_thd(tmp_path, recipe_path): # For DDP, we only check that the script can run successfully with THD, not convergence. with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"): sanity_config = compose( @@ -429,12 +416,7 @@ def test_sanity_ddp_thd(tmp_path, monkeypatch, recipe_path): main_ddp(sanity_config) -def test_sanity_mfsdp_thd(tmp_path, monkeypatch, recipe_path): - if torch.cuda.get_device_capability() == (12, 0): - # TODO(BIONEMO-2840): On sm120, we need to set NVTE_FUSED_ATTN to 0 since TE will choose fused attn by default, - # but it's missing this THD implementation. - monkeypatch.setenv("NVTE_FUSED_ATTN", "0") - +def test_sanity_mfsdp_thd(tmp_path, recipe_path): # For MFSDP, we only check that the script can run successfully with THD, not convergence. with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"): sanity_config = compose( @@ -451,12 +433,7 @@ def test_sanity_mfsdp_thd(tmp_path, monkeypatch, recipe_path): @requires_fp8 -def test_sanity_ddp_thd_fp8(tmp_path, monkeypatch, recipe_path): - if torch.cuda.get_device_capability() == (12, 0): - # TODO(BIONEMO-2840): On sm120, we need to set NVTE_FUSED_ATTN to 0 since TE will choose fused attn by default, - # but it's missing this THD implementation. - monkeypatch.setenv("NVTE_FUSED_ATTN", "0") - +def test_sanity_ddp_thd_fp8(tmp_path, recipe_path): # For DDP, we only check that the script can run successfully with THD, not convergence. with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"): sanity_config = compose( @@ -474,12 +451,7 @@ def test_sanity_ddp_thd_fp8(tmp_path, monkeypatch, recipe_path): @requires_fp8 -def test_sanity_mfsdp_thd_fp8(tmp_path, monkeypatch, recipe_path): - if torch.cuda.get_device_capability() == (12, 0): - # TODO(BIONEMO-2840): On sm120, we need to set NVTE_FUSED_ATTN to 0 since TE will choose fused attn by default, - # but it's missing this THD implementation. - monkeypatch.setenv("NVTE_FUSED_ATTN", "0") - +def test_sanity_mfsdp_thd_fp8(tmp_path, recipe_path): # For MFSDP, we only check that the script can run successfully with THD, not convergence. with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"): sanity_config = compose( @@ -571,12 +543,7 @@ def test_sanity_convergence_fsdp2_huggingface_model(tmp_path, recipe_path): assert final_loss < 3.0, f"Final loss {final_loss} is too high" -def test_sanity_ddp_thd_token_packing(tmp_path, monkeypatch, recipe_path): - if torch.cuda.get_device_capability() == (12, 0): - # TODO(BIONEMO-2840): On sm120, we need to set NVTE_FUSED_ATTN to 0 since TE will choose fused attn by default, - # but it's missing this THD implementation. - monkeypatch.setenv("NVTE_FUSED_ATTN", "0") - +def test_sanity_ddp_thd_token_packing(tmp_path, recipe_path): # For DDP, we only check that the script can run successfully with THD, not convergence. with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"): sanity_config = compose( @@ -592,12 +559,7 @@ def test_sanity_ddp_thd_token_packing(tmp_path, monkeypatch, recipe_path): main_ddp(sanity_config) -def test_sanity_mfsdp_thd_token_packing(tmp_path, monkeypatch, recipe_path): - if torch.cuda.get_device_capability() == (12, 0): - # TODO(BIONEMO-2840): On sm120, we need to set NVTE_FUSED_ATTN to 0 since TE will choose fused attn by default, - # but it's missing this THD implementation. - monkeypatch.setenv("NVTE_FUSED_ATTN", "0") - +def test_sanity_mfsdp_thd_token_packing(tmp_path, recipe_path): with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"): sanity_config = compose( config_name="L0_sanity", @@ -612,12 +574,7 @@ def test_sanity_mfsdp_thd_token_packing(tmp_path, monkeypatch, recipe_path): main_mfsdp(sanity_config) -def test_sanity_fsdp2_thd_token_packing(tmp_path, monkeypatch, recipe_path): - if torch.cuda.get_device_capability() == (12, 0): - # TODO(BIONEMO-2840): On sm120, we need to set NVTE_FUSED_ATTN to 0 since TE will choose fused attn by default, - # but it's missing this THD implementation. - monkeypatch.setenv("NVTE_FUSED_ATTN", "0") - +def test_sanity_fsdp2_thd_token_packing(tmp_path, recipe_path): with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"): sanity_config = compose( config_name="L0_sanity", diff --git a/bionemo-recipes/recipes/esm2_peft_te/tests/test_train_lora.py b/bionemo-recipes/recipes/esm2_peft_te/tests/test_train_lora.py index eb0d4960b7..f00d647c73 100644 --- a/bionemo-recipes/recipes/esm2_peft_te/tests/test_train_lora.py +++ b/bionemo-recipes/recipes/esm2_peft_te/tests/test_train_lora.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch from hydra import compose, initialize_config_dir from train_lora_ddp import main as main_ddp @@ -54,12 +53,7 @@ def test_sanity_convergence_ddp_non_streaming_dataset(tmp_path, recipe_path): assert final_loss < 3.0, f"Final loss {final_loss} is too high" -def test_sanity_ddp_thd(tmp_path, monkeypatch, recipe_path): - if torch.cuda.get_device_capability() == (12, 0): - # TODO(BIONEMO-2840): On sm120, we need to set NVTE_FUSED_ATTN to 0 since TE will choose fused attn by default, - # but it's missing this THD implementation. - monkeypatch.setenv("NVTE_FUSED_ATTN", "0") - +def test_sanity_ddp_thd(tmp_path, recipe_path): # For DDP, we only check that the script can run successfully with THD, not convergence. with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"): sanity_config = compose(