Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 15 additions & 26 deletions deepmd/pt/model/descriptor/se_a.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,25 @@
from typing import (
Any,
ClassVar,
Final,
)

import numpy as np
import torch
import torch.nn as nn

from deepmd.dpmodel.utils import EnvMat as DPEnvMat
from deepmd.dpmodel.utils.seed import (
child_seed,
)
from deepmd.pt.model.descriptor import (
DescriptorBlock,
prod_env_mat,
)
from deepmd.pt.model.network.mlp import (
EmbeddingNet,
NetworkCollection,
)
from deepmd.pt.utils import (
env,
)
Expand All @@ -29,9 +35,18 @@
from deepmd.pt.utils.env_mat_stat import (
EnvMatStatSe,
)
from deepmd.pt.utils.exclude_mask import (
PairExcludeMask,
)
from deepmd.pt.utils.tabulate import (
DPTabulate,
)
from deepmd.pt.utils.update_sel import (
UpdateSel,
)
from deepmd.pt.utils.utils import (
ActivationFn,
)
from deepmd.utils.data_system import (
DeepmdDataSystem,
)
Expand All @@ -45,28 +60,6 @@
check_version_compatibility,
)

try:
from typing import (
Final,
)
except ImportError:
from torch.jit import Final

from deepmd.dpmodel.utils import EnvMat as DPEnvMat
from deepmd.pt.model.network.mlp import (
EmbeddingNet,
NetworkCollection,
)
from deepmd.pt.utils.exclude_mask import (
PairExcludeMask,
)
from deepmd.pt.utils.tabulate import (
DPTabulate,
)
from deepmd.pt.utils.utils import (
ActivationFn,
)

from .base_descriptor import (
BaseDescriptor,
)
Expand Down Expand Up @@ -776,7 +769,6 @@ def forward(

dmatrix = dmatrix.view(-1, self.nnei, 4)
nfnl = dmatrix.shape[0]
# pre-allocate a shape to pass jit
xyz_scatter = torch.zeros(
[nfnl, 4, self.filter_neuron[-1]],
dtype=self.prec,
Expand All @@ -790,9 +782,6 @@ def forward(
if self.type_one_side:
ii = embedding_idx
ti = -1
# torch.jit is not happy with slice(None)
# ti_mask = torch.ones(nfnl, dtype=torch.bool, device=dmatrix.device)
# applying a mask seems to cause performance degradation
ti_mask = None
else:
# ti: center atom type, ii: neighbor type...
Expand Down
187 changes: 187 additions & 0 deletions source/tests/pt/model/test_export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
import json
import os
import shutil
import unittest
from copy import (
deepcopy,
)
from pathlib import (
Path,
)

import torch
import torch.export

from deepmd.pt.entrypoints.main import (
get_trainer,
)
from deepmd.pt.infer import (
inference,
)
from deepmd.pt.model.descriptor.se_a import (
DescrptSeA,
)
from deepmd.pt.model.model import (
get_model,
)
from deepmd.pt.utils import (
env,
)

from .test_permutation import (
model_se_e2_a,
)


class TestExport(unittest.TestCase):
def setUp(self):
self.rcut = 6.0
self.rcut_smth = 5.0
self.sel = [4, 4]
self.neuron = [10, 10]
self.axis_neuron = 4

def test_export_descriptor_se_a(self):
"""Test DescrptSeA descriptor export."""
for type_one_side in [True, False]:
model = DescrptSeA(
rcut=self.rcut,
rcut_smth=self.rcut_smth,
sel=self.sel,
neuron=self.neuron,
axis_neuron=self.axis_neuron,
precision="float32",
trainable=False,
type_one_side=type_one_side,
)
model.eval()
Comment on lines +49 to +59
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Move descriptor to DEVICE before export

When CUDA is available, env.DEVICE becomes a GPU device, but the DescrptSeA module stays on CPU. The test builds inputs on env.DEVICE and then calls torch.export.export(model, ...), which will raise a device-mismatch error (CPU parameters vs. CUDA inputs). This means the new export test will fail on GPU runners. Moving the model to env.DEVICE (or forcing CPU inputs) avoids the mismatch.

Useful? React with 👍 / 👎.


nf = 2
nloc = 5
nnei = sum(self.sel)
nall = nloc + 10

coord_ext = torch.randn(nf, nall * 3, device=env.DEVICE)
atype_ext = torch.randint(
0, 2, (nf, nall), dtype=torch.int32, device=env.DEVICE
)
nlist = torch.randint(
0, nall, (nf, nloc, nnei), dtype=torch.int32, device=env.DEVICE
)

exported = torch.export.export(model, (coord_ext, atype_ext, nlist))
self.assertIsNotNone(exported)
Comment on lines +46 to +75
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Model not moved to device, potential device mismatch.

Same issue as in test_se_e2_a.py: the DescrptSeA model is created but not moved to env.DEVICE, while tensors are on env.DEVICE. This will fail on GPU.

Proposed fix
             model = DescrptSeA(
                 rcut=self.rcut,
                 rcut_smth=self.rcut_smth,
                 sel=self.sel,
                 neuron=self.neuron,
                 axis_neuron=self.axis_neuron,
                 precision="float32",
                 trainable=False,
                 type_one_side=type_one_side,
-            )
+            ).to(env.DEVICE)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def test_export_descriptor_se_a(self):
"""Test DescrptSeA descriptor export."""
for type_one_side in [True, False]:
model = DescrptSeA(
rcut=self.rcut,
rcut_smth=self.rcut_smth,
sel=self.sel,
neuron=self.neuron,
axis_neuron=self.axis_neuron,
precision="float32",
trainable=False,
type_one_side=type_one_side,
)
model.eval()
nf = 2
nloc = 5
nnei = sum(self.sel)
nall = nloc + 10
coord_ext = torch.randn(nf, nall * 3, device=env.DEVICE)
atype_ext = torch.randint(
0, 2, (nf, nall), dtype=torch.int32, device=env.DEVICE
)
nlist = torch.randint(
0, nall, (nf, nloc, nnei), dtype=torch.int32, device=env.DEVICE
)
exported = torch.export.export(model, (coord_ext, atype_ext, nlist))
self.assertIsNotNone(exported)
def test_export_descriptor_se_a(self):
"""Test DescrptSeA descriptor export."""
for type_one_side in [True, False]:
model = DescrptSeA(
rcut=self.rcut,
rcut_smth=self.rcut_smth,
sel=self.sel,
neuron=self.neuron,
axis_neuron=self.axis_neuron,
precision="float32",
trainable=False,
type_one_side=type_one_side,
).to(env.DEVICE)
model.eval()
nf = 2
nloc = 5
nnei = sum(self.sel)
nall = nloc + 10
coord_ext = torch.randn(nf, nall * 3, device=env.DEVICE)
atype_ext = torch.randint(
0, 2, (nf, nall), dtype=torch.int32, device=env.DEVICE
)
nlist = torch.randint(
0, nall, (nf, nloc, nnei), dtype=torch.int32, device=env.DEVICE
)
exported = torch.export.export(model, (coord_ext, atype_ext, nlist))
self.assertIsNotNone(exported)
🤖 Prompt for AI Agents
In `@source/tests/pt/model/test_export.py` around lines 45-74, the
test_export_descriptor_se_a test creates a DescrptSeA instance but never moves
it to env.DEVICE, while the input tensors are created on env.DEVICE; this
causes device-mismatch errors on GPU. Fix by moving the model to the same
device (call model.to(env.DEVICE)) immediately after the DescrptSeA
instantiation, before model.eval() and torch.export.export, so that the model,
coord_ext, atype_ext, and nlist are all on env.DEVICE.


def test_export_energy_model_se_a(self):
"""Test EnergyModel with se_e2_a descriptor export."""
model_params = {
"type_map": ["O", "H"],
"descriptor": {
"type": "se_e2_a",
"sel": [4, 4],
"rcut_smth": 0.50,
"rcut": 4.00,
"neuron": [10, 20],
"axis_neuron": 4,
},
"fitting_net": {
"neuron": [10, 10],
},
}
model = get_model(model_params).to(env.DEVICE)
model.eval()

nf = 1
nloc = 3
nall = 10
nnei = 8

coord_ext = torch.randn(nf, nall * 3, device=env.DEVICE)
atype_ext = torch.randint(
0, 2, (nf, nall), dtype=torch.int32, device=env.DEVICE
)
nlist = torch.randint(
0, nall, (nf, nloc, nnei), dtype=torch.int32, device=env.DEVICE
)

class ForwardLowerWrapper(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model

def forward(self, extended_coord, extended_atype, nlist):
return self.model.forward_lower(extended_coord, extended_atype, nlist)

wrapper = ForwardLowerWrapper(model)
exported = torch.export.export(wrapper, (coord_ext, atype_ext, nlist))
self.assertIsNotNone(exported)


class ExportIntegrationTest:
"""Integration test base for torch.export.export following JITTest pattern."""

def test_export(self) -> None:
trainer = get_trainer(deepcopy(self.config))
trainer.run()
tester = inference.Tester("./model.pt")
model = tester.model
model.eval()

# synthesizing dummy inputs
nf = 1
nloc = 10
nall = 20
descriptor = model.get_descriptor()
nnei = sum(descriptor.get_sel())

coord_ext = torch.randn(nf, nall * 3, device=env.DEVICE, dtype=descriptor.prec)
atype_ext = torch.randint(
0, descriptor.get_ntypes(), (nf, nall), device=env.DEVICE, dtype=torch.int32
)
nlist = torch.randint(
0, nall, (nf, nloc, nnei), device=env.DEVICE, dtype=torch.int32
)

class ForwardLowerWrapper(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model

def forward(self, extended_coord, extended_atype, nlist):
return self.model.forward_lower(extended_coord, extended_atype, nlist)
Comment on lines +147 to +153
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This ForwardLowerWrapper class is a duplicate of the one defined in test_export_energy_model_se_a starting at line 108. To avoid code duplication and improve maintainability, this class should be defined only once at the module level and then reused in both test methods. I suggest moving the other definition to the top of the file (e.g., after imports) and removing this one.


wrapper = ForwardLowerWrapper(model)
exported = torch.export.export(wrapper, (coord_ext, atype_ext, nlist))
self.assertIsNotNone(exported)

def tearDown(self) -> None:
for f in os.listdir("."):
if f.startswith("model") and f.endswith("pt"):
os.remove(f)
if f in ["lcurve.out", "frozen_model.pth"]:
os.remove(f)
if f in ["stat_files"]:
shutil.rmtree(f)
if f in ["checkpoint"]:
os.remove(f)
Comment on lines +159 to +168
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

# First, examine the test file to understand the context
cat -n source/tests/pt/model/test_export.py | sed -n '150,175p'

Repository: deepmodeling/deepmd-kit

Length of output: 1278


🏁 Script executed:

# Search for checkpoint creation and usage in test files
rg -n "checkpoint" source/tests/pt/ --type py -C 2 | head -50

Repository: deepmodeling/deepmd-kit

Length of output: 3797


🏁 Script executed:

# Check if there's any checkpoint directory creation or usage patterns
rg -n "checkpoint" source/pt/ --type py -C 2 | head -50

Repository: deepmodeling/deepmd-kit

Length of output: 117


🏁 Script executed:

# Find _file_delete function definition
rg -n "def _file_delete" source/tests/pt/ --type py

Repository: deepmodeling/deepmd-kit

Length of output: 322


🏁 Script executed:

# Search for checkpoint creation in the codebase to understand its nature
rg -n "checkpoint.*mkdir\|mkdir.*checkpoint\|os\.makedirs.*checkpoint" source/ --type py

Repository: deepmodeling/deepmd-kit

Length of output: 49


🏁 Script executed:

# Look at how training creates checkpoints
rg -n "checkpoint" source/deepmd/pt/ --type py -C 1 | head -80

Repository: deepmodeling/deepmd-kit

Length of output: 124


🏁 Script executed:

# Read _file_delete function
cat -n source/tests/pt/test_model_compression_se_a.py | sed -n '25,40p'

Repository: deepmodeling/deepmd-kit

Length of output: 725


🏁 Script executed:

# Check repository structure
ls -la source/ | head -20

Repository: deepmodeling/deepmd-kit

Length of output: 1165


🏁 Script executed:

# Search for where "checkpoint" is created as a directory in tests
rg -n "checkpoint" source/tests/pt/model/ --type py -B 2 -A 2

Repository: deepmodeling/deepmd-kit

Length of output: 729


🏁 Script executed:

# Search for where checkpoint directory is created
rg -n "checkpoint" source/tests/pt/model/ --type py | grep -E "mkdir|makedirs|mkdir_if_not_exist"

Repository: deepmodeling/deepmd-kit

Length of output: 49


🏁 Script executed:

# Check the training code to see if checkpoint is a directory
find source -name "*.py" -type f | xargs rg -l "checkpoint" | head -5 | xargs grep -l "mkdir\|makedirs" | head -3

Repository: deepmodeling/deepmd-kit

Length of output: 49


🏁 Script executed:

# Look for what checkpoint actually contains in training context
rg -n "os\.listdir\|os\.makedirs" source/tests/pt/model/test_export.py -B 5 -A 5

Repository: deepmodeling/deepmd-kit

Length of output: 49


🏁 Script executed:

# Read the entire test_export.py file to understand test setup
cat source/tests/pt/model/test_export.py

Repository: deepmodeling/deepmd-kit

Length of output: 5727


Use shutil.rmtree() for the checkpoint directory in cleanup.

Line 167 uses os.remove("checkpoint"), which only works for files. During training execution (line 136), the trainer creates a checkpoint directory to save model state. Attempting to remove a directory with os.remove() will raise IsADirectoryError. The fix should check the path type before removal, consistent with how stat_files is handled on line 165.

Proposed fix
             if f in ["stat_files"]:
                 shutil.rmtree(f)
-            if f in ["checkpoint"]:
-                os.remove(f)
+            if f == "checkpoint":
+                if os.path.isdir(f):
+                    shutil.rmtree(f)
+                else:
+                    os.remove(f)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def tearDown(self) -> None:
for f in os.listdir("."):
if f.startswith("model") and f.endswith("pt"):
os.remove(f)
if f in ["lcurve.out", "frozen_model.pth"]:
os.remove(f)
if f in ["stat_files"]:
shutil.rmtree(f)
if f in ["checkpoint"]:
os.remove(f)
def tearDown(self) -> None:
for f in os.listdir("."):
if f.startswith("model") and f.endswith("pt"):
os.remove(f)
if f in ["lcurve.out", "frozen_model.pth"]:
os.remove(f)
if f in ["stat_files"]:
shutil.rmtree(f)
if f == "checkpoint":
if os.path.isdir(f):
shutil.rmtree(f)
else:
os.remove(f)
🤖 Prompt for AI Agents
In `@source/tests/pt/model/test_export.py` around lines 158-167, the tearDown
method attempts to remove "checkpoint" with os.remove, which fails when the
path is a directory. Update the cleanup logic in tearDown to check whether
"checkpoint" is a directory via os.path.isdir("checkpoint") and, if so, remove
it with shutil.rmtree (mirroring how "stat_files" is handled); otherwise fall
back to os.remove for the file case.



class TestEnergyModelSeAIntegrationExport(unittest.TestCase, ExportIntegrationTest):
def setUp(self) -> None:
input_json = str(Path(__file__).parent / "water/se_atten.json")
with open(input_json) as f:
self.config = json.load(f)
data_file = [str(Path(__file__).parent / "water/data/data_0")]
self.config["training"]["training_data"]["systems"] = data_file
self.config["training"]["validation_data"]["systems"] = data_file
self.config["model"] = deepcopy(model_se_e2_a)
self.config["training"]["numb_steps"] = 2
self.config["training"]["save_freq"] = 2

def tearDown(self) -> None:
ExportIntegrationTest.tearDown(self)


if __name__ == "__main__":
unittest.main()
17 changes: 0 additions & 17 deletions source/tests/pt/model/test_jit.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
model_dpa1,
model_dpa2,
model_hybrid,
model_se_e2_a,
)


Expand All @@ -47,22 +46,6 @@ def tearDown(self) -> None:
os.remove(f)


class TestEnergyModelSeA(unittest.TestCase, JITTest):
def setUp(self) -> None:
input_json = str(Path(__file__).parent / "water/se_atten.json")
with open(input_json) as f:
self.config = json.load(f)
data_file = [str(Path(__file__).parent / "water/data/data_0")]
self.config["training"]["training_data"]["systems"] = data_file
self.config["training"]["validation_data"]["systems"] = data_file
self.config["model"] = deepcopy(model_se_e2_a)
self.config["training"]["numb_steps"] = 10
self.config["training"]["save_freq"] = 10

def tearDown(self) -> None:
JITTest.tearDown(self)


class TestDOSModelSeA(unittest.TestCase, JITTest):
def setUp(self) -> None:
input_json = str(Path(__file__).parent.parent / "dos/input.json")
Expand Down
17 changes: 10 additions & 7 deletions source/tests/pt/model/test_se_e2_a.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def test_consistency(
err_msg=err_msg,
)

def test_jit(
def test_export(
self,
) -> None:
rng = np.random.default_rng(GLOBAL_SEED)
Expand All @@ -114,13 +114,12 @@ def test_jit(
dstd = rng.normal(size=(self.nt, nnei, 4))
dstd = 0.1 + np.abs(dstd)

for idt, prec in itertools.product(
for idt, prec, type_one_side in itertools.product(
[False, True],
["float64", "float32"],
[False, True],
):
dtype = PRECISION_DICT[prec]
rtol, atol = get_tols(prec)
err_msg = f"idt={idt} prec={prec}"
# sea new impl
dd0 = DescrptSeA(
self.rcut,
Expand All @@ -129,9 +128,13 @@ def test_jit(
precision=prec,
resnet_dt=idt,
seed=GLOBAL_SEED,
type_one_side=type_one_side,
)
Comment on lines 128 to 132
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Export test mixes CPU model with DEVICE inputs

This test constructs coord_ext, atype_ext, and nlist on env.DEVICE, but the DescrptSeA instance remains on CPU. On GPU-capable environments, torch.export.export(dd0, ...) will error because module parameters and buffers are on CPU while inputs are on CUDA. This makes the test non-portable across CPU/GPU CI. Ensure the model is moved to env.DEVICE (or create CPU inputs) before exporting.

Useful? React with 👍 / 👎.

dd0.sea.mean = torch.tensor(davg, dtype=dtype, device=env.DEVICE)
dd0.sea.dstd = torch.tensor(dstd, dtype=dtype, device=env.DEVICE)
dd1 = DescrptSeA.deserialize(dd0.serialize())
model = torch.jit.script(dd0)
model = torch.jit.script(dd1)
coord_ext = torch.tensor(self.coord_ext, dtype=dtype, device=env.DEVICE)
atype_ext = torch.tensor(self.atype_ext, dtype=int, device=env.DEVICE)
nlist = torch.tensor(self.nlist, dtype=int, device=env.DEVICE)

# torch.export.export
_ = torch.export.export(dd0, (coord_ext, atype_ext, nlist))
Loading