From ddf48d127c4c4b400461367886f9618b22788292 Mon Sep 17 00:00:00 2001 From: hyunjunji Date: Thu, 25 Sep 2025 10:15:26 +0900 Subject: [PATCH] feat: add to_binary(), to_dict(), from_dict(); modify to_mol() --- datamol/__init__.py | 6 +++++ datamol/convert.py | 44 +++++++++++++++++++++++++++++++++++++ datamol/mol.py | 10 +++++---- tests/test_convert.py | 51 +++++++++++++++++++++++++++++++++++++++++++ tests/test_mol.py | 4 ++++ 5 files changed, 111 insertions(+), 4 deletions(-) diff --git a/datamol/__init__.py b/datamol/__init__.py index 769cf80c..c7fb998e 100644 --- a/datamol/__init__.py +++ b/datamol/__init__.py @@ -109,6 +109,9 @@ "render_mol_df": "datamol.convert", "to_inchi_non_standard": "datamol.convert", "to_inchikey_non_standard": "datamol.convert", + "to_dict": "datamol.convert", + "from_dict": "datamol.convert", + "to_binary": "datamol.convert", # fp "to_fp": "datamol.fp", "fp_to_array": "datamol.fp", @@ -305,6 +308,9 @@ def __dir__(): from .convert import render_mol_df from .convert import to_inchi_non_standard from .convert import to_inchikey_non_standard + from .convert import to_dict + from .convert import from_dict + from .convert import to_binary from .fp import to_fp from .fp import fp_to_array diff --git a/datamol/convert.py b/datamol/convert.py index 293f8df7..640f3b80 100644 --- a/datamol/convert.py +++ b/datamol/convert.py @@ -1,3 +1,4 @@ +from typing import Dict from typing import Union from typing import List from typing import Optional @@ -5,6 +6,7 @@ from typing import Sequence import re +import json from loguru import logger @@ -12,6 +14,7 @@ from rdkit import Chem from rdkit.Chem import rdmolfiles +from rdkit.Chem import rdMolInterchange from rdkit.Chem import PandasTools import selfies as sf @@ -361,6 +364,22 @@ def from_smarts(smarts: Optional[str]) -> Optional[Mol]: return Chem.MolFromSmarts(smarts) # type: ignore +def to_binary(mol: Mol) -> Optional[bytes]: + """Convert a mol to a binary string. 
+ + Note that the molecular information to be stored in the binary string + is dependent on the RDKit pickling options. + + Args: + mol: a molecule. + """ + + if mol is None: + return None + + return mol.ToBinary() # type: ignore + + def to_df( mols: Sequence[Mol], smiles_column: Optional[str] = "smiles", @@ -514,6 +533,31 @@ def render_mol_df(df: pd.DataFrame): _ChangeMoleculeRendering(df) +def to_dict(mols: Sequence[Mol]) -> Dict: + """Convert a list of mols to a dict by parsing the JSON produced by + RDKit's `rdMolInterchange.MolsToJSON()`. + + For the reverse operation, you might want to check `dm.from_dict()`. + + Args: + mols: a list of molecules. + """ + + return json.loads(rdMolInterchange.MolsToJSON(mols)) + + +def from_dict(mol_dict: Dict) -> List[Mol]: + """Convert a dict to a list of mols. + + For the reverse operation, you might want to check `dm.to_dict()`. + + Args: + mol_dict: a dict, such as the one returned by `dm.to_dict()`. + """ + + return rdMolInterchange.JSONToMols(json.dumps(mol_dict)) + + def _ChangeMoleculeRendering(frame=None, renderer="PNG"): """Allows to change the rendering of the molecules between base64 PNG images and string representations. diff --git a/datamol/mol.py b/datamol/mol.py index a2e0efae..dfd25431 100644 --- a/datamol/mol.py +++ b/datamol/mol.py @@ -58,7 +58,7 @@ def copy_mol(mol: Mol) -> Mol: def to_mol( - mol: Union[str, Mol], + mol: Union[str, bytes, Mol], add_hs: bool = False, explicit_only: bool = False, ordered: bool = False, @@ -72,7 +72,7 @@ def to_mol( """Convert an input molecule (smiles representation) into a `Mol`. Args: - mol: A SMILES or a molecule. + mol: A SMILES, a binary string from Mol.ToBinary(), or a molecule. add_hs: Whether hydrogens should be added the molecule after the SMILES has been parsed. explicit_only: Whether to only add explicit hydrogen or both (implicit and explicit). when `add_hs` is set to True. @@ -91,8 +91,8 @@ def to_mol( None is returned so make sure that you handle this case on your own. 
""" - if not isinstance(mol, (str, Mol)): - raise ValueError(f"Input should be a Mol or a string instead of '{type(mol)}'") + if not isinstance(mol, (str, bytes, Mol)): + raise ValueError(f"Input should be a Mol, a string, or bytes instead of '{type(mol)}'") if isinstance(mol, str): smiles_params = rdmolfiles.SmilesParserParams() @@ -106,6 +106,8 @@ def to_mol( if not sanitize and _mol is not None: _mol.UpdatePropertyCache(False) + elif isinstance(mol, bytes): + _mol = Chem.Mol(mol) else: _mol = mol diff --git a/tests/test_convert.py b/tests/test_convert.py index 3997f70e..ece520da 100644 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -135,6 +135,29 @@ def test_inchi(): assert dm.from_inchi(None) is None +def test_to_binary(datadir): + smiles = "CC(=O)Oc1ccccc1C(=O)O" + mol = dm.to_mol(smiles) + + binary_string = dm.to_binary(mol) + assert isinstance(binary_string, bytes) + new_mol = dm.to_mol(binary_string) + assert dm.same_mol(mol, new_mol) + + data_path = datadir / "TUBB3-observations.sdf" + mols = dm.read_sdf(data_path) + mol = mols[0] + + binary_string = dm.to_binary(mol) + assert isinstance(binary_string, bytes) + new_mol = dm.to_mol(binary_string) + assert dm.same_mol(mol, new_mol) + assert ( + np.sum(np.abs(mol.GetConformer(0).GetPositions() - new_mol.GetConformer(0).GetPositions())) + < 1e-5 + ) + + def test_to_df(datadir): data_path = datadir / "TUBB3-observations.sdf" mols = dm.read_sdf(data_path) @@ -218,6 +241,34 @@ def test_to_df_smiles_warning(datadir, caplog): ) +def test_to_dict(datadir): + data_path = datadir / "TUBB3-observations.sdf" + mols = dm.read_sdf(data_path) + mols_dict = dm.to_dict(mols) + + assert len(mols_dict["molecules"]) == 10 + for mol_dict in mols_dict["molecules"]: + assert "conformers" in mol_dict + assert "properties" in mol_dict + assert len(mol_dict["properties"]) == 11 + + +def test_from_dict(datadir): + data_path = datadir / "TUBB3-observations.sdf" + mols = dm.read_sdf(data_path) + mols_dict = dm.to_dict(mols) 
+ new_mols = dm.from_dict(mols_dict) + + for mol, new_mol in zip(mols, new_mols): + assert dm.same_mol(mol, new_mol) + assert ( + np.sum( + np.abs(mol.GetConformer(0).GetPositions() - new_mol.GetConformer(0).GetPositions()) + ) + < 1e-5 + ) + + def test_to_cxsmiles(): mol = dm.to_mol("OC1=CC2CCCCC2[N:1]=C1") smiles = dm.to_smiles(mol, cxsmiles=True) diff --git a/tests/test_mol.py b/tests/test_mol.py index ef77af8d..979454b4 100644 --- a/tests/test_mol.py +++ b/tests/test_mol.py @@ -23,6 +23,10 @@ def test_to_mol(): mol = dm.to_mol(smiles) assert mol is None + binary_string = b"\xef\xbe\xad\xde\x00\x00\x00\x00\x10\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x06\x00\x00\x00\x80\x01\x06@h\x00\x00\x00\x03\x03\x01\x06@h\x00\x00\x00\x03\x03\x01\x06@h\x00\x00\x00\x03\x03\x01\x06@h\x00\x00\x00\x03\x03\x01\x06@h\x00\x00\x00\x03\x03\x01\x06@h\x00\x00\x00\x03\x03\x01\x0b\x00\x01h\x0c\x01\x02h\x0c\x02\x03h\x0c\x03\x04h\x0c\x04\x05h\x0c\x05\x00h\x0cB\x01\x00\x00\x00\x06\x00\x05\x04\x03\x02\x01\x17\x04\x00\x00\x00\x00\x00\x00\x00\x16" + mol = dm.to_mol(binary_string) + assert mol.GetNumAtoms() == 6 + def test_reorder_atoms(): smiles = "c1ccc(C(=O)O)c(c1)OC(=O)C"