From ddf48d127c4c4b400461367886f9618b22788292 Mon Sep 17 00:00:00 2001
From: hyunjunji <hyunjun.ji@standigm.com>
Date: Thu, 25 Sep 2025 10:15:26 +0900
Subject: [PATCH] feat: add to_binary(), to_dict(), from_dict(); modify
 to_mol()

---
 datamol/__init__.py   |  6 +++++
 datamol/convert.py    | 44 +++++++++++++++++++++++++++++++++++++
 datamol/mol.py        | 10 +++++----
 tests/test_convert.py | 51 +++++++++++++++++++++++++++++++++++++++++++
 tests/test_mol.py     |  4 ++++
 5 files changed, 111 insertions(+), 4 deletions(-)

diff --git a/datamol/__init__.py b/datamol/__init__.py
index 769cf80c..c7fb998e 100644
--- a/datamol/__init__.py
+++ b/datamol/__init__.py
@@ -109,6 +109,9 @@
     "render_mol_df": "datamol.convert",
     "to_inchi_non_standard": "datamol.convert",
     "to_inchikey_non_standard": "datamol.convert",
+    "to_dict": "datamol.convert",
+    "from_dict": "datamol.convert",
+    "to_binary": "datamol.convert",
     # fp
     "to_fp": "datamol.fp",
     "fp_to_array": "datamol.fp",
@@ -305,6 +308,9 @@ def __dir__():
     from .convert import render_mol_df
     from .convert import to_inchi_non_standard
     from .convert import to_inchikey_non_standard
+    from .convert import to_dict
+    from .convert import from_dict
+    from .convert import to_binary
 
     from .fp import to_fp
     from .fp import fp_to_array
diff --git a/datamol/convert.py b/datamol/convert.py
index 293f8df7..640f3b80 100644
--- a/datamol/convert.py
+++ b/datamol/convert.py
@@ -1,3 +1,4 @@
+from typing import Dict
 from typing import Union
 from typing import List
 from typing import Optional
@@ -5,6 +6,7 @@
 from typing import Sequence
 
 import re
+import json
 
 from loguru import logger
 
@@ -12,6 +14,7 @@
 
 from rdkit import Chem
 from rdkit.Chem import rdmolfiles
+from rdkit.Chem import rdMolInterchange
 from rdkit.Chem import PandasTools
 
 import selfies as sf
@@ -361,6 +364,22 @@ def from_smarts(smarts: Optional[str]) -> Optional[Mol]:
     return Chem.MolFromSmarts(smarts)  # type: ignore
 
 
+def to_binary(mol: Mol) -> Optional[bytes]:
+    """Convert a mol to a binary string.
+
+    Note that the molecular information to be stored in the binary string
+    is dependent on the RDKit pickling options.
+
+    Args:
+        mol: a molecule.
+    """
+
+    if mol is None:
+        return None
+
+    return mol.ToBinary()  # type: ignore
+
+
 def to_df(
     mols: Sequence[Mol],
     smiles_column: Optional[str] = "smiles",
@@ -514,6 +533,31 @@ def render_mol_df(df: pd.DataFrame):
         _ChangeMoleculeRendering(df)
 
 
+def to_dict(mols: Sequence[Mol]) -> Dict:
+    """Convert a list of mols to a dict by parsing the JSON produced by
+    RDKit's `rdMolInterchange.MolsToJSON()`.
+
+    For the reverse operation, you might want to check `dm.from_dict()`.
+
+    Args:
+        mols: a list of molecules.
+    """
+
+    return json.loads(rdMolInterchange.MolsToJSON(mols))
+
+
+def from_dict(mol_dict: Dict) -> List[Mol]:
+    """Convert a dict to a list of mols.
+
+    For the reverse operation, you might want to check `dm.to_dict()`.
+
+    Args:
+        mol_dict: a dict in the format returned by `dm.to_dict()`.
+    """
+
+    return rdMolInterchange.JSONToMols(json.dumps(mol_dict))
+
+
 def _ChangeMoleculeRendering(frame=None, renderer="PNG"):
     """Allows to change the rendering of the molecules between base64 PNG images and string
     representations.
diff --git a/datamol/mol.py b/datamol/mol.py
index a2e0efae..dfd25431 100644
--- a/datamol/mol.py
+++ b/datamol/mol.py
@@ -58,7 +58,7 @@ def copy_mol(mol: Mol) -> Mol:
 
 
 def to_mol(
-    mol: Union[str, Mol],
+    mol: Union[str, bytes, Mol],
     add_hs: bool = False,
     explicit_only: bool = False,
     ordered: bool = False,
@@ -72,7 +72,7 @@ def to_mol(
     """Convert an input molecule (smiles representation) into a `Mol`.
 
     Args:
-        mol: A SMILES or a molecule.
+        mol: A SMILES, a binary string from Mol.ToBinary(), or a molecule.
         add_hs: Whether hydrogens should be added the molecule after the SMILES has been parsed.
         explicit_only: Whether to only add explicit hydrogen or both
             (implicit and explicit). when `add_hs` is set to True.
@@ -91,8 +91,8 @@ def to_mol(
         None is returned so make sure that you handle this case on your own.
     """
 
-    if not isinstance(mol, (str, Mol)):
-        raise ValueError(f"Input should be a Mol or a string instead of '{type(mol)}'")
+    if not isinstance(mol, (str, bytes, Mol)):
+        raise ValueError(f"Input should be a Mol, a string, or bytes instead of '{type(mol)}'")
 
     if isinstance(mol, str):
         smiles_params = rdmolfiles.SmilesParserParams()
@@ -106,6 +106,8 @@ def to_mol(
 
         if not sanitize and _mol is not None:
             _mol.UpdatePropertyCache(False)
+    elif isinstance(mol, bytes):
+        _mol = Chem.Mol(mol)
     else:
         _mol = mol
 
diff --git a/tests/test_convert.py b/tests/test_convert.py
index 3997f70e..ece520da 100644
--- a/tests/test_convert.py
+++ b/tests/test_convert.py
@@ -135,6 +135,29 @@ def test_inchi():
     assert dm.from_inchi(None) is None
 
 
+def test_to_binary(datadir):
+    smiles = "CC(=O)Oc1ccccc1C(=O)O"
+    mol = dm.to_mol(smiles)
+
+    binary_string = dm.to_binary(mol)
+    assert isinstance(binary_string, bytes)
+    new_mol = dm.to_mol(binary_string)
+    assert dm.same_mol(mol, new_mol)
+
+    data_path = datadir / "TUBB3-observations.sdf"
+    mols = dm.read_sdf(data_path)
+    mol = mols[0]
+
+    binary_string = dm.to_binary(mol)
+    assert isinstance(binary_string, bytes)
+    new_mol = dm.to_mol(binary_string)
+    assert dm.same_mol(mol, new_mol)
+    assert (
+        np.sum(np.abs(mol.GetConformer(0).GetPositions() - new_mol.GetConformer(0).GetPositions()))
+        < 1e-5
+    )
+
+
 def test_to_df(datadir):
     data_path = datadir / "TUBB3-observations.sdf"
     mols = dm.read_sdf(data_path)
@@ -218,6 +241,34 @@ def test_to_df_smiles_warning(datadir, caplog):
     )
 
 
+def test_to_dict(datadir):
+    data_path = datadir / "TUBB3-observations.sdf"
+    mols = dm.read_sdf(data_path)
+    mols_dict = dm.to_dict(mols)
+
+    assert len(mols_dict["molecules"]) == 10
+    for mol_dict in mols_dict["molecules"]:
+        assert "conformers" in mol_dict
+        assert "properties" in mol_dict
+        assert len(mol_dict["properties"]) == 11
+
+
+def test_from_dict(datadir):
+    data_path = datadir / "TUBB3-observations.sdf"
+    mols = dm.read_sdf(data_path)
+    mols_dict = dm.to_dict(mols)
+    new_mols = dm.from_dict(mols_dict)
+
+    for mol, new_mol in zip(mols, new_mols):
+        assert dm.same_mol(mol, new_mol)
+        assert (
+            np.sum(
+                np.abs(mol.GetConformer(0).GetPositions() - new_mol.GetConformer(0).GetPositions())
+            )
+            < 1e-5
+        )
+
+
 def test_to_cxsmiles():
     mol = dm.to_mol("OC1=CC2CCCCC2[N:1]=C1")
     smiles = dm.to_smiles(mol, cxsmiles=True)
diff --git a/tests/test_mol.py b/tests/test_mol.py
index ef77af8d..979454b4 100644
--- a/tests/test_mol.py
+++ b/tests/test_mol.py
@@ -23,6 +23,10 @@ def test_to_mol():
     mol = dm.to_mol(smiles)
     assert mol is None
 
+    binary_string = b"\xef\xbe\xad\xde\x00\x00\x00\x00\x10\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x06\x00\x00\x00\x80\x01\x06@h\x00\x00\x00\x03\x03\x01\x06@h\x00\x00\x00\x03\x03\x01\x06@h\x00\x00\x00\x03\x03\x01\x06@h\x00\x00\x00\x03\x03\x01\x06@h\x00\x00\x00\x03\x03\x01\x06@h\x00\x00\x00\x03\x03\x01\x0b\x00\x01h\x0c\x01\x02h\x0c\x02\x03h\x0c\x03\x04h\x0c\x04\x05h\x0c\x05\x00h\x0cB\x01\x00\x00\x00\x06\x00\x05\x04\x03\x02\x01\x17\x04\x00\x00\x00\x00\x00\x00\x00\x16"
+    mol = dm.to_mol(binary_string)
+    assert mol.GetNumAtoms() == 6
+
 
 def test_reorder_atoms():
     smiles = "c1ccc(C(=O)O)c(c1)OC(=O)C"