Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 102 additions & 7 deletions openff/nagl_models/_dynamic_fetch.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import functools
import hashlib
import json
import re
import pathlib
import urllib.request

import platformdirs
from packaging.version import Version

Expand All @@ -19,19 +19,85 @@

CACHE_DIR = platformdirs.user_cache_path() / "OPENFF_NAGL_MODELS"


class HashComparisonFailedException(Exception):
    """
    Raised when a NAGL file being loaded does not match its expected hash.

    The expected hash may be either a known one or one provided by the user.
    """


class UnableToParseDOIException(Exception):
    """
    Raised when a Zenodo DOI cannot be parsed.

    The DOI is expected to follow the pattern used by the model-fetching code.
    """


def get_release_metadata() -> list[dict]:
    """
    Download and parse the JSON document served at ``RELEASES_URL``.

    Returns
    -------
    list[dict]
        The decoded JSON payload.
        NOTE(review): presumably one dict per release -- confirm against the endpoint.
    """
    # Read the body inside a context manager so the HTTP response (and its
    # underlying socket) is closed promptly rather than left for the garbage
    # collector to clean up.
    with urllib.request.urlopen(RELEASES_URL) as response:
        payload = response.read()

    return json.loads(payload.decode("utf-8"))


@functools.lru_cache()
def get_model(filename: str) -> str:
"""Return the path of a model as cached on disk, downloading if necessary."""
def get_model(
filename: str,
doi: None | str = None,
file_hash: None | str = None,
_sandbox: bool = False,
) -> pathlib.Path:
"""
Return the path of a model as cached on disk, downloading if necessary. The lookup order of this implementation is:
1. Try to retrieve the file from the local cache
2. Try to fetch the file from a release of https://github.com/openforcefield/openff-nagl-models
3. Try to fetch the file from the DOI, if provided

This method will raise a HashComparisonFailedException as soon as a hash mismatch is encountered. So if
there's a file with a matching name but a non-matching hash in the local cache, an exception will be raised
immediately, even if a file with a matching name that WOULD satisfy the hash check exists in release
metadata or at a provided Zenodo DOI.

Parameters
----------
filename : str
The name of the file to search for.
doi : typing.Optional[str], default=None
The Zenodo DOI to use as a backup location for fetching the model file if it's not found in the local cache
or in the
[release metadata of an openff-nagl-models release](https://github.com/openforcefield/openff-nagl-models/releases)
on GitHub. For example: "10.5072/zenodo.278300"
file_hash : typing.Optional[str], default=None
The sha256 hash of the model file to verify the correct contents. Hash checks are automatically performed
on some OpenFF-released NAGL models. But if the model isn't released by OpenFF and this argument is
not provided or has a value of `None`, then no hash check is performed. Raises HashComparisonFailedException
if unsuccessful. If a user provides a hash value here that disagrees with the known hash for the same file
name, the user-provided hash takes precedence.
_sandbox : bool, default=False
Whether to connect to sandbox.zenodo.org instead of zenodo.org. Used for testing.

Returns
-------
pathlib.Path
The path to the file if it was found. If the file wasn't found then a FileNotFoundError is raised.

Raises
------
HashComparisonFailedException
FileNotFoundError
"""

def assert_hash_equal(cached_path, expected_hash):
    """
    Raise HashComparisonFailedException unless the file's hash matches.

    The hash of ``cached_path`` is computed with ``_get_sha256`` and compared
    against ``expected_hash``; nothing is returned when they agree.
    """
    observed_hash = _get_sha256(cached_path)
    if observed_hash == expected_hash:
        return

    raise HashComparisonFailedException(
        f"NAGL model file hash check failed. Expected hash is "
        f"{expected_hash} but actual hash is {observed_hash}"
    )
Comment thread
j-wags marked this conversation as resolved.
Outdated

pathlib.Path(CACHE_DIR).mkdir(exist_ok=True)

cached_path = CACHE_DIR / filename

check_hash = file_hash
Comment thread
j-wags marked this conversation as resolved.
Outdated
if check_hash is None and filename in KNOWN_HASHES:
check_hash = KNOWN_HASHES[filename]

if cached_path.exists():
assert _get_sha256(cached_path) == KNOWN_HASHES[filename]
if check_hash:
assert_hash_equal(cached_path, check_hash)

return cached_path.as_posix()

Expand All @@ -55,12 +121,41 @@ def get_model(filename: str) -> str:
assert cached_path.exists()
assert path_to_file == cached_path.as_posix()

assert _get_sha256(cached_path) == KNOWN_HASHES[filename], (
f"Hash mismatch for {filename}"
)
if check_hash:
assert_hash_equal(cached_path, check_hash)

return cached_path.as_posix()

if doi:
try:
zenodo_id = re.findall("10.5072/zenodo.([0-9]+)", doi)[0]
Comment thread
j-wags marked this conversation as resolved.
Outdated
except IndexError:
raise UnableToParseDOIException(
f"Unable to parse Zenodo DOI {doi}. DOI values are expected to look "
f"like '10.5072/zenodo.278300'"
)

if _sandbox:
file_url = (
f"https://sandbox.zenodo.org/api/records/{zenodo_id}/files/{filename}"
)
else:
file_url = f"https://zenodo.org/api/records/{zenodo_id}/files/{filename}"

try:
path_to_file, _ = urllib.request.urlretrieve(
file_url, filename=cached_path.as_posix()
)
except urllib.error.HTTPError:
raise FileNotFoundError(f"No file at {file_url}")
assert cached_path.exists()
assert path_to_file == cached_path.as_posix()

if check_hash:
assert_hash_equal(cached_path, check_hash)

return cached_path.as_posix()
Comment thread
j-wags marked this conversation as resolved.
Outdated

raise FileNotFoundError(
f"Could not find asset with name '{filename}' in any release"
)
Expand Down
83 changes: 78 additions & 5 deletions openff/nagl_models/tests/test_dynamic_fetch.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import os
import pathlib
import shutil
import urllib.request
Expand All @@ -9,7 +10,11 @@

import openff.nagl_models._dynamic_fetch
from openff.nagl_models import __file__ as root
from openff.nagl_models._dynamic_fetch import get_model
from openff.nagl_models._dynamic_fetch import (
get_model,
HashComparisonFailedException,
UnableToParseDOIException,
)


def mocked_urlretrieve(url, filename):
Expand Down Expand Up @@ -59,11 +64,27 @@ def test_get_known_models(monkeypatch, known_model):
assert "OPENFF_NAGL_MODELS" in get_model(known_model)


def test_access_internet_with_empty_cache():
cache_path = platformdirs.user_cache_path() / "OPENFF_NAGL_MODELS"
@pytest.fixture
def hide_cache():
cache_dir = platformdirs.user_cache_path() / "OPENFF_NAGL_MODELS"
alt_dir = str(cache_dir) + "_temp"

if os.path.exists(alt_dir):
raise FileExistsError(f"Temporary directory already exists: {alt_dir}")

if os.path.exists(cache_dir):
shutil.move(cache_dir, alt_dir)

yield

if cache_path.exists():
shutil.rmtree(cache_path)
if os.path.exists(alt_dir):
if os.path.exists(cache_dir):
shutil.rmtree(cache_dir)
shutil.move(alt_dir, cache_dir)


def test_access_internet_with_empty_cache(hide_cache):
cache_path = platformdirs.user_cache_path() / "OPENFF_NAGL_MODELS"

disable_socket()

Expand Down Expand Up @@ -147,3 +168,55 @@ def test_all_models_loadable(model, monkeypatch):
)

GNNModel.load(get_model(model), eval_mode=True)


def test_get_model_by_doi_and_hash(hide_cache):
get_model(
"my_favorite_model.pt",
doi="10.5072/zenodo.278300",
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This record must be sand-box only? This is my first Google result, which seems unlikely to be what you actually want to point to: https://zenodo.org/records/14335473

A comment or note about where this lives and how the hash was generated would be useful for future developers, I don't think anything else would be necessary here

file_hash="127eb0b9512f22546f8b455582bcd85b2521866d32b86d231fee26d4771b1d81",
_sandbox=True,
)


def test_get_model_by_doi_no_hash(hide_cache):
    """Fetch a model by DOI with ``_sandbox=True`` and no ``file_hash`` supplied."""
    get_model(
        "my_favorite_model.pt",
        doi="10.5072/zenodo.278300",
        _sandbox=True,
    )


def test_get_model_hash_comparison_fails():
    """A deliberately wrong ``file_hash`` must raise HashComparisonFailedException."""
    # NOTE(review): unlike the sibling DOI tests, this one does not request the
    # ``hide_cache`` fixture -- confirm that touching the real cache is intended.
    bogus_request = {
        "doi": "10.5072/zenodo.278300",
        "file_hash": "wrong_hash",
        "_sandbox": True,
    }
    with pytest.raises(HashComparisonFailedException):
        get_model("my_favorite_model.pt", **bogus_request)


def test_user_provided_hash_conflicts_with_known_hash():
    """A wrong user-supplied hash for an OpenFF model file must fail the hash check."""
    # Presumably this file name has an entry in KNOWN_HASHES -- verify; the
    # user-provided hash is expected to take precedence over it.
    released_model = "openff-gnn-am1bcc-0.1.0-rc.3.pt"
    with pytest.raises(HashComparisonFailedException):
        get_model(released_model, file_hash="wrong_hash")


def test_malformed_doi(monkeypatch, hide_cache):
    """A DOI without the expected prefix must raise UnableToParseDOIException."""
    fetch_module = openff.nagl_models._dynamic_fetch

    with monkeypatch.context() as patcher:
        # Swap the network-facing helpers for this test module's mocks.
        patcher.setattr(urllib.request, "urlretrieve", mocked_urlretrieve)
        patcher.setattr(
            fetch_module,
            "get_release_metadata",
            mocked_get_release_metadata,
        )

        with pytest.raises(UnableToParseDOIException):
            get_model("my_favorite_model.pt", doi="zenodo.278300", _sandbox=True)


def test_no_matching_file_at_doi():
    """Requesting a file that the record does not hold must raise FileNotFoundError."""
    # The error message is expected to mention the sandbox host, since
    # ``_sandbox=True`` is passed.
    # NOTE(review): this test does not request ``hide_cache`` -- confirm intended.
    absent_file = "file_that_doesnt_exist.pt"
    with pytest.raises(FileNotFoundError, match="sandbox.zenodo"):
        get_model(absent_file, doi="10.5072/zenodo.278300", _sandbox=True)