Skip to content
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
453198b
Issue #276 - Add a new argument to accept a URL for harvesting
Aidajafarbigloo Oct 10, 2024
3a7c9ad
Issue #276 - Harvest metadata from the provided URL
Aidajafarbigloo Oct 10, 2024
153e676
Issue #276 - Store harvested data from URL
Aidajafarbigloo Oct 26, 2024
16cba5d
Issue #276 - Harvest metadata from CFF via path
Aidajafarbigloo Jan 30, 2025
afb8189
Issue #276 - Harvest metadata from CodeMeta via path
Aidajafarbigloo Jan 30, 2025
09401ed
Issue #276 - Refactor functions for harvesting CFF/CodeMeta via path
Aidajafarbigloo Jan 30, 2025
f193cc9
Issue #276 - Revert to original base.py
Aidajafarbigloo Jan 30, 2025
1bab2c7
Issue #276 - Update base.py
Aidajafarbigloo Jan 30, 2025
25eec31
Add functionality to remove temp files
Aidajafarbigloo Feb 7, 2025
98814f4
Remove temp files
Aidajafarbigloo Feb 7, 2025
dd56827
Remove temp files
Aidajafarbigloo Feb 7, 2025
88ad304
Merge branch 'develop' into 'feature/276-harvesting-metadata-from-a-p…
Aidajafarbigloo Feb 12, 2025
5f75ad1
Issue #276 - Add SPDX headers
Aidajafarbigloo Feb 13, 2025
b9e5523
softwarepub#276 - Merge latest changes from develop into feature
Aidajafarbigloo Apr 11, 2025
4d901fc
Update base.py
Aidajafarbigloo Apr 14, 2025
3aa06a0
Fix issues: HERMES user agent and temporary files
Aidajafarbigloo May 14, 2025
1bd4d1f
Fix hermes clean command
Aidajafarbigloo May 14, 2025
3918954
Small fix
Aidajafarbigloo May 17, 2025
f170481
Load token from toml file
Aidajafarbigloo Jun 6, 2025
a894259
Use token
Aidajafarbigloo Jun 6, 2025
14fc040
Small fix
Aidajafarbigloo Jun 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/hermes/commands/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def init_command_parser(self, command_parser: argparse.ArgumentParser) -> None:
def load_settings(self, args: argparse.Namespace):
"""Load settings from the configuration file (passed in from command line)."""

toml_data = toml.load(args.path / args.config)
toml_data = toml.load("." / args.config)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this still work if a regular path is given to HERMES?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, specifying a directory path containing CFF or CodeMeta files is also acceptable. For example, the following command works:

hermes harvest --path C:\path\to\your\directory

self.root_settings = HermesCommand.settings_class.model_validate(toml_data)
self.settings = getattr(self.root_settings, self.command_name)

Expand Down
37 changes: 22 additions & 15 deletions src/hermes/commands/harvest/cff.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from hermes.model.context import ContextPath
from hermes.model.errors import HermesValidationError
from hermes.commands.harvest.base import HermesHarvestPlugin, HermesHarvestCommand
from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo, remove_temp_file


# TODO: should this be configurable via a CLI option?
Expand All @@ -45,6 +46,7 @@ def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]:
# Read the content
cff_data = cff_file.read_text()

remove_temp_file(cff_file)
# Validate the content to be correct CFF
cff_dict = self._load_cff_from_file(cff_data)

Expand Down Expand Up @@ -109,18 +111,23 @@ def _validate(self, cff_file: pathlib.Path, cff_dict: t.Dict) -> bool:
return True

def _get_single_cff(self, path: pathlib.Path) -> t.Optional[pathlib.Path]:
# Find CFF files in directories and subdirectories
cff_file = path / 'CITATION.cff'
if cff_file.exists():
return cff_file

# TODO: Do we really want to search recursive? CFF convention is the file should be at the topmost dir,
# which is given via the --path arg. Maybe add another option to enable pointing to a single file?
# (So this stays "convention over configuration")
files = list(path.rglob('**/CITATION.cff'))
if len(files) == 1:
return pathlib.Path(files[0])
# TODO: Shouldn't we log/echo the found CFF files so a user can debug/cleanup?
# TODO: Do we want to hand down a logging instance via Hermes context or just encourage
# peeps to use the Click context?
return None
if str(path).startswith("http:") or str(path).startswith("https:"):
# Find CFF files from the provided URL repository
normalized_url = normalize_url(str(path))
return fetch_metadata_from_repo(normalized_url, "CITATION.cff")
else:
# Find CFF files in directories and subdirectories
cff_file = path / 'CITATION.cff'
if cff_file.exists():
return cff_file

# TODO: Do we really want to search recursive? CFF convention is the file should be at the topmost dir,
# which is given via the --path arg. Maybe add another option to enable pointing to a single file?
# (So this stays "convention over configuration")
files = list(path.rglob('**/CITATION.cff'))
if len(files) == 1:
return pathlib.Path(files[0])
# TODO: Shouldn't we log/echo the found CFF files so a user can debug/cleanup?
# TODO: Do we want to hand down a logging instance via Hermes context or just encourage
# peeps to use the Click context?
return None
28 changes: 17 additions & 11 deletions src/hermes/commands/harvest/codemeta.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from hermes.commands.harvest.base import HermesHarvestCommand, HermesHarvestPlugin
from hermes.commands.harvest.util.validate_codemeta import validate_codemeta
from hermes.model.errors import HermesValidationError

from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo, remove_temp_file

class CodeMetaHarvestPlugin(HermesHarvestPlugin):
def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]:
Expand All @@ -38,6 +38,7 @@ def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]:
if not self._validate(codemeta_file):
raise HermesValidationError(codemeta_file)

remove_temp_file(codemeta_file)
codemeta = json.loads(codemeta_str)
return codemeta, {'local_path': str(codemeta_file)}

Expand All @@ -56,13 +57,18 @@ def _validate(self, codemeta_file: pathlib.Path) -> bool:
return True

def _get_single_codemeta(self, path: pathlib.Path) -> t.Optional[pathlib.Path]:
# Find CodeMeta files in directories and subdirectories
# TODO: Do we really want to search recursive? Maybe add another option to enable pointing to a single file?
# (So this stays "convention over configuration")
files = glob.glob(str(path / "**" / "codemeta.json"), recursive=True)
if len(files) == 1:
return pathlib.Path(files[0])
# TODO: Shouldn't we log/echo the found CFF files so a user can debug/cleanup?
# TODO: Do we want to hand down a logging instance via Hermes context or just encourage
# peeps to use the Click context?
return None
if str(path).startswith("http:") or str(path).startswith("https:"):
# Find CodeMeta files from the provided URL repository
normalized_url = normalize_url(str(path))
return fetch_metadata_from_repo(normalized_url, "codemeta.json")
else:
# Find CodeMeta files in directories and subdirectories
# TODO: Do we really want to search recursive? Maybe add another option to enable pointing to a single file?
# (So this stays "convention over configuration")
files = glob.glob(str(path / "**" / "codemeta.json"), recursive=True)
if len(files) == 1:
return pathlib.Path(files[0])
# TODO: Shouldn't we log/echo the found CFF files so a user can debug/cleanup?
# TODO: Do we want to hand down a logging instance via Hermes context or just encourage
# peeps to use the Click context?
return None
86 changes: 86 additions & 0 deletions src/hermes/commands/harvest/util/remote_harvesting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import os
import pathlib
import re
import tempfile
import typing as t
import urllib.parse

import requests

def normalize_url(path: str) -> str:
    """Normalize a repository URL that may have been mangled by path handling.

    Backslashes (e.g. from Windows path joining) are converted to forward
    slashes, and a scheme separator that lost one of its slashes
    (``https:/host`` — a side effect of ``pathlib`` collapsing ``//``) is
    restored to ``https://host``. Already well-formed URLs are returned
    unchanged.

    :param path: The (possibly mangled) URL string.
    :return: The normalized URL.
    """
    corrected_url = path.replace("\\", "/")
    # Re-insert the second slash after the scheme only when it is missing,
    # so an already-correct "https://" is not turned into "https:///".
    # Handles both http and https.
    return re.sub(r"^(https?:)/(?!/)", r"\1//", corrected_url)


def fetch_metadata_from_repo(repo_url: str, filename: str) -> t.Optional[pathlib.Path]:
    """
    Fetch a metadata file (e.g., CITATION.cff or codemeta.json) from a GitHub or GitLab repository.

    :param repo_url: The repository URL.
    :param filename: The name of the metadata file to fetch.
    :return: Path to the temporary file containing the downloaded metadata, or None.
    """
    try:
        # Compare the host explicitly instead of substring matching so that
        # e.g. "https://gitlab.example.com/foo/github.com-mirror" is not
        # mistaken for a GitHub repository.
        host = urllib.parse.urlparse(repo_url).netloc

        # Identify ourselves to the remote service on every request.
        # TODO(review): replace with hermes.utils.hermes_user_agent.
        headers = {"User-Agent": "hermes-harvester"}

        if host == "github.com":
            # GitHub contents API: list the repository root and look for the file.
            api_url = repo_url.replace("github.com", "api.github.com/repos").rstrip("/") + "/contents"
            response = requests.get(api_url, headers=headers, timeout=30)
            if response.status_code == 200:
                for file_info in response.json():
                    if file_info["name"] == filename:
                        return _download_to_tempfile(file_info["download_url"], filename)
        elif host == "gitlab.com":
            # GitLab API: the project path is "<group-or-user>/<project>".
            match = re.match(r"https://([^/]+)/([^/]+)/([^/]+)", repo_url)
            if match:
                base_domain = match.group(1)
                group_or_user = match.group(2)
                project_name = match.group(3).split('/')[0]
                project_path = f"{group_or_user}/{project_name}"
                api_url = f"https://{base_domain}/api/v4/projects/{requests.utils.quote(project_path, safe='')}/repository/tree"

                response = requests.get(api_url, headers=headers, timeout=30)
                if response.status_code == 200:
                    for file_info in response.json():
                        if file_info["name"] == filename:
                            file_url = (
                                f"https://{base_domain}/api/v4/projects/"
                                f"{requests.utils.quote(project_path, safe='')}/repository/files/"
                                f"{requests.utils.quote(filename, safe='')}/raw"
                            )
                            return _download_to_tempfile(file_url, filename)
        else:
            print(f"Unsupported repository URL: {repo_url}")
        return None
    # Narrowed from a bare "except Exception": network errors, malformed
    # JSON payloads (ValueError), and missing keys are the expected failures.
    except (requests.RequestException, ValueError, KeyError) as e:
        print(f"Error fetching metadata from repository: {e}")
        return None


def _download_to_tempfile(url: str, filename: str) -> t.Optional[pathlib.Path]:
    """
    Download a file from a URL and save it to a temporary file.

    The file is created with ``delete=False`` in the system temporary
    directory; callers are expected to clean it up with
    :func:`remove_temp_file` once harvesting has finished.

    :param url: The URL to download from.
    :param filename: The name of the file being downloaded; used for the
        temporary file's suffix and for log messages.
    :return: Path to the temporary file, or None if the download failed.
    """
    try:
        response = requests.get(url, timeout=30)
        # Treat HTTP errors (404, 500, ...) as failures instead of silently
        # writing an error page into the temp file.
        response.raise_for_status()
        suffix = f".{filename.split('.')[-1]}"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            temp_file.write(response.text.encode("utf-8"))
        print(f"Downloaded {filename} to {temp_file.name}")
        return pathlib.Path(temp_file.name)
    except requests.RequestException as e:
        print(f"Error downloading {filename}: {e}")
        return None


def remove_temp_file(file_path: pathlib.Path, temp_dir: t.Optional[pathlib.Path] = None) -> None:
    """
    Remove a temporary file if it lives inside the temporary directory.

    Files produced by :func:`_download_to_tempfile` are created in the system
    temporary directory (``tempfile.gettempdir()``), so that is the default
    guard; regular project files outside of it are left untouched.

    :param file_path: The file path to check and remove.
    :param temp_dir: Directory considered "temporary". Defaults to the system
        temporary directory (the previous hard-coded ``C:/Temp`` was
        Windows-specific and never matched where ``NamedTemporaryFile``
        actually creates files, so downloads were never cleaned up).
    """
    if temp_dir is None:
        temp_dir = pathlib.Path(tempfile.gettempdir())
    file_path = pathlib.Path(file_path)
    try:
        # Path-aware containment check avoids false positives from raw
        # string prefix matching (e.g. "/tmpfoo" vs "/tmp").
        inside_temp = file_path.resolve().is_relative_to(pathlib.Path(temp_dir).resolve())
    except AttributeError:
        # Python < 3.9 has no Path.is_relative_to; fall back to the old check.
        inside_temp = str(file_path).startswith(str(temp_dir))
    if inside_temp and file_path.exists():
        os.remove(file_path)