Skip to content
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
453198b
Issue #276 - Add a new argument to accept a URL for harvesting
Aidajafarbigloo Oct 10, 2024
3a7c9ad
Issue #276 - Harvest metadata from the provided URL
Aidajafarbigloo Oct 10, 2024
153e676
Issue #276 - Store harvested data from URL
Aidajafarbigloo Oct 26, 2024
16cba5d
Issue #276 - Harvest metadata from CFF via path
Aidajafarbigloo Jan 30, 2025
afb8189
Issue #276 - Harvest metadata from CodeMeta via path
Aidajafarbigloo Jan 30, 2025
09401ed
Issue #276 - Refactor functions for harvesting CFF/CodeMeta via path
Aidajafarbigloo Jan 30, 2025
f193cc9
Issue #276 - Revert to original base.py
Aidajafarbigloo Jan 30, 2025
1bab2c7
Issue #276 - Update base.py
Aidajafarbigloo Jan 30, 2025
25eec31
Add functionality to remove temp files
Aidajafarbigloo Feb 7, 2025
98814f4
Remove temp files
Aidajafarbigloo Feb 7, 2025
dd56827
Remove temp files
Aidajafarbigloo Feb 7, 2025
88ad304
Merge branch 'develop' into 'feature/276-harvesting-metadata-from-a-p…
Aidajafarbigloo Feb 12, 2025
5f75ad1
Issue #276 - Add SPDX headers
Aidajafarbigloo Feb 13, 2025
b9e5523
softwarepub#276 - Merge latest changes from develop into feature
Aidajafarbigloo Apr 11, 2025
4d901fc
Update base.py
Aidajafarbigloo Apr 14, 2025
3aa06a0
Fix issues: HERMES user agent and temporary files
Aidajafarbigloo May 14, 2025
1bd4d1f
Fix hermes clean command
Aidajafarbigloo May 14, 2025
3918954
Small fix
Aidajafarbigloo May 17, 2025
f170481
Load token from toml file
Aidajafarbigloo Jun 6, 2025
a894259
Use token
Aidajafarbigloo Jun 6, 2025
14fc040
Small fix
Aidajafarbigloo Jun 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/hermes/commands/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def init_command_parser(self, command_parser: argparse.ArgumentParser) -> None:
def load_settings(self, args: argparse.Namespace):
"""Load settings from the configuration file (passed in from command line)."""

toml_data = toml.load(args.path / args.config)
toml_data = toml.load("." / args.config)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this still work if a regular path is given to HERMES?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, specifying a directory path containing CFF or CodeMeta files is also acceptable. For example, the following command works:

hermes harvest --path C:\path\to\your\directory

self.root_settings = HermesCommand.settings_class.model_validate(toml_data)
self.settings = getattr(self.root_settings, self.command_name)

Expand Down
37 changes: 22 additions & 15 deletions src/hermes/commands/harvest/cff.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from hermes.model.context import ContextPath
from hermes.model.errors import HermesValidationError
from hermes.commands.harvest.base import HermesHarvestPlugin, HermesHarvestCommand
from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo, remove_temp_file


# TODO: should this be configurable via a CLI option?
Expand All @@ -45,6 +46,7 @@ def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]:
# Read the content
cff_data = cff_file.read_text()

remove_temp_file(cff_file)
# Validate the content to be correct CFF
cff_dict = self._load_cff_from_file(cff_data)

Expand Down Expand Up @@ -109,18 +111,23 @@ def _validate(self, cff_file: pathlib.Path, cff_dict: t.Dict) -> bool:
return True

def _get_single_cff(self, path: pathlib.Path) -> t.Optional[pathlib.Path]:
# Find CFF files in directories and subdirectories
cff_file = path / 'CITATION.cff'
if cff_file.exists():
return cff_file

# TODO: Do we really want to search recursive? CFF convention is the file should be at the topmost dir,
# which is given via the --path arg. Maybe add another option to enable pointing to a single file?
# (So this stays "convention over configuration")
files = list(path.rglob('**/CITATION.cff'))
if len(files) == 1:
return pathlib.Path(files[0])
# TODO: Shouldn't we log/echo the found CFF files so a user can debug/cleanup?
# TODO: Do we want to hand down a logging instance via Hermes context or just encourage
# peeps to use the Click context?
return None
if str(path).startswith("http:") or str(path).startswith("https:"):
# Find CFF files from the provided URL repository
normalized_url = normalize_url(str(path))
return fetch_metadata_from_repo(normalized_url, "CITATION.cff")
else:
# Find CFF files in directories and subdirectories
cff_file = path / 'CITATION.cff'
if cff_file.exists():
return cff_file

# TODO: Do we really want to search recursive? CFF convention is the file should be at the topmost dir,
# which is given via the --path arg. Maybe add another option to enable pointing to a single file?
# (So this stays "convention over configuration")
files = list(path.rglob('**/CITATION.cff'))
if len(files) == 1:
return pathlib.Path(files[0])
# TODO: Shouldn't we log/echo the found CFF files so a user can debug/cleanup?
# TODO: Do we want to hand down a logging instance via Hermes context or just encourage
# peeps to use the Click context?
return None
28 changes: 17 additions & 11 deletions src/hermes/commands/harvest/codemeta.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from hermes.commands.harvest.base import HermesHarvestCommand, HermesHarvestPlugin
from hermes.commands.harvest.util.validate_codemeta import validate_codemeta
from hermes.model.errors import HermesValidationError

from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo, remove_temp_file

class CodeMetaHarvestPlugin(HermesHarvestPlugin):
def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]:
Expand All @@ -38,6 +38,7 @@ def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]:
if not self._validate(codemeta_file):
raise HermesValidationError(codemeta_file)

remove_temp_file(codemeta_file)
codemeta = json.loads(codemeta_str)
return codemeta, {'local_path': str(codemeta_file)}

Expand All @@ -56,13 +57,18 @@ def _validate(self, codemeta_file: pathlib.Path) -> bool:
return True

def _get_single_codemeta(self, path: pathlib.Path) -> t.Optional[pathlib.Path]:
# Find CodeMeta files in directories and subdirectories
# TODO: Do we really want to search recursive? Maybe add another option to enable pointing to a single file?
# (So this stays "convention over configuration")
files = glob.glob(str(path / "**" / "codemeta.json"), recursive=True)
if len(files) == 1:
return pathlib.Path(files[0])
# TODO: Shouldn't we log/echo the found CFF files so a user can debug/cleanup?
# TODO: Do we want to hand down a logging instance via Hermes context or just encourage
# peeps to use the Click context?
return None
if str(path).startswith("http:") or str(path).startswith("https:"):
# Find CodeMeta files from the provided URL repository
normalized_url = normalize_url(str(path))
return fetch_metadata_from_repo(normalized_url, "codemeta.json")
else:
# Find CodeMeta files in directories and subdirectories
# TODO: Do we really want to search recursive? Maybe add another option to enable pointing to a single file?
# (So this stays "convention over configuration")
files = glob.glob(str(path / "**" / "codemeta.json"), recursive=True)
if len(files) == 1:
return pathlib.Path(files[0])
# TODO: Shouldn't we log/echo the found CFF files so a user can debug/cleanup?
# TODO: Do we want to hand down a logging instance via Hermes context or just encourage
# peeps to use the Click context?
return None
86 changes: 86 additions & 0 deletions src/hermes/commands/harvest/util/remote_harvesting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import os
import pathlib
import re
import tempfile
import typing as t
import urllib.parse

import requests

def normalize_url(path: str) -> str:
    """Normalize a repository URL that may have been mangled by path handling.

    Backslashes (e.g. from Windows path joining) are converted to forward
    slashes, and a scheme separator that lost one of its slashes
    (``https:/host`` — a side effect of ``pathlib`` collapsing ``//``) is
    restored to ``https://host``. Already well-formed URLs are returned
    unchanged.

    :param path: The (possibly mangled) URL string.
    :return: The normalized URL.
    """
    corrected_url = path.replace("\\", "/")
    # Re-insert the second slash after the scheme only when it is missing,
    # so an already-correct "https://" is not turned into "https:///".
    # Handles both http and https.
    return re.sub(r"^(https?:)/(?!/)", r"\1//", corrected_url)


def fetch_metadata_from_repo(repo_url: str, filename: str) -> t.Optional[pathlib.Path]:
    """
    Fetch a metadata file (e.g., CITATION.cff or codemeta.json) from a GitHub or GitLab repository.

    :param repo_url: The repository URL.
    :param filename: The name of the metadata file to fetch.
    :return: Path to the temporary file containing the downloaded metadata, or None.
    """
    try:
        # Compare the host explicitly instead of substring matching so that
        # e.g. "https://gitlab.example.com/foo/github.com-mirror" is not
        # mistaken for a GitHub repository.
        host = urllib.parse.urlparse(repo_url).netloc

        # Identify ourselves to the remote service on every request.
        # TODO(review): replace with hermes.utils.hermes_user_agent.
        headers = {"User-Agent": "hermes-harvester"}

        if host == "github.com":
            # GitHub contents API: list the repository root and look for the file.
            api_url = repo_url.replace("github.com", "api.github.com/repos").rstrip("/") + "/contents"
            response = requests.get(api_url, headers=headers, timeout=30)
            if response.status_code == 200:
                for file_info in response.json():
                    if file_info["name"] == filename:
                        return _download_to_tempfile(file_info["download_url"], filename)
        elif host == "gitlab.com":
            # GitLab API: the project path is "<group-or-user>/<project>".
            match = re.match(r"https://([^/]+)/([^/]+)/([^/]+)", repo_url)
            if match:
                base_domain = match.group(1)
                group_or_user = match.group(2)
                project_name = match.group(3).split('/')[0]
                project_path = f"{group_or_user}/{project_name}"
                api_url = f"https://{base_domain}/api/v4/projects/{requests.utils.quote(project_path, safe='')}/repository/tree"

                response = requests.get(api_url, headers=headers, timeout=30)
                if response.status_code == 200:
                    for file_info in response.json():
                        if file_info["name"] == filename:
                            file_url = (
                                f"https://{base_domain}/api/v4/projects/"
                                f"{requests.utils.quote(project_path, safe='')}/repository/files/"
                                f"{requests.utils.quote(filename, safe='')}/raw"
                            )
                            return _download_to_tempfile(file_url, filename)
        else:
            print(f"Unsupported repository URL: {repo_url}")
        return None
    # Narrowed from a bare "except Exception": network errors, malformed
    # JSON payloads (ValueError), and missing keys are the expected failures.
    except (requests.RequestException, ValueError, KeyError) as e:
        print(f"Error fetching metadata from repository: {e}")
        return None


def _download_to_tempfile(url: str, filename: str) -> t.Optional[pathlib.Path]:
    """
    Download a file from a URL and save it to a temporary file.

    The file is created with ``delete=False`` in the system temporary
    directory; callers are expected to clean it up with
    :func:`remove_temp_file` once harvesting has finished.

    :param url: The URL to download from.
    :param filename: The name of the file being downloaded; used for the
        temporary file's suffix and for log messages.
    :return: Path to the temporary file, or None if the download failed.
    """
    try:
        response = requests.get(url, timeout=30)
        # Treat HTTP errors (404, 500, ...) as failures instead of silently
        # writing an error page into the temp file.
        response.raise_for_status()
        suffix = f".{filename.split('.')[-1]}"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            temp_file.write(response.text.encode("utf-8"))
        print(f"Downloaded {filename} to {temp_file.name}")
        return pathlib.Path(temp_file.name)
    except requests.RequestException as e:
        print(f"Error downloading {filename}: {e}")
        return None


def remove_temp_file(file_path: pathlib.Path, temp_dir: t.Optional[pathlib.Path] = None) -> None:
    """
    Remove a temporary file if it lives inside the temporary directory.

    Files produced by :func:`_download_to_tempfile` are created in the system
    temporary directory (``tempfile.gettempdir()``), so that is the default
    guard; regular project files outside of it are left untouched.

    :param file_path: The file path to check and remove.
    :param temp_dir: Directory considered "temporary". Defaults to the system
        temporary directory (the previous hard-coded ``C:/Temp`` was
        Windows-specific and never matched where ``NamedTemporaryFile``
        actually creates files, so downloads were never cleaned up).
    """
    if temp_dir is None:
        temp_dir = pathlib.Path(tempfile.gettempdir())
    file_path = pathlib.Path(file_path)
    try:
        # Path-aware containment check avoids false positives from raw
        # string prefix matching (e.g. "/tmpfoo" vs "/tmp").
        inside_temp = file_path.resolve().is_relative_to(pathlib.Path(temp_dir).resolve())
    except AttributeError:
        # Python < 3.9 has no Path.is_relative_to; fall back to the old check.
        inside_temp = str(file_path).startswith(str(temp_dir))
    if inside_temp and file_path.exists():
        os.remove(file_path)