-
Notifications
You must be signed in to change notification settings - Fork 7
Feature/276 harvesting metadata from a provided repository url #278
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from 12 commits
453198b
3a7c9ad
153e676
16cba5d
afb8189
09401ed
f193cc9
1bab2c7
25eec31
98814f4
dd56827
88ad304
5f75ad1
b9e5523
4d901fc
3aa06a0
1bd4d1f
3918954
f170481
a894259
14fc040
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,86 @@ | ||
| import pathlib | ||
| import re | ||
| import requests | ||
| import tempfile | ||
| import typing as t | ||
| import os | ||
|
|
||
def normalize_url(path: str) -> str:
    """
    Normalize a URL whose slashes were mangled by path handling.

    Backslashes are converted to forward slashes, and an ``https:`` scheme
    followed by any number of slashes is rewritten to exactly ``https://``.
    The operation is idempotent: an already well-formed URL is returned unchanged.

    :param path: The URL (or path-like string) to normalize.
    :return: The string using forward slashes only, with a well-formed ``https://`` scheme separator.
    """
    corrected_url = path.replace("\\", "/")
    # Consume the whole run of slashes after the scheme. A plain
    # replace("https:/", "https://") would turn a valid "https://" into "https:///".
    return re.sub(r"https:/+", "https://", corrected_url)
|
|
||
|
|
||
def fetch_metadata_from_repo(repo_url: str, filename: str) -> t.Optional[pathlib.Path]:
    """
    Fetch a metadata file (e.g., CITATION.cff or codemeta.json) from a GitHub or GitLab repository.

    The hosting provider is determined from the parsed host name of ``repo_url``
    (not from a substring match on the whole URL). The repository's top-level
    directory is listed through the provider's REST API and, if an entry named
    ``filename`` exists, it is downloaded via ``_download_to_tempfile``.

    This is a best-effort helper: every failure (unsupported host, network
    error, unexpected API payload, file not present) is reported on stdout
    where applicable and results in ``None`` rather than an exception.

    :param repo_url: The repository URL, e.g. ``https://github.com/<owner>/<repo>``.
    :param filename: The name of the metadata file to fetch.
    :return: Path to the temporary file containing the downloaded metadata, or None.
    """
    # Imported locally so the function is self-contained with respect to the module header.
    from urllib.parse import quote, urlparse

    request_timeout = 30  # seconds; requests would otherwise wait indefinitely

    try:
        parsed = urlparse(repo_url)
        host = (parsed.hostname or "").lower()
        segments = [part for part in parsed.path.split("/") if part]

        # Compare the parsed host name: a substring test would also match URLs
        # such as "https://gitlab.com/group/github.com-mirror".
        is_github = host in ("github.com", "www.github.com")
        is_gitlab = host == "gitlab.com" or host.endswith(".gitlab.com")

        if not (is_github or is_gitlab) or len(segments) < 2:
            print(f"Unsupported repository URL: {repo_url}")
            return None

        # Only the first two path segments identify the project; anything after
        # (e.g. "/tree/main") and a trailing ".git" are not part of its name.
        owner, project = segments[0], segments[1]
        if project.endswith(".git"):
            project = project[: -len(".git")]

        if is_github:
            # GitHub API: list the repository root and use the entry's download URL.
            api_url = f"https://api.github.com/repos/{owner}/{project}/contents"
            response = requests.get(api_url, timeout=request_timeout)
            if response.status_code == 200:
                for file_info in response.json():
                    if file_info["name"] == filename:
                        return _download_to_tempfile(file_info["download_url"], filename)
            return None

        # GitLab API: the project is addressed by its URL-encoded "namespace/project" path.
        project_id = quote(f"{owner}/{project}", safe="")
        api_root = f"https://{parsed.netloc}/api/v4/projects/{project_id}/repository"
        # The tree listing is paginated; ask for the largest page so the file is
        # not missed in repositories with many top-level entries.
        response = requests.get(
            f"{api_root}/tree",
            params={"per_page": 100},
            timeout=request_timeout,
        )
        if response.status_code == 200:
            for file_info in response.json():
                if file_info["name"] == filename:
                    file_url = f"{api_root}/files/{quote(filename, safe='')}/raw"
                    return _download_to_tempfile(file_url, filename)
        return None
    except Exception as e:
        # Deliberate best-effort boundary: callers only distinguish "got a file" from "did not".
        print(f"Error fetching metadata from repository: {e}")
        return None
|
|
||
|
|
||
def _download_to_tempfile(url: str, filename: str) -> t.Optional[pathlib.Path]:
    """
    Download a file from a URL and save it to a temporary file.

    The temporary file is created with ``delete=False``, so it persists after
    this function returns; removing it is the caller's responsibility.

    :param url: The URL to download from.
    :param filename: The name of the file being downloaded; the text after its last
        dot is used as the temporary file's suffix and the name appears in log output.
    :return: Path to the temporary file, or None if the download or write failed.
    """
    try:
        response = requests.get(url, timeout=30)
        # Fail on 4xx/5xx instead of saving the server's error page as metadata.
        response.raise_for_status()
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{filename.split('.')[-1]}") as temp_file:
            # Write the raw bytes as served: decoding via a guessed charset and
            # re-encoding as UTF-8 can corrupt non-ASCII characters.
            temp_file.write(response.content)
        print(f"Downloaded {filename} to {temp_file.name}")
        return pathlib.Path(temp_file.name)
    except Exception as e:
        # Best-effort: report the problem and signal failure with None.
        print(f"Error downloading {filename}: {e}")
        return None
|
|
||
|
|
||
def remove_temp_file(file_path: pathlib.Path, temp_dir: pathlib.Path = pathlib.Path("C:/Temp")) -> None:
    """
    Removes a temporary file if it is inside the temp directory.

    Containment is decided on normalized paths, component by component, so a
    sibling directory that merely shares a name prefix (``C:/Temp2``) or a path
    that escapes through ``..`` is not treated as being inside ``temp_dir``.
    Files outside ``temp_dir`` are left untouched.

    NOTE(review): files created through ``tempfile`` without an explicit ``dir``
    live in ``tempfile.gettempdir()``, which is generally not ``C:/Temp``;
    callers presumably need to pass that directory as ``temp_dir`` - confirm.

    :param file_path: The file path to check and remove.
    :param temp_dir: The directory considered as temporary (default: "C:/Temp").
    :raises OSError: If the file is inside ``temp_dir`` but cannot be removed.
    """
    # Lexical normalization only (no symlink resolution): makes both paths
    # absolute and collapses ".." so the ancestry test cannot be bypassed.
    candidate = pathlib.Path(os.path.abspath(file_path))
    root = pathlib.Path(os.path.abspath(temp_dir))
    if root in candidate.parents:
        os.remove(file_path)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this still work if a regular path is given to HERMES?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, specifying a directory path containing CFF or CodeMeta files is also acceptable. For example, the following command works:
hermes harvest --path C:\path\to\your\directory