From fc11a43dc63d6d7f1b463c2024e92edd3784e445 Mon Sep 17 00:00:00 2001 From: Pedro M Duarte Date: Tue, 25 Mar 2025 10:57:32 -0400 Subject: [PATCH 1/2] Add link checker script on every push --- .circleci/config.yml | 23 ++++++++++++- scripts/markdown_link_checker.py | 58 ++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 scripts/markdown_link_checker.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 560d22f5957..58d4acb365a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -246,6 +246,17 @@ jobs: name: Run pytest command: | uv run pytest tests -v + + markdown-link-checker: + executor: default + resource_class: small + steps: + - checkout-dep-1 + - setup-python-env + - run: + name: Run link checker script + command: | + uv run scripts/markdown_link_checker.py workflows: version: 2 @@ -255,11 +266,21 @@ workflows: or: # run when manually triggered - equal: [<< pipeline.parameters.run_job >>, "run-cicd"] - #run for every push + # run for every push - equal: [<< pipeline.trigger_source >>, "webhook"] jobs: - cicd + markdown-link-checker: + when: + or: + # run when manually triggered + - equal: [<< pipeline.parameters.run_job >>, "markdown-link-checker"] + # run for every push + - equal: [<< pipeline.trigger_source >>, "webhook"] + jobs: + - markdown-link-checker + daily-l2-aggregate-later-loads: when: or: diff --git a/scripts/markdown_link_checker.py b/scripts/markdown_link_checker.py new file mode 100644 index 00000000000..9d2d748bae3 --- /dev/null +++ b/scripts/markdown_link_checker.py @@ -0,0 +1,58 @@ +import re +import subprocess +import shlex +from urllib.parse import urlparse + + +MARKDOWN_LINK_RE = re.compile(r"\[([^\]]+)\]\((?P<destination>[^)]+)\)") + + +def main(): + # Run the command to find markdown links and capture the output + found_links = subprocess.check_output( + shlex.split(r"""find . 
-name "*.md" -exec grep -HoP '\[([^\]]+)\]\(([^)]+)\)' {} \;"""), + text=True, + ) + + for link in found_links.split("\n"): + if not link: + continue + + link_file, markdown_link = link.split(":", maxsplit=1) + + destination_match = MARKDOWN_LINK_RE.match(markdown_link) + if not destination_match: + print(f"Invalid link: {markdown_link}") + continue + destination = destination_match.group("destination") + + if destination.startswith("/"): + # The destination is an absolute path + check_absolute_path(destination) + + elif destination.startswith("."): + # The destination is a relative path + check_relative_path(link_file, destination) + + elif destination.startswith(("http://", "https://")): + # The destination is a URL + parsed_url = urlparse(destination) + if parsed_url.scheme and parsed_url.netloc: + check_url(destination) + else: + continue + + +def check_absolute_path(destination: str): + print(destination) + return + + +def check_relative_path(link_file: str, destination: str): + print(link_file, destination) + return + + +def check_url(destination: str): + print(destination) + return From e7a61b816e0610b9e1a93735f676dd8f98a0d8bf Mon Sep 17 00:00:00 2001 From: Pedro M Duarte Date: Tue, 25 Mar 2025 11:00:09 -0400 Subject: [PATCH 2/2] Actually run the thing --- scripts/markdown_link_checker.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scripts/markdown_link_checker.py b/scripts/markdown_link_checker.py index 9d2d748bae3..de23bcd4af3 100644 --- a/scripts/markdown_link_checker.py +++ b/scripts/markdown_link_checker.py @@ -44,15 +44,19 @@ def main(): def check_absolute_path(destination: str): - print(destination) + print("ABS: ", destination) return def check_relative_path(link_file: str, destination: str): - print(link_file, destination) + print("REL: ", link_file, destination) return def check_url(destination: str): - print(destination) + print("URL: ", destination) return + + +if __name__ == "__main__": + main()