diff --git a/mcpdoc/cli.py b/mcpdoc/cli.py index 2634dd4..d5b2796 100644 --- a/mcpdoc/cli.py +++ b/mcpdoc/cli.py @@ -179,12 +179,18 @@ def create_doc_sources_from_urls(urls: List[str]) -> List[DocSource]: for entry in urls: if not entry.strip(): continue - if ":" in entry and not entry.startswith(("http:", "https:")): + parts = entry.split(":", 1) + is_windows_drive = len(parts[0]) == 1 and parts[0].isalpha() + if ( + ":" in entry + and not entry.startswith(("http:", "https:", "file:")) + and not is_windows_drive + ): # Format is name:url - name, url = entry.split(":", 1) + name, url = parts doc_sources.append({"name": name, "llms_txt": url}) else: - # Format is just url + # Format is just url or file: URL or Windows drive path doc_sources.append({"llms_txt": entry}) return doc_sources diff --git a/tests/unit_tests/test_cli.py b/tests/unit_tests/test_cli.py new file mode 100644 index 0000000..adc35a4 --- /dev/null +++ b/tests/unit_tests/test_cli.py @@ -0,0 +1,130 @@ +"""Tests for mcpdoc.cli module — focused on create_doc_sources_from_urls parsing. + +These are fail-to-pass (F2P) tests: they fail on main (bug present) and pass +on the aneesh-fix branch (bug fixed). + +Bug: --urls tokens are split on the first ':' even when that colon is part of +a 'file:' URL scheme or a Windows drive letter, not a label separator. +""" + +from mcpdoc.cli import create_doc_sources_from_urls + + +# --------------------------------------------------------------------------- +# file: URL handling +# --------------------------------------------------------------------------- + + +def test_file_url_not_split_into_label(): + """file:///path/to/llms.txt must NOT be treated as label='file', url='///path/to/llms.txt'. + + On main this fails because 'file:' is not excluded from the colon-split logic. + """ + sources = create_doc_sources_from_urls(["file:///path/to/llms.txt"]) + assert len(sources) == 1 + assert sources[0]["llms_txt"] == "file:///path/to/llms.txt" + assert "name" not in sources[0] + + +def test_file_url_preserves_full_value(): + """The full file: URL must reach llms_txt unchanged.""" + url = "file:///home/user/docs/llms.txt" + sources = create_doc_sources_from_urls([url]) + assert sources[0]["llms_txt"] == url + + +# --------------------------------------------------------------------------- +# Windows drive path handling +# --------------------------------------------------------------------------- + + +def test_windows_drive_path_not_split_into_label(): + """C:/Users/docs/llms.txt must NOT be treated as label='C', url='/Users/docs/llms.txt'. + + On main this fails because a single-letter prefix before ':' is not excluded. + """ + sources = create_doc_sources_from_urls(["C:/Users/docs/llms.txt"]) + assert len(sources) == 1 + assert sources[0]["llms_txt"] == "C:/Users/docs/llms.txt" + assert "name" not in sources[0] + + +def test_windows_drive_path_lowercase(): + """Lowercase drive letter (c:/...) should also be treated as a path, not a label.""" + sources = create_doc_sources_from_urls(["c:/path/to/llms.txt"]) + assert len(sources) == 1 + assert sources[0]["llms_txt"] == "c:/path/to/llms.txt" + assert "name" not in sources[0] + + +# --------------------------------------------------------------------------- +# Correct label:url format must still work +# --------------------------------------------------------------------------- + + +def test_label_with_http_url(): + """LangGraph:https://example.com/llms.txt must still parse label + URL correctly.""" + sources = create_doc_sources_from_urls( + ["LangGraph:https://langchain-ai.github.io/langgraph/llms.txt"] + ) + assert len(sources) == 1 + assert sources[0]["name"] == "LangGraph" + assert sources[0]["llms_txt"] == "https://langchain-ai.github.io/langgraph/llms.txt" + + +def test_label_with_local_path(): + """MyDocs:/path/to/llms.txt — multi-char label with unix path must split correctly.""" + sources = create_doc_sources_from_urls(["MyDocs:/path/to/llms.txt"]) + assert len(sources) == 1 + assert sources[0]["name"] == "MyDocs" + assert sources[0]["llms_txt"] == "/path/to/llms.txt" + + +# --------------------------------------------------------------------------- +# Plain URLs / paths with no label +# --------------------------------------------------------------------------- + + +def test_plain_http_url_no_label(): + """A bare https URL with no label prefix must be stored as-is.""" + sources = create_doc_sources_from_urls( + ["https://langchain-ai.github.io/langgraph/llms.txt"] + ) + assert len(sources) == 1 + assert sources[0]["llms_txt"] == "https://langchain-ai.github.io/langgraph/llms.txt" + assert "name" not in sources[0] + + +def test_plain_unix_path_no_label(): + """A bare unix path with no colon must be stored as-is.""" + sources = create_doc_sources_from_urls(["/home/user/llms.txt"]) + assert len(sources) == 1 + assert sources[0]["llms_txt"] == "/home/user/llms.txt" + assert "name" not in sources[0] + + +# --------------------------------------------------------------------------- +# Mixed input — multiple entries together +# --------------------------------------------------------------------------- + + +def test_mixed_entries_parsed_correctly(): + """When file: URL, Windows path, and a labelled URL are combined, all parse correctly. + + On main this fails because the first two entries are mis-parsed. + """ + inputs = [ + "file:///opt/docs/llms.txt", + "C:/docs/llms.txt", + "LangGraph:https://langchain-ai.github.io/langgraph/llms.txt", + "https://plain-url.com/llms.txt", + ] + sources = create_doc_sources_from_urls(inputs) + + assert sources[0] == {"llms_txt": "file:///opt/docs/llms.txt"} + assert sources[1] == {"llms_txt": "C:/docs/llms.txt"} + assert sources[2] == { + "name": "LangGraph", + "llms_txt": "https://langchain-ai.github.io/langgraph/llms.txt", + } + assert sources[3] == {"llms_txt": "https://plain-url.com/llms.txt"}