langchain-ai · aneesh-spec · Apr 6, 2026 · Apr 6, 2026 · Apr 7, 2026 · Apr 7, 2026
diff --git a/mcpdoc/cli.py b/mcpdoc/cli.py
@@ -179,12 +179,18 @@ def create_doc_sources_from_urls(urls: List[str]) -> List[DocSource]:
     for entry in urls:
         if not entry.strip():
             continue
-        if ":" in entry and not entry.startswith(("http:", "https:")):
+        parts = entry.split(":", 1)
+        is_windows_drive = len(parts[0]) == 1 and parts[0].isalpha()
+        if (
+            ":" in entry
+            and not entry.startswith(("http:", "https:", "file:"))
+            and not is_windows_drive
+        ):
             # Format is name:url
-            name, url = entry.split(":", 1)
+            name, url = parts
             doc_sources.append({"name": name, "llms_txt": url})
         else:
-            # Format is just url
+            # No label: a plain URL or path, a file: URL, or a Windows drive path
             doc_sources.append({"llms_txt": entry})
     return doc_sources
 

diff --git a/tests/unit_tests/test_cli.py b/tests/unit_tests/test_cli.py
@@ -0,0 +1,130 @@
+"""Tests for mcpdoc.cli module — focused on create_doc_sources_from_urls parsing.
+
+The file: URL, Windows drive, and mixed-input tests are fail-to-pass (F2P): they fail
+on main (bug present) and pass on aneesh-fix (bug fixed); the rest are regression guards.
+
+Bug: --urls tokens are split on the first ':' even when that colon is part of
+a 'file:' URL scheme or a Windows drive letter, not a label separator.
+"""
+
+from mcpdoc.cli import create_doc_sources_from_urls
+
+
+# ---------------------------------------------------------------------------
+# file: URL handling
+# ---------------------------------------------------------------------------
+
+
+def test_file_url_not_split_into_label():
+    """file:///path/to/llms.txt must NOT be treated as label='file', url='///path/to/llms.txt'.
+
+    On main this fails because 'file:' is not excluded from the colon-split logic.
+    """
+    sources = create_doc_sources_from_urls(["file:///path/to/llms.txt"])
+    assert len(sources) == 1
+    assert sources[0]["llms_txt"] == "file:///path/to/llms.txt"
+    assert "name" not in sources[0]
+
+
+def test_file_url_preserves_full_value():
+    """The full file: URL must reach llms_txt unchanged."""
+    url = "file:///home/user/docs/llms.txt"
+    sources = create_doc_sources_from_urls([url])
+    assert sources[0]["llms_txt"] == url
+
+
+# ---------------------------------------------------------------------------
+# Windows drive path handling
+# ---------------------------------------------------------------------------
+
+
+def test_windows_drive_path_not_split_into_label():
+    """C:/Users/docs/llms.txt must NOT be treated as label='C', url='/Users/docs/llms.txt'.
+
+    On main this fails because a single-letter prefix before ':' is not excluded.
+    """
+    sources = create_doc_sources_from_urls(["C:/Users/docs/llms.txt"])
+    assert len(sources) == 1
+    assert sources[0]["llms_txt"] == "C:/Users/docs/llms.txt"
+    assert "name" not in sources[0]
+
+
+def test_windows_drive_path_lowercase():
+    """Lowercase drive letter (c:/...) should also be treated as a path, not a label."""
+    sources = create_doc_sources_from_urls(["c:/path/to/llms.txt"])
+    assert len(sources) == 1
+    assert sources[0]["llms_txt"] == "c:/path/to/llms.txt"
+    assert "name" not in sources[0]
+
+
+# ---------------------------------------------------------------------------
+# Correct label:url format must still work
+# ---------------------------------------------------------------------------
+
+
+def test_label_with_http_url():
+    """A 'LangGraph:https://...' entry must still parse into label + URL correctly."""
+    sources = create_doc_sources_from_urls(
+        ["LangGraph:https://langchain-ai.github.io/langgraph/llms.txt"]
+    )
+    assert len(sources) == 1
+    assert sources[0]["name"] == "LangGraph"
+    assert sources[0]["llms_txt"] == "https://langchain-ai.github.io/langgraph/llms.txt"
+
+
+def test_label_with_local_path():
+    """MyDocs:/path/to/llms.txt — multi-char label with unix path must split correctly."""
+    sources = create_doc_sources_from_urls(["MyDocs:/path/to/llms.txt"])
+    assert len(sources) == 1
+    assert sources[0]["name"] == "MyDocs"
+    assert sources[0]["llms_txt"] == "/path/to/llms.txt"
+
+
+# ---------------------------------------------------------------------------
+# Plain URLs / paths with no label
+# ---------------------------------------------------------------------------
+
+
+def test_plain_http_url_no_label():
+    """A bare https URL with no label prefix must be stored as-is."""
+    sources = create_doc_sources_from_urls(
+        ["https://langchain-ai.github.io/langgraph/llms.txt"]
+    )
+    assert len(sources) == 1
+    assert sources[0]["llms_txt"] == "https://langchain-ai.github.io/langgraph/llms.txt"
+    assert "name" not in sources[0]
+
+
+def test_plain_unix_path_no_label():
+    """A bare unix path with no colon must be stored as-is."""
+    sources = create_doc_sources_from_urls(["/home/user/llms.txt"])
+    assert len(sources) == 1
+    assert sources[0]["llms_txt"] == "/home/user/llms.txt"
+    assert "name" not in sources[0]
+
+
+# ---------------------------------------------------------------------------
+# Mixed input — multiple entries together
+# ---------------------------------------------------------------------------
+
+
+def test_mixed_entries_parsed_correctly():
+    """A file: URL, a Windows path, a labelled URL and a plain URL all parse correctly together.
+
+    On main this fails because the first two entries are mis-parsed.
+    """
+    inputs = [
+        "file:///opt/docs/llms.txt",
+        "C:/docs/llms.txt",
+        "LangGraph:https://langchain-ai.github.io/langgraph/llms.txt",
+        "https://plain-url.com/llms.txt",
+    ]
+    sources = create_doc_sources_from_urls(inputs)
+
+    assert sources[0] == {"llms_txt": "file:///opt/docs/llms.txt"}
+    assert sources[1] == {"llms_txt": "C:/docs/llms.txt"}
+    assert sources[2] == {
+        "name": "LangGraph",
+        "llms_txt": "https://langchain-ai.github.io/langgraph/llms.txt",
+    }
+    assert sources[3] == {"llms_txt": "https://plain-url.com/llms.txt"}