Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions openrag/components/indexer/vectordb/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -578,6 +578,7 @@ def get_file_ancestors(self, partition: str, file_id: str, max_ancestor_depth: i
**(row.file_metadata or {}),
}
for row in result
if row.relationship_id is not None # Only include files that are part of a relationship
]

def get_ancestor_file_ids(self, partition: str, file_id: str, max_ancestor_depth: int | None = None) -> list[str]:
Expand Down
64 changes: 63 additions & 1 deletion openrag/components/test_relationships.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ def get_file_ancestors(self, partition: str, file_id: str, max_ancestor_depth: i
"parent_id": row.parent_id,
}
for row in rows
if row.relationship_id is not None # Only include files that are part of a relationship
]

def get_ancestor_file_ids(self, partition: str, file_id: str, max_ancestor_depth: int | None = None) -> list[str]:
Expand Down Expand Up @@ -322,11 +323,12 @@ class TestGetFileAncestors:
"""Test retrieving ancestor chain for a file."""

def test_get_file_ancestors_single_file(self, file_manager):
"""Test that a file with no parent returns only itself."""
"""Test that a file with no parent but with a relationship_id returns only itself."""
file_manager.add_file_to_partition(
partition="test_partition",
file_id="root_email",
file_metadata={"filename": "root.eml"},
relationship_id="thread_single",
)

ancestors = file_manager.get_file_ancestors(
Expand Down Expand Up @@ -378,24 +380,28 @@ def test_get_file_ancestors_returns_ordered_path(self, file_manager):
partition="test_partition",
file_id="file_a",
file_metadata={"filename": "a.txt"},
relationship_id="thread_ordered",
)
file_manager.add_file_to_partition(
partition="test_partition",
file_id="file_b",
file_metadata={"filename": "b.txt"},
parent_id="file_a",
relationship_id="thread_ordered",
)
file_manager.add_file_to_partition(
partition="test_partition",
file_id="file_c",
file_metadata={"filename": "c.txt"},
parent_id="file_b",
relationship_id="thread_ordered",
)
file_manager.add_file_to_partition(
partition="test_partition",
file_id="file_d",
file_metadata={"filename": "d.txt"},
parent_id="file_c",
relationship_id="thread_ordered",
)

ancestors = file_manager.get_file_ancestors(
Expand Down Expand Up @@ -424,12 +430,14 @@ def test_get_ancestor_file_ids(self, file_manager):
partition="test_partition",
file_id="parent_file",
file_metadata={"filename": "parent.txt"},
relationship_id="thread_ids",
)
file_manager.add_file_to_partition(
partition="test_partition",
file_id="child_file",
file_metadata={"filename": "child.txt"},
parent_id="parent_file",
relationship_id="thread_ids",
)

ancestor_ids = file_manager.get_ancestor_file_ids(
Expand All @@ -447,13 +455,15 @@ def test_get_file_ancestors_max_ancestor_depth_none_returns_all(self, file_manag
partition="test_partition",
file_id="level_0",
file_metadata={"filename": "root.txt"},
relationship_id="thread_depth_none",
)
for i in range(1, 6):
file_manager.add_file_to_partition(
partition="test_partition",
file_id=f"level_{i}",
file_metadata={"filename": f"level_{i}.txt"},
parent_id=f"level_{i - 1}",
relationship_id="thread_depth_none",
)

# Without max_ancestor_depth (None), should return all 6 levels
Expand All @@ -474,13 +484,15 @@ def test_get_file_ancestors_max_ancestor_depth_limits_traversal(self, file_manag
partition="test_partition",
file_id="node_0",
file_metadata={"filename": "root.txt"},
relationship_id="thread_depth_limit",
)
for i in range(1, 6):
file_manager.add_file_to_partition(
partition="test_partition",
file_id=f"node_{i}",
file_metadata={"filename": f"node_{i}.txt"},
parent_id=f"node_{i - 1}",
relationship_id="thread_depth_limit",
)

# With max_ancestor_depth=2, should return target (depth 0) + 2 ancestors
Expand All @@ -501,12 +513,14 @@ def test_get_file_ancestors_max_ancestor_depth_zero_returns_only_target(self, fi
partition="test_partition",
file_id="root",
file_metadata={"filename": "root.txt"},
relationship_id="thread_depth_zero",
)
file_manager.add_file_to_partition(
partition="test_partition",
file_id="child",
file_metadata={"filename": "child.txt"},
parent_id="root",
relationship_id="thread_depth_zero",
)

# max_ancestor_depth=0 means no traversal beyond the target
Expand All @@ -527,18 +541,21 @@ def test_get_file_ancestors_max_ancestor_depth_exceeds_chain_length(self, file_m
partition="test_partition",
file_id="short_0",
file_metadata={"filename": "a.txt"},
relationship_id="thread_short",
)
file_manager.add_file_to_partition(
partition="test_partition",
file_id="short_1",
file_metadata={"filename": "b.txt"},
parent_id="short_0",
relationship_id="thread_short",
)
file_manager.add_file_to_partition(
partition="test_partition",
file_id="short_2",
file_metadata={"filename": "c.txt"},
parent_id="short_1",
relationship_id="thread_short",
)

# max_ancestor_depth=100 but chain is only 3 levels
Expand All @@ -560,13 +577,15 @@ def test_get_ancestor_file_ids_with_max_ancestor_depth(self, file_manager):
partition="test_partition",
file_id="chain_0",
file_metadata={"filename": "a.txt"},
relationship_id="thread_chain",
)
for i in range(1, 4):
file_manager.add_file_to_partition(
partition="test_partition",
file_id=f"chain_{i}",
file_metadata={"filename": f"{chr(97 + i)}.txt"},
parent_id=f"chain_{i - 1}",
relationship_id="thread_chain",
)

# With max_ancestor_depth=1, should get target + 1 ancestor
Expand All @@ -580,6 +599,49 @@ def test_get_ancestor_file_ids_with_max_ancestor_depth(self, file_manager):
assert ancestor_ids == ["chain_2", "chain_3"]


class TestStandaloneFileNoExpansion:
    """Test that a file indexed without relationship_id yields no additional chunks
    when include_related and include_ancestors are both active."""

    def test_no_extra_chunks_for_file_without_relationship_id(self, file_manager):
        """A standalone file (no relationship_id, no parent_id) must not bring
        additional files when both include_related and include_ancestors are activated.

        Mirrors the logic in _expand_with_related_chunks:
        - include_related: the guard `metadata.get("relationship_id")` is falsy,
          so no related lookup is issued and the related task set stays empty.
        - include_ancestors: get_file_ancestors only keeps rows whose
          relationship_id is not None, so a standalone file yields an empty
          ancestor list — nothing new is added.
        """
        file_manager.add_file_to_partition(
            partition="test_partition",
            file_id="standalone",
            file_metadata={"filename": "standalone.pdf"},
            # No relationship_id, no parent_id
        )

        # "standalone" is only a file_id in this test; no file was registered
        # under a relationship with that name, so the lookup must be empty.
        files = file_manager.get_files_by_relationship(
            partition="test_partition",
            relationship_id="standalone",  # non-existent → empty
        )
        assert files == [], "No files should share a relationship with a standalone file"

        # Read the stored column directly to confirm the file has no
        # relationship_id (the falsy guard that prevents the include_related
        # lookup from being issued at all).
        with file_manager.Session() as session:
            row = session.execute(text("SELECT relationship_id FROM files WHERE file_id = 'standalone'")).fetchone()
            # fetchone() gives None when nothing matches; fail with a readable
            # message rather than a TypeError on row[0].
            assert row is not None, "standalone file must exist in the files table"
            assert not row[0], "relationship_id must be falsy so include_related is skipped"

        ancestors = file_manager.get_file_ancestors(
            partition="test_partition",
            file_id="standalone",
        )
        assert ancestors == [], (
            "Standalone file has no relationship_id, so ancestor list must be empty — "
            "the relationship_id filter in get_file_ancestors excludes it"
        )


class TestFileModelFields:
"""Test that File model correctly handles relationship fields."""

Expand Down
Loading