diff --git a/openrag/components/indexer/vectordb/utils.py b/openrag/components/indexer/vectordb/utils.py index 903f2017..56d3d54c 100644 --- a/openrag/components/indexer/vectordb/utils.py +++ b/openrag/components/indexer/vectordb/utils.py @@ -578,6 +578,7 @@ def get_file_ancestors(self, partition: str, file_id: str, max_ancestor_depth: i **(row.file_metadata or {}), } for row in result + if row.relationship_id is not None # Only include files that are part of a relationship ] def get_ancestor_file_ids(self, partition: str, file_id: str, max_ancestor_depth: int | None = None) -> list[str]: diff --git a/openrag/components/test_relationships.py b/openrag/components/test_relationships.py index 5299d815..882230f2 100644 --- a/openrag/components/test_relationships.py +++ b/openrag/components/test_relationships.py @@ -143,6 +143,7 @@ def get_file_ancestors(self, partition: str, file_id: str, max_ancestor_depth: i "parent_id": row.parent_id, } for row in rows + if row.relationship_id is not None # Only include files that are part of a relationship ] def get_ancestor_file_ids(self, partition: str, file_id: str, max_ancestor_depth: int | None = None) -> list[str]: @@ -322,11 +323,12 @@ class TestGetFileAncestors: """Test retrieving ancestor chain for a file.""" def test_get_file_ancestors_single_file(self, file_manager): - """Test that a file with no parent returns only itself.""" + """Test that a file with no parent but with a relationship_id returns only itself.""" file_manager.add_file_to_partition( partition="test_partition", file_id="root_email", file_metadata={"filename": "root.eml"}, + relationship_id="thread_single", ) ancestors = file_manager.get_file_ancestors( @@ -378,24 +380,28 @@ def test_get_file_ancestors_returns_ordered_path(self, file_manager): partition="test_partition", file_id="file_a", file_metadata={"filename": "a.txt"}, + relationship_id="thread_ordered", ) file_manager.add_file_to_partition( partition="test_partition", file_id="file_b", 
file_metadata={"filename": "b.txt"}, parent_id="file_a", + relationship_id="thread_ordered", ) file_manager.add_file_to_partition( partition="test_partition", file_id="file_c", file_metadata={"filename": "c.txt"}, parent_id="file_b", + relationship_id="thread_ordered", ) file_manager.add_file_to_partition( partition="test_partition", file_id="file_d", file_metadata={"filename": "d.txt"}, parent_id="file_c", + relationship_id="thread_ordered", ) ancestors = file_manager.get_file_ancestors( @@ -424,12 +430,14 @@ def test_get_ancestor_file_ids(self, file_manager): partition="test_partition", file_id="parent_file", file_metadata={"filename": "parent.txt"}, + relationship_id="thread_ids", ) file_manager.add_file_to_partition( partition="test_partition", file_id="child_file", file_metadata={"filename": "child.txt"}, parent_id="parent_file", + relationship_id="thread_ids", ) ancestor_ids = file_manager.get_ancestor_file_ids( @@ -447,6 +455,7 @@ def test_get_file_ancestors_max_ancestor_depth_none_returns_all(self, file_manag partition="test_partition", file_id="level_0", file_metadata={"filename": "root.txt"}, + relationship_id="thread_depth_none", ) for i in range(1, 6): file_manager.add_file_to_partition( @@ -454,6 +463,7 @@ def test_get_file_ancestors_max_ancestor_depth_none_returns_all(self, file_manag file_id=f"level_{i}", file_metadata={"filename": f"level_{i}.txt"}, parent_id=f"level_{i - 1}", + relationship_id="thread_depth_none", ) # Without max_ancestor_depth (None), should return all 6 levels @@ -474,6 +484,7 @@ def test_get_file_ancestors_max_ancestor_depth_limits_traversal(self, file_manag partition="test_partition", file_id="node_0", file_metadata={"filename": "root.txt"}, + relationship_id="thread_depth_limit", ) for i in range(1, 6): file_manager.add_file_to_partition( @@ -481,6 +492,7 @@ def test_get_file_ancestors_max_ancestor_depth_limits_traversal(self, file_manag file_id=f"node_{i}", file_metadata={"filename": f"node_{i}.txt"}, parent_id=f"node_{i - 
1}", + relationship_id="thread_depth_limit", ) # With max_ancestor_depth=2, should return target (depth 0) + 2 ancestors @@ -501,12 +513,14 @@ def test_get_file_ancestors_max_ancestor_depth_zero_returns_only_target(self, fi partition="test_partition", file_id="root", file_metadata={"filename": "root.txt"}, + relationship_id="thread_depth_zero", ) file_manager.add_file_to_partition( partition="test_partition", file_id="child", file_metadata={"filename": "child.txt"}, parent_id="root", + relationship_id="thread_depth_zero", ) # max_ancestor_depth=0 means no traversal beyond the target @@ -527,18 +541,21 @@ def test_get_file_ancestors_max_ancestor_depth_exceeds_chain_length(self, file_m partition="test_partition", file_id="short_0", file_metadata={"filename": "a.txt"}, + relationship_id="thread_short", ) file_manager.add_file_to_partition( partition="test_partition", file_id="short_1", file_metadata={"filename": "b.txt"}, parent_id="short_0", + relationship_id="thread_short", ) file_manager.add_file_to_partition( partition="test_partition", file_id="short_2", file_metadata={"filename": "c.txt"}, parent_id="short_1", + relationship_id="thread_short", ) # max_ancestor_depth=100 but chain is only 3 levels @@ -560,6 +577,7 @@ def test_get_ancestor_file_ids_with_max_ancestor_depth(self, file_manager): partition="test_partition", file_id="chain_0", file_metadata={"filename": "a.txt"}, + relationship_id="thread_chain", ) for i in range(1, 4): file_manager.add_file_to_partition( @@ -567,6 +585,7 @@ def test_get_ancestor_file_ids_with_max_ancestor_depth(self, file_manager): file_id=f"chain_{i}", file_metadata={"filename": f"{chr(97 + i)}.txt"}, parent_id=f"chain_{i - 1}", + relationship_id="thread_chain", ) # With max_ancestor_depth=1, should get target + 1 ancestor @@ -580,6 +599,49 @@ def test_get_ancestor_file_ids_with_max_ancestor_depth(self, file_manager): assert ancestor_ids == ["chain_2", "chain_3"] +class TestStandaloneFileNoExpansion: + """Test that a file indexed 
without relationship_id yields no additional chunks + when include_related and include_ancestors are both active.""" + + def test_no_extra_chunks_for_file_without_relationship_id(self, file_manager): + """A standalone file (no relationship_id, no parent_id) must not bring + additional files when both include_related and include_ancestors are activated. + + Mirrors the logic in _expand_with_related_chunks: + - include_related: the guard `metadata.get("relationship_id")` is falsy, + so no related lookup is issued and the related task set stays empty. + - include_ancestors: get_file_ancestors skips rows whose relationship_id + is None, so it returns an empty list — nothing new is added. + """ + file_manager.add_file_to_partition( + partition="test_partition", + file_id="standalone", + file_metadata={"filename": "standalone.pdf"}, + # No relationship_id, no parent_id + ) + + # Verify the file has no relationship_id (the falsy guard that prevents + # the include_related lookup from being issued at all). + files = file_manager.get_files_by_relationship( + partition="test_partition", + relationship_id="standalone", # non-existent → empty + ) + assert files == [], "No files should share a relationship with a standalone file" + + with file_manager.Session() as session: + row = session.execute(text("SELECT relationship_id FROM files WHERE file_id = 'standalone'")).fetchone() + assert not row[0], "relationship_id must be falsy so include_related is skipped" + + ancestors = file_manager.get_file_ancestors( + partition="test_partition", + file_id="standalone", + ) + assert len(ancestors) == 0, ( + "Standalone file has no relationship_id, so ancestor list must be empty — " + "the relationship_id filter in get_file_ancestors excludes it" + ) + + class TestFileModelFields: """Test that File model correctly handles relationship fields."""