From 7ea7c2ae08fb9a06f819fe6d39e0075720e4ff05 Mon Sep 17 00:00:00 2001
From: octo-patch <octo-patch@github.com>
Date: Fri, 17 Apr 2026 09:06:45 +0800
Subject: [PATCH] fix: deduplicate search sources by title to prevent duplicate
 citations

The same academic paper often appears on multiple platforms (PubMed,
ScienceDirect, ResearchGate, Google Scholar) with different URLs. The
existing URL-based deduplication did not catch these, leading to 100+
citations in quality/balanced mode where most were duplicates of the
same underlying paper.

Add a normalized-title Set alongside the existing URL Map so that a
result with a new URL whose title (lower-cased, trimmed) has already
been seen is dropped before being numbered and passed to the writer
LLM. This keeps the first (highest-priority) occurrence while silently
discarding cross-platform duplicates. Results without a title are
unaffected.

Fixes #1109
---
 src/lib/agents/search/researcher/index.ts | 12 ++++++++++++
 1 file changed, 12 insertions(+)
diff --git a/src/lib/agents/search/researcher/index.ts b/src/lib/agents/search/researcher/index.ts
index 0a25d91df..9fe18d859 100644
--- a/src/lib/agents/search/researcher/index.ts
+++ b/src/lib/agents/search/researcher/index.ts
@@ -187,10 +187,22 @@ class Researcher {
       .flatMap((a) => a.results);
 
     const seenUrls = new Map<string, number>();
+    const seenTitles = new Set<string>();
 
     const filteredSearchResults = searchResults
       .map((result, index) => {
         if (result.metadata.url && !seenUrls.has(result.metadata.url)) {
+          // Deduplicate by normalized title to avoid citing the same paper
+          // from multiple academic platforms (e.g. PubMed, ScienceDirect,
+          // ResearchGate, Google Scholar) as separate sources.
+          if (result.metadata.title) {
+            const normalizedTitle = result.metadata.title.toLowerCase().trim();
+            if (seenTitles.has(normalizedTitle)) {
+              return undefined;
+            }
+            seenTitles.add(normalizedTitle);
+          }
+
           seenUrls.set(result.metadata.url, index);
           return result;
         } else if (result.metadata.url && seenUrls.has(result.metadata.url)) {