From 7ea7c2ae08fb9a06f819fe6d39e0075720e4ff05 Mon Sep 17 00:00:00 2001
From: octo-patch
Date: Fri, 17 Apr 2026 09:06:45 +0800
Subject: [PATCH] fix: deduplicate search sources by title to prevent duplicate citations

The same academic paper often appears on multiple platforms (PubMed,
ScienceDirect, ResearchGate, Google Scholar) with different URLs. The
existing URL-based deduplication did not catch these, leading to 100+
citations in quality/balanced mode where most were duplicates of the
same underlying paper.

Add a normalised-title Set alongside the existing URL Map so that a
result whose title (case-folded, trimmed) has already been seen is
dropped before being numbered and passed to the writer LLM. This keeps
the first (highest-priority) occurrence while silently discarding
cross-platform duplicates. Results without a title are unaffected.

Fixes #1109
---
 src/lib/agents/search/researcher/index.ts | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/lib/agents/search/researcher/index.ts b/src/lib/agents/search/researcher/index.ts
index 0a25d91df..9fe18d859 100644
--- a/src/lib/agents/search/researcher/index.ts
+++ b/src/lib/agents/search/researcher/index.ts
@@ -187,10 +187,22 @@ class Researcher {
       .flatMap((a) => a.results);
 
     const seenUrls = new Map();
+    const seenTitles = new Set();
 
     const filteredSearchResults = searchResults
       .map((result, index) => {
         if (result.metadata.url && !seenUrls.has(result.metadata.url)) {
+          // Deduplicate by normalized title to avoid citing the same paper
+          // from multiple academic platforms (e.g. PubMed, ScienceDirect,
+          // ResearchGate, Google Scholar) as separate sources.
+          if (result.metadata.title) {
+            const normalizedTitle = result.metadata.title.toLowerCase().trim();
+            if (seenTitles.has(normalizedTitle)) {
+              return undefined;
+            }
+            seenTitles.add(normalizedTitle);
+          }
+
           seenUrls.set(result.metadata.url, index);
           return result;
         } else if (result.metadata.url && seenUrls.has(result.metadata.url)) {