Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"clsx": "^2.1.0",
"drizzle-orm": "^0.40.1",
"js-tiktoken": "^1.0.21",
"jsdom": "^29.0.1",
"jspdf": "^3.0.4",
"lightweight-charts": "^5.0.9",
"lucide-react": "^0.556.0",
Expand Down Expand Up @@ -54,6 +55,7 @@
},
"devDependencies": {
"@types/better-sqlite3": "^7.6.12",
"@types/jsdom": "^28.0.1",
"@types/jspdf": "^2.0.0",
"@types/node": "^24.8.1",
"@types/pdf-parse": "^1.1.4",
Expand Down
17 changes: 16 additions & 1 deletion src/lib/agents/search/researcher/actions/scrapeURL.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import { ResearchAction } from '../../types';
import { Chunk, ReadingResearchBlock } from '@/lib/types';
import TurnDown from 'turndown';
import path from 'path';
import { JSDOM } from 'jsdom';

// Single TurnDown instance created once at module load.
// NOTE(review): presumably used further down to convert scraped HTML to Markdown — usage is outside this view; confirm.
const turndownService = new TurnDown();

Expand Down Expand Up @@ -40,11 +41,25 @@ const scrapeURLAction: ResearchAction<typeof schema> = {
params.urls.map(async (url) => {
try {
const res = await fetch(url);
const text = await res.text();
let text = await res.text();

const title =
text.match(/<title>(.*?)<\/title>/i)?.[1] || `Content from ${url}`;

// if response is an html page, clean it up to reduce the amount of tokens used
if (res.headers?.get('Content-Type') === 'text/html') {
// remove comments and spaces
text = text
.replace(/<\!--[\s.]*?-->/gm, '') // comments
Comment thread
cubic-dev-ai[bot] marked this conversation as resolved.
Outdated
.replace(/^\s+|\s+$</gm, '') // head and tail spaces
.replace(/\s+</gm, '<') // spaces before tags
.replace(/>\s+/gm, '>') // spaces after tags
const dom = new JSDOM(text);
// remove script, style and template elements before serializing the document
dom.window.document.querySelectorAll('script, style, template').forEach(el => el.remove());
text = dom.window.document.documentElement.outerHTML;
}

if (
!readingEmitted &&
researchBlock &&
Expand Down
Loading