Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
"format": "pnpm -r run format",
"format:check": "pnpm -r run format:check",
"release": "pnpm build && changeset publish",
"release:snapshot": "changeset version --snapshot canary && pnpm build && changeset publish --tag canary --no-git-tag"
"release:snapshot": "changeset version --snapshot canary && pnpm build && changeset publish --tag canary --no-git-tag",
"bench": "pnpm --filter @upstash/benchmark build && node packages/benchmark/dist/index.js"
},
"repository": {
"type": "git",
Expand Down
3 changes: 3 additions & 0 deletions packages/benchmark/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
dist
node_modules
results
41 changes: 41 additions & 0 deletions packages/benchmark/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# @upstash/benchmark

Benchmarks Context7 trigger accuracy across different integration modes. Runs eval queries via `claude -p` and measures recall, precision, and false positives.

## Usage

```bash
# from repo root
pnpm bench

# specific modes
pnpm bench -- --modes mcp:prod,mcp:dev
pnpm bench -- --modes cli:prod,cli:dev

# custom options
pnpm bench -- --modes mcp:prod --workers 10 --model claude-sonnet-4-6
```

## Modes

| Mode | MCP server | Rule source | Skill source | Purpose |
|------|-----------|-------------|--------------|---------|
| `mcp:prod` | npm latest | master | - | MCP prod baseline |
| `mcp:dev` | local build | working tree | - | Test MCP changes |
| `cli:prod` | - | master | master | CLI prod baseline |
| `cli:dev` | - | working tree | working tree | Test CLI changes |

Ad-hoc modes for isolated testing: `mcp`, `mcp+rule`, `mcp+claude.md`, `cli+skill`, `cli+rule`, `cli+claude.md`

## Options

| Flag | Default | Description |
|------|---------|-------------|
| `--modes` | `mcp:prod,mcp:dev,cli:prod,cli:dev` | Comma-separated modes |
| `--model` | `claude-opus-4-6` | Claude model |
| `--workers` | `60` | Concurrent queries |
| `--max-turns` | `10` | Max turns per query |
| `--timeout` | `120` | Seconds per query |
| `--auth-mode` | `default` | `default` or `api-key` |
| `--with-context` | off | Prepend mid-session context |
| `--compare` | off | Run clean + with-context side by side |
54 changes: 54 additions & 0 deletions packages/benchmark/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{
"name": "@upstash/benchmark",
"version": "0.1.0",
"private": true,
"description": "Benchmarks Context7 trigger accuracy across different integration modes",
"type": "module",
"bin": {
"benchmark": "./dist/index.js"
},
"files": [
"dist"
],
"scripts": {
"build": "tsup",
"dev": "tsup --watch",
"typecheck": "tsc --noEmit",
"lint": "eslint src --fix",
"lint:check": "eslint src",
"format": "prettier --write src",
"format:check": "prettier --check src",
"clean": "rm -rf dist node_modules",
"bench": "node dist/index.js"
},
"dependencies": {
"commander": "^13.1.0"
},
"devDependencies": {
"@types/node": "^22.19.1",
"@typescript-eslint/eslint-plugin": "^8.28.0",
"@typescript-eslint/parser": "^8.28.0",
"eslint": "^9.34.0",
"eslint-plugin-prettier": "^5.2.5",
"prettier": "^3.6.2",
"tsup": "^8.5.0",
"typescript": "^5.8.2",
"typescript-eslint": "^8.28.0"
},
"keywords": [
"context7",
"benchmark",
"trigger",
"eval"
],
"author": "Upstash",
"license": "MIT",
"repository": {
"type": "git",
"url": "git+https://github.com/upstash/context7.git",
"directory": "packages/benchmark"
},
"engines": {
"node": ">=18"
}
}
182 changes: 182 additions & 0 deletions packages/benchmark/src/detection.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
import type { DetectionMethod } from "./types.js";

/** A single tool-call content block of an assistant message: which tool was called and with what arguments. */
interface ToolUseBlock {
/** Discriminator; the detection helpers in this file skip any block whose type is not "tool_use". */
type: "tool_use";
/** Tool name as it appears in the output, e.g. "Bash", "Skill", or a prefixed name such as "mcp__nia__...". */
name: string;
/** Raw tool arguments; the helpers in this file read e.g. `command` for Bash calls. */
input: Record<string, unknown>;
}

/** Shape assumed for one JSON-parsed output line. Every field is optional because lines are parsed without validation. */
interface StreamEvent {
/** Event kind; only "assistant" events are inspected for tool calls. */
type?: string;
/** Message payload carried by the event. */
message?: {
/**
 * Content blocks of the message.
 * NOTE(review): typed as tool-use blocks only, yet consumers still check `block.type`,
 * so other block kinds presumably occur at runtime — confirm and widen the type if so.
 */
content?: ToolUseBlock[];
};
}

/**
 * Bare (un-prefixed) tool names that are attributed to Nia.
 *
 * Names carrying the `mcp__nia__` prefix are matched separately by prefix, so
 * only the bare names need to be listed here.
 * NOTE(review): this list is maintained by hand — confirm it is complete
 * against the current Nia tool set.
 */
const NIA_TOOLS = new Set<string>([
  "search_documentation",
  "search_codebase",
  "index",
  "regex_search",
  "manage_resource",
  "get_github_file_tree",
  "nia_web_search",
  "nia_deep_research_agent",
  "read_source_content",
]);

/**
 * Substrings that mark a tool name as one of the Context7 tools. Both the
 * hyphenated and the underscored spelling of each tool are listed.
 */
const CONTEXT7_TOOL_NAME_FRAGMENTS = [
  "resolve-library-id",
  "resolve_library_id",
  "query-docs",
  "query_docs",
];

/**
 * Reports whether a tool name refers to a Context7 tool.
 *
 * Matching is by substring, so names with an arbitrary prefix or suffix
 * around the fragment are accepted as well.
 *
 * @param name - Tool name taken from a tool-use block.
 * @returns `true` when the name contains any known Context7 fragment.
 */
function isContext7Tool(name: string): boolean {
  return CONTEXT7_TOOL_NAME_FRAGMENTS.some((fragment) => name.includes(fragment));
}
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have to update these lists whenever new tools are added or tool names change? :'( Also, I remember Nia having more tools — is this list accurate?


/**
 * Reports whether a tool name refers to a Nia tool.
 *
 * @param name - Tool name taken from a tool-use block.
 * @returns `true` when the name carries the `mcp__nia__` prefix or is one of
 *   the bare names listed in `NIA_TOOLS`.
 */
function isNiaTool(name: string): boolean {
  if (name.startsWith("mcp__nia__")) {
    return true;
  }
  return NIA_TOOLS.has(name);
}

/**
 * Scans newline-delimited JSON output for a tool call that counts as a
 * "trigger" under the given detection method.
 *
 * Only events whose `type` is "assistant" are inspected, and within them only
 * content blocks whose `type` is "tool_use". What counts as a trigger:
 *
 * - "mcp" / "versus": any Context7 tool call.
 * - "nia": any Nia tool call.
 * - "skill": a `Skill` call whose serialized input contains "find-docs", or a
 *   `Bash` call whose command mentions "ctx7" or "context7".
 * - "cli": a `Bash` call whose command mentions "ctx7" or "context7".
 *
 * Lines that are blank, are not valid JSON, or do not parse to an object are
 * skipped, so stray output mixed into the stream cannot abort detection.
 *
 * @param detection - Detection method selecting which tool calls count.
 * @param stdout - Raw captured output, one JSON event per line.
 * @returns `true` as soon as a matching tool call is found, otherwise `false`.
 */
export function detectTrigger(detection: DetectionMethod, stdout: string): boolean {
  for (const line of stdout.split("\n")) {
    const trimmed = line.trim();
    if (!trimmed) continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(trimmed);
    } catch {
      continue;
    }
    // JSON.parse happily returns null / numbers / strings for such lines;
    // property access on null would throw, so only objects are considered.
    if (typeof parsed !== "object" || parsed === null) continue;

    const event = parsed as StreamEvent;
    if (event.type !== "assistant") continue;

    // The payload is unvalidated: a missing or non-array `content` carries no
    // tool calls and must not be iterated.
    const content = event.message?.content;
    if (!Array.isArray(content)) continue;

    for (const block of content) {
      if (block?.type !== "tool_use") continue;
      const name = typeof block.name === "string" ? block.name : "";
      const toolInput = block.input ?? {};

      switch (detection) {
        case "mcp":
        case "versus":
          if (isContext7Tool(name)) return true;
          break;
        case "nia":
          if (isNiaTool(name)) return true;
          break;
        case "skill":
        case "cli": {
          // Skill invocations only count in "skill" mode.
          if (
            detection === "skill" &&
            name === "Skill" &&
            JSON.stringify(toolInput).includes("find-docs")
          ) {
            return true;
          }
          // Shelling out to the CLI counts in both "skill" and "cli" mode.
          if (name === "Bash") {
            const command = String(toolInput.command ?? "");
            if (command.includes("ctx7") || command.includes("context7")) return true;
          }
          break;
        }
      }
    }
  }

  return false;
}

/**
 * Determines which documentation provider(s) were called in a run, based on
 * the tool-use blocks found in newline-delimited JSON output.
 *
 * Only "assistant" events are inspected. Lines that are blank, are not valid
 * JSON, or do not parse to an object are skipped, as are events whose
 * `message.content` is not an array.
 *
 * @param stdout - Raw captured output, one JSON event per line.
 * @returns "both" when Context7 and Nia tools were each called at least once,
 *   "context7" or "nia" when only one of them was, and "neither" otherwise.
 */
export function detectVersusProvider(stdout: string): "context7" | "nia" | "both" | "neither" {
  let ctx7 = false;
  let nia = false;

  for (const line of stdout.split("\n")) {
    const trimmed = line.trim();
    if (!trimmed) continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(trimmed);
    } catch {
      continue;
    }
    // A line such as `null` is valid JSON but not an event; reading `.type`
    // from it would throw, so non-objects are skipped.
    if (typeof parsed !== "object" || parsed === null) continue;

    const event = parsed as StreamEvent;
    if (event.type !== "assistant") continue;

    const content = event.message?.content;
    if (!Array.isArray(content)) continue;

    for (const block of content) {
      if (block?.type !== "tool_use") continue;
      const name = typeof block.name === "string" ? block.name : "";
      if (isContext7Tool(name)) ctx7 = true;
      if (isNiaTool(name)) nia = true;
    }

    // Nothing later in the stream can change the answer once both were seen.
    if (ctx7 && nia) return "both";
  }

  if (ctx7) return "context7";
  if (nia) return "nia";
  return "neither";
}

/**
 * Builds a compact summary of the distinct tools called during a run, in
 * order of first use, e.g. `"ToolSearch -> ctx7:resolve -> ctx7:query"`.
 *
 * Each tool-use block in an "assistant" event is mapped to a label:
 *
 * - `Skill` -> `Skill(<id>)`, where the id comes from the input's `name`,
 *   `skill`, `skill_name` or `query` field, else from the first string value
 *   shorter than 100 characters, else `?`.
 * - `ToolSearch` -> `ToolSearch`.
 * - `Bash` -> `Bash(ctx7)` when the command mentions "ctx7", else `Bash`.
 * - Context7 tools -> `ctx7:resolve` or `ctx7:query`.
 * - Nia tools -> `nia:<name without the mcp__nia__ prefix>`.
 * - Anything else -> the tool name itself.
 *
 * Repeated labels are reported once. Lines that are blank, are not valid
 * JSON, or do not parse to an object are skipped, as are events whose
 * `message.content` is not an array.
 *
 * @param stdout - Raw captured output, one JSON event per line.
 * @returns The labels joined with " -> ", or `null` when no tool was called.
 */
export function extractToolChain(stdout: string): string | null {
  // A Set keeps insertion order, which gives "first use" ordering and
  // de-duplication in one structure.
  const chain = new Set<string>();

  for (const line of stdout.split("\n")) {
    const trimmed = line.trim();
    if (!trimmed) continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(trimmed);
    } catch {
      continue;
    }
    // Valid JSON is not necessarily an event object (e.g. a bare `null`);
    // reading properties from null would throw.
    if (typeof parsed !== "object" || parsed === null) continue;

    const event = parsed as StreamEvent;
    if (event.type !== "assistant") continue;

    const content = event.message?.content;
    if (!Array.isArray(content)) continue;

    for (const block of content) {
      if (block?.type !== "tool_use") continue;
      const name = typeof block.name === "string" ? block.name : "";
      const inp = block.input ?? {};

      let label: string;

      if (name === "Skill") {
        const skillId =
          (inp.name as string) ||
          (inp.skill as string) ||
          (inp.skill_name as string) ||
          (inp.query as string) ||
          "";
        // No well-known key present: fall back to the first short string value.
        const resolved =
          skillId ||
          Object.values(inp).find(
            (value): value is string => typeof value === "string" && value.length < 100,
          ) ||
          "";
        label = `Skill(${resolved || "?"})`;
      } else if (name === "ToolSearch") {
        label = "ToolSearch";
      } else if (name === "Bash") {
        const cmd = String(inp.command ?? "");
        // NOTE(review): detectTrigger also treats commands mentioning
        // "context7" as CLI usage; only "ctx7" is labelled here — confirm
        // whether that difference is intended.
        label = cmd.includes("ctx7") ? "Bash(ctx7)" : "Bash";
      } else if (isContext7Tool(name)) {
        label = name.includes("resolve") ? "ctx7:resolve" : "ctx7:query";
      } else if (isNiaTool(name)) {
        label = `nia:${name.replace("mcp__nia__", "")}`;
      } else {
        label = name;
      }

      chain.add(label);
    }
  }

  return chain.size > 0 ? Array.from(chain).join(" -> ") : null;
}
Loading
Loading