Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
245 changes: 245 additions & 0 deletions packages/client/src/experiment/RunnerContext.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
import { describe, expect, it, vi } from "vitest";

import type { LangfuseClient } from "../LangfuseClient.js";

import { RegressionError, RunnerContext } from "./RunnerContext.js";
import type { ExperimentResult } from "./types.js";

/**
 * Builds a minimal `ExperimentResult` fixture for the tests in this file.
 *
 * The fixture has fixed identifiers, empty `itemResults` / `runEvaluations`
 * lists, and a mocked `format` that resolves to the string "formatted".
 *
 * @returns A fresh fixture object on every call.
 */
function createResult(): ExperimentResult {
  const fixture: ExperimentResult = {
    experimentId: "exp-1",
    runName: "run-1",
    itemResults: [],
    runEvaluations: [],
    format: vi.fn(async () => "formatted"),
  };

  return fixture;
}

/**
 * Creates a `RunnerContext` backed by a stub client whose only member is
 * `experiment.run`, implemented as a vitest mock.
 *
 * @param params - Optional context-level defaults (`data`, `datasetVersion`,
 *   `metadata`) forwarded to the `RunnerContext` constructor. Omitted fields
 *   are passed through as `undefined`.
 * @returns The `run` mock (for stubbing and call assertions) and the
 *   constructed context as `ctx`.
 */
function createContext(params?: {
  data?: { input?: string; expectedOutput?: string }[];
  datasetVersion?: string;
  metadata?: Record<string, any>;
}) {
  const { data, datasetVersion, metadata } = params ?? {};

  const run = vi.fn();
  // Only `experiment.run` is stubbed, so the double cast is needed to present
  // this partial object as a full LangfuseClient.
  const client = { experiment: { run } } as unknown as LangfuseClient;

  const ctx = new RunnerContext({ client, data, datasetVersion, metadata });

  return { run, ctx };
}

// Checks how `RunnerContext.runExperiment` combines context-level defaults
// with call-time arguments, by inspecting what reaches the mocked
// `client.experiment.run`.
describe("RunnerContext", () => {
  // Shared task stub; the assertions only require that a function is forwarded.
  const task = async () => "output";

  it("uses context defaults when call-time values are omitted", async () => {
    const { ctx, run } = createContext({
      data: [{ input: "ctx" }],
      datasetVersion: "2026-01-01T00:00:00.000Z",
      metadata: { sha: "abc" },
    });
    const expected = createResult();
    run.mockResolvedValue(expected);

    // The value resolved by the mock must come back as the same reference.
    await expect(ctx.runExperiment({ name: "exp", task })).resolves.toBe(
      expected,
    );

    expect(run).toHaveBeenCalledWith({
      name: "exp",
      task: expect.any(Function),
      data: [{ input: "ctx" }],
      datasetVersion: "2026-01-01T00:00:00.000Z",
      metadata: { sha: "abc" },
    });
  });

  it("lets call-time overrides win", async () => {
    const callData = [{ input: "override" }];
    const { ctx, run } = createContext({
      data: [{ input: "ctx" }],
      datasetVersion: "2026-01-01T00:00:00.000Z",
      metadata: { sha: "abc" },
    });
    run.mockResolvedValue(createResult());

    await ctx.runExperiment({
      name: "exp",
      runName: "call-run",
      data: callData,
      datasetVersion: "2026-06-06T00:00:00.000Z",
      metadata: { sha: "def", pr: 42 },
      task,
    });

    expect(run).toHaveBeenCalledWith({
      name: "exp",
      runName: "call-run",
      data: callData,
      datasetVersion: "2026-06-06T00:00:00.000Z",
      metadata: { sha: "def", pr: 42 },
      task: expect.any(Function),
    });
  });

  it("merges metadata with call-time keys winning on collision", async () => {
    const { ctx, run } = createContext({
      data: [{ input: "ctx" }],
      metadata: { sha: "abc", branch: "main" },
    });
    run.mockResolvedValue(createResult());

    await ctx.runExperiment({
      name: "exp",
      metadata: { sha: "def", pr: 42 },
      task,
    });

    // `sha` collides (call-time value expected); `branch` and `pr` each come
    // from one side only and must both survive.
    expect(run).toHaveBeenCalledWith({
      name: "exp",
      data: [{ input: "ctx" }],
      datasetVersion: undefined,
      metadata: { sha: "def", branch: "main", pr: 42 },
      task: expect.any(Function),
    });
  });

  it("keeps metadata undefined when neither side provides it", async () => {
    const { ctx, run } = createContext({ data: [{ input: "ctx" }] });
    run.mockResolvedValue(createResult());

    await ctx.runExperiment({ name: "exp", task });

    expect(run).toHaveBeenCalledWith({
      name: "exp",
      data: [{ input: "ctx" }],
      datasetVersion: undefined,
      metadata: undefined,
      task: expect.any(Function),
    });
  });

  it("throws when data is missing on both the context and the call", async () => {
    const { ctx } = createContext();

    const pending = ctx.runExperiment({ name: "exp", task });

    await expect(pending).rejects.toThrow(
      "`data` must be provided either on the RunnerContext or the runExperiment call",
    );
  });
});

// Checks the `RegressionError` constructor: which fields it stores, how the
// message is derived from the inputs, and which argument shapes type-check.
describe("RegressionError", () => {
  it("is an error and stores the result", () => {
    const result = createResult();

    const regression = new RegressionError({ result });

    expect(regression).toBeInstanceOf(Error);
    expect(regression.name).toBe("RegressionError");
    expect(regression.result).toBe(result);
  });

  it("uses the default message when no details are provided", () => {
    const regression = new RegressionError({ result: createResult() });

    expect(regression.message).toBe("Experiment regression detected");
    expect(regression.metric).toBeUndefined();
    expect(regression.value).toBeUndefined();
    expect(regression.threshold).toBeUndefined();
  });

  it("renders a structured message when metric and value are provided", () => {
    const regression = new RegressionError({
      result: createResult(),
      metric: "avg_accuracy",
      value: 0.78,
      threshold: 0.9,
    });

    expect(regression.metric).toBe("avg_accuracy");
    expect(regression.value).toBe(0.78);
    expect(regression.threshold).toBe(0.9);
    expect(regression.message).toBe(
      "Regression on `avg_accuracy`: 0.78 (threshold 0.9)",
    );
  });

  it("omits the threshold suffix when no threshold is provided", () => {
    const regression = new RegressionError({
      result: createResult(),
      metric: "avg_accuracy",
      value: 0.78,
    });

    expect(regression.metric).toBe("avg_accuracy");
    expect(regression.value).toBe(0.78);
    expect(regression.threshold).toBeUndefined();
    expect(regression.message).toBe("Regression on `avg_accuracy`: 0.78");
  });

  it("lets an explicit message win over the structured format", () => {
    const regression = new RegressionError({
      result: createResult(),
      metric: "avg_accuracy",
      value: 0.5,
      threshold: 0.9,
      message: "custom explanation",
    });

    // The structured fields are still stored even though the message is custom.
    expect(regression.message).toBe("custom explanation");
    expect(regression.metric).toBe("avg_accuracy");
    expect(regression.value).toBe(0.5);
    expect(regression.threshold).toBe(0.9);
  });

  it("falls back to the default message for partial structured input", () => {
    const result = createResult();

    // The call is kept on one line so the directive directly precedes it.
    // @ts-expect-error Testing the runtime fallback when a caller bypasses the type system.
    const regression = new RegressionError({ result, metric: "avg_accuracy" });

    expect(regression.message).toBe("Experiment regression detected");
  });

  it("only accepts the intended constructor shapes at type level", () => {
    const result = createResult();

    // Shapes that must compile: message-only, and fully structured + message.
    const messageOnly = new RegressionError({
      result,
      message: "custom explanation",
    });
    const structuredWithMessage = new RegressionError({
      result,
      metric: "avg_accuracy",
      value: 0.5,
      threshold: 0.9,
      message: "custom explanation",
    });

    expect(messageOnly).toBeInstanceOf(RegressionError);
    expect(structuredWithMessage).toBeInstanceOf(RegressionError);

    // Shapes that must be rejected by the compiler. Each directive has to sit
    // directly above the offending call.
    // @ts-expect-error metric requires value.
    new RegressionError({ result, metric: "avg_accuracy" });

    // @ts-expect-error threshold is only valid in the structured shape.
    new RegressionError({ result, threshold: 0.9 });
  });
});
Loading
Loading