Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
232 changes: 232 additions & 0 deletions packages/client/src/experiment/RunnerContext.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
import { describe, expect, it, vi } from "vitest";

import type { LangfuseClient } from "../LangfuseClient.js";

import { RegressionError, RunnerContext } from "./RunnerContext.js";
import type { ExperimentResult } from "./types.js";

/**
 * Builds a minimal `ExperimentResult` fixture: fixed ids, no item results,
 * no run evaluations, and a mocked `format` that resolves to "formatted".
 */
function createResult(): ExperimentResult {
  const format = vi.fn(async () => "formatted");

  const fixture: ExperimentResult = {
    experimentId: "exp-1",
    runName: "run-1",
    itemResults: [],
    runEvaluations: [],
    format,
  };

  return fixture;
}

/**
 * Creates a `RunnerContext` wired to a stub client whose `experiment.run`
 * is a vitest mock.
 *
 * @param params - Optional context defaults forwarded to the constructor.
 * @returns The mock (`run`) together with the context under test (`ctx`).
 */
function createContext(params?: {
  data?: { input?: string; expectedOutput?: string }[];
  datasetVersion?: string;
  metadata?: Record<string, any>;
}) {
  const run = vi.fn();

  // Only `experiment.run` is exercised by the suite, so the stub is cast
  // rather than implementing the full client surface.
  const client = { experiment: { run } } as unknown as LangfuseClient;

  const ctx = new RunnerContext({
    client,
    data: params?.data,
    datasetVersion: params?.datasetVersion,
    metadata: params?.metadata,
  });

  return { run, ctx };
}

// Covers how RunnerContext.runExperiment combines context-level defaults with
// call-time parameters before delegating to `client.experiment.run`.
describe("RunnerContext", () => {
  // Placeholder task: these tests only inspect the arguments forwarded to `run`.
  const task = async () => "output";

  it("uses context defaults when call-time values are omitted", async () => {
    const expected = createResult();
    const { ctx, run } = createContext({
      data: [{ input: "ctx" }],
      datasetVersion: "2026-01-01T00:00:00.000Z",
      metadata: { sha: "abc" },
    });
    run.mockResolvedValue(expected);

    const pending = ctx.runExperiment({ name: "exp", task });

    // The mock's resolved value must be passed through untouched.
    await expect(pending).resolves.toBe(expected);

    expect(run).toHaveBeenCalledWith({
      name: "exp",
      task: expect.any(Function),
      data: [{ input: "ctx" }],
      datasetVersion: "2026-01-01T00:00:00.000Z",
      metadata: { sha: "abc" },
    });
  });

  it("lets call-time overrides win", async () => {
    const { ctx, run } = createContext({
      data: [{ input: "ctx" }],
      datasetVersion: "2026-01-01T00:00:00.000Z",
      metadata: { sha: "abc" },
    });
    run.mockResolvedValue(createResult());
    const overrideData = [{ input: "override" }];

    await ctx.runExperiment({
      name: "exp",
      runName: "call-run",
      data: overrideData,
      datasetVersion: "2026-06-06T00:00:00.000Z",
      metadata: { sha: "def", pr: 42 },
      task,
    });

    expect(run).toHaveBeenCalledWith({
      name: "exp",
      runName: "call-run",
      data: overrideData,
      datasetVersion: "2026-06-06T00:00:00.000Z",
      metadata: { sha: "def", pr: 42 },
      task: expect.any(Function),
    });
  });

  it("merges metadata with call-time keys winning on collision", async () => {
    const { ctx, run } = createContext({
      data: [{ input: "ctx" }],
      metadata: { sha: "abc", branch: "main" },
    });
    run.mockResolvedValue(createResult());

    await ctx.runExperiment({
      name: "exp",
      metadata: { sha: "def", pr: 42 },
      task,
    });

    // `sha` collides and takes the call-time value; `branch` survives from the context.
    expect(run).toHaveBeenCalledWith({
      name: "exp",
      data: [{ input: "ctx" }],
      datasetVersion: undefined,
      metadata: { sha: "def", branch: "main", pr: 42 },
      task: expect.any(Function),
    });
  });

  it("keeps metadata undefined when neither side provides it", async () => {
    const { ctx, run } = createContext({ data: [{ input: "ctx" }] });
    run.mockResolvedValue(createResult());

    await ctx.runExperiment({ name: "exp", task });

    expect(run).toHaveBeenCalledWith({
      name: "exp",
      data: [{ input: "ctx" }],
      datasetVersion: undefined,
      metadata: undefined,
      task: expect.any(Function),
    });
  });

  it("throws when data is missing on both the context and the call", async () => {
    const { ctx } = createContext();

    const pending = ctx.runExperiment({ name: "exp", task });

    await expect(pending).rejects.toThrow(
      "`data` must be provided either on the RunnerContext or the runExperiment call",
    );
  });
});

// Covers RegressionError: inheritance from Error, stored fields, the three
// message paths (default, structured, explicit), and the constructor shapes
// accepted at type level.
describe("RegressionError", () => {
// Basic identity: it is an Error, carries its own name, and keeps the result by reference.
it("is an error and stores the result", () => {
const result = createResult();
const error = new RegressionError({ result });

expect(error).toBeInstanceOf(Error);
expect(error.name).toBe("RegressionError");
expect(error.result).toBe(result);
});

// With only `result`, the message is the generic default and the metric fields stay unset.
it("uses the default message when no details are provided", () => {
const error = new RegressionError({ result: createResult() });

expect(error.message).toBe("Experiment regression detected");
expect(error.metric).toBeUndefined();
expect(error.value).toBeUndefined();
expect(error.threshold).toBeUndefined();
});

// metric + value + threshold produce the structured message and are all stored.
it("renders a structured message when metric and value are provided", () => {
const error = new RegressionError({
result: createResult(),
metric: "avg_accuracy",
value: 0.78,
threshold: 0.9,
});

expect(error.metric).toBe("avg_accuracy");
expect(error.value).toBe(0.78);
expect(error.threshold).toBe(0.9);
expect(error.message).toBe(
"Regression on `avg_accuracy`: 0.78 (threshold 0.9)",
);
});

// An explicit `message` replaces the structured text but the metric fields are still stored.
it("lets an explicit message win over the structured format", () => {
const error = new RegressionError({
result: createResult(),
metric: "avg_accuracy",
value: 0.5,
threshold: 0.9,
message: "custom explanation",
});

expect(error.message).toBe("custom explanation");
expect(error.metric).toBe("avg_accuracy");
expect(error.value).toBe(0.5);
expect(error.threshold).toBe(0.9);
});

// Runtime behaviour for input the types reject: `metric` without `value`
// falls back to the default message.
it("falls back to the default message for partial structured input", () => {
// @ts-expect-error Testing the runtime fallback when a caller bypasses the type system.
const error = new RegressionError({
result: createResult(),
metric: "avg_accuracy",
});

expect(error.message).toBe("Experiment regression detected");
});

// Type-level contract: the two valid shapes construct normally, while the
// `@ts-expect-error` lines below assert that the invalid shapes are rejected
// by the compiler. Keep each directive directly above the call it annotates.
it("only accepts the intended constructor shapes at type level", () => {
const result = createResult();

expect(
new RegressionError({ result, message: "custom explanation" }),
).toBeInstanceOf(RegressionError);
expect(
new RegressionError({
result,
metric: "avg_accuracy",
value: 0.5,
threshold: 0.9,
message: "custom explanation",
}),
).toBeInstanceOf(RegressionError);

// @ts-expect-error metric requires value.
new RegressionError({ result, metric: "avg_accuracy" });

// @ts-expect-error threshold is only valid in the structured shape.
new RegressionError({ result, threshold: 0.9 });
});
});
166 changes: 166 additions & 0 deletions packages/client/src/experiment/RunnerContext.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import type { LangfuseClient } from "../LangfuseClient.js";

import type {
ExperimentItem,
ExperimentParams,
ExperimentResult,
} from "./types.js";

/**
 * Constructor options for `RunnerContext`.
 *
 * - `client`: the Langfuse client whose `experiment.run` is invoked by
 *   `RunnerContext.runExperiment`.
 * - `data`: default experiment items, used when a `runExperiment` call does
 *   not pass its own `data`.
 * - `datasetVersion`: default dataset version, used when the call omits one.
 * - `metadata`: default metadata; shallow-merged with call-time metadata,
 *   with call-time keys winning on collision.
 */
export type RunnerContextOptions<
Input = any,
ExpectedOutput = any,
Metadata extends Record<string, any> = Record<string, any>,
> = {
client: LangfuseClient;
data?: ExperimentItem<Input, ExpectedOutput, Metadata>[];
datasetVersion?: string;
metadata?: Record<string, any>;
};

/**
 * Parameters accepted by `RunnerContext.runExperiment`.
 *
 * Same as `ExperimentParams`, except that `data`, `datasetVersion` and
 * `metadata` are re-declared here as optional so that the values stored on
 * the `RunnerContext` can stand in when the caller leaves them out.
 */
export type RunnerContextExperimentParams<
Input = any,
ExpectedOutput = any,
Metadata extends Record<string, any> = Record<string, any>,
> = Omit<
ExperimentParams<Input, ExpectedOutput, Metadata>,
"data" | "datasetVersion" | "metadata"
> & {
data?: ExperimentItem<Input, ExpectedOutput, Metadata>[];
datasetVersion?: string;
metadata?: Record<string, any>;
};

/**
 * Carries CI-injected defaults and applies them to `langfuse.experiment.run`.
 *
 * Intended for use with the `langfuse/experiment-action` GitHub Action.
 * Values stored on the context act as fallbacks: anything the caller passes
 * explicitly to `runExperiment` takes precedence.
 *
 * @public
 */
export class RunnerContext<
  Input = any,
  ExpectedOutput = any,
  Metadata extends Record<string, any> = Record<string, any>,
> {
  /** Client whose `experiment.run` performs the actual run. */
  public readonly client: LangfuseClient;
  /** Fallback experiment items. */
  public readonly data?: ExperimentItem<Input, ExpectedOutput, Metadata>[];
  /** Fallback dataset version. */
  public readonly datasetVersion?: string;
  /** Base metadata merged underneath call-time metadata. */
  public readonly metadata?: Record<string, any>;

  constructor(options: RunnerContextOptions<Input, ExpectedOutput, Metadata>) {
    this.client = options.client;
    this.data = options.data;
    this.datasetVersion = options.datasetVersion;
    this.metadata = options.metadata;
  }

  /**
   * Runs an experiment through the wrapped client, filling in `data`,
   * `datasetVersion` and `metadata` from the context where the call omits them.
   *
   * @param params - Experiment parameters; explicit values override the context.
   * @returns Whatever `client.experiment.run` resolves to.
   * @throws Error when neither the call nor the context supplies `data`.
   */
  async runExperiment(
    params: RunnerContextExperimentParams<Input, ExpectedOutput, Metadata>,
  ): Promise<ExperimentResult<Input, ExpectedOutput, Metadata>> {
    const data = params.data ?? this.data;
    if (data === undefined) {
      throw new Error(
        "`data` must be provided either on the RunnerContext or the runExperiment call",
      );
    }

    // Stay `undefined` when no metadata exists on either side; otherwise do a
    // shallow merge in which call-time keys overwrite context keys.
    let metadata: Record<string, any> | undefined;
    if (this.metadata !== undefined || params.metadata !== undefined) {
      metadata = { ...this.metadata, ...params.metadata };
    }

    const datasetVersion = params.datasetVersion ?? this.datasetVersion;

    return this.client.experiment.run({
      ...params,
      data,
      datasetVersion,
      metadata,
    });
  }
}

/**
 * `RegressionError` options for the message-only shape.
 *
 * Carries the experiment `result` and an optional free-form `message`.
 * `metric`, `value` and `threshold` are typed as `never` so that they cannot
 * be supplied with this shape.
 */
export type RegressionErrorMessageOptions<
Input = any,
ExpectedOutput = any,
Metadata extends Record<string, any> = Record<string, any>,
> = {
result: ExperimentResult<Input, ExpectedOutput, Metadata>;
message?: string;
metric?: never;
value?: never;
threshold?: never;
};

/**
 * `RegressionError` options for the structured (metric) shape.
 *
 * `metric` and `value` are both required; `threshold` is optional. An
 * optional `message`, when given, is used instead of the generated
 * structured message.
 */
export type RegressionErrorMetricOptions<
Input = any,
ExpectedOutput = any,
Metadata extends Record<string, any> = Record<string, any>,
> = {
result: ExperimentResult<Input, ExpectedOutput, Metadata>;
metric: string;
value: number;
threshold?: number;
message?: string;
};

/**
 * Union of the two constructor shapes accepted by `RegressionError`:
 * message-only, or structured with `metric` and `value`.
 */
export type RegressionErrorOptions<
Input = any,
ExpectedOutput = any,
Metadata extends Record<string, any> = Record<string, any>,
> =
| RegressionErrorMessageOptions<Input, ExpectedOutput, Metadata>
| RegressionErrorMetricOptions<Input, ExpectedOutput, Metadata>;

/**
 * Raised by experiment runners to signal a CI gate failure.
 *
 * Intended for use with the `langfuse/experiment-action` GitHub Action.
 *
 * The error message is chosen in this order:
 * 1. an explicit `message`, when provided;
 * 2. a structured message built from `metric` and `value` (with the
 *    threshold appended only when a `threshold` was given);
 * 3. the generic "Experiment regression detected".
 *
 * @public
 */
export class RegressionError<
  Input = any,
  ExpectedOutput = any,
  Metadata extends Record<string, any> = Record<string, any>,
> extends Error {
  /** Experiment result that triggered the failure. */
  public readonly result: ExperimentResult<Input, ExpectedOutput, Metadata>;
  /** Name of the regressed metric, if the structured shape was used. */
  public readonly metric?: string;
  /** Observed value of the metric, if the structured shape was used. */
  public readonly value?: number;
  /** Threshold the value was compared against, if one was given. */
  public readonly threshold?: number;

  constructor(
    params: RegressionErrorMessageOptions<Input, ExpectedOutput, Metadata>,
  );
  constructor(
    params: RegressionErrorMetricOptions<Input, ExpectedOutput, Metadata>,
  );
  constructor(params: RegressionErrorOptions<Input, ExpectedOutput, Metadata>) {
    const { result, message } = params;
    const metric = "metric" in params ? params.metric : undefined;
    const value = "value" in params ? params.value : undefined;
    const threshold = "threshold" in params ? params.threshold : undefined;

    let fallbackMessage = "Experiment regression detected";
    if (metric !== undefined && value !== undefined) {
      // `threshold` is optional in the structured shape: only mention it when
      // present, so the message never reads "(threshold undefined)".
      const thresholdSuffix =
        threshold !== undefined ? ` (threshold ${threshold})` : "";
      fallbackMessage = `Regression on \`${metric}\`: ${value}${thresholdSuffix}`;
    }

    super(message ?? fallbackMessage);

    this.name = "RegressionError";
    this.result = result;
    this.metric = metric;
    this.value = value;
    this.threshold = threshold;
  }
}
Loading
Loading