diff --git a/packages/client/src/experiment/RunnerContext.test.ts b/packages/client/src/experiment/RunnerContext.test.ts new file mode 100644 index 00000000..ee0fa605 --- /dev/null +++ b/packages/client/src/experiment/RunnerContext.test.ts @@ -0,0 +1,245 @@ +import { describe, expect, it, vi } from "vitest"; + +import type { LangfuseClient } from "../LangfuseClient.js"; + +import { RegressionError, RunnerContext } from "./RunnerContext.js"; +import type { ExperimentResult } from "./types.js"; +/** Builds a minimal ExperimentResult fixture: fixed ids, empty itemResults and runEvaluations, and a mocked format function. */ +function createResult(): ExperimentResult { + return { + experimentId: "exp-1", + runName: "run-1", + itemResults: [], + runEvaluations: [], + format: vi.fn(async () => "formatted"), + }; +} +/** Wraps a stub client (only experiment.run, a vi.fn mock) in a RunnerContext and returns both the mock and the context. */ +function createContext(params?: { + data?: { input?: string; expectedOutput?: string }[]; + datasetVersion?: string; + metadata?: Record; +}) { + const run = vi.fn(); + const client = { + experiment: { run }, + } as unknown as LangfuseClient; + + return { + run, + ctx: new RunnerContext({ + client, + data: params?.data, + datasetVersion: params?.datasetVersion, + metadata: params?.metadata, + }), + }; +} + +describe("RunnerContext", () => { + it("uses context defaults when call-time values are omitted", async () => { + const result = createResult(); + const { ctx, run } = createContext({ + data: [{ input: "ctx" }], + datasetVersion: "2026-01-01T00:00:00.000Z", + metadata: { sha: "abc" }, + }); + run.mockResolvedValue(result); + + await expect( + ctx.runExperiment({ + name: "exp", + task: async () => "output", + }), + ).resolves.toBe(result); + + expect(run).toHaveBeenCalledWith({ + name: "exp", + task: expect.any(Function), + data: [{ input: "ctx" }], + datasetVersion: "2026-01-01T00:00:00.000Z", + metadata: { sha: "abc" }, + }); + }); + + it("lets call-time overrides win", async () => { + const result = createResult(); + const { ctx, run } = createContext({ + data: [{ input: "ctx" }], + datasetVersion: "2026-01-01T00:00:00.000Z", + metadata: { sha: "abc" }, + }); + const 
overrideData = [{ input: "override" }]; + run.mockResolvedValue(result); + + await ctx.runExperiment({ + name: "exp", + runName: "call-run", + data: overrideData, + datasetVersion: "2026-06-06T00:00:00.000Z", + metadata: { sha: "def", pr: 42 }, + task: async () => "output", + }); + + expect(run).toHaveBeenCalledWith({ + name: "exp", + runName: "call-run", + data: overrideData, + datasetVersion: "2026-06-06T00:00:00.000Z", + metadata: { sha: "def", pr: 42 }, + task: expect.any(Function), + }); + }); + + it("merges metadata with call-time keys winning on collision", async () => { + const { ctx, run } = createContext({ + data: [{ input: "ctx" }], + metadata: { sha: "abc", branch: "main" }, + }); + run.mockResolvedValue(createResult()); + + await ctx.runExperiment({ + name: "exp", + metadata: { sha: "def", pr: 42 }, + task: async () => "output", + }); + + expect(run).toHaveBeenCalledWith({ + name: "exp", + data: [{ input: "ctx" }], + datasetVersion: undefined, + metadata: { sha: "def", branch: "main", pr: 42 }, + task: expect.any(Function), + }); + }); + + it("keeps metadata undefined when neither side provides it", async () => { + const { ctx, run } = createContext({ + data: [{ input: "ctx" }], + }); + run.mockResolvedValue(createResult()); + + await ctx.runExperiment({ + name: "exp", + task: async () => "output", + }); + + expect(run).toHaveBeenCalledWith({ + name: "exp", + data: [{ input: "ctx" }], + datasetVersion: undefined, + metadata: undefined, + task: expect.any(Function), + }); + }); + + it("throws when data is missing on both the context and the call", async () => { + const { ctx } = createContext(); + + await expect( + ctx.runExperiment({ + name: "exp", + task: async () => "output", + }), + ).rejects.toThrow( + "`data` must be provided either on the RunnerContext or the runExperiment call", + ); + }); +}); + +describe("RegressionError", () => { + it("is an error and stores the result", () => { + const result = createResult(); + const error = new 
RegressionError({ result }); + + expect(error).toBeInstanceOf(Error); + expect(error.name).toBe("RegressionError"); + expect(error.result).toBe(result); + }); + + it("uses the default message when no details are provided", () => { + const error = new RegressionError({ result: createResult() }); + + expect(error.message).toBe("Experiment regression detected"); + expect(error.metric).toBeUndefined(); + expect(error.value).toBeUndefined(); + expect(error.threshold).toBeUndefined(); + }); + + it("renders a structured message when metric and value are provided", () => { + const error = new RegressionError({ + result: createResult(), + metric: "avg_accuracy", + value: 0.78, + threshold: 0.9, + }); + + expect(error.metric).toBe("avg_accuracy"); + expect(error.value).toBe(0.78); + expect(error.threshold).toBe(0.9); + expect(error.message).toBe( + "Regression on `avg_accuracy`: 0.78 (threshold 0.9)", + ); + }); + + it("omits the threshold suffix when no threshold is provided", () => { + const error = new RegressionError({ + result: createResult(), + metric: "avg_accuracy", + value: 0.78, + }); + + expect(error.metric).toBe("avg_accuracy"); + expect(error.value).toBe(0.78); + expect(error.threshold).toBeUndefined(); + expect(error.message).toBe("Regression on `avg_accuracy`: 0.78"); + }); + + it("lets an explicit message win over the structured format", () => { + const error = new RegressionError({ + result: createResult(), + metric: "avg_accuracy", + value: 0.5, + threshold: 0.9, + message: "custom explanation", + }); + + expect(error.message).toBe("custom explanation"); + expect(error.metric).toBe("avg_accuracy"); + expect(error.value).toBe(0.5); + expect(error.threshold).toBe(0.9); + }); + + it("falls back to the default message for partial structured input", () => { + // @ts-expect-error Testing the runtime fallback when a caller bypasses the type system. 
+ const error = new RegressionError({ + result: createResult(), + metric: "avg_accuracy", + }); + + expect(error.message).toBe("Experiment regression detected"); + }); + + it("only accepts the intended constructor shapes at type level", () => { + const result = createResult(); + + expect( + new RegressionError({ result, message: "custom explanation" }), + ).toBeInstanceOf(RegressionError); + expect( + new RegressionError({ + result, + metric: "avg_accuracy", + value: 0.5, + threshold: 0.9, + message: "custom explanation", + }), + ).toBeInstanceOf(RegressionError); + + // @ts-expect-error metric requires value. + new RegressionError({ result, metric: "avg_accuracy" }); + + // @ts-expect-error threshold is only valid in the structured shape. + new RegressionError({ result, threshold: 0.9 }); + }); +}); diff --git a/packages/client/src/experiment/RunnerContext.ts b/packages/client/src/experiment/RunnerContext.ts new file mode 100644 index 00000000..b0f9011c --- /dev/null +++ b/packages/client/src/experiment/RunnerContext.ts @@ -0,0 +1,166 @@ +import type { LangfuseClient } from "../LangfuseClient.js"; + +import type { + ExperimentItem, + ExperimentParams, + ExperimentResult, +} from "./types.js"; +/** Constructor options for RunnerContext: a required client plus optional data, datasetVersion and metadata defaults. NOTE(review): generic type arguments look stripped in this diff (for example Record = Record); confirm against the committed file. */ +export type RunnerContextOptions< + Input = any, + ExpectedOutput = any, + Metadata extends Record = Record, +> = { + client: LangfuseClient; + data?: ExperimentItem[]; + datasetVersion?: string; + metadata?: Record; +}; +/** ExperimentParams with data, datasetVersion and metadata omitted and re-declared as optional, which is the parameter type accepted by RunnerContext.runExperiment. */ +export type RunnerContextExperimentParams< + Input = any, + ExpectedOutput = any, + Metadata extends Record = Record, +> = Omit< + ExperimentParams, + "data" | "datasetVersion" | "metadata" +> & { + data?: ExperimentItem[]; + datasetVersion?: string; + metadata?: Record; +}; + +/** + * Wraps `langfuse.experiment.run` with CI-injected defaults. + * + * Intended for use with the `langfuse/experiment-action` GitHub Action. 
+ * Defaults set here are applied when the caller omits them on the + * `runExperiment` call, while explicit call-time values still win. + * + * @public + */ +export class RunnerContext< + Input = any, + ExpectedOutput = any, + Metadata extends Record = Record, +> { + public readonly client: LangfuseClient; + public readonly data?: ExperimentItem[]; + public readonly datasetVersion?: string; + public readonly metadata?: Record; +/** Stores the client and the optional defaults exactly as given; no validation happens here. */ + constructor({ + client, + data, + datasetVersion, + metadata, + }: RunnerContextOptions) { + this.client = client; + this.data = data; + this.datasetVersion = datasetVersion; + this.metadata = metadata; + } +/** Runs an experiment through client.experiment.run. Call-time data and datasetVersion fall back to the context values via the nullish operator; metadata objects are shallow-merged with call-time keys winning. Throws an Error when data is undefined on both the call and the context. */ + async runExperiment( + params: RunnerContextExperimentParams, + ): Promise> { + const resolvedData = params.data ?? this.data; +/* Fail before calling the client when neither the call nor the context supplied data. */ + if (resolvedData === undefined) { + throw new Error( + "`data` must be provided either on the RunnerContext or the runExperiment call", + ); + } +/* Stays undefined when both sides are undefined; otherwise context keys are spread first and call-time keys last so the latter win on collision. */ + const mergedMetadata = + this.metadata === undefined && params.metadata === undefined + ? undefined + : { + ...(this.metadata ?? {}), + ...(params.metadata ?? {}), + }; +/* params is spread first so the resolved data, datasetVersion and metadata listed after it take precedence. */ + return this.client.experiment.run({ + ...params, + data: resolvedData, + datasetVersion: params.datasetVersion ?? 
this.datasetVersion, + metadata: mergedMetadata, + }); + } +} +/** Message-only constructor shape: result plus an optional message. metric, value and threshold are typed never so they cannot be supplied in this shape. */ +export type RegressionErrorMessageOptions< + Input = any, + ExpectedOutput = any, + Metadata extends Record = Record, +> = { + result: ExperimentResult; + message?: string; + metric?: never; + value?: never; + threshold?: never; +}; +/** Structured constructor shape: result, metric and value are required; threshold and message are optional. */ +export type RegressionErrorMetricOptions< + Input = any, + ExpectedOutput = any, + Metadata extends Record = Record, +> = { + result: ExperimentResult; + metric: string; + value: number; + threshold?: number; + message?: string; +}; +/** Union of the message-only and structured shapes; this is the parameter type of the constructor implementation. */ +export type RegressionErrorOptions< + Input = any, + ExpectedOutput = any, + Metadata extends Record = Record, +> = + | RegressionErrorMessageOptions + | RegressionErrorMetricOptions; + +/** + * Raised by experiment runners to signal a CI gate failure. + * + * Intended for use with the `langfuse/experiment-action` GitHub Action. + * + * @public + */ +export class RegressionError< + Input = any, + ExpectedOutput = any, + Metadata extends Record = Record, +> extends Error { + public readonly result: ExperimentResult; + public readonly metric?: string; + public readonly value?: number; + public readonly threshold?: number; +/* Two public overloads (message-only, then structured) followed by the single implementation signature that accepts the union. */ + constructor( + params: RegressionErrorMessageOptions, + ); + constructor( + params: RegressionErrorMetricOptions, + ); + constructor(params: RegressionErrorOptions) { + const { result, message } = params; + const metric = "metric" in params ? params.metric : undefined; + const value = "value" in params ? params.value : undefined; + const threshold = "threshold" in params ? params.threshold : undefined; +/* Message precedence: an explicit message wins; otherwise the structured text is used only when both metric and value are defined (threshold suffix is optional); otherwise the fixed default text. */ + super( + message ?? + (metric !== undefined && value !== undefined + ? `Regression on \`${metric}\`: ${value}${threshold !== undefined ? 
` (threshold ${threshold})` : ""}` + : "Experiment regression detected"), + ); + + this.name = "RegressionError"; + this.result = result; + this.metric = metric; + this.value = value; + this.threshold = threshold; + } +} diff --git a/packages/client/src/index.ts b/packages/client/src/index.ts index d55bc403..d9dcc83e 100644 --- a/packages/client/src/index.ts +++ b/packages/client/src/index.ts @@ -4,5 +4,6 @@ export * from "./score/index.js"; export * from "./dataset/index.js"; export * from "./media/index.js"; export * from "./experiment/ExperimentManager.js"; +export * from "./experiment/RunnerContext.js"; export * from "./experiment/adapters.js"; export * from "./experiment/types.js";