From 8ee05763150ad0f585f02adeb7ec3183bca1fbae Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Mon, 20 Apr 2026 17:13:27 +0200 Subject: [PATCH 1/4] feat(ci): add RunnerContext and RegressionError for experiment GH action Adds the SDK-side primitives consumed by the upcoming `langfuse/experiment-action` GitHub Action (LFE-9241): - `RunnerContext` wraps `Langfuse.run_experiment` with action-injected defaults (data, dataset_version, name, run_name, metadata). Users can override any default on the call site; metadata is merged with user-supplied keys winning on collision. - `RegressionError` lets users signal a CI gate failure and optionally pass structured `metric`/`value`/`threshold` fields so the action can render a callout in the PR comment. Both live in a dedicated `langfuse/ci.py` module so the CI surface stays isolated from the general experiment API. Co-Authored-By: Claude Opus 4.7 (1M context) --- langfuse/__init__.py | 3 + langfuse/ci.py | 166 ++++++++++++++++++++++++++++++ tests/unit/test_ci.py | 231 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 400 insertions(+) create mode 100644 langfuse/ci.py create mode 100644 tests/unit/test_ci.py diff --git a/langfuse/__init__.py b/langfuse/__init__.py index d33febca7..c5f5b9d30 100644 --- a/langfuse/__init__.py +++ b/langfuse/__init__.py @@ -8,6 +8,7 @@ EvaluatorStats, MapperFunction, ) +from langfuse.ci import RegressionError, RunnerContext from langfuse.experiment import Evaluation from ._client import client as _client_module @@ -63,6 +64,8 @@ "EvaluatorStats", "BatchEvaluationResumeToken", "BatchEvaluationResult", + "RunnerContext", + "RegressionError", "__version__", "is_default_export_span", "is_langfuse_span", diff --git a/langfuse/ci.py b/langfuse/ci.py new file mode 100644 index 000000000..3d10af156 --- /dev/null +++ b/langfuse/ci.py @@ -0,0 +1,166 @@ +"""CI/CD helpers for running Langfuse experiments in GitHub Actions. + +Designed to be used in conjunction with the ``langfuse/experiment-action`` +GitHub Action (https://github.com/langfuse/experiment-action). The action +constructs a :class:`RunnerContext` pre-populated with dataset, run name, and +GitHub-sourced metadata, then calls the user's ``experiment(context)`` +function. +""" + +from datetime import datetime +from typing import TYPE_CHECKING, Dict, List, Optional + +from langfuse.batch_evaluation import CompositeEvaluatorFunction +from langfuse.experiment import ( + EvaluatorFunction, + ExperimentData, + ExperimentResult, + RunEvaluatorFunction, + TaskFunction, +) + +if TYPE_CHECKING: + from langfuse._client.client import Langfuse + + +class RunnerContext: + """Wraps :meth:`Langfuse.run_experiment` with CI-injected defaults. + + Intended for use with the ``langfuse/experiment-action`` GitHub Action + (https://github.com/langfuse/experiment-action). The action builds a + ``RunnerContext`` before invoking the user's ``experiment(context)`` + function. Defaults set here (dataset, name, run name, metadata tags) are + applied when the user omits them on the :meth:`run_experiment` call; + users can override any default by passing the corresponding argument + explicitly. + """ + + def __init__( + self, + *, + client: "Langfuse", + data: Optional[ExperimentData] = None, + dataset_version: Optional[datetime] = None, + name: Optional[str] = None, + run_name: Optional[str] = None, + metadata: Optional[Dict[str, str]] = None, + ): + """Build a ``RunnerContext`` populated with defaults for ``run_experiment``. + + Typically called by the ``langfuse/experiment-action`` GitHub Action, + not by end users directly. Every field except ``client`` is optional: + fields left as ``None`` simply mean the corresponding argument must be + supplied on the :meth:`run_experiment` call. + + Args: + client: Initialized Langfuse SDK client used to execute the + experiment. The action creates this from the + ``langfuse_public_key`` / ``langfuse_secret_key`` / + ``langfuse_base_url`` inputs. + data: Default dataset items to run the experiment on. Accepts + either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``. + Injected by the action when ``dataset_name`` is configured. + If ``None``, the user must pass ``data=`` to + :meth:`run_experiment`. + dataset_version: Optional pinned dataset version. Injected by the + action when ``dataset_version`` is configured. + name: Default human-readable experiment name (e.g. the action's + ``experiment_name`` input). If ``None``, the user must pass + ``name=`` to :meth:`run_experiment`. + run_name: Default exact run name. The action typically derives + this from the commit SHA / PR number so that reruns produce + distinct runs in Langfuse. + metadata: Default metadata attached to every experiment trace and + the dataset run. The action injects GitHub-sourced tags (SHA, + PR link, workflow run link, branch, GH user, etc.). Merged + with any ``metadata`` passed to :meth:`run_experiment`, with + user-supplied keys winning on collision. + """ + self.client = client + self.data = data + self.dataset_version = dataset_version + self.name = name + self.run_name = run_name + self.metadata = metadata + + def run_experiment( + self, + *, + name: Optional[str] = None, + run_name: Optional[str] = None, + description: Optional[str] = None, + data: Optional[ExperimentData] = None, + task: TaskFunction, + evaluators: List[EvaluatorFunction] = [], + composite_evaluator: Optional[CompositeEvaluatorFunction] = None, + run_evaluators: List[RunEvaluatorFunction] = [], + max_concurrency: int = 50, + metadata: Optional[Dict[str, str]] = None, + _dataset_version: Optional[datetime] = None, + ) -> ExperimentResult: + resolved_name = name if name is not None else self.name + if resolved_name is None: + raise ValueError( + "`name` must be provided either on the RunnerContext or the run_experiment call" + ) + + resolved_data = data if data is not None else self.data + if resolved_data is None: + raise ValueError( + "`data` must be provided either on the RunnerContext or the run_experiment call" + ) + + resolved_run_name = run_name if run_name is not None else self.run_name + resolved_dataset_version = ( + _dataset_version if _dataset_version is not None else self.dataset_version + ) + + merged_metadata: Optional[Dict[str, str]] + if self.metadata is None and metadata is None: + merged_metadata = None + else: + merged_metadata = {**(self.metadata or {}), **(metadata or {})} + + return self.client.run_experiment( + name=resolved_name, + run_name=resolved_run_name, + description=description, + data=resolved_data, + task=task, + evaluators=evaluators, + composite_evaluator=composite_evaluator, + run_evaluators=run_evaluators, + max_concurrency=max_concurrency, + metadata=merged_metadata, + _dataset_version=resolved_dataset_version, + ) + + +class RegressionError(Exception): + """Raised by a user's ``experiment`` function to signal a CI gate failure. + + The GitHub action catches this exception and, when ``should_fail_on_error`` + is enabled, fails the workflow run and renders a callout in the PR comment + using ``metric``/``value``/``threshold`` if supplied, otherwise ``str(exc)``. + """ + + def __init__( + self, + *, + result: ExperimentResult, + metric: Optional[str] = None, + value: Optional[float] = None, + threshold: Optional[float] = None, + message: Optional[str] = None, + ): + self.result = result + self.metric = metric + self.value = value + self.threshold = threshold + if message is not None: + formatted = message + elif metric is not None: + formatted = f"Regression on `{metric}`: {value} (threshold {threshold})" + else: + formatted = "Experiment regression detected" + super().__init__(formatted) diff --git a/tests/unit/test_ci.py b/tests/unit/test_ci.py new file mode 100644 index 000000000..b9b1e986c --- /dev/null +++ b/tests/unit/test_ci.py @@ -0,0 +1,231 @@ +"""Tests for ``langfuse.ci`` — ``RunnerContext`` and ``RegressionError``.""" + +import inspect +from datetime import datetime +from typing import Dict +from unittest.mock import MagicMock + +import pytest + +from langfuse import RegressionError, RunnerContext +from langfuse._client.client import Langfuse + + +def _noop_task(*, item, **kwargs): # pragma: no cover - never invoked via mock + return None + + +def _make_ctx(**kwargs) -> RunnerContext: + client = MagicMock(spec=Langfuse) + client.run_experiment.return_value = "result-sentinel" + return RunnerContext(client=client, **kwargs) + + +class TestRunnerContextDefaults: + def test_context_defaults_flow_through(self): + ctx_data = [{"input": "a"}] + ctx_version = datetime(2026, 1, 1) + ctx = _make_ctx( + data=ctx_data, + dataset_version=ctx_version, + name="ctx-name", + run_name="ctx-run", + metadata={"sha": "abc123"}, + ) + + result = ctx.run_experiment(task=_noop_task) + + assert result == "result-sentinel" + ctx.client.run_experiment.assert_called_once() + kwargs = ctx.client.run_experiment.call_args.kwargs + assert kwargs["name"] == "ctx-name" + assert kwargs["run_name"] == "ctx-run" + assert kwargs["data"] is ctx_data + assert kwargs["metadata"] == {"sha": "abc123"} + assert kwargs["_dataset_version"] == ctx_version + assert kwargs["task"] is _noop_task + + def test_call_overrides_win(self): + ctx = _make_ctx( + data=[{"input": "ctx"}], + dataset_version=datetime(2026, 1, 1), + name="ctx-name", + run_name="ctx-run", + ) + + override_data = [{"input": "override"}] + override_version = datetime(2026, 6, 6) + ctx.run_experiment( + task=_noop_task, + name="call-name", + run_name="call-run", + data=override_data, + _dataset_version=override_version, + ) + + kwargs = ctx.client.run_experiment.call_args.kwargs + assert kwargs["name"] == "call-name" + assert kwargs["run_name"] == "call-run" + assert kwargs["data"] is override_data + assert kwargs["_dataset_version"] == override_version + + +class TestRunnerContextMetadataMerge: + def test_user_keys_win_on_collision(self): + ctx = _make_ctx( + data=[{"input": "a"}], + name="n", + metadata={"sha": "abc", "branch": "main"}, + ) + ctx.run_experiment(task=_noop_task, metadata={"sha": "def", "pr": "42"}) + assert ctx.client.run_experiment.call_args.kwargs["metadata"] == { + "sha": "def", + "branch": "main", + "pr": "42", + } + + def test_context_metadata_only(self): + ctx = _make_ctx( + data=[{"input": "a"}], name="n", metadata={"sha": "abc"} + ) + ctx.run_experiment(task=_noop_task) + assert ctx.client.run_experiment.call_args.kwargs["metadata"] == {"sha": "abc"} + + def test_call_metadata_only(self): + ctx = _make_ctx(data=[{"input": "a"}], name="n") + ctx.run_experiment(task=_noop_task, metadata={"pr": "1"}) + assert ctx.client.run_experiment.call_args.kwargs["metadata"] == {"pr": "1"} + + def test_both_none_stays_none(self): + ctx = _make_ctx(data=[{"input": "a"}], name="n") + ctx.run_experiment(task=_noop_task) + assert ctx.client.run_experiment.call_args.kwargs["metadata"] is None + + +class TestRunnerContextLocalItems: + def test_local_items_pass_through_as_context_default(self): + items = [{"input": "x", "expected_output": "y"}] + ctx = _make_ctx(data=items, name="n") + ctx.run_experiment(task=_noop_task) + assert ctx.client.run_experiment.call_args.kwargs["data"] is items + + def test_local_items_pass_through_as_call_override(self): + ctx = _make_ctx(name="n") + items = [{"input": "x"}] + ctx.run_experiment(task=_noop_task, data=items) + assert ctx.client.run_experiment.call_args.kwargs["data"] is items + + +class TestRunnerContextValidation: + def test_missing_name_raises(self): + ctx = _make_ctx(data=[{"input": "a"}]) + with pytest.raises(ValueError, match="name"): + ctx.run_experiment(task=_noop_task) + + def test_missing_data_raises(self): + ctx = _make_ctx(name="n") + with pytest.raises(ValueError, match="data"): + ctx.run_experiment(task=_noop_task) + + +class TestRegressionError: + def test_is_exception(self): + result = MagicMock() + exc = RegressionError(result=result) + assert isinstance(exc, Exception) + assert exc.result is result + + def test_default_message(self): + exc = RegressionError(result=MagicMock()) + assert str(exc) == "Experiment regression detected" + assert exc.metric is None + assert exc.value is None + assert exc.threshold is None + + def test_structured_message(self): + exc = RegressionError( + result=MagicMock(), metric="avg_accuracy", value=0.78, threshold=0.9 + ) + assert exc.metric == "avg_accuracy" + assert exc.value == 0.78 + assert exc.threshold == 0.9 + assert "avg_accuracy" in str(exc) + assert "0.78" in str(exc) + assert "0.9" in str(exc) + + def test_user_message_wins(self): + exc = RegressionError( + result=MagicMock(), + metric="avg_accuracy", + value=0.5, + threshold=0.9, + message="custom explanation", + ) + assert str(exc) == "custom explanation" + + +class TestSignatureDriftGuard: + """Fails loudly if ``Langfuse.run_experiment`` grows a parameter that is + not threaded through ``RunnerContext.run_experiment``. + + The four action-relaxed params (``name``, ``run_name``, ``data``, + ``_dataset_version``) are allowed to diverge: the RunnerContext variant + must be the ``Optional[...]`` of the client annotation so the action can + inject them. + """ + + RELAXED_PARAMS = {"name", "run_name", "data", "_dataset_version"} + + def test_no_divergence(self): + client_params = self._params(Langfuse.run_experiment, skip_self=True) + ctx_params = self._params(RunnerContext.run_experiment, skip_self=True) + + assert set(client_params) == set(ctx_params), ( + "RunnerContext.run_experiment params do not match " + "Langfuse.run_experiment. Missing: " + f"{set(client_params) - set(ctx_params)}. " + f"Extra: {set(ctx_params) - set(client_params)}." + ) + + for name, client_param in client_params.items(): + ctx_param = ctx_params[name] + client_ann = client_param.annotation + ctx_ann = ctx_param.annotation + + if name in self.RELAXED_PARAMS: + # RunnerContext version must be Optional[] + # Already-optional client annotations (run_name, + # _dataset_version) just need to match as-is. + if self._is_optional(client_ann): + assert ctx_ann == client_ann, ( + f"param `{name}`: expected {client_ann}, got {ctx_ann}" + ) + else: + from typing import Optional + + assert ctx_ann == Optional[client_ann], ( + f"param `{name}`: expected Optional[{client_ann}], " + f"got {ctx_ann}" + ) + else: + assert ctx_ann == client_ann, ( + f"param `{name}`: annotation drift — " + f"client={client_ann}, context={ctx_ann}" + ) + + @staticmethod + def _params(func, *, skip_self: bool) -> Dict[str, inspect.Parameter]: + sig = inspect.signature(func) + return { + name: p + for name, p in sig.parameters.items() + if not (skip_self and name == "self") + } + + @staticmethod + def _is_optional(annotation) -> bool: + import typing + + origin = typing.get_origin(annotation) + args = typing.get_args(annotation) + return origin is typing.Union and type(None) in args From 43cfd572a32bfe1feb5d06f619efba287b254ccb Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Wed, 22 Apr 2026 10:28:48 +0200 Subject: [PATCH 2/4] refactor(experiment): move RunnerContext and RegressionError into experiment module Relocates the CI-action primitives from the standalone `langfuse/ci.py` module into `langfuse/experiment.py` alongside the other experiment types. Deletes `langfuse/ci.py` and renames the tests accordingly. The public import paths (`from langfuse import RunnerContext, RegressionError`) are unchanged. `CompositeEvaluatorFunction` is imported under `TYPE_CHECKING` to avoid a circular import with `langfuse.batch_evaluation`. The signature-drift guard now resolves the forward reference via `typing.get_type_hints(..., localns=...)`. Co-Authored-By: Claude Opus 4.7 (1M context) --- langfuse/__init__.py | 3 +- langfuse/ci.py | 166 ------------------ langfuse/experiment.py | 151 ++++++++++++++++ .../{test_ci.py => test_runner_context.py} | 55 +++--- 4 files changed, 183 insertions(+), 192 deletions(-) delete mode 100644 langfuse/ci.py rename tests/unit/{test_ci.py => test_runner_context.py} (81%) diff --git a/langfuse/__init__.py b/langfuse/__init__.py index c5f5b9d30..08d8325cf 100644 --- a/langfuse/__init__.py +++ b/langfuse/__init__.py @@ -8,8 +8,7 @@ EvaluatorStats, MapperFunction, ) -from langfuse.ci import RegressionError, RunnerContext -from langfuse.experiment import Evaluation +from langfuse.experiment import Evaluation, RegressionError, RunnerContext from ._client import client as _client_module from ._client.attributes import LangfuseOtelSpanAttributes diff --git a/langfuse/ci.py b/langfuse/ci.py deleted file mode 100644 index 3d10af156..000000000 --- a/langfuse/ci.py +++ /dev/null @@ -1,166 +0,0 @@ -"""CI/CD helpers for running Langfuse experiments in GitHub Actions. - -Designed to be used in conjunction with the ``langfuse/experiment-action`` -GitHub Action (https://github.com/langfuse/experiment-action). The action -constructs a :class:`RunnerContext` pre-populated with dataset, run name, and -GitHub-sourced metadata, then calls the user's ``experiment(context)`` -function. -""" - -from datetime import datetime -from typing import TYPE_CHECKING, Dict, List, Optional - -from langfuse.batch_evaluation import CompositeEvaluatorFunction -from langfuse.experiment import ( - EvaluatorFunction, - ExperimentData, - ExperimentResult, - RunEvaluatorFunction, - TaskFunction, -) - -if TYPE_CHECKING: - from langfuse._client.client import Langfuse - - -class RunnerContext: - """Wraps :meth:`Langfuse.run_experiment` with CI-injected defaults. - - Intended for use with the ``langfuse/experiment-action`` GitHub Action - (https://github.com/langfuse/experiment-action). The action builds a - ``RunnerContext`` before invoking the user's ``experiment(context)`` - function. Defaults set here (dataset, name, run name, metadata tags) are - applied when the user omits them on the :meth:`run_experiment` call; - users can override any default by passing the corresponding argument - explicitly. - """ - - def __init__( - self, - *, - client: "Langfuse", - data: Optional[ExperimentData] = None, - dataset_version: Optional[datetime] = None, - name: Optional[str] = None, - run_name: Optional[str] = None, - metadata: Optional[Dict[str, str]] = None, - ): - """Build a ``RunnerContext`` populated with defaults for ``run_experiment``. - - Typically called by the ``langfuse/experiment-action`` GitHub Action, - not by end users directly. Every field except ``client`` is optional: - fields left as ``None`` simply mean the corresponding argument must be - supplied on the :meth:`run_experiment` call. - - Args: - client: Initialized Langfuse SDK client used to execute the - experiment. The action creates this from the - ``langfuse_public_key`` / ``langfuse_secret_key`` / - ``langfuse_base_url`` inputs. - data: Default dataset items to run the experiment on. Accepts - either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``. - Injected by the action when ``dataset_name`` is configured. - If ``None``, the user must pass ``data=`` to - :meth:`run_experiment`. - dataset_version: Optional pinned dataset version. Injected by the - action when ``dataset_version`` is configured. - name: Default human-readable experiment name (e.g. the action's - ``experiment_name`` input). If ``None``, the user must pass - ``name=`` to :meth:`run_experiment`. - run_name: Default exact run name. The action typically derives - this from the commit SHA / PR number so that reruns produce - distinct runs in Langfuse. - metadata: Default metadata attached to every experiment trace and - the dataset run. The action injects GitHub-sourced tags (SHA, - PR link, workflow run link, branch, GH user, etc.). Merged - with any ``metadata`` passed to :meth:`run_experiment`, with - user-supplied keys winning on collision. - """ - self.client = client - self.data = data - self.dataset_version = dataset_version - self.name = name - self.run_name = run_name - self.metadata = metadata - - def run_experiment( - self, - *, - name: Optional[str] = None, - run_name: Optional[str] = None, - description: Optional[str] = None, - data: Optional[ExperimentData] = None, - task: TaskFunction, - evaluators: List[EvaluatorFunction] = [], - composite_evaluator: Optional[CompositeEvaluatorFunction] = None, - run_evaluators: List[RunEvaluatorFunction] = [], - max_concurrency: int = 50, - metadata: Optional[Dict[str, str]] = None, - _dataset_version: Optional[datetime] = None, - ) -> ExperimentResult: - resolved_name = name if name is not None else self.name - if resolved_name is None: - raise ValueError( - "`name` must be provided either on the RunnerContext or the run_experiment call" - ) - - resolved_data = data if data is not None else self.data - if resolved_data is None: - raise ValueError( - "`data` must be provided either on the RunnerContext or the run_experiment call" - ) - - resolved_run_name = run_name if run_name is not None else self.run_name - resolved_dataset_version = ( - _dataset_version if _dataset_version is not None else self.dataset_version - ) - - merged_metadata: Optional[Dict[str, str]] - if self.metadata is None and metadata is None: - merged_metadata = None - else: - merged_metadata = {**(self.metadata or {}), **(metadata or {})} - - return self.client.run_experiment( - name=resolved_name, - run_name=resolved_run_name, - description=description, - data=resolved_data, - task=task, - evaluators=evaluators, - composite_evaluator=composite_evaluator, - run_evaluators=run_evaluators, - max_concurrency=max_concurrency, - metadata=merged_metadata, - _dataset_version=resolved_dataset_version, - ) - - -class RegressionError(Exception): - """Raised by a user's ``experiment`` function to signal a CI gate failure. - - The GitHub action catches this exception and, when ``should_fail_on_error`` - is enabled, fails the workflow run and renders a callout in the PR comment - using ``metric``/``value``/``threshold`` if supplied, otherwise ``str(exc)``. - """ - - def __init__( - self, - *, - result: ExperimentResult, - metric: Optional[str] = None, - value: Optional[float] = None, - threshold: Optional[float] = None, - message: Optional[str] = None, - ): - self.result = result - self.metric = metric - self.value = value - self.threshold = threshold - if message is not None: - formatted = message - elif metric is not None: - formatted = f"Regression on `{metric}`: {value} (threshold {threshold})" - else: - formatted = "Experiment regression detected" - super().__init__(formatted) diff --git a/langfuse/experiment.py b/langfuse/experiment.py index 67b50a900..150b1c747 100644 --- a/langfuse/experiment.py +++ b/langfuse/experiment.py @@ -6,7 +6,9 @@ """ import asyncio +from datetime import datetime from typing import ( + TYPE_CHECKING, Any, Awaitable, Dict, @@ -21,6 +23,10 @@ from langfuse.logger import langfuse_logger as logger from langfuse.types import ExperimentScoreType +if TYPE_CHECKING: + from langfuse._client.client import Langfuse + from langfuse.batch_evaluation import CompositeEvaluatorFunction + class LocalExperimentItem(TypedDict, total=False): """Structure for local experiment data items (not from Langfuse datasets). @@ -1049,3 +1055,148 @@ def langfuse_evaluator( ) return langfuse_evaluator + + +class RunnerContext: + """Wraps :meth:`Langfuse.run_experiment` with CI-injected defaults. + + Intended for use with the ``langfuse/experiment-action`` GitHub Action + (https://github.com/langfuse/experiment-action). The action builds a + ``RunnerContext`` before invoking the user's ``experiment(context)`` + function. Defaults set here (dataset, name, run name, metadata tags) are + applied when the user omits them on the :meth:`run_experiment` call; + users can override any default by passing the corresponding argument + explicitly. + """ + + def __init__( + self, + *, + client: "Langfuse", + data: Optional[ExperimentData] = None, + dataset_version: Optional[datetime] = None, + name: Optional[str] = None, + run_name: Optional[str] = None, + metadata: Optional[Dict[str, str]] = None, + ): + """Build a ``RunnerContext`` populated with defaults for ``run_experiment``. + + Typically called by the ``langfuse/experiment-action`` GitHub Action, + not by end users directly. Every field except ``client`` is optional: + fields left as ``None`` simply mean the corresponding argument must be + supplied on the :meth:`run_experiment` call. + + Args: + client: Initialized Langfuse SDK client used to execute the + experiment. The action creates this from the + ``langfuse_public_key`` / ``langfuse_secret_key`` / + ``langfuse_base_url`` inputs. + data: Default dataset items to run the experiment on. Accepts + either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``. + Injected by the action when ``dataset_name`` is configured. + If ``None``, the user must pass ``data=`` to + :meth:`run_experiment`. + dataset_version: Optional pinned dataset version. Injected by the + action when ``dataset_version`` is configured. + name: Default human-readable experiment name (e.g. the action's + ``experiment_name`` input). If ``None``, the user must pass + ``name=`` to :meth:`run_experiment`. + run_name: Default exact run name. The action typically derives + this from the commit SHA / PR number so that reruns produce + distinct runs in Langfuse. + metadata: Default metadata attached to every experiment trace and + the dataset run. The action injects GitHub-sourced tags (SHA, + PR link, workflow run link, branch, GH user, etc.). Merged + with any ``metadata`` passed to :meth:`run_experiment`, with + user-supplied keys winning on collision. + """ + self.client = client + self.data = data + self.dataset_version = dataset_version + self.name = name + self.run_name = run_name + self.metadata = metadata + + def run_experiment( + self, + *, + name: Optional[str] = None, + run_name: Optional[str] = None, + description: Optional[str] = None, + data: Optional[ExperimentData] = None, + task: TaskFunction, + evaluators: List[EvaluatorFunction] = [], + composite_evaluator: Optional["CompositeEvaluatorFunction"] = None, + run_evaluators: List[RunEvaluatorFunction] = [], + max_concurrency: int = 50, + metadata: Optional[Dict[str, str]] = None, + _dataset_version: Optional[datetime] = None, + ) -> ExperimentResult: + resolved_name = name if name is not None else self.name + if resolved_name is None: + raise ValueError( + "`name` must be provided either on the RunnerContext or the run_experiment call" + ) + + resolved_data = data if data is not None else self.data + if resolved_data is None: + raise ValueError( + "`data` must be provided either on the RunnerContext or the run_experiment call" + ) + + resolved_run_name = run_name if run_name is not None else self.run_name + resolved_dataset_version = ( + _dataset_version if _dataset_version is not None else self.dataset_version + ) + + merged_metadata: Optional[Dict[str, str]] + if self.metadata is None and metadata is None: + merged_metadata = None + else: + merged_metadata = {**(self.metadata or {}), **(metadata or {})} + + return self.client.run_experiment( + name=resolved_name, + run_name=resolved_run_name, + description=description, + data=resolved_data, + task=task, + evaluators=evaluators, + composite_evaluator=composite_evaluator, + run_evaluators=run_evaluators, + max_concurrency=max_concurrency, + metadata=merged_metadata, + _dataset_version=resolved_dataset_version, + ) + + +class RegressionError(Exception): + """Raised by a user's ``experiment`` function to signal a CI gate failure. + + Intended for use with the ``langfuse/experiment-action`` GitHub Action + (https://github.com/langfuse/experiment-action). The action catches this + exception and, when ``should_fail_on_error`` is enabled, fails the + workflow run and renders a callout in the PR comment using + ``metric``/``value``/``threshold`` if supplied, otherwise ``str(exc)``. + """ + + def __init__( + self, + *, + result: ExperimentResult, + metric: Optional[str] = None, + value: Optional[float] = None, + threshold: Optional[float] = None, + message: Optional[str] = None, + ): + self.result = result + self.metric = metric + self.value = value + self.threshold = threshold + if message is not None: + formatted = message + elif metric is not None: + formatted = f"Regression on `{metric}`: {value} (threshold {threshold})" + else: + formatted = "Experiment regression detected" + super().__init__(formatted) diff --git a/tests/unit/test_ci.py b/tests/unit/test_runner_context.py similarity index 81% rename from tests/unit/test_ci.py rename to tests/unit/test_runner_context.py index b9b1e986c..e46eebdf8 100644 --- a/tests/unit/test_ci.py +++ b/tests/unit/test_runner_context.py @@ -1,14 +1,16 @@ -"""Tests for ``langfuse.ci`` — ``RunnerContext`` and ``RegressionError``.""" +"""Tests for ``RunnerContext`` and ``RegressionError`` in ``langfuse.experiment``.""" import inspect +import typing from datetime import datetime -from typing import Dict +from typing import get_type_hints from unittest.mock import MagicMock import pytest from langfuse import RegressionError, RunnerContext from langfuse._client.client import Langfuse +from langfuse.batch_evaluation import CompositeEvaluatorFunction def _noop_task(*, item, **kwargs): # pragma: no cover - never invoked via mock @@ -176,34 +178,42 @@ class TestSignatureDriftGuard: RELAXED_PARAMS = {"name", "run_name", "data", "_dataset_version"} + # `CompositeEvaluatorFunction` is only imported under TYPE_CHECKING in + # ``langfuse.experiment`` to break the circular dependency with + # ``langfuse.batch_evaluation``, so its forward-ref must be resolved + # explicitly when inspecting annotations. + LOCALNS = {"CompositeEvaluatorFunction": CompositeEvaluatorFunction} + def test_no_divergence(self): - client_params = self._params(Langfuse.run_experiment, skip_self=True) - ctx_params = self._params(RunnerContext.run_experiment, skip_self=True) + client_param_names = self._param_names(Langfuse.run_experiment) + ctx_param_names = self._param_names(RunnerContext.run_experiment) - assert set(client_params) == set(ctx_params), ( + assert client_param_names == ctx_param_names, ( "RunnerContext.run_experiment params do not match " "Langfuse.run_experiment. Missing: " - f"{set(client_params) - set(ctx_params)}. " - f"Extra: {set(ctx_params) - set(client_params)}." + f"{client_param_names - ctx_param_names}. " + f"Extra: {ctx_param_names - client_param_names}." + ) + + client_hints = get_type_hints(Langfuse.run_experiment) + ctx_hints = get_type_hints( + RunnerContext.run_experiment, localns=self.LOCALNS ) - for name, client_param in client_params.items(): - ctx_param = ctx_params[name] - client_ann = client_param.annotation - ctx_ann = ctx_param.annotation + for name in client_param_names: + client_ann = client_hints.get(name, inspect.Parameter.empty) + ctx_ann = ctx_hints.get(name, inspect.Parameter.empty) if name in self.RELAXED_PARAMS: - # RunnerContext version must be Optional[] - # Already-optional client annotations (run_name, - # _dataset_version) just need to match as-is. + # RunnerContext version must be Optional[]. + # Already-optional client annotations (``run_name``, + # ``_dataset_version``) just need to match as-is. if self._is_optional(client_ann): assert ctx_ann == client_ann, ( f"param `{name}`: expected {client_ann}, got {ctx_ann}" ) else: - from typing import Optional - - assert ctx_ann == Optional[client_ann], ( + assert ctx_ann == typing.Optional[client_ann], ( f"param `{name}`: expected Optional[{client_ann}], " f"got {ctx_ann}" ) @@ -214,18 +224,15 @@ def test_no_divergence(self): ) @staticmethod - def _params(func, *, skip_self: bool) -> Dict[str, inspect.Parameter]: - sig = inspect.signature(func) + def _param_names(func) -> set: return { - name: p - for name, p in sig.parameters.items() - if not (skip_self and name == "self") + name + for name in inspect.signature(func).parameters + if name != "self" } @staticmethod def _is_optional(annotation) -> bool: - import typing - origin = typing.get_origin(annotation) args = typing.get_args(annotation) return origin is typing.Union and type(None) in args From ef1aa71632c47e9faa1d8f832bb8a89f6db1297d Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Wed, 22 Apr 2026 10:33:29 +0200 Subject: [PATCH 3/4] test: rename test_runner_context.py to test_experiment.py Mirrors the module name now that RunnerContext and RegressionError live in `langfuse.experiment`. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/unit/{test_runner_context.py => test_experiment.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename tests/unit/{test_runner_context.py => test_experiment.py} (98%) diff --git a/tests/unit/test_runner_context.py b/tests/unit/test_experiment.py similarity index 98% rename from tests/unit/test_runner_context.py rename to tests/unit/test_experiment.py index e46eebdf8..5a92022f1 100644 --- a/tests/unit/test_runner_context.py +++ b/tests/unit/test_experiment.py @@ -1,4 +1,4 @@ -"""Tests for ``RunnerContext`` and ``RegressionError`` in ``langfuse.experiment``.""" +"""Tests for ``langfuse.experiment`` — ``RunnerContext`` and ``RegressionError``.""" import inspect import typing From 76c5852f009eed97169b1dafcf710015cdd72d7d Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Wed, 22 Apr 2026 11:17:14 +0200 Subject: [PATCH 4/4] feat(experiment): tighten RunnerContext + RegressionError public surface - RunnerContext no longer carries `name` or `run_name` as context-level defaults. `name` is now required on every `run_experiment` call (supports the action's directory-of-experiments mode where each script must name itself). `run_name` passes straight through to `Langfuse.run_experiment`. - RegressionError gains three typed `@overload` signatures (minimal, free-form message, structured metric/value/threshold) so type checkers enforce that `metric` and `value` are supplied together. At runtime, partial structured input falls back to the default message instead of rendering misleading `None` placeholders in PR comments. Co-Authored-By: Claude Opus 4.7 (1M context) --- langfuse/experiment.py | 55 ++++++++++++----------- tests/unit/test_experiment.py | 82 ++++++++++++++++++++--------------- 2 files changed, 76 insertions(+), 61 deletions(-) diff --git a/langfuse/experiment.py b/langfuse/experiment.py index 150b1c747..404c96e1d 100644 --- a/langfuse/experiment.py +++ b/langfuse/experiment.py @@ -17,6 +17,7 @@ Protocol, TypedDict, Union, + overload, ) from langfuse.api import DatasetItem @@ -1063,10 +1064,9 @@ class RunnerContext: Intended for use with the ``langfuse/experiment-action`` GitHub Action (https://github.com/langfuse/experiment-action). The action builds a ``RunnerContext`` before invoking the user's ``experiment(context)`` - function. Defaults set here (dataset, name, run name, metadata tags) are - applied when the user omits them on the :meth:`run_experiment` call; - users can override any default by passing the corresponding argument - explicitly. + function. Defaults set here (dataset, metadata tags) are applied when + the user omits them on the :meth:`run_experiment` call; users can + override any default by passing the corresponding argument explicitly. """ def __init__( @@ -1075,8 +1075,6 @@ def __init__( client: "Langfuse", data: Optional[ExperimentData] = None, dataset_version: Optional[datetime] = None, - name: Optional[str] = None, - run_name: Optional[str] = None, metadata: Optional[Dict[str, str]] = None, ): """Build a ``RunnerContext`` populated with defaults for ``run_experiment``. @@ -1098,12 +1096,6 @@ def __init__( :meth:`run_experiment`. dataset_version: Optional pinned dataset version. Injected by the action when ``dataset_version`` is configured. - name: Default human-readable experiment name (e.g. the action's - ``experiment_name`` input). If ``None``, the user must pass - ``name=`` to :meth:`run_experiment`. - run_name: Default exact run name. The action typically derives - this from the commit SHA / PR number so that reruns produce - distinct runs in Langfuse. metadata: Default metadata attached to every experiment trace and the dataset run. The action injects GitHub-sourced tags (SHA, PR link, workflow run link, branch, GH user, etc.). Merged @@ -1113,14 +1105,12 @@ def __init__( self.client = client self.data = data self.dataset_version = dataset_version - self.name = name - self.run_name = run_name self.metadata = metadata def run_experiment( self, *, - name: Optional[str] = None, + name: str, run_name: Optional[str] = None, description: Optional[str] = None, data: Optional[ExperimentData] = None, @@ -1132,19 +1122,12 @@ def run_experiment( metadata: Optional[Dict[str, str]] = None, _dataset_version: Optional[datetime] = None, ) -> ExperimentResult: - resolved_name = name if name is not None else self.name - if resolved_name is None: - raise ValueError( - "`name` must be provided either on the RunnerContext or the run_experiment call" - ) - resolved_data = data if data is not None else self.data if resolved_data is None: raise ValueError( "`data` must be provided either on the RunnerContext or the run_experiment call" ) - resolved_run_name = run_name if run_name is not None else self.run_name resolved_dataset_version = ( _dataset_version if _dataset_version is not None else self.dataset_version ) @@ -1156,8 +1139,8 @@ def run_experiment( merged_metadata = {**(self.metadata or {}), **(metadata or {})} return self.client.run_experiment( - name=resolved_name, - run_name=resolved_run_name, + name=name, + run_name=run_name, description=description, data=resolved_data, task=task, @@ -1178,8 +1161,30 @@ class RegressionError(Exception): exception and, when ``should_fail_on_error`` is enabled, fails the workflow run and renders a callout in the PR comment using ``metric``/``value``/``threshold`` if supplied, otherwise ``str(exc)``. + + Callers choose one of three forms: + + - ``RegressionError(result=r)`` — minimal, generic message. + - ``RegressionError(result=r, message="...")`` — free-form message. + - ``RegressionError(result=r, metric="acc", value=0.7, threshold=0.9)`` — + structured; ``metric`` and ``value`` must be provided together so the + action can render a targeted callout without ``None`` placeholders. """ + @overload + def __init__(self, *, result: ExperimentResult) -> None: ... + @overload + def __init__(self, *, result: ExperimentResult, message: str) -> None: ... + @overload + def __init__( + self, + *, + result: ExperimentResult, + metric: str, + value: float, + threshold: Optional[float] = None, + message: Optional[str] = None, + ) -> None: ... def __init__( self, *, @@ -1195,7 +1200,7 @@ def __init__( self.threshold = threshold if message is not None: formatted = message - elif metric is not None: + elif metric is not None and value is not None: formatted = f"Regression on `{metric}`: {value} (threshold {threshold})" else: formatted = "Experiment regression detected" diff --git a/tests/unit/test_experiment.py b/tests/unit/test_experiment.py index 5a92022f1..c6c8465a3 100644 --- a/tests/unit/test_experiment.py +++ b/tests/unit/test_experiment.py @@ -30,18 +30,15 @@ def test_context_defaults_flow_through(self): ctx = _make_ctx( data=ctx_data, dataset_version=ctx_version, - name="ctx-name", - run_name="ctx-run", metadata={"sha": "abc123"}, ) - result = ctx.run_experiment(task=_noop_task) + result = ctx.run_experiment(name="exp", task=_noop_task) assert result == "result-sentinel" ctx.client.run_experiment.assert_called_once() kwargs = ctx.client.run_experiment.call_args.kwargs - assert kwargs["name"] == "ctx-name" - assert kwargs["run_name"] == "ctx-run" + assert kwargs["name"] == "exp" assert kwargs["data"] is ctx_data assert kwargs["metadata"] == {"sha": "abc123"} assert kwargs["_dataset_version"] == ctx_version @@ -51,22 +48,20 @@ def test_call_overrides_win(self): ctx = _make_ctx( data=[{"input": "ctx"}], dataset_version=datetime(2026, 1, 1), - name="ctx-name", - run_name="ctx-run", ) override_data = [{"input": "override"}] override_version = datetime(2026, 6, 6) ctx.run_experiment( + name="exp", task=_noop_task, - name="call-name", run_name="call-run", data=override_data, _dataset_version=override_version, ) kwargs = ctx.client.run_experiment.call_args.kwargs - assert kwargs["name"] == "call-name" + assert kwargs["name"] == "exp" assert kwargs["run_name"] == "call-run" assert kwargs["data"] is override_data assert kwargs["_dataset_version"] == override_version @@ -76,10 +71,11 @@ class TestRunnerContextMetadataMerge: def test_user_keys_win_on_collision(self): ctx = _make_ctx( data=[{"input": "a"}], - name="n", metadata={"sha": "abc", "branch": "main"}, ) - ctx.run_experiment(task=_noop_task, metadata={"sha": "def", "pr": "42"}) + ctx.run_experiment( + name="exp", task=_noop_task, metadata={"sha": "def", "pr": "42"} + ) assert ctx.client.run_experiment.call_args.kwargs["metadata"] == { "sha": "def", "branch": "main", @@ -87,47 +83,40 @@ def test_user_keys_win_on_collision(self): } def test_context_metadata_only(self): - ctx = _make_ctx( - data=[{"input": "a"}], name="n", metadata={"sha": "abc"} - ) - ctx.run_experiment(task=_noop_task) + ctx = _make_ctx(data=[{"input": "a"}], metadata={"sha": "abc"}) + ctx.run_experiment(name="exp", task=_noop_task) assert ctx.client.run_experiment.call_args.kwargs["metadata"] == {"sha": "abc"} def test_call_metadata_only(self): - ctx = _make_ctx(data=[{"input": "a"}], name="n") - ctx.run_experiment(task=_noop_task, metadata={"pr": "1"}) + ctx = _make_ctx(data=[{"input": "a"}]) + ctx.run_experiment(name="exp", task=_noop_task, metadata={"pr": "1"}) assert ctx.client.run_experiment.call_args.kwargs["metadata"] == {"pr": "1"} def test_both_none_stays_none(self): - ctx = _make_ctx(data=[{"input": "a"}], name="n") - ctx.run_experiment(task=_noop_task) + ctx = _make_ctx(data=[{"input": "a"}]) + ctx.run_experiment(name="exp", task=_noop_task) assert ctx.client.run_experiment.call_args.kwargs["metadata"] is None class TestRunnerContextLocalItems: def test_local_items_pass_through_as_context_default(self): items = [{"input": "x", "expected_output": "y"}] - ctx = _make_ctx(data=items, name="n") - ctx.run_experiment(task=_noop_task) + ctx = _make_ctx(data=items) + ctx.run_experiment(name="exp", task=_noop_task) assert ctx.client.run_experiment.call_args.kwargs["data"] is items def test_local_items_pass_through_as_call_override(self): - ctx = _make_ctx(name="n") + ctx = _make_ctx() items = [{"input": "x"}] - ctx.run_experiment(task=_noop_task, data=items) + ctx.run_experiment(name="exp", task=_noop_task, data=items) assert ctx.client.run_experiment.call_args.kwargs["data"] is items class TestRunnerContextValidation: - def test_missing_name_raises(self): - ctx = _make_ctx(data=[{"input": "a"}]) - with pytest.raises(ValueError, match="name"): - ctx.run_experiment(task=_noop_task) - def test_missing_data_raises(self): - ctx = _make_ctx(name="n") + ctx = _make_ctx() with pytest.raises(ValueError, match="data"): - ctx.run_experiment(task=_noop_task) + ctx.run_experiment(name="exp", task=_noop_task) class TestRegressionError: @@ -155,7 +144,14 @@ def test_structured_message(self): assert "0.78" in str(exc) assert "0.9" in str(exc) - def test_user_message_wins(self): + def test_free_form_message(self): + exc = RegressionError( + result=MagicMock(), + message="custom explanation", + ) + assert str(exc) == "custom explanation" + + def test_message_wins_over_structured(self): exc = RegressionError( result=MagicMock(), metric="avg_accuracy", @@ -164,19 +160,33 @@ def test_user_message_wins(self): message="custom explanation", ) assert str(exc) == "custom explanation" + assert exc.metric == "avg_accuracy" + assert exc.value == 0.5 + assert exc.threshold == 0.9 + + def test_partial_structured_falls_back_to_default(self): + """The structured overload requires ``metric`` and ``value`` together. + + If a caller bypasses the type checker and passes only one, we fall + back to the default message rather than rendering misleading + ``None`` placeholders in the PR comment. + """ + exc = RegressionError(result=MagicMock(), metric="avg_accuracy") # type: ignore[call-overload] + assert str(exc) == "Experiment regression detected" class TestSignatureDriftGuard: """Fails loudly if ``Langfuse.run_experiment`` grows a parameter that is not threaded through ``RunnerContext.run_experiment``. - The four action-relaxed params (``name``, ``run_name``, ``data``, - ``_dataset_version``) are allowed to diverge: the RunnerContext variant - must be the ``Optional[...]`` of the client annotation so the action can - inject them. + ``data`` is the only genuinely relaxed parameter: it is required on the + client but optional on the RunnerContext so the action can inject it. + ``run_name`` and ``_dataset_version`` are already ``Optional`` on the + client and must match as-is. ``name`` is required on both — the action + supports a directory of experiments, so each script must name itself. """ - RELAXED_PARAMS = {"name", "run_name", "data", "_dataset_version"} + RELAXED_PARAMS = {"data"} # `CompositeEvaluatorFunction` is only imported under TYPE_CHECKING in # ``langfuse.experiment`` to break the circular dependency with