Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/code_checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,9 @@ jobs:
# Skipping one nbconvert vulnerability that has no fix version
# Skipping one orjson vulnerability that has no fix version
# Skipping one protobuf vulnerability that has no fix version
# Skipping one pygments vulnerability that has no fix version
ignore-vulns: |
GHSA-xm59-rqc7-hhvf
GHSA-hx9q-6w63-j58v
GHSA-7gcm-g887-7qv7
CVE-2026-4539
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
... )
"""

import logging
from pathlib import Path
from typing import Any

Expand All @@ -34,6 +35,10 @@
from pydantic import BaseModel, Field


logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
logger = logging.getLogger(__name__)


DEFAULT_SYSTEM_PROMPT_TEMPLATE = """\
You are an impartial and expert evaluator. Your task is to grade the quality of a Candidate Output based on a provided Input.

Expand Down Expand Up @@ -232,6 +237,7 @@ async def _evaluator(

return _to_evaluations(judge_response)
except Exception as exc:
logger.exception("Error in LLM judge evaluator")
return [build_error_evaluation(name=resolved_error_metric_name, error=exc, prefix="LLM judge error")]

_evaluator.__name__ = name
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
treated as hallucination.
"""

import logging
from pathlib import Path
from typing import Any, Literal

Expand All @@ -26,6 +27,10 @@
from pydantic import BaseModel, Field


logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
logger = logging.getLogger(__name__)


DEFAULT_GROUNDEDNESS_SYSTEM_PROMPT = """\
You are a Fact-Checking Judge. Your ONLY function is to verify if the Candidate Output is factually supported by the provided Context.

Expand Down Expand Up @@ -212,6 +217,7 @@ async def _evaluator(
)
except Exception as exc:
# Deterministic error scores keep rows analyzable without dropping traces.
logger.exception("Trace groundedness error")
return build_error_evaluation(name=resolved_error_metric_name, error=exc, prefix="Trace groundedness error")

_evaluator.__name__ = name
Expand Down
2 changes: 2 additions & 0 deletions aieng-eval-agents/aieng/agent_evals/evaluation/trace.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,7 @@ async def _evaluate_trace(
try:
trace, ready = await fetch_trace_with_wait(langfuse_client, trace_id, wait)
except Exception as exc:
logger.exception("Trace fetch failed.")
return [], TraceEvalStatus.FAILED, f"Trace fetch failed: {exc}"

if trace is None or not ready:
Expand All @@ -249,6 +250,7 @@ async def _evaluate_trace(
evaluations.extend(await _normalize_evaluations(raw_result))
except Exception as exc:
evaluator_name = _get_evaluator_name(evaluator)
logger.exception(f"Trace evaluator '{evaluator_name}' failed.")
return [], TraceEvalStatus.FAILED, f"Trace evaluator '{evaluator_name}' failed: {exc}"

# Persist scores so they appear alongside traces in the Langfuse UI.
Expand Down
2 changes: 1 addition & 1 deletion aieng-eval-agents/aieng/agent_evals/knowledge_qa/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -619,7 +619,7 @@ async def _run_with_retry() -> dict[str, Any]:
try:
return await self._run_agent_once_inner(question, new_session_id)
except Exception as retry_error:
logger.error(f"Retry with fresh session failed: {retry_error}")
logger.exception(f"Retry with fresh session failed: {retry_error}")
raise RuntimeError(
f"Context overflow error. Original error: {e}. "
f"Retry with fresh session also failed: {retry_error}"
Expand Down
14 changes: 3 additions & 11 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading