diff --git a/.github/workflows/code_checks.yml b/.github/workflows/code_checks.yml index 2dc847e6..563b9f76 100644 --- a/.github/workflows/code_checks.yml +++ b/.github/workflows/code_checks.yml @@ -57,7 +57,9 @@ jobs: # Skipping one nbconvert vulnerability that has no fix version # Skipping one orjson vulnerability that has no fix version # Skipping one protobuf vulnerability that has no fix version + # Skipping one pygments vulnerability that has no fix version ignore-vulns: | GHSA-xm59-rqc7-hhvf GHSA-hx9q-6w63-j58v GHSA-7gcm-g887-7qv7 + CVE-2026-4539 diff --git a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/llm_judge.py b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/llm_judge.py index bf4a63ee..2823f583 100644 --- a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/llm_judge.py +++ b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/llm_judge.py @@ -18,6 +18,7 @@ ... ) """ +import logging from pathlib import Path from typing import Any @@ -34,6 +35,10 @@ from pydantic import BaseModel, Field +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s") +logger = logging.getLogger(__name__) + + DEFAULT_SYSTEM_PROMPT_TEMPLATE = """\ You are an impartial and expert evaluator. Your task is to grade the quality of a Candidate Output based on a provided Input. @@ -232,6 +237,7 @@ async def _evaluator( return _to_evaluations(judge_response) except Exception as exc: + logger.exception("Error in LLM judge evaluator") return [build_error_evaluation(name=resolved_error_metric_name, error=exc, prefix="LLM judge error")] _evaluator.__name__ = name diff --git a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/trace_groundedness.py b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/trace_groundedness.py index f5104fa1..33a51f6f 100644 --- a/aieng-eval-agents/aieng/agent_evals/evaluation/graders/trace_groundedness.py +++ b/aieng-eval-agents/aieng/agent_evals/evaluation/graders/trace_groundedness.py @@ -5,6 +5,7 @@ treated as hallucination. """ +import logging from pathlib import Path from typing import Any, Literal @@ -26,6 +27,10 @@ from pydantic import BaseModel, Field +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s") +logger = logging.getLogger(__name__) + + DEFAULT_GROUNDEDNESS_SYSTEM_PROMPT = """\ You are a Fact-Checking Judge. Your ONLY function is to verify if the Candidate Output is factually supported by the provided Context. @@ -212,6 +217,7 @@ async def _evaluator( ) except Exception as exc: # Deterministic error scores keep rows analyzable without dropping traces. + logger.exception("Trace groundedness error") return build_error_evaluation(name=resolved_error_metric_name, error=exc, prefix="Trace groundedness error") _evaluator.__name__ = name diff --git a/aieng-eval-agents/aieng/agent_evals/evaluation/trace.py b/aieng-eval-agents/aieng/agent_evals/evaluation/trace.py index 5a749f1b..25c4baf8 100644 --- a/aieng-eval-agents/aieng/agent_evals/evaluation/trace.py +++ b/aieng-eval-agents/aieng/agent_evals/evaluation/trace.py @@ -237,6 +237,7 @@ async def _evaluate_trace( try: trace, ready = await fetch_trace_with_wait(langfuse_client, trace_id, wait) except Exception as exc: + logger.exception("Trace fetch failed.") return [], TraceEvalStatus.FAILED, f"Trace fetch failed: {exc}" if trace is None or not ready: @@ -249,6 +250,7 @@ async def _evaluate_trace( evaluations.extend(await _normalize_evaluations(raw_result)) except Exception as exc: evaluator_name = _get_evaluator_name(evaluator) + logger.exception(f"Trace evaluator '{evaluator_name}' failed.") return [], TraceEvalStatus.FAILED, f"Trace evaluator '{evaluator_name}' failed: {exc}" # Persist scores so they appear alongside traces in the Langfuse UI. diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/agent.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/agent.py index 06b5c8e7..2c1fdc83 100644 --- a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/agent.py +++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/agent.py @@ -619,7 +619,7 @@ async def _run_with_retry() -> dict[str, Any]: try: return await self._run_agent_once_inner(question, new_session_id) except Exception as retry_error: - logger.error(f"Retry with fresh session failed: {retry_error}") + logger.exception(f"Retry with fresh session failed: {retry_error}") raise RuntimeError( f"Context overflow error. Original error: {e}. " f"Retry with fresh session also failed: {retry_error}" diff --git a/uv.lock b/uv.lock index feed0e8a..7b148f77 100644 --- a/uv.lock +++ b/uv.lock @@ -2705,10 +2705,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/47/66/eea81dfff765ed66c68fd2ed8c96245109e13c896c2a5015c7839c92367e/jiter-0.13.0-cp314-cp314t-win32.whl", hash = "sha256:24dc96eca9f84da4131cdf87a95e6ce36765c3b156fc9ae33280873b1c32d5f6", size = 201196 }, { url = "https://files.pythonhosted.org/packages/ff/32/4ac9c7a76402f8f00d00842a7f6b83b284d0cf7c1e9d4227bc95aa6d17fa/jiter-0.13.0-cp314-cp314t-win_amd64.whl", hash = "sha256:0a8d76c7524087272c8ae913f5d9d608bd839154b62c4322ef65723d2e5bb0b8", size = 204215 }, { url = "https://files.pythonhosted.org/packages/f9/8e/7def204fea9f9be8b3c21a6f2dd6c020cf56c7d5ff753e0e23ed7f9ea57e/jiter-0.13.0-cp314-cp314t-win_arm64.whl", hash = "sha256:2c26cf47e2cad140fa23b6d58d435a7c0161f5c514284802f25e87fddfe11024", size = 187152 }, - { url = "https://files.pythonhosted.org/packages/80/60/e50fa45dd7e2eae049f0ce964663849e897300433921198aef94b6ffa23a/jiter-0.13.0-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:3d744a6061afba08dd7ae375dcde870cffb14429b7477e10f67e9e6d68772a0a", size = 305169 }, - { url = "https://files.pythonhosted.org/packages/d2/73/a009f41c5eed71c49bec53036c4b33555afcdee70682a18c6f66e396c039/jiter-0.13.0-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:ff732bd0a0e778f43d5009840f20b935e79087b4dc65bd36f1cd0f9b04b8ff7f", size = 303808 }, - { url = "https://files.pythonhosted.org/packages/c4/10/528b439290763bff3d939268085d03382471b442f212dca4ff5f12802d43/jiter-0.13.0-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ab44b178f7981fcaea7e0a5df20e773c663d06ffda0198f1a524e91b2fde7e59", size = 337384 }, - { url = "https://files.pythonhosted.org/packages/67/8a/a342b2f0251f3dac4ca17618265d93bf244a2a4d089126e81e4c1056ac50/jiter-0.13.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7bb00b6d26db67a05fe3e12c76edc75f32077fb51deed13822dc648fa373bc19", size = 343768 }, ] [[package]] @@ -4933,10 +4929,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5c/96/5fb7d8c3c17bc8c62fdb031c47d77a1af698f1d7a406b0f79aaa1338f9ad/pydantic_core-2.41.5-cp314-cp314t-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa", size = 1988906 }, { url = "https://files.pythonhosted.org/packages/22/ed/182129d83032702912c2e2d8bbe33c036f342cc735737064668585dac28f/pydantic_core-2.41.5-cp314-cp314t-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c", size = 1981607 }, { url = "https://files.pythonhosted.org/packages/9f/ed/068e41660b832bb0b1aa5b58011dea2a3fe0ba7861ff38c4d4904c1c1a99/pydantic_core-2.41.5-cp314-cp314t-win_arm64.whl", hash = "sha256:35b44f37a3199f771c3eaa53051bc8a70cd7b54f333531c59e29fd4db5d15008", size = 1974769 }, - { url = "https://files.pythonhosted.org/packages/09/32/59b0c7e63e277fa7911c2fc70ccfb45ce4b98991e7ef37110663437005af/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:7da7087d756b19037bc2c06edc6c170eeef3c3bafcb8f532ff17d64dc427adfd", size = 2110495 }, - { url = "https://files.pythonhosted.org/packages/aa/81/05e400037eaf55ad400bcd318c05bb345b57e708887f07ddb2d20e3f0e98/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:aabf5777b5c8ca26f7824cb4a120a740c9588ed58df9b2d196ce92fba42ff8dc", size = 1915388 }, - { url = "https://files.pythonhosted.org/packages/6e/0d/e3549b2399f71d56476b77dbf3cf8937cec5cd70536bdc0e374a421d0599/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c007fe8a43d43b3969e8469004e9845944f1a80e6acd47c150856bb87f230c56", size = 1942879 }, - { url = "https://files.pythonhosted.org/packages/f7/07/34573da085946b6a313d7c42f82f16e8920bfd730665de2d11c0c37a74b5/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76d0819de158cd855d1cbb8fcafdf6f5cf1eb8e470abe056d5d161106e38062b", size = 2139017 }, ] [[package]] @@ -5368,7 +5360,7 @@ wheels = [ [[package]] name = "requests" -version = "2.32.5" +version = "2.33.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "certifi" }, @@ -5376,9 +5368,9 @@ dependencies = [ { name = "idna" }, { name = "urllib3" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/c9/74/b3ff8e6c8446842c3f5c837e9c3dfcfe2018ea6ecef224c710c85ef728f4/requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf", size = 134517 } +sdist = { url = "https://files.pythonhosted.org/packages/34/64/8860370b167a9721e8956ae116825caff829224fbca0ca6e7bf8ddef8430/requests-2.33.0.tar.gz", hash = "sha256:c7ebc5e8b0f21837386ad0e1c8fe8b829fa5f544d8df3b2253bff14ef29d7652", size = 134232 } wheels = [ - { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738 }, + { url = "https://files.pythonhosted.org/packages/56/5d/c814546c2333ceea4ba42262d8c4d55763003e767fa169adc693bd524478/requests-2.33.0-py3-none-any.whl", hash = "sha256:3324635456fa185245e24865e810cecec7b4caf933d7eb133dcde67d48cee69b", size = 65017 }, ] [[package]]