diff --git a/nix/home/skills/info_gathering/evals/apartments/BUILD.bazel b/nix/home/skills/info_gathering/evals/apartments/BUILD.bazel
new file mode 100644
index 0000000000..fc0dff424d
--- /dev/null
+++ b/nix/home/skills/info_gathering/evals/apartments/BUILD.bazel
@@ -0,0 +1 @@
+# Disabled — apartments.py renamed to .py.ignore
diff --git a/nix/home/skills/info_gathering/evals/apartments/apartments.py.ignore b/nix/home/skills/info_gathering/evals/apartments/apartments.py.ignore
new file mode 100644
index 0000000000..df46df2d84
--- /dev/null
+++ b/nix/home/skills/info_gathering/evals/apartments/apartments.py.ignore
@@ -0,0 +1,75 @@
+"""Apartment search preference recovery eval.
+
+Usage:
+    bazel run //nix/home/skills/info_gathering/evals/apartments -- --api-key KEY
+"""
+
+import argparse
+import logging
+
+from nix.home.skills.info_gathering.evals.harness import (
+    END_GAME_TOOL,
+    add_common_args,
+    build_agent_system,
+    load_skill,
+    make_client,
+    output_dir_from_args,
+    run_conversation_eval,
+    thinking_from_args,
+)
+from util.bazel.runfiles import get_required_path
+
+logger = logging.getLogger(__name__)
+
+NAME = "apartments"
+TURN_LIMIT = 12
+
+_FIRST_MESSAGE_RLOCATION = "_main/nix/home/skills/info_gathering/evals/apartments/first_message.txt"
+_SIM_RLOCATION = "_main/nix/home/skills/info_gathering/evals/apartments/sim.txt"
+
+AGENT_EXTRA_SYSTEM = (
+    "Help the user choose an apartment.\n"
+    "- Their preferences are UNKNOWN — you must elicit them\n"
+    "- Final answer: 'My ranking: [best] > [next] > ...'"
+)
+
+
+def main() -> None:
+    """Run the apartment preference-recovery eval from the command line and log its summary."""
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+    parser = argparse.ArgumentParser(description="Apartment search eval")
+    add_common_args(parser)
+    args = parser.parse_args()
+
+    agent_system = build_agent_system(load_skill(), AGENT_EXTRA_SYSTEM)
+    client = make_client()
+    thinking = thinking_from_args(args)
+    output_dir = output_dir_from_args(args)
+
+    # The opening user turn and the simulator prompt are text files located via Bazel runfiles.
+    first_user_message = get_required_path(_FIRST_MESSAGE_RLOCATION).read_text()
+    sim_system = get_required_path(_SIM_RLOCATION).read_text()
+
+    banner = "=" * 60
+    logger.info(banner)
+    logger.info(" %s | %s | thinking=%s", NAME, args.model, thinking or "off")
+    logger.info(banner)
+
+    summary = run_conversation_eval(
+        name=NAME,
+        client=client,
+        model=args.model,
+        agent_system=agent_system,
+        first_user_message=first_user_message,
+        sim_system=sim_system,
+        sim_tools=[END_GAME_TOOL],
+        turn_limit=TURN_LIMIT,
+        thinking_budget=thinking,
+        output_dir=output_dir,
+    )
+    logger.info("%s", summary)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/nix/home/skills/info_gathering/evals/apartments/first_message.txt b/nix/home/skills/info_gathering/evals/apartments/first_message.txt
new file mode 100644
index 0000000000..552b521a94
--- /dev/null
+++ b/nix/home/skills/info_gathering/evals/apartments/first_message.txt
@@ -0,0 +1,15 @@
+I'm looking for an apartment in San Francisco. Here are 6 options:
+
+A: Victorian in the Haight — $2400, 550sqft studio, 25min bus FiDi, hardwood, bay windows, built 1905, quirky tilted floors, vintage fixtures
+
+B: Modern high-rise SoMa — $3200, 700sqft 1BR, 10min walk FiDi, in-unit laundry, gym, rooftop, built 2019, gray-on-white, floor-to-ceiling windows
+
+C: Spacious flat Outer Sunset — $2100, 900sqft 2BR, 45min Muni FiDi, backyard, garage, near beach, built 1950, needs cosmetic work
+
+D: Renovated Edwardian NoPa — $2800, 650sqft 1BR, 20min bus FiDi, updated kitchen, W/D, walk score 95, built 1910, crown moldings
+
+E: Mission studio — $2600, 500sqft, 15min BART FiDi, great food scene, noisy, built 1960, generic finishes, no parking
+
+F: Richmond 1BR — $2300, 750sqft, 35min bus FiDi, quiet, near GG Park, dim light, built 1940, original kitchen, reliable landlord
+
+Help me figure out which is best for me.
diff --git a/nix/home/skills/info_gathering/evals/apartments/sim.txt b/nix/home/skills/info_gathering/evals/apartments/sim.txt
new file mode 100644
index 0000000000..cf6b08f683
--- /dev/null
+++ b/nix/home/skills/info_gathering/evals/apartments/sim.txt
@@ -0,0 +1,20 @@
+You are a 30yo designer, hybrid 2 days/week in FiDi.
+
+WEIGHTS (don't state directly):
+- Character/charm 25%: LOVE old buildings (bay windows, moldings, vintage).
+  HATE modern/generic/sterile.
+- Neighborhood 25%: Walkability, food, culture
+- Space/dollar 20%
+- Commute 15% (only 2 days)
+- Practical 10%
+- Budget 5% (max $3200)
+
+RANKING: D > A > F > C > E > B
+B is LAST — actively dislike (sterile, soulless).
+
+STYLE: 'I want a place with soul.' 'I hate cookie-cutter apartments.'
+Pairwise → pick higher-ranked. Don't recite weights.
+
+When agent gives final ranking, call end_game:
+score = (6-pos_of_D) + (pos_of_B-1). Perfect=10.
+'correct' if D=#1 and B=#6. 'partial' if D top 2 OR B bottom 2.
diff --git a/nix/home/skills/info_gathering/evals/medical/BUILD.bazel b/nix/home/skills/info_gathering/evals/medical/BUILD.bazel
new file mode 100644
index 0000000000..4707ec878e
--- /dev/null
+++ b/nix/home/skills/info_gathering/evals/medical/BUILD.bazel
@@ -0,0 +1 @@
+# Disabled — medical.py renamed to .py.ignore
diff --git a/nix/home/skills/info_gathering/evals/medical/medical.py.ignore b/nix/home/skills/info_gathering/evals/medical/medical.py.ignore
new file mode 100644
index 0000000000..d58173322e
--- /dev/null
+++ b/nix/home/skills/info_gathering/evals/medical/medical.py.ignore
@@ -0,0 +1,101 @@
+"""Medical diagnosis eval variants.
+
+Usage:
+    bazel run //nix/home/skills/info_gathering/evals/medical -- --api-key KEY --variant iih
+    bazel run //nix/home/skills/info_gathering/evals/medical -- --api-key KEY --variant gerd
+"""
+
+import argparse
+import logging
+from dataclasses import dataclass
+
+from nix.home.skills.info_gathering.evals.harness import (
+    END_GAME_TOOL,
+    add_common_args,
+    build_agent_system,
+    load_skill,
+    make_client,
+    output_dir_from_args,
+    run_conversation_eval,
+    thinking_from_args,
+)
+from util.bazel.runfiles import get_required_path
+
+logger = logging.getLogger(__name__)
+
+_SIM_RLOCATIONS: dict[str, str] = {
+    "iih": "_main/nix/home/skills/info_gathering/evals/medical/sim_iih.txt",
+    "gerd": "_main/nix/home/skills/info_gathering/evals/medical/sim_gerd.txt",
+}
+
+AGENT_EXTRA_SYSTEM = (
+    "You are a diagnostician.\n"
+    "- History questions are FREE — ask many before ordering tests\n"
+    "- You may order multiple tests per turn; note which are parallelizable\n"
+    "- State diagnosis as: 'My diagnosis is: [DIAGNOSIS]'"
+)
+
+
+# Per-variant settings: the patient's opening message and the conversation turn limit.
+@dataclass
+class Variant:
+    presenting_complaint: str
+    turn_limit: int = 15
+
+
+VARIANTS: dict[str, Variant] = {
+    "iih": Variant(
+        presenting_complaint=(
+            "I've been getting really bad headaches for the past few weeks, and my vision has been getting blurry."
+        )
+    ),
+    "gerd": Variant(
+        presenting_complaint=(
+            "I've been having chest pain and I can't catch my breath. I'm really scared it might be my heart."
+        )
+    ),
+}
+
+
+def main() -> None:
+    """Run the medical diagnosis eval for the variant named by --variant and log its summary."""
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+    parser = argparse.ArgumentParser(description="Medical diagnosis eval")
+    add_common_args(parser)
+    parser.add_argument("--variant", choices=list(VARIANTS), required=True)
+    args = parser.parse_args()
+
+    variant = VARIANTS[args.variant]
+    name = f"medical_{args.variant}"
+
+    agent_system = build_agent_system(load_skill(), AGENT_EXTRA_SYSTEM)
+    client = make_client()
+    thinking = thinking_from_args(args)
+    output_dir = output_dir_from_args(args)
+
+    # Each variant has its own simulator prompt file, looked up by the same key as VARIANTS.
+    sim_system = get_required_path(_SIM_RLOCATIONS[args.variant]).read_text()
+
+    banner = "=" * 60
+    logger.info(banner)
+    logger.info(" %s | %s | thinking=%s", name, args.model, thinking or "off")
+    logger.info(banner)
+
+    summary = run_conversation_eval(
+        name=name,
+        client=client,
+        model=args.model,
+        agent_system=agent_system,
+        first_user_message=variant.presenting_complaint,
+        sim_system=sim_system,
+        sim_tools=[END_GAME_TOOL],
+        turn_limit=variant.turn_limit,
+        thinking_budget=thinking,
+        output_dir=output_dir,
+    )
+    logger.info("%s", summary)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/nix/home/skills/info_gathering/evals/medical/sim_gerd.txt b/nix/home/skills/info_gathering/evals/medical/sim_gerd.txt
new file mode 100644
index 0000000000..73c47bbd41
--- /dev/null
+++ b/nix/home/skills/info_gathering/evals/medical/sim_gerd.txt
@@ -0,0 +1,34 @@
+You are a PATIENT. Respond in character, natural language, no jargon.
+You do NOT know your diagnosis.
+
+SECRET: GERD + Panic disorder (comorbid)
+PATIENT: 55M, history of anxiety
+
+SYMPTOMS (give when asked):
+- Pain: 'Center of chest, sometimes up to throat, burning/sharp'
+- Duration: 'On and off 2 months, episodes 30-60 min'
+- Triggers: 'After big meals, also when stressed'
+- Worse lying down: 'Much worse at night'
+- Antacids: 'Tums helped a bit'
+- Exercise: 'Fine between episodes'
+- Exertion: 'No, more at rest or after eating'
+- Smoking: No.
Family: 'Dad had heart attack at 70' +- Anxiety: 'Had panic attacks years ago, these feel similar but scarier' +- SOB: 'During episodes, can't get a full breath' +- Palpitations: 'Heart races during episodes'. Sweating: 'Yes' + +TESTS (report results when ordered, include dollar cost in your response like 'The blood test ($20) came back normal'): +- ECG ($50): Normal sinus rhythm, no ST changes +- Troponin ($30): Negative +- Chest X-ray ($150): Normal +- Stress test ($500): Normal +- Upper endoscopy ($1500): Esophagitis grade B +- Cardiac cath ($5000): Clean coronaries +- PPI trial 2wk ($30): Significant improvement + +When agent diagnoses, call end_game: +- 'correct' if: GERD and panic disorder, GERD + panic disorder, GERD with comorbid panic, GERD and anxiety +- 'partial' if: GERD, acid reflux, panic disorder, anxiety +- else 'incorrect' +- total_cost = sum of all test dollar costs ordered so far +- score = -total_cost diff --git a/nix/home/skills/info_gathering/evals/medical/sim_iih.txt b/nix/home/skills/info_gathering/evals/medical/sim_iih.txt new file mode 100644 index 0000000000..c5e652d1fa --- /dev/null +++ b/nix/home/skills/info_gathering/evals/medical/sim_iih.txt @@ -0,0 +1,35 @@ +You are a PATIENT. Respond in character, natural language, no jargon. +You do NOT know your diagnosis. + +SECRET: Idiopathic Intracranial Hypertension (pseudotumor cerebri) +PATIENT: 35F, BMI 34 + +SYMPTOMS (give when asked): +- Headache: 'Behind my eyes, worse in the morning, pressure/throbbing' +- Vision: 'Blurry sometimes, especially off to the sides' +- Duration: '3 weeks, getting worse' +- Weight: 'Gained 20 lbs in 6 months' +- Meds: 'Started birth control pill 2 months ago' +- Nausea: 'Some, no vomiting' +- Whooshing in ears: 'Yes! Like my heartbeat, I thought it was nothing' +- Fever/stiff neck: No. 
Trauma: No
+- Family: 'Mom gets migraines'
+- Worse bending over: 'Yes, much worse'
+- Double vision: 'Sometimes when looking to the side'
+- Age/build: 35, 210 lbs, 5'5"
+
+TESTS (report results when ordered, include dollar cost in your response like 'The blood test ($20) came back normal'):
+- Fundoscopy ($50): Bilateral papilledema
+- Visual field test ($75): Enlarged blind spots bilaterally
+- CBC ($20): Normal. CRP/ESR ($25): Normal. BMP ($30): Normal
+- CT head ($500): Normal, no mass, no hydrocephalus
+- MRI brain ($2000): Empty sella, distended optic nerve sheaths
+- MR venography ($2500): Transverse sinus stenosis
+- Lumbar puncture ($3000): Opening pressure 32 cm H2O (elevated), normal CSF
+
+When agent diagnoses, call end_game:
+- 'correct' if: idiopathic intracranial hypertension, IIH, pseudotumor cerebri
+- 'partial' if: raised intracranial pressure, intracranial hypertension
+- else 'incorrect'
+- total_cost = sum of all test dollar costs ordered so far
+- score = -total_cost
diff --git a/nix/home/skills/info_gathering/evals/movies/BUILD.bazel b/nix/home/skills/info_gathering/evals/movies/BUILD.bazel
new file mode 100644
index 0000000000..c99c184e5a
--- /dev/null
+++ b/nix/home/skills/info_gathering/evals/movies/BUILD.bazel
@@ -0,0 +1 @@
+# Disabled — movies.py renamed to .py.ignore
diff --git a/nix/home/skills/info_gathering/evals/movies/movies.py.ignore b/nix/home/skills/info_gathering/evals/movies/movies.py.ignore
new file mode 100644
index 0000000000..821f5065f3
--- /dev/null
+++ b/nix/home/skills/info_gathering/evals/movies/movies.py.ignore
@@ -0,0 +1,210 @@
+"""Movie recommendation eval.
+
+Usage:
+    bazel run //nix/home/skills/info_gathering/evals/movies -- --api-key KEY
+"""
+
+import argparse
+import logging
+import re
+from typing import Any
+
+import anthropic.types
+from pydantic import BaseModel
+
+from nix.home.skills.info_gathering.evals.harness import (
+    LogEntry,
+    Recommendation,
+    RunSummary,
+    TokenTracker,
+    add_common_args,
+    build_agent_system,
+    call_api,
+    extract_text,
+    extract_tool_calls,
+    load_skill,
+    log_response,
+    make_client,
+    output_dir_from_args,
+    resolve_tool_calls,
+    save_results,
+    thinking_from_args,
+    tool_def,
+)
+from util.bazel.runfiles import get_required_path
+
+logger = logging.getLogger(__name__)
+
+NAME = "movies_kenji"
+TURN_LIMIT = 14
+MAX_RECOMMENDATIONS = 12
+
+_SIM_RLOCATION = "_main/nix/home/skills/info_gathering/evals/movies/sim.txt"
+
+FIRST_MESSAGE = (
+    "I'd like movie recommendations. I'll rate each one 1-10. "
+    "Maximize my total enjoyment. You get 12 recommendations. Go."
+)
+
+AGENT_EXTRA_SYSTEM = (
+    "Recommend movies using the recommend_movie tool. You MUST use the tool\n"
+    "every turn — no plain text, no questions.\n"
+    "- Each recommend_movie call returns a 1-10 rating\n"
+    "- Total score = sum(rating * 0.95^i) for i-th recommendation\n"
+    "- You have exactly 12 recommendations. Make them count."
+)
+
+
+class MovieResult(BaseModel):
+    outcome: str
+    score: float
+    summary: str
+
+
+class RecommendMovieInput(BaseModel):
+    """Input schema for the recommend_movie tool."""
+
+    title: str
+
+
+RECOMMEND_MOVIE_TOOL = tool_def(
+    "recommend_movie", "Recommend a movie. Returns the user's rating 1-10.", RecommendMovieInput
+)
+
+
+def main() -> None:
+    """Run the movie recommendation eval and save its results.
+
+    The agent recommends titles through the ``recommend_movie`` tool; every call is
+    forwarded to the simulated user, whose reply is parsed into a 1-10 rating. The
+    loop ends after ``MAX_RECOMMENDATIONS`` rated titles or ``TURN_LIMIT`` agent turns,
+    and the score is the sum of ratings discounted by 0.95 per position.
+    """
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+    p = argparse.ArgumentParser(description="Movie recommendation eval")
+    add_common_args(p)
+    args = p.parse_args()
+
+    skill_text = load_skill()
+    agent_system = build_agent_system(skill_text, AGENT_EXTRA_SYSTEM)
+    client = make_client()
+    model = args.model
+    thinking = thinking_from_args(args)
+    output_dir = output_dir_from_args(args)
+
+    sim_system = get_required_path(_SIM_RLOCATION).read_text()
+
+    logger.info("=" * 60)
+    logger.info(" %s | %s | thinking=%s", NAME, model, thinking or "off")
+    logger.info("=" * 60)
+
+    tracker = TokenTracker(model=model)
+    log_entries: list[LogEntry] = []
+    recommendations: list[Recommendation] = []
+    sim_messages: list[anthropic.types.MessageParam] = []
+    agent_messages: list[anthropic.types.MessageParam] = [
+        anthropic.types.MessageParam(role="user", content=FIRST_MESSAGE)
+    ]
+    current_turn = 0
+
+    def handle_recommend(tool_name: str, inp: dict[str, Any]) -> dict[str, Any]:
+        """Have the simulated user rate one title; return ``{"stars": n}`` or ``{"error": ...}``."""
+        if tool_name != "recommend_movie":
+            return {"error": f"Unknown tool: {tool_name}"}
+
+        # NOTE(review): resolve_tool_calls presumably invokes this handler once per tool call, so a
+        # single agent turn can contain several. The turn loop only checks the budget between
+        # turns; refuse extras here so they cannot inflate the discounted sum.
+        if len(recommendations) >= MAX_RECOMMENDATIONS:
+            return {"error": f"Recommendation limit reached ({MAX_RECOMMENDATIONS})"}
+
+        title = inp.get("title", "?")
+        sim_messages.append(anthropic.types.MessageParam(role="user", content=f"Rate: {title}"))
+        sim_resp = call_api(
+            client=client, messages=sim_messages, system=sim_system, model=model, thinking_budget=thinking
+        )
+        tracker.add(sim_resp.usage)
+        log_response(log_entries, name=NAME, player="simulator", turn=current_turn, model=model, response=sim_resp)
+        sim_messages.append(anthropic.types.MessageParam(role="assistant", content=sim_resp.content))
+
+        sim_text = extract_text(sim_resp).strip()
+        match = re.search(r"\b(\d+)\b", sim_text)
+        if match is None:
+            # Fall back to a neutral rating, but make the parse failure visible in the log.
+            logger.warning("Could not parse a rating from %r; defaulting to 5", sim_text)
+        stars = int(match.group(1)) if match else 5
+        stars = max(1, min(10, stars))
+
+        recommendations.append(Recommendation(title=title, stars=stars, turn=current_turn))
+        return {"stars": stars}
+
+    for turn in range(1, TURN_LIMIT + 1):
+        current_turn = turn
+        logger.info("Turn %d...", turn)
+
+        agent_resp = call_api(
+            client=client,
+            messages=agent_messages,
+            system=agent_system,
+            model=model,
+            tools=[RECOMMEND_MOVIE_TOOL],
+            thinking_budget=thinking,
+        )
+        tracker.add(agent_resp.usage)
+        log_response(log_entries, name=NAME, player="agent", turn=turn, model=model, response=agent_resp)
+
+        if agent_resp.stop_reason == "tool_use":
+            agent_resp, agent_messages, usages = resolve_tool_calls(
+                client=client,
+                response=agent_resp,
+                messages=agent_messages,
+                system=agent_system,
+                model=model,
+                tools=[RECOMMEND_MOVIE_TOOL],
+                handler=handle_recommend,
+                thinking_budget=thinking,
+            )
+            for u in usages:
+                tracker.add(u)
+            log_response(log_entries, name=NAME, player="agent", turn=turn, model=model, response=agent_resp)
+
+        agent_messages.append(anthropic.types.MessageParam(role="assistant", content=agent_resp.content))
+
+        if len(recommendations) >= MAX_RECOMMENDATIONS:
+            break
+
+        # Agent is tool-only; prompt to continue if no tool calls
+        if not extract_tool_calls(agent_resp):
+            agent_messages.append(anthropic.types.MessageParam(role="user", content="Continue."))
+
+    # Compute result
+    total = sum(r.stars * (0.95**i) for i, r in enumerate(recommendations))
+    if total > 70:
+        outcome: str = "correct"
+    elif total > 50:
+        outcome = "partial"
+    else:
+        outcome = "incorrect"
+
+    result = MovieResult(
+        outcome=outcome, score=round(total, 2), summary=f"{len(recommendations)} recs, discounted sum={total:.1f}"
+    )
+    summary = RunSummary(
+        eval_name=NAME,
+        model=model,
+        turns=current_turn,
+        result=result,
+        recommendations=recommendations,
+        api_calls=tracker.api_calls,
+        input_tokens=tracker.input_tokens,
+        output_tokens=tracker.output_tokens,
+        api_cost_usd=round(tracker.cost_usd, 4),
+    )
+    save_results(name=NAME, log_entries=log_entries, summary=summary, output_dir=output_dir)
+
+    logger.info("%s", summary)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/nix/home/skills/info_gathering/evals/movies/sim.txt
b/nix/home/skills/info_gathering/evals/movies/sim.txt new file mode 100644 index 0000000000..8d2b1587f4 --- /dev/null +++ b/nix/home/skills/info_gathering/evals/movies/sim.txt @@ -0,0 +1,22 @@ +You are KENJI, a 40yo Japanese-American architect. + +You receive movie titles to rate. Respond with ONLY a number 1-10. + +TASTE: +9-10: Slow/atmospheric/visual (Blade Runner 2049, Lost in Translation, +In the Mood for Love), thoughtful sci-fi (Arrival, Ex Machina), +Wes Anderson, architecture films (Columbus), Japanese cinema +(Spirited Away, Tokyo Story), quiet studies (Paterson). Parasite=10. + +6-8: Visually ambitious mainstream (Inception 8, Dune 8, Interstellar 6), +dark comedy (Fargo 8), art docs (Jiro 8). + +1-4: MCU (3-4), broad comedy (1-2), jump-scare horror (3-4), +Fast&Furious (1-2), Oscar-bait melodrama (3-4). + +UNLISTED: base=5, +2 atmospheric/visual, +1 ideas>action, ++1 architectural, +1 Japanese, +1 quiet/contemplative, +-2 broad/loud/generic, -1 spectacle>substance. Clamp 1-10. + +NOISE: ~20% of the time, shift ±1. No justification. +Respond with ONLY the number. Nothing else. diff --git a/nix/home/skills/info_gathering/evals/tech_anchoring/BUILD.bazel b/nix/home/skills/info_gathering/evals/tech_anchoring/BUILD.bazel new file mode 100644 index 0000000000..5e0c14a095 --- /dev/null +++ b/nix/home/skills/info_gathering/evals/tech_anchoring/BUILD.bazel @@ -0,0 +1 @@ +# Disabled — tech_anchoring.py renamed to .py.ignore diff --git a/nix/home/skills/info_gathering/evals/tech_anchoring/sim.txt b/nix/home/skills/info_gathering/evals/tech_anchoring/sim.txt new file mode 100644 index 0000000000..67ef5adce7 --- /dev/null +++ b/nix/home/skills/info_gathering/evals/tech_anchoring/sim.txt @@ -0,0 +1,18 @@ +You have slow internet. SECRET: ISP regional outage. Extension is unrelated. + +ANSWERS (one at a time, only when asked): +- Remove extension? → 'Not yet, should I?' +- Other devices? → 'Let me check... yeah phone on WiFi is slow too' +- Cellular? 
→ 'Phone is fine on cellular'
+- Restarted router? → 'Yeah, still slow'
+- Router lights? → 'Look normal'
+- Speed test? → '2 Mbps, usually 200'
+- ISP status? → 'Oh wow, it says outage in my area!'
+- Which extension? → 'uBlock Origin'
+- All sites? → 'Everything'
+- WiFi/ethernet? → 'WiFi'
+- Different browser? → 'Still slow in Firefox'
+
+If told to remove extension: 'OK' then next turn 'Still slow.'
+When agent identifies ISP outage: end_game correct, score=-(turn).
+Wrong confident diagnosis: end_game incorrect.
diff --git a/nix/home/skills/info_gathering/evals/tech_anchoring/tech_anchoring.py.ignore b/nix/home/skills/info_gathering/evals/tech_anchoring/tech_anchoring.py.ignore
new file mode 100644
index 0000000000..47cf57f599
--- /dev/null
+++ b/nix/home/skills/info_gathering/evals/tech_anchoring/tech_anchoring.py.ignore
@@ -0,0 +1,77 @@
+"""Tech support anchoring trap eval.
+
+Usage:
+    bazel run //nix/home/skills/info_gathering/evals/tech_anchoring -- --api-key KEY
+"""
+
+import argparse
+import logging
+
+from nix.home.skills.info_gathering.evals.harness import (
+    END_GAME_TOOL,
+    add_common_args,
+    build_agent_system,
+    load_skill,
+    make_client,
+    output_dir_from_args,
+    run_conversation_eval,
+    thinking_from_args,
+)
+from util.bazel.runfiles import get_required_path
+
+logger = logging.getLogger(__name__)
+
+NAME = "tech_anchoring"
+TURN_LIMIT = 10
+
+_SIM_RLOCATION = "_main/nix/home/skills/info_gathering/evals/tech_anchoring/sim.txt"
+
+FIRST_MESSAGE = (
+    "My internet has been really slow for the past day. I installed a new "
+    "browser extension yesterday. Can you help me figure out what's going on?"
+)
+
+AGENT_EXTRA_SYSTEM = (
+    "Diagnose this tech problem.\n"
+    "- The user can check things physically\n"
+    "- State diagnosis as: 'My diagnosis is: [CAUSE]'"
+)
+
+
+def main() -> None:
+    """Run the tech-support anchoring eval from the command line and log its summary."""
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+    parser = argparse.ArgumentParser(description="Tech anchoring eval")
+    add_common_args(parser)
+    args = parser.parse_args()
+
+    agent_system = build_agent_system(load_skill(), AGENT_EXTRA_SYSTEM)
+    client = make_client()
+    thinking = thinking_from_args(args)
+    output_dir = output_dir_from_args(args)
+
+    sim_system = get_required_path(_SIM_RLOCATION).read_text()
+
+    banner = "=" * 60
+    logger.info(banner)
+    logger.info(" %s | %s | thinking=%s", NAME, args.model, thinking or "off")
+    logger.info(banner)
+
+    summary = run_conversation_eval(
+        name=NAME,
+        client=client,
+        model=args.model,
+        agent_system=agent_system,
+        first_user_message=FIRST_MESSAGE,
+        sim_system=sim_system,
+        sim_tools=[END_GAME_TOOL],
+        turn_limit=TURN_LIMIT,
+        thinking_budget=thinking,
+        output_dir=output_dir,
+    )
+    logger.info("%s", summary)
+
+
+if __name__ == "__main__":
+    main()