diff --git a/.gitignore b/.gitignore index 42545fbf7..a1719928e 100644 --- a/.gitignore +++ b/.gitignore @@ -173,3 +173,6 @@ cython_debug/ # PyPI configuration file .pypirc +results/ +examples/boids_flocking/metrics.json +examples/boids_flocking/correct.json diff --git a/README.md b/README.md index 55f40d262..4404c24d9 100644 --- a/README.md +++ b/README.md @@ -7,16 +7,16 @@ - +

-`ShinkaEvolve` is a framework that combines Large Language Models (LLMs) with evolutionary algorithms to drive scientific discovery. By leveraging the creative capabilities of LLMs and the optimization power of evolutionary search, `ShinkaEvolve` enables automated exploration and improvement of scientific code. The system is inspired by the [AI Scientist](https://sakana.ai/ai-scientist/), [AlphaEvolve](https://deepmind.google/discover/blog/alphaevolve-a-gemini-powered-coding-agent-for-designing-advanced-algorithms/) and the [Darwin Goedel Machine](https://sakana.ai/dgm/): It maintains a population of programs that evolve over generations, with an ensemble of LLMs acting as intelligent mutation operators that suggest code improvements. +[`ShinkaEvolve`](https://arxiv.org/abs/2509.19349) is a framework that combines Large Language Models (LLMs) with evolutionary algorithms to drive scientific discovery. By leveraging the creative capabilities of LLMs and the optimization power of evolutionary search, `ShinkaEvolve` enables automated exploration and improvement of scientific code. The system is inspired by the [AI Scientist](https://sakana.ai/ai-scientist/), [AlphaEvolve](https://deepmind.google/discover/blog/alphaevolve-a-gemini-powered-coding-agent-for-designing-advanced-algorithms/) and the [Darwin Goedel Machine](https://sakana.ai/dgm/): It maintains a population of programs that evolve over generations, with an ensemble of LLMs acting as intelligent mutation operators that suggest code improvements. The framework supports **parallel evaluation of candidates** locally or on a Slurm cluster. It maintains an archive of successful solutions, enabling knowledge transfer between different evolutionary islands. `ShinkaEvolve` is particularly well-suited for scientific tasks where there is a verifier available and the goal is to optimize performance metrics while maintaining code correctness and readability. 
-![](docs/conceptual.png) +![evolution](https://github.com/user-attachments/assets/22cf3468-17fe-4995-9e13-d602b490a54e) ## Documentation πŸ“ @@ -26,6 +26,7 @@ The framework supports **parallel evaluation of candidates** locally or on a Slu | πŸ““ **[Tutorial Notebook](examples/shinka_tutorial.ipynb)** | Interactive walkthrough of Shinka features | Hands-on examples, configuration, best practices | | βš™οΈ **[Configuration](docs/configuration.md)** | Comprehensive configuration reference | All config options, optimization settings, advanced features | | 🎨 **[WebUI](docs/webui.md)** | Interactive visualization and monitoring | Real-time tracking, result analysis, debugging tools | +|πŸ•ΉοΈ **[Local LLM Support](https://github.com/SakanaAI/ShinkaEvolve/blob/main/docs/support_local_llm.md)**| Instructions for Local LLMs | How to setup local LLMs on your machine| ## Installation & Quick Start πŸš€ @@ -52,9 +53,9 @@ For detailed installation instructions and usage examples, see the [Getting Star | Example | Description | Environment Setup | |---------|-------------|-------------------| | β­• [Circle Packing](examples/circle_packing) | Optimize circle packing to maximize radii. | `LocalJobConfig` | -| πŸ€– [Agent Design](examples/agent_design) | Design agent scaffolds for math tasks. | `LocalJobConfig` | +| πŸ€– [Agent Design](examples/adas_aime) | Design agent scaffolds for math tasks. | `LocalJobConfig` | | 🎯 [ALE-Bench](examples/ale_bench) | Code optimization for ALE-Bench tasks. | `LocalJobConfig` | -| ✨ [Novelty Generator](examples/novelty_generator_bck) | Generate creative, surprising outputs (e.g., ASCII art). | `LocalJobConfig` | +| ✨ [Novelty Generator](examples/novelty_generator) | Generate creative, surprising outputs (e.g., ASCII art). 
| `LocalJobConfig` | ## `shinka` Run with Python API 🐍 @@ -308,9 +309,9 @@ If you use `ShinkaEvolve` in your research, please cite it as follows: ``` @article{lange2025shinka, - title={ShinkaEvolve: Towards Open-Ended and Sample-Efficient Program Evolution}, + title={ShinkaEvolve: Towards Open-Ended And Sample-Efficient Program Evolution}, author={Lange, Robert Tjarko and Imajuku, Yuki and Cetin, Edoardo}, - journal={arXiv preprint}, + journal={arXiv preprint arXiv:2509.19349}, year={2025} } -``` \ No newline at end of file +``` diff --git a/configs/cluster/local.yaml b/configs/cluster/local.yaml index c8e4fc8c7..4b73e28bc 100644 --- a/configs/cluster/local.yaml +++ b/configs/cluster/local.yaml @@ -1,6 +1,7 @@ job_config: _target_: shinka.launch.LocalJobConfig eval_program_path: ${distributed_job_config.eval_program_path} - + eval_command: ${oc.select:distributed_job_config.eval_command,null} + evo_config: job_type: "local" diff --git a/configs/config.yaml b/configs/config.yaml index 9702c6617..577e1dfe2 100644 --- a/configs/config.yaml +++ b/configs/config.yaml @@ -2,9 +2,9 @@ defaults: - _self_ - database@_global_: island_small - evolution@_global_: small_budget - - task@_global_: mad_tf + - task@_global_: circle_packing - cluster@_global_: local - - variant@_global_: mad_tf_example + - variant@_global_: circle_packing_example verbose: false results_dir: results diff --git a/configs/evolution/agentic.yaml b/configs/evolution/agentic.yaml new file mode 100644 index 000000000..3fd0bf102 --- /dev/null +++ b/configs/evolution/agentic.yaml @@ -0,0 +1,45 @@ +evo_config: + _target_: shinka.core.EvolutionConfig + agentic_mode: true + # LLM models for patch generation (used by bandit sampling) + llm_models: + - "gpt-4.1" + - "claude-sonnet-4-20250514" + - "gemini-2.5-flash" + llm_dynamic_selection: ucb + embedding_model: "text-embedding-3-small" + num_generations: 2 + max_parallel_jobs: 1 + agentic: + _target_: shinka.core.runner.AgenticConfig + backend: "shinka" + 
cli_profile: null + sandbox: "workspace-write" + approval_mode: "full-auto" + max_turns: 50 + max_seconds: 0 + cli_path: null + extra_cli_config: + # Model used for agentic editing sessions + # REQUIRED: Will fail if not set (no silent fallbacks to old models) + model: "gpt-4.1" + resume_parent_session: false + # Use /tmp to isolate scratch dirs from git repos, preventing Codex CLI + # from discovering parent AGENTS.md files. Set to null to use results_dir. + scratch_dir_base: "/tmp/shinka_scratch" + evaluator: + _target_: shinka.core.runner.EvaluatorConfig + mode: auto + agentic: + _target_: shinka.core.runner.AgenticEvaluatorConfig + # If null, inherits backend from agentic.backend + backend: null + sandbox: "workspace-write" + approval_mode: "full-auto" + max_events: 80 + max_seconds: 0 + extra_cli_config: + model: "gpt-4.1" + # Custom evaluation criteria (null for default quantitative eval) + eval_prompt: null + results_dir: ${output_dir} diff --git a/configs/task/boids_flocking.yaml b/configs/task/boids_flocking.yaml new file mode 100644 index 000000000..180c3db6a --- /dev/null +++ b/configs/task/boids_flocking.yaml @@ -0,0 +1,55 @@ +# Boids Flocking Task Configuration +# Task: Evolve flocking behavior to minimize collisions while maintaining tight grouping + +# Task metadata (used by UI/logging) +task: + task_name: boids_flocking + description: | + Optimize the Boids flocking simulation. The goal is to evolve the separation, + alignment, and cohesion behaviors to: + 1. Minimize collisions between boids + 2. Maintain tight grouping (cohesion) + 3. Achieve good velocity alignment + + The simulation runs for 1000 steps with 50 boids. Improve the scoring function, + behavior weights, and physics parameters to achieve a higher combined score. 
+ exec_fname: main.py + init_support_dir: examples/boids_flocking + language: python + metrics_fname: metrics.json + correct_fname: correct.json + score_key: combined_score + higher_is_better: true + allowed_files: + - boid.py + - simulation.py + - render.py + - main.py + primary_file: main.py + +# Evolution config overrides (merged into global evo_config) +evo_config: + init_program_path: "examples/boids_flocking/main.py" + task_sys_msg: | + You are an expert in emergent behavior simulation and evolutionary algorithms. + Optimize the Boids flocking simulation to achieve: + 1. Minimize collisions between boids (separation) + 2. Maintain tight grouping (cohesion) + 3. Achieve good velocity alignment + + The simulation runs 1000 steps with 50 boids. You can edit multiple files: + - main.py: Entry point and configuration + - boid.py: Individual boid behavior + - simulation.py: Simulation loop and physics + - render.py: Visualization (optional) + + Focus on tuning behavior weights, perception radius, and force calculations. + language: python + init_support_dir: examples/boids_flocking + job_type: local + +distributed_job_config: + eval_program_path: "examples/boids_flocking/main.py" + # Don't set eval_command - let framework pass --results_dir dynamically + +exp_name: shinka_boids_flocking diff --git a/configs/task/circle_packing.yaml b/configs/task/circle_packing.yaml index 43b0c8441..0a4fd309b 100644 --- a/configs/task/circle_packing.yaml +++ b/configs/task/circle_packing.yaml @@ -30,6 +30,8 @@ evo_config: 7. The math literature suggests special arrangements for specific values of n Be creative and try to find a new solution. + + IMPORTANT: Your solution must be in main.py - this is the file that gets evaluated. 
language: "python" init_program_path: "examples/circle_packing/initial.py" job_type: "slurm_conda" diff --git a/configs/variant/boids_flocking.yaml b/configs/variant/boids_flocking.yaml new file mode 100644 index 000000000..5fbc282eb --- /dev/null +++ b/configs/variant/boids_flocking.yaml @@ -0,0 +1,13 @@ +# Variant configuration for Boids Flocking task +# This defines default overrides for the boids task + +defaults: + - /task: boids_flocking + - /evolution: small_budget + +variant_suffix: "_boids" + +# Task-specific evolution overrides +evo_config: + # Enable agentic mode for multi-file editing + agentic_mode: false # Set to true for agentic experiments diff --git a/configs/variant/boids_flocking_agentic.yaml b/configs/variant/boids_flocking_agentic.yaml new file mode 100644 index 000000000..087ff43bd --- /dev/null +++ b/configs/variant/boids_flocking_agentic.yaml @@ -0,0 +1,74 @@ +# Variant configuration for Boids Flocking task with agentic editing +# This enables the multi-turn agentic backend for multi-file evolution + +defaults: + - override /task@_global_: boids_flocking + - override /evolution@_global_: agentic + +variant_suffix: "_boids_agentic" +exp_name: "shinka_boids_flocking" + +# Override evo_config with boids-specific values (applied last) +evo_config: + init_program_path: "examples/boids_flocking/main.py" + init_support_dir: examples/boids_flocking + max_score: 100.0 + num_generations: 30 + max_parallel_jobs: 2 + llm_models: + - "gemini-2.5-flash" + agentic: + extra_cli_config: + model: "gemini-2.5-flash" + task_sys_msg: | + You are an expert in emergent behavior simulation and evolutionary algorithms. + Optimize the Boids flocking simulation to achieve beautiful, natural flocking behavior. + + The simulation runs 1000 steps with 50 boids. 
You can edit multiple files: + - main.py: Entry point and configuration + - boid.py: Individual boid behavior + - simulation.py: Simulation loop and physics + - render.py: Visualization (optional) + + Focus on creating emergent patterns, smooth motion, and natural group dynamics. + evaluator: + agentic: + extra_cli_config: + model: "gemini-2.5-flash" + eval_prompt: | + Evaluate this boids simulation using BOTH quantitative metrics AND code quality. + + ## Part 1: Performance Metrics (0-50 points) + Run the simulation and read the ACTUAL metrics from stdout. + + **Collision Avoidance** (0-20 points): + - 0 collisions = 20 pts | <100 = 15 pts | <500 = 10 pts | <1000 = 5 pts | >=1000 = 0 pts + + **Alignment** (0-15 points): Read final alignment_score (0.0-1.0) + - >=0.95 = 15 pts | >=0.85 = 12 pts | >=0.70 = 8 pts | <0.70 = 4 pts + + **Cohesion** (0-15 points): Read final cohesion_score (0.0-1.0) + - >=0.70 = 15 pts | >=0.50 = 12 pts | >=0.30 = 8 pts | <0.30 = 4 pts + + ## Part 2: Solution Quality (0-50 points) + Review the code in boid.py, simulation.py, and main.py. + + **Algorithm Elegance** (0-20 points): + - Novel/creative approach to flocking behavior? + - Clean separation of concerns? + - Efficient force calculations? + - Smart use of spatial partitioning or other optimizations? + + **Parameter Tuning** (0-15 points): + - Well-reasoned weight values for separation/alignment/cohesion? + - Appropriate perception/separation radii? + - Good balance between stability and responsiveness? + + **Code Quality** (0-15 points): + - Readable and well-structured? + - No hacky workarounds or magic numbers without explanation? + - Would this scale to more boids? + + IMPORTANT: Base performance scores on ACTUAL simulation output, not guesses. 
+ combined_score = Part 1 + Part 2 (0-100) + correct = true if simulation runs without crashes diff --git a/configs/variant/circle_packing_agentic.yaml b/configs/variant/circle_packing_agentic.yaml new file mode 100644 index 000000000..428b3120c --- /dev/null +++ b/configs/variant/circle_packing_agentic.yaml @@ -0,0 +1,26 @@ +# Variant configuration for Circle Packing task with agentic editing +# This enables the multi-turn agentic backend for evolution + +defaults: + - override /database@_global_: island_large + - override /task@_global_: circle_packing + - override /evolution@_global_: agentic + - override /cluster@_global_: local + +variant_suffix: "_agentic" +exp_name: "shinka_circle_packing" + +# Override evo_config with agentic-specific values for circle packing +evo_config: + num_generations: 50 + max_parallel_jobs: 4 + llm_models: + - "gemini-2.5-flash" + llm_dynamic_selection: ucb + # Override agentic model settings + agentic: + extra_cli_config: + model: "gemini-2.5-flash" + # Use legacy evaluator for circle packing (deterministic metric: sum of radii) + evaluator: + mode: legacy diff --git a/docs/getting_started.md b/docs/getting_started.md index 234158839..2fcc287d0 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -2,6 +2,8 @@ Shinka is a framework that combines Large Language Models (LLMs) with evolutionary algorithms to drive scientific discovery. This guide will help you get started with installing, configuring, and running your first evolutionary experiments. +![](../docs/conceptual.png) + ## Table of Contents 1. [What is Shinka?](#what-is-shinka) @@ -53,7 +55,7 @@ pip install uv ```bash git clone -cd shinka +cd ShinkaEvolve # Create virtual environment with Python 3.11 uv venv --python 3.11 @@ -79,7 +81,7 @@ conda activate shinka ```bash git clone -cd shinka +cd ShinkaEvolve pip install -e . 
``` @@ -249,7 +251,7 @@ from shinka.core import run_shinka_eval def main(program_path: str, results_dir: str): """Main evaluation function called by Shinka""" - + metrics, correct, error_msg = run_shinka_eval( program_path=program_path, results_dir=results_dir, @@ -268,11 +270,11 @@ def main(program_path: str, results_dir: str): def validate_packing(run_output): """Returns (is_valid: bool, error_msg: str or None)""" centers, radii, reported_sum = run_output - + # Check constraints (bounds, overlaps, etc.) if constraint_violated: return False, "Specific error description" - + return True, None # Valid solution ``` @@ -280,10 +282,10 @@ def validate_packing(run_output): ```python def aggregate_metrics(results, results_dir): """Returns metrics dictionary with required structure""" - + # Extract data from results centers, radii, reported_sum = results[0] - + return { "combined_score": float(reported_sum), # PRIMARY FITNESS (higher = better) "public": { # Visible in WebUI/logs @@ -331,6 +333,75 @@ The `run_shinka_eval` function returns three values: ## Advanced Usage +### Resuming Experiments + +If you need to pause and resume an evolutionary run, or extend a completed run with more generations, Shinka supports seamless resumption from existing results. 
+ +#### How Resuming Works + +When you specify an existing `results_dir` that contains a database, Shinka will: +- Detect the previous run automatically +- Restore the population database and all program history +- Resume meta-recommendations from the last checkpoint +- Continue from the last completed generation + +#### Using the CLI (Hydra) + +```bash +# Resume an existing run and extend to 50 generations +shinka_launch \ + variant=circle_packing_example \ + evo_config.results_dir=results_20250101_120000 \ + evo_config.num_generations=50 + +# Or with a custom task +shinka_launch \ + task=circle_packing \ + database=island_small \ + evolution=small_budget \ + cluster=local \ + evo_config.results_dir=path/to/previous/results \ + evo_config.num_generations=100 +``` + +#### Using the Python API + +```python +from shinka.core import EvolutionRunner, EvolutionConfig +from shinka.database import DatabaseConfig +from shinka.launch import LocalJobConfig + +# Point to existing results directory +evo_config = EvolutionConfig( + num_generations=50, # Extend to 50 total generations + results_dir="results_20250101_120000", # Existing results + # ... other config parameters ... +) + +job_config = LocalJobConfig( + eval_program_path="examples/circle_packing/evaluate.py", +) + +db_config = DatabaseConfig( + archive_size=20, + num_islands=2, +) + +# Run will automatically detect and resume +runner = EvolutionRunner( + evo_config=evo_config, + job_config=job_config, + db_config=db_config, +) +runner.run() +``` + +**Important Notes:** +- The `num_generations` parameter should be set to the **total** number of generations you want (not additional generations) +- For example, if you completed 20 generations and want 30 more, set `num_generations=50` +- The database configuration (number of islands, archive size, etc.) 
should match the original run +- All previous progress, including the best solutions and meta-recommendations, will be preserved + ### Environment Management for Local Jobs When running jobs locally, you have several options for managing Python environments: @@ -373,6 +444,101 @@ Generate animations showing how code evolves: python code_path_anim.py --results_dir examples/circle_packing/results_20250101_120000 ``` +## Agentic Mode (Multi-Turn Editing) + +Shinka supports **agentic mode** for multi-turn, multi-file code editing. Instead of single LLM calls, an agent can execute bash commands and modify multiple files over multiple turns. + +### Backends + +Agentic mode supports two backends: + +| Backend | Description | Setup Required | +|---------|-------------|----------------| +| **ShinkaAgent** (default) | Native in-process agent using LLMClient | Just API keys in `.env` | +| **Codex** | OpenAI's Codex CLI wrapper | Requires CLI installation + authentication | + +### Using ShinkaAgent (Recommended for Getting Started) + +ShinkaAgent is the default backend and requires no additional setup beyond your API keys: + +```bash +# Run with agentic mode using ShinkaAgent +shinka_launch variant=boids_flocking_agentic +``` + +### Setting Up Codex Backend + +If you want to use the Codex backend, follow these steps: + +#### Step 1: Install Codex CLI + +```bash +npm install -g @openai/codex +``` + +Verify installation: +```bash +codex --version +``` + +#### Step 2: Authenticate Codex + +```bash +codex login +``` + +This opens your browser for OAuth authentication with your ChatGPT account. 
+ +#### Step 3: Verify Authentication + +```bash +codex login status +# Should show: "Logged in using ChatGPT" or similar +``` + +#### Step 4: Run with Codex Backend + +```bash +# Override the backend to use Codex +shinka_launch variant=circle_packing_agentic evo_config.agentic.backend=codex +``` + +### Agentic Mode Configuration + +Key configuration options in your variant YAML: + +```yaml +evo_config: + agentic_mode: true # Enable agentic editing + agentic: + backend: "shinka" # or "codex" + max_turns: 50 # Max conversation turns + sandbox: "workspace-write" + approval_mode: "full-auto" +``` + +### Troubleshooting Agentic Mode + +**Codex not found:** +``` +CodexUnavailableError: Codex CLI not found +``` +Solution: `npm install -g @openai/codex` + +**Codex not authenticated:** +``` +CodexAuthError: Codex CLI is not authenticated +``` +Solution: `codex login` + +**ShinkaAgent API key missing:** +``` +ShinkaUnavailableError: No API keys configured +``` +Solution: Ensure `OPENAI_API_KEY` or `ANTHROPIC_API_KEY` is set in your `.env` file + +--- + ## Troubleshooting ### Common Issues diff --git a/docs/support_local_llm.md b/docs/support_local_llm.md new file mode 100644 index 000000000..5f406e7b9 --- /dev/null +++ b/docs/support_local_llm.md @@ -0,0 +1,232 @@ + +# 🧩 Integrating Local LLMs into **ShinkaEvolve** + +## 🧠 Overview + +The original **ShinkaEvolve** code does **not** include built-in support for running **local LLMs**. +To enable this functionality, parts of the codebase can be modified to integrate locally hosted models. + +--- + +## 🏗️ Code Organization + +**ShinkaEvolve** uses a **modular architecture** that supports multiple **LLM providers**. +The relevant code for LLM interaction is located in the **`LLM/`** folder, which manages all model communications. +ShinkaEvolve distinguishes between two LLM types: + +* **Regular LLMs** +* **Embedding LLMs** + +--- + +## ⚙️ Adding a Regular LLM + +To add support for a **regular LLM**, follow these steps. 
They walk through adding support for gpt-oss models served with Unsloth, which exposes an OpenAI-compatible API (v1/completions). +This LLM can then be specified in the configuration variables: + +```yaml +llm_models: +meta_llm_models: +``` + +--- + +### 🔧 Step 1: Modify the Client + +The file **`client.py`** is responsible for creating clients that interact with LLMs. +Each client instance is later used to query a specific model. + +To add a local model, introduce a new client configuration. +The API URL is extracted from the model name, which follows this format: + +``` +local-gptoss-unsloth-url +``` + +#### Example + +```python +elif "local-gptoss-unsloth" in model_name: + # Extract URL from model name + pattern = r"https?://" + match = re.search(pattern, model_name) + if match: + start_index = match.start() + url = model_name[start_index:] + else: + raise ValueError(f"Invalid URL in model name: {model_name}") + + # Create OpenAI-compatible client + client = openai.OpenAI( + api_key="filler", + base_url=url + ) + + # Structured output mode (if required) + if structured_output: + client = instructor.from_openai( + client, + mode=instructor.Mode.JSON, + ) +``` + +--- + +### 📁 Step 2: Create the Local Query Function + +Inside the **`models/`** folder, create a new subfolder to store the query functions for your local models: + +``` +LLM/models/local/ +``` + +> Don't forget to include an empty `__init__.py` file. + +This folder should contain a **custom query function** for the local model. In this example the file is named `local_gptoss_unsloth.py`. +It should follow the same structure as other functions in `LLM/models/`, but with small adjustments. + +#### My Key Adjustments + +* Replace `max_output_tokens` with **`max_tokens`** to match the local API. +* Extract additional response metadata such as: + + * `total_tokens` + * `thinking_tokens` (if your model includes reasoning traces) + +This function is later imported and registered in **`query.py`**. 
+ +--- + +### 🧩 Step 3: Update `__init__.py` + +Configure **`models/__init__.py`** to include and expose the new local query function, so it can be imported elsewhere. + +``` +from .local.local_gptoss_unsloth import query_local_gptoss_unsloth # ADDED THIS LINE +from .result import QueryResult + +__all__ = [ + "query_anthropic", + "query_openai", + "query_deepseek", + "query_gemini", + "query_local_gptoss_unsloth", # ADDED THIS LINE + "QueryResult", +] +``` + +--- + +### 📬 Step 4: Update `query.py` + +Import and register the new local query function in `query.py`. + +#### Imports + +```python +from .models import ( + query_anthropic, + query_openai, + query_deepseek, + query_gemini, + query_local_gptoss_unsloth, # ADDED THIS LINE + QueryResult, +) +``` + +#### Model Selection Logic + +```python +elif "local-gptoss-unsloth" in model_name: # ADDED THIS LINE + query_fn = query_local_gptoss_unsloth +``` + +--- + +### 🧠 Step 5: Other Observations + +The file **`query.py`** also defines functions such as: + +* `sample_model_kwargs` +* `sample_batch_kwargs` + +However, these are **not referenced anywhere else** in the repository, so no modifications are required here for now. + +--- + +### ✅ Summary + +| Step | File | Change | Description | +| ---- | -------------------------------------------- | -------------------- | -------------------------------------------------------- | +| 1 | `client.py` | Add new client block | Create OpenAI-compatible client for local LLM | +| 2 | `models/local/local_gptoss_unsloth.py` | New function | Query local model, adjust tokens, extract reasoning info | +| 3 | `__init__.py` | Add import | Expose new query function | +| 4 | `query.py` | Register model | Add conditional for local LLM | +| 5 | — | Review only | Ignored unused functions | + +--- + +## 🧬 Adding a Local Embedding Model + +For embedding models, you can use **Ollama**, which follows the **OpenAI API** format. +The only relevant file is **`embedding.py`**. 
+ +### Code Addition + +```python +elif model_name.startswith("local-"): + # Pattern: local-(model-name)-(http or https url) + match = re.match(r"local-(.+?)-(https?://.+)", model_name) + if match: + model_to_use = match.group(1) + url = match.group(2) + else: + raise ValueError(f"Invalid local model format: {model_name}") + + client = openai.OpenAI( + base_url=url, + api_key="filler" + ) +``` + +#### Notes + +* Compatible with **any Ollama model**. +* The model name must follow this convention: + + ``` + local-model-name-url + ``` +* The code extracts both `model-name` and `url`, and uses them to query Ollama. + +--- + +### Query Logic + +The existing line in **`embedding.py`** remains unchanged: + +```python +response = self.client.embeddings.create( + model=self.model, + input=code, + encoding_format="float" +) +``` + +For local embedding models, `self.model` corresponds to the extracted model name. +The only addition to the **Embedding Client** class sets the cost of local models to zero: + +```python +elif self.model_name.startswith("local-"): + cost = 0.0 +``` + +--- + +## 🚀 Result + +ShinkaEvolve can now connect to **locally hosted LLMs** and **embedding models** through **OpenAI-compatible APIs**. +This setup covers embedding models served by **Ollama** and **gpt-oss** models served through **Unsloth**. + +If your model has different requirements, follow the same pattern with a distinct model identifier and your own custom logic. + diff --git a/examples/boids_flocking/boid.py b/examples/boids_flocking/boid.py new file mode 100644 index 000000000..c59d30c6b --- /dev/null +++ b/examples/boids_flocking/boid.py @@ -0,0 +1,171 @@ +""" +Boid class implementing separation, alignment, and cohesion behaviors. 
+""" + +import math +from dataclasses import dataclass, field +from typing import List + + +@dataclass +class Vector2D: + """Simple 2D vector for boid physics.""" + + x: float = 0.0 + y: float = 0.0 + + def __add__(self, other: "Vector2D") -> "Vector2D": + return Vector2D(self.x + other.x, self.y + other.y) + + def __sub__(self, other: "Vector2D") -> "Vector2D": + return Vector2D(self.x - other.x, self.y - other.y) + + def __mul__(self, scalar: float) -> "Vector2D": + return Vector2D(self.x * scalar, self.y * scalar) + + def __truediv__(self, scalar: float) -> "Vector2D": + if scalar == 0: + return Vector2D(0, 0) + return Vector2D(self.x / scalar, self.y / scalar) + + def magnitude(self) -> float: + return math.sqrt(self.x * self.x + self.y * self.y) + + def normalize(self) -> "Vector2D": + mag = self.magnitude() + if mag == 0: + return Vector2D(0, 0) + return self / mag + + def limit(self, max_val: float) -> "Vector2D": + mag = self.magnitude() + if mag > max_val: + return self.normalize() * max_val + return Vector2D(self.x, self.y) + + def distance_to(self, other: "Vector2D") -> float: + return (self - other).magnitude() + + +@dataclass +class Boid: + """A single boid in the flock.""" + + position: Vector2D = field(default_factory=lambda: Vector2D(0, 0)) + velocity: Vector2D = field(default_factory=lambda: Vector2D(0, 0)) + acceleration: Vector2D = field(default_factory=lambda: Vector2D(0, 0)) + + # Behavior weights (SUBOPTIMAL: these could be evolved) + separation_weight: float = 1.0 + alignment_weight: float = 1.0 + cohesion_weight: float = 1.0 + + # Physical parameters + max_speed: float = 4.0 + max_force: float = 0.1 + perception_radius: float = 50.0 + separation_radius: float = 25.0 + + def apply_force(self, force: Vector2D) -> None: + """Apply a steering force to the boid.""" + self.acceleration = self.acceleration + force + + def update(self) -> None: + """Update velocity and position.""" + self.velocity = self.velocity + self.acceleration + self.velocity 
= self.velocity.limit(self.max_speed) + self.position = self.position + self.velocity + self.acceleration = Vector2D(0, 0) + + def seek(self, target: Vector2D) -> Vector2D: + """Calculate steering force toward a target.""" + desired = target - self.position + desired = desired.normalize() * self.max_speed + steer = desired - self.velocity + return steer.limit(self.max_force) + + def separation(self, neighbors: List["Boid"]) -> Vector2D: + """Steer to avoid crowding local flockmates.""" + steer = Vector2D(0, 0) + count = 0 + + for other in neighbors: + d = self.position.distance_to(other.position) + if 0 < d < self.separation_radius: + diff = self.position - other.position + diff = diff.normalize() + # SUBOPTIMAL: Simple inverse weighting (could use inverse square) + diff = diff / d + steer = steer + diff + count += 1 + + if count > 0: + steer = steer / count + if steer.magnitude() > 0: + steer = steer.normalize() * self.max_speed + steer = steer - self.velocity + steer = steer.limit(self.max_force) + + return steer * self.separation_weight + + def alignment(self, neighbors: List["Boid"]) -> Vector2D: + """Steer towards the average heading of local flockmates.""" + avg_velocity = Vector2D(0, 0) + count = 0 + + for other in neighbors: + d = self.position.distance_to(other.position) + if 0 < d < self.perception_radius: + avg_velocity = avg_velocity + other.velocity + count += 1 + + if count > 0: + avg_velocity = avg_velocity / count + avg_velocity = avg_velocity.normalize() * self.max_speed + steer = avg_velocity - self.velocity + steer = steer.limit(self.max_force) + return steer * self.alignment_weight + + return Vector2D(0, 0) + + def cohesion(self, neighbors: List["Boid"]) -> Vector2D: + """Steer to move toward the average position of local flockmates.""" + center = Vector2D(0, 0) + count = 0 + + for other in neighbors: + d = self.position.distance_to(other.position) + if 0 < d < self.perception_radius: + center = center + other.position + count += 1 + + if count 
> 0: + center = center / count + return self.seek(center) * self.cohesion_weight + + return Vector2D(0, 0) + + def flock(self, boids: List["Boid"]) -> None: + """Apply all three flocking behaviors.""" + # Filter out self from neighbors + neighbors = [b for b in boids if b is not self] + + sep = self.separation(neighbors) + ali = self.alignment(neighbors) + coh = self.cohesion(neighbors) + + self.apply_force(sep) + self.apply_force(ali) + self.apply_force(coh) + + def wrap_edges(self, width: float, height: float) -> None: + """Wrap boid around screen edges.""" + if self.position.x > width: + self.position.x = 0 + elif self.position.x < 0: + self.position.x = width + + if self.position.y > height: + self.position.y = 0 + elif self.position.y < 0: + self.position.y = height diff --git a/examples/boids_flocking/main.py b/examples/boids_flocking/main.py new file mode 100644 index 000000000..415a683ce --- /dev/null +++ b/examples/boids_flocking/main.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python3 +""" +Boids Flocking Simulation - Main Entry Point + +This simulation evolves flocking behavior by optimizing separation, alignment, +and cohesion weights to minimize collisions while maintaining tight grouping. 
def parse_args():
    """Parse command line arguments.

    Returns:
        The ``argparse.Namespace`` with ``headless``, ``gui``, ``steps``,
        ``boids``, ``output_dir``, ``results_dir`` and ``program_path``.
    """
    parser = argparse.ArgumentParser(description="Boids Flocking Simulation")
    parser.add_argument(
        "--headless", action="store_true", help="Run without graphical output"
    )
    parser.add_argument(
        "--gui",
        action="store_true",
        help="Run with graphical output (opposite of --headless)",
    )
    parser.add_argument(
        "--steps",
        type=int,
        default=1000,
        help="Number of simulation steps (default: 1000)",
    )
    parser.add_argument(
        "--boids",
        type=int,
        default=50,
        help="Number of boids in the simulation (default: 50)",
    )
    parser.add_argument(
        "--output-dir", type=str, default=".", help="Directory for output files"
    )
    # For framework compatibility (--results_dir is passed by shinka legacy evaluator)
    parser.add_argument(
        "--results_dir",
        type=str,
        default=None,
        help="Alias for --output-dir (framework compat)",
    )
    parser.add_argument(
        "--program_path", type=str, default=None, help="Ignored (framework compat)"
    )
    return parser.parse_args()


def calculate_combined_score(metrics: dict) -> float:
    """
    Calculate a combined fitness score from the simulation metrics.

    SUBOPTIMAL SCORING (room for evolution):
    - Simple weighted average
    - Doesn't account for trade-offs between metrics
    - Could use more sophisticated aggregation

    Args:
        metrics: Mapping that may contain ``avg_separation``,
            ``alignment_score``, ``cohesion_score`` and ``collision_rate``;
            missing keys fall back to 0, 0.5, 0 and 1 respectively.

    Returns:
        A float score clamped to the range 0-100 (higher is better).
    """
    # Extract key metrics
    avg_separation = metrics.get("avg_separation", 0)
    alignment_score = metrics.get("alignment_score", 0.5)
    cohesion_score = metrics.get("cohesion_score", 0)
    collision_rate = metrics.get("collision_rate", 1)

    # SUBOPTIMAL: Simple weighting scheme
    # Ideal separation is around 20-40 (not too close, not too far)
    separation_penalty = abs(avg_separation - 30) / 30
    separation_score = max(0, 1 - separation_penalty)

    # Penalize collisions heavily
    collision_penalty = min(1, collision_rate * 10)

    # Combined score (higher is better)
    combined = (
        0.25 * separation_score
        + 0.25 * alignment_score
        + 0.25 * cohesion_score
        + 0.25 * (1 - collision_penalty)
    )

    # Float bounds keep the return type a float even when the clamp applies.
    return max(0.0, min(100.0, combined * 100))


def evaluate_simulation(args) -> dict:
    """Run the simulation and return evaluation results.

    Args:
        args: Parsed command line namespace; ``boids``, ``steps``,
            ``headless`` and ``gui`` are read.

    Returns:
        A dict with ``metrics`` (final aggregated metrics),
        ``combined_score`` and ``correct`` (score at or above 40).
    """
    # Create simulation config
    config = SimulationConfig(
        num_boids=args.boids,
        max_steps=args.steps,
        # SUBOPTIMAL weights (evolution should improve these)
        separation_weight=1.5,
        alignment_weight=1.0,
        cohesion_weight=1.0,
        max_speed=4.0,
        max_force=0.1,
        perception_radius=50.0,
        separation_radius=25.0,
    )

    # Create and run simulation
    sim = SimulationEnvironment(config)

    # Create renderer if --gui is set (default is headless for framework eval)
    renderer = None
    headless = args.headless or not args.gui  # Default to headless unless --gui is set
    if not headless:
        try:
            renderer = create_renderer(
                headless=False, width=config.width, height=config.height
            )
        except Exception as e:
            print(f"Warning: Could not create graphical renderer: {e}")
            print("Falling back to headless mode.")

    render_failure_reported = False
    try:
        # Run simulation
        for step in range(args.steps):
            sim.step()

            # Render if available
            if renderer and hasattr(renderer, "render"):
                try:
                    positions = sim.get_boid_positions()
                    velocities = sim.get_boid_velocities()
                    renderer.render(positions, velocities, step)
                except Exception as e:
                    # Rendering is best-effort: keep simulating, but report
                    # the first failure instead of hiding it entirely.
                    if not render_failure_reported:
                        print(f"Warning: rendering failed at step {step}: {e}")
                        render_failure_reported = True

            # Progress output every 100 steps
            if (step + 1) % 100 == 0:
                metrics = sim.get_final_metrics()
                print(
                    f"Step {step + 1}/{args.steps}: "
                    f"collisions={metrics.get('total_collisions', 0)}, "
                    f"alignment={metrics.get('alignment_score', 0):.3f}, "
                    f"cohesion={metrics.get('cohesion_score', 0):.3f}"
                )
    finally:
        # Close renderer even when the simulation loop raises
        if renderer and hasattr(renderer, "close"):
            renderer.close()

    # Get final metrics
    final_metrics = sim.get_final_metrics()
    combined_score = calculate_combined_score(final_metrics)

    return {
        "metrics": final_metrics,
        "combined_score": combined_score,
        "correct": combined_score >= 40,  # SUBOPTIMAL threshold (should be higher)
    }


def main():
    """Main entry point.

    Runs the evaluation, prints a summary and writes ``metrics.json`` and
    ``correct.json`` into the output directory.

    Returns:
        Process exit code: 0 when the run is judged correct, 1 otherwise.
    """
    args = parse_args()
    # Use --results_dir if provided (framework compat), otherwise --output-dir
    output_dir = Path(args.results_dir if args.results_dir else args.output_dir)

    print("=" * 60)
    print("BOIDS FLOCKING SIMULATION")
    print("=" * 60)
    print(f"Boids: {args.boids}")
    print(f"Steps: {args.steps}")
    headless = args.headless or not args.gui  # Default to headless unless --gui
    print(f"Mode: {'Headless' if headless else 'Graphical'}")
    print("=" * 60)

    # Run evaluation
    result = evaluate_simulation(args)

    # Print results
    print("\n" + "=" * 60)
    print("SIMULATION RESULTS")
    print("=" * 60)
    metrics = result["metrics"]
    print(f"Average Separation: {metrics.get('avg_separation', 0):.2f}")
    print(f"Alignment Score: {metrics.get('alignment_score', 0):.3f}")
    print(f"Cohesion Score: {metrics.get('cohesion_score', 0):.3f}")
    print(f"Total Collisions: {metrics.get('total_collisions', 0)}")
    print(f"Collision Rate: {metrics.get('collision_rate', 0):.4f}")
    print(f"Combined Score: {result['combined_score']:.2f}")
    print(f"Correct: {result['correct']}")
    print("=" * 60)

    # The target directory may not exist yet (e.g. a fresh results dir), so
    # create it before writing instead of failing after the whole run.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Write output files
    metrics_file = output_dir / "metrics.json"
    correct_file = output_dir / "correct.json"

    # Write full evaluation results including combined_score
    eval_output = {
        **metrics,
        "combined_score": result["combined_score"],
        "correct": result["correct"],
        "details": f"Collisions: {metrics.get('total_collisions', 0)}, "
        f"Alignment: {metrics.get('alignment_score', 0):.3f}, "
        f"Cohesion: {metrics.get('cohesion_score', 0):.3f}",
    }
    with open(metrics_file, "w", encoding="utf-8") as f:
        json.dump(eval_output, f, indent=2)
    print(f"Metrics written to: {metrics_file}")

    with open(correct_file, "w", encoding="utf-8") as f:
        json.dump({"correct": result["correct"]}, f)
    print(f"Correctness written to: {correct_file}")

    return 0 if result["correct"] else 1


if __name__ == "__main__":
    sys.exit(main())
class MatplotlibRenderer:
    """Matplotlib-based renderer for graphical output."""

    def __init__(self, width: float = 800, height: float = 600):
        """Store the drawing area size; the figure is created lazily."""
        self.width = width
        self.height = height
        self.fig = None
        self.ax = None
        self.scatter = None
        self.quiver = None

    def initialize(self) -> None:
        """Initialize matplotlib figure.

        Raises:
            RuntimeError: If matplotlib cannot be imported.
        """
        try:
            import matplotlib.pyplot as plt
        except ImportError as err:
            # Chain the cause so the missing dependency stays visible.
            raise RuntimeError(
                "matplotlib not available for graphical rendering"
            ) from err

        plt.ion()
        self.fig, self.ax = plt.subplots(figsize=(10, 8))
        self.ax.set_xlim(0, self.width)
        self.ax.set_ylim(0, self.height)
        self.ax.set_aspect("equal")
        self.ax.set_facecolor("#1a1a2e")
        self.fig.patch.set_facecolor("#1a1a2e")
        self.ax.axis("off")

    def render(
        self,
        positions: List[Tuple[float, float]],
        velocities: List[Tuple[float, float]],
        step: int = 0,
    ) -> None:
        """Render current frame.

        Args:
            positions: ``(x, y)`` pairs, one per boid.
            velocities: ``(vx, vy)`` pairs; arrows are drawn only when
                this is non-empty.
            step: Step number shown in the plot title.
        """
        import matplotlib.pyplot as plt

        if self.fig is None:
            self.initialize()

        # The axes are rebuilt from scratch every frame.
        self.ax.clear()
        self.ax.set_xlim(0, self.width)
        self.ax.set_ylim(0, self.height)
        self.ax.set_facecolor("#1a1a2e")
        self.ax.axis("off")

        if positions:
            xs, ys = zip(*positions)
            vxs, vys = zip(*velocities) if velocities else (None, None)

            # Draw boids as points
            self.ax.scatter(xs, ys, c="#00d9ff", s=30, alpha=0.8)

            # Draw velocity vectors
            if vxs and vys:
                self.ax.quiver(
                    xs, ys, vxs, vys, color="#ff6b6b", alpha=0.5, scale=50, width=0.003
                )

        self.ax.set_title(f"Step: {step}", color="white", fontsize=12)
        plt.pause(0.001)

    def save_frame(self, filename: str) -> None:
        """Save current frame to file; does nothing without a figure."""
        if self.fig:
            self.fig.savefig(filename, dpi=100, facecolor="#1a1a2e")

    def close(self) -> None:
        """Close the renderer and drop the figure handles."""
        if self.fig:
            import matplotlib.pyplot as plt

            plt.close(self.fig)
            # Forget the closed figure so later render()/save_frame() calls
            # do not operate on a dead handle; render() re-initializes.
            self.fig = None
            self.ax = None
@dataclass
class SimulationConfig:
    """Configuration for the boids simulation."""

    width: float = 800.0
    height: float = 600.0
    num_boids: int = 50
    max_steps: int = 1000

    # Boid parameters (SUBOPTIMAL: could be evolved)
    separation_weight: float = 1.5
    alignment_weight: float = 1.0
    cohesion_weight: float = 1.0
    max_speed: float = 4.0
    max_force: float = 0.1
    perception_radius: float = 50.0
    separation_radius: float = 25.0


class SimulationEnvironment:
    """Manages a flock of boids and runs the simulation."""

    def __init__(self, config: SimulationConfig):
        """Create the flock described by ``config`` and reset all counters."""
        self.config = config
        self.boids: List[Boid] = []
        self.step_count: int = 0
        self.collision_count: int = 0
        self.metrics_history: List[Dict[str, float]] = []
        self._initialize_flock()

    def _initialize_flock(self) -> None:
        """Create the initial flock with random positions and velocities."""
        for _ in range(self.config.num_boids):
            position = Vector2D(
                random.uniform(0, self.config.width),
                random.uniform(0, self.config.height),
            )
            angle = random.uniform(0, 2 * math.pi)
            speed = random.uniform(2, self.config.max_speed)
            velocity = Vector2D(math.cos(angle) * speed, math.sin(angle) * speed)

            boid = Boid(
                position=position,
                velocity=velocity,
                separation_weight=self.config.separation_weight,
                alignment_weight=self.config.alignment_weight,
                cohesion_weight=self.config.cohesion_weight,
                max_speed=self.config.max_speed,
                max_force=self.config.max_force,
                perception_radius=self.config.perception_radius,
                separation_radius=self.config.separation_radius,
            )
            self.boids.append(boid)

    def step(self) -> Dict[str, float]:
        """Run one simulation step and return current metrics.

        The returned dict is also appended to ``metrics_history`` and
        carries an extra ``step_collisions`` entry.
        """
        # Apply flocking behavior to each boid
        for boid in self.boids:
            boid.flock(self.boids)

        # Update positions and wrap edges
        for boid in self.boids:
            boid.update()
            boid.wrap_edges(self.config.width, self.config.height)

        # Count collisions (boids too close together)
        step_collisions = self._count_collisions()
        self.collision_count += step_collisions

        # Calculate metrics
        metrics = self._calculate_metrics()
        metrics["step_collisions"] = step_collisions
        self.metrics_history.append(metrics)

        self.step_count += 1
        return metrics

    def _count_collisions(self) -> int:
        """Count pairs of boids that are too close (collision)."""
        from itertools import combinations

        collision_threshold = 10.0  # Minimum safe distance

        # combinations() visits each unordered pair once without copying a
        # list slice per boid.
        return sum(
            1
            for first, second in combinations(self.boids, 2)
            if first.position.distance_to(second.position) < collision_threshold
        )

    def _calculate_metrics(self) -> Dict[str, float]:
        """Calculate current flock metrics.

        Returns:
            A dict with ``avg_separation``, ``alignment_score``,
            ``cohesion_score`` and ``avg_distance_to_center``; all zero
            when the flock is empty.
        """
        if not self.boids:
            # Same keys as the populated result so consumers see one schema.
            return {
                "avg_separation": 0,
                "alignment_score": 0,
                "cohesion_score": 0,
                "avg_distance_to_center": 0,
            }

        # Average separation (distance to nearest neighbor)
        separations = []
        for boid in self.boids:
            min_dist = float("inf")
            for other in self.boids:
                if other is not boid:
                    dist = boid.position.distance_to(other.position)
                    min_dist = min(min_dist, dist)
            if min_dist != float("inf"):
                separations.append(min_dist)

        avg_separation = sum(separations) / len(separations) if separations else 0

        # Alignment score (how similar are velocity directions)
        alignment_scores = []
        for boid in self.boids:
            neighbors = [
                b
                for b in self.boids
                if b is not boid
                and boid.position.distance_to(b.position) < boid.perception_radius
            ]
            if neighbors:
                # Calculate average velocity direction
                avg_vx = sum(n.velocity.x for n in neighbors) / len(neighbors)
                avg_vy = sum(n.velocity.y for n in neighbors) / len(neighbors)
                avg_vel = Vector2D(avg_vx, avg_vy)

                if boid.velocity.magnitude() > 0 and avg_vel.magnitude() > 0:
                    # Dot product normalized (1 = perfect alignment)
                    dot = boid.velocity.x * avg_vel.x + boid.velocity.y * avg_vel.y
                    alignment = dot / (boid.velocity.magnitude() * avg_vel.magnitude())
                    alignment_scores.append((alignment + 1) / 2)  # Normalize to 0-1

        alignment_score = (
            sum(alignment_scores) / len(alignment_scores) if alignment_scores else 0.5
        )

        # Cohesion score (how close are boids to the flock center)
        center_x = sum(b.position.x for b in self.boids) / len(self.boids)
        center_y = sum(b.position.y for b in self.boids) / len(self.boids)
        center = Vector2D(center_x, center_y)

        distances_to_center = [b.position.distance_to(center) for b in self.boids]
        avg_distance = sum(distances_to_center) / len(distances_to_center)

        # Normalize cohesion (lower distance = better cohesion)
        max_expected_distance = (
            math.sqrt(self.config.width**2 + self.config.height**2) / 4
        )
        cohesion_score = max(0, 1 - avg_distance / max_expected_distance)

        return {
            "avg_separation": avg_separation,
            "alignment_score": alignment_score,
            "cohesion_score": cohesion_score,
            "avg_distance_to_center": avg_distance,
        }

    def run(self, steps: Optional[int] = None) -> Dict[str, Any]:
        """Run simulation for specified steps and return final metrics.

        Args:
            steps: Number of steps to run; ``None`` means
                ``config.max_steps``. An explicit ``0`` runs no steps.
        """
        # Compare against None: `steps or max_steps` would turn an explicit
        # 0 into a full-length run.
        if steps is None:
            steps = self.config.max_steps

        for _ in range(steps):
            self.step()

        return self.get_final_metrics()

    def get_final_metrics(self) -> Dict[str, Any]:
        """Get final aggregated metrics.

        Returns:
            An empty dict before the first step; otherwise the averages
            over the most recent (up to 100) steps plus collision totals.
        """
        if not self.metrics_history:
            return {}

        # Average over last 100 steps for stability (a shorter history is
        # used whole; slicing already handles that)
        recent = self.metrics_history[-100:]

        return {
            "avg_separation": sum(m["avg_separation"] for m in recent) / len(recent),
            "alignment_score": sum(m["alignment_score"] for m in recent) / len(recent),
            "cohesion_score": sum(m["cohesion_score"] for m in recent) / len(recent),
            "total_collisions": self.collision_count,
            "collision_rate": (
                self.collision_count / self.step_count if self.step_count > 0 else 0
            ),
            "steps_completed": self.step_count,
        }

    def get_boid_positions(self) -> List[Tuple[float, float]]:
        """Get current positions of all boids for rendering."""
        return [(b.position.x, b.position.y) for b in self.boids]

    def get_boid_velocities(self) -> List[Tuple[float, float]]:
        """Get current velocities of all boids for rendering."""
        return [(b.velocity.x, b.velocity.y) for b in self.boids]
candidates based on past performance\n", " llm_dynamic_selection=None, # e.g. \"ucb1\"\n", + " # set embedding model\n", + " embedding_model=embedding_model_name,\n", ")\n", "\n", "db_config = DatabaseConfig(\n", @@ -286,11 +299,13 @@ " enforce_island_separation=True,\n", " parent_selection_strategy=\"weighted\",\n", " parent_selection_lambda=10.0,\n", + " \n", ")\n", "\n", "job_config = LocalJobConfig(eval_program_path=\"evaluate.py\")\n", "\n", "print(\"llm_models:\", llm_models)\n", + "print(\"embedding_model:\", embedding_model_name)\n", "print(\"results_dir:\", evo_config.results_dir)" ] }, diff --git a/pyproject.toml b/pyproject.toml index e3ec455af..5802a1522 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,17 +45,20 @@ dependencies = [ "adjustText", "markdown", "aiofiles", + "google-generativeai", ] [tool.setuptools] -packages = ["shinka"] script-files = ["shinka/shinka_launch", "shinka/shinka_visualize"] +[tool.setuptools.packages.find] +include = ["shinka", "shinka.*"] + [tool.setuptools.package-data] "*" = ["*"] -[tool.uv] -dev-dependencies = [ +[dependency-groups] +dev = [ "pytest>=6.0", "black", "isort", diff --git a/shinka/core/embedding_corpus.py b/shinka/core/embedding_corpus.py new file mode 100644 index 000000000..943ef1908 --- /dev/null +++ b/shinka/core/embedding_corpus.py @@ -0,0 +1,220 @@ +import fnmatch +import hashlib +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import Iterable, List, Optional, Sequence, Set + + +@dataclass +class EmbeddingCorpus: + """Result of building an embedding corpus for a generation directory.""" + + text: str + included_files: List[str] = field(default_factory=list) + skipped_files: List[str] = field(default_factory=list) + binary_files: List[str] = field(default_factory=list) + truncated: bool = False + total_bytes: int = 0 + + +def extract_file_content(corpus_text: str, filename: str) -> Optional[str]: + """ + Extract the content of a specific file from a 
corpus text dump. + Returns None if the file is not found or the corpus format is invalid. + """ + if not corpus_text: + return None + + # Regex to find the file header and capture content until the next header or end of string + # Header format: === FILE: {filename} ({size} bytes)[TRUNCATED?] === + escaped_filename = re.escape(filename) + # Look for header at start of string or after a newline + pattern = rf"(?:^|\n)=== FILE: {escaped_filename} \(\d+ bytes\)(?: \[TRUNCATED\])? ===\n(.*?)(?=\n=== FILE: |$)" + + match = re.search(pattern, corpus_text, re.DOTALL) + if match: + return match.group(1) + + return None + + +def _is_text_bytes(buf: bytes) -> bool: + """Heuristic: treat content as binary if it contains null bytes.""" + if not buf: + return True + return b"\x00" not in buf + + +def _sha256_prefix(buf: bytes, length: int = 8) -> str: + return hashlib.sha256(buf).hexdigest()[:length] + + +def _matches_any(patterns: Sequence[str], path: str) -> bool: + if not patterns: + return False + p_obj = Path(path) + for pat in patterns: + if pat in ("**", "**/*"): + return True + if fnmatch.fnmatch(path, pat): + return True + try: + if p_obj.match(pat): + return True + except Exception: + continue + return False + + +def build_embedding_corpus( + root: Path, + *, + include_globs: Sequence[str], + exclude_globs: Sequence[str], + max_files: int, + max_total_bytes: int, + max_bytes_per_file: int, + changed_first: Optional[Iterable[Path]] = None, + exclude_dirs: Optional[Set[str]] = None, + exclude_suffixes: Optional[Set[str]] = None, + exclude_files: Optional[Set[str]] = None, +) -> EmbeddingCorpus: + """ + Build a deterministic, artifact-agnostic corpus from a generation directory. + + Text files contribute their (possibly truncated) content. Binary files and + over-limit files contribute small placeholders (path, size, hash) so changes + are still visible to novelty checks without embedding raw bytes. 
+ """ + + root = root.resolve() + exclude_dirs = exclude_dirs or set() + exclude_suffixes = exclude_suffixes or set() + exclude_files = exclude_files or set() + + def should_skip(rel: Path) -> bool: + if rel.name in exclude_files: + return True + if rel.suffix in exclude_suffixes: + return True + if rel.parts and rel.parts[0] in exclude_dirs: + return True + rel_posix = rel.as_posix() + if exclude_globs and _matches_any(exclude_globs, rel_posix): + return True + if include_globs and not _matches_any(include_globs, rel_posix): + return True + return False + + seen: Set[Path] = set() + ordered_candidates: List[Path] = [] + + # Prioritize explicitly changed files (if provided) + if changed_first: + for p in changed_first: + abs_path = (root / p).resolve() if not p.is_absolute() else p + if abs_path.is_file() and abs_path.is_relative_to(root): + rel = abs_path.relative_to(root) + if rel not in seen and not should_skip(rel): + seen.add(rel) + ordered_candidates.append(rel) + + # Discover remaining files + for path in sorted(root.rglob("*")): + if not path.is_file(): + continue + try: + rel = path.relative_to(root) + except ValueError: + continue + if rel in seen: + continue + if should_skip(rel): + continue + seen.add(rel) + ordered_candidates.append(rel) + + segments: List[str] = [] + included_files: List[str] = [] + skipped_files: List[str] = [] + binary_files: List[str] = [] + truncated = False + total_bytes = 0 + + for rel in ordered_candidates: + if len(included_files) >= max_files: + truncated = True + skipped_files.extend( + [r.as_posix() for r in ordered_candidates[len(included_files) :]] + ) + break + + abs_path = root / rel + try: + raw = abs_path.read_bytes() + except Exception: + skipped_files.append(rel.as_posix()) + continue + + size = len(raw) + to_embed = raw[:max_bytes_per_file] + file_truncated = size > max_bytes_per_file + + if total_bytes >= max_total_bytes: + truncated = True + skipped_files.append(rel.as_posix()) + continue + + is_text = 
_is_text_bytes(to_embed) + rel_posix = rel.as_posix() + + if is_text: + try: + text = to_embed.decode("utf-8", errors="replace") + except Exception: + is_text = False + + if not is_text: + placeholder = ( + f"[BINARY FILE] {rel_posix} size={size} sha256={_sha256_prefix(raw)}" + ) + addition = placeholder + "\n" + if total_bytes + len(addition) > max_total_bytes: + truncated = True + skipped_files.append(rel_posix) + continue + segments.append(placeholder) + included_files.append(rel_posix) + binary_files.append(rel_posix) + total_bytes += len(addition) + continue + + # Text path header for clarity/determinism + header = f"=== FILE: {rel_posix} ({size} bytes){' [TRUNCATED]' if file_truncated else ''} ===\n" + addition_len = len(header) + len(text) + 1 # trailing newline + if total_bytes + addition_len > max_total_bytes: + # Try to fit partial content + remaining = max_total_bytes - total_bytes - len(header) - 1 + if remaining <= 0: + truncated = True + skipped_files.append(rel_posix) + continue + text = text[:remaining] + addition_len = len(header) + len(text) + 1 + truncated = True + + segments.append(header + text + "\n") + included_files.append(rel_posix) + total_bytes += addition_len + + corpus_text = "".join(segments) + + return EmbeddingCorpus( + text=corpus_text, + included_files=included_files, + skipped_files=skipped_files, + binary_files=binary_files, + truncated=truncated, + total_bytes=total_bytes, + ) diff --git a/shinka/core/novelty_judge.py b/shinka/core/novelty_judge.py index 9fe0e0d00..eebdc5ab1 100644 --- a/shinka/core/novelty_judge.py +++ b/shinka/core/novelty_judge.py @@ -1,15 +1,24 @@ -from typing import Optional, Tuple, List import logging from pathlib import Path +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple + from shinka.database import Program from shinka.llm import LLMClient from shinka.prompts import NOVELTY_SYSTEM_MSG, NOVELTY_USER_MSG +# Type for agent runner function (used in agentic mode) +AgentRunner = 
Callable[..., Iterator[Dict[str, Any]]] + logger = logging.getLogger(__name__) class NoveltyJudge: - """Handles novelty assessment for generated code using LLM-based comparison.""" + """Handles novelty assessment for generated code using LLM-based comparison. + + Supports optional agentic mode where LLM novelty checks can be performed + via CLI agents (Codex, ShinkaAgent). When agentic mode is disabled or + agent_runner is not provided, falls back to legacy LLMClient-based checks. + """ def __init__( self, @@ -17,11 +26,26 @@ def __init__( language: str = "python", similarity_threshold: float = 1.0, max_novelty_attempts: int = 3, + # Agentic mode parameters (optional, graceful fallback to legacy) + agentic_mode: bool = False, + agent_runner: Optional[AgentRunner] = None, + agent_config: Optional[Any] = None, ): self.novelty_llm_client = novelty_llm_client self.language = language self.similarity_threshold = similarity_threshold self.max_novelty_attempts = max_novelty_attempts + # Store agentic config for future use (not implemented in minimal PR) + self.agentic_mode = agentic_mode + self.agent_runner = agent_runner + self.agent_config = agent_config + + # Log if agentic mode requested but no runner provided + if agentic_mode and agent_runner is None: + logger.warning( + "Agentic mode enabled but no agent_runner provided. " + "Falling back to legacy LLMClient-based novelty checks." 
+ ) def should_check_novelty( self, diff --git a/shinka/core/runner.py b/shinka/core/runner.py index 3c818742c..f4fc4adcc 100644 --- a/shinka/core/runner.py +++ b/shinka/core/runner.py @@ -1,39 +1,136 @@ +import difflib +import hashlib +import json +import logging import shutil -import uuid import time -import logging -import yaml -from rich.logging import RichHandler -from rich.table import Table -from rich.console import Console -import rich.box -from typing import List, Optional, Union, cast +import uuid +from concurrent.futures import Future, ThreadPoolExecutor +from dataclasses import asdict, dataclass, field, is_dataclass, replace from datetime import datetime from pathlib import Path -from dataclasses import dataclass, field, asdict from subprocess import Popen -from shinka.launch import JobScheduler, JobConfig, ProcessWithLogging -from shinka.database import ProgramDatabase, DatabaseConfig, Program -from shinka.llm import ( - LLMClient, - extract_between, - EmbeddingClient, - BanditBase, - AsymmetricUCB, +from typing import Any, Dict, List, Literal, Optional, Union, cast + +import rich.box +import yaml +from rich.console import Console +from rich.logging import RichHandler +from rich.table import Table + +from shinka.core.embedding_corpus import ( + EmbeddingCorpus, + build_embedding_corpus, + extract_file_content, ) +from shinka.core.novelty_judge import NoveltyJudge +from shinka.core.sampler import PromptSampler +from shinka.core.summarizer import MetaSummarizer +from shinka.database import DatabaseConfig, Program, ProgramDatabase from shinka.edit import ( + AgentContext, + AgenticEditor, + CommandResult, apply_diff_patch, apply_full_patch, - summarize_diff, redact_immutable, + summarize_diff, +) +from shinka.edit.codex_cli import ( + CodexAuthError, + CodexExecutionError, + CodexUnavailableError, + ensure_codex_available, + run_codex_task, + validate_codex_setup, +) +from shinka.edit.shinka_agent import ( + ShinkaExecutionError, + ShinkaUnavailableError, 
+ ensure_shinka_available, + run_shinka_task, +) +from shinka.launch import JobConfig, JobScheduler, ProcessWithLogging +from shinka.llm import ( + AsymmetricUCB, + BanditBase, + EmbeddingClient, + LLMClient, + extract_between, ) -from shinka.core.sampler import PromptSampler -from shinka.core.summarizer import MetaSummarizer -from shinka.core.novelty_judge import NoveltyJudge from shinka.logo import print_gradient_logo +from shinka.eval.agentic import AgenticEvaluator, AgenticEvaluatorResult FOLDER_PREFIX = "gen" +# Number of session events to include in agentic evaluator metadata +AGENTIC_EVAL_PREVIEW_LIMIT = 50 + +# Directories to exclude when copying workspace files for agentic edits +WORKSPACE_EXCLUDE_DIRS = { + "results", + "workspace_snapshot", + "agent_sessions", + ".hydra", + "__pycache__", +} +WORKSPACE_EXCLUDE_SUFFIXES = {".pyc", ".pyo"} +WORKSPACE_EXCLUDE_FILES = { + "rewrite.txt", + "edit.diff", + "session_log.jsonl", +} + + +@dataclass +class AgenticConfig: + """Configuration options for agentic editing sessions. + + This config supports Codex CLI and ShinkaAgent backends. + The `backend` field selects which one to use. + """ + + backend: str = "shinka" # "shinka" or "codex" + cli_profile: Optional[str] = None + sandbox: str = "workspace-write" + approval_mode: str = "full-auto" + max_turns: int = 50 + max_events: int = 240 # Event limit for Codex CLI streaming (3x default) + max_seconds: int = 0 + cli_path: Optional[str] = None + extra_cli_config: Dict[str, Any] = field(default_factory=dict) + resume_parent_session: bool = False + # Base directory for scratch workspaces. Using /tmp ensures scratch dirs are + # outside any git repo, preventing CLI from discovering parent AGENTS.md files. + scratch_dir_base: Optional[str] = "/tmp/shinka_scratch" + + +@dataclass +class AgenticEvaluatorConfig: + """Configuration for agentic evaluation sessions. + + The evaluator can use a different backend than the editor. 
+ If backend is None, inherits from AgenticConfig.backend. + """ + + backend: Optional[str] = None # If None, use agentic.backend + cli_profile: Optional[str] = None + sandbox: str = "workspace-write" + approval_mode: str = "full-auto" + max_events: int = 240 # Event limit for Codex CLI streaming (3x default) + max_seconds: int = 0 + cli_path: Optional[str] = None + extra_cli_config: Dict[str, Any] = field(default_factory=dict) + eval_prompt: Optional[str] = None # Custom evaluation criteria for LLM judge + + +@dataclass +class EvaluatorConfig: + """Evaluator selection configuration.""" + + mode: Literal["auto", "legacy", "agentic"] = "auto" + agentic: AgenticEvaluatorConfig = field(default_factory=AgenticEvaluatorConfig) + @dataclass class EvolutionConfig: @@ -55,6 +152,13 @@ class EvolutionConfig: meta_llm_kwargs: dict = field(default_factory=lambda: {}) meta_max_recommendations: int = 5 embedding_model: Optional[str] = None + # Multi-file embedding configuration + embedding_use_corpus: bool = False # Use multi-file corpus instead of single file + embedding_include_globs: List[str] = field(default_factory=lambda: ["**/*.py"]) + embedding_exclude_globs: List[str] = field(default_factory=list) + embedding_max_files: int = 20 + embedding_max_bytes_per_file: int = 50000 + embedding_max_total_bytes: int = 200000 init_program_path: Optional[str] = "initial.py" results_dir: Optional[str] = None max_novelty_attempts: int = 3 @@ -62,6 +166,14 @@ class EvolutionConfig: novelty_llm_models: Optional[List[str]] = None novelty_llm_kwargs: dict = field(default_factory=lambda: {}) use_text_feedback: bool = False + # Agentic editing configuration + agentic_mode: bool = False + agentic: AgenticConfig = field(default_factory=AgenticConfig) + evaluator: EvaluatorConfig = field(default_factory=EvaluatorConfig) + # Maximum possible score for evaluation (used by agentic evaluator prompts) + max_score: float = 100.0 + # Multi-file support: directory containing additional files to copy + 
init_support_dir: Optional[str] = None @dataclass @@ -71,6 +183,7 @@ class RunningJob: job_id: Union[str, Popen, ProcessWithLogging] exec_fname: str results_dir: str + generation_dir: Path start_time: float generation: int parent_id: Optional[str] @@ -81,6 +194,13 @@ class RunningJob: code_embedding: List[float] = field(default_factory=list) embed_cost: float = 0.0 novelty_cost: float = 0.0 + # For multi-file embedding corpus + corpus_text: str = "" + corpus_meta: dict = field(default_factory=dict) + # For agentic evaluator results (pre-computed when agentic mode) + agentic_result: Optional[tuple] = None + # For async agentic evaluation (Future object) + agentic_future: Optional[Future] = None # Set up logging @@ -135,6 +255,17 @@ def __init__( logger.info(f"Log file: {log_filename}") logger.info("=" * 80) + # Validate agentic backend setup early (fail fast, not mid-evolution) + if evo_config.agentic_mode: + if evo_config.agentic.backend == "codex": + logger.info("Validating Codex backend setup...") + validate_codex_setup(evo_config.agentic.cli_path) + logger.info("Codex backend validated successfully") + else: + logger.info("Validating ShinkaAgent backend setup...") + ensure_shinka_available() + logger.info("ShinkaAgent backend validated successfully") + # Check if we are resuming a run resuming_run = False db_path = Path(f"{self.results_dir}/{db_config.db_path}") @@ -158,13 +289,48 @@ def __init__( # Initialize database and scheduler db_config.db_path = str(db_path) - self.db = ProgramDatabase(config=db_config) + embedding_model_to_use = evo_config.embedding_model or "text-embedding-3-small" + self.db = ProgramDatabase( + config=db_config, embedding_model=embedding_model_to_use + ) self.scheduler = JobScheduler( job_type=evo_config.job_type, config=job_config, # type: ignore verbose=verbose, ) + # Initialize agentic evaluator if enabled + self.evaluator_mode = self._resolve_evaluator_mode() + if self.evaluator_mode == "agentic": + # Use evaluator-specific backend 
if set, else fall back to agentic backend + eval_backend = ( + self.evo_config.evaluator.agentic.backend + or self.evo_config.agentic.backend + ) + if eval_backend == "shinka": + runner_fn = run_shinka_task + else: + runner_fn = run_codex_task + self.agentic_evaluator: Optional[AgenticEvaluator] = AgenticEvaluator( + self.evo_config.evaluator.agentic, + agent_runner=runner_fn, + ) + if self.verbose: + logger.info(f"Agentic evaluator using backend: {eval_backend}") + else: + self.agentic_evaluator = None + self.agentic_eval_sessions_dir = ( + Path(self.results_dir) / "agentic_eval_sessions" + ) + # Thread pool for parallel job execution (uses max_parallel_jobs workers) + # Enabled when agentic editing mode is on (works with both legacy and agentic eval) + self._eval_executor: Optional[ThreadPoolExecutor] = None + if evo_config.agentic_mode: + max_workers = evo_config.max_parallel_jobs or 6 + self._eval_executor = ThreadPoolExecutor(max_workers=max_workers) + if self.verbose: + logger.info(f"Parallel agentic editing enabled with {max_workers} workers") + self.llm = LLMClient( model_names=evo_config.llm_models, model_selection=self.llm_selection, @@ -204,6 +370,7 @@ def __init__( patch_types=evo_config.patch_types, patch_type_probs=evo_config.patch_type_probs, use_text_feedback=evo_config.use_text_feedback, + agentic_mode=evo_config.agentic_mode, ) # Initialize MetaSummarizer for meta-recommendations @@ -215,11 +382,16 @@ def __init__( ) # Initialize NoveltyJudge for novelty assessment + # Pass agentic config for potential future use, with graceful fallback self.novelty_judge = NoveltyJudge( novelty_llm_client=self.novelty_llm, language=evo_config.language, similarity_threshold=evo_config.code_embed_sim_threshold, max_novelty_attempts=evo_config.max_novelty_attempts, + # Agentic novelty (falls back to legacy if agent_runner not set) + agentic_mode=evo_config.agentic_mode, + agent_runner=None, # Not implemented in minimal PR + agent_config=evo_config.agentic if 
evo_config.agentic_mode else None, ) # Initialize rich console for formatted output @@ -231,6 +403,12 @@ def __init__( self.lang_ext = "cpp" elif self.evo_config.language == "python": self.lang_ext = "py" + elif self.evo_config.language == "rust": + self.lang_ext = "rs" + elif self.evo_config.language == "swift": + self.lang_ext = "swift" + elif self.evo_config.language in ["json", "json5"]: + self.lang_ext = "json" else: msg = f"Language {self.evo_config.language} not supported" raise ValueError(msg) @@ -333,11 +511,17 @@ def run(self): break # Submit new jobs to fill the queue (only if we have capacity) - if ( + while ( len(self.running_jobs) < max_jobs and self.next_generation_to_submit < target_gens ): - self._submit_new_job() + if self.evo_config.agentic_mode: + # Full parallelism: parent sampling in main thread (thread-safe), + # edit + eval in worker threads (works with both legacy and agentic eval) + self._submit_agentic_job_async() + else: + self._submit_new_job() + break # Legacy editing mode submits one job at a time # Wait a bit before checking again time.sleep(2) @@ -358,6 +542,13 @@ def run(self): logger.info(f"Evolution run ended at {end_time}") logger.info("=" * 80) + # Cleanup thread pool executors + if self._eval_executor is not None: + self._eval_executor.shutdown(wait=False) + self._eval_executor = None + if hasattr(self, 'scheduler') and self.scheduler is not None: + self.scheduler.shutdown() + def generate_initial_program(self): """Generate initial program with LLM, with retries.""" llm_kwargs = self.llm.get_kwargs() @@ -459,6 +650,29 @@ def _run_generation_0(self): patch_description = "Initial program from file." 
patch_type = "init" + # Multi-file support: copy additional support files into generation 0 directory + if self.evo_config.init_support_dir: + support_dir = Path(self.evo_config.init_support_dir) + if support_dir.is_dir(): + for path in support_dir.rglob("*"): + rel = path.relative_to(support_dir) + # Skip excluded dirs/files + if any(part in WORKSPACE_EXCLUDE_DIRS for part in rel.parts): + continue + if path.is_dir(): + continue + if path.suffix in WORKSPACE_EXCLUDE_SUFFIXES: + continue + if path.name in WORKSPACE_EXCLUDE_FILES: + continue + target = Path(initial_dir) / rel + target.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(path, target) + else: + logger.warning( + f"init_support_dir provided but not a directory: {support_dir}" + ) + if self.evo_config.init_program_path: if self.verbose: logger.info( @@ -481,7 +695,16 @@ def _run_generation_0(self): logger.info(f"Initial program generated and saved to {exec_fname}") # Run the evaluation synchronously - results, rtime = self.scheduler.run(exec_fname, results_dir) + if self.evaluator_mode == "agentic": + results, rtime = self._run_agentic_evaluation( + exec_fname=exec_fname, + results_dir=results_dir, + generation_dir=Path(initial_dir), + generation=0, + parent_id=None, + ) + else: + results, rtime = self.scheduler.run(exec_fname, results_dir) code_embedding, e_cost = self.get_code_embedding(exec_fname) @@ -615,10 +838,9 @@ def _submit_new_job(self): self.next_generation_to_submit += 1 - exec_fname = ( - f"{self.results_dir}/{FOLDER_PREFIX}_{current_gen}/main.{self.lang_ext}" - ) - results_dir = f"{self.results_dir}/{FOLDER_PREFIX}_{current_gen}/results" + generation_dir = Path(self.results_dir) / f"{FOLDER_PREFIX}_{current_gen}" + exec_fname = str(generation_dir / f"main.{self.lang_ext}") + results_dir = str(generation_dir / "results") Path(results_dir).mkdir(parents=True, exist_ok=True) # Get current meta-recommendations for this job @@ -725,48 +947,266 @@ def _submit_new_job(self): 
meta_patch_data["novelty_cost"] = novelty_cost meta_patch_data["novelty_explanation"] = novelty_explanation - # Submit the job asynchronously - job_id = self.scheduler.submit_async(exec_fname, results_dir) + # Submit the job (agentic uses async thread pool, legacy uses async scheduler) + if self.evaluator_mode == "agentic": + # Submit agentic evaluation to thread pool for parallel execution + future = self._eval_executor.submit( + self._run_agentic_evaluation, + exec_fname=exec_fname, + results_dir=results_dir, + generation_dir=generation_dir, + generation=current_gen, + parent_id=parent_id, + ) + # Create job with future for async completion checking + running_job = RunningJob( + job_id=f"agentic_gen_{current_gen}", + exec_fname=exec_fname, + results_dir=results_dir, + generation_dir=generation_dir, + start_time=time.time(), + generation=current_gen, + parent_id=parent_id, + archive_insp_ids=archive_insp_ids, + top_k_insp_ids=top_k_insp_ids, + code_diff=code_diff, + meta_patch_data=meta_patch_data, + code_embedding=code_embedding, + embed_cost=embed_cost, + novelty_cost=novelty_cost, + agentic_future=future, # Store future for completion checking + ) + self.running_jobs.append(running_job) + else: + job_id = self.scheduler.submit_async(exec_fname, results_dir) + # Add to running jobs queue + running_job = RunningJob( + job_id=job_id, + exec_fname=exec_fname, + results_dir=results_dir, + generation_dir=generation_dir, + start_time=time.time(), + generation=current_gen, + parent_id=parent_id, + archive_insp_ids=archive_insp_ids, + top_k_insp_ids=top_k_insp_ids, + code_diff=code_diff, + meta_patch_data=meta_patch_data, + code_embedding=code_embedding, + embed_cost=embed_cost, + novelty_cost=novelty_cost, + ) + self.running_jobs.append(running_job) + + if self.verbose: + logger.info( + f"Submitted job for generation {current_gen}, " + f"queue size: {len(self.running_jobs)}" + ) + + def _submit_agentic_job_async(self): + """Submit an agentic job asynchronously 
(non-blocking). + + This method samples the parent in the main thread (thread-safe DB access), + then submits the edit + eval to the thread pool for parallel execution. + """ + current_gen = self.next_generation_to_submit + + if current_gen >= self.evo_config.num_generations: + return + + self.next_generation_to_submit += 1 + + generation_dir = Path(self.results_dir) / f"{FOLDER_PREFIX}_{current_gen}" + exec_fname = str(generation_dir / f"main.{self.lang_ext}") + results_dir = str(generation_dir / "results") + + # Sample parent in main thread (DB access is NOT thread-safe) + parent_program, archive_programs, top_k_programs = self.db.sample( + target_generation=current_gen, + novelty_attempt=1, + max_novelty_attempts=self.evo_config.max_novelty_attempts, + resample_attempt=1, + max_resample_attempts=self.evo_config.max_patch_resamples, + ) + parent_id = parent_program.id + archive_insp_ids = [p.id for p in archive_programs] + top_k_insp_ids = [p.id for p in top_k_programs] - # Add to running jobs queue + # Get meta-recommendations in main thread + meta_recs, meta_summary, meta_scratch = self.meta_summarizer.get_current() + + # Submit the edit + eval to thread pool (no DB access in worker) + future = self._eval_executor.submit( + self._run_full_agentic_job, + current_gen=current_gen, + generation_dir=generation_dir, + exec_fname=exec_fname, + results_dir=results_dir, + parent_program=parent_program, + archive_programs=archive_programs, + top_k_programs=top_k_programs, + meta_recs=meta_recs, + meta_summary=meta_summary, + meta_scratch=meta_scratch, + ) + + # Create job with known parent info running_job = RunningJob( - job_id=job_id, + job_id=f"agentic_async_gen_{current_gen}", exec_fname=exec_fname, results_dir=results_dir, + generation_dir=generation_dir, start_time=time.time(), generation=current_gen, parent_id=parent_id, archive_insp_ids=archive_insp_ids, top_k_insp_ids=top_k_insp_ids, - code_diff=code_diff, - meta_patch_data=meta_patch_data, - 
code_embedding=code_embedding, - embed_cost=embed_cost, - novelty_cost=novelty_cost, + code_diff=None, + meta_patch_data={}, + agentic_future=future, ) self.running_jobs.append(running_job) if self.verbose: logger.info( - f"Submitted job for generation {current_gen}, " + f"Submitted async agentic job for gen {current_gen}, " f"queue size: {len(self.running_jobs)}" ) + def _run_full_agentic_job( + self, + current_gen: int, + generation_dir: Path, + exec_fname: str, + results_dir: str, + parent_program: "Program", + archive_programs: List["Program"], + top_k_programs: List["Program"], + meta_recs: Optional[str], + meta_summary: Optional[str], + meta_scratch: Optional[str], + ) -> tuple: + """Run the full agentic job (edit + eval) in a thread. + + NOTE: This runs in a worker thread. It must NOT access self.db directly + because SQLite connections are not thread-safe. All parent/inspiration + data is passed in from the main thread. + + Returns tuple of (results, rtime, job_metadata). + """ + Path(results_dir).mkdir(parents=True, exist_ok=True) + + parent_id = parent_program.id + archive_insp_ids = [p.id for p in archive_programs] + top_k_insp_ids = [p.id for p in top_k_programs] + + # Run the edit (patch generation) + code_diff, meta_patch_data, num_applied = self.run_patch( + parent_program, + archive_programs, + top_k_programs, + current_gen, + novelty_attempt=1, + resample_attempt=1, + ) + + # Get code embedding (thread-safe - uses HTTP calls) + code_embedding, embed_cost = self.get_code_embedding(exec_fname) + + # Add meta info + if meta_recs is not None: + meta_patch_data["meta_recommendations"] = meta_recs + meta_patch_data["meta_summary"] = meta_summary + meta_patch_data["meta_scratch_pad"] = meta_scratch + + # Run evaluation (legacy or agentic based on evaluator_mode) + if self.evaluator_mode == "legacy": + results, rtime = self._run_legacy_evaluation_sync( + exec_fname=exec_fname, + results_dir=results_dir, + ) + else: + results, rtime = 
self._run_agentic_evaluation( + exec_fname=exec_fname, + results_dir=results_dir, + generation_dir=generation_dir, + generation=current_gen, + parent_id=parent_id, + ) + + # Return all data needed to process the job + # Note: novelty_cost is 0 because we skip novelty checks in parallel mode + # (novelty checks require DB access which is not thread-safe) + job_metadata = { + "parent_id": parent_id, + "archive_insp_ids": archive_insp_ids, + "top_k_insp_ids": top_k_insp_ids, + "code_diff": code_diff, + "meta_patch_data": meta_patch_data, + "code_embedding": code_embedding, + "embed_cost": embed_cost, + "novelty_cost": 0.0, + } + + return (results, rtime, job_metadata) + def _check_completed_jobs(self) -> List[RunningJob]: """Check for completed jobs and return them.""" completed = [] still_running = [] for job in self.running_jobs: - is_running = self.scheduler.check_job_status(job) - if not is_running: - # Job completed + # Agentic jobs with pre-computed results are already complete + if job.agentic_result is not None: if self.verbose: - logger.info(f"Job {job.job_id} completed!") + logger.info(f"Agentic job for gen {job.generation} completed!") completed.append(job) + # Agentic jobs with futures - check if future is done + elif job.agentic_future is not None: + if job.agentic_future.done(): + # Future completed - get results and store them + try: + future_result = job.agentic_future.result() + # Handle both 2-tuple (results, rtime) and 3-tuple (results, rtime, metadata) + if len(future_result) == 3: + results, rtime, job_metadata = future_result + # Update job with metadata from async execution + job.parent_id = job_metadata.get("parent_id") + job.archive_insp_ids = job_metadata.get("archive_insp_ids", []) + job.top_k_insp_ids = job_metadata.get("top_k_insp_ids", []) + job.code_diff = job_metadata.get("code_diff") + job.meta_patch_data = job_metadata.get("meta_patch_data", {}) + job.code_embedding = job_metadata.get("code_embedding", []) + job.embed_cost = 
job_metadata.get("embed_cost", 0.0) + job.novelty_cost = job_metadata.get("novelty_cost", 0.0) + else: + results, rtime = future_result + job.agentic_result = (results, rtime) + if self.verbose: + logger.info(f"Agentic job for gen {job.generation} completed (async)!") + completed.append(job) + except Exception as e: + # Evaluation failed - create error result + logger.error(f"Agentic evaluation for gen {job.generation} failed: {e}") + job.agentic_result = ( + {"correct": {"correct": False}, "metrics": {"error": str(e)}}, + time.time() - job.start_time, + ) + completed.append(job) + else: + # Future still running + still_running.append(job) else: - # Job still running - still_running.append(job) + is_running = self.scheduler.check_job_status(job) + if not is_running: + # Job completed + if self.verbose: + logger.info(f"Job {job.job_id} completed!") + completed.append(job) + else: + # Job still running + still_running.append(job) self.running_jobs = still_running return completed @@ -774,10 +1214,13 @@ def _check_completed_jobs(self) -> List[RunningJob]: def _process_completed_job(self, job: RunningJob): """Process a completed job and add results to database.""" end_time = time.time() - rtime = end_time - job.start_time - # Get job results - results = self.scheduler.get_job_results(job.job_id, job.results_dir) + # Get job results (agentic has pre-computed results, legacy uses scheduler) + if job.agentic_result is not None: + results, rtime = job.agentic_result + else: + rtime = end_time - job.start_time + results = self.scheduler.get_job_results(job.job_id, job.results_dir) # Read the evaluated code try: @@ -972,6 +1415,18 @@ def run_patch( meta_recommendations=meta_recs, ) + # Route to agentic patch if enabled + if self.evo_config.agentic_mode: + return self._run_agentic_patch( + parent_program=parent_program, + generation=generation, + patch_sys=patch_sys, + patch_msg=patch_msg, + patch_type=patch_type, + novelty_attempt=novelty_attempt, + 
resample_attempt=resample_attempt, + ) + if patch_type in ["full", "cross"]: apply_patch = apply_full_patch elif patch_type == "diff": @@ -982,6 +1437,25 @@ def run_patch( else: raise ValueError(f"Invalid patch type: {patch_type}") + # Multi-file support (legacy patch path): ensure helper files are present. + # Agentic mode hydrates the workspace explicitly; for legacy patches we + # hydrate from the parent generation directory so multi-file tasks can run. + generation_dir = Path(self.results_dir) / f"{FOLDER_PREFIX}_{generation}" + if generation_dir.is_dir(): + # Clear any stale workspace files from earlier patch attempts/resamples. + # Keep evaluation artifacts directories (e.g., results/) intact. + for child in generation_dir.iterdir(): + if child.name in WORKSPACE_EXCLUDE_DIRS: + continue + try: + if child.is_dir(): + shutil.rmtree(child) + else: + child.unlink() + except OSError: + continue + self._hydrate_generation_directory(parent_program, generation_dir) + total_costs = 0 msg_history = [] llm_kwargs = self.llm.get_kwargs() @@ -1096,9 +1570,10 @@ def run_patch( # error_attempt is already set from apply_patch or default pass - # Only consider the diff summary for the original.py file!!! - if "original.py" in diff_summary: - diff_summary = diff_summary["original.py"] + # Only consider the diff summary for the original source file + original_filename = f"original.{self.lang_ext}" + if original_filename in diff_summary: + diff_summary = diff_summary[original_filename] meta_edit_data = { "patch_type": patch_type, @@ -1119,43 +1594,81 @@ def run_patch( # Delete generation from meta_edit_data return code_diff, meta_edit_data, num_applied_attempt - def get_code_embedding(self, exec_fname: str) -> tuple[List[float], float]: - """Get the embedding of the code.""" - # Read the evaluated code + def get_code_embedding( + self, + exec_fname: str, + changed_files: Optional[List[Path]] = None, + ) -> tuple[List[float], float]: + """Get the embedding of the code. 
+ + Args: + exec_fname: Path to the main executable file. + changed_files: Optional list of files that were changed (for multi-file + corpus mode, these will be prioritized in the embedding). + + Returns: + Tuple of (embedding vector, API cost). + """ + if self.embedding is None: + if self.verbose: + logger.debug("=> EMBED: No embedding model configured.") + return [], 0.0 + try: - evaluated_code = Path(exec_fname).read_text(encoding="utf-8") - except Exception as e: - logger.warning(f"Could not read code for job {exec_fname}. Error: {e}") - evaluated_code = "" - if evaluated_code != "": - # Get the embedding of the initial program - try: - if self.embedding is not None: - redacted_code = redact_immutable(evaluated_code, no_state=True) - if self.verbose: - logger.debug( - "=> EMBED: Code length - " - f"Original: {len(evaluated_code)} - " - f"Redacted: {len(redacted_code)}" - ) + # Multi-file corpus mode: build corpus from generation directory + if self.evo_config.embedding_use_corpus: + generation_dir = Path(exec_fname).parent + corpus = build_embedding_corpus( + root=generation_dir, + include_globs=self.evo_config.embedding_include_globs, + exclude_globs=self.evo_config.embedding_exclude_globs, + max_files=self.evo_config.embedding_max_files, + max_total_bytes=self.evo_config.embedding_max_total_bytes, + max_bytes_per_file=self.evo_config.embedding_max_bytes_per_file, + changed_first=changed_files, + exclude_dirs={"__pycache__", ".git", "venv", ".venv"}, + exclude_suffixes={".pyc", ".pyo", ".so", ".dll"}, + ) + text_to_embed = corpus.text - embedding_result, e_cost = self.embedding.get_embedding( - redacted_code + if self.verbose: + logger.debug( + f"=> EMBED: Corpus built - " + f"Files: {len(corpus.included_files)}, " + f"Bytes: {corpus.total_bytes}, " + f"Truncated: {corpus.truncated}" + ) + else: + # Single-file mode: read and redact the main executable + try: + evaluated_code = Path(exec_fname).read_text(encoding="utf-8") + except Exception as e: + 
logger.warning( + f"Could not read code for job {exec_fname}. Error: {e}" + ) + return [], 0.0 + + if not evaluated_code: + return [], 0.0 + + text_to_embed = redact_immutable(evaluated_code, no_state=True) + + if self.verbose: + logger.debug( + "=> EMBED: Code length - " + f"Original: {len(evaluated_code)} - " + f"Redacted: {len(text_to_embed)}" ) - else: - if self.verbose: - logger.debug("=> EMBED: No embedding model configured.") - embedding_result = [] - e_cost = 0.0 - code_embedding = cast(List[float], embedding_result) - except Exception as e: - logger.warning(f"Could not embed code for job {exec_fname}. Error: {e}") - code_embedding = [] - e_cost = 0.0 - else: - code_embedding = [] - e_cost = 0.0 - return code_embedding, e_cost + + if not text_to_embed: + return [], 0.0 + + embedding_result, e_cost = self.embedding.get_embedding(text_to_embed) + return cast(List[float], embedding_result), e_cost + + except Exception as e: + logger.warning(f"Could not embed code for job {exec_fname}. 
Error: {e}") + return [], 0.0 def _print_metadata_table(self, meta_data: dict, generation: int): """Display metadata in a formatted rich table.""" @@ -1286,3 +1799,717 @@ def _restore_meta_memory(self) -> None: ) else: logger.info("No previous meta memory state found - starting fresh") + + def _collect_parent_workspace_files( + self, parent_program: Program + ) -> Dict[Path, str]: + """Collect workspace files from parent program's generation directory.""" + workspace_files: Dict[Path, str] = {} + parent_generation_dir = ( + Path(self.results_dir) / f"{FOLDER_PREFIX}_{parent_program.generation}" + ) + if parent_generation_dir.is_dir(): + for file_path in parent_generation_dir.rglob("*"): + if not file_path.is_file(): + continue + rel_path = file_path.relative_to(parent_generation_dir) + if any(part in WORKSPACE_EXCLUDE_DIRS for part in rel_path.parts): + continue + if file_path.suffix in WORKSPACE_EXCLUDE_SUFFIXES: + continue + if file_path.name in WORKSPACE_EXCLUDE_FILES: + continue + try: + workspace_files[rel_path] = file_path.read_text(encoding="utf-8") + except (UnicodeDecodeError, OSError): + continue + return workspace_files + + parent_metadata = parent_program.metadata or {} + + # Fallback: Check if parent has stored changed files from agentic edit + agent_changed = parent_metadata.get("agent_changed_files") + if agent_changed and isinstance(agent_changed, dict): + for rel_path_str, content in agent_changed.items(): + workspace_files[Path(rel_path_str)] = content + + return workspace_files + + def _hydrate_generation_directory( + self, parent_program: Program, generation_dir: Path + ) -> None: + """Copy workspace files from parent to new generation directory.""" + parent_generation_dir = ( + Path(self.results_dir) / f"{FOLDER_PREFIX}_{parent_program.generation}" + ) + if parent_generation_dir.is_dir(): + for src_path in parent_generation_dir.rglob("*"): + rel_path = src_path.relative_to(parent_generation_dir) + if any(part in WORKSPACE_EXCLUDE_DIRS for part 
in rel_path.parts): + continue + if src_path.is_dir(): + continue + if src_path.suffix in WORKSPACE_EXCLUDE_SUFFIXES: + continue + if src_path.name in WORKSPACE_EXCLUDE_FILES: + continue + dst_path = generation_dir / rel_path + dst_path.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src_path, dst_path) + return + + # Fallback to metadata-stored files + workspace_files = self._collect_parent_workspace_files(parent_program) + for rel_path, content in workspace_files.items(): + target_path = generation_dir / rel_path + target_path.parent.mkdir(parents=True, exist_ok=True) + target_path.write_text(content, encoding="utf-8") + + def _run_agentic_patch( + self, + *, + parent_program: Program, + generation: int, + patch_sys: str, + patch_msg: str, + patch_type: str, + novelty_attempt: int, + resample_attempt: int, + ) -> tuple[Optional[str], dict, int]: + """Execute an agentic editing session via CLI backend (Codex or ShinkaAgent).""" + logger.info(f"_run_agentic_patch: START gen={generation} nov={novelty_attempt} resamp={resample_attempt}") + + primary_filename = Path(f"main.{self.lang_ext}") + + # Extract content from corpus; fallback to raw code if not a corpus + primary_content = extract_file_content( + parent_program.code, str(primary_filename) + ) + if primary_content is None: + if "=== FILE:" not in parent_program.code: + primary_content = parent_program.code + else: + primary_content = extract_file_content(parent_program.code, "main.py") + if primary_content is None: + primary_content = parent_program.code + + base_files: Dict[Path, str] = {primary_filename: primary_content} + base_files.update(self._collect_parent_workspace_files(parent_program)) + + session_root: Optional[Path] = None + parent_metadata = parent_program.metadata or {} + resume_session_id: Optional[str] = None + resumed_from_parent = False + + if self.evo_config.agentic.resume_parent_session: + candidate = parent_metadata.get("agent_session_id") + if isinstance(candidate, str) and 
candidate.strip(): + resume_session_id = candidate.strip() + resumed_from_parent = True + + def _serialize_changed_files( + changed_files: Optional[Dict[Path, str]], + ) -> Dict[str, str]: + if not changed_files: + return {} + serialized: Dict[str, str] = {} + for rel_path, content in changed_files.items(): + if rel_path == primary_filename: + continue + serialized[str(rel_path)] = content + return serialized + + def _build_code_diffs( + changed_files: Optional[Dict[Path, str]], + ) -> List[Dict[str, str]]: + """Build multi-file diffs for frontend display.""" + if not changed_files: + return [] + diffs: List[Dict[str, str]] = [] + for rel_path, new_content in changed_files.items(): + before = base_files.get(rel_path, "") + before_lines = before.splitlines(keepends=True) + after_lines = new_content.splitlines(keepends=True) + diff_text = "".join( + difflib.unified_diff( + before_lines, + after_lines, + fromfile=f"a/{rel_path}", + tofile=f"b/{rel_path}", + ) + ) + diffs.append({"path": str(rel_path), "diff": diff_text}) + return diffs + + def _agent_model_name(backend: str, actual_model: Optional[str] = None) -> str: + """Determine model name with priority: actual > config > profile > fallback.""" + if actual_model: + return actual_model + extra_cli = self.evo_config.agentic.extra_cli_config + if extra_cli: + model_override = ( + extra_cli.get("model") if isinstance(extra_cli, dict) else None + ) + if model_override: + return str(model_override) + if self.evo_config.agentic.cli_profile: + return self.evo_config.agentic.cli_profile + return f"{backend}-default" + + selected_backend = self.evo_config.agentic.backend + + # Bandit model selection (same as legacy path at lines 1150-1153) + bandit_model: Optional[str] = None + if self.llm_selection is not None: + llm_kwargs = self.llm.get_kwargs() + bandit_model = llm_kwargs.get("model_name") + if bandit_model: + self.llm_selection.update_submitted(bandit_model) + + def failure_meta( + message: str, + *, + session_log: 
Optional[List[str]] = None, + commands: Optional[List[CommandResult]] = None, + metrics: Optional[Dict[str, float]] = None, + session_id: Optional[str] = None, + changed_files: Optional[Dict[Path, str]] = None, + ) -> tuple[Optional[str], dict, int]: + api_cost = 0.0 + if metrics: + api_cost = ( + metrics.get("total_cost") + or metrics.get("estimated_total_cost") + or 0.0 + ) + serialized_changed = _serialize_changed_files(changed_files) + meta_edit_data = { + "patch_type": "agentic", + "api_costs": api_cost, + "num_applied": 0, + "patch_name": None, + "patch_description": None, + "error_attempt": message, + "novelty_attempt": novelty_attempt, + "resample_attempt": resample_attempt, + "patch_attempt": 1, + "agent_session_path": str(session_root) if session_root else None, + "agent_session_log": session_log or [], + "agent_commands": [asdict(cmd) for cmd in commands or []], + "agent_metrics": metrics or {}, + "agent_changed_files": serialized_changed, + "agent_code_diffs": _build_code_diffs(changed_files), + "agent_primary_file": str(primary_filename), + # Use bandit-selected model for bandit learning, fall back to backend default + "model_name": bandit_model or _agent_model_name(selected_backend), + "agent_backend": selected_backend, + "agent_session_id": session_id, + "agent_resumed_from_parent": resumed_from_parent, + } + return None, meta_edit_data, 0 + + # Ensure backend is available + try: + if selected_backend == "shinka": + ensure_shinka_available() + else: + ensure_codex_available(self.evo_config.agentic.cli_path) + except (CodexUnavailableError, ShinkaUnavailableError) as exc: + return failure_meta(str(exc)) + + # Create scratch directory + session_uuid = str(uuid.uuid4()) + if self.evo_config.agentic.scratch_dir_base: + scratch_base = Path(self.evo_config.agentic.scratch_dir_base) + scratch_base.mkdir(parents=True, exist_ok=True) + session_root = scratch_base / session_uuid + else: + session_root = Path(self.results_dir) / "agent_sessions" / session_uuid 
+ + session_root.mkdir(parents=True, exist_ok=True) + + # Write session metadata + session_meta = { + "parent_id": parent_program.id, + "generation": generation, + "patch_type": patch_type, + "novelty_attempt": novelty_attempt, + "resample_attempt": resample_attempt, + "start_time": time.time(), + "results_dir": str(self.results_dir), + } + try: + with open(session_root / "session_meta.json", "w") as f: + json.dump(session_meta, f, indent=2) + except Exception as e: + logger.warning(f"Failed to write session_meta.json: {e}") + + # Build context for agent + helper_files = [p for p in base_files.keys() if p != primary_filename] + system_prompt = patch_sys.strip() + if helper_files: + helper_listing = "\n".join( + f"- {path.as_posix()}" for path in sorted(helper_files) + ) + system_prompt += ( + "\n\n# Workspace Files\n" + "The following helper files were copied from the parent program:\n" + f"{helper_listing}" + ) + + context = AgentContext( + user_prompt=patch_msg.strip(), + system_prompt=system_prompt, + language=self.evo_config.language, + base_files=base_files, + primary_file=primary_filename, + metadata={ + "generation": generation, + "novelty_attempt": novelty_attempt, + "resample_attempt": resample_attempt, + "patch_type": patch_type, + "results_dir": str(self.results_dir), + }, + resume_session_id=resume_session_id, + ) + + # Create config with bandit-selected model if available + agentic_config = self.evo_config.agentic + if bandit_model: + # Create modified extra_cli_config with bandit model + modified_extra_cli = dict(agentic_config.extra_cli_config) + modified_extra_cli["model"] = bandit_model + # Create new config with modified extra_cli_config + # Handle both dataclass instances and DictConfig from Hydra CLI overrides + if is_dataclass(agentic_config) and not isinstance(agentic_config, type): + agentic_config = replace( + agentic_config, extra_cli_config=modified_extra_cli + ) + else: + # DictConfig from Hydra - create a mutable copy preserving 
attribute access + from omegaconf import OmegaConf + agentic_config = OmegaConf.create(OmegaConf.to_container(agentic_config, resolve=True)) + agentic_config.extra_cli_config = modified_extra_cli + + editor = AgenticEditor( + scratch_dir=session_root, + config=agentic_config, + runner=run_shinka_task if selected_backend == "shinka" else run_codex_task, + ) + + try: + agent_result = editor.run_session(context) + logger.info(f"_run_agentic_patch: session completed, changed_files={list(agent_result.changed_files.keys())}") + except (CodexExecutionError, ShinkaExecutionError) as exc: + logger.info(f"_run_agentic_patch: session FAILED with {type(exc).__name__}: {exc}") + return failure_meta(str(exc)) + + # Create generation directory + generation_dir = Path(self.results_dir) / f"{FOLDER_PREFIX}_{generation}" + if generation_dir.exists(): + shutil.rmtree(generation_dir) + generation_dir.mkdir(parents=True, exist_ok=True) + self._hydrate_generation_directory(parent_program, generation_dir) + + # Get primary file content from agent result + primary_content = agent_result.changed_files.get( + context.primary_file, base_files[context.primary_file] + ) + original_for_patch = base_files[context.primary_file] + + # Write ALL changed files directly to generation directory + # (Agentic mode: no EVOLVE-BLOCK markers needed) + logger.info( + f"Agentic edit: writing {len(agent_result.changed_files)} changed files " + f"to {generation_dir}" + ) + for rel_path, content in agent_result.changed_files.items(): + target = generation_dir / rel_path + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text(content, encoding="utf-8") + logger.info(f" Wrote: {rel_path} ({len(content)} bytes)") + + # If agent didn't change the primary file, ensure it exists + primary_target = generation_dir / context.primary_file + if not primary_target.exists(): + primary_target.write_text(primary_content, encoding="utf-8") + logger.info(f" Wrote primary (unchanged): {context.primary_file}") + + 
# In agentic mode, we consider the patch applied if any files were written + # (either changed files or the primary file was created) + num_applied = 1 if agent_result.changed_files or primary_target.exists() else 0 + logger.info(f"Agentic edit: num_applied={num_applied}") + + # Build code diff for display + original_lines = original_for_patch.splitlines(keepends=True) + new_lines = primary_content.splitlines(keepends=True) + code_diff = "".join( + difflib.unified_diff( + original_lines, + new_lines, + fromfile="a/main." + self.lang_ext, + tofile="b/main." + self.lang_ext, + ) + ) + + api_cost = 0.0 + if agent_result.metrics: + api_cost = ( + agent_result.metrics.get("total_cost") + or agent_result.metrics.get("estimated_total_cost") + or 0.0 + ) + + serialized_changed = _serialize_changed_files(agent_result.changed_files) + actual_model = agent_result.model + + meta_edit_data = { + "patch_type": "agentic", + "api_costs": api_cost, + "num_applied": num_applied, + "patch_name": None, + "patch_description": None, + "error_attempt": None, + "novelty_attempt": novelty_attempt, + "resample_attempt": resample_attempt, + "patch_attempt": 1, + "agent_session_path": str(session_root), + "agent_session_log": agent_result.session_log, + "agent_commands": [asdict(cmd) for cmd in agent_result.commands_run], + "agent_metrics": agent_result.metrics, + "agent_changed_files": serialized_changed, + "agent_code_diffs": _build_code_diffs(agent_result.changed_files), + "agent_primary_file": str(primary_filename), + # Use bandit-selected model for bandit learning, fall back to actual model + "model_name": bandit_model + or _agent_model_name(selected_backend, actual_model), + "agent_backend": selected_backend, + "agent_session_id": agent_result.session_id, + "agent_resumed_from_parent": resumed_from_parent, + "bandit_selected_model": bandit_model, + } + + # Note: Bandit update happens in _process_completed_job() after evaluation, + # using the model_name stored in metadata (same pattern 
as legacy path) + + return code_diff, meta_edit_data, num_applied + + def _resolve_evaluator_mode(self) -> str: + """Resolve evaluator mode after considering agentic defaults.""" + mode = (self.evo_config.evaluator.mode or "auto").lower() + if mode == "legacy": + return "legacy" + if mode == "agentic": + return "agentic" + if mode == "auto": + return "agentic" if self.evo_config.agentic_mode else "legacy" + raise ValueError(f"Unknown evaluator mode: {self.evo_config.evaluator.mode}") + + def _run_legacy_evaluation_sync( + self, exec_fname: str, results_dir: str + ) -> tuple[dict, float]: + """Run legacy evaluation synchronously via subprocess. + + This is thread-safe and can be called from worker threads. + Returns (results_dict, runtime_seconds) in the expected format: + {"correct": {"correct": bool}, "metrics": {...}} + """ + import subprocess + + eval_command = self._build_eval_command(exec_fname, results_dir) + if not eval_command: + logger.warning("No eval command configured for legacy evaluation") + return {"correct": {"correct": False}, "metrics": {"combined_score": 0.0}}, 0.0 + + Path(results_dir).mkdir(parents=True, exist_ok=True) + metrics_path = Path(results_dir) / "metrics.json" + correct_path = Path(results_dir) / "correct.json" + + start_time = time.time() + try: + result = subprocess.run( + eval_command, + capture_output=True, + text=True, + timeout=300, # 5 minute timeout + ) + if result.returncode != 0: + logger.warning( + f"Legacy eval failed (exit {result.returncode}): {result.stderr[:500]}" + ) + except subprocess.TimeoutExpired: + logger.warning("Legacy eval timed out after 5 minutes") + except Exception as e: + logger.warning(f"Legacy eval error: {e}") + + rtime = time.time() - start_time + + # Parse correct.json + correct_val = False + if correct_path.exists(): + try: + content = correct_path.read_text(encoding="utf-8").strip() + if content: + correct_data = json.loads(content) + correct_val = correct_data.get("correct", False) + except 
Exception as e: + logger.warning(f"Failed to parse correct.json: {e}") + + # Parse metrics.json + metrics_val = {"combined_score": 0.0} + if metrics_path.exists(): + try: + content = metrics_path.read_text(encoding="utf-8").strip() + if content: + metrics_val = json.loads(content) + except Exception as e: + logger.warning(f"Failed to parse metrics.json: {e}") + + # Return in expected format + return { + "correct": {"correct": correct_val}, + "metrics": metrics_val, + }, rtime + + def _build_eval_command(self, exec_fname: str, results_dir: str) -> List[str]: + """Build the evaluation command from job config.""" + eval_program = self.job_config.eval_program_path + if not eval_program: + return [] + # Build command: python3 --program_path --results_dir + # Or use the raw eval_command if set in job_config + if hasattr(self.job_config, "eval_command") and self.job_config.eval_command: + return self.job_config.eval_command.split() + # Resolve to absolute path if relative (important for agentic eval which changes workdir) + eval_program_path = Path(eval_program) + if not eval_program_path.is_absolute(): + eval_program_path = (Path.cwd() / eval_program_path).resolve() + # Resolve exec_fname and results_dir to absolute paths too + exec_fname_path = Path(exec_fname) + if not exec_fname_path.is_absolute(): + exec_fname_path = (Path.cwd() / exec_fname_path).resolve() + results_dir_path = Path(results_dir) + if not results_dir_path.is_absolute(): + results_dir_path = (Path.cwd() / results_dir_path).resolve() + return [ + "python3", str(eval_program_path), + "--program_path", str(exec_fname_path), + "--results_dir", str(results_dir_path), + ] + + def _run_agentic_evaluation( + self, + *, + exec_fname: str, + results_dir: str, + generation_dir: Path, + generation: int, + parent_id: Optional[str] = None, + patch_type: Optional[str] = None, + ) -> tuple[Dict[str, Any], float]: + """Run evaluation using the agentic evaluator (LLM-powered).""" + if self.agentic_evaluator is None: + 
raise RuntimeError("Agentic evaluator not initialized") + + repo_root = generation_dir.resolve() + Path(results_dir).mkdir(parents=True, exist_ok=True) + metrics_path = Path(results_dir) / "metrics.json" + eval_sessions_root = self.agentic_eval_sessions_dir + eval_sessions_root.mkdir(parents=True, exist_ok=True) + eval_command = self._build_eval_command(exec_fname, results_dir) + run_root = Path(self.results_dir).resolve() + + def _rel_to_run_path(raw: Union[str, Path]) -> str: + try: + resolved = Path(raw).resolve() + return str(resolved.relative_to(run_root)) + except Exception: + return str(raw) + + # --- Evaluation integrity snapshot --- + # Policy: evaluator may create new artifacts but must not modify pre-existing files + results_path = Path(results_dir).resolve() + try: + results_rel = results_path.relative_to(repo_root) + except Exception: + results_rel = None + + ignored_dir_parts = {"__pycache__", ".pytest_cache", ".hydra", ".git", ".venv"} + ignored_suffixes = {".pyc", ".pyo"} + + def _should_ignore_integrity_path(rel_path: Path) -> bool: + if not rel_path.parts: + return True + if ( + results_rel is not None + and rel_path.parts[: len(results_rel.parts)] == results_rel.parts + ): + return True + if rel_path.suffix in ignored_suffixes: + return True + if any(part in ignored_dir_parts for part in rel_path.parts): + return True + return False + + def _snapshot_integrity(root: Path) -> Dict[str, str]: + snapshot: Dict[str, str] = {} + for abs_path in root.rglob("*"): + if not abs_path.is_file(): + continue + rel = abs_path.relative_to(root) + if _should_ignore_integrity_path(rel): + continue + try: + digest = hashlib.sha256(abs_path.read_bytes()).hexdigest() + except Exception: + continue + snapshot[rel.as_posix()] = digest + return snapshot + + integrity_pre = _snapshot_integrity(repo_root) + + # Convert paths to be relative to repo_root for the evaluator + # The agent runs with workdir=repo_root, so paths need to be relative + try: + rel_program_path = 
Path(exec_fname).resolve().relative_to(repo_root) + except ValueError: + rel_program_path = Path(exec_fname).name # Fallback to just filename + + try: + rel_results_path = Path(results_dir).resolve().relative_to(repo_root) + except ValueError: + rel_results_path = Path("results") # Fallback + + try: + rel_metrics_path = metrics_path.resolve().relative_to(repo_root) + except ValueError: + rel_metrics_path = Path("results/metrics.json") # Fallback + + start = time.time() + result = None + try: + result = self.agentic_evaluator.evaluate( + repo_root=repo_root, + eval_command=eval_command, + program_path=rel_program_path, + results_path=rel_results_path, + metrics_path=rel_metrics_path, + eval_sessions_root=eval_sessions_root, + task_name=self.job_config.eval_program_path or "agentic_evaluator", + results_dir=str(self.results_dir), + eval_prompt=getattr( + self.evo_config.evaluator.agentic, "eval_prompt", None + ), + max_score=self.evo_config.max_score, + ) + except (CodexExecutionError, ShinkaExecutionError) as exc: + # If metrics missing or empty, emit fallback so run can proceed + metrics_content = "" + if metrics_path.exists(): + metrics_content = metrics_path.read_text(encoding="utf-8").strip() + if not metrics_content: + metrics_path.parent.mkdir(parents=True, exist_ok=True) + fallback = { + "combined_score": 0.0, + "correct": False, + "details": f"Agentic evaluator failed: {exc}", + } + metrics_path.write_text(json.dumps(fallback), encoding="utf-8") + metrics_content = json.dumps(fallback) + try: + metrics = json.loads(metrics_content) + except json.JSONDecodeError: + metrics = {"combined_score": 0.0, "error": "Invalid metrics JSON"} + # If metrics exist and have a correct flag, use it; otherwise default to False + correct_from_metrics = bool(metrics.get("correct", False)) + result = AgenticEvaluatorResult( + metrics=metrics, + correct=correct_from_metrics, + error_message=str(exc), + stdout_log="", + stderr_log="", + session_log=[], + commands_run=[], + 
session_log_path=metrics_path.parent / "session_log.missing", + session_events=[], + session_id=None, + session_dir=metrics_path.parent, + elapsed_seconds=time.time() - start, + ) + rtime = time.time() - start + + integrity_post = _snapshot_integrity(repo_root) + modified_existing = sorted( + p + for p in integrity_pre.keys() + if p in integrity_post and integrity_pre[p] != integrity_post[p] + ) + deleted_existing = sorted( + p for p in integrity_pre.keys() if p not in integrity_post + ) + new_files_created = sorted( + p for p in integrity_post.keys() if p not in integrity_pre + ) + + integrity_status = "clean" + if modified_existing or deleted_existing: + integrity_status = "violation" + elif new_files_created: + integrity_status = "artifacts_only" + + integrity_meta = { + "policy": "no_modify_preexisting_files", + "status": integrity_status, + "modified_existing_count": len(modified_existing), + "deleted_existing_count": len(deleted_existing), + "new_files_created_count": len(new_files_created), + } + + # If integrity violated, force incorrect + effective_correct = result.correct + effective_error = result.error_message + effective_metrics = dict(result.metrics or {}) + + if integrity_status == "violation": + effective_correct = False + sample_paths = (modified_existing + deleted_existing)[:10] + integrity_msg = f"Evaluation integrity violation: evaluator modified files ({', '.join(sample_paths)})" + effective_error = ( + f"{effective_error} | {integrity_msg}" + if effective_error + else integrity_msg + ) + + events_preview = result.session_events[-AGENTIC_EVAL_PREVIEW_LIMIT:] + agentic_meta = { + "session_dir": _rel_to_run_path(result.session_dir), + "session_log_path": _rel_to_run_path(result.session_log_path), + "session_id": result.session_id, + "commands_run": [asdict(cmd) for cmd in result.commands_run], + "generation": generation, + "elapsed_seconds": result.elapsed_seconds, + "status": "error" if effective_error else "success", + "correct": 
effective_correct, + "metrics_path": _rel_to_run_path(metrics_path), + "metrics": effective_metrics, + "error_message": effective_error, + "stdout_log": result.stdout_log, + "stderr_log": result.stderr_log, + "events_preview": events_preview, + "system_prompt": result.system_prompt, + "user_prompt": result.user_prompt, + "integrity": integrity_meta, + } + + results_payload = { + "metrics": effective_metrics, + "correct": { + "correct": effective_correct, + "error": effective_error, + }, + "stdout_log": result.stdout_log, + "stderr_log": result.stderr_log, + "agentic_eval": agentic_meta, + } + + return results_payload, rtime diff --git a/shinka/core/sampler.py b/shinka/core/sampler.py index 6008f3357..236bb46f4 100644 --- a/shinka/core/sampler.py +++ b/shinka/core/sampler.py @@ -1,21 +1,16 @@ +import logging from typing import List, Optional, Tuple + import numpy as np + from shinka.database import Program -from shinka.prompts import ( - construct_eval_history_msg, - perf_str, - format_text_feedback_section, - BASE_SYSTEM_MSG, - DIFF_SYS_FORMAT, - DIFF_ITER_MSG, - FULL_ITER_MSG, - FULL_SYS_FORMATS, - CROSS_SYS_FORMAT, - CROSS_ITER_MSG, - get_cross_component, -) +from shinka.prompts import (BASE_SYSTEM_MSG, CROSS_ITER_MSG, CROSS_SYS_FORMAT, + DIFF_ITER_MSG, DIFF_SYS_FORMAT, FULL_ITER_MSG, + FULL_SYS_FORMATS, construct_eval_history_msg, + format_text_feedback_section, get_cross_component, + perf_str) +from shinka.prompts.prompts_agentic import AGENTIC_ITER_MSG, AGENTIC_SYS_FORMAT from shinka.prompts.prompts_init import INIT_SYSTEM_MSG, INIT_USER_MSG -import logging logger = logging.getLogger(__name__) @@ -28,6 +23,7 @@ def __init__( patch_types: Optional[List[str]] = None, patch_type_probs: Optional[List[float]] = None, use_text_feedback: bool = False, + agentic_mode: bool = False, ): if patch_types is None: patch_types = ["diff"] @@ -46,6 +42,8 @@ def __init__( ) # Whether to use text feedback in the prompt self.use_text_feedback = use_text_feedback + # Agentic mode: 
CLI harness owns system prompt, we only provide task context + self.agentic_mode = agentic_mode def initial_program_prompt(self) -> Tuple[str, str]: """Generate the prompt for the initial program.""" @@ -69,6 +67,10 @@ def sample( top_k_inspirations: List[Program], meta_recommendations: Optional[str] = None, ) -> Tuple[str, str, str]: + # Agentic mode: CLI harness owns system prompt, we provide task in user msg + if self.agentic_mode: + return self._sample_agentic(parent, meta_recommendations) + if self.task_sys_msg is None: sys_msg = BASE_SYSTEM_MSG else: @@ -179,3 +181,45 @@ def sample( eval_history_msg + "\n" + iter_msg, patch_type, ) + + def _sample_agentic( + self, + parent: Program, + meta_recommendations: Optional[str] = None, + ) -> Tuple[str, str, str]: + """Generate prompts for agentic mode. + + In agentic mode, the CLI harness (Codex, Claude CLI, Gemini CLI) owns the + system prompt. We only provide task context in the user message. + + Returns: + Tuple of (system_msg, user_msg, patch_type) where: + - system_msg is empty (harness provides its own) + - user_msg contains task context and current score + - patch_type is "agentic" + """ + # Task context from config + task_context = self.task_sys_msg or "Improve the program." 
def load_program(program_path: str) -> Any:
    """Load the Python file at ``program_path`` as a module named ``program``.

    The file's directory is placed at the front of ``sys.path`` while the
    module body executes, so the program can import modules that sit next
    to it. ``sys.path`` is put back to its previous contents afterwards,
    whether or not loading succeeded.

    Args:
        program_path: Path to the Python source file to load.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec or loader can be created for the file.
    """
    module_dir = os.path.abspath(os.path.dirname(program_path) or ".")
    saved_sys_path = sys.path[:]
    if module_dir not in sys.path:
        sys.path.insert(0, module_dir)

    try:
        spec = importlib.util.spec_from_file_location("program", program_path)
        if spec is None:
            raise ImportError(f"Could not load spec for module at {program_path}")
        loader = spec.loader
        if loader is None:
            raise ImportError(f"Spec loader is None for module at {program_path}")

        loaded = importlib.util.module_from_spec(spec)
        loader.exec_module(loaded)
    finally:
        # Restore in place so every holder of the sys.path list sees it.
        sys.path[:] = saved_sys_path
    return loaded
"text-embedding-3-small" + def db_retry(max_retries=5, initial_delay=0.1, backoff_factor=2): """ @@ -248,12 +251,22 @@ class ProgramDatabase: populations, and an archive of elites. """ - def __init__(self, config: DatabaseConfig, read_only: bool = False): + def __init__( + self, + config: DatabaseConfig, + embedding_model: str = "text-embedding-3-small", + read_only: bool = False, + ): self.config = config self.conn: Optional[sqlite3.Connection] = None self.cursor: Optional[sqlite3.Cursor] = None self.read_only = read_only - self.embedding_client = EmbeddingClient() + # Only create embedding client if not in read-only mode + # (e.g., WebUI doesn't need it for visualization) + if not read_only: + self.embedding_client = EmbeddingClient(model_name=embedding_model) + else: + self.embedding_client = None self.last_iteration: int = 0 self.best_program_id: Optional[str] = None diff --git a/shinka/database/display.py b/shinka/database/display.py index 4c34d3445..c622044ad 100644 --- a/shinka/database/display.py +++ b/shinka/database/display.py @@ -1,10 +1,11 @@ import json import logging import time +from typing import Any, Callable, Optional + import numpy as np -from typing import Optional, Callable, Any -import rich.box # type: ignore import rich # type: ignore +import rich.box # type: ignore from rich.columns import Columns as RichColumns # type: ignore from rich.console import Console as RichConsole # type: ignore from rich.table import Table as RichTable # type: ignore @@ -122,6 +123,18 @@ def print_program_summary(self, program, console: Optional[RichConsole] = None): else: time_display = f"{time_val:.1f}s" + # Safely extract metadata fields for display + metadata = program.metadata or {} + patch_name_raw = metadata.get("patch_name", "[dim]N/A[/dim]") + if patch_name_raw is None: + patch_name_raw = "[dim]N/A[/dim]" + patch_name = str(patch_name_raw)[:30] + + patch_type_raw = metadata.get("patch_type", "[dim]N/A[/dim]") + if patch_type_raw is None: + patch_type_raw 
= "[dim]N/A[/dim]" + patch_type = str(patch_type_raw) + # Add the data row island_display = ( f"I-{program.island_idx}" if program.island_idx is not None else "N/A" @@ -131,8 +144,8 @@ def print_program_summary(self, program, console: Optional[RichConsole] = None): island_display, status_display, score_display, - program.metadata.get("patch_name", "[dim]N/A[/dim]")[:30], - program.metadata.get("patch_type", "[dim]N/A[/dim]"), + patch_name, + patch_type, f"{program.complexity:.1f}", cost_display, time_display, @@ -196,9 +209,11 @@ def print_summary(self, console: Optional[RichConsole] = None) -> None: # Add Best Score to the top of the summary table summary_table.add_row( "Overall Best Score", - f"[bold cyan]{best_score:.2f}[/bold cyan]" - if num_with_scores > 0 - else "[dim]N/A[/dim]", + ( + f"[bold cyan]{best_score:.2f}[/bold cyan]" + if num_with_scores > 0 + else "[dim]N/A[/dim]" + ), ) # Gather data for summary @@ -457,8 +472,8 @@ def print_summary(self, console: Optional[RichConsole] = None) -> None: correct_str, score_str, f"{prog.complexity:.1f}", - prog.metadata.get("patch_name", "N/A")[:30], - prog.metadata.get("patch_type", "N/A")[:6], + (prog.metadata.get("patch_name") or "N/A")[:30], + (prog.metadata.get("patch_type") or "N/A")[:6], island_display, str(children_count), ts_str, @@ -588,8 +603,9 @@ def format_program_row(prog, role_name): time_display = f"{time_val:.1f}s" # Patch name and type - patch_name = prog.metadata.get("patch_name", "[dim]N/A[/dim]")[:30] - patch_type = prog.metadata.get("patch_type", "[dim]N/A[/dim]") + metadata = prog.metadata or {} + patch_name = (metadata.get("patch_name") or "[dim]N/A[/dim]")[:30] + patch_type = metadata.get("patch_type") or "[dim]N/A[/dim]" return [ role_name, diff --git a/shinka/database/inspirations.py b/shinka/database/inspirations.py index ee564dfa1..42c3859d8 100644 --- a/shinka/database/inspirations.py +++ b/shinka/database/inspirations.py @@ -72,6 +72,7 @@ def sample_context(self, parent: Any, n: int) 
-> List[Any]: self.cursor.execute( """ SELECT p.id FROM programs p + JOIN archive a ON p.id = a.program_id WHERE p.island_idx = ? AND p.correct = 1 ORDER BY p.combined_score DESC LIMIT ? @@ -93,7 +94,8 @@ def sample_context(self, parent: Any, n: int) -> List[Any]: placeholders_rand = ",".join("?" * len(insp_ids)) sql_rand = f""" SELECT p.id FROM programs p - WHERE p.island_idx = ? AND p.correct = 1 + JOIN archive a ON p.id = a.program_id + WHERE p.island_idx = ? AND p.correct = 1 AND p.id NOT IN ({placeholders_rand}) ORDER BY RANDOM() LIMIT ? """ @@ -111,9 +113,10 @@ def sample_context(self, parent: Any, n: int) -> List[Any]: needed = n - len(inspirations) if needed > 0: placeholders_rand = ",".join("?" * len(insp_ids)) - sql_rand = f"""SELECT id FROM programs - WHERE correct = 1 - AND id NOT IN ({placeholders_rand}) + sql_rand = f"""SELECT p.id FROM programs p + JOIN archive a ON p.id = a.program_id + WHERE p.correct = 1 + AND p.id NOT IN ({placeholders_rand}) ORDER BY RANDOM() LIMIT ? """ params_rand = list(insp_ids) + [needed] diff --git a/shinka/database/islands.py b/shinka/database/islands.py index 9975eac3b..d721ec3ff 100644 --- a/shinka/database/islands.py +++ b/shinka/database/islands.py @@ -488,7 +488,7 @@ def _print_migration_summary(self, migrations_summary: Dict) -> None: f"{generation}", score_str, str(children), - (patch_name[:28] if patch_name != "N/A" else "N/A"), + (patch_name[:28] if patch_name and patch_name != "N/A" else "N/A"), patch_type, f"{complexity:.1f}" if complexity else "N/A", ) @@ -682,6 +682,16 @@ def copy_program_to_islands(self, program: Any) -> List[str]: f"Created copy {new_id[:8]}... of program {program.id[:8]}... " f"for island {island_idx}" ) + + # Add the copied program to the archive if it's correct + # This ensures it can be used as inspiration for that island + if program.correct: + self.cursor.execute( + "INSERT OR IGNORE INTO archive (program_id) VALUES (?)", + (new_id,), + ) + logger.debug(f"Added copy {new_id[:8]}... 
@dataclass
class CommandResult:
    """Represents a command execution issued by the agent.

    Every field is optional because each one is copied from the agent's
    ``command_execution`` event with ``dict.get`` and may be absent there.
    """

    # Command reported by the agent event.
    command: Optional[str]
    # Status reported by the agent event.
    status: Optional[str]
    # Exit code reported by the agent event.
    exit_code: Optional[int]
    # Captured standard output, when the event carries it.
    stdout: Optional[str] = None
    # Captured standard error, when the event carries it.
    stderr: Optional[str] = None


@dataclass
class AgentResult:
    """Container for the outcome of an agentic editing session."""

    # Text files found new or modified after the session, keyed by path
    # relative to the scratch directory, mapped to their content.
    changed_files: Dict[Path, str]
    # Texts of the agent's "agent_message" events, in arrival order.
    session_log: List[str]
    # Commands the agent executed during the session.
    commands_run: List[CommandResult]
    final_message: Optional[str] = None
    # Numeric session telemetry (e.g. token counts and cost).
    metrics: Dict[str, float] = field(default_factory=dict)
    # Location of the JSONL file the raw events were written to.
    session_log_path: Optional[Path] = None
    # Raw event dictionaries emitted by the runner.
    session_events: List[Dict[str, Any]] = field(default_factory=list)
    # Files that are not valid UTF-8, stored as base64-encoded ASCII text.
    binary_changed_files: Dict[Path, str] = field(default_factory=dict)
    # Backend session identifier extracted from the events, if any.
    session_id: Optional[str] = None
    model: Optional[str] = None  # Actual model from CLI init event


@dataclass
class AgentContext:
    """Inputs required to run an agentic editing session.

    Note on system_prompt: In agentic mode, the harness (Codex/Gemini/Claude CLI)
    owns the system prompt. This field contains only AGENTIC_SYS_FORMAT (operational
    instructions for sandbox editing), NOT task-specific context. Task context
    (task_sys_msg from config) is included in the user_prompt as "# Task Context".
    This ensures we don't override the CLI's native system behavior.
    """

    # Prompt passed to the runner as ``user_prompt``.
    user_prompt: str
    language: str
    # Files written into the scratch directory before the session starts;
    # keys must be paths relative to the scratch root.
    base_files: Dict[Path, str]
    primary_file: Path
    # Prompt passed to the runner as ``system_prompt`` (see note above).
    system_prompt: Optional[str] = None
    # Extra values forwarded to the runner; the keys read are "parent_id",
    # "generation", "patch_type" and "results_dir".
    metadata: Dict[str, Any] = field(default_factory=dict)
    # Session to resume, forwarded to the runner as ``resume_session_id``.
    resume_session_id: Optional[str] = None
+ """ + + def __init__( + self, + scratch_dir: Path, + config, + *, + runner: AgentRunner = run_codex_task, + codex_runner: AgentRunner | None = None, # Deprecated: use runner + ) -> None: + self.scratch_dir = Path(scratch_dir) + self.config = config + # Accept the legacy codex_runner keyword for backward compatibility + self.runner = runner if codex_runner is None else codex_runner + + def _prepare_scratch(self, base_files: Dict[Path, str]) -> Dict[Path, str]: + # Preserve session_meta.json if it exists (written by runner.py for visualization) + meta_path = self.scratch_dir / "session_meta.json" + preserved_meta = None + if meta_path.exists(): + try: + preserved_meta = meta_path.read_text(encoding="utf-8") + except Exception: + pass + + scratch_resolved = self.scratch_dir.resolve() + + if self.scratch_dir.exists(): + shutil.rmtree(self.scratch_dir) + self.scratch_dir.mkdir(parents=True, exist_ok=True, mode=0o700) + + # Restore session_meta.json + if preserved_meta is not None: + try: + meta_path.write_text(preserved_meta, encoding="utf-8") + except Exception: + pass + + baseline: Dict[Path, str] = {} + for relative_path, content in base_files.items(): + if relative_path.is_absolute(): + raise ValueError("Base file paths must be relative to the scratch root") + target = self.scratch_dir / relative_path + try: + if not target.resolve().is_relative_to(scratch_resolved): + raise ValueError( + f"Base file path '{relative_path}' escapes scratch directory" + ) + except (OSError, ValueError) as e: + raise ValueError( + f"Invalid base file path '{relative_path}': {e}" + ) from e + + content_bytes = len(content.encode("utf-8")) + if content_bytes > MAX_BASE_FILE_SIZE: + raise ValueError( + f"Base file {relative_path} exceeds max size " + f"({content_bytes} > {MAX_BASE_FILE_SIZE} bytes)" + ) + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text(content, encoding="utf-8") + baseline[relative_path] = content + return baseline + + def run_session(self, context: 
AgentContext) -> AgentResult: + baseline = self._prepare_scratch(context.base_files) + + session_log: List[str] = [] + commands: List[CommandResult] = [] + start_time = time.monotonic() + + session_log_path = self.scratch_dir / "session_log.jsonl" + event_count = 0 + session_events: List[Dict[str, Any]] = [] + binary_changed_files: Dict[Path, str] = {} + session_id: Optional[str] = None + model_from_event: Optional[str] = None # Actual model from CLI init event + + # Telemetry aggregation + usage_metrics: Dict[str, float] = { + "input_tokens": 0, + "output_tokens": 0, + "total_tokens": 0, + "total_cost_usd": 0.0, + } + + with session_log_path.open("w", encoding="utf-8") as event_handle: + for event in self.runner( + user_prompt=context.user_prompt, + system_prompt=context.system_prompt, + workdir=self.scratch_dir, + profile=self.config.cli_profile, + sandbox=self.config.sandbox, + approval_mode=self.config.approval_mode, + max_seconds=self.config.max_seconds, + max_events=self.config.max_events, + extra_cli_config=self.config.extra_cli_config, + cli_path=self.config.cli_path, + resume_session_id=context.resume_session_id, + session_kind="edit", + parent_id=context.metadata.get("parent_id"), + generation=context.metadata.get("generation"), + patch_type=context.metadata.get("patch_type"), + results_dir=context.metadata.get("results_dir"), + ): + if isinstance(event, dict): + json.dump(event, event_handle) + event_handle.write("\n") + event_count += 1 + session_events.append(event) + if session_id is None: + candidate = extract_session_id(event) + if candidate: + session_id = candidate + + # Handle standard event types + item = event.get("item") if isinstance(event, dict) else None + if item: + item_type = item.get("type") + if item_type == "agent_message": + text = item.get("text") + if text: + session_log.append(text) + elif item_type == "command_execution": + commands.append( + CommandResult( + command=item.get("command"), + status=item.get("status"), + 
exit_code=item.get("exit_code"), + stdout=item.get("stdout"), + stderr=item.get("stderr"), + ) + ) + + # Handle direct event types + event_type = event.get("type") + + # Capture model from init event (Claude CLI and ShinkaAgent emit this) + if event_type == "init" and model_from_event is None: + model_candidate = event.get("model") + if isinstance(model_candidate, str) and model_candidate: + model_from_event = model_candidate + + if event_type == "usage": + usage = event.get("usage") + if isinstance(usage, dict): + usage_metrics["input_tokens"] += float( + usage.get("input_tokens", 0) + ) + usage_metrics["output_tokens"] += float( + usage.get("output_tokens", 0) + ) + usage_metrics["total_tokens"] += float( + usage.get("total_tokens", 0) + ) + # Use real cost from Claude CLI if available + if "total_cost_usd" in usage: + usage_metrics["total_cost_usd"] += float( + usage.get("total_cost_usd", 0.0) + ) + + elapsed = time.monotonic() - start_time + + changed_files: Dict[Path, str] = {} + files_checked = 0 + scratch_resolved = self.scratch_dir.resolve() + + for file_path in self.scratch_dir.rglob("*"): + # Prevent unbounded scans in pathological scratch trees. + if files_checked >= MAX_FILES_TO_SCAN: + break + + if not file_path.is_file(): + continue + + # Avoid following symlinks/paths that escape the sandbox. 
+ try: + if not file_path.resolve().is_relative_to(scratch_resolved): + continue + except (OSError, ValueError): + continue + + rel_path = file_path.relative_to(self.scratch_dir) + + # Skip internal session files - they shouldn't be part of the program + if str(rel_path) in ("session_log.jsonl", "session_meta.json"): + continue + + files_checked += 1 + try: + new_content = file_path.read_text(encoding="utf-8") + except UnicodeDecodeError: + try: + if file_path.stat().st_size > MAX_BINARY_FILE_SIZE: + continue + except OSError: + continue + raw_bytes = file_path.read_bytes() + binary_changed_files[rel_path] = base64.b64encode(raw_bytes).decode( + "ascii" + ) + continue + + baseline_content = baseline.get(rel_path) + if baseline_content is None: + # New file created + changed_files[rel_path] = new_content + elif baseline_content != new_content: + # Existing file modified + changed_files[rel_path] = new_content + + if not changed_files and files_checked > 0: + logger.info( + "Agentic session completed but no files changed. " + f"Checked {files_checked} files in {self.scratch_dir}. 
" + f"Baseline files: {len(baseline)}" + ) + elif changed_files: + logger.info( + f"Agentic session changed {len(changed_files)} files: {[str(p) for p in changed_files.keys()]}" + ) + + # Use real cost if available (Claude CLI provides total_cost_usd), + # otherwise fallback to token-based placeholder estimate + real_cost = usage_metrics.get("total_cost_usd", 0.0) + fallback_cost = usage_metrics["total_tokens"] / 1000.0 # rough placeholder + final_cost = real_cost if real_cost > 0 else fallback_cost + + metrics = { + "elapsed_seconds": elapsed, + "commands_run": float(len(commands)), + "messages_logged": float(len(session_log)), + "events_logged": float(event_count), + "estimated_input_tokens": usage_metrics["input_tokens"], + "estimated_output_tokens": usage_metrics["output_tokens"], + "estimated_total_tokens": usage_metrics["total_tokens"], + "estimated_total_cost": final_cost, + "total_cost": final_cost, + "input_tokens": usage_metrics["input_tokens"], + "output_tokens": usage_metrics["output_tokens"], + "total_tokens": usage_metrics["total_tokens"], + "real_cost_available": real_cost > 0, + } + + final_message = session_log[-1] if session_log else None + + return AgentResult( + changed_files=changed_files, + binary_changed_files=binary_changed_files, + session_log=session_log, + commands_run=commands, + final_message=final_message, + metrics=metrics, + session_log_path=session_log_path, + session_events=session_events, + session_id=session_id, + model=model_from_event, + ) diff --git a/shinka/edit/apply_diff.py b/shinka/edit/apply_diff.py index ead28e231..a45d0482a 100644 --- a/shinka/edit/apply_diff.py +++ b/shinka/edit/apply_diff.py @@ -142,8 +142,12 @@ def _clean_evolve_markers(text: str) -> str: def redact_immutable(text: str, no_state: bool = False) -> str: + ranges = _mutable_ranges(text) + # If no EVOLVE-BLOCK markers found, return the full text for embedding + if not ranges: + return text out = [] - for a, b in _mutable_ranges(text): + for a, b in 
ranges: # keep immutable gap as a 1-liner placeholder if not no_state: out.append("<… non-evolvable code omitted …>") @@ -698,12 +702,12 @@ def apply_diff_patch( patch_str = _strip_trailing_whitespace(patch_str) # Remove the EVOLVE-BLOCK START and EVOLVE-BLOCK END markers - if language in ["cuda", "cpp"]: - patch_str = re.sub(r"// EVOLVE-BLOCK START\\n", "", patch_str) - patch_str = re.sub(r"// EVOLVE-BLOCK END\\n", "", patch_str) + if language in ["cuda", "cpp", "rust", "swift", "json", "json5"]: + patch_str = re.sub(r"// EVOLVE-BLOCK-START\\n", "", patch_str) + patch_str = re.sub(r"// EVOLVE-BLOCK-END\\n", "", patch_str) elif language == "python": - patch_str = re.sub(r"# EVOLVE-BLOCK START\\n", "", patch_str) - patch_str = re.sub(r"# EVOLVE-BLOCK END\\n", "", patch_str) + patch_str = re.sub(r"# EVOLVE-BLOCK-START\\n", "", patch_str) + patch_str = re.sub(r"# EVOLVE-BLOCK-END\\n", "", patch_str) else: raise ValueError(f"Language {language} not supported") @@ -730,6 +734,12 @@ def apply_diff_patch( suffix = ".cpp" elif language == "cuda": suffix = ".cu" + elif language == "rust": + suffix = ".rs" + elif language == "swift": + suffix = ".swift" + elif language in ["json", "json5"]: + suffix = ".json" else: raise ValueError(f"Language {language} not supported") diff --git a/shinka/edit/apply_full.py b/shinka/edit/apply_full.py index b7e2e2b37..ac6288128 100644 --- a/shinka/edit/apply_full.py +++ b/shinka/edit/apply_full.py @@ -1,6 +1,6 @@ from pathlib import Path from typing import Optional, Union -from .apply_diff import write_git_diff, _mutable_ranges +from .apply_diff import write_git_diff, _mutable_ranges, EVOLVE_START, EVOLVE_END from shinka.llm import extract_between import logging @@ -72,10 +72,15 @@ def apply_full_patch( updated_content = "" last_end = 0 - # Check if patch_code contains EVOLVE-BLOCK markers - patch_mutable_ranges = _mutable_ranges(patch_code) + # Detect EVOLVE markers presence in the patch content + patch_has_start = 
EVOLVE_START.search(patch_code) is not None + patch_has_end = EVOLVE_END.search(patch_code) is not None + patch_has_both = patch_has_start and patch_has_end + patch_has_none = not patch_has_start and not patch_has_end - if patch_mutable_ranges: + if patch_has_both: + # Patch contains both EVOLVE-BLOCK markers, extract from them + patch_mutable_ranges = _mutable_ranges(patch_code) # Patch contains EVOLVE-BLOCK markers, extract from them for i, (start, end) in enumerate(mutable_ranges): # Add immutable part before this mutable range @@ -91,47 +96,158 @@ def apply_full_patch( updated_content += replacement_content last_end = end - else: + elif patch_has_none: # Patch doesn't contain EVOLVE-BLOCK markers # Assume entire patch content should replace all mutable regions if len(mutable_ranges) == 1: - # Single mutable region, replace with entire patch content + # Single mutable region. If the patch appears to be a full-file + # rewrite that omitted EVOLVE markers, safely extract only the + # content intended for the evolve block by matching immutable + # prefix/suffix from the original file. 
start, end = mutable_ranges[0] - # The mutable range ends before "EVOLVE-BLOCK-END" text - # We need to find the actual start of the comment line - if language == "python": - end_marker = "# EVOLVE-BLOCK-END" - elif language in ["cuda", "cpp"]: - end_marker = "// EVOLVE-BLOCK-END" - else: - end_marker = "# EVOLVE-BLOCK-END" # Default fallback - - end_marker_pos = original.find(end_marker, end - 5) - if end_marker_pos == -1: - # Fallback: use the original end position - end_marker_pos = end + # Immutable portions that remain outside the evolve block + immutable_prefix = original[:start] + immutable_suffix = original[end:] - # Ensure proper newline handling around the patch content - if patch_code and not patch_code.startswith("\n"): - patch_code = "\n" + patch_code + # Also compute the portions strictly outside the marker lines + # to detect full-file patches that omitted EVOLVE markers. + # Find the start and end marker line boundaries. + start_match = None + end_match = None + for m in EVOLVE_START.finditer(original): + if m.end() == start: + start_match = m + break + for m in EVOLVE_END.finditer(original): + if m.start() == end: + end_match = m + break - if patch_code and not patch_code.endswith("\n"): - patch_code = patch_code + "\n" - - updated_content = ( - original[:start] + patch_code + original[end_marker_pos:] + prefix_outside = ( + original[: start_match.start()] if start_match else immutable_prefix + ) + suffix_outside = ( + original[end_match.end() :] if end_match else immutable_suffix ) + + # Heuristic: if patch includes the same immutable prefix/suffix + # outside the markers, treat the middle part as the evolve-block + # replacement. Be tolerant to a missing trailing newline in the + # footer by checking both versions. 
+ suffix_opts = (suffix_outside, suffix_outside.rstrip("\r\n")) + if patch_code.startswith(prefix_outside) and any( + patch_code.endswith(sfx) for sfx in suffix_opts + ): + mid_start = len(prefix_outside) + # choose the matching suffix option to compute end + sfx = next(sfx for sfx in suffix_opts if patch_code.endswith(sfx)) + mid_end = len(patch_code) - len(sfx) + replacement_content = patch_code[mid_start:mid_end] + # Ensure marker boundaries stay on their own lines. + # Add a leading newline only if there is a START marker. + if ( + start_match is not None + and replacement_content + and not replacement_content.startswith("\n") + ): + replacement_content = "\n" + replacement_content + # Add a trailing newline only if there is an END marker. + if ( + end_match is not None + and replacement_content + and not replacement_content.endswith("\n") + ): + replacement_content = replacement_content + "\n" + updated_content = ( + immutable_prefix + replacement_content + immutable_suffix + ) + else: + # Otherwise, assume the patch_code represents only the + # evolve-block payload and insert it directly between markers. + # Ensure proper newline handling around the patch content. + payload = patch_code + if ( + start_match is not None + and payload + and not payload.startswith("\n") + ): + payload = "\n" + payload + if end_match is not None and payload and not payload.endswith("\n"): + payload = payload + "\n" + updated_content = immutable_prefix + payload + immutable_suffix else: - # Multiple mutable regions, this is ambiguous + # Multiple EVOLVE-BLOCK regions found, ambiguous without markers error_message = ( "Multiple EVOLVE-BLOCK regions found but patch " "doesn't specify which to replace" ) return original, 0, None, error_message, None, None + else: + # Patch contains exactly one marker (START xor END). + # Only safe to apply when original has a single evolve region. 
+ if len(mutable_ranges) != 1: + error_message = ( + "Patch contains only one EVOLVE-BLOCK marker, but the original " + f"has {len(mutable_ranges)} editable regions; cannot determine target" + ) + return original, 0, None, error_message, None, None + + # Single target region in original + start, end = mutable_ranges[0] + immutable_prefix = original[:start] + immutable_suffix = original[end:] + + # Find exact marker locations in original for newline policy + start_match = None + end_match = None + for m in EVOLVE_START.finditer(original): + if m.end() == start: + start_match = m + break + for m in EVOLVE_END.finditer(original): + if m.start() == end: + end_match = m + break + + # Compute outside-of-markers prefix/suffix from original + prefix_outside = ( + original[: start_match.start()] if start_match else immutable_prefix + ) + suffix_outside = ( + original[end_match.end() :] if end_match else immutable_suffix + ) + + # Extract payload based on which single marker is present in patch + if patch_has_start and not patch_has_end: + m = EVOLVE_START.search(patch_code) + payload = patch_code[m.end() :] if m else patch_code + # Trim footer if the patch included it + for sfx in (suffix_outside, suffix_outside.rstrip("\r\n")): + if sfx and payload.endswith(sfx): + payload = payload[: -len(sfx)] + break + elif patch_has_end and not patch_has_start: + m = EVOLVE_END.search(patch_code) + payload = patch_code[: m.start()] if m else patch_code + # Trim header if the patch included it + for pfx in (prefix_outside, prefix_outside.rstrip("\r\n")): + if pfx and payload.startswith(pfx): + payload = payload[len(pfx) :] + break + else: + payload = patch_code + + # Normalize newlines so markers remain on their own lines + if start_match is not None and payload and not payload.startswith("\n"): + payload = "\n" + payload + if end_match is not None and payload and not payload.endswith("\n"): + payload = payload + "\n" + + updated_content = immutable_prefix + payload + immutable_suffix # 
Add remaining immutable content after last mutable range - if patch_mutable_ranges and mutable_ranges: + if patch_has_both and mutable_ranges: updated_content += original[mutable_ranges[-1][1] :] num_applied = 1 @@ -146,6 +262,12 @@ def apply_full_patch( suffix = ".cpp" elif language == "cuda": suffix = ".cu" + elif language == "rust": + suffix = ".rs" + elif language == "swift": + suffix = ".swift" + elif language in ["json", "json5"]: + suffix = ".json" else: raise ValueError(f"Language {language} not supported") diff --git a/shinka/edit/async_apply.py b/shinka/edit/async_apply.py index 8e542c565..e4c21202f 100644 --- a/shinka/edit/async_apply.py +++ b/shinka/edit/async_apply.py @@ -118,6 +118,31 @@ async def validate_code_async( error_msg = stderr.decode() if stderr else "Unknown compilation error" return False, error_msg + elif language == "rust": + # Use rustc for Rust syntax checking + proc = await asyncio.create_subprocess_exec( + "rustc", + "--crate-type=lib", + "-Zparse-only", + code_path, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + + try: + stdout, stderr = await asyncio.wait_for( + proc.communicate(), timeout=timeout + ) + except asyncio.TimeoutError: + proc.kill() + await proc.wait() + return False, f"Validation timeout after {timeout}s" + + if proc.returncode == 0: + return True, None + else: + error_msg = stderr.decode() if stderr else "Unknown compilation error" + return False, error_msg elif language == "cpp": # Use g++ for C++ compilation check proc = await asyncio.create_subprocess_exec( @@ -128,6 +153,31 @@ async def validate_code_async( stderr=asyncio.subprocess.PIPE, ) + try: + stdout, stderr = await asyncio.wait_for( + proc.communicate(), timeout=timeout + ) + except asyncio.TimeoutError: + proc.kill() + await proc.wait() + return False, f"Validation timeout after {timeout}s" + + if proc.returncode == 0: + return True, None + else: + error_msg = stderr.decode() if stderr else "Unknown compilation error" + return 
False, error_msg + elif language == "swift": + # Use swiftc for Swift syntax checking + proc = await asyncio.create_subprocess_exec( + "swiftc", + "-typecheck", + "-parse-as-library", + code_path, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + try: stdout, stderr = await asyncio.wait_for( proc.communicate(), timeout=timeout diff --git a/shinka/edit/codex_cli.py b/shinka/edit/codex_cli.py new file mode 100644 index 000000000..116df6dc8 --- /dev/null +++ b/shinka/edit/codex_cli.py @@ -0,0 +1,443 @@ +"""Helpers for interacting with the Codex CLI.""" + +from __future__ import annotations + +import json +import logging +import os +import shutil +import subprocess +import sys +import time +from pathlib import Path +from typing import Dict, Iterable, Iterator, Literal, Optional + +from shinka.edit.cost_utils import calculate_cost +from shinka.edit.event_utils import extract_session_id +from shinka.tools.credentials import get_api_key + +logger = logging.getLogger(__name__) + + +class CodexUnavailableError(RuntimeError): + """Raised when the Codex CLI binary cannot be located.""" + + +class CodexExecutionError(RuntimeError): + """Raised when a Codex run fails or exceeds configured limits.""" + + +class CodexAuthError(RuntimeError): + """Raised when Codex authentication cannot be established.""" + + +def _is_interactive() -> bool: + """Check if running in interactive context (avoid hanging in CI/background).""" + return bool(sys.stdin.isatty() and sys.stdout.isatty()) + + +def _status_looks_authenticated(stdout: str, stderr: str) -> bool: + combined = f"{stdout}\n{stderr}".lower() + if "not logged" in combined: + return False + if "unauthorized" in combined: + return False + if "please login" in combined or "please log in" in combined: + return False + return True + + +def _is_codex_authenticated(codex_bin: Path) -> bool: + """Return True if Codex CLI reports an authenticated session.""" + try: + result = subprocess.run( + [str(codex_bin), 
"login", "status"], + capture_output=True, + text=True, + check=False, + ) + except OSError: + return False + if result.returncode != 0: + return False + return _status_looks_authenticated(result.stdout or "", result.stderr or "") + + +def _login_with_api_key(codex_bin: Path, api_key: str, *, timeout_seconds: int) -> bool: + """Attempt a non-interactive login using an API key via stdin.""" + try: + result = subprocess.run( + [str(codex_bin), "login", "--with-api-key"], + input=f"{api_key}\n", + text=True, + capture_output=True, + timeout=timeout_seconds, + check=False, + ) + except (OSError, subprocess.TimeoutExpired): + return False + return result.returncode == 0 + + +def _login_device_auth(codex_bin: Path, *, timeout_seconds: int) -> bool: + """Attempt a device auth login, inheriting stdio so the user sees the code.""" + try: + result = subprocess.run( + [str(codex_bin), "login", "--device-auth"], + timeout=timeout_seconds, + check=False, + ) + except (OSError, subprocess.TimeoutExpired): + return False + return result.returncode == 0 + + +def _ensure_codex_authenticated( + codex_bin: Path, + *, + api_key: Optional[str] = None, + timeout_seconds: int = 900, + allow_interactive: Optional[bool] = None, +) -> Literal["status", "device_auth", "api_key"]: + """Ensure Codex is authenticated, attempting login flows if needed. + + Order of operations: + 1) `codex login status` (fast path) + 2) If not logged in and interactive, attempt `codex login --device-auth` + 3) If still not logged in and api_key provided, attempt `codex login --with-api-key` + + Raises: + CodexAuthError: If authentication is not available after attempts. 
+ """ + if _is_codex_authenticated(codex_bin): + return "status" + + interactive = _is_interactive() if allow_interactive is None else allow_interactive + if interactive: + if _login_device_auth(codex_bin, timeout_seconds=timeout_seconds): + if _is_codex_authenticated(codex_bin): + return "device_auth" + + if api_key: + if _login_with_api_key(codex_bin, api_key, timeout_seconds=timeout_seconds): + if _is_codex_authenticated(codex_bin): + return "api_key" + + raise CodexAuthError( + "Codex authentication required. Options:\n" + " 1. Run `codex login --device-auth` (requires enabling device code auth in ChatGPT Security Settings first)\n" + " 2. Run `echo $OPENAI_API_KEY | codex login --with-api-key`\n" + " 3. Set OPENAI_API_KEY environment variable or add to ~/.shinka/credentials.json" + ) + + +def ensure_codex_available(codex_path: Optional[str] = None) -> Path: + """Return the resolved path to the Codex CLI binary. + + Args: + codex_path: Optional override pointing directly to the CLI executable. + + Raises: + CodexUnavailableError: If the binary cannot be found or executed. + + Returns: + Path: Absolute path to the Codex CLI binary. + """ + + candidate = codex_path or shutil.which("codex") + if not candidate: + raise CodexUnavailableError( + "Codex CLI not found. Install it with `npm install -g @openai/codex` " + "or add it to PATH, then authenticate via `codex login --device-auth` " + "(requires enabling device code auth in ChatGPT Security Settings) " + "or `codex login --with-api-key`." + ) + + resolved = Path(candidate) + if not resolved.exists() or not resolved.is_file(): + raise CodexUnavailableError( + f"Codex CLI binary not found at resolved path: {resolved}" + ) + + return resolved + + +def validate_codex_setup(codex_path: Optional[str] = None) -> None: + """Validate Codex CLI is installed and authenticated at startup. 
+ + This should be called early (e.g., in EvolutionRunner.__init__) to fail fast + before evolution starts, rather than failing mid-evolution on the first edit. + + Args: + codex_path: Optional override pointing directly to the CLI executable. + + Raises: + CodexUnavailableError: If Codex CLI is not installed. + CodexAuthError: If Codex CLI is not authenticated. + """ + # Check binary is available + codex_bin = ensure_codex_available(codex_path) + + # Check authentication status (without triggering interactive login) + if not _is_codex_authenticated(codex_bin): + raise CodexAuthError( + "Codex CLI is not authenticated. Please run:\n\n" + " $ codex login\n\n" + "This will open your browser for OAuth authentication.\n" + "After authenticating, verify with: codex login status" + ) + + +def _to_primitive(obj: object) -> object: + """Convert OmegaConf DictConfig/ListConfig to primitive Python types.""" + try: + from omegaconf import DictConfig, ListConfig, OmegaConf + if isinstance(obj, (DictConfig, ListConfig)): + return OmegaConf.to_container(obj, resolve=True) + except ImportError: + pass + return obj + + +def _format_extra_config(extra: Dict[str, object]) -> Iterable[str]: + """Yield CLI `-c key=value` pairs from a dictionary.""" + + for key, value in extra.items(): + if value is None: + continue + if isinstance(value, str): + yield "-c" + yield f"{key}={value}" + else: + yield "-c" + yield f"{key}={json.dumps(_to_primitive(value))}" + + +def run_codex_task( + user_prompt: str, + workdir: Path, + *, + system_prompt: Optional[str] = None, + profile: Optional[str], + sandbox: str, + approval_mode: str, + max_seconds: int, + max_events: int, + extra_cli_config: Dict[str, object], + codex_path: Optional[str] = None, + cli_path: Optional[str] = None, # Alias for codex_path + resume_session_id: Optional[str] = None, + session_kind: str = "unknown", + # Metadata params (unused but accepted for API compat with agentic.py) + parent_id: Optional[str] = None, + generation: 
Optional[int] = None, + patch_type: Optional[str] = None, + results_dir: Optional[str] = None, +) -> Iterator[Dict[str, object]]: + """Execute a Codex CLI task and stream its JSON events. + + Args: + user_prompt: Natural language instruction for Codex. + workdir: Workspace directory Codex should modify. + system_prompt: Optional system instructions (prepended to prompt). + profile: Optional Codex profile name (selects model/settings). + sandbox: Sandbox policy passed to `--sandbox`. + approval_mode: Either `full-auto` or values accepted by + `--ask-for-approval`. + max_seconds: Wall-clock guardrail for the Codex process. + max_events: Maximum number of JSON events to yield before aborting. + extra_cli_config: Additional key/value overrides forwarded via `-c`. + codex_path: Optional explicit path to the CLI binary. + cli_path: Alias for codex_path (for backend-agnostic calls). + resume_session_id: Optional session UUID to resume via + `codex exec resume`. + + Raises: + CodexExecutionError: If Codex fails, times out, or exceeds limits. + CodexUnavailableError: If the CLI binary cannot be located. + + Yields: + Parsed JSON events emitted by the CLI. + """ + + # Use cli_path if provided, fall back to codex_path for backward compat + binary = ensure_codex_available(cli_path or codex_path) + + # Authentication: prefer an existing Codex CLI login (e.g. ChatGPT subscription), + # and only fall back to API key auth when no interactive login is available. 
+ api_key = get_api_key("codex") + try: + auth_method = _ensure_codex_authenticated(binary, api_key=api_key) + except CodexAuthError as exc: + raise CodexExecutionError(str(exc)) from exc + + cmd = [str(binary), "exec"] + if resume_session_id: + cmd.append("resume") + cmd.extend(["--json", "--skip-git-repo-check", "-C", str(workdir)]) + + if profile: + cmd.extend(["--profile", profile]) + + if sandbox: + cmd.extend(["--sandbox", sandbox]) + + if approval_mode == "full-auto": + cmd.append("--full-auto") + elif approval_mode: + cmd.extend(["--ask-for-approval", approval_mode]) + + cmd.extend(_format_extra_config(extra_cli_config)) + + if resume_session_id: + cmd.append(resume_session_id) + + # NOTE: Codex CLI does not support a separate system prompt flag. + # In agentic mode, the harness owns the system prompt entirely - task-specific + # context (task_sys_msg) is included in the user prompt by the sampler. + # The system_prompt param here contains only operational instructions (AGENTIC_SYS_FORMAT) + # which we prepend to the user prompt since Codex has no system prompt mechanism. + full_prompt = user_prompt + if system_prompt: + full_prompt = f"{system_prompt}\n\n{user_prompt}" + + # Prevent the prompt from being interpreted as extra CLI options when it begins + # with '-' / '--' (e.g. "--sandbox host") by terminating option parsing. + cmd.append("--") + cmd.append(full_prompt) + + start_time = time.monotonic() + events_emitted = 0 + + # Token estimation for cost tracking (Codex CLI doesn't emit usage data) + estimated_input_tokens = len(full_prompt) // 4 if full_prompt else 0 + estimated_output_tokens = 0 + # Model priority: extra_cli_config["model"] > profile > FAIL + # We intentionally fail instead of silently falling back to an old model + model_name = extra_cli_config.get("model") or profile + if not model_name: + raise CodexExecutionError( + "No model configured for Codex CLI. 
" + "Set evo_config.agentic.extra_cli_config.model or evo_config.agentic.cli_profile. " + "Example: evo_config.agentic.extra_cli_config.model=gpt-4.1" + ) + session_id: Optional[str] = None + + env = dict(os.environ) + if auth_method == "api_key" and api_key: + env["OPENAI_API_KEY"] = api_key + + process = subprocess.Popen( + cmd, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + + try: + if not process.stdout: + raise CodexExecutionError("Codex CLI did not provide stdout pipe.") + + while True: + if max_seconds > 0 and time.monotonic() - start_time > max_seconds: + process.kill() + raise CodexExecutionError( + f"Codex task exceeded {max_seconds}s timeout." + ) + + line = process.stdout.readline() + if not line: + if process.poll() is not None: + break + time.sleep(0.05) + continue + + line = line.strip() + if not line: + continue + + try: + event = json.loads(line) + except json.JSONDecodeError as exc: # pragma: no cover - defensive + raise CodexExecutionError( + f"Failed to parse Codex event: {line}" + ) from exc + + events_emitted += 1 + if max_events and events_emitted > max_events: + # Don't kill immediately - let this event finish and break gracefully + logger.warning( + f"Codex emitted {events_emitted} events (max: {max_events}) - " + "stopping gracefully with results collected so far" + ) + process.kill() + break # Exit loop gracefully instead of raising error + + if isinstance(event, dict): + extracted_sid = extract_session_id(event) + if extracted_sid: + session_id = extracted_sid + + # Track output content for token estimation + content = event.get("content") or event.get("text") or "" + # Also check nested message content + msg = event.get("message") + if isinstance(msg, dict): + msg_content = msg.get("content") + if isinstance(msg_content, str): + content = msg_content + elif isinstance(msg_content, list): + # Handle content blocks + for block in msg_content: + if isinstance(block, dict) and block.get("type") == "text": 
+ content += block.get("text", "") + + if isinstance(content, str) and content: + estimated_output_tokens += len(content) // 4 + + yield event + + # Emit usage event at session end + total_tokens = estimated_input_tokens + estimated_output_tokens + yield { + "type": "usage", + "session_id": session_id, + "usage": { + "input_tokens": estimated_input_tokens, + "output_tokens": estimated_output_tokens, + "total_tokens": total_tokens, + "total_cost_usd": calculate_cost( + model_name, + estimated_input_tokens, + estimated_output_tokens, + "codex", + ), + }, + "model": model_name, + } + + returncode = process.wait(timeout=1) + if returncode != 0: + stderr_out = process.stderr.read() if process.stderr else "" + # Don't fail if we have actual results (events processed) + # Exit code 1 can happen for benign reasons (e.g., hit max_turns) + if events_emitted > 0: + logger.warning( + f"Codex CLI exited with status {returncode} but produced " + f"{events_emitted} events - continuing with results" + ) + else: + raise CodexExecutionError( + f"Codex CLI exited with status {returncode}: {stderr_out.strip()}" + ) + finally: + if process.poll() is None: + try: + process.kill() + except OSError: + pass + try: + process.wait(timeout=1) + except subprocess.TimeoutExpired: + pass diff --git a/shinka/edit/cost_utils.py b/shinka/edit/cost_utils.py new file mode 100644 index 000000000..6ae8b3439 --- /dev/null +++ b/shinka/edit/cost_utils.py @@ -0,0 +1,63 @@ +"""Cost calculation utilities for CLI backends. + +Provides shared cost calculation using pricing tables from shinka/llm/models/pricing.py. +Used by gemini_cli.py and codex_cli.py to calculate costs from estimated tokens. 
+""" + +import logging +from typing import Optional + +from shinka.llm.models.pricing import GEMINI_MODELS, OPENAI_MODELS + +logger = logging.getLogger(__name__) + +# Fallback rate when model pricing is unknown +# Set conservatively high so users notice something is wrong +FALLBACK_RATE_PER_TOKEN = 0.00001 # $10/1M tokens (high to be noticeable) + + +def calculate_cost( + model: Optional[str], + input_tokens: int, + output_tokens: int, + backend: str = "auto", +) -> float: + """Calculate cost from tokens using pricing tables. + + Args: + model: Model name (e.g., "gemini-2.5-flash", "gpt-4o"). + input_tokens: Number of input tokens (can be estimated). + output_tokens: Number of output tokens (can be estimated). + backend: Backend hint ("gemini", "codex", or "auto" to detect). + + Returns: + Estimated cost in USD. Returns fallback estimate with warning if model unknown. + """ + if not model: + logger.warning( + "No model specified for cost calculation - using fallback rate. " + "Cost estimate will be inaccurate. Configure model explicitly." + ) + return (input_tokens + output_tokens) * FALLBACK_RATE_PER_TOKEN + + # Try to find model in pricing tables + pricing = None + + if backend == "gemini": + pricing = GEMINI_MODELS.get(model) + elif backend == "codex": + pricing = OPENAI_MODELS.get(model) + else: + # Auto-detect: try both tables + pricing = GEMINI_MODELS.get(model) or OPENAI_MODELS.get(model) + + if not pricing: + logger.warning( + f"Model '{model}' not found in pricing tables (backend={backend}). " + f"Using fallback rate. Add model to shinka/llm/models/pricing.py." 
+ ) + return (input_tokens + output_tokens) * FALLBACK_RATE_PER_TOKEN + + return ( + input_tokens * pricing["input_price"] + output_tokens * pricing["output_price"] + ) diff --git a/shinka/edit/event_utils.py b/shinka/edit/event_utils.py new file mode 100644 index 000000000..9b39a551b --- /dev/null +++ b/shinka/edit/event_utils.py @@ -0,0 +1,42 @@ +"""Shared event utilities for agent backends.""" + +from typing import Any, Dict, Optional + + +def extract_session_id(event: Dict[str, Any]) -> Optional[str]: + """Extract session/thread ID from an agent event payload. + + Handles multiple event formats from different agent backends: + - thread.* events with thread_id (Codex CLI format) + - Direct session_id field (ShinkaAgent/Claude format) + - Nested session.id or session.session_id objects + + Args: + event: Event dictionary from agent backend. + + Returns: + Session ID string if found, None otherwise. + """ + if not isinstance(event, dict): + return None + + # Thread events (Codex CLI format) + event_type = event.get("type") + if isinstance(event_type, str) and event_type.startswith("thread."): + thread_id = event.get("thread_id") + if isinstance(thread_id, str) and thread_id: + return thread_id + + # Direct session_id field (ShinkaAgent/Claude format) + session_id = event.get("session_id") + if isinstance(session_id, str) and session_id: + return session_id + + # Nested session object + session_obj = event.get("session") + if isinstance(session_obj, dict): + candidate = session_obj.get("id") or session_obj.get("session_id") + if isinstance(candidate, str) and candidate: + return candidate + + return None diff --git a/shinka/edit/shinka_agent.py b/shinka/edit/shinka_agent.py new file mode 100644 index 000000000..770d3997c --- /dev/null +++ b/shinka/edit/shinka_agent.py @@ -0,0 +1,376 @@ +"""Native ShinkaAgent backend using shinka/llm/LLMClient. + +This module implements a native, model-agnostic agentic editing backend +that uses Shinka's existing LLM infrastructure. 
Unlike the CLI wrappers +(Codex, Gemini, Claude), ShinkaAgent runs entirely in-process, providing +full control over the agent loop and leveraging existing LLM ensembling. + +The design follows the mini-SWE-agent pattern: +- Single bash action per response (enforced via regex) +- Linear message history (no branching) +- subprocess.run() for action execution (stateless) +- Termination via magic output string + +Reference: https://github.com/SWE-agent/mini-swe-agent +""" + +from __future__ import annotations + +import logging +import os +import re +import subprocess +import time +import uuid +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Dict, Iterator, List, Optional + +from shinka.llm import LLMClient + +logger = logging.getLogger(__name__) + + +class ShinkaUnavailableError(RuntimeError): + """Raised when no LLM API keys are configured.""" + + +class ShinkaExecutionError(RuntimeError): + """Raised when the agent loop fails or times out.""" + + +# Regex to extract bash code block (trailing newline optional for robustness) +ACTION_RE = re.compile(r"```bash\s*\n(.*?)(?:\n)?```", re.DOTALL) + +# System prompt for bash-only agent +SHINKA_SYSTEM_PROMPT = """You are an expert software engineer working inside a sandboxed repository. + +IMPORTANT RULES: +1. You can ONLY interact via bash commands in ```bash...``` blocks +2. You can include multiple bash blocks per response - all will be executed in order +3. Only edit code between EVOLVE-BLOCK-START and EVOLVE-BLOCK-END markers +4. Use standard tools: cat, sed, echo, python, etc. +5. Keep responses concise - avoid lengthy explanations + +When your task is complete, include this exact text in your response: +COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT + +Example response: +I'll read the current file first. +```bash +cat main.py +``` + +After seeing the output, make targeted edits to improve the score. 
# Observation template
OBSERVATION_TEMPLATE = """OBSERVATION:
Exit code: {exit_code}
{output}"""

# Max characters for observation to avoid context overflow
MAX_OBSERVATION_CHARS = 16000


def ensure_shinka_available() -> bool:
    """Check that at least one LLM provider API key is configured.

    Environment variables are checked first. If none is set, each provider is
    looked up via ``get_api_key``; the first key found is exported into
    ``os.environ`` under that provider's variable name.

    Returns:
        ``True`` when a key is available.

    Raises:
        ShinkaUnavailableError: If no key is found in either place.
    """
    from shinka.tools.credentials import PROVIDER_ENV_VAR_MAP, get_api_key

    # Sorted so the error message below is stable across runs.
    env_vars = sorted(set(PROVIDER_ENV_VAR_MAP.values()))

    # Check environment variables
    if any(os.environ.get(name) for name in env_vars):
        return True

    # Check credential store
    for provider, env_var in PROVIDER_ENV_VAR_MAP.items():
        if key := get_api_key(provider):
            os.environ[env_var] = key
            return True

    raise ShinkaUnavailableError(
        "No LLM API keys found. Set one of: " + ", ".join(env_vars)
    )


def _truncate_output(text: str, max_chars: int = MAX_OBSERVATION_CHARS) -> str:
    """Truncate ``text`` to about ``max_chars`` characters of payload.

    Text that already fits is returned unchanged. Otherwise the head and tail
    are kept and the middle is replaced by a marker stating exactly how many
    characters were dropped. For an odd ``max_chars`` the head keeps the extra
    character.

    Args:
        text: Output to shorten.
        max_chars: Maximum number of original characters to keep; negative
            values are treated as 0.

    Returns:
        The original text, or ``head + marker + tail``.
    """
    limit = max(max_chars, 0)
    if len(text) <= limit:
        return text

    head_len = (limit + 1) // 2
    tail_len = limit // 2
    omitted = len(text) - head_len - tail_len
    # Slice from an explicit start index: text[-0:] would be the whole string.
    tail = text[len(text) - tail_len:]
    return f"{text[:head_len]}\n... [truncated {omitted} chars] ...\n{tail}"
# Upper bound (seconds) for a single bash command issued by the agent.
_COMMAND_TIMEOUT_SECONDS = 120


def _execute_bash(
    command: str, cwd: Path, timeout: int = _COMMAND_TIMEOUT_SECONDS
) -> tuple[int, str, str]:
    """Execute a bash command and return ``(exit_code, stdout, stderr)``.

    Never raises: every failure is reported through the returned tuple.

    Args:
        command: Shell source to run. Blank commands are skipped.
        cwd: Working directory; must be an existing directory.
        timeout: Seconds to wait before the command is killed.

    Returns:
        ``(exit_code, stdout, stderr)``. Exit code is 0 for a skipped blank
        command and 1 for an invalid directory, a timeout, or a launch error;
        otherwise it is the command's own return code.
    """
    import shutil  # local import: not among this module's top-level imports

    # Skip empty commands
    if not command.strip():
        return 0, "", "(empty command skipped)"

    # Validate workdir exists and is directory
    if not cwd.exists() or not cwd.is_dir():
        return 1, "", f"Invalid working directory: {cwd}"

    # shell=True runs /bin/sh on POSIX, which is often not bash. The agent is
    # told to write bash, so prefer a real bash when one is on PATH.
    bash_path = shutil.which("bash") if os.name == "posix" else None

    def _as_text(stream: Any) -> str:
        # TimeoutExpired may carry bytes even in text mode, or None.
        if stream is None:
            return ""
        if isinstance(stream, bytes):
            return stream.decode("utf-8", errors="replace")
        return stream

    try:
        result = subprocess.run(
            command,
            shell=True,
            executable=bash_path,
            cwd=str(cwd),
            capture_output=True,
            text=True,
            # Binary output must not abort the call with UnicodeDecodeError.
            errors="replace",
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as exc:
        # Keep whatever the command printed before it was killed.
        notice = f"Command timed out after {timeout}s"
        partial_err = _as_text(exc.stderr)
        stderr = f"{partial_err}\n{notice}" if partial_err else notice
        return 1, _as_text(exc.stdout), stderr
    except Exception as e:  # deliberate best-effort: report, never raise
        return 1, "", str(e)
    return result.returncode, result.stdout, result.stderr


def run_shinka_task(
    user_prompt: str,
    workdir: Path,
    *,
    system_prompt: Optional[str] = None,
    profile: Optional[str],
    sandbox: str,
    approval_mode: str,
    max_seconds: int,
    max_events: int,
    extra_cli_config: Dict[str, Any],
    codex_path: Optional[str] = None,
    cli_path: Optional[str] = None,  # Alias for codex_path (unused for ShinkaAgent)
    resume_session_id: Optional[str] = None,
    session_kind: str = "unknown",
    # Metadata params for session registry tracking
    parent_id: Optional[str] = None,
    generation: Optional[int] = None,
    patch_type: Optional[str] = None,
    results_dir: Optional[str] = None,
) -> Iterator[Dict[str, Any]]:
    """Execute a ShinkaAgent task and stream JSON-style events.

    Runs an in-process agent loop: query the LLM, execute every ```bash```
    block found in the reply (in order), feed the observations back, and stop
    on the termination marker, the time limit, the turn limit, or an empty
    LLM response.

    This is a generator, so nothing runs (including the API-key check) until
    the first event is requested.

    Args:
        user_prompt: Natural language instruction for the agent.
        workdir: Workspace directory in which commands are executed.
        system_prompt: Optional instructions prepended to the built-in
            SHINKA_SYSTEM_PROMPT.
        profile: Model name used when ``extra_cli_config["model"]`` is unset.
        sandbox: Not used by this function.
        approval_mode: Not used by this function.
        max_seconds: Wall-clock budget for the session; ``<= 0`` disables it.
            It is checked before each turn and also caps each command's
            timeout.
        max_events: Maximum number of LLM turns; ``<= 0`` disables the limit.
        extra_cli_config: Reads ``model``, ``temperature``, ``max_tokens`` and
            ``reasoning_efforts``.
        codex_path: Not used by this function.
        cli_path: Not used by this function.
        resume_session_id: Reused as the session ID when given; no prior
            history is restored.
        session_kind: Not used by this function.
        parent_id: Not used by this function.
        generation: Not used by this function.
        patch_type: Not used by this function.
        results_dir: Not used by this function.

    Yields:
        Event dictionaries, each carrying ``session_id``:
        - ``init``: session start with model and UTC timestamp
        - ``agent_message``: LLM response text or a bracketed status notice
        - ``command_execution``: result of one bash command
        - ``usage``: token/cost totals, emitted once at the end

    Raises:
        ShinkaUnavailableError: If no API keys are configured.
        ShinkaExecutionError: If no model is configured.
    """
    ensure_shinka_available()

    session_id = resume_session_id or str(uuid.uuid4())
    start_time = time.monotonic()

    # Priority: extra_cli_config["model"] > profile > FAIL.
    # Fail loudly instead of silently falling back to some default model.
    model_name = extra_cli_config.get("model") or profile
    if not model_name:
        raise ShinkaExecutionError(
            "No model configured for ShinkaAgent. "
            "Set evo_config.agentic.extra_cli_config.model or evo_config.agentic.cli_profile. "
            "Example: evo_config.agentic.extra_cli_config.model=gpt-4.1"
        )
    model_names = [model_name] if isinstance(model_name, str) else list(model_name)

    # Map config keys onto LLMClient kwargs ('temperature' -> 'temperatures').
    llm_kwargs: Dict[str, Any] = {}
    if "temperature" in extra_cli_config:
        llm_kwargs["temperatures"] = extra_cli_config["temperature"]
    if "max_tokens" in extra_cli_config:
        llm_kwargs["max_tokens"] = extra_cli_config["max_tokens"]
    # Default to "auto" (no thinking tokens) in agentic mode; per the original
    # author's note this avoids empty/None content from reasoning models.
    llm_kwargs["reasoning_efforts"] = extra_cli_config.get("reasoning_efforts", "auto")

    llm = LLMClient(model_names=model_names, verbose=False, **llm_kwargs)

    # SHINKA_SYSTEM_PROMPT defines how the agent operates (bash-only; several
    # blocks per response are allowed and all are executed). Any caller
    # supplied system prompt is prepended to it.
    base_system = SHINKA_SYSTEM_PROMPT
    if system_prompt:
        base_system = f"{system_prompt}\n\n{SHINKA_SYSTEM_PROMPT}"

    # Linear message history for the multi-turn conversation
    messages: List[Dict[str, str]] = []

    # Cost tracking
    total_input_tokens = 0
    total_output_tokens = 0
    total_cost = 0.0

    def _agent_message(text: str) -> Dict[str, Any]:
        # Single place that shapes agent_message events.
        return {
            "type": "agent_message",
            "item": {"type": "agent_message", "text": text},
            "session_id": session_id,
        }

    # Emit init event
    yield {
        "type": "init",
        "session_id": session_id,
        "model": model_names[0],
        "timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
    }

    current_msg = user_prompt
    turn_count = 0

    while True:
        # Check time limit
        elapsed = time.monotonic() - start_time
        if max_seconds > 0 and elapsed > max_seconds:
            yield _agent_message(f"[Session timed out after {elapsed:.1f}s]")
            break

        # Check turn limit
        turn_count += 1
        if max_events > 0 and turn_count > max_events:
            yield _agent_message(f"[Session reached max turns: {max_events}]")
            break

        # Query LLM
        llm_call_kwargs = llm.get_kwargs()
        response = llm.query(
            msg=current_msg,
            system_msg=base_system,
            msg_history=messages,
            llm_kwargs=llm_call_kwargs,
        )

        if response is None or response.content is None:
            yield _agent_message("[LLM returned empty response]")
            break

        # Track costs using the values reported on the response
        total_cost += response.cost or 0.0
        total_input_tokens += response.input_tokens or 0
        total_output_tokens += response.output_tokens or 0

        # Update message history
        messages.append({"role": "user", "content": current_msg})
        messages.append({"role": "assistant", "content": response.content})

        yield _agent_message(response.content)

        # Parse ALL bash blocks; they are executed before termination is
        # honoured because some models emit edits and the marker together.
        action_matches = list(ACTION_RE.finditer(response.content))
        has_termination = "COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT" in response.content

        observations = []
        for action_match in action_matches:
            command = action_match.group(1).strip()

            # Never let one command run past the session's remaining budget
            # (at least 1s so the command gets a chance to start).
            command_timeout = _COMMAND_TIMEOUT_SECONDS
            if max_seconds > 0:
                remaining = max_seconds - (time.monotonic() - start_time)
                command_timeout = max(
                    1, min(_COMMAND_TIMEOUT_SECONDS, int(remaining))
                )

            exit_code, stdout, stderr = _execute_bash(
                command, workdir, timeout=command_timeout
            )

            # Format observation for the next LLM turn
            output = _truncate_output(stdout + stderr)
            observations.append(
                OBSERVATION_TEMPLATE.format(
                    exit_code=exit_code,
                    output=output or "(no output)",
                )
            )

            # Emit command execution event
            yield {
                "type": "command_execution",
                "item": {
                    "type": "command_execution",
                    "command": command,
                    "status": "success" if exit_code == 0 else "error",
                    "exit_code": exit_code,
                    "stdout": _truncate_output(stdout, 8000),
                    "stderr": _truncate_output(stderr, 8000),
                },
                "session_id": session_id,
            }

        # Combine all observations for next message
        if observations:
            current_msg = "\n\n".join(observations)

        # Check for termination AFTER executing any bash commands
        if has_termination:
            # Re-measure: `elapsed` above predates the LLM call and commands.
            logger.info(
                "ShinkaAgent completed task in %d turns, %.1fs, cost=$%.4f",
                turn_count,
                time.monotonic() - start_time,
                total_cost,
            )
            break

        # If no bash action and no termination, prompt for one
        if not action_matches:
            current_msg = (
                "Please provide a bash command in ```bash...``` block, "
                "or say COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT if done."
            )

    # Emit usage event at end
    yield {
        "type": "usage",
        "session_id": session_id,
        "usage": {
            "input_tokens": total_input_tokens,
            "output_tokens": total_output_tokens,
            "total_tokens": total_input_tokens + total_output_tokens,
            "total_cost_usd": total_cost,
        },
    }
@dataclass
class AgenticEvaluatorResult:
    """Structured output from an agentic evaluation session."""

    # Parsed metrics JSON object (or a fallback dict when it was malformed).
    metrics: Dict[str, Any]
    # Correctness verdict read from metrics.json, else from correct.json.
    correct: bool
    # Explanation recorded when the submission was not marked correct.
    error_message: Optional[str]
    # Newline-joined stdout / stderr of every command the agent ran.
    stdout_log: str
    stderr_log: str
    # Text of each agent_message event, in order.
    session_log: List[str]
    # One entry per command_execution event.
    commands_run: List["CommandResult"]
    # JSONL file that received every dict event as it arrived.
    session_log_path: Path
    session_events: List[Dict[str, Any]]
    # First session/thread ID found in the event stream, if any.
    session_id: Optional[str]
    session_dir: Path
    # Wall-clock duration of the runner loop.
    elapsed_seconds: float
    # Prompts used for evaluation (for debugging/UI display)
    system_prompt: Optional[str] = None
    user_prompt: Optional[str] = None


class AgenticEvaluator:
    """Drive an agent-based evaluator from the repository root."""

    def __init__(
        self,
        config: "AgenticEvaluatorConfig",
        *,
        codex_runner: Optional["AgentRunner"] = None,
        agent_runner: Optional["AgentRunner"] = None,  # Alias for codex_runner
    ) -> None:
        """Store the config and pick the runner.

        Args:
            config: Evaluator settings; ``evaluate`` reads ``cli_profile``,
                ``sandbox``, ``approval_mode``, ``max_seconds``,
                ``max_events``, ``extra_cli_config`` and ``cli_path`` from it.
            codex_runner: Runner callable; takes precedence when given.
            agent_runner: Alias accepted for backward compatibility.
                ``run_codex_task`` is used when neither is supplied.
        """
        self.config = config
        self.codex_runner = codex_runner or agent_runner or run_codex_task

    @staticmethod
    def _coerce_flag(value: Any) -> bool:
        """Interpret a JSON value as a boolean flag.

        Strings are matched case-insensitively so that an agent writing
        ``"false"`` or ``"no"`` is not treated as true (``bool("false")`` is
        ``True``). All other values use ordinary truthiness.
        """
        if isinstance(value, str):
            falsy = {"", "false", "0", "no", "n", "none", "null"}
            return value.strip().lower() not in falsy
        return bool(value)

    @staticmethod
    def _resolve(base: Path, candidate: Path) -> Path:
        """Return ``candidate`` unchanged if absolute, else relative to ``base``."""
        return candidate if candidate.is_absolute() else base / candidate

    def evaluate(
        self,
        *,
        repo_root: Path,
        eval_command: Sequence[str],
        program_path: Path,
        results_path: Path,
        metrics_path: Path,
        eval_sessions_root: Path,
        task_name: str,
        results_dir: Optional[str] = None,
        eval_prompt: Optional[str] = None,
        max_score: float = 100.0,
    ) -> AgenticEvaluatorResult:
        """Run one evaluation session and collect its metrics.

        Creates a fresh session directory under ``eval_sessions_root``,
        streams the runner's events into ``session_log.jsonl`` there, then
        reads the metrics file the agent is expected to have written.

        Args:
            repo_root: Working directory for the runner; relative
                ``metrics_path`` / ``results_path`` are resolved against it.
            eval_command: Command the agent should run; empty means the agent
                judges the code directly.
            program_path: Program under evaluation (used in the prompt).
            results_path: Directory checked for a legacy ``correct.json``.
            metrics_path: Location of the metrics JSON file.
            eval_sessions_root: Parent directory for per-session folders.
            task_name: Task label used in the prompt.
            results_dir: Forwarded to the runner unchanged.
            eval_prompt: Optional custom evaluation criteria.
            max_score: Score that represents a perfect result.

        Returns:
            An ``AgenticEvaluatorResult``. Malformed JSON is reported through
            a fallback metrics dict rather than an exception.

        Raises:
            CodexExecutionError: If no metrics file exists after the session.
        """
        session_uuid = uuid.uuid4().hex
        session_dir = eval_sessions_root / session_uuid
        session_dir.mkdir(parents=True, exist_ok=True)
        session_log_path = session_dir / "session_log.jsonl"

        user_prompt, system_prompt = self._build_prompt(
            task_name=task_name,
            eval_command=eval_command,
            program_path=program_path,
            results_path=results_path,
            metrics_path=metrics_path,
            eval_prompt=eval_prompt,
            max_score=max_score,
        )

        session_log: List[str] = []
        commands: List[CommandResult] = []
        session_events: List[Dict[str, Any]] = []
        resolved_session_id: Optional[str] = None

        start_time = time.monotonic()
        with session_log_path.open("w", encoding="utf-8") as handle:
            # NOTE(review): cli_path and results_dir are passed here but are
            # not part of the AgentRunner protocol signature; confirm every
            # runner in use accepts them.
            for event in self.codex_runner(
                user_prompt=user_prompt,
                system_prompt=system_prompt,
                workdir=repo_root,
                profile=self.config.cli_profile,
                sandbox=self.config.sandbox,
                approval_mode=self.config.approval_mode,
                max_seconds=self.config.max_seconds,
                max_events=self.config.max_events,
                extra_cli_config=self.config.extra_cli_config,
                cli_path=self.config.cli_path,
                session_kind="eval",
                results_dir=results_dir,
            ):
                if not isinstance(event, dict):
                    continue

                json.dump(event, handle)
                handle.write("\n")
                handle.flush()  # Flush for real-time visibility
                session_events.append(event)
                if resolved_session_id is None:
                    resolved_session_id = extract_session_id(event)

                item = event.get("item")
                # Ignore events without a well-formed item payload.
                if not item or not isinstance(item, dict):
                    continue

                item_type = item.get("type")
                if item_type == "agent_message":
                    text = item.get("text")
                    if text:
                        session_log.append(text)
                elif item_type == "command_execution":
                    commands.append(
                        CommandResult(
                            command=item.get("command"),
                            status=item.get("status"),
                            exit_code=item.get("exit_code"),
                            stdout=item.get("stdout"),
                            stderr=item.get("stderr"),
                        )
                    )
        elapsed = time.monotonic() - start_time

        # metrics_path is relative to repo_root, not the process CWD.
        metrics_absolute = self._resolve(repo_root, metrics_path)
        if not metrics_absolute.exists():
            raise CodexExecutionError(
                f"Agentic evaluator did not produce metrics at {metrics_path}"
            )

        # Parse metrics with error handling for malformed JSON
        try:
            metrics = json.loads(metrics_absolute.read_text(encoding="utf-8"))
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse metrics.json: {e}")
            metrics = {"error": f"Invalid JSON in metrics: {e}", "combined_score": 0}
        if not isinstance(metrics, dict):
            # Valid JSON but not an object (e.g. a list or a bare number).
            kind = type(metrics).__name__
            logger.error("metrics.json must contain a JSON object, got %s", kind)
            metrics = {
                "error": f"Invalid metrics: expected JSON object, got {kind}",
                "combined_score": 0,
            }

        # Read 'correct' from metrics.json (consolidated schema);
        # fall back to correct.json for backward compatibility.
        if "correct" in metrics:
            correct_flag = self._coerce_flag(metrics.get("correct", False))
            error_msg = metrics.get("details") if not correct_flag else None
        else:
            correct_payload: Dict[str, Any] = {}
            correct_file = self._resolve(repo_root, results_path) / "correct.json"
            if correct_file.exists():
                try:
                    correct_payload = json.loads(
                        correct_file.read_text(encoding="utf-8")
                    )
                except json.JSONDecodeError as e:
                    logger.error(f"Failed to parse correct.json: {e}")
                    correct_payload = {"correct": False, "error": f"Invalid JSON: {e}"}
                if not isinstance(correct_payload, dict):
                    logger.error("correct.json must contain a JSON object")
                    correct_payload = {
                        "correct": False,
                        "error": "Invalid correct.json: expected JSON object",
                    }
            correct_flag = self._coerce_flag(correct_payload.get("correct", False))
            error_msg = correct_payload.get("error")

        stdout_log = "\n".join(cmd.stdout for cmd in commands if cmd.stdout)
        stderr_log = "\n".join(cmd.stderr for cmd in commands if cmd.stderr)

        # Keep an agent-provided timing if present.
        metrics.setdefault("evaluation_time_seconds", elapsed)

        return AgenticEvaluatorResult(
            metrics=metrics,
            correct=correct_flag,
            error_message=error_msg,
            stdout_log=stdout_log,
            stderr_log=stderr_log,
            session_log=session_log,
            commands_run=commands,
            session_log_path=session_log_path,
            session_events=session_events,
            session_id=resolved_session_id,
            session_dir=session_dir,
            elapsed_seconds=elapsed,
            system_prompt=system_prompt,
            user_prompt=user_prompt,
        )

    def _build_prompt(
        self,
        *,
        task_name: str,
        eval_command: Sequence[str],
        program_path: Path,
        results_path: Path,
        metrics_path: Path,
        eval_prompt: Optional[str] = None,
        max_score: float = 100.0,
    ) -> tuple[str, str]:
        """Build the ``(user_prompt, system_prompt)`` pair for a session.

        With an ``eval_command`` the user prompt comes from
        ``AGENTIC_EVAL_USER``; without one, an inline template asks the agent
        to judge the code and write the metrics file itself. The system
        prompt is always ``AGENTIC_EVAL_SYS`` formatted with ``max_score``.
        Both are stripped of surrounding whitespace.
        """
        # Build evaluation criteria section if custom prompt provided
        eval_criteria = ""
        if eval_prompt:
            eval_criteria = f"\nEvaluation criteria:\n{eval_prompt.strip()}\n"

        # Program directory is the parent of the program file
        program_dir = Path(program_path).parent

        if eval_command:
            # A plain string is already a full command line; joining it
            # would put a space between every character.
            if isinstance(eval_command, str):
                command_str = eval_command
            else:
                command_str = " ".join(eval_command)
            user = AGENTIC_EVAL_USER.format(
                task_name=task_name,
                eval_command=command_str,
                program_dir=program_dir,
                program_path=program_path,
                results_path=results_path,
                metrics_path=metrics_path,
                max_score=max_score,
                eval_criteria=eval_criteria,
            )
        else:
            # No eval command - LLM judges the code directly
            user = f"""# Evaluation Task (no script provided)

- Task: {task_name}
- Working directory: repository root
- Program path: {program_path}
- Results path: {results_path}
- Metrics JSON: {metrics_path}
- Max score: {max_score}

No evaluation command was supplied.
1) Inspect the workspace/program as needed.
2) Judge the submission against the evaluation criteria below.
3) Write a single JSON file at the metrics path with this schema:
   {{"combined_score": <float>, "correct": <bool>, "details": <string>}}.
   - combined_score: How well the code performed
   - correct: true if code runs without critical errors (be generous for open-ended tasks)
   - details: Brief explanation of score and any issues
   You may add more fields if useful.
4) If you cannot score, still create the file with fallback values (score=0, correct=false).
{eval_criteria}
Finish after metrics.json is written.
"""

        return user.strip(), AGENTIC_EVAL_SYS.format(max_score=max_score).strip()
50.0, z, np.log(np.expm1(z))) + with np.errstate(divide='ignore', invalid='ignore'): + return np.where(z > 50.0, z, np.log(np.expm1(z))) class BanditBase(ABC): @@ -433,12 +434,13 @@ def decay(self, factor: float) -> None: if self.use_exponential_scaling and self.asymmetric_scaling: # shrink in exp space to match original score scale s = self.s - log1p_term = np.where( - s > 0.0, - s + np.log(one_minus_factor + np.exp(-s)), - np.log1p(one_minus_factor * np.exp(s)), - ) - self.s = s + np.log(factor) - log1p_term + with np.errstate(divide='ignore', invalid='ignore'): + log1p_term = np.where( + s > 0.0, + s + np.log(one_minus_factor + np.exp(-s)), + np.log1p(one_minus_factor * np.exp(s)), + ) + self.s = s + np.log(factor) - log1p_term if self.adaptive_scale and np.isfinite(self._obs_max): means_log = self._mean() diff --git a/shinka/llm/embedding.py b/shinka/llm/embedding.py index a5c6b07cc..d6b2fbd65 100644 --- a/shinka/llm/embedding.py +++ b/shinka/llm/embedding.py @@ -1,9 +1,11 @@ +import logging import os +from typing import List, Optional, Tuple, Union + +import google.generativeai as genai +import numpy as np import openai import pandas as pd -from typing import Union, List, Optional, Tuple -import numpy as np -import logging logger = logging.getLogger(__name__) @@ -20,13 +22,23 @@ "azure-text-embedding-3-large", ] +GEMINI_EMBEDDING_MODELS = [ + "gemini-embedding-exp-03-07", + "gemini-embedding-001", +] + OPENAI_EMBEDDING_COSTS = { "text-embedding-3-small": 0.02 / M, "text-embedding-3-large": 0.13 / M, } +# Gemini embedding costs (approximate - check current pricing) +GEMINI_EMBEDDING_COSTS = { + "gemini-embedding-exp-03-07": 0.0 / M, # Experimental model, often free + "gemini-embedding-001": 0.0 / M, # Check current pricing +} -def get_client_model(model_name: str) -> tuple[openai.OpenAI, str]: +def get_client_model(model_name: str) -> tuple[Union[openai.OpenAI, str], str]: if model_name in OPENAI_EMBEDDING_MODELS: client = openai.OpenAI() model_to_use = 
model_name @@ -38,6 +50,14 @@ def get_client_model(model_name: str) -> tuple[openai.OpenAI, str]: api_version=os.getenv("AZURE_API_VERSION"), azure_endpoint=os.getenv("AZURE_API_ENDPOINT"), ) + elif model_name in GEMINI_EMBEDDING_MODELS: + # Configure Gemini API + api_key = os.getenv("GEMINI_API_KEY") + if not api_key: + raise ValueError("GEMINI_API_KEY environment variable not set for Gemini models") + genai.configure(api_key=api_key) + client = "gemini" # Use string identifier for Gemini + model_to_use = model_name else: raise ValueError(f"Invalid embedding model: {model_name}") @@ -52,9 +72,10 @@ def __init__( Initialize the EmbeddingClient. Args: - model (str): The OpenAI embedding model name to use. + model (str): The OpenAI, Azure, or Gemini embedding model name to use. """ self.client, self.model = get_client_model(model_name) + self.model_name = model_name self.verbose = verbose def get_embedding( @@ -76,18 +97,60 @@ def get_embedding( single_code = True else: single_code = False + # Handle Gemini models + if self.model_name in GEMINI_EMBEDDING_MODELS: + try: + embeddings = [] + total_tokens = 0 + + for text in code: + result = genai.embed_content( + model=f"models/{self.model}", + content=text, + task_type="retrieval_document" + ) + embeddings.append(result['embedding']) + total_tokens += len(text.split()) + + cost_per_token = GEMINI_EMBEDDING_COSTS.get(self.model) + if cost_per_token is None: + logger.warning( + f"Gemini embedding model '{self.model}' not in pricing table. " + "Using 0 cost. Add to GEMINI_EMBEDDING_COSTS if needed." 
+ ) + cost_per_token = 0.0 + cost = total_tokens * cost_per_token + + if single_code: + return embeddings[0] if embeddings else [], cost + else: + return embeddings, cost + except Exception as e: + logger.warning(f"Gemini embedding failed for model '{self.model}': {e}") + if single_code: + return [], 0.0 + else: + return [[]], 0.0 + # Handle OpenAI and Azure models (same interface) try: response = self.client.embeddings.create( model=self.model, input=code, encoding_format="float" ) - cost = response.usage.total_tokens * OPENAI_EMBEDDING_COSTS[self.model] + cost_per_token = OPENAI_EMBEDDING_COSTS.get(self.model) + if cost_per_token is None: + logger.warning( + f"OpenAI embedding model '{self.model}' not in pricing table. " + "Using 0 cost. Add to OPENAI_EMBEDDING_COSTS if needed." + ) + cost_per_token = 0.0 + cost = response.usage.total_tokens * cost_per_token # Extract embedding from response if single_code: return response.data[0].embedding, cost else: return [d.embedding for d in response.data], cost except Exception as e: - logger.info(f"Error getting embedding: {e}") + logger.warning(f"OpenAI/Azure embedding failed for model '{self.model}': {e}") if single_code: return [], 0.0 else: @@ -458,8 +521,8 @@ def plot_3d_scatter( patch_type: Optional[list] = None, ): import matplotlib.pyplot as plt - from matplotlib.lines import Line2D from matplotlib.colors import ListedColormap + from matplotlib.lines import Line2D # Create figure and 3D axes with adjusted size and spacing fig = plt.figure(figsize=(8, 6)) diff --git a/shinka/llm/models/gemini.py b/shinka/llm/models/gemini.py index 1730fbaec..3ac7bda3d 100644 --- a/shinka/llm/models/gemini.py +++ b/shinka/llm/models/gemini.py @@ -58,23 +58,17 @@ def query_gemini( else: raise ValueError("Gemini does not support structured output.") + # Handle None content gracefully (can happen with reasoning models) + raw_content = text if text else "" + + # Extract thought if present thought_match = re.search( - r"(.*?)", 
response.choices[0].message.content, re.DOTALL + r"(.*?)", raw_content, re.DOTALL ) - thought = thought_match.group(1) if thought_match else "" - content_match = re.search( - r"(.*?)", response.choices[0].message.content, re.DOTALL - ) - if content_match: - # Extract everything before and after the tag as content - content = ( - response.choices[0].message.content[: content_match.start()] - + response.choices[0].message.content[content_match.end() :] - ).strip() - else: - content = response.choices[0].message.content + # Content is everything outside thought tags + content = re.sub(r".*?", "", raw_content, flags=re.DOTALL).strip() input_cost = GEMINI_MODELS[model]["input_price"] * response.usage.prompt_tokens output_tokens = response.usage.total_tokens - response.usage.prompt_tokens diff --git a/shinka/llm/models/openai.py b/shinka/llm/models/openai.py index a966b2a94..1d6e0a305 100644 --- a/shinka/llm/models/openai.py +++ b/shinka/llm/models/openai.py @@ -48,10 +48,21 @@ def query_openai( ], **kwargs, ) + # Handle None response.output defensively + if response.output is None or len(response.output) == 0: + raise ValueError( + f"OpenAI model '{model}' returned empty output. " + "This model may not support the responses API or returned an invalid response." + ) try: content = response.output[0].content[0].text - except Exception: - # Reasoning models - ResponseOutputMessage + except (TypeError, IndexError, AttributeError): + # Reasoning models - ResponseOutputMessage (output[1] contains the text) + if len(response.output) < 2: + raise ValueError( + f"OpenAI model '{model}' returned unexpected response structure. " + f"Expected reasoning model format but got {len(response.output)} output items." 
+ ) content = response.output[1].content[0].text new_msg_history.append({"role": "assistant", "content": content}) else: diff --git a/shinka/llm/models/pricing.py b/shinka/llm/models/pricing.py index c9c101a2c..4c1df9b27 100644 --- a/shinka/llm/models/pricing.py +++ b/shinka/llm/models/pricing.py @@ -35,6 +35,15 @@ "input_price": 3.0 / M, "output_price": 15.0 / M, }, + "claude-sonnet-4-5-20250929": { + "input_price": 3.0 / M, + "output_price": 15.0 / M, + }, + # Claude Haiku 4.5 (Oct 2025) - $1/$5 per million tokens + "claude-haiku-4-5-20251001": { + "input_price": 1.0 / M, + "output_price": 5.0 / M, + }, } OPENAI_MODELS = { @@ -114,6 +123,20 @@ "input_price": 0.05 / M, "output_price": 0.4 / M, }, + "gpt-5.1": { + "input_price": 1.25 / M, + "output_price": 10.0 / M, + }, + # GPT-5.1 Codex Mini - optimized for agentic coding tasks + "gpt-5.1-codex-mini": { + "input_price": 0.75 / M, + "output_price": 3.0 / M, + }, + # GPT-5.2 pricing (Dec 2025) + "gpt-5.2": { + "input_price": 1.75 / M, + "output_price": 14.0 / M, + }, } @@ -141,6 +164,14 @@ "input_price": 0.1 / M, "output_price": 0.4 / M, }, + "gemini-3-pro-preview": { + "input_price": 2.0 / M, + "output_price": 12.0 / M, + }, + "gemini-3-flash-preview": { + "input_price": 0.5 / M, + "output_price": 3.0 / M, + }, } BEDROCK_MODELS = { @@ -171,11 +202,15 @@ "gpt-5", "gpt-5-mini", "gpt-5-nano", + "gpt-5.1", + "gpt-5.1-codex-mini", + "gpt-5.2", ] REASONING_CLAUDE_MODELS = [ "claude-3-7-sonnet-20250219", "claude-4-sonnet-20250514", + "claude-sonnet-4-5-20250929", ] REASONING_DEEPSEEK_MODELS = [ @@ -186,6 +221,8 @@ "gemini-2.5-pro", "gemini-2.5-flash", "gemini-2.5-flash-lite-preview-06-17", + "gemini-3-pro-preview", + "gemini-3-flash-preview", ] REASONING_AZURE_MODELS = [ diff --git a/shinka/llm/query.py b/shinka/llm/query.py index a7288df8e..9686fdf87 100644 --- a/shinka/llm/query.py +++ b/shinka/llm/query.py @@ -137,18 +137,14 @@ def sample_model_kwargs( r_effort = random.choice(reasoning_efforts) think_bool = r_effort 
!= "auto" if think_bool: - thinking_tokens = [ - t - for t in THINKING_TOKENS.values() - if t < kwargs_dict["max_tokens"] and t >= 1024 - ] + t = THINKING_TOKENS[r_effort] + thinking_tokens = t if t < kwargs_dict["max_tokens"] else 1024 + # Note: extra_body is passed directly to the API, not double-nested kwargs_dict["extra_body"] = { - "extra_body": { - "google": { - "thinking_config": { - "thinking_budget": random.choice(thinking_tokens), - "include_thoughts": True, - } + "google": { + "thinking_config": { + "thinking_budget": thinking_tokens, + "include_thoughts": True, } } } @@ -157,19 +153,17 @@ def sample_model_kwargs( REASONING_CLAUDE_MODELS + REASONING_BEDROCK_MODELS ): kwargs_dict["max_tokens"] = min(random.choice(max_tokens), 16384) - think_bool = random.choice(reasoning_efforts) != "auto" + r_effort = random.choice(reasoning_efforts) + think_bool = r_effort != "auto" if think_bool: # filter thinking tokens to be smaller than max_tokens # not auto THINKING_TOKENS - thinking_tokens = [ - t - for t in THINKING_TOKENS.values() - if t < kwargs_dict["max_tokens"] and t >= 1024 - ] + t = THINKING_TOKENS[r_effort] + thinking_tokens = t if t < kwargs_dict["max_tokens"] else 1024 # sample only from thinking tokens that are valid kwargs_dict["thinking"] = { "type": "enabled", - "budget_tokens": random.choice(thinking_tokens), + "budget_tokens": thinking_tokens, } else: @@ -192,12 +186,14 @@ def query( model_name: str, msg: str, system_msg: str, - msg_history: List = [], + msg_history: Optional[List] = None, output_model: Optional[BaseModel] = None, model_posteriors: Optional[Dict[str, float]] = None, **kwargs, ) -> QueryResult: """Query the LLM.""" + if msg_history is None: + msg_history = [] client, model_name = get_client_llm( model_name, structured_output=output_model is not None ) diff --git a/shinka/prompts/__init__.py b/shinka/prompts/__init__.py index bda20e4ef..99acdfb76 100644 --- a/shinka/prompts/__init__.py +++ b/shinka/prompts/__init__.py @@ -1,21 +1,15 
@@ +from .prompts_agentic import AGENTIC_ITER_MSG, AGENTIC_SYS_FORMAT +from .prompts_agentic_eval import AGENTIC_EVAL_SYS, AGENTIC_EVAL_USER from .prompts_base import ( + BASE_SYSTEM_MSG, construct_eval_history_msg, construct_individual_program_msg, - perf_str, format_text_feedback_section, - BASE_SYSTEM_MSG, -) -from .prompts_diff import DIFF_SYS_FORMAT, DIFF_ITER_MSG -from .prompts_full import ( - FULL_SYS_FORMAT_DEFAULT, - FULL_ITER_MSG, - FULL_SYS_FORMATS, -) -from .prompts_cross import ( - CROSS_SYS_FORMAT, - CROSS_ITER_MSG, - get_cross_component, + perf_str, ) +from .prompts_cross import CROSS_ITER_MSG, CROSS_SYS_FORMAT, get_cross_component +from .prompts_diff import DIFF_ITER_MSG, DIFF_SYS_FORMAT +from .prompts_full import FULL_ITER_MSG, FULL_SYS_FORMAT_DEFAULT, FULL_SYS_FORMATS from .prompts_init import INIT_SYSTEM_MSG, INIT_USER_MSG from .prompts_meta import ( META_STEP1_SYSTEM_MSG, @@ -51,4 +45,8 @@ "META_STEP3_USER_MSG", "NOVELTY_SYSTEM_MSG", "NOVELTY_USER_MSG", + "AGENTIC_SYS_FORMAT", + "AGENTIC_ITER_MSG", + "AGENTIC_EVAL_SYS", + "AGENTIC_EVAL_USER", ] diff --git a/shinka/prompts/prompts_agentic.py b/shinka/prompts/prompts_agentic.py new file mode 100644 index 000000000..1e0972859 --- /dev/null +++ b/shinka/prompts/prompts_agentic.py @@ -0,0 +1,29 @@ +"""Prompt fragments specialized for agentic editing sessions. + +IMPORTANT ARCHITECTURE NOTE: +In agentic mode, the CLI harness (Codex, Claude CLI, Gemini CLI) owns the system +prompt. These harnesses inject their own instructions for tool use, file editing, +and shell access. Shinka should NOT provide a system prompt - it would conflict +with or duplicate the harness's instructions. + +Instead, task context goes in the USER prompt as a "# Task" section. The harness +sees this as the user's request and applies its own system prompt with tool +instructions. +""" + +# Empty - CLI harness provides its own system prompt with tool/shell instructions. 
+# Do NOT add content here; it would conflict with harness prompts. +AGENTIC_SYS_FORMAT = "" + + +AGENTIC_ITER_MSG = """# Task + +{task_context} + +# Score + +{score_context} +{text_feedback_section} + +Explore the workspace and make improvements. When done, explain what you changed and why. +""" diff --git a/shinka/prompts/prompts_agentic_eval.py b/shinka/prompts/prompts_agentic_eval.py new file mode 100644 index 000000000..58a723866 --- /dev/null +++ b/shinka/prompts/prompts_agentic_eval.py @@ -0,0 +1,70 @@ +"""Prompt templates for agentic evaluation sessions. + +These prompts instruct the LLM evaluator to: +1. Run the evaluation command (if provided) +2. Write metrics.json with combined_score, correct, and details +3. Support custom evaluation criteria via eval_prompt +""" + +AGENTIC_EVAL_SYS = """ +You are an autonomous evaluator operating inside the repository workspace. Run +exact shell commands when provided, capture their outputs, and write the final +metrics to disk. Follow these rules: + +1) If an evaluation command is provided, execute it verbatim (except for simple + helpers like `mkdir -p` for missing directories). +2) Always ensure a metrics JSON file exists at the requested path. If it does + not exist yet, create it yourself. Required schema: + {{ + "combined_score": <float>, + "correct": <bool>, + "details": "<string>" + }} + - `combined_score`: How well the code performed (0 = failure, {max_score} = perfect) + - `correct`: Set to true if the code runs without critical errors and produces + reasonable output. Set to false if there are crashes, import errors, or + fundamental failures. For open-ended/creative tasks, be generous - if the + code works and does something meaningful, mark it correct. + - `details`: Brief explanation of the score and any issues encountered + You may add additional fields beyond these three required ones. 
+3) If the command fails or you cannot compute metrics, describe the issue inside + `<error>...</error>` and still emit metrics.json with + `combined_score: 0`, `correct: false`, and `details` explaining the failure. +4) Do not modify source files beyond what the evaluation command itself does. +""" + +AGENTIC_EVAL_USER = """ +# Evaluation Task + +- Task: {task_name} +- Working directory: repository root +- Program directory: {program_dir} +- Program path: {program_path} +- Results path: {results_path} +- Output metrics path: {metrics_path} +- Max score: {max_score} + +IMPORTANT: First change to the program directory, then run this command: + +``` +cd {program_dir} && {eval_command} +``` + +After it finishes, YOU MUST write YOUR evaluation results to `{metrics_path}` (NOT to +any existing metrics.json - you must write to the exact path shown above). + +Write this schema to {metrics_path}: +```json +{{ + "combined_score": <float>, + "correct": <bool>, + "details": "<string>" +}} +``` + +If the command fails, still write {metrics_path} with `combined_score: 0`, +`correct: false`, and describe the failure in `details`. Also wrap the error +in `<error>...</error>`. +{eval_criteria} +Stop ONLY after you have written the file at {metrics_path}. +""" diff --git a/shinka/tools/__init__.py b/shinka/tools/__init__.py new file mode 100644 index 000000000..c4273ee73 --- /dev/null +++ b/shinka/tools/__init__.py @@ -0,0 +1 @@ +"""Utility scripts and helpers for Shinka.""" diff --git a/shinka/tools/credentials.py b/shinka/tools/credentials.py new file mode 100644 index 000000000..18afe0568 --- /dev/null +++ b/shinka/tools/credentials.py @@ -0,0 +1,134 @@ +"""Minimal credential helpers for Shinka. 
+ +This module provides a tiny, dependency-free way to load API keys from either: +1) Environment variables (preferred) +2) A local JSON credential store at ~/.shinka/credentials.json (optional) + +The intent is to reduce workflow friction for running CLI-backed agents while +keeping backward compatibility (no required setup) and avoiding accidental key +logging. +""" + +from __future__ import annotations + +import json +import logging +import os +from pathlib import Path +from typing import Any, Optional + +logger = logging.getLogger(__name__) + +DEFAULT_CREDENTIALS_PATH = Path.home() / ".shinka" / "credentials.json" + +# Provider -> canonical environment variable name. +# NOTE: Keep this mapping small and explicit. Callers can still pass a raw env +# var name to get_api_key() for other providers. +PROVIDER_ENV_VAR_MAP: dict[str, str] = { + "codex": "OPENAI_API_KEY", + "openai": "OPENAI_API_KEY", + "claude": "ANTHROPIC_API_KEY", + "anthropic": "ANTHROPIC_API_KEY", + "gemini": "GOOGLE_API_KEY", + "google": "GOOGLE_API_KEY", + "deepseek": "DEEPSEEK_API_KEY", +} + + +def _safe_get_str(mapping: Any, key: str) -> Optional[str]: + if not isinstance(mapping, dict): + return None + value = mapping.get(key) + if not isinstance(value, str): + return None + stripped = value.strip() + return stripped or None + + +def _load_credentials(path: Path) -> dict[str, Any]: + """Load the credentials JSON document, returning an empty dict on failure.""" + + try: + raw = path.read_text(encoding="utf-8") + except OSError: + return {} + + try: + parsed = json.loads(raw) + except json.JSONDecodeError: + return {} + + return parsed if isinstance(parsed, dict) else {} + + +def get_api_key( + provider: str, *, credentials_path: Optional[Path] = None +) -> Optional[str]: + """Return an API key for a provider, if available. 
+ + Resolution order: + 1) Environment variable (canonical for known providers) + 2) ~/.shinka/credentials.json if present + + Supported credential file formats (examples): + - {"OPENAI_API_KEY": "sk-..."} + - {"codex": "sk-..."} (provider name as key) + - {"providers": {"codex": {"api_key": "sk-..."}}} + + Args: + provider: Provider name (e.g. "codex") or an env var name. + credentials_path: Optional override for the credential file path. + + Returns: + The API key string, or None if not found. + """ + + provider_key = (provider or "").strip() + if not provider_key: + return None + + provider_lower = provider_key.lower() + env_var = PROVIDER_ENV_VAR_MAP.get(provider_lower) + if env_var is None and provider_key.isupper() and "_" in provider_key: + env_var = provider_key + + if env_var: + value = os.environ.get(env_var) + if isinstance(value, str) and value.strip(): + logger.debug(f"Using API key for '{provider}' from environment variable ${env_var}") + return value.strip() + + path = credentials_path or DEFAULT_CREDENTIALS_PATH + if not path.exists(): + logger.debug(f"No API key found for '{provider}': env var ${env_var} not set, {path} does not exist") + return None + + doc = _load_credentials(path) + if not doc: + logger.debug(f"No API key found for '{provider}': credential file {path} is empty or invalid") + return None + + # Common: store keys by env var name. + if env_var: + value = _safe_get_str(doc, env_var) + if value: + logger.debug(f"Using API key for '{provider}' from {path} (key: {env_var})") + return value + + # Convenience: store keys by provider name. 
+ value = _safe_get_str(doc, provider_lower) + if value: + logger.debug(f"Using API key for '{provider}' from {path} (key: {provider_lower})") + return value + + # Nested structure: {"providers": {"codex": {"api_key": "..."} }} + providers = doc.get("providers") + if isinstance(providers, dict): + provider_section = providers.get(provider_lower) + value = _safe_get_str(provider_section, "api_key") + if value: + logger.debug(f"Using API key for '{provider}' from {path} (nested: providers.{provider_lower}.api_key)") + return value + + logger.debug(f"No API key found for '{provider}' in {path}") + return None diff --git a/shinka/webui/__init__.py b/shinka/webui/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/shinka/webui/viz_tree.html b/shinka/webui/viz_tree.html index 7b104bbd3..b84ca890f 100644 --- a/shinka/webui/viz_tree.html +++ b/shinka/webui/viz_tree.html @@ -77,6 +77,22 @@ display: flex; flex-direction: column; } + + /* Remove horizontal padding when displaying code to maximize width */ + #details-panel:has(#agent-code.active), + #details-panel:has(#code-diff.active) { + padding-left: 0; + padding-right: 0; + } + + /* Add back horizontal padding to non-code elements when code tab is active */ + #details-panel:has(#agent-code.active) #node-summary, + #details-panel:has(#agent-code.active) #tabs, + #details-panel:has(#code-diff.active) #node-summary, + #details-panel:has(#code-diff.active) #tabs { + padding-left: 20px; + padding-right: 20px; + } .node { cursor: pointer; @@ -171,16 +187,29 @@ border-top: none; overflow: auto; } + + /* Reduce horizontal padding when code tab is active */ + #tab-content:has(#agent-code.active), + #tab-content:has(#code-diff.active) { + padding: 15px 0; + } .content-section { display: none; background-color: #f8f9fa; padding: 20px; } - + .content-section.active { display: block; } + + /* Remove extra padding from code sections to maximize width */ + #agent-code.content-section, + #code-diff.content-section { 
+ padding: 0; + background-color: #fff; + } pre { background-color: #f5f5f5; @@ -221,7 +250,7 @@ #agent-code, #solution-code { background-color: #f8f8f8; border-radius: 4px; - padding-bottom: 5px; + padding: 0; } /* Make sure code blocks in Python tabs have good contrast */ @@ -1057,6 +1086,17 @@ .code-container { display: flex; + width: 100%; + overflow-x: auto; + } + + .code-controls { + display: flex; + gap: 10px; + align-items: center; + padding: 10px 15px; + background-color: #f8f8f8; + border-bottom: 1px solid #e1e4e8; } .line-numbers-gutter { @@ -1075,6 +1115,11 @@ display: block; } + #agent-code-wrapper { + width: 100%; + overflow-x: auto; + } + #agent-code-wrapper pre { margin: 0; flex: 1; @@ -1412,6 +1457,7 @@
Islands
+

Select a node from the tree to view code.

@@ -1604,7 +1650,7 @@
Islands
downloadBtn.addEventListener('click', () => { const codeElement = document.querySelector('#agent-code-wrapper pre code'); const selectedNodeId = getSelectedNodeId(); - + if (codeElement && window.treeData && selectedNodeId) { const nodeData = window.treeData.find(d => d.id === selectedNodeId); if (nodeData) { @@ -1614,17 +1660,36 @@
Islands
const url = URL.createObjectURL(blob); const a = document.createElement('a'); a.href = url; - const agentName = (nodeData.metadata.patch_name || 'agent').replace(/\s+/g, '_'); - const gen = nodeData.generation; - const language = nodeData.language || 'py'; - const extension = { - 'python': 'py', - 'cpp': 'cpp', - 'javascript': 'js', - 'cuda': 'cu' - }[language] || language; - - a.download = `${agentName}_gen${gen}.${extension}`; + + // Use the selected file's path if in multi-file mode + let filename; + const fileSelector = document.getElementById('code-file-selector'); + if (window._codeFiles && window._codeFiles.length > 1 && fileSelector) { + const selectedIdx = parseInt(fileSelector.value, 10) || 0; + const selectedFile = window._codeFiles[selectedIdx]; + if (selectedFile && selectedFile.path) { + // Use just the filename part, prefixed with gen + const pathParts = selectedFile.path.split('/'); + const basename = pathParts[pathParts.length - 1]; + filename = `gen${nodeData.generation}_${basename}`; + } + } + + // Fallback to generic naming if not multi-file + if (!filename) { + const agentName = (nodeData.metadata.patch_name || 'agent').replace(/\s+/g, '_'); + const gen = nodeData.generation; + const language = nodeData.language || 'py'; + const extension = { + 'python': 'py', + 'cpp': 'cpp', + 'javascript': 'js', + 'cuda': 'cu' + }[language] || language; + filename = `${agentName}_gen${gen}.${extension}`; + } + + a.download = filename; document.body.appendChild(a); a.click(); document.body.removeChild(a); @@ -3512,7 +3577,8 @@

❌ Failed to Load Database

'init': d3.symbolDiamond, 'full': d3.symbolCircle, 'diff': d3.symbolSquare, - 'cross': d3.symbolCross + 'cross': d3.symbolCross, + 'agentic': d3.symbolTriangle // Triangle for agentic patches }; const getShape = (patchType) => shapeMap[patchType] || d3.symbolCircle; const symbol = d3.symbol().size(2500); @@ -4328,11 +4394,11 @@

Cumulative Cost Breakdown

// Update node summary document.getElementById("node-summary").innerHTML = ` -

${agentName} (Gen ${data.generation})

-

ID: ${data.id}

-

Parent ID: ${data.parent_id || 'None'}

+

${escapeHtml(agentName)} (Gen ${data.generation})

+

ID: ${escapeHtml(String(data.id))}

+

Parent ID: ${escapeHtml(String(data.parent_id || 'None'))}

Score: ${formatScore(score)}

- ${data.error ? `

Error: ${data.error}

` : ''} + ${data.error ? `

Error: ${escapeHtml(String(data.error))}

` : ''} `; @@ -4368,7 +4434,7 @@

${agentName} (Gen ${data.generation})

if (data.metadata) { for (const [key, value] of Object.entries(data.metadata)) { - if (key !== 'thought' && key !== 'code_analysis_metrics' && key !== 'patch_description' && key !== 'stdout_log' && key !== 'stderr_log' && key !== 'llm_result') { + if (key !== 'thought' && key !== 'code_analysis_metrics' && key !== 'patch_description' && key !== 'stdout_log' && key !== 'stderr_log' && key !== 'llm_result' && key !== 'agent_code_diffs') { let valueHtml; if (typeof value === 'object' && value !== null) { @@ -4656,43 +4722,38 @@

Selected Node Details

nodeDetailsContainer.innerHTML = nodeDetailsHtml; - // Update code tab + // Update code tab (supports multi-file code from agentic backend) const codeWrapper = document.getElementById("agent-code-wrapper"); - if (data.code) { + const dropdownContainer = document.getElementById("code-file-selector-container"); + const codeFiles = getCodeFilesForNode(data); + if (codeFiles.length > 0) { document.querySelector('#agent-code .code-controls').style.display = 'flex'; - - const sanitizedCode = escapeHtml(data.code); - const lines = data.code.split('\n'); - const lineNumbers = Array.from({length: lines.length}, (_, i) => `${i + 1}`).join(''); + const rendered = renderMultiFileCode(codeFiles, data.language); + codeWrapper.innerHTML = rendered.content; + dropdownContainer.innerHTML = rendered.dropdown; - codeWrapper.innerHTML = ` -
-
${lineNumbers}
-
${sanitizedCode}
-
- `; - - // Use a slight delay to ensure the DOM has updated + // Use a slight delay to ensure the DOM has updated, then apply highlighting setTimeout(() => { - const codeBlock = codeWrapper.querySelector('code'); - if (codeBlock) { + codeWrapper.querySelectorAll('code').forEach(codeBlock => { // Ensure hljs is available if (typeof hljs !== 'undefined') { hljs.highlightElement(codeBlock); } else { console.warn('highlight.js not found, skipping syntax highlighting.'); } - } + }); }, 50); } else { document.querySelector('#agent-code .code-controls').style.display = 'none'; codeWrapper.innerHTML = "

No code available for this node.

"; + dropdownContainer.innerHTML = ''; } - // Update diff tab + // Update diff tab (supports multi-file diffs from agentic backend) const diffWrapper = document.getElementById("code-diff"); - if (data.code_diff) { - diffWrapper.innerHTML = `
${formatDiff(data.code_diff)}
`; + const diffFiles = getDiffFilesForNode(data); + if (diffFiles.length > 0) { + diffWrapper.innerHTML = renderMultiFileDiff(diffFiles); } else { diffWrapper.innerHTML = "

No code diff available for this node.

"; } @@ -4808,6 +4869,212 @@

Selected Node Details

}).join(''); } + // Get diff statistics (additions and deletions count) + function getDiffStats(diffText) { + if (!diffText) return { additions: 0, deletions: 0 }; + const lines = diffText.split('\n'); + let additions = 0, deletions = 0; + lines.forEach(line => { + if (line.startsWith('+') && !line.startsWith('+++')) additions++; + else if (line.startsWith('-') && !line.startsWith('---')) deletions++; + }); + return { additions, deletions }; + } + + // Get default primary file path based on language + function defaultPrimaryPath(language) { + const langPaths = { python: 'main.py', javascript: 'main.js', typescript: 'main.ts', swift: 'main.swift' }; + return langPaths[language] || 'main.py'; + } + + // Extract diff files from a node (supports multi-file agentic diffs) + function getDiffFilesForNode(node) { + // Check for array of diffs (multi-file format) + if (node && Array.isArray(node.code_diffs) && node.code_diffs.length > 0) { + return node.code_diffs.map(diffEntry => ({ + path: diffEntry.path || node.metadata?.agent_primary_file || defaultPrimaryPath(node.language), + diff: diffEntry.diff || '', + })); + } + + // Check metadata.agent_code_diffs (array or dict format from agentic backend) + if (node && node.metadata?.agent_code_diffs) { + const diffs = node.metadata.agent_code_diffs; + // Handle array format: [{path: "file.py", diff: "..."}, ...] 
+ if (Array.isArray(diffs) && diffs.length > 0) { + return diffs.map(diffEntry => ({ + path: diffEntry.path || node.metadata?.agent_primary_file || defaultPrimaryPath(node.language), + diff: diffEntry.diff || '', + })); + } + // Handle dict format: {"file.py": "diff content", ...} + if (typeof diffs === 'object' && !Array.isArray(diffs)) { + const entries = Object.entries(diffs); + if (entries.length > 0) { + return entries.map(([path, diff]) => ({ path, diff: diff || '' })); + } + } + } + + // Fallback to single code_diff + if (node && node.code_diff) { + return [{ + path: node.metadata?.agent_primary_file || defaultPrimaryPath(node.language), + diff: node.code_diff, + }]; + } + + return []; + } + + // Render multi-file diff viewer + function renderMultiFileDiff(diffFiles) { + if (!diffFiles || diffFiles.length === 0) { + return '

No code diff available for this node.

'; + } + + // Calculate totals + const totals = diffFiles.reduce((acc, file) => { + const stats = getDiffStats(file.diff); + acc.additions += stats.additions; + acc.deletions += stats.deletions; + return acc; + }, { additions: 0, deletions: 0 }); + + const filesLabel = diffFiles.length === 1 ? 'file changed' : 'files changed'; + const autoExpand = diffFiles.length === 1; + + let html = ` +
+ ${diffFiles.length} ${filesLabel} + +${totals.additions} + -${totals.deletions} +
+ `; + + diffFiles.forEach((diffEntry, idx) => { + const stats = getDiffStats(diffEntry.diff); + const isCollapsed = !autoExpand && idx > 0; + const diffContent = diffEntry.diff ? formatDiff(diffEntry.diff) : '

No diff content for this file.

'; + + html += ` +
+
+ ${isCollapsed ? '+' : '-'} + ${escapeHtml(diffEntry.path || 'File ' + (idx + 1))} + +${stats.additions} + -${stats.deletions} +
+
+
${diffContent}
+
+
+ `; + }); + + return html; + } + + // Extract code files from a node (supports multi-file agentic code) + function getCodeFilesForNode(node) { + const files = []; + + // Check for agent_changed_files in metadata (dict format: {filepath: content}) + if (node && node.metadata?.agent_changed_files && typeof node.metadata.agent_changed_files === 'object') { + const changedFiles = node.metadata.agent_changed_files; + const primaryFile = node.metadata?.agent_primary_file; + + // Add primary file first if it exists + if (primaryFile && changedFiles[primaryFile]) { + files.push({ path: primaryFile, code: changedFiles[primaryFile] }); + } + + // Add remaining files + Object.entries(changedFiles).forEach(([path, code]) => { + if (path !== primaryFile) { + files.push({ path, code: code || '' }); + } + }); + } + + // If no multi-file data but we have single code, return that + if (files.length === 0 && node && node.code) { + const primaryPath = node.metadata?.agent_primary_file || defaultPrimaryPath(node.language); + files.push({ path: primaryPath, code: node.code }); + } + + return files; + } + + // Render multi-file code viewer with dropdown selector + function renderMultiFileCode(codeFiles, language) { + if (!codeFiles || codeFiles.length === 0) { + return { content: '

No code available for this node.

', dropdown: '' }; + } + + // Store files globally for dropdown switching + window._codeFiles = codeFiles; + window._codeLanguage = language; + + // Build dropdown (for placing in header) + let dropdownHtml = ''; + if (codeFiles.length > 1) { + const options = codeFiles.map((f, i) => + `` + ).join(''); + dropdownHtml = ` + + `; + } + + // Render first file by default + const file = codeFiles[0]; + const sanitizedCode = escapeHtml(file.code); + const lines = file.code.split('\n'); + const lineNumbers = Array.from({length: lines.length}, (_, i) => `${i + 1}`).join(''); + + const contentHtml = ` +
+
+
${lineNumbers}
+
${sanitizedCode}
+
+
+ `; + + return { content: contentHtml, dropdown: dropdownHtml }; + } + + // Switch displayed code file via dropdown + window._switchCodeFile = function(index) { + const files = window._codeFiles; + const language = window._codeLanguage || 'python'; + if (!files || !files[index]) return; + + const file = files[index]; + const sanitizedCode = escapeHtml(file.code); + const lines = file.code.split('\n'); + const lineNumbers = Array.from({length: lines.length}, (_, i) => `${i + 1}`).join(''); + + const container = document.getElementById('code-file-content'); + if (container) { + container.innerHTML = ` +
+
${lineNumbers}
+
${sanitizedCode}
+
+ `; + // Re-apply syntax highlighting + const codeBlock = container.querySelector('code'); + if (codeBlock && typeof hljs !== 'undefined') { + hljs.highlightElement(codeBlock); + } + } + }; + // Get CSS class for score display function getScoreClass(score) { if (score === null || score === undefined) { @@ -5370,12 +5637,26 @@

Selected Node Details

window.resizeTimeout = setTimeout(function() { if (window.treeData) { + // Get selected node ID from URL (more reliable than DOM after redraw) + const urlParams = new URLSearchParams(window.location.search); + const selectedNodeId = urlParams.get('selected_node') || getSelectedNodeId(); + renderGraph(window.treeData); + + // Restore the selected node after redraw + if (selectedNodeId && window.treeData) { + const nodeStillExists = window.treeData.find(d => d.id === selectedNodeId); + if (nodeStillExists) { + setTimeout(() => { + selectNodeById(selectedNodeId, false, false); + }, 100); + } + } } else { // Full reload only if necessary - const resultSelect = document.getElementById('result-select'); - if (resultSelect.value) { - loadDatabase(resultSelect.value); + const resultSelect = document.getElementById('result-select'); + if (resultSelect.value) { + loadDatabase(resultSelect.value); } } }, 300); @@ -7196,7 +7477,8 @@

Selected Node Details

'init': d3.symbolDiamond, 'full': d3.symbolCircle, 'diff': d3.symbolSquare, - 'cross': d3.symbolCross + 'cross': d3.symbolCross, + 'agentic': d3.symbolTriangle // Triangle for agentic patches }; const getShape = (patchType) => shapeMap[patchType] || d3.symbolCircle; const symbol = d3.symbol().size(1500); // Smaller size for island trees @@ -9617,4 +9899,4 @@
Debug Information:
- \ No newline at end of file + diff --git a/tests/test_agentic_editor.py b/tests/test_agentic_editor.py new file mode 100644 index 000000000..1a927d425 --- /dev/null +++ b/tests/test_agentic_editor.py @@ -0,0 +1,903 @@ +"""Comprehensive tests for shinka/edit/agentic.py.""" + +from __future__ import annotations + +import base64 +import json +from pathlib import Path +from typing import Any, Dict, Iterator, Optional +from unittest.mock import MagicMock + +import pytest + +from shinka.edit.agentic import ( + AgentContext, + AgentResult, + AgenticEditor, + CommandResult, + MAX_BASE_FILE_SIZE, + MAX_BINARY_FILE_SIZE, +) + + +@pytest.fixture +def mock_config(): + """Create a mock config for AgenticEditor.""" + config = MagicMock() + config.cli_profile = "test_profile" + config.sandbox = "enabled" + config.approval_mode = "auto" + config.max_seconds = 300 + config.max_turns = 20 + config.extra_cli_config = {} + config.cli_path = None + return config + + +@pytest.fixture +def scratch_dir(tmp_path: Path) -> Path: + """Create a temporary scratch directory.""" + return tmp_path / "scratch" + + +# ============================================================================ +# Scratch Directory Tests (_prepare_scratch method) +# ============================================================================ + + +def test_prepare_scratch_basic(scratch_dir: Path, mock_config): + """Test basic file writing to scratch directory.""" + editor = AgenticEditor(scratch_dir, mock_config) + + base_files = { + Path("main.py"): "def hello():\n print('world')\n", + Path("utils.py"): "def helper():\n return 42\n", + } + + baseline = editor._prepare_scratch(base_files) + + # Check that scratch directory was created + assert scratch_dir.exists() + assert scratch_dir.is_dir() + + # Check that files were written + assert (scratch_dir / "main.py").exists() + assert (scratch_dir / "utils.py").exists() + + # Check file contents + assert (scratch_dir / "main.py").read_text() == "def hello():\n 
print('world')\n" + assert (scratch_dir / "utils.py").read_text() == "def helper():\n return 42\n" + + # Check baseline return value + assert baseline == base_files + + +def test_prepare_scratch_preserves_session_meta(scratch_dir: Path, mock_config): + """Test that session_meta.json is preserved across prepare_scratch calls.""" + editor = AgenticEditor(scratch_dir, mock_config) + + # Create scratch directory with session_meta.json + scratch_dir.mkdir(parents=True) + meta_content = json.dumps({"session_id": "test_123", "parent_id": "parent_456"}) + (scratch_dir / "session_meta.json").write_text(meta_content, encoding="utf-8") + (scratch_dir / "old_file.py").write_text("old content") + + # Prepare scratch with new files + base_files = {Path("new_file.py"): "new content"} + editor._prepare_scratch(base_files) + + # Check that session_meta.json was preserved + assert (scratch_dir / "session_meta.json").exists() + assert (scratch_dir / "session_meta.json").read_text(encoding="utf-8") == meta_content + + # Check that old file was removed + assert not (scratch_dir / "old_file.py").exists() + + # Check that new file was created + assert (scratch_dir / "new_file.py").exists() + + +def test_prepare_scratch_rejects_absolute_paths(scratch_dir: Path, mock_config): + """Test ValueError for absolute paths in base_files.""" + editor = AgenticEditor(scratch_dir, mock_config) + + base_files = { + Path("/etc/passwd"): "malicious content", + } + + with pytest.raises(ValueError, match="must be relative"): + editor._prepare_scratch(base_files) + + +def test_prepare_scratch_rejects_path_traversal(scratch_dir: Path, mock_config): + """Test ValueError for ../ path traversal attempts.""" + editor = AgenticEditor(scratch_dir, mock_config) + + base_files = { + Path("../escape.py"): "escaped content", + } + + with pytest.raises(ValueError, match="escapes scratch directory"): + editor._prepare_scratch(base_files) + + # Also test more complex traversal + base_files = { + 
Path("subdir/../../escape.py"): "escaped content", + } + + with pytest.raises(ValueError, match="escapes scratch directory"): + editor._prepare_scratch(base_files) + + +def test_prepare_scratch_file_size_limit(scratch_dir: Path, mock_config): + """Test MAX_BASE_FILE_SIZE enforcement.""" + editor = AgenticEditor(scratch_dir, mock_config) + + # Create a file that exceeds the size limit + large_content = "x" * (MAX_BASE_FILE_SIZE + 1) + base_files = { + Path("large_file.txt"): large_content, + } + + with pytest.raises(ValueError, match="exceeds max size"): + editor._prepare_scratch(base_files) + + +# ============================================================================ +# Session Execution Tests (run_session method with mocked runner) +# ============================================================================ + + +def mock_runner_basic( + user_prompt: str, + workdir: Path, + **kwargs +) -> Iterator[Dict[str, Any]]: + """Basic mock runner that yields controlled events.""" + # Init event with model + yield { + "type": "init", + "model": "claude-opus-4-5", + "session_id": "sess_abc123", + } + + # Agent message + yield { + "type": "event", + "item": { + "type": "agent_message", + "text": "I'll help you with that task.", + }, + } + + # Write a file + (workdir / "output.py").write_text("def new_function():\n return 'hello'\n") + + # Usage event + yield { + "type": "usage", + "usage": { + "input_tokens": 100, + "output_tokens": 50, + "total_tokens": 150, + "total_cost_usd": 0.0025, + }, + } + + # Final message + yield { + "type": "event", + "item": { + "type": "agent_message", + "text": "Task completed successfully.", + }, + } + + +def test_run_session_detects_changed_files(scratch_dir: Path, mock_config): + """Test that changed files are detected correctly.""" + editor = AgenticEditor(scratch_dir, mock_config, runner=mock_runner_basic) + + base_files = { + Path("existing.py"): "original content", + } + + context = AgentContext( + user_prompt="Create a new 
function", + language="python", + base_files=base_files, + primary_file=Path("existing.py"), + ) + + result = editor.run_session(context) + + # Check that new file was detected + assert Path("output.py") in result.changed_files + assert result.changed_files[Path("output.py")] == "def new_function():\n return 'hello'\n" + + # Check that existing file wasn't changed + assert Path("existing.py") not in result.changed_files + + +def test_run_session_handles_binary_files(scratch_dir: Path, mock_config): + """Test base64 encoding of binary files.""" + def mock_runner_with_binary(user_prompt: str, workdir: Path, **kwargs): + # Create a binary file + binary_data = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR' + (workdir / "image.png").write_bytes(binary_data) + + yield {"type": "init", "model": "test-model"} + yield { + "type": "event", + "item": {"type": "agent_message", "text": "Created image"}, + } + + editor = AgenticEditor(scratch_dir, mock_config, runner=mock_runner_with_binary) + + context = AgentContext( + user_prompt="Create an image", + language="python", + base_files={}, + primary_file=Path("main.py"), + ) + + result = editor.run_session(context) + + # Binary file should be in binary_changed_files, not changed_files + assert Path("image.png") not in result.changed_files + assert Path("image.png") in result.binary_changed_files + + # Check base64 encoding + expected_b64 = base64.b64encode(b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR').decode("ascii") + assert result.binary_changed_files[Path("image.png")] == expected_b64 + + +def test_run_session_skips_internal_files(scratch_dir: Path, mock_config): + """Test that session_log.jsonl and session_meta.json are not in changed_files.""" + def mock_runner_with_internal_files(user_prompt: str, workdir: Path, **kwargs): + # Create internal files + (workdir / "session_meta.json").write_text('{"test": "meta"}') + (workdir / "real_change.py").write_text("changed code") + + yield {"type": "init"} + yield { + "type": "event", + "item": 
{"type": "agent_message", "text": "Done"}, + } + + editor = AgenticEditor(scratch_dir, mock_config, runner=mock_runner_with_internal_files) + + context = AgentContext( + user_prompt="Test", + language="python", + base_files={}, + primary_file=Path("main.py"), + ) + + result = editor.run_session(context) + + # Internal files should be excluded + assert Path("session_log.jsonl") not in result.changed_files + assert Path("session_meta.json") not in result.changed_files + + # Real changes should be included + assert Path("real_change.py") in result.changed_files + + +def test_run_session_cost_metrics(scratch_dir: Path, mock_config): + """Test usage aggregation from events.""" + def mock_runner_with_multiple_usage(user_prompt: str, workdir: Path, **kwargs): + yield {"type": "init", "model": "test-model"} + + # First API call + yield { + "type": "usage", + "usage": { + "input_tokens": 100, + "output_tokens": 50, + "total_tokens": 150, + "total_cost_usd": 0.002, + }, + } + + # Second API call + yield { + "type": "usage", + "usage": { + "input_tokens": 200, + "output_tokens": 75, + "total_tokens": 275, + "total_cost_usd": 0.003, + }, + } + + yield { + "type": "event", + "item": {"type": "agent_message", "text": "Done"}, + } + + editor = AgenticEditor(scratch_dir, mock_config, runner=mock_runner_with_multiple_usage) + + context = AgentContext( + user_prompt="Test", + language="python", + base_files={}, + primary_file=Path("main.py"), + ) + + result = editor.run_session(context) + + # Check aggregated metrics + assert result.metrics["input_tokens"] == 300.0 + assert result.metrics["output_tokens"] == 125.0 + assert result.metrics["total_tokens"] == 425.0 + assert result.metrics["total_cost"] == 0.005 + assert result.metrics["real_cost_available"] is True + + +def test_run_session_extracts_model_from_init(scratch_dir: Path, mock_config): + """Test model extraction from init event.""" + def mock_runner_with_model(user_prompt: str, workdir: Path, **kwargs): + yield { + "type": 
"init", + "model": "claude-sonnet-4-5", + "session_id": "test_session", + } + + yield { + "type": "event", + "item": {"type": "agent_message", "text": "Working..."}, + } + + editor = AgenticEditor(scratch_dir, mock_config, runner=mock_runner_with_model) + + context = AgentContext( + user_prompt="Test", + language="python", + base_files={}, + primary_file=Path("main.py"), + ) + + result = editor.run_session(context) + + # Model should be extracted from init event + assert result.model == "claude-sonnet-4-5" + assert result.session_id == "test_session" + + +def test_run_session_command_execution(scratch_dir: Path, mock_config): + """Test that command executions are captured.""" + def mock_runner_with_commands(user_prompt: str, workdir: Path, **kwargs): + yield {"type": "init"} + + # Command execution event + yield { + "type": "event", + "item": { + "type": "command_execution", + "command": "pytest tests/", + "status": "completed", + "exit_code": 0, + "stdout": "All tests passed", + "stderr": "", + }, + } + + yield { + "type": "event", + "item": { + "type": "command_execution", + "command": "pylint code.py", + "status": "failed", + "exit_code": 1, + "stdout": "", + "stderr": "Linting errors found", + }, + } + + yield { + "type": "event", + "item": {"type": "agent_message", "text": "Commands executed"}, + } + + editor = AgenticEditor(scratch_dir, mock_config, runner=mock_runner_with_commands) + + context = AgentContext( + user_prompt="Run tests", + language="python", + base_files={}, + primary_file=Path("main.py"), + ) + + result = editor.run_session(context) + + # Check that commands were captured + assert len(result.commands_run) == 2 + + # First command + assert result.commands_run[0].command == "pytest tests/" + assert result.commands_run[0].status == "completed" + assert result.commands_run[0].exit_code == 0 + assert result.commands_run[0].stdout == "All tests passed" + + # Second command + assert result.commands_run[1].command == "pylint code.py" + assert 
result.commands_run[1].status == "failed" + assert result.commands_run[1].exit_code == 1 + assert result.commands_run[1].stderr == "Linting errors found" + + +def test_run_session_session_log_accumulation(scratch_dir: Path, mock_config): + """Test that agent messages are accumulated in session_log.""" + def mock_runner_with_messages(user_prompt: str, workdir: Path, **kwargs): + yield {"type": "init"} + + yield { + "type": "event", + "item": {"type": "agent_message", "text": "Starting task..."}, + } + + yield { + "type": "event", + "item": {"type": "agent_message", "text": "Processing files..."}, + } + + yield { + "type": "event", + "item": {"type": "agent_message", "text": "Task completed!"}, + } + + editor = AgenticEditor(scratch_dir, mock_config, runner=mock_runner_with_messages) + + context = AgentContext( + user_prompt="Test", + language="python", + base_files={}, + primary_file=Path("main.py"), + ) + + result = editor.run_session(context) + + # Check session log + assert len(result.session_log) == 3 + assert result.session_log[0] == "Starting task..." + assert result.session_log[1] == "Processing files..." + assert result.session_log[2] == "Task completed!" + + # Final message should be the last one + assert result.final_message == "Task completed!" 
+ + +def test_run_session_fallback_cost_estimate(scratch_dir: Path, mock_config): + """Test fallback cost estimation when no real cost is provided.""" + def mock_runner_no_cost(user_prompt: str, workdir: Path, **kwargs): + yield {"type": "init"} + + # Usage without cost_usd + yield { + "type": "usage", + "usage": { + "input_tokens": 1000, + "output_tokens": 500, + "total_tokens": 1500, + }, + } + + yield { + "type": "event", + "item": {"type": "agent_message", "text": "Done"}, + } + + editor = AgenticEditor(scratch_dir, mock_config, runner=mock_runner_no_cost) + + context = AgentContext( + user_prompt="Test", + language="python", + base_files={}, + primary_file=Path("main.py"), + ) + + result = editor.run_session(context) + + # Should use fallback cost estimate (tokens / 1000) + assert result.metrics["total_tokens"] == 1500.0 + assert result.metrics["total_cost"] == 1.5 # 1500 / 1000 + assert result.metrics["real_cost_available"] is False + + +def test_run_session_detects_modified_files(scratch_dir: Path, mock_config): + """Test that modifications to existing files are detected.""" + def mock_runner_modify(user_prompt: str, workdir: Path, **kwargs): + yield {"type": "init"} + + # Modify existing file + existing_file = workdir / "existing.py" + existing_file.write_text("modified content") + + yield { + "type": "event", + "item": {"type": "agent_message", "text": "Modified file"}, + } + + editor = AgenticEditor(scratch_dir, mock_config, runner=mock_runner_modify) + + base_files = { + Path("existing.py"): "original content", + } + + context = AgentContext( + user_prompt="Modify file", + language="python", + base_files=base_files, + primary_file=Path("existing.py"), + ) + + result = editor.run_session(context) + + # Modified file should be in changed_files + assert Path("existing.py") in result.changed_files + assert result.changed_files[Path("existing.py")] == "modified content" + + +def test_run_session_with_nested_directories(scratch_dir: Path, mock_config): + 
"""Test handling of files in nested directories.""" + def mock_runner_nested(user_prompt: str, workdir: Path, **kwargs): + yield {"type": "init"} + + # Create nested structure + (workdir / "src" / "module").mkdir(parents=True) + (workdir / "src" / "module" / "code.py").write_text("nested code") + + yield { + "type": "event", + "item": {"type": "agent_message", "text": "Created nested files"}, + } + + editor = AgenticEditor(scratch_dir, mock_config, runner=mock_runner_nested) + + context = AgentContext( + user_prompt="Create nested structure", + language="python", + base_files={}, + primary_file=Path("main.py"), + ) + + result = editor.run_session(context) + + # Check nested file was detected + nested_path = Path("src") / "module" / "code.py" + assert nested_path in result.changed_files + assert result.changed_files[nested_path] == "nested code" + + +def test_run_session_events_logged_to_jsonl(scratch_dir: Path, mock_config): + """Test that all events are logged to session_log.jsonl.""" + def mock_runner_events(user_prompt: str, workdir: Path, **kwargs): + yield {"type": "init", "model": "test"} + yield {"type": "usage", "usage": {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15}} + yield {"type": "event", "item": {"type": "agent_message", "text": "Done"}} + + editor = AgenticEditor(scratch_dir, mock_config, runner=mock_runner_events) + + context = AgentContext( + user_prompt="Test", + language="python", + base_files={}, + primary_file=Path("main.py"), + ) + + result = editor.run_session(context) + + # Check that session log file exists + assert result.session_log_path is not None + assert result.session_log_path.exists() + + # Read and parse JSONL + lines = result.session_log_path.read_text().strip().split("\n") + events = [json.loads(line) for line in lines] + + # Should have 3 events + assert len(events) == 3 + assert events[0]["type"] == "init" + assert events[1]["type"] == "usage" + assert events[2]["type"] == "event" + + # Also check session_events 
in result + assert len(result.session_events) == 3 + + +def test_run_session_large_binary_files_skipped(scratch_dir: Path, mock_config): + """Test that binary files exceeding MAX_BINARY_FILE_SIZE are skipped.""" + def mock_runner_large_binary(user_prompt: str, workdir: Path, **kwargs): + yield {"type": "init"} + + # Create a binary file exceeding the limit with non-UTF8 data + # Use 0xFF bytes which will fail UTF-8 decoding + large_binary = b'\xff' * (MAX_BINARY_FILE_SIZE + 1) + (workdir / "large.bin").write_bytes(large_binary) + + yield { + "type": "event", + "item": {"type": "agent_message", "text": "Created large binary"}, + } + + editor = AgenticEditor(scratch_dir, mock_config, runner=mock_runner_large_binary) + + context = AgentContext( + user_prompt="Test", + language="python", + base_files={}, + primary_file=Path("main.py"), + ) + + result = editor.run_session(context) + + # Large binary should be skipped + assert Path("large.bin") not in result.changed_files + assert Path("large.bin") not in result.binary_changed_files + + +def test_run_session_backward_compat_codex_runner(scratch_dir: Path, mock_config): + """Test backward compatibility with codex_runner parameter.""" + def mock_codex_runner(user_prompt: str, workdir: Path, **kwargs): + yield {"type": "init", "model": "codex-model"} + yield { + "type": "event", + "item": {"type": "agent_message", "text": "Codex runner works"}, + } + + # Use deprecated codex_runner parameter + editor = AgenticEditor(scratch_dir, mock_config, codex_runner=mock_codex_runner) + + context = AgentContext( + user_prompt="Test", + language="python", + base_files={}, + primary_file=Path("main.py"), + ) + + result = editor.run_session(context) + + # Should use the codex_runner + assert result.model == "codex-model" + assert "Codex runner works" in result.session_log + + +def test_agent_context_with_metadata(scratch_dir: Path, mock_config): + """Test that metadata is passed through to runner.""" + captured_kwargs = {} + + def 
mock_runner_capture(user_prompt: str, workdir: Path, **kwargs): + captured_kwargs.update(kwargs) + yield {"type": "init"} + yield { + "type": "event", + "item": {"type": "agent_message", "text": "Done"}, + } + + editor = AgenticEditor(scratch_dir, mock_config, runner=mock_runner_capture) + + context = AgentContext( + user_prompt="Test", + language="python", + base_files={}, + primary_file=Path("main.py"), + metadata={ + "parent_id": "parent_123", + "generation": 5, + "patch_type": "full", + "results_dir": "/tmp/results", + }, + ) + + result = editor.run_session(context) + + # Check that metadata was passed to runner + assert captured_kwargs["parent_id"] == "parent_123" + assert captured_kwargs["generation"] == 5 + assert captured_kwargs["patch_type"] == "full" + assert captured_kwargs["results_dir"] == "/tmp/results" + + +def test_agent_context_with_system_prompt(scratch_dir: Path, mock_config): + """Test that system_prompt is passed to runner.""" + captured_kwargs = {} + + def mock_runner_capture(user_prompt: str, workdir: Path, **kwargs): + captured_kwargs.update(kwargs) + yield {"type": "init"} + yield { + "type": "event", + "item": {"type": "agent_message", "text": "Done"}, + } + + editor = AgenticEditor(scratch_dir, mock_config, runner=mock_runner_capture) + + system_prompt = "You are a helpful coding assistant." 
+ + context = AgentContext( + user_prompt="Test", + language="python", + base_files={}, + primary_file=Path("main.py"), + system_prompt=system_prompt, + ) + + result = editor.run_session(context) + + # Check that system_prompt was passed to runner + assert captured_kwargs["system_prompt"] == system_prompt + + +def test_agent_context_with_resume_session(scratch_dir: Path, mock_config): + """Test resuming a session with resume_session_id.""" + captured_kwargs = {} + + def mock_runner_capture(user_prompt: str, workdir: Path, **kwargs): + captured_kwargs.update(kwargs) + yield {"type": "init", "session_id": "resumed_session_456"} + yield { + "type": "event", + "item": {"type": "agent_message", "text": "Resumed"}, + } + + editor = AgenticEditor(scratch_dir, mock_config, runner=mock_runner_capture) + + context = AgentContext( + user_prompt="Continue", + language="python", + base_files={}, + primary_file=Path("main.py"), + resume_session_id="session_to_resume_123", + ) + + result = editor.run_session(context) + + # Check that resume_session_id was passed to runner + assert captured_kwargs["resume_session_id"] == "session_to_resume_123" + assert result.session_id == "resumed_session_456" + + +# ============================================================================ +# Edge Cases and Error Handling +# ============================================================================ + + +def test_run_session_no_changes(scratch_dir: Path, mock_config): + """Test session that completes without making any changes.""" + def mock_runner_no_changes(user_prompt: str, workdir: Path, **kwargs): + yield {"type": "init"} + yield { + "type": "event", + "item": {"type": "agent_message", "text": "No changes needed"}, + } + + editor = AgenticEditor(scratch_dir, mock_config, runner=mock_runner_no_changes) + + context = AgentContext( + user_prompt="Review code", + language="python", + base_files={Path("code.py"): "def foo(): pass"}, + primary_file=Path("code.py"), + ) + + result = 
editor.run_session(context) + + # Should have no changed files + assert len(result.changed_files) == 0 + assert len(result.binary_changed_files) == 0 + + +def test_run_session_empty_base_files(scratch_dir: Path, mock_config): + """Test session with no base files.""" + def mock_runner_create(user_prompt: str, workdir: Path, **kwargs): + yield {"type": "init"} + (workdir / "new.py").write_text("created from scratch") + yield { + "type": "event", + "item": {"type": "agent_message", "text": "Created new file"}, + } + + editor = AgenticEditor(scratch_dir, mock_config, runner=mock_runner_create) + + context = AgentContext( + user_prompt="Create file", + language="python", + base_files={}, + primary_file=Path("new.py"), + ) + + result = editor.run_session(context) + + # New file should be detected + assert Path("new.py") in result.changed_files + + +def test_prepare_scratch_creates_parent_directories(scratch_dir: Path, mock_config): + """Test that parent directories are created for nested files.""" + editor = AgenticEditor(scratch_dir, mock_config) + + base_files = { + Path("a/b/c/deep.py"): "deep file", + } + + baseline = editor._prepare_scratch(base_files) + + # Check that nested structure was created + assert (scratch_dir / "a" / "b" / "c" / "deep.py").exists() + assert (scratch_dir / "a" / "b" / "c" / "deep.py").read_text() == "deep file" + + +def test_run_session_metrics_include_elapsed_time(scratch_dir: Path, mock_config): + """Test that elapsed_seconds is included in metrics.""" + def mock_runner_simple(user_prompt: str, workdir: Path, **kwargs): + yield {"type": "init"} + yield { + "type": "event", + "item": {"type": "agent_message", "text": "Done"}, + } + + editor = AgenticEditor(scratch_dir, mock_config, runner=mock_runner_simple) + + context = AgentContext( + user_prompt="Test", + language="python", + base_files={}, + primary_file=Path("main.py"), + ) + + result = editor.run_session(context) + + # Should have elapsed_seconds metric + assert "elapsed_seconds" in 
result.metrics + assert result.metrics["elapsed_seconds"] > 0 + + +def test_prepare_scratch_handles_unicode(scratch_dir: Path, mock_config): + """Test handling of unicode content in base files.""" + editor = AgenticEditor(scratch_dir, mock_config) + + base_files = { + Path("unicode.py"): "# ζ—₯本θͺžγ‚³γƒ‘γƒ³γƒˆ\ndef hello():\n print('こんにけは')\n", + } + + baseline = editor._prepare_scratch(base_files) + + # Check unicode was preserved + content = (scratch_dir / "unicode.py").read_text(encoding="utf-8") + assert "ζ—₯本θͺž" in content + assert "こんにけは" in content + + +def test_command_result_dataclass(): + """Test CommandResult dataclass construction.""" + cmd = CommandResult( + command="pytest", + status="completed", + exit_code=0, + stdout="All tests passed", + stderr="", + ) + + assert cmd.command == "pytest" + assert cmd.status == "completed" + assert cmd.exit_code == 0 + assert cmd.stdout == "All tests passed" + assert cmd.stderr == "" + + +def test_agent_result_default_fields(): + """Test AgentResult default field values.""" + result = AgentResult( + changed_files={Path("test.py"): "content"}, + session_log=["message1", "message2"], + commands_run=[], + ) + + assert result.final_message is None + assert result.metrics == {} + assert result.session_log_path is None + assert result.session_events == [] + assert result.binary_changed_files == {} + assert result.session_id is None + assert result.model is None diff --git a/tests/test_agentic_evaluator.py b/tests/test_agentic_evaluator.py new file mode 100644 index 000000000..c13635139 --- /dev/null +++ b/tests/test_agentic_evaluator.py @@ -0,0 +1,591 @@ +"""Comprehensive tests for shinka/eval/agentic.py - Agentic evaluator.""" + +import json +import time +from pathlib import Path +from typing import Any, Dict, Iterator, List, Optional +from unittest.mock import MagicMock + +import pytest + +from shinka.core.runner import AgenticEvaluatorConfig +from shinka.edit.codex_cli import CodexExecutionError +from 
shinka.eval.agentic import AgenticEvaluator, AgenticEvaluatorResult + + +@pytest.fixture +def mock_config(): + """Create a mock AgenticEvaluatorConfig.""" + config = MagicMock(spec=AgenticEvaluatorConfig) + config.cli_profile = "test-profile" + config.sandbox = True + config.approval_mode = "auto" + config.max_seconds = 300 + config.max_events = 100 + config.extra_cli_config = {} + config.cli_path = None + return config + + +@pytest.fixture +def temp_workspace(tmp_path): + """Create a temporary workspace with typical structure.""" + workspace = { + "repo_root": tmp_path / "repo", + "program_path": tmp_path / "repo" / "solution.py", + "results_path": tmp_path / "repo" / "results", + "metrics_path": tmp_path / "repo" / "results" / "metrics.json", + "eval_sessions_root": tmp_path / "eval_sessions", + } + workspace["repo_root"].mkdir(parents=True) + workspace["results_path"].mkdir(parents=True) + workspace["eval_sessions_root"].mkdir(parents=True) + workspace["program_path"].write_text("# Test program\nprint('Hello')\n") + return workspace + + +def make_mock_runner( + session_events: List[Dict[str, Any]], + include_metrics: bool = True, + metrics_data: Optional[Dict[str, Any]] = None, +) -> callable: + """Create a mock agent runner that yields events and optionally creates metrics.json.""" + + def mock_runner( + user_prompt: str, + system_prompt: str, + workdir: Path, + profile: str, + sandbox: bool, + approval_mode: str, + max_seconds: int, + max_events: int, + extra_cli_config: Dict[str, Any], + cli_path: Optional[str], + session_kind: str, + results_dir: Optional[str], + ) -> Iterator[Dict[str, Any]]: + """Mock runner that yields session events.""" + # Yield all session events + for event in session_events: + yield event + + # Optionally write metrics.json after all events + if include_metrics: + metrics_file = workdir / "results" / "metrics.json" + metrics_file.parent.mkdir(parents=True, exist_ok=True) + data = metrics_data or { + "combined_score": 85.0, + 
"correct": True, + "details": "Test passed successfully", + } + metrics_file.write_text(json.dumps(data)) + + return mock_runner + + +def test_agentic_evaluator_success(mock_config, temp_workspace): + """Test successful evaluation with metrics written.""" + # Create mock session events + session_events = [ + { + "type": "thread.init", + "thread_id": "test-thread-123", + "item": { + "type": "agent_message", + "text": "Starting evaluation", + }, + }, + { + "type": "thread.message", + "thread_id": "test-thread-123", + "item": { + "type": "command_execution", + "command": "python solution.py", + "status": "success", + "exit_code": 0, + "stdout": "Test output", + "stderr": "", + }, + }, + { + "type": "thread.message", + "thread_id": "test-thread-123", + "item": { + "type": "agent_message", + "text": "Evaluation complete, metrics written", + }, + }, + ] + + mock_runner = make_mock_runner(session_events) + evaluator = AgenticEvaluator(mock_config, codex_runner=mock_runner) + + result = evaluator.evaluate( + repo_root=temp_workspace["repo_root"], + eval_command=["python", "eval.py"], + program_path=temp_workspace["program_path"], + results_path=temp_workspace["results_path"], + metrics_path=temp_workspace["metrics_path"], + eval_sessions_root=temp_workspace["eval_sessions_root"], + task_name="test_task", + ) + + # Verify result structure + assert isinstance(result, AgenticEvaluatorResult) + assert result.correct is True + assert result.metrics["combined_score"] == 85.0 + assert result.metrics["details"] == "Test passed successfully" + assert result.error_message is None + assert result.session_id == "test-thread-123" + assert len(result.session_log) == 2 + assert len(result.commands_run) == 1 + assert result.commands_run[0].command == "python solution.py" + assert result.commands_run[0].exit_code == 0 + assert result.stdout_log == "Test output" + assert result.stderr_log == "" + assert result.elapsed_seconds > 0 + assert result.session_log_path.exists() + assert 
result.system_prompt is not None + assert result.user_prompt is not None + + +def test_agentic_evaluator_no_metrics(mock_config, temp_workspace): + """Test error when metrics.json not produced.""" + # Events that don't write metrics.json + session_events = [ + { + "type": "thread.init", + "thread_id": "test-thread-456", + "item": { + "type": "agent_message", + "text": "Evaluation started but failed", + }, + }, + ] + + mock_runner = make_mock_runner(session_events, include_metrics=False) + evaluator = AgenticEvaluator(mock_config, codex_runner=mock_runner) + + with pytest.raises(CodexExecutionError) as exc_info: + evaluator.evaluate( + repo_root=temp_workspace["repo_root"], + eval_command=["python", "eval.py"], + program_path=temp_workspace["program_path"], + results_path=temp_workspace["results_path"], + metrics_path=temp_workspace["metrics_path"], + eval_sessions_root=temp_workspace["eval_sessions_root"], + task_name="test_task", + ) + + assert "did not produce metrics" in str(exc_info.value) + assert str(temp_workspace["metrics_path"]) in str(exc_info.value) + + +def test_agentic_evaluator_malformed_json(mock_config, temp_workspace): + """Test handling of invalid JSON in metrics.json.""" + session_events = [ + { + "type": "thread.message", + "thread_id": "test-thread-789", + "item": { + "type": "agent_message", + "text": "Writing malformed metrics", + }, + }, + ] + + def mock_runner_with_bad_json(**kwargs) -> Iterator[Dict[str, Any]]: + for event in session_events: + yield event + # Write invalid JSON + metrics_file = kwargs["workdir"] / "results" / "metrics.json" + metrics_file.parent.mkdir(parents=True, exist_ok=True) + metrics_file.write_text("{invalid json content") + + evaluator = AgenticEvaluator(mock_config, codex_runner=mock_runner_with_bad_json) + + result = evaluator.evaluate( + repo_root=temp_workspace["repo_root"], + eval_command=["python", "eval.py"], + program_path=temp_workspace["program_path"], + results_path=temp_workspace["results_path"], + 
metrics_path=temp_workspace["metrics_path"], + eval_sessions_root=temp_workspace["eval_sessions_root"], + task_name="test_task", + ) + + # Should handle gracefully with error in metrics + assert "error" in result.metrics + assert "Invalid JSON in metrics" in result.metrics["error"] + assert result.metrics["combined_score"] == 0 + + +def test_agentic_evaluator_custom_eval_prompt(mock_config, temp_workspace): + """Test eval_prompt injection into user prompt.""" + custom_eval_prompt = """ + Check for the following: + - Code quality and readability + - Proper error handling + - Performance optimization + """ + + session_events = [ + { + "type": "thread.message", + "item": { + "type": "agent_message", + "text": "Evaluating with custom criteria", + }, + }, + ] + + mock_runner = make_mock_runner(session_events) + evaluator = AgenticEvaluator(mock_config, codex_runner=mock_runner) + + result = evaluator.evaluate( + repo_root=temp_workspace["repo_root"], + eval_command=["python", "eval.py"], + program_path=temp_workspace["program_path"], + results_path=temp_workspace["results_path"], + metrics_path=temp_workspace["metrics_path"], + eval_sessions_root=temp_workspace["eval_sessions_root"], + task_name="test_task", + eval_prompt=custom_eval_prompt, + ) + + # Verify custom prompt was included + assert result.user_prompt is not None + assert "Evaluation criteria:" in result.user_prompt + assert "Code quality and readability" in result.user_prompt + assert "Proper error handling" in result.user_prompt + + +def test_agentic_evaluator_no_command_mode(mock_config, temp_workspace): + """Test LLM-as-judge mode with no eval command.""" + session_events = [ + { + "type": "thread.message", + "item": { + "type": "agent_message", + "text": "Inspecting code directly", + }, + }, + ] + + mock_runner = make_mock_runner( + session_events, + metrics_data={ + "combined_score": 75.0, + "correct": True, + "details": "LLM judged the code as good", + }, + ) + evaluator = AgenticEvaluator(mock_config, 
codex_runner=mock_runner) + + result = evaluator.evaluate( + repo_root=temp_workspace["repo_root"], + eval_command=[], # Empty command = LLM-as-judge mode + program_path=temp_workspace["program_path"], + results_path=temp_workspace["results_path"], + metrics_path=temp_workspace["metrics_path"], + eval_sessions_root=temp_workspace["eval_sessions_root"], + task_name="test_task", + eval_prompt="Judge code quality", + ) + + # Verify no-command mode prompt + assert result.user_prompt is not None + assert "no script provided" in result.user_prompt.lower() + assert "Inspect the workspace/program" in result.user_prompt + assert "Judge the submission" in result.user_prompt + assert result.correct is True + assert result.metrics["combined_score"] == 75.0 + + +def test_build_prompt_with_eval_criteria(mock_config): + """Test prompt construction with evaluation criteria.""" + evaluator = AgenticEvaluator(mock_config) + + user_prompt, system_prompt = evaluator._build_prompt( + task_name="code_quality_check", + eval_command=["pytest", "tests/"], + program_path=Path("/repo/solution.py"), + results_path=Path("/repo/results"), + metrics_path=Path("/repo/results/metrics.json"), + eval_prompt="Focus on test coverage and code style", + max_score=100.0, + ) + + # Verify user prompt includes all components + assert "code_quality_check" in user_prompt + assert "pytest tests/" in user_prompt + assert "/repo/solution.py" in user_prompt + assert "/repo/results/metrics.json" in user_prompt + assert "Evaluation criteria:" in user_prompt + assert "Focus on test coverage and code style" in user_prompt + assert "Max score: 100.0" in user_prompt + + # Verify system prompt + assert "autonomous evaluator" in system_prompt.lower() + assert "metrics JSON file" in system_prompt + assert "combined_score" in system_prompt + + +def test_build_prompt_default(mock_config): + """Test default prompt construction without eval_prompt.""" + evaluator = AgenticEvaluator(mock_config) + + user_prompt, system_prompt 
= evaluator._build_prompt( + task_name="basic_test", + eval_command=["python", "test.py"], + program_path=Path("/repo/main.py"), + results_path=Path("/repo/out"), + metrics_path=Path("/repo/out/metrics.json"), + eval_prompt=None, + max_score=50.0, + ) + + # Verify no eval criteria section when none provided + assert "Evaluation criteria:" not in user_prompt + assert "basic_test" in user_prompt + assert "python test.py" in user_prompt + assert "Max score: 50.0" in user_prompt + + # System prompt should be present + assert system_prompt + assert "50.0" in system_prompt + + +def test_extract_session_id_from_events(mock_config, temp_workspace): + """Test session ID extraction from various event formats.""" + # Test with thread.init event + events_thread = [ + { + "type": "thread.init", + "thread_id": "thread-abc-123", + "item": {"type": "agent_message", "text": "Starting"}, + }, + ] + + mock_runner = make_mock_runner(events_thread) + evaluator = AgenticEvaluator(mock_config, codex_runner=mock_runner) + result = evaluator.evaluate( + repo_root=temp_workspace["repo_root"], + eval_command=["echo", "test"], + program_path=temp_workspace["program_path"], + results_path=temp_workspace["results_path"], + metrics_path=temp_workspace["metrics_path"], + eval_sessions_root=temp_workspace["eval_sessions_root"], + task_name="test", + ) + assert result.session_id == "thread-abc-123" + + # Test with direct session_id field + events_session = [ + { + "type": "custom", + "session_id": "session-xyz-456", + "item": {"type": "agent_message", "text": "Starting"}, + }, + ] + + mock_runner2 = make_mock_runner(events_session) + evaluator2 = AgenticEvaluator(mock_config, codex_runner=mock_runner2) + result2 = evaluator2.evaluate( + repo_root=temp_workspace["repo_root"], + eval_command=["echo", "test"], + program_path=temp_workspace["program_path"], + results_path=temp_workspace["results_path"], + metrics_path=temp_workspace["metrics_path"], + 
eval_sessions_root=temp_workspace["eval_sessions_root"], + task_name="test", + ) + assert result2.session_id == "session-xyz-456" + + # Test with nested session object + events_nested = [ + { + "type": "custom", + "session": {"id": "nested-session-789"}, + "item": {"type": "agent_message", "text": "Starting"}, + }, + ] + + mock_runner3 = make_mock_runner(events_nested) + evaluator3 = AgenticEvaluator(mock_config, codex_runner=mock_runner3) + result3 = evaluator3.evaluate( + repo_root=temp_workspace["repo_root"], + eval_command=["echo", "test"], + program_path=temp_workspace["program_path"], + results_path=temp_workspace["results_path"], + metrics_path=temp_workspace["metrics_path"], + eval_sessions_root=temp_workspace["eval_sessions_root"], + task_name="test", + ) + assert result3.session_id == "nested-session-789" + + +def test_agentic_evaluator_backward_compatibility_correct_json( + mock_config, temp_workspace +): + """Test backward compatibility with separate correct.json file.""" + session_events = [ + { + "type": "thread.message", + "item": {"type": "agent_message", "text": "Evaluation done"}, + }, + ] + + def mock_runner_with_legacy(**kwargs) -> Iterator[Dict[str, Any]]: + for event in session_events: + yield event + # Write old-style metrics without 'correct' field + metrics_file = kwargs["workdir"] / "results" / "metrics.json" + metrics_file.parent.mkdir(parents=True, exist_ok=True) + metrics_file.write_text( + json.dumps({"combined_score": 90.0, "details": "Legacy format"}) + ) + # Write separate correct.json + correct_file = kwargs["workdir"] / "results" / "correct.json" + correct_file.write_text(json.dumps({"correct": True})) + + evaluator = AgenticEvaluator(mock_config, codex_runner=mock_runner_with_legacy) + result = evaluator.evaluate( + repo_root=temp_workspace["repo_root"], + eval_command=["python", "eval.py"], + program_path=temp_workspace["program_path"], + results_path=temp_workspace["results_path"], + metrics_path=temp_workspace["metrics_path"], 
+ eval_sessions_root=temp_workspace["eval_sessions_root"], + task_name="legacy_test", + ) + + # Should read correct flag from correct.json + assert result.correct is True + assert result.error_message is None + assert result.metrics["combined_score"] == 90.0 + + +def test_agentic_evaluator_agent_runner_alias(mock_config, temp_workspace): + """Test agent_runner parameter alias for backward compatibility.""" + session_events = [ + { + "type": "thread.message", + "item": {"type": "agent_message", "text": "Using alias"}, + }, + ] + + mock_runner = make_mock_runner(session_events) + # Use agent_runner instead of codex_runner + evaluator = AgenticEvaluator(mock_config, agent_runner=mock_runner) + + result = evaluator.evaluate( + repo_root=temp_workspace["repo_root"], + eval_command=["echo", "test"], + program_path=temp_workspace["program_path"], + results_path=temp_workspace["results_path"], + metrics_path=temp_workspace["metrics_path"], + eval_sessions_root=temp_workspace["eval_sessions_root"], + task_name="alias_test", + ) + + assert result.metrics["combined_score"] == 85.0 + + +def test_agentic_evaluator_max_score_propagation(mock_config, temp_workspace): + """Test that max_score parameter is properly propagated to prompts.""" + session_events = [ + { + "type": "thread.message", + "item": {"type": "agent_message", "text": "Custom max score"}, + }, + ] + + mock_runner = make_mock_runner( + session_events, + metrics_data={"combined_score": 150.0, "correct": True, "details": "Excellent"}, + ) + evaluator = AgenticEvaluator(mock_config, codex_runner=mock_runner) + + result = evaluator.evaluate( + repo_root=temp_workspace["repo_root"], + eval_command=["python", "eval.py"], + program_path=temp_workspace["program_path"], + results_path=temp_workspace["results_path"], + metrics_path=temp_workspace["metrics_path"], + eval_sessions_root=temp_workspace["eval_sessions_root"], + task_name="custom_max_score_test", + max_score=200.0, + ) + + # Verify max_score in prompts + assert 
"200.0" in result.system_prompt + assert "200.0" in result.user_prompt + assert result.metrics["combined_score"] == 150.0 + + +def test_agentic_evaluator_session_log_persistence(mock_config, temp_workspace): + """Test that session logs are properly written to disk.""" + session_events = [ + { + "type": "thread.message", + "item": {"type": "agent_message", "text": "First message"}, + }, + { + "type": "thread.message", + "item": {"type": "agent_message", "text": "Second message"}, + }, + ] + + mock_runner = make_mock_runner(session_events) + evaluator = AgenticEvaluator(mock_config, codex_runner=mock_runner) + + result = evaluator.evaluate( + repo_root=temp_workspace["repo_root"], + eval_command=["echo", "test"], + program_path=temp_workspace["program_path"], + results_path=temp_workspace["results_path"], + metrics_path=temp_workspace["metrics_path"], + eval_sessions_root=temp_workspace["eval_sessions_root"], + task_name="log_test", + ) + + # Verify session log file exists and contains events + assert result.session_log_path.exists() + log_content = result.session_log_path.read_text() + assert log_content.count("\n") == len(session_events) # One line per event + # Verify JSONL format + for line in log_content.strip().split("\n"): + assert json.loads(line) # Should be valid JSON + + +def test_agentic_evaluator_evaluation_time_in_metrics(mock_config, temp_workspace): + """Test that evaluation_time_seconds is added to metrics.""" + session_events = [ + { + "type": "thread.message", + "item": {"type": "agent_message", "text": "Processing"}, + }, + ] + + mock_runner = make_mock_runner(session_events) + evaluator = AgenticEvaluator(mock_config, codex_runner=mock_runner) + + start = time.monotonic() + result = evaluator.evaluate( + repo_root=temp_workspace["repo_root"], + eval_command=["echo", "test"], + program_path=temp_workspace["program_path"], + results_path=temp_workspace["results_path"], + metrics_path=temp_workspace["metrics_path"], + 
eval_sessions_root=temp_workspace["eval_sessions_root"], + task_name="timing_test", + ) + elapsed = time.monotonic() - start + + # Verify evaluation_time_seconds is in metrics + assert "evaluation_time_seconds" in result.metrics + assert result.metrics["evaluation_time_seconds"] > 0 + assert result.metrics["evaluation_time_seconds"] <= elapsed + 0.1 # Small tolerance + assert result.elapsed_seconds == result.metrics["evaluation_time_seconds"] diff --git a/tests/test_codex_device_auth.py b/tests/test_codex_device_auth.py new file mode 100644 index 000000000..ca5e2683d --- /dev/null +++ b/tests/test_codex_device_auth.py @@ -0,0 +1,74 @@ +import subprocess +from pathlib import Path + +import pytest + +from shinka.edit.codex_cli import ( + CodexAuthError, + _ensure_codex_authenticated as ensure_codex_authenticated, +) + + +def test_ensure_codex_authenticated_noop_when_logged_in(monkeypatch): + calls = [] + + def fake_run(args, **kwargs): + calls.append((args, kwargs)) + if args[1:] == ["login", "status"]: + return subprocess.CompletedProcess(args, 0, stdout="Logged in", stderr="") + raise AssertionError(f"Unexpected call: {args}") + + monkeypatch.setattr(subprocess, "run", fake_run) + + method = ensure_codex_authenticated(Path("/bin/codex"), allow_interactive=False) + assert method == "status" + assert [args for args, _ in calls] == [[str(Path("/bin/codex")), "login", "status"]] + + +def test_ensure_codex_authenticated_uses_api_key_login(monkeypatch): + calls = [] + status_calls = {"count": 0} + + def fake_run(args, **kwargs): + calls.append((args, kwargs)) + if args[1:] == ["login", "status"]: + status_calls["count"] += 1 + if status_calls["count"] == 1: + return subprocess.CompletedProcess( + args, 1, stdout="", stderr="Not logged in" + ) + return subprocess.CompletedProcess(args, 0, stdout="Logged in", stderr="") + + if args[1:] == ["login", "--with-api-key"]: + assert kwargs.get("input", "").startswith("sk-test") + return subprocess.CompletedProcess(args, 0, 
stdout="", stderr="") + + raise AssertionError(f"Unexpected call: {args}") + + monkeypatch.setattr(subprocess, "run", fake_run) + + method = ensure_codex_authenticated( + Path("/bin/codex"), + api_key="sk-test", + allow_interactive=False, + ) + assert method == "api_key" + + called = [a for a, _ in calls] + assert called[0][1:] == ["login", "status"] + assert called[1][1:] == ["login", "--with-api-key"] + assert called[2][1:] == ["login", "status"] + + +def test_ensure_codex_authenticated_raises_when_noninteractive(monkeypatch): + def fake_run(args, **kwargs): + if args[1:] == ["login", "status"]: + return subprocess.CompletedProcess( + args, 1, stdout="", stderr="Not logged in" + ) + raise AssertionError(f"Unexpected call: {args}") + + monkeypatch.setattr(subprocess, "run", fake_run) + + with pytest.raises(CodexAuthError): + ensure_codex_authenticated(Path("/bin/codex"), allow_interactive=False) diff --git a/tests/test_credentials.py b/tests/test_credentials.py new file mode 100644 index 000000000..d25fb27f5 --- /dev/null +++ b/tests/test_credentials.py @@ -0,0 +1,24 @@ +import json + +from shinka.tools.credentials import get_api_key + + +def test_get_api_key_prefers_env(monkeypatch, tmp_path): + monkeypatch.setenv("OPENAI_API_KEY", "env-key") + credentials_path = tmp_path / "credentials.json" + credentials_path.write_text(json.dumps({"OPENAI_API_KEY": "file-key"})) + assert get_api_key("codex", credentials_path=credentials_path) == "env-key" + + +def test_get_api_key_from_credentials_env_var_name(monkeypatch, tmp_path): + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + credentials_path = tmp_path / "credentials.json" + credentials_path.write_text(json.dumps({"OPENAI_API_KEY": "file-key"})) + assert get_api_key("codex", credentials_path=credentials_path) == "file-key" + + +def test_get_api_key_from_credentials_provider_name(monkeypatch, tmp_path): + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + credentials_path = tmp_path / "credentials.json" + 
credentials_path.write_text(json.dumps({"codex": "file-key"})) + assert get_api_key("codex", credentials_path=credentials_path) == "file-key" diff --git a/tests/test_edit_base.py b/tests/test_edit_base.py index edc0e1178..67c6f2e20 100644 --- a/tests/test_edit_base.py +++ b/tests/test_edit_base.py @@ -161,6 +161,110 @@ def new_func2(): # Should have replaced both evolve blocks with new content +def test_apply_full_patch_full_file_without_markers_extracts_block_only(): + """Full-file patch without EVOLVE markers should not copy immutable code + into the evolve block; only the block payload is replaced.""" + original_content = """# Header line\n# EVOLVE-BLOCK-START\nold_line()\n# EVOLVE-BLOCK-END\n# Footer line\n""" + + # Patch is the entire file content but with the EVOLVE markers omitted. + patch_content = """```python +new_line() +another_new_line() +```""" + + expected = """# Header line +# EVOLVE-BLOCK-START +new_line() +another_new_line() +# EVOLVE-BLOCK-END +# Footer line +""" + + result = apply_full_patch( + patch_str=patch_content, + original_str=original_content, + language="python", + verbose=False, + ) + updated_content, num_applied, output_path, error, patch_txt, diff_path = result + + assert error is None + assert num_applied == 1 + assert updated_content == expected + + +def test_apply_full_patch_patch_with_start_marker_only(): + """Patch has only START marker; original has both markers.""" + original_content = """# Header line +# EVOLVE-BLOCK-START +old_line() +# EVOLVE-BLOCK-END +# Footer line +""" + + patch_content = """```python +# Header line +# EVOLVE-BLOCK-START +new_line() +# Footer line +```""" + + expected = """# Header line +# EVOLVE-BLOCK-START +new_line() +# EVOLVE-BLOCK-END +# Footer line +""" + + result = apply_full_patch( + patch_str=patch_content, + original_str=original_content, + language="python", + verbose=False, + ) + updated_content, num_applied, output_path, error, patch_txt, diff_path = result + + assert error is None + assert 
num_applied == 1 + assert updated_content == expected + + +def test_apply_full_patch_patch_with_end_marker_only(): + """Patch has only END marker; original has both markers.""" + original_content = """# Header line +# EVOLVE-BLOCK-START +old_line() +# EVOLVE-BLOCK-END +# Footer line +""" + + patch_content = """```python +# Header line +new_line() +# EVOLVE-BLOCK-END +# Footer line +```""" + + expected = """# Header line +# EVOLVE-BLOCK-START +new_line() +# EVOLVE-BLOCK-END +# Footer line +""" + + result = apply_full_patch( + patch_str=patch_content, + original_str=original_content, + language="python", + verbose=False, + ) + updated_content, num_applied, output_path, error, patch_txt, diff_path = result + + assert error is None + assert num_applied == 1 + assert updated_content == expected + + def test_apply_full_patch_no_evolve_blocks(): """Test apply_full_patch with no EVOLVE-BLOCK regions - should error.""" original_content = """# Just regular code @@ -221,6 +325,41 @@ def new_function(): assert updated_content == original_content # Should return original content +def test_apply_full_patch_patch_with_single_marker_ambiguous_multiple_regions(): + """Single marker in patch is ambiguous when original has multiple regions.""" + original_content = """# Header +# EVOLVE-BLOCK-START +func1() +# EVOLVE-BLOCK-END + +# EVOLVE-BLOCK-START +func2() +# EVOLVE-BLOCK-END +# Footer +""" + + # Patch includes only START marker + patch_content = """```python +# Header +# EVOLVE-BLOCK-START +new_code() +# Footer +```""" + + updated_content, num_applied, output_path, error, patch_txt, diff_path = ( + apply_full_patch( + patch_str=patch_content, + original_str=original_content, + language="python", + verbose=False, + ) + ) + + assert num_applied == 0 + assert error is not None + assert "only one EVOLVE-BLOCK marker" in error + + def test_apply_full_patch_invalid_extraction(): """Test apply_full_patch with invalid code extraction.""" original_content = """# EVOLVE-BLOCK-START diff 
# File: tests/test_shinka_agent.py (added as a new file by this patch).
"""Tests for shinka/edit/shinka_agent.py - Native agentic editing backend."""

import os
import shlex
import subprocess
from pathlib import Path
from typing import Any, Dict, List
from unittest.mock import Mock, patch

import pytest

from shinka.edit.shinka_agent import (
    ACTION_RE,
    MAX_OBSERVATION_CHARS,
    ShinkaExecutionError,
    ShinkaUnavailableError,
    _execute_bash,
    _truncate_output,
    ensure_shinka_available,
    run_shinka_task,
)
from shinka.llm.models.result import QueryResult


# Environment variables cleared by the "no key in the environment" tests.
_PROVIDER_ENV_VARS = (
    "OPENAI_API_KEY",
    "ANTHROPIC_API_KEY",
    "DEEPSEEK_API_KEY",
    "GOOGLE_API_KEY",
    "AWS_ACCESS_KEY_ID",
)

TERMINATION_SIGNAL = "COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT"


# ============================================================================
# Shared helpers
# ============================================================================


def _clear_provider_env(monkeypatch) -> None:
    """Remove every provider key from ``os.environ`` for the current test.

    ``monkeypatch.delenv(..., raising=False)`` records nothing to undo when the
    variable is already absent, so a value written later by the code under test
    would leak into subsequent tests. Setting the variable first registers an
    undo entry, guaranteeing the original state is restored at teardown.
    """
    for var in _PROVIDER_ENV_VARS:
        monkeypatch.setenv(var, "")
        monkeypatch.delenv(var)


def _make_response(
    content: str,
    *,
    input_tokens: int = 100,
    output_tokens: int = 50,
    cost: float = 0.01,
) -> QueryResult:
    """Build a ``QueryResult`` with the boilerplate fields these tests share."""
    return QueryResult(
        content=content,
        msg="test",
        system_msg="sys",
        new_msg_history=[],
        model_name="gpt-4",
        kwargs={},
        input_tokens=input_tokens,
        output_tokens=output_tokens,
        cost=cost,
    )


def _run_task(workdir: Path, user_prompt: str, **overrides: Any) -> List[Dict[str, Any]]:
    """Run ``run_shinka_task`` to completion and return all emitted events.

    Keyword ``overrides`` replace the default task settings below (or add
    extra keywords such as ``system_prompt``).
    """
    task_kwargs: Dict[str, Any] = {
        "profile": "gpt-4",
        "sandbox": "none",
        "approval_mode": "auto",
        "max_seconds": 60,
        "max_events": 10,
        "extra_cli_config": {},
    }
    task_kwargs.update(overrides)
    return list(
        run_shinka_task(user_prompt=user_prompt, workdir=workdir, **task_kwargs)
    )


def _events_of_type(events: List[Dict[str, Any]], event_type: str) -> List[Dict[str, Any]]:
    """Return the events whose ``type`` field equals ``event_type``."""
    return [event for event in events if event["type"] == event_type]


@pytest.fixture
def mock_llm(monkeypatch):
    """Yield a ``Mock`` that stands in for the agent's ``LLMClient`` instance.

    Also sets ``OPENAI_API_KEY`` so the availability check passes. Tests
    configure ``mock_llm.query`` (``return_value`` or ``side_effect``).
    """
    monkeypatch.setenv("OPENAI_API_KEY", "test-key")
    with patch("shinka.edit.shinka_agent.LLMClient") as llm_class:
        llm = Mock()
        llm.get_kwargs.return_value = {}
        llm_class.return_value = llm
        yield llm


# ============================================================================
# Core Functionality Tests - ensure_shinka_available
# ============================================================================


def test_ensure_shinka_available_with_env_var(monkeypatch):
    """Test that ensure_shinka_available returns True when env var is set."""
    monkeypatch.setenv("OPENAI_API_KEY", "test-key")
    assert ensure_shinka_available() is True


def test_ensure_shinka_available_with_credentials_file(monkeypatch):
    """Test that ensure_shinka_available returns True when credentials file has key."""
    _clear_provider_env(monkeypatch)

    # The function imports get_api_key inside, so we patch it at the source.
    with patch("shinka.tools.credentials.get_api_key") as mock_get_api_key:
        mock_get_api_key.return_value = "creds-file-key"
        result = ensure_shinka_available()

    assert result is True
    # Verify the key was exported into the environment.
    assert os.environ.get("OPENAI_API_KEY") == "creds-file-key"


def test_ensure_shinka_available_raises_when_none(monkeypatch):
    """Test that ensure_shinka_available raises when no keys are available."""
    _clear_provider_env(monkeypatch)

    # The function imports get_api_key inside, so we patch it at the source.
    with patch("shinka.tools.credentials.get_api_key") as mock_get_api_key:
        mock_get_api_key.return_value = None

        with pytest.raises(ShinkaUnavailableError) as exc_info:
            ensure_shinka_available()

    assert "No LLM API keys found" in str(exc_info.value)


# ============================================================================
# Bash Execution Tests - _execute_bash
# ============================================================================


def test_execute_bash_success(tmp_path):
    """Test successful bash command execution."""
    test_file = tmp_path / "test.txt"
    test_file.write_text("hello world")

    # Quote the path: temp directories may contain shell metacharacters.
    exit_code, stdout, stderr = _execute_bash(
        f"cat {shlex.quote(str(test_file))}", tmp_path
    )

    assert exit_code == 0
    assert "hello world" in stdout
    assert stderr == ""


def test_execute_bash_timeout(tmp_path, monkeypatch):
    """Test bash command timeout handling."""

    def mock_run(*args, **kwargs):
        # Simulate the subprocess exceeding its timeout without really waiting.
        raise subprocess.TimeoutExpired(cmd="sleep 1000", timeout=1)

    monkeypatch.setattr(subprocess, "run", mock_run)

    exit_code, stdout, stderr = _execute_bash("sleep 1000", tmp_path, timeout=1)

    assert exit_code == 1
    assert stdout == ""
    assert "timed out after 1s" in stderr


def test_execute_bash_nonzero_exit(tmp_path):
    """Test bash command with non-zero exit code."""
    # Run a command that will fail.
    exit_code, stdout, stderr = _execute_bash(
        "cat nonexistent_file_12345.txt", tmp_path
    )

    assert exit_code == 1
    assert "No such file or directory" in stderr or "cannot open" in stderr.lower()


# ============================================================================
# Agent Loop Tests - run_shinka_task with mocked LLM
# ============================================================================


def test_run_shinka_task_single_turn(tmp_path, mock_llm):
    """Test run_shinka_task with single turn: bash block then termination."""
    (tmp_path / "test.py").write_text("print('hello')")

    # Single response: bash command + termination signal.
    mock_llm.query.return_value = _make_response(
        "Let me read the file.\n```bash\ncat test.py\n```\n" + TERMINATION_SIGNAL
    )

    events = _run_task(tmp_path, "Read the file")

    # At minimum: the init event, the command execution, and the usage summary.
    assert len(events) >= 3
    assert events[0]["type"] == "init"
    assert events[-1]["type"] == "usage"

    # Check that the bash command was executed.
    command_events = _events_of_type(events, "command_execution")
    assert len(command_events) == 1
    assert "cat test.py" in command_events[0]["item"]["command"]
    assert "hello" in command_events[0]["item"]["stdout"]


def test_run_shinka_task_multi_turn(tmp_path, mock_llm):
    """Test run_shinka_task with multiple turns and observations."""
    (tmp_path / "test.py").write_text("x = 1")

    mock_llm.query.side_effect = [
        _make_response(
            "```bash\ncat test.py\n```",
            input_tokens=100,
            output_tokens=30,
            cost=0.005,
        ),
        _make_response(
            "```bash\necho 'y = 2' >> test.py\n```",
            input_tokens=150,
            output_tokens=40,
            cost=0.007,
        ),
        _make_response(
            "Done! " + TERMINATION_SIGNAL,
            input_tokens=180,
            output_tokens=20,
            cost=0.003,
        ),
    ]

    events = _run_task(tmp_path, "Modify the file", max_seconds=120)

    # Check that we got multiple command executions.
    assert len(_events_of_type(events, "command_execution")) == 2

    # Check that cost and token usage are summed across all three turns.
    usage_event = _events_of_type(events, "usage")[0]
    assert usage_event["usage"]["total_cost_usd"] == pytest.approx(0.015, rel=1e-5)
    assert usage_event["usage"]["input_tokens"] == 430
    assert usage_event["usage"]["output_tokens"] == 90


def test_run_shinka_task_termination_signal(tmp_path, mock_llm):
    """Test run_shinka_task properly handles COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT."""
    mock_llm.query.return_value = _make_response(
        "Task is complete. " + TERMINATION_SIGNAL,
        input_tokens=50,
        output_tokens=20,
        cost=0.002,
    )

    events = _run_task(tmp_path, "Do nothing")

    # Should terminate after the first message: exactly one agent message,
    # with no timeout / max-turns notices appended.
    agent_messages = _events_of_type(events, "agent_message")
    assert len(agent_messages) == 1
    assert TERMINATION_SIGNAL in agent_messages[0]["item"]["text"]


def test_run_shinka_task_max_events(tmp_path, mock_llm):
    """Test that run_shinka_task respects max_events limit."""
    # A response that never emits the termination signal.
    mock_llm.query.return_value = _make_response(
        "```bash\necho 'still working'\n```",
        input_tokens=100,
        output_tokens=30,
        cost=0.005,
    )

    events = _run_task(
        tmp_path,
        "Keep working",
        max_seconds=1000,
        max_events=3,  # Limit to 3 turns
    )

    # Exactly one agent message should report that the turn limit was reached.
    max_turn_messages = [
        message
        for message in _events_of_type(events, "agent_message")
        if "reached max turns" in message["item"]["text"]
    ]
    assert len(max_turn_messages) == 1


def test_run_shinka_task_empty_response(tmp_path, mock_llm):
    """Test handling when LLM returns None or empty response."""
    mock_llm.query.return_value = None

    events = _run_task(tmp_path, "Test empty")

    # Should surface exactly one error message about the empty response.
    error_messages = [
        message
        for message in _events_of_type(events, "agent_message")
        if "empty response" in message["item"]["text"]
    ]
    assert len(error_messages) == 1


def test_run_shinka_task_no_model_configured(tmp_path, monkeypatch):
    """Test that run_shinka_task raises error when no model is configured."""
    monkeypatch.setenv("OPENAI_API_KEY", "test-key")

    with pytest.raises(ShinkaExecutionError) as exc_info:
        # No profile, and the default extra_cli_config carries no model either.
        _run_task(tmp_path, "Test", profile=None)

    assert "No model configured" in str(exc_info.value)


# ============================================================================
# Utility Tests
# ============================================================================


def test_action_regex_extraction():
    """Test ACTION_RE regex extracts bash blocks correctly."""
    # Test single bash block
    text1 = "Let me run this command:\n```bash\necho 'hello'\n```\nDone!"
    match1 = ACTION_RE.search(text1)
    assert match1 is not None
    assert match1.group(1).strip() == "echo 'hello'"

    # Test multiline bash block
    text2 = """I'll do this:
```bash
cd /tmp
ls -la
pwd
```
That's it."""
    match2 = ACTION_RE.search(text2)
    assert match2 is not None
    extracted = match2.group(1).strip()
    assert "cd /tmp" in extracted
    assert "ls -la" in extracted
    assert "pwd" in extracted

    # Test no bash block
    text3 = "No commands here, just text."
    assert ACTION_RE.search(text3) is None

    # Test first bash block only (should ignore second)
    text4 = "```bash\nfirst\n```\nsome text\n```bash\nsecond\n```"
    match4 = ACTION_RE.search(text4)
    assert match4 is not None
    assert match4.group(1).strip() == "first"


def test_truncate_output():
    """Test _truncate_output respects max_chars limit."""
    # Short text - no truncation
    short_text = "short"
    assert _truncate_output(short_text, 100) == short_text

    # Long text - should truncate
    long_text = "a" * 20000
    truncated = _truncate_output(long_text, MAX_OBSERVATION_CHARS)

    assert len(truncated) < len(long_text)
    assert "truncated" in truncated
    # Both the head and the tail of the original text should survive.
    assert truncated.startswith("a" * 100)
    assert truncated.endswith("a" * 100)

    # Custom max_chars
    custom_truncated = _truncate_output(long_text, 1000)
    assert len(custom_truncated) < 1100  # Some overhead for truncation message
    assert "truncated" in custom_truncated

    # Edge case: exactly at limit
    exact_text = "x" * 100
    assert _truncate_output(exact_text, 100) == exact_text


# ============================================================================
# Integration-style Tests
# ============================================================================


def test_run_shinka_task_with_system_prompt(tmp_path, mock_llm):
    """Test that system_prompt is properly combined with base prompt."""
    mock_llm.query.return_value = _make_response(
        TERMINATION_SIGNAL,
        input_tokens=50,
        output_tokens=10,
        cost=0.001,
    )

    custom_system = "Custom instructions here."
    _run_task(tmp_path, "Test", system_prompt=custom_system)

    # Verify system_msg passed to query includes both custom and base prompts.
    system_msg_used = mock_llm.query.call_args.kwargs["system_msg"]
    assert custom_system in system_msg_used
    assert "You are an expert software engineer" in system_msg_used


def test_run_shinka_task_bash_then_termination(tmp_path, mock_llm):
    """Test that bash command is executed even when termination signal is present."""
    test_file = tmp_path / "output.txt"

    # Response with both a bash block and the termination signal. The path is
    # quoted because it is interpolated into a shell command line.
    mock_llm.query.return_value = _make_response(
        f"```bash\necho 'test' > {shlex.quote(str(test_file))}\n```\n"
        + TERMINATION_SIGNAL
    )

    events = _run_task(tmp_path, "Create file")

    # Verify bash was executed before the task terminated.
    assert len(_events_of_type(events, "command_execution")) == 1
    assert test_file.exists()
    assert test_file.read_text().strip() == "test"