From 1b4c179fea614643fdd0d54d822f3918119eda06 Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Thu, 25 Sep 2025 06:38:52 +0200
Subject: [PATCH 01/68] Update README.md with arxiv
---
README.md | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index 55f40d262..5929d744d 100644
--- a/README.md
+++ b/README.md
@@ -7,12 +7,12 @@
-
+
-`ShinkaEvolve` is a framework that combines Large Language Models (LLMs) with evolutionary algorithms to drive scientific discovery. By leveraging the creative capabilities of LLMs and the optimization power of evolutionary search, `ShinkaEvolve` enables automated exploration and improvement of scientific code. The system is inspired by the [AI Scientist](https://sakana.ai/ai-scientist/), [AlphaEvolve](https://deepmind.google/discover/blog/alphaevolve-a-gemini-powered-coding-agent-for-designing-advanced-algorithms/) and the [Darwin Goedel Machine](https://sakana.ai/dgm/): It maintains a population of programs that evolve over generations, with an ensemble of LLMs acting as intelligent mutation operators that suggest code improvements.
+[`ShinkaEvolve`](https://arxiv.org/abs/2509.19349) is a framework that combines Large Language Models (LLMs) with evolutionary algorithms to drive scientific discovery. By leveraging the creative capabilities of LLMs and the optimization power of evolutionary search, `ShinkaEvolve` enables automated exploration and improvement of scientific code. The system is inspired by the [AI Scientist](https://sakana.ai/ai-scientist/), [AlphaEvolve](https://deepmind.google/discover/blog/alphaevolve-a-gemini-powered-coding-agent-for-designing-advanced-algorithms/) and the [Darwin Goedel Machine](https://sakana.ai/dgm/): It maintains a population of programs that evolve over generations, with an ensemble of LLMs acting as intelligent mutation operators that suggest code improvements.
The framework supports **parallel evaluation of candidates** locally or on a Slurm cluster. It maintains an archive of successful solutions, enabling knowledge transfer between different evolutionary islands. `ShinkaEvolve` is particularly well-suited for scientific tasks where there is a verifier available and the goal is to optimize performance metrics while maintaining code correctness and readability.
@@ -313,4 +313,4 @@ If you use `ShinkaEvolve` in your research, please cite it as follows:
journal={arXiv preprint},
year={2025}
}
-```
\ No newline at end of file
+```
From 2fb7548ce032da3c24e0a34893c8feb5413795dd Mon Sep 17 00:00:00 2001
From: "takeru.fukushima" <100330935+takeruhukushima@users.noreply.github.com>
Date: Thu, 25 Sep 2025 16:33:57 +0900
Subject: [PATCH 02/68] add google gemini embedding model
---
examples/shinka_tutorial.ipynb | 15 ++++++++++
shinka/core/runner.py | 7 ++++-
shinka/database/dbase.py | 7 +++--
shinka/llm/embedding.py | 52 ++++++++++++++++++++++++++++++++--
4 files changed, 76 insertions(+), 5 deletions(-)
diff --git a/examples/shinka_tutorial.ipynb b/examples/shinka_tutorial.ipynb
index 66a71a073..c6d818994 100644
--- a/examples/shinka_tutorial.ipynb
+++ b/examples/shinka_tutorial.ipynb
@@ -237,6 +237,17 @@
"if not llm_models:\n",
" llm_models = [\"gpt-5-mini\"] # fallback if no keys detected\n",
"\n",
+ "# pick embedding model based on available keys\n",
+ "embedding_model_name = \"\"\n",
+ "if os.getenv(\"GEMINI_API_KEY\"):\n",
+ " embedding_model_name = \"gemini-embedding-001\"\n",
+ "elif os.getenv(\"OPENAI_API_KEY\"):\n",
+ " embedding_model_name = \"text-embedding-3-small\"\n",
+ "else:\n",
+ " embedding_model_name = \"text-embedding-3-small\"\n",
+ "print(f\"✅ Embedding model selected: {embedding_model_name}\")\n",
+ "\n",
+ "\n",
"# unique experiment directory\n",
"timestamp = dt.datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
"run_tag = f\"{timestamp}_weighted_fast\"\n",
@@ -271,6 +282,8 @@
" max_novelty_attempts=3,\n",
" # ensemble llm selection among candidates based on past performance\n",
" llm_dynamic_selection=None, # e.g. \"ucb1\"\n",
+ " # set embedding model\n",
+ " embedding_model=embedding_model_name,\n",
")\n",
"\n",
"db_config = DatabaseConfig(\n",
@@ -286,11 +299,13 @@
" enforce_island_separation=True,\n",
" parent_selection_strategy=\"weighted\",\n",
" parent_selection_lambda=10.0,\n",
+ " \n",
")\n",
"\n",
"job_config = LocalJobConfig(eval_program_path=\"evaluate.py\")\n",
"\n",
"print(\"llm_models:\", llm_models)\n",
+ "print(\"embedding_model:\", embedding_model_name)\n",
"print(\"results_dir:\", evo_config.results_dir)"
]
},
diff --git a/shinka/core/runner.py b/shinka/core/runner.py
index 3c818742c..c8c7c431c 100644
--- a/shinka/core/runner.py
+++ b/shinka/core/runner.py
@@ -158,7 +158,12 @@ def __init__(
# Initialize database and scheduler
db_config.db_path = str(db_path)
- self.db = ProgramDatabase(config=db_config)
+ embedding_model_to_use = (
+ evo_config.embedding_model or "text-embedding-3-small"
+ )
+ self.db = ProgramDatabase(
+ config=db_config, embedding_model=embedding_model_to_use
+ )
self.scheduler = JobScheduler(
job_type=evo_config.job_type,
config=job_config, # type: ignore
diff --git a/shinka/database/dbase.py b/shinka/database/dbase.py
index 69fdf5432..c6a2b89bf 100644
--- a/shinka/database/dbase.py
+++ b/shinka/database/dbase.py
@@ -82,6 +82,9 @@ class DatabaseConfig:
# Beam search parent selection parameters
num_beams: int = 5
+ # Embedding model name
+ embedding_model: str = "text-embedding-3-small"
+
def db_retry(max_retries=5, initial_delay=0.1, backoff_factor=2):
"""
@@ -248,12 +251,12 @@ class ProgramDatabase:
populations, and an archive of elites.
"""
- def __init__(self, config: DatabaseConfig, read_only: bool = False):
+ def __init__(self, config: DatabaseConfig,embedding_model: str = "text-embedding-3-small", read_only: bool = False):
self.config = config
self.conn: Optional[sqlite3.Connection] = None
self.cursor: Optional[sqlite3.Cursor] = None
self.read_only = read_only
- self.embedding_client = EmbeddingClient()
+ self.embedding_client = EmbeddingClient(model_name=embedding_model)
self.last_iteration: int = 0
self.best_program_id: Optional[str] = None
diff --git a/shinka/llm/embedding.py b/shinka/llm/embedding.py
index a5c6b07cc..1f2ad495f 100644
--- a/shinka/llm/embedding.py
+++ b/shinka/llm/embedding.py
@@ -1,5 +1,6 @@
import os
import openai
+import google.generativeai as genai
import pandas as pd
from typing import Union, List, Optional, Tuple
import numpy as np
@@ -20,13 +21,23 @@
"azure-text-embedding-3-large",
]
+GEMINI_EMBEDDING_MODELS = [
+ "gemini-embedding-exp-03-07",
+ "gemini-embedding-001",
+]
+
OPENAI_EMBEDDING_COSTS = {
"text-embedding-3-small": 0.02 / M,
"text-embedding-3-large": 0.13 / M,
}
+# Gemini embedding costs (approximate - check current pricing)
+GEMINI_EMBEDDING_COSTS = {
+ "gemini-embedding-exp-03-07": 0.0 / M, # Experimental model, often free
+ "gemini-embedding-001": 0.0 / M, # Check current pricing
+}
-def get_client_model(model_name: str) -> tuple[openai.OpenAI, str]:
+def get_client_model(model_name: str) -> tuple[Union[openai.OpenAI, str], str]:
if model_name in OPENAI_EMBEDDING_MODELS:
client = openai.OpenAI()
model_to_use = model_name
@@ -38,6 +49,14 @@ def get_client_model(model_name: str) -> tuple[openai.OpenAI, str]:
api_version=os.getenv("AZURE_API_VERSION"),
azure_endpoint=os.getenv("AZURE_API_ENDPOINT"),
)
+ elif model_name in GEMINI_EMBEDDING_MODELS:
+ # Configure Gemini API
+ api_key = os.getenv("GOOGLE_API_KEY")
+ if not api_key:
+ raise ValueError("GOOGLE_API_KEY environment variable not set for Gemini models")
+ genai.configure(api_key=api_key)
+ client = "gemini" # Use string identifier for Gemini
+ model_to_use = model_name
else:
raise ValueError(f"Invalid embedding model: {model_name}")
@@ -52,9 +71,10 @@ def __init__(
Initialize the EmbeddingClient.
Args:
- model (str): The OpenAI embedding model name to use.
+ model (str): The OpenAI, Azure, or Gemini embedding model name to use.
"""
self.client, self.model = get_client_model(model_name)
+ self.model_name = model_name
self.verbose = verbose
def get_embedding(
@@ -76,6 +96,34 @@ def get_embedding(
single_code = True
else:
single_code = False
+ # Handle Gemini models
+ if self.model_name in GEMINI_EMBEDDING_MODELS:
+ try:
+ embeddings = []
+ total_tokens = 0
+
+ for text in code:
+ result = genai.embed_content(
+ model=f"models/{self.model}",
+ content=text,
+ task_type="retrieval_document"
+ )
+ embeddings.append(result['embedding'])
+ total_tokens += len(text.split())
+
+ cost = total_tokens * GEMINI_EMBEDDING_COSTS.get(self.model, 0.0)
+
+ if single_code:
+ return embeddings[0] if embeddings else [], cost
+ else:
+ return embeddings, cost
+ except Exception as e:
+ logger.error(f"Error getting Gemini embedding: {e}")
+ if single_code:
+ return [], 0.0
+ else:
+ return [[]], 0.0
+ # Handle OpenAI and Azure models (same interface)
try:
response = self.client.embeddings.create(
model=self.model, input=code, encoding_format="float"
From 27af71c2db24c3ebba14d9ac7f0f6e9aee2aff7f Mon Sep 17 00:00:00 2001
From: Dixing Xu
Date: Thu, 25 Sep 2025 18:13:56 +0800
Subject: [PATCH 03/68] fix: Fix database summary when patch_name metadata is
missing
---
shinka/database/display.py | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/shinka/database/display.py b/shinka/database/display.py
index 4c34d3445..3e55439bf 100644
--- a/shinka/database/display.py
+++ b/shinka/database/display.py
@@ -122,6 +122,18 @@ def print_program_summary(self, program, console: Optional[RichConsole] = None):
else:
time_display = f"{time_val:.1f}s"
+ # Safely extract metadata fields for display
+ metadata = program.metadata or {}
+ patch_name_raw = metadata.get("patch_name", "[dim]N/A[/dim]")
+ if patch_name_raw is None:
+ patch_name_raw = "[dim]N/A[/dim]"
+ patch_name = str(patch_name_raw)[:30]
+
+ patch_type_raw = metadata.get("patch_type", "[dim]N/A[/dim]")
+ if patch_type_raw is None:
+ patch_type_raw = "[dim]N/A[/dim]"
+ patch_type = str(patch_type_raw)
+
# Add the data row
island_display = (
f"I-{program.island_idx}" if program.island_idx is not None else "N/A"
@@ -131,8 +143,8 @@ def print_program_summary(self, program, console: Optional[RichConsole] = None):
island_display,
status_display,
score_display,
- program.metadata.get("patch_name", "[dim]N/A[/dim]")[:30],
- program.metadata.get("patch_type", "[dim]N/A[/dim]"),
+ patch_name,
+ patch_type,
f"{program.complexity:.1f}",
cost_display,
time_display,
From 9586cdbe7025537ffa9f22b641cc2aa3f95cddc7 Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Fri, 26 Sep 2025 09:32:04 +0200
Subject: [PATCH 04/68] Update README.md
---
README.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 5929d744d..0098c7556 100644
--- a/README.md
+++ b/README.md
@@ -52,9 +52,9 @@ For detailed installation instructions and usage examples, see the [Getting Star
| Example | Description | Environment Setup |
|---------|-------------|-------------------|
| ⭕ [Circle Packing](examples/circle_packing) | Optimize circle packing to maximize radii. | `LocalJobConfig` |
-| 🤖 [Agent Design](examples/agent_design) | Design agent scaffolds for math tasks. | `LocalJobConfig` |
+| 🤖 [Agent Design](examples/adas_aime) | Design agent scaffolds for math tasks. | `LocalJobConfig` |
| 🎯 [ALE-Bench](examples/ale_bench) | Code optimization for ALE-Bench tasks. | `LocalJobConfig` |
-| ✨ [Novelty Generator](examples/novelty_generator_bck) | Generate creative, surprising outputs (e.g., ASCII art). | `LocalJobConfig` |
+| ✨ [Novelty Generator](examples/novelty_generator) | Generate creative, surprising outputs (e.g., ASCII art). | `LocalJobConfig` |
## `shinka` Run with Python API 🐍
From a60bc9e4782ee77a5684841a6252c87ece6fe562 Mon Sep 17 00:00:00 2001
From: Koki-Kazaore
Date: Sun, 28 Sep 2025 19:12:28 +0900
Subject: [PATCH 05/68] docs: change repo name on the onboarding doc
---
docs/getting_started.md | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/docs/getting_started.md b/docs/getting_started.md
index 234158839..a866c011f 100644
--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -53,7 +53,7 @@ pip install uv
```bash
git clone
-cd shinka
+cd ShinkaEvolve
# Create virtual environment with Python 3.11
uv venv --python 3.11
@@ -79,7 +79,7 @@ conda activate shinka
```bash
git clone
-cd shinka
+cd ShinkaEvolve
pip install -e .
```
@@ -249,7 +249,7 @@ from shinka.core import run_shinka_eval
def main(program_path: str, results_dir: str):
"""Main evaluation function called by Shinka"""
-
+
metrics, correct, error_msg = run_shinka_eval(
program_path=program_path,
results_dir=results_dir,
@@ -268,11 +268,11 @@ def main(program_path: str, results_dir: str):
def validate_packing(run_output):
"""Returns (is_valid: bool, error_msg: str or None)"""
centers, radii, reported_sum = run_output
-
+
# Check constraints (bounds, overlaps, etc.)
if constraint_violated:
return False, "Specific error description"
-
+
return True, None # Valid solution
```
@@ -280,10 +280,10 @@ def validate_packing(run_output):
```python
def aggregate_metrics(results, results_dir):
"""Returns metrics dictionary with required structure"""
-
+
# Extract data from results
centers, radii, reported_sum = results[0]
-
+
return {
"combined_score": float(reported_sum), # PRIMARY FITNESS (higher = better)
"public": { # Visible in WebUI/logs
From 00035528af09a03b36d42a4e276f9f61c3e124d7 Mon Sep 17 00:00:00 2001
From: Edoardo Cetin <32273096+Aladoro@users.noreply.github.com>
Date: Sun, 28 Sep 2025 20:47:42 +0900
Subject: [PATCH 06/68] Update README
---
README.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 0098c7556..b0dba5f7d 100644
--- a/README.md
+++ b/README.md
@@ -308,9 +308,9 @@ If you use `ShinkaEvolve` in your research, please cite it as follows:
```
@article{lange2025shinka,
- title={ShinkaEvolve: Towards Open-Ended and Sample-Efficient Program Evolution},
+ title={ShinkaEvolve: Towards Open-Ended And Sample-Efficient Program Evolution},
author={Lange, Robert Tjarko and Imajuku, Yuki and Cetin, Edoardo},
- journal={arXiv preprint},
+ journal={arXiv preprint arXiv:2509.19349},
year={2025}
}
```
From be2e2037c90a6cf081d9a8eb38e2ccedd48e6211 Mon Sep 17 00:00:00 2001
From: vicruz99
Date: Sun, 12 Oct 2025 14:55:07 +0100
Subject: [PATCH 07/68] Added a doc explaining how to add support for a local
LLM and embedding model
---
docs/support_local_llm.md | 232 ++++++++++++++++++++++++++++++++++++++
1 file changed, 232 insertions(+)
create mode 100644 docs/support_local_llm.md
diff --git a/docs/support_local_llm.md b/docs/support_local_llm.md
new file mode 100644
index 000000000..5f406e7b9
--- /dev/null
+++ b/docs/support_local_llm.md
@@ -0,0 +1,232 @@
+
+# 🧩 Integrating Local LLMs into **ShinkaEvolve**
+
+## 🧠 Overview
+
+The original **ShinkaEvolve** code does **not** include built-in support for running **local LLMs**.
+To enable this functionality, parts of the codebase can be modified to integrate locally hosted models.
+
+---
+
+## 🏗️ Code Organization
+
+**ShinkaEvolve** uses a **modular architecture** that supports multiple **LLM providers**.
+The relevant code for LLM interaction is located in the **`shinka/llm/`** folder, which manages all model communications.
+ShinkaEvolve distinguishes between two LLM types:
+
+* **Regular LLMs**
+* **Embedding LLMs**
+
+---
+
+## ⚙️ Adding a Regular LLM
+
+To add support for a **regular LLM**, follow the steps below. They use gpt-oss models served with Unsloth as a worked example; Unsloth exposes an API compatible with the OpenAI API (`v1/completions`).
+This LLM can then be specified in the configuration variables:
+
+```yaml
+llm_models:
+meta_llm_models:
+```
+
+---
+
+### 🔧 Step 1: Modify the Client
+
+The file **`client.py`** is responsible for creating clients that interact with LLMs.
+Each client instance is later used to query a specific model.
+
+To add a local model, introduce a new client configuration.
+The API URL is extracted from the model name, which follows this format:
+
+```
+local-gptoss-unsloth-url
+```
+
+#### Example
+
+```python
+elif "local-gptoss-unsloth" in model_name:
+ # Extract URL from model name
+ pattern = r"https?://"
+ match = re.search(pattern, model_name)
+ if match:
+ start_index = match.start()
+ url = model_name[start_index:]
+ else:
+ raise ValueError(f"Invalid URL in model name: {model_name}")
+
+ # Create OpenAI-compatible client
+ client = openai.OpenAI(
+ api_key="filler",
+ base_url=url
+ )
+
+ # Structured output mode (if required)
+ if structured_output:
+ client = instructor.from_openai(
+ client,
+ mode=instructor.Mode.JSON,
+ )
+```
+
+---
+
+### 📁 Step 2: Create the Local Query Function
+
+Inside the **`models/`** folder, create a new subfolder to store the query functions for your local models:
+
+```
+shinka/llm/models/local/
+```
+
+> Don’t forget to include an empty `__init__.py` file.
+
+This folder should contain a **custom query function** for the local model. In this example the file is named `local_gptoss_unsloth.py`.
+It should follow the same structure as the other query functions in `shinka/llm/models/`, but with small adjustments.
+
+#### My Key Adjustments
+
+* Replace `max_output_tokens` with **`max_tokens`** to match the local API.
+* Extract additional response metadata such as:
+
+ * `total_tokens`
+ * `thinking_tokens` (if your model includes reasoning traces)
+
+This function is later imported and registered in **`query.py`**.
+
+---
+
+### 🧩 Step 3: Update `__init__.py`
+
+Configure **`__init__.py`** to include and expose the new local query function, so it can be imported elsewhere.
+
+```python
+from .local.local_gptoss_unsloth import query_local_gptoss_unsloth # ADDED THIS LINE
+from .result import QueryResult
+
+__all__ = [
+ "query_anthropic",
+ "query_openai",
+ "query_deepseek",
+ "query_gemini",
+ "query_local_gptoss_unsloth", # ADDED THIS LINE
+ "QueryResult",
+]
+```
+
+---
+
+### 💬 Step 4: Update `query.py`
+
+Import and register the new local query function in query.py.
+
+#### Imports
+
+```python
+from .models import (
+ query_anthropic,
+ query_openai,
+ query_deepseek,
+ query_gemini,
+ query_local_gptoss_unsloth, # ADDED THIS LINE
+ QueryResult,
+)
+```
+
+#### Model Selection Logic
+
+```python
+elif "local-gptoss-unsloth" in model_name: # ADDED THIS LINE
+ query_fn = query_local_gptoss_unsloth
+```
+
+---
+
+### 🧠 Step 5: Other Observations
+
+The file **`query.py`** also defines functions such as:
+
+* `sample_model_kwargs`
+* `sample_batch_kwargs`
+
+However, these are **not referenced anywhere else** in the repository, so no modifications are required here for now.
+
+---
+
+### ✅ Summary
+
+| Step | File | Change | Description |
+| ---- | -------------------------------------------- | -------------------- | -------------------------------------------------------- |
+| 1 | `client.py` | Add new client block | Create OpenAI-compatible client for local LLM |
+| 2 | `models/local/local_gptoss_unsloth.py` | New function | Query local model, adjust tokens, extract reasoning info |
+| 3 | `__init__.py` | Add import | Expose new query function |
+| 4 | `query.py` | Register model | Add conditional for local LLM |
+| 5 | — | Review only | Ignored unused functions |
+
+---
+
+## 🧬 Adding a Local Embedding Model
+
+For embedding models, you can use **Ollama**, which follows the **OpenAI API** format.
+The only relevant file is **`embedding.py`**.
+
+### Code Addition
+
+```python
+elif model_name.startswith("local-"):
+ # Pattern: local-(model-name)-(http or https url)
+ match = re.match(r"local-(.+?)-(https?://.+)", model_name)
+ if match:
+ model_to_use = match.group(1)
+ url = match.group(2)
+ else:
+ raise ValueError(f"Invalid local model format: {model_name}")
+
+ client = openai.OpenAI(
+ base_url=url,
+ api_key="filler"
+ )
+```
+
+#### Notes
+
+* Compatible with **any Ollama model**.
+* The model name must follow this convention:
+
+ ```
+ local-model-name-url
+ ```
+* The code extracts both `model-name` and `url`, and uses them to query Ollama.
+
+---
+
+### Query Logic
+
+The existing line in **`embedding.py`** remains unchanged:
+
+```python
+response = self.client.embeddings.create(
+ model=self.model,
+ input=code,
+ encoding_format="float"
+)
+```
+
+For local embedding models, `self.model` corresponds to the extracted model name.
+The only addition to the **Embedding Client** class:
+
+```python
+elif self.model_name.startswith("local-"):
+ cost = 0.0
+```
+
+---
+
+## 🚀 Result
+
+ShinkaEvolve can now connect to **locally hosted LLMs** and **embedding models** through **OpenAI-compatible APIs**.
+This setup supports **Ollama** as well as other OpenAI-compatible servers, such as **gpt-oss** models served with **Unsloth**.
+
+If your model has different requirements, follow the same pattern with a distinct model identifier and your own custom logic.
+
From bf0c1d47576f5cb34870a9bad26592e50b3eb4cc Mon Sep 17 00:00:00 2001
From: LiaCastaneda
Date: Mon, 13 Oct 2025 11:04:22 +0200
Subject: [PATCH 08/68] Add rust to supported languages
---
shinka/core/runner.py | 9 ++++++---
shinka/database/complexity.py | 4 ++--
shinka/edit/apply_diff.py | 4 +++-
shinka/edit/apply_full.py | 4 +++-
shinka/edit/async_apply.py | 26 ++++++++++++++++++++++++++
5 files changed, 40 insertions(+), 7 deletions(-)
diff --git a/shinka/core/runner.py b/shinka/core/runner.py
index 3c818742c..37b876d00 100644
--- a/shinka/core/runner.py
+++ b/shinka/core/runner.py
@@ -231,6 +231,8 @@ def __init__(
self.lang_ext = "cpp"
elif self.evo_config.language == "python":
self.lang_ext = "py"
+ elif self.evo_config.language == "rust":
+ self.lang_ext = "rs"
else:
msg = f"Language {self.evo_config.language} not supported"
raise ValueError(msg)
@@ -1096,9 +1098,10 @@ def run_patch(
# error_attempt is already set from apply_patch or default
pass
- # Only consider the diff summary for the original.py file!!!
- if "original.py" in diff_summary:
- diff_summary = diff_summary["original.py"]
+ # Only consider the diff summary for the original source file
+ original_filename = f"original.{self.lang_ext}"
+ if original_filename in diff_summary:
+ diff_summary = diff_summary[original_filename]
meta_edit_data = {
"patch_type": patch_type,
diff --git a/shinka/database/complexity.py b/shinka/database/complexity.py
index 4116567e9..933d7f4e6 100644
--- a/shinka/database/complexity.py
+++ b/shinka/database/complexity.py
@@ -259,8 +259,8 @@ def analyze_code_metrics(code_string, language="python"):
# If Python parsing fails, fall back to C++ analysis
return analyze_cpp_complexity(code_string)
- # For C/C++/CUDA and other languages, use regex-based analysis
- elif language in ["cpp", "c", "cuda", "c++"]:
+ # For C/C++/CUDA/Rust and other languages, use regex-based analysis
+ elif language in ["cpp", "c", "cuda", "c++", "rust"]:
return analyze_cpp_complexity(code_string)
# For unknown languages, use simple line-based complexity
diff --git a/shinka/edit/apply_diff.py b/shinka/edit/apply_diff.py
index ead28e231..4b5f29148 100644
--- a/shinka/edit/apply_diff.py
+++ b/shinka/edit/apply_diff.py
@@ -698,7 +698,7 @@ def apply_diff_patch(
patch_str = _strip_trailing_whitespace(patch_str)
# Remove the EVOLVE-BLOCK START and EVOLVE-BLOCK END markers
- if language in ["cuda", "cpp"]:
+ if language in ["cuda", "cpp", "rust"]:
patch_str = re.sub(r"// EVOLVE-BLOCK START\\n", "", patch_str)
patch_str = re.sub(r"// EVOLVE-BLOCK END\\n", "", patch_str)
elif language == "python":
@@ -730,6 +730,8 @@ def apply_diff_patch(
suffix = ".cpp"
elif language == "cuda":
suffix = ".cu"
+ elif language == "rust":
+ suffix = ".rs"
else:
raise ValueError(f"Language {language} not supported")
diff --git a/shinka/edit/apply_full.py b/shinka/edit/apply_full.py
index b7e2e2b37..9b14f21ee 100644
--- a/shinka/edit/apply_full.py
+++ b/shinka/edit/apply_full.py
@@ -102,7 +102,7 @@ def apply_full_patch(
# We need to find the actual start of the comment line
if language == "python":
end_marker = "# EVOLVE-BLOCK-END"
- elif language in ["cuda", "cpp"]:
+ elif language in ["cuda", "cpp", "rust"]:
end_marker = "// EVOLVE-BLOCK-END"
else:
end_marker = "# EVOLVE-BLOCK-END" # Default fallback
@@ -146,6 +146,8 @@ def apply_full_patch(
suffix = ".cpp"
elif language == "cuda":
suffix = ".cu"
+ elif language == "rust":
+ suffix = ".rs"
else:
raise ValueError(f"Language {language} not supported")
diff --git a/shinka/edit/async_apply.py b/shinka/edit/async_apply.py
index 8e542c565..4ffd15bed 100644
--- a/shinka/edit/async_apply.py
+++ b/shinka/edit/async_apply.py
@@ -118,6 +118,32 @@ async def validate_code_async(
error_msg = stderr.decode() if stderr else "Unknown compilation error"
return False, error_msg
+ elif language == "rust":
+ # Use rustc for Rust syntax checking
+ proc = await asyncio.create_subprocess_exec(
+ "rustc",
+ "--crate-type=lib",
+ "-Zparse-only",
+ code_path,
+ stdout=asyncio.subprocess.PIPE,
+ stderr=asyncio.subprocess.PIPE,
+ )
+
+ try:
+ stdout, stderr = await asyncio.wait_for(
+ proc.communicate(), timeout=timeout
+ )
+ except asyncio.TimeoutError:
+ proc.kill()
+ await proc.wait()
+ return False, f"Validation timeout after {timeout}s"
+
+ if proc.returncode == 0:
+ return True, None
+ else:
+ error_msg = stderr.decode() if stderr else "Unknown compilation error"
+ return False, error_msg
+
elif language == "cpp":
# Use g++ for C++ compilation check
proc = await asyncio.create_subprocess_exec(
From 77d1819454673d0f007f5f9044e87475a1b56a14 Mon Sep 17 00:00:00 2001
From: Takuya Akiba
Date: Tue, 14 Oct 2025 23:44:38 +0900
Subject: [PATCH 09/68] Ensure setuptools discovers subpackages
---
pyproject.toml | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/pyproject.toml b/pyproject.toml
index e3ec455af..f05429b60 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,9 +48,11 @@ dependencies = [
]
[tool.setuptools]
-packages = ["shinka"]
script-files = ["shinka/shinka_launch", "shinka/shinka_visualize"]
+[tool.setuptools.packages.find]
+include = ["shinka", "shinka.*"]
+
[tool.setuptools.package-data]
"*" = ["*"]
From 929f072e7879852893b959aa4079d903c27aa76f Mon Sep 17 00:00:00 2001
From: Takuya Akiba
Date: Tue, 14 Oct 2025 23:44:59 +0900
Subject: [PATCH 10/68] Mark shinka.webui as a package
---
shinka/webui/__init__.py | 0
1 file changed, 0 insertions(+), 0 deletions(-)
create mode 100644 shinka/webui/__init__.py
diff --git a/shinka/webui/__init__.py b/shinka/webui/__init__.py
new file mode 100644
index 000000000..e69de29bb
From 23ace365b4123f6369b98b0bcc5a853984c7da72 Mon Sep 17 00:00:00 2001
From: 51616
Date: Fri, 24 Oct 2025 13:28:16 +0000
Subject: [PATCH 11/68] fix apply_full.py when the patch has incomplete (0,1)
markers instead of expected 2 (end and start) markers
---
shinka/edit/apply_full.py | 174 +++++++++++++++++++++++++++++++-------
tests/test_edit_base.py | 139 ++++++++++++++++++++++++++++++
2 files changed, 284 insertions(+), 29 deletions(-)
diff --git a/shinka/edit/apply_full.py b/shinka/edit/apply_full.py
index b7e2e2b37..e0b76c892 100644
--- a/shinka/edit/apply_full.py
+++ b/shinka/edit/apply_full.py
@@ -1,6 +1,6 @@
from pathlib import Path
from typing import Optional, Union
-from .apply_diff import write_git_diff, _mutable_ranges
+from .apply_diff import write_git_diff, _mutable_ranges, EVOLVE_START, EVOLVE_END
from shinka.llm import extract_between
import logging
@@ -72,10 +72,15 @@ def apply_full_patch(
updated_content = ""
last_end = 0
- # Check if patch_code contains EVOLVE-BLOCK markers
- patch_mutable_ranges = _mutable_ranges(patch_code)
+ # Detect EVOLVE markers presence in the patch content
+ patch_has_start = EVOLVE_START.search(patch_code) is not None
+ patch_has_end = EVOLVE_END.search(patch_code) is not None
+ patch_has_both = patch_has_start and patch_has_end
+ patch_has_none = not patch_has_start and not patch_has_end
- if patch_mutable_ranges:
+ if patch_has_both:
+ # Patch contains both EVOLVE-BLOCK markers, extract from them
+ patch_mutable_ranges = _mutable_ranges(patch_code)
# Patch contains EVOLVE-BLOCK markers, extract from them
for i, (start, end) in enumerate(mutable_ranges):
# Add immutable part before this mutable range
@@ -91,47 +96,158 @@ def apply_full_patch(
updated_content += replacement_content
last_end = end
- else:
+ elif patch_has_none:
# Patch doesn't contain EVOLVE-BLOCK markers
# Assume entire patch content should replace all mutable regions
if len(mutable_ranges) == 1:
- # Single mutable region, replace with entire patch content
+ # Single mutable region. If the patch appears to be a full-file
+ # rewrite that omitted EVOLVE markers, safely extract only the
+ # content intended for the evolve block by matching immutable
+ # prefix/suffix from the original file.
start, end = mutable_ranges[0]
- # The mutable range ends before "EVOLVE-BLOCK-END" text
- # We need to find the actual start of the comment line
- if language == "python":
- end_marker = "# EVOLVE-BLOCK-END"
- elif language in ["cuda", "cpp"]:
- end_marker = "// EVOLVE-BLOCK-END"
- else:
- end_marker = "# EVOLVE-BLOCK-END" # Default fallback
-
- end_marker_pos = original.find(end_marker, end - 5)
- if end_marker_pos == -1:
- # Fallback: use the original end position
- end_marker_pos = end
+ # Immutable portions that remain outside the evolve block
+ immutable_prefix = original[:start]
+ immutable_suffix = original[end:]
- # Ensure proper newline handling around the patch content
- if patch_code and not patch_code.startswith("\n"):
- patch_code = "\n" + patch_code
+ # Also compute the portions strictly outside the marker lines
+ # to detect full-file patches that omitted EVOLVE markers.
+ # Find the start and end marker line boundaries.
+ start_match = None
+ end_match = None
+ for m in EVOLVE_START.finditer(original):
+ if m.end() == start:
+ start_match = m
+ break
+ for m in EVOLVE_END.finditer(original):
+ if m.start() == end:
+ end_match = m
+ break
- if patch_code and not patch_code.endswith("\n"):
- patch_code = patch_code + "\n"
-
- updated_content = (
- original[:start] + patch_code + original[end_marker_pos:]
+ prefix_outside = (
+ original[: start_match.start()] if start_match else immutable_prefix
+ )
+ suffix_outside = (
+ original[end_match.end() :] if end_match else immutable_suffix
)
+
+ # Heuristic: if patch includes the same immutable prefix/suffix
+ # outside the markers, treat the middle part as the evolve-block
+ # replacement. Be tolerant to a missing trailing newline in the
+ # footer by checking both versions.
+ suffix_opts = (suffix_outside, suffix_outside.rstrip("\r\n"))
+ if patch_code.startswith(prefix_outside) and any(
+ patch_code.endswith(sfx) for sfx in suffix_opts
+ ):
+ mid_start = len(prefix_outside)
+ # choose the matching suffix option to compute end
+ sfx = next(sfx for sfx in suffix_opts if patch_code.endswith(sfx))
+ mid_end = len(patch_code) - len(sfx)
+ replacement_content = patch_code[mid_start:mid_end]
+ # Ensure marker boundaries stay on their own lines.
+ # Add a leading newline only if there is a START marker.
+ if (
+ start_match is not None
+ and replacement_content
+ and not replacement_content.startswith("\n")
+ ):
+ replacement_content = "\n" + replacement_content
+ # Add a trailing newline only if there is an END marker.
+ if (
+ end_match is not None
+ and replacement_content
+ and not replacement_content.endswith("\n")
+ ):
+ replacement_content = replacement_content + "\n"
+ updated_content = (
+ immutable_prefix + replacement_content + immutable_suffix
+ )
+ else:
+ # Otherwise, assume the patch_code represents only the
+ # evolve-block payload and insert it directly between markers.
+ # Ensure proper newline handling around the patch content.
+ payload = patch_code
+ if (
+ start_match is not None
+ and payload
+ and not payload.startswith("\n")
+ ):
+ payload = "\n" + payload
+ if end_match is not None and payload and not payload.endswith("\n"):
+ payload = payload + "\n"
+ updated_content = immutable_prefix + payload + immutable_suffix
else:
- # Multiple mutable regions, this is ambiguous
+ # Multiple EVOLVE-BLOCK regions found, ambiguous without markers
error_message = (
"Multiple EVOLVE-BLOCK regions found but patch "
"doesn't specify which to replace"
)
return original, 0, None, error_message, None, None
+ else:
+ # Patch contains exactly one marker (START xor END).
+ # Only safe to apply when original has a single evolve region.
+ if len(mutable_ranges) != 1:
+ error_message = (
+ "Patch contains only one EVOLVE-BLOCK marker, but the original "
+ f"has {len(mutable_ranges)} editable regions; cannot determine target"
+ )
+ return original, 0, None, error_message, None, None
+
+ # Single target region in original
+ start, end = mutable_ranges[0]
+ immutable_prefix = original[:start]
+ immutable_suffix = original[end:]
+
+ # Find exact marker locations in original for newline policy
+ start_match = None
+ end_match = None
+ for m in EVOLVE_START.finditer(original):
+ if m.end() == start:
+ start_match = m
+ break
+ for m in EVOLVE_END.finditer(original):
+ if m.start() == end:
+ end_match = m
+ break
+
+ # Compute outside-of-markers prefix/suffix from original
+ prefix_outside = (
+ original[: start_match.start()] if start_match else immutable_prefix
+ )
+ suffix_outside = (
+ original[end_match.end() :] if end_match else immutable_suffix
+ )
+
+ # Extract payload based on which single marker is present in patch
+ if patch_has_start and not patch_has_end:
+ m = EVOLVE_START.search(patch_code)
+ payload = patch_code[m.end() :] if m else patch_code
+ # Trim footer if the patch included it
+ for sfx in (suffix_outside, suffix_outside.rstrip("\r\n")):
+ if sfx and payload.endswith(sfx):
+ payload = payload[: -len(sfx)]
+ break
+ elif patch_has_end and not patch_has_start:
+ m = EVOLVE_END.search(patch_code)
+ payload = patch_code[: m.start()] if m else patch_code
+ # Trim header if the patch included it
+ for pfx in (prefix_outside, prefix_outside.rstrip("\r\n")):
+ if pfx and payload.startswith(pfx):
+ payload = payload[len(pfx) :]
+ break
+ else:
+ payload = patch_code
+
+ # Normalize newlines so markers remain on their own lines
+ if start_match is not None and payload and not payload.startswith("\n"):
+ payload = "\n" + payload
+ if end_match is not None and payload and not payload.endswith("\n"):
+ payload = payload + "\n"
+
+ updated_content = immutable_prefix + payload + immutable_suffix
# Add remaining immutable content after last mutable range
- if patch_mutable_ranges and mutable_ranges:
+ if patch_has_both and mutable_ranges:
updated_content += original[mutable_ranges[-1][1] :]
num_applied = 1
diff --git a/tests/test_edit_base.py b/tests/test_edit_base.py
index edc0e1178..67c6f2e20 100644
--- a/tests/test_edit_base.py
+++ b/tests/test_edit_base.py
@@ -161,6 +161,110 @@ def new_func2():
# Should have replaced both evolve blocks with new content
+def test_apply_full_patch_full_file_without_markers_extracts_block_only():
+ """Full-file patch without EVOLVE markers should not copy immutable code
+ into the evolve block; only the block payload is replaced."""
+ original_content = """# Header line\n# EVOLVE-BLOCK-START\nold_line()\n# EVOLVE-BLOCK-END\n# Footer line\n"""
+
+ # Patch carries only the new block body: no EVOLVE markers and no header/footer lines.
+ patch_content = """```python
+new_line()
+another_new_line()
+```"""
+
+ expected = """# Header line
+# EVOLVE-BLOCK-START
+new_line()
+another_new_line()
+# EVOLVE-BLOCK-END
+# Footer line
+"""
+
+ result = apply_full_patch(
+ patch_str=patch_content,
+ original_str=original_content,
+ language="python",
+ verbose=False,
+ )
+ updated_content, num_applied, output_path, error, patch_txt, diff_path = result
+
+ assert error is None
+ assert num_applied == 1
+ assert updated_content == expected
+
+
+def test_apply_full_patch_patch_with_start_marker_only():
+ """Patch has only START marker; original has both markers."""
+ original_content = """# Header line
+# EVOLVE-BLOCK-START
+old_line()
+# EVOLVE-BLOCK-END
+# Footer line
+"""
+
+ patch_content = """```python
+# Header line
+# EVOLVE-BLOCK-START
+new_line()
+# Footer line
+```"""
+
+ expected = """# Header line
+# EVOLVE-BLOCK-START
+new_line()
+# EVOLVE-BLOCK-END
+# Footer line
+"""
+
+ result = apply_full_patch(
+ patch_str=patch_content,
+ original_str=original_content,
+ language="python",
+ verbose=False,
+ )
+ updated_content, num_applied, output_path, error, patch_txt, diff_path = result
+
+ assert error is None
+ assert num_applied == 1
+ assert updated_content == expected
+
+
+def test_apply_full_patch_patch_with_end_marker_only():
+ """Patch has only END marker; original has both markers."""
+ original_content = """# Header line
+# EVOLVE-BLOCK-START
+old_line()
+# EVOLVE-BLOCK-END
+# Footer line
+"""
+
+ patch_content = """```python
+# Header line
+new_line()
+# EVOLVE-BLOCK-END
+# Footer line
+```"""
+
+ expected = """# Header line
+# EVOLVE-BLOCK-START
+new_line()
+# EVOLVE-BLOCK-END
+# Footer line
+"""
+
+ result = apply_full_patch(
+ patch_str=patch_content,
+ original_str=original_content,
+ language="python",
+ verbose=False,
+ )
+ updated_content, num_applied, output_path, error, patch_txt, diff_path = result
+
+ assert error is None
+ assert num_applied == 1
+ assert updated_content == expected
+
+
def test_apply_full_patch_no_evolve_blocks():
"""Test apply_full_patch with no EVOLVE-BLOCK regions - should error."""
original_content = """# Just regular code
@@ -221,6 +325,41 @@ def new_function():
assert updated_content == original_content # Should return original content
+def test_apply_full_patch_patch_with_single_marker_ambiguous_multiple_regions():
+ """Single marker in patch is ambiguous when original has multiple regions."""
+ original_content = """# Header
+# EVOLVE-BLOCK-START
+func1()
+# EVOLVE-BLOCK-END
+
+# EVOLVE-BLOCK-START
+func2()
+# EVOLVE-BLOCK-END
+# Footer
+"""
+
+ # Patch includes only START marker
+ patch_content = """```python
+# Header
+# EVOLVE-BLOCK-START
+new_code()
+# Footer
+```"""
+
+ updated_content, num_applied, output_path, error, patch_txt, diff_path = (
+ apply_full_patch(
+ patch_str=patch_content,
+ original_str=original_content,
+ language="python",
+ verbose=False,
+ )
+ )
+
+ assert num_applied == 0
+ assert error is not None
+ assert "only one EVOLVE-BLOCK marker" in error
+
+
def test_apply_full_patch_invalid_extraction():
"""Test apply_full_patch with invalid code extraction."""
original_content = """# EVOLVE-BLOCK-START
From c5b1abe80331532aed5ce1e1fbd7fd5e7d14b087 Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Mon, 27 Oct 2025 16:20:22 +0100
Subject: [PATCH 12/68] Update README.md
---
README.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/README.md b/README.md
index b0dba5f7d..7a59f760e 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,7 @@ The framework supports **parallel evaluation of candidates** locally or on a Slu
| 📓 **[Tutorial Notebook](examples/shinka_tutorial.ipynb)** | Interactive walkthrough of Shinka features | Hands-on examples, configuration, best practices |
| ⚙️ **[Configuration](docs/configuration.md)** | Comprehensive configuration reference | All config options, optimization settings, advanced features |
| 🎨 **[WebUI](docs/webui.md)** | Interactive visualization and monitoring | Real-time tracking, result analysis, debugging tools |
+|🕹️ **[Local LLM Support](https://github.com/SakanaAI/ShinkaEvolve/blob/main/docs/support_local_llm.md)**| Instructions for Local LLMs | How to set up local LLMs on your machine|
## Installation & Quick Start 🚀
From ded457647e3fe9d50d2ddf756d00d66ae890f0bd Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Mon, 27 Oct 2025 20:36:19 +0100
Subject: [PATCH 13/68] Update inspirations.py - archive
---
shinka/database/inspirations.py | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/shinka/database/inspirations.py b/shinka/database/inspirations.py
index ee564dfa1..42c3859d8 100644
--- a/shinka/database/inspirations.py
+++ b/shinka/database/inspirations.py
@@ -72,6 +72,7 @@ def sample_context(self, parent: Any, n: int) -> List[Any]:
self.cursor.execute(
"""
SELECT p.id FROM programs p
+ JOIN archive a ON p.id = a.program_id
WHERE p.island_idx = ? AND p.correct = 1
ORDER BY p.combined_score DESC
LIMIT ?
@@ -93,7 +94,8 @@ def sample_context(self, parent: Any, n: int) -> List[Any]:
placeholders_rand = ",".join("?" * len(insp_ids))
sql_rand = f"""
SELECT p.id FROM programs p
- WHERE p.island_idx = ? AND p.correct = 1
+ JOIN archive a ON p.id = a.program_id
+ WHERE p.island_idx = ? AND p.correct = 1
AND p.id NOT IN ({placeholders_rand})
ORDER BY RANDOM() LIMIT ?
"""
@@ -111,9 +113,10 @@ def sample_context(self, parent: Any, n: int) -> List[Any]:
needed = n - len(inspirations)
if needed > 0:
placeholders_rand = ",".join("?" * len(insp_ids))
- sql_rand = f"""SELECT id FROM programs
- WHERE correct = 1
- AND id NOT IN ({placeholders_rand})
+ sql_rand = f"""SELECT p.id FROM programs p
+ JOIN archive a ON p.id = a.program_id
+ WHERE p.correct = 1
+ AND p.id NOT IN ({placeholders_rand})
ORDER BY RANDOM() LIMIT ?
"""
params_rand = list(insp_ids) + [needed]
From ee6e8a5e98478e53948ddacb94588a727c10521b Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Mon, 27 Oct 2025 21:07:23 +0100
Subject: [PATCH 14/68] Update dependencies gemini embed
---
pyproject.toml | 1 +
1 file changed, 1 insertion(+)
diff --git a/pyproject.toml b/pyproject.toml
index f05429b60..f60d0b659 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,6 +45,7 @@ dependencies = [
"adjustText",
"markdown",
"aiofiles",
+ "google-generativeai",
]
[tool.setuptools]
From a759778b5f410528a99a878e724c5e6ac7511ed2 Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Thu, 30 Oct 2025 11:07:50 +0100
Subject: [PATCH 15/68] Update dbase.py path default
---
shinka/database/dbase.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/shinka/database/dbase.py b/shinka/database/dbase.py
index c6a2b89bf..aef4f7219 100644
--- a/shinka/database/dbase.py
+++ b/shinka/database/dbase.py
@@ -50,7 +50,7 @@ def clean_nan_values(obj: Any) -> Any:
@dataclass
class DatabaseConfig:
- db_path: Optional[str] = None
+ db_path: str = "evolution_db.sqlite"
num_islands: int = 4
archive_size: int = 100
From c097a8821ff081c433fa285874448467e5b9f04a Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Thu, 30 Oct 2025 21:03:34 +0100
Subject: [PATCH 16/68] Fix reasoning token sampling
---
shinka/llm/query.py | 18 ++++++------------
1 file changed, 6 insertions(+), 12 deletions(-)
diff --git a/shinka/llm/query.py b/shinka/llm/query.py
index a7288df8e..218ae33eb 100644
--- a/shinka/llm/query.py
+++ b/shinka/llm/query.py
@@ -137,16 +137,13 @@ def sample_model_kwargs(
r_effort = random.choice(reasoning_efforts)
think_bool = r_effort != "auto"
if think_bool:
- thinking_tokens = [
- t
- for t in THINKING_TOKENS.values()
- if t < kwargs_dict["max_tokens"] and t >= 1024
- ]
+ t = THINKING_TOKENS[r_effort]
+ thinking_tokens = t if t < kwargs_dict["max_tokens"] else 1024
kwargs_dict["extra_body"] = {
"extra_body": {
"google": {
"thinking_config": {
- "thinking_budget": random.choice(thinking_tokens),
+ "thinking_budget": thinking_tokens,
"include_thoughts": True,
}
}
@@ -161,15 +158,12 @@ def sample_model_kwargs(
if think_bool:
# filter thinking tokens to be smaller than max_tokens
# not auto THINKING_TOKENS
- thinking_tokens = [
- t
- for t in THINKING_TOKENS.values()
- if t < kwargs_dict["max_tokens"] and t >= 1024
- ]
+ t = THINKING_TOKENS[r_effort]
+ thinking_tokens = t if t < kwargs_dict["max_tokens"] else 1024
# sample only from thinking tokens that are valid
kwargs_dict["thinking"] = {
"type": "enabled",
- "budget_tokens": random.choice(thinking_tokens),
+ "budget_tokens": thinking_tokens,
}
else:
From 6d5e208ae04e18ba906d8f2c6e77ae6facf0afb7 Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Thu, 30 Oct 2025 22:49:31 +0100
Subject: [PATCH 17/68] Fix anthropic budget sampling
---
shinka/llm/query.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/shinka/llm/query.py b/shinka/llm/query.py
index 218ae33eb..c88c7d7c3 100644
--- a/shinka/llm/query.py
+++ b/shinka/llm/query.py
@@ -154,7 +154,8 @@ def sample_model_kwargs(
REASONING_CLAUDE_MODELS + REASONING_BEDROCK_MODELS
):
kwargs_dict["max_tokens"] = min(random.choice(max_tokens), 16384)
- think_bool = random.choice(reasoning_efforts) != "auto"
+ r_effort = random.choice(reasoning_efforts)
+ think_bool = r_effort != "auto"
if think_bool:
# filter thinking tokens to be smaller than max_tokens
# not auto THINKING_TOKENS
From 9b4d7c760ab9b0d13ee0fb672c24cc0f14336c4d Mon Sep 17 00:00:00 2001
From: RobertTLange
Date: Sun, 2 Nov 2025 10:00:19 +0100
Subject: [PATCH 18/68] fix shinka_launch --help
---
configs/config.yaml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/configs/config.yaml b/configs/config.yaml
index 9702c6617..577e1dfe2 100644
--- a/configs/config.yaml
+++ b/configs/config.yaml
@@ -2,9 +2,9 @@ defaults:
- _self_
- database@_global_: island_small
- evolution@_global_: small_budget
- - task@_global_: mad_tf
+ - task@_global_: circle_packing
- cluster@_global_: local
- - variant@_global_: mad_tf_example
+ - variant@_global_: circle_packing_example
verbose: false
results_dir: results
From d7a3f7e77d45c156b45bbc92f2a39de7e5b4e131 Mon Sep 17 00:00:00 2001
From: RobertTLange
Date: Sun, 2 Nov 2025 10:05:49 +0100
Subject: [PATCH 19/68] fix wrap_eval catch
---
shinka/core/wrap_eval.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/shinka/core/wrap_eval.py b/shinka/core/wrap_eval.py
index 7e1d1e5d3..bf2cf92eb 100644
--- a/shinka/core/wrap_eval.py
+++ b/shinka/core/wrap_eval.py
@@ -96,6 +96,9 @@ def run_shinka_eval(
num_valid_runs = 0
num_invalid_runs = 0
+ all_run_results: List[Any] = []
+ execution_times: List[float] = []
+
try:
module = load_program(program_path)
if not hasattr(module, experiment_fn_name):
@@ -105,9 +108,6 @@ def run_shinka_eval(
)
experiment_fn = getattr(module, experiment_fn_name)
- all_run_results: List[Any] = []
- execution_times: List[float] = []
-
for i in range(num_runs):
kwargs: Dict[str, Any] = {}
if get_experiment_kwargs:
From 397e0fd67e6c04c7b82124da715c2cdc99d53efa Mon Sep 17 00:00:00 2001
From: RobertTLange
Date: Sun, 2 Nov 2025 10:10:10 +0100
Subject: [PATCH 20/68] add documentation for resuming experiments
---
docs/getting_started.md | 69 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 69 insertions(+)
diff --git a/docs/getting_started.md b/docs/getting_started.md
index a866c011f..d40c16b59 100644
--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -331,6 +331,75 @@ The `run_shinka_eval` function returns three values:
## Advanced Usage
+### Resuming Experiments
+
+If you need to pause and resume an evolutionary run, or extend a completed run with more generations, Shinka supports seamless resumption from existing results.
+
+#### How Resuming Works
+
+When you specify an existing `results_dir` that contains a database, Shinka will:
+- Detect the previous run automatically
+- Restore the population database and all program history
+- Resume meta-recommendations from the last checkpoint
+- Continue from the last completed generation
+
+#### Using the CLI (Hydra)
+
+```bash
+# Resume an existing run and extend to 50 generations
+shinka_launch \
+ variant=circle_packing_example \
+ evo_config.results_dir=results_20250101_120000 \
+ evo_config.num_generations=50
+
+# Or with a custom task
+shinka_launch \
+ task=circle_packing \
+ database=island_small \
+ evolution=small_budget \
+ cluster=local \
+ evo_config.results_dir=path/to/previous/results \
+ evo_config.num_generations=100
+```
+
+#### Using the Python API
+
+```python
+from shinka.core import EvolutionRunner, EvolutionConfig
+from shinka.database import DatabaseConfig
+from shinka.launch import LocalJobConfig
+
+# Point to existing results directory
+evo_config = EvolutionConfig(
+ num_generations=50, # Extend to 50 total generations
+ results_dir="results_20250101_120000", # Existing results
+ # ... other config parameters ...
+)
+
+job_config = LocalJobConfig(
+ eval_program_path="examples/circle_packing/evaluate.py",
+)
+
+db_config = DatabaseConfig(
+ archive_size=20,
+ num_islands=2,
+)
+
+# Run will automatically detect and resume
+runner = EvolutionRunner(
+ evo_config=evo_config,
+ job_config=job_config,
+ db_config=db_config,
+)
+runner.run()
+```
+
+**Important Notes:**
+- The `num_generations` parameter should be set to the **total** number of generations you want (not additional generations)
+- For example, if you completed 20 generations and want 30 more, set `num_generations=50`
+- The database configuration (number of islands, archive size, etc.) should match the original run
+- All previous progress, including the best solutions and meta-recommendations, will be preserved
+
### Environment Management for Local Jobs
When running jobs locally, you have several options for managing Python environments:
From f6896dc03d63571c12506fcc85fced52a93da4b0 Mon Sep 17 00:00:00 2001
From: RobertTLange
Date: Sun, 2 Nov 2025 10:26:33 +0100
Subject: [PATCH 21/68] fix OAI dependency db for visualization
---
shinka/database/dbase.py | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/shinka/database/dbase.py b/shinka/database/dbase.py
index aef4f7219..2118763c4 100644
--- a/shinka/database/dbase.py
+++ b/shinka/database/dbase.py
@@ -251,12 +251,22 @@ class ProgramDatabase:
populations, and an archive of elites.
"""
- def __init__(self, config: DatabaseConfig,embedding_model: str = "text-embedding-3-small", read_only: bool = False):
+ def __init__(
+ self,
+ config: DatabaseConfig,
+ embedding_model: str = "text-embedding-3-small",
+ read_only: bool = False,
+ ):
self.config = config
self.conn: Optional[sqlite3.Connection] = None
self.cursor: Optional[sqlite3.Cursor] = None
self.read_only = read_only
- self.embedding_client = EmbeddingClient(model_name=embedding_model)
+ # Only create embedding client if not in read-only mode
+ # (e.g., WebUI doesn't need it for visualization)
+ if not read_only:
+ self.embedding_client = EmbeddingClient(model_name=embedding_model)
+ else:
+ self.embedding_client = None
self.last_iteration: int = 0
self.best_program_id: Optional[str] = None
From 1d9d498054af8da462c1cf6f14aa3cb566973108 Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Sun, 2 Nov 2025 13:27:56 +0100
Subject: [PATCH 22/68] Fix init program island copying -> archive
---
shinka/database/islands.py | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/shinka/database/islands.py b/shinka/database/islands.py
index 9975eac3b..341dea79c 100644
--- a/shinka/database/islands.py
+++ b/shinka/database/islands.py
@@ -682,6 +682,16 @@ def copy_program_to_islands(self, program: Any) -> List[str]:
f"Created copy {new_id[:8]}... of program {program.id[:8]}... "
f"for island {island_idx}"
)
+
+ # Add the copied program to the archive if it's correct
+ # This ensures it can be used as inspiration for that island
+ if program.correct:
+ self.cursor.execute(
+ "INSERT OR IGNORE INTO archive (program_id) VALUES (?)",
+ (new_id,),
+ )
+ logger.debug(f"Added copy {new_id[:8]}... to archive (correct program)")
+
self.conn.commit()
logger.info(
f"Created {len(created_ids)} copies of program "
From 2f01b3ed549793fda12aa1f5b157cc617ec80eb1 Mon Sep 17 00:00:00 2001
From: "takeru.fukushima" <100330935+takeruhukushima@users.noreply.github.com>
Date: Mon, 3 Nov 2025 16:28:09 +0900
Subject: [PATCH 23/68] fix:GEMINI_API_KEY name error
---
shinka/llm/embedding.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/shinka/llm/embedding.py b/shinka/llm/embedding.py
index 1f2ad495f..4082ad58b 100644
--- a/shinka/llm/embedding.py
+++ b/shinka/llm/embedding.py
@@ -51,9 +51,9 @@ def get_client_model(model_name: str) -> tuple[Union[openai.OpenAI, str], str]:
)
elif model_name in GEMINI_EMBEDDING_MODELS:
# Configure Gemini API
- api_key = os.getenv("GOOGLE_API_KEY")
+ api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
- raise ValueError("GOOGLE_API_KEY environment variable not set for Gemini models")
+ raise ValueError("GEMINI_API_KEY environment variable not set for Gemini models")
genai.configure(api_key=api_key)
client = "gemini" # Use string identifier for Gemini
model_to_use = model_name
From f5f7e68f2ec3423291ac9e98bb1836478b757df0 Mon Sep 17 00:00:00 2001
From: ifsheldon
Date: Sat, 8 Nov 2025 17:29:11 +0800
Subject: [PATCH 24/68] use dependency-groups.dev
---
pyproject.toml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index f60d0b659..5802a1522 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -57,8 +57,8 @@ include = ["shinka", "shinka.*"]
[tool.setuptools.package-data]
"*" = ["*"]
-[tool.uv]
-dev-dependencies = [
+[dependency-groups]
+dev = [
"pytest>=6.0",
"black",
"isort",
From 14739fc5e364eda8fc7ff184f5811e45d0d00657 Mon Sep 17 00:00:00 2001
From: Arun Parthiban
Date: Sat, 8 Nov 2025 07:05:54 -0500
Subject: [PATCH 25/68] Add support for Claude Sonnet 4.5
(claude-sonnet-4-5-20250929)
---
shinka/llm/models/pricing.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/shinka/llm/models/pricing.py b/shinka/llm/models/pricing.py
index c9c101a2c..a4595a99d 100644
--- a/shinka/llm/models/pricing.py
+++ b/shinka/llm/models/pricing.py
@@ -35,6 +35,10 @@
"input_price": 3.0 / M,
"output_price": 15.0 / M,
},
+ "claude-sonnet-4-5-20250929": {
+ "input_price": 3.0 / M,
+ "output_price": 15.0 / M,
+ },
}
OPENAI_MODELS = {
@@ -176,6 +180,7 @@
REASONING_CLAUDE_MODELS = [
"claude-3-7-sonnet-20250219",
"claude-4-sonnet-20250514",
+ "claude-sonnet-4-5-20250929",
]
REASONING_DEEPSEEK_MODELS = [
From ed9f51f49305d14091f339a1487ed9e534f96591 Mon Sep 17 00:00:00 2001
From: Jeethu Rao
Date: Mon, 3 Nov 2025 16:09:08 +0000
Subject: [PATCH 26/68] Add Swift language support
---
shinka/core/runner.py | 2 ++
shinka/database/complexity.py | 4 ++--
shinka/edit/apply_diff.py | 4 +++-
shinka/edit/apply_full.py | 2 ++
shinka/edit/async_apply.py | 26 +++++++++++++++++++++++++-
5 files changed, 34 insertions(+), 4 deletions(-)
diff --git a/shinka/core/runner.py b/shinka/core/runner.py
index f1b5e947d..975ab5373 100644
--- a/shinka/core/runner.py
+++ b/shinka/core/runner.py
@@ -238,6 +238,8 @@ def __init__(
self.lang_ext = "py"
elif self.evo_config.language == "rust":
self.lang_ext = "rs"
+ elif self.evo_config.language == "swift":
+ self.lang_ext = "swift"
else:
msg = f"Language {self.evo_config.language} not supported"
raise ValueError(msg)
diff --git a/shinka/database/complexity.py b/shinka/database/complexity.py
index 933d7f4e6..30a46aa31 100644
--- a/shinka/database/complexity.py
+++ b/shinka/database/complexity.py
@@ -259,8 +259,8 @@ def analyze_code_metrics(code_string, language="python"):
# If Python parsing fails, fall back to C++ analysis
return analyze_cpp_complexity(code_string)
- # For C/C++/CUDA/Rust and other languages, use regex-based analysis
- elif language in ["cpp", "c", "cuda", "c++", "rust"]:
+ # For C/C++/CUDA/Rust/Swift and other languages, use regex-based analysis
+ elif language in ["cpp", "c", "cuda", "c++", "rust", "swift"]:
return analyze_cpp_complexity(code_string)
# For unknown languages, use simple line-based complexity
diff --git a/shinka/edit/apply_diff.py b/shinka/edit/apply_diff.py
index 4b5f29148..af1dff747 100644
--- a/shinka/edit/apply_diff.py
+++ b/shinka/edit/apply_diff.py
@@ -698,7 +698,7 @@ def apply_diff_patch(
patch_str = _strip_trailing_whitespace(patch_str)
# Remove the EVOLVE-BLOCK START and EVOLVE-BLOCK END markers
- if language in ["cuda", "cpp", "rust"]:
+ if language in ["cuda", "cpp", "rust", "swift"]:
patch_str = re.sub(r"// EVOLVE-BLOCK START\\n", "", patch_str)
patch_str = re.sub(r"// EVOLVE-BLOCK END\\n", "", patch_str)
elif language == "python":
@@ -732,6 +732,8 @@ def apply_diff_patch(
suffix = ".cu"
elif language == "rust":
suffix = ".rs"
+ elif language == "swift":
+ suffix = ".swift"
else:
raise ValueError(f"Language {language} not supported")
diff --git a/shinka/edit/apply_full.py b/shinka/edit/apply_full.py
index 4cc4ddca4..f175aec74 100644
--- a/shinka/edit/apply_full.py
+++ b/shinka/edit/apply_full.py
@@ -264,6 +264,8 @@ def apply_full_patch(
suffix = ".cu"
elif language == "rust":
suffix = ".rs"
+ elif language == "swift":
+ suffix = ".swift"
else:
raise ValueError(f"Language {language} not supported")
diff --git a/shinka/edit/async_apply.py b/shinka/edit/async_apply.py
index 4ffd15bed..e4c21202f 100644
--- a/shinka/edit/async_apply.py
+++ b/shinka/edit/async_apply.py
@@ -143,7 +143,6 @@ async def validate_code_async(
else:
error_msg = stderr.decode() if stderr else "Unknown compilation error"
return False, error_msg
-
elif language == "cpp":
# Use g++ for C++ compilation check
proc = await asyncio.create_subprocess_exec(
@@ -154,6 +153,31 @@ async def validate_code_async(
stderr=asyncio.subprocess.PIPE,
)
+ try:
+ stdout, stderr = await asyncio.wait_for(
+ proc.communicate(), timeout=timeout
+ )
+ except asyncio.TimeoutError:
+ proc.kill()
+ await proc.wait()
+ return False, f"Validation timeout after {timeout}s"
+
+ if proc.returncode == 0:
+ return True, None
+ else:
+ error_msg = stderr.decode() if stderr else "Unknown compilation error"
+ return False, error_msg
+ elif language == "swift":
+ # Use swiftc for Swift syntax checking
+ proc = await asyncio.create_subprocess_exec(
+ "swiftc",
+ "-typecheck",
+ "-parse-as-library",
+ code_path,
+ stdout=asyncio.subprocess.PIPE,
+ stderr=asyncio.subprocess.PIPE,
+ )
+
try:
stdout, stderr = await asyncio.wait_for(
proc.communicate(), timeout=timeout
From 0437118c4518139f79a96b4b44e173bb05b39745 Mon Sep 17 00:00:00 2001
From: Aladoro
Date: Tue, 11 Nov 2025 03:44:03 +0000
Subject: [PATCH 27/68] ignore warning for correct behavior when no improvement
is detected, keeping the tracked llm scores in log space to -inf
---
shinka/llm/dynamic_sampling.py | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/shinka/llm/dynamic_sampling.py b/shinka/llm/dynamic_sampling.py
index 6c038d9fa..eb0cd8cb3 100644
--- a/shinka/llm/dynamic_sampling.py
+++ b/shinka/llm/dynamic_sampling.py
@@ -28,7 +28,8 @@ def _logdiffexp(a_log, b_log):
def _logexpm1(z):
z = np.asarray(z, dtype=float)
- return np.where(z > 50.0, z, np.log(np.expm1(z)))
+ with np.errstate(divide='ignore', invalid='ignore'):
+ return np.where(z > 50.0, z, np.log(np.expm1(z)))
class BanditBase(ABC):
@@ -433,12 +434,13 @@ def decay(self, factor: float) -> None:
if self.use_exponential_scaling and self.asymmetric_scaling:
# shrink in exp space to match original score scale
s = self.s
- log1p_term = np.where(
- s > 0.0,
- s + np.log(one_minus_factor + np.exp(-s)),
- np.log1p(one_minus_factor * np.exp(s)),
- )
- self.s = s + np.log(factor) - log1p_term
+ with np.errstate(divide='ignore', invalid='ignore'):
+ log1p_term = np.where(
+ s > 0.0,
+ s + np.log(one_minus_factor + np.exp(-s)),
+ np.log1p(one_minus_factor * np.exp(s)),
+ )
+ self.s = s + np.log(factor) - log1p_term
if self.adaptive_scale and np.isfinite(self._obs_max):
means_log = self._mean()
From 259e786777cf042535d129cbdbc41653c18b8e91 Mon Sep 17 00:00:00 2001
From: Jai Menon <87035087+jm424@users.noreply.github.com>
Date: Wed, 12 Nov 2025 11:29:56 -0500
Subject: [PATCH 28/68] Allow boolean flags for eval jobs
Currently flags are passed on as key-value pairs but that approach doesn't extend to boolean flags
---
shinka/launch/scheduler.py | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/shinka/launch/scheduler.py b/shinka/launch/scheduler.py
index 5782613ee..4e824c3ff 100644
--- a/shinka/launch/scheduler.py
+++ b/shinka/launch/scheduler.py
@@ -138,7 +138,13 @@ def _build_command(self, exec_fname_t: str, results_dir_t: str) -> List[str]:
]
if self.config.extra_cmd_args:
for k, v in self.config.extra_cmd_args.items():
- cmd.extend([f"--{k}", str(v)])
+ # Handle boolean flags
+ if isinstance(v, bool):
+ if v: # Only append flag if True
+ cmd.append(f"--{k}")
+ else:
+ # For non-boolean values, append both flag and value
+ cmd.extend([f"--{k}", str(v)])
return cmd
def run(
From 3251a701661d2eedf77e2473bba8c2a022295cf1 Mon Sep 17 00:00:00 2001
From: Jeremy Cochoy
Date: Mon, 17 Nov 2025 15:52:53 +0100
Subject: [PATCH 29/68] Add json support
---
shinka/core/runner.py | 2 ++
shinka/database/complexity.py | 2 +-
shinka/edit/apply_diff.py | 4 +++-
shinka/edit/apply_full.py | 2 ++
4 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/shinka/core/runner.py b/shinka/core/runner.py
index f1b5e947d..be76994ed 100644
--- a/shinka/core/runner.py
+++ b/shinka/core/runner.py
@@ -238,6 +238,8 @@ def __init__(
self.lang_ext = "py"
elif self.evo_config.language == "rust":
self.lang_ext = "rs"
+ elif self.evo_config.language in ["json", "json5"]:
+ self.lang_ext = "json"
else:
msg = f"Language {self.evo_config.language} not supported"
raise ValueError(msg)
diff --git a/shinka/database/complexity.py b/shinka/database/complexity.py
index 933d7f4e6..714ebaae8 100644
--- a/shinka/database/complexity.py
+++ b/shinka/database/complexity.py
@@ -260,7 +260,7 @@ def analyze_code_metrics(code_string, language="python"):
return analyze_cpp_complexity(code_string)
# For C/C++/CUDA/Rust and other languages, use regex-based analysis
- elif language in ["cpp", "c", "cuda", "c++", "rust"]:
+ elif language in ["cpp", "c", "cuda", "c++", "rust", "json", "json5"]:
return analyze_cpp_complexity(code_string)
# For unknown languages, use simple line-based complexity
diff --git a/shinka/edit/apply_diff.py b/shinka/edit/apply_diff.py
index 4b5f29148..6465ffe96 100644
--- a/shinka/edit/apply_diff.py
+++ b/shinka/edit/apply_diff.py
@@ -698,7 +698,7 @@ def apply_diff_patch(
patch_str = _strip_trailing_whitespace(patch_str)
# Remove the EVOLVE-BLOCK START and EVOLVE-BLOCK END markers
- if language in ["cuda", "cpp", "rust"]:
+ if language in ["cuda", "cpp", "rust", "json", "json5"]:
patch_str = re.sub(r"// EVOLVE-BLOCK START\\n", "", patch_str)
patch_str = re.sub(r"// EVOLVE-BLOCK END\\n", "", patch_str)
elif language == "python":
@@ -732,6 +732,8 @@ def apply_diff_patch(
suffix = ".cu"
elif language == "rust":
suffix = ".rs"
+ elif language in ["json", "json5"]:
+ suffix = ".json"
else:
raise ValueError(f"Language {language} not supported")
diff --git a/shinka/edit/apply_full.py b/shinka/edit/apply_full.py
index 4cc4ddca4..5dd336547 100644
--- a/shinka/edit/apply_full.py
+++ b/shinka/edit/apply_full.py
@@ -264,6 +264,8 @@ def apply_full_patch(
suffix = ".cu"
elif language == "rust":
suffix = ".rs"
+ elif language in ["json", "json5"]:
+ suffix = ".json"
else:
raise ValueError(f"Language {language} not supported")
From ed8f1b4ab2093ab5f489c72f0c585625b7de1fee Mon Sep 17 00:00:00 2001
From: Jai Menon <87035087+jm424@users.noreply.github.com>
Date: Wed, 19 Nov 2025 16:43:15 -0500
Subject: [PATCH 30/68] llm: Add GPT-5.1 and Gemini 3 Pro models
---
shinka/llm/models/pricing.py | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/shinka/llm/models/pricing.py b/shinka/llm/models/pricing.py
index a4595a99d..91e965c75 100644
--- a/shinka/llm/models/pricing.py
+++ b/shinka/llm/models/pricing.py
@@ -118,6 +118,10 @@
"input_price": 0.05 / M,
"output_price": 0.4 / M,
},
+ "gpt-5.1": {
+ "input_price": 1.25 / M,
+ "output_price": 10.0 / M,
+ },
}
@@ -145,6 +149,10 @@
"input_price": 0.1 / M,
"output_price": 0.4 / M,
},
+ "gemini-3-pro-preview" : {
+ "input_price": 2.0 / M,
+ "output_price": 12.0 / M,
+ },
}
BEDROCK_MODELS = {
@@ -191,6 +199,7 @@
"gemini-2.5-pro",
"gemini-2.5-flash",
"gemini-2.5-flash-lite-preview-06-17",
+ "gemini-3-pro-preview",
]
REASONING_AZURE_MODELS = [
From ecf762bc6c6af3ac92920714b6287eb04d9aa2bb Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Sat, 22 Nov 2025 17:13:08 +0100
Subject: [PATCH 31/68] Update README.md
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 7a59f760e..4404c24d9 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@
The framework supports **parallel evaluation of candidates** locally or on a Slurm cluster. It maintains an archive of successful solutions, enabling knowledge transfer between different evolutionary islands. `ShinkaEvolve` is particularly well-suited for scientific tasks where there is a verifier available and the goal is to optimize performance metrics while maintaining code correctness and readability.
-
+
## Documentation 📚
From c686d7fb97e620d83730270ccfbc8e4fc253c08a Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Sat, 22 Nov 2025 17:21:35 +0100
Subject: [PATCH 32/68] Update getting_started.md
---
docs/getting_started.md | 2 ++
1 file changed, 2 insertions(+)
diff --git a/docs/getting_started.md b/docs/getting_started.md
index d40c16b59..03bc54c80 100644
--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -2,6 +2,8 @@
Shinka is a framework that combines Large Language Models (LLMs) with evolutionary algorithms to drive scientific discovery. This guide will help you get started with installing, configuring, and running your first evolutionary experiments.
+
+
## Table of Contents
1. [What is Shinka?](#what-is-shinka)
From bad5b37002b482e4771eb0c4fa49d6e31d4cc30e Mon Sep 17 00:00:00 2001
From: Robert Tjarko Lange
Date: Wed, 3 Dec 2025 10:58:48 +0100
Subject: [PATCH 33/68] Update apply_diff.py
---
shinka/edit/apply_diff.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/shinka/edit/apply_diff.py b/shinka/edit/apply_diff.py
index d33f58042..7d2161056 100644
--- a/shinka/edit/apply_diff.py
+++ b/shinka/edit/apply_diff.py
@@ -699,11 +699,11 @@ def apply_diff_patch(
# Remove the EVOLVE-BLOCK START and EVOLVE-BLOCK END markers
if language in ["cuda", "cpp", "rust", "swift", "json", "json5"]:
- patch_str = re.sub(r"// EVOLVE-BLOCK START\\n", "", patch_str)
- patch_str = re.sub(r"// EVOLVE-BLOCK END\\n", "", patch_str)
+ patch_str = re.sub(r"// EVOLVE-BLOCK-START\\n", "", patch_str)
+ patch_str = re.sub(r"// EVOLVE-BLOCK-END\\n", "", patch_str)
elif language == "python":
- patch_str = re.sub(r"# EVOLVE-BLOCK START\\n", "", patch_str)
- patch_str = re.sub(r"# EVOLVE-BLOCK END\\n", "", patch_str)
+ patch_str = re.sub(r"# EVOLVE-BLOCK-START\\n", "", patch_str)
+ patch_str = re.sub(r"# EVOLVE-BLOCK-END\\n", "", patch_str)
else:
raise ValueError(f"Language {language} not supported")
From e12fe6b8eec9a466af59fe1657768a9c985d1b9f Mon Sep 17 00:00:00 2001
From: george
Date: Sun, 7 Dec 2025 02:02:32 +0000
Subject: [PATCH 34/68] feat: Agentic backend core and routing logic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This commit adds the foundational agentic multi-turn editing architecture:
**New Components:**
- AgenticConfig and EvaluatorConfig dataclasses for configuration
- _run_agentic_patch() method for multi-turn agent sessions
- Support for ShinkaAgent (native) and Codex CLI backends
- AgenticEditor harness for managing agent sessions
- Session registry for tracking active agent processes
- Embedding corpus builder for multi-file novelty support
**Integration Points:**
- agentic_mode flag in EvolutionConfig (disabled by default)
- Routing in run_patch() to agentic path when enabled
- Multi-file diff generation for visualization
**Preserved:**
- All existing language support (Swift, JSON, etc.)
- Legacy single-file patch workflow unchanged
- No deletions to async_apply.py, pricing.py, or scheduler.py
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5
---
.gitignore | 1 +
configs/evolution/agentic.yaml | 30 ++
shinka/core/embedding_corpus.py | 220 ++++++++++++
shinka/core/runner.py | 461 ++++++++++++++++++++++++-
shinka/edit/__init__.py | 5 +
shinka/edit/agentic.py | 310 +++++++++++++++++
shinka/edit/codex_cli.py | 295 ++++++++++++++++
shinka/edit/cost_utils.py | 52 +++
shinka/edit/shinka_agent.py | 407 ++++++++++++++++++++++
shinka/edit/types.py | 25 ++
shinka/eval/__init__.py | 3 +
shinka/eval/agentic.py | 198 +++++++++++
shinka/prompts/__init__.py | 6 +
shinka/prompts/prompts_agentic.py | 76 ++++
shinka/prompts/prompts_agentic_eval.py | 39 +++
shinka/tools/__init__.py | 1 +
shinka/tools/codex_session_registry.py | 149 ++++++++
17 files changed, 2273 insertions(+), 5 deletions(-)
create mode 100644 configs/evolution/agentic.yaml
create mode 100644 shinka/core/embedding_corpus.py
create mode 100644 shinka/edit/agentic.py
create mode 100644 shinka/edit/codex_cli.py
create mode 100644 shinka/edit/cost_utils.py
create mode 100644 shinka/edit/shinka_agent.py
create mode 100644 shinka/edit/types.py
create mode 100644 shinka/eval/__init__.py
create mode 100644 shinka/eval/agentic.py
create mode 100644 shinka/prompts/prompts_agentic.py
create mode 100644 shinka/prompts/prompts_agentic_eval.py
create mode 100644 shinka/tools/__init__.py
create mode 100644 shinka/tools/codex_session_registry.py
diff --git a/.gitignore b/.gitignore
index 42545fbf7..1b269d71a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -173,3 +173,4 @@ cython_debug/
# PyPI configuration file
.pypirc
+results/
diff --git a/configs/evolution/agentic.yaml b/configs/evolution/agentic.yaml
new file mode 100644
index 000000000..391f64d87
--- /dev/null
+++ b/configs/evolution/agentic.yaml
@@ -0,0 +1,30 @@
+evo_config:
+ _target_: shinka.core.EvolutionConfig
+ agentic_mode: true
+ agentic:
+ _target_: shinka.core.runner.AgenticConfig
+ backend: "gemini"
+ cli_profile: null
+ sandbox: "workspace-write"
+ approval_mode: "full-auto"
+ max_turns: 50
+ max_seconds: 0
+ cli_path: null
+ extra_cli_config: {}
+ resume_parent_session: false
+ # Use /tmp to isolate scratch dirs from git repos, preventing Codex CLI
+ # from discovering parent AGENTS.md files. Set to null to use results_dir.
+ scratch_dir_base: "/tmp/shinka_scratch"
+ evaluator:
+ _target_: shinka.core.runner.EvaluatorConfig
+ mode: auto
+ agentic:
+ _target_: shinka.core.runner.AgenticEvaluatorConfig
+ cli_profile: null
+ sandbox: "workspace-write"
+ approval_mode: "full-auto"
+ max_turns: 80
+ max_seconds: 0
+ cli_path: null
+ extra_cli_config: {}
+ results_dir: ${output_dir}
diff --git a/shinka/core/embedding_corpus.py b/shinka/core/embedding_corpus.py
new file mode 100644
index 000000000..9088edfeb
--- /dev/null
+++ b/shinka/core/embedding_corpus.py
@@ -0,0 +1,220 @@
+import fnmatch
+import hashlib
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Iterable, List, Optional, Sequence, Set
+
+
+import re
+
+@dataclass
+class EmbeddingCorpus:
+    """Result of building an embedding corpus for a generation directory."""
+
+    text: str  # concatenated corpus segments (file headers + contents, binary placeholders)
+    included_files: List[str] = field(default_factory=list)  # POSIX relative paths added to the corpus
+    skipped_files: List[str] = field(default_factory=list)  # paths left out (limits hit or unreadable)
+    binary_files: List[str] = field(default_factory=list)  # included files represented by a placeholder line
+    truncated: bool = False  # True when the max_files or max_total_bytes limit cut the corpus short
+    total_bytes: int = 0  # running size counted against max_total_bytes
+
+
+def extract_file_content(corpus_text: str, filename: str) -> Optional[str]:
+    """
+    Return the body stored under ``filename`` in a corpus text dump.
+
+    Returns None for an empty corpus or when no matching file header exists.
+    """
+    if not corpus_text:
+        return None
+
+    # A section starts with "=== FILE: {filename} ({size} bytes)[ [TRUNCATED]] ==="
+    # at the beginning of the text or right after a newline, and its body runs
+    # up to the next file header or the end of the corpus.
+    header_re = (
+        rf"(?:^|\n)=== FILE: {re.escape(filename)} \(\d+ bytes\)(?: \[TRUNCATED\])? ===\n"
+        r"(.*?)(?=\n=== FILE: |$)"
+    )
+    found = re.search(header_re, corpus_text, re.DOTALL)
+    # group(1) is the captured body, without the separating newline.
+    return found.group(1) if found else None
+
+
+
+def _is_text_bytes(buf: bytes) -> bool:
+    """Return True when ``buf`` is empty or has no NUL byte (binary heuristic)."""
+    # Empty input counts as text; otherwise any NUL byte marks the content binary.
+    has_nul = b"\x00" in buf
+    return not buf or not has_nul
+
+
+def _sha256_prefix(buf: bytes, length: int = 8) -> str:
+    return hashlib.new("sha256", buf).hexdigest()[:length]  # leading hex chars of the SHA-256 digest
+
+
+def _matches_any(patterns: Sequence[str], path: str) -> bool:
+    """Return True if ``path`` matches any glob in ``patterns`` (fnmatch or Path.match)."""
+    candidate = Path(path)
+    for glob in patterns or ():
+        # "**" and "**/*" are treated as match-everything wildcards.
+        if glob in ("**", "**/*") or fnmatch.fnmatch(path, glob):
+            return True
+        try:
+            matched = candidate.match(glob)
+        except Exception:
+            # Path.match may raise on a pattern it rejects; treat that as no match.
+            continue
+        if matched:
+            return True
+    return False
+
+
+def build_embedding_corpus(
+    root: Path,
+    *,
+    include_globs: Sequence[str],
+    exclude_globs: Sequence[str],
+    max_files: int,
+    max_total_bytes: int,
+    max_bytes_per_file: int,
+    changed_first: Optional[Iterable[Path]] = None,
+    exclude_dirs: Optional[Set[str]] = None,
+    exclude_suffixes: Optional[Set[str]] = None,
+    exclude_files: Optional[Set[str]] = None,
+) -> EmbeddingCorpus:
+    """
+    Build a deterministic, artifact-agnostic corpus from a generation directory.
+
+    Text files contribute their content, cut at ``max_bytes_per_file`` and at the
+    remaining ``max_total_bytes`` budget. Files whose leading bytes contain a NUL
+    contribute a one-line placeholder (path, size, hash prefix) instead of raw bytes.
+    """
+
+    root = root.resolve()
+    exclude_dirs = exclude_dirs or set()
+    exclude_suffixes = exclude_suffixes or set()
+    exclude_files = exclude_files or set()
+
+    def should_skip(rel: Path) -> bool:
+        if rel.name in exclude_files:
+            return True
+        if rel.suffix in exclude_suffixes:
+            return True
+        if rel.parts and rel.parts[0] in exclude_dirs:
+            return True
+        rel_posix = rel.as_posix()
+        if exclude_globs and _matches_any(exclude_globs, rel_posix):
+            return True
+        if include_globs and not _matches_any(include_globs, rel_posix):
+            return True
+        return False
+
+    seen: Set[Path] = set()
+    ordered_candidates: List[Path] = []
+
+    # Prioritize explicitly changed files (str or Path entries are both accepted)
+    if changed_first:
+        for p in map(Path, changed_first):
+            abs_path = (root / p).resolve() if not p.is_absolute() else p
+            if abs_path.is_file() and abs_path.is_relative_to(root):
+                rel = abs_path.relative_to(root)
+                if rel not in seen and not should_skip(rel):
+                    seen.add(rel)
+                    ordered_candidates.append(rel)
+
+    # Discover remaining files (sorted so the corpus order is stable)
+    for path in sorted(root.rglob("*")):
+        if not path.is_file():
+            continue
+        try:
+            rel = path.relative_to(root)
+        except ValueError:
+            continue
+        if rel in seen:
+            continue
+        if should_skip(rel):
+            continue
+        seen.add(rel)
+        ordered_candidates.append(rel)
+
+    segments: List[str] = []
+    included_files: List[str] = []
+    skipped_files: List[str] = []
+    binary_files: List[str] = []
+    truncated = False
+    total_bytes = 0
+
+    for idx, rel in enumerate(ordered_candidates):
+        if len(included_files) >= max_files:
+            # File cap reached: report this and every later candidate as skipped.
+            truncated = True
+            skipped_files.extend(r.as_posix() for r in ordered_candidates[idx:])
+            break
+
+        if total_bytes >= max_total_bytes:  # budget spent: skip without reading the file
+            truncated = True
+            skipped_files.append(rel.as_posix())
+            continue
+
+        abs_path = root / rel
+        try:
+            raw = abs_path.read_bytes()
+        except Exception:
+            skipped_files.append(rel.as_posix())
+            continue
+        size = len(raw)
+        to_embed = raw[: max_bytes_per_file]
+        file_truncated = size > max_bytes_per_file
+
+        is_text = _is_text_bytes(to_embed)
+        rel_posix = rel.as_posix()
+
+        if is_text:
+            try:
+                text = to_embed.decode("utf-8", errors="replace")
+            except Exception:
+                is_text = False
+
+        if not is_text:
+            placeholder = (
+                f"[BINARY FILE] {rel_posix} size={size} sha256={_sha256_prefix(raw)}"
+            )
+            addition = placeholder + "\n"
+            if total_bytes + len(addition) > max_total_bytes:
+                truncated = True
+                skipped_files.append(rel_posix)
+                continue
+            segments.append(addition)  # keep the newline so the next header starts a line
+            included_files.append(rel_posix)
+            binary_files.append(rel_posix)
+            total_bytes += len(addition)
+            continue
+
+        # Text path header for clarity/determinism
+        header = f"=== FILE: {rel_posix} ({size} bytes){' [TRUNCATED]' if file_truncated else ''} ===\n"
+        addition_len = len(header) + len(text) + 1  # trailing newline
+        if total_bytes + addition_len > max_total_bytes:
+            # Try to fit partial content
+            remaining = max_total_bytes - total_bytes - len(header) - 1
+            if remaining <= 0:
+                truncated = True
+                skipped_files.append(rel_posix)
+                continue
+            text = text[:remaining]
+            addition_len = len(header) + len(text) + 1
+            truncated = True
+
+        segments.append(header + text + "\n")
+        included_files.append(rel_posix)
+        total_bytes += addition_len
+
+    corpus_text = "".join(segments)
+
+    return EmbeddingCorpus(
+        text=corpus_text,
+        included_files=included_files,
+        skipped_files=skipped_files,
+        binary_files=binary_files,
+        truncated=truncated,
+        total_bytes=total_bytes,
+    )
diff --git a/shinka/core/runner.py b/shinka/core/runner.py
index a0dd5f81d..54e89b62b 100644
--- a/shinka/core/runner.py
+++ b/shinka/core/runner.py
@@ -1,3 +1,5 @@
+import difflib
+import json
import shutil
import uuid
import time
@@ -7,7 +9,7 @@
from rich.table import Table
from rich.console import Console
import rich.box
-from typing import List, Optional, Union, cast
+from typing import Any, Dict, List, Literal, Optional, Union, cast
from datetime import datetime
from pathlib import Path
from dataclasses import dataclass, field, asdict
@@ -22,18 +24,82 @@
AsymmetricUCB,
)
from shinka.edit import (
+ AgentContext,
+ AgenticEditor,
+ CommandResult,
apply_diff_patch,
apply_full_patch,
summarize_diff,
redact_immutable,
)
+from shinka.edit.codex_cli import (
+ CodexExecutionError,
+ CodexUnavailableError,
+ ensure_codex_available,
+ run_codex_task,
+)
+from shinka.edit.shinka_agent import (
+ ensure_shinka_available,
+ run_shinka_task,
+ ShinkaUnavailableError,
+ ShinkaExecutionError,
+)
from shinka.core.sampler import PromptSampler
from shinka.core.summarizer import MetaSummarizer
from shinka.core.novelty_judge import NoveltyJudge
+from shinka.core.embedding_corpus import (
+ build_embedding_corpus,
+ extract_file_content,
+ EmbeddingCorpus,
+)
from shinka.logo import print_gradient_logo
FOLDER_PREFIX = "gen"
+# Directories to exclude when copying workspace files for agentic edits
+WORKSPACE_EXCLUDE_DIRS = {
+ "results",
+ "workspace_snapshot",
+ "agent_sessions",
+ ".hydra",
+ "__pycache__",
+}
+WORKSPACE_EXCLUDE_SUFFIXES = {".pyc", ".pyo"}
+WORKSPACE_EXCLUDE_FILES = {
+ "rewrite.txt",
+ "edit.diff",
+ "session_log.jsonl",
+}
+
+
+@dataclass
+class AgenticConfig:
+    """Configuration options for agentic editing sessions.
+
+    This config supports Codex CLI and ShinkaAgent backends.
+    The `backend` field selects which one to use.
+    """
+
+    backend: str = "shinka"  # "shinka" -> ShinkaAgent; any other value takes the Codex CLI path
+    cli_profile: Optional[str] = None
+    sandbox: str = "workspace-write"
+    approval_mode: str = "full-auto"
+    max_turns: int = 50
+    max_seconds: int = 0
+    cli_path: Optional[str] = None
+    extra_cli_config: Dict[str, Any] = field(default_factory=dict)
+    resume_parent_session: bool = False
+    # Base directory for scratch workspaces. Using /tmp ensures scratch dirs are
+    # outside any git repo, preventing CLI from discovering parent AGENTS.md files.
+    scratch_dir_base: Optional[str] = "/tmp/shinka_scratch"
+
+
+@dataclass
+class EvaluatorConfig:
+    """Evaluator selection configuration; ``mode`` defaults to "legacy"."""
+
+    mode: Literal["auto", "legacy", "agentic"] = "legacy"
+
@dataclass
class EvolutionConfig:
@@ -62,6 +128,29 @@ class EvolutionConfig:
novelty_llm_models: Optional[List[str]] = None
novelty_llm_kwargs: dict = field(default_factory=lambda: {})
use_text_feedback: bool = False
+ # Agentic editing configuration
+ agentic_mode: bool = False
+ agentic: AgenticConfig = field(default_factory=AgenticConfig)
+ evaluator: EvaluatorConfig = field(default_factory=EvaluatorConfig)
+ # Multi-file support: directory containing additional files to copy
+ init_support_dir: Optional[str] = None
+ # Embedding corpus configuration for multi-file novelty
+ embedding_include_globs: List[str] = field(default_factory=lambda: ["**/*"])
+ embedding_exclude_globs: List[str] = field(
+ default_factory=lambda: [
+ "results/**",
+ "workspace_snapshot/**",
+ "agent_sessions/**",
+ ".hydra/**",
+ "__pycache__/**",
+ "*.pyc",
+ "*.pyo",
+ ]
+ )
+ embedding_max_files: int = 200
+ embedding_max_total_bytes: int = 500_000
+ embedding_max_bytes_per_file: int = 200_000
+ embedding_use_changed_files_first: bool = True
@dataclass
@@ -71,6 +160,7 @@ class RunningJob:
job_id: Union[str, Popen, ProcessWithLogging]
exec_fname: str
results_dir: str
+ generation_dir: Path
start_time: float
generation: int
parent_id: Optional[str]
@@ -81,6 +171,9 @@ class RunningJob:
code_embedding: List[float] = field(default_factory=list)
embed_cost: float = 0.0
novelty_cost: float = 0.0
+ # For multi-file embedding corpus
+ corpus_text: str = ""
+ corpus_meta: dict = field(default_factory=dict)
# Set up logging
@@ -626,10 +719,9 @@ def _submit_new_job(self):
self.next_generation_to_submit += 1
- exec_fname = (
- f"{self.results_dir}/{FOLDER_PREFIX}_{current_gen}/main.{self.lang_ext}"
- )
- results_dir = f"{self.results_dir}/{FOLDER_PREFIX}_{current_gen}/results"
+ generation_dir = Path(self.results_dir) / f"{FOLDER_PREFIX}_{current_gen}"
+ exec_fname = str(generation_dir / f"main.{self.lang_ext}")
+ results_dir = str(generation_dir / "results")
Path(results_dir).mkdir(parents=True, exist_ok=True)
# Get current meta-recommendations for this job
@@ -744,6 +836,7 @@ def _submit_new_job(self):
job_id=job_id,
exec_fname=exec_fname,
results_dir=results_dir,
+ generation_dir=generation_dir,
start_time=time.time(),
generation=current_gen,
parent_id=parent_id,
@@ -983,6 +1076,18 @@ def run_patch(
meta_recommendations=meta_recs,
)
+ # Route to agentic patch if enabled
+ if self.evo_config.agentic_mode:
+ return self._run_agentic_patch(
+ parent_program=parent_program,
+ generation=generation,
+ patch_sys=patch_sys,
+ patch_msg=patch_msg,
+ patch_type=patch_type,
+ novelty_attempt=novelty_attempt,
+ resample_attempt=resample_attempt,
+ )
+
if patch_type in ["full", "cross"]:
apply_patch = apply_full_patch
elif patch_type == "diff":
@@ -1298,3 +1403,349 @@ def _restore_meta_memory(self) -> None:
)
else:
logger.info("No previous meta memory state found - starting fresh")
+
+    def _collect_parent_workspace_files(
+        self, parent_program: Program
+    ) -> Dict[Path, str]:
+        """Return the parent's stored helper files as a relative-path -> content map.
+
+        The files come from ``metadata["agent_changed_files"]``, which a previous
+        agentic edit recorded; anything else yields an empty mapping.
+        """
+        stored = (parent_program.metadata or {}).get("agent_changed_files")
+        if not stored or not isinstance(stored, dict):
+            return {}
+        return {
+            Path(rel_path_str): content for rel_path_str, content in stored.items()
+        }
+
+    def _hydrate_generation_directory(
+        self, parent_program: Program, generation_dir: Path
+    ) -> None:
+        """Write the parent's stored workspace files into ``generation_dir``."""
+        inherited = self._collect_parent_workspace_files(parent_program)
+        for rel_path in inherited:
+            destination = generation_dir / rel_path
+            destination.parent.mkdir(parents=True, exist_ok=True)
+            destination.write_text(inherited[rel_path], encoding="utf-8")
+
+    def _build_embedding_corpus(
+        self, generation_dir: Path, meta_patch_data: Optional[dict] = None
+    ) -> EmbeddingCorpus:
+        """Build the multi-file novelty corpus for ``generation_dir``, agent-changed files first."""
+        # "agent_changed_files" is keyed by str paths; the builder calls Path methods on them
+        changed_first: Optional[List[Path]] = None
+        if meta_patch_data and self.evo_config.embedding_use_changed_files_first:
+            agent_changed = meta_patch_data.get("agent_changed_files")
+            if agent_changed:
+                changed_first = [Path(rel_path) for rel_path in agent_changed]
+
+        return build_embedding_corpus(
+            root=generation_dir,  # keyword must match the builder's ``root`` parameter
+            include_globs=self.evo_config.embedding_include_globs,
+            exclude_globs=self.evo_config.embedding_exclude_globs,
+            max_files=self.evo_config.embedding_max_files,
+            max_total_bytes=self.evo_config.embedding_max_total_bytes,
+            max_bytes_per_file=self.evo_config.embedding_max_bytes_per_file,
+            changed_first=changed_first,
+        )
+
+    def _run_agentic_patch(
+        self,
+        *,
+        parent_program: Program,
+        generation: int,
+        patch_sys: str,
+        patch_msg: str,
+        patch_type: str,
+        novelty_attempt: int,
+        resample_attempt: int,
+    ) -> tuple[Optional[str], dict, int]:
+        """Run an agentic edit session (ShinkaAgent or Codex CLI); returns (code_diff, meta, num_applied), or (None, meta, 0) on failure."""
+
+        primary_filename = Path(f"main.{self.lang_ext}")
+
+        # Extract content from corpus; fallback to raw code if not a corpus
+        primary_content = extract_file_content(parent_program.code, str(primary_filename))
+        if primary_content is None:
+            if "=== FILE:" not in parent_program.code:
+                primary_content = parent_program.code
+            else:
+                primary_content = extract_file_content(parent_program.code, "main.py")
+                if primary_content is None:
+                    primary_content = parent_program.code
+
+        base_files: Dict[Path, str] = {primary_filename: primary_content}
+        base_files.update(self._collect_parent_workspace_files(parent_program))
+
+        session_root: Optional[Path] = None
+        parent_metadata = parent_program.metadata or {}
+        resume_session_id: Optional[str] = None
+        resumed_from_parent = False
+
+        if self.evo_config.agentic.resume_parent_session:
+            candidate = parent_metadata.get("agent_session_id")
+            if isinstance(candidate, str) and candidate.strip():
+                resume_session_id = candidate.strip()
+                resumed_from_parent = True
+
+        def _serialize_changed_files(
+            changed_files: Optional[Dict[Path, str]]
+        ) -> Dict[str, str]:
+            if not changed_files:
+                return {}
+            serialized: Dict[str, str] = {}
+            for rel_path, content in changed_files.items():
+                if rel_path == primary_filename:
+                    continue  # the primary file is left out of the serialized helper files
+                serialized[str(rel_path)] = content
+            return serialized
+
+        def _build_code_diffs(
+            changed_files: Optional[Dict[Path, str]]
+        ) -> List[Dict[str, str]]:
+            """Build multi-file diffs for frontend display."""
+            if not changed_files:
+                return []
+            diffs: List[Dict[str, str]] = []
+            for rel_path, new_content in changed_files.items():
+                before = base_files.get(rel_path, "")
+                before_lines = before.splitlines(keepends=True)
+                after_lines = new_content.splitlines(keepends=True)
+                diff_text = "".join(
+                    difflib.unified_diff(
+                        before_lines,
+                        after_lines,
+                        fromfile=f"a/{rel_path}",
+                        tofile=f"b/{rel_path}",
+                    )
+                )
+                diffs.append({"path": str(rel_path), "diff": diff_text})
+            return diffs
+
+        def _agent_model_name(backend: str, actual_model: Optional[str] = None) -> str:
+            """Determine model name with priority: actual > config > profile > fallback."""
+            if actual_model:
+                return actual_model
+            extra_cli = self.evo_config.agentic.extra_cli_config
+            if extra_cli:
+                model_override = extra_cli.get("model") if isinstance(extra_cli, dict) else None
+                if model_override:
+                    return str(model_override)
+            if self.evo_config.agentic.cli_profile:
+                return self.evo_config.agentic.cli_profile
+            return f"{backend}-default"
+
+        selected_backend = self.evo_config.agentic.backend
+
+        def failure_meta(
+            message: str,
+            *,
+            session_log: Optional[List[str]] = None,
+            commands: Optional[List[CommandResult]] = None,
+            metrics: Optional[Dict[str, float]] = None,
+            session_id: Optional[str] = None,
+            changed_files: Optional[Dict[Path, str]] = None,
+        ) -> tuple[Optional[str], dict, int]:
+            api_cost = 0.0
+            if metrics:
+                api_cost = (
+                    metrics.get("total_cost")
+                    or metrics.get("estimated_total_cost")
+                    or 0.0
+                )
+            serialized_changed = _serialize_changed_files(changed_files)
+            meta_edit_data = {
+                "patch_type": "agentic",
+                "api_costs": api_cost,
+                "num_applied": 0,
+                "patch_name": None,
+                "patch_description": None,
+                "error_attempt": message,
+                "novelty_attempt": novelty_attempt,
+                "resample_attempt": resample_attempt,
+                "patch_attempt": 1,
+                "agent_session_path": str(session_root) if session_root else None,
+                "agent_session_log": session_log or [],
+                "agent_commands": [asdict(cmd) for cmd in commands or []],
+                "agent_metrics": metrics or {},
+                "agent_changed_files": serialized_changed,
+                "agent_code_diffs": _build_code_diffs(changed_files),
+                "agent_primary_file": str(primary_filename),
+                "model_name": _agent_model_name(selected_backend),
+                "agent_backend": selected_backend,
+                "agent_session_id": session_id,
+                "agent_resumed_from_parent": resumed_from_parent,
+            }
+            return None, meta_edit_data, 0
+
+        # Ensure backend is available ("shinka" -> ShinkaAgent; any other value -> Codex CLI)
+        try:
+            if selected_backend == "shinka":
+                ensure_shinka_available()
+            else:
+                ensure_codex_available(self.evo_config.agentic.cli_path)
+        except (CodexUnavailableError, ShinkaUnavailableError) as exc:
+            return failure_meta(str(exc))
+
+        # Create scratch directory (under scratch_dir_base if set, else results_dir/agent_sessions)
+        session_uuid = str(uuid.uuid4())
+        if self.evo_config.agentic.scratch_dir_base:
+            scratch_base = Path(self.evo_config.agentic.scratch_dir_base)
+            scratch_base.mkdir(parents=True, exist_ok=True)
+            session_root = scratch_base / session_uuid
+        else:
+            session_root = Path(self.results_dir) / "agent_sessions" / session_uuid
+
+        session_root.mkdir(parents=True, exist_ok=True)
+
+        # Write session metadata (best effort: a write failure is only logged)
+        session_meta = {
+            "parent_id": parent_program.id,
+            "generation": generation,
+            "patch_type": patch_type,
+            "novelty_attempt": novelty_attempt,
+            "resample_attempt": resample_attempt,
+            "start_time": time.time(),
+            "results_dir": str(self.results_dir),
+        }
+        try:
+            with open(session_root / "session_meta.json", "w") as f:
+                json.dump(session_meta, f, indent=2)
+        except Exception as e:
+            logger.warning(f"Failed to write session_meta.json: {e}")
+
+        # Build context for agent
+        helper_files = [p for p in base_files.keys() if p != primary_filename]
+        system_prompt = patch_sys.strip()
+        if helper_files:
+            helper_listing = "\n".join(f"- {path.as_posix()}" for path in sorted(helper_files))
+            system_prompt += (
+                "\n\n# Workspace Files\n"
+                "The following helper files were copied from the parent program:\n"
+                f"{helper_listing}"
+            )
+
+        context = AgentContext(
+            user_prompt=patch_msg.strip(),
+            system_prompt=system_prompt,
+            language=self.evo_config.language,
+            base_files=base_files,
+            primary_file=primary_filename,
+            metadata={
+                "generation": generation,
+                "novelty_attempt": novelty_attempt,
+                "resample_attempt": resample_attempt,
+                "patch_type": patch_type,
+                "results_dir": str(self.results_dir),
+            },
+            resume_session_id=resume_session_id,
+        )
+
+        editor = AgenticEditor(
+            scratch_dir=session_root,
+            config=self.evo_config.agentic,
+            runner=run_shinka_task if selected_backend == "shinka" else run_codex_task,
+        )
+
+        try:
+            agent_result = editor.run_session(context)
+        except (CodexExecutionError, ShinkaExecutionError) as exc:
+            return failure_meta(str(exc))
+
+        # Create generation directory (an existing one is removed first)
+        generation_dir = Path(self.results_dir) / f"{FOLDER_PREFIX}_{generation}"
+        if generation_dir.exists():
+            shutil.rmtree(generation_dir)
+        generation_dir.mkdir(parents=True, exist_ok=True)
+        self._hydrate_generation_directory(parent_program, generation_dir)
+
+        patch_dir = str(generation_dir)
+
+        # Get primary file content from agent result (falls back to the unchanged base content)
+        primary_content = agent_result.changed_files.get(
+            context.primary_file, base_files[context.primary_file]
+        )
+        patch_str = f"```{self.evo_config.language}\n{primary_content}\n```"
+        original_for_patch = base_files[context.primary_file]
+
+        # Apply patch to create output file
+        (
+            _,
+            num_applied,
+            output_path,
+            error_msg,
+            patch_txt,
+            patch_path,
+        ) = apply_full_patch(
+            original_code=original_for_patch,
+            code_response=patch_str,
+            patch_dir=patch_dir,
+            language=self.evo_config.language,
+        )
+
+        if num_applied < 1:
+            return failure_meta(
+                error_msg or "Agent produced no valid code",
+                session_log=agent_result.session_log,
+                commands=agent_result.commands_run,
+                metrics=agent_result.metrics,
+                session_id=agent_result.session_id,
+                changed_files=agent_result.changed_files,
+            )
+
+        # Write helper files to generation directory
+        for rel_path, content in agent_result.changed_files.items():
+            if rel_path == context.primary_file:
+                continue
+            target = generation_dir / rel_path
+            target.parent.mkdir(parents=True, exist_ok=True)
+            target.write_text(content, encoding="utf-8")
+
+        # Build code diff for display
+        original_lines = original_for_patch.splitlines(keepends=True)
+        new_lines = primary_content.splitlines(keepends=True)
+        code_diff = "".join(
+            difflib.unified_diff(
+                original_lines,
+                new_lines,
+                fromfile="a/main." + self.lang_ext,
+                tofile="b/main." + self.lang_ext,
+            )
+        )
+
+        api_cost = 0.0
+        if agent_result.metrics:
+            api_cost = (
+                agent_result.metrics.get("total_cost")
+                or agent_result.metrics.get("estimated_total_cost")
+                or 0.0
+            )
+
+        serialized_changed = _serialize_changed_files(agent_result.changed_files)
+        actual_model = agent_result.model
+
+        meta_edit_data = {
+            "patch_type": "agentic",
+            "api_costs": api_cost,
+            "num_applied": num_applied,
+            "patch_name": None,
+            "patch_description": None,
+            "error_attempt": None,
+            "novelty_attempt": novelty_attempt,
+            "resample_attempt": resample_attempt,
+            "patch_attempt": 1,
+            "agent_session_path": str(session_root),
+            "agent_session_log": agent_result.session_log,
+            "agent_commands": [asdict(cmd) for cmd in agent_result.commands_run],
+            "agent_metrics": agent_result.metrics,
+            "agent_changed_files": serialized_changed,
+            "agent_code_diffs": _build_code_diffs(agent_result.changed_files),
+            "agent_primary_file": str(primary_filename),
+            "model_name": _agent_model_name(selected_backend, actual_model),
+            "agent_backend": selected_backend,
+            "agent_session_id": agent_result.session_id,
+            "agent_resumed_from_parent": resumed_from_parent,
+        }
+
+        return code_diff, meta_edit_data, num_applied
diff --git a/shinka/edit/__init__.py b/shinka/edit/__init__.py
index 33d4b52ed..276c2835d 100644
--- a/shinka/edit/__init__.py
+++ b/shinka/edit/__init__.py
@@ -1,10 +1,15 @@
from .apply_diff import apply_diff_patch, redact_immutable
from .apply_full import apply_full_patch
from .summary import summarize_diff
+from .agentic import AgenticEditor, AgentContext, AgentResult, CommandResult
__all__ = [
"redact_immutable",
"apply_diff_patch",
"apply_full_patch",
"summarize_diff",
+ "AgenticEditor",
+ "AgentContext",
+ "AgentResult",
+ "CommandResult",
]
diff --git a/shinka/edit/agentic.py b/shinka/edit/agentic.py
new file mode 100644
index 000000000..311a47ec5
--- /dev/null
+++ b/shinka/edit/agentic.py
@@ -0,0 +1,310 @@
+"""Agentic editing harness with a pluggable backend (Codex default)."""
+
+from __future__ import annotations
+
+import base64
+import json
+import shutil
+import time
+import logging
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional
+
+from .codex_cli import run_codex_task
+from .types import AgentRunner
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class CommandResult:
+    """One shell command the agent ran, as reported by a command_execution item."""
+
+    command: str | None  # command text as reported by the backend
+    status: str | None  # backend-reported status label
+    exit_code: int | None  # exit status as reported by the backend
+    stdout: str | None = None  # captured standard output, if reported
+    stderr: str | None = None  # captured standard error, if reported
+
+
+@dataclass
+class AgentResult:
+    """What `AgenticEditor.run_session` returns once a session has ended."""
+
+    changed_files: Dict[Path, str]  # scratch-relative path -> new/modified text
+    session_log: List[str]  # agent message texts, in arrival order
+    commands_run: List[CommandResult]  # one entry per command_execution item
+    final_message: str | None = None  # last entry of session_log, if any
+    metrics: Dict[str, float] = field(default_factory=dict)  # timing/token/cost
+    session_log_path: Path | None = None  # JSONL file holding the raw events
+    session_events: List[Dict[str, Any]] = field(default_factory=list)  # raw events
+    binary_changed_files: Dict[Path, str] = field(default_factory=dict)  # base64
+    session_id: str | None = None  # id taken from the event stream, if any
+    model: str | None = None  # Actual model from CLI init event
+
+
+@dataclass
+class AgentContext:
+    """Everything a runner needs to start (or resume) one editing session.
+
+    About `system_prompt`: in agentic mode the harness (Codex/Gemini/Claude
+    CLI) owns the real system prompt, so this field carries only
+    AGENTIC_SYS_FORMAT, the operational sandbox-editing instructions. The
+    task description (task_sys_msg from config) travels inside `user_prompt`
+    under "# Task Context", leaving the CLI's native system behavior intact.
+    """
+
+    user_prompt: str
+    language: str
+    base_files: Dict[Path, str]  # relative path -> initial file content
+    primary_file: Path
+    system_prompt: str | None = None
+    metadata: Dict[str, Any] = field(default_factory=dict)
+    resume_session_id: str | None = None
+
+
+class AgenticEditor:
+    """Drive an agentic editing session within a dedicated scratch directory.
+
+    Backend is selected by the caller (Codex/Gemini/Claude/ShinkaAgent); Codex
+    is only the default runner, not a requirement.
+    """
+
+    def __init__(
+        self,
+        scratch_dir: Path,
+        config,
+        *,
+        runner: AgentRunner = run_codex_task,
+        codex_runner: AgentRunner | None = None,  # Deprecated: use runner
+    ) -> None:
+        self.scratch_dir = Path(scratch_dir)
+        self.config = config
+        # Accept the legacy codex_runner keyword for backward compatibility
+        self.runner = runner if codex_runner is None else codex_runner
+
+    def _prepare_scratch(self, base_files: Dict[Path, str]) -> Dict[Path, str]:
+        """Recreate the scratch dir, keep session_meta.json, write *base_files*."""
+        meta_path = self.scratch_dir / "session_meta.json"
+        preserved_meta = None
+        if meta_path.exists():
+            try:
+                preserved_meta = meta_path.read_text(encoding="utf-8")
+            except Exception:  # best effort: the metadata is optional
+                logger.warning("Could not read %s", meta_path, exc_info=True)
+
+        if self.scratch_dir.exists():
+            shutil.rmtree(self.scratch_dir)
+        self.scratch_dir.mkdir(parents=True, exist_ok=True)
+
+        # Restore session_meta.json (written by runner.py for visualization)
+        if preserved_meta is not None:
+            try:
+                meta_path.write_text(preserved_meta, encoding="utf-8")
+            except Exception:  # best effort, as above
+                logger.warning("Could not restore %s", meta_path, exc_info=True)
+
+        baseline: Dict[Path, str] = {}
+        for relative_path, content in base_files.items():
+            if relative_path.is_absolute():
+                raise ValueError("Base file paths must be relative to the scratch root")
+            target = self.scratch_dir / relative_path
+            target.parent.mkdir(parents=True, exist_ok=True)
+            target.write_text(content, encoding="utf-8")
+            baseline[relative_path] = content
+        return baseline
+
+    def run_session(self, context: AgentContext) -> AgentResult:
+        """Run the runner in a fresh scratch dir and summarise what happened.
+
+        Dict events are written to `session_log.jsonl` and mined for agent
+        messages, executed commands, session id, model and usage totals.
+        Afterwards every text file that is new or differs from
+        `context.base_files` is reported in `changed_files`.
+        """
+        baseline = self._prepare_scratch(context.base_files)
+
+        session_log: List[str] = []
+        commands: List[CommandResult] = []
+        start_time = time.monotonic()
+
+        session_log_path = self.scratch_dir / "session_log.jsonl"
+        event_count = 0
+        session_events: List[Dict[str, Any]] = []
+        binary_changed_files: Dict[Path, str] = {}
+        session_id: Optional[str] = None
+        model_from_event: Optional[str] = None  # Actual model from CLI init event
+
+        # Telemetry aggregation
+        usage_metrics: Dict[str, float] = {
+            "input_tokens": 0,
+            "output_tokens": 0,
+            "total_tokens": 0,
+            "total_cost_usd": 0.0,
+        }
+
+        with session_log_path.open("w", encoding="utf-8") as event_handle:
+            for event in self.runner(
+                user_prompt=context.user_prompt,
+                system_prompt=context.system_prompt,
+                workdir=self.scratch_dir,
+                profile=self.config.cli_profile,
+                sandbox=self.config.sandbox,
+                approval_mode=self.config.approval_mode,
+                max_seconds=self.config.max_seconds,
+                max_events=self.config.max_turns,
+                extra_cli_config=self.config.extra_cli_config,
+                cli_path=self.config.cli_path,
+                resume_session_id=context.resume_session_id,
+                session_kind="edit",
+                parent_id=context.metadata.get("parent_id"),
+                generation=context.metadata.get("generation"),
+                patch_type=context.metadata.get("patch_type"),
+                results_dir=context.metadata.get("results_dir"),
+            ):
+                # Skip non-dict events instead of crashing on `.get` below
+                if not isinstance(event, dict):
+                    continue
+                json.dump(event, event_handle)
+                event_handle.write("\n")
+                event_count += 1
+                session_events.append(event)
+                if session_id is None:
+                    session_id = _extract_session_id(event)
+
+                # Handle standard event types
+                item = event.get("item")
+                if isinstance(item, dict):
+                    item_type = item.get("type")
+                    if item_type == "agent_message":
+                        text = item.get("text")
+                        if text:
+                            session_log.append(text)
+                    elif item_type == "command_execution":
+                        commands.append(
+                            CommandResult(
+                                command=item.get("command"),
+                                status=item.get("status"),
+                                exit_code=item.get("exit_code"),
+                                stdout=item.get("stdout"),
+                                stderr=item.get("stderr"),
+                            )
+                        )
+
+                # Handle direct event types
+                event_type = event.get("type")
+
+                # Capture model from init event (Claude CLI and ShinkaAgent emit this)
+                if event_type == "init" and model_from_event is None:
+                    model_candidate = event.get("model")
+                    if isinstance(model_candidate, str) and model_candidate:
+                        model_from_event = model_candidate
+
+                if event_type == "usage":
+                    usage = event.get("usage")
+                    if isinstance(usage, dict):
+                        # `or 0` also covers keys that are present but null
+                        for key in ("input_tokens", "output_tokens", "total_tokens"):
+                            usage_metrics[key] += float(usage.get(key) or 0)
+                        # Use real cost from Claude CLI if available
+                        usage_metrics["total_cost_usd"] += float(
+                            usage.get("total_cost_usd") or 0.0
+                        )
+
+        elapsed = time.monotonic() - start_time
+
+        changed_files: Dict[Path, str] = {}
+        files_checked = 0
+
+        for file_path in self.scratch_dir.rglob("*"):
+            if not file_path.is_file():
+                continue
+            rel_path = file_path.relative_to(self.scratch_dir)
+
+            # Skip internal session files - they shouldn't be part of the program
+            if str(rel_path) in ("session_log.jsonl", "session_meta.json"):
+                continue
+
+            files_checked += 1
+            try:
+                new_content = file_path.read_text(encoding="utf-8")
+            except UnicodeDecodeError:
+                encoded = base64.b64encode(file_path.read_bytes())
+                binary_changed_files[rel_path] = encoded.decode("ascii")
+                continue
+
+            # New file (no baseline entry) or existing file whose text differs
+            if baseline.get(rel_path) != new_content:
+                changed_files[rel_path] = new_content
+
+        if not changed_files and files_checked > 0:
+            logger.info(
+                "Agentic session completed but no files changed. "
+                f"Checked {files_checked} files in {self.scratch_dir}. "
+                f"Baseline files: {len(baseline)}"
+            )
+        elif changed_files:
+            logger.info(f"Agentic session changed {len(changed_files)} files: {[str(p) for p in changed_files.keys()]}")
+
+        # Use real cost if available (Claude CLI provides total_cost_usd),
+        # otherwise fallback to token-based placeholder estimate
+        real_cost = usage_metrics.get("total_cost_usd", 0.0)
+        fallback_cost = usage_metrics["total_tokens"] / 1000.0  # rough placeholder
+        final_cost = real_cost if real_cost > 0 else fallback_cost
+
+        metrics = {
+            "elapsed_seconds": elapsed,
+            "commands_run": float(len(commands)),
+            "messages_logged": float(len(session_log)),
+            "events_logged": float(event_count),
+            "estimated_input_tokens": usage_metrics["input_tokens"],
+            "estimated_output_tokens": usage_metrics["output_tokens"],
+            "estimated_total_tokens": usage_metrics["total_tokens"],
+            "estimated_total_cost": final_cost,
+            "total_cost": final_cost,
+            "input_tokens": usage_metrics["input_tokens"],
+            "output_tokens": usage_metrics["output_tokens"],
+            "total_tokens": usage_metrics["total_tokens"],
+            "real_cost_available": real_cost > 0,
+        }
+
+        final_message = session_log[-1] if session_log else None
+
+        return AgentResult(
+            changed_files=changed_files,
+            binary_changed_files=binary_changed_files,
+            session_log=session_log,
+            commands_run=commands,
+            final_message=final_message,
+            metrics=metrics,
+            session_log_path=session_log_path,
+            session_events=session_events,
+            session_id=session_id,
+            model=model_from_event,
+        )
+
+
+def _extract_session_id(event: Dict[str, Any]) -> Optional[str]:
+    """Return the first usable session identifier found in *event*, else None.
+
+    Lookup order: `thread_id` (only for `thread.*` events), top-level
+    `session_id`, then `id` / `session_id` inside a nested `session` dict.
+    Only non-empty strings count as identifiers.
+    """
+    if not isinstance(event, dict):
+        return None
+
+    candidates = []
+    kind = event.get("type")
+    if isinstance(kind, str) and kind.startswith("thread."):
+        candidates.append(event.get("thread_id"))
+    candidates.append(event.get("session_id"))
+    nested = event.get("session")
+    if isinstance(nested, dict):
+        candidates.append(nested.get("id") or nested.get("session_id"))
+
+    for value in candidates:
+        if isinstance(value, str) and value:
+            return value
+    return None
diff --git a/shinka/edit/codex_cli.py b/shinka/edit/codex_cli.py
new file mode 100644
index 000000000..1b5af8963
--- /dev/null
+++ b/shinka/edit/codex_cli.py
@@ -0,0 +1,295 @@
+"""Helpers for interacting with the Codex CLI."""
+
+from __future__ import annotations
+
+import json
+import shutil
+import subprocess
+import time
+from pathlib import Path
+from typing import Dict, Iterable, Iterator, Optional
+
+from shinka.tools.codex_session_registry import (
+ register_session_process,
+ remove_session_process,
+ update_session_process,
+)
+from shinka.edit.cost_utils import calculate_cost
+
+
+class CodexUnavailableError(RuntimeError):
+    """Raised when the Codex CLI binary cannot be found or is not a regular file."""
+
+
+class CodexExecutionError(RuntimeError):
+    """Raised when a Codex run fails, emits unparsable output, or exceeds limits."""
+
+
+def ensure_codex_available(codex_path: Optional[str] = None) -> Path:
+    """Locate the Codex CLI executable.
+
+    Args:
+        codex_path: Explicit location of the executable; when empty, the
+            `codex` command is looked up on PATH via `shutil.which`.
+
+    Raises:
+        CodexUnavailableError: If nothing is found or the path is not a file.
+
+    Returns:
+        Path: The located binary, exactly as given or found (not normalized).
+    """
+    location = codex_path if codex_path else shutil.which("codex")
+    if location:
+        binary = Path(location)
+        # is_file() is already False for paths that do not exist
+        if binary.is_file():
+            return binary
+        raise CodexUnavailableError(
+            f"Codex CLI binary not found at resolved path: {binary}"
+        )
+
+    raise CodexUnavailableError(
+        "Codex CLI not found. Install it with `npm install -g @openai/codex` "
+        "or add it to PATH, then authenticate via `codex login`."
+    )
+
+
+def _format_extra_config(extra: Dict[str, object]) -> Iterable[str]:
+    """Yield alternating `-c` and `key=value` arguments for each override.
+
+    None values are dropped; strings are passed through verbatim and every
+    other value is JSON-encoded.
+    """
+    for name, setting in extra.items():
+        if setting is None:
+            continue
+        rendered = setting if isinstance(setting, str) else json.dumps(setting)
+        yield "-c"
+        yield f"{name}={rendered}"
+
+
+def run_codex_task(
+    user_prompt: str,
+    workdir: Path,
+    *,
+    system_prompt: Optional[str] = None,
+    profile: Optional[str],
+    sandbox: str,
+    approval_mode: str,
+    max_seconds: int,
+    max_events: int,
+    extra_cli_config: Dict[str, object],
+    codex_path: Optional[str] = None,
+    cli_path: Optional[str] = None,  # Alias for codex_path
+    resume_session_id: Optional[str] = None,
+    session_kind: str = "unknown",
+    # Metadata params: only forwarded to the session registry
+    parent_id: Optional[str] = None,
+    generation: Optional[int] = None,
+    patch_type: Optional[str] = None,
+    results_dir: Optional[str] = None,
+) -> Iterator[Dict[str, object]]:
+    """Execute a Codex CLI task and stream its JSON events.
+
+    Args:
+        user_prompt: Natural language instruction for Codex.
+        workdir: Workspace directory Codex should modify.
+        system_prompt: Optional system instructions (prepended to prompt).
+        profile: Optional Codex profile name (selects model/settings).
+        sandbox: Sandbox policy passed to `--sandbox`.
+        approval_mode: Either `full-auto` or values accepted by
+            `--ask-for-approval`.
+        max_seconds: Wall-clock guardrail, re-checked before each read.
+        max_events: Maximum number of JSON events to yield before aborting.
+        extra_cli_config: Additional key/value overrides forwarded via `-c`.
+        codex_path: Optional explicit path to the CLI binary.
+        cli_path: Alias for codex_path; takes precedence when both are set.
+        resume_session_id: Session UUID to resume via `codex exec resume`.
+        session_kind, parent_id, generation, patch_type, results_dir:
+            Labels stored in the session registry; not passed to Codex.
+
+    Raises:
+        CodexExecutionError: If Codex fails, times out, or exceeds limits.
+        CodexUnavailableError: If the CLI binary cannot be located.
+
+    Yields:
+        Parsed JSON events from the CLI, then one synthetic `usage` event.
+    """
+    import threading  # local import: only the stderr drain below needs it
+
+    # Use cli_path if provided, fall back to codex_path for backward compat
+    binary = ensure_codex_available(cli_path or codex_path)
+
+    cmd = [str(binary), "exec"]
+    if resume_session_id:
+        cmd.append("resume")
+    cmd.extend(["--json", "--skip-git-repo-check", "-C", str(workdir)])
+
+    if profile:
+        cmd.extend(["--profile", profile])
+    if sandbox:
+        cmd.extend(["--sandbox", sandbox])
+    if approval_mode == "full-auto":
+        cmd.append("--full-auto")
+    elif approval_mode:
+        cmd.extend(["--ask-for-approval", approval_mode])
+
+    cmd.extend(_format_extra_config(extra_cli_config))
+    if resume_session_id:
+        cmd.append(resume_session_id)
+
+    # NOTE: Codex CLI has no separate system prompt flag. In agentic mode the
+    # harness owns the system prompt and the sampler already put task context
+    # (task_sys_msg) into the user prompt, so `system_prompt` holds only the
+    # operational instructions (AGENTIC_SYS_FORMAT); prepend them to the prompt.
+    full_prompt = user_prompt
+    if system_prompt:
+        full_prompt = f"{system_prompt}\n\n{user_prompt}"
+    cmd.append(full_prompt)
+
+    start_time = time.monotonic()
+    events_emitted = 0
+    # Token estimation for cost tracking (Codex CLI doesn't emit usage data)
+    estimated_input_tokens = len(full_prompt) // 4 if full_prompt else 0
+    estimated_output_tokens = 0
+    model_name = profile or "gpt-4.1-mini"  # Default Codex model (in pricing.py)
+    session_id: Optional[str] = None
+
+    process = subprocess.Popen(
+        cmd,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+    )
+
+    # Drain stderr in the background. Left unread, a chatty child blocks once
+    # the pipe buffer fills, stdout never reaches EOF and the loop below hangs.
+    stderr_lines: list = []
+    stderr_reader = threading.Thread(
+        target=lambda: stderr_lines.extend(process.stderr or ()), daemon=True
+    )
+    stderr_reader.start()
+
+    prompt_preview = full_prompt.strip().splitlines()[0][:160] if full_prompt else ""
+    register_session_process(
+        process.pid,
+        prompt_preview=prompt_preview,
+        workdir=workdir,
+        session_kind=session_kind,
+        parent_id=parent_id,
+        generation=generation,
+        patch_type=patch_type,
+        results_dir=results_dir,
+    )
+
+    try:
+        if not process.stdout:
+            raise CodexExecutionError("Codex CLI did not provide stdout pipe.")
+
+        while True:
+            if max_seconds > 0 and time.monotonic() - start_time > max_seconds:
+                process.kill()
+                raise CodexExecutionError(
+                    f"Codex task exceeded {max_seconds}s timeout."
+                )
+
+            line = process.stdout.readline()
+            if not line:
+                if process.poll() is not None:
+                    break
+                time.sleep(0.05)
+                continue
+
+            line = line.strip()
+            if not line:
+                continue
+
+            try:
+                event = json.loads(line)
+            except json.JSONDecodeError as exc:  # pragma: no cover - defensive
+                raise CodexExecutionError(
+                    f"Failed to parse Codex event: {line}"
+                ) from exc
+
+            events_emitted += 1
+            if max_events and events_emitted > max_events:
+                process.kill()
+                raise CodexExecutionError(
+                    "Codex emitted more events than allowed (max_events)."
+                )
+
+            if isinstance(event, dict):
+                extracted_sid = _extract_session_id(event)
+                if extracted_sid:
+                    session_id = extracted_sid
+                    update_session_process(process.pid, session_id=extracted_sid)
+
+                # Track output content for token estimation
+                content = event.get("content") or event.get("text") or ""
+                msg = event.get("message")
+                msg_content = msg.get("content") if isinstance(msg, dict) else None
+                if isinstance(msg_content, str):
+                    content = msg_content
+                elif isinstance(msg_content, list):
+                    # Content blocks: join text parts; only str is ever concatenated
+                    content = content if isinstance(content, str) else ""
+                    for block in msg_content:
+                        text = block.get("text") if isinstance(block, dict) else None
+                        if isinstance(text, str) and block.get("type") == "text":
+                            content += text
+                if isinstance(content, str) and content:
+                    estimated_output_tokens += len(content) // 4
+
+            yield event
+
+        # Emit usage event at session end
+        total_tokens = estimated_input_tokens + estimated_output_tokens
+        yield {
+            "type": "usage",
+            "session_id": session_id,
+            "usage": {
+                "input_tokens": estimated_input_tokens,
+                "output_tokens": estimated_output_tokens,
+                "total_tokens": total_tokens,
+                "total_cost_usd": calculate_cost(
+                    model_name,
+                    estimated_input_tokens,
+                    estimated_output_tokens,
+                    "codex",
+                ),
+            },
+            "model": model_name,
+        }
+
+        returncode = process.wait(timeout=1)
+        if returncode != 0:
+            stderr_reader.join(timeout=1)
+            stderr_out = "".join(stderr_lines)
+            raise CodexExecutionError(
+                f"Codex CLI exited with status {returncode}: {stderr_out.strip()}"
+            )
+    finally:
+        if process.poll() is None:
+            process.kill()
+        remove_session_process(process.pid)
+
+
+def _extract_session_id(event: Dict[str, object]) -> Optional[str]:
+    """Return a session or thread id carried by a Codex CLI event, if any.
+
+    `thread_id` is honoured only on `thread.*` events and wins over a top-level
+    `session_id`, which in turn wins over `session.id` / `session.session_id`.
+    """
+    if not isinstance(event, dict):
+        return None
+    kind = event.get("type")
+    on_thread_event = isinstance(kind, str) and kind.startswith("thread.")
+    nested = event.get("session")
+    if not isinstance(nested, dict):
+        nested = {}
+    ordered = (
+        event.get("thread_id") if on_thread_event else None,
+        event.get("session_id"),
+        nested.get("id") or nested.get("session_id"),
+    )
+    return next((v for v in ordered if isinstance(v, str) and v), None)
diff --git a/shinka/edit/cost_utils.py b/shinka/edit/cost_utils.py
new file mode 100644
index 000000000..482c7888f
--- /dev/null
+++ b/shinka/edit/cost_utils.py
@@ -0,0 +1,52 @@
+"""Cost calculation utilities for CLI backends.
+
+Provides shared cost calculation using pricing tables from shinka/llm/models/pricing.py.
+Used by gemini_cli.py and codex_cli.py to calculate costs from estimated tokens.
+"""
+
+from typing import Optional
+
+from shinka.llm.models.pricing import GEMINI_MODELS, OPENAI_MODELS
+
+
+def calculate_cost(
+    model: Optional[str],
+    input_tokens: int,
+    output_tokens: int,
+    backend: str = "auto",
+) -> float:
+    """Estimate the USD cost of a session from its token counts.
+
+    Args:
+        model: Model name used as the key into the pricing tables.
+        input_tokens: Number of input tokens (may be an estimate).
+        output_tokens: Number of output tokens (may be an estimate).
+        backend: "gemini" or "codex" restricts the lookup to that table;
+            any other value (default "auto") tries Gemini, then OpenAI.
+
+    Returns:
+        Token counts multiplied by the table's per-token prices, or a flat
+        $0.002 per 1K tokens when no model is given or it has no table entry.
+    """
+    # Flat rate used whenever table pricing is unavailable
+    flat_rate_cost = (input_tokens + output_tokens) * 0.000002
+    if not model:
+        return flat_rate_cost
+
+    # Pick the table(s) to search; unknown backends behave like "auto"
+    tables_by_backend = {
+        "gemini": (GEMINI_MODELS,),
+        "codex": (OPENAI_MODELS,),
+    }
+    search_order = tables_by_backend.get(backend, (GEMINI_MODELS, OPENAI_MODELS))
+
+    entry = None
+    for table in search_order:
+        entry = table.get(model)
+        if entry:
+            break
+    if not entry:
+        return flat_rate_cost
+
+    input_cost = input_tokens * entry["input_price"]
+    return input_cost + output_tokens * entry["output_price"]
diff --git a/shinka/edit/shinka_agent.py b/shinka/edit/shinka_agent.py
new file mode 100644
index 000000000..0443353bd
--- /dev/null
+++ b/shinka/edit/shinka_agent.py
@@ -0,0 +1,407 @@
+"""Native ShinkaAgent backend using shinka/llm/LLMClient.
+
+This module implements a native, model-agnostic agentic editing backend
+that uses Shinka's existing LLM infrastructure. Unlike the CLI wrappers
+(Codex, Gemini, Claude), ShinkaAgent runs entirely in-process, providing
+full control over the agent loop and leveraging existing LLM ensembling.
+
+The design follows the mini-SWE-agent pattern:
+- Single bash action per response (enforced via regex)
+- Linear message history (no branching)
+- subprocess.run() for action execution (stateless)
+- Termination via magic output string
+
+Reference: https://github.com/SWE-agent/mini-swe-agent
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import re
+import subprocess
+import time
+import uuid
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, Iterator, List, Optional
+
+from shinka.llm import LLMClient
+from shinka.tools.codex_session_registry import (
+ register_session_process,
+ remove_session_process,
+ update_session_process,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class ShinkaUnavailableError(RuntimeError):
+    """Raised when no API key is found in the environment or the credential store."""
+
+
+class ShinkaExecutionError(RuntimeError):
+    """Agent-loop failure. NOTE(review): nothing in this module raises it yet."""
+
+
+# Captures the body (group 1) of a ```bash fenced block in an LLM reply
+ACTION_RE = re.compile(r"```bash\s*\n(.*?)\n```", re.DOTALL)
+
+# Built-in system prompt for the bash-only agent; a caller prompt is put before it
+SHINKA_SYSTEM_PROMPT = '''You are an expert software engineer working inside a sandboxed repository.
+
+IMPORTANT RULES:
+1. You can ONLY interact via bash commands in ```bash...``` blocks
+2. ONE bash block per response - additional blocks are ignored
+3. Only edit code between EVOLVE-BLOCK-START and EVOLVE-BLOCK-END markers
+4. Use standard tools: cat, sed, echo, python, etc.
+5. Keep responses concise - avoid lengthy explanations
+
+When your task is complete, include this exact text in your response:
+COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT
+
+Example response:
+I'll read the current file first.
+```bash
+cat main.py
+```
+
+After seeing the output, make targeted edits to improve the score.
+'''
+
+# Message sent back to the LLM after a command ran (exit code plus its output)
+OBSERVATION_TEMPLATE = '''OBSERVATION:
+Exit code: {exit_code}
+{output}'''
+
+# Max characters for observation to avoid context overflow
+MAX_OBSERVATION_CHARS = 16000
+
+# Environment variables that ensure_shinka_available() accepts as credentials
+API_KEY_VARS = [
+    "OPENAI_API_KEY",
+    "ANTHROPIC_API_KEY",
+    "DEEPSEEK_API_KEY",
+    "GOOGLE_API_KEY",
+    "AWS_ACCESS_KEY_ID",  # For Bedrock
+]
+
+# Credential-store provider name -> environment variable that receives its key
+PROVIDER_ENV_VAR_MAP = {
+    "codex": "OPENAI_API_KEY",
+    "claude": "ANTHROPIC_API_KEY",
+    "gemini": "GOOGLE_API_KEY",
+    "deepseek": "DEEPSEEK_API_KEY",
+}
+
+
+def ensure_shinka_available() -> bool:
+    """Verify that credentials for at least one LLM provider can be found.
+
+    Lookup order:
+    1. The environment variables named in `API_KEY_VARS`.
+    2. `shinka.tools.credentials.get_api_key` for each provider in
+       `PROVIDER_ENV_VAR_MAP`; the first key found is also exported to the
+       matching environment variable.
+
+    Returns:
+        True as soon as one key is found.
+
+    Raises:
+        ShinkaUnavailableError: If neither source yields a key.
+    """
+    if any(os.environ.get(name) for name in API_KEY_VARS):
+        return True
+
+    try:
+        from shinka.tools.credentials import get_api_key
+
+        for provider, env_name in PROVIDER_ENV_VAR_MAP.items():
+            stored_key = get_api_key(provider)
+            if not stored_key:
+                continue
+            # Export the key so code that only reads the environment sees it
+            os.environ[env_name] = stored_key
+            return True
+    except ImportError:
+        pass  # credentials module not available
+
+    missing_hint = ", ".join(API_KEY_VARS)
+    raise ShinkaUnavailableError(
+        "No LLM API keys found. Set at least one of: " + missing_hint
+    )
+
+
+def _truncate_output(text: str, max_chars: int = MAX_OBSERVATION_CHARS) -> str:
+    """Return *text* unchanged if it fits, else its head and tail around a marker."""
+    if len(text) <= max_chars:
+        return text
+    half = max(max_chars, 0) // 2  # may be 0; text[-0:] would return all of text
+    return f"{text[:half]}\n... [truncated {len(text) - 2 * half} chars] ...\n{text[len(text) - half:]}"
+
+
+def _execute_bash(command: str, cwd: Path, timeout: int = 120) -> tuple[int, str, str]:
+    """Run *command* through the shell in *cwd*; return (exit_code, stdout, stderr).
+
+    A timeout or any other exception is reported as exit code 1 with empty
+    stdout and the reason in the stderr slot instead of being raised.
+    """
+    try:
+        # shell=True: the command is handed to the shell as one string
+        run_options = dict(shell=True, cwd=str(cwd), capture_output=True, text=True)
+        completed = subprocess.run(command, timeout=timeout, **run_options)
+    except subprocess.TimeoutExpired:
+        return 1, "", f"Command timed out after {timeout}s"
+    except Exception as exc:
+        return 1, "", str(exc)
+    else:
+        return completed.returncode, completed.stdout, completed.stderr
+
+
+def run_shinka_task(
+    user_prompt: str,
+    workdir: Path,
+    *,
+    system_prompt: Optional[str] = None,
+    profile: Optional[str],
+    sandbox: str,
+    approval_mode: str,
+    max_seconds: int,
+    max_events: int,
+    extra_cli_config: Dict[str, Any],
+    codex_path: Optional[str] = None,
+    cli_path: Optional[str] = None,  # Alias for codex_path (unused for ShinkaAgent)
+    resume_session_id: Optional[str] = None,
+    session_kind: str = "unknown",
+    # Metadata params for session registry tracking
+    parent_id: Optional[str] = None,
+    generation: Optional[int] = None,
+    patch_type: Optional[str] = None,
+    results_dir: Optional[str] = None,
+) -> Iterator[Dict[str, Any]]:
+    """Execute a ShinkaAgent task and stream JSON events.
+
+    This function implements the AgentRunner protocol for native in-process
+    agent execution using shinka/llm/LLMClient.
+
+    Args:
+        user_prompt: Natural language instruction for the agent.
+        workdir: Directory in which the agent's bash commands are executed.
+        system_prompt: Optional instructions placed before the built-in prompt.
+        profile: Optional model name; wins over `extra_cli_config["model"]`.
+        sandbox: Sandbox policy (ignored for ShinkaAgent - runs locally).
+        approval_mode: Approval mode (ignored for ShinkaAgent - full-auto).
+        max_seconds: Wall-clock limit, checked at the start of each turn.
+        max_events: Maximum number of LLM turns before stopping.
+        extra_cli_config: Optional `model`, `temperature` and `max_tokens`.
+        codex_path, cli_path: Ignored for ShinkaAgent.
+        resume_session_id: Reused as the session id; no history is restored.
+        session_kind, parent_id, generation, patch_type, results_dir: Registry labels.
+
+    Yields:
+        Events in the same format as the CLI wrappers:
+        - init: Session start with session_id, model, timestamp
+        - agent_message: LLM response text, or a timeout/limit notice
+        - command_execution: Bash command result
+        - usage: Token/cost telemetry at session end
+
+    Raises:
+        ShinkaUnavailableError: If no API keys are configured.
+        Exception: Anything raised by `LLMClient` propagates unchanged.
+    """
+    ensure_shinka_available()
+
+    session_id = resume_session_id or str(uuid.uuid4())
+    start_time = time.monotonic()
+
+    # Determine model(s) to use
+    # Default to gpt-4.1-mini - good balance of cost/capability for agentic tasks
+    # Can be overridden via config: evo_config.agentic.extra_cli_config.model
+    model_name = profile or extra_cli_config.get("model") or "gpt-4.1-mini"
+    model_names = [model_name] if isinstance(model_name, str) else list(model_name)
+
+    # Extract LLM kwargs from extra_cli_config with proper key mapping
+    # LLMClient uses 'temperatures' (plural) but config often has 'temperature'
+    llm_kwargs = {}
+    if "temperature" in extra_cli_config:
+        llm_kwargs["temperatures"] = extra_cli_config["temperature"]
+    if "max_tokens" in extra_cli_config:
+        llm_kwargs["max_tokens"] = extra_cli_config["max_tokens"]
+
+    # Initialize LLMClient with configured models
+    llm = LLMClient(model_names=model_names, verbose=False, **llm_kwargs)
+
+    # NOTE: ShinkaAgent has its own SHINKA_SYSTEM_PROMPT that defines how the
+    # agent operates (bash-only, one block per response, etc.). In agentic mode,
+    # task-specific context (task_sys_msg) is included in the user prompt by the
+    # sampler. The system_prompt param here contains only operational instructions
+    # (AGENTIC_SYS_FORMAT) which we prepend to our SHINKA_SYSTEM_PROMPT.
+    base_system = SHINKA_SYSTEM_PROMPT
+    if system_prompt:
+        base_system = f"{system_prompt}\n\n{SHINKA_SYSTEM_PROMPT}"
+
+    # Message history for multi-turn conversation
+    messages: List[Dict[str, str]] = []
+
+    # Cost tracking
+    total_input_tokens = 0
+    total_output_tokens = 0
+    total_cost = 0.0
+
+    # Register session under a synthetic, strictly negative PID (in-process run)
+    pseudo_pid = -(abs(hash(session_id)) % 100000 + 1)  # parenthesised: (-x) % n is >= 0
+    register_session_process(
+        pseudo_pid,
+        prompt_preview=user_prompt[:160],
+        workdir=workdir,
+        session_kind=session_kind,
+        parent_id=parent_id,
+        generation=generation,
+        patch_type=patch_type,
+        results_dir=results_dir,
+    )
+    update_session_process(pseudo_pid, session_id=session_id)
+
+    try:
+        # Emit init event
+        yield {
+            "type": "init",
+            "session_id": session_id,
+            "model": model_names[0],
+            "timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
+        }
+
+        # Add initial user message
+        current_msg = user_prompt
+        turn_count = 0
+
+        while True:
+            # Check time limit
+            elapsed = time.monotonic() - start_time
+            if max_seconds > 0 and elapsed > max_seconds:
+                yield {
+                    "type": "agent_message",
+                    "item": {
+                        "type": "agent_message",
+                        "text": f"[Session timed out after {elapsed:.1f}s]",
+                    },
+                    "session_id": session_id,
+                }
+                break
+
+            # Check turn limit
+            turn_count += 1
+            if max_events > 0 and turn_count > max_events:
+                yield {
+                    "type": "agent_message",
+                    "item": {
+                        "type": "agent_message",
+                        "text": f"[Session reached max turns: {max_events}]",
+                    },
+                    "session_id": session_id,
+                }
+                break
+
+            # Query LLM
+            llm_call_kwargs = llm.get_kwargs()
+            response = llm.query(
+                msg=current_msg,
+                system_msg=base_system,
+                msg_history=messages,
+                llm_kwargs=llm_call_kwargs,
+            )
+
+            if response is None or response.content is None:
+                yield {
+                    "type": "agent_message",
+                    "item": {
+                        "type": "agent_message",
+                        "text": "[LLM returned empty response]",
+                    },
+                    "session_id": session_id,
+                }
+                break
+
+            # Track costs using actual values from QueryResult
+            total_cost += response.cost or 0.0
+            total_input_tokens += response.input_tokens or 0
+            total_output_tokens += response.output_tokens or 0
+
+            # Update message history
+            messages.append({"role": "user", "content": current_msg})
+            messages.append({"role": "assistant", "content": response.content})
+
+            # Emit agent message event
+            yield {
+                "type": "agent_message",
+                "item": {"type": "agent_message", "text": response.content},
+                "session_id": session_id,
+            }
+
+            # Parse bash action FIRST - execute any pending commands before terminating
+            action_match = ACTION_RE.search(response.content)
+            has_termination = "COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT" in response.content
+
+            # If there's a bash action, execute it even if termination signal is present
+            # This handles the case where the agent says "I'll do X" + bash + "done"
+            if action_match:
+                command = action_match.group(1).strip()
+
+                # Execute command
+                exit_code, stdout, stderr = _execute_bash(command, workdir)
+
+                # Format observation
+                output = stdout + stderr
+                output = _truncate_output(output)
+                observation = OBSERVATION_TEMPLATE.format(
+                    exit_code=exit_code,
+                    output=output or "(no output)",
+                )
+
+                # Emit command execution event
+                yield {
+                    "type": "command_execution",
+                    "item": {
+                        "type": "command_execution",
+                        "command": command,
+                        "status": "success" if exit_code == 0 else "error",
+                        "exit_code": exit_code,
+                        "stdout": _truncate_output(stdout, 8000),
+                        "stderr": _truncate_output(stderr, 8000),
+                    },
+                    "session_id": session_id,
+                }
+
+                # Set next message to observation
+                current_msg = observation
+
+            # Check for termination AFTER executing any bash commands
+            if has_termination:
+                logger.info(
+                    f"ShinkaAgent completed task in {turn_count} turns, "
+                    f"{time.monotonic() - start_time:.1f}s, cost=${total_cost:.4f}"
+                )
+                break
+
+            # If no bash action and no termination, prompt for one
+            if not action_match:
+                current_msg = (
+                    "Please provide a bash command in ```bash...``` block, "
+                    "or say COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT if done."
+                )
+
+        # Emit usage event at end
+        yield {
+            "type": "usage",
+            "session_id": session_id,
+            "usage": {
+                "input_tokens": total_input_tokens,
+                "output_tokens": total_output_tokens,
+                "total_tokens": total_input_tokens + total_output_tokens,
+                "total_cost_usd": total_cost,
+            },
+        }
+
+    finally:
+        remove_session_process(pseudo_pid)
diff --git a/shinka/edit/types.py b/shinka/edit/types.py
new file mode 100644
index 000000000..e027c49db
--- /dev/null
+++ b/shinka/edit/types.py
@@ -0,0 +1,25 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Dict, Iterator, Optional, Protocol
+
+class AgentRunner(Protocol):
+ """Protocol for an agent runner that executes a prompt in a workspace."""
+
+ def __call__(
+ self,
+ user_prompt: str,
+ workdir: Path,
+ *,
+ system_prompt: Optional[str] = None,
+ profile: Optional[str],
+ sandbox: str,
+ approval_mode: str,
+ max_seconds: int,
+ max_events: int,
+ extra_cli_config: Dict[str, Any],
+ codex_path: Optional[str] = None,
+ resume_session_id: Optional[str] = None,
+ session_kind: str = "unknown",
+ ) -> Iterator[Dict[str, Any]]:
+ ...
diff --git a/shinka/eval/__init__.py b/shinka/eval/__init__.py
new file mode 100644
index 000000000..17b3faf5d
--- /dev/null
+++ b/shinka/eval/__init__.py
@@ -0,0 +1,3 @@
+"""Agentic evaluation utilities."""
+
+from .agentic import AgenticEvaluator, AgenticEvaluatorResult # noqa: F401
diff --git a/shinka/eval/agentic.py b/shinka/eval/agentic.py
new file mode 100644
index 000000000..a5b88a1bd
--- /dev/null
+++ b/shinka/eval/agentic.py
@@ -0,0 +1,198 @@
+"""Codex-powered evaluator that runs deterministic scripts inside the repo."""
+
+from __future__ import annotations
+
+import json
+import time
+import uuid
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Sequence, TYPE_CHECKING
+
+from shinka.edit.agentic import CommandResult
+from shinka.edit.codex_cli import CodexExecutionError, run_codex_task
+from shinka.edit.types import AgentRunner
+from shinka.prompts import AGENTIC_EVAL_SYS, AGENTIC_EVAL_USER
+
+if TYPE_CHECKING: # pragma: no cover
+ from shinka.core.runner import AgenticEvaluatorConfig
+
+
+@dataclass
+class AgenticEvaluatorResult:
+ """Structured output from a Codex evaluation session."""
+
+ metrics: Dict[str, Any]
+ correct: bool
+ error_message: Optional[str]
+ stdout_log: str
+ stderr_log: str
+ session_log: List[str]
+ commands_run: List[CommandResult]
+ session_log_path: Path
+ session_events: List[Dict[str, Any]]
+ session_id: Optional[str]
+ session_dir: Path
+ elapsed_seconds: float
+
+
+class AgenticEvaluator:
+ """Drive the Codex-based evaluator from the repository root."""
+
+ def __init__(
+ self,
+ config: "AgenticEvaluatorConfig",
+ *,
+ codex_runner: AgentRunner = None,
+ agent_runner: AgentRunner = None, # Alias for codex_runner
+ ) -> None:
+ self.config = config
+ # Accept either codex_runner or agent_runner for backward compatibility
+ self.codex_runner = codex_runner or agent_runner or run_codex_task
+
+ def evaluate(
+ self,
+ *,
+ repo_root: Path,
+ eval_command: Sequence[str],
+ program_path: Path,
+ results_path: Path,
+ metrics_path: Path,
+ eval_sessions_root: Path,
+ task_name: str,
+ results_dir: Optional[str] = None,
+ ) -> AgenticEvaluatorResult:
+ session_uuid = uuid.uuid4().hex
+ session_dir = eval_sessions_root / session_uuid
+ session_dir.mkdir(parents=True, exist_ok=True)
+ session_log_path = session_dir / "session_log.jsonl"
+
+ user_prompt, system_prompt = self._build_prompt(
+ task_name=task_name,
+ eval_command=eval_command,
+ program_path=program_path,
+ results_path=results_path,
+ metrics_path=metrics_path,
+ )
+
+ session_log: List[str] = []
+ commands: List[CommandResult] = []
+ session_events: List[Dict[str, Any]] = []
+ resolved_session_id: Optional[str] = None
+
+ start_time = time.monotonic()
+ with session_log_path.open("w", encoding="utf-8") as handle:
+ for event in self.codex_runner(
+ user_prompt=user_prompt,
+ system_prompt=system_prompt,
+ workdir=repo_root,
+ profile=self.config.codex_profile,
+ sandbox=self.config.sandbox,
+ approval_mode=self.config.approval_mode,
+ max_seconds=self.config.max_seconds,
+ max_events=self.config.max_turns,
+ extra_cli_config=self.config.extra_cli_config,
+ codex_path=self.config.codex_path,
+ session_kind="eval",
+ results_dir=results_dir,
+ ):
+ if isinstance(event, dict):
+ json.dump(event, handle)
+ handle.write("\n")
+ session_events.append(event)
+ if resolved_session_id is None:
+ resolved_session_id = _extract_session_id(event)
+
+ item = event.get("item") if isinstance(event, dict) else None
+ if not item:
+ continue
+ if item.get("type") == "agent_message":
+ text = item.get("text")
+ if text:
+ session_log.append(text)
+ elif item.get("type") == "command_execution":
+ commands.append(
+ CommandResult(
+ command=item.get("command"),
+ status=item.get("status"),
+ exit_code=item.get("exit_code"),
+ stdout=item.get("stdout"),
+ stderr=item.get("stderr"),
+ )
+ )
+ elapsed = time.monotonic() - start_time
+
+ if not metrics_path.exists():
+ raise CodexExecutionError(
+ f"Agentic evaluator did not produce metrics at {metrics_path}"
+ )
+
+ metrics = json.loads(metrics_path.read_text(encoding="utf-8"))
+ correct_payload: Dict[str, Any] = {}
+ correct_file = results_path / "correct.json"
+ if correct_file.exists():
+ correct_payload = json.loads(correct_file.read_text(encoding="utf-8"))
+ correct_flag = bool(correct_payload.get("correct", False))
+ error_msg = correct_payload.get("error")
+
+ stdout_log = "\n".join((cmd.stdout or "") for cmd in commands if cmd.stdout)
+ stderr_log = "\n".join((cmd.stderr or "") for cmd in commands if cmd.stderr)
+
+ metrics.setdefault("evaluation_time_seconds", elapsed)
+
+ return AgenticEvaluatorResult(
+ metrics=metrics,
+ correct=correct_flag,
+ error_message=error_msg,
+ stdout_log=stdout_log,
+ stderr_log=stderr_log,
+ session_log=session_log,
+ commands_run=commands,
+ session_log_path=session_log_path,
+ session_events=session_events,
+ session_id=resolved_session_id,
+ session_dir=session_dir,
+ elapsed_seconds=elapsed,
+ )
+
+ def _build_prompt(
+ self,
+ *,
+ task_name: str,
+ eval_command: Sequence[str],
+ program_path: Path,
+ results_path: Path,
+ metrics_path: Path,
+ ) -> tuple[str, str]:
+ command_str = " ".join(eval_command)
+ user = AGENTIC_EVAL_USER.format(
+ task_name=task_name,
+ eval_command=command_str,
+ program_path=program_path,
+ results_path=results_path,
+ metrics_path=metrics_path,
+ )
+ return user.strip(), AGENTIC_EVAL_SYS.strip()
+
+
+def _extract_session_id(event: Dict[str, Any]) -> Optional[str]:
+ if not isinstance(event, dict):
+ return None
+
+ event_type = event.get("type")
+ if isinstance(event_type, str) and event_type.startswith("thread."):
+ thread_id = event.get("thread_id")
+ if isinstance(thread_id, str) and thread_id:
+ return thread_id
+
+ session_id = event.get("session_id")
+ if isinstance(session_id, str) and session_id:
+ return session_id
+
+ session_obj = event.get("session")
+ if isinstance(session_obj, dict):
+ candidate = session_obj.get("id") or session_obj.get("session_id")
+ if isinstance(candidate, str) and candidate:
+ return candidate
+
+ return None
diff --git a/shinka/prompts/__init__.py b/shinka/prompts/__init__.py
index bda20e4ef..b1b1038d2 100644
--- a/shinka/prompts/__init__.py
+++ b/shinka/prompts/__init__.py
@@ -26,6 +26,8 @@
META_STEP3_USER_MSG,
)
from .prompts_novelty import NOVELTY_SYSTEM_MSG, NOVELTY_USER_MSG
+from .prompts_agentic import AGENTIC_SYS_FORMAT, AGENTIC_ITER_MSG
+from .prompts_agentic_eval import AGENTIC_EVAL_SYS, AGENTIC_EVAL_USER
__all__ = [
"construct_eval_history_msg",
@@ -51,4 +53,8 @@
"META_STEP3_USER_MSG",
"NOVELTY_SYSTEM_MSG",
"NOVELTY_USER_MSG",
+ "AGENTIC_SYS_FORMAT",
+ "AGENTIC_ITER_MSG",
+ "AGENTIC_EVAL_SYS",
+ "AGENTIC_EVAL_USER",
]
diff --git a/shinka/prompts/prompts_agentic.py b/shinka/prompts/prompts_agentic.py
new file mode 100644
index 000000000..0b1329677
--- /dev/null
+++ b/shinka/prompts/prompts_agentic.py
@@ -0,0 +1,76 @@
+"""Prompt fragments specialized for agentic Codex editing sessions."""
+
+AGENTIC_SYS_FORMAT = """
+You are operating inside a sandboxed checkout of the user's repository. You have
+direct shell access and must apply changes by editing the files within this
+workspace instead of replying with diffs or entire rewritten files. Run shell
+commands such as `apply_patch`, `cat <<'EOF'`, text editors, or project CLI
+commands to read and modify files. You may open and change multiple files during
+the same edit as long as every change remains within EVOLVE-BLOCK regions for
+those files, and you keep the program runnable.
+
+Multi-file edits are expected: helper modules, evaluators, assets, and configs
+that live next to the main program are already copied into the workspace for
+you. Update them whenever your change requires supporting code, and feel free to
+run formatters or tests inside the sandbox to validate your work.
+
+When you are satisfied with the repository state, stop issuing shell commands
+and send a single final message formatted exactly like this:
+
+
+short_snake_case_identifier
+
+
+
+Reasoning behind the change and which behaviors or metrics it should improve.
+
+
+
+- main.py: example note about the adjustment you made
+- helpers/motifs.py: describe any helper edits (add more bullets as needed)
+
+
+Do not include raw code or diffs in the final summary—the tooling captures the
+actual files automatically. If you forget to modify the files and only describe
+a change, the run will be discarded.
+"""
+
+
+AGENTIC_ITER_MSG = """{task_context}
+# Current program
+
+Here is the current program snapshot for quick reference. You still need to
+inspect and edit the real files in the workspace when making changes.
+
+```{language}
+{code_content}
+```
+
+Here are the current performance metrics:
+
+{performance_metrics}{text_feedback_section}
+
+# Workspace instructions
+
+1. Treat `main.{language}` as the primary entry point, but feel free to open and
+ modify any helper modules (for example, rendering utilities or motif
+ libraries) that sit next to it in the workspace.
+2. Only change code that lies between the `EVOLVE-BLOCK-START` and
+ `EVOLVE-BLOCK-END` markers within each file. Leave scaffold code outside
+ those markers untouched.
+3. Use shell commands to edit files directly: `apply_patch`, `python - <<'PY'`,
+ redirection into files, or other CLI tools are all available. Running tests
+ or formatters (e.g., `pytest`, `ruff`, `black`) is encouraged when it helps
+ validate your edit.
+4. Multi-file edits should stay coherent—if you introduce a function in
+ `main.py`, update the relevant helper modules or configs in the same session
+ so the evaluator can run without manual fixes.
+
+# Task
+
+Propose and implement a concrete improvement that should increase the
+`combined_score`. Think in terms of hill-climbing: inspect the workspace, edit
+the files needed for your idea, and make sure the resulting program still runs.
+When finished, provide the formatted summary described in the system prompt.
+"""
+
diff --git a/shinka/prompts/prompts_agentic_eval.py b/shinka/prompts/prompts_agentic_eval.py
new file mode 100644
index 000000000..6eb4520e1
--- /dev/null
+++ b/shinka/prompts/prompts_agentic_eval.py
@@ -0,0 +1,39 @@
+"""Prompt templates for Codex-based evaluation sessions."""
+
+AGENTIC_EVAL_SYS = """
+You are an autonomous evaluator operating inside the repository workspace. Run
+exact shell commands, capture their outputs, and report the resulting metrics.
+Follow these rules:
+
+1. Execute the provided evaluation command verbatim (except for inserting
+ simple helpers such as `mkdir -p` when a directory is missing).
+2. Inspect the referenced metrics JSON file and copy it verbatim into
+ `{...}` so downstream tools can parse it.
+3. If the command fails or the metrics file is missing, describe the issue
+ inside `...` along with relevant stdout/stderr.
+4. Do not modify source files beyond what the evaluation command itself does.
+"""
+
+AGENTIC_EVAL_USER = """
+# Evaluation Task
+
+- Task: {task_name}
+- Working directory: repository root
+- Program path: {program_path}
+- Results path: {results_path}
+- Metrics JSON: {metrics_path}
+
+Run this command:
+
+```
+{eval_command}
+```
+
+After it finishes:
+1. Verify `{metrics_path}` exists, read it, and include the JSON inside
+ `...`.
+2. If the command fails, capture stdout/stderr and describe the failure inside
+ `...`.
+
+Stop once you have produced the metrics or an error report.
+"""
diff --git a/shinka/tools/__init__.py b/shinka/tools/__init__.py
new file mode 100644
index 000000000..c4273ee73
--- /dev/null
+++ b/shinka/tools/__init__.py
@@ -0,0 +1 @@
+"""Utility scripts and helpers for Shinka."""
diff --git a/shinka/tools/codex_session_registry.py b/shinka/tools/codex_session_registry.py
new file mode 100644
index 000000000..df7b5bff4
--- /dev/null
+++ b/shinka/tools/codex_session_registry.py
@@ -0,0 +1,149 @@
+"""Registry for tracking live Codex CLI sessions and their OS PIDs."""
+
+from __future__ import annotations
+
+import json
+import os
+import signal
+import time
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+REGISTRY_DIR = Path.home() / ".codex" / "shinka_sessions"
+
+
+def _ensure_registry_dir() -> None:
+ REGISTRY_DIR.mkdir(parents=True, exist_ok=True)
+
+
+def _entry_path(key: str | int) -> Path:
+ _ensure_registry_dir()
+ return REGISTRY_DIR / f"{key}.json"
+
+
+def register_session_process(
+ pid: int,
+ *,
+ prompt_preview: str,
+ workdir: Path,
+ session_kind: str = "unknown",
+ parent_id: Optional[str] = None,
+ generation: Optional[int] = None,
+ patch_type: Optional[str] = None,
+ results_dir: Optional[str] = None,
+ filename_key: Optional[str] = None,
+) -> None:
+ """Persist minimal metadata about a newly spawned Codex CLI process.
+
+ Args:
+ pid: The OS process ID to check for liveness.
+ results_dir: The run's results directory (for matching sessions to runs).
+ filename_key: Optional unique string for the filename. Defaults to str(pid).
+ Use this if multiple sessions might share the same PID (e.g. threads).
+ """
+
+ entry = {
+ "pid": pid,
+ "prompt_preview": prompt_preview.strip(),
+ "workdir": str(workdir),
+ "started_at": time.time(),
+ "session_kind": session_kind,
+ "session_id": None,
+ "status": "running",
+ "parent_id": parent_id,
+ "generation": generation,
+ "patch_type": patch_type,
+ "results_dir": results_dir,
+ }
+
+ key = filename_key if filename_key else pid
+ _entry_path(key).write_text(json.dumps(entry), encoding="utf-8")
+
+
+def update_session_process(pid: int, filename_key: Optional[str] = None, **updates: Any) -> None:
+ """Merge updates into an existing registry entry.
+
+ Args:
+ pid: Legacy argument, used as key if filename_key is None.
+ filename_key: The specific file key to update.
+ """
+ key = filename_key if filename_key else pid
+ path = _entry_path(key)
+ if not path.exists():
+ return
+ try:
+ data = json.loads(path.read_text(encoding="utf-8"))
+ except json.JSONDecodeError:
+ data = {}
+ data.update(updates)
+ path.write_text(json.dumps(data), encoding="utf-8")
+
+
+def remove_session_process(pid: int, filename_key: Optional[str] = None) -> None:
+ """Remove an entry once the Codex process exits."""
+ key = filename_key if filename_key else pid
+ path = _entry_path(key)
+ if path.exists():
+ path.unlink(missing_ok=True)
+
+
+def _is_pid_alive(pid: int) -> bool:
+ try:
+ os.kill(pid, 0)
+ except ProcessLookupError:
+ return False
+ except PermissionError:
+ return True
+ except ValueError:
+ # Defensive: a pid that os.kill rejects with ValueError counts as not alive (note: on POSIX, 0 and negative pids address process groups and are not rejected)
+ return False
+ else:
+ return True
+
+
+def list_session_processes() -> List[Dict[str, Any]]:
+ """Return sanitized entries for still-running Codex processes."""
+
+ entries: List[Dict[str, Any]] = []
+ if not REGISTRY_DIR.exists():
+ return entries
+
+ for json_file in REGISTRY_DIR.glob("*.json"):
+ try:
+ data = json.loads(json_file.read_text(encoding="utf-8"))
+ except json.JSONDecodeError:
+ json_file.unlink(missing_ok=True)
+ continue
+
+ pid = data.get("pid")
+ if not isinstance(pid, int):
+ json_file.unlink(missing_ok=True)
+ continue
+
+ if not _is_pid_alive(pid):
+ json_file.unlink(missing_ok=True)
+ continue
+
+ entries.append(
+ {
+ "pid": pid,
+ "session_id": data.get("session_id"),
+ "prompt_preview": data.get("prompt_preview"),
+ "workdir": data.get("workdir"),
+ "started_at": data.get("started_at"),
+ "session_kind": data.get("session_kind"),
+ "status": data.get("status", "running"),
+ "parent_id": data.get("parent_id"),
+ "generation": data.get("generation"),
+ "patch_type": data.get("patch_type"),
+ "results_dir": data.get("results_dir"),
+ "can_stop": True,
+ }
+ )
+ return entries
+
+
+def terminate_session_process(pid: int, sig: signal.Signals = signal.SIGTERM) -> None:
+ """Send a termination signal to a tracked Codex process."""
+
+ os.kill(pid, sig)
From bd4674324678cd58abf0c582ecc3c2e7e9ad9a40 Mon Sep 17 00:00:00 2001
From: george
Date: Sun, 14 Dec 2025 12:48:30 +0000
Subject: [PATCH 35/68] feat: Add multi-file diff viewer and agentic node
indicator
---
shinka/core/novelty_judge.py | 27 +++++++-
shinka/core/runner.py | 5 ++
shinka/webui/viz_tree.html | 127 +++++++++++++++++++++++++++++++++--
3 files changed, 152 insertions(+), 7 deletions(-)
diff --git a/shinka/core/novelty_judge.py b/shinka/core/novelty_judge.py
index 9fe0e0d00..540a6978e 100644
--- a/shinka/core/novelty_judge.py
+++ b/shinka/core/novelty_judge.py
@@ -1,15 +1,23 @@
-from typing import Optional, Tuple, List
+from typing import Any, Callable, Dict, Iterator, Optional, Tuple, List
import logging
from pathlib import Path
from shinka.database import Program
from shinka.llm import LLMClient
from shinka.prompts import NOVELTY_SYSTEM_MSG, NOVELTY_USER_MSG
+# Type for agent runner function (used in agentic mode)
+AgentRunner = Callable[..., Iterator[Dict[str, Any]]]
+
logger = logging.getLogger(__name__)
class NoveltyJudge:
- """Handles novelty assessment for generated code using LLM-based comparison."""
+ """Handles novelty assessment for generated code using LLM-based comparison.
+
+ Accepts optional agentic-mode settings so that novelty checks can later be
+ delegated to CLI agents (Codex, ShinkaAgent). The settings are stored for future
+ use (agentic checks are not implemented yet); checks use the legacy LLMClient path.
+ """
def __init__(
self,
@@ -17,11 +25,26 @@ def __init__(
language: str = "python",
similarity_threshold: float = 1.0,
max_novelty_attempts: int = 3,
+ # Agentic mode parameters (optional, graceful fallback to legacy)
+ agentic_mode: bool = False,
+ agent_runner: Optional[AgentRunner] = None,
+ agent_config: Optional[Any] = None,
):
self.novelty_llm_client = novelty_llm_client
self.language = language
self.similarity_threshold = similarity_threshold
self.max_novelty_attempts = max_novelty_attempts
+ # Store agentic config for future use (not implemented in minimal PR)
+ self.agentic_mode = agentic_mode
+ self.agent_runner = agent_runner
+ self.agent_config = agent_config
+
+ # Log if agentic mode requested but no runner provided
+ if agentic_mode and agent_runner is None:
+ logger.warning(
+ "Agentic mode enabled but no agent_runner provided. "
+ "Falling back to legacy LLMClient-based novelty checks."
+ )
def should_check_novelty(
self,
diff --git a/shinka/core/runner.py b/shinka/core/runner.py
index 54e89b62b..0fba9ce28 100644
--- a/shinka/core/runner.py
+++ b/shinka/core/runner.py
@@ -313,11 +313,16 @@ def __init__(
)
# Initialize NoveltyJudge for novelty assessment
+ # Pass agentic config for potential future use, with graceful fallback
self.novelty_judge = NoveltyJudge(
novelty_llm_client=self.novelty_llm,
language=evo_config.language,
similarity_threshold=evo_config.code_embed_sim_threshold,
max_novelty_attempts=evo_config.max_novelty_attempts,
+ # Agentic novelty (falls back to legacy if agent_runner not set)
+ agentic_mode=evo_config.agentic_mode,
+ agent_runner=None, # Not implemented in minimal PR
+ agent_config=evo_config.agentic if evo_config.agentic_mode else None,
)
# Initialize rich console for formatted output
diff --git a/shinka/webui/viz_tree.html b/shinka/webui/viz_tree.html
index 7b104bbd3..a58610421 100644
--- a/shinka/webui/viz_tree.html
+++ b/shinka/webui/viz_tree.html
@@ -3512,7 +3512,8 @@ β Failed to Load Database
'init': d3.symbolDiamond,
'full': d3.symbolCircle,
'diff': d3.symbolSquare,
- 'cross': d3.symbolCross
+ 'cross': d3.symbolCross,
+ 'agentic': d3.symbolTriangle // Triangle for agentic patches
};
const getShape = (patchType) => shapeMap[patchType] || d3.symbolCircle;
const symbol = d3.symbol().size(2500);
@@ -4689,10 +4690,11 @@ Selected Node Details
codeWrapper.innerHTML = "No code available for this node.
";
}
- // Update diff tab
+ // Update diff tab (supports multi-file diffs from agentic backend)
const diffWrapper = document.getElementById("code-diff");
- if (data.code_diff) {
- diffWrapper.innerHTML = `${formatDiff(data.code_diff)}`;
+ const diffFiles = getDiffFilesForNode(data);
+ if (diffFiles.length > 0) {
+ diffWrapper.innerHTML = renderMultiFileDiff(diffFiles);
} else {
diffWrapper.innerHTML = "No code diff available for this node.
";
}
@@ -4808,6 +4810,120 @@ Selected Node Details
}).join('');
}
+ // Get diff statistics (additions and deletions count)
+ function getDiffStats(diffText) {
+ if (!diffText) return { additions: 0, deletions: 0 };
+ const lines = diffText.split('\n');
+ let additions = 0, deletions = 0;
+ lines.forEach(line => {
+ if (line.startsWith('+') && !line.startsWith('+++')) additions++;
+ else if (line.startsWith('-') && !line.startsWith('---')) deletions++;
+ });
+ return { additions, deletions };
+ }
+
+ // Get default primary file path based on language
+ function defaultPrimaryPath(language) {
+ const langPaths = { python: 'main.py', javascript: 'main.js', typescript: 'main.ts', swift: 'main.swift' };
+ return langPaths[language] || 'main.py';
+ }
+
+ // Extract diff files from a node (supports multi-file agentic diffs)
+ function getDiffFilesForNode(node) {
+ // Check for array of diffs (multi-file format)
+ if (node && Array.isArray(node.code_diffs) && node.code_diffs.length > 0) {
+ return node.code_diffs.map(diffEntry => ({
+ path: diffEntry.path || node.metadata?.agent_primary_file || defaultPrimaryPath(node.language),
+ diff: diffEntry.diff || '',
+ }));
+ }
+
+ // Check metadata.agent_code_diffs (dict format from agentic backend)
+ if (node && node.metadata?.agent_code_diffs && typeof node.metadata.agent_code_diffs === 'object') {
+ const diffs = node.metadata.agent_code_diffs;
+ const entries = Object.entries(diffs);
+ if (entries.length > 0) {
+ return entries.map(([path, diff]) => ({ path, diff: diff || '' }));
+ }
+ }
+
+ // Fallback to single code_diff
+ if (node && node.code_diff) {
+ return [{
+ path: node.metadata?.agent_primary_file || defaultPrimaryPath(node.language),
+ diff: node.code_diff,
+ }];
+ }
+
+ return [];
+ }
+
+ // Render multi-file diff viewer
+ function renderMultiFileDiff(diffFiles) {
+ if (!diffFiles || diffFiles.length === 0) {
+ return 'No code diff available for this node.
';
+ }
+
+ // Calculate totals
+ const totals = diffFiles.reduce((acc, file) => {
+ const stats = getDiffStats(file.diff);
+ acc.additions += stats.additions;
+ acc.deletions += stats.deletions;
+ return acc;
+ }, { additions: 0, deletions: 0 });
+
+ const filesLabel = diffFiles.length === 1 ? 'file changed' : 'files changed';
+ const autoExpand = diffFiles.length === 1;
+
+ let html = `
+
+ ${diffFiles.length} ${filesLabel}
+ +${totals.additions}
+ -${totals.deletions}
+
+ `;
+
+ diffFiles.forEach((diffEntry, idx) => {
+ const stats = getDiffStats(diffEntry.diff);
+ const isCollapsed = !autoExpand && idx > 0;
+ const diffContent = diffEntry.diff ? formatDiff(diffEntry.diff) : 'No diff content for this file.
';
+
+ html += `
+
+ `;
+ });
+
+ // Add collapse/expand toggle script
+ html += `
+
+ `;
+
+ return html;
+ }
+
// Get CSS class for score display
function getScoreClass(score) {
if (score === null || score === undefined) {
@@ -7196,7 +7312,8 @@ Selected Node Details
'init': d3.symbolDiamond,
'full': d3.symbolCircle,
'diff': d3.symbolSquare,
- 'cross': d3.symbolCross
+ 'cross': d3.symbolCross,
+ 'agentic': d3.symbolTriangle // Triangle for agentic patches
};
const getShape = (patchType) => shapeMap[patchType] || d3.symbolCircle;
const symbol = d3.symbol().size(1500); // Smaller size for island trees
From 729ac1a13d3d72adeb505ddf3d20409f03dd8466 Mon Sep 17 00:00:00 2001
From: george
Date: Sun, 7 Dec 2025 02:20:21 +0000
Subject: [PATCH 36/68] feat: Add Boids Flocking multi-file example
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Create boid.py with Vector2D and Boid classes
- Create simulation.py with SimulationEnvironment
- Create render.py with terminal and matplotlib renderers
- Create main.py as the entry point
- Create initial.py as suboptimal starting point (score ~48)
- Add task config: configs/task/boids_flocking.yaml
- Add variant config: configs/variant/boids_flocking.yaml
This example demonstrates multi-file editing with evolution.
The initial implementation has deliberately suboptimal weights
to allow room for evolutionary improvement.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5
---
configs/task/boids_flocking.yaml | 44 ++++
configs/variant/boids_flocking.yaml | 17 ++
examples/boids_flocking/boid.py | 169 +++++++++++++
examples/boids_flocking/initial.py | 340 ++++++++++++++++++++++++++
examples/boids_flocking/main.py | 202 +++++++++++++++
examples/boids_flocking/render.py | 138 +++++++++++
examples/boids_flocking/simulation.py | 195 +++++++++++++++
7 files changed, 1105 insertions(+)
create mode 100644 configs/task/boids_flocking.yaml
create mode 100644 configs/variant/boids_flocking.yaml
create mode 100644 examples/boids_flocking/boid.py
create mode 100644 examples/boids_flocking/initial.py
create mode 100644 examples/boids_flocking/main.py
create mode 100644 examples/boids_flocking/render.py
create mode 100644 examples/boids_flocking/simulation.py
diff --git a/configs/task/boids_flocking.yaml b/configs/task/boids_flocking.yaml
new file mode 100644
index 000000000..21ee57752
--- /dev/null
+++ b/configs/task/boids_flocking.yaml
@@ -0,0 +1,44 @@
+# Boids Flocking Task Configuration
+# Task: Evolve flocking behavior to minimize collisions while maintaining tight grouping
+
+task_name: boids_flocking
+
+# Task description for the LLM
+description: |
+ Optimize the Boids flocking simulation. The goal is to evolve the separation,
+ alignment, and cohesion behaviors to:
+ 1. Minimize collisions between boids
+ 2. Maintain tight grouping (cohesion)
+ 3. Achieve good velocity alignment
+
+ The simulation runs for 1000 steps with 50 boids. Improve the scoring function,
+ behavior weights, and physics parameters to achieve a higher combined score.
+
+# File paths (relative to init_support_dir)
+exec_fname: initial.py
+init_support_dir: examples/boids_flocking
+
+# Language
+language: python
+
+# Evaluation command
+eval_command: python3 initial.py --headless --steps 1000
+
+# Output file names
+metrics_fname: metrics.json
+correct_fname: correct.json
+
+# Scoring configuration
+score_key: combined_score
+higher_is_better: true
+
+# Allowed files for editing (multi-file task)
+allowed_files:
+ - initial.py
+ - boid.py
+ - simulation.py
+ - render.py
+ - main.py
+
+# Primary file (main entry point)
+primary_file: initial.py
diff --git a/configs/variant/boids_flocking.yaml b/configs/variant/boids_flocking.yaml
new file mode 100644
index 000000000..5ca2b8768
--- /dev/null
+++ b/configs/variant/boids_flocking.yaml
@@ -0,0 +1,17 @@
+# Variant configuration for Boids Flocking task
+# This defines default overrides for the boids task
+
+defaults:
+ - /task: boids_flocking
+ - /evolution: small_budget
+
+# Task-specific evolution overrides
+evo_config:
+ # Use smaller population for faster iterations
+ n_pop: 8
+
+ # Agentic mode for multi-file editing (disabled by default)
+ agentic_mode: false # Set to true for agentic experiments
+
+ # Multi-file embedding support
+ embedding_use_changed_files_first: true
diff --git a/examples/boids_flocking/boid.py b/examples/boids_flocking/boid.py
new file mode 100644
index 000000000..15b513a6f
--- /dev/null
+++ b/examples/boids_flocking/boid.py
@@ -0,0 +1,169 @@
+"""
+Boid class implementing separation, alignment, and cohesion behaviors.
+"""
+
+import math
+from dataclasses import dataclass, field
+from typing import List, Tuple
+
+
+@dataclass
+class Vector2D:
+ """Simple 2D vector for boid physics."""
+ x: float = 0.0
+ y: float = 0.0
+
+ def __add__(self, other: "Vector2D") -> "Vector2D":
+ return Vector2D(self.x + other.x, self.y + other.y)
+
+ def __sub__(self, other: "Vector2D") -> "Vector2D":
+ return Vector2D(self.x - other.x, self.y - other.y)
+
+ def __mul__(self, scalar: float) -> "Vector2D":
+ return Vector2D(self.x * scalar, self.y * scalar)
+
+ def __truediv__(self, scalar: float) -> "Vector2D":
+ if scalar == 0:
+ return Vector2D(0, 0)
+ return Vector2D(self.x / scalar, self.y / scalar)
+
+ def magnitude(self) -> float:
+ return math.sqrt(self.x * self.x + self.y * self.y)
+
+ def normalize(self) -> "Vector2D":
+ mag = self.magnitude()
+ if mag == 0:
+ return Vector2D(0, 0)
+ return self / mag
+
+ def limit(self, max_val: float) -> "Vector2D":
+ mag = self.magnitude()
+ if mag > max_val:
+ return self.normalize() * max_val
+ return Vector2D(self.x, self.y)
+
+ def distance_to(self, other: "Vector2D") -> float:
+ return (self - other).magnitude()
+
+
+@dataclass
+class Boid:
+ """A single boid in the flock."""
+ position: Vector2D = field(default_factory=lambda: Vector2D(0, 0))
+ velocity: Vector2D = field(default_factory=lambda: Vector2D(0, 0))
+ acceleration: Vector2D = field(default_factory=lambda: Vector2D(0, 0))
+
+ # Behavior weights (SUBOPTIMAL: these could be evolved)
+ separation_weight: float = 1.0
+ alignment_weight: float = 1.0
+ cohesion_weight: float = 1.0
+
+ # Physical parameters
+ max_speed: float = 4.0
+ max_force: float = 0.1
+ perception_radius: float = 50.0
+ separation_radius: float = 25.0
+
+ def apply_force(self, force: Vector2D) -> None:
+ """Apply a steering force to the boid."""
+ self.acceleration = self.acceleration + force
+
+ def update(self) -> None:
+ """Update velocity and position."""
+ self.velocity = self.velocity + self.acceleration
+ self.velocity = self.velocity.limit(self.max_speed)
+ self.position = self.position + self.velocity
+ self.acceleration = Vector2D(0, 0)
+
+ def seek(self, target: Vector2D) -> Vector2D:
+ """Calculate steering force toward a target."""
+ desired = target - self.position
+ desired = desired.normalize() * self.max_speed
+ steer = desired - self.velocity
+ return steer.limit(self.max_force)
+
+ def separation(self, neighbors: List["Boid"]) -> Vector2D:
+ """Steer to avoid crowding local flockmates."""
+ steer = Vector2D(0, 0)
+ count = 0
+
+ for other in neighbors:
+ d = self.position.distance_to(other.position)
+ if 0 < d < self.separation_radius:
+ diff = self.position - other.position
+ diff = diff.normalize()
+ # SUBOPTIMAL: Simple inverse weighting (could use inverse square)
+ diff = diff / d
+ steer = steer + diff
+ count += 1
+
+ if count > 0:
+ steer = steer / count
+ if steer.magnitude() > 0:
+ steer = steer.normalize() * self.max_speed
+ steer = steer - self.velocity
+ steer = steer.limit(self.max_force)
+
+ return steer * self.separation_weight
+
+ def alignment(self, neighbors: List["Boid"]) -> Vector2D:
+ """Steer towards the average heading of local flockmates."""
+ avg_velocity = Vector2D(0, 0)
+ count = 0
+
+ for other in neighbors:
+ d = self.position.distance_to(other.position)
+ if 0 < d < self.perception_radius:
+ avg_velocity = avg_velocity + other.velocity
+ count += 1
+
+ if count > 0:
+ avg_velocity = avg_velocity / count
+ avg_velocity = avg_velocity.normalize() * self.max_speed
+ steer = avg_velocity - self.velocity
+ steer = steer.limit(self.max_force)
+ return steer * self.alignment_weight
+
+ return Vector2D(0, 0)
+
+ def cohesion(self, neighbors: List["Boid"]) -> Vector2D:
+ """Steer to move toward the average position of local flockmates."""
+ center = Vector2D(0, 0)
+ count = 0
+
+ for other in neighbors:
+ d = self.position.distance_to(other.position)
+ if 0 < d < self.perception_radius:
+ center = center + other.position
+ count += 1
+
+ if count > 0:
+ center = center / count
+ return self.seek(center) * self.cohesion_weight
+
+ return Vector2D(0, 0)
+
+ def flock(self, boids: List["Boid"]) -> None:
+ """Apply all three flocking behaviors."""
+ # Filter out self from neighbors
+ neighbors = [b for b in boids if b is not self]
+
+ sep = self.separation(neighbors)
+ ali = self.alignment(neighbors)
+ coh = self.cohesion(neighbors)
+
+ self.apply_force(sep)
+ self.apply_force(ali)
+ self.apply_force(coh)
+
+ def wrap_edges(self, width: float, height: float) -> None:
+ """Wrap boid around screen edges."""
+ if self.position.x > width:
+ self.position.x = 0
+ elif self.position.x < 0:
+ self.position.x = width
+
+ if self.position.y > height:
+ self.position.y = 0
+ elif self.position.y < 0:
+ self.position.y = height
diff --git a/examples/boids_flocking/initial.py b/examples/boids_flocking/initial.py
new file mode 100644
index 000000000..0dc9477f4
--- /dev/null
+++ b/examples/boids_flocking/initial.py
@@ -0,0 +1,340 @@
+#!/usr/bin/env python3
+"""
+Initial (SUBOPTIMAL) implementation of Boids Flocking Simulation.
+
+This file serves as the starting point for evolutionary optimization.
+The implementation is deliberately suboptimal to allow room for improvement.
+
+Known issues to evolve:
+1. Behavior weights are not well-tuned
+2. Simple inverse-distance (1/d) weighting for separation
+3. Basic collision threshold
+4. Naive scoring function
+5. No adaptive parameters
+
+Target fitness: ~40-50 (should evolve to 85+)
+"""
+
+import argparse
+import json
+import math
+import random
+import sys
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import List, Tuple, Dict, Any
+
+
+# ============================================================================
+# Vector2D - Basic 2D vector operations
+# ============================================================================
+
+@dataclass
+class Vector2D:
+ """Minimal 2D vector.
+
+ The arithmetic operators, normalize() and limit() return new Vector2D
+ instances; magnitude() and distance_to() return floats.
+ """
+ x: float = 0.0
+ y: float = 0.0
+
+ def __add__(self, other: "Vector2D") -> "Vector2D":
+ """Return the component-wise sum of self and other."""
+ return Vector2D(self.x + other.x, self.y + other.y)
+
+ def __sub__(self, other: "Vector2D") -> "Vector2D":
+ """Return the component-wise difference self - other."""
+ return Vector2D(self.x - other.x, self.y - other.y)
+
+ def __mul__(self, scalar: float) -> "Vector2D":
+ """Return this vector with both components multiplied by scalar."""
+ return Vector2D(self.x * scalar, self.y * scalar)
+
+ def __truediv__(self, scalar: float) -> "Vector2D":
+ """Return this vector divided by scalar; a zero scalar yields Vector2D(0, 0)."""
+ # Returns a zero vector rather than raising ZeroDivisionError.
+ if scalar == 0:
+ return Vector2D(0, 0)
+ return Vector2D(self.x / scalar, self.y / scalar)
+
+ def magnitude(self) -> float:
+ """Return the Euclidean length sqrt(x*x + y*y)."""
+ return math.sqrt(self.x * self.x + self.y * self.y)
+
+ def normalize(self) -> "Vector2D":
+ """Return this vector divided by its magnitude; a zero vector stays zero."""
+ mag = self.magnitude()
+ if mag == 0:
+ return Vector2D(0, 0)
+ return self / mag
+
+ def limit(self, max_val: float) -> "Vector2D":
+ """Return a copy whose magnitude is capped at max_val."""
+ mag = self.magnitude()
+ if mag > max_val:
+ return self.normalize() * max_val
+ # Already within the cap: return an unchanged copy.
+ return Vector2D(self.x, self.y)
+
+ def distance_to(self, other: "Vector2D") -> float:
+ """Return the magnitude of (self - other)."""
+ return (self - other).magnitude()
+
+
+# ============================================================================
+# Boid - Individual flocking agent
+# ============================================================================
+
+@dataclass
+class Boid:
+ """Single flocking agent with position, velocity and accumulated acceleration.
+
+ flock() accumulates separation, alignment and cohesion forces into
+ ``acceleration``; update() then integrates one step and clears it.
+ """
+ position: Vector2D = field(default_factory=lambda: Vector2D(0, 0))
+ velocity: Vector2D = field(default_factory=lambda: Vector2D(0, 0))
+ acceleration: Vector2D = field(default_factory=lambda: Vector2D(0, 0))
+
+ # SUBOPTIMAL: These weights could be much better tuned
+ separation_weight: float = 1.5 # Too aggressive
+ alignment_weight: float = 1.0 # Could be higher
+ cohesion_weight: float = 1.0 # Could be higher
+
+ # max_speed caps velocity in update(); max_force caps each steering vector.
+ max_speed: float = 4.0
+ max_force: float = 0.1
+ # Neighbor cut-offs: perception_radius for alignment/cohesion, separation_radius for separation.
+ perception_radius: float = 50.0
+ separation_radius: float = 25.0
+
+ def apply_force(self, force: Vector2D) -> None:
+ """Add force to the acceleration accumulated for the current step."""
+ self.acceleration = self.acceleration + force
+
+ def update(self) -> None:
+ """Advance one step: integrate acceleration, cap speed, move, reset acceleration."""
+ self.velocity = self.velocity + self.acceleration
+ self.velocity = self.velocity.limit(self.max_speed)
+ self.position = self.position + self.velocity
+ self.acceleration = Vector2D(0, 0)
+
+ def seek(self, target: Vector2D) -> Vector2D:
+ """Return a steering vector toward target, capped at max_force."""
+ desired = target - self.position
+ desired = desired.normalize() * self.max_speed
+ steer = desired - self.velocity
+ return steer.limit(self.max_force)
+
+ def separation(self, neighbors: List["Boid"]) -> Vector2D:
+ """SUBOPTIMAL: Simple inverse distance weighting.
+
+ Considers neighbors with 0 < d < separation_radius and returns the
+ resulting steering vector multiplied by separation_weight.
+ """
+ steer = Vector2D(0, 0)
+ count = 0
+
+ for other in neighbors:
+ d = self.position.distance_to(other.position)
+ if 0 < d < self.separation_radius:
+ diff = self.position - other.position
+ diff = diff.normalize()
+ # SUBOPTIMAL: Linear inverse (should be inverse square)
+ # Unit vector away from the neighbor divided by d: magnitude 1/d, so closer neighbors count more.
+ diff = diff / d
+ steer = steer + diff
+ count += 1
+
+ if count > 0:
+ steer = steer / count
+ if steer.magnitude() > 0:
+ steer = steer.normalize() * self.max_speed
+ steer = steer - self.velocity
+ steer = steer.limit(self.max_force)
+
+ return steer * self.separation_weight
+
+ def alignment(self, neighbors: List["Boid"]) -> Vector2D:
+ """Steer toward the mean velocity of neighbors with 0 < d < perception_radius.
+
+ Returns the steering vector multiplied by alignment_weight, or
+ Vector2D(0, 0) when no neighbor qualifies.
+ """
+ avg_velocity = Vector2D(0, 0)
+ count = 0
+
+ for other in neighbors:
+ d = self.position.distance_to(other.position)
+ if 0 < d < self.perception_radius:
+ avg_velocity = avg_velocity + other.velocity
+ count += 1
+
+ if count > 0:
+ avg_velocity = avg_velocity / count
+ avg_velocity = avg_velocity.normalize() * self.max_speed
+ steer = avg_velocity - self.velocity
+ steer = steer.limit(self.max_force)
+ return steer * self.alignment_weight
+
+ return Vector2D(0, 0)
+
+ def cohesion(self, neighbors: List["Boid"]) -> Vector2D:
+ """Steer toward the mean position of neighbors with 0 < d < perception_radius.
+
+ Returns seek(center) multiplied by cohesion_weight, or Vector2D(0, 0)
+ when no neighbor qualifies.
+ """
+ center = Vector2D(0, 0)
+ count = 0
+
+ for other in neighbors:
+ d = self.position.distance_to(other.position)
+ if 0 < d < self.perception_radius:
+ center = center + other.position
+ count += 1
+
+ if count > 0:
+ center = center / count
+ return self.seek(center) * self.cohesion_weight
+
+ return Vector2D(0, 0)
+
+ def flock(self, boids: List["Boid"]) -> None:
+ """Accumulate separation, alignment and cohesion forces from all other boids."""
+ # Identity comparison: only this exact object is excluded from its own neighbor list.
+ neighbors = [b for b in boids if b is not self]
+ self.apply_force(self.separation(neighbors))
+ self.apply_force(self.alignment(neighbors))
+ self.apply_force(self.cohesion(neighbors))
+
+ def wrap_edges(self, width: float, height: float) -> None:
+ """Move a boid that left the area to the opposite edge (0 or width/height)."""
+ if self.position.x > width:
+ self.position.x = 0
+ elif self.position.x < 0:
+ self.position.x = width
+ if self.position.y > height:
+ self.position.y = 0
+ elif self.position.y < 0:
+ self.position.y = height
+
+
+# ============================================================================
+# Simulation
+# ============================================================================
+
+class Simulation:
+    """Self-contained boids simulation that tracks collisions and flock metrics.
+
+    Boids are created at construction time with uniformly random positions
+    inside the ``width`` x ``height`` area and random headings with speeds
+    between 2 and 4.
+    """
+
+    def __init__(
+        self,
+        width: float = 800,
+        height: float = 600,
+        num_boids: int = 50
+    ):
+        self.width = width
+        self.height = height
+        self.boids: List[Boid] = []
+        self.collision_count = 0
+        self.step_count = 0
+
+        # Initialize flock
+        for _ in range(num_boids):
+            position = Vector2D(
+                random.uniform(0, width),
+                random.uniform(0, height)
+            )
+            angle = random.uniform(0, 2 * math.pi)
+            speed = random.uniform(2, 4)
+            velocity = Vector2D(
+                math.cos(angle) * speed,
+                math.sin(angle) * speed
+            )
+            self.boids.append(Boid(position=position, velocity=velocity))
+
+    def step(self) -> None:
+        """Advance the simulation by one step and update the collision count."""
+        # Forces for every boid are computed from the pre-update state ...
+        for boid in self.boids:
+            boid.flock(self.boids)
+
+        # ... and only then are all boids moved.
+        for boid in self.boids:
+            boid.update()
+            boid.wrap_edges(self.width, self.height)
+
+        # SUBOPTIMAL: Simple collision counting
+        # Every unordered pair closer than the threshold counts once per step.
+        collision_threshold = 10.0
+        for i, b1 in enumerate(self.boids):
+            for b2 in self.boids[i + 1:]:
+                if b1.position.distance_to(b2.position) < collision_threshold:
+                    self.collision_count += 1
+
+        self.step_count += 1
+
+    def get_metrics(self) -> Dict[str, float]:
+        """Return separation, alignment, cohesion and collision metrics.
+
+        For an empty flock the three flock scores are reported as 0 instead
+        of dividing by a zero boid count.
+        """
+        collision_rate = (
+            self.collision_count / self.step_count if self.step_count > 0 else 0
+        )
+
+        # Guard: the cohesion computation below divides by len(self.boids).
+        if not self.boids:
+            return {
+                "avg_separation": 0,
+                "alignment_score": 0,
+                "cohesion_score": 0,
+                "total_collisions": self.collision_count,
+                "collision_rate": collision_rate
+            }
+
+        # Average separation (distance to the nearest other boid)
+        separations = []
+        for boid in self.boids:
+            min_dist = float("inf")
+            for other in self.boids:
+                if other is not boid:
+                    dist = boid.position.distance_to(other.position)
+                    min_dist = min(min_dist, dist)
+            # A lone boid has no nearest neighbor and contributes nothing.
+            if min_dist != float("inf"):
+                separations.append(min_dist)
+        avg_separation = sum(separations) / len(separations) if separations else 0
+
+        # Alignment score
+        alignment_scores = []
+        for boid in self.boids:
+            neighbors = [
+                b for b in self.boids
+                if b is not boid and boid.position.distance_to(b.position) < 50
+            ]
+            if neighbors:
+                avg_vx = sum(n.velocity.x for n in neighbors) / len(neighbors)
+                avg_vy = sum(n.velocity.y for n in neighbors) / len(neighbors)
+                avg_vel = Vector2D(avg_vx, avg_vy)
+                if boid.velocity.magnitude() > 0 and avg_vel.magnitude() > 0:
+                    # Cosine between own velocity and mean neighbor velocity,
+                    # mapped from [-1, 1] to [0, 1].
+                    dot = boid.velocity.x * avg_vel.x + boid.velocity.y * avg_vel.y
+                    alignment = dot / (boid.velocity.magnitude() * avg_vel.magnitude())
+                    alignment_scores.append((alignment + 1) / 2)
+        alignment_score = sum(alignment_scores) / len(alignment_scores) if alignment_scores else 0.5
+
+        # Cohesion score
+        center_x = sum(b.position.x for b in self.boids) / len(self.boids)
+        center_y = sum(b.position.y for b in self.boids) / len(self.boids)
+        center = Vector2D(center_x, center_y)
+        distances = [b.position.distance_to(center) for b in self.boids]
+        avg_dist = sum(distances) / len(distances)
+        # A quarter of the area diagonal is the distance at which the score reaches 0.
+        max_dist = math.sqrt(self.width**2 + self.height**2) / 4
+        cohesion_score = max(0, 1 - avg_dist / max_dist)
+
+        return {
+            "avg_separation": avg_separation,
+            "alignment_score": alignment_score,
+            "cohesion_score": cohesion_score,
+            "total_collisions": self.collision_count,
+            "collision_rate": collision_rate
+        }
+
+
+def calculate_score(metrics: Dict[str, float]) -> float:
+    """SUBOPTIMAL scoring function.
+
+    Combines four equally weighted terms (separation closeness to 30,
+    alignment, cohesion, and absence of collisions) into a score clamped to
+    the range 0..100.
+    """
+    # Separation term: 1 at an average separation of exactly 30, falling to 0.
+    deviation = abs(metrics["avg_separation"] - 30) / 30
+    separation_term = max(0, 1 - deviation)
+
+    # Collision term: a rate of 0.1 or more per step already gives full penalty.
+    collision_free_term = 1 - min(1, metrics["collision_rate"] * 10)
+
+    combined = (
+        0.25 * separation_term +
+        0.25 * metrics["alignment_score"] +
+        0.25 * metrics["cohesion_score"] +
+        0.25 * collision_free_term
+    )
+
+    percent = combined * 100
+    return max(0, min(100, percent))
+
+
+def main():
+    """Run the simulation from the command line and write the result files.
+
+    Writes ``metrics.json`` and ``correct.json`` into ``--output-dir``
+    (created if it does not exist yet).
+
+    Returns:
+        0 when the final score is at least 40, otherwise 1.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--headless", action="store_true")
+    parser.add_argument("--steps", type=int, default=1000)
+    parser.add_argument("--boids", type=int, default=50)
+    parser.add_argument("--output-dir", type=str, default=".")
+    args = parser.parse_args()
+
+    output_dir = Path(args.output_dir)
+    # Create the target directory up front so a long run cannot end in a
+    # FileNotFoundError when the result files are written.
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    print("=" * 60)
+    print("BOIDS FLOCKING SIMULATION (Initial Version)")
+    print("=" * 60)
+
+    sim = Simulation(num_boids=args.boids)
+
+    for step in range(args.steps):
+        sim.step()
+        # Progress report every 100 steps
+        if (step + 1) % 100 == 0:
+            m = sim.get_metrics()
+            print(f"Step {step + 1}: collisions={m['total_collisions']}, "
+                  f"align={m['alignment_score']:.3f}, coh={m['cohesion_score']:.3f}")
+
+    metrics = sim.get_metrics()
+    score = calculate_score(metrics)
+    correct = score >= 40
+
+    print("\n" + "=" * 60)
+    print("RESULTS")
+    print("=" * 60)
+    print(f"Avg Separation: {metrics['avg_separation']:.2f}")
+    print(f"Alignment: {metrics['alignment_score']:.3f}")
+    print(f"Cohesion: {metrics['cohesion_score']:.3f}")
+    print(f"Collisions: {metrics['total_collisions']}")
+    print(f"Score: {score:.2f}")
+    print(f"Correct: {correct}")
+
+    with open(output_dir / "metrics.json", "w") as f:
+        json.dump(metrics, f, indent=2)
+
+    with open(output_dir / "correct.json", "w") as f:
+        json.dump({"correct": correct}, f)
+
+    # Exit status mirrors the correctness flag.
+    return 0 if correct else 1
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/examples/boids_flocking/main.py b/examples/boids_flocking/main.py
new file mode 100644
index 000000000..dcd7e4db4
--- /dev/null
+++ b/examples/boids_flocking/main.py
@@ -0,0 +1,202 @@
+#!/usr/bin/env python3
+"""
+Boids Flocking Simulation - Main Entry Point
+
+This simulation evolves flocking behavior by optimizing separation, alignment,
+and cohesion weights to minimize collisions while maintaining tight grouping.
+
+Usage:
+ python main.py # Run with visualization
+ python main.py --headless # Run without visualization
+ python main.py --steps 500 # Run for specific number of steps
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+from simulation import SimulationEnvironment, SimulationConfig
+from render import create_renderer
+
+
+def parse_args():
+    """Build the command line parser and return the parsed arguments."""
+    cli = argparse.ArgumentParser(description="Boids Flocking Simulation")
+
+    # (flag, add_argument keyword arguments) in the order they are registered
+    option_specs = (
+        ("--headless", {
+            "action": "store_true",
+            "help": "Run without graphical output",
+        }),
+        ("--steps", {
+            "type": int,
+            "default": 1000,
+            "help": "Number of simulation steps (default: 1000)",
+        }),
+        ("--boids", {
+            "type": int,
+            "default": 50,
+            "help": "Number of boids in the simulation (default: 50)",
+        }),
+        ("--output-dir", {
+            "type": str,
+            "default": ".",
+            "help": "Directory for output files",
+        }),
+    )
+    for flag, spec in option_specs:
+        cli.add_argument(flag, **spec)
+
+    return cli.parse_args()
+
+
+def calculate_combined_score(metrics: dict) -> float:
+    """
+    Fold the simulation metrics into a single fitness value in 0..100.
+
+    SUBOPTIMAL SCORING (room for evolution):
+    - Four terms, each weighted 0.25
+    - No interaction between the terms is modelled
+    - A more sophisticated aggregation is possible
+
+    Missing keys fall back to: avg_separation 0, alignment_score 0.5,
+    cohesion_score 0, collision_rate 1.
+    """
+    separation = metrics.get("avg_separation", 0)
+    alignment = metrics.get("alignment_score", 0.5)
+    cohesion = metrics.get("cohesion_score", 0)
+    rate = metrics.get("collision_rate", 1)
+
+    # SUBOPTIMAL: separation is scored by its distance from the target of 30
+    separation_term = max(0, 1 - abs(separation - 30) / 30)
+
+    # A collision rate of 0.1 per step or more yields the full penalty
+    collision_free_term = 1 - min(1, rate * 10)
+
+    # Higher is better
+    combined = (
+        0.25 * separation_term +
+        0.25 * alignment +
+        0.25 * cohesion +
+        0.25 * collision_free_term
+    )
+
+    percent = combined * 100
+    return max(0, min(100, percent))
+
+
+def evaluate_simulation(args) -> dict:
+    """Run simulation and return evaluation results.
+
+    Args:
+        args: Parsed command line arguments; ``boids``, ``steps`` and
+            ``headless`` are read.
+
+    Returns:
+        Dict with ``metrics`` (final metrics of the simulation),
+        ``combined_score`` and ``correct`` (score of at least 40).
+    """
+    # Create simulation config
+    config = SimulationConfig(
+        num_boids=args.boids,
+        max_steps=args.steps,
+        # SUBOPTIMAL weights (evolution should improve these)
+        separation_weight=1.5,
+        alignment_weight=1.0,
+        cohesion_weight=1.0,
+        max_speed=4.0,
+        max_force=0.1,
+        perception_radius=50.0,
+        separation_radius=25.0
+    )
+
+    # Create and run simulation
+    sim = SimulationEnvironment(config)
+
+    # Create renderer if not headless
+    renderer = None
+    if not args.headless:
+        try:
+            renderer = create_renderer(
+                headless=False,
+                width=config.width,
+                height=config.height
+            )
+        except Exception as e:
+            print(f"Warning: Could not create graphical renderer: {e}")
+            print("Falling back to headless mode.")
+
+    render_failure_reported = False
+    try:
+        # Run simulation
+        for step in range(args.steps):
+            sim.step()
+
+            # Render if available
+            if renderer and hasattr(renderer, "render"):
+                try:
+                    positions = sim.get_boid_positions()
+                    velocities = sim.get_boid_velocities()
+                    renderer.render(positions, velocities, step)
+                except Exception as e:
+                    # Rendering stays best-effort (the simulation continues),
+                    # but the first failure is reported instead of being
+                    # swallowed silently on every step.
+                    if not render_failure_reported:
+                        print(f"Warning: rendering failed at step {step + 1}: {e}")
+                        render_failure_reported = True
+
+            # Progress output every 100 steps
+            if (step + 1) % 100 == 0:
+                metrics = sim.get_final_metrics()
+                print(
+                    f"Step {step + 1}/{args.steps}: "
+                    f"collisions={metrics.get('total_collisions', 0)}, "
+                    f"alignment={metrics.get('alignment_score', 0):.3f}, "
+                    f"cohesion={metrics.get('cohesion_score', 0):.3f}"
+                )
+    finally:
+        # Close renderer even when the simulation loop raises
+        if renderer and hasattr(renderer, "close"):
+            renderer.close()
+
+    # Get final metrics
+    final_metrics = sim.get_final_metrics()
+    combined_score = calculate_combined_score(final_metrics)
+
+    return {
+        "metrics": final_metrics,
+        "combined_score": combined_score,
+        "correct": combined_score >= 40  # SUBOPTIMAL threshold (should be higher)
+    }
+
+
+def main():
+    """Main entry point.
+
+    Parses the command line, runs the evaluation, prints a summary and writes
+    ``metrics.json`` and ``correct.json`` into the output directory (created
+    if it does not exist yet).
+
+    Returns:
+        0 when the run is marked correct, otherwise 1.
+    """
+    args = parse_args()
+    output_dir = Path(args.output_dir)
+    # Create the target directory before the run so that writing the result
+    # files afterwards cannot fail with FileNotFoundError.
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    print("=" * 60)
+    print("BOIDS FLOCKING SIMULATION")
+    print("=" * 60)
+    print(f"Boids: {args.boids}")
+    print(f"Steps: {args.steps}")
+    print(f"Mode: {'Headless' if args.headless else 'Graphical'}")
+    print("=" * 60)
+
+    # Run evaluation
+    result = evaluate_simulation(args)
+
+    # Print results
+    print("\n" + "=" * 60)
+    print("SIMULATION RESULTS")
+    print("=" * 60)
+    metrics = result["metrics"]
+    print(f"Average Separation: {metrics.get('avg_separation', 0):.2f}")
+    print(f"Alignment Score: {metrics.get('alignment_score', 0):.3f}")
+    print(f"Cohesion Score: {metrics.get('cohesion_score', 0):.3f}")
+    print(f"Total Collisions: {metrics.get('total_collisions', 0)}")
+    print(f"Collision Rate: {metrics.get('collision_rate', 0):.4f}")
+    print(f"Combined Score: {result['combined_score']:.2f}")
+    print(f"Correct: {result['correct']}")
+    print("=" * 60)
+
+    # Write output files
+    metrics_file = output_dir / "metrics.json"
+    correct_file = output_dir / "correct.json"
+
+    with open(metrics_file, "w") as f:
+        json.dump(metrics, f, indent=2)
+    print(f"Metrics written to: {metrics_file}")
+
+    with open(correct_file, "w") as f:
+        json.dump({"correct": result["correct"]}, f)
+    print(f"Correctness written to: {correct_file}")
+
+    # Exit status mirrors the correctness flag.
+    return 0 if result["correct"] else 1
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/examples/boids_flocking/render.py b/examples/boids_flocking/render.py
new file mode 100644
index 000000000..e137858c6
--- /dev/null
+++ b/examples/boids_flocking/render.py
@@ -0,0 +1,138 @@
+"""
+Renderer for visualizing the boids simulation.
+Supports both matplotlib (graphical) and terminal (headless) output.
+"""
+
+import math
+from typing import List, Tuple, Optional
+
+
+class TerminalRenderer:
+    """Simple ASCII renderer for headless mode."""
+
+    def __init__(self, width: int = 80, height: int = 24):
+        # Grid dimensions are character cells; coerce to int so that float
+        # sizes forwarded through create_renderer(**kwargs) do not break range().
+        self.width = int(width)
+        self.height = int(height)
+
+    def render(
+        self,
+        positions: List[Tuple[float, float]],
+        sim_width: float,
+        sim_height: float
+    ) -> str:
+        """Render boids to ASCII art.
+
+        Args:
+            positions: (x, y) simulation coordinates of the boids.
+            sim_width: Simulation width that maps onto the last grid column.
+            sim_height: Simulation height that maps onto the last grid row.
+
+        Returns:
+            The framed grid as one string without a trailing newline; every
+            cell holding at least one boid is drawn as "*".
+        """
+        grid = [[" " for _ in range(self.width)] for _ in range(self.height)]
+
+        for x, y in positions:
+            # Map simulation coords to terminal coords
+            tx = int((x / sim_width) * (self.width - 1))
+            ty = int((y / sim_height) * (self.height - 1))
+
+            # Clamp to bounds
+            tx = max(0, min(self.width - 1, tx))
+            ty = max(0, min(self.height - 1, ty))
+
+            grid[ty][tx] = "*"
+
+        # Build output string: top border, framed rows, bottom border
+        border = "+" + "-" * self.width + "+"
+        lines = [border]
+        lines.extend("|" + "".join(row) + "|" for row in grid)
+        lines.append(border)
+
+        return "\n".join(lines)
+
+
+class MatplotlibRenderer:
+    """Matplotlib-based renderer for graphical output.
+
+    matplotlib is imported lazily inside the methods, so constructing the
+    renderer does not require it; initialize() and render() do.
+    """
+
+    def __init__(self, width: float = 800, height: float = 600):
+        self.width = width
+        self.height = height
+        self.fig = None
+        self.ax = None
+        self.scatter = None
+        self.quiver = None
+
+    def initialize(self) -> None:
+        """Initialize matplotlib figure.
+
+        Raises:
+            RuntimeError: If an ImportError occurs while setting up matplotlib.
+        """
+        try:
+            import matplotlib.pyplot as plt
+
+            plt.ion()
+            self.fig, self.ax = plt.subplots(figsize=(10, 8))
+            self.ax.set_xlim(0, self.width)
+            self.ax.set_ylim(0, self.height)
+            self.ax.set_aspect("equal")
+            self.ax.set_facecolor("#1a1a2e")
+            self.fig.patch.set_facecolor("#1a1a2e")
+            self.ax.axis("off")
+
+        except ImportError as err:
+            # Chain the original error so the missing module stays visible.
+            raise RuntimeError("matplotlib not available for graphical rendering") from err
+
+    def render(
+        self,
+        positions: List[Tuple[float, float]],
+        velocities: List[Tuple[float, float]],
+        step: int = 0
+    ) -> None:
+        """Render current frame.
+
+        Args:
+            positions: (x, y) position of every boid.
+            velocities: (vx, vy) velocity of every boid; when empty, no
+                arrows are drawn.
+            step: Step number shown in the title.
+        """
+        import matplotlib.pyplot as plt
+
+        if self.fig is None:
+            self.initialize()
+
+        # The axes are cleared every frame, so limits and styling are re-applied.
+        self.ax.clear()
+        self.ax.set_xlim(0, self.width)
+        self.ax.set_ylim(0, self.height)
+        self.ax.set_facecolor("#1a1a2e")
+        self.ax.axis("off")
+
+        if positions:
+            xs, ys = zip(*positions)
+            vxs, vys = zip(*velocities) if velocities else (None, None)
+
+            # Draw boids as points
+            self.ax.scatter(xs, ys, c="#00d9ff", s=30, alpha=0.8)
+
+            # Draw velocity vectors
+            if vxs and vys:
+                self.ax.quiver(
+                    xs, ys, vxs, vys,
+                    color="#ff6b6b",
+                    alpha=0.5,
+                    scale=50,
+                    width=0.003
+                )
+
+        self.ax.set_title(f"Step: {step}", color="white", fontsize=12)
+        plt.pause(0.001)
+
+    def save_frame(self, filename: str) -> None:
+        """Save current frame to file; does nothing before a figure exists."""
+        if self.fig:
+            self.fig.savefig(filename, dpi=100, facecolor="#1a1a2e")
+
+    def close(self) -> None:
+        """Close the renderer; does nothing before a figure exists."""
+        if self.fig:
+            import matplotlib.pyplot as plt
+            plt.close(self.fig)
+
+
+def create_renderer(headless: bool = False, **kwargs) -> Optional[object]:
+ """Factory function to create appropriate renderer.
+
+ headless=True returns TerminalRenderer(**kwargs). Otherwise a
+ MatplotlibRenderer(**kwargs) is created and initialized; if initialize()
+ raises RuntimeError, a default TerminalRenderer() is returned instead
+ (kwargs are not forwarded to that fallback).
+
+ NOTE(review): TerminalRenderer.render(positions, sim_width, sim_height)
+ and MatplotlibRenderer.render(positions, velocities, step) take different
+ arguments, so callers cannot treat the two return types interchangeably.
+ """
+ if headless:
+ return TerminalRenderer(**kwargs)
+ else:
+ renderer = MatplotlibRenderer(**kwargs)
+ try:
+ renderer.initialize()
+ return renderer
+ except RuntimeError:
+ # Fall back to terminal if matplotlib not available
+ return TerminalRenderer()
diff --git a/examples/boids_flocking/simulation.py b/examples/boids_flocking/simulation.py
new file mode 100644
index 000000000..636fc96b6
--- /dev/null
+++ b/examples/boids_flocking/simulation.py
@@ -0,0 +1,195 @@
+"""
+Simulation environment for managing a flock of boids.
+"""
+
+import random
+import math
+from dataclasses import dataclass, field
+from typing import List, Dict, Any, Tuple
+
+from boid import Boid, Vector2D
+
+
+@dataclass
+class SimulationConfig:
+ """Configuration for the boids simulation."""
+ # World size: used for initial placement, edge wrapping and cohesion normalisation.
+ width: float = 800.0
+ height: float = 600.0
+ num_boids: int = 50 # number of boids SimulationEnvironment creates at construction
+ max_steps: int = 1000 # step count SimulationEnvironment.run() uses when none is passed
+
+ # Boid parameters (SUBOPTIMAL: could be evolved)
+ # Each value below is copied onto every Boid created by SimulationEnvironment.
+ separation_weight: float = 1.5
+ alignment_weight: float = 1.0
+ cohesion_weight: float = 1.0
+ max_speed: float = 4.0 # also the upper bound of the random initial speed
+ max_force: float = 0.1
+ perception_radius: float = 50.0 # also the neighbor radius of the alignment metric
+ separation_radius: float = 25.0
+
+
+class SimulationEnvironment:
+    """Manages a flock of boids and runs the simulation.
+
+    The flock is created at construction time from ``config``. Every call to
+    step() advances all boids, counts collisions and appends one metrics dict
+    to ``metrics_history``.
+    """
+
+    def __init__(self, config: SimulationConfig):
+        self.config = config
+        self.boids: List[Boid] = []
+        self.step_count: int = 0
+        self.collision_count: int = 0
+        self.metrics_history: List[Dict[str, float]] = []
+        self._initialize_flock()
+
+    def _initialize_flock(self) -> None:
+        """Create the initial flock with random positions and velocities."""
+        for _ in range(self.config.num_boids):
+            position = Vector2D(
+                random.uniform(0, self.config.width),
+                random.uniform(0, self.config.height)
+            )
+            angle = random.uniform(0, 2 * math.pi)
+            speed = random.uniform(2, self.config.max_speed)
+            velocity = Vector2D(
+                math.cos(angle) * speed,
+                math.sin(angle) * speed
+            )
+
+            boid = Boid(
+                position=position,
+                velocity=velocity,
+                separation_weight=self.config.separation_weight,
+                alignment_weight=self.config.alignment_weight,
+                cohesion_weight=self.config.cohesion_weight,
+                max_speed=self.config.max_speed,
+                max_force=self.config.max_force,
+                perception_radius=self.config.perception_radius,
+                separation_radius=self.config.separation_radius
+            )
+            self.boids.append(boid)
+
+    def step(self) -> Dict[str, float]:
+        """Run one simulation step and return current metrics."""
+        # Apply flocking behavior to each boid
+        for boid in self.boids:
+            boid.flock(self.boids)
+
+        # Update positions and wrap edges
+        for boid in self.boids:
+            boid.update()
+            boid.wrap_edges(self.config.width, self.config.height)
+
+        # Count collisions (boids too close together)
+        step_collisions = self._count_collisions()
+        self.collision_count += step_collisions
+
+        # Calculate metrics
+        metrics = self._calculate_metrics()
+        metrics["step_collisions"] = step_collisions
+        self.metrics_history.append(metrics)
+
+        self.step_count += 1
+        return metrics
+
+    def _count_collisions(self) -> int:
+        """Count pairs of boids that are too close (collision)."""
+        collision_threshold = 10.0  # Minimum safe distance
+        collisions = 0
+
+        # Each unordered pair is examined exactly once.
+        for i, boid1 in enumerate(self.boids):
+            for boid2 in self.boids[i + 1:]:
+                distance = boid1.position.distance_to(boid2.position)
+                if distance < collision_threshold:
+                    collisions += 1
+
+        return collisions
+
+    def _calculate_metrics(self) -> Dict[str, float]:
+        """Calculate current flock metrics.
+
+        Returns a dict with ``avg_separation``, ``alignment_score``,
+        ``cohesion_score`` and ``avg_distance_to_center``; all four are 0 for
+        an empty flock.
+        """
+        if not self.boids:
+            # Same keys as the regular result so consumers never hit a KeyError.
+            return {
+                "avg_separation": 0,
+                "alignment_score": 0,
+                "cohesion_score": 0,
+                "avg_distance_to_center": 0
+            }
+
+        # Average separation (distance to nearest neighbor)
+        separations = []
+        for boid in self.boids:
+            min_dist = float("inf")
+            for other in self.boids:
+                if other is not boid:
+                    dist = boid.position.distance_to(other.position)
+                    min_dist = min(min_dist, dist)
+            # A lone boid has no nearest neighbor and contributes nothing.
+            if min_dist != float("inf"):
+                separations.append(min_dist)
+
+        avg_separation = sum(separations) / len(separations) if separations else 0
+
+        # Alignment score (how similar are velocity directions)
+        alignment_scores = []
+        for boid in self.boids:
+            neighbors = [
+                b for b in self.boids
+                if b is not boid and boid.position.distance_to(b.position) < boid.perception_radius
+            ]
+            if neighbors:
+                # Calculate average velocity direction
+                avg_vx = sum(n.velocity.x for n in neighbors) / len(neighbors)
+                avg_vy = sum(n.velocity.y for n in neighbors) / len(neighbors)
+                avg_vel = Vector2D(avg_vx, avg_vy)
+
+                if boid.velocity.magnitude() > 0 and avg_vel.magnitude() > 0:
+                    # Dot product normalized (1 = perfect alignment)
+                    dot = (boid.velocity.x * avg_vel.x + boid.velocity.y * avg_vel.y)
+                    alignment = dot / (boid.velocity.magnitude() * avg_vel.magnitude())
+                    alignment_scores.append((alignment + 1) / 2)  # Normalize to 0-1
+
+        alignment_score = sum(alignment_scores) / len(alignment_scores) if alignment_scores else 0.5
+
+        # Cohesion score (how close are boids to the flock center)
+        center_x = sum(b.position.x for b in self.boids) / len(self.boids)
+        center_y = sum(b.position.y for b in self.boids) / len(self.boids)
+        center = Vector2D(center_x, center_y)
+
+        distances_to_center = [b.position.distance_to(center) for b in self.boids]
+        avg_distance = sum(distances_to_center) / len(distances_to_center)
+
+        # Normalize cohesion (lower distance = better cohesion)
+        max_expected_distance = math.sqrt(self.config.width**2 + self.config.height**2) / 4
+        cohesion_score = max(0, 1 - avg_distance / max_expected_distance)
+
+        return {
+            "avg_separation": avg_separation,
+            "alignment_score": alignment_score,
+            "cohesion_score": cohesion_score,
+            "avg_distance_to_center": avg_distance
+        }
+
+    def run(self, steps: int = None) -> Dict[str, Any]:
+        """Run simulation for specified steps and return final metrics.
+
+        Args:
+            steps: Number of steps to execute. ``None`` (the default) means
+                ``config.max_steps``; an explicit ``0`` executes no steps.
+        """
+        # Compare against None instead of truthiness so that run(0) is honoured.
+        if steps is None:
+            steps = self.config.max_steps
+
+        for _ in range(steps):
+            self.step()
+
+        return self.get_final_metrics()
+
+    def get_final_metrics(self) -> Dict[str, Any]:
+        """Get final aggregated metrics; an empty dict before the first step."""
+        if not self.metrics_history:
+            return {}
+
+        # Average over last 100 steps for stability
+        recent = self.metrics_history[-100:] if len(self.metrics_history) >= 100 else self.metrics_history
+
+        return {
+            "avg_separation": sum(m["avg_separation"] for m in recent) / len(recent),
+            "alignment_score": sum(m["alignment_score"] for m in recent) / len(recent),
+            "cohesion_score": sum(m["cohesion_score"] for m in recent) / len(recent),
+            "total_collisions": self.collision_count,
+            "collision_rate": self.collision_count / self.step_count if self.step_count > 0 else 0,
+            "steps_completed": self.step_count
+        }
+
+    def get_boid_positions(self) -> List[Tuple[float, float]]:
+        """Get current positions of all boids for rendering."""
+        return [(b.position.x, b.position.y) for b in self.boids]
+
+    def get_boid_velocities(self) -> List[Tuple[float, float]]:
+        """Get current velocities of all boids for rendering."""
+        return [(b.velocity.x, b.velocity.y) for b in self.boids]
From e7faefebb7c2cc4d3113559d8d8584f80e2e5a4f Mon Sep 17 00:00:00 2001
From: george
Date: Sun, 14 Dec 2025 12:49:16 +0000
Subject: [PATCH 37/68] fix: Remove embedded script tag breaking HTML parser
---
shinka/webui/viz_tree.html | 20 +-------------------
1 file changed, 1 insertion(+), 19 deletions(-)
diff --git a/shinka/webui/viz_tree.html b/shinka/webui/viz_tree.html
index a58610421..eaea6c49f 100644
--- a/shinka/webui/viz_tree.html
+++ b/shinka/webui/viz_tree.html
@@ -4890,7 +4890,7 @@ Selected Node Details
html += `
-