Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
7226d8c
feat: copy benchmarks dir to base worktree when missing in compare
KRRT7 Apr 1, 2026
6c3d450
feat: pytest-benchmark-style auto-calibration for compare benchmarks
KRRT7 Apr 1, 2026
c01f8cc
feat: auto-detect base and head refs in codeflash compare
KRRT7 Apr 1, 2026
a6cabf7
feat: add OPS, Max, IQR, and Outliers columns to compare output
KRRT7 Apr 1, 2026
4ccf21d
feat: add --output flag to codeflash compare for markdown export
KRRT7 Apr 1, 2026
92d4df7
refactor: match pytest-benchmark column layout (Min, Median, Mean, OP…
KRRT7 Apr 1, 2026
3e19151
fix: extract median_ns from BenchmarkStats for optimizer pipeline
KRRT7 Apr 1, 2026
e670bf4
Optimize fmt_delta
codeflash-ai[bot] Apr 1, 2026
7005fa0
Merge pull request #1943 from codeflash-ai/codeflash/optimize-pr1941-…
claude[bot] Apr 1, 2026
74c29b2
fix: update tests for multi-round benchmark plugin
KRRT7 Apr 2, 2026
279a8fc
feat: add --memory flag to codeflash compare for peak memory profiling
KRRT7 Apr 2, 2026
699b70a
feat: support memory-only benchmarks without changed function detection
KRRT7 Apr 2, 2026
ca198ce
fix: update test expectations for multi-round benchmark plugin
KRRT7 Apr 2, 2026
6965e98
feat: add --script mode to codeflash compare
KRRT7 Apr 2, 2026
4e07c98
Optimize validate_and_format_benchmark_table
codeflash-ai[bot] Apr 2, 2026
cfcbae5
style: remove duplicate comments introduced by optimization
github-actions[bot] Apr 2, 2026
ab728f7
Merge pull request #1975 from codeflash-ai/codeflash/optimize-pr1941-…
claude[bot] Apr 3, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,005 changes: 789 additions & 216 deletions codeflash/benchmarking/compare.py

Large diffs are not rendered by default.

345 changes: 232 additions & 113 deletions codeflash/benchmarking/plugin/plugin.py

Large diffs are not rendered by default.

42 changes: 42 additions & 0 deletions codeflash/benchmarking/pytest_new_process_memory_benchmarks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""Subprocess entry point for memory profiling benchmarks via pytest-memray.

Runs pytest with --memray --native to profile peak memory per test function.
The codeflash-benchmark plugin is left active (without --codeflash-trace) so it
provides a no-op ``benchmark`` fixture for tests that depend on it.
"""

import sys
from pathlib import Path

benchmarks_root = sys.argv[1]
memray_bin_dir = sys.argv[2]
memray_bin_prefix = sys.argv[3]

if __name__ == "__main__":
import pytest

Path(memray_bin_dir).mkdir(parents=True, exist_ok=True)

exitcode = pytest.main(
[
benchmarks_root,
"--memray",
"--native",
f"--memray-bin-path={memray_bin_dir}",
f"--memray-bin-prefix={memray_bin_prefix}",
"--hide-memray-summary",
"-p",
"no:benchmark",
"-p",
"no:codspeed",
"-p",
"no:cov",
"-p",
"no:profiling",
"-s",
"-o",
"addopts=",
]
)

sys.exit(exitcode)
36 changes: 36 additions & 0 deletions codeflash/benchmarking/trace_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,39 @@ def trace_benchmarks_pytest(
error_section = combined_output
logger.warning(f"Error collecting benchmarks - Pytest Exit code: {result.returncode}, {error_section}")
logger.debug(f"Full pytest output:\n{combined_output}")


def memory_benchmarks_pytest(
benchmarks_root: Path, project_root: Path, memray_bin_dir: Path, memray_bin_prefix: str, timeout: int = 300
) -> None:
benchmark_env = make_env_with_project_root(project_root)
run_args = get_cross_platform_subprocess_run_args(
cwd=project_root, env=benchmark_env, timeout=timeout, check=False, text=True, capture_output=True
)
result = subprocess.run( # noqa: PLW1510
[
SAFE_SYS_EXECUTABLE,
Path(__file__).parent / "pytest_new_process_memory_benchmarks.py",
benchmarks_root,
memray_bin_dir,
memray_bin_prefix,
],
**run_args,
)
if result.returncode != 0:
combined_output = result.stdout
if result.stderr:
combined_output = combined_output + "\n" + result.stderr if combined_output else result.stderr

if "ERROR collecting" in combined_output:
error_pattern = r"={3,}\s*ERRORS\s*={3,}\n([\s\S]*?)(?:={3,}|$)"
match = re.search(error_pattern, combined_output)
error_section = match.group(1) if match else combined_output
elif "FAILURES" in combined_output:
error_pattern = r"={3,}\s*FAILURES\s*={3,}\n([\s\S]*?)(?:={3,}|$)"
match = re.search(error_pattern, combined_output)
error_section = match.group(1) if match else combined_output
else:
error_section = combined_output
logger.warning(f"Error collecting memory benchmarks - Pytest Exit code: {result.returncode}, {error_section}")
logger.debug(f"Full pytest output:\n{combined_output}")
6 changes: 3 additions & 3 deletions codeflash/benchmarking/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@


def validate_and_format_benchmark_table(
function_benchmark_timings: dict[str, dict[BenchmarkKey, int]], total_benchmark_timings: dict[BenchmarkKey, int]
function_benchmark_timings: dict[str, dict[BenchmarkKey, float]], total_benchmark_timings: dict[BenchmarkKey, float]
) -> dict[str, list[tuple[BenchmarkKey, float, float, float]]]:
function_to_result = {}
# Process each function's benchmark data
Expand Down Expand Up @@ -77,8 +77,8 @@ def print_benchmark_table(function_to_results: dict[str, list[tuple[BenchmarkKey

def process_benchmark_data(
replay_performance_gain: dict[BenchmarkKey, float],
fto_benchmark_timings: dict[BenchmarkKey, int],
total_benchmark_timings: dict[BenchmarkKey, int],
fto_benchmark_timings: dict[BenchmarkKey, float],
total_benchmark_timings: dict[BenchmarkKey, float],
) -> Optional[ProcessedBenchmarkInfo]:
"""Process benchmark data and generate detailed benchmark information.

Expand Down
15 changes: 14 additions & 1 deletion codeflash/cli_cmds/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,13 +382,26 @@ def _build_parser() -> ArgumentParser:
auth_subparsers.add_parser("status", help="Check authentication status")

compare_parser = subparsers.add_parser("compare", help="Compare benchmark performance between two git refs.")
compare_parser.add_argument("base_ref", help="Base git ref (branch, tag, or commit)")
compare_parser.add_argument(
"base_ref", nargs="?", default=None, help="Base git ref (default: auto-detect from PR or default branch)"
)
compare_parser.add_argument("head_ref", nargs="?", default=None, help="Head git ref (default: current branch)")
compare_parser.add_argument("--pr", type=int, help="Resolve head ref from a PR number (requires gh CLI)")
compare_parser.add_argument(
"--functions", type=str, help="Explicit functions to instrument: 'file.py::func1,func2;other.py::func3'"
)
compare_parser.add_argument("--timeout", type=int, default=600, help="Benchmark timeout in seconds (default: 600)")
compare_parser.add_argument("--output", "-o", type=str, help="Write markdown report to file")
compare_parser.add_argument(
"--memory", action="store_true", help="Profile peak memory usage per benchmark (requires memray, Linux/macOS)"
)
compare_parser.add_argument("--script", type=str, help="Shell command to run as benchmark in each worktree")
compare_parser.add_argument(
"--script-output",
type=str,
dest="script_output",
help="Relative path to JSON results file produced by --script (required with --script)",
)
compare_parser.add_argument("--config-file", type=str, dest="config_file", help="Path to pyproject.toml")

trace_optimize = subparsers.add_parser("optimize", help="Trace and optimize your project.")
Expand Down
147 changes: 126 additions & 21 deletions codeflash/cli_cmds/cmd_compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,73 @@
from codeflash.models.function_types import FunctionToOptimize

from codeflash.cli_cmds.console import logger
from codeflash.code_utils.config_parser import parse_config_file


def run_compare(args: Namespace) -> None:
"""Entry point for the compare subcommand."""
# Load project config
pyproject_config, pyproject_file_path = parse_config_file(args.config_file)
# Resolve head_ref: explicit arg > --pr > current branch
head_ref = args.head_ref
if args.pr:
head_ref = resolve_pr_branch(args.pr)
if not head_ref:
head_ref = get_current_branch()
if not head_ref:
logger.error("Must provide head_ref, --pr, or be on a branch")
sys.exit(1)
logger.info(f"Auto-detected head ref: {head_ref}")

# Resolve base_ref: explicit arg > PR base branch > repo default branch
base_ref = args.base_ref
if not base_ref:
base_ref = detect_base_ref(head_ref)
if not base_ref:
logger.error("Could not auto-detect base ref. Provide it explicitly or ensure gh CLI is available.")
sys.exit(1)
logger.info(f"Auto-detected base ref: {base_ref}")

# Script mode: run an arbitrary benchmark command on each worktree (no codeflash config needed)
script_cmd = getattr(args, "script", None)
if script_cmd:
script_output = getattr(args, "script_output", None)
if not script_output:
logger.error("--script-output is required when using --script")
sys.exit(1)

import git

project_root = Path(git.Repo(Path.cwd(), search_parent_directories=True).working_dir)

from codeflash.benchmarking.compare import compare_with_script

result = compare_with_script(
base_ref=base_ref,
head_ref=head_ref,
project_root=project_root,
script_cmd=script_cmd,
script_output=script_output,
timeout=args.timeout,
memory=getattr(args, "memory", False),
)

if not result.base_results and not result.head_results:
logger.warning("No benchmark data collected. Check that --script-output points to a valid JSON file.")
sys.exit(1)

if args.output:
md = result.format_markdown()
Path(args.output).write_text(md, encoding="utf-8")
logger.info(f"Markdown report written to {args.output}")
return

# Standard trace-benchmark mode: requires codeflash config
from codeflash.code_utils.config_parser import parse_config_file

pyproject_config, pyproject_file_path = parse_config_file(args.config_file)
module_root = Path(pyproject_config.get("module_root", ".")).resolve()

from codeflash.cli_cmds.cli import project_root_from_module_root

project_root = project_root_from_module_root(module_root, pyproject_file_path)
tests_root = Path(pyproject_config.get("tests_root", "tests")).resolve()
benchmarks_root_str = pyproject_config.get("benchmarks_root")

Expand All @@ -34,42 +92,89 @@ def run_compare(args: Namespace) -> None:
logger.error(f"benchmarks-root {benchmarks_root} is not a valid directory")
sys.exit(1)

from codeflash.cli_cmds.cli import project_root_from_module_root

project_root = project_root_from_module_root(module_root, pyproject_file_path)

# Resolve head_ref
head_ref = args.head_ref
if args.pr:
head_ref = _resolve_pr_branch(args.pr)
if not head_ref:
logger.error("Must provide head_ref or --pr")
sys.exit(1)

# Parse explicit functions if provided
functions = None
if args.functions:
functions = _parse_functions_arg(args.functions, project_root)
functions = parse_functions_arg(args.functions, project_root)

from codeflash.benchmarking.compare import compare_branches

result = compare_branches(
base_ref=args.base_ref,
base_ref=base_ref,
head_ref=head_ref,
project_root=project_root,
benchmarks_root=benchmarks_root,
tests_root=tests_root,
functions=functions,
timeout=args.timeout,
memory=getattr(args, "memory", False),
)

if not result.base_total_ns and not result.head_total_ns:
if not result.base_stats and not result.head_stats:
logger.warning("No benchmark data collected. Check that benchmarks-root is configured and benchmarks exist.")
sys.exit(1)

if args.output:
md = result.format_markdown()
Path(args.output).write_text(md, encoding="utf-8")
logger.info(f"Markdown report written to {args.output}")


def get_current_branch() -> str | None:
    """Return the currently checked-out branch name.

    Returns None when git is unavailable, the command fails, or HEAD is
    detached (``rev-parse --abbrev-ref`` prints the literal "HEAD").
    """
    try:
        proc = subprocess.run(
            ["git", "rev-parse", "--abbrev-ref", "HEAD"], capture_output=True, text=True, check=True
        )
    except (FileNotFoundError, subprocess.CalledProcessError):
        return None
    name = proc.stdout.strip()
    if not name or name == "HEAD":
        return None
    return name


def detect_base_ref(head_ref: str) -> str | None:
    """Best-effort detection of the base ref to compare *head_ref* against.

    Preference order: the base branch of an open PR for head_ref (gh CLI),
    the repository's default branch (gh CLI), then a locally resolvable
    branch named "main" or "master". Returns None if all attempts fail.
    """

    def query(cmd: list[str]) -> str | None:
        # Run a gh CLI query; swallow missing-binary and non-zero exits.
        try:
            proc = subprocess.run(cmd, capture_output=True, text=True, check=True)
        except (FileNotFoundError, subprocess.CalledProcessError):
            return None
        value = proc.stdout.strip()
        return value or None

    pr_base = query(["gh", "pr", "view", head_ref, "--json", "baseRefName", "-q", ".baseRefName"])
    if pr_base:
        return pr_base

    default_branch = query(["gh", "repo", "view", "--json", "defaultBranchRef", "-q", ".defaultBranchRef.name"])
    if default_branch:
        return default_branch

    # Last resort: probe the local repo for the common default branch names.
    for candidate in ("main", "master"):
        try:
            probe = subprocess.run(
                ["git", "rev-parse", "--verify", candidate], capture_output=True, text=True, check=False
            )
        except FileNotFoundError:
            return None
        if probe.returncode == 0:
            return candidate

    return None


def _resolve_pr_branch(pr_number: int) -> str:
"""Resolve a PR number to its head branch name using gh CLI."""
def resolve_pr_branch(pr_number: int) -> str:
try:
result = subprocess.run(
["gh", "pr", "view", str(pr_number), "--json", "headRefName", "-q", ".headRefName"],
Expand All @@ -91,7 +196,7 @@ def _resolve_pr_branch(pr_number: int) -> str:
sys.exit(1)


def _parse_functions_arg(functions_str: str, project_root: Path) -> dict[Path, list[FunctionToOptimize]]:
def parse_functions_arg(functions_str: str, project_root: Path) -> dict[Path, list[FunctionToOptimize]]:
"""Parse --functions arg format: 'file.py::func1,func2;other.py::func3'."""
from codeflash.models.function_types import FunctionToOptimize

Expand Down
3 changes: 2 additions & 1 deletion codeflash/optimization/optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,8 @@ def run_benchmarks(
function_benchmark_timings = CodeFlashBenchmarkPlugin.get_function_benchmark_timings(
self.trace_file
)
total_benchmark_timings = CodeFlashBenchmarkPlugin.get_benchmark_timings(self.trace_file)
total_benchmark_stats = CodeFlashBenchmarkPlugin.get_benchmark_timings(self.trace_file)
total_benchmark_timings = {k: v.median_ns for k, v in total_benchmark_stats.items()}
function_to_results = validate_and_format_benchmark_table(
function_benchmark_timings, total_benchmark_timings
)
Expand Down
6 changes: 4 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ dependencies = [
"filelock>=3.20.3; python_version >= '3.10'",
"filelock<3.20.3; python_version < '3.10'",
"pytest-asyncio>=0.18.0",
"memray>=1.12; sys_platform != 'win32'",
"pytest-memray>=1.7; sys_platform != 'win32'",
]

[project.urls]
Expand Down Expand Up @@ -339,8 +341,8 @@ vcs = "git"

[tool.hatch.build.hooks.version]
path = "codeflash/version.py"
template = """# These version placeholders will be replaced by uv-dynamic-versioning during build.
__version__ = "{version}"
template = """# These version placeholders will be replaced by uv-dynamic-versioning during build.
__version__ = "{version}"
"""


Expand Down
Loading
Loading