diff --git a/codeflash/languages/function_optimizer.py b/codeflash/languages/function_optimizer.py index 09819feb2..a5ffe2ef9 100644 --- a/codeflash/languages/function_optimizer.py +++ b/codeflash/languages/function_optimizer.py @@ -1223,6 +1223,7 @@ def process_single_candidate( eval_ctx.record_successful_candidate(candidate.optimization_id, candidate_result.best_test_runtime, perf_gain) # Check if this is a successful optimization + baseline_cv = original_code_baseline.benchmarking_test_results.timing_coefficient_of_variation() is_successful_opt = speedup_critic( candidate_result, original_code_baseline.runtime, @@ -1231,6 +1232,7 @@ def process_single_candidate( best_throughput_until_now=None, original_concurrency_metrics=original_code_baseline.concurrency_metrics, best_concurrency_ratio_until_now=None, + baseline_timing_cv=baseline_cv, ) and quantity_of_tests_critic(candidate_result) tree = self.build_runtime_info_tree( diff --git a/codeflash/models/models.py b/codeflash/models/models.py index 0296ab24e..71d5d796d 100644 --- a/codeflash/models/models.py +++ b/codeflash/models/models.py @@ -1,5 +1,6 @@ from __future__ import annotations +import math from collections import Counter, defaultdict from functools import lru_cache from typing import TYPE_CHECKING @@ -962,17 +963,75 @@ def usable_runtime_data_by_test_case(self) -> dict[InvocationId, list[int]]: return by_id def total_passed_runtime(self) -> int: - """Calculate the sum of runtimes of all test cases that passed. + """Calculate the sum of median runtimes of all test cases that passed. - A testcase runtime is the minimum value of all looped execution runtimes. + A testcase runtime is the median value of all looped execution runtimes. + Using median instead of min reduces sensitivity to outlier-fast iterations + (lucky GC pauses, perfect cache alignment) that produce spurious speedups. :return: The runtime in nanoseconds. """ + import statistics + # TODO this doesn't look at the intersection of tests of baseline and original return sum( - [min(usable_runtime_data) for _, usable_runtime_data in self.usable_runtime_data_by_test_case().items()] + statistics.median_low(usable_runtime_data) + for _, usable_runtime_data in self.usable_runtime_data_by_test_case().items() ) + def timing_coefficient_of_variation(self) -> float: + """Calculate the median per-test-case coefficient of variation (CV) of loop iterations. + + For each test case, computes CV = stdev / mean of its loop iteration runtimes. + Returns the median CV across test cases. This measures actual measurement noise + (how much repeated runs of the same test vary) rather than inter-test-case + differences (which reflect different inputs, not noise). + + Returns 0.0 if no test case has enough loop iterations to compute CV. + """ + import statistics + + runtime_data = self.usable_runtime_data_by_test_case() + if not runtime_data: + return 0.0 + + per_test_cvs: list[float] = [] + + # Use a single-pass Welford algorithm per list to compute sample standard deviation + # and mean in one traversal to avoid the double-pass of statistics.mean + statistics.stdev. + def compute_sample_cv(values: list[int]) -> float | None: + n = 0 + mean = 0.0 + m2 = 0.0 + for x in values: + n += 1 + x_f = float(x) + delta = x_f - mean + mean += delta / n + delta2 = x_f - mean + m2 += delta * delta2 + if n < 2: + return None + # sample variance = m2 / (n - 1) + if mean == 0.0: + return None + sample_variance = m2 / (n - 1) + # Guard against tiny negative rounding artefacts + if sample_variance <= 0.0: + stdev = 0.0 + else: + stdev = math.sqrt(sample_variance) + return stdev / mean + + for runtimes in runtime_data.values(): + cv = compute_sample_cv(runtimes) + if cv is not None: + per_test_cvs.append(cv) + + if not per_test_cvs: + return 0.0 + return statistics.median(per_test_cvs) + def effective_loop_count(self) -> int: """Calculate the effective number of complete loops. diff --git a/codeflash/result/critic.py b/codeflash/result/critic.py index 600c4a537..4f4c3a43d 100644 --- a/codeflash/result/critic.py +++ b/codeflash/result/critic.py @@ -71,6 +71,7 @@ def speedup_critic( best_throughput_until_now: int | None = None, original_concurrency_metrics: ConcurrencyMetrics | None = None, best_concurrency_ratio_until_now: float | None = None, + baseline_timing_cv: float | None = None, ) -> bool: """Take in a correct optimized Test Result and decide if the optimization should actually be surfaced to the user. @@ -81,6 +82,8 @@ def speedup_critic( - The noise floor is a function of the original code runtime. Currently, the noise floor is 2xMIN_IMPROVEMENT_THRESHOLD when the original runtime is less than 10 microseconds, and becomes MIN_IMPROVEMENT_THRESHOLD for any higher runtime. - The noise floor is doubled when benchmarking on a (noisy) GitHub Action virtual instance. + - If baseline timing CV (coefficient of variation) is available, the noise floor is raised + to at least the observed variance to avoid accepting benchmark noise as real speedup. For async throughput (when available): - Evaluates throughput improvements using MIN_THROUGHPUT_IMPROVEMENT_THRESHOLD @@ -95,31 +98,49 @@ def speedup_critic( if not disable_gh_action_noise and env_utils.is_ci(): noise_floor = noise_floor * 2 # Increase the noise floor in GitHub Actions mode - perf_gain = performance_gain( - original_runtime_ns=original_code_runtime, optimized_runtime_ns=candidate_result.best_test_runtime - ) + # Adapt noise floor to observed baseline variance: if the baseline benchmark has + # high variance, require proportionally larger speedups to avoid accepting noise. + # Cap at 30% to avoid rejecting genuine large speedups on noisy CI runners. + if baseline_timing_cv is not None and baseline_timing_cv > noise_floor: + noise_floor = min(baseline_timing_cv, 0.30) + + # Cache frequently used attributes locally to avoid repeated attribute access + optimized_runtime = candidate_result.best_test_runtime + + perf_gain = performance_gain(original_runtime_ns=original_code_runtime, optimized_runtime_ns=optimized_runtime) runtime_improved = perf_gain > noise_floor # Check runtime comparison with best so far - runtime_is_best = best_runtime_until_now is None or candidate_result.best_test_runtime < best_runtime_until_now + runtime_is_best = best_runtime_until_now is None or optimized_runtime < best_runtime_until_now + + # Fast-path: if there is no async throughput data, runtime is the sole acceptance criteria. + if original_async_throughput is None or candidate_result.async_throughput is None: + return runtime_improved and runtime_is_best + + # At this point async throughput data is available; acceptance is OR of runtime/throughput/concurrency + # Early return if runtime already qualifies to avoid extra work + if runtime_improved and runtime_is_best: + return True + + # Throughput evaluation throughput_improved = True # Default to True if no throughput data throughput_is_best = True # Default to True if no throughput data - - if original_async_throughput is not None and candidate_result.async_throughput is not None: - if original_async_throughput > 0: + original_thr = original_async_throughput + optimized_thr = candidate_result.async_throughput + if original_thr is not None and optimized_thr is not None: + if original_thr > 0: throughput_gain_value = throughput_gain( - original_throughput=original_async_throughput, optimized_throughput=candidate_result.async_throughput + original_throughput=original_thr, optimized_throughput=optimized_thr ) throughput_improved = throughput_gain_value > MIN_THROUGHPUT_IMPROVEMENT_THRESHOLD - throughput_is_best = ( - best_throughput_until_now is None or candidate_result.async_throughput > best_throughput_until_now - ) + throughput_is_best = best_throughput_until_now is None or optimized_thr > best_throughput_until_now + + if throughput_improved and throughput_is_best: + return True # Concurrency evaluation - concurrency_improved = False - concurrency_is_best = True if original_concurrency_metrics is not None and candidate_result.concurrency_metrics is not None: conc_gain = concurrency_gain(original_concurrency_metrics, candidate_result.concurrency_metrics) concurrency_improved = conc_gain > MIN_CONCURRENCY_IMPROVEMENT_THRESHOLD @@ -127,14 +148,10 @@ def speedup_critic( best_concurrency_ratio_until_now is None or candidate_result.concurrency_metrics.concurrency_ratio > best_concurrency_ratio_until_now ) + if concurrency_improved and concurrency_is_best: + return True - # Accept if ANY of: runtime, throughput, or concurrency improves significantly - if original_async_throughput is not None and candidate_result.async_throughput is not None: - throughput_acceptance = throughput_improved and throughput_is_best - runtime_acceptance = runtime_improved and runtime_is_best - concurrency_acceptance = concurrency_improved and concurrency_is_best - return throughput_acceptance or runtime_acceptance or concurrency_acceptance - return runtime_improved and runtime_is_best + return False def get_acceptance_reason( diff --git a/tests/test_critic.py b/tests/test_critic.py index b6a871d47..c377216d4 100644 --- a/tests/test_critic.py +++ b/tests/test_critic.py @@ -799,3 +799,185 @@ def test_parse_concurrency_metrics() -> None: metrics_no_class = parse_concurrency_metrics(test_results_no_class, "my_function") assert metrics_no_class is not None assert metrics_no_class.concurrency_ratio == 2.0 # 5000000 / 2500000 + + +def test_speedup_critic_with_baseline_cv() -> None: + """Test that high baseline variance raises the noise floor to reject noisy speedups.""" + original_code_runtime = 100000 # 100µs — base noise floor is 5% + best_runtime_until_now = 100000 + + # 8% claimed speedup — would pass the default 5% noise floor + candidate_result = OptimizedCandidateResult( + max_loop_count=5, + best_test_runtime=92500, # ~8% faster + behavior_test_results=TestResults(), + benchmarking_test_results=TestResults(), + optimization_candidate_index=0, + total_candidate_timing=12, + ) + + # Without CV: 8% > 5% noise floor → accepted + assert speedup_critic( + candidate_result, original_code_runtime, best_runtime_until_now, disable_gh_action_noise=True + ) + + # With 15% CV: noise floor raised to 15% → 8% improvement rejected + assert not speedup_critic( + candidate_result, + original_code_runtime, + best_runtime_until_now, + disable_gh_action_noise=True, + baseline_timing_cv=0.15, + ) + + # With 3% CV (lower than default noise floor): noise floor stays at 5% → still accepted + assert speedup_critic( + candidate_result, + original_code_runtime, + best_runtime_until_now, + disable_gh_action_noise=True, + baseline_timing_cv=0.03, + ) + + # 25% speedup passes even with 15% CV + fast_candidate = OptimizedCandidateResult( + max_loop_count=5, + best_test_runtime=80000, # 25% faster + behavior_test_results=TestResults(), + benchmarking_test_results=TestResults(), + optimization_candidate_index=0, + total_candidate_timing=12, + ) + assert speedup_critic( + fast_candidate, + original_code_runtime, + best_runtime_until_now, + disable_gh_action_noise=True, + baseline_timing_cv=0.15, + ) + + # CV cap: even with 50% CV, noise floor is capped at 30% — a 40% speedup should pass + big_speedup_candidate = OptimizedCandidateResult( + max_loop_count=5, + best_test_runtime=60000, # 40% faster + behavior_test_results=TestResults(), + benchmarking_test_results=TestResults(), + optimization_candidate_index=0, + total_candidate_timing=12, + ) + assert speedup_critic( + big_speedup_candidate, + original_code_runtime, + best_runtime_until_now, + disable_gh_action_noise=True, + baseline_timing_cv=0.50, + ) + + +def _make_test_invocation(runtime: int, loop_index: int = 0, iteration_id: str = "iter1") -> FunctionTestInvocation: + """Helper to create a FunctionTestInvocation with minimal required fields.""" + return FunctionTestInvocation( + id=InvocationId( + test_module_path="test_mod", + test_class_name="TestClass", + test_function_name="test_func", + function_getting_tested="target", + iteration_id=iteration_id, + ), + file_name=Path("test.py"), + did_pass=True, + runtime=runtime, + test_framework="pytest", + loop_index=loop_index, + test_type=TestType.GENERATED_REGRESSION, + return_value=None, + timed_out=False, + ) + + +def test_total_passed_runtime_uses_median() -> None: + """Verify total_passed_runtime uses median (not min) for timing aggregation.""" + test_results = TestResults( + test_results=[ + _make_test_invocation(100, loop_index=0), + _make_test_invocation(200, loop_index=1), + _make_test_invocation(300, loop_index=2), + ] + ) + + # With min: would be 100. With median_low (3 values): should be 200. + total = test_results.total_passed_runtime() + assert total == 200, f"Expected median_low(100, 200, 300) = 200, got {total}" + + +def test_total_passed_runtime_median_even_count() -> None: + """Verify median_low picks the lower middle value for even-count data.""" + test_results = TestResults( + test_results=[ + _make_test_invocation(100, loop_index=0), + _make_test_invocation(200, loop_index=1), + _make_test_invocation(300, loop_index=2), + _make_test_invocation(400, loop_index=3), + ] + ) + + # median_low of [100, 200, 300, 400] = 200 (lower of two middle values) + total = test_results.total_passed_runtime() + assert total == 200, f"Expected median_low(100, 200, 300, 400) = 200, got {total}" + + +def test_timing_coefficient_of_variation() -> None: + """Verify CV is computed per-test-case, not across all raw iterations.""" + import statistics + + # Zero variance within a single test case + test_results = TestResults( + test_results=[_make_test_invocation(1000, loop_index=i) for i in range(4)] + ) + assert test_results.timing_coefficient_of_variation() == 0.0 + + # Single test case with variance — CV is computed from its loop iterations + runtimes_var = [100, 200, 100, 200] + test_results_var = TestResults( + test_results=[_make_test_invocation(rt, loop_index=i) for i, rt in enumerate(runtimes_var)] + ) + cv = test_results_var.timing_coefficient_of_variation() + expected_cv = statistics.stdev(runtimes_var) / statistics.mean(runtimes_var) + assert abs(cv - expected_cv) < 0.001, f"Expected CV {expected_cv:.4f}, got {cv:.4f}" + + # Empty results + assert TestResults().timing_coefficient_of_variation() == 0.0 + + +def test_timing_cv_multi_test_case() -> None: + """CV should reflect per-test-case noise, not inter-test-case runtime differences.""" + import statistics + + # Two test cases with very different runtimes but low internal variance. + # Old (broken) approach: flatten [100, 100, 10000, 10000] → CV ≈ 1.30 (130%) + # New approach: test_a CV=0.0, test_b CV=0.0 → median=0.0 + test_results = TestResults( + test_results=[ + _make_test_invocation(100, loop_index=0, iteration_id="test_a"), + _make_test_invocation(100, loop_index=1, iteration_id="test_a"), + _make_test_invocation(10000, loop_index=0, iteration_id="test_b"), + _make_test_invocation(10000, loop_index=1, iteration_id="test_b"), + ] + ) + # Both test cases have zero internal variance — the 100x difference in runtimes is not noise + assert test_results.timing_coefficient_of_variation() == 0.0 + + # Two test cases, one noisy and one stable — median picks the middle ground + test_results_mixed = TestResults( + test_results=[ + _make_test_invocation(1000, loop_index=0, iteration_id="stable"), + _make_test_invocation(1000, loop_index=1, iteration_id="stable"), + _make_test_invocation(1000, loop_index=0, iteration_id="noisy"), + _make_test_invocation(2000, loop_index=1, iteration_id="noisy"), + ] + ) + cv = test_results_mixed.timing_coefficient_of_variation() + # stable: CV=0.0, noisy: CV=stdev([1000,2000])/mean([1000,2000]) + noisy_cv = statistics.stdev([1000, 2000]) / statistics.mean([1000, 2000]) + expected_median = statistics.median([0.0, noisy_cv]) + assert abs(cv - expected_median) < 0.001 diff --git a/tests/test_languages/test_java/test_run_and_parse.py b/tests/test_languages/test_java/test_run_and_parse.py index 7d093dbb3..c3fc11d12 100644 --- a/tests/test_languages/test_java/test_run_and_parse.py +++ b/tests/test_languages/test_java/test_run_and_parse.py @@ -527,7 +527,7 @@ def test_performance_inner_loop_count_and_timing(self, java_project): f"Mean runtime {mean_runtime / 1_000_000:.3f}ms not close to expected 10.0ms" ) - # Verify total_passed_runtime sums minimum runtime per test case + # Verify total_passed_runtime sums median runtime per test case # iteration_id is now constant (wrapper ID) across inner iterations, # so all 4 runtimes (2 outer × 2 inner) group under 1 InvocationId key total_runtime = test_results.total_passed_runtime() @@ -542,12 +542,12 @@ def test_performance_inner_loop_count_and_timing(self, java_project): f"Expected 4 runtimes (2 outer × 2 inner) for {test_id.iteration_id}, got {len(test_runtimes)}" ) - # Total should be min of all runtimes ≈ 10ms - # Minimums filter out JIT warmup, so use tighter ±3% tolerance + # Total should be median of all runtimes ≈ 10ms + # Median is more representative than min; use ±5% tolerance for JVM variance expected_total_ns = expected_ns - assert expected_total_ns * 0.97 <= total_runtime <= expected_total_ns * 1.03, ( + assert expected_total_ns * 0.95 <= total_runtime <= expected_total_ns * 1.05, ( f"total_passed_runtime {total_runtime / 1_000_000:.3f}ms not close to expected " - f"{expected_total_ns / 1_000_000:.1f}ms (min of 4 runtimes × 10ms each, ±3%)" + f"{expected_total_ns / 1_000_000:.1f}ms (median of 4 runtimes × 10ms each, ±5%)" ) def test_performance_multiple_test_methods_inner_loop(self, java_project): @@ -612,7 +612,7 @@ def test_performance_multiple_test_methods_inner_loop(self, java_project): f"Mean runtime {mean_runtime / 1_000_000:.3f}ms not close to expected 10.0ms" ) - # Verify total_passed_runtime sums minimum runtime per test case + # Verify total_passed_runtime sums median runtime per test case # iteration_id is now constant (wrapper ID) per call site, so: # 2 test methods = 2 InvocationId keys, each with 4 runtimes (2 outer × 2 inner) total_runtime = test_results.total_passed_runtime() @@ -630,10 +630,10 @@ def test_performance_multiple_test_methods_inner_loop(self, java_project): f"got {len(test_runtimes)}" ) - # Total should be sum of 2 minimums ≈ 20ms - # Minimums filter out JIT warmup, so use tighter ±3% tolerance + # Total should be sum of 2 medians ≈ 20ms + # Median is more representative than min; use ±5% tolerance for JVM variance expected_total_ns = 2 * expected_ns # 2 test cases × 10ms each - assert expected_total_ns * 0.97 <= total_runtime <= expected_total_ns * 1.03, ( + assert expected_total_ns * 0.95 <= total_runtime <= expected_total_ns * 1.05, ( f"total_passed_runtime {total_runtime / 1_000_000:.3f}ms not close to expected " - f"{expected_total_ns / 1_000_000:.1f}ms (2 methods × min of 4 runtimes × 10ms, ±3%)" + f"{expected_total_ns / 1_000_000:.1f}ms (2 methods × median of 4 runtimes × 10ms, ±5%)" )