diff --git a/agent/anthropic_adapter.py b/agent/anthropic_adapter.py index d8d181cc107..6b1d7941ee5 100644 --- a/agent/anthropic_adapter.py +++ b/agent/anthropic_adapter.py @@ -1270,6 +1270,47 @@ def convert_messages_to_anthropic( return system, result +def _cap_cache_control_markers(kwargs: dict, max_markers: int = 4) -> None: + """Cap total cache_control markers in the final kwargs at Anthropic's limit. + + The system_and_3 caching strategy assumes 4 distinct messages each get + ONE marker. But when multiple consecutive tool-role messages merge into + a single Anthropic user message (each carrying its own cache_control on + the resulting tool_result block), the total marker count can exceed 4 — + producing HTTP 400 "A maximum of 4 blocks with cache_control may be + provided". + + Scan system + messages in order (oldest first), keep the rightmost + max_markers, strip the rest. Rightmost wins because the end of the + conversation is where cache-prefix boundaries are most valuable. + Tool definitions are untouched (they live in kwargs['tools'] and are + caller-managed). + """ + locations = [] # dicts carrying cache_control, oldest first + + system = kwargs.get("system") + if isinstance(system, list): + for block in system: + if isinstance(block, dict) and block.get("cache_control"): + locations.append(block) + + for msg in kwargs.get("messages", []) or []: + if isinstance(msg, dict): + if isinstance(msg.get("cache_control"), dict): + locations.append(msg) + content = msg.get("content") + if isinstance(content, list): + for block in content: + if isinstance(block, dict) and block.get("cache_control"): + locations.append(block) + + excess = len(locations) - max_markers + if excess <= 0: + return + for container in locations[:excess]: + container.pop("cache_control", None) + + def build_anthropic_kwargs( model: str, messages: List[Dict], @@ -1455,6 +1496,18 @@ def build_anthropic_kwargs( betas.append(_FAST_MODE_BETA) kwargs["extra_headers"] = {"anthropic-beta": ",".join(betas)} + # Safety net: cap cache_control markers at Anthropic's 4-breakpoint limit. + # Prevents HTTP 400 when tool-result merging clusters multiple markers + # onto a single Anthropic message. + # For third-party Anthropic-compatible proxies (LiteLLM-style, e.g. + # llm.echo.tech) strip ALL markers — the proxy auto-injects its own + # cache hints upstream, and anything we send stacks on top and + # deterministically exceeds 4. + if _is_third_party_anthropic_endpoint(base_url): + _cap_cache_control_markers(kwargs, max_markers=0) + else: + _cap_cache_control_markers(kwargs, max_markers=4) + return kwargs diff --git a/agent/prompt_caching.py b/agent/prompt_caching.py index d80f58ea40a..3d968b0c02f 100644 --- a/agent/prompt_caching.py +++ b/agent/prompt_caching.py @@ -38,6 +38,18 @@ def _apply_cache_marker(msg: dict, cache_marker: dict, native_anthropic: bool = last["cache_control"] = cache_marker +def _strip_all_cache_control(msg: dict) -> None: + """Remove any cache_control markers that may have leaked in from previous + turns or upstream history. Prevents accumulation across turns which would + otherwise exceed Anthropic/Bedrock's 4-breakpoint limit.""" + msg.pop("cache_control", None) + content = msg.get("content") + if isinstance(content, list): + for block in content: + if isinstance(block, dict): + block.pop("cache_control", None) + + def apply_anthropic_cache_control( api_messages: List[Dict[str, Any]], cache_ttl: str = "5m", @@ -54,6 +66,12 @@ def apply_anthropic_cache_control( if not messages: return messages + # Scrub any pre-existing cache_control markers so we don't accumulate + # breakpoints across turns (would exceed the 4-breakpoint limit and + # trigger HTTP 400 on Anthropic/Bedrock). + for m in messages: + _strip_all_cache_control(m) + marker = {"type": "ephemeral"} if cache_ttl == "1h": marker["ttl"] = "1h"