Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions agent/anthropic_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -1270,6 +1270,47 @@ def convert_messages_to_anthropic(
return system, result


def _cap_cache_control_markers(kwargs: dict, max_markers: int = 4) -> None:
"""Cap total cache_control markers in the final kwargs at Anthropic's limit.

The system_and_3 caching strategy assumes 4 distinct messages each get
ONE marker. But when multiple consecutive tool-role messages merge into
a single Anthropic user message (each carrying its own cache_control on
the resulting tool_result block), the total marker count can exceed 4 —
producing HTTP 400 "A maximum of 4 blocks with cache_control may be
provided".

Scan system + messages in order (oldest first), keep the rightmost
max_markers, strip the rest. Rightmost wins because the end of the
conversation is where cache-prefix boundaries are most valuable.
Tool definitions are untouched (they live in kwargs['tools'] and are
caller-managed).
"""
locations = [] # dicts carrying cache_control, oldest first

system = kwargs.get("system")
if isinstance(system, list):
for block in system:
if isinstance(block, dict) and block.get("cache_control"):
locations.append(block)

for msg in kwargs.get("messages", []) or []:
if isinstance(msg, dict):
if isinstance(msg.get("cache_control"), dict):
locations.append(msg)
content = msg.get("content")
if isinstance(content, list):
for block in content:
if isinstance(block, dict) and block.get("cache_control"):
locations.append(block)

excess = len(locations) - max_markers
if excess <= 0:
return
for container in locations[:excess]:
container.pop("cache_control", None)


def build_anthropic_kwargs(
model: str,
messages: List[Dict],
Expand Down Expand Up @@ -1455,6 +1496,18 @@ def build_anthropic_kwargs(
betas.append(_FAST_MODE_BETA)
kwargs["extra_headers"] = {"anthropic-beta": ",".join(betas)}

# Safety net: cap cache_control markers at Anthropic's 4-breakpoint limit.
# Prevents HTTP 400 when tool-result merging clusters multiple markers
# onto a single Anthropic message.
# For third-party Anthropic-compatible proxies (LiteLLM-style, e.g.
# llm.echo.tech) strip ALL markers — the proxy auto-injects its own
# cache hints upstream, and anything we send stacks on top and
# deterministically exceeds 4.
if _is_third_party_anthropic_endpoint(base_url):
_cap_cache_control_markers(kwargs, max_markers=0)
else:
_cap_cache_control_markers(kwargs, max_markers=4)

return kwargs


Expand Down
18 changes: 18 additions & 0 deletions agent/prompt_caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,18 @@ def _apply_cache_marker(msg: dict, cache_marker: dict, native_anthropic: bool =
last["cache_control"] = cache_marker


def _strip_all_cache_control(msg: dict) -> None:
"""Remove any cache_control markers that may have leaked in from previous
turns or upstream history. Prevents accumulation across turns which would
otherwise exceed Anthropic/Bedrock's 4-breakpoint limit."""
msg.pop("cache_control", None)
content = msg.get("content")
if isinstance(content, list):
for block in content:
if isinstance(block, dict):
block.pop("cache_control", None)


def apply_anthropic_cache_control(
api_messages: List[Dict[str, Any]],
cache_ttl: str = "5m",
Expand All @@ -54,6 +66,12 @@ def apply_anthropic_cache_control(
if not messages:
return messages

# Scrub any pre-existing cache_control markers so we don't accumulate
# breakpoints across turns (would exceed the 4-breakpoint limit and
# trigger HTTP 400 on Anthropic/Bedrock).
for m in messages:
_strip_all_cache_control(m)

marker = {"type": "ephemeral"}
if cache_ttl == "1h":
marker["ttl"] = "1h"
Expand Down