Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 23 additions & 6 deletions agent/model_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1122,16 +1122,33 @@ def get_model_context_length(
if ctx:
return ctx

# 6. OpenRouter live API metadata (provider-unaware fallback)
# 6. Provider-aware hardcoded defaults — checked BEFORE OpenRouter
# because the generic aggregator may have stale/incomplete data
# (e.g. OpenRouter reports 128K for MiniMax-M2.7 when it's 204.8K).
# Provider-specific knowledge beats the generic fallback.
model_lower = model.lower()
if effective_provider:
prov_lower = effective_provider.lower()
# Check only provider name against DEFAULT_CONTEXT_LENGTHS keys.
# Broad family keys (e.g. "grok", "claude") should NOT match via
# model name here — that causes OpenRouter models to get stale
# hardcoded values instead of OpenRouter's live metadata (step 7).
# Model-name matching happens at step 8 (no-provider fallback).
for default_model, length in sorted(
DEFAULT_CONTEXT_LENGTHS.items(), key=lambda x: len(x[0]), reverse=True
):
if default_model in prov_lower:
return length

# 7. OpenRouter live API metadata (provider-unaware fallback)
metadata = fetch_model_metadata()
if model in metadata:
return metadata[model].get("context_length", 128000)

# 8. Hardcoded defaults (fuzzy match — longest key first for specificity)
# Only check `default_model in model` (is the key a substring of the input).
# The reverse (`model in default_model`) causes shorter names like
# "claude-sonnet-4" to incorrectly match "claude-sonnet-4-6" and return 1M.
model_lower = model.lower()
# 8. Hardcoded defaults — model name fallback (no provider known)
# Only check `default_model in model` (is the key a substring of the input).
# The reverse (`model in default_model`) causes shorter names like
# "claude-sonnet-4" to incorrectly match "claude-sonnet-4-6" and return 1M.
for default_model, length in sorted(
DEFAULT_CONTEXT_LENGTHS.items(), key=lambda x: len(x[0]), reverse=True
):
Expand Down
81 changes: 52 additions & 29 deletions run_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -2196,12 +2196,16 @@ def _check_compression_model_feasibility(self) -> None:

threshold = self.context_compressor.threshold_tokens
if aux_context < threshold:
# Cap at 85% of aux model context to leave room for
# system prompt, compression instructions, and summary
# output inside the auxiliary model's window.
usable_aux = int(aux_context * 0.85)
# Auto-correct: lower the live session threshold so
# compression actually works this session. The hard floor
# above guarantees aux_context >= MINIMUM_CONTEXT_LENGTH, but
# with the 85% cap the new threshold itself can fall below 64K.
old_threshold = threshold
new_threshold = aux_context
new_threshold = min(usable_aux, old_threshold)
self.context_compressor.threshold_tokens = new_threshold
# Keep threshold_percent in sync so future main-model
# context_length changes (update_model) re-derive from a
Expand All @@ -2211,34 +2215,53 @@ def _check_compression_model_feasibility(self) -> None:
self.context_compressor.threshold_percent = (
new_threshold / main_ctx
)
safe_pct = int((aux_context / main_ctx) * 100) if main_ctx else 50
msg = (
f"⚠ Compression model ({aux_model}) context is "
f"{aux_context:,} tokens, but the main model's "
f"compression threshold was {old_threshold:,} tokens. "
f"Auto-lowered this session's threshold to "
f"{new_threshold:,} tokens so compression can run.\n"
f" To make this permanent, edit config.yaml — either:\n"
f" 1. Use a larger compression model:\n"
f" auxiliary:\n"
f" compression:\n"
f" model: <model-with-{old_threshold:,}+-context>\n"
f" 2. Lower the compression threshold:\n"
f" compression:\n"
f" threshold: 0.{safe_pct:02d}"
)
self._compression_warning = msg
self._emit_status(msg)
logger.warning(
"Auxiliary compression model %s has %d token context, "
"below the main model's compression threshold of %d "
"tokens — auto-lowered session threshold to %d to "
"keep compression working.",
aux_model,
aux_context,
old_threshold,
new_threshold,
)
safe_pct = int((new_threshold / main_ctx) * 100) if main_ctx else 50
# Two severity levels:
# 1) Mismatch <= 2x: compression works fine, just a note
# 2) Mismatch > 2x: warn user to fix config
mismatch_ratio = old_threshold / new_threshold if new_threshold else 999
if mismatch_ratio <= 2.0:
# Minor mismatch — compression works, just log it.
# No user-facing warning; the auto-lower is sufficient.
logger.info(
"Compression threshold auto-capped: %d -> %d tokens "
"(aux model %s has %d ctx, %.0f%% usable). "
"Set 'compression.threshold: 0.%02d' in config.yaml "
"to suppress this adjustment.",
old_threshold,
new_threshold,
aux_model,
aux_context,
85,
safe_pct,
)
else:
# Significant mismatch — warn with actionable fix.
msg = (
f"⚠ Compression model ({aux_model}) has {aux_context:,} "
f"token context but threshold was {old_threshold:,} tokens. "
f"Auto-capped at {new_threshold:,} tokens.\n"
f" Fix permanently — either:\n"
f" 1. Lower threshold in config.yaml:\n"
f" compression:\n"
f" threshold: 0.{safe_pct:02d}\n"
f" 2. Use a larger compression model:\n"
f" auxiliary:\n"
f" compression:\n"
f" model: <model-with-{old_threshold:,}+-context>"
)
self._compression_warning = msg
self._emit_status(msg)
logger.warning(
"Auxiliary compression model %s has %d token context, "
"below the main model's compression threshold of %d "
"tokens — auto-capped session threshold to %d to "
"keep compression working.",
aux_model,
aux_context,
old_threshold,
new_threshold,
)
except ValueError:
# Hard rejections (aux below minimum context) must propagate
# so the session refuses to start.
Expand Down
78 changes: 52 additions & 26 deletions tests/run_agent/test_compression_feasibility.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,9 @@ def _make_agent(
@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_auto_corrects_threshold_when_aux_context_below_threshold(mock_get_client, mock_ctx_len):
"""Auto-correction: aux >= 64K floor but < threshold → lower threshold
to aux_context so compression still works this session."""
"""Auto-correction: aux >= 64K floor but < threshold → cap threshold
at 85% of aux_context so compression works with safety margin.
With mismatch ratio 100K/68K=1.47 ≤ 2.0, this is a silent adjustment."""
agent = _make_agent(main_context=200_000, threshold_percent=0.50)
# threshold = 100,000 — aux has 80,000 (above 64K floor, below threshold)
mock_client = MagicMock()
Expand All @@ -70,20 +71,11 @@ def test_auto_corrects_threshold_when_aux_context_below_threshold(mock_get_clien

agent._check_compression_model_feasibility()

assert len(messages) == 1
assert "Compression model" in messages[0]
assert "80,000" in messages[0] # aux context
assert "100,000" in messages[0] # old threshold
assert "Auto-lowered" in messages[0]
# Actionable persistence guidance included
assert "config.yaml" in messages[0]
assert "auxiliary:" in messages[0]
assert "compression:" in messages[0]
assert "threshold:" in messages[0]
# Warning stored for gateway replay
assert agent._compression_warning is not None
# Threshold on the live compressor was actually lowered
assert agent.context_compressor.threshold_tokens == 80_000
# Mismatch ratio 100K/68K ≈ 1.47 ≤ 2.0 → silent adjustment, no user message
assert len(messages) == 0
assert agent._compression_warning is None
# Threshold capped at 85% of aux context (80,000 * 0.85 = 68,000)
assert agent.context_compressor.threshold_tokens == 68_000


@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
Expand Down Expand Up @@ -324,7 +316,7 @@ def test_exact_threshold_boundary_no_warning(mock_get_client, mock_ctx_len):
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_just_below_threshold_auto_corrects(mock_get_client, mock_ctx_len):
"""Auto-correct fires when aux context is one token below the threshold
(and above the 64K hard floor)."""
(and above the 64K hard floor). Mismatch ratio 100K/85K=1.18 ≤ 2.0 → silent."""
agent = _make_agent(main_context=200_000, threshold_percent=0.50)
mock_client = MagicMock()
mock_client.base_url = "https://openrouter.ai/api/v1"
Expand All @@ -336,10 +328,37 @@ def test_just_below_threshold_auto_corrects(mock_get_client, mock_ctx_len):

agent._check_compression_model_feasibility()

# Silent: mismatch ratio is 100K/85K ≈ 1.18 ≤ 2.0
assert len(messages) == 0
assert agent._compression_warning is None
# 85% of 99,999 = 84,999
assert agent.context_compressor.threshold_tokens == 84_999


@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_significant_mismatch_shows_warning(mock_get_client, mock_ctx_len):
"""When mismatch ratio > 2.0, a user-facing warning is shown.
200K ctx * 0.95 threshold = 190K → 85% of 80K = 68K → ratio 2.79 > 2.0."""
agent = _make_agent(main_context=200_000, threshold_percent=0.95)
mock_client = MagicMock()
mock_client.base_url = "https://openrouter.ai/api/v1"
mock_client.api_key = "sk-aux"
mock_get_client.return_value = (mock_client, "tiny-compression-model")

messages = []
agent._emit_status = lambda msg: messages.append(msg)

agent._check_compression_model_feasibility()

assert len(messages) == 1
assert "small-model" in messages[0]
assert "Auto-lowered" in messages[0]
assert agent.context_compressor.threshold_tokens == 99_999
assert "tiny-compression-model" in messages[0]
assert "Auto-capped" in messages[0]
assert "190,000" in messages[0] # old threshold
assert "68,000" in messages[0] # new threshold (85% of 80K)
assert "config.yaml" in messages[0]
assert agent._compression_warning is not None
assert agent.context_compressor.threshold_tokens == 68_000


# ── Two-phase: __init__ + run_conversation replay ───────────────────
Expand All @@ -348,8 +367,9 @@ def test_just_below_threshold_auto_corrects(mock_get_client, mock_ctx_len):
@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_warning_stored_for_gateway_replay(mock_get_client, mock_ctx_len):
"""__init__ stores the warning; _replay sends it through status_callback."""
agent = _make_agent(main_context=200_000, threshold_percent=0.50)
"""__init__ stores the warning; _replay sends it through status_callback.
Uses threshold_percent=0.95 so mismatch ratio 190K/68K=2.79 > 2.0 → warns."""
agent = _make_agent(main_context=200_000, threshold_percent=0.95)
mock_client = MagicMock()
mock_client.base_url = "https://openrouter.ai/api/v1"
mock_client.api_key = "sk-aux"
Expand All @@ -360,16 +380,19 @@ def test_warning_stored_for_gateway_replay(mock_get_client, mock_ctx_len):
agent._emit_status = lambda msg: vprint_messages.append(msg)
agent._check_compression_model_feasibility()

assert len(vprint_messages) == 1 # CLI got it
assert len(vprint_messages) == 1 # CLI got it (mismatch > 2.0)
assert "Auto-capped" in vprint_messages[0]
assert agent._compression_warning is not None # stored for replay
# Threshold capped at 85% of 80,000 = 68,000
assert agent.context_compressor.threshold_tokens == 68_000

# Phase 2: gateway wires callback post-init, then run_conversation replays
callback_events = []
agent.status_callback = lambda ev, msg: callback_events.append((ev, msg))
agent._replay_compression_warning()

assert any(
ev == "lifecycle" and "Auto-lowered" in msg
ev == "lifecycle" and "Auto-capped" in msg
for ev, msg in callback_events
)

Expand Down Expand Up @@ -410,8 +433,9 @@ def test_replay_without_callback_is_noop():
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_run_conversation_clears_warning_after_replay(mock_get_client, mock_ctx_len):
"""After replay in run_conversation, _compression_warning is cleared
so the warning is not sent again on subsequent turns."""
agent = _make_agent(main_context=200_000, threshold_percent=0.50)
so the warning is not sent again on subsequent turns.
Uses threshold_percent=0.95 so mismatch ratio 190K/68K=2.79 > 2.0 → warns."""
agent = _make_agent(main_context=200_000, threshold_percent=0.95)
mock_client = MagicMock()
mock_client.base_url = "https://openrouter.ai/api/v1"
mock_client.api_key = "sk-aux"
Expand All @@ -421,6 +445,8 @@ def test_run_conversation_clears_warning_after_replay(mock_get_client, mock_ctx_
agent._check_compression_model_feasibility()

assert agent._compression_warning is not None
# Threshold capped at 85% of 80,000 = 68,000
assert agent.context_compressor.threshold_tokens == 68_000

# Simulate what run_conversation does
callback_events = []
Expand Down
Loading