diff --git a/agent/model_metadata.py b/agent/model_metadata.py index 47f9bba94fd..14086f1d3ad 100644 --- a/agent/model_metadata.py +++ b/agent/model_metadata.py @@ -1122,16 +1122,33 @@ def get_model_context_length( if ctx: return ctx - # 6. OpenRouter live API metadata (provider-unaware fallback) + # 6. Provider-aware hardcoded defaults — checked BEFORE OpenRouter + # because the generic aggregator may have stale/incomplete data + # (e.g. OpenRouter reports 128K for MiniMax-M2.7 when it's 204.8K). + # Provider-specific knowledge beats the generic fallback. + model_lower = model.lower() + if effective_provider: + prov_lower = effective_provider.lower() + # Check only provider name against DEFAULT_CONTEXT_LENGTHS keys. + # Broad family keys (e.g. "grok", "claude") should NOT match via + # model name here — that causes OpenRouter models to get stale + # hardcoded values instead of OpenRouter's live metadata (step 7). + # Model-name matching happens at step 8 (no-provider fallback). + for default_model, length in sorted( + DEFAULT_CONTEXT_LENGTHS.items(), key=lambda x: len(x[0]), reverse=True + ): + if default_model in prov_lower: + return length + + # 7. OpenRouter live API metadata (provider-unaware fallback) metadata = fetch_model_metadata() if model in metadata: return metadata[model].get("context_length", 128000) - # 8. Hardcoded defaults (fuzzy match — longest key first for specificity) - # Only check `default_model in model` (is the key a substring of the input). - # The reverse (`model in default_model`) causes shorter names like - # "claude-sonnet-4" to incorrectly match "claude-sonnet-4-6" and return 1M. - model_lower = model.lower() + # 8. Hardcoded defaults — model name fallback (no provider known) + # Only check `default_model in model` (is the key a substring of the input). + # The reverse (`model in default_model`) causes shorter names like + # "claude-sonnet-4" to incorrectly match "claude-sonnet-4-6" and return 1M. 
for default_model, length in sorted( DEFAULT_CONTEXT_LENGTHS.items(), key=lambda x: len(x[0]), reverse=True ): diff --git a/run_agent.py b/run_agent.py index e69d30ff2c6..55993915aeb 100644 --- a/run_agent.py +++ b/run_agent.py @@ -2196,12 +2196,16 @@ def _check_compression_model_feasibility(self) -> None: threshold = self.context_compressor.threshold_tokens if aux_context < threshold: + # Cap at 85% of aux model context to leave room for + # system prompt, compression instructions, and summary + # output inside the auxiliary model's window. + usable_aux = int(aux_context * 0.85) - # Auto-correct: lower the live session threshold so - # compression actually works this session. The hard floor - # above guarantees aux_context >= MINIMUM_CONTEXT_LENGTH, - # so the new threshold is always >= 64K. + # Auto-correct: lower the live session threshold so + # compression actually works this session. The hard floor + # guarantees aux_context >= MINIMUM_CONTEXT_LENGTH, but after + # the 85% cap the new threshold itself may dip below 64K. old_threshold = threshold - new_threshold = aux_context + new_threshold = min(usable_aux, old_threshold) self.context_compressor.threshold_tokens = new_threshold # Keep threshold_percent in sync so future main-model # context_length changes (update_model) re-derive from a
Lower the compression threshold:\n" - f" compression:\n" - f" threshold: 0.{safe_pct:02d}" - ) - self._compression_warning = msg - self._emit_status(msg) - logger.warning( - "Auxiliary compression model %s has %d token context, " - "below the main model's compression threshold of %d " - "tokens — auto-lowered session threshold to %d to " - "keep compression working.", - aux_model, - aux_context, - old_threshold, - new_threshold, - ) + safe_pct = int((new_threshold / main_ctx) * 100) if main_ctx else 50 + # Two severity levels: + # 1) Mismatch <= 2x: compression works fine, just a note + # 2) Mismatch > 2x: warn user to fix config + mismatch_ratio = old_threshold / new_threshold if new_threshold else 999 + if mismatch_ratio <= 2.0: + # Minor mismatch — compression works, just log it. + # No user-facing warning; the auto-lower is sufficient. + logger.info( + "Compression threshold auto-capped: %d -> %d tokens " + "(aux model %s has %d ctx, %.0f%% usable). " + "Set 'compression.threshold: 0.%02d' in config.yaml " + "to suppress this adjustment.", + old_threshold, + new_threshold, + aux_model, + aux_context, + 85, + safe_pct, + ) + else: + # Significant mismatch — warn with actionable fix. + msg = ( + f"⚠ Compression model ({aux_model}) has {aux_context:,} " + f"token context but threshold was {old_threshold:,} tokens. " + f"Auto-capped at {new_threshold:,} tokens.\n" + f" Fix permanently — either:\n" + f" 1. Lower threshold in config.yaml:\n" + f" compression:\n" + f" threshold: 0.{safe_pct:02d}\n" + f" 2. 
Use a larger compression model:\n" + f" auxiliary:\n" + f" compression:\n" + f" model: " + ) + self._compression_warning = msg + self._emit_status(msg) + logger.warning( + "Auxiliary compression model %s has %d token context, " + "below the main model's compression threshold of %d " + "tokens — auto-capped session threshold to %d to " + "keep compression working.", + aux_model, + aux_context, + old_threshold, + new_threshold, + ) except ValueError: # Hard rejections (aux below minimum context) must propagate # so the session refuses to start. diff --git a/tests/run_agent/test_compression_feasibility.py b/tests/run_agent/test_compression_feasibility.py index 25dc0c01abf..dfd6916361d 100644 --- a/tests/run_agent/test_compression_feasibility.py +++ b/tests/run_agent/test_compression_feasibility.py @@ -56,8 +56,9 @@ def _make_agent( @patch("agent.model_metadata.get_model_context_length", return_value=80_000) @patch("agent.auxiliary_client.get_text_auxiliary_client") def test_auto_corrects_threshold_when_aux_context_below_threshold(mock_get_client, mock_ctx_len): - """Auto-correction: aux >= 64K floor but < threshold → lower threshold - to aux_context so compression still works this session.""" + """Auto-correction: aux >= 64K floor but < threshold → cap threshold + at 85% of aux_context so compression works with safety margin. 
+ With mismatch ratio 100K/68K=1.47 ≤ 2.0, this is a silent adjustment.""" agent = _make_agent(main_context=200_000, threshold_percent=0.50) # threshold = 100,000 — aux has 80,000 (above 64K floor, below threshold) mock_client = MagicMock() @@ -70,20 +71,11 @@ def test_auto_corrects_threshold_when_aux_context_below_threshold(mock_get_clien agent._check_compression_model_feasibility() - assert len(messages) == 1 - assert "Compression model" in messages[0] - assert "80,000" in messages[0] # aux context - assert "100,000" in messages[0] # old threshold - assert "Auto-lowered" in messages[0] - # Actionable persistence guidance included - assert "config.yaml" in messages[0] - assert "auxiliary:" in messages[0] - assert "compression:" in messages[0] - assert "threshold:" in messages[0] - # Warning stored for gateway replay - assert agent._compression_warning is not None - # Threshold on the live compressor was actually lowered - assert agent.context_compressor.threshold_tokens == 80_000 + # Mismatch ratio 100K/68K ≈ 1.47 ≤ 2.0 → silent adjustment, no user message + assert len(messages) == 0 + assert agent._compression_warning is None + # Threshold capped at 85% of aux context (80,000 * 0.85 = 68,000) + assert agent.context_compressor.threshold_tokens == 68_000 @patch("agent.model_metadata.get_model_context_length", return_value=32_768) @@ -324,7 +316,7 @@ def test_exact_threshold_boundary_no_warning(mock_get_client, mock_ctx_len): @patch("agent.auxiliary_client.get_text_auxiliary_client") def test_just_below_threshold_auto_corrects(mock_get_client, mock_ctx_len): """Auto-correct fires when aux context is one token below the threshold - (and above the 64K hard floor).""" + (and above the 64K hard floor). 
Mismatch ratio 100K/85K=1.18 ≤ 2.0 → silent.""" agent = _make_agent(main_context=200_000, threshold_percent=0.50) mock_client = MagicMock() mock_client.base_url = "https://openrouter.ai/api/v1" @@ -336,10 +328,37 @@ def test_just_below_threshold_auto_corrects(mock_get_client, mock_ctx_len): agent._check_compression_model_feasibility() + # Silent: mismatch ratio is 100K/85K ≈ 1.18 ≤ 2.0 + assert len(messages) == 0 + assert agent._compression_warning is None + # 85% of 99,999 = 84,999 + assert agent.context_compressor.threshold_tokens == 84_999 + + +@patch("agent.model_metadata.get_model_context_length", return_value=80_000) +@patch("agent.auxiliary_client.get_text_auxiliary_client") +def test_significant_mismatch_shows_warning(mock_get_client, mock_ctx_len): + """When mismatch ratio > 2.0, a user-facing warning is shown. + 200K ctx * 0.95 threshold = 190K → 85% of 80K = 68K → ratio 2.79 > 2.0.""" + agent = _make_agent(main_context=200_000, threshold_percent=0.95) + mock_client = MagicMock() + mock_client.base_url = "https://openrouter.ai/api/v1" + mock_client.api_key = "sk-aux" + mock_get_client.return_value = (mock_client, "tiny-compression-model") + + messages = [] + agent._emit_status = lambda msg: messages.append(msg) + + agent._check_compression_model_feasibility() + assert len(messages) == 1 - assert "small-model" in messages[0] - assert "Auto-lowered" in messages[0] - assert agent.context_compressor.threshold_tokens == 99_999 + assert "tiny-compression-model" in messages[0] + assert "Auto-capped" in messages[0] + assert "190,000" in messages[0] # old threshold + assert "68,000" in messages[0] # new threshold (85% of 80K) + assert "config.yaml" in messages[0] + assert agent._compression_warning is not None + assert agent.context_compressor.threshold_tokens == 68_000 # ── Two-phase: __init__ + run_conversation replay ─────────────────── @@ -348,8 +367,9 @@ def test_just_below_threshold_auto_corrects(mock_get_client, mock_ctx_len): 
@patch("agent.model_metadata.get_model_context_length", return_value=80_000) @patch("agent.auxiliary_client.get_text_auxiliary_client") def test_warning_stored_for_gateway_replay(mock_get_client, mock_ctx_len): - """__init__ stores the warning; _replay sends it through status_callback.""" - agent = _make_agent(main_context=200_000, threshold_percent=0.50) + """__init__ stores the warning; _replay sends it through status_callback. + Uses threshold_percent=0.95 so mismatch ratio 190K/68K=2.79 > 2.0 → warns.""" + agent = _make_agent(main_context=200_000, threshold_percent=0.95) mock_client = MagicMock() mock_client.base_url = "https://openrouter.ai/api/v1" mock_client.api_key = "sk-aux" @@ -360,8 +380,11 @@ def test_warning_stored_for_gateway_replay(mock_get_client, mock_ctx_len): agent._emit_status = lambda msg: vprint_messages.append(msg) agent._check_compression_model_feasibility() - assert len(vprint_messages) == 1 # CLI got it + assert len(vprint_messages) == 1 # CLI got it (mismatch > 2.0) + assert "Auto-capped" in vprint_messages[0] assert agent._compression_warning is not None # stored for replay + # Threshold capped at 85% of 80,000 = 68,000 + assert agent.context_compressor.threshold_tokens == 68_000 # Phase 2: gateway wires callback post-init, then run_conversation replays callback_events = [] @@ -369,7 +392,7 @@ def test_warning_stored_for_gateway_replay(mock_get_client, mock_ctx_len): agent._replay_compression_warning() assert any( - ev == "lifecycle" and "Auto-lowered" in msg + ev == "lifecycle" and "Auto-capped" in msg for ev, msg in callback_events ) @@ -410,8 +433,9 @@ def test_replay_without_callback_is_noop(): @patch("agent.auxiliary_client.get_text_auxiliary_client") def test_run_conversation_clears_warning_after_replay(mock_get_client, mock_ctx_len): """After replay in run_conversation, _compression_warning is cleared - so the warning is not sent again on subsequent turns.""" - agent = _make_agent(main_context=200_000, threshold_percent=0.50) + so 
the warning is not sent again on subsequent turns. + Uses threshold_percent=0.95 so mismatch ratio 190K/68K=2.79 > 2.0 → warns.""" + agent = _make_agent(main_context=200_000, threshold_percent=0.95) mock_client = MagicMock() mock_client.base_url = "https://openrouter.ai/api/v1" mock_client.api_key = "sk-aux" @@ -421,6 +445,8 @@ def test_run_conversation_clears_warning_after_replay(mock_get_client, mock_ctx_ agent._check_compression_model_feasibility() assert agent._compression_warning is not None + # Threshold capped at 85% of 80,000 = 68,000 + assert agent.context_compressor.threshold_tokens == 68_000 # Simulate what run_conversation does callback_events = []