NousResearch · StefanIsMe · Apr 21, 2026 · Apr 21, 2026
diff --git a/agent/model_metadata.py b/agent/model_metadata.py
@@ -1122,16 +1122,33 @@ def get_model_context_length(
         if ctx:
             return ctx
 
-    # 6. OpenRouter live API metadata (provider-unaware fallback)
+    # 6. Provider-aware hardcoded defaults — checked BEFORE OpenRouter
+    #    because the generic aggregator may have stale/incomplete data
+    #    (e.g. OpenRouter reports 128K for MiniMax-M2.7 when it's 204.8K).
+    #    Provider-specific knowledge beats the generic fallback.
+    model_lower = model.lower()
+    if effective_provider:
+        prov_lower = effective_provider.lower()
+        # Match DEFAULT_CONTEXT_LENGTHS keys against the provider name only
+        # (substring test, longest key first). Broad family keys such as
+        # "grok" or "claude" must NOT match via model name here — that would
+        # give OpenRouter models stale hardcoded values instead of the live
+        # metadata (step 7). Model-name matching is step 8 (no-provider fallback).
+        for default_model, length in sorted(
+            DEFAULT_CONTEXT_LENGTHS.items(), key=lambda x: len(x[0]), reverse=True
+        ):
+            if default_model in prov_lower:
+                return length
+
+    # 7. OpenRouter live API metadata (provider-unaware fallback)
     metadata = fetch_model_metadata()
     if model in metadata:
         return metadata[model].get("context_length", 128000)
 
-    # 8. Hardcoded defaults (fuzzy match — longest key first for specificity)
-    # Only check `default_model in model` (is the key a substring of the input).
-    # The reverse (`model in default_model`) causes shorter names like
-    # "claude-sonnet-4" to incorrectly match "claude-sonnet-4-6" and return 1M.
-    model_lower = model.lower()
+    # 8. Hardcoded defaults — model name fallback (no provider known)
+    #    Only check `default_model in model` (is the key a substring of the input).
+    #    The reverse (`model in default_model`) causes shorter names like
+    #    "claude-sonnet-4" to incorrectly match "claude-sonnet-4-6" and return 1M.
     for default_model, length in sorted(
         DEFAULT_CONTEXT_LENGTHS.items(), key=lambda x: len(x[0]), reverse=True
     ):

diff --git a/run_agent.py b/run_agent.py
@@ -2196,12 +2196,16 @@ def _check_compression_model_feasibility(self) -> None:
 
             threshold = self.context_compressor.threshold_tokens
             if aux_context < threshold:
+                # Cap at 85% of the aux model's context to leave room for the
+                # system prompt, compression instructions, and summary output.
+                # NOTE: the cap can fall below 64K; the ">= 64K" claim below is stale.
+                usable_aux = int(aux_context * 0.85)
                 # Auto-correct: lower the live session threshold so
                 # compression actually works this session.  The hard floor
                 # above guarantees aux_context >= MINIMUM_CONTEXT_LENGTH,
                 # so the new threshold is always >= 64K.
                 old_threshold = threshold
-                new_threshold = aux_context
+                new_threshold = min(usable_aux, old_threshold)
                 self.context_compressor.threshold_tokens = new_threshold
                 # Keep threshold_percent in sync so future main-model
                 # context_length changes (update_model) re-derive from a
@@ -2211,34 +2215,53 @@ def _check_compression_model_feasibility(self) -> None:
                     self.context_compressor.threshold_percent = (
                         new_threshold / main_ctx
                     )
-                safe_pct = int((aux_context / main_ctx) * 100) if main_ctx else 50
-                msg = (
-                    f"⚠ Compression model ({aux_model}) context is "
-                    f"{aux_context:,} tokens, but the main model's "
-                    f"compression threshold was {old_threshold:,} tokens. "
-                    f"Auto-lowered this session's threshold to "
-                    f"{new_threshold:,} tokens so compression can run.\n"
-                    f"  To make this permanent, edit config.yaml — either:\n"
-                    f"  1. Use a larger compression model:\n"
-                    f"       auxiliary:\n"
-                    f"         compression:\n"
-                    f"           model: <model-with-{old_threshold:,}+-context>\n"
-                    f"  2. Lower the compression threshold:\n"
-                    f"       compression:\n"
-                    f"         threshold: 0.{safe_pct:02d}"
-                )
-                self._compression_warning = msg
-                self._emit_status(msg)
-                logger.warning(
-                    "Auxiliary compression model %s has %d token context, "
-                    "below the main model's compression threshold of %d "
-                    "tokens — auto-lowered session threshold to %d to "
-                    "keep compression working.",
-                    aux_model,
-                    aux_context,
-                    old_threshold,
-                    new_threshold,
-                )
+                safe_pct = int((new_threshold / main_ctx) * 100) if main_ctx else 50
+                # Two severity levels, keyed on old_threshold / new_threshold:
+                # 1) ratio <= 2x: compression works fine — INFO log only, no user message
+                # 2) ratio > 2x: emit a user-facing warning with the config fix
+                mismatch_ratio = old_threshold / new_threshold if new_threshold else 999
+                if mismatch_ratio <= 2.0:
+                    # Minor mismatch — compression works, just log it.
+                    # No user-facing warning; the auto-lower is sufficient.
+                    logger.info(
+                        "Compression threshold auto-capped: %d -> %d tokens "
+                        "(aux model %s has %d ctx, %.0f%% usable). "
+                        "Set 'compression.threshold: 0.%02d' in config.yaml "
+                        "to suppress this adjustment.",
+                        old_threshold,
+                        new_threshold,
+                        aux_model,
+                        aux_context,
+                        85,
+                        safe_pct,
+                    )
+                else:
+                    # Significant mismatch — warn with actionable fix.
+                    msg = (
+                        f"⚠ Compression model ({aux_model}) has {aux_context:,} "
+                        f"token context but threshold was {old_threshold:,} tokens. "
+                        f"Auto-capped at {new_threshold:,} tokens.\n"
+                        f"  Fix permanently — either:\n"
+                        f"  1. Lower threshold in config.yaml:\n"
+                        f"       compression:\n"
+                        f"         threshold: 0.{safe_pct:02d}\n"
+                        f"  2. Use a larger compression model:\n"
+                        f"       auxiliary:\n"
+                        f"         compression:\n"
+                        f"           model: <model-with-{old_threshold:,}+-context>"
+                    )
+                    self._compression_warning = msg
+                    self._emit_status(msg)
+                    logger.warning(
+                        "Auxiliary compression model %s has %d token context, "
+                        "below the main model's compression threshold of %d "
+                        "tokens — auto-capped session threshold to %d to "
+                        "keep compression working.",
+                        aux_model,
+                        aux_context,
+                        old_threshold,
+                        new_threshold,
+                    )
         except ValueError:
             # Hard rejections (aux below minimum context) must propagate
             # so the session refuses to start.

diff --git a/tests/run_agent/test_compression_feasibility.py b/tests/run_agent/test_compression_feasibility.py
@@ -56,8 +56,9 @@ def _make_agent(
 @patch("agent.model_metadata.get_model_context_length", return_value=80_000)
 @patch("agent.auxiliary_client.get_text_auxiliary_client")
 def test_auto_corrects_threshold_when_aux_context_below_threshold(mock_get_client, mock_ctx_len):
-    """Auto-correction: aux >= 64K floor but < threshold → lower threshold
-    to aux_context so compression still works this session."""
+    """Auto-correction: aux >= 64K floor but < threshold → cap threshold
+    at 85% of aux_context so compression works with safety margin.
+    With mismatch ratio 100K/68K=1.47 ≤ 2.0, this is a silent adjustment."""
     agent = _make_agent(main_context=200_000, threshold_percent=0.50)
     # threshold = 100,000 — aux has 80,000 (above 64K floor, below threshold)
     mock_client = MagicMock()
@@ -70,20 +71,11 @@ def test_auto_corrects_threshold_when_aux_context_below_threshold(mock_get_clien
 
     agent._check_compression_model_feasibility()
 
-    assert len(messages) == 1
-    assert "Compression model" in messages[0]
-    assert "80,000" in messages[0]        # aux context
-    assert "100,000" in messages[0]       # old threshold
-    assert "Auto-lowered" in messages[0]
-    # Actionable persistence guidance included
-    assert "config.yaml" in messages[0]
-    assert "auxiliary:" in messages[0]
-    assert "compression:" in messages[0]
-    assert "threshold:" in messages[0]
-    # Warning stored for gateway replay
-    assert agent._compression_warning is not None
-    # Threshold on the live compressor was actually lowered
-    assert agent.context_compressor.threshold_tokens == 80_000
+    # Mismatch ratio 100K/68K ≈ 1.47 ≤ 2.0 → silent adjustment, no user message
+    assert len(messages) == 0
+    assert agent._compression_warning is None
+    # Threshold capped at 85% of aux context (80,000 * 0.85 = 68,000)
+    assert agent.context_compressor.threshold_tokens == 68_000
 
 
 @patch("agent.model_metadata.get_model_context_length", return_value=32_768)
@@ -324,7 +316,7 @@ def test_exact_threshold_boundary_no_warning(mock_get_client, mock_ctx_len):
 @patch("agent.auxiliary_client.get_text_auxiliary_client")
 def test_just_below_threshold_auto_corrects(mock_get_client, mock_ctx_len):
     """Auto-correct fires when aux context is one token below the threshold
-    (and above the 64K hard floor)."""
+    (and above the 64K hard floor). Mismatch ratio 100K/85K=1.18 ≤ 2.0 → silent."""
     agent = _make_agent(main_context=200_000, threshold_percent=0.50)
     mock_client = MagicMock()
     mock_client.base_url = "https://openrouter.ai/api/v1"
@@ -336,10 +328,37 @@ def test_just_below_threshold_auto_corrects(mock_get_client, mock_ctx_len):
 
     agent._check_compression_model_feasibility()
 
+    # Silent: mismatch ratio is 100K/85K ≈ 1.18 ≤ 2.0
+    assert len(messages) == 0
+    assert agent._compression_warning is None
+    # 85% of 99,999 = 84,999
+    assert agent.context_compressor.threshold_tokens == 84_999
+
+
+@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
+@patch("agent.auxiliary_client.get_text_auxiliary_client")
+def test_significant_mismatch_shows_warning(mock_get_client, mock_ctx_len):
+    """When mismatch ratio > 2.0, a user-facing warning is shown.
+    200K ctx * 0.95 threshold = 190K → 85% of 80K = 68K → ratio 2.79 > 2.0."""
+    agent = _make_agent(main_context=200_000, threshold_percent=0.95)
+    mock_client = MagicMock()
+    mock_client.base_url = "https://openrouter.ai/api/v1"
+    mock_client.api_key = "sk-aux"
+    mock_get_client.return_value = (mock_client, "tiny-compression-model")
+
+    messages = []
+    agent._emit_status = lambda msg: messages.append(msg)
+
+    agent._check_compression_model_feasibility()
+
     assert len(messages) == 1
-    assert "small-model" in messages[0]
-    assert "Auto-lowered" in messages[0]
-    assert agent.context_compressor.threshold_tokens == 99_999
+    assert "tiny-compression-model" in messages[0]
+    assert "Auto-capped" in messages[0]
+    assert "190,000" in messages[0]  # old threshold
+    assert "68,000" in messages[0]   # new threshold (85% of 80K)
+    assert "config.yaml" in messages[0]
+    assert agent._compression_warning is not None
+    assert agent.context_compressor.threshold_tokens == 68_000
 
 
 # ── Two-phase: __init__ + run_conversation replay ───────────────────
@@ -348,8 +367,9 @@ def test_just_below_threshold_auto_corrects(mock_get_client, mock_ctx_len):
 @patch("agent.model_metadata.get_model_context_length", return_value=80_000)
 @patch("agent.auxiliary_client.get_text_auxiliary_client")
 def test_warning_stored_for_gateway_replay(mock_get_client, mock_ctx_len):
-    """__init__ stores the warning; _replay sends it through status_callback."""
-    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
+    """__init__ stores the warning; _replay sends it through status_callback.
+    Uses threshold_percent=0.95 so mismatch ratio 190K/68K=2.79 > 2.0 → warns."""
+    agent = _make_agent(main_context=200_000, threshold_percent=0.95)
     mock_client = MagicMock()
     mock_client.base_url = "https://openrouter.ai/api/v1"
     mock_client.api_key = "sk-aux"
@@ -360,16 +380,19 @@ def test_warning_stored_for_gateway_replay(mock_get_client, mock_ctx_len):
     agent._emit_status = lambda msg: vprint_messages.append(msg)
     agent._check_compression_model_feasibility()
 
-    assert len(vprint_messages) == 1  # CLI got it
+    assert len(vprint_messages) == 1  # CLI got it (mismatch > 2.0)
+    assert "Auto-capped" in vprint_messages[0]
     assert agent._compression_warning is not None  # stored for replay
+    # Threshold capped at 85% of 80,000 = 68,000
+    assert agent.context_compressor.threshold_tokens == 68_000
 
     # Phase 2: gateway wires callback post-init, then run_conversation replays
     callback_events = []
     agent.status_callback = lambda ev, msg: callback_events.append((ev, msg))
     agent._replay_compression_warning()
 
     assert any(
-        ev == "lifecycle" and "Auto-lowered" in msg
+        ev == "lifecycle" and "Auto-capped" in msg
         for ev, msg in callback_events
     )
 
@@ -410,8 +433,9 @@ def test_replay_without_callback_is_noop():
 @patch("agent.auxiliary_client.get_text_auxiliary_client")
 def test_run_conversation_clears_warning_after_replay(mock_get_client, mock_ctx_len):
     """After replay in run_conversation, _compression_warning is cleared
-    so the warning is not sent again on subsequent turns."""
-    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
+    so the warning is not sent again on subsequent turns.
+    Uses threshold_percent=0.95 so mismatch ratio 190K/68K=2.79 > 2.0 → warns."""
+    agent = _make_agent(main_context=200_000, threshold_percent=0.95)
     mock_client = MagicMock()
     mock_client.base_url = "https://openrouter.ai/api/v1"
     mock_client.api_key = "sk-aux"
@@ -421,6 +445,8 @@ def test_run_conversation_clears_warning_after_replay(mock_get_client, mock_ctx_
     agent._check_compression_model_feasibility()
 
     assert agent._compression_warning is not None
+    # Threshold capped at 85% of 80,000 = 68,000
+    assert agent.context_compressor.threshold_tokens == 68_000
 
     # Simulate what run_conversation does
     callback_events = []