diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index 50d4d86afb0..a051d58411e 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -129,6 +129,7 @@ def _fixed_temperature_for_model( return OMIT_TEMPERATURE return None + # Default auxiliary models for direct API-key providers (cheap/fast for side tasks) _API_KEY_PROVIDER_AUX_MODELS: Dict[str, str] = { "gemini": "gemini-3-flash-preview", @@ -265,7 +266,9 @@ def _select_pool_entry(provider: str) -> Tuple[bool, Optional[Any]]: try: return True, pool.select() except Exception as exc: - logger.debug("Auxiliary client: could not select pool entry for %s: %s", provider, exc) + logger.debug( + "Auxiliary client: could not select pool entry for %s: %s", provider, exc + ) return True, None @@ -327,7 +330,11 @@ def _convert_content_for_responses(content: Any) -> Any: elif ptype == "image_url": # chat.completions nests the URL: {"image_url": {"url": "..."}} image_data = part.get("image_url", {}) - url = image_data.get("url", "") if isinstance(image_data, dict) else str(image_data) + url = ( + image_data.get("url", "") + if isinstance(image_data, dict) + else str(image_data) + ) entry: Dict[str, Any] = {"type": "input_image", "image_url": url} # Preserve detail if specified detail = image_data.get("detail") if isinstance(image_data, dict) else None @@ -369,10 +376,12 @@ def create(self, **kwargs) -> Any: if role == "system": instructions = content if isinstance(content, str) else str(content) else: - input_msgs.append({ - "role": role, - "content": _convert_content_for_responses(content), - }) + input_msgs.append( + { + "role": role, + "content": _convert_content_for_responses(content), + } + ) resp_kwargs: Dict[str, Any] = { "model": model, @@ -393,12 +402,14 @@ def create(self, **kwargs) -> Any: name = fn.get("name") if not name: continue - converted.append({ - "type": "function", - "name": name, - "description": fn.get("description", ""), - "parameters": fn.get("parameters", {}), - }) + converted.append( + { + "type": "function", + "name": name, + "description": fn.get("description", ""), + "parameters": fn.get("parameters", {}), + } + ) if converted: resp_kwargs["tools"] = converted @@ -443,13 +454,20 @@ def create(self, **kwargs) -> Any: # a function_call response with incidental text should not # be collapsed into a plain-text message. assembled = "".join(collected_text_deltas) - final.output = [SimpleNamespace( - type="message", role="assistant", status="completed", - content=[SimpleNamespace(type="output_text", text=assembled)], - )] + final.output = [ + SimpleNamespace( + type="message", + role="assistant", + status="completed", + content=[ + SimpleNamespace(type="output_text", text=assembled) + ], + ) + ] logger.debug( "Codex auxiliary: synthesized from %d deltas (%d chars)", - len(collected_text_deltas), len(assembled), + len(collected_text_deltas), + len(assembled), ) # Extract text and tool calls from the Responses output. @@ -464,19 +482,21 @@ def _item_get(obj: Any, key: str, default: Any = None) -> Any: for item in getattr(final, "output", []): item_type = _item_get(item, "type") if item_type == "message": - for part in (_item_get(item, "content") or []): + for part in _item_get(item, "content") or []: ptype = _item_get(part, "type") if ptype in ("output_text", "text"): text_parts.append(_item_get(part, "text", "")) elif item_type == "function_call": - tool_calls_raw.append(SimpleNamespace( - id=_item_get(item, "call_id", ""), - type="function", - function=SimpleNamespace( - name=_item_get(item, "name", ""), - arguments=_item_get(item, "arguments", "{}"), - ), - )) + tool_calls_raw.append( + SimpleNamespace( + id=_item_get(item, "call_id", ""), + type="function", + function=SimpleNamespace( + name=_item_get(item, "name", ""), + arguments=_item_get(item, "arguments", "{}"), + ), + ) + ) resp_usage = getattr(final, "usage", None) if resp_usage: @@ -546,6 +566,7 @@ def __init__(self, sync_adapter: _CodexCompletionsAdapter): async def create(self, **kwargs) -> Any: import asyncio + return await asyncio.to_thread(self._sync.create, **kwargs) @@ -574,13 +595,18 @@ def __init__(self, real_client: Any, model: str, is_oauth: bool = False): self._is_oauth = is_oauth def create(self, **kwargs) -> Any: - from agent.anthropic_adapter import build_anthropic_kwargs, normalize_anthropic_response + from agent.anthropic_adapter import ( + build_anthropic_kwargs, + normalize_anthropic_response, + ) messages = kwargs.get("messages", []) model = kwargs.get("model", self._model) tools = kwargs.get("tools") tool_choice = kwargs.get("tool_choice") - max_tokens = kwargs.get("max_tokens") or kwargs.get("max_completion_tokens") or 2000 + max_tokens = ( + kwargs.get("max_tokens") or kwargs.get("max_completion_tokens") or 2000 + ) temperature = kwargs.get("temperature") normalized_tool_choice = None @@ -607,6 +633,7 @@ def create(self, **kwargs) -> Any: # additionally strips these keys as a safety net — keep both layers. if temperature is not None: from agent.anthropic_adapter import _forbids_sampling_params + if not _forbids_sampling_params(model): anthropic_kwargs["temperature"] = temperature @@ -617,7 +644,9 @@ def create(self, **kwargs) -> Any: if hasattr(response, "usage") and response.usage: prompt_tokens = getattr(response.usage, "input_tokens", 0) or 0 completion_tokens = getattr(response.usage, "output_tokens", 0) or 0 - total_tokens = getattr(response.usage, "total_tokens", 0) or (prompt_tokens + completion_tokens) + total_tokens = getattr(response.usage, "total_tokens", 0) or ( + prompt_tokens + completion_tokens + ) usage = SimpleNamespace( prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, @@ -644,7 +673,14 @@ def __init__(self, adapter: _AnthropicCompletionsAdapter): class AnthropicAuxiliaryClient: """OpenAI-client-compatible wrapper over a native Anthropic client.""" - def __init__(self, real_client: Any, model: str, api_key: str, base_url: str, is_oauth: bool = False): + def __init__( + self, + real_client: Any, + model: str, + api_key: str, + base_url: str, + is_oauth: bool = False, + ): self._real_client = real_client adapter = _AnthropicCompletionsAdapter(real_client, model, is_oauth=is_oauth) self.chat = _AnthropicChatShim(adapter) @@ -663,6 +699,7 @@ def __init__(self, sync_adapter: _AnthropicCompletionsAdapter): async def create(self, **kwargs) -> Any: import asyncio + return await asyncio.to_thread(self._sync.create, **kwargs) @@ -745,6 +782,7 @@ def _read_codex_access_token() -> Optional[str]: try: from hermes_cli.auth import _read_codex_tokens + data = _read_codex_tokens() tokens = data.get("tokens", {}) access_token = tokens.get("access_token") @@ -755,6 +793,7 @@ def _read_codex_access_token() -> Optional[str]: # prevent fallback to working providers (e.g. Anthropic). try: import base64 + payload = access_token.split(".")[1] payload += "=" * (-len(payload) % 4) claims = json.loads(base64.urlsafe_b64decode(payload)) @@ -778,7 +817,10 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]: credentials, or (None, None) if none are configured. """ try: - from hermes_cli.auth import PROVIDER_REGISTRY, resolve_api_key_provider_credentials + from hermes_cli.auth import ( + PROVIDER_REGISTRY, + resolve_api_key_provider_credentials, + ) except ImportError: logger.debug("Could not import PROVIDER_REGISTRY for API-key fallback") return None, None @@ -792,6 +834,7 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]: # as auxiliary fallback when the user's primary provider fails. try: from hermes_cli.auth import is_provider_explicitly_configured + if not is_provider_explicitly_configured("anthropic"): continue except ImportError: @@ -805,7 +848,8 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]: continue base_url = _to_openai_base_url( - _pool_runtime_base_url(entry, pconfig.inference_base_url) or pconfig.inference_base_url + _pool_runtime_base_url(entry, pconfig.inference_base_url) + or pconfig.inference_base_url ) model = _API_KEY_PROVIDER_AUX_MODELS.get(provider_id) if model is None: @@ -831,7 +875,8 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]: continue base_url = _to_openai_base_url( - str(creds.get("base_url", "")).strip().rstrip("/") or pconfig.inference_base_url + str(creds.get("base_url", "")).strip().rstrip("/") + or pconfig.inference_base_url ) model = _API_KEY_PROVIDER_AUX_MODELS.get(provider_id) if model is None: @@ -857,24 +902,27 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]: # ── Provider resolution helpers ───────────────────────────────────────────── - def _try_openrouter() -> Tuple[Optional[OpenAI], Optional[str]]: pool_present, entry = _select_pool_entry("openrouter") if pool_present: or_key = _pool_runtime_api_key(entry) if not or_key: return None, None - base_url = _pool_runtime_base_url(entry, OPENROUTER_BASE_URL) or OPENROUTER_BASE_URL + base_url = ( + _pool_runtime_base_url(entry, OPENROUTER_BASE_URL) or OPENROUTER_BASE_URL + ) logger.debug("Auxiliary client: OpenRouter via pool") - return OpenAI(api_key=or_key, base_url=base_url, - default_headers=_OR_HEADERS), _OPENROUTER_MODEL + return OpenAI( + api_key=or_key, base_url=base_url, default_headers=_OR_HEADERS + ), _OPENROUTER_MODEL or_key = os.getenv("OPENROUTER_API_KEY") if not or_key: return None, None logger.debug("Auxiliary client: OpenRouter") - return OpenAI(api_key=or_key, base_url=OPENROUTER_BASE_URL, - default_headers=_OR_HEADERS), _OPENROUTER_MODEL + return OpenAI( + api_key=or_key, base_url=OPENROUTER_BASE_URL, default_headers=_OR_HEADERS + ), _OPENROUTER_MODEL def _try_nous(vision: bool = False) -> Tuple[Optional[OpenAI], Optional[str]]: @@ -883,6 +931,7 @@ def _try_nous(vision: bool = False) -> Tuple[Optional[OpenAI], Optional[str]]: # to avoid piling more requests onto the tapped RPH bucket. try: from agent.nous_rate_guard import nous_rate_limit_remaining + _remaining = nous_rate_limit_remaining() if _remaining is not None and _remaining > 0: logger.debug( @@ -907,16 +956,24 @@ def _try_nous(vision: bool = False) -> Tuple[Optional[OpenAI], Optional[str]]: # models instead: mimo-v2-omni for vision, mimo-v2-pro for text tasks. try: from hermes_cli.models import check_nous_free_tier + if check_nous_free_tier(): - model = _NOUS_FREE_TIER_VISION_MODEL if vision else _NOUS_FREE_TIER_AUX_MODEL - logger.debug("Free-tier Nous account — using %s for auxiliary/%s", - model, "vision" if vision else "text") + model = ( + _NOUS_FREE_TIER_VISION_MODEL if vision else _NOUS_FREE_TIER_AUX_MODEL + ) + logger.debug( + "Free-tier Nous account — using %s for auxiliary/%s", + model, + "vision" if vision else "text", + ) except Exception: pass return ( OpenAI( api_key=_nous_api_key(nous), - base_url=str(nous.get("inference_base_url") or _nous_base_url()).rstrip("/"), + base_url=str(nous.get("inference_base_url") or _nous_base_url()).rstrip( + "/" + ), ), model, ) @@ -930,6 +987,7 @@ def _read_main_model() -> str: """ try: from hermes_cli.config import load_config + cfg = load_config() model_cfg = cfg.get("model", {}) if isinstance(model_cfg, str) and model_cfg.strip(): @@ -951,6 +1009,7 @@ def _read_main_provider() -> str: """ try: from hermes_cli.config import load_config + cfg = load_config() model_cfg = cfg.get("model", {}) if isinstance(model_cfg, dict): @@ -1028,15 +1087,21 @@ def _validate_proxy_env_urls() -> None: """ from urllib.parse import urlparse - for key in ("HTTPS_PROXY", "HTTP_PROXY", "ALL_PROXY", - "https_proxy", "http_proxy", "all_proxy"): + for key in ( + "HTTPS_PROXY", + "HTTP_PROXY", + "ALL_PROXY", + "https_proxy", + "http_proxy", + "all_proxy", + ): value = str(os.environ.get(key) or "").strip() if not value: continue try: parsed = urlparse(value) if parsed.scheme: - _ = parsed.port # raises ValueError for e.g. '6153export' + _ = parsed.port # raises ValueError for e.g. '6153export' except ValueError as exc: raise RuntimeError( f"Malformed proxy environment variable {key}={value!r}. " @@ -1054,7 +1119,7 @@ def _validate_base_url(base_url: str) -> None: try: parsed = urlparse(candidate) if parsed.scheme in {"http", "https"}: - _ = parsed.port # raises ValueError for malformed ports + _ = parsed.port # raises ValueError for malformed ports except ValueError as exc: raise RuntimeError( f"Malformed custom endpoint URL: {candidate!r}. " @@ -1074,7 +1139,11 @@ def _try_custom_endpoint() -> Tuple[Optional[Any], Optional[str]]: if custom_base.lower().startswith(_CODEX_AUX_BASE_URL.lower()): return None, None model = _read_main_model() or "gpt-4o-mini" - logger.debug("Auxiliary client: custom endpoint (%s, api_mode=%s)", model, custom_mode or "chat_completions") + logger.debug( + "Auxiliary client: custom endpoint (%s, api_mode=%s)", + model, + custom_mode or "chat_completions", + ) if custom_mode == "codex_responses": real_client = OpenAI(api_key=custom_key, base_url=custom_base) return CodexAuxiliaryClient(real_client, model), model @@ -1103,7 +1172,10 @@ def _try_codex() -> Tuple[Optional[Any], Optional[str]]: if pool_present: codex_token = _pool_runtime_api_key(entry) if codex_token: - base_url = _pool_runtime_base_url(entry, _CODEX_AUX_BASE_URL) or _CODEX_AUX_BASE_URL + base_url = ( + _pool_runtime_base_url(entry, _CODEX_AUX_BASE_URL) + or _CODEX_AUX_BASE_URL + ) else: codex_token = _read_codex_access_token() if not codex_token: @@ -1125,7 +1197,10 @@ def _try_codex() -> Tuple[Optional[Any], Optional[str]]: def _try_anthropic() -> Tuple[Optional[Any], Optional[str]]: try: - from agent.anthropic_adapter import build_anthropic_client, resolve_anthropic_token + from agent.anthropic_adapter import ( + build_anthropic_client, + resolve_anthropic_token, + ) except ImportError: return None, None @@ -1143,9 +1218,14 @@ def _try_anthropic() -> Tuple[Optional[Any], Optional[str]]: # Allow base URL override from config.yaml model.base_url, but only # when the configured provider is anthropic — otherwise a non-Anthropic # base_url (e.g. Codex endpoint) would leak into Anthropic requests. - base_url = _pool_runtime_base_url(entry, _ANTHROPIC_DEFAULT_BASE_URL) if pool_present else _ANTHROPIC_DEFAULT_BASE_URL + base_url = ( + _pool_runtime_base_url(entry, _ANTHROPIC_DEFAULT_BASE_URL) + if pool_present + else _ANTHROPIC_DEFAULT_BASE_URL + ) try: from hermes_cli.config import load_config + cfg = load_config() model_cfg = cfg.get("model") if isinstance(model_cfg, dict): @@ -1158,9 +1238,15 @@ def _try_anthropic() -> Tuple[Optional[Any], Optional[str]]: pass from agent.anthropic_adapter import _is_oauth_token + is_oauth = _is_oauth_token(token) model = _API_KEY_PROVIDER_AUX_MODELS.get("anthropic", "claude-haiku-4-5-20251001") - logger.debug("Auxiliary client: Anthropic native (%s) at %s (oauth=%s)", model, base_url, is_oauth) + logger.debug( + "Auxiliary client: Anthropic native (%s) at %s (oauth=%s)", + model, + base_url, + is_oauth, + ) try: real_client = build_anthropic_client(token, base_url) except ImportError: @@ -1168,7 +1254,9 @@ def _try_anthropic() -> Tuple[Optional[Any], Optional[str]]: # missing — build_anthropic_client raises ImportError at call time # when _anthropic_sdk is None. Treat as unavailable. return None, None - return AnthropicAuxiliaryClient(real_client, model, token, base_url, is_oauth=is_oauth), model + return AnthropicAuxiliaryClient( + real_client, model, token, base_url, is_oauth=is_oauth + ), model _AUTO_PROVIDER_LABELS = { @@ -1225,9 +1313,16 @@ def _is_payment_error(exc: Exception) -> bool: # OpenRouter and other providers include "credits" or "afford" in 402 bodies, # but sometimes wrap them in 429 or other codes. if status in (402, 429, None): - if any(kw in err_lower for kw in ("credits", "insufficient funds", - "can only afford", "billing", - "payment required")): + if any( + kw in err_lower + for kw in ( + "credits", + "insufficient funds", + "can only afford", + "billing", + "payment required", + ) + ): return True return False @@ -1249,11 +1344,17 @@ def _is_connection_error(exc: Exception) -> bool: if any(kw in err_type for kw in ("Connection", "Timeout", "DNS", "SSL")): return True err_lower = str(exc).lower() - if any(kw in err_lower for kw in ( - "connection refused", "name or service not known", - "no route to host", "network is unreachable", - "timed out", "connection reset", - )): + if any( + kw in err_lower + for kw in ( + "connection refused", + "name or service not known", + "no route to host", + "network is unreachable", + "timed out", + "connection reset", + ) + ): return True return False @@ -1280,9 +1381,14 @@ def _try_payment_fallback( if main_provider and main_provider.lower() in skip: skip_labels.add(main_provider.lower()) # Map common resolved_provider values back to chain labels. - _alias_to_label = {"openrouter": "openrouter", "nous": "nous", - "openai-codex": "openai-codex", "codex": "openai-codex", - "custom": "local/custom", "local/custom": "local/custom"} + _alias_to_label = { + "openrouter": "openrouter", + "nous": "nous", + "openai-codex": "openai-codex", + "codex": "openai-codex", + "custom": "local/custom", + "local/custom": "local/custom", + } skip_chain_labels = {_alias_to_label.get(s, s) for s in skip_labels} tried = [] @@ -1293,19 +1399,28 @@ def _try_payment_fallback( if client is not None: logger.info( "Auxiliary %s: %s on %s — falling back to %s (%s)", - task or "call", reason, failed_provider, label, model or "default", + task or "call", + reason, + failed_provider, + label, + model or "default", ) return client, model, label tried.append(label) logger.warning( "Auxiliary %s: %s on %s and no fallback available (tried: %s)", - task or "call", reason, failed_provider, ", ".join(tried), + task or "call", + reason, + failed_provider, + ", ".join(tried), ) return None, None, "" -def _resolve_auto(main_runtime: Optional[Dict[str, Any]] = None) -> Tuple[Optional[OpenAI], Optional[str]]: +def _resolve_auto( + main_runtime: Optional[Dict[str, Any]] = None, +) -> Tuple[Optional[OpenAI], Optional[str]]: """Full auto-detection chain. Priority: @@ -1335,15 +1450,19 @@ def _resolve_auto(main_runtime: Optional[Dict[str, Any]] = None) -> Tuple[Option if not _stale_base_url_warned: _env_base = os.getenv("OPENAI_BASE_URL", "").strip() _cfg_provider = runtime_provider or _read_main_provider() - if (_env_base and _cfg_provider - and _cfg_provider != "custom" - and not _cfg_provider.startswith("custom:")): + if ( + _env_base + and _cfg_provider + and _cfg_provider != "custom" + and not _cfg_provider.startswith("custom:") + ): logger.warning( "OPENAI_BASE_URL is set (%s) but model.provider is '%s'. " "Auxiliary clients may route to the wrong endpoint. " "Run: hermes model to reconfigure, or remove " "OPENAI_BASE_URL from ~/.hermes/.env", - _env_base, _cfg_provider, + _env_base, + _cfg_provider, ) _stale_base_url_warned = True @@ -1356,12 +1475,13 @@ def _resolve_auto(main_runtime: Optional[Dict[str, Any]] = None) -> Tuple[Option # config.yaml (auxiliary..provider) still win over this. main_provider = runtime_provider or _read_main_provider() main_model = runtime_model or _read_main_model() - if (main_provider and main_model - and main_provider not in ("auto", "")): + if main_provider and main_model and main_provider not in ("auto", ""): resolved_provider = main_provider explicit_base_url = None explicit_api_key = None - if runtime_base_url and (main_provider == "custom" or main_provider.startswith("custom:")): + if runtime_base_url and ( + main_provider == "custom" or main_provider.startswith("custom:") + ): resolved_provider = "custom" explicit_base_url = runtime_base_url explicit_api_key = runtime_api_key or None @@ -1373,8 +1493,11 @@ def _resolve_auto(main_runtime: Optional[Dict[str, Any]] = None) -> Tuple[Option api_mode=runtime_api_mode or None, ) if client is not None: - logger.info("Auxiliary auto-detect: using main provider %s (%s)", - main_provider, resolved or main_model) + logger.info( + "Auxiliary auto-detect: using main provider %s (%s)", + main_provider, + resolved or main_model, + ) return client, resolved or main_model # ── Step 2: aggregator / fallback chain ────────────────────────────── @@ -1383,16 +1506,24 @@ def _resolve_auto(main_runtime: Optional[Dict[str, Any]] = None) -> Tuple[Option client, model = try_fn() if client is not None: if tried: - logger.info("Auxiliary auto-detect: using %s (%s) — skipped: %s", - label, model or "default", ", ".join(tried)) + logger.info( + "Auxiliary auto-detect: using %s (%s) — skipped: %s", + label, + model or "default", + ", ".join(tried), + ) else: - logger.info("Auxiliary auto-detect: using %s (%s)", label, model or "default") + logger.info( + "Auxiliary auto-detect: using %s (%s)", label, model or "default" + ) return client, model tried.append(label) - logger.warning("Auxiliary auto-detect: no provider available (tried: %s). " - "Compression, summarization, and memory flush will not work. " - "Set OPENROUTER_API_KEY or configure a local model in config.yaml.", - ", ".join(tried)) + logger.warning( + "Auxiliary auto-detect: no provider available (tried: %s). " + "Compression, summarization, and memory flush will not work. " + "Set OPENROUTER_API_KEY or configure a local model in config.yaml.", + ", ".join(tried), + ) return None, None @@ -1424,6 +1555,7 @@ def _to_async_client(sync_client, model: str): pass try: from agent.copilot_acp_client import CopilotACPClient + if isinstance(sync_client, CopilotACPClient): return sync_client, model except ImportError: @@ -1445,7 +1577,9 @@ def _to_async_client(sync_client, model: str): return AsyncOpenAI(**async_kwargs), model -def _normalize_resolved_model(model_name: Optional[str], provider: str) -> Optional[str]: +def _normalize_resolved_model( + model_name: Optional[str], provider: str +) -> Optional[str]: """Normalize a resolved model for the provider that will receive it.""" if not model_name: return model_name @@ -1529,8 +1663,10 @@ def _wrap_if_needed(client_obj, final_model_str: str, base_url_str: str = ""): logger.debug( "resolve_provider_client: wrapping client in CodexAuxiliaryClient " "(api_mode=%s, model=%s, base_url=%s)", - api_mode or "auto-detected", final_model_str, - base_url_str[:60] if base_url_str else "") + api_mode or "auto-detected", + final_model_str, + base_url_str[:60] if base_url_str else "", + ) return CodexAuxiliaryClient(client_obj, final_model_str) return client_obj @@ -1546,33 +1682,49 @@ def _wrap_if_needed(client_obj, final_model_str: str, base_url_str: str = ""): if model and "/" in model and resolved and "/" not in resolved: logger.debug( "Dropping OpenRouter-format model %r for non-OpenRouter " - "auxiliary provider (using %r instead)", model, resolved) + "auxiliary provider (using %r instead)", + model, + resolved, + ) model = None final_model = model or resolved - return (_to_async_client(client, final_model) if async_mode - else (client, final_model)) + return ( + _to_async_client(client, final_model) + if async_mode + else (client, final_model) + ) # ── OpenRouter ─────────────────────────────────────────────────── if provider == "openrouter": client, default = _try_openrouter() if client is None: - logger.warning("resolve_provider_client: openrouter requested " - "but OPENROUTER_API_KEY not set") + logger.warning( + "resolve_provider_client: openrouter requested " + "but OPENROUTER_API_KEY not set" + ) return None, None final_model = _normalize_resolved_model(model or default, provider) - return (_to_async_client(client, final_model) if async_mode - else (client, final_model)) + return ( + _to_async_client(client, final_model) + if async_mode + else (client, final_model) + ) # ── Nous Portal (OAuth) ────────────────────────────────────────── if provider == "nous": client, default = _try_nous() if client is None: - logger.warning("resolve_provider_client: nous requested " - "but Nous Portal not configured (run: hermes auth)") + logger.warning( + "resolve_provider_client: nous requested " + "but Nous Portal not configured (run: hermes auth)" + ) return None, None final_model = _normalize_resolved_model(model or default, provider) - return (_to_async_client(client, final_model) if async_mode - else (client, final_model)) + return ( + _to_async_client(client, final_model) + if async_mode + else (client, final_model) + ) # ── OpenAI Codex (OAuth → Responses API) ───────────────────────── if provider == "openai-codex": @@ -1581,8 +1733,10 @@ def _wrap_if_needed(client_obj, final_model_str: str, base_url_str: str = ""): # access to responses.stream() (e.g., the main agent loop). codex_token = _read_codex_access_token() if not codex_token: - logger.warning("resolve_provider_client: openai-codex requested " - "but no Codex OAuth token found (run: hermes model)") + logger.warning( + "resolve_provider_client: openai-codex requested " + "but no Codex OAuth token found (run: hermes model)" + ) return None, None final_model = _normalize_resolved_model(model or _CODEX_AUX_MODEL, provider) raw_client = OpenAI( @@ -1594,12 +1748,17 @@ def _wrap_if_needed(client_obj, final_model_str: str, base_url_str: str = ""): # Standard path: wrap in CodexAuxiliaryClient adapter client, default = _try_codex() if client is None: - logger.warning("resolve_provider_client: openai-codex requested " - "but no Codex OAuth token found (run: hermes model)") + logger.warning( + "resolve_provider_client: openai-codex requested " + "but no Codex OAuth token found (run: hermes model)" + ) return None, None final_model = _normalize_resolved_model(model or default, provider) - return (_to_async_client(client, final_model) if async_mode - else (client, final_model)) + return ( + _to_async_client(client, final_model) + if async_mode + else (client, final_model) + ) # ── Custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY) ─────────── if provider == "custom": @@ -1625,28 +1784,37 @@ def _wrap_if_needed(client_obj, final_model_str: str, base_url_str: str = ""): extra["default_headers"] = {"User-Agent": "KimiCLI/1.30.0"} elif base_url_host_matches(custom_base, "api.githubcopilot.com"): from hermes_cli.models import copilot_default_headers + extra["default_headers"] = copilot_default_headers() client = OpenAI(api_key=custom_key, base_url=custom_base, **extra) client = _wrap_if_needed(client, final_model, custom_base) - return (_to_async_client(client, final_model) if async_mode - else (client, final_model)) + return ( + _to_async_client(client, final_model) + if async_mode + else (client, final_model) + ) # Try custom first, then codex, then API-key providers - for try_fn in (_try_custom_endpoint, _try_codex, - _resolve_api_key_provider): + for try_fn in (_try_custom_endpoint, _try_codex, _resolve_api_key_provider): client, default = try_fn() if client is not None: final_model = _normalize_resolved_model(model or default, provider) _cbase = str(getattr(client, "base_url", "") or "") client = _wrap_if_needed(client, final_model, _cbase) - return (_to_async_client(client, final_model) if async_mode - else (client, final_model)) - logger.warning("resolve_provider_client: custom/main requested " - "but no endpoint credentials found") + return ( + _to_async_client(client, final_model) + if async_mode + else (client, final_model) + ) + logger.warning( + "resolve_provider_client: custom/main requested " + "but no endpoint credentials found" + ) return None, None # ── Named custom providers (config.yaml custom_providers list) ─── try: from hermes_cli.runtime_provider import _get_named_custom_provider + custom_entry = _get_named_custom_provider(provider) if custom_entry: custom_base = custom_entry.get("base_url", "").strip() @@ -1657,19 +1825,28 @@ def _wrap_if_needed(client_obj, final_model_str: str, base_url_str: str = ""): custom_key = custom_key or "no-key-required" if custom_base: final_model = _normalize_resolved_model( - model or custom_entry.get("model") or _read_main_model() or "gpt-4o-mini", + model + or custom_entry.get("model") + or _read_main_model() + or "gpt-4o-mini", provider, ) client = OpenAI(api_key=custom_key, base_url=custom_base) client = _wrap_if_needed(client, final_model, custom_base) logger.debug( "resolve_provider_client: named custom provider %r (%s)", - provider, final_model) - return (_to_async_client(client, final_model) if async_mode - else (client, final_model)) + provider, + final_model, + ) + return ( + _to_async_client(client, final_model) + if async_mode + else (client, final_model) + ) logger.warning( "resolve_provider_client: named custom provider %r has no base_url", - provider) + provider, + ) return None, None except ImportError: pass @@ -1694,10 +1871,16 @@ def _wrap_if_needed(client_obj, final_model_str: str, base_url_str: str = ""): if provider == "anthropic": client, default_model = _try_anthropic() if client is None: - logger.warning("resolve_provider_client: anthropic requested but no Anthropic credentials found") + logger.warning( + "resolve_provider_client: anthropic requested but no Anthropic credentials found" + ) return None, None final_model = _normalize_resolved_model(model or default_model, provider) - return (_to_async_client(client, final_model) if async_mode else (client, final_model)) + return ( + _to_async_client(client, final_model) + if async_mode + else (client, final_model) + ) creds = resolve_api_key_provider_credentials(provider) api_key = str(creds.get("api_key", "")).strip() @@ -1705,13 +1888,17 @@ def _wrap_if_needed(client_obj, final_model_str: str, base_url_str: str = ""): tried_sources = list(pconfig.api_key_env_vars) if provider == "copilot": tried_sources.append("gh auth token") - logger.debug("resolve_provider_client: provider %s has no API " - "key configured (tried: %s)", - provider, ", ".join(tried_sources)) + logger.debug( + "resolve_provider_client: provider %s has no API " + "key configured (tried: %s)", + provider, + ", ".join(tried_sources), + ) return None, None base_url = _to_openai_base_url( - str(creds.get("base_url", "")).strip().rstrip("/") or pconfig.inference_base_url + str(creds.get("base_url", "")).strip().rstrip("/") + or pconfig.inference_base_url ) default_model = _API_KEY_PROVIDER_AUX_MODELS.get(provider, "") @@ -1734,8 +1921,12 @@ def _wrap_if_needed(client_obj, final_model_str: str, base_url_str: str = ""): from hermes_cli.models import copilot_default_headers headers.update(copilot_default_headers()) - client = OpenAI(api_key=api_key, base_url=base_url, - **({"default_headers": headers} if headers else {})) + + client = OpenAI( + api_key=api_key, + base_url=base_url, + **({"default_headers": headers} if headers else {}) + ) # Copilot GPT-5+ models (except gpt-5-mini) require the Responses # API — they are not accessible via /chat/completions. Wrap the @@ -1744,11 +1935,13 @@ def _wrap_if_needed(client_obj, final_model_str: str, base_url_str: str = ""): if provider == "copilot" and final_model and not raw_codex: try: from hermes_cli.models import _should_use_copilot_responses_api + if _should_use_copilot_responses_api(final_model): logger.debug( "resolve_provider_client: copilot model %s needs " "Responses API — wrapping with CodexAuxiliaryClient", - final_model) + final_model, + ) client = CodexAuxiliaryClient(client, final_model) except ImportError: pass @@ -1759,8 +1952,11 @@ def _wrap_if_needed(client_obj, final_model_str: str, base_url_str: str = ""): client = _wrap_if_needed(client, final_model, base_url) logger.debug("resolve_provider_client: %s (%s)", provider, final_model) - return (_to_async_client(client, final_model) if async_mode - else (client, final_model)) + return ( + _to_async_client(client, final_model) + if async_mode + else (client, final_model) + ) if pconfig.auth_type == "external_process": creds = resolve_external_process_provider_credentials(provider) @@ -1791,10 +1987,16 @@ def _wrap_if_needed(client_obj, final_model_str: str, base_url_str: str = ""): args=args, ) logger.debug("resolve_provider_client: %s (%s)", provider, final_model) - return (_to_async_client(client, final_model) if async_mode - else (client, final_model)) - logger.warning("resolve_provider_client: external-process provider %s not " - "directly supported", provider) + return ( + _to_async_client(client, final_model) + if async_mode + else (client, final_model) + ) + logger.warning( + "resolve_provider_client: external-process provider %s not " + "directly supported", + provider, + ) return None, None elif pconfig.auth_type in ("oauth_device_code", "oauth_external"): @@ -1804,17 +2006,24 @@ def _wrap_if_needed(client_obj, final_model_str: str, base_url_str: str = ""): if provider == "openai-codex": return resolve_provider_client("openai-codex", model, async_mode) # Other OAuth providers not directly supported - logger.warning("resolve_provider_client: OAuth provider %s not " - "directly supported, try 'auto'", provider) + logger.warning( + "resolve_provider_client: OAuth provider %s not " + "directly supported, try 'auto'", + provider, + ) return None, None - logger.warning("resolve_provider_client: unhandled auth_type %s for %s", - pconfig.auth_type, provider) + logger.warning( + "resolve_provider_client: unhandled auth_type %s for %s", + pconfig.auth_type, + provider, + ) return None, None # ── Public API ────────────────────────────────────────────────────────────── + def get_text_auxiliary_client( task: str = "", *, @@ -1829,7 +2038,9 @@ def get_text_auxiliary_client( Callers may override the returned model via config.yaml (e.g. auxiliary.compression.model, auxiliary.web_extract.model). """ - provider, model, base_url, api_key, api_mode = _resolve_task_provider_model(task or None) + provider, model, base_url, api_key, api_mode = _resolve_task_provider_model( + task or None + ) return resolve_provider_client( provider, model=model, @@ -1840,14 +2051,18 @@ def get_text_auxiliary_client( ) -def get_async_text_auxiliary_client(task: str = "", *, main_runtime: Optional[Dict[str, Any]] = None): +def get_async_text_auxiliary_client( + task: str = "", *, main_runtime: Optional[Dict[str, Any]] = None +): """Return (async_client, model_slug) for async consumers. For standard providers returns (AsyncOpenAI, model). For Codex returns (AsyncCodexAuxiliaryClient, model) which wraps the Responses API. Returns (None, None) when no provider is available. """ - provider, model, base_url, api_key, api_mode = _resolve_task_provider_model(task or None) + provider, model, base_url, api_key, api_mode = _resolve_task_provider_model( + task or None + ) return resolve_provider_client( provider, model=model, @@ -1869,7 +2084,9 @@ def _normalize_vision_provider(provider: Optional[str]) -> str: return _normalize_aux_provider(provider) -def _resolve_strict_vision_backend(provider: str) -> Tuple[Optional[Any], Optional[str]]: +def _resolve_strict_vision_backend( + provider: str, +) -> Tuple[Optional[Any], Optional[str]]: provider = _normalize_vision_provider(provider) if provider == "openrouter": return _try_openrouter() @@ -1928,12 +2145,18 @@ def resolve_vision_provider_client( backends, so users can intentionally force experimental providers. Auto mode stays conservative and only tries vision backends known to work today. """ - requested, resolved_model, resolved_base_url, resolved_api_key, resolved_api_mode = _resolve_task_provider_model( - "vision", provider, model, base_url, api_key - ) + ( + requested, + resolved_model, + resolved_base_url, + resolved_api_key, + resolved_api_mode, + ) = _resolve_task_provider_model("vision", provider, model, base_url, api_key) requested = _normalize_vision_provider(requested) - def _finalize(resolved_provider: str, sync_client: Any, default_model: Optional[str]): + def _finalize( + resolved_provider: str, sync_client: Any, default_model: Optional[str] + ): if sync_client is None: return resolved_provider, None, None final_model = resolved_model or default_model @@ -1970,15 +2193,15 @@ def _finalize(resolved_provider: str, sync_client: Any, default_model: Optional[ if main_provider and main_provider not in ("auto", ""): vision_model = _PROVIDER_VISION_MODELS.get(main_provider, main_model) rpc_client, rpc_model = resolve_provider_client( - main_provider, vision_model, - api_mode=resolved_api_mode) + main_provider, vision_model, api_mode=resolved_api_mode + ) if rpc_client is not None: logger.info( "Vision auto-detect: using main provider %s (%s)", - main_provider, rpc_model or vision_model, + main_provider, + rpc_model or vision_model, ) - return _finalize( - main_provider, rpc_client, rpc_model or vision_model) + return _finalize(main_provider, rpc_client, rpc_model or vision_model) # Fall back through aggregators (uses their dedicated vision model, # not the user's main model) when main provider has no client. @@ -1996,8 +2219,9 @@ def _finalize(resolved_provider: str, sync_client: Any, default_model: Optional[ sync_client, default_model = _resolve_strict_vision_backend(requested) return _finalize(requested, sync_client, default_model) - client, final_model = _get_cached_client(requested, resolved_model, async_mode, - api_mode=resolved_api_mode) + client, final_model = _get_cached_client( + requested, resolved_model, async_mode, api_mode=resolved_api_mode + ) if client is None: return requested, None, None return requested, client, final_model @@ -2005,7 +2229,7 @@ def _finalize(resolved_provider: str, sync_client: Any, default_model: Optional[ def get_auxiliary_extra_body() -> dict: """Return extra_body kwargs for auxiliary API calls. - + Includes Nous Portal product tags when the auxiliary client is backed by Nous Portal. Returns empty dict otherwise. """ @@ -2014,7 +2238,7 @@ def get_auxiliary_extra_body() -> dict: def auxiliary_max_tokens_param(value: int) -> dict: """Return the correct max tokens kwarg for the auxiliary client's provider. - + OpenRouter and local models use 'max_tokens'. Direct OpenAI with newer models (gpt-4o, o-series, gpt-5+) requires 'max_completion_tokens'. The Codex adapter translates max_tokens internally, so we use max_tokens @@ -2081,6 +2305,7 @@ def neuter_async_httpx_del() -> None: """ try: from openai._base_client import AsyncHttpxClientWrapper + AsyncHttpxClientWrapper.__del__ = lambda self: None # type: ignore[assignment] except (ImportError, AttributeError): pass # Graceful degradation if the SDK changes its internals @@ -2099,6 +2324,7 @@ def _force_close_async_httpx(client: Any) -> None: """ try: from httpx._client import ClientState + inner = getattr(client, "_client", None) if inner is not None and not getattr(inner, "is_closed", True): inner._state = ClientState.CLOSED @@ -2159,7 +2385,9 @@ def _is_openrouter_client(client: Any) -> bool: return False -def _compat_model(client: Any, model: Optional[str], cached_default: Optional[str]) -> Optional[str]: +def _compat_model( + client: Any, model: Optional[str], cached_default: Optional[str] +) -> Optional[str]: """Drop OpenRouter-format model slugs (with '/') for non-OpenRouter clients. Mirrors the guard in resolve_provider_client() which is skipped on cache hits. @@ -2202,12 +2430,24 @@ def _get_cached_client( if async_mode: try: import asyncio as _aio + current_loop = _aio.get_event_loop() except RuntimeError: pass runtime = _normalize_main_runtime(main_runtime) - runtime_key = tuple(runtime.get(field, "") for field in _MAIN_RUNTIME_FIELDS) if provider == "auto" else () - cache_key = (provider, async_mode, base_url or "", api_key or "", api_mode or "", runtime_key) + runtime_key = ( + tuple(runtime.get(field, "") for field in _MAIN_RUNTIME_FIELDS) + if provider == "auto" + else () + ) + cache_key = ( + provider, + async_mode, + base_url or "", + api_key or "", + api_mode or "", + runtime_key, + ) with _client_cache_lock: if cache_key in _client_cache: cached_client, cached_default, cached_loop = _client_cache[cache_key] @@ -2301,7 +2541,13 @@ def _resolve_task_provider_model( if task: # Config.yaml is the primary source for per-task overrides. if cfg_base_url: - return "custom", resolved_model, cfg_base_url, cfg_api_key, resolved_api_mode + return ( + "custom", + resolved_model, + cfg_base_url, + cfg_api_key, + resolved_api_mode, + ) if cfg_provider and cfg_provider != "auto": return cfg_provider, resolved_model, None, None, resolved_api_mode @@ -2319,6 +2565,7 @@ def _get_auxiliary_task_config(task: str) -> Dict[str, Any]: return {} try: from hermes_cli.config import load_config + config = load_config() except ImportError: return {} @@ -2394,23 +2641,27 @@ def _convert_openai_images_to_anthropic(messages: list) -> list: media_type = "image/png" if ":" in header and ";" in header: media_type = header.split(":", 1)[1].split(";", 1)[0] - new_content.append({ - "type": "image", - "source": { - "type": "base64", - "media_type": media_type, - "data": b64data, - }, - }) + new_content.append( + { + "type": "image", + "source": { + "type": "base64", + "media_type": media_type, + "data": b64data, + }, + } + ) else: # URL-based image - new_content.append({ - "type": "image", - "source": { - "type": "url", - "url": image_url_val, - }, - }) + new_content.append( + { + "type": "image", + "source": { + "type": "url", + "url": image_url_val, + }, + } + ) changed = True else: new_content.append(block) @@ -2418,6 +2669,33 @@ def _convert_openai_images_to_anthropic(messages: list) -> list: return converted +def _sanitize_message_content(content: Any) -> str: + """Sanitize message content for strict API endpoints (StepFun, etc.). + + Replaces problematic characters that can cause JSON serialization errors + like "Unterminated string" on strict API backends. + """ + if not isinstance(content, str): + content = str(content) if content else "" + sanitized = content + for old, new in ( + ("\\", "\\\\"), + ('"', '\\"'), + ("\n", "\\n"), + ("\r", "\\r"), + ("\t", "\\t"), + ): + sanitized = sanitized.replace(old, new) + return sanitized + + +def _validate_messages_json(messages: list) -> list: + """Validate and sanitize messages for strict API endpoints. + + Returns messages as-is - JSON truncation is now handled in context_compressor.py. + """ + return messages + def _build_call_kwargs( provider: str, @@ -2431,9 +2709,10 @@ def _build_call_kwargs( base_url: Optional[str] = None, ) -> dict: """Build kwargs for .chat.completions.create() with model/provider adjustments.""" + sanitized_messages = _validate_messages_json(messages) kwargs: Dict[str, Any] = { "model": model, - "messages": messages, + "messages": sanitized_messages, "timeout": timeout, } @@ -2449,6 +2728,7 @@ def _build_call_kwargs( # the aux model is flipped to 4.7. if temperature is not None: from agent.anthropic_adapter import _forbids_sampling_params + if _forbids_sampling_params(model): temperature = None @@ -2490,9 +2770,7 @@ def _validate_llm_response(response: Any, task: str = None) -> Any: See #7264. """ if response is None: - raise RuntimeError( - f"Auxiliary {task or 'call'}: LLM returned None response" - ) + raise RuntimeError(f"Auxiliary {task or 'call'}: LLM returned None response") # Allow SimpleNamespace responses from adapters (CodexAuxiliaryClient, # AnthropicAuxiliaryClient) — they have .choices[0].message. try: @@ -2605,22 +2883,34 @@ def call_llm( # resolved_model may be an OpenRouter-format slug that doesn't # work on other providers. if not resolved_base_url: - logger.info("Auxiliary %s: provider %s unavailable, trying auto-detection chain", - task or "call", resolved_provider) - client, final_model = _get_cached_client("auto", main_runtime=main_runtime) + logger.info( + "Auxiliary %s: provider %s unavailable, trying auto-detection chain", + task or "call", + resolved_provider, + ) + client, final_model = _get_cached_client( + "auto", main_runtime=main_runtime + ) if client is None: raise RuntimeError( f"No LLM provider configured for task={task} provider={resolved_provider}. " - f"Run: hermes setup") + f"Run: hermes setup" + ) effective_timeout = timeout if timeout is not None else _get_task_timeout(task) # Log what we're about to do — makes auxiliary operations visible _base_info = str(getattr(client, "base_url", resolved_base_url) or "") if task: - logger.info("Auxiliary %s: using %s (%s)%s", - task, resolved_provider or "auto", final_model or "default", - f" at {_base_info}" if _base_info and "openrouter" not in _base_info else "") + logger.info( + "Auxiliary %s: using %s (%s)%s", + task, + resolved_provider or "auto", + final_model or "default", + f" at {_base_info}" + if _base_info and "openrouter" not in _base_info + else "", + ) # Pass the client's actual base_url (not just resolved_base_url) so # endpoint-specific temperature overrides can distinguish @@ -2638,8 +2928,7 @@ def call_llm( # Handle max_tokens vs max_completion_tokens retry, then payment fallback. try: - return _validate_llm_response( - client.chat.completions.create(**kwargs), task) + return _validate_llm_response(client.chat.completions.create(**kwargs), task) except Exception as first_err: err_str = str(first_err) if "max_tokens" in err_str or "unsupported_parameter" in err_str: @@ -2647,11 +2936,14 @@ def call_llm( kwargs["max_completion_tokens"] = max_tokens try: return _validate_llm_response( - client.chat.completions.create(**kwargs), task) + client.chat.completions.create(**kwargs), task + ) except Exception as retry_err: # If the max_tokens retry also hits a payment or connection # error, fall through to the fallback chain below. - if not (_is_payment_error(retry_err) or _is_connection_error(retry_err)): + if not ( + _is_payment_error(retry_err) or _is_connection_error(retry_err) + ): raise first_err = retry_err @@ -2667,17 +2959,27 @@ def call_llm( # Codex/OAuth tokens that authenticate but whose endpoint is down, # and providers the user never configured that got picked up by # the auto-detection chain. - should_fallback = _is_payment_error(first_err) or _is_connection_error(first_err) + should_fallback = _is_payment_error(first_err) or _is_connection_error( + first_err + ) # Only try alternative providers when the user didn't explicitly # configure this task's provider. Explicit provider = hard constraint; # auto (the default) = best-effort fallback chain. (#7559) is_auto = resolved_provider in ("auto", "", None) if should_fallback and is_auto: - reason = "payment error" if _is_payment_error(first_err) else "connection error" - logger.info("Auxiliary %s: %s on %s (%s), trying fallback", - task or "call", reason, resolved_provider, first_err) + reason = ( + "payment error" if _is_payment_error(first_err) else "connection error" + ) + logger.info( + "Auxiliary %s: %s on %s (%s), trying fallback", + task or "call", + reason, + resolved_provider, + first_err, + ) fb_client, fb_model, fb_label = _try_payment_fallback( - resolved_provider, task, reason=reason) + resolved_provider, task, reason=reason + ) if fb_client is not None: fb_kwargs = _build_call_kwargs( fb_label, fb_model, messages, @@ -2686,7 +2988,8 @@ def call_llm( extra_body=effective_extra_body, base_url=str(getattr(fb_client, "base_url", "") or "")) return _validate_llm_response( - fb_client.chat.completions.create(**fb_kwargs), task) + fb_client.chat.completions.create(**fb_kwargs), task + ) raise @@ -2716,7 +3019,9 @@ def extract_content_or_reasoning(response) -> str: r"<(?:think|thinking|reasoning|thought|REASONING_SCRATCHPAD)>" r".*?" r"", - "", content, flags=re.DOTALL | re.IGNORECASE, + "", + content, + flags=re.DOTALL | re.IGNORECASE, ).strip() if cleaned: return cleaned @@ -2733,12 +3038,12 @@ def extract_content_or_reasoning(response) -> str: for detail in details: if isinstance(detail, dict): summary = ( - detail.get("summary") - or detail.get("content") - or detail.get("text") + detail.get("summary") or detail.get("content") or detail.get("text") ) if summary and summary not in reasoning_parts: - reasoning_parts.append(summary.strip() if isinstance(summary, str) else str(summary)) + reasoning_parts.append( + summary.strip() if isinstance(summary, str) else str(summary) + ) if reasoning_parts: return "\n\n".join(reasoning_parts) @@ -2811,13 +3116,17 @@ async def async_call_llm( f"variable, or switch to a different provider with `hermes model`." ) if not resolved_base_url: - logger.info("Auxiliary %s: provider %s unavailable, trying auto-detection chain", - task or "call", resolved_provider) + logger.info( + "Auxiliary %s: provider %s unavailable, trying auto-detection chain", + task or "call", + resolved_provider, + ) client, final_model = _get_cached_client("auto", async_mode=True) if client is None: raise RuntimeError( f"No LLM provider configured for task={task} provider={resolved_provider}. " - f"Run: hermes setup") + f"Run: hermes setup" + ) effective_timeout = timeout if timeout is not None else _get_task_timeout(task) @@ -2837,7 +3146,8 @@ async def async_call_llm( try: return _validate_llm_response( - await client.chat.completions.create(**kwargs), task) + await client.chat.completions.create(**kwargs), task + ) except Exception as first_err: err_str = str(first_err) if "max_tokens" in err_str or "unsupported_parameter" in err_str: @@ -2845,23 +3155,36 @@ async def async_call_llm( kwargs["max_completion_tokens"] = max_tokens try: return _validate_llm_response( - await client.chat.completions.create(**kwargs), task) + await client.chat.completions.create(**kwargs), task + ) except Exception as retry_err: # If the max_tokens retry also hits a payment or connection # error, fall through to the fallback chain below. - if not (_is_payment_error(retry_err) or _is_connection_error(retry_err)): + if not ( + _is_payment_error(retry_err) or _is_connection_error(retry_err) + ): raise first_err = retry_err # ── Payment / connection fallback (mirrors sync call_llm) ───── - should_fallback = _is_payment_error(first_err) or _is_connection_error(first_err) + should_fallback = _is_payment_error(first_err) or _is_connection_error( + first_err + ) is_auto = resolved_provider in ("auto", "", None) if should_fallback and is_auto: - reason = "payment error" if _is_payment_error(first_err) else "connection error" - logger.info("Auxiliary %s (async): %s on %s (%s), trying fallback", - task or "call", reason, resolved_provider, first_err) + reason = ( + "payment error" if _is_payment_error(first_err) else "connection error" + ) + logger.info( + "Auxiliary %s (async): %s on %s (%s), trying fallback", + task or "call", + reason, + resolved_provider, + first_err, + ) fb_client, fb_model, fb_label = _try_payment_fallback( - resolved_provider, task, reason=reason) + resolved_provider, task, reason=reason + ) if fb_client is not None: fb_kwargs = _build_call_kwargs( fb_label, fb_model, messages, @@ -2874,5 +3197,6 @@ async def async_call_llm( if async_fb_model and async_fb_model != fb_kwargs.get("model"): fb_kwargs["model"] = async_fb_model return _validate_llm_response( - await async_fb.chat.completions.create(**fb_kwargs), task) + await async_fb.chat.completions.create(**fb_kwargs), task + ) raise diff --git a/agent/context_compressor.py b/agent/context_compressor.py index 254ac0ac5eb..848f43d16b6 100644 --- a/agent/context_compressor.py +++ b/agent/context_compressor.py @@ -147,7 +147,9 @@ def _summarize_tool_result(tool_name: str, tool_args: str, tool_content: str) -> if tool_name == "write_file": path = args.get("path", "?") - written_lines = args.get("content", "").count("\n") + 1 if args.get("content") else "?" + written_lines = ( + args.get("content", "").count("\n") + 1 if args.get("content") else "?" + ) return f"[write_file] wrote to {path} ({written_lines} lines)" if tool_name == "search_files": @@ -163,8 +165,14 @@ def _summarize_tool_result(tool_name: str, tool_args: str, tool_content: str) -> mode = args.get("mode", "replace") return f"[patch] {mode} in {path} ({content_len:,} chars result)" - if tool_name in ("browser_navigate", "browser_click", "browser_snapshot", - "browser_type", "browser_scroll", "browser_vision"): + if tool_name in ( + "browser_navigate", + "browser_click", + "browser_snapshot", + "browser_type", + "browser_scroll", + "browser_vision", + ): url = args.get("url", "") ref = args.get("ref", "") detail = f" {url}" if url else (f" ref={ref}" if ref else "") @@ -304,7 +312,9 @@ def __init__( self.quiet_mode = quiet_mode self.context_length = get_model_context_length( - model, base_url=base_url, api_key=api_key, + model, + base_url=base_url, + api_key=api_key, config_context_length=config_context_length, provider=provider, ) @@ -322,7 +332,8 @@ def __init__( target_tokens = int(self.threshold_tokens * self.summary_target_ratio) self.tail_token_budget = target_tokens self.max_summary_tokens = min( - int(self.context_length * 0.05), _SUMMARY_TOKENS_CEILING, + int(self.context_length * 0.05), + _SUMMARY_TOKENS_CEILING, ) if not quiet_mode: @@ -330,10 +341,14 @@ def __init__( "Context compressor initialized: model=%s context_length=%d " "threshold=%d (%.0f%%) target_ratio=%.0f%% tail_budget=%d " "provider=%s base_url=%s", - model, self.context_length, self.threshold_tokens, - threshold_percent * 100, self.summary_target_ratio * 100, + model, + self.context_length, + self.threshold_tokens, + threshold_percent * 100, + self.summary_target_ratio * 100, self.tail_token_budget, - provider or "none", base_url or "none", + provider or "none", + base_url or "none", ) self._context_probed = False # True after a step-down from context error @@ -381,7 +396,9 @@ def should_compress(self, prompt_tokens: int = None) -> bool: # ------------------------------------------------------------------ def _prune_old_tool_results( - self, messages: List[Dict[str, Any]], protect_tail_count: int, + self, + messages: List[Dict[str, Any]], + protect_tail_count: int, protect_tail_tokens: int | None = None, ) -> tuple[List[Dict[str, Any]], int]: """Replace old tool result contents with informative 1-line summaries. @@ -417,7 +434,10 @@ def _prune_old_tool_results( if isinstance(tc, dict): cid = tc.get("id", "") fn = tc.get("function", {}) - call_id_to_tool[cid] = (fn.get("name", "unknown"), fn.get("arguments", "")) + call_id_to_tool[cid] = ( + fn.get("name", "unknown"), + fn.get("arguments", ""), + ) else: cid = getattr(tc, "id", "") or "" fn = getattr(tc, "function", None) @@ -434,13 +454,20 @@ def _prune_old_tool_results( for i in range(len(result) - 1, -1, -1): msg = result[i] raw_content = msg.get("content") or "" - content_len = sum(len(p.get("text", "")) for p in raw_content) if isinstance(raw_content, list) else len(raw_content) + content_len = ( + sum(len(p.get("text", "")) for p in raw_content) + if isinstance(raw_content, list) + else len(raw_content) + ) msg_tokens = content_len // _CHARS_PER_TOKEN + 10 for tc in msg.get("tool_calls") or []: if isinstance(tc, dict): args = tc.get("function", {}).get("arguments", "") msg_tokens += len(args) // _CHARS_PER_TOKEN - if accumulated + msg_tokens > protect_tail_tokens and (len(result) - i) >= min_protect: + if ( + accumulated + msg_tokens > protect_tail_tokens + and (len(result) - i) >= min_protect + ): boundary = i break accumulated += msg_tokens @@ -466,7 +493,10 @@ def _prune_old_tool_results( h = hashlib.md5(content.encode("utf-8", errors="replace")).hexdigest()[:12] if h in content_hashes: # This is an older duplicate — replace with back-reference - result[i] = {**msg, "content": "[Duplicate tool output — same content as a more recent call]"} + result[i] = { + **msg, + "content": "[Duplicate tool output — same content as a more recent call]", + } pruned += 1 else: content_hashes[h] = (i, msg.get("tool_call_id", "?")) @@ -493,6 +523,7 @@ def _prune_old_tool_results( result[i] = {**msg, "content": summary} pruned += 1 + # Pass 3: Truncate large tool_call arguments while keeping valid JSON # Pass 3: Truncate large tool_call arguments in assistant messages # outside the protected tail. write_file with 50KB content, for # example, survives pruning entirely without this. @@ -505,8 +536,41 @@ def _prune_old_tool_results( msg = result[i] if msg.get("role") != "assistant" or not msg.get("tool_calls"): continue + new_tcs = [] modified = False + for tc in msg.get("tool_calls", []): + if not isinstance(tc, dict): + new_tcs.append(tc) + continue + + func = tc.get("function", {}) + args_str = func.get("arguments", "") + + if len(args_str) > 500: + # Try to parse and truncate while keeping valid JSON + try: + import json + args_obj = json.loads(args_str) + + # Truncate string values + for key, value in args_obj.items(): + if isinstance(value, str) and len(value) > 200: + args_obj[key] = value[:200] + "...[truncated]" + + # Re-serialize to valid JSON + func["arguments"] = json.dumps(args_obj, ensure_ascii=False) + modified = True + except json.JSONDecodeError: + # If can't parse, drop this tool_call but keep message + pruned.append(msg) + modified = True + break + else: + new_tcs.append(tc) + + if modified and new_tcs: + msg["tool_calls"] = new_tcs for tc in msg["tool_calls"]: if isinstance(tc, dict): args = tc.get("function", {}).get("arguments", "") @@ -539,11 +603,11 @@ def _compute_summary_budget(self, turns_to_summarize: List[Dict[str, Any]]) -> i # Truncation limits for the summarizer input. These bound how much of # each message the summary model sees — the budget is the *summary* # model's context window, not the main model's. - _CONTENT_MAX = 6000 # total chars per message body - _CONTENT_HEAD = 4000 # chars kept from the start - _CONTENT_TAIL = 1500 # chars kept from the end - _TOOL_ARGS_MAX = 1500 # tool call argument chars - _TOOL_ARGS_HEAD = 1200 # kept from the start of tool args + _CONTENT_MAX = 6000 # total chars per message body + _CONTENT_HEAD = 4000 # chars kept from the start + _CONTENT_TAIL = 1500 # chars kept from the end + _TOOL_ARGS_MAX = 1500 # tool call argument chars + _TOOL_ARGS_HEAD = 1200 # kept from the start of tool args def _serialize_for_summary(self, turns: List[Dict[str, Any]]) -> str: """Serialize conversation turns into labeled text for the summarizer. @@ -565,14 +629,22 @@ def _serialize_for_summary(self, turns: List[Dict[str, Any]]) -> str: if role == "tool": tool_id = msg.get("tool_call_id", "") if len(content) > self._CONTENT_MAX: - content = content[:self._CONTENT_HEAD] + "\n...[truncated]...\n" + content[-self._CONTENT_TAIL:] + content = ( + content[: self._CONTENT_HEAD] + + "\n...[truncated]...\n" + + content[-self._CONTENT_TAIL :] + ) parts.append(f"[TOOL RESULT {tool_id}]: {content}") continue # Assistant messages: include tool call names AND arguments if role == "assistant": if len(content) > self._CONTENT_MAX: - content = content[:self._CONTENT_HEAD] + "\n...[truncated]...\n" + content[-self._CONTENT_TAIL:] + content = ( + content[: self._CONTENT_HEAD] + + "\n...[truncated]...\n" + + content[-self._CONTENT_TAIL :] + ) tool_calls = msg.get("tool_calls", []) if tool_calls: tc_parts = [] @@ -583,7 +655,7 @@ def _serialize_for_summary(self, turns: List[Dict[str, Any]]) -> str: args = redact_sensitive_text(fn.get("arguments", "")) # Truncate long arguments but keep enough for context if len(args) > self._TOOL_ARGS_MAX: - args = args[:self._TOOL_ARGS_HEAD] + "..." + args = args[: self._TOOL_ARGS_HEAD] + "..." tc_parts.append(f" {name}({args})") else: fn = getattr(tc, "function", None) @@ -595,12 +667,18 @@ def _serialize_for_summary(self, turns: List[Dict[str, Any]]) -> str: # User and other roles if len(content) > self._CONTENT_MAX: - content = content[:self._CONTENT_HEAD] + "\n...[truncated]...\n" + content[-self._CONTENT_TAIL:] + content = ( + content[: self._CONTENT_HEAD] + + "\n...[truncated]...\n" + + content[-self._CONTENT_TAIL :] + ) parts.append(f"[{role.upper()}]: {content}") return "\n\n".join(parts) - def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]], focus_topic: str = None) -> Optional[str]: + def _generate_summary( + self, turns_to_summarize: List[Dict[str, Any]], focus_topic: str = None + ) -> Optional[str]: """Generate a structured summary of conversation turns. Uses a structured template (Goal, Progress, Decisions, Resolved/Pending @@ -744,6 +822,9 @@ def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]], focus_topi The user has requested that this compaction PRIORITISE preserving all information related to the focus topic above. For content related to "{focus_topic}", include full detail — exact values, file paths, command outputs, error messages, and decisions. For content NOT related to the focus topic, summarise more aggressively (brief one-liners or omit if truly irrelevant). The focus topic sections should receive roughly 60-70% of the summary token budget. Even for the focus topic, NEVER preserve API keys, tokens, passwords, or credentials — use [REDACTED].""" try: + from agent.auxiliary_client import _validate_messages_json + + messages = _validate_messages_json([{"role": "user", "content": prompt}]) call_kwargs = { "task": "compression", "main_runtime": { @@ -753,7 +834,7 @@ def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]], focus_topi "api_key": self.api_key, "api_mode": self.api_mode, }, - "messages": [{"role": "user", "content": prompt}], + "messages": messages, "max_tokens": int(summary_budget * 1.3), # timeout resolved from auxiliary.compression.timeout config by call_llm } @@ -774,18 +855,24 @@ def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]], focus_topi return self._with_summary_prefix(summary) except RuntimeError: # No provider configured — long cooldown, unlikely to self-resolve - self._summary_failure_cooldown_until = time.monotonic() + _SUMMARY_FAILURE_COOLDOWN_SECONDS - logging.warning("Context compression: no provider available for " - "summary. Middle turns will be dropped without summary " - "for %d seconds.", - _SUMMARY_FAILURE_COOLDOWN_SECONDS) + self._summary_failure_cooldown_until = ( + time.monotonic() + _SUMMARY_FAILURE_COOLDOWN_SECONDS + ) + logging.warning( + "Context compression: no provider available for " + "summary. Middle turns will be dropped without summary " + "for %d seconds.", + _SUMMARY_FAILURE_COOLDOWN_SECONDS, + ) return None except Exception as e: # If the summary model is different from the main model and the # error looks permanent (model not found, 503, 404), fall back to # using the main model instead of entering cooldown that leaves # context growing unbounded. (#8620 sub-issue 4) - _status = getattr(e, "status_code", None) or getattr(getattr(e, "response", None), "status_code", None) + _status = getattr(e, "status_code", None) or getattr( + getattr(e, "response", None), "status_code", None + ) _err_str = str(e).lower() _is_model_not_found = ( _status in (404, 503) @@ -803,7 +890,9 @@ def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]], focus_topi logging.warning( "Summary model '%s' not available (%s). " "Falling back to main model '%s' for compression.", - self.summary_model, e, self.model, + self.summary_model, + e, + self.model, ) self.summary_model = "" # empty = use main model self._summary_failure_cooldown_until = 0.0 # no cooldown @@ -811,7 +900,9 @@ def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]], focus_topi # Transient errors (timeout, rate limit, network) — shorter cooldown _transient_cooldown = 60 - self._summary_failure_cooldown_until = time.monotonic() + _transient_cooldown + self._summary_failure_cooldown_until = ( + time.monotonic() + _transient_cooldown + ) logging.warning( "Failed to generate context summary: %s. " "Further summary attempts paused for %d seconds.", @@ -826,7 +917,7 @@ def _with_summary_prefix(summary: str) -> str: text = (summary or "").strip() for prefix in (LEGACY_SUMMARY_PREFIX, SUMMARY_PREFIX): if text.startswith(prefix): - text = text[len(prefix):].lstrip() + text = text[len(prefix) :].lstrip() break return f"{SUMMARY_PREFIX}\n{text}" if text else SUMMARY_PREFIX @@ -841,7 +932,9 @@ def _get_tool_call_id(tc) -> str: return tc.get("id", "") return getattr(tc, "id", "") or "" - def _sanitize_tool_pairs(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + def _sanitize_tool_pairs( + self, messages: List[Dict[str, Any]] + ) -> List[Dict[str, Any]]: """Fix orphaned tool_call / tool_result pairs after compression. Two failure modes: @@ -874,11 +967,18 @@ def _sanitize_tool_pairs(self, messages: List[Dict[str, Any]]) -> List[Dict[str, orphaned_results = result_call_ids - surviving_call_ids if orphaned_results: messages = [ - m for m in messages - if not (m.get("role") == "tool" and m.get("tool_call_id") in orphaned_results) + m + for m in messages + if not ( + m.get("role") == "tool" + and m.get("tool_call_id") in orphaned_results + ) ] if not self.quiet_mode: - logger.info("Compression sanitizer: removed %d orphaned tool result(s)", len(orphaned_results)) + logger.info( + "Compression sanitizer: removed %d orphaned tool result(s)", + len(orphaned_results), + ) # 2. Add stub results for assistant tool_calls whose results were dropped missing_results = surviving_call_ids - result_call_ids @@ -890,14 +990,19 @@ def _sanitize_tool_pairs(self, messages: List[Dict[str, Any]]) -> List[Dict[str, for tc in msg.get("tool_calls") or []: cid = self._get_tool_call_id(tc) if cid in missing_results: - patched.append({ - "role": "tool", - "content": "[Result from earlier conversation — see context summary above]", - "tool_call_id": cid, - }) + patched.append( + { + "role": "tool", + "content": "[Result from earlier conversation — see context summary above]", + "tool_call_id": cid, + } + ) messages = patched if not self.quiet_mode: - logger.info("Compression sanitizer: added %d stub tool result(s)", len(missing_results)) + logger.info( + "Compression sanitizer: added %d stub tool result(s)", + len(missing_results), + ) return messages @@ -931,7 +1036,11 @@ def _align_boundary_backward(self, messages: List[Dict[str, Any]], idx: int) -> check -= 1 # If we landed on the parent assistant with tool_calls, pull the # boundary before it so the whole group gets summarised together. - if check >= 0 and messages[check].get("role") == "assistant" and messages[check].get("tool_calls"): + if ( + check >= 0 + and messages[check].get("role") == "assistant" + and messages[check].get("tool_calls") + ): idx = check return idx @@ -996,7 +1105,9 @@ def _ensure_last_user_message_in_tail( return max(last_user_idx, head_end + 1) def _find_tail_cut_by_tokens( - self, messages: List[Dict[str, Any]], head_end: int, + self, + messages: List[Dict[str, Any]], + head_end: int, token_budget: int | None = None, ) -> int: """Walk backward from the end of messages, accumulating tokens until @@ -1062,7 +1173,12 @@ def _find_tail_cut_by_tokens( # Main compression entry point # ------------------------------------------------------------------ - def compress(self, messages: List[Dict[str, Any]], current_tokens: int = None, focus_topic: str = None) -> List[Dict[str, Any]]: + def compress( + self, + messages: List[Dict[str, Any]], + current_tokens: int = None, + focus_topic: str = None, + ) -> List[Dict[str, Any]]: """Compress conversation messages by summarizing middle turns. Algorithm: @@ -1088,15 +1204,21 @@ def compress(self, messages: List[Dict[str, Any]], current_tokens: int = None, f if not self.quiet_mode: logger.warning( "Cannot compress: only %d messages (need > %d)", - n_messages, _min_for_compress, + n_messages, + _min_for_compress, ) return messages - display_tokens = current_tokens if current_tokens else self.last_prompt_tokens or estimate_messages_tokens_rough(messages) + display_tokens = ( + current_tokens + if current_tokens + else self.last_prompt_tokens or estimate_messages_tokens_rough(messages) + ) # Phase 1: Prune old tool results (cheap, no LLM call) messages, pruned_count = self._prune_old_tool_results( - messages, protect_tail_count=self.protect_last_n, + messages, + protect_tail_count=self.protect_last_n, protect_tail_tokens=self.tail_token_budget, ) if pruned_count and not self.quiet_mode: @@ -1154,7 +1276,9 @@ def compress(self, messages: List[Dict[str, Any]], current_tokens: int = None, f # knows context was lost rather than silently dropping everything. if not summary: if not self.quiet_mode: - logger.warning("Summary generation failed — inserting static fallback context marker") + logger.warning( + "Summary generation failed — inserting static fallback context marker" + ) n_dropped = compress_end - compress_start summary = ( f"{SUMMARY_PREFIX}\n" @@ -1165,8 +1289,16 @@ def compress(self, messages: List[Dict[str, Any]], current_tokens: int = None, f ) _merge_summary_into_tail = False - last_head_role = messages[compress_start - 1].get("role", "user") if compress_start > 0 else "user" - first_tail_role = messages[compress_end].get("role", "user") if compress_end < n_messages else "user" + last_head_role = ( + messages[compress_start - 1].get("role", "user") + if compress_start > 0 + else "user" + ) + first_tail_role = ( + messages[compress_end].get("role", "user") + if compress_end < n_messages + else "user" + ) # Pick a role that avoids consecutive same-role with both neighbors. # Priority: avoid colliding with head (already committed), then tail. if last_head_role in ("assistant", "tool"): @@ -1193,8 +1325,7 @@ def compress(self, messages: List[Dict[str, Any]], current_tokens: int = None, f if _merge_summary_into_tail and i == compress_end: original = msg.get("content") or "" msg["content"] = ( - summary - + "\n\n--- END OF CONTEXT SUMMARY — " + summary + "\n\n--- END OF CONTEXT SUMMARY — " "respond to the message below, not the summary above ---\n\n" + original ) @@ -1209,7 +1340,9 @@ def compress(self, messages: List[Dict[str, Any]], current_tokens: int = None, f saved_estimate = display_tokens - new_estimate # Anti-thrashing: track compression effectiveness - savings_pct = (saved_estimate / display_tokens * 100) if display_tokens > 0 else 0 + savings_pct = ( + (saved_estimate / display_tokens * 100) if display_tokens > 0 else 0 + ) self._last_compression_savings_pct = savings_pct if savings_pct < 10: self._ineffective_compression_count += 1 diff --git a/gateway/platforms/qqbot/adapter.py b/gateway/platforms/qqbot/adapter.py index df3987f2ebd..93284645841 100644 --- a/gateway/platforms/qqbot/adapter.py +++ b/gateway/platforms/qqbot/adapter.py @@ -535,6 +535,9 @@ async def _listen_loop(self) -> None: quick_disconnect_count = 0 else: backoff_idx += 1 + if backoff_idx >= MAX_RECONNECT_ATTEMPTS: + logger.error("[%s] Max reconnect attempts reached (QQCloseError)", self._log_tag) + return except Exception as exc: if not self._running: