From 64c2286550179b11c9324628d6fa4c9bee0a1419 Mon Sep 17 00:00:00 2001 From: Ebenge Usip <1551356+eusip@users.noreply.github.com> Date: Thu, 23 Apr 2026 17:47:56 +0200 Subject: [PATCH 1/2] fix: retry metadata/agent-state writes on transient network failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updateMetadataBestEffort and updateAgentStateBestEffort previously made a single fire-and-forget attempt. On flaky or mobile connections a transient error would cause the write to be silently dropped, leaving the session without a persisted vendorResumeId (e.g. claudeSessionId) and making the session permanently non-resumable. Change both helpers to make up to 3 attempts (immediate → +1 s → +2 s) before giving up. Every failed attempt is logged at debug level; the final failure's message notes that all attempts were exhausted. Behaviour on success is unchanged. --- .../api/session/sessionWritesBestEffort.ts | 59 ++++++++++++++----- 1 file changed, 43 insertions(+), 16 deletions(-) diff --git a/apps/cli/src/api/session/sessionWritesBestEffort.ts b/apps/cli/src/api/session/sessionWritesBestEffort.ts index 4b678f8c5..2a76a7319 100644 --- a/apps/cli/src/api/session/sessionWritesBestEffort.ts +++ b/apps/cli/src/api/session/sessionWritesBestEffort.ts @@ -1,20 +1,45 @@ import type { AgentState, Metadata } from '@/api/types'; import { logger } from '@/ui/logger'; +const BEST_EFFORT_MAX_ATTEMPTS = 3; +const BEST_EFFORT_RETRY_DELAYS_MS = [1_000, 2_000]; + +async function withRetry( + fn: () => Promise<void> | void, + onFailure: (error: unknown, attempt: number, isFinal: boolean) => void, +): Promise<void> { + for (let attempt = 1; attempt <= BEST_EFFORT_MAX_ATTEMPTS; attempt++) { + try { + await Promise.resolve(fn()); + return; + } catch (error) { + const isFinal = attempt >= BEST_EFFORT_MAX_ATTEMPTS; + onFailure(error, attempt, isFinal); + if (!isFinal) { + await new Promise((resolve) => + setTimeout(resolve, BEST_EFFORT_RETRY_DELAYS_MS[attempt - 1]), + ); + } + } + } +} +
export function updateAgentStateBestEffort( session: Readonly<{ updateAgentState: (updater: (state: AgentState) => AgentState) => Promise<void> | void }>, updater: (state: AgentState) => AgentState, logPrefix: string, reason: string, ): void { - try { - const result = session.updateAgentState(updater); - void Promise.resolve(result).catch((error) => { - logger.debug(`${logPrefix} Failed to update agent state (${reason}) (non-fatal)`, error); - }); - } catch (error) { - logger.debug(`${logPrefix} Failed to update agent state (${reason}) (non-fatal)`, error); - } + void withRetry( + () => session.updateAgentState(updater), + (error, attempt, isFinal) => { + if (isFinal) { + logger.debug(`${logPrefix} Failed to update agent state (${reason}) after ${BEST_EFFORT_MAX_ATTEMPTS} attempts (non-fatal)`, error); + } else { + logger.debug(`${logPrefix} Failed to update agent state (${reason}), retrying (attempt ${attempt}/${BEST_EFFORT_MAX_ATTEMPTS}) (non-fatal)`, error); + } + }, + ); } export function updateMetadataBestEffort( @@ -23,12 +48,14 @@ export function updateMetadataBestEffort( logPrefix: string, reason: string, ): void { - try { - const result = session.updateMetadata(updater); - void Promise.resolve(result).catch((error) => { - logger.debug(`${logPrefix} Failed to update session metadata (${reason}) (non-fatal)`, error); - }); - } catch (error) { - logger.debug(`${logPrefix} Failed to update session metadata (${reason}) (non-fatal)`, error); - } + void withRetry( + () => session.updateMetadata(updater), + (error, attempt, isFinal) => { + if (isFinal) { + logger.debug(`${logPrefix} Failed to update session metadata (${reason}) after ${BEST_EFFORT_MAX_ATTEMPTS} attempts (non-fatal)`, error); + } else { + logger.debug(`${logPrefix} Failed to update session metadata (${reason}), retrying (attempt ${attempt}/${BEST_EFFORT_MAX_ATTEMPTS}) (non-fatal)`, error); + } + }, + ); } From fb625126426a9e6f20104509aa33dea04f8651ac Mon Sep 17 00:00:00 2001 From: Ebenge Usip
<1551356+eusip@users.noreply.github.com> Date: Thu, 23 Apr 2026 18:51:13 +0200 Subject: [PATCH 2/2] =?UTF-8?q?fix:=20address=20PR=20review=20=E2=80=94=20?= =?UTF-8?q?unref=20timer,=20add=20docstrings,=20couple=20delay/attempts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Derive BEST_EFFORT_MAX_ATTEMPTS from BEST_EFFORT_RETRY_DELAYS_MS.length + 1 so the two constants can't silently drift out of sync - unref() the retry timer so pending best-effort retries never hold the Node process open past daemon shutdown - Add JSDoc to withRetry, updateAgentStateBestEffort, updateMetadataBestEffort to satisfy docstring coverage requirement; document intentional retry-all behaviour --- .../api/session/sessionWritesBestEffort.ts | 49 +++++++++++++++++-- 1 file changed, 44 insertions(+), 5 deletions(-) diff --git a/apps/cli/src/api/session/sessionWritesBestEffort.ts b/apps/cli/src/api/session/sessionWritesBestEffort.ts index 2a76a7319..9d1de21e7 100644 --- a/apps/cli/src/api/session/sessionWritesBestEffort.ts +++ b/apps/cli/src/api/session/sessionWritesBestEffort.ts @@ -1,9 +1,30 @@ import type { AgentState, Metadata } from '@/api/types'; import { logger } from '@/ui/logger'; -const BEST_EFFORT_MAX_ATTEMPTS = 3; -const BEST_EFFORT_RETRY_DELAYS_MS = [1_000, 2_000]; +/** + * Delays between successive retry attempts (ms). The number of attempts is + * derived from this array: total attempts = delays.length + 1 (the initial + * attempt plus one attempt per delay interval). Keeping the two values coupled + * prevents silent drift if the array is extended without updating a separate + * attempts constant. + * + * All errors are retried unconditionally. This is intentional: the helpers are + * best-effort fire-and-forget writes; distinguishing transient vs permanent + * failures adds complexity that is not warranted here. 
+ */ +const BEST_EFFORT_RETRY_DELAYS_MS = [1_000, 2_000] as const; +const BEST_EFFORT_MAX_ATTEMPTS = BEST_EFFORT_RETRY_DELAYS_MS.length + 1; +/** + * Calls `fn` up to `BEST_EFFORT_MAX_ATTEMPTS` times, waiting + * `BEST_EFFORT_RETRY_DELAYS_MS[n - 1]` ms after the n-th failed attempt. + * + * Retry timers are unref'd so they never prevent the Node process from + * exiting if the event loop would otherwise be empty (e.g. during daemon + * shutdown). `onFailure` is invoked after every failed attempt with the error, + * the 1-based attempt number, and a flag indicating whether this was the final + * attempt. + */ async function withRetry( fn: () => Promise<void> | void, onFailure: (error: unknown, attempt: number, isFinal: boolean) => void, @@ -16,14 +37,22 @@ async function withRetry( const isFinal = attempt >= BEST_EFFORT_MAX_ATTEMPTS; onFailure(error, attempt, isFinal); if (!isFinal) { - await new Promise((resolve) => - setTimeout(resolve, BEST_EFFORT_RETRY_DELAYS_MS[attempt - 1]), - ); + await new Promise((resolve) => { + const timer = setTimeout(resolve, BEST_EFFORT_RETRY_DELAYS_MS[attempt - 1]); + // Do not keep the CLI alive solely for a best-effort retry. + timer.unref?.(); + }); } } } } +/** + * Fires a best-effort `updateAgentState` call, making up to + * `BEST_EFFORT_MAX_ATTEMPTS` attempts in total. Errors from the update are + * not propagated to the caller; every failed attempt, including the final + * one, is logged at debug level. + */ export function updateAgentStateBestEffort( session: Readonly<{ updateAgentState: (updater: (state: AgentState) => AgentState) => Promise<void> | void }>, updater: (state: AgentState) => AgentState, @@ -42,6 +71,16 @@ export function updateAgentStateBestEffort( ); } +/** + * Fires a best-effort `updateMetadata` call, making up to + * `BEST_EFFORT_MAX_ATTEMPTS` attempts in total. Errors from the update are + * not propagated to the caller; every failed attempt, including the final + * one, is logged at debug level. + * + * This is the write path for vendor session IDs (e.g.
`claudeSessionId`). + * Retrying here makes sessions resumable even when the initial write races a + * brief network hiccup at session start. + */ export function updateMetadataBestEffort( session: Readonly<{ updateMetadata: (updater: (metadata: Metadata) => Metadata) => Promise<void> | void }>, updater: (metadata: Metadata) => Metadata,