From b56eb882bb88f78c986540d055ea605ecf766483 Mon Sep 17 00:00:00 2001 From: Iouri Goussev Date: Thu, 26 Mar 2026 13:33:54 -0400 Subject: [PATCH 1/3] =?UTF-8?q?refactor(gsd):=20unify=20three=20overlappin?= =?UTF-8?q?g=20error=20classifiers=20into=20single=20classify=E2=86=92deci?= =?UTF-8?q?de=E2=86=92act=20pipeline?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three independent error classifiers (isTransientNetworkError, classifyProviderError, and inline network-retry logic) had diverging counters and duplicate regex coverage. Every new edge case required patching a different classifier. Replaces all three with a single classifyError() returning a discriminated union ErrorClass, one RetryState object with explicit lifecycle, and a clean classify→decide→act flow in handleAgentEnd. Behavioral change: rate-limit errors no longer trigger model fallback — throttling is a provider issue, switching models on the same provider is pointless. Closes #2577 --- .../gsd/bootstrap/agent-end-recovery.ts | 187 +++++++++++------- .../extensions/gsd/error-classifier.ts | 139 +++++++++++++ .../extensions/gsd/preferences-models.ts | 12 -- src/resources/extensions/gsd/preferences.ts | 1 - .../extensions/gsd/provider-error-pause.ts | 57 ------ .../gsd/tests/provider-errors.test.ts | 140 ++++++------- .../gsd/tests/terminated-transient.test.ts | 75 ++++--- 7 files changed, 368 insertions(+), 243 deletions(-) create mode 100644 src/resources/extensions/gsd/error-classifier.ts diff --git a/src/resources/extensions/gsd/bootstrap/agent-end-recovery.ts b/src/resources/extensions/gsd/bootstrap/agent-end-recovery.ts index e9c018101..302671da4 100644 --- a/src/resources/extensions/gsd/bootstrap/agent-end-recovery.ts +++ b/src/resources/extensions/gsd/bootstrap/agent-end-recovery.ts @@ -2,14 +2,51 @@ import type { ExtensionAPI, ExtensionContext } from "@gsd/pi-coding-agent"; import { checkAutoStartAfterDiscuss } from "../guided-flow.js"; import { 
getAutoDashboardData, getAutoModeStartModel, isAutoActive, pauseAuto } from "../auto.js"; -import { getNextFallbackModel, isTransientNetworkError, resolveModelWithFallbacksForUnit } from "../preferences.js"; -import { classifyProviderError, pauseAutoForProviderError } from "../provider-error-pause.js"; +import { getNextFallbackModel, resolveModelWithFallbacksForUnit } from "../preferences.js"; +import { pauseAutoForProviderError } from "../provider-error-pause.js"; import { isSessionSwitchInFlight, resolveAgentEnd } from "../auto-loop.js"; +import { resolveModelId } from "../auto-model-selection.js"; import { clearDiscussionFlowState } from "./write-gate.js"; +import { + classifyError, + createRetryState, + resetRetryState, + isTransient, + type ErrorClass, +} from "../error-classifier.js"; -const networkRetryCounters = new Map(); +const retryState = createRetryState(); +const MAX_NETWORK_RETRIES = 2; const MAX_TRANSIENT_AUTO_RESUMES = 3; -let consecutiveTransientErrors = 0; + +async function pauseTransientWithBackoff( + cls: ErrorClass, + pi: ExtensionAPI, + ctx: ExtensionContext, + errorDetail: string, + isRateLimit: boolean, +): Promise { + retryState.consecutiveTransientCount += 1; + const baseRetryAfterMs = "retryAfterMs" in cls ? cls.retryAfterMs : 15_000; + const retryAfterMs = baseRetryAfterMs * 2 ** Math.max(0, retryState.consecutiveTransientCount - 1); + const allowAutoResume = retryState.consecutiveTransientCount <= MAX_TRANSIENT_AUTO_RESUMES; + if (!allowAutoResume) { + ctx.ui.notify(`Transient provider errors persisted after ${MAX_TRANSIENT_AUTO_RESUMES} auto-resume attempts. Pausing for manual review.`, "warning"); + } + await pauseAutoForProviderError(ctx.ui, errorDetail, () => pauseAuto(ctx, pi), { + isRateLimit, + isTransient: allowAutoResume, + retryAfterMs, + resume: allowAutoResume + ? 
() => { + pi.sendMessage( + { customType: "gsd-auto-timeout-recovery", content: "Continue execution — provider error recovery delay elapsed.", display: false }, + { triggerTurn: true }, + ); + } + : undefined, + }); +} export async function handleAgentEnd( pi: ExtensionAPI, @@ -31,17 +68,26 @@ export async function handleAgentEnd( if (lastMsg && "stopReason" in lastMsg && lastMsg.stopReason === "error") { const errorDetail = "errorMessage" in lastMsg && lastMsg.errorMessage ? `: ${lastMsg.errorMessage}` : ""; const errorMsg = ("errorMessage" in lastMsg && lastMsg.errorMessage) ? String(lastMsg.errorMessage) : ""; + const explicitRetryAfterMs = ("retryAfterMs" in lastMsg && typeof lastMsg.retryAfterMs === "number") ? lastMsg.retryAfterMs : undefined; - if (isTransientNetworkError(errorMsg)) { + // ── 1. Classify ────────────────────────────────────────────────────── + const cls = classifyError(errorMsg, explicitRetryAfterMs); + + // ── 2. Decide & Act ────────────────────────────────────────────────── + + // --- Network errors: same-model retry with backoff --- + if (cls.kind === "network") { const currentModelId = ctx.model?.id ?? "unknown"; - const retryKey = `network-retry:${currentModelId}`; - const currentRetries = networkRetryCounters.get(retryKey) ?? 0; - const maxRetries = 2; - if (currentRetries < maxRetries) { - networkRetryCounters.set(retryKey, currentRetries + 1); - const attempt = currentRetries + 1; - const delayMs = attempt * 3000; - ctx.ui.notify(`Network error on ${currentModelId}${errorDetail}. 
Retry ${attempt}/${maxRetries} in ${delayMs / 1000}s...`, "warning"); + if (retryState.currentRetryModelId !== currentModelId) { + retryState.networkRetryCount = 0; + retryState.currentRetryModelId = currentModelId; + } + if (retryState.networkRetryCount < MAX_NETWORK_RETRIES) { + retryState.networkRetryCount += 1; + retryState.consecutiveTransientCount += 1; + const attempt = retryState.networkRetryCount; + const delayMs = attempt * cls.retryAfterMs; + ctx.ui.notify(`Network error on ${currentModelId}${errorDetail}. Retry ${attempt}/${MAX_NETWORK_RETRIES} in ${delayMs / 1000}s...`, "warning"); setTimeout(() => { pi.sendMessage( { customType: "gsd-auto-timeout-recovery", content: "Continue execution — retrying after transient network error.", display: false }, @@ -50,26 +96,56 @@ export async function handleAgentEnd( }, delayMs); return; } - networkRetryCounters.delete(retryKey); + // Network retries exhausted — fall through to model fallback + retryState.networkRetryCount = 0; + retryState.currentRetryModelId = undefined; ctx.ui.notify(`Network retries exhausted for ${currentModelId}. Attempting model fallback.`, "warning"); } - const dash = getAutoDashboardData(); - if (dash.currentUnit) { - const modelConfig = resolveModelWithFallbacksForUnit(dash.currentUnit.type); - if (modelConfig && modelConfig.fallbacks.length > 0) { - const availableModels = ctx.modelRegistry.getAvailable(); - const nextModelId = getNextFallbackModel(ctx.model?.id, modelConfig); - if (nextModelId) { - networkRetryCounters.clear(); - const slashIdx = nextModelId.indexOf("/"); - const modelToSet = slashIdx !== -1 - ? availableModels.find((m) => m.provider.toLowerCase() === nextModelId.substring(0, slashIdx).toLowerCase() && m.id.toLowerCase() === nextModelId.substring(slashIdx + 1).toLowerCase()) - : (availableModels.find((m) => m.id === nextModelId && m.provider === ctx.model?.provider) ?? 
availableModels.find((m) => m.id === nextModelId)); - if (modelToSet) { - const ok = await pi.setModel(modelToSet, { persist: false }); + // --- Rate limit: skip model fallback, go straight to pause --- + // Rate-limiting is a provider issue, not a model issue. + // Switching models won't help if the provider is throttling you. + if (cls.kind === "rate-limit") { + await pauseTransientWithBackoff(cls, pi, ctx, errorDetail, true); + return; + } + + // --- Network (retries exhausted) / server / connection / stream: try model fallback first --- + if (cls.kind === "network" || cls.kind === "server" || cls.kind === "connection" || cls.kind === "stream") { + // Try model fallback + const dash = getAutoDashboardData(); + if (dash.currentUnit) { + const modelConfig = resolveModelWithFallbacksForUnit(dash.currentUnit.type); + if (modelConfig && modelConfig.fallbacks.length > 0) { + const availableModels = ctx.modelRegistry.getAvailable(); + const nextModelId = getNextFallbackModel(ctx.model?.id, modelConfig); + if (nextModelId) { + retryState.networkRetryCount = 0; + retryState.currentRetryModelId = undefined; + const modelToSet = resolveModelId(nextModelId, availableModels, ctx.model?.provider); + if (modelToSet) { + const ok = await pi.setModel(modelToSet, { persist: false }); + if (ok) { + ctx.ui.notify(`Model error${errorDetail}. 
Switched to fallback: ${nextModelId} and resuming.`, "warning"); + pi.sendMessage({ customType: "gsd-auto-timeout-recovery", content: "Continue execution.", display: false }, { triggerTurn: true }); + return; + } + } + } + } + } + + // Try restoring session model + const sessionModel = getAutoModeStartModel(); + if (sessionModel) { + if (ctx.model?.id !== sessionModel.id || ctx.model?.provider !== sessionModel.provider) { + const startModel = ctx.modelRegistry.getAvailable().find((m) => m.provider === sessionModel.provider && m.id === sessionModel.id); + if (startModel) { + const ok = await pi.setModel(startModel, { persist: false }); if (ok) { - ctx.ui.notify(`Model error${errorDetail}. Switched to fallback: ${nextModelId} and resuming.`, "warning"); + retryState.networkRetryCount = 0; + retryState.currentRetryModelId = undefined; + ctx.ui.notify(`Model error${errorDetail}. Restored session model: ${sessionModel.provider}/${sessionModel.id} and resuming.`, "warning"); pi.sendMessage({ customType: "gsd-auto-timeout-recovery", content: "Continue execution.", display: false }, { triggerTurn: true }); return; } @@ -78,56 +154,24 @@ export async function handleAgentEnd( } } - const sessionModel = getAutoModeStartModel(); - if (sessionModel) { - if (ctx.model?.id !== sessionModel.id || ctx.model?.provider !== sessionModel.provider) { - const startModel = ctx.modelRegistry.getAvailable().find((m) => m.provider === sessionModel.provider && m.id === sessionModel.id); - if (startModel) { - const ok = await pi.setModel(startModel, { persist: false }); - if (ok) { - networkRetryCounters.clear(); - ctx.ui.notify(`Model error${errorDetail}. 
Restored session model: ${sessionModel.provider}/${sessionModel.id} and resuming.`, "warning"); - pi.sendMessage({ customType: "gsd-auto-timeout-recovery", content: "Continue execution.", display: false }, { triggerTurn: true }); - return; - } - } - } + // --- Transient fallback: pause with auto-resume --- + if (isTransient(cls)) { + await pauseTransientWithBackoff(cls, pi, ctx, errorDetail, false); + return; } - const classification = classifyProviderError(errorMsg); - const explicitRetryAfterMs = ("retryAfterMs" in lastMsg && typeof lastMsg.retryAfterMs === "number") ? lastMsg.retryAfterMs : undefined; - if (classification.isTransient) { - consecutiveTransientErrors += 1; - } else { - consecutiveTransientErrors = 0; - } - const baseRetryAfterMs = explicitRetryAfterMs ?? classification.suggestedDelayMs; - const retryAfterMs = classification.isTransient - ? baseRetryAfterMs * 2 ** Math.max(0, consecutiveTransientErrors - 1) - : baseRetryAfterMs; - const allowAutoResume = classification.isTransient && consecutiveTransientErrors <= MAX_TRANSIENT_AUTO_RESUMES; - if (classification.isTransient && !allowAutoResume) { - ctx.ui.notify(`Transient provider errors persisted after ${MAX_TRANSIENT_AUTO_RESUMES} auto-resume attempts. Pausing for manual review.`, "warning"); - } + // --- Permanent / unknown: pause indefinitely --- await pauseAutoForProviderError(ctx.ui, errorDetail, () => pauseAuto(ctx, pi), { - isRateLimit: classification.isRateLimit, - isTransient: allowAutoResume, - retryAfterMs, - resume: allowAutoResume - ? 
() => { - pi.sendMessage( - { customType: "gsd-auto-timeout-recovery", content: "Continue execution — provider error recovery delay elapsed.", display: false }, - { triggerTurn: true }, - ); - } - : undefined, + isRateLimit: false, + isTransient: false, + retryAfterMs: 0, }); return; } + // ── Success path ───────────────────────────────────────────────────────── try { - consecutiveTransientErrors = 0; - networkRetryCounters.clear(); + resetRetryState(retryState); resolveAgentEnd(event); } catch (err) { const message = err instanceof Error ? err.message : String(err); @@ -139,4 +183,3 @@ export async function handleAgentEnd( } } } - diff --git a/src/resources/extensions/gsd/error-classifier.ts b/src/resources/extensions/gsd/error-classifier.ts new file mode 100644 index 000000000..786a5c5d6 --- /dev/null +++ b/src/resources/extensions/gsd/error-classifier.ts @@ -0,0 +1,139 @@ +/** + * Unified error classifier for provider/network/server errors. + * + * Consolidates patterns from: + * - isTransientNetworkError() in preferences-models.ts + * - classifyProviderError() in provider-error-pause.ts + * + * Single entry point: classifyError(errorMsg, retryAfterMs?) 
+ * + * @see https://github.com/gsd-build/gsd/issues/2577 + */ + +// ── ErrorClass discriminated union ────────────────────────────────────────── + +export type ErrorClass = + | { kind: "network"; retryAfterMs: number } + | { kind: "rate-limit"; retryAfterMs: number } + | { kind: "server"; retryAfterMs: number } + | { kind: "stream"; retryAfterMs: number } + | { kind: "connection"; retryAfterMs: number } + | { kind: "model-error" } + | { kind: "permanent" } + | { kind: "unknown" }; + +// ── RetryState ────────────────────────────────────────────────────────────── + +export interface RetryState { + networkRetryCount: number; + consecutiveTransientCount: number; + currentRetryModelId: string | undefined; +} + +export function createRetryState(): RetryState { + return { networkRetryCount: 0, consecutiveTransientCount: 0, currentRetryModelId: undefined }; +} + +export function resetRetryState(state: RetryState): void { + state.networkRetryCount = 0; + state.consecutiveTransientCount = 0; + state.currentRetryModelId = undefined; +} + +// ── Classification ────────────────────────────────────────────────────────── + +const PERMANENT_RE = /auth|unauthorized|forbidden|invalid.*key|invalid.*api|billing|quota exceeded|account/i; +const RATE_LIMIT_RE = /rate.?limit|too many requests|429/i; +const NETWORK_RE = /network|ECONNRESET|ETIMEDOUT|ECONNREFUSED|socket hang up|fetch failed|connection.*reset|dns/i; +const SERVER_RE = /internal server error|500|502|503|overloaded|server_error|api_error|service.?unavailable/i; +// ECONNRESET/ECONNREFUSED are in NETWORK_RE (same-model retry first). +const CONNECTION_RE = /terminated|connection.?refused|other side closed|EPIPE|network.?(?:is\s+)?unavailable/i; +const STREAM_RE = /Unexpected end of JSON|Unexpected token.*JSON|Expected double-quoted property name|SyntaxError.*JSON/i; +const RESET_DELAY_RE = /reset in (\d+)s/i; + +/** + * Classify an error message into one of the ErrorClass kinds. + * + * Classification order: + * 1. 
Permanent (auth/billing/quota) — unless also rate-limited + * 2. Rate limit (429, rate.?limit, too many requests) + * 3. Network (ECONNRESET, ETIMEDOUT, ECONNREFUSED, socket hang up, fetch failed, dns) + * 4. Server (500/502/503, overloaded, server_error) + * 5. Connection (terminated, connection refused, EPIPE, other side closed) + * 6. Stream truncation (malformed JSON from mid-stream cut) + * 7. Unknown + */ +export function classifyError(errorMsg: string, retryAfterMs?: number): ErrorClass { + const isPermanent = PERMANENT_RE.test(errorMsg); + const isRateLimit = RATE_LIMIT_RE.test(errorMsg); + + // 1. Permanent — but rate limit takes precedence + if (isPermanent && !isRateLimit) { + return { kind: "permanent" }; + } + + // 2. Rate limit + if (isRateLimit) { + if (retryAfterMs != null && retryAfterMs > 0) { + return { kind: "rate-limit", retryAfterMs }; + } + const resetMatch = errorMsg.match(RESET_DELAY_RE); + const delayMs = resetMatch ? Number(resetMatch[1]) * 1000 : 60_000; + return { kind: "rate-limit", retryAfterMs: delayMs }; + } + + // 3. Network errors — same-model retry candidate + if (NETWORK_RE.test(errorMsg)) { + // Permanent overlaps (e.g. "billing" alongside "network") are already + // excluded by step 1 above, and rate limits by step 2 — anything + // matching NETWORK_RE here is a genuine same-model retry candidate. + return { kind: "network", retryAfterMs: retryAfterMs ?? 3_000 }; + } + + // 4. Server errors — try fallback model + if (SERVER_RE.test(errorMsg)) { + return { kind: "server", retryAfterMs: retryAfterMs ?? 30_000 }; + } + + // 5. Connection errors — try fallback model + if (CONNECTION_RE.test(errorMsg)) { + return { kind: "connection", retryAfterMs: retryAfterMs ?? 15_000 }; + } + + // 6. Stream truncation — downstream symptom of connection drop + if (STREAM_RE.test(errorMsg)) { + return { kind: "stream", retryAfterMs: retryAfterMs ?? 15_000 }; + } + + // 7. 
Unknown + return { kind: "unknown" }; +} + +// ── Helpers ───────────────────────────────────────────────────────────────── + +/** Returns true for all transient (auto-resumable) error kinds. */ +export function isTransient(cls: ErrorClass): boolean { + switch (cls.kind) { + case "network": + case "rate-limit": + case "server": + case "stream": + case "connection": + return true; + default: + return false; + } +} + +/** + * Backward-compatible thin wrapper. + * + * Returns true when the error is a transient *network* error specifically + * (worth retrying the same model). Permanent signals (auth, billing, quota) + * cause this to return false even if a network keyword is present. + */ +export function isTransientNetworkError(errorMsg: string): boolean { + if (!errorMsg) return false; + const cls = classifyError(errorMsg); + return cls.kind === "network"; +} diff --git a/src/resources/extensions/gsd/preferences-models.ts b/src/resources/extensions/gsd/preferences-models.ts index 303c43470..9e393b06d 100644 --- a/src/resources/extensions/gsd/preferences-models.ts +++ b/src/resources/extensions/gsd/preferences-models.ts @@ -125,18 +125,6 @@ export function getNextFallbackModel( } } -/** - * Detect whether an error message indicates a transient network error - * (worth retrying the same model) vs a permanent provider error - * (auth failure, quota exceeded, etc. -- should fall back immediately). - */ -export function isTransientNetworkError(errorMsg: string): boolean { - if (!errorMsg) return false; - const hasNetworkSignal = /network|ECONNRESET|ETIMEDOUT|ECONNREFUSED|socket hang up|fetch failed|connection.*reset|dns/i.test(errorMsg); - const hasPermanentSignal = /auth|unauthorized|forbidden|invalid.*key|quota|billing/i.test(errorMsg); - return hasNetworkSignal && !hasPermanentSignal; -} - /** * Validate a model ID string. * Returns true if the ID looks like a valid model identifier. 
diff --git a/src/resources/extensions/gsd/preferences.ts b/src/resources/extensions/gsd/preferences.ts index 0b0b82927..d13560580 100644 --- a/src/resources/extensions/gsd/preferences.ts +++ b/src/resources/extensions/gsd/preferences.ts @@ -68,7 +68,6 @@ export { resolveModelForUnit, resolveModelWithFallbacksForUnit, getNextFallbackModel, - isTransientNetworkError, validateModelId, updatePreferencesModels, resolveDynamicRoutingConfig, diff --git a/src/resources/extensions/gsd/provider-error-pause.ts b/src/resources/extensions/gsd/provider-error-pause.ts index 7a5414999..f184a691d 100644 --- a/src/resources/extensions/gsd/provider-error-pause.ts +++ b/src/resources/extensions/gsd/provider-error-pause.ts @@ -2,63 +2,6 @@ export type ProviderErrorPauseUI = { notify(message: string, level?: "info" | "warning" | "error" | "success"): void; }; -/** - * Classify a provider error as transient (auto-resume) or permanent (manual resume). - * - * Transient: rate limits, server errors (500/502/503), overloaded, internal errors. - * These are expected to self-resolve and should auto-resume after a delay. - * - * Permanent: auth errors, invalid API key, billing issues. - * These require user intervention and should pause indefinitely. - */ -export function classifyProviderError(errorMsg: string): { - isTransient: boolean; - isRateLimit: boolean; - suggestedDelayMs: number; -} { - const isRateLimit = /rate.?limit|too many requests|429/i.test(errorMsg); - const isServerError = /internal server error|500|502|503|overloaded|server_error|api_error|service.?unavailable/i.test(errorMsg); - - // Connection/process errors — transient, auto-resume after brief backoff (#2309). - // These indicate the process was killed, the connection was reset, or a network - // blip occurred. They are NOT permanent failures. 
- const isConnectionError = /terminated|connection.?reset|connection.?refused|other side closed|fetch failed|network.?(?:is\s+)?unavailable|ECONNREFUSED|ECONNRESET|EPIPE/i.test(errorMsg); - - // Permanent errors — never auto-resume - const isPermanent = /auth|unauthorized|forbidden|invalid.*key|invalid.*api|billing|quota exceeded|account/i.test(errorMsg); - - if (isPermanent && !isRateLimit) { - return { isTransient: false, isRateLimit: false, suggestedDelayMs: 0 }; - } - - if (isRateLimit) { - // Try to extract retry-after from the message - const resetMatch = errorMsg.match(/reset in (\d+)s/i); - const delayMs = resetMatch ? Number(resetMatch[1]) * 1000 : 60_000; // default 60s for rate limits - return { isTransient: true, isRateLimit: true, suggestedDelayMs: delayMs }; - } - - if (isServerError) { - return { isTransient: true, isRateLimit: false, suggestedDelayMs: 30_000 }; // 30s for server errors - } - - if (isConnectionError) { - return { isTransient: true, isRateLimit: false, suggestedDelayMs: 15_000 }; // 15s for connection errors - } - - // Stream-truncation JSON parse errors — transient (#2572). - // When the API stream is cut mid-chunk, pi tries to reassemble the partial - // tool-call JSON and gets a SyntaxError. This is the downstream symptom of - // a connection drop — same root cause as ECONNRESET, one layer up. - const isMalformedStream = /Unexpected end of JSON|Unexpected token.*JSON|Expected double-quoted property name|SyntaxError.*JSON/i.test(errorMsg); - if (isMalformedStream) { - return { isTransient: true, isRateLimit: false, suggestedDelayMs: 15_000 }; // 15s, same as connection errors - } - - // Unknown error — treat as permanent (user reviews) - return { isTransient: false, isRateLimit: false, suggestedDelayMs: 0 }; -} - /** * Pause auto-mode due to a provider error. 
* diff --git a/src/resources/extensions/gsd/tests/provider-errors.test.ts b/src/resources/extensions/gsd/tests/provider-errors.test.ts index 0512b4d90..901e9a0c7 100644 --- a/src/resources/extensions/gsd/tests/provider-errors.test.ts +++ b/src/resources/extensions/gsd/tests/provider-errors.test.ts @@ -1,6 +1,6 @@ /** * Provider error handling tests — consolidated from: - * - provider-error-classify.test.ts (classifyProviderError) + * - provider-error-classify.test.ts (classifyError) * - network-error-fallback.test.ts (isTransientNetworkError, getNextFallbackModel) * - agent-end-provider-error.test.ts (pauseAutoForProviderError) */ @@ -10,102 +10,104 @@ import assert from "node:assert/strict"; import { readFileSync } from "node:fs"; import { join, dirname } from "node:path"; import { fileURLToPath } from "node:url"; -import { classifyProviderError, pauseAutoForProviderError } from "../provider-error-pause.ts"; -import { getNextFallbackModel, isTransientNetworkError } from "../preferences.ts"; +import { classifyError, isTransient, isTransientNetworkError } from "../error-classifier.ts"; +import { pauseAutoForProviderError } from "../provider-error-pause.ts"; +import { getNextFallbackModel } from "../preferences.ts"; const __dirname = dirname(fileURLToPath(import.meta.url)); -// ── classifyProviderError ──────────────────────────────────────────────────── +// ── classifyError ──────────────────────────────────────────────────────────── -test("classifyProviderError detects rate limit from 429", () => { - const result = classifyProviderError("HTTP 429 Too Many Requests"); - assert.ok(result.isTransient); - assert.ok(result.isRateLimit); - assert.ok(result.suggestedDelayMs > 0); +test("classifyError detects rate limit from 429", () => { + const result = classifyError("HTTP 429 Too Many Requests"); + assert.ok(isTransient(result)); + assert.equal(result.kind, "rate-limit"); + assert.ok("retryAfterMs" in result && result.retryAfterMs > 0); }); -test("classifyProviderError 
detects rate limit from message", () => { - const result = classifyProviderError("rate limit exceeded"); - assert.ok(result.isTransient); - assert.ok(result.isRateLimit); +test("classifyError detects rate limit from message", () => { + const result = classifyError("rate limit exceeded"); + assert.ok(isTransient(result)); + assert.equal(result.kind, "rate-limit"); }); -test("classifyProviderError extracts reset delay from message", () => { - const result = classifyProviderError("rate limit exceeded, reset in 45s"); - assert.ok(result.isRateLimit); - assert.equal(result.suggestedDelayMs, 45000); +test("classifyError extracts reset delay from message", () => { + const result = classifyError("rate limit exceeded, reset in 45s"); + assert.equal(result.kind, "rate-limit"); + assert.ok("retryAfterMs" in result && result.retryAfterMs === 45000); }); -test("classifyProviderError defaults to 60s for rate limit without reset", () => { - const result = classifyProviderError("429 too many requests"); - assert.ok(result.isRateLimit); - assert.equal(result.suggestedDelayMs, 60_000); +test("classifyError defaults to 60s for rate limit without reset", () => { + const result = classifyError("429 too many requests"); + assert.equal(result.kind, "rate-limit"); + assert.ok("retryAfterMs" in result && result.retryAfterMs === 60_000); }); -test("classifyProviderError detects Anthropic internal server error", () => { +test("classifyError detects Anthropic internal server error", () => { const msg = '{"type":"error","error":{"details":null,"type":"api_error","message":"Internal server error"}}'; - const result = classifyProviderError(msg); - assert.ok(result.isTransient); - assert.ok(!result.isRateLimit); - assert.equal(result.suggestedDelayMs, 30_000); + const result = classifyError(msg); + assert.ok(isTransient(result)); + assert.equal(result.kind, "server"); + assert.ok("retryAfterMs" in result && result.retryAfterMs === 30_000); }); -test("classifyProviderError detects Codex 
server_error from extracted message", () => { +test("classifyError detects Codex server_error from extracted message", () => { // After fix, mapCodexEvents extracts the nested error type and produces // "Codex server_error: " instead of raw JSON. const msg = "Codex server_error: An error occurred while processing your request."; - const result = classifyProviderError(msg); - assert.ok(result.isTransient); - assert.ok(!result.isRateLimit); - assert.equal(result.suggestedDelayMs, 30_000); + const result = classifyError(msg); + assert.ok(isTransient(result)); + assert.equal(result.kind, "server"); + assert.ok("retryAfterMs" in result && result.retryAfterMs === 30_000); }); -test("classifyProviderError detects overloaded error", () => { - const result = classifyProviderError("overloaded_error: Overloaded"); - assert.ok(result.isTransient); - assert.equal(result.suggestedDelayMs, 30_000); +test("classifyError detects overloaded error", () => { + const result = classifyError("overloaded_error: Overloaded"); + assert.ok(isTransient(result)); + assert.ok("retryAfterMs" in result && result.retryAfterMs === 30_000); }); -test("classifyProviderError detects 503 service unavailable", () => { - const result = classifyProviderError("HTTP 503 Service Unavailable"); - assert.ok(result.isTransient); +test("classifyError detects 503 service unavailable", () => { + const result = classifyError("HTTP 503 Service Unavailable"); + assert.ok(isTransient(result)); }); -test("classifyProviderError detects 502 bad gateway", () => { - const result = classifyProviderError("HTTP 502 Bad Gateway"); - assert.ok(result.isTransient); +test("classifyError detects 502 bad gateway", () => { + const result = classifyError("HTTP 502 Bad Gateway"); + assert.ok(isTransient(result)); }); -test("classifyProviderError detects auth error as permanent", () => { - const result = classifyProviderError("unauthorized: invalid API key"); - assert.ok(!result.isTransient); - assert.ok(!result.isRateLimit); 
+test("classifyError detects auth error as permanent", () => { + const result = classifyError("unauthorized: invalid API key"); + assert.ok(!isTransient(result)); + assert.equal(result.kind, "permanent"); }); -test("classifyProviderError detects billing error as permanent", () => { - const result = classifyProviderError("billing issue: payment required"); - assert.ok(!result.isTransient); +test("classifyError detects billing error as permanent", () => { + const result = classifyError("billing issue: payment required"); + assert.ok(!isTransient(result)); }); -test("classifyProviderError detects quota exceeded as permanent", () => { - const result = classifyProviderError("quota exceeded for this month"); - assert.ok(!result.isTransient); +test("classifyError detects quota exceeded as permanent", () => { + const result = classifyError("quota exceeded for this month"); + assert.ok(!isTransient(result)); }); -test("classifyProviderError treats unknown error as permanent", () => { - const result = classifyProviderError("something went wrong"); - assert.ok(!result.isTransient); +test("classifyError treats unknown error as not transient", () => { + const result = classifyError("something went wrong"); + assert.ok(!isTransient(result)); + assert.equal(result.kind, "unknown"); }); -test("classifyProviderError treats empty string as permanent", () => { - const result = classifyProviderError(""); - assert.ok(!result.isTransient); +test("classifyError treats empty string as not transient", () => { + const result = classifyError(""); + assert.ok(!isTransient(result)); }); -test("classifyProviderError: rate limit takes precedence over auth keywords", () => { - const result = classifyProviderError("429 unauthorized rate limit"); - assert.ok(result.isRateLimit); - assert.ok(result.isTransient); +test("classifyError: rate limit takes precedence over auth keywords", () => { + const result = classifyError("429 unauthorized rate limit"); + assert.equal(result.kind, "rate-limit"); + 
assert.ok(isTransient(result)); }); // ── isTransientNetworkError ────────────────────────────────────────────────── @@ -265,8 +267,8 @@ test("agent-end-recovery.ts tracks consecutive transient errors for escalating b const src = readFileSync(join(__dirname, "..", "bootstrap", "agent-end-recovery.ts"), "utf-8"); assert.ok( - src.includes("consecutiveTransientErrors"), - "agent-end-recovery.ts must track consecutiveTransientErrors for escalating backoff (#1166)", + src.includes("consecutiveTransientCount"), + "agent-end-recovery.ts must track consecutiveTransientCount for escalating backoff (#1166)", ); assert.ok( src.includes("MAX_TRANSIENT_AUTO_RESUMES"), @@ -274,15 +276,13 @@ test("agent-end-recovery.ts tracks consecutive transient errors for escalating b ); }); -test("agent-end-recovery.ts resets consecutive transient error counter on success", () => { +test("agent-end-recovery.ts resets retry state before resolveAgentEnd on success", () => { const src = readFileSync(join(__dirname, "..", "bootstrap", "agent-end-recovery.ts"), "utf-8"); - // After successful agent_end (before resolveAgentEnd), the counter must be reset. - // Use a regex across the success block so CRLF checkouts on Windows do not - // push the reset line outside a fixed substring window. + // After successful agent_end, resetRetryState must be called before resolveAgentEnd. 
assert.ok( - /consecutiveTransientErrors\s*=\s*0\s*;[\s\S]{0,250}resolveAgentEnd/.test(src), - "consecutive transient error counter must be reset before resolveAgentEnd on the success path (#1166)", + /resetRetryState[\s\S]{0,250}resolveAgentEnd/.test(src), + "resetRetryState must be called before resolveAgentEnd on the success path (#1166)", ); }); @@ -291,7 +291,7 @@ test("agent-end-recovery.ts applies escalating delay for repeated transient erro // Must contain the exponential backoff formula (may span multiple lines) assert.ok( - src.includes("2 ** Math.max(0, consecutiveTransientErrors"), + src.includes("2 ** Math.max(0, retryState.consecutiveTransientCount"), "agent-end-recovery.ts must escalate retryAfterMs exponentially for consecutive transient errors (#1166)", ); }); diff --git a/src/resources/extensions/gsd/tests/terminated-transient.test.ts b/src/resources/extensions/gsd/tests/terminated-transient.test.ts index 52d178625..4c8aa91a7 100644 --- a/src/resources/extensions/gsd/tests/terminated-transient.test.ts +++ b/src/resources/extensions/gsd/tests/terminated-transient.test.ts @@ -1,73 +1,86 @@ /** * terminated-transient.test.ts — Regression test for #2309. * - * classifyProviderError should treat 'terminated' errors (process killed, + * classifyError should treat 'terminated' errors (process killed, * connection reset) as transient with auto-resume, not permanent. 
*/ import test from "node:test"; import assert from "node:assert/strict"; -import { classifyProviderError } from "../provider-error-pause.ts"; +import { classifyError, isTransient } from "../error-classifier.ts"; test("#2309: 'terminated' errors should be classified as transient", () => { - const result = classifyProviderError("terminated"); - assert.equal(result.isTransient, true, "'terminated' should be transient"); - assert.equal(result.isRateLimit, false, "'terminated' is not a rate limit"); - assert.ok(result.suggestedDelayMs > 0, "'terminated' should have a retry delay"); + const result = classifyError("terminated"); + assert.equal(isTransient(result), true, "'terminated' should be transient"); + assert.equal(result.kind, "connection", "'terminated' matches connection"); + assert.equal(result.kind !== "rate-limit", true, "'terminated' is not a rate limit"); + assert.ok("retryAfterMs" in result && result.retryAfterMs > 0, "'terminated' should have a retry delay"); + assert.equal("retryAfterMs" in result && result.retryAfterMs, 15_000, "'terminated' should use 15s backoff"); }); -test("#2309: 'connection reset' errors should be classified as transient", () => { - const result = classifyProviderError("connection reset by peer"); - assert.equal(result.isTransient, true, "'connection reset' should be transient"); +test("#2309: 'connection reset by peer' errors should be classified as transient (network)", () => { + const result = classifyError("connection reset by peer"); + assert.equal(isTransient(result), true, "'connection reset by peer' should be transient"); + assert.equal(result.kind, "network", "'connection reset by peer' matches NETWORK_RE (connection.*reset) before CONNECTION_RE"); + assert.equal("retryAfterMs" in result && result.retryAfterMs, 3_000, "network errors use 3s backoff"); }); test("#2309: 'other side closed' errors should be classified as transient", () => { - const result = classifyProviderError("other side closed the connection"); - 
assert.equal(result.isTransient, true, "'other side closed' should be transient"); + const result = classifyError("other side closed the connection"); + assert.equal(isTransient(result), true, "'other side closed' should be transient"); + assert.equal(result.kind, "connection", "'other side closed' matches CONNECTION_RE"); }); test("#2309: 'fetch failed' errors should be classified as transient", () => { - const result = classifyProviderError("fetch failed: network error"); - assert.equal(result.isTransient, true, "'fetch failed' should be transient"); + const result = classifyError("fetch failed: network error"); + assert.equal(isTransient(result), true, "'fetch failed' should be transient"); + assert.equal(result.kind, "network", "'fetch failed' matches NETWORK_RE"); + assert.equal("retryAfterMs" in result && result.retryAfterMs, 3_000, "network errors use 3s backoff"); }); test("#2309: 'connection refused' errors should be classified as transient", () => { - const result = classifyProviderError("ECONNREFUSED: connection refused"); - assert.equal(result.isTransient, true, "'connection refused' should be transient"); + const result = classifyError("ECONNREFUSED: connection refused"); + assert.equal(isTransient(result), true, "'connection refused' should be transient"); + assert.equal(result.kind, "network", "'ECONNREFUSED' matches NETWORK_RE (same-model retry)"); }); test("#2309: permanent errors are still permanent", () => { - const authResult = classifyProviderError("unauthorized: invalid API key"); - assert.equal(authResult.isTransient, false, "auth errors should stay permanent"); - assert.equal(authResult.suggestedDelayMs, 0, "permanent errors have no delay"); + const authResult = classifyError("unauthorized: invalid API key"); + assert.equal(isTransient(authResult), false, "auth errors should stay permanent"); + assert.equal(authResult.kind, "permanent", "auth errors are permanent"); + assert.equal("retryAfterMs" in authResult, false, "permanent errors have 
no retryAfterMs"); }); test("#2309: rate limits are still transient", () => { - const rlResult = classifyProviderError("rate limit exceeded (429)"); - assert.equal(rlResult.isTransient, true, "rate limits are still transient"); - assert.equal(rlResult.isRateLimit, true, "rate limits are flagged as rate limits"); + const rlResult = classifyError("rate limit exceeded (429)"); + assert.equal(isTransient(rlResult), true, "rate limits are still transient"); + assert.equal(rlResult.kind, "rate-limit", "rate limits are flagged as rate-limit kind"); }); // --- #2572: stream-truncation JSON parse errors should be transient --- test("#2572: 'Expected double-quoted property name' (truncated stream) is transient", () => { - const result = classifyProviderError("Expected double-quoted property name in JSON at position 23 (line 1 column 24)"); - assert.equal(result.isTransient, true, "truncated-stream JSON parse error should be transient"); - assert.equal(result.isRateLimit, false, "not a rate limit"); - assert.equal(result.suggestedDelayMs, 15_000, "should use 15s backoff like connection errors"); + const result = classifyError("Expected double-quoted property name in JSON at position 23 (line 1 column 24)"); + assert.equal(isTransient(result), true, "truncated-stream JSON parse error should be transient"); + assert.equal(result.kind, "stream", "JSON parse errors are stream kind"); + assert.equal(result.kind !== "rate-limit", true, "not a rate limit"); + assert.equal("retryAfterMs" in result && result.retryAfterMs, 15_000, "should use 15s backoff"); }); test("#2572: 'Unexpected end of JSON input' (truncated stream) is transient", () => { - const result = classifyProviderError("Unexpected end of JSON input"); - assert.equal(result.isTransient, true, "'Unexpected end of JSON input' should be transient"); + const result = classifyError("Unexpected end of JSON input"); + assert.equal(isTransient(result), true, "'Unexpected end of JSON input' should be transient"); + 
assert.equal(result.kind, "stream", "JSON parse errors are stream kind"); }); test("#2572: 'Unexpected token' in JSON (truncated stream) is transient", () => { - const result = classifyProviderError("Unexpected token < in JSON at position 0"); - assert.equal(result.isTransient, true, "'Unexpected token in JSON' should be transient"); + const result = classifyError("Unexpected token < in JSON at position 0"); + assert.equal(isTransient(result), true, "'Unexpected token in JSON' should be transient"); + assert.equal(result.kind, "stream", "JSON parse errors are stream kind"); }); test("#2572: 'SyntaxError' with JSON context (truncated stream) is transient", () => { - const result = classifyProviderError("SyntaxError: JSON.parse: unexpected character at line 1 column 1"); - assert.equal(result.isTransient, true, "'SyntaxError...JSON' should be transient"); + const result = classifyError("SyntaxError: JSON.parse: unexpected character at line 1 column 1"); + assert.equal(isTransient(result), true, "'SyntaxError...JSON' should be transient"); + assert.equal(result.kind, "stream", "JSON parse errors are stream kind"); }); From 878ed00f1bc7a04f6128b99f92a7fd03590b9d5d Mon Sep 17 00:00:00 2001 From: Iouri Goussev Date: Thu, 26 Mar 2026 13:39:52 -0400 Subject: [PATCH 2/3] fix(gsd): remove redundant assertions that fail TS2367 typecheck After assert.equal narrows result.kind to a literal type, comparing it against a different literal is flagged as always-true by tsc. 
Co-Authored-By: Claude Opus 4.6 --- src/resources/extensions/gsd/tests/terminated-transient.test.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/resources/extensions/gsd/tests/terminated-transient.test.ts b/src/resources/extensions/gsd/tests/terminated-transient.test.ts index 4c8aa91a7..f15223f60 100644 --- a/src/resources/extensions/gsd/tests/terminated-transient.test.ts +++ b/src/resources/extensions/gsd/tests/terminated-transient.test.ts @@ -13,7 +13,6 @@ test("#2309: 'terminated' errors should be classified as transient", () => { const result = classifyError("terminated"); assert.equal(isTransient(result), true, "'terminated' should be transient"); assert.equal(result.kind, "connection", "'terminated' matches connection"); - assert.equal(result.kind !== "rate-limit", true, "'terminated' is not a rate limit"); assert.ok("retryAfterMs" in result && result.retryAfterMs > 0, "'terminated' should have a retry delay"); assert.equal("retryAfterMs" in result && result.retryAfterMs, 15_000, "'terminated' should use 15s backoff"); }); @@ -63,7 +62,6 @@ test("#2572: 'Expected double-quoted property name' (truncated stream) is transi const result = classifyError("Expected double-quoted property name in JSON at position 23 (line 1 column 24)"); assert.equal(isTransient(result), true, "truncated-stream JSON parse error should be transient"); assert.equal(result.kind, "stream", "JSON parse errors are stream kind"); - assert.equal(result.kind !== "rate-limit", true, "not a rate limit"); assert.equal("retryAfterMs" in result && result.retryAfterMs, 15_000, "should use 15s backoff"); }); From 7d0130fa0f96444c04216209d6d4bc2fe9b54446 Mon Sep 17 00:00:00 2001 From: Lex Christopherson Date: Thu, 26 Mar 2026 16:43:26 -0600 Subject: [PATCH 3/3] Merge branch 'main' into fix/unified-error-classifier Resolve conflicts in provider-error-pause.ts and provider-errors.test.ts. 
Add stream_exhausted(_without_result) pattern to unified CONNECTION_RE (ported from main's classifyProviderError addition). Co-Authored-By: Claude Opus 4.6 (1M context) --- src/resources/extensions/gsd/error-classifier.ts | 2 +- src/resources/extensions/gsd/tests/provider-errors.test.ts | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/resources/extensions/gsd/error-classifier.ts b/src/resources/extensions/gsd/error-classifier.ts index 786a5c5d6..c63927b98 100644 --- a/src/resources/extensions/gsd/error-classifier.ts +++ b/src/resources/extensions/gsd/error-classifier.ts @@ -47,7 +47,7 @@ const RATE_LIMIT_RE = /rate.?limit|too many requests|429/i; const NETWORK_RE = /network|ECONNRESET|ETIMEDOUT|ECONNREFUSED|socket hang up|fetch failed|connection.*reset|dns/i; const SERVER_RE = /internal server error|500|502|503|overloaded|server_error|api_error|service.?unavailable/i; // ECONNRESET/ECONNREFUSED are in NETWORK_RE (same-model retry first). -const CONNECTION_RE = /terminated|connection.?refused|other side closed|EPIPE|network.?(?:is\s+)?unavailable/i; +const CONNECTION_RE = /terminated|connection.?refused|other side closed|EPIPE|network.?(?:is\s+)?unavailable|stream_exhausted(?:_without_result)?/i; const STREAM_RE = /Unexpected end of JSON|Unexpected token.*JSON|Expected double-quoted property name|SyntaxError.*JSON/i; const RESET_DELAY_RE = /reset in (\d+)s/i; diff --git a/src/resources/extensions/gsd/tests/provider-errors.test.ts b/src/resources/extensions/gsd/tests/provider-errors.test.ts index 901e9a0c7..dfe07867c 100644 --- a/src/resources/extensions/gsd/tests/provider-errors.test.ts +++ b/src/resources/extensions/gsd/tests/provider-errors.test.ts @@ -43,6 +43,13 @@ test("classifyError defaults to 60s for rate limit without reset", () => { assert.ok("retryAfterMs" in result && result.retryAfterMs === 60_000); }); +test("classifyError treats stream_exhausted_without_result as transient connection failure", () => { + const result = 
classifyError("stream_exhausted_without_result"); + assert.ok(isTransient(result)); + assert.equal(result.kind, "connection"); + assert.ok("retryAfterMs" in result && result.retryAfterMs === 15_000); +}); + test("classifyError detects Anthropic internal server error", () => { const msg = '{"type":"error","error":{"details":null,"type":"api_error","message":"Internal server error"}}'; const result = classifyError(msg);