From 1e979ff626d2766974e2e2e81e1e70c90bafc7fd Mon Sep 17 00:00:00 2001 From: Tom Boucher Date: Tue, 17 Mar 2026 16:59:22 -0400 Subject: [PATCH] fix: retry transient network errors before model fallback (#941) (#945) Previously, any provider error during auto-mode immediately triggered the model fallback chain. This meant providers with occasional network flakiness (e.g. zai-coding-plan) would get abandoned after a single transient error, barely getting used before the fallback took over. Now, transient network errors (ECONNRESET, ETIMEDOUT, socket hang up, DNS failures, etc.) are retried up to 2 times with linear backoff (3s, 6s) before falling back to the next model. Permanent errors (auth, quota, billing) still trigger immediate fallback. Changes: - index.ts: Add network retry loop before fallback chain in agent_end error handler. Track retry counts per model in networkRetryCounters map. Clear counters on successful unit completion and model switches. - preferences.ts: Extract isTransientNetworkError() as testable utility. Matches network signals while excluding permanent auth/billing errors. - network-error-fallback.test.ts: Add 12 tests for transient error detection covering all signal patterns and exclusion cases. --- src/resources/extensions/gsd/index.ts | 48 ++++++++++++++++- src/resources/extensions/gsd/preferences.ts | 12 +++++ .../gsd/tests/network-error-fallback.test.ts | 52 ++++++++++++++++++- 3 files changed, 110 insertions(+), 2 deletions(-) diff --git a/src/resources/extensions/gsd/index.ts b/src/resources/extensions/gsd/index.ts index b337d141a..ad6d95613 100644 --- a/src/resources/extensions/gsd/index.ts +++ b/src/resources/extensions/gsd/index.ts @@ -44,6 +44,7 @@ import { resolveAllSkillReferences, resolveModelWithFallbacksForUnit, getNextFallbackModel, + isTransientNetworkError, } from "./preferences.js"; import { hasSkillSnapshot, detectNewSkills, formatSkillsXml } from "./skill-discovery.js"; import { @@ -93,6 +94,11 @@ function loadAgentInstructions(): string | null { // ── Depth verification state ────────────────────────────────────────────── let depthVerificationDone = false; +// ── Network error retry counters ────────────────────────────────────────── +// Tracks per-model retry attempts for transient network errors. +// Cleared when a model switch occurs or retries are exhausted. +const networkRetryCounters = new Map(); + export function isDepthVerified(): boolean { return depthVerificationDone; } @@ -728,6 +734,43 @@ export default function (pi: ExtensionAPI) { ? `: ${lastMsg.errorMessage}` : ""; + const errorMsg = ("errorMessage" in lastMsg && lastMsg.errorMessage) ? String(lastMsg.errorMessage) : ""; + + // ── Transient network error retry ────────────────────────────────── + // Before falling back to a different model, retry the current model + // for transient network errors (connection reset, timeout, DNS, etc.). + // This prevents providers with occasional network flakiness from being + // immediately abandoned in favor of fallback models (#941). + if (isTransientNetworkError(errorMsg)) { + const currentModelId = ctx.model?.id ?? "unknown"; + const retryKey = `network-retry:${currentModelId}`; + const maxRetries = 2; + const currentRetries = networkRetryCounters.get(retryKey) ?? 0; + + if (currentRetries < maxRetries) { + networkRetryCounters.set(retryKey, currentRetries + 1); + const attempt = currentRetries + 1; + const delayMs = attempt * 3000; // 3s, 6s backoff + ctx.ui.notify( + `Network error on ${currentModelId}${errorDetail}. Retry ${attempt}/${maxRetries} in ${delayMs / 1000}s...`, + "warning", + ); + setTimeout(() => { + pi.sendMessage( + { customType: "gsd-auto-timeout-recovery", content: "Continue execution — retrying after transient network error.", display: false }, + { triggerTurn: true }, + ); + }, delayMs); + return; + } + // Retries exhausted — clear counter and fall through to fallback logic + networkRetryCounters.delete(retryKey); + ctx.ui.notify( + `Network retries exhausted for ${currentModelId}. Attempting model fallback.`, + "warning", + ); + } + const dash = getAutoDashboardData(); if (dash.currentUnit) { const modelConfig = resolveModelWithFallbacksForUnit(dash.currentUnit.type); @@ -738,6 +781,9 @@ export default function (pi: ExtensionAPI) { const nextModelId = getNextFallbackModel(currentModelId, modelConfig); if (nextModelId) { + // Clear any network retry counters when switching models + networkRetryCounters.clear(); + let modelToSet; const slashIdx = nextModelId.indexOf("/"); if (slashIdx !== -1) { @@ -772,7 +818,6 @@ export default function (pi: ExtensionAPI) { } // Detect rate-limit errors and extract retry delay for auto-resume - const errorMsg = ("errorMessage" in lastMsg && lastMsg.errorMessage) ? String(lastMsg.errorMessage) : ""; const isRateLimit = /rate.?limit|too many requests|429/i.test(errorMsg); const retryAfterMs = ("retryAfterMs" in lastMsg && typeof lastMsg.retryAfterMs === "number") ? lastMsg.retryAfterMs @@ -792,6 +837,7 @@ export default function (pi: ExtensionAPI) { } try { + networkRetryCounters.clear(); // Clear network retry state on successful unit completion await handleAgentEnd(ctx, pi); } catch (err) { // Safety net: if handleAgentEnd throws despite its internal try-catch, diff --git a/src/resources/extensions/gsd/preferences.ts b/src/resources/extensions/gsd/preferences.ts index 762318493..2d79976c5 100644 --- a/src/resources/extensions/gsd/preferences.ts +++ b/src/resources/extensions/gsd/preferences.ts @@ -595,6 +595,18 @@ export function getNextFallbackModel( } } +/** + * Detect whether an error message indicates a transient network error + * (worth retrying the same model) vs a permanent provider error + * (auth failure, quota exceeded, etc. — should fall back immediately). + */ +export function isTransientNetworkError(errorMsg: string): boolean { + if (!errorMsg) return false; + const hasNetworkSignal = /network|ECONNRESET|ETIMEDOUT|ECONNREFUSED|socket hang up|fetch failed|connection.*reset|dns/i.test(errorMsg); + const hasPermanentSignal = /auth|unauthorized|forbidden|invalid.*key|quota|billing/i.test(errorMsg); + return hasNetworkSignal && !hasPermanentSignal; +} + export function resolveModelWithFallbacksForUnit(unitType: string): ResolvedModelConfig | undefined { const prefs = loadEffectiveGSDPreferences(); if (!prefs?.preferences.models) return undefined; diff --git a/src/resources/extensions/gsd/tests/network-error-fallback.test.ts b/src/resources/extensions/gsd/tests/network-error-fallback.test.ts index 9a5708df0..41f7f7694 100644 --- a/src/resources/extensions/gsd/tests/network-error-fallback.test.ts +++ b/src/resources/extensions/gsd/tests/network-error-fallback.test.ts @@ -6,7 +6,7 @@ import assert from "node:assert/strict"; // just test that `resolveModelWithFallbacksForUnit` returns the correct format since // the fallback rotation logic itself was verified manually. -import { getNextFallbackModel } from "../preferences.ts"; +import { getNextFallbackModel, isTransientNetworkError } from "../preferences.ts"; test("getNextFallbackModel selects next fallback if current is a fallback", () => { const modelConfig = { primary: "model-a", fallbacks: ["model-b", "model-c"] }; @@ -52,3 +52,53 @@ test("getNextFallbackModel returns primary if current model is undefined", () => assert.equal(nextModelId, "model-a", "should default to primary if current is undefined"); }); + +// ── isTransientNetworkError tests ──────────────────────────────────────────── + +test("isTransientNetworkError detects ECONNRESET", () => { + assert.ok(isTransientNetworkError("fetch failed: ECONNRESET")); +}); + +test("isTransientNetworkError detects ETIMEDOUT", () => { + assert.ok(isTransientNetworkError("ETIMEDOUT: request timed out")); +}); + +test("isTransientNetworkError detects generic network error", () => { + assert.ok(isTransientNetworkError("network error")); +}); + +test("isTransientNetworkError detects socket hang up", () => { + assert.ok(isTransientNetworkError("socket hang up")); +}); + +test("isTransientNetworkError detects fetch failed", () => { + assert.ok(isTransientNetworkError("fetch failed")); +}); + +test("isTransientNetworkError detects connection reset", () => { + assert.ok(isTransientNetworkError("connection was reset by peer")); +}); + +test("isTransientNetworkError detects DNS errors", () => { + assert.ok(isTransientNetworkError("dns resolution failed")); +}); + +test("isTransientNetworkError rejects auth errors", () => { + assert.ok(!isTransientNetworkError("unauthorized: invalid API key")); +}); + +test("isTransientNetworkError rejects quota errors", () => { + assert.ok(!isTransientNetworkError("quota exceeded")); +}); + +test("isTransientNetworkError rejects billing errors", () => { + assert.ok(!isTransientNetworkError("billing issue: network payment required")); +}); + +test("isTransientNetworkError rejects empty string", () => { + assert.ok(!isTransientNetworkError("")); +}); + +test("isTransientNetworkError rejects non-network errors", () => { + assert.ok(!isTransientNetworkError("model not found")); +});