fix: retry transient network errors before model fallback (#941) (#945)

Previously, any provider error during auto-mode immediately triggered the
model fallback chain. This meant providers with occasional network flakiness
(e.g. zai-coding-plan) would get abandoned after a single transient error,
barely getting used before the fallback took over.

Now, transient network errors (ECONNRESET, ETIMEDOUT, socket hang up, DNS
failures, etc.) are retried up to 2 times with linear backoff (3s, 6s)
before falling back to the next model. Permanent errors (auth, quota,
billing) still trigger immediate fallback.

Changes:
- index.ts: Add network retry loop before fallback chain in agent_end error
  handler. Track retry counts per model in networkRetryCounters map.
  Clear counters on successful unit completion and model switches.
- preferences.ts: Extract isTransientNetworkError() as testable utility.
  Matches network signals while excluding permanent auth/billing errors.
- network-error-fallback.test.ts: Add 12 tests for transient error detection
  covering all signal patterns and exclusion cases.
This commit is contained in:
Tom Boucher 2026-03-17 16:59:22 -04:00 committed by GitHub
parent 5bae521af0
commit 1e979ff626
3 changed files with 110 additions and 2 deletions

View file

@ -44,6 +44,7 @@ import {
resolveAllSkillReferences,
resolveModelWithFallbacksForUnit,
getNextFallbackModel,
isTransientNetworkError,
} from "./preferences.js";
import { hasSkillSnapshot, detectNewSkills, formatSkillsXml } from "./skill-discovery.js";
import {
@ -93,6 +94,11 @@ function loadAgentInstructions(): string | null {
// ── Depth verification state ──────────────────────────────────────────────
let depthVerificationDone = false;
// ── Network error retry counters ──────────────────────────────────────────
// Tracks per-model retry attempts for transient network errors.
// Cleared when a model switch occurs or retries are exhausted.
const networkRetryCounters = new Map<string, number>();
export function isDepthVerified(): boolean {
return depthVerificationDone;
}
@ -728,6 +734,43 @@ export default function (pi: ExtensionAPI) {
? `: ${lastMsg.errorMessage}`
: "";
const errorMsg = ("errorMessage" in lastMsg && lastMsg.errorMessage) ? String(lastMsg.errorMessage) : "";
// ── Transient network error retry ──────────────────────────────────
// Before falling back to a different model, retry the current model
// for transient network errors (connection reset, timeout, DNS, etc.).
// This prevents providers with occasional network flakiness from being
// immediately abandoned in favor of fallback models (#941).
if (isTransientNetworkError(errorMsg)) {
const currentModelId = ctx.model?.id ?? "unknown";
const retryKey = `network-retry:${currentModelId}`;
const maxRetries = 2;
const currentRetries = networkRetryCounters.get(retryKey) ?? 0;
if (currentRetries < maxRetries) {
networkRetryCounters.set(retryKey, currentRetries + 1);
const attempt = currentRetries + 1;
const delayMs = attempt * 3000; // 3s, 6s backoff
ctx.ui.notify(
`Network error on ${currentModelId}${errorDetail}. Retry ${attempt}/${maxRetries} in ${delayMs / 1000}s...`,
"warning",
);
setTimeout(() => {
pi.sendMessage(
{ customType: "gsd-auto-timeout-recovery", content: "Continue execution — retrying after transient network error.", display: false },
{ triggerTurn: true },
);
}, delayMs);
return;
}
// Retries exhausted — clear counter and fall through to fallback logic
networkRetryCounters.delete(retryKey);
ctx.ui.notify(
`Network retries exhausted for ${currentModelId}. Attempting model fallback.`,
"warning",
);
}
const dash = getAutoDashboardData();
if (dash.currentUnit) {
const modelConfig = resolveModelWithFallbacksForUnit(dash.currentUnit.type);
@ -738,6 +781,9 @@ export default function (pi: ExtensionAPI) {
const nextModelId = getNextFallbackModel(currentModelId, modelConfig);
if (nextModelId) {
// Clear any network retry counters when switching models
networkRetryCounters.clear();
let modelToSet;
const slashIdx = nextModelId.indexOf("/");
if (slashIdx !== -1) {
@ -772,7 +818,6 @@ export default function (pi: ExtensionAPI) {
}
// Detect rate-limit errors and extract retry delay for auto-resume
const errorMsg = ("errorMessage" in lastMsg && lastMsg.errorMessage) ? String(lastMsg.errorMessage) : "";
const isRateLimit = /rate.?limit|too many requests|429/i.test(errorMsg);
const retryAfterMs = ("retryAfterMs" in lastMsg && typeof lastMsg.retryAfterMs === "number")
? lastMsg.retryAfterMs
@ -792,6 +837,7 @@ export default function (pi: ExtensionAPI) {
}
try {
networkRetryCounters.clear(); // Clear network retry state on successful unit completion
await handleAgentEnd(ctx, pi);
} catch (err) {
// Safety net: if handleAgentEnd throws despite its internal try-catch,

View file

@ -595,6 +595,18 @@ export function getNextFallbackModel(
}
}
/**
* Detect whether an error message indicates a transient network error
* (worth retrying the same model) vs a permanent provider error
* (auth failure, quota exceeded, etc. should fall back immediately).
*/
export function isTransientNetworkError(errorMsg: string): boolean {
if (!errorMsg) return false;
const hasNetworkSignal = /network|ECONNRESET|ETIMEDOUT|ECONNREFUSED|socket hang up|fetch failed|connection.*reset|dns/i.test(errorMsg);
const hasPermanentSignal = /auth|unauthorized|forbidden|invalid.*key|quota|billing/i.test(errorMsg);
return hasNetworkSignal && !hasPermanentSignal;
}
export function resolveModelWithFallbacksForUnit(unitType: string): ResolvedModelConfig | undefined {
const prefs = loadEffectiveGSDPreferences();
if (!prefs?.preferences.models) return undefined;

View file

@ -6,7 +6,7 @@ import assert from "node:assert/strict";
// just test that `resolveModelWithFallbacksForUnit` returns the correct format since
// the fallback rotation logic itself was verified manually.
import { getNextFallbackModel } from "../preferences.ts";
import { getNextFallbackModel, isTransientNetworkError } from "../preferences.ts";
test("getNextFallbackModel selects next fallback if current is a fallback", () => {
const modelConfig = { primary: "model-a", fallbacks: ["model-b", "model-c"] };
@ -52,3 +52,53 @@ test("getNextFallbackModel returns primary if current model is undefined", () =>
assert.equal(nextModelId, "model-a", "should default to primary if current is undefined");
});
// ── isTransientNetworkError tests ────────────────────────────────────────────
test("isTransientNetworkError detects ECONNRESET", () => {
assert.ok(isTransientNetworkError("fetch failed: ECONNRESET"));
});
test("isTransientNetworkError detects ETIMEDOUT", () => {
assert.ok(isTransientNetworkError("ETIMEDOUT: request timed out"));
});
test("isTransientNetworkError detects generic network error", () => {
assert.ok(isTransientNetworkError("network error"));
});
test("isTransientNetworkError detects socket hang up", () => {
assert.ok(isTransientNetworkError("socket hang up"));
});
test("isTransientNetworkError detects fetch failed", () => {
assert.ok(isTransientNetworkError("fetch failed"));
});
test("isTransientNetworkError detects connection reset", () => {
assert.ok(isTransientNetworkError("connection was reset by peer"));
});
test("isTransientNetworkError detects DNS errors", () => {
assert.ok(isTransientNetworkError("dns resolution failed"));
});
test("isTransientNetworkError rejects auth errors", () => {
assert.ok(!isTransientNetworkError("unauthorized: invalid API key"));
});
test("isTransientNetworkError rejects quota errors", () => {
assert.ok(!isTransientNetworkError("quota exceeded"));
});
test("isTransientNetworkError rejects billing errors", () => {
assert.ok(!isTransientNetworkError("billing issue: network payment required"));
});
test("isTransientNetworkError rejects empty string", () => {
assert.ok(!isTransientNetworkError(""));
});
test("isTransientNetworkError rejects non-network errors", () => {
assert.ok(!isTransientNetworkError("model not found"));
});