fix: retry transient network errors before model fallback (#941) (#945)

Previously, any provider error during auto-mode immediately triggered the model fallback chain. This meant providers with occasional network flakiness (e.g. zai-coding-plan) would get abandoned after a single transient error, barely getting used before the fallback took over. Now, transient network errors (ECONNRESET, ETIMEDOUT, socket hang up, DNS failures, etc.) are retried up to 2 times with linear backoff (3s, 6s) before falling back to the next model. Permanent errors (auth, quota, billing) still trigger immediate fallback. Changes: - index.ts: Add network retry loop before fallback chain in agent_end error handler. Track retry counts per model in networkRetryCounters map. Clear counters on successful unit completion and model switches. - preferences.ts: Extract isTransientNetworkError() as testable utility. Matches network signals while excluding permanent auth/billing errors. - network-error-fallback.test.ts: Add 12 tests for transient error detection covering all signal patterns and exclusion cases.
2026-03-17 16:59:22 -04:00 · 2026-03-17 16:59:22 -04:00 · 1e979ff626
commit 1e979ff626
parent 5bae521af0
3 changed files with 110 additions and 2 deletions
--- a/src/resources/extensions/gsd/index.ts
+++ b/src/resources/extensions/gsd/index.ts
@ -44,6 +44,7 @@ import {
  resolveAllSkillReferences,
  resolveModelWithFallbacksForUnit,
  getNextFallbackModel,
+  isTransientNetworkError,
 } from "./preferences.js";
 import { hasSkillSnapshot, detectNewSkills, formatSkillsXml } from "./skill-discovery.js";
 import {
@ -93,6 +94,11 @@ function loadAgentInstructions(): string | null {
 // ── Depth verification state ──────────────────────────────────────────────
 let depthVerificationDone = false;

+// ── Network error retry counters ──────────────────────────────────────────
+// Tracks per-model retry attempts for transient network errors.
+// Cleared when a model switch occurs or retries are exhausted.
+const networkRetryCounters = new Map<string, number>();
+
 export function isDepthVerified(): boolean {
  return depthVerificationDone;
 }
@ -728,6 +734,43 @@ export default function (pi: ExtensionAPI) {
          ? `: ${lastMsg.errorMessage}`
          : "";

+      const errorMsg = ("errorMessage" in lastMsg && lastMsg.errorMessage) ? String(lastMsg.errorMessage) : "";
+
+      // ── Transient network error retry ──────────────────────────────────
+      // Before falling back to a different model, retry the current model
+      // for transient network errors (connection reset, timeout, DNS, etc.).
+      // This prevents providers with occasional network flakiness from being
+      // immediately abandoned in favor of fallback models (#941).
+      if (isTransientNetworkError(errorMsg)) {
+        const currentModelId = ctx.model?.id ?? "unknown";
+        const retryKey = `network-retry:${currentModelId}`;
+        const maxRetries = 2;
+        const currentRetries = networkRetryCounters.get(retryKey) ?? 0;
+
+        if (currentRetries < maxRetries) {
+          networkRetryCounters.set(retryKey, currentRetries + 1);
+          const attempt = currentRetries + 1;
+          const delayMs = attempt * 3000; // 3s, 6s backoff
+          ctx.ui.notify(
+            `Network error on ${currentModelId}${errorDetail}. Retry ${attempt}/${maxRetries} in ${delayMs / 1000}s...`,
+            "warning",
+          );
+          setTimeout(() => {
+            pi.sendMessage(
+              { customType: "gsd-auto-timeout-recovery", content: "Continue execution — retrying after transient network error.", display: false },
+              { triggerTurn: true },
+            );
+          }, delayMs);
+          return;
+        }
+        // Retries exhausted — clear counter and fall through to fallback logic
+        networkRetryCounters.delete(retryKey);
+        ctx.ui.notify(
+          `Network retries exhausted for ${currentModelId}. Attempting model fallback.`,
+          "warning",
+        );
+      }
+
      const dash = getAutoDashboardData();
      if (dash.currentUnit) {
        const modelConfig = resolveModelWithFallbacksForUnit(dash.currentUnit.type);
@ -738,6 +781,9 @@ export default function (pi: ExtensionAPI) {
          const nextModelId = getNextFallbackModel(currentModelId, modelConfig);

          if (nextModelId) {
+            // Clear any network retry counters when switching models
+            networkRetryCounters.clear();
+
            let modelToSet;
            const slashIdx = nextModelId.indexOf("/");
            if (slashIdx !== -1) {
@ -772,7 +818,6 @@ export default function (pi: ExtensionAPI) {
      }

      // Detect rate-limit errors and extract retry delay for auto-resume
-      const errorMsg = ("errorMessage" in lastMsg && lastMsg.errorMessage) ? String(lastMsg.errorMessage) : "";
      const isRateLimit = /rate.?limit|too many requests|429/i.test(errorMsg);
      const retryAfterMs = ("retryAfterMs" in lastMsg && typeof lastMsg.retryAfterMs === "number")
        ? lastMsg.retryAfterMs
@ -792,6 +837,7 @@ export default function (pi: ExtensionAPI) {
    }

    try {
+      networkRetryCounters.clear(); // Clear network retry state on successful unit completion
      await handleAgentEnd(ctx, pi);
    } catch (err) {
      // Safety net: if handleAgentEnd throws despite its internal try-catch,
--- a/src/resources/extensions/gsd/preferences.ts
+++ b/src/resources/extensions/gsd/preferences.ts
@ -595,6 +595,18 @@ export function getNextFallbackModel(
  }
 }

+/**
+ * Detect whether an error message indicates a transient network error
+ * (worth retrying the same model) vs a permanent provider error
+ * (auth failure, quota exceeded, etc. — should fall back immediately).
+ */
+export function isTransientNetworkError(errorMsg: string): boolean {
+  if (!errorMsg) return false;
+  const hasNetworkSignal = /network|ECONNRESET|ETIMEDOUT|ECONNREFUSED|socket hang up|fetch failed|connection.*reset|dns/i.test(errorMsg);
+  const hasPermanentSignal = /auth|unauthorized|forbidden|invalid.*key|quota|billing/i.test(errorMsg);
+  return hasNetworkSignal && !hasPermanentSignal;
+}
+
 export function resolveModelWithFallbacksForUnit(unitType: string): ResolvedModelConfig | undefined {
  const prefs = loadEffectiveGSDPreferences();
  if (!prefs?.preferences.models) return undefined;
--- a/src/resources/extensions/gsd/tests/network-error-fallback.test.ts
+++ b/src/resources/extensions/gsd/tests/network-error-fallback.test.ts
@ -6,7 +6,7 @@ import assert from "node:assert/strict";
 // just test that `resolveModelWithFallbacksForUnit` returns the correct format since
 // the fallback rotation logic itself was verified manually.

-import { getNextFallbackModel } from "../preferences.ts";
+import { getNextFallbackModel, isTransientNetworkError } from "../preferences.ts";

 test("getNextFallbackModel selects next fallback if current is a fallback", () => {
    const modelConfig = { primary: "model-a", fallbacks: ["model-b", "model-c"] };
@ -52,3 +52,53 @@ test("getNextFallbackModel returns primary if current model is undefined", () =>

    assert.equal(nextModelId, "model-a", "should default to primary if current is undefined");
 });
+
+// ── isTransientNetworkError tests ────────────────────────────────────────────
+
+test("isTransientNetworkError detects ECONNRESET", () => {
+    assert.ok(isTransientNetworkError("fetch failed: ECONNRESET"));
+});
+
+test("isTransientNetworkError detects ETIMEDOUT", () => {
+    assert.ok(isTransientNetworkError("ETIMEDOUT: request timed out"));
+});
+
+test("isTransientNetworkError detects generic network error", () => {
+    assert.ok(isTransientNetworkError("network error"));
+});
+
+test("isTransientNetworkError detects socket hang up", () => {
+    assert.ok(isTransientNetworkError("socket hang up"));
+});
+
+test("isTransientNetworkError detects fetch failed", () => {
+    assert.ok(isTransientNetworkError("fetch failed"));
+});
+
+test("isTransientNetworkError detects connection reset", () => {
+    assert.ok(isTransientNetworkError("connection was reset by peer"));
+});
+
+test("isTransientNetworkError detects DNS errors", () => {
+    assert.ok(isTransientNetworkError("dns resolution failed"));
+});
+
+test("isTransientNetworkError rejects auth errors", () => {
+    assert.ok(!isTransientNetworkError("unauthorized: invalid API key"));
+});
+
+test("isTransientNetworkError rejects quota errors", () => {
+    assert.ok(!isTransientNetworkError("quota exceeded"));
+});
+
+test("isTransientNetworkError rejects billing errors", () => {
+    assert.ok(!isTransientNetworkError("billing issue: network payment required"));
+});
+
+test("isTransientNetworkError rejects empty string", () => {
+    assert.ok(!isTransientNetworkError(""));
+});
+
+test("isTransientNetworkError rejects non-network errors", () => {
+    assert.ok(!isTransientNetworkError("model not found"));
+});