Previously, any provider error during auto-mode immediately triggered the model fallback chain. This meant providers with occasional network flakiness (e.g. zai-coding-plan) would get abandoned after a single transient error, barely getting used before the fallback took over. Now, transient network errors (ECONNRESET, ETIMEDOUT, socket hang up, DNS failures, etc.) are retried up to 2 times with linear backoff (3s, 6s) before falling back to the next model. Permanent errors (auth, quota, billing) still trigger immediate fallback. Changes: - index.ts: Add network retry loop before fallback chain in agent_end error handler. Track retry counts per model in networkRetryCounters map. Clear counters on successful unit completion and model switches. - preferences.ts: Extract isTransientNetworkError() as testable utility. Matches network signals while excluding permanent auth/billing errors. - network-error-fallback.test.ts: Add 12 tests for transient error detection covering all signal patterns and exclusion cases.
This commit is contained in:
parent
5bae521af0
commit
1e979ff626
3 changed files with 110 additions and 2 deletions
|
|
@ -44,6 +44,7 @@ import {
|
|||
resolveAllSkillReferences,
|
||||
resolveModelWithFallbacksForUnit,
|
||||
getNextFallbackModel,
|
||||
isTransientNetworkError,
|
||||
} from "./preferences.js";
|
||||
import { hasSkillSnapshot, detectNewSkills, formatSkillsXml } from "./skill-discovery.js";
|
||||
import {
|
||||
|
|
@ -93,6 +94,11 @@ function loadAgentInstructions(): string | null {
|
|||
// ── Depth verification state ──────────────────────────────────────────────
|
||||
let depthVerificationDone = false;
|
||||
|
||||
// ── Network error retry counters ──────────────────────────────────────────
|
||||
// Tracks per-model retry attempts for transient network errors.
|
||||
// Cleared when a model switch occurs or retries are exhausted.
|
||||
const networkRetryCounters = new Map<string, number>();
|
||||
|
||||
export function isDepthVerified(): boolean {
|
||||
return depthVerificationDone;
|
||||
}
|
||||
|
|
@ -728,6 +734,43 @@ export default function (pi: ExtensionAPI) {
|
|||
? `: ${lastMsg.errorMessage}`
|
||||
: "";
|
||||
|
||||
const errorMsg = ("errorMessage" in lastMsg && lastMsg.errorMessage) ? String(lastMsg.errorMessage) : "";
|
||||
|
||||
// ── Transient network error retry ──────────────────────────────────
|
||||
// Before falling back to a different model, retry the current model
|
||||
// for transient network errors (connection reset, timeout, DNS, etc.).
|
||||
// This prevents providers with occasional network flakiness from being
|
||||
// immediately abandoned in favor of fallback models (#941).
|
||||
if (isTransientNetworkError(errorMsg)) {
|
||||
const currentModelId = ctx.model?.id ?? "unknown";
|
||||
const retryKey = `network-retry:${currentModelId}`;
|
||||
const maxRetries = 2;
|
||||
const currentRetries = networkRetryCounters.get(retryKey) ?? 0;
|
||||
|
||||
if (currentRetries < maxRetries) {
|
||||
networkRetryCounters.set(retryKey, currentRetries + 1);
|
||||
const attempt = currentRetries + 1;
|
||||
const delayMs = attempt * 3000; // 3s, 6s backoff
|
||||
ctx.ui.notify(
|
||||
`Network error on ${currentModelId}${errorDetail}. Retry ${attempt}/${maxRetries} in ${delayMs / 1000}s...`,
|
||||
"warning",
|
||||
);
|
||||
setTimeout(() => {
|
||||
pi.sendMessage(
|
||||
{ customType: "gsd-auto-timeout-recovery", content: "Continue execution — retrying after transient network error.", display: false },
|
||||
{ triggerTurn: true },
|
||||
);
|
||||
}, delayMs);
|
||||
return;
|
||||
}
|
||||
// Retries exhausted — clear counter and fall through to fallback logic
|
||||
networkRetryCounters.delete(retryKey);
|
||||
ctx.ui.notify(
|
||||
`Network retries exhausted for ${currentModelId}. Attempting model fallback.`,
|
||||
"warning",
|
||||
);
|
||||
}
|
||||
|
||||
const dash = getAutoDashboardData();
|
||||
if (dash.currentUnit) {
|
||||
const modelConfig = resolveModelWithFallbacksForUnit(dash.currentUnit.type);
|
||||
|
|
@ -738,6 +781,9 @@ export default function (pi: ExtensionAPI) {
|
|||
const nextModelId = getNextFallbackModel(currentModelId, modelConfig);
|
||||
|
||||
if (nextModelId) {
|
||||
// Clear any network retry counters when switching models
|
||||
networkRetryCounters.clear();
|
||||
|
||||
let modelToSet;
|
||||
const slashIdx = nextModelId.indexOf("/");
|
||||
if (slashIdx !== -1) {
|
||||
|
|
@ -772,7 +818,6 @@ export default function (pi: ExtensionAPI) {
|
|||
}
|
||||
|
||||
// Detect rate-limit errors and extract retry delay for auto-resume
|
||||
const errorMsg = ("errorMessage" in lastMsg && lastMsg.errorMessage) ? String(lastMsg.errorMessage) : "";
|
||||
const isRateLimit = /rate.?limit|too many requests|429/i.test(errorMsg);
|
||||
const retryAfterMs = ("retryAfterMs" in lastMsg && typeof lastMsg.retryAfterMs === "number")
|
||||
? lastMsg.retryAfterMs
|
||||
|
|
@ -792,6 +837,7 @@ export default function (pi: ExtensionAPI) {
|
|||
}
|
||||
|
||||
try {
|
||||
networkRetryCounters.clear(); // Clear network retry state on successful unit completion
|
||||
await handleAgentEnd(ctx, pi);
|
||||
} catch (err) {
|
||||
// Safety net: if handleAgentEnd throws despite its internal try-catch,
|
||||
|
|
|
|||
|
|
@ -595,6 +595,18 @@ export function getNextFallbackModel(
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect whether an error message indicates a transient network error
|
||||
* (worth retrying the same model) vs a permanent provider error
|
||||
* (auth failure, quota exceeded, etc. — should fall back immediately).
|
||||
*/
|
||||
export function isTransientNetworkError(errorMsg: string): boolean {
|
||||
if (!errorMsg) return false;
|
||||
const hasNetworkSignal = /network|ECONNRESET|ETIMEDOUT|ECONNREFUSED|socket hang up|fetch failed|connection.*reset|dns/i.test(errorMsg);
|
||||
const hasPermanentSignal = /auth|unauthorized|forbidden|invalid.*key|quota|billing/i.test(errorMsg);
|
||||
return hasNetworkSignal && !hasPermanentSignal;
|
||||
}
|
||||
|
||||
export function resolveModelWithFallbacksForUnit(unitType: string): ResolvedModelConfig | undefined {
|
||||
const prefs = loadEffectiveGSDPreferences();
|
||||
if (!prefs?.preferences.models) return undefined;
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ import assert from "node:assert/strict";
|
|||
// just test that `resolveModelWithFallbacksForUnit` returns the correct format since
|
||||
// the fallback rotation logic itself was verified manually.
|
||||
|
||||
import { getNextFallbackModel } from "../preferences.ts";
|
||||
import { getNextFallbackModel, isTransientNetworkError } from "../preferences.ts";
|
||||
|
||||
test("getNextFallbackModel selects next fallback if current is a fallback", () => {
|
||||
const modelConfig = { primary: "model-a", fallbacks: ["model-b", "model-c"] };
|
||||
|
|
@ -52,3 +52,53 @@ test("getNextFallbackModel returns primary if current model is undefined", () =>
|
|||
|
||||
assert.equal(nextModelId, "model-a", "should default to primary if current is undefined");
|
||||
});
|
||||
|
||||
// ── isTransientNetworkError tests ────────────────────────────────────────────
|
||||
|
||||
test("isTransientNetworkError detects ECONNRESET", () => {
|
||||
assert.ok(isTransientNetworkError("fetch failed: ECONNRESET"));
|
||||
});
|
||||
|
||||
test("isTransientNetworkError detects ETIMEDOUT", () => {
|
||||
assert.ok(isTransientNetworkError("ETIMEDOUT: request timed out"));
|
||||
});
|
||||
|
||||
test("isTransientNetworkError detects generic network error", () => {
|
||||
assert.ok(isTransientNetworkError("network error"));
|
||||
});
|
||||
|
||||
test("isTransientNetworkError detects socket hang up", () => {
|
||||
assert.ok(isTransientNetworkError("socket hang up"));
|
||||
});
|
||||
|
||||
test("isTransientNetworkError detects fetch failed", () => {
|
||||
assert.ok(isTransientNetworkError("fetch failed"));
|
||||
});
|
||||
|
||||
test("isTransientNetworkError detects connection reset", () => {
|
||||
assert.ok(isTransientNetworkError("connection was reset by peer"));
|
||||
});
|
||||
|
||||
test("isTransientNetworkError detects DNS errors", () => {
|
||||
assert.ok(isTransientNetworkError("dns resolution failed"));
|
||||
});
|
||||
|
||||
test("isTransientNetworkError rejects auth errors", () => {
|
||||
assert.ok(!isTransientNetworkError("unauthorized: invalid API key"));
|
||||
});
|
||||
|
||||
test("isTransientNetworkError rejects quota errors", () => {
|
||||
assert.ok(!isTransientNetworkError("quota exceeded"));
|
||||
});
|
||||
|
||||
test("isTransientNetworkError rejects billing errors", () => {
|
||||
assert.ok(!isTransientNetworkError("billing issue: network payment required"));
|
||||
});
|
||||
|
||||
test("isTransientNetworkError rejects empty string", () => {
|
||||
assert.ok(!isTransientNetworkError(""));
|
||||
});
|
||||
|
||||
test("isTransientNetworkError rejects non-network errors", () => {
|
||||
assert.ok(!isTransientNetworkError("model not found"));
|
||||
});
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue