refactor(gsd): unify three overlapping error classifiers into single classify→decide→act pipeline

Three independent error classifiers (isTransientNetworkError, classifyProviderError,
and inline network-retry logic) had diverging counters and duplicate regex coverage.
Every new edge case required patching a different classifier.

Replaces all three with a single classifyError() returning a discriminated union
ErrorClass, one RetryState object with explicit lifecycle, and a clean
classify→decide→act flow in handleAgentEnd.

Behavioral change: rate-limit errors no longer trigger model fallback. Throttling
is a provider-level issue; switching models on the same provider won't help.
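
In outline, the decide step now reads (a condensed restatement of the
handleAgentEnd flow below, with notifications and state bookkeeping elided):

const cls = classifyError(errorMsg, explicitRetryAfterMs);  // 1. classify
// 2. decide & act, in order:
//    network     -> same-model retry with backoff (up to MAX_NETWORK_RETRIES), then fall through
//    rate-limit  -> pause with escalating backoff; no model fallback
//    network | server | connection | stream -> try fallback model, then session-start model
//    still transient -> pause with escalating backoff and auto-resume
//    permanent | unknown -> pause indefinitely for manual review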

Closes #2577
Iouri Goussev 2026-03-26 13:33:54 -04:00
parent 913984c26e
commit b56eb882bb
7 changed files with 368 additions and 243 deletions

@@ -2,14 +2,51 @@ import type { ExtensionAPI, ExtensionContext } from "@gsd/pi-coding-agent";
import { checkAutoStartAfterDiscuss } from "../guided-flow.js";
import { getAutoDashboardData, getAutoModeStartModel, isAutoActive, pauseAuto } from "../auto.js";
import { getNextFallbackModel, isTransientNetworkError, resolveModelWithFallbacksForUnit } from "../preferences.js";
import { classifyProviderError, pauseAutoForProviderError } from "../provider-error-pause.js";
import { getNextFallbackModel, resolveModelWithFallbacksForUnit } from "../preferences.js";
import { pauseAutoForProviderError } from "../provider-error-pause.js";
import { isSessionSwitchInFlight, resolveAgentEnd } from "../auto-loop.js";
import { resolveModelId } from "../auto-model-selection.js";
import { clearDiscussionFlowState } from "./write-gate.js";
import {
classifyError,
createRetryState,
resetRetryState,
isTransient,
type ErrorClass,
} from "../error-classifier.js";
const networkRetryCounters = new Map<string, number>();
const retryState = createRetryState();
const MAX_NETWORK_RETRIES = 2;
const MAX_TRANSIENT_AUTO_RESUMES = 3;
let consecutiveTransientErrors = 0;
async function pauseTransientWithBackoff(
cls: ErrorClass,
pi: ExtensionAPI,
ctx: ExtensionContext,
errorDetail: string,
isRateLimit: boolean,
): Promise<void> {
retryState.consecutiveTransientCount += 1;
const baseRetryAfterMs = "retryAfterMs" in cls ? cls.retryAfterMs : 15_000;
const retryAfterMs = baseRetryAfterMs * 2 ** Math.max(0, retryState.consecutiveTransientCount - 1);
const allowAutoResume = retryState.consecutiveTransientCount <= MAX_TRANSIENT_AUTO_RESUMES;
if (!allowAutoResume) {
ctx.ui.notify(`Transient provider errors persisted after ${MAX_TRANSIENT_AUTO_RESUMES} auto-resume attempts. Pausing for manual review.`, "warning");
}
await pauseAutoForProviderError(ctx.ui, errorDetail, () => pauseAuto(ctx, pi), {
isRateLimit,
isTransient: allowAutoResume,
retryAfterMs,
resume: allowAutoResume
? () => {
pi.sendMessage(
{ customType: "gsd-auto-timeout-recovery", content: "Continue execution — provider error recovery delay elapsed.", display: false },
{ triggerTurn: true },
);
}
: undefined,
});
}
export async function handleAgentEnd(
pi: ExtensionAPI,
@@ -31,17 +68,26 @@ export async function handleAgentEnd(
if (lastMsg && "stopReason" in lastMsg && lastMsg.stopReason === "error") {
const errorDetail = "errorMessage" in lastMsg && lastMsg.errorMessage ? `: ${lastMsg.errorMessage}` : "";
const errorMsg = ("errorMessage" in lastMsg && lastMsg.errorMessage) ? String(lastMsg.errorMessage) : "";
const explicitRetryAfterMs = ("retryAfterMs" in lastMsg && typeof lastMsg.retryAfterMs === "number") ? lastMsg.retryAfterMs : undefined;
if (isTransientNetworkError(errorMsg)) {
// ── 1. Classify ──────────────────────────────────────────────────────
const cls = classifyError(errorMsg, explicitRetryAfterMs);
// ── 2. Decide & Act ──────────────────────────────────────────────────
// --- Network errors: same-model retry with backoff ---
if (cls.kind === "network") {
const currentModelId = ctx.model?.id ?? "unknown";
const retryKey = `network-retry:${currentModelId}`;
const currentRetries = networkRetryCounters.get(retryKey) ?? 0;
const maxRetries = 2;
if (currentRetries < maxRetries) {
networkRetryCounters.set(retryKey, currentRetries + 1);
const attempt = currentRetries + 1;
const delayMs = attempt * 3000;
ctx.ui.notify(`Network error on ${currentModelId}${errorDetail}. Retry ${attempt}/${maxRetries} in ${delayMs / 1000}s...`, "warning");
if (retryState.currentRetryModelId !== currentModelId) {
retryState.networkRetryCount = 0;
retryState.currentRetryModelId = currentModelId;
}
if (retryState.networkRetryCount < MAX_NETWORK_RETRIES) {
retryState.networkRetryCount += 1;
retryState.consecutiveTransientCount += 1;
const attempt = retryState.networkRetryCount;
const delayMs = attempt * cls.retryAfterMs;
ctx.ui.notify(`Network error on ${currentModelId}${errorDetail}. Retry ${attempt}/${MAX_NETWORK_RETRIES} in ${delayMs / 1000}s...`, "warning");
setTimeout(() => {
pi.sendMessage(
{ customType: "gsd-auto-timeout-recovery", content: "Continue execution — retrying after transient network error.", display: false },
@@ -50,26 +96,56 @@ export async function handleAgentEnd(
}, delayMs);
return;
}
networkRetryCounters.delete(retryKey);
// Network retries exhausted — fall through to model fallback
retryState.networkRetryCount = 0;
retryState.currentRetryModelId = undefined;
ctx.ui.notify(`Network retries exhausted for ${currentModelId}. Attempting model fallback.`, "warning");
}
const dash = getAutoDashboardData();
if (dash.currentUnit) {
const modelConfig = resolveModelWithFallbacksForUnit(dash.currentUnit.type);
if (modelConfig && modelConfig.fallbacks.length > 0) {
const availableModels = ctx.modelRegistry.getAvailable();
const nextModelId = getNextFallbackModel(ctx.model?.id, modelConfig);
if (nextModelId) {
networkRetryCounters.clear();
const slashIdx = nextModelId.indexOf("/");
const modelToSet = slashIdx !== -1
? availableModels.find((m) => m.provider.toLowerCase() === nextModelId.substring(0, slashIdx).toLowerCase() && m.id.toLowerCase() === nextModelId.substring(slashIdx + 1).toLowerCase())
: (availableModels.find((m) => m.id === nextModelId && m.provider === ctx.model?.provider) ?? availableModels.find((m) => m.id === nextModelId));
if (modelToSet) {
const ok = await pi.setModel(modelToSet, { persist: false });
// --- Rate limit: skip model fallback, go straight to pause ---
// Rate-limiting is a provider issue, not a model issue.
// Switching models won't help if the provider is throttling you.
if (cls.kind === "rate-limit") {
await pauseTransientWithBackoff(cls, pi, ctx, errorDetail, true);
return;
}
// --- Server/connection/stream errors: try model fallback first ---
if (cls.kind === "network" || cls.kind === "server" || cls.kind === "connection" || cls.kind === "stream") {
// Try model fallback
const dash = getAutoDashboardData();
if (dash.currentUnit) {
const modelConfig = resolveModelWithFallbacksForUnit(dash.currentUnit.type);
if (modelConfig && modelConfig.fallbacks.length > 0) {
const availableModels = ctx.modelRegistry.getAvailable();
const nextModelId = getNextFallbackModel(ctx.model?.id, modelConfig);
if (nextModelId) {
retryState.networkRetryCount = 0;
retryState.currentRetryModelId = undefined;
const modelToSet = resolveModelId(nextModelId, availableModels, ctx.model?.provider);
if (modelToSet) {
const ok = await pi.setModel(modelToSet, { persist: false });
if (ok) {
ctx.ui.notify(`Model error${errorDetail}. Switched to fallback: ${nextModelId} and resuming.`, "warning");
pi.sendMessage({ customType: "gsd-auto-timeout-recovery", content: "Continue execution.", display: false }, { triggerTurn: true });
return;
}
}
}
}
}
// Try restoring session model
const sessionModel = getAutoModeStartModel();
if (sessionModel) {
if (ctx.model?.id !== sessionModel.id || ctx.model?.provider !== sessionModel.provider) {
const startModel = ctx.modelRegistry.getAvailable().find((m) => m.provider === sessionModel.provider && m.id === sessionModel.id);
if (startModel) {
const ok = await pi.setModel(startModel, { persist: false });
if (ok) {
ctx.ui.notify(`Model error${errorDetail}. Switched to fallback: ${nextModelId} and resuming.`, "warning");
retryState.networkRetryCount = 0;
retryState.currentRetryModelId = undefined;
ctx.ui.notify(`Model error${errorDetail}. Restored session model: ${sessionModel.provider}/${sessionModel.id} and resuming.`, "warning");
pi.sendMessage({ customType: "gsd-auto-timeout-recovery", content: "Continue execution.", display: false }, { triggerTurn: true });
return;
}
@@ -78,56 +154,24 @@ export async function handleAgentEnd(
}
}
const sessionModel = getAutoModeStartModel();
if (sessionModel) {
if (ctx.model?.id !== sessionModel.id || ctx.model?.provider !== sessionModel.provider) {
const startModel = ctx.modelRegistry.getAvailable().find((m) => m.provider === sessionModel.provider && m.id === sessionModel.id);
if (startModel) {
const ok = await pi.setModel(startModel, { persist: false });
if (ok) {
networkRetryCounters.clear();
ctx.ui.notify(`Model error${errorDetail}. Restored session model: ${sessionModel.provider}/${sessionModel.id} and resuming.`, "warning");
pi.sendMessage({ customType: "gsd-auto-timeout-recovery", content: "Continue execution.", display: false }, { triggerTurn: true });
return;
}
}
}
// --- Transient fallback: pause with auto-resume ---
if (isTransient(cls)) {
await pauseTransientWithBackoff(cls, pi, ctx, errorDetail, false);
return;
}
const classification = classifyProviderError(errorMsg);
const explicitRetryAfterMs = ("retryAfterMs" in lastMsg && typeof lastMsg.retryAfterMs === "number") ? lastMsg.retryAfterMs : undefined;
if (classification.isTransient) {
consecutiveTransientErrors += 1;
} else {
consecutiveTransientErrors = 0;
}
const baseRetryAfterMs = explicitRetryAfterMs ?? classification.suggestedDelayMs;
const retryAfterMs = classification.isTransient
? baseRetryAfterMs * 2 ** Math.max(0, consecutiveTransientErrors - 1)
: baseRetryAfterMs;
const allowAutoResume = classification.isTransient && consecutiveTransientErrors <= MAX_TRANSIENT_AUTO_RESUMES;
if (classification.isTransient && !allowAutoResume) {
ctx.ui.notify(`Transient provider errors persisted after ${MAX_TRANSIENT_AUTO_RESUMES} auto-resume attempts. Pausing for manual review.`, "warning");
}
// --- Permanent / unknown: pause indefinitely ---
await pauseAutoForProviderError(ctx.ui, errorDetail, () => pauseAuto(ctx, pi), {
isRateLimit: classification.isRateLimit,
isTransient: allowAutoResume,
retryAfterMs,
resume: allowAutoResume
? () => {
pi.sendMessage(
{ customType: "gsd-auto-timeout-recovery", content: "Continue execution — provider error recovery delay elapsed.", display: false },
{ triggerTurn: true },
);
}
: undefined,
isRateLimit: false,
isTransient: false,
retryAfterMs: 0,
});
return;
}
// ── Success path ─────────────────────────────────────────────────────────
try {
consecutiveTransientErrors = 0;
networkRetryCounters.clear();
resetRetryState(retryState);
resolveAgentEnd(event);
} catch (err) {
const message = err instanceof Error ? err.message : String(err);
@@ -139,4 +183,3 @@ export async function handleAgentEnd(
}
}
}
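
Concretely, the escalating backoff in pauseTransientWithBackoff works out like
this for a server error (base 30_000 ms from classifyError), assuming no
successful turn resets the state in between:

// consecutiveTransientCount = 1 -> 30_000 * 2 ** 0 =  30s, auto-resume
// consecutiveTransientCount = 2 -> 30_000 * 2 ** 1 =  60s, auto-resume
// consecutiveTransientCount = 3 -> 30_000 * 2 ** 2 = 120s, auto-resume
// consecutiveTransientCount = 4 -> exceeds MAX_TRANSIENT_AUTO_RESUMES (3);
//                                  pauses for manual review with no auto-resume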

@@ -0,0 +1,139 @@
/**
* Unified error classifier for provider/network/server errors.
*
* Consolidates patterns from:
* - isTransientNetworkError() in preferences-models.ts
* - classifyProviderError() in provider-error-pause.ts
*
* Single entry point: classifyError(errorMsg, retryAfterMs?)
*
* @see https://github.com/gsd-build/gsd/issues/2577
*/
// ── ErrorClass discriminated union ──────────────────────────────────────────
export type ErrorClass =
| { kind: "network"; retryAfterMs: number }
| { kind: "rate-limit"; retryAfterMs: number }
| { kind: "server"; retryAfterMs: number }
| { kind: "stream"; retryAfterMs: number }
| { kind: "connection"; retryAfterMs: number }
| { kind: "model-error" }
| { kind: "permanent" }
| { kind: "unknown" };
// ── RetryState ──────────────────────────────────────────────────────────────
export interface RetryState {
networkRetryCount: number;
consecutiveTransientCount: number;
currentRetryModelId: string | undefined;
}
export function createRetryState(): RetryState {
return { networkRetryCount: 0, consecutiveTransientCount: 0, currentRetryModelId: undefined };
}
export function resetRetryState(state: RetryState): void {
state.networkRetryCount = 0;
state.consecutiveTransientCount = 0;
state.currentRetryModelId = undefined;
}
// ── Classification ──────────────────────────────────────────────────────────
const PERMANENT_RE = /auth|unauthorized|forbidden|invalid.*key|invalid.*api|billing|quota exceeded|account/i;
const RATE_LIMIT_RE = /rate.?limit|too many requests|429/i;
const NETWORK_RE = /network|ECONNRESET|ETIMEDOUT|ECONNREFUSED|socket hang up|fetch failed|connection.*reset|dns/i;
const SERVER_RE = /internal server error|500|502|503|overloaded|server_error|api_error|service.?unavailable/i;
// ECONNRESET/ECONNREFUSED are in NETWORK_RE (same-model retry first).
const CONNECTION_RE = /terminated|connection.?refused|other side closed|EPIPE|network.?(?:is\s+)?unavailable/i;
const STREAM_RE = /Unexpected end of JSON|Unexpected token.*JSON|Expected double-quoted property name|SyntaxError.*JSON/i;
const RESET_DELAY_RE = /reset in (\d+)s/i;
/**
* Classify an error message into one of the ErrorClass kinds.
*
* Classification order:
* 1. Permanent (auth/billing/quota) unless also rate-limited
* 2. Rate limit (429, rate.?limit, too many requests)
* 3. Network (ECONNRESET, ECONNREFUSED, ETIMEDOUT, socket hang up, fetch failed, dns)
* 4. Server (500/502/503, overloaded, server_error)
* 5. Connection (terminated, connection refused, EPIPE, other side closed)
* 6. Stream truncation (malformed JSON from mid-stream cut)
* 7. Unknown
*/
export function classifyError(errorMsg: string, retryAfterMs?: number): ErrorClass {
const isPermanent = PERMANENT_RE.test(errorMsg);
const isRateLimit = RATE_LIMIT_RE.test(errorMsg);
// 1. Permanent — but rate limit takes precedence
if (isPermanent && !isRateLimit) {
return { kind: "permanent" };
}
// 2. Rate limit
if (isRateLimit) {
if (retryAfterMs != null && retryAfterMs > 0) {
return { kind: "rate-limit", retryAfterMs };
}
const resetMatch = errorMsg.match(RESET_DELAY_RE);
const delayMs = resetMatch ? Number(resetMatch[1]) * 1000 : 60_000;
return { kind: "rate-limit", retryAfterMs: delayMs };
}
// 3. Network errors — same-model retry candidate
if (NETWORK_RE.test(errorMsg)) {
// Permanent signals were already handled at step 1, so a message mixing a
// permanent keyword (e.g. "billing") with a network keyword never reaches
// this branch.
return { kind: "network", retryAfterMs: retryAfterMs ?? 3_000 };
}
// 4. Server errors — try fallback model
if (SERVER_RE.test(errorMsg)) {
return { kind: "server", retryAfterMs: retryAfterMs ?? 30_000 };
}
// 5. Connection errors — try fallback model
if (CONNECTION_RE.test(errorMsg)) {
return { kind: "connection", retryAfterMs: retryAfterMs ?? 15_000 };
}
// 6. Stream truncation — downstream symptom of connection drop
if (STREAM_RE.test(errorMsg)) {
return { kind: "stream", retryAfterMs: retryAfterMs ?? 15_000 };
}
// 7. Unknown
return { kind: "unknown" };
}
// ── Helpers ─────────────────────────────────────────────────────────────────
/** Returns true for all transient (auto-resumable) error kinds. */
export function isTransient(cls: ErrorClass): boolean {
switch (cls.kind) {
case "network":
case "rate-limit":
case "server":
case "stream":
case "connection":
return true;
default:
return false;
}
}
/**
* Backward-compatible thin wrapper.
*
* Returns true when the error is a transient *network* error specifically
* (worth retrying the same model). Permanent signals (auth, billing, quota)
* cause this to return false even if a network keyword is present.
*/
export function isTransientNetworkError(errorMsg: string): boolean {
if (!errorMsg) return false;
const cls = classifyError(errorMsg);
return cls.kind === "network";
}
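
For reference, the classifications this produces with the default delays (each
case agrees with the tests further down):

classifyError("rate limit exceeded, reset in 45s"); // { kind: "rate-limit", retryAfterMs: 45_000 }
classifyError("HTTP 429 Too Many Requests");        // { kind: "rate-limit", retryAfterMs: 60_000 }
classifyError("fetch failed: network error");       // { kind: "network", retryAfterMs: 3_000 }
classifyError("overloaded_error: Overloaded");      // { kind: "server", retryAfterMs: 30_000 }
classifyError("terminated");                        // { kind: "connection", retryAfterMs: 15_000 }
classifyError("Unexpected end of JSON input");      // { kind: "stream", retryAfterMs: 15_000 }
classifyError("unauthorized: invalid API key");     // { kind: "permanent" }
classifyError("something went wrong");              // { kind: "unknown" }
isTransientNetworkError("billing error on network"); // false: permanent signal wins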

@@ -125,18 +125,6 @@ export function getNextFallbackModel(
}
}
/**
* Detect whether an error message indicates a transient network error
* (worth retrying the same model) vs a permanent provider error
* (auth failure, quota exceeded, etc. -- should fall back immediately).
*/
export function isTransientNetworkError(errorMsg: string): boolean {
if (!errorMsg) return false;
const hasNetworkSignal = /network|ECONNRESET|ETIMEDOUT|ECONNREFUSED|socket hang up|fetch failed|connection.*reset|dns/i.test(errorMsg);
const hasPermanentSignal = /auth|unauthorized|forbidden|invalid.*key|quota|billing/i.test(errorMsg);
return hasNetworkSignal && !hasPermanentSignal;
}
/**
* Validate a model ID string.
* Returns true if the ID looks like a valid model identifier.

@@ -68,7 +68,6 @@ export {
resolveModelForUnit,
resolveModelWithFallbacksForUnit,
getNextFallbackModel,
isTransientNetworkError,
validateModelId,
updatePreferencesModels,
resolveDynamicRoutingConfig,

@@ -2,63 +2,6 @@ export type ProviderErrorPauseUI = {
notify(message: string, level?: "info" | "warning" | "error" | "success"): void;
};
/**
* Classify a provider error as transient (auto-resume) or permanent (manual resume).
*
* Transient: rate limits, server errors (500/502/503), overloaded, internal errors.
* These are expected to self-resolve and should auto-resume after a delay.
*
* Permanent: auth errors, invalid API key, billing issues.
* These require user intervention and should pause indefinitely.
*/
export function classifyProviderError(errorMsg: string): {
isTransient: boolean;
isRateLimit: boolean;
suggestedDelayMs: number;
} {
const isRateLimit = /rate.?limit|too many requests|429/i.test(errorMsg);
const isServerError = /internal server error|500|502|503|overloaded|server_error|api_error|service.?unavailable/i.test(errorMsg);
// Connection/process errors — transient, auto-resume after brief backoff (#2309).
// These indicate the process was killed, the connection was reset, or a network
// blip occurred. They are NOT permanent failures.
const isConnectionError = /terminated|connection.?reset|connection.?refused|other side closed|fetch failed|network.?(?:is\s+)?unavailable|ECONNREFUSED|ECONNRESET|EPIPE/i.test(errorMsg);
// Permanent errors — never auto-resume
const isPermanent = /auth|unauthorized|forbidden|invalid.*key|invalid.*api|billing|quota exceeded|account/i.test(errorMsg);
if (isPermanent && !isRateLimit) {
return { isTransient: false, isRateLimit: false, suggestedDelayMs: 0 };
}
if (isRateLimit) {
// Try to extract retry-after from the message
const resetMatch = errorMsg.match(/reset in (\d+)s/i);
const delayMs = resetMatch ? Number(resetMatch[1]) * 1000 : 60_000; // default 60s for rate limits
return { isTransient: true, isRateLimit: true, suggestedDelayMs: delayMs };
}
if (isServerError) {
return { isTransient: true, isRateLimit: false, suggestedDelayMs: 30_000 }; // 30s for server errors
}
if (isConnectionError) {
return { isTransient: true, isRateLimit: false, suggestedDelayMs: 15_000 }; // 15s for connection errors
}
// Stream-truncation JSON parse errors — transient (#2572).
// When the API stream is cut mid-chunk, pi tries to reassemble the partial
// tool-call JSON and gets a SyntaxError. This is the downstream symptom of
// a connection drop — same root cause as ECONNRESET, one layer up.
const isMalformedStream = /Unexpected end of JSON|Unexpected token.*JSON|Expected double-quoted property name|SyntaxError.*JSON/i.test(errorMsg);
if (isMalformedStream) {
return { isTransient: true, isRateLimit: false, suggestedDelayMs: 15_000 }; // 15s, same as connection errors
}
// Unknown error — treat as permanent (user reviews)
return { isTransient: false, isRateLimit: false, suggestedDelayMs: 0 };
}
/**
* Pause auto-mode due to a provider error.
*

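For call sites migrating off the old classifyProviderError shape, the mapping
to the new API is roughly (a sketch, not code from this commit; msg is any
error-message string):

const cls = classifyError(msg);
const transient = isTransient(cls);                           // was result.isTransient
const rateLimited = cls.kind === "rate-limit";                // was result.isRateLimit
const delayMs = "retryAfterMs" in cls ? cls.retryAfterMs : 0; // was result.suggestedDelayMs

One nuance: unrecognized errors were previously folded into the permanent
bucket; they now come back as a distinct "unknown" kind, but both remain
non-transient.
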
@@ -1,6 +1,6 @@
/**
* Provider error handling tests consolidated from:
* - provider-error-classify.test.ts (classifyProviderError)
* - provider-error-classify.test.ts (classifyError)
* - network-error-fallback.test.ts (isTransientNetworkError, getNextFallbackModel)
* - agent-end-provider-error.test.ts (pauseAutoForProviderError)
*/
@@ -10,102 +10,104 @@ import assert from "node:assert/strict";
import { readFileSync } from "node:fs";
import { join, dirname } from "node:path";
import { fileURLToPath } from "node:url";
import { classifyProviderError, pauseAutoForProviderError } from "../provider-error-pause.ts";
import { getNextFallbackModel, isTransientNetworkError } from "../preferences.ts";
import { classifyError, isTransient, isTransientNetworkError } from "../error-classifier.ts";
import { pauseAutoForProviderError } from "../provider-error-pause.ts";
import { getNextFallbackModel } from "../preferences.ts";
const __dirname = dirname(fileURLToPath(import.meta.url));
// ── classifyProviderError ────────────────────────────────────────────────────
// ── classifyError ────────────────────────────────────────────────────────────
test("classifyProviderError detects rate limit from 429", () => {
const result = classifyProviderError("HTTP 429 Too Many Requests");
assert.ok(result.isTransient);
assert.ok(result.isRateLimit);
assert.ok(result.suggestedDelayMs > 0);
test("classifyError detects rate limit from 429", () => {
const result = classifyError("HTTP 429 Too Many Requests");
assert.ok(isTransient(result));
assert.equal(result.kind, "rate-limit");
assert.ok("retryAfterMs" in result && result.retryAfterMs > 0);
});
test("classifyProviderError detects rate limit from message", () => {
const result = classifyProviderError("rate limit exceeded");
assert.ok(result.isTransient);
assert.ok(result.isRateLimit);
test("classifyError detects rate limit from message", () => {
const result = classifyError("rate limit exceeded");
assert.ok(isTransient(result));
assert.equal(result.kind, "rate-limit");
});
test("classifyProviderError extracts reset delay from message", () => {
const result = classifyProviderError("rate limit exceeded, reset in 45s");
assert.ok(result.isRateLimit);
assert.equal(result.suggestedDelayMs, 45000);
test("classifyError extracts reset delay from message", () => {
const result = classifyError("rate limit exceeded, reset in 45s");
assert.equal(result.kind, "rate-limit");
assert.ok("retryAfterMs" in result && result.retryAfterMs === 45000);
});
test("classifyProviderError defaults to 60s for rate limit without reset", () => {
const result = classifyProviderError("429 too many requests");
assert.ok(result.isRateLimit);
assert.equal(result.suggestedDelayMs, 60_000);
test("classifyError defaults to 60s for rate limit without reset", () => {
const result = classifyError("429 too many requests");
assert.equal(result.kind, "rate-limit");
assert.ok("retryAfterMs" in result && result.retryAfterMs === 60_000);
});
test("classifyProviderError detects Anthropic internal server error", () => {
test("classifyError detects Anthropic internal server error", () => {
const msg = '{"type":"error","error":{"details":null,"type":"api_error","message":"Internal server error"}}';
const result = classifyProviderError(msg);
assert.ok(result.isTransient);
assert.ok(!result.isRateLimit);
assert.equal(result.suggestedDelayMs, 30_000);
const result = classifyError(msg);
assert.ok(isTransient(result));
assert.equal(result.kind, "server");
assert.ok("retryAfterMs" in result && result.retryAfterMs === 30_000);
});
test("classifyProviderError detects Codex server_error from extracted message", () => {
test("classifyError detects Codex server_error from extracted message", () => {
// After fix, mapCodexEvents extracts the nested error type and produces
// "Codex server_error: <message>" instead of raw JSON.
const msg = "Codex server_error: An error occurred while processing your request.";
const result = classifyProviderError(msg);
assert.ok(result.isTransient);
assert.ok(!result.isRateLimit);
assert.equal(result.suggestedDelayMs, 30_000);
const result = classifyError(msg);
assert.ok(isTransient(result));
assert.equal(result.kind, "server");
assert.ok("retryAfterMs" in result && result.retryAfterMs === 30_000);
});
test("classifyProviderError detects overloaded error", () => {
const result = classifyProviderError("overloaded_error: Overloaded");
assert.ok(result.isTransient);
assert.equal(result.suggestedDelayMs, 30_000);
test("classifyError detects overloaded error", () => {
const result = classifyError("overloaded_error: Overloaded");
assert.ok(isTransient(result));
assert.ok("retryAfterMs" in result && result.retryAfterMs === 30_000);
});
test("classifyProviderError detects 503 service unavailable", () => {
const result = classifyProviderError("HTTP 503 Service Unavailable");
assert.ok(result.isTransient);
test("classifyError detects 503 service unavailable", () => {
const result = classifyError("HTTP 503 Service Unavailable");
assert.ok(isTransient(result));
});
test("classifyProviderError detects 502 bad gateway", () => {
const result = classifyProviderError("HTTP 502 Bad Gateway");
assert.ok(result.isTransient);
test("classifyError detects 502 bad gateway", () => {
const result = classifyError("HTTP 502 Bad Gateway");
assert.ok(isTransient(result));
});
test("classifyProviderError detects auth error as permanent", () => {
const result = classifyProviderError("unauthorized: invalid API key");
assert.ok(!result.isTransient);
assert.ok(!result.isRateLimit);
test("classifyError detects auth error as permanent", () => {
const result = classifyError("unauthorized: invalid API key");
assert.ok(!isTransient(result));
assert.equal(result.kind, "permanent");
});
test("classifyProviderError detects billing error as permanent", () => {
const result = classifyProviderError("billing issue: payment required");
assert.ok(!result.isTransient);
test("classifyError detects billing error as permanent", () => {
const result = classifyError("billing issue: payment required");
assert.ok(!isTransient(result));
});
test("classifyProviderError detects quota exceeded as permanent", () => {
const result = classifyProviderError("quota exceeded for this month");
assert.ok(!result.isTransient);
test("classifyError detects quota exceeded as permanent", () => {
const result = classifyError("quota exceeded for this month");
assert.ok(!isTransient(result));
});
test("classifyProviderError treats unknown error as permanent", () => {
const result = classifyProviderError("something went wrong");
assert.ok(!result.isTransient);
test("classifyError treats unknown error as not transient", () => {
const result = classifyError("something went wrong");
assert.ok(!isTransient(result));
assert.equal(result.kind, "unknown");
});
test("classifyProviderError treats empty string as permanent", () => {
const result = classifyProviderError("");
assert.ok(!result.isTransient);
test("classifyError treats empty string as not transient", () => {
const result = classifyError("");
assert.ok(!isTransient(result));
});
test("classifyProviderError: rate limit takes precedence over auth keywords", () => {
const result = classifyProviderError("429 unauthorized rate limit");
assert.ok(result.isRateLimit);
assert.ok(result.isTransient);
test("classifyError: rate limit takes precedence over auth keywords", () => {
const result = classifyError("429 unauthorized rate limit");
assert.equal(result.kind, "rate-limit");
assert.ok(isTransient(result));
});
// ── isTransientNetworkError ──────────────────────────────────────────────────
@@ -265,8 +267,8 @@ test("agent-end-recovery.ts tracks consecutive transient errors for escalating b
const src = readFileSync(join(__dirname, "..", "bootstrap", "agent-end-recovery.ts"), "utf-8");
assert.ok(
src.includes("consecutiveTransientErrors"),
"agent-end-recovery.ts must track consecutiveTransientErrors for escalating backoff (#1166)",
src.includes("consecutiveTransientCount"),
"agent-end-recovery.ts must track consecutiveTransientCount for escalating backoff (#1166)",
);
assert.ok(
src.includes("MAX_TRANSIENT_AUTO_RESUMES"),
@@ -274,15 +276,13 @@ test("agent-end-recovery.ts tracks consecutive transient errors for escalating b
);
});
test("agent-end-recovery.ts resets consecutive transient error counter on success", () => {
test("agent-end-recovery.ts resets retry state before resolveAgentEnd on success", () => {
const src = readFileSync(join(__dirname, "..", "bootstrap", "agent-end-recovery.ts"), "utf-8");
// After successful agent_end (before resolveAgentEnd), the counter must be reset.
// Use a regex across the success block so CRLF checkouts on Windows do not
// push the reset line outside a fixed substring window.
// After successful agent_end, resetRetryState must be called before resolveAgentEnd.
assert.ok(
/consecutiveTransientErrors\s*=\s*0\s*;[\s\S]{0,250}resolveAgentEnd/.test(src),
"consecutive transient error counter must be reset before resolveAgentEnd on the success path (#1166)",
/resetRetryState[\s\S]{0,250}resolveAgentEnd/.test(src),
"resetRetryState must be called before resolveAgentEnd on the success path (#1166)",
);
});
@@ -291,7 +291,7 @@ test("agent-end-recovery.ts applies escalating delay for repeated transient erro
// Must contain the exponential backoff formula (may span multiple lines)
assert.ok(
src.includes("2 ** Math.max(0, consecutiveTransientErrors"),
src.includes("2 ** Math.max(0, retryState.consecutiveTransientCount"),
"agent-end-recovery.ts must escalate retryAfterMs exponentially for consecutive transient errors (#1166)",
);
});

@@ -1,73 +1,86 @@
/**
* terminated-transient.test.ts: regression test for #2309.
*
* classifyProviderError should treat 'terminated' errors (process killed,
* classifyError should treat 'terminated' errors (process killed,
* connection reset) as transient with auto-resume, not permanent.
*/
import test from "node:test";
import assert from "node:assert/strict";
import { classifyProviderError } from "../provider-error-pause.ts";
import { classifyError, isTransient } from "../error-classifier.ts";
test("#2309: 'terminated' errors should be classified as transient", () => {
const result = classifyProviderError("terminated");
assert.equal(result.isTransient, true, "'terminated' should be transient");
assert.equal(result.isRateLimit, false, "'terminated' is not a rate limit");
assert.ok(result.suggestedDelayMs > 0, "'terminated' should have a retry delay");
const result = classifyError("terminated");
assert.equal(isTransient(result), true, "'terminated' should be transient");
assert.equal(result.kind, "connection", "'terminated' matches connection");
assert.equal(result.kind !== "rate-limit", true, "'terminated' is not a rate limit");
assert.ok("retryAfterMs" in result && result.retryAfterMs > 0, "'terminated' should have a retry delay");
assert.equal("retryAfterMs" in result && result.retryAfterMs, 15_000, "'terminated' should use 15s backoff");
});
test("#2309: 'connection reset' errors should be classified as transient", () => {
const result = classifyProviderError("connection reset by peer");
assert.equal(result.isTransient, true, "'connection reset' should be transient");
test("#2309: 'connection reset by peer' errors should be classified as transient (network)", () => {
const result = classifyError("connection reset by peer");
assert.equal(isTransient(result), true, "'connection reset by peer' should be transient");
assert.equal(result.kind, "network", "'connection reset by peer' matches NETWORK_RE (connection.*reset) before CONNECTION_RE");
assert.equal("retryAfterMs" in result && result.retryAfterMs, 3_000, "network errors use 3s backoff");
});
test("#2309: 'other side closed' errors should be classified as transient", () => {
const result = classifyProviderError("other side closed the connection");
assert.equal(result.isTransient, true, "'other side closed' should be transient");
const result = classifyError("other side closed the connection");
assert.equal(isTransient(result), true, "'other side closed' should be transient");
assert.equal(result.kind, "connection", "'other side closed' matches CONNECTION_RE");
});
test("#2309: 'fetch failed' errors should be classified as transient", () => {
const result = classifyProviderError("fetch failed: network error");
assert.equal(result.isTransient, true, "'fetch failed' should be transient");
const result = classifyError("fetch failed: network error");
assert.equal(isTransient(result), true, "'fetch failed' should be transient");
assert.equal(result.kind, "network", "'fetch failed' matches NETWORK_RE");
assert.equal("retryAfterMs" in result && result.retryAfterMs, 3_000, "network errors use 3s backoff");
});
test("#2309: 'connection refused' errors should be classified as transient", () => {
const result = classifyProviderError("ECONNREFUSED: connection refused");
assert.equal(result.isTransient, true, "'connection refused' should be transient");
const result = classifyError("ECONNREFUSED: connection refused");
assert.equal(isTransient(result), true, "'connection refused' should be transient");
assert.equal(result.kind, "network", "'ECONNREFUSED' matches NETWORK_RE (same-model retry)");
});
test("#2309: permanent errors are still permanent", () => {
const authResult = classifyProviderError("unauthorized: invalid API key");
assert.equal(authResult.isTransient, false, "auth errors should stay permanent");
assert.equal(authResult.suggestedDelayMs, 0, "permanent errors have no delay");
const authResult = classifyError("unauthorized: invalid API key");
assert.equal(isTransient(authResult), false, "auth errors should stay permanent");
assert.equal(authResult.kind, "permanent", "auth errors are permanent");
assert.equal("retryAfterMs" in authResult, false, "permanent errors have no retryAfterMs");
});
test("#2309: rate limits are still transient", () => {
const rlResult = classifyProviderError("rate limit exceeded (429)");
assert.equal(rlResult.isTransient, true, "rate limits are still transient");
assert.equal(rlResult.isRateLimit, true, "rate limits are flagged as rate limits");
const rlResult = classifyError("rate limit exceeded (429)");
assert.equal(isTransient(rlResult), true, "rate limits are still transient");
assert.equal(rlResult.kind, "rate-limit", "rate limits are flagged as rate-limit kind");
});
// --- #2572: stream-truncation JSON parse errors should be transient ---
test("#2572: 'Expected double-quoted property name' (truncated stream) is transient", () => {
const result = classifyProviderError("Expected double-quoted property name in JSON at position 23 (line 1 column 24)");
assert.equal(result.isTransient, true, "truncated-stream JSON parse error should be transient");
assert.equal(result.isRateLimit, false, "not a rate limit");
assert.equal(result.suggestedDelayMs, 15_000, "should use 15s backoff like connection errors");
const result = classifyError("Expected double-quoted property name in JSON at position 23 (line 1 column 24)");
assert.equal(isTransient(result), true, "truncated-stream JSON parse error should be transient");
assert.equal(result.kind, "stream", "JSON parse errors are stream kind");
assert.equal(result.kind !== "rate-limit", true, "not a rate limit");
assert.equal("retryAfterMs" in result && result.retryAfterMs, 15_000, "should use 15s backoff");
});
test("#2572: 'Unexpected end of JSON input' (truncated stream) is transient", () => {
const result = classifyProviderError("Unexpected end of JSON input");
assert.equal(result.isTransient, true, "'Unexpected end of JSON input' should be transient");
const result = classifyError("Unexpected end of JSON input");
assert.equal(isTransient(result), true, "'Unexpected end of JSON input' should be transient");
assert.equal(result.kind, "stream", "JSON parse errors are stream kind");
});
test("#2572: 'Unexpected token' in JSON (truncated stream) is transient", () => {
const result = classifyProviderError("Unexpected token < in JSON at position 0");
assert.equal(result.isTransient, true, "'Unexpected token in JSON' should be transient");
const result = classifyError("Unexpected token < in JSON at position 0");
assert.equal(isTransient(result), true, "'Unexpected token in JSON' should be transient");
assert.equal(result.kind, "stream", "JSON parse errors are stream kind");
});
test("#2572: 'SyntaxError' with JSON context (truncated stream) is transient", () => {
const result = classifyProviderError("SyntaxError: JSON.parse: unexpected character at line 1 column 1");
assert.equal(result.isTransient, true, "'SyntaxError...JSON' should be transient");
const result = classifyError("SyntaxError: JSON.parse: unexpected character at line 1 column 1");
assert.equal(isTransient(result), true, "'SyntaxError...JSON' should be transient");
assert.equal(result.kind, "stream", "JSON parse errors are stream kind");
});