refactor(gsd): unify three overlapping error classifiers into single classify→decide→act pipeline
Three independent error classifiers (isTransientNetworkError, classifyProviderError, and inline network-retry logic) had diverging counters and duplicated regex coverage, so every new edge case required patching a different classifier. This change replaces all three with a single classifyError() that returns a discriminated union ErrorClass, one RetryState object with an explicit lifecycle, and a clean classify→decide→act flow in handleAgentEnd.

Behavioral change: rate-limit errors no longer trigger model fallback. Throttling is a provider issue, not a model issue; switching models on the same provider does not help.

Closes #2577
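A minimal sketch (not part of the commit) of the decide→act step the message describes. The ErrorClass union mirrors the one added in error-classifier.ts below; the four handler functions are hypothetical stand-ins for the real logic in handleAgentEnd:

// Sketch only: ErrorClass copied from the union added in error-classifier.ts.
type ErrorClass =
  | { kind: "network"; retryAfterMs: number }
  | { kind: "rate-limit"; retryAfterMs: number }
  | { kind: "server"; retryAfterMs: number }
  | { kind: "stream"; retryAfterMs: number }
  | { kind: "connection"; retryAfterMs: number }
  | { kind: "model-error" }
  | { kind: "permanent" }
  | { kind: "unknown" };

// Hypothetical handlers standing in for the real code paths.
declare function retrySameModel(retryAfterMs: number): void;
declare function tryModelFallback(): void;
declare function pauseWithAutoResume(retryAfterMs: number): void;
declare function pauseForManualReview(): void;

function act(cls: ErrorClass): void {
  switch (cls.kind) {
    case "network":
      retrySameModel(cls.retryAfterMs);      // same-model retry with backoff first
      break;
    case "rate-limit":
      pauseWithAutoResume(cls.retryAfterMs); // no fallback: throttling is provider-wide
      break;
    case "server":
    case "connection":
    case "stream":
      tryModelFallback();                    // fallback model first, transient pause if none
      break;
    case "model-error":
    case "permanent":
    case "unknown":
      pauseForManualReview();                // never auto-resume
      break;
  }
}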
parent 913984c26e
commit b56eb882bb

7 changed files with 368 additions and 243 deletions
src/resources/extensions/gsd/bootstrap/agent-end-recovery.ts

@@ -2,14 +2,51 @@ import type { ExtensionAPI, ExtensionContext } from "@gsd/pi-coding-agent";
import { checkAutoStartAfterDiscuss } from "../guided-flow.js";
import { getAutoDashboardData, getAutoModeStartModel, isAutoActive, pauseAuto } from "../auto.js";
import { getNextFallbackModel, isTransientNetworkError, resolveModelWithFallbacksForUnit } from "../preferences.js";
import { classifyProviderError, pauseAutoForProviderError } from "../provider-error-pause.js";
import { getNextFallbackModel, resolveModelWithFallbacksForUnit } from "../preferences.js";
import { pauseAutoForProviderError } from "../provider-error-pause.js";
import { isSessionSwitchInFlight, resolveAgentEnd } from "../auto-loop.js";
import { resolveModelId } from "../auto-model-selection.js";
import { clearDiscussionFlowState } from "./write-gate.js";
import {
  classifyError,
  createRetryState,
  resetRetryState,
  isTransient,
  type ErrorClass,
} from "../error-classifier.js";

const networkRetryCounters = new Map<string, number>();
const retryState = createRetryState();
const MAX_NETWORK_RETRIES = 2;
const MAX_TRANSIENT_AUTO_RESUMES = 3;
let consecutiveTransientErrors = 0;

async function pauseTransientWithBackoff(
  cls: ErrorClass,
  pi: ExtensionAPI,
  ctx: ExtensionContext,
  errorDetail: string,
  isRateLimit: boolean,
): Promise<void> {
  retryState.consecutiveTransientCount += 1;
  const baseRetryAfterMs = "retryAfterMs" in cls ? cls.retryAfterMs : 15_000;
  const retryAfterMs = baseRetryAfterMs * 2 ** Math.max(0, retryState.consecutiveTransientCount - 1);
  const allowAutoResume = retryState.consecutiveTransientCount <= MAX_TRANSIENT_AUTO_RESUMES;
  if (!allowAutoResume) {
    ctx.ui.notify(`Transient provider errors persisted after ${MAX_TRANSIENT_AUTO_RESUMES} auto-resume attempts. Pausing for manual review.`, "warning");
  }
  await pauseAutoForProviderError(ctx.ui, errorDetail, () => pauseAuto(ctx, pi), {
    isRateLimit,
    isTransient: allowAutoResume,
    retryAfterMs,
    resume: allowAutoResume
      ? () => {
          pi.sendMessage(
            { customType: "gsd-auto-timeout-recovery", content: "Continue execution — provider error recovery delay elapsed.", display: false },
            { triggerTurn: true },
          );
        }
      : undefined,
  });
}

export async function handleAgentEnd(
  pi: ExtensionAPI,
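To make the escalation concrete, here is a small sketch (not part of the diff) of the schedule pauseTransientWithBackoff produces for the default 15s base:

// Sketch: backoff schedule implied by pauseTransientWithBackoff above.
const MAX_TRANSIENT_AUTO_RESUMES = 3;
const base = 15_000; // default when the ErrorClass carries no retryAfterMs

for (let n = 1; n <= 4; n++) {
  const delay = base * 2 ** Math.max(0, n - 1);
  const autoResume = n <= MAX_TRANSIENT_AUTO_RESUMES;
  console.log(`attempt ${n}: ${delay / 1000}s, auto-resume=${autoResume}`);
}
// attempt 1: 15s, auto-resume=true
// attempt 2: 30s, auto-resume=true
// attempt 3: 60s, auto-resume=true
// attempt 4: 120s, auto-resume=false (pauses for manual review)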
@@ -31,17 +68,26 @@ export async function handleAgentEnd(
  if (lastMsg && "stopReason" in lastMsg && lastMsg.stopReason === "error") {
    const errorDetail = "errorMessage" in lastMsg && lastMsg.errorMessage ? `: ${lastMsg.errorMessage}` : "";
    const errorMsg = ("errorMessage" in lastMsg && lastMsg.errorMessage) ? String(lastMsg.errorMessage) : "";
    const explicitRetryAfterMs = ("retryAfterMs" in lastMsg && typeof lastMsg.retryAfterMs === "number") ? lastMsg.retryAfterMs : undefined;

    if (isTransientNetworkError(errorMsg)) {
    // ── 1. Classify ──────────────────────────────────────────────────────
    const cls = classifyError(errorMsg, explicitRetryAfterMs);

    // ── 2. Decide & Act ──────────────────────────────────────────────────

    // --- Network errors: same-model retry with backoff ---
    if (cls.kind === "network") {
      const currentModelId = ctx.model?.id ?? "unknown";
      const retryKey = `network-retry:${currentModelId}`;
      const currentRetries = networkRetryCounters.get(retryKey) ?? 0;
      const maxRetries = 2;
      if (currentRetries < maxRetries) {
        networkRetryCounters.set(retryKey, currentRetries + 1);
        const attempt = currentRetries + 1;
        const delayMs = attempt * 3000;
        ctx.ui.notify(`Network error on ${currentModelId}${errorDetail}. Retry ${attempt}/${maxRetries} in ${delayMs / 1000}s...`, "warning");
      if (retryState.currentRetryModelId !== currentModelId) {
        retryState.networkRetryCount = 0;
        retryState.currentRetryModelId = currentModelId;
      }
      if (retryState.networkRetryCount < MAX_NETWORK_RETRIES) {
        retryState.networkRetryCount += 1;
        retryState.consecutiveTransientCount += 1;
        const attempt = retryState.networkRetryCount;
        const delayMs = attempt * cls.retryAfterMs;
        ctx.ui.notify(`Network error on ${currentModelId}${errorDetail}. Retry ${attempt}/${MAX_NETWORK_RETRIES} in ${delayMs / 1000}s...`, "warning");
        setTimeout(() => {
          pi.sendMessage(
            { customType: "gsd-auto-timeout-recovery", content: "Continue execution — retrying after transient network error.", display: false },
@@ -50,26 +96,56 @@ export async function handleAgentEnd(
        }, delayMs);
        return;
      }
      networkRetryCounters.delete(retryKey);
      // Network retries exhausted — fall through to model fallback
      retryState.networkRetryCount = 0;
      retryState.currentRetryModelId = undefined;
      ctx.ui.notify(`Network retries exhausted for ${currentModelId}. Attempting model fallback.`, "warning");
    }

    const dash = getAutoDashboardData();
    if (dash.currentUnit) {
      const modelConfig = resolveModelWithFallbacksForUnit(dash.currentUnit.type);
      if (modelConfig && modelConfig.fallbacks.length > 0) {
        const availableModels = ctx.modelRegistry.getAvailable();
        const nextModelId = getNextFallbackModel(ctx.model?.id, modelConfig);
        if (nextModelId) {
          networkRetryCounters.clear();
          const slashIdx = nextModelId.indexOf("/");
          const modelToSet = slashIdx !== -1
            ? availableModels.find((m) => m.provider.toLowerCase() === nextModelId.substring(0, slashIdx).toLowerCase() && m.id.toLowerCase() === nextModelId.substring(slashIdx + 1).toLowerCase())
            : (availableModels.find((m) => m.id === nextModelId && m.provider === ctx.model?.provider) ?? availableModels.find((m) => m.id === nextModelId));
          if (modelToSet) {
            const ok = await pi.setModel(modelToSet, { persist: false });
    // --- Rate limit: skip model fallback, go straight to pause ---
    // Rate-limiting is a provider issue, not a model issue.
    // Switching models won't help if the provider is throttling you.
    if (cls.kind === "rate-limit") {
      await pauseTransientWithBackoff(cls, pi, ctx, errorDetail, true);
      return;
    }

    // --- Server/connection/stream errors: try model fallback first ---
    if (cls.kind === "network" || cls.kind === "server" || cls.kind === "connection" || cls.kind === "stream") {
      // Try model fallback
      const dash = getAutoDashboardData();
      if (dash.currentUnit) {
        const modelConfig = resolveModelWithFallbacksForUnit(dash.currentUnit.type);
        if (modelConfig && modelConfig.fallbacks.length > 0) {
          const availableModels = ctx.modelRegistry.getAvailable();
          const nextModelId = getNextFallbackModel(ctx.model?.id, modelConfig);
          if (nextModelId) {
            retryState.networkRetryCount = 0;
            retryState.currentRetryModelId = undefined;
            const modelToSet = resolveModelId(nextModelId, availableModels, ctx.model?.provider);
            if (modelToSet) {
              const ok = await pi.setModel(modelToSet, { persist: false });
              if (ok) {
                ctx.ui.notify(`Model error${errorDetail}. Switched to fallback: ${nextModelId} and resuming.`, "warning");
                pi.sendMessage({ customType: "gsd-auto-timeout-recovery", content: "Continue execution.", display: false }, { triggerTurn: true });
                return;
              }
            }
          }
        }
      }

      // Try restoring session model
      const sessionModel = getAutoModeStartModel();
      if (sessionModel) {
        if (ctx.model?.id !== sessionModel.id || ctx.model?.provider !== sessionModel.provider) {
          const startModel = ctx.modelRegistry.getAvailable().find((m) => m.provider === sessionModel.provider && m.id === sessionModel.id);
          if (startModel) {
            const ok = await pi.setModel(startModel, { persist: false });
            if (ok) {
            ctx.ui.notify(`Model error${errorDetail}. Switched to fallback: ${nextModelId} and resuming.`, "warning");
              retryState.networkRetryCount = 0;
              retryState.currentRetryModelId = undefined;
              ctx.ui.notify(`Model error${errorDetail}. Restored session model: ${sessionModel.provider}/${sessionModel.id} and resuming.`, "warning");
              pi.sendMessage({ customType: "gsd-auto-timeout-recovery", content: "Continue execution.", display: false }, { triggerTurn: true });
              return;
            }
@@ -78,56 +154,24 @@ export async function handleAgentEnd(
        }
      }

    const sessionModel = getAutoModeStartModel();
    if (sessionModel) {
      if (ctx.model?.id !== sessionModel.id || ctx.model?.provider !== sessionModel.provider) {
        const startModel = ctx.modelRegistry.getAvailable().find((m) => m.provider === sessionModel.provider && m.id === sessionModel.id);
        if (startModel) {
          const ok = await pi.setModel(startModel, { persist: false });
          if (ok) {
            networkRetryCounters.clear();
            ctx.ui.notify(`Model error${errorDetail}. Restored session model: ${sessionModel.provider}/${sessionModel.id} and resuming.`, "warning");
            pi.sendMessage({ customType: "gsd-auto-timeout-recovery", content: "Continue execution.", display: false }, { triggerTurn: true });
            return;
          }
        }
      }
      // --- Transient fallback: pause with auto-resume ---
      if (isTransient(cls)) {
        await pauseTransientWithBackoff(cls, pi, ctx, errorDetail, false);
        return;
      }

    const classification = classifyProviderError(errorMsg);
    const explicitRetryAfterMs = ("retryAfterMs" in lastMsg && typeof lastMsg.retryAfterMs === "number") ? lastMsg.retryAfterMs : undefined;
    if (classification.isTransient) {
      consecutiveTransientErrors += 1;
    } else {
      consecutiveTransientErrors = 0;
    }
    const baseRetryAfterMs = explicitRetryAfterMs ?? classification.suggestedDelayMs;
    const retryAfterMs = classification.isTransient
      ? baseRetryAfterMs * 2 ** Math.max(0, consecutiveTransientErrors - 1)
      : baseRetryAfterMs;
    const allowAutoResume = classification.isTransient && consecutiveTransientErrors <= MAX_TRANSIENT_AUTO_RESUMES;
    if (classification.isTransient && !allowAutoResume) {
      ctx.ui.notify(`Transient provider errors persisted after ${MAX_TRANSIENT_AUTO_RESUMES} auto-resume attempts. Pausing for manual review.`, "warning");
    }
    // --- Permanent / unknown: pause indefinitely ---
    await pauseAutoForProviderError(ctx.ui, errorDetail, () => pauseAuto(ctx, pi), {
      isRateLimit: classification.isRateLimit,
      isTransient: allowAutoResume,
      retryAfterMs,
      resume: allowAutoResume
        ? () => {
            pi.sendMessage(
              { customType: "gsd-auto-timeout-recovery", content: "Continue execution — provider error recovery delay elapsed.", display: false },
              { triggerTurn: true },
            );
          }
        : undefined,
      isRateLimit: false,
      isTransient: false,
      retryAfterMs: 0,
    });
    return;
  }

  // ── Success path ─────────────────────────────────────────────────────────
  try {
    consecutiveTransientErrors = 0;
    networkRetryCounters.clear();
    resetRetryState(retryState);
    resolveAgentEnd(event);
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);

@@ -139,4 +183,3 @@ export async function handleAgentEnd(
  }
}
}
src/resources/extensions/gsd/error-classifier.ts (new file, 139 lines)

@@ -0,0 +1,139 @@
/**
 * Unified error classifier for provider/network/server errors.
 *
 * Consolidates patterns from:
 * - isTransientNetworkError() in preferences-models.ts
 * - classifyProviderError() in provider-error-pause.ts
 *
 * Single entry point: classifyError(errorMsg, retryAfterMs?)
 *
 * @see https://github.com/gsd-build/gsd/issues/2577
 */

// ── ErrorClass discriminated union ──────────────────────────────────────────

export type ErrorClass =
  | { kind: "network"; retryAfterMs: number }
  | { kind: "rate-limit"; retryAfterMs: number }
  | { kind: "server"; retryAfterMs: number }
  | { kind: "stream"; retryAfterMs: number }
  | { kind: "connection"; retryAfterMs: number }
  | { kind: "model-error" }
  | { kind: "permanent" }
  | { kind: "unknown" };

// ── RetryState ──────────────────────────────────────────────────────────────

export interface RetryState {
  networkRetryCount: number;
  consecutiveTransientCount: number;
  currentRetryModelId: string | undefined;
}

export function createRetryState(): RetryState {
  return { networkRetryCount: 0, consecutiveTransientCount: 0, currentRetryModelId: undefined };
}

export function resetRetryState(state: RetryState): void {
  state.networkRetryCount = 0;
  state.consecutiveTransientCount = 0;
  state.currentRetryModelId = undefined;
}

// ── Classification ──────────────────────────────────────────────────────────

const PERMANENT_RE = /auth|unauthorized|forbidden|invalid.*key|invalid.*api|billing|quota exceeded|account/i;
const RATE_LIMIT_RE = /rate.?limit|too many requests|429/i;
const NETWORK_RE = /network|ECONNRESET|ETIMEDOUT|ECONNREFUSED|socket hang up|fetch failed|connection.*reset|dns/i;
const SERVER_RE = /internal server error|500|502|503|overloaded|server_error|api_error|service.?unavailable/i;
// ECONNRESET/ECONNREFUSED are in NETWORK_RE (same-model retry first).
const CONNECTION_RE = /terminated|connection.?refused|other side closed|EPIPE|network.?(?:is\s+)?unavailable/i;
const STREAM_RE = /Unexpected end of JSON|Unexpected token.*JSON|Expected double-quoted property name|SyntaxError.*JSON/i;
const RESET_DELAY_RE = /reset in (\d+)s/i;

/**
 * Classify an error message into one of the ErrorClass kinds.
 *
 * Classification order:
 * 1. Permanent (auth/billing/quota) — unless also rate-limited
 * 2. Rate limit (429, rate.?limit, too many requests)
 * 3. Network (ECONNRESET, ECONNREFUSED, ETIMEDOUT, socket hang up, fetch failed, dns)
 * 4. Server (500/502/503, overloaded, server_error)
 * 5. Connection (terminated, connection refused, EPIPE, other side closed)
 * 6. Stream truncation (malformed JSON from mid-stream cut)
 * 7. Unknown
 */
export function classifyError(errorMsg: string, retryAfterMs?: number): ErrorClass {
  const isPermanent = PERMANENT_RE.test(errorMsg);
  const isRateLimit = RATE_LIMIT_RE.test(errorMsg);

  // 1. Permanent — but rate limit takes precedence
  if (isPermanent && !isRateLimit) {
    return { kind: "permanent" };
  }

  // 2. Rate limit
  if (isRateLimit) {
    if (retryAfterMs != null && retryAfterMs > 0) {
      return { kind: "rate-limit", retryAfterMs };
    }
    const resetMatch = errorMsg.match(RESET_DELAY_RE);
    const delayMs = resetMatch ? Number(resetMatch[1]) * 1000 : 60_000;
    return { kind: "rate-limit", retryAfterMs: delayMs };
  }
  // 3. Network errors — same-model retry candidate
  if (NETWORK_RE.test(errorMsg)) {
    // Permanent overlaps (e.g. "billing" appearing alongside "network") were
    // already handled in step 1, so anything reaching here is safe to retry.
    return { kind: "network", retryAfterMs: retryAfterMs ?? 3_000 };
  }

  // 4. Server errors — try fallback model
  if (SERVER_RE.test(errorMsg)) {
    return { kind: "server", retryAfterMs: retryAfterMs ?? 30_000 };
  }

  // 5. Connection errors — try fallback model
  if (CONNECTION_RE.test(errorMsg)) {
    return { kind: "connection", retryAfterMs: retryAfterMs ?? 15_000 };
  }

  // 6. Stream truncation — downstream symptom of connection drop
  if (STREAM_RE.test(errorMsg)) {
    return { kind: "stream", retryAfterMs: retryAfterMs ?? 15_000 };
  }

  // 7. Unknown
  return { kind: "unknown" };
}

// ── Helpers ─────────────────────────────────────────────────────────────────

/** Returns true for all transient (auto-resumable) error kinds. */
export function isTransient(cls: ErrorClass): boolean {
  switch (cls.kind) {
    case "network":
    case "rate-limit":
    case "server":
    case "stream":
    case "connection":
      return true;
    default:
      return false;
  }
}

/**
 * Backward-compatible thin wrapper.
 *
 * Returns true when the error is a transient *network* error specifically
 * (worth retrying the same model). Permanent signals (auth, billing, quota)
 * cause this to return false even if a network keyword is present.
 */
export function isTransientNetworkError(errorMsg: string): boolean {
  if (!errorMsg) return false;
  const cls = classifyError(errorMsg);
  return cls.kind === "network";
}
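A quick usage sketch (not part of the diff) showing how the classification order above plays out on messages taken from the tests further down. Note that "connection reset by peer" hits NETWORK_RE before CONNECTION_RE, while "terminated" falls through to CONNECTION_RE:

import { classifyError, isTransient } from "./error-classifier.js";

classifyError("HTTP 429 Too Many Requests");        // { kind: "rate-limit", retryAfterMs: 60000 }
classifyError("rate limit exceeded, reset in 45s"); // { kind: "rate-limit", retryAfterMs: 45000 }
classifyError("connection reset by peer");          // { kind: "network", retryAfterMs: 3000 } (NETWORK_RE wins)
classifyError("terminated");                        // { kind: "connection", retryAfterMs: 15000 }
classifyError("unauthorized: invalid API key");     // { kind: "permanent" }
classifyError("429 unauthorized rate limit");       // { kind: "rate-limit", ... } (rate limit beats permanent)

isTransient(classifyError("Unexpected end of JSON input")); // true (stream)
isTransient(classifyError("something went wrong"));         // false (unknown)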
src/resources/extensions/gsd/preferences-models.ts

@@ -125,18 +125,6 @@ export function getNextFallbackModel(
  }
}

/**
 * Detect whether an error message indicates a transient network error
 * (worth retrying the same model) vs a permanent provider error
 * (auth failure, quota exceeded, etc. -- should fall back immediately).
 */
export function isTransientNetworkError(errorMsg: string): boolean {
  if (!errorMsg) return false;
  const hasNetworkSignal = /network|ECONNRESET|ETIMEDOUT|ECONNREFUSED|socket hang up|fetch failed|connection.*reset|dns/i.test(errorMsg);
  const hasPermanentSignal = /auth|unauthorized|forbidden|invalid.*key|quota|billing/i.test(errorMsg);
  return hasNetworkSignal && !hasPermanentSignal;
}

/**
 * Validate a model ID string.
 * Returns true if the ID looks like a valid model identifier.
@@ -68,7 +68,6 @@ export {
  resolveModelForUnit,
  resolveModelWithFallbacksForUnit,
  getNextFallbackModel,
  isTransientNetworkError,
  validateModelId,
  updatePreferencesModels,
  resolveDynamicRoutingConfig,
src/resources/extensions/gsd/provider-error-pause.ts

@@ -2,63 +2,6 @@ export type ProviderErrorPauseUI = {
  notify(message: string, level?: "info" | "warning" | "error" | "success"): void;
};

/**
 * Classify a provider error as transient (auto-resume) or permanent (manual resume).
 *
 * Transient: rate limits, server errors (500/502/503), overloaded, internal errors.
 * These are expected to self-resolve and should auto-resume after a delay.
 *
 * Permanent: auth errors, invalid API key, billing issues.
 * These require user intervention and should pause indefinitely.
 */
export function classifyProviderError(errorMsg: string): {
  isTransient: boolean;
  isRateLimit: boolean;
  suggestedDelayMs: number;
} {
  const isRateLimit = /rate.?limit|too many requests|429/i.test(errorMsg);
  const isServerError = /internal server error|500|502|503|overloaded|server_error|api_error|service.?unavailable/i.test(errorMsg);

  // Connection/process errors — transient, auto-resume after brief backoff (#2309).
  // These indicate the process was killed, the connection was reset, or a network
  // blip occurred. They are NOT permanent failures.
  const isConnectionError = /terminated|connection.?reset|connection.?refused|other side closed|fetch failed|network.?(?:is\s+)?unavailable|ECONNREFUSED|ECONNRESET|EPIPE/i.test(errorMsg);

  // Permanent errors — never auto-resume
  const isPermanent = /auth|unauthorized|forbidden|invalid.*key|invalid.*api|billing|quota exceeded|account/i.test(errorMsg);

  if (isPermanent && !isRateLimit) {
    return { isTransient: false, isRateLimit: false, suggestedDelayMs: 0 };
  }

  if (isRateLimit) {
    // Try to extract retry-after from the message
    const resetMatch = errorMsg.match(/reset in (\d+)s/i);
    const delayMs = resetMatch ? Number(resetMatch[1]) * 1000 : 60_000; // default 60s for rate limits
    return { isTransient: true, isRateLimit: true, suggestedDelayMs: delayMs };
  }

  if (isServerError) {
    return { isTransient: true, isRateLimit: false, suggestedDelayMs: 30_000 }; // 30s for server errors
  }

  if (isConnectionError) {
    return { isTransient: true, isRateLimit: false, suggestedDelayMs: 15_000 }; // 15s for connection errors
  }

  // Stream-truncation JSON parse errors — transient (#2572).
  // When the API stream is cut mid-chunk, pi tries to reassemble the partial
  // tool-call JSON and gets a SyntaxError. This is the downstream symptom of
  // a connection drop — same root cause as ECONNRESET, one layer up.
  const isMalformedStream = /Unexpected end of JSON|Unexpected token.*JSON|Expected double-quoted property name|SyntaxError.*JSON/i.test(errorMsg);
  if (isMalformedStream) {
    return { isTransient: true, isRateLimit: false, suggestedDelayMs: 15_000 }; // 15s, same as connection errors
  }

  // Unknown error — treat as permanent (user reviews)
  return { isTransient: false, isRateLimit: false, suggestedDelayMs: 0 };
}

/**
 * Pause auto-mode due to a provider error.
 *
@@ -1,6 +1,6 @@
/**
 * Provider error handling tests — consolidated from:
 * - provider-error-classify.test.ts (classifyProviderError)
 * - provider-error-classify.test.ts (classifyError)
 * - network-error-fallback.test.ts (isTransientNetworkError, getNextFallbackModel)
 * - agent-end-provider-error.test.ts (pauseAutoForProviderError)
 */
@@ -10,102 +10,104 @@ import assert from "node:assert/strict";
import { readFileSync } from "node:fs";
import { join, dirname } from "node:path";
import { fileURLToPath } from "node:url";
import { classifyProviderError, pauseAutoForProviderError } from "../provider-error-pause.ts";
import { getNextFallbackModel, isTransientNetworkError } from "../preferences.ts";
import { classifyError, isTransient, isTransientNetworkError } from "../error-classifier.ts";
import { pauseAutoForProviderError } from "../provider-error-pause.ts";
import { getNextFallbackModel } from "../preferences.ts";

const __dirname = dirname(fileURLToPath(import.meta.url));

// ── classifyProviderError ────────────────────────────────────────────────────
// ── classifyError ────────────────────────────────────────────────────────────

test("classifyProviderError detects rate limit from 429", () => {
  const result = classifyProviderError("HTTP 429 Too Many Requests");
  assert.ok(result.isTransient);
  assert.ok(result.isRateLimit);
  assert.ok(result.suggestedDelayMs > 0);
test("classifyError detects rate limit from 429", () => {
  const result = classifyError("HTTP 429 Too Many Requests");
  assert.ok(isTransient(result));
  assert.equal(result.kind, "rate-limit");
  assert.ok("retryAfterMs" in result && result.retryAfterMs > 0);
});

test("classifyProviderError detects rate limit from message", () => {
  const result = classifyProviderError("rate limit exceeded");
  assert.ok(result.isTransient);
  assert.ok(result.isRateLimit);
test("classifyError detects rate limit from message", () => {
  const result = classifyError("rate limit exceeded");
  assert.ok(isTransient(result));
  assert.equal(result.kind, "rate-limit");
});

test("classifyProviderError extracts reset delay from message", () => {
  const result = classifyProviderError("rate limit exceeded, reset in 45s");
  assert.ok(result.isRateLimit);
  assert.equal(result.suggestedDelayMs, 45000);
test("classifyError extracts reset delay from message", () => {
  const result = classifyError("rate limit exceeded, reset in 45s");
  assert.equal(result.kind, "rate-limit");
  assert.ok("retryAfterMs" in result && result.retryAfterMs === 45000);
});

test("classifyProviderError defaults to 60s for rate limit without reset", () => {
  const result = classifyProviderError("429 too many requests");
  assert.ok(result.isRateLimit);
  assert.equal(result.suggestedDelayMs, 60_000);
test("classifyError defaults to 60s for rate limit without reset", () => {
  const result = classifyError("429 too many requests");
  assert.equal(result.kind, "rate-limit");
  assert.ok("retryAfterMs" in result && result.retryAfterMs === 60_000);
});

test("classifyProviderError detects Anthropic internal server error", () => {
test("classifyError detects Anthropic internal server error", () => {
  const msg = '{"type":"error","error":{"details":null,"type":"api_error","message":"Internal server error"}}';
  const result = classifyProviderError(msg);
  assert.ok(result.isTransient);
  assert.ok(!result.isRateLimit);
  assert.equal(result.suggestedDelayMs, 30_000);
  const result = classifyError(msg);
  assert.ok(isTransient(result));
  assert.equal(result.kind, "server");
  assert.ok("retryAfterMs" in result && result.retryAfterMs === 30_000);
});

test("classifyProviderError detects Codex server_error from extracted message", () => {
test("classifyError detects Codex server_error from extracted message", () => {
  // After fix, mapCodexEvents extracts the nested error type and produces
  // "Codex server_error: <message>" instead of raw JSON.
  const msg = "Codex server_error: An error occurred while processing your request.";
  const result = classifyProviderError(msg);
  assert.ok(result.isTransient);
  assert.ok(!result.isRateLimit);
  assert.equal(result.suggestedDelayMs, 30_000);
  const result = classifyError(msg);
  assert.ok(isTransient(result));
  assert.equal(result.kind, "server");
  assert.ok("retryAfterMs" in result && result.retryAfterMs === 30_000);
});

test("classifyProviderError detects overloaded error", () => {
  const result = classifyProviderError("overloaded_error: Overloaded");
  assert.ok(result.isTransient);
  assert.equal(result.suggestedDelayMs, 30_000);
test("classifyError detects overloaded error", () => {
  const result = classifyError("overloaded_error: Overloaded");
  assert.ok(isTransient(result));
  assert.ok("retryAfterMs" in result && result.retryAfterMs === 30_000);
});

test("classifyProviderError detects 503 service unavailable", () => {
  const result = classifyProviderError("HTTP 503 Service Unavailable");
  assert.ok(result.isTransient);
test("classifyError detects 503 service unavailable", () => {
  const result = classifyError("HTTP 503 Service Unavailable");
  assert.ok(isTransient(result));
});

test("classifyProviderError detects 502 bad gateway", () => {
  const result = classifyProviderError("HTTP 502 Bad Gateway");
  assert.ok(result.isTransient);
test("classifyError detects 502 bad gateway", () => {
  const result = classifyError("HTTP 502 Bad Gateway");
  assert.ok(isTransient(result));
});

test("classifyProviderError detects auth error as permanent", () => {
  const result = classifyProviderError("unauthorized: invalid API key");
  assert.ok(!result.isTransient);
  assert.ok(!result.isRateLimit);
test("classifyError detects auth error as permanent", () => {
  const result = classifyError("unauthorized: invalid API key");
  assert.ok(!isTransient(result));
  assert.equal(result.kind, "permanent");
});

test("classifyProviderError detects billing error as permanent", () => {
  const result = classifyProviderError("billing issue: payment required");
  assert.ok(!result.isTransient);
test("classifyError detects billing error as permanent", () => {
  const result = classifyError("billing issue: payment required");
  assert.ok(!isTransient(result));
});

test("classifyProviderError detects quota exceeded as permanent", () => {
  const result = classifyProviderError("quota exceeded for this month");
  assert.ok(!result.isTransient);
test("classifyError detects quota exceeded as permanent", () => {
  const result = classifyError("quota exceeded for this month");
  assert.ok(!isTransient(result));
});

test("classifyProviderError treats unknown error as permanent", () => {
  const result = classifyProviderError("something went wrong");
  assert.ok(!result.isTransient);
test("classifyError treats unknown error as not transient", () => {
  const result = classifyError("something went wrong");
  assert.ok(!isTransient(result));
  assert.equal(result.kind, "unknown");
});

test("classifyProviderError treats empty string as permanent", () => {
  const result = classifyProviderError("");
  assert.ok(!result.isTransient);
test("classifyError treats empty string as not transient", () => {
  const result = classifyError("");
  assert.ok(!isTransient(result));
});

test("classifyProviderError: rate limit takes precedence over auth keywords", () => {
  const result = classifyProviderError("429 unauthorized rate limit");
  assert.ok(result.isRateLimit);
  assert.ok(result.isTransient);
test("classifyError: rate limit takes precedence over auth keywords", () => {
  const result = classifyError("429 unauthorized rate limit");
  assert.equal(result.kind, "rate-limit");
  assert.ok(isTransient(result));
});

// ── isTransientNetworkError ──────────────────────────────────────────────────
@@ -265,8 +267,8 @@ test("agent-end-recovery.ts tracks consecutive transient errors for escalating b
  const src = readFileSync(join(__dirname, "..", "bootstrap", "agent-end-recovery.ts"), "utf-8");

  assert.ok(
    src.includes("consecutiveTransientErrors"),
    "agent-end-recovery.ts must track consecutiveTransientErrors for escalating backoff (#1166)",
    src.includes("consecutiveTransientCount"),
    "agent-end-recovery.ts must track consecutiveTransientCount for escalating backoff (#1166)",
  );
  assert.ok(
    src.includes("MAX_TRANSIENT_AUTO_RESUMES"),
@@ -274,15 +276,13 @@ test("agent-end-recovery.ts tracks consecutive transient errors for escalating b
  );
});

test("agent-end-recovery.ts resets consecutive transient error counter on success", () => {
test("agent-end-recovery.ts resets retry state before resolveAgentEnd on success", () => {
  const src = readFileSync(join(__dirname, "..", "bootstrap", "agent-end-recovery.ts"), "utf-8");

  // After successful agent_end (before resolveAgentEnd), the counter must be reset.
  // Use a regex across the success block so CRLF checkouts on Windows do not
  // push the reset line outside a fixed substring window.
  // After successful agent_end, resetRetryState must be called before resolveAgentEnd.
  assert.ok(
    /consecutiveTransientErrors\s*=\s*0\s*;[\s\S]{0,250}resolveAgentEnd/.test(src),
    "consecutive transient error counter must be reset before resolveAgentEnd on the success path (#1166)",
    /resetRetryState[\s\S]{0,250}resolveAgentEnd/.test(src),
    "resetRetryState must be called before resolveAgentEnd on the success path (#1166)",
  );
});

@@ -291,7 +291,7 @@ test("agent-end-recovery.ts applies escalating delay for repeated transient erro

  // Must contain the exponential backoff formula (may span multiple lines)
  assert.ok(
    src.includes("2 ** Math.max(0, consecutiveTransientErrors"),
    src.includes("2 ** Math.max(0, retryState.consecutiveTransientCount"),
    "agent-end-recovery.ts must escalate retryAfterMs exponentially for consecutive transient errors (#1166)",
  );
});
@@ -1,73 +1,86 @@
/**
 * terminated-transient.test.ts — Regression test for #2309.
 *
 * classifyProviderError should treat 'terminated' errors (process killed,
 * classifyError should treat 'terminated' errors (process killed,
 * connection reset) as transient with auto-resume, not permanent.
 */

import test from "node:test";
import assert from "node:assert/strict";
import { classifyProviderError } from "../provider-error-pause.ts";
import { classifyError, isTransient } from "../error-classifier.ts";

test("#2309: 'terminated' errors should be classified as transient", () => {
  const result = classifyProviderError("terminated");
  assert.equal(result.isTransient, true, "'terminated' should be transient");
  assert.equal(result.isRateLimit, false, "'terminated' is not a rate limit");
  assert.ok(result.suggestedDelayMs > 0, "'terminated' should have a retry delay");
  const result = classifyError("terminated");
  assert.equal(isTransient(result), true, "'terminated' should be transient");
  assert.equal(result.kind, "connection", "'terminated' matches connection");
  assert.equal(result.kind !== "rate-limit", true, "'terminated' is not a rate limit");
  assert.ok("retryAfterMs" in result && result.retryAfterMs > 0, "'terminated' should have a retry delay");
  assert.equal("retryAfterMs" in result && result.retryAfterMs, 15_000, "'terminated' should use 15s backoff");
});

test("#2309: 'connection reset' errors should be classified as transient", () => {
  const result = classifyProviderError("connection reset by peer");
  assert.equal(result.isTransient, true, "'connection reset' should be transient");
test("#2309: 'connection reset by peer' errors should be classified as transient (network)", () => {
  const result = classifyError("connection reset by peer");
  assert.equal(isTransient(result), true, "'connection reset by peer' should be transient");
  assert.equal(result.kind, "network", "'connection reset by peer' matches NETWORK_RE (connection.*reset) before CONNECTION_RE");
  assert.equal("retryAfterMs" in result && result.retryAfterMs, 3_000, "network errors use 3s backoff");
});

test("#2309: 'other side closed' errors should be classified as transient", () => {
  const result = classifyProviderError("other side closed the connection");
  assert.equal(result.isTransient, true, "'other side closed' should be transient");
  const result = classifyError("other side closed the connection");
  assert.equal(isTransient(result), true, "'other side closed' should be transient");
  assert.equal(result.kind, "connection", "'other side closed' matches CONNECTION_RE");
});

test("#2309: 'fetch failed' errors should be classified as transient", () => {
  const result = classifyProviderError("fetch failed: network error");
  assert.equal(result.isTransient, true, "'fetch failed' should be transient");
  const result = classifyError("fetch failed: network error");
  assert.equal(isTransient(result), true, "'fetch failed' should be transient");
  assert.equal(result.kind, "network", "'fetch failed' matches NETWORK_RE");
  assert.equal("retryAfterMs" in result && result.retryAfterMs, 3_000, "network errors use 3s backoff");
});

test("#2309: 'connection refused' errors should be classified as transient", () => {
  const result = classifyProviderError("ECONNREFUSED: connection refused");
  assert.equal(result.isTransient, true, "'connection refused' should be transient");
  const result = classifyError("ECONNREFUSED: connection refused");
  assert.equal(isTransient(result), true, "'connection refused' should be transient");
  assert.equal(result.kind, "network", "'ECONNREFUSED' matches NETWORK_RE (same-model retry)");
});

test("#2309: permanent errors are still permanent", () => {
  const authResult = classifyProviderError("unauthorized: invalid API key");
  assert.equal(authResult.isTransient, false, "auth errors should stay permanent");
  assert.equal(authResult.suggestedDelayMs, 0, "permanent errors have no delay");
  const authResult = classifyError("unauthorized: invalid API key");
  assert.equal(isTransient(authResult), false, "auth errors should stay permanent");
  assert.equal(authResult.kind, "permanent", "auth errors are permanent");
  assert.equal("retryAfterMs" in authResult, false, "permanent errors have no retryAfterMs");
});

test("#2309: rate limits are still transient", () => {
  const rlResult = classifyProviderError("rate limit exceeded (429)");
  assert.equal(rlResult.isTransient, true, "rate limits are still transient");
  assert.equal(rlResult.isRateLimit, true, "rate limits are flagged as rate limits");
  const rlResult = classifyError("rate limit exceeded (429)");
  assert.equal(isTransient(rlResult), true, "rate limits are still transient");
  assert.equal(rlResult.kind, "rate-limit", "rate limits are flagged as rate-limit kind");
});

// --- #2572: stream-truncation JSON parse errors should be transient ---

test("#2572: 'Expected double-quoted property name' (truncated stream) is transient", () => {
  const result = classifyProviderError("Expected double-quoted property name in JSON at position 23 (line 1 column 24)");
  assert.equal(result.isTransient, true, "truncated-stream JSON parse error should be transient");
  assert.equal(result.isRateLimit, false, "not a rate limit");
  assert.equal(result.suggestedDelayMs, 15_000, "should use 15s backoff like connection errors");
  const result = classifyError("Expected double-quoted property name in JSON at position 23 (line 1 column 24)");
  assert.equal(isTransient(result), true, "truncated-stream JSON parse error should be transient");
  assert.equal(result.kind, "stream", "JSON parse errors are stream kind");
  assert.equal(result.kind !== "rate-limit", true, "not a rate limit");
  assert.equal("retryAfterMs" in result && result.retryAfterMs, 15_000, "should use 15s backoff");
});

test("#2572: 'Unexpected end of JSON input' (truncated stream) is transient", () => {
  const result = classifyProviderError("Unexpected end of JSON input");
  assert.equal(result.isTransient, true, "'Unexpected end of JSON input' should be transient");
  const result = classifyError("Unexpected end of JSON input");
  assert.equal(isTransient(result), true, "'Unexpected end of JSON input' should be transient");
  assert.equal(result.kind, "stream", "JSON parse errors are stream kind");
});

test("#2572: 'Unexpected token' in JSON (truncated stream) is transient", () => {
  const result = classifyProviderError("Unexpected token < in JSON at position 0");
  assert.equal(result.isTransient, true, "'Unexpected token in JSON' should be transient");
  const result = classifyError("Unexpected token < in JSON at position 0");
  assert.equal(isTransient(result), true, "'Unexpected token in JSON' should be transient");
  assert.equal(result.kind, "stream", "JSON parse errors are stream kind");
});

test("#2572: 'SyntaxError' with JSON context (truncated stream) is transient", () => {
  const result = classifyProviderError("SyntaxError: JSON.parse: unexpected character at line 1 column 1");
  assert.equal(result.isTransient, true, "'SyntaxError...JSON' should be transient");
  const result = classifyError("SyntaxError: JSON.parse: unexpected character at line 1 column 1");
  assert.equal(isTransient(result), true, "'SyntaxError...JSON' should be transient");
  assert.equal(result.kind, "stream", "JSON parse errors are stream kind");
});