diff --git a/packages/pi-ai/src/providers/openai-codex-responses.ts b/packages/pi-ai/src/providers/openai-codex-responses.ts
index 5bce50d0f..3a93e9fa0 100644
--- a/packages/pi-ai/src/providers/openai-codex-responses.ts
+++ b/packages/pi-ai/src/providers/openai-codex-responses.ts
@@ -368,9 +368,14 @@ async function* mapCodexEvents(events: AsyncIterable>):
     if (!type) continue;
 
     if (type === "error") {
-      const code = (event as { code?: string }).code || "";
-      const message = (event as { message?: string }).message || "";
-      throw new Error(`Codex error: ${message || code || JSON.stringify(event)}`);
+      // Codex error events nest details under event.error (e.g.
+      // { type: "error", error: { type: "server_error", code: "server_error", message: "..." } })
+      const errorObj = (event as { error?: { code?: string; type?: string; message?: string } }).error;
+      const code = errorObj?.code || (event as { code?: string }).code || "";
+      const errorType = errorObj?.type || "";
+      const message = errorObj?.message || (event as { message?: string }).message || "";
+      const prefix = errorType ? `Codex ${errorType}` : "Codex error";
+      throw new Error(`${prefix}: ${message || code || JSON.stringify(event)}`);
     }
 
     if (type === "response.failed") {
diff --git a/packages/pi-coding-agent/src/core/agent-session.ts b/packages/pi-coding-agent/src/core/agent-session.ts
index b33b165a0..60f35274d 100644
--- a/packages/pi-coding-agent/src/core/agent-session.ts
+++ b/packages/pi-coding-agent/src/core/agent-session.ts
@@ -2427,7 +2427,7 @@ export class AgentSession {
     const err = message.errorMessage;
     // Match: overloaded_error, rate limit, 429, 500, 502, 503, 504, service unavailable, connection errors, fetch failed, terminated, retry delay exceeded, network unavailable / auth expired (transient network failures)
-    return /overloaded|rate.?limit|too many requests|429|500|502|503|504|service.?unavailable|server error|internal error|connection.?error|connection.?refused|other side closed|fetch failed|upstream.?connect|reset before headers|terminated|retry delay|network.?(?:is\s+)?unavailable|credentials.*expired|temporarily backed off/i.test(
+    return /overloaded|rate.?limit|too many requests|429|500|502|503|504|service.?unavailable|server.?error|internal.?error|connection.?error|connection.?refused|other side closed|fetch failed|upstream.?connect|reset before headers|terminated|retry delay|network.?(?:is\s+)?unavailable|credentials.*expired|temporarily backed off/i.test(
       err,
     );
   }
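Review note: a minimal, self-contained sketch of the two error-event shapes the patched mapCodexEvents must handle. The payload values here are illustrative assumptions (only the field names come from the diff above), but the fallback chain mirrors the patch: prefer the nested error object, fall back to the flat top-level fields, then to raw JSON.

    // Hypothetical event payloads; values are invented for illustration.
    const nested = {
      type: "error",
      error: { type: "server_error", code: "server_error", message: "An error occurred while processing your request." },
    };
    const flat = { type: "error", code: "overloaded", message: "Overloaded" };

    // Same fallback chain as the patched error branch in mapCodexEvents.
    function formatCodexError(event: {
      error?: { code?: string; type?: string; message?: string };
      code?: string;
      message?: string;
    }): string {
      const errorObj = event.error;
      const code = errorObj?.code || event.code || "";
      const errorType = errorObj?.type || "";
      const message = errorObj?.message || event.message || "";
      const prefix = errorType ? `Codex ${errorType}` : "Codex error";
      return `${prefix}: ${message || code || JSON.stringify(event)}`;
    }

    console.log(formatCodexError(nested)); // Codex server_error: An error occurred while processing your request.
    console.log(formatCodexError(flat)); // Codex error: Overloaded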
diff --git a/src/resources/extensions/gsd/index.ts b/src/resources/extensions/gsd/index.ts
index 4c2b47889..d5ce75951 100644
--- a/src/resources/extensions/gsd/index.ts
+++ b/src/resources/extensions/gsd/index.ts
@@ -100,6 +100,13 @@ let depthVerificationDone = false;
 // Cleared when a model switch occurs or retries are exhausted.
 const networkRetryCounters = new Map();
 
+// ── Transient error escalation ───────────────────────────────────────────
+// Tracks consecutive transient auto-resume attempts. Each attempt doubles
+// the delay. After MAX_TRANSIENT_AUTO_RESUMES consecutive failures, auto-mode
+// pauses indefinitely to avoid infinite rapid-fire retries (#1166).
+const MAX_TRANSIENT_AUTO_RESUMES = 5;
+let consecutiveTransientErrors = 0;
+
 export function isDepthVerified(): boolean {
   return depthVerificationDone;
 }
@@ -866,11 +873,31 @@ export default function (pi: ExtensionAPI) {
     const explicitRetryAfterMs = ("retryAfterMs" in lastMsg && typeof lastMsg.retryAfterMs === "number") ?
       lastMsg.retryAfterMs : undefined;
-    const retryAfterMs = explicitRetryAfterMs ?? classification.suggestedDelayMs;
+    let retryAfterMs = explicitRetryAfterMs ?? classification.suggestedDelayMs;
+
+    // ── Escalating backoff for repeated transient errors ──────────────
+    // Each consecutive transient auto-resume doubles the delay. After
+    // MAX_TRANSIENT_AUTO_RESUMES consecutive failures, treat as permanent
+    // to avoid infinite rapid-fire retries (#1166).
+    let effectiveTransient = classification.isTransient;
+    if (classification.isTransient) {
+      consecutiveTransientErrors++;
+      if (consecutiveTransientErrors > MAX_TRANSIENT_AUTO_RESUMES) {
+        effectiveTransient = false;
+        ctx.ui.notify(
+          `${consecutiveTransientErrors} consecutive transient errors. Pausing indefinitely — resume manually with /gsd auto.`,
+          "error",
+        );
+        consecutiveTransientErrors = 0;
+      } else {
+        // Escalate: base delay × 2^(consecutive-1) → 30s, 60s, 120s, 240s, 480s
+        retryAfterMs = retryAfterMs * 2 ** (consecutiveTransientErrors - 1);
+      }
+    }
     await pauseAutoForProviderError(ctx.ui, errorDetail, () => pauseAuto(ctx, pi), {
       isRateLimit: classification.isRateLimit,
-      isTransient: classification.isTransient,
+      isTransient: effectiveTransient,
       retryAfterMs,
       resume: () => {
         pi.sendMessage(
@@ -884,6 +911,7 @@ export default function (pi: ExtensionAPI) {
   try {
     networkRetryCounters.clear(); // Clear network retry state on successful unit completion
+    consecutiveTransientErrors = 0; // Reset escalating backoff on success
     await handleAgentEnd(ctx, pi);
   } catch (err) {
     // Safety net: if handleAgentEnd throws despite its internal try-catch,
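Review note: a small sketch to make the escalation arithmetic concrete, assuming the 30_000 ms base delay that classifyProviderError suggests for transient errors elsewhere in this PR (an explicit retryAfterMs from the provider would shift the whole schedule). Attempt numbers are the value of consecutiveTransientErrors after the increment.

    const BASE_DELAY_MS = 30_000;
    const MAX_TRANSIENT_AUTO_RESUMES = 5;
    for (let attempt = 1; attempt <= MAX_TRANSIENT_AUTO_RESUMES + 1; attempt++) {
      if (attempt > MAX_TRANSIENT_AUTO_RESUMES) {
        // Sixth consecutive failure: treated as permanent, auto-mode pauses.
        console.log(`attempt ${attempt}: pause indefinitely`);
      } else {
        // 30s, 60s, 120s, 240s, 480s
        console.log(`attempt ${attempt}: retry after ${(BASE_DELAY_MS * 2 ** (attempt - 1)) / 1000}s`);
      }
    }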
+ const msg = "Codex server_error: An error occurred while processing your request."; + const result = classifyProviderError(msg); + assert.ok(result.isTransient); + assert.ok(!result.isRateLimit); + assert.equal(result.suggestedDelayMs, 30_000); +}); + test("classifyProviderError detects overloaded error", () => { const result = classifyProviderError("overloaded_error: Overloaded"); assert.ok(result.isTransient); @@ -243,3 +258,81 @@ test("pauseAutoForProviderError falls back to indefinite pause when not rate lim { message: "Auto-mode paused due to provider error: connection refused", level: "warning" }, ]); }); + +// ── Escalating backoff for transient errors (#1166) ───────────────────────── + +test("index.ts tracks consecutive transient errors for escalating backoff", () => { + const indexSource = readFileSync(join(__dirname, "..", "index.ts"), "utf-8"); + + assert.ok( + indexSource.includes("consecutiveTransientErrors"), + "index.ts must track consecutiveTransientErrors for escalating backoff (#1166)", + ); + assert.ok( + indexSource.includes("MAX_TRANSIENT_AUTO_RESUMES"), + "index.ts must define MAX_TRANSIENT_AUTO_RESUMES to cap infinite retries (#1166)", + ); +}); + +test("index.ts resets consecutive transient error counter on success", () => { + const indexSource = readFileSync(join(__dirname, "..", "index.ts"), "utf-8"); + + // After successful unit completion, the counter must be reset + const marker = "successful unit completion"; + const successSection = indexSource.indexOf(marker); + assert.ok(successSection > -1, "must have success section that clears network retries"); + const nearbyCode = indexSource.slice(Math.max(0, successSection - 100), successSection + 200); + assert.ok( + nearbyCode.includes("consecutiveTransientErrors = 0"), + "consecutive transient error counter must be reset on successful unit completion (#1166)", + ); +}); + +test("index.ts applies escalating delay for repeated transient errors", () => { + const indexSource = readFileSync(join(__dirname, "..", "index.ts"), "utf-8"); + + // Must contain the exponential backoff formula + assert.ok( + /retryAfterMs\s*[=*].*2\s*\*\*/.test(indexSource), + "index.ts must escalate retryAfterMs exponentially for consecutive transient errors (#1166)", + ); +}); + +// ── Codex error extraction (#1166) ────────────────────────────────────────── + +test("openai-codex-responses.ts extracts nested error fields", () => { + const codexSource = readFileSync( + join(__dirname, "../../../../../packages/pi-ai/src/providers/openai-codex-responses.ts"), + "utf-8", + ); + + // Must access event.error.message (nested), not just event.message (top-level) + assert.ok( + codexSource.includes("errorObj?.message"), + "mapCodexEvents must extract message from nested event.error object (#1166)", + ); + assert.ok( + codexSource.includes("errorObj?.type"), + "mapCodexEvents must extract type from nested event.error object (#1166)", + ); +}); + +// ── agent-session retryable regex handles server_error (#1166) ────────────── + +test("agent-session retryable error regex matches server_error (underscore)", () => { + // This regex is extracted from _isRetryableError in agent-session.ts. + // It must match both "server error" (space) and "server_error" (underscore) + // to properly classify Codex streaming errors as retryable. 
+
+// ── agent-session retryable regex handles server_error (#1166) ──────────────
+
+test("agent-session retryable error regex matches server_error (underscore)", () => {
+  // This regex is extracted from _isRetryableError in agent-session.ts.
+  // It must match both "server error" (space) and "server_error" (underscore)
+  // to properly classify Codex streaming errors as retryable.
+  const retryableRegex = /overloaded|rate.?limit|too many requests|429|500|502|503|504|service.?unavailable|server.?error|internal.?error|connection.?error|connection.?refused|other side closed|fetch failed|upstream.?connect|reset before headers|terminated|retry delay|network.?(?:is\s+)?unavailable|credentials.*expired|temporarily backed off/i;
+
+  // server_error (with underscore — Codex streaming error format)
+  assert.ok(retryableRegex.test("Codex server_error: An error occurred"));
+  // server error (with space — traditional HTTP error format)
+  assert.ok(retryableRegex.test("server error occurred"));
+  // internal_error (with underscore)
+  assert.ok(retryableRegex.test("internal_error: something went wrong"));
+  // internal error (with space)
+  assert.ok(retryableRegex.test("internal error"));
+  // non-retryable errors must not match
+  assert.ok(!retryableRegex.test("model not found"));
+});
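Review note: the whole agent-session change is the two `.?` wildcards (server.?error, internal.?error). Since `.?` matches zero or one of any character, one pattern now covers the space, underscore, hyphen, and fused spellings, which is what the test above relies on:

    const fragment = /server.?error/i;
    for (const s of ["server error", "server_error", "server-error", "ServerError"]) {
      console.log(`${s}: ${fragment.test(s)}`); // all true
    }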