Fix Codex server_error handling: extraction, retry matching, escalating backoff (#1106)

* Initial plan

* Fix OpenAI Codex error handling: proper error extraction, retry matching, and escalating backoff

- Fix mapCodexEvents to extract error details from nested event.error object
  (Codex API returns {type:"error", error:{type:"server_error", message:"..."}})
- Fix _isRetryableError regex: server.?error matches both server_error and server error
- Add escalating backoff for repeated transient auto-resumes (30s→60s→120s→240s→480s; see the sketch after this list)
- Cap consecutive transient auto-resumes at 5 before pausing indefinitely
- Reset counter on successful unit completion
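
For illustration, the resulting delay schedule (a sketch, not part of the diff; assumes the
30s base delay that classifyProviderError suggests for these errors, per the tests below):

    // delay before the Nth consecutive transient auto-resume, N = 1..5
    const delayMs = (n: number): number => 30_000 * 2 ** (n - 1);
    // delayMs(1)..delayMs(5) → 30_000, 60_000, 120_000, 240_000, 480_000; a 6th failure pauses indefinitely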

Co-authored-by: glittercowboy <186001655+glittercowboy@users.noreply.github.com>

* Improve regex test to be behavioral instead of structural

Co-authored-by: glittercowboy <186001655+glittercowboy@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: glittercowboy <186001655+glittercowboy@users.noreply.github.com>
Copilot 2026-03-18 01:10:55 -06:00 committed by GitHub
parent 51da6c4f74
commit 1020c140af
4 changed files with 132 additions and 6 deletions


@@ -368,9 +368,14 @@ async function* mapCodexEvents(events: AsyncIterable<Record<string, unknown>>):
     if (!type) continue;
     if (type === "error") {
-      const code = (event as { code?: string }).code || "";
-      const message = (event as { message?: string }).message || "";
-      throw new Error(`Codex error: ${message || code || JSON.stringify(event)}`);
+      // Codex error events nest details under event.error (e.g.
+      // { type: "error", error: { type: "server_error", code: "server_error", message: "..." } })
+      const errorObj = (event as { error?: { code?: string; type?: string; message?: string } }).error;
+      const code = errorObj?.code || (event as { code?: string }).code || "";
+      const errorType = errorObj?.type || "";
+      const message = errorObj?.message || (event as { message?: string }).message || "";
+      const prefix = errorType ? `Codex ${errorType}` : "Codex error";
+      throw new Error(`${prefix}: ${message || code || JSON.stringify(event)}`);
     }
     if (type === "response.failed") {

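For reference, a sketch of how the new extraction behaves on a typical payload (the sample
event is illustrative, not a captured API response):

    const event = {
      type: "error",
      error: { type: "server_error", message: "An error occurred while processing your request." },
    };
    // Before: event.message and event.code are undefined, so the old code threw
    //   Codex error: {"type":"error","error":{...}}
    // After: the nested fields are found, so the new code throws
    //   Codex server_error: An error occurred while processing your request.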

@@ -2427,7 +2427,7 @@ export class AgentSession {
     const err = message.errorMessage;
     // Match: overloaded_error, rate limit, 429, 500, 502, 503, 504, service unavailable, connection errors, fetch failed, terminated, retry delay exceeded, network unavailable / auth expired (transient network failures)
-    return /overloaded|rate.?limit|too many requests|429|500|502|503|504|service.?unavailable|server error|internal error|connection.?error|connection.?refused|other side closed|fetch failed|upstream.?connect|reset before headers|terminated|retry delay|network.?(?:is\s+)?unavailable|credentials.*expired|temporarily backed off/i.test(
+    return /overloaded|rate.?limit|too many requests|429|500|502|503|504|service.?unavailable|server.?error|internal.?error|connection.?error|connection.?refused|other side closed|fetch failed|upstream.?connect|reset before headers|terminated|retry delay|network.?(?:is\s+)?unavailable|credentials.*expired|temporarily backed off/i.test(
       err,
     );
   }
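
The one-character regex change can be checked behaviorally (a minimal sketch; only the
affected alternations are shown):

    const oldRe = /server error|internal error/i;
    const newRe = /server.?error|internal.?error/i;
    oldRe.test("Codex server_error: boom"); // false: a literal space never matches "_"
    newRe.test("Codex server_error: boom"); // true:  .? optionally matches any one character
    newRe.test("server error occurred");    // true:  the space form still matches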


@@ -100,6 +100,13 @@ let depthVerificationDone = false;
 // Cleared when a model switch occurs or retries are exhausted.
 const networkRetryCounters = new Map<string, number>();
+
+// ── Transient error escalation ───────────────────────────────────────────
+// Tracks consecutive transient auto-resume attempts. Each attempt doubles
+// the delay. After MAX_TRANSIENT_AUTO_RESUMES consecutive failures, auto-mode
+// pauses indefinitely to avoid infinite rapid-fire retries (#1166).
+const MAX_TRANSIENT_AUTO_RESUMES = 5;
+let consecutiveTransientErrors = 0;
 
 export function isDepthVerified(): boolean {
   return depthVerificationDone;
 }
@@ -866,11 +873,31 @@ export default function (pi: ExtensionAPI) {
       const explicitRetryAfterMs = ("retryAfterMs" in lastMsg && typeof lastMsg.retryAfterMs === "number")
         ? lastMsg.retryAfterMs
         : undefined;
-      const retryAfterMs = explicitRetryAfterMs ?? classification.suggestedDelayMs;
+      let retryAfterMs = explicitRetryAfterMs ?? classification.suggestedDelayMs;
+
+      // ── Escalating backoff for repeated transient errors ──────────────
+      // Each consecutive transient auto-resume doubles the delay. After
+      // MAX_TRANSIENT_AUTO_RESUMES consecutive failures, treat as permanent
+      // to avoid infinite rapid-fire retries (#1166).
+      let effectiveTransient = classification.isTransient;
+      if (classification.isTransient) {
+        consecutiveTransientErrors++;
+        if (consecutiveTransientErrors > MAX_TRANSIENT_AUTO_RESUMES) {
+          effectiveTransient = false;
+          ctx.ui.notify(
+            `${consecutiveTransientErrors} consecutive transient errors. Pausing indefinitely — resume manually with /gsd auto.`,
+            "error",
+          );
+          consecutiveTransientErrors = 0;
+        } else {
+          // Escalate: base delay × 2^(consecutive-1) → 30s, 60s, 120s, 240s, 480s
+          retryAfterMs = retryAfterMs * 2 ** (consecutiveTransientErrors - 1);
+        }
+      }
 
       await pauseAutoForProviderError(ctx.ui, errorDetail, () => pauseAuto(ctx, pi), {
         isRateLimit: classification.isRateLimit,
-        isTransient: classification.isTransient,
+        isTransient: effectiveTransient,
         retryAfterMs,
         resume: () => {
           pi.sendMessage(
@@ -884,6 +911,7 @@ export default function (pi: ExtensionAPI) {
 
       try {
         networkRetryCounters.clear(); // Clear network retry state on successful unit completion
+        consecutiveTransientErrors = 0; // Reset escalating backoff on success
         await handleAgentEnd(ctx, pi);
       } catch (err) {
         // Safety net: if handleAgentEnd throws despite its internal try-catch,
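
Distilled, the two hunks above implement this state machine (a minimal, self-contained
model; names are simplified from index.ts and the base delay is a parameter):

    const MAX_TRANSIENT_AUTO_RESUMES = 5;
    let consecutive = 0;

    // Returns the delay before the next auto-resume, or null to pause indefinitely.
    function nextRetryDelayMs(baseMs: number, isTransient: boolean): number | null {
      if (!isTransient) return null;                  // permanent errors always pause
      consecutive++;
      if (consecutive > MAX_TRANSIENT_AUTO_RESUMES) { // sixth consecutive failure: give up
        consecutive = 0;
        return null;
      }
      return baseMs * 2 ** (consecutive - 1);         // 30s, 60s, 120s, 240s, 480s at baseMs = 30_000
    }
    const onUnitSuccess = () => { consecutive = 0; }; // mirrors the reset in the try block above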


@@ -7,9 +7,14 @@
 
 import test from "node:test";
 import assert from "node:assert/strict";
+import { readFileSync } from "node:fs";
+import { join, dirname } from "node:path";
+import { fileURLToPath } from "node:url";
 import { classifyProviderError, pauseAutoForProviderError } from "../provider-error-pause.ts";
 import { getNextFallbackModel, isTransientNetworkError } from "../preferences.ts";
 
+const __dirname = dirname(fileURLToPath(import.meta.url));
+
 // ── classifyProviderError ────────────────────────────────────────────────────
 
 test("classifyProviderError detects rate limit from 429", () => {
@@ -45,6 +50,16 @@ test("classifyProviderError detects Anthropic internal server error", () => {
   assert.equal(result.suggestedDelayMs, 30_000);
 });
 
+test("classifyProviderError detects Codex server_error from extracted message", () => {
+  // After fix, mapCodexEvents extracts the nested error type and produces
+  // "Codex server_error: <message>" instead of raw JSON.
+  const msg = "Codex server_error: An error occurred while processing your request.";
+  const result = classifyProviderError(msg);
+  assert.ok(result.isTransient);
+  assert.ok(!result.isRateLimit);
+  assert.equal(result.suggestedDelayMs, 30_000);
+});
+
 test("classifyProviderError detects overloaded error", () => {
   const result = classifyProviderError("overloaded_error: Overloaded");
   assert.ok(result.isTransient);
@@ -243,3 +258,81 @@ test("pauseAutoForProviderError falls back to indefinite pause when not rate lim
     { message: "Auto-mode paused due to provider error: connection refused", level: "warning" },
   ]);
 });
+
+// ── Escalating backoff for transient errors (#1166) ─────────────────────────
+
+test("index.ts tracks consecutive transient errors for escalating backoff", () => {
+  const indexSource = readFileSync(join(__dirname, "..", "index.ts"), "utf-8");
+  assert.ok(
+    indexSource.includes("consecutiveTransientErrors"),
+    "index.ts must track consecutiveTransientErrors for escalating backoff (#1166)",
+  );
+  assert.ok(
+    indexSource.includes("MAX_TRANSIENT_AUTO_RESUMES"),
+    "index.ts must define MAX_TRANSIENT_AUTO_RESUMES to cap infinite retries (#1166)",
+  );
+});
+
+test("index.ts resets consecutive transient error counter on success", () => {
+  const indexSource = readFileSync(join(__dirname, "..", "index.ts"), "utf-8");
+  // After successful unit completion, the counter must be reset
+  const marker = "successful unit completion";
+  const successSection = indexSource.indexOf(marker);
+  assert.ok(successSection > -1, "must have success section that clears network retries");
+  const nearbyCode = indexSource.slice(Math.max(0, successSection - 100), successSection + 200);
+  assert.ok(
+    nearbyCode.includes("consecutiveTransientErrors = 0"),
+    "consecutive transient error counter must be reset on successful unit completion (#1166)",
+  );
+});
+
+test("index.ts applies escalating delay for repeated transient errors", () => {
+  const indexSource = readFileSync(join(__dirname, "..", "index.ts"), "utf-8");
+  // Must contain the exponential backoff formula
+  assert.ok(
+    /retryAfterMs\s*[=*].*2\s*\*\*/.test(indexSource),
+    "index.ts must escalate retryAfterMs exponentially for consecutive transient errors (#1166)",
+  );
+});
+
+// ── Codex error extraction (#1166) ──────────────────────────────────────────
+
+test("openai-codex-responses.ts extracts nested error fields", () => {
+  const codexSource = readFileSync(
+    join(__dirname, "../../../../../packages/pi-ai/src/providers/openai-codex-responses.ts"),
+    "utf-8",
+  );
+  // Must access event.error.message (nested), not just event.message (top-level)
+  assert.ok(
+    codexSource.includes("errorObj?.message"),
+    "mapCodexEvents must extract message from nested event.error object (#1166)",
+  );
+  assert.ok(
+    codexSource.includes("errorObj?.type"),
+    "mapCodexEvents must extract type from nested event.error object (#1166)",
+  );
+});
+
+// ── agent-session retryable regex handles server_error (#1166) ──────────────
+
+test("agent-session retryable error regex matches server_error (underscore)", () => {
+  // This regex is extracted from _isRetryableError in agent-session.ts.
+  // It must match both "server error" (space) and "server_error" (underscore)
+  // to properly classify Codex streaming errors as retryable.
+  const retryableRegex = /overloaded|rate.?limit|too many requests|429|500|502|503|504|service.?unavailable|server.?error|internal.?error|connection.?error|connection.?refused|other side closed|fetch failed|upstream.?connect|reset before headers|terminated|retry delay|network.?(?:is\s+)?unavailable|credentials.*expired|temporarily backed off/i;
+  // server_error (with underscore — Codex streaming error format)
+  assert.ok(retryableRegex.test("Codex server_error: An error occurred"));
+  // server error (with space — traditional HTTP error format)
+  assert.ok(retryableRegex.test("server error occurred"));
+  // internal_error (with underscore)
+  assert.ok(retryableRegex.test("internal_error: something went wrong"));
+  // internal error (with space)
+  assert.ok(retryableRegex.test("internal error"));
+  // non-retryable errors must not match
+  assert.ok(!retryableRegex.test("model not found"));
+});