Fix Codex server_error handling: extraction, retry matching, escalating backoff (#1106)
* Initial plan
* Fix OpenAI Codex error handling: proper error extraction, retry matching, and escalating backoff
  - Fix mapCodexEvents to extract error details from nested event.error object
    (Codex API returns {type:"error", error:{type:"server_error", message:"..."}})
  - Fix _isRetryableError regex: server.?error matches both server_error and server error
  - Add escalating backoff for repeated transient auto-resumes (30s→60s→120s→240s→480s)
  - Cap consecutive transient auto-resumes at 5 before pausing indefinitely
  - Reset counter on successful unit completion
Co-authored-by: glittercowboy <186001655+glittercowboy@users.noreply.github.com>
* Improve regex test to be behavioral instead of structural
Co-authored-by: glittercowboy <186001655+glittercowboy@users.noreply.github.com>
---------
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: glittercowboy <186001655+glittercowboy@users.noreply.github.com>
parent 51da6c4f74
commit 1020c140af
4 changed files with 132 additions and 6 deletions
@@ -368,9 +368,14 @@ async function* mapCodexEvents(events: AsyncIterable<Record<string, unknown>>):
     if (!type) continue;

     if (type === "error") {
-      const code = (event as { code?: string }).code || "";
-      const message = (event as { message?: string }).message || "";
-      throw new Error(`Codex error: ${message || code || JSON.stringify(event)}`);
+      // Codex error events nest details under event.error (e.g.
+      // { type: "error", error: { type: "server_error", code: "server_error", message: "..." } })
+      const errorObj = (event as { error?: { code?: string; type?: string; message?: string } }).error;
+      const code = errorObj?.code || (event as { code?: string }).code || "";
+      const errorType = errorObj?.type || "";
+      const message = errorObj?.message || (event as { message?: string }).message || "";
+      const prefix = errorType ? `Codex ${errorType}` : "Codex error";
+      throw new Error(`${prefix}: ${message || code || JSON.stringify(event)}`);
     }

     if (type === "response.failed") {
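
With the nested extraction in place, a server_error event surfaces as a readable message instead of raw JSON. A minimal sketch of the behavior, assuming the event shape shown in the comment above; formatCodexError is a hypothetical standalone helper for illustration (the real code throws inline inside mapCodexEvents):

// Hypothetical standalone version of the extraction above, for illustration only.
type CodexErrorEvent = {
  type: "error";
  code?: string;
  message?: string;
  error?: { code?: string; type?: string; message?: string };
};

function formatCodexError(event: CodexErrorEvent): string {
  const errorObj = event.error;
  const code = errorObj?.code || event.code || "";
  const errorType = errorObj?.type || "";
  const message = errorObj?.message || event.message || "";
  const prefix = errorType ? `Codex ${errorType}` : "Codex error";
  return `${prefix}: ${message || code || JSON.stringify(event)}`;
}

// formatCodexError({ type: "error", error: { type: "server_error", message: "An error occurred" } })
// → "Codex server_error: An error occurred"
// The old code read only the top-level code/message fields, which are absent on
// nested events, so the same event fell through to the raw JSON.stringify fallback.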
@@ -2427,7 +2427,7 @@ export class AgentSession {

     const err = message.errorMessage;
     // Match: overloaded_error, rate limit, 429, 500, 502, 503, 504, service unavailable, connection errors, fetch failed, terminated, retry delay exceeded, network unavailable / auth expired (transient network failures)
-    return /overloaded|rate.?limit|too many requests|429|500|502|503|504|service.?unavailable|server error|internal error|connection.?error|connection.?refused|other side closed|fetch failed|upstream.?connect|reset before headers|terminated|retry delay|network.?(?:is\s+)?unavailable|credentials.*expired|temporarily backed off/i.test(
+    return /overloaded|rate.?limit|too many requests|429|500|502|503|504|service.?unavailable|server.?error|internal.?error|connection.?error|connection.?refused|other side closed|fetch failed|upstream.?connect|reset before headers|terminated|retry delay|network.?(?:is\s+)?unavailable|credentials.*expired|temporarily backed off/i.test(
       err,
     );
   }
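
The only change here is server error → server.?error and internal error → internal.?error: the .? matches zero or one of any character, so the pattern now covers both the spaced HTTP phrasing and the underscore form Codex streaming errors use. A quick behavioral check, trimmed to just the two alternatives that changed (the full pattern is shown in the diff above):

// Trimmed excerpt of the retryable-error regex, for illustration.
const retryable = /server.?error|internal.?error/i;
console.log(retryable.test("Codex server_error: An error occurred")); // true
console.log(retryable.test("500 internal server error"));             // true
console.log(retryable.test("internal_error: something went wrong"));  // true
console.log(retryable.test("model not found"));                       // false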
@@ -100,6 +100,13 @@ let depthVerificationDone = false;
 // Cleared when a model switch occurs or retries are exhausted.
 const networkRetryCounters = new Map<string, number>();
+
+// ── Transient error escalation ───────────────────────────────────────────
+// Tracks consecutive transient auto-resume attempts. Each attempt doubles
+// the delay. After MAX_TRANSIENT_AUTO_RESUMES consecutive failures, auto-mode
+// pauses indefinitely to avoid infinite rapid-fire retries (#1166).
+const MAX_TRANSIENT_AUTO_RESUMES = 5;
+let consecutiveTransientErrors = 0;

 export function isDepthVerified(): boolean {
   return depthVerificationDone;
 }
@@ -866,11 +873,31 @@ export default function (pi: ExtensionAPI) {
       const explicitRetryAfterMs = ("retryAfterMs" in lastMsg && typeof lastMsg.retryAfterMs === "number")
         ? lastMsg.retryAfterMs
         : undefined;
-      const retryAfterMs = explicitRetryAfterMs ?? classification.suggestedDelayMs;
+      let retryAfterMs = explicitRetryAfterMs ?? classification.suggestedDelayMs;
+
+      // ── Escalating backoff for repeated transient errors ──────────────
+      // Each consecutive transient auto-resume doubles the delay. After
+      // MAX_TRANSIENT_AUTO_RESUMES consecutive failures, treat as permanent
+      // to avoid infinite rapid-fire retries (#1166).
+      let effectiveTransient = classification.isTransient;
+      if (classification.isTransient) {
+        consecutiveTransientErrors++;
+        if (consecutiveTransientErrors > MAX_TRANSIENT_AUTO_RESUMES) {
+          effectiveTransient = false;
+          ctx.ui.notify(
+            `${consecutiveTransientErrors} consecutive transient errors. Pausing indefinitely — resume manually with /gsd auto.`,
+            "error",
+          );
+          consecutiveTransientErrors = 0;
+        } else {
+          // Escalate: base delay × 2^(consecutive-1) → 30s, 60s, 120s, 240s, 480s
+          retryAfterMs = retryAfterMs * 2 ** (consecutiveTransientErrors - 1);
+        }
+      }

       await pauseAutoForProviderError(ctx.ui, errorDetail, () => pauseAuto(ctx, pi), {
         isRateLimit: classification.isRateLimit,
-        isTransient: classification.isTransient,
+        isTransient: effectiveTransient,
         retryAfterMs,
         resume: () => {
           pi.sendMessage(
@@ -884,6 +911,7 @@ export default function (pi: ExtensionAPI) {

       try {
         networkRetryCounters.clear(); // Clear network retry state on successful unit completion
+        consecutiveTransientErrors = 0; // Reset escalating backoff on success
         await handleAgentEnd(ctx, pi);
       } catch (err) {
         // Safety net: if handleAgentEnd throws despite its internal try-catch,
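
Taken together, the two index.ts hunks give consecutive transient errors a doubling delay schedule with a hard cap, plus a reset on success. A small sketch of the arithmetic, assuming the 30s base delay that classifyProviderError suggests for server errors (per the tests below):

// Delay before the n-th consecutive transient auto-resume (1-based):
//   30_000 * 2 ** (n - 1) → 30s, 60s, 120s, 240s, 480s for n = 1..5.
// A 6th consecutive error exceeds MAX_TRANSIENT_AUTO_RESUMES (5), so auto-mode
// pauses indefinitely instead of retrying again.
const BASE_DELAY_MS = 30_000;
for (let n = 1; n <= 5; n++) {
  console.log(`attempt ${n}: ${(BASE_DELAY_MS * 2 ** (n - 1)) / 1_000}s`);
}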
@@ -7,9 +7,14 @@

 import test from "node:test";
 import assert from "node:assert/strict";
+import { readFileSync } from "node:fs";
+import { join, dirname } from "node:path";
+import { fileURLToPath } from "node:url";
 import { classifyProviderError, pauseAutoForProviderError } from "../provider-error-pause.ts";
 import { getNextFallbackModel, isTransientNetworkError } from "../preferences.ts";

+const __dirname = dirname(fileURLToPath(import.meta.url));
+
 // ── classifyProviderError ────────────────────────────────────────────────────

 test("classifyProviderError detects rate limit from 429", () => {
@@ -45,6 +50,16 @@ test("classifyProviderError detects Anthropic internal server error", () => {
   assert.equal(result.suggestedDelayMs, 30_000);
 });

+test("classifyProviderError detects Codex server_error from extracted message", () => {
+  // After fix, mapCodexEvents extracts the nested error type and produces
+  // "Codex server_error: <message>" instead of raw JSON.
+  const msg = "Codex server_error: An error occurred while processing your request.";
+  const result = classifyProviderError(msg);
+  assert.ok(result.isTransient);
+  assert.ok(!result.isRateLimit);
+  assert.equal(result.suggestedDelayMs, 30_000);
+});
+
 test("classifyProviderError detects overloaded error", () => {
   const result = classifyProviderError("overloaded_error: Overloaded");
   assert.ok(result.isTransient);
@@ -243,3 +258,81 @@ test("pauseAutoForProviderError falls back to indefinite pause when not rate lim
     { message: "Auto-mode paused due to provider error: connection refused", level: "warning" },
   ]);
 });
+
+// ── Escalating backoff for transient errors (#1166) ─────────────────────────
+
+test("index.ts tracks consecutive transient errors for escalating backoff", () => {
+  const indexSource = readFileSync(join(__dirname, "..", "index.ts"), "utf-8");
+
+  assert.ok(
+    indexSource.includes("consecutiveTransientErrors"),
+    "index.ts must track consecutiveTransientErrors for escalating backoff (#1166)",
+  );
+  assert.ok(
+    indexSource.includes("MAX_TRANSIENT_AUTO_RESUMES"),
+    "index.ts must define MAX_TRANSIENT_AUTO_RESUMES to cap infinite retries (#1166)",
+  );
+});
+
+test("index.ts resets consecutive transient error counter on success", () => {
+  const indexSource = readFileSync(join(__dirname, "..", "index.ts"), "utf-8");
+
+  // After successful unit completion, the counter must be reset
+  const marker = "successful unit completion";
+  const successSection = indexSource.indexOf(marker);
+  assert.ok(successSection > -1, "must have success section that clears network retries");
+  const nearbyCode = indexSource.slice(Math.max(0, successSection - 100), successSection + 200);
+  assert.ok(
+    nearbyCode.includes("consecutiveTransientErrors = 0"),
+    "consecutive transient error counter must be reset on successful unit completion (#1166)",
+  );
+});
+
+test("index.ts applies escalating delay for repeated transient errors", () => {
+  const indexSource = readFileSync(join(__dirname, "..", "index.ts"), "utf-8");
+
+  // Must contain the exponential backoff formula
+  assert.ok(
+    /retryAfterMs\s*[=*].*2\s*\*\*/.test(indexSource),
+    "index.ts must escalate retryAfterMs exponentially for consecutive transient errors (#1166)",
+  );
+});
+
+// ── Codex error extraction (#1166) ──────────────────────────────────────────
+
+test("openai-codex-responses.ts extracts nested error fields", () => {
+  const codexSource = readFileSync(
+    join(__dirname, "../../../../../packages/pi-ai/src/providers/openai-codex-responses.ts"),
+    "utf-8",
+  );
+
+  // Must access event.error.message (nested), not just event.message (top-level)
+  assert.ok(
+    codexSource.includes("errorObj?.message"),
+    "mapCodexEvents must extract message from nested event.error object (#1166)",
+  );
+  assert.ok(
+    codexSource.includes("errorObj?.type"),
+    "mapCodexEvents must extract type from nested event.error object (#1166)",
+  );
+});
+
+// ── agent-session retryable regex handles server_error (#1166) ──────────────
+
+test("agent-session retryable error regex matches server_error (underscore)", () => {
+  // This regex is extracted from _isRetryableError in agent-session.ts.
+  // It must match both "server error" (space) and "server_error" (underscore)
+  // to properly classify Codex streaming errors as retryable.
+  const retryableRegex = /overloaded|rate.?limit|too many requests|429|500|502|503|504|service.?unavailable|server.?error|internal.?error|connection.?error|connection.?refused|other side closed|fetch failed|upstream.?connect|reset before headers|terminated|retry delay|network.?(?:is\s+)?unavailable|credentials.*expired|temporarily backed off/i;
+
+  // server_error (with underscore — Codex streaming error format)
+  assert.ok(retryableRegex.test("Codex server_error: An error occurred"));
+  // server error (with space — traditional HTTP error format)
+  assert.ok(retryableRegex.test("server error occurred"));
+  // internal_error (with underscore)
+  assert.ok(retryableRegex.test("internal_error: something went wrong"));
+  // internal error (with space)
+  assert.ok(retryableRegex.test("internal error"));
+  // non-retryable errors must not match
+  assert.ok(!retryableRegex.test("model not found"));
+});