fix(auto): add structured cooldown error and bounded retry budget

Address Codex adversarial review findings:

- Replace string-matched cooldown detection with typed
  CredentialCooldownError (code: AUTH_COOLDOWN, retryAfterMs)
- Add MAX_COOLDOWN_RETRIES (5) cap so cooldown retries can't spin for
  hours on persistent quota exhaustion
- Auto-loop uses retryAfterMs from structured error when available,
  falls back to 35s default
- Export CredentialCooldownError from pi-coding-agent package
- Retain regex fallback for cross-process error propagation

Closes #4052
This commit is contained in:
Jeremy 2026-04-12 09:16:05 -05:00
parent 1ae93e9822
commit d0afe018eb
4 changed files with 78 additions and 17 deletions

View file

@ -1,4 +1,24 @@
import { join } from "node:path";
/**
* Structured error thrown when all credentials for a provider are in a
* backoff window. Carries typed metadata so callers (e.g. the auto-loop)
* can make informed retry decisions instead of string-matching the message.
*/
export class CredentialCooldownError extends Error {
readonly code = "AUTH_COOLDOWN" as const;
/** Milliseconds until the earliest credential becomes available, or undefined if unknown. */
readonly retryAfterMs: number | undefined;
constructor(provider: string, retryAfterMs?: number) {
super(
`All credentials for "${provider}" are in a cooldown window. ` +
`Please wait a moment and try again, or switch to a different provider.`,
);
this.name = "CredentialCooldownError";
this.retryAfterMs = retryAfterMs;
}
}
import { Agent, type AgentMessage, type ThinkingLevel } from "@gsd/pi-agent-core";
import type { Message, Model } from "@gsd/pi-ai";
import { getAgentDir, getDocsPath } from "../config.js";
@ -408,10 +428,9 @@ export async function createAgentSession(options: CreateAgentSessionOptions = {}
// the retry handler and creating cascading error entries (#3429).
const hasAuth = modelRegistry.authStorage.hasAuth(resolvedProvider);
if (hasAuth) {
throw new Error(
`All credentials for "${resolvedProvider}" are in a cooldown window. ` +
`Please wait a moment and try again, or switch to a different provider.`,
);
const expiry = modelRegistry.authStorage.getEarliestBackoffExpiry(resolvedProvider);
const retryAfterMs = expiry !== undefined ? Math.max(0, expiry - Date.now()) : undefined;
throw new CredentialCooldownError(resolvedProvider, retryAfterMs);
}
const model = agent.state.model;
const isOAuth = model && modelRegistry.isUsingOAuth(model);
@ -419,10 +438,9 @@ export async function createAgentSession(options: CreateAgentSessionOptions = {}
// If credentials exist but are all in a backoff window (quota / rate-limit),
// surface a specific message instead of the misleading "Authentication failed".
if (modelRegistry.authStorage.areAllCredentialsBackedOff(resolvedProvider)) {
throw new Error(
`All credentials for "${resolvedProvider}" are in a cooldown window. ` +
`Please wait a moment and try again, or switch to a different provider.`,
);
const expiry = modelRegistry.authStorage.getEarliestBackoffExpiry(resolvedProvider);
const retryAfterMs = expiry !== undefined ? Math.max(0, expiry - Date.now()) : undefined;
throw new CredentialCooldownError(resolvedProvider, retryAfterMs);
}
throw new Error(
`Authentication failed for "${resolvedProvider}". ` +

View file

@ -176,6 +176,7 @@ export { DefaultResourceLoader } from "./core/resource-loader.js";
export {
type CreateAgentSessionOptions,
type CreateAgentSessionResult,
CredentialCooldownError,
// Factory
createAgentSession,
createBashTool,

View file

@ -54,15 +54,33 @@ export function isInfrastructureError(err: unknown): string | null {
*/
export const COOLDOWN_FALLBACK_WAIT_MS = 35_000; // 35s — slightly longer than the 30s rate-limit backoff
/** Maximum consecutive cooldown retries before the auto-loop gives up. */
export const MAX_COOLDOWN_RETRIES = 5;
/**
* Detect whether an error is a transient credential cooldown that should
* be waited out rather than counted as a consecutive failure.
*
* These errors are generated by getApiKey() in sdk.ts when all credentials
* for a provider are in a backoff window (typically after a 429). The
* auto-loop should pause and retry instead of escalating to hard stop.
* Prefers the structured `CredentialCooldownError` (code: AUTH_COOLDOWN)
* thrown by sdk.ts. Falls back to message matching for errors that
* propagated across process boundaries without the typed class.
*/
export function isTransientCooldownError(err: unknown): boolean {
if (err && typeof err === "object" && (err as Record<string, unknown>).code === "AUTH_COOLDOWN") {
return true;
}
// Fallback: message match for cross-process error propagation
const msg = err instanceof Error ? err.message : String(err);
return /in a cooldown window/i.test(msg);
}
/**
* Extract retryAfterMs from a CredentialCooldownError, if available.
* Returns undefined for unstructured errors or when no retry hint exists.
*/
export function getCooldownRetryAfterMs(err: unknown): number | undefined {
if (err && typeof err === "object" && (err as Record<string, unknown>).code === "AUTH_COOLDOWN") {
return (err as Record<string, unknown>).retryAfterMs as number | undefined;
}
return undefined;
}

View file

@ -27,7 +27,7 @@ import {
runFinalize,
} from "./phases.js";
import { debugLog } from "../debug-logger.js";
import { isInfrastructureError, isTransientCooldownError, COOLDOWN_FALLBACK_WAIT_MS } from "./infra-errors.js";
import { isInfrastructureError, isTransientCooldownError, getCooldownRetryAfterMs, COOLDOWN_FALLBACK_WAIT_MS, MAX_COOLDOWN_RETRIES } from "./infra-errors.js";
import { resolveEngine } from "../engine-resolver.js";
/**
@ -48,6 +48,7 @@ export async function autoLoop(
let iteration = 0;
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 };
let consecutiveErrors = 0;
let consecutiveCooldowns = 0;
const recentErrorMessages: string[] = [];
while (s.active) {
@ -203,6 +204,7 @@ export async function autoLoop(
deps.clearUnitTimeout();
consecutiveErrors = 0;
consecutiveCooldowns = 0;
recentErrorMessages.length = 0;
deps.emitJournalEvent({ ts: new Date().toISOString(), flowId, seq: nextSeq(), eventType: "iteration-end", data: { iteration } });
debugLog("autoLoop", { phase: "iteration-complete", iteration });
@ -265,6 +267,7 @@ export async function autoLoop(
if (finalizeResult.action === "continue") continue;
consecutiveErrors = 0; // Iteration completed successfully
consecutiveCooldowns = 0;
recentErrorMessages.length = 0;
deps.emitJournalEvent({ ts: new Date().toISOString(), flowId, seq: nextSeq(), eventType: "iteration-end", data: { iteration } });
debugLog("autoLoop", { phase: "iteration-complete", iteration });
@ -300,23 +303,44 @@ export async function autoLoop(
break;
}
// ── Credential cooldown: wait and retry without burning error budget ──
// ── Credential cooldown: wait and retry with bounded budget ──
// A 429 triggers a 30s credential backoff in AuthStorage. If the SDK's
// getApiKey() retries couldn't outlast the window, the error surfaces
// here. Wait for the cooldown to clear rather than counting it as a
// consecutive failure — 3 fast cooldown errors would otherwise kill
// the auto session unnecessarily.
// consecutive failure — but cap retries so we don't spin for hours
// on persistent quota exhaustion.
if (isTransientCooldownError(loopErr)) {
consecutiveCooldowns++;
const retryAfterMs = getCooldownRetryAfterMs(loopErr);
debugLog("autoLoop", {
phase: "cooldown-wait",
iteration,
consecutiveCooldowns,
retryAfterMs,
error: msg,
});
if (consecutiveCooldowns > MAX_COOLDOWN_RETRIES) {
ctx.ui.notify(
`Auto-mode stopped: ${consecutiveCooldowns} consecutive credential cooldowns — rate limit or quota may be persistently exhausted.`,
"error",
);
await deps.stopAuto(
ctx,
pi,
`${consecutiveCooldowns} consecutive credential cooldowns exceeded retry budget`,
);
break;
}
const waitMs = (retryAfterMs !== undefined && retryAfterMs > 0 && retryAfterMs <= 60_000)
? retryAfterMs + 500 // Use structured hint + small buffer
: COOLDOWN_FALLBACK_WAIT_MS;
ctx.ui.notify(
`Credentials in cooldown — waiting for rate limit to clear before retrying.`,
`Credentials in cooldown (${consecutiveCooldowns}/${MAX_COOLDOWN_RETRIES}) — waiting ${Math.round(waitMs / 1000)}s before retrying.`,
"warning",
);
await new Promise(resolve => setTimeout(resolve, COOLDOWN_FALLBACK_WAIT_MS));
await new Promise(resolve => setTimeout(resolve, waitMs));
continue; // Retry iteration without incrementing consecutiveErrors
}