fix(auto): add structured cooldown error and bounded retry budget
Address Codex adversarial review findings: - Replace string-matched cooldown detection with typed CredentialCooldownError (code: AUTH_COOLDOWN, retryAfterMs) - Add MAX_COOLDOWN_RETRIES (5) cap so cooldown retries can't spin for hours on persistent quota exhaustion - Auto-loop uses retryAfterMs from structured error when available, falls back to 35s default - Export CredentialCooldownError from pi-coding-agent package - Retain regex fallback for cross-process error propagation Closes #4052
This commit is contained in:
parent
1ae93e9822
commit
d0afe018eb
4 changed files with 78 additions and 17 deletions
|
|
@ -1,4 +1,24 @@
|
|||
import { join } from "node:path";
|
||||
|
||||
/**
|
||||
* Structured error thrown when all credentials for a provider are in a
|
||||
* backoff window. Carries typed metadata so callers (e.g. the auto-loop)
|
||||
* can make informed retry decisions instead of string-matching the message.
|
||||
*/
|
||||
export class CredentialCooldownError extends Error {
|
||||
readonly code = "AUTH_COOLDOWN" as const;
|
||||
/** Milliseconds until the earliest credential becomes available, or undefined if unknown. */
|
||||
readonly retryAfterMs: number | undefined;
|
||||
|
||||
constructor(provider: string, retryAfterMs?: number) {
|
||||
super(
|
||||
`All credentials for "${provider}" are in a cooldown window. ` +
|
||||
`Please wait a moment and try again, or switch to a different provider.`,
|
||||
);
|
||||
this.name = "CredentialCooldownError";
|
||||
this.retryAfterMs = retryAfterMs;
|
||||
}
|
||||
}
|
||||
import { Agent, type AgentMessage, type ThinkingLevel } from "@gsd/pi-agent-core";
|
||||
import type { Message, Model } from "@gsd/pi-ai";
|
||||
import { getAgentDir, getDocsPath } from "../config.js";
|
||||
|
|
@ -408,10 +428,9 @@ export async function createAgentSession(options: CreateAgentSessionOptions = {}
|
|||
// the retry handler and creating cascading error entries (#3429).
|
||||
const hasAuth = modelRegistry.authStorage.hasAuth(resolvedProvider);
|
||||
if (hasAuth) {
|
||||
throw new Error(
|
||||
`All credentials for "${resolvedProvider}" are in a cooldown window. ` +
|
||||
`Please wait a moment and try again, or switch to a different provider.`,
|
||||
);
|
||||
const expiry = modelRegistry.authStorage.getEarliestBackoffExpiry(resolvedProvider);
|
||||
const retryAfterMs = expiry !== undefined ? Math.max(0, expiry - Date.now()) : undefined;
|
||||
throw new CredentialCooldownError(resolvedProvider, retryAfterMs);
|
||||
}
|
||||
const model = agent.state.model;
|
||||
const isOAuth = model && modelRegistry.isUsingOAuth(model);
|
||||
|
|
@ -419,10 +438,9 @@ export async function createAgentSession(options: CreateAgentSessionOptions = {}
|
|||
// If credentials exist but are all in a backoff window (quota / rate-limit),
|
||||
// surface a specific message instead of the misleading "Authentication failed".
|
||||
if (modelRegistry.authStorage.areAllCredentialsBackedOff(resolvedProvider)) {
|
||||
throw new Error(
|
||||
`All credentials for "${resolvedProvider}" are in a cooldown window. ` +
|
||||
`Please wait a moment and try again, or switch to a different provider.`,
|
||||
);
|
||||
const expiry = modelRegistry.authStorage.getEarliestBackoffExpiry(resolvedProvider);
|
||||
const retryAfterMs = expiry !== undefined ? Math.max(0, expiry - Date.now()) : undefined;
|
||||
throw new CredentialCooldownError(resolvedProvider, retryAfterMs);
|
||||
}
|
||||
throw new Error(
|
||||
`Authentication failed for "${resolvedProvider}". ` +
|
||||
|
|
|
|||
|
|
@ -176,6 +176,7 @@ export { DefaultResourceLoader } from "./core/resource-loader.js";
|
|||
export {
|
||||
type CreateAgentSessionOptions,
|
||||
type CreateAgentSessionResult,
|
||||
CredentialCooldownError,
|
||||
// Factory
|
||||
createAgentSession,
|
||||
createBashTool,
|
||||
|
|
|
|||
|
|
@ -54,15 +54,33 @@ export function isInfrastructureError(err: unknown): string | null {
|
|||
*/
|
||||
export const COOLDOWN_FALLBACK_WAIT_MS = 35_000; // 35s — slightly longer than the 30s rate-limit backoff
|
||||
|
||||
/** Maximum consecutive cooldown retries before the auto-loop gives up. */
|
||||
export const MAX_COOLDOWN_RETRIES = 5;
|
||||
|
||||
/**
|
||||
* Detect whether an error is a transient credential cooldown that should
|
||||
* be waited out rather than counted as a consecutive failure.
|
||||
*
|
||||
* These errors are generated by getApiKey() in sdk.ts when all credentials
|
||||
* for a provider are in a backoff window (typically after a 429). The
|
||||
* auto-loop should pause and retry instead of escalating to hard stop.
|
||||
* Prefers the structured `CredentialCooldownError` (code: AUTH_COOLDOWN)
|
||||
* thrown by sdk.ts. Falls back to message matching for errors that
|
||||
* propagated across process boundaries without the typed class.
|
||||
*/
|
||||
export function isTransientCooldownError(err: unknown): boolean {
|
||||
if (err && typeof err === "object" && (err as Record<string, unknown>).code === "AUTH_COOLDOWN") {
|
||||
return true;
|
||||
}
|
||||
// Fallback: message match for cross-process error propagation
|
||||
const msg = err instanceof Error ? err.message : String(err);
|
||||
return /in a cooldown window/i.test(msg);
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract retryAfterMs from a CredentialCooldownError, if available.
|
||||
* Returns undefined for unstructured errors or when no retry hint exists.
|
||||
*/
|
||||
export function getCooldownRetryAfterMs(err: unknown): number | undefined {
|
||||
if (err && typeof err === "object" && (err as Record<string, unknown>).code === "AUTH_COOLDOWN") {
|
||||
return (err as Record<string, unknown>).retryAfterMs as number | undefined;
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ import {
|
|||
runFinalize,
|
||||
} from "./phases.js";
|
||||
import { debugLog } from "../debug-logger.js";
|
||||
import { isInfrastructureError, isTransientCooldownError, COOLDOWN_FALLBACK_WAIT_MS } from "./infra-errors.js";
|
||||
import { isInfrastructureError, isTransientCooldownError, getCooldownRetryAfterMs, COOLDOWN_FALLBACK_WAIT_MS, MAX_COOLDOWN_RETRIES } from "./infra-errors.js";
|
||||
import { resolveEngine } from "../engine-resolver.js";
|
||||
|
||||
/**
|
||||
|
|
@ -48,6 +48,7 @@ export async function autoLoop(
|
|||
let iteration = 0;
|
||||
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 };
|
||||
let consecutiveErrors = 0;
|
||||
let consecutiveCooldowns = 0;
|
||||
const recentErrorMessages: string[] = [];
|
||||
|
||||
while (s.active) {
|
||||
|
|
@ -203,6 +204,7 @@ export async function autoLoop(
|
|||
|
||||
deps.clearUnitTimeout();
|
||||
consecutiveErrors = 0;
|
||||
consecutiveCooldowns = 0;
|
||||
recentErrorMessages.length = 0;
|
||||
deps.emitJournalEvent({ ts: new Date().toISOString(), flowId, seq: nextSeq(), eventType: "iteration-end", data: { iteration } });
|
||||
debugLog("autoLoop", { phase: "iteration-complete", iteration });
|
||||
|
|
@ -265,6 +267,7 @@ export async function autoLoop(
|
|||
if (finalizeResult.action === "continue") continue;
|
||||
|
||||
consecutiveErrors = 0; // Iteration completed successfully
|
||||
consecutiveCooldowns = 0;
|
||||
recentErrorMessages.length = 0;
|
||||
deps.emitJournalEvent({ ts: new Date().toISOString(), flowId, seq: nextSeq(), eventType: "iteration-end", data: { iteration } });
|
||||
debugLog("autoLoop", { phase: "iteration-complete", iteration });
|
||||
|
|
@ -300,23 +303,44 @@ export async function autoLoop(
|
|||
break;
|
||||
}
|
||||
|
||||
// ── Credential cooldown: wait and retry without burning error budget ──
|
||||
// ── Credential cooldown: wait and retry with bounded budget ──
|
||||
// A 429 triggers a 30s credential backoff in AuthStorage. If the SDK's
|
||||
// getApiKey() retries couldn't outlast the window, the error surfaces
|
||||
// here. Wait for the cooldown to clear rather than counting it as a
|
||||
// consecutive failure — 3 fast cooldown errors would otherwise kill
|
||||
// the auto session unnecessarily.
|
||||
// consecutive failure — but cap retries so we don't spin for hours
|
||||
// on persistent quota exhaustion.
|
||||
if (isTransientCooldownError(loopErr)) {
|
||||
consecutiveCooldowns++;
|
||||
const retryAfterMs = getCooldownRetryAfterMs(loopErr);
|
||||
debugLog("autoLoop", {
|
||||
phase: "cooldown-wait",
|
||||
iteration,
|
||||
consecutiveCooldowns,
|
||||
retryAfterMs,
|
||||
error: msg,
|
||||
});
|
||||
|
||||
if (consecutiveCooldowns > MAX_COOLDOWN_RETRIES) {
|
||||
ctx.ui.notify(
|
||||
`Auto-mode stopped: ${consecutiveCooldowns} consecutive credential cooldowns — rate limit or quota may be persistently exhausted.`,
|
||||
"error",
|
||||
);
|
||||
await deps.stopAuto(
|
||||
ctx,
|
||||
pi,
|
||||
`${consecutiveCooldowns} consecutive credential cooldowns exceeded retry budget`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
const waitMs = (retryAfterMs !== undefined && retryAfterMs > 0 && retryAfterMs <= 60_000)
|
||||
? retryAfterMs + 500 // Use structured hint + small buffer
|
||||
: COOLDOWN_FALLBACK_WAIT_MS;
|
||||
ctx.ui.notify(
|
||||
`Credentials in cooldown — waiting for rate limit to clear before retrying.`,
|
||||
`Credentials in cooldown (${consecutiveCooldowns}/${MAX_COOLDOWN_RETRIES}) — waiting ${Math.round(waitMs / 1000)}s before retrying.`,
|
||||
"warning",
|
||||
);
|
||||
await new Promise(resolve => setTimeout(resolve, COOLDOWN_FALLBACK_WAIT_MS));
|
||||
await new Promise(resolve => setTimeout(resolve, waitMs));
|
||||
continue; // Retry iteration without incrementing consecutiveErrors
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue