From d0afe018eb6d353aaf4452fcd99b83476213074c Mon Sep 17 00:00:00 2001 From: Jeremy Date: Sun, 12 Apr 2026 09:16:05 -0500 Subject: [PATCH] fix(auto): add structured cooldown error and bounded retry budget Address Codex adversarial review findings: - Replace string-matched cooldown detection with typed CredentialCooldownError (code: AUTH_COOLDOWN, retryAfterMs) - Add MAX_COOLDOWN_RETRIES (5) cap so cooldown retries can't spin for hours on persistent quota exhaustion - Auto-loop uses retryAfterMs from structured error when available, falls back to 35s default - Export CredentialCooldownError from pi-coding-agent package - Retain regex fallback for cross-process error propagation Closes #4052 --- packages/pi-coding-agent/src/core/sdk.ts | 34 +++++++++++++----- packages/pi-coding-agent/src/index.ts | 1 + .../extensions/gsd/auto/infra-errors.ts | 24 +++++++++++-- src/resources/extensions/gsd/auto/loop.ts | 36 +++++++++++++++---- 4 files changed, 78 insertions(+), 17 deletions(-) diff --git a/packages/pi-coding-agent/src/core/sdk.ts b/packages/pi-coding-agent/src/core/sdk.ts index bcf2f27b7..12da38d16 100644 --- a/packages/pi-coding-agent/src/core/sdk.ts +++ b/packages/pi-coding-agent/src/core/sdk.ts @@ -1,4 +1,24 @@ import { join } from "node:path"; + +/** + * Structured error thrown when all credentials for a provider are in a + * backoff window. Carries typed metadata so callers (e.g. the auto-loop) + * can make informed retry decisions instead of string-matching the message. + */ +export class CredentialCooldownError extends Error { + readonly code = "AUTH_COOLDOWN" as const; + /** Milliseconds until the earliest credential becomes available, or undefined if unknown. */ + readonly retryAfterMs: number | undefined; + + constructor(provider: string, retryAfterMs?: number) { + super( + `All credentials for "${provider}" are in a cooldown window. ` + + `Please wait a moment and try again, or switch to a different provider.`, + ); + this.name = "CredentialCooldownError"; + this.retryAfterMs = retryAfterMs; + } +} import { Agent, type AgentMessage, type ThinkingLevel } from "@gsd/pi-agent-core"; import type { Message, Model } from "@gsd/pi-ai"; import { getAgentDir, getDocsPath } from "../config.js"; @@ -408,10 +428,9 @@ export async function createAgentSession(options: CreateAgentSessionOptions = {} // the retry handler and creating cascading error entries (#3429). const hasAuth = modelRegistry.authStorage.hasAuth(resolvedProvider); if (hasAuth) { - throw new Error( - `All credentials for "${resolvedProvider}" are in a cooldown window. ` + - `Please wait a moment and try again, or switch to a different provider.`, - ); + const expiry = modelRegistry.authStorage.getEarliestBackoffExpiry(resolvedProvider); + const retryAfterMs = expiry !== undefined ? Math.max(0, expiry - Date.now()) : undefined; + throw new CredentialCooldownError(resolvedProvider, retryAfterMs); } const model = agent.state.model; const isOAuth = model && modelRegistry.isUsingOAuth(model); @@ -419,10 +438,9 @@ export async function createAgentSession(options: CreateAgentSessionOptions = {} // If credentials exist but are all in a backoff window (quota / rate-limit), // surface a specific message instead of the misleading "Authentication failed". if (modelRegistry.authStorage.areAllCredentialsBackedOff(resolvedProvider)) { - throw new Error( - `All credentials for "${resolvedProvider}" are in a cooldown window. ` + - `Please wait a moment and try again, or switch to a different provider.`, - ); + const expiry = modelRegistry.authStorage.getEarliestBackoffExpiry(resolvedProvider); + const retryAfterMs = expiry !== undefined ? Math.max(0, expiry - Date.now()) : undefined; + throw new CredentialCooldownError(resolvedProvider, retryAfterMs); } throw new Error( `Authentication failed for "${resolvedProvider}". ` + diff --git a/packages/pi-coding-agent/src/index.ts b/packages/pi-coding-agent/src/index.ts index ab7de8bac..54a20b846 100644 --- a/packages/pi-coding-agent/src/index.ts +++ b/packages/pi-coding-agent/src/index.ts @@ -176,6 +176,7 @@ export { DefaultResourceLoader } from "./core/resource-loader.js"; export { type CreateAgentSessionOptions, type CreateAgentSessionResult, + CredentialCooldownError, // Factory createAgentSession, createBashTool, diff --git a/src/resources/extensions/gsd/auto/infra-errors.ts b/src/resources/extensions/gsd/auto/infra-errors.ts index 5953066b6..d0132724c 100644 --- a/src/resources/extensions/gsd/auto/infra-errors.ts +++ b/src/resources/extensions/gsd/auto/infra-errors.ts @@ -54,15 +54,33 @@ export function isInfrastructureError(err: unknown): string | null { */ export const COOLDOWN_FALLBACK_WAIT_MS = 35_000; // 35s — slightly longer than the 30s rate-limit backoff +/** Maximum consecutive cooldown retries before the auto-loop gives up. */ +export const MAX_COOLDOWN_RETRIES = 5; + /** * Detect whether an error is a transient credential cooldown that should * be waited out rather than counted as a consecutive failure. * - * These errors are generated by getApiKey() in sdk.ts when all credentials - * for a provider are in a backoff window (typically after a 429). The - * auto-loop should pause and retry instead of escalating to hard stop. + * Prefers the structured `CredentialCooldownError` (code: AUTH_COOLDOWN) + * thrown by sdk.ts. Falls back to message matching for errors that + * propagated across process boundaries without the typed class. */ export function isTransientCooldownError(err: unknown): boolean { + if (err && typeof err === "object" && (err as Record).code === "AUTH_COOLDOWN") { + return true; + } + // Fallback: message match for cross-process error propagation const msg = err instanceof Error ? err.message : String(err); return /in a cooldown window/i.test(msg); } + +/** + * Extract retryAfterMs from a CredentialCooldownError, if available. + * Returns undefined for unstructured errors or when no retry hint exists. + */ +export function getCooldownRetryAfterMs(err: unknown): number | undefined { + if (err && typeof err === "object" && (err as Record).code === "AUTH_COOLDOWN") { + return (err as Record).retryAfterMs as number | undefined; + } + return undefined; +} diff --git a/src/resources/extensions/gsd/auto/loop.ts b/src/resources/extensions/gsd/auto/loop.ts index 018f1884b..412ba1e26 100644 --- a/src/resources/extensions/gsd/auto/loop.ts +++ b/src/resources/extensions/gsd/auto/loop.ts @@ -27,7 +27,7 @@ import { runFinalize, } from "./phases.js"; import { debugLog } from "../debug-logger.js"; -import { isInfrastructureError, isTransientCooldownError, COOLDOWN_FALLBACK_WAIT_MS } from "./infra-errors.js"; +import { isInfrastructureError, isTransientCooldownError, getCooldownRetryAfterMs, COOLDOWN_FALLBACK_WAIT_MS, MAX_COOLDOWN_RETRIES } from "./infra-errors.js"; import { resolveEngine } from "../engine-resolver.js"; /** @@ -48,6 +48,7 @@ export async function autoLoop( let iteration = 0; const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 }; let consecutiveErrors = 0; + let consecutiveCooldowns = 0; const recentErrorMessages: string[] = []; while (s.active) { @@ -203,6 +204,7 @@ export async function autoLoop( deps.clearUnitTimeout(); consecutiveErrors = 0; + consecutiveCooldowns = 0; recentErrorMessages.length = 0; deps.emitJournalEvent({ ts: new Date().toISOString(), flowId, seq: nextSeq(), eventType: "iteration-end", data: { iteration } }); debugLog("autoLoop", { phase: "iteration-complete", iteration }); @@ -265,6 +267,7 @@ export async function autoLoop( if (finalizeResult.action === "continue") continue; consecutiveErrors = 0; // Iteration completed successfully + consecutiveCooldowns = 0; recentErrorMessages.length = 0; deps.emitJournalEvent({ ts: new Date().toISOString(), flowId, seq: nextSeq(), eventType: "iteration-end", data: { iteration } }); debugLog("autoLoop", { phase: "iteration-complete", iteration }); @@ -300,23 +303,44 @@ export async function autoLoop( break; } - // ── Credential cooldown: wait and retry without burning error budget ── + // ── Credential cooldown: wait and retry with bounded budget ── // A 429 triggers a 30s credential backoff in AuthStorage. If the SDK's // getApiKey() retries couldn't outlast the window, the error surfaces // here. Wait for the cooldown to clear rather than counting it as a - // consecutive failure — 3 fast cooldown errors would otherwise kill - // the auto session unnecessarily. + // consecutive failure — but cap retries so we don't spin for hours + // on persistent quota exhaustion. if (isTransientCooldownError(loopErr)) { + consecutiveCooldowns++; + const retryAfterMs = getCooldownRetryAfterMs(loopErr); debugLog("autoLoop", { phase: "cooldown-wait", iteration, + consecutiveCooldowns, + retryAfterMs, error: msg, }); + + if (consecutiveCooldowns > MAX_COOLDOWN_RETRIES) { + ctx.ui.notify( + `Auto-mode stopped: ${consecutiveCooldowns} consecutive credential cooldowns — rate limit or quota may be persistently exhausted.`, + "error", + ); + await deps.stopAuto( + ctx, + pi, + `${consecutiveCooldowns} consecutive credential cooldowns exceeded retry budget`, + ); + break; + } + + const waitMs = (retryAfterMs !== undefined && retryAfterMs > 0 && retryAfterMs <= 60_000) + ? retryAfterMs + 500 // Use structured hint + small buffer + : COOLDOWN_FALLBACK_WAIT_MS; ctx.ui.notify( - `Credentials in cooldown — waiting for rate limit to clear before retrying.`, + `Credentials in cooldown (${consecutiveCooldowns}/${MAX_COOLDOWN_RETRIES}) — waiting ${Math.round(waitMs / 1000)}s before retrying.`, "warning", ); - await new Promise(resolve => setTimeout(resolve, COOLDOWN_FALLBACK_WAIT_MS)); + await new Promise(resolve => setTimeout(resolve, waitMs)); continue; // Retry iteration without incrementing consecutiveErrors }