From 1ae93e98225ac776d1599890fc0265bd6352be44 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Sun, 12 Apr 2026 09:04:41 -0500 Subject: [PATCH] fix(auto): survive transient 429 credential cooldown in auto sessions getApiKey() retry loop (3 attempts, ~12s) couldn't outlast the 30s rate-limit backoff window, causing cooldown errors to cascade through the auto-loop and trigger a hard stop after 3 consecutive failures. - Add AuthStorage.getEarliestBackoffExpiry() to expose when the next credential becomes available - Update getApiKey() to sleep until backoff expiry (up to 60s) instead of fixed 2s/4s/6s delays - Add isTransientCooldownError() detector in infra-errors.ts - Auto-loop now waits 35s on cooldown errors without incrementing the consecutive error counter Closes #4052 --- .../pi-coding-agent/src/core/auth-storage.ts | 30 +++++++++++++++++++ packages/pi-coding-agent/src/core/sdk.ts | 20 ++++++++++++- .../extensions/gsd/auto/infra-errors.ts | 20 +++++++++++++ src/resources/extensions/gsd/auto/loop.ts | 22 +++++++++++++- 4 files changed, 90 insertions(+), 2 deletions(-) diff --git a/packages/pi-coding-agent/src/core/auth-storage.ts b/packages/pi-coding-agent/src/core/auth-storage.ts index fb1532252..02e2f3103 100644 --- a/packages/pi-coding-agent/src/core/auth-storage.ts +++ b/packages/pi-coding-agent/src/core/auth-storage.ts @@ -559,6 +559,36 @@ export class AuthStorage { return remaining; } + /** + * Get the earliest timestamp at which any credential for this provider + * will become available again. Returns `undefined` when no credentials + * are backed off (i.e. all are immediately available). + * + * Callers can use this to sleep exactly long enough for the cooldown to + * clear instead of using a fixed retry delay that may be shorter than the + * backoff window. 
+ */ + getEarliestBackoffExpiry(provider: string): number | undefined { + const providerMap = this.credentialBackoff.get(provider); + if (!providerMap || providerMap.size === 0) return undefined; + + const now = Date.now(); + let earliest: number | undefined; + + for (const [index, expiresAt] of providerMap) { + if (expiresAt <= now) { + // Already expired — clean up + providerMap.delete(index); + continue; + } + if (earliest === undefined || expiresAt < earliest) { + earliest = expiresAt; + } + } + + return earliest; + } + /** * Check if a credential index is currently backed off. */ diff --git a/packages/pi-coding-agent/src/core/sdk.ts b/packages/pi-coding-agent/src/core/sdk.ts index 07ed24c53..bcf2f27b7 100644 --- a/packages/pi-coding-agent/src/core/sdk.ts +++ b/packages/pi-coding-agent/src/core/sdk.ts @@ -363,8 +363,12 @@ export async function createAgentSession(options: CreateAgentSessionOptions = {} // Retry key resolution with backoff to handle transient network failures // (e.g., OAuth token refresh failing due to brief connectivity loss). + // When credentials are in a cooldown window (e.g., after a 429), wait + // for the backoff to expire instead of using fixed delays that are + // shorter than the cooldown duration. const maxAttempts = 3; const baseDelayMs = 2000; + const maxCooldownWaitMs = 60_000; // Don't wait longer than 60s (skip quota-exhausted 30min backoffs) for (let attempt = 1; attempt <= maxAttempts; attempt++) { const key = await modelRegistry.getApiKeyForProvider(resolvedProvider); if (key) return key; @@ -379,7 +383,21 @@ export async function createAgentSession(options: CreateAgentSessionOptions = {} const isOAuth = model && modelRegistry.isUsingOAuth(model); if (!hasAuth && !isOAuth) break; - // Wait with exponential backoff before retrying + // If credentials are in a cooldown window, wait for the earliest + // one to expire rather than using a fixed delay that's too short. 
+ const backoffExpiry = modelRegistry.authStorage.getEarliestBackoffExpiry(resolvedProvider); + if (backoffExpiry !== undefined) { + const waitMs = backoffExpiry - Date.now() + 500; // 500ms buffer + if (waitMs > 0 && waitMs <= maxCooldownWaitMs) { + await new Promise(resolve => setTimeout(resolve, waitMs)); + continue; // Retry immediately after cooldown clears + } + if (waitMs > maxCooldownWaitMs) { + break; // Quota-exhausted or very long backoff — don't block + } + } + + // Standard linear backoff (baseDelayMs * attempt) for non-cooldown transient failures await new Promise(resolve => setTimeout(resolve, baseDelayMs * attempt)); } diff --git a/src/resources/extensions/gsd/auto/infra-errors.ts b/src/resources/extensions/gsd/auto/infra-errors.ts index 17c1a553d..5953066b6 100644 --- a/src/resources/extensions/gsd/auto/infra-errors.ts +++ b/src/resources/extensions/gsd/auto/infra-errors.ts @@ -46,3 +46,23 @@ export function isInfrastructureError(err: unknown): string | null { if (msg.includes("database disk image is malformed")) return "SQLITE_CORRUPT"; return null; } + +/** + * Default wait duration when a cooldown error is detected but no specific + * expiry is available from AuthStorage (e.g., error propagated across + * process boundary without structured backoff data). + */ +export const COOLDOWN_FALLBACK_WAIT_MS = 35_000; // 35s — slightly longer than the 30s rate-limit backoff + +/** + * Detect whether an error is a transient credential cooldown that should + * be waited out rather than counted as a consecutive failure. + * + * These errors are generated by getApiKey() in sdk.ts when all credentials + * for a provider are in a backoff window (typically after a 429). The + * auto-loop should pause and retry instead of escalating to hard stop. + */ +export function isTransientCooldownError(err: unknown): boolean { + const msg = err instanceof Error ? 
err.message : String(err); + return /in a cooldown window/i.test(msg); +} diff --git a/src/resources/extensions/gsd/auto/loop.ts b/src/resources/extensions/gsd/auto/loop.ts index 3a0c8de10..018f1884b 100644 --- a/src/resources/extensions/gsd/auto/loop.ts +++ b/src/resources/extensions/gsd/auto/loop.ts @@ -27,7 +27,7 @@ import { runFinalize, } from "./phases.js"; import { debugLog } from "../debug-logger.js"; -import { isInfrastructureError } from "./infra-errors.js"; +import { isInfrastructureError, isTransientCooldownError, COOLDOWN_FALLBACK_WAIT_MS } from "./infra-errors.js"; import { resolveEngine } from "../engine-resolver.js"; /** @@ -300,6 +300,26 @@ export async function autoLoop( break; } + // ── Credential cooldown: wait and retry without burning error budget ── + // A 429 triggers a 30s credential backoff in AuthStorage. If the SDK's + // getApiKey() retries couldn't outlast the window, the error surfaces + // here. Wait for the cooldown to clear rather than counting it as a + // consecutive failure — 3 fast cooldown errors would otherwise kill + // the auto session unnecessarily. + if (isTransientCooldownError(loopErr)) { + debugLog("autoLoop", { + phase: "cooldown-wait", + iteration, + error: msg, + }); + ctx.ui.notify( + `Credentials in cooldown — waiting for rate limit to clear before retrying.`, + "warning", + ); + await new Promise(resolve => setTimeout(resolve, COOLDOWN_FALLBACK_WAIT_MS)); + continue; // Retry iteration without incrementing consecutiveErrors + } + consecutiveErrors++; recentErrorMessages.push(msg.length > 120 ? msg.slice(0, 120) + "..." : msg); debugLog("autoLoop", {