fix(auto): survive transient 429 credential cooldown in auto sessions

getApiKey() retry loop (3 attempts, ~12s) couldn't outlast the 30s
rate-limit backoff window, causing cooldown errors to cascade through
the auto-loop and trigger a hard stop after 3 consecutive failures.

- Add AuthStorage.getEarliestBackoffExpiry() to expose when the next
  credential becomes available
- Update getApiKey() to sleep until backoff expiry (up to 60s) instead
  of fixed 2s/4s/6s delays
- Add isTransientCooldownError() detector in infra-errors.ts
- Auto-loop now waits 35s on cooldown errors without incrementing the
  consecutive error counter

Closes #4052
This commit is contained in:
Jeremy 2026-04-12 09:04:41 -05:00
parent b22f7baafb
commit 1ae93e9822
4 changed files with 90 additions and 2 deletions

View file

@ -559,6 +559,36 @@ export class AuthStorage {
return remaining;
}
/**
 * Report when the soonest-expiring backed-off credential for `provider`
 * becomes usable again.
 *
 * Entries whose expiry is already at or before the current time are pruned
 * from the provider's backoff map as a side effect of the scan.
 *
 * This lets a caller sleep until the cooldown actually ends rather than
 * retrying on a fixed delay that may be shorter than the backoff window.
 *
 * @param provider - Provider whose credential backoff entries are inspected.
 * @returns The smallest still-pending expiry timestamp (same clock as
 *   `Date.now()`), or `undefined` when the provider has no pending backoff.
 */
getEarliestBackoffExpiry(provider: string): number | undefined {
  const backoffs = this.credentialBackoff.get(provider);
  if (!backoffs || backoffs.size === 0) {
    return undefined;
  }

  const checkedAt = Date.now();
  let soonest: number | undefined;
  backoffs.forEach((expiry, credentialIndex) => {
    if (expiry <= checkedAt) {
      // Stale entry: the backoff has already lapsed, so drop it.
      backoffs.delete(credentialIndex);
    } else if (soonest === undefined || expiry < soonest) {
      soonest = expiry;
    }
  });
  return soonest;
}
/**
* Check if a credential index is currently backed off.
*/

View file

@ -363,8 +363,12 @@ export async function createAgentSession(options: CreateAgentSessionOptions = {}
// Retry key resolution with backoff to handle transient network failures
// (e.g., OAuth token refresh failing due to brief connectivity loss).
// When credentials are in a cooldown window (e.g., after a 429), wait
// for the backoff to expire instead of using fixed delays that are
// shorter than the cooldown duration.
const maxAttempts = 3;
const baseDelayMs = 2000;
const maxCooldownWaitMs = 60_000; // Don't wait longer than 60s (skip quota-exhausted 30min backoffs)
for (let attempt = 1; attempt <= maxAttempts; attempt++) {
const key = await modelRegistry.getApiKeyForProvider(resolvedProvider);
if (key) return key;
@ -379,7 +383,21 @@ export async function createAgentSession(options: CreateAgentSessionOptions = {}
const isOAuth = model && modelRegistry.isUsingOAuth(model);
if (!hasAuth && !isOAuth) break;
// Wait with exponential backoff before retrying
// If credentials are in a cooldown window, wait for the earliest
// one to expire rather than using a fixed delay that's too short.
const backoffExpiry = modelRegistry.authStorage.getEarliestBackoffExpiry(resolvedProvider);
if (backoffExpiry !== undefined) {
const waitMs = backoffExpiry - Date.now() + 500; // 500ms buffer
if (waitMs > 0 && waitMs <= maxCooldownWaitMs) {
await new Promise(resolve => setTimeout(resolve, waitMs));
continue; // Retry immediately after cooldown clears
}
if (waitMs > maxCooldownWaitMs) {
break; // Quota-exhausted or very long backoff — don't block
}
}
// Standard linearly increasing backoff (baseDelayMs × attempt: 2s, 4s, 6s) for non-cooldown transient failures
await new Promise(resolve => setTimeout(resolve, baseDelayMs * attempt));
}

View file

@ -46,3 +46,23 @@ export function isInfrastructureError(err: unknown): string | null {
if (msg.includes("database disk image is malformed")) return "SQLITE_CORRUPT";
return null;
}
/**
 * How long to pause, in milliseconds, when a cooldown error is detected and
 * AuthStorage cannot supply a concrete expiry (for example, the error
 * crossed a process boundary and carries no structured backoff data).
 *
 * Set to 35 seconds, slightly longer than the 30-second rate-limit backoff.
 */
export const COOLDOWN_FALLBACK_WAIT_MS = 35 * 1_000;
/**
 * Detect whether an error is a transient credential cooldown that should
 * be waited out rather than counted as a consecutive failure.
 *
 * These errors are generated by getApiKey() in sdk.ts when all credentials
 * for a provider are in a backoff window (typically after a 429). The
 * auto-loop should pause and retry instead of escalating to hard stop.
 *
 * Detection is a case-insensitive match on the phrase "in a cooldown window".
 * The phrase is looked for in:
 *  - `err.message` for real `Error` instances;
 *  - a string `message` property on any other object (errors that were
 *    serialized or came from another realm fail `instanceof Error`);
 *  - the string form of the value, as a final fallback.
 *
 * @param err - The caught value; may be anything.
 * @returns `true` when the value looks like a credential cooldown error.
 *   Never throws, so it is safe to call from inside an error handler.
 */
export function isTransientCooldownError(err: unknown): boolean {
  const cooldownPattern = /in a cooldown window/i;

  if (err instanceof Error) return cooldownPattern.test(err.message);

  if (typeof err === "object" && err !== null) {
    // Error-like plain objects would otherwise stringify to "[object Object]".
    const { message } = err as { message?: unknown };
    if (typeof message === "string" && cooldownPattern.test(message)) return true;
  }

  try {
    return cooldownPattern.test(String(err));
  } catch {
    // String() throws for values with no primitive conversion
    // (e.g. null-prototype objects); treat those as "not a cooldown".
    return false;
  }
}

View file

@ -27,7 +27,7 @@ import {
runFinalize,
} from "./phases.js";
import { debugLog } from "../debug-logger.js";
import { isInfrastructureError } from "./infra-errors.js";
import { isInfrastructureError, isTransientCooldownError, COOLDOWN_FALLBACK_WAIT_MS } from "./infra-errors.js";
import { resolveEngine } from "../engine-resolver.js";
/**
@ -300,6 +300,26 @@ export async function autoLoop(
break;
}
// ── Credential cooldown: wait and retry without burning error budget ──
// A 429 triggers a 30s credential backoff in AuthStorage. If the SDK's
// getApiKey() retries couldn't outlast the window, the error surfaces
// here. Wait for the cooldown to clear rather than counting it as a
// consecutive failure — 3 fast cooldown errors would otherwise kill
// the auto session unnecessarily.
if (isTransientCooldownError(loopErr)) {
debugLog("autoLoop", {
phase: "cooldown-wait",
iteration,
error: msg,
});
ctx.ui.notify(
`Credentials in cooldown — waiting for rate limit to clear before retrying.`,
"warning",
);
await new Promise(resolve => setTimeout(resolve, COOLDOWN_FALLBACK_WAIT_MS));
continue; // Retry iteration without incrementing consecutiveErrors
}
consecutiveErrors++;
recentErrorMessages.push(msg.length > 120 ? msg.slice(0, 120) + "..." : msg);
debugLog("autoLoop", {