fix(auto): survive transient 429 credential cooldown in auto sessions
The getApiKey() retry loop (3 attempts, ~12s total) could not outlast the 30s rate-limit backoff window, so cooldown errors cascaded through the auto-loop and triggered a hard stop after 3 consecutive failures.

- Add AuthStorage.getEarliestBackoffExpiry() to expose when the next credential becomes available
- Update getApiKey() to sleep until backoff expiry (up to 60s) instead of fixed 2s/4s/6s delays
- Add isTransientCooldownError() detector in infra-errors.ts
- Auto-loop now waits 35s on cooldown errors without incrementing the consecutive error counter

Closes #4052
This commit is contained in:
parent
b22f7baafb
commit
1ae93e9822
4 changed files with 90 additions and 2 deletions
|
|
@ -559,6 +559,36 @@ export class AuthStorage {
|
|||
return remaining;
|
||||
}
|
||||
|
||||
/**
 * Report when the soonest backed-off credential for `provider` becomes
 * usable again, so callers can sleep for the real cooldown duration rather
 * than a fixed retry delay that may be too short.
 *
 * Entries whose expiry has already passed are pruned from the provider's
 * backoff map as a side effect of the scan.
 *
 * @param provider - Provider whose credential backoff entries are inspected.
 * @returns The smallest expiry timestamp that is still in the future
 *   (compared against `Date.now()`), or `undefined` when the provider has
 *   no backoff entries or every entry has already expired.
 */
getEarliestBackoffExpiry(provider: string): number | undefined {
  const backoffs = this.credentialBackoff.get(provider);
  if (!backoffs || backoffs.size === 0) return undefined;

  const cutoff = Date.now();
  let soonest: number | undefined;

  for (const [credentialIndex, expiry] of backoffs) {
    if (expiry > cutoff) {
      // Still cooling down: keep the minimum expiry seen so far.
      if (soonest === undefined || expiry < soonest) soonest = expiry;
    } else {
      // Stale entry: drop it. Deleting the current key while iterating a
      // Map is well-defined and does not disturb the remaining iteration.
      backoffs.delete(credentialIndex);
    }
  }

  return soonest;
}
|
||||
|
||||
/**
|
||||
* Check if a credential index is currently backed off.
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -363,8 +363,12 @@ export async function createAgentSession(options: CreateAgentSessionOptions = {}
|
|||
|
||||
// Retry key resolution with backoff to handle transient network failures
|
||||
// (e.g., OAuth token refresh failing due to brief connectivity loss).
|
||||
// When credentials are in a cooldown window (e.g., after a 429), wait
|
||||
// for the backoff to expire instead of using fixed delays that are
|
||||
// shorter than the cooldown duration.
|
||||
const maxAttempts = 3;
|
||||
const baseDelayMs = 2000;
|
||||
const maxCooldownWaitMs = 60_000; // Don't wait longer than 60s (skip quota-exhausted 30min backoffs)
|
||||
for (let attempt = 1; attempt <= maxAttempts; attempt++) {
|
||||
const key = await modelRegistry.getApiKeyForProvider(resolvedProvider);
|
||||
if (key) return key;
|
||||
|
|
@ -379,7 +383,21 @@ export async function createAgentSession(options: CreateAgentSessionOptions = {}
|
|||
const isOAuth = model && modelRegistry.isUsingOAuth(model);
|
||||
if (!hasAuth && !isOAuth) break;
|
||||
|
||||
// Wait with exponential backoff before retrying
|
||||
// If credentials are in a cooldown window, wait for the earliest
|
||||
// one to expire rather than using a fixed delay that's too short.
|
||||
const backoffExpiry = modelRegistry.authStorage.getEarliestBackoffExpiry(resolvedProvider);
|
||||
if (backoffExpiry !== undefined) {
|
||||
const waitMs = backoffExpiry - Date.now() + 500; // 500ms buffer
|
||||
if (waitMs > 0 && waitMs <= maxCooldownWaitMs) {
|
||||
await new Promise(resolve => setTimeout(resolve, waitMs));
|
||||
continue; // Retry immediately after cooldown clears
|
||||
}
|
||||
if (waitMs > maxCooldownWaitMs) {
|
||||
break; // Quota-exhausted or very long backoff — don't block
|
||||
}
|
||||
}
|
||||
|
||||
// Standard exponential backoff for non-cooldown transient failures
|
||||
await new Promise(resolve => setTimeout(resolve, baseDelayMs * attempt));
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -46,3 +46,23 @@ export function isInfrastructureError(err: unknown): string | null {
|
|||
if (msg.includes("database disk image is malformed")) return "SQLITE_CORRUPT";
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Default wait duration when a cooldown error is detected but no specific
|
||||
* expiry is available from AuthStorage (e.g., error propagated across
|
||||
* process boundary without structured backoff data).
|
||||
*/
|
||||
export const COOLDOWN_FALLBACK_WAIT_MS = 35_000; // 35s — slightly longer than the 30s rate-limit backoff
|
||||
|
||||
/**
 * Detect whether an error is a transient credential cooldown that should
 * be waited out rather than counted as a consecutive failure.
 *
 * These errors are generated by getApiKey() in sdk.ts when all credentials
 * for a provider are in a backoff window (typically after a 429). The
 * auto-loop should pause and retry instead of escalating to hard stop.
 *
 * Detection is by message text: a case-insensitive match on
 * "in a cooldown window". The message is taken from, in order:
 *   1. `err.message` when `err` is an `Error` instance;
 *   2. a string `message` property on a plain object (the shape an error
 *      usually has after being serialized, where it is no longer an
 *      `Error` instance);
 *   3. `String(err)` for everything else (strings, numbers, null, ...).
 *
 * @param err - The caught value; may be anything that was thrown.
 * @returns `true` when the extracted message identifies a cooldown error.
 */
export function isTransientCooldownError(err: unknown): boolean {
  let msg: string;
  if (err instanceof Error) {
    msg = err.message;
  } else if (typeof err === "object" && err !== null) {
    // String(obj) yields "[object Object]" and would hide the real message,
    // so prefer an explicit string `message` property when one exists.
    const candidate = (err as { message?: unknown }).message;
    msg = typeof candidate === "string" ? candidate : String(err);
  } else {
    msg = String(err);
  }
  return /in a cooldown window/i.test(msg);
}
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ import {
|
|||
runFinalize,
|
||||
} from "./phases.js";
|
||||
import { debugLog } from "../debug-logger.js";
|
||||
import { isInfrastructureError } from "./infra-errors.js";
|
||||
import { isInfrastructureError, isTransientCooldownError, COOLDOWN_FALLBACK_WAIT_MS } from "./infra-errors.js";
|
||||
import { resolveEngine } from "../engine-resolver.js";
|
||||
|
||||
/**
|
||||
|
|
@ -300,6 +300,26 @@ export async function autoLoop(
|
|||
break;
|
||||
}
|
||||
|
||||
// ── Credential cooldown: wait and retry without burning error budget ──
|
||||
// A 429 triggers a 30s credential backoff in AuthStorage. If the SDK's
|
||||
// getApiKey() retries couldn't outlast the window, the error surfaces
|
||||
// here. Wait for the cooldown to clear rather than counting it as a
|
||||
// consecutive failure — 3 fast cooldown errors would otherwise kill
|
||||
// the auto session unnecessarily.
|
||||
if (isTransientCooldownError(loopErr)) {
|
||||
debugLog("autoLoop", {
|
||||
phase: "cooldown-wait",
|
||||
iteration,
|
||||
error: msg,
|
||||
});
|
||||
ctx.ui.notify(
|
||||
`Credentials in cooldown — waiting for rate limit to clear before retrying.`,
|
||||
"warning",
|
||||
);
|
||||
await new Promise(resolve => setTimeout(resolve, COOLDOWN_FALLBACK_WAIT_MS));
|
||||
continue; // Retry iteration without incrementing consecutiveErrors
|
||||
}
|
||||
|
||||
consecutiveErrors++;
|
||||
recentErrorMessages.push(msg.length > 120 ? msg.slice(0, 120) + "..." : msg);
|
||||
debugLog("autoLoop", {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue