Merge pull request #4053 from jeremymcs/fix/auto-session-credential-cooldown

fix(auto): survive transient 429 credential cooldown
This commit is contained in:
Jeremy McSpadden 2026-04-12 09:42:37 -05:00 committed by GitHub
commit 564a71da37
8 changed files with 536 additions and 10 deletions

View file

@ -423,3 +423,111 @@ describe("AuthStorage — getAll()", () => {
assert.equal((all["openai"] as any).key, "sk-openai");
});
});
// ─── getEarliestBackoffExpiry ─────────────────────────────────────────────────
// Suite for AuthStorage.getEarliestBackoffExpiry. It covers three setups:
// providers with no backoff state, backoffs created through the public
// getApiKey / markUsageLimitReached calls, and entries injected directly
// into the credentialBackoff map to control exact expiry timestamps.
describe("AuthStorage — getEarliestBackoffExpiry", () => {
it("returns undefined when no credentials are configured for the provider", () => {
const storage = inMemory({});
assert.equal(storage.getEarliestBackoffExpiry("anthropic"), undefined);
});
it("returns undefined when credentials exist but none are backed off", () => {
const storage = inMemory({ anthropic: makeKey("sk-only") });
// No markUsageLimitReached call — credentialBackoff map is empty
assert.equal(storage.getEarliestBackoffExpiry("anthropic"), undefined);
});
it("returns a future timestamp when a single credential is backed off", async () => {
const storage = inMemory({ anthropic: makeKey("sk-only") });
// getApiKey is called first so that markUsageLimitReached has a
// "last used" credential to back off (presumably — verify against AuthStorage).
await storage.getApiKey("anthropic");
storage.markUsageLimitReached("anthropic");
const expiry = storage.getEarliestBackoffExpiry("anthropic");
assert.ok(expiry !== undefined, "should return a timestamp");
assert.ok(expiry > Date.now(), "expiry should be in the future");
});
it("returns the earliest expiry when multiple credentials are backed off", async () => {
const storage = inMemory({
anthropic: [makeKey("sk-1"), makeKey("sk-2")],
});
// Back off both credentials with the default rate_limit backoff (30 s)
await storage.getApiKey("anthropic"); // uses index 0
storage.markUsageLimitReached("anthropic"); // backs off index 0
await storage.getApiKey("anthropic"); // uses index 1
storage.markUsageLimitReached("anthropic"); // backs off index 1
const expiry = storage.getEarliestBackoffExpiry("anthropic");
// NOTE(review): these assertions only prove the result is a future
// timestamp; the "minimum of several" property is pinned by the
// manual-injection test at the end of this suite.
assert.ok(expiry !== undefined, "should return a timestamp");
assert.ok(expiry > Date.now(), "expiry should be in the future");
});
it("returns undefined after backed-off credentials expire (cleans up entries)", () => {
// Manually inject an already-expired backoff entry so we can test
// the cleanup path without actually waiting 30 seconds.
const storage = inMemory({ anthropic: makeKey("sk-only") });
// Access private credentialBackoff map via type assertion to inject expired entry
const credentialBackoff: Map<string, Map<number, number>> =
(storage as any).credentialBackoff;
const providerMap = new Map<number, number>();
// expiresAt in the past
providerMap.set(0, Date.now() - 1_000);
credentialBackoff.set("anthropic", providerMap);
// getEarliestBackoffExpiry should clean up the expired entry and return undefined
const expiry = storage.getEarliestBackoffExpiry("anthropic");
assert.equal(expiry, undefined);
// Confirm the expired entry was removed from the map
// (checks the same Map instance that was injected above)
assert.equal(providerMap.size, 0, "expired entry should have been deleted");
});
it("returns undefined when provider is not in credentialBackoff map at all", () => {
const storage = inMemory({ openai: makeKey("sk-openai") });
// anthropic has no backoff map entry at all
assert.equal(storage.getEarliestBackoffExpiry("anthropic"), undefined);
});
it("only returns expiry for the requested provider, not other providers", async () => {
const storage = inMemory({
anthropic: makeKey("sk-ant"),
openai: makeKey("sk-oai"),
});
// Back off anthropic
await storage.getApiKey("anthropic");
storage.markUsageLimitReached("anthropic");
// openai is not backed off
assert.equal(storage.getEarliestBackoffExpiry("openai"), undefined);
// anthropic is backed off
const expiry = storage.getEarliestBackoffExpiry("anthropic");
assert.ok(expiry !== undefined);
assert.ok(expiry > Date.now());
});
it("returns the minimum expiry when one credential expires sooner than another", () => {
const storage = inMemory({
anthropic: [makeKey("sk-1"), makeKey("sk-2")],
});
// Both expiries are in the future relative to `now`, so neither is
// cleaned up and the method must choose between them.
const now = Date.now();
const nearExpiry = now + 5_000; // expires in 5 s
const farExpiry = now + 30_000; // expires in 30 s
// Inject two different backoff expiries manually
const credentialBackoff: Map<string, Map<number, number>> =
(storage as any).credentialBackoff;
const providerMap = new Map<number, number>();
providerMap.set(0, nearExpiry);
providerMap.set(1, farExpiry);
credentialBackoff.set("anthropic", providerMap);
const expiry = storage.getEarliestBackoffExpiry("anthropic");
assert.equal(expiry, nearExpiry, "should return the nearest (smallest) expiry");
});
});

View file

@ -559,6 +559,36 @@ export class AuthStorage {
return remaining;
}
/**
 * Report when the next backed-off credential for `provider` becomes usable.
 *
 * Entries whose expiry is at or before the current time are removed from the
 * provider's backoff map as a side effect; the smallest remaining expiry is
 * returned. A caller can sleep until that timestamp instead of relying on a
 * fixed retry delay that may be shorter than the backoff window.
 *
 * @param provider - Provider whose backoff entries are inspected.
 * @returns The smallest pending expiry (same clock as `Date.now()`), or
 *   `undefined` when the provider has no pending backoff entries.
 */
getEarliestBackoffExpiry(provider: string): number | undefined {
  const backoffs = this.credentialBackoff.get(provider);
  if (!backoffs || backoffs.size === 0) {
    return undefined;
  }

  const cutoff = Date.now();
  let soonest: number | undefined;
  backoffs.forEach((deadline, credentialIndex) => {
    if (deadline <= cutoff) {
      // Stale entry: drop it so later lookups do not see it again.
      backoffs.delete(credentialIndex);
      return;
    }
    if (soonest === undefined || deadline < soonest) {
      soonest = deadline;
    }
  });
  return soonest;
}
/**
* Check if a credential index is currently backed off.
*/

View file

@ -0,0 +1,89 @@
// pi-coding-agent / CredentialCooldownError unit tests
// Copyright (c) 2026 Jeremy McSpadden <jeremy@fluxlabs.net>
import { describe, it } from "node:test";
import assert from "node:assert/strict";
import { CredentialCooldownError } from "./sdk.js";
// ─── CredentialCooldownError ──────────────────────────────────────────────────
// Suite for the CredentialCooldownError class: checks its Error identity,
// the fixed `name` / `code` values, the message contents, and how the
// optional `retryAfterMs` constructor argument is exposed.
describe("CredentialCooldownError", () => {
it("is an instance of Error", () => {
const err = new CredentialCooldownError("anthropic");
assert.ok(err instanceof Error);
});
it("has name set to CredentialCooldownError", () => {
const err = new CredentialCooldownError("anthropic");
assert.equal(err.name, "CredentialCooldownError");
});
it("has code set to AUTH_COOLDOWN", () => {
const err = new CredentialCooldownError("anthropic");
assert.equal(err.code, "AUTH_COOLDOWN");
});
it("message includes the provider name", () => {
const err = new CredentialCooldownError("openai");
assert.ok(
err.message.includes("openai"),
`Expected message to include provider "openai", got: ${err.message}`,
);
});
it("message mentions cooldown window", () => {
const err = new CredentialCooldownError("anthropic");
assert.ok(
/cooldown window/i.test(err.message),
`Expected message to mention "cooldown window", got: ${err.message}`,
);
});
it("retryAfterMs is undefined when not provided", () => {
const err = new CredentialCooldownError("anthropic");
assert.equal(err.retryAfterMs, undefined);
});
it("retryAfterMs holds the provided value when specified", () => {
const err = new CredentialCooldownError("anthropic", 30_000);
assert.equal(err.retryAfterMs, 30_000);
});
it("retryAfterMs is 0 when explicitly passed as 0", () => {
const err = new CredentialCooldownError("anthropic", 0);
assert.equal(err.retryAfterMs, 0);
});
// NOTE(review): despite the title, this only asserts the value of `code`
// for several providers; nothing here attempts a write to prove readonly-ness.
it("code property is readonly and always AUTH_COOLDOWN regardless of provider", () => {
for (const provider of ["anthropic", "openai", "google", "openrouter"]) {
const err = new CredentialCooldownError(provider);
assert.equal(err.code, "AUTH_COOLDOWN", `code should be AUTH_COOLDOWN for provider "${provider}"`);
}
});
it("different providers produce different messages", () => {
const err1 = new CredentialCooldownError("anthropic");
const err2 = new CredentialCooldownError("openai");
assert.notEqual(err1.message, err2.message);
});
it("can be caught as an Error in a try/catch", () => {
let caught: unknown;
try {
throw new CredentialCooldownError("anthropic", 5_000);
} catch (e) {
caught = e;
}
assert.ok(caught instanceof Error);
assert.ok(caught instanceof CredentialCooldownError);
assert.equal((caught as CredentialCooldownError).retryAfterMs, 5_000);
});
it("code property is detectable via plain object check (cross-process pattern)", () => {
const err = new CredentialCooldownError("anthropic", 15_000);
// Simulate cross-process serialization: only plain properties survive JSON round-trip
// NOTE(review): the fields are copied by hand into an object literal; no
// actual JSON.stringify / JSON.parse round-trip is performed here.
const plain = { code: err.code, retryAfterMs: err.retryAfterMs, message: err.message };
assert.equal(plain.code, "AUTH_COOLDOWN");
assert.equal(plain.retryAfterMs, 15_000);
});
});

View file

@ -1,4 +1,24 @@
import { join } from "node:path";
/**
 * Error raised when every credential for a provider is inside a backoff
 * window.
 *
 * The fixed `code` and the optional `retryAfterMs` hint give callers (for
 * example the auto-loop) typed fields to branch on, so they do not have to
 * pattern-match the human-readable message.
 */
export class CredentialCooldownError extends Error {
  /** Stable discriminator; identical for every instance. */
  readonly code = "AUTH_COOLDOWN" as const;

  /**
   * Milliseconds until the earliest credential becomes available again.
   * `undefined` means the caller did not supply a hint.
   */
  readonly retryAfterMs: number | undefined;

  /**
   * @param provider - Provider name, quoted verbatim in the message.
   * @param retryAfterMs - Optional wait hint in milliseconds; stored as given.
   */
  constructor(provider: string, retryAfterMs?: number) {
    super(
      [
        `All credentials for "${provider}" are in a cooldown window.`,
        "Please wait a moment and try again, or switch to a different provider.",
      ].join(" "),
    );
    this.name = "CredentialCooldownError";
    this.retryAfterMs = retryAfterMs;
  }
}
import { Agent, type AgentMessage, type ThinkingLevel } from "@gsd/pi-agent-core";
import type { Message, Model } from "@gsd/pi-ai";
import { getAgentDir, getDocsPath } from "../config.js";
@ -363,8 +383,12 @@ export async function createAgentSession(options: CreateAgentSessionOptions = {}
// Retry key resolution with backoff to handle transient network failures
// (e.g., OAuth token refresh failing due to brief connectivity loss).
// When credentials are in a cooldown window (e.g., after a 429), wait
// for the backoff to expire instead of using fixed delays that are
// shorter than the cooldown duration.
const maxAttempts = 3;
const baseDelayMs = 2000;
const maxCooldownWaitMs = 60_000; // Don't wait longer than 60s (skip quota-exhausted 30min backoffs)
for (let attempt = 1; attempt <= maxAttempts; attempt++) {
const key = await modelRegistry.getApiKeyForProvider(resolvedProvider);
if (key) return key;
@ -379,7 +403,21 @@ export async function createAgentSession(options: CreateAgentSessionOptions = {}
const isOAuth = model && modelRegistry.isUsingOAuth(model);
if (!hasAuth && !isOAuth) break;
// Wait with exponential backoff before retrying
// If credentials are in a cooldown window, wait for the earliest
// one to expire rather than using a fixed delay that's too short.
const backoffExpiry = modelRegistry.authStorage.getEarliestBackoffExpiry(resolvedProvider);
if (backoffExpiry !== undefined) {
const waitMs = backoffExpiry - Date.now() + 500; // 500ms buffer
if (waitMs > 0 && waitMs <= maxCooldownWaitMs) {
await new Promise(resolve => setTimeout(resolve, waitMs));
continue; // Retry immediately after cooldown clears
}
if (waitMs > maxCooldownWaitMs) {
break; // Quota-exhausted or very long backoff — don't block
}
}
// Standard exponential backoff for non-cooldown transient failures
await new Promise(resolve => setTimeout(resolve, baseDelayMs * attempt));
}
@ -390,10 +428,9 @@ export async function createAgentSession(options: CreateAgentSessionOptions = {}
// the retry handler and creating cascading error entries (#3429).
const hasAuth = modelRegistry.authStorage.hasAuth(resolvedProvider);
if (hasAuth) {
throw new Error(
`All credentials for "${resolvedProvider}" are in a cooldown window. ` +
`Please wait a moment and try again, or switch to a different provider.`,
);
const expiry = modelRegistry.authStorage.getEarliestBackoffExpiry(resolvedProvider);
const retryAfterMs = expiry !== undefined ? Math.max(0, expiry - Date.now()) : undefined;
throw new CredentialCooldownError(resolvedProvider, retryAfterMs);
}
const model = agent.state.model;
const isOAuth = model && modelRegistry.isUsingOAuth(model);
@ -401,10 +438,9 @@ export async function createAgentSession(options: CreateAgentSessionOptions = {}
// If credentials exist but are all in a backoff window (quota / rate-limit),
// surface a specific message instead of the misleading "Authentication failed".
if (modelRegistry.authStorage.areAllCredentialsBackedOff(resolvedProvider)) {
throw new Error(
`All credentials for "${resolvedProvider}" are in a cooldown window. ` +
`Please wait a moment and try again, or switch to a different provider.`,
);
const expiry = modelRegistry.authStorage.getEarliestBackoffExpiry(resolvedProvider);
const retryAfterMs = expiry !== undefined ? Math.max(0, expiry - Date.now()) : undefined;
throw new CredentialCooldownError(resolvedProvider, retryAfterMs);
}
throw new Error(
`Authentication failed for "${resolvedProvider}". ` +

View file

@ -176,6 +176,7 @@ export { DefaultResourceLoader } from "./core/resource-loader.js";
export {
type CreateAgentSessionOptions,
type CreateAgentSessionResult,
CredentialCooldownError,
// Factory
createAgentSession,
createBashTool,

View file

@ -46,3 +46,41 @@ export function isInfrastructureError(err: unknown): string | null {
if (msg.includes("database disk image is malformed")) return "SQLITE_CORRUPT";
return null;
}
/**
* Default wait duration (milliseconds) when a cooldown error is detected but
* no usable expiry hint accompanies it — for example, when the error
* propagated across a process boundary without structured backoff data.
*/
export const COOLDOWN_FALLBACK_WAIT_MS = 35_000; // 35s — slightly longer than the 30s rate-limit backoff
/**
* Maximum consecutive cooldown retries before the auto-loop gives up.
* The auto-loop stops once its consecutive-cooldown counter exceeds this value.
*/
export const MAX_COOLDOWN_RETRIES = 5;
/**
 * Detect whether an error is a transient credential cooldown that should
 * be waited out rather than counted as a consecutive failure.
 *
 * Detection order:
 * 1. Structured: any object whose `code` is exactly `"AUTH_COOLDOWN"`
 *    (the shape of `CredentialCooldownError` thrown by sdk.ts).
 * 2. `Error` instances: the `message` is matched against the cooldown phrase.
 * 3. Plain objects with a string `message` (e.g. an error that crossed a
 *    process boundary and was rebuilt without its class or `code`).
 * 4. Anything else: its `String(...)` form is matched.
 *
 * @param err - The caught value; may be of any type.
 * @returns `true` when the value looks like a credential cooldown error.
 */
export function isTransientCooldownError(err: unknown): boolean {
  const cooldownPhrase = /in a cooldown window/i;

  if (typeof err === "object" && err !== null) {
    const { code, message } = err as { code?: unknown; message?: unknown };
    if (code === "AUTH_COOLDOWN") {
      return true;
    }
    if (err instanceof Error) {
      return cooldownPhrase.test(err.message);
    }
    // String(plainObject) is "[object Object]", so a deserialized error
    // would never match below; inspect its message field directly.
    if (typeof message === "string" && cooldownPhrase.test(message)) {
      return true;
    }
  }

  // Fallback: strings and other values thrown without an Error wrapper.
  return cooldownPhrase.test(String(err));
}
/**
 * Extract the `retryAfterMs` hint from a structured cooldown error.
 *
 * Only objects whose `code` is exactly `"AUTH_COOLDOWN"` are considered.
 * The hint is returned only when it is a finite number; a string, `null`,
 * `NaN` or infinite value (possible after the error has been serialized and
 * rebuilt) is treated as "no hint" so callers never do arithmetic on a
 * non-numeric value.
 *
 * @param err - The caught value; may be of any type.
 * @returns The finite `retryAfterMs` value, or `undefined` for unstructured
 *   errors and for missing or invalid hints.
 */
export function getCooldownRetryAfterMs(err: unknown): number | undefined {
  if (typeof err !== "object" || err === null) {
    return undefined;
  }
  const { code, retryAfterMs } = err as { code?: unknown; retryAfterMs?: unknown };
  if (code !== "AUTH_COOLDOWN") {
    return undefined;
  }
  return typeof retryAfterMs === "number" && Number.isFinite(retryAfterMs)
    ? retryAfterMs
    : undefined;
}

View file

@ -27,7 +27,7 @@ import {
runFinalize,
} from "./phases.js";
import { debugLog } from "../debug-logger.js";
import { isInfrastructureError } from "./infra-errors.js";
import { isInfrastructureError, isTransientCooldownError, getCooldownRetryAfterMs, COOLDOWN_FALLBACK_WAIT_MS, MAX_COOLDOWN_RETRIES } from "./infra-errors.js";
import { resolveEngine } from "../engine-resolver.js";
/**
@ -48,6 +48,7 @@ export async function autoLoop(
let iteration = 0;
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 };
let consecutiveErrors = 0;
let consecutiveCooldowns = 0;
const recentErrorMessages: string[] = [];
while (s.active) {
@ -203,6 +204,7 @@ export async function autoLoop(
deps.clearUnitTimeout();
consecutiveErrors = 0;
consecutiveCooldowns = 0;
recentErrorMessages.length = 0;
deps.emitJournalEvent({ ts: new Date().toISOString(), flowId, seq: nextSeq(), eventType: "iteration-end", data: { iteration } });
debugLog("autoLoop", { phase: "iteration-complete", iteration });
@ -265,6 +267,7 @@ export async function autoLoop(
if (finalizeResult.action === "continue") continue;
consecutiveErrors = 0; // Iteration completed successfully
consecutiveCooldowns = 0;
recentErrorMessages.length = 0;
deps.emitJournalEvent({ ts: new Date().toISOString(), flowId, seq: nextSeq(), eventType: "iteration-end", data: { iteration } });
debugLog("autoLoop", { phase: "iteration-complete", iteration });
@ -300,6 +303,47 @@ export async function autoLoop(
break;
}
// ── Credential cooldown: wait and retry with bounded budget ──
// A 429 triggers a 30s credential backoff in AuthStorage. If the SDK's
// getApiKey() retries couldn't outlast the window, the error surfaces
// here. Wait for the cooldown to clear rather than counting it as a
// consecutive failure — but cap retries so we don't spin for hours
// on persistent quota exhaustion.
if (isTransientCooldownError(loopErr)) {
consecutiveCooldowns++;
const retryAfterMs = getCooldownRetryAfterMs(loopErr);
debugLog("autoLoop", {
phase: "cooldown-wait",
iteration,
consecutiveCooldowns,
retryAfterMs,
error: msg,
});
if (consecutiveCooldowns > MAX_COOLDOWN_RETRIES) {
ctx.ui.notify(
`Auto-mode stopped: ${consecutiveCooldowns} consecutive credential cooldowns — rate limit or quota may be persistently exhausted.`,
"error",
);
await deps.stopAuto(
ctx,
pi,
`${consecutiveCooldowns} consecutive credential cooldowns exceeded retry budget`,
);
break;
}
const waitMs = (retryAfterMs !== undefined && retryAfterMs > 0 && retryAfterMs <= 60_000)
? retryAfterMs + 500 // Use structured hint + small buffer
: COOLDOWN_FALLBACK_WAIT_MS;
ctx.ui.notify(
`Credentials in cooldown (${consecutiveCooldowns}/${MAX_COOLDOWN_RETRIES}) — waiting ${Math.round(waitMs / 1000)}s before retrying.`,
"warning",
);
await new Promise(resolve => setTimeout(resolve, waitMs));
continue; // Retry iteration without incrementing consecutiveErrors
}
consecutiveErrors++;
recentErrorMessages.push(msg.length > 120 ? msg.slice(0, 120) + "..." : msg);
debugLog("autoLoop", {

View file

@ -0,0 +1,180 @@
// gsd / infra-errors cooldown detection tests
// Copyright (c) 2026 Jeremy McSpadden <jeremy@fluxlabs.net>
import test, { describe } from "node:test";
import assert from "node:assert/strict";
import {
isTransientCooldownError,
getCooldownRetryAfterMs,
MAX_COOLDOWN_RETRIES,
COOLDOWN_FALLBACK_WAIT_MS,
} from "../auto/infra-errors.js";
// ─── Constants ────────────────────────────────────────────────────────────────
// Suite for the exported cooldown constants. The first two tests check
// general invariants; the last two pin the exact current values, so any
// deliberate change to the constants must update them as well.
describe("infra-errors cooldown constants", () => {
test("COOLDOWN_FALLBACK_WAIT_MS is a positive number greater than the 30s rate-limit backoff", () => {
assert.ok(typeof COOLDOWN_FALLBACK_WAIT_MS === "number");
assert.ok(COOLDOWN_FALLBACK_WAIT_MS > 30_000, "should exceed the 30s rate-limit window");
});
test("MAX_COOLDOWN_RETRIES is a positive integer", () => {
assert.ok(typeof MAX_COOLDOWN_RETRIES === "number");
assert.ok(Number.isInteger(MAX_COOLDOWN_RETRIES));
assert.ok(MAX_COOLDOWN_RETRIES > 0);
});
test("COOLDOWN_FALLBACK_WAIT_MS is 35_000", () => {
assert.equal(COOLDOWN_FALLBACK_WAIT_MS, 35_000);
});
test("MAX_COOLDOWN_RETRIES is 5", () => {
assert.equal(MAX_COOLDOWN_RETRIES, 5);
});
});
// ─── isTransientCooldownError: structured detection ──────────────────────────
// Suite for the structured path of isTransientCooldownError: detection is
// driven by the `code` field, for both plain objects and Error instances.
describe("isTransientCooldownError — structured code detection", () => {
test("returns true for an object with code === AUTH_COOLDOWN", () => {
const err = { code: "AUTH_COOLDOWN", message: "credentials in cooldown" };
assert.equal(isTransientCooldownError(err), true);
});
test("returns true for a real CredentialCooldownError-shaped error", () => {
// Simulate CredentialCooldownError without importing sdk.ts (leaf-module rule)
const err = Object.assign(new Error('All credentials for "anthropic" are in a cooldown window.'), {
code: "AUTH_COOLDOWN",
retryAfterMs: 30_000,
name: "CredentialCooldownError",
});
assert.equal(isTransientCooldownError(err), true);
});
test("returns false for an object with a different code", () => {
const err = { code: "ENOSPC", message: "disk full" };
assert.equal(isTransientCooldownError(err), false);
});
test("returns false for an object with no code property", () => {
const err = { message: "some random error" };
assert.equal(isTransientCooldownError(err), false);
});
});
// ─── isTransientCooldownError: message fallback ───────────────────────────────
// Suite for the message-matching fallback of isTransientCooldownError:
// inputs carry no `code`, so detection depends on the phrase
// "in a cooldown window" appearing (case-insensitively) in the text.
describe("isTransientCooldownError — message fallback (cross-process)", () => {
test("returns true when message contains 'in a cooldown window'", () => {
const err = new Error('All credentials for "openai" are in a cooldown window. Please wait.');
assert.equal(isTransientCooldownError(err), true);
});
test("returns true when message matches case-insensitively", () => {
const err = new Error("credentials IN A COOLDOWN WINDOW");
assert.equal(isTransientCooldownError(err), true);
});
test("returns true for a plain string containing cooldown window phrase", () => {
assert.equal(isTransientCooldownError("all keys in a cooldown window"), true);
});
test("returns false for a generic error message", () => {
const err = new Error("rate limit exceeded");
assert.equal(isTransientCooldownError(err), false);
});
test("returns false for an error message about auth failure without cooldown phrase", () => {
const err = new Error("Authentication failed: invalid API key");
assert.equal(isTransientCooldownError(err), false);
});
});
// ─── isTransientCooldownError: edge cases ────────────────────────────────────
// Suite for non-error inputs to isTransientCooldownError: nullish values,
// primitives, and objects without a matching `code` must all yield false.
describe("isTransientCooldownError — edge cases", () => {
test("returns false for null", () => {
assert.equal(isTransientCooldownError(null), false);
});
test("returns false for undefined", () => {
assert.equal(isTransientCooldownError(undefined), false);
});
test("returns false for a number", () => {
assert.equal(isTransientCooldownError(42), false);
});
test("returns false for an empty object", () => {
assert.equal(isTransientCooldownError({}), false);
});
// NOTE(review): the title is slightly misleading — the fixture's code is
// the number 42, not a non-string form of "AUTH_COOLDOWN"; it verifies that
// a non-string code does not match.
test("returns false for an object with code === AUTH_COOLDOWN as a non-string", () => {
// code must be a string matching "AUTH_COOLDOWN" exactly
const err = { code: 42 };
assert.equal(isTransientCooldownError(err), false);
});
});
// ─── getCooldownRetryAfterMs: structured extraction ──────────────────────────
// Suite for getCooldownRetryAfterMs on structured inputs: the hint is
// returned only when `code` is "AUTH_COOLDOWN", and an explicit 0 is
// preserved rather than treated as missing.
describe("getCooldownRetryAfterMs — structured extraction", () => {
test("returns retryAfterMs when code is AUTH_COOLDOWN and retryAfterMs is set", () => {
const err = { code: "AUTH_COOLDOWN", retryAfterMs: 30_000 };
assert.equal(getCooldownRetryAfterMs(err), 30_000);
});
test("returns undefined when code is AUTH_COOLDOWN but retryAfterMs is absent", () => {
const err = { code: "AUTH_COOLDOWN" };
assert.equal(getCooldownRetryAfterMs(err), undefined);
});
test("returns 0 when retryAfterMs is explicitly 0", () => {
const err = { code: "AUTH_COOLDOWN", retryAfterMs: 0 };
assert.equal(getCooldownRetryAfterMs(err), 0);
});
test("returns undefined for an error with a different code even if retryAfterMs is set", () => {
const err = { code: "ENOSPC", retryAfterMs: 5_000 };
assert.equal(getCooldownRetryAfterMs(err), undefined);
});
test("returns undefined for a plain Error with no code property", () => {
const err = new Error("something went wrong");
assert.equal(getCooldownRetryAfterMs(err), undefined);
});
test("returns retryAfterMs from a full CredentialCooldownError-shaped object", () => {
// Built with Object.assign on a plain Error, so sdk.ts is not imported here.
const err = Object.assign(new Error('All credentials for "anthropic" are in a cooldown window.'), {
code: "AUTH_COOLDOWN",
retryAfterMs: 15_000,
name: "CredentialCooldownError",
});
assert.equal(getCooldownRetryAfterMs(err), 15_000);
});
});
// ─── getCooldownRetryAfterMs: edge cases ─────────────────────────────────────
// Suite for non-object and empty inputs to getCooldownRetryAfterMs: each
// must yield undefined, including the bare string "AUTH_COOLDOWN".
describe("getCooldownRetryAfterMs — edge cases", () => {
test("returns undefined for null", () => {
assert.equal(getCooldownRetryAfterMs(null), undefined);
});
test("returns undefined for undefined", () => {
assert.equal(getCooldownRetryAfterMs(undefined), undefined);
});
test("returns undefined for a plain string", () => {
assert.equal(getCooldownRetryAfterMs("AUTH_COOLDOWN"), undefined);
});
test("returns undefined for an empty object", () => {
assert.equal(getCooldownRetryAfterMs({}), undefined);
});
test("returns undefined for a number", () => {
assert.equal(getCooldownRetryAfterMs(42), undefined);
});
});