fix: classify long-context entitlement 429 as quota_exhausted, not rate_limit (#2803) (#3257)

The "Extra usage is required for long context requests" error from
Anthropic is a billing gate, not a transient rate limit. Classify it as
quota_exhausted so the handler enters the fallback path instead of an
infinite backoff loop. When no cross-provider fallback exists, attempt a
`[1m]`-to-base-model downgrade before stopping cleanly.

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Tom Boucher 2026-03-30 15:50:36 -04:00 committed by GitHub
parent 50fbd0a837
commit a725fa2d9d
2 changed files with 307 additions and 1 deletions

View file

@ -0,0 +1,255 @@
/**
* RetryHandler tests long-context entitlement 429 error handling (#2803)
*
* Verifies that "Extra usage is required for long context requests" errors
* are classified as quota_exhausted (not rate_limit) and trigger a model
* downgrade from [1m] to base when no cross-provider fallback exists.
*/
import { describe, it, beforeEach, mock, type Mock } from "node:test";
import assert from "node:assert/strict";
import { RetryHandler, type RetryHandlerDeps } from "./retry-handler.js";
import type { Api, AssistantMessage, Model } from "@gsd/pi-ai";
import type { FallbackResolver } from "./fallback-resolver.js";
import type { ModelRegistry } from "./model-registry.js";
import type { SettingsManager } from "./settings-manager.js";
// ─── Helpers ────────────────────────────────────────────────────────────────
/**
 * Build a minimal Anthropic model stub for retry tests: zeroed costs,
 * 1M-token context window, text-only input.
 */
function createMockModel(provider: string, id: string): Model<Api> {
  const zeroCost = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 };
  const stub = {
    provider,
    id,
    name: id,
    api: "anthropic" as Api,
    baseUrl: "https://api.anthropic.com",
    reasoning: false,
    input: ["text"],
    cost: zeroCost,
    contextWindow: 1_000_000,
    maxTokens: 16384,
  };
  // Cast: tests only need the fields the handler reads, not the full Model shape.
  return stub as Model<Api>;
}
/**
 * Build an assistant turn that ended with stopReason "error", carrying `msg`
 * as the provider error text — the input shape RetryHandler classifies.
 */
function errorMessage(msg: string): AssistantMessage {
  const zero = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 };
  const turn = {
    role: "assistant",
    content: [],
    api: "anthropic-messages",
    provider: "anthropic",
    model: "claude-opus-4-6[1m]",
    usage: { ...zero, totalTokens: 0, cost: { ...zero, total: 0 } },
    stopReason: "error",
    errorMessage: msg,
    timestamp: Date.now(),
  };
  // Cast: only the fields the retry path inspects are populated.
  return turn as AssistantMessage;
}
/**
 * Bundle returned by createMockDeps: the wired dependency object plus the
 * individual spies so tests can assert on calls and emitted events.
 */
interface MockDeps {
  // Fully-wired deps to construct a RetryHandler with.
  deps: RetryHandlerDeps;
  // Every event passed to deps.emit, in order.
  emittedEvents: Array<Record<string, any>>;
  // Spy on agent.continue (the retry/continue entry point).
  continueFn: Mock<() => Promise<void>>;
  // Spy on the model-change notification callback.
  onModelChangeFn: Mock<(model: Model<any>) => void>;
  // Spy on authStorage.markUsageLimitReached; returns whether alternate credentials exist.
  markUsageLimitReached: Mock<(...args: any[]) => boolean>;
  // Spy on fallbackResolver.findFallback (cross-provider fallback lookup).
  findFallback: Mock<(...args: any[]) => Promise<any>>;
  // Spy on modelRegistry.find (used by the [1m] → base downgrade lookup).
  findModel: Mock<(provider: string, modelId: string) => Model<Api> | undefined>;
}
/**
 * Build a fully mocked RetryHandlerDeps graph plus spies for assertions.
 *
 * Overrides (all optional):
 * - model: what getModel() reports (default: anthropic claude-opus-4-6[1m]).
 * - retryEnabled: auto-retry settings toggle (default true).
 * - markUsageLimitReachedResult: whether alternate credentials exist (default false).
 * - fallbackResult: cross-provider fallback lookup result (default null = none).
 * - findModelResult: model-registry lookup, used by the [1m] → base downgrade.
 */
function createMockDeps(overrides?: {
  model?: Model<Api>;
  retryEnabled?: boolean;
  markUsageLimitReachedResult?: boolean;
  fallbackResult?: any;
  findModelResult?: (provider: string, modelId: string) => Model<Api> | undefined;
}): MockDeps {
  const model = overrides?.model ?? createMockModel("anthropic", "claude-opus-4-6[1m]");
  const emittedEvents: Array<Record<string, any>> = [];
  const continueFn = mock.fn(async () => {});
  const onModelChangeFn = mock.fn((_model: Model<any>) => {});
  const markUsageLimitReached = mock.fn(
    () => overrides?.markUsageLimitReachedResult ?? false,
  );
  const findFallback = mock.fn(async () => overrides?.fallbackResult ?? null);
  const findModel = mock.fn(
    overrides?.findModelResult ?? ((_provider: string, _modelId: string) => undefined),
  );
  // Shared mutable list so replaceMessages is observable through agent.state.
  const messages: Array<{ role: string } & Record<string, any>> = [];
  const deps: RetryHandlerDeps = {
    agent: {
      continue: continueFn,
      state: { messages },
      setModel: mock.fn(),
      replaceMessages: mock.fn((newMessages: any[]) => {
        // Mutate in place so existing references see the replacement.
        messages.length = 0;
        messages.push(...newMessages);
      }),
    } as any,
    settingsManager: {
      getRetryEnabled: () => overrides?.retryEnabled ?? true,
      getRetrySettings: () => ({
        enabled: overrides?.retryEnabled ?? true,
        maxRetries: 5,
        baseDelayMs: 1000,
        maxDelayMs: 30000,
      }),
    } as unknown as SettingsManager,
    modelRegistry: {
      authStorage: {
        markUsageLimitReached,
      },
      find: findModel,
    } as unknown as ModelRegistry,
    fallbackResolver: {
      findFallback,
    } as unknown as FallbackResolver,
    getModel: () => model,
    getSessionId: () => "test-session",
    emit: (event: any) => emittedEvents.push(event),
    onModelChange: onModelChangeFn,
  };
  return { deps, emittedEvents, continueFn, onModelChangeFn, markUsageLimitReached, findFallback, findModel };
}
// ─── _classifyErrorType (tested via handleRetryableError behavior) ──────────
describe("RetryHandler — long-context entitlement 429 (#2803)", () => {
  describe("error classification", () => {
    it("classifies 'Extra usage is required for long context requests' as quota_exhausted, not rate_limit", async () => {
      // When the error is classified as quota_exhausted AND no alternate credentials
      // AND no fallback, the handler should emit fallback_chain_exhausted and stop.
      // If misclassified as rate_limit, it would enter the backoff loop instead.
      const { deps, emittedEvents, findModel } = createMockDeps({
        model: createMockModel("anthropic", "claude-opus-4-6[1m]"),
        markUsageLimitReachedResult: false, // no alternate credentials
        fallbackResult: null, // no cross-provider fallback
        findModelResult: () => undefined, // no base model either
      });
      const handler = new RetryHandler(deps);
      // Real-world payload shape: Anthropic labels this rate_limit_error even
      // though it is a billing gate — the message text is what must win.
      const msg = errorMessage(
        '429 {"type":"error","error":{"type":"rate_limit_error","message":"Extra usage is required for long context requests."}}'
      );
      const result = await handler.handleRetryableError(msg);
      // Should NOT retry (would be true if misclassified as rate_limit entering backoff)
      assert.equal(result, false);
      // Should emit fallback_chain_exhausted (quota_exhausted path), NOT auto_retry_start (backoff path)
      const chainExhausted = emittedEvents.find((e) => e.type === "fallback_chain_exhausted");
      assert.ok(chainExhausted, "Expected fallback_chain_exhausted event for entitlement error");
      const retryStart = emittedEvents.find((e) => e.type === "auto_retry_start");
      assert.equal(retryStart, undefined, "Should NOT emit auto_retry_start for entitlement error");
    });
    it("still classifies regular 429 rate limits as rate_limit", async () => {
      // A normal "rate limit" 429 should still be classified as rate_limit
      const { deps, emittedEvents } = createMockDeps({
        model: createMockModel("anthropic", "claude-opus-4-6"),
        markUsageLimitReachedResult: false,
        fallbackResult: null,
      });
      const handler = new RetryHandler(deps);
      const msg = errorMessage("429 Too Many Requests");
      const result = await handler.handleRetryableError(msg);
      // Should enter the backoff loop (rate_limit path, not quota_exhausted)
      assert.equal(result, true);
      const retryStart = emittedEvents.find((e) => e.type === "auto_retry_start");
      assert.ok(retryStart, "Regular 429 should enter backoff retry");
    });
  });
  describe("long-context model downgrade", () => {
    it("downgrades from [1m] to base model when entitlement error and no fallback", async () => {
      const baseModel = createMockModel("anthropic", "claude-opus-4-6");
      const { deps, emittedEvents, onModelChangeFn, continueFn } = createMockDeps({
        model: createMockModel("anthropic", "claude-opus-4-6[1m]"),
        markUsageLimitReachedResult: false,
        fallbackResult: null,
        // Registry only knows the base model — exactly the downgrade scenario.
        findModelResult: (provider: string, modelId: string) => {
          if (provider === "anthropic" && modelId === "claude-opus-4-6") return baseModel;
          return undefined;
        },
      });
      const handler = new RetryHandler(deps);
      const msg = errorMessage("Extra usage is required for long context requests.");
      const result = await handler.handleRetryableError(msg);
      assert.equal(result, true, "Should retry after downgrade");
      // Should have called setModel with the base model
      const setModelCalls = (deps.agent.setModel as any).mock.calls;
      assert.equal(setModelCalls.length, 1);
      assert.equal(setModelCalls[0].arguments[0].id, "claude-opus-4-6");
      // Should have notified about model change
      assert.equal(onModelChangeFn.mock.calls.length, 1);
      // Should emit a fallback_provider_switch event indicating downgrade
      const switchEvent = emittedEvents.find((e) => e.type === "fallback_provider_switch");
      assert.ok(switchEvent, "Expected fallback_provider_switch event for downgrade");
      assert.ok(switchEvent!.reason.includes("long context downgrade"), `reason should mention downgrade: ${switchEvent!.reason}`);
    });
    it("emits fallback_chain_exhausted when base model is also unavailable", async () => {
      const { deps, emittedEvents } = createMockDeps({
        model: createMockModel("anthropic", "claude-opus-4-6[1m]"),
        markUsageLimitReachedResult: false,
        fallbackResult: null,
        findModelResult: () => undefined, // base model not found
      });
      const handler = new RetryHandler(deps);
      const msg = errorMessage("Extra usage is required for long context requests.");
      const result = await handler.handleRetryableError(msg);
      assert.equal(result, false);
      const chainExhausted = emittedEvents.find((e) => e.type === "fallback_chain_exhausted");
      assert.ok(chainExhausted, "Expected fallback_chain_exhausted when base model unavailable");
    });
    it("does not attempt downgrade for non-[1m] models", async () => {
      // When a regular model (no [1m] suffix) gets a quota_exhausted error
      // with no fallback, it should just stop — no downgrade attempt.
      const { deps, emittedEvents } = createMockDeps({
        model: createMockModel("anthropic", "claude-opus-4-6"),
        markUsageLimitReachedResult: false,
        fallbackResult: null,
      });
      const handler = new RetryHandler(deps);
      const msg = errorMessage("Extra usage is required for long context requests.");
      const result = await handler.handleRetryableError(msg);
      assert.equal(result, false);
      const chainExhausted = emittedEvents.find((e) => e.type === "fallback_chain_exhausted");
      assert.ok(chainExhausted);
      // No downgrade switch should occur
      const switchEvent = emittedEvents.find((e) => e.type === "fallback_provider_switch");
      assert.equal(switchEvent, undefined, "Should not switch for non-[1m] models");
    });
  });
  describe("isRetryableError", () => {
    it("considers long-context entitlement error as retryable", () => {
      const { deps } = createMockDeps();
      const handler = new RetryHandler(deps);
      const msg = errorMessage("Extra usage is required for long context requests.");
      assert.equal(handler.isRetryableError(msg), true);
    });
  });
});

View file

@ -107,7 +107,7 @@ export class RetryHandler {
if (isContextOverflow(message, contextWindow)) return false;
const err = message.errorMessage;
return /overloaded|rate.?limit|too many requests|429|500|502|503|504|service.?unavailable|server.?error|internal.?error|connection.?error|connection.?refused|other side closed|fetch failed|upstream.?connect|reset before headers|terminated|retry delay|network.?(?:is\s+)?unavailable|credentials.*expired|temporarily backed off/i.test(
return /overloaded|rate.?limit|too many requests|429|500|502|503|504|service.?unavailable|server.?error|internal.?error|connection.?error|connection.?refused|other side closed|fetch failed|upstream.?connect|reset before headers|terminated|retry delay|network.?(?:is\s+)?unavailable|credentials.*expired|temporarily backed off|extra usage is required/i.test(
err,
);
}
@ -202,6 +202,10 @@ export class RetryHandler {
// No fallback available either
if (errorType === "quota_exhausted") {
// Try long-context model downgrade ([1m] → base) before giving up
const downgraded = this._tryLongContextDowngrade(message);
if (downgraded) return true;
this._deps.emit({
type: "fallback_chain_exhausted",
reason: `All providers exhausted for ${this._deps.getModel()!.provider}/${this._deps.getModel()!.id}`,
@ -343,12 +347,59 @@ export class RetryHandler {
*/
/**
 * Classify a provider error message into a UsageLimitErrorType bucket.
 * First matching rule wins, so ordering matters: long-context entitlement
 * errors are billing gates, not transient rate limits, and must be checked
 * before the generic 429/rate_limit patterns.
 */
private _classifyErrorType(errorMessage: string): UsageLimitErrorType {
  const err = errorMessage.toLowerCase();
  const rules: Array<[RegExp, UsageLimitErrorType]> = [
    [/extra usage is required|long context required/i, "quota_exhausted"],
    [/quota|billing|exceeded.*limit|usage.*limit/i, "quota_exhausted"],
    [/rate.?limit|too many requests|429/i, "rate_limit"],
    [/500|502|503|504|server.?error|internal.?error|service.?unavailable/i, "server_error"],
  ];
  for (const [pattern, type] of rules) {
    if (pattern.test(err)) return type;
  }
  return "unknown";
}
/**
 * Attempt to downgrade a long-context model (e.g. claude-opus-4-6[1m]) to its
 * base model (claude-opus-4-6) when the account lacks the long-context billing
 * entitlement.
 *
 * On success: swaps the agent's model, notifies listeners, strips the failed
 * assistant error turn, emits fallback_provider_switch + auto_retry_start,
 * and schedules an immediate continue.
 *
 * @param message the assistant error turn that triggered the downgrade
 * @returns true if the downgrade was initiated, false if not applicable
 *          (no current model, no [Nm] suffix, or base model not registered).
 */
private _tryLongContextDowngrade(message: AssistantMessage): boolean {
  const currentModel = this._deps.getModel();
  if (!currentModel) return false;
  // Only attempt downgrade for [1m] (or similar long-context) model IDs.
  const match = currentModel.id.match(/^(.+)\[\d+m\]$/);
  if (!match) return false;
  const baseModelId = match[1];
  const baseModel = this._deps.modelRegistry.find(currentModel.provider, baseModelId);
  if (!baseModel) return false;
  const previousId = currentModel.id;
  this._deps.agent.setModel(baseModel);
  this._deps.onModelChange(baseModel);
  // Drop the failed assistant error turn so the retried request starts clean.
  this._removeLastAssistantError();
  this._deps.emit({
    type: "fallback_provider_switch",
    from: `${currentModel.provider}/${previousId}`,
    to: `${baseModel.provider}/${baseModel.id}`,
    // Fix: separate the two IDs — previously they were concatenated with no
    // delimiter ("...[1m]claude-opus-4-6"), making the reason unreadable.
    reason: `long context downgrade: ${previousId} → ${baseModel.id}`,
  });
  this._deps.emit({
    type: "auto_retry_start",
    attempt: this._retryAttempt + 1,
    maxAttempts: this._deps.settingsManager.getRetrySettings().maxRetries,
    delayMs: 0,
    errorMessage: `${message.errorMessage} (long context downgrade)`,
  });
  // Defer so the caller unwinds before the retry starts; rejections are
  // swallowed here — failures surface via the agent's error path next turn.
  setTimeout(() => {
    this._deps.agent.continue().catch(() => {});
  }, 0);
  return true;
}
/** Remove the last assistant error message from agent state */
private _removeLastAssistantError(): void {
const messages = this._deps.agent.state.messages;