The "Extra usage is required for long context requests" error from Anthropic is a billing gate, not a transient rate limit. Classify it as quota_exhausted so the handler enters the fallback path instead of an infinite backoff loop. When no cross-provider fallback exists, attempt a [1m] to base model downgrade before stopping cleanly. Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
50fbd0a837
commit
a725fa2d9d
2 changed files with 307 additions and 1 deletions
255
packages/pi-coding-agent/src/core/retry-handler.test.ts
Normal file
255
packages/pi-coding-agent/src/core/retry-handler.test.ts
Normal file
|
|
@ -0,0 +1,255 @@
|
|||
/**
|
||||
* RetryHandler tests — long-context entitlement 429 error handling (#2803)
|
||||
*
|
||||
* Verifies that "Extra usage is required for long context requests" errors
|
||||
* are classified as quota_exhausted (not rate_limit) and trigger a model
|
||||
* downgrade from [1m] to base when no cross-provider fallback exists.
|
||||
*/
|
||||
|
||||
import { describe, it, beforeEach, mock, type Mock } from "node:test";
|
||||
import assert from "node:assert/strict";
|
||||
import { RetryHandler, type RetryHandlerDeps } from "./retry-handler.js";
|
||||
import type { Api, AssistantMessage, Model } from "@gsd/pi-ai";
|
||||
import type { FallbackResolver } from "./fallback-resolver.js";
|
||||
import type { ModelRegistry } from "./model-registry.js";
|
||||
import type { SettingsManager } from "./settings-manager.js";
|
||||
|
||||
// ─── Helpers ────────────────────────────────────────────────────────────────
|
||||
|
||||
function createMockModel(provider: string, id: string): Model<Api> {
|
||||
return {
|
||||
id,
|
||||
name: id,
|
||||
api: "anthropic" as Api,
|
||||
provider,
|
||||
baseUrl: "https://api.anthropic.com",
|
||||
reasoning: false,
|
||||
input: ["text"],
|
||||
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: 1_000_000,
|
||||
maxTokens: 16384,
|
||||
} as Model<Api>;
|
||||
}
|
||||
|
||||
function errorMessage(msg: string): AssistantMessage {
|
||||
return {
|
||||
role: "assistant",
|
||||
content: [],
|
||||
api: "anthropic-messages",
|
||||
provider: "anthropic",
|
||||
model: "claude-opus-4-6[1m]",
|
||||
usage: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, totalTokens: 0, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
|
||||
stopReason: "error",
|
||||
errorMessage: msg,
|
||||
timestamp: Date.now(),
|
||||
} as AssistantMessage;
|
||||
}
|
||||
|
||||
interface MockDeps {
|
||||
deps: RetryHandlerDeps;
|
||||
emittedEvents: Array<Record<string, any>>;
|
||||
continueFn: Mock<() => Promise<void>>;
|
||||
onModelChangeFn: Mock<(model: Model<any>) => void>;
|
||||
markUsageLimitReached: Mock<(...args: any[]) => boolean>;
|
||||
findFallback: Mock<(...args: any[]) => Promise<any>>;
|
||||
findModel: Mock<(provider: string, modelId: string) => Model<Api> | undefined>;
|
||||
}
|
||||
|
||||
function createMockDeps(overrides?: {
|
||||
model?: Model<Api>;
|
||||
retryEnabled?: boolean;
|
||||
markUsageLimitReachedResult?: boolean;
|
||||
fallbackResult?: any;
|
||||
findModelResult?: (provider: string, modelId: string) => Model<Api> | undefined;
|
||||
}): MockDeps {
|
||||
const model = overrides?.model ?? createMockModel("anthropic", "claude-opus-4-6[1m]");
|
||||
const emittedEvents: Array<Record<string, any>> = [];
|
||||
const continueFn = mock.fn(async () => {});
|
||||
const onModelChangeFn = mock.fn((_model: Model<any>) => {});
|
||||
const markUsageLimitReached = mock.fn(
|
||||
() => overrides?.markUsageLimitReachedResult ?? false,
|
||||
);
|
||||
const findFallback = mock.fn(async () => overrides?.fallbackResult ?? null);
|
||||
const findModel = mock.fn(
|
||||
overrides?.findModelResult ?? ((_provider: string, _modelId: string) => undefined),
|
||||
);
|
||||
|
||||
const messages: Array<{ role: string } & Record<string, any>> = [];
|
||||
|
||||
const deps: RetryHandlerDeps = {
|
||||
agent: {
|
||||
continue: continueFn,
|
||||
state: { messages },
|
||||
setModel: mock.fn(),
|
||||
replaceMessages: mock.fn((newMessages: any[]) => {
|
||||
messages.length = 0;
|
||||
messages.push(...newMessages);
|
||||
}),
|
||||
} as any,
|
||||
settingsManager: {
|
||||
getRetryEnabled: () => overrides?.retryEnabled ?? true,
|
||||
getRetrySettings: () => ({
|
||||
enabled: overrides?.retryEnabled ?? true,
|
||||
maxRetries: 5,
|
||||
baseDelayMs: 1000,
|
||||
maxDelayMs: 30000,
|
||||
}),
|
||||
} as unknown as SettingsManager,
|
||||
modelRegistry: {
|
||||
authStorage: {
|
||||
markUsageLimitReached,
|
||||
},
|
||||
find: findModel,
|
||||
} as unknown as ModelRegistry,
|
||||
fallbackResolver: {
|
||||
findFallback,
|
||||
} as unknown as FallbackResolver,
|
||||
getModel: () => model,
|
||||
getSessionId: () => "test-session",
|
||||
emit: (event: any) => emittedEvents.push(event),
|
||||
onModelChange: onModelChangeFn,
|
||||
};
|
||||
|
||||
return { deps, emittedEvents, continueFn, onModelChangeFn, markUsageLimitReached, findFallback, findModel };
|
||||
}
|
||||
|
||||
// ─── _classifyErrorType (tested via handleRetryableError behavior) ──────────
|
||||
|
||||
describe("RetryHandler — long-context entitlement 429 (#2803)", () => {
|
||||
|
||||
describe("error classification", () => {
|
||||
it("classifies 'Extra usage is required for long context requests' as quota_exhausted, not rate_limit", async () => {
|
||||
// When the error is classified as quota_exhausted AND no alternate credentials
|
||||
// AND no fallback, the handler should emit fallback_chain_exhausted and stop.
|
||||
// If misclassified as rate_limit, it would enter the backoff loop instead.
|
||||
const { deps, emittedEvents, findModel } = createMockDeps({
|
||||
model: createMockModel("anthropic", "claude-opus-4-6[1m]"),
|
||||
markUsageLimitReachedResult: false, // no alternate credentials
|
||||
fallbackResult: null, // no cross-provider fallback
|
||||
findModelResult: () => undefined, // no base model either
|
||||
});
|
||||
|
||||
const handler = new RetryHandler(deps);
|
||||
const msg = errorMessage(
|
||||
'429 {"type":"error","error":{"type":"rate_limit_error","message":"Extra usage is required for long context requests."}}'
|
||||
);
|
||||
|
||||
const result = await handler.handleRetryableError(msg);
|
||||
|
||||
// Should NOT retry (would be true if misclassified as rate_limit entering backoff)
|
||||
assert.equal(result, false);
|
||||
|
||||
// Should emit fallback_chain_exhausted (quota_exhausted path), NOT auto_retry_start (backoff path)
|
||||
const chainExhausted = emittedEvents.find((e) => e.type === "fallback_chain_exhausted");
|
||||
assert.ok(chainExhausted, "Expected fallback_chain_exhausted event for entitlement error");
|
||||
|
||||
const retryStart = emittedEvents.find((e) => e.type === "auto_retry_start");
|
||||
assert.equal(retryStart, undefined, "Should NOT emit auto_retry_start for entitlement error");
|
||||
});
|
||||
|
||||
it("still classifies regular 429 rate limits as rate_limit", async () => {
|
||||
// A normal "rate limit" 429 should still be classified as rate_limit
|
||||
const { deps, emittedEvents } = createMockDeps({
|
||||
model: createMockModel("anthropic", "claude-opus-4-6"),
|
||||
markUsageLimitReachedResult: false,
|
||||
fallbackResult: null,
|
||||
});
|
||||
|
||||
const handler = new RetryHandler(deps);
|
||||
const msg = errorMessage("429 Too Many Requests");
|
||||
|
||||
const result = await handler.handleRetryableError(msg);
|
||||
|
||||
// Should enter the backoff loop (rate_limit path, not quota_exhausted)
|
||||
assert.equal(result, true);
|
||||
|
||||
const retryStart = emittedEvents.find((e) => e.type === "auto_retry_start");
|
||||
assert.ok(retryStart, "Regular 429 should enter backoff retry");
|
||||
});
|
||||
});
|
||||
|
||||
describe("long-context model downgrade", () => {
|
||||
it("downgrades from [1m] to base model when entitlement error and no fallback", async () => {
|
||||
const baseModel = createMockModel("anthropic", "claude-opus-4-6");
|
||||
const { deps, emittedEvents, onModelChangeFn, continueFn } = createMockDeps({
|
||||
model: createMockModel("anthropic", "claude-opus-4-6[1m]"),
|
||||
markUsageLimitReachedResult: false,
|
||||
fallbackResult: null,
|
||||
findModelResult: (provider: string, modelId: string) => {
|
||||
if (provider === "anthropic" && modelId === "claude-opus-4-6") return baseModel;
|
||||
return undefined;
|
||||
},
|
||||
});
|
||||
|
||||
const handler = new RetryHandler(deps);
|
||||
const msg = errorMessage("Extra usage is required for long context requests.");
|
||||
|
||||
const result = await handler.handleRetryableError(msg);
|
||||
|
||||
assert.equal(result, true, "Should retry after downgrade");
|
||||
|
||||
// Should have called setModel with the base model
|
||||
const setModelCalls = (deps.agent.setModel as any).mock.calls;
|
||||
assert.equal(setModelCalls.length, 1);
|
||||
assert.equal(setModelCalls[0].arguments[0].id, "claude-opus-4-6");
|
||||
|
||||
// Should have notified about model change
|
||||
assert.equal(onModelChangeFn.mock.calls.length, 1);
|
||||
|
||||
// Should emit a fallback_provider_switch event indicating downgrade
|
||||
const switchEvent = emittedEvents.find((e) => e.type === "fallback_provider_switch");
|
||||
assert.ok(switchEvent, "Expected fallback_provider_switch event for downgrade");
|
||||
assert.ok(switchEvent!.reason.includes("long context downgrade"), `reason should mention downgrade: ${switchEvent!.reason}`);
|
||||
});
|
||||
|
||||
it("emits fallback_chain_exhausted when base model is also unavailable", async () => {
|
||||
const { deps, emittedEvents } = createMockDeps({
|
||||
model: createMockModel("anthropic", "claude-opus-4-6[1m]"),
|
||||
markUsageLimitReachedResult: false,
|
||||
fallbackResult: null,
|
||||
findModelResult: () => undefined, // base model not found
|
||||
});
|
||||
|
||||
const handler = new RetryHandler(deps);
|
||||
const msg = errorMessage("Extra usage is required for long context requests.");
|
||||
|
||||
const result = await handler.handleRetryableError(msg);
|
||||
|
||||
assert.equal(result, false);
|
||||
const chainExhausted = emittedEvents.find((e) => e.type === "fallback_chain_exhausted");
|
||||
assert.ok(chainExhausted, "Expected fallback_chain_exhausted when base model unavailable");
|
||||
});
|
||||
|
||||
it("does not attempt downgrade for non-[1m] models", async () => {
|
||||
// When a regular model (no [1m] suffix) gets a quota_exhausted error
|
||||
// with no fallback, it should just stop — no downgrade attempt.
|
||||
const { deps, emittedEvents } = createMockDeps({
|
||||
model: createMockModel("anthropic", "claude-opus-4-6"),
|
||||
markUsageLimitReachedResult: false,
|
||||
fallbackResult: null,
|
||||
});
|
||||
|
||||
const handler = new RetryHandler(deps);
|
||||
const msg = errorMessage("Extra usage is required for long context requests.");
|
||||
|
||||
const result = await handler.handleRetryableError(msg);
|
||||
|
||||
assert.equal(result, false);
|
||||
const chainExhausted = emittedEvents.find((e) => e.type === "fallback_chain_exhausted");
|
||||
assert.ok(chainExhausted);
|
||||
|
||||
// No downgrade switch should occur
|
||||
const switchEvent = emittedEvents.find((e) => e.type === "fallback_provider_switch");
|
||||
assert.equal(switchEvent, undefined, "Should not switch for non-[1m] models");
|
||||
});
|
||||
});
|
||||
|
||||
describe("isRetryableError", () => {
|
||||
it("considers long-context entitlement error as retryable", () => {
|
||||
const { deps } = createMockDeps();
|
||||
const handler = new RetryHandler(deps);
|
||||
const msg = errorMessage("Extra usage is required for long context requests.");
|
||||
assert.equal(handler.isRetryableError(msg), true);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
@ -107,7 +107,7 @@ export class RetryHandler {
|
|||
if (isContextOverflow(message, contextWindow)) return false;
|
||||
|
||||
const err = message.errorMessage;
|
||||
return /overloaded|rate.?limit|too many requests|429|500|502|503|504|service.?unavailable|server.?error|internal.?error|connection.?error|connection.?refused|other side closed|fetch failed|upstream.?connect|reset before headers|terminated|retry delay|network.?(?:is\s+)?unavailable|credentials.*expired|temporarily backed off/i.test(
|
||||
return /overloaded|rate.?limit|too many requests|429|500|502|503|504|service.?unavailable|server.?error|internal.?error|connection.?error|connection.?refused|other side closed|fetch failed|upstream.?connect|reset before headers|terminated|retry delay|network.?(?:is\s+)?unavailable|credentials.*expired|temporarily backed off|extra usage is required/i.test(
|
||||
err,
|
||||
);
|
||||
}
|
||||
|
|
@ -202,6 +202,10 @@ export class RetryHandler {
|
|||
|
||||
// No fallback available either
|
||||
if (errorType === "quota_exhausted") {
|
||||
// Try long-context model downgrade ([1m] → base) before giving up
|
||||
const downgraded = this._tryLongContextDowngrade(message);
|
||||
if (downgraded) return true;
|
||||
|
||||
this._deps.emit({
|
||||
type: "fallback_chain_exhausted",
|
||||
reason: `All providers exhausted for ${this._deps.getModel()!.provider}/${this._deps.getModel()!.id}`,
|
||||
|
|
@ -343,12 +347,59 @@ export class RetryHandler {
|
|||
*/
|
||||
private _classifyErrorType(errorMessage: string): UsageLimitErrorType {
|
||||
const err = errorMessage.toLowerCase();
|
||||
// Long-context entitlement errors are billing gates, not transient rate limits.
|
||||
// Must be checked before the generic 429/rate_limit regex.
|
||||
if (/extra usage is required|long context required/i.test(err)) return "quota_exhausted";
|
||||
if (/quota|billing|exceeded.*limit|usage.*limit/i.test(err)) return "quota_exhausted";
|
||||
if (/rate.?limit|too many requests|429/i.test(err)) return "rate_limit";
|
||||
if (/500|502|503|504|server.?error|internal.?error|service.?unavailable/i.test(err)) return "server_error";
|
||||
return "unknown";
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempt to downgrade a long-context model (e.g. claude-opus-4-6[1m]) to its
|
||||
* base model (claude-opus-4-6) when the account lacks the long-context billing
|
||||
* entitlement. Returns true if the downgrade was initiated.
|
||||
*/
|
||||
private _tryLongContextDowngrade(message: AssistantMessage): boolean {
|
||||
const currentModel = this._deps.getModel();
|
||||
if (!currentModel) return false;
|
||||
|
||||
// Only attempt downgrade for [1m] (or similar long-context) model IDs
|
||||
const match = currentModel.id.match(/^(.+)\[\d+m\]$/);
|
||||
if (!match) return false;
|
||||
|
||||
const baseModelId = match[1];
|
||||
const baseModel = this._deps.modelRegistry.find(currentModel.provider, baseModelId);
|
||||
if (!baseModel) return false;
|
||||
|
||||
const previousId = currentModel.id;
|
||||
this._deps.agent.setModel(baseModel);
|
||||
this._deps.onModelChange(baseModel);
|
||||
this._removeLastAssistantError();
|
||||
|
||||
this._deps.emit({
|
||||
type: "fallback_provider_switch",
|
||||
from: `${currentModel.provider}/${previousId}`,
|
||||
to: `${baseModel.provider}/${baseModel.id}`,
|
||||
reason: `long context downgrade: ${previousId} → ${baseModel.id}`,
|
||||
});
|
||||
|
||||
this._deps.emit({
|
||||
type: "auto_retry_start",
|
||||
attempt: this._retryAttempt + 1,
|
||||
maxAttempts: this._deps.settingsManager.getRetrySettings().maxRetries,
|
||||
delayMs: 0,
|
||||
errorMessage: `${message.errorMessage} (long context downgrade)`,
|
||||
});
|
||||
|
||||
setTimeout(() => {
|
||||
this._deps.agent.continue().catch(() => {});
|
||||
}, 0);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Remove the last assistant error message from agent state */
|
||||
private _removeLastAssistantError(): void {
|
||||
const messages = this._deps.agent.state.messages;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue