diff --git a/packages/pi-coding-agent/src/core/retry-handler.test.ts b/packages/pi-coding-agent/src/core/retry-handler.test.ts
index 5cd324401..df3c8988d 100644
--- a/packages/pi-coding-agent/src/core/retry-handler.test.ts
+++ b/packages/pi-coding-agent/src/core/retry-handler.test.ts
@@ -171,6 +171,25 @@ describe("RetryHandler — long-context entitlement 429 (#2803)", () => {
 			const retryStart = emittedEvents.find((e) => e.type === "auto_retry_start");
 			assert.ok(retryStart, "Regular 429 should enter backoff retry");
 		});
+
+		it("classifies OpenRouter credit affordability errors as quota_exhausted", async () => {
+			const { deps, emittedEvents } = createMockDeps({
+				model: createMockModel("openrouter", "openai/gpt-5-pro"),
+				markUsageLimitReachedResult: false,
+				fallbackResult: null,
+			});
+
+			const handler = new RetryHandler(deps);
+			const msg = errorMessage(
+				"402 This request requires more credits, or fewer max_tokens. You requested up to 32000 tokens, but can only afford 329.",
+			);
+
+			const result = await handler.handleRetryableError(msg);
+
+			assert.equal(result, true, "affordability error should trigger credit-aware retry");
+			const retryStart = emittedEvents.find((e) => e.type === "auto_retry_start");
+			assert.ok(retryStart, "Expected immediate retry after reducing max tokens"); // NOTE(review): the regular-429 test above asserts this same event, so this alone does not pin the quota_exhausted classification — confirm
+		});
 	});
 
 	describe("long-context model downgrade", () => {
@@ -271,6 +290,61 @@ describe("RetryHandler — long-context entitlement 429 (#2803)", () => {
 		});
 	});
 
+	describe("credit-aware maxTokens retry", () => { // same-model retry with a lowered maxTokens after the provider reports an affordability cap
+		it("reduces maxTokens on same model when provider reports affordable cap", async () => {
+			const expensiveModel = createMockModel("openrouter", "openai/gpt-5-pro");
+			expensiveModel.maxTokens = 128000; // far above the 329-token cap in the error below, so a reduction applies
+
+			const { deps, emittedEvents, onModelChangeFn } = createMockDeps({
+				model: expensiveModel,
+				markUsageLimitReachedResult: false,
+				fallbackResult: null,
+			});
+
+			const handler = new RetryHandler(deps);
+			const msg = errorMessage(
+				"402 This request requires more credits, or fewer max_tokens. You requested up to 32000 tokens, but can only afford 329.",
+			);
+
+			const result = await handler.handleRetryableError(msg);
+			assert.equal(result, true, "should retry after reducing maxTokens");
+
+			const setModelCalls = (deps.agent.setModel as any).mock.calls; // setModel is read as a mock; the cast bypasses its declared type
+			assert.equal(setModelCalls.length, 1, "should apply one model downgrade");
+			const downgraded = setModelCalls[0].arguments[0] as Model;
+			assert.equal(downgraded.provider, "openrouter");
+			assert.equal(downgraded.id, "openai/gpt-5-pro");
+			assert.equal(downgraded.maxTokens, 297, "expected affordability cap with safety buffer"); // 329 - 32, where 32 = floor(329 * 0.1) clamped to [16, 64]
+
+			assert.equal(onModelChangeFn.mock.calls.length, 1, "should notify about model update");
+			const switchEvent = emittedEvents.find((e) => e.type === "fallback_provider_switch");
+			assert.ok(switchEvent, "should emit model-adjustment event");
+			assert.ok(
+				String(switchEvent?.reason || "").includes("credit-aware retry"),
+				"switch reason should mention credit-aware retry",
+			);
+		});
+
+		it("does not mark credentials in cooldown for affordability quota errors", async () => {
+			const expensiveModel = createMockModel("openrouter", "openai/gpt-5-pro");
+			expensiveModel.maxTokens = 128000;
+
+			const { deps, markUsageLimitReached } = createMockDeps({
+				model: expensiveModel,
+				markUsageLimitReachedResult: false,
+				fallbackResult: null,
+			});
+
+			const handler = new RetryHandler(deps);
+			const msg = errorMessage(
+				"402 This request requires more credits, or fewer max_tokens. You requested up to 32000 tokens, but can only afford 329.",
+			);
+
+			await handler.handleRetryableError(msg);
+			assert.equal(markUsageLimitReached.mock.calls.length, 0, "quota error should skip credential cooldown"); // the credit-aware retry returns before the credential-rotation step
+		});
+	});
+
 	describe("isRetryableError", () => {
 		it("considers long-context entitlement error as retryable", () => {
 			const { deps } = createMockDeps();
@@ -291,6 +365,15 @@ describe("RetryHandler — long-context entitlement 429 (#2803)", () => {
 			);
 			assert.equal(handler.isRetryableError(msg), false);
 		});
+
+		it("considers OpenRouter affordability credit errors as retryable", () => {
+			const { deps } = createMockDeps();
+			const handler = new RetryHandler(deps);
+			const msg = errorMessage(
+				"402 This request requires more credits, or fewer max_tokens. You requested up to 32000 tokens, but can only afford 329.",
+			);
+			assert.equal(handler.isRetryableError(msg), true);
+		});
 	});
 
 	describe("third-party block claude-code fallback (#3772)", () => {
diff --git a/packages/pi-coding-agent/src/core/retry-handler.ts b/packages/pi-coding-agent/src/core/retry-handler.ts
index 78d12c8ba..399d92fd4 100644
--- a/packages/pi-coding-agent/src/core/retry-handler.ts
+++ b/packages/pi-coding-agent/src/core/retry-handler.ts
@@ -116,7 +116,7 @@ export class RetryHandler {
 		// generated error from getApiKey() when credentials are in a backoff window.
 		// Re-entering the retry handler for that message creates a cascade of empty
 		// error entries in the session file, breaking resume (#3429).
-		return /overloaded|rate.?limit|too many requests|429|500|502|503|504|service.?unavailable|server.?error|internal.?error|connection.?error|connection.?refused|other side closed|fetch failed|upstream.?connect|reset before headers|terminated|retry delay|network.?(?:is\s+)?unavailable|credentials.*expired|extra usage is required|(?:out of|no) extra usage|third.party.*draw from extra|third.party.*not.*available/i.test(
+		return /overloaded|rate.?limit|too many requests|402|429|500|502|503|504|service.?unavailable|server.?error|internal.?error|connection.?error|connection.?refused|other side closed|fetch failed|upstream.?connect|reset before headers|terminated|retry delay|network.?(?:is\s+)?unavailable|credentials.*expired|requires more credits|can only afford|insufficient credits|not enough credits|extra usage is required|(?:out of|no) extra usage|third.party.*draw from extra|third.party.*not.*available/i.test(
 			err,
 		);
 	}
@@ -158,6 +158,14 @@ export class RetryHandler {
 		const isRateLimit = errorType === "rate_limit";
 		const isQuotaError = errorType === "quota_exhausted";
 
+		// Credit-aware retry (OpenRouter-style 402 affordability errors):
+		// when provider reports "can only afford N", lower maxTokens and retry
+		// on the same model before rotating credentials/providers.
+		if (isQuotaError) {
+			const adjusted = this._tryAffordableMaxTokensRetry(message, retryGeneration);
+			if (adjusted) return true;
+		}
+
 		// Credential rotation — only for transient rate limits (#3430).
 		// Quota errors ("Extra usage is required") are account-level billing
 		// gates; rotating to another credential on the same account won't help
@@ -409,12 +417,81 @@ export class RetryHandler {
 		// Long-context entitlement errors are billing gates, not transient rate limits.
 		// Must be checked before the generic 429/rate_limit regex.
 		if (/extra usage is required|long context required/i.test(err)) return "quota_exhausted";
+		if (/requires more credits|can only afford|insufficient credits|not enough credits|credit balance/i.test(err))
+			return "quota_exhausted";
 		if (/quota|billing|exceeded.*limit|usage.*limit/i.test(err)) return "quota_exhausted";
 		if (/rate.?limit|too many requests|429/i.test(err)) return "rate_limit";
 		if (/500|502|503|504|server.?error|internal.?error|service.?unavailable/i.test(err)) return "server_error";
 		return "unknown";
 	}
 
+	/**
+	 * Attempt a same-model retry by reducing maxTokens when the provider reports
+	 * an affordability cap (e.g., "can only afford 329").
+	 *
+	 * On success the agent's model is replaced by a copy with a lower maxTokens,
+	 * _removeLastAssistantError() is called, and a zero-delay continuation is
+	 * scheduled.
+	 *
+	 * @param message - Failed assistant message whose errorMessage carries the cap.
+	 * @param retryGeneration - Passed through to _scheduleContinue.
+	 * @returns true if a reduced-maxTokens retry was scheduled; false if no cap
+	 *   could be parsed, the cap is below the 64-token floor, or the model's
+	 *   maxTokens is already at or below the target.
+	 */
+	private _tryAffordableMaxTokensRetry(message: AssistantMessage, retryGeneration: number): boolean {
+		const currentModel = this._deps.getModel();
+		if (!currentModel || !message.errorMessage) return false;
+
+		// Example: "can only afford 329" (thousands separators are accepted).
+		const match = message.errorMessage.match(/can only afford\s+([\d,]+)/i);
+		if (!match?.[1]) return false;
+
+		const affordable = Number.parseInt(match[1].replace(/,/g, ""), 10);
+		if (!Number.isFinite(affordable) || affordable <= 0) return false;
+
+		// A cap below the floor cannot be honoured: clamping up to the floor would
+		// request more than the provider said is affordable, so the retry would
+		// fail with the same error. Decline and let the caller's regular quota
+		// handling run instead.
+		const minMaxTokens = 64;
+		if (affordable < minMaxTokens) return false;
+
+		// Leave a small buffer so slight input variance doesn't immediately re-fail.
+		const safetyBuffer = Math.min(64, Math.max(16, Math.floor(affordable * 0.1)));
+		const targetMaxTokens = Math.max(minMaxTokens, affordable - safetyBuffer);
+		const downgradedMaxTokens = Math.min(currentModel.maxTokens, targetMaxTokens);
+		if (downgradedMaxTokens >= currentModel.maxTokens) return false;
+
+		const downgradedModel = {
+			...currentModel,
+			maxTokens: downgradedMaxTokens,
+		};
+
+		this._deps.agent.setModel(downgradedModel);
+		this._deps.onModelChange(downgradedModel);
+		this._removeLastAssistantError();
+
+		// Reuses the provider-switch event to surface the maxTokens change.
+		this._deps.emit({
+			type: "fallback_provider_switch",
+			from: `${currentModel.provider}/${currentModel.id} (maxTokens=${currentModel.maxTokens})`,
+			to: `${downgradedModel.provider}/${downgradedModel.id} (maxTokens=${downgradedModel.maxTokens})`,
+			reason: `credit-aware retry: provider affordable cap ${affordable} tokens`,
+		});
+
+		this._deps.emit({
+			type: "auto_retry_start",
+			attempt: this._retryAttempt + 1,
+			maxAttempts: this._deps.settingsManager.getRetrySettings().maxRetries,
+			delayMs: 0,
+			errorMessage: `${message.errorMessage} (reducing max tokens)`,
+		});
+
+		this._scheduleContinue(retryGeneration);
+		return true;
+	}
+
 	/**
 	 * Attempt to downgrade a long-context model (e.g. claude-opus-4-6[1m]) to its
 	 * base model (claude-opus-4-6) when the account lacks the long-context billing
diff --git a/src/resources/extensions/gsd/error-classifier.ts b/src/resources/extensions/gsd/error-classifier.ts
index 604167451..b72f88e46 100644
--- a/src/resources/extensions/gsd/error-classifier.ts
+++ b/src/resources/extensions/gsd/error-classifier.ts
@@ -44,6 +44,9 @@ export function resetRetryState(state: RetryState): void {
 
 const PERMANENT_RE = /auth|unauthorized|forbidden|invalid.*key|invalid.*api|billing|quota exceeded|account/i;
 const RATE_LIMIT_RE = /rate.?limit|too many requests|429/i;
+// OpenRouter affordability-style quota errors should be treated as transient
+// so core retry logic can lower maxTokens and continue in-session.
+const AFFORDABILITY_RE = /can only afford\s+\d|fewer max_tokens/i; // only messages carrying a reducible cap; cap-less credit errors keep their normal classification
 const NETWORK_RE = /network|ECONNRESET|ETIMEDOUT|ECONNREFUSED|socket hang up|fetch failed|connection.*reset|dns/i;
 const SERVER_RE = /internal server error|500|502|503|overloaded|server_error|api_error|service.?unavailable/i;
 // ECONNRESET/ECONNREFUSED are in NETWORK_RE (same-model retry first).
@@ -67,7 +70,7 @@ const RESET_DELAY_RE = /reset in (\d+)s/i;
  */
 export function classifyError(errorMsg: string, retryAfterMs?: number): ErrorClass {
   const isPermanent = PERMANENT_RE.test(errorMsg);
-  const isRateLimit = RATE_LIMIT_RE.test(errorMsg);
+  const isRateLimit = RATE_LIMIT_RE.test(errorMsg) || AFFORDABILITY_RE.test(errorMsg);
 
   // 1. Permanent — but rate limit takes precedence
   if (isPermanent && !isRateLimit) {
diff --git a/src/resources/extensions/gsd/tests/provider-errors.test.ts b/src/resources/extensions/gsd/tests/provider-errors.test.ts
index 34c4ed824..f1867f890 100644
--- a/src/resources/extensions/gsd/tests/provider-errors.test.ts
+++ b/src/resources/extensions/gsd/tests/provider-errors.test.ts
@@ -32,6 +32,20 @@ test("classifyError detects rate limit from message", () => {
   assert.equal(result.kind, "rate-limit");
 });
 
+test("classifyError treats OpenRouter affordability errors as transient rate-limit class", () => {
+  const result = classifyError(
+    "402 This request requires more credits, or fewer max_tokens. You requested up to 32000 tokens, but can only afford 329.",
+  );
+  assert.ok(isTransient(result));
+  assert.equal(result.kind, "rate-limit");
+  assert.ok("retryAfterMs" in result && result.retryAfterMs > 0);
+});
+
+test("classifyError does not treat cap-less credit errors as rate-limit", () => {
+  const result = classifyError("Insufficient credits. Add more using https://openrouter.ai/settings/credits");
+  assert.notEqual(result.kind, "rate-limit");
+});
+
 test("classifyError extracts reset delay from message", () => {
   const result = classifyError("rate limit exceeded, reset in 45s");
   assert.equal(result.kind, "rate-limit");