diff --git a/packages/pi-coding-agent/src/core/fallback-resolver.test.ts b/packages/pi-coding-agent/src/core/fallback-resolver.test.ts index 4e9ca7cd9..c40a6462b 100644 --- a/packages/pi-coding-agent/src/core/fallback-resolver.test.ts +++ b/packages/pi-coding-agent/src/core/fallback-resolver.test.ts @@ -82,7 +82,7 @@ describe("FallbackResolver — findFallback", () => { assert.equal(result!.chainName, "coding"); }); - it("marks current provider as exhausted", async () => { + it("marks current provider as exhausted for rate_limit errors", async () => { const { resolver, authStorage } = createResolver(); await resolver.findFallback(zaiModel, "rate_limit"); @@ -92,6 +92,18 @@ describe("FallbackResolver — findFallback", () => { assert.equal(fn.mock.calls[0][1], "rate_limit"); }); + it("does NOT mark provider as exhausted for quota_exhausted (per-model quota)", async () => { + const { resolver, authStorage } = createResolver(); + await resolver.findFallback(zaiModel, "quota_exhausted"); + + const fn = authStorage.markProviderExhausted as any; + assert.equal( + fn.mock.calls.length, + 0, + "quota_exhausted should not mark entire provider exhausted — other models may have quota", + ); + }); + it("skips backed-off providers", async () => { const { resolver } = createResolver({ isProviderAvailable: (provider: string) => provider !== "alibaba", diff --git a/packages/pi-coding-agent/src/core/fallback-resolver.ts b/packages/pi-coding-agent/src/core/fallback-resolver.ts index 690dca75d..7220b819a 100644 --- a/packages/pi-coding-agent/src/core/fallback-resolver.ts +++ b/packages/pi-coding-agent/src/core/fallback-resolver.ts @@ -44,8 +44,13 @@ export class FallbackResolver { const { enabled, chains } = this.settingsManager.getFallbackSettings(); if (!enabled) return null; - // Mark the current provider as exhausted at the provider level - this.authStorage.markProviderExhausted(currentModel.provider, errorType); + // Mark the current provider as exhausted at the provider level. 
+ // Skip for quota_exhausted — quotas are typically per-model (e.g. + // google-gemini-cli's Code Assist per-model limits), so other models + // from the same provider may still be available. + if (errorType !== "quota_exhausted") { + this.authStorage.markProviderExhausted(currentModel.provider, errorType); + } // Search all chains for one containing the current model for (const [chainName, entries] of Object.entries(chains)) { diff --git a/packages/pi-coding-agent/src/core/retry-handler.test.ts b/packages/pi-coding-agent/src/core/retry-handler.test.ts index 71086f202..1272955ed 100644 --- a/packages/pi-coding-agent/src/core/retry-handler.test.ts +++ b/packages/pi-coding-agent/src/core/retry-handler.test.ts @@ -513,6 +513,19 @@ describe("RetryHandler — long-context entitlement 429 (#2803)", () => { ); assert.equal(handler.isRetryableError(msg), true); }); + + it("considers 'no capacity' provider errors as retryable", () => { + const { deps } = createMockDeps(); + const handler = new RetryHandler(deps); + const msg = errorMessage( + "No capacity available for model gemini-2.5-pro on the server", + ); + assert.equal( + handler.isRetryableError(msg), + true, + "no capacity errors should be retryable (triggers fallback)", + ); + }); }); describe("third-party block claude-code fallback (#3772)", () => { @@ -608,7 +621,7 @@ describe("RetryHandler — long-context entitlement 429 (#2803)", () => { }); describe("quota wait before fallback", () => { - it("waits for retryAfterMs before retrying same provider on quota error", async () => { + it("waits for short retryAfterMs before retrying same provider on quota error", async () => { const { deps, emittedEvents, continueFn } = createMockDeps({ model: createMockModel("google-gemini-cli", "gemini-2.5-pro"), fallbackResult: null, @@ -617,14 +630,14 @@ describe("RetryHandler — long-context entitlement 429 (#2803)", () => { const handler = new RetryHandler(deps); const msg = errorMessage( - "You have exhausted your capacity on this 
model. Your quota will reset after 59s.", + "You have exhausted your capacity on this model. Your quota will reset after 3s.", ); - (msg as any).retryAfterMs = 59000; + (msg as any).retryAfterMs = 3000; const result = await handler.handleRetryableError(msg); - // Should wait and retry, not immediately fail - assert.equal(result, true, "should wait and retry on quota reset"); + // Should wait and retry for short resets (< 5s threshold) + assert.equal(result, true, "should wait and retry on short quota reset"); const retryStart = emittedEvents.find( (e) => e.type === "auto_retry_start", @@ -632,7 +645,7 @@ describe("RetryHandler — long-context entitlement 429 (#2803)", () => { assert.ok(retryStart, "should emit auto_retry_start with wait"); assert.equal( retryStart!.delayMs, - 59000, + 3000, "should use provider's retry-after delay", ); assert.ok( @@ -651,7 +664,7 @@ describe("RetryHandler — long-context entitlement 429 (#2803)", () => { ); }); - it("falls through to fallback when retryAfterMs exceeds maxDelayMs", async () => { + it("falls through to fallback when retryAfterMs exceeds short threshold", async () => { const fallbackModel = createMockModel("openai", "gpt-4o"); const { deps, emittedEvents } = createMockDeps({ model: createMockModel("google-gemini-cli", "gemini-2.5-pro"), @@ -659,18 +672,18 @@ describe("RetryHandler — long-context entitlement 429 (#2803)", () => { model: fallbackModel, reason: "cross-provider fallback", }, - retrySettings: { maxRetries: 5, baseDelayMs: 1000, maxDelayMs: 30000 }, + retrySettings: { maxRetries: 5, baseDelayMs: 1000, maxDelayMs: 60000 }, }); const handler = new RetryHandler(deps); const msg = errorMessage( - "You have exhausted your capacity on this model. Your quota will reset after 5m.", + "You have exhausted your capacity on this model. 
Your quota will reset after 59s.", ); - (msg as any).retryAfterMs = 300000; // 5 minutes, exceeds maxDelayMs + (msg as any).retryAfterMs = 59000; const result = await handler.handleRetryableError(msg); - // Should fall through to fallback since wait is too long + // Should fall through to fallback since wait > 5s threshold assert.equal(result, true, "should fallback when quota reset is too long"); const switchEvent = emittedEvents.find( @@ -764,6 +777,39 @@ describe("RetryHandler — long-context entitlement 429 (#2803)", () => { ); }); + it("triggers fallback for 'no capacity' server errors", async () => { + // "No capacity available" is a provider-side capacity issue, + // not a credential/rate-limit problem. Should classify as rate_limit + // to trigger the fallback chain. + const fallbackModel = createMockModel("openai", "gpt-4o"); + const { deps, emittedEvents, findFallback } = createMockDeps({ + model: createMockModel("google-gemini-cli", "gemini-2.5-pro"), + markUsageLimitReachedResult: false, + fallbackResult: { + model: fallbackModel, + reason: "free-selection fallback", + }, + }); + + const handler = new RetryHandler(deps); + const msg = errorMessage( + "No capacity available for model gemini-2.5-pro on the server", + ); + + const result = await handler.handleRetryableError(msg); + + assert.equal(result, true, "should retry with fallback provider"); + assert.equal( + findFallback.mock.calls.length, + 1, + "should invoke fallback resolver for capacity errors", + ); + assert.ok( + emittedEvents.some((e) => e.type === "fallback_provider_switch"), + "should emit fallback_provider_switch", + ); + }); + it("still tries cross-provider fallback for quota_exhausted without credential backoff", async () => { const fallbackModel = createMockModel("openai", "gpt-4o"); const { deps, markUsageLimitReached, continueFn } = createMockDeps({ diff --git a/packages/pi-coding-agent/src/core/retry-handler.ts b/packages/pi-coding-agent/src/core/retry-handler.ts index 
50261b89a..725312c38 100644 --- a/packages/pi-coding-agent/src/core/retry-handler.ts +++ b/packages/pi-coding-agent/src/core/retry-handler.ts @@ -119,7 +119,7 @@ export class RetryHandler { // generated error from getApiKey() when credentials are in a backoff window. // Re-entering the retry handler for that message creates a cascade of empty // error entries in the session file, breaking resume (#3429). - return /overloaded|rate.?limit|too many requests|402|429|500|502|503|504|service.?unavailable|server.?error|internal.?error|connection.?error|connection.?refused|connection.?lost|other side closed|fetch failed|upstream.?connect|reset before headers|terminated|retry delay|network.?(?:is\s+)?unavailable|credentials.*expired|requires more credits|can only afford|insufficient credits|not enough credits|extra usage is required|(?:out of|no) extra usage|third.party.*draw from extra|third.party.*not.*available/i.test( + return /overloaded|rate.?limit|too many requests|402|429|500|502|503|504|service.?unavailable|server.?error|internal.?error|connection.?error|connection.?refused|connection.?lost|other side closed|fetch failed|upstream.?connect|reset before headers|terminated|retry delay|network.?(?:is\s+)?unavailable|credentials.*expired|requires more credits|can only afford|insufficient credits|not enough credits|extra usage is required|(?:out of|no) extra usage|third.party.*draw from extra|third.party.*not.*available|no capacity|capacity.*available/i.test( err, ); } @@ -211,9 +211,10 @@ export class RetryHandler { const isAuthError = errorType === "auth_error"; if (isRateLimit || isQuotaError || isAuthError) { // For quota errors with a retry-after hint, wait before switching providers. - // The quota may reset quickly (e.g. 59s), so waiting is often better than - // switching to a potentially worse model. 
- if (isQuotaError && message.retryAfterMs !== undefined && message.retryAfterMs > 0) { + // Only wait if the reset is very short (<= 5s); otherwise falling back to + // another provider is faster and keeps auto-mode throughput up. + const QUOTA_WAIT_THRESHOLD_MS = 5_000; + if (isQuotaError && message.retryAfterMs !== undefined && message.retryAfterMs > 0 && message.retryAfterMs <= QUOTA_WAIT_THRESHOLD_MS) { const cap = settings.maxDelayMs > 0 ? settings.maxDelayMs : Infinity; if (message.retryAfterMs <= cap) { this._deps.emit({ @@ -486,6 +487,11 @@ export class RetryHandler { return "quota_exhausted"; if (/rate.?limit|too many requests|429|529|overloaded/i.test(err)) return "rate_limit"; + // Provider-side capacity/server load — the server has no available + // capacity for this model (e.g. "No capacity available for model X"). + // Treat as rate_limit so the fallback chain kicks in immediately. + if (/no capacity|capacity.*available|server.*busy|too busy/i.test(err)) + return "rate_limit"; if ( /500|502|503|504|server.?error|internal.?error|service.?unavailable/i.test( err,