From 46016adf9a86b75b26110df216320363850ecf86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?T=C3=82CHES?= Date: Thu, 26 Mar 2026 20:03:28 -0600 Subject: [PATCH] fix: let rate-limit errors attempt model fallback before pausing (#2775) * fix: let rate-limit errors attempt model fallback before pausing Rate-limit errors were early-returning with pauseTransientWithBackoff() before reaching model fallback logic. Since rate limits are frequently per-model (not provider-wide), this caused 20+ minute stuck-loops when fallback models were available. Now rate-limit errors enter the same fallback path as other transient errors, only pausing if no fallback model is available. Closes #2770 Co-Authored-By: Claude Opus 4.6 (1M context) * test: add regression test for rate-limit model fallback Verifies that rate-limit errors enter the model fallback path before pausing (#2770), and that the old early-return bypass no longer exists. Co-Authored-By: Claude Opus 4.6 (1M context) --------- Co-authored-by: Claude Opus 4.6 (1M context) --- .../gsd/bootstrap/agent-end-recovery.ts | 15 +--- .../tests/rate-limit-model-fallback.test.ts | 90 +++++++++++++++++++ 2 files changed, 94 insertions(+), 11 deletions(-) create mode 100644 src/resources/extensions/gsd/tests/rate-limit-model-fallback.test.ts diff --git a/src/resources/extensions/gsd/bootstrap/agent-end-recovery.ts b/src/resources/extensions/gsd/bootstrap/agent-end-recovery.ts index 1c5862260..89de63a58 100644 --- a/src/resources/extensions/gsd/bootstrap/agent-end-recovery.ts +++ b/src/resources/extensions/gsd/bootstrap/agent-end-recovery.ts @@ -107,16 +107,9 @@ export async function handleAgentEnd( ctx.ui.notify(`Network retries exhausted for ${currentModelId}. Attempting model fallback.`, "warning"); } - // --- Rate limit: skip model fallback, go straight to pause --- - // Rate-limiting is a provider issue, not a model issue. - // Switching models won't help if the provider is throttling you. - if (cls.kind === "rate-limit") { - await pauseTransientWithBackoff(cls, pi, ctx, errorDetail, true); - return; - } - - // --- Server/connection/stream errors: try model fallback first --- - if (cls.kind === "network" || cls.kind === "server" || cls.kind === "connection" || cls.kind === "stream") { + // --- Transient errors: try model fallback first, then pause --- + // Rate limits are often per-model, so switching models can bypass them. + if (cls.kind === "rate-limit" || cls.kind === "network" || cls.kind === "server" || cls.kind === "connection" || cls.kind === "stream") { // Try model fallback const dash = getAutoDashboardData(); if (dash.currentUnit) { @@ -161,7 +154,7 @@ export async function handleAgentEnd( // --- Transient fallback: pause with auto-resume --- if (isTransient(cls)) { - await pauseTransientWithBackoff(cls, pi, ctx, errorDetail, false); + await pauseTransientWithBackoff(cls, pi, ctx, errorDetail, cls.kind === "rate-limit"); return; } diff --git a/src/resources/extensions/gsd/tests/rate-limit-model-fallback.test.ts b/src/resources/extensions/gsd/tests/rate-limit-model-fallback.test.ts new file mode 100644 index 000000000..a375225ef --- /dev/null +++ b/src/resources/extensions/gsd/tests/rate-limit-model-fallback.test.ts @@ -0,0 +1,90 @@ +/** + * rate-limit-model-fallback.test.ts — Regression test for #2770. + * + * Rate-limit errors enter the model fallback path before falling through + * to pause. This verifies the structural contract in agent-end-recovery.ts. + */ + +import test from "node:test"; +import assert from "node:assert/strict"; +import { readFileSync } from "node:fs"; +import { join, dirname } from "node:path"; +import { fileURLToPath } from "node:url"; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const RECOVERY_PATH = join(__dirname, "..", "bootstrap", "agent-end-recovery.ts"); + +function getRecoverySource(): string { + return readFileSync(RECOVERY_PATH, "utf-8"); +} + +// ── Rate-limit errors attempt model fallback (#2770) ───────────────────────── + +test("rate-limit errors enter the model fallback branch alongside other transient errors", () => { + const src = getRecoverySource(); + + // The condition that gates model fallback must include rate-limit. + // Match the if-condition that contains both "rate-limit" and fallback-related kinds. + const fallbackConditionRe = /if\s*\([^)]*cls\.kind\s*===\s*"rate-limit"[^)]*cls\.kind\s*===\s*"network"/; + const fallbackConditionReAlt = /if\s*\([^)]*cls\.kind\s*===\s*"network"[^)]*cls\.kind\s*===\s*"rate-limit"/; + + assert.ok( + fallbackConditionRe.test(src) || fallbackConditionReAlt.test(src), + 'rate-limit must appear in the same if-condition as network/server for model fallback (#2770)', + ); +}); + +test("rate-limit errors are NOT short-circuited to pause before model fallback", () => { + const src = getRecoverySource(); + + // The old code had a dedicated rate-limit early-return block before the fallback block. + // Verify it no longer exists. + const earlyRateLimitPause = /if\s*\(\s*cls\.kind\s*===\s*"rate-limit"\s*\)\s*\{[^}]*pauseTransientWithBackoff/; + assert.ok( + !earlyRateLimitPause.test(src), + 'rate-limit must NOT have a dedicated early pause before the model fallback path (#2770)', + ); +}); + +test("rate-limit errors fall through to pause if no fallback model is available", () => { + const src = getRecoverySource(); + + // After the fallback block, the transient fallback pause must still fire for rate-limit. + // The isTransient check covers rate-limit (verified by error-classifier tests). + // Verify pauseTransientWithBackoff is called with isRateLimit derived from cls.kind. + assert.ok( + src.includes('cls.kind === "rate-limit"'), + 'agent-end-recovery.ts must reference cls.kind === "rate-limit" for fallback and pause paths (#2770)', + ); + + // The transient fallback pause must pass the isRateLimit flag correctly. + const pauseCallRe = /pauseTransientWithBackoff\([^)]*cls\.kind\s*===\s*"rate-limit"/; + assert.ok( + pauseCallRe.test(src), + 'pauseTransientWithBackoff must receive isRateLimit based on cls.kind === "rate-limit" (#2770)', + ); +}); + +test("other transient errors (server, connection, stream) still attempt model fallback", () => { + const src = getRecoverySource(); + + // All transient kinds must appear in the fallback condition. + for (const kind of ["server", "connection", "stream"]) { + assert.ok( + src.includes(`cls.kind === "${kind}"`), + `model fallback condition must include cls.kind === "${kind}"`, + ); + } +}); + +test("permanent errors still bypass model fallback and pause indefinitely", () => { + const src = getRecoverySource(); + + // The permanent/unknown error handler must exist and call pauseAutoForProviderError + // with isTransient: false. + const permanentPauseRe = /pauseAutoForProviderError[\s\S]{0,300}isTransient:\s*false/; + assert.ok( + permanentPauseRe.test(src), + 'permanent errors must pause with isTransient: false (no auto-resume)', + ); +});