fix: let rate-limit errors attempt model fallback before pausing (#2775)

* fix: let rate-limit errors attempt model fallback before pausing

Rate-limit errors were early-returning with pauseTransientWithBackoff()
before reaching model fallback logic. Since rate limits are frequently
per-model (not provider-wide), this caused 20+ minute stuck-loops when
fallback models were available. Now rate-limit errors enter the same
fallback path as other transient errors, only pausing if no fallback
model is available.

Closes #2770

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* test: add regression test for rate-limit model fallback

Verifies that rate-limit errors enter the model fallback path before
pausing (#2770), and that the old early-return bypass no longer exists.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
TÂCHES 2026-03-26 20:03:28 -06:00 committed by GitHub
parent d7755e596c
commit 46016adf9a
2 changed files with 94 additions and 11 deletions

View file

@ -107,16 +107,9 @@ export async function handleAgentEnd(
ctx.ui.notify(`Network retries exhausted for ${currentModelId}. Attempting model fallback.`, "warning");
}
// --- Rate limit: skip model fallback, go straight to pause ---
// Rate-limiting is a provider issue, not a model issue.
// Switching models won't help if the provider is throttling you.
if (cls.kind === "rate-limit") {
await pauseTransientWithBackoff(cls, pi, ctx, errorDetail, true);
return;
}
// --- Server/connection/stream errors: try model fallback first ---
if (cls.kind === "network" || cls.kind === "server" || cls.kind === "connection" || cls.kind === "stream") {
// --- Transient errors: try model fallback first, then pause ---
// Rate limits are often per-model, so switching models can bypass them.
if (cls.kind === "rate-limit" || cls.kind === "network" || cls.kind === "server" || cls.kind === "connection" || cls.kind === "stream") {
// Try model fallback
const dash = getAutoDashboardData();
if (dash.currentUnit) {
@ -161,7 +154,7 @@ export async function handleAgentEnd(
// --- Transient fallback: pause with auto-resume ---
if (isTransient(cls)) {
await pauseTransientWithBackoff(cls, pi, ctx, errorDetail, false);
await pauseTransientWithBackoff(cls, pi, ctx, errorDetail, cls.kind === "rate-limit");
return;
}

View file

@ -0,0 +1,90 @@
/**
* rate-limit-model-fallback.test.ts Regression test for #2770.
*
* Rate-limit errors enter the model fallback path before falling through
* to pause. This verifies the structural contract in agent-end-recovery.ts.
*/
import test from "node:test";
import assert from "node:assert/strict";
import { readFileSync } from "node:fs";
import { join, dirname } from "node:path";
import { fileURLToPath } from "node:url";
const __dirname = dirname(fileURLToPath(import.meta.url));
const RECOVERY_PATH = join(__dirname, "..", "bootstrap", "agent-end-recovery.ts");
function getRecoverySource(): string {
return readFileSync(RECOVERY_PATH, "utf-8");
}
// ── Rate-limit errors attempt model fallback (#2770) ─────────────────────────
test("rate-limit errors enter the model fallback branch alongside other transient errors", () => {
const src = getRecoverySource();
// The condition that gates model fallback must include rate-limit.
// Match the if-condition that contains both "rate-limit" and fallback-related kinds.
const fallbackConditionRe = /if\s*\([^)]*cls\.kind\s*===\s*"rate-limit"[^)]*cls\.kind\s*===\s*"network"/;
const fallbackConditionReAlt = /if\s*\([^)]*cls\.kind\s*===\s*"network"[^)]*cls\.kind\s*===\s*"rate-limit"/;
assert.ok(
fallbackConditionRe.test(src) || fallbackConditionReAlt.test(src),
'rate-limit must appear in the same if-condition as network/server for model fallback (#2770)',
);
});
test("rate-limit errors are NOT short-circuited to pause before model fallback", () => {
const src = getRecoverySource();
// The old code had a dedicated rate-limit early-return block before the fallback block.
// Verify it no longer exists.
const earlyRateLimitPause = /if\s*\(\s*cls\.kind\s*===\s*"rate-limit"\s*\)\s*\{[^}]*pauseTransientWithBackoff/;
assert.ok(
!earlyRateLimitPause.test(src),
'rate-limit must NOT have a dedicated early pause before the model fallback path (#2770)',
);
});
test("rate-limit errors fall through to pause if no fallback model is available", () => {
const src = getRecoverySource();
// After the fallback block, the transient fallback pause must still fire for rate-limit.
// The isTransient check covers rate-limit (verified by error-classifier tests).
// Verify pauseTransientWithBackoff is called with isRateLimit derived from cls.kind.
assert.ok(
src.includes('cls.kind === "rate-limit"'),
'agent-end-recovery.ts must reference cls.kind === "rate-limit" for fallback and pause paths (#2770)',
);
// The transient fallback pause must pass the isRateLimit flag correctly.
const pauseCallRe = /pauseTransientWithBackoff\([^)]*cls\.kind\s*===\s*"rate-limit"/;
assert.ok(
pauseCallRe.test(src),
'pauseTransientWithBackoff must receive isRateLimit based on cls.kind === "rate-limit" (#2770)',
);
});
test("other transient errors (server, connection, stream) still attempt model fallback", () => {
const src = getRecoverySource();
// All transient kinds must appear in the fallback condition.
for (const kind of ["server", "connection", "stream"]) {
assert.ok(
src.includes(`cls.kind === "${kind}"`),
`model fallback condition must include cls.kind === "${kind}"`,
);
}
});
test("permanent errors still bypass model fallback and pause indefinitely", () => {
const src = getRecoverySource();
// The permanent/unknown error handler must exist and call pauseAutoForProviderError
// with isTransient: false.
const permanentPauseRe = /pauseAutoForProviderError[\s\S]{0,300}isTransient:\s*false/;
assert.ok(
permanentPauseRe.test(src),
'permanent errors must pause with isTransient: false (no auto-resume)',
);
});