fix: let rate-limit errors attempt model fallback before pausing (#2775)
* fix: let rate-limit errors attempt model fallback before pausing Rate-limit errors were early-returning with pauseTransientWithBackoff() before reaching model fallback logic. Since rate limits are frequently per-model (not provider-wide), this caused 20+ minute stuck-loops when fallback models were available. Now rate-limit errors enter the same fallback path as other transient errors, only pausing if no fallback model is available. Closes #2770 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * test: add regression test for rate-limit model fallback Verifies that rate-limit errors enter the model fallback path before pausing (#2770), and that the old early-return bypass no longer exists. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d7755e596c
commit
46016adf9a
2 changed files with 94 additions and 11 deletions
|
|
@ -107,16 +107,9 @@ export async function handleAgentEnd(
|
|||
ctx.ui.notify(`Network retries exhausted for ${currentModelId}. Attempting model fallback.`, "warning");
|
||||
}
|
||||
|
||||
// --- Rate limit: skip model fallback, go straight to pause ---
|
||||
// Rate-limiting is a provider issue, not a model issue.
|
||||
// Switching models won't help if the provider is throttling you.
|
||||
if (cls.kind === "rate-limit") {
|
||||
await pauseTransientWithBackoff(cls, pi, ctx, errorDetail, true);
|
||||
return;
|
||||
}
|
||||
|
||||
// --- Server/connection/stream errors: try model fallback first ---
|
||||
if (cls.kind === "network" || cls.kind === "server" || cls.kind === "connection" || cls.kind === "stream") {
|
||||
// --- Transient errors: try model fallback first, then pause ---
|
||||
// Rate limits are often per-model, so switching models can bypass them.
|
||||
if (cls.kind === "rate-limit" || cls.kind === "network" || cls.kind === "server" || cls.kind === "connection" || cls.kind === "stream") {
|
||||
// Try model fallback
|
||||
const dash = getAutoDashboardData();
|
||||
if (dash.currentUnit) {
|
||||
|
|
@ -161,7 +154,7 @@ export async function handleAgentEnd(
|
|||
|
||||
// --- Transient fallback: pause with auto-resume ---
|
||||
if (isTransient(cls)) {
|
||||
await pauseTransientWithBackoff(cls, pi, ctx, errorDetail, false);
|
||||
await pauseTransientWithBackoff(cls, pi, ctx, errorDetail, cls.kind === "rate-limit");
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,90 @@
|
|||
/**
|
||||
* rate-limit-model-fallback.test.ts — Regression test for #2770.
|
||||
*
|
||||
* Rate-limit errors enter the model fallback path before falling through
|
||||
* to pause. This verifies the structural contract in agent-end-recovery.ts.
|
||||
*/
|
||||
|
||||
import test from "node:test";
|
||||
import assert from "node:assert/strict";
|
||||
import { readFileSync } from "node:fs";
|
||||
import { join, dirname } from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||
const RECOVERY_PATH = join(__dirname, "..", "bootstrap", "agent-end-recovery.ts");
|
||||
|
||||
function getRecoverySource(): string {
|
||||
return readFileSync(RECOVERY_PATH, "utf-8");
|
||||
}
|
||||
|
||||
// ── Rate-limit errors attempt model fallback (#2770) ─────────────────────────
|
||||
|
||||
test("rate-limit errors enter the model fallback branch alongside other transient errors", () => {
|
||||
const src = getRecoverySource();
|
||||
|
||||
// The condition that gates model fallback must include rate-limit.
|
||||
// Match the if-condition that contains both "rate-limit" and fallback-related kinds.
|
||||
const fallbackConditionRe = /if\s*\([^)]*cls\.kind\s*===\s*"rate-limit"[^)]*cls\.kind\s*===\s*"network"/;
|
||||
const fallbackConditionReAlt = /if\s*\([^)]*cls\.kind\s*===\s*"network"[^)]*cls\.kind\s*===\s*"rate-limit"/;
|
||||
|
||||
assert.ok(
|
||||
fallbackConditionRe.test(src) || fallbackConditionReAlt.test(src),
|
||||
'rate-limit must appear in the same if-condition as network/server for model fallback (#2770)',
|
||||
);
|
||||
});
|
||||
|
||||
test("rate-limit errors are NOT short-circuited to pause before model fallback", () => {
|
||||
const src = getRecoverySource();
|
||||
|
||||
// The old code had a dedicated rate-limit early-return block before the fallback block.
|
||||
// Verify it no longer exists.
|
||||
const earlyRateLimitPause = /if\s*\(\s*cls\.kind\s*===\s*"rate-limit"\s*\)\s*\{[^}]*pauseTransientWithBackoff/;
|
||||
assert.ok(
|
||||
!earlyRateLimitPause.test(src),
|
||||
'rate-limit must NOT have a dedicated early pause before the model fallback path (#2770)',
|
||||
);
|
||||
});
|
||||
|
||||
test("rate-limit errors fall through to pause if no fallback model is available", () => {
|
||||
const src = getRecoverySource();
|
||||
|
||||
// After the fallback block, the transient fallback pause must still fire for rate-limit.
|
||||
// The isTransient check covers rate-limit (verified by error-classifier tests).
|
||||
// Verify pauseTransientWithBackoff is called with isRateLimit derived from cls.kind.
|
||||
assert.ok(
|
||||
src.includes('cls.kind === "rate-limit"'),
|
||||
'agent-end-recovery.ts must reference cls.kind === "rate-limit" for fallback and pause paths (#2770)',
|
||||
);
|
||||
|
||||
// The transient fallback pause must pass the isRateLimit flag correctly.
|
||||
const pauseCallRe = /pauseTransientWithBackoff\([^)]*cls\.kind\s*===\s*"rate-limit"/;
|
||||
assert.ok(
|
||||
pauseCallRe.test(src),
|
||||
'pauseTransientWithBackoff must receive isRateLimit based on cls.kind === "rate-limit" (#2770)',
|
||||
);
|
||||
});
|
||||
|
||||
test("other transient errors (server, connection, stream) still attempt model fallback", () => {
|
||||
const src = getRecoverySource();
|
||||
|
||||
// All transient kinds must appear in the fallback condition.
|
||||
for (const kind of ["server", "connection", "stream"]) {
|
||||
assert.ok(
|
||||
src.includes(`cls.kind === "${kind}"`),
|
||||
`model fallback condition must include cls.kind === "${kind}"`,
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
test("permanent errors still bypass model fallback and pause indefinitely", () => {
|
||||
const src = getRecoverySource();
|
||||
|
||||
// The permanent/unknown error handler must exist and call pauseAutoForProviderError
|
||||
// with isTransient: false.
|
||||
const permanentPauseRe = /pauseAutoForProviderError[\s\S]{0,300}isTransient:\s*false/;
|
||||
assert.ok(
|
||||
permanentPauseRe.test(src),
|
||||
'permanent errors must pause with isTransient: false (no auto-resume)',
|
||||
);
|
||||
});
|
||||
Loading…
Add table
Reference in a new issue