Merge pull request #3374 from deseltrus/fix/auto-mode-transient-error-resilience
fix(auto): resilient transient error recovery — defer to Core RetryHandler and fix cmdCtx race
This commit is contained in:
commit
393700a649
6 changed files with 134 additions and 6 deletions
|
|
@ -1205,7 +1205,18 @@ export async function startAuto(
|
|||
s.active = true;
|
||||
s.verbose = verboseMode;
|
||||
s.stepMode = requestedStepMode;
|
||||
s.cmdCtx = ctx;
|
||||
// Preserve the original cmdCtx (ExtensionCommandContext with newSession)
|
||||
// when resuming from a provider-error pause. The resume callback receives
|
||||
// an ExtensionContext (from the agent_end hook) which lacks newSession —
|
||||
// using it would crash runUnit with "newSession is not a function".
|
||||
// Only override if the new ctx actually has newSession (user-initiated resume).
|
||||
if ("newSession" in ctx && typeof (ctx as any).newSession === "function") {
|
||||
s.cmdCtx = ctx;
|
||||
} else if (!s.cmdCtx) {
|
||||
// No saved cmdCtx — this shouldn't happen, but handle gracefully
|
||||
s.cmdCtx = ctx as ExtensionCommandContext;
|
||||
}
|
||||
// else: keep existing s.cmdCtx which has the real newSession
|
||||
s.basePath = base;
|
||||
setLogBasePath(base);
|
||||
if (!s.autoStartTime || s.autoStartTime <= 0) s.autoStartTime = Date.now();
|
||||
|
|
|
|||
|
|
@ -1302,11 +1302,29 @@ export async function runUnitPhase(
|
|||
debugLog("autoLoop", { phase: "exit", reason: "provider-pause", isTransient: unitResult.errorContext.isTransient });
|
||||
return { action: "break", reason: "provider-pause" };
|
||||
}
|
||||
// Session creation timeout (not a structural error): pause auto-mode
|
||||
// and let the provider-error-resume timer handle recovery. This matches
|
||||
// the provider-pause path — break out cleanly, don't hard-stop.
|
||||
// Structural errors (TypeError, is not a function) are NOT transient
|
||||
// and must hard-stop to avoid infinite retry loops.
|
||||
if (
|
||||
unitResult.errorContext?.isTransient &&
|
||||
unitResult.errorContext?.category === "timeout"
|
||||
) {
|
||||
ctx.ui.notify(
|
||||
`Session creation timed out for ${unitType} ${unitId}. Will retry.`,
|
||||
"warning",
|
||||
);
|
||||
debugLog("autoLoop", { phase: "session-timeout-pause", unitType, unitId });
|
||||
await deps.pauseAuto(ctx, pi);
|
||||
return { action: "break", reason: "session-timeout" };
|
||||
}
|
||||
// All other cancelled states (structural errors, non-transient failures): hard stop
|
||||
ctx.ui.notify(
|
||||
`Session creation timed out or was cancelled for ${unitType} ${unitId}. Will retry.`,
|
||||
`Session creation failed for ${unitType} ${unitId}: ${unitResult.errorContext?.message ?? "unknown"}. Stopping auto-mode.`,
|
||||
"warning",
|
||||
);
|
||||
await deps.stopAuto(ctx, pi, "Session creation failed");
|
||||
await deps.stopAuto(ctx, pi, `Session creation failed: ${unitResult.errorContext?.message ?? "unknown"}`);
|
||||
debugLog("autoLoop", { phase: "exit", reason: "session-failed" });
|
||||
return { action: "break", reason: "session-failed" };
|
||||
}
|
||||
|
|
|
|||
|
|
@ -19,7 +19,17 @@ import {
|
|||
|
||||
const retryState = createRetryState();
|
||||
const MAX_NETWORK_RETRIES = 2;
|
||||
const MAX_TRANSIENT_AUTO_RESUMES = 3;
|
||||
const MAX_TRANSIENT_AUTO_RESUMES = 8;
|
||||
|
||||
/**
|
||||
* Reset the module-level retry state so a resumed auto-session starts fresh.
|
||||
* Called by provider-error-resume.ts before startAuto() — without this, the
|
||||
* consecutiveTransientCount accumulates across pause/resume cycles and locks
|
||||
* out auto-resume after MAX_TRANSIENT_AUTO_RESUMES total (not consecutive) errors.
|
||||
*/
|
||||
export function resetTransientRetryState(): void {
|
||||
resetRetryState(retryState);
|
||||
}
|
||||
|
||||
async function pauseTransientWithBackoff(
|
||||
cls: ErrorClass,
|
||||
|
|
@ -114,6 +124,29 @@ export async function handleAgentEnd(
|
|||
// ── 1. Classify using rawErrorMsg to avoid prose false-positives ────
|
||||
const cls = classifyError(rawErrorMsg, explicitRetryAfterMs);
|
||||
|
||||
// ── 1b. Defer to Core RetryHandler for transient errors ─────────────
|
||||
// The Core RetryHandler (agent-session.ts) processes retryable errors
|
||||
// AFTER this extension handler, in the same _processAgentEvent() call.
|
||||
// For transient errors (overloaded, rate limit, server), the Core will
|
||||
// retry in-context — same session, same conversation — which is strictly
|
||||
// better than our Layer 2 pause+resume (which creates a new session).
|
||||
//
|
||||
// If we react here AND the Core also retries, we race: pauseAuto tears
|
||||
// down the session while agent.continue() starts a new turn.
|
||||
//
|
||||
// Solution: Do nothing for transient errors. The Core RetryHandler
|
||||
// runs next in _processAgentEvent and will either:
|
||||
// a) Retry successfully → new agent_end (success) → we see it next time
|
||||
// b) Exhaust retries → the agent stays idle, autoLoop's unit timeout
|
||||
// or stuck detection handles it
|
||||
//
|
||||
// We do NOT call resolveAgentEnd here — that would unblock autoLoop
|
||||
// prematurely while the Core is still retrying in the same session.
|
||||
// We do NOT call pauseAuto — that would tear down the session.
|
||||
if (isTransient(cls)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Cap rate-limit backoff for CLI-style providers (openai-codex, google-gemini-cli)
|
||||
// which use per-user quotas with shorter windows (#2922).
|
||||
if (cls.kind === "rate-limit") {
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ import type {
|
|||
} from "@gsd/pi-coding-agent";
|
||||
|
||||
import { getAutoDashboardData, startAuto, type AutoDashboardData } from "../auto.js";
|
||||
import { resetTransientRetryState } from "./agent-end-recovery.js";
|
||||
|
||||
type AutoResumeSnapshot = Pick<AutoDashboardData, "active" | "paused" | "stepMode" | "basePath">;
|
||||
|
||||
|
|
@ -42,6 +43,11 @@ export async function resumeAutoAfterProviderDelay(
|
|||
return "missing-base";
|
||||
}
|
||||
|
||||
// Reset the transient retry counter before restarting — without this,
|
||||
// consecutiveTransientCount accumulates across pause/resume cycles and
|
||||
// permanently locks out auto-resume after MAX_TRANSIENT_AUTO_RESUMES errors.
|
||||
resetTransientRetryState();
|
||||
|
||||
await deps.startAuto(
|
||||
ctx as ExtensionCommandContext,
|
||||
pi,
|
||||
|
|
|
|||
|
|
@ -590,9 +590,9 @@ test("unit-end event contains errorContext when unit is cancelled with structure
|
|||
resolveAgentEndCancelled({ message: "Hard timeout error: exceeded limit", category: "timeout", isTransient: true });
|
||||
|
||||
const result = await unitPromise;
|
||||
// Cancelled units break the loop before emitting unit-end
|
||||
// Transient timeout cancellations pause (recoverable) instead of hard-stopping
|
||||
assert.equal(result.action, "break");
|
||||
assert.equal((result as any).reason, "session-failed");
|
||||
assert.equal((result as any).reason, "session-timeout");
|
||||
|
||||
// Verify error classification used structured errorContext on the window entry
|
||||
const entry = loopState.recentUnits[loopState.recentUnits.length - 1];
|
||||
|
|
|
|||
|
|
@ -458,6 +458,66 @@ test("openai-codex-responses.ts extracts nested error fields", () => {
|
|||
);
|
||||
});
|
||||
|
||||
// ── Fix 1: resetTransientRetryState resets module-level singleton ────────────
|
||||
|
||||
test("resetTransientRetryState is exported from agent-end-recovery.ts", () => {
|
||||
const src = readFileSync(join(__dirname, "..", "bootstrap", "agent-end-recovery.ts"), "utf-8");
|
||||
assert.ok(
|
||||
src.includes("export function resetTransientRetryState"),
|
||||
"agent-end-recovery.ts must export resetTransientRetryState for provider-error-resume.ts",
|
||||
);
|
||||
});
|
||||
|
||||
test("provider-error-resume.ts calls resetTransientRetryState before startAuto", () => {
|
||||
const src = readFileSync(join(__dirname, "..", "bootstrap", "provider-error-resume.ts"), "utf-8");
|
||||
assert.ok(
|
||||
src.includes("resetTransientRetryState"),
|
||||
"provider-error-resume.ts must import and call resetTransientRetryState",
|
||||
);
|
||||
// Ensure reset is called BEFORE startAuto — order matters
|
||||
const resetIdx = src.indexOf("resetTransientRetryState()");
|
||||
const startIdx = src.indexOf("await deps.startAuto(");
|
||||
assert.ok(
|
||||
resetIdx !== -1 && startIdx !== -1 && resetIdx < startIdx,
|
||||
"resetTransientRetryState() must be called before deps.startAuto()",
|
||||
);
|
||||
});
|
||||
|
||||
// ── Fix 2: Session creation timeout treated as transient in phases.ts ───────
|
||||
|
||||
test("phases.ts handles timeout session-creation failures with pause instead of stopAuto", () => {
|
||||
const src = readFileSync(join(__dirname, "..", "auto", "phases.ts"), "utf-8");
|
||||
|
||||
// The cancelled + isTransient + category=timeout path must pause, not hard-stop
|
||||
assert.ok(
|
||||
src.includes('category === "timeout"'),
|
||||
"phases.ts must check category === 'timeout' on transient cancelled unitResults",
|
||||
);
|
||||
// Must call pauseAuto (not stopAuto) for timeout cancellations
|
||||
assert.ok(
|
||||
/category === "timeout"[\s\S]{0,300}pauseAuto/.test(src),
|
||||
"phases.ts must call pauseAuto for session-timeout failures (not stopAuto or continue)",
|
||||
);
|
||||
// Must NOT use action: "continue" for transient cancellations (causes infinite loops)
|
||||
assert.ok(
|
||||
!/isTransient[\s\S]{0,500}action:\s*"continue"/.test(src),
|
||||
"phases.ts must NOT return action:continue for cancelled units — use break+pause instead",
|
||||
);
|
||||
});
|
||||
|
||||
// ── Fix 3: MAX_TRANSIENT_AUTO_RESUMES raised to 8 ───────────────────────────
|
||||
|
||||
test("MAX_TRANSIENT_AUTO_RESUMES is at least 8 for sustained overload resilience", () => {
|
||||
const src = readFileSync(join(__dirname, "..", "bootstrap", "agent-end-recovery.ts"), "utf-8");
|
||||
const match = src.match(/MAX_TRANSIENT_AUTO_RESUMES\s*=\s*(\d+)/);
|
||||
assert.ok(match, "MAX_TRANSIENT_AUTO_RESUMES must be defined");
|
||||
const value = Number(match![1]);
|
||||
assert.ok(
|
||||
value >= 8,
|
||||
`MAX_TRANSIENT_AUTO_RESUMES must be >= 8 for sustained overload resilience, got ${value}`,
|
||||
);
|
||||
});
|
||||
|
||||
// ── agent-session retryable regex handles server_error (#1166) ──────────────
|
||||
|
||||
test("agent-session retryable error regex matches server_error (underscore)", () => {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue