From 44aecc9a3f1c4d33e47876a55f5856550d95a394 Mon Sep 17 00:00:00 2001 From: deseltrus Date: Wed, 1 Apr 2026 07:39:09 +0200 Subject: [PATCH 1/2] =?UTF-8?q?fix(auto):=20resilient=20transient=20error?= =?UTF-8?q?=20recovery=20=E2=80=94=20defer=20to=20Core=20RetryHandler=20an?= =?UTF-8?q?d=20fix=20cmdCtx=20race?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three related bugs cause auto-mode to permanently stop on transient provider errors (overloaded_error, rate limits, 503s) that should be recoverable: 1. **Layer 1/Layer 2 race condition:** The extension's handleAgentEnd processes agent_end events BEFORE the Core RetryHandler in _processAgentEvent. For transient errors, both layers reacted simultaneously — the extension called pauseAuto (tearing down the session) while the Core called agent.continue() (in-context retry). This ripped the agent out of its context window mid- recovery. Fix: handleAgentEnd now returns immediately for transient errors, letting the Core retry in-context with full conversation preservation. 2. **retryState accumulation across resume cycles:** consecutiveTransientCount in agent-end-recovery.ts accumulated across pause/resume cycles without resetting, permanently locking out auto-resume after MAX_TRANSIENT_AUTO_RESUMES total (not per-cycle) errors. Fix: resetTransientRetryState() is called before startAuto() in the resume path. MAX_TRANSIENT_AUTO_RESUMES raised from 3 to 8 to cover ~30 minutes of sustained provider overload. 3. **ExtensionContext lacks newSession():** The provider-error resume callback receives an ExtensionContext (from the agent_end hook), not an ExtensionCommandContext. startAuto() overwrote s.cmdCtx with this incomplete context, causing 'newSession is not a function' on every subsequent runUnit() call. Fix: startAuto() now checks for newSession before overwriting — on provider-error resume, it preserves the original ExtensionCommandContext. Bonus: Session creation timeout (category=timeout) now calls pauseAuto instead of stopAuto, matching the provider-pause path. Structural errors (TypeError) still hard-stop to prevent infinite retry loops. Fixes #2492 --- src/resources/extensions/gsd/auto.ts | 13 +++- src/resources/extensions/gsd/auto/phases.ts | 22 ++++++- .../gsd/bootstrap/agent-end-recovery.ts | 35 ++++++++++- .../gsd/bootstrap/provider-error-resume.ts | 6 ++ .../gsd/tests/journal-integration.test.ts | 4 +- .../gsd/tests/provider-errors.test.ts | 60 +++++++++++++++++++ 6 files changed, 134 insertions(+), 6 deletions(-) diff --git a/src/resources/extensions/gsd/auto.ts b/src/resources/extensions/gsd/auto.ts index e7558e57c..80873381b 100644 --- a/src/resources/extensions/gsd/auto.ts +++ b/src/resources/extensions/gsd/auto.ts @@ -1191,7 +1191,18 @@ export async function startAuto( s.active = true; s.verbose = verboseMode; s.stepMode = requestedStepMode; - s.cmdCtx = ctx; + // Preserve the original cmdCtx (ExtensionCommandContext with newSession) + // when resuming from a provider-error pause. The resume callback receives + // an ExtensionContext (from the agent_end hook) which lacks newSession — + // using it would crash runUnit with "newSession is not a function". + // Only override if the new ctx actually has newSession (user-initiated resume). + if ("newSession" in ctx && typeof (ctx as any).newSession === "function") { + s.cmdCtx = ctx; + } else if (!s.cmdCtx) { + // No saved cmdCtx — this shouldn't happen, but handle gracefully + s.cmdCtx = ctx as ExtensionCommandContext; + } + // else: keep existing s.cmdCtx which has the real newSession s.basePath = base; setLogBasePath(base); s.unitDispatchCount.clear(); diff --git a/src/resources/extensions/gsd/auto/phases.ts b/src/resources/extensions/gsd/auto/phases.ts index 35ecad194..79837bb07 100644 --- a/src/resources/extensions/gsd/auto/phases.ts +++ b/src/resources/extensions/gsd/auto/phases.ts @@ -1283,11 +1283,29 @@ export async function runUnitPhase( debugLog("autoLoop", { phase: "exit", reason: "provider-pause", isTransient: unitResult.errorContext.isTransient }); return { action: "break", reason: "provider-pause" }; } + // Session creation timeout (not a structural error): pause auto-mode + // and let the provider-error-resume timer handle recovery. This matches + // the provider-pause path — break out cleanly, don't hard-stop. + // Structural errors (TypeError, is not a function) are NOT transient + // and must hard-stop to avoid infinite retry loops. + if ( + unitResult.errorContext?.isTransient && + unitResult.errorContext?.category === "timeout" + ) { + ctx.ui.notify( + `Session creation timed out for ${unitType} ${unitId}. Will retry.`, + "warning", + ); + debugLog("autoLoop", { phase: "session-timeout-pause", unitType, unitId }); + await deps.pauseAuto(ctx, pi); + return { action: "break", reason: "session-timeout" }; + } + // All other cancelled states (structural errors, non-transient failures): hard stop ctx.ui.notify( - `Session creation timed out or was cancelled for ${unitType} ${unitId}. Will retry.`, + `Session creation failed for ${unitType} ${unitId}: ${unitResult.errorContext?.message ?? "unknown"}. Stopping auto-mode.`, "warning", ); - await deps.stopAuto(ctx, pi, "Session creation failed"); + await deps.stopAuto(ctx, pi, `Session creation failed: ${unitResult.errorContext?.message ?? "unknown"}`); debugLog("autoLoop", { phase: "exit", reason: "session-failed" }); return { action: "break", reason: "session-failed" }; } diff --git a/src/resources/extensions/gsd/bootstrap/agent-end-recovery.ts b/src/resources/extensions/gsd/bootstrap/agent-end-recovery.ts index ac8737d09..b1dc306d2 100644 --- a/src/resources/extensions/gsd/bootstrap/agent-end-recovery.ts +++ b/src/resources/extensions/gsd/bootstrap/agent-end-recovery.ts @@ -19,7 +19,17 @@ import { const retryState = createRetryState(); const MAX_NETWORK_RETRIES = 2; -const MAX_TRANSIENT_AUTO_RESUMES = 3; +const MAX_TRANSIENT_AUTO_RESUMES = 8; + +/** + * Reset the module-level retry state so a resumed auto-session starts fresh. + * Called by provider-error-resume.ts before startAuto() — without this, the + * consecutiveTransientCount accumulates across pause/resume cycles and locks + * out auto-resume after MAX_TRANSIENT_AUTO_RESUMES total (not consecutive) errors. + */ +export function resetTransientRetryState(): void { + resetRetryState(retryState); +} async function pauseTransientWithBackoff( cls: ErrorClass, @@ -102,6 +112,29 @@ export async function handleAgentEnd( // ── 1. Classify ────────────────────────────────────────────────────── const cls = classifyError(errorMsg, explicitRetryAfterMs); + // ── 1b. Defer to Core RetryHandler for transient errors ───────────── + // The Core RetryHandler (agent-session.ts) processes retryable errors + // AFTER this extension handler, in the same _processAgentEvent() call. + // For transient errors (overloaded, rate limit, server), the Core will + // retry in-context — same session, same conversation — which is strictly + // better than our Layer 2 pause+resume (which creates a new session). + // + // If we react here AND the Core also retries, we race: pauseAuto tears + // down the session while agent.continue() starts a new turn. + // + // Solution: Do nothing for transient errors. The Core RetryHandler + // runs next in _processAgentEvent and will either: + // a) Retry successfully → new agent_end (success) → we see it next time + // b) Exhaust retries → the agent stays idle, autoLoop's unit timeout + // or stuck detection handles it + // + // We do NOT call resolveAgentEnd here — that would unblock autoLoop + // prematurely while the Core is still retrying in the same session. + // We do NOT call pauseAuto — that would tear down the session. + if (isTransient(cls)) { + return; + } + // Cap rate-limit backoff for CLI-style providers (openai-codex, google-gemini-cli) // which use per-user quotas with shorter windows (#2922). if (cls.kind === "rate-limit") { diff --git a/src/resources/extensions/gsd/bootstrap/provider-error-resume.ts b/src/resources/extensions/gsd/bootstrap/provider-error-resume.ts index 35efdcbf5..d5f01f96d 100644 --- a/src/resources/extensions/gsd/bootstrap/provider-error-resume.ts +++ b/src/resources/extensions/gsd/bootstrap/provider-error-resume.ts @@ -5,6 +5,7 @@ import type { } from "@gsd/pi-coding-agent"; import { getAutoDashboardData, startAuto, type AutoDashboardData } from "../auto.js"; +import { resetTransientRetryState } from "./agent-end-recovery.js"; type AutoResumeSnapshot = Pick; @@ -42,6 +43,11 @@ export async function resumeAutoAfterProviderDelay( return "missing-base"; } + // Reset the transient retry counter before restarting — without this, + // consecutiveTransientCount accumulates across pause/resume cycles and + // permanently locks out auto-resume after MAX_TRANSIENT_AUTO_RESUMES errors. + resetTransientRetryState(); + await deps.startAuto( ctx as ExtensionCommandContext, pi, diff --git a/src/resources/extensions/gsd/tests/journal-integration.test.ts b/src/resources/extensions/gsd/tests/journal-integration.test.ts index 846982e26..14e8b8be8 100644 --- a/src/resources/extensions/gsd/tests/journal-integration.test.ts +++ b/src/resources/extensions/gsd/tests/journal-integration.test.ts @@ -589,9 +589,9 @@ test("unit-end event contains errorContext when unit is cancelled with structure resolveAgentEndCancelled({ message: "Hard timeout error: exceeded limit", category: "timeout", isTransient: true }); const result = await unitPromise; - // Cancelled units break the loop before emitting unit-end + // Transient timeout cancellations pause (recoverable) instead of hard-stopping assert.equal(result.action, "break"); - assert.equal((result as any).reason, "session-failed"); + assert.equal((result as any).reason, "session-timeout"); // Verify error classification used structured errorContext on the window entry const entry = loopState.recentUnits[loopState.recentUnits.length - 1]; diff --git a/src/resources/extensions/gsd/tests/provider-errors.test.ts b/src/resources/extensions/gsd/tests/provider-errors.test.ts index e4ec992d4..54a1761b3 100644 --- a/src/resources/extensions/gsd/tests/provider-errors.test.ts +++ b/src/resources/extensions/gsd/tests/provider-errors.test.ts @@ -458,6 +458,66 @@ test("openai-codex-responses.ts extracts nested error fields", () => { ); }); +// ── Fix 1: resetTransientRetryState resets module-level singleton ──────────── + +test("resetTransientRetryState is exported from agent-end-recovery.ts", () => { + const src = readFileSync(join(__dirname, "..", "bootstrap", "agent-end-recovery.ts"), "utf-8"); + assert.ok( + src.includes("export function resetTransientRetryState"), + "agent-end-recovery.ts must export resetTransientRetryState for provider-error-resume.ts", + ); +}); + +test("provider-error-resume.ts calls resetTransientRetryState before startAuto", () => { + const src = readFileSync(join(__dirname, "..", "bootstrap", "provider-error-resume.ts"), "utf-8"); + assert.ok( + src.includes("resetTransientRetryState"), + "provider-error-resume.ts must import and call resetTransientRetryState", + ); + // Ensure reset is called BEFORE startAuto — order matters + const resetIdx = src.indexOf("resetTransientRetryState()"); + const startIdx = src.indexOf("await deps.startAuto("); + assert.ok( + resetIdx !== -1 && startIdx !== -1 && resetIdx < startIdx, + "resetTransientRetryState() must be called before deps.startAuto()", + ); +}); + +// ── Fix 2: Session creation timeout treated as transient in phases.ts ─────── + +test("phases.ts handles timeout session-creation failures with pause instead of stopAuto", () => { + const src = readFileSync(join(__dirname, "..", "auto", "phases.ts"), "utf-8"); + + // The cancelled + isTransient + category=timeout path must pause, not hard-stop + assert.ok( + src.includes('category === "timeout"'), + "phases.ts must check category === 'timeout' on transient cancelled unitResults", + ); + // Must call pauseAuto (not stopAuto) for timeout cancellations + assert.ok( + /category === "timeout"[\s\S]{0,300}pauseAuto/.test(src), + "phases.ts must call pauseAuto for session-timeout failures (not stopAuto or continue)", + ); + // Must NOT use action: "continue" for transient cancellations (causes infinite loops) + assert.ok( + !/isTransient[\s\S]{0,500}action:\s*"continue"/.test(src), + "phases.ts must NOT return action:continue for cancelled units — use break+pause instead", + ); +}); + +// ── Fix 3: MAX_TRANSIENT_AUTO_RESUMES raised to 8 ─────────────────────────── + +test("MAX_TRANSIENT_AUTO_RESUMES is at least 8 for sustained overload resilience", () => { + const src = readFileSync(join(__dirname, "..", "bootstrap", "agent-end-recovery.ts"), "utf-8"); + const match = src.match(/MAX_TRANSIENT_AUTO_RESUMES\s*=\s*(\d+)/); + assert.ok(match, "MAX_TRANSIENT_AUTO_RESUMES must be defined"); + const value = Number(match![1]); + assert.ok( + value >= 8, + `MAX_TRANSIENT_AUTO_RESUMES must be >= 8 for sustained overload resilience, got ${value}`, + ); +}); + // ── agent-session retryable regex handles server_error (#1166) ────────────── test("agent-session retryable error regex matches server_error (underscore)", () => { From 58b58d72902ca5d608d399e557192ee7bff7d444 Mon Sep 17 00:00:00 2001 From: deseltrus Date: Wed, 1 Apr 2026 07:52:09 +0200 Subject: [PATCH 2/2] =?UTF-8?q?ci:=20retrigger=20=E2=80=94=20previous=20fa?= =?UTF-8?q?ilures=20in=20unrelated=20tests=20(DB=20reconciliation,=20state?= =?UTF-8?q?-machine)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit