fix(auto): resilient transient error recovery — defer to Core RetryHandler and fix cmdCtx race

Three related bugs cause auto-mode to permanently stop on transient provider errors (overloaded_error, rate limits, 503s) that should be recoverable: 1. **Layer 1/Layer 2 race condition:** The extension's handleAgentEnd processes agent_end events BEFORE the Core RetryHandler in _processAgentEvent. For transient errors, both layers reacted simultaneously — the extension called pauseAuto (tearing down the session) while the Core called agent.continue() (in-context retry). This ripped the agent out of its context window mid- recovery. Fix: handleAgentEnd now returns immediately for transient errors, letting the Core retry in-context with full conversation preservation. 2. **retryState accumulation across resume cycles:** consecutiveTransientCount in agent-end-recovery.ts accumulated across pause/resume cycles without resetting, permanently locking out auto-resume after MAX_TRANSIENT_AUTO_RESUMES total (not per-cycle) errors. Fix: resetTransientRetryState() is called before startAuto() in the resume path. MAX_TRANSIENT_AUTO_RESUMES raised from 3 to 8 to cover ~30 minutes of sustained provider overload. 3. **ExtensionContext lacks newSession():** The provider-error resume callback receives an ExtensionContext (from the agent_end hook), not an ExtensionCommandContext. startAuto() overwrote s.cmdCtx with this incomplete context, causing 'newSession is not a function' on every subsequent runUnit() call. Fix: startAuto() now checks for newSession before overwriting — on provider-error resume, it preserves the original ExtensionCommandContext. Bonus: Session creation timeout (category=timeout) now calls pauseAuto instead of stopAuto, matching the provider-pause path. Structural errors (TypeError) still hard-stop to prevent infinite retry loops. Fixes #2492
2026-04-01 07:39:09 +02:00 · 2026-04-01 07:39:09 +02:00 · 44aecc9a3f
commit 44aecc9a3f
parent a6b7febc5e
6 changed files with 134 additions and 6 deletions
--- a/src/resources/extensions/gsd/auto.ts
+++ b/src/resources/extensions/gsd/auto.ts
@ -1191,7 +1191,18 @@ export async function startAuto(
    s.active = true;
    s.verbose = verboseMode;
    s.stepMode = requestedStepMode;
-    s.cmdCtx = ctx;
+    // Preserve the original cmdCtx (ExtensionCommandContext with newSession)
+    // when resuming from a provider-error pause. The resume callback receives
+    // an ExtensionContext (from the agent_end hook) which lacks newSession —
+    // using it would crash runUnit with "newSession is not a function".
+    // Only override if the new ctx actually has newSession (user-initiated resume).
+    if ("newSession" in ctx && typeof (ctx as any).newSession === "function") {
+      s.cmdCtx = ctx;
+    } else if (!s.cmdCtx) {
+      // No saved cmdCtx — this shouldn't happen, but handle gracefully
+      s.cmdCtx = ctx as ExtensionCommandContext;
+    }
+    // else: keep existing s.cmdCtx which has the real newSession
    s.basePath = base;
    setLogBasePath(base);
    s.unitDispatchCount.clear();
--- a/src/resources/extensions/gsd/auto/phases.ts
+++ b/src/resources/extensions/gsd/auto/phases.ts
@ -1283,11 +1283,29 @@ export async function runUnitPhase(
      debugLog("autoLoop", { phase: "exit", reason: "provider-pause", isTransient: unitResult.errorContext.isTransient });
      return { action: "break", reason: "provider-pause" };
    }
+    // Session creation timeout (not a structural error): pause auto-mode
+    // and let the provider-error-resume timer handle recovery. This matches
+    // the provider-pause path — break out cleanly, don't hard-stop.
+    // Structural errors (TypeError, is not a function) are NOT transient
+    // and must hard-stop to avoid infinite retry loops.
+    if (
+      unitResult.errorContext?.isTransient &&
+      unitResult.errorContext?.category === "timeout"
+    ) {
+      ctx.ui.notify(
+        `Session creation timed out for ${unitType} ${unitId}. Will retry.`,
+        "warning",
+      );
+      debugLog("autoLoop", { phase: "session-timeout-pause", unitType, unitId });
+      await deps.pauseAuto(ctx, pi);
+      return { action: "break", reason: "session-timeout" };
+    }
+    // All other cancelled states (structural errors, non-transient failures): hard stop
    ctx.ui.notify(
-      `Session creation timed out or was cancelled for ${unitType} ${unitId}. Will retry.`,
+      `Session creation failed for ${unitType} ${unitId}: ${unitResult.errorContext?.message ?? "unknown"}. Stopping auto-mode.`,
      "warning",
    );
-    await deps.stopAuto(ctx, pi, "Session creation failed");
+    await deps.stopAuto(ctx, pi, `Session creation failed: ${unitResult.errorContext?.message ?? "unknown"}`);
    debugLog("autoLoop", { phase: "exit", reason: "session-failed" });
    return { action: "break", reason: "session-failed" };
  }
--- a/src/resources/extensions/gsd/bootstrap/agent-end-recovery.ts
+++ b/src/resources/extensions/gsd/bootstrap/agent-end-recovery.ts
@ -19,7 +19,17 @@ import {

 const retryState = createRetryState();
 const MAX_NETWORK_RETRIES = 2;
-const MAX_TRANSIENT_AUTO_RESUMES = 3;
+const MAX_TRANSIENT_AUTO_RESUMES = 8;
+
+/**
+ * Reset the module-level retry state so a resumed auto-session starts fresh.
+ * Called by provider-error-resume.ts before startAuto() — without this, the
+ * consecutiveTransientCount accumulates across pause/resume cycles and locks
+ * out auto-resume after MAX_TRANSIENT_AUTO_RESUMES total (not consecutive) errors.
+ */
+export function resetTransientRetryState(): void {
+  resetRetryState(retryState);
+}

 async function pauseTransientWithBackoff(
  cls: ErrorClass,
@ -102,6 +112,29 @@ export async function handleAgentEnd(
    // ── 1. Classify ──────────────────────────────────────────────────────
    const cls = classifyError(errorMsg, explicitRetryAfterMs);

+    // ── 1b. Defer to Core RetryHandler for transient errors ─────────────
+    // The Core RetryHandler (agent-session.ts) processes retryable errors
+    // AFTER this extension handler, in the same _processAgentEvent() call.
+    // For transient errors (overloaded, rate limit, server), the Core will
+    // retry in-context — same session, same conversation — which is strictly
+    // better than our Layer 2 pause+resume (which creates a new session).
+    //
+    // If we react here AND the Core also retries, we race: pauseAuto tears
+    // down the session while agent.continue() starts a new turn.
+    //
+    // Solution: Do nothing for transient errors. The Core RetryHandler
+    // runs next in _processAgentEvent and will either:
+    //   a) Retry successfully → new agent_end (success) → we see it next time
+    //   b) Exhaust retries → the agent stays idle, autoLoop's unit timeout
+    //      or stuck detection handles it
+    //
+    // We do NOT call resolveAgentEnd here — that would unblock autoLoop
+    // prematurely while the Core is still retrying in the same session.
+    // We do NOT call pauseAuto — that would tear down the session.
+    if (isTransient(cls)) {
+      return;
+    }
+
    // Cap rate-limit backoff for CLI-style providers (openai-codex, google-gemini-cli)
    // which use per-user quotas with shorter windows (#2922).
    if (cls.kind === "rate-limit") {
--- a/src/resources/extensions/gsd/bootstrap/provider-error-resume.ts
+++ b/src/resources/extensions/gsd/bootstrap/provider-error-resume.ts
@ -5,6 +5,7 @@ import type {
 } from "@gsd/pi-coding-agent";

 import { getAutoDashboardData, startAuto, type AutoDashboardData } from "../auto.js";
+import { resetTransientRetryState } from "./agent-end-recovery.js";

 type AutoResumeSnapshot = Pick<AutoDashboardData, "active" | "paused" | "stepMode" | "basePath">;

@ -42,6 +43,11 @@ export async function resumeAutoAfterProviderDelay(
    return "missing-base";
  }

+  // Reset the transient retry counter before restarting — without this,
+  // consecutiveTransientCount accumulates across pause/resume cycles and
+  // permanently locks out auto-resume after MAX_TRANSIENT_AUTO_RESUMES errors.
+  resetTransientRetryState();
+
  await deps.startAuto(
    ctx as ExtensionCommandContext,
    pi,
--- a/src/resources/extensions/gsd/tests/journal-integration.test.ts
+++ b/src/resources/extensions/gsd/tests/journal-integration.test.ts
@ -589,9 +589,9 @@ test("unit-end event contains errorContext when unit is cancelled with structure
  resolveAgentEndCancelled({ message: "Hard timeout error: exceeded limit", category: "timeout", isTransient: true });

  const result = await unitPromise;
-  // Cancelled units break the loop before emitting unit-end
+  // Transient timeout cancellations pause (recoverable) instead of hard-stopping
  assert.equal(result.action, "break");
-  assert.equal((result as any).reason, "session-failed");
+  assert.equal((result as any).reason, "session-timeout");

  // Verify error classification used structured errorContext on the window entry
  const entry = loopState.recentUnits[loopState.recentUnits.length - 1];
--- a/src/resources/extensions/gsd/tests/provider-errors.test.ts
+++ b/src/resources/extensions/gsd/tests/provider-errors.test.ts
@ -458,6 +458,66 @@ test("openai-codex-responses.ts extracts nested error fields", () => {
  );
 });

+// ── Fix 1: resetTransientRetryState resets module-level singleton ────────────
+
+test("resetTransientRetryState is exported from agent-end-recovery.ts", () => {
+  const src = readFileSync(join(__dirname, "..", "bootstrap", "agent-end-recovery.ts"), "utf-8");
+  assert.ok(
+    src.includes("export function resetTransientRetryState"),
+    "agent-end-recovery.ts must export resetTransientRetryState for provider-error-resume.ts",
+  );
+});
+
+test("provider-error-resume.ts calls resetTransientRetryState before startAuto", () => {
+  const src = readFileSync(join(__dirname, "..", "bootstrap", "provider-error-resume.ts"), "utf-8");
+  assert.ok(
+    src.includes("resetTransientRetryState"),
+    "provider-error-resume.ts must import and call resetTransientRetryState",
+  );
+  // Ensure reset is called BEFORE startAuto — order matters
+  const resetIdx = src.indexOf("resetTransientRetryState()");
+  const startIdx = src.indexOf("await deps.startAuto(");
+  assert.ok(
+    resetIdx !== -1 && startIdx !== -1 && resetIdx < startIdx,
+    "resetTransientRetryState() must be called before deps.startAuto()",
+  );
+});
+
+// ── Fix 2: Session creation timeout treated as transient in phases.ts ───────
+
+test("phases.ts handles timeout session-creation failures with pause instead of stopAuto", () => {
+  const src = readFileSync(join(__dirname, "..", "auto", "phases.ts"), "utf-8");
+
+  // The cancelled + isTransient + category=timeout path must pause, not hard-stop
+  assert.ok(
+    src.includes('category === "timeout"'),
+    "phases.ts must check category === 'timeout' on transient cancelled unitResults",
+  );
+  // Must call pauseAuto (not stopAuto) for timeout cancellations
+  assert.ok(
+    /category === "timeout"[\s\S]{0,300}pauseAuto/.test(src),
+    "phases.ts must call pauseAuto for session-timeout failures (not stopAuto or continue)",
+  );
+  // Must NOT use action: "continue" for transient cancellations (causes infinite loops)
+  assert.ok(
+    !/isTransient[\s\S]{0,500}action:\s*"continue"/.test(src),
+    "phases.ts must NOT return action:continue for cancelled units — use break+pause instead",
+  );
+});
+
+// ── Fix 3: MAX_TRANSIENT_AUTO_RESUMES raised to 8 ───────────────────────────
+
+test("MAX_TRANSIENT_AUTO_RESUMES is at least 8 for sustained overload resilience", () => {
+  const src = readFileSync(join(__dirname, "..", "bootstrap", "agent-end-recovery.ts"), "utf-8");
+  const match = src.match(/MAX_TRANSIENT_AUTO_RESUMES\s*=\s*(\d+)/);
+  assert.ok(match, "MAX_TRANSIENT_AUTO_RESUMES must be defined");
+  const value = Number(match![1]);
+  assert.ok(
+    value >= 8,
+    `MAX_TRANSIENT_AUTO_RESUMES must be >= 8 for sustained overload resilience, got ${value}`,
+  );
+});
+
 // ── agent-session retryable regex handles server_error (#1166) ──────────────

 test("agent-session retryable error regex matches server_error (underscore)", () => {