From 4f1ff1fe28e03ff0d1af28a97f520ae1ee0d6849 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?T=C3=82CHES?= Date: Thu, 26 Mar 2026 17:55:20 -0600 Subject: [PATCH] fix: auto-mode stops after provider errors (#2762) (#2764) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: Registered 6 MCP tools (gsd_execute, gsd_status, gsd_result, gsd_… - "packages/mcp-server/src/server.ts" - "packages/mcp-server/src/cli.ts" - "packages/mcp-server/src/index.ts" - "packages/rpc-client/dist/index.d.ts" GSD-Task: S05/T02 * docs: Added 31 integration tests, build pipeline, and consumer README f… - "packages/mcp-server/src/mcp-server.test.ts" - "packages/mcp-server/README.md" - "packages/mcp-server/dist/" GSD-Task: S05/T03 * fix: prevent auto-mode hard stop on provider errors and suppress duplicate async_job_result follow-ups (#2762) Two compounding bugs caused auto-mode to silently die after unit completion: 1. async_job_result follow-ups fired after unit completion because deliverResult ran synchronously in the job promise .then() chain, racing with await_job's .then(), which sets job.awaited=true. Deferring delivery by one microtask via queueMicrotask ensures await_job marks the job first. 2. A provider-error pause was converted into a hard stop because pauseAuto resolved the unit promise with {status:"cancelled"} but no ErrorContext, so runUnitPhase treated it identically to a session-creation timeout and called stopAuto. Now pauseAuto accepts and forwards ErrorContext, and runUnitPhase checks for category:"provider" to break without hard-stopping. Co-Authored-By: Claude Opus 4.6 (1M context) * test: update source-scan assertion for new pauseAuto signature The structural test checked for `resolveAgentEndCancelled()` with empty parens. Now that pauseAuto passes _errorContext, match only the call prefix `resolveAgentEndCancelled(`. 
Co-Authored-By: Claude Opus 4.6 (1M context) --------- Co-authored-by: Claude Opus 4.6 (1M context) --- src/resources/extensions/async-jobs/job-manager.ts | 5 ++++- src/resources/extensions/gsd/auto.ts | 7 +++++-- src/resources/extensions/gsd/auto/phases.ts | 6 ++++++ .../extensions/gsd/bootstrap/agent-end-recovery.ts | 13 +++++++++++-- .../extensions/gsd/tests/agent-end-retry.test.ts | 2 +- 5 files changed, 27 insertions(+), 6 deletions(-) diff --git a/src/resources/extensions/async-jobs/job-manager.ts b/src/resources/extensions/async-jobs/job-manager.ts index c5b1abf4e..10ce3cd41 100644 --- a/src/resources/extensions/async-jobs/job-manager.ts +++ b/src/resources/extensions/async-jobs/job-manager.ts @@ -172,7 +172,10 @@ export class AsyncJobManager { private deliverResult(job: Job): void { if (!this.onJobComplete) return; - this.onJobComplete(job); + // Defer delivery by one microtask so await_job's .then() chain runs first + // and can set job.awaited = true before onJobComplete checks it (#2762). 
+ const cb = this.onJobComplete; + queueMicrotask(() => cb(job)); } private scheduleEviction(id: string): void { diff --git a/src/resources/extensions/gsd/auto.ts b/src/resources/extensions/gsd/auto.ts index 3c4a50d4e..131272345 100644 --- a/src/resources/extensions/gsd/auto.ts +++ b/src/resources/extensions/gsd/auto.ts @@ -186,7 +186,7 @@ import { postUnitPostVerification, } from "./auto-post-unit.js"; import { bootstrapAutoSession, type BootstrapDeps } from "./auto-start.js"; -import { autoLoop, resolveAgentEnd, resolveAgentEndCancelled, _resetPendingResolve, isSessionSwitchInFlight, type LoopDeps } from "./auto-loop.js"; +import { autoLoop, resolveAgentEnd, resolveAgentEndCancelled, _resetPendingResolve, isSessionSwitchInFlight, type LoopDeps, type ErrorContext } from "./auto-loop.js"; import { WorktreeResolver, type WorktreeResolverDeps, @@ -800,11 +800,14 @@ export async function stopAuto( export async function pauseAuto( ctx?: ExtensionContext, _pi?: ExtensionAPI, + _errorContext?: ErrorContext, ): Promise { if (!s.active) return; clearUnitTimeout(); // Unblock any pending unit promise so the auto-loop is not orphaned. - resolveAgentEndCancelled(); + // Pass errorContext so runUnitPhase can distinguish user-initiated pause + // from provider-error pause and avoid hard-stopping (#2762). + resolveAgentEndCancelled(_errorContext); s.pausedSessionFile = ctx?.sessionManager?.getSessionFile() ?? null; diff --git a/src/resources/extensions/gsd/auto/phases.ts b/src/resources/extensions/gsd/auto/phases.ts index 252797be1..4ef9ce1c1 100644 --- a/src/resources/extensions/gsd/auto/phases.ts +++ b/src/resources/extensions/gsd/auto/phases.ts @@ -1069,6 +1069,12 @@ export async function runUnitPhase( } if (unitResult.status === "cancelled") { + // Provider-error pause: pauseAuto already handled cleanup and scheduled + // recovery. Don't hard-stop — just break out of the loop (#2762). 
+ if (unitResult.errorContext?.category === "provider") { + debugLog("autoLoop", { phase: "exit", reason: "provider-pause", isTransient: unitResult.errorContext.isTransient }); + return { action: "break", reason: "provider-pause" }; + } ctx.ui.notify( `Session creation timed out or was cancelled for ${unitType} ${unitId}. Will retry.`, "warning", diff --git a/src/resources/extensions/gsd/bootstrap/agent-end-recovery.ts b/src/resources/extensions/gsd/bootstrap/agent-end-recovery.ts index 302671da4..1c5862260 100644 --- a/src/resources/extensions/gsd/bootstrap/agent-end-recovery.ts +++ b/src/resources/extensions/gsd/bootstrap/agent-end-recovery.ts @@ -33,7 +33,12 @@ async function pauseTransientWithBackoff( if (!allowAutoResume) { ctx.ui.notify(`Transient provider errors persisted after ${MAX_TRANSIENT_AUTO_RESUMES} auto-resume attempts. Pausing for manual review.`, "warning"); } - await pauseAutoForProviderError(ctx.ui, errorDetail, () => pauseAuto(ctx, pi), { + await pauseAutoForProviderError(ctx.ui, errorDetail, () => pauseAuto(ctx, pi, { + message: `Provider error: ${errorDetail}`, + category: "provider", + isTransient: allowAutoResume, + retryAfterMs, + }), { isRateLimit, isTransient: allowAutoResume, retryAfterMs, @@ -161,7 +166,11 @@ export async function handleAgentEnd( } // --- Permanent / unknown: pause indefinitely --- - await pauseAutoForProviderError(ctx.ui, errorDetail, () => pauseAuto(ctx, pi), { + await pauseAutoForProviderError(ctx.ui, errorDetail, () => pauseAuto(ctx, pi, { + message: `Provider error: ${errorDetail}`, + category: "provider", + isTransient: false, + }), { isRateLimit: false, isTransient: false, retryAfterMs: 0, diff --git a/src/resources/extensions/gsd/tests/agent-end-retry.test.ts b/src/resources/extensions/gsd/tests/agent-end-retry.test.ts index c1ddfc02b..955fabf5a 100644 --- a/src/resources/extensions/gsd/tests/agent-end-retry.test.ts +++ b/src/resources/extensions/gsd/tests/agent-end-retry.test.ts @@ -102,7 +102,7 @@ 
test("pauseAuto calls resolveAgentEndCancelled to unblock the loop", () => { const fnBlock = source.slice(fnIdx, source.indexOf("\n/**\n * Build", fnIdx + 100)); assert.ok( - fnBlock.includes("resolveAgentEndCancelled()"), + fnBlock.includes("resolveAgentEndCancelled("), "pauseAuto must call resolveAgentEndCancelled to unblock the auto-loop promise", ); });