fix: auto-mode stops after provider errors (#2762) (#2764)

* feat: Registered 6 MCP tools (gsd_execute, gsd_status, gsd_result, gsd_…

- "packages/mcp-server/src/server.ts"
- "packages/mcp-server/src/cli.ts"
- "packages/mcp-server/src/index.ts"
- "packages/rpc-client/dist/index.d.ts"

GSD-Task: S05/T02

* docs: Added 31 integration tests, build pipeline, and consumer README f…

- "packages/mcp-server/src/mcp-server.test.ts"
- "packages/mcp-server/README.md"
- "packages/mcp-server/dist/"

GSD-Task: S05/T03

* fix: prevent auto-mode hard stop on provider errors and suppress duplicate async_job_result follow-ups (#2762)

Two compounding bugs caused auto-mode to silently die after unit completion:

1. async_job_result follow-ups fired after unit completion because deliverResult
   ran synchronously in the job promise .then() chain, racing with await_job's
   .then() that sets job.awaited=true. Deferring delivery by one microtask via
   queueMicrotask ensures await_job marks the job first.

2. A provider-error pause was converted into a hard stop: pauseAuto resolved the
   unit promise with {status:"cancelled"} but no ErrorContext, so runUnitPhase
   treated it identically to a session-creation timeout and called stopAuto.
   Now pauseAuto accepts and forwards ErrorContext, and runUnitPhase checks for
   category:"provider" to break without hard-stopping.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* test: update source-scan assertion for new pauseAuto signature

The structural test checked for `resolveAgentEndCancelled()` with empty
parens. Now that pauseAuto passes _errorContext, the assertion matches the
call prefix `resolveAgentEndCancelled(` instead.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
TÂCHES 2026-03-26 17:55:20 -06:00 committed by GitHub
parent 1c2d7ab307
commit 4f1ff1fe28
5 changed files with 27 additions and 6 deletions

View file

@ -172,7 +172,10 @@ export class AsyncJobManager {
private deliverResult(job: Job): void {
if (!this.onJobComplete) return;
this.onJobComplete(job);
// Defer delivery by one microtask so await_job's .then() chain runs first
// and can set job.awaited = true before onJobComplete checks it (#2762).
const cb = this.onJobComplete;
queueMicrotask(() => cb(job));
}
private scheduleEviction(id: string): void {

View file

@ -186,7 +186,7 @@ import {
postUnitPostVerification,
} from "./auto-post-unit.js";
import { bootstrapAutoSession, type BootstrapDeps } from "./auto-start.js";
import { autoLoop, resolveAgentEnd, resolveAgentEndCancelled, _resetPendingResolve, isSessionSwitchInFlight, type LoopDeps } from "./auto-loop.js";
import { autoLoop, resolveAgentEnd, resolveAgentEndCancelled, _resetPendingResolve, isSessionSwitchInFlight, type LoopDeps, type ErrorContext } from "./auto-loop.js";
import {
WorktreeResolver,
type WorktreeResolverDeps,
@ -800,11 +800,14 @@ export async function stopAuto(
export async function pauseAuto(
ctx?: ExtensionContext,
_pi?: ExtensionAPI,
_errorContext?: ErrorContext,
): Promise<void> {
if (!s.active) return;
clearUnitTimeout();
// Unblock any pending unit promise so the auto-loop is not orphaned.
resolveAgentEndCancelled();
// Pass errorContext so runUnitPhase can distinguish user-initiated pause
// from provider-error pause and avoid hard-stopping (#2762).
resolveAgentEndCancelled(_errorContext);
s.pausedSessionFile = ctx?.sessionManager?.getSessionFile() ?? null;

View file

@ -1069,6 +1069,12 @@ export async function runUnitPhase(
}
if (unitResult.status === "cancelled") {
// Provider-error pause: pauseAuto already handled cleanup and scheduled
// recovery. Don't hard-stop — just break out of the loop (#2762).
if (unitResult.errorContext?.category === "provider") {
debugLog("autoLoop", { phase: "exit", reason: "provider-pause", isTransient: unitResult.errorContext.isTransient });
return { action: "break", reason: "provider-pause" };
}
ctx.ui.notify(
`Session creation timed out or was cancelled for ${unitType} ${unitId}. Will retry.`,
"warning",

View file

@ -33,7 +33,12 @@ async function pauseTransientWithBackoff(
if (!allowAutoResume) {
ctx.ui.notify(`Transient provider errors persisted after ${MAX_TRANSIENT_AUTO_RESUMES} auto-resume attempts. Pausing for manual review.`, "warning");
}
await pauseAutoForProviderError(ctx.ui, errorDetail, () => pauseAuto(ctx, pi), {
await pauseAutoForProviderError(ctx.ui, errorDetail, () => pauseAuto(ctx, pi, {
message: `Provider error: ${errorDetail}`,
category: "provider",
isTransient: allowAutoResume,
retryAfterMs,
}), {
isRateLimit,
isTransient: allowAutoResume,
retryAfterMs,
@ -161,7 +166,11 @@ export async function handleAgentEnd(
}
// --- Permanent / unknown: pause indefinitely ---
await pauseAutoForProviderError(ctx.ui, errorDetail, () => pauseAuto(ctx, pi), {
await pauseAutoForProviderError(ctx.ui, errorDetail, () => pauseAuto(ctx, pi, {
message: `Provider error: ${errorDetail}`,
category: "provider",
isTransient: false,
}), {
isRateLimit: false,
isTransient: false,
retryAfterMs: 0,

View file

@ -102,7 +102,7 @@ test("pauseAuto calls resolveAgentEndCancelled to unblock the loop", () => {
const fnBlock = source.slice(fnIdx, source.indexOf("\n/**\n * Build", fnIdx + 100));
assert.ok(
fnBlock.includes("resolveAgentEndCancelled()"),
fnBlock.includes("resolveAgentEndCancelled("),
"pauseAuto must call resolveAgentEndCancelled to unblock the auto-loop promise",
);
});