diff --git a/src/resources/extensions/gsd/auto-loop.ts b/src/resources/extensions/gsd/auto-loop.ts index 74fcc8f16..6400e9871 100644 --- a/src/resources/extensions/gsd/auto-loop.ts +++ b/src/resources/extensions/gsd/auto-loop.ts @@ -13,4 +13,4 @@ export { resolveAgentEnd, resolveAgentEndCancelled, isSessionSwitchInFlight, _re export { detectStuck } from "./auto/detect-stuck.js"; export { runUnit } from "./auto/run-unit.js"; export type { LoopDeps } from "./auto/loop-deps.js"; -export type { AgentEndEvent, UnitResult } from "./auto/types.js"; +export type { AgentEndEvent, ErrorContext, UnitResult } from "./auto/types.js"; diff --git a/src/resources/extensions/gsd/auto-timers.ts b/src/resources/extensions/gsd/auto-timers.ts index ae3ded014..22b70fa54 100644 --- a/src/resources/extensions/gsd/auto-timers.ts +++ b/src/resources/extensions/gsd/auto-timers.ts @@ -192,7 +192,7 @@ export function startUnitSupervision(sctx: SupervisionContext): void { const message = err instanceof Error ? err.message : String(err); console.error(`[idle-watchdog] Unhandled error: ${message}`); // Unblock any pending unit promise so the auto-loop is not orphaned. - resolveAgentEndCancelled(); + resolveAgentEndCancelled({ message: `Idle watchdog error: ${message}`, category: "idle", isTransient: true }); try { ctx.ui.notify(`Idle watchdog error: ${message}`, "warning"); } catch { /* best effort */ } @@ -226,7 +226,7 @@ export function startUnitSupervision(sctx: SupervisionContext): void { const message = err instanceof Error ? err.message : String(err); console.error(`[hard-timeout] Unhandled error: ${message}`); // Unblock any pending unit promise so the auto-loop is not orphaned. - resolveAgentEndCancelled(); + resolveAgentEndCancelled({ message: `Hard timeout error: ${message}`, category: "timeout", isTransient: true }); try { ctx.ui.notify(`Hard timeout error: ${message}`, "warning"); } catch { /* best effort */ } diff --git a/src/resources/extensions/gsd/auto/phases.ts b/src/resources/extensions/gsd/auto/phases.ts index 0f408105f..0f0976752 100644 --- a/src/resources/extensions/gsd/auto/phases.ts +++ b/src/resources/extensions/gsd/auto/phases.ts @@ -1039,17 +1039,16 @@ export async function runUnitPhase( ); // Tag the most recent window entry with error info for stuck detection - if (unitResult.status === "error" || unitResult.status === "cancelled") { - const lastEntry = loopState.recentUnits[loopState.recentUnits.length - 1]; - if (lastEntry) { + const lastEntry = loopState.recentUnits[loopState.recentUnits.length - 1]; + if (lastEntry) { + if (unitResult.errorContext) { + lastEntry.error = `${unitResult.errorContext.category}:${unitResult.errorContext.message}`.slice(0, 200); + } else if (unitResult.status === "error" || unitResult.status === "cancelled") { lastEntry.error = `${unitResult.status}:${unitType}/${unitId}`; - } - } else if (unitResult.event?.messages?.length) { - const lastMsg = unitResult.event.messages[unitResult.event.messages.length - 1]; - const msgStr = typeof lastMsg === "string" ? lastMsg : JSON.stringify(lastMsg); - if (/error|fail|exception/i.test(msgStr)) { - const lastEntry = loopState.recentUnits[loopState.recentUnits.length - 1]; - if (lastEntry) { + } else if (unitResult.event?.messages?.length) { + const lastMsg = unitResult.event.messages[unitResult.event.messages.length - 1]; + const msgStr = typeof lastMsg === "string" ? lastMsg : JSON.stringify(lastMsg); + if (/error|fail|exception/i.test(msgStr)) { lastEntry.error = msgStr.slice(0, 200); } } @@ -1122,7 +1121,7 @@ export async function runUnitPhase( s.unitRecoveryCount.delete(`${unitType}/${unitId}`); } - deps.emitJournalEvent({ ts: new Date().toISOString(), flowId: ic.flowId, seq: ic.nextSeq(), eventType: "unit-end", data: { unitType, unitId, status: unitResult.status, artifactVerified }, causedBy: { flowId: ic.flowId, seq: unitStartSeq } }); + deps.emitJournalEvent({ ts: new Date().toISOString(), flowId: ic.flowId, seq: ic.nextSeq(), eventType: "unit-end", data: { unitType, unitId, status: unitResult.status, artifactVerified, ...(unitResult.errorContext ? { errorContext: unitResult.errorContext } : {}) }, causedBy: { flowId: ic.flowId, seq: unitStartSeq } }); return { action: "next", data: { unitStartedAt: s.currentUnit.startedAt } }; } diff --git a/src/resources/extensions/gsd/auto/resolve.ts b/src/resources/extensions/gsd/auto/resolve.ts index 0eb3ef751..6de2eaeee 100644 --- a/src/resources/extensions/gsd/auto/resolve.ts +++ b/src/resources/extensions/gsd/auto/resolve.ts @@ -8,7 +8,7 @@ * Imports from: auto/types */ -import type { UnitResult, AgentEndEvent } from "./types.js"; +import type { UnitResult, AgentEndEvent, ErrorContext } from "./types.js"; import type { AutoSession } from "./session.js"; import { debugLog } from "../debug-logger.js"; @@ -77,12 +77,12 @@ export function isSessionSwitchInFlight(): boolean { * blocks to ensure the autoLoop is never stuck awaiting a promise that * will never resolve. Safe to call when no resolver is pending (no-op). */ -export function resolveAgentEndCancelled(): void { +export function resolveAgentEndCancelled(errorContext?: ErrorContext): void { if (_currentResolve) { debugLog("resolveAgentEndCancelled", { status: "resolving-cancelled" }); const r = _currentResolve; _currentResolve = null; - r({ status: "cancelled" }); + r({ status: "cancelled", ...(errorContext ? { errorContext } : {}) }); } } diff --git a/src/resources/extensions/gsd/auto/run-unit.ts b/src/resources/extensions/gsd/auto/run-unit.ts index aa078676b..47512d395 100644 --- a/src/resources/extensions/gsd/auto/run-unit.ts +++ b/src/resources/extensions/gsd/auto/run-unit.ts @@ -58,13 +58,13 @@ export async function runUnit( unitId, error: msg, }); - return { status: "cancelled" }; + return { status: "cancelled", errorContext: { message: `Session creation failed: ${msg}`, category: "session-failed", isTransient: true } }; } if (sessionTimeoutHandle) clearTimeout(sessionTimeoutHandle); if (sessionResult.cancelled) { debugLog("runUnit-session-timeout", { unitType, unitId }); - return { status: "cancelled" }; + return { status: "cancelled", errorContext: { message: "Session creation timed out", category: "timeout", isTransient: true } }; } if (!s.active) { diff --git a/src/resources/extensions/gsd/auto/types.ts b/src/resources/extensions/gsd/auto/types.ts index 59375bd9d..d3e342f82 100644 --- a/src/resources/extensions/gsd/auto/types.ts +++ b/src/resources/extensions/gsd/auto/types.ts @@ -47,12 +47,25 @@ export interface AgentEndEvent { messages: unknown[]; } +/** + * Structured error context attached to a UnitResult when the unit ends + * due to an infrastructure or timeout error (not user-driven cancellation). + */ +export interface ErrorContext { + message: string; + category: "provider" | "timeout" | "idle" | "network" | "aborted" | "session-failed" | "unknown"; + stopReason?: string; + isTransient?: boolean; + retryAfterMs?: number; +} + /** * Result of a single unit execution (one iteration of the loop). */ export interface UnitResult { status: "completed" | "cancelled" | "error"; event?: AgentEndEvent; + errorContext?: ErrorContext; } // ─── Phase pipeline types ──────────────────────────────────────────────────── diff --git a/src/resources/extensions/gsd/tests/agent-end-retry.test.ts b/src/resources/extensions/gsd/tests/agent-end-retry.test.ts index 6db2f9d36..c1ddfc02b 100644 --- a/src/resources/extensions/gsd/tests/agent-end-retry.test.ts +++ b/src/resources/extensions/gsd/tests/agent-end-retry.test.ts @@ -116,7 +116,7 @@ test("auto-timers.ts idle watchdog catch calls resolveAgentEndCancelled", () => // Check that resolveAgentEndCancelled is called near this catch const catchRegion = source.slice(Math.max(0, idleCatchIdx - 200), idleCatchIdx + 200); assert.ok( - catchRegion.includes("resolveAgentEndCancelled()"), + catchRegion.includes("resolveAgentEndCancelled("), "idle watchdog catch block must call resolveAgentEndCancelled", ); }); @@ -129,7 +129,7 @@ test("auto-timers.ts hard timeout catch calls resolveAgentEndCancelled", () => { assert.ok(hardCatchIdx > -1, "hard timeout catch block must exist"); const catchRegion = source.slice(Math.max(0, hardCatchIdx - 200), hardCatchIdx + 200); assert.ok( - catchRegion.includes("resolveAgentEndCancelled()"), + catchRegion.includes("resolveAgentEndCancelled("), "hard timeout catch block must call resolveAgentEndCancelled", ); }); diff --git a/src/resources/extensions/gsd/tests/auto-loop.test.ts b/src/resources/extensions/gsd/tests/auto-loop.test.ts index c77fb44df..5e01617eb 100644 --- a/src/resources/extensions/gsd/tests/auto-loop.test.ts +++ b/src/resources/extensions/gsd/tests/auto-loop.test.ts @@ -1745,6 +1745,41 @@ test("resolveAgentEndCancelled prevents orphaned promise after abort path", asyn assert.equal(result.status, "cancelled"); }); +test("resolveAgentEndCancelled with errorContext passes it through to resolved promise", async () => { + _resetPendingResolve(); + + const { _setCurrentResolve } = await import("../auto/resolve.js"); + + const p = new Promise((r) => { + _setCurrentResolve(r); + }); + + resolveAgentEndCancelled({ message: "test timeout", category: "timeout", isTransient: true }); + + const resolved = await p; + assert.equal(resolved.status, "cancelled"); + assert.ok(resolved.errorContext, "errorContext must be present"); + assert.equal(resolved.errorContext!.category, "timeout"); + assert.equal(resolved.errorContext!.message, "test timeout"); + assert.equal(resolved.errorContext!.isTransient, true); +}); + +test("resolveAgentEndCancelled without args produces no errorContext field", async () => { + _resetPendingResolve(); + + const { _setCurrentResolve } = await import("../auto/resolve.js"); + + const p = new Promise((r) => { + _setCurrentResolve(r); + }); + + resolveAgentEndCancelled(); + + const resolved = await p; + assert.equal(resolved.status, "cancelled"); + assert.equal(resolved.errorContext, undefined, "errorContext must not be present when no args passed"); +}); + // ─── #1571: artifact verification retry ────────────────────────────────────── test("autoLoop re-iterates when postUnitPreVerification returns retry (#1571)", async () => { diff --git a/src/resources/extensions/gsd/tests/journal-integration.test.ts b/src/resources/extensions/gsd/tests/journal-integration.test.ts index ddbc096e5..49f64d7a3 100644 --- a/src/resources/extensions/gsd/tests/journal-integration.test.ts +++ b/src/resources/extensions/gsd/tests/journal-integration.test.ts @@ -505,3 +505,42 @@ test("milestone-transition event is emitted when milestone changes", async () => assert.equal((transitionEvents[0].data as any).to, "M002"); assert.equal(transitionEvents[0].flowId, ic.flowId); }); + +test("unit-end event contains errorContext when unit is cancelled with structured error", async () => { + const capture = createEventCapture(); + const { resolveAgentEndCancelled, _resetPendingResolve } = await import("../auto-loop.js"); + _resetPendingResolve(); + + const deps = makeMockDeps(capture); + const ic = makeIC(deps); + const iterData: IterationData = { + unitType: "execute-task", + unitId: "M001/S01/T01", + prompt: "do stuff", + finalPrompt: "do stuff", + pauseAfterUatDispatch: false, + state: { phase: "executing", activeMilestone: { id: "M001" }, activeSlice: { id: "S01" }, registry: [], blockers: [] } as any, + mid: "M001", + midTitle: "Test", + isRetry: false, + previousTier: undefined, + }; + const loopState: LoopState = { recentUnits: [{ key: "execute-task/M001/S01/T01" }], stuckRecoveryAttempts: 0 }; + + const unitPromise = runUnitPhase(ic, iterData, loopState); + await new Promise(r => setTimeout(r, 50)); + + // Resolve with errorContext (simulates a timeout cancel) + resolveAgentEndCancelled({ message: "Hard timeout error: exceeded limit", category: "timeout", isTransient: true }); + + const result = await unitPromise; + // Cancelled units break the loop before emitting unit-end + assert.equal(result.action, "break"); + assert.equal((result as any).reason, "session-failed"); + + // Verify error classification used structured errorContext on the window entry + const entry = loopState.recentUnits[loopState.recentUnits.length - 1]; + assert.ok(entry.error, "window entry must have error set"); + assert.ok(entry.error!.startsWith("timeout:"), "error must start with category from errorContext"); + assert.ok(entry.error!.includes("Hard timeout error"), "error must include the errorContext message"); +});