From dc723b2519477050cf931339cb462598514e2091 Mon Sep 17 00:00:00 2001 From: Iouri Goussev Date: Thu, 26 Mar 2026 01:01:15 -0400 Subject: [PATCH] feat(gsd): wire structured error propagation through UnitResult Add ErrorContext interface to UnitResult so error information (provider errors, timeouts, idle watchdog kills) is no longer discarded at the resolve boundary. The four call sites that previously threw away context now attach typed error metadata with category, message, and transience. Downstream consumers (stuck detection in phases.ts, journal unit-end events) use the structured errorContext field directly instead of fragile regex heuristics on message content. --- src/resources/extensions/gsd/auto-loop.ts | 2 +- src/resources/extensions/gsd/auto-timers.ts | 4 +- src/resources/extensions/gsd/auto/phases.ts | 21 +++++----- src/resources/extensions/gsd/auto/resolve.ts | 6 +-- src/resources/extensions/gsd/auto/run-unit.ts | 4 +- src/resources/extensions/gsd/auto/types.ts | 13 +++++++ .../gsd/tests/agent-end-retry.test.ts | 4 +- .../extensions/gsd/tests/auto-loop.test.ts | 35 +++++++++++++++++ .../gsd/tests/journal-integration.test.ts | 39 +++++++++++++++++++ 9 files changed, 107 insertions(+), 21 deletions(-) diff --git a/src/resources/extensions/gsd/auto-loop.ts b/src/resources/extensions/gsd/auto-loop.ts index 74fcc8f16..6400e9871 100644 --- a/src/resources/extensions/gsd/auto-loop.ts +++ b/src/resources/extensions/gsd/auto-loop.ts @@ -13,4 +13,4 @@ export { resolveAgentEnd, resolveAgentEndCancelled, isSessionSwitchInFlight, _re export { detectStuck } from "./auto/detect-stuck.js"; export { runUnit } from "./auto/run-unit.js"; export type { LoopDeps } from "./auto/loop-deps.js"; -export type { AgentEndEvent, UnitResult } from "./auto/types.js"; +export type { AgentEndEvent, ErrorContext, UnitResult } from "./auto/types.js"; diff --git a/src/resources/extensions/gsd/auto-timers.ts b/src/resources/extensions/gsd/auto-timers.ts index ae3ded014..22b70fa54 100644 --- a/src/resources/extensions/gsd/auto-timers.ts +++ b/src/resources/extensions/gsd/auto-timers.ts @@ -192,7 +192,7 @@ export function startUnitSupervision(sctx: SupervisionContext): void { const message = err instanceof Error ? err.message : String(err); console.error(`[idle-watchdog] Unhandled error: ${message}`); // Unblock any pending unit promise so the auto-loop is not orphaned. - resolveAgentEndCancelled(); + resolveAgentEndCancelled({ message: `Idle watchdog error: ${message}`, category: "idle", isTransient: true }); try { ctx.ui.notify(`Idle watchdog error: ${message}`, "warning"); } catch { /* best effort */ } @@ -226,7 +226,7 @@ export function startUnitSupervision(sctx: SupervisionContext): void { const message = err instanceof Error ? err.message : String(err); console.error(`[hard-timeout] Unhandled error: ${message}`); // Unblock any pending unit promise so the auto-loop is not orphaned. - resolveAgentEndCancelled(); + resolveAgentEndCancelled({ message: `Hard timeout error: ${message}`, category: "timeout", isTransient: true }); try { ctx.ui.notify(`Hard timeout error: ${message}`, "warning"); } catch { /* best effort */ } diff --git a/src/resources/extensions/gsd/auto/phases.ts b/src/resources/extensions/gsd/auto/phases.ts index 0f408105f..0f0976752 100644 --- a/src/resources/extensions/gsd/auto/phases.ts +++ b/src/resources/extensions/gsd/auto/phases.ts @@ -1039,17 +1039,16 @@ export async function runUnitPhase( ); // Tag the most recent window entry with error info for stuck detection - if (unitResult.status === "error" || unitResult.status === "cancelled") { - const lastEntry = loopState.recentUnits[loopState.recentUnits.length - 1]; - if (lastEntry) { + const lastEntry = loopState.recentUnits[loopState.recentUnits.length - 1]; + if (lastEntry) { + if (unitResult.errorContext) { + lastEntry.error = `${unitResult.errorContext.category}:${unitResult.errorContext.message}`.slice(0, 200); + } else if (unitResult.status === "error" || unitResult.status === "cancelled") { lastEntry.error = `${unitResult.status}:${unitType}/${unitId}`; - } - } else if (unitResult.event?.messages?.length) { - const lastMsg = unitResult.event.messages[unitResult.event.messages.length - 1]; - const msgStr = typeof lastMsg === "string" ? lastMsg : JSON.stringify(lastMsg); - if (/error|fail|exception/i.test(msgStr)) { - const lastEntry = loopState.recentUnits[loopState.recentUnits.length - 1]; - if (lastEntry) { + } else if (unitResult.event?.messages?.length) { + const lastMsg = unitResult.event.messages[unitResult.event.messages.length - 1]; + const msgStr = typeof lastMsg === "string" ? lastMsg : JSON.stringify(lastMsg); + if (/error|fail|exception/i.test(msgStr)) { lastEntry.error = msgStr.slice(0, 200); } } @@ -1122,7 +1121,7 @@ export async function runUnitPhase( s.unitRecoveryCount.delete(`${unitType}/${unitId}`); } - deps.emitJournalEvent({ ts: new Date().toISOString(), flowId: ic.flowId, seq: ic.nextSeq(), eventType: "unit-end", data: { unitType, unitId, status: unitResult.status, artifactVerified }, causedBy: { flowId: ic.flowId, seq: unitStartSeq } }); + deps.emitJournalEvent({ ts: new Date().toISOString(), flowId: ic.flowId, seq: ic.nextSeq(), eventType: "unit-end", data: { unitType, unitId, status: unitResult.status, artifactVerified, ...(unitResult.errorContext ? { errorContext: unitResult.errorContext } : {}) }, causedBy: { flowId: ic.flowId, seq: unitStartSeq } }); return { action: "next", data: { unitStartedAt: s.currentUnit.startedAt } }; } diff --git a/src/resources/extensions/gsd/auto/resolve.ts b/src/resources/extensions/gsd/auto/resolve.ts index 0eb3ef751..6de2eaeee 100644 --- a/src/resources/extensions/gsd/auto/resolve.ts +++ b/src/resources/extensions/gsd/auto/resolve.ts @@ -8,7 +8,7 @@ * Imports from: auto/types */ -import type { UnitResult, AgentEndEvent } from "./types.js"; +import type { UnitResult, AgentEndEvent, ErrorContext } from "./types.js"; import type { AutoSession } from "./session.js"; import { debugLog } from "../debug-logger.js"; @@ -77,12 +77,12 @@ export function isSessionSwitchInFlight(): boolean { * blocks to ensure the autoLoop is never stuck awaiting a promise that * will never resolve. Safe to call when no resolver is pending (no-op). */ -export function resolveAgentEndCancelled(): void { +export function resolveAgentEndCancelled(errorContext?: ErrorContext): void { if (_currentResolve) { debugLog("resolveAgentEndCancelled", { status: "resolving-cancelled" }); const r = _currentResolve; _currentResolve = null; - r({ status: "cancelled" }); + r({ status: "cancelled", ...(errorContext ? { errorContext } : {}) }); } } diff --git a/src/resources/extensions/gsd/auto/run-unit.ts b/src/resources/extensions/gsd/auto/run-unit.ts index aa078676b..47512d395 100644 --- a/src/resources/extensions/gsd/auto/run-unit.ts +++ b/src/resources/extensions/gsd/auto/run-unit.ts @@ -58,13 +58,13 @@ export async function runUnit( unitId, error: msg, }); - return { status: "cancelled" }; + return { status: "cancelled", errorContext: { message: `Session creation failed: ${msg}`, category: "session-failed", isTransient: true } }; } if (sessionTimeoutHandle) clearTimeout(sessionTimeoutHandle); if (sessionResult.cancelled) { debugLog("runUnit-session-timeout", { unitType, unitId }); - return { status: "cancelled" }; + return { status: "cancelled", errorContext: { message: "Session creation timed out", category: "timeout", isTransient: true } }; } if (!s.active) { diff --git a/src/resources/extensions/gsd/auto/types.ts b/src/resources/extensions/gsd/auto/types.ts index 59375bd9d..d3e342f82 100644 --- a/src/resources/extensions/gsd/auto/types.ts +++ b/src/resources/extensions/gsd/auto/types.ts @@ -47,12 +47,25 @@ export interface AgentEndEvent { messages: unknown[]; } +/** + * Structured error context attached to a UnitResult when the unit ends + * due to an infrastructure or timeout error (not user-driven cancellation). + */ +export interface ErrorContext { + message: string; + category: "provider" | "timeout" | "idle" | "network" | "aborted" | "session-failed" | "unknown"; + stopReason?: string; + isTransient?: boolean; + retryAfterMs?: number; +} + /** * Result of a single unit execution (one iteration of the loop). */ export interface UnitResult { status: "completed" | "cancelled" | "error"; event?: AgentEndEvent; + errorContext?: ErrorContext; } // ─── Phase pipeline types ──────────────────────────────────────────────────── diff --git a/src/resources/extensions/gsd/tests/agent-end-retry.test.ts b/src/resources/extensions/gsd/tests/agent-end-retry.test.ts index 6db2f9d36..c1ddfc02b 100644 --- a/src/resources/extensions/gsd/tests/agent-end-retry.test.ts +++ b/src/resources/extensions/gsd/tests/agent-end-retry.test.ts @@ -116,7 +116,7 @@ test("auto-timers.ts idle watchdog catch calls resolveAgentEndCancelled", () => // Check that resolveAgentEndCancelled is called near this catch const catchRegion = source.slice(Math.max(0, idleCatchIdx - 200), idleCatchIdx + 200); assert.ok( - catchRegion.includes("resolveAgentEndCancelled()"), + catchRegion.includes("resolveAgentEndCancelled("), "idle watchdog catch block must call resolveAgentEndCancelled", ); }); @@ -129,7 +129,7 @@ test("auto-timers.ts hard timeout catch calls resolveAgentEndCancelled", () => { assert.ok(hardCatchIdx > -1, "hard timeout catch block must exist"); const catchRegion = source.slice(Math.max(0, hardCatchIdx - 200), hardCatchIdx + 200); assert.ok( - catchRegion.includes("resolveAgentEndCancelled()"), + catchRegion.includes("resolveAgentEndCancelled("), "hard timeout catch block must call resolveAgentEndCancelled", ); }); diff --git a/src/resources/extensions/gsd/tests/auto-loop.test.ts b/src/resources/extensions/gsd/tests/auto-loop.test.ts index c77fb44df..5e01617eb 100644 --- a/src/resources/extensions/gsd/tests/auto-loop.test.ts +++ b/src/resources/extensions/gsd/tests/auto-loop.test.ts @@ -1745,6 +1745,41 @@ test("resolveAgentEndCancelled prevents orphaned promise after abort path", asyn assert.equal(result.status, "cancelled"); }); +test("resolveAgentEndCancelled with errorContext passes it through to resolved promise", async () => { + _resetPendingResolve(); + + const { _setCurrentResolve } = await import("../auto/resolve.js"); + + const p = new Promise((r) => { + _setCurrentResolve(r); + }); + + resolveAgentEndCancelled({ message: "test timeout", category: "timeout", isTransient: true }); + + const resolved = await p; + assert.equal(resolved.status, "cancelled"); + assert.ok(resolved.errorContext, "errorContext must be present"); + assert.equal(resolved.errorContext!.category, "timeout"); + assert.equal(resolved.errorContext!.message, "test timeout"); + assert.equal(resolved.errorContext!.isTransient, true); +}); + +test("resolveAgentEndCancelled without args produces no errorContext field", async () => { + _resetPendingResolve(); + + const { _setCurrentResolve } = await import("../auto/resolve.js"); + + const p = new Promise((r) => { + _setCurrentResolve(r); + }); + + resolveAgentEndCancelled(); + + const resolved = await p; + assert.equal(resolved.status, "cancelled"); + assert.equal(resolved.errorContext, undefined, "errorContext must not be present when no args passed"); +}); + // ─── #1571: artifact verification retry ────────────────────────────────────── test("autoLoop re-iterates when postUnitPreVerification returns retry (#1571)", async () => { diff --git a/src/resources/extensions/gsd/tests/journal-integration.test.ts b/src/resources/extensions/gsd/tests/journal-integration.test.ts index ddbc096e5..49f64d7a3 100644 --- a/src/resources/extensions/gsd/tests/journal-integration.test.ts +++ b/src/resources/extensions/gsd/tests/journal-integration.test.ts @@ -505,3 +505,42 @@ test("milestone-transition event is emitted when milestone changes", async () => assert.equal((transitionEvents[0].data as any).to, "M002"); assert.equal(transitionEvents[0].flowId, ic.flowId); }); + +test("unit-end event contains errorContext when unit is cancelled with structured error", async () => { + const capture = createEventCapture(); + const { resolveAgentEndCancelled, _resetPendingResolve } = await import("../auto-loop.js"); + _resetPendingResolve(); + + const deps = makeMockDeps(capture); + const ic = makeIC(deps); + const iterData: IterationData = { + unitType: "execute-task", + unitId: "M001/S01/T01", + prompt: "do stuff", + finalPrompt: "do stuff", + pauseAfterUatDispatch: false, + state: { phase: "executing", activeMilestone: { id: "M001" }, activeSlice: { id: "S01" }, registry: [], blockers: [] } as any, + mid: "M001", + midTitle: "Test", + isRetry: false, + previousTier: undefined, + }; + const loopState: LoopState = { recentUnits: [{ key: "execute-task/M001/S01/T01" }], stuckRecoveryAttempts: 0 }; + + const unitPromise = runUnitPhase(ic, iterData, loopState); + await new Promise(r => setTimeout(r, 50)); + + // Resolve with errorContext (simulates a timeout cancel) + resolveAgentEndCancelled({ message: "Hard timeout error: exceeded limit", category: "timeout", isTransient: true }); + + const result = await unitPromise; + // Cancelled units break the loop before emitting unit-end + assert.equal(result.action, "break"); + assert.equal((result as any).reason, "session-failed"); + + // Verify error classification used structured errorContext on the window entry + const entry = loopState.recentUnits[loopState.recentUnits.length - 1]; + assert.ok(entry.error, "window entry must have error set"); + assert.ok(entry.error!.startsWith("timeout:"), "error must start with category from errorContext"); + assert.ok(entry.error!.includes("Hard timeout error"), "error must include the errorContext message"); +});