feat(gsd): wire structured error propagation through UnitResult

Add ErrorContext interface to UnitResult so error information (provider
errors, timeouts, idle watchdog kills) is no longer discarded at the
resolve boundary. The four call sites that previously threw away context
now attach typed error metadata with category, message, and transience.

Downstream consumers (stuck detection in phases.ts, journal unit-end
events) use the structured errorContext field directly instead of
fragile regex heuristics on message content.
This commit is contained in:
Iouri Goussev 2026-03-26 01:01:15 -04:00
parent ef5006e16d
commit dc723b2519
9 changed files with 107 additions and 21 deletions

View file

@ -13,4 +13,4 @@ export { resolveAgentEnd, resolveAgentEndCancelled, isSessionSwitchInFlight, _re
export { detectStuck } from "./auto/detect-stuck.js";
export { runUnit } from "./auto/run-unit.js";
export type { LoopDeps } from "./auto/loop-deps.js";
export type { AgentEndEvent, UnitResult } from "./auto/types.js";
export type { AgentEndEvent, ErrorContext, UnitResult } from "./auto/types.js";

View file

@ -192,7 +192,7 @@ export function startUnitSupervision(sctx: SupervisionContext): void {
const message = err instanceof Error ? err.message : String(err);
console.error(`[idle-watchdog] Unhandled error: ${message}`);
// Unblock any pending unit promise so the auto-loop is not orphaned.
resolveAgentEndCancelled();
resolveAgentEndCancelled({ message: `Idle watchdog error: ${message}`, category: "idle", isTransient: true });
try {
ctx.ui.notify(`Idle watchdog error: ${message}`, "warning");
} catch { /* best effort */ }
@ -226,7 +226,7 @@ export function startUnitSupervision(sctx: SupervisionContext): void {
const message = err instanceof Error ? err.message : String(err);
console.error(`[hard-timeout] Unhandled error: ${message}`);
// Unblock any pending unit promise so the auto-loop is not orphaned.
resolveAgentEndCancelled();
resolveAgentEndCancelled({ message: `Hard timeout error: ${message}`, category: "timeout", isTransient: true });
try {
ctx.ui.notify(`Hard timeout error: ${message}`, "warning");
} catch { /* best effort */ }

View file

@ -1039,17 +1039,16 @@ export async function runUnitPhase(
);
// Tag the most recent window entry with error info for stuck detection
if (unitResult.status === "error" || unitResult.status === "cancelled") {
const lastEntry = loopState.recentUnits[loopState.recentUnits.length - 1];
if (lastEntry) {
const lastEntry = loopState.recentUnits[loopState.recentUnits.length - 1];
if (lastEntry) {
if (unitResult.errorContext) {
lastEntry.error = `${unitResult.errorContext.category}:${unitResult.errorContext.message}`.slice(0, 200);
} else if (unitResult.status === "error" || unitResult.status === "cancelled") {
lastEntry.error = `${unitResult.status}:${unitType}/${unitId}`;
}
} else if (unitResult.event?.messages?.length) {
const lastMsg = unitResult.event.messages[unitResult.event.messages.length - 1];
const msgStr = typeof lastMsg === "string" ? lastMsg : JSON.stringify(lastMsg);
if (/error|fail|exception/i.test(msgStr)) {
const lastEntry = loopState.recentUnits[loopState.recentUnits.length - 1];
if (lastEntry) {
} else if (unitResult.event?.messages?.length) {
const lastMsg = unitResult.event.messages[unitResult.event.messages.length - 1];
const msgStr = typeof lastMsg === "string" ? lastMsg : JSON.stringify(lastMsg);
if (/error|fail|exception/i.test(msgStr)) {
lastEntry.error = msgStr.slice(0, 200);
}
}
@ -1122,7 +1121,7 @@ export async function runUnitPhase(
s.unitRecoveryCount.delete(`${unitType}/${unitId}`);
}
deps.emitJournalEvent({ ts: new Date().toISOString(), flowId: ic.flowId, seq: ic.nextSeq(), eventType: "unit-end", data: { unitType, unitId, status: unitResult.status, artifactVerified }, causedBy: { flowId: ic.flowId, seq: unitStartSeq } });
deps.emitJournalEvent({ ts: new Date().toISOString(), flowId: ic.flowId, seq: ic.nextSeq(), eventType: "unit-end", data: { unitType, unitId, status: unitResult.status, artifactVerified, ...(unitResult.errorContext ? { errorContext: unitResult.errorContext } : {}) }, causedBy: { flowId: ic.flowId, seq: unitStartSeq } });
return { action: "next", data: { unitStartedAt: s.currentUnit.startedAt } };
}

View file

@ -8,7 +8,7 @@
* Imports from: auto/types
*/
import type { UnitResult, AgentEndEvent } from "./types.js";
import type { UnitResult, AgentEndEvent, ErrorContext } from "./types.js";
import type { AutoSession } from "./session.js";
import { debugLog } from "../debug-logger.js";
@ -77,12 +77,12 @@ export function isSessionSwitchInFlight(): boolean {
* blocks to ensure the autoLoop is never stuck awaiting a promise that
* will never resolve. Safe to call when no resolver is pending (no-op).
*/
export function resolveAgentEndCancelled(): void {
export function resolveAgentEndCancelled(errorContext?: ErrorContext): void {
if (_currentResolve) {
debugLog("resolveAgentEndCancelled", { status: "resolving-cancelled" });
const r = _currentResolve;
_currentResolve = null;
r({ status: "cancelled" });
r({ status: "cancelled", ...(errorContext ? { errorContext } : {}) });
}
}

View file

@ -58,13 +58,13 @@ export async function runUnit(
unitId,
error: msg,
});
return { status: "cancelled" };
return { status: "cancelled", errorContext: { message: `Session creation failed: ${msg}`, category: "session-failed", isTransient: true } };
}
if (sessionTimeoutHandle) clearTimeout(sessionTimeoutHandle);
if (sessionResult.cancelled) {
debugLog("runUnit-session-timeout", { unitType, unitId });
return { status: "cancelled" };
return { status: "cancelled", errorContext: { message: "Session creation timed out", category: "timeout", isTransient: true } };
}
if (!s.active) {

View file

@ -47,12 +47,25 @@ export interface AgentEndEvent {
messages: unknown[];
}
/**
* Structured error context attached to a UnitResult when the unit ends
* due to an infrastructure or timeout error (not user-driven cancellation).
*/
export interface ErrorContext {
message: string;
category: "provider" | "timeout" | "idle" | "network" | "aborted" | "session-failed" | "unknown";
stopReason?: string;
isTransient?: boolean;
retryAfterMs?: number;
}
/**
* Result of a single unit execution (one iteration of the loop).
*/
export interface UnitResult {
status: "completed" | "cancelled" | "error";
event?: AgentEndEvent;
errorContext?: ErrorContext;
}
// ─── Phase pipeline types ────────────────────────────────────────────────────

View file

@ -116,7 +116,7 @@ test("auto-timers.ts idle watchdog catch calls resolveAgentEndCancelled", () =>
// Check that resolveAgentEndCancelled is called near this catch
const catchRegion = source.slice(Math.max(0, idleCatchIdx - 200), idleCatchIdx + 200);
assert.ok(
catchRegion.includes("resolveAgentEndCancelled()"),
catchRegion.includes("resolveAgentEndCancelled("),
"idle watchdog catch block must call resolveAgentEndCancelled",
);
});
@ -129,7 +129,7 @@ test("auto-timers.ts hard timeout catch calls resolveAgentEndCancelled", () => {
assert.ok(hardCatchIdx > -1, "hard timeout catch block must exist");
const catchRegion = source.slice(Math.max(0, hardCatchIdx - 200), hardCatchIdx + 200);
assert.ok(
catchRegion.includes("resolveAgentEndCancelled()"),
catchRegion.includes("resolveAgentEndCancelled("),
"hard timeout catch block must call resolveAgentEndCancelled",
);
});

View file

@ -1745,6 +1745,41 @@ test("resolveAgentEndCancelled prevents orphaned promise after abort path", asyn
assert.equal(result.status, "cancelled");
});
test("resolveAgentEndCancelled with errorContext passes it through to resolved promise", async () => {
_resetPendingResolve();
const { _setCurrentResolve } = await import("../auto/resolve.js");
const p = new Promise<UnitResult>((r) => {
_setCurrentResolve(r);
});
resolveAgentEndCancelled({ message: "test timeout", category: "timeout", isTransient: true });
const resolved = await p;
assert.equal(resolved.status, "cancelled");
assert.ok(resolved.errorContext, "errorContext must be present");
assert.equal(resolved.errorContext!.category, "timeout");
assert.equal(resolved.errorContext!.message, "test timeout");
assert.equal(resolved.errorContext!.isTransient, true);
});
test("resolveAgentEndCancelled without args produces no errorContext field", async () => {
_resetPendingResolve();
const { _setCurrentResolve } = await import("../auto/resolve.js");
const p = new Promise<UnitResult>((r) => {
_setCurrentResolve(r);
});
resolveAgentEndCancelled();
const resolved = await p;
assert.equal(resolved.status, "cancelled");
assert.equal(resolved.errorContext, undefined, "errorContext must not be present when no args passed");
});
// ─── #1571: artifact verification retry ──────────────────────────────────────
test("autoLoop re-iterates when postUnitPreVerification returns retry (#1571)", async () => {

View file

@ -505,3 +505,42 @@ test("milestone-transition event is emitted when milestone changes", async () =>
assert.equal((transitionEvents[0].data as any).to, "M002");
assert.equal(transitionEvents[0].flowId, ic.flowId);
});
test("unit-end event contains errorContext when unit is cancelled with structured error", async () => {
const capture = createEventCapture();
const { resolveAgentEndCancelled, _resetPendingResolve } = await import("../auto-loop.js");
_resetPendingResolve();
const deps = makeMockDeps(capture);
const ic = makeIC(deps);
const iterData: IterationData = {
unitType: "execute-task",
unitId: "M001/S01/T01",
prompt: "do stuff",
finalPrompt: "do stuff",
pauseAfterUatDispatch: false,
state: { phase: "executing", activeMilestone: { id: "M001" }, activeSlice: { id: "S01" }, registry: [], blockers: [] } as any,
mid: "M001",
midTitle: "Test",
isRetry: false,
previousTier: undefined,
};
const loopState: LoopState = { recentUnits: [{ key: "execute-task/M001/S01/T01" }], stuckRecoveryAttempts: 0 };
const unitPromise = runUnitPhase(ic, iterData, loopState);
await new Promise(r => setTimeout(r, 50));
// Resolve with errorContext (simulates a timeout cancel)
resolveAgentEndCancelled({ message: "Hard timeout error: exceeded limit", category: "timeout", isTransient: true });
const result = await unitPromise;
// Cancelled units break the loop before emitting unit-end
assert.equal(result.action, "break");
assert.equal((result as any).reason, "session-failed");
// Verify error classification used structured errorContext on the window entry
const entry = loopState.recentUnits[loopState.recentUnits.length - 1];
assert.ok(entry.error, "window entry must have error set");
assert.ok(entry.error!.startsWith("timeout:"), "error must start with category from errorContext");
assert.ok(entry.error!.includes("Hard timeout error"), "error must include the errorContext message");
});