feat(gsd): wire structured error propagation through UnitResult
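Add an ErrorContext interface and an optional errorContext field on UnitResult so error information (provider errors, timeouts, idle watchdog kills) is no longer discarded at the resolve boundary. The four call sites that previously threw away context (the idle-watchdog and hard-timeout catch blocks, and the session-creation failure and session-creation timeout paths in runUnit) now attach typed error metadata with category, message, and transience. Downstream consumers prefer the structured errorContext field when it is present: stuck detection in phases.ts tags the window entry from errorContext first, then falls back to the status tag for error/cancelled results, and only uses the fragile regex heuristic on message content as a last resort; journal unit-end events include errorContext whenever the unit result carries one.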
This commit is contained in:
parent
ef5006e16d
commit
dc723b2519
9 changed files with 107 additions and 21 deletions
|
|
@ -13,4 +13,4 @@ export { resolveAgentEnd, resolveAgentEndCancelled, isSessionSwitchInFlight, _re
|
|||
export { detectStuck } from "./auto/detect-stuck.js";
|
||||
export { runUnit } from "./auto/run-unit.js";
|
||||
export type { LoopDeps } from "./auto/loop-deps.js";
|
||||
export type { AgentEndEvent, UnitResult } from "./auto/types.js";
|
||||
export type { AgentEndEvent, ErrorContext, UnitResult } from "./auto/types.js";
|
||||
|
|
|
|||
|
|
@ -192,7 +192,7 @@ export function startUnitSupervision(sctx: SupervisionContext): void {
|
|||
const message = err instanceof Error ? err.message : String(err);
|
||||
console.error(`[idle-watchdog] Unhandled error: ${message}`);
|
||||
// Unblock any pending unit promise so the auto-loop is not orphaned.
|
||||
resolveAgentEndCancelled();
|
||||
resolveAgentEndCancelled({ message: `Idle watchdog error: ${message}`, category: "idle", isTransient: true });
|
||||
try {
|
||||
ctx.ui.notify(`Idle watchdog error: ${message}`, "warning");
|
||||
} catch { /* best effort */ }
|
||||
|
|
@ -226,7 +226,7 @@ export function startUnitSupervision(sctx: SupervisionContext): void {
|
|||
const message = err instanceof Error ? err.message : String(err);
|
||||
console.error(`[hard-timeout] Unhandled error: ${message}`);
|
||||
// Unblock any pending unit promise so the auto-loop is not orphaned.
|
||||
resolveAgentEndCancelled();
|
||||
resolveAgentEndCancelled({ message: `Hard timeout error: ${message}`, category: "timeout", isTransient: true });
|
||||
try {
|
||||
ctx.ui.notify(`Hard timeout error: ${message}`, "warning");
|
||||
} catch { /* best effort */ }
|
||||
|
|
|
|||
|
|
@ -1039,17 +1039,16 @@ export async function runUnitPhase(
|
|||
);
|
||||
|
||||
// Tag the most recent window entry with error info for stuck detection
|
||||
if (unitResult.status === "error" || unitResult.status === "cancelled") {
|
||||
const lastEntry = loopState.recentUnits[loopState.recentUnits.length - 1];
|
||||
if (lastEntry) {
|
||||
const lastEntry = loopState.recentUnits[loopState.recentUnits.length - 1];
|
||||
if (lastEntry) {
|
||||
if (unitResult.errorContext) {
|
||||
lastEntry.error = `${unitResult.errorContext.category}:${unitResult.errorContext.message}`.slice(0, 200);
|
||||
} else if (unitResult.status === "error" || unitResult.status === "cancelled") {
|
||||
lastEntry.error = `${unitResult.status}:${unitType}/${unitId}`;
|
||||
}
|
||||
} else if (unitResult.event?.messages?.length) {
|
||||
const lastMsg = unitResult.event.messages[unitResult.event.messages.length - 1];
|
||||
const msgStr = typeof lastMsg === "string" ? lastMsg : JSON.stringify(lastMsg);
|
||||
if (/error|fail|exception/i.test(msgStr)) {
|
||||
const lastEntry = loopState.recentUnits[loopState.recentUnits.length - 1];
|
||||
if (lastEntry) {
|
||||
} else if (unitResult.event?.messages?.length) {
|
||||
const lastMsg = unitResult.event.messages[unitResult.event.messages.length - 1];
|
||||
const msgStr = typeof lastMsg === "string" ? lastMsg : JSON.stringify(lastMsg);
|
||||
if (/error|fail|exception/i.test(msgStr)) {
|
||||
lastEntry.error = msgStr.slice(0, 200);
|
||||
}
|
||||
}
|
||||
|
|
@ -1122,7 +1121,7 @@ export async function runUnitPhase(
|
|||
s.unitRecoveryCount.delete(`${unitType}/${unitId}`);
|
||||
}
|
||||
|
||||
deps.emitJournalEvent({ ts: new Date().toISOString(), flowId: ic.flowId, seq: ic.nextSeq(), eventType: "unit-end", data: { unitType, unitId, status: unitResult.status, artifactVerified }, causedBy: { flowId: ic.flowId, seq: unitStartSeq } });
|
||||
deps.emitJournalEvent({ ts: new Date().toISOString(), flowId: ic.flowId, seq: ic.nextSeq(), eventType: "unit-end", data: { unitType, unitId, status: unitResult.status, artifactVerified, ...(unitResult.errorContext ? { errorContext: unitResult.errorContext } : {}) }, causedBy: { flowId: ic.flowId, seq: unitStartSeq } });
|
||||
|
||||
return { action: "next", data: { unitStartedAt: s.currentUnit.startedAt } };
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@
|
|||
* Imports from: auto/types
|
||||
*/
|
||||
|
||||
import type { UnitResult, AgentEndEvent } from "./types.js";
|
||||
import type { UnitResult, AgentEndEvent, ErrorContext } from "./types.js";
|
||||
import type { AutoSession } from "./session.js";
|
||||
import { debugLog } from "../debug-logger.js";
|
||||
|
||||
|
|
@ -77,12 +77,12 @@ export function isSessionSwitchInFlight(): boolean {
|
|||
* blocks to ensure the autoLoop is never stuck awaiting a promise that
|
||||
* will never resolve. Safe to call when no resolver is pending (no-op).
|
||||
*/
|
||||
export function resolveAgentEndCancelled(): void {
|
||||
export function resolveAgentEndCancelled(errorContext?: ErrorContext): void {
|
||||
if (_currentResolve) {
|
||||
debugLog("resolveAgentEndCancelled", { status: "resolving-cancelled" });
|
||||
const r = _currentResolve;
|
||||
_currentResolve = null;
|
||||
r({ status: "cancelled" });
|
||||
r({ status: "cancelled", ...(errorContext ? { errorContext } : {}) });
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -58,13 +58,13 @@ export async function runUnit(
|
|||
unitId,
|
||||
error: msg,
|
||||
});
|
||||
return { status: "cancelled" };
|
||||
return { status: "cancelled", errorContext: { message: `Session creation failed: ${msg}`, category: "session-failed", isTransient: true } };
|
||||
}
|
||||
if (sessionTimeoutHandle) clearTimeout(sessionTimeoutHandle);
|
||||
|
||||
if (sessionResult.cancelled) {
|
||||
debugLog("runUnit-session-timeout", { unitType, unitId });
|
||||
return { status: "cancelled" };
|
||||
return { status: "cancelled", errorContext: { message: "Session creation timed out", category: "timeout", isTransient: true } };
|
||||
}
|
||||
|
||||
if (!s.active) {
|
||||
|
|
|
|||
|
|
@ -47,12 +47,25 @@ export interface AgentEndEvent {
|
|||
messages: unknown[];
|
||||
}
|
||||
|
||||
/**
|
||||
* Structured error context attached to a UnitResult when the unit ends
|
||||
* due to an infrastructure or timeout error (not user-driven cancellation).
|
||||
*/
|
||||
export interface ErrorContext {
|
||||
/** Human-readable description of the failure (call sites pass strings such as "Session creation timed out"). */
message: string;
|
||||
/** Coarse classification of the failure; restricted to the literal values listed here. */
category: "provider" | "timeout" | "idle" | "network" | "aborted" | "session-failed" | "unknown";
|
||||
/** NOTE(review): not set by any call site in this change — presumably the upstream stop reason; confirm. */
stopReason?: string;
|
||||
/** NOTE(review): presumably "a retry may succeed"; every call site in this change passes `true` — confirm how consumers use it. */
isTransient?: boolean;
|
||||
/** NOTE(review): presumably a suggested delay in milliseconds before retrying; not set by any call site in this change — confirm. */
retryAfterMs?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Result of a single unit execution (one iteration of the loop).
|
||||
*/
|
||||
export interface UnitResult {
|
||||
/** Outcome of the unit: one of the three literal states. */
status: "completed" | "cancelled" | "error";
|
||||
/** Agent-end event associated with the unit, when one is available. */
event?: AgentEndEvent;
|
||||
/**
 * Structured failure detail. Optional: resolveAgentEndCancelled() only adds
 * this field when it is called with an ErrorContext argument.
 */
errorContext?: ErrorContext;
|
||||
}
|
||||
|
||||
// ─── Phase pipeline types ────────────────────────────────────────────────────
|
||||
|
|
|
|||
|
|
@ -116,7 +116,7 @@ test("auto-timers.ts idle watchdog catch calls resolveAgentEndCancelled", () =>
|
|||
// Check that resolveAgentEndCancelled is called near this catch
|
||||
const catchRegion = source.slice(Math.max(0, idleCatchIdx - 200), idleCatchIdx + 200);
|
||||
assert.ok(
|
||||
catchRegion.includes("resolveAgentEndCancelled()"),
|
||||
catchRegion.includes("resolveAgentEndCancelled("),
|
||||
"idle watchdog catch block must call resolveAgentEndCancelled",
|
||||
);
|
||||
});
|
||||
|
|
@ -129,7 +129,7 @@ test("auto-timers.ts hard timeout catch calls resolveAgentEndCancelled", () => {
|
|||
assert.ok(hardCatchIdx > -1, "hard timeout catch block must exist");
|
||||
const catchRegion = source.slice(Math.max(0, hardCatchIdx - 200), hardCatchIdx + 200);
|
||||
assert.ok(
|
||||
catchRegion.includes("resolveAgentEndCancelled()"),
|
||||
catchRegion.includes("resolveAgentEndCancelled("),
|
||||
"hard timeout catch block must call resolveAgentEndCancelled",
|
||||
);
|
||||
});
|
||||
|
|
|
|||
|
|
@ -1745,6 +1745,41 @@ test("resolveAgentEndCancelled prevents orphaned promise after abort path", asyn
|
|||
assert.equal(result.status, "cancelled");
|
||||
});
|
||||
|
||||
// Verifies that an ErrorContext handed to resolveAgentEndCancelled() arrives
// unchanged on the UnitResult that the pending promise resolves with.
test("resolveAgentEndCancelled with errorContext passes it through to resolved promise", async () => {
|
||||
// Start from a clean slate so no resolver from an earlier test is pending.
_resetPendingResolve();
|
||||

||||
const { _setCurrentResolve } = await import("../auto/resolve.js");
|
||||

||||
// Register this promise's resolve function through the test hook so the
// cancel call below is what settles it.
const p = new Promise<UnitResult>((r) => {
|
||||
_setCurrentResolve(r);
|
||||
});
|
||||

||||
resolveAgentEndCancelled({ message: "test timeout", category: "timeout", isTransient: true });
|
||||

||||
const resolved = await p;
|
||||
// Status stays "cancelled"; the context fields must match what was passed in.
assert.equal(resolved.status, "cancelled");
|
||||
assert.ok(resolved.errorContext, "errorContext must be present");
|
||||
assert.equal(resolved.errorContext!.category, "timeout");
|
||||
assert.equal(resolved.errorContext!.message, "test timeout");
|
||||
assert.equal(resolved.errorContext!.isTransient, true);
|
||||
});
|
||||
|
||||
// Backward-compatibility check: calling resolveAgentEndCancelled() with no
// argument still resolves with status "cancelled" and leaves errorContext unset.
test("resolveAgentEndCancelled without args produces no errorContext field", async () => {
|
||||
// Start from a clean slate so no resolver from an earlier test is pending.
_resetPendingResolve();
|
||||

||||
const { _setCurrentResolve } = await import("../auto/resolve.js");
|
||||

||||
// Register this promise's resolve function through the test hook so the
// cancel call below is what settles it.
const p = new Promise<UnitResult>((r) => {
|
||||
_setCurrentResolve(r);
|
||||
});
|
||||

||||
resolveAgentEndCancelled();
|
||||

||||
const resolved = await p;
|
||||
assert.equal(resolved.status, "cancelled");
|
||||
// The field must read as undefined when no context was supplied.
assert.equal(resolved.errorContext, undefined, "errorContext must not be present when no args passed");
|
||||
});
|
||||
|
||||
// ─── #1571: artifact verification retry ──────────────────────────────────────
|
||||
|
||||
test("autoLoop re-iterates when postUnitPreVerification returns retry (#1571)", async () => {
|
||||
|
|
|
|||
|
|
@ -505,3 +505,42 @@ test("milestone-transition event is emitted when milestone changes", async () =>
|
|||
assert.equal((transitionEvents[0].data as any).to, "M002");
|
||||
assert.equal(transitionEvents[0].flowId, ic.flowId);
|
||||
});
|
||||
|
||||
// Runs runUnitPhase, cancels the in-flight unit with a structured ErrorContext,
// and checks that (a) the phase breaks the loop and (b) the most recent
// stuck-detection window entry is tagged "<category>:<message>".
// The title previously claimed the unit-end journal event was being checked,
// but no journal event is inspected here: as noted below, a cancelled unit
// breaks out before unit-end is emitted.
test("cancelled unit with structured errorContext tags the stuck-detection window entry", async () => {
|
||||
const capture = createEventCapture();
|
||||
const { resolveAgentEndCancelled, _resetPendingResolve } = await import("../auto-loop.js");
|
||||
_resetPendingResolve();
|
||||

||||
const deps = makeMockDeps(capture);
|
||||
const ic = makeIC(deps);
|
||||
const iterData: IterationData = {
|
||||
unitType: "execute-task",
|
||||
unitId: "M001/S01/T01",
|
||||
prompt: "do stuff",
|
||||
finalPrompt: "do stuff",
|
||||
pauseAfterUatDispatch: false,
|
||||
state: { phase: "executing", activeMilestone: { id: "M001" }, activeSlice: { id: "S01" }, registry: [], blockers: [] } as any,
|
||||
mid: "M001",
|
||||
midTitle: "Test",
|
||||
isRetry: false,
|
||||
previousTier: undefined,
|
||||
};
|
||||
// One pre-existing window entry for this unit; it is the entry expected to be tagged.
const loopState: LoopState = { recentUnits: [{ key: "execute-task/M001/S01/T01" }], stuckRecoveryAttempts: 0 };
|
||||

||||
const unitPromise = runUnitPhase(ic, iterData, loopState);
|
||||
// NOTE(review): timing-based wait — presumably gives runUnitPhase time to
// start awaiting the unit before it is cancelled; confirm 50 ms is sufficient.
await new Promise(r => setTimeout(r, 50));
|
||||

||||
// Resolve with errorContext (simulates a timeout cancel)
|
||||
resolveAgentEndCancelled({ message: "Hard timeout error: exceeded limit", category: "timeout", isTransient: true });
|
||||

||||
const result = await unitPromise;
|
||||
// Cancelled units break the loop before emitting unit-end
|
||||
assert.equal(result.action, "break");
|
||||
// The break reason asserted is "session-failed" even though the supplied category is "timeout".
assert.equal((result as any).reason, "session-failed");
|
||||

||||
// Verify error classification used structured errorContext on the window entry
|
||||
const entry = loopState.recentUnits[loopState.recentUnits.length - 1];
|
||||
assert.ok(entry.error, "window entry must have error set");
|
||||
assert.ok(entry.error!.startsWith("timeout:"), "error must start with category from errorContext");
|
||||
assert.ok(entry.error!.includes("Hard timeout error"), "error must include the errorContext message");
|
||||
});
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue