fix(gsd): enrich vague diagnostic messages with root-cause context
Closes #3524
This commit is contained in:
parent
546c93cae7
commit
53639aee5d
9 changed files with 73 additions and 68 deletions
|
|
@ -33,6 +33,7 @@ import {
|
|||
import {
|
||||
verifyExpectedArtifact,
|
||||
resolveExpectedArtifactPath,
|
||||
diagnoseExpectedArtifact,
|
||||
} from "./auto-recovery.js";
|
||||
import { regenerateIfMissing } from "./workflow-projections.js";
|
||||
import { syncStateToProjectRoot } from "./auto-worktree.js";
|
||||
|
|
@ -476,8 +477,9 @@ export async function postUnitPreVerification(pctx: PostUnitContext, opts?: PreV
|
|||
// db_unavailable so the artifact was never written. Retrying would
|
||||
// produce an infinite re-dispatch loop (#2517).
|
||||
debugLog("postUnit", { phase: "artifact-verify-skip-db-unavailable", unitType: s.currentUnit.type, unitId: s.currentUnit.id });
|
||||
const dbSkipDiag = diagnoseExpectedArtifact(s.currentUnit.type, s.currentUnit.id, s.basePath);
|
||||
ctx.ui.notify(
|
||||
`Artifact missing for ${s.currentUnit.type} ${s.currentUnit.id} but DB is unavailable — skipping retry to avoid loop (#2517)`,
|
||||
`Artifact missing for ${s.currentUnit.type} ${s.currentUnit.id} — DB unavailable, skipping retry.${dbSkipDiag ? ` Expected: ${dbSkipDiag}` : ""}`,
|
||||
"error",
|
||||
);
|
||||
} else if (!triggerArtifactVerified) {
|
||||
|
|
@ -486,14 +488,15 @@ export async function postUnitPreVerification(pctx: PostUnitContext, opts?: PreV
|
|||
const retryKey = `${s.currentUnit.type}:${s.currentUnit.id}`;
|
||||
const attempt = (s.verificationRetryCount.get(retryKey) ?? 0) + 1;
|
||||
s.verificationRetryCount.set(retryKey, attempt);
|
||||
const retryDiag = diagnoseExpectedArtifact(s.currentUnit.type, s.currentUnit.id, s.basePath);
|
||||
s.pendingVerificationRetry = {
|
||||
unitId: s.currentUnit.id,
|
||||
failureContext: `Artifact verification failed: expected artifact for ${s.currentUnit.type} "${s.currentUnit.id}" was not found on disk after unit execution (attempt ${attempt}).`,
|
||||
failureContext: `Artifact verification failed: expected artifact for ${s.currentUnit.type} "${s.currentUnit.id}" was not found on disk after unit execution (attempt ${attempt}).${retryDiag ? ` Expected: ${retryDiag}` : ""}`,
|
||||
attempt,
|
||||
};
|
||||
debugLog("postUnit", { phase: "artifact-verify-retry", unitType: s.currentUnit.type, unitId: s.currentUnit.id, attempt });
|
||||
ctx.ui.notify(
|
||||
`Artifact missing for ${s.currentUnit.type} ${s.currentUnit.id} — retrying (attempt ${attempt})`,
|
||||
`Artifact missing for ${s.currentUnit.type} ${s.currentUnit.id} — retrying (attempt ${attempt}).${retryDiag ? ` Expected: ${retryDiag}` : ""}`,
|
||||
"warning",
|
||||
);
|
||||
return "retry";
|
||||
|
|
|
|||
|
|
@ -196,19 +196,30 @@ export async function runPostUnitVerification(
|
|||
failureContext: formatFailureContext(result),
|
||||
attempt: nextAttempt,
|
||||
};
|
||||
const failedCmds = result.checks
|
||||
.filter((c) => c.exitCode !== 0)
|
||||
.map((c) => c.command);
|
||||
const cmdSummary = failedCmds.length <= 3
|
||||
? failedCmds.join(", ")
|
||||
: `${failedCmds.slice(0, 3).join(", ")}... and ${failedCmds.length - 3} more`;
|
||||
ctx.ui.notify(
|
||||
`Verification failed — auto-fix attempt ${nextAttempt}/${maxRetries}`,
|
||||
`Verification failed (${cmdSummary}) — auto-fix attempt ${nextAttempt}/${maxRetries}`,
|
||||
"warning",
|
||||
);
|
||||
// Return "retry" — the autoLoop while loop will re-iterate with the retry context
|
||||
return "retry";
|
||||
} else {
|
||||
// Gate failed, retries exhausted
|
||||
const exhaustedAttempt = attempt + 1;
|
||||
s.verificationRetryCount.delete(s.currentUnit.id);
|
||||
s.pendingVerificationRetry = null;
|
||||
const exhaustedFails = result.checks
|
||||
.filter((c) => c.exitCode !== 0)
|
||||
.map((c) => c.command);
|
||||
const exhaustedSummary = exhaustedFails.length <= 3
|
||||
? exhaustedFails.join(", ")
|
||||
: `${exhaustedFails.slice(0, 3).join(", ")}... and ${exhaustedFails.length - 3} more`;
|
||||
ctx.ui.notify(
|
||||
`Verification gate FAILED after ${exhaustedAttempt > maxRetries ? exhaustedAttempt - 1 : exhaustedAttempt} retries — pausing for human review`,
|
||||
`Verification gate FAILED after ${attempt} ${attempt === 1 ? "retry" : "retries"} (${exhaustedSummary}) — pausing for human review`,
|
||||
"error",
|
||||
);
|
||||
await pauseAuto(ctx, pi);
|
||||
|
|
|
|||
|
|
@ -48,6 +48,7 @@ export async function autoLoop(
|
|||
let iteration = 0;
|
||||
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0 };
|
||||
let consecutiveErrors = 0;
|
||||
const recentErrorMessages: string[] = [];
|
||||
|
||||
while (s.active) {
|
||||
iteration++;
|
||||
|
|
@ -202,6 +203,7 @@ export async function autoLoop(
|
|||
|
||||
deps.clearUnitTimeout();
|
||||
consecutiveErrors = 0;
|
||||
recentErrorMessages.length = 0;
|
||||
deps.emitJournalEvent({ ts: new Date().toISOString(), flowId, seq: nextSeq(), eventType: "iteration-end", data: { iteration } });
|
||||
debugLog("autoLoop", { phase: "iteration-complete", iteration });
|
||||
continue;
|
||||
|
|
@ -250,6 +252,7 @@ export async function autoLoop(
|
|||
if (finalizeResult.action === "continue") continue;
|
||||
|
||||
consecutiveErrors = 0; // Iteration completed successfully
|
||||
recentErrorMessages.length = 0;
|
||||
deps.emitJournalEvent({ ts: new Date().toISOString(), flowId, seq: nextSeq(), eventType: "iteration-end", data: { iteration } });
|
||||
debugLog("autoLoop", { phase: "iteration-complete", iteration });
|
||||
} catch (loopErr) {
|
||||
|
|
@ -280,6 +283,7 @@ export async function autoLoop(
|
|||
}
|
||||
|
||||
consecutiveErrors++;
|
||||
recentErrorMessages.push(msg.length > 120 ? msg.slice(0, 120) + "..." : msg);
|
||||
debugLog("autoLoop", {
|
||||
phase: "iteration-error",
|
||||
iteration,
|
||||
|
|
@ -289,8 +293,11 @@ export async function autoLoop(
|
|||
|
||||
if (consecutiveErrors >= 3) {
|
||||
// 3+ consecutive: hard stop — something is fundamentally broken
|
||||
const errorHistory = recentErrorMessages
|
||||
.map((m, i) => ` ${i + 1}. ${m}`)
|
||||
.join("\n");
|
||||
ctx.ui.notify(
|
||||
`Auto-mode stopped: ${consecutiveErrors} consecutive iteration failures. Last: ${msg}`,
|
||||
`Auto-mode stopped: ${consecutiveErrors} consecutive iteration failures:\n${errorHistory}`,
|
||||
"error",
|
||||
);
|
||||
await deps.stopAuto(
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ import { existsSync, cpSync } from "node:fs";
|
|||
import { logWarning, logError } from "../workflow-logger.js";
|
||||
import { gsdRoot } from "../paths.js";
|
||||
import { atomicWriteSync } from "../atomic-write.js";
|
||||
import { verifyExpectedArtifact } from "../auto-recovery.js";
|
||||
import { verifyExpectedArtifact, diagnoseExpectedArtifact, buildLoopRemediationSteps } from "../auto-recovery.js";
|
||||
import { writeUnitRuntimeRecord } from "../unit-runtime.js";
|
||||
|
||||
// ─── generateMilestoneReport ──────────────────────────────────────────────────
|
||||
|
|
@ -182,7 +182,7 @@ export async function runPreDispatch(
|
|||
}
|
||||
if (!healthGate.proceed) {
|
||||
ctx.ui.notify(
|
||||
healthGate.reason ?? "Pre-dispatch health check failed.",
|
||||
healthGate.reason || "Pre-dispatch health check failed — run /gsd doctor for details.",
|
||||
"error",
|
||||
);
|
||||
await deps.pauseAuto(ctx, pi);
|
||||
|
|
@ -628,15 +628,17 @@ export async function runDispatch(
|
|||
unitId,
|
||||
reason: stuckSignal.reason,
|
||||
});
|
||||
const stuckDiag = diagnoseExpectedArtifact(unitType, unitId, s.basePath);
|
||||
const stuckRemediation = buildLoopRemediationSteps(unitType, unitId, s.basePath);
|
||||
const stuckParts = [`Stuck on ${unitType} ${unitId} — ${stuckSignal.reason}.`];
|
||||
if (stuckDiag) stuckParts.push(`Expected: ${stuckDiag}`);
|
||||
if (stuckRemediation) stuckParts.push(`To recover:\n${stuckRemediation}`);
|
||||
ctx.ui.notify(stuckParts.join(" "), "error");
|
||||
await deps.stopAuto(
|
||||
ctx,
|
||||
pi,
|
||||
`Stuck: ${stuckSignal.reason}`,
|
||||
);
|
||||
ctx.ui.notify(
|
||||
`Stuck on ${unitType} ${unitId} — ${stuckSignal.reason}. The expected artifact was not written.`,
|
||||
"error",
|
||||
);
|
||||
return { action: "break", reason: "stuck-detected" };
|
||||
}
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -259,6 +259,7 @@ export async function deriveState(basePath: string): Promise<GSDState> {
|
|||
_telemetry.markdownDeriveCount++;
|
||||
}
|
||||
} else {
|
||||
logWarning("state", "DB unavailable — using filesystem state derivation (degraded mode)");
|
||||
result = await _deriveStateImpl(basePath);
|
||||
_telemetry.markdownDeriveCount++;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -217,12 +217,26 @@ describe("workflow-logger", () => {
|
|||
assert.ok(formatted.includes("\n"));
|
||||
});
|
||||
|
||||
test("does not include context in formatted output", () => {
|
||||
test("includes context fields in formatted output", () => {
|
||||
logError("tool", "failed", { cmd: "complete_task" });
|
||||
const entries = drainLogs();
|
||||
const formatted = formatForNotification(entries);
|
||||
assert.equal(formatted, "[tool] failed");
|
||||
assert.ok(!formatted.includes("complete_task"));
|
||||
assert.equal(formatted, "[tool] failed (cmd: complete_task)");
|
||||
});
|
||||
|
||||
test("excludes error key from context to avoid redundancy", () => {
|
||||
logError("tool", "disk write failed", { error: "ENOSPC", path: "/tmp/foo" });
|
||||
const entries = drainLogs();
|
||||
const formatted = formatForNotification(entries);
|
||||
assert.ok(formatted.includes("path: /tmp/foo"));
|
||||
assert.ok(!formatted.includes("error: ENOSPC"));
|
||||
});
|
||||
|
||||
test("formats entry without context unchanged", () => {
|
||||
logError("intercept", "blocked write");
|
||||
const entries = drainLogs();
|
||||
const formatted = formatForNotification(entries);
|
||||
assert.equal(formatted, "[intercept] blocked write");
|
||||
});
|
||||
});
|
||||
|
||||
|
|
@ -279,44 +293,6 @@ describe("workflow-logger", () => {
|
|||
});
|
||||
});
|
||||
|
||||
describe("audit log persistence", () => {
|
||||
let dir: string;
|
||||
|
||||
beforeEach(() => {
|
||||
dir = makeTempDir("wl-audit-");
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
setLogBasePath("");
|
||||
cleanup(dir);
|
||||
});
|
||||
|
||||
test("writes entry to .gsd/audit-log.jsonl after setLogBasePath", () => {
|
||||
setLogBasePath(dir);
|
||||
logError("engine", "audit test entry");
|
||||
|
||||
const auditPath = join(dir, ".gsd", "audit-log.jsonl");
|
||||
assert.ok(existsSync(auditPath), "audit-log.jsonl should exist");
|
||||
const content = readFileSync(auditPath, "utf-8");
|
||||
const entry = JSON.parse(content.trim());
|
||||
assert.equal(entry.severity, "error");
|
||||
assert.equal(entry.component, "engine");
|
||||
assert.equal(entry.message, "audit test entry");
|
||||
});
|
||||
|
||||
test("_resetLogs does not clear the audit base path", () => {
|
||||
setLogBasePath(dir);
|
||||
_resetLogs();
|
||||
logError("engine", "post-reset entry");
|
||||
|
||||
const auditPath = join(dir, ".gsd", "audit-log.jsonl");
|
||||
assert.ok(existsSync(auditPath), "audit-log.jsonl should exist after _resetLogs");
|
||||
const content = readFileSync(auditPath, "utf-8");
|
||||
const entry = JSON.parse(content.trim());
|
||||
assert.equal(entry.message, "post-reset entry");
|
||||
});
|
||||
});
|
||||
|
||||
describe("new log components (db, dispatch)", () => {
|
||||
test("logError with 'db' component stores correct component", () => {
|
||||
logError("db", "failed to copy DB to worktree", { error: "ENOENT" });
|
||||
|
|
|
|||
|
|
@ -292,13 +292,11 @@ export async function handleCompleteSlice(
|
|||
// Toggle roadmap checkbox via renderer module
|
||||
const roadmapToggled = await renderRoadmapCheckboxes(basePath, params.milestoneId);
|
||||
if (!roadmapToggled) {
|
||||
process.stderr.write(
|
||||
`gsd-db: complete_slice — could not find roadmap for ${params.milestoneId}, skipping checkbox toggle\n`,
|
||||
);
|
||||
logWarning("tool", `complete_slice — could not find roadmap for ${params.milestoneId}, skipping checkbox toggle`);
|
||||
}
|
||||
} catch (renderErr) {
|
||||
// Disk render failed — roll back DB status so state stays consistent
|
||||
logWarning("tool", `complete_slice — disk render failed, rolling back DB status: ${(renderErr as Error).message}`);
|
||||
logWarning("tool", `complete_slice — disk render failed for ${params.milestoneId}/${params.sliceId}, rolling back DB status`, { error: (renderErr as Error).message });
|
||||
updateSliceStatus(params.milestoneId, params.sliceId, 'pending');
|
||||
invalidateStateCache();
|
||||
return { error: `disk render failed: ${(renderErr as Error).message}` };
|
||||
|
|
@ -325,7 +323,7 @@ export async function handleCompleteSlice(
|
|||
trigger_reason: params.triggerReason,
|
||||
});
|
||||
} catch (hookErr) {
|
||||
logWarning("tool", `complete-slice post-mutation hook warning: ${(hookErr as Error).message}`);
|
||||
logWarning("tool", `complete-slice post-mutation hook failed for ${params.milestoneId}/${params.sliceId}`, { error: (hookErr as Error).message });
|
||||
}
|
||||
|
||||
return {
|
||||
|
|
|
|||
|
|
@ -174,17 +174,22 @@ export function summarizeLogs(): string | null {
|
|||
|
||||
/**
|
||||
* Format entries for display (used by auto-loop post-unit notification).
|
||||
* Note: context fields are not included in the formatted output.
|
||||
* Includes key context fields (file paths, commands) when present.
|
||||
*/
|
||||
export function formatForNotification(entries: readonly LogEntry[]): string {
|
||||
if (entries.length === 0) return "";
|
||||
if (entries.length === 1) {
|
||||
const e = entries[0];
|
||||
return `[${e.component}] ${e.message}`;
|
||||
}
|
||||
return entries
|
||||
.map((e) => `[${e.component}] ${e.message}`)
|
||||
.join("\n");
|
||||
return entries.map((e) => {
|
||||
let line = `[${e.component}] ${e.message}`;
|
||||
if (e.context) {
|
||||
const ctxParts = Object.entries(e.context)
|
||||
.filter(([k]) => k !== "error") // error is redundant with message
|
||||
.map(([k, v]) => v.includes(",") ? `${k}: "${v}"` : `${k}: ${v}`);
|
||||
if (ctxParts.length > 0) {
|
||||
line += ` (${ctxParts.join(", ")})`;
|
||||
}
|
||||
}
|
||||
return line;
|
||||
}).join("\n");
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -348,7 +348,9 @@ function _reconcileWorktreeLogsInner(
|
|||
if (conflicts.length > 0) {
|
||||
// D-04: atomic all-or-nothing — block entire merge
|
||||
writeConflictsFile(mainBasePath, conflicts, worktreeBasePath);
|
||||
logError("reconcile", `${conflicts.length} conflict(s) detected`, { count: String(conflicts.length), path: join(mainBasePath, ".gsd", "CONFLICTS.md") });
|
||||
const conflictSummary = conflicts.slice(0, 3).map(c => `${c.entityType}:${c.entityId}`).join(", ");
|
||||
const truncated = conflicts.length > 3 ? `... and ${conflicts.length - 3} more` : "";
|
||||
logError("reconcile", `${conflicts.length} conflict(s) detected on ${conflictSummary}${truncated}. Details: .gsd/CONFLICTS.md`, { count: String(conflicts.length), path: join(mainBasePath, ".gsd", "CONFLICTS.md") });
|
||||
return { autoMerged: 0, conflicts };
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue