From 3ff8989a621c5584ffda31c7b57c346860fefd3b Mon Sep 17 00:00:00 2001 From: Tom Boucher Date: Mon, 13 Apr 2026 12:33:16 -0400 Subject: [PATCH] fix(gsd): address 3 silent-crash secondary issues from #3348 post-#3696 (#4133) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(ci): address 5 pipeline integrity issues from release audit - version-stamp.mjs: regenerate package-lock.json after dev version stamp (mirrors the same fix applied to bump-version.mjs in #4116) - bump-version.mjs: regenerate root and web/package-lock.json after version bump so both lockfiles are always in sync at release time - pipeline.yml: add post-bump validation step that verifies all package.json files parse as valid JSON before the release commit is made - pipeline.yml: split "Commit, tag, and push" — commit+tag+rebase happen before build, but git push is deferred until after build and npm publish both succeed, preventing a broken tag from landing on main - pipeline.yml: emit a ::warning:: annotation when live LLM tests fail so failures are visible in the Actions UI instead of silently swallowed Closes #4118 Co-Authored-By: Claude Sonnet 4.6 * fix(gsd): address 3 silent-crash secondary issues from #3348 post-#3696 Three gaps that remained after the double-fault fix in #3696: 1. unhandledRejection not wired — installEpipeGuard only registered uncaughtException; promise rejections that escaped without a catch were not handled by the GSD error path. Added _gsdRejectionGuard alongside _gsdEpipeGuard. 2. Non-fatal overcorrection — the #3696 fix replaced re-throwing with log-and-continue, leaving the process running in an indeterminate state after any non-EPIPE/non-ENOENT exception. Replaced with writeCrashLog + process.exit(1). writeCrashLog is extracted into bootstrap/crash-log.ts (zero deps) so tests can import it without pulling in the full extension graph. 3. unit-end not emitted after crash-with-side-effects — hameltomor observed that complete-milestone M001 wrote SUMMARY.md and updated the DB but never emitted unit-end (#3348 comment-4237533440). Added emitCrashRecoveredUnitEnd() in crash-recovery.ts: on the next auto-mode startup, if a stale lock references a unit whose unit-start has no matching unit-end in the journal, a synthetic unit-end with status "crash-recovered" is emitted before the lock is cleared. This closes the causal chain for downstream tooling and forensics without requiring changes to the lock file schema. Co-Authored-By: Claude Sonnet 4.6 --------- Co-authored-by: Claude Sonnet 4.6 --- src/resources/extensions/gsd/auto.ts | 5 + .../extensions/gsd/bootstrap/crash-log.ts | 32 +++ .../gsd/bootstrap/register-extension.ts | 26 +- .../extensions/gsd/crash-recovery.ts | 59 +++++ .../gsd/tests/crash-handler-secondary.test.ts | 235 ++++++++++++++++++ 5 files changed, 350 insertions(+), 7 deletions(-) create mode 100644 src/resources/extensions/gsd/bootstrap/crash-log.ts create mode 100644 src/resources/extensions/gsd/tests/crash-handler-secondary.test.ts diff --git a/src/resources/extensions/gsd/auto.ts b/src/resources/extensions/gsd/auto.ts index 47e29c0bc..0ebf3a975 100644 --- a/src/resources/extensions/gsd/auto.ts +++ b/src/resources/extensions/gsd/auto.ts @@ -52,6 +52,7 @@ import { readCrashLock, isLockProcessAlive, formatCrashInfo, + emitCrashRecoveredUnitEnd, } from "./crash-recovery.js"; import { acquireSessionLock, @@ -1332,6 +1333,10 @@ export async function startAuto( } if (freshStartAssessment.lock) { + // Emit a synthetic unit-end for any unit-start that has no closing event. + // This closes the journal gap reported in #3348 where the worker wrote side + // effects (SUMMARY.md, DB updates) but died before emitting unit-end. + emitCrashRecoveredUnitEnd(base, freshStartAssessment.lock); clearLock(base); } diff --git a/src/resources/extensions/gsd/bootstrap/crash-log.ts b/src/resources/extensions/gsd/bootstrap/crash-log.ts new file mode 100644 index 000000000..61278f173 --- /dev/null +++ b/src/resources/extensions/gsd/bootstrap/crash-log.ts @@ -0,0 +1,32 @@ +/** + * crash-log.ts — Write crash diagnostics to ~/.gsd/crash/.log + * + * Zero cross-dependencies: only uses Node.js built-ins so it can be imported + * safely from uncaughtException / unhandledRejection handlers and from tests + * without pulling in the full extension dependency tree. + */ + +import { appendFileSync, mkdirSync } from "node:fs"; +import { homedir } from "node:os"; +import { join } from "node:path"; + +/** + * Write a crash log to ~/.gsd/crash/.log (or $GSD_HOME/crash/). + * Never throws — must be safe to call from any error handler. + */ +export function writeCrashLog(err: Error, source: string): void { + try { + const crashDir = join(process.env.GSD_HOME ?? join(homedir(), ".gsd"), "crash"); + mkdirSync(crashDir, { recursive: true }); + const ts = new Date().toISOString().replace(/[:.]/g, "-"); + const logPath = join(crashDir, `${ts}.log`); + const lines = [ + `[gsd] ${source}: ${err.message}`, + `timestamp: ${new Date().toISOString()}`, + `pid: ${process.pid}`, + err.stack ?? "(no stack trace available)", + "", + ]; + appendFileSync(logPath, lines.join("\n")); + } catch { /* never throw from crash handler */ } +} diff --git a/src/resources/extensions/gsd/bootstrap/register-extension.ts b/src/resources/extensions/gsd/bootstrap/register-extension.ts index 024d4a72d..20c801c0a 100644 --- a/src/resources/extensions/gsd/bootstrap/register-extension.ts +++ b/src/resources/extensions/gsd/bootstrap/register-extension.ts @@ -11,6 +11,9 @@ import { registerJournalTools } from "./journal-tools.js"; import { registerQueryTools } from "./query-tools.js"; import { registerHooks } from "./register-hooks.js"; import { registerShortcuts } from "./register-shortcuts.js"; +import { writeCrashLog } from "./crash-log.js"; + +export { writeCrashLog } from "./crash-log.js"; export function handleRecoverableExtensionProcessError(err: Error): boolean { if ((err as NodeJS.ErrnoException).code === "EPIPE") { @@ -33,16 +36,25 @@ export function handleRecoverableExtensionProcessError(err: Error): boolean { function installEpipeGuard(): void { if (!process.listeners("uncaughtException").some((listener) => listener.name === "_gsdEpipeGuard")) { const _gsdEpipeGuard = (err: Error): void => { - if (handleRecoverableExtensionProcessError(err)) { - return; - } - // Log unhandled errors instead of re-throwing — throwing inside an - // uncaughtException handler is a fatal double-fault in Node.js (#3163). - process.stderr.write(`[gsd] uncaught extension error (non-fatal): ${err.message}\n`); - if (err.stack) process.stderr.write(`${err.stack}\n`); + if (handleRecoverableExtensionProcessError(err)) return; + // Write crash log and exit cleanly for unrecoverable errors. + // Logging and continuing was the original double-fault fix (#3163), but + // continuing in an indeterminate state is worse than a clean exit (#3348). + writeCrashLog(err, "uncaughtException"); + process.exit(1); }; process.on("uncaughtException", _gsdEpipeGuard); } + + if (!process.listeners("unhandledRejection").some((listener) => listener.name === "_gsdRejectionGuard")) { + const _gsdRejectionGuard = (reason: unknown, _promise: Promise): void => { + const err = reason instanceof Error ? reason : new Error(String(reason)); + if (handleRecoverableExtensionProcessError(err)) return; + writeCrashLog(err, "unhandledRejection"); + process.exit(1); + }; + process.on("unhandledRejection", _gsdRejectionGuard); + } } export function registerGsdExtension(pi: ExtensionAPI): void { diff --git a/src/resources/extensions/gsd/crash-recovery.ts b/src/resources/extensions/gsd/crash-recovery.ts index 1b147fead..2f76b61ad 100644 --- a/src/resources/extensions/gsd/crash-recovery.ts +++ b/src/resources/extensions/gsd/crash-recovery.ts @@ -15,6 +15,7 @@ import { join } from "node:path"; import { gsdRoot } from "./paths.js"; import { atomicWriteSync } from "./atomic-write.js"; import { effectiveLockFile } from "./session-lock.js"; +import { emitJournalEvent, queryJournal } from "./journal.js"; export interface LockData { pid: number; @@ -118,3 +119,61 @@ export function formatCrashInfo(lock: LockData): string { return lines.join("\n"); } + +/** + * Emit a synthetic unit-end event for a unit that crashed without emitting its own. + * + * Queries the journal to find the most recent unit-start for the crashed unit. + * If a matching unit-end already exists (e.g. the hard timeout fired), this is a + * no-op. Called during crash recovery, before clearing the stale lock. + * + * Addresses the gap reported in #3348 where `unit-start` was emitted but no + * `unit-end` followed — side effects landed but the worker died before closeout. + */ +export function emitCrashRecoveredUnitEnd(basePath: string, lock: LockData): void { + // Skip bootstrap / starting pseudo-units — they have no meaningful unit-start event. + if (!lock.unitType || !lock.unitId || lock.unitType === "starting") return; + + try { + const all = queryJournal(basePath); + + // Find the most recent unit-start for this unitId + const starts = all.filter( + (e) => e.eventType === "unit-start" && e.data?.unitId === lock.unitId, + ); + if (starts.length === 0) return; + + const lastStart = starts[starts.length - 1]; + + // Check if a unit-end was already emitted (e.g. hard timeout fired after the crash) + const alreadyClosed = all.some( + (e) => + e.eventType === "unit-end" && + e.data?.unitId === lock.unitId && + e.causedBy?.flowId === lastStart.flowId && + e.causedBy?.seq === lastStart.seq, + ); + if (alreadyClosed) return; + + // Find the highest seq in this flow for monotonic ordering + const maxSeq = all + .filter((e) => e.flowId === lastStart.flowId) + .reduce((max, e) => Math.max(max, e.seq), lastStart.seq); + + emitJournalEvent(basePath, { + ts: new Date().toISOString(), + flowId: lastStart.flowId, + seq: maxSeq + 1, + eventType: "unit-end", + data: { + unitType: lock.unitType, + unitId: lock.unitId, + status: "crash-recovered", + artifactVerified: false, + }, + causedBy: { flowId: lastStart.flowId, seq: lastStart.seq }, + }); + } catch { + // Never throw from crash recovery path — journal failure must not block recovery + } +} diff --git a/src/resources/extensions/gsd/tests/crash-handler-secondary.test.ts b/src/resources/extensions/gsd/tests/crash-handler-secondary.test.ts new file mode 100644 index 000000000..bb981f16f --- /dev/null +++ b/src/resources/extensions/gsd/tests/crash-handler-secondary.test.ts @@ -0,0 +1,235 @@ +/** + * Regression tests for #3348 secondary issues — crash handler gaps surfaced after #3696 + * + * 1. register-extension.ts: writeCrashLog writes to ~/.gsd/crash/ directory + * 2. register-extension.ts: _gsdRejectionGuard registered for unhandledRejection + * 3. register-extension.ts: _gsdEpipeGuard exits with code 1 for unrecoverable errors (no log-and-continue) + * 4. crash-recovery.ts: emitCrashRecoveredUnitEnd closes open unit-start journal entries + */ + +import { describe, test } from 'node:test'; +import assert from 'node:assert/strict'; +import { existsSync, mkdirSync, readFileSync, readdirSync, rmSync } from 'node:fs'; +import { join } from 'node:path'; +import { tmpdir } from 'node:os'; +import { randomUUID } from 'node:crypto'; +import { fileURLToPath } from 'node:url'; +import { dirname } from 'node:path'; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +function makeTmpBase(): string { + const base = join(tmpdir(), `gsd-test-${randomUUID()}`); + mkdirSync(join(base, '.gsd'), { recursive: true }); + return base; +} + +// ─── register-extension source assertions ──────────────────────────────────── + +const registerExtSrc = readFileSync( + join(__dirname, '..', 'bootstrap', 'register-extension.ts'), + 'utf-8', +); + +describe('register-extension crash handler secondary fixes (#3348)', () => { + test('writeCrashLog is exported and writes a file to the crash directory', async () => { + // Dynamic import so GSD_HOME can be pointed at a temp dir without polluting ~/.gsd + const tmpHome = join(tmpdir(), `gsd-crash-test-${randomUUID()}`); + const origHome = process.env.GSD_HOME; + process.env.GSD_HOME = tmpHome; + try { + const { writeCrashLog } = await import('../bootstrap/crash-log.ts'); + const err = new Error('test crash from secondary regression test'); + writeCrashLog(err, 'uncaughtException'); + + const crashDir = join(tmpHome, 'crash'); + assert.ok(existsSync(crashDir), 'crash directory should be created'); + + const logs = readdirSync(crashDir).filter((f) => f.endsWith('.log')); + assert.equal(logs.length, 1, 'exactly one crash log should be written'); + + const content = readFileSync(join(crashDir, logs[0]), 'utf-8'); + assert.ok(content.includes('test crash from secondary regression test'), 'log should contain error message'); + assert.ok(content.includes('uncaughtException'), 'log should identify the source'); + assert.ok(content.includes('pid:'), 'log should include process pid'); + } finally { + process.env.GSD_HOME = origHome; + rmSync(tmpHome, { recursive: true, force: true }); + } + }); + + test('_gsdRejectionGuard is registered for unhandledRejection', () => { + assert.match( + registerExtSrc, + /_gsdRejectionGuard/, + '_gsdRejectionGuard handler should be defined', + ); + assert.match( + registerExtSrc, + /unhandledRejection/, + 'installEpipeGuard should register an unhandledRejection handler', + ); + }); + + test('_gsdEpipeGuard calls process.exit(1) for unrecoverable errors, not log-and-continue', () => { + // The original #3696 fix replaced "throw err" with a log-and-continue. + // The secondary fix replaces that with writeCrashLog + process.exit(1). + assert.ok( + !registerExtSrc.includes('process.stderr.write(`[gsd] uncaught extension error (non-fatal)'), + '_gsdEpipeGuard should NOT log errors as non-fatal and continue', + ); + assert.match( + registerExtSrc, + /process\.exit\(1\)/, + '_gsdEpipeGuard should call process.exit(1) for unrecoverable errors', + ); + }); + + test('writeCrashLog never throws even when directory is unwritable', async () => { + const { writeCrashLog } = await import('../bootstrap/crash-log.ts'); + const origHome = process.env.GSD_HOME; + // Point at a path that will fail to mkdir (e.g. a file that exists as non-dir) + const tmpFile = join(tmpdir(), `gsd-not-a-dir-${randomUUID()}`); + // Don't create it — mkdirSync with bad path should be caught internally + process.env.GSD_HOME = join(tmpFile, 'nested', 'deeply'); + try { + // Should not throw + assert.doesNotThrow(() => { + writeCrashLog(new Error('should not throw'), 'test'); + }); + } finally { + process.env.GSD_HOME = origHome; + } + }); +}); + +// ─── emitCrashRecoveredUnitEnd ──────────────────────────────────────────────── + +describe('emitCrashRecoveredUnitEnd (#3348)', () => { + test('emits synthetic unit-end when unit-start has no matching unit-end', async () => { + const base = makeTmpBase(); + try { + const { emitJournalEvent, queryJournal } = await import('../journal.ts'); + const { emitCrashRecoveredUnitEnd } = await import('../crash-recovery.ts'); + + const flowId = randomUUID(); + const unitStartSeq = 5; + + // Emit a unit-start with no corresponding unit-end (simulating a crash) + emitJournalEvent(base, { + ts: new Date().toISOString(), + flowId, + seq: unitStartSeq, + eventType: 'unit-start', + data: { unitType: 'execute-task', unitId: 'M001/S01/T01' }, + }); + + const lock = { + pid: 99999, + startedAt: new Date().toISOString(), + unitType: 'execute-task', + unitId: 'M001/S01/T01', + unitStartedAt: new Date().toISOString(), + }; + + emitCrashRecoveredUnitEnd(base, lock); + + const events = queryJournal(base); + const ends = events.filter((e) => e.eventType === 'unit-end'); + assert.equal(ends.length, 1, 'should emit exactly one unit-end'); + assert.equal(ends[0].data?.unitId, 'M001/S01/T01'); + assert.equal(ends[0].data?.status, 'crash-recovered'); + assert.equal(ends[0].causedBy?.flowId, flowId); + assert.equal(ends[0].causedBy?.seq, unitStartSeq); + assert.ok(ends[0].seq > unitStartSeq, 'unit-end seq must be higher than unit-start seq'); + } finally { + rmSync(base, { recursive: true, force: true }); + } + }); + + test('is a no-op when unit-end was already emitted (e.g. hard timeout fired)', async () => { + const base = makeTmpBase(); + try { + const { emitJournalEvent, queryJournal } = await import('../journal.ts'); + const { emitCrashRecoveredUnitEnd } = await import('../crash-recovery.ts'); + + const flowId = randomUUID(); + emitJournalEvent(base, { + ts: new Date().toISOString(), + flowId, + seq: 3, + eventType: 'unit-start', + data: { unitType: 'plan-slice', unitId: 'M001/S02' }, + }); + // Hard timeout already emitted a unit-end + emitJournalEvent(base, { + ts: new Date().toISOString(), + flowId, + seq: 4, + eventType: 'unit-end', + data: { unitType: 'plan-slice', unitId: 'M001/S02', status: 'cancelled' }, + causedBy: { flowId, seq: 3 }, + }); + + const lock = { + pid: 99999, + startedAt: new Date().toISOString(), + unitType: 'plan-slice', + unitId: 'M001/S02', + unitStartedAt: new Date().toISOString(), + }; + emitCrashRecoveredUnitEnd(base, lock); + + const ends = queryJournal(base).filter((e) => e.eventType === 'unit-end'); + assert.equal(ends.length, 1, 'should not emit a duplicate unit-end'); + assert.equal(ends[0].data?.status, 'cancelled', 'original unit-end should be preserved'); + } finally { + rmSync(base, { recursive: true, force: true }); + } + }); + + test('is a no-op for "starting" pseudo-units (bootstrap crash)', async () => { + const base = makeTmpBase(); + try { + const { queryJournal } = await import('../journal.ts'); + const { emitCrashRecoveredUnitEnd } = await import('../crash-recovery.ts'); + + const lock = { + pid: 99999, + startedAt: new Date().toISOString(), + unitType: 'starting', + unitId: 'bootstrap', + unitStartedAt: new Date().toISOString(), + }; + emitCrashRecoveredUnitEnd(base, lock); + + const events = queryJournal(base); + assert.equal(events.length, 0, 'should emit nothing for starting/bootstrap pseudo-units'); + } finally { + rmSync(base, { recursive: true, force: true }); + } + }); + + test('is a no-op when no unit-start exists in the journal', async () => { + const base = makeTmpBase(); + try { + const { queryJournal } = await import('../journal.ts'); + const { emitCrashRecoveredUnitEnd } = await import('../crash-recovery.ts'); + + const lock = { + pid: 99999, + startedAt: new Date().toISOString(), + unitType: 'execute-task', + unitId: 'M002/S01/T03', + unitStartedAt: new Date().toISOString(), + }; + emitCrashRecoveredUnitEnd(base, lock); + + const events = queryJournal(base); + assert.equal(events.length, 0, 'should emit nothing when there is no journal entry to close'); + } finally { + rmSync(base, { recursive: true, force: true }); + } + }); +});