fix(gsd): address 3 silent-crash secondary issues from #3348 post-#3696 (#4133)

* fix(ci): address 5 pipeline integrity issues from release audit

- version-stamp.mjs: regenerate package-lock.json after dev version stamp
  (mirrors the same fix applied to bump-version.mjs in #4116)

- bump-version.mjs: regenerate root and web/package-lock.json after version
  bump so both lockfiles are always in sync at release time

- pipeline.yml: add post-bump validation step that verifies all package.json
  files parse as valid JSON before the release commit is made

- pipeline.yml: split "Commit, tag, and push" — commit+tag+rebase happen
  before build, but git push is deferred until after build and npm publish
  both succeed, preventing a broken tag from landing on main

- pipeline.yml: emit a `::warning::` annotation when live LLM tests fail so
  failures are visible in the Actions UI instead of being silently swallowed

Closes #4118

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix(gsd): address 3 silent-crash secondary issues from #3348 post-#3696

Three gaps that remained after the double-fault fix in #3696:

1. unhandledRejection not wired — installEpipeGuard only registered
   uncaughtException; promise rejections that escaped without a catch
   were not handled by the GSD error path. Added _gsdRejectionGuard
   alongside _gsdEpipeGuard.

2. Non-fatal overcorrection — the #3696 fix replaced re-throwing with
   log-and-continue, leaving the process running in an indeterminate
   state after any non-EPIPE/non-ENOENT exception. Replaced with
   writeCrashLog + process.exit(1). writeCrashLog is extracted into
   bootstrap/crash-log.ts (zero deps) so tests can import it without
   pulling in the full extension graph.

3. unit-end not emitted after crash-with-side-effects — hameltomor
   observed that complete-milestone M001 wrote SUMMARY.md and updated
   the DB but never emitted unit-end (#3348 comment-4237533440). Added
   emitCrashRecoveredUnitEnd() in crash-recovery.ts: on the next
   auto-mode startup, if a stale lock references a unit whose
   unit-start has no matching unit-end in the journal, a synthetic
   unit-end with status "crash-recovered" is emitted before the lock
   is cleared. This closes the causal chain for downstream tooling
   and forensics without requiring changes to the lock file schema.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Tom Boucher 2026-04-13 12:33:16 -04:00 committed by GitHub
parent 3c44e3d4e2
commit 3ff8989a62
5 changed files with 350 additions and 7 deletions

View file

@ -52,6 +52,7 @@ import {
readCrashLock,
isLockProcessAlive,
formatCrashInfo,
emitCrashRecoveredUnitEnd,
} from "./crash-recovery.js";
import {
acquireSessionLock,
@ -1332,6 +1333,10 @@ export async function startAuto(
}
if (freshStartAssessment.lock) {
// Emit a synthetic unit-end for any unit-start that has no closing event.
// This closes the journal gap reported in #3348 where the worker wrote side
// effects (SUMMARY.md, DB updates) but died before emitting unit-end.
emitCrashRecoveredUnitEnd(base, freshStartAssessment.lock);
clearLock(base);
}

View file

@ -0,0 +1,32 @@
/**
 * crash-log.ts — Write crash diagnostics to ~/.gsd/crash/<timestamp>.log
*
* Zero cross-dependencies: only uses Node.js built-ins so it can be imported
* safely from uncaughtException / unhandledRejection handlers and from tests
* without pulling in the full extension dependency tree.
*/
import { appendFileSync, mkdirSync } from "node:fs";
import { homedir } from "node:os";
import { join } from "node:path";
/**
* Write a crash log to ~/.gsd/crash/<timestamp>.log (or $GSD_HOME/crash/).
 * Never throws — must be safe to call from any error handler.
*/
export function writeCrashLog(err: Error, source: string): void {
  try {
    const gsdHome = process.env.GSD_HOME ?? join(homedir(), ".gsd");
    const crashDir = join(gsdHome, "crash");
    mkdirSync(crashDir, { recursive: true });
    // The ISO timestamp doubles as the filename — replace ':' and '.' since
    // they are not safe in filenames on all platforms.
    const stamp = new Date().toISOString().replace(/[:.]/g, "-");
    const body =
      `[gsd] ${source}: ${err.message}\n` +
      `timestamp: ${new Date().toISOString()}\n` +
      `pid: ${process.pid}\n` +
      `${err.stack ?? "(no stack trace available)"}\n`;
    appendFileSync(join(crashDir, `${stamp}.log`), body);
  } catch {
    // Deliberately swallow everything — a crash handler must never itself throw.
  }
}

View file

@ -11,6 +11,9 @@ import { registerJournalTools } from "./journal-tools.js";
import { registerQueryTools } from "./query-tools.js";
import { registerHooks } from "./register-hooks.js";
import { registerShortcuts } from "./register-shortcuts.js";
import { writeCrashLog } from "./crash-log.js";
export { writeCrashLog } from "./crash-log.js";
export function handleRecoverableExtensionProcessError(err: Error): boolean {
if ((err as NodeJS.ErrnoException).code === "EPIPE") {
@ -33,16 +36,25 @@ export function handleRecoverableExtensionProcessError(err: Error): boolean {
/**
 * Install process-level guards for uncaught exceptions and unhandled promise
 * rejections. Recoverable errors (per handleRecoverableExtensionProcessError)
 * are absorbed; anything else is persisted via writeCrashLog and the process
 * exits with code 1. Idempotent: each guard is registered at most once,
 * matched by the listener function's name.
 *
 * Note: the stale pre-#3348 "log as non-fatal and continue" lines (duplicated
 * guard call + stderr writes) are removed here — they ran before the exit
 * path and reinstated exactly the indeterminate-state behavior this fix
 * eliminates (the regression test asserts that stderr string is absent).
 */
function installEpipeGuard(): void {
  if (!process.listeners("uncaughtException").some((listener) => listener.name === "_gsdEpipeGuard")) {
    const _gsdEpipeGuard = (err: Error): void => {
      if (handleRecoverableExtensionProcessError(err)) return;
      // Write crash log and exit cleanly for unrecoverable errors.
      // Logging and continuing was the original double-fault fix (#3163), but
      // continuing in an indeterminate state is worse than a clean exit (#3348).
      writeCrashLog(err, "uncaughtException");
      process.exit(1);
    };
    process.on("uncaughtException", _gsdEpipeGuard);
  }
  if (!process.listeners("unhandledRejection").some((listener) => listener.name === "_gsdRejectionGuard")) {
    const _gsdRejectionGuard = (reason: unknown, _promise: Promise<unknown>): void => {
      // Rejection reasons are not guaranteed to be Errors — normalize first.
      const err = reason instanceof Error ? reason : new Error(String(reason));
      if (handleRecoverableExtensionProcessError(err)) return;
      writeCrashLog(err, "unhandledRejection");
      process.exit(1);
    };
    process.on("unhandledRejection", _gsdRejectionGuard);
  }
}
export function registerGsdExtension(pi: ExtensionAPI): void {

View file

@ -15,6 +15,7 @@ import { join } from "node:path";
import { gsdRoot } from "./paths.js";
import { atomicWriteSync } from "./atomic-write.js";
import { effectiveLockFile } from "./session-lock.js";
import { emitJournalEvent, queryJournal } from "./journal.js";
export interface LockData {
pid: number;
@ -118,3 +119,61 @@ export function formatCrashInfo(lock: LockData): string {
return lines.join("\n");
}
/**
* Emit a synthetic unit-end event for a unit that crashed without emitting its own.
*
* Queries the journal to find the most recent unit-start for the crashed unit.
* If a matching unit-end already exists (e.g. the hard timeout fired), this is a
* no-op. Called during crash recovery, before clearing the stale lock.
*
 * Addresses the gap reported in #3348 where `unit-start` was emitted but no
 * `unit-end` followed — side effects landed but the worker died before closeout.
*/
export function emitCrashRecoveredUnitEnd(basePath: string, lock: LockData): void {
  // Bootstrap/"starting" pseudo-units never emit a real unit-start, so there
  // is nothing to close for them.
  if (!lock.unitType || !lock.unitId || lock.unitType === "starting") return;
  try {
    const events = queryJournal(basePath);
    // Walk the journal once, remembering the most recent unit-start for this unit.
    let latest;
    for (const e of events) {
      if (e.eventType === "unit-start" && e.data?.unitId === lock.unitId) {
        latest = e;
      }
    }
    if (latest === undefined) return;
    const start = latest;
    // A unit-end already pointing back at this exact unit-start (e.g. the hard
    // timeout fired after the crash) means the gap is closed — nothing to do.
    const alreadyClosed = events.some(
      (e) =>
        e.eventType === "unit-end" &&
        e.data?.unitId === lock.unitId &&
        e.causedBy?.flowId === start.flowId &&
        e.causedBy?.seq === start.seq,
    );
    if (alreadyClosed) return;
    // Choose a seq strictly above every event already in this flow so the
    // synthetic event keeps monotonic ordering.
    let highestSeq = start.seq;
    for (const e of events) {
      if (e.flowId === start.flowId && e.seq > highestSeq) {
        highestSeq = e.seq;
      }
    }
    emitJournalEvent(basePath, {
      ts: new Date().toISOString(),
      flowId: start.flowId,
      seq: highestSeq + 1,
      eventType: "unit-end",
      data: {
        unitType: lock.unitType,
        unitId: lock.unitId,
        status: "crash-recovered",
        artifactVerified: false,
      },
      causedBy: { flowId: start.flowId, seq: start.seq },
    });
  } catch {
    // Never throw from the crash-recovery path — a journal failure must not
    // block recovery itself.
  }
}

View file

@ -0,0 +1,235 @@
/**
 * Regression tests for #3348 secondary issues — crash handler gaps surfaced after #3696
*
* 1. register-extension.ts: writeCrashLog writes to ~/.gsd/crash/ directory
* 2. register-extension.ts: _gsdRejectionGuard registered for unhandledRejection
* 3. register-extension.ts: _gsdEpipeGuard exits with code 1 for unrecoverable errors (no log-and-continue)
* 4. crash-recovery.ts: emitCrashRecoveredUnitEnd closes open unit-start journal entries
*/
import { describe, test } from 'node:test';
import assert from 'node:assert/strict';
import { existsSync, mkdirSync, readFileSync, readdirSync, rmSync } from 'node:fs';
import { join } from 'node:path';
import { tmpdir } from 'node:os';
import { randomUUID } from 'node:crypto';
import { fileURLToPath } from 'node:url';
import { dirname } from 'node:path';
// ESM modules have no __filename/__dirname builtins — derive them from import.meta.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
/** Create an isolated temp workspace that already contains an empty `.gsd/` dir. */
function makeTmpBase(): string {
  const root = join(tmpdir(), `gsd-test-${randomUUID()}`);
  mkdirSync(join(root, '.gsd'), { recursive: true });
  return root;
}
// ─── register-extension source assertions ────────────────────────────────────
// Source text of register-extension.ts, asserted against below — the guard
// wiring is not observable at runtime without actually crashing the process.
const registerExtensionPath = join(__dirname, '..', 'bootstrap', 'register-extension.ts');
const registerExtSrc = readFileSync(registerExtensionPath, 'utf-8');
// Covers items 1-3 of the header: crash-log file output, rejection-guard
// registration, and exit-on-unrecoverable. The guard tests assert on source
// text (registerExtSrc) because installing the real handlers and triggering
// them would terminate the test process.
describe('register-extension crash handler secondary fixes (#3348)', () => {
  test('writeCrashLog is exported and writes a file to the crash directory', async () => {
    // Dynamic import so GSD_HOME can be pointed at a temp dir without polluting ~/.gsd
    const tmpHome = join(tmpdir(), `gsd-crash-test-${randomUUID()}`);
    const origHome = process.env.GSD_HOME;
    process.env.GSD_HOME = tmpHome;
    try {
      const { writeCrashLog } = await import('../bootstrap/crash-log.ts');
      const err = new Error('test crash from secondary regression test');
      writeCrashLog(err, 'uncaughtException');
      const crashDir = join(tmpHome, 'crash');
      assert.ok(existsSync(crashDir), 'crash directory should be created');
      const logs = readdirSync(crashDir).filter((f) => f.endsWith('.log'));
      assert.equal(logs.length, 1, 'exactly one crash log should be written');
      const content = readFileSync(join(crashDir, logs[0]), 'utf-8');
      assert.ok(content.includes('test crash from secondary regression test'), 'log should contain error message');
      assert.ok(content.includes('uncaughtException'), 'log should identify the source');
      assert.ok(content.includes('pid:'), 'log should include process pid');
    } finally {
      // Restore the env var before removing the temp dir so later tests see the
      // original GSD_HOME even if rmSync throws.
      process.env.GSD_HOME = origHome;
      rmSync(tmpHome, { recursive: true, force: true });
    }
  });
  test('_gsdRejectionGuard is registered for unhandledRejection', () => {
    // Source-text assertion: confirms the handler exists without installing it.
    assert.match(
      registerExtSrc,
      /_gsdRejectionGuard/,
      '_gsdRejectionGuard handler should be defined',
    );
    assert.match(
      registerExtSrc,
      /unhandledRejection/,
      'installEpipeGuard should register an unhandledRejection handler',
    );
  });
  test('_gsdEpipeGuard calls process.exit(1) for unrecoverable errors, not log-and-continue', () => {
    // The original #3696 fix replaced "throw err" with a log-and-continue.
    // The secondary fix replaces that with writeCrashLog + process.exit(1).
    assert.ok(
      !registerExtSrc.includes('process.stderr.write(`[gsd] uncaught extension error (non-fatal)'),
      '_gsdEpipeGuard should NOT log errors as non-fatal and continue',
    );
    assert.match(
      registerExtSrc,
      /process\.exit\(1\)/,
      '_gsdEpipeGuard should call process.exit(1) for unrecoverable errors',
    );
  });
  test('writeCrashLog never throws even when directory is unwritable', async () => {
    const { writeCrashLog } = await import('../bootstrap/crash-log.ts');
    const origHome = process.env.GSD_HOME;
    // Point at a path that will fail to mkdir (e.g. a file that exists as non-dir)
    const tmpFile = join(tmpdir(), `gsd-not-a-dir-${randomUUID()}`);
    // Don't create it — mkdirSync with bad path should be caught internally
    process.env.GSD_HOME = join(tmpFile, 'nested', 'deeply');
    try {
      // Should not throw
      assert.doesNotThrow(() => {
        writeCrashLog(new Error('should not throw'), 'test');
      });
    } finally {
      process.env.GSD_HOME = origHome;
    }
  });
});
// ─── emitCrashRecoveredUnitEnd ────────────────────────────────────────────────
// Covers item 4 of the header: the crash-recovery journal close-out. Each test
// builds a fresh journal under a temp base dir, drives emitCrashRecoveredUnitEnd
// with a hand-built lock, and inspects the resulting journal events.
describe('emitCrashRecoveredUnitEnd (#3348)', () => {
  test('emits synthetic unit-end when unit-start has no matching unit-end', async () => {
    const base = makeTmpBase();
    try {
      const { emitJournalEvent, queryJournal } = await import('../journal.ts');
      const { emitCrashRecoveredUnitEnd } = await import('../crash-recovery.ts');
      const flowId = randomUUID();
      const unitStartSeq = 5;
      // Emit a unit-start with no corresponding unit-end (simulating a crash)
      emitJournalEvent(base, {
        ts: new Date().toISOString(),
        flowId,
        seq: unitStartSeq,
        eventType: 'unit-start',
        data: { unitType: 'execute-task', unitId: 'M001/S01/T01' },
      });
      // Stale lock left behind by the dead worker; pid 99999 stands in for a
      // process that is no longer alive.
      const lock = {
        pid: 99999,
        startedAt: new Date().toISOString(),
        unitType: 'execute-task',
        unitId: 'M001/S01/T01',
        unitStartedAt: new Date().toISOString(),
      };
      emitCrashRecoveredUnitEnd(base, lock);
      const events = queryJournal(base);
      const ends = events.filter((e) => e.eventType === 'unit-end');
      assert.equal(ends.length, 1, 'should emit exactly one unit-end');
      assert.equal(ends[0].data?.unitId, 'M001/S01/T01');
      assert.equal(ends[0].data?.status, 'crash-recovered');
      // causedBy must point back at the unit-start so the causal chain closes
      assert.equal(ends[0].causedBy?.flowId, flowId);
      assert.equal(ends[0].causedBy?.seq, unitStartSeq);
      assert.ok(ends[0].seq > unitStartSeq, 'unit-end seq must be higher than unit-start seq');
    } finally {
      rmSync(base, { recursive: true, force: true });
    }
  });
  test('is a no-op when unit-end was already emitted (e.g. hard timeout fired)', async () => {
    const base = makeTmpBase();
    try {
      const { emitJournalEvent, queryJournal } = await import('../journal.ts');
      const { emitCrashRecoveredUnitEnd } = await import('../crash-recovery.ts');
      const flowId = randomUUID();
      emitJournalEvent(base, {
        ts: new Date().toISOString(),
        flowId,
        seq: 3,
        eventType: 'unit-start',
        data: { unitType: 'plan-slice', unitId: 'M001/S02' },
      });
      // Hard timeout already emitted a unit-end
      emitJournalEvent(base, {
        ts: new Date().toISOString(),
        flowId,
        seq: 4,
        eventType: 'unit-end',
        data: { unitType: 'plan-slice', unitId: 'M001/S02', status: 'cancelled' },
        causedBy: { flowId, seq: 3 },
      });
      const lock = {
        pid: 99999,
        startedAt: new Date().toISOString(),
        unitType: 'plan-slice',
        unitId: 'M001/S02',
        unitStartedAt: new Date().toISOString(),
      };
      emitCrashRecoveredUnitEnd(base, lock);
      const ends = queryJournal(base).filter((e) => e.eventType === 'unit-end');
      assert.equal(ends.length, 1, 'should not emit a duplicate unit-end');
      assert.equal(ends[0].data?.status, 'cancelled', 'original unit-end should be preserved');
    } finally {
      rmSync(base, { recursive: true, force: true });
    }
  });
  test('is a no-op for "starting" pseudo-units (bootstrap crash)', async () => {
    const base = makeTmpBase();
    try {
      const { queryJournal } = await import('../journal.ts');
      const { emitCrashRecoveredUnitEnd } = await import('../crash-recovery.ts');
      // "starting" pseudo-units have no real unit-start; the guard clause in
      // emitCrashRecoveredUnitEnd must return before touching the journal.
      const lock = {
        pid: 99999,
        startedAt: new Date().toISOString(),
        unitType: 'starting',
        unitId: 'bootstrap',
        unitStartedAt: new Date().toISOString(),
      };
      emitCrashRecoveredUnitEnd(base, lock);
      const events = queryJournal(base);
      assert.equal(events.length, 0, 'should emit nothing for starting/bootstrap pseudo-units');
    } finally {
      rmSync(base, { recursive: true, force: true });
    }
  });
  test('is a no-op when no unit-start exists in the journal', async () => {
    const base = makeTmpBase();
    try {
      const { queryJournal } = await import('../journal.ts');
      const { emitCrashRecoveredUnitEnd } = await import('../crash-recovery.ts');
      // Journal is empty: there is no unit-start entry to close for this lock.
      const lock = {
        pid: 99999,
        startedAt: new Date().toISOString(),
        unitType: 'execute-task',
        unitId: 'M002/S01/T03',
        unitStartedAt: new Date().toISOString(),
      };
      emitCrashRecoveredUnitEnd(base, lock);
      const events = queryJournal(base);
      assert.equal(events.length, 0, 'should emit nothing when there is no journal entry to close');
    } finally {
      rmSync(base, { recursive: true, force: true });
    }
  });
});