* fix(ci): address 5 pipeline integrity issues from release audit - version-stamp.mjs: regenerate package-lock.json after dev version stamp (mirrors the same fix applied to bump-version.mjs in #4116) - bump-version.mjs: regenerate root and web/package-lock.json after version bump so both lockfiles are always in sync at release time - pipeline.yml: add post-bump validation step that verifies all package.json files parse as valid JSON before the release commit is made - pipeline.yml: split "Commit, tag, and push" — commit+tag+rebase happen before build, but git push is deferred until after build and npm publish both succeed, preventing a broken tag from landing on main - pipeline.yml: emit a :⚠️: annotation when live LLM tests fail so failures are visible in the Actions UI instead of silently swallowed Closes #4118 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * fix(gsd): address 3 silent-crash secondary issues from #3348 post-#3696 Three gaps that remained after the double-fault fix in #3696: 1. unhandledRejection not wired — installEpipeGuard only registered uncaughtException; promise rejections that escaped without a catch were not handled by the GSD error path. Added _gsdRejectionGuard alongside _gsdEpipeGuard. 2. Non-fatal overcorrection — the #3696 fix replaced re-throwing with log-and-continue, leaving the process running in an indeterminate state after any non-EPIPE/non-ENOENT exception. Replaced with writeCrashLog + process.exit(1). writeCrashLog is extracted into bootstrap/crash-log.ts (zero deps) so tests can import it without pulling in the full extension graph. 3. unit-end not emitted after crash-with-side-effects — hameltomor observed that complete-milestone M001 wrote SUMMARY.md and updated the DB but never emitted unit-end (#3348 comment-4237533440). 
Added emitCrashRecoveredUnitEnd() in crash-recovery.ts: on the next auto-mode startup, if a stale lock references a unit whose unit-start has no matching unit-end in the journal, a synthetic unit-end with status "crash-recovered" is emitted before the lock is cleared. This closes the causal chain for downstream tooling and forensics without requiring changes to the lock file schema. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
3c44e3d4e2
commit
3ff8989a62
5 changed files with 350 additions and 7 deletions
|
|
@ -52,6 +52,7 @@ import {
|
|||
readCrashLock,
|
||||
isLockProcessAlive,
|
||||
formatCrashInfo,
|
||||
emitCrashRecoveredUnitEnd,
|
||||
} from "./crash-recovery.js";
|
||||
import {
|
||||
acquireSessionLock,
|
||||
|
|
@ -1332,6 +1333,10 @@ export async function startAuto(
|
|||
}
|
||||
|
||||
if (freshStartAssessment.lock) {
|
||||
// Emit a synthetic unit-end for any unit-start that has no closing event.
|
||||
// This closes the journal gap reported in #3348 where the worker wrote side
|
||||
// effects (SUMMARY.md, DB updates) but died before emitting unit-end.
|
||||
emitCrashRecoveredUnitEnd(base, freshStartAssessment.lock);
|
||||
clearLock(base);
|
||||
}
|
||||
|
||||
|
|
|
|||
32
src/resources/extensions/gsd/bootstrap/crash-log.ts
Normal file
32
src/resources/extensions/gsd/bootstrap/crash-log.ts
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
/**
|
||||
* crash-log.ts — Write crash diagnostics to ~/.gsd/crash/<timestamp>.log
|
||||
*
|
||||
* Zero cross-dependencies: only uses Node.js built-ins so it can be imported
|
||||
* safely from uncaughtException / unhandledRejection handlers and from tests
|
||||
* without pulling in the full extension dependency tree.
|
||||
*/
|
||||
|
||||
import { appendFileSync, mkdirSync } from "node:fs";
|
||||
import { homedir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
|
||||
/**
|
||||
* Write a crash log to ~/.gsd/crash/<timestamp>.log (or $GSD_HOME/crash/).
|
||||
* Never throws — must be safe to call from any error handler.
|
||||
*/
|
||||
export function writeCrashLog(err: Error, source: string): void {
|
||||
try {
|
||||
const crashDir = join(process.env.GSD_HOME ?? join(homedir(), ".gsd"), "crash");
|
||||
mkdirSync(crashDir, { recursive: true });
|
||||
const ts = new Date().toISOString().replace(/[:.]/g, "-");
|
||||
const logPath = join(crashDir, `${ts}.log`);
|
||||
const lines = [
|
||||
`[gsd] ${source}: ${err.message}`,
|
||||
`timestamp: ${new Date().toISOString()}`,
|
||||
`pid: ${process.pid}`,
|
||||
err.stack ?? "(no stack trace available)",
|
||||
"",
|
||||
];
|
||||
appendFileSync(logPath, lines.join("\n"));
|
||||
} catch { /* never throw from crash handler */ }
|
||||
}
|
||||
|
|
@ -11,6 +11,9 @@ import { registerJournalTools } from "./journal-tools.js";
|
|||
import { registerQueryTools } from "./query-tools.js";
|
||||
import { registerHooks } from "./register-hooks.js";
|
||||
import { registerShortcuts } from "./register-shortcuts.js";
|
||||
import { writeCrashLog } from "./crash-log.js";
|
||||
|
||||
export { writeCrashLog } from "./crash-log.js";
|
||||
|
||||
export function handleRecoverableExtensionProcessError(err: Error): boolean {
|
||||
if ((err as NodeJS.ErrnoException).code === "EPIPE") {
|
||||
|
|
@ -33,16 +36,25 @@ export function handleRecoverableExtensionProcessError(err: Error): boolean {
|
|||
function installEpipeGuard(): void {
|
||||
if (!process.listeners("uncaughtException").some((listener) => listener.name === "_gsdEpipeGuard")) {
|
||||
const _gsdEpipeGuard = (err: Error): void => {
|
||||
if (handleRecoverableExtensionProcessError(err)) {
|
||||
return;
|
||||
}
|
||||
// Log unhandled errors instead of re-throwing — throwing inside an
|
||||
// uncaughtException handler is a fatal double-fault in Node.js (#3163).
|
||||
process.stderr.write(`[gsd] uncaught extension error (non-fatal): ${err.message}\n`);
|
||||
if (err.stack) process.stderr.write(`${err.stack}\n`);
|
||||
if (handleRecoverableExtensionProcessError(err)) return;
|
||||
// Write crash log and exit cleanly for unrecoverable errors.
|
||||
// Logging and continuing was the original double-fault fix (#3163), but
|
||||
// continuing in an indeterminate state is worse than a clean exit (#3348).
|
||||
writeCrashLog(err, "uncaughtException");
|
||||
process.exit(1);
|
||||
};
|
||||
process.on("uncaughtException", _gsdEpipeGuard);
|
||||
}
|
||||
|
||||
if (!process.listeners("unhandledRejection").some((listener) => listener.name === "_gsdRejectionGuard")) {
|
||||
const _gsdRejectionGuard = (reason: unknown, _promise: Promise<unknown>): void => {
|
||||
const err = reason instanceof Error ? reason : new Error(String(reason));
|
||||
if (handleRecoverableExtensionProcessError(err)) return;
|
||||
writeCrashLog(err, "unhandledRejection");
|
||||
process.exit(1);
|
||||
};
|
||||
process.on("unhandledRejection", _gsdRejectionGuard);
|
||||
}
|
||||
}
|
||||
|
||||
export function registerGsdExtension(pi: ExtensionAPI): void {
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ import { join } from "node:path";
|
|||
import { gsdRoot } from "./paths.js";
|
||||
import { atomicWriteSync } from "./atomic-write.js";
|
||||
import { effectiveLockFile } from "./session-lock.js";
|
||||
import { emitJournalEvent, queryJournal } from "./journal.js";
|
||||
|
||||
export interface LockData {
|
||||
pid: number;
|
||||
|
|
@ -118,3 +119,61 @@ export function formatCrashInfo(lock: LockData): string {
|
|||
|
||||
return lines.join("\n");
|
||||
}
|
||||
|
||||
/**
|
||||
* Emit a synthetic unit-end event for a unit that crashed without emitting its own.
|
||||
*
|
||||
* Queries the journal to find the most recent unit-start for the crashed unit.
|
||||
* If a matching unit-end already exists (e.g. the hard timeout fired), this is a
|
||||
* no-op. Called during crash recovery, before clearing the stale lock.
|
||||
*
|
||||
* Addresses the gap reported in #3348 where `unit-start` was emitted but no
|
||||
* `unit-end` followed — side effects landed but the worker died before closeout.
|
||||
*/
|
||||
export function emitCrashRecoveredUnitEnd(basePath: string, lock: LockData): void {
|
||||
// Skip bootstrap / starting pseudo-units — they have no meaningful unit-start event.
|
||||
if (!lock.unitType || !lock.unitId || lock.unitType === "starting") return;
|
||||
|
||||
try {
|
||||
const all = queryJournal(basePath);
|
||||
|
||||
// Find the most recent unit-start for this unitId
|
||||
const starts = all.filter(
|
||||
(e) => e.eventType === "unit-start" && e.data?.unitId === lock.unitId,
|
||||
);
|
||||
if (starts.length === 0) return;
|
||||
|
||||
const lastStart = starts[starts.length - 1];
|
||||
|
||||
// Check if a unit-end was already emitted (e.g. hard timeout fired after the crash)
|
||||
const alreadyClosed = all.some(
|
||||
(e) =>
|
||||
e.eventType === "unit-end" &&
|
||||
e.data?.unitId === lock.unitId &&
|
||||
e.causedBy?.flowId === lastStart.flowId &&
|
||||
e.causedBy?.seq === lastStart.seq,
|
||||
);
|
||||
if (alreadyClosed) return;
|
||||
|
||||
// Find the highest seq in this flow for monotonic ordering
|
||||
const maxSeq = all
|
||||
.filter((e) => e.flowId === lastStart.flowId)
|
||||
.reduce((max, e) => Math.max(max, e.seq), lastStart.seq);
|
||||
|
||||
emitJournalEvent(basePath, {
|
||||
ts: new Date().toISOString(),
|
||||
flowId: lastStart.flowId,
|
||||
seq: maxSeq + 1,
|
||||
eventType: "unit-end",
|
||||
data: {
|
||||
unitType: lock.unitType,
|
||||
unitId: lock.unitId,
|
||||
status: "crash-recovered",
|
||||
artifactVerified: false,
|
||||
},
|
||||
causedBy: { flowId: lastStart.flowId, seq: lastStart.seq },
|
||||
});
|
||||
} catch {
|
||||
// Never throw from crash recovery path — journal failure must not block recovery
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,235 @@
|
|||
/**
|
||||
* Regression tests for #3348 secondary issues — crash handler gaps surfaced after #3696
|
||||
*
|
||||
* 1. register-extension.ts: writeCrashLog writes to ~/.gsd/crash/ directory
|
||||
* 2. register-extension.ts: _gsdRejectionGuard registered for unhandledRejection
|
||||
* 3. register-extension.ts: _gsdEpipeGuard exits with code 1 for unrecoverable errors (no log-and-continue)
|
||||
* 4. crash-recovery.ts: emitCrashRecoveredUnitEnd closes open unit-start journal entries
|
||||
*/
|
||||
|
||||
import { describe, test } from 'node:test';
|
||||
import assert from 'node:assert/strict';
|
||||
import { existsSync, mkdirSync, readFileSync, readdirSync, rmSync } from 'node:fs';
|
||||
import { join } from 'node:path';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { randomUUID } from 'node:crypto';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import { dirname } from 'node:path';
|
||||
|
||||
// Recreate CommonJS-style __filename/__dirname in this ES module so the
// register-extension source can be located relative to this test file.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
|
||||
|
||||
function makeTmpBase(): string {
|
||||
const base = join(tmpdir(), `gsd-test-${randomUUID()}`);
|
||||
mkdirSync(join(base, '.gsd'), { recursive: true });
|
||||
return base;
|
||||
}
|
||||
|
||||
// ─── register-extension source assertions ────────────────────────────────────

// Read the handler source once; several tests below assert on its text
// (assert.match / includes) rather than executing it.
const registerExtSrc = readFileSync(
  join(__dirname, '..', 'bootstrap', 'register-extension.ts'),
  'utf-8',
);
|
||||
|
||||
describe('register-extension crash handler secondary fixes (#3348)', () => {
|
||||
test('writeCrashLog is exported and writes a file to the crash directory', async () => {
|
||||
// Dynamic import so GSD_HOME can be pointed at a temp dir without polluting ~/.gsd
|
||||
const tmpHome = join(tmpdir(), `gsd-crash-test-${randomUUID()}`);
|
||||
const origHome = process.env.GSD_HOME;
|
||||
process.env.GSD_HOME = tmpHome;
|
||||
try {
|
||||
const { writeCrashLog } = await import('../bootstrap/crash-log.ts');
|
||||
const err = new Error('test crash from secondary regression test');
|
||||
writeCrashLog(err, 'uncaughtException');
|
||||
|
||||
const crashDir = join(tmpHome, 'crash');
|
||||
assert.ok(existsSync(crashDir), 'crash directory should be created');
|
||||
|
||||
const logs = readdirSync(crashDir).filter((f) => f.endsWith('.log'));
|
||||
assert.equal(logs.length, 1, 'exactly one crash log should be written');
|
||||
|
||||
const content = readFileSync(join(crashDir, logs[0]), 'utf-8');
|
||||
assert.ok(content.includes('test crash from secondary regression test'), 'log should contain error message');
|
||||
assert.ok(content.includes('uncaughtException'), 'log should identify the source');
|
||||
assert.ok(content.includes('pid:'), 'log should include process pid');
|
||||
} finally {
|
||||
process.env.GSD_HOME = origHome;
|
||||
rmSync(tmpHome, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
||||
test('_gsdRejectionGuard is registered for unhandledRejection', () => {
|
||||
assert.match(
|
||||
registerExtSrc,
|
||||
/_gsdRejectionGuard/,
|
||||
'_gsdRejectionGuard handler should be defined',
|
||||
);
|
||||
assert.match(
|
||||
registerExtSrc,
|
||||
/unhandledRejection/,
|
||||
'installEpipeGuard should register an unhandledRejection handler',
|
||||
);
|
||||
});
|
||||
|
||||
test('_gsdEpipeGuard calls process.exit(1) for unrecoverable errors, not log-and-continue', () => {
|
||||
// The original #3696 fix replaced "throw err" with a log-and-continue.
|
||||
// The secondary fix replaces that with writeCrashLog + process.exit(1).
|
||||
assert.ok(
|
||||
!registerExtSrc.includes('process.stderr.write(`[gsd] uncaught extension error (non-fatal)'),
|
||||
'_gsdEpipeGuard should NOT log errors as non-fatal and continue',
|
||||
);
|
||||
assert.match(
|
||||
registerExtSrc,
|
||||
/process\.exit\(1\)/,
|
||||
'_gsdEpipeGuard should call process.exit(1) for unrecoverable errors',
|
||||
);
|
||||
});
|
||||
|
||||
test('writeCrashLog never throws even when directory is unwritable', async () => {
|
||||
const { writeCrashLog } = await import('../bootstrap/crash-log.ts');
|
||||
const origHome = process.env.GSD_HOME;
|
||||
// Point at a path that will fail to mkdir (e.g. a file that exists as non-dir)
|
||||
const tmpFile = join(tmpdir(), `gsd-not-a-dir-${randomUUID()}`);
|
||||
// Don't create it — mkdirSync with bad path should be caught internally
|
||||
process.env.GSD_HOME = join(tmpFile, 'nested', 'deeply');
|
||||
try {
|
||||
// Should not throw
|
||||
assert.doesNotThrow(() => {
|
||||
writeCrashLog(new Error('should not throw'), 'test');
|
||||
});
|
||||
} finally {
|
||||
process.env.GSD_HOME = origHome;
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
// ─── emitCrashRecoveredUnitEnd ────────────────────────────────────────────────
|
||||
|
||||
// Behavioral tests for the synthetic unit-end emitted during crash recovery.
// Each test builds an isolated journal under a temp base dir via makeTmpBase()
// and uses dynamic imports so module state is resolved per test run.
describe('emitCrashRecoveredUnitEnd (#3348)', () => {
  test('emits synthetic unit-end when unit-start has no matching unit-end', async () => {
    const base = makeTmpBase();
    try {
      const { emitJournalEvent, queryJournal } = await import('../journal.ts');
      const { emitCrashRecoveredUnitEnd } = await import('../crash-recovery.ts');

      const flowId = randomUUID();
      const unitStartSeq = 5;

      // Emit a unit-start with no corresponding unit-end (simulating a crash)
      emitJournalEvent(base, {
        ts: new Date().toISOString(),
        flowId,
        seq: unitStartSeq,
        eventType: 'unit-start',
        data: { unitType: 'execute-task', unitId: 'M001/S01/T01' },
      });

      // Stale lock referencing the crashed unit (pid 99999: presumed-dead process).
      const lock = {
        pid: 99999,
        startedAt: new Date().toISOString(),
        unitType: 'execute-task',
        unitId: 'M001/S01/T01',
        unitStartedAt: new Date().toISOString(),
      };

      emitCrashRecoveredUnitEnd(base, lock);

      // The synthetic unit-end must close the causal chain back to the start.
      const events = queryJournal(base);
      const ends = events.filter((e) => e.eventType === 'unit-end');
      assert.equal(ends.length, 1, 'should emit exactly one unit-end');
      assert.equal(ends[0].data?.unitId, 'M001/S01/T01');
      assert.equal(ends[0].data?.status, 'crash-recovered');
      assert.equal(ends[0].causedBy?.flowId, flowId);
      assert.equal(ends[0].causedBy?.seq, unitStartSeq);
      assert.ok(ends[0].seq > unitStartSeq, 'unit-end seq must be higher than unit-start seq');
    } finally {
      rmSync(base, { recursive: true, force: true });
    }
  });

  test('is a no-op when unit-end was already emitted (e.g. hard timeout fired)', async () => {
    const base = makeTmpBase();
    try {
      const { emitJournalEvent, queryJournal } = await import('../journal.ts');
      const { emitCrashRecoveredUnitEnd } = await import('../crash-recovery.ts');

      const flowId = randomUUID();
      emitJournalEvent(base, {
        ts: new Date().toISOString(),
        flowId,
        seq: 3,
        eventType: 'unit-start',
        data: { unitType: 'plan-slice', unitId: 'M001/S02' },
      });
      // Hard timeout already emitted a unit-end
      emitJournalEvent(base, {
        ts: new Date().toISOString(),
        flowId,
        seq: 4,
        eventType: 'unit-end',
        data: { unitType: 'plan-slice', unitId: 'M001/S02', status: 'cancelled' },
        causedBy: { flowId, seq: 3 },
      });

      const lock = {
        pid: 99999,
        startedAt: new Date().toISOString(),
        unitType: 'plan-slice',
        unitId: 'M001/S02',
        unitStartedAt: new Date().toISOString(),
      };
      emitCrashRecoveredUnitEnd(base, lock);

      // The pre-existing unit-end must survive untouched, with no duplicate.
      const ends = queryJournal(base).filter((e) => e.eventType === 'unit-end');
      assert.equal(ends.length, 1, 'should not emit a duplicate unit-end');
      assert.equal(ends[0].data?.status, 'cancelled', 'original unit-end should be preserved');
    } finally {
      rmSync(base, { recursive: true, force: true });
    }
  });

  test('is a no-op for "starting" pseudo-units (bootstrap crash)', async () => {
    const base = makeTmpBase();
    try {
      const { queryJournal } = await import('../journal.ts');
      const { emitCrashRecoveredUnitEnd } = await import('../crash-recovery.ts');

      // A crash during bootstrap: unitType "starting" has no unit-start event.
      const lock = {
        pid: 99999,
        startedAt: new Date().toISOString(),
        unitType: 'starting',
        unitId: 'bootstrap',
        unitStartedAt: new Date().toISOString(),
      };
      emitCrashRecoveredUnitEnd(base, lock);

      const events = queryJournal(base);
      assert.equal(events.length, 0, 'should emit nothing for starting/bootstrap pseudo-units');
    } finally {
      rmSync(base, { recursive: true, force: true });
    }
  });

  test('is a no-op when no unit-start exists in the journal', async () => {
    const base = makeTmpBase();
    try {
      const { queryJournal } = await import('../journal.ts');
      const { emitCrashRecoveredUnitEnd } = await import('../crash-recovery.ts');

      // Lock references a unit the (empty) journal has never seen.
      const lock = {
        pid: 99999,
        startedAt: new Date().toISOString(),
        unitType: 'execute-task',
        unitId: 'M002/S01/T03',
        unitStartedAt: new Date().toISOString(),
      };
      emitCrashRecoveredUnitEnd(base, lock);

      const events = queryJournal(base);
      assert.equal(events.length, 0, 'should emit nothing when there is no journal entry to close');
    } finally {
      rmSync(base, { recursive: true, force: true });
    }
  });
});
|
||||
Loading…
Add table
Reference in a new issue