feat: auto-restart headless mode on crash with exponential backoff (#886) (#897)

When the headless child process crashes or errors out, auto-restart
with exponential backoff (5s, 10s, 15s... up to 30s) instead of
exiting immediately. This enables overnight 'fire and forget' runs.

- --max-restarts N (default 3, 0 to disable): controls restart budget
- Only restarts on crashes (exit code !== 0), not on success or blocked
- SIGINT/SIGTERM bypasses restart (user intent to stop)
- Restart count shown in summary output
- Backoff prevents rapid crash loops from burning API credits

The inner loop function (runHeadlessOnce) returns exit status instead
of calling process.exit, letting the outer loop decide whether to
restart or terminate.

This is the first step toward the 'absolute autonomy' goal described
in #886 — process-level resilience for long-running sessions.
This commit is contained in:
Tom Boucher 2026-03-17 11:35:33 -04:00 committed by GitHub
parent d804c759bd
commit 01a6294b23

View file

@ -33,6 +33,7 @@ export interface HeadlessOptions {
contextText?: string // inline text
auto?: boolean // chain into auto-mode after milestone creation
verbose?: boolean // show tool calls in output
maxRestarts?: number // auto-restart on crash (default 3, 0 to disable)
}
interface ExtensionUIRequest {
@ -92,6 +93,12 @@ export function parseHeadlessArgs(argv: string[]): HeadlessOptions {
options.auto = true
} else if (arg === '--verbose') {
options.verbose = true
} else if (arg === '--max-restarts' && i + 1 < args.length) {
options.maxRestarts = parseInt(args[++i], 10)
if (Number.isNaN(options.maxRestarts) || options.maxRestarts < 0) {
process.stderr.write('[headless] Error: --max-restarts must be a non-negative integer\n')
process.exit(1)
}
}
} else if (!positionalStarted) {
positionalStarted = true
@ -279,6 +286,37 @@ function bootstrapGsdProject(basePath: string): void {
}
export async function runHeadless(options: HeadlessOptions): Promise<void> {
const maxRestarts = options.maxRestarts ?? 3
let restartCount = 0
while (true) {
const result = await runHeadlessOnce(options, restartCount)
// Success or blocked — exit normally
if (result.exitCode === 0 || result.exitCode === 2) {
process.exit(result.exitCode)
}
// Crash/error — check if we should restart
if (restartCount >= maxRestarts) {
process.stderr.write(`[headless] Max restarts (${maxRestarts}) reached. Exiting.\n`)
process.exit(result.exitCode)
}
// Don't restart if SIGINT/SIGTERM was received
if (result.interrupted) {
process.exit(result.exitCode)
}
restartCount++
const backoffMs = Math.min(5000 * restartCount, 30_000)
process.stderr.write(`[headless] Restarting in ${(backoffMs / 1000).toFixed(0)}s (attempt ${restartCount}/${maxRestarts})...\n`)
await new Promise(resolve => setTimeout(resolve, backoffMs))
}
}
async function runHeadlessOnce(options: HeadlessOptions, restartCount: number): Promise<{ exitCode: number; interrupted: boolean }> {
let interrupted = false
const startTime = Date.now()
const isNewMilestone = options.command === 'new-milestone'
@ -452,6 +490,7 @@ export async function runHeadless(options: HeadlessOptions): Promise<void> {
// Signal handling
const signalHandler = () => {
process.stderr.write('\n[headless] Interrupted, stopping child process...\n')
interrupted = true
exitCode = 1
client.stop().finally(() => {
clearTimeout(timeoutTimer)
@ -553,6 +592,9 @@ export async function runHeadless(options: HeadlessOptions): Promise<void> {
process.stderr.write(`[headless] Status: ${status}\n`)
process.stderr.write(`[headless] Duration: ${duration}s\n`)
process.stderr.write(`[headless] Events: ${totalEvents} total, ${toolCallCount} tool calls\n`)
if (restartCount > 0) {
process.stderr.write(`[headless] Restarts: ${restartCount}\n`)
}
// On failure, print last 5 events for diagnostics
if (exitCode !== 0) {
@ -565,5 +607,5 @@ export async function runHeadless(options: HeadlessOptions): Promise<void> {
}
}
process.exit(exitCode)
return { exitCode, interrupted }
}