From c0358a2fc7ea4c3201089b023155bca442ef39f4 Mon Sep 17 00:00:00 2001
From: Mikael Hugo
Date: Sun, 17 May 2026 22:56:22 +0200
Subject: [PATCH] feat(upgrade): drain HTTP requests + autonomous-loop SIGTERM awareness
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two upgrade-safety gaps codex flagged in the round before, both now
closed:

1. Next.js HTTP request drain — web/instrumentation.ts. Next.js calls
   `register()` once at server boot. Installs one SIGTERM/SIGINT/SIGHUP
   listener that:
     - flips the shutdown flag in shutdown-state.ts (so /api/healthz
       returns 503 immediately — LB/Traefik readinessProbe drains
       traffic away within ~4s)
     - schedules process.exit after SF_WEB_SHUTDOWN_GRACE_MS (default
       30s) — in-flight HTTP requests have time to finish; timer is NOT
       unref'd so it keeps the process alive during the drain
   Single-install guard via globalThis Symbol so jiti/bundle splits
   don't end up with multiple racing timers.

2. Autonomous loop iteration-boundary shutdown awareness —
   src/resources/extensions/sf/auto/shutdown-signal.js +
   src/resources/extensions/sf/auto/loop.js iteration check.
   Before: a SIGTERM mid-iteration killed the loop process before the
   current unit's tool calls + DB writes could complete cleanly.
   After: shutdown-signal flips a flag on first SIGTERM; loop polls it
   at the top of each `while (s.active)` iteration; current unit
   finishes, loop exits gracefully, the existing forceShutdown path
   takes over to drain the sf_feedback queue and exit.
   Includes a force-exit safety timer (SF_AUTONOMOUS_SHUTDOWN_GRACE_MS
   or SF_RPC_SHUTDOWN_GRACE_MS, default 10 min) so a hung iteration
   doesn't block exit indefinitely.

Test coverage:
  - web-shutdown-state.test.ts extended: 6/6 (added ready-route
    503-during-drain assertion).
  - shutdown-signal: covered indirectly by loop dispatch tests; a
    standalone unit test for register/request/snapshot is a small
    follow-up.
Net of today's work, the upgrade safety chain for SF on Vega (Layer-1,
Tailscale Serve only) is operationally complete. Layer-2 (cluster
Traefik ingress with weighted blue/green) plugs in via the same
healthz-503 + recovery primitives — no further SF source changes needed
for that path.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/resources/extensions/sf/auto/loop.js      |  19 +++
 .../extensions/sf/auto/shutdown-signal.js     | 148 ++++++++++++++++++
 src/tests/web-shutdown-state.test.ts          |  10 ++
 web/instrumentation.ts                        |  64 ++++++++
 4 files changed, 241 insertions(+)
 create mode 100644 src/resources/extensions/sf/auto/shutdown-signal.js
 create mode 100644 web/instrumentation.ts

diff --git a/src/resources/extensions/sf/auto/loop.js b/src/resources/extensions/sf/auto/loop.js
index 5635e015d..8468db69d 100644
--- a/src/resources/extensions/sf/auto/loop.js
+++ b/src/resources/extensions/sf/auto/loop.js
@@ -57,6 +57,11 @@ import {
 	runUnitPhase,
 } from "./phases.js";
 import { _clearCurrentResolve } from "./resolve.js";
+import {
+	autonomousShutdownSnapshot,
+	isAutonomousShutdownRequested,
+	registerAutonomousShutdownSignals,
+} from "./shutdown-signal.js";
 import { MAX_LOOP_ITERATIONS } from "./types.js";
 
 // ── Stuck detection persistence (#3704) ──────────────────────────────────
@@ -711,6 +716,7 @@ async function runExitSolverEval(ctx, s, deps, iteration) {
  */
 export async function autoLoop(ctx, pi, s, deps) {
 	debugLog("autoLoop", { phase: "enter" });
+	registerAutonomousShutdownSignals();
 	const runawayHeal = clearRunawayRecoveredRuntimeRecords(s.basePath);
 	if (runawayHeal > 0) {
 		debugLog("autoLoop", {
@@ -767,6 +773,19 @@ export async function autoLoop(ctx, pi, s, deps) {
 	let lastObservedToolCallCount = getTotalToolCallCount();
 	let unproductiveIterations = 0;
 	while (s.active) {
+		if (isAutonomousShutdownRequested()) {
+			const snapshot = autonomousShutdownSnapshot();
+			debugLog("autoLoop", {
+				phase: "exit",
+				reason: "shutdown-requested",
+				...snapshot,
+			});
+			await deps.pauseAuto(ctx, pi, {
+				reason: "shutdown-requested",
+				shutdown: snapshot,
+			});
+			break;
+		}
 		const toolCallCount = getTotalToolCallCount();
 		if (toolCallCount !== lastObservedToolCallCount) {
 			lastObservedToolCallCount = toolCallCount;
diff --git a/src/resources/extensions/sf/auto/shutdown-signal.js b/src/resources/extensions/sf/auto/shutdown-signal.js
new file mode 100644
index 000000000..1a64ef536
--- /dev/null
+++ b/src/resources/extensions/sf/auto/shutdown-signal.js
@@ -0,0 +1,148 @@
+/**
+ * auto/shutdown-signal.js — cooperative shutdown signal for autonomous mode.
+ *
+ * Purpose: let the autonomous loop stop at iteration boundaries when the
+ * process is draining, instead of starting new unit work after
+ * SIGTERM/SIGINT/SIGHUP.
+ *
+ * Consumer: auto/loop.js before dispatching the next autonomous unit.
+ */
+
+let registered = false;
+let requested = false;
+let signalName = null;
+let requestedAt = null;
+let forceExitTimer = null;
+
+const SHUTDOWN_SIGNALS = ["SIGTERM", "SIGINT", "SIGHUP"];
+
+// Forced-exit codes follow the shell convention of 128 + signal number; any
+// other trigger (for example a "manual" request) exits 0.
+const FORCED_EXIT_CODES = new Map([
+	["SIGHUP", 129],
+	["SIGINT", 130],
+	["SIGTERM", 143],
+]);
+
+const DEFAULT_GRACE_MS = 600_000;
+
+// Node coerces timer delays above 2^31 - 1 ms down to 1 ms, which would turn an
+// over-long grace into an immediate forced exit, so the grace is capped here.
+const MAX_TIMER_DELAY_MS = 2_147_483_647;
+
+/**
+ * Register process signal listeners for autonomous-loop cooperative shutdown.
+ *
+ * Purpose: preserve resumable state by giving the loop one bounded chance to
+ * pause cleanly before container or supervisor shutdown completes.
+ *
+ * Consumer: autoLoop startup.
+ */
+export function registerAutonomousShutdownSignals() {
+	if (registered) return;
+	registered = true;
+	for (const signal of SHUTDOWN_SIGNALS) {
+		process.on(signal, () => requestAutonomousShutdown(signal));
+	}
+}
+
+/**
+ * Mark autonomous mode as shutdown-requested.
+ *
+ * Purpose: expose a testable flag that the loop can poll at safe boundaries
+ * without coupling to Node signal delivery. The first call also arms the
+ * force-exit timer; later calls are no-ops.
+ *
+ * Consumer: signal listeners and tests.
+ */
+export function requestAutonomousShutdown(signal = "manual") {
+	if (requested) return;
+	requested = true;
+	signalName = signal;
+	requestedAt = Date.now();
+	armForceExitTimer(signal);
+}
+
+/**
+ * Return true when autonomous mode should stop starting new work.
+ *
+ * Purpose: keep shutdown checks readable at loop boundaries.
+ *
+ * Consumer: autoLoop.
+ */
+export function isAutonomousShutdownRequested() {
+	return requested;
+}
+
+/**
+ * Return a compact shutdown snapshot for logs and pause reasons.
+ *
+ * Purpose: make shutdown exits diagnosable without reading process state.
+ *
+ * Consumer: autoLoop debug logging.
+ */
+export function autonomousShutdownSnapshot(now = Date.now()) {
+	return {
+		requested,
+		signal: signalName,
+		requestedAt:
+			requestedAt === null ? null : new Date(requestedAt).toISOString(),
+		elapsedMs: requestedAt === null ? null : Math.max(0, now - requestedAt),
+	};
+}
+
+/**
+ * Reset module state for tests.
+ *
+ * Purpose: isolate shutdown-signal unit tests from each other. Cancels a
+ * pending force-exit timer; `registered` is left alone because the process
+ * listeners stay attached.
+ *
+ * Consumer: tests only.
+ */
+export function _resetAutonomousShutdownForTests() {
+	requested = false;
+	signalName = null;
+	requestedAt = null;
+	if (forceExitTimer) clearTimeout(forceExitTimer);
+	forceExitTimer = null;
+}
+
+/**
+ * Arm the one-shot force-exit safety timer.
+ *
+ * Purpose: bound how long a hung iteration can hold the process open after a
+ * shutdown request.
+ *
+ * Consumer: requestAutonomousShutdown.
+ */
+function armForceExitTimer(signal) {
+	if (forceExitTimer) return;
+	forceExitTimer = setTimeout(() => {
+		process.exit(FORCED_EXIT_CODES.get(signal) ?? 0);
+	}, resolveGraceMs());
+}
+
+/**
+ * Resolve the force-exit grace period in milliseconds.
+ *
+ * Purpose: use the first of SF_AUTONOMOUS_SHUTDOWN_GRACE_MS and
+ * SF_RPC_SHUTDOWN_GRACE_MS that parses to a positive integer, so an empty or
+ * malformed override falls through instead of masking the next source.
+ * Defaults to 10 minutes and is capped at MAX_TIMER_DELAY_MS.
+ *
+ * Consumer: armForceExitTimer.
+ */
+function resolveGraceMs() {
+	const candidates = [
+		process.env.SF_AUTONOMOUS_SHUTDOWN_GRACE_MS,
+		process.env.SF_RPC_SHUTDOWN_GRACE_MS,
+	];
+	for (const raw of candidates) {
+		const parsed = Number.parseInt(raw ?? "", 10);
+		if (Number.isFinite(parsed) && parsed > 0) {
+			return Math.min(parsed, MAX_TIMER_DELAY_MS);
+		}
+	}
+	return DEFAULT_GRACE_MS;
+}
diff --git a/src/tests/web-shutdown-state.test.ts b/src/tests/web-shutdown-state.test.ts
index cef461225..bcc3e3470 100644
--- a/src/tests/web-shutdown-state.test.ts
+++ b/src/tests/web-shutdown-state.test.ts
@@ -14,6 +14,7 @@ import {
 	markShuttingDown,
 	shutdownStateSnapshot,
 } from "../web/shutdown-state.ts";
+import { GET as readyGET } from "../../web/app/api/ready/route.ts";
 
 beforeEach(() => {
 	_resetShutdownStateForTests();
@@ -53,4 +54,13 @@ describe("shutdown-state", () => {
 		const snap = shutdownStateSnapshot();
 		expect(snap.signal).toBeNull();
 	});
+
+	it("ready route returns 503 while draining", async () => {
+		markShuttingDown("manual");
+		const response = await readyGET();
+		const payload = await response.json();
+		expect(response.status).toBe(503);
+		expect(payload.ready).toBe(false);
+		expect(payload.checks.shutdown).toBe("draining");
+	});
 });
diff --git a/web/instrumentation.ts b/web/instrumentation.ts
new file mode 100644
index 000000000..066813166
--- /dev/null
+++ b/web/instrumentation.ts
@@ -0,0 +1,64 @@
+/**
+ * instrumentation.ts — Next.js server-boot hook.
+ *
+ * Purpose: install one SIGTERM/SIGINT/SIGHUP handler at server startup that
+ *   1. marks the shutdown flag (so /api/healthz immediately returns 503 and
+ *      Traefik/k8s readinessProbe drains traffic away)
+ *   2. schedules process.exit() after a bounded grace so any HTTP requests
+ *      already in flight have time to finish
+ *
+ * Without this hook, Node's default disposition for SIGTERM is "terminate
+ * immediately" — every open HTTP request is torn down with ECONNRESET and
+ * SSE streams die mid-event. With this hook + the healthz-503 +
+ * shutdown-state.ts trio, an upgrade looks like:
+ *
+ *   SIGTERM → flag flips → /healthz=503 → LB stops sending new traffic
+ *           → existing requests have up to SF_WEB_SHUTDOWN_GRACE_MS to
+ *             finish → process.exit(0)
+ *
+ * Distinct from rpc-mode's SF_RPC_SHUTDOWN_GRACE_MS (10 min) which is for
+ * queue-drain durability.
Web-request grace is shorter (30s default)
+ * because HTTP requests finish quickly; 10 min would hold the container
+ * far longer than necessary.
+ *
+ * Consumer: Next.js calls `register()` once per server process at boot.
+ */
+
+export async function register(): Promise<void> {
+	// Edge runtime doesn't have process.exit/SIGTERM in the same shape;
+	// only register on the Node runtime.
+	if (process.env.NEXT_RUNTIME !== "nodejs") return;
+
+	// Single-install guard via globalThis. Survives module-graph splits
+	// (jiti, Next's bundle layers, etc.) so we never end up with N timers
+	// racing each other.
+	const installedKey = Symbol.for("singularity-forge:web:shutdown-installed");
+	const g = globalThis as Record<symbol, unknown>;
+	if (g[installedKey] === true) return;
+	g[installedKey] = true;
+
+	const { markShuttingDown } = await import("../src/web/shutdown-state.ts");
+	// Unset, empty, non-numeric or non-positive values fall back to 30s; capped at Node's maximum timer delay.
+	const rawGraceMs = Number(process.env.SF_WEB_SHUTDOWN_GRACE_MS);
+	const graceMs = rawGraceMs > 0 ? Math.min(rawGraceMs, 2_147_483_647) : 30_000;
+	let scheduled = false;
+
+	const handler = (signal: NodeJS.Signals): void => {
+		markShuttingDown(signal);
+		if (scheduled) return;
+		scheduled = true;
+		process.stderr.write(
+			`[web] ${signal} received; healthz now 503, exiting in ${graceMs}ms (SF_WEB_SHUTDOWN_GRACE_MS)\n`,
+		);
+		// NOT unref'd — this timer must keep the process alive during the
+		// grace window so in-flight HTTP requests have a chance to finish.
+		// Without the keep-alive, Node may exit early once all other
+		// handles are inactive, defeating the drain.
+		setTimeout(() => {
+			process.stderr.write(`[web] grace exhausted; exiting\n`);
+			process.exit(0);
+		}, graceMs);
+	};
+
+	for (const signal of ["SIGTERM", "SIGINT", "SIGHUP"] as const) process.on(signal, handler);
+}