feat(upgrade): drain HTTP requests + autonomous-loop SIGTERM awareness
Closes the two upgrade-safety gaps that Codex flagged in the previous
review round:
1. Next.js HTTP request drain — web/instrumentation.ts.
Next.js calls `register()` once at server boot. Installs one
SIGTERM/SIGINT/SIGHUP listener that:
- marks shutdown-state.ts (so /api/healthz returns 503 immediately
— LB/Traefik readinessProbe drains traffic away within ~4s)
- schedules process.exit after SF_WEB_SHUTDOWN_GRACE_MS (default
30s) — in-flight HTTP requests have time to finish; timer is
NOT unref'd so it keeps the process alive during the drain
Single-install guard via globalThis Symbol so jiti/bundle splits
don't end up with multiple racing timers.
2. Autonomous loop iteration-boundary shutdown awareness —
src/resources/extensions/sf/auto/shutdown-signal.js +
src/resources/extensions/sf/auto/loop.js iteration check.
Before: a SIGTERM mid-iteration killed the loop process before
the current unit's tool calls + DB writes could complete cleanly.
After: shutdown-signal flips a flag on the first SIGTERM, SIGINT, or
SIGHUP; the loop polls it at the top of each `while (s.active)`
iteration, so the current unit finishes, the loop pauses and exits
gracefully, and the existing forceShutdown path takes over to drain
the sf_feedback queue and exit.
Includes a force-exit safety timer (SF_AUTONOMOUS_SHUTDOWN_GRACE_MS
or SF_RPC_SHUTDOWN_GRACE_MS, default 10 min) so a hung iteration
doesn't block exit indefinitely.
Test coverage:
- web-shutdown-state.test.ts extended: 6/6 (added ready-route
503-during-drain assertion).
- shutdown-signal: covered indirectly by loop dispatch tests; a
standalone unit test for register/request/snapshot is a small
follow-up.
Net of today's work, the upgrade safety chain for SF on Vega (Layer-1,
Tailscale Serve only) is operationally complete. Layer-2 (cluster
Traefik ingress with weighted blue/green) plugs in via the same
healthz-503 + recovery primitives — no further SF source changes
needed for that path.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
40c6148d7e
commit
c0358a2fc7
4 changed files with 201 additions and 0 deletions
|
|
@ -57,6 +57,11 @@ import {
|
|||
runUnitPhase,
|
||||
} from "./phases.js";
|
||||
import { _clearCurrentResolve } from "./resolve.js";
|
||||
import {
|
||||
autonomousShutdownSnapshot,
|
||||
isAutonomousShutdownRequested,
|
||||
registerAutonomousShutdownSignals,
|
||||
} from "./shutdown-signal.js";
|
||||
import { MAX_LOOP_ITERATIONS } from "./types.js";
|
||||
|
||||
// ── Stuck detection persistence (#3704) ──────────────────────────────────
|
||||
|
|
@ -711,6 +716,7 @@ async function runExitSolverEval(ctx, s, deps, iteration) {
|
|||
*/
|
||||
export async function autoLoop(ctx, pi, s, deps) {
|
||||
debugLog("autoLoop", { phase: "enter" });
|
||||
registerAutonomousShutdownSignals();
|
||||
const runawayHeal = clearRunawayRecoveredRuntimeRecords(s.basePath);
|
||||
if (runawayHeal > 0) {
|
||||
debugLog("autoLoop", {
|
||||
|
|
@ -767,6 +773,19 @@ export async function autoLoop(ctx, pi, s, deps) {
|
|||
let lastObservedToolCallCount = getTotalToolCallCount();
|
||||
let unproductiveIterations = 0;
|
||||
while (s.active) {
|
||||
if (isAutonomousShutdownRequested()) {
|
||||
const snapshot = autonomousShutdownSnapshot();
|
||||
debugLog("autoLoop", {
|
||||
phase: "exit",
|
||||
reason: "shutdown-requested",
|
||||
...snapshot,
|
||||
});
|
||||
await deps.pauseAuto(ctx, pi, {
|
||||
reason: "shutdown-requested",
|
||||
shutdown: snapshot,
|
||||
});
|
||||
break;
|
||||
}
|
||||
const toolCallCount = getTotalToolCallCount();
|
||||
if (toolCallCount !== lastObservedToolCallCount) {
|
||||
lastObservedToolCallCount = toolCallCount;
|
||||
|
|
|
|||
108
src/resources/extensions/sf/auto/shutdown-signal.js
Normal file
108
src/resources/extensions/sf/auto/shutdown-signal.js
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
/**
|
||||
* auto/shutdown-signal.js — cooperative shutdown signal for autonomous mode.
|
||||
*
|
||||
* Purpose: let the autonomous loop stop at iteration boundaries when the
|
||||
* process is draining, instead of starting new unit work after SIGTERM/SIGINT.
|
||||
*
|
||||
* Consumer: auto/loop.js before dispatching the next autonomous unit.
|
||||
*/
|
||||
|
||||
// Process signals that trigger a cooperative autonomous-loop shutdown.
const SHUTDOWN_SIGNALS = ["SIGTERM", "SIGINT", "SIGHUP"];

// True once the process listeners have been attached; guards against
// registering the same listeners twice.
let registered = false;

// Shutdown-request state. All three are written together by the first
// shutdown request: the flag, the name of the triggering signal (or
// "manual"), and the epoch-millisecond timestamp of the request.
let requested = false;
let signalName = null;
let requestedAt = null;

// Handle of the pending force-exit timeout, or null when none is armed.
let forceExitTimer = null;
|
||||
|
||||
/**
 * Attach the cooperative-shutdown listeners to the current process.
 *
 * Purpose: route every signal in SHUTDOWN_SIGNALS into
 * requestAutonomousShutdown so the autonomous loop can observe the request
 * and pause at a safe boundary instead of being torn down mid-unit.
 * Idempotent: every call after the first is a no-op.
 *
 * Consumer: autoLoop startup.
 */
export function registerAutonomousShutdownSignals() {
  if (!registered) {
    registered = true;
    SHUTDOWN_SIGNALS.forEach((name) => {
      // Forward only the signal name; any listener arguments are ignored.
      process.on(name, () => requestAutonomousShutdown(name));
    });
  }
}
|
||||
|
||||
/**
 * Record that autonomous mode has been asked to shut down.
 *
 * Purpose: give the loop a plain flag to poll at safe boundaries, decoupled
 * from Node signal delivery so tests can trigger it directly. Only the first
 * request is recorded; later calls leave the stored signal and timestamp
 * untouched. The first request also arms the force-exit timer.
 *
 * Consumer: signal listeners and tests.
 *
 * @param signal Name of the triggering signal; defaults to "manual".
 */
export function requestAutonomousShutdown(signal = "manual") {
  if (!requested) {
    requestedAt = Date.now();
    signalName = signal;
    requested = true;
    armForceExitTimer(signal);
  }
}
|
||||
|
||||
/**
 * Report whether a shutdown has been requested.
 *
 * Purpose: a readable predicate for the loop's iteration-boundary check, so
 * the loop stops starting new work once a shutdown is pending.
 *
 * Consumer: autoLoop.
 *
 * @returns true after the first shutdown request, false otherwise.
 */
export function isAutonomousShutdownRequested() {
  const shutdownPending = requested;
  return shutdownPending;
}
|
||||
|
||||
/**
 * Build a small, serializable description of the shutdown state.
 *
 * Purpose: let logs and pause reasons explain a shutdown exit without the
 * reader having to inspect process state.
 *
 * Consumer: autoLoop debug logging and pause payloads.
 *
 * @param now Reference time in epoch milliseconds; defaults to Date.now().
 * @returns `{ requested, signal, requestedAt, elapsedMs }`, where
 *   `requestedAt` is an ISO-8601 string and `elapsedMs` is clamped to be
 *   non-negative. Both are null when no shutdown has been requested.
 */
export function autonomousShutdownSnapshot(now = Date.now()) {
  const hasRequest = requestedAt !== null;
  const requestedAtIso = hasRequest
    ? new Date(requestedAt).toISOString()
    : null;
  // Clamp at zero so a caller-supplied `now` earlier than the request
  // never yields a negative duration.
  const elapsedMs = hasRequest ? Math.max(0, now - requestedAt) : null;
  return {
    requested,
    signal: signalName,
    requestedAt: requestedAtIso,
    elapsedMs,
  };
}
|
||||
|
||||
/**
 * Restore the shutdown-request state to its initial values.
 *
 * Purpose: keep shutdown-signal unit tests independent of each other by
 * cancelling any pending force-exit timer and clearing the request flag,
 * signal name, and timestamp. The `registered` guard is left untouched.
 *
 * Consumer: tests only.
 */
export function _resetAutonomousShutdownForTests() {
  // Cancel the pending force-exit first so it cannot fire after the reset.
  if (forceExitTimer) {
    clearTimeout(forceExitTimer);
  }
  forceExitTimer = null;
  requestedAt = null;
  signalName = null;
  requested = false;
}
|
||||
|
||||
/**
 * Arm the one-shot force-exit timer for a shutdown request.
 *
 * Purpose: bound how long a hung iteration can delay process exit. If the
 * grace period elapses, the process exits with the conventional
 * 128 + signal-number code for the triggering signal (SIGHUP 129,
 * SIGINT 130, SIGTERM 143) so a supervisor can tell a forced exit from a
 * clean one. Any other trigger (e.g. "manual") exits with 0.
 *
 * No-op when a timer is already armed.
 *
 * @param signal Name of the signal (or "manual") that requested shutdown.
 */
function armForceExitTimer(signal) {
  if (forceExitTimer) return;
  // A Map avoids accidental hits on Object.prototype keys for odd names.
  const signalExitCodes = new Map([
    ["SIGHUP", 129],
    ["SIGINT", 130],
    ["SIGTERM", 143],
  ]);
  const exitCode = signalExitCodes.get(signal) ?? 0;
  const graceMs = resolveGraceMs();
  forceExitTimer = setTimeout(() => {
    process.exit(exitCode);
  }, graceMs);
}
|
||||
|
||||
/**
 * Resolve the force-exit grace period in milliseconds.
 *
 * Reads SF_AUTONOMOUS_SHUTDOWN_GRACE_MS first, then
 * SF_RPC_SHUTDOWN_GRACE_MS, and uses the first one that parses to a
 * positive integer. An empty or invalid value in the first variable no
 * longer hides a valid value in the second. Falls back to 10 minutes when
 * neither is usable.
 *
 * The result is clamped to the largest delay setTimeout accepts
 * (2^31 - 1 ms). Node coerces larger delays to 1 ms, which would turn a
 * "very long grace" setting into an immediate force-exit.
 *
 * @returns Grace period in milliseconds, in the range 1..2147483647.
 */
function resolveGraceMs() {
  const DEFAULT_GRACE_MS = 600_000;
  const MAX_TIMER_DELAY_MS = 2_147_483_647;
  const candidates = [
    process.env.SF_AUTONOMOUS_SHUTDOWN_GRACE_MS,
    process.env.SF_RPC_SHUTDOWN_GRACE_MS,
  ];
  for (const raw of candidates) {
    const parsed = Number.parseInt(raw ?? "", 10);
    if (Number.isFinite(parsed) && parsed > 0) {
      return Math.min(parsed, MAX_TIMER_DELAY_MS);
    }
  }
  return DEFAULT_GRACE_MS;
}
|
||||
|
|
@ -14,6 +14,7 @@ import {
|
|||
markShuttingDown,
|
||||
shutdownStateSnapshot,
|
||||
} from "../web/shutdown-state.ts";
|
||||
import { GET as readyGET } from "../../web/app/api/ready/route.ts";
|
||||
|
||||
beforeEach(() => {
|
||||
_resetShutdownStateForTests();
|
||||
|
|
@ -53,4 +54,13 @@ describe("shutdown-state", () => {
|
|||
const snap = shutdownStateSnapshot();
|
||||
expect(snap.signal).toBeNull();
|
||||
});
|
||||
|
||||
it("ready route returns 503 while draining", async () => {
|
||||
markShuttingDown("manual");
|
||||
const response = await readyGET();
|
||||
const payload = await response.json();
|
||||
expect(response.status).toBe(503);
|
||||
expect(payload.ready).toBe(false);
|
||||
expect(payload.checks.shutdown).toBe("draining");
|
||||
});
|
||||
});
|
||||
|
|
|
|||
64
web/instrumentation.ts
Normal file
64
web/instrumentation.ts
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
/**
|
||||
* instrumentation.ts — Next.js server-boot hook.
|
||||
*
|
||||
* Purpose: install one SIGTERM/SIGINT/SIGHUP handler at server startup that
|
||||
* 1. marks the shutdown flag (so /api/healthz immediately returns 503 and
|
||||
* Traefik/k8s readinessProbe drains traffic away)
|
||||
* 2. schedules process.exit() after a bounded grace so any HTTP requests
|
||||
* already in flight have time to finish
|
||||
*
|
||||
* Without this hook, Node's default disposition for SIGTERM is "terminate
|
||||
* immediately" — every open HTTP request is torn down with ECONNRESET and
|
||||
* SSE streams die mid-event. With this hook + the healthz-503 +
|
||||
* shutdown-state.ts trio, an upgrade looks like:
|
||||
*
|
||||
* SIGTERM → flag flips → /healthz=503 → LB stops sending new traffic
|
||||
* → existing requests have up to SF_WEB_SHUTDOWN_GRACE_MS to
|
||||
* finish → process.exit(0)
|
||||
*
|
||||
* Distinct from rpc-mode's SF_RPC_SHUTDOWN_GRACE_MS (10 min) which is for
|
||||
* queue-drain durability. Web-request grace is shorter (30s default)
|
||||
* because HTTP requests finish quickly; 10 min would hold the container
|
||||
* far longer than necessary.
|
||||
*
|
||||
* Consumer: Next.js calls `register()` once per server process at boot.
|
||||
*/
|
||||
|
||||
/**
 * Install the web server's drain-on-signal handlers.
 *
 * On the first SIGTERM/SIGINT/SIGHUP the handler marks the shared
 * shutdown state and schedules `process.exit(0)` after the grace period.
 * Repeated signals re-mark the state but never schedule a second timer.
 *
 * The grace period comes from SF_WEB_SHUTDOWN_GRACE_MS (milliseconds,
 * default 30000). Unset, empty, non-numeric, or negative values fall back
 * to the default, and values above the setTimeout maximum (2^31 - 1 ms)
 * are clamped. Otherwise Node would coerce the delay to 1 ms and the
 * process would exit without draining.
 *
 * @returns Resolves once the handlers are installed, or immediately when
 *   not on the Node runtime or when a previous call already installed them.
 */
export async function register(): Promise<void> {
  // Edge runtime doesn't have process.exit/SIGTERM in the same shape;
  // only register on the Node runtime.
  if (process.env.NEXT_RUNTIME !== "nodejs") return;

  // Single-install guard via globalThis. Survives module-graph splits
  // (jiti, Next's bundle layers, etc.) so we never end up with N timers
  // racing each other. Set before the first await so concurrent callers
  // cannot both pass the check.
  const installedKey = Symbol.for("singularity-forge:web:shutdown-installed");
  const g = globalThis as Record<symbol, unknown>;
  if (g[installedKey] === true) return;
  g[installedKey] = true;

  const { markShuttingDown } = await import("../src/web/shutdown-state.ts");

  const defaultGraceMs = 30_000;
  const maxTimerDelayMs = 2_147_483_647;
  const rawGrace = process.env.SF_WEB_SHUTDOWN_GRACE_MS?.trim();
  // Empty/unset maps to NaN so it takes the default rather than Number("") === 0.
  const parsedGrace = rawGrace ? Number(rawGrace) : Number.NaN;
  const graceMs =
    Number.isFinite(parsedGrace) && parsedGrace >= 0
      ? Math.min(parsedGrace, maxTimerDelayMs)
      : defaultGraceMs;
  let scheduled = false;

  const handler = (signal: NodeJS.Signals): void => {
    markShuttingDown(signal);
    if (scheduled) return;
    scheduled = true;
    process.stderr.write(
      `[web] ${signal} received; healthz now 503, exiting in ${graceMs}ms (SF_WEB_SHUTDOWN_GRACE_MS)\n`,
    );
    // NOT unref'd — this timer must keep the process alive during the
    // grace window so in-flight HTTP requests have a chance to finish.
    // Without the keep-alive, Node may exit early once all other
    // handles are inactive, defeating the drain.
    setTimeout(() => {
      process.stderr.write(`[web] grace exhausted; exiting\n`);
      process.exit(0);
    }, graceMs);
  };

  process.on("SIGTERM", handler);
  process.on("SIGINT", handler);
  process.on("SIGHUP", handler);
}
|
||||
Loading…
Add table
Reference in a new issue