singularity-forge/scripts/sf-meta-supervisor.mjs
Mikael Hugo d5664f7142 meta-supervisor (node daemon) + R091 triage gate + R091-R094 spec
- scripts/sf-meta-supervisor.mjs: pure-node daemon supervising
  scripts/sf-autonomous-watchdog.sh. Tick=60s, restarts watchdog if dead,
  emits .sf/meta-status.json, halt via .sf/meta-supervisor.halt. Uses
  only node builtins (no SF dist deps) so it survives dist breakage.
- src/headless.ts: R091 — gate the per-cycle handleTriage call on a time
  interval (SF_TRIAGE_INTERVAL_MS, default 30 min) and bump batch size
  (SF_TRIAGE_MAX, default 25, was 5). Drops the ~8min triage hit from
  every cycle while letting daily drain capacity rise.
- .sf/REQUIREMENTS.md: R091 (triage sidecar) + R092 (PDD-completeness
  as routing signal) + R093 (pin model per orchestration agent.yaml) +
  R094 (swarm-role model tier specialization — 8 roles already exist
  in uok/swarm-roles.js; model field per role missing).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-17 14:08:30 +02:00

220 lines
6.4 KiB
JavaScript
Executable file

#!/usr/bin/env -S node
/**
* sf-meta-supervisor.mjs — supervise the SF watchdog itself.
*
* Layers:
* 1. sf headless autonomous (single-cycle worker)
* 2. scripts/sf-autonomous-watchdog.sh (restarts SF cycle on exit)
* 3. scripts/sf-meta-supervisor.mjs (this — restarts watchdog if it dies + status)
*
* Pure node — uses only built-in modules (child_process, fs, path, node:sqlite,
* timers/promises, url). Does NOT import from SF's dist/, so a broken SF build
* can still be observed and recovered.
*
* Run as: nohup node scripts/sf-meta-supervisor.mjs > .sf/meta-supervisor.log 2>&1 &
* Stop (clean): touch .sf/meta-supervisor.halt
* Stop (hard): pkill -f sf-meta-supervisor.mjs
* Status: cat .sf/meta-status.json
*/
import { spawn } from "node:child_process";
import { appendFileSync, existsSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
import { dirname, join, resolve } from "node:path";
import { DatabaseSync } from "node:sqlite";
import { setTimeout as sleep } from "node:timers/promises";
import { fileURLToPath } from "node:url";
const __dirname = dirname(fileURLToPath(import.meta.url));
// This script lives in scripts/, so the repository root is one level up.
const REPO_ROOT = resolve(__dirname, "..");
// Every relative path below (.sf/..., scripts/...) is resolved against the repo root.
process.chdir(REPO_ROOT);
const STATUS_FILE = ".sf/meta-status.json";
const HALT_FILE = ".sf/meta-supervisor.halt";
const PID_FILE = ".sf/meta-supervisor.pid";
const LOG_FILE = ".sf/meta-supervisor.log";
const WATCHDOG_LOG = ".sf/watchdog.log";
// Substring handed to `pgrep -f` to find a running watchdog.
const WATCHDOG_PATTERN = "sf-autonomous-watchdog.sh";
const WATCHDOG_SCRIPT = "scripts/sf-autonomous-watchdog.sh";
const DB_PATH = ".sf/sf.db";
const DEFAULT_TICK_S = 60;

/**
 * Parse the supervisor tick interval (in seconds) from an environment value.
 *
 * Anything that is not a finite number greater than zero — unset, empty,
 * non-numeric, zero, negative, Infinity — falls back to the 60 s default.
 * Without this guard a bad value would reach `sleep(NaN)` / `sleep(0)` and
 * turn the supervisor loop into a near-busy loop.
 *
 * @param {string | undefined} raw - Raw value of SF_META_TICK_S.
 * @returns {number} Tick interval in seconds, always finite and > 0.
 */
function parseTickSeconds(raw) {
  const seconds = Number(raw);
  return Number.isFinite(seconds) && seconds > 0 ? seconds : DEFAULT_TICK_S;
}

const TICK_S = parseTickSeconds(process.env.SF_META_TICK_S);
/**
 * Write one timestamped line to stdout.
 *
 * The line has the form `[<ISO-8601 UTC timestamp>] <msg>` followed by a
 * newline. Per the run instructions in the file header, stdout is expected to
 * be redirected into the supervisor's log file.
 *
 * @param {string} msg - Text to log.
 */
function logLine(msg) {
  const stamp = new Date().toISOString();
  process.stdout.write(`[${stamp}] ${msg}\n`);
}
/**
 * Look up a running process by command-line pattern using `pgrep -f`.
 *
 * @param {string} pattern - Pattern passed to `pgrep -f`.
 * @returns {Promise<number | null>} The first pid that pgrep printed, or null
 *   when nothing was printed or pgrep itself could not be started.
 */
async function pgrepPid(pattern) {
  return new Promise((done) => {
    const proc = spawn("pgrep", ["-f", pattern], {
      stdio: ["ignore", "pipe", "ignore"],
    });
    const pieces = [];
    proc.stdout.on("data", (chunk) => {
      pieces.push(String(chunk));
    });
    // A spawn failure (e.g. pgrep not installed) is reported as "not found".
    proc.on("error", () => done(null));
    proc.on("close", () => {
      const firstPid = pieces
        .join("")
        .split("\n")
        .map((entry) => entry.trim())
        .find((entry) => entry !== "");
      done(firstPid ? Number(firstPid) : null);
    });
  });
}
/**
 * Recover the pid announced by the most recent "watchdog start, pid=<n>" line
 * in the watchdog log.
 *
 * @returns {number | null} The pid from the last matching line, or null when
 *   the log cannot be read or contains no such line.
 */
function readLatestWatchdogPidFromLog() {
  let text;
  try {
    text = readFileSync(WATCHDOG_LOG, "utf8");
  } catch {
    // Missing or unreadable log — there is simply no previous pid to report.
    return null;
  }
  const starts = [...text.matchAll(/watchdog start, pid=(\d+)/g)];
  const newest = starts.at(-1);
  return newest ? Number(newest[1]) : null;
}
/**
 * Read the most recent `completed_at` value among completed tasks and slices.
 *
 * Opens the SF database read-only, takes MAX(completed_at) from `tasks` and
 * from `slices` (rows with status='complete'), and returns the greater of the
 * two. The comparison uses the default array sort, i.e. string ordering —
 * this assumes the stored values sort chronologically as text (e.g. ISO-8601);
 * TODO confirm against the schema.
 *
 * Any failure (database missing, table missing, query error) yields null. The
 * handle is closed in `finally` so a failing query cannot leak it; this
 * function is called on every supervisor tick.
 *
 * @returns {string | null} Latest completion marker, or null if unavailable.
 */
function readLatestCompletionAt() {
  let db;
  try {
    db = new DatabaseSync(DB_PATH, { readOnly: true });
    const taskRow = db
      .prepare("SELECT MAX(completed_at) m FROM tasks WHERE status='complete'")
      .get();
    const sliceRow = db
      .prepare("SELECT MAX(completed_at) m FROM slices WHERE status='complete'")
      .get();
    const candidates = [taskRow?.m, sliceRow?.m].filter(Boolean).sort();
    return candidates.at(-1) ?? null;
  } catch {
    // Observation only: an unreadable database must never stop the supervisor.
    return null;
  } finally {
    try {
      db?.close();
    } catch {
      // Handle was never usable or is already closed — nothing left to release.
    }
  }
}
/**
 * Start a fresh watchdog as a detached background process.
 *
 * Runs `bash <WATCHDOG_SCRIPT>` from the repository root with all stdio
 * ignored, then unrefs the child so it does not keep this process's event
 * loop alive. The function does not wait for the watchdog or report whether
 * it came up; the caller re-checks with pgrep afterwards.
 *
 * A spawn failure is logged instead of thrown: a ChildProcess without an
 * 'error' listener raises the failure as an uncaught exception, which would
 * take the supervisor down exactly when it is trying to recover.
 */
function relaunchWatchdog() {
  const child = spawn("bash", [WATCHDOG_SCRIPT], {
    detached: true,
    stdio: ["ignore", "ignore", "ignore"],
    cwd: REPO_ROOT,
  });
  child.on("error", (err) => {
    logLine(`watchdog relaunch failed: ${err?.message ?? String(err)}`);
  });
  child.unref();
}
/**
 * Append one timestamped line to the watchdog's own log file.
 *
 * Best-effort: any failure while formatting or writing is ignored so that a
 * logging problem can never interrupt the supervisor.
 *
 * @param {string} msg - Text to record.
 */
function appendWatchdogLog(msg) {
  try {
    const stamp = new Date().toISOString();
    const entry = `[${stamp}] ${msg}\n`;
    appendFileSync(WATCHDOG_LOG, entry);
  } catch {
    // Intentionally ignored — see the best-effort note above.
  }
}
/**
 * Persist the supervisor status snapshot as pretty-printed JSON (2-space
 * indent) to the status file, replacing any previous content.
 *
 * Best-effort: serialization or write failures are ignored so that status
 * reporting can never stop the supervisor loop.
 *
 * @param {object} payload - JSON-serializable status object.
 */
function writeStatus(payload) {
  try {
    const json = JSON.stringify(payload, null, 2);
    writeFileSync(STATUS_FILE, json);
  } catch {
    // Intentionally ignored — see the best-effort note above.
  }
}
/**
 * Report whether a process with the given pid currently exists.
 *
 * Signal 0 performs only the existence/permission check. EPERM means the
 * process exists but we may not signal it, so it still counts as alive; any
 * other failure (typically ESRCH) means there is no such process.
 *
 * @param {number} pid - Process id to probe.
 * @returns {boolean} True when the process exists.
 */
function isProcessAlive(pid) {
  try {
    process.kill(pid, 0);
    return true;
  } catch (err) {
    return err?.code === "EPERM";
  }
}

/** Best-effort removal of this supervisor's PID file on the way out. */
function removePidFile() {
  try {
    unlinkSync(PID_FILE);
  } catch {
    // Already gone or not removable — nothing useful left to do while exiting.
  }
}

/**
 * Supervisor entry point: enforce single-instance, then loop forever.
 *
 * Startup:
 *   - Exits with code 1 if the PID file names another live process.
 *   - Otherwise (no file, stale pid, unparsable content) writes its own pid.
 *   - Removes a leftover halt file so a previous stop request does not end
 *     this session immediately.
 *   - SIGTERM / SIGINT / SIGHUP remove the PID file and exit 0.
 *
 * Each tick:
 *   - If the halt file exists: write a final status ("halted-by-file"),
 *     remove the PID file, exit 0.
 *   - If pgrep finds no watchdog: log the restart (own log and watchdog log),
 *     relaunch it, wait 2 s, and note whether it is now visible.
 *   - Otherwise compare the latest completion marker from the database with
 *     the previous one to maintain the "ticks without a new completion"
 *     streak. A null reading (no data / DB unreadable) leaves the streak as is.
 *   - Write the status file and sleep TICK_S seconds.
 *
 * @returns {Promise<void>} Never resolves in normal operation; the process
 *   ends through process.exit.
 */
async function main() {
  // Single-instance guard via PID file.
  if (existsSync(PID_FILE)) {
    const existing = Number(readFileSync(PID_FILE, "utf8").trim());
    // Only probe sane pids: 0/negative values address process groups, and
    // NaN/fractions are garbage — all of those are treated as a stale file.
    const isForeignPid =
      Number.isInteger(existing) && existing > 0 && existing !== process.pid;
    if (isForeignPid && isProcessAlive(existing)) {
      logLine(`refusing to start — meta-supervisor already running pid=${existing}`);
      process.exit(1);
    }
    // Stale pid file — overwrite.
  }
  writeFileSync(PID_FILE, String(process.pid));

  // Clear stale halt file from a prior session so we don't immediately exit.
  if (existsSync(HALT_FILE)) {
    try {
      unlinkSync(HALT_FILE);
    } catch {
      // If it cannot be removed the first tick will simply honour it.
    }
  }

  const sessionStartIso = new Date().toISOString();
  logLine(`meta-supervisor start, pid=${process.pid}, tick=${TICK_S}s`);

  let watchdogRestarts = 0;
  let lastCompletionIso = readLatestCompletionAt() || "unknown";
  let noCompletionStreak = 0;

  // Single place that shapes the status document, so the halt snapshot and
  // the per-tick snapshot cannot drift apart.
  const snapshot = (watchdog, lastNote) => ({
    schemaVersion: 1,
    updatedAt: new Date().toISOString(),
    sessionStartAt: sessionStartIso,
    metaSupervisorPid: process.pid,
    tickSeconds: TICK_S,
    watchdog: { ...watchdog, restartsThisSession: watchdogRestarts },
    drift: {
      cyclesWithNoUnitCompletion: noCompletionStreak,
      lastCompletionAt: lastCompletionIso,
    },
    lastNote,
  });

  for (const signal of ["SIGTERM", "SIGINT", "SIGHUP"]) {
    process.on(signal, () => {
      logLine(`signal ${signal} — exiting cleanly`);
      removePidFile();
      process.exit(0);
    });
  }

  while (true) {
    if (existsSync(HALT_FILE)) {
      logLine("halt file detected — exiting cleanly");
      // The halt snapshot reports the watchdog as not alive without probing it.
      writeStatus(snapshot({ alive: false, pid: null }, "halted-by-file"));
      removePidFile();
      process.exit(0);
    }

    let watchdogPid = await pgrepPid(WATCHDOG_PATTERN);
    let note = "ok";
    if (!watchdogPid) {
      const prevPid = readLatestWatchdogPidFromLog() ?? "unknown";
      const msg = `WATCHDOG-RESTART by meta-supervisor: previous pid=${prevPid} vanished. Restarting.`;
      logLine(msg);
      appendWatchdogLog(msg);
      relaunchWatchdog();
      watchdogRestarts += 1;
      // Give the new watchdog a moment to appear before probing for it.
      await sleep(2000);
      watchdogPid = await pgrepPid(WATCHDOG_PATTERN);
      note = watchdogPid ? "restarted-watchdog" : "restart-failed";
    } else {
      const latest = readLatestCompletionAt();
      if (latest) {
        if (latest === lastCompletionIso) {
          noCompletionStreak += 1;
        } else {
          noCompletionStreak = 0;
          lastCompletionIso = latest;
        }
      }
    }

    // Reuse the pid from the check above instead of spawning pgrep again.
    writeStatus(
      snapshot({ alive: Boolean(watchdogPid), pid: watchdogPid ?? null }, note),
    );
    await sleep(TICK_S * 1000);
  }
}
/**
 * Last-resort handler for a rejection escaping main(): log the failure,
 * remove the PID file (best-effort), and exit with status 1.
 *
 * @param {Error} err - The rejection reason.
 */
function reportFatalAndExit(err) {
  const detail = err.stack || err.message || String(err);
  logLine(`fatal: ${detail}`);
  try {
    unlinkSync(PID_FILE);
  } catch {
    // PID file already gone or not removable — exit regardless.
  }
  process.exit(1);
}

main().catch(reportFatalAndExit);