- scripts/sf-meta-supervisor.mjs: pure-node daemon supervising scripts/sf-autonomous-watchdog.sh. Tick=60s, restarts watchdog if dead, emits .sf/meta-status.json, halt via .sf/meta-supervisor.halt. Uses only node builtins (no SF dist deps) so it survives dist breakage. - src/headless.ts: R091 — gate the per-cycle handleTriage call on a time interval (SF_TRIAGE_INTERVAL_MS, default 30 min) and bump batch size (SF_TRIAGE_MAX, default 25, was 5). Drops the ~8min triage hit from every cycle while letting daily drain capacity rise. - .sf/REQUIREMENTS.md: R091 (triage sidecar) + R092 (PDD-completeness as routing signal) + R093 (pin model per orchestration agent.yaml) + R094 (swarm-role model tier specialization — 8 roles already exist in uok/swarm-roles.js; model field per role missing). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
220 lines
6.4 KiB
JavaScript
Executable file
220 lines
6.4 KiB
JavaScript
Executable file
#!/usr/bin/env -S node
|
|
/**
 * sf-meta-supervisor.mjs — supervise the SF watchdog itself.
 *
 * Layers:
 *   1. sf headless autonomous              (single-cycle worker)
 *   2. scripts/sf-autonomous-watchdog.sh   (restarts SF cycle on exit)
 *   3. scripts/sf-meta-supervisor.mjs      (this — restarts watchdog if it dies + status)
 *
 * Pure node — uses only built-in modules (child_process, fs, path, node:sqlite,
 * timers/promises, url). Does NOT import from SF's dist/, so a broken SF build can
 * still be observed and recovered.
 *
 * Run as:        nohup node scripts/sf-meta-supervisor.mjs > .sf/meta-supervisor.log 2>&1 &
 * Stop (clean):  touch .sf/meta-supervisor.halt
 * Stop (hard):   pkill -f sf-meta-supervisor.mjs
 * Status:        cat .sf/meta-status.json
 */
|
|
import { spawn } from "node:child_process";
|
|
import { appendFileSync, existsSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
|
|
import { dirname, join, resolve } from "node:path";
|
|
import { DatabaseSync } from "node:sqlite";
|
|
import { setTimeout as sleep } from "node:timers/promises";
|
|
import { fileURLToPath } from "node:url";
|
|
|
|
const __dirname = dirname(fileURLToPath(import.meta.url));
// REPO_ROOT is the parent of the directory containing this file.
const REPO_ROOT = resolve(__dirname, "..");
// Every relative path below is resolved against REPO_ROOT from here on.
process.chdir(REPO_ROOT);

// Files written/read by the supervisor (relative to REPO_ROOT).
const STATUS_FILE = ".sf/meta-status.json";
const HALT_FILE = ".sf/meta-supervisor.halt";
const PID_FILE = ".sf/meta-supervisor.pid";
const LOG_FILE = ".sf/meta-supervisor.log";
const WATCHDOG_LOG = ".sf/watchdog.log";
// Pattern handed to `pgrep -f` to find the watchdog process.
const WATCHDOG_PATTERN = "sf-autonomous-watchdog.sh";
// Script launched via bash when the watchdog is not running.
const WATCHDOG_SCRIPT = "scripts/sf-autonomous-watchdog.sh";
const DB_PATH = ".sf/sf.db";

// Tick interval used when SF_META_TICK_S is unset or unusable.
const DEFAULT_TICK_S = 60;

/**
 * Parse a tick interval (in seconds) from an environment-style value.
 *
 * A non-numeric, zero, negative or non-finite value would turn the main loop's
 * sleep into a near-zero delay and make the supervisor spin, so anything that
 * is not a finite positive number falls back to the default.
 *
 * @param {string|undefined|null} raw - Raw value, typically process.env.SF_META_TICK_S.
 * @param {number} [fallback] - Value returned when `raw` is missing or unusable.
 * @returns {number} Tick interval in seconds (finite, > 0 when `fallback` is).
 */
function parseTickSeconds(raw, fallback = DEFAULT_TICK_S) {
  if (raw === undefined || raw === null || raw === "") return fallback;
  const seconds = Number(raw);
  return Number.isFinite(seconds) && seconds > 0 ? seconds : fallback;
}

const TICK_S = parseTickSeconds(process.env.SF_META_TICK_S);
|
|
|
|
/**
 * Write a single timestamped line to stdout.
 *
 * The line has the form `[<ISO-8601 timestamp>] <msg>` followed by a newline.
 *
 * @param {string} msg - Text to log.
 * @returns {void}
 */
function logLine(msg) {
  const stamp = new Date().toISOString();
  process.stdout.write(`[${stamp}] ${msg}\n`);
}
|
|
|
|
/**
 * Find a running process whose command line matches `pattern`, via `pgrep -f`.
 *
 * @param {string} pattern - Pattern passed to `pgrep -f`.
 * @returns {Promise<number|null>} The first PID printed by pgrep, or null when
 *   pgrep prints nothing, cannot be spawned, or prints something that is not a
 *   positive integer.
 */
function pgrepPid(pattern) {
  return new Promise((resolveP) => {
    const child = spawn("pgrep", ["-f", pattern], { stdio: ["ignore", "pipe", "ignore"] });
    let out = "";
    // Decode as text up front instead of coercing Buffer chunks one at a time.
    child.stdout.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      out += chunk;
    });
    // Spawn failure (e.g. pgrep not installed) is reported as "not found".
    child.on("error", () => resolveP(null));
    child.on("close", () => {
      const firstLine = out
        .split("\n")
        .map((line) => line.trim())
        .find(Boolean);
      const pid = Number(firstLine);
      // Never hand NaN or a nonsensical value to callers that treat the result as a PID.
      resolveP(Number.isInteger(pid) && pid > 0 ? pid : null);
    });
  });
}
|
|
|
|
/**
 * Return the PID from the last `watchdog start, pid=<n>` entry in a log file.
 *
 * @param {string} [logPath] - Log file to scan; defaults to WATCHDOG_LOG.
 * @returns {number|null} The PID from the last matching entry, or null when the
 *   file cannot be read or contains no such entry.
 */
function readLatestWatchdogPidFromLog(logPath = WATCHDOG_LOG) {
  let raw;
  try {
    raw = readFileSync(logPath, "utf8");
  } catch {
    // Missing or unreadable log: there is no previous PID to report.
    return null;
  }
  let latestPid = null;
  // One pass over all start markers; the last one seen wins.
  for (const match of raw.matchAll(/watchdog start, pid=(\d+)/g)) {
    latestPid = Number(match[1]);
  }
  return latestPid;
}
|
|
|
|
/**
 * Read the most recent `completed_at` value among completed tasks and slices.
 *
 * Opens the database at DB_PATH read-only, takes MAX(completed_at) from the
 * `tasks` and `slices` tables (rows with status='complete'), and returns the
 * greater of the two according to the default Array sort order.
 * NOTE(review): that ordering is only chronological if completed_at values
 * sort lexicographically (e.g. uniform ISO-8601 strings) — confirm against schema.
 *
 * @returns {*} The latest completion value, or null when there is none or the
 *   database cannot be opened or queried.
 */
function readLatestCompletionAt() {
  let db;
  try {
    db = new DatabaseSync(DB_PATH, { readOnly: true });
    const t = db.prepare("SELECT MAX(completed_at) m FROM tasks WHERE status='complete'").get();
    const s = db.prepare("SELECT MAX(completed_at) m FROM slices WHERE status='complete'").get();
    const candidates = [t?.m, s?.m].filter(Boolean).sort();
    return candidates[candidates.length - 1] || null;
  } catch {
    // Missing DB, missing table, or any query error: treat as "no data".
    return null;
  } finally {
    // Close on every path so a failing query does not leave a handle open each tick.
    try {
      db?.close();
    } catch {
      // The handle was never opened or is already closed; nothing to release.
    }
  }
}
|
|
|
|
/**
 * Start the watchdog script as a detached background process.
 *
 * Runs `bash WATCHDOG_SCRIPT` with cwd REPO_ROOT and all stdio ignored, then
 * unrefs the child so it does not keep this process's event loop alive.
 *
 * @returns {void}
 */
function relaunchWatchdog() {
  const child = spawn("bash", [WATCHDOG_SCRIPT], {
    detached: true,
    stdio: ["ignore", "ignore", "ignore"],
    cwd: REPO_ROOT,
  });
  // Without a listener, a failed spawn (e.g. bash not found) would surface as an
  // unhandled 'error' event and take the supervisor down with it.
  child.on("error", (err) => {
    logLine(`watchdog spawn failed: ${err?.message ?? String(err)}`);
  });
  child.unref();
}
|
|
|
|
/**
 * Append one timestamped line to the watchdog log (WATCHDOG_LOG).
 *
 * Best-effort: any failure while formatting or appending is ignored.
 *
 * @param {string} msg - Text to record.
 * @returns {void}
 */
function appendWatchdogLog(msg) {
  try {
    const stamp = new Date().toISOString();
    const entry = `[${stamp}] ${msg}\n`;
    appendFileSync(WATCHDOG_LOG, entry);
  } catch {
    // Intentionally ignored: logging must never interrupt supervision.
  }
}
|
|
|
|
/**
 * Serialize `payload` as 2-space-indented JSON and write it to STATUS_FILE.
 *
 * Best-effort: serialization or write failures are ignored.
 *
 * @param {object} payload - Status object to persist.
 * @returns {void}
 */
function writeStatus(payload) {
  try {
    const serialized = JSON.stringify(payload, null, 2);
    writeFileSync(STATUS_FILE, serialized);
  } catch {
    // Intentionally ignored: a failed status write must not stop the loop.
  }
}
|
|
|
|
/**
 * Supervisor entry point: enforce a single instance, then loop forever.
 *
 * Each tick:
 *   - exits cleanly (status "halted-by-file", PID file removed) if HALT_FILE exists;
 *   - looks for the watchdog with pgrep; if absent, logs the event, relaunches
 *     it, waits 2s and re-checks;
 *   - otherwise compares the latest completion value from the DB with the
 *     previous one to maintain a "no completion" streak counter;
 *   - writes the status JSON and sleeps TICK_S seconds.
 *
 * The function only returns by calling process.exit (halt file, signal, or
 * refusing to start because another instance is running).
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Single-instance guard via PID file.
  if (existsSync(PID_FILE)) {
    const existing = Number(readFileSync(PID_FILE, "utf8").trim());
    if (existing && existing !== process.pid) {
      let alive;
      try {
        // Signal 0 performs the existence/permission check without delivering a signal.
        process.kill(existing, 0);
        alive = true;
      } catch (err) {
        // EPERM means the process exists but we may not signal it — still running.
        // Anything else (typically ESRCH) means the PID file is stale.
        alive = err?.code === "EPERM";
      }
      if (alive) {
        logLine(`refusing to start — meta-supervisor already running pid=${existing}`);
        process.exit(1);
      }
      // Stale pid file — overwrite.
    }
  }
  writeFileSync(PID_FILE, String(process.pid));

  // Clear stale halt file from a prior session so we don't immediately exit.
  if (existsSync(HALT_FILE)) {
    try {
      unlinkSync(HALT_FILE);
    } catch {
      // Could not remove it; the loop below will see it and halt.
    }
  }

  const sessionStartIso = new Date().toISOString();
  logLine(`meta-supervisor start, pid=${process.pid}, tick=${TICK_S}s`);

  let watchdogRestarts = 0;
  let lastCompletionIso = readLatestCompletionAt() || "unknown";
  let noCompletionStreak = 0;

  // Single place that assembles the status document, so the halt snapshot and
  // the per-tick snapshot cannot drift apart.
  const buildStatus = (watchdog, lastNote) => ({
    schemaVersion: 1,
    updatedAt: new Date().toISOString(),
    sessionStartAt: sessionStartIso,
    metaSupervisorPid: process.pid,
    tickSeconds: TICK_S,
    watchdog: { ...watchdog, restartsThisSession: watchdogRestarts },
    drift: {
      cyclesWithNoUnitCompletion: noCompletionStreak,
      lastCompletionAt: lastCompletionIso,
    },
    lastNote,
  });

  const removePidFile = () => {
    try {
      unlinkSync(PID_FILE);
    } catch {
      // Already gone or not removable; we are exiting either way.
    }
  };

  const cleanup = (signal) => {
    logLine(`signal ${signal} — exiting cleanly`);
    removePidFile();
    process.exit(0);
  };
  process.on("SIGTERM", () => cleanup("SIGTERM"));
  process.on("SIGINT", () => cleanup("SIGINT"));
  process.on("SIGHUP", () => cleanup("SIGHUP"));

  while (true) {
    if (existsSync(HALT_FILE)) {
      logLine("halt file detected — exiting cleanly");
      writeStatus(buildStatus({ alive: false, pid: null }, "halted-by-file"));
      removePidFile();
      process.exit(0);
    }

    const wpid = await pgrepPid(WATCHDOG_PATTERN);
    let livePid = wpid;
    let note = "ok";

    if (!wpid) {
      const prevPid = readLatestWatchdogPidFromLog() ?? "unknown";
      const msg = `WATCHDOG-RESTART by meta-supervisor: previous pid=${prevPid} vanished. Restarting.`;
      logLine(msg);
      appendWatchdogLog(msg);
      relaunchWatchdog();
      watchdogRestarts += 1;
      // Give the new watchdog a moment to appear in the process table.
      await sleep(2000);
      livePid = await pgrepPid(WATCHDOG_PATTERN);
      note = livePid ? "restarted-watchdog" : "restart-failed";
    } else {
      const latest = readLatestCompletionAt();
      if (latest) {
        if (latest === lastCompletionIso) {
          noCompletionStreak += 1;
        } else {
          noCompletionStreak = 0;
          lastCompletionIso = latest;
        }
      }
    }

    // Report the PID observed in this tick rather than spawning pgrep a third time.
    writeStatus(buildStatus({ alive: Boolean(livePid), pid: livePid ?? null }, note));

    await sleep(TICK_S * 1000);
  }
}
|
|
|
|
// Entry point: run the supervisor; on an unexpected rejection, log it, remove
// the PID file (best-effort) and exit with status 1.
main().catch((fatal) => {
  const detail = fatal.stack || fatal.message || String(fatal);
  logLine(`fatal: ${detail}`);
  try {
    unlinkSync(PID_FILE);
  } catch {
    // PID file missing or not removable; exiting regardless.
  }
  process.exit(1);
});
|