fix: harden sf server control loop
Some checks are pending
CI / detect-changes (push) Waiting to run
CI / docs-check (push) Blocked by required conditions
CI / lint (push) Blocked by required conditions
CI / build (push) Blocked by required conditions
CI / integration-tests (push) Blocked by required conditions
CI / windows-portability (push) Blocked by required conditions
CI / rtk-portability (linux, blacksmith-4vcpu-ubuntu-2404) (push) Blocked by required conditions
CI / rtk-portability (macos, macos-15) (push) Blocked by required conditions
CI / rtk-portability (windows, blacksmith-4vcpu-windows-2025) (push) Blocked by required conditions

This commit is contained in:
Mikael Hugo 2026-05-17 21:13:12 +02:00
parent 70d89eebec
commit acd907fec2
33 changed files with 1602 additions and 192 deletions

View file

@ -1,8 +1,6 @@
---
version: 1
experimental:
smoke_gate: false
---
# SF Preferences
See `~/.sf/agent/extensions/sf/docs/preferences-reference.md` for full documentation.
# See `~/.sf/agent/extensions/sf/docs/preferences-reference.md` for full documentation.

View file

@ -98,6 +98,27 @@ npm run release:changelog
npm run release:bump
```
## Running SF Locally
The server surface is the default local dogfooding surface for web/RPC/autonomous
control. The TUI still exists, but do not use it as the default way to run or
verify autonomous mode.
```bash
# Source/dev server
npm run sf:server -- --port 4000 --host 127.0.0.1
# Built server after npm run build:core or npm run build
npm run sf:server:dist -- --port 4000 --host 127.0.0.1
```
Bind only trusted interfaces. For this workstation, localhost plus Tailscale is
acceptable; public `0.0.0.0` is not the default. If a server is already running,
use `sf headless ...` as the machine/control surface instead of starting a
second writer. Server-forwarded feedback writes are queued and drained by the
server before autonomous dispatch, so CLI control does not block behind a busy
unit.
## Coding Style & Naming Conventions
- **Language**: TypeScript with `"strict": true` enabled in all packages

View file

@ -98,3 +98,34 @@ When adding a new `{{variable}}` to a prompt template in `prompts/`, you must:
`loadPrompt` throws at runtime if any `{{var}}` in the template has no
corresponding key in the vars object — this is intentional to catch
template/code drift early.
## Running the SF server in this repo
Use the server surface for dogfooding and browser/RPC control. Do not start the
TUI as the default way to exercise autonomous mode.
```bash
# source/dev server, with resource redirect and restart support
npm run sf:server -- --port 4000 --host 127.0.0.1
# built server, after npm run build:core or npm run build
npm run sf:server:dist -- --port 4000 --host 127.0.0.1
```
If the server is already running, prefer `sf headless ...` control commands
rather than starting a second writer. Feedback add/resolve commands are
forwarded to the active server and queued there so CLI control does not hang
behind an autonomous unit.
For remote local-network access, bind an additional trusted interface such as a
Tailscale address. Do not bind `0.0.0.0` for the dev server unless an explicit
fronting proxy/firewall decision is in place.
Before assuming a source edit is live, rebuild the relevant output:
```bash
npm run build:core
```
Then restart the server. Stale `dist/` or stale `~/.sf/agent/extensions/sf/`
copies can make fixed source look broken.

View file

@ -409,12 +409,19 @@ export class RpcClient {
subcommand: "add" | "resolve",
args: string[],
json = false,
): Promise<{ exitCode: number; stdout: string; stderr: string }> {
options: { queued?: boolean } = {},
): Promise<{
exitCode: number | null;
stdout: string;
stderr: string;
queued?: boolean;
}> {
const response = await this.send({
type: "sf_feedback",
subcommand,
args,
json,
queued: options.queued,
});
return this.getData(response);
}

View file

@ -12,7 +12,16 @@
*/
import * as crypto from "node:crypto";
import { existsSync, readdirSync, readFileSync, statSync } from "node:fs";
import {
appendFileSync,
existsSync,
mkdirSync,
readdirSync,
readFileSync,
renameSync,
statSync,
unlinkSync,
} from "node:fs";
import type { WriteStream } from "node:tty";
import { pathToFileURL } from "node:url";
import { dirname, join, resolve } from "node:path";
@ -42,6 +51,142 @@ const RUNTIME_HEARTBEAT_INTERVAL_MS = Number(
process.env.SF_RUNTIME_HEARTBEAT_INTERVAL_MS ?? 10_000,
);
const SF_FEEDBACK_QUEUE_FILE = "sf-feedback-queue.jsonl";
const SF_FEEDBACK_FAILED_QUEUE_FILE = "sf-feedback-queue-failed.jsonl";
/**
 * Append one sf_feedback RPC command to the on-disk feedback queue.
 *
 * The queue is a JSONL file under `<cwd>/.sf/runtime`; the directory is created
 * when missing and each call adds exactly one line.
 *
 * @param cwd - Project root that owns the `.sf/runtime` directory.
 * @param command - The sf_feedback command to persist.
 * @returns Path of the queue file the command was appended to.
 */
function queueSfFeedbackCommand(
	cwd: string,
	command: Extract<RpcCommand, { type: "sf_feedback" }>,
): string {
	const runtimeDir = join(cwd, ".sf", "runtime");
	const queuePath = join(runtimeDir, SF_FEEDBACK_QUEUE_FILE);
	// `json` is normalised to a strict boolean so queue rows never carry undefined.
	const record = {
		schemaVersion: 1,
		queuedAt: new Date().toISOString(),
		id: command.id,
		subcommand: command.subcommand,
		args: command.args,
		json: command.json === true,
		source: "rpc",
	};
	mkdirSync(runtimeDir, { recursive: true });
	appendFileSync(queuePath, `${JSON.stringify(record)}\n`, "utf-8");
	return queuePath;
}
/** Shape of one validated row read back from the sf_feedback queue file. */
type QueuedSfFeedbackCommand = {
	schemaVersion: 1;
	queuedAt: string;
	id?: string;
	subcommand: "add" | "list" | "resolve";
	args: string[];
	json: boolean;
	source: "rpc";
};

/**
 * Decode and validate a single JSONL line from the sf_feedback queue.
 *
 * @param line - Raw text of one queue line.
 * @returns A normalised command, or `null` when the line is not valid JSON, is
 *   not a schema-version-1 row, names an unknown subcommand, or has no `args`
 *   array.
 */
function parseQueuedSfFeedbackLine(
	line: string,
): QueuedSfFeedbackCommand | null {
	try {
		const candidate = JSON.parse(line) as Partial<QueuedSfFeedbackCommand>;
		// Destructuring throws for a JSON `null`, which the catch maps to "invalid".
		const { subcommand } = candidate;
		if (candidate.schemaVersion !== 1) return null;
		if (
			subcommand !== "add" &&
			subcommand !== "list" &&
			subcommand !== "resolve"
		) {
			return null;
		}
		if (!Array.isArray(candidate.args)) return null;

		// A missing or non-string timestamp is replaced with the time of parsing.
		const queuedAt =
			typeof candidate.queuedAt === "string"
				? candidate.queuedAt
				: new Date().toISOString();
		const id = typeof candidate.id === "string" ? candidate.id : undefined;
		return {
			schemaVersion: 1,
			queuedAt,
			id,
			subcommand,
			args: candidate.args.map((value) => String(value)),
			json: candidate.json === true,
			source: "rpc",
		};
	} catch {
		return null;
	}
}
/**
 * Apply queued sf_feedback commands through the server-owned feedback handler.
 *
 * Purpose: keep CLI/RPC control commands non-blocking while preserving a single
 * server-owned writer for self-feedback mutations.
 *
 * The queue file is claimed by renaming it to a `.draining` file, so commands
 * appended after the claim land in a fresh queue file. Commands whose handler
 * run throws or exits non-zero — and every claimed command when the handler
 * module itself cannot be loaded — are appended to the failed-queue file
 * rather than dropped.
 *
 * Consumer: start_autonomous RPC command and scheduleQueuedSfFeedbackDrain in
 * the SF server session.
 *
 * @param cwd - Project root that owns the `.sf/runtime` queue directory.
 */
async function drainQueuedSfFeedbackCommands(cwd: string): Promise<void> {
	const runtimeDir = join(cwd, ".sf", "runtime");
	const queuePath = join(runtimeDir, SF_FEEDBACK_QUEUE_FILE);
	if (!existsSync(queuePath)) return;

	// Two drains can overlap inside one process (the scheduled drain and the
	// start_autonomous drain both await the handler). A pid-only claim name made
	// them share one file, so the slower drain's unlink failed with ENOENT.
	const drainingPath = join(
		runtimeDir,
		`${SF_FEEDBACK_QUEUE_FILE}.${process.pid}.${crypto.randomUUID()}.draining`,
	);
	try {
		renameSync(queuePath, drainingPath);
	} catch {
		// The queue vanished between the existence check and the rename, which
		// means another drain claimed it first.
		return;
	}

	const queued = readFileSync(drainingPath, "utf-8")
		.split("\n")
		.map((line) => line.trim())
		.filter(Boolean)
		.map((line) => parseQueuedSfFeedbackLine(line))
		.filter((row): row is QueuedSfFeedbackCommand => row !== null);

	const failed: QueuedSfFeedbackCommand[] = [];
	if (queued.length > 0) {
		let handler: Awaited<
			ReturnType<typeof loadHeadlessFeedbackHandler>
		> | null = null;
		try {
			handler = await loadHeadlessFeedbackHandler();
		} catch {
			// Nothing can be applied without the handler; every claimed command is
			// preserved in the failed queue below instead of being stranded.
			handler = null;
		}
		for (const command of queued) {
			if (handler === null) {
				failed.push(command);
				continue;
			}
			const { handleFeedback } = handler;
			try {
				const captured = await captureProcessWrites(() =>
					handleFeedback(cwd, {
						subcommand: command.subcommand,
						args: command.args,
						json: command.json,
					}),
				);
				if (captured.result.exitCode !== 0) failed.push(command);
			} catch {
				failed.push(command);
			}
		}
	}

	if (failed.length > 0) {
		appendFileSync(
			join(runtimeDir, SF_FEEDBACK_FAILED_QUEUE_FILE),
			`${failed.map((row) => JSON.stringify(row)).join("\n")}\n`,
			"utf-8",
		);
	}
	// Removed only after failures are persisted, so a crash in between leaves the
	// claimed commands on disk.
	unlinkSync(drainingPath);
}
/**
 * Schedule a background drain of the sf_feedback queue on the next timer tick.
 *
 * The timer is unref'd so a pending drain never keeps the process alive. A
 * failed drain is reported on stderr instead of surfacing as an unhandled
 * promise rejection in the server process.
 *
 * @param cwd - Project root that owns the `.sf/runtime` queue directory.
 */
function scheduleQueuedSfFeedbackDrain(cwd: string): void {
	const timer = setTimeout(() => {
		drainQueuedSfFeedbackCommands(cwd).catch((error: unknown) => {
			const detail = error instanceof Error ? error.message : String(error);
			process.stderr.write(
				`[rpc] sf_feedback queue drain failed: ${detail}\n`,
			);
		});
	}, 0);
	timer.unref?.();
}
async function captureProcessWrites<T>(
run: () => Promise<T>,
): Promise<{ result: T; stdout: string; stderr: string }> {
@ -853,6 +998,7 @@ export async function runRpcMode(session: AgentSession): Promise<never> {
const previousHeadless = process.env.SF_HEADLESS;
process.env.SF_HEADLESS = "1";
try {
await drainQueuedSfFeedbackCommands(process.cwd());
await session.prompt("/autonomous", {
source: "rpc",
});
@ -882,6 +1028,16 @@ export async function runRpcMode(session: AgentSession): Promise<never> {
}
case "sf_feedback": {
if (command.queued === true) {
const queuePath = queueSfFeedbackCommand(process.cwd(), command);
scheduleQueuedSfFeedbackDrain(process.cwd());
return success(id, "sf_feedback", {
exitCode: null,
stdout: JSON.stringify({ ok: true, queued: true, queuePath }),
stderr: "",
queued: true,
});
}
const { handleFeedback } = await loadHeadlessFeedbackHandler();
const captured = await captureProcessWrites(() =>
handleFeedback(process.cwd(), {

View file

@ -47,6 +47,7 @@ export type RpcCommand =
subcommand: "add" | "resolve";
args: string[];
json?: boolean;
queued?: boolean;
}
// State
@ -185,7 +186,12 @@ export type RpcResponse =
type: "response";
command: "sf_feedback";
success: true;
data: { exitCode: number; stdout: string; stderr: string };
data: {
exitCode: number | null;
stdout: string;
stderr: string;
queued?: boolean;
};
}
| {
id?: string;

View file

@ -482,12 +482,19 @@ export class RpcClient {
subcommand: "add" | "resolve",
args: string[],
json = false,
): Promise<{ exitCode: number; stdout: string; stderr: string }> {
options: { queued?: boolean } = {},
): Promise<{
exitCode: number | null;
stdout: string;
stderr: string;
queued?: boolean;
}> {
const response = await this.send({
type: "sf_feedback",
subcommand,
args,
json,
queued: options.queued,
});
return this.getData(response);
}

View file

@ -113,6 +113,7 @@ export type RpcCommand =
subcommand: "add" | "resolve";
args: string[];
json?: boolean;
queued?: boolean;
}
// State
@ -251,7 +252,12 @@ export type RpcResponse =
type: "response";
command: "sf_feedback";
success: true;
data: { exitCode: number; stdout: string; stderr: string };
data: {
exitCode: number | null;
stdout: string;
stderr: string;
queued?: boolean;
};
}
| {
id?: string;

View file

@ -227,7 +227,7 @@ export type RunWebCliBranchResult =
| {
handled: true;
exitCode: number;
action: "start";
action: "start" | "reload";
status: WebModeLaunchStatus;
launchInputs: {
cwd: string;
@ -270,8 +270,8 @@ export async function runWebCliBranch(
};
}
// `sf server [start] [path]` starts the full operator server for one repo.
// Matches: `sf server`, `sf server start`, `sf server start <path>`, `sf server <path>`
// `sf server [start|reload] [path]` starts the full operator server for one repo.
// Matches: `sf server`, `sf server start`, `sf server reload`, `sf server <path>`
const isWebSubcommand =
flags.messages[0] === "server" && flags.messages[1] !== "stop";
if (!isWebSubcommand) {
@ -286,7 +286,7 @@ export async function runWebCliBranch(
// sf server <path> → messages[1] (when not "start" or "reload")
let webPath = flags.webPath;
if (!webPath && isWebSubcommand) {
if (flags.messages[1] === "start") {
if (flags.messages[1] === "start" || flags.messages[1] === "reload") {
webPath = flags.messages[2];
} else if (flags.messages[1]) {
webPath = flags.messages[1];
@ -346,6 +346,7 @@ export async function runWebCliBranch(
agentDir,
host: flags.webHost,
port: flags.webPort,
...(flags.messages[1] === "reload" ? { reload: true } : {}),
allowedOrigins: flags.webAllowedOrigins,
});
@ -356,7 +357,7 @@ export async function runWebCliBranch(
return {
handled: true,
exitCode: status.ok ? 0 : 1,
action: "start",
action: flags.messages[1] === "reload" ? "reload" : "start",
status,
launchInputs: {
cwd: currentCwd,

View file

@ -12,9 +12,10 @@ import { resolve } from "node:path";
import { readInstanceRegistry, type WebInstanceEntry } from "./web-mode.js";
export interface ForwardedHeadlessResult {
exitCode: number;
exitCode: number | null;
stdout: string;
stderr: string;
queued?: boolean;
}
type SfFeedbackResponse =
@ -109,6 +110,7 @@ export async function forwardFeedbackToActiveServer(
subcommand: options.subcommand,
args: options.args,
json: options.json,
queued: true,
},
);
if (response.statusCode === 404) return null;

View file

@ -67,6 +67,7 @@ export interface HandleTriageOptions {
max?: number;
run?: boolean;
apply?: boolean;
urgentOnly?: boolean;
model?: string;
agentRunner?: AgentRunner;
}
@ -1166,6 +1167,13 @@ export async function handleTriage(
return { exitCode: 1 };
}
if (options.urgentOnly) {
candidates = candidates.filter(
(candidate) =>
candidate.severity === "high" || candidate.severity === "critical",
);
}
if (typeof options.max === "number" && options.max > 0) {
candidates = candidates.slice(0, options.max);
}

View file

@ -105,6 +105,13 @@ import {
const HEADLESS_HEARTBEAT_INTERVAL_MS = 60_000;
/**
 * Severity label carried by a self-feedback row.
 *
 * NOTE(review): because the union ends in `string`, TypeScript widens the whole
 * alias to `string`; the literals document the expected values but are not
 * enforced by the compiler.
 */
type SelfFeedbackSeverity = "low" | "medium" | "high" | "critical" | string;
/** Minimal view of a self-feedback row used by the urgent pre-triage count. */
interface SelfFeedbackRowForTriage {
/** Resolution timestamp; a missing, null or empty value counts as unresolved. */
resolvedAt?: string | null;
/** Severity of the finding; "high" and "critical" are treated as urgent. */
severity?: SelfFeedbackSeverity;
}
interface HeadlessTimeoutSolverEvalRecord {
runId: string;
reportPath: string;
@ -577,6 +584,31 @@ export async function runHeadless(options: HeadlessOptions): Promise<void> {
}
}
/**
 * Report how many unresolved self-feedback rows are rated high or critical.
 *
 * Purpose: let urgent operator/detector findings bypass the normal triage
 * cadence without making the TypeScript headless surface depend on JS
 * extension declarations.
 *
 * Consumer: runHeadlessOnce before autonomous dispatch.
 *
 * @param basePath - Project root handed to the extension's feedback reader.
 * @returns The urgent-row count, or 0 when the extension module cannot be
 *   loaded or reading fails (best-effort by design).
 */
async function countUrgentSelfFeedbackRows(basePath: string): Promise<number> {
	// Kept in a variable so the specifier is passed to import() as a value.
	const modulePath = "./resources/extensions/sf/self-feedback.js";
	try {
		const loaded = (await import(modulePath)) as {
			readAllSelfFeedback?: (basePath: string) => SelfFeedbackRowForTriage[];
		};
		const rows = loaded.readAllSelfFeedback?.(basePath) ?? [];
		return rows.reduce((urgent, row) => {
			if (row.resolvedAt) return urgent;
			const isUrgent = row.severity === "high" || row.severity === "critical";
			return isUrgent ? urgent + 1 : urgent;
		}, 0);
	} catch {
		return 0;
	}
}
async function runHeadlessOnce(
options: HeadlessOptions,
restartCount: number,
@ -660,12 +692,19 @@ async function runHeadlessOnce(
"last-triage-at",
);
let shouldRunTriage = true;
const urgentTriageCount = await countUrgentSelfFeedbackRows(
process.cwd(),
);
try {
if (existsSync(triageMarkerPath)) {
const last = Date.parse(
readFileSync(triageMarkerPath, "utf8").trim(),
);
if (Number.isFinite(last) && Date.now() - last < triageIntervalMs) {
if (
urgentTriageCount === 0 &&
Number.isFinite(last) &&
Date.now() - last < triageIntervalMs
) {
shouldRunTriage = false;
if (!options.json) {
process.stderr.write(
@ -687,13 +726,16 @@ async function runHeadlessOnce(
const { handleTriage } = await import("./headless-triage.js");
if (!options.json) {
process.stderr.write(
`[headless] autonomous: draining self-feedback triage queue first (max=${triageMaxBatch})...\n`,
urgentTriageCount > 0
? `[headless] autonomous: draining ${urgentTriageCount} high/critical self-feedback entr${urgentTriageCount === 1 ? "y" : "ies"} before dispatch (max=${triageMaxBatch})...\n`
: `[headless] autonomous: draining self-feedback triage queue first (max=${triageMaxBatch})...\n`,
);
}
await handleTriage(process.cwd(), {
apply: true,
json: !!options.json,
max: triageMaxBatch,
urgentOnly: urgentTriageCount > 0,
});
try {
const runtimeDir = join(process.cwd(), ".sf", "runtime");
@ -971,7 +1013,7 @@ async function runHeadlessOnce(
if (forwarded.stdout) process.stdout.write(forwarded.stdout);
if (forwarded.stderr) process.stderr.write(forwarded.stderr);
return {
exitCode: forwarded.exitCode,
exitCode: forwarded.exitCode ?? EXIT_SUCCESS,
interrupted: false,
timedOut: false,
};

View file

@ -6,6 +6,7 @@
* via startUnitSupervision() and torn down by the caller via clearUnitTimeout().
*/
import { saveActivityLog } from "./activity-log.js";
import { resolveAgentEnd } from "./auto/resolve.js";
import { resolveAgentEndCancelled } from "./auto/resolve.js";
import { detectWorkingTreeActivity } from "./auto-supervisor.js";
import { blockModel } from "./blocked-models.js";
@ -40,6 +41,124 @@ import {
writeUnitRuntimeRecord,
} from "./uok/unit-runtime.js";
import { logError, logWarning } from "./workflow-logger.js";
/**
 * Clear active supervision handles for the current unit attempt.
 *
 * Purpose: stop one runaway-guard terminal decision from being emitted repeatedly
 * while the autonomous loop is being unblocked.
 *
 * Consumer: finalizeRunawayGuardFailure() when zero-progress or silent-worker
 * detection has already converted the current unit attempt into a failed record.
 *
 * @param s - Session state object holding the four supervision handles; each
 *   handle that is set is cancelled and reset to null.
 */
function clearSupervisionHandles(s) {
	if (s.unitTimeoutHandle) {
		clearTimeout(s.unitTimeoutHandle);
		s.unitTimeoutHandle = null;
	}
	if (s.wrapupWarningHandle) {
		clearTimeout(s.wrapupWarningHandle);
		s.wrapupWarningHandle = null;
	}
	if (s.idleWatchdogHandle) {
		clearInterval(s.idleWatchdogHandle);
		s.idleWatchdogHandle = null;
	}
	if (s.continueHereHandle) {
		clearInterval(s.continueHereHandle);
		s.continueHereHandle = null;
	}
}

/**
 * Finish a runaway-guard failure as one terminal unit-attempt event.
 *
 * Purpose: convert zero-progress and silent-worker supervision failures into a
 * retryable failed runtime record, close the worker lineage, stop supervision
 * timers, and unblock the unit promise so the autonomous loop can select the
 * next eligible model instead of repeating the same warning.
 *
 * Consumer: startUnitSupervision() idle watchdog fail branch.
 *
 * @param sctx - Supervision context: `{ s, ctx, unitType, unitId, buildSnapshotOpts }`.
 * @param decision - Runaway-guard decision; `reason` and `metadata` are read.
 * @param helpers - Optional overrides for closeoutUnit, writeUnitRuntimeRecord,
 *   blockModel, recordSelfFeedback, notify and resolveAgentEnd (the module's
 *   own implementations are used for any that are omitted).
 * @returns Resolves once the failure is recorded. Does nothing when the session
 *   has no current unit. If recording throws, the error still propagates, but
 *   the timers are cleared and the unit promise is resolved first.
 */
export async function finalizeRunawayGuardFailure(sctx, decision, helpers = {}) {
	const { s, ctx, unitType, unitId, buildSnapshotOpts } = sctx;
	const currentUnit = s.currentUnit;
	if (!currentUnit) return;
	const closeout = helpers.closeoutUnit ?? closeoutUnit;
	const writeRuntime = helpers.writeUnitRuntimeRecord ?? writeUnitRuntimeRecord;
	const block = helpers.blockModel ?? blockModel;
	const recordFeedback = helpers.recordSelfFeedback ?? recordSelfFeedback;
	const notify = helpers.notify ?? ((message, level) => ctx.ui.notify(message, level));
	const resolveUnit =
		helpers.resolveAgentEnd ??
		((event) => {
			resolveAgentEnd(event);
		});
	// One normalised label for every consumer, so notifications, feedback rows
	// and the synthetic event never carry `undefined` when the guard gave no reason.
	const reason = decision.reason ?? "runaway-guard";
	try {
		const failedModel = s.currentUnitModel;
		if (
			decision.reason === "zero-progress" &&
			failedModel?.provider &&
			failedModel?.id
		) {
			block(
				s.basePath,
				failedModel.provider,
				failedModel.id,
				`zero-progress on ${unitType} ${unitId}`,
				// Block expires one hour from now.
				{ expiresAt: Date.now() + 60 * 60 * 1000 },
			);
			notify(
				`Temporarily blocked ${failedModel.provider}/${failedModel.id} after zero-progress on ${unitType} ${unitId}; retry will choose a fallback.`,
				"warning",
			);
		}
		await closeout(
			ctx,
			s.basePath,
			currentUnit.type,
			currentUnit.id,
			currentUnit.startedAt,
			buildSnapshotOpts(),
		);
		writeRuntime(s.basePath, unitType, unitId, currentUnit.startedAt, {
			phase: "failed-silent-worker",
			status: "failed",
			lastProgressAt: Date.now(),
			lastProgressKind: "runaway-guard-fail",
			runawayGuardFail: decision.metadata,
			lineageEvent: {
				status: "failed",
				workerSessionId: ctx.sessionManager?.getSessionId?.(),
				note: `${reason} failed current attempt`,
			},
		});
		const unitParts = unitId.split("/");
		recordFeedback(
			{
				kind: "runaway-loop:silent-worker-failure",
				severity: "high",
				summary: reason,
				evidence: JSON.stringify(decision.metadata, null, 2),
				suggestedFix:
					"LLM session never produced an assistant message — check session-manager.ts:1086-1096 (silent _persist skip) and verify the model/provider is responding. The dispatcher will attempt retry within maxRetries; if persistent, transitions to blocked.",
				occurredIn: {
					unitType,
					milestone: unitParts[0],
					slice: unitParts[1],
					task: unitParts.slice(2).join("/") || undefined,
				},
				source: "detector",
			},
			s.basePath,
		);
		notify(reason, "error");
	} finally {
		// Runs on the failure path too: a throwing closeout or record write must
		// not leave the watchdog armed or the unit promise pending.
		clearSupervisionHandles(s);
		resolveUnit({
			messages: [],
			_synthetic: "runaway-guard-fail",
			reason,
		});
	}
}
/**
* Set up all four supervision timers for the current unit:
* 1. Soft timeout warning (wrapup)
@ -271,65 +390,7 @@ export function startUnitSupervision(sctx) {
}
if (decision.action === "fail") {
if (getInFlightToolCount() > 0) return;
const failedModel = s.currentUnitModel;
if (
decision.reason === "zero-progress" &&
failedModel?.provider &&
failedModel?.id
) {
blockModel(
s.basePath,
failedModel.provider,
failedModel.id,
`zero-progress on ${unitType} ${unitId}`,
{ expiresAt: Date.now() + 60 * 60 * 1000 },
);
ctx.ui.notify(
`Temporarily blocked ${failedModel.provider}/${failedModel.id} after zero-progress on ${unitType} ${unitId}; retry will choose a fallback.`,
"warning",
);
}
await closeoutUnit(
ctx,
s.basePath,
s.currentUnit.type,
s.currentUnit.id,
s.currentUnit.startedAt,
buildSnapshotOpts(),
);
writeUnitRuntimeRecord(
s.basePath,
unitType,
unitId,
s.currentUnit.startedAt,
{
phase: "failed-silent-worker",
status: "failed",
lastProgressAt: Date.now(),
lastProgressKind: "runaway-guard-fail",
runawayGuardFail: decision.metadata,
},
);
const unitParts = unitId.split("/");
recordSelfFeedback(
{
kind: "runaway-loop:silent-worker-failure",
severity: "high",
summary: decision.reason,
evidence: JSON.stringify(decision.metadata, null, 2),
suggestedFix:
"LLM session never produced an assistant message — check session-manager.ts:1086-1096 (silent _persist skip) and verify the model/provider is responding. The dispatcher will attempt retry within maxRetries; if persistent, transitions to blocked.",
occurredIn: {
unitType,
milestone: unitParts[0],
slice: unitParts[1],
task: unitParts.slice(2).join("/") || undefined,
},
source: "detector",
},
s.basePath,
);
ctx.ui.notify(decision.reason, "error");
await finalizeRunawayGuardFailure(sctx, decision);
return;
}
if (decision.action === "pause") {

View file

@ -11,6 +11,7 @@ export { periodicDetectorSweepGate } from "./periodic-runner.js";
export { productionPlateauGate } from "./production-plateau.js";
export { repeatedFeedbackKindGate } from "./repeated-feedback-kind.js";
export { sameUnitLoopGate } from "./same-unit-loop.js";
export { serverDirectionDriftGate } from "./server-direction-drift.js";
export { staleLockGate } from "./stale-lock.js";
export { statusCompletionDriftGate } from "./status-completion-drift.js";
export { zeroProgressGate } from "./zero-progress.js";

View file

@ -11,6 +11,7 @@ import { detectCrashLoop } from "./crash-loop-classifier.js";
import { detectProductionPlateau } from "./production-plateau.js";
import { detectRepeatedFeedbackKind } from "./repeated-feedback-kind.js";
import { detectSameUnitLoop } from "./same-unit-loop.js";
import { detectServerDirectionDrift } from "./server-direction-drift.js";
import { detectStaleLock } from "./stale-lock.js";
import { detectStatusCompletionDrift } from "./status-completion-drift.js";
import { detectZeroProgress } from "./zero-progress.js";
@ -74,6 +75,10 @@ function defaultDetectors(ctx, options) {
name: "production-plateau",
run: () => detectProductionPlateau(ctx?.unitMetrics, ctx, options),
},
{
name: "server-direction-drift",
run: () => detectServerDirectionDrift(ctx, options),
},
];
}

View file

@ -0,0 +1,132 @@
/**
 * server-direction-drift.js — detect obsolete server architecture in live work.
 *
 * Purpose: stop SF from planning queued work against superseded server shapes
 * after the product direction moves to one embedded `sf server` control plane.
 *
 * Consumer: Wiggums periodic detector sweep and UOK detector gate registry.
 */

/** Phrases that identify a superseded server design (all case-insensitive). */
const DEFAULT_DEPRECATED_PATTERNS = [
	/\bsf serve\b/i,
	/\bA2A\b/i,
	/\bJSON-RPC API\b/i,
	/\bper-repo systemd unit\b/i,
	/\bper-repo web servers?\b/i,
	/\bseparate standalone daemon brain\b/i,
];

/** Lower-cased statuses that mark a row as still live. */
const ACTIVE_STATUSES = new Set(["queued", "active", "planned", "pending"]);

/** Lower-cased statuses that mark a row as finished or abandoned. */
const CLOSED_STATUSES = new Set([
	"cancelled",
	"canceled",
	"complete",
	"completed",
	"done",
	"superseded",
	"parked",
]);

/**
 * Detect queued milestone/slice work that still targets a deprecated server path.
 *
 * Purpose: make stale roadmap/server-direction drift visible before autonomous
 * planning spends turns on obsolete `sf serve`, A2A, or per-repo server work.
 *
 * Consumer: periodic-runner.js default detector list.
 *
 * @param ctx - Detector context; `milestones`, `slices` and `requirements` are
 *   read when they are arrays. `null`/`undefined` is treated as empty.
 * @param options - `deprecatedServerPatterns` (array of RegExp) overrides the
 *   built-in pattern list. `null`/`undefined` selects the defaults.
 * @returns `{ stuck: false, reason: "", signature: { checked } }` when nothing
 *   matches, otherwise `{ stuck: true, reason, signature: { matches,
 *   expectedDirection } }` with one match per offending active row.
 */
export function detectServerDirectionDrift(ctx = {}, options = {}) {
	// Default parameters only cover `undefined`; optional chaining also tolerates
	// an explicit `null` context or options object.
	const rows = [
		...normalizeRows(ctx?.milestones, "milestone"),
		...normalizeRows(ctx?.slices, "slice"),
		...normalizeRows(ctx?.requirements, "requirement"),
	];
	const patterns =
		options?.deprecatedServerPatterns ?? DEFAULT_DEPRECATED_PATTERNS;
	const matches = [];
	for (const row of rows) {
		if (!isActiveRow(row)) continue;
		const text = searchableText(row);
		const pattern = patterns.find((candidate) =>
			patternMatches(candidate, text),
		);
		if (!pattern) continue;
		matches.push({
			kind: row.kind,
			id: row.id,
			milestoneId: row.milestoneId ?? row.milestone_id ?? null,
			status: row.status ?? null,
			pattern: pattern.source,
			title: row.title ?? "",
		});
	}
	if (matches.length === 0) {
		return { stuck: false, reason: "", signature: { checked: rows.length } };
	}
	return {
		stuck: true,
		reason: "server-direction-drift",
		signature: {
			matches,
			expectedDirection:
				"sf server is the single operator server; web/Next.js embeds daemon lifecycle",
		},
	};
}

/**
 * Run server-direction drift as a UOK verification gate.
 *
 * Purpose: make superseded server architecture detectable through the common
 * gate runner, not only through ad hoc roadmap review.
 *
 * Consumer: detector gate registry and periodicDetectorSweepGate.
 */
export const serverDirectionDriftGate = {
	id: "server-direction-drift",
	type: "verification",
	/**
	 * @param ctx - Gate context; forwarded to the detector together with
	 *   `ctx.options`. A `null`/`undefined` context yields a pass.
	 * @returns A `manual-attention` result carrying the detector signature when
	 *   drift is found, otherwise a `pass` result.
	 */
	async execute(ctx = {}) {
		const result = detectServerDirectionDrift(ctx, ctx?.options);
		if (result.stuck) {
			return {
				outcome: "manual-attention",
				failureClass: "verification",
				rationale: result.reason,
				findings: result.signature,
			};
		}
		return {
			outcome: "pass",
			failureClass: null,
			rationale: "no server-direction drift",
		};
	},
};

/** Copy each row and tag it with `kind`; anything that is not an array yields []. */
function normalizeRows(rows, kind) {
	if (!Array.isArray(rows)) return [];
	return rows.map((row) => ({ ...row, kind }));
}

/** A row is live when its status is blank or listed as active, and not closed. */
function isActiveRow(row) {
	const status = String(row.status ?? "").toLowerCase();
	if (CLOSED_STATUSES.has(status)) return false;
	return ACTIVE_STATUSES.has(status) || status === "";
}

/** Join every string-valued descriptive field of a row into one searchable text. */
function searchableText(row) {
	return [
		row.id,
		row.title,
		row.description,
		row.why,
		row.goal,
		row.successCriteria,
		row.success_criteria,
		row.notes,
		row.full_content,
		row.vision,
	]
		.filter((value) => typeof value === "string")
		.join("\n");
}

/** Test `pattern` against `text`, scanning from the first character every time. */
function patternMatches(pattern, text) {
	// Regexes with the `g` or `y` flag remember `lastIndex` between test() calls,
	// which would make a caller-supplied pattern skip matches on later rows.
	pattern.lastIndex = 0;
	return pattern.test(text);
}

View file

@ -17,12 +17,35 @@ import {
loadProjectSFPreferences,
} from "./preferences.js";
/** Extract the body section that follows a YAML frontmatter block. */
function extractBodyAfterFrontmatter(content) {
const closingIdx = content.indexOf("\n---", content.indexOf("---"));
if (closingIdx === -1) return null;
const afterFrontmatter = content.slice(closingIdx + 4);
return afterFrontmatter.trim() ? afterFrontmatter : null;
/**
 * Return the preferences documentation comment block from a YAML file.
 *
 * @param content - Full text of an existing preferences file.
 * @returns Everything from the `# SF Preferences` heading onward with each
 *   body line turned into a YAML comment, or `null` when the heading is absent.
 */
function extractPreferencesCommentBlock(content) {
	const marker = "\n# SF Preferences";
	const idx = content.indexOf(marker);
	if (idx >= 0) return commentPreferencesBody(content.slice(idx));
	// A file that opens with the heading gets the same commenting pass, so raw
	// body lines below it cannot end up as live YAML when the file is rewritten.
	if (content.startsWith("# SF Preferences")) {
		return commentPreferencesBody(content);
	}
	return null;
}

/** Return a YAML-commented default preferences reference block. */
function defaultPreferencesCommentBlock() {
	return [
		"",
		"# SF Preferences",
		"#",
		"# See `~/.sf/agent/extensions/sf/docs/preferences-reference.md` for full documentation.",
		"",
	].join("\n");
}

/**
 * Preserve the human reference body without making preferences.yaml multi-doc.
 *
 * Lines that are blank (including whitespace-only lines such as a lone `\r`
 * from CRLF files) or already start with `#` are kept as they are; every other
 * line is prefixed with `# `. Applying the function twice changes nothing.
 *
 * @param body - Text of the reference body, one entry per `\n`-separated line.
 * @returns The body with every non-comment, non-blank line commented out.
 */
function commentPreferencesBody(body) {
	return body
		.split("\n")
		.map((line) => {
			if (line.trim() === "" || line.startsWith("#")) return line;
			return `# ${line}`;
		})
		.join("\n");
}
/** All recognized experimental feature flags with descriptions. */
@ -81,14 +104,15 @@ export function setExperimentalFlag(name, value) {
prefs.experimental = { ...(prefs.experimental ?? {}), [name]: value };
const frontmatter = serializePreferencesToFrontmatter(prefs);
let body =
"\n# SF Preferences\n\nSee `~/.sf/agent/extensions/sf/docs/preferences-reference.md` for full documentation.\n";
let body = defaultPreferencesCommentBlock();
if (existsSync(path)) {
const preserved = extractBodyAfterFrontmatter(readFileSync(path, "utf-8"));
const preserved = extractPreferencesCommentBlock(
readFileSync(path, "utf-8"),
);
if (preserved) body = preserved;
}
mkdirSync(dirname(path), { recursive: true });
writeFileSync(path, `---\n${frontmatter}---${body}`, "utf-8");
writeFileSync(path, `${frontmatter}${body}`, "utf-8");
}
/**

View file

@ -194,7 +194,7 @@ export function _resetParseWarningFlag() {
*/
export function parsePreferencesYaml(content) {
try {
const parsed = parseYaml(content);
const parsed = parseYaml(stripPreferencesYamlDocument(content));
if (typeof parsed !== "object" || parsed === null) return {};
return parsed;
} catch (e) {
@ -203,6 +203,22 @@ export function parsePreferencesYaml(content) {
}
}
/**
 * Cut preferences.yaml down to its machine-readable YAML portion.
 *
 * Purpose: tolerate older files where a human reference body was appended as
 * raw Markdown after `# SF Preferences` while keeping canonical writes pure
 * YAML plus comments.
 *
 * Consumer: parsePreferencesYaml before handing content to the YAML parser.
 *
 * @param content - Full text of the preferences file.
 * @returns The text before the first line that starts with `# SF Preferences`,
 *   or the whole input when no such line follows a newline.
 */
function stripPreferencesYamlDocument(content) {
	const bodyStart = content.indexOf("\n# SF Preferences");
	return bodyStart === -1 ? content : content.slice(0, bodyStart);
}
/**
* Parse legacy frontmatter-style preference content.
*

View file

@ -13,6 +13,7 @@ export * from "./sf-db/sf-db-memory.js";
export * from "./sf-db/sf-db-milestones.js";
export * from "./sf-db/sf-db-mode-state.js";
export * from "./sf-db/sf-db-profile.js";
export * from "./sf-db/roadmap-projection-sync.js";
export * from "./sf-db/sf-db-self-feedback.js";
export * from "./sf-db/sf-db-session-store.js";
export * from "./sf-db/sf-db-slices.js";

View file

@ -0,0 +1,85 @@
/**
* roadmap-projection-sync.js - schedule DB-backed roadmap projection refreshes.
*
* Purpose: keep M###-ROADMAP.md and M###-ROADMAP.json as generated views of
* canonical SQLite planning state after milestone or slice mutations.
*
* Consumer: sf-db milestone/slice write wrappers and projection-sync tests.
*/
import { logWarning } from "../workflow-logger.js";
/** Milestones waiting for a refresh, keyed by `${basePath}\0${milestoneId}`. */
const pending = new Map();
/** Refreshes currently rendering: same key, mapped to their completion promise. */
const inFlight = new Map();

/**
 * Queue a best-effort ROADMAP.md/json refresh for one milestone.
 *
 * Purpose: make roadmap files server-maintained projections instead of stale
 * manually rendered artifacts while keeping DB writes synchronous and durable.
 *
 * Consumer: insert/update milestone and slice DB wrappers.
 *
 * @param basePath - Project root; defaults to the current working directory.
 * @param milestoneId - Milestone whose projection should be re-rendered; a
 *   falsy id is ignored.
 */
export function scheduleRoadmapProjectionRefresh(
	basePath = process.cwd(),
	milestoneId,
) {
	if (!milestoneId || roadmapProjectionSyncDisabled()) return;
	const key = `${basePath}\0${milestoneId}`;
	if (pending.has(key)) return;
	pending.set(key, { basePath, milestoneId });
	// A write that lands while this milestone is rendering must not be dropped:
	// the running render loop re-checks `pending` when it finishes, so the entry
	// stays queued and no extra timer is needed.
	// NOTE(review): assumes renderRoadmapFromDb does not itself call a DB wrapper
	// that schedules a refresh for the same milestone — confirm.
	if (inFlight.has(key)) return;
	const timer = setTimeout(() => {
		void flushOneRoadmapProjection(key);
	}, 0);
	timer.unref?.();
}

/**
 * Refresh one roadmap projection immediately.
 *
 * Purpose: provide an explicit, awaitable projection path for tests and repair
 * tools while sharing the same renderer used by the asynchronous scheduler.
 *
 * Consumer: roadmap projection sync tests and future server repair jobs.
 *
 * @param basePath - Project root passed to the renderer.
 * @param milestoneId - Milestone to render.
 * @returns Whatever `renderRoadmapFromDb` returns.
 */
export async function refreshRoadmapProjectionNow(basePath, milestoneId) {
	const { renderRoadmapFromDb } = await import("../markdown-renderer.js");
	return renderRoadmapFromDb(basePath, milestoneId);
}

/**
 * Drain queued projection refreshes.
 *
 * Purpose: let tests prove DB writes schedule real roadmap projection updates
 * without waiting on wall-clock timers.
 *
 * Consumer: roadmap-projection-sync.test.mjs.
 *
 * @returns Resolves once nothing is queued and nothing is still rendering.
 */
export async function flushRoadmapProjectionRefreshesForTests() {
	while (pending.size > 0 || inFlight.size > 0) {
		const keys = new Set([...pending.keys(), ...inFlight.keys()]);
		await Promise.all(
			[...keys].map((key) => flushOneRoadmapProjection(key)),
		);
	}
}

/**
 * Decide whether scheduling is switched off: `SF_ROADMAP_PROJECTION_SYNC` set to
 * "0" disables and "1" enables; otherwise it is disabled when `VITEST` is "true".
 */
function roadmapProjectionSyncDisabled() {
	if (process.env.SF_ROADMAP_PROJECTION_SYNC === "0") return true;
	if (process.env.SF_ROADMAP_PROJECTION_SYNC === "1") return false;
	return process.env.VITEST === "true";
}

/**
 * Start rendering one queued key, or join the render already running for it.
 *
 * @param key - Queue key built by scheduleRoadmapProjectionRefresh.
 * @returns Promise that settles when the key's render loop has finished.
 */
function flushOneRoadmapProjection(key) {
	const running = inFlight.get(key);
	if (running) return running;
	if (!pending.has(key)) return Promise.resolve();
	const task = renderPendingRoadmapProjection(key);
	inFlight.set(key, task);
	return task;
}

/**
 * Render a key until no refresh request is queued for it any more.
 *
 * Render failures are logged as warnings and do not stop the loop.
 */
async function renderPendingRoadmapProjection(key) {
	try {
		for (
			let entry = pending.get(key);
			entry;
			entry = pending.get(key)
		) {
			pending.delete(key);
			try {
				await refreshRoadmapProjectionNow(entry.basePath, entry.milestoneId);
			} catch (err) {
				logWarning("roadmap-projection-sync", "projection refresh failed", {
					milestoneId: entry.milestoneId,
					error: err instanceof Error ? err.message : String(err),
				});
			}
		}
	} finally {
		// Cleared in the same synchronous step as the final `pending` check, so a
		// later schedule call can never see a finished render as still in flight.
		inFlight.delete(key);
	}
}

View file

@ -11,6 +11,7 @@ import {
rowToMilestone,
transaction,
} from "./sf-db-core.js";
import { scheduleRoadmapProjectionRefresh } from "./roadmap-projection-sync.js";
export function insertMilestone(m) {
const currentDb = _getAdapter();
@ -57,6 +58,7 @@ export function insertMilestone(m) {
if (hasPlanningPayload(m.planning)) {
insertMilestoneSpecIfAbsent(m.id, m.planning ?? {});
}
scheduleRoadmapProjectionRefresh(process.cwd(), m.id);
}
export function upsertMilestonePlanning(milestoneId, planning) {
@ -111,6 +113,7 @@ export function upsertMilestonePlanning(milestoneId, planning) {
? JSON.stringify(planning.productResearch)
: null,
});
scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId);
}
export function getAllMilestones() {
@ -146,6 +149,7 @@ export function updateMilestoneStatus(milestoneId, status, completedAt) {
":completed_at": completedAt ?? null,
":id": milestoneId,
});
scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId);
}
export function updateMilestoneQueueOrder(order) {
@ -159,6 +163,9 @@ export function updateMilestoneQueueOrder(order) {
stmt.run({ ":sequence": i + 1, ":id": order[i] });
}
});
for (const milestoneId of order) {
scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId);
}
}
export function getActiveMilestoneFromDb() {
@ -274,6 +281,9 @@ export function bulkInsertLegacyHierarchy(payload) {
);
}
});
for (const milestoneId of clearMilestoneIds) {
scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId);
}
}
export function clearEngineHierarchy() {

View file

@ -10,6 +10,7 @@ import {
safeParseJsonArray,
transaction,
} from "./sf-db-core.js";
import { scheduleRoadmapProjectionRefresh } from "./roadmap-projection-sync.js";
export function insertSlice(s) {
const currentDb = _getAdapter();
@ -95,6 +96,7 @@ export function insertSlice(s) {
":raw_traces_vision_fragment": s.tracesVisionFragment ?? null,
});
insertSliceSpecIfAbsent(s.milestoneId, s.id, s.planning ?? {});
scheduleRoadmapProjectionRefresh(process.cwd(), s.milestoneId);
}
export function insertOrIgnoreSlice(args) {
@ -109,6 +111,7 @@ export function insertOrIgnoreSlice(args) {
":title": args.title,
":ts": args.createdAt,
});
scheduleRoadmapProjectionRefresh(process.cwd(), args.milestoneId);
}
export function clearSliceSketch(milestoneId, sliceId) {
@ -127,6 +130,7 @@ export function setSliceSketchFlag(milestoneId, sliceId, isSketch) {
":mid": milestoneId,
":sid": sliceId,
});
scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId);
}
export function autoHealSketchFlags(milestoneId, hasPlanFile) {
@ -178,6 +182,7 @@ export function upsertSlicePlanning(milestoneId, sliceId, planning) {
// ADR-0000 P2 (schema v69): vision trace fragment is part of planning.
":traces_vision_fragment": planning.tracesVisionFragment ?? null,
});
scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId);
}
// ADR-0000 P2 (schema v69): focused setter so callers that already have a
@ -195,6 +200,7 @@ export function updateSliceVisionTrace(milestoneId, sliceId, fragment) {
":mid": milestoneId,
":sid": sliceId,
});
scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId);
}
export function getSlice(milestoneId, sliceId) {
@ -219,6 +225,7 @@ export function updateSliceStatus(milestoneId, sliceId, status, completedAt) {
":milestone_id": milestoneId,
":id": sliceId,
});
scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId);
}
export function setSliceUatVerdict(milestoneId, sliceId, verdict) {
@ -229,6 +236,7 @@ export function setSliceUatVerdict(milestoneId, sliceId, verdict) {
`UPDATE slices SET uat_verdict = :verdict WHERE milestone_id = :mid AND id = :sid`,
)
.run({ ":mid": milestoneId, ":sid": sliceId, ":verdict": verdict });
scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId);
}
export function getSliceUatVerdict(milestoneId, sliceId) {
@ -312,6 +320,7 @@ export function setSliceSummaryMd(milestoneId, sliceId, summaryMd, uatMd) {
":summary_md": summaryMd,
":uat_md": uatMd,
});
scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId);
}
export function getMilestoneSlices(milestoneId) {
@ -369,6 +378,7 @@ export function syncSliceDependencies(milestoneId, sliceId, depends) {
)
.run({ ":mid": milestoneId, ":sid": sliceId, ":dep": dep });
}
scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId);
}
export function getDependentSlices(milestoneId, sliceId) {
@ -452,6 +462,7 @@ export function updateSliceFields(milestoneId, sliceId, fields) {
":depends": fields.depends ? JSON.stringify(fields.depends) : null,
":demo": fields.demo ?? null,
});
scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId);
}
export function setSliceReplanTriggeredAt(milestoneId, sliceId, ts) {
@ -462,6 +473,7 @@ export function setSliceReplanTriggeredAt(milestoneId, sliceId, ts) {
"UPDATE slices SET replan_triggered_at = :ts WHERE milestone_id = :mid AND id = :sid",
)
.run({ ":ts": ts, ":mid": milestoneId, ":sid": sliceId });
scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId);
}
export function deleteSlice(milestoneId, sliceId) {
@ -493,4 +505,5 @@ export function deleteSlice(milestoneId, sliceId) {
.prepare(`DELETE FROM slices WHERE milestone_id = :mid AND id = :sid`)
.run({ ":mid": milestoneId, ":sid": sliceId });
});
scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId);
}

View file

@ -0,0 +1,82 @@
/**
* detector-server-direction-drift.test.mjs — server direction drift contracts.
*
* Purpose: prove Wiggums catches queued work that revives superseded server
* architecture while ignoring cancelled historical slices.
*/
import assert from "node:assert/strict";
import { test } from "vitest";
import {
detectServerDirectionDrift,
serverDirectionDriftGate,
} from "../detectors/server-direction-drift.js";
import { runDetectorSweep } from "../detectors/periodic-runner.js";
// A queued slice whose title mentions `sf serve` must be reported as drift.
test("detectServerDirectionDrift_when_queued_slice_mentions_sf_serve_flags_drift", () => {
	const queuedSlice = {
		milestone_id: "M053",
		id: "S01",
		status: "queued",
		title: "`sf serve` daemon scaffold + JSON-RPC API",
		goal: "Create a separate JSON-RPC API.",
	};

	const verdict = detectServerDirectionDrift({ slices: [queuedSlice] });

	assert.equal(verdict.stuck, true);
	assert.equal(verdict.reason, "server-direction-drift");
	// The first reported match should point back at the offending slice.
	const [firstMatch] = [verdict.signature.matches[0]];
	assert.equal(firstMatch.id, "S01");
});
// The same wording on a cancelled slice is history and must not be flagged.
test("detectServerDirectionDrift_when_cancelled_slice_mentions_sf_serve_ignores_history", () => {
	const cancelledSlice = {
		milestone_id: "M053",
		id: "S01",
		status: "cancelled",
		title: "`sf serve` daemon scaffold + JSON-RPC API",
	};

	const { stuck } = detectServerDirectionDrift({ slices: [cancelledSlice] });

	assert.equal(stuck, false);
});
// The gate wrapper should surface drift found in active requirements.
test("serverDirectionDriftGate_when_drift_exists_returns_manual_attention", async () => {
	const driftingRequirement = {
		id: "R999",
		status: "active",
		description: "Add A2A as the primary server control plane.",
	};

	const gateResult = await serverDirectionDriftGate.execute({
		requirements: [driftingRequirement],
	});

	assert.equal(gateResult.outcome, "manual-attention");
	assert.equal(gateResult.rationale, "server-direction-drift");
});
// The periodic sweep must include the drift detector among those that fire.
test("runDetectorSweep_includes_server_direction_drift_detector", async () => {
	const sweepInput = {
		slices: [
			{
				id: "S99",
				status: "queued",
				title: "Per-repo systemd unit for another server",
			},
		],
	};

	// throttleMs: 0 is passed so the sweep options match the original call.
	const sweep = await runDetectorSweep(sweepInput, { throttleMs: 0 });

	const driftFired = sweep.detectorsFired.some(
		(detector) => detector.name === "server-direction-drift",
	);
	assert.ok(driftFired);
});

View file

@ -2,6 +2,7 @@ import { describe, expect, test } from "vitest";
import {
BASE_REQUIREMENTS,
MODEL_CAPABILITY_PROFILES,
resolveModelForComplexity,
scoreEligibleModels,
scoreModel,
} from "../model-router.js";
@ -16,6 +17,11 @@ describe("agentic capability axis (ADR-0079)", () => {
);
});
// Challenge units must demand strong reasoning and agentic capability.
test("challenge base requirements weight adversarial agentic reasoning", () => {
	const { reasoning, agentic } = BASE_REQUIREMENTS.challenge;
	expect(reasoning).toBeGreaterThanOrEqual(0.8);
	expect(agentic).toBeGreaterThanOrEqual(0.85);
});
test("known agentic-capable models score higher than coding-completion models on execute-task", () => {
const codestralScore = scoreModel(
MODEL_CAPABILITY_PROFILES["codestral-latest"],
@ -34,6 +40,45 @@ describe("agentic capability axis (ADR-0079)", () => {
expect(sonnetScore).toBeGreaterThan(codestralScore);
});
// The sticky hint only wins for challenge units when sticky_routing is on.
test("challenge routing ignores sticky model unless explicitly enabled", () => {
	const stickyHint = { provider: "minimax", id: "MiniMax-M2.7" };
	const availableModels = ["kimi-coding/kimi-k2.6", "minimax/MiniMax-M2.7"];
	const routingConfig = {
		enabled: true,
		capability_routing: true,
	};
	const phaseConfig = {
		primary: "openai/gpt-5.5",
		fallbacks: ["minimax/MiniMax-M2.7"],
	};

	// Default routing: the hint is supplied but must be ignored.
	const {
		selectionMethod: defaultMethod,
		modelId: defaultModelId,
	} = resolveModelForComplexity(
		{ tier: "standard" },
		phaseConfig,
		routingConfig,
		availableModels,
		"challenge",
		{},
		{},
		stickyHint,
	);
	expect(defaultMethod).toBe("capability-scored");
	expect(defaultModelId).toBe("kimi-coding/kimi-k2.6");

	// Opt-in routing: identical inputs apart from sticky_routing: true.
	const {
		selectionMethod: stickyMethod,
		modelId: stickyModelId,
	} = resolveModelForComplexity(
		{ tier: "standard" },
		phaseConfig,
		{ ...routingConfig, sticky_routing: true },
		availableModels,
		"challenge",
		{},
		{},
		stickyHint,
	);
	expect(stickyMethod).toBe("slice-sticky");
	expect(stickyModelId).toBe("minimax/MiniMax-M2.7");
});
test("devstral variants score below agentic models on execute-task", () => {
const devstralScore = scoreModel(
MODEL_CAPABILITY_PROFILES["devstral-2512"],

View file

@ -110,6 +110,25 @@ describe("preferences model resolution", () => {
});
});
// With both a planning and a validation model configured, the "challenge"
// unit is expected to resolve to the validation model with no fallbacks.
test("resolveModelWithFallbacksForUnit_when_challenge_uses_validation_model", () => {
// Preferences fixture: distinct models per phase so the assertion below can
// tell which phase the challenge unit was mapped to.
makePreferencesProject(
[
"version: 1",
"models:",
" planning: minimax/MiniMax-M2.7",
" validation: kimi-coding/kimi-k2.6",
"",
].join("\n"),
);
const result = resolveModelWithFallbacksForUnit("challenge");
assert.deepEqual(result, {
primary: "kimi-coding/kimi-k2.6",
fallbacks: [],
});
});
test("isModelInEnabledList_when_list_empty_allows_any_model", () => {
assert.equal(isModelInEnabledList("kimi-coding", "kimi-k2.6", []), true);
assert.equal(

View file

@ -0,0 +1,106 @@
import assert from "node:assert/strict";
import {
existsSync,
mkdirSync,
mkdtempSync,
readFileSync,
rmSync,
} from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, describe, test } from "vitest";
import {
closeDatabase,
flushRoadmapProjectionRefreshesForTests,
insertMilestone,
insertSlice,
openDatabase,
updateSliceStatus,
upsertMilestonePlanning,
} from "../sf-db.js";
const originalCwd = process.cwd();
const originalEnv = { ...process.env };
const tmpDirs = [];
// Undo everything makeProject changed: DB handle, cwd, env, temp directories.
afterEach(() => {
	closeDatabase();
	process.chdir(originalCwd);
	process.env = { ...originalEnv };
	// Remove temp projects newest-first, emptying the list as we go.
	for (let dir = tmpDirs.pop(); tmpDirs.length >= 0 && dir !== undefined; dir = tmpDirs.pop()) {
		rmSync(dir, { recursive: true, force: true });
	}
});
/**
 * Create a throwaway project directory and make it the active SF project.
 *
 * Registers the directory for cleanup, forces projection sync on through
 * SF_ROADMAP_PROJECTION_SYNC, switches the process cwd into it and opens the
 * database at `.sf/sf.db`. Returns the project directory path.
 */
function makeProject() {
	const projectDir = mkdtempSync(join(tmpdir(), "sf-roadmap-sync-"));
	tmpDirs.push(projectDir);

	const stateDir = join(projectDir, ".sf");
	mkdirSync(stateDir, { recursive: true });

	// The scheduler is off by default under vitest; "1" opts this test back in.
	process.env.SF_ROADMAP_PROJECTION_SYNC = "1";
	process.chdir(projectDir);
	openDatabase(join(stateDir, "sf.db"));

	return projectDir;
}
// End-to-end contract: DB writes through the sf-db wrappers must result in
// refreshed ROADMAP.md / ROADMAP.json files once the refresh queue is drained.
describe("roadmap projection sync", () => {
test("db_writes_refresh_roadmap_projection", async () => {
const project = makeProject();
// Phase 1: create a milestone and one slice, then drain the refresh queue.
insertMilestone({
id: "M777",
title: "Initial server plan",
status: "queued",
planning: {
vision: "Keep planning state in SQLite.",
successCriteria: ["Projection exists."],
},
});
insertSlice({
milestoneId: "M777",
id: "S01",
title: "Render projection",
status: "pending",
sequence: 1,
planning: {
goal: "Write ROADMAP.md and ROADMAP.json from DB state.",
},
});
await flushRoadmapProjectionRefreshesForTests();
// Expected projection locations under the temp project's .sf directory.
const roadmapPath = join(
project,
".sf",
"milestones",
"M777",
"M777-ROADMAP.md",
);
const jsonPath = join(
project,
".sf",
"milestones",
"M777",
"M777-ROADMAP.json",
);
assert.equal(existsSync(roadmapPath), true);
assert.equal(existsSync(jsonPath), true);
assert.match(readFileSync(roadmapPath, "utf-8"), /Initial server plan/);
// Phase 2: change the milestone planning and complete the slice; the files
// must reflect both updates after the next drain.
upsertMilestonePlanning("M777", {
title: "Server-owned roadmap projection",
vision: "The server refreshes generated roadmap files after DB writes.",
});
updateSliceStatus("M777", "S01", "complete", "2026-05-17T20:00:00.000Z");
await flushRoadmapProjectionRefreshesForTests();
const roadmap = readFileSync(roadmapPath, "utf-8");
const projection = JSON.parse(readFileSync(jsonPath, "utf-8"));
assert.match(roadmap, /Server-owned roadmap projection/);
assert.match(
roadmap,
/The server refreshes generated roadmap files after DB writes/,
);
// A completed slice is rendered as a checked markdown task item.
assert.match(roadmap, /- \[x\] \*\*S01: Render projection\*\*/);
assert.equal(projection.origin, "db-projection");
assert.equal(projection.slices[0].status, "complete");
});
});

View file

@ -11,6 +11,7 @@ import { repeatedFeedbackKindGate } from "../detectors/repeated-feedback-kind.js
import { artifactFlapGate } from "../detectors/artifact-flap.js";
import { staleLockGate } from "../detectors/stale-lock.js";
import { periodicDetectorSweepGate } from "../detectors/periodic-runner.js";
import { serverDirectionDriftGate } from "../detectors/server-direction-drift.js";
import { inlineRuntimeGate } from "./inline-runtime-gate.js";
/**
@ -41,6 +42,7 @@ registry.register(zeroProgressGate);
registry.register(repeatedFeedbackKindGate);
registry.register(artifactFlapGate);
registry.register(staleLockGate);
registry.register(serverDirectionDriftGate);
registry.register(periodicDetectorSweepGate);
registry.register(inlineRuntimeGate);

View file

@ -20,6 +20,24 @@ const handlerSrc = readFileSync(
join(__dirname, "..", "headless-feedback.ts"),
"utf-8",
);
const forwardSrc = readFileSync(
join(__dirname, "..", "headless-server-forward.ts"),
"utf-8",
);
const rpcModeSrc = readFileSync(
join(
__dirname,
"..",
"..",
"packages",
"coding-agent",
"src",
"modes",
"rpc",
"rpc-mode.ts",
),
"utf-8",
);
test("headless.ts dispatches feedback command to handleFeedback", () => {
assert.match(
@ -72,5 +90,29 @@ test("add path defaults blocking from severity, doesn't require it", () => {
// readBoolFlag(--blocking) OR severity === high|critical → blocking=true.
// The behaviour is documented in self-feedback.js (deriveBlocking),
// mirror it so operator-filed entries have consistent semantics.
assert.match(handlerSrc, /severity === "high" \|\| severity === "critical"/);
assert.match(handlerSrc, /severity === "high"/);
assert.match(handlerSrc, /severity === "critical"/);
});
// Source-level contract: forwarded feedback writes are queued by the CLI side
// and persisted/drained by the RPC server.
test("active-server feedback forwarding queues writes instead of blocking RPC", () => {
	const expectations = [
		{
			source: forwardSrc,
			pattern: /queued:\s*true/,
			message:
				"forwarded add/resolve commands must ask the active RPC server to queue writes",
		},
		{
			source: rpcModeSrc,
			pattern: /SF_FEEDBACK_QUEUE_FILE = "sf-feedback-queue\.jsonl"/,
			message: "RPC server must persist queued feedback commands durably",
		},
		{
			source: rpcModeSrc,
			pattern: /await drainQueuedSfFeedbackCommands\(process\.cwd\(\)\)/,
			message:
				"server-owned autonomous startup must drain queued feedback before running",
		},
		{
			source: rpcModeSrc,
			pattern: /scheduleQueuedSfFeedbackDrain\(process\.cwd\(\)\)/,
			message:
				"queued feedback commands should also drain from the server control lane",
		},
	];

	// Checked in declaration order; the first mismatch fails the test.
	for (const { source, pattern, message } of expectations) {
		assert.match(source, pattern, message);
	}
});

View file

@ -954,6 +954,51 @@ test("sf server stop <path> is parsed and dispatched with resolved path", async
assert.equal(stopOptions?.all, false);
});
// `sf server reload <path>` must be routed to the web launcher with the given
// path as cwd and the reload flag set.
test("sf server reload <path> is parsed as reload launch", async (_t) => {
const tmp = mkdtempSync(join(tmpdir(), "sf-web-reload-path-"));
// Captures whatever options the CLI branch hands to the launcher stub.
let receivedOptions: Record<string, unknown> | undefined;
afterEach(() => {
rmSync(tmp, { recursive: true, force: true });
});
mkdirSync(tmp, { recursive: true });
const flags = cliWeb.parseCliArgs([
"node",
"dist/loader.js",
"server",
"reload",
tmp,
]);
assert.deepEqual(flags.messages, ["server", "reload", tmp]);
const result = await cliWeb.runWebCliBranch(flags, {
// Injected cwd is "/" so the assertion below proves the path argument,
// not the ambient cwd, decides the launch directory.
cwd: () => "/",
// Launcher stub: records its options and returns a canned success status.
runWebMode: async (options) => {
receivedOptions = options as unknown as Record<string, unknown>;
return {
mode: "web" as const,
ok: true as const,
cwd: options.cwd,
projectSessionsDir: options.projectSessionsDir,
host: "127.0.0.1",
port: 4000,
url: "http://127.0.0.1:4000",
hostKind: "packaged-standalone" as const,
hostPath: "/tmp/server.js",
hostRoot: "/tmp",
};
},
stderr: { write: () => true },
});
assert.equal(result.handled, true);
// Narrow the result type before reading `action`.
if (!result.handled) throw new Error("expected handled");
assert.equal(result.action, "reload");
assert.equal(receivedOptions?.cwd, tmp);
assert.equal(receivedOptions?.reload, true);
});
// ─── Context-aware launch detection tests ──────────────────────────────
test("resolveContextAwareCwd returns project cwd when inside a project under dev root", (_t) => {
@ -1137,12 +1182,94 @@ test("launchWebMode kills stale instance for same cwd before spawning", async (_
assert.equal(status.ok, true);
assert.equal(spawnCalled, true);
// Stale instance for same cwd should have been cleaned up
assert.match(stderrOutput, /Cleaning up stale/);
assert.match(stderrOutput, /Stale SF server was already stopped/);
// New instance should be registered
const registry = webMode.readInstanceRegistry(registryPath);
assert.equal(registry[resolve(cwd)]?.pid, 88888);
});
// Reload contract: with a live instance registered on port 4000, launchWebMode
// must first boot a candidate on a temporary port, and only then launch and
// register the replacement on the fixed port.
test("launchWebMode reload proves candidate before replacing fixed-port server", async (_t) => {
const tmp = mkdtempSync(join(tmpdir(), "sf-web-reload-"));
const standaloneRoot = join(tmp, "dist", "web", "standalone");
const serverPath = join(standaloneRoot, "server.js");
mkdirSync(standaloneRoot, { recursive: true });
writeFileSync(serverPath, 'console.log("stub")\n');
const registryPath = join(tmp, "web-instances.json");
const pidFilePath = join(tmp, "web-server.pid");
const cwd = "/tmp/reload-project";
// Pre-existing registry entry that the reload is expected to replace.
webMode.registerInstance(
cwd,
{ pid: 77777, port: 4000, url: "http://127.0.0.1:4000" },
registryPath,
);
// Recorders for the order in which hosts are spawned and boot-checked.
const spawnPorts: string[] = [];
const bootUrls: string[] = [];
let nextPid = 90000;
let stderrOutput = "";
afterEach(() => {
rmSync(tmp, { recursive: true, force: true });
});
const status = await webMode.launchWebMode(
{
cwd,
projectSessionsDir: "/tmp/.sf/sessions/reload",
agentDir: "/tmp/.sf/agent",
packageRoot: tmp,
port: 4000,
reload: true,
},
{
initResources: () => {},
// Port handed out for the temporary candidate.
resolvePort: async () => 45123,
execPath: "/custom/node",
env: { TEST_ENV: "1" },
// kill stub: only the signal-0 probe of pid 77777 succeeds, so the
// registered instance looks alive; every other call throws ESRCH.
kill: ((pid: number, signal?: string | number) => {
if (pid === 77777 && signal === 0) return true;
const error = new Error("no such process") as NodeJS.ErrnoException;
error.code = "ESRCH";
throw error;
}) as typeof process.kill,
// spawn stub: records the PORT each host is launched with and hands out
// increasing fake pids starting at 90000.
spawn: (_command, _args, options) => {
spawnPorts.push(String(options.env?.PORT));
return {
pid: nextPid++,
once: () => undefined,
unref: () => {},
} as any;
},
// Boot check stub: records the URL and reports ready immediately.
waitForBootReady: async (url) => {
bootUrls.push(url);
},
openBrowser: () => {},
pidFilePath,
writePidFile: webMode.writePidFile,
registryPath,
stderr: {
write(chunk: string) {
stderrOutput += chunk;
return true;
},
},
},
);
assert.equal(status.ok, true);
// Candidate on the temporary port first, then the real host on port 4000.
assert.deepEqual(spawnPorts, ["45123", "4000"]);
assert.deepEqual(bootUrls, [
"http://127.0.0.1:45123",
"http://127.0.0.1:4000",
]);
assert.match(stderrOutput, /Proving reload candidate/);
assert.match(stderrOutput, /Reload candidate passed boot check/);
// pid 90001 is the second spawn, i.e. the final host rather than the candidate.
const registry = webMode.readInstanceRegistry(registryPath);
assert.equal(registry[resolve(cwd)]?.pid, 90001);
assert.equal(registry[resolve(cwd)]?.port, 4000);
});
test("launchWebMode does not log cleanup when no stale instance exists", async (_t) => {
const tmp = mkdtempSync(join(tmpdir(), "sf-web-no-stale-"));
const standaloneRoot = join(tmp, "dist", "web", "standalone");

View file

@ -56,6 +56,16 @@ export interface WebModeLaunchOptions {
packageRoot?: string;
host?: string;
port?: number;
/**
* Reload an existing registered server after the replacement passes boot.
*
* Purpose: keep `sf server` upgrades graceful by proving the candidate host
* is healthy before terminating the old process bound to the project.
*
* Consumer: `sf server reload` and default `sf server start` behavior when a
* live same-project instance already exists.
*/
reload?: boolean;
/** Additional allowed origins for CORS (forwarded as SF_WEB_ALLOWED_ORIGINS). */
allowedOrigins?: string[];
}
@ -128,6 +138,7 @@ export interface WebModeDeps {
writePidFile?: (path: string, pid: number) => void;
readPidFile?: (path: string) => number | null;
deletePidFile?: (path: string) => void;
kill?: typeof process.kill;
/** Path to the multi-instance registry JSON (for testing). */
registryPath?: string;
}
@ -146,6 +157,11 @@ export interface WebModeStopResult {
stoppedCount?: number;
}
/**
 * Result of looking up a project's entry in the instance registry.
 *
 * - "none": no entry is registered.
 * - "dead": an entry is registered but its pid no longer exists.
 * - "live": an entry is registered and its pid exists.
 */
type ExistingServerInstance =
| { state: "none" }
| { state: "dead"; entry: WebInstanceEntry }
| { state: "live"; entry: WebInstanceEntry };
// ─── Instance Registry ──────────────────────────────────────────────────────
export interface WebInstanceEntry {
@ -831,6 +847,57 @@ function cleanupStaleInstance(
unregisterInstance(cwd, registryPath);
}
/**
 * Classify the registry entry for `cwd` as absent, dead or live.
 *
 * The entry is looked up under the resolved absolute path of `cwd`; liveness
 * is decided by probing the recorded pid through `kill`.
 */
function getRegisteredServerInstance(
	cwd: string,
	registryPath?: string,
	kill: typeof process.kill = process.kill,
): ExistingServerInstance {
	const entry = readInstanceRegistry(registryPath)[resolve(cwd)];
	if (!entry) {
		return { state: "none" };
	}
	return pidExists(entry.pid, kill)
		? { state: "live", entry }
		: { state: "dead", entry };
}
/**
 * Report and drop a registry entry whose process has already exited.
 *
 * Writes a one-line notice to `stderr`, then unregisters the entry for `cwd`.
 * No process is signalled here.
 */
function cleanupDeadRegisteredInstance(
	cwd: string,
	stderr: WritableLike,
	entry: WebInstanceEntry,
	registryPath?: string,
): void {
	const notice = `[forge] Stale SF server was already stopped (pid=${entry.pid}) — clearing entry.\n`;
	stderr.write(notice);
	unregisterInstance(cwd, registryPath);
}
/**
 * Terminate the previous server during a reload and drop its registry row.
 *
 * Logs one line describing the outcome. When the previous process could not
 * be stopped, the registry row is left in place and the function returns
 * without unregistering.
 *
 * NOTE(review): terminateWebServerProcessTree is called with only the pid, so
 * it does not appear to use the injectable `kill` from WebModeDeps — confirm
 * tests that register fake pids cannot signal real processes.
 */
function stopReloadedInstance(
cwd: string,
stderr: WritableLike,
entry: WebInstanceEntry,
registryPath?: string,
): void {
const result = terminateWebServerProcessTree(entry.pid);
if (result === "killed" || result === "force-killed") {
stderr.write(
`[forge] Reloaded SF server for ${resolve(cwd)}; stopped previous pid=${entry.pid}.\n`,
);
} else if (result === "already-dead") {
stderr.write(
`[forge] Previous SF server already exited during reload (pid=${entry.pid}).\n`,
);
} else {
// NOTE(review): the message says the candidate "is running"; verify against
// the caller whether the candidate is still alive at this point.
stderr.write(
`[forge] Reload candidate is running, but previous SF server pid=${entry.pid} did not stop: ${result.error}\n`,
);
return;
}
// Remove the old registry row now, before the replacement registers itself.
// unregisterInstance deletes by cwd, so calling this after the replacement
// has registered would delete the new row instead of the old one.
unregisterInstance(cwd, registryPath);
}
/**
* Detect and reap orphaned next-server processes that outlived their parent
* web host. These orphans have cwd under dist/web/standalone (or a deleted
@ -951,10 +1018,35 @@ export async function launchWebMode(
stderr.write(`[forge] Starting server mode…\n`);
// Kill any stale server instance for this project before reserving a port.
// This prevents EADDRINUSE when the previous `sf server` was terminated
// without a clean shutdown (e.g. terminal closed, crash).
cleanupStaleInstance(options.cwd, stderr, deps.registryPath);
const existing = getRegisteredServerInstance(
options.cwd,
deps.registryPath,
deps.kill,
);
let reloadPrevious: WebInstanceEntry | null = null;
if (
existing.state === "live" &&
(options.reload === true ||
!options.port ||
options.port === existing.entry.port)
) {
reloadPrevious = existing.entry;
stderr.write(
`[forge] Existing SF server found for ${resolve(options.cwd)} (pid=${existing.entry.pid}, port=${existing.entry.port}); launching replacement before shutdown.\n`,
);
} else if (existing.state === "dead") {
cleanupDeadRegisteredInstance(
options.cwd,
stderr,
existing.entry,
deps.registryPath,
);
} else if (existing.state === "live") {
// Explicit fixed-port start cannot bind beside a live same-port process.
// Stop it before launch so legacy `sf server start --port 4000` keeps
// working, while normal starts use reload-first behavior.
cleanupStaleInstance(options.cwd, stderr, deps.registryPath);
}
// Also reap orphaned next-server processes from prior unclean shutdowns
// (sf-mooe4m5k-6fm7z9): orphaned next-server processes with cwd under
@ -969,28 +1061,11 @@ export async function launchWebMode(
);
}
const port =
const targetPort =
options.port ??
reloadPrevious?.port ??
(deps.resolvePort ? await deps.resolvePort(host) : DEFAULT_PORT);
const authToken = randomBytes(32).toString("hex");
const url = `http://${host}:${port}`;
const env = {
...(deps.env ?? process.env),
HOSTNAME: host,
PORT: String(port),
SF_WEB_HOST: host,
SF_WEB_PORT: String(port),
SF_WEB_AUTH_TOKEN: authToken,
SF_WEB_PROJECT_CWD: options.cwd,
SF_WEB_PROJECT_SESSIONS_DIR: options.projectSessionsDir,
SF_WEB_PACKAGE_ROOT: resolution.packageRoot,
SF_WEB_HOST_KIND: resolution.kind,
SF_WEB_AUTO_START_AUTONOMOUS: "1",
...(resolution.kind === "source-dev" ? { NEXT_PUBLIC_SF_DEV: "1" } : {}),
...(options.allowedOrigins?.length
? { SF_WEB_ALLOWED_ORIGINS: options.allowedOrigins.join(",") }
: {}),
};
const targetUrl = `http://${host}:${targetPort}`;
try {
stderr.write(`[forge] Initialising resources…\n`);
@ -1005,8 +1080,8 @@ export async function launchWebMode(
cwd: options.cwd,
projectSessionsDir: options.projectSessionsDir,
host,
port,
url,
port: targetPort,
url: targetUrl,
hostKind: resolution.kind,
hostPath: resolution.entryPath,
hostRoot: resolution.hostRoot,
@ -1016,89 +1091,163 @@ export async function launchWebMode(
return failure;
}
const spawnSpec = buildSpawnSpec(
resolution,
host,
port,
deps.platform ?? process.platform,
deps.execPath ?? process.execPath,
);
stderr.write(`[forge] Launching web host on port ${port}\n`);
const spawnResult = await spawnDetachedProcess(
deps.spawn ??
((command, args, spawnOptions) => spawn(command, args, spawnOptions)),
spawnSpec.command,
spawnSpec.args,
{
cwd: spawnSpec.cwd,
detached: true,
stdio: "ignore",
windowsHide: true,
shell: needsWindowsShell(
spawnSpec.command,
deps.platform ?? process.platform,
),
env,
},
);
if (!spawnResult.ok) {
const failure: WebModeLaunchFailure = {
mode: "web",
ok: false,
cwd: options.cwd,
projectSessionsDir: options.projectSessionsDir,
const spawnVerifiedHost = async (
port: number,
label: "candidate" | "web host",
autoStartAutonomous: boolean,
): Promise<
| {
ok: true;
child: SpawnedChildLike;
authToken: string;
url: string;
}
| { ok: false; failure: WebModeLaunchFailure }
> => {
const authToken = randomBytes(32).toString("hex");
const url = `http://${host}:${port}`;
const env = {
...(deps.env ?? process.env),
HOSTNAME: host,
PORT: String(port),
SF_WEB_HOST: host,
SF_WEB_PORT: String(port),
SF_WEB_AUTH_TOKEN: authToken,
SF_WEB_PROJECT_CWD: options.cwd,
SF_WEB_PROJECT_SESSIONS_DIR: options.projectSessionsDir,
SF_WEB_PACKAGE_ROOT: resolution.packageRoot,
SF_WEB_HOST_KIND: resolution.kind,
SF_WEB_AUTO_START_AUTONOMOUS: autoStartAutonomous ? "1" : "0",
...(resolution.kind === "source-dev" ? { NEXT_PUBLIC_SF_DEV: "1" } : {}),
...(options.allowedOrigins?.length
? { SF_WEB_ALLOWED_ORIGINS: options.allowedOrigins.join(",") }
: {}),
};
const spawnSpec = buildSpawnSpec(
resolution,
host,
port,
url,
hostKind: resolution.kind,
hostPath: resolution.entryPath,
hostRoot: resolution.hostRoot,
failureReason: `launch:${spawnResult.error instanceof Error ? spawnResult.error.message : String(spawnResult.error)}`,
};
emitLaunchStatus(stderr, failure);
return failure;
deps.platform ?? process.platform,
deps.execPath ?? process.execPath,
);
stderr.write(`[forge] Launching ${label} on port ${port}\n`);
const spawnResult = await spawnDetachedProcess(
deps.spawn ??
((command, args, spawnOptions) => spawn(command, args, spawnOptions)),
spawnSpec.command,
spawnSpec.args,
{
cwd: spawnSpec.cwd,
detached: true,
stdio: "ignore",
windowsHide: true,
shell: needsWindowsShell(
spawnSpec.command,
deps.platform ?? process.platform,
),
env,
},
);
if (!spawnResult.ok) {
return {
ok: false,
failure: {
mode: "web",
ok: false,
cwd: options.cwd,
projectSessionsDir: options.projectSessionsDir,
host,
port,
url,
hostKind: resolution.kind,
hostPath: resolution.entryPath,
hostRoot: resolution.hostRoot,
failureReason: `launch:${spawnResult.error instanceof Error ? spawnResult.error.message : String(spawnResult.error)}`,
},
};
}
try {
const bootReadyFn =
deps.waitForBootReady ??
((u: string) => waitForBootReady(u, 180_000, stderr, authToken));
await bootReadyFn(url);
} catch (error) {
if (spawnResult.child.pid !== undefined) {
terminateWebServerProcessTree(spawnResult.child.pid);
}
return {
ok: false,
failure: {
mode: "web",
ok: false,
cwd: options.cwd,
projectSessionsDir: options.projectSessionsDir,
host,
port,
url,
hostKind: resolution.kind,
hostPath: resolution.entryPath,
hostRoot: resolution.hostRoot,
failureReason: `boot-ready:${error instanceof Error ? error.message : String(error)}`,
},
};
}
return { ok: true, child: spawnResult.child, authToken, url };
};
if (reloadPrevious) {
const candidatePort = deps.resolvePort
? await deps.resolvePort(host)
: await reserveWebPort(host);
stderr.write(
`[forge] Proving reload candidate on temporary port ${candidatePort} before touching fixed port ${targetPort}\n`,
);
const candidate = await spawnVerifiedHost(
candidatePort,
"candidate",
false,
);
if (!candidate.ok) {
emitLaunchStatus(stderr, candidate.failure);
return candidate.failure;
}
if (candidate.child.pid !== undefined) {
terminateWebServerProcessTree(candidate.child.pid);
}
stderr.write(`[forge] Reload candidate passed boot check.\n`);
stopReloadedInstance(
options.cwd,
stderr,
reloadPrevious,
deps.registryPath,
);
}
const finalHost = await spawnVerifiedHost(targetPort, "web host", true);
if (!finalHost.ok) {
emitLaunchStatus(stderr, finalHost.failure);
return finalHost.failure;
}
try {
const bootReadyFn =
deps.waitForBootReady ??
((u: string) => waitForBootReady(u, 180_000, stderr, authToken));
await bootReadyFn(url);
} catch (error) {
const failure: WebModeLaunchFailure = {
mode: "web",
ok: false,
cwd: options.cwd,
projectSessionsDir: options.projectSessionsDir,
host,
port,
url,
hostKind: resolution.kind,
hostPath: resolution.entryPath,
hostRoot: resolution.hostRoot,
failureReason: `boot-ready:${error instanceof Error ? error.message : String(error)}`,
};
emitLaunchStatus(stderr, failure);
return failure;
}
try {
spawnResult.child.unref?.();
const pid = spawnResult.child.pid;
finalHost.child.unref?.();
const pid = finalHost.child.pid;
if (pid !== undefined) {
const pidFilePath = deps.pidFilePath ?? defaultWebPidFilePath;
(deps.writePidFile ?? writePidFile)(pidFilePath, pid);
// Register in multi-instance registry
registerInstance(
options.cwd,
{ pid, port, url, authToken },
{
pid,
port: targetPort,
url: targetUrl,
authToken: finalHost.authToken,
},
deps.registryPath,
);
}
const authenticatedUrl = `${url}/#token=${authToken}`;
const authenticatedUrl = `${targetUrl}/#token=${finalHost.authToken}`;
try {
(deps.openBrowser ?? openBrowser)(authenticatedUrl);
} catch (browserError) {
@ -1113,8 +1262,8 @@ export async function launchWebMode(
cwd: options.cwd,
projectSessionsDir: options.projectSessionsDir,
host,
port,
url,
port: targetPort,
url: targetUrl,
hostKind: resolution.kind,
hostPath: resolution.entryPath,
hostRoot: resolution.hostRoot,
@ -1124,15 +1273,15 @@ export async function launchWebMode(
return failure;
}
const authenticatedUrl = `${url}/#token=${authToken}`;
const authenticatedUrl = `${targetUrl}/#token=${finalHost.authToken}`;
const success: WebModeLaunchSuccess = {
mode: "web",
ok: true,
cwd: options.cwd,
projectSessionsDir: options.projectSessionsDir,
host,
port,
url,
port: targetPort,
url: targetUrl,
hostKind: resolution.kind,
hostPath: resolution.entryPath,
hostRoot: resolution.hostRoot,

View file

@ -1,6 +1,6 @@
import { execFile } from "node:child_process";
import { existsSync } from "node:fs";
import { join } from "node:path";
import { dirname, join } from "node:path";
import { pathToFileURL } from "node:url";
import type { SettingsData } from "../../web/lib/settings-types.ts";
import { resolveBridgeRuntimeConfig } from "./bridge-service.ts";
@ -65,6 +65,13 @@ export async function collectSettingsData(
const budgetPath = budgetResolution.modulePath;
const historyPath = historyResolution.modulePath;
const metricsPath = metricsResolution.modulePath;
const benchmarksPath = join(
dirname(routerPath),
"learning",
"data",
"model-benchmarks.json",
);
const performancePath = join(projectCwd, ".sf", "model-performance.json");
// All modules share the same compiled-vs-source mode (they're all from the same package)
const useCompiledJs = prefsResolution.useCompiledJs;
@ -102,6 +109,7 @@ export async function collectSettingsData(
// and writes a combined JSON payload to stdout.
const script = [
'const { pathToFileURL } = await import("node:url");',
'const { existsSync, readFileSync } = await import("node:fs");',
"const prefsMod = await import(pathToFileURL(process.env.SF_SETTINGS_PREFS_MODULE).href);",
"const routerMod = await import(pathToFileURL(process.env.SF_SETTINGS_ROUTER_MODULE).href);",
"const budgetMod = await import(pathToFileURL(process.env.SF_SETTINGS_BUDGET_MODULE).href);",
@ -172,8 +180,45 @@ export async function collectSettingsData(
"const ledger = metricsMod.loadLedgerFromDisk(process.env.SF_SETTINGS_BASE);",
"const projectTotals = ledger ? metricsMod.getProjectTotals(ledger.units) : null;",
// 6. Published benchmark table and local learned model outcomes
"function readJson(path) {",
" if (!path || !existsSync(path)) return null;",
" try { return JSON.parse(readFileSync(path, 'utf-8')); } catch { return null; }",
"}",
"function benchmarkRows(raw) {",
" if (!raw || typeof raw !== 'object') return [];",
" return Object.entries(raw)",
" .filter(([modelId]) => !modelId.startsWith('_'))",
" .map(([modelId, row]) => ({ modelId, ...(row && typeof row === 'object' ? row : {}) }))",
" .sort((a, b) => String(a.modelId).localeCompare(String(b.modelId)));",
"}",
"function performanceRows(raw) {",
" if (!raw || typeof raw !== 'object') return [];",
" const rows = [];",
" for (const [unitType, models] of Object.entries(raw)) {",
" if (!models || typeof models !== 'object') continue;",
" for (const [modelId, value] of Object.entries(models)) {",
" if (!value || typeof value !== 'object') continue;",
" const aggregate = value.aggregate && typeof value.aggregate === 'object' ? value.aggregate : {};",
" rows.push({",
" unitType,",
" modelId,",
" successes: Number(aggregate.successes ?? 0),",
" failures: Number(aggregate.failures ?? 0),",
" timeouts: Number(aggregate.timeouts ?? 0),",
" totalTokens: Number(aggregate.totalTokens ?? 0),",
" totalCost: Number(aggregate.totalCost ?? 0),",
" lastUsed: aggregate.lastUsed ?? null,",
" });",
" }",
" }",
" return rows.sort((a, b) => String(b.lastUsed ?? '').localeCompare(String(a.lastUsed ?? '')));",
"}",
"const modelBenchmarks = benchmarkRows(readJson(process.env.SF_SETTINGS_BENCHMARKS_PATH));",
"const modelPerformance = performanceRows(readJson(process.env.SF_SETTINGS_MODEL_PERFORMANCE_PATH));",
// Write combined payload
"process.stdout.write(JSON.stringify({ preferences, routingConfig, budgetAllocation, routingHistory, projectTotals }));",
"process.stdout.write(JSON.stringify({ preferences, routingConfig, budgetAllocation, routingHistory, projectTotals, modelBenchmarks, modelPerformance }));",
].join(" ");
const prefixArgs = buildSubprocessPrefixArgs(
@ -196,6 +241,8 @@ export async function collectSettingsData(
SF_SETTINGS_HISTORY_MODULE: historyPath,
SF_SETTINGS_METRICS_MODULE: metricsPath,
SF_SETTINGS_BASE: projectCwd,
SF_SETTINGS_BENCHMARKS_PATH: benchmarksPath,
SF_SETTINGS_MODEL_PERFORMANCE_PATH: performancePath,
},
maxBuffer: SETTINGS_MAX_BUFFER,
windowsHide: true,

View file

@ -22,6 +22,8 @@ import { Button } from "@/components/ui/button";
import { authFetch } from "@/lib/auth";
import type {
SettingsData,
SettingsModelBenchmark,
SettingsModelPerformance,
SettingsPatternHistory,
SettingsRoutingHistory,
} from "@/lib/settings-types";
@ -438,10 +440,68 @@ function TierOutcomeBadge({
);
}
/**
 * Reduce a model id to its bare name by dropping everything up to and
 * including the last "/" (e.g. a provider or namespace prefix).
 *
 * @param id - Model identifier, with or without slash-separated prefixes.
 * @returns The text after the final "/", or `id` unchanged when it has none.
 */
function normalizeModelId(id: string): string {
  const lastSlash = id.lastIndexOf("/");
  if (lastSlash === -1) {
    return id;
  }
  // A trailing slash yields an empty string.
  return id.slice(lastSlash + 1);
}
/**
 * Render a benchmark score for a table cell.
 *
 * @param value - Raw score; may be missing.
 * @returns The score with exactly one decimal place, or an empty string when
 *   the value is absent or not a finite number (NaN / ±Infinity).
 */
function formatBenchmarkScore(value: number | null | undefined): string {
  if (typeof value !== "number" || !Number.isFinite(value)) {
    return "";
  }
  return value.toFixed(1);
}
/**
 * Sum the local outcome rows that belong to one model and format them for
 * display.
 *
 * A row belongs to the model when its id, reduced by `normalizeModelId`,
 * equals the reduced form of `modelId`. This also covers exact id matches and
 * ids that end in `/<bare name>`, since both reduce to the same bare name.
 *
 * @param rows - Per-unit-type outcome rows to scan.
 * @param modelId - Model to aggregate for, with or without a prefix.
 * @returns `runs` (successes + failures + timeouts), plus `successRate` as a
 *   rounded percentage string and `cost` via `formatCost`; both strings are
 *   empty when there are no runs.
 */
function aggregateModelPerformance(
  rows: SettingsModelPerformance[],
  modelId: string,
): { runs: number; successRate: string; cost: string } {
  const bareTarget = normalizeModelId(modelId);

  let successes = 0;
  let failures = 0;
  let timeouts = 0;
  let cost = 0;
  for (const row of rows) {
    if (normalizeModelId(row.modelId) !== bareTarget) {
      continue;
    }
    successes += row.successes;
    failures += row.failures;
    timeouts += row.timeouts;
    cost += row.totalCost;
  }

  const runs = successes + failures + timeouts;
  // Written as `runs > 0` so a NaN total takes the "no runs" path.
  const hasRuns = runs > 0;

  let successRate = "";
  let formattedCost = "";
  if (hasRuns) {
    successRate = `${Math.round((successes / runs) * 100)}%`;
    formattedCost = formatCost(cost);
  }
  return { runs, successRate, cost: formattedCost };
}
/**
 * Order benchmark rows by a weighted composite score, best first, and keep
 * the top 12.
 *
 * Weights: SWE-bench 0.35 (the verified figure when present, otherwise the
 * plain one), LiveCodeBench 0.25, HLE 0.15, GPQA 0.15, instruction following
 * 0.1. Missing or null scores count as 0.
 *
 * @param benchmarks - Rows to rank; the input array is not mutated.
 * @returns A new array holding at most 12 rows in descending score order.
 */
function rankedBenchmarks(
  benchmarks: SettingsModelBenchmark[],
): SettingsModelBenchmark[] {
  const compositeScore = (row: SettingsModelBenchmark): number => {
    const swe = row.swe_bench_verified ?? row.swe_bench ?? 0;
    const liveCode = row.live_code_bench ?? 0;
    const hle = row.hle ?? 0;
    const gpqa = row.gpqa ?? 0;
    const instruction = row.instruction_following ?? 0;
    return (
      swe * 0.35 +
      liveCode * 0.25 +
      hle * 0.15 +
      gpqa * 0.15 +
      instruction * 0.1
    );
  };

  // Sort a copy so the caller's array keeps its order.
  const ordered = benchmarks.slice();
  ordered.sort((left, right) => compositeScore(right) - compositeScore(left));
  return ordered.slice(0, 12);
}
export function ModelRoutingPanel() {
const { state, data, busy, refresh } = useSettingsData();
const routingConfig = data?.routingConfig ?? null;
const routingHistory = data?.routingHistory ?? null;
const modelBenchmarks = rankedBenchmarks(data?.modelBenchmarks ?? []);
const modelPerformance = data?.modelPerformance ?? [];
return (
<div className="space-y-4" data-testid="settings-model-routing">
@ -569,6 +629,73 @@ export function ModelRoutingPanel() {
) : (
<SettingsEmpty message="No routing history yet" />
)}
{/* Model benchmarks */}
{modelBenchmarks.length > 0 ? (
<div className="space-y-2">
<h4 className="text-[11px] font-medium text-muted-foreground">
Model Benchmarks
</h4>
<div className="overflow-x-auto rounded-lg border border-border/50 bg-card/50">
<table className="w-full text-left text-xs">
<thead className="border-b border-border/50 text-[10px] uppercase text-muted-foreground">
<tr>
<th className="px-3 py-2 font-medium">Model</th>
<th className="px-2 py-2 font-medium">SWE</th>
<th className="px-2 py-2 font-medium">LCB</th>
<th className="px-2 py-2 font-medium">HLE</th>
<th className="px-2 py-2 font-medium">GPQA</th>
<th className="px-2 py-2 font-medium">Local</th>
<th className="px-3 py-2 font-medium">Cost</th>
</tr>
</thead>
<tbody>
{modelBenchmarks.map((row) => {
const local = aggregateModelPerformance(
modelPerformance,
row.modelId,
);
return (
<tr
key={row.modelId}
className="border-b border-border/30 last:border-0"
title={row.source ?? undefined}
>
<td className="max-w-[180px] truncate px-3 py-2 font-mono text-[11px] text-foreground/85">
{row.modelId}
</td>
<td className="px-2 py-2 tabular-nums">
{formatBenchmarkScore(
row.swe_bench_verified ?? row.swe_bench,
)}
</td>
<td className="px-2 py-2 tabular-nums">
{formatBenchmarkScore(row.live_code_bench)}
</td>
<td className="px-2 py-2 tabular-nums">
{formatBenchmarkScore(row.hle)}
</td>
<td className="px-2 py-2 tabular-nums">
{formatBenchmarkScore(row.gpqa)}
</td>
<td className="px-2 py-2 tabular-nums">
{local.runs > 0
? `${local.successRate} / ${local.runs}`
: ""}
</td>
<td className="px-3 py-2 tabular-nums">
{local.cost}
</td>
</tr>
);
})}
</tbody>
</table>
</div>
</div>
) : (
<SettingsEmpty message="No benchmark data available" />
)}
</>
)}
</div>
@ -775,7 +902,7 @@ export function RemoteQuestionsPanel() {
const { data, busy, refresh } = useSettingsData();
const existingConfig = data?.preferences?.remoteQuestions ?? null;
const [_envVarSet, setEnvVarSet] = useState(false);
const [, setEnvVarSet] = useState(false);
const [envVarName, setEnvVarName] = useState<string | null>(null);
const [apiLoading, setApiLoading] = useState(true);
const [tokenSet, setTokenSet] = useState(false);

View file

@ -83,6 +83,35 @@ export interface SettingsProjectTotals {
userMessages: number;
}
// ─── Model Benchmark And Local Outcome Data ─────────────────────────────────
/**
 * One model's published benchmark figures, as carried in
 * `SettingsData.modelBenchmarks`.
 *
 * Only `modelId` is required. Every other field may be omitted or `null`, so
 * consumers must treat each figure as possibly missing.
 * NOTE(review): score scale/units (presumably percentages) are not established
 * here — confirm against the benchmark data source.
 */
export interface SettingsModelBenchmark {
/** Identifier of the model this row describes. */
modelId: string;
/** SWE-bench score. */
swe_bench?: number | null;
/** SWE-bench Verified score. */
swe_bench_verified?: number | null;
/** LiveCodeBench score. */
live_code_bench?: number | null;
/** HumanEval score. */
human_eval?: number | null;
/** HLE score (presumably "Humanity's Last Exam" — confirm). */
hle?: number | null;
/** AIME 2026 score. */
aime_2026?: number | null;
/** GPQA score. */
gpqa?: number | null;
/** MMLU-Pro score. */
mmlu_pro?: number | null;
/** Instruction-following score. */
instruction_following?: number | null;
/** Context window size (presumably in tokens — confirm). */
context_window?: number | null;
/** Maximum output length in tokens. */
max_output_tokens?: number | null;
/** Where the figures came from (free-form text; exact format not established here). */
source?: string | null;
}
/**
 * Locally recorded outcome totals for one model on one unit type, as carried
 * in `SettingsData.modelPerformance`.
 *
 * The same `modelId` can appear in several rows, one per `unitType`.
 */
export interface SettingsModelPerformance {
/** Kind of work unit these totals were recorded for. */
unitType: string;
/** Identifier of the model that ran the units. */
modelId: string;
/** Number of runs that succeeded. */
successes: number;
/** Number of runs that failed. */
failures: number;
/** Number of runs that timed out. */
timeouts: number;
/** Total tokens consumed across the recorded runs. */
totalTokens: number;
/** Total cost across the recorded runs (currency/unit not established here — confirm). */
totalCost: number;
/** When the model was last used for this unit type, or `null` if unknown (presumably a timestamp string — confirm format). */
lastUsed: string | null;
}
// ─── Effective Preferences ────────────────────────────────────────────────────
export interface SettingsPreferencesData {
@ -124,4 +153,6 @@ export interface SettingsData {
budgetAllocation: SettingsBudgetAllocation;
routingHistory: SettingsRoutingHistory | null;
projectTotals: SettingsProjectTotals | null;
modelBenchmarks: SettingsModelBenchmark[];
modelPerformance: SettingsModelPerformance[];
}