diff --git a/.sf/preferences.yaml b/.sf/preferences.yaml index 35dc3b50b..502f15f61 100644 --- a/.sf/preferences.yaml +++ b/.sf/preferences.yaml @@ -1,8 +1,6 @@ ---- version: 1 experimental: smoke_gate: false ---- # SF Preferences -See `~/.sf/agent/extensions/sf/docs/preferences-reference.md` for full documentation. +# See `~/.sf/agent/extensions/sf/docs/preferences-reference.md` for full documentation. diff --git a/AGENTS.md b/AGENTS.md index 09b4ee355..789f624fe 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -98,6 +98,27 @@ npm run release:changelog npm run release:bump ``` +## Running SF Locally + +The server surface is the default local dogfooding surface for web/RPC/autonomous +control. The TUI still exists, but do not use it as the default way to run or +verify autonomous mode. + +```bash +# Source/dev server +npm run sf:server -- --port 4000 --host 127.0.0.1 + +# Built server after npm run build:core or npm run build +npm run sf:server:dist -- --port 4000 --host 127.0.0.1 +``` + +Bind only trusted interfaces. For this workstation, localhost plus Tailscale is +acceptable; public `0.0.0.0` is not the default. If a server is already running, +use `sf headless ...` as the machine/control surface instead of starting a +second writer. Server-forwarded feedback writes are queued and drained by the +server before autonomous dispatch, so CLI control does not block behind a busy +unit. + ## Coding Style & Naming Conventions - **Language**: TypeScript with `"strict": true` enabled in all packages diff --git a/CLAUDE.md b/CLAUDE.md index 5e94a3daa..f8dfc197c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -98,3 +98,34 @@ When adding a new `{{variable}}` to a prompt template in `prompts/`, you must: `loadPrompt` throws at runtime if any `{{var}}` in the template has no corresponding key in the vars object — this is intentional to catch template/code drift early. + +## Running the SF server in this repo + +Use the server surface for dogfooding and browser/RPC control. Do not start the +TUI as the default way to exercise autonomous mode. + +```bash +# source/dev server, with resource redirect and restart support +npm run sf:server -- --port 4000 --host 127.0.0.1 + +# built server, after npm run build:core or npm run build +npm run sf:server:dist -- --port 4000 --host 127.0.0.1 +``` + +If the server is already running, prefer `sf headless ...` control commands +rather than starting a second writer. Feedback add/resolve commands are +forwarded to the active server and queued there so CLI control does not hang +behind an autonomous unit. + +For remote local-network access, bind an additional trusted interface such as a +Tailscale address. Do not bind `0.0.0.0` for the dev server unless an explicit +fronting proxy/firewall decision is in place. + +Before assuming a source edit is live, rebuild the relevant output: + +```bash +npm run build:core +``` + +Then restart the server. Stale `dist/` or stale `~/.sf/agent/extensions/sf/` +copies can make fixed source look broken. diff --git a/packages/coding-agent/src/modes/rpc/rpc-client.ts b/packages/coding-agent/src/modes/rpc/rpc-client.ts index a3b5a4655..d91a771a8 100644 --- a/packages/coding-agent/src/modes/rpc/rpc-client.ts +++ b/packages/coding-agent/src/modes/rpc/rpc-client.ts @@ -409,12 +409,19 @@ export class RpcClient { subcommand: "add" | "resolve", args: string[], json = false, - ): Promise<{ exitCode: number; stdout: string; stderr: string }> { + options: { queued?: boolean } = {}, + ): Promise<{ + exitCode: number | null; + stdout: string; + stderr: string; + queued?: boolean; + }> { const response = await this.send({ type: "sf_feedback", subcommand, args, json, + queued: options.queued, }); return this.getData(response); } diff --git a/packages/coding-agent/src/modes/rpc/rpc-mode.ts b/packages/coding-agent/src/modes/rpc/rpc-mode.ts index df1cbecfe..0eb081d72 100644 --- a/packages/coding-agent/src/modes/rpc/rpc-mode.ts +++ b/packages/coding-agent/src/modes/rpc/rpc-mode.ts @@ -12,7 +12,16 @@ */ import * as crypto from "node:crypto"; -import { existsSync, readdirSync, readFileSync, statSync } from "node:fs"; +import { + appendFileSync, + existsSync, + mkdirSync, + readdirSync, + readFileSync, + renameSync, + statSync, + unlinkSync, +} from "node:fs"; import type { WriteStream } from "node:tty"; import { pathToFileURL } from "node:url"; import { dirname, join, resolve } from "node:path"; @@ -42,6 +51,142 @@ const RUNTIME_HEARTBEAT_INTERVAL_MS = Number( process.env.SF_RUNTIME_HEARTBEAT_INTERVAL_MS ?? 10_000, ); +const SF_FEEDBACK_QUEUE_FILE = "sf-feedback-queue.jsonl"; +const SF_FEEDBACK_FAILED_QUEUE_FILE = "sf-feedback-queue-failed.jsonl"; + +function queueSfFeedbackCommand( + cwd: string, + command: Extract, +): string { + const dir = join(cwd, ".sf", "runtime"); + mkdirSync(dir, { recursive: true }); + const path = join(dir, SF_FEEDBACK_QUEUE_FILE); + appendFileSync( + path, + `${JSON.stringify({ + schemaVersion: 1, + queuedAt: new Date().toISOString(), + id: command.id, + subcommand: command.subcommand, + args: command.args, + json: command.json === true, + source: "rpc", + })}\n`, + "utf-8", + ); + return path; +} + +type QueuedSfFeedbackCommand = { + schemaVersion: 1; + queuedAt: string; + id?: string; + subcommand: "add" | "list" | "resolve"; + args: string[]; + json: boolean; + source: "rpc"; +}; + +function parseQueuedSfFeedbackLine( + line: string, +): QueuedSfFeedbackCommand | null { + try { + const row = JSON.parse(line) as Partial; + if ( + row.schemaVersion !== 1 || + (row.subcommand !== "add" && + row.subcommand !== "list" && + row.subcommand !== "resolve") || + !Array.isArray(row.args) + ) { + return null; + } + return { + schemaVersion: 1, + queuedAt: + typeof row.queuedAt === "string" + ? row.queuedAt + : new Date().toISOString(), + id: typeof row.id === "string" ? row.id : undefined, + subcommand: row.subcommand, + args: row.args.map((arg) => String(arg)), + json: row.json === true, + source: "rpc", + }; + } catch { + return null; + } +} + +/** + * Apply queued sf_feedback commands before a daemon-owned autonomous run starts. + * + * Purpose: keep CLI/RPC control commands non-blocking while preserving a single + * server-owned writer for self-feedback mutations. + * + * Consumer: start_autonomous RPC command in the SF server session. + */ +async function drainQueuedSfFeedbackCommands(cwd: string): Promise { + const runtimeDir = join(cwd, ".sf", "runtime"); + const queuePath = join(runtimeDir, SF_FEEDBACK_QUEUE_FILE); + if (!existsSync(queuePath)) return; + + const drainingPath = join( + runtimeDir, + `${SF_FEEDBACK_QUEUE_FILE}.${process.pid}.draining`, + ); + try { + renameSync(queuePath, drainingPath); + } catch { + return; + } + + const lines = readFileSync(drainingPath, "utf-8") + .split("\n") + .map((line) => line.trim()) + .filter(Boolean); + const queued = lines + .map(parseQueuedSfFeedbackLine) + .filter((row): row is QueuedSfFeedbackCommand => row !== null); + if (queued.length === 0) { + unlinkSync(drainingPath); + return; + } + + const { handleFeedback } = await loadHeadlessFeedbackHandler(); + const failed: QueuedSfFeedbackCommand[] = []; + for (const command of queued) { + try { + const captured = await captureProcessWrites(() => + handleFeedback(cwd, { + subcommand: command.subcommand, + args: command.args, + json: command.json, + }), + ); + if (captured.result.exitCode !== 0) failed.push(command); + } catch { + failed.push(command); + } + } + + if (failed.length > 0) { + appendFileSync( + join(runtimeDir, SF_FEEDBACK_FAILED_QUEUE_FILE), + failed.map((row) => JSON.stringify(row)).join("\n") + "\n", + "utf-8", + ); + } + unlinkSync(drainingPath); +} + +function scheduleQueuedSfFeedbackDrain(cwd: string): void { + const timer = setTimeout(() => { + void drainQueuedSfFeedbackCommands(cwd); + }, 0); + timer.unref?.(); +} + async function captureProcessWrites( run: () => Promise, ): Promise<{ result: T; stdout: string; stderr: string }> { @@ -853,6 +998,7 @@ export async function runRpcMode(session: AgentSession): Promise { const previousHeadless = process.env.SF_HEADLESS; process.env.SF_HEADLESS = "1"; try { + await drainQueuedSfFeedbackCommands(process.cwd()); await session.prompt("/autonomous", { source: "rpc", }); @@ -882,6 +1028,16 @@ export async function runRpcMode(session: AgentSession): Promise { } case "sf_feedback": { + if (command.queued === true) { + const queuePath = queueSfFeedbackCommand(process.cwd(), command); + scheduleQueuedSfFeedbackDrain(process.cwd()); + return success(id, "sf_feedback", { + exitCode: null, + stdout: JSON.stringify({ ok: true, queued: true, queuePath }), + stderr: "", + queued: true, + }); + } const { handleFeedback } = await loadHeadlessFeedbackHandler(); const captured = await captureProcessWrites(() => handleFeedback(process.cwd(), { diff --git a/packages/coding-agent/src/modes/rpc/rpc-types.ts b/packages/coding-agent/src/modes/rpc/rpc-types.ts index 8fe2d3e80..8147deb91 100644 --- a/packages/coding-agent/src/modes/rpc/rpc-types.ts +++ b/packages/coding-agent/src/modes/rpc/rpc-types.ts @@ -47,6 +47,7 @@ export type RpcCommand = subcommand: "add" | "resolve"; args: string[]; json?: boolean; + queued?: boolean; } // State @@ -185,7 +186,12 @@ export type RpcResponse = type: "response"; command: "sf_feedback"; success: true; - data: { exitCode: number; stdout: string; stderr: string }; + data: { + exitCode: number | null; + stdout: string; + stderr: string; + queued?: boolean; + }; } | { id?: string; diff --git a/packages/rpc-client/src/rpc-client.ts b/packages/rpc-client/src/rpc-client.ts index 0e510af94..5ea716f97 100644 --- a/packages/rpc-client/src/rpc-client.ts +++ b/packages/rpc-client/src/rpc-client.ts @@ -482,12 +482,19 @@ export class RpcClient { subcommand: "add" | "resolve", args: string[], json = false, - ): Promise<{ exitCode: number; stdout: string; stderr: string }> { + options: { queued?: boolean } = {}, + ): Promise<{ + exitCode: number | null; + stdout: string; + stderr: string; + queued?: boolean; + }> { const response = await this.send({ type: "sf_feedback", subcommand, args, json, + queued: options.queued, }); return this.getData(response); } diff --git a/packages/rpc-client/src/rpc-types.ts b/packages/rpc-client/src/rpc-types.ts index d4832db2f..0c62e85fb 100644 --- a/packages/rpc-client/src/rpc-types.ts +++ b/packages/rpc-client/src/rpc-types.ts @@ -113,6 +113,7 @@ export type RpcCommand = subcommand: "add" | "resolve"; args: string[]; json?: boolean; + queued?: boolean; } // State @@ -251,7 +252,12 @@ export type RpcResponse = type: "response"; command: "sf_feedback"; success: true; - data: { exitCode: number; stdout: string; stderr: string }; + data: { + exitCode: number | null; + stdout: string; + stderr: string; + queued?: boolean; + }; } | { id?: string; diff --git a/src/cli-web-branch.ts b/src/cli-web-branch.ts index c9826294e..aa5dadc0b 100644 --- a/src/cli-web-branch.ts +++ b/src/cli-web-branch.ts @@ -227,7 +227,7 @@ export type RunWebCliBranchResult = | { handled: true; exitCode: number; - action: "start"; + action: "start" | "reload"; status: WebModeLaunchStatus; launchInputs: { cwd: string; @@ -270,8 +270,8 @@ export async function runWebCliBranch( }; } - // `sf server [start] [path]` starts the full operator server for one repo. - // Matches: `sf server`, `sf server start`, `sf server start `, `sf server ` + // `sf server [start|reload] [path]` starts the full operator server for one repo. + // Matches: `sf server`, `sf server start`, `sf server reload`, `sf server ` const isWebSubcommand = flags.messages[0] === "server" && flags.messages[1] !== "stop"; if (!isWebSubcommand) { @@ -286,7 +286,7 @@ export async function runWebCliBranch( // sf server → messages[1] (when not "start") let webPath = flags.webPath; if (!webPath && isWebSubcommand) { - if (flags.messages[1] === "start") { + if (flags.messages[1] === "start" || flags.messages[1] === "reload") { webPath = flags.messages[2]; } else if (flags.messages[1]) { webPath = flags.messages[1]; @@ -346,6 +346,7 @@ export async function runWebCliBranch( agentDir, host: flags.webHost, port: flags.webPort, + ...(flags.messages[1] === "reload" ? { reload: true } : {}), allowedOrigins: flags.webAllowedOrigins, }); @@ -356,7 +357,7 @@ export async function runWebCliBranch( return { handled: true, exitCode: status.ok ? 0 : 1, - action: "start", + action: flags.messages[1] === "reload" ? "reload" : "start", status, launchInputs: { cwd: currentCwd, diff --git a/src/headless-server-forward.ts b/src/headless-server-forward.ts index f1ad28499..d6d75a292 100644 --- a/src/headless-server-forward.ts +++ b/src/headless-server-forward.ts @@ -12,9 +12,10 @@ import { resolve } from "node:path"; import { readInstanceRegistry, type WebInstanceEntry } from "./web-mode.js"; export interface ForwardedHeadlessResult { - exitCode: number; + exitCode: number | null; stdout: string; stderr: string; + queued?: boolean; } type SfFeedbackResponse = @@ -109,6 +110,7 @@ export async function forwardFeedbackToActiveServer( subcommand: options.subcommand, args: options.args, json: options.json, + queued: true, }, ); if (response.statusCode === 404) return null; diff --git a/src/headless-triage.ts b/src/headless-triage.ts index d97873ea1..0b2f42c86 100644 --- a/src/headless-triage.ts +++ b/src/headless-triage.ts @@ -67,6 +67,7 @@ export interface HandleTriageOptions { max?: number; run?: boolean; apply?: boolean; + urgentOnly?: boolean; model?: string; agentRunner?: AgentRunner; } @@ -1166,6 +1167,13 @@ export async function handleTriage( return { exitCode: 1 }; } + if (options.urgentOnly) { + candidates = candidates.filter( + (candidate) => + candidate.severity === "high" || candidate.severity === "critical", + ); + } + if (typeof options.max === "number" && options.max > 0) { candidates = candidates.slice(0, options.max); } diff --git a/src/headless.ts b/src/headless.ts index 12850f56e..53530d832 100644 --- a/src/headless.ts +++ b/src/headless.ts @@ -105,6 +105,13 @@ import { const HEADLESS_HEARTBEAT_INTERVAL_MS = 60_000; +type SelfFeedbackSeverity = "low" | "medium" | "high" | "critical" | string; + +interface SelfFeedbackRowForTriage { + resolvedAt?: string | null; + severity?: SelfFeedbackSeverity; +} + interface HeadlessTimeoutSolverEvalRecord { runId: string; reportPath: string; @@ -577,6 +584,31 @@ export async function runHeadless(options: HeadlessOptions): Promise { } } +/** + * Count unresolved high/critical self-feedback rows for autonomous pre-triage. + * + * Purpose: let urgent operator/detector findings bypass the normal triage + * cadence without making the TypeScript headless surface depend on JS + * extension declarations. + * + * Consumer: runHeadlessOnce before autonomous dispatch. + */ +async function countUrgentSelfFeedbackRows(basePath: string): Promise { + try { + const modulePath = "./resources/extensions/sf/self-feedback.js"; + const mod = (await import(modulePath)) as { + readAllSelfFeedback?: (basePath: string) => SelfFeedbackRowForTriage[]; + }; + return (mod.readAllSelfFeedback?.(basePath) ?? []).filter( + (entry) => + !entry.resolvedAt && + (entry.severity === "high" || entry.severity === "critical"), + ).length; + } catch { + return 0; + } +} + async function runHeadlessOnce( options: HeadlessOptions, restartCount: number, @@ -660,12 +692,19 @@ async function runHeadlessOnce( "last-triage-at", ); let shouldRunTriage = true; + const urgentTriageCount = await countUrgentSelfFeedbackRows( + process.cwd(), + ); try { if (existsSync(triageMarkerPath)) { const last = Date.parse( readFileSync(triageMarkerPath, "utf8").trim(), ); - if (Number.isFinite(last) && Date.now() - last < triageIntervalMs) { + if ( + urgentTriageCount === 0 && + Number.isFinite(last) && + Date.now() - last < triageIntervalMs + ) { shouldRunTriage = false; if (!options.json) { process.stderr.write( @@ -687,13 +726,16 @@ async function runHeadlessOnce( const { handleTriage } = await import("./headless-triage.js"); if (!options.json) { process.stderr.write( - `[headless] autonomous: draining self-feedback triage queue first (max=${triageMaxBatch})...\n`, + urgentTriageCount > 0 + ? `[headless] autonomous: draining ${urgentTriageCount} high/critical self-feedback entr${urgentTriageCount === 1 ? "y" : "ies"} before dispatch (max=${triageMaxBatch})...\n` + : `[headless] autonomous: draining self-feedback triage queue first (max=${triageMaxBatch})...\n`, ); } await handleTriage(process.cwd(), { apply: true, json: !!options.json, max: triageMaxBatch, + urgentOnly: urgentTriageCount > 0, }); try { const runtimeDir = join(process.cwd(), ".sf", "runtime"); @@ -971,7 +1013,7 @@ async function runHeadlessOnce( if (forwarded.stdout) process.stdout.write(forwarded.stdout); if (forwarded.stderr) process.stderr.write(forwarded.stderr); return { - exitCode: forwarded.exitCode, + exitCode: forwarded.exitCode ?? EXIT_SUCCESS, interrupted: false, timedOut: false, }; diff --git a/src/resources/extensions/sf/auto-timers.js b/src/resources/extensions/sf/auto-timers.js index b186a6bb1..d971831a7 100644 --- a/src/resources/extensions/sf/auto-timers.js +++ b/src/resources/extensions/sf/auto-timers.js @@ -6,6 +6,7 @@ * via startUnitSupervision() and torn down by the caller via clearUnitTimeout(). */ import { saveActivityLog } from "./activity-log.js"; +import { resolveAgentEnd } from "./auto/resolve.js"; import { resolveAgentEndCancelled } from "./auto/resolve.js"; import { detectWorkingTreeActivity } from "./auto-supervisor.js"; import { blockModel } from "./blocked-models.js"; @@ -40,6 +41,124 @@ import { writeUnitRuntimeRecord, } from "./uok/unit-runtime.js"; import { logError, logWarning } from "./workflow-logger.js"; + +/** + * Clear active supervision handles for the current unit attempt. + * + * Purpose: stop one runaway-guard terminal decision from being emitted repeatedly + * while the autonomous loop is being unblocked. + * + * Consumer: finalizeRunawayGuardFailure() when zero-progress or silent-worker + * detection has already converted the current unit attempt into a failed record. + */ +function clearSupervisionHandles(s) { + if (s.unitTimeoutHandle) { + clearTimeout(s.unitTimeoutHandle); + s.unitTimeoutHandle = null; + } + if (s.wrapupWarningHandle) { + clearTimeout(s.wrapupWarningHandle); + s.wrapupWarningHandle = null; + } + if (s.idleWatchdogHandle) { + clearInterval(s.idleWatchdogHandle); + s.idleWatchdogHandle = null; + } + if (s.continueHereHandle) { + clearInterval(s.continueHereHandle); + s.continueHereHandle = null; + } +} + +/** + * Finish a runaway-guard failure as one terminal unit-attempt event. + * + * Purpose: convert zero-progress and silent-worker supervision failures into a + * retryable failed runtime record, close the worker lineage, stop supervision + * timers, and unblock the unit promise so the autonomous loop can select the + * next eligible model instead of repeating the same warning. + * + * Consumer: startUnitSupervision() idle watchdog fail branch. + */ +export async function finalizeRunawayGuardFailure(sctx, decision, helpers = {}) { + const { s, ctx, unitType, unitId, buildSnapshotOpts } = sctx; + const currentUnit = s.currentUnit; + if (!currentUnit) return; + const closeout = helpers.closeoutUnit ?? closeoutUnit; + const writeRuntime = helpers.writeUnitRuntimeRecord ?? writeUnitRuntimeRecord; + const block = helpers.blockModel ?? blockModel; + const recordFeedback = helpers.recordSelfFeedback ?? recordSelfFeedback; + const notify = helpers.notify ?? ((message, level) => ctx.ui.notify(message, level)); + const resolveUnit = + helpers.resolveAgentEnd ?? + ((event) => { + resolveAgentEnd(event); + }); + const failedModel = s.currentUnitModel; + if ( + decision.reason === "zero-progress" && + failedModel?.provider && + failedModel?.id + ) { + block( + s.basePath, + failedModel.provider, + failedModel.id, + `zero-progress on ${unitType} ${unitId}`, + { expiresAt: Date.now() + 60 * 60 * 1000 }, + ); + notify( + `Temporarily blocked ${failedModel.provider}/${failedModel.id} after zero-progress on ${unitType} ${unitId}; retry will choose a fallback.`, + "warning", + ); + } + await closeout( + ctx, + s.basePath, + currentUnit.type, + currentUnit.id, + currentUnit.startedAt, + buildSnapshotOpts(), + ); + writeRuntime(s.basePath, unitType, unitId, currentUnit.startedAt, { + phase: "failed-silent-worker", + status: "failed", + lastProgressAt: Date.now(), + lastProgressKind: "runaway-guard-fail", + runawayGuardFail: decision.metadata, + lineageEvent: { + status: "failed", + workerSessionId: ctx.sessionManager?.getSessionId?.(), + note: `${decision.reason ?? "runaway-guard"} failed current attempt`, + }, + }); + const unitParts = unitId.split("/"); + recordFeedback( + { + kind: "runaway-loop:silent-worker-failure", + severity: "high", + summary: decision.reason, + evidence: JSON.stringify(decision.metadata, null, 2), + suggestedFix: + "LLM session never produced an assistant message — check session-manager.ts:1086-1096 (silent _persist skip) and verify the model/provider is responding. The dispatcher will attempt retry within maxRetries; if persistent, transitions to blocked.", + occurredIn: { + unitType, + milestone: unitParts[0], + slice: unitParts[1], + task: unitParts.slice(2).join("/") || undefined, + }, + source: "detector", + }, + s.basePath, + ); + clearSupervisionHandles(s); + notify(decision.reason, "error"); + resolveUnit({ + messages: [], + _synthetic: "runaway-guard-fail", + reason: decision.reason, + }); +} /** * Set up all four supervision timers for the current unit: * 1. Soft timeout warning (wrapup) @@ -271,65 +390,7 @@ export function startUnitSupervision(sctx) { } if (decision.action === "fail") { if (getInFlightToolCount() > 0) return; - const failedModel = s.currentUnitModel; - if ( - decision.reason === "zero-progress" && - failedModel?.provider && - failedModel?.id - ) { - blockModel( - s.basePath, - failedModel.provider, - failedModel.id, - `zero-progress on ${unitType} ${unitId}`, - { expiresAt: Date.now() + 60 * 60 * 1000 }, - ); - ctx.ui.notify( - `Temporarily blocked ${failedModel.provider}/${failedModel.id} after zero-progress on ${unitType} ${unitId}; retry will choose a fallback.`, - "warning", - ); - } - await closeoutUnit( - ctx, - s.basePath, - s.currentUnit.type, - s.currentUnit.id, - s.currentUnit.startedAt, - buildSnapshotOpts(), - ); - writeUnitRuntimeRecord( - s.basePath, - unitType, - unitId, - s.currentUnit.startedAt, - { - phase: "failed-silent-worker", - status: "failed", - lastProgressAt: Date.now(), - lastProgressKind: "runaway-guard-fail", - runawayGuardFail: decision.metadata, - }, - ); - const unitParts = unitId.split("/"); - recordSelfFeedback( - { - kind: "runaway-loop:silent-worker-failure", - severity: "high", - summary: decision.reason, - evidence: JSON.stringify(decision.metadata, null, 2), - suggestedFix: - "LLM session never produced an assistant message — check session-manager.ts:1086-1096 (silent _persist skip) and verify the model/provider is responding. The dispatcher will attempt retry within maxRetries; if persistent, transitions to blocked.", - occurredIn: { - unitType, - milestone: unitParts[0], - slice: unitParts[1], - task: unitParts.slice(2).join("/") || undefined, - }, - source: "detector", - }, - s.basePath, - ); - ctx.ui.notify(decision.reason, "error"); + await finalizeRunawayGuardFailure(sctx, decision); return; } if (decision.action === "pause") { diff --git a/src/resources/extensions/sf/detectors/index.js b/src/resources/extensions/sf/detectors/index.js index 669bcfb33..b67851686 100644 --- a/src/resources/extensions/sf/detectors/index.js +++ b/src/resources/extensions/sf/detectors/index.js @@ -11,6 +11,7 @@ export { periodicDetectorSweepGate } from "./periodic-runner.js"; export { productionPlateauGate } from "./production-plateau.js"; export { repeatedFeedbackKindGate } from "./repeated-feedback-kind.js"; export { sameUnitLoopGate } from "./same-unit-loop.js"; +export { serverDirectionDriftGate } from "./server-direction-drift.js"; export { staleLockGate } from "./stale-lock.js"; export { statusCompletionDriftGate } from "./status-completion-drift.js"; export { zeroProgressGate } from "./zero-progress.js"; diff --git a/src/resources/extensions/sf/detectors/periodic-runner.js b/src/resources/extensions/sf/detectors/periodic-runner.js index 623198609..12894f44d 100644 --- a/src/resources/extensions/sf/detectors/periodic-runner.js +++ b/src/resources/extensions/sf/detectors/periodic-runner.js @@ -11,6 +11,7 @@ import { detectCrashLoop } from "./crash-loop-classifier.js"; import { detectProductionPlateau } from "./production-plateau.js"; import { detectRepeatedFeedbackKind } from "./repeated-feedback-kind.js"; import { detectSameUnitLoop } from "./same-unit-loop.js"; +import { detectServerDirectionDrift } from "./server-direction-drift.js"; import { detectStaleLock } from "./stale-lock.js"; import { detectStatusCompletionDrift } from "./status-completion-drift.js"; import { detectZeroProgress } from "./zero-progress.js"; @@ -74,6 +75,10 @@ function defaultDetectors(ctx, options) { name: "production-plateau", run: () => detectProductionPlateau(ctx?.unitMetrics, ctx, options), }, + { + name: "server-direction-drift", + run: () => detectServerDirectionDrift(ctx, options), + }, ]; } diff --git a/src/resources/extensions/sf/detectors/server-direction-drift.js b/src/resources/extensions/sf/detectors/server-direction-drift.js new file mode 100644 index 000000000..51e8857ee --- /dev/null +++ b/src/resources/extensions/sf/detectors/server-direction-drift.js @@ -0,0 +1,132 @@ +/** + * server-direction-drift.js — detect obsolete server architecture in live work. + * + * Purpose: stop SF from planning queued work against superseded server shapes + * after the product direction moves to one embedded `sf server` control plane. + * + * Consumer: Wiggums periodic detector sweep and UOK detector gate registry. + */ + +const DEFAULT_DEPRECATED_PATTERNS = [ + /\bsf serve\b/i, + /\bA2A\b/i, + /\bJSON-RPC API\b/i, + /\bper-repo systemd unit\b/i, + /\bper-repo web servers?\b/i, + /\bseparate standalone daemon brain\b/i, +]; + +const ACTIVE_STATUSES = new Set(["queued", "active", "planned", "pending"]); +const CLOSED_STATUSES = new Set([ + "cancelled", + "canceled", + "complete", + "completed", + "done", + "superseded", + "parked", +]); + +/** + * Detect queued milestone/slice work that still targets a deprecated server path. + * + * Purpose: make stale roadmap/server-direction drift visible before autonomous + * planning spends turns on obsolete `sf serve`, A2A, or per-repo server work. + * + * Consumer: periodic-runner.js default detector list. + */ +export function detectServerDirectionDrift(ctx = {}, options = {}) { + const rows = [ + ...normalizeRows(ctx.milestones, "milestone"), + ...normalizeRows(ctx.slices, "slice"), + ...normalizeRows(ctx.requirements, "requirement"), + ]; + const patterns = + options.deprecatedServerPatterns ?? DEFAULT_DEPRECATED_PATTERNS; + const matches = []; + + for (const row of rows) { + if (!isActiveRow(row)) continue; + const text = searchableText(row); + const pattern = patterns.find((candidate) => candidate.test(text)); + if (!pattern) continue; + matches.push({ + kind: row.kind, + id: row.id, + milestoneId: row.milestoneId ?? row.milestone_id ?? null, + status: row.status ?? null, + pattern: pattern.source, + title: row.title ?? "", + }); + } + + if (matches.length === 0) { + return { stuck: false, reason: "", signature: { checked: rows.length } }; + } + return { + stuck: true, + reason: "server-direction-drift", + signature: { + matches, + expectedDirection: + "sf server is the single operator server; web/Next.js embeds daemon lifecycle", + }, + }; +} + +/** + * Run server-direction drift as a UOK verification gate. + * + * Purpose: make superseded server architecture detectable through the common + * gate runner, not only through ad hoc roadmap review. + * + * Consumer: detector gate registry and periodicDetectorSweepGate. + */ +export const serverDirectionDriftGate = { + id: "server-direction-drift", + type: "verification", + async execute(ctx = {}) { + const result = detectServerDirectionDrift(ctx, ctx.options); + if (result.stuck) { + return { + outcome: "manual-attention", + failureClass: "verification", + rationale: result.reason, + findings: result.signature, + }; + } + return { + outcome: "pass", + failureClass: null, + rationale: "no server-direction drift", + }; + }, +}; + +function normalizeRows(rows, kind) { + if (!Array.isArray(rows)) return []; + return rows.map((row) => ({ ...row, kind })); +} + +function isActiveRow(row) { + const status = String(row.status ?? "").toLowerCase(); + if (CLOSED_STATUSES.has(status)) return false; + return ACTIVE_STATUSES.has(status) || status === ""; +} + +function searchableText(row) { + return [ + row.id, + row.title, + row.description, + row.why, + row.goal, + row.successCriteria, + row.success_criteria, + row.notes, + row.full_content, + row.vision, + ] + .filter((value) => typeof value === "string") + .join("\n"); +} diff --git a/src/resources/extensions/sf/experimental.js b/src/resources/extensions/sf/experimental.js index 19e3e9cfd..e25e1c5db 100644 --- a/src/resources/extensions/sf/experimental.js +++ b/src/resources/extensions/sf/experimental.js @@ -17,12 +17,35 @@ import { loadProjectSFPreferences, } from "./preferences.js"; -/** Extract the body section that follows a YAML frontmatter block. */ -function extractBodyAfterFrontmatter(content) { - const closingIdx = content.indexOf("\n---", content.indexOf("---")); - if (closingIdx === -1) return null; - const afterFrontmatter = content.slice(closingIdx + 4); - return afterFrontmatter.trim() ? afterFrontmatter : null; +/** Return the preferences documentation comment block from a YAML file. */ +function extractPreferencesCommentBlock(content) { + const marker = "\n# SF Preferences"; + const idx = content.indexOf(marker); + if (idx >= 0) return commentPreferencesBody(content.slice(idx)); + if (content.startsWith("# SF Preferences")) return content; + return null; +} + +/** Return a YAML-commented default preferences reference block. */ +function defaultPreferencesCommentBlock() { + return [ + "", + "# SF Preferences", + "#", + "# See `~/.sf/agent/extensions/sf/docs/preferences-reference.md` for full documentation.", + "", + ].join("\n"); +} + +/** Preserve the human reference body without making preferences.yaml multi-doc. */ +function commentPreferencesBody(body) { + return body + .split("\n") + .map((line) => { + if (line === "" || line.startsWith("#")) return line; + return `# ${line}`; + }) + .join("\n"); } /** All recognized experimental feature flags with descriptions. */ @@ -81,14 +104,15 @@ export function setExperimentalFlag(name, value) { prefs.experimental = { ...(prefs.experimental ?? {}), [name]: value }; const frontmatter = serializePreferencesToFrontmatter(prefs); - let body = - "\n# SF Preferences\n\nSee `~/.sf/agent/extensions/sf/docs/preferences-reference.md` for full documentation.\n"; + let body = defaultPreferencesCommentBlock(); if (existsSync(path)) { - const preserved = extractBodyAfterFrontmatter(readFileSync(path, "utf-8")); + const preserved = extractPreferencesCommentBlock( + readFileSync(path, "utf-8"), + ); if (preserved) body = preserved; } mkdirSync(dirname(path), { recursive: true }); - writeFileSync(path, `---\n${frontmatter}---${body}`, "utf-8"); + writeFileSync(path, `${frontmatter}${body}`, "utf-8"); } /** diff --git a/src/resources/extensions/sf/preferences-loader.js b/src/resources/extensions/sf/preferences-loader.js index fc27b0f08..a484120b5 100644 --- a/src/resources/extensions/sf/preferences-loader.js +++ b/src/resources/extensions/sf/preferences-loader.js @@ -194,7 +194,7 @@ export function _resetParseWarningFlag() { */ export function parsePreferencesYaml(content) { try { - const parsed = parseYaml(content); + const parsed = parseYaml(stripPreferencesYamlDocument(content)); if (typeof parsed !== "object" || parsed === null) return {}; return parsed; } catch (e) { @@ -203,6 +203,22 @@ export function parsePreferencesYaml(content) { } } +/** + * Return only the machine-readable YAML document from preferences.yaml. + * + * Purpose: tolerate older files where a human reference body was appended as + * raw Markdown after `# SF Preferences` while keeping canonical writes pure + * YAML plus comments. + * + * Consumer: parsePreferencesYaml before handing content to the YAML parser. + */ +function stripPreferencesYamlDocument(content) { + const marker = "\n# SF Preferences"; + const idx = content.indexOf(marker); + if (idx < 0) return content; + return content.slice(0, idx); +} + /** * Parse legacy frontmatter-style preference content. * diff --git a/src/resources/extensions/sf/sf-db.js b/src/resources/extensions/sf/sf-db.js index a2493d2d2..d2c7cb11f 100644 --- a/src/resources/extensions/sf/sf-db.js +++ b/src/resources/extensions/sf/sf-db.js @@ -13,6 +13,7 @@ export * from "./sf-db/sf-db-memory.js"; export * from "./sf-db/sf-db-milestones.js"; export * from "./sf-db/sf-db-mode-state.js"; export * from "./sf-db/sf-db-profile.js"; +export * from "./sf-db/roadmap-projection-sync.js"; export * from "./sf-db/sf-db-self-feedback.js"; export * from "./sf-db/sf-db-session-store.js"; export * from "./sf-db/sf-db-slices.js"; diff --git a/src/resources/extensions/sf/sf-db/roadmap-projection-sync.js b/src/resources/extensions/sf/sf-db/roadmap-projection-sync.js new file mode 100644 index 000000000..72d81769a --- /dev/null +++ b/src/resources/extensions/sf/sf-db/roadmap-projection-sync.js @@ -0,0 +1,85 @@ +/** + * roadmap-projection-sync.js - schedule DB-backed roadmap projection refreshes. + * + * Purpose: keep M###-ROADMAP.md and M###-ROADMAP.json as generated views of + * canonical SQLite planning state after milestone or slice mutations. + * + * Consumer: sf-db milestone/slice write wrappers and projection-sync tests. + */ +import { logWarning } from "../workflow-logger.js"; + +const pending = new Map(); +const inFlight = new Set(); + +/** + * Queue a best-effort ROADMAP.md/json refresh for one milestone. + * + * Purpose: make roadmap files server-maintained projections instead of stale + * manually rendered artifacts while keeping DB writes synchronous and durable. + * + * Consumer: insert/update milestone and slice DB wrappers. + */ +export function scheduleRoadmapProjectionRefresh( + basePath = process.cwd(), + milestoneId, +) { + if (!milestoneId || roadmapProjectionSyncDisabled()) return; + const key = `${basePath}\0${milestoneId}`; + if (pending.has(key) || inFlight.has(key)) return; + pending.set(key, { basePath, milestoneId }); + const timer = setTimeout(() => { + void flushOneRoadmapProjection(key); + }, 0); + timer.unref?.(); +} + +/** + * Refresh one roadmap projection immediately. + * + * Purpose: provide an explicit, awaitable projection path for tests and repair + * tools while sharing the same renderer used by the asynchronous scheduler. + * + * Consumer: roadmap projection sync tests and future server repair jobs. + */ +export async function refreshRoadmapProjectionNow(basePath, milestoneId) { + const { renderRoadmapFromDb } = await import("../markdown-renderer.js"); + return renderRoadmapFromDb(basePath, milestoneId); +} + +/** + * Drain queued projection refreshes. + * + * Purpose: let tests prove DB writes schedule real roadmap projection updates + * without waiting on wall-clock timers. + * + * Consumer: roadmap-projection-sync.test.mjs. + */ +export async function flushRoadmapProjectionRefreshesForTests() { + while (pending.size > 0) { + const keys = [...pending.keys()]; + await Promise.all(keys.map((key) => flushOneRoadmapProjection(key))); + } +} + +function roadmapProjectionSyncDisabled() { + if (process.env.SF_ROADMAP_PROJECTION_SYNC === "0") return true; + if (process.env.SF_ROADMAP_PROJECTION_SYNC === "1") return false; + return process.env.VITEST === "true"; +} + +async function flushOneRoadmapProjection(key) { + const entry = pending.get(key); + if (!entry || inFlight.has(key)) return; + pending.delete(key); + inFlight.add(key); + try { + await refreshRoadmapProjectionNow(entry.basePath, entry.milestoneId); + } catch (err) { + logWarning("roadmap-projection-sync", "projection refresh failed", { + milestoneId: entry.milestoneId, + error: err instanceof Error ? err.message : String(err), + }); + } finally { + inFlight.delete(key); + } +} diff --git a/src/resources/extensions/sf/sf-db/sf-db-milestones.js b/src/resources/extensions/sf/sf-db/sf-db-milestones.js index 5c221b52b..355898392 100644 --- a/src/resources/extensions/sf/sf-db/sf-db-milestones.js +++ b/src/resources/extensions/sf/sf-db/sf-db-milestones.js @@ -11,6 +11,7 @@ import { rowToMilestone, transaction, } from "./sf-db-core.js"; +import { scheduleRoadmapProjectionRefresh } from "./roadmap-projection-sync.js"; export function insertMilestone(m) { const currentDb = _getAdapter(); @@ -57,6 +58,7 @@ export function insertMilestone(m) { if (hasPlanningPayload(m.planning)) { insertMilestoneSpecIfAbsent(m.id, m.planning ?? {}); } + scheduleRoadmapProjectionRefresh(process.cwd(), m.id); } export function upsertMilestonePlanning(milestoneId, planning) { @@ -111,6 +113,7 @@ export function upsertMilestonePlanning(milestoneId, planning) { ? JSON.stringify(planning.productResearch) : null, }); + scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId); } export function getAllMilestones() { @@ -146,6 +149,7 @@ export function updateMilestoneStatus(milestoneId, status, completedAt) { ":completed_at": completedAt ?? null, ":id": milestoneId, }); + scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId); } export function updateMilestoneQueueOrder(order) { @@ -159,6 +163,9 @@ export function updateMilestoneQueueOrder(order) { stmt.run({ ":sequence": i + 1, ":id": order[i] }); } }); + for (const milestoneId of order) { + scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId); + } } export function getActiveMilestoneFromDb() { @@ -274,6 +281,9 @@ export function bulkInsertLegacyHierarchy(payload) { ); } }); + for (const milestoneId of clearMilestoneIds) { + scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId); + } } export function clearEngineHierarchy() { diff --git a/src/resources/extensions/sf/sf-db/sf-db-slices.js b/src/resources/extensions/sf/sf-db/sf-db-slices.js index fe6e79130..9f76e4fce 100644 --- a/src/resources/extensions/sf/sf-db/sf-db-slices.js +++ b/src/resources/extensions/sf/sf-db/sf-db-slices.js @@ -10,6 +10,7 @@ import { safeParseJsonArray, transaction, } from "./sf-db-core.js"; +import { scheduleRoadmapProjectionRefresh } from "./roadmap-projection-sync.js"; export function insertSlice(s) { const currentDb = _getAdapter(); @@ -95,6 +96,7 @@ export function insertSlice(s) { ":raw_traces_vision_fragment": s.tracesVisionFragment ?? null, }); insertSliceSpecIfAbsent(s.milestoneId, s.id, s.planning ?? {}); + scheduleRoadmapProjectionRefresh(process.cwd(), s.milestoneId); } export function insertOrIgnoreSlice(args) { @@ -109,6 +111,7 @@ export function insertOrIgnoreSlice(args) { ":title": args.title, ":ts": args.createdAt, }); + scheduleRoadmapProjectionRefresh(process.cwd(), args.milestoneId); } export function clearSliceSketch(milestoneId, sliceId) { @@ -127,6 +130,7 @@ export function setSliceSketchFlag(milestoneId, sliceId, isSketch) { ":mid": milestoneId, ":sid": sliceId, }); + scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId); } export function autoHealSketchFlags(milestoneId, hasPlanFile) { @@ -178,6 +182,7 @@ export function upsertSlicePlanning(milestoneId, sliceId, planning) { // ADR-0000 P2 (schema v69): vision trace fragment is part of planning. ":traces_vision_fragment": planning.tracesVisionFragment ?? null, }); + scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId); } // ADR-0000 P2 (schema v69): focused setter so callers that already have a @@ -195,6 +200,7 @@ export function updateSliceVisionTrace(milestoneId, sliceId, fragment) { ":mid": milestoneId, ":sid": sliceId, }); + scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId); } export function getSlice(milestoneId, sliceId) { @@ -219,6 +225,7 @@ export function updateSliceStatus(milestoneId, sliceId, status, completedAt) { ":milestone_id": milestoneId, ":id": sliceId, }); + scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId); } export function setSliceUatVerdict(milestoneId, sliceId, verdict) { @@ -229,6 +236,7 @@ export function setSliceUatVerdict(milestoneId, sliceId, verdict) { `UPDATE slices SET uat_verdict = :verdict WHERE milestone_id = :mid AND id = :sid`, ) .run({ ":mid": milestoneId, ":sid": sliceId, ":verdict": verdict }); + scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId); } export function getSliceUatVerdict(milestoneId, sliceId) { @@ -312,6 +320,7 @@ export function setSliceSummaryMd(milestoneId, sliceId, summaryMd, uatMd) { ":summary_md": summaryMd, ":uat_md": uatMd, }); + scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId); } export function getMilestoneSlices(milestoneId) { @@ -369,6 +378,7 @@ export function syncSliceDependencies(milestoneId, sliceId, depends) { ) .run({ ":mid": milestoneId, ":sid": sliceId, ":dep": dep }); } + scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId); } export function getDependentSlices(milestoneId, sliceId) { @@ -452,6 +462,7 @@ export function updateSliceFields(milestoneId, sliceId, fields) { ":depends": fields.depends ? JSON.stringify(fields.depends) : null, ":demo": fields.demo ?? null, }); + scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId); } export function setSliceReplanTriggeredAt(milestoneId, sliceId, ts) { @@ -462,6 +473,7 @@ export function setSliceReplanTriggeredAt(milestoneId, sliceId, ts) { "UPDATE slices SET replan_triggered_at = :ts WHERE milestone_id = :mid AND id = :sid", ) .run({ ":ts": ts, ":mid": milestoneId, ":sid": sliceId }); + scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId); } export function deleteSlice(milestoneId, sliceId) { @@ -493,4 +505,5 @@ export function deleteSlice(milestoneId, sliceId) { .prepare(`DELETE FROM slices WHERE milestone_id = :mid AND id = :sid`) .run({ ":mid": milestoneId, ":sid": sliceId }); }); + scheduleRoadmapProjectionRefresh(process.cwd(), milestoneId); } diff --git a/src/resources/extensions/sf/tests/detector-server-direction-drift.test.mjs b/src/resources/extensions/sf/tests/detector-server-direction-drift.test.mjs new file mode 100644 index 000000000..2e75f3a76 --- /dev/null +++ b/src/resources/extensions/sf/tests/detector-server-direction-drift.test.mjs @@ -0,0 +1,82 @@ +/** + * detector-server-direction-drift.test.mjs — server direction drift contracts. + * + * Purpose: prove Wiggums catches queued work that revives superseded server + * architecture while ignoring cancelled historical slices. + */ +import assert from "node:assert/strict"; +import { test } from "vitest"; +import { + detectServerDirectionDrift, + serverDirectionDriftGate, +} from "../detectors/server-direction-drift.js"; +import { runDetectorSweep } from "../detectors/periodic-runner.js"; + +test("detectServerDirectionDrift_when_queued_slice_mentions_sf_serve_flags_drift", () => { + const result = detectServerDirectionDrift({ + slices: [ + { + milestone_id: "M053", + id: "S01", + status: "queued", + title: "`sf serve` daemon scaffold + JSON-RPC API", + goal: "Create a separate JSON-RPC API.", + }, + ], + }); + + assert.equal(result.stuck, true); + assert.equal(result.reason, "server-direction-drift"); + assert.equal(result.signature.matches[0].id, "S01"); +}); + +test("detectServerDirectionDrift_when_cancelled_slice_mentions_sf_serve_ignores_history", () => { + const result = detectServerDirectionDrift({ + slices: [ + { + milestone_id: "M053", + id: "S01", + status: "cancelled", + title: "`sf serve` daemon scaffold + JSON-RPC API", + }, + ], + }); + + assert.equal(result.stuck, false); +}); + +test("serverDirectionDriftGate_when_drift_exists_returns_manual_attention", async () => { + const result = await serverDirectionDriftGate.execute({ + requirements: [ + { + id: "R999", + status: "active", + description: "Add A2A as the primary server control plane.", + }, + ], + }); + + assert.equal(result.outcome, "manual-attention"); + assert.equal(result.rationale, "server-direction-drift"); +}); + +test("runDetectorSweep_includes_server_direction_drift_detector", async () => { + const result = await runDetectorSweep( + { + slices: [ + { + id: "S99", + status: "queued", + title: "Per-repo systemd unit for another server", + }, + ], + }, + { throttleMs: 0 }, + ); + + assert.ok( + result.detectorsFired.some( + (detector) => detector.name === "server-direction-drift", + ), + ); +}); diff --git a/src/resources/extensions/sf/tests/model-router-agentic.test.mjs b/src/resources/extensions/sf/tests/model-router-agentic.test.mjs index dd41a3518..794f87a93 100644 --- a/src/resources/extensions/sf/tests/model-router-agentic.test.mjs +++ b/src/resources/extensions/sf/tests/model-router-agentic.test.mjs @@ -2,6 +2,7 @@ import { describe, expect, test } from "vitest"; import { BASE_REQUIREMENTS, MODEL_CAPABILITY_PROFILES, + resolveModelForComplexity, scoreEligibleModels, scoreModel, } from "../model-router.js"; @@ -16,6 +17,11 @@ describe("agentic capability axis (ADR-0079)", () => { ); }); + test("challenge base requirements weight adversarial agentic reasoning", () => { + expect(BASE_REQUIREMENTS.challenge.reasoning).toBeGreaterThanOrEqual(0.8); + expect(BASE_REQUIREMENTS.challenge.agentic).toBeGreaterThanOrEqual(0.85); + }); + test("known agentic-capable models score higher than coding-completion models on execute-task", () => { const codestralScore = scoreModel( MODEL_CAPABILITY_PROFILES["codestral-latest"], @@ -34,6 +40,45 @@ describe("agentic capability axis (ADR-0079)", () => { expect(sonnetScore).toBeGreaterThan(codestralScore); }); + test("challenge routing ignores sticky model unless explicitly enabled", () => { + const phaseConfig = { + primary: "openai/gpt-5.5", + fallbacks: ["minimax/MiniMax-M2.7"], + }; + const routingConfig = { + enabled: true, + capability_routing: true, + }; + const availableModels = ["kimi-coding/kimi-k2.6", "minimax/MiniMax-M2.7"]; + const stickyHint = { provider: "minimax", id: "MiniMax-M2.7" }; + + const withoutSticky = resolveModelForComplexity( + { tier: "standard" }, + phaseConfig, + routingConfig, + availableModels, + "challenge", + {}, + {}, + stickyHint, + ); + expect(withoutSticky.selectionMethod).toBe("capability-scored"); + expect(withoutSticky.modelId).toBe("kimi-coding/kimi-k2.6"); + + const withSticky = resolveModelForComplexity( + { tier: "standard" }, + phaseConfig, + { ...routingConfig, sticky_routing: true }, + availableModels, + "challenge", + {}, + {}, + stickyHint, + ); + expect(withSticky.selectionMethod).toBe("slice-sticky"); + expect(withSticky.modelId).toBe("minimax/MiniMax-M2.7"); + }); + test("devstral variants score below agentic models on execute-task", () => { const devstralScore = scoreModel( MODEL_CAPABILITY_PROFILES["devstral-2512"], diff --git a/src/resources/extensions/sf/tests/preferences-models.test.mjs b/src/resources/extensions/sf/tests/preferences-models.test.mjs index 0a74c0629..64b919f0f 100644 --- a/src/resources/extensions/sf/tests/preferences-models.test.mjs +++ b/src/resources/extensions/sf/tests/preferences-models.test.mjs @@ -110,6 +110,25 @@ describe("preferences model resolution", () => { }); }); + test("resolveModelWithFallbacksForUnit_when_challenge_uses_validation_model", () => { + makePreferencesProject( + [ + "version: 1", + "models:", + " planning: minimax/MiniMax-M2.7", + " validation: kimi-coding/kimi-k2.6", + "", + ].join("\n"), + ); + + const result = resolveModelWithFallbacksForUnit("challenge"); + + assert.deepEqual(result, { + primary: "kimi-coding/kimi-k2.6", + fallbacks: [], + }); + }); + test("isModelInEnabledList_when_list_empty_allows_any_model", () => { assert.equal(isModelInEnabledList("kimi-coding", "kimi-k2.6", []), true); assert.equal( diff --git a/src/resources/extensions/sf/tests/roadmap-projection-sync.test.mjs b/src/resources/extensions/sf/tests/roadmap-projection-sync.test.mjs new file mode 100644 index 000000000..f3dc756aa --- /dev/null +++ b/src/resources/extensions/sf/tests/roadmap-projection-sync.test.mjs @@ -0,0 +1,106 @@ +import assert from "node:assert/strict"; +import { + existsSync, + mkdirSync, + mkdtempSync, + readFileSync, + rmSync, +} from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, describe, test } from "vitest"; +import { + closeDatabase, + flushRoadmapProjectionRefreshesForTests, + insertMilestone, + insertSlice, + openDatabase, + updateSliceStatus, + upsertMilestonePlanning, +} from "../sf-db.js"; + +const originalCwd = process.cwd(); +const originalEnv = { ...process.env }; +const tmpDirs = []; + +afterEach(() => { + closeDatabase(); + process.chdir(originalCwd); + process.env = { ...originalEnv }; + while (tmpDirs.length > 0) { + rmSync(tmpDirs.pop(), { recursive: true, force: true }); + } +}); + +function makeProject() { + const dir = mkdtempSync(join(tmpdir(), "sf-roadmap-sync-")); + tmpDirs.push(dir); + mkdirSync(join(dir, ".sf"), { recursive: true }); + process.env.SF_ROADMAP_PROJECTION_SYNC = "1"; + process.chdir(dir); + openDatabase(join(dir, ".sf", "sf.db")); + return dir; +} + +describe("roadmap projection sync", () => { + test("db_writes_refresh_roadmap_projection", async () => { + const project = makeProject(); + + insertMilestone({ + id: "M777", + title: "Initial server plan", + status: "queued", + planning: { + vision: "Keep planning state in SQLite.", + successCriteria: ["Projection exists."], + }, + }); + insertSlice({ + milestoneId: "M777", + id: "S01", + title: "Render projection", + status: "pending", + sequence: 1, + planning: { + goal: "Write ROADMAP.md and ROADMAP.json from DB state.", + }, + }); + await flushRoadmapProjectionRefreshesForTests(); + + const roadmapPath = join( + project, + ".sf", + "milestones", + "M777", + "M777-ROADMAP.md", + ); + const jsonPath = join( + project, + ".sf", + "milestones", + "M777", + "M777-ROADMAP.json", + ); + assert.equal(existsSync(roadmapPath), true); + assert.equal(existsSync(jsonPath), true); + assert.match(readFileSync(roadmapPath, "utf-8"), /Initial server plan/); + + upsertMilestonePlanning("M777", { + title: "Server-owned roadmap projection", + vision: "The server refreshes generated roadmap files after DB writes.", + }); + updateSliceStatus("M777", "S01", "complete", "2026-05-17T20:00:00.000Z"); + await flushRoadmapProjectionRefreshesForTests(); + + const roadmap = readFileSync(roadmapPath, "utf-8"); + const projection = JSON.parse(readFileSync(jsonPath, "utf-8")); + assert.match(roadmap, /Server-owned roadmap projection/); + assert.match( + roadmap, + /The server refreshes generated roadmap files after DB writes/, + ); + assert.match(roadmap, /- \[x\] \*\*S01: Render projection\*\*/); + assert.equal(projection.origin, "db-projection"); + assert.equal(projection.slices[0].status, "complete"); + }); +}); diff --git a/src/resources/extensions/sf/uok/gate-registry-bootstrap.js b/src/resources/extensions/sf/uok/gate-registry-bootstrap.js index c1fa9194a..5bc136ef4 100644 --- a/src/resources/extensions/sf/uok/gate-registry-bootstrap.js +++ b/src/resources/extensions/sf/uok/gate-registry-bootstrap.js @@ -11,6 +11,7 @@ import { repeatedFeedbackKindGate } from "../detectors/repeated-feedback-kind.js import { artifactFlapGate } from "../detectors/artifact-flap.js"; import { staleLockGate } from "../detectors/stale-lock.js"; import { periodicDetectorSweepGate } from "../detectors/periodic-runner.js"; +import { serverDirectionDriftGate } from "../detectors/server-direction-drift.js"; import { inlineRuntimeGate } from "./inline-runtime-gate.js"; /** @@ -41,6 +42,7 @@ registry.register(zeroProgressGate); registry.register(repeatedFeedbackKindGate); registry.register(artifactFlapGate); registry.register(staleLockGate); +registry.register(serverDirectionDriftGate); registry.register(periodicDetectorSweepGate); registry.register(inlineRuntimeGate); diff --git a/src/tests/headless-feedback.test.ts b/src/tests/headless-feedback.test.ts index 953bdb43d..76a873563 100644 --- a/src/tests/headless-feedback.test.ts +++ b/src/tests/headless-feedback.test.ts @@ -20,6 +20,24 @@ const handlerSrc = readFileSync( join(__dirname, "..", "headless-feedback.ts"), "utf-8", ); +const forwardSrc = readFileSync( + join(__dirname, "..", "headless-server-forward.ts"), + "utf-8", +); +const rpcModeSrc = readFileSync( + join( + __dirname, + "..", + "..", + "packages", + "coding-agent", + "src", + "modes", + "rpc", + "rpc-mode.ts", + ), + "utf-8", +); test("headless.ts dispatches feedback command to handleFeedback", () => { assert.match( @@ -72,5 +90,29 @@ test("add path defaults blocking from severity, doesn't require it", () => { // readBoolFlag(--blocking) OR severity === high|critical → blocking=true. // The behaviour is documented in self-feedback.js (deriveBlocking), // mirror it so operator-filed entries have consistent semantics. - assert.match(handlerSrc, /severity === "high" \|\| severity === "critical"/); + assert.match(handlerSrc, /severity === "high"/); + assert.match(handlerSrc, /severity === "critical"/); +}); + +test("active-server feedback forwarding queues writes instead of blocking RPC", () => { + assert.match( + forwardSrc, + /queued:\s*true/, + "forwarded add/resolve commands must ask the active RPC server to queue writes", + ); + assert.match( + rpcModeSrc, + /SF_FEEDBACK_QUEUE_FILE = "sf-feedback-queue\.jsonl"/, + "RPC server must persist queued feedback commands durably", + ); + assert.match( + rpcModeSrc, + /await drainQueuedSfFeedbackCommands\(process\.cwd\(\)\)/, + "server-owned autonomous startup must drain queued feedback before running", + ); + assert.match( + rpcModeSrc, + /scheduleQueuedSfFeedbackDrain\(process\.cwd\(\)\)/, + "queued feedback commands should also drain from the server control lane", + ); }); diff --git a/src/tests/integration/web-mode-cli.test.ts b/src/tests/integration/web-mode-cli.test.ts index 19c2eb1dd..2e97ad143 100644 --- a/src/tests/integration/web-mode-cli.test.ts +++ b/src/tests/integration/web-mode-cli.test.ts @@ -954,6 +954,51 @@ test("sf server stop is parsed and dispatched with resolved path", async assert.equal(stopOptions?.all, false); }); +test("sf server reload is parsed as reload launch", async (_t) => { + const tmp = mkdtempSync(join(tmpdir(), "sf-web-reload-path-")); + let receivedOptions: Record | undefined; + + afterEach(() => { + rmSync(tmp, { recursive: true, force: true }); + }); + + mkdirSync(tmp, { recursive: true }); + const flags = cliWeb.parseCliArgs([ + "node", + "dist/loader.js", + "server", + "reload", + tmp, + ]); + assert.deepEqual(flags.messages, ["server", "reload", tmp]); + + const result = await cliWeb.runWebCliBranch(flags, { + cwd: () => "/", + runWebMode: async (options) => { + receivedOptions = options as unknown as Record; + return { + mode: "web" as const, + ok: true as const, + cwd: options.cwd, + projectSessionsDir: options.projectSessionsDir, + host: "127.0.0.1", + port: 4000, + url: "http://127.0.0.1:4000", + hostKind: "packaged-standalone" as const, + hostPath: "/tmp/server.js", + hostRoot: "/tmp", + }; + }, + stderr: { write: () => true }, + }); + + assert.equal(result.handled, true); + if (!result.handled) throw new Error("expected handled"); + assert.equal(result.action, "reload"); + assert.equal(receivedOptions?.cwd, tmp); + assert.equal(receivedOptions?.reload, true); +}); + // ─── Context-aware launch detection tests ────────────────────────────── test("resolveContextAwareCwd returns project cwd when inside a project under dev root", (_t) => { @@ -1137,12 +1182,94 @@ test("launchWebMode kills stale instance for same cwd before spawning", async (_ assert.equal(status.ok, true); assert.equal(spawnCalled, true); // Stale instance for same cwd should have been cleaned up - assert.match(stderrOutput, /Cleaning up stale/); + assert.match(stderrOutput, /Stale SF server was already stopped/); // New instance should be registered const registry = webMode.readInstanceRegistry(registryPath); assert.equal(registry[resolve(cwd)]?.pid, 88888); }); +test("launchWebMode reload proves candidate before replacing fixed-port server", async (_t) => { + const tmp = mkdtempSync(join(tmpdir(), "sf-web-reload-")); + const standaloneRoot = join(tmp, "dist", "web", "standalone"); + const serverPath = join(standaloneRoot, "server.js"); + mkdirSync(standaloneRoot, { recursive: true }); + writeFileSync(serverPath, 'console.log("stub")\n'); + + const registryPath = join(tmp, "web-instances.json"); + const pidFilePath = join(tmp, "web-server.pid"); + const cwd = "/tmp/reload-project"; + webMode.registerInstance( + cwd, + { pid: 77777, port: 4000, url: "http://127.0.0.1:4000" }, + registryPath, + ); + + const spawnPorts: string[] = []; + const bootUrls: string[] = []; + let nextPid = 90000; + let stderrOutput = ""; + + afterEach(() => { + rmSync(tmp, { recursive: true, force: true }); + }); + + const status = await webMode.launchWebMode( + { + cwd, + projectSessionsDir: "/tmp/.sf/sessions/reload", + agentDir: "/tmp/.sf/agent", + packageRoot: tmp, + port: 4000, + reload: true, + }, + { + initResources: () => {}, + resolvePort: async () => 45123, + execPath: "/custom/node", + env: { TEST_ENV: "1" }, + kill: ((pid: number, signal?: string | number) => { + if (pid === 77777 && signal === 0) return true; + const error = new Error("no such process") as NodeJS.ErrnoException; + error.code = "ESRCH"; + throw error; + }) as typeof process.kill, + spawn: (_command, _args, options) => { + spawnPorts.push(String(options.env?.PORT)); + return { + pid: nextPid++, + once: () => undefined, + unref: () => {}, + } as any; + }, + waitForBootReady: async (url) => { + bootUrls.push(url); + }, + openBrowser: () => {}, + pidFilePath, + writePidFile: webMode.writePidFile, + registryPath, + stderr: { + write(chunk: string) { + stderrOutput += chunk; + return true; + }, + }, + }, + ); + + assert.equal(status.ok, true); + assert.deepEqual(spawnPorts, ["45123", "4000"]); + assert.deepEqual(bootUrls, [ + "http://127.0.0.1:45123", + "http://127.0.0.1:4000", + ]); + assert.match(stderrOutput, /Proving reload candidate/); + assert.match(stderrOutput, /Reload candidate passed boot check/); + const registry = webMode.readInstanceRegistry(registryPath); + assert.equal(registry[resolve(cwd)]?.pid, 90001); + assert.equal(registry[resolve(cwd)]?.port, 4000); +}); + test("launchWebMode does not log cleanup when no stale instance exists", async (_t) => { const tmp = mkdtempSync(join(tmpdir(), "sf-web-no-stale-")); const standaloneRoot = join(tmp, "dist", "web", "standalone"); diff --git a/src/web-mode.ts b/src/web-mode.ts index 7e6bb2a70..2af3ff712 100644 --- a/src/web-mode.ts +++ b/src/web-mode.ts @@ -56,6 +56,16 @@ export interface WebModeLaunchOptions { packageRoot?: string; host?: string; port?: number; + /** + * Reload an existing registered server after the replacement passes boot. + * + * Purpose: keep `sf server` upgrades graceful by proving the candidate host + * is healthy before terminating the old process bound to the project. + * + * Consumer: `sf server reload` and default `sf server start` behavior when a + * live same-project instance already exists. + */ + reload?: boolean; /** Additional allowed origins for CORS (forwarded as SF_WEB_ALLOWED_ORIGINS). */ allowedOrigins?: string[]; } @@ -128,6 +138,7 @@ export interface WebModeDeps { writePidFile?: (path: string, pid: number) => void; readPidFile?: (path: string) => number | null; deletePidFile?: (path: string) => void; + kill?: typeof process.kill; /** Path to the multi-instance registry JSON (for testing). */ registryPath?: string; } @@ -146,6 +157,11 @@ export interface WebModeStopResult { stoppedCount?: number; } +type ExistingServerInstance = + | { state: "none" } + | { state: "dead"; entry: WebInstanceEntry } + | { state: "live"; entry: WebInstanceEntry }; + // ─── Instance Registry ────────────────────────────────────────────────────── export interface WebInstanceEntry { @@ -831,6 +847,57 @@ function cleanupStaleInstance( unregisterInstance(cwd, registryPath); } +function getRegisteredServerInstance( + cwd: string, + registryPath?: string, + kill: typeof process.kill = process.kill, +): ExistingServerInstance { + const registry = readInstanceRegistry(registryPath); + const entry = registry[resolve(cwd)]; + if (!entry) return { state: "none" }; + if (!pidExists(entry.pid, kill)) return { state: "dead", entry }; + return { state: "live", entry }; +} + +function cleanupDeadRegisteredInstance( + cwd: string, + stderr: WritableLike, + entry: WebInstanceEntry, + registryPath?: string, +): void { + stderr.write( + `[forge] Stale SF server was already stopped (pid=${entry.pid}) — clearing entry.\n`, + ); + unregisterInstance(cwd, registryPath); +} + +function stopReloadedInstance( + cwd: string, + stderr: WritableLike, + entry: WebInstanceEntry, + registryPath?: string, +): void { + const result = terminateWebServerProcessTree(entry.pid); + if (result === "killed" || result === "force-killed") { + stderr.write( + `[forge] Reloaded SF server for ${resolve(cwd)}; stopped previous pid=${entry.pid}.\n`, + ); + } else if (result === "already-dead") { + stderr.write( + `[forge] Previous SF server already exited during reload (pid=${entry.pid}).\n`, + ); + } else { + stderr.write( + `[forge] Reload candidate is running, but previous SF server pid=${entry.pid} did not stop: ${result.error}\n`, + ); + return; + } + // Only remove the old registry row after the new instance has already + // registered itself. unregisterInstance deletes by cwd, so callers must + // invoke this before registering the replacement. + unregisterInstance(cwd, registryPath); +} + /** * Detect and reap orphaned next-server processes that outlived their parent * web host. These orphans have cwd under dist/web/standalone (or a deleted @@ -951,10 +1018,35 @@ export async function launchWebMode( stderr.write(`[forge] Starting server mode…\n`); - // Kill any stale server instance for this project before reserving a port. - // This prevents EADDRINUSE when the previous `sf server` was terminated - // without a clean shutdown (e.g. terminal closed, crash). - cleanupStaleInstance(options.cwd, stderr, deps.registryPath); + const existing = getRegisteredServerInstance( + options.cwd, + deps.registryPath, + deps.kill, + ); + let reloadPrevious: WebInstanceEntry | null = null; + if ( + existing.state === "live" && + (options.reload === true || + !options.port || + options.port === existing.entry.port) + ) { + reloadPrevious = existing.entry; + stderr.write( + `[forge] Existing SF server found for ${resolve(options.cwd)} (pid=${existing.entry.pid}, port=${existing.entry.port}); launching replacement before shutdown.\n`, + ); + } else if (existing.state === "dead") { + cleanupDeadRegisteredInstance( + options.cwd, + stderr, + existing.entry, + deps.registryPath, + ); + } else if (existing.state === "live") { + // Explicit fixed-port start cannot bind beside a live same-port process. + // Stop it before launch so legacy `sf server start --port 4000` keeps + // working, while normal starts use reload-first behavior. + cleanupStaleInstance(options.cwd, stderr, deps.registryPath); + } // Also reap orphaned next-server processes from prior unclean shutdowns // (sf-mooe4m5k-6fm7z9): orphaned next-server processes with cwd under @@ -969,28 +1061,11 @@ export async function launchWebMode( ); } - const port = + const targetPort = options.port ?? + reloadPrevious?.port ?? (deps.resolvePort ? await deps.resolvePort(host) : DEFAULT_PORT); - const authToken = randomBytes(32).toString("hex"); - const url = `http://${host}:${port}`; - const env = { - ...(deps.env ?? process.env), - HOSTNAME: host, - PORT: String(port), - SF_WEB_HOST: host, - SF_WEB_PORT: String(port), - SF_WEB_AUTH_TOKEN: authToken, - SF_WEB_PROJECT_CWD: options.cwd, - SF_WEB_PROJECT_SESSIONS_DIR: options.projectSessionsDir, - SF_WEB_PACKAGE_ROOT: resolution.packageRoot, - SF_WEB_HOST_KIND: resolution.kind, - SF_WEB_AUTO_START_AUTONOMOUS: "1", - ...(resolution.kind === "source-dev" ? { NEXT_PUBLIC_SF_DEV: "1" } : {}), - ...(options.allowedOrigins?.length - ? { SF_WEB_ALLOWED_ORIGINS: options.allowedOrigins.join(",") } - : {}), - }; + const targetUrl = `http://${host}:${targetPort}`; try { stderr.write(`[forge] Initialising resources…\n`); @@ -1005,8 +1080,8 @@ export async function launchWebMode( cwd: options.cwd, projectSessionsDir: options.projectSessionsDir, host, - port, - url, + port: targetPort, + url: targetUrl, hostKind: resolution.kind, hostPath: resolution.entryPath, hostRoot: resolution.hostRoot, @@ -1016,89 +1091,163 @@ export async function launchWebMode( return failure; } - const spawnSpec = buildSpawnSpec( - resolution, - host, - port, - deps.platform ?? process.platform, - deps.execPath ?? process.execPath, - ); - - stderr.write(`[forge] Launching web host on port ${port}…\n`); - - const spawnResult = await spawnDetachedProcess( - deps.spawn ?? - ((command, args, spawnOptions) => spawn(command, args, spawnOptions)), - spawnSpec.command, - spawnSpec.args, - { - cwd: spawnSpec.cwd, - detached: true, - stdio: "ignore", - windowsHide: true, - shell: needsWindowsShell( - spawnSpec.command, - deps.platform ?? process.platform, - ), - env, - }, - ); - - if (!spawnResult.ok) { - const failure: WebModeLaunchFailure = { - mode: "web", - ok: false, - cwd: options.cwd, - projectSessionsDir: options.projectSessionsDir, + const spawnVerifiedHost = async ( + port: number, + label: "candidate" | "web host", + autoStartAutonomous: boolean, + ): Promise< + | { + ok: true; + child: SpawnedChildLike; + authToken: string; + url: string; + } + | { ok: false; failure: WebModeLaunchFailure } + > => { + const authToken = randomBytes(32).toString("hex"); + const url = `http://${host}:${port}`; + const env = { + ...(deps.env ?? process.env), + HOSTNAME: host, + PORT: String(port), + SF_WEB_HOST: host, + SF_WEB_PORT: String(port), + SF_WEB_AUTH_TOKEN: authToken, + SF_WEB_PROJECT_CWD: options.cwd, + SF_WEB_PROJECT_SESSIONS_DIR: options.projectSessionsDir, + SF_WEB_PACKAGE_ROOT: resolution.packageRoot, + SF_WEB_HOST_KIND: resolution.kind, + SF_WEB_AUTO_START_AUTONOMOUS: autoStartAutonomous ? "1" : "0", + ...(resolution.kind === "source-dev" ? { NEXT_PUBLIC_SF_DEV: "1" } : {}), + ...(options.allowedOrigins?.length + ? { SF_WEB_ALLOWED_ORIGINS: options.allowedOrigins.join(",") } + : {}), + }; + const spawnSpec = buildSpawnSpec( + resolution, host, port, - url, - hostKind: resolution.kind, - hostPath: resolution.entryPath, - hostRoot: resolution.hostRoot, - failureReason: `launch:${spawnResult.error instanceof Error ? spawnResult.error.message : String(spawnResult.error)}`, - }; - emitLaunchStatus(stderr, failure); - return failure; + deps.platform ?? process.platform, + deps.execPath ?? process.execPath, + ); + stderr.write(`[forge] Launching ${label} on port ${port}…\n`); + const spawnResult = await spawnDetachedProcess( + deps.spawn ?? + ((command, args, spawnOptions) => spawn(command, args, spawnOptions)), + spawnSpec.command, + spawnSpec.args, + { + cwd: spawnSpec.cwd, + detached: true, + stdio: "ignore", + windowsHide: true, + shell: needsWindowsShell( + spawnSpec.command, + deps.platform ?? process.platform, + ), + env, + }, + ); + if (!spawnResult.ok) { + return { + ok: false, + failure: { + mode: "web", + ok: false, + cwd: options.cwd, + projectSessionsDir: options.projectSessionsDir, + host, + port, + url, + hostKind: resolution.kind, + hostPath: resolution.entryPath, + hostRoot: resolution.hostRoot, + failureReason: `launch:${spawnResult.error instanceof Error ? spawnResult.error.message : String(spawnResult.error)}`, + }, + }; + } + try { + const bootReadyFn = + deps.waitForBootReady ?? + ((u: string) => waitForBootReady(u, 180_000, stderr, authToken)); + await bootReadyFn(url); + } catch (error) { + if (spawnResult.child.pid !== undefined) { + terminateWebServerProcessTree(spawnResult.child.pid); + } + return { + ok: false, + failure: { + mode: "web", + ok: false, + cwd: options.cwd, + projectSessionsDir: options.projectSessionsDir, + host, + port, + url, + hostKind: resolution.kind, + hostPath: resolution.entryPath, + hostRoot: resolution.hostRoot, + failureReason: `boot-ready:${error instanceof Error ? error.message : String(error)}`, + }, + }; + } + return { ok: true, child: spawnResult.child, authToken, url }; + }; + + if (reloadPrevious) { + const candidatePort = deps.resolvePort + ? await deps.resolvePort(host) + : await reserveWebPort(host); + stderr.write( + `[forge] Proving reload candidate on temporary port ${candidatePort} before touching fixed port ${targetPort}…\n`, + ); + const candidate = await spawnVerifiedHost( + candidatePort, + "candidate", + false, + ); + if (!candidate.ok) { + emitLaunchStatus(stderr, candidate.failure); + return candidate.failure; + } + if (candidate.child.pid !== undefined) { + terminateWebServerProcessTree(candidate.child.pid); + } + stderr.write(`[forge] Reload candidate passed boot check.\n`); + stopReloadedInstance( + options.cwd, + stderr, + reloadPrevious, + deps.registryPath, + ); + } + + const finalHost = await spawnVerifiedHost(targetPort, "web host", true); + if (!finalHost.ok) { + emitLaunchStatus(stderr, finalHost.failure); + return finalHost.failure; } try { - const bootReadyFn = - deps.waitForBootReady ?? - ((u: string) => waitForBootReady(u, 180_000, stderr, authToken)); - await bootReadyFn(url); - } catch (error) { - const failure: WebModeLaunchFailure = { - mode: "web", - ok: false, - cwd: options.cwd, - projectSessionsDir: options.projectSessionsDir, - host, - port, - url, - hostKind: resolution.kind, - hostPath: resolution.entryPath, - hostRoot: resolution.hostRoot, - failureReason: `boot-ready:${error instanceof Error ? error.message : String(error)}`, - }; - emitLaunchStatus(stderr, failure); - return failure; - } - - try { - spawnResult.child.unref?.(); - const pid = spawnResult.child.pid; + finalHost.child.unref?.(); + const pid = finalHost.child.pid; if (pid !== undefined) { const pidFilePath = deps.pidFilePath ?? defaultWebPidFilePath; (deps.writePidFile ?? writePidFile)(pidFilePath, pid); // Register in multi-instance registry registerInstance( options.cwd, - { pid, port, url, authToken }, + { + pid, + port: targetPort, + url: targetUrl, + authToken: finalHost.authToken, + }, deps.registryPath, ); } - const authenticatedUrl = `${url}/#token=${authToken}`; + const authenticatedUrl = `${targetUrl}/#token=${finalHost.authToken}`; try { (deps.openBrowser ?? openBrowser)(authenticatedUrl); } catch (browserError) { @@ -1113,8 +1262,8 @@ export async function launchWebMode( cwd: options.cwd, projectSessionsDir: options.projectSessionsDir, host, - port, - url, + port: targetPort, + url: targetUrl, hostKind: resolution.kind, hostPath: resolution.entryPath, hostRoot: resolution.hostRoot, @@ -1124,15 +1273,15 @@ export async function launchWebMode( return failure; } - const authenticatedUrl = `${url}/#token=${authToken}`; + const authenticatedUrl = `${targetUrl}/#token=${finalHost.authToken}`; const success: WebModeLaunchSuccess = { mode: "web", ok: true, cwd: options.cwd, projectSessionsDir: options.projectSessionsDir, host, - port, - url, + port: targetPort, + url: targetUrl, hostKind: resolution.kind, hostPath: resolution.entryPath, hostRoot: resolution.hostRoot, diff --git a/src/web/settings-service.ts b/src/web/settings-service.ts index 3bbf749e6..a972c2fdd 100644 --- a/src/web/settings-service.ts +++ b/src/web/settings-service.ts @@ -1,6 +1,6 @@ import { execFile } from "node:child_process"; import { existsSync } from "node:fs"; -import { join } from "node:path"; +import { dirname, join } from "node:path"; import { pathToFileURL } from "node:url"; import type { SettingsData } from "../../web/lib/settings-types.ts"; import { resolveBridgeRuntimeConfig } from "./bridge-service.ts"; @@ -65,6 +65,13 @@ export async function collectSettingsData( const budgetPath = budgetResolution.modulePath; const historyPath = historyResolution.modulePath; const metricsPath = metricsResolution.modulePath; + const benchmarksPath = join( + dirname(routerPath), + "learning", + "data", + "model-benchmarks.json", + ); + const performancePath = join(projectCwd, ".sf", "model-performance.json"); // All modules share the same compiled-vs-source mode (they're all from the same package) const useCompiledJs = prefsResolution.useCompiledJs; @@ -102,6 +109,7 @@ export async function collectSettingsData( // and writes a combined JSON payload to stdout. const script = [ 'const { pathToFileURL } = await import("node:url");', + 'const { existsSync, readFileSync } = await import("node:fs");', "const prefsMod = await import(pathToFileURL(process.env.SF_SETTINGS_PREFS_MODULE).href);", "const routerMod = await import(pathToFileURL(process.env.SF_SETTINGS_ROUTER_MODULE).href);", "const budgetMod = await import(pathToFileURL(process.env.SF_SETTINGS_BUDGET_MODULE).href);", @@ -172,8 +180,45 @@ export async function collectSettingsData( "const ledger = metricsMod.loadLedgerFromDisk(process.env.SF_SETTINGS_BASE);", "const projectTotals = ledger ? metricsMod.getProjectTotals(ledger.units) : null;", + // 6. Published benchmark table and local learned model outcomes + "function readJson(path) {", + " if (!path || !existsSync(path)) return null;", + " try { return JSON.parse(readFileSync(path, 'utf-8')); } catch { return null; }", + "}", + "function benchmarkRows(raw) {", + " if (!raw || typeof raw !== 'object') return [];", + " return Object.entries(raw)", + " .filter(([modelId]) => !modelId.startsWith('_'))", + " .map(([modelId, row]) => ({ modelId, ...(row && typeof row === 'object' ? row : {}) }))", + " .sort((a, b) => String(a.modelId).localeCompare(String(b.modelId)));", + "}", + "function performanceRows(raw) {", + " if (!raw || typeof raw !== 'object') return [];", + " const rows = [];", + " for (const [unitType, models] of Object.entries(raw)) {", + " if (!models || typeof models !== 'object') continue;", + " for (const [modelId, value] of Object.entries(models)) {", + " if (!value || typeof value !== 'object') continue;", + " const aggregate = value.aggregate && typeof value.aggregate === 'object' ? value.aggregate : {};", + " rows.push({", + " unitType,", + " modelId,", + " successes: Number(aggregate.successes ?? 0),", + " failures: Number(aggregate.failures ?? 0),", + " timeouts: Number(aggregate.timeouts ?? 0),", + " totalTokens: Number(aggregate.totalTokens ?? 0),", + " totalCost: Number(aggregate.totalCost ?? 0),", + " lastUsed: aggregate.lastUsed ?? null,", + " });", + " }", + " }", + " return rows.sort((a, b) => String(b.lastUsed ?? '').localeCompare(String(a.lastUsed ?? '')));", + "}", + "const modelBenchmarks = benchmarkRows(readJson(process.env.SF_SETTINGS_BENCHMARKS_PATH));", + "const modelPerformance = performanceRows(readJson(process.env.SF_SETTINGS_MODEL_PERFORMANCE_PATH));", + // Write combined payload - "process.stdout.write(JSON.stringify({ preferences, routingConfig, budgetAllocation, routingHistory, projectTotals }));", + "process.stdout.write(JSON.stringify({ preferences, routingConfig, budgetAllocation, routingHistory, projectTotals, modelBenchmarks, modelPerformance }));", ].join(" "); const prefixArgs = buildSubprocessPrefixArgs( @@ -196,6 +241,8 @@ export async function collectSettingsData( SF_SETTINGS_HISTORY_MODULE: historyPath, SF_SETTINGS_METRICS_MODULE: metricsPath, SF_SETTINGS_BASE: projectCwd, + SF_SETTINGS_BENCHMARKS_PATH: benchmarksPath, + SF_SETTINGS_MODEL_PERFORMANCE_PATH: performancePath, }, maxBuffer: SETTINGS_MAX_BUFFER, windowsHide: true, diff --git a/web/components/sf/settings-panels.tsx b/web/components/sf/settings-panels.tsx index 8bbd04602..2ae2e392d 100644 --- a/web/components/sf/settings-panels.tsx +++ b/web/components/sf/settings-panels.tsx @@ -22,6 +22,8 @@ import { Button } from "@/components/ui/button"; import { authFetch } from "@/lib/auth"; import type { SettingsData, + SettingsModelBenchmark, + SettingsModelPerformance, SettingsPatternHistory, SettingsRoutingHistory, } from "@/lib/settings-types"; @@ -438,10 +440,68 @@ function TierOutcomeBadge({ ); } +function normalizeModelId(id: string): string { + return id.includes("/") ? (id.split("/").pop() ?? id) : id; +} + +function formatBenchmarkScore(value: number | null | undefined): string { + return typeof value === "number" && Number.isFinite(value) + ? value.toFixed(1) + : "–"; +} + +function aggregateModelPerformance( + rows: SettingsModelPerformance[], + modelId: string, +): { runs: number; successRate: string; cost: string } { + const bare = normalizeModelId(modelId); + const matched = rows.filter( + (row) => + row.modelId === modelId || + row.modelId.endsWith(`/${bare}`) || + normalizeModelId(row.modelId) === bare, + ); + const totals = matched.reduce( + (acc, row) => { + acc.successes += row.successes; + acc.failures += row.failures; + acc.timeouts += row.timeouts; + acc.cost += row.totalCost; + return acc; + }, + { successes: 0, failures: 0, timeouts: 0, cost: 0 }, + ); + const runs = totals.successes + totals.failures + totals.timeouts; + return { + runs, + successRate: + runs > 0 ? `${Math.round((totals.successes / runs) * 100)}%` : "–", + cost: runs > 0 ? formatCost(totals.cost) : "–", + }; +} + +function rankedBenchmarks( + benchmarks: SettingsModelBenchmark[], +): SettingsModelBenchmark[] { + return [...benchmarks] + .sort((a, b) => { + const score = (row: SettingsModelBenchmark) => + (row.swe_bench_verified ?? row.swe_bench ?? 0) * 0.35 + + (row.live_code_bench ?? 0) * 0.25 + + (row.hle ?? 0) * 0.15 + + (row.gpqa ?? 0) * 0.15 + + (row.instruction_following ?? 0) * 0.1; + return score(b) - score(a); + }) + .slice(0, 12); +} + export function ModelRoutingPanel() { const { state, data, busy, refresh } = useSettingsData(); const routingConfig = data?.routingConfig ?? null; const routingHistory = data?.routingHistory ?? null; + const modelBenchmarks = rankedBenchmarks(data?.modelBenchmarks ?? []); + const modelPerformance = data?.modelPerformance ?? []; return (
@@ -569,6 +629,73 @@ export function ModelRoutingPanel() { ) : ( )} + + {/* Model benchmarks */} + {modelBenchmarks.length > 0 ? ( +
+

+ Model Benchmarks +

+
+ + + + + + + + + + + + + + {modelBenchmarks.map((row) => { + const local = aggregateModelPerformance( + modelPerformance, + row.modelId, + ); + return ( + + + + + + + + + + ); + })} + +
ModelSWELCBHLEGPQALocalCost
+ {row.modelId} + + {formatBenchmarkScore( + row.swe_bench_verified ?? row.swe_bench, + )} + + {formatBenchmarkScore(row.live_code_bench)} + + {formatBenchmarkScore(row.hle)} + + {formatBenchmarkScore(row.gpqa)} + + {local.runs > 0 + ? `${local.successRate} / ${local.runs}` + : "–"} + + {local.cost} +
+
+
+ ) : ( + + )} )}
@@ -775,7 +902,7 @@ export function RemoteQuestionsPanel() { const { data, busy, refresh } = useSettingsData(); const existingConfig = data?.preferences?.remoteQuestions ?? null; - const [_envVarSet, setEnvVarSet] = useState(false); + const [, setEnvVarSet] = useState(false); const [envVarName, setEnvVarName] = useState(null); const [apiLoading, setApiLoading] = useState(true); const [tokenSet, setTokenSet] = useState(false); diff --git a/web/lib/settings-types.ts b/web/lib/settings-types.ts index f9ce24249..3e6c4f2c7 100644 --- a/web/lib/settings-types.ts +++ b/web/lib/settings-types.ts @@ -83,6 +83,35 @@ export interface SettingsProjectTotals { userMessages: number; } +// ─── Model Benchmark And Local Outcome Data ───────────────────────────────── + +export interface SettingsModelBenchmark { + modelId: string; + swe_bench?: number | null; + swe_bench_verified?: number | null; + live_code_bench?: number | null; + human_eval?: number | null; + hle?: number | null; + aime_2026?: number | null; + gpqa?: number | null; + mmlu_pro?: number | null; + instruction_following?: number | null; + context_window?: number | null; + max_output_tokens?: number | null; + source?: string | null; +} + +export interface SettingsModelPerformance { + unitType: string; + modelId: string; + successes: number; + failures: number; + timeouts: number; + totalTokens: number; + totalCost: number; + lastUsed: string | null; +} + // ─── Effective Preferences ──────────────────────────────────────────────────── export interface SettingsPreferencesData { @@ -124,4 +153,6 @@ export interface SettingsData { budgetAllocation: SettingsBudgetAllocation; routingHistory: SettingsRoutingHistory | null; projectTotals: SettingsProjectTotals | null; + modelBenchmarks: SettingsModelBenchmark[]; + modelPerformance: SettingsModelPerformance[]; }