sf snapshot: uncommitted changes after 33m inactivity

This commit is contained in:
Mikael Hugo 2026-05-16 17:00:13 +02:00
parent b73e386090
commit b5764af27b
10 changed files with 699 additions and 38 deletions

View file

@ -1,3 +1,4 @@
import { spawn } from "node:child_process";
import { DiscordBot, validateDiscordConfig } from "./discord-bot.js";
import { EventBridge } from "./event-bridge.js";
import type { Logger } from "./logger.js";
@ -6,6 +7,11 @@ import { scanForProjects } from "./project-scanner.js";
import { SessionManager } from "./session-manager.js";
import type { DaemonConfig, ProjectInfo } from "./types.js";
/** Cadence for the model-catalog + benchmark-coverage maintenance timer.
* Matches the catalog cache TTL in model-catalog-cache.js (6 hours), so the
* spawned `sf --maintain` call is a no-op when the cache is still fresh. */
const MAINTENANCE_INTERVAL_MS = 6 * 60 * 60 * 1000;
/**
* Core daemon class ties config + logger together with lifecycle management.
* Registers SIGTERM/SIGINT handlers for clean shutdown.
@ -14,6 +20,7 @@ export class Daemon {
private shuttingDown = false;
private keepaliveTimer: ReturnType<typeof setInterval> | undefined;
private healthTimer: ReturnType<typeof setInterval> | undefined;
private maintenanceTimer: ReturnType<typeof setInterval> | undefined;
private readonly onSigterm: () => void;
private readonly onSigint: () => void;
private sessionManager: SessionManager | undefined;
@ -25,6 +32,7 @@ export class Daemon {
private readonly config: DaemonConfig,
private readonly logger: Logger,
private readonly healthIntervalMs: number = 300_000,
private readonly maintenanceIntervalMs: number = MAINTENANCE_INTERVAL_MS,
) {
this.onSigterm = () => void this.shutdown();
this.onSigint = () => void this.shutdown();
@ -127,6 +135,54 @@ export class Daemon {
memory_rss_mb: Math.round(process.memoryUsage().rss / 1024 / 1024),
});
}, this.healthIntervalMs);
// Model-catalog + benchmark coverage maintenance — spawn `sf --maintain`
// every MAINTENANCE_INTERVAL_MS. Subprocess keeps the daemon decoupled
// from sf extension internals; the maintenance command is TTL-checked
// and idempotent, so back-to-back fires are no-ops. The first session
// launched after the daemon comes up still triggers register-hooks'
// session_start refresh path, so we don't lose the first 6h window by
// not firing on startup here.
this.maintenanceTimer = setInterval(
() => this.runMaintenance(),
this.maintenanceIntervalMs,
);
}
/**
* Spawn `sf --maintain` and log the outcome. Failures are logged but never
* thrown — the daemon's main work must never be blocked by maintenance.
*/
private runMaintenance(): void {
  // Buffer both output streams so the exit handler can log a bounded summary.
  let capturedOut = "";
  let capturedErr = "";

  const child = spawn("sf", ["--maintain"], {
    stdio: ["ignore", "pipe", "pipe"],
    detached: false,
  });

  child.stdout?.on("data", (chunk) => {
    capturedOut += chunk.toString();
  });
  child.stderr?.on("data", (chunk) => {
    capturedErr += chunk.toString();
  });

  // Spawn-level failure (e.g. `sf` missing from PATH) — warn, never throw.
  child.on("error", (err) => {
    this.logger.warn("maintenance spawn failed", {
      error: err instanceof Error ? err.message : String(err),
    });
  });

  // On exit, log success at info, anything non-zero at warn; output is
  // truncated to 500 chars so the log stays readable.
  child.on("exit", (code) => {
    if (code !== 0) {
      this.logger.warn("maintenance exited non-zero", {
        code,
        stderr: capturedErr.trim().slice(0, 500),
      });
      return;
    }
    this.logger.info("maintenance complete", {
      stdout: capturedOut.trim().slice(0, 500),
    });
  });
}
/** Scan configured project roots for project directories. */
@ -171,6 +227,12 @@ export class Daemon {
this.healthTimer = undefined;
}
// Clear maintenance timer
if (this.maintenanceTimer) {
clearInterval(this.maintenanceTimer);
this.maintenanceTimer = undefined;
}
// Clear keepalive so the event loop can drain
if (this.keepaliveTimer) {
clearInterval(this.keepaliveTimer);

View file

@ -29,6 +29,8 @@ export interface CliFlags {
model?: string;
listModels?: string | true;
discover?: boolean;
/** `sf --maintain` — refresh model catalogs and audit benchmark coverage, then exit. Idempotent / TTL-checked. Used by the daemon's periodic maintenance timer. */
maintain?: boolean;
extensions: string[];
appendSystemPrompt?: string;
tools?: string[];
@ -121,6 +123,8 @@ export function parseCliArgs(argv: string[]): CliFlags {
i + 1 < args.length && !args[i + 1].startsWith("-") ? args[++i] : true;
} else if (arg === "--discover") {
flags.discover = true;
} else if (arg === "--maintain") {
flags.maintain = true;
} else if (!arg.startsWith("--") && !arg.startsWith("-")) {
flags.messages.push(arg);
}

View file

@ -779,6 +779,47 @@ if (cliFlags.listModels !== undefined) {
process.exit(0);
}
// --maintain: refresh model catalogs (TTL-checked) and audit benchmark
// coverage, then exit. Designed for the daemon's periodic maintenance timer:
// idempotent, never spawns a session, never touches user-visible state.
if (cliFlags.maintain) {
// Guard helper runs first — behavior defined elsewhere; per its name it
// exits when managed resources on disk are newer than this binary.
exitIfManagedResourcesAreNewer(agentDir);
const startedAt = Date.now();
try {
// Lazy dynamic imports keep these extension modules off the normal CLI
// startup path; only the --maintain branch pays their load cost.
const { runModelCatalogRefreshIfStale } = await import(
"./resources/extensions/sf/model-catalog-cache.js"
);
const { runGeminiCatalogRefreshIfStale } = await import(
"./resources/extensions/sf/gemini-catalog.js"
);
const { runOpenaiCodexCatalogRefreshIfStale } = await import(
"./resources/extensions/sf/openai-codex-catalog.js"
);
const { getKeyManagerAuthStorage } = await import(
"./resources/extensions/sf/key-manager.js"
);
const { computeBenchmarkCoverage, writeBenchmarkCoverage } = await import(
"./resources/extensions/sf/benchmark-coverage.js"
);
// Each refresh helper is TTL-checked internally, so these awaits are cheap
// no-ops when the on-disk caches are still fresh.
await runModelCatalogRefreshIfStale(process.cwd(), getKeyManagerAuthStorage());
await runGeminiCatalogRefreshIfStale(process.cwd());
await runOpenaiCodexCatalogRefreshIfStale(process.cwd());
// NOTE(review): this unwraps `.preferences` before calling the audit, while
// the session_start hook passes the loader's return value straight through —
// confirm which shape computeBenchmarkCoverage expects; one call site is
// likely handing it the wrong object.
const prefs = loadEffectiveSFPreferences()?.preferences ?? {};
const coverage = computeBenchmarkCoverage(prefs);
writeBenchmarkCoverage(coverage);
const ms = Date.now() - startedAt;
process.stdout.write(
`[sf --maintain] catalog refresh + coverage audit done in ${ms}ms — coverage ${coverage.summary.coveredCount}/${coverage.summary.total} (${coverage.uncovered.length} uncovered)\n`,
);
} catch (err) {
// Any failure exits 1 with a one-line diagnostic; the daemon logs it.
process.stderr.write(
`[sf --maintain] failed: ${err instanceof Error ? err.message : String(err)}\n`,
);
process.exit(1);
}
process.exit(0);
}
// SF always uses quiet startup — the sf extension renders its own branded header
if (!settingsManager.getQuietStartup()) {
settingsManager.setQuietStartup(true);

View file

@ -0,0 +1,243 @@
/**
* benchmark-coverage.js — audit which catalog models have static benchmark data.
*
* Reads the global model-catalog (~/.sf/model-catalog/<provider>.json) and the
* static benchmark file (learning/data/model-benchmarks.json), applies the
* same user policy filters used by the dispatcher (isProviderAllowedByLists +
* isProviderModelAllowed), and writes ~/.sf/benchmark-coverage.json with the
* set of dispatchable models that have NO published benchmark numbers.
*
* Use: surface the gap so the static benchmark file can be kept current
* without guessing. The audit never blocks — it's a fire-and-forget at
* session_start (and, later, on the daemon maintenance timer).
*/
import {
existsSync,
mkdirSync,
readdirSync,
readFileSync,
writeFileSync,
} from "node:fs";
import { dirname, join } from "node:path";
import {
isProviderAllowedByLists,
isProviderModelAllowed,
} from "./preferences-models.js";
import { sfHome } from "./sf-home.js";
const BENCHMARK_DATA_REL = ["learning", "data", "model-benchmarks.json"];
/** Directory holding the per-provider catalog JSON files. */
function catalogDir() {
  const home = sfHome();
  return join(home, "model-catalog");
}
/** Location of the coverage report under the SF home directory. */
function coverageFilePath() {
  const home = sfHome();
  return join(home, "benchmark-coverage.json");
}
/** Absolute path of the static benchmark data file shipped next to this module. */
function benchmarkDataPath() {
  return join(import.meta.dirname, ...BENCHMARK_DATA_REL);
}
/**
* Build a lowercase set of model ids that have any published benchmark data.
* Returns an empty Set when the file is missing or malformed.
*/
/**
 * Build a lowercase Set of model ids that have any published benchmark data.
 * A missing or unparseable benchmark file yields an empty Set, never a throw.
 */
function loadBenchmarkKeys() {
  const file = benchmarkDataPath();
  if (!existsSync(file)) return new Set();
  try {
    const parsed = JSON.parse(readFileSync(file, "utf-8"));
    const keys = new Set();
    for (const name of Object.keys(parsed)) {
      // "_meta" is a reserved metadata entry, not a model id.
      if (name !== "_meta") keys.add(name.toLowerCase());
    }
    return keys;
  } catch {
    return new Set();
  }
}
/**
* Read every <provider>.json file under ~/.sf/model-catalog/ and flatten into
* { provider, id, cost? } entries. Returns [] if the catalog dir is missing.
*/
/**
 * Flatten every <provider>.json under ~/.sf/model-catalog/ into
 * { provider, id, cost?, contextWindow? } records.
 * Returns [] when the catalog dir is missing; corrupt provider files are
 * skipped rather than aborting the scan.
 */
function loadCatalogEntries() {
  const dir = catalogDir();
  if (!existsSync(dir)) return [];
  const results = [];
  for (const fileName of readdirSync(dir)) {
    if (!fileName.endsWith(".json")) continue;
    // Provider id is the file name minus the ".json" extension.
    const provider = fileName.slice(0, -".json".length);
    let doc;
    try {
      doc = JSON.parse(readFileSync(join(dir, fileName), "utf-8"));
    } catch {
      continue; // unreadable provider file — skip it
    }
    const modelIds = Array.isArray(doc?.modelIds) ? doc.modelIds : [];
    for (const entry of modelIds) {
      // Entries are either bare id strings or { id, cost, contextWindow } objects.
      const id = typeof entry === "string" ? entry : entry?.id;
      if (typeof id !== "string" || id.length === 0) continue;
      const isObject = typeof entry === "object";
      results.push({
        provider,
        id,
        cost: isObject ? entry?.cost : undefined,
        contextWindow: isObject ? entry?.contextWindow : undefined,
      });
    }
  }
  return results;
}
/**
* Reduce an arbitrary route id to the bare model id used in the benchmark
* file. Lowercases, strips `<provider>/` prefix (openrouter style) and any
* `:free` suffix.
*/
/**
 * Reduce an arbitrary route id to the bare model id used as the benchmark
 * lookup key: lowercase, drop everything up to the first "/" (openrouter
 * style provider prefix), and drop a trailing ":free" suffix.
 */
export function normalizeForBenchmarkLookup(modelId) {
  let normalized = modelId.toLowerCase();
  const firstSlash = normalized.indexOf("/");
  if (firstSlash >= 0) {
    normalized = normalized.substring(firstSlash + 1);
  }
  const FREE_SUFFIX = ":free";
  return normalized.endsWith(FREE_SUFFIX)
    ? normalized.slice(0, normalized.length - FREE_SUFFIX.length)
    : normalized;
}
/**
* Compute coverage stats for the user's dispatchable model set.
*
* `prefs` is the merged preferences object; we read allowed_providers,
* blocked_providers, provider_model_allow, provider_model_block.
*/
/**
 * Partition the user's dispatchable model set into covered / uncovered by
 * the static benchmark file.
 *
 * `prefs` is the merged preferences object; allowed_providers,
 * blocked_providers, provider_model_allow and provider_model_block are read
 * and applied in the same order the dispatcher uses.
 */
export function computeBenchmarkCoverage(prefs) {
  const benchmarkKeys = loadBenchmarkKeys();
  const allowedProviders = prefs?.allowed_providers;
  const blockedProviders = prefs?.blocked_providers;
  const modelAllow = prefs?.provider_model_allow;
  const modelBlock = prefs?.provider_model_block;

  const covered = [];
  const uncovered = [];

  for (const entry of loadCatalogEntries()) {
    // Provider-level policy first, then per-model policy.
    if (!isProviderAllowedByLists(entry.provider, allowedProviders, blockedProviders)) {
      continue;
    }
    const modelMeta = { cost: entry.cost, contextWindow: entry.contextWindow };
    const dispatchable = isProviderModelAllowed(
      entry.provider,
      entry.id,
      modelAllow,
      modelBlock,
      modelMeta,
    );
    if (!dispatchable) continue;

    const lookupKey = normalizeForBenchmarkLookup(entry.id);
    const record = { provider: entry.provider, id: entry.id };
    if (benchmarkKeys.has(lookupKey)) covered.push(record);
    else uncovered.push(record);
  }

  const total = covered.length + uncovered.length;
  return {
    covered,
    uncovered,
    summary: {
      total,
      coveredCount: covered.length,
      uncoveredCount: uncovered.length,
      // 0 when there are no dispatchable models at all (avoid 0/0 NaN).
      coverageRatio: total === 0 ? 0 : covered.length / total,
    },
  };
}
/**
* Write the coverage report to ~/.sf/benchmark-coverage.json. Best-effort —
* never throws on disk failure.
*/
/**
 * Persist the coverage report to ~/.sf/benchmark-coverage.json.
 * Best-effort: any filesystem failure is swallowed so callers never throw.
 * Only the summary and the uncovered list are written (schemaVersion 1).
 */
export function writeBenchmarkCoverage(coverage) {
  const target = coverageFilePath();
  try {
    mkdirSync(dirname(target), { recursive: true });
    const report = JSON.stringify(
      {
        schemaVersion: 1,
        generated: new Date().toISOString(),
        summary: coverage.summary,
        uncovered: coverage.uncovered,
      },
      null,
      2,
    );
    writeFileSync(target, `${report}\n`, "utf-8");
  } catch {
    // Best-effort — never fail the caller.
  }
}
/**
* Compare a freshly computed coverage report against the previously written
* one. Returns true when the uncovered set has changed (additions OR removals).
* Used to gate change-driven notifications.
*/
/**
 * Compare fresh coverage against the previously written report.
 * Returns true when the uncovered set differs (additions OR removals), and
 * also on first run or when the prior file is unreadable — both count as
 * "changed" so the first notification always fires.
 */
export function detectCoverageChange(coverage) {
  const reportPath = coverageFilePath();
  if (!existsSync(reportPath)) return true;
  let previous;
  try {
    previous = JSON.parse(readFileSync(reportPath, "utf-8"));
  } catch {
    return true;
  }
  const toKey = (m) => `${m.provider}/${m.id}`;
  const priorUncovered = Array.isArray(previous?.uncovered) ? previous.uncovered : [];
  const before = new Set(priorUncovered.map(toKey));
  const after = new Set(coverage.uncovered.map(toKey));
  // Equal sizes + full containment ⇔ identical sets.
  if (before.size !== after.size) return true;
  return [...after].some((key) => !before.has(key));
}
/**
* Fire-and-forget audit. Writes the coverage report and, if the uncovered set
* has changed since the previous run, invokes the optional `notify` callback
* with a one-line summary. Caller supplies notify so this module stays free of
* UI dependencies.
*
* scheduleBenchmarkCoverageAudit(prefs, (msg) => ctx.ui?.notify?.(msg, "info"))
*/
/**
 * Fire-and-forget audit. Computes coverage, detects change against the prior
 * report, writes the new report, and — only when the uncovered set changed
 * and is non-empty — invokes the optional `notify` callback with a one-line
 * summary. Caller supplies notify so this module stays free of UI deps.
 *
 * scheduleBenchmarkCoverageAudit(prefs, (msg) => ctx.ui?.notify?.(msg, "info"))
 */
export function scheduleBenchmarkCoverageAudit(prefs, notify) {
  setImmediate(() => {
    try {
      const coverage = computeBenchmarkCoverage(prefs);
      // Change detection must run BEFORE the write overwrites the prior report.
      const changed = detectCoverageChange(coverage);
      writeBenchmarkCoverage(coverage);
      const shouldNotify =
        changed && coverage.uncovered.length > 0 && typeof notify === "function";
      if (!shouldNotify) return;
      const names = coverage.uncovered.map((m) => `${m.provider}/${m.id}`);
      const sample = names.slice(0, 3).join(", ");
      const more = names.length > 3 ? ` (+${names.length - 3} more)` : "";
      notify(
        `Benchmark coverage: ${coverage.summary.coveredCount}/${coverage.summary.total} models have static scores. Uncovered: ${sample}${more}. See ~/.sf/benchmark-coverage.json`,
      );
    } catch {
      // Audit failures must never block session start.
    }
  });
}

View file

@ -538,6 +538,28 @@ export function registerHooks(pi, ecosystemHandlers = []) {
} catch {
/* non-fatal — codex catalog refresh must never block session start */
}
// Audit benchmark coverage — compare the dispatchable model set
// (catalog ∩ user policy) against the static benchmark file and write
// ~/.sf/benchmark-coverage.json. Surfaces models routed via /v1/models
// discovery that don't yet have published benchmark numbers, so the
// static file can be kept current without guessing model quality.
try {
const { scheduleBenchmarkCoverageAudit } = await import(
"../benchmark-coverage.js"
);
const { loadEffectiveSFPreferences } = await import(
"../preferences.js"
);
const prefs = loadEffectiveSFPreferences() ?? {};
scheduleBenchmarkCoverageAudit(prefs, (msg) =>
ctx.ui?.notify?.(msg, "info", {
noticeKind: NOTICE_KIND.SYSTEM_NOTICE,
dedupe_key: "benchmark-coverage",
}),
);
} catch {
/* non-fatal — benchmark audit must never block session start */
}
// Detect drift in source-of-truth markdown files since last session.
try {
const { detectMdFileDrift, formatDriftReport } = await import(

View file

@ -77,18 +77,25 @@ export async function refreshGeminiCatalog(basePath) {
}
/**
* Awaitable refresh — only fetches when the on-disk cache is stale (outside
* CATALOG_TTL_MS). Returns the fresh model id list, the existing-stale list,
* or null on failure / when gemini-cli isn't available.
*/
export async function runGeminiCatalogRefreshIfStale(basePath) {
  // Fresh cache — nothing to do, mirror the failure return value.
  if (isCacheFresh()) {
    return null;
  }
  let result = null;
  try {
    result = await refreshGeminiCatalog(basePath);
  } catch {
    // Refresh failures surface as null, never as a thrown error.
  }
  return result;
}
/**
* Fire-and-forget background refresh. Skipped if the on-disk cache is fresh.
*
* Consumer: bootstrap/register-hooks.js session_start hook.
*/
export function scheduleGeminiCatalogRefresh(basePath) {
if (isCacheFresh()) return;
setImmediate(async () => {
try {
await refreshGeminiCatalog(basePath);
} catch {
// Per-provider failure is silently swallowed.
}
});
setImmediate(() => runGeminiCatalogRefreshIfStale(basePath));
}

View file

@ -334,33 +334,39 @@ export async function refreshProviderCatalog(basePath, providerId, apiKey) {
}
/**
* Awaitable refresh for all discoverable providers with a key in auth.json
* and a stale or absent cache. Per-provider failures are logged but never
* surfaced as exceptions — the loop is best-effort.
*/
export function scheduleModelCatalogRefresh(basePath, auth) {
setImmediate(async () => {
for (const providerId of DISCOVERABLE_PROVIDER_IDS) {
try {
const creds = auth.getCredentialsForProvider(providerId);
const apiKey = creds.find((c) => c.type === "api_key" && c.key)?.key;
if (!apiKey) continue;
if (readCachedModelIds(basePath, providerId) !== null) continue;
const result = await refreshProviderCatalog(basePath, providerId, apiKey);
if (result === null) {
// Surface per-provider fetch failures so they don't silently disappear.
process.stderr.write(
`[model-catalog-cache] refresh failed for provider: ${providerId}\n`,
);
}
} catch (err) {
// Per-provider failures must not crash the refresh loop, but should be visible.
export async function runModelCatalogRefreshIfStale(basePath, auth) {
for (const providerId of DISCOVERABLE_PROVIDER_IDS) {
try {
const creds = auth.getCredentialsForProvider(providerId);
const apiKey = creds.find((c) => c.type === "api_key" && c.key)?.key;
if (!apiKey) continue;
if (readCachedModelIds(basePath, providerId) !== null) continue;
const result = await refreshProviderCatalog(basePath, providerId, apiKey);
if (result === null) {
// Surface per-provider fetch failures so they don't silently disappear.
process.stderr.write(
`[model-catalog-cache] unexpected error for provider ${providerId}: ${err?.message ?? err}\n`,
`[model-catalog-cache] refresh failed for provider: ${providerId}\n`,
);
}
} catch (err) {
// Per-provider failures must not crash the refresh loop, but should be visible.
process.stderr.write(
`[model-catalog-cache] unexpected error for provider ${providerId}: ${err?.message ?? err}\n`,
);
}
});
}
}
/**
* Fire-and-forget background refresh. Wraps runModelCatalogRefreshIfStale in
* setImmediate. Safe to call at session start — never throws, never blocks.
*/
export function scheduleModelCatalogRefresh(basePath, auth) {
  // Defer to the next tick; the returned promise is deliberately ignored.
  setImmediate(() => {
    void runModelCatalogRefreshIfStale(basePath, auth);
  });
}
/**

View file

@ -85,6 +85,19 @@ export async function refreshOpenaiCodexCatalog(_basePath) {
return modelIds;
}
/**
* Awaitable refresh — only fetches when the SF cache is stale. Returns the
* fresh model id list, or null on failure / no codex cache available.
*/
export async function runOpenaiCodexCatalogRefreshIfStale(basePath) {
  // SF-side cache still fresh — skip, mirroring the failure return value.
  if (isSfCacheFresh()) {
    return null;
  }
  let ids = null;
  try {
    ids = await refreshOpenaiCodexCatalog(basePath);
  } catch {
    // Failure → null, matching the gemini variant's contract.
  }
  return ids;
}
/**
* Fire-and-forget background refresh. Skipped if the SF cache is fresh.
*
@ -92,11 +105,5 @@ export async function refreshOpenaiCodexCatalog(_basePath) {
*/
export function scheduleOpenaiCodexCatalogRefresh(basePath) {
if (isSfCacheFresh()) return;
setImmediate(async () => {
try {
await refreshOpenaiCodexCatalog(basePath);
} catch {
// Per-provider failure is silently swallowed.
}
});
setImmediate(() => runOpenaiCodexCatalogRefreshIfStale(basePath));
}

View file

@ -0,0 +1,249 @@
/**
* benchmark-coverage.test.mjs
*
* Tests that computeBenchmarkCoverage partitions the dispatchable model set
* against the static benchmark file, applies the user policy filters
* (allowed_providers, provider_model_allow, provider_model_block, plus the
* built-in per-provider policy), and that writeBenchmarkCoverage /
* detectCoverageChange round-trip and notice change correctly.
*/
import assert from "node:assert/strict";
import {
existsSync,
mkdirSync,
mkdtempSync,
readFileSync,
rmSync,
writeFileSync,
} from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, beforeEach, describe, test } from "vitest";
// Touch preferences.js so the prefs loader / lazy circular dep is wired.
import "../preferences.js";
import {
computeBenchmarkCoverage,
detectCoverageChange,
normalizeForBenchmarkLookup,
writeBenchmarkCoverage,
} from "../benchmark-coverage.js";
// ─── Test isolation ──────────────────────────────────────────────────────────
// Temp dirs created by tests, torn down after each one.
const createdDirs = [];
let savedSfHome;
beforeEach(() => {
  // Remember the caller's SF_HOME so afterEach can restore it exactly.
  savedSfHome = process.env.SF_HOME;
});
afterEach(() => {
  // Remove every temp dir this test created, newest first.
  for (let dir = createdDirs.pop(); dir !== undefined; dir = createdDirs.pop()) {
    rmSync(dir, { recursive: true, force: true });
  }
  // Restore SF_HOME to its pre-test state (including "unset").
  if (savedSfHome === undefined) delete process.env.SF_HOME;
  else process.env.SF_HOME = savedSfHome;
});
/** Create a fresh temp dir and point SF_HOME at it for the current test. */
function tempSfHome() {
  const dir = mkdtempSync(join(tmpdir(), "sf-benchmark-coverage-test-"));
  createdDirs.push(dir);
  process.env.SF_HOME = dir;
  return dir;
}
/** Write a provider catalog fixture at <sfHome>/model-catalog/<provider>.json. */
function writeCatalog(sfHome, providerId, modelEntries) {
  const catalogPath = join(sfHome, "model-catalog");
  mkdirSync(catalogPath, { recursive: true });
  const payload = {
    fetchedAt: new Date().toISOString(),
    modelIds: modelEntries,
  };
  writeFileSync(join(catalogPath, `${providerId}.json`), JSON.stringify(payload), "utf-8");
}
// ─── normalizeForBenchmarkLookup ─────────────────────────────────────────────
describe("normalizeForBenchmarkLookup", () => {
  // Table of [test name, input id, expected normalized key].
  const cases = [
    ["lowercases plain ids", "MiniMax-M2.7", "minimax-m2.7"],
    [
      "strips provider/ prefix (openrouter)",
      "deepseek/deepseek-v4-flash",
      "deepseek-v4-flash",
    ],
    ["strips :free suffix", "qwen/qwen3-coder:free", "qwen3-coder"],
    ["preserves :Nb size suffix", "deepseek-v3.1:671b", "deepseek-v3.1:671b"],
  ];
  for (const [label, input, expected] of cases) {
    test(label, () => {
      assert.equal(normalizeForBenchmarkLookup(input), expected);
    });
  }
});
// ─── computeBenchmarkCoverage ────────────────────────────────────────────────
describe("computeBenchmarkCoverage", () => {
// NOTE(review): these tests read the REAL shipped benchmark file, so they
// assume glm-4.5 stays listed in learning/data/model-benchmarks.json — they
// break if the static file drops it. Consider pinning a fixture instead.
test("partitions covered vs uncovered using the real benchmark file", () => {
const home = tempSfHome();
// glm-4.5 is in the static benchmark file; bogus-not-in-bench-2026 is not.
writeCatalog(home, "zai", [{ id: "glm-4.5" }, { id: "bogus-not-in-bench-2026" }]);
const result = computeBenchmarkCoverage({
allowed_providers: ["zai"],
});
const coveredIds = result.covered.map((m) => m.id);
const uncoveredIds = result.uncovered.map((m) => m.id);
assert.ok(coveredIds.includes("glm-4.5"), "glm-4.5 should be covered");
assert.ok(
uncoveredIds.includes("bogus-not-in-bench-2026"),
"bogus model should be uncovered",
);
// 1 covered + 1 uncovered out of 2 ⇒ ratio exactly 0.5.
assert.equal(result.summary.total, 2);
assert.equal(result.summary.coveredCount, 1);
assert.equal(result.summary.uncoveredCount, 1);
assert.equal(result.summary.coverageRatio, 0.5);
});
// Provider-level allow-list: catalogs for non-listed providers must be
// filtered out entirely, covered or not.
test("allowed_providers excludes models from non-listed providers", () => {
const home = tempSfHome();
writeCatalog(home, "zai", [{ id: "glm-4.5" }]);
writeCatalog(home, "anthropic", [{ id: "claude-sonnet-4-6" }]);
const result = computeBenchmarkCoverage({
allowed_providers: ["zai"],
});
const allIds = [...result.covered, ...result.uncovered].map(
(m) => `${m.provider}/${m.id}`,
);
assert.ok(allIds.includes("zai/glm-4.5"));
assert.ok(
!allIds.some((id) => id.startsWith("anthropic/")),
"anthropic must not appear when not in allowed_providers",
);
});
// Per-model block list with a glob pattern ("gemini-*") must remove only
// the matching model, leaving the provider's other models dispatchable.
test("provider_model_block: ollama-cloud gemini-* is excluded", () => {
const home = tempSfHome();
writeCatalog(home, "ollama-cloud", [
{ id: "deepseek-v4-flash" },
{ id: "gemini-3-flash-preview" },
]);
const result = computeBenchmarkCoverage({
allowed_providers: ["ollama-cloud"],
provider_model_block: { "ollama-cloud": ["gemini-*"] },
});
const ids = [...result.covered, ...result.uncovered].map((m) => m.id);
assert.ok(ids.includes("deepseek-v4-flash"));
assert.ok(
!ids.includes("gemini-3-flash-preview"),
"gemini-3-flash-preview must be blocked",
);
});
// Built-in openrouter policy (inside isProviderModelAllowed, defined
// elsewhere): paid models are dropped, ":free" ids and zero-cost entries
// survive — verified here only through observable filtering.
test("openrouter built-in policy keeps only :free / zero-cost", () => {
const home = tempSfHome();
writeCatalog(home, "openrouter", [
{ id: "anthropic/claude-opus-4-7" }, // paid → blocked by built-in
{ id: "qwen/qwen3-coder:free" }, // :free → allowed
{
id: "openrouter/zero-cost-model",
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
}, // zero-cost → allowed
]);
const result = computeBenchmarkCoverage({
allowed_providers: ["openrouter"],
});
const ids = [...result.covered, ...result.uncovered].map((m) => m.id);
assert.ok(!ids.includes("anthropic/claude-opus-4-7"), "paid claude blocked");
assert.ok(ids.includes("qwen/qwen3-coder:free"), ":free allowed");
assert.ok(ids.includes("openrouter/zero-cost-model"), "zero-cost allowed");
});
// Degenerate case: no catalog dir at all ⇒ empty partitions, ratio 0 (not NaN).
test("returns empty when catalog dir is missing", () => {
tempSfHome(); // pinned but no catalog dir written
const result = computeBenchmarkCoverage({});
assert.deepEqual(result.covered, []);
assert.deepEqual(result.uncovered, []);
assert.equal(result.summary.total, 0);
assert.equal(result.summary.coverageRatio, 0);
});
});
// ─── writeBenchmarkCoverage + detectCoverageChange ───────────────────────────
describe("writeBenchmarkCoverage / detectCoverageChange", () => {
// Round-trip: the written JSON must carry schemaVersion, summary and the
// uncovered list (covered entries are intentionally NOT persisted).
test("write + read round-trips", () => {
const home = tempSfHome();
const coverage = {
covered: [{ provider: "zai", id: "glm-4.5" }],
uncovered: [{ provider: "zai", id: "bogus" }],
summary: { total: 2, coveredCount: 1, uncoveredCount: 1, coverageRatio: 0.5 },
};
writeBenchmarkCoverage(coverage);
const path = join(home, "benchmark-coverage.json");
assert.ok(existsSync(path), "coverage file should be written");
const parsed = JSON.parse(readFileSync(path, "utf-8"));
assert.equal(parsed.schemaVersion, 1);
assert.equal(parsed.summary.total, 2);
assert.deepEqual(parsed.uncovered, [{ provider: "zai", id: "bogus" }]);
});
// No prior report on disk ⇒ always treated as a change.
test("detectCoverageChange returns true on first write (no prior file)", () => {
tempSfHome();
assert.equal(
detectCoverageChange({ uncovered: [{ provider: "zai", id: "x" }] }),
true,
);
});
// Identical uncovered set across runs ⇒ no change, no notification.
test("detectCoverageChange returns false when uncovered set is unchanged", () => {
tempSfHome();
const coverage = {
covered: [],
uncovered: [
{ provider: "zai", id: "x" },
{ provider: "kimi-coding", id: "y" },
],
summary: { total: 2, coveredCount: 0, uncoveredCount: 2, coverageRatio: 0 },
};
writeBenchmarkCoverage(coverage);
assert.equal(detectCoverageChange(coverage), false);
});
// A newly uncovered id (set grew) must register as a change.
test("detectCoverageChange returns true when a new uncovered id appears", () => {
tempSfHome();
writeBenchmarkCoverage({
covered: [],
uncovered: [{ provider: "zai", id: "x" }],
summary: { total: 1, coveredCount: 0, uncoveredCount: 1, coverageRatio: 0 },
});
assert.equal(
detectCoverageChange({
covered: [],
uncovered: [
{ provider: "zai", id: "x" },
{ provider: "zai", id: "y" },
],
}),
true,
);
});
});

View file

@ -14,6 +14,7 @@
*/
import { runSubagent } from "@singularity-forge/coding-agent";
import { debugLog } from "../debug-logger.js";
const DEFAULT_MAX_CONTEXT_TURNS = 10;
const DEFAULT_MAX_TURNS_PER_RUN = 5;
@ -173,6 +174,19 @@ export async function runAgentTurn(agent, opts = {}) {
const allMessages = agent.receive(false); // all messages (read + unread)
const target = allMessages.find((m) => m.id === onlyMessageId && !m.read);
if (!target) {
// #sf-mp8g4rcd-w01tkh: silent early-return when target isn't in inbox.
// This is the chronic prompt-never-sent failure mode — caller swallows
// {turnsProcessed:0,response:null} as 'no work' and the LLM never runs.
// Surface the inbox state so the bus-instance / refresh-timing bug
// becomes debuggable.
debugLog("agent-runner", {
event: "silent-missing-message",
phase: "target-not-found",
agentName: agent.identity?.name,
onlyMessageId,
inboxSize: allMessages.length,
inboxIds: allMessages.map((m) => ({ id: m.id, read: m.read })),
});
return { turnsProcessed: 0, response: null };
}
messages = [target];
@ -181,6 +195,12 @@ export async function runAgentTurn(agent, opts = {}) {
}
if (messages.length === 0) {
debugLog("agent-runner", {
event: "silent-empty-inbox",
phase: "no-messages",
agentName: agent.identity?.name,
onlyMessageId: onlyMessageId ?? null,
});
return { turnsProcessed: 0, response: null };
}