From b5764af27bb5fc0466fe6d34217f2965caf9402b Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Sat, 16 May 2026 17:00:13 +0200 Subject: [PATCH] sf snapshot: uncommitted changes after 33m inactivity --- packages/daemon/src/daemon.ts | 62 +++++ src/cli-web-branch.ts | 4 + src/cli.ts | 41 +++ .../extensions/sf/benchmark-coverage.js | 243 +++++++++++++++++ .../extensions/sf/bootstrap/register-hooks.js | 22 ++ src/resources/extensions/sf/gemini-catalog.js | 25 +- .../extensions/sf/model-catalog-cache.js | 50 ++-- .../extensions/sf/openai-codex-catalog.js | 21 +- .../sf/tests/benchmark-coverage.test.mjs | 249 ++++++++++++++++++ .../extensions/sf/uok/agent-runner.js | 20 ++ 10 files changed, 699 insertions(+), 38 deletions(-) create mode 100644 src/resources/extensions/sf/benchmark-coverage.js create mode 100644 src/resources/extensions/sf/tests/benchmark-coverage.test.mjs diff --git a/packages/daemon/src/daemon.ts b/packages/daemon/src/daemon.ts index ce47a3300..0473b2c4d 100644 --- a/packages/daemon/src/daemon.ts +++ b/packages/daemon/src/daemon.ts @@ -1,3 +1,4 @@ +import { spawn } from "node:child_process"; import { DiscordBot, validateDiscordConfig } from "./discord-bot.js"; import { EventBridge } from "./event-bridge.js"; import type { Logger } from "./logger.js"; @@ -6,6 +7,11 @@ import { scanForProjects } from "./project-scanner.js"; import { SessionManager } from "./session-manager.js"; import type { DaemonConfig, ProjectInfo } from "./types.js"; +/** Cadence for the model-catalog + benchmark-coverage maintenance timer. + * Matches the catalog cache TTL in model-catalog-cache.js (6 hours), so the + * spawned `sf --maintain` call is a no-op when the cache is still fresh. */ +const MAINTENANCE_INTERVAL_MS = 6 * 60 * 60 * 1000; + /** * Core daemon class — ties config + logger together with lifecycle management. * Registers SIGTERM/SIGINT handlers for clean shutdown. @@ -14,6 +20,7 @@ export class Daemon { private shuttingDown = false; private keepaliveTimer: ReturnType | undefined; private healthTimer: ReturnType | undefined; + private maintenanceTimer: ReturnType | undefined; private readonly onSigterm: () => void; private readonly onSigint: () => void; private sessionManager: SessionManager | undefined; @@ -25,6 +32,7 @@ export class Daemon { private readonly config: DaemonConfig, private readonly logger: Logger, private readonly healthIntervalMs: number = 300_000, + private readonly maintenanceIntervalMs: number = MAINTENANCE_INTERVAL_MS, ) { this.onSigterm = () => void this.shutdown(); this.onSigint = () => void this.shutdown(); @@ -127,6 +135,54 @@ export class Daemon { memory_rss_mb: Math.round(process.memoryUsage().rss / 1024 / 1024), }); }, this.healthIntervalMs); + + // Model-catalog + benchmark coverage maintenance — spawn `sf --maintain` + // every MAINTENANCE_INTERVAL_MS. Subprocess keeps the daemon decoupled + // from sf extension internals; the maintenance command is TTL-checked + // and idempotent, so back-to-back fires are no-ops. The first session + // launched after the daemon comes up still triggers register-hooks' + // session_start refresh path, so we don't lose the first 6h window by + // not firing on startup here. + this.maintenanceTimer = setInterval( + () => this.runMaintenance(), + this.maintenanceIntervalMs, + ); + } + + /** + * Spawn `sf --maintain` and log the outcome. Failures are logged but never + * thrown — the daemon's main work must never be blocked by maintenance. + */ + private runMaintenance(): void { + const child = spawn("sf", ["--maintain"], { + stdio: ["ignore", "pipe", "pipe"], + detached: false, + }); + let stdout = ""; + let stderr = ""; + child.stdout?.on("data", (chunk) => { + stdout += chunk.toString(); + }); + child.stderr?.on("data", (chunk) => { + stderr += chunk.toString(); + }); + child.on("error", (err) => { + this.logger.warn("maintenance spawn failed", { + error: err instanceof Error ? err.message : String(err), + }); + }); + child.on("exit", (code) => { + if (code === 0) { + this.logger.info("maintenance complete", { + stdout: stdout.trim().slice(0, 500), + }); + } else { + this.logger.warn("maintenance exited non-zero", { + code, + stderr: stderr.trim().slice(0, 500), + }); + } + }); } /** Scan configured project roots for project directories. */ @@ -171,6 +227,12 @@ export class Daemon { this.healthTimer = undefined; } + // Clear maintenance timer + if (this.maintenanceTimer) { + clearInterval(this.maintenanceTimer); + this.maintenanceTimer = undefined; + } + // Clear keepalive so the event loop can drain if (this.keepaliveTimer) { clearInterval(this.keepaliveTimer); diff --git a/src/cli-web-branch.ts b/src/cli-web-branch.ts index a73b69298..dbf1617bf 100644 --- a/src/cli-web-branch.ts +++ b/src/cli-web-branch.ts @@ -29,6 +29,8 @@ export interface CliFlags { model?: string; listModels?: string | true; discover?: boolean; + /** `sf --maintain` — refresh model catalogs and audit benchmark coverage, then exit. Idempotent / TTL-checked. Used by the daemon's periodic maintenance timer. */ + maintain?: boolean; extensions: string[]; appendSystemPrompt?: string; tools?: string[]; @@ -121,6 +123,8 @@ export function parseCliArgs(argv: string[]): CliFlags { i + 1 < args.length && !args[i + 1].startsWith("-") ? args[++i] : true; } else if (arg === "--discover") { flags.discover = true; + } else if (arg === "--maintain") { + flags.maintain = true; } else if (!arg.startsWith("--") && !arg.startsWith("-")) { flags.messages.push(arg); } diff --git a/src/cli.ts b/src/cli.ts index 036849315..cbb2affa3 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -779,6 +779,47 @@ if (cliFlags.listModels !== undefined) { process.exit(0); } +// --maintain: refresh model catalogs (TTL-checked) and audit benchmark +// coverage, then exit. Designed for the daemon's periodic maintenance timer: +// idempotent, never spawns a session, never touches user-visible state. +if (cliFlags.maintain) { + exitIfManagedResourcesAreNewer(agentDir); + const startedAt = Date.now(); + try { + const { runModelCatalogRefreshIfStale } = await import( + "./resources/extensions/sf/model-catalog-cache.js" + ); + const { runGeminiCatalogRefreshIfStale } = await import( + "./resources/extensions/sf/gemini-catalog.js" + ); + const { runOpenaiCodexCatalogRefreshIfStale } = await import( + "./resources/extensions/sf/openai-codex-catalog.js" + ); + const { getKeyManagerAuthStorage } = await import( + "./resources/extensions/sf/key-manager.js" + ); + const { computeBenchmarkCoverage, writeBenchmarkCoverage } = await import( + "./resources/extensions/sf/benchmark-coverage.js" + ); + await runModelCatalogRefreshIfStale(process.cwd(), getKeyManagerAuthStorage()); + await runGeminiCatalogRefreshIfStale(process.cwd()); + await runOpenaiCodexCatalogRefreshIfStale(process.cwd()); + const prefs = loadEffectiveSFPreferences()?.preferences ?? {}; + const coverage = computeBenchmarkCoverage(prefs); + writeBenchmarkCoverage(coverage); + const ms = Date.now() - startedAt; + process.stdout.write( + `[sf --maintain] catalog refresh + coverage audit done in ${ms}ms — coverage ${coverage.summary.coveredCount}/${coverage.summary.total} (${coverage.uncovered.length} uncovered)\n`, + ); + } catch (err) { + process.stderr.write( + `[sf --maintain] failed: ${err instanceof Error ? err.message : String(err)}\n`, + ); + process.exit(1); + } + process.exit(0); +} + // SF always uses quiet startup — the sf extension renders its own branded header if (!settingsManager.getQuietStartup()) { settingsManager.setQuietStartup(true); diff --git a/src/resources/extensions/sf/benchmark-coverage.js b/src/resources/extensions/sf/benchmark-coverage.js new file mode 100644 index 000000000..8b19c551a --- /dev/null +++ b/src/resources/extensions/sf/benchmark-coverage.js @@ -0,0 +1,243 @@ +/** + * benchmark-coverage.js — audit which catalog models have static benchmark data. + * + * Reads the global model-catalog (~/.sf/model-catalog/.json) and the + * static benchmark file (learning/data/model-benchmarks.json), applies the + * same user policy filters used by the dispatcher (isProviderAllowedByLists + + * isProviderModelAllowed), and writes ~/.sf/benchmark-coverage.json with the + * set of dispatchable models that have NO published benchmark numbers. + * + * Use: surface the gap so the static benchmark file can be kept current + * without guessing. The audit never blocks — it's a fire-and-forget at + * session_start (and, later, on the daemon maintenance timer). + */ +import { + existsSync, + mkdirSync, + readdirSync, + readFileSync, + writeFileSync, +} from "node:fs"; +import { dirname, join } from "node:path"; +import { + isProviderAllowedByLists, + isProviderModelAllowed, +} from "./preferences-models.js"; +import { sfHome } from "./sf-home.js"; + +const BENCHMARK_DATA_REL = ["learning", "data", "model-benchmarks.json"]; + +function catalogDir() { + return join(sfHome(), "model-catalog"); +} + +function coverageFilePath() { + return join(sfHome(), "benchmark-coverage.json"); +} + +function benchmarkDataPath() { + return join(import.meta.dirname, ...BENCHMARK_DATA_REL); +} + +/** + * Build a lowercase set of model ids that have any published benchmark data. + * Returns an empty Set when the file is missing or malformed. + */ +function loadBenchmarkKeys() { + const path = benchmarkDataPath(); + if (!existsSync(path)) return new Set(); + try { + const data = JSON.parse(readFileSync(path, "utf-8")); + return new Set( + Object.keys(data) + .filter((k) => k !== "_meta") + .map((k) => k.toLowerCase()), + ); + } catch { + return new Set(); + } +} + +/** + * Read every .json file under ~/.sf/model-catalog/ and flatten into + * { provider, id, cost? } entries. Returns [] if the catalog dir is missing. + */ +function loadCatalogEntries() { + const dir = catalogDir(); + if (!existsSync(dir)) return []; + const out = []; + for (const file of readdirSync(dir)) { + if (!file.endsWith(".json")) continue; + const provider = file.slice(0, -".json".length); + let parsed; + try { + parsed = JSON.parse(readFileSync(join(dir, file), "utf-8")); + } catch { + continue; + } + const entries = Array.isArray(parsed?.modelIds) ? parsed.modelIds : []; + for (const raw of entries) { + const id = typeof raw === "string" ? raw : raw?.id; + if (typeof id !== "string" || id.length === 0) continue; + out.push({ + provider, + id, + cost: typeof raw === "object" ? raw?.cost : undefined, + contextWindow: + typeof raw === "object" ? raw?.contextWindow : undefined, + }); + } + } + return out; +} + +/** + * Reduce an arbitrary route id to the bare model id used in the benchmark + * file. Lowercases, strips `/` prefix (openrouter style) and any + * `:free` suffix. + */ +export function normalizeForBenchmarkLookup(modelId) { + let key = modelId.toLowerCase(); + const slash = key.indexOf("/"); + if (slash !== -1) key = key.slice(slash + 1); + if (key.endsWith(":free")) key = key.slice(0, -":free".length); + return key; +} + +/** + * Compute coverage stats for the user's dispatchable model set. + * + * `prefs` is the merged preferences object; we read allowed_providers, + * blocked_providers, provider_model_allow, provider_model_block. + */ +export function computeBenchmarkCoverage(prefs) { + const benchmarkKeys = loadBenchmarkKeys(); + const catalog = loadCatalogEntries(); + const allowed = prefs?.allowed_providers; + const blocked = prefs?.blocked_providers; + const providerModelAllow = prefs?.provider_model_allow; + const providerModelBlock = prefs?.provider_model_block; + + const covered = []; + const uncovered = []; + + for (const entry of catalog) { + if (!isProviderAllowedByLists(entry.provider, allowed, blocked)) continue; + const modelObj = { + cost: entry.cost, + contextWindow: entry.contextWindow, + }; + if ( + !isProviderModelAllowed( + entry.provider, + entry.id, + providerModelAllow, + providerModelBlock, + modelObj, + ) + ) { + continue; + } + const key = normalizeForBenchmarkLookup(entry.id); + const bucket = benchmarkKeys.has(key) ? covered : uncovered; + bucket.push({ provider: entry.provider, id: entry.id }); + } + + const total = covered.length + uncovered.length; + return { + covered, + uncovered, + summary: { + total, + coveredCount: covered.length, + uncoveredCount: uncovered.length, + coverageRatio: total === 0 ? 0 : covered.length / total, + }, + }; +} + +/** + * Write the coverage report to ~/.sf/benchmark-coverage.json. Best-effort — + * never throws on disk failure. + */ +export function writeBenchmarkCoverage(coverage) { + const path = coverageFilePath(); + try { + mkdirSync(dirname(path), { recursive: true }); + const payload = { + schemaVersion: 1, + generated: new Date().toISOString(), + summary: coverage.summary, + uncovered: coverage.uncovered, + }; + writeFileSync(path, `${JSON.stringify(payload, null, 2)}\n`, "utf-8"); + } catch { + // Best-effort — never fail the caller. + } +} + +/** + * Compare a freshly computed coverage report against the previously written + * one. Returns true when the uncovered set has changed (additions OR removals). + * Used to gate change-driven notifications. + */ +export function detectCoverageChange(coverage) { + const path = coverageFilePath(); + if (!existsSync(path)) return true; + let prev; + try { + prev = JSON.parse(readFileSync(path, "utf-8")); + } catch { + return true; + } + const prevIds = new Set( + (Array.isArray(prev?.uncovered) ? prev.uncovered : []).map( + (m) => `${m.provider}/${m.id}`, + ), + ); + const currIds = new Set( + coverage.uncovered.map((m) => `${m.provider}/${m.id}`), + ); + if (prevIds.size !== currIds.size) return true; + for (const id of currIds) { + if (!prevIds.has(id)) return true; + } + return false; +} + +/** + * Fire-and-forget audit. Writes the coverage report and, if the uncovered set + * has changed since the previous run, invokes the optional `notify` callback + * with a one-line summary. Caller supplies notify so this module stays free of + * UI dependencies. + * + * scheduleBenchmarkCoverageAudit(prefs, (msg) => ctx.ui?.notify?.(msg, "info")) + */ +export function scheduleBenchmarkCoverageAudit(prefs, notify) { + setImmediate(() => { + try { + const coverage = computeBenchmarkCoverage(prefs); + const changed = detectCoverageChange(coverage); + writeBenchmarkCoverage(coverage); + if ( + changed && + coverage.uncovered.length > 0 && + typeof notify === "function" + ) { + const sample = coverage.uncovered + .slice(0, 3) + .map((m) => `${m.provider}/${m.id}`) + .join(", "); + const more = + coverage.uncovered.length > 3 + ? ` (+${coverage.uncovered.length - 3} more)` + : ""; + notify( + `Benchmark coverage: ${coverage.summary.coveredCount}/${coverage.summary.total} models have static scores. Uncovered: ${sample}${more}. See ~/.sf/benchmark-coverage.json`, + ); + } + } catch { + // Audit failures must never block session start. + } + }); +} diff --git a/src/resources/extensions/sf/bootstrap/register-hooks.js b/src/resources/extensions/sf/bootstrap/register-hooks.js index da1b321d1..a7eebb5e8 100644 --- a/src/resources/extensions/sf/bootstrap/register-hooks.js +++ b/src/resources/extensions/sf/bootstrap/register-hooks.js @@ -538,6 +538,28 @@ export function registerHooks(pi, ecosystemHandlers = []) { } catch { /* non-fatal — codex catalog refresh must never block session start */ } + // Audit benchmark coverage — compare the dispatchable model set + // (catalog ∩ user policy) against the static benchmark file and write + // ~/.sf/benchmark-coverage.json. Surfaces models routed via /v1/models + // discovery that don't yet have published benchmark numbers, so the + // static file can be kept current without guessing model quality. + try { + const { scheduleBenchmarkCoverageAudit } = await import( + "../benchmark-coverage.js" + ); + const { loadEffectiveSFPreferences } = await import( + "../preferences.js" + ); + const prefs = loadEffectiveSFPreferences() ?? {}; + scheduleBenchmarkCoverageAudit(prefs, (msg) => + ctx.ui?.notify?.(msg, "info", { + noticeKind: NOTICE_KIND.SYSTEM_NOTICE, + dedupe_key: "benchmark-coverage", + }), + ); + } catch { + /* non-fatal — benchmark audit must never block session start */ + } // Detect drift in source-of-truth markdown files since last session. try { const { detectMdFileDrift, formatDriftReport } = await import( diff --git a/src/resources/extensions/sf/gemini-catalog.js b/src/resources/extensions/sf/gemini-catalog.js index 24d765780..9a36ababc 100644 --- a/src/resources/extensions/sf/gemini-catalog.js +++ b/src/resources/extensions/sf/gemini-catalog.js @@ -77,18 +77,25 @@ export async function refreshGeminiCatalog(basePath) { } /** - * Fire-and-forget background refresh of the gemini-cli model catalog. Skipped - * if the on-disk cache is already fresh (within CATALOG_TTL_MS). + * Awaitable refresh — only fetches when the cache is stale. Returns the + * fresh model id list, the existing-stale list, or null on failure / when + * gemini-cli isn't available. + */ +export async function runGeminiCatalogRefreshIfStale(basePath) { + if (isCacheFresh()) return null; + try { + return await refreshGeminiCatalog(basePath); + } catch { + return null; + } +} + +/** + * Fire-and-forget background refresh. Skipped if the on-disk cache is fresh. * * Consumer: bootstrap/register-hooks.js session_start hook. */ export function scheduleGeminiCatalogRefresh(basePath) { if (isCacheFresh()) return; - setImmediate(async () => { - try { - await refreshGeminiCatalog(basePath); - } catch { - // Per-provider failure is silently swallowed. - } - }); + setImmediate(() => runGeminiCatalogRefreshIfStale(basePath)); } diff --git a/src/resources/extensions/sf/model-catalog-cache.js b/src/resources/extensions/sf/model-catalog-cache.js index 67a14ac35..4aec2de00 100644 --- a/src/resources/extensions/sf/model-catalog-cache.js +++ b/src/resources/extensions/sf/model-catalog-cache.js @@ -334,33 +334,39 @@ export async function refreshProviderCatalog(basePath, providerId, apiKey) { } /** - * Fire-and-forget background refresh for all discoverable providers that have - * a key in auth.json and a stale or absent cache. Safe to call at session - * start — never throws, never blocks. + * Awaitable refresh for all discoverable providers with a key in auth.json + * and a stale or absent cache. Per-provider failures are logged but never + * surfaced as exceptions — the loop is best-effort. */ -export function scheduleModelCatalogRefresh(basePath, auth) { - setImmediate(async () => { - for (const providerId of DISCOVERABLE_PROVIDER_IDS) { - try { - const creds = auth.getCredentialsForProvider(providerId); - const apiKey = creds.find((c) => c.type === "api_key" && c.key)?.key; - if (!apiKey) continue; - if (readCachedModelIds(basePath, providerId) !== null) continue; - const result = await refreshProviderCatalog(basePath, providerId, apiKey); - if (result === null) { - // Surface per-provider fetch failures so they don't silently disappear. - process.stderr.write( - `[model-catalog-cache] refresh failed for provider: ${providerId}\n`, - ); - } - } catch (err) { - // Per-provider failures must not crash the refresh loop, but should be visible. +export async function runModelCatalogRefreshIfStale(basePath, auth) { + for (const providerId of DISCOVERABLE_PROVIDER_IDS) { + try { + const creds = auth.getCredentialsForProvider(providerId); + const apiKey = creds.find((c) => c.type === "api_key" && c.key)?.key; + if (!apiKey) continue; + if (readCachedModelIds(basePath, providerId) !== null) continue; + const result = await refreshProviderCatalog(basePath, providerId, apiKey); + if (result === null) { + // Surface per-provider fetch failures so they don't silently disappear. process.stderr.write( - `[model-catalog-cache] unexpected error for provider ${providerId}: ${err?.message ?? err}\n`, + `[model-catalog-cache] refresh failed for provider: ${providerId}\n`, ); } + } catch (err) { + // Per-provider failures must not crash the refresh loop, but should be visible. + process.stderr.write( + `[model-catalog-cache] unexpected error for provider ${providerId}: ${err?.message ?? err}\n`, + ); } - }); + } +} + +/** + * Fire-and-forget background refresh. Wraps runModelCatalogRefreshIfStale in + * setImmediate. Safe to call at session start — never throws, never blocks. + */ +export function scheduleModelCatalogRefresh(basePath, auth) { + setImmediate(() => runModelCatalogRefreshIfStale(basePath, auth)); } /** diff --git a/src/resources/extensions/sf/openai-codex-catalog.js b/src/resources/extensions/sf/openai-codex-catalog.js index 5da4273a0..82cc3cebc 100644 --- a/src/resources/extensions/sf/openai-codex-catalog.js +++ b/src/resources/extensions/sf/openai-codex-catalog.js @@ -85,6 +85,19 @@ export async function refreshOpenaiCodexCatalog(_basePath) { return modelIds; } +/** + * Awaitable refresh — only fetches when the SF cache is stale. Returns the + * fresh model id list, or null on failure / no codex cache available. + */ +export async function runOpenaiCodexCatalogRefreshIfStale(basePath) { + if (isSfCacheFresh()) return null; + try { + return await refreshOpenaiCodexCatalog(basePath); + } catch { + return null; + } +} + /** * Fire-and-forget background refresh. Skipped if the SF cache is fresh. * @@ -92,11 +105,5 @@ export async function refreshOpenaiCodexCatalog(_basePath) { */ export function scheduleOpenaiCodexCatalogRefresh(basePath) { if (isSfCacheFresh()) return; - setImmediate(async () => { - try { - await refreshOpenaiCodexCatalog(basePath); - } catch { - // Per-provider failure is silently swallowed. - } - }); + setImmediate(() => runOpenaiCodexCatalogRefreshIfStale(basePath)); } diff --git a/src/resources/extensions/sf/tests/benchmark-coverage.test.mjs b/src/resources/extensions/sf/tests/benchmark-coverage.test.mjs new file mode 100644 index 000000000..6820b7336 --- /dev/null +++ b/src/resources/extensions/sf/tests/benchmark-coverage.test.mjs @@ -0,0 +1,249 @@ +/** + * benchmark-coverage.test.mjs + * + * Tests that computeBenchmarkCoverage partitions the dispatchable model set + * against the static benchmark file, applies the user policy filters + * (allowed_providers, provider_model_allow, provider_model_block, plus the + * built-in per-provider policy), and that writeBenchmarkCoverage / + * detectCoverageChange round-trip and notice change correctly. + */ +import assert from "node:assert/strict"; +import { + existsSync, + mkdirSync, + mkdtempSync, + readFileSync, + rmSync, + writeFileSync, +} from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, beforeEach, describe, test } from "vitest"; + +// Touch preferences.js so the prefs loader / lazy circular dep is wired. +import "../preferences.js"; +import { + computeBenchmarkCoverage, + detectCoverageChange, + normalizeForBenchmarkLookup, + writeBenchmarkCoverage, +} from "../benchmark-coverage.js"; + +// ─── Test isolation ────────────────────────────────────────────────────────── + +const tmpDirs = []; +let originalSfHome; + +beforeEach(() => { + originalSfHome = process.env.SF_HOME; +}); + +afterEach(() => { + while (tmpDirs.length > 0) { + rmSync(tmpDirs.pop(), { recursive: true, force: true }); + } + if (originalSfHome === undefined) delete process.env.SF_HOME; + else process.env.SF_HOME = originalSfHome; +}); + +function tempSfHome() { + const dir = mkdtempSync(join(tmpdir(), "sf-benchmark-coverage-test-")); + tmpDirs.push(dir); + process.env.SF_HOME = dir; + return dir; +} + +function writeCatalog(sfHome, providerId, modelEntries) { + const dir = join(sfHome, "model-catalog"); + mkdirSync(dir, { recursive: true }); + writeFileSync( + join(dir, `${providerId}.json`), + JSON.stringify({ + fetchedAt: new Date().toISOString(), + modelIds: modelEntries, + }), + "utf-8", + ); +} + +// ─── normalizeForBenchmarkLookup ───────────────────────────────────────────── + +describe("normalizeForBenchmarkLookup", () => { + test("lowercases plain ids", () => { + assert.equal(normalizeForBenchmarkLookup("MiniMax-M2.7"), "minimax-m2.7"); + }); + test("strips provider/ prefix (openrouter)", () => { + assert.equal( + normalizeForBenchmarkLookup("deepseek/deepseek-v4-flash"), + "deepseek-v4-flash", + ); + }); + test("strips :free suffix", () => { + assert.equal( + normalizeForBenchmarkLookup("qwen/qwen3-coder:free"), + "qwen3-coder", + ); + }); + test("preserves :Nb size suffix", () => { + assert.equal( + normalizeForBenchmarkLookup("deepseek-v3.1:671b"), + "deepseek-v3.1:671b", + ); + }); +}); + +// ─── computeBenchmarkCoverage ──────────────────────────────────────────────── + +describe("computeBenchmarkCoverage", () => { + test("partitions covered vs uncovered using the real benchmark file", () => { + const home = tempSfHome(); + // glm-4.5 is in the static benchmark file; bogus-not-in-bench-2026 is not. + writeCatalog(home, "zai", [{ id: "glm-4.5" }, { id: "bogus-not-in-bench-2026" }]); + + const result = computeBenchmarkCoverage({ + allowed_providers: ["zai"], + }); + + const coveredIds = result.covered.map((m) => m.id); + const uncoveredIds = result.uncovered.map((m) => m.id); + assert.ok(coveredIds.includes("glm-4.5"), "glm-4.5 should be covered"); + assert.ok( + uncoveredIds.includes("bogus-not-in-bench-2026"), + "bogus model should be uncovered", + ); + assert.equal(result.summary.total, 2); + assert.equal(result.summary.coveredCount, 1); + assert.equal(result.summary.uncoveredCount, 1); + assert.equal(result.summary.coverageRatio, 0.5); + }); + + test("allowed_providers excludes models from non-listed providers", () => { + const home = tempSfHome(); + writeCatalog(home, "zai", [{ id: "glm-4.5" }]); + writeCatalog(home, "anthropic", [{ id: "claude-sonnet-4-6" }]); + + const result = computeBenchmarkCoverage({ + allowed_providers: ["zai"], + }); + + const allIds = [...result.covered, ...result.uncovered].map( + (m) => `${m.provider}/${m.id}`, + ); + assert.ok(allIds.includes("zai/glm-4.5")); + assert.ok( + !allIds.some((id) => id.startsWith("anthropic/")), + "anthropic must not appear when not in allowed_providers", + ); + }); + + test("provider_model_block: ollama-cloud gemini-* is excluded", () => { + const home = tempSfHome(); + writeCatalog(home, "ollama-cloud", [ + { id: "deepseek-v4-flash" }, + { id: "gemini-3-flash-preview" }, + ]); + + const result = computeBenchmarkCoverage({ + allowed_providers: ["ollama-cloud"], + provider_model_block: { "ollama-cloud": ["gemini-*"] }, + }); + + const ids = [...result.covered, ...result.uncovered].map((m) => m.id); + assert.ok(ids.includes("deepseek-v4-flash")); + assert.ok( + !ids.includes("gemini-3-flash-preview"), + "gemini-3-flash-preview must be blocked", + ); + }); + + test("openrouter built-in policy keeps only :free / zero-cost", () => { + const home = tempSfHome(); + writeCatalog(home, "openrouter", [ + { id: "anthropic/claude-opus-4-7" }, // paid → blocked by built-in + { id: "qwen/qwen3-coder:free" }, // :free → allowed + { + id: "openrouter/zero-cost-model", + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, + }, // zero-cost → allowed + ]); + + const result = computeBenchmarkCoverage({ + allowed_providers: ["openrouter"], + }); + + const ids = [...result.covered, ...result.uncovered].map((m) => m.id); + assert.ok(!ids.includes("anthropic/claude-opus-4-7"), "paid claude blocked"); + assert.ok(ids.includes("qwen/qwen3-coder:free"), ":free allowed"); + assert.ok(ids.includes("openrouter/zero-cost-model"), "zero-cost allowed"); + }); + + test("returns empty when catalog dir is missing", () => { + tempSfHome(); // pinned but no catalog dir written + const result = computeBenchmarkCoverage({}); + assert.deepEqual(result.covered, []); + assert.deepEqual(result.uncovered, []); + assert.equal(result.summary.total, 0); + assert.equal(result.summary.coverageRatio, 0); + }); +}); + +// ─── writeBenchmarkCoverage + detectCoverageChange ─────────────────────────── + +describe("writeBenchmarkCoverage / detectCoverageChange", () => { + test("write + read round-trips", () => { + const home = tempSfHome(); + const coverage = { + covered: [{ provider: "zai", id: "glm-4.5" }], + uncovered: [{ provider: "zai", id: "bogus" }], + summary: { total: 2, coveredCount: 1, uncoveredCount: 1, coverageRatio: 0.5 }, + }; + writeBenchmarkCoverage(coverage); + const path = join(home, "benchmark-coverage.json"); + assert.ok(existsSync(path), "coverage file should be written"); + const parsed = JSON.parse(readFileSync(path, "utf-8")); + assert.equal(parsed.schemaVersion, 1); + assert.equal(parsed.summary.total, 2); + assert.deepEqual(parsed.uncovered, [{ provider: "zai", id: "bogus" }]); + }); + + test("detectCoverageChange returns true on first write (no prior file)", () => { + tempSfHome(); + assert.equal( + detectCoverageChange({ uncovered: [{ provider: "zai", id: "x" }] }), + true, + ); + }); + + test("detectCoverageChange returns false when uncovered set is unchanged", () => { + tempSfHome(); + const coverage = { + covered: [], + uncovered: [ + { provider: "zai", id: "x" }, + { provider: "kimi-coding", id: "y" }, + ], + summary: { total: 2, coveredCount: 0, uncoveredCount: 2, coverageRatio: 0 }, + }; + writeBenchmarkCoverage(coverage); + assert.equal(detectCoverageChange(coverage), false); + }); + + test("detectCoverageChange returns true when a new uncovered id appears", () => { + tempSfHome(); + writeBenchmarkCoverage({ + covered: [], + uncovered: [{ provider: "zai", id: "x" }], + summary: { total: 1, coveredCount: 0, uncoveredCount: 1, coverageRatio: 0 }, + }); + assert.equal( + detectCoverageChange({ + covered: [], + uncovered: [ + { provider: "zai", id: "x" }, + { provider: "zai", id: "y" }, + ], + }), + true, + ); + }); +}); diff --git a/src/resources/extensions/sf/uok/agent-runner.js b/src/resources/extensions/sf/uok/agent-runner.js index 7382f3765..92272a8c1 100644 --- a/src/resources/extensions/sf/uok/agent-runner.js +++ b/src/resources/extensions/sf/uok/agent-runner.js @@ -14,6 +14,7 @@ */ import { runSubagent } from "@singularity-forge/coding-agent"; +import { debugLog } from "../debug-logger.js"; const DEFAULT_MAX_CONTEXT_TURNS = 10; const DEFAULT_MAX_TURNS_PER_RUN = 5; @@ -173,6 +174,19 @@ export async function runAgentTurn(agent, opts = {}) { const allMessages = agent.receive(false); // all messages (read + unread) const target = allMessages.find((m) => m.id === onlyMessageId && !m.read); if (!target) { + // #sf-mp8g4rcd-w01tkh: silent early-return when target isn't in inbox. + // This is the chronic prompt-never-sent failure mode — caller swallows + // {turnsProcessed:0,response:null} as 'no work' and the LLM never runs. + // Surface the inbox state so the bus-instance / refresh-timing bug + // becomes debuggable. + debugLog("agent-runner", { + event: "silent-missing-message", + phase: "target-not-found", + agentName: agent.identity?.name, + onlyMessageId, + inboxSize: allMessages.length, + inboxIds: allMessages.map((m) => ({ id: m.id, read: m.read })), + }); return { turnsProcessed: 0, response: null }; } messages = [target]; @@ -181,6 +195,12 @@ export async function runAgentTurn(agent, opts = {}) { } if (messages.length === 0) { + debugLog("agent-runner", { + event: "silent-empty-inbox", + phase: "no-messages", + agentName: agent.identity?.name, + onlyMessageId: onlyMessageId ?? null, + }); return { turnsProcessed: 0, response: null }; }