sf snapshot: uncommitted changes after 33m inactivity
This commit is contained in:
parent
b73e386090
commit
b5764af27b
10 changed files with 699 additions and 38 deletions
|
|
@ -1,3 +1,4 @@
|
|||
import { spawn } from "node:child_process";
|
||||
import { DiscordBot, validateDiscordConfig } from "./discord-bot.js";
|
||||
import { EventBridge } from "./event-bridge.js";
|
||||
import type { Logger } from "./logger.js";
|
||||
|
|
@ -6,6 +7,11 @@ import { scanForProjects } from "./project-scanner.js";
|
|||
import { SessionManager } from "./session-manager.js";
|
||||
import type { DaemonConfig, ProjectInfo } from "./types.js";
|
||||
|
||||
/** Cadence for the model-catalog + benchmark-coverage maintenance timer.
|
||||
* Matches the catalog cache TTL in model-catalog-cache.js (6 hours), so the
|
||||
* spawned `sf --maintain` call is a no-op when the cache is still fresh. */
|
||||
const MAINTENANCE_INTERVAL_MS = 6 * 60 * 60 * 1000;
|
||||
|
||||
/**
|
||||
* Core daemon class — ties config + logger together with lifecycle management.
|
||||
* Registers SIGTERM/SIGINT handlers for clean shutdown.
|
||||
|
|
@ -14,6 +20,7 @@ export class Daemon {
|
|||
private shuttingDown = false;
|
||||
private keepaliveTimer: ReturnType<typeof setInterval> | undefined;
|
||||
private healthTimer: ReturnType<typeof setInterval> | undefined;
|
||||
private maintenanceTimer: ReturnType<typeof setInterval> | undefined;
|
||||
private readonly onSigterm: () => void;
|
||||
private readonly onSigint: () => void;
|
||||
private sessionManager: SessionManager | undefined;
|
||||
|
|
@ -25,6 +32,7 @@ export class Daemon {
|
|||
private readonly config: DaemonConfig,
|
||||
private readonly logger: Logger,
|
||||
private readonly healthIntervalMs: number = 300_000,
|
||||
private readonly maintenanceIntervalMs: number = MAINTENANCE_INTERVAL_MS,
|
||||
) {
|
||||
this.onSigterm = () => void this.shutdown();
|
||||
this.onSigint = () => void this.shutdown();
|
||||
|
|
@ -127,6 +135,54 @@ export class Daemon {
|
|||
memory_rss_mb: Math.round(process.memoryUsage().rss / 1024 / 1024),
|
||||
});
|
||||
}, this.healthIntervalMs);
|
||||
|
||||
// Model-catalog + benchmark coverage maintenance — spawn `sf --maintain`
|
||||
// every MAINTENANCE_INTERVAL_MS. Subprocess keeps the daemon decoupled
|
||||
// from sf extension internals; the maintenance command is TTL-checked
|
||||
// and idempotent, so back-to-back fires are no-ops. The first session
|
||||
// launched after the daemon comes up still triggers register-hooks'
|
||||
// session_start refresh path, so we don't lose the first 6h window by
|
||||
// not firing on startup here.
|
||||
this.maintenanceTimer = setInterval(
|
||||
() => this.runMaintenance(),
|
||||
this.maintenanceIntervalMs,
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Spawn `sf --maintain` and log the outcome. Failures are logged but never
 * thrown — the daemon's main work must never be blocked by maintenance.
 *
 * The outcome is reported on the child's "close" event rather than "exit":
 * "close" is emitted only after the piped stdout/stderr streams have ended,
 * so the logged excerpts cannot miss output still in flight at exit time.
 * Captured output is capped so a noisy child cannot grow daemon memory
 * without bound; only the first 500 trimmed characters are ever logged.
 */
private runMaintenance(): void {
  // Upper bound on characters retained per stream (far above the log excerpt).
  const maxCaptureChars = 64 * 1024;
  const logExcerptChars = 500;
  const describeError = (err: unknown): string =>
    err instanceof Error ? err.message : String(err);

  let child: ReturnType<typeof spawn>;
  try {
    child = spawn("sf", ["--maintain"], {
      stdio: ["ignore", "pipe", "pipe"],
      detached: false,
    });
  } catch (err) {
    // Defensive: some spawn failures are thrown synchronously instead of
    // being delivered through the "error" event. This runs inside a timer
    // callback, so an escaping exception would be unhandled.
    this.logger.warn("maintenance spawn failed", {
      error: describeError(err),
    });
    return;
  }

  let stdout = "";
  let stderr = "";
  child.stdout?.on("data", (chunk) => {
    if (stdout.length < maxCaptureChars) stdout += chunk.toString();
  });
  child.stderr?.on("data", (chunk) => {
    if (stderr.length < maxCaptureChars) stderr += chunk.toString();
  });

  // Once an "error" has been reported, skip the outcome log so a failed
  // spawn does not produce a second, misleading "exited non-zero" entry.
  let errored = false;
  child.on("error", (err) => {
    errored = true;
    this.logger.warn("maintenance spawn failed", {
      error: describeError(err),
    });
  });

  child.on("close", (code, signal) => {
    if (errored) return;
    if (code === 0) {
      this.logger.info("maintenance complete", {
        stdout: stdout.trim().slice(0, logExcerptChars),
      });
      return;
    }
    // `code` is null when the child was terminated by a signal; include the
    // signal so that case is distinguishable in the log.
    this.logger.warn("maintenance exited non-zero", {
      code,
      signal,
      stderr: stderr.trim().slice(0, logExcerptChars),
    });
  });
}
|
||||
|
||||
/** Scan configured project roots for project directories. */
|
||||
|
|
@ -171,6 +227,12 @@ export class Daemon {
|
|||
this.healthTimer = undefined;
|
||||
}
|
||||
|
||||
// Clear maintenance timer
|
||||
if (this.maintenanceTimer) {
|
||||
clearInterval(this.maintenanceTimer);
|
||||
this.maintenanceTimer = undefined;
|
||||
}
|
||||
|
||||
// Clear keepalive so the event loop can drain
|
||||
if (this.keepaliveTimer) {
|
||||
clearInterval(this.keepaliveTimer);
|
||||
|
|
|
|||
|
|
@ -29,6 +29,8 @@ export interface CliFlags {
|
|||
model?: string;
|
||||
listModels?: string | true;
|
||||
discover?: boolean;
|
||||
/** `sf --maintain` — refresh model catalogs and audit benchmark coverage, then exit. Idempotent / TTL-checked. Used by the daemon's periodic maintenance timer. */
|
||||
maintain?: boolean;
|
||||
extensions: string[];
|
||||
appendSystemPrompt?: string;
|
||||
tools?: string[];
|
||||
|
|
@ -121,6 +123,8 @@ export function parseCliArgs(argv: string[]): CliFlags {
|
|||
i + 1 < args.length && !args[i + 1].startsWith("-") ? args[++i] : true;
|
||||
} else if (arg === "--discover") {
|
||||
flags.discover = true;
|
||||
} else if (arg === "--maintain") {
|
||||
flags.maintain = true;
|
||||
} else if (!arg.startsWith("--") && !arg.startsWith("-")) {
|
||||
flags.messages.push(arg);
|
||||
}
|
||||
|
|
|
|||
41
src/cli.ts
41
src/cli.ts
|
|
@ -779,6 +779,47 @@ if (cliFlags.listModels !== undefined) {
|
|||
process.exit(0);
|
||||
}
|
||||
|
||||
// --maintain mode: bring the model catalogs up to date (each refresh is
// TTL-checked), recompute the benchmark coverage report, then exit. Built
// for the daemon's periodic maintenance timer: idempotent, starts no
// session, and leaves user-visible state alone.
if (cliFlags.maintain) {
  exitIfManagedResourcesAreNewer(agentDir);
  const beganAt = Date.now();
  // Exit status reported once the run has finished: 0 on success, 1 on any failure.
  let exitStatus = 0;
  try {
    // Modules are loaded lazily, one after another, so only --maintain pays for them.
    const catalogCache = await import(
      "./resources/extensions/sf/model-catalog-cache.js"
    );
    const geminiCatalog = await import(
      "./resources/extensions/sf/gemini-catalog.js"
    );
    const codexCatalog = await import(
      "./resources/extensions/sf/openai-codex-catalog.js"
    );
    const keyManager = await import(
      "./resources/extensions/sf/key-manager.js"
    );
    const benchmarkCoverage = await import(
      "./resources/extensions/sf/benchmark-coverage.js"
    );

    // Catalog refreshes run sequentially; each is a no-op while its cache is fresh.
    await catalogCache.runModelCatalogRefreshIfStale(
      process.cwd(),
      keyManager.getKeyManagerAuthStorage(),
    );
    await geminiCatalog.runGeminiCatalogRefreshIfStale(process.cwd());
    await codexCatalog.runOpenaiCodexCatalogRefreshIfStale(process.cwd());

    // Coverage audit against the effective user preferences.
    const effectivePrefs = loadEffectiveSFPreferences()?.preferences ?? {};
    const report = benchmarkCoverage.computeBenchmarkCoverage(effectivePrefs);
    benchmarkCoverage.writeBenchmarkCoverage(report);

    const elapsedMs = Date.now() - beganAt;
    const summaryLine = `[sf --maintain] catalog refresh + coverage audit done in ${elapsedMs}ms — coverage ${report.summary.coveredCount}/${report.summary.total} (${report.uncovered.length} uncovered)\n`;
    process.stdout.write(summaryLine);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    process.stderr.write(`[sf --maintain] failed: ${reason}\n`);
    exitStatus = 1;
  }
  process.exit(exitStatus);
}
|
||||
|
||||
// SF always uses quiet startup — the sf extension renders its own branded header
|
||||
if (!settingsManager.getQuietStartup()) {
|
||||
settingsManager.setQuietStartup(true);
|
||||
|
|
|
|||
243
src/resources/extensions/sf/benchmark-coverage.js
Normal file
243
src/resources/extensions/sf/benchmark-coverage.js
Normal file
|
|
@ -0,0 +1,243 @@
|
|||
/**
|
||||
* benchmark-coverage.js — audit which catalog models have static benchmark data.
|
||||
*
|
||||
* Reads the global model-catalog (~/.sf/model-catalog/<provider>.json) and the
|
||||
* static benchmark file (learning/data/model-benchmarks.json), applies the
|
||||
* same user policy filters used by the dispatcher (isProviderAllowedByLists +
|
||||
* isProviderModelAllowed), and writes ~/.sf/benchmark-coverage.json with the
|
||||
* set of dispatchable models that have NO published benchmark numbers.
|
||||
*
|
||||
* Use: surface the gap so the static benchmark file can be kept current
|
||||
* without guessing. The audit never blocks — it's a fire-and-forget at
|
||||
* session_start (and, later, on the daemon maintenance timer).
|
||||
*/
|
||||
import {
|
||||
existsSync,
|
||||
mkdirSync,
|
||||
readdirSync,
|
||||
readFileSync,
|
||||
writeFileSync,
|
||||
} from "node:fs";
|
||||
import { dirname, join } from "node:path";
|
||||
import {
|
||||
isProviderAllowedByLists,
|
||||
isProviderModelAllowed,
|
||||
} from "./preferences-models.js";
|
||||
import { sfHome } from "./sf-home.js";
|
||||
|
||||
const BENCHMARK_DATA_REL = ["learning", "data", "model-benchmarks.json"];
|
||||
|
||||
/** Path of the `model-catalog` directory directly under the SF home directory. */
function catalogDir() {
  const home = sfHome();
  return join(home, "model-catalog");
}
|
||||
|
||||
/** Path of the `benchmark-coverage.json` report directly under the SF home directory. */
function coverageFilePath() {
  const home = sfHome();
  return join(home, "benchmark-coverage.json");
}
|
||||
|
||||
/** Path of the static benchmark file, resolved relative to this module's own directory. */
function benchmarkDataPath() {
  const moduleDir = import.meta.dirname;
  return join(moduleDir, ...BENCHMARK_DATA_REL);
}
|
||||
|
||||
/**
 * Collect, lowercased, every top-level key of the static benchmark file
 * except the `_meta` entry. Yields an empty Set when the file does not exist
 * or cannot be read / parsed.
 */
function loadBenchmarkKeys() {
  const file = benchmarkDataPath();
  const keys = new Set();
  if (!existsSync(file)) return keys;
  try {
    const parsed = JSON.parse(readFileSync(file, "utf-8"));
    for (const name of Object.keys(parsed)) {
      if (name === "_meta") continue;
      keys.add(name.toLowerCase());
    }
  } catch {
    // Unreadable or malformed file: report no benchmark data at all.
    return new Set();
  }
  return keys;
}
|
||||
|
||||
/**
 * Read every `<provider>.json` file in the catalog directory and flatten the
 * `modelIds` arrays into `{ provider, id, cost, contextWindow }` entries. The
 * provider name is the file name without its `.json` extension.
 *
 * Each element of `modelIds` may be a plain id string or an object carrying
 * `id` and, optionally, `cost` / `contextWindow`; for string elements the two
 * optional fields are `undefined`.
 *
 * Best-effort: returns [] when the catalog directory is missing or cannot be
 * listed, and skips individual files that cannot be read or parsed.
 */
function loadCatalogEntries() {
  const dir = catalogDir();
  if (!existsSync(dir)) return [];

  let files;
  try {
    files = readdirSync(dir);
  } catch {
    // The path exists but cannot be listed (e.g. it is not a directory or is
    // unreadable): treat it the same as a missing catalog.
    return [];
  }

  const out = [];
  for (const file of files) {
    if (!file.endsWith(".json")) continue;
    const provider = file.slice(0, -".json".length);

    let parsed;
    try {
      parsed = JSON.parse(readFileSync(join(dir, file), "utf-8"));
    } catch {
      continue; // One bad provider file must not hide the others.
    }

    const entries = Array.isArray(parsed?.modelIds) ? parsed.modelIds : [];
    for (const raw of entries) {
      const id = typeof raw === "string" ? raw : raw?.id;
      if (typeof id !== "string" || id.length === 0) continue;
      const isObject = typeof raw === "object";
      out.push({
        provider,
        id,
        cost: isObject ? raw?.cost : undefined,
        contextWindow: isObject ? raw?.contextWindow : undefined,
      });
    }
  }
  return out;
}
|
||||
|
||||
/**
 * Turn a route id into the bare key used for benchmark lookups: lowercase
 * it, drop everything up to and including the first `/` (the
 * `<provider>/` prefix, openrouter style), and remove a trailing `:free`.
 * Other `:` suffixes (e.g. `:671b`) are left untouched.
 */
export function normalizeForBenchmarkLookup(modelId) {
  const freeSuffix = ":free";
  const lowered = modelId.toLowerCase();
  const firstSlash = lowered.indexOf("/");
  const bare = firstSlash === -1 ? lowered : lowered.slice(firstSlash + 1);
  return bare.endsWith(freeSuffix)
    ? bare.slice(0, bare.length - freeSuffix.length)
    : bare;
}
|
||||
|
||||
/**
 * Partition the catalog models that pass the policy filters into those whose
 * normalized id appears in the benchmark key set and those that do not.
 *
 * `prefs` is the merged preferences object; the fields read from it are
 * allowed_providers, blocked_providers, provider_model_allow and
 * provider_model_block.
 *
 * Returns `{ covered, uncovered, summary }`, where both lists hold
 * `{ provider, id }` pairs and `summary` carries total / coveredCount /
 * uncoveredCount / coverageRatio (0 when there are no models).
 */
export function computeBenchmarkCoverage(prefs) {
  const benchmarked = loadBenchmarkKeys();
  const entries = loadCatalogEntries();
  const providerAllowList = prefs?.allowed_providers;
  const providerBlockList = prefs?.blocked_providers;
  const modelAllowMap = prefs?.provider_model_allow;
  const modelBlockMap = prefs?.provider_model_block;

  const covered = [];
  const uncovered = [];

  for (const entry of entries) {
    const { provider, id } = entry;
    // Provider-level filter first, then the per-model filter.
    const dispatchable =
      isProviderAllowedByLists(provider, providerAllowList, providerBlockList) &&
      isProviderModelAllowed(provider, id, modelAllowMap, modelBlockMap, {
        cost: entry.cost,
        contextWindow: entry.contextWindow,
      });
    if (!dispatchable) continue;

    const hasBenchmark = benchmarked.has(normalizeForBenchmarkLookup(id));
    (hasBenchmark ? covered : uncovered).push({ provider, id });
  }

  const coveredCount = covered.length;
  const uncoveredCount = uncovered.length;
  const total = coveredCount + uncoveredCount;
  const coverageRatio = total === 0 ? 0 : coveredCount / total;

  return {
    covered,
    uncovered,
    summary: { total, coveredCount, uncoveredCount, coverageRatio },
  };
}
|
||||
|
||||
/**
 * Persist the coverage report (schema version, generation timestamp, summary
 * and the uncovered list) as pretty-printed JSON at the coverage file path,
 * creating the parent directory if needed. Best-effort: disk errors are
 * swallowed rather than propagated to the caller.
 */
export function writeBenchmarkCoverage(coverage) {
  const target = coverageFilePath();
  try {
    mkdirSync(dirname(target), { recursive: true });
    const serialized = JSON.stringify(
      {
        schemaVersion: 1,
        generated: new Date().toISOString(),
        summary: coverage.summary,
        uncovered: coverage.uncovered,
      },
      null,
      2,
    );
    writeFileSync(target, `${serialized}\n`, "utf-8");
  } catch {
    // Deliberately ignored — writing the report must not fail the caller.
  }
}
|
||||
|
||||
/**
 * Compare a freshly computed coverage report against the previously written
 * one. Returns true when the uncovered set has changed (additions OR
 * removals). Used to gate change-driven notifications.
 *
 * The previous report counts as "changed" whenever it cannot be trusted: the
 * file is missing, is not valid JSON, or its `uncovered` array contains an
 * entry that is not a `{ provider, id }` object. A previous report without an
 * `uncovered` array is treated as having an empty uncovered set.
 */
export function detectCoverageChange(coverage) {
  const path = coverageFilePath();
  if (!existsSync(path)) return true;

  let prev;
  try {
    prev = JSON.parse(readFileSync(path, "utf-8"));
  } catch {
    return true;
  }

  const keyOf = (m) => `${m.provider}/${m.id}`;
  const prevEntries = Array.isArray(prev?.uncovered) ? prev.uncovered : [];
  // A hand-edited or truncated report may hold null / non-object entries.
  // Reading `.provider` off those would throw; report a change instead so the
  // caller goes on to overwrite the bad file.
  if (prevEntries.some((m) => m === null || typeof m !== "object")) return true;

  const prevIds = new Set(prevEntries.map(keyOf));
  const currIds = new Set(coverage.uncovered.map(keyOf));
  if (prevIds.size !== currIds.size) return true;
  for (const id of currIds) {
    if (!prevIds.has(id)) return true;
  }
  return false;
}
|
||||
|
||||
/**
 * Queue a coverage audit via setImmediate and return at once. The audit
 * computes the report, checks it against the previously written one, then
 * writes the new report. `notify` (optional) is called with a one-line
 * summary only when the uncovered set changed and is non-empty. The caller
 * provides `notify`, which keeps this module free of UI dependencies. Any
 * exception raised during the audit is swallowed.
 *
 *   scheduleBenchmarkCoverageAudit(prefs, (msg) => ctx.ui?.notify?.(msg, "info"))
 */
export function scheduleBenchmarkCoverageAudit(prefs, notify) {
  const runAudit = () => {
    try {
      const report = computeBenchmarkCoverage(prefs);
      // Compare against the old file BEFORE overwriting it.
      const differs = detectCoverageChange(report);
      writeBenchmarkCoverage(report);

      if (!differs) return;
      if (!(report.uncovered.length > 0)) return;
      if (typeof notify !== "function") return;

      const gaps = report.uncovered;
      const preview = gaps
        .slice(0, 3)
        .map((m) => `${m.provider}/${m.id}`)
        .join(", ");
      let overflow = "";
      if (gaps.length > 3) {
        overflow = ` (+${gaps.length - 3} more)`;
      }
      notify(
        `Benchmark coverage: ${report.summary.coveredCount}/${report.summary.total} models have static scores. Uncovered: ${preview}${overflow}. See ~/.sf/benchmark-coverage.json`,
      );
    } catch {
      // Audit failures must never block session start.
    }
  };
  setImmediate(runAudit);
}
|
||||
|
|
@ -538,6 +538,28 @@ export function registerHooks(pi, ecosystemHandlers = []) {
|
|||
} catch {
|
||||
/* non-fatal — codex catalog refresh must never block session start */
|
||||
}
|
||||
// Audit benchmark coverage — compare the dispatchable model set
// (catalog ∩ user policy) against the static benchmark file and write
// ~/.sf/benchmark-coverage.json. Surfaces models routed via /v1/models
// discovery that don't yet have published benchmark numbers, so the
// static file can be kept current without guessing model quality.
try {
  const { scheduleBenchmarkCoverageAudit } = await import(
    "../benchmark-coverage.js"
  );
  const { loadEffectiveSFPreferences } = await import("../preferences.js");
  // The audit reads allowed_providers / blocked_providers /
  // provider_model_* directly off the object it is given, so hand it the
  // `.preferences` payload rather than the loader's wrapper result (the
  // `sf --maintain` path unwraps it the same way). Passing the wrapper
  // would leave every policy list undefined and skip the user's filters.
  const prefs = loadEffectiveSFPreferences()?.preferences ?? {};
  scheduleBenchmarkCoverageAudit(prefs, (msg) =>
    ctx.ui?.notify?.(msg, "info", {
      noticeKind: NOTICE_KIND.SYSTEM_NOTICE,
      dedupe_key: "benchmark-coverage",
    }),
  );
} catch {
  /* non-fatal — benchmark audit must never block session start */
}
|
||||
// Detect drift in source-of-truth markdown files since last session.
|
||||
try {
|
||||
const { detectMdFileDrift, formatDriftReport } = await import(
|
||||
|
|
|
|||
|
|
@ -77,18 +77,25 @@ export async function refreshGeminiCatalog(basePath) {
|
|||
}
|
||||
|
||||
/**
 * Awaitable refresh of the gemini-cli model catalog. Does nothing and
 * resolves to null while the on-disk cache is still fresh; otherwise resolves
 * to whatever refreshGeminiCatalog yields, or to null if that call throws.
 */
export async function runGeminiCatalogRefreshIfStale(basePath) {
  let outcome = null;
  if (!isCacheFresh()) {
    try {
      outcome = await refreshGeminiCatalog(basePath);
    } catch {
      outcome = null;
    }
  }
  return outcome;
}
|
||||
|
||||
/**
 * Fire-and-forget background refresh. Skipped if the on-disk cache is fresh.
 *
 * Consumer: bootstrap/register-hooks.js session_start hook.
 */
export function scheduleGeminiCatalogRefresh(basePath) {
  if (isCacheFresh()) return;
  // Queue exactly one refresh. runGeminiCatalogRefreshIfStale re-checks
  // freshness and swallows refresh failures itself, so no extra try/catch or
  // second direct refreshGeminiCatalog call is needed here.
  setImmediate(() => {
    void runGeminiCatalogRefreshIfStale(basePath);
  });
}
|
||||
|
|
|
|||
|
|
@ -334,33 +334,39 @@ export async function refreshProviderCatalog(basePath, providerId, apiKey) {
|
|||
}
|
||||
|
||||
/**
 * Awaitable refresh for all discoverable providers with a key in auth.json
 * and a stale or absent cache. Per-provider failures are logged but never
 * surfaced as exceptions — the loop is best-effort.
 *
 * Providers are handled one at a time, in DISCOVERABLE_PROVIDER_IDS order.
 * A provider is skipped when it has no api_key credential or when
 * readCachedModelIds still returns a value for it.
 */
export async function runModelCatalogRefreshIfStale(basePath, auth) {
  for (const providerId of DISCOVERABLE_PROVIDER_IDS) {
    try {
      const creds = auth.getCredentialsForProvider(providerId);
      const apiKey = creds.find((c) => c.type === "api_key" && c.key)?.key;
      if (!apiKey) continue;
      if (readCachedModelIds(basePath, providerId) !== null) continue;
      const result = await refreshProviderCatalog(basePath, providerId, apiKey);
      if (result === null) {
        // Surface per-provider fetch failures so they don't silently disappear.
        process.stderr.write(
          `[model-catalog-cache] refresh failed for provider: ${providerId}\n`,
        );
      }
    } catch (err) {
      // Per-provider failures must not crash the refresh loop, but should be visible.
      process.stderr.write(
        `[model-catalog-cache] unexpected error for provider ${providerId}: ${err?.message ?? err}\n`,
      );
    }
  }
}
|
||||
|
||||
/**
 * Background variant of runModelCatalogRefreshIfStale: defers the call with
 * setImmediate and returns immediately, without waiting for the refresh.
 * Safe to call at session start — never throws, never blocks.
 */
export function scheduleModelCatalogRefresh(basePath, auth) {
  const refreshLater = () => runModelCatalogRefreshIfStale(basePath, auth);
  setImmediate(refreshLater);
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -85,6 +85,19 @@ export async function refreshOpenaiCodexCatalog(_basePath) {
|
|||
return modelIds;
|
||||
}
|
||||
|
||||
/**
 * Awaitable refresh of the OpenAI Codex catalog. Does nothing and resolves to
 * null while the SF cache is fresh; otherwise resolves to whatever
 * refreshOpenaiCodexCatalog yields, or to null if that call throws.
 */
export async function runOpenaiCodexCatalogRefreshIfStale(basePath) {
  let outcome = null;
  if (!isSfCacheFresh()) {
    try {
      outcome = await refreshOpenaiCodexCatalog(basePath);
    } catch {
      outcome = null;
    }
  }
  return outcome;
}
|
||||
|
||||
/**
|
||||
* Fire-and-forget background refresh. Skipped if the SF cache is fresh.
|
||||
*
|
||||
|
|
@ -92,11 +105,5 @@ export async function refreshOpenaiCodexCatalog(_basePath) {
|
|||
*/
|
||||
export function scheduleOpenaiCodexCatalogRefresh(basePath) {
  if (isSfCacheFresh()) return;
  // Queue exactly one refresh. runOpenaiCodexCatalogRefreshIfStale re-checks
  // freshness and swallows refresh failures itself, so no extra try/catch or
  // second direct refreshOpenaiCodexCatalog call is needed here.
  setImmediate(() => {
    void runOpenaiCodexCatalogRefreshIfStale(basePath);
  });
}
|
||||
|
|
|
|||
249
src/resources/extensions/sf/tests/benchmark-coverage.test.mjs
Normal file
249
src/resources/extensions/sf/tests/benchmark-coverage.test.mjs
Normal file
|
|
@ -0,0 +1,249 @@
|
|||
/**
|
||||
* benchmark-coverage.test.mjs
|
||||
*
|
||||
* Tests that computeBenchmarkCoverage partitions the dispatchable model set
|
||||
* against the static benchmark file, applies the user policy filters
|
||||
* (allowed_providers, provider_model_allow, provider_model_block, plus the
|
||||
* built-in per-provider policy), and that writeBenchmarkCoverage /
|
||||
* detectCoverageChange round-trip and notice change correctly.
|
||||
*/
|
||||
import assert from "node:assert/strict";
|
||||
import {
|
||||
existsSync,
|
||||
mkdirSync,
|
||||
mkdtempSync,
|
||||
readFileSync,
|
||||
rmSync,
|
||||
writeFileSync,
|
||||
} from "node:fs";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { afterEach, beforeEach, describe, test } from "vitest";
|
||||
|
||||
// Touch preferences.js so the prefs loader / lazy circular dep is wired.
|
||||
import "../preferences.js";
|
||||
import {
|
||||
computeBenchmarkCoverage,
|
||||
detectCoverageChange,
|
||||
normalizeForBenchmarkLookup,
|
||||
writeBenchmarkCoverage,
|
||||
} from "../benchmark-coverage.js";
|
||||
|
||||
// ─── Test isolation ──────────────────────────────────────────────────────────
|
||||
|
||||
// Temp directories created by the current test; removed again in afterEach.
const tmpDirs = [];
// SF_HOME as it was before the current test started (undefined when unset).
let originalSfHome;

beforeEach(() => {
  originalSfHome = process.env.SF_HOME;
});

afterEach(() => {
  // Remove temp dirs newest-first.
  let dir = tmpDirs.pop();
  while (dir !== undefined) {
    rmSync(dir, { recursive: true, force: true });
    dir = tmpDirs.pop();
  }
  // Put SF_HOME back exactly as it was, including "not set".
  if (originalSfHome !== undefined) {
    process.env.SF_HOME = originalSfHome;
  } else {
    delete process.env.SF_HOME;
  }
});
|
||||
|
||||
/**
 * Create a fresh temp directory, register it for cleanup, point SF_HOME at
 * it, and return its path.
 */
function tempSfHome() {
  const prefix = join(tmpdir(), "sf-benchmark-coverage-test-");
  const home = mkdtempSync(prefix);
  tmpDirs.push(home);
  process.env.SF_HOME = home;
  return home;
}
|
||||
|
||||
/**
 * Write `<sfHome>/model-catalog/<providerId>.json` containing the given
 * model entries under `modelIds`, stamped with the current time as
 * `fetchedAt`. Creates the catalog directory if needed.
 */
function writeCatalog(sfHome, providerId, modelEntries) {
  const catalogRoot = join(sfHome, "model-catalog");
  mkdirSync(catalogRoot, { recursive: true });
  const contents = JSON.stringify({
    fetchedAt: new Date().toISOString(),
    modelIds: modelEntries,
  });
  const target = join(catalogRoot, `${providerId}.json`);
  writeFileSync(target, contents, "utf-8");
}
|
||||
|
||||
// ─── normalizeForBenchmarkLookup ─────────────────────────────────────────────
|
||||
|
||||
describe("normalizeForBenchmarkLookup", () => {
  // One row per case: [test title, input id, expected lookup key].
  const cases = [
    ["lowercases plain ids", "MiniMax-M2.7", "minimax-m2.7"],
    [
      "strips provider/ prefix (openrouter)",
      "deepseek/deepseek-v4-flash",
      "deepseek-v4-flash",
    ],
    ["strips :free suffix", "qwen/qwen3-coder:free", "qwen3-coder"],
    ["preserves :Nb size suffix", "deepseek-v3.1:671b", "deepseek-v3.1:671b"],
  ];
  for (const [title, input, expected] of cases) {
    test(title, () => {
      assert.equal(normalizeForBenchmarkLookup(input), expected);
    });
  }
});
|
||||
|
||||
// ─── computeBenchmarkCoverage ────────────────────────────────────────────────
|
||||
|
||||
describe("computeBenchmarkCoverage", () => {
  // Both buckets of a coverage result, concatenated (covered first).
  const everyEntry = (report) => [...report.covered, ...report.uncovered];

  test("partitions covered vs uncovered using the real benchmark file", () => {
    const sfDir = tempSfHome();
    // glm-4.5 is in the static benchmark file; bogus-not-in-bench-2026 is not.
    writeCatalog(sfDir, "zai", [
      { id: "glm-4.5" },
      { id: "bogus-not-in-bench-2026" },
    ]);

    const { covered, uncovered, summary } = computeBenchmarkCoverage({
      allowed_providers: ["zai"],
    });

    assert.ok(
      covered.map((m) => m.id).includes("glm-4.5"),
      "glm-4.5 should be covered",
    );
    assert.ok(
      uncovered.map((m) => m.id).includes("bogus-not-in-bench-2026"),
      "bogus model should be uncovered",
    );
    assert.equal(summary.total, 2);
    assert.equal(summary.coveredCount, 1);
    assert.equal(summary.uncoveredCount, 1);
    assert.equal(summary.coverageRatio, 0.5);
  });

  test("allowed_providers excludes models from non-listed providers", () => {
    const sfDir = tempSfHome();
    writeCatalog(sfDir, "zai", [{ id: "glm-4.5" }]);
    writeCatalog(sfDir, "anthropic", [{ id: "claude-sonnet-4-6" }]);

    const report = computeBenchmarkCoverage({ allowed_providers: ["zai"] });

    const qualified = everyEntry(report).map((m) => `${m.provider}/${m.id}`);
    assert.ok(qualified.includes("zai/glm-4.5"));
    assert.ok(
      !qualified.some((id) => id.startsWith("anthropic/")),
      "anthropic must not appear when not in allowed_providers",
    );
  });

  test("provider_model_block: ollama-cloud gemini-* is excluded", () => {
    const sfDir = tempSfHome();
    writeCatalog(sfDir, "ollama-cloud", [
      { id: "deepseek-v4-flash" },
      { id: "gemini-3-flash-preview" },
    ]);

    const report = computeBenchmarkCoverage({
      allowed_providers: ["ollama-cloud"],
      provider_model_block: { "ollama-cloud": ["gemini-*"] },
    });

    const seen = everyEntry(report).map((m) => m.id);
    assert.ok(seen.includes("deepseek-v4-flash"));
    assert.ok(
      !seen.includes("gemini-3-flash-preview"),
      "gemini-3-flash-preview must be blocked",
    );
  });

  test("openrouter built-in policy keeps only :free / zero-cost", () => {
    const sfDir = tempSfHome();
    const paid = { id: "anthropic/claude-opus-4-7" }; // paid → blocked by built-in
    const freeTagged = { id: "qwen/qwen3-coder:free" }; // :free → allowed
    const zeroCost = {
      id: "openrouter/zero-cost-model",
      cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
    }; // zero-cost → allowed
    writeCatalog(sfDir, "openrouter", [paid, freeTagged, zeroCost]);

    const report = computeBenchmarkCoverage({
      allowed_providers: ["openrouter"],
    });

    const seen = everyEntry(report).map((m) => m.id);
    assert.ok(!seen.includes("anthropic/claude-opus-4-7"), "paid claude blocked");
    assert.ok(seen.includes("qwen/qwen3-coder:free"), ":free allowed");
    assert.ok(seen.includes("openrouter/zero-cost-model"), "zero-cost allowed");
  });

  test("returns empty when catalog dir is missing", () => {
    tempSfHome(); // pinned but no catalog dir written
    const { covered, uncovered, summary } = computeBenchmarkCoverage({});
    assert.deepEqual(covered, []);
    assert.deepEqual(uncovered, []);
    assert.equal(summary.total, 0);
    assert.equal(summary.coverageRatio, 0);
  });
});
|
||||
|
||||
// ─── writeBenchmarkCoverage + detectCoverageChange ───────────────────────────
|
||||
|
||||
describe("writeBenchmarkCoverage / detectCoverageChange", () => {
  test("write + read round-trips", () => {
    const home = tempSfHome();
    const coverage = {
      covered: [{ provider: "zai", id: "glm-4.5" }],
      uncovered: [{ provider: "zai", id: "bogus" }],
      summary: { total: 2, coveredCount: 1, uncoveredCount: 1, coverageRatio: 0.5 },
    };
    writeBenchmarkCoverage(coverage);
    const path = join(home, "benchmark-coverage.json");
    assert.ok(existsSync(path), "coverage file should be written");
    const parsed = JSON.parse(readFileSync(path, "utf-8"));
    assert.equal(parsed.schemaVersion, 1);
    assert.equal(parsed.summary.total, 2);
    assert.deepEqual(parsed.uncovered, [{ provider: "zai", id: "bogus" }]);
  });

  test("detectCoverageChange returns true on first write (no prior file)", () => {
    tempSfHome();
    assert.equal(
      detectCoverageChange({ uncovered: [{ provider: "zai", id: "x" }] }),
      true,
    );
  });

  test("detectCoverageChange returns false when uncovered set is unchanged", () => {
    tempSfHome();
    const coverage = {
      covered: [],
      uncovered: [
        { provider: "zai", id: "x" },
        { provider: "kimi-coding", id: "y" },
      ],
      summary: { total: 2, coveredCount: 0, uncoveredCount: 2, coverageRatio: 0 },
    };
    writeBenchmarkCoverage(coverage);
    assert.equal(detectCoverageChange(coverage), false);
  });

  test("detectCoverageChange returns true when a new uncovered id appears", () => {
    tempSfHome();
    writeBenchmarkCoverage({
      covered: [],
      uncovered: [{ provider: "zai", id: "x" }],
      summary: { total: 1, coveredCount: 0, uncoveredCount: 1, coverageRatio: 0 },
    });
    assert.equal(
      detectCoverageChange({
        covered: [],
        uncovered: [
          { provider: "zai", id: "x" },
          { provider: "zai", id: "y" },
        ],
      }),
      true,
    );
  });

  // detectCoverageChange documents "additions OR removals"; the two cases
  // below pin the removal half and the same-size swap, which the size check
  // alone would not catch.
  test("detectCoverageChange returns true when an uncovered id is removed", () => {
    tempSfHome();
    writeBenchmarkCoverage({
      covered: [],
      uncovered: [
        { provider: "zai", id: "x" },
        { provider: "zai", id: "y" },
      ],
      summary: { total: 2, coveredCount: 0, uncoveredCount: 2, coverageRatio: 0 },
    });
    assert.equal(
      detectCoverageChange({
        covered: [],
        uncovered: [{ provider: "zai", id: "x" }],
      }),
      true,
    );
  });

  test("detectCoverageChange returns true when an id is swapped (same size)", () => {
    tempSfHome();
    writeBenchmarkCoverage({
      covered: [],
      uncovered: [{ provider: "zai", id: "x" }],
      summary: { total: 1, coveredCount: 0, uncoveredCount: 1, coverageRatio: 0 },
    });
    assert.equal(
      detectCoverageChange({
        covered: [],
        uncovered: [{ provider: "zai", id: "y" }],
      }),
      true,
    );
  });
});
|
||||
|
|
@ -14,6 +14,7 @@
|
|||
*/
|
||||
|
||||
import { runSubagent } from "@singularity-forge/coding-agent";
|
||||
import { debugLog } from "../debug-logger.js";
|
||||
|
||||
const DEFAULT_MAX_CONTEXT_TURNS = 10;
|
||||
const DEFAULT_MAX_TURNS_PER_RUN = 5;
|
||||
|
|
@ -173,6 +174,19 @@ export async function runAgentTurn(agent, opts = {}) {
|
|||
const allMessages = agent.receive(false); // all messages (read + unread)
|
||||
const target = allMessages.find((m) => m.id === onlyMessageId && !m.read);
|
||||
if (!target) {
|
||||
// #sf-mp8g4rcd-w01tkh: silent early-return when target isn't in inbox.
|
||||
// This is the chronic prompt-never-sent failure mode — caller swallows
|
||||
// {turnsProcessed:0,response:null} as 'no work' and the LLM never runs.
|
||||
// Surface the inbox state so the bus-instance / refresh-timing bug
|
||||
// becomes debuggable.
|
||||
debugLog("agent-runner", {
|
||||
event: "silent-missing-message",
|
||||
phase: "target-not-found",
|
||||
agentName: agent.identity?.name,
|
||||
onlyMessageId,
|
||||
inboxSize: allMessages.length,
|
||||
inboxIds: allMessages.map((m) => ({ id: m.id, read: m.read })),
|
||||
});
|
||||
return { turnsProcessed: 0, response: null };
|
||||
}
|
||||
messages = [target];
|
||||
|
|
@ -181,6 +195,12 @@ export async function runAgentTurn(agent, opts = {}) {
|
|||
}
|
||||
|
||||
if (messages.length === 0) {
|
||||
debugLog("agent-runner", {
|
||||
event: "silent-empty-inbox",
|
||||
phase: "no-messages",
|
||||
agentName: agent.identity?.name,
|
||||
onlyMessageId: onlyMessageId ?? null,
|
||||
});
|
||||
return { turnsProcessed: 0, response: null };
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue