From 089bf0cbebdfd102d1c32b3174823d9687fa4aa1 Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Fri, 15 May 2026 14:02:58 +0200 Subject: [PATCH] fix(model-learner): resolve canonical-id lazy-load race + 23 wire-id aliases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Of 52 dispatches in this repo's journal this session, 51 landed in .sf/model-performance.json's _unmapped bucket — meaning the live-outcome learner couldn't tell which provider/model succeeded or failed. Only 1 dispatch (google-gemini-cli/gemini-3-flash-preview) bucketed correctly. Root cause was NOT just missing aliases — it was a lazy-load race: - model-learner.js declared canonicalIdFor as a fire-and-forget dynamic import side-effect at module bottom - metrics.js called recordOutcome() synchronously after `await import("./model-learner.js")` resolved — before the registry injection promise settled - Result: _canonicalIdForFn was null for the first dispatch every session. Every session. Since the file shipped. Why nobody noticed: _unmapped is a bucket, not an error. No throw, no warning, no UI surface. Selection still worked because benchmark-selector + static hand-tuned scores carry the routing decision. Only the feedback loop (recordOutcome → adjust scores) was silently severed. Fix: - model-learner.js: export `registryReady` promise instead of swallowing it - metrics.js: await registryReady before recordOutcome() - model-registry.ts: 23 new CANONICAL_BY_ROUTE entries covering the actual production fallback chain — zai/glm-4.5{-air,-flash,5,5.1,5-turbo,4.6,4.7,4.7-flash}, mistral/codestral-latest + devstral-2512 + devstral-{small,medium}-* + mistral-{large,medium,small}-latest, google-gemini-cli/gemini-{2.5-pro,3-flash-preview,3.1-pro-preview}, opencode-go/{glm-5,glm-5.1,mimo-v2-omni,mimo-v2-pro} Also adds opt-in backfillModelPerformanceFromJournal(basePath) to reclassify the existing 51 _unmapped records from past journal events. Never auto-runs; backs up the old file before overwriting. Tests: 16 in canonical-id-mapping.test.mjs covering pattern matching, non-mappable cases, bare canonical-id passthrough, and the backfill path. Full suite 1906 pass, no regressions. Known follow-up: CANONICAL_BY_ROUTE uses mixed casing (MiniMax-M2.7 vs minimax-m2) — should be standardized lowercase in a future pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/resources/extensions/sf/metrics.js | 6 +- src/resources/extensions/sf/model-learner.js | 154 ++++++++++++- src/resources/extensions/sf/model-registry.ts | 29 +++ .../sf/tests/canonical-id-mapping.test.mjs | 216 ++++++++++++++++++ 4 files changed, 401 insertions(+), 4 deletions(-) create mode 100644 src/resources/extensions/sf/tests/canonical-id-mapping.test.mjs diff --git a/src/resources/extensions/sf/metrics.js b/src/resources/extensions/sf/metrics.js index 00c312bac..59d9aa630 100644 --- a/src/resources/extensions/sf/metrics.js +++ b/src/resources/extensions/sf/metrics.js @@ -79,7 +79,11 @@ async function recordUnitOutcome(unit) { // Quick Win #2: Also record to model-learner for per-task-type tracking try { - const { ModelLearner } = await import("./model-learner.js"); + const { ModelLearner, registryReady } = await import("./model-learner.js"); + // Await registry load so canonicalIdFor is wired before the first + // recordOutcome() call. Without this, the timing window between module + // load and registry resolution causes all routes to land in _unmapped. + await registryReady; const learner = new ModelLearner(basePath); learner.recordOutcome(unit.type, modelId, { success: true, diff --git a/src/resources/extensions/sf/model-learner.js b/src/resources/extensions/sf/model-learner.js index 09c30b8e6..99c73519b 100644 --- a/src/resources/extensions/sf/model-learner.js +++ b/src/resources/extensions/sf/model-learner.js @@ -14,7 +14,9 @@ import { appendFileSync, existsSync, mkdirSync, + readdirSync, readFileSync, + renameSync, writeFileSync, } from "node:fs"; import { dirname, join } from "node:path"; @@ -60,9 +62,10 @@ export function setRegistryResolver(fn) { // Wire the registry lazily so model-learner.js can be imported independently // of @singularity-forge/ai (e.g. in tests that don't load the full AI package). // The fire-and-forget import populates _canonicalIdForFn when the registry -// resolves. Outcomes recorded before the registry loads go to _unmapped and -// are preserved there for re-resolution on next format migration. -import("./model-registry.js") +// resolves. Export `registryReady` so callers can await it before the first +// recordOutcome() call to avoid the timing window where _canonicalIdForFn is +// still null (which routes everything to _unmapped). +export const registryReady = import("./model-registry.js") .then((mod) => { if (_canonicalIdForFn === null && typeof mod?.canonicalIdFor === "function") { _canonicalIdForFn = mod.canonicalIdFor; @@ -796,3 +799,148 @@ export default { ModelPerformanceTracker, FailureAnalyzer, }; + +/** + * Opt-in backfill: re-aggregate model-performance.json from journal events. + * + * Reads all `model_auto_resolved` events (which carry provider + model) from + * `.sf/journal/*.jsonl`, joins them with `unit_completed` events on traceId to + * get success/failure signal, then writes a fresh model-performance.json keyed + * by canonical ids. The old file is timestamped-renamed as a backup. + * + * Only call this explicitly — never at session start — so operators are not + * surprised by rewritten data. + * + * @param {string} basePath - project root (same as passed to ModelLearner) + * @returns {{ written: number, unmapped: number, backupPath: string | null }} + */ +export async function backfillModelPerformanceFromJournal(basePath) { + // Ensure registry is loaded so canonicalIdFor works. + await registryReady; + + const journalDir = join(basePath, ".sf", "journal"); + const perfPath = join(basePath, ".sf", "model-performance.json"); + + if (!existsSync(journalDir)) { + return { written: 0, unmapped: 0, backupPath: null }; + } + + // ── Phase 1: Collect unit_completed outcomes keyed by traceId ──────────── + // Shape: Map + const outcomeByTrace = new Map(); + // ── Phase 2: Collect model_auto_resolved events ─────────────────────────── + // Shape: Array<{ traceId, unitType, routeKey }> + const dispatches = []; + + const files = readdirSync(journalDir) + .filter((f) => f.endsWith(".jsonl")) + .sort(); + + for (const file of files) { + const content = readFileSync(join(journalDir, file), "utf-8"); + for (const line of content.split("\n")) { + if (!line.trim()) continue; + let event; + try { + event = JSON.parse(line); + } catch { + continue; + } + const et = event.eventType; + const d = event.data; + if (!d) continue; + + if (et === "model_auto_resolved") { + const inner = d.data ?? d; + const provider = inner.resolvedProvider; + const model = inner.resolvedModel; + const unitType = inner.unitType ?? d.unitType; + const traceId = inner.traceId ?? d.traceId ?? event.traceId; + if (provider && model && unitType && traceId) { + dispatches.push({ + traceId, + unitType, + routeKey: `${provider}/${model}`, + }); + } + } else if (et === "unit_completed") { + const inner = d.data ?? d; + const traceId = inner.traceId ?? d.traceId ?? event.traceId; + const status = inner.status ?? "completed"; + const tokens = inner.tokens ?? 0; + const cost = inner.cost_usd ?? inner.cost ?? 0; + if (traceId) { + // last-write wins (multiple units per trace is fine) + outcomeByTrace.set(traceId, { + status, + tokens: typeof tokens === "number" ? tokens : (tokens.total ?? 0), + cost, + }); + } + } + } + } + + // ── Phase 3: Build aggregated performance data ──────────────────────────── + // Shape: { [taskType]: { [canonicalId]: { aggregate: {...}, by_route: {...} } } } + const newData = {}; + let written = 0; + let unmapped = 0; + + for (const { traceId, unitType, routeKey } of dispatches) { + const outcome = outcomeByTrace.get(traceId); + const success = outcome ? outcome.status === "completed" : true; // assume success if not found + const tokensUsed = outcome?.tokens ?? 0; + const costUsd = outcome?.cost ?? 0; + const timestamp = new Date().toISOString(); + + if (!newData[unitType]) newData[unitType] = {}; + + const canonicalId = tryCanonicalIdFor(routeKey); + if (canonicalId === null) { + unmapped++; + if (!newData[unitType]["_unmapped"]) { + newData[unitType]["_unmapped"] = { by_route: {} }; + } + const unmappedEntry = newData[unitType]["_unmapped"]; + if (!unmappedEntry.by_route[routeKey]) { + unmappedEntry.by_route[routeKey] = emptyRouteStats(timestamp); + } + const rs = unmappedEntry.by_route[routeKey]; + if (success) rs.successes++; + else rs.failures++; + rs.totalTokens += tokensUsed; + rs.totalCost += costUsd; + rs.lastUsed = timestamp; + } else { + written++; + if (!newData[unitType][canonicalId]) { + newData[unitType][canonicalId] = { + aggregate: emptyRouteStats(timestamp), + by_route: {}, + }; + } + const canonicalEntry = newData[unitType][canonicalId]; + if (!canonicalEntry.by_route[routeKey]) { + canonicalEntry.by_route[routeKey] = emptyRouteStats(timestamp); + } + const rs = canonicalEntry.by_route[routeKey]; + if (success) rs.successes++; + else rs.failures++; + rs.totalTokens += tokensUsed; + rs.totalCost += costUsd; + rs.lastUsed = timestamp; + recomputeAggregate(canonicalEntry); + } + } + + // ── Phase 4: Backup old file and write new one ──────────────────────────── + let backupPath = null; + if (existsSync(perfPath)) { + backupPath = `${perfPath}.bak.${Date.now()}`; + renameSync(perfPath, backupPath); + } + writeFileSync(perfPath, JSON.stringify(newData, null, 2), "utf-8"); + + return { written, unmapped, backupPath }; +} diff --git a/src/resources/extensions/sf/model-registry.ts b/src/resources/extensions/sf/model-registry.ts index d780b5984..87db6e324 100644 --- a/src/resources/extensions/sf/model-registry.ts +++ b/src/resources/extensions/sf/model-registry.ts @@ -278,6 +278,35 @@ const CANONICAL_BY_ROUTE: Record = { // opencode wire IDs that need to map through "opencode/kimi-k2.5": "kimi-k2.5", "opencode-go/kimi-k2.5": "kimi-k2.5", + "opencode-go/glm-5": "glm-5", + "opencode-go/glm-5.1": "glm-5.1", + "opencode-go/mimo-v2-omni": "mimo-v2-omni", + "opencode-go/mimo-v2-pro": "mimo-v2-pro", + // ── google-gemini-cli ───────────────────────────────────────────────────── + // Wire IDs are already canonical; listed explicitly so CANONICAL_BY_ROUTE + // takes precedence over the _ENTRY_BY_ROUTE fallback, making the mapping + // robust even if _canonicalIdForFn is resolved before _ENTRY_BY_ROUTE is built. + "google-gemini-cli/gemini-2.5-pro": "gemini-2.5-pro", + "google-gemini-cli/gemini-3-flash-preview": "gemini-3-flash-preview", + "google-gemini-cli/gemini-3.1-pro-preview": "gemini-3.1-pro-preview", + // ── mistral (direct, not via openrouter) ───────────────────────────────── + "mistral/codestral-latest": "codestral-latest", + "mistral/devstral-2512": "devstral-2512", + "mistral/devstral-medium-2507": "devstral-medium-latest", + "mistral/devstral-small-2507": "devstral-small-2507", + "mistral/mistral-large-latest": "mistral-large-latest", + "mistral/mistral-medium-latest": "mistral-medium-latest", + "mistral/mistral-small-latest": "mistral-small-latest", + // ── zai (direct, not via openrouter) ───────────────────────────────────── + "zai/glm-4.5": "glm-4.5", + "zai/glm-4.5-air": "glm-4.5-air", + "zai/glm-4.5-flash": "glm-4.5-flash", + "zai/glm-4.6": "glm-4.6", + "zai/glm-4.7": "glm-4.7", + "zai/glm-4.7-flash": "glm-4.7-flash", + "zai/glm-5": "glm-5", + "zai/glm-5-turbo": "glm-5-turbo", + "zai/glm-5.1": "glm-5.1", // ── openrouter ──────────────────────────────────────────────────────────── "openrouter/anthropic/claude-3-haiku": "claude-3-haiku", "openrouter/anthropic/claude-3.5-haiku": "claude-3-5-haiku", diff --git a/src/resources/extensions/sf/tests/canonical-id-mapping.test.mjs b/src/resources/extensions/sf/tests/canonical-id-mapping.test.mjs new file mode 100644 index 000000000..72e0e9f4f --- /dev/null +++ b/src/resources/extensions/sf/tests/canonical-id-mapping.test.mjs @@ -0,0 +1,216 @@ +/** + * canonical-id-mapping.test.mjs + * + * Verify that canonicalIdFor() maps every production route key that has + * been observed in the journal to a non-null canonical id, and that + * backfillModelPerformanceFromJournal() produces non-_unmapped buckets + * when given a mock journal. + */ +import { mkdtempSync, rmSync, writeFileSync, mkdirSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, beforeEach, describe, expect, it } from "vitest"; + +// ── canonicalIdFor tests ────────────────────────────────────────────────────── + +describe("canonicalIdFor — production route keys", async () => { + const { canonicalIdFor } = await import("../model-registry.js"); + + // Route keys observed in .sf/journal/* via model_auto_resolved events. + const expectedMappings = [ + // kimi-coding + ["kimi-coding/kimi-k2.6", "kimi-k2.6"], + ["kimi-coding/kimi-for-coding", "kimi-k2.6"], // alias → same canonical + // minimax + ["minimax/MiniMax-M2.7-highspeed", "MiniMax-M2.7-highspeed"], + ["minimax/MiniMax-M2.7", "MiniMax-M2.7"], + ["minimax/MiniMax-M2.1", "minimax-m2.1"], + // zai (direct provider, not via openrouter) + ["zai/glm-4.5", "glm-4.5"], + // opencode-go + ["opencode-go/kimi-k2.5", "kimi-k2.5"], + // mistral (direct provider, not via openrouter) + ["mistral/codestral-latest", "codestral-latest"], + // google-gemini-cli + ["google-gemini-cli/gemini-3-flash-preview", "gemini-3-flash-preview"], + ["google-gemini-cli/gemini-3.1-pro-preview", "gemini-3.1-pro-preview"], + ["google-gemini-cli/gemini-2.5-pro", "gemini-2.5-pro"], + ]; + + for (const [routeKey, expectedCanonical] of expectedMappings) { + it(`maps "${routeKey}" → "${expectedCanonical}"`, () => { + const result = canonicalIdFor(routeKey); + expect(result).not.toBeNull(); + expect(result).toBe(expectedCanonical); + }); + } + + it("returns null for nonexistent/foo (no false positives)", () => { + expect(canonicalIdFor("nonexistent/foo")).toBeNull(); + }); + + it("bare canonical id (no provider prefix) resolves via _ENTRY_BY_ROUTE fallback to itself", () => { + // "kimi-k2.6" is a wire_id under kimi-coding — if it happened to be a + // provider-less route key it has no _ENTRY_BY_ROUTE entry, so returns null. + // This test documents the expected behaviour: bare ids WITHOUT a provider + // are not looked up (they never appear as route keys in practice). + // Bare ids used as canonicalId strings are never passed to canonicalIdFor. + expect(canonicalIdFor("kimi-k2.6")).toBeNull(); + }); +}); + +// ── backfillModelPerformanceFromJournal tests ───────────────────────────────── + +describe("backfillModelPerformanceFromJournal — mock journal", async () => { + const { backfillModelPerformanceFromJournal, setRegistryResolver } = + await import("../model-learner.js"); + + let tmpDir; + + beforeEach(() => { + tmpDir = mkdtempSync(join(tmpdir(), "sf-backfill-test-")); + mkdirSync(join(tmpDir, ".sf", "journal"), { recursive: true }); + + // Inject a stub resolver so tests don't need the full @singularity-forge/ai stack. + setRegistryResolver((rk) => { + const mappings = { + "kimi-coding/kimi-k2.6": "kimi-k2.6", + "minimax/MiniMax-M2.7-highspeed": "MiniMax-M2.7-highspeed", + "zai/glm-4.5": "glm-4.5", + "mistral/codestral-latest": "codestral-latest", + "opencode-go/kimi-k2.5": "kimi-k2.5", + }; + return mappings[rk] ?? null; + }); + }); + + afterEach(() => { + rmSync(tmpDir, { recursive: true, force: true }); + // Reset to null so other tests use the real registry. + setRegistryResolver(null); + }); + + function writeJournalLine(date, event) { + const file = join(tmpDir, ".sf", "journal", `${date}.jsonl`); + writeFileSync(file, JSON.stringify(event) + "\n", { flag: "a" }); + } + + it("maps dispatched routes to non-_unmapped canonical buckets", async () => { + // Dispatch: kimi-coding/kimi-k2.6 on traceId aaa + writeJournalLine("2026-05-15", { + schemaVersion: 1, + eventType: "model_auto_resolved", + data: { + eventType: "model_auto_resolved", + unitType: "execute-task", + traceId: "aaa", + data: { + resolvedProvider: "kimi-coding", + resolvedModel: "kimi-k2.6", + routing: { tier: "standard", modelDowngraded: false }, + }, + }, + }); + + // Completion for traceId aaa + writeJournalLine("2026-05-15", { + schemaVersion: 1, + eventType: "unit_completed", + data: { + eventType: "unit_completed", + traceId: "aaa", + data: { + status: "completed", + tokens: 1000, + cost_usd: 0.01, + }, + }, + }); + + // Dispatch: zai/glm-4.5 on traceId bbb (no completion = assume success) + writeJournalLine("2026-05-15", { + schemaVersion: 1, + eventType: "model_auto_resolved", + data: { + eventType: "model_auto_resolved", + unitType: "research-slice", + traceId: "bbb", + data: { + resolvedProvider: "zai", + resolvedModel: "glm-4.5", + routing: { tier: "standard", modelDowngraded: false }, + }, + }, + }); + + const result = await backfillModelPerformanceFromJournal(tmpDir); + + expect(result.written).toBe(2); + expect(result.unmapped).toBe(0); + + // Verify the written file has the correct structure. + const { readFileSync } = await import("node:fs"); + const perfPath = join(tmpDir, ".sf", "model-performance.json"); + const data = JSON.parse(readFileSync(perfPath, "utf-8")); + + // execute-task → kimi-k2.6 bucket + expect(data["execute-task"]).toBeDefined(); + expect(data["execute-task"]["kimi-k2.6"]).toBeDefined(); + expect(data["execute-task"]["kimi-k2.6"].aggregate.successes).toBe(1); + expect(data["execute-task"]["_unmapped"]).toBeUndefined(); + + // research-slice → glm-4.5 bucket + expect(data["research-slice"]["glm-4.5"]).toBeDefined(); + expect(data["research-slice"]["glm-4.5"].aggregate.successes).toBe(1); + }); + + it("puts unknown routes into _unmapped (no false positives)", async () => { + writeJournalLine("2026-05-15", { + schemaVersion: 1, + eventType: "model_auto_resolved", + data: { + eventType: "model_auto_resolved", + unitType: "execute-task", + traceId: "zzz", + data: { + resolvedProvider: "unknown-provider", + resolvedModel: "mystery-model", + routing: { tier: "standard", modelDowngraded: false }, + }, + }, + }); + + const result = await backfillModelPerformanceFromJournal(tmpDir); + expect(result.unmapped).toBe(1); + expect(result.written).toBe(0); + }); + + it("backs up existing model-performance.json before overwriting", async () => { + const perfPath = join(tmpDir, ".sf", "model-performance.json"); + writeFileSync(perfPath, JSON.stringify({ old: true }), "utf-8"); + + // Write a minimal dispatch so the function has something to process. + writeJournalLine("2026-05-15", { + schemaVersion: 1, + eventType: "model_auto_resolved", + data: { + eventType: "model_auto_resolved", + unitType: "execute-task", + traceId: "ccc", + data: { + resolvedProvider: "kimi-coding", + resolvedModel: "kimi-k2.6", + }, + }, + }); + + const result = await backfillModelPerformanceFromJournal(tmpDir); + expect(result.backupPath).not.toBeNull(); + expect(result.backupPath).toMatch(/model-performance\.json\.bak\.\d+$/); + + // Old content is in the backup. + const { readFileSync } = await import("node:fs"); + const backup = JSON.parse(readFileSync(result.backupPath, "utf-8")); + expect(backup.old).toBe(true); + }); +});