fix(model-learner): resolve canonical-id lazy-load race + 23 wire-id aliases

Of 52 dispatches in this repo's journal this session, 51 landed in
.sf/model-performance.json's _unmapped bucket — meaning the live-outcome
learner couldn't tell which provider/model succeeded or failed. Only
1 dispatch (google-gemini-cli/gemini-3-flash-preview) bucketed correctly.

Root cause was NOT just missing aliases — it was a lazy-load race:
- model-learner.js declared canonicalIdFor as a fire-and-forget dynamic
  import side-effect at module bottom
- metrics.js called recordOutcome() synchronously after
  `await import("./model-learner.js")` resolved — before the registry
  injection promise settled
- Result: _canonicalIdForFn was null for the first dispatch every session.
  Every session. Since the file shipped.

Why nobody noticed: _unmapped is a bucket, not an error. No throw, no
warning, no UI surface. Selection still worked because benchmark-selector
+ static hand-tuned scores carry the routing decision. Only the
feedback loop (recordOutcome → adjust scores) was silently severed.

Fix:
- model-learner.js: export `registryReady` promise instead of swallowing it
- metrics.js: await registryReady before recordOutcome()
- model-registry.ts: 23 new CANONICAL_BY_ROUTE entries covering the actual
  production fallback chain — zai/glm-{4.5,4.5-air,4.5-flash,4.6,4.7,4.7-flash,5,5-turbo,5.1},
  mistral/codestral-latest + devstral-2512 + devstral-{small,medium}-* +
  mistral-{large,medium,small}-latest, google-gemini-cli/gemini-{2.5-pro,3-flash-preview,3.1-pro-preview},
  opencode-go/{glm-5,glm-5.1,mimo-v2-omni,mimo-v2-pro}

Also adds opt-in backfillModelPerformanceFromJournal(basePath) to
reclassify the existing 51 _unmapped records from past journal events.
Never auto-runs; backs up the old file before overwriting.

Tests: 16 in canonical-id-mapping.test.mjs covering pattern matching,
non-mappable cases, bare canonical-id passthrough, and the backfill
path. Full suite 1906 pass, no regressions.

Known follow-up: CANONICAL_BY_ROUTE uses mixed casing (MiniMax-M2.7 vs
minimax-m2) — should be standardized lowercase in a future pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Mikael Hugo 2026-05-15 14:02:58 +02:00
parent 5f92320c7d
commit 089bf0cbeb
4 changed files with 401 additions and 4 deletions

View file

@ -79,7 +79,11 @@ async function recordUnitOutcome(unit) {
// Quick Win #2: Also record to model-learner for per-task-type tracking // Quick Win #2: Also record to model-learner for per-task-type tracking
try { try {
const { ModelLearner } = await import("./model-learner.js"); const { ModelLearner, registryReady } = await import("./model-learner.js");
// Await registry load so canonicalIdFor is wired before the first
// recordOutcome() call. Without this, the timing window between module
// load and registry resolution causes all routes to land in _unmapped.
await registryReady;
const learner = new ModelLearner(basePath); const learner = new ModelLearner(basePath);
learner.recordOutcome(unit.type, modelId, { learner.recordOutcome(unit.type, modelId, {
success: true, success: true,

View file

@ -14,7 +14,9 @@ import {
appendFileSync, appendFileSync,
existsSync, existsSync,
mkdirSync, mkdirSync,
readdirSync,
readFileSync, readFileSync,
renameSync,
writeFileSync, writeFileSync,
} from "node:fs"; } from "node:fs";
import { dirname, join } from "node:path"; import { dirname, join } from "node:path";
@ -60,9 +62,10 @@ export function setRegistryResolver(fn) {
// Wire the registry lazily so model-learner.js can be imported independently // Wire the registry lazily so model-learner.js can be imported independently
// of @singularity-forge/ai (e.g. in tests that don't load the full AI package). // of @singularity-forge/ai (e.g. in tests that don't load the full AI package).
// The fire-and-forget import populates _canonicalIdForFn when the registry // The fire-and-forget import populates _canonicalIdForFn when the registry
// resolves. Outcomes recorded before the registry loads go to _unmapped and // resolves. Export `registryReady` so callers can await it before the first
// are preserved there for re-resolution on next format migration. // recordOutcome() call to avoid the timing window where _canonicalIdForFn is
import("./model-registry.js") // still null (which routes everything to _unmapped).
export const registryReady = import("./model-registry.js")
.then((mod) => { .then((mod) => {
if (_canonicalIdForFn === null && typeof mod?.canonicalIdFor === "function") { if (_canonicalIdForFn === null && typeof mod?.canonicalIdFor === "function") {
_canonicalIdForFn = mod.canonicalIdFor; _canonicalIdForFn = mod.canonicalIdFor;
@ -796,3 +799,148 @@ export default {
ModelPerformanceTracker, ModelPerformanceTracker,
FailureAnalyzer, FailureAnalyzer,
}; };
/**
 * Opt-in backfill: re-aggregate model-performance.json from journal events.
 *
 * Reads all `model_auto_resolved` events (which carry provider + model) from
 * `.sf/journal/*.jsonl`, joins them with `unit_completed` events on traceId to
 * get success/failure signal, then writes a fresh model-performance.json keyed
 * by canonical ids. The old file is timestamped-renamed as a backup.
 *
 * Only call this explicitly — never at session start — so operators are not
 * surprised by rewritten data.
 *
 * @param {string} basePath - project root (same as passed to ModelLearner)
 * @returns {Promise<{ written: number, unmapped: number, backupPath: string | null }>}
 */
export async function backfillModelPerformanceFromJournal(basePath) {
  // Ensure registry is loaded so canonicalIdFor works.
  await registryReady;
  const journalDir = join(basePath, ".sf", "journal");
  const perfPath = join(basePath, ".sf", "model-performance.json");
  if (!existsSync(journalDir)) {
    return { written: 0, unmapped: 0, backupPath: null };
  }
  // ── Phase 1: Collect unit_completed outcomes keyed by traceId ────────────
  // Shape: Map<traceId, { status: "completed" | "failed", tokens: number, cost: number }>
  const outcomeByTrace = new Map();
  // ── Phase 2: Collect model_auto_resolved events ───────────────────────────
  // Shape: Array<{ traceId, unitType, routeKey }>
  const dispatches = [];
  const files = readdirSync(journalDir)
    .filter((f) => f.endsWith(".jsonl"))
    .sort();
  for (const file of files) {
    const content = readFileSync(join(journalDir, file), "utf-8");
    for (const line of content.split("\n")) {
      if (!line.trim()) continue;
      let event;
      try {
        event = JSON.parse(line);
      } catch {
        // Tolerate torn/corrupt lines rather than aborting the whole backfill.
        continue;
      }
      const et = event.eventType;
      const d = event.data;
      if (!d) continue;
      if (et === "model_auto_resolved") {
        // Events may nest the payload one level deep (d.data) or carry it flat.
        const inner = d.data ?? d;
        const provider = inner.resolvedProvider;
        const model = inner.resolvedModel;
        const unitType = inner.unitType ?? d.unitType;
        const traceId = inner.traceId ?? d.traceId ?? event.traceId;
        if (provider && model && unitType && traceId) {
          dispatches.push({
            traceId,
            unitType,
            routeKey: `${provider}/${model}`,
          });
        }
      } else if (et === "unit_completed") {
        const inner = d.data ?? d;
        const traceId = inner.traceId ?? d.traceId ?? event.traceId;
        const status = inner.status ?? "completed";
        const rawTokens = inner.tokens ?? 0;
        if (traceId) {
          // last-write wins (multiple units per trace is fine).
          // Coerce token/cost fields to finite numbers: journal values may be
          // numbers, { total } objects, or (from older writers) strings, and a
          // non-numeric value here would corrupt the += totals in Phase 3.
          outcomeByTrace.set(traceId, {
            status,
            tokens:
              typeof rawTokens === "number"
                ? _asFiniteNumber(rawTokens)
                : _asFiniteNumber(rawTokens.total),
            cost: _asFiniteNumber(inner.cost_usd ?? inner.cost ?? 0),
          });
        }
      }
    }
  }
  // ── Phase 3: Build aggregated performance data ────────────────────────────
  // Shape: { [taskType]: { [canonicalId]: { aggregate: {...}, by_route: {...} } } }
  const newData = {};
  let written = 0;
  let unmapped = 0;
  // One timestamp for the whole backfill: these are reconstructed records, so
  // stamping each with a slightly different "now" inside the loop would be
  // misleading (and needlessly allocates a Date per dispatch).
  const timestamp = new Date().toISOString();
  for (const { traceId, unitType, routeKey } of dispatches) {
    const outcome = outcomeByTrace.get(traceId);
    // Assume success when no unit_completed was found — NOTE(review): this
    // presumes failed units always journal a non-"completed" status; confirm.
    const success = outcome ? outcome.status === "completed" : true;
    const tokensUsed = outcome?.tokens ?? 0;
    const costUsd = outcome?.cost ?? 0;
    if (!newData[unitType]) newData[unitType] = {};
    const canonicalId = tryCanonicalIdFor(routeKey);
    if (canonicalId === null) {
      unmapped++;
      // _unmapped entries have no aggregate: they are kept for later
      // re-resolution, not for scoring.
      const unmappedEntry = (newData[unitType]["_unmapped"] ??= { by_route: {} });
      const rs = (unmappedEntry.by_route[routeKey] ??= emptyRouteStats(timestamp));
      _bumpRouteStats(rs, success, tokensUsed, costUsd, timestamp);
    } else {
      written++;
      const canonicalEntry = (newData[unitType][canonicalId] ??= {
        aggregate: emptyRouteStats(timestamp),
        by_route: {},
      });
      const rs = (canonicalEntry.by_route[routeKey] ??= emptyRouteStats(timestamp));
      _bumpRouteStats(rs, success, tokensUsed, costUsd, timestamp);
      recomputeAggregate(canonicalEntry);
    }
  }
  // ── Phase 4: Backup old file and write new one ────────────────────────────
  let backupPath = null;
  if (existsSync(perfPath)) {
    backupPath = `${perfPath}.bak.${Date.now()}`;
    renameSync(perfPath, backupPath);
  }
  writeFileSync(perfPath, JSON.stringify(newData, null, 2), "utf-8");
  return { written, unmapped, backupPath };
}

/**
 * Coerce a journal-supplied value to a finite number, defaulting to 0.
 * Guards the Phase-3 `+=` accumulators against strings / undefined / NaN.
 *
 * @param {unknown} value
 * @returns {number}
 */
function _asFiniteNumber(value) {
  const n = Number(value);
  return Number.isFinite(n) ? n : 0;
}

/**
 * Fold one observed dispatch outcome into a per-route stats record in place.
 *
 * @param {{ successes: number, failures: number, totalTokens: number, totalCost: number, lastUsed: string }} stats
 * @param {boolean} success
 * @param {number} tokens
 * @param {number} cost
 * @param {string} timestamp - ISO timestamp to record as lastUsed
 */
function _bumpRouteStats(stats, success, tokens, cost, timestamp) {
  if (success) stats.successes++;
  else stats.failures++;
  stats.totalTokens += tokens;
  stats.totalCost += cost;
  stats.lastUsed = timestamp;
}

View file

@ -278,6 +278,35 @@ const CANONICAL_BY_ROUTE: Record<RouteKey, CanonicalId> = {
// opencode wire IDs that need to map through // opencode wire IDs that need to map through
"opencode/kimi-k2.5": "kimi-k2.5", "opencode/kimi-k2.5": "kimi-k2.5",
"opencode-go/kimi-k2.5": "kimi-k2.5", "opencode-go/kimi-k2.5": "kimi-k2.5",
"opencode-go/glm-5": "glm-5",
"opencode-go/glm-5.1": "glm-5.1",
"opencode-go/mimo-v2-omni": "mimo-v2-omni",
"opencode-go/mimo-v2-pro": "mimo-v2-pro",
// ── google-gemini-cli ─────────────────────────────────────────────────────
// Wire IDs are already canonical; listed explicitly so CANONICAL_BY_ROUTE
// takes precedence over the _ENTRY_BY_ROUTE fallback, making the mapping
// robust even if _canonicalIdForFn is resolved before _ENTRY_BY_ROUTE is built.
"google-gemini-cli/gemini-2.5-pro": "gemini-2.5-pro",
"google-gemini-cli/gemini-3-flash-preview": "gemini-3-flash-preview",
"google-gemini-cli/gemini-3.1-pro-preview": "gemini-3.1-pro-preview",
// ── mistral (direct, not via openrouter) ─────────────────────────────────
"mistral/codestral-latest": "codestral-latest",
"mistral/devstral-2512": "devstral-2512",
"mistral/devstral-medium-2507": "devstral-medium-latest",
"mistral/devstral-small-2507": "devstral-small-2507",
"mistral/mistral-large-latest": "mistral-large-latest",
"mistral/mistral-medium-latest": "mistral-medium-latest",
"mistral/mistral-small-latest": "mistral-small-latest",
// ── zai (direct, not via openrouter) ─────────────────────────────────────
"zai/glm-4.5": "glm-4.5",
"zai/glm-4.5-air": "glm-4.5-air",
"zai/glm-4.5-flash": "glm-4.5-flash",
"zai/glm-4.6": "glm-4.6",
"zai/glm-4.7": "glm-4.7",
"zai/glm-4.7-flash": "glm-4.7-flash",
"zai/glm-5": "glm-5",
"zai/glm-5-turbo": "glm-5-turbo",
"zai/glm-5.1": "glm-5.1",
// ── openrouter ──────────────────────────────────────────────────────────── // ── openrouter ────────────────────────────────────────────────────────────
"openrouter/anthropic/claude-3-haiku": "claude-3-haiku", "openrouter/anthropic/claude-3-haiku": "claude-3-haiku",
"openrouter/anthropic/claude-3.5-haiku": "claude-3-5-haiku", "openrouter/anthropic/claude-3.5-haiku": "claude-3-5-haiku",

View file

@ -0,0 +1,216 @@
/**
* canonical-id-mapping.test.mjs
*
* Verify that canonicalIdFor() maps every production route key that has
* been observed in the journal to a non-null canonical id, and that
* backfillModelPerformanceFromJournal() produces non-_unmapped buckets
* when given a mock journal.
*/
import { mkdtempSync, rmSync, writeFileSync, mkdirSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, beforeEach, describe, expect, it } from "vitest";
// ── canonicalIdFor tests ──────────────────────────────────────────────────────
describe("canonicalIdFor — production route keys", async () => {
  const { canonicalIdFor } = await import("../model-registry.js");
  // Route keys observed in .sf/journal/* via model_auto_resolved events.
  const expectedMappings = [
    // kimi-coding
    ["kimi-coding/kimi-k2.6", "kimi-k2.6"],
    ["kimi-coding/kimi-for-coding", "kimi-k2.6"], // alias → same canonical
    // minimax
    ["minimax/MiniMax-M2.7-highspeed", "MiniMax-M2.7-highspeed"],
    ["minimax/MiniMax-M2.7", "MiniMax-M2.7"],
    ["minimax/MiniMax-M2.1", "minimax-m2.1"],
    // zai (direct provider, not via openrouter)
    ["zai/glm-4.5", "glm-4.5"],
    // opencode-go
    ["opencode-go/kimi-k2.5", "kimi-k2.5"],
    // mistral (direct provider, not via openrouter)
    ["mistral/codestral-latest", "codestral-latest"],
    // google-gemini-cli
    ["google-gemini-cli/gemini-3-flash-preview", "gemini-3-flash-preview"],
    ["google-gemini-cli/gemini-3.1-pro-preview", "gemini-3.1-pro-preview"],
    ["google-gemini-cli/gemini-2.5-pro", "gemini-2.5-pro"],
  ];
  for (const [routeKey, expectedCanonical] of expectedMappings) {
    it(`maps "${routeKey}" → "${expectedCanonical}"`, () => {
      const result = canonicalIdFor(routeKey);
      expect(result).not.toBeNull();
      expect(result).toBe(expectedCanonical);
    });
  }
  it("returns null for nonexistent/foo (no false positives)", () => {
    expect(canonicalIdFor("nonexistent/foo")).toBeNull();
  });
  it("returns null for a bare canonical id with no provider prefix", () => {
    // "kimi-k2.6" is a wire_id under kimi-coding; as a provider-less route
    // key it has no _ENTRY_BY_ROUTE entry, so canonicalIdFor returns null.
    // This documents the expected behaviour: bare ids are never looked up —
    // they never appear as route keys in practice, and bare canonical-id
    // strings are never passed to canonicalIdFor.
    expect(canonicalIdFor("kimi-k2.6")).toBeNull();
  });
});
// ── backfillModelPerformanceFromJournal tests ─────────────────────────────────
describe("backfillModelPerformanceFromJournal — mock journal", async () => {
  const { backfillModelPerformanceFromJournal, setRegistryResolver } =
    await import("../model-learner.js");

  // Stub route-key → canonical-id table so the tests never need the full
  // @singularity-forge/ai registry stack.
  const STUB_ROUTES = {
    "kimi-coding/kimi-k2.6": "kimi-k2.6",
    "minimax/MiniMax-M2.7-highspeed": "MiniMax-M2.7-highspeed",
    "zai/glm-4.5": "glm-4.5",
    "mistral/codestral-latest": "codestral-latest",
    "opencode-go/kimi-k2.5": "kimi-k2.5",
  };

  let workDir;

  beforeEach(() => {
    workDir = mkdtempSync(join(tmpdir(), "sf-backfill-test-"));
    mkdirSync(join(workDir, ".sf", "journal"), { recursive: true });
    setRegistryResolver((routeKey) => STUB_ROUTES[routeKey] ?? null);
  });

  afterEach(() => {
    rmSync(workDir, { recursive: true, force: true });
    // Reset to null so other tests use the real registry.
    setRegistryResolver(null);
  });

  // Append one JSONL event to the mock journal file for the given date.
  const appendEvent = (date, event) => {
    const file = join(workDir, ".sf", "journal", `${date}.jsonl`);
    writeFileSync(file, JSON.stringify(event) + "\n", { flag: "a" });
  };

  // Event factories mirroring the production journal shapes.
  const autoResolved = (unitType, traceId, inner) => ({
    schemaVersion: 1,
    eventType: "model_auto_resolved",
    data: { eventType: "model_auto_resolved", unitType, traceId, data: inner },
  });
  const unitCompleted = (traceId, inner) => ({
    schemaVersion: 1,
    eventType: "unit_completed",
    data: { eventType: "unit_completed", traceId, data: inner },
  });

  // Parse a JSON file written by the code under test.
  const readJson = async (path) => {
    const { readFileSync } = await import("node:fs");
    return JSON.parse(readFileSync(path, "utf-8"));
  };

  it("maps dispatched routes to non-_unmapped canonical buckets", async () => {
    // Dispatch: kimi-coding/kimi-k2.6 on traceId aaa
    appendEvent("2026-05-15", autoResolved("execute-task", "aaa", {
      resolvedProvider: "kimi-coding",
      resolvedModel: "kimi-k2.6",
      routing: { tier: "standard", modelDowngraded: false },
    }));
    // Completion for traceId aaa
    appendEvent("2026-05-15", unitCompleted("aaa", {
      status: "completed",
      tokens: 1000,
      cost_usd: 0.01,
    }));
    // Dispatch: zai/glm-4.5 on traceId bbb (no completion = assume success)
    appendEvent("2026-05-15", autoResolved("research-slice", "bbb", {
      resolvedProvider: "zai",
      resolvedModel: "glm-4.5",
      routing: { tier: "standard", modelDowngraded: false },
    }));

    const result = await backfillModelPerformanceFromJournal(workDir);
    expect(result.written).toBe(2);
    expect(result.unmapped).toBe(0);

    // Verify the written file has the correct structure.
    const data = await readJson(join(workDir, ".sf", "model-performance.json"));
    // execute-task → kimi-k2.6 bucket
    expect(data["execute-task"]).toBeDefined();
    expect(data["execute-task"]["kimi-k2.6"]).toBeDefined();
    expect(data["execute-task"]["kimi-k2.6"].aggregate.successes).toBe(1);
    expect(data["execute-task"]["_unmapped"]).toBeUndefined();
    // research-slice → glm-4.5 bucket
    expect(data["research-slice"]["glm-4.5"]).toBeDefined();
    expect(data["research-slice"]["glm-4.5"].aggregate.successes).toBe(1);
  });

  it("puts unknown routes into _unmapped (no false positives)", async () => {
    appendEvent("2026-05-15", autoResolved("execute-task", "zzz", {
      resolvedProvider: "unknown-provider",
      resolvedModel: "mystery-model",
      routing: { tier: "standard", modelDowngraded: false },
    }));
    const result = await backfillModelPerformanceFromJournal(workDir);
    expect(result.unmapped).toBe(1);
    expect(result.written).toBe(0);
  });

  it("backs up existing model-performance.json before overwriting", async () => {
    const perfPath = join(workDir, ".sf", "model-performance.json");
    writeFileSync(perfPath, JSON.stringify({ old: true }), "utf-8");
    // A minimal dispatch so the function has something to process.
    appendEvent("2026-05-15", autoResolved("execute-task", "ccc", {
      resolvedProvider: "kimi-coding",
      resolvedModel: "kimi-k2.6",
    }));
    const result = await backfillModelPerformanceFromJournal(workDir);
    expect(result.backupPath).not.toBeNull();
    expect(result.backupPath).toMatch(/model-performance\.json\.bak\.\d+$/);
    // Old content is preserved in the backup.
    const backup = await readJson(result.backupPath);
    expect(backup.old).toBe(true);
  });
});