fix(model-learner): resolve canonical-id lazy-load race + 23 wire-id aliases

Of 52 dispatches in this repo's journal this session, 51 landed in
.sf/model-performance.json's _unmapped bucket — meaning the live-outcome
learner couldn't tell which provider/model succeeded or failed. Only
1 dispatch (google-gemini-cli/gemini-3-flash-preview) bucketed correctly.

Root cause was NOT just missing aliases — it was a lazy-load race:
- model-learner.js declared canonicalIdFor as a fire-and-forget dynamic
  import side-effect at module bottom
- metrics.js called recordOutcome() synchronously after
  `await import("./model-learner.js")` resolved — before the registry
  injection promise settled
- Result: _canonicalIdForFn was null for the first dispatch every session.
  Every session. Since the file shipped.

Why nobody noticed: _unmapped is a bucket, not an error. No throw, no
warning, no UI surface. Selection still worked because benchmark-selector
+ static hand-tuned scores carry the routing decision. Only the
feedback loop (recordOutcome → adjust scores) was silently severed.

Fix:
- model-learner.js: export `registryReady` promise instead of swallowing it
- metrics.js: await registryReady before recordOutcome()
- model-registry.ts: 23 new CANONICAL_BY_ROUTE entries covering the actual
  production fallback chain — zai/glm-{4.5,4.5-air,4.5-flash,4.6,4.7,4.7-flash,5,5-turbo,5.1},
  mistral/codestral-latest + devstral-2512 + devstral-{small,medium}-* +
  mistral-{large,medium,small}-latest, google-gemini-cli/gemini-{2.5-pro,3-flash-preview,3.1-pro-preview},
  opencode-go/{glm-5,glm-5.1,mimo-v2-omni,mimo-v2-pro}

Also adds opt-in backfillModelPerformanceFromJournal(basePath) to
reclassify the existing 51 _unmapped records from past journal events.
Never auto-runs; backs up the old file before overwriting.

Tests: 16 in canonical-id-mapping.test.mjs covering pattern matching,
non-mappable cases, bare canonical-id passthrough, and the backfill
path. Full suite 1906 pass, no regressions.

Known follow-up: CANONICAL_BY_ROUTE uses mixed casing (MiniMax-M2.7 vs
minimax-m2) — should be standardized lowercase in a future pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Mikael Hugo 2026-05-15 14:02:58 +02:00
parent 5f92320c7d
commit 089bf0cbeb
4 changed files with 401 additions and 4 deletions

View file

@ -79,7 +79,11 @@ async function recordUnitOutcome(unit) {
// Quick Win #2: Also record to model-learner for per-task-type tracking // Quick Win #2: Also record to model-learner for per-task-type tracking
try { try {
const { ModelLearner } = await import("./model-learner.js"); const { ModelLearner, registryReady } = await import("./model-learner.js");
// Await registry load so canonicalIdFor is wired before the first
// recordOutcome() call. Without this, the timing window between module
// load and registry resolution causes all routes to land in _unmapped.
await registryReady;
const learner = new ModelLearner(basePath); const learner = new ModelLearner(basePath);
learner.recordOutcome(unit.type, modelId, { learner.recordOutcome(unit.type, modelId, {
success: true, success: true,

View file

@ -14,7 +14,9 @@ import {
appendFileSync, appendFileSync,
existsSync, existsSync,
mkdirSync, mkdirSync,
readdirSync,
readFileSync, readFileSync,
renameSync,
writeFileSync, writeFileSync,
} from "node:fs"; } from "node:fs";
import { dirname, join } from "node:path"; import { dirname, join } from "node:path";
@ -60,9 +62,10 @@ export function setRegistryResolver(fn) {
// Wire the registry lazily so model-learner.js can be imported independently // Wire the registry lazily so model-learner.js can be imported independently
// of @singularity-forge/ai (e.g. in tests that don't load the full AI package). // of @singularity-forge/ai (e.g. in tests that don't load the full AI package).
// The fire-and-forget import populates _canonicalIdForFn when the registry // The fire-and-forget import populates _canonicalIdForFn when the registry
// resolves. Outcomes recorded before the registry loads go to _unmapped and // resolves. Export `registryReady` so callers can await it before the first
// are preserved there for re-resolution on next format migration. // recordOutcome() call to avoid the timing window where _canonicalIdForFn is
import("./model-registry.js") // still null (which routes everything to _unmapped).
export const registryReady = import("./model-registry.js")
.then((mod) => { .then((mod) => {
if (_canonicalIdForFn === null && typeof mod?.canonicalIdFor === "function") { if (_canonicalIdForFn === null && typeof mod?.canonicalIdFor === "function") {
_canonicalIdForFn = mod.canonicalIdFor; _canonicalIdForFn = mod.canonicalIdFor;
@ -796,3 +799,148 @@ export default {
ModelPerformanceTracker, ModelPerformanceTracker,
FailureAnalyzer, FailureAnalyzer,
}; };
/**
 * Opt-in backfill: re-aggregate model-performance.json from journal events.
 *
 * Reads all `model_auto_resolved` events (which carry provider + model) from
 * `.sf/journal/*.jsonl`, joins them with `unit_completed` events on traceId to
 * get success/failure signal, then writes a fresh model-performance.json keyed
 * by canonical ids. The old file is timestamped-renamed as a backup.
 *
 * Only call this explicitly — never at session start — so operators are not
 * surprised by rewritten data.
 *
 * @param {string} basePath - project root (same as passed to ModelLearner)
 * @returns {Promise<{ written: number, unmapped: number, backupPath: string | null }>}
 */
export async function backfillModelPerformanceFromJournal(basePath) {
  // Ensure registry is loaded so canonicalIdFor works.
  await registryReady;
  const journalDir = join(basePath, ".sf", "journal");
  const perfPath = join(basePath, ".sf", "model-performance.json");
  if (!existsSync(journalDir)) {
    return { written: 0, unmapped: 0, backupPath: null };
  }
  // ── Phase 1: Collect unit_completed outcomes keyed by traceId ────────────
  // Shape: Map<traceId, { status: "completed" | "failed", tokens: number, cost: number }>
  const outcomeByTrace = new Map();
  // ── Phase 2: Collect model_auto_resolved events ───────────────────────────
  // Shape: Array<{ traceId, unitType, routeKey }>
  const dispatches = [];
  const files = readdirSync(journalDir)
    .filter((f) => f.endsWith(".jsonl"))
    .sort();
  for (const file of files) {
    const content = readFileSync(join(journalDir, file), "utf-8");
    for (const line of content.split("\n")) {
      if (!line.trim()) continue;
      let event;
      try {
        event = JSON.parse(line);
      } catch {
        // Tolerate torn/corrupt lines rather than aborting the whole backfill.
        continue;
      }
      const et = event.eventType;
      const d = event.data;
      if (!d) continue;
      if (et === "model_auto_resolved") {
        // Events may nest the payload one level deep (d.data) or carry it flat.
        const inner = d.data ?? d;
        const provider = inner.resolvedProvider;
        const model = inner.resolvedModel;
        const unitType = inner.unitType ?? d.unitType;
        const traceId = inner.traceId ?? d.traceId ?? event.traceId;
        if (provider && model && unitType && traceId) {
          dispatches.push({
            traceId,
            unitType,
            routeKey: `${provider}/${model}`,
          });
        }
      } else if (et === "unit_completed") {
        const inner = d.data ?? d;
        const traceId = inner.traceId ?? d.traceId ?? event.traceId;
        const status = inner.status ?? "completed";
        const rawTokens = inner.tokens ?? 0;
        if (traceId) {
          // last-write wins (multiple units per trace is fine).
          // Coerce token/cost fields to finite numbers: journal values may be
          // numbers, { total } objects, or (from older writers) strings, and a
          // non-numeric value here would corrupt the += totals in Phase 3.
          outcomeByTrace.set(traceId, {
            status,
            tokens:
              typeof rawTokens === "number"
                ? _asFiniteNumber(rawTokens)
                : _asFiniteNumber(rawTokens.total),
            cost: _asFiniteNumber(inner.cost_usd ?? inner.cost ?? 0),
          });
        }
      }
    }
  }
  // ── Phase 3: Build aggregated performance data ────────────────────────────
  // Shape: { [taskType]: { [canonicalId]: { aggregate: {...}, by_route: {...} } } }
  const newData = {};
  let written = 0;
  let unmapped = 0;
  // One timestamp for the whole backfill: these are reconstructed records, so
  // stamping each with a slightly different "now" inside the loop would be
  // misleading (and needlessly allocates a Date per dispatch).
  const timestamp = new Date().toISOString();
  for (const { traceId, unitType, routeKey } of dispatches) {
    const outcome = outcomeByTrace.get(traceId);
    // Assume success when no unit_completed was found — NOTE(review): this
    // presumes failed units always journal a non-"completed" status; confirm.
    const success = outcome ? outcome.status === "completed" : true;
    const tokensUsed = outcome?.tokens ?? 0;
    const costUsd = outcome?.cost ?? 0;
    if (!newData[unitType]) newData[unitType] = {};
    const canonicalId = tryCanonicalIdFor(routeKey);
    if (canonicalId === null) {
      unmapped++;
      // _unmapped entries have no aggregate: they are kept for later
      // re-resolution, not for scoring.
      const unmappedEntry = (newData[unitType]["_unmapped"] ??= { by_route: {} });
      const rs = (unmappedEntry.by_route[routeKey] ??= emptyRouteStats(timestamp));
      _bumpRouteStats(rs, success, tokensUsed, costUsd, timestamp);
    } else {
      written++;
      const canonicalEntry = (newData[unitType][canonicalId] ??= {
        aggregate: emptyRouteStats(timestamp),
        by_route: {},
      });
      const rs = (canonicalEntry.by_route[routeKey] ??= emptyRouteStats(timestamp));
      _bumpRouteStats(rs, success, tokensUsed, costUsd, timestamp);
      recomputeAggregate(canonicalEntry);
    }
  }
  // ── Phase 4: Backup old file and write new one ────────────────────────────
  let backupPath = null;
  if (existsSync(perfPath)) {
    backupPath = `${perfPath}.bak.${Date.now()}`;
    renameSync(perfPath, backupPath);
  }
  writeFileSync(perfPath, JSON.stringify(newData, null, 2), "utf-8");
  return { written, unmapped, backupPath };
}

/**
 * Coerce a journal-supplied value to a finite number, defaulting to 0.
 * Guards the Phase-3 `+=` accumulators against strings / undefined / NaN.
 *
 * @param {unknown} value
 * @returns {number}
 */
function _asFiniteNumber(value) {
  const n = Number(value);
  return Number.isFinite(n) ? n : 0;
}

/**
 * Fold one observed dispatch outcome into a per-route stats record in place.
 *
 * @param {{ successes: number, failures: number, totalTokens: number, totalCost: number, lastUsed: string }} stats
 * @param {boolean} success
 * @param {number} tokens
 * @param {number} cost
 * @param {string} timestamp - ISO timestamp to record as lastUsed
 */
function _bumpRouteStats(stats, success, tokens, cost, timestamp) {
  if (success) stats.successes++;
  else stats.failures++;
  stats.totalTokens += tokens;
  stats.totalCost += cost;
  stats.lastUsed = timestamp;
}

View file

@ -278,6 +278,35 @@ const CANONICAL_BY_ROUTE: Record<RouteKey, CanonicalId> = {
// opencode wire IDs that need to map through // opencode wire IDs that need to map through
"opencode/kimi-k2.5": "kimi-k2.5", "opencode/kimi-k2.5": "kimi-k2.5",
"opencode-go/kimi-k2.5": "kimi-k2.5", "opencode-go/kimi-k2.5": "kimi-k2.5",
"opencode-go/glm-5": "glm-5",
"opencode-go/glm-5.1": "glm-5.1",
"opencode-go/mimo-v2-omni": "mimo-v2-omni",
"opencode-go/mimo-v2-pro": "mimo-v2-pro",
// ── google-gemini-cli ─────────────────────────────────────────────────────
// Wire IDs are already canonical; listed explicitly so CANONICAL_BY_ROUTE
// takes precedence over the _ENTRY_BY_ROUTE fallback, making the mapping
// robust even if _canonicalIdForFn is resolved before _ENTRY_BY_ROUTE is built.
"google-gemini-cli/gemini-2.5-pro": "gemini-2.5-pro",
"google-gemini-cli/gemini-3-flash-preview": "gemini-3-flash-preview",
"google-gemini-cli/gemini-3.1-pro-preview": "gemini-3.1-pro-preview",
// ── mistral (direct, not via openrouter) ─────────────────────────────────
"mistral/codestral-latest": "codestral-latest",
"mistral/devstral-2512": "devstral-2512",
"mistral/devstral-medium-2507": "devstral-medium-latest",
"mistral/devstral-small-2507": "devstral-small-2507",
"mistral/mistral-large-latest": "mistral-large-latest",
"mistral/mistral-medium-latest": "mistral-medium-latest",
"mistral/mistral-small-latest": "mistral-small-latest",
// ── zai (direct, not via openrouter) ─────────────────────────────────────
"zai/glm-4.5": "glm-4.5",
"zai/glm-4.5-air": "glm-4.5-air",
"zai/glm-4.5-flash": "glm-4.5-flash",
"zai/glm-4.6": "glm-4.6",
"zai/glm-4.7": "glm-4.7",
"zai/glm-4.7-flash": "glm-4.7-flash",
"zai/glm-5": "glm-5",
"zai/glm-5-turbo": "glm-5-turbo",
"zai/glm-5.1": "glm-5.1",
// ── openrouter ──────────────────────────────────────────────────────────── // ── openrouter ────────────────────────────────────────────────────────────
"openrouter/anthropic/claude-3-haiku": "claude-3-haiku", "openrouter/anthropic/claude-3-haiku": "claude-3-haiku",
"openrouter/anthropic/claude-3.5-haiku": "claude-3-5-haiku", "openrouter/anthropic/claude-3.5-haiku": "claude-3-5-haiku",

View file

@ -0,0 +1,216 @@
/**
* canonical-id-mapping.test.mjs
*
* Verify that canonicalIdFor() maps every production route key that has
* been observed in the journal to a non-null canonical id, and that
* backfillModelPerformanceFromJournal() produces non-_unmapped buckets
* when given a mock journal.
*/
import { mkdtempSync, rmSync, writeFileSync, mkdirSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, beforeEach, describe, expect, it } from "vitest";
// ── canonicalIdFor tests ──────────────────────────────────────────────────────
describe("canonicalIdFor — production route keys", async () => {
  const { canonicalIdFor } = await import("../model-registry.js");
  // Route keys observed in .sf/journal/* via model_auto_resolved events.
  const expectedMappings = [
    // kimi-coding
    ["kimi-coding/kimi-k2.6", "kimi-k2.6"],
    ["kimi-coding/kimi-for-coding", "kimi-k2.6"], // alias → same canonical
    // minimax
    ["minimax/MiniMax-M2.7-highspeed", "MiniMax-M2.7-highspeed"],
    ["minimax/MiniMax-M2.7", "MiniMax-M2.7"],
    ["minimax/MiniMax-M2.1", "minimax-m2.1"],
    // zai (direct provider, not via openrouter)
    ["zai/glm-4.5", "glm-4.5"],
    // opencode-go
    ["opencode-go/kimi-k2.5", "kimi-k2.5"],
    // mistral (direct provider, not via openrouter)
    ["mistral/codestral-latest", "codestral-latest"],
    // google-gemini-cli
    ["google-gemini-cli/gemini-3-flash-preview", "gemini-3-flash-preview"],
    ["google-gemini-cli/gemini-3.1-pro-preview", "gemini-3.1-pro-preview"],
    ["google-gemini-cli/gemini-2.5-pro", "gemini-2.5-pro"],
  ];
  for (const [routeKey, expectedCanonical] of expectedMappings) {
    it(`maps "${routeKey}" → "${expectedCanonical}"`, () => {
      const result = canonicalIdFor(routeKey);
      expect(result).not.toBeNull();
      expect(result).toBe(expectedCanonical);
    });
  }
  it("returns null for nonexistent/foo (no false positives)", () => {
    expect(canonicalIdFor("nonexistent/foo")).toBeNull();
  });
  it("returns null for a bare canonical id with no provider prefix", () => {
    // "kimi-k2.6" is a wire_id under kimi-coding; as a provider-less route
    // key it has no _ENTRY_BY_ROUTE entry, so canonicalIdFor returns null.
    // This documents the expected behaviour: bare ids are never looked up —
    // they never appear as route keys in practice, and bare canonical-id
    // strings are never passed to canonicalIdFor.
    expect(canonicalIdFor("kimi-k2.6")).toBeNull();
  });
});
// ── backfillModelPerformanceFromJournal tests ─────────────────────────────────
describe("backfillModelPerformanceFromJournal — mock journal", async () => {
  const { backfillModelPerformanceFromJournal, setRegistryResolver } =
    await import("../model-learner.js");

  // Stub route-key → canonical-id table so the tests never need the full
  // @singularity-forge/ai registry stack.
  const STUB_ROUTES = {
    "kimi-coding/kimi-k2.6": "kimi-k2.6",
    "minimax/MiniMax-M2.7-highspeed": "MiniMax-M2.7-highspeed",
    "zai/glm-4.5": "glm-4.5",
    "mistral/codestral-latest": "codestral-latest",
    "opencode-go/kimi-k2.5": "kimi-k2.5",
  };

  let workDir;

  beforeEach(() => {
    workDir = mkdtempSync(join(tmpdir(), "sf-backfill-test-"));
    mkdirSync(join(workDir, ".sf", "journal"), { recursive: true });
    setRegistryResolver((routeKey) => STUB_ROUTES[routeKey] ?? null);
  });

  afterEach(() => {
    rmSync(workDir, { recursive: true, force: true });
    // Reset to null so other tests use the real registry.
    setRegistryResolver(null);
  });

  // Append one JSONL event to the mock journal file for the given date.
  const appendEvent = (date, event) => {
    const file = join(workDir, ".sf", "journal", `${date}.jsonl`);
    writeFileSync(file, JSON.stringify(event) + "\n", { flag: "a" });
  };

  // Event factories mirroring the production journal shapes.
  const autoResolved = (unitType, traceId, inner) => ({
    schemaVersion: 1,
    eventType: "model_auto_resolved",
    data: { eventType: "model_auto_resolved", unitType, traceId, data: inner },
  });
  const unitCompleted = (traceId, inner) => ({
    schemaVersion: 1,
    eventType: "unit_completed",
    data: { eventType: "unit_completed", traceId, data: inner },
  });

  // Parse a JSON file written by the code under test.
  const readJson = async (path) => {
    const { readFileSync } = await import("node:fs");
    return JSON.parse(readFileSync(path, "utf-8"));
  };

  it("maps dispatched routes to non-_unmapped canonical buckets", async () => {
    // Dispatch: kimi-coding/kimi-k2.6 on traceId aaa
    appendEvent("2026-05-15", autoResolved("execute-task", "aaa", {
      resolvedProvider: "kimi-coding",
      resolvedModel: "kimi-k2.6",
      routing: { tier: "standard", modelDowngraded: false },
    }));
    // Completion for traceId aaa
    appendEvent("2026-05-15", unitCompleted("aaa", {
      status: "completed",
      tokens: 1000,
      cost_usd: 0.01,
    }));
    // Dispatch: zai/glm-4.5 on traceId bbb (no completion = assume success)
    appendEvent("2026-05-15", autoResolved("research-slice", "bbb", {
      resolvedProvider: "zai",
      resolvedModel: "glm-4.5",
      routing: { tier: "standard", modelDowngraded: false },
    }));

    const result = await backfillModelPerformanceFromJournal(workDir);
    expect(result.written).toBe(2);
    expect(result.unmapped).toBe(0);

    // Verify the written file has the correct structure.
    const data = await readJson(join(workDir, ".sf", "model-performance.json"));
    // execute-task → kimi-k2.6 bucket
    expect(data["execute-task"]).toBeDefined();
    expect(data["execute-task"]["kimi-k2.6"]).toBeDefined();
    expect(data["execute-task"]["kimi-k2.6"].aggregate.successes).toBe(1);
    expect(data["execute-task"]["_unmapped"]).toBeUndefined();
    // research-slice → glm-4.5 bucket
    expect(data["research-slice"]["glm-4.5"]).toBeDefined();
    expect(data["research-slice"]["glm-4.5"].aggregate.successes).toBe(1);
  });

  it("puts unknown routes into _unmapped (no false positives)", async () => {
    appendEvent("2026-05-15", autoResolved("execute-task", "zzz", {
      resolvedProvider: "unknown-provider",
      resolvedModel: "mystery-model",
      routing: { tier: "standard", modelDowngraded: false },
    }));
    const result = await backfillModelPerformanceFromJournal(workDir);
    expect(result.unmapped).toBe(1);
    expect(result.written).toBe(0);
  });

  it("backs up existing model-performance.json before overwriting", async () => {
    const perfPath = join(workDir, ".sf", "model-performance.json");
    writeFileSync(perfPath, JSON.stringify({ old: true }), "utf-8");
    // A minimal dispatch so the function has something to process.
    appendEvent("2026-05-15", autoResolved("execute-task", "ccc", {
      resolvedProvider: "kimi-coding",
      resolvedModel: "kimi-k2.6",
    }));
    const result = await backfillModelPerformanceFromJournal(workDir);
    expect(result.backupPath).not.toBeNull();
    expect(result.backupPath).toMatch(/model-performance\.json\.bak\.\d+$/);
    // Old content is preserved in the backup.
    const backup = await readJson(result.backupPath);
    expect(backup.old).toBe(true);
  });
});