fix(model-learner): resolve canonical-id lazy-load race + 23 wire-id aliases
Of 52 dispatches in this repo's journal this session, 51 landed in
.sf/model-performance.json's _unmapped bucket — meaning the live-outcome
learner couldn't tell which provider/model succeeded or failed. Only
1 dispatch (google-gemini-cli/gemini-3-flash-preview) bucketed correctly.
Root cause was NOT just missing aliases — it was a lazy-load race:
- model-learner.js declared canonicalIdFor as a fire-and-forget dynamic
import side-effect at module bottom
- metrics.js called recordOutcome() synchronously after
`await import("./model-learner.js")` resolved — before the registry
injection promise settled
- Result: _canonicalIdForFn was null for the first dispatch every session.
Every session. Since the file shipped.
Why nobody noticed: _unmapped is a bucket, not an error. No throw, no
warning, no UI surface. Selection still worked because benchmark-selector
+ static hand-tuned scores carry the routing decision. Only the
feedback loop (recordOutcome → adjust scores) was silently severed.
Fix:
- model-learner.js: export `registryReady` promise instead of swallowing it
- metrics.js: await registryReady before recordOutcome()
- model-registry.ts: 23 new CANONICAL_BY_ROUTE entries covering the actual
production fallback chain — zai/glm-4.5{-air,-flash,5,5.1,5-turbo,4.6,4.7,4.7-flash},
mistral/codestral-latest + devstral-2512 + devstral-{small,medium}-* +
mistral-{large,medium,small}-latest, google-gemini-cli/gemini-{2.5-pro,3-flash-preview,3.1-pro-preview},
opencode-go/{glm-5,glm-5.1,mimo-v2-omni,mimo-v2-pro}
Also adds opt-in backfillModelPerformanceFromJournal(basePath) to
reclassify the existing 51 _unmapped records from past journal events.
Never auto-runs; backs up the old file before overwriting.
Tests: 16 in canonical-id-mapping.test.mjs covering pattern matching,
non-mappable cases, bare canonical-id passthrough, and the backfill
path. Full suite 1906 pass, no regressions.
Known follow-up: CANONICAL_BY_ROUTE uses mixed casing (MiniMax-M2.7 vs
minimax-m2) — should be standardized lowercase in a future pass.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
5f92320c7d
commit
089bf0cbeb
4 changed files with 401 additions and 4 deletions
|
|
@ -79,7 +79,11 @@ async function recordUnitOutcome(unit) {
|
|||
|
||||
// Quick Win #2: Also record to model-learner for per-task-type tracking
|
||||
try {
|
||||
const { ModelLearner } = await import("./model-learner.js");
|
||||
const { ModelLearner, registryReady } = await import("./model-learner.js");
|
||||
// Await registry load so canonicalIdFor is wired before the first
|
||||
// recordOutcome() call. Without this, the timing window between module
|
||||
// load and registry resolution causes all routes to land in _unmapped.
|
||||
await registryReady;
|
||||
const learner = new ModelLearner(basePath);
|
||||
learner.recordOutcome(unit.type, modelId, {
|
||||
success: true,
|
||||
|
|
|
|||
|
|
@ -14,7 +14,9 @@ import {
|
|||
appendFileSync,
|
||||
existsSync,
|
||||
mkdirSync,
|
||||
readdirSync,
|
||||
readFileSync,
|
||||
renameSync,
|
||||
writeFileSync,
|
||||
} from "node:fs";
|
||||
import { dirname, join } from "node:path";
|
||||
|
|
@ -60,9 +62,10 @@ export function setRegistryResolver(fn) {
|
|||
// Wire the registry lazily so model-learner.js can be imported independently
|
||||
// of @singularity-forge/ai (e.g. in tests that don't load the full AI package).
|
||||
// The fire-and-forget import populates _canonicalIdForFn when the registry
|
||||
// resolves. Outcomes recorded before the registry loads go to _unmapped and
|
||||
// are preserved there for re-resolution on next format migration.
|
||||
import("./model-registry.js")
|
||||
// resolves. Export `registryReady` so callers can await it before the first
|
||||
// recordOutcome() call to avoid the timing window where _canonicalIdForFn is
|
||||
// still null (which routes everything to _unmapped).
|
||||
export const registryReady = import("./model-registry.js")
|
||||
.then((mod) => {
|
||||
if (_canonicalIdForFn === null && typeof mod?.canonicalIdFor === "function") {
|
||||
_canonicalIdForFn = mod.canonicalIdFor;
|
||||
|
|
@ -796,3 +799,148 @@ export default {
|
|||
ModelPerformanceTracker,
|
||||
FailureAnalyzer,
|
||||
};
|
||||
|
||||
/**
|
||||
* Opt-in backfill: re-aggregate model-performance.json from journal events.
|
||||
*
|
||||
* Reads all `model_auto_resolved` events (which carry provider + model) from
|
||||
* `.sf/journal/*.jsonl`, joins them with `unit_completed` events on traceId to
|
||||
* get success/failure signal, then writes a fresh model-performance.json keyed
|
||||
* by canonical ids. The old file is timestamped-renamed as a backup.
|
||||
*
|
||||
* Only call this explicitly — never at session start — so operators are not
|
||||
* surprised by rewritten data.
|
||||
*
|
||||
* @param {string} basePath - project root (same as passed to ModelLearner)
|
||||
* @returns {{ written: number, unmapped: number, backupPath: string | null }}
|
||||
*/
|
||||
export async function backfillModelPerformanceFromJournal(basePath) {
|
||||
// Ensure registry is loaded so canonicalIdFor works.
|
||||
await registryReady;
|
||||
|
||||
const journalDir = join(basePath, ".sf", "journal");
|
||||
const perfPath = join(basePath, ".sf", "model-performance.json");
|
||||
|
||||
if (!existsSync(journalDir)) {
|
||||
return { written: 0, unmapped: 0, backupPath: null };
|
||||
}
|
||||
|
||||
// ── Phase 1: Collect unit_completed outcomes keyed by traceId ────────────
|
||||
// Shape: Map<traceId, { status: "completed" | "failed", tokens: number, cost: number }>
|
||||
const outcomeByTrace = new Map();
|
||||
// ── Phase 2: Collect model_auto_resolved events ───────────────────────────
|
||||
// Shape: Array<{ traceId, unitType, routeKey }>
|
||||
const dispatches = [];
|
||||
|
||||
const files = readdirSync(journalDir)
|
||||
.filter((f) => f.endsWith(".jsonl"))
|
||||
.sort();
|
||||
|
||||
for (const file of files) {
|
||||
const content = readFileSync(join(journalDir, file), "utf-8");
|
||||
for (const line of content.split("\n")) {
|
||||
if (!line.trim()) continue;
|
||||
let event;
|
||||
try {
|
||||
event = JSON.parse(line);
|
||||
} catch {
|
||||
continue;
|
||||
}
|
||||
const et = event.eventType;
|
||||
const d = event.data;
|
||||
if (!d) continue;
|
||||
|
||||
if (et === "model_auto_resolved") {
|
||||
const inner = d.data ?? d;
|
||||
const provider = inner.resolvedProvider;
|
||||
const model = inner.resolvedModel;
|
||||
const unitType = inner.unitType ?? d.unitType;
|
||||
const traceId = inner.traceId ?? d.traceId ?? event.traceId;
|
||||
if (provider && model && unitType && traceId) {
|
||||
dispatches.push({
|
||||
traceId,
|
||||
unitType,
|
||||
routeKey: `${provider}/${model}`,
|
||||
});
|
||||
}
|
||||
} else if (et === "unit_completed") {
|
||||
const inner = d.data ?? d;
|
||||
const traceId = inner.traceId ?? d.traceId ?? event.traceId;
|
||||
const status = inner.status ?? "completed";
|
||||
const tokens = inner.tokens ?? 0;
|
||||
const cost = inner.cost_usd ?? inner.cost ?? 0;
|
||||
if (traceId) {
|
||||
// last-write wins (multiple units per trace is fine)
|
||||
outcomeByTrace.set(traceId, {
|
||||
status,
|
||||
tokens: typeof tokens === "number" ? tokens : (tokens.total ?? 0),
|
||||
cost,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Phase 3: Build aggregated performance data ────────────────────────────
|
||||
// Shape: { [taskType]: { [canonicalId]: { aggregate: {...}, by_route: {...} } } }
|
||||
const newData = {};
|
||||
let written = 0;
|
||||
let unmapped = 0;
|
||||
|
||||
for (const { traceId, unitType, routeKey } of dispatches) {
|
||||
const outcome = outcomeByTrace.get(traceId);
|
||||
const success = outcome ? outcome.status === "completed" : true; // assume success if not found
|
||||
const tokensUsed = outcome?.tokens ?? 0;
|
||||
const costUsd = outcome?.cost ?? 0;
|
||||
const timestamp = new Date().toISOString();
|
||||
|
||||
if (!newData[unitType]) newData[unitType] = {};
|
||||
|
||||
const canonicalId = tryCanonicalIdFor(routeKey);
|
||||
if (canonicalId === null) {
|
||||
unmapped++;
|
||||
if (!newData[unitType]["_unmapped"]) {
|
||||
newData[unitType]["_unmapped"] = { by_route: {} };
|
||||
}
|
||||
const unmappedEntry = newData[unitType]["_unmapped"];
|
||||
if (!unmappedEntry.by_route[routeKey]) {
|
||||
unmappedEntry.by_route[routeKey] = emptyRouteStats(timestamp);
|
||||
}
|
||||
const rs = unmappedEntry.by_route[routeKey];
|
||||
if (success) rs.successes++;
|
||||
else rs.failures++;
|
||||
rs.totalTokens += tokensUsed;
|
||||
rs.totalCost += costUsd;
|
||||
rs.lastUsed = timestamp;
|
||||
} else {
|
||||
written++;
|
||||
if (!newData[unitType][canonicalId]) {
|
||||
newData[unitType][canonicalId] = {
|
||||
aggregate: emptyRouteStats(timestamp),
|
||||
by_route: {},
|
||||
};
|
||||
}
|
||||
const canonicalEntry = newData[unitType][canonicalId];
|
||||
if (!canonicalEntry.by_route[routeKey]) {
|
||||
canonicalEntry.by_route[routeKey] = emptyRouteStats(timestamp);
|
||||
}
|
||||
const rs = canonicalEntry.by_route[routeKey];
|
||||
if (success) rs.successes++;
|
||||
else rs.failures++;
|
||||
rs.totalTokens += tokensUsed;
|
||||
rs.totalCost += costUsd;
|
||||
rs.lastUsed = timestamp;
|
||||
recomputeAggregate(canonicalEntry);
|
||||
}
|
||||
}
|
||||
|
||||
// ── Phase 4: Backup old file and write new one ────────────────────────────
|
||||
let backupPath = null;
|
||||
if (existsSync(perfPath)) {
|
||||
backupPath = `${perfPath}.bak.${Date.now()}`;
|
||||
renameSync(perfPath, backupPath);
|
||||
}
|
||||
writeFileSync(perfPath, JSON.stringify(newData, null, 2), "utf-8");
|
||||
|
||||
return { written, unmapped, backupPath };
|
||||
}
|
||||
|
|
|
|||
|
|
@ -278,6 +278,35 @@ const CANONICAL_BY_ROUTE: Record<RouteKey, CanonicalId> = {
|
|||
// opencode wire IDs that need to map through
|
||||
"opencode/kimi-k2.5": "kimi-k2.5",
|
||||
"opencode-go/kimi-k2.5": "kimi-k2.5",
|
||||
"opencode-go/glm-5": "glm-5",
|
||||
"opencode-go/glm-5.1": "glm-5.1",
|
||||
"opencode-go/mimo-v2-omni": "mimo-v2-omni",
|
||||
"opencode-go/mimo-v2-pro": "mimo-v2-pro",
|
||||
// ── google-gemini-cli ─────────────────────────────────────────────────────
|
||||
// Wire IDs are already canonical; listed explicitly so CANONICAL_BY_ROUTE
|
||||
// takes precedence over the _ENTRY_BY_ROUTE fallback, making the mapping
|
||||
// robust even if _canonicalIdForFn is resolved before _ENTRY_BY_ROUTE is built.
|
||||
"google-gemini-cli/gemini-2.5-pro": "gemini-2.5-pro",
|
||||
"google-gemini-cli/gemini-3-flash-preview": "gemini-3-flash-preview",
|
||||
"google-gemini-cli/gemini-3.1-pro-preview": "gemini-3.1-pro-preview",
|
||||
// ── mistral (direct, not via openrouter) ─────────────────────────────────
|
||||
"mistral/codestral-latest": "codestral-latest",
|
||||
"mistral/devstral-2512": "devstral-2512",
|
||||
"mistral/devstral-medium-2507": "devstral-medium-latest",
|
||||
"mistral/devstral-small-2507": "devstral-small-2507",
|
||||
"mistral/mistral-large-latest": "mistral-large-latest",
|
||||
"mistral/mistral-medium-latest": "mistral-medium-latest",
|
||||
"mistral/mistral-small-latest": "mistral-small-latest",
|
||||
// ── zai (direct, not via openrouter) ─────────────────────────────────────
|
||||
"zai/glm-4.5": "glm-4.5",
|
||||
"zai/glm-4.5-air": "glm-4.5-air",
|
||||
"zai/glm-4.5-flash": "glm-4.5-flash",
|
||||
"zai/glm-4.6": "glm-4.6",
|
||||
"zai/glm-4.7": "glm-4.7",
|
||||
"zai/glm-4.7-flash": "glm-4.7-flash",
|
||||
"zai/glm-5": "glm-5",
|
||||
"zai/glm-5-turbo": "glm-5-turbo",
|
||||
"zai/glm-5.1": "glm-5.1",
|
||||
// ── openrouter ────────────────────────────────────────────────────────────
|
||||
"openrouter/anthropic/claude-3-haiku": "claude-3-haiku",
|
||||
"openrouter/anthropic/claude-3.5-haiku": "claude-3-5-haiku",
|
||||
|
|
|
|||
216
src/resources/extensions/sf/tests/canonical-id-mapping.test.mjs
Normal file
216
src/resources/extensions/sf/tests/canonical-id-mapping.test.mjs
Normal file
|
|
@ -0,0 +1,216 @@
|
|||
/**
|
||||
* canonical-id-mapping.test.mjs
|
||||
*
|
||||
* Verify that canonicalIdFor() maps every production route key that has
|
||||
* been observed in the journal to a non-null canonical id, and that
|
||||
* backfillModelPerformanceFromJournal() produces non-_unmapped buckets
|
||||
* when given a mock journal.
|
||||
*/
|
||||
import { mkdtempSync, rmSync, writeFileSync, mkdirSync } from "node:fs";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { afterEach, beforeEach, describe, expect, it } from "vitest";
|
||||
|
||||
// ── canonicalIdFor tests ──────────────────────────────────────────────────────
|
||||
|
||||
describe("canonicalIdFor — production route keys", async () => {
|
||||
const { canonicalIdFor } = await import("../model-registry.js");
|
||||
|
||||
// Route keys observed in .sf/journal/* via model_auto_resolved events.
|
||||
const expectedMappings = [
|
||||
// kimi-coding
|
||||
["kimi-coding/kimi-k2.6", "kimi-k2.6"],
|
||||
["kimi-coding/kimi-for-coding", "kimi-k2.6"], // alias → same canonical
|
||||
// minimax
|
||||
["minimax/MiniMax-M2.7-highspeed", "MiniMax-M2.7-highspeed"],
|
||||
["minimax/MiniMax-M2.7", "MiniMax-M2.7"],
|
||||
["minimax/MiniMax-M2.1", "minimax-m2.1"],
|
||||
// zai (direct provider, not via openrouter)
|
||||
["zai/glm-4.5", "glm-4.5"],
|
||||
// opencode-go
|
||||
["opencode-go/kimi-k2.5", "kimi-k2.5"],
|
||||
// mistral (direct provider, not via openrouter)
|
||||
["mistral/codestral-latest", "codestral-latest"],
|
||||
// google-gemini-cli
|
||||
["google-gemini-cli/gemini-3-flash-preview", "gemini-3-flash-preview"],
|
||||
["google-gemini-cli/gemini-3.1-pro-preview", "gemini-3.1-pro-preview"],
|
||||
["google-gemini-cli/gemini-2.5-pro", "gemini-2.5-pro"],
|
||||
];
|
||||
|
||||
for (const [routeKey, expectedCanonical] of expectedMappings) {
|
||||
it(`maps "${routeKey}" → "${expectedCanonical}"`, () => {
|
||||
const result = canonicalIdFor(routeKey);
|
||||
expect(result).not.toBeNull();
|
||||
expect(result).toBe(expectedCanonical);
|
||||
});
|
||||
}
|
||||
|
||||
it("returns null for nonexistent/foo (no false positives)", () => {
|
||||
expect(canonicalIdFor("nonexistent/foo")).toBeNull();
|
||||
});
|
||||
|
||||
it("bare canonical id (no provider prefix) resolves via _ENTRY_BY_ROUTE fallback to itself", () => {
|
||||
// "kimi-k2.6" is a wire_id under kimi-coding — if it happened to be a
|
||||
// provider-less route key it has no _ENTRY_BY_ROUTE entry, so returns null.
|
||||
// This test documents the expected behaviour: bare ids WITHOUT a provider
|
||||
// are not looked up (they never appear as route keys in practice).
|
||||
// Bare ids used as canonicalId strings are never passed to canonicalIdFor.
|
||||
expect(canonicalIdFor("kimi-k2.6")).toBeNull();
|
||||
});
|
||||
});
|
||||
|
||||
// ── backfillModelPerformanceFromJournal tests ─────────────────────────────────
|
||||
|
||||
describe("backfillModelPerformanceFromJournal — mock journal", async () => {
|
||||
const { backfillModelPerformanceFromJournal, setRegistryResolver } =
|
||||
await import("../model-learner.js");
|
||||
|
||||
let tmpDir;
|
||||
|
||||
beforeEach(() => {
|
||||
tmpDir = mkdtempSync(join(tmpdir(), "sf-backfill-test-"));
|
||||
mkdirSync(join(tmpDir, ".sf", "journal"), { recursive: true });
|
||||
|
||||
// Inject a stub resolver so tests don't need the full @singularity-forge/ai stack.
|
||||
setRegistryResolver((rk) => {
|
||||
const mappings = {
|
||||
"kimi-coding/kimi-k2.6": "kimi-k2.6",
|
||||
"minimax/MiniMax-M2.7-highspeed": "MiniMax-M2.7-highspeed",
|
||||
"zai/glm-4.5": "glm-4.5",
|
||||
"mistral/codestral-latest": "codestral-latest",
|
||||
"opencode-go/kimi-k2.5": "kimi-k2.5",
|
||||
};
|
||||
return mappings[rk] ?? null;
|
||||
});
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
rmSync(tmpDir, { recursive: true, force: true });
|
||||
// Reset to null so other tests use the real registry.
|
||||
setRegistryResolver(null);
|
||||
});
|
||||
|
||||
function writeJournalLine(date, event) {
|
||||
const file = join(tmpDir, ".sf", "journal", `${date}.jsonl`);
|
||||
writeFileSync(file, JSON.stringify(event) + "\n", { flag: "a" });
|
||||
}
|
||||
|
||||
it("maps dispatched routes to non-_unmapped canonical buckets", async () => {
|
||||
// Dispatch: kimi-coding/kimi-k2.6 on traceId aaa
|
||||
writeJournalLine("2026-05-15", {
|
||||
schemaVersion: 1,
|
||||
eventType: "model_auto_resolved",
|
||||
data: {
|
||||
eventType: "model_auto_resolved",
|
||||
unitType: "execute-task",
|
||||
traceId: "aaa",
|
||||
data: {
|
||||
resolvedProvider: "kimi-coding",
|
||||
resolvedModel: "kimi-k2.6",
|
||||
routing: { tier: "standard", modelDowngraded: false },
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
// Completion for traceId aaa
|
||||
writeJournalLine("2026-05-15", {
|
||||
schemaVersion: 1,
|
||||
eventType: "unit_completed",
|
||||
data: {
|
||||
eventType: "unit_completed",
|
||||
traceId: "aaa",
|
||||
data: {
|
||||
status: "completed",
|
||||
tokens: 1000,
|
||||
cost_usd: 0.01,
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
// Dispatch: zai/glm-4.5 on traceId bbb (no completion = assume success)
|
||||
writeJournalLine("2026-05-15", {
|
||||
schemaVersion: 1,
|
||||
eventType: "model_auto_resolved",
|
||||
data: {
|
||||
eventType: "model_auto_resolved",
|
||||
unitType: "research-slice",
|
||||
traceId: "bbb",
|
||||
data: {
|
||||
resolvedProvider: "zai",
|
||||
resolvedModel: "glm-4.5",
|
||||
routing: { tier: "standard", modelDowngraded: false },
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
const result = await backfillModelPerformanceFromJournal(tmpDir);
|
||||
|
||||
expect(result.written).toBe(2);
|
||||
expect(result.unmapped).toBe(0);
|
||||
|
||||
// Verify the written file has the correct structure.
|
||||
const { readFileSync } = await import("node:fs");
|
||||
const perfPath = join(tmpDir, ".sf", "model-performance.json");
|
||||
const data = JSON.parse(readFileSync(perfPath, "utf-8"));
|
||||
|
||||
// execute-task → kimi-k2.6 bucket
|
||||
expect(data["execute-task"]).toBeDefined();
|
||||
expect(data["execute-task"]["kimi-k2.6"]).toBeDefined();
|
||||
expect(data["execute-task"]["kimi-k2.6"].aggregate.successes).toBe(1);
|
||||
expect(data["execute-task"]["_unmapped"]).toBeUndefined();
|
||||
|
||||
// research-slice → glm-4.5 bucket
|
||||
expect(data["research-slice"]["glm-4.5"]).toBeDefined();
|
||||
expect(data["research-slice"]["glm-4.5"].aggregate.successes).toBe(1);
|
||||
});
|
||||
|
||||
it("puts unknown routes into _unmapped (no false positives)", async () => {
|
||||
writeJournalLine("2026-05-15", {
|
||||
schemaVersion: 1,
|
||||
eventType: "model_auto_resolved",
|
||||
data: {
|
||||
eventType: "model_auto_resolved",
|
||||
unitType: "execute-task",
|
||||
traceId: "zzz",
|
||||
data: {
|
||||
resolvedProvider: "unknown-provider",
|
||||
resolvedModel: "mystery-model",
|
||||
routing: { tier: "standard", modelDowngraded: false },
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
const result = await backfillModelPerformanceFromJournal(tmpDir);
|
||||
expect(result.unmapped).toBe(1);
|
||||
expect(result.written).toBe(0);
|
||||
});
|
||||
|
||||
it("backs up existing model-performance.json before overwriting", async () => {
|
||||
const perfPath = join(tmpDir, ".sf", "model-performance.json");
|
||||
writeFileSync(perfPath, JSON.stringify({ old: true }), "utf-8");
|
||||
|
||||
// Write a minimal dispatch so the function has something to process.
|
||||
writeJournalLine("2026-05-15", {
|
||||
schemaVersion: 1,
|
||||
eventType: "model_auto_resolved",
|
||||
data: {
|
||||
eventType: "model_auto_resolved",
|
||||
unitType: "execute-task",
|
||||
traceId: "ccc",
|
||||
data: {
|
||||
resolvedProvider: "kimi-coding",
|
||||
resolvedModel: "kimi-k2.6",
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
const result = await backfillModelPerformanceFromJournal(tmpDir);
|
||||
expect(result.backupPath).not.toBeNull();
|
||||
expect(result.backupPath).toMatch(/model-performance\.json\.bak\.\d+$/);
|
||||
|
||||
// Old content is in the backup.
|
||||
const { readFileSync } = await import("node:fs");
|
||||
const backup = JSON.parse(readFileSync(result.backupPath, "utf-8"));
|
||||
expect(backup.old).toBe(true);
|
||||
});
|
||||
});
|
||||
Loading…
Add table
Reference in a new issue