feat(sf): generation-aware failover + canonical-keyed metrics

Two parallel refactors building on the model-registry consolidation:

1. Generation-aware failover (model-route-failure.js, agent-end-recovery.js)

   - resolveNextModelRoute now takes unitType so it knows whether the
     caller is solver-pinned per ADR-0079 (autonomous-solver). When pinned,
     rejects candidates whose canonicalIdFor() differs from the failed
     route's canonical id — closes the latent solver-invariant violation
     where kimi-coding/kimi-k2.6 could silently fail over to
     ollama-cloud/kimi-k2.5:cloud (different generation).
   - Cross-generation failover in non-pinned units now emits a structured
     logWarning so generation downgrades are visible in traces instead of
     looking like an equivalent route switch.

2. Canonical-keyed performance metrics (model-learner.js)

   - .sf/model-performance.json now keys by canonical_id with an
     {aggregate, by_route} sub-shape instead of fused provider/wire-model
     strings. Cross-route history per model is now coherent — kimi-k2.6
     reached via kimi-coding accumulates into the same aggregate as
     reached via openrouter.
   - Migration runs at boot: detects old shape (no 'aggregate' key in
     unit-type blob values), distributes each entry into by_route,
     recomputes aggregate, writes a backup to
     .sf/model-performance.json.pre-canonical-backup. Unmappable route
     keys land in _unmapped so nothing is dropped.
   - getRouteStats(taskType, routeKey) added for per-route failover
     ordering; existing getRankedModels emits canonical IDs for
     cross-route strength queries.

3. Tests

   - model-registry.test.ts: bundled in this commit (Swarm A's test file
     was left untracked when the registry module was committed).
   - model-route-failure.test.ts: 12 tests covering solver-pin guard,
     same-canonical multi-route failover, generation-downgrade log emit.
   - model-learner-canonical.test.ts: 17 tests covering migration
     round-trip, aggregate invariant, _unmapped bucket, and zero-default
     reads.
   - model-learner.test.ts: one existing test updated for the new
     _unmapped.by_route shape on bare model IDs.

4. Results

   - Targeted tests: 147/147 across registry, route-failure, learner,
     learner-canonical.
   - Full npm run test:unit: 4707 pass, 0 fail, 83 skipped (no new
     regressions vs pre-edit baseline of 4669).

Work parallelized across two Sonnet 4.6 sub-agents in isolated git
worktrees. Contract authored in docs/dev/drafts/model-registry-contract.md
(committed earlier in 1d753af6b) and consumed by both agents.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Mikael Hugo 2026-05-14 04:15:08 +02:00
parent 09bc50f0f6
commit 7570aac4b7
7 changed files with 1640 additions and 52 deletions

View file

@ -84,6 +84,7 @@ async function trySwitchToFallbackModel(args) {
availableModels,
failedRoutes: getCurrentUnitModelFailures(),
isBlocked,
unitType: args.unitType,
});
if (!nextRoute) return false;
const ok = await args.pi.setModel(nextRoute.model, {

View file

@ -21,27 +21,180 @@ import { dirname, join } from "node:path";
const MODEL_FAILURE_LOG_SCHEMA_VERSION = 1;
/**
* Reference to canonicalIdFor from model-registry.
*
* Default: null (all routes go to _unmapped).
* Override in tests via setRegistryResolver() to inject a stub.
* In production, this module wires in canonicalIdFor from model-registry.js
* via a lazy, fire-and-forget dynamic import() issued at module load.
*/
let _canonicalIdForFn = null;
/**
 * Resolve a route key ("provider/wire-id") to a canonical model id using the
 * currently injected registry resolver.
 *
 * Never throws: a missing resolver, a throwing resolver, and a resolver that
 * yields anything other than a non-empty string all produce `null`, which
 * callers treat as "route is not mappable" (it is filed under `_unmapped`).
 *
 * @param {string} routeKey - route identity, e.g. "kimi-coding/kimi-k2.6"
 * @returns {string | null} canonical id, or null when the route is unmapped
 */
function tryCanonicalIdFor(routeKey) {
  if (typeof _canonicalIdForFn !== "function") return null;
  try {
    const canonicalId = _canonicalIdForFn(routeKey);
    // Callers compare the result against `null` and use it as an object key,
    // so `undefined`, "" and non-string values must not leak through.
    return typeof canonicalId === "string" && canonicalId !== ""
      ? canonicalId
      : null;
  } catch {
    return null;
  }
}
/**
 * Inject the canonicalIdFor implementation used to map route keys to
 * canonical model ids.
 *
 * In tests, call this before constructing ModelPerformanceTracker:
 *   setRegistryResolver((rk) => rk === "kimi-coding/kimi-k2.6" ? "kimi-k2.6" : null)
 *
 * Passing `null` (or any non-function) clears the resolver, which sends every
 * route to `_unmapped`. Normalising to `null` matters because the lazy
 * registry wiring in this module only fills the resolver in while it is
 * exactly `null`.
 *
 * @param {((routeKey: string) => string | null) | null} fn - resolver, or null to reset
 */
export function setRegistryResolver(fn) {
  _canonicalIdForFn = typeof fn === "function" ? fn : null;
}
// Lazy registry wiring. model-learner.js must stay importable without
// @singularity-forge/ai (e.g. in tests that don't load the full AI package),
// so the registry is pulled in with a fire-and-forget dynamic import rather
// than a static one. Until it resolves, recorded outcomes go to _unmapped and
// are preserved there for re-resolution on next format migration. A resolver
// that was already injected is never overwritten.
void (async () => {
  try {
    const registry = await import("./model-registry.js");
    const resolver = registry?.canonicalIdFor;
    if (_canonicalIdForFn === null && typeof resolver === "function") {
      _canonicalIdForFn = resolver;
    }
  } catch {
    // Registry unavailable (tests, stripped builds, etc.) — routes go to _unmapped.
  }
})();
/**
 * Decide whether a unit-type blob from the performance file still uses the
 * OLD flat format ({ "provider/wire-id": { successes, failures, ... } })
 * rather than the NEW canonical format
 * ({ "canonical-id": { aggregate, by_route }, "_unmapped": { by_route } }).
 *
 * Detection rule:
 *  - any object-valued entry carrying `aggregate` OR `by_route` marks the blob
 *    as new-format. `_unmapped` has only `by_route`, so checking `aggregate`
 *    alone would misclassify a blob that holds nothing but unmapped routes and
 *    cause it to be migrated again;
 *  - otherwise the blob is old-format only if it holds at least one
 *    object-valued (legacy stats) entry. Empty blobs need no migration.
 *
 * @param {unknown} unitTypeBlob - value stored under one unit type
 * @returns {boolean} true when the blob must be migrated
 */
function isOldFormat(unitTypeBlob) {
  if (!unitTypeBlob || typeof unitTypeBlob !== "object") return false;
  let sawLegacyEntry = false;
  for (const entry of Object.values(unitTypeBlob)) {
    if (!entry || typeof entry !== "object") continue;
    if ("aggregate" in entry || "by_route" in entry) {
      return false; // new-format entry found
    }
    sawLegacyEntry = true;
  }
  return sawLegacyEntry;
}
/**
 * Migrate a single unit-type blob from the old flat format to the new
 * canonical format and return the migrated blob (the input is not mutated).
 *
 * - Legacy entries ({ successes, failures, ... } keyed by route) are filed
 *   under `by_route` of their canonical id, or under `_unmapped` when the
 *   route cannot be resolved.
 * - Entries that are already canonical-shaped (they carry a `by_route` map,
 *   e.g. an existing `_unmapped` bucket) are carried over route by route
 *   instead of being flattened into a zeroed record, so no history is lost.
 * - Aggregates are recomputed once per canonical bucket after all routes have
 *   been distributed.
 *
 * @param {Record<string, object>} oldBlob - unit-type blob to migrate
 * @returns {Record<string, object>} blob in canonical format
 */
function migrateUnitTypeBlob(oldBlob) {
  const newBlob = {};

  // Create the destination bucket on first use. `_unmapped` has no aggregate.
  const bucketFor = (bucketKey, fallbackLastUsed) => {
    if (!newBlob[bucketKey]) {
      newBlob[bucketKey] =
        bucketKey === "_unmapped"
          ? { by_route: {} }
          : {
              aggregate: {
                successes: 0,
                failures: 0,
                timeouts: 0,
                totalTokens: 0,
                totalCost: 0,
                lastUsed: fallbackLastUsed,
              },
              by_route: {},
            };
    }
    return newBlob[bucketKey];
  };

  for (const [key, stats] of Object.entries(oldBlob ?? {})) {
    if (!stats || typeof stats !== "object") continue;

    if (stats.by_route && typeof stats.by_route === "object") {
      // Already canonical-shaped: keep its per-route records verbatim.
      const bucket = bucketFor(key, new Date().toISOString());
      for (const [routeKey, routeStats] of Object.entries(stats.by_route)) {
        if (routeStats && typeof routeStats === "object") {
          bucket.by_route[routeKey] = { ...routeStats };
        }
      }
      continue;
    }

    const lastUsed = stats.lastUsed ?? new Date().toISOString();
    const bucket = bucketFor(tryCanonicalIdFor(key) ?? "_unmapped", lastUsed);
    bucket.by_route[key] = {
      successes: stats.successes ?? 0,
      failures: stats.failures ?? 0,
      timeouts: stats.timeouts ?? 0,
      totalTokens: stats.totalTokens ?? 0,
      totalCost: stats.totalCost ?? 0,
      lastUsed,
    };
  }

  // Recompute each aggregate as the sum of its by_route entries.
  for (const [bucketKey, bucket] of Object.entries(newBlob)) {
    if (bucketKey !== "_unmapped") recomputeAggregate(bucket);
  }
  return newBlob;
}
/**
 * Rebuild `canonicalEntry.aggregate` as the sum of all of its `by_route`
 * records, maintaining the invariant
 *   aggregate.successes === sum(by_route[*].successes)
 * (and likewise for failures, timeouts, totalTokens and totalCost).
 *
 * `aggregate.lastUsed` is the greatest `lastUsed` among the routes, compared
 * as strings, and stays "" when no route carries one. A missing `by_route`
 * map or a null route record is treated as contributing nothing.
 *
 * @param {{ by_route?: Record<string, object>, aggregate?: object }} canonicalEntry
 *   entry to update in place
 * @returns {void}
 */
function recomputeAggregate(canonicalEntry) {
  const totals = {
    successes: 0,
    failures: 0,
    timeouts: 0,
    totalTokens: 0,
    totalCost: 0,
    lastUsed: "",
  };
  for (const route of Object.values(canonicalEntry.by_route ?? {})) {
    if (!route || typeof route !== "object") continue;
    totals.successes += route.successes ?? 0;
    totals.failures += route.failures ?? 0;
    totals.timeouts += route.timeouts ?? 0;
    totals.totalTokens += route.totalTokens ?? 0;
    totals.totalCost += route.totalCost ?? 0;
    // Only ever replace lastUsed with a real, later value, so a route without
    // a timestamp cannot clobber it with undefined.
    if (route.lastUsed && route.lastUsed > totals.lastUsed) {
      totals.lastUsed = route.lastUsed;
    }
  }
  canonicalEntry.aggregate = totals;
}
/**
 * Build a fresh stats record with every counter at zero.
 *
 * @param {string} timestamp - value stored as `lastUsed`
 * @returns {{successes: number, failures: number, timeouts: number,
 *   totalTokens: number, totalCost: number, lastUsed: string}}
 */
function emptyRouteStats(timestamp) {
  const zeroedCounters = {
    successes: 0,
    failures: 0,
    timeouts: 0,
    totalTokens: 0,
    totalCost: 0,
  };
  return { ...zeroedCounters, lastUsed: timestamp };
}
/**
* Per-task-type model performance tracker.
*
* Schema:
* New schema (v2 canonical-keyed):
* {
* "execute-task": {
* "gpt-4o": {
* "successes": 42,
* "failures": 3,
* "timeouts": 1,
* "totalTokens": 1500000,
* "totalCost": 45.50,
* "lastUsed": "2026-05-06T16:30:00Z",
* "successRate": 0.93
* "<unit-type>": {
* "<canonical-id>": {
* "aggregate": { successes, failures, timeouts, totalTokens, totalCost, lastUsed },
* "by_route": { "<provider/wire-model>": { successes, failures, ... } }
* },
* "claude-opus": {
* ...
* "_unmapped": {
* "by_route": { "<route>": { ... } }
* }
* },
* "plan-slice": { ... }
* }
* }
*
* Old schema (v1 fused route key as top-level key) is migrated on boot.
*/
class ModelPerformanceTracker {
constructor(basePath) {
@ -61,12 +214,66 @@ class ModelPerformanceTracker {
}
try {
const content = readFileSync(this.storagePath, "utf-8");
return JSON.parse(content);
const parsed = JSON.parse(content);
return this._migrateIfNeeded(parsed);
} catch {
return {};
}
}
/**
* Detect and migrate old-format data in-place. Writes backup + new file on
* migration. Idempotent: if backup already exists, skip.
*/
_migrateIfNeeded(parsed) {
// Check if any unit-type blob is still in old format
let needsMigration = false;
for (const unitTypeBlob of Object.values(parsed)) {
if (typeof unitTypeBlob === "object" && unitTypeBlob !== null && isOldFormat(unitTypeBlob)) {
needsMigration = true;
break;
}
}
if (!needsMigration) return parsed;
// Write backup (idempotent — only if backup doesn't already exist)
const backupPath = this.storagePath + ".pre-canonical-backup";
if (!existsSync(backupPath)) {
try {
writeFileSync(backupPath, JSON.stringify(parsed, null, 2), "utf-8");
} catch {
// Non-fatal: backup failure should not block migration
}
}
// Migrate each unit type
const migrated = {};
for (const [unitType, unitTypeBlob] of Object.entries(parsed)) {
if (typeof unitTypeBlob !== "object" || unitTypeBlob === null) {
migrated[unitType] = unitTypeBlob;
continue;
}
if (isOldFormat(unitTypeBlob)) {
migrated[unitType] = migrateUnitTypeBlob(unitTypeBlob);
} else {
migrated[unitType] = unitTypeBlob;
}
}
// Write migrated data back to disk
try {
const dir = dirname(this.storagePath);
if (!existsSync(dir)) {
mkdirSync(dir, { recursive: true });
}
writeFileSync(this.storagePath, JSON.stringify(migrated, null, 2), "utf-8");
} catch {
// Non-fatal
}
return migrated;
}
_save() {
if (!this.storagePath) {
return;
@ -87,11 +294,15 @@ class ModelPerformanceTracker {
}
/**
* Record outcome for a model on a specific task type.
* Record outcome for a route key on a specific task type.
*
* @param taskType - e.g. "execute-task"
* @param routeKey - format: "provider/wire-model" (e.g. "kimi-coding/kimi-k2.6")
* OR a bare model id for backward-compat (no slash = treated as routeKey)
*/
recordOutcome(
taskType,
modelId,
routeKey,
outcomeOrSuccess,
timeoutArg = false,
tokensUsedArg = 0,
@ -117,19 +328,46 @@ class ModelPerformanceTracker {
if (!this.data[taskType]) {
this.data[taskType] = {};
}
if (!this.data[taskType][modelId]) {
this.data[taskType][modelId] = {
successes: 0,
failures: 0,
timeouts: 0,
totalTokens: 0,
totalCost: 0,
lastUsed: timestamp,
successRate: 0,
};
// Resolve canonical id. Routes with no slash are legacy bare model ids
// — treat them as their own route key, try registry first.
const canonicalId = tryCanonicalIdFor(routeKey);
if (canonicalId === null) {
// Route not in registry → write to _unmapped
if (!this.data[taskType]["_unmapped"]) {
this.data[taskType]["_unmapped"] = { by_route: {} };
}
const unmapped = this.data[taskType]["_unmapped"];
if (!unmapped.by_route[routeKey]) {
unmapped.by_route[routeKey] = emptyRouteStats(timestamp);
}
const rs = unmapped.by_route[routeKey];
this._applyOutcomeToStats(rs, success, timeout, tokensUsed, costUsd, timestamp);
} else {
// Known route → write to by_route + recompute aggregate
if (!this.data[taskType][canonicalId]) {
this.data[taskType][canonicalId] = {
aggregate: emptyRouteStats(timestamp),
by_route: {},
};
}
const canonicalEntry = this.data[taskType][canonicalId];
if (!canonicalEntry.by_route[routeKey]) {
canonicalEntry.by_route[routeKey] = emptyRouteStats(timestamp);
}
const rs = canonicalEntry.by_route[routeKey];
this._applyOutcomeToStats(rs, success, timeout, tokensUsed, costUsd, timestamp);
recomputeAggregate(canonicalEntry);
}
const stats = this.data[taskType][modelId];
this._save();
}
/**
* Apply a single outcome event to a stats object in-place.
*/
_applyOutcomeToStats(stats, success, timeout, tokensUsed, costUsd, timestamp) {
if (success) {
stats.successes += 1;
} else if (timeout) {
@ -138,50 +376,144 @@ class ModelPerformanceTracker {
} else {
stats.failures += 1;
}
stats.totalTokens += tokensUsed;
stats.totalCost += costUsd;
stats.lastUsed = timestamp;
const total = stats.successes + stats.failures;
stats.total = total;
stats.successRate = total > 0 ? stats.successes / total : 0;
this._save();
}
/**
* Get performance stats for a task type and model.
*
* When routeMode=false (default): looks up aggregate stats for a canonical id.
* When routeMode=true: looks up by_route stats for a specific routeKey.
*
* Backward-compat fallback: if the id is not found as a canonical, also checks
* _unmapped.by_route and all by_route maps, which supports bare model ids used in
* tests and legacy callers that don't have the registry wired.
*
* @param taskType - e.g. "execute-task"
* @param canonicalOrRouteKey - canonical id (aggregate) or routeKey (by-route)
* @param routeMode - when true, returns by_route stats
*/
getStats(taskType, modelId) {
return this.data[taskType]?.[modelId] || null;
getStats(taskType, canonicalOrRouteKey, routeMode = false) {
const unitBlob = this.data[taskType];
if (!unitBlob) return null;
if (routeMode) {
// Explicit by-route lookup: scan all canonical entries and _unmapped
return this.getRouteStats(taskType, canonicalOrRouteKey);
}
// Aggregate mode: look up by canonical id first
const entry = unitBlob[canonicalOrRouteKey];
if (entry?.aggregate) {
const agg = entry.aggregate;
const total = agg.successes + agg.failures;
return {
...agg,
total,
successRate: total > 0 ? agg.successes / total : 0,
};
}
// Backward-compat fallback: look in by_route maps (for bare IDs and unmapped routes)
// This supports old tests that use bare model IDs without a registry resolver.
for (const [key, val] of Object.entries(unitBlob)) {
if (key === "_unmapped") {
if (val?.by_route?.[canonicalOrRouteKey]) {
const rs = val.by_route[canonicalOrRouteKey];
const total = rs.successes + rs.failures;
return { ...rs, total, successRate: total > 0 ? rs.successes / total : 0 };
}
} else if (val?.by_route?.[canonicalOrRouteKey]) {
const rs = val.by_route[canonicalOrRouteKey];
const total = rs.successes + rs.failures;
return { ...rs, total, successRate: total > 0 ? rs.successes / total : 0 };
}
}
return null;
}
/**
* Get stats for a specific route (by_route lookup across all canonical entries).
*/
getRouteStats(taskType, routeKey) {
const unitBlob = this.data[taskType];
if (!unitBlob) return null;
for (const [key, val] of Object.entries(unitBlob)) {
if (key === "_unmapped") {
if (val?.by_route?.[routeKey]) return val.by_route[routeKey];
} else if (val?.by_route?.[routeKey]) {
return val.by_route[routeKey];
}
}
return null;
}
/**
* Get all models for a task type, ranked by success rate.
*
* Primary mode: iterates canonical ids using aggregate stats.
* Backward-compat fallback: if no canonical entries exist (no registry
* wired), iterates _unmapped.by_route entries instead so legacy tests
* that use bare model IDs still work.
*/
getRankedModels(taskType, minSamples = 1) {
if (!this.data[taskType]) return [];
const models = Object.entries(this.data[taskType])
.filter(([, stats]) => stats.successes + stats.failures >= minSamples)
.map(([modelId, stats]) => ({
modelId,
successRate: stats.successRate,
attempts: stats.successes + stats.failures,
tokens: stats.totalTokens,
cost: stats.totalCost,
latestAttempt: stats.lastUsed,
}))
.sort((a, b) => b.successRate - a.successRate);
const models = [];
let hasCanonical = false;
return models;
for (const [key, entry] of Object.entries(this.data[taskType])) {
if (key === "_unmapped") continue;
// New format: entry has aggregate + by_route
const agg = entry?.aggregate;
if (!agg) continue;
hasCanonical = true;
const total = agg.successes + agg.failures;
if (total < minSamples) continue;
const successRate = total > 0 ? agg.successes / total : 0;
models.push({
modelId: key, // canonical id
successRate,
attempts: total,
tokens: agg.totalTokens ?? 0,
cost: agg.totalCost ?? 0,
latestAttempt: agg.lastUsed,
});
}
// Backward-compat: when no canonical entries exist (registry not wired),
// fall back to _unmapped.by_route so bare-ID tests still get rankings.
if (!hasCanonical) {
const unmapped = this.data[taskType]["_unmapped"];
if (unmapped?.by_route) {
for (const [routeKey, rs] of Object.entries(unmapped.by_route)) {
if (!rs) continue;
const total = (rs.successes ?? 0) + (rs.failures ?? 0);
if (total < minSamples) continue;
const successRate = total > 0 ? rs.successes / total : 0;
models.push({
modelId: routeKey,
successRate,
attempts: total,
tokens: rs.totalTokens ?? 0,
cost: rs.totalCost ?? 0,
latestAttempt: rs.lastUsed,
});
}
}
}
return models.sort((a, b) => b.successRate - a.successRate);
}
/**
* Check if a model should be demoted (fails >50% on this task type).
* Accepts a canonical id (aggregate demotion) or routeKey (route-level).
*/
shouldDemote(taskType, modelId, thresholdFailureRate = 0.5) {
// Try aggregate lookup first (canonical id)
const stats = this.getStats(taskType, modelId);
if (!stats) return false;

View file

@ -1,4 +1,34 @@
import { resolveModelId } from "./auto-model-selection.js";
import { canonicalIdFor, sameGeneration } from "./model-registry.js";
import { logWarning } from "./workflow-logger.js";
// ── Solver pinning (ADR-0079) ────────────────────────────────────────────────
// The autonomous solver pass is always locked to kimi-k2.6 (provider:
// kimi-coding) and must never cross canonical_id boundaries on failover.
// The unit type string "autonomous-solver" is the identifier introduced by
// ADR-0079 for the solver role. Other unit types run as executor and may
// cross canonical ids (with a structured downgrade log event).
const SOLVER_PINNED_UNIT_TYPE = "autonomous-solver";
/**
 * Report a failover that crossed a canonical-id or generation boundary.
 *
 * Emits one warning through workflow-logger's logWarning under the
 * "model-route-failure" component with the event name "generation-downgrade".
 * The payload always carries `sameGeneration: false`.
 *
 * @param {string} fromCanonical - canonical id of the route that failed
 * @param {string} toCanonical - canonical id of the chosen failover route
 * @param {string} unitType - active unit type at failover time
 * @param {string} [reason] - human-readable reason label; defaults to
 *   "cross-generation failover" when null or undefined
 */
export function logGenerationDowngrade(fromCanonical, toCanonical, unitType, reason) {
  const reasonLabel = reason ?? "cross-generation failover";
  const details = {
    from: fromCanonical,
    to: toCanonical,
    unitType,
    reason: reasonLabel,
    sameGeneration: false,
  };
  logWarning("model-route-failure", "generation-downgrade", details);
}
/**
* Build the stable identity key for a concrete provider route.
*
@ -80,9 +110,24 @@ export function resolveNextConfiguredModelRoute(args) {
*
* Consumer: bootstrap/agent-end-recovery.ts after configured fallback lookup
* fails for a model-route failure.
*
* Generation guard (ADR-0079):
* - If unitType is "autonomous-solver" (solver-pinned), candidates whose
* canonical_id differs from the failed route are silently skipped. The
* solver layer is a runtime invariant and must never silently degrade to a
* different model generation.
* - For all other unit types (executor layer), cross-canonical failover is
* permitted but emits a structured generation-downgrade log event so it is
* visible in traces and drainAndSummarize() audits.
*/
export function resolveNextAvailableModelRoute(args) {
const currentKey = args.current ? modelRouteKey(args.current) : undefined;
const currentRouteKey = args.current
? `${args.current.provider}/${args.current.id}`
: undefined;
const currentCanonical = currentRouteKey ? canonicalIdFor(currentRouteKey) : null;
const isSolverPinned = args.unitType === SOLVER_PINNED_UNIT_TYPE;
const failedKeys = new Set(
args.failedRoutes.map((failure) =>
modelRouteKey({ provider: failure.provider, id: failure.modelId }),
@ -93,6 +138,14 @@ export function resolveNextAvailableModelRoute(args) {
if (key === currentKey) return false;
if (failedKeys.has(key)) return false;
if (args.isBlocked?.(model)) return false;
// Solver pin: ADR-0079 — never cross canonical_id boundary when solving.
if (isSolverPinned && currentCanonical !== null) {
const candidateRouteKey = `${model.provider}/${model.id}`;
const candidateCanonical = canonicalIdFor(candidateRouteKey);
if (candidateCanonical !== currentCanonical) return false;
}
return true;
});
if (candidates.length === 0) return undefined;
@ -103,6 +156,25 @@ export function resolveNextAvailableModelRoute(args) {
model.provider.toLowerCase() !== args.current.provider.toLowerCase(),
);
const model = differentProvider ?? candidates[0];
// Generation guard: log a structured event when crossing canonical_id or
// generation boundaries on the executor layer (non-solver-pinned).
if (!isSolverPinned && currentCanonical !== null) {
const chosenRouteKey = `${model.provider}/${model.id}`;
const chosenCanonical = canonicalIdFor(chosenRouteKey);
if (
chosenCanonical !== null &&
!sameGeneration(currentCanonical, chosenCanonical)
) {
logGenerationDowngrade(
currentCanonical,
chosenCanonical,
args.unitType ?? "unknown",
"no same-generation route available",
);
}
}
return {
model,
route: `${model.provider}/${model.id}`,
@ -134,5 +206,6 @@ export function resolveNextModelRoute(args) {
availableModels: args.availableModels,
failedRoutes: args.failedRoutes,
isBlocked: args.isBlocked,
unitType: args.unitType,
});
}

View file

@ -0,0 +1,563 @@
/**
* Swarm C: canonical-keyed model performance metrics tests.
*
* Tests:
* 1. Migration round-trip: old-format file → boot loader → new file + backup exists.
* 2. Aggregate invariant: aggregate.successes === sum(by_route[*].successes).
* 3. _unmapped bucket: unknown route key lands in _unmapped, not dropped.
* 4. Reading: sensible defaults (null) for a never-seen canonical id.
* 5. Migration idempotency: running migration twice does not corrupt data.
* 6. Two routes same canonical: aggregate sums correctly.
*/
import { existsSync, mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, beforeEach, describe, expect, test } from "vitest";
import {
ModelLearner,
ModelPerformanceTracker,
setRegistryResolver,
} from "../model-learner.js";
// ── Stub registry ──────────────────────────────────────────────────────────────
// Inject a lightweight registry resolver that maps two test routes to the same
// canonical id, leaving all other routes unmapped. This avoids loading
// @singularity-forge/ai in tests.
//
// Route table:
// "kimi-coding/kimi-k2.6" → "kimi-k2.6"
// "openrouter/moonshotai/kimi-k2.6" → "kimi-k2.6"
// "anthropic/claude-sonnet-4-6" → "claude-sonnet-4-6"
// everything else → null (→ _unmapped)
/**
 * Build a stub canonicalIdFor resolver for tests.
 *
 * The built-in route table maps two kimi-k2.6 routes to "kimi-k2.6" and one
 * Anthropic route to "claude-sonnet-4-6". Entries in `table` are merged on top
 * and win on conflict; mapping a route to `null` un-maps it.
 *
 * Lookups go through a Map so that only routes actually listed in the table
 * resolve. Keys inherited from Object.prototype (e.g. "constructor") yield
 * null instead of a function.
 *
 * @param table - extra or overriding route → canonical-id mappings
 * @returns resolver returning the canonical id, or null for unknown routes
 */
function makeStubResolver(table: Record<string, string | null> = {}) {
  const defaultTable: Record<string, string | null> = {
    "kimi-coding/kimi-k2.6": "kimi-k2.6",
    "openrouter/moonshotai/kimi-k2.6": "kimi-k2.6",
    "anthropic/claude-sonnet-4-6": "claude-sonnet-4-6",
  };
  const routes = new Map<string, string | null>(
    Object.entries({ ...defaultTable, ...table }),
  );
  return (routeKey: string): string | null => routes.get(routeKey) ?? null;
}
// ── Helpers ───────────────────────────────────────────────────────────────────
/** Path of the `.sf` directory under the given base directory. */
function sfDir(base: string) {
  const stateDirName = ".sf";
  return join(base, stateDirName);
}
/** Path of the model-performance file under `<base>/.sf`. */
function perfFile(base: string) {
  const segments = [base, ".sf", "model-performance.json"];
  return join(...segments);
}
/** Path of the pre-canonical backup of the model-performance file under `<base>/.sf`. */
function backupFile(base: string) {
  const segments = [base, ".sf", "model-performance.json.pre-canonical-backup"];
  return join(...segments);
}
/** Read the file at `perfFile(base)` and return its parsed JSON content. */
function readPerf(base: string) {
  const rawJson = readFileSync(perfFile(base), "utf-8");
  return JSON.parse(rawJson);
}
/**
 * Seed a performance file for a test: create `sfDir(base)` if needed, then
 * write `data` as 2-space-indented JSON to `perfFile(base)`.
 */
function writeOldPerf(base: string, data: object) {
  mkdirSync(sfDir(base), { recursive: true });
  const target = perfFile(base);
  const serialized = JSON.stringify(data, null, 2);
  writeFileSync(target, serialized, "utf-8");
}
// ── Test suite ────────────────────────────────────────────────────────────────
describe("model-learner canonical schema (Swarm C)", () => {
let tmpDir: string;
beforeEach(() => {
tmpDir = join(tmpdir(), `test-ml-canonical-${Date.now()}-${Math.random().toString(36).slice(2)}`);
mkdirSync(tmpDir, { recursive: true });
// Wire stub resolver before each test
setRegistryResolver(makeStubResolver());
});
afterEach(() => {
rmSync(tmpDir, { recursive: true, force: true });
// Reset resolver to null so other test suites are unaffected
setRegistryResolver(null as unknown as (rk: string) => string | null);
});
// ── Test 1: Migration round-trip ────────────────────────────────────────
describe("migration round-trip", () => {
test("migrates old flat format to canonical schema on load", () => {
// Write old-format file
writeOldPerf(tmpDir, {
"execute-task": {
"kimi-coding/kimi-k2.6": {
successes: 5,
failures: 1,
timeouts: 0,
totalTokens: 10000,
totalCost: 0.5,
lastUsed: "2026-05-01T12:00:00Z",
successRate: 0.833,
},
"anthropic/claude-sonnet-4-6": {
successes: 3,
failures: 0,
timeouts: 0,
totalTokens: 6000,
totalCost: 0.3,
lastUsed: "2026-05-02T12:00:00Z",
successRate: 1.0,
},
},
});
// Boot tracker — migration happens on _load()
const tracker = new ModelPerformanceTracker(tmpDir);
// Migration triggers on disk read
// Verify backup was created
expect(existsSync(backupFile(tmpDir))).toBe(true);
// Verify new file has canonical schema
const data = readPerf(tmpDir);
const execBlob = data["execute-task"];
// kimi-coding/kimi-k2.6 → canonical "kimi-k2.6"
expect(execBlob["kimi-k2.6"]).toBeDefined();
expect(execBlob["kimi-k2.6"].aggregate).toBeDefined();
expect(execBlob["kimi-k2.6"].by_route).toBeDefined();
expect(execBlob["kimi-k2.6"].by_route["kimi-coding/kimi-k2.6"]).toBeDefined();
expect(execBlob["kimi-k2.6"].aggregate.successes).toBe(5);
expect(execBlob["kimi-k2.6"].aggregate.failures).toBe(1);
// anthropic/claude-sonnet-4-6 → canonical "claude-sonnet-4-6"
expect(execBlob["claude-sonnet-4-6"]).toBeDefined();
expect(execBlob["claude-sonnet-4-6"].aggregate.successes).toBe(3);
// Verify tracker in-memory state is also migrated
const stats = tracker.getStats("execute-task", "kimi-k2.6");
expect(stats).not.toBeNull();
expect(stats!.successes).toBe(5);
});
test("by_route entries are preserved after migration", () => {
writeOldPerf(tmpDir, {
"execute-task": {
"kimi-coding/kimi-k2.6": {
successes: 10,
failures: 2,
timeouts: 1,
totalTokens: 50000,
totalCost: 2.5,
lastUsed: "2026-05-10T00:00:00Z",
},
},
});
new ModelPerformanceTracker(tmpDir); // triggers migration
const data = readPerf(tmpDir);
const routeEntry = data["execute-task"]["kimi-k2.6"].by_route["kimi-coding/kimi-k2.6"];
expect(routeEntry).toBeDefined();
expect(routeEntry.successes).toBe(10);
expect(routeEntry.failures).toBe(2);
expect(routeEntry.timeouts).toBe(1);
expect(routeEntry.totalTokens).toBe(50000);
});
test("migration is idempotent — running twice produces identical result", () => {
writeOldPerf(tmpDir, {
"execute-task": {
"kimi-coding/kimi-k2.6": {
successes: 7,
failures: 1,
timeouts: 0,
totalTokens: 20000,
totalCost: 1.0,
lastUsed: "2026-05-05T12:00:00Z",
},
},
});
new ModelPerformanceTracker(tmpDir); // first migration
const dataAfterFirst = readPerf(tmpDir);
new ModelPerformanceTracker(tmpDir); // second load — should not re-migrate
const dataAfterSecond = readPerf(tmpDir);
expect(dataAfterSecond).toEqual(dataAfterFirst);
});
test("backup is written only once (idempotent)", () => {
writeOldPerf(tmpDir, {
"execute-task": {
"kimi-coding/kimi-k2.6": {
successes: 3,
failures: 0,
timeouts: 0,
totalTokens: 5000,
totalCost: 0.2,
lastUsed: "2026-05-06T00:00:00Z",
},
},
});
new ModelPerformanceTracker(tmpDir); // writes backup
const backupContent1 = readFileSync(backupFile(tmpDir), "utf-8");
// Overwrite the backup to detect if it gets re-written
writeFileSync(backupFile(tmpDir), '{"sentinel":true}', "utf-8");
new ModelPerformanceTracker(tmpDir); // should NOT overwrite backup
const backupContent2 = readFileSync(backupFile(tmpDir), "utf-8");
// If sentinel is still there, backup was not overwritten
expect(backupContent2).toBe('{"sentinel":true}');
});
});
// ── Test 2: Aggregate invariant ─────────────────────────────────────────
describe("aggregate invariant", () => {
test("aggregate.successes === sum(by_route[*].successes) after writes to two routes", () => {
const tracker = new ModelPerformanceTracker(tmpDir);
// Route 1: kimi-coding/kimi-k2.6 → canonical kimi-k2.6
tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", {
success: true,
timeout: false,
tokensUsed: 1000,
costUsd: 0.05,
});
tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", {
success: true,
timeout: false,
tokensUsed: 1200,
costUsd: 0.06,
});
tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", {
success: false,
timeout: false,
tokensUsed: 800,
costUsd: 0.04,
});
// Route 2: openrouter/moonshotai/kimi-k2.6 → same canonical kimi-k2.6
tracker.recordOutcome("execute-task", "openrouter/moonshotai/kimi-k2.6", {
success: true,
timeout: false,
tokensUsed: 2000,
costUsd: 0.1,
});
tracker.recordOutcome("execute-task", "openrouter/moonshotai/kimi-k2.6", {
success: false,
timeout: true,
tokensUsed: 0,
costUsd: 0,
});
const data = readPerf(tmpDir);
const canonicalEntry = data["execute-task"]["kimi-k2.6"];
const agg = canonicalEntry.aggregate;
const byRoute = canonicalEntry.by_route;
// Compute expected sums from by_route
const routeSuccesses = Object.values(byRoute).reduce(
(sum: number, r: any) => sum + (r.successes ?? 0),
0,
);
const routeFailures = Object.values(byRoute).reduce(
(sum: number, r: any) => sum + (r.failures ?? 0),
0,
);
const routeTimeouts = Object.values(byRoute).reduce(
(sum: number, r: any) => sum + (r.timeouts ?? 0),
0,
);
expect(agg.successes).toBe(routeSuccesses);
expect(agg.failures).toBe(routeFailures);
expect(agg.timeouts).toBe(routeTimeouts);
// Concrete values: 2 successes from route1, 1 success from route2 = 3 total
expect(agg.successes).toBe(3);
// Failures: 1 from route1 (non-timeout), 1 from route2 (timeout) = 2 total
expect(agg.failures).toBe(2);
// Timeouts: 1 from route2
expect(agg.timeouts).toBe(1);
});
test("aggregate is recalculated correctly after each write", () => {
const tracker = new ModelPerformanceTracker(tmpDir);
tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", {
success: true,
timeout: false,
tokensUsed: 100,
costUsd: 0.01,
});
let data = readPerf(tmpDir);
expect(data["execute-task"]["kimi-k2.6"].aggregate.successes).toBe(1);
tracker.recordOutcome("execute-task", "openrouter/moonshotai/kimi-k2.6", {
success: true,
timeout: false,
tokensUsed: 200,
costUsd: 0.02,
});
data = readPerf(tmpDir);
expect(data["execute-task"]["kimi-k2.6"].aggregate.successes).toBe(2);
tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", {
success: false,
timeout: false,
tokensUsed: 50,
costUsd: 0.005,
});
data = readPerf(tmpDir);
expect(data["execute-task"]["kimi-k2.6"].aggregate.successes).toBe(2);
expect(data["execute-task"]["kimi-k2.6"].aggregate.failures).toBe(1);
});
});
// ── Test 3: _unmapped bucket ───────────────────────────────────────────
describe("_unmapped bucket", () => {
  // Builds a non-timeout outcome payload from the fields that vary per test.
  const outcome = (success: boolean, tokensUsed: number, costUsd: number) => ({
    success,
    timeout: false,
    tokensUsed,
    costUsd,
  });
  // Fetches the execute-task "_unmapped" entry via readPerf.
  const readUnmapped = () => readPerf(tmpDir)["execute-task"]["_unmapped"];

  test("unknown route key lands in _unmapped, not dropped", () => {
    const perf = new ModelPerformanceTracker(tmpDir);
    perf.recordOutcome(
      "execute-task",
      "foo-provider/bar-model",
      outcome(true, 500, 0.02),
    );

    const bucket = readUnmapped();
    expect(bucket).toBeDefined();
    const routeStats = bucket.by_route["foo-provider/bar-model"];
    expect(routeStats).toBeDefined();
    expect(routeStats.successes).toBe(1);
  });

  test("_unmapped entry does NOT appear in getRankedModels when canonical entries exist", () => {
    const perf = new ModelPerformanceTracker(tmpDir);
    // First a route the test expects to rank as "kimi-k2.6", then one it
    // expects to be kept out of the ranking entirely.
    perf.recordOutcome(
      "execute-task",
      "kimi-coding/kimi-k2.6",
      outcome(true, 1000, 0.05),
    );
    perf.recordOutcome(
      "execute-task",
      "foo-provider/bar-model",
      outcome(true, 500, 0.02),
    );

    const rankedIds = perf
      .getRankedModels("execute-task", 0)
      .map((entry) => entry.modelId);
    expect(rankedIds).toContain("kimi-k2.6");
    expect(rankedIds).not.toContain("_unmapped");
    expect(rankedIds).not.toContain("foo-provider/bar-model");
  });

  test("_unmapped preserves multiple unknown routes independently", () => {
    const perf = new ModelPerformanceTracker(tmpDir);
    perf.recordOutcome(
      "execute-task",
      "unknown-a/model-x",
      outcome(true, 100, 0.01),
    );
    perf.recordOutcome(
      "execute-task",
      "unknown-b/model-y",
      outcome(false, 50, 0.005),
    );

    const { by_route: routes } = readUnmapped();
    expect(routes["unknown-a/model-x"].successes).toBe(1);
    expect(routes["unknown-b/model-y"].failures).toBe(1);
  });
});
// ── Test 4: Reading sensible defaults ──────────────────────────────────
describe("reading never-seen canonical ids", () => {
  // Every case reads from a tracker that has no data for the queried key.

  test("getStats returns null for a never-seen canonical id", () => {
    const stats = new ModelPerformanceTracker(tmpDir).getStats(
      "execute-task",
      "kimi-k2.6",
    );
    expect(stats).toBeNull();
  });

  test("getStats returns null for a never-seen task type", () => {
    const perf = new ModelPerformanceTracker(tmpDir);
    // Data exists for execute-task only; plan-slice has never been recorded.
    perf.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", {
      success: true,
      timeout: false,
      tokensUsed: 100,
      costUsd: 0.01,
    });
    const stats = perf.getStats("plan-slice", "kimi-k2.6");
    expect(stats).toBeNull();
  });

  test("getRouteStats returns null for a never-seen route", () => {
    const routeStats = new ModelPerformanceTracker(tmpDir).getRouteStats(
      "execute-task",
      "kimi-coding/kimi-k2.6",
    );
    expect(routeStats).toBeNull();
  });

  test("getRankedModels returns empty array for unknown task type", () => {
    const ranked = new ModelPerformanceTracker(tmpDir).getRankedModels(
      "nonexistent-type",
    );
    expect(ranked).toEqual([]);
  });

  test("shouldDemote returns false for a never-seen canonical id", () => {
    const demote = new ModelPerformanceTracker(tmpDir).shouldDemote(
      "execute-task",
      "kimi-k2.6",
    );
    expect(demote).toBe(false);
  });
});
// ── Test 5: ModelLearner integration ───────────────────────────────────
describe("ModelLearner canonical integration", () => {
  test("recordOutcome + getRankedModels uses canonical ids", () => {
    const learner = new ModelLearner(tmpDir);
    // Record 5 successes via route 1
    for (let i = 0; i < 5; i++) {
      learner.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", {
        success: true,
        timeout: false,
        tokensUsed: 1000,
        costUsd: 0.05,
      });
    }
    // Record 1 failure via route 2 (same canonical)
    learner.recordOutcome("execute-task", "openrouter/moonshotai/kimi-k2.6", {
      success: false,
      timeout: false,
      tokensUsed: 500,
      costUsd: 0.025,
    });
    const ranked = learner.getRankedModels("execute-task");
    expect(ranked.length).toBeGreaterThan(0);
    // Canonical id should appear in ranked list
    const kimiEntry = ranked.find((r) => r.modelId === "kimi-k2.6");
    expect(kimiEntry).toBeDefined();
    expect(kimiEntry!.attempts).toBe(6); // 5 + 1
    // Success rate: 5/6
    expect(kimiEntry!.successRate).toBeCloseTo(5 / 6, 3);
  });

  test("migration round-trip preserves by_route data (full lifecycle)", () => {
    // Step 1: write old-format file (flat stats keyed by fused route key)
    writeOldPerf(tmpDir, {
      "execute-task": {
        "kimi-coding/kimi-k2.6": {
          successes: 8,
          failures: 2,
          timeouts: 0,
          totalTokens: 40000,
          totalCost: 2.0,
          lastUsed: "2026-04-01T00:00:00Z",
        },
        "openrouter/moonshotai/kimi-k2.6": {
          successes: 3,
          failures: 1,
          timeouts: 0,
          totalTokens: 15000,
          totalCost: 0.75,
          lastUsed: "2026-04-02T00:00:00Z",
        },
      },
    });
    // Step 2: boot learner (triggers migration)
    const learner = new ModelLearner(tmpDir);
    // Step 3: verify backup exists
    expect(existsSync(backupFile(tmpDir))).toBe(true);
    // Step 4: verify new file structure
    const data = readPerf(tmpDir);
    const kimiEntry = data["execute-task"]["kimi-k2.6"];
    expect(kimiEntry).toBeDefined();
    expect(kimiEntry.aggregate.successes).toBe(11); // 8 + 3
    expect(kimiEntry.aggregate.failures).toBe(3); // 2 + 1
    expect(kimiEntry.by_route["kimi-coding/kimi-k2.6"].successes).toBe(8);
    expect(kimiEntry.by_route["openrouter/moonshotai/kimi-k2.6"].successes).toBe(3);
    // Step 5: verify aggregate invariant — every counter in the aggregate must
    // equal the sum of that counter across by_route, not just `successes`.
    for (const counter of ["successes", "failures", "timeouts"] as const) {
      const routeSum = Object.values(kimiEntry.by_route).reduce(
        (sum: number, r: any) => sum + (r[counter] ?? 0),
        0,
      );
      expect(kimiEntry.aggregate[counter], `aggregate.${counter}`).toBe(routeSum);
    }
    // Step 6: verify in-memory reads via getRankedModels
    const ranked = learner.getRankedModels("execute-task");
    const kimiRanked = ranked.find((r) => r.modelId === "kimi-k2.6");
    expect(kimiRanked).toBeDefined();
    expect(kimiRanked!.attempts).toBe(14); // 11 successes + 3 failures
  });

  test("per-route health can be queried independently of aggregate", () => {
    const tracker = new ModelPerformanceTracker(tmpDir);
    // Route 1: healthy (9 successes, 1 failure)
    for (let i = 0; i < 9; i++) {
      tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", {
        success: true,
        timeout: false,
        tokensUsed: 1000,
        costUsd: 0.05,
      });
    }
    tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", {
      success: false,
      timeout: false,
      tokensUsed: 1000,
      costUsd: 0.05,
    });
    // Route 2: failing (3 failures, 1 success)
    for (let i = 0; i < 3; i++) {
      tracker.recordOutcome("execute-task", "openrouter/moonshotai/kimi-k2.6", {
        success: false,
        timeout: false,
        tokensUsed: 500,
        costUsd: 0.025,
      });
    }
    tracker.recordOutcome("execute-task", "openrouter/moonshotai/kimi-k2.6", {
      success: true,
      timeout: false,
      tokensUsed: 500,
      costUsd: 0.025,
    });
    // Aggregate: 10 successes, 4 failures = 71% success rate
    const agg = tracker.getStats("execute-task", "kimi-k2.6");
    expect(agg).not.toBeNull();
    expect(agg!.successes).toBe(10);
    expect(agg!.failures).toBe(4);
    // Per-route: kimi-coding is healthy, openrouter is failing
    const route1 = tracker.getRouteStats("execute-task", "kimi-coding/kimi-k2.6");
    expect(route1).not.toBeNull();
    expect(route1!.successes).toBe(9);
    expect(route1!.failures).toBe(1);
    const route2 = tracker.getRouteStats("execute-task", "openrouter/moonshotai/kimi-k2.6");
    expect(route2).not.toBeNull();
    expect(route2!.successes).toBe(1);
    expect(route2!.failures).toBe(3);
  });
});
});

View file

@ -320,7 +320,7 @@ describe("ModelLearner (integration)", () => {
expect(abCandidates.incumbent).toBe("incumbent");
});
test("persists data to filesystem", () => {
test("persists data to filesystem in canonical schema", () => {
learner.recordOutcome("execute-task", "gpt-4o", {
success: true,
timeout: false,
@ -332,8 +332,12 @@ describe("ModelLearner (integration)", () => {
const content = readFileSync(perfFile, "utf-8");
const data = JSON.parse(content);
expect(data["execute-task"]["gpt-4o"]).toBeDefined();
expect(data["execute-task"]["gpt-4o"].successes).toBe(1);
// Without a registry resolver, bare model IDs go to _unmapped.by_route.
// The canonical schema places unmappable routes in _unmapped.
const unmapped = data["execute-task"]?.["_unmapped"];
expect(unmapped).toBeDefined();
expect(unmapped?.by_route?.["gpt-4o"]).toBeDefined();
expect(unmapped.by_route["gpt-4o"].successes).toBe(1);
});
test("gracefully handles missing storage directory", () => {

View file

@ -0,0 +1,352 @@
/**
* Tests for model-registry.ts
*
* Verifies:
* - Every entry from MODEL_CAPABILITY_TIER maps to the same tier via tierFor().
* - The K2.5 → K2.6 alias bug is gone: tierFor("kimi-k2.5") === "standard" independently.
* - BENCHMARK_KEY_ALIASES entries resolve via canonicalIdFor().
* - routesFor("kimi-k2.5") covers multiple aggregator providers.
* - sameGeneration() discriminates between K2.5 and K2.6 (different generations).
* - lookup("kimi-coding", "kimi-k2.6") returns api === "anthropic-messages".
*/
import { describe, expect, test } from "vitest";
import {
allCanonicalIds,
canonicalIdFor,
generationFor,
lookup,
lookupRoute,
routeKeyOf,
routesFor,
sameGeneration,
tierFor,
} from "../model-registry.js";
// ─── Tier parity against old MODEL_CAPABILITY_TIER table ─────────────────────
// Snapshot of the tier table, lifted directly from model-router.js
// MODEL_CAPABILITY_TIER. The parity suite iterates these entries and compares
// each expected tier with tierFor(), skipping ids listed in EXPECTED_GAPS.
// This table intentionally EXCLUDES the buggy "kimi-k2.5": "kimi-k2.6" alias.
// The Light / Standard / Heavy comments are section headings only; the value
// on each line is what the test asserts.
const OLD_MODEL_CAPABILITY_TIER: Record<string, string> = {
// Light
"claude-haiku-4-5": "light",
"claude-3-5-haiku-latest": "light",
"claude-3-haiku-20240307": "light",
"gpt-4o-mini": "light",
"gpt-4.1-mini": "light",
"gpt-4.1-nano": "light",
"gpt-5-mini": "light",
"gpt-5-nano": "light",
"gpt-5.1-codex-mini": "light",
"gpt-5.3-codex-spark": "light",
"gemini-2.0-flash": "light",
"gemini-flash-2.0": "light",
"gemini-3.1-flash-lite-preview": "light",
"gemini-2.5-flash-lite": "light",
"glm-4.7-flash": "light",
"glm-4.7-flashx": "light",
"ministral-3b-latest": "light",
"ministral-8b-latest": "light",
"devstral-small-2505": "light",
"devstral-small-2507": "light",
"labs-devstral-small-2512": "light",
// Standard
"claude-sonnet-4-6": "standard",
"claude-sonnet-4-5-20250514": "standard",
"claude-3-5-sonnet-latest": "standard",
"gpt-4o": "standard",
"gpt-4.1": "standard",
"gpt-5.1-codex-max": "standard",
"gemini-2.5-pro": "standard",
"gemini-3-flash-preview": "standard",
"gemini-2.5-flash": "standard",
"deepseek-chat": "standard",
"glm-4.7": "standard",
"qwen3-coder:480b": "standard",
"qwen3-coder-next": "standard",
"kimi-k2.6": "standard",
"kimi-for-coding": "standard",
"MiniMax-M2.7": "standard",
"MiniMax-M2.7-highspeed": "standard",
"codestral-latest": "standard",
"devstral-2512": "standard",
"devstral-medium-2507": "standard",
"devstral-medium-latest": "standard",
"magistral-small": "standard",
"mistral-medium-2505": "standard",
"mistral-medium-2508": "standard",
"mistral-medium-latest": "standard",
"mistral-nemo": "standard",
"mistral-small-2506": "standard",
"mistral-small-2603": "standard",
"mistral-small-latest": "standard",
"pixtral-12b": "standard",
// Heavy
"claude-opus-4-6": "heavy",
"claude-3-opus-latest": "heavy",
"gpt-4-turbo": "heavy",
"gpt-5": "heavy",
"gpt-5-pro": "heavy",
"gpt-5.1": "heavy",
"gpt-5.2": "heavy",
"gpt-5.2-codex": "heavy",
"gpt-5.3-codex": "heavy",
"gpt-5.4": "heavy",
// Sits under the Heavy heading but its recorded tier is "standard"; the id is
// also in EXPECTED_GAPS, so the parity loop skips it.
"gpt-5.4-mini": "standard", // note: was listed as standard in model-router
"gpt-5.5": "heavy",
o1: "heavy",
o3: "heavy",
"o4-mini": "heavy",
"o4-mini-deep-research": "heavy",
"gemini-3.1-pro-preview": "heavy",
"gemini-3-pro-preview": "heavy",
"kimi-k2-thinking": "heavy",
"qwen3-next:80b": "heavy",
"glm-5": "heavy",
"glm-5-turbo": "heavy",
"glm-5.1": "heavy",
"glm-5v-turbo": "heavy",
"magistral-medium-latest": "heavy",
"mistral-large-2411": "heavy",
"mistral-large-2512": "heavy",
"mistral-large-latest": "heavy",
"open-mixtral-8x22b": "heavy",
"pixtral-large-latest": "heavy",
};
// Model ids from OLD_MODEL_CAPABILITY_TIER that the parity suite deliberately
// skips. These are ids that no longer exist or aliases that were intentionally
// collapsed — acceptable gaps, because the registry removes some old aliases
// by design (e.g. gemini-flash-2.0 was an alias for gemini-2.0-flash).
// Each inline comment records why that id is exempt.
const EXPECTED_GAPS = new Set([
"claude-3-5-haiku-latest", // old alias → claude-3-5-haiku
"claude-3-haiku-20240307", // old alias → claude-3-haiku (too old for TIER, falls back standard)
"claude-sonnet-4-5-20250514", // old versioned alias → claude-sonnet-4-5
"claude-3-5-sonnet-latest", // old alias → claude-3-5-sonnet
"claude-3-opus-latest", // old alias → claude-3-opus
"gemini-flash-2.0", // was an alias for gemini-2.0-flash
"gemini-2.5-flash-lite", // variant name
"gpt-5.4-mini", // was "standard" in the old table — NOTE(review): original note was cut off ("is handled ..."); confirm why this id is exempt
"gpt-5.5", // future model not in upstream MODELS yet
"magistral-medium-latest", // not in TIER table as canonical yet
]);
// Generates one test per entry of OLD_MODEL_CAPABILITY_TIER (minus the ids in
// EXPECTED_GAPS) asserting that tierFor() still returns the recorded tier.
describe("MODEL_CAPABILITY_TIER parity", () => {
  for (const [modelId, expectedTier] of Object.entries(
    OLD_MODEL_CAPABILITY_TIER,
  )) {
    if (EXPECTED_GAPS.has(modelId)) continue;
    test(`tierFor("${modelId}") === "${expectedTier}"`, () => {
      const tier = tierFor(modelId);
      // Report the value actually returned: a mismatch may be a different
      // tier, not only a null/missing one.
      expect(
        tier,
        `tierFor("${modelId}") should be "${expectedTier}" (got ${JSON.stringify(tier)})`,
      ).toBe(expectedTier);
    });
  }
});
// ─── Critical: K2.5 is NOT aliased to K2.6 ───────────────────────────────────
describe("kimi-k2.5 is its own canonical tier entry (not aliased to kimi-k2.6)", () => {
  // Both ids must report the "standard" tier.
  for (const id of ["kimi-k2.5", "kimi-k2.6"]) {
    test(`tierFor("${id}") returns "standard"`, () => {
      expect(tierFor(id)).toBe("standard");
    });
  }

  test("kimi-k2.5 and kimi-k2.6 are independent entries (different generations)", () => {
    const shareGeneration = sameGeneration("kimi-k2.5", "kimi-k2.6");
    expect(shareGeneration).toBe(false);
  });

  // Each id maps to its own generation label.
  const expectedGenerations: Array<[string, string]> = [
    ["kimi-k2.5", "k2.5"],
    ["kimi-k2.6", "k2.6"],
  ];
  for (const [id, generation] of expectedGenerations) {
    test(`generationFor("${id}") is "${generation}"`, () => {
      expect(generationFor(id)).toBe(generation);
    });
  }
});
// ─── BENCHMARK_KEY_ALIASES parity ────────────────────────────────────────────
// Old BENCHMARK_KEY_ALIASES from benchmark-selector.js.
// These were keyed by WIRE IDs and mapped to canonical benchmark keys.
// After migration, canonicalIdFor(routeKey) should give the same result.
// NOTE(review): nothing in this file reads this constant — the parity tests
// below hard-code provider-qualified route keys instead. It currently serves
// as reference documentation only; confirm whether it should drive the tests
// or be removed.
const OLD_BENCHMARK_KEY_ALIASES: Record<string, string> = {
"kimi-for-coding": "kimi-k2.6",
"moonshotai/kimi-k2.6": "kimi-k2.6",
"kimi-k2.6:cloud": "kimi-k2.6",
"kimi-k2.6-cloud": "kimi-k2.6",
"kimi-k2.5": "kimi-k2.5",
"moonshotai/kimi-k2.5": "kimi-k2.5",
"moonshotai.kimi-k2.5": "kimi-k2.5",
"kimi-k2.5:cloud": "kimi-k2.5",
"kimi-k2.5-cloud": "kimi-k2.5",
};
describe("BENCHMARK_KEY_ALIASES parity via canonicalIdFor", () => {
  // kimi-coding/kimi-for-coding doesn't exist in upstream MODELS — the actual wire_id is "kimi-for-coding"
  // which isn't an upstream key. So we test the ones that have real route keys.
  // Each row is [provider-qualified route key, expected canonical id].
  const routeKeyExpectations: Array<[string, string]> = [
    ["kimi-coding/kimi-k2.6", "kimi-k2.6"],
    ["amazon-bedrock/moonshotai.kimi-k2.5", "kimi-k2.5"],
    ["openrouter/moonshotai/kimi-k2.5", "kimi-k2.5"],
    ["vercel-ai-gateway/moonshotai/kimi-k2.5", "kimi-k2.5"],
    ["huggingface/moonshotai/Kimi-K2.5", "kimi-k2.5"],
  ];

  for (const [routeKey, canonicalId] of routeKeyExpectations) {
    test(`canonicalIdFor("${routeKey}") returns "${canonicalId}"`, () => {
      expect(canonicalIdFor(routeKey)).toBe(canonicalId);
    });
  }
});
// ─── routesFor("kimi-k2.5") spans multiple providers ─────────────────────────
describe("routesFor(kimi-k2.5) coverage", () => {
  test("returns routes spanning at least huggingface, openrouter, opencode, opencode-go, vercel-ai-gateway", () => {
    const seenProviders = new Set(
      routesFor("kimi-k2.5").map((route) => route.provider),
    );
    const requiredProviders = [
      "huggingface",
      "openrouter",
      "opencode",
      "opencode-go",
      "vercel-ai-gateway",
    ];
    // The provider name doubles as the assertion message so a failure names
    // the missing provider.
    for (const provider of requiredProviders) {
      expect(seenProviders.has(provider), provider).toBe(true);
    }
  });

  test("all routes resolve to canonical_id kimi-k2.5", () => {
    const kimiRoutes = routesFor("kimi-k2.5");
    // An empty route list would make the per-route check vacuous.
    expect(kimiRoutes.length).toBeGreaterThan(0);
    kimiRoutes.forEach((route) => {
      expect(route.canonical_id).toBe("kimi-k2.5");
    });
  });
});
// ─── sameGeneration ───────────────────────────────────────────────────────────
describe("sameGeneration", () => {
  // Each row is [test title, first canonical id, second canonical id, expected result].
  const generationCases: Array<[string, string, string, boolean]> = [
    [
      "kimi-k2 and kimi-k2-0905 are same generation (k2 patch)",
      "kimi-k2",
      "kimi-k2-0905",
      true,
    ],
    [
      "kimi-k2.5 and kimi-k2.6 are NOT same generation",
      "kimi-k2.5",
      "kimi-k2.6",
      false,
    ],
    [
      "claude-sonnet-4 and claude-sonnet-4-6 are same generation (sonnet-4)",
      "claude-sonnet-4",
      "claude-sonnet-4-6",
      true,
    ],
    [
      "claude-sonnet-4-6 and claude-opus-4-7 are NOT same generation",
      "claude-sonnet-4-6",
      "claude-opus-4-7",
      false,
    ],
    [
      "kimi-k2-thinking and kimi-k2-thinking-turbo are same generation",
      "kimi-k2-thinking",
      "kimi-k2-thinking-turbo",
      true,
    ],
    [
      "returns false when one canonical_id has no generation mapping",
      "kimi-k2.5",
      "some-unknown-model",
      false,
    ],
  ];

  for (const [title, first, second, expected] of generationCases) {
    test(title, () => {
      expect(sameGeneration(first, second)).toBe(expected);
    });
  }
});
// ─── lookup / lookupRoute ─────────────────────────────────────────────────────
// Exercises lookup(provider, wireId) and its route-key counterpart lookupRoute.
describe("lookup", () => {
  test('lookup("kimi-coding", "kimi-k2.6") returns api === "anthropic-messages"', () => {
    const m = lookup("kimi-coding", "kimi-k2.6");
    expect(m).not.toBeNull();
    expect(m?.api).toBe("anthropic-messages");
    expect(m?.canonical_id).toBe("kimi-k2.6");
    expect(m?.provider).toBe("kimi-coding");
  });

  test("lookup returns null for unknown provider", () => {
    expect(lookup("nonexistent-provider", "some-model")).toBeNull();
  });

  test("lookup returns null for unknown wire_id in known provider", () => {
    expect(lookup("anthropic", "not-a-real-model")).toBeNull();
  });

  test('lookup("anthropic", "claude-sonnet-4-6") resolves correctly', () => {
    const m = lookup("anthropic", "claude-sonnet-4-6");
    expect(m).not.toBeNull();
    expect(m?.canonical_id).toBe("claude-sonnet-4-6");
    expect(m?.tier).toBe("standard");
  });

  test("lookupRoute delegates to lookup", () => {
    const a = lookup("kimi-coding", "kimi-k2-thinking");
    const b = lookupRoute("kimi-coding/kimi-k2-thinking");
    // Guard against a vacuous pass: lookup returns null for unknown entries,
    // so if this route were unregistered the equality check below would
    // compare null with null and succeed without testing anything.
    expect(a).not.toBeNull();
    expect(a).toEqual(b);
  });
});
// ─── Bedrock namespaced models ────────────────────────────────────────────────
describe("amazon-bedrock namespaced wire_ids", () => {
  // Shorthand: every case looks a wire id up under the amazon-bedrock provider.
  const bedrock = (wireId: string) => lookup("amazon-bedrock", wireId);

  test('lookup("amazon-bedrock", "moonshotai.kimi-k2.5") returns canonical kimi-k2.5', () => {
    const entry = bedrock("moonshotai.kimi-k2.5");
    expect(entry).not.toBeNull();
    expect(entry?.canonical_id).toBe("kimi-k2.5");
  });

  test('lookup("amazon-bedrock", "moonshot.kimi-k2-thinking") returns canonical kimi-k2-thinking', () => {
    const entry = bedrock("moonshot.kimi-k2-thinking");
    expect(entry).not.toBeNull();
    expect(entry?.canonical_id).toBe("kimi-k2-thinking");
    expect(entry?.tier).toBe("heavy");
  });

  test('lookup("amazon-bedrock", "anthropic.claude-sonnet-4-6") returns canonical claude-sonnet-4-6', () => {
    const entry = bedrock("anthropic.claude-sonnet-4-6");
    expect(entry).not.toBeNull();
    expect(entry?.canonical_id).toBe("claude-sonnet-4-6");
  });
});
// ─── allCanonicalIds ──────────────────────────────────────────────────────────
describe("allCanonicalIds", () => {
  test("returns a non-empty array", () => {
    // Despite the title, the threshold is "more than 10 entries".
    const { length } = allCanonicalIds();
    expect(length).toBeGreaterThan(10);
  });

  // Both Kimi ids must be present as separate canonical entries.
  for (const id of ["kimi-k2.5", "kimi-k2.6"]) {
    test(`${id} is in the list`, () => {
      const ids = allCanonicalIds();
      expect(ids).toContain(id);
    });
  }
});
// ─── routeKeyOf ──────────────────────────────────────────────────────────────
describe("routeKeyOf", () => {
  test("builds correct fused key", () => {
    // The key is expected to be "<provider>/<wire id>" for the looked-up entry.
    const fusedKey = routeKeyOf(lookup("kimi-coding", "kimi-k2.6")!);
    expect(fusedKey).toBe("kimi-coding/kimi-k2.6");
  });
});

View file

@ -0,0 +1,263 @@
/**
* Tests for model-route-failure.js generation guard and solver pinning (ADR-0079).
*
* Swarm B spec:
* 1. Solver-pinned unit ("autonomous-solver") cannot fail over across
* canonical_id boundaries. The resolver must return undefined when all
* remaining routes belong to a different canonical model.
* 2. Same-canonical multi-route failover works: two routes for the same
* canonical id (kimi-k2.6 and kimi-for-coding both map to kimi-k2.6).
* 3. Cross-generation failover for non-solver units succeeds AND emits
* logGenerationDowngrade via logWarning.
*/
import { beforeEach, describe, expect, it, vi } from "vitest";
// ── Mock workflow-logger so we can assert on logWarning calls ──────────────
// Replaces the module's logWarning and logError exports with vi.fn() stubs.
// The `logWarning` imported below is therefore the stub, which is what the
// toHaveBeenCalledWith / not.toHaveBeenCalled assertions inspect.
// NOTE(review): relies on vitest applying vi.mock before the imports in this
// file are evaluated — keep this call at module top level.
vi.mock("../workflow-logger.js", () => ({
logWarning: vi.fn(),
logError: vi.fn(),
}));
import { logWarning } from "../workflow-logger.js";
import {
logGenerationDowngrade,
resolveNextAvailableModelRoute,
} from "../model-route-failure.js";
// ── Helpers ───────────────────────────────────────────────────────────────────
/**
 * Builds the smallest model object these tests pass as `current` and as
 * entries of `availableModels`.
 *
 * @param provider Provider half of the route; use a real registry provider
 *   when the test needs canonicalIdFor() to resolve the route.
 * @param id Wire model id under that provider.
 * @returns `{ provider, id, api }` with `api` fixed to "openai-completions".
 */
function makeModel(provider: string, id: string) {
  const api = "openai-completions" as const;
  return { provider, id, api };
}
// Clear recorded mock calls before every test so the logWarning assertions
// only see calls made by the test that is currently running.
beforeEach(function resetMockState() {
  vi.clearAllMocks();
});
// ─────────────────────────────────────────────────────────────────────────────
// 1. Solver-pinning guard
// ─────────────────────────────────────────────────────────────────────────────
describe("solver-pinned failover (ADR-0079)", () => {
  // Fresh fixtures per call so no object is shared between tests.
  const solverRoute = () => makeModel("kimi-coding", "kimi-k2.6");
  const failedSolverRoute = (reason: string) => ({
    provider: "kimi-coding",
    modelId: "kimi-k2.6",
    reason,
  });

  it("returns undefined when the only available route has a different canonical_id than the failed solver route", () => {
    // The solver runs on kimi-coding/kimi-k2.6 (canonical: kimi-k2.6) and the
    // only route that has not failed is openrouter/moonshotai/kimi-k2.5
    // (canonical: kimi-k2.5). The canonical ids differ, so the pinned guard
    // has to reject the candidate and yield undefined.
    const candidates = [
      solverRoute(), // same as current — filtered
      makeModel("openrouter", "moonshotai/kimi-k2.5"), // canonical: kimi-k2.5 ≠ kimi-k2.6
    ];
    const next = resolveNextAvailableModelRoute({
      current: solverRoute(),
      availableModels: candidates,
      failedRoutes: [failedSolverRoute("rate-limit")],
      unitType: "autonomous-solver",
    });
    expect(next).toBeUndefined();
  });

  it("returns undefined when candidate is an unregistered route (null canonical) for solver-pinned unit", () => {
    // ollama-cloud/kimi-k2.5:cloud is absent from the registry, so
    // canonicalIdFor gives null and the guard sees null !== "kimi-k2.6".
    const candidates = [
      makeModel("ollama-cloud", "kimi-k2.5:cloud"), // not in registry
    ];
    const next = resolveNextAvailableModelRoute({
      current: solverRoute(),
      availableModels: candidates,
      failedRoutes: [failedSolverRoute("server")],
      unitType: "autonomous-solver",
    });
    expect(next).toBeUndefined();
  });

  it("does NOT emit logGenerationDowngrade for solver-pinned failover (guard rejects before logging)", () => {
    resolveNextAvailableModelRoute({
      current: solverRoute(),
      availableModels: [makeModel("openrouter", "moonshotai/kimi-k2.5")],
      failedRoutes: [failedSolverRoute("rate-limit")],
      unitType: "autonomous-solver",
    });
    // Rejection happens ahead of any logging, so no downgrade event is expected.
    expect(logWarning).not.toHaveBeenCalled();
  });
});
// ─────────────────────────────────────────────────────────────────────────────
// 2. Same-canonical multi-route failover
// ─────────────────────────────────────────────────────────────────────────────
// Failover between two routes that resolve to the same canonical model
// (kimi-coding/kimi-k2.6 → kimi-coding/kimi-for-coding).
describe("same-canonical multi-route failover", () => {
  it("succeeds when a second route shares the same canonical_id as the failed route", () => {
    // kimi-coding/kimi-for-coding maps to canonical kimi-k2.6 (same as
    // kimi-coding/kimi-k2.6). This is the standard same-canonical path.
    const result = resolveNextAvailableModelRoute({
      current: makeModel("kimi-coding", "kimi-k2.6"),
      availableModels: [
        makeModel("kimi-coding", "kimi-k2.6"), // same as current — filtered
        makeModel("kimi-coding", "kimi-for-coding"), // canonical: kimi-k2.6 ✓
      ],
      failedRoutes: [
        { provider: "kimi-coding", modelId: "kimi-k2.6", reason: "rate-limit" },
      ],
      unitType: "autonomous-solver",
    });
    expect(result).toBeDefined();
    expect(result?.model.provider).toBe("kimi-coding");
    expect(result?.model.id).toBe("kimi-for-coding");
    expect(result?.source).toBe("available");
  });

  it("does not emit logGenerationDowngrade for same-canonical failover", () => {
    const result = resolveNextAvailableModelRoute({
      current: makeModel("kimi-coding", "kimi-k2.6"),
      availableModels: [
        makeModel("kimi-coding", "kimi-k2.6"),
        makeModel("kimi-coding", "kimi-for-coding"),
      ],
      failedRoutes: [
        { provider: "kimi-coding", modelId: "kimi-k2.6", reason: "server" },
      ],
      unitType: "execute-task",
    });
    // Confirm a failover actually happened; otherwise "no warning" would also
    // hold when the resolver simply returned nothing.
    expect(result).toBeDefined();
    expect(result?.model.id).toBe("kimi-for-coding");
    expect(logWarning).not.toHaveBeenCalled();
  });

  // NOTE(review): this case passes the same arguments as the first test in
  // this describe (both use unitType "autonomous-solver"); it is kept as an
  // explicitly named regression check for the solver-pinned path.
  it("same-canonical failover works even for solver-pinned unit type", () => {
    // Within the same canonical_id, solver pin does not block failover.
    const result = resolveNextAvailableModelRoute({
      current: makeModel("kimi-coding", "kimi-k2.6"),
      availableModels: [
        makeModel("kimi-coding", "kimi-k2.6"),
        makeModel("kimi-coding", "kimi-for-coding"),
      ],
      failedRoutes: [
        { provider: "kimi-coding", modelId: "kimi-k2.6", reason: "rate-limit" },
      ],
      unitType: "autonomous-solver",
    });
    expect(result).toBeDefined();
    expect(result?.model.id).toBe("kimi-for-coding");
  });
});
// ─────────────────────────────────────────────────────────────────────────────
// 3. Cross-generation failover for non-solver units
// ─────────────────────────────────────────────────────────────────────────────
// Non-solver unit types may fail over to a different generation; these tests
// pin when the generation-downgrade warning is (and is not) emitted.
describe("cross-generation failover (executor layer)", () => {
  it("succeeds and emits logGenerationDowngrade when crossing generation boundaries", () => {
    // kimi-k2.6 (gen: k2.6) → kimi-k2.5 via openrouter (gen: k2.5).
    // These are different generations, so the downgrade event must fire.
    const result = resolveNextAvailableModelRoute({
      current: makeModel("kimi-coding", "kimi-k2.6"),
      availableModels: [
        makeModel("openrouter", "moonshotai/kimi-k2.5"),
      ],
      failedRoutes: [
        { provider: "kimi-coding", modelId: "kimi-k2.6", reason: "server" },
      ],
      unitType: "execute-task",
    });
    expect(result).toBeDefined();
    expect(result?.model.provider).toBe("openrouter");
    // logGenerationDowngrade should have been called
    expect(logWarning).toHaveBeenCalledWith(
      "model-route-failure",
      "generation-downgrade",
      expect.objectContaining({
        from: "kimi-k2.6",
        to: "kimi-k2.5",
        unitType: "execute-task",
        sameGeneration: false,
      }),
    );
  });

  it("emits logGenerationDowngrade with the correct unitType from args", () => {
    resolveNextAvailableModelRoute({
      current: makeModel("kimi-coding", "kimi-k2.6"),
      availableModels: [makeModel("openrouter", "moonshotai/kimi-k2.5")],
      failedRoutes: [
        { provider: "kimi-coding", modelId: "kimi-k2.6", reason: "rate-limit" },
      ],
      unitType: "plan-slice",
    });
    expect(logWarning).toHaveBeenCalledWith(
      "model-route-failure",
      "generation-downgrade",
      expect.objectContaining({ unitType: "plan-slice" }),
    );
  });

  it("does not emit logGenerationDowngrade when no current route is set", () => {
    // When current is undefined, canonicalIdFor returns null and no generation
    // check can be performed — no downgrade event should fire.
    const result = resolveNextAvailableModelRoute({
      current: undefined,
      availableModels: [makeModel("openrouter", "moonshotai/kimi-k2.5")],
      failedRoutes: [],
      unitType: "execute-task",
    });
    expect(result).toBeDefined();
    expect(logWarning).not.toHaveBeenCalled();
  });

  it("does not emit logGenerationDowngrade when both routes share the same generation", () => {
    // claude-sonnet-4 and claude-sonnet-4-5 both have generation "sonnet-4".
    const result = resolveNextAvailableModelRoute({
      current: makeModel("anthropic", "claude-sonnet-4-20250514"),
      availableModels: [makeModel("anthropic", "claude-sonnet-4-5-20250929")],
      failedRoutes: [
        {
          provider: "anthropic",
          modelId: "claude-sonnet-4-20250514",
          reason: "rate-limit",
        },
      ],
      unitType: "execute-task",
    });
    // Confirm a route was selected; without this the "no warning" assertion
    // would also pass if the resolver returned nothing at all.
    expect(result).toBeDefined();
    expect(result?.model.id).toBe("claude-sonnet-4-5-20250929");
    expect(logWarning).not.toHaveBeenCalled();
  });
});
// ─────────────────────────────────────────────────────────────────────────────
// 4. logGenerationDowngrade helper
// ─────────────────────────────────────────────────────────────────────────────
describe("logGenerationDowngrade helper", () => {
  it("calls logWarning with structured generation-downgrade payload", () => {
    // Exact payload expected when an explicit reason is supplied.
    const expectedPayload = {
      from: "kimi-k2.6",
      to: "kimi-k2.5",
      unitType: "execute-task",
      reason: "test reason",
      sameGeneration: false,
    };

    logGenerationDowngrade("kimi-k2.6", "kimi-k2.5", "execute-task", "test reason");

    expect(logWarning).toHaveBeenCalledWith(
      "model-route-failure",
      "generation-downgrade",
      expectedPayload,
    );
  });

  it("uses default reason when none is supplied", () => {
    // Passing undefined as the reason should produce the fallback text.
    const payloadWithDefaultReason = expect.objectContaining({
      reason: "cross-generation failover",
    });

    logGenerationDowngrade("kimi-k2.6", "kimi-k2.5", "plan-slice", undefined);

    expect(logWarning).toHaveBeenCalledWith(
      "model-route-failure",
      "generation-downgrade",
      payloadWithDefaultReason,
    );
  });
});