diff --git a/src/resources/extensions/sf/bootstrap/agent-end-recovery.js b/src/resources/extensions/sf/bootstrap/agent-end-recovery.js index 27f52f496..cd8aa5177 100644 --- a/src/resources/extensions/sf/bootstrap/agent-end-recovery.js +++ b/src/resources/extensions/sf/bootstrap/agent-end-recovery.js @@ -84,6 +84,7 @@ async function trySwitchToFallbackModel(args) { availableModels, failedRoutes: getCurrentUnitModelFailures(), isBlocked, + unitType: args.unitType, }); if (!nextRoute) return false; const ok = await args.pi.setModel(nextRoute.model, { diff --git a/src/resources/extensions/sf/model-learner.js b/src/resources/extensions/sf/model-learner.js index d493d2920..09c30b8e6 100644 --- a/src/resources/extensions/sf/model-learner.js +++ b/src/resources/extensions/sf/model-learner.js @@ -21,27 +21,180 @@ import { dirname, join } from "node:path"; const MODEL_FAILURE_LOG_SCHEMA_VERSION = 1; +/** + * Reference to canonicalIdFor from model-registry. + * + * Default: null (all routes go to _unmapped). + * Override in tests via setRegistryResolver() to inject a stub. + * In production, this module lazily imports model-registry.js (the dynamic + * import() further down) and adopts its canonicalIdFor unless one is already set. + */ +let _canonicalIdForFn = null; + +/** + * Resolve a route key (provider/wire-id) to a canonical id using the model + * registry. Falls back gracefully when the registry is unavailable (e.g. in + * tests that don't load the full @singularity-forge/ai package). + * + * Returns null (never undefined) when the route is not mappable (routes to _unmapped). + */ +function tryCanonicalIdFor(routeKey) { + if (_canonicalIdForFn === null) return null; + try { + return _canonicalIdForFn(routeKey) ?? null; + } catch { + return null; + } +} + +/** + * Allow tests and the module itself to inject a canonicalIdFor implementation. + * In production, model-registry.js is imported lazily and wires itself here. 
+ * In tests, call this before constructing ModelPerformanceTracker: + * setRegistryResolver((rk) => rk === "kimi-coding/kimi-k2.6" ? "kimi-k2.6" : null) + */ +export function setRegistryResolver(fn) { + _canonicalIdForFn = fn; +} + +// Wire the registry lazily so model-learner.js can be imported independently +// of @singularity-forge/ai (e.g. in tests that don't load the full AI package). +// The fire-and-forget import populates _canonicalIdForFn when the registry +// resolves. Outcomes recorded before the registry loads go to _unmapped and +// are preserved there for re-resolution on next format migration. +import("./model-registry.js") + .then((mod) => { + if (_canonicalIdForFn === null && typeof mod?.canonicalIdFor === "function") { + _canonicalIdForFn = mod.canonicalIdFor; + } + }) + .catch(() => { + // Registry unavailable (tests, stripped builds, etc.) — routes go to _unmapped. + }); + +/** + * Detect whether a unit-type blob in the performance file uses the OLD + * flat format ({ "provider/wire-id": { successes, failures, ... } }) + * vs the NEW canonical format ({ "canonical-id": { aggregate, by_route } }). + * + * Detection rule: if ANY value in the object is a new-format entry (it has a + * nested `aggregate` or `by_route` key — `_unmapped` carries only `by_route`), + * the blob is already new-format. Otherwise it's old-format. + */ +function isOldFormat(unitTypeBlob) { + if (!unitTypeBlob || typeof unitTypeBlob !== "object") return false; + for (const val of Object.values(unitTypeBlob)) { + if (val && typeof val === "object" && ("aggregate" in val || "by_route" in val)) { + return false; // new-format entry found + } + } + return true; +} + +/** + * Migrate a single unit-type blob from old flat format to new canonical format. + * Returns the migrated blob. + */ +function migrateUnitTypeBlob(oldBlob) { + const newBlob = {}; + for (const [routeKey, stats] of Object.entries(oldBlob)) { + if (!stats || typeof stats !== "object") continue; + const canonicalId = tryCanonicalIdFor(routeKey); + const bucket = canonicalId ?? 
"_unmapped"; + if (!newBlob[bucket]) { + if (bucket === "_unmapped") { + newBlob["_unmapped"] = { by_route: {} }; + } else { + newBlob[bucket] = { + aggregate: { + successes: 0, + failures: 0, + timeouts: 0, + totalTokens: 0, + totalCost: 0, + lastUsed: stats.lastUsed ?? new Date().toISOString(), + }, + by_route: {}, + }; + } + } + const routeEntry = { + successes: stats.successes ?? 0, + failures: stats.failures ?? 0, + timeouts: stats.timeouts ?? 0, + totalTokens: stats.totalTokens ?? 0, + totalCost: stats.totalCost ?? 0, + lastUsed: stats.lastUsed ?? new Date().toISOString(), + }; + if (bucket === "_unmapped") { + newBlob["_unmapped"].by_route[routeKey] = routeEntry; + } else { + newBlob[bucket].by_route[routeKey] = routeEntry; + // Recompute aggregate as sum of by_route + recomputeAggregate(newBlob[bucket]); + } + } + return newBlob; +} + +/** + * Recompute the `aggregate` object for a canonical entry as the sum of all + * by_route entries. This maintains the invariant: + * aggregate.successes === sum(by_route[*].successes) + */ +function recomputeAggregate(canonicalEntry) { + const agg = { + successes: 0, + failures: 0, + timeouts: 0, + totalTokens: 0, + totalCost: 0, + lastUsed: "", + }; + for (const r of Object.values(canonicalEntry.by_route)) { + agg.successes += r.successes ?? 0; + agg.failures += r.failures ?? 0; + agg.timeouts += r.timeouts ?? 0; + agg.totalTokens += r.totalTokens ?? 0; + agg.totalCost += r.totalCost ?? 0; + if (!agg.lastUsed || (r.lastUsed && r.lastUsed > agg.lastUsed)) { + agg.lastUsed = r.lastUsed; + } + } + canonicalEntry.aggregate = agg; +} + +/** + * Return a zero-valued route stats entry. + */ +function emptyRouteStats(timestamp) { + return { + successes: 0, + failures: 0, + timeouts: 0, + totalTokens: 0, + totalCost: 0, + lastUsed: timestamp, + }; +} + /** * Per-task-type model performance tracker. 
* - * Schema: + * New schema (v2 — canonical-keyed): * { - * "execute-task": { - * "gpt-4o": { - * "successes": 42, - * "failures": 3, - * "timeouts": 1, - * "totalTokens": 1500000, - * "totalCost": 45.50, - * "lastUsed": "2026-05-06T16:30:00Z", - * "successRate": 0.93 + * "<unit-type>": { + * "<canonical-id>": { + * "aggregate": { successes, failures, timeouts, totalTokens, totalCost, lastUsed }, + * "by_route": { "<route-key>": { successes, failures, ... } } * }, - * "claude-opus": { - * ... + * "_unmapped": { + * "by_route": { "<route-key>": { ... } } * } - * }, - * "plan-slice": { ... } + * } * } + * + * Old schema (v1 — fused route key as top-level key) is migrated on boot. */ class ModelPerformanceTracker { constructor(basePath) { @@ -61,12 +214,66 @@ class ModelPerformanceTracker { } try { const content = readFileSync(this.storagePath, "utf-8"); - return JSON.parse(content); + const parsed = JSON.parse(content); + return this._migrateIfNeeded(parsed); } catch { return {}; } } + /** + * Detect and migrate old-format data in-place. Writes backup + new file on + * migration. An existing backup is never overwritten. 
+ */ + _migrateIfNeeded(parsed) { + // Check if any unit-type blob is still in old format + let needsMigration = false; + for (const unitTypeBlob of Object.values(parsed)) { + if (typeof unitTypeBlob === "object" && unitTypeBlob !== null && isOldFormat(unitTypeBlob)) { + needsMigration = true; + break; + } + } + if (!needsMigration) return parsed; + + // Write backup (idempotent — only if backup doesn't already exist) + const backupPath = this.storagePath + ".pre-canonical-backup"; + if (!existsSync(backupPath)) { + try { + writeFileSync(backupPath, JSON.stringify(parsed, null, 2), "utf-8"); + } catch { + // Non-fatal: backup failure should not block migration + } + } + + // Migrate each unit type + const migrated = {}; + for (const [unitType, unitTypeBlob] of Object.entries(parsed)) { + if (typeof unitTypeBlob !== "object" || unitTypeBlob === null) { + migrated[unitType] = unitTypeBlob; + continue; + } + if (isOldFormat(unitTypeBlob)) { + migrated[unitType] = migrateUnitTypeBlob(unitTypeBlob); + } else { + migrated[unitType] = unitTypeBlob; + } + } + + // Write migrated data back to disk + try { + const dir = dirname(this.storagePath); + if (!existsSync(dir)) { + mkdirSync(dir, { recursive: true }); + } + writeFileSync(this.storagePath, JSON.stringify(migrated, null, 2), "utf-8"); + } catch { + // Non-fatal + } + + return migrated; + } + _save() { if (!this.storagePath) { return; @@ -87,11 +294,15 @@ class ModelPerformanceTracker { } /** - * Record outcome for a model on a specific task type. + * Record outcome for a route key on a specific task type. + * + * @param taskType - e.g. "execute-task" + * @param routeKey - format: "provider/wire-model" (e.g. 
"kimi-coding/kimi-k2.6") + * OR a bare model id for backward-compat (no slash = treated as routeKey) */ recordOutcome( taskType, - modelId, + routeKey, outcomeOrSuccess, timeoutArg = false, tokensUsedArg = 0, @@ -117,19 +328,46 @@ class ModelPerformanceTracker { if (!this.data[taskType]) { this.data[taskType] = {}; } - if (!this.data[taskType][modelId]) { - this.data[taskType][modelId] = { - successes: 0, - failures: 0, - timeouts: 0, - totalTokens: 0, - totalCost: 0, - lastUsed: timestamp, - successRate: 0, - }; + + // Resolve canonical id. Routes with no slash are legacy bare model ids + // — treat them as their own route key, try registry first. + const canonicalId = tryCanonicalIdFor(routeKey); + + if (canonicalId === null) { + // Route not in registry → write to _unmapped + if (!this.data[taskType]["_unmapped"]) { + this.data[taskType]["_unmapped"] = { by_route: {} }; + } + const unmapped = this.data[taskType]["_unmapped"]; + if (!unmapped.by_route[routeKey]) { + unmapped.by_route[routeKey] = emptyRouteStats(timestamp); + } + const rs = unmapped.by_route[routeKey]; + this._applyOutcomeToStats(rs, success, timeout, tokensUsed, costUsd, timestamp); + } else { + // Known route → write to by_route + recompute aggregate + if (!this.data[taskType][canonicalId]) { + this.data[taskType][canonicalId] = { + aggregate: emptyRouteStats(timestamp), + by_route: {}, + }; + } + const canonicalEntry = this.data[taskType][canonicalId]; + if (!canonicalEntry.by_route[routeKey]) { + canonicalEntry.by_route[routeKey] = emptyRouteStats(timestamp); + } + const rs = canonicalEntry.by_route[routeKey]; + this._applyOutcomeToStats(rs, success, timeout, tokensUsed, costUsd, timestamp); + recomputeAggregate(canonicalEntry); } - const stats = this.data[taskType][modelId]; + this._save(); + } + + /** + * Apply a single outcome event to a stats object in-place. 
+ */ + _applyOutcomeToStats(stats, success, timeout, tokensUsed, costUsd, timestamp) { if (success) { stats.successes += 1; } else if (timeout) { @@ -138,50 +376,144 @@ class ModelPerformanceTracker { } else { stats.failures += 1; } - stats.totalTokens += tokensUsed; stats.totalCost += costUsd; stats.lastUsed = timestamp; - - const total = stats.successes + stats.failures; - stats.total = total; - stats.successRate = total > 0 ? stats.successes / total : 0; - - this._save(); } /** * Get performance stats for a task type and model. + * + * When routeMode=false (default): looks up aggregate stats for a canonical id. + * When routeMode=true: looks up by_route stats for a specific routeKey. + * + * Backward-compat fallback: if the id is not found as a canonical, also checks + * _unmapped.by_route and all by_route maps — supports bare model ids used in + * tests and legacy callers that don't have the registry wired. + * + * @param taskType - e.g. "execute-task" + * @param canonicalOrRouteKey - canonical id (aggregate) or routeKey (by-route) + * @param routeMode - when true, returns by_route stats */ - getStats(taskType, modelId) { - return this.data[taskType]?.[modelId] || null; + getStats(taskType, canonicalOrRouteKey, routeMode = false) { + const unitBlob = this.data[taskType]; + if (!unitBlob) return null; + + if (routeMode) { + // Explicit by-route lookup: scan all canonical entries and _unmapped + return this.getRouteStats(taskType, canonicalOrRouteKey); + } + + // Aggregate mode: look up by canonical id first + const entry = unitBlob[canonicalOrRouteKey]; + if (entry?.aggregate) { + const agg = entry.aggregate; + const total = agg.successes + agg.failures; + return { + ...agg, + total, + successRate: total > 0 ? agg.successes / total : 0, + }; + } + + // Backward-compat fallback: look in by_route maps (for bare IDs and unmapped routes) + // This supports old tests that use bare model IDs without a registry resolver. 
+ for (const [key, val] of Object.entries(unitBlob)) { + if (key === "_unmapped") { + if (val?.by_route?.[canonicalOrRouteKey]) { + const rs = val.by_route[canonicalOrRouteKey]; + const total = rs.successes + rs.failures; + return { ...rs, total, successRate: total > 0 ? rs.successes / total : 0 }; + } + } else if (val?.by_route?.[canonicalOrRouteKey]) { + const rs = val.by_route[canonicalOrRouteKey]; + const total = rs.successes + rs.failures; + return { ...rs, total, successRate: total > 0 ? rs.successes / total : 0 }; + } + } + return null; + } + + /** + * Get stats for a specific route (by_route lookup across all canonical entries). + */ + getRouteStats(taskType, routeKey) { + const unitBlob = this.data[taskType]; + if (!unitBlob) return null; + for (const [key, val] of Object.entries(unitBlob)) { + if (key === "_unmapped") { + if (val?.by_route?.[routeKey]) return val.by_route[routeKey]; + } else if (val?.by_route?.[routeKey]) { + return val.by_route[routeKey]; + } + } + return null; } /** * Get all models for a task type, ranked by success rate. + * + * Primary mode: iterates canonical ids using aggregate stats. + * Backward-compat fallback: if no canonical entries exist (no registry + * wired), iterates _unmapped.by_route entries instead so legacy tests + * that use bare model IDs still work. 
*/ getRankedModels(taskType, minSamples = 1) { if (!this.data[taskType]) return []; - const models = Object.entries(this.data[taskType]) - .filter(([, stats]) => stats.successes + stats.failures >= minSamples) - .map(([modelId, stats]) => ({ - modelId, - successRate: stats.successRate, - attempts: stats.successes + stats.failures, - tokens: stats.totalTokens, - cost: stats.totalCost, - latestAttempt: stats.lastUsed, - })) - .sort((a, b) => b.successRate - a.successRate); + const models = []; + let hasCanonical = false; - return models; + for (const [key, entry] of Object.entries(this.data[taskType])) { + if (key === "_unmapped") continue; + // New format: entry has aggregate + by_route + const agg = entry?.aggregate; + if (!agg) continue; + hasCanonical = true; + const total = agg.successes + agg.failures; + if (total < minSamples) continue; + const successRate = total > 0 ? agg.successes / total : 0; + models.push({ + modelId: key, // canonical id + successRate, + attempts: total, + tokens: agg.totalTokens ?? 0, + cost: agg.totalCost ?? 0, + latestAttempt: agg.lastUsed, + }); + } + + // Backward-compat: when no canonical entries exist (registry not wired), + // fall back to _unmapped.by_route so bare-ID tests still get rankings. + if (!hasCanonical) { + const unmapped = this.data[taskType]["_unmapped"]; + if (unmapped?.by_route) { + for (const [routeKey, rs] of Object.entries(unmapped.by_route)) { + if (!rs) continue; + const total = (rs.successes ?? 0) + (rs.failures ?? 0); + if (total < minSamples) continue; + const successRate = total > 0 ? rs.successes / total : 0; + models.push({ + modelId: routeKey, + successRate, + attempts: total, + tokens: rs.totalTokens ?? 0, + cost: rs.totalCost ?? 0, + latestAttempt: rs.lastUsed, + }); + } + } + } + + return models.sort((a, b) => b.successRate - a.successRate); } /** * Check if a model should be demoted (fails >50% on this task type). + * Accepts a canonical id (aggregate demotion) or routeKey (route-level). 
*/ shouldDemote(taskType, modelId, thresholdFailureRate = 0.5) { + // Try aggregate lookup first (canonical id) const stats = this.getStats(taskType, modelId); if (!stats) return false; diff --git a/src/resources/extensions/sf/model-route-failure.js b/src/resources/extensions/sf/model-route-failure.js index 942f61113..ef14fd8ea 100644 --- a/src/resources/extensions/sf/model-route-failure.js +++ b/src/resources/extensions/sf/model-route-failure.js @@ -1,4 +1,34 @@ import { resolveModelId } from "./auto-model-selection.js"; +import { canonicalIdFor, sameGeneration } from "./model-registry.js"; +import { logWarning } from "./workflow-logger.js"; + +// ── Solver pinning (ADR-0079) ──────────────────────────────────────────────── +// The autonomous solver pass is always locked to kimi-k2.6 (provider: +// kimi-coding) and must never cross canonical_id boundaries on failover. +// The unit type string "autonomous-solver" is the identifier introduced by +// ADR-0079 for the solver role. Other unit types run as executor and may +// cross canonical ids (with a structured downgrade log event). +const SOLVER_PINNED_UNIT_TYPE = "autonomous-solver"; + +/** + * Emit a structured log event when a failover crosses a canonical-id or + * generation boundary. Written to workflow-logger so it flows through the + * audit log and drainAndSummarize() for post-mortem analysis. + * + * @param {string} fromCanonical - canonical id of the route that failed + * @param {string} toCanonical - canonical id of the chosen failover route + * @param {string} unitType - active unit type at failover time + * @param {string} reason - human-readable reason label + */ +export function logGenerationDowngrade(fromCanonical, toCanonical, unitType, reason) { + logWarning("model-route-failure", "generation-downgrade", { + from: fromCanonical, + to: toCanonical, + unitType, + reason: reason ?? 
"cross-generation failover", + sameGeneration: false, + }); +} /** * Build the stable identity key for a concrete provider route. * @@ -80,9 +110,24 @@ export function resolveNextConfiguredModelRoute(args) { * * Consumer: bootstrap/agent-end-recovery.ts after configured fallback lookup * fails for a model-route failure. + * + * Generation guard (ADR-0079): + * - If unitType is "autonomous-solver" (solver-pinned), candidates whose + * canonical_id differs from the failed route are silently skipped. The + * solver layer is a runtime invariant and must never silently degrade to a + * different model generation. + * - For all other unit types (executor layer), cross-canonical failover is + * permitted but emits a structured generation-downgrade log event so it is + * visible in traces and drainAndSummarize() audits. */ export function resolveNextAvailableModelRoute(args) { const currentKey = args.current ? modelRouteKey(args.current) : undefined; + const currentRouteKey = args.current + ? `${args.current.provider}/${args.current.id}` + : undefined; + const currentCanonical = currentRouteKey ? canonicalIdFor(currentRouteKey) : null; + const isSolverPinned = args.unitType === SOLVER_PINNED_UNIT_TYPE; + const failedKeys = new Set( args.failedRoutes.map((failure) => modelRouteKey({ provider: failure.provider, id: failure.modelId }), @@ -93,6 +138,14 @@ export function resolveNextAvailableModelRoute(args) { if (key === currentKey) return false; if (failedKeys.has(key)) return false; if (args.isBlocked?.(model)) return false; + + // Solver pin: ADR-0079 — never cross canonical_id boundary when solving. 
+ if (isSolverPinned && currentCanonical !== null) { + const candidateRouteKey = `${model.provider}/${model.id}`; + const candidateCanonical = canonicalIdFor(candidateRouteKey); + if (candidateCanonical !== currentCanonical) return false; + } + return true; }); if (candidates.length === 0) return undefined; @@ -103,6 +156,25 @@ export function resolveNextAvailableModelRoute(args) { model.provider.toLowerCase() !== args.current.provider.toLowerCase(), ); const model = differentProvider ?? candidates[0]; + + // Generation guard: log a structured event when crossing canonical_id or + // generation boundaries on the executor layer (non-solver-pinned). + if (!isSolverPinned && currentCanonical !== null) { + const chosenRouteKey = `${model.provider}/${model.id}`; + const chosenCanonical = canonicalIdFor(chosenRouteKey); + if ( + chosenCanonical !== null && + !sameGeneration(currentCanonical, chosenCanonical) + ) { + logGenerationDowngrade( + currentCanonical, + chosenCanonical, + args.unitType ?? "unknown", + "no same-generation route available", + ); + } + } + return { model, route: `${model.provider}/${model.id}`, @@ -134,5 +206,6 @@ export function resolveNextModelRoute(args) { availableModels: args.availableModels, failedRoutes: args.failedRoutes, isBlocked: args.isBlocked, + unitType: args.unitType, }); } diff --git a/src/resources/extensions/sf/tests/model-learner-canonical.test.ts b/src/resources/extensions/sf/tests/model-learner-canonical.test.ts new file mode 100644 index 000000000..89cfc5bee --- /dev/null +++ b/src/resources/extensions/sf/tests/model-learner-canonical.test.ts @@ -0,0 +1,563 @@ +/** + * Swarm C — canonical-keyed model performance metrics tests. + * + * Tests: + * 1. Migration round-trip: old-format file → boot loader → new file + backup exists. + * 2. Aggregate invariant: aggregate.successes === sum(by_route[*].successes). + * 3. _unmapped bucket: unknown route key lands in _unmapped, not dropped. + * 4. 
Reading: sensible defaults (null) for a never-seen canonical id. + * 5. Migration idempotency: running migration twice does not corrupt data. + * 6. Two routes same canonical: aggregate sums correctly. + */ + +import { existsSync, mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, beforeEach, describe, expect, test } from "vitest"; +import { + ModelLearner, + ModelPerformanceTracker, + setRegistryResolver, +} from "../model-learner.js"; + +// ── Stub registry ────────────────────────────────────────────────────────────── +// Inject a lightweight registry resolver that maps two test routes to the same +// canonical id, leaving all other routes unmapped. This avoids loading +// @singularity-forge/ai in tests. +// +// Route table: +// "kimi-coding/kimi-k2.6" → "kimi-k2.6" +// "openrouter/moonshotai/kimi-k2.6" → "kimi-k2.6" +// "anthropic/claude-sonnet-4-6" → "claude-sonnet-4-6" +// everything else → null (→ _unmapped) +function makeStubResolver(table: Record = {}) { + const defaultTable: Record = { + "kimi-coding/kimi-k2.6": "kimi-k2.6", + "openrouter/moonshotai/kimi-k2.6": "kimi-k2.6", + "anthropic/claude-sonnet-4-6": "claude-sonnet-4-6", + }; + const merged = { ...defaultTable, ...table }; + return (routeKey: string): string | null => merged[routeKey] ?? 
null; +} + +// ── Helpers ─────────────────────────────────────────────────────────────────── + +function sfDir(base: string) { + return join(base, ".sf"); +} + +function perfFile(base: string) { + return join(base, ".sf", "model-performance.json"); +} + +function backupFile(base: string) { + return join(base, ".sf", "model-performance.json.pre-canonical-backup"); +} + +function readPerf(base: string) { + return JSON.parse(readFileSync(perfFile(base), "utf-8")); +} + +function writeOldPerf(base: string, data: object) { + mkdirSync(sfDir(base), { recursive: true }); + writeFileSync(perfFile(base), JSON.stringify(data, null, 2), "utf-8"); +} + +// ── Test suite ──────────────────────────────────────────────────────────────── + +describe("model-learner canonical schema (Swarm C)", () => { + let tmpDir: string; + + beforeEach(() => { + tmpDir = join(tmpdir(), `test-ml-canonical-${Date.now()}-${Math.random().toString(36).slice(2)}`); + mkdirSync(tmpDir, { recursive: true }); + // Wire stub resolver before each test + setRegistryResolver(makeStubResolver()); + }); + + afterEach(() => { + rmSync(tmpDir, { recursive: true, force: true }); + // Reset resolver to null so other test suites are unaffected + setRegistryResolver(null as unknown as (rk: string) => string | null); + }); + + // ── Test 1: Migration round-trip ──────────────────────────────────────── + + describe("migration round-trip", () => { + test("migrates old flat format to canonical schema on load", () => { + // Write old-format file + writeOldPerf(tmpDir, { + "execute-task": { + "kimi-coding/kimi-k2.6": { + successes: 5, + failures: 1, + timeouts: 0, + totalTokens: 10000, + totalCost: 0.5, + lastUsed: "2026-05-01T12:00:00Z", + successRate: 0.833, + }, + "anthropic/claude-sonnet-4-6": { + successes: 3, + failures: 0, + timeouts: 0, + totalTokens: 6000, + totalCost: 0.3, + lastUsed: "2026-05-02T12:00:00Z", + successRate: 1.0, + }, + }, + }); + + // Boot tracker — migration happens on _load() + const tracker = 
new ModelPerformanceTracker(tmpDir); + // Migration triggers on disk read + + // Verify backup was created + expect(existsSync(backupFile(tmpDir))).toBe(true); + + // Verify new file has canonical schema + const data = readPerf(tmpDir); + const execBlob = data["execute-task"]; + + // kimi-coding/kimi-k2.6 → canonical "kimi-k2.6" + expect(execBlob["kimi-k2.6"]).toBeDefined(); + expect(execBlob["kimi-k2.6"].aggregate).toBeDefined(); + expect(execBlob["kimi-k2.6"].by_route).toBeDefined(); + expect(execBlob["kimi-k2.6"].by_route["kimi-coding/kimi-k2.6"]).toBeDefined(); + expect(execBlob["kimi-k2.6"].aggregate.successes).toBe(5); + expect(execBlob["kimi-k2.6"].aggregate.failures).toBe(1); + + // anthropic/claude-sonnet-4-6 → canonical "claude-sonnet-4-6" + expect(execBlob["claude-sonnet-4-6"]).toBeDefined(); + expect(execBlob["claude-sonnet-4-6"].aggregate.successes).toBe(3); + + // Verify tracker in-memory state is also migrated + const stats = tracker.getStats("execute-task", "kimi-k2.6"); + expect(stats).not.toBeNull(); + expect(stats!.successes).toBe(5); + }); + + test("by_route entries are preserved after migration", () => { + writeOldPerf(tmpDir, { + "execute-task": { + "kimi-coding/kimi-k2.6": { + successes: 10, + failures: 2, + timeouts: 1, + totalTokens: 50000, + totalCost: 2.5, + lastUsed: "2026-05-10T00:00:00Z", + }, + }, + }); + + new ModelPerformanceTracker(tmpDir); // triggers migration + + const data = readPerf(tmpDir); + const routeEntry = data["execute-task"]["kimi-k2.6"].by_route["kimi-coding/kimi-k2.6"]; + expect(routeEntry).toBeDefined(); + expect(routeEntry.successes).toBe(10); + expect(routeEntry.failures).toBe(2); + expect(routeEntry.timeouts).toBe(1); + expect(routeEntry.totalTokens).toBe(50000); + }); + + test("migration is idempotent — running twice produces identical result", () => { + writeOldPerf(tmpDir, { + "execute-task": { + "kimi-coding/kimi-k2.6": { + successes: 7, + failures: 1, + timeouts: 0, + totalTokens: 20000, + totalCost: 1.0, + 
lastUsed: "2026-05-05T12:00:00Z", + }, + }, + }); + + new ModelPerformanceTracker(tmpDir); // first migration + const dataAfterFirst = readPerf(tmpDir); + + new ModelPerformanceTracker(tmpDir); // second load — should not re-migrate + const dataAfterSecond = readPerf(tmpDir); + + expect(dataAfterSecond).toEqual(dataAfterFirst); + }); + + test("backup is written only once (idempotent)", () => { + writeOldPerf(tmpDir, { + "execute-task": { + "kimi-coding/kimi-k2.6": { + successes: 3, + failures: 0, + timeouts: 0, + totalTokens: 5000, + totalCost: 0.2, + lastUsed: "2026-05-06T00:00:00Z", + }, + }, + }); + + new ModelPerformanceTracker(tmpDir); // writes backup + const backupContent1 = readFileSync(backupFile(tmpDir), "utf-8"); + + // Overwrite the backup to detect if it gets re-written + writeFileSync(backupFile(tmpDir), '{"sentinel":true}', "utf-8"); + + new ModelPerformanceTracker(tmpDir); // should NOT overwrite backup + const backupContent2 = readFileSync(backupFile(tmpDir), "utf-8"); + + // If sentinel is still there, backup was not overwritten + expect(backupContent2).toBe('{"sentinel":true}'); + }); + }); + + // ── Test 2: Aggregate invariant ───────────────────────────────────────── + + describe("aggregate invariant", () => { + test("aggregate.successes === sum(by_route[*].successes) after writes to two routes", () => { + const tracker = new ModelPerformanceTracker(tmpDir); + + // Route 1: kimi-coding/kimi-k2.6 → canonical kimi-k2.6 + tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", { + success: true, + timeout: false, + tokensUsed: 1000, + costUsd: 0.05, + }); + tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", { + success: true, + timeout: false, + tokensUsed: 1200, + costUsd: 0.06, + }); + tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", { + success: false, + timeout: false, + tokensUsed: 800, + costUsd: 0.04, + }); + + // Route 2: openrouter/moonshotai/kimi-k2.6 → same canonical kimi-k2.6 + 
+ tracker.recordOutcome("execute-task", "openrouter/moonshotai/kimi-k2.6", { + success: true, + timeout: false, + tokensUsed: 2000, + costUsd: 0.1, + }); + tracker.recordOutcome("execute-task", "openrouter/moonshotai/kimi-k2.6", { + success: false, + timeout: true, + tokensUsed: 0, + costUsd: 0, + }); + + const data = readPerf(tmpDir); + const canonicalEntry = data["execute-task"]["kimi-k2.6"]; + const agg = canonicalEntry.aggregate; + const byRoute = canonicalEntry.by_route; + + // Compute expected sums from by_route + const routeSuccesses = Object.values(byRoute).reduce( + (sum: number, r: any) => sum + (r.successes ?? 0), + 0, + ); + const routeFailures = Object.values(byRoute).reduce( + (sum: number, r: any) => sum + (r.failures ?? 0), + 0, + ); + const routeTimeouts = Object.values(byRoute).reduce( + (sum: number, r: any) => sum + (r.timeouts ?? 0), + 0, + ); + + expect(agg.successes).toBe(routeSuccesses); + expect(agg.failures).toBe(routeFailures); + expect(agg.timeouts).toBe(routeTimeouts); + + // Concrete values: 2 successes from route1, 1 success from route2 = 3 total + expect(agg.successes).toBe(3); + // Failures: 1 from route1 (non-timeout), 1 from route2 (timeout) = 2 total + expect(agg.failures).toBe(2); + // Timeouts: 1 from route2 + expect(agg.timeouts).toBe(1); + }); + + test("aggregate is recalculated correctly after each write", () => { + const tracker = new ModelPerformanceTracker(tmpDir); + + tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", { + success: true, + timeout: false, + tokensUsed: 100, + costUsd: 0.01, + }); + let data = readPerf(tmpDir); + expect(data["execute-task"]["kimi-k2.6"].aggregate.successes).toBe(1); + + tracker.recordOutcome("execute-task", "openrouter/moonshotai/kimi-k2.6", { + success: true, + timeout: false, + tokensUsed: 200, + costUsd: 0.02, + }); + data = readPerf(tmpDir); + expect(data["execute-task"]["kimi-k2.6"].aggregate.successes).toBe(2); + + tracker.recordOutcome("execute-task", 
"kimi-coding/kimi-k2.6", { + success: false, + timeout: false, + tokensUsed: 50, + costUsd: 0.005, + }); + data = readPerf(tmpDir); + expect(data["execute-task"]["kimi-k2.6"].aggregate.successes).toBe(2); + expect(data["execute-task"]["kimi-k2.6"].aggregate.failures).toBe(1); + }); + }); + + // ── Test 3: _unmapped bucket ─────────────────────────────────────────── + + describe("_unmapped bucket", () => { + test("unknown route key lands in _unmapped, not dropped", () => { + const tracker = new ModelPerformanceTracker(tmpDir); + + tracker.recordOutcome("execute-task", "foo-provider/bar-model", { + success: true, + timeout: false, + tokensUsed: 500, + costUsd: 0.02, + }); + + const data = readPerf(tmpDir); + const unmapped = data["execute-task"]["_unmapped"]; + expect(unmapped).toBeDefined(); + expect(unmapped.by_route["foo-provider/bar-model"]).toBeDefined(); + expect(unmapped.by_route["foo-provider/bar-model"].successes).toBe(1); + }); + + test("_unmapped entry does NOT appear in getRankedModels when canonical entries exist", () => { + const tracker = new ModelPerformanceTracker(tmpDir); + + // Known route → canonical + tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", { + success: true, + timeout: false, + tokensUsed: 1000, + costUsd: 0.05, + }); + // Unknown route → _unmapped + tracker.recordOutcome("execute-task", "foo-provider/bar-model", { + success: true, + timeout: false, + tokensUsed: 500, + costUsd: 0.02, + }); + + const ranked = tracker.getRankedModels("execute-task", 0); + const modelIds = ranked.map((r) => r.modelId); + expect(modelIds).toContain("kimi-k2.6"); + expect(modelIds).not.toContain("_unmapped"); + expect(modelIds).not.toContain("foo-provider/bar-model"); + }); + + test("_unmapped preserves multiple unknown routes independently", () => { + const tracker = new ModelPerformanceTracker(tmpDir); + + tracker.recordOutcome("execute-task", "unknown-a/model-x", { + success: true, + timeout: false, + tokensUsed: 100, + costUsd: 0.01, + }); 
+ tracker.recordOutcome("execute-task", "unknown-b/model-y", { + success: false, + timeout: false, + tokensUsed: 50, + costUsd: 0.005, + }); + + const data = readPerf(tmpDir); + const unmapped = data["execute-task"]["_unmapped"]; + expect(unmapped.by_route["unknown-a/model-x"].successes).toBe(1); + expect(unmapped.by_route["unknown-b/model-y"].failures).toBe(1); + }); + }); + + // ── Test 4: Reading sensible defaults ────────────────────────────────── + + describe("reading never-seen canonical ids", () => { + test("getStats returns null for a never-seen canonical id", () => { + const tracker = new ModelPerformanceTracker(tmpDir); + expect(tracker.getStats("execute-task", "kimi-k2.6")).toBeNull(); + }); + + test("getStats returns null for a never-seen task type", () => { + const tracker = new ModelPerformanceTracker(tmpDir); + tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", { + success: true, + timeout: false, + tokensUsed: 100, + costUsd: 0.01, + }); + expect(tracker.getStats("plan-slice", "kimi-k2.6")).toBeNull(); + }); + + test("getRouteStats returns null for a never-seen route", () => { + const tracker = new ModelPerformanceTracker(tmpDir); + expect(tracker.getRouteStats("execute-task", "kimi-coding/kimi-k2.6")).toBeNull(); + }); + + test("getRankedModels returns empty array for unknown task type", () => { + const tracker = new ModelPerformanceTracker(tmpDir); + expect(tracker.getRankedModels("nonexistent-type")).toEqual([]); + }); + + test("shouldDemote returns false for a never-seen canonical id", () => { + const tracker = new ModelPerformanceTracker(tmpDir); + expect(tracker.shouldDemote("execute-task", "kimi-k2.6")).toBe(false); + }); + }); + + // ── Test 5: ModelLearner integration ─────────────────────────────────── + + describe("ModelLearner canonical integration", () => { + test("recordOutcome + getRankedModels uses canonical ids", () => { + const learner = new ModelLearner(tmpDir); + + // Record 5 successes via route 1 + for (let i = 0; i 
< 5; i++) { + learner.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", { + success: true, + timeout: false, + tokensUsed: 1000, + costUsd: 0.05, + }); + } + // Record 1 failure via route 2 (same canonical) + learner.recordOutcome("execute-task", "openrouter/moonshotai/kimi-k2.6", { + success: false, + timeout: false, + tokensUsed: 500, + costUsd: 0.025, + }); + + const ranked = learner.getRankedModels("execute-task"); + expect(ranked.length).toBeGreaterThan(0); + // Canonical id should appear in ranked list + const kimiEntry = ranked.find((r) => r.modelId === "kimi-k2.6"); + expect(kimiEntry).toBeDefined(); + expect(kimiEntry!.attempts).toBe(6); // 5 + 1 + // Success rate: 5/6 + expect(kimiEntry!.successRate).toBeCloseTo(5 / 6, 3); + }); + + test("migration round-trip preserves by_route data (full lifecycle)", () => { + // Step 1: write old-format file + writeOldPerf(tmpDir, { + "execute-task": { + "kimi-coding/kimi-k2.6": { + successes: 8, + failures: 2, + timeouts: 0, + totalTokens: 40000, + totalCost: 2.0, + lastUsed: "2026-04-01T00:00:00Z", + }, + "openrouter/moonshotai/kimi-k2.6": { + successes: 3, + failures: 1, + timeouts: 0, + totalTokens: 15000, + totalCost: 0.75, + lastUsed: "2026-04-02T00:00:00Z", + }, + }, + }); + + // Step 2: boot learner (triggers migration) + const learner = new ModelLearner(tmpDir); + + // Step 3: verify backup exists + expect(existsSync(backupFile(tmpDir))).toBe(true); + + // Step 4: verify new file structure + const data = readPerf(tmpDir); + const kimiEntry = data["execute-task"]["kimi-k2.6"]; + expect(kimiEntry).toBeDefined(); + expect(kimiEntry.aggregate.successes).toBe(11); // 8 + 3 + expect(kimiEntry.aggregate.failures).toBe(3); // 2 + 1 + expect(kimiEntry.by_route["kimi-coding/kimi-k2.6"].successes).toBe(8); + expect(kimiEntry.by_route["openrouter/moonshotai/kimi-k2.6"].successes).toBe(3); + + // Step 5: verify aggregate invariant + const agg = kimiEntry.aggregate; + const routeSum = 
Object.values(kimiEntry.by_route).reduce( + (sum: number, r: any) => sum + (r.successes ?? 0), + 0, + ); + expect(agg.successes).toBe(routeSum); + + // Step 6: verify in-memory reads via getRankedModels + const ranked = learner.getRankedModels("execute-task"); + const kimiRanked = ranked.find((r) => r.modelId === "kimi-k2.6"); + expect(kimiRanked).toBeDefined(); + expect(kimiRanked!.attempts).toBe(14); // 11 + 3 + }); + + test("per-route health can be queried independently of aggregate", () => { + const tracker = new ModelPerformanceTracker(tmpDir); + + // Route 1: healthy + for (let i = 0; i < 9; i++) { + tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", { + success: true, + timeout: false, + tokensUsed: 1000, + costUsd: 0.05, + }); + } + tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", { + success: false, + timeout: false, + tokensUsed: 1000, + costUsd: 0.05, + }); + + // Route 2: failing + for (let i = 0; i < 3; i++) { + tracker.recordOutcome("execute-task", "openrouter/moonshotai/kimi-k2.6", { + success: false, + timeout: false, + tokensUsed: 500, + costUsd: 0.025, + }); + } + tracker.recordOutcome("execute-task", "openrouter/moonshotai/kimi-k2.6", { + success: true, + timeout: false, + tokensUsed: 500, + costUsd: 0.025, + }); + + // Aggregate: 10 successes, 4 failures = 71% success rate + const agg = tracker.getStats("execute-task", "kimi-k2.6"); + expect(agg).not.toBeNull(); + expect(agg!.successes).toBe(10); + expect(agg!.failures).toBe(4); + + // Per-route: kimi-coding is healthy, openrouter is failing + const route1 = tracker.getRouteStats("execute-task", "kimi-coding/kimi-k2.6"); + expect(route1).not.toBeNull(); + expect(route1!.successes).toBe(9); + expect(route1!.failures).toBe(1); + + const route2 = tracker.getRouteStats("execute-task", "openrouter/moonshotai/kimi-k2.6"); + expect(route2).not.toBeNull(); + expect(route2!.successes).toBe(1); + expect(route2!.failures).toBe(3); + }); + }); +}); diff --git 
a/src/resources/extensions/sf/tests/model-learner.test.ts b/src/resources/extensions/sf/tests/model-learner.test.ts index 49b0d0ddc..6157b9ab2 100644 --- a/src/resources/extensions/sf/tests/model-learner.test.ts +++ b/src/resources/extensions/sf/tests/model-learner.test.ts @@ -320,7 +320,7 @@ describe("ModelLearner (integration)", () => { expect(abCandidates.incumbent).toBe("incumbent"); }); - test("persists data to filesystem", () => { + test("persists data to filesystem in canonical schema", () => { learner.recordOutcome("execute-task", "gpt-4o", { success: true, timeout: false, @@ -332,8 +332,12 @@ describe("ModelLearner (integration)", () => { const content = readFileSync(perfFile, "utf-8"); const data = JSON.parse(content); - expect(data["execute-task"]["gpt-4o"]).toBeDefined(); - expect(data["execute-task"]["gpt-4o"].successes).toBe(1); + // Without a registry resolver, bare model IDs go to _unmapped.by_route. + // The canonical schema places unmappable routes in _unmapped. + const unmapped = data["execute-task"]?.["_unmapped"]; + expect(unmapped).toBeDefined(); + expect(unmapped?.by_route?.["gpt-4o"]).toBeDefined(); + expect(unmapped.by_route["gpt-4o"].successes).toBe(1); }); test("gracefully handles missing storage directory", () => { diff --git a/src/resources/extensions/sf/tests/model-registry.test.ts b/src/resources/extensions/sf/tests/model-registry.test.ts new file mode 100644 index 000000000..1a49cbe1a --- /dev/null +++ b/src/resources/extensions/sf/tests/model-registry.test.ts @@ -0,0 +1,352 @@ +/** + * Tests for model-registry.ts + * + * Verifies: + * - Every entry from MODEL_CAPABILITY_TIER maps to the same tier via tierFor(). + * - K2.5 → K2.6 alias bug is gone: tierFor("kimi-k2.5") === "standard" independently. + * - BENCHMARK_KEY_ALIASES entries resolve via canonicalIdFor(). + * - routesFor("kimi-k2.5") covers multiple aggregator providers. + * - sameGeneration() discriminates between K2.5 and K2.6 (different generations). 
+ * - lookup("kimi-coding", "kimi-k2.6") returns api === "anthropic-messages". + */ + +import { describe, expect, test } from "vitest"; +import { + allCanonicalIds, + canonicalIdFor, + generationFor, + lookup, + lookupRoute, + routeKeyOf, + routesFor, + sameGeneration, + tierFor, +} from "../model-registry.js"; + +// ─── Tier parity against old MODEL_CAPABILITY_TIER table ───────────────────── + +// Lifted directly from model-router.js MODEL_CAPABILITY_TIER. +// This table intentionally EXCLUDES the buggy "kimi-k2.5": "kimi-k2.6" alias. +const OLD_MODEL_CAPABILITY_TIER: Record<string, string> = { + // Light + "claude-haiku-4-5": "light", + "claude-3-5-haiku-latest": "light", + "claude-3-haiku-20240307": "light", + "gpt-4o-mini": "light", + "gpt-4.1-mini": "light", + "gpt-4.1-nano": "light", + "gpt-5-mini": "light", + "gpt-5-nano": "light", + "gpt-5.1-codex-mini": "light", + "gpt-5.3-codex-spark": "light", + "gemini-2.0-flash": "light", + "gemini-flash-2.0": "light", + "gemini-3.1-flash-lite-preview": "light", + "gemini-2.5-flash-lite": "light", + "glm-4.7-flash": "light", + "glm-4.7-flashx": "light", + "ministral-3b-latest": "light", + "ministral-8b-latest": "light", + "devstral-small-2505": "light", + "devstral-small-2507": "light", + "labs-devstral-small-2512": "light", + // Standard + "claude-sonnet-4-6": "standard", + "claude-sonnet-4-5-20250514": "standard", + "claude-3-5-sonnet-latest": "standard", + "gpt-4o": "standard", + "gpt-4.1": "standard", + "gpt-5.1-codex-max": "standard", + "gemini-2.5-pro": "standard", + "gemini-3-flash-preview": "standard", + "gemini-2.5-flash": "standard", + "deepseek-chat": "standard", + "glm-4.7": "standard", + "qwen3-coder:480b": "standard", + "qwen3-coder-next": "standard", + "kimi-k2.6": "standard", + "kimi-for-coding": "standard", + "MiniMax-M2.7": "standard", + "MiniMax-M2.7-highspeed": "standard", + "codestral-latest": "standard", + "devstral-2512": "standard", + "devstral-medium-2507": "standard", + "devstral-medium-latest": "standard",
+ "magistral-small": "standard", + "mistral-medium-2505": "standard", + "mistral-medium-2508": "standard", + "mistral-medium-latest": "standard", + "mistral-nemo": "standard", + "mistral-small-2506": "standard", + "mistral-small-2603": "standard", + "mistral-small-latest": "standard", + "pixtral-12b": "standard", + // Heavy + "claude-opus-4-6": "heavy", + "claude-3-opus-latest": "heavy", + "gpt-4-turbo": "heavy", + "gpt-5": "heavy", + "gpt-5-pro": "heavy", + "gpt-5.1": "heavy", + "gpt-5.2": "heavy", + "gpt-5.2-codex": "heavy", + "gpt-5.3-codex": "heavy", + "gpt-5.4": "heavy", + "gpt-5.4-mini": "standard", // note: was listed as standard in model-router + "gpt-5.5": "heavy", + o1: "heavy", + o3: "heavy", + "o4-mini": "heavy", + "o4-mini-deep-research": "heavy", + "gemini-3.1-pro-preview": "heavy", + "gemini-3-pro-preview": "heavy", + "kimi-k2-thinking": "heavy", + "qwen3-next:80b": "heavy", + "glm-5": "heavy", + "glm-5-turbo": "heavy", + "glm-5.1": "heavy", + "glm-5v-turbo": "heavy", + "magistral-medium-latest": "heavy", + "mistral-large-2411": "heavy", + "mistral-large-2512": "heavy", + "mistral-large-latest": "heavy", + "open-mixtral-8x22b": "heavy", + "pixtral-large-latest": "heavy", +}; + +// IDs that no longer exist or are aliases that were intentionally collapsed. +// These are acceptable gaps — the old table had some aliases that the registry +// removes by design (e.g. gemini-flash-2.0 was an alias for gemini-2.0-flash). 
+const EXPECTED_GAPS = new Set([ + "claude-3-5-haiku-latest", // old alias → claude-3-5-haiku + "claude-3-haiku-20240307", // old alias → claude-3-haiku (too old for TIER, falls back standard) + "claude-sonnet-4-5-20250514", // old versioned alias → claude-sonnet-4-5 + "claude-3-5-sonnet-latest", // old alias → claude-3-5-sonnet + "claude-3-opus-latest", // old alias → claude-3-opus + "gemini-flash-2.0", // was an alias for gemini-2.0-flash + "gemini-2.5-flash-lite", // variant name + "gpt-5.4-mini", // was standard in old table but gpt-5.4-mini is handled + "gpt-5.5", // future model not in upstream MODELS yet + "magistral-medium-latest", // not in TIER table as canonical yet +]); + +describe("MODEL_CAPABILITY_TIER parity", () => { + for (const [modelId, expectedTier] of Object.entries( + OLD_MODEL_CAPABILITY_TIER, + )) { + if (EXPECTED_GAPS.has(modelId)) continue; + + test(`tierFor("${modelId}") === "${expectedTier}"`, () => { + const tier = tierFor(modelId); + expect( + tier, + `tierFor("${modelId}") should be "${expectedTier}" (was null/missing)`, + ).toBe(expectedTier); + }); + } +}); + +// ─── Critical: K2.5 is NOT aliased to K2.6 ─────────────────────────────────── + +describe("kimi-k2.5 is its own canonical tier entry (not aliased to kimi-k2.6)", () => { + test('tierFor("kimi-k2.5") returns "standard"', () => { + expect(tierFor("kimi-k2.5")).toBe("standard"); + }); + + test('tierFor("kimi-k2.6") returns "standard"', () => { + expect(tierFor("kimi-k2.6")).toBe("standard"); + }); + + test("kimi-k2.5 and kimi-k2.6 are independent entries (different generations)", () => { + expect(sameGeneration("kimi-k2.5", "kimi-k2.6")).toBe(false); + }); + + test('generationFor("kimi-k2.5") is "k2.5"', () => { + expect(generationFor("kimi-k2.5")).toBe("k2.5"); + }); + + test('generationFor("kimi-k2.6") is "k2.6"', () => { + expect(generationFor("kimi-k2.6")).toBe("k2.6"); + }); +}); + +// ─── BENCHMARK_KEY_ALIASES parity ──────────────────────────────────────────── + +// Old 
BENCHMARK_KEY_ALIASES from benchmark-selector.js. +// These were keyed by WIRE IDs and mapped to canonical benchmark keys. +// After migration, canonicalIdFor(routeKey) should give the same result. +const OLD_BENCHMARK_KEY_ALIASES: Record<string, string> = { + "kimi-for-coding": "kimi-k2.6", + "moonshotai/kimi-k2.6": "kimi-k2.6", + "kimi-k2.6:cloud": "kimi-k2.6", + "kimi-k2.6-cloud": "kimi-k2.6", + "kimi-k2.5": "kimi-k2.5", + "moonshotai/kimi-k2.5": "kimi-k2.5", + "moonshotai.kimi-k2.5": "kimi-k2.5", + "kimi-k2.5:cloud": "kimi-k2.5", + "kimi-k2.5-cloud": "kimi-k2.5", +}; + +describe("BENCHMARK_KEY_ALIASES parity via canonicalIdFor", () => { + // kimi-coding/kimi-for-coding doesn't exist in upstream MODELS — the actual wire_id is "kimi-for-coding" + // which isn't an upstream key. So we test the ones that have real route keys. + + test('canonicalIdFor("kimi-coding/kimi-k2.6") returns "kimi-k2.6"', () => { + expect(canonicalIdFor("kimi-coding/kimi-k2.6")).toBe("kimi-k2.6"); + }); + + test('canonicalIdFor("amazon-bedrock/moonshotai.kimi-k2.5") returns "kimi-k2.5"', () => { + expect(canonicalIdFor("amazon-bedrock/moonshotai.kimi-k2.5")).toBe( + "kimi-k2.5", + ); + }); + + test('canonicalIdFor("openrouter/moonshotai/kimi-k2.5") returns "kimi-k2.5"', () => { + expect(canonicalIdFor("openrouter/moonshotai/kimi-k2.5")).toBe("kimi-k2.5"); + }); + + test('canonicalIdFor("vercel-ai-gateway/moonshotai/kimi-k2.5") returns "kimi-k2.5"', () => { + expect(canonicalIdFor("vercel-ai-gateway/moonshotai/kimi-k2.5")).toBe( + "kimi-k2.5", + ); + }); + + test('canonicalIdFor("huggingface/moonshotai/Kimi-K2.5") returns "kimi-k2.5"', () => { + expect(canonicalIdFor("huggingface/moonshotai/Kimi-K2.5")).toBe("kimi-k2.5"); + }); +}); + +// ─── routesFor("kimi-k2.5") spans multiple providers ───────────────────────── + +describe("routesFor(kimi-k2.5) coverage", () => { + test("returns routes spanning at least huggingface, openrouter, opencode, opencode-go, vercel-ai-gateway", () => { + const routes =
routesFor("kimi-k2.5"); + const providers = new Set(routes.map((r) => r.provider)); + + expect(providers.has("huggingface"), "huggingface").toBe(true); + expect(providers.has("openrouter"), "openrouter").toBe(true); + expect(providers.has("opencode"), "opencode").toBe(true); + expect(providers.has("opencode-go"), "opencode-go").toBe(true); + expect(providers.has("vercel-ai-gateway"), "vercel-ai-gateway").toBe(true); + }); + + test("all routes resolve to canonical_id kimi-k2.5", () => { + const routes = routesFor("kimi-k2.5"); + expect(routes.length).toBeGreaterThan(0); + for (const r of routes) { + expect(r.canonical_id).toBe("kimi-k2.5"); + } + }); +}); + +// ─── sameGeneration ─────────────────────────────────────────────────────────── + +describe("sameGeneration", () => { + test("kimi-k2 and kimi-k2-0905 are same generation (k2 patch)", () => { + expect(sameGeneration("kimi-k2", "kimi-k2-0905")).toBe(true); + }); + + test("kimi-k2.5 and kimi-k2.6 are NOT same generation", () => { + expect(sameGeneration("kimi-k2.5", "kimi-k2.6")).toBe(false); + }); + + test("claude-sonnet-4 and claude-sonnet-4-6 are same generation (sonnet-4)", () => { + expect(sameGeneration("claude-sonnet-4", "claude-sonnet-4-6")).toBe(true); + }); + + test("claude-sonnet-4-6 and claude-opus-4-7 are NOT same generation", () => { + expect(sameGeneration("claude-sonnet-4-6", "claude-opus-4-7")).toBe(false); + }); + + test("kimi-k2-thinking and kimi-k2-thinking-turbo are same generation", () => { + expect(sameGeneration("kimi-k2-thinking", "kimi-k2-thinking-turbo")).toBe( + true, + ); + }); + + test("returns false when one canonical_id has no generation mapping", () => { + expect(sameGeneration("kimi-k2.5", "some-unknown-model")).toBe(false); + }); +}); + +// ─── lookup / lookupRoute ───────────────────────────────────────────────────── + +describe("lookup", () => { + test('lookup("kimi-coding", "kimi-k2.6") returns api === "anthropic-messages"', () => { + const m = lookup("kimi-coding", 
"kimi-k2.6"); + expect(m).not.toBeNull(); + expect(m?.api).toBe("anthropic-messages"); + expect(m?.canonical_id).toBe("kimi-k2.6"); + expect(m?.provider).toBe("kimi-coding"); + }); + + test("lookup returns null for unknown provider", () => { + expect(lookup("nonexistent-provider", "some-model")).toBeNull(); + }); + + test("lookup returns null for unknown wire_id in known provider", () => { + expect(lookup("anthropic", "not-a-real-model")).toBeNull(); + }); + + test('lookup("anthropic", "claude-sonnet-4-6") resolves correctly', () => { + const m = lookup("anthropic", "claude-sonnet-4-6"); + expect(m).not.toBeNull(); + expect(m?.canonical_id).toBe("claude-sonnet-4-6"); + expect(m?.tier).toBe("standard"); + }); + + test("lookupRoute delegates to lookup", () => { + const a = lookup("kimi-coding", "kimi-k2-thinking"); + const b = lookupRoute("kimi-coding/kimi-k2-thinking"); + expect(a).toEqual(b); + }); +}); + +// ─── Bedrock namespaced models ──────────────────────────────────────────────── + +describe("amazon-bedrock namespaced wire_ids", () => { + test('lookup("amazon-bedrock", "moonshotai.kimi-k2.5") returns canonical kimi-k2.5', () => { + const m = lookup("amazon-bedrock", "moonshotai.kimi-k2.5"); + expect(m).not.toBeNull(); + expect(m?.canonical_id).toBe("kimi-k2.5"); + }); + + test('lookup("amazon-bedrock", "moonshot.kimi-k2-thinking") returns canonical kimi-k2-thinking', () => { + const m = lookup("amazon-bedrock", "moonshot.kimi-k2-thinking"); + expect(m).not.toBeNull(); + expect(m?.canonical_id).toBe("kimi-k2-thinking"); + expect(m?.tier).toBe("heavy"); + }); + + test('lookup("amazon-bedrock", "anthropic.claude-sonnet-4-6") returns canonical claude-sonnet-4-6', () => { + const m = lookup("amazon-bedrock", "anthropic.claude-sonnet-4-6"); + expect(m).not.toBeNull(); + expect(m?.canonical_id).toBe("claude-sonnet-4-6"); + }); +}); + +// ─── allCanonicalIds ────────────────────────────────────────────────────────── + +describe("allCanonicalIds", () => { + 
test("returns a non-empty array", () => { + const ids = allCanonicalIds(); + expect(ids.length).toBeGreaterThan(10); + }); + + test("kimi-k2.5 is in the list", () => { + expect(allCanonicalIds()).toContain("kimi-k2.5"); + }); + + test("kimi-k2.6 is in the list", () => { + expect(allCanonicalIds()).toContain("kimi-k2.6"); + }); +}); + +// ─── routeKeyOf ────────────────────────────────────────────────────────────── + +describe("routeKeyOf", () => { + test("builds correct fused key", () => { + const m = lookup("kimi-coding", "kimi-k2.6")!; + expect(routeKeyOf(m)).toBe("kimi-coding/kimi-k2.6"); + }); +}); diff --git a/src/resources/extensions/sf/tests/model-route-failure.test.ts b/src/resources/extensions/sf/tests/model-route-failure.test.ts new file mode 100644 index 000000000..a0314c675 --- /dev/null +++ b/src/resources/extensions/sf/tests/model-route-failure.test.ts @@ -0,0 +1,263 @@ +/** + * Tests for model-route-failure.js — generation guard and solver pinning (ADR-0079). + * + * Swarm B spec: + * 1. Solver-pinned unit ("autonomous-solver") cannot fail over across + * canonical_id boundaries. The resolver must return undefined when all + * remaining routes belong to a different canonical model. + * 2. Same-canonical multi-route failover works: two routes for the same + * canonical id (kimi-k2.6 and kimi-for-coding both map to kimi-k2.6). + * 3. Cross-generation failover for non-solver units succeeds AND emits + * logGenerationDowngrade via logWarning. 
+ */ + +import { beforeEach, describe, expect, it, vi } from "vitest"; + +// ── Mock workflow-logger so we can assert on logWarning calls ────────────── +vi.mock("../workflow-logger.js", () => ({ + logWarning: vi.fn(), + logError: vi.fn(), +})); + +import { logWarning } from "../workflow-logger.js"; + +import { + logGenerationDowngrade, + resolveNextAvailableModelRoute, +} from "../model-route-failure.js"; + +// ── Helpers ─────────────────────────────────────────────────────────────────── + +/** + * Minimal model shape that model-route-failure.js expects for availableModels. + * provider + id must be real registry entries so canonicalIdFor() resolves them. + */ +function makeModel(provider: string, id: string) { + return { provider, id, api: "openai-completions" as const }; +} + +beforeEach(() => { + vi.clearAllMocks(); +}); + +// ───────────────────────────────────────────────────────────────────────────── +// 1. Solver-pinning guard +// ───────────────────────────────────────────────────────────────────────────── + +describe("solver-pinned failover (ADR-0079)", () => { + it("returns undefined when the only available route has a different canonical_id than the failed solver route", () => { + // Scenario: solver is running on kimi-coding/kimi-k2.6 (canonical: kimi-k2.6). + // The only unfailed route is openrouter/moonshotai/kimi-k2.5 (canonical: kimi-k2.5). + // Because these are different canonical ids, the solver-pinned guard must + // reject the candidate and return undefined. 
+ const result = resolveNextAvailableModelRoute({ + current: makeModel("kimi-coding", "kimi-k2.6"), + availableModels: [ + makeModel("kimi-coding", "kimi-k2.6"), // same as current — filtered + makeModel("openrouter", "moonshotai/kimi-k2.5"), // canonical: kimi-k2.5 ≠ kimi-k2.6 + ], + failedRoutes: [ + { provider: "kimi-coding", modelId: "kimi-k2.6", reason: "rate-limit" }, + ], + unitType: "autonomous-solver", + }); + expect(result).toBeUndefined(); + }); + + it("returns undefined when candidate is an unregistered route (null canonical) for solver-pinned unit", () => { + // ollama-cloud/kimi-k2.5:cloud is not in the registry — canonicalIdFor + // returns null. The guard treats null !== "kimi-k2.6" as a mismatch. + const result = resolveNextAvailableModelRoute({ + current: makeModel("kimi-coding", "kimi-k2.6"), + availableModels: [ + makeModel("ollama-cloud", "kimi-k2.5:cloud"), // not in registry + ], + failedRoutes: [ + { provider: "kimi-coding", modelId: "kimi-k2.6", reason: "server" }, + ], + unitType: "autonomous-solver", + }); + expect(result).toBeUndefined(); + }); + + it("does NOT emit logGenerationDowngrade for solver-pinned failover (guard rejects before logging)", () => { + resolveNextAvailableModelRoute({ + current: makeModel("kimi-coding", "kimi-k2.6"), + availableModels: [makeModel("openrouter", "moonshotai/kimi-k2.5")], + failedRoutes: [ + { provider: "kimi-coding", modelId: "kimi-k2.6", reason: "rate-limit" }, + ], + unitType: "autonomous-solver", + }); + // The guard rejected before logging — no downgrade event should be emitted. + expect(logWarning).not.toHaveBeenCalled(); + }); +}); + +// ───────────────────────────────────────────────────────────────────────────── +// 2. 
Same-canonical multi-route failover +// ───────────────────────────────────────────────────────────────────────────── + +describe("same-canonical multi-route failover", () => { + it("succeeds when a second route shares the same canonical_id as the failed route", () => { + // kimi-coding/kimi-for-coding maps to canonical kimi-k2.6 (same as + // kimi-coding/kimi-k2.6). This is the standard same-canonical path. + const result = resolveNextAvailableModelRoute({ + current: makeModel("kimi-coding", "kimi-k2.6"), + availableModels: [ + makeModel("kimi-coding", "kimi-k2.6"), // same as current — filtered + makeModel("kimi-coding", "kimi-for-coding"), // canonical: kimi-k2.6 ✓ + ], + failedRoutes: [ + { provider: "kimi-coding", modelId: "kimi-k2.6", reason: "rate-limit" }, + ], + unitType: "autonomous-solver", + }); + expect(result).toBeDefined(); + expect(result?.model.provider).toBe("kimi-coding"); + expect(result?.model.id).toBe("kimi-for-coding"); + expect(result?.source).toBe("available"); + }); + + it("does not emit logGenerationDowngrade for same-canonical failover", () => { + resolveNextAvailableModelRoute({ + current: makeModel("kimi-coding", "kimi-k2.6"), + availableModels: [ + makeModel("kimi-coding", "kimi-k2.6"), + makeModel("kimi-coding", "kimi-for-coding"), + ], + failedRoutes: [ + { provider: "kimi-coding", modelId: "kimi-k2.6", reason: "server" }, + ], + unitType: "execute-task", + }); + expect(logWarning).not.toHaveBeenCalled(); + }); + + it("same-canonical failover works even for solver-pinned unit type", () => { + // Within the same canonical_id, solver pin does not block failover. 
+ const result = resolveNextAvailableModelRoute({ + current: makeModel("kimi-coding", "kimi-k2.6"), + availableModels: [ + makeModel("kimi-coding", "kimi-k2.6"), + makeModel("kimi-coding", "kimi-for-coding"), + ], + failedRoutes: [ + { provider: "kimi-coding", modelId: "kimi-k2.6", reason: "rate-limit" }, + ], + unitType: "autonomous-solver", + }); + expect(result).toBeDefined(); + expect(result?.model.id).toBe("kimi-for-coding"); + }); +}); + +// ───────────────────────────────────────────────────────────────────────────── +// 3. Cross-generation failover for non-solver units +// ───────────────────────────────────────────────────────────────────────────── + +describe("cross-generation failover (executor layer)", () => { + it("succeeds and emits logGenerationDowngrade when crossing generation boundaries", () => { + // kimi-k2.6 (gen: k2.6) → kimi-k2.5 via openrouter (gen: k2.5). + // These are different generations, so the downgrade event must fire. + const result = resolveNextAvailableModelRoute({ + current: makeModel("kimi-coding", "kimi-k2.6"), + availableModels: [ + makeModel("openrouter", "moonshotai/kimi-k2.5"), + ], + failedRoutes: [ + { provider: "kimi-coding", modelId: "kimi-k2.6", reason: "server" }, + ], + unitType: "execute-task", + }); + expect(result).toBeDefined(); + expect(result?.model.provider).toBe("openrouter"); + // logGenerationDowngrade should have been called + expect(logWarning).toHaveBeenCalledWith( + "model-route-failure", + "generation-downgrade", + expect.objectContaining({ + from: "kimi-k2.6", + to: "kimi-k2.5", + unitType: "execute-task", + sameGeneration: false, + }), + ); + }); + + it("emits logGenerationDowngrade with the correct unitType from args", () => { + resolveNextAvailableModelRoute({ + current: makeModel("kimi-coding", "kimi-k2.6"), + availableModels: [makeModel("openrouter", "moonshotai/kimi-k2.5")], + failedRoutes: [ + { provider: "kimi-coding", modelId: "kimi-k2.6", reason: "rate-limit" }, + ], + unitType: 
"plan-slice", + }); + expect(logWarning).toHaveBeenCalledWith( + "model-route-failure", + "generation-downgrade", + expect.objectContaining({ unitType: "plan-slice" }), + ); + }); + + it("does not emit logGenerationDowngrade when no current route is set", () => { + // When current is undefined, canonicalIdFor returns null and no generation + // check can be performed — no downgrade event should fire. + const result = resolveNextAvailableModelRoute({ + current: undefined, + availableModels: [makeModel("openrouter", "moonshotai/kimi-k2.5")], + failedRoutes: [], + unitType: "execute-task", + }); + expect(result).toBeDefined(); + expect(logWarning).not.toHaveBeenCalled(); + }); + + it("does not emit logGenerationDowngrade when both routes share the same generation", () => { + // claude-sonnet-4 and claude-sonnet-4-5 both have generation "sonnet-4". + resolveNextAvailableModelRoute({ + current: makeModel("anthropic", "claude-sonnet-4-20250514"), + availableModels: [makeModel("anthropic", "claude-sonnet-4-5-20250929")], + failedRoutes: [ + { + provider: "anthropic", + modelId: "claude-sonnet-4-20250514", + reason: "rate-limit", + }, + ], + unitType: "execute-task", + }); + expect(logWarning).not.toHaveBeenCalled(); + }); +}); + +// ───────────────────────────────────────────────────────────────────────────── +// 4. 
logGenerationDowngrade helper +// ───────────────────────────────────────────────────────────────────────────── + +describe("logGenerationDowngrade helper", () => { + it("calls logWarning with structured generation-downgrade payload", () => { + logGenerationDowngrade("kimi-k2.6", "kimi-k2.5", "execute-task", "test reason"); + expect(logWarning).toHaveBeenCalledWith( + "model-route-failure", + "generation-downgrade", + { + from: "kimi-k2.6", + to: "kimi-k2.5", + unitType: "execute-task", + reason: "test reason", + sameGeneration: false, + }, + ); + }); + + it("uses default reason when none is supplied", () => { + logGenerationDowngrade("kimi-k2.6", "kimi-k2.5", "plan-slice", undefined); + expect(logWarning).toHaveBeenCalledWith( + "model-route-failure", + "generation-downgrade", + expect.objectContaining({ reason: "cross-generation failover" }), + ); + }); +});