feat(sf): generation-aware failover + canonical-keyed metrics
Two parallel refactors building on the model-registry consolidation:
1. Generation-aware failover (model-route-failure.js, agent-end-recovery.js)
- resolveNextModelRoute now takes unitType so it knows whether the
caller is solver-pinned per ADR-0079 (autonomous-solver). When pinned,
rejects candidates whose canonicalIdFor() differs from the failed
route's canonical id — closes the latent solver-invariant violation
where kimi-coding/kimi-k2.6 could silently fail over to
ollama-cloud/kimi-k2.5:cloud (different generation).
- Cross-generation failover in non-pinned units now emits a structured
logWarning so generation downgrades are visible in traces instead of
looking like an equivalent route switch.
2. Canonical-keyed performance metrics (model-learner.js)
- .sf/model-performance.json now keys by canonical_id with an
{aggregate, by_route} sub-shape instead of fused provider/wire-model
strings. Cross-route history per model is now coherent — outcomes for
kimi-k2.6 reached via kimi-coding accumulate into the same aggregate as
outcomes for kimi-k2.6 reached via openrouter.
- Migration runs at boot: detects old shape (no 'aggregate' key in
unit-type blob values), distributes each entry into by_route,
recomputes aggregate, writes a backup to
.sf/model-performance.json.pre-canonical-backup. Unmappable route
keys land in _unmapped so nothing is dropped.
- getRouteStats(taskType, routeKey) added for per-route failover
ordering; existing getRankedModels emits canonical IDs for
cross-route strength queries.
3. Tests
- model-registry.test.ts: bundled in this commit (Swarm A's test file
was left untracked when the registry module was committed).
- model-route-failure.test.ts: 12 tests covering solver-pin guard,
same-canonical multi-route failover, generation-downgrade log emit.
- model-learner-canonical.test.ts: 17 tests covering migration
round-trip, aggregate invariant, _unmapped bucket, and zero-default
reads.
- model-learner.test.ts: one existing test updated for the new
_unmapped.by_route shape on bare model IDs.
4. Results
- Targeted tests: 147/147 across registry, route-failure, learner,
learner-canonical.
- Full npm run test:unit: 4707 pass, 0 fail, 83 skipped (no new
regressions vs pre-edit baseline of 4669).
Work parallelized across two Sonnet 4.6 sub-agents in isolated git
worktrees. Contract authored in docs/dev/drafts/model-registry-contract.md
(committed earlier in 1d753af6b) and consumed by both agents.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
09bc50f0f6
commit
7570aac4b7
7 changed files with 1640 additions and 52 deletions
|
|
@ -84,6 +84,7 @@ async function trySwitchToFallbackModel(args) {
|
|||
availableModels,
|
||||
failedRoutes: getCurrentUnitModelFailures(),
|
||||
isBlocked,
|
||||
unitType: args.unitType,
|
||||
});
|
||||
if (!nextRoute) return false;
|
||||
const ok = await args.pi.setModel(nextRoute.model, {
|
||||
|
|
|
|||
|
|
@ -21,27 +21,180 @@ import { dirname, join } from "node:path";
|
|||
|
||||
const MODEL_FAILURE_LOG_SCHEMA_VERSION = 1;
|
||||
|
||||
/**
|
||||
* Reference to canonicalIdFor from model-registry.
|
||||
*
|
||||
* Default: null (all routes go to _unmapped).
|
||||
* Override in tests via setRegistryResolver() to inject a stub.
|
||||
* In production, this module loads model-registry.js itself through the
|
||||
* fire-and-forget dynamic import() further down and stores its canonicalIdFor here.
|
||||
*/
|
||||
let _canonicalIdForFn = null;
|
||||
|
||||
/**
 * Map a route key ("provider/wire-id", or a bare legacy model id) to a
 * canonical model id using the currently injected registry resolver.
 *
 * Never throws. Returns null — which callers treat as "file this route under
 * _unmapped" — when:
 *   - no resolver function is installed,
 *   - the resolver throws, or
 *   - the resolver returns anything other than a non-empty string.
 *
 * @param {string} routeKey - route key to resolve
 * @returns {string | null} canonical id, or null when the route is not mappable
 */
function tryCanonicalIdFor(routeKey) {
  if (typeof _canonicalIdForFn !== "function") return null;
  try {
    const resolved = _canonicalIdForFn(routeKey);
    // Normalise undefined / "" / non-string results to null: recordOutcome
    // tests `=== null`, so a leaked `undefined` would otherwise be used as a
    // bucket key.
    return typeof resolved === "string" && resolved.length > 0 ? resolved : null;
  } catch {
    return null;
  }
}
|
||||
|
||||
/**
 * Install (or clear) the function used to resolve route keys to canonical ids.
 *
 * Anything that is not a function — including null and undefined — clears the
 * resolver, after which every route is treated as unmapped. The lazy registry
 * import in this module only installs its own resolver while the slot is
 * still null, so a resolver set here before that import settles is kept.
 *
 * In tests, call this before constructing ModelPerformanceTracker:
 *   setRegistryResolver((rk) => rk === "kimi-coding/kimi-k2.6" ? "kimi-k2.6" : null)
 *
 * @param {((routeKey: string) => string | null) | null | undefined} fn
 */
export function setRegistryResolver(fn) {
  // Store only callables so a stray value cannot leave the slot in a state
  // where every lookup has to go through the throw/catch path.
  _canonicalIdForFn = typeof fn === "function" ? fn : null;
}
|
||||
|
||||
// Best-effort, fire-and-forget wiring of the model registry. Loading it with a
// dynamic import keeps model-learner.js importable without
// @singularity-forge/ai (e.g. in tests that don't load the full AI package).
// Until the import settles, the resolver slot is still null, so outcomes
// recorded in that window are filed under _unmapped.
// NOTE(review): nothing visible in this file re-resolves those _unmapped
// entries afterwards — confirm before relying on that.
void (async () => {
  try {
    const registry = await import("./model-registry.js");
    const resolver = registry?.canonicalIdFor;
    // Never clobber a resolver that was injected explicitly in the meantime.
    if (_canonicalIdForFn === null && typeof resolver === "function") {
      _canonicalIdForFn = resolver;
    }
  } catch {
    // Registry unavailable (tests, stripped builds, etc.) — routes go to _unmapped.
  }
})();
|
||||
|
||||
/**
 * Decide whether a unit-type blob from the performance file still uses the
 * OLD flat format ({ "provider/wire-id": { successes, failures, ... } })
 * rather than the NEW canonical format
 * ({ "canonical-id": { aggregate, by_route }, "_unmapped": { by_route } }).
 *
 * Detection rule: the blob is new-format as soon as ANY value carries an
 * `aggregate` or a `by_route` key. Checking `by_route` matters because a blob
 * that holds only the `_unmapped` bucket has no `aggregate` anywhere; treating
 * it as old-format would send it through migration again on every load.
 *
 * Non-objects and empty blobs return false: there is nothing to migrate.
 *
 * @param {unknown} unitTypeBlob - value stored under one unit type
 * @returns {boolean} true only when the blob needs migrating
 */
function isOldFormat(unitTypeBlob) {
  if (!unitTypeBlob || typeof unitTypeBlob !== "object") return false;

  const entries = Object.values(unitTypeBlob);
  if (entries.length === 0) return false;

  const hasNewFormatEntry = entries.some(
    (entry) =>
      entry !== null &&
      typeof entry === "object" &&
      ("aggregate" in entry || "by_route" in entry),
  );
  return !hasNewFormatEntry;
}
|
||||
|
||||
/**
 * Migrate one unit-type blob from the old flat, route-keyed format into the
 * canonical format and return the new blob (the input is not mutated).
 *
 * - A flat route entry is resolved through tryCanonicalIdFor(); it is stored
 *   in `<canonical-id>.by_route[routeKey]`, or in `_unmapped.by_route[routeKey]`
 *   when the route is not mappable. Missing counters default to 0 and a
 *   missing lastUsed defaults to the migration time.
 * - An entry that already has a `by_route` map (for example an existing
 *   `_unmapped` bucket) is carried over route-by-route instead of being
 *   treated as a route itself, so its statistics are not replaced by zeros.
 * - Every canonical bucket gets its `aggregate` recomputed once at the end.
 *   `_unmapped` never gets an aggregate.
 *
 * @param {Record<string, object>} oldBlob - unit-type blob to migrate
 * @returns {Record<string, object>} blob in canonical format
 */
function migrateUnitTypeBlob(oldBlob) {
  const fallbackTimestamp = new Date().toISOString();
  const newBlob = {};

  // Create-on-demand accessor for a bucket's by_route map. Canonical buckets
  // reserve the `aggregate` slot first so serialized key order stays
  // aggregate → by_route.
  const routesOf = (bucket) => {
    if (!newBlob[bucket]) {
      newBlob[bucket] =
        bucket === "_unmapped" ? { by_route: {} } : { aggregate: null, by_route: {} };
    }
    return newBlob[bucket].by_route;
  };

  const toRouteEntry = (stats) => ({
    successes: stats.successes ?? 0,
    failures: stats.failures ?? 0,
    timeouts: stats.timeouts ?? 0,
    totalTokens: stats.totalTokens ?? 0,
    totalCost: stats.totalCost ?? 0,
    lastUsed: stats.lastUsed ?? fallbackTimestamp,
  });

  for (const [key, stats] of Object.entries(oldBlob)) {
    if (!stats || typeof stats !== "object") continue;

    // Already bucket-shaped: keep its routes as they are rather than
    // flattening the whole bucket into a single all-zero route entry.
    if (stats.by_route && typeof stats.by_route === "object") {
      const target = routesOf(key);
      for (const [routeKey, routeStats] of Object.entries(stats.by_route)) {
        if (routeStats && typeof routeStats === "object") {
          target[routeKey] = { ...routeStats };
        }
      }
      continue;
    }

    const bucket = tryCanonicalIdFor(key) ?? "_unmapped";
    routesOf(bucket)[key] = toRouteEntry(stats);
  }

  // One aggregate pass per canonical bucket, after all routes are in place.
  for (const [bucket, entry] of Object.entries(newBlob)) {
    if (bucket !== "_unmapped") recomputeAggregate(entry);
  }

  return newBlob;
}
|
||||
|
||||
/**
 * Rebuild `canonicalEntry.aggregate` from scratch as the sum of every entry
 * in `canonicalEntry.by_route`, maintaining the invariant
 *   aggregate.successes === sum(by_route[*].successes)
 * (and likewise for failures, timeouts, totalTokens and totalCost).
 *
 * `aggregate.lastUsed` is the greatest `lastUsed` string among the routes, or
 * "" when no route has one. Missing counters count as 0; a missing `by_route`
 * map or a null route entry is ignored rather than throwing.
 *
 * @param {{ by_route?: Record<string, object>, aggregate?: object }} canonicalEntry
 *   entry to update in place
 * @returns {void}
 */
function recomputeAggregate(canonicalEntry) {
  const totals = {
    successes: 0,
    failures: 0,
    timeouts: 0,
    totalTokens: 0,
    totalCost: 0,
    lastUsed: "",
  };

  for (const route of Object.values(canonicalEntry.by_route ?? {})) {
    if (!route || typeof route !== "object") continue;

    totals.successes += route.successes ?? 0;
    totals.failures += route.failures ?? 0;
    totals.timeouts += route.timeouts ?? 0;
    totals.totalTokens += route.totalTokens ?? 0;
    totals.totalCost += route.totalCost ?? 0;

    // Plain string comparison; assumes lastUsed values are ISO-8601 strings in
    // one uniform format — TODO confirm against the writers. Only real strings
    // are considered so the aggregate can never end up as `undefined`.
    if (typeof route.lastUsed === "string" && route.lastUsed > totals.lastUsed) {
      totals.lastUsed = route.lastUsed;
    }
  }

  canonicalEntry.aggregate = totals;
}
|
||||
|
||||
/**
 * Build a fresh stats record with every counter at zero.
 *
 * @param {string} timestamp - value stored as `lastUsed`
 * @returns {{successes: number, failures: number, timeouts: number,
 *   totalTokens: number, totalCost: number, lastUsed: string}} new object on
 *   every call, with keys in the order listed
 */
function emptyRouteStats(timestamp) {
  const counterNames = ["successes", "failures", "timeouts", "totalTokens", "totalCost"];
  const zeroedCounters = Object.fromEntries(counterNames.map((name) => [name, 0]));
  return { ...zeroedCounters, lastUsed: timestamp };
}
|
||||
|
||||
/**
|
||||
* Per-task-type model performance tracker.
|
||||
*
|
||||
* Schema:
|
||||
* New schema (v2 — canonical-keyed):
|
||||
* {
|
||||
* "execute-task": {
|
||||
* "gpt-4o": {
|
||||
* "successes": 42,
|
||||
* "failures": 3,
|
||||
* "timeouts": 1,
|
||||
* "totalTokens": 1500000,
|
||||
* "totalCost": 45.50,
|
||||
* "lastUsed": "2026-05-06T16:30:00Z",
|
||||
* "successRate": 0.93
|
||||
* "<unit-type>": {
|
||||
* "<canonical-id>": {
|
||||
* "aggregate": { successes, failures, timeouts, totalTokens, totalCost, lastUsed },
|
||||
* "by_route": { "<provider/wire-model>": { successes, failures, ... } }
|
||||
* },
|
||||
* "claude-opus": {
|
||||
* ...
|
||||
* "_unmapped": {
|
||||
* "by_route": { "<route>": { ... } }
|
||||
* }
|
||||
* },
|
||||
* "plan-slice": { ... }
|
||||
* }
|
||||
* }
|
||||
*
|
||||
* Old schema (v1 — fused route key as top-level key) is migrated on boot.
|
||||
*/
|
||||
class ModelPerformanceTracker {
|
||||
constructor(basePath) {
|
||||
|
|
@ -61,12 +214,66 @@ class ModelPerformanceTracker {
|
|||
}
|
||||
try {
|
||||
const content = readFileSync(this.storagePath, "utf-8");
|
||||
return JSON.parse(content);
|
||||
const parsed = JSON.parse(content);
|
||||
return this._migrateIfNeeded(parsed);
|
||||
} catch {
|
||||
return {};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect and migrate old-format data in-place. Writes backup + new file on
|
||||
* migration. Idempotent: if backup already exists, skip.
|
||||
*/
|
||||
_migrateIfNeeded(parsed) {
|
||||
// Check if any unit-type blob is still in old format
|
||||
let needsMigration = false;
|
||||
for (const unitTypeBlob of Object.values(parsed)) {
|
||||
if (typeof unitTypeBlob === "object" && unitTypeBlob !== null && isOldFormat(unitTypeBlob)) {
|
||||
needsMigration = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!needsMigration) return parsed;
|
||||
|
||||
// Write backup (idempotent — only if backup doesn't already exist)
|
||||
const backupPath = this.storagePath + ".pre-canonical-backup";
|
||||
if (!existsSync(backupPath)) {
|
||||
try {
|
||||
writeFileSync(backupPath, JSON.stringify(parsed, null, 2), "utf-8");
|
||||
} catch {
|
||||
// Non-fatal: backup failure should not block migration
|
||||
}
|
||||
}
|
||||
|
||||
// Migrate each unit type
|
||||
const migrated = {};
|
||||
for (const [unitType, unitTypeBlob] of Object.entries(parsed)) {
|
||||
if (typeof unitTypeBlob !== "object" || unitTypeBlob === null) {
|
||||
migrated[unitType] = unitTypeBlob;
|
||||
continue;
|
||||
}
|
||||
if (isOldFormat(unitTypeBlob)) {
|
||||
migrated[unitType] = migrateUnitTypeBlob(unitTypeBlob);
|
||||
} else {
|
||||
migrated[unitType] = unitTypeBlob;
|
||||
}
|
||||
}
|
||||
|
||||
// Write migrated data back to disk
|
||||
try {
|
||||
const dir = dirname(this.storagePath);
|
||||
if (!existsSync(dir)) {
|
||||
mkdirSync(dir, { recursive: true });
|
||||
}
|
||||
writeFileSync(this.storagePath, JSON.stringify(migrated, null, 2), "utf-8");
|
||||
} catch {
|
||||
// Non-fatal
|
||||
}
|
||||
|
||||
return migrated;
|
||||
}
|
||||
|
||||
_save() {
|
||||
if (!this.storagePath) {
|
||||
return;
|
||||
|
|
@ -87,11 +294,15 @@ class ModelPerformanceTracker {
|
|||
}
|
||||
|
||||
/**
|
||||
* Record outcome for a model on a specific task type.
|
||||
* Record outcome for a route key on a specific task type.
|
||||
*
|
||||
* @param taskType - e.g. "execute-task"
|
||||
* @param routeKey - format: "provider/wire-model" (e.g. "kimi-coding/kimi-k2.6")
|
||||
* OR a bare model id for backward-compat (no slash = treated as routeKey)
|
||||
*/
|
||||
recordOutcome(
|
||||
taskType,
|
||||
modelId,
|
||||
routeKey,
|
||||
outcomeOrSuccess,
|
||||
timeoutArg = false,
|
||||
tokensUsedArg = 0,
|
||||
|
|
@ -117,19 +328,46 @@ class ModelPerformanceTracker {
|
|||
if (!this.data[taskType]) {
|
||||
this.data[taskType] = {};
|
||||
}
|
||||
if (!this.data[taskType][modelId]) {
|
||||
this.data[taskType][modelId] = {
|
||||
successes: 0,
|
||||
failures: 0,
|
||||
timeouts: 0,
|
||||
totalTokens: 0,
|
||||
totalCost: 0,
|
||||
lastUsed: timestamp,
|
||||
successRate: 0,
|
||||
};
|
||||
|
||||
// Resolve canonical id. Routes with no slash are legacy bare model ids
|
||||
// — treat them as their own route key, try registry first.
|
||||
const canonicalId = tryCanonicalIdFor(routeKey);
|
||||
|
||||
if (canonicalId === null) {
|
||||
// Route not in registry → write to _unmapped
|
||||
if (!this.data[taskType]["_unmapped"]) {
|
||||
this.data[taskType]["_unmapped"] = { by_route: {} };
|
||||
}
|
||||
const unmapped = this.data[taskType]["_unmapped"];
|
||||
if (!unmapped.by_route[routeKey]) {
|
||||
unmapped.by_route[routeKey] = emptyRouteStats(timestamp);
|
||||
}
|
||||
const rs = unmapped.by_route[routeKey];
|
||||
this._applyOutcomeToStats(rs, success, timeout, tokensUsed, costUsd, timestamp);
|
||||
} else {
|
||||
// Known route → write to by_route + recompute aggregate
|
||||
if (!this.data[taskType][canonicalId]) {
|
||||
this.data[taskType][canonicalId] = {
|
||||
aggregate: emptyRouteStats(timestamp),
|
||||
by_route: {},
|
||||
};
|
||||
}
|
||||
const canonicalEntry = this.data[taskType][canonicalId];
|
||||
if (!canonicalEntry.by_route[routeKey]) {
|
||||
canonicalEntry.by_route[routeKey] = emptyRouteStats(timestamp);
|
||||
}
|
||||
const rs = canonicalEntry.by_route[routeKey];
|
||||
this._applyOutcomeToStats(rs, success, timeout, tokensUsed, costUsd, timestamp);
|
||||
recomputeAggregate(canonicalEntry);
|
||||
}
|
||||
|
||||
const stats = this.data[taskType][modelId];
|
||||
this._save();
|
||||
}
|
||||
|
||||
/**
|
||||
* Apply a single outcome event to a stats object in-place.
|
||||
*/
|
||||
_applyOutcomeToStats(stats, success, timeout, tokensUsed, costUsd, timestamp) {
|
||||
if (success) {
|
||||
stats.successes += 1;
|
||||
} else if (timeout) {
|
||||
|
|
@ -138,50 +376,144 @@ class ModelPerformanceTracker {
|
|||
} else {
|
||||
stats.failures += 1;
|
||||
}
|
||||
|
||||
stats.totalTokens += tokensUsed;
|
||||
stats.totalCost += costUsd;
|
||||
stats.lastUsed = timestamp;
|
||||
|
||||
const total = stats.successes + stats.failures;
|
||||
stats.total = total;
|
||||
stats.successRate = total > 0 ? stats.successes / total : 0;
|
||||
|
||||
this._save();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get performance stats for a task type and model.
|
||||
*
|
||||
* When routeMode=false (default): looks up aggregate stats for a canonical id.
|
||||
* When routeMode=true: looks up by_route stats for a specific routeKey.
|
||||
*
|
||||
* Backward-compat fallback: if the id is not found as a canonical, also checks
|
||||
* _unmapped.by_route and all by_route maps — supports bare model ids used in
|
||||
* tests and legacy callers that don't have the registry wired.
|
||||
*
|
||||
* @param taskType - e.g. "execute-task"
|
||||
* @param canonicalOrRouteKey - canonical id (aggregate) or routeKey (by-route)
|
||||
* @param routeMode - when true, returns by_route stats
|
||||
*/
|
||||
getStats(taskType, modelId) {
|
||||
return this.data[taskType]?.[modelId] || null;
|
||||
getStats(taskType, canonicalOrRouteKey, routeMode = false) {
|
||||
const unitBlob = this.data[taskType];
|
||||
if (!unitBlob) return null;
|
||||
|
||||
if (routeMode) {
|
||||
// Explicit by-route lookup: scan all canonical entries and _unmapped
|
||||
return this.getRouteStats(taskType, canonicalOrRouteKey);
|
||||
}
|
||||
|
||||
// Aggregate mode: look up by canonical id first
|
||||
const entry = unitBlob[canonicalOrRouteKey];
|
||||
if (entry?.aggregate) {
|
||||
const agg = entry.aggregate;
|
||||
const total = agg.successes + agg.failures;
|
||||
return {
|
||||
...agg,
|
||||
total,
|
||||
successRate: total > 0 ? agg.successes / total : 0,
|
||||
};
|
||||
}
|
||||
|
||||
// Backward-compat fallback: look in by_route maps (for bare IDs and unmapped routes)
|
||||
// This supports old tests that use bare model IDs without a registry resolver.
|
||||
for (const [key, val] of Object.entries(unitBlob)) {
|
||||
if (key === "_unmapped") {
|
||||
if (val?.by_route?.[canonicalOrRouteKey]) {
|
||||
const rs = val.by_route[canonicalOrRouteKey];
|
||||
const total = rs.successes + rs.failures;
|
||||
return { ...rs, total, successRate: total > 0 ? rs.successes / total : 0 };
|
||||
}
|
||||
} else if (val?.by_route?.[canonicalOrRouteKey]) {
|
||||
const rs = val.by_route[canonicalOrRouteKey];
|
||||
const total = rs.successes + rs.failures;
|
||||
return { ...rs, total, successRate: total > 0 ? rs.successes / total : 0 };
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get stats for a specific route (by_route lookup across all canonical entries).
|
||||
*/
|
||||
getRouteStats(taskType, routeKey) {
|
||||
const unitBlob = this.data[taskType];
|
||||
if (!unitBlob) return null;
|
||||
for (const [key, val] of Object.entries(unitBlob)) {
|
||||
if (key === "_unmapped") {
|
||||
if (val?.by_route?.[routeKey]) return val.by_route[routeKey];
|
||||
} else if (val?.by_route?.[routeKey]) {
|
||||
return val.by_route[routeKey];
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all models for a task type, ranked by success rate.
|
||||
*
|
||||
* Primary mode: iterates canonical ids using aggregate stats.
|
||||
* Backward-compat fallback: if no canonical entries exist (no registry
|
||||
* wired), iterates _unmapped.by_route entries instead so legacy tests
|
||||
* that use bare model IDs still work.
|
||||
*/
|
||||
getRankedModels(taskType, minSamples = 1) {
|
||||
if (!this.data[taskType]) return [];
|
||||
|
||||
const models = Object.entries(this.data[taskType])
|
||||
.filter(([, stats]) => stats.successes + stats.failures >= minSamples)
|
||||
.map(([modelId, stats]) => ({
|
||||
modelId,
|
||||
successRate: stats.successRate,
|
||||
attempts: stats.successes + stats.failures,
|
||||
tokens: stats.totalTokens,
|
||||
cost: stats.totalCost,
|
||||
latestAttempt: stats.lastUsed,
|
||||
}))
|
||||
.sort((a, b) => b.successRate - a.successRate);
|
||||
const models = [];
|
||||
let hasCanonical = false;
|
||||
|
||||
return models;
|
||||
for (const [key, entry] of Object.entries(this.data[taskType])) {
|
||||
if (key === "_unmapped") continue;
|
||||
// New format: entry has aggregate + by_route
|
||||
const agg = entry?.aggregate;
|
||||
if (!agg) continue;
|
||||
hasCanonical = true;
|
||||
const total = agg.successes + agg.failures;
|
||||
if (total < minSamples) continue;
|
||||
const successRate = total > 0 ? agg.successes / total : 0;
|
||||
models.push({
|
||||
modelId: key, // canonical id
|
||||
successRate,
|
||||
attempts: total,
|
||||
tokens: agg.totalTokens ?? 0,
|
||||
cost: agg.totalCost ?? 0,
|
||||
latestAttempt: agg.lastUsed,
|
||||
});
|
||||
}
|
||||
|
||||
// Backward-compat: when no canonical entries exist (registry not wired),
|
||||
// fall back to _unmapped.by_route so bare-ID tests still get rankings.
|
||||
if (!hasCanonical) {
|
||||
const unmapped = this.data[taskType]["_unmapped"];
|
||||
if (unmapped?.by_route) {
|
||||
for (const [routeKey, rs] of Object.entries(unmapped.by_route)) {
|
||||
if (!rs) continue;
|
||||
const total = (rs.successes ?? 0) + (rs.failures ?? 0);
|
||||
if (total < minSamples) continue;
|
||||
const successRate = total > 0 ? rs.successes / total : 0;
|
||||
models.push({
|
||||
modelId: routeKey,
|
||||
successRate,
|
||||
attempts: total,
|
||||
tokens: rs.totalTokens ?? 0,
|
||||
cost: rs.totalCost ?? 0,
|
||||
latestAttempt: rs.lastUsed,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return models.sort((a, b) => b.successRate - a.successRate);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a model should be demoted (fails >50% on this task type).
|
||||
* Accepts a canonical id (aggregate demotion) or routeKey (route-level).
|
||||
*/
|
||||
shouldDemote(taskType, modelId, thresholdFailureRate = 0.5) {
|
||||
// Try aggregate lookup first (canonical id)
|
||||
const stats = this.getStats(taskType, modelId);
|
||||
if (!stats) return false;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,34 @@
|
|||
import { resolveModelId } from "./auto-model-selection.js";
|
||||
import { canonicalIdFor, sameGeneration } from "./model-registry.js";
|
||||
import { logWarning } from "./workflow-logger.js";
|
||||
|
||||
// ── Solver pinning (ADR-0079) ────────────────────────────────────────────────
|
||||
// The autonomous solver pass is always locked to kimi-k2.6 (provider:
|
||||
// kimi-coding) and must never cross canonical_id boundaries on failover.
|
||||
// The unit type string "autonomous-solver" is the identifier introduced by
|
||||
// ADR-0079 for the solver role. Other unit types run as executor and may
|
||||
// cross canonical ids (with a structured downgrade log event).
|
||||
const SOLVER_PINNED_UNIT_TYPE = "autonomous-solver";
|
||||
|
||||
/**
 * Report a failover that crossed a canonical-id or generation boundary.
 *
 * Emits one logWarning() under component "model-route-failure" with event
 * "generation-downgrade" and a payload describing the switch. The payload
 * always carries `sameGeneration: false`.
 *
 * @param {string} fromCanonical - canonical id of the route that failed
 * @param {string} toCanonical - canonical id of the chosen failover route
 * @param {string} unitType - active unit type at failover time
 * @param {string} [reason] - human-readable reason label; defaults to
 *   "cross-generation failover" when null or undefined
 */
export function logGenerationDowngrade(fromCanonical, toCanonical, unitType, reason) {
  const payload = {
    from: fromCanonical,
    to: toCanonical,
    unitType,
    reason: reason ?? "cross-generation failover",
    sameGeneration: false,
  };
  logWarning("model-route-failure", "generation-downgrade", payload);
}
|
||||
/**
|
||||
* Build the stable identity key for a concrete provider route.
|
||||
*
|
||||
|
|
@ -80,9 +110,24 @@ export function resolveNextConfiguredModelRoute(args) {
|
|||
*
|
||||
* Consumer: bootstrap/agent-end-recovery.ts after configured fallback lookup
|
||||
* fails for a model-route failure.
|
||||
*
|
||||
* Generation guard (ADR-0079):
|
||||
* - If unitType is "autonomous-solver" (solver-pinned), candidates whose
|
||||
* canonical_id differs from the failed route are silently skipped. The
|
||||
* solver layer is a runtime invariant and must never silently degrade to a
|
||||
* different model generation.
|
||||
* - For all other unit types (executor layer), cross-canonical failover is
|
||||
* permitted but emits a structured generation-downgrade log event so it is
|
||||
* visible in traces and drainAndSummarize() audits.
|
||||
*/
|
||||
export function resolveNextAvailableModelRoute(args) {
|
||||
const currentKey = args.current ? modelRouteKey(args.current) : undefined;
|
||||
const currentRouteKey = args.current
|
||||
? `${args.current.provider}/${args.current.id}`
|
||||
: undefined;
|
||||
const currentCanonical = currentRouteKey ? canonicalIdFor(currentRouteKey) : null;
|
||||
const isSolverPinned = args.unitType === SOLVER_PINNED_UNIT_TYPE;
|
||||
|
||||
const failedKeys = new Set(
|
||||
args.failedRoutes.map((failure) =>
|
||||
modelRouteKey({ provider: failure.provider, id: failure.modelId }),
|
||||
|
|
@ -93,6 +138,14 @@ export function resolveNextAvailableModelRoute(args) {
|
|||
if (key === currentKey) return false;
|
||||
if (failedKeys.has(key)) return false;
|
||||
if (args.isBlocked?.(model)) return false;
|
||||
|
||||
// Solver pin: ADR-0079 — never cross canonical_id boundary when solving.
|
||||
if (isSolverPinned && currentCanonical !== null) {
|
||||
const candidateRouteKey = `${model.provider}/${model.id}`;
|
||||
const candidateCanonical = canonicalIdFor(candidateRouteKey);
|
||||
if (candidateCanonical !== currentCanonical) return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
});
|
||||
if (candidates.length === 0) return undefined;
|
||||
|
|
@ -103,6 +156,25 @@ export function resolveNextAvailableModelRoute(args) {
|
|||
model.provider.toLowerCase() !== args.current.provider.toLowerCase(),
|
||||
);
|
||||
const model = differentProvider ?? candidates[0];
|
||||
|
||||
// Generation guard: log a structured event when crossing canonical_id or
|
||||
// generation boundaries on the executor layer (non-solver-pinned).
|
||||
if (!isSolverPinned && currentCanonical !== null) {
|
||||
const chosenRouteKey = `${model.provider}/${model.id}`;
|
||||
const chosenCanonical = canonicalIdFor(chosenRouteKey);
|
||||
if (
|
||||
chosenCanonical !== null &&
|
||||
!sameGeneration(currentCanonical, chosenCanonical)
|
||||
) {
|
||||
logGenerationDowngrade(
|
||||
currentCanonical,
|
||||
chosenCanonical,
|
||||
args.unitType ?? "unknown",
|
||||
"no same-generation route available",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
model,
|
||||
route: `${model.provider}/${model.id}`,
|
||||
|
|
@ -134,5 +206,6 @@ export function resolveNextModelRoute(args) {
|
|||
availableModels: args.availableModels,
|
||||
failedRoutes: args.failedRoutes,
|
||||
isBlocked: args.isBlocked,
|
||||
unitType: args.unitType,
|
||||
});
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,563 @@
|
|||
/**
|
||||
* Swarm C — canonical-keyed model performance metrics tests.
|
||||
*
|
||||
* Tests:
|
||||
* 1. Migration round-trip: old-format file → boot loader → new file + backup exists.
|
||||
* 2. Aggregate invariant: aggregate.successes === sum(by_route[*].successes).
|
||||
* 3. _unmapped bucket: unknown route key lands in _unmapped, not dropped.
|
||||
* 4. Reading: sensible defaults (null) for a never-seen canonical id.
|
||||
* 5. Migration idempotency: running migration twice does not corrupt data.
|
||||
* 6. Two routes same canonical: aggregate sums correctly.
|
||||
*/
|
||||
|
||||
import { existsSync, mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { afterEach, beforeEach, describe, expect, test } from "vitest";
|
||||
import {
|
||||
ModelLearner,
|
||||
ModelPerformanceTracker,
|
||||
setRegistryResolver,
|
||||
} from "../model-learner.js";
|
||||
|
||||
// ── Stub registry ──────────────────────────────────────────────────────────────
// Lightweight stand-in for the model registry so these tests never load
// @singularity-forge/ai. Two routes share one canonical id; anything not in
// the table resolves to null and therefore lands in _unmapped.
//
// Default route table (entries passed in `table` override or extend it):
//   "kimi-coding/kimi-k2.6"            → "kimi-k2.6"
//   "openrouter/moonshotai/kimi-k2.6"  → "kimi-k2.6"
//   "anthropic/claude-sonnet-4-6"      → "claude-sonnet-4-6"
//   everything else                    → null (→ _unmapped)
function makeStubResolver(table: Record<string, string | null> = {}) {
  const routes: Record<string, string | null> = {
    "kimi-coding/kimi-k2.6": "kimi-k2.6",
    "openrouter/moonshotai/kimi-k2.6": "kimi-k2.6",
    "anthropic/claude-sonnet-4-6": "claude-sonnet-4-6",
    ...table,
  };
  return function resolveRoute(routeKey: string): string | null {
    const canonicalId = routes[routeKey];
    return canonicalId === undefined || canonicalId === null ? null : canonicalId;
  };
}
|
||||
|
||||
// ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Path of the `.sf` state directory under `base`. */
function sfDir(base: string) {
  const stateDir = join(base, ".sf");
  return stateDir;
}
|
||||
|
||||
/** Path of the performance file (`.sf/model-performance.json`) under `base`. */
function perfFile(base: string) {
  const segments = [".sf", "model-performance.json"];
  return join(base, ...segments);
}
|
||||
|
||||
/** Path of the pre-migration backup written next to the performance file. */
function backupFile(base: string) {
  const segments = [".sf", "model-performance.json.pre-canonical-backup"];
  return join(base, ...segments);
}
|
||||
|
||||
/** Read and JSON-parse the performance file under `base`. */
function readPerf(base: string) {
  const rawJson = readFileSync(perfFile(base), "utf-8");
  return JSON.parse(rawJson);
}
|
||||
|
||||
/**
 * Seed the performance file under `base` with `data` (pretty-printed JSON),
 * creating the `.sf` directory first if needed.
 */
function writeOldPerf(base: string, data: object) {
  mkdirSync(sfDir(base), { recursive: true });
  const serialized = JSON.stringify(data, null, 2);
  writeFileSync(perfFile(base), serialized, "utf-8");
}
|
||||
|
||||
// ── Test suite ────────────────────────────────────────────────────────────────
|
||||
|
||||
describe("model-learner canonical schema (Swarm C)", () => {
|
||||
let tmpDir: string;
|
||||
|
||||
beforeEach(() => {
|
||||
tmpDir = join(tmpdir(), `test-ml-canonical-${Date.now()}-${Math.random().toString(36).slice(2)}`);
|
||||
mkdirSync(tmpDir, { recursive: true });
|
||||
// Wire stub resolver before each test
|
||||
setRegistryResolver(makeStubResolver());
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
rmSync(tmpDir, { recursive: true, force: true });
|
||||
// Reset resolver to null so other test suites are unaffected
|
||||
setRegistryResolver(null as unknown as (rk: string) => string | null);
|
||||
});
|
||||
|
||||
// ── Test 1: Migration round-trip ────────────────────────────────────────
|
||||
|
||||
describe("migration round-trip", () => {
|
||||
test("migrates old flat format to canonical schema on load", () => {
|
||||
// Write old-format file
|
||||
writeOldPerf(tmpDir, {
|
||||
"execute-task": {
|
||||
"kimi-coding/kimi-k2.6": {
|
||||
successes: 5,
|
||||
failures: 1,
|
||||
timeouts: 0,
|
||||
totalTokens: 10000,
|
||||
totalCost: 0.5,
|
||||
lastUsed: "2026-05-01T12:00:00Z",
|
||||
successRate: 0.833,
|
||||
},
|
||||
"anthropic/claude-sonnet-4-6": {
|
||||
successes: 3,
|
||||
failures: 0,
|
||||
timeouts: 0,
|
||||
totalTokens: 6000,
|
||||
totalCost: 0.3,
|
||||
lastUsed: "2026-05-02T12:00:00Z",
|
||||
successRate: 1.0,
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
// Boot tracker — migration happens on _load()
|
||||
const tracker = new ModelPerformanceTracker(tmpDir);
|
||||
// Migration triggers on disk read
|
||||
|
||||
// Verify backup was created
|
||||
expect(existsSync(backupFile(tmpDir))).toBe(true);
|
||||
|
||||
// Verify new file has canonical schema
|
||||
const data = readPerf(tmpDir);
|
||||
const execBlob = data["execute-task"];
|
||||
|
||||
// kimi-coding/kimi-k2.6 → canonical "kimi-k2.6"
|
||||
expect(execBlob["kimi-k2.6"]).toBeDefined();
|
||||
expect(execBlob["kimi-k2.6"].aggregate).toBeDefined();
|
||||
expect(execBlob["kimi-k2.6"].by_route).toBeDefined();
|
||||
expect(execBlob["kimi-k2.6"].by_route["kimi-coding/kimi-k2.6"]).toBeDefined();
|
||||
expect(execBlob["kimi-k2.6"].aggregate.successes).toBe(5);
|
||||
expect(execBlob["kimi-k2.6"].aggregate.failures).toBe(1);
|
||||
|
||||
// anthropic/claude-sonnet-4-6 → canonical "claude-sonnet-4-6"
|
||||
expect(execBlob["claude-sonnet-4-6"]).toBeDefined();
|
||||
expect(execBlob["claude-sonnet-4-6"].aggregate.successes).toBe(3);
|
||||
|
||||
// Verify tracker in-memory state is also migrated
|
||||
const stats = tracker.getStats("execute-task", "kimi-k2.6");
|
||||
expect(stats).not.toBeNull();
|
||||
expect(stats!.successes).toBe(5);
|
||||
});
|
||||
|
||||
test("by_route entries are preserved after migration", () => {
|
||||
writeOldPerf(tmpDir, {
|
||||
"execute-task": {
|
||||
"kimi-coding/kimi-k2.6": {
|
||||
successes: 10,
|
||||
failures: 2,
|
||||
timeouts: 1,
|
||||
totalTokens: 50000,
|
||||
totalCost: 2.5,
|
||||
lastUsed: "2026-05-10T00:00:00Z",
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
new ModelPerformanceTracker(tmpDir); // triggers migration
|
||||
|
||||
const data = readPerf(tmpDir);
|
||||
const routeEntry = data["execute-task"]["kimi-k2.6"].by_route["kimi-coding/kimi-k2.6"];
|
||||
expect(routeEntry).toBeDefined();
|
||||
expect(routeEntry.successes).toBe(10);
|
||||
expect(routeEntry.failures).toBe(2);
|
||||
expect(routeEntry.timeouts).toBe(1);
|
||||
expect(routeEntry.totalTokens).toBe(50000);
|
||||
});
|
||||
|
||||
test("migration is idempotent — running twice produces identical result", () => {
|
||||
writeOldPerf(tmpDir, {
|
||||
"execute-task": {
|
||||
"kimi-coding/kimi-k2.6": {
|
||||
successes: 7,
|
||||
failures: 1,
|
||||
timeouts: 0,
|
||||
totalTokens: 20000,
|
||||
totalCost: 1.0,
|
||||
lastUsed: "2026-05-05T12:00:00Z",
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
new ModelPerformanceTracker(tmpDir); // first migration
|
||||
const dataAfterFirst = readPerf(tmpDir);
|
||||
|
||||
new ModelPerformanceTracker(tmpDir); // second load — should not re-migrate
|
||||
const dataAfterSecond = readPerf(tmpDir);
|
||||
|
||||
expect(dataAfterSecond).toEqual(dataAfterFirst);
|
||||
});
|
||||
|
||||
test("backup is written only once (idempotent)", () => {
|
||||
writeOldPerf(tmpDir, {
|
||||
"execute-task": {
|
||||
"kimi-coding/kimi-k2.6": {
|
||||
successes: 3,
|
||||
failures: 0,
|
||||
timeouts: 0,
|
||||
totalTokens: 5000,
|
||||
totalCost: 0.2,
|
||||
lastUsed: "2026-05-06T00:00:00Z",
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
new ModelPerformanceTracker(tmpDir); // writes backup
|
||||
const backupContent1 = readFileSync(backupFile(tmpDir), "utf-8");
|
||||
|
||||
// Overwrite the backup to detect if it gets re-written
|
||||
writeFileSync(backupFile(tmpDir), '{"sentinel":true}', "utf-8");
|
||||
|
||||
new ModelPerformanceTracker(tmpDir); // should NOT overwrite backup
|
||||
const backupContent2 = readFileSync(backupFile(tmpDir), "utf-8");
|
||||
|
||||
// If sentinel is still there, backup was not overwritten
|
||||
expect(backupContent2).toBe('{"sentinel":true}');
|
||||
});
|
||||
});
|
||||
|
||||
// ── Test 2: Aggregate invariant ─────────────────────────────────────────
|
||||
|
||||
describe("aggregate invariant", () => {
|
||||
test("aggregate.successes === sum(by_route[*].successes) after writes to two routes", () => {
|
||||
const tracker = new ModelPerformanceTracker(tmpDir);
|
||||
|
||||
// Route 1: kimi-coding/kimi-k2.6 → canonical kimi-k2.6
|
||||
tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", {
|
||||
success: true,
|
||||
timeout: false,
|
||||
tokensUsed: 1000,
|
||||
costUsd: 0.05,
|
||||
});
|
||||
tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", {
|
||||
success: true,
|
||||
timeout: false,
|
||||
tokensUsed: 1200,
|
||||
costUsd: 0.06,
|
||||
});
|
||||
tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", {
|
||||
success: false,
|
||||
timeout: false,
|
||||
tokensUsed: 800,
|
||||
costUsd: 0.04,
|
||||
});
|
||||
|
||||
// Route 2: openrouter/moonshotai/kimi-k2.6 → same canonical kimi-k2.6
|
||||
tracker.recordOutcome("execute-task", "openrouter/moonshotai/kimi-k2.6", {
|
||||
success: true,
|
||||
timeout: false,
|
||||
tokensUsed: 2000,
|
||||
costUsd: 0.1,
|
||||
});
|
||||
tracker.recordOutcome("execute-task", "openrouter/moonshotai/kimi-k2.6", {
|
||||
success: false,
|
||||
timeout: true,
|
||||
tokensUsed: 0,
|
||||
costUsd: 0,
|
||||
});
|
||||
|
||||
const data = readPerf(tmpDir);
|
||||
const canonicalEntry = data["execute-task"]["kimi-k2.6"];
|
||||
const agg = canonicalEntry.aggregate;
|
||||
const byRoute = canonicalEntry.by_route;
|
||||
|
||||
// Compute expected sums from by_route
|
||||
const routeSuccesses = Object.values(byRoute).reduce(
|
||||
(sum: number, r: any) => sum + (r.successes ?? 0),
|
||||
0,
|
||||
);
|
||||
const routeFailures = Object.values(byRoute).reduce(
|
||||
(sum: number, r: any) => sum + (r.failures ?? 0),
|
||||
0,
|
||||
);
|
||||
const routeTimeouts = Object.values(byRoute).reduce(
|
||||
(sum: number, r: any) => sum + (r.timeouts ?? 0),
|
||||
0,
|
||||
);
|
||||
|
||||
expect(agg.successes).toBe(routeSuccesses);
|
||||
expect(agg.failures).toBe(routeFailures);
|
||||
expect(agg.timeouts).toBe(routeTimeouts);
|
||||
|
||||
// Concrete values: 2 successes from route1, 1 success from route2 = 3 total
|
||||
expect(agg.successes).toBe(3);
|
||||
// Failures: 1 from route1 (non-timeout), 1 from route2 (timeout) = 2 total
|
||||
expect(agg.failures).toBe(2);
|
||||
// Timeouts: 1 from route2
|
||||
expect(agg.timeouts).toBe(1);
|
||||
});
|
||||
|
||||
test("aggregate is recalculated correctly after each write", () => {
|
||||
const tracker = new ModelPerformanceTracker(tmpDir);
|
||||
|
||||
tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", {
|
||||
success: true,
|
||||
timeout: false,
|
||||
tokensUsed: 100,
|
||||
costUsd: 0.01,
|
||||
});
|
||||
let data = readPerf(tmpDir);
|
||||
expect(data["execute-task"]["kimi-k2.6"].aggregate.successes).toBe(1);
|
||||
|
||||
tracker.recordOutcome("execute-task", "openrouter/moonshotai/kimi-k2.6", {
|
||||
success: true,
|
||||
timeout: false,
|
||||
tokensUsed: 200,
|
||||
costUsd: 0.02,
|
||||
});
|
||||
data = readPerf(tmpDir);
|
||||
expect(data["execute-task"]["kimi-k2.6"].aggregate.successes).toBe(2);
|
||||
|
||||
tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", {
|
||||
success: false,
|
||||
timeout: false,
|
||||
tokensUsed: 50,
|
||||
costUsd: 0.005,
|
||||
});
|
||||
data = readPerf(tmpDir);
|
||||
expect(data["execute-task"]["kimi-k2.6"].aggregate.successes).toBe(2);
|
||||
expect(data["execute-task"]["kimi-k2.6"].aggregate.failures).toBe(1);
|
||||
});
|
||||
});
|
||||
|
||||
// ── Test 3: _unmapped bucket ───────────────────────────────────────────
|
||||
|
||||
describe("_unmapped bucket", () => {
|
||||
test("unknown route key lands in _unmapped, not dropped", () => {
|
||||
const tracker = new ModelPerformanceTracker(tmpDir);
|
||||
|
||||
tracker.recordOutcome("execute-task", "foo-provider/bar-model", {
|
||||
success: true,
|
||||
timeout: false,
|
||||
tokensUsed: 500,
|
||||
costUsd: 0.02,
|
||||
});
|
||||
|
||||
const data = readPerf(tmpDir);
|
||||
const unmapped = data["execute-task"]["_unmapped"];
|
||||
expect(unmapped).toBeDefined();
|
||||
expect(unmapped.by_route["foo-provider/bar-model"]).toBeDefined();
|
||||
expect(unmapped.by_route["foo-provider/bar-model"].successes).toBe(1);
|
||||
});
|
||||
|
||||
test("_unmapped entry does NOT appear in getRankedModels when canonical entries exist", () => {
|
||||
const tracker = new ModelPerformanceTracker(tmpDir);
|
||||
|
||||
// Known route → canonical
|
||||
tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", {
|
||||
success: true,
|
||||
timeout: false,
|
||||
tokensUsed: 1000,
|
||||
costUsd: 0.05,
|
||||
});
|
||||
// Unknown route → _unmapped
|
||||
tracker.recordOutcome("execute-task", "foo-provider/bar-model", {
|
||||
success: true,
|
||||
timeout: false,
|
||||
tokensUsed: 500,
|
||||
costUsd: 0.02,
|
||||
});
|
||||
|
||||
const ranked = tracker.getRankedModels("execute-task", 0);
|
||||
const modelIds = ranked.map((r) => r.modelId);
|
||||
expect(modelIds).toContain("kimi-k2.6");
|
||||
expect(modelIds).not.toContain("_unmapped");
|
||||
expect(modelIds).not.toContain("foo-provider/bar-model");
|
||||
});
|
||||
|
||||
test("_unmapped preserves multiple unknown routes independently", () => {
|
||||
const tracker = new ModelPerformanceTracker(tmpDir);
|
||||
|
||||
tracker.recordOutcome("execute-task", "unknown-a/model-x", {
|
||||
success: true,
|
||||
timeout: false,
|
||||
tokensUsed: 100,
|
||||
costUsd: 0.01,
|
||||
});
|
||||
tracker.recordOutcome("execute-task", "unknown-b/model-y", {
|
||||
success: false,
|
||||
timeout: false,
|
||||
tokensUsed: 50,
|
||||
costUsd: 0.005,
|
||||
});
|
||||
|
||||
const data = readPerf(tmpDir);
|
||||
const unmapped = data["execute-task"]["_unmapped"];
|
||||
expect(unmapped.by_route["unknown-a/model-x"].successes).toBe(1);
|
||||
expect(unmapped.by_route["unknown-b/model-y"].failures).toBe(1);
|
||||
});
|
||||
});
|
||||
|
||||
// ── Test 4: Reading sensible defaults ──────────────────────────────────
|
||||
|
||||
describe("reading never-seen canonical ids", () => {
|
||||
test("getStats returns null for a never-seen canonical id", () => {
|
||||
const tracker = new ModelPerformanceTracker(tmpDir);
|
||||
expect(tracker.getStats("execute-task", "kimi-k2.6")).toBeNull();
|
||||
});
|
||||
|
||||
test("getStats returns null for a never-seen task type", () => {
|
||||
const tracker = new ModelPerformanceTracker(tmpDir);
|
||||
tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", {
|
||||
success: true,
|
||||
timeout: false,
|
||||
tokensUsed: 100,
|
||||
costUsd: 0.01,
|
||||
});
|
||||
expect(tracker.getStats("plan-slice", "kimi-k2.6")).toBeNull();
|
||||
});
|
||||
|
||||
test("getRouteStats returns null for a never-seen route", () => {
|
||||
const tracker = new ModelPerformanceTracker(tmpDir);
|
||||
expect(tracker.getRouteStats("execute-task", "kimi-coding/kimi-k2.6")).toBeNull();
|
||||
});
|
||||
|
||||
test("getRankedModels returns empty array for unknown task type", () => {
|
||||
const tracker = new ModelPerformanceTracker(tmpDir);
|
||||
expect(tracker.getRankedModels("nonexistent-type")).toEqual([]);
|
||||
});
|
||||
|
||||
test("shouldDemote returns false for a never-seen canonical id", () => {
|
||||
const tracker = new ModelPerformanceTracker(tmpDir);
|
||||
expect(tracker.shouldDemote("execute-task", "kimi-k2.6")).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
// ── Test 5: ModelLearner integration ───────────────────────────────────
|
||||
|
||||
describe("ModelLearner canonical integration", () => {
|
||||
test("recordOutcome + getRankedModels uses canonical ids", () => {
|
||||
const learner = new ModelLearner(tmpDir);
|
||||
|
||||
// Record 5 successes via route 1
|
||||
for (let i = 0; i < 5; i++) {
|
||||
learner.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", {
|
||||
success: true,
|
||||
timeout: false,
|
||||
tokensUsed: 1000,
|
||||
costUsd: 0.05,
|
||||
});
|
||||
}
|
||||
// Record 1 failure via route 2 (same canonical)
|
||||
learner.recordOutcome("execute-task", "openrouter/moonshotai/kimi-k2.6", {
|
||||
success: false,
|
||||
timeout: false,
|
||||
tokensUsed: 500,
|
||||
costUsd: 0.025,
|
||||
});
|
||||
|
||||
const ranked = learner.getRankedModels("execute-task");
|
||||
expect(ranked.length).toBeGreaterThan(0);
|
||||
// Canonical id should appear in ranked list
|
||||
const kimiEntry = ranked.find((r) => r.modelId === "kimi-k2.6");
|
||||
expect(kimiEntry).toBeDefined();
|
||||
expect(kimiEntry!.attempts).toBe(6); // 5 + 1
|
||||
// Success rate: 5/6
|
||||
expect(kimiEntry!.successRate).toBeCloseTo(5 / 6, 3);
|
||||
});
|
||||
|
||||
test("migration round-trip preserves by_route data (full lifecycle)", () => {
|
||||
// Step 1: write old-format file
|
||||
writeOldPerf(tmpDir, {
|
||||
"execute-task": {
|
||||
"kimi-coding/kimi-k2.6": {
|
||||
successes: 8,
|
||||
failures: 2,
|
||||
timeouts: 0,
|
||||
totalTokens: 40000,
|
||||
totalCost: 2.0,
|
||||
lastUsed: "2026-04-01T00:00:00Z",
|
||||
},
|
||||
"openrouter/moonshotai/kimi-k2.6": {
|
||||
successes: 3,
|
||||
failures: 1,
|
||||
timeouts: 0,
|
||||
totalTokens: 15000,
|
||||
totalCost: 0.75,
|
||||
lastUsed: "2026-04-02T00:00:00Z",
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
// Step 2: boot learner (triggers migration)
|
||||
const learner = new ModelLearner(tmpDir);
|
||||
|
||||
// Step 3: verify backup exists
|
||||
expect(existsSync(backupFile(tmpDir))).toBe(true);
|
||||
|
||||
// Step 4: verify new file structure
|
||||
const data = readPerf(tmpDir);
|
||||
const kimiEntry = data["execute-task"]["kimi-k2.6"];
|
||||
expect(kimiEntry).toBeDefined();
|
||||
expect(kimiEntry.aggregate.successes).toBe(11); // 8 + 3
|
||||
expect(kimiEntry.aggregate.failures).toBe(3); // 2 + 1
|
||||
expect(kimiEntry.by_route["kimi-coding/kimi-k2.6"].successes).toBe(8);
|
||||
expect(kimiEntry.by_route["openrouter/moonshotai/kimi-k2.6"].successes).toBe(3);
|
||||
|
||||
// Step 5: verify aggregate invariant
|
||||
const agg = kimiEntry.aggregate;
|
||||
const routeSum = Object.values(kimiEntry.by_route).reduce(
|
||||
(sum: number, r: any) => sum + (r.successes ?? 0),
|
||||
0,
|
||||
);
|
||||
expect(agg.successes).toBe(routeSum);
|
||||
|
||||
// Step 6: verify in-memory reads via getRankedModels
|
||||
const ranked = learner.getRankedModels("execute-task");
|
||||
const kimiRanked = ranked.find((r) => r.modelId === "kimi-k2.6");
|
||||
expect(kimiRanked).toBeDefined();
|
||||
expect(kimiRanked!.attempts).toBe(14); // 11 successes + 3 failures
|
||||
});
|
||||
|
||||
test("per-route health can be queried independently of aggregate", () => {
|
||||
const tracker = new ModelPerformanceTracker(tmpDir);
|
||||
|
||||
// Route 1: healthy
|
||||
for (let i = 0; i < 9; i++) {
|
||||
tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", {
|
||||
success: true,
|
||||
timeout: false,
|
||||
tokensUsed: 1000,
|
||||
costUsd: 0.05,
|
||||
});
|
||||
}
|
||||
tracker.recordOutcome("execute-task", "kimi-coding/kimi-k2.6", {
|
||||
success: false,
|
||||
timeout: false,
|
||||
tokensUsed: 1000,
|
||||
costUsd: 0.05,
|
||||
});
|
||||
|
||||
// Route 2: failing
|
||||
for (let i = 0; i < 3; i++) {
|
||||
tracker.recordOutcome("execute-task", "openrouter/moonshotai/kimi-k2.6", {
|
||||
success: false,
|
||||
timeout: false,
|
||||
tokensUsed: 500,
|
||||
costUsd: 0.025,
|
||||
});
|
||||
}
|
||||
tracker.recordOutcome("execute-task", "openrouter/moonshotai/kimi-k2.6", {
|
||||
success: true,
|
||||
timeout: false,
|
||||
tokensUsed: 500,
|
||||
costUsd: 0.025,
|
||||
});
|
||||
|
||||
// Aggregate: 10 successes, 4 failures = 71% success rate
|
||||
const agg = tracker.getStats("execute-task", "kimi-k2.6");
|
||||
expect(agg).not.toBeNull();
|
||||
expect(agg!.successes).toBe(10);
|
||||
expect(agg!.failures).toBe(4);
|
||||
|
||||
// Per-route: kimi-coding is healthy, openrouter is failing
|
||||
const route1 = tracker.getRouteStats("execute-task", "kimi-coding/kimi-k2.6");
|
||||
expect(route1).not.toBeNull();
|
||||
expect(route1!.successes).toBe(9);
|
||||
expect(route1!.failures).toBe(1);
|
||||
|
||||
const route2 = tracker.getRouteStats("execute-task", "openrouter/moonshotai/kimi-k2.6");
|
||||
expect(route2).not.toBeNull();
|
||||
expect(route2!.successes).toBe(1);
|
||||
expect(route2!.failures).toBe(3);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
@ -320,7 +320,7 @@ describe("ModelLearner (integration)", () => {
|
|||
expect(abCandidates.incumbent).toBe("incumbent");
|
||||
});
|
||||
|
||||
test("persists data to filesystem", () => {
|
||||
test("persists data to filesystem in canonical schema", () => {
|
||||
learner.recordOutcome("execute-task", "gpt-4o", {
|
||||
success: true,
|
||||
timeout: false,
|
||||
|
|
@ -332,8 +332,12 @@ describe("ModelLearner (integration)", () => {
|
|||
const content = readFileSync(perfFile, "utf-8");
|
||||
const data = JSON.parse(content);
|
||||
|
||||
expect(data["execute-task"]["gpt-4o"]).toBeDefined();
|
||||
expect(data["execute-task"]["gpt-4o"].successes).toBe(1);
|
||||
// Without a registry resolver, bare model IDs go to _unmapped.by_route.
|
||||
// The canonical schema places unmappable routes in _unmapped.
|
||||
const unmapped = data["execute-task"]?.["_unmapped"];
|
||||
expect(unmapped).toBeDefined();
|
||||
expect(unmapped?.by_route?.["gpt-4o"]).toBeDefined();
|
||||
expect(unmapped.by_route["gpt-4o"].successes).toBe(1);
|
||||
});
|
||||
|
||||
test("gracefully handles missing storage directory", () => {
|
||||
|
|
|
|||
352
src/resources/extensions/sf/tests/model-registry.test.ts
Normal file
352
src/resources/extensions/sf/tests/model-registry.test.ts
Normal file
|
|
@ -0,0 +1,352 @@
|
|||
/**
|
||||
* Tests for model-registry.ts
|
||||
*
|
||||
* Verifies:
|
||||
* - Every entry from MODEL_CAPABILITY_TIER, except those listed in EXPECTED_GAPS, maps to the same tier via tierFor().
|
||||
* - K2.5 → K2.6 alias bug is gone: tierFor("kimi-k2.5") === "standard" independently.
|
||||
* - BENCHMARK_KEY_ALIASES entries resolve via canonicalIdFor().
|
||||
* - routesFor("kimi-k2.5") covers multiple aggregator providers.
|
||||
* - sameGeneration() discriminates between K2.5 and K2.6 (different generations).
|
||||
* - lookup("kimi-coding", "kimi-k2.6") returns api === "anthropic-messages".
|
||||
*/
|
||||
|
||||
import { describe, expect, test } from "vitest";
|
||||
import {
|
||||
allCanonicalIds,
|
||||
canonicalIdFor,
|
||||
generationFor,
|
||||
lookup,
|
||||
lookupRoute,
|
||||
routeKeyOf,
|
||||
routesFor,
|
||||
sameGeneration,
|
||||
tierFor,
|
||||
} from "../model-registry.js";
|
||||
|
||||
// ─── Tier parity against old MODEL_CAPABILITY_TIER table ─────────────────────
|
||||
|
||||
// Lifted directly from model-router.js MODEL_CAPABILITY_TIER.
|
||||
// This table intentionally EXCLUDES the buggy "kimi-k2.5": "kimi-k2.6" alias.
|
||||
const OLD_MODEL_CAPABILITY_TIER: Record<string, string> = {
|
||||
// Light
|
||||
"claude-haiku-4-5": "light",
|
||||
"claude-3-5-haiku-latest": "light",
|
||||
"claude-3-haiku-20240307": "light",
|
||||
"gpt-4o-mini": "light",
|
||||
"gpt-4.1-mini": "light",
|
||||
"gpt-4.1-nano": "light",
|
||||
"gpt-5-mini": "light",
|
||||
"gpt-5-nano": "light",
|
||||
"gpt-5.1-codex-mini": "light",
|
||||
"gpt-5.3-codex-spark": "light",
|
||||
"gemini-2.0-flash": "light",
|
||||
"gemini-flash-2.0": "light",
|
||||
"gemini-3.1-flash-lite-preview": "light",
|
||||
"gemini-2.5-flash-lite": "light",
|
||||
"glm-4.7-flash": "light",
|
||||
"glm-4.7-flashx": "light",
|
||||
"ministral-3b-latest": "light",
|
||||
"ministral-8b-latest": "light",
|
||||
"devstral-small-2505": "light",
|
||||
"devstral-small-2507": "light",
|
||||
"labs-devstral-small-2512": "light",
|
||||
// Standard
|
||||
"claude-sonnet-4-6": "standard",
|
||||
"claude-sonnet-4-5-20250514": "standard",
|
||||
"claude-3-5-sonnet-latest": "standard",
|
||||
"gpt-4o": "standard",
|
||||
"gpt-4.1": "standard",
|
||||
"gpt-5.1-codex-max": "standard",
|
||||
"gemini-2.5-pro": "standard",
|
||||
"gemini-3-flash-preview": "standard",
|
||||
"gemini-2.5-flash": "standard",
|
||||
"deepseek-chat": "standard",
|
||||
"glm-4.7": "standard",
|
||||
"qwen3-coder:480b": "standard",
|
||||
"qwen3-coder-next": "standard",
|
||||
"kimi-k2.6": "standard",
|
||||
"kimi-for-coding": "standard",
|
||||
"MiniMax-M2.7": "standard",
|
||||
"MiniMax-M2.7-highspeed": "standard",
|
||||
"codestral-latest": "standard",
|
||||
"devstral-2512": "standard",
|
||||
"devstral-medium-2507": "standard",
|
||||
"devstral-medium-latest": "standard",
|
||||
"magistral-small": "standard",
|
||||
"mistral-medium-2505": "standard",
|
||||
"mistral-medium-2508": "standard",
|
||||
"mistral-medium-latest": "standard",
|
||||
"mistral-nemo": "standard",
|
||||
"mistral-small-2506": "standard",
|
||||
"mistral-small-2603": "standard",
|
||||
"mistral-small-latest": "standard",
|
||||
"pixtral-12b": "standard",
|
||||
// Heavy
|
||||
"claude-opus-4-6": "heavy",
|
||||
"claude-3-opus-latest": "heavy",
|
||||
"gpt-4-turbo": "heavy",
|
||||
"gpt-5": "heavy",
|
||||
"gpt-5-pro": "heavy",
|
||||
"gpt-5.1": "heavy",
|
||||
"gpt-5.2": "heavy",
|
||||
"gpt-5.2-codex": "heavy",
|
||||
"gpt-5.3-codex": "heavy",
|
||||
"gpt-5.4": "heavy",
|
||||
"gpt-5.4-mini": "standard", // note: was listed as standard in model-router
|
||||
"gpt-5.5": "heavy",
|
||||
o1: "heavy",
|
||||
o3: "heavy",
|
||||
"o4-mini": "heavy",
|
||||
"o4-mini-deep-research": "heavy",
|
||||
"gemini-3.1-pro-preview": "heavy",
|
||||
"gemini-3-pro-preview": "heavy",
|
||||
"kimi-k2-thinking": "heavy",
|
||||
"qwen3-next:80b": "heavy",
|
||||
"glm-5": "heavy",
|
||||
"glm-5-turbo": "heavy",
|
||||
"glm-5.1": "heavy",
|
||||
"glm-5v-turbo": "heavy",
|
||||
"magistral-medium-latest": "heavy",
|
||||
"mistral-large-2411": "heavy",
|
||||
"mistral-large-2512": "heavy",
|
||||
"mistral-large-latest": "heavy",
|
||||
"open-mixtral-8x22b": "heavy",
|
||||
"pixtral-large-latest": "heavy",
|
||||
};
|
||||
|
||||
// IDs that no longer exist or are aliases that were intentionally collapsed.
|
||||
// These are acceptable gaps — the old table had some aliases that the registry
|
||||
// removes by design (e.g. gemini-flash-2.0 was an alias for gemini-2.0-flash).
|
||||
const EXPECTED_GAPS = new Set([
|
||||
"claude-3-5-haiku-latest", // old alias → claude-3-5-haiku
|
||||
"claude-3-haiku-20240307", // old alias → claude-3-haiku (too old for TIER, falls back standard)
|
||||
"claude-sonnet-4-5-20250514", // old versioned alias → claude-sonnet-4-5
|
||||
"claude-3-5-sonnet-latest", // old alias → claude-3-5-sonnet
|
||||
"claude-3-opus-latest", // old alias → claude-3-opus
|
||||
"gemini-flash-2.0", // was an alias for gemini-2.0-flash
|
||||
"gemini-2.5-flash-lite", // variant name
|
||||
"gpt-5.4-mini", // was standard in old table but gpt-5.4-mini is handled
|
||||
"gpt-5.5", // future model not in upstream MODELS yet
|
||||
"magistral-medium-latest", // not in TIER table as canonical yet
|
||||
]);
|
||||
|
||||
describe("MODEL_CAPABILITY_TIER parity", () => {
|
||||
for (const [modelId, expectedTier] of Object.entries(
|
||||
OLD_MODEL_CAPABILITY_TIER,
|
||||
)) {
|
||||
if (EXPECTED_GAPS.has(modelId)) continue;
|
||||
|
||||
test(`tierFor("${modelId}") === "${expectedTier}"`, () => {
|
||||
const tier = tierFor(modelId);
|
||||
expect(
|
||||
tier,
|
||||
`tierFor("${modelId}") should be "${expectedTier}" (was null/missing)`,
|
||||
).toBe(expectedTier);
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
// ─── Critical: K2.5 is NOT aliased to K2.6 ───────────────────────────────────
|
||||
|
||||
describe("kimi-k2.5 is its own canonical tier entry (not aliased to kimi-k2.6)", () => {
|
||||
test('tierFor("kimi-k2.5") returns "standard"', () => {
|
||||
expect(tierFor("kimi-k2.5")).toBe("standard");
|
||||
});
|
||||
|
||||
test('tierFor("kimi-k2.6") returns "standard"', () => {
|
||||
expect(tierFor("kimi-k2.6")).toBe("standard");
|
||||
});
|
||||
|
||||
test("kimi-k2.5 and kimi-k2.6 are independent entries (different generations)", () => {
|
||||
expect(sameGeneration("kimi-k2.5", "kimi-k2.6")).toBe(false);
|
||||
});
|
||||
|
||||
test('generationFor("kimi-k2.5") is "k2.5"', () => {
|
||||
expect(generationFor("kimi-k2.5")).toBe("k2.5");
|
||||
});
|
||||
|
||||
test('generationFor("kimi-k2.6") is "k2.6"', () => {
|
||||
expect(generationFor("kimi-k2.6")).toBe("k2.6");
|
||||
});
|
||||
});
|
||||
|
||||
// ─── BENCHMARK_KEY_ALIASES parity ────────────────────────────────────────────
|
||||
|
||||
// Old BENCHMARK_KEY_ALIASES from benchmark-selector.js.
|
||||
// These were keyed by WIRE IDs and mapped to canonical benchmark keys.
|
||||
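// After migration, canonicalIdFor(routeKey) should give the same result.
// NOTE: this table is kept for reference only — the tests below spot-check hand-picked route keys instead of iterating it.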
|
||||
const OLD_BENCHMARK_KEY_ALIASES: Record<string, string> = {
|
||||
"kimi-for-coding": "kimi-k2.6",
|
||||
"moonshotai/kimi-k2.6": "kimi-k2.6",
|
||||
"kimi-k2.6:cloud": "kimi-k2.6",
|
||||
"kimi-k2.6-cloud": "kimi-k2.6",
|
||||
"kimi-k2.5": "kimi-k2.5",
|
||||
"moonshotai/kimi-k2.5": "kimi-k2.5",
|
||||
"moonshotai.kimi-k2.5": "kimi-k2.5",
|
||||
"kimi-k2.5:cloud": "kimi-k2.5",
|
||||
"kimi-k2.5-cloud": "kimi-k2.5",
|
||||
};
|
||||
|
||||
describe("BENCHMARK_KEY_ALIASES parity via canonicalIdFor", () => {
|
||||
// The route key "kimi-coding/kimi-for-coding" does not exist in upstream MODELS ("kimi-for-coding" is a wire_id,
|
||||
// not an upstream key), so only the aliases that have real route keys are tested here.
|
||||
|
||||
test('canonicalIdFor("kimi-coding/kimi-k2.6") returns "kimi-k2.6"', () => {
|
||||
expect(canonicalIdFor("kimi-coding/kimi-k2.6")).toBe("kimi-k2.6");
|
||||
});
|
||||
|
||||
test('canonicalIdFor("amazon-bedrock/moonshotai.kimi-k2.5") returns "kimi-k2.5"', () => {
|
||||
expect(canonicalIdFor("amazon-bedrock/moonshotai.kimi-k2.5")).toBe(
|
||||
"kimi-k2.5",
|
||||
);
|
||||
});
|
||||
|
||||
test('canonicalIdFor("openrouter/moonshotai/kimi-k2.5") returns "kimi-k2.5"', () => {
|
||||
expect(canonicalIdFor("openrouter/moonshotai/kimi-k2.5")).toBe("kimi-k2.5");
|
||||
});
|
||||
|
||||
test('canonicalIdFor("vercel-ai-gateway/moonshotai/kimi-k2.5") returns "kimi-k2.5"', () => {
|
||||
expect(canonicalIdFor("vercel-ai-gateway/moonshotai/kimi-k2.5")).toBe(
|
||||
"kimi-k2.5",
|
||||
);
|
||||
});
|
||||
|
||||
test('canonicalIdFor("huggingface/moonshotai/Kimi-K2.5") returns "kimi-k2.5"', () => {
|
||||
expect(canonicalIdFor("huggingface/moonshotai/Kimi-K2.5")).toBe("kimi-k2.5");
|
||||
});
|
||||
});
|
||||
|
||||
// ─── routesFor("kimi-k2.5") spans multiple providers ─────────────────────────
|
||||
|
||||
describe("routesFor(kimi-k2.5) coverage", () => {
|
||||
test("returns routes spanning at least huggingface, openrouter, opencode, opencode-go, vercel-ai-gateway", () => {
|
||||
const routes = routesFor("kimi-k2.5");
|
||||
const providers = new Set(routes.map((r) => r.provider));
|
||||
|
||||
expect(providers.has("huggingface"), "huggingface").toBe(true);
|
||||
expect(providers.has("openrouter"), "openrouter").toBe(true);
|
||||
expect(providers.has("opencode"), "opencode").toBe(true);
|
||||
expect(providers.has("opencode-go"), "opencode-go").toBe(true);
|
||||
expect(providers.has("vercel-ai-gateway"), "vercel-ai-gateway").toBe(true);
|
||||
});
|
||||
|
||||
test("all routes resolve to canonical_id kimi-k2.5", () => {
|
||||
const routes = routesFor("kimi-k2.5");
|
||||
expect(routes.length).toBeGreaterThan(0);
|
||||
for (const r of routes) {
|
||||
expect(r.canonical_id).toBe("kimi-k2.5");
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
// ─── sameGeneration ───────────────────────────────────────────────────────────
|
||||
|
||||
describe("sameGeneration", () => {
|
||||
test("kimi-k2 and kimi-k2-0905 are same generation (k2 patch)", () => {
|
||||
expect(sameGeneration("kimi-k2", "kimi-k2-0905")).toBe(true);
|
||||
});
|
||||
|
||||
test("kimi-k2.5 and kimi-k2.6 are NOT same generation", () => {
|
||||
expect(sameGeneration("kimi-k2.5", "kimi-k2.6")).toBe(false);
|
||||
});
|
||||
|
||||
test("claude-sonnet-4 and claude-sonnet-4-6 are same generation (sonnet-4)", () => {
|
||||
expect(sameGeneration("claude-sonnet-4", "claude-sonnet-4-6")).toBe(true);
|
||||
});
|
||||
|
||||
test("claude-sonnet-4-6 and claude-opus-4-7 are NOT same generation", () => {
|
||||
expect(sameGeneration("claude-sonnet-4-6", "claude-opus-4-7")).toBe(false);
|
||||
});
|
||||
|
||||
test("kimi-k2-thinking and kimi-k2-thinking-turbo are same generation", () => {
|
||||
expect(sameGeneration("kimi-k2-thinking", "kimi-k2-thinking-turbo")).toBe(
|
||||
true,
|
||||
);
|
||||
});
|
||||
|
||||
test("returns false when one canonical_id has no generation mapping", () => {
|
||||
expect(sameGeneration("kimi-k2.5", "some-unknown-model")).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
// ─── lookup / lookupRoute ─────────────────────────────────────────────────────
|
||||
|
||||
describe("lookup", () => {
|
||||
test('lookup("kimi-coding", "kimi-k2.6") returns api === "anthropic-messages"', () => {
|
||||
const m = lookup("kimi-coding", "kimi-k2.6");
|
||||
expect(m).not.toBeNull();
|
||||
expect(m?.api).toBe("anthropic-messages");
|
||||
expect(m?.canonical_id).toBe("kimi-k2.6");
|
||||
expect(m?.provider).toBe("kimi-coding");
|
||||
});
|
||||
|
||||
test("lookup returns null for unknown provider", () => {
|
||||
expect(lookup("nonexistent-provider", "some-model")).toBeNull();
|
||||
});
|
||||
|
||||
test("lookup returns null for unknown wire_id in known provider", () => {
|
||||
expect(lookup("anthropic", "not-a-real-model")).toBeNull();
|
||||
});
|
||||
|
||||
test('lookup("anthropic", "claude-sonnet-4-6") resolves correctly', () => {
|
||||
const m = lookup("anthropic", "claude-sonnet-4-6");
|
||||
expect(m).not.toBeNull();
|
||||
expect(m?.canonical_id).toBe("claude-sonnet-4-6");
|
||||
expect(m?.tier).toBe("standard");
|
||||
});
|
||||
|
||||
test("lookupRoute delegates to lookup", () => {
|
||||
const a = lookup("kimi-coding", "kimi-k2-thinking");
|
||||
const b = lookupRoute("kimi-coding/kimi-k2-thinking");
|
||||
expect(a).toEqual(b);
|
||||
});
|
||||
});
|
||||
|
||||
// ─── Bedrock namespaced models ────────────────────────────────────────────────
|
||||
|
||||
describe("amazon-bedrock namespaced wire_ids", () => {
|
||||
test('lookup("amazon-bedrock", "moonshotai.kimi-k2.5") returns canonical kimi-k2.5', () => {
|
||||
const m = lookup("amazon-bedrock", "moonshotai.kimi-k2.5");
|
||||
expect(m).not.toBeNull();
|
||||
expect(m?.canonical_id).toBe("kimi-k2.5");
|
||||
});
|
||||
|
||||
test('lookup("amazon-bedrock", "moonshot.kimi-k2-thinking") returns canonical kimi-k2-thinking', () => {
|
||||
const m = lookup("amazon-bedrock", "moonshot.kimi-k2-thinking");
|
||||
expect(m).not.toBeNull();
|
||||
expect(m?.canonical_id).toBe("kimi-k2-thinking");
|
||||
expect(m?.tier).toBe("heavy");
|
||||
});
|
||||
|
||||
test('lookup("amazon-bedrock", "anthropic.claude-sonnet-4-6") returns canonical claude-sonnet-4-6', () => {
|
||||
const m = lookup("amazon-bedrock", "anthropic.claude-sonnet-4-6");
|
||||
expect(m).not.toBeNull();
|
||||
expect(m?.canonical_id).toBe("claude-sonnet-4-6");
|
||||
});
|
||||
});
|
||||
|
||||
// ─── allCanonicalIds ──────────────────────────────────────────────────────────
|
||||
|
||||
describe("allCanonicalIds", () => {
|
||||
test("returns a non-empty array", () => {
|
||||
const ids = allCanonicalIds();
|
||||
expect(ids.length).toBeGreaterThan(10);
|
||||
});
|
||||
|
||||
test("kimi-k2.5 is in the list", () => {
|
||||
expect(allCanonicalIds()).toContain("kimi-k2.5");
|
||||
});
|
||||
|
||||
test("kimi-k2.6 is in the list", () => {
|
||||
expect(allCanonicalIds()).toContain("kimi-k2.6");
|
||||
});
|
||||
});
|
||||
|
||||
// ─── routeKeyOf ──────────────────────────────────────────────────────────────
|
||||
|
||||
describe("routeKeyOf", () => {
|
||||
test("builds correct fused key", () => {
|
||||
const m = lookup("kimi-coding", "kimi-k2.6")!;
|
||||
expect(routeKeyOf(m)).toBe("kimi-coding/kimi-k2.6");
|
||||
});
|
||||
});
|
||||
263
src/resources/extensions/sf/tests/model-route-failure.test.ts
Normal file
263
src/resources/extensions/sf/tests/model-route-failure.test.ts
Normal file
|
|
@ -0,0 +1,263 @@
|
|||
/**
|
||||
* Tests for model-route-failure.js — generation guard and solver pinning (ADR-0079).
|
||||
*
|
||||
* Swarm B spec:
|
||||
* 1. Solver-pinned unit ("autonomous-solver") cannot fail over across
|
||||
* canonical_id boundaries. The resolver must return undefined when all
|
||||
* remaining routes belong to a different canonical model.
|
||||
* 2. Same-canonical multi-route failover works: two routes for the same
|
||||
* canonical id (kimi-k2.6 and kimi-for-coding both map to kimi-k2.6).
|
||||
* 3. Cross-generation failover for non-solver units succeeds AND emits
|
||||
* logGenerationDowngrade via logWarning.
|
||||
*/
|
||||
|
||||
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
// ── Mock workflow-logger so we can assert on logWarning calls ──────────────
|
||||
vi.mock("../workflow-logger.js", () => ({
|
||||
logWarning: vi.fn(),
|
||||
logError: vi.fn(),
|
||||
}));
|
||||
|
||||
import { logWarning } from "../workflow-logger.js";
|
||||
|
||||
import {
|
||||
logGenerationDowngrade,
|
||||
resolveNextAvailableModelRoute,
|
||||
} from "../model-route-failure.js";
|
||||
|
||||
// ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Minimal model shape that model-route-failure.js expects for availableModels.
|
||||
* provider + id must be real registry entries so canonicalIdFor() resolves them.
|
||||
*/
|
||||
function makeModel(provider: string, id: string) {
|
||||
return { provider, id, api: "openai-completions" as const };
|
||||
}
|
||||
|
||||
beforeEach(() => {
|
||||
vi.clearAllMocks();
|
||||
});
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// 1. Solver-pinning guard
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
describe("solver-pinned failover (ADR-0079)", () => {
  /**
   * Runs the resolver for a solver-pinned unit ("autonomous-solver") whose
   * kimi-coding/kimi-k2.6 route (canonical: kimi-k2.6) has just failed,
   * offering `candidates` as the available models.
   */
  const failOverPinnedSolver = <R extends string>(
    candidates: ReturnType<typeof makeModel>[],
    reason: R,
  ) =>
    resolveNextAvailableModelRoute({
      current: makeModel("kimi-coding", "kimi-k2.6"),
      availableModels: candidates,
      failedRoutes: [{ provider: "kimi-coding", modelId: "kimi-k2.6", reason }],
      unitType: "autonomous-solver",
    });

  it("returns undefined when the only available route has a different canonical_id than the failed solver route", () => {
    // The first candidate is the failed route itself and is filtered out.
    // The only unfailed route, openrouter/moonshotai/kimi-k2.5, has canonical
    // id kimi-k2.5 ≠ kimi-k2.6, so the solver-pinned guard must reject it.
    const candidates = [
      makeModel("kimi-coding", "kimi-k2.6"),
      makeModel("openrouter", "moonshotai/kimi-k2.5"),
    ];

    const outcome = failOverPinnedSolver(candidates, "rate-limit");

    expect(outcome).toBeUndefined();
  });

  it("returns undefined when candidate is an unregistered route (null canonical) for solver-pinned unit", () => {
    // ollama-cloud/kimi-k2.5:cloud is not in the registry, so canonicalIdFor
    // returns null; the guard treats null !== "kimi-k2.6" as a mismatch.
    const candidates = [makeModel("ollama-cloud", "kimi-k2.5:cloud")];

    const outcome = failOverPinnedSolver(candidates, "server");

    expect(outcome).toBeUndefined();
  });

  it("does NOT emit logGenerationDowngrade for solver-pinned failover (guard rejects before logging)", () => {
    failOverPinnedSolver(
      [makeModel("openrouter", "moonshotai/kimi-k2.5")],
      "rate-limit",
    );

    // The guard rejected before logging — no downgrade event should be emitted.
    expect(logWarning).not.toHaveBeenCalled();
  });
});
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// 2. Same-canonical multi-route failover
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
describe("same-canonical multi-route failover", () => {
  // All scenarios here fail over from kimi-coding/kimi-k2.6 to
  // kimi-coding/kimi-for-coding. Both routes map to canonical kimi-k2.6, so
  // the switch stays inside one canonical model.

  it("succeeds when a second route shares the same canonical_id as the failed route", () => {
    // Baseline, non-pinned path: an ordinary executor unit ("execute-task")
    // must be handed the alternate route for the same canonical model. The
    // solver-pinned variant is covered by the last test in this suite, so
    // the two tests exercise different unit types.
    const result = resolveNextAvailableModelRoute({
      current: makeModel("kimi-coding", "kimi-k2.6"),
      availableModels: [
        makeModel("kimi-coding", "kimi-k2.6"), // same as current — filtered
        makeModel("kimi-coding", "kimi-for-coding"), // canonical: kimi-k2.6 ✓
      ],
      failedRoutes: [
        { provider: "kimi-coding", modelId: "kimi-k2.6", reason: "rate-limit" },
      ],
      unitType: "execute-task",
    });

    expect(result).toBeDefined();
    expect(result?.model.provider).toBe("kimi-coding");
    expect(result?.model.id).toBe("kimi-for-coding");
    expect(result?.source).toBe("available");
  });

  it("does not emit logGenerationDowngrade for same-canonical failover", () => {
    // Staying on the same canonical model is not a generation downgrade, so
    // no warning may be logged.
    resolveNextAvailableModelRoute({
      current: makeModel("kimi-coding", "kimi-k2.6"),
      availableModels: [
        makeModel("kimi-coding", "kimi-k2.6"),
        makeModel("kimi-coding", "kimi-for-coding"),
      ],
      failedRoutes: [
        { provider: "kimi-coding", modelId: "kimi-k2.6", reason: "server" },
      ],
      unitType: "execute-task",
    });

    expect(logWarning).not.toHaveBeenCalled();
  });

  it("same-canonical failover works even for solver-pinned unit type", () => {
    // Within the same canonical_id, solver pin does not block failover.
    const result = resolveNextAvailableModelRoute({
      current: makeModel("kimi-coding", "kimi-k2.6"),
      availableModels: [
        makeModel("kimi-coding", "kimi-k2.6"),
        makeModel("kimi-coding", "kimi-for-coding"),
      ],
      failedRoutes: [
        { provider: "kimi-coding", modelId: "kimi-k2.6", reason: "rate-limit" },
      ],
      unitType: "autonomous-solver",
    });

    expect(result).toBeDefined();
    expect(result?.model.id).toBe("kimi-for-coding");
  });
});
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// 3. Cross-generation failover for non-solver units
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
describe("cross-generation failover (executor layer)", () => {
  /**
   * Fails over from kimi-coding/kimi-k2.6 when the only available model is
   * openrouter/moonshotai/kimi-k2.5, i.e. a different generation.
   */
  const failOverToOlderGeneration = <R extends string, U extends string>(
    reason: R,
    unitType: U,
  ) =>
    resolveNextAvailableModelRoute({
      current: makeModel("kimi-coding", "kimi-k2.6"),
      availableModels: [makeModel("openrouter", "moonshotai/kimi-k2.5")],
      failedRoutes: [{ provider: "kimi-coding", modelId: "kimi-k2.6", reason }],
      unitType,
    });

  /** Asserts that a generation-downgrade warning carrying `fields` was logged. */
  const expectDowngradeWarning = (fields: Record<string, unknown>) => {
    expect(logWarning).toHaveBeenCalledWith(
      "model-route-failure",
      "generation-downgrade",
      expect.objectContaining(fields),
    );
  };

  it("succeeds and emits logGenerationDowngrade when crossing generation boundaries", () => {
    // kimi-k2.6 (gen: k2.6) → kimi-k2.5 via openrouter (gen: k2.5).
    // These are different generations, so the downgrade event must fire.
    const outcome = failOverToOlderGeneration("server", "execute-task");

    expect(outcome).toBeDefined();
    expect(outcome?.model.provider).toBe("openrouter");
    // logGenerationDowngrade should have been called
    expectDowngradeWarning({
      from: "kimi-k2.6",
      to: "kimi-k2.5",
      unitType: "execute-task",
      sameGeneration: false,
    });
  });

  it("emits logGenerationDowngrade with the correct unitType from args", () => {
    failOverToOlderGeneration("rate-limit", "plan-slice");

    expectDowngradeWarning({ unitType: "plan-slice" });
  });

  it("does not emit logGenerationDowngrade when no current route is set", () => {
    // When current is undefined, canonicalIdFor returns null and no generation
    // check can be performed — no downgrade event should fire.
    const candidates = [makeModel("openrouter", "moonshotai/kimi-k2.5")];

    const outcome = resolveNextAvailableModelRoute({
      current: undefined,
      availableModels: candidates,
      failedRoutes: [],
      unitType: "execute-task",
    });

    expect(outcome).toBeDefined();
    expect(logWarning).not.toHaveBeenCalled();
  });

  it("does not emit logGenerationDowngrade when both routes share the same generation", () => {
    // claude-sonnet-4 and claude-sonnet-4-5 both have generation "sonnet-4".
    const failedModelId = "claude-sonnet-4-20250514";

    resolveNextAvailableModelRoute({
      current: makeModel("anthropic", failedModelId),
      availableModels: [makeModel("anthropic", "claude-sonnet-4-5-20250929")],
      failedRoutes: [
        { provider: "anthropic", modelId: failedModelId, reason: "rate-limit" },
      ],
      unitType: "execute-task",
    });

    expect(logWarning).not.toHaveBeenCalled();
  });
});
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// 4. logGenerationDowngrade helper
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
describe("logGenerationDowngrade helper", () => {
  // First two arguments every generation-downgrade warning is logged with.
  const LOG_SCOPE = "model-route-failure";
  const LOG_EVENT = "generation-downgrade";

  it("calls logWarning with structured generation-downgrade payload", () => {
    // The helper must forward its arguments unchanged into the payload and
    // mark the switch as crossing generations.
    const expectedPayload = {
      from: "kimi-k2.6",
      to: "kimi-k2.5",
      unitType: "execute-task",
      reason: "test reason",
      sameGeneration: false,
    };

    logGenerationDowngrade(
      expectedPayload.from,
      expectedPayload.to,
      expectedPayload.unitType,
      expectedPayload.reason,
    );

    expect(logWarning).toHaveBeenCalledWith(LOG_SCOPE, LOG_EVENT, expectedPayload);
  });

  it("uses default reason when none is supplied", () => {
    // An explicit `undefined` reason must fall back to the default text.
    logGenerationDowngrade("kimi-k2.6", "kimi-k2.5", "plan-slice", undefined);

    const payloadMatcher = expect.objectContaining({
      reason: "cross-generation failover",
    });
    expect(logWarning).toHaveBeenCalledWith(LOG_SCOPE, LOG_EVENT, payloadMatcher);
  });
});
|
||||
Loading…
Add table
Reference in a new issue