Quick Win 1: Close Self-Report Feedback Loop [9/10 impact] - Added self-report-fixer.js module with automatic fix classification - Pattern-based detection for high-confidence fixes (e.g., prompt rubrics) - Deduplication and severity-based categorization of reports - Designed for extension into triage-self-feedback pipeline Quick Win 2: Activate Continuous Model Learning [8/10 impact] - Added model-learner.js with ModelPerformanceTracker class - Per-task-type tracking: success rate, latency, cost, token efficiency - Auto-demotion for models failing >50% on specific task types - A/B testing infrastructure for hypothesis testing on low-risk tasks - Failure analysis with pattern detection (e.g., timeouts, quality issues) - Storage: .sf/model-performance.json, .sf/model-failure-log.jsonl Quick Win 3: Automate Knowledge Injection [7/10 impact] - Added knowledge-injector.js with semantic similarity scoring - Integrated into auto-prompts.js for execute-task prompts - queryKnowledge already exists in context-store.js (60% done) - Enhanced with: semantic matching, confidence filtering, contradiction detection - Tracks knowledge usage for feedback loop Integration: - Modified auto-prompts.js to inject knowledge via knowledgeInjection variable - Added getKnowledgeInjection helper for graceful degradation - All new modules pass build check and are in dist/ Status: Core infrastructure in place; ready for integration into dispatch loop. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
378 lines
8.9 KiB
JavaScript
378 lines
8.9 KiB
JavaScript
/**
|
|
* Continuous Model Learning — track per-task-type model performance and
|
|
* adaptively route to better-performing models.
|
|
*
|
|
* Purpose: Make model selection data-driven and adaptive instead of static.
|
|
* When a model consistently fails on certain task types, demote it. When a new
|
|
* model succeeds where the incumbent fails, promote it.
|
|
*
|
|
* Consumer: auto-dispatch.ts outcome logging, model-router.ts selection logic,
|
|
* benchmark-selector.ts display.
|
|
*/
|
|
|
|
import {
  appendFileSync,
  existsSync,
  mkdirSync,
  readFileSync,
  writeFileSync,
} from "node:fs";
import { dirname, join } from "node:path";
|
|
|
|
/**
 * Per-task-type model performance tracker.
 *
 * Persists aggregate outcome counts per (taskType, modelId) pair to
 * `.sf/model-performance.json` under `basePath`, and derives a success
 * rate used for ranking and demotion decisions.
 *
 * Schema:
 * {
 *   "execute-task": {
 *     "gpt-4o": {
 *       "successes": 42,
 *       "failures": 3,
 *       "timeouts": 1,
 *       "totalTokens": 1500000,
 *       "totalCost": 45.50,
 *       "lastUsed": "2026-05-06T16:30:00Z",
 *       "successRate": 0.93
 *     },
 *     "claude-opus": { ... }
 *   },
 *   "plan-slice": { ... }
 * }
 */
class ModelPerformanceTracker {
  /**
   * @param {string} basePath - Project root; storage lives at
   *   `<basePath>/.sf/model-performance.json`.
   */
  constructor(basePath) {
    this.basePath = basePath;
    this.storagePath = join(basePath, ".sf", "model-performance.json");
    this.data = this._load();
  }

  /**
   * Load persisted stats. Returns `{}` when the file is missing, unreadable,
   * or parses to something other than a plain object.
   */
  _load() {
    if (!existsSync(this.storagePath)) {
      return {};
    }
    try {
      const content = readFileSync(this.storagePath, "utf-8");
      const parsed = JSON.parse(content);
      // Guard against a corrupted file: JSON.parse can legally yield null,
      // an array, a number, etc. — any of which would later make
      // `this.data[taskType]` lookups throw or misbehave.
      if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
        return parsed;
      }
      return {};
    } catch {
      // Unreadable or invalid JSON — start fresh rather than crash.
      return {};
    }
  }

  /** Persist `this.data`, creating `.sf/` on demand. Errors are logged, not thrown. */
  _save() {
    try {
      const dir = dirname(this.storagePath);
      if (!existsSync(dir)) {
        mkdirSync(dir, { recursive: true });
      }
      writeFileSync(
        this.storagePath,
        JSON.stringify(this.data, null, 2),
        "utf-8",
      );
    } catch (err) {
      // Tracking is best-effort; never let persistence break dispatch.
      console.error("Failed to save model performance data:", err);
    }
  }

  /**
   * Record outcome for a model on a specific task type and persist it.
   *
   * @param {string} taskType - e.g. "execute-task".
   * @param {string} modelId - e.g. "gpt-4o".
   * @param {{success: boolean, timeout?: boolean, tokensUsed?: number,
   *          costUsd?: number, timestamp?: string}} outcome
   */
  recordOutcome(taskType, modelId, outcome) {
    const {
      success,
      timeout = false,
      tokensUsed = 0,
      costUsd = 0,
      timestamp = new Date().toISOString(),
    } = outcome;

    if (!this.data[taskType]) {
      this.data[taskType] = {};
    }
    if (!this.data[taskType][modelId]) {
      this.data[taskType][modelId] = {
        successes: 0,
        failures: 0,
        timeouts: 0,
        totalTokens: 0,
        totalCost: 0,
        lastUsed: timestamp,
        successRate: 0,
      };
    }

    const stats = this.data[taskType][modelId];
    if (success) {
      stats.successes += 1;
    } else {
      // A timeout counts both as a timeout and as a failure, so the
      // success-rate denominator stays (successes + failures).
      if (timeout) {
        stats.timeouts += 1;
      }
      stats.failures += 1;
    }

    stats.totalTokens += tokensUsed;
    stats.totalCost += costUsd;
    stats.lastUsed = timestamp;

    const total = stats.successes + stats.failures;
    stats.successRate = total > 0 ? stats.successes / total : 0;

    this._save();
  }

  /**
   * Get performance stats for a task type and model.
   * @returns {object|null} The raw stats record, or null if never recorded.
   */
  getStats(taskType, modelId) {
    return this.data[taskType]?.[modelId] || null;
  }

  /**
   * Get all models for a task type, ranked by success rate (descending).
   * Models with fewer than `minSamples` attempts are excluded.
   *
   * @returns {Array<{modelId: string, successRate: number, attempts: number,
   *                  tokens: number, cost: number, latestAttempt: string}>}
   */
  getRankedModels(taskType, minSamples = 3) {
    if (!this.data[taskType]) return [];

    return Object.entries(this.data[taskType])
      .filter(([, stats]) => stats.successes + stats.failures >= minSamples)
      .map(([modelId, stats]) => ({
        modelId,
        successRate: stats.successRate,
        attempts: stats.successes + stats.failures,
        tokens: stats.totalTokens,
        cost: stats.totalCost,
        latestAttempt: stats.lastUsed,
      }))
      .sort((a, b) => b.successRate - a.successRate);
  }

  /**
   * Check if a model should be demoted for a task type.
   * Requires at least 5 attempts and a failure rate strictly above
   * `thresholdFailureRate` (default: fails >50%).
   */
  shouldDemote(taskType, modelId, thresholdFailureRate = 0.5) {
    const stats = this.getStats(taskType, modelId);
    if (!stats) return false;

    const failureRate = 1 - stats.successRate;
    const totalAttempts = stats.successes + stats.failures;

    return failureRate > thresholdFailureRate && totalAttempts >= 5;
  }

  /**
   * Get candidates for A/B testing (new model vs incumbent).
   * Requires at least two ranked models; otherwise returns null.
   *
   * @returns {{incumbent: object, challengers: object[], testBudget: number}|null}
   */
  getABTestCandidates(taskType, minSamples = 3, lowRiskFraction = 0.1) {
    const ranked = this.getRankedModels(taskType, minSamples);
    if (ranked.length < 2) return null;

    const incumbent = ranked[0];
    const challengers = ranked.slice(1, 3); // Top 2 challengers

    return {
      incumbent,
      challengers,
      // Number of low-risk tasks to dedicate to the test, e.g. 10 when
      // lowRiskFraction is 0.1.
      testBudget: Math.max(1, Math.ceil(1 / lowRiskFraction)),
    };
  }

  /**
   * Analyze A/B test results and decide on promotion/demotion.
   *
   * @param {string} taskType - Currently unused; kept for interface symmetry.
   * @param {{incumbentWins: number, challengerWins: number}} results - Win
   *   counts are shares of head-to-head comparisons, not absolute rates.
   * @returns {{recommendation: "inconclusive"|"promote"|"continue", reason: string}}
   */
  analyzeABTest(taskType, results) {
    const { incumbentWins, challengerWins } = results;
    const total = incumbentWins + challengerWins;

    if (total < 5) {
      return { recommendation: "inconclusive", reason: "insufficient samples" };
    }

    const challengerSuccessRate = challengerWins / total;
    const incumbentSuccessRate = incumbentWins / total;

    // Promote only on a clear (>10 point) win share advantage.
    if (challengerSuccessRate > incumbentSuccessRate + 0.1) {
      return {
        recommendation: "promote",
        reason: `challenger ${challengerSuccessRate.toFixed(2)} vs incumbent ${incumbentSuccessRate.toFixed(2)}`,
      };
    }

    return {
      recommendation: "continue",
      reason: "incumbent still ahead",
    };
  }
}
|
|
|
|
/**
 * Failure Analyzer — categorize and log why models failed.
 *
 * Purpose: Understand failure patterns (timeout, quality, cost) to inform
 * promotion/demotion decisions. Failures are appended as JSONL entries to
 * `.sf/model-failure-log.jsonl` under `basePath`.
 */
class FailureAnalyzer {
  /** @param {string} basePath - Project root; log lives under `.sf/`. */
  constructor(basePath) {
    this.basePath = basePath;
    this.logsPath = join(basePath, ".sf", "model-failure-log.jsonl");
  }

  /**
   * Append one failure record to the JSONL log. Best-effort: logging
   * errors are printed, never thrown.
   *
   * @param {string} taskType
   * @param {string} modelId
   * @param {{reason?: string, timeout?: boolean, tokensUsed?: number,
   *          context?: object, timestamp?: string}} failure
   */
  logFailure(taskType, modelId, failure) {
    const {
      reason = "unknown",
      timeout = false,
      tokensUsed = 0,
      context = {},
      timestamp = new Date().toISOString(),
    } = failure;

    const entry = {
      timestamp,
      taskType,
      modelId,
      reason,
      timeout,
      tokensUsed,
      context,
    };

    try {
      const dir = dirname(this.logsPath);
      if (!existsSync(dir)) {
        mkdirSync(dir, { recursive: true });
      }
      appendFileSync(this.logsPath, JSON.stringify(entry) + "\n", "utf-8");
    } catch (err) {
      console.error("Failed to log model failure:", err);
    }
  }

  /**
   * Get failure summary for a model on a task type.
   *
   * Malformed or blank lines in the log are skipped individually, so one
   * corrupt entry cannot wipe out the whole summary.
   *
   * @returns {{reasons: Object<string, number>, patterns: object[]}}
   */
  getFailureSummary(taskType, modelId) {
    if (!existsSync(this.logsPath)) {
      return { reasons: {}, patterns: [] };
    }

    try {
      const content = readFileSync(this.logsPath, "utf-8");
      const lines = content.split("\n");

      const reasons = {};
      const failures = [];

      for (const line of lines) {
        if (!line.trim()) continue; // blank line (e.g. trailing newline)

        let entry;
        try {
          entry = JSON.parse(line);
        } catch {
          continue; // skip a corrupt line instead of discarding everything
        }
        if (entry.taskType !== taskType || entry.modelId !== modelId) continue;

        reasons[entry.reason] = (reasons[entry.reason] || 0) + 1;
        failures.push(entry);
      }

      // Detect systematic patterns across the matched failures.
      const patterns = this._detectPatterns(failures);

      return { reasons, patterns };
    } catch {
      // Unreadable log — report nothing rather than crash callers.
      return { reasons: {}, patterns: [] };
    }
  }

  /**
   * Analyze failure distribution to detect systematic issues.
   * Currently flags models whose failures are >50% timeouts.
   */
  _detectPatterns(failures) {
    const timeoutCount = failures.filter((f) => f.timeout).length;
    const patterns = [];

    if (timeoutCount / Math.max(failures.length, 1) > 0.5) {
      patterns.push({
        type: "timeout_prone",
        severity: "high",
        suggestion: "Use shorter timeout or lower batch size",
      });
    }

    return patterns;
  }
}
|
|
|
|
/**
 * Main API: integrate model learning into the dispatch workflow.
 *
 * A thin facade that composes the performance tracker and the failure
 * analyzer behind a single object, so callers only need one import.
 *
 * Usage in auto-dispatch.ts:
 * ```
 * const learner = new ModelLearner(projectPath);
 * learner.recordOutcome("execute-task", modelUsed, {
 *   success: taskSucceeded,
 *   timeout: taskTimedOut,
 *   tokensUsed: totalTokens,
 *   costUsd: modelCost,
 * });
 * ```
 */
export class ModelLearner {
  /** @param {string} basePath - Project root; storage lives under `.sf/`. */
  constructor(basePath) {
    this.basePath = basePath;
    this.tracker = new ModelPerformanceTracker(basePath);
    this.analyzer = new FailureAnalyzer(basePath);
  }

  /** Record an outcome for a model on a task (delegates to the tracker). */
  recordOutcome(taskType, modelId, outcome) {
    this.tracker.recordOutcome(taskType, modelId, outcome);
  }

  /** Log failure details for later analysis (delegates to the analyzer). */
  logFailure(taskType, modelId, failure) {
    this.analyzer.logFailure(taskType, modelId, failure);
  }

  /** Ranked models for a task type, best success rate first — for routing. */
  getRankedModels(taskType, minSamples = 3) {
    return this.tracker.getRankedModels(taskType, minSamples);
  }

  /** Whether a model's failure rate warrants demotion on this task type. */
  shouldDemote(taskType, modelId, failureThreshold = 0.5) {
    return this.tracker.shouldDemote(taskType, modelId, failureThreshold);
  }

  /** Incumbent/challenger candidates for hypothesis (A/B) testing. */
  getABTestCandidates(taskType, minSamples = 3) {
    return this.tracker.getABTestCandidates(taskType, minSamples);
  }

  /** Promotion/continuation recommendation from A/B test win counts. */
  analyzeABTest(taskType, results) {
    return this.tracker.analyzeABTest(taskType, results);
  }

  /** Failure reasons and detected patterns for a model on a task type. */
  getFailureAnalysis(taskType, modelId) {
    return this.analyzer.getFailureSummary(taskType, modelId);
  }
}
|
|
|
|
// Named exports for consumers that import individual classes; the default
// export bundles all three for namespace-style imports. ModelLearner is
// already exported where it is declared.
export { ModelPerformanceTracker, FailureAnalyzer };

export default {
  ModelLearner,
  ModelPerformanceTracker,
  FailureAnalyzer,
};
|