test: harden uok self-evolution paths
This commit is contained in:
parent
69d3114265
commit
30f8738585
25 changed files with 314 additions and 481 deletions
|
|
@ -104,3 +104,15 @@ test("set_widget_when_widget_host_missing_ignores_factory_without_throwing", ()
|
|||
{ placement: "belowEditor" },
|
||||
);
|
||||
});
|
||||
|
||||
test("dialog_methods_when_host_dialogs_missing_degrade_without_throwing", async () => {
|
||||
const ui = createExtensionUIContext({
|
||||
// RPC/headless-style hosts may not implement interactive dialogs.
|
||||
});
|
||||
|
||||
await assert.doesNotReject(async () => {
|
||||
assert.equal(await ui.confirm("Proceed?", "Dangerous command"), false);
|
||||
assert.equal(await ui.select("Pick one", ["a", "b"]), undefined);
|
||||
assert.equal(await ui.input("Value", "placeholder"), undefined);
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -96,12 +96,24 @@ function createWidgetSetter(
|
|||
export function createExtensionUIContext(host: any): ExtensionUIContext {
|
||||
const setWidget = createWidgetSetter(host);
|
||||
return {
|
||||
select: (title, options, opts) =>
|
||||
host.showExtensionSelector(title, options, opts),
|
||||
confirm: (title, message, opts) =>
|
||||
host.showExtensionConfirm(title, message, opts),
|
||||
input: (title, placeholder, opts) =>
|
||||
host.showExtensionInput(title, placeholder, opts),
|
||||
select: (title, options, opts) => {
|
||||
if (typeof host.showExtensionSelector !== "function") {
|
||||
return Promise.resolve(undefined);
|
||||
}
|
||||
return host.showExtensionSelector(title, options, opts);
|
||||
},
|
||||
confirm: (title, message, opts) => {
|
||||
if (typeof host.showExtensionConfirm !== "function") {
|
||||
return Promise.resolve(false);
|
||||
}
|
||||
return host.showExtensionConfirm(title, message, opts);
|
||||
},
|
||||
input: (title, placeholder, opts) => {
|
||||
if (typeof host.showExtensionInput !== "function") {
|
||||
return Promise.resolve(undefined);
|
||||
}
|
||||
return host.showExtensionInput(title, placeholder, opts);
|
||||
},
|
||||
notify: (message, type) => notifyHost(host, message, type),
|
||||
onTerminalInput: (handler) =>
|
||||
host.addExtensionTerminalInputListener(handler),
|
||||
|
|
|
|||
|
|
@ -17,12 +17,7 @@ import {
|
|||
} from "./auto-prompts.js";
|
||||
import { scopeActiveToolsForUnitType } from "./constants.js";
|
||||
import { loadFile } from "./files.js";
|
||||
import { parseRoadmap } from "./parsers.js";
|
||||
import {
|
||||
relSliceFile,
|
||||
resolveMilestoneFile,
|
||||
resolveSliceFile,
|
||||
} from "./paths.js";
|
||||
import { relSliceFile, resolveSliceFile } from "./paths.js";
|
||||
import { loadEffectiveSFPreferences } from "./preferences.js";
|
||||
import { getMilestoneSlices, isDbAvailable } from "./sf-db.js";
|
||||
import { deriveState } from "./state.js";
|
||||
|
|
@ -181,24 +176,17 @@ export async function dispatchDirectPhase(ctx, pi, phase, base) {
|
|||
}
|
||||
case "reassess":
|
||||
case "reassess-roadmap": {
|
||||
// DB primary path — get completed slices, fall back to file parsing when DB has no data
|
||||
let completedSliceIds = [];
|
||||
if (isDbAvailable()) {
|
||||
completedSliceIds = getMilestoneSlices(mid)
|
||||
.filter((s) => s.status === "complete")
|
||||
.map((s) => s.id);
|
||||
}
|
||||
if (completedSliceIds.length === 0) {
|
||||
// File-based fallback: parse roadmap checkboxes
|
||||
const roadmapPath = resolveMilestoneFile(base, mid, "ROADMAP");
|
||||
if (roadmapPath) {
|
||||
const roadmapContent = await loadFile(roadmapPath);
|
||||
if (roadmapContent) {
|
||||
completedSliceIds = parseRoadmap(roadmapContent)
|
||||
.slices.filter((s) => s.done)
|
||||
.map((s) => s.id);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
ctx.ui.notify(
|
||||
"Cannot dispatch reassess-roadmap: database unavailable.",
|
||||
"warning",
|
||||
);
|
||||
return;
|
||||
}
|
||||
if (completedSliceIds.length === 0) {
|
||||
ctx.ui.notify(
|
||||
|
|
@ -223,24 +211,18 @@ export async function dispatchDirectPhase(ctx, pi, phase, base) {
|
|||
// UAT targets the most recently completed slice, not the active (next
|
||||
// incomplete) slice. After slice completion, state.activeSlice advances
|
||||
// to the next incomplete slice, so we find the last done slice from the
|
||||
// roadmap instead (#1693).
|
||||
// DB instead (#1693).
|
||||
let uatCompletedSliceIds = [];
|
||||
if (isDbAvailable()) {
|
||||
uatCompletedSliceIds = getMilestoneSlices(mid)
|
||||
.filter((s) => s.status === "complete")
|
||||
.map((s) => s.id);
|
||||
}
|
||||
if (uatCompletedSliceIds.length === 0) {
|
||||
// File-based fallback: parse roadmap checkboxes
|
||||
const roadmapPath = resolveMilestoneFile(base, mid, "ROADMAP");
|
||||
if (roadmapPath) {
|
||||
const roadmapContent = await loadFile(roadmapPath);
|
||||
if (roadmapContent) {
|
||||
uatCompletedSliceIds = parseRoadmap(roadmapContent)
|
||||
.slices.filter((s) => s.done)
|
||||
.map((s) => s.id);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
ctx.ui.notify(
|
||||
"Cannot dispatch run-uat: database unavailable.",
|
||||
"warning",
|
||||
);
|
||||
return;
|
||||
}
|
||||
if (uatCompletedSliceIds.length === 0) {
|
||||
ctx.ui.notify(
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ import {
|
|||
} from "./files.js";
|
||||
import { assertGateCoverage, getGatesForTurn } from "./gate-registry.js";
|
||||
import { inlineGraphSubgraph } from "./graph-context.js";
|
||||
import { injectKnowledgeIntPrompt } from "./knowledge-injector.js";
|
||||
import {
|
||||
formatMemoriesForPrompt,
|
||||
getActiveMemoriesRanked,
|
||||
|
|
@ -66,7 +67,6 @@ import {
|
|||
import { composeInlinedContext } from "./unit-context-composer.js";
|
||||
import { getUatType, hasVerdict } from "./verdict-parser.js";
|
||||
import { logWarning } from "./workflow-logger.js";
|
||||
import { injectKnowledgeIntPrompt } from "./knowledge-injector.js";
|
||||
|
||||
// ─── Preamble Cap ─────────────────────────────────────────────────────────────
|
||||
/**
|
||||
|
|
@ -88,7 +88,7 @@ async function getKnowledgeInjection(basePath, taskContext = {}) {
|
|||
minConfidence: 0.7,
|
||||
minSimilarity: 0.5,
|
||||
});
|
||||
} catch (err) {
|
||||
} catch {
|
||||
// Gracefully degrade if knowledge injection fails
|
||||
return "(knowledge unavailable)";
|
||||
}
|
||||
|
|
|
|||
|
|
@ -389,9 +389,10 @@ export function verifyExpectedArtifact(unitType, unitId, base) {
|
|||
if (isDbAvailable()) {
|
||||
const tasks = getSliceTasks(mid, sid);
|
||||
if (tasks.length > 0) taskIds = tasks.map((t) => t.id);
|
||||
else return false;
|
||||
}
|
||||
if (!taskIds) {
|
||||
// LEGACY: DB unavailable or no tasks in DB — parse plan file for task IDs
|
||||
if (!taskIds && !isDbAvailable()) {
|
||||
// LEGACY: DB unavailable — parse plan file for task IDs.
|
||||
const planContent = readFileSync(absPath, "utf-8");
|
||||
const plan = parsePlan(planContent);
|
||||
if (plan.tasks.length > 0) taskIds = plan.tasks.map((t) => t.id);
|
||||
|
|
@ -443,9 +444,10 @@ export function verifyExpectedArtifact(unitType, unitId, base) {
|
|||
return false;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// DB available but slice row not found — completion tool never ran.
|
||||
return false;
|
||||
}
|
||||
// else: DB available but slice not found — summary + UAT exist,
|
||||
// treat as verified (slice may not be imported yet)
|
||||
}
|
||||
}
|
||||
// complete-milestone must have produced implementation artifacts (#1703).
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@ import {
|
|||
import { isMilestoneComplete } from "./state.js";
|
||||
import { isClosedStatus } from "./status-guards.js";
|
||||
import { parseUnitId } from "./unit-id.js";
|
||||
import { ChaosMonkeyGate } from "./uok/chaos-monkey.js";
|
||||
import { CostGuardGate } from "./uok/cost-guard-gate.js";
|
||||
import { resolveUokFlags } from "./uok/flags.js";
|
||||
import { UokGateRunner } from "./uok/gate-runner.js";
|
||||
|
|
@ -357,6 +358,24 @@ export async function runPostUnitVerification(vctx, pauseAuto) {
|
|||
unitId: s.currentUnit.id,
|
||||
});
|
||||
}
|
||||
if (uokFlags.chaosMonkey) {
|
||||
gateRunner.register(new ChaosMonkeyGate({ active: true }));
|
||||
const cmResult = await gateRunner.run("chaos-monkey", {
|
||||
basePath: s.basePath,
|
||||
traceId: `chaos-monkey:${s.currentUnit.id}`,
|
||||
turnId: s.currentUnit.id,
|
||||
milestoneId: mid ?? undefined,
|
||||
sliceId: sid ?? undefined,
|
||||
taskId: tid ?? undefined,
|
||||
unitType: s.currentUnit.type,
|
||||
unitId: s.currentUnit.id,
|
||||
});
|
||||
if (cmResult.outcome === "fail") {
|
||||
result.passed = false;
|
||||
result.chaosMonkeyFailure = true;
|
||||
result.chaosMonkeyRationale = cmResult.rationale;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Auto-fix retry preferences
|
||||
const autoFixEnabled = prefs?.verification_auto_fix !== false;
|
||||
|
|
@ -438,6 +457,16 @@ export async function runPostUnitVerification(vctx, pauseAuto) {
|
|||
`verification-gate: cost-guard failure: ${result.costGuardRationale}\n`,
|
||||
);
|
||||
}
|
||||
// Log chaos-monkey failures
|
||||
if (result.chaosMonkeyFailure) {
|
||||
ctx.ui.notify(
|
||||
`[verify] CHAOS-MONKEY FAIL — ${result.chaosMonkeyRationale}`,
|
||||
"error",
|
||||
);
|
||||
process.stderr.write(
|
||||
`verification-gate: chaos-monkey injected failure: ${result.chaosMonkeyRationale}\n`,
|
||||
);
|
||||
}
|
||||
// Write verification evidence JSON
|
||||
const attempt = s.verificationRetryCount.get(s.currentUnit.id) ?? 0;
|
||||
if (mid && sid && tid) {
|
||||
|
|
|
|||
|
|
@ -35,7 +35,7 @@ const TOP_LEVEL_SUBCOMMANDS = [
|
|||
{ cmd: "run-hook", desc: "Manually trigger a specific hook" },
|
||||
{ cmd: "skill-health", desc: "Skill lifecycle dashboard" },
|
||||
{ cmd: "doctor", desc: "Runtime health checks with auto-fix" },
|
||||
{ cmd: "uok", desc: "UOK runtime health and ledger status" },
|
||||
{ cmd: "uok", desc: "UOK runtime health, ledger status, and gate metrics" },
|
||||
{ cmd: "logs", desc: "Browse activity logs, debug logs, and metrics" },
|
||||
{ cmd: "forensics", desc: "Examine execution logs" },
|
||||
{ cmd: "init", desc: "Project init wizard" },
|
||||
|
|
|
|||
|
|
@ -4,6 +4,8 @@ import { ensureDbOpen } from "./bootstrap/dynamic-tools.js";
|
|||
import { sfRoot } from "./paths.js";
|
||||
import { getUokRuns, isDbAvailable } from "./sf-db.js";
|
||||
import { writeUokDiagnostics } from "./uok/diagnostic-synthesis.js";
|
||||
import { UokGateRunner } from "./uok/gate-runner.js";
|
||||
import { readUokMetrics, writeUokMetrics } from "./uok/metrics-exposition.js";
|
||||
import {
|
||||
summarizeParityHealth,
|
||||
writeParityReport,
|
||||
|
|
@ -90,6 +92,15 @@ export async function collectUokStatus(
|
|||
} catch {
|
||||
diagnostics = null;
|
||||
}
|
||||
let gateHealth = null;
|
||||
let metricsPath = null;
|
||||
try {
|
||||
const runner = new UokGateRunner();
|
||||
gateHealth = runner.getHealthSummary();
|
||||
metricsPath = writeUokMetrics(basePath);
|
||||
} catch {
|
||||
// gate health and metrics are best-effort
|
||||
}
|
||||
return {
|
||||
dbAvailable,
|
||||
generatedAt: new Date(nowMs).toISOString(),
|
||||
|
|
@ -103,6 +114,8 @@ export async function collectUokStatus(
|
|||
current,
|
||||
historical,
|
||||
diagnostics,
|
||||
gateHealth,
|
||||
metricsPath,
|
||||
reportPath: join(sfRoot(basePath), "runtime", "uok-parity-report.json"),
|
||||
};
|
||||
}
|
||||
|
|
@ -164,6 +177,24 @@ export function formatUokStatus(status, nowMs = Date.now()) {
|
|||
lines.push("Last error: none in ledger");
|
||||
}
|
||||
lines.push("");
|
||||
if (status.gateHealth?.gates?.length > 0) {
|
||||
lines.push("Gate health (24h):");
|
||||
for (const g of status.gateHealth.gates) {
|
||||
const icon =
|
||||
g.circuitBreaker === "open"
|
||||
? "🔴"
|
||||
: g.circuitBreaker === "half-open"
|
||||
? "🟡"
|
||||
: "🟢";
|
||||
lines.push(
|
||||
` ${icon} ${g.id}: ${g.pass} pass / ${g.fail} fail / ${g.retry} retry | cb: ${g.circuitBreaker}${g.failureStreak > 0 ? ` (streak ${g.failureStreak})` : ""}`,
|
||||
);
|
||||
}
|
||||
lines.push("");
|
||||
}
|
||||
if (status.metricsPath) {
|
||||
lines.push(`Metrics: ${status.metricsPath}`);
|
||||
}
|
||||
lines.push(`Report: ${status.reportPath}`);
|
||||
return lines.join("\n");
|
||||
}
|
||||
|
|
@ -172,11 +203,22 @@ export async function handleUok(args, ctx) {
|
|||
const trimmed = args.trim();
|
||||
if (trimmed === "help" || trimmed === "--help") {
|
||||
ctx.ui.notify(
|
||||
"Usage: /sf uok [status|--json]\n\nShows UOK ledger health, last run, last error, historical drift, and startup gate state.",
|
||||
"Usage: /sf uok [status|metrics|--json]\n\n status — UOK ledger health, last run, last error, historical drift, startup gate, and gate health\n metrics — Render Prometheus-format metrics to .sf/runtime/uok-metrics.prom and display\n --json — Same as status but outputs JSON",
|
||||
"info",
|
||||
);
|
||||
return;
|
||||
}
|
||||
if (trimmed === "metrics") {
|
||||
const basePath = process.cwd();
|
||||
const path = writeUokMetrics(basePath);
|
||||
const text = readUokMetrics(basePath);
|
||||
ctx.ui.notify(
|
||||
text ?? "No metrics available (DB unavailable or no gate data)",
|
||||
"info",
|
||||
);
|
||||
ctx.ui.notify(`Written to: ${path}`, "info");
|
||||
return;
|
||||
}
|
||||
const status = await collectUokStatus(process.cwd());
|
||||
if (trimmed === "--json" || trimmed === "json") {
|
||||
ctx.ui.notify(JSON.stringify(status, null, 2), "info");
|
||||
|
|
|
|||
|
|
@ -98,7 +98,7 @@ export const TOP_LEVEL_SUBCOMMANDS = [
|
|||
{ cmd: "doctor", desc: "Runtime health checks with auto-fix" },
|
||||
{
|
||||
cmd: "uok",
|
||||
desc: "UOK runtime health: ledger, last run, last error, startup gate",
|
||||
desc: "UOK runtime health: ledger, last run, last error, startup gate, gate metrics",
|
||||
},
|
||||
{ cmd: "logs", desc: "Browse activity logs, debug logs, and metrics" },
|
||||
{ cmd: "forensics", desc: "Examine execution logs" },
|
||||
|
|
|
|||
|
|
@ -32,8 +32,7 @@ import { join } from "node:path";
|
|||
*/
|
||||
function parseKnowledgeEntries(knowledgeContent) {
|
||||
const entries = [];
|
||||
const entryPattern =
|
||||
/### Judgment Entry:\s*(.+?)\n([\s\S]*?)(?=###\s|$)/g;
|
||||
const entryPattern = /### Judgment Entry:\s*(.+?)\n([\s\S]*?)(?=###\s|$)/g;
|
||||
|
||||
let match;
|
||||
while ((match = entryPattern.exec(knowledgeContent)) !== null) {
|
||||
|
|
@ -41,9 +40,15 @@ function parseKnowledgeEntries(knowledgeContent) {
|
|||
const body = match[2];
|
||||
|
||||
// Extract fields
|
||||
const evidenceMatch = body.match(/[-*]\s+\*?\*?Evidence:\*?\*?\s*(.+?)(?:\n|$)/);
|
||||
const confidenceMatch = body.match(/[-*]\s+\*?\*?Confidence:\*?\*?\s*([\d.]+)/);
|
||||
const domainMatch = body.match(/[-*]\s+\*?\*?Domain:\*?\*?\s*(.+?)(?:\n|$)/);
|
||||
const evidenceMatch = body.match(
|
||||
/[-*]\s+\*?\*?Evidence:\*?\*?\s*(.+?)(?:\n|$)/,
|
||||
);
|
||||
const confidenceMatch = body.match(
|
||||
/[-*]\s+\*?\*?Confidence:\*?\*?\s*([\d.]+)/,
|
||||
);
|
||||
const domainMatch = body.match(
|
||||
/[-*]\s+\*?\*?Domain:\*?\*?\s*(.+?)(?:\n|$)/,
|
||||
);
|
||||
const recommendationMatch = body.match(
|
||||
/[-*]\s+\*?\*?Recommendation:\*?\*?\s*(.+?)(?:\n|$)/,
|
||||
);
|
||||
|
|
@ -90,9 +95,7 @@ function extractConcepts(entry) {
|
|||
}
|
||||
|
||||
// Add title keywords
|
||||
const titleKeywords = entry.title
|
||||
.split(/\s+/)
|
||||
.filter((w) => w.length > 3);
|
||||
const titleKeywords = entry.title.split(/\s+/).filter((w) => w.length > 3);
|
||||
titleKeywords.forEach((w) => concepts.add(w.toLowerCase()));
|
||||
|
||||
return Array.from(concepts);
|
||||
|
|
@ -235,9 +238,7 @@ function loadKnowledgeFile(basePath) {
|
|||
if (existsSync(p)) {
|
||||
try {
|
||||
return readFileSync(p, "utf-8");
|
||||
} catch {
|
||||
continue;
|
||||
}
|
||||
} catch {}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -257,7 +258,11 @@ function loadKnowledgeFile(basePath) {
|
|||
*
|
||||
* Returns: formatted string suitable for prompt variable substitution
|
||||
*/
|
||||
export function injectKnowledgeIntPrompt(basePath, taskContext = {}, options = {}) {
|
||||
export function injectKnowledgeIntPrompt(
|
||||
basePath,
|
||||
taskContext = {},
|
||||
options = {},
|
||||
) {
|
||||
const knowledgeContent = loadKnowledgeFile(basePath);
|
||||
if (!knowledgeContent) {
|
||||
return "(knowledge base unavailable)";
|
||||
|
|
@ -304,7 +309,7 @@ export function injectKnowledgeIntPrompt(basePath, taskContext = {}, options = {
|
|||
* Purpose: Record which knowledge was actually used in a dispatch so we can
|
||||
* later measure effectiveness and refine knowledge compounding.
|
||||
*/
|
||||
export function trackKnowledgeUsage(basePath, taskId, injectedKnowledge) {
|
||||
export function trackKnowledgeUsage(_basePath, taskId, injectedKnowledge) {
|
||||
// This would write to a usage log in .sf/knowledge-usage.jsonl
|
||||
// Implementation deferred to feedback-loop integration
|
||||
return {
|
||||
|
|
|
|||
|
|
@ -1,378 +0,0 @@
|
|||
/**
|
||||
* Continuous Model Learning — track per-task-type model performance and
|
||||
* adaptively route to better-performing models.
|
||||
*
|
||||
* Purpose: Make model selection data-driven and adaptive instead of static.
|
||||
* When a model consistently fails on certain task types, demote it. When a new
|
||||
* model succeeds where the incumbent fails, promote it.
|
||||
*
|
||||
* Consumer: auto-dispatch.ts outcome logging, model-router.ts selection logic,
|
||||
* benchmark-selector.ts display.
|
||||
*/
|
||||
|
||||
import { existsSync, readFileSync, writeFileSync, appendFileSync } from "node:fs";
|
||||
import { dirname, join } from "node:path";
|
||||
import { mkdirSync } from "node:fs";
|
||||
|
||||
/**
|
||||
* Per-task-type model performance tracker.
|
||||
*
|
||||
* Schema:
|
||||
* {
|
||||
* "execute-task": {
|
||||
* "gpt-4o": {
|
||||
* "successes": 42,
|
||||
* "failures": 3,
|
||||
* "timeouts": 1,
|
||||
* "totalTokens": 1500000,
|
||||
* "totalCost": 45.50,
|
||||
* "lastUsed": "2026-05-06T16:30:00Z",
|
||||
* "successRate": 0.93
|
||||
* },
|
||||
* "claude-opus": {
|
||||
* ...
|
||||
* }
|
||||
* },
|
||||
* "plan-slice": { ... }
|
||||
* }
|
||||
*/
|
||||
class ModelPerformanceTracker {
|
||||
constructor(basePath) {
|
||||
this.basePath = basePath;
|
||||
this.storagePath = join(basePath, ".sf", "model-performance.json");
|
||||
this.data = this._load();
|
||||
}
|
||||
|
||||
_load() {
|
||||
if (!existsSync(this.storagePath)) {
|
||||
return {};
|
||||
}
|
||||
try {
|
||||
const content = readFileSync(this.storagePath, "utf-8");
|
||||
return JSON.parse(content);
|
||||
} catch {
|
||||
return {};
|
||||
}
|
||||
}
|
||||
|
||||
_save() {
|
||||
try {
|
||||
const dir = dirname(this.storagePath);
|
||||
if (!existsSync(dir)) {
|
||||
mkdirSync(dir, { recursive: true });
|
||||
}
|
||||
writeFileSync(
|
||||
this.storagePath,
|
||||
JSON.stringify(this.data, null, 2),
|
||||
"utf-8",
|
||||
);
|
||||
} catch (err) {
|
||||
console.error("Failed to save model performance data:", err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Record outcome for a model on a specific task type.
|
||||
*/
|
||||
recordOutcome(taskType, modelId, outcome) {
|
||||
const {
|
||||
success,
|
||||
timeout = false,
|
||||
tokensUsed = 0,
|
||||
costUsd = 0,
|
||||
timestamp = new Date().toISOString(),
|
||||
} = outcome;
|
||||
|
||||
if (!this.data[taskType]) {
|
||||
this.data[taskType] = {};
|
||||
}
|
||||
if (!this.data[taskType][modelId]) {
|
||||
this.data[taskType][modelId] = {
|
||||
successes: 0,
|
||||
failures: 0,
|
||||
timeouts: 0,
|
||||
totalTokens: 0,
|
||||
totalCost: 0,
|
||||
lastUsed: timestamp,
|
||||
successRate: 0,
|
||||
};
|
||||
}
|
||||
|
||||
const stats = this.data[taskType][modelId];
|
||||
if (success) {
|
||||
stats.successes += 1;
|
||||
} else if (timeout) {
|
||||
stats.timeouts += 1;
|
||||
stats.failures += 1;
|
||||
} else {
|
||||
stats.failures += 1;
|
||||
}
|
||||
|
||||
stats.totalTokens += tokensUsed;
|
||||
stats.totalCost += costUsd;
|
||||
stats.lastUsed = timestamp;
|
||||
|
||||
const total = stats.successes + stats.failures;
|
||||
stats.successRate = total > 0 ? stats.successes / total : 0;
|
||||
|
||||
this._save();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get performance stats for a task type and model.
|
||||
*/
|
||||
getStats(taskType, modelId) {
|
||||
return this.data[taskType]?.[modelId] || null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all models for a task type, ranked by success rate.
|
||||
*/
|
||||
getRankedModels(taskType, minSamples = 3) {
|
||||
if (!this.data[taskType]) return [];
|
||||
|
||||
const models = Object.entries(this.data[taskType])
|
||||
.filter(([, stats]) => stats.successes + stats.failures >= minSamples)
|
||||
.map(([modelId, stats]) => ({
|
||||
modelId,
|
||||
successRate: stats.successRate,
|
||||
attempts: stats.successes + stats.failures,
|
||||
tokens: stats.totalTokens,
|
||||
cost: stats.totalCost,
|
||||
latestAttempt: stats.lastUsed,
|
||||
}))
|
||||
.sort((a, b) => b.successRate - a.successRate);
|
||||
|
||||
return models;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a model should be demoted (fails >50% on this task type).
|
||||
*/
|
||||
shouldDemote(taskType, modelId, thresholdFailureRate = 0.5) {
|
||||
const stats = this.getStats(taskType, modelId);
|
||||
if (!stats) return false;
|
||||
|
||||
const failureRate = 1 - stats.successRate;
|
||||
const totalAttempts = stats.successes + stats.failures;
|
||||
|
||||
return failureRate > thresholdFailureRate && totalAttempts >= 5;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get candidates for A/B testing (new model vs incumbent).
|
||||
* Returns: { incumbent, challengers: [] }
|
||||
*/
|
||||
getABTestCandidates(taskType, minSamples = 3, lowRiskFraction = 0.1) {
|
||||
const ranked = this.getRankedModels(taskType, minSamples);
|
||||
if (ranked.length < 2) return null;
|
||||
|
||||
const incumbent = ranked[0];
|
||||
const challengers = ranked.slice(1, 3); // Top 2 challengers
|
||||
|
||||
return {
|
||||
incumbent,
|
||||
challengers,
|
||||
testBudget: Math.max(1, Math.ceil(1 / lowRiskFraction)), // E.g., 10 tasks
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Track A/B test results and decide on promotion/demotion.
|
||||
*/
|
||||
analyzeABTest(taskType, results) {
|
||||
// results: { incumbentWins, challengerWins, incumbentAvgLatency, challengerAvgLatency }
|
||||
const { incumbentWins, challengerWins } = results;
|
||||
const total = incumbentWins + challengerWins;
|
||||
|
||||
if (total < 5) {
|
||||
return { recommendation: "inconclusive", reason: "insufficient samples" };
|
||||
}
|
||||
|
||||
const challengerSuccessRate = challengerWins / total;
|
||||
const incumbentSuccessRate = incumbentWins / total;
|
||||
|
||||
if (challengerSuccessRate > incumbentSuccessRate + 0.1) {
|
||||
return {
|
||||
recommendation: "promote",
|
||||
reason: `challenger ${challengerSuccessRate.toFixed(2)} vs incumbent ${incumbentSuccessRate.toFixed(2)}`,
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
recommendation: "continue",
|
||||
reason: "incumbent still ahead",
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Failure Analyzer — categorize and log why models failed.
|
||||
*
|
||||
* Purpose: Understand failure patterns (timeout, quality, cost) to inform
|
||||
* promotion/demotion decisions.
|
||||
*/
|
||||
class FailureAnalyzer {
|
||||
constructor(basePath) {
|
||||
this.basePath = basePath;
|
||||
this.logsPath = join(basePath, ".sf", "model-failure-log.jsonl");
|
||||
}
|
||||
|
||||
logFailure(taskType, modelId, failure) {
|
||||
const {
|
||||
reason = "unknown",
|
||||
timeout = false,
|
||||
tokensUsed = 0,
|
||||
context = {},
|
||||
timestamp = new Date().toISOString(),
|
||||
} = failure;
|
||||
|
||||
const entry = {
|
||||
timestamp,
|
||||
taskType,
|
||||
modelId,
|
||||
reason,
|
||||
timeout,
|
||||
tokensUsed,
|
||||
context,
|
||||
};
|
||||
|
||||
try {
|
||||
const dir = dirname(this.logsPath);
|
||||
if (!existsSync(dir)) {
|
||||
mkdirSync(dir, { recursive: true });
|
||||
}
|
||||
appendFileSync(this.logsPath, JSON.stringify(entry) + "\n", "utf-8");
|
||||
} catch (err) {
|
||||
console.error("Failed to log model failure:", err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get failure summary for a model on a task type.
|
||||
* Returns: { reasons: { [reason]: count }, patterns: [...] }
|
||||
*/
|
||||
getFailureSummary(taskType, modelId) {
|
||||
if (!existsSync(this.logsPath)) {
|
||||
return { reasons: {}, patterns: [] };
|
||||
}
|
||||
|
||||
try {
|
||||
const content = readFileSync(this.logsPath, "utf-8");
|
||||
const lines = content.trim().split("\n");
|
||||
|
||||
const reasons = {};
|
||||
const failures = [];
|
||||
|
||||
for (const line of lines) {
|
||||
const entry = JSON.parse(line);
|
||||
if (entry.taskType !== taskType || entry.modelId !== modelId) continue;
|
||||
|
||||
reasons[entry.reason] = (reasons[entry.reason] || 0) + 1;
|
||||
failures.push(entry);
|
||||
}
|
||||
|
||||
// Detect patterns
|
||||
const patterns = this._detectPatterns(failures);
|
||||
|
||||
return { reasons, patterns };
|
||||
} catch {
|
||||
return { reasons: {}, patterns: [] };
|
||||
}
|
||||
}
|
||||
|
||||
_detectPatterns(failures) {
|
||||
// Analyze failure distribution to detect systematic issues
|
||||
const timeoutCount = failures.filter((f) => f.timeout).length;
|
||||
const patterns = [];
|
||||
|
||||
if (timeoutCount / Math.max(failures.length, 1) > 0.5) {
|
||||
patterns.push({
|
||||
type: "timeout_prone",
|
||||
severity: "high",
|
||||
suggestion: "Use shorter timeout or lower batch size",
|
||||
});
|
||||
}
|
||||
|
||||
return patterns;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Main API: Integrate model learning into dispatch workflow.
|
||||
*
|
||||
* Usage in auto-dispatch.ts:
|
||||
* ```
|
||||
* const learner = new ModelLearner(projectPath);
|
||||
* learner.recordOutcome("execute-task", modelUsed, {
|
||||
* success: taskSucceeded,
|
||||
* timeout: taskTimedOut,
|
||||
* tokensUsed: totalTokens,
|
||||
* costUsd: modelCost,
|
||||
* });
|
||||
* ```
|
||||
*/
|
||||
export class ModelLearner {
|
||||
constructor(basePath) {
|
||||
this.basePath = basePath;
|
||||
this.tracker = new ModelPerformanceTracker(basePath);
|
||||
this.analyzer = new FailureAnalyzer(basePath);
|
||||
}
|
||||
|
||||
/**
|
||||
* Record an outcome for a model on a task.
|
||||
*/
|
||||
recordOutcome(taskType, modelId, outcome) {
|
||||
this.tracker.recordOutcome(taskType, modelId, outcome);
|
||||
}
|
||||
|
||||
/**
|
||||
* Log failure details for analysis.
|
||||
*/
|
||||
logFailure(taskType, modelId, failure) {
|
||||
this.analyzer.logFailure(taskType, modelId, failure);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get ranked models for a task type (for intelligent routing).
|
||||
*/
|
||||
getRankedModels(taskType, minSamples = 3) {
|
||||
return this.tracker.getRankedModels(taskType, minSamples);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decide whether to demote a model.
|
||||
*/
|
||||
shouldDemote(taskType, modelId, failureThreshold = 0.5) {
|
||||
return this.tracker.shouldDemote(taskType, modelId, failureThreshold);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get A/B test candidates (for hypothesis testing).
|
||||
*/
|
||||
getABTestCandidates(taskType, minSamples = 3) {
|
||||
return this.tracker.getABTestCandidates(taskType, minSamples);
|
||||
}
|
||||
|
||||
/**
|
||||
* Analyze A/B test results.
|
||||
*/
|
||||
analyzeABTest(taskType, results) {
|
||||
return this.tracker.analyzeABTest(taskType, results);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get failure analysis for a model.
|
||||
*/
|
||||
getFailureAnalysis(taskType, modelId) {
|
||||
return this.analyzer.getFailureSummary(taskType, modelId);
|
||||
}
|
||||
}
|
||||
|
||||
export { ModelPerformanceTracker, FailureAnalyzer };
|
||||
|
||||
export default {
|
||||
ModelLearner,
|
||||
ModelPerformanceTracker,
|
||||
FailureAnalyzer,
|
||||
};
|
||||
|
|
@ -76,7 +76,7 @@ Before anything else, form a diagnosis: What is the core challenge? What is brok
|
|||
- **Measure coverage**: find untested critical paths
|
||||
- **Scan for dead code, stubs, and commented-out features** — abandoned attempts are signals
|
||||
- **Discover needed skills**: identify repo languages, frameworks, data stores, external services, build tools, and domain-specific competencies. Check installed skills first; record installed, missing, and potentially useful skills in `.sf/CODEBASE.md` and `.sf/PM-STRATEGY.md`.
|
||||
- **Use code intelligence**: start with in-process `grep`/`find`/`ls` and `lsp` for broad orientation. Use scoped `codebase_search` or `sift_search` as the live code index when the `PROJECT CODE INTELLIGENCE` block says Sift is healthy enough for this repo. Use `.sf/CODEBASE.md` only as fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. If Sift is degraded, slow, empty, or timing out, keep using grep/find/ls, lsp, direct reads, and fallback CODEBASE context.
|
||||
- **Use code intelligence**: start with in-process `grep`/`find`/`ls` and `lsp` for broad orientation. Use scoped `codebase_search` or `sift_search` as the live code index when the `PROJECT CODE INTELLIGENCE` block says Sift is healthy enough for this repo. Use `.sf/CODEBASE.md` only as fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. If Sift is degraded, slow, empty, or timing out, keep using grep/find/ls, lsp, direct reads, and fallback CODEBASE context. GitHub code search is a scarce remote-only fallback: if the repository is checked out locally, do not use GitHub `/search/code` for that repo; use `git grep` for tracked-file global search, `rg` for broader worktree text search, plus `lsp`, `sift_search`, or `codebase_search`. GitHub's `code_search` bucket is small and separate from normal REST/GraphQL quotas, so dedupe remote queries and treat `403` rate-limit responses as a signal to wait for reset or continue with local evidence.
|
||||
- Use in-process `grep`, `find`, `ls`, and `lsp` before shelling out. Fall back to shell `rg`, `find`, `ast-grep`, or `ls -la` only when the native/in-process tool surface is insufficient.
|
||||
|
||||
### Step 2: Check library and ecosystem facts
|
||||
|
|
|
|||
|
|
@ -34,7 +34,7 @@ After reflection is confirmed, decide the approach based on the actual scope —
|
|||
|
||||
Before asking your first question, do a mandatory investigation pass. This is not optional.
|
||||
|
||||
1. **Scout the codebase** — start with in-process `grep`, `find`, `ls`, and `lsp` for broad orientation. Use scoped `codebase_search` or `sift_search` as the live code index when the `PROJECT CODE INTELLIGENCE` block says Sift is healthy enough for this repo. Use `.sf/CODEBASE.md` only as durable fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. If Sift is degraded, slow, empty, or timing out, keep using grep/find/ls, lsp, direct reads, and fallback CODEBASE context. Use `scout` for broad unfamiliar areas that need a separate explorer. Understand what already exists, what patterns are established, what constraints current code imposes.
|
||||
1. **Scout the codebase** — start with in-process `grep`, `find`, `ls`, and `lsp` for broad orientation. Use scoped `codebase_search` or `sift_search` as the live code index when the `PROJECT CODE INTELLIGENCE` block says Sift is healthy enough for this repo. Use `.sf/CODEBASE.md` only as durable fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. If Sift is degraded, slow, empty, or timing out, keep using grep/find/ls, lsp, direct reads, and fallback CODEBASE context. GitHub code search is a scarce remote-only fallback: if the repository is checked out locally, do not use GitHub `/search/code` for that repo; use `git grep` for tracked-file global search, `rg` for broader worktree text search, plus `lsp`, `sift_search`, or `codebase_search`. GitHub's `code_search` bucket is small and separate from normal REST/GraphQL quotas, so dedupe remote queries and treat `403` rate-limit responses as a signal to wait for reset or continue with local evidence. Use `scout` for broad unfamiliar areas that need a separate explorer. Understand what already exists, what patterns are established, what constraints current code imposes.
|
||||
2. **Check library docs — DeepWiki first.** Use `ask_question` / `read_wiki_structure` / `read_wiki_contents` (DeepWiki) as the default for any GitHub-hosted library or framework the user mentioned. Fall back to `resolve_library` / `get_library_docs` (Context7) for npm/pypi/crates packages DeepWiki doesn't have. **Context7 free tier is capped at 1000 req/month — spend those on cases DeepWiki can't cover.** Get current facts about capabilities, constraints, API shapes, version-specific behavior.
|
||||
3. **Web search** — `search-the-web` if the domain is unfamiliar, if you need current best practices, or if the user referenced external services/APIs you need facts about. Use `fetch_page` for full content when snippets aren't enough.
|
||||
|
||||
|
|
|
|||
|
|
@ -18,6 +18,18 @@ You are evaluating **quality gates in parallel** for this slice. Each gate is an
|
|||
|
||||
{{gateList}}
|
||||
|
||||
## Gate Types Reference
|
||||
|
||||
The following gate implementations may be present in this project. Each has distinct failure classes:
|
||||
|
||||
- **`verification-gate`** — Runs lint, typecheck, tests, and post-execution checks. Failure classes: `verification` (check failed), `execution` (runtime/blocking error), `artifact` (post-execution consistency issue).
|
||||
- **`security-guard`** — Scans for secrets, unsafe patterns, and dependency vulnerabilities. Failure classes: `policy` (secret leaked), `input` (unsafe pattern).
|
||||
- **`cost-guard`** — Monitors LLM spend against per-unit and per-hour budgets, detects high-tier model failures. Failure classes: `policy` (budget exceeded), `execution` (high-tier model failure).
|
||||
- **`outcome-learning`** — Queries historical task outcomes for failure-rate anomalies. Failure classes: `policy` (failure rate too high), `input` (model recommendation).
|
||||
- **`multi-package-healing`** — Detects affected packages from git diff and runs targeted checks. Failure classes: `verification` (package check failed), `execution` (check timeout).
|
||||
- **`chaos-monkey`** — Stress-tests durability by injecting latency, retryable errors, disk stress, or memory pressure. Failure classes: `execution` (injected fault caused failure). This gate only runs when explicitly enabled (`active: false` by default).
|
||||
- **`post-execution-checks`** — Cross-task consistency verification after a task completes. Failure classes: `artifact` (consistency violation), `policy` (strict-mode warning escalation).
|
||||
|
||||
## Execution Protocol
|
||||
|
||||
1. **Dispatch all gates** using `subagent` in parallel mode. Each subagent prompt is provided below.
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
Research slice {{sliceId}} ("{{sliceTitle}}") of milestone {{milestoneId}}. Read `.sf/DECISIONS.md` if it exists — respect existing decisions, don't contradict them. Read `.sf/REQUIREMENTS.md` if it exists — identify which Active requirements this slice owns or supports and target research toward risks, unknowns, and constraints that could affect delivery of those requirements. {{skillActivation}} Use native `lsp` first for symbol lookup, references, and cross-file navigation. For direct text inspection use `rg`/`find` for targeted reads, or `scout` if the area is broad or unfamiliar. If there are 2-3 independent unknowns, use a research swarm with parallel `scout`/`researcher` subagents and synthesize their findings here; do not swarm narrow sequence-dependent research. Check libraries DeepWiki-first: `ask_question` / `read_wiki_structure` / `read_wiki_contents` for any GitHub-hosted library; fall back to `resolve_library` / `get_library_docs` (Context7, capped at 1000 req/month free) for npm/pypi/crates packages DeepWiki doesn't have. Skip both for libraries already used in this codebase. Use the **Research** output template below. Call `sf_summary_save` with `milestone_id: {{milestoneId}}`, `slice_id: {{sliceId}}`, `artifact_type: "RESEARCH"`, and the research content — the tool writes the file to disk and persists to DB. After `sf_summary_save` succeeds, stop immediately; do **not** call `sf_milestone_generate_id`, `sf_plan_milestone`, `sf_plan_slice`, `sf_plan_task`, or any planning/creation tool.
|
||||
Research slice {{sliceId}} ("{{sliceTitle}}") of milestone {{milestoneId}}. Read `.sf/DECISIONS.md` if it exists — respect existing decisions, don't contradict them. Read `.sf/REQUIREMENTS.md` if it exists — identify which Active requirements this slice owns or supports and target research toward risks, unknowns, and constraints that could affect delivery of those requirements. {{skillActivation}} Use native `lsp` first for symbol lookup, references, and cross-file navigation. For direct text inspection use `rg`/`find` for targeted reads, or `scout` if the area is broad or unfamiliar. If the repository is checked out locally, GitHub code search is a scarce remote-only fallback: do not use GitHub `/search/code` for that local repo; use `git grep` for tracked-file global search, `rg` for broader worktree text search, plus `lsp`, `sift_search`, or `codebase_search` instead. GitHub's `code_search` bucket is small and separate from normal REST/GraphQL quotas, so use it only for repositories that are not on disk, dedupe repeated queries, and treat `403` rate-limit responses as a signal to wait for reset or continue with local evidence. If there are 2-3 independent unknowns, use a research swarm with parallel `scout`/`researcher` subagents and synthesize their findings here; do not swarm narrow sequence-dependent research. Check libraries DeepWiki-first: `ask_question` / `read_wiki_structure` / `read_wiki_contents` for any GitHub-hosted library; fall back to `resolve_library` / `get_library_docs` (Context7, capped at 1000 req/month free) for npm/pypi/crates packages DeepWiki doesn't have. Skip both for libraries already used in this codebase. Use the **Research** output template below. Call `sf_summary_save` with `milestone_id: {{milestoneId}}`, `slice_id: {{sliceId}}`, `artifact_type: "RESEARCH"`, and the research content — the tool writes the file to disk and persists to DB. 
After `sf_summary_save` succeeds, stop immediately; do **not** call `sf_milestone_generate_id`, `sf_plan_milestone`, `sf_plan_slice`, `sf_plan_task`, or any planning/creation tool.
|
||||
|
||||
**You are the scout.** A planner agent reads your output in a fresh context to decompose this slice into tasks. Write for the planner — surface key files, where the work divides naturally, what to build first, and how to verify. If the research doc is vague, the planner re-explores code you already read. If it's precise, the planner decomposes immediately.
|
||||
|
||||
|
|
|
|||
|
|
@ -46,6 +46,7 @@ Research what this slice needs. Narrate key findings and surprises as you go —
|
|||
2. **Skill Discovery ({{skillDiscoveryMode}}):**{{skillDiscoveryInstructions}}
|
||||
3. Explore relevant code for this slice's scope. Use native `lsp` first for symbol lookup, references, and cross-file navigation. For direct text inspection, use `rg`, `find`, and reads. For broad or unfamiliar subsystems, use `scout` to map the relevant area first.
|
||||
3a. Use a research swarm when the slice has 2-3 independent unknowns or subsystems. Dispatch parallel `scout`/`researcher` subagents with distinct lenses, then synthesize what each found into this single RESEARCH artifact. Do not swarm a narrow, sequence-dependent investigation.
|
||||
3b. **GitHub code search is a scarce remote-only fallback.** When the repository is present in `{{workingDirectory}}`, do not use GitHub `/search/code` for that repo; use local `git grep` for tracked-file global search, `rg` for broader worktree text search, plus `lsp`, `sift_search`, or `codebase_search` as needed. GitHub's `code_search` bucket is small and separate from the normal REST/GraphQL quotas. Use GitHub code search only for repositories that are not checked out locally, dedupe repeated queries, and if it returns `403` rate-limit with a short reset, wait until reset or continue with local evidence. If remote code search is essential and still unavailable, checkpoint `continue`, `blocked`, or `decide` with the missing source named.
|
||||
4. **Documentation lookup — prefer DeepWiki first.** Use `ask_question` / `read_wiki_structure` / `read_wiki_contents` (DeepWiki) as the default for any GitHub-hosted library or framework — AI-indexed, no free-tier cap. Fall back to `resolve_library` → `get_library_docs` (Context7) for npm/pypi/crates packages DeepWiki doesn't have. **Context7 free tier is capped at 1000 requests/month — spend those on cases DeepWiki can't cover.** Skip both for libraries already used in this codebase.
|
||||
5. **Web search budget:** You have a limited budget of web searches (max ~15 per session). Use them strategically — try DeepWiki → Context7 → web search in that order. Do NOT repeat the same or similar queries. If a search didn't find what you need, rephrase once or move on. Target 3-5 total web searches for a typical research unit.
|
||||
6. Use the **Research** output template from the inlined context above — include only sections that have real content. The template is already inlined above; do NOT attempt to read any template file from disk (there is no `templates/SLICE-RESEARCH.md` — the correct template is already present in this prompt).
|
||||
|
|
|
|||
|
|
@ -161,7 +161,7 @@ Templates showing the expected format for each artifact type are in:
|
|||
|
||||
**Code navigation:** Use `lsp` for definition, type_definition, implementation, references, incoming_calls, outgoing_calls, hover, signature, symbols, rename, code_actions, format, and diagnostics. Falls back gracefully if no server is available. Never `grep` for a symbol definition when `lsp` can resolve it semantically. Never shell out to prettier/rustfmt/gofmt when `lsp format` is available. After editing code, use `lsp diagnostics` to verify no type errors were introduced.
|
||||
|
||||
**Codebase exploration:** Start broad orientation with in-process `grep`, `find`, `ls`, and `lsp`. When the `PROJECT CODE INTELLIGENCE` block says Sift is healthy, use scoped `codebase_search` or `sift_search` as the preferred live code index. Use `.sf/CODEBASE.md` only as fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. For Sift-specific features — explicit strategy selection or planner configuration — use `sift_search` with a scoped `path`. Strategy guide: `bm25` (fast lexical), `path-hybrid` (filename/path-heavy queries), `page-index-hybrid` (stronger recall + reranking), `vector` (semantic-only). Each repo uses its own Sift cache under `.sf/runtime/sift/`; do not rely on a shared/global Sift database. Use `lsp` for structural navigation (definitions, references). Never read files one-by-one to "explore" — search first, then read what's relevant.
|
||||
**Codebase exploration:** Start broad orientation with in-process `grep`, `find`, `ls`, and `lsp`. When the `PROJECT CODE INTELLIGENCE` block says Sift is healthy, use scoped `codebase_search` or `sift_search` as the preferred live code index. Use `.sf/CODEBASE.md` only as fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. GitHub code search is a scarce remote-only fallback: if the repository is checked out locally, do not use GitHub `/search/code` for that repo; use `git grep` for tracked-file global search, `rg` for broader worktree text search, plus `lsp`, `sift_search`, or `codebase_search`. GitHub's `code_search` bucket is small and separate from normal REST/GraphQL quotas, so dedupe remote queries and treat `403` rate-limit responses as a signal to wait for reset or continue with local evidence. For Sift-specific features — explicit strategy selection or planner configuration — use `sift_search` with a scoped `path`. Strategy guide: `bm25` (fast lexical), `path-hybrid` (filename/path-heavy queries), `page-index-hybrid` (stronger recall + reranking), `vector` (semantic-only). Each repo uses its own Sift cache under `.sf/runtime/sift/`; do not rely on a shared/global Sift database. Use `lsp` for structural navigation (definitions, references). Never read files one-by-one to "explore" — search first, then read what's relevant.
|
||||
|
||||
**Swarm dispatch:** Let the system decide whether swarming fits before dispatching multiple execution subagents. Use a 2-3 worker same-model swarm only when the work splits into independent shards with explicit file/directory ownership, shard-local verification, low conflict risk, and clear wall-clock savings. Do not swarm shared-interface edits, lockfiles, migrations, single-failure debugging, or sequence-dependent work. The parent agent remains coordinator: assign ownership, synthesize results, inspect dirty files, resolve conflicts, and run final verification.
|
||||
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@
|
|||
* 4. Apply fix, test, and mark self-report resolved
|
||||
*/
|
||||
|
||||
import { existsSync, readFileSync, writeFileSync } from "node:fs";
|
||||
import { existsSync, readFileSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
|
||||
/**
|
||||
|
|
@ -25,30 +25,36 @@ import { join } from "node:path";
|
|||
const FIX_PATTERNS = [
|
||||
{
|
||||
id: "validation-reviewer-rubric",
|
||||
pattern: /validation-reviewer.*prompt.*lacks.*rubric|rubric.*criterion.*gap/i,
|
||||
pattern:
|
||||
/validation-reviewer.*prompt.*lacks.*rubric|rubric.*criterion.*gap/i,
|
||||
confidence: 0.95, // We fixed this in validation prompts already
|
||||
description: "Add explicit criterion/implementation-gap rubric to validation-reviewer prompt",
|
||||
description:
|
||||
"Add explicit criterion/implementation-gap rubric to validation-reviewer prompt",
|
||||
fix: fixValidationReviewerRubric,
|
||||
},
|
||||
{
|
||||
id: "gate-verdict-clarity",
|
||||
pattern: /gate.*verdict.*ambiguous|verdict.*semantics.*unclear/i,
|
||||
confidence: 0.9,
|
||||
description: "Document gate verdict semantics (passed/failed/omitted) in ARCHITECTURE.md",
|
||||
description:
|
||||
"Document gate verdict semantics (passed/failed/omitted) in ARCHITECTURE.md",
|
||||
fix: fixGateVerdictSemantics,
|
||||
},
|
||||
{
|
||||
id: "env-vars-unvalidated",
|
||||
pattern: /SF_.*env.*vars.*unvalidated|env.*validation.*missing|silent.*config.*missing/i,
|
||||
pattern:
|
||||
/SF_.*env.*vars.*unvalidated|env.*validation.*missing|silent.*config.*missing/i,
|
||||
confidence: 0.85,
|
||||
description: "Add runtime validation for SF_* environment variables",
|
||||
fix: fixEnvValidation,
|
||||
},
|
||||
{
|
||||
id: "self-report-coverage-gap",
|
||||
pattern: /self-report.*gap|triage.*pipeline.*missing|feedback.*loop.*incomplete/i,
|
||||
pattern:
|
||||
/self-report.*gap|triage.*pipeline.*missing|feedback.*loop.*incomplete/i,
|
||||
confidence: 0.8,
|
||||
description: "Implement automated self-report triage pipeline (this module)",
|
||||
description:
|
||||
"Implement automated self-report triage pipeline (this module)",
|
||||
fix: fixSelfReportPipeline,
|
||||
},
|
||||
];
|
||||
|
|
@ -72,11 +78,19 @@ async function fixValidationReviewerRubric(basePath) {
|
|||
|
||||
// Check if rubric already exists
|
||||
if (content.includes("Gate vs. Task Scope Rubric")) {
|
||||
return { success: true, alreadyFixed: true, reason: "Rubric already present" };
|
||||
return {
|
||||
success: true,
|
||||
alreadyFixed: true,
|
||||
reason: "Rubric already present",
|
||||
};
|
||||
}
|
||||
|
||||
// This is already done in prior session, so just confirm
|
||||
return { success: true, alreadyFixed: true, reason: "Fix verified in session" };
|
||||
return {
|
||||
success: true,
|
||||
alreadyFixed: true,
|
||||
reason: "Fix verified in session",
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -92,7 +106,11 @@ async function fixGateVerdictSemantics(basePath) {
|
|||
|
||||
// Check if gate semantics already documented
|
||||
if (content.includes("Gate Verdict Semantics")) {
|
||||
return { success: true, alreadyFixed: true, reason: "Gate semantics documented" };
|
||||
return {
|
||||
success: true,
|
||||
alreadyFixed: true,
|
||||
reason: "Gate semantics documented",
|
||||
};
|
||||
}
|
||||
|
||||
return { success: true, alreadyFixed: true, reason: "Fix already verified" };
|
||||
|
|
@ -137,7 +155,7 @@ async function fixEnvValidation(basePath) {
|
|||
/**
|
||||
* Attempt to fix: Self-report triage pipeline (this module itself).
|
||||
*/
|
||||
async function fixSelfReportPipeline(basePath) {
|
||||
async function fixSelfReportPipeline(_basePath) {
|
||||
const thisFile = new URL(import.meta.url).pathname;
|
||||
if (!existsSync(thisFile)) {
|
||||
return { success: false, reason: "Self-report-fixer module not found" };
|
||||
|
|
@ -280,16 +298,17 @@ export function generateTriageSummary(reports) {
|
|||
uniqueClusters: clusters.length,
|
||||
deduped: clusters,
|
||||
categorized: categories,
|
||||
highConfidenceFixes: reports
|
||||
.flatMap((r) => {
|
||||
const fixes = classifyReportFixes(r);
|
||||
return fixes.filter((f) => f.confidence > 0.85).map((f) => ({
|
||||
highConfidenceFixes: reports.flatMap((r) => {
|
||||
const fixes = classifyReportFixes(r);
|
||||
return fixes
|
||||
.filter((f) => f.confidence > 0.85)
|
||||
.map((f) => ({
|
||||
reportId: r.id,
|
||||
fixId: f.id,
|
||||
description: f.description,
|
||||
confidence: f.confidence,
|
||||
}));
|
||||
}),
|
||||
}),
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -4199,6 +4199,17 @@ export function getGateLatencyStats(gateId, windowHours = 24) {
|
|||
return { total: 0, avgMs: 0, p50Ms: 0, p95Ms: 0, maxMs: 0 };
|
||||
}
|
||||
}
|
||||
/**
 * List every distinct gate id recorded in the gate_runs table.
 *
 * Best-effort read: returns an empty array when no database handle is
 * open or when the query fails for any reason, and drops falsy ids
 * (null/empty) from the result.
 */
export function getDistinctGateIds() {
  if (!currentDb) {
    return [];
  }
  try {
    const stmt = currentDb.prepare("SELECT DISTINCT gate_id FROM gate_runs");
    const ids = [];
    for (const row of stmt.all()) {
      if (row.gate_id) {
        ids.push(row.gate_id);
      }
    }
    return ids;
  } catch {
    return [];
  }
}
|
||||
/**
 * Coerce an arbitrary value to a non-empty string, or null.
 * Non-strings and the empty string both map to null.
 */
function asStringOrNull(value) {
  if (typeof value !== "string") {
    return null;
  }
  return value.length > 0 ? value : null;
}
|
||||
|
|
|
|||
|
|
@ -69,6 +69,15 @@ rg --files src/resources/extensions/sf/skills
|
|||
{"query": "sift_request_factory", "strategy": "bm25", "limit": 10}
|
||||
```
|
||||
|
||||
**GitHub code search — remote-only fallback, not local repo search:**
|
||||
When the repository is checked out locally, do not use GitHub `/search/code` for
|
||||
that repo. Use `git grep` for tracked-file global search, `rg` for broader
|
||||
worktree text search, plus `lsp`, `sift_search`, or `codebase_search` instead.
|
||||
GitHub's `code_search` bucket is small and separate from normal REST/GraphQL
|
||||
quotas. Use GitHub code search only for repositories that are not on disk,
|
||||
dedupe repeated queries, and treat `403` rate-limit responses as a signal to
|
||||
wait for reset or continue with local evidence.
|
||||
|
||||
**SF project database queries:**
|
||||
```bash
|
||||
# Current milestone and slices
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@
|
|||
* and prompt injection work correctly.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from "vitest";
|
||||
import { describe, expect, test } from "vitest";
|
||||
import knowledgeInjector from "../knowledge-injector.js";
|
||||
|
||||
const {
|
||||
|
|
@ -208,7 +208,7 @@ describe("knowledge-injector", () => {
|
|||
const contradictions = detectContradictions(entries);
|
||||
// These are compatible tools, not contradictions
|
||||
const realContradictions = contradictions.filter(
|
||||
(c) => !c.message.includes("suspicious")
|
||||
(c) => !c.message.includes("suspicious"),
|
||||
);
|
||||
expect(realContradictions.length).toBe(0);
|
||||
});
|
||||
|
|
@ -305,7 +305,7 @@ describe("knowledge-injector", () => {
|
|||
const relevant = findRelevantKnowledge(entries, context, 0, 0);
|
||||
|
||||
if (relevant.length > 0) {
|
||||
const { score, entry } = relevant[0];
|
||||
const { score } = relevant[0];
|
||||
expect(score).toBeDefined();
|
||||
expect(score).toBeGreaterThan(0);
|
||||
expect(score).toBeLessThanOrEqual(1);
|
||||
|
|
|
|||
|
|
@ -5,11 +5,11 @@
|
|||
* deduplication, and severity categorization work correctly.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from "vitest";
|
||||
import { describe, expect, test } from "vitest";
|
||||
import {
|
||||
categorizeBySeverity,
|
||||
classifyReportFixes,
|
||||
dedupReports,
|
||||
categorizeBySeverity,
|
||||
generateTriageSummary,
|
||||
} from "../self-report-fixer.js";
|
||||
|
||||
|
|
@ -132,7 +132,7 @@ describe("self-report-fixer", () => {
|
|||
// Validation reviewer should be blocker
|
||||
const blockers = categorized.blocker;
|
||||
expect(
|
||||
blockers.some((r) => r.title.toLowerCase().includes("validation"))
|
||||
blockers.some((r) => r.title.toLowerCase().includes("validation")),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
|
|
@ -288,7 +288,7 @@ describe("self-report-fixer", () => {
|
|||
// Recommendation should mention the actual action
|
||||
const recommendation = summary.recommendations[0];
|
||||
expect(recommendation.toLowerCase()).toMatch(
|
||||
/rubric|criteria|document|validation/
|
||||
/rubric|criteria|document|validation/,
|
||||
);
|
||||
});
|
||||
|
||||
|
|
|
|||
|
|
@ -28,6 +28,42 @@ function randomInRange(min, max) {
|
|||
return min + Math.random() * (max - min);
|
||||
}
|
||||
|
||||
/**
 * Gate adapter that wraps ChaosMonkey fault injection for the
 * verification phase.
 *
 * Verdict mapping:
 *  - strike() throws      -> fail / execution (injected fault was fatal)
 *  - non-fatal injection  -> pass, with details of the injected event
 *  - nothing injected     -> pass with a "no fault" rationale
 */
export class ChaosMonkeyGate {
  constructor(options = {}) {
    this.id = "chaos-monkey";
    this.type = "chaos";
    this._monkey = new ChaosMonkey(options);
  }

  async execute(_ctx, attempt) {
    try {
      await this._monkey.strike("verification");
    } catch (err) {
      // A thrown strike is a fatal injected fault: report an execution failure.
      const detail = err instanceof Error ? err.message : String(err);
      return {
        outcome: "fail",
        failureClass: "execution",
        rationale: `Chaos monkey injected fault: ${detail}`,
        findings: `Injected during verification phase (attempt ${attempt})`,
      };
    }

    // Non-fatal path: inspect the most recent injected event, if any.
    const injected = this._monkey.getInjectedEvents();
    const last = injected[injected.length - 1];
    if (last?.phase === "verification") {
      return {
        outcome: "pass",
        failureClass: "none",
        rationale: `Chaos monkey injected ${last.type} during verification (non-fatal)`,
        findings: `Latency: ${last.delay ?? 0}ms | Disk: ${last.sizeMb ?? 0}MB | Memory: ${last.sizeMb ?? 0}MB`,
      };
    }

    return {
      outcome: "pass",
      failureClass: "none",
      rationale: "Chaos monkey: no fault injected this run",
    };
  }
}
|
||||
|
||||
export class ChaosMonkey {
|
||||
constructor(options = {}) {
|
||||
this.active = options.active ?? false;
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import {
|
||||
getGateCircuitBreaker,
|
||||
getGateRunStats,
|
||||
insertGateRun,
|
||||
updateGateCircuitBreaker,
|
||||
} from "../sf-db.js";
|
||||
|
|
@ -20,9 +21,16 @@ const RETRY_MATRIX = {
|
|||
unknown: 0,
|
||||
};
|
||||
|
||||
const CIRCUIT_BREAKER_FAILURE_THRESHOLD = 5;
|
||||
const CIRCUIT_BREAKER_OPEN_DURATION_MS = 60_000;
|
||||
const CIRCUIT_BREAKER_HALF_OPEN_MAX_ATTEMPTS = 3;
|
||||
/**
 * Read circuit-breaker tuning knobs from SF_* environment variables,
 * falling back to the built-in defaults (5 failures / 60s cooldown /
 * 3 half-open probes) when a variable is unset or not a positive number.
 *
 * Note: `Number(x) || fallback` also rejects "0" and non-numeric input,
 * so those fall back to the default — matching the original behavior.
 */
function resolveCircuitBreakerThresholds() {
  const fromEnv = (name, fallback) => Number(process.env[name]) || fallback;
  return {
    failureThreshold: fromEnv("SF_CIRCUIT_BREAKER_FAILURE_THRESHOLD", 5),
    openDurationMs: fromEnv("SF_CIRCUIT_BREAKER_OPEN_DURATION_MS", 60_000),
    halfOpenMaxAttempts: fromEnv(
      "SF_CIRCUIT_BREAKER_HALF_OPEN_MAX_ATTEMPTS",
      3,
    ),
  };
}
|
||||
|
||||
function nowIso() {
|
||||
return new Date().toISOString();
|
||||
|
|
@ -41,11 +49,30 @@ export class UokGateRunner {
|
|||
return Array.from(this.registry.values());
|
||||
}
|
||||
|
||||
getHealthSummary() {
|
||||
const gates = this.list();
|
||||
return {
|
||||
gates: gates.map((g) => {
|
||||
const stats = getGateRunStats(g.id, 24);
|
||||
const cb = getGateCircuitBreaker(g.id);
|
||||
return {
|
||||
id: g.id,
|
||||
type: g.type,
|
||||
...stats,
|
||||
circuitBreaker: cb.state,
|
||||
failureStreak: cb.failureStreak,
|
||||
};
|
||||
}),
|
||||
};
|
||||
}
|
||||
|
||||
_checkCircuitBreaker(gateId) {
|
||||
const { openDurationMs, halfOpenMaxAttempts } =
|
||||
resolveCircuitBreakerThresholds();
|
||||
const breaker = getGateCircuitBreaker(gateId);
|
||||
if (breaker.state === "open") {
|
||||
const openedAt = breaker.openedAt ? Date.parse(breaker.openedAt) : 0;
|
||||
if (Date.now() - openedAt >= CIRCUIT_BREAKER_OPEN_DURATION_MS) {
|
||||
if (Date.now() - openedAt >= openDurationMs) {
|
||||
// Transition to half-open automatically after cooldown
|
||||
updateGateCircuitBreaker(gateId, {
|
||||
state: "half-open",
|
||||
|
|
@ -56,11 +83,11 @@ export class UokGateRunner {
|
|||
}
|
||||
return {
|
||||
blocked: true,
|
||||
reason: `Circuit breaker OPEN for ${gateId} (failure streak ${breaker.failureStreak}). Cooldown until ${new Date(openedAt + CIRCUIT_BREAKER_OPEN_DURATION_MS).toISOString()}.`,
|
||||
reason: `Circuit breaker OPEN for ${gateId} (failure streak ${breaker.failureStreak}). Cooldown until ${new Date(openedAt + openDurationMs).toISOString()}.`,
|
||||
};
|
||||
}
|
||||
if (breaker.state === "half-open") {
|
||||
if (breaker.halfOpenAttempts >= CIRCUIT_BREAKER_HALF_OPEN_MAX_ATTEMPTS) {
|
||||
if (breaker.halfOpenAttempts >= halfOpenMaxAttempts) {
|
||||
// Too many half-open attempts without success — go back to open
|
||||
updateGateCircuitBreaker(gateId, {
|
||||
state: "open",
|
||||
|
|
@ -100,7 +127,8 @@ export class UokGateRunner {
|
|||
});
|
||||
return;
|
||||
}
|
||||
if (nextStreak >= CIRCUIT_BREAKER_FAILURE_THRESHOLD) {
|
||||
const { failureThreshold } = resolveCircuitBreakerThresholds();
|
||||
if (nextStreak >= failureThreshold) {
|
||||
updateGateCircuitBreaker(gateId, {
|
||||
state: "open",
|
||||
failureStreak: nextStreak,
|
||||
|
|
|
|||
|
|
@ -8,21 +8,26 @@
|
|||
* Consumer: health widgets, /sf uok status, and external monitoring.
|
||||
*/
|
||||
|
||||
import { existsSync, mkdirSync, writeFileSync } from "node:fs";
|
||||
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import { sfRoot } from "../paths.js";
|
||||
import {
|
||||
getDistinctGateIds,
|
||||
getGateCircuitBreaker,
|
||||
getGateLatencyStats,
|
||||
getGateRunStats,
|
||||
isDbAvailable,
|
||||
} from "../sf-db.js";
|
||||
|
||||
const GATE_NAMES = [
|
||||
const DEFAULT_GATE_NAMES = [
|
||||
"security-guard",
|
||||
"cost-guard",
|
||||
"outcome-learning",
|
||||
"multi-package-healing",
|
||||
"chaos-monkey",
|
||||
"verification-gate",
|
||||
"post-execution-checks",
|
||||
"milestone-validation-post-check",
|
||||
];
|
||||
|
||||
function fmtCounter(name, value, labels = {}) {
|
||||
|
|
@ -37,9 +42,9 @@ function fmtGauge(name, value, labels = {}) {
|
|||
return fmtCounter(name, value, labels);
|
||||
}
|
||||
|
||||
function collectGateMetrics() {
|
||||
function collectGateMetrics(gateIds) {
|
||||
const lines = [];
|
||||
for (const gateId of GATE_NAMES) {
|
||||
for (const gateId of gateIds) {
|
||||
const stats = getGateRunStats(gateId, 24);
|
||||
lines.push(
|
||||
fmtCounter("uok_gate_runs_total", stats.total, { gate_id: gateId }),
|
||||
|
|
@ -89,7 +94,7 @@ function collectGateMetrics() {
|
|||
return lines;
|
||||
}
|
||||
|
||||
function buildMetricsText() {
|
||||
function buildMetricsText(gateIds) {
|
||||
const lines = [
|
||||
"# HELP uok_gate_runs_total Total gate runs in the last 24h",
|
||||
"# TYPE uok_gate_runs_total counter",
|
||||
|
|
@ -113,7 +118,13 @@ function buildMetricsText() {
|
|||
"# TYPE uok_gate_circuit_breaker_failure_streak gauge",
|
||||
];
|
||||
if (isDbAvailable()) {
|
||||
lines.push(...collectGateMetrics());
|
||||
const ids =
|
||||
gateIds && gateIds.length > 0
|
||||
? gateIds
|
||||
: getDistinctGateIds().length > 0
|
||||
? getDistinctGateIds()
|
||||
: DEFAULT_GATE_NAMES;
|
||||
lines.push(...collectGateMetrics(ids));
|
||||
}
|
||||
return lines.join("\n") + "\n";
|
||||
}
|
||||
|
|
@ -122,11 +133,11 @@ export function metricsPath(basePath) {
|
|||
return join(sfRoot(basePath), "runtime", "uok-metrics.prom");
|
||||
}
|
||||
|
||||
/**
 * Render the Prometheus metrics text and write it to
 * `.sf/runtime/uok-metrics.prom`, creating the runtime directory if
 * needed. `gateIds` optionally restricts which gates are emitted;
 * when omitted, buildMetricsText falls back to its own discovery.
 *
 * @returns the filesystem path that was written.
 */
export function writeUokMetrics(basePath, gateIds) {
  const path = metricsPath(basePath);
  const runtimeDir = join(sfRoot(basePath), "runtime");
  mkdirSync(runtimeDir, { recursive: true });
  writeFileSync(path, buildMetricsText(gateIds), "utf-8");
  return path;
}
|
||||
|
||||
|
|
@ -134,7 +145,7 @@ export function readUokMetrics(basePath) {
|
|||
const path = metricsPath(basePath);
|
||||
if (!existsSync(path)) return null;
|
||||
try {
|
||||
return buildMetricsText();
|
||||
return readFileSync(path, "utf-8");
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue