test: harden uok self-evolution paths

This commit is contained in:
Mikael Hugo 2026-05-06 22:55:35 +02:00
parent 69d3114265
commit 30f8738585
25 changed files with 314 additions and 481 deletions

View file

@ -104,3 +104,15 @@ test("set_widget_when_widget_host_missing_ignores_factory_without_throwing", ()
{ placement: "belowEditor" },
);
});
// Hosts driven over RPC (headless) may omit every interactive dialog
// method; each dialog helper must resolve to its safe fallback value
// instead of rejecting.
test("dialog_methods_when_host_dialogs_missing_degrade_without_throwing", async () => {
  const context = createExtensionUIContext({
    // RPC/headless-style hosts may not implement interactive dialogs.
  });
  await assert.doesNotReject(async () => {
    const confirmed = await context.confirm("Proceed?", "Dangerous command");
    assert.equal(confirmed, false);
    const selection = await context.select("Pick one", ["a", "b"]);
    assert.equal(selection, undefined);
    const typed = await context.input("Value", "placeholder");
    assert.equal(typed, undefined);
  });
});

View file

@ -96,12 +96,24 @@ function createWidgetSetter(
export function createExtensionUIContext(host: any): ExtensionUIContext {
const setWidget = createWidgetSetter(host);
return {
select: (title, options, opts) =>
host.showExtensionSelector(title, options, opts),
confirm: (title, message, opts) =>
host.showExtensionConfirm(title, message, opts),
input: (title, placeholder, opts) =>
host.showExtensionInput(title, placeholder, opts),
select: (title, options, opts) => {
if (typeof host.showExtensionSelector !== "function") {
return Promise.resolve(undefined);
}
return host.showExtensionSelector(title, options, opts);
},
confirm: (title, message, opts) => {
if (typeof host.showExtensionConfirm !== "function") {
return Promise.resolve(false);
}
return host.showExtensionConfirm(title, message, opts);
},
input: (title, placeholder, opts) => {
if (typeof host.showExtensionInput !== "function") {
return Promise.resolve(undefined);
}
return host.showExtensionInput(title, placeholder, opts);
},
notify: (message, type) => notifyHost(host, message, type),
onTerminalInput: (handler) =>
host.addExtensionTerminalInputListener(handler),

View file

@ -17,12 +17,7 @@ import {
} from "./auto-prompts.js";
import { scopeActiveToolsForUnitType } from "./constants.js";
import { loadFile } from "./files.js";
import { parseRoadmap } from "./parsers.js";
import {
relSliceFile,
resolveMilestoneFile,
resolveSliceFile,
} from "./paths.js";
import { relSliceFile, resolveSliceFile } from "./paths.js";
import { loadEffectiveSFPreferences } from "./preferences.js";
import { getMilestoneSlices, isDbAvailable } from "./sf-db.js";
import { deriveState } from "./state.js";
@ -181,24 +176,17 @@ export async function dispatchDirectPhase(ctx, pi, phase, base) {
}
case "reassess":
case "reassess-roadmap": {
// DB primary path — get completed slices, fall back to file parsing when DB has no data
let completedSliceIds = [];
if (isDbAvailable()) {
completedSliceIds = getMilestoneSlices(mid)
.filter((s) => s.status === "complete")
.map((s) => s.id);
}
if (completedSliceIds.length === 0) {
// File-based fallback: parse roadmap checkboxes
const roadmapPath = resolveMilestoneFile(base, mid, "ROADMAP");
if (roadmapPath) {
const roadmapContent = await loadFile(roadmapPath);
if (roadmapContent) {
completedSliceIds = parseRoadmap(roadmapContent)
.slices.filter((s) => s.done)
.map((s) => s.id);
}
}
} else {
ctx.ui.notify(
"Cannot dispatch reassess-roadmap: database unavailable.",
"warning",
);
return;
}
if (completedSliceIds.length === 0) {
ctx.ui.notify(
@ -223,24 +211,18 @@ export async function dispatchDirectPhase(ctx, pi, phase, base) {
// UAT targets the most recently completed slice, not the active (next
// incomplete) slice. After slice completion, state.activeSlice advances
// to the next incomplete slice, so we find the last done slice from the
// roadmap instead (#1693).
// DB instead (#1693).
let uatCompletedSliceIds = [];
if (isDbAvailable()) {
uatCompletedSliceIds = getMilestoneSlices(mid)
.filter((s) => s.status === "complete")
.map((s) => s.id);
}
if (uatCompletedSliceIds.length === 0) {
// File-based fallback: parse roadmap checkboxes
const roadmapPath = resolveMilestoneFile(base, mid, "ROADMAP");
if (roadmapPath) {
const roadmapContent = await loadFile(roadmapPath);
if (roadmapContent) {
uatCompletedSliceIds = parseRoadmap(roadmapContent)
.slices.filter((s) => s.done)
.map((s) => s.id);
}
}
} else {
ctx.ui.notify(
"Cannot dispatch run-uat: database unavailable.",
"warning",
);
return;
}
if (uatCompletedSliceIds.length === 0) {
ctx.ui.notify(

View file

@ -24,6 +24,7 @@ import {
} from "./files.js";
import { assertGateCoverage, getGatesForTurn } from "./gate-registry.js";
import { inlineGraphSubgraph } from "./graph-context.js";
import { injectKnowledgeIntPrompt } from "./knowledge-injector.js";
import {
formatMemoriesForPrompt,
getActiveMemoriesRanked,
@ -66,7 +67,6 @@ import {
import { composeInlinedContext } from "./unit-context-composer.js";
import { getUatType, hasVerdict } from "./verdict-parser.js";
import { logWarning } from "./workflow-logger.js";
import { injectKnowledgeIntPrompt } from "./knowledge-injector.js";
// ─── Preamble Cap ─────────────────────────────────────────────────────────────
/**
@ -88,7 +88,7 @@ async function getKnowledgeInjection(basePath, taskContext = {}) {
minConfidence: 0.7,
minSimilarity: 0.5,
});
} catch (err) {
} catch {
// Gracefully degrade if knowledge injection fails
return "(knowledge unavailable)";
}

View file

@ -389,9 +389,10 @@ export function verifyExpectedArtifact(unitType, unitId, base) {
if (isDbAvailable()) {
const tasks = getSliceTasks(mid, sid);
if (tasks.length > 0) taskIds = tasks.map((t) => t.id);
else return false;
}
if (!taskIds) {
// LEGACY: DB unavailable or no tasks in DB — parse plan file for task IDs
if (!taskIds && !isDbAvailable()) {
// LEGACY: DB unavailable — parse plan file for task IDs.
const planContent = readFileSync(absPath, "utf-8");
const plan = parsePlan(planContent);
if (plan.tasks.length > 0) taskIds = plan.tasks.map((t) => t.id);
@ -443,9 +444,10 @@ export function verifyExpectedArtifact(unitType, unitId, base) {
return false;
}
}
} else {
// DB available but slice row not found — completion tool never ran.
return false;
}
// else: DB available but slice not found — summary + UAT exist,
// treat as verified (slice may not be imported yet)
}
}
// complete-milestone must have produced implementation artifacts (#1703).

View file

@ -25,6 +25,7 @@ import {
import { isMilestoneComplete } from "./state.js";
import { isClosedStatus } from "./status-guards.js";
import { parseUnitId } from "./unit-id.js";
import { ChaosMonkeyGate } from "./uok/chaos-monkey.js";
import { CostGuardGate } from "./uok/cost-guard-gate.js";
import { resolveUokFlags } from "./uok/flags.js";
import { UokGateRunner } from "./uok/gate-runner.js";
@ -357,6 +358,24 @@ export async function runPostUnitVerification(vctx, pauseAuto) {
unitId: s.currentUnit.id,
});
}
if (uokFlags.chaosMonkey) {
gateRunner.register(new ChaosMonkeyGate({ active: true }));
const cmResult = await gateRunner.run("chaos-monkey", {
basePath: s.basePath,
traceId: `chaos-monkey:${s.currentUnit.id}`,
turnId: s.currentUnit.id,
milestoneId: mid ?? undefined,
sliceId: sid ?? undefined,
taskId: tid ?? undefined,
unitType: s.currentUnit.type,
unitId: s.currentUnit.id,
});
if (cmResult.outcome === "fail") {
result.passed = false;
result.chaosMonkeyFailure = true;
result.chaosMonkeyRationale = cmResult.rationale;
}
}
}
// Auto-fix retry preferences
const autoFixEnabled = prefs?.verification_auto_fix !== false;
@ -438,6 +457,16 @@ export async function runPostUnitVerification(vctx, pauseAuto) {
`verification-gate: cost-guard failure: ${result.costGuardRationale}\n`,
);
}
// Log chaos-monkey failures
if (result.chaosMonkeyFailure) {
ctx.ui.notify(
`[verify] CHAOS-MONKEY FAIL — ${result.chaosMonkeyRationale}`,
"error",
);
process.stderr.write(
`verification-gate: chaos-monkey injected failure: ${result.chaosMonkeyRationale}\n`,
);
}
// Write verification evidence JSON
const attempt = s.verificationRetryCount.get(s.currentUnit.id) ?? 0;
if (mid && sid && tid) {

View file

@ -35,7 +35,7 @@ const TOP_LEVEL_SUBCOMMANDS = [
{ cmd: "run-hook", desc: "Manually trigger a specific hook" },
{ cmd: "skill-health", desc: "Skill lifecycle dashboard" },
{ cmd: "doctor", desc: "Runtime health checks with auto-fix" },
{ cmd: "uok", desc: "UOK runtime health and ledger status" },
{ cmd: "uok", desc: "UOK runtime health, ledger status, and gate metrics" },
{ cmd: "logs", desc: "Browse activity logs, debug logs, and metrics" },
{ cmd: "forensics", desc: "Examine execution logs" },
{ cmd: "init", desc: "Project init wizard" },

View file

@ -4,6 +4,8 @@ import { ensureDbOpen } from "./bootstrap/dynamic-tools.js";
import { sfRoot } from "./paths.js";
import { getUokRuns, isDbAvailable } from "./sf-db.js";
import { writeUokDiagnostics } from "./uok/diagnostic-synthesis.js";
import { UokGateRunner } from "./uok/gate-runner.js";
import { readUokMetrics, writeUokMetrics } from "./uok/metrics-exposition.js";
import {
summarizeParityHealth,
writeParityReport,
@ -90,6 +92,15 @@ export async function collectUokStatus(
} catch {
diagnostics = null;
}
let gateHealth = null;
let metricsPath = null;
try {
const runner = new UokGateRunner();
gateHealth = runner.getHealthSummary();
metricsPath = writeUokMetrics(basePath);
} catch {
// gate health and metrics are best-effort
}
return {
dbAvailable,
generatedAt: new Date(nowMs).toISOString(),
@ -103,6 +114,8 @@ export async function collectUokStatus(
current,
historical,
diagnostics,
gateHealth,
metricsPath,
reportPath: join(sfRoot(basePath), "runtime", "uok-parity-report.json"),
};
}
@ -164,6 +177,24 @@ export function formatUokStatus(status, nowMs = Date.now()) {
lines.push("Last error: none in ledger");
}
lines.push("");
if (status.gateHealth?.gates?.length > 0) {
lines.push("Gate health (24h):");
for (const g of status.gateHealth.gates) {
const icon =
g.circuitBreaker === "open"
? "🔴"
: g.circuitBreaker === "half-open"
? "🟡"
: "🟢";
lines.push(
` ${icon} ${g.id}: ${g.pass} pass / ${g.fail} fail / ${g.retry} retry | cb: ${g.circuitBreaker}${g.failureStreak > 0 ? ` (streak ${g.failureStreak})` : ""}`,
);
}
lines.push("");
}
if (status.metricsPath) {
lines.push(`Metrics: ${status.metricsPath}`);
}
lines.push(`Report: ${status.reportPath}`);
return lines.join("\n");
}
@ -172,11 +203,22 @@ export async function handleUok(args, ctx) {
const trimmed = args.trim();
if (trimmed === "help" || trimmed === "--help") {
ctx.ui.notify(
"Usage: /sf uok [status|--json]\n\nShows UOK ledger health, last run, last error, historical drift, and startup gate state.",
"Usage: /sf uok [status|metrics|--json]\n\n status — UOK ledger health, last run, last error, historical drift, startup gate, and gate health\n metrics — Render Prometheus-format metrics to .sf/runtime/uok-metrics.prom and display\n --json — Same as status but outputs JSON",
"info",
);
return;
}
if (trimmed === "metrics") {
const basePath = process.cwd();
const path = writeUokMetrics(basePath);
const text = readUokMetrics(basePath);
ctx.ui.notify(
text ?? "No metrics available (DB unavailable or no gate data)",
"info",
);
ctx.ui.notify(`Written to: ${path}`, "info");
return;
}
const status = await collectUokStatus(process.cwd());
if (trimmed === "--json" || trimmed === "json") {
ctx.ui.notify(JSON.stringify(status, null, 2), "info");

View file

@ -98,7 +98,7 @@ export const TOP_LEVEL_SUBCOMMANDS = [
{ cmd: "doctor", desc: "Runtime health checks with auto-fix" },
{
cmd: "uok",
desc: "UOK runtime health: ledger, last run, last error, startup gate",
desc: "UOK runtime health: ledger, last run, last error, startup gate, gate metrics",
},
{ cmd: "logs", desc: "Browse activity logs, debug logs, and metrics" },
{ cmd: "forensics", desc: "Examine execution logs" },

View file

@ -32,8 +32,7 @@ import { join } from "node:path";
*/
function parseKnowledgeEntries(knowledgeContent) {
const entries = [];
const entryPattern =
/### Judgment Entry:\s*(.+?)\n([\s\S]*?)(?=###\s|$)/g;
const entryPattern = /### Judgment Entry:\s*(.+?)\n([\s\S]*?)(?=###\s|$)/g;
let match;
while ((match = entryPattern.exec(knowledgeContent)) !== null) {
@ -41,9 +40,15 @@ function parseKnowledgeEntries(knowledgeContent) {
const body = match[2];
// Extract fields
const evidenceMatch = body.match(/[-*]\s+\*?\*?Evidence:\*?\*?\s*(.+?)(?:\n|$)/);
const confidenceMatch = body.match(/[-*]\s+\*?\*?Confidence:\*?\*?\s*([\d.]+)/);
const domainMatch = body.match(/[-*]\s+\*?\*?Domain:\*?\*?\s*(.+?)(?:\n|$)/);
const evidenceMatch = body.match(
/[-*]\s+\*?\*?Evidence:\*?\*?\s*(.+?)(?:\n|$)/,
);
const confidenceMatch = body.match(
/[-*]\s+\*?\*?Confidence:\*?\*?\s*([\d.]+)/,
);
const domainMatch = body.match(
/[-*]\s+\*?\*?Domain:\*?\*?\s*(.+?)(?:\n|$)/,
);
const recommendationMatch = body.match(
/[-*]\s+\*?\*?Recommendation:\*?\*?\s*(.+?)(?:\n|$)/,
);
@ -90,9 +95,7 @@ function extractConcepts(entry) {
}
// Add title keywords
const titleKeywords = entry.title
.split(/\s+/)
.filter((w) => w.length > 3);
const titleKeywords = entry.title.split(/\s+/).filter((w) => w.length > 3);
titleKeywords.forEach((w) => concepts.add(w.toLowerCase()));
return Array.from(concepts);
@ -235,9 +238,7 @@ function loadKnowledgeFile(basePath) {
if (existsSync(p)) {
try {
return readFileSync(p, "utf-8");
} catch {
continue;
}
} catch {}
}
}
@ -257,7 +258,11 @@ function loadKnowledgeFile(basePath) {
*
* Returns: formatted string suitable for prompt variable substitution
*/
export function injectKnowledgeIntPrompt(basePath, taskContext = {}, options = {}) {
export function injectKnowledgeIntPrompt(
basePath,
taskContext = {},
options = {},
) {
const knowledgeContent = loadKnowledgeFile(basePath);
if (!knowledgeContent) {
return "(knowledge base unavailable)";
@ -304,7 +309,7 @@ export function injectKnowledgeIntPrompt(basePath, taskContext = {}, options = {
* Purpose: Record which knowledge was actually used in a dispatch so we can
* later measure effectiveness and refine knowledge compounding.
*/
export function trackKnowledgeUsage(basePath, taskId, injectedKnowledge) {
export function trackKnowledgeUsage(_basePath, taskId, injectedKnowledge) {
// This would write to a usage log in .sf/knowledge-usage.jsonl
// Implementation deferred to feedback-loop integration
return {

View file

@ -1,378 +0,0 @@
/**
 * Continuous Model Learning: track per-task-type model performance and
* adaptively route to better-performing models.
*
* Purpose: Make model selection data-driven and adaptive instead of static.
* When a model consistently fails on certain task types, demote it. When a new
* model succeeds where the incumbent fails, promote it.
*
* Consumer: auto-dispatch.ts outcome logging, model-router.ts selection logic,
* benchmark-selector.ts display.
*/
import { existsSync, readFileSync, writeFileSync, appendFileSync } from "node:fs";
import { dirname, join } from "node:path";
import { mkdirSync } from "node:fs";
/**
* Per-task-type model performance tracker.
*
* Schema:
* {
* "execute-task": {
* "gpt-4o": {
* "successes": 42,
* "failures": 3,
* "timeouts": 1,
* "totalTokens": 1500000,
* "totalCost": 45.50,
* "lastUsed": "2026-05-06T16:30:00Z",
* "successRate": 0.93
* },
* "claude-opus": {
* ...
* }
* },
* "plan-slice": { ... }
* }
*/
/**
 * Tracks per-task-type, per-model outcome statistics in a JSON file at
 * `<basePath>/.sf/model-performance.json`.
 *
 * The on-disk shape mirrors the in-memory one:
 * `{ [taskType]: { [modelId]: { successes, failures, timeouts,
 *   totalTokens, totalCost, lastUsed, successRate } } }`
 */
class ModelPerformanceTracker {
  constructor(basePath) {
    this.basePath = basePath;
    this.storagePath = join(basePath, ".sf", "model-performance.json");
    this.data = this._load();
  }
  // Read persisted stats; a missing file or any read/parse problem yields
  // a fresh empty map.
  _load() {
    if (!existsSync(this.storagePath)) {
      return {};
    }
    try {
      return JSON.parse(readFileSync(this.storagePath, "utf-8"));
    } catch {
      return {};
    }
  }
  // Persist stats best-effort; failures are logged, never thrown.
  _save() {
    try {
      const parent = dirname(this.storagePath);
      if (!existsSync(parent)) {
        mkdirSync(parent, { recursive: true });
      }
      writeFileSync(
        this.storagePath,
        JSON.stringify(this.data, null, 2),
        "utf-8",
      );
    } catch (err) {
      console.error("Failed to save model performance data:", err);
    }
  }
  /**
   * Record one outcome for `modelId` on `taskType` and persist.
   * A timeout counts as both a timeout and a failure.
   */
  recordOutcome(taskType, modelId, outcome) {
    const {
      success,
      timeout = false,
      tokensUsed = 0,
      costUsd = 0,
      timestamp = new Date().toISOString(),
    } = outcome;
    this.data[taskType] = this.data[taskType] || {};
    const byModel = this.data[taskType];
    byModel[modelId] = byModel[modelId] || {
      successes: 0,
      failures: 0,
      timeouts: 0,
      totalTokens: 0,
      totalCost: 0,
      lastUsed: timestamp,
      successRate: 0,
    };
    const stats = byModel[modelId];
    if (success) {
      stats.successes += 1;
    } else {
      stats.failures += 1;
      if (timeout) stats.timeouts += 1;
    }
    stats.totalTokens += tokensUsed;
    stats.totalCost += costUsd;
    stats.lastUsed = timestamp;
    const attempts = stats.successes + stats.failures;
    stats.successRate = attempts > 0 ? stats.successes / attempts : 0;
    this._save();
  }
  /** Stats object for a model on a task type, or null when untracked. */
  getStats(taskType, modelId) {
    const byModel = this.data[taskType];
    return (byModel && byModel[modelId]) || null;
  }
  /**
   * Models for a task type with at least `minSamples` attempts, best
   * success rate first.
   */
  getRankedModels(taskType, minSamples = 3) {
    const byModel = this.data[taskType];
    if (!byModel) return [];
    const ranked = [];
    for (const [modelId, stats] of Object.entries(byModel)) {
      const attempts = stats.successes + stats.failures;
      if (attempts < minSamples) continue;
      ranked.push({
        modelId,
        successRate: stats.successRate,
        attempts,
        tokens: stats.totalTokens,
        cost: stats.totalCost,
        latestAttempt: stats.lastUsed,
      });
    }
    ranked.sort((a, b) => b.successRate - a.successRate);
    return ranked;
  }
  /**
   * True when the model has at least 5 attempts on this task type and its
   * failure rate exceeds `thresholdFailureRate`.
   */
  shouldDemote(taskType, modelId, thresholdFailureRate = 0.5) {
    const stats = this.getStats(taskType, modelId);
    if (!stats) return false;
    const attempts = stats.successes + stats.failures;
    return attempts >= 5 && 1 - stats.successRate > thresholdFailureRate;
  }
  /**
   * A/B candidates for a task type: the best-ranked model as incumbent
   * plus up to two challengers, or null when fewer than two ranked models
   * exist.
   */
  getABTestCandidates(taskType, minSamples = 3, lowRiskFraction = 0.1) {
    const ranked = this.getRankedModels(taskType, minSamples);
    if (ranked.length < 2) return null;
    return {
      incumbent: ranked[0],
      challengers: ranked.slice(1, 3),
      // E.g. lowRiskFraction 0.1 => a 10-task test budget.
      testBudget: Math.max(1, Math.ceil(1 / lowRiskFraction)),
    };
  }
  /**
   * Decide promotion from A/B results ({ incumbentWins, challengerWins }):
   * inconclusive below 5 total samples, promote when the challenger's win
   * share beats the incumbent's by more than 0.1, otherwise continue.
   */
  analyzeABTest(taskType, results) {
    const { incumbentWins, challengerWins } = results;
    const total = incumbentWins + challengerWins;
    if (total < 5) {
      return { recommendation: "inconclusive", reason: "insufficient samples" };
    }
    const challengerSuccessRate = challengerWins / total;
    const incumbentSuccessRate = incumbentWins / total;
    if (challengerSuccessRate > incumbentSuccessRate + 0.1) {
      return {
        recommendation: "promote",
        reason: `challenger ${challengerSuccessRate.toFixed(2)} vs incumbent ${incumbentSuccessRate.toFixed(2)}`,
      };
    }
    return { recommendation: "continue", reason: "incumbent still ahead" };
  }
}
/**
 * Failure Analyzer: categorize and log why models failed.
*
* Purpose: Understand failure patterns (timeout, quality, cost) to inform
* promotion/demotion decisions.
*/
/**
 * Appends structured model-failure records to
 * `<basePath>/.sf/model-failure-log.jsonl` and summarizes them to surface
 * failure reasons and systematic patterns (e.g. timeout-prone models).
 */
class FailureAnalyzer {
  constructor(basePath) {
    this.basePath = basePath;
    this.logsPath = join(basePath, ".sf", "model-failure-log.jsonl");
  }
  /** Append one failure record; logging problems are reported, not thrown. */
  logFailure(taskType, modelId, failure) {
    const {
      reason = "unknown",
      timeout = false,
      tokensUsed = 0,
      context = {},
      timestamp = new Date().toISOString(),
    } = failure;
    const record = {
      timestamp,
      taskType,
      modelId,
      reason,
      timeout,
      tokensUsed,
      context,
    };
    try {
      const parent = dirname(this.logsPath);
      if (!existsSync(parent)) {
        mkdirSync(parent, { recursive: true });
      }
      appendFileSync(this.logsPath, `${JSON.stringify(record)}\n`, "utf-8");
    } catch (err) {
      console.error("Failed to log model failure:", err);
    }
  }
  /**
   * Summarize logged failures for one model on one task type.
   * Returns `{ reasons: { [reason]: count }, patterns: [...] }`; a missing
   * log or any read/parse problem degrades to an empty summary.
   */
  getFailureSummary(taskType, modelId) {
    if (!existsSync(this.logsPath)) {
      return { reasons: {}, patterns: [] };
    }
    try {
      const raw = readFileSync(this.logsPath, "utf-8");
      const reasons = {};
      const matched = [];
      for (const line of raw.trim().split("\n")) {
        const record = JSON.parse(line);
        if (record.taskType !== taskType || record.modelId !== modelId) {
          continue;
        }
        reasons[record.reason] = (reasons[record.reason] || 0) + 1;
        matched.push(record);
      }
      return { reasons, patterns: this._detectPatterns(matched) };
    } catch {
      return { reasons: {}, patterns: [] };
    }
  }
  // Flag systematic issues in the matched failures: more than half being
  // timeouts marks the model as timeout-prone.
  _detectPatterns(failures) {
    const timeouts = failures.filter((f) => f.timeout).length;
    const patterns = [];
    if (timeouts / Math.max(failures.length, 1) > 0.5) {
      patterns.push({
        type: "timeout_prone",
        severity: "high",
        suggestion: "Use shorter timeout or lower batch size",
      });
    }
    return patterns;
  }
}
/**
* Main API: Integrate model learning into dispatch workflow.
*
* Usage in auto-dispatch.ts:
* ```
* const learner = new ModelLearner(projectPath);
* learner.recordOutcome("execute-task", modelUsed, {
* success: taskSucceeded,
* timeout: taskTimedOut,
* tokensUsed: totalTokens,
* costUsd: modelCost,
* });
* ```
*/
/**
 * Facade over ModelPerformanceTracker and FailureAnalyzer: a single entry
 * point for recording dispatch outcomes, logging failure details, and
 * querying rankings, demotion decisions, and A/B analysis.
 *
 * Usage in auto-dispatch.ts:
 * ```
 * const learner = new ModelLearner(projectPath);
 * learner.recordOutcome("execute-task", modelUsed, {
 *   success: taskSucceeded,
 *   timeout: taskTimedOut,
 *   tokensUsed: totalTokens,
 *   costUsd: modelCost,
 * });
 * ```
 */
export class ModelLearner {
  constructor(basePath) {
    this.basePath = basePath;
    this.tracker = new ModelPerformanceTracker(basePath);
    this.analyzer = new FailureAnalyzer(basePath);
  }
  /** Record an outcome for a model on a task (delegates to the tracker). */
  recordOutcome(taskType, modelId, outcome) {
    return this.tracker.recordOutcome(taskType, modelId, outcome);
  }
  /** Log failure details for later analysis (delegates to the analyzer). */
  logFailure(taskType, modelId, failure) {
    return this.analyzer.logFailure(taskType, modelId, failure);
  }
  /** Ranked models for a task type, for intelligent routing. */
  getRankedModels(taskType, minSamples = 3) {
    return this.tracker.getRankedModels(taskType, minSamples);
  }
  /** Whether a model should be demoted on this task type. */
  shouldDemote(taskType, modelId, failureThreshold = 0.5) {
    return this.tracker.shouldDemote(taskType, modelId, failureThreshold);
  }
  /** A/B test candidates (incumbent plus challengers) for hypothesis testing. */
  getABTestCandidates(taskType, minSamples = 3) {
    return this.tracker.getABTestCandidates(taskType, minSamples);
  }
  /** Promotion/continue recommendation from A/B test results. */
  analyzeABTest(taskType, results) {
    return this.tracker.analyzeABTest(taskType, results);
  }
  /** Failure-reason counts and detected patterns for a model. */
  getFailureAnalysis(taskType, modelId) {
    return this.analyzer.getFailureSummary(taskType, modelId);
  }
}
// Named exports for direct class access; the default export bundles all
// three for consumers that import the module object wholesale.
export { ModelPerformanceTracker, FailureAnalyzer };
export default {
  ModelLearner,
  ModelPerformanceTracker,
  FailureAnalyzer,
};

View file

@ -76,7 +76,7 @@ Before anything else, form a diagnosis: What is the core challenge? What is brok
- **Measure coverage**: find untested critical paths
- **Scan for dead code, stubs, and commented-out features** — abandoned attempts are signals
- **Discover needed skills**: identify repo languages, frameworks, data stores, external services, build tools, and domain-specific competencies. Check installed skills first; record installed, missing, and potentially useful skills in `.sf/CODEBASE.md` and `.sf/PM-STRATEGY.md`.
- **Use code intelligence**: start with in-process `grep`/`find`/`ls` and `lsp` for broad orientation. Use scoped `codebase_search` or `sift_search` as the live code index when the `PROJECT CODE INTELLIGENCE` block says Sift is healthy enough for this repo. Use `.sf/CODEBASE.md` only as fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. If Sift is degraded, slow, empty, or timing out, keep using grep/find/ls, lsp, direct reads, and fallback CODEBASE context.
- **Use code intelligence**: start with in-process `grep`/`find`/`ls` and `lsp` for broad orientation. Use scoped `codebase_search` or `sift_search` as the live code index when the `PROJECT CODE INTELLIGENCE` block says Sift is healthy enough for this repo. Use `.sf/CODEBASE.md` only as fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. If Sift is degraded, slow, empty, or timing out, keep using grep/find/ls, lsp, direct reads, and fallback CODEBASE context. GitHub code search is a scarce remote-only fallback: if the repository is checked out locally, do not use GitHub `/search/code` for that repo; use `git grep` for tracked-file global search, `rg` for broader worktree text search, plus `lsp`, `sift_search`, or `codebase_search`. GitHub's `code_search` bucket is small and separate from normal REST/GraphQL quotas, so dedupe remote queries and treat `403` rate-limit responses as a signal to wait for reset or continue with local evidence.
- Use in-process `grep`, `find`, `ls`, and `lsp` before shelling out. Fall back to shell `rg`, `find`, `ast-grep`, or `ls -la` only when the native/in-process tool surface is insufficient.
### Step 2: Check library and ecosystem facts

View file

@ -34,7 +34,7 @@ After reflection is confirmed, decide the approach based on the actual scope —
Before asking your first question, do a mandatory investigation pass. This is not optional.
1. **Scout the codebase** — start with in-process `grep`, `find`, `ls`, and `lsp` for broad orientation. Use scoped `codebase_search` or `sift_search` as the live code index when the `PROJECT CODE INTELLIGENCE` block says Sift is healthy enough for this repo. Use `.sf/CODEBASE.md` only as durable fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. If Sift is degraded, slow, empty, or timing out, keep using grep/find/ls, lsp, direct reads, and fallback CODEBASE context. Use `scout` for broad unfamiliar areas that need a separate explorer. Understand what already exists, what patterns are established, what constraints current code imposes.
1. **Scout the codebase** — start with in-process `grep`, `find`, `ls`, and `lsp` for broad orientation. Use scoped `codebase_search` or `sift_search` as the live code index when the `PROJECT CODE INTELLIGENCE` block says Sift is healthy enough for this repo. Use `.sf/CODEBASE.md` only as durable fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. If Sift is degraded, slow, empty, or timing out, keep using grep/find/ls, lsp, direct reads, and fallback CODEBASE context. GitHub code search is a scarce remote-only fallback: if the repository is checked out locally, do not use GitHub `/search/code` for that repo; use `git grep` for tracked-file global search, `rg` for broader worktree text search, plus `lsp`, `sift_search`, or `codebase_search`. GitHub's `code_search` bucket is small and separate from normal REST/GraphQL quotas, so dedupe remote queries and treat `403` rate-limit responses as a signal to wait for reset or continue with local evidence. Use `scout` for broad unfamiliar areas that need a separate explorer. Understand what already exists, what patterns are established, what constraints current code imposes.
2. **Check library docs — DeepWiki first.** Use `ask_question` / `read_wiki_structure` / `read_wiki_contents` (DeepWiki) as the default for any GitHub-hosted library or framework the user mentioned. Fall back to `resolve_library` / `get_library_docs` (Context7) for npm/pypi/crates packages DeepWiki doesn't have. **Context7 free tier is capped at 1000 req/month — spend those on cases DeepWiki can't cover.** Get current facts about capabilities, constraints, API shapes, version-specific behavior.
3. **Web search** — `search-the-web` if the domain is unfamiliar, if you need current best practices, or if the user referenced external services/APIs you need facts about. Use `fetch_page` for full content when snippets aren't enough.

View file

@ -18,6 +18,18 @@ You are evaluating **quality gates in parallel** for this slice. Each gate is an
{{gateList}}
## Gate Types Reference
The following gate implementations may be present in this project. Each has distinct failure classes:
- **`verification-gate`** — Runs lint, typecheck, tests, and post-execution checks. Failure classes: `verification` (check failed), `execution` (runtime/blocking error), `artifact` (post-execution consistency issue).
- **`security-guard`** — Scans for secrets, unsafe patterns, and dependency vulnerabilities. Failure classes: `policy` (secret leaked), `input` (unsafe pattern).
- **`cost-guard`** — Monitors LLM spend against per-unit and per-hour budgets, detects high-tier model failures. Failure classes: `policy` (budget exceeded), `execution` (high-tier model failure).
- **`outcome-learning`** — Queries historical task outcomes for failure-rate anomalies. Failure classes: `policy` (failure rate too high), `input` (model recommendation).
- **`multi-package-healing`** — Detects affected packages from git diff and runs targeted checks. Failure classes: `verification` (package check failed), `execution` (check timeout).
- **`chaos-monkey`** — Stress-tests durability by injecting latency, retryable errors, disk stress, or memory pressure. Failure classes: `execution` (injected fault caused failure). This gate only runs when explicitly enabled (`active: false` by default).
- **`post-execution-checks`** — Cross-task consistency verification after a task completes. Failure classes: `artifact` (consistency violation), `policy` (strict-mode warning escalation).
## Execution Protocol
1. **Dispatch all gates** using `subagent` in parallel mode. Each subagent prompt is provided below.

View file

@ -1,4 +1,4 @@
Research slice {{sliceId}} ("{{sliceTitle}}") of milestone {{milestoneId}}. Read `.sf/DECISIONS.md` if it exists — respect existing decisions, don't contradict them. Read `.sf/REQUIREMENTS.md` if it exists — identify which Active requirements this slice owns or supports and target research toward risks, unknowns, and constraints that could affect delivery of those requirements. {{skillActivation}} Use native `lsp` first for symbol lookup, references, and cross-file navigation. For direct text inspection use `rg`/`find` for targeted reads, or `scout` if the area is broad or unfamiliar. If there are 2-3 independent unknowns, use a research swarm with parallel `scout`/`researcher` subagents and synthesize their findings here; do not swarm narrow sequence-dependent research. Check libraries DeepWiki-first: `ask_question` / `read_wiki_structure` / `read_wiki_contents` for any GitHub-hosted library; fall back to `resolve_library` / `get_library_docs` (Context7, capped at 1000 req/month free) for npm/pypi/crates packages DeepWiki doesn't have. Skip both for libraries already used in this codebase. Use the **Research** output template below. Call `sf_summary_save` with `milestone_id: {{milestoneId}}`, `slice_id: {{sliceId}}`, `artifact_type: "RESEARCH"`, and the research content — the tool writes the file to disk and persists to DB. After `sf_summary_save` succeeds, stop immediately; do **not** call `sf_milestone_generate_id`, `sf_plan_milestone`, `sf_plan_slice`, `sf_plan_task`, or any planning/creation tool.
Research slice {{sliceId}} ("{{sliceTitle}}") of milestone {{milestoneId}}. Read `.sf/DECISIONS.md` if it exists — respect existing decisions, don't contradict them. Read `.sf/REQUIREMENTS.md` if it exists — identify which Active requirements this slice owns or supports and target research toward risks, unknowns, and constraints that could affect delivery of those requirements. {{skillActivation}} Use native `lsp` first for symbol lookup, references, and cross-file navigation. For direct text inspection use `rg`/`find` for targeted reads, or `scout` if the area is broad or unfamiliar. If the repository is checked out locally, GitHub code search is a scarce remote-only fallback: do not use GitHub `/search/code` for that local repo; use `git grep` for tracked-file global search, `rg` for broader worktree text search, plus `lsp`, `sift_search`, or `codebase_search` instead. GitHub's `code_search` bucket is small and separate from normal REST/GraphQL quotas, so use it only for repositories that are not on disk, dedupe repeated queries, and treat `403` rate-limit responses as a signal to wait for reset or continue with local evidence. If there are 2-3 independent unknowns, use a research swarm with parallel `scout`/`researcher` subagents and synthesize their findings here; do not swarm narrow sequence-dependent research. Check libraries DeepWiki-first: `ask_question` / `read_wiki_structure` / `read_wiki_contents` for any GitHub-hosted library; fall back to `resolve_library` / `get_library_docs` (Context7, capped at 1000 req/month free) for npm/pypi/crates packages DeepWiki doesn't have. Skip both for libraries already used in this codebase. Use the **Research** output template below. Call `sf_summary_save` with `milestone_id: {{milestoneId}}`, `slice_id: {{sliceId}}`, `artifact_type: "RESEARCH"`, and the research content — the tool writes the file to disk and persists to DB. 
After `sf_summary_save` succeeds, stop immediately; do **not** call `sf_milestone_generate_id`, `sf_plan_milestone`, `sf_plan_slice`, `sf_plan_task`, or any planning/creation tool.
**You are the scout.** A planner agent reads your output in a fresh context to decompose this slice into tasks. Write for the planner — surface key files, where the work divides naturally, what to build first, and how to verify. If the research doc is vague, the planner re-explores code you already read. If it's precise, the planner decomposes immediately.

View file

@ -46,6 +46,7 @@ Research what this slice needs. Narrate key findings and surprises as you go —
2. **Skill Discovery ({{skillDiscoveryMode}}):**{{skillDiscoveryInstructions}}
3. Explore relevant code for this slice's scope. Use native `lsp` first for symbol lookup, references, and cross-file navigation. For direct text inspection, use `rg`, `find`, and reads. For broad or unfamiliar subsystems, use `scout` to map the relevant area first.
3a. Use a research swarm when the slice has 2-3 independent unknowns or subsystems. Dispatch parallel `scout`/`researcher` subagents with distinct lenses, then synthesize what each found into this single RESEARCH artifact. Do not swarm a narrow, sequence-dependent investigation.
3b. **GitHub code search is a scarce remote-only fallback.** When the repository is present in `{{workingDirectory}}`, do not use GitHub `/search/code` for that repo; use local `git grep` for tracked-file global search, `rg` for broader worktree text search, plus `lsp`, `sift_search`, or `codebase_search` as needed. GitHub's `code_search` bucket is small and separate from the normal REST/GraphQL quotas. Use GitHub code search only for repositories that are not checked out locally, dedupe repeated queries, and if it returns `403` rate-limit with a short reset, wait until reset or continue with local evidence. If remote code search is essential and still unavailable, checkpoint `continue`, `blocked`, or `decide` with the missing source named.
4. **Documentation lookup — prefer DeepWiki first.** Use `ask_question` / `read_wiki_structure` / `read_wiki_contents` (DeepWiki) as the default for any GitHub-hosted library or framework — AI-indexed, no free-tier cap. Fall back to `resolve_library` / `get_library_docs` (Context7) for npm/pypi/crates packages DeepWiki doesn't have. **Context7 free tier is capped at 1000 requests/month — spend those on cases DeepWiki can't cover.** Skip both for libraries already used in this codebase.
5. **Web search budget:** You have a limited budget of web searches (max ~15 per session). Use them strategically — try DeepWiki → Context7 → web search in that order. Do NOT repeat the same or similar queries. If a search didn't find what you need, rephrase once or move on. Target 3-5 total web searches for a typical research unit.
6. Use the **Research** output template from the inlined context above — include only sections that have real content. The template is already inlined above; do NOT attempt to read any template file from disk (there is no `templates/SLICE-RESEARCH.md` — the correct template is already present in this prompt).

View file

@ -161,7 +161,7 @@ Templates showing the expected format for each artifact type are in:
**Code navigation:** Use `lsp` for definition, type_definition, implementation, references, incoming_calls, outgoing_calls, hover, signature, symbols, rename, code_actions, format, and diagnostics. Falls back gracefully if no server is available. Never `grep` for a symbol definition when `lsp` can resolve it semantically. Never shell out to prettier/rustfmt/gofmt when `lsp format` is available. After editing code, use `lsp diagnostics` to verify no type errors were introduced.
**Codebase exploration:** Start broad orientation with in-process `grep`, `find`, `ls`, and `lsp`. When the `PROJECT CODE INTELLIGENCE` block says Sift is healthy, use scoped `codebase_search` or `sift_search` as the preferred live code index. Use `.sf/CODEBASE.md` only as fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. For Sift-specific features — explicit strategy selection or planner configuration — use `sift_search` with a scoped `path`. Strategy guide: `bm25` (fast lexical), `path-hybrid` (filename/path-heavy queries), `page-index-hybrid` (stronger recall + reranking), `vector` (semantic-only). Each repo uses its own Sift cache under `.sf/runtime/sift/`; do not rely on a shared/global Sift database. Use `lsp` for structural navigation (definitions, references). Never read files one-by-one to "explore" — search first, then read what's relevant.
**Codebase exploration:** Start broad orientation with in-process `grep`, `find`, `ls`, and `lsp`. When the `PROJECT CODE INTELLIGENCE` block says Sift is healthy, use scoped `codebase_search` or `sift_search` as the preferred live code index. Use `.sf/CODEBASE.md` only as fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. GitHub code search is a scarce remote-only fallback: if the repository is checked out locally, do not use GitHub `/search/code` for that repo; use `git grep` for tracked-file global search, `rg` for broader worktree text search, plus `lsp`, `sift_search`, or `codebase_search`. GitHub's `code_search` bucket is small and separate from normal REST/GraphQL quotas, so dedupe remote queries and treat `403` rate-limit responses as a signal to wait for reset or continue with local evidence. For Sift-specific features — explicit strategy selection or planner configuration — use `sift_search` with a scoped `path`. Strategy guide: `bm25` (fast lexical), `path-hybrid` (filename/path-heavy queries), `page-index-hybrid` (stronger recall + reranking), `vector` (semantic-only). Each repo uses its own Sift cache under `.sf/runtime/sift/`; do not rely on a shared/global Sift database. Use `lsp` for structural navigation (definitions, references). Never read files one-by-one to "explore" — search first, then read what's relevant.
**Swarm dispatch:** Let the system decide whether swarming fits before dispatching multiple execution subagents. Use a 2-3 worker same-model swarm only when the work splits into independent shards with explicit file/directory ownership, shard-local verification, low conflict risk, and clear wall-clock savings. Do not swarm shared-interface edits, lockfiles, migrations, single-failure debugging, or sequence-dependent work. The parent agent remains coordinator: assign ownership, synthesize results, inspect dirty files, resolve conflicts, and run final verification.

View file

@ -15,7 +15,7 @@
* 4. Apply fix, test, and mark self-report resolved
*/
import { existsSync, readFileSync, writeFileSync } from "node:fs";
import { existsSync, readFileSync } from "node:fs";
import { join } from "node:path";
/**
@ -25,30 +25,36 @@ import { join } from "node:path";
const FIX_PATTERNS = [
{
id: "validation-reviewer-rubric",
pattern: /validation-reviewer.*prompt.*lacks.*rubric|rubric.*criterion.*gap/i,
pattern:
/validation-reviewer.*prompt.*lacks.*rubric|rubric.*criterion.*gap/i,
confidence: 0.95, // We fixed this in validation prompts already
description: "Add explicit criterion/implementation-gap rubric to validation-reviewer prompt",
description:
"Add explicit criterion/implementation-gap rubric to validation-reviewer prompt",
fix: fixValidationReviewerRubric,
},
{
id: "gate-verdict-clarity",
pattern: /gate.*verdict.*ambiguous|verdict.*semantics.*unclear/i,
confidence: 0.9,
description: "Document gate verdict semantics (passed/failed/omitted) in ARCHITECTURE.md",
description:
"Document gate verdict semantics (passed/failed/omitted) in ARCHITECTURE.md",
fix: fixGateVerdictSemantics,
},
{
id: "env-vars-unvalidated",
pattern: /SF_.*env.*vars.*unvalidated|env.*validation.*missing|silent.*config.*missing/i,
pattern:
/SF_.*env.*vars.*unvalidated|env.*validation.*missing|silent.*config.*missing/i,
confidence: 0.85,
description: "Add runtime validation for SF_* environment variables",
fix: fixEnvValidation,
},
{
id: "self-report-coverage-gap",
pattern: /self-report.*gap|triage.*pipeline.*missing|feedback.*loop.*incomplete/i,
pattern:
/self-report.*gap|triage.*pipeline.*missing|feedback.*loop.*incomplete/i,
confidence: 0.8,
description: "Implement automated self-report triage pipeline (this module)",
description:
"Implement automated self-report triage pipeline (this module)",
fix: fixSelfReportPipeline,
},
];
@ -72,11 +78,19 @@ async function fixValidationReviewerRubric(basePath) {
// Check if rubric already exists
if (content.includes("Gate vs. Task Scope Rubric")) {
return { success: true, alreadyFixed: true, reason: "Rubric already present" };
return {
success: true,
alreadyFixed: true,
reason: "Rubric already present",
};
}
// This is already done in prior session, so just confirm
return { success: true, alreadyFixed: true, reason: "Fix verified in session" };
return {
success: true,
alreadyFixed: true,
reason: "Fix verified in session",
};
}
/**
@ -92,7 +106,11 @@ async function fixGateVerdictSemantics(basePath) {
// Check if gate semantics already documented
if (content.includes("Gate Verdict Semantics")) {
return { success: true, alreadyFixed: true, reason: "Gate semantics documented" };
return {
success: true,
alreadyFixed: true,
reason: "Gate semantics documented",
};
}
return { success: true, alreadyFixed: true, reason: "Fix already verified" };
@ -137,7 +155,7 @@ async function fixEnvValidation(basePath) {
/**
* Attempt to fix: Self-report triage pipeline (this module itself).
*/
async function fixSelfReportPipeline(basePath) {
async function fixSelfReportPipeline(_basePath) {
const thisFile = new URL(import.meta.url).pathname;
if (!existsSync(thisFile)) {
return { success: false, reason: "Self-report-fixer module not found" };
@ -280,16 +298,17 @@ export function generateTriageSummary(reports) {
uniqueClusters: clusters.length,
deduped: clusters,
categorized: categories,
highConfidenceFixes: reports
.flatMap((r) => {
const fixes = classifyReportFixes(r);
return fixes.filter((f) => f.confidence > 0.85).map((f) => ({
highConfidenceFixes: reports.flatMap((r) => {
const fixes = classifyReportFixes(r);
return fixes
.filter((f) => f.confidence > 0.85)
.map((f) => ({
reportId: r.id,
fixId: f.id,
description: f.description,
confidence: f.confidence,
}));
}),
}),
};
}

View file

@ -4199,6 +4199,17 @@ export function getGateLatencyStats(gateId, windowHours = 24) {
return { total: 0, avgMs: 0, p50Ms: 0, p95Ms: 0, maxMs: 0 };
}
}
/**
 * List every distinct gate id that has at least one recorded run.
 *
 * Returns an empty array when the database is unavailable or the
 * `gate_runs` table cannot be queried (e.g. fresh, unmigrated DB).
 *
 * @returns {string[]} distinct, truthy gate ids (order per SQLite)
 */
export function getDistinctGateIds() {
  // Without an open database there is nothing to enumerate.
  if (!currentDb) return [];
  try {
    const stmt = currentDb.prepare("SELECT DISTINCT gate_id FROM gate_runs");
    const ids = [];
    for (const row of stmt.all()) {
      // Drop null/empty ids so consumers only see usable gate names.
      if (row.gate_id) ids.push(row.gate_id);
    }
    return ids;
  } catch {
    // Best-effort: a missing table or query failure means "no gates".
    return [];
  }
}
/**
 * Normalize an arbitrary value to a non-empty string or null.
 *
 * @param {unknown} value - candidate value
 * @returns {string|null} the string itself when non-empty, else null
 */
function asStringOrNull(value) {
  // Non-strings and empty strings both collapse to null.
  if (typeof value !== "string") return null;
  return value.length > 0 ? value : null;
}

View file

@ -69,6 +69,15 @@ rg --files src/resources/extensions/sf/skills
{"query": "sift_request_factory", "strategy": "bm25", "limit": 10}
```
**GitHub code search — remote-only fallback, not local repo search:**
When the repository is checked out locally, do not use GitHub `/search/code` for
that repo. Use `git grep` for tracked-file global search, `rg` for broader
worktree text search, plus `lsp`, `sift_search`, or `codebase_search` instead.
GitHub's `code_search` bucket is small and separate from normal REST/GraphQL
quotas. Use GitHub code search only for repositories that are not on disk,
dedupe repeated queries, and treat `403` rate-limit responses as a signal to
wait for reset or continue with local evidence.
**SF project database queries:**
```bash
# Current milestone and slices

View file

@ -5,7 +5,7 @@
* and prompt injection work correctly.
*/
import { describe, test, expect } from "vitest";
import { describe, expect, test } from "vitest";
import knowledgeInjector from "../knowledge-injector.js";
const {
@ -208,7 +208,7 @@ describe("knowledge-injector", () => {
const contradictions = detectContradictions(entries);
// These are compatible tools, not contradictions
const realContradictions = contradictions.filter(
(c) => !c.message.includes("suspicious")
(c) => !c.message.includes("suspicious"),
);
expect(realContradictions.length).toBe(0);
});
@ -305,7 +305,7 @@ describe("knowledge-injector", () => {
const relevant = findRelevantKnowledge(entries, context, 0, 0);
if (relevant.length > 0) {
const { score, entry } = relevant[0];
const { score } = relevant[0];
expect(score).toBeDefined();
expect(score).toBeGreaterThan(0);
expect(score).toBeLessThanOrEqual(1);

View file

@ -5,11 +5,11 @@
* deduplication, and severity categorization work correctly.
*/
import { describe, test, expect } from "vitest";
import { describe, expect, test } from "vitest";
import {
categorizeBySeverity,
classifyReportFixes,
dedupReports,
categorizeBySeverity,
generateTriageSummary,
} from "../self-report-fixer.js";
@ -132,7 +132,7 @@ describe("self-report-fixer", () => {
// Validation reviewer should be blocker
const blockers = categorized.blocker;
expect(
blockers.some((r) => r.title.toLowerCase().includes("validation"))
blockers.some((r) => r.title.toLowerCase().includes("validation")),
).toBe(true);
});
@ -288,7 +288,7 @@ describe("self-report-fixer", () => {
// Recommendation should mention the actual action
const recommendation = summary.recommendations[0];
expect(recommendation.toLowerCase()).toMatch(
/rubric|criteria|document|validation/
/rubric|criteria|document|validation/,
);
});

View file

@ -28,6 +28,42 @@ function randomInRange(min, max) {
return min + Math.random() * (max - min);
}
/**
 * Gate adapter over ChaosMonkey: injects a fault during the
 * verification phase and reports the result in gate-verdict shape.
 */
export class ChaosMonkeyGate {
  constructor(options = {}) {
    this.id = "chaos-monkey";
    this.type = "chaos";
    this._monkey = new ChaosMonkey(options);
  }

  /**
   * Run one chaos strike against the verification phase.
   *
   * @param {*} _ctx - gate context (unused)
   * @param {*} attempt - attempt counter, echoed into failure findings
   * @returns gate result: "fail" when the injected fault threw, else
   *   "pass" (with details when a verification-phase event was recorded)
   */
  async execute(_ctx, attempt) {
    let struckError;
    let failed = false;
    try {
      await this._monkey.strike("verification");
    } catch (err) {
      failed = true;
      struckError = err;
    }
    if (failed) {
      const detail =
        struckError instanceof Error ? struckError.message : String(struckError);
      return {
        outcome: "fail",
        failureClass: "execution",
        rationale: `Chaos monkey injected fault: ${detail}`,
        findings: `Injected during verification phase (attempt ${attempt})`,
      };
    }
    // Inspect the most recent injected event, if any, to report details.
    const events = this._monkey.getInjectedEvents();
    const last = events.length > 0 ? events[events.length - 1] : undefined;
    if (last && last.phase === "verification") {
      return {
        outcome: "pass",
        failureClass: "none",
        rationale: `Chaos monkey injected ${last.type} during verification (non-fatal)`,
        findings: `Latency: ${last.delay ?? 0}ms | Disk: ${last.sizeMb ?? 0}MB | Memory: ${last.sizeMb ?? 0}MB`,
      };
    }
    return {
      outcome: "pass",
      failureClass: "none",
      rationale: "Chaos monkey: no fault injected this run",
    };
  }
}
export class ChaosMonkey {
constructor(options = {}) {
this.active = options.active ?? false;

View file

@ -1,5 +1,6 @@
import {
getGateCircuitBreaker,
getGateRunStats,
insertGateRun,
updateGateCircuitBreaker,
} from "../sf-db.js";
@ -20,9 +21,16 @@ const RETRY_MATRIX = {
unknown: 0,
};
const CIRCUIT_BREAKER_FAILURE_THRESHOLD = 5;
const CIRCUIT_BREAKER_OPEN_DURATION_MS = 60_000;
const CIRCUIT_BREAKER_HALF_OPEN_MAX_ATTEMPTS = 3;
/**
 * Resolve circuit-breaker tuning values, allowing SF_* environment
 * variables to override the built-in defaults.
 *
 * Only finite, strictly positive numeric overrides are accepted;
 * anything else (unset, non-numeric, zero, or negative) falls back to
 * the default. The previous `Number(x) || default` form silently
 * accepted negative values, which would misconfigure the breaker
 * (e.g. a negative failure threshold opens the circuit immediately).
 *
 * @returns {{failureThreshold: number, openDurationMs: number, halfOpenMaxAttempts: number}}
 */
function resolveCircuitBreakerThresholds() {
  // Coerce an env override to a usable number, or fall back to default.
  const positiveOr = (raw, fallback) => {
    const parsed = Number(raw);
    return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback;
  };
  return {
    failureThreshold: positiveOr(
      process.env.SF_CIRCUIT_BREAKER_FAILURE_THRESHOLD,
      5,
    ),
    openDurationMs: positiveOr(
      process.env.SF_CIRCUIT_BREAKER_OPEN_DURATION_MS,
      60_000,
    ),
    halfOpenMaxAttempts: positiveOr(
      process.env.SF_CIRCUIT_BREAKER_HALF_OPEN_MAX_ATTEMPTS,
      3,
    ),
  };
}
function nowIso() {
return new Date().toISOString();
@ -41,11 +49,30 @@ export class UokGateRunner {
return Array.from(this.registry.values());
}
getHealthSummary() {
const gates = this.list();
return {
gates: gates.map((g) => {
const stats = getGateRunStats(g.id, 24);
const cb = getGateCircuitBreaker(g.id);
return {
id: g.id,
type: g.type,
...stats,
circuitBreaker: cb.state,
failureStreak: cb.failureStreak,
};
}),
};
}
_checkCircuitBreaker(gateId) {
const { openDurationMs, halfOpenMaxAttempts } =
resolveCircuitBreakerThresholds();
const breaker = getGateCircuitBreaker(gateId);
if (breaker.state === "open") {
const openedAt = breaker.openedAt ? Date.parse(breaker.openedAt) : 0;
if (Date.now() - openedAt >= CIRCUIT_BREAKER_OPEN_DURATION_MS) {
if (Date.now() - openedAt >= openDurationMs) {
// Transition to half-open automatically after cooldown
updateGateCircuitBreaker(gateId, {
state: "half-open",
@ -56,11 +83,11 @@ export class UokGateRunner {
}
return {
blocked: true,
reason: `Circuit breaker OPEN for ${gateId} (failure streak ${breaker.failureStreak}). Cooldown until ${new Date(openedAt + CIRCUIT_BREAKER_OPEN_DURATION_MS).toISOString()}.`,
reason: `Circuit breaker OPEN for ${gateId} (failure streak ${breaker.failureStreak}). Cooldown until ${new Date(openedAt + openDurationMs).toISOString()}.`,
};
}
if (breaker.state === "half-open") {
if (breaker.halfOpenAttempts >= CIRCUIT_BREAKER_HALF_OPEN_MAX_ATTEMPTS) {
if (breaker.halfOpenAttempts >= halfOpenMaxAttempts) {
// Too many half-open attempts without success — go back to open
updateGateCircuitBreaker(gateId, {
state: "open",
@ -100,7 +127,8 @@ export class UokGateRunner {
});
return;
}
if (nextStreak >= CIRCUIT_BREAKER_FAILURE_THRESHOLD) {
const { failureThreshold } = resolveCircuitBreakerThresholds();
if (nextStreak >= failureThreshold) {
updateGateCircuitBreaker(gateId, {
state: "open",
failureStreak: nextStreak,

View file

@ -8,21 +8,26 @@
* Consumer: health widgets, /sf uok status, and external monitoring.
*/
import { existsSync, mkdirSync, writeFileSync } from "node:fs";
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
import { join } from "node:path";
import { sfRoot } from "../paths.js";
import {
getDistinctGateIds,
getGateCircuitBreaker,
getGateLatencyStats,
getGateRunStats,
isDbAvailable,
} from "../sf-db.js";
const GATE_NAMES = [
const DEFAULT_GATE_NAMES = [
"security-guard",
"cost-guard",
"outcome-learning",
"multi-package-healing",
"chaos-monkey",
"verification-gate",
"post-execution-checks",
"milestone-validation-post-check",
];
function fmtCounter(name, value, labels = {}) {
@ -37,9 +42,9 @@ function fmtGauge(name, value, labels = {}) {
return fmtCounter(name, value, labels);
}
function collectGateMetrics() {
function collectGateMetrics(gateIds) {
const lines = [];
for (const gateId of GATE_NAMES) {
for (const gateId of gateIds) {
const stats = getGateRunStats(gateId, 24);
lines.push(
fmtCounter("uok_gate_runs_total", stats.total, { gate_id: gateId }),
@ -89,7 +94,7 @@ function collectGateMetrics() {
return lines;
}
function buildMetricsText() {
function buildMetricsText(gateIds) {
const lines = [
"# HELP uok_gate_runs_total Total gate runs in the last 24h",
"# TYPE uok_gate_runs_total counter",
@ -113,7 +118,13 @@ function buildMetricsText() {
"# TYPE uok_gate_circuit_breaker_failure_streak gauge",
];
if (isDbAvailable()) {
lines.push(...collectGateMetrics());
const ids =
gateIds && gateIds.length > 0
? gateIds
: getDistinctGateIds().length > 0
? getDistinctGateIds()
: DEFAULT_GATE_NAMES;
lines.push(...collectGateMetrics(ids));
}
return lines.join("\n") + "\n";
}
@ -122,11 +133,11 @@ export function metricsPath(basePath) {
return join(sfRoot(basePath), "runtime", "uok-metrics.prom");
}
export function writeUokMetrics(basePath) {
/**
 * Render the UOK Prometheus metrics text and persist it to
 * `.sf/runtime/uok-metrics.prom` under the given base path.
 *
 * @param {string} basePath - project base directory
 * @param {string[]} [gateIds] - explicit gate ids to report; forwarded
 *   to the metrics builder (which falls back to defaults when omitted)
 * @returns {string} absolute path of the written metrics file
 */
export function writeUokMetrics(basePath, gateIds) {
  // Make sure the runtime directory exists before writing the snapshot.
  const runtimeDir = join(sfRoot(basePath), "runtime");
  mkdirSync(runtimeDir, { recursive: true });
  const target = metricsPath(basePath);
  writeFileSync(target, buildMetricsText(gateIds), "utf-8");
  return target;
}
@ -134,7 +145,7 @@ export function readUokMetrics(basePath) {
const path = metricsPath(basePath);
if (!existsSync(path)) return null;
try {
return buildMetricsText();
return readFileSync(path, "utf-8");
} catch {
return null;
}