diff --git a/packages/pi-coding-agent/src/modes/interactive/controllers/extension-ui-controller.test.ts b/packages/pi-coding-agent/src/modes/interactive/controllers/extension-ui-controller.test.ts index 28cd53134..7ebf9ff0f 100644 --- a/packages/pi-coding-agent/src/modes/interactive/controllers/extension-ui-controller.test.ts +++ b/packages/pi-coding-agent/src/modes/interactive/controllers/extension-ui-controller.test.ts @@ -104,3 +104,15 @@ test("set_widget_when_widget_host_missing_ignores_factory_without_throwing", () { placement: "belowEditor" }, ); }); + +test("dialog_methods_when_host_dialogs_missing_degrade_without_throwing", async () => { + const ui = createExtensionUIContext({ + // RPC/headless-style hosts may not implement interactive dialogs. + }); + + await assert.doesNotReject(async () => { + assert.equal(await ui.confirm("Proceed?", "Dangerous command"), false); + assert.equal(await ui.select("Pick one", ["a", "b"]), undefined); + assert.equal(await ui.input("Value", "placeholder"), undefined); + }); +}); diff --git a/packages/pi-coding-agent/src/modes/interactive/controllers/extension-ui-controller.ts b/packages/pi-coding-agent/src/modes/interactive/controllers/extension-ui-controller.ts index 648cab229..b443e4253 100644 --- a/packages/pi-coding-agent/src/modes/interactive/controllers/extension-ui-controller.ts +++ b/packages/pi-coding-agent/src/modes/interactive/controllers/extension-ui-controller.ts @@ -96,12 +96,24 @@ function createWidgetSetter( export function createExtensionUIContext(host: any): ExtensionUIContext { const setWidget = createWidgetSetter(host); return { - select: (title, options, opts) => - host.showExtensionSelector(title, options, opts), - confirm: (title, message, opts) => - host.showExtensionConfirm(title, message, opts), - input: (title, placeholder, opts) => - host.showExtensionInput(title, placeholder, opts), + select: (title, options, opts) => { + if (typeof host.showExtensionSelector !== "function") { + return Promise.resolve(undefined); + } + return host.showExtensionSelector(title, options, opts); + }, + confirm: (title, message, opts) => { + if (typeof host.showExtensionConfirm !== "function") { + return Promise.resolve(false); + } + return host.showExtensionConfirm(title, message, opts); + }, + input: (title, placeholder, opts) => { + if (typeof host.showExtensionInput !== "function") { + return Promise.resolve(undefined); + } + return host.showExtensionInput(title, placeholder, opts); + }, notify: (message, type) => notifyHost(host, message, type), onTerminalInput: (handler) => host.addExtensionTerminalInputListener(handler), diff --git a/src/resources/extensions/sf/auto-direct-dispatch.js b/src/resources/extensions/sf/auto-direct-dispatch.js index 70d9d8418..c4758d1f6 100644 --- a/src/resources/extensions/sf/auto-direct-dispatch.js +++ b/src/resources/extensions/sf/auto-direct-dispatch.js @@ -17,12 +17,7 @@ import { } from "./auto-prompts.js"; import { scopeActiveToolsForUnitType } from "./constants.js"; import { loadFile } from "./files.js"; -import { parseRoadmap } from "./parsers.js"; -import { - relSliceFile, - resolveMilestoneFile, - resolveSliceFile, -} from "./paths.js"; +import { relSliceFile, resolveSliceFile } from "./paths.js"; import { loadEffectiveSFPreferences } from "./preferences.js"; import { getMilestoneSlices, isDbAvailable } from "./sf-db.js"; import { deriveState } from "./state.js"; @@ -181,24 +176,17 @@ export async function dispatchDirectPhase(ctx, pi, phase, base) { } case "reassess": case "reassess-roadmap": { - // DB primary path — get completed slices, fall back to file parsing when DB has no data let completedSliceIds = []; if (isDbAvailable()) { completedSliceIds = getMilestoneSlices(mid) .filter((s) => s.status === "complete") .map((s) => s.id); - } - if (completedSliceIds.length === 0) { - // File-based fallback: parse roadmap checkboxes - const roadmapPath = resolveMilestoneFile(base, mid, "ROADMAP"); - if (roadmapPath) { - const roadmapContent = await loadFile(roadmapPath); - if (roadmapContent) { - completedSliceIds = parseRoadmap(roadmapContent) - .slices.filter((s) => s.done) - .map((s) => s.id); - } - } + } else { + ctx.ui.notify( + "Cannot dispatch reassess-roadmap: database unavailable.", + "warning", + ); + return; } if (completedSliceIds.length === 0) { ctx.ui.notify( @@ -223,24 +211,18 @@ export async function dispatchDirectPhase(ctx, pi, phase, base) { // UAT targets the most recently completed slice, not the active (next // incomplete) slice. After slice completion, state.activeSlice advances // to the next incomplete slice, so we find the last done slice from the - // roadmap instead (#1693). + // DB instead (#1693). let uatCompletedSliceIds = []; if (isDbAvailable()) { uatCompletedSliceIds = getMilestoneSlices(mid) .filter((s) => s.status === "complete") .map((s) => s.id); - } - if (uatCompletedSliceIds.length === 0) { - // File-based fallback: parse roadmap checkboxes - const roadmapPath = resolveMilestoneFile(base, mid, "ROADMAP"); - if (roadmapPath) { - const roadmapContent = await loadFile(roadmapPath); - if (roadmapContent) { - uatCompletedSliceIds = parseRoadmap(roadmapContent) - .slices.filter((s) => s.done) - .map((s) => s.id); - } - } + } else { + ctx.ui.notify( + "Cannot dispatch run-uat: database unavailable.", + "warning", + ); + return; } if (uatCompletedSliceIds.length === 0) { ctx.ui.notify( diff --git a/src/resources/extensions/sf/auto-prompts.js b/src/resources/extensions/sf/auto-prompts.js index ef701c238..495b15618 100644 --- a/src/resources/extensions/sf/auto-prompts.js +++ b/src/resources/extensions/sf/auto-prompts.js @@ -24,6 +24,7 @@ import { } from "./files.js"; import { assertGateCoverage, getGatesForTurn } from "./gate-registry.js"; import { inlineGraphSubgraph } from "./graph-context.js"; +import { injectKnowledgeIntPrompt } from "./knowledge-injector.js"; import { formatMemoriesForPrompt, getActiveMemoriesRanked, @@ -66,7 +67,6 @@ import { import { composeInlinedContext } from "./unit-context-composer.js"; import { getUatType, hasVerdict } from "./verdict-parser.js"; import { logWarning } from "./workflow-logger.js"; -import { injectKnowledgeIntPrompt } from "./knowledge-injector.js"; // ─── Preamble Cap ───────────────────────────────────────────────────────────── /** @@ -88,7 +88,7 @@ async function getKnowledgeInjection(basePath, taskContext = {}) { minConfidence: 0.7, minSimilarity: 0.5, }); - } catch (err) { + } catch { // Gracefully degrade if knowledge injection fails return "(knowledge unavailable)"; } diff --git a/src/resources/extensions/sf/auto-recovery.js b/src/resources/extensions/sf/auto-recovery.js index 7e8f75101..db8581557 100644 --- a/src/resources/extensions/sf/auto-recovery.js +++ b/src/resources/extensions/sf/auto-recovery.js @@ -389,9 +389,10 @@ export function verifyExpectedArtifact(unitType, unitId, base) { if (isDbAvailable()) { const tasks = getSliceTasks(mid, sid); if (tasks.length > 0) taskIds = tasks.map((t) => t.id); + else return false; } - if (!taskIds) { - // LEGACY: DB unavailable or no tasks in DB — parse plan file for task IDs + if (!taskIds && !isDbAvailable()) { + // LEGACY: DB unavailable — parse plan file for task IDs. const planContent = readFileSync(absPath, "utf-8"); const plan = parsePlan(planContent); if (plan.tasks.length > 0) taskIds = plan.tasks.map((t) => t.id); @@ -443,9 +444,10 @@ export function verifyExpectedArtifact(unitType, unitId, base) { return false; } } + } else { + // DB available but slice row not found — completion tool never ran. + return false; } - // else: DB available but slice not found — summary + UAT exist, - // treat as verified (slice may not be imported yet) } } // complete-milestone must have produced implementation artifacts (#1703). diff --git a/src/resources/extensions/sf/auto-verification.js b/src/resources/extensions/sf/auto-verification.js index d7ce58a16..c8a979eb4 100644 --- a/src/resources/extensions/sf/auto-verification.js +++ b/src/resources/extensions/sf/auto-verification.js @@ -25,6 +25,7 @@ import { import { isMilestoneComplete } from "./state.js"; import { isClosedStatus } from "./status-guards.js"; import { parseUnitId } from "./unit-id.js"; +import { ChaosMonkeyGate } from "./uok/chaos-monkey.js"; import { CostGuardGate } from "./uok/cost-guard-gate.js"; import { resolveUokFlags } from "./uok/flags.js"; import { UokGateRunner } from "./uok/gate-runner.js"; @@ -357,6 +358,24 @@ export async function runPostUnitVerification(vctx, pauseAuto) { unitId: s.currentUnit.id, }); } + if (uokFlags.chaosMonkey) { + gateRunner.register(new ChaosMonkeyGate({ active: true })); + const cmResult = await gateRunner.run("chaos-monkey", { + basePath: s.basePath, + traceId: `chaos-monkey:${s.currentUnit.id}`, + turnId: s.currentUnit.id, + milestoneId: mid ?? undefined, + sliceId: sid ?? undefined, + taskId: tid ?? undefined, + unitType: s.currentUnit.type, + unitId: s.currentUnit.id, + }); + if (cmResult.outcome === "fail") { + result.passed = false; + result.chaosMonkeyFailure = true; + result.chaosMonkeyRationale = cmResult.rationale; + } + } } // Auto-fix retry preferences const autoFixEnabled = prefs?.verification_auto_fix !== false; @@ -438,6 +457,16 @@ export async function runPostUnitVerification(vctx, pauseAuto) { `verification-gate: cost-guard failure: ${result.costGuardRationale}\n`, ); } + // Log chaos-monkey failures + if (result.chaosMonkeyFailure) { + ctx.ui.notify( + `[verify] CHAOS-MONKEY FAIL — ${result.chaosMonkeyRationale}`, + "error", + ); + process.stderr.write( + `verification-gate: chaos-monkey injected failure: ${result.chaosMonkeyRationale}\n`, + ); + } // Write verification evidence JSON const attempt = s.verificationRetryCount.get(s.currentUnit.id) ?? 0; if (mid && sid && tid) { diff --git a/src/resources/extensions/sf/commands-bootstrap.js b/src/resources/extensions/sf/commands-bootstrap.js index 5ef7d1ae3..5349834d7 100644 --- a/src/resources/extensions/sf/commands-bootstrap.js +++ b/src/resources/extensions/sf/commands-bootstrap.js @@ -35,7 +35,7 @@ const TOP_LEVEL_SUBCOMMANDS = [ { cmd: "run-hook", desc: "Manually trigger a specific hook" }, { cmd: "skill-health", desc: "Skill lifecycle dashboard" }, { cmd: "doctor", desc: "Runtime health checks with auto-fix" }, - { cmd: "uok", desc: "UOK runtime health and ledger status" }, + { cmd: "uok", desc: "UOK runtime health, ledger status, and gate metrics" }, { cmd: "logs", desc: "Browse activity logs, debug logs, and metrics" }, { cmd: "forensics", desc: "Examine execution logs" }, { cmd: "init", desc: "Project init wizard" }, diff --git a/src/resources/extensions/sf/commands-uok.js b/src/resources/extensions/sf/commands-uok.js index 89b2ea6ef..a39ca5100 100644 --- a/src/resources/extensions/sf/commands-uok.js +++ b/src/resources/extensions/sf/commands-uok.js @@ -4,6 +4,8 @@ import { ensureDbOpen } from "./bootstrap/dynamic-tools.js"; import { sfRoot } from "./paths.js"; import { getUokRuns, isDbAvailable } from "./sf-db.js"; import { writeUokDiagnostics } from "./uok/diagnostic-synthesis.js"; +import { UokGateRunner } from "./uok/gate-runner.js"; +import { readUokMetrics, writeUokMetrics } from "./uok/metrics-exposition.js"; import { summarizeParityHealth, writeParityReport, @@ -90,6 +92,15 @@ export async function collectUokStatus( } catch { diagnostics = null; } + let gateHealth = null; + let metricsPath = null; + try { + const runner = new UokGateRunner(); + gateHealth = runner.getHealthSummary(); + metricsPath = writeUokMetrics(basePath); + } catch { + // gate health and metrics are best-effort + } return { dbAvailable, generatedAt: new Date(nowMs).toISOString(), @@ -103,6 +114,8 @@ export async function collectUokStatus( current, historical, diagnostics, + gateHealth, + metricsPath, reportPath: join(sfRoot(basePath), "runtime", "uok-parity-report.json"), }; } @@ -164,6 +177,24 @@ export function formatUokStatus(status, nowMs = Date.now()) { lines.push("Last error: none in ledger"); } lines.push(""); + if (status.gateHealth?.gates?.length > 0) { + lines.push("Gate health (24h):"); + for (const g of status.gateHealth.gates) { + const icon = + g.circuitBreaker === "open" + ? "🔴" + : g.circuitBreaker === "half-open" + ? "🟡" + : "🟢"; + lines.push( + ` ${icon} ${g.id}: ${g.pass} pass / ${g.fail} fail / ${g.retry} retry | cb: ${g.circuitBreaker}${g.failureStreak > 0 ? ` (streak ${g.failureStreak})` : ""}`, + ); + } + lines.push(""); + } + if (status.metricsPath) { + lines.push(`Metrics: ${status.metricsPath}`); + } lines.push(`Report: ${status.reportPath}`); return lines.join("\n"); } @@ -172,11 +203,22 @@ export async function handleUok(args, ctx) { const trimmed = args.trim(); if (trimmed === "help" || trimmed === "--help") { ctx.ui.notify( - "Usage: /sf uok [status|--json]\n\nShows UOK ledger health, last run, last error, historical drift, and startup gate state.", + "Usage: /sf uok [status|metrics|--json]\n\n status — UOK ledger health, last run, last error, historical drift, startup gate, and gate health\n metrics — Render Prometheus-format metrics to .sf/runtime/uok-metrics.prom and display\n --json — Same as status but outputs JSON", "info", ); return; } + if (trimmed === "metrics") { + const basePath = process.cwd(); + const path = writeUokMetrics(basePath); + const text = readUokMetrics(basePath); + ctx.ui.notify( + text ?? "No metrics available (DB unavailable or no gate data)", + "info", + ); + ctx.ui.notify(`Written to: ${path}`, "info"); + return; + } const status = await collectUokStatus(process.cwd()); if (trimmed === "--json" || trimmed === "json") { ctx.ui.notify(JSON.stringify(status, null, 2), "info"); diff --git a/src/resources/extensions/sf/commands/catalog.js b/src/resources/extensions/sf/commands/catalog.js index e08dd8855..2c47d5ce2 100644 --- a/src/resources/extensions/sf/commands/catalog.js +++ b/src/resources/extensions/sf/commands/catalog.js @@ -98,7 +98,7 @@ export const TOP_LEVEL_SUBCOMMANDS = [ { cmd: "doctor", desc: "Runtime health checks with auto-fix" }, { cmd: "uok", - desc: "UOK runtime health: ledger, last run, last error, startup gate", + desc: "UOK runtime health: ledger, last run, last error, startup gate, gate metrics", }, { cmd: "logs", desc: "Browse activity logs, debug logs, and metrics" }, { cmd: "forensics", desc: "Examine execution logs" }, diff --git a/src/resources/extensions/sf/knowledge-injector.js b/src/resources/extensions/sf/knowledge-injector.js index d329aea88..e71dc93df 100644 --- a/src/resources/extensions/sf/knowledge-injector.js +++ b/src/resources/extensions/sf/knowledge-injector.js @@ -32,8 +32,7 @@ import { join } from "node:path"; */ function parseKnowledgeEntries(knowledgeContent) { const entries = []; - const entryPattern = - /### Judgment Entry:\s*(.+?)\n([\s\S]*?)(?=###\s|$)/g; + const entryPattern = /### Judgment Entry:\s*(.+?)\n([\s\S]*?)(?=###\s|$)/g; let match; while ((match = entryPattern.exec(knowledgeContent)) !== null) { @@ -41,9 +40,15 @@ function parseKnowledgeEntries(knowledgeContent) { const body = match[2]; // Extract fields - const evidenceMatch = body.match(/[-*]\s+\*?\*?Evidence:\*?\*?\s*(.+?)(?:\n|$)/); - const confidenceMatch = body.match(/[-*]\s+\*?\*?Confidence:\*?\*?\s*([\d.]+)/); - const domainMatch = body.match(/[-*]\s+\*?\*?Domain:\*?\*?\s*(.+?)(?:\n|$)/); + const evidenceMatch = body.match( + /[-*]\s+\*?\*?Evidence:\*?\*?\s*(.+?)(?:\n|$)/, + ); + const confidenceMatch = body.match( + /[-*]\s+\*?\*?Confidence:\*?\*?\s*([\d.]+)/, + ); + const domainMatch = body.match( + /[-*]\s+\*?\*?Domain:\*?\*?\s*(.+?)(?:\n|$)/, + ); const recommendationMatch = body.match( /[-*]\s+\*?\*?Recommendation:\*?\*?\s*(.+?)(?:\n|$)/, ); @@ -90,9 +95,7 @@ function extractConcepts(entry) { } // Add title keywords - const titleKeywords = entry.title - .split(/\s+/) - .filter((w) => w.length > 3); + const titleKeywords = entry.title.split(/\s+/).filter((w) => w.length > 3); titleKeywords.forEach((w) => concepts.add(w.toLowerCase())); return Array.from(concepts); @@ -235,9 +238,7 @@ function loadKnowledgeFile(basePath) { if (existsSync(p)) { try { return readFileSync(p, "utf-8"); - } catch { - continue; - } + } catch {} } } @@ -257,7 +258,11 @@ function loadKnowledgeFile(basePath) { * * Returns: formatted string suitable for prompt variable substitution */ -export function injectKnowledgeIntPrompt(basePath, taskContext = {}, options = {}) { +export function injectKnowledgeIntPrompt( + basePath, + taskContext = {}, + options = {}, +) { const knowledgeContent = loadKnowledgeFile(basePath); if (!knowledgeContent) { return "(knowledge base unavailable)"; @@ -304,7 +309,7 @@ export function injectKnowledgeIntPrompt(basePath, taskContext = {}, options = { * Purpose: Record which knowledge was actually used in a dispatch so we can * later measure effectiveness and refine knowledge compounding. */ -export function trackKnowledgeUsage(basePath, taskId, injectedKnowledge) { +export function trackKnowledgeUsage(_basePath, taskId, injectedKnowledge) { // This would write to a usage log in .sf/knowledge-usage.jsonl // Implementation deferred to feedback-loop integration return { diff --git a/src/resources/extensions/sf/model-learner.js b/src/resources/extensions/sf/model-learner.js deleted file mode 100644 index 0275da199..000000000 --- a/src/resources/extensions/sf/model-learner.js +++ /dev/null @@ -1,378 +0,0 @@ -/** - * Continuous Model Learning — track per-task-type model performance and - * adaptively route to better-performing models. - * - * Purpose: Make model selection data-driven and adaptive instead of static. - * When a model consistently fails on certain task types, demote it. When a new - * model succeeds where the incumbent fails, promote it. - * - * Consumer: auto-dispatch.ts outcome logging, model-router.ts selection logic, - * benchmark-selector.ts display. - */ - -import { existsSync, readFileSync, writeFileSync, appendFileSync } from "node:fs"; -import { dirname, join } from "node:path"; -import { mkdirSync } from "node:fs"; - -/** - * Per-task-type model performance tracker. - * - * Schema: - * { - * "execute-task": { - * "gpt-4o": { - * "successes": 42, - * "failures": 3, - * "timeouts": 1, - * "totalTokens": 1500000, - * "totalCost": 45.50, - * "lastUsed": "2026-05-06T16:30:00Z", - * "successRate": 0.93 - * }, - * "claude-opus": { - * ... - * } - * }, - * "plan-slice": { ... } - * } - */ -class ModelPerformanceTracker { - constructor(basePath) { - this.basePath = basePath; - this.storagePath = join(basePath, ".sf", "model-performance.json"); - this.data = this._load(); - } - - _load() { - if (!existsSync(this.storagePath)) { - return {}; - } - try { - const content = readFileSync(this.storagePath, "utf-8"); - return JSON.parse(content); - } catch { - return {}; - } - } - - _save() { - try { - const dir = dirname(this.storagePath); - if (!existsSync(dir)) { - mkdirSync(dir, { recursive: true }); - } - writeFileSync( - this.storagePath, - JSON.stringify(this.data, null, 2), - "utf-8", - ); - } catch (err) { - console.error("Failed to save model performance data:", err); - } - } - - /** - * Record outcome for a model on a specific task type. - */ - recordOutcome(taskType, modelId, outcome) { - const { - success, - timeout = false, - tokensUsed = 0, - costUsd = 0, - timestamp = new Date().toISOString(), - } = outcome; - - if (!this.data[taskType]) { - this.data[taskType] = {}; - } - if (!this.data[taskType][modelId]) { - this.data[taskType][modelId] = { - successes: 0, - failures: 0, - timeouts: 0, - totalTokens: 0, - totalCost: 0, - lastUsed: timestamp, - successRate: 0, - }; - } - - const stats = this.data[taskType][modelId]; - if (success) { - stats.successes += 1; - } else if (timeout) { - stats.timeouts += 1; - stats.failures += 1; - } else { - stats.failures += 1; - } - - stats.totalTokens += tokensUsed; - stats.totalCost += costUsd; - stats.lastUsed = timestamp; - - const total = stats.successes + stats.failures; - stats.successRate = total > 0 ? stats.successes / total : 0; - - this._save(); - } - - /** - * Get performance stats for a task type and model. - */ - getStats(taskType, modelId) { - return this.data[taskType]?.[modelId] || null; - } - - /** - * Get all models for a task type, ranked by success rate. - */ - getRankedModels(taskType, minSamples = 3) { - if (!this.data[taskType]) return []; - - const models = Object.entries(this.data[taskType]) - .filter(([, stats]) => stats.successes + stats.failures >= minSamples) - .map(([modelId, stats]) => ({ - modelId, - successRate: stats.successRate, - attempts: stats.successes + stats.failures, - tokens: stats.totalTokens, - cost: stats.totalCost, - latestAttempt: stats.lastUsed, - })) - .sort((a, b) => b.successRate - a.successRate); - - return models; - } - - /** - * Check if a model should be demoted (fails >50% on this task type). - */ - shouldDemote(taskType, modelId, thresholdFailureRate = 0.5) { - const stats = this.getStats(taskType, modelId); - if (!stats) return false; - - const failureRate = 1 - stats.successRate; - const totalAttempts = stats.successes + stats.failures; - - return failureRate > thresholdFailureRate && totalAttempts >= 5; - } - - /** - * Get candidates for A/B testing (new model vs incumbent). - * Returns: { incumbent, challengers: [] } - */ - getABTestCandidates(taskType, minSamples = 3, lowRiskFraction = 0.1) { - const ranked = this.getRankedModels(taskType, minSamples); - if (ranked.length < 2) return null; - - const incumbent = ranked[0]; - const challengers = ranked.slice(1, 3); // Top 2 challengers - - return { - incumbent, - challengers, - testBudget: Math.max(1, Math.ceil(1 / lowRiskFraction)), // E.g., 10 tasks - }; - } - - /** - * Track A/B test results and decide on promotion/demotion. - */ - analyzeABTest(taskType, results) { - // results: { incumbentWins, challengerWins, incumbentAvgLatency, challengerAvgLatency } - const { incumbentWins, challengerWins } = results; - const total = incumbentWins + challengerWins; - - if (total < 5) { - return { recommendation: "inconclusive", reason: "insufficient samples" }; - } - - const challengerSuccessRate = challengerWins / total; - const incumbentSuccessRate = incumbentWins / total; - - if (challengerSuccessRate > incumbentSuccessRate + 0.1) { - return { - recommendation: "promote", - reason: `challenger ${challengerSuccessRate.toFixed(2)} vs incumbent ${incumbentSuccessRate.toFixed(2)}`, - }; - } - - return { - recommendation: "continue", - reason: "incumbent still ahead", - }; - } -} - -/** - * Failure Analyzer — categorize and log why models failed. - * - * Purpose: Understand failure patterns (timeout, quality, cost) to inform - * promotion/demotion decisions. - */ -class FailureAnalyzer { - constructor(basePath) { - this.basePath = basePath; - this.logsPath = join(basePath, ".sf", "model-failure-log.jsonl"); - } - - logFailure(taskType, modelId, failure) { - const { - reason = "unknown", - timeout = false, - tokensUsed = 0, - context = {}, - timestamp = new Date().toISOString(), - } = failure; - - const entry = { - timestamp, - taskType, - modelId, - reason, - timeout, - tokensUsed, - context, - }; - - try { - const dir = dirname(this.logsPath); - if (!existsSync(dir)) { - mkdirSync(dir, { recursive: true }); - } - appendFileSync(this.logsPath, JSON.stringify(entry) + "\n", "utf-8"); - } catch (err) { - console.error("Failed to log model failure:", err); - } - } - - /** - * Get failure summary for a model on a task type. - * Returns: { reasons: { [reason]: count }, patterns: [...] } - */ - getFailureSummary(taskType, modelId) { - if (!existsSync(this.logsPath)) { - return { reasons: {}, patterns: [] }; - } - - try { - const content = readFileSync(this.logsPath, "utf-8"); - const lines = content.trim().split("\n"); - - const reasons = {}; - const failures = []; - - for (const line of lines) { - const entry = JSON.parse(line); - if (entry.taskType !== taskType || entry.modelId !== modelId) continue; - - reasons[entry.reason] = (reasons[entry.reason] || 0) + 1; - failures.push(entry); - } - - // Detect patterns - const patterns = this._detectPatterns(failures); - - return { reasons, patterns }; - } catch { - return { reasons: {}, patterns: [] }; - } - } - - _detectPatterns(failures) { - // Analyze failure distribution to detect systematic issues - const timeoutCount = failures.filter((f) => f.timeout).length; - const patterns = []; - - if (timeoutCount / Math.max(failures.length, 1) > 0.5) { - patterns.push({ - type: "timeout_prone", - severity: "high", - suggestion: "Use shorter timeout or lower batch size", - }); - } - - return patterns; - } -} - -/** - * Main API: Integrate model learning into dispatch workflow. - * - * Usage in auto-dispatch.ts: - * ``` - * const learner = new ModelLearner(projectPath); - * learner.recordOutcome("execute-task", modelUsed, { - * success: taskSucceeded, - * timeout: taskTimedOut, - * tokensUsed: totalTokens, - * costUsd: modelCost, - * }); - * ``` - */ -export class ModelLearner { - constructor(basePath) { - this.basePath = basePath; - this.tracker = new ModelPerformanceTracker(basePath); - this.analyzer = new FailureAnalyzer(basePath); - } - - /** - * Record an outcome for a model on a task. - */ - recordOutcome(taskType, modelId, outcome) { - this.tracker.recordOutcome(taskType, modelId, outcome); - } - - /** - * Log failure details for analysis. - */ - logFailure(taskType, modelId, failure) { - this.analyzer.logFailure(taskType, modelId, failure); - } - - /** - * Get ranked models for a task type (for intelligent routing). - */ - getRankedModels(taskType, minSamples = 3) { - return this.tracker.getRankedModels(taskType, minSamples); - } - - /** - * Decide whether to demote a model. - */ - shouldDemote(taskType, modelId, failureThreshold = 0.5) { - return this.tracker.shouldDemote(taskType, modelId, failureThreshold); - } - - /** - * Get A/B test candidates (for hypothesis testing). - */ - getABTestCandidates(taskType, minSamples = 3) { - return this.tracker.getABTestCandidates(taskType, minSamples); - } - - /** - * Analyze A/B test results. - */ - analyzeABTest(taskType, results) { - return this.tracker.analyzeABTest(taskType, results); - } - - /** - * Get failure analysis for a model. - */ - getFailureAnalysis(taskType, modelId) { - return this.analyzer.getFailureSummary(taskType, modelId); - } -} - -export { ModelPerformanceTracker, FailureAnalyzer }; - -export default { - ModelLearner, - ModelPerformanceTracker, - FailureAnalyzer, -}; diff --git a/src/resources/extensions/sf/prompts/discuss-headless.md b/src/resources/extensions/sf/prompts/discuss-headless.md index f996973fb..96017dfe2 100644 --- a/src/resources/extensions/sf/prompts/discuss-headless.md +++ b/src/resources/extensions/sf/prompts/discuss-headless.md @@ -76,7 +76,7 @@ Before anything else, form a diagnosis: What is the core challenge? What is brok - **Measure coverage**: find untested critical paths - **Scan for dead code, stubs, and commented-out features** — abandoned attempts are signals - **Discover needed skills**: identify repo languages, frameworks, data stores, external services, build tools, and domain-specific competencies. Check installed skills first; record installed, missing, and potentially useful skills in `.sf/CODEBASE.md` and `.sf/PM-STRATEGY.md`. -- **Use code intelligence**: start with in-process `grep`/`find`/`ls` and `lsp` for broad orientation. Use scoped `codebase_search` or `sift_search` as the live code index when the `PROJECT CODE INTELLIGENCE` block says Sift is healthy enough for this repo. Use `.sf/CODEBASE.md` only as fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. If Sift is degraded, slow, empty, or timing out, keep using grep/find/ls, lsp, direct reads, and fallback CODEBASE context. +- **Use code intelligence**: start with in-process `grep`/`find`/`ls` and `lsp` for broad orientation. Use scoped `codebase_search` or `sift_search` as the live code index when the `PROJECT CODE INTELLIGENCE` block says Sift is healthy enough for this repo. Use `.sf/CODEBASE.md` only as fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. If Sift is degraded, slow, empty, or timing out, keep using grep/find/ls, lsp, direct reads, and fallback CODEBASE context. GitHub code search is a scarce remote-only fallback: if the repository is checked out locally, do not use GitHub `/search/code` for that repo; use `git grep` for tracked-file global search, `rg` for broader worktree text search, plus `lsp`, `sift_search`, or `codebase_search`. GitHub's `code_search` bucket is small and separate from normal REST/GraphQL quotas, so dedupe remote queries and treat `403` rate-limit responses as a signal to wait for reset or continue with local evidence. - Use in-process `grep`, `find`, `ls`, and `lsp` before shelling out. Fall back to shell `rg`, `find`, `ast-grep`, or `ls -la` only when the native/in-process tool surface is insufficient. ### Step 2: Check library and ecosystem facts diff --git a/src/resources/extensions/sf/prompts/discuss.md b/src/resources/extensions/sf/prompts/discuss.md index e6fcee6f1..bdcb2746e 100644 --- a/src/resources/extensions/sf/prompts/discuss.md +++ b/src/resources/extensions/sf/prompts/discuss.md @@ -34,7 +34,7 @@ After reflection is confirmed, decide the approach based on the actual scope — Before asking your first question, do a mandatory investigation pass. This is not optional. -1. **Scout the codebase** — start with in-process `grep`, `find`, `ls`, and `lsp` for broad orientation. Use scoped `codebase_search` or `sift_search` as the live code index when the `PROJECT CODE INTELLIGENCE` block says Sift is healthy enough for this repo. Use `.sf/CODEBASE.md` only as durable fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. If Sift is degraded, slow, empty, or timing out, keep using grep/find/ls, lsp, direct reads, and fallback CODEBASE context. Use `scout` for broad unfamiliar areas that need a separate explorer. Understand what already exists, what patterns are established, what constraints current code imposes. +1. **Scout the codebase** — start with in-process `grep`, `find`, `ls`, and `lsp` for broad orientation. Use scoped `codebase_search` or `sift_search` as the live code index when the `PROJECT CODE INTELLIGENCE` block says Sift is healthy enough for this repo. Use `.sf/CODEBASE.md` only as durable fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. If Sift is degraded, slow, empty, or timing out, keep using grep/find/ls, lsp, direct reads, and fallback CODEBASE context. GitHub code search is a scarce remote-only fallback: if the repository is checked out locally, do not use GitHub `/search/code` for that repo; use `git grep` for tracked-file global search, `rg` for broader worktree text search, plus `lsp`, `sift_search`, or `codebase_search`. GitHub's `code_search` bucket is small and separate from normal REST/GraphQL quotas, so dedupe remote queries and treat `403` rate-limit responses as a signal to wait for reset or continue with local evidence. Use `scout` for broad unfamiliar areas that need a separate explorer. Understand what already exists, what patterns are established, what constraints current code imposes. 2. **Check library docs — DeepWiki first.** Use `ask_question` / `read_wiki_structure` / `read_wiki_contents` (DeepWiki) as the default for any GitHub-hosted library or framework the user mentioned. Fall back to `resolve_library` / `get_library_docs` (Context7) for npm/pypi/crates packages DeepWiki doesn't have. **Context7 free tier is capped at 1000 req/month — spend those on cases DeepWiki can't cover.** Get current facts about capabilities, constraints, API shapes, version-specific behavior. 3. **Web search** — `search-the-web` if the domain is unfamiliar, if you need current best practices, or if the user referenced external services/APIs you need facts about. Use `fetch_page` for full content when snippets aren't enough. diff --git a/src/resources/extensions/sf/prompts/gate-evaluate.md b/src/resources/extensions/sf/prompts/gate-evaluate.md index 28e546a63..2eecc2903 100644 --- a/src/resources/extensions/sf/prompts/gate-evaluate.md +++ b/src/resources/extensions/sf/prompts/gate-evaluate.md @@ -18,6 +18,18 @@ You are evaluating **quality gates in parallel** for this slice. Each gate is an {{gateList}} +## Gate Types Reference + +The following gate implementations may be present in this project. Each has distinct failure classes: + +- **`verification-gate`** — Runs lint, typecheck, tests, and post-execution checks. Failure classes: `verification` (check failed), `execution` (runtime/blocking error), `artifact` (post-execution consistency issue). +- **`security-guard`** — Scans for secrets, unsafe patterns, and dependency vulnerabilities. Failure classes: `policy` (secret leaked), `input` (unsafe pattern). +- **`cost-guard`** — Monitors LLM spend against per-unit and per-hour budgets, detects high-tier model failures. Failure classes: `policy` (budget exceeded), `execution` (high-tier model failure). +- **`outcome-learning`** — Queries historical task outcomes for failure-rate anomalies. Failure classes: `policy` (failure rate too high), `input` (model recommendation). +- **`multi-package-healing`** — Detects affected packages from git diff and runs targeted checks. Failure classes: `verification` (package check failed), `execution` (check timeout). +- **`chaos-monkey`** — Stress-tests durability by injecting latency, retryable errors, disk stress, or memory pressure. Failure classes: `execution` (injected fault caused failure). This gate only runs when explicitly enabled (`active: false` by default). +- **`post-execution-checks`** — Cross-task consistency verification after a task completes. Failure classes: `artifact` (consistency violation), `policy` (strict-mode warning escalation). + ## Execution Protocol 1. **Dispatch all gates** using `subagent` in parallel mode. Each subagent prompt is provided below. diff --git a/src/resources/extensions/sf/prompts/guided-research-slice.md b/src/resources/extensions/sf/prompts/guided-research-slice.md index e5be157e6..441b2cfcb 100644 --- a/src/resources/extensions/sf/prompts/guided-research-slice.md +++ b/src/resources/extensions/sf/prompts/guided-research-slice.md @@ -1,4 +1,4 @@ -Research slice {{sliceId}} ("{{sliceTitle}}") of milestone {{milestoneId}}. Read `.sf/DECISIONS.md` if it exists — respect existing decisions, don't contradict them. Read `.sf/REQUIREMENTS.md` if it exists — identify which Active requirements this slice owns or supports and target research toward risks, unknowns, and constraints that could affect delivery of those requirements. {{skillActivation}} Use native `lsp` first for symbol lookup, references, and cross-file navigation. For direct text inspection use `rg`/`find` for targeted reads, or `scout` if the area is broad or unfamiliar. If there are 2-3 independent unknowns, use a research swarm with parallel `scout`/`researcher` subagents and synthesize their findings here; do not swarm narrow sequence-dependent research. Check libraries DeepWiki-first: `ask_question` / `read_wiki_structure` / `read_wiki_contents` for any GitHub-hosted library; fall back to `resolve_library` / `get_library_docs` (Context7, capped at 1000 req/month free) for npm/pypi/crates packages DeepWiki doesn't have. Skip both for libraries already used in this codebase. Use the **Research** output template below. Call `sf_summary_save` with `milestone_id: {{milestoneId}}`, `slice_id: {{sliceId}}`, `artifact_type: "RESEARCH"`, and the research content — the tool writes the file to disk and persists to DB. After `sf_summary_save` succeeds, stop immediately; do **not** call `sf_milestone_generate_id`, `sf_plan_milestone`, `sf_plan_slice`, `sf_plan_task`, or any planning/creation tool. +Research slice {{sliceId}} ("{{sliceTitle}}") of milestone {{milestoneId}}. Read `.sf/DECISIONS.md` if it exists — respect existing decisions, don't contradict them. Read `.sf/REQUIREMENTS.md` if it exists — identify which Active requirements this slice owns or supports and target research toward risks, unknowns, and constraints that could affect delivery of those requirements. {{skillActivation}} Use native `lsp` first for symbol lookup, references, and cross-file navigation. For direct text inspection use `rg`/`find` for targeted reads, or `scout` if the area is broad or unfamiliar. If the repository is checked out locally, GitHub code search is a scarce remote-only fallback: do not use GitHub `/search/code` for that local repo; use `git grep` for tracked-file global search, `rg` for broader worktree text search, plus `lsp`, `sift_search`, or `codebase_search` instead. GitHub's `code_search` bucket is small and separate from normal REST/GraphQL quotas, so use it only for repositories that are not on disk, dedupe repeated queries, and treat `403` rate-limit responses as a signal to wait for reset or continue with local evidence. If there are 2-3 independent unknowns, use a research swarm with parallel `scout`/`researcher` subagents and synthesize their findings here; do not swarm narrow sequence-dependent research. Check libraries DeepWiki-first: `ask_question` / `read_wiki_structure` / `read_wiki_contents` for any GitHub-hosted library; fall back to `resolve_library` / `get_library_docs` (Context7, capped at 1000 req/month free) for npm/pypi/crates packages DeepWiki doesn't have. Skip both for libraries already used in this codebase. Use the **Research** output template below. Call `sf_summary_save` with `milestone_id: {{milestoneId}}`, `slice_id: {{sliceId}}`, `artifact_type: "RESEARCH"`, and the research content — the tool writes the file to disk and persists to DB. After `sf_summary_save` succeeds, stop immediately; do **not** call `sf_milestone_generate_id`, `sf_plan_milestone`, `sf_plan_slice`, `sf_plan_task`, or any planning/creation tool. **You are the scout.** A planner agent reads your output in a fresh context to decompose this slice into tasks. Write for the planner — surface key files, where the work divides naturally, what to build first, and how to verify. If the research doc is vague, the planner re-explores code you already read. If it's precise, the planner decomposes immediately. diff --git a/src/resources/extensions/sf/prompts/research-slice.md b/src/resources/extensions/sf/prompts/research-slice.md index c0c27ab1f..e15d2852e 100644 --- a/src/resources/extensions/sf/prompts/research-slice.md +++ b/src/resources/extensions/sf/prompts/research-slice.md @@ -46,6 +46,7 @@ Research what this slice needs. Narrate key findings and surprises as you go — 2. **Skill Discovery ({{skillDiscoveryMode}}):**{{skillDiscoveryInstructions}} 3. Explore relevant code for this slice's scope. Use native `lsp` first for symbol lookup, references, and cross-file navigation. For direct text inspection, use `rg`, `find`, and reads. For broad or unfamiliar subsystems, use `scout` to map the relevant area first. 3a. Use a research swarm when the slice has 2-3 independent unknowns or subsystems. Dispatch parallel `scout`/`researcher` subagents with distinct lenses, then synthesize what each found into this single RESEARCH artifact. Do not swarm a narrow, sequence-dependent investigation. +3b. **GitHub code search is a scarce remote-only fallback.** When the repository is present in `{{workingDirectory}}`, do not use GitHub `/search/code` for that repo; use local `git grep` for tracked-file global search, `rg` for broader worktree text search, plus `lsp`, `sift_search`, or `codebase_search` as needed. GitHub's `code_search` bucket is small and separate from the normal REST/GraphQL quotas. Use GitHub code search only for repositories that are not checked out locally, dedupe repeated queries, and if it returns `403` rate-limit with a short reset, wait until reset or continue with local evidence. If remote code search is essential and still unavailable, checkpoint `continue`, `blocked`, or `decide` with the missing source named. 4. **Documentation lookup — prefer DeepWiki first.** Use `ask_question` / `read_wiki_structure` / `read_wiki_contents` (DeepWiki) as the default for any GitHub-hosted library or framework — AI-indexed, no free-tier cap. Fall back to `resolve_library` → `get_library_docs` (Context7) for npm/pypi/crates packages DeepWiki doesn't have. **Context7 free tier is capped at 1000 requests/month — spend those on cases DeepWiki can't cover.** Skip both for libraries already used in this codebase. 5. **Web search budget:** You have a limited budget of web searches (max ~15 per session). Use them strategically — try DeepWiki → Context7 → web search in that order. Do NOT repeat the same or similar queries. If a search didn't find what you need, rephrase once or move on. Target 3-5 total web searches for a typical research unit. 6. Use the **Research** output template from the inlined context above — include only sections that have real content. The template is already inlined above; do NOT attempt to read any template file from disk (there is no `templates/SLICE-RESEARCH.md` — the correct template is already present in this prompt). diff --git a/src/resources/extensions/sf/prompts/system.md b/src/resources/extensions/sf/prompts/system.md index d02814049..3282fdf39 100644 --- a/src/resources/extensions/sf/prompts/system.md +++ b/src/resources/extensions/sf/prompts/system.md @@ -161,7 +161,7 @@ Templates showing the expected format for each artifact type are in: **Code navigation:** Use `lsp` for definition, type_definition, implementation, references, incoming_calls, outgoing_calls, hover, signature, symbols, rename, code_actions, format, and diagnostics. Falls back gracefully if no server is available. Never `grep` for a symbol definition when `lsp` can resolve it semantically. Never shell out to prettier/rustfmt/gofmt when `lsp format` is available. After editing code, use `lsp diagnostics` to verify no type errors were introduced. -**Codebase exploration:** Start broad orientation with in-process `grep`, `find`, `ls`, and `lsp`. When the `PROJECT CODE INTELLIGENCE` block says Sift is healthy, use scoped `codebase_search` or `sift_search` as the preferred live code index. Use `.sf/CODEBASE.md` only as fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. For Sift-specific features — explicit strategy selection or planner configuration — use `sift_search` with a scoped `path`. Strategy guide: `bm25` (fast lexical), `path-hybrid` (filename/path-heavy queries), `page-index-hybrid` (stronger recall + reranking), `vector` (semantic-only). Each repo uses its own Sift cache under `.sf/runtime/sift/`; do not rely on a shared/global Sift database. Use `lsp` for structural navigation (definitions, references). Never read files one-by-one to "explore" — search first, then read what's relevant. +**Codebase exploration:** Start broad orientation with in-process `grep`, `find`, `ls`, and `lsp`. When the `PROJECT CODE INTELLIGENCE` block says Sift is healthy, use scoped `codebase_search` or `sift_search` as the preferred live code index. Use `.sf/CODEBASE.md` only as fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. GitHub code search is a scarce remote-only fallback: if the repository is checked out locally, do not use GitHub `/search/code` for that repo; use `git grep` for tracked-file global search, `rg` for broader worktree text search, plus `lsp`, `sift_search`, or `codebase_search`. GitHub's `code_search` bucket is small and separate from normal REST/GraphQL quotas, so dedupe remote queries and treat `403` rate-limit responses as a signal to wait for reset or continue with local evidence. For Sift-specific features — explicit strategy selection or planner configuration — use `sift_search` with a scoped `path`. Strategy guide: `bm25` (fast lexical), `path-hybrid` (filename/path-heavy queries), `page-index-hybrid` (stronger recall + reranking), `vector` (semantic-only). Each repo uses its own Sift cache under `.sf/runtime/sift/`; do not rely on a shared/global Sift database. Use `lsp` for structural navigation (definitions, references). Never read files one-by-one to "explore" — search first, then read what's relevant. **Swarm dispatch:** Let the system decide whether swarming fits before dispatching multiple execution subagents. Use a 2-3 worker same-model swarm only when the work splits into independent shards with explicit file/directory ownership, shard-local verification, low conflict risk, and clear wall-clock savings. Do not swarm shared-interface edits, lockfiles, migrations, single-failure debugging, or sequence-dependent work. The parent agent remains coordinator: assign ownership, synthesize results, inspect dirty files, resolve conflicts, and run final verification. diff --git a/src/resources/extensions/sf/self-report-fixer.js b/src/resources/extensions/sf/self-report-fixer.js index 2267805b4..520daec37 100644 --- a/src/resources/extensions/sf/self-report-fixer.js +++ b/src/resources/extensions/sf/self-report-fixer.js @@ -15,7 +15,7 @@ * 4. Apply fix, test, and mark self-report resolved */ -import { existsSync, readFileSync, writeFileSync } from "node:fs"; +import { existsSync, readFileSync } from "node:fs"; import { join } from "node:path"; /** @@ -25,30 +25,36 @@ import { join } from "node:path"; const FIX_PATTERNS = [ { id: "validation-reviewer-rubric", - pattern: /validation-reviewer.*prompt.*lacks.*rubric|rubric.*criterion.*gap/i, + pattern: + /validation-reviewer.*prompt.*lacks.*rubric|rubric.*criterion.*gap/i, confidence: 0.95, // We fixed this in validation prompts already - description: "Add explicit criterion/implementation-gap rubric to validation-reviewer prompt", + description: + "Add explicit criterion/implementation-gap rubric to validation-reviewer prompt", fix: fixValidationReviewerRubric, }, { id: "gate-verdict-clarity", pattern: /gate.*verdict.*ambiguous|verdict.*semantics.*unclear/i, confidence: 0.9, - description: "Document gate verdict semantics (passed/failed/omitted) in ARCHITECTURE.md", + description: + "Document gate verdict semantics (passed/failed/omitted) in ARCHITECTURE.md", fix: fixGateVerdictSemantics, }, { id: "env-vars-unvalidated", - pattern: /SF_.*env.*vars.*unvalidated|env.*validation.*missing|silent.*config.*missing/i, + pattern: + /SF_.*env.*vars.*unvalidated|env.*validation.*missing|silent.*config.*missing/i, confidence: 0.85, description: "Add runtime validation for SF_* environment variables", fix: fixEnvValidation, }, { id: "self-report-coverage-gap", - pattern: /self-report.*gap|triage.*pipeline.*missing|feedback.*loop.*incomplete/i, + pattern: + /self-report.*gap|triage.*pipeline.*missing|feedback.*loop.*incomplete/i, confidence: 0.8, - description: "Implement automated self-report triage pipeline (this module)", + description: + "Implement automated self-report triage pipeline (this module)", fix: fixSelfReportPipeline, }, ]; @@ -72,11 +78,19 @@ async function fixValidationReviewerRubric(basePath) { // Check if rubric already exists if (content.includes("Gate vs. Task Scope Rubric")) { - return { success: true, alreadyFixed: true, reason: "Rubric already present" }; + return { + success: true, + alreadyFixed: true, + reason: "Rubric already present", + }; } // This is already done in prior session, so just confirm - return { success: true, alreadyFixed: true, reason: "Fix verified in session" }; + return { + success: true, + alreadyFixed: true, + reason: "Fix verified in session", + }; } /** @@ -92,7 +106,11 @@ async function fixGateVerdictSemantics(basePath) { // Check if gate semantics already documented if (content.includes("Gate Verdict Semantics")) { - return { success: true, alreadyFixed: true, reason: "Gate semantics documented" }; + return { + success: true, + alreadyFixed: true, + reason: "Gate semantics documented", + }; } return { success: true, alreadyFixed: true, reason: "Fix already verified" }; @@ -137,7 +155,7 @@ async function fixEnvValidation(basePath) { /** * Attempt to fix: Self-report triage pipeline (this module itself). */ -async function fixSelfReportPipeline(basePath) { +async function fixSelfReportPipeline(_basePath) { const thisFile = new URL(import.meta.url).pathname; if (!existsSync(thisFile)) { return { success: false, reason: "Self-report-fixer module not found" }; @@ -280,16 +298,17 @@ export function generateTriageSummary(reports) { uniqueClusters: clusters.length, deduped: clusters, categorized: categories, - highConfidenceFixes: reports - .flatMap((r) => { - const fixes = classifyReportFixes(r); - return fixes.filter((f) => f.confidence > 0.85).map((f) => ({ + highConfidenceFixes: reports.flatMap((r) => { + const fixes = classifyReportFixes(r); + return fixes + .filter((f) => f.confidence > 0.85) + .map((f) => ({ reportId: r.id, fixId: f.id, description: f.description, confidence: f.confidence, })); - }), + }), }; } diff --git a/src/resources/extensions/sf/sf-db.js b/src/resources/extensions/sf/sf-db.js index 6289b5e56..11cfe6845 100644 --- a/src/resources/extensions/sf/sf-db.js +++ b/src/resources/extensions/sf/sf-db.js @@ -4199,6 +4199,17 @@ export function getGateLatencyStats(gateId, windowHours = 24) { return { total: 0, avgMs: 0, p50Ms: 0, p95Ms: 0, maxMs: 0 }; } } +export function getDistinctGateIds() { + if (!currentDb) return []; + try { + const rows = currentDb + .prepare("SELECT DISTINCT gate_id FROM gate_runs") + .all(); + return rows.map((r) => r.gate_id).filter(Boolean); + } catch { + return []; + } +} function asStringOrNull(value) { return typeof value === "string" && value.length > 0 ? value : null; } diff --git a/src/resources/extensions/sf/skills/researcher/SKILL.md b/src/resources/extensions/sf/skills/researcher/SKILL.md index b8fd6ac8f..df53d3d25 100644 --- a/src/resources/extensions/sf/skills/researcher/SKILL.md +++ b/src/resources/extensions/sf/skills/researcher/SKILL.md @@ -69,6 +69,15 @@ rg --files src/resources/extensions/sf/skills {"query": "sift_request_factory", "strategy": "bm25", "limit": 10} ``` +**GitHub code search — remote-only fallback, not local repo search:** +When the repository is checked out locally, do not use GitHub `/search/code` for +that repo. Use `git grep` for tracked-file global search, `rg` for broader +worktree text search, plus `lsp`, `sift_search`, or `codebase_search` instead. +GitHub's `code_search` bucket is small and separate from normal REST/GraphQL +quotas. Use GitHub code search only for repositories that are not on disk, +dedupe repeated queries, and treat `403` rate-limit responses as a signal to +wait for reset or continue with local evidence. + **SF project database queries:** ```bash # Current milestone and slices diff --git a/src/resources/extensions/sf/tests/knowledge-injector.test.ts b/src/resources/extensions/sf/tests/knowledge-injector.test.ts index 4c473123d..bf4fd86b4 100644 --- a/src/resources/extensions/sf/tests/knowledge-injector.test.ts +++ b/src/resources/extensions/sf/tests/knowledge-injector.test.ts @@ -5,7 +5,7 @@ * and prompt injection work correctly. */ -import { describe, test, expect } from "vitest"; +import { describe, expect, test } from "vitest"; import knowledgeInjector from "../knowledge-injector.js"; const { @@ -208,7 +208,7 @@ describe("knowledge-injector", () => { const contradictions = detectContradictions(entries); // These are compatible tools, not contradictions const realContradictions = contradictions.filter( - (c) => !c.message.includes("suspicious") + (c) => !c.message.includes("suspicious"), ); expect(realContradictions.length).toBe(0); }); @@ -305,7 +305,7 @@ describe("knowledge-injector", () => { const relevant = findRelevantKnowledge(entries, context, 0, 0); if (relevant.length > 0) { - const { score, entry } = relevant[0]; + const { score } = relevant[0]; expect(score).toBeDefined(); expect(score).toBeGreaterThan(0); expect(score).toBeLessThanOrEqual(1); diff --git a/src/resources/extensions/sf/tests/self-report-fixer.test.ts b/src/resources/extensions/sf/tests/self-report-fixer.test.ts index 2a1c55234..b8eccb2c8 100644 --- a/src/resources/extensions/sf/tests/self-report-fixer.test.ts +++ b/src/resources/extensions/sf/tests/self-report-fixer.test.ts @@ -5,11 +5,11 @@ * deduplication, and severity categorization work correctly. */ -import { describe, test, expect } from "vitest"; +import { describe, expect, test } from "vitest"; import { + categorizeBySeverity, classifyReportFixes, dedupReports, - categorizeBySeverity, generateTriageSummary, } from "../self-report-fixer.js"; @@ -132,7 +132,7 @@ describe("self-report-fixer", () => { // Validation reviewer should be blocker const blockers = categorized.blocker; expect( - blockers.some((r) => r.title.toLowerCase().includes("validation")) + blockers.some((r) => r.title.toLowerCase().includes("validation")), ).toBe(true); }); @@ -288,7 +288,7 @@ describe("self-report-fixer", () => { // Recommendation should mention the actual action const recommendation = summary.recommendations[0]; expect(recommendation.toLowerCase()).toMatch( - /rubric|criteria|document|validation/ + /rubric|criteria|document|validation/, ); }); diff --git a/src/resources/extensions/sf/uok/chaos-monkey.js b/src/resources/extensions/sf/uok/chaos-monkey.js index bea8eb193..a3f5ad5cc 100644 --- a/src/resources/extensions/sf/uok/chaos-monkey.js +++ b/src/resources/extensions/sf/uok/chaos-monkey.js @@ -28,6 +28,42 @@ function randomInRange(min, max) { return min + Math.random() * (max - min); } +export class ChaosMonkeyGate { + constructor(options = {}) { + this.id = "chaos-monkey"; + this.type = "chaos"; + this._monkey = new ChaosMonkey(options); + } + + async execute(_ctx, attempt) { + try { + await this._monkey.strike("verification"); + } catch (err) { + return { + outcome: "fail", + failureClass: "execution", + rationale: `Chaos monkey injected fault: ${err instanceof Error ? err.message : String(err)}`, + findings: `Injected during verification phase (attempt ${attempt})`, + }; + } + const events = this._monkey.getInjectedEvents(); + const last = events[events.length - 1]; + if (last && last.phase === "verification") { + return { + outcome: "pass", + failureClass: "none", + rationale: `Chaos monkey injected ${last.type} during verification (non-fatal)`, + findings: `Latency: ${last.delay ?? 0}ms | Disk: ${last.sizeMb ?? 0}MB | Memory: ${last.sizeMb ?? 0}MB`, + }; + } + return { + outcome: "pass", + failureClass: "none", + rationale: "Chaos monkey: no fault injected this run", + }; + } +} + export class ChaosMonkey { constructor(options = {}) { this.active = options.active ?? false; diff --git a/src/resources/extensions/sf/uok/gate-runner.js b/src/resources/extensions/sf/uok/gate-runner.js index a9056161a..5d473e919 100644 --- a/src/resources/extensions/sf/uok/gate-runner.js +++ b/src/resources/extensions/sf/uok/gate-runner.js @@ -1,5 +1,6 @@ import { getGateCircuitBreaker, + getGateRunStats, insertGateRun, updateGateCircuitBreaker, } from "../sf-db.js"; @@ -20,9 +21,16 @@ const RETRY_MATRIX = { unknown: 0, }; -const CIRCUIT_BREAKER_FAILURE_THRESHOLD = 5; -const CIRCUIT_BREAKER_OPEN_DURATION_MS = 60_000; -const CIRCUIT_BREAKER_HALF_OPEN_MAX_ATTEMPTS = 3; +function resolveCircuitBreakerThresholds() { + return { + failureThreshold: + Number(process.env.SF_CIRCUIT_BREAKER_FAILURE_THRESHOLD) || 5, + openDurationMs: + Number(process.env.SF_CIRCUIT_BREAKER_OPEN_DURATION_MS) || 60_000, + halfOpenMaxAttempts: + Number(process.env.SF_CIRCUIT_BREAKER_HALF_OPEN_MAX_ATTEMPTS) || 3, + }; +} function nowIso() { return new Date().toISOString(); @@ -41,11 +49,30 @@ export class UokGateRunner { return Array.from(this.registry.values()); } + getHealthSummary() { + const gates = this.list(); + return { + gates: gates.map((g) => { + const stats = getGateRunStats(g.id, 24); + const cb = getGateCircuitBreaker(g.id); + return { + id: g.id, + type: g.type, + ...stats, + circuitBreaker: cb.state, + failureStreak: cb.failureStreak, + }; + }), + }; + } + _checkCircuitBreaker(gateId) { + const { openDurationMs, halfOpenMaxAttempts } = + resolveCircuitBreakerThresholds(); const breaker = getGateCircuitBreaker(gateId); if (breaker.state === "open") { const openedAt = breaker.openedAt ? Date.parse(breaker.openedAt) : 0; - if (Date.now() - openedAt >= CIRCUIT_BREAKER_OPEN_DURATION_MS) { + if (Date.now() - openedAt >= openDurationMs) { // Transition to half-open automatically after cooldown updateGateCircuitBreaker(gateId, { state: "half-open", @@ -56,11 +83,11 @@ export class UokGateRunner { } return { blocked: true, - reason: `Circuit breaker OPEN for ${gateId} (failure streak ${breaker.failureStreak}). Cooldown until ${new Date(openedAt + CIRCUIT_BREAKER_OPEN_DURATION_MS).toISOString()}.`, + reason: `Circuit breaker OPEN for ${gateId} (failure streak ${breaker.failureStreak}). Cooldown until ${new Date(openedAt + openDurationMs).toISOString()}.`, }; } if (breaker.state === "half-open") { - if (breaker.halfOpenAttempts >= CIRCUIT_BREAKER_HALF_OPEN_MAX_ATTEMPTS) { + if (breaker.halfOpenAttempts >= halfOpenMaxAttempts) { // Too many half-open attempts without success — go back to open updateGateCircuitBreaker(gateId, { state: "open", @@ -100,7 +127,8 @@ export class UokGateRunner { }); return; } - if (nextStreak >= CIRCUIT_BREAKER_FAILURE_THRESHOLD) { + const { failureThreshold } = resolveCircuitBreakerThresholds(); + if (nextStreak >= failureThreshold) { updateGateCircuitBreaker(gateId, { state: "open", failureStreak: nextStreak, diff --git a/src/resources/extensions/sf/uok/metrics-exposition.js b/src/resources/extensions/sf/uok/metrics-exposition.js index 5a5318312..7774932dd 100644 --- a/src/resources/extensions/sf/uok/metrics-exposition.js +++ b/src/resources/extensions/sf/uok/metrics-exposition.js @@ -8,21 +8,26 @@ * Consumer: health widgets, /sf uok status, and external monitoring. */ -import { existsSync, mkdirSync, writeFileSync } from "node:fs"; +import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs"; import { join } from "node:path"; import { sfRoot } from "../paths.js"; import { + getDistinctGateIds, getGateCircuitBreaker, getGateLatencyStats, getGateRunStats, isDbAvailable, } from "../sf-db.js"; -const GATE_NAMES = [ +const DEFAULT_GATE_NAMES = [ "security-guard", "cost-guard", "outcome-learning", "multi-package-healing", + "chaos-monkey", + "verification-gate", + "post-execution-checks", + "milestone-validation-post-check", ]; function fmtCounter(name, value, labels = {}) { @@ -37,9 +42,9 @@ function fmtGauge(name, value, labels = {}) { return fmtCounter(name, value, labels); } -function collectGateMetrics() { +function collectGateMetrics(gateIds) { const lines = []; - for (const gateId of GATE_NAMES) { + for (const gateId of gateIds) { const stats = getGateRunStats(gateId, 24); lines.push( fmtCounter("uok_gate_runs_total", stats.total, { gate_id: gateId }), @@ -89,7 +94,7 @@ function collectGateMetrics() { return lines; } -function buildMetricsText() { +function buildMetricsText(gateIds) { const lines = [ "# HELP uok_gate_runs_total Total gate runs in the last 24h", "# TYPE uok_gate_runs_total counter", @@ -113,7 +118,13 @@ function buildMetricsText() { "# TYPE uok_gate_circuit_breaker_failure_streak gauge", ]; if (isDbAvailable()) { - lines.push(...collectGateMetrics()); + const ids = + gateIds && gateIds.length > 0 + ? gateIds + : getDistinctGateIds().length > 0 + ? getDistinctGateIds() + : DEFAULT_GATE_NAMES; + lines.push(...collectGateMetrics(ids)); } return lines.join("\n") + "\n"; } @@ -122,11 +133,11 @@ export function metricsPath(basePath) { return join(sfRoot(basePath), "runtime", "uok-metrics.prom"); } -export function writeUokMetrics(basePath) { +export function writeUokMetrics(basePath, gateIds) { const path = metricsPath(basePath); const dir = join(sfRoot(basePath), "runtime"); mkdirSync(dir, { recursive: true }); - writeFileSync(path, buildMetricsText(), "utf-8"); + writeFileSync(path, buildMetricsText(gateIds), "utf-8"); return path; } @@ -134,7 +145,7 @@ export function readUokMetrics(basePath) { const path = metricsPath(basePath); if (!existsSync(path)) return null; try { - return buildMetricsText(); + return readFileSync(path, "utf-8"); } catch { return null; }