test: harden uok self-evolution paths
This commit is contained in:
parent
69d3114265
commit
30f8738585
25 changed files with 314 additions and 481 deletions
|
|
@ -104,3 +104,15 @@ test("set_widget_when_widget_host_missing_ignores_factory_without_throwing", ()
|
|||
{ placement: "belowEditor" },
|
||||
);
|
||||
});
|
||||
|
||||
test("dialog_methods_when_host_dialogs_missing_degrade_without_throwing", async () => {
|
||||
const ui = createExtensionUIContext({
|
||||
// RPC/headless-style hosts may not implement interactive dialogs.
|
||||
});
|
||||
|
||||
await assert.doesNotReject(async () => {
|
||||
assert.equal(await ui.confirm("Proceed?", "Dangerous command"), false);
|
||||
assert.equal(await ui.select("Pick one", ["a", "b"]), undefined);
|
||||
assert.equal(await ui.input("Value", "placeholder"), undefined);
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -96,12 +96,24 @@ function createWidgetSetter(
|
|||
export function createExtensionUIContext(host: any): ExtensionUIContext {
|
||||
const setWidget = createWidgetSetter(host);
|
||||
return {
|
||||
select: (title, options, opts) =>
|
||||
host.showExtensionSelector(title, options, opts),
|
||||
confirm: (title, message, opts) =>
|
||||
host.showExtensionConfirm(title, message, opts),
|
||||
input: (title, placeholder, opts) =>
|
||||
host.showExtensionInput(title, placeholder, opts),
|
||||
select: (title, options, opts) => {
|
||||
if (typeof host.showExtensionSelector !== "function") {
|
||||
return Promise.resolve(undefined);
|
||||
}
|
||||
return host.showExtensionSelector(title, options, opts);
|
||||
},
|
||||
confirm: (title, message, opts) => {
|
||||
if (typeof host.showExtensionConfirm !== "function") {
|
||||
return Promise.resolve(false);
|
||||
}
|
||||
return host.showExtensionConfirm(title, message, opts);
|
||||
},
|
||||
input: (title, placeholder, opts) => {
|
||||
if (typeof host.showExtensionInput !== "function") {
|
||||
return Promise.resolve(undefined);
|
||||
}
|
||||
return host.showExtensionInput(title, placeholder, opts);
|
||||
},
|
||||
notify: (message, type) => notifyHost(host, message, type),
|
||||
onTerminalInput: (handler) =>
|
||||
host.addExtensionTerminalInputListener(handler),
|
||||
|
|
|
|||
|
|
@ -17,12 +17,7 @@ import {
|
|||
} from "./auto-prompts.js";
|
||||
import { scopeActiveToolsForUnitType } from "./constants.js";
|
||||
import { loadFile } from "./files.js";
|
||||
import { parseRoadmap } from "./parsers.js";
|
||||
import {
|
||||
relSliceFile,
|
||||
resolveMilestoneFile,
|
||||
resolveSliceFile,
|
||||
} from "./paths.js";
|
||||
import { relSliceFile, resolveSliceFile } from "./paths.js";
|
||||
import { loadEffectiveSFPreferences } from "./preferences.js";
|
||||
import { getMilestoneSlices, isDbAvailable } from "./sf-db.js";
|
||||
import { deriveState } from "./state.js";
|
||||
|
|
@ -181,24 +176,17 @@ export async function dispatchDirectPhase(ctx, pi, phase, base) {
|
|||
}
|
||||
case "reassess":
|
||||
case "reassess-roadmap": {
|
||||
// DB primary path — get completed slices, fall back to file parsing when DB has no data
|
||||
let completedSliceIds = [];
|
||||
if (isDbAvailable()) {
|
||||
completedSliceIds = getMilestoneSlices(mid)
|
||||
.filter((s) => s.status === "complete")
|
||||
.map((s) => s.id);
|
||||
}
|
||||
if (completedSliceIds.length === 0) {
|
||||
// File-based fallback: parse roadmap checkboxes
|
||||
const roadmapPath = resolveMilestoneFile(base, mid, "ROADMAP");
|
||||
if (roadmapPath) {
|
||||
const roadmapContent = await loadFile(roadmapPath);
|
||||
if (roadmapContent) {
|
||||
completedSliceIds = parseRoadmap(roadmapContent)
|
||||
.slices.filter((s) => s.done)
|
||||
.map((s) => s.id);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
ctx.ui.notify(
|
||||
"Cannot dispatch reassess-roadmap: database unavailable.",
|
||||
"warning",
|
||||
);
|
||||
return;
|
||||
}
|
||||
if (completedSliceIds.length === 0) {
|
||||
ctx.ui.notify(
|
||||
|
|
@ -223,24 +211,18 @@ export async function dispatchDirectPhase(ctx, pi, phase, base) {
|
|||
// UAT targets the most recently completed slice, not the active (next
|
||||
// incomplete) slice. After slice completion, state.activeSlice advances
|
||||
// to the next incomplete slice, so we find the last done slice from the
|
||||
// roadmap instead (#1693).
|
||||
// DB instead (#1693).
|
||||
let uatCompletedSliceIds = [];
|
||||
if (isDbAvailable()) {
|
||||
uatCompletedSliceIds = getMilestoneSlices(mid)
|
||||
.filter((s) => s.status === "complete")
|
||||
.map((s) => s.id);
|
||||
}
|
||||
if (uatCompletedSliceIds.length === 0) {
|
||||
// File-based fallback: parse roadmap checkboxes
|
||||
const roadmapPath = resolveMilestoneFile(base, mid, "ROADMAP");
|
||||
if (roadmapPath) {
|
||||
const roadmapContent = await loadFile(roadmapPath);
|
||||
if (roadmapContent) {
|
||||
uatCompletedSliceIds = parseRoadmap(roadmapContent)
|
||||
.slices.filter((s) => s.done)
|
||||
.map((s) => s.id);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
ctx.ui.notify(
|
||||
"Cannot dispatch run-uat: database unavailable.",
|
||||
"warning",
|
||||
);
|
||||
return;
|
||||
}
|
||||
if (uatCompletedSliceIds.length === 0) {
|
||||
ctx.ui.notify(
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ import {
|
|||
} from "./files.js";
|
||||
import { assertGateCoverage, getGatesForTurn } from "./gate-registry.js";
|
||||
import { inlineGraphSubgraph } from "./graph-context.js";
|
||||
import { injectKnowledgeIntPrompt } from "./knowledge-injector.js";
|
||||
import {
|
||||
formatMemoriesForPrompt,
|
||||
getActiveMemoriesRanked,
|
||||
|
|
@ -66,7 +67,6 @@ import {
|
|||
import { composeInlinedContext } from "./unit-context-composer.js";
|
||||
import { getUatType, hasVerdict } from "./verdict-parser.js";
|
||||
import { logWarning } from "./workflow-logger.js";
|
||||
import { injectKnowledgeIntPrompt } from "./knowledge-injector.js";
|
||||
|
||||
// ─── Preamble Cap ─────────────────────────────────────────────────────────────
|
||||
/**
|
||||
|
|
@ -88,7 +88,7 @@ async function getKnowledgeInjection(basePath, taskContext = {}) {
|
|||
minConfidence: 0.7,
|
||||
minSimilarity: 0.5,
|
||||
});
|
||||
} catch (err) {
|
||||
} catch {
|
||||
// Gracefully degrade if knowledge injection fails
|
||||
return "(knowledge unavailable)";
|
||||
}
|
||||
|
|
|
|||
|
|
@ -389,9 +389,10 @@ export function verifyExpectedArtifact(unitType, unitId, base) {
|
|||
if (isDbAvailable()) {
|
||||
const tasks = getSliceTasks(mid, sid);
|
||||
if (tasks.length > 0) taskIds = tasks.map((t) => t.id);
|
||||
else return false;
|
||||
}
|
||||
if (!taskIds) {
|
||||
// LEGACY: DB unavailable or no tasks in DB — parse plan file for task IDs
|
||||
if (!taskIds && !isDbAvailable()) {
|
||||
// LEGACY: DB unavailable — parse plan file for task IDs.
|
||||
const planContent = readFileSync(absPath, "utf-8");
|
||||
const plan = parsePlan(planContent);
|
||||
if (plan.tasks.length > 0) taskIds = plan.tasks.map((t) => t.id);
|
||||
|
|
@ -443,9 +444,10 @@ export function verifyExpectedArtifact(unitType, unitId, base) {
|
|||
return false;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// DB available but slice row not found — completion tool never ran.
|
||||
return false;
|
||||
}
|
||||
// else: DB available but slice not found — summary + UAT exist,
|
||||
// treat as verified (slice may not be imported yet)
|
||||
}
|
||||
}
|
||||
// complete-milestone must have produced implementation artifacts (#1703).
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@ import {
|
|||
import { isMilestoneComplete } from "./state.js";
|
||||
import { isClosedStatus } from "./status-guards.js";
|
||||
import { parseUnitId } from "./unit-id.js";
|
||||
import { ChaosMonkeyGate } from "./uok/chaos-monkey.js";
|
||||
import { CostGuardGate } from "./uok/cost-guard-gate.js";
|
||||
import { resolveUokFlags } from "./uok/flags.js";
|
||||
import { UokGateRunner } from "./uok/gate-runner.js";
|
||||
|
|
@ -357,6 +358,24 @@ export async function runPostUnitVerification(vctx, pauseAuto) {
|
|||
unitId: s.currentUnit.id,
|
||||
});
|
||||
}
|
||||
if (uokFlags.chaosMonkey) {
|
||||
gateRunner.register(new ChaosMonkeyGate({ active: true }));
|
||||
const cmResult = await gateRunner.run("chaos-monkey", {
|
||||
basePath: s.basePath,
|
||||
traceId: `chaos-monkey:${s.currentUnit.id}`,
|
||||
turnId: s.currentUnit.id,
|
||||
milestoneId: mid ?? undefined,
|
||||
sliceId: sid ?? undefined,
|
||||
taskId: tid ?? undefined,
|
||||
unitType: s.currentUnit.type,
|
||||
unitId: s.currentUnit.id,
|
||||
});
|
||||
if (cmResult.outcome === "fail") {
|
||||
result.passed = false;
|
||||
result.chaosMonkeyFailure = true;
|
||||
result.chaosMonkeyRationale = cmResult.rationale;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Auto-fix retry preferences
|
||||
const autoFixEnabled = prefs?.verification_auto_fix !== false;
|
||||
|
|
@ -438,6 +457,16 @@ export async function runPostUnitVerification(vctx, pauseAuto) {
|
|||
`verification-gate: cost-guard failure: ${result.costGuardRationale}\n`,
|
||||
);
|
||||
}
|
||||
// Log chaos-monkey failures
|
||||
if (result.chaosMonkeyFailure) {
|
||||
ctx.ui.notify(
|
||||
`[verify] CHAOS-MONKEY FAIL — ${result.chaosMonkeyRationale}`,
|
||||
"error",
|
||||
);
|
||||
process.stderr.write(
|
||||
`verification-gate: chaos-monkey injected failure: ${result.chaosMonkeyRationale}\n`,
|
||||
);
|
||||
}
|
||||
// Write verification evidence JSON
|
||||
const attempt = s.verificationRetryCount.get(s.currentUnit.id) ?? 0;
|
||||
if (mid && sid && tid) {
|
||||
|
|
|
|||
|
|
@ -35,7 +35,7 @@ const TOP_LEVEL_SUBCOMMANDS = [
|
|||
{ cmd: "run-hook", desc: "Manually trigger a specific hook" },
|
||||
{ cmd: "skill-health", desc: "Skill lifecycle dashboard" },
|
||||
{ cmd: "doctor", desc: "Runtime health checks with auto-fix" },
|
||||
{ cmd: "uok", desc: "UOK runtime health and ledger status" },
|
||||
{ cmd: "uok", desc: "UOK runtime health, ledger status, and gate metrics" },
|
||||
{ cmd: "logs", desc: "Browse activity logs, debug logs, and metrics" },
|
||||
{ cmd: "forensics", desc: "Examine execution logs" },
|
||||
{ cmd: "init", desc: "Project init wizard" },
|
||||
|
|
|
|||
|
|
@ -4,6 +4,8 @@ import { ensureDbOpen } from "./bootstrap/dynamic-tools.js";
|
|||
import { sfRoot } from "./paths.js";
|
||||
import { getUokRuns, isDbAvailable } from "./sf-db.js";
|
||||
import { writeUokDiagnostics } from "./uok/diagnostic-synthesis.js";
|
||||
import { UokGateRunner } from "./uok/gate-runner.js";
|
||||
import { readUokMetrics, writeUokMetrics } from "./uok/metrics-exposition.js";
|
||||
import {
|
||||
summarizeParityHealth,
|
||||
writeParityReport,
|
||||
|
|
@ -90,6 +92,15 @@ export async function collectUokStatus(
|
|||
} catch {
|
||||
diagnostics = null;
|
||||
}
|
||||
let gateHealth = null;
|
||||
let metricsPath = null;
|
||||
try {
|
||||
const runner = new UokGateRunner();
|
||||
gateHealth = runner.getHealthSummary();
|
||||
metricsPath = writeUokMetrics(basePath);
|
||||
} catch {
|
||||
// gate health and metrics are best-effort
|
||||
}
|
||||
return {
|
||||
dbAvailable,
|
||||
generatedAt: new Date(nowMs).toISOString(),
|
||||
|
|
@ -103,6 +114,8 @@ export async function collectUokStatus(
|
|||
current,
|
||||
historical,
|
||||
diagnostics,
|
||||
gateHealth,
|
||||
metricsPath,
|
||||
reportPath: join(sfRoot(basePath), "runtime", "uok-parity-report.json"),
|
||||
};
|
||||
}
|
||||
|
|
@ -164,6 +177,24 @@ export function formatUokStatus(status, nowMs = Date.now()) {
|
|||
lines.push("Last error: none in ledger");
|
||||
}
|
||||
lines.push("");
|
||||
if (status.gateHealth?.gates?.length > 0) {
|
||||
lines.push("Gate health (24h):");
|
||||
for (const g of status.gateHealth.gates) {
|
||||
const icon =
|
||||
g.circuitBreaker === "open"
|
||||
? "🔴"
|
||||
: g.circuitBreaker === "half-open"
|
||||
? "🟡"
|
||||
: "🟢";
|
||||
lines.push(
|
||||
` ${icon} ${g.id}: ${g.pass} pass / ${g.fail} fail / ${g.retry} retry | cb: ${g.circuitBreaker}${g.failureStreak > 0 ? ` (streak ${g.failureStreak})` : ""}`,
|
||||
);
|
||||
}
|
||||
lines.push("");
|
||||
}
|
||||
if (status.metricsPath) {
|
||||
lines.push(`Metrics: ${status.metricsPath}`);
|
||||
}
|
||||
lines.push(`Report: ${status.reportPath}`);
|
||||
return lines.join("\n");
|
||||
}
|
||||
|
|
@ -172,11 +203,22 @@ export async function handleUok(args, ctx) {
|
|||
const trimmed = args.trim();
|
||||
if (trimmed === "help" || trimmed === "--help") {
|
||||
ctx.ui.notify(
|
||||
"Usage: /sf uok [status|--json]\n\nShows UOK ledger health, last run, last error, historical drift, and startup gate state.",
|
||||
"Usage: /sf uok [status|metrics|--json]\n\n status — UOK ledger health, last run, last error, historical drift, startup gate, and gate health\n metrics — Render Prometheus-format metrics to .sf/runtime/uok-metrics.prom and display\n --json — Same as status but outputs JSON",
|
||||
"info",
|
||||
);
|
||||
return;
|
||||
}
|
||||
if (trimmed === "metrics") {
|
||||
const basePath = process.cwd();
|
||||
const path = writeUokMetrics(basePath);
|
||||
const text = readUokMetrics(basePath);
|
||||
ctx.ui.notify(
|
||||
text ?? "No metrics available (DB unavailable or no gate data)",
|
||||
"info",
|
||||
);
|
||||
ctx.ui.notify(`Written to: ${path}`, "info");
|
||||
return;
|
||||
}
|
||||
const status = await collectUokStatus(process.cwd());
|
||||
if (trimmed === "--json" || trimmed === "json") {
|
||||
ctx.ui.notify(JSON.stringify(status, null, 2), "info");
|
||||
|
|
|
|||
|
|
@ -98,7 +98,7 @@ export const TOP_LEVEL_SUBCOMMANDS = [
|
|||
{ cmd: "doctor", desc: "Runtime health checks with auto-fix" },
|
||||
{
|
||||
cmd: "uok",
|
||||
desc: "UOK runtime health: ledger, last run, last error, startup gate",
|
||||
desc: "UOK runtime health: ledger, last run, last error, startup gate, gate metrics",
|
||||
},
|
||||
{ cmd: "logs", desc: "Browse activity logs, debug logs, and metrics" },
|
||||
{ cmd: "forensics", desc: "Examine execution logs" },
|
||||
|
|
|
|||
|
|
@ -32,8 +32,7 @@ import { join } from "node:path";
|
|||
*/
|
||||
function parseKnowledgeEntries(knowledgeContent) {
|
||||
const entries = [];
|
||||
const entryPattern =
|
||||
/### Judgment Entry:\s*(.+?)\n([\s\S]*?)(?=###\s|$)/g;
|
||||
const entryPattern = /### Judgment Entry:\s*(.+?)\n([\s\S]*?)(?=###\s|$)/g;
|
||||
|
||||
let match;
|
||||
while ((match = entryPattern.exec(knowledgeContent)) !== null) {
|
||||
|
|
@ -41,9 +40,15 @@ function parseKnowledgeEntries(knowledgeContent) {
|
|||
const body = match[2];
|
||||
|
||||
// Extract fields
|
||||
const evidenceMatch = body.match(/[-*]\s+\*?\*?Evidence:\*?\*?\s*(.+?)(?:\n|$)/);
|
||||
const confidenceMatch = body.match(/[-*]\s+\*?\*?Confidence:\*?\*?\s*([\d.]+)/);
|
||||
const domainMatch = body.match(/[-*]\s+\*?\*?Domain:\*?\*?\s*(.+?)(?:\n|$)/);
|
||||
const evidenceMatch = body.match(
|
||||
/[-*]\s+\*?\*?Evidence:\*?\*?\s*(.+?)(?:\n|$)/,
|
||||
);
|
||||
const confidenceMatch = body.match(
|
||||
/[-*]\s+\*?\*?Confidence:\*?\*?\s*([\d.]+)/,
|
||||
);
|
||||
const domainMatch = body.match(
|
||||
/[-*]\s+\*?\*?Domain:\*?\*?\s*(.+?)(?:\n|$)/,
|
||||
);
|
||||
const recommendationMatch = body.match(
|
||||
/[-*]\s+\*?\*?Recommendation:\*?\*?\s*(.+?)(?:\n|$)/,
|
||||
);
|
||||
|
|
@ -90,9 +95,7 @@ function extractConcepts(entry) {
|
|||
}
|
||||
|
||||
// Add title keywords
|
||||
const titleKeywords = entry.title
|
||||
.split(/\s+/)
|
||||
.filter((w) => w.length > 3);
|
||||
const titleKeywords = entry.title.split(/\s+/).filter((w) => w.length > 3);
|
||||
titleKeywords.forEach((w) => concepts.add(w.toLowerCase()));
|
||||
|
||||
return Array.from(concepts);
|
||||
|
|
@ -235,9 +238,7 @@ function loadKnowledgeFile(basePath) {
|
|||
if (existsSync(p)) {
|
||||
try {
|
||||
return readFileSync(p, "utf-8");
|
||||
} catch {
|
||||
continue;
|
||||
}
|
||||
} catch {}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -257,7 +258,11 @@ function loadKnowledgeFile(basePath) {
|
|||
*
|
||||
* Returns: formatted string suitable for prompt variable substitution
|
||||
*/
|
||||
export function injectKnowledgeIntPrompt(basePath, taskContext = {}, options = {}) {
|
||||
export function injectKnowledgeIntPrompt(
|
||||
basePath,
|
||||
taskContext = {},
|
||||
options = {},
|
||||
) {
|
||||
const knowledgeContent = loadKnowledgeFile(basePath);
|
||||
if (!knowledgeContent) {
|
||||
return "(knowledge base unavailable)";
|
||||
|
|
@ -304,7 +309,7 @@ export function injectKnowledgeIntPrompt(basePath, taskContext = {}, options = {
|
|||
* Purpose: Record which knowledge was actually used in a dispatch so we can
|
||||
* later measure effectiveness and refine knowledge compounding.
|
||||
*/
|
||||
export function trackKnowledgeUsage(basePath, taskId, injectedKnowledge) {
|
||||
export function trackKnowledgeUsage(_basePath, taskId, injectedKnowledge) {
|
||||
// This would write to a usage log in .sf/knowledge-usage.jsonl
|
||||
// Implementation deferred to feedback-loop integration
|
||||
return {
|
||||
|
|
|
|||
|
|
@ -1,378 +0,0 @@
|
|||
/**
|
||||
* Continuous Model Learning — track per-task-type model performance and
|
||||
* adaptively route to better-performing models.
|
||||
*
|
||||
* Purpose: Make model selection data-driven and adaptive instead of static.
|
||||
* When a model consistently fails on certain task types, demote it. When a new
|
||||
* model succeeds where the incumbent fails, promote it.
|
||||
*
|
||||
* Consumer: auto-dispatch.ts outcome logging, model-router.ts selection logic,
|
||||
* benchmark-selector.ts display.
|
||||
*/
|
||||
|
||||
import { existsSync, readFileSync, writeFileSync, appendFileSync } from "node:fs";
|
||||
import { dirname, join } from "node:path";
|
||||
import { mkdirSync } from "node:fs";
|
||||
|
||||
/**
|
||||
* Per-task-type model performance tracker.
|
||||
*
|
||||
* Schema:
|
||||
* {
|
||||
* "execute-task": {
|
||||
* "gpt-4o": {
|
||||
* "successes": 42,
|
||||
* "failures": 3,
|
||||
* "timeouts": 1,
|
||||
* "totalTokens": 1500000,
|
||||
* "totalCost": 45.50,
|
||||
* "lastUsed": "2026-05-06T16:30:00Z",
|
||||
* "successRate": 0.93
|
||||
* },
|
||||
* "claude-opus": {
|
||||
* ...
|
||||
* }
|
||||
* },
|
||||
* "plan-slice": { ... }
|
||||
* }
|
||||
*/
|
||||
class ModelPerformanceTracker {
|
||||
constructor(basePath) {
|
||||
this.basePath = basePath;
|
||||
this.storagePath = join(basePath, ".sf", "model-performance.json");
|
||||
this.data = this._load();
|
||||
}
|
||||
|
||||
_load() {
|
||||
if (!existsSync(this.storagePath)) {
|
||||
return {};
|
||||
}
|
||||
try {
|
||||
const content = readFileSync(this.storagePath, "utf-8");
|
||||
return JSON.parse(content);
|
||||
} catch {
|
||||
return {};
|
||||
}
|
||||
}
|
||||
|
||||
_save() {
|
||||
try {
|
||||
const dir = dirname(this.storagePath);
|
||||
if (!existsSync(dir)) {
|
||||
mkdirSync(dir, { recursive: true });
|
||||
}
|
||||
writeFileSync(
|
||||
this.storagePath,
|
||||
JSON.stringify(this.data, null, 2),
|
||||
"utf-8",
|
||||
);
|
||||
} catch (err) {
|
||||
console.error("Failed to save model performance data:", err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Record outcome for a model on a specific task type.
|
||||
*/
|
||||
recordOutcome(taskType, modelId, outcome) {
|
||||
const {
|
||||
success,
|
||||
timeout = false,
|
||||
tokensUsed = 0,
|
||||
costUsd = 0,
|
||||
timestamp = new Date().toISOString(),
|
||||
} = outcome;
|
||||
|
||||
if (!this.data[taskType]) {
|
||||
this.data[taskType] = {};
|
||||
}
|
||||
if (!this.data[taskType][modelId]) {
|
||||
this.data[taskType][modelId] = {
|
||||
successes: 0,
|
||||
failures: 0,
|
||||
timeouts: 0,
|
||||
totalTokens: 0,
|
||||
totalCost: 0,
|
||||
lastUsed: timestamp,
|
||||
successRate: 0,
|
||||
};
|
||||
}
|
||||
|
||||
const stats = this.data[taskType][modelId];
|
||||
if (success) {
|
||||
stats.successes += 1;
|
||||
} else if (timeout) {
|
||||
stats.timeouts += 1;
|
||||
stats.failures += 1;
|
||||
} else {
|
||||
stats.failures += 1;
|
||||
}
|
||||
|
||||
stats.totalTokens += tokensUsed;
|
||||
stats.totalCost += costUsd;
|
||||
stats.lastUsed = timestamp;
|
||||
|
||||
const total = stats.successes + stats.failures;
|
||||
stats.successRate = total > 0 ? stats.successes / total : 0;
|
||||
|
||||
this._save();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get performance stats for a task type and model.
|
||||
*/
|
||||
getStats(taskType, modelId) {
|
||||
return this.data[taskType]?.[modelId] || null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all models for a task type, ranked by success rate.
|
||||
*/
|
||||
getRankedModels(taskType, minSamples = 3) {
|
||||
if (!this.data[taskType]) return [];
|
||||
|
||||
const models = Object.entries(this.data[taskType])
|
||||
.filter(([, stats]) => stats.successes + stats.failures >= minSamples)
|
||||
.map(([modelId, stats]) => ({
|
||||
modelId,
|
||||
successRate: stats.successRate,
|
||||
attempts: stats.successes + stats.failures,
|
||||
tokens: stats.totalTokens,
|
||||
cost: stats.totalCost,
|
||||
latestAttempt: stats.lastUsed,
|
||||
}))
|
||||
.sort((a, b) => b.successRate - a.successRate);
|
||||
|
||||
return models;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a model should be demoted (fails >50% on this task type).
|
||||
*/
|
||||
shouldDemote(taskType, modelId, thresholdFailureRate = 0.5) {
|
||||
const stats = this.getStats(taskType, modelId);
|
||||
if (!stats) return false;
|
||||
|
||||
const failureRate = 1 - stats.successRate;
|
||||
const totalAttempts = stats.successes + stats.failures;
|
||||
|
||||
return failureRate > thresholdFailureRate && totalAttempts >= 5;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get candidates for A/B testing (new model vs incumbent).
|
||||
* Returns: { incumbent, challengers: [] }
|
||||
*/
|
||||
getABTestCandidates(taskType, minSamples = 3, lowRiskFraction = 0.1) {
|
||||
const ranked = this.getRankedModels(taskType, minSamples);
|
||||
if (ranked.length < 2) return null;
|
||||
|
||||
const incumbent = ranked[0];
|
||||
const challengers = ranked.slice(1, 3); // Top 2 challengers
|
||||
|
||||
return {
|
||||
incumbent,
|
||||
challengers,
|
||||
testBudget: Math.max(1, Math.ceil(1 / lowRiskFraction)), // E.g., 10 tasks
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Track A/B test results and decide on promotion/demotion.
|
||||
*/
|
||||
analyzeABTest(taskType, results) {
|
||||
// results: { incumbentWins, challengerWins, incumbentAvgLatency, challengerAvgLatency }
|
||||
const { incumbentWins, challengerWins } = results;
|
||||
const total = incumbentWins + challengerWins;
|
||||
|
||||
if (total < 5) {
|
||||
return { recommendation: "inconclusive", reason: "insufficient samples" };
|
||||
}
|
||||
|
||||
const challengerSuccessRate = challengerWins / total;
|
||||
const incumbentSuccessRate = incumbentWins / total;
|
||||
|
||||
if (challengerSuccessRate > incumbentSuccessRate + 0.1) {
|
||||
return {
|
||||
recommendation: "promote",
|
||||
reason: `challenger ${challengerSuccessRate.toFixed(2)} vs incumbent ${incumbentSuccessRate.toFixed(2)}`,
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
recommendation: "continue",
|
||||
reason: "incumbent still ahead",
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Failure Analyzer — categorize and log why models failed.
|
||||
*
|
||||
* Purpose: Understand failure patterns (timeout, quality, cost) to inform
|
||||
* promotion/demotion decisions.
|
||||
*/
|
||||
class FailureAnalyzer {
|
||||
constructor(basePath) {
|
||||
this.basePath = basePath;
|
||||
this.logsPath = join(basePath, ".sf", "model-failure-log.jsonl");
|
||||
}
|
||||
|
||||
logFailure(taskType, modelId, failure) {
|
||||
const {
|
||||
reason = "unknown",
|
||||
timeout = false,
|
||||
tokensUsed = 0,
|
||||
context = {},
|
||||
timestamp = new Date().toISOString(),
|
||||
} = failure;
|
||||
|
||||
const entry = {
|
||||
timestamp,
|
||||
taskType,
|
||||
modelId,
|
||||
reason,
|
||||
timeout,
|
||||
tokensUsed,
|
||||
context,
|
||||
};
|
||||
|
||||
try {
|
||||
const dir = dirname(this.logsPath);
|
||||
if (!existsSync(dir)) {
|
||||
mkdirSync(dir, { recursive: true });
|
||||
}
|
||||
appendFileSync(this.logsPath, JSON.stringify(entry) + "\n", "utf-8");
|
||||
} catch (err) {
|
||||
console.error("Failed to log model failure:", err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get failure summary for a model on a task type.
|
||||
* Returns: { reasons: { [reason]: count }, patterns: [...] }
|
||||
*/
|
||||
getFailureSummary(taskType, modelId) {
|
||||
if (!existsSync(this.logsPath)) {
|
||||
return { reasons: {}, patterns: [] };
|
||||
}
|
||||
|
||||
try {
|
||||
const content = readFileSync(this.logsPath, "utf-8");
|
||||
const lines = content.trim().split("\n");
|
||||
|
||||
const reasons = {};
|
||||
const failures = [];
|
||||
|
||||
for (const line of lines) {
|
||||
const entry = JSON.parse(line);
|
||||
if (entry.taskType !== taskType || entry.modelId !== modelId) continue;
|
||||
|
||||
reasons[entry.reason] = (reasons[entry.reason] || 0) + 1;
|
||||
failures.push(entry);
|
||||
}
|
||||
|
||||
// Detect patterns
|
||||
const patterns = this._detectPatterns(failures);
|
||||
|
||||
return { reasons, patterns };
|
||||
} catch {
|
||||
return { reasons: {}, patterns: [] };
|
||||
}
|
||||
}
|
||||
|
||||
_detectPatterns(failures) {
|
||||
// Analyze failure distribution to detect systematic issues
|
||||
const timeoutCount = failures.filter((f) => f.timeout).length;
|
||||
const patterns = [];
|
||||
|
||||
if (timeoutCount / Math.max(failures.length, 1) > 0.5) {
|
||||
patterns.push({
|
||||
type: "timeout_prone",
|
||||
severity: "high",
|
||||
suggestion: "Use shorter timeout or lower batch size",
|
||||
});
|
||||
}
|
||||
|
||||
return patterns;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Main API: Integrate model learning into dispatch workflow.
|
||||
*
|
||||
* Usage in auto-dispatch.ts:
|
||||
* ```
|
||||
* const learner = new ModelLearner(projectPath);
|
||||
* learner.recordOutcome("execute-task", modelUsed, {
|
||||
* success: taskSucceeded,
|
||||
* timeout: taskTimedOut,
|
||||
* tokensUsed: totalTokens,
|
||||
* costUsd: modelCost,
|
||||
* });
|
||||
* ```
|
||||
*/
|
||||
export class ModelLearner {
|
||||
constructor(basePath) {
|
||||
this.basePath = basePath;
|
||||
this.tracker = new ModelPerformanceTracker(basePath);
|
||||
this.analyzer = new FailureAnalyzer(basePath);
|
||||
}
|
||||
|
||||
/**
|
||||
* Record an outcome for a model on a task.
|
||||
*/
|
||||
recordOutcome(taskType, modelId, outcome) {
|
||||
this.tracker.recordOutcome(taskType, modelId, outcome);
|
||||
}
|
||||
|
||||
/**
|
||||
* Log failure details for analysis.
|
||||
*/
|
||||
logFailure(taskType, modelId, failure) {
|
||||
this.analyzer.logFailure(taskType, modelId, failure);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get ranked models for a task type (for intelligent routing).
|
||||
*/
|
||||
getRankedModels(taskType, minSamples = 3) {
|
||||
return this.tracker.getRankedModels(taskType, minSamples);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decide whether to demote a model.
|
||||
*/
|
||||
shouldDemote(taskType, modelId, failureThreshold = 0.5) {
|
||||
return this.tracker.shouldDemote(taskType, modelId, failureThreshold);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get A/B test candidates (for hypothesis testing).
|
||||
*/
|
||||
getABTestCandidates(taskType, minSamples = 3) {
|
||||
return this.tracker.getABTestCandidates(taskType, minSamples);
|
||||
}
|
||||
|
||||
/**
|
||||
* Analyze A/B test results.
|
||||
*/
|
||||
analyzeABTest(taskType, results) {
|
||||
return this.tracker.analyzeABTest(taskType, results);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get failure analysis for a model.
|
||||
*/
|
||||
getFailureAnalysis(taskType, modelId) {
|
||||
return this.analyzer.getFailureSummary(taskType, modelId);
|
||||
}
|
||||
}
|
||||
|
||||
export { ModelPerformanceTracker, FailureAnalyzer };
|
||||
|
||||
export default {
|
||||
ModelLearner,
|
||||
ModelPerformanceTracker,
|
||||
FailureAnalyzer,
|
||||
};
|
||||
|
|
@ -76,7 +76,7 @@ Before anything else, form a diagnosis: What is the core challenge? What is brok
|
|||
- **Measure coverage**: find untested critical paths
|
||||
- **Scan for dead code, stubs, and commented-out features** — abandoned attempts are signals
|
||||
- **Discover needed skills**: identify repo languages, frameworks, data stores, external services, build tools, and domain-specific competencies. Check installed skills first; record installed, missing, and potentially useful skills in `.sf/CODEBASE.md` and `.sf/PM-STRATEGY.md`.
|
||||
- **Use code intelligence**: start with in-process `grep`/`find`/`ls` and `lsp` for broad orientation. Use scoped `codebase_search` or `sift_search` as the live code index when the `PROJECT CODE INTELLIGENCE` block says Sift is healthy enough for this repo. Use `.sf/CODEBASE.md` only as fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. If Sift is degraded, slow, empty, or timing out, keep using grep/find/ls, lsp, direct reads, and fallback CODEBASE context.
|
||||
- **Use code intelligence**: start with in-process `grep`/`find`/`ls` and `lsp` for broad orientation. Use scoped `codebase_search` or `sift_search` as the live code index when the `PROJECT CODE INTELLIGENCE` block says Sift is healthy enough for this repo. Use `.sf/CODEBASE.md` only as fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. If Sift is degraded, slow, empty, or timing out, keep using grep/find/ls, lsp, direct reads, and fallback CODEBASE context. GitHub code search is a scarce remote-only fallback: if the repository is checked out locally, do not use GitHub `/search/code` for that repo; use `git grep` for tracked-file global search, `rg` for broader worktree text search, plus `lsp`, `sift_search`, or `codebase_search`. GitHub's `code_search` bucket is small and separate from normal REST/GraphQL quotas, so dedupe remote queries and treat `403` rate-limit responses as a signal to wait for reset or continue with local evidence.
|
||||
- Use in-process `grep`, `find`, `ls`, and `lsp` before shelling out. Fall back to shell `rg`, `find`, `ast-grep`, or `ls -la` only when the native/in-process tool surface is insufficient.
|
||||
|
||||
### Step 2: Check library and ecosystem facts
|
||||
|
|
|
|||
|
|
@ -34,7 +34,7 @@ After reflection is confirmed, decide the approach based on the actual scope —
|
|||
|
||||
Before asking your first question, do a mandatory investigation pass. This is not optional.
|
||||
|
||||
1. **Scout the codebase** — start with in-process `grep`, `find`, `ls`, and `lsp` for broad orientation. Use scoped `codebase_search` or `sift_search` as the live code index when the `PROJECT CODE INTELLIGENCE` block says Sift is healthy enough for this repo. Use `.sf/CODEBASE.md` only as durable fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. If Sift is degraded, slow, empty, or timing out, keep using grep/find/ls, lsp, direct reads, and fallback CODEBASE context. Use `scout` for broad unfamiliar areas that need a separate explorer. Understand what already exists, what patterns are established, what constraints current code imposes.
|
||||
1. **Scout the codebase** — start with in-process `grep`, `find`, `ls`, and `lsp` for broad orientation. Use scoped `codebase_search` or `sift_search` as the live code index when the `PROJECT CODE INTELLIGENCE` block says Sift is healthy enough for this repo. Use `.sf/CODEBASE.md` only as durable fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. If Sift is degraded, slow, empty, or timing out, keep using grep/find/ls, lsp, direct reads, and fallback CODEBASE context. GitHub code search is a scarce remote-only fallback: if the repository is checked out locally, do not use GitHub `/search/code` for that repo; use `git grep` for tracked-file global search, `rg` for broader worktree text search, plus `lsp`, `sift_search`, or `codebase_search`. GitHub's `code_search` bucket is small and separate from normal REST/GraphQL quotas, so dedupe remote queries and treat `403` rate-limit responses as a signal to wait for reset or continue with local evidence. Use `scout` for broad unfamiliar areas that need a separate explorer. Understand what already exists, what patterns are established, what constraints current code imposes.
|
||||
2. **Check library docs — DeepWiki first.** Use `ask_question` / `read_wiki_structure` / `read_wiki_contents` (DeepWiki) as the default for any GitHub-hosted library or framework the user mentioned. Fall back to `resolve_library` / `get_library_docs` (Context7) for npm/pypi/crates packages DeepWiki doesn't have. **Context7 free tier is capped at 1000 req/month — spend those on cases DeepWiki can't cover.** Get current facts about capabilities, constraints, API shapes, version-specific behavior.
|
||||
3. **Web search** — `search-the-web` if the domain is unfamiliar, if you need current best practices, or if the user referenced external services/APIs you need facts about. Use `fetch_page` for full content when snippets aren't enough.
|
||||
|
||||
|
|
|
|||
|
|
@ -18,6 +18,18 @@ You are evaluating **quality gates in parallel** for this slice. Each gate is an
|
|||
|
||||
{{gateList}}
|
||||
|
||||
## Gate Types Reference
|
||||
|
||||
The following gate implementations may be present in this project. Each has distinct failure classes:
|
||||
|
||||
- **`verification-gate`** — Runs lint, typecheck, tests, and post-execution checks. Failure classes: `verification` (check failed), `execution` (runtime/blocking error), `artifact` (post-execution consistency issue).
|
||||
- **`security-guard`** — Scans for secrets, unsafe patterns, and dependency vulnerabilities. Failure classes: `policy` (secret leaked), `input` (unsafe pattern).
|
||||
- **`cost-guard`** — Monitors LLM spend against per-unit and per-hour budgets, detects high-tier model failures. Failure classes: `policy` (budget exceeded), `execution` (high-tier model failure).
|
||||
- **`outcome-learning`** — Queries historical task outcomes for failure-rate anomalies. Failure classes: `policy` (failure rate too high), `input` (model recommendation).
|
||||
- **`multi-package-healing`** — Detects affected packages from git diff and runs targeted checks. Failure classes: `verification` (package check failed), `execution` (check timeout).
|
||||
- **`chaos-monkey`** — Stress-tests durability by injecting latency, retryable errors, disk stress, or memory pressure. Failure classes: `execution` (injected fault caused failure). This gate only runs when explicitly enabled (`active: false` by default).
|
||||
- **`post-execution-checks`** — Cross-task consistency verification after a task completes. Failure classes: `artifact` (consistency violation), `policy` (strict-mode warning escalation).
|
||||
|
||||
## Execution Protocol
|
||||
|
||||
1. **Dispatch all gates** using `subagent` in parallel mode. Each subagent prompt is provided below.
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
Research slice {{sliceId}} ("{{sliceTitle}}") of milestone {{milestoneId}}. Read `.sf/DECISIONS.md` if it exists — respect existing decisions, don't contradict them. Read `.sf/REQUIREMENTS.md` if it exists — identify which Active requirements this slice owns or supports and target research toward risks, unknowns, and constraints that could affect delivery of those requirements. {{skillActivation}} Use native `lsp` first for symbol lookup, references, and cross-file navigation. For direct text inspection use `rg`/`find` for targeted reads, or `scout` if the area is broad or unfamiliar. If there are 2-3 independent unknowns, use a research swarm with parallel `scout`/`researcher` subagents and synthesize their findings here; do not swarm narrow sequence-dependent research. Check libraries DeepWiki-first: `ask_question` / `read_wiki_structure` / `read_wiki_contents` for any GitHub-hosted library; fall back to `resolve_library` / `get_library_docs` (Context7, capped at 1000 req/month free) for npm/pypi/crates packages DeepWiki doesn't have. Skip both for libraries already used in this codebase. Use the **Research** output template below. Call `sf_summary_save` with `milestone_id: {{milestoneId}}`, `slice_id: {{sliceId}}`, `artifact_type: "RESEARCH"`, and the research content — the tool writes the file to disk and persists to DB. After `sf_summary_save` succeeds, stop immediately; do **not** call `sf_milestone_generate_id`, `sf_plan_milestone`, `sf_plan_slice`, `sf_plan_task`, or any planning/creation tool.
|
||||
Research slice {{sliceId}} ("{{sliceTitle}}") of milestone {{milestoneId}}. Read `.sf/DECISIONS.md` if it exists — respect existing decisions, don't contradict them. Read `.sf/REQUIREMENTS.md` if it exists — identify which Active requirements this slice owns or supports and target research toward risks, unknowns, and constraints that could affect delivery of those requirements. {{skillActivation}} Use native `lsp` first for symbol lookup, references, and cross-file navigation. For direct text inspection use `rg`/`find` for targeted reads, or `scout` if the area is broad or unfamiliar. If the repository is checked out locally, GitHub code search is a scarce remote-only fallback: do not use GitHub `/search/code` for that local repo; use `git grep` for tracked-file global search, `rg` for broader worktree text search, plus `lsp`, `sift_search`, or `codebase_search` instead. GitHub's `code_search` bucket is small and separate from normal REST/GraphQL quotas, so use it only for repositories that are not on disk, dedupe repeated queries, and treat `403` rate-limit responses as a signal to wait for reset or continue with local evidence. If there are 2-3 independent unknowns, use a research swarm with parallel `scout`/`researcher` subagents and synthesize their findings here; do not swarm narrow sequence-dependent research. Check libraries DeepWiki-first: `ask_question` / `read_wiki_structure` / `read_wiki_contents` for any GitHub-hosted library; fall back to `resolve_library` / `get_library_docs` (Context7, capped at 1000 req/month free) for npm/pypi/crates packages DeepWiki doesn't have. Skip both for libraries already used in this codebase. Use the **Research** output template below. Call `sf_summary_save` with `milestone_id: {{milestoneId}}`, `slice_id: {{sliceId}}`, `artifact_type: "RESEARCH"`, and the research content — the tool writes the file to disk and persists to DB. 
After `sf_summary_save` succeeds, stop immediately; do **not** call `sf_milestone_generate_id`, `sf_plan_milestone`, `sf_plan_slice`, `sf_plan_task`, or any planning/creation tool.
|
||||
|
||||
**You are the scout.** A planner agent reads your output in a fresh context to decompose this slice into tasks. Write for the planner — surface key files, where the work divides naturally, what to build first, and how to verify. If the research doc is vague, the planner re-explores code you already read. If it's precise, the planner decomposes immediately.
|
||||
|
||||
|
|
|
|||
|
|
@ -46,6 +46,7 @@ Research what this slice needs. Narrate key findings and surprises as you go —
|
|||
2. **Skill Discovery ({{skillDiscoveryMode}}):**{{skillDiscoveryInstructions}}
|
||||
3. Explore relevant code for this slice's scope. Use native `lsp` first for symbol lookup, references, and cross-file navigation. For direct text inspection, use `rg`, `find`, and reads. For broad or unfamiliar subsystems, use `scout` to map the relevant area first.
|
||||
3a. Use a research swarm when the slice has 2-3 independent unknowns or subsystems. Dispatch parallel `scout`/`researcher` subagents with distinct lenses, then synthesize what each found into this single RESEARCH artifact. Do not swarm a narrow, sequence-dependent investigation.
|
||||
3b. **GitHub code search is a scarce remote-only fallback.** When the repository is present in `{{workingDirectory}}`, do not use GitHub `/search/code` for that repo; use local `git grep` for tracked-file global search, `rg` for broader worktree text search, plus `lsp`, `sift_search`, or `codebase_search` as needed. GitHub's `code_search` bucket is small and separate from the normal REST/GraphQL quotas. Use GitHub code search only for repositories that are not checked out locally, dedupe repeated queries, and if it returns `403` rate-limit with a short reset, wait until reset or continue with local evidence. If remote code search is essential and still unavailable, checkpoint `continue`, `blocked`, or `decide` with the missing source named.
|
||||
4. **Documentation lookup — prefer DeepWiki first.** Use `ask_question` / `read_wiki_structure` / `read_wiki_contents` (DeepWiki) as the default for any GitHub-hosted library or framework — AI-indexed, no free-tier cap. Fall back to `resolve_library` → `get_library_docs` (Context7) for npm/pypi/crates packages DeepWiki doesn't have. **Context7 free tier is capped at 1000 requests/month — spend those on cases DeepWiki can't cover.** Skip both for libraries already used in this codebase.
|
||||
5. **Web search budget:** You have a limited budget of web searches (max ~15 per session). Use them strategically — try DeepWiki → Context7 → web search in that order. Do NOT repeat the same or similar queries. If a search didn't find what you need, rephrase once or move on. Target 3-5 total web searches for a typical research unit.
|
||||
6. Use the **Research** output template from the inlined context above — include only sections that have real content. The template is already inlined above; do NOT attempt to read any template file from disk (there is no `templates/SLICE-RESEARCH.md` — the correct template is already present in this prompt).
|
||||
|
|
|
|||
|
|
@ -161,7 +161,7 @@ Templates showing the expected format for each artifact type are in:
|
|||
|
||||
**Code navigation:** Use `lsp` for definition, type_definition, implementation, references, incoming_calls, outgoing_calls, hover, signature, symbols, rename, code_actions, format, and diagnostics. Falls back gracefully if no server is available. Never `grep` for a symbol definition when `lsp` can resolve it semantically. Never shell out to prettier/rustfmt/gofmt when `lsp format` is available. After editing code, use `lsp diagnostics` to verify no type errors were introduced.
|
||||
|
||||
**Codebase exploration:** Start broad orientation with in-process `grep`, `find`, `ls`, and `lsp`. When the `PROJECT CODE INTELLIGENCE` block says Sift is healthy, use scoped `codebase_search` or `sift_search` as the preferred live code index. Use `.sf/CODEBASE.md` only as fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. For Sift-specific features — explicit strategy selection or planner configuration — use `sift_search` with a scoped `path`. Strategy guide: `bm25` (fast lexical), `path-hybrid` (filename/path-heavy queries), `page-index-hybrid` (stronger recall + reranking), `vector` (semantic-only). Each repo uses its own Sift cache under `.sf/runtime/sift/`; do not rely on a shared/global Sift database. Use `lsp` for structural navigation (definitions, references). Never read files one-by-one to "explore" — search first, then read what's relevant.
|
||||
**Codebase exploration:** Start broad orientation with in-process `grep`, `find`, `ls`, and `lsp`. When the `PROJECT CODE INTELLIGENCE` block says Sift is healthy, use scoped `codebase_search` or `sift_search` as the preferred live code index. Use `.sf/CODEBASE.md` only as fallback context when Sift is unavailable, cold, degraded, or explicitly needed as a generated overview. GitHub code search is a scarce remote-only fallback: if the repository is checked out locally, do not use GitHub `/search/code` for that repo; use `git grep` for tracked-file global search, `rg` for broader worktree text search, plus `lsp`, `sift_search`, or `codebase_search`. GitHub's `code_search` bucket is small and separate from normal REST/GraphQL quotas, so dedupe remote queries and treat `403` rate-limit responses as a signal to wait for reset or continue with local evidence. For Sift-specific features — explicit strategy selection or planner configuration — use `sift_search` with a scoped `path`. Strategy guide: `bm25` (fast lexical), `path-hybrid` (filename/path-heavy queries), `page-index-hybrid` (stronger recall + reranking), `vector` (semantic-only). Each repo uses its own Sift cache under `.sf/runtime/sift/`; do not rely on a shared/global Sift database. Use `lsp` for structural navigation (definitions, references). Never read files one-by-one to "explore" — search first, then read what's relevant.
|
||||
|
||||
**Swarm dispatch:** Let the system decide whether swarming fits before dispatching multiple execution subagents. Use a 2-3 worker same-model swarm only when the work splits into independent shards with explicit file/directory ownership, shard-local verification, low conflict risk, and clear wall-clock savings. Do not swarm shared-interface edits, lockfiles, migrations, single-failure debugging, or sequence-dependent work. The parent agent remains coordinator: assign ownership, synthesize results, inspect dirty files, resolve conflicts, and run final verification.
|
||||
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@
|
|||
* 4. Apply fix, test, and mark self-report resolved
|
||||
*/
|
||||
|
||||
import { existsSync, readFileSync, writeFileSync } from "node:fs";
|
||||
import { existsSync, readFileSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
|
||||
/**
|
||||
|
|
@ -25,30 +25,36 @@ import { join } from "node:path";
|
|||
const FIX_PATTERNS = [
|
||||
{
|
||||
id: "validation-reviewer-rubric",
|
||||
pattern: /validation-reviewer.*prompt.*lacks.*rubric|rubric.*criterion.*gap/i,
|
||||
pattern:
|
||||
/validation-reviewer.*prompt.*lacks.*rubric|rubric.*criterion.*gap/i,
|
||||
confidence: 0.95, // We fixed this in validation prompts already
|
||||
description: "Add explicit criterion/implementation-gap rubric to validation-reviewer prompt",
|
||||
description:
|
||||
"Add explicit criterion/implementation-gap rubric to validation-reviewer prompt",
|
||||
fix: fixValidationReviewerRubric,
|
||||
},
|
||||
{
|
||||
id: "gate-verdict-clarity",
|
||||
pattern: /gate.*verdict.*ambiguous|verdict.*semantics.*unclear/i,
|
||||
confidence: 0.9,
|
||||
description: "Document gate verdict semantics (passed/failed/omitted) in ARCHITECTURE.md",
|
||||
description:
|
||||
"Document gate verdict semantics (passed/failed/omitted) in ARCHITECTURE.md",
|
||||
fix: fixGateVerdictSemantics,
|
||||
},
|
||||
{
|
||||
id: "env-vars-unvalidated",
|
||||
pattern: /SF_.*env.*vars.*unvalidated|env.*validation.*missing|silent.*config.*missing/i,
|
||||
pattern:
|
||||
/SF_.*env.*vars.*unvalidated|env.*validation.*missing|silent.*config.*missing/i,
|
||||
confidence: 0.85,
|
||||
description: "Add runtime validation for SF_* environment variables",
|
||||
fix: fixEnvValidation,
|
||||
},
|
||||
{
|
||||
id: "self-report-coverage-gap",
|
||||
pattern: /self-report.*gap|triage.*pipeline.*missing|feedback.*loop.*incomplete/i,
|
||||
pattern:
|
||||
/self-report.*gap|triage.*pipeline.*missing|feedback.*loop.*incomplete/i,
|
||||
confidence: 0.8,
|
||||
description: "Implement automated self-report triage pipeline (this module)",
|
||||
description:
|
||||
"Implement automated self-report triage pipeline (this module)",
|
||||
fix: fixSelfReportPipeline,
|
||||
},
|
||||
];
|
||||
|
|
@ -72,11 +78,19 @@ async function fixValidationReviewerRubric(basePath) {
|
|||
|
||||
// Check if rubric already exists
|
||||
if (content.includes("Gate vs. Task Scope Rubric")) {
|
||||
return { success: true, alreadyFixed: true, reason: "Rubric already present" };
|
||||
return {
|
||||
success: true,
|
||||
alreadyFixed: true,
|
||||
reason: "Rubric already present",
|
||||
};
|
||||
}
|
||||
|
||||
// This is already done in prior session, so just confirm
|
||||
return { success: true, alreadyFixed: true, reason: "Fix verified in session" };
|
||||
return {
|
||||
success: true,
|
||||
alreadyFixed: true,
|
||||
reason: "Fix verified in session",
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -92,7 +106,11 @@ async function fixGateVerdictSemantics(basePath) {
|
|||
|
||||
// Check if gate semantics already documented
|
||||
if (content.includes("Gate Verdict Semantics")) {
|
||||
return { success: true, alreadyFixed: true, reason: "Gate semantics documented" };
|
||||
return {
|
||||
success: true,
|
||||
alreadyFixed: true,
|
||||
reason: "Gate semantics documented",
|
||||
};
|
||||
}
|
||||
|
||||
return { success: true, alreadyFixed: true, reason: "Fix already verified" };
|
||||
|
|
@ -137,7 +155,7 @@ async function fixEnvValidation(basePath) {
|
|||
/**
|
||||
* Attempt to fix: Self-report triage pipeline (this module itself).
|
||||
*/
|
||||
async function fixSelfReportPipeline(basePath) {
|
||||
async function fixSelfReportPipeline(_basePath) {
|
||||
const thisFile = new URL(import.meta.url).pathname;
|
||||
if (!existsSync(thisFile)) {
|
||||
return { success: false, reason: "Self-report-fixer module not found" };
|
||||
|
|
@ -280,16 +298,17 @@ export function generateTriageSummary(reports) {
|
|||
uniqueClusters: clusters.length,
|
||||
deduped: clusters,
|
||||
categorized: categories,
|
||||
highConfidenceFixes: reports
|
||||
.flatMap((r) => {
|
||||
const fixes = classifyReportFixes(r);
|
||||
return fixes.filter((f) => f.confidence > 0.85).map((f) => ({
|
||||
highConfidenceFixes: reports.flatMap((r) => {
|
||||
const fixes = classifyReportFixes(r);
|
||||
return fixes
|
||||
.filter((f) => f.confidence > 0.85)
|
||||
.map((f) => ({
|
||||
reportId: r.id,
|
||||
fixId: f.id,
|
||||
description: f.description,
|
||||
confidence: f.confidence,
|
||||
}));
|
||||
}),
|
||||
}),
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -4199,6 +4199,17 @@ export function getGateLatencyStats(gateId, windowHours = 24) {
|
|||
return { total: 0, avgMs: 0, p50Ms: 0, p95Ms: 0, maxMs: 0 };
|
||||
}
|
||||
}
|
||||
/**
 * List every distinct gate id recorded in the gate_runs table.
 *
 * Best-effort read: returns an empty array when no database handle is
 * open or when the query fails for any reason, and drops falsy ids
 * (null/empty) from the result.
 */
export function getDistinctGateIds() {
  if (!currentDb) {
    return [];
  }
  try {
    const stmt = currentDb.prepare("SELECT DISTINCT gate_id FROM gate_runs");
    const ids = [];
    for (const row of stmt.all()) {
      if (row.gate_id) {
        ids.push(row.gate_id);
      }
    }
    return ids;
  } catch {
    return [];
  }
}
|
||||
/**
 * Coerce an arbitrary value to a non-empty string, or null.
 * Non-strings and the empty string both map to null.
 */
function asStringOrNull(value) {
  if (typeof value !== "string") {
    return null;
  }
  return value.length > 0 ? value : null;
}
|
||||
|
|
|
|||
|
|
@ -69,6 +69,15 @@ rg --files src/resources/extensions/sf/skills
|
|||
{"query": "sift_request_factory", "strategy": "bm25", "limit": 10}
|
||||
```
|
||||
|
||||
**GitHub code search — remote-only fallback, not local repo search:**
|
||||
When the repository is checked out locally, do not use GitHub `/search/code` for
|
||||
that repo. Use `git grep` for tracked-file global search, `rg` for broader
|
||||
worktree text search, plus `lsp`, `sift_search`, or `codebase_search` instead.
|
||||
GitHub's `code_search` bucket is small and separate from normal REST/GraphQL
|
||||
quotas. Use GitHub code search only for repositories that are not on disk,
|
||||
dedupe repeated queries, and treat `403` rate-limit responses as a signal to
|
||||
wait for reset or continue with local evidence.
|
||||
|
||||
**SF project database queries:**
|
||||
```bash
|
||||
# Current milestone and slices
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@
|
|||
* and prompt injection work correctly.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from "vitest";
|
||||
import { describe, expect, test } from "vitest";
|
||||
import knowledgeInjector from "../knowledge-injector.js";
|
||||
|
||||
const {
|
||||
|
|
@ -208,7 +208,7 @@ describe("knowledge-injector", () => {
|
|||
const contradictions = detectContradictions(entries);
|
||||
// These are compatible tools, not contradictions
|
||||
const realContradictions = contradictions.filter(
|
||||
(c) => !c.message.includes("suspicious")
|
||||
(c) => !c.message.includes("suspicious"),
|
||||
);
|
||||
expect(realContradictions.length).toBe(0);
|
||||
});
|
||||
|
|
@ -305,7 +305,7 @@ describe("knowledge-injector", () => {
|
|||
const relevant = findRelevantKnowledge(entries, context, 0, 0);
|
||||
|
||||
if (relevant.length > 0) {
|
||||
const { score, entry } = relevant[0];
|
||||
const { score } = relevant[0];
|
||||
expect(score).toBeDefined();
|
||||
expect(score).toBeGreaterThan(0);
|
||||
expect(score).toBeLessThanOrEqual(1);
|
||||
|
|
|
|||
|
|
@ -5,11 +5,11 @@
|
|||
* deduplication, and severity categorization work correctly.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from "vitest";
|
||||
import { describe, expect, test } from "vitest";
|
||||
import {
|
||||
categorizeBySeverity,
|
||||
classifyReportFixes,
|
||||
dedupReports,
|
||||
categorizeBySeverity,
|
||||
generateTriageSummary,
|
||||
} from "../self-report-fixer.js";
|
||||
|
||||
|
|
@ -132,7 +132,7 @@ describe("self-report-fixer", () => {
|
|||
// Validation reviewer should be blocker
|
||||
const blockers = categorized.blocker;
|
||||
expect(
|
||||
blockers.some((r) => r.title.toLowerCase().includes("validation"))
|
||||
blockers.some((r) => r.title.toLowerCase().includes("validation")),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
|
|
@ -288,7 +288,7 @@ describe("self-report-fixer", () => {
|
|||
// Recommendation should mention the actual action
|
||||
const recommendation = summary.recommendations[0];
|
||||
expect(recommendation.toLowerCase()).toMatch(
|
||||
/rubric|criteria|document|validation/
|
||||
/rubric|criteria|document|validation/,
|
||||
);
|
||||
});
|
||||
|
||||
|
|
|
|||
|
|
@ -28,6 +28,42 @@ function randomInRange(min, max) {
|
|||
return min + Math.random() * (max - min);
|
||||
}
|
||||
|
||||
/**
 * Gate adapter that wraps ChaosMonkey fault injection for the
 * verification phase.
 *
 * Verdict mapping:
 *  - strike() throws      -> fail / execution (injected fault was fatal)
 *  - non-fatal injection  -> pass, with details of the injected event
 *  - nothing injected     -> pass with a "no fault" rationale
 */
export class ChaosMonkeyGate {
  constructor(options = {}) {
    this.id = "chaos-monkey";
    this.type = "chaos";
    this._monkey = new ChaosMonkey(options);
  }

  async execute(_ctx, attempt) {
    try {
      await this._monkey.strike("verification");
    } catch (err) {
      // A thrown strike is a fatal injected fault: report an execution failure.
      const detail = err instanceof Error ? err.message : String(err);
      return {
        outcome: "fail",
        failureClass: "execution",
        rationale: `Chaos monkey injected fault: ${detail}`,
        findings: `Injected during verification phase (attempt ${attempt})`,
      };
    }

    // Non-fatal path: inspect the most recent injected event, if any.
    const injected = this._monkey.getInjectedEvents();
    const last = injected[injected.length - 1];
    if (last?.phase === "verification") {
      return {
        outcome: "pass",
        failureClass: "none",
        rationale: `Chaos monkey injected ${last.type} during verification (non-fatal)`,
        findings: `Latency: ${last.delay ?? 0}ms | Disk: ${last.sizeMb ?? 0}MB | Memory: ${last.sizeMb ?? 0}MB`,
      };
    }

    return {
      outcome: "pass",
      failureClass: "none",
      rationale: "Chaos monkey: no fault injected this run",
    };
  }
}
|
||||
|
||||
export class ChaosMonkey {
|
||||
constructor(options = {}) {
|
||||
this.active = options.active ?? false;
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import {
|
||||
getGateCircuitBreaker,
|
||||
getGateRunStats,
|
||||
insertGateRun,
|
||||
updateGateCircuitBreaker,
|
||||
} from "../sf-db.js";
|
||||
|
|
@ -20,9 +21,16 @@ const RETRY_MATRIX = {
|
|||
unknown: 0,
|
||||
};
|
||||
|
||||
const CIRCUIT_BREAKER_FAILURE_THRESHOLD = 5;
|
||||
const CIRCUIT_BREAKER_OPEN_DURATION_MS = 60_000;
|
||||
const CIRCUIT_BREAKER_HALF_OPEN_MAX_ATTEMPTS = 3;
|
||||
/**
 * Read circuit-breaker tuning knobs from SF_* environment variables,
 * falling back to the built-in defaults (5 failures / 60s cooldown /
 * 3 half-open probes) when a variable is unset or not a positive number.
 *
 * Note: `Number(x) || fallback` also rejects "0" and non-numeric input,
 * so those fall back to the default — matching the original behavior.
 */
function resolveCircuitBreakerThresholds() {
  const fromEnv = (name, fallback) => Number(process.env[name]) || fallback;
  return {
    failureThreshold: fromEnv("SF_CIRCUIT_BREAKER_FAILURE_THRESHOLD", 5),
    openDurationMs: fromEnv("SF_CIRCUIT_BREAKER_OPEN_DURATION_MS", 60_000),
    halfOpenMaxAttempts: fromEnv(
      "SF_CIRCUIT_BREAKER_HALF_OPEN_MAX_ATTEMPTS",
      3,
    ),
  };
}
|
||||
|
||||
function nowIso() {
|
||||
return new Date().toISOString();
|
||||
|
|
@ -41,11 +49,30 @@ export class UokGateRunner {
|
|||
return Array.from(this.registry.values());
|
||||
}
|
||||
|
||||
getHealthSummary() {
|
||||
const gates = this.list();
|
||||
return {
|
||||
gates: gates.map((g) => {
|
||||
const stats = getGateRunStats(g.id, 24);
|
||||
const cb = getGateCircuitBreaker(g.id);
|
||||
return {
|
||||
id: g.id,
|
||||
type: g.type,
|
||||
...stats,
|
||||
circuitBreaker: cb.state,
|
||||
failureStreak: cb.failureStreak,
|
||||
};
|
||||
}),
|
||||
};
|
||||
}
|
||||
|
||||
_checkCircuitBreaker(gateId) {
|
||||
const { openDurationMs, halfOpenMaxAttempts } =
|
||||
resolveCircuitBreakerThresholds();
|
||||
const breaker = getGateCircuitBreaker(gateId);
|
||||
if (breaker.state === "open") {
|
||||
const openedAt = breaker.openedAt ? Date.parse(breaker.openedAt) : 0;
|
||||
if (Date.now() - openedAt >= CIRCUIT_BREAKER_OPEN_DURATION_MS) {
|
||||
if (Date.now() - openedAt >= openDurationMs) {
|
||||
// Transition to half-open automatically after cooldown
|
||||
updateGateCircuitBreaker(gateId, {
|
||||
state: "half-open",
|
||||
|
|
@ -56,11 +83,11 @@ export class UokGateRunner {
|
|||
}
|
||||
return {
|
||||
blocked: true,
|
||||
reason: `Circuit breaker OPEN for ${gateId} (failure streak ${breaker.failureStreak}). Cooldown until ${new Date(openedAt + CIRCUIT_BREAKER_OPEN_DURATION_MS).toISOString()}.`,
|
||||
reason: `Circuit breaker OPEN for ${gateId} (failure streak ${breaker.failureStreak}). Cooldown until ${new Date(openedAt + openDurationMs).toISOString()}.`,
|
||||
};
|
||||
}
|
||||
if (breaker.state === "half-open") {
|
||||
if (breaker.halfOpenAttempts >= CIRCUIT_BREAKER_HALF_OPEN_MAX_ATTEMPTS) {
|
||||
if (breaker.halfOpenAttempts >= halfOpenMaxAttempts) {
|
||||
// Too many half-open attempts without success — go back to open
|
||||
updateGateCircuitBreaker(gateId, {
|
||||
state: "open",
|
||||
|
|
@ -100,7 +127,8 @@ export class UokGateRunner {
|
|||
});
|
||||
return;
|
||||
}
|
||||
if (nextStreak >= CIRCUIT_BREAKER_FAILURE_THRESHOLD) {
|
||||
const { failureThreshold } = resolveCircuitBreakerThresholds();
|
||||
if (nextStreak >= failureThreshold) {
|
||||
updateGateCircuitBreaker(gateId, {
|
||||
state: "open",
|
||||
failureStreak: nextStreak,
|
||||
|
|
|
|||
|
|
@ -8,21 +8,26 @@
|
|||
* Consumer: health widgets, /sf uok status, and external monitoring.
|
||||
*/
|
||||
|
||||
import { existsSync, mkdirSync, writeFileSync } from "node:fs";
|
||||
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import { sfRoot } from "../paths.js";
|
||||
import {
|
||||
getDistinctGateIds,
|
||||
getGateCircuitBreaker,
|
||||
getGateLatencyStats,
|
||||
getGateRunStats,
|
||||
isDbAvailable,
|
||||
} from "../sf-db.js";
|
||||
|
||||
const GATE_NAMES = [
|
||||
const DEFAULT_GATE_NAMES = [
|
||||
"security-guard",
|
||||
"cost-guard",
|
||||
"outcome-learning",
|
||||
"multi-package-healing",
|
||||
"chaos-monkey",
|
||||
"verification-gate",
|
||||
"post-execution-checks",
|
||||
"milestone-validation-post-check",
|
||||
];
|
||||
|
||||
function fmtCounter(name, value, labels = {}) {
|
||||
|
|
@ -37,9 +42,9 @@ function fmtGauge(name, value, labels = {}) {
|
|||
return fmtCounter(name, value, labels);
|
||||
}
|
||||
|
||||
function collectGateMetrics() {
|
||||
function collectGateMetrics(gateIds) {
|
||||
const lines = [];
|
||||
for (const gateId of GATE_NAMES) {
|
||||
for (const gateId of gateIds) {
|
||||
const stats = getGateRunStats(gateId, 24);
|
||||
lines.push(
|
||||
fmtCounter("uok_gate_runs_total", stats.total, { gate_id: gateId }),
|
||||
|
|
@ -89,7 +94,7 @@ function collectGateMetrics() {
|
|||
return lines;
|
||||
}
|
||||
|
||||
function buildMetricsText() {
|
||||
function buildMetricsText(gateIds) {
|
||||
const lines = [
|
||||
"# HELP uok_gate_runs_total Total gate runs in the last 24h",
|
||||
"# TYPE uok_gate_runs_total counter",
|
||||
|
|
@ -113,7 +118,13 @@ function buildMetricsText() {
|
|||
"# TYPE uok_gate_circuit_breaker_failure_streak gauge",
|
||||
];
|
||||
if (isDbAvailable()) {
|
||||
lines.push(...collectGateMetrics());
|
||||
const ids =
|
||||
gateIds && gateIds.length > 0
|
||||
? gateIds
|
||||
: getDistinctGateIds().length > 0
|
||||
? getDistinctGateIds()
|
||||
: DEFAULT_GATE_NAMES;
|
||||
lines.push(...collectGateMetrics(ids));
|
||||
}
|
||||
return lines.join("\n") + "\n";
|
||||
}
|
||||
|
|
@ -122,11 +133,11 @@ export function metricsPath(basePath) {
|
|||
return join(sfRoot(basePath), "runtime", "uok-metrics.prom");
|
||||
}
|
||||
|
||||
/**
 * Render the Prometheus metrics text and write it to
 * `.sf/runtime/uok-metrics.prom`, creating the runtime directory if
 * needed. `gateIds` optionally restricts which gates are emitted;
 * when omitted, buildMetricsText falls back to its own discovery.
 *
 * @returns the filesystem path that was written.
 */
export function writeUokMetrics(basePath, gateIds) {
  const path = metricsPath(basePath);
  const runtimeDir = join(sfRoot(basePath), "runtime");
  mkdirSync(runtimeDir, { recursive: true });
  writeFileSync(path, buildMetricsText(gateIds), "utf-8");
  return path;
}
|
||||
|
||||
|
|
@ -134,7 +145,7 @@ export function readUokMetrics(basePath) {
|
|||
const path = metricsPath(basePath);
|
||||
if (!existsSync(path)) return null;
|
||||
try {
|
||||
return buildMetricsText();
|
||||
return readFileSync(path, "utf-8");
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue