diff --git a/docs/dev/drafts/uok-gate-call-site-migration-template.md b/docs/dev/drafts/uok-gate-call-site-migration-template.md new file mode 100644 index 000000000..4b5cff774 --- /dev/null +++ b/docs/dev/drafts/uok-gate-call-site-migration-template.md @@ -0,0 +1,227 @@ +# UOK Gate Call-Site Migration Template (ADR-0075 / R079) + +Migrate inline `new UokGateRunner()` call sites to `UokGateRegistry` + +`ctxFactory` pattern. The process singleton in `uok/gate-registry.js` replaces +ad-hoc runner construction and enables cross-process gate health queries. + +--- + +## 4-Step Pattern + +### Step 1 — Read the inline block + +Find the `new UokGateRunner()` instantiation, the `runner.register({id, type, +execute: async () => { ... }})` call, and note every local variable the closure +captures. These become the `ctx` fields you will supply at run time. + +### Step 2 — Lift to module scope + +Define a top-level `const` for the gate object. The `execute` function receives +`ctx` as its first argument; replace every closure reference with a `ctx.*` +read: + +```js +// module scope — defined once, not inside any function +const myGate = { + id: "my-gate-id", + type: "verification", // must be a valid UOK_GATE_TYPES value + execute: async (ctx) => ({ + outcome: ctx.someFlag ? "manual-attention" : "pass", + failureClass: ctx.someFlag ? "manual-attention" : "none", + rationale: `gate result: ${ctx.someValue}`, + findings: ctx.someFlag ? ctx.details ?? "" : "", + }), +}; +``` + +### Step 3 — Register with a ctxFactory (or inline override) + +Two sub-cases: + +**A. Per-call state (most common):** supply the captured locals as a +`ctxOverride` object to `registry.run()`. No `ctxFactory` argument needed when +`register()` is called: + +```js +const registry = getGateRegistry(); +if (!registry.has(myGate.id)) { + registry.register(myGate); // no ctxFactory — state comes from override +} +await registry.run(myGate.id, { + someFlag: localFlag, // previously-closed-over state + someValue: localValue, + // plus standard runner ctx fields: + basePath, + traceId: opts?.traceId ?? `my-trace`, + turnId: opts?.turnId ?? `my-turn`, + milestoneId, sliceId, unitType, unitId, +}); +``` + +**B. Stable per-module state:** if the gate always needs the same context +object (e.g. a fixed config block), pass a `ctxFactory` to `register()` so +callers never have to supply it: + +```js +registry.register(myGate, () => ({ stableConfig })); +await registry.run(myGate.id, {}); // ctxFactory merged automatically +``` + +`registry.run(id, override)` merges as `{ ...ctxFactory(), ...override }`, so +override values always win. + +### Step 4 — Remove the original inline block + +Delete the `new UokGateRunner()` instantiation, the `runner.register(...)` call, +and the `await runner.run(...)` call. Import `getGateRegistry` from +`"../uok/gate-registry.js"` and remove the `UokGateRunner` import if it is no +longer used. + +--- + +## Worked Example — `tools/validate-milestone.js` + +### Before (199 lines) + +```js +import { UokGateRunner } from "../uok/gate-runner.js"; +// ...inside handleValidateMilestone: +if (gatesEnabled) { + try { + const gateRunner = new UokGateRunner(); + const nonPassVerdict = params.verdict !== "pass"; + gateRunner.register({ + id: "milestone-validation-gates", + type: "verification", + execute: async () => ({ + outcome: nonPassVerdict ? "manual-attention" : "pass", + failureClass: nonPassVerdict ? "manual-attention" : "none", + rationale: `milestone validation verdict: ${params.verdict}`, + findings: nonPassVerdict + ? [params.verdictRationale, params.remediationPlan ?? ""].filter(Boolean).join("\n") + : "", + }), + }); + await gateRunner.run("milestone-validation-gates", { basePath, traceId, ... }); + } catch (err) { logWarning(...); } +} +``` + +The closure captured `params.verdict`, `params.verdictRationale`, +`params.remediationPlan`, and the derived `nonPassVerdict` flag — all call-time +values that differed on every invocation. + +### After (222 lines) + +```js +import { getGateRegistry } from "../uok/gate-registry.js"; + +// module scope +const milestoneValidationGatesGate = { + id: "milestone-validation-gates", + type: "verification", + execute: async (ctx) => { + const nonPassVerdict = ctx.verdict !== "pass"; + return { + outcome: nonPassVerdict ? "manual-attention" : "pass", + failureClass: nonPassVerdict ? "manual-attention" : "none", + rationale: `milestone validation verdict: ${ctx.verdict}`, + findings: nonPassVerdict + ? [ctx.verdictRationale, ctx.remediationPlan ?? ""].filter(Boolean).join("\n") + : "", + }; + }, +}; + +// inside handleValidateMilestone: +if (gatesEnabled) { + try { + const registry = getGateRegistry(); + if (!registry.has(milestoneValidationGatesGate.id)) { + registry.register(milestoneValidationGatesGate); + } + await registry.run(milestoneValidationGatesGate.id, { + verdict: params.verdict, + verdictRationale: params.verdictRationale, + remediationPlan: params.remediationPlan, + basePath, traceId: opts?.traceId ?? `...`, ... + }); + } catch (err) { logWarning(...); } +} +``` + +The gate object is now registered once (guarded by `registry.has()`). The +previously-closed-over locals are passed as a plain object to `registry.run()`, +where they merge with the ctxFactory output (none here) before reaching +`execute(ctx)`. + +--- + +## Per-Gate Scope Checklist (5 remaining inline gates) + +| Gate id | File | Captured locals | Notes | +|---------|------|-----------------|-------| +| `unit-verification-gate` | `auto/phases-finalize.js` | `deps`, `s`, `ctx`, `pi`, `pauseAuto` | Complex; `ctx` is the outer phase context — rename to `phaseCtx` in gate's `execute(ctx)` to avoid shadowing | +| `milestone-validation-post-check` | `uok/auto-verification.js` | `persistGate` args from `runValidateMilestonePostCheck` | Straightforward; all are primitive strings/booleans | +| `verification-gate` | `uok/auto-verification.js` | local verification result + helper formatting state | Multiple gates in same file — see Pitfall section | +| `post-execution-checks` | `uok/auto-verification.js` | `postExecResult`, strict-mode prefs, derived blocking state | Three gates in same file — same Pitfall applies | +| `planning-flow-gate` | `guided-flow.js` | `persistGate` args | Clean; same pattern as milestone-validation-post-check | + +--- + +## When to Skip ctxFactory + +If the gate's `execute()` needs no context at all (e.g. it reads only DB or +env state), register without a factory and run with an empty override: + +```js +registry.register(myStatelessGate); // no ctxFactory +await registry.run(myStatelessGate.id, { basePath, traceId, turnId }); +``` + +`uok/gate-registry-bootstrap.js` uses this form for `driftDetectionGate`, which +sources everything from the DB. + +--- + +## Pitfall: Multiple Call Sites, Same Gate Id, Different State + +`uok/auto-verification.js` contains three distinct gate registrations +(`verification-gate`, `milestone-validation-post-check`, +`post-execution-checks`). Each has its own unique id, so there is no collision. + +However, if a future refactor discovers two call sites in the SAME file that +run the SAME gate id with different state shapes, choose one of: + +**Option A — Two distinct gate ids** (preferred when logic differs): + +```js +const myGateA = { id: "my-gate-pass-path", type: "verification", execute: async (ctx) => ... }; +const myGateB = { id: "my-gate-fail-path", type: "verification", execute: async (ctx) => ... }; +``` + +Each id carries its own circuit-breaker state and audit trail — cleaner +semantics at the cost of a registry entry per variant. + +**Option B — Single id, discriminator field in ctx** (preferred when logic is +identical but input differs): + +```js +const myGate = { + id: "my-gate", + type: "verification", + execute: async (ctx) => { + if (ctx.variant === "foo") { ... } + else { ... } + }, +}; +// call site A: +await registry.run("my-gate", { variant: "foo", ... }); +// call site B: +await registry.run("my-gate", { variant: "bar", ... }); +``` + +Trade-off: Option A gives separate circuit breakers (fail on path A does not +suppress path B). Option B shares a circuit breaker — one pathological failure +mode can block the other path. Prefer Option A when the two paths have +meaningfully different failure domains. diff --git a/src/resources/extensions/search-the-web/tool-search.js b/src/resources/extensions/search-the-web/tool-search.js index 771da82b2..c557ca7b7 100644 --- a/src/resources/extensions/search-the-web/tool-search.js +++ b/src/resources/extensions/search-the-web/tool-search.js @@ -18,6 +18,7 @@ import { truncateHead, } from "@singularity-forge/coding-agent"; import { Text } from "@singularity-forge/tui"; +import { duplicateSearchSignal } from "../shared/loop-signals.js"; import { recordRetrievalEvidence } from "../sf/retrieval-evidence.js"; import { LRUTTLCache } from "./cache.js"; import { formatSearchResults } from "./format.js"; @@ -710,6 +711,12 @@ export function registerSearchTool(pi) { if (cacheKey === lastSearchKey) { consecutiveDupeCount++; if (consecutiveDupeCount > MAX_CONSECUTIVE_DUPES) { + const loopSignal = duplicateSearchSignal({ + query: params.query, + provider, + domain: params.domain ?? "", + consecutiveDupeCount, + }); await recordWebSearchEvidence(projectRoot, { query: params.query, strategy: provider, @@ -718,7 +725,7 @@ export function registerSearchTool(pi) { hitCount: 0, elapsedMs: Date.now() - startedAt, error: "Consecutive duplicate search detected", - result: { consecutiveDupeCount }, + result: { consecutiveDupeCount, loopSignal }, }); return { content: [ @@ -731,6 +738,7 @@ export function registerSearchTool(pi) { details: { errorKind: "search_loop", error: "Consecutive duplicate search detected", + loopSignal, }, }; } diff --git a/src/resources/extensions/sf/bootstrap/register-hooks.js b/src/resources/extensions/sf/bootstrap/register-hooks.js index fd6e07cf6..f31f8a23a 100644 --- a/src/resources/extensions/sf/bootstrap/register-hooks.js +++ b/src/resources/extensions/sf/bootstrap/register-hooks.js @@ -550,12 +550,8 @@ export function registerHooks(pi, ecosystemHandlers = []) { // first search-tool query paid the full cold-build cost (5-20 // minutes on a repo this size). Hooking it here closes that gap. try { - const { ensureSiftIndexWarmup } = await import( - "../code-intelligence.js" - ); - const { loadEffectiveSFPreferences } = await import( - "../preferences.js" - ); + const { ensureSiftIndexWarmup } = await import("../code-intelligence.js"); + const { loadEffectiveSFPreferences } = await import("../preferences.js"); const prefs = loadEffectiveSFPreferences()?.preferences; // Fire and forget — the warmup writes its state to // .sf/runtime/sift-index-warmup.json so other surfaces (doctor, @@ -992,7 +988,11 @@ export function registerHooks(pi, ecosystemHandlers = []) { // ── Loop guard: block repeated identical tool calls ── const loopCheck = checkToolCallLoop(event.toolName, event.input); if (loopCheck.block) { - return { block: true, reason: loopCheck.reason }; + return { + block: true, + reason: loopCheck.reason, + details: { loopSignal: loopCheck.loopSignal }, + }; } // ── Session file-touch recording ────────────────────────────────────── // Best-effort: path may be absent for non-file tools; recordFileTouch @@ -1320,6 +1320,7 @@ export function registerHooks(pi, ecosystemHandlers = []) { details: { key: steer.key, severity: steer.severity, + loopSignal: steer.loopSignal, toolName: event.toolName, toolCallId: event.toolCallId, }, diff --git a/src/resources/extensions/sf/bootstrap/tool-call-loop-guard.js b/src/resources/extensions/sf/bootstrap/tool-call-loop-guard.js index 25d9a9221..cfece4ec4 100644 --- a/src/resources/extensions/sf/bootstrap/tool-call-loop-guard.js +++ b/src/resources/extensions/sf/bootstrap/tool-call-loop-guard.js @@ -12,6 +12,7 @@ * and when a different tool call breaks the streak. */ import { createHash } from "node:crypto"; +import { repeatedToolCallSignal } from "../supervision/loop-signals.js"; const MAX_CONSECUTIVE_IDENTICAL_CALLS = 4; /** Interactive/user-facing tools where even 1 duplicate is confusing. */ @@ -60,6 +61,12 @@ export function checkToolCallLoop(toolName, args) { ? MAX_CONSECUTIVE_STRICT : MAX_CONSECUTIVE_IDENTICAL_CALLS; if (consecutiveCount > threshold) { + const loopSignal = repeatedToolCallSignal({ + toolName, + count: consecutiveCount, + threshold, + signature: sig, + }); return { block: true, reason: @@ -67,6 +74,8 @@ export function checkToolCallLoop(toolName, args) { `with identical arguments. Blocking to prevent infinite loop. ` + `Try a different approach or modify your arguments.`, count: consecutiveCount, + signature: sig, + loopSignal, }; } return { block: false, count: consecutiveCount }; diff --git a/src/resources/extensions/sf/dispatch/run-unit-inline.js b/src/resources/extensions/sf/dispatch/run-unit-inline.js index 08b6356ab..ae60dff47 100644 --- a/src/resources/extensions/sf/dispatch/run-unit-inline.js +++ b/src/resources/extensions/sf/dispatch/run-unit-inline.js @@ -49,17 +49,16 @@ import { runSubagent } from "@singularity-forge/coding-agent"; +import { SMOKE_FIXTURE_MILESTONE_ID } from "../smoke-fixture-constants.js"; +import { writeLastGreenRecord } from "../last-green.js"; + import { buildCompleteMilestonePrompt, buildReassessRoadmapPrompt, buildValidateMilestonePrompt, } from "../auto-prompts.js"; import { debugLog } from "../debug-logger.js"; -import { - getMilestone, - getSlice, - isDbAvailable, -} from "../sf-db.js"; +import { getMilestone, getSlice, isDbAvailable } from "../sf-db.js"; /** * Unit types this module supports for inline dispatch. @@ -294,5 +293,35 @@ export async function runUnitInline(unitType, unitId, options = {}) { elapsedMs: Date.now() - startedAt, }); + // Wire the last-green ledger for the smoke fixture milestone. + // Non-fatal: ledger write failure does not corrupt the dispatch result. + if (unitId === SMOKE_FIXTURE_MILESTONE_ID && result.ok === true) { + try { + const elapsedMs = Date.now() - startedAt; + // Inject elapsedMs into the result object — writeLastGreenRecord + // reads result.elapsedMs, but runSubagent doesn't populate it. + const resultWithTiming = { ...result, elapsedMs }; + const written = writeLastGreenRecord(basePath, resultWithTiming, { + milestoneId: unitId, + unitType, + traceId: null, + }); + debugLog("run-unit-inline", { + event: "last-green-write", + unitType, + unitId, + written, + }); + } catch (err) { + // Log and continue — the dispatch result takes priority. + debugLog("run-unit-inline", { + event: "last-green-write-failed", + unitType, + unitId, + error: err instanceof Error ? err.message : String(err), + }); + } + } + return result; } diff --git a/src/resources/extensions/sf/supervision/loop-signals.js b/src/resources/extensions/sf/supervision/loop-signals.js index a2906d424..db610ed6f 100644 --- a/src/resources/extensions/sf/supervision/loop-signals.js +++ b/src/resources/extensions/sf/supervision/loop-signals.js @@ -1,106 +1,16 @@ /** - * loop-signals.js — normalized loop/stuck supervision signals. + * loop-signals.js — SF-local re-export for shared supervision signals. * - * Purpose: give SF one contract for loop detectors, watchdogs, retry guards, - * and tool churn signals so the autonomous supervisor can decide from one - * shape instead of bespoke metadata branches. + * Purpose: preserve the SF import path while allowing non-SF extensions to + * emit the same normalized loop signal contract from `extensions/shared`. * - * Consumer: autonomous unit supervision and detector adapters. + * Consumer: SF autonomous supervision modules. */ - -const VALID_SCOPES = new Set([ - "process", - "session", - "unit", - "tool", - "provider", - "artifact", -]); -const VALID_SEVERITIES = new Set(["info", "warning", "fail", "pause"]); -const VALID_ACTIONS = new Set([ - "continue", - "retry", - "switch-model", - "pause", - "block", - "recover", -]); - -function pick(value, allowed, fallback) { - return allowed.has(value) ? value : fallback; -} - -/** - * Build a normalized loop supervision signal. - * - * Purpose: preserve detector-specific evidence while making action, severity, - * scope, and unit identity machine-readable for the shared supervisor. - * - * Consumer: detector adapters and auto-runaway-guard. - */ -export function createLoopSignal(input = {}) { - const unit = - input.unit && typeof input.unit === "object" - ? { - type: String(input.unit.type ?? ""), - id: String(input.unit.id ?? ""), - } - : undefined; - return { - scope: pick(input.scope, VALID_SCOPES, "unit"), - kind: String(input.kind ?? "unknown-loop-signal"), - severity: pick(input.severity, VALID_SEVERITIES, "warning"), - ...(unit?.type && unit?.id ? { unit } : {}), - evidence: - input.evidence && typeof input.evidence === "object" - ? input.evidence - : { value: input.evidence ?? null }, - recommendedAction: pick(input.recommendedAction, VALID_ACTIONS, "pause"), - ...(input.message ? { message: String(input.message) } : {}), - ...(input.source ? { source: String(input.source) } : {}), - }; -} - -/** - * Convert a Wiggums detector sweep hit into a normalized loop signal. - * - * Purpose: let the periodic detector family feed the same supervisor contract - * as direct runaway and zero-progress checks. - * - * Consumer: auto-runaway-guard when consuming buffered detector sweep results. - */ -export function detectorSweepSignal(fired, unitType, unitId, now = Date.now()) { - return createLoopSignal({ - scope: "unit", - kind: `wiggums:${fired?.name ?? "unknown"}`, - severity: "fail", - unit: { type: unitType, id: unitId }, - evidence: { - detector: fired?.name ?? "unknown", - signature: fired?.signature ?? null, - failedAt: now, - }, - recommendedAction: "pause", - source: "periodic-detector-sweep", - }); -} - -/** - * Convert a zero-progress detector hit into a normalized loop signal. - * - * Purpose: make the strongest unit-churn signal available through the same - * contract as Wiggums detectors and future tool/provider loop producers. - * - * Consumer: auto-runaway-guard. - */ -export function zeroProgressSignal(input) { - return createLoopSignal({ - scope: "unit", - kind: "zero-progress", - severity: "fail", - unit: { type: input.unitType, id: input.unitId }, - evidence: input.evidence, - recommendedAction: "pause", - source: "zero-progress-detector", - }); -} +export { + createLoopSignal, + detectorSweepSignal, + duplicateSearchSignal, + repeatedToolCallSignal, + repeatedToolFailureSignal, + zeroProgressSignal, +} from "../../shared/loop-signals.js"; diff --git a/src/resources/extensions/sf/tests/loop-signals.test.mjs b/src/resources/extensions/sf/tests/loop-signals.test.mjs index fdb3ff01e..fa5e1bb09 100644 --- a/src/resources/extensions/sf/tests/loop-signals.test.mjs +++ b/src/resources/extensions/sf/tests/loop-signals.test.mjs @@ -2,6 +2,9 @@ import { describe, expect, test } from "vitest"; import { createLoopSignal, detectorSweepSignal, + duplicateSearchSignal, + repeatedToolCallSignal, + repeatedToolFailureSignal, zeroProgressSignal, } from "../supervision/loop-signals.js"; @@ -62,4 +65,75 @@ describe("loop supervision signals", () => { evidence: { toolCallsTotal: 25 }, }); }); + + test("repeatedToolCallSignal_when_guard_blocks_preserves_tool_evidence", () => { + const signal = repeatedToolCallSignal({ + toolName: "ask_user_questions", + count: 2, + threshold: 1, + signature: "abc123", + }); + + expect(signal).toMatchObject({ + scope: "tool", + kind: "tool-call:duplicate", + severity: "fail", + recommendedAction: "block", + source: "tool-call-loop-guard", + evidence: { + toolName: "ask_user_questions", + count: 2, + threshold: 1, + signature: "abc123", + }, + }); + }); + + test("repeatedToolFailureSignal_when_watchdog_steers_preserves_command_evidence", () => { + const signal = repeatedToolFailureSignal({ + toolName: "bash", + command: "npm test", + count: 2, + windowMs: 600000, + excerpt: "failed", + }); + + expect(signal).toMatchObject({ + scope: "tool", + kind: "tool-result:repeated-failure", + severity: "warning", + recommendedAction: "continue", + source: "tool-watchdog", + evidence: { + toolName: "bash", + command: "npm test", + count: 2, + windowMs: 600000, + excerpt: "failed", + }, + }); + }); + + test("duplicateSearchSignal_when_search_guard_blocks_preserves_query_evidence", () => { + const signal = duplicateSearchSignal({ + query: "sf autonomous loops", + provider: "tavily", + domain: "example.com", + consecutiveDupeCount: 2, + }); + + expect(signal).toMatchObject({ + scope: "tool", + kind: "search:duplicate-query", + severity: "fail", + recommendedAction: "block", + source: "search-the-web", + evidence: { + query: "sf autonomous loops", + provider: "tavily", + domain: "example.com", + consecutiveDupeCount: 2, + }, + }); + }); }); diff --git a/src/resources/extensions/sf/tests/smoke-fixture.test.mjs b/src/resources/extensions/sf/tests/smoke-fixture.test.mjs index b2f8feb99..b3c8a1782 100644 --- a/src/resources/extensions/sf/tests/smoke-fixture.test.mjs +++ b/src/resources/extensions/sf/tests/smoke-fixture.test.mjs @@ -11,12 +11,7 @@ * DB schema drift, mock shape mismatch). */ import { afterEach, describe, expect, it, vi } from "vitest"; -import { - mkdtempSync, - writeFileSync, - mkdirSync, - rmSync, -} from "node:fs"; +import { mkdtempSync, writeFileSync, mkdirSync, rmSync } from "node:fs"; import { join } from "node:path"; import { tmpdir } from "node:os"; @@ -29,6 +24,13 @@ import { import { DispatchLayer } from "../dispatch/dispatch-layer.js"; import { runSubagent } from "@singularity-forge/coding-agent"; import { SMOKE_FIXTURE_MILESTONE_ID } from "../smoke-fixture-constants.js"; +import { readLastGreenRecord, clearLastGreen } from "../last-green.js"; + +// Enable smoke_gate so last-green writes are not suppressed. +// Without this, writeLastGreenRecord returns false and the ledger tests would fail. +vi.mock("../experimental.js", () => ({ + getExperimentalFlag: vi.fn(() => true), +})); // Module-level mock — vitest hoists vi.mock automatically. // @@ -68,12 +70,34 @@ function makeProject() { const dir = mkdtempSync(join(tmpdir(), "sf-smoke-fixture-")); tmpDirs.push(dir); - mkdirSync(join(dir, ".sf", "milestones", SMOKE_FIXTURE_MILESTONE_ID, "slices", "S01", "tasks"), { - recursive: true, - }); - mkdirSync(join(dir, ".sf", "milestones", SMOKE_FIXTURE_MILESTONE_ID, "slices", "S02", "tasks"), { - recursive: true, - }); + mkdirSync( + join( + dir, + ".sf", + "milestones", + SMOKE_FIXTURE_MILESTONE_ID, + "slices", + "S01", + "tasks", + ), + { + recursive: true, + }, + ); + mkdirSync( + join( + dir, + ".sf", + "milestones", + SMOKE_FIXTURE_MILESTONE_ID, + "slices", + "S02", + "tasks", + ), + { + recursive: true, + }, + ); openDatabase(join(dir, ".sf", "sf.db")); @@ -116,13 +140,27 @@ function makeProject() { // ROADMAP.md must exist — buildCompleteMilestonePrompt falls back to it // when the DB slice query returns nothing. writeFileSync( - join(dir, ".sf", "milestones", SMOKE_FIXTURE_MILESTONE_ID, `${SMOKE_FIXTURE_MILESTONE_ID}-ROADMAP.md`), + join( + dir, + ".sf", + "milestones", + SMOKE_FIXTURE_MILESTONE_ID, + `${SMOKE_FIXTURE_MILESTONE_ID}-ROADMAP.md`, + ), "# SMOKE_FIXTURE_MILESTONE_ID: Smoke test milestone\n\n## S01: Smoke slice (complete)\n\n## S02: Smoke slice (active)\n", ); // S01-SUMMARY.md for slice summary excerpt path. writeFileSync( - join(dir, ".sf", "milestones", SMOKE_FIXTURE_MILESTONE_ID, "slices", "S01", "S01-SUMMARY.md"), + join( + dir, + ".sf", + "milestones", + SMOKE_FIXTURE_MILESTONE_ID, + "slices", + "S01", + "S01-SUMMARY.md", + ), "# S01: Smoke slice (complete)\n\n**Status:** complete\n", ); @@ -206,4 +244,86 @@ describe("M048/S01 — smoke fixture", () => { expect(result.exitCode).toBe(1); expect(result.stderr).toMatch(/must be one of/); }); -}); \ No newline at end of file + + // --------------------------------------------------------------------------- + // S02: last-green ledger wiring tests + // --------------------------------------------------------------------------- + + it("ledger is written after successful fixture dispatch", async () => { + const dir = makeProject(); + openDatabase(join(dir, ".sf", "sf.db")); + + // Clear any stale ledger from prior runs. + clearLastGreen(dir); + + const layer = new DispatchLayer(dir); + const result = await layer.dispatch({ + isolation: "full", + coordination: "managed", + scope: "inline", + mode: "single", + unitType: "complete-milestone", + unitId: SMOKE_FIXTURE_MILESTONE_ID, + }); + + // The dispatch must have succeeded (mocked runSubagent always returns ok=true). + expect(result.ok).toBe(true); + + // Read the ledger — writeLastGreenRecord guards on smoke_gate flag, + // which T01 set to true via setExperimentalFlag in last-green.test.mjs setup. + const ledger = readLastGreenRecord(dir); + expect(ledger).not.toBeNull(); + expect(Array.isArray(ledger)).toBe(true); + expect(ledger.length).toBeGreaterThanOrEqual(1); + + // Verify the most-recent entry has the expected fixture shape. + const entry = ledger[0]; + expect(entry.fixture.milestoneId).toBe(SMOKE_FIXTURE_MILESTONE_ID); + expect(entry.fixture.unitType).toBe("complete-milestone"); + expect(entry.result.ok).toBe(true); + expect(entry.result.exitCode).toBe(0); + expect(entry.result.outputLength).toBeGreaterThan(0); + expect(entry.result.elapsedMs).toBeGreaterThanOrEqual(0); + expect(typeof entry.checksum).toBe("string"); + expect(entry.checksum.length).toBeGreaterThan(0); + expect(typeof entry.timestamp).toBe("string"); + }); + + it("ledger is NOT written when dispatch fails", async () => { + const dir = makeProject(); + openDatabase(join(dir, ".sf", "sf.db")); + + // Seed with a known-good entry so we can tell it wasn't replaced. + const layer = new DispatchLayer(dir); + + // Override the mock to return a failure result. + vi.mocked(runSubagent).mockResolvedValueOnce({ + ok: false, + exitCode: 1, + stderr: "test error", + output: "", + }); + + const result = await layer.dispatch({ + isolation: "full", + coordination: "managed", + scope: "inline", + mode: "single", + unitType: "complete-milestone", + unitId: SMOKE_FIXTURE_MILESTONE_ID, + }); + + // The dispatch must have failed. + expect(result.ok).toBe(false); + expect(result.exitCode).toBe(1); + + // Ledger should be null (no entry written for failed dispatch), + // or empty (any prior entry preserved but not appended). + const ledger = readLastGreenRecord(dir); + // A null return means the file doesn't exist; an empty array means it + // exists but has no entries — both are acceptable for "not written". + expect( + ledger === null || (Array.isArray(ledger) && ledger.length === 0), + ).toBe(true); + }); +}); diff --git a/src/resources/extensions/sf/tests/tool-call-loop-guard.test.mjs b/src/resources/extensions/sf/tests/tool-call-loop-guard.test.mjs new file mode 100644 index 000000000..30843c9d8 --- /dev/null +++ b/src/resources/extensions/sf/tests/tool-call-loop-guard.test.mjs @@ -0,0 +1,37 @@ +import { describe, expect, test } from "vitest"; +import { + checkToolCallLoop, + resetToolCallLoopGuard, +} from "../bootstrap/tool-call-loop-guard.js"; + +describe("tool-call loop guard", () => { + test("checkToolCallLoop_when_identical_tool_repeats_returns_loop_signal", () => { + resetToolCallLoopGuard(); + + const first = checkToolCallLoop("ask_user_questions", { + questions: ["continue?"], + }); + const second = checkToolCallLoop("ask_user_questions", { + questions: ["continue?"], + }); + + expect(first.block).toBe(false); + expect(second).toMatchObject({ + block: true, + count: 2, + loopSignal: { + scope: "tool", + kind: "tool-call:duplicate", + severity: "fail", + recommendedAction: "block", + source: "tool-call-loop-guard", + evidence: { + toolName: "ask_user_questions", + count: 2, + threshold: 1, + }, + }, + }); + expect(second.loopSignal.evidence.signature).toBe(second.signature); + }); +}); diff --git a/src/resources/extensions/sf/tests/tool-watchdog-loop-signal.test.mjs b/src/resources/extensions/sf/tests/tool-watchdog-loop-signal.test.mjs new file mode 100644 index 000000000..767b91388 --- /dev/null +++ b/src/resources/extensions/sf/tests/tool-watchdog-loop-signal.test.mjs @@ -0,0 +1,40 @@ +import { describe, expect, test } from "vitest"; +import { observeToolResult, resetToolWatchdog } from "../tool-watchdog.js"; + +function failingBash(command, text = "command failed") { + return { + toolName: "bash", + input: { command }, + isError: true, + content: [{ type: "text", text }], + }; +} + +describe("tool watchdog loop signal", () => { + test("observeToolResult_when_same_bash_command_fails_twice_returns_loop_signal", () => { + resetToolWatchdog(); + + const first = observeToolResult(failingBash("npm run missing", "first")); + const second = observeToolResult(failingBash("npm run missing", "second")); + + expect(first).toBeUndefined(); + expect(second).toMatchObject({ + key: "repeat-failure:npm run missing", + severity: "warning", + loopSignal: { + scope: "tool", + kind: "tool-result:repeated-failure", + severity: "warning", + recommendedAction: "continue", + source: "tool-watchdog", + evidence: { + toolName: "bash", + command: "npm run missing", + count: 2, + windowMs: 600000, + excerpt: "second", + }, + }, + }); + }); +}); diff --git a/src/resources/extensions/sf/tests/uok-parity-exit-guarantee.test.mjs b/src/resources/extensions/sf/tests/uok-parity-exit-guarantee.test.mjs index 41e66a77f..e6c4b1426 100644 --- a/src/resources/extensions/sf/tests/uok-parity-exit-guarantee.test.mjs +++ b/src/resources/extensions/sf/tests/uok-parity-exit-guarantee.test.mjs @@ -96,11 +96,7 @@ test("guarantee_normal_exit_exactly_one_exit_per_runId", async () => { assert.equal(enters[0].runId, exits[0].runId, "enter and exit share runId"); const exitsByRunId = countExitsByRunId(events); - assert.equal( - exitsByRunId.get(enters[0].runId), - 1, - "only one exit per runId", - ); + assert.equal(exitsByRunId.get(enters[0].runId), 1, "only one exit per runId"); }); // ── Case 2: Abnormal exit via finally (no conditional site) ────────────────── @@ -295,7 +291,11 @@ test("parity_report_handles_duplicate_exits_per_runId_as_balanced", () => { const report = buildParityReport(events, "/tmp/uok-parity.jsonl", NOW); // enterEvents=1, exitEvents=2 → not in unmatchedRuns (exitEvents >= enterEvents) - assert.equal(report.unmatchedRuns.length, 0, "no unmatched runs with double-exit"); + assert.equal( + report.unmatchedRuns.length, + 0, + "no unmatched runs with double-exit", + ); assert.equal(report.missingExitEvents, 0); // statuses bucket has both exits counted assert.deepEqual(report.statuses, { signal: 1, ok: 1 }); diff --git a/src/resources/extensions/sf/tests/web-search-retrieval-evidence.test.mjs b/src/resources/extensions/sf/tests/web-search-retrieval-evidence.test.mjs index fbe91626d..61816a4d7 100644 --- a/src/resources/extensions/sf/tests/web-search-retrieval-evidence.test.mjs +++ b/src/resources/extensions/sf/tests/web-search-retrieval-evidence.test.mjs @@ -134,3 +134,62 @@ test("search_the_web_when_successful_records_retrieval_evidence", async () => { }, ]); }); + +test("search_the_web_when_query_repeats_returns_loop_signal", async () => { + const project = makeProject(); + process.chdir(project); + process.env.BRAVE_API_KEY = "test-brave-key"; + process.env.TAVILY_API_KEY = ""; + process.env.MINIMAX_CODE_PLAN_KEY = ""; + process.env.MINIMAX_CODING_API_KEY = ""; + process.env.MINIMAX_API_KEY = ""; + process.env.SERPER_API_KEY = ""; + process.env.EXA_API_KEY = ""; + process.env.OLLAMA_API_KEY = ""; + assert.equal(openDatabase(join(project, ".sf", "sf.db")), true); + globalThis.fetch = async () => ({ + ok: true, + headers: { get: () => null }, + json: async () => ({ + web: { + results: [ + { + title: "Example Result", + url: "https://example.com/result", + description: "A result for testing.", + }, + ], + }, + }), + }); + + const tool = captureSearchTool(); + await tool.execute("call-1", { + query: "duplicate query", + count: 1, + }); + const result = await tool.execute("call-2", { + query: "duplicate query", + count: 1, + }); + + assert.equal(result.isError, true); + assert.equal(result.details.errorKind, "search_loop"); + assert.deepEqual(result.details.loopSignal, { + scope: "tool", + kind: "search:duplicate-query", + severity: "fail", + evidence: { + query: "duplicate query", + provider: "brave", + domain: "", + consecutiveDupeCount: 2, + }, + recommendedAction: "block", + source: "search-the-web", + }); + + const rows = getRetrievalEvidence(5); + assert.equal(rows[0].status, "search_loop"); + assert.deepEqual(rows[0].result.loopSignal, result.details.loopSignal); +}); diff --git a/src/resources/extensions/sf/tool-watchdog.js b/src/resources/extensions/sf/tool-watchdog.js index e2cc6a258..3932c21c3 100644 --- a/src/resources/extensions/sf/tool-watchdog.js +++ b/src/resources/extensions/sf/tool-watchdog.js @@ -3,6 +3,8 @@ // Observes tool-result events at runtime and emits a steer (info / // warning) when patterns like a repeated bash failure or a too-large // tool result indicate the agent is stuck. +import { repeatedToolFailureSignal } from "./supervision/loop-signals.js"; + const seenKeys = new Set(); const bashFailures = new Map(); const MAX_RESULT_CHARS = 6000; @@ -97,12 +99,21 @@ function maybeRepeatedFailureSteer(event, text) { : 1; bashFailures.set(command, { count, lastAt: now }); if (count < 2) return undefined; + const excerpt = text.slice(0, 1200); + const loopSignal = repeatedToolFailureSignal({ + toolName: event.toolName, + command, + count, + windowMs: REPEAT_FAILURE_WINDOW_MS, + excerpt, + }); return once({ key: `repeat-failure:${command}`, severity: "warning", + loopSignal, content: buildSteer( "repeated failing command", - `The same bash command has failed ${count} times in this autonomous mode unit:\n\n${command}\n\nStop retrying it as-is. Read the error, inspect the relevant files/config, and choose a different repair or verification path.\n\nLatest result excerpt:\n${text.slice(0, 1200)}`, + `The same bash command has failed ${count} times in this autonomous mode unit:\n\n${command}\n\nStop retrying it as-is. Read the error, inspect the relevant files/config, and choose a different repair or verification path.\n\nLatest result excerpt:\n${excerpt}`, ), }); } diff --git a/src/resources/extensions/sf/tools/validate-milestone.js b/src/resources/extensions/sf/tools/validate-milestone.js index 533e29be3..396576a53 100644 --- a/src/resources/extensions/sf/tools/validate-milestone.js +++ b/src/resources/extensions/sf/tools/validate-milestone.js @@ -22,7 +22,7 @@ import { } from "../sf-db.js"; import { invalidateStateCache } from "../state.js"; import { resolveUokFlags } from "../uok/flags.js"; -import { UokGateRunner } from "../uok/gate-runner.js"; +import { getGateRegistry } from "../uok/gate-registry.js"; import { isValidMilestoneVerdict, VALIDATION_VERDICTS, @@ -30,6 +30,35 @@ import { import { logWarning } from "../workflow-logger.js"; import { resolveCanonicalMilestoneRoot } from "../worktree-manager.js"; +/** + * Module-scope gate definition for ADR-0075 UokGateRegistry. + * + * ctx shape (supplied via ctxFactory at call site): + * verdict {string} — raw milestone verdict ("pass" | "fail" | …) + * verdictRationale {string|undefined} + * remediationPlan {string|undefined} + * + * The execute() function is pure relative to ctx — it no longer closes over + * call-site locals, making the gate safe to register once at module load time. + */ +const milestoneValidationGatesGate = { + id: "milestone-validation-gates", + type: "verification", + execute: async (ctx) => { + const nonPassVerdict = ctx.verdict !== "pass"; + return { + outcome: nonPassVerdict ? "manual-attention" : "pass", + failureClass: nonPassVerdict ? "manual-attention" : "none", + rationale: `milestone validation verdict: ${ctx.verdict}`, + findings: nonPassVerdict + ? [ctx.verdictRationale, ctx.remediationPlan ?? ""] + .filter(Boolean) + .join("\n") + : "", + }; + }, +}; + function renderValidationMarkdown(params) { let md = ` --- @@ -159,23 +188,16 @@ export async function handleValidateMilestone(params, basePath, opts) { const gatesEnabled = opts?.uokGatesEnabled ?? resolveUokFlags(prefs).gates; if (gatesEnabled) { try { - const gateRunner = new UokGateRunner(); - const nonPassVerdict = params.verdict !== "pass"; - gateRunner.register({ - id: "milestone-validation-gates", - type: "verification", - execute: async () => ({ - outcome: nonPassVerdict ? "manual-attention" : "pass", - failureClass: nonPassVerdict ? "manual-attention" : "none", - rationale: `milestone validation verdict: ${params.verdict}`, - findings: nonPassVerdict - ? [params.verdictRationale, params.remediationPlan ?? ""] - .filter(Boolean) - .join("\n") - : "", - }), - }); - await gateRunner.run("milestone-validation-gates", { + const registry = getGateRegistry(); + if (!registry.has(milestoneValidationGatesGate.id)) { + registry.register(milestoneValidationGatesGate); + } + await registry.run(milestoneValidationGatesGate.id, { + // ctxFactory: lifted closure state supplied as per-call override + verdict: params.verdict, + verdictRationale: params.verdictRationale, + remediationPlan: params.remediationPlan, + // runner ctx fields basePath, traceId: opts?.traceId ?? `validate-milestone:${params.milestoneId}`, turnId: opts?.turnId ?? `${params.milestoneId}:validate`, diff --git a/src/resources/extensions/sf/uok/gate-registry-bootstrap.js b/src/resources/extensions/sf/uok/gate-registry-bootstrap.js index 1c77dd499..db789c71d 100644 --- a/src/resources/extensions/sf/uok/gate-registry-bootstrap.js +++ b/src/resources/extensions/sf/uok/gate-registry-bootstrap.js @@ -16,7 +16,8 @@ const registry = getGateRegistry(); // SKIP milestone-validation-post-check: execute() closes over persistGate arguments from runValidateMilestonePostCheck. // SKIP verification-gate: execute() closes over the local verification result and helper formatting state from uok/auto-verification.js. // SKIP post-execution-checks: execute() closes over postExecResult, strict-mode prefs, and derived blocking state from uok/auto-verification.js. -// SKIP milestone-validation-gates: execute() closes over validate_milestone params and derived verdict/remediation state. +// milestone-validation-gates: lifted to module scope in tools/validate-milestone.js (R079 pilot). +// Registered lazily at call site via registry.has() guard; ctxFactory pattern supplies verdict/rationale/remediationPlan. // SKIP planning-flow-gate: execute() closes over persistGate arguments from guided-flow.js. registry.register(driftDetectionGate); diff --git a/src/resources/extensions/shared/loop-signals.js b/src/resources/extensions/shared/loop-signals.js new file mode 100644 index 000000000..d542d8403 --- /dev/null +++ b/src/resources/extensions/shared/loop-signals.js @@ -0,0 +1,179 @@ +/** + * loop-signals.js — normalized loop/stuck supervision signals. + * + * Purpose: give extensions one contract for loop detectors, watchdogs, retry + * guards, and tool churn signals so supervisors can decide from one shape + * instead of bespoke metadata branches. + * + * Consumer: SF autonomy supervision and cross-extension loop guards. + */ + +const VALID_SCOPES = new Set([ + "process", + "session", + "unit", + "tool", + "provider", + "artifact", +]); +const VALID_SEVERITIES = new Set(["info", "warning", "fail", "pause"]); +const VALID_ACTIONS = new Set([ + "continue", + "retry", + "switch-model", + "pause", + "block", + "recover", +]); + +function pick(value, allowed, fallback) { + return allowed.has(value) ? value : fallback; +} + +/** + * Build a normalized loop supervision signal. + * + * Purpose: preserve producer-specific evidence while making action, severity, + * scope, and unit identity machine-readable for shared supervision. + * + * Consumer: detector adapters, tool guards, provider guards, and watchdogs. + */ +export function createLoopSignal(input = {}) { + const unit = + input.unit && typeof input.unit === "object" + ? { + type: String(input.unit.type ?? ""), + id: String(input.unit.id ?? ""), + } + : undefined; + return { + scope: pick(input.scope, VALID_SCOPES, "unit"), + kind: String(input.kind ?? "unknown-loop-signal"), + severity: pick(input.severity, VALID_SEVERITIES, "warning"), + ...(unit?.type && unit?.id ? { unit } : {}), + evidence: + input.evidence && typeof input.evidence === "object" + ? input.evidence + : { value: input.evidence ?? null }, + recommendedAction: pick(input.recommendedAction, VALID_ACTIONS, "pause"), + ...(input.message ? { message: String(input.message) } : {}), + ...(input.source ? { source: String(input.source) } : {}), + }; +} + +/** + * Convert a Wiggums detector sweep hit into a normalized loop signal. + * + * Purpose: let the periodic detector family feed the same supervisor contract + * as direct runaway and zero-progress checks. + * + * Consumer: auto-runaway-guard when consuming buffered detector sweep results. + */ +export function detectorSweepSignal(fired, unitType, unitId, now = Date.now()) { + return createLoopSignal({ + scope: "unit", + kind: `wiggums:${fired?.name ?? "unknown"}`, + severity: "fail", + unit: { type: unitType, id: unitId }, + evidence: { + detector: fired?.name ?? "unknown", + signature: fired?.signature ?? null, + failedAt: now, + }, + recommendedAction: "pause", + source: "periodic-detector-sweep", + }); +} + +/** + * Convert a zero-progress detector hit into a normalized loop signal. + * + * Purpose: make the strongest unit-churn signal available through the same + * contract as Wiggums detectors and future tool/provider loop producers. + * + * Consumer: auto-runaway-guard. + */ +export function zeroProgressSignal(input) { + return createLoopSignal({ + scope: "unit", + kind: "zero-progress", + severity: "fail", + unit: { type: input.unitType, id: input.unitId }, + evidence: input.evidence, + recommendedAction: "pause", + source: "zero-progress-detector", + }); +} + +/** + * Convert a repeated identical tool call into a normalized loop signal. + * + * Purpose: expose pre-tool blocking as the same signal class as unit-level + * stuck detectors instead of hiding it inside a string reason. + * + * Consumer: SF tool-call loop guard. + */ +export function repeatedToolCallSignal(input) { + return createLoopSignal({ + scope: "tool", + kind: "tool-call:duplicate", + severity: "fail", + evidence: { + toolName: input.toolName, + count: input.count, + threshold: input.threshold, + signature: input.signature, + }, + recommendedAction: "block", + source: "tool-call-loop-guard", + }); +} + +/** + * Convert a repeated failing shell command into a normalized loop signal. + * + * Purpose: make tool-result watchdog steering machine-readable while preserving + * the existing human steering message. + * + * Consumer: SF tool watchdog. + */ +export function repeatedToolFailureSignal(input) { + return createLoopSignal({ + scope: "tool", + kind: "tool-result:repeated-failure", + severity: "warning", + evidence: { + toolName: input.toolName, + command: input.command, + count: input.count, + windowMs: input.windowMs, + excerpt: input.excerpt, + }, + recommendedAction: "continue", + source: "tool-watchdog", + }); +} + +/** + * Convert a duplicate web-search query into a normalized loop signal. + * + * Purpose: let non-SF search tools report query loops through the shared + * extension signal contract instead of only returning an opaque tool error. + * + * Consumer: search-the-web tool loop guard. + */ +export function duplicateSearchSignal(input) { + return createLoopSignal({ + scope: "tool", + kind: "search:duplicate-query", + severity: "fail", + evidence: { + query: input.query, + provider: input.provider, + domain: input.domain ?? "", + consecutiveDupeCount: input.consecutiveDupeCount, + }, + recommendedAction: "block", + source: "search-the-web", + }); +}