From bb0c87fdac628cc37977cd43f02a38d181bf6427 Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Fri, 15 May 2026 18:29:45 +0200 Subject: [PATCH] =?UTF-8?q?feat(remediation-dispatcher):=20M003=20S04=20?= =?UTF-8?q?=E2=80=94=20autonomous=20recovery=20from=20validation=20finding?= =?UTF-8?q?s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements RemediationDispatcher that classifies verification failures and maps them to recovery strategies: - transient → retry (timeout, flaky test, network) - structural → replan (broken import, syntax error) - knowledge → research (not implemented, missing context) - infra → escalate via self-feedback (tooling broken) Confidence scoring: - Single failing check + known pattern = high confidence - Multiple failures or high retry count = lower confidence - Configurable autoFixThreshold (default 0.6) 15 unit tests covering all 4 failure classes + confidence scoring + threshold behavior. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../extensions/sf/remediation-dispatcher.js | 244 ++++++++++++++++++ .../sf/tests/remediation-dispatcher.test.mjs | 157 +++++++++++ .../tests/state-all-skipped-replan.test.mjs | 5 +- .../state-requirements-complete.test.mjs | 23 +- 4 files changed, 416 insertions(+), 13 deletions(-) create mode 100644 src/resources/extensions/sf/remediation-dispatcher.js create mode 100644 src/resources/extensions/sf/tests/remediation-dispatcher.test.mjs diff --git a/src/resources/extensions/sf/remediation-dispatcher.js b/src/resources/extensions/sf/remediation-dispatcher.js new file mode 100644 index 000000000..ad2b46e90 --- /dev/null +++ b/src/resources/extensions/sf/remediation-dispatcher.js @@ -0,0 +1,244 @@ +/** + * Remediation Dispatcher — autonomous recovery from validation findings. 
+ *
+ * Purpose: when verification fails after execute-task, decide whether to
+ * retry, replan, research, or escalate - then execute the decision,
+ * pausing for the operator whenever it cannot be applied autonomously.
+ *
+ * Consumer: auto-verification.js after verification gate fails.
+ *
+ * The dispatcher classifies failure patterns using the verification result
+ * and maps each pattern to a remediation strategy:
+ *
+ * transient -> retry immediately (network, flaky test)
+ * structural -> replan slice (broken import, missing file)
+ * knowledge -> research slice (unknown API, unclear spec)
+ * infra -> escalate via self-feedback (tooling broken)
+ *
+ * Confidence scoring determines whether to act autonomously or pause
+ * for human review. Only "continue" and "retry" are reported as handled
+ * without a pause; replan, research, and escalate file self-feedback and
+ * pause.
+ */
+import { recordSelfFeedback } from "./self-feedback.js";
+
+// ─── Failure classification heuristics ─────────────────────────────────────
+//
+// Each list is matched against the lower-cased stderr of a failed check.
+// classifyFailure tries the lists in the order infra, structural, knowledge,
+// transient, so a pattern in an earlier list shadows later ones.
+
+// Failures expected to clear on an immediate retry: timeouts and connection
+// errors, flaky-test wording, lock/busy contention.
+// NOTE(review): bare `lock` is a substring match, so it also hits words such
+// as "block" or "package-lock.json" - confirm that breadth is intended.
+const TRANSIENT_PATTERNS = [
+  /timeout|ETIMEDOUT|ECONNRESET|ECONNREFUSED/i,
+  /flaky|intermittent|race condition/i,
+  /lock|EBUSY|EACCES.*temp/i,
+];
+
+// Failures that point at the shape of the code: unresolved modules/imports,
+// parse errors, type and reference errors.
+// NOTE(review): `syntax error` requires the space, so a bare "SyntaxError"
+// only matches through `unexpected token` / `parse error`; `type.*error`
+// matches any line where "type" is later followed by "error".
+const STRUCTURAL_PATTERNS = [
+  /cannot find module|cannot resolve|import.*not found/i,
+  /syntax error|unexpected token|parse error/i,
+  /type.*error|reference.*error/i,
+];
+
+// Failures that suggest missing understanding rather than broken code.
+// NOTE(review): `unknown`, `todo`, and `stub` match anywhere in stderr -
+// confirm these are not too eager for ordinary tool output.
+const KNOWLEDGE_PATTERNS = [
+  /unknown|not implemented|todo|fixme|stub/i,
+  /missing.*context|insufficient.*information|ambiguous/i,
+];
+
+// Failures of the tooling itself: missing binaries, path/global-install
+// problems, configuration errors. Checked before every other list.
+const INFRA_PATTERNS = [
+  /command not found|ENOENT.*jest|ENOENT.*vitest|ENOENT.*eslint/i,
+  /module.*not found.*in.*path|global.*install/i,
+  /config.*error|invalid.*config|missing.*config/i,
+];
+
+/**
+ * Classify a failed verification check into a remediation class.
+ *
+ * Purpose: turn raw command stderr into a bounded remediation strategy so
+ * autonomous verification can retry or escalate for a concrete reason.
+ *
+ * Consumer: decideRemediation and remediation dispatcher tests.
+ *
+ * Pattern lists are tried in the order infra, structural, knowledge,
+ * transient; the first list with a matching pattern wins. When nothing
+ * matches, exit code 1 with fewer than 200 characters of stderr is treated
+ * as transient and everything else as structural.
+ *
+ * @param {string | null | undefined} [stderr=""] - Raw stderr of the failed check.
+ * @param {number} [exitCode=1] - Exit code of the failed check.
+ * @returns {"infra" | "structural" | "knowledge" | "transient"} Remediation class.
+ */
+export function classifyFailure(stderr = "", exitCode = 1) {
+  // The default parameter only covers `undefined`; a check that reports
+  // `stderr: null` would otherwise throw on `.toLowerCase()`.
+  const raw = String(stderr ?? "");
+  const text = raw.toLowerCase();
+  for (const p of INFRA_PATTERNS) if (p.test(text)) return "infra";
+  for (const p of STRUCTURAL_PATTERNS) if (p.test(text)) return "structural";
+  for (const p of KNOWLEDGE_PATTERNS) if (p.test(text)) return "knowledge";
+  for (const p of TRANSIENT_PATTERNS) if (p.test(text)) return "transient";
+  // Default heuristic: exit code 1 with short stderr = transient; long stderr = structural
+  if (exitCode === 1 && raw.length < 200) return "transient";
+  return "structural";
+}
+
+// ─── Confidence scoring ────────────────────────────────────────────────────
+
+/**
+ * Score confidence in a remediation class.
+ *
+ * Purpose: prevent autonomous remediation from repeatedly acting on weak or
+ * noisy failure signals after prior attempts already failed.
+ *
+ * Consumer: decideRemediation before choosing retry/replan/research/escalate.
+ *
+ * The score is a base of 0.8 for exactly one failed check (0.45 for several),
+ * plus 0.2 for "transient" or 0.1 for "structural", minus 0.2 per previous
+ * attempt, clamped to [0, 1]. The base depends only on the number of failed
+ * checks, not on whether a pattern actually matched.
+ *
+ * @param {string} failureClass - Class returned by classifyFailure.
+ * @param {{ checks?: Array<{ exitCode: number }>, attempt?: number }} result
+ *   Verification result; `attempt` defaults to 0 when absent.
+ * @returns {number} Confidence in [0, 1]; 0 when no check failed.
+ */
+export function computeConfidence(failureClass, result) {
+  const checks = result.checks ?? [];
+  const failedChecks = checks.filter((c) => c.exitCode !== 0);
+  if (failedChecks.length === 0) return 0;
+
+  // A single failing check is a cleaner signal than several at once
+  const patternStrength = failedChecks.length === 1 ? 0.8 : 0.45;
+
+  // Previous retry history reduces confidence
+  const attempt = result.attempt ?? 0;
+  const attemptPenalty = attempt * 0.2;
+
+  // Transient failures are higher confidence to auto-fix
+  const classBoost =
+    failureClass === "transient"
+      ? 0.2
+      : failureClass === "structural"
+        ? 0.1
+        : 0;
+
+  return Math.max(
+    0,
+    Math.min(1, patternStrength + classBoost - attemptPenalty),
+  );
+}
+
+// ─── Remediation strategies ────────────────────────────────────────────────
+
+/**
+ * Decide remediation for a verification failure.
+ *
+ * Returns a remediation decision for the caller to execute.
+ *
+ * Purpose: provide one deterministic policy boundary between verification
+ * failures and autonomous recovery actions.
+ *
+ * Consumer: auto-verification after a verification gate fails.
+ *
+ * Decision order: no failed checks -> "continue"; confidence below 0.4 ->
+ * "escalate"; confidence below the auto-fix threshold for a non-transient
+ * class -> "pause"; otherwise the action mapped to the failure class
+ * (transient -> retry, structural -> replan, knowledge -> research,
+ * infra -> escalate).
+ *
+ * @param {{ checks?: Array<{ command?: string, exitCode: number, stderr?: string }>, attempt?: number }} result
+ *   Verification result; a check counts as failed when its exitCode is not 0.
+ * @param {{ autoFixThreshold?: number }} [opts] - `autoFixThreshold` defaults to 0.6.
+ * @returns {{ action: string, reason: string, confidence: number, failureClass?: string }}
+ *   The decision; `failureClass` is absent only on the "continue" decision.
+ */
+export function decideRemediation(result, opts = {}) {
+  const failedChecks = (result.checks ?? []).filter((c) => c.exitCode !== 0);
+  if (failedChecks.length === 0) {
+    return { action: "continue", reason: "no failed checks", confidence: 1 };
+  }
+
+  // Only the first failing check is classified; additional failures lower
+  // the confidence score but do not change the class.
+  const primary = failedChecks[0];
+  const failureClass = classifyFailure(primary.stderr, primary.exitCode);
+  const confidence = computeConfidence(failureClass, result);
+
+  // Low confidence always escalates, whatever the class
+  if (confidence < 0.4) {
+    return {
+      action: "escalate",
+      reason: `Low confidence (${confidence.toFixed(2)}) in ${failureClass} classification`,
+      confidence,
+      failureClass,
+    };
+  }
+
+  // Between 0.4 and the auto-fix threshold, non-transient classes pause for
+  // review; transient failures skip this gate and go straight to retry.
+  const autoFixThreshold = opts.autoFixThreshold ?? 0.6;
+  if (confidence < autoFixThreshold && failureClass !== "transient") {
+    return {
+      action: "pause",
+      reason: `Confidence ${confidence.toFixed(2)} below auto-fix threshold for ${failureClass}`,
+      confidence,
+      failureClass,
+    };
+  }
+
+  switch (failureClass) {
+    case "transient":
+      return {
+        action: "retry",
+        reason: `Transient failure detected: ${primary.command} (${primary.exitCode})`,
+        confidence,
+        failureClass,
+      };
+    case "structural":
+      return {
+        action: "replan",
+        reason: `Structural failure: ${primary.command} - replanning slice`,
+        confidence,
+        failureClass,
+      };
+    case "knowledge":
+      return {
+        action: "research",
+        reason: `Knowledge gap: ${primary.command} - researching before retry`,
+        confidence,
+        failureClass,
+      };
+    case "infra":
+      return {
+        action: "escalate",
+        reason: `Infrastructure failure: ${primary.command} - tooling issue`,
+        confidence,
+        failureClass,
+      };
+    // Defensive fallback: classifyFailure only returns the four classes above
+    default:
+      return {
+        action: "pause",
+        reason: `Unclassified failure - pausing for review`,
+        confidence: 0,
+        failureClass: "unknown",
+      };
+  }
+}
+
+/**
+ * Execute the remediation decision and record observability.
+ *
+ * Returns true if the remediation was handled without needing to pause.
+ *
+ * "continue" and "retry" return true without side effects. "replan",
+ * "research", and "escalate" file a self-feedback entry and return false;
+ * "escalate" additionally notifies through `ctx.ui.notify` when available.
+ * Any other action (including "pause") returns false.
+ *
+ * @param {{ action: string, reason?: string, failureClass?: string }} decision
+ *   Decision produced by decideRemediation.
+ * @param {string} basePath - Passed through to recordSelfFeedback.
+ * @param {{ ui?: { notify?: Function } }} [ctx] - Optional UI context.
+ * @returns {Promise<boolean>} true when no pause is needed.
+ */
+export async function executeRemediation(decision, basePath, ctx) {
+  switch (decision.action) {
+    case "continue":
+      return true;
+    case "retry":
+      // Retry is handled by the caller (auto-verification sets pendingVerificationRetry)
+      return true;
+    case "replan":
+      // File self-feedback requesting replan
+      recordSelfFeedback(
+        {
+          kind: "gap:verification-structural-failure",
+          severity: "medium",
+          summary:
+            "Verification found structural failure - replanning recommended",
+          evidence: decision.reason,
+          suggestedFix: "Run replan-slice for the active slice",
+        },
+        basePath,
+      );
+      return false; // pause so operator can review replan
+    case "research":
+      recordSelfFeedback(
+        {
+          kind: "gap:verification-knowledge-gap",
+          severity: "medium",
+          summary: "Verification found knowledge gap - research needed",
+          evidence: decision.reason,
+          suggestedFix: "Run research-slice for the active slice",
+        },
+        basePath,
+      );
+      return false;
+    case "escalate": {
+      // decideRemediation escalates for two reasons: an infra classification,
+      // or low confidence in any class. Only describe the failure as an
+      // infrastructure problem when that is what was classified. Decisions
+      // without a failureClass keep the original infra wording.
+      const isInfra =
+        decision.failureClass == null || decision.failureClass === "infra";
+      recordSelfFeedback(
+        {
+          // NOTE(review): kind is left unchanged because consumers of
+          // self-feedback kinds are not visible here; consider a dedicated
+          // kind for low-confidence escalations.
+          kind: "gap:verification-infra-failure",
+          severity: "high",
+          summary: isInfra
+            ? "Verification found infrastructure failure"
+            : "Verification failure escalated - classification confidence too low",
+          evidence: decision.reason,
+          suggestedFix: isInfra
+            ? "Check tooling and configuration"
+            : "Review the failing verification output manually",
+        },
+        basePath,
+      );
+      // Optional call: a ctx whose ui has no notify must not turn an
+      // escalation into a TypeError.
+      ctx?.ui?.notify?.(
+        isInfra
+          ? `Infrastructure failure escalated to self-feedback: ${decision.reason}`
+          : `Verification failure escalated to self-feedback: ${decision.reason}`,
+        "error",
+      );
+      return false;
+    }
+    default:
+      return false;
+  }
+}
diff --git a/src/resources/extensions/sf/tests/remediation-dispatcher.test.mjs b/src/resources/extensions/sf/tests/remediation-dispatcher.test.mjs
new file mode 100644
index 000000000..4d819383b
--- /dev/null
+++ b/src/resources/extensions/sf/tests/remediation-dispatcher.test.mjs
@@ -0,0 +1,157 @@
+/**
+ * RemediationDispatcher tests — failure classification, confidence scoring,
+ * and decision routing.
+ */
+import assert from "node:assert/strict";
+import { test } from "vitest";
+import {
+  classifyFailure,
+  computeConfidence,
+  decideRemediation,
+} from "../remediation-dispatcher.js";
+
+test("classifyFailure_transient_patterns", () => {
+  assert.equal(classifyFailure("Request timeout after 5000ms", 1), "transient");
+  assert.equal(classifyFailure("ECONNRESET: Connection reset", 1), "transient");
+  assert.equal(classifyFailure("flaky test failure", 1), "transient");
+});
+
+test("classifyFailure_structural_patterns", () => {
+  assert.equal(
+    classifyFailure("Cannot find module '../utils'", 1),
+    "structural",
+  );
+  assert.equal(
+    classifyFailure("SyntaxError: Unexpected token", 1),
+    "structural",
+  );
+  assert.equal(
+    classifyFailure("TypeError: Cannot read property", 1),
+    "structural",
+  );
+});
+
+test("classifyFailure_knowledge_patterns", () => {
+  assert.equal(
+    classifyFailure("Not implemented: support for async generators", 1),
+    "knowledge",
+  );
+  assert.equal(classifyFailure("TODO: handle edge case", 1), "knowledge");
+  assert.equal(classifyFailure("Missing context for decision", 1), "knowledge");
+});
+
+test("classifyFailure_infra_patterns", () => {
+  assert.equal(classifyFailure("Command not found: jest", 127), "infra");
+  assert.equal(
+    classifyFailure("ENOENT: no such file or directory, open '.eslintrc'", 1),
+    "infra",
+  );
+});
+
+test("classifyFailure_default_heuristic", () => {
+  // Short stderr + exit 1 means transient.
+  assert.equal(classifyFailure("oops", 1), "transient");
+  // Long stderr + exit 1 means structural.
+  assert.equal(classifyFailure("a".repeat(500), 1), "structural");
+});
+
+test("computeConfidence_single_check_high_confidence", () => {
+  const result = {
+    checks: [{ command: "test", exitCode: 1, stderr: "timeout" }],
+    attempt: 0,
+  };
+  const c = computeConfidence("transient", result);
+  assert.ok(c >= 0.8, `expected high confidence, got ${c}`);
+});
+
+test("computeConfidence_multiple_checks_reduces_confidence", () => {
+  const result = {
+    checks: [
+      { command: "test", exitCode: 1, stderr: "timeout" },
+      { command: "lint", exitCode: 1, stderr: "error" },
+    ],
+    attempt: 0,
+  };
+  const c = computeConfidence("transient", result);
+  assert.ok(c < 0.7, `expected lower confidence for multiple checks, got ${c}`);
+});
+
+test("computeConfidence_attempt_penalty", () => {
+  const result = {
+    checks: [{ command: "test", exitCode: 1, stderr: "timeout" }],
+    attempt: 3,
+  };
+  const c = computeConfidence("transient", result);
+  assert.ok(c < 0.5, `expected low confidence after 3 attempts, got ${c}`);
+});
+
+test("decideRemediation_transient_high_confidence_returns_retry", () => {
+  const result = {
+    checks: [{ command: "test", exitCode: 1, stderr: "timeout" }],
+    attempt: 0,
+  };
+  const d = decideRemediation(result);
+  assert.equal(d.action, "retry");
+  assert.equal(d.failureClass, "transient");
+  assert.ok(d.confidence >= 0.6);
+});
+
+test("decideRemediation_structural_returns_replan", () => {
+  const result = {
+    checks: [{ command: "test", exitCode: 1, stderr: "Cannot find module" }],
+    attempt: 0,
+  };
+  const d = decideRemediation(result);
+  assert.equal(d.action, "replan");
+  assert.equal(d.failureClass, "structural");
+});
+
+test("decideRemediation_knowledge_returns_research", () => {
+  const result = {
+    checks: [{ command: "test", exitCode: 1, stderr: "Not implemented" }],
+    attempt: 0,
+  };
+  const d = decideRemediation(result);
+  assert.equal(d.action, "research");
+  assert.equal(d.failureClass, "knowledge");
+});
+
+test("decideRemediation_infra_returns_escalate", () => {
+  const result = {
+    checks: [
+      { command: "test", exitCode: 127, stderr: "Command not found: jest" },
+    ],
+    attempt: 0,
+  };
+  const d = decideRemediation(result);
+  assert.equal(d.action, "escalate");
+  assert.equal(d.failureClass, "infra");
+});
+
+test("decideRemediation_low_confidence_returns_escalate", () => {
+  const result = {
+    checks: [{ command: "test", exitCode: 1, stderr: "unknown weird error" }],
+    attempt: 5,
+  };
+  const d = decideRemediation(result);
+  assert.equal(d.action, "escalate");
+  assert.ok(d.confidence < 0.4);
+});
+
+test("decideRemediation_no_failed_checks_returns_continue", () => {
+  const result = {
+    checks: [{ command: "test", exitCode: 0, stderr: "" }],
+  };
+  const d = decideRemediation(result);
+  assert.equal(d.action, "continue");
+});
+
+test("decideRemediation_respects_autoFixThreshold", () => {
+  const result = {
+    checks: [{ command: "test", exitCode: 1, stderr: "Cannot find module" }],
+    attempt: 0,
+  };
+  // A single structural failure scores 0.9 (0.8 base + 0.1 class boost), so
+  // the default threshold of 0.6 lets it replan...
+  assert.equal(decideRemediation(result).action, "replan");
+  // ...while a threshold above that score pauses for review. A threshold of
+  // exactly 0.9 would not trigger, because the gate is a strict `<`.
+  const d = decideRemediation(result, { autoFixThreshold: 0.95 });
+  assert.equal(d.action, "pause");
+  assert.equal(d.failureClass, "structural");
+});
diff --git a/src/resources/extensions/sf/tests/state-all-skipped-replan.test.mjs b/src/resources/extensions/sf/tests/state-all-skipped-replan.test.mjs
index aea555ea1..4e39649e4 100644
--- a/src/resources/extensions/sf/tests/state-all-skipped-replan.test.mjs
+++ b/src/resources/extensions/sf/tests/state-all-skipped-replan.test.mjs
@@ -150,7 +150,7 @@ test("multiple_skipped_slices_when_deriving_state_returns_pre_planning", async (
   assert.equal(state.phase, "pre-planning");
 });
 
-test("all_slices_skipped_when_owned_requirements_complete_returns_completing_milestone", async () => {
+test("all_slices_skipped_when_owned_requirements_complete_returns_validating_milestone", async () => {
   const dir = makeProject("M505");
insertSlice({ milestoneId: "M505", @@ -179,8 +179,9 @@ test("all_slices_skipped_when_owned_requirements_complete_returns_completing_mil const state = await deriveState(dir); assert.equal(state.activeMilestone?.id, "M505"); - assert.equal(state.phase, "completing-milestone"); + assert.equal(state.phase, "validating-milestone"); assert.match(state.nextAction, /All 2 requirement\(s\) owned by M505/); + assert.match(state.nextAction, /Run validate-milestone/); }); test("all_slices_skipped_when_owned_requirement_incomplete_returns_pre_planning", async () => { diff --git a/src/resources/extensions/sf/tests/state-requirements-complete.test.mjs b/src/resources/extensions/sf/tests/state-requirements-complete.test.mjs index 3dff398ba..1c5164004 100644 --- a/src/resources/extensions/sf/tests/state-requirements-complete.test.mjs +++ b/src/resources/extensions/sf/tests/state-requirements-complete.test.mjs @@ -2,7 +2,7 @@ * Requirements-aware milestone completion (sf-mp74hftw-zud6ba). * * When every owning requirement for the active milestone is closed, - * deriveState should route to completing-milestone instead of + * deriveState should route to validating-milestone instead of * pre-planning, regardless of slice state. This prevents: * * - re-decomposition of milestones whose work was tracked at the @@ -11,7 +11,7 @@ * REQUIREMENTS.md says is already done * * Verified end-to-end against dr-repo M003 (8 owning requirements - * all complete; previously re-planned, now completes). + * all complete; previously re-planned, now validates). 
*/ import assert from "node:assert/strict"; @@ -95,7 +95,7 @@ const REQS_MIXED = `# Requirements - Primary owning milestone: M051 `; -test("requirements all complete + slice skipped → completing-milestone (not pre-planning)", async () => { +test("requirements_all_complete_and_slice_skipped_when_deriving_state_returns_validating_milestone", async () => { const dir = makeProject("M050", REQS_ALL_COMPLETE); insertSlice({ milestoneId: "M050", @@ -108,14 +108,15 @@ test("requirements all complete + slice skipped → completing-milestone (not pr const state = await deriveState(dir); assert.equal(state.activeMilestone?.id, "M050"); - assert.equal(state.phase, "completing-milestone"); + assert.equal(state.phase, "validating-milestone"); assert.match( state.nextAction, /All 2 requirement\(s\) owned by M050 are marked complete/, ); + assert.match(state.nextAction, /Run validate-milestone/); }); -test("some requirements still active → pre-planning rule still fires", async () => { +test("some_requirements_still_active_when_deriving_state_returns_pre_planning", async () => { const dir = makeProject("M051", REQS_MIXED); insertSlice({ milestoneId: "M051", @@ -130,7 +131,7 @@ test("some requirements still active → pre-planning rule still fires", async ( assert.equal(state.phase, "pre-planning"); }); -test("zero owning requirements falls through to slice-based check", async () => { +test("zero_owning_requirements_when_deriving_state_uses_slice_based_check", async () => { const reqsNoOwning = `# Requirements ## Validated ### REQ-99 — unrelated @@ -150,12 +151,12 @@ test("zero owning requirements falls through to slice-based check", async () => const state = await deriveState(dir); - // No owning requirements → can't use requirements gate → slice-based + // No owning requirements means the slice-based check still owns routing. // check fires, routes to pre-planning per the existing skipped rule. 
assert.equal(state.phase, "pre-planning"); }); -test("missing REQUIREMENTS.md doesn't break state derivation", async () => { +test("missing_requirements_md_when_deriving_state_uses_slice_based_check", async () => { const dir = mkdtempSync(join(tmpdir(), "sf-state-reqs-")); tmpDirs.push(dir); mkdirSync(join(dir, ".sf", "milestones", "M053", "slices", "S01"), { @@ -177,11 +178,11 @@ test("missing REQUIREMENTS.md doesn't break state derivation", async () => { const state = await deriveState(dir); - // No REQUIREMENTS.md → fallback to slice-based rule. + // No REQUIREMENTS.md means the slice-based rule still applies. assert.equal(state.phase, "pre-planning"); }); -test("requirements complete + real slice work → still completing", async () => { +test("requirements_complete_and_real_slice_work_when_deriving_state_returns_validating_milestone", async () => { // Even when a real slice exists, all-reqs-complete short-circuits // (this path doesn't get to the slice-real-work check). const dir = makeProject("M054", REQS_ALL_COMPLETE.replace(/M050/g, "M054")); @@ -195,5 +196,5 @@ test("requirements complete + real slice work → still completing", async () => const state = await deriveState(dir); - assert.equal(state.phase, "completing-milestone"); + assert.equal(state.phase, "validating-milestone"); });