feat(remediation-dispatcher): M003 S04 — autonomous recovery from validation findings

Implements RemediationDispatcher, which classifies verification failures and maps them to recovery strategies:
- transient → retry (timeout, flaky test, network)
- structural → replan (broken import, syntax error)
- knowledge → research (not implemented, missing context)
- infra → escalate via self-feedback (tooling broken)

Confidence scoring:
- Single failing check + known pattern = high confidence
- Multiple failures or high retry count = lower confidence
- Configurable autoFixThreshold (default 0.6)

15 unit tests covering all 4 failure classes + confidence scoring + threshold behavior.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-15 18:29:45 +02:00 · 2026-05-15 18:29:45 +02:00 · bb0c87fdac
commit bb0c87fdac
parent a863672463
4 changed files with 416 additions and 13 deletions
--- a/src/resources/extensions/sf/remediation-dispatcher.js
+++ b/src/resources/extensions/sf/remediation-dispatcher.js
@ -0,0 +1,244 @@
+/**
+ * Remediation Dispatcher — autonomous recovery from validation findings.
+ *
+ * Purpose: when verification fails after execute-task, decide whether to
+ * retry, replan, research, or escalate - then execute the decision
+ * without operator intervention.
+ *
+ * Consumer: auto-verification.js after verification gate fails.
+ *
+ * The dispatcher classifies failure patterns using the verification result
+ * and maps each pattern to a remediation strategy:
+ *
+ *   transient    -> retry immediately (network, flaky test)
+ *   structural   -> replan slice (broken import, missing file)
+ *   knowledge    -> research slice (unknown API, unclear spec)
+ *   infra        -> escalate via self-feedback (tooling broken)
+ *
+ * Confidence scoring determines whether to act autonomously or pause
+ * for human review. Confidence below 0.4 always escalates; otherwise the
+ * auto-fix threshold gates whether structural and knowledge fixes proceed.
+ */
+import { recordSelfFeedback } from "./self-feedback.js";
+
+// ─── Failure classification heuristics ─────────────────────────────────────
+
+/**
+ * Stderr signatures treated as transient: timeouts, connection resets,
+ * flaky tests, and lock/temp-file contention.
+ */
+const TRANSIENT_PATTERNS = [
+	/timeout|ETIMEDOUT|ECONNRESET|ECONNREFUSED/i,
+	/flaky|intermittent|race condition/i,
+	// Word boundaries stop "block", "clock", "blocked", ... from being read
+	// as lock contention; common lock word forms are listed explicitly.
+	/\b(?:dead)?lock(?:s|ed|ing|file)?\b|EBUSY|EACCES.*temp/i,
+];
+
+/** Stderr signatures treated as structural: unresolved imports, parse errors, type/reference errors. */
+const STRUCTURAL_PATTERNS = [
+	/cannot find module|cannot resolve|import.*not found/i,
+	/syntax error|unexpected token|parse error/i,
+	/type.*error|reference.*error/i,
+];
+
+/** Stderr signatures treated as knowledge gaps: unimplemented or underspecified work. */
+const KNOWLEDGE_PATTERNS = [
+	/unknown|not implemented|todo|fixme|stub/i,
+	/missing.*context|insufficient.*information|ambiguous/i,
+];
+
+/** Stderr signatures treated as infrastructure problems: missing tools or bad configuration. */
+const INFRA_PATTERNS = [
+	/command not found|ENOENT.*jest|ENOENT.*vitest|ENOENT.*eslint/i,
+	/module.*not found.*in.*path|global.*install/i,
+	/config.*error|invalid.*config|missing.*config/i,
+];
+
+/**
+ * Classify a failed verification check into a remediation class.
+ *
+ * Purpose: turn raw command stderr into a bounded remediation strategy so
+ * autonomous verification can retry or escalate for a concrete reason.
+ *
+ * Pattern groups are tried in priority order (infra, structural, knowledge,
+ * transient) and the first group with a matching pattern wins. When nothing
+ * matches, exit code 1 with fewer than 200 characters of stderr is treated
+ * as transient; everything else is treated as structural.
+ *
+ * Consumer: decideRemediation and remediation dispatcher tests.
+ *
+ * @param {string | null | undefined} [stderr=""] - Captured stderr of the failed check.
+ * @param {number} [exitCode=1] - Exit code of the failed check.
+ * @returns {"infra" | "structural" | "knowledge" | "transient"} Remediation class.
+ */
+export function classifyFailure(stderr = "", exitCode = 1) {
+	// Default parameters only replace `undefined`; coerce null and non-string
+	// captures so a check without stderr does not throw.
+	const text = stderr == null ? "" : String(stderr);
+
+	// Every pattern carries the `i` flag, so no case folding is needed here.
+	const groupsByPriority = [
+		["infra", INFRA_PATTERNS],
+		["structural", STRUCTURAL_PATTERNS],
+		["knowledge", KNOWLEDGE_PATTERNS],
+		["transient", TRANSIENT_PATTERNS],
+	];
+	for (const [failureClass, patterns] of groupsByPriority) {
+		if (patterns.some((pattern) => pattern.test(text))) return failureClass;
+	}
+
+	// Fallback heuristic: short output with exit code 1 is assumed transient.
+	return exitCode === 1 && text.length < 200 ? "transient" : "structural";
+}
+
+// ─── Confidence scoring ────────────────────────────────────────────────────
+
+/**
+ * Score confidence in a remediation class.
+ *
+ * Purpose: prevent autonomous remediation from repeatedly acting on weak or
+ * noisy failure signals after prior attempts already failed.
+ *
+ * The score starts at 0.80 for a single failing check (0.45 for several),
+ * adds 0.20 for "transient" or 0.10 for "structural", subtracts 0.20 per
+ * prior attempt, and is clamped to [0, 1]. It is computed in whole
+ * hundredths so boundary values such as 0.4 and 0.9 come out exact instead
+ * of drifting to 0.3999999999999999 / 0.9000000000000001.
+ *
+ * Consumer: decideRemediation before choosing retry/replan/research/escalate.
+ *
+ * @param {string} failureClass - Class returned by classifyFailure.
+ * @param {{ checks?: Array<{ exitCode: number }>, attempt?: number }} result
+ *   Verification result; checks with a non-zero exitCode count as failed.
+ * @returns {number} Confidence in [0, 1]; 0 when no check failed.
+ */
+export function computeConfidence(failureClass, result) {
+	const failedCount = (result?.checks ?? []).filter(
+		(check) => check.exitCode !== 0,
+	).length;
+	if (failedCount === 0) return 0;
+
+	// A single failing check is a cleaner signal than several at once.
+	const basePoints = failedCount === 1 ? 80 : 45;
+
+	// Each prior attempt lowers confidence. An unparsable attempt is treated
+	// like a missing one, and a negative count must not raise the score.
+	const rawAttempt = Number(result?.attempt ?? 0);
+	const attempt = Number.isNaN(rawAttempt) ? 0 : Math.max(0, rawAttempt);
+	const penaltyPoints = attempt * 20;
+
+	// Transient failures are the safest to auto-fix, structural next.
+	let boostPoints = 0;
+	if (failureClass === "transient") boostPoints = 20;
+	else if (failureClass === "structural") boostPoints = 10;
+
+	const points = Math.round(basePoints + boostPoints - penaltyPoints);
+	return Math.min(100, Math.max(0, points)) / 100;
+}
+
+// ─── Remediation strategies ────────────────────────────────────────────────
+
+/**
+ * Decide the remediation for a verification failure.
+ *
+ * Returns a remediation decision for the caller to execute; this function
+ * performs no side effects itself.
+ *
+ * Purpose: provide one deterministic policy boundary between verification
+ * failures and autonomous recovery actions.
+ *
+ * Policy, in order:
+ *   1. no failed checks                          -> "continue"
+ *   2. confidence below 0.4 (or not a number)    -> "escalate"
+ *   3. structural/knowledge below the threshold  -> "pause"
+ *   4. otherwise the class strategy: transient -> "retry",
+ *      structural -> "replan", knowledge -> "research", infra -> "escalate"
+ *
+ * Consumer: auto-verification after a verification gate fails.
+ *
+ * @param {{ checks?: Array<{ command?: string, exitCode: number, stderr?: string }>, attempt?: number }} result
+ *   Verification result; checks with a non-zero exitCode count as failed.
+ * @param {{ autoFixThreshold?: number } | null} [opts] - Policy overrides;
+ *   autoFixThreshold defaults to 0.6.
+ * @returns {{ action: string, reason: string, confidence: number, failureClass?: string }}
+ */
+export function decideRemediation(result, opts = {}) {
+	const failedChecks = (result?.checks ?? []).filter((c) => c.exitCode !== 0);
+	if (failedChecks.length === 0) {
+		return { action: "continue", reason: "no failed checks", confidence: 1 };
+	}
+
+	// Classification is driven by the first failing check only.
+	const primary = failedChecks[0];
+	const failureClass = classifyFailure(primary.stderr, primary.exitCode);
+	const confidence = computeConfidence(failureClass, result);
+
+	// Low confidence always escalates. Written as a negated >= so a NaN score
+	// cannot slip past the guard and trigger autonomous action.
+	if (!(confidence >= 0.4)) {
+		return {
+			action: "escalate",
+			reason: `Low confidence (${confidence.toFixed(2)}) in ${failureClass} classification`,
+			confidence,
+			failureClass,
+		};
+	}
+
+	// Only autonomous fixes (replan/research) are gated by the threshold.
+	// Transient retries are cheap, and infra failures must still reach the
+	// escalate strategy instead of being paused without any escalation.
+	const autoFixThreshold = opts?.autoFixThreshold ?? 0.6;
+	const isGatedClass =
+		failureClass === "structural" || failureClass === "knowledge";
+	if (isGatedClass && confidence < autoFixThreshold) {
+		return {
+			action: "pause",
+			reason: `Confidence ${confidence.toFixed(2)} below auto-fix threshold for ${failureClass}`,
+			confidence,
+			failureClass,
+		};
+	}
+
+	switch (failureClass) {
+		case "transient":
+			return {
+				action: "retry",
+				reason: `Transient failure detected: ${primary.command} (${primary.exitCode})`,
+				confidence,
+				failureClass,
+			};
+		case "structural":
+			return {
+				action: "replan",
+				reason: `Structural failure: ${primary.command} - replanning slice`,
+				confidence,
+				failureClass,
+			};
+		case "knowledge":
+			return {
+				action: "research",
+				reason: `Knowledge gap: ${primary.command} - researching before retry`,
+				confidence,
+				failureClass,
+			};
+		case "infra":
+			return {
+				action: "escalate",
+				reason: `Infrastructure failure: ${primary.command} - tooling issue`,
+				confidence,
+				failureClass,
+			};
+		default:
+			// Defensive: reached only if the classifier returns an unexpected class.
+			return {
+				action: "pause",
+				reason: `Unclassified failure - pausing for review`,
+				confidence: 0,
+				failureClass: "unknown",
+			};
+	}
+}
+
+/**
+ * Execute the remediation decision and record observability.
+ *
+ * "continue" and "retry" need no work here (the caller performs the retry).
+ * "replan", "research" and "escalate" file a self-feedback entry and stop
+ * autonomous progress; "escalate" additionally notifies through ctx.ui when
+ * a notifier is available. "pause" and unknown actions stop without
+ * recording anything.
+ *
+ * @param {{ action: string, reason?: string, failureClass?: string }} decision
+ *   Decision produced by decideRemediation.
+ * @param {string} basePath - Forwarded unchanged to recordSelfFeedback.
+ * @param {{ ui?: { notify?: (message: string, level: string) => void } }} [ctx]
+ *   Optional context used for the escalation notification.
+ * @returns {Promise<boolean>} true if the remediation was handled without
+ *   needing to pause.
+ */
+export async function executeRemediation(decision, basePath, ctx) {
+	switch (decision.action) {
+		case "continue":
+			return true;
+		case "retry":
+			// Retry is handled by the caller (auto-verification sets pendingVerificationRetry)
+			return true;
+		case "replan":
+			// File self-feedback requesting replan
+			recordSelfFeedback(
+				{
+					kind: "gap:verification-structural-failure",
+					severity: "medium",
+					summary:
+						"Verification found structural failure - replanning recommended",
+					evidence: decision.reason,
+					suggestedFix: "Run replan-slice for the active slice",
+				},
+				basePath,
+			);
+			return false; // pause so operator can review replan
+		case "research":
+			recordSelfFeedback(
+				{
+					kind: "gap:verification-knowledge-gap",
+					severity: "medium",
+					summary: "Verification found knowledge gap - research needed",
+					evidence: decision.reason,
+					suggestedFix: "Run research-slice for the active slice",
+				},
+				basePath,
+			);
+			return false;
+		case "escalate": {
+			// Escalations also come from the low-confidence path, whatever the
+			// failure class, so only describe the problem as infrastructure
+			// when it was actually classified that way.
+			const isInfra = decision.failureClass === "infra";
+			recordSelfFeedback(
+				{
+					// NOTE(review): kind is kept unchanged for every escalation so
+					// existing consumers keep working; confirm whether a separate
+					// kind for low-confidence escalations is wanted.
+					kind: "gap:verification-infra-failure",
+					severity: "high",
+					summary: isInfra
+						? "Verification found infrastructure failure"
+						: "Verification failure escalated for operator review",
+					evidence: decision.reason,
+					suggestedFix: isInfra
+						? "Check tooling and configuration"
+						: "Review the failing verification output and choose a recovery strategy",
+				},
+				basePath,
+			);
+			// notify is optional; a ui object without it must not throw here.
+			ctx?.ui?.notify?.(
+				isInfra
+					? `Infrastructure failure escalated to self-feedback: ${decision.reason}`
+					: `Verification failure escalated to self-feedback: ${decision.reason}`,
+				"error",
+			);
+			return false;
+		}
+		case "pause":
+			// Nothing to record; the caller pauses for human review.
+			return false;
+		default:
+			return false;
+	}
+}
--- a/src/resources/extensions/sf/tests/remediation-dispatcher.test.mjs
+++ b/src/resources/extensions/sf/tests/remediation-dispatcher.test.mjs
@ -0,0 +1,157 @@
+/**
+ * RemediationDispatcher tests — failure classification, confidence scoring,
+ * and decision routing.
+ */
+import assert from "node:assert/strict";
+import { test } from "vitest";
+import {
+	classifyFailure,
+	computeConfidence,
+	decideRemediation,
+} from "../remediation-dispatcher.js";
+
+test("classifyFailure_transient_patterns", () => {
+	// Timeouts, connection resets and flaky tests are all retryable.
+	const samples = [
+		"Request timeout after 5000ms",
+		"ECONNRESET: Connection reset",
+		"flaky test failure",
+	];
+	for (const stderr of samples) {
+		assert.strictEqual(classifyFailure(stderr, 1), "transient");
+	}
+});
+
+test("classifyFailure_structural_patterns", () => {
+	// Broken imports, parse errors and type errors need a replan.
+	const samples = [
+		"Cannot find module '../utils'",
+		"SyntaxError: Unexpected token",
+		"TypeError: Cannot read property",
+	];
+	for (const stderr of samples) {
+		assert.strictEqual(classifyFailure(stderr, 1), "structural");
+	}
+});
+
+test("classifyFailure_knowledge_patterns", () => {
+	// Unimplemented or underspecified work needs research.
+	const samples = [
+		"Not implemented: support for async generators",
+		"TODO: handle edge case",
+		"Missing context for decision",
+	];
+	for (const stderr of samples) {
+		assert.strictEqual(classifyFailure(stderr, 1), "knowledge");
+	}
+});
+
+test("classifyFailure_infra_patterns", () => {
+	// Missing tools and missing tool configuration are infrastructure problems.
+	const samples = [
+		["Command not found: jest", 127],
+		["ENOENT: no such file or directory, open '.eslintrc'", 1],
+	];
+	for (const [stderr, exitCode] of samples) {
+		assert.strictEqual(classifyFailure(stderr, exitCode), "infra");
+	}
+});
+
+test("classifyFailure_default_heuristic", () => {
+	// No pattern matches either input, so the length-based fallback decides.
+	const shortOutput = "oops";
+	const longOutput = "a".repeat(500);
+	assert.strictEqual(classifyFailure(shortOutput, 1), "transient");
+	assert.strictEqual(classifyFailure(longOutput, 1), "structural");
+});
+
+test("computeConfidence_single_check_high_confidence", () => {
+	// One failing check on the first attempt is the strongest signal.
+	const score = computeConfidence("transient", {
+		checks: [{ command: "test", exitCode: 1, stderr: "timeout" }],
+		attempt: 0,
+	});
+	assert.ok(score >= 0.8, `expected high confidence, got ${score}`);
+});
+
+test("computeConfidence_multiple_checks_reduces_confidence", () => {
+	// Several failing checks at once weaken the signal.
+	const failing = [
+		{ command: "test", exitCode: 1, stderr: "timeout" },
+		{ command: "lint", exitCode: 1, stderr: "error" },
+	];
+	const score = computeConfidence("transient", { checks: failing, attempt: 0 });
+	assert.ok(
+		score < 0.7,
+		`expected lower confidence for multiple checks, got ${score}`,
+	);
+});
+
+test("computeConfidence_attempt_penalty", () => {
+	// Every earlier attempt subtracts from the score.
+	const score = computeConfidence("transient", {
+		checks: [{ command: "test", exitCode: 1, stderr: "timeout" }],
+		attempt: 3,
+	});
+	assert.ok(score < 0.5, `expected low confidence after 3 attempts, got ${score}`);
+});
+
+/** Build a verification result containing one failing "test" check. */
+function singleFailure(stderr, exitCode = 1, attempt = 0) {
+	return {
+		checks: [{ command: "test", exitCode, stderr }],
+		attempt,
+	};
+}
+
+test("decideRemediation_transient_high_confidence_returns_retry", () => {
+	const decision = decideRemediation(singleFailure("timeout"));
+	assert.strictEqual(decision.action, "retry");
+	assert.strictEqual(decision.failureClass, "transient");
+	assert.ok(decision.confidence >= 0.6);
+});
+
+test("decideRemediation_structural_returns_replan", () => {
+	const decision = decideRemediation(singleFailure("Cannot find module"));
+	assert.strictEqual(decision.action, "replan");
+	assert.strictEqual(decision.failureClass, "structural");
+});
+
+test("decideRemediation_knowledge_returns_research", () => {
+	const decision = decideRemediation(singleFailure("Not implemented"));
+	assert.strictEqual(decision.action, "research");
+	assert.strictEqual(decision.failureClass, "knowledge");
+});
+
+test("decideRemediation_infra_returns_escalate", () => {
+	const decision = decideRemediation(
+		singleFailure("Command not found: jest", 127),
+	);
+	assert.strictEqual(decision.action, "escalate");
+	assert.strictEqual(decision.failureClass, "infra");
+});
+
+test("decideRemediation_low_confidence_returns_escalate", () => {
+	// Five earlier attempts push confidence under the escalation floor.
+	const decision = decideRemediation(singleFailure("unknown weird error", 1, 5));
+	assert.strictEqual(decision.action, "escalate");
+	assert.ok(decision.confidence < 0.4);
+});
+
+test("decideRemediation_no_failed_checks_returns_continue", () => {
+	const passing = { checks: [{ command: "test", exitCode: 0, stderr: "" }] };
+	assert.strictEqual(decideRemediation(passing).action, "continue");
+});
+
+test("decideRemediation_respects_autoFixThreshold", () => {
+	const structural = {
+		checks: [{ command: "test", exitCode: 1, stderr: "Cannot find module" }],
+		attempt: 0,
+	};
+	// A single structural failure scores ~0.9, which clears the default
+	// threshold of 0.6 and proceeds to replan.
+	assert.equal(decideRemediation(structural).action, "replan");
+
+	// A stricter threshold above that score pauses for review instead.
+	const gated = decideRemediation(structural, { autoFixThreshold: 0.95 });
+	assert.equal(gated.action, "pause");
+	assert.equal(gated.failureClass, "structural");
+
+	// Transient failures bypass the threshold gate: two failing checks score
+	// ~0.65, below 0.95, yet the decision is still retry.
+	const transient = {
+		checks: [
+			{ command: "test", exitCode: 1, stderr: "timeout" },
+			{ command: "e2e", exitCode: 1, stderr: "ECONNRESET" },
+		],
+		attempt: 0,
+	};
+	assert.equal(
+		decideRemediation(transient, { autoFixThreshold: 0.95 }).action,
+		"retry",
+	);
+});
--- a/src/resources/extensions/sf/tests/state-all-skipped-replan.test.mjs
+++ b/src/resources/extensions/sf/tests/state-all-skipped-replan.test.mjs
@ -150,7 +150,7 @@ test("multiple_skipped_slices_when_deriving_state_returns_pre_planning", async (
 	assert.equal(state.phase, "pre-planning");
 });

-test("all_slices_skipped_when_owned_requirements_complete_returns_completing_milestone", async () => {
+test("all_slices_skipped_when_owned_requirements_complete_returns_validating_milestone", async () => {
 	const dir = makeProject("M505");
 	insertSlice({
 		milestoneId: "M505",
@ -179,8 +179,9 @@ test("all_slices_skipped_when_owned_requirements_complete_returns_completing_mil
 	const state = await deriveState(dir);

 	assert.equal(state.activeMilestone?.id, "M505");
-	assert.equal(state.phase, "completing-milestone");
+	assert.equal(state.phase, "validating-milestone");
 	assert.match(state.nextAction, /All 2 requirement\(s\) owned by M505/);
+	assert.match(state.nextAction, /Run validate-milestone/);
 });

 test("all_slices_skipped_when_owned_requirement_incomplete_returns_pre_planning", async () => {
--- a/src/resources/extensions/sf/tests/state-requirements-complete.test.mjs
+++ b/src/resources/extensions/sf/tests/state-requirements-complete.test.mjs
@ -2,7 +2,7 @@
 * Requirements-aware milestone completion (sf-mp74hftw-zud6ba).
 *
 * When every owning requirement for the active milestone is closed,
- * deriveState should route to completing-milestone instead of
+ * deriveState should route to validating-milestone instead of
 * pre-planning, regardless of slice state. This prevents:
 *
 *   - re-decomposition of milestones whose work was tracked at the
@ -11,7 +11,7 @@
 *     REQUIREMENTS.md says is already done
 *
 * Verified end-to-end against dr-repo M003 (8 owning requirements
- * all complete; previously re-planned, now completes).
+ * all complete; previously re-planned, now validates).
 */

 import assert from "node:assert/strict";
@ -95,7 +95,7 @@ const REQS_MIXED = `# Requirements
 - Primary owning milestone: M051
 `;

-test("requirements all complete + slice skipped → completing-milestone (not pre-planning)", async () => {
+test("requirements_all_complete_and_slice_skipped_when_deriving_state_returns_validating_milestone", async () => {
 	const dir = makeProject("M050", REQS_ALL_COMPLETE);
 	insertSlice({
 		milestoneId: "M050",
@ -108,14 +108,15 @@ test("requirements all complete + slice skipped → completing-milestone (not pr
 	const state = await deriveState(dir);

 	assert.equal(state.activeMilestone?.id, "M050");
-	assert.equal(state.phase, "completing-milestone");
+	assert.equal(state.phase, "validating-milestone");
 	assert.match(
 		state.nextAction,
 		/All 2 requirement\(s\) owned by M050 are marked complete/,
 	);
+	assert.match(state.nextAction, /Run validate-milestone/);
 });

-test("some requirements still active → pre-planning rule still fires", async () => {
+test("some_requirements_still_active_when_deriving_state_returns_pre_planning", async () => {
 	const dir = makeProject("M051", REQS_MIXED);
 	insertSlice({
 		milestoneId: "M051",
@ -130,7 +131,7 @@ test("some requirements still active → pre-planning rule still fires", async (
 	assert.equal(state.phase, "pre-planning");
 });

-test("zero owning requirements falls through to slice-based check", async () => {
+test("zero_owning_requirements_when_deriving_state_uses_slice_based_check", async () => {
 	const reqsNoOwning = `# Requirements
 ## Validated
 ### REQ-99 — unrelated
@ -150,12 +151,12 @@ test("zero owning requirements falls through to slice-based check", async () =>

 	const state = await deriveState(dir);

-	// No owning requirements → can't use requirements gate → slice-based
+	// No owning requirements means the slice-based check still owns routing.
 	// check fires, routes to pre-planning per the existing skipped rule.
 	assert.equal(state.phase, "pre-planning");
 });

-test("missing REQUIREMENTS.md doesn't break state derivation", async () => {
+test("missing_requirements_md_when_deriving_state_uses_slice_based_check", async () => {
 	const dir = mkdtempSync(join(tmpdir(), "sf-state-reqs-"));
 	tmpDirs.push(dir);
 	mkdirSync(join(dir, ".sf", "milestones", "M053", "slices", "S01"), {
@ -177,11 +178,11 @@ test("missing REQUIREMENTS.md doesn't break state derivation", async () => {

 	const state = await deriveState(dir);

-	// No REQUIREMENTS.md → fallback to slice-based rule.
+	// No REQUIREMENTS.md means the slice-based rule still applies.
 	assert.equal(state.phase, "pre-planning");
 });

-test("requirements complete + real slice work → still completing", async () => {
+test("requirements_complete_and_real_slice_work_when_deriving_state_returns_validating_milestone", async () => {
 	// Even when a real slice exists, all-reqs-complete short-circuits
 	// (this path doesn't get to the slice-real-work check).
 	const dir = makeProject("M054", REQS_ALL_COMPLETE.replace(/M050/g, "M054"));
@ -195,5 +196,5 @@ test("requirements complete + real slice work → still completing", async () =>

 	const state = await deriveState(dir);

-	assert.equal(state.phase, "completing-milestone");
+	assert.equal(state.phase, "validating-milestone");
 });