feat(verification): auto-defer confidence policy for low-risk findings

Implements M003 S05: auto-deferral policy for low-risk validation findings.

- New verification-defer-policy.js: classifyCheck, computeDeferConfidence, decideAutoDefer — classifies failed checks as deferrable/blocking/unknown
- Patterns: style/format/deprecation-only → deferrable; error/fail/crash/fatal → blocking (always wins)
- Confidence scoring: 0.9 all-deferrable, 0.7 mixed, 0.5 unknown, 0.0 blocking
- Threshold preference: verification_auto_defer_threshold (default 0.75)
- Integration in uok/auto-verification.js: checks defer before retry/pause, does not consume retry attempts, writes deferred: true + reasons to evidence JSON
- verification-evidence.js: forwards deferred/deferredReasons/deferConfidence fields
- Preferences wired: validation, types, serializer
- Tests: 6 unit tests for classification, confidence, threshold, blocking dominance

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-15 18:55:26 +02:00 · 2026-05-15 18:55:26 +02:00 · f48a4cc7c5
commit f48a4cc7c5
parent 1b3dba6e51
11 changed files with 337 additions and 0 deletions
--- a/src/resources/extensions/sf/docs/preferences-reference.md
+++ b/src/resources/extensions/sf/docs/preferences-reference.md
@ -249,6 +249,8 @@ Setting `prefer_skills: []` does **not** disable skill discovery — it just mea

 - `verification_max_retries`: number — maximum number of fix-and-retry cycles for verification failures. Default: `0` (no retries).

+- `verification_auto_defer_threshold`: number — confidence threshold from `0` to `1` for treating low-risk style/format/deprecation-only verification failures as deferred instead of blocking. Default: `0.75`.
+
 - `uat_dispatch`: boolean — when `true`, enables UAT (User Acceptance Testing) dispatch mode. Default: `false`.

 - `post_unit_hooks`: array — hooks that fire after a unit completes. Each entry has:
--- a/src/resources/extensions/sf/preferences-serializer.js
+++ b/src/resources/extensions/sf/preferences-serializer.js
@ -125,6 +125,7 @@ export function serializePreferencesToFrontmatter(prefs) {
 		"verification_commands",
 		"verification_auto_fix",
 		"verification_max_retries",
+		"verification_auto_defer_threshold",
 		"search_provider",
 		"context_selection",
 	];
--- a/src/resources/extensions/sf/preferences-types.js
+++ b/src/resources/extensions/sf/preferences-types.js
@ -101,6 +101,7 @@ export const KNOWN_PREFERENCE_KEYS = new Set([
 	"verification_commands",
 	"verification_auto_fix",
 	"verification_max_retries",
+	"verification_auto_defer_threshold",
 	"search_provider",
 	"context_selection",
 	"widget_mode",
--- a/src/resources/extensions/sf/preferences-validation.js
+++ b/src/resources/extensions/sf/preferences-validation.js
@ -1398,6 +1398,21 @@ export function validatePreferences(preferences) {
 			errors.push("verification_max_retries must be a non-negative number");
 		}
 	}
+	if (preferences.verification_auto_defer_threshold !== undefined) {
+		const raw = preferences.verification_auto_defer_threshold;
+		if (
+			typeof raw === "number" &&
+			Number.isFinite(raw) &&
+			raw >= 0 &&
+			raw <= 1
+		) {
+			validated.verification_auto_defer_threshold = raw;
+		} else {
+			errors.push(
+				"verification_auto_defer_threshold must be a number between 0 and 1",
+			);
+		}
+	}
 	// ─── Git Preferences ───────────────────────────────────────────────────
 	if (preferences.git && typeof preferences.git === "object") {
 		const git = {};
--- a/src/resources/extensions/sf/preferences.js
+++ b/src/resources/extensions/sf/preferences.js
@ -418,6 +418,9 @@ function mergePreferences(base, override) {
 			override.verification_auto_fix ?? base.verification_auto_fix,
 		verification_max_retries:
 			override.verification_max_retries ?? base.verification_max_retries,
+		verification_auto_defer_threshold:
+			override.verification_auto_defer_threshold ??
+			base.verification_auto_defer_threshold,
 		enhanced_verification:
 			override.enhanced_verification ?? base.enhanced_verification,
 		enhanced_verification_pre:
--- a/src/resources/extensions/sf/templates/preferences.yaml
+++ b/src/resources/extensions/sf/templates/preferences.yaml
@ -97,6 +97,7 @@ parallel:
 verification_commands: []
 verification_auto_fix:
 verification_max_retries:
+verification_auto_defer_threshold:
 notifications:
  enabled:
  on_complete:
--- a/src/resources/extensions/sf/tests/preferences-verification-auto-defer.test.mjs
+++ b/src/resources/extensions/sf/tests/preferences-verification-auto-defer.test.mjs
@ -0,0 +1,29 @@
+/**
+ * preferences-verification-auto-defer.test.mjs — threshold preference contract.
+ *
+ * Purpose: ensure the auto-defer verification threshold survives preference
+ * validation instead of being silently dropped before autonomous verification.
+ */
+import assert from "node:assert/strict";
+import { test } from "vitest";
+import { validatePreferences } from "../preferences-validation.js";
+
+test("validatePreferences_when_auto_defer_threshold_is_between_zero_and_one_keeps_value", () => {
+	// An in-range threshold must come back unchanged and produce no errors.
+	const threshold = 0.65;
+	const { errors, preferences } = validatePreferences({
+		verification_auto_defer_threshold: threshold,
+	});
+
+	assert.deepEqual(errors, []);
+	assert.equal(preferences.verification_auto_defer_threshold, threshold);
+});
+
+test("validatePreferences_when_auto_defer_threshold_is_out_of_range_reports_error", () => {
+	// 1.5 is above the accepted 0..1 range: the value is not kept and one error is reported.
+	const { errors, preferences } = validatePreferences({
+		verification_auto_defer_threshold: 1.5,
+	});
+
+	assert.equal(preferences.verification_auto_defer_threshold, undefined);
+	assert.deepEqual(errors, [
+		"verification_auto_defer_threshold must be a number between 0 and 1",
+	]);
+});
--- a/src/resources/extensions/sf/tests/verification-defer-policy.test.mjs
+++ b/src/resources/extensions/sf/tests/verification-defer-policy.test.mjs
@ -0,0 +1,102 @@
+/**
+ * verification-defer-policy.test.mjs — auto-defer classification contracts.
+ *
+ * Purpose: keep autonomous verification from retrying low-risk style findings
+ * while still blocking real test/build/runtime failures.
+ */
+import assert from "node:assert/strict";
+import { test } from "vitest";
+import {
+	classifyCheck,
+	computeDeferConfidence,
+	decideAutoDefer,
+} from "../verification-defer-policy.js";
+
+/** Wrap a list of checks in the minimal result shape the policy functions read. */
+function result(checks) {
+	const wrapped = { checks };
+	return wrapped;
+}
+
+test("decideAutoDefer_when_all_failures_are_style_only_defers_above_threshold", () => {
+	// A lone failing formatter run is the baseline deferrable finding.
+	const formatCheck = {
+		command: "npx biome format --write src",
+		exitCode: 1,
+		stderr: "formatting changed",
+	};
+	const { defer, confidence, reasons } = decideAutoDefer(result([formatCheck]));
+
+	assert.equal(defer, true);
+	assert.equal(confidence, 0.9);
+	assert.deepEqual(reasons, ["npx biome format --write src: deferrable"]);
+});
+
+test("decideAutoDefer_when_failure_contains_error_blocks_even_for_lint_command", () => {
+	// The command looks like a style fixer, but "error" in stderr must win.
+	const lintCheck = {
+		command: "eslint --fix src",
+		exitCode: 1,
+		stderr: "Parsing error: Unexpected token",
+	};
+	const expectedDecision = { defer: false, confidence: 0, reasons: [] };
+
+	assert.equal(classifyCheck(lintCheck), "blocking");
+	assert.deepEqual(decideAutoDefer(result([lintCheck])), expectedDecision);
+});
+
+test("computeDeferConfidence_when_deferrable_and_unknown_stays_below_default_threshold", () => {
+	// One style failure plus one unclassifiable failure. A factory is used so each
+	// call under test receives its own fresh fixture objects.
+	const mixedChecks = () => [
+		{
+			command: "prettier --check .",
+			exitCode: 1,
+			stderr: "Code style issues found",
+		},
+		{
+			command: "custom verify",
+			exitCode: 2,
+			stderr: "review output manually",
+		},
+	];
+
+	assert.equal(computeDeferConfidence(result(mixedChecks())), 0.7);
+	assert.equal(decideAutoDefer(result(mixedChecks())).defer, false);
+});
+
+test("decideAutoDefer_when_blocking_failure_present_does_not_defer", () => {
+	// A single blocking failure vetoes deferral of the style finding beside it.
+	const styleCheck = { command: "lint", exitCode: 1, stderr: "style warning" };
+	const blockingCheck = {
+		command: "test",
+		exitCode: 1,
+		stderr: "fatal error: assertion failed",
+	};
+	const { defer } = decideAutoDefer(result([styleCheck, blockingCheck]));
+
+	assert.equal(defer, false);
+});
--- a/src/resources/extensions/sf/uok/auto-verification.js
+++ b/src/resources/extensions/sf/uok/auto-verification.js
@ -27,6 +27,7 @@ import { isMilestoneComplete } from "../state.js";
 import { isClosedStatus } from "../status-guards.js";
 import { parseUnitId } from "../unit-id.js";
 import { extractVerdict } from "../verdict-parser.js";
+import { decideAutoDefer } from "../verification-defer-policy.js";
 import { writeVerificationJSON } from "../verification-evidence.js";
 import {
 	captureRuntimeErrors,
@ -748,6 +749,59 @@ export async function runPostUnitVerification(vctx, pauseAuto) {
 		if (postExecBlockingFailure) {
 			result.passed = false;
 		}
+		// ── Auto-defer confidence check ──
+		// Low-risk findings (style, format, deprecation-only) should not block flow.
+		// decideAutoDefer only inspects result.checks, so it cannot see a blocking
+		// post-execution failure (which forced result.passed = false just above).
+		// That failure must veto deferral here; otherwise a style-only failed check
+		// would let the blocking failure slip through as "continue".
+		const deferThreshold =
+			typeof prefs?.verification_auto_defer_threshold === "number"
+				? prefs.verification_auto_defer_threshold
+				: 0.75;
+		const deferDecision = decideAutoDefer(result, deferThreshold);
+		if (deferDecision.defer && !postExecBlockingFailure) {
+			// Do not consume a retry attempt — the finding is intentionally deferred.
+			s.verificationRetryCount.delete(s.currentUnit.id);
+			s.pendingVerificationRetry = null;
+			ctx.ui.notify(
+				`Verification deferred: low-risk findings (${deferDecision.reasons.join("; ")}) confidence=${deferDecision.confidence.toFixed(2)} >= threshold=${deferThreshold}`,
+				"info",
+			);
+			process.stderr.write(
+				`verification-gate: deferred ${deferDecision.reasons.length} low-risk check(s) — confidence=${deferDecision.confidence.toFixed(2)}\n`,
+			);
+			// Write evidence JSON with deferred flag
+			if (mid && sid && tid) {
+				try {
+					const sDir = resolveSlicePath(s.basePath, mid, sid);
+					if (sDir) {
+						const tasksDir = join(sDir, "tasks");
+						// The evidence keeps passed: false — the checks did fail; the
+						// deferred fields record that the failure was deliberately
+						// waved through.
+						const deferredResult = {
+							...result,
+							passed: false,
+							deferred: true,
+							deferredReasons: deferDecision.reasons,
+							deferConfidence: deferDecision.confidence,
+						};
+						writeVerificationJSON(
+							deferredResult,
+							tasksDir,
+							tid,
+							s.currentUnit.id,
+							undefined,
+							undefined,
+							tokenCount,
+							memoryPressureMB,
+							gateOutcomes,
+							recoveryStatus,
+						);
+					}
+				} catch (evidenceErr) {
+					// Evidence is best-effort: a write failure is logged and must not
+					// turn a deferred result into a crash.
+					logWarning(
+						"engine",
+						`verification-evidence write error (defer): ${evidenceErr.message}`,
+					);
+				}
+			}
+			return "continue";
+		}
 		// ── Auto-fix retry logic ──
 		if (result.passed) {
 			s.verificationRetryCount.delete(s.currentUnit.id);
@ -847,6 +901,13 @@ function writeVerificationJSONWithPostExec(
 		...(gateOutcomes !== undefined ? { gateOutcomes } : {}),
 		...(recoveryStatus !== undefined ? { recoveryStatus } : {}),
 		postExecutionChecks,
+		...(result.deferred === true ? { deferred: true } : {}),
+		...(result.deferredReasons
+			? { deferredReasons: result.deferredReasons }
+			: {}),
+		...(result.deferConfidence !== undefined
+			? { deferConfidence: result.deferConfidence }
+			: {}),
 	};
 	if (result.runtimeErrors && result.runtimeErrors.length > 0) {
 		evidence.runtimeErrors = result.runtimeErrors.map((e) => ({
--- a/src/resources/extensions/sf/verification-defer-policy.js
+++ b/src/resources/extensions/sf/verification-defer-policy.js
@ -0,0 +1,115 @@
+/**
+ * verification-defer-policy.js — classify verification failures as deferrable vs blocking.
+ *
+ * Purpose: prevent low-risk findings (style, format, deprecation-only warnings)
+ * from blocking autonomous flow. When at least one failed check is deferrable,
+ * none is blocking, and confidence meets or exceeds the threshold, the caller
+ * continues instead of entering the retry/pause path, without consuming a
+ * retry attempt.
+ *
+ * Consumer: uok/auto-verification.js after the verification gate runs but before
+ * retry/pause decision.
+ */
+
+// Output vocabulary that marks a failure as advisory. The trailing \b means only
+// exact tokens match ("format", not "formatting"); the "deprecat" stem therefore
+// never matches here and is covered by the separate stem check in classifyCheck.
+const DEFERRABLE_WORDS =
+	/\b(style|format|prettier|eslint.*fix|lint.*warning|deprecat|cosmetic|whitespace|trailing.*space|indent|semicolon|quote)\b/i;
+// Stems that mark a failure as blocking. Deliberately unanchored: with \b on
+// both sides, inflected or compound forms such as "failed", "errors",
+// "assertion", "TypeError" and "SyntaxError" were not recognised, which let real
+// failures be classified as deferrable. Over-matching is the safe direction —
+// a false positive only keeps the failure on the normal retry/pause path.
+const BLOCKING_WORDS =
+	/(error|fail|crash|fatal|exception|throw|reject|unhandled|assert|timeout|broken|invalid|syntax)/i;
+// Commands whose failures are candidates for style/format-only classification.
+const STYLE_COMMANDS = /\b(prettier|eslint.*fix|stylelint|biome.*format)\b/i;
+
+/**
+ * Classify a single failed verification check.
+ *
+ * Returns:
+ * - "blocking" when stderr or stdout contains a blocking indicator.
+ * - "deferrable" when the command/stderr suggests style/format/deprecation-only issues.
+ * - "unknown" when the failure is ambiguous.
+ *
+ * Purpose: separate advisory verification noise from failures that should stop
+ * autonomous execution.
+ *
+ * Consumer: decideAutoDefer before runPostUnitVerification spends a retry.
+ *
+ * @param check - failed check; reads .command, .exitCode, .stderr and, when
+ *   present, .stdout (missing text fields are treated as empty strings)
+ * @returns "blocking" | "deferrable" | "unknown"
+ */
+export function classifyCheck(check) {
+	const stderr = check.stderr ?? "";
+	// NOTE(review): assumes the gate stores captured standard output on
+	// check.stdout — confirm against the producer. When the field is absent this
+	// is an empty string and the classification is unchanged.
+	const stdout = check.stdout ?? "";
+	const command = check.command ?? "";
+
+	// Blocking words override everything — if the output contains "error", "fail",
+	// "crash", "fatal", etc., it is never deferrable regardless of command name.
+	// stdout is scanned too because a tool that reports its failures there would
+	// otherwise look clean on stderr and be waved through.
+	if (BLOCKING_WORDS.test(stderr) || BLOCKING_WORDS.test(stdout)) {
+		return "blocking";
+	}
+
+	// Style/format-only commands with exit code 1 and short stderr are deferrable.
+	if (
+		STYLE_COMMANDS.test(command) &&
+		check.exitCode === 1 &&
+		stderr.length < 300
+	) {
+		return "deferrable";
+	}
+
+	// Deferrable words in stderr with no blocking words → deferrable.
+	if (DEFERRABLE_WORDS.test(stderr)) return "deferrable";
+
+	// Deprecation-only warnings (no blocking words already checked above).
+	if (/\bdeprecat/i.test(stderr)) return "deferrable";
+
+	return "unknown";
+}
+
+/**
+ * Score how safe it is to defer the failed checks of a verification result.
+ *
+ * Returns a number from 0 to 1:
+ * - 1.0 when no check failed,
+ * - 0.0 when any failed check is blocking,
+ * - 0.9 when every failed check is deferrable,
+ * - 0.7 when deferrable and unknown failures are mixed,
+ * - 0.5 when every failed check is unknown.
+ *
+ * Purpose: give autonomous verification an explicit confidence score before it
+ * decides whether a failed check is safe to defer.
+ *
+ * Consumer: decideAutoDefer and focused policy tests.
+ */
+export function computeDeferConfidence(result) {
+	// Single pass: count failed checks per classification.
+	const tally = { blocking: 0, deferrable: 0, unknown: 0 };
+	let failedTotal = 0;
+	for (const check of result.checks) {
+		if (check.exitCode === 0) continue;
+		failedTotal += 1;
+		tally[classifyCheck(check)] += 1;
+	}
+
+	if (failedTotal === 0) return 1.0;
+	if (tally.blocking > 0) return 0.0;
+	// No blocking failures from here on; with no deferrable ones either, every
+	// failure is unknown.
+	if (tally.deferrable === 0) return 0.5;
+	return tally.unknown === 0 ? 0.9 : 0.7;
+}
+
+/**
+ * Decide whether a failed verification result should be auto-deferred.
+ *
+ * Deferral requires both a confidence at or above the threshold and at least
+ * one failed check classified as deferrable; each such check contributes a
+ * "<command>: deferrable" entry to the reasons list.
+ *
+ * Purpose: allow low-risk verification failures to continue without consuming
+ * a retry while keeping blocking failures on the normal retry/pause path.
+ *
+ * Consumer: runPostUnitVerification after evidence has been collected.
+ *
+ * @param result - verification gate result (must have .checks)
+ * @param threshold - confidence threshold (default 0.75)
+ * @returns { defer: boolean, confidence: number, reasons: string[] }
+ */
+export function decideAutoDefer(result, threshold = 0.75) {
+	const confidence = computeDeferConfidence(result);
+	const reasons = result.checks
+		.filter((check) => check.exitCode !== 0)
+		.filter((check) => classifyCheck(check) === "deferrable")
+		.map((check) => `${check.command}: deferrable`);
+	const defer = confidence >= threshold && reasons.length > 0;
+
+	return { defer, confidence, reasons };
+}
--- a/src/resources/extensions/sf/verification-evidence.js
+++ b/src/resources/extensions/sf/verification-evidence.js
@ -49,6 +49,13 @@ export function writeVerificationJSON(
 		...(memoryPressureMB !== undefined ? { memoryPressureMB } : {}),
 		...(gateOutcomes !== undefined ? { gateOutcomes } : {}),
 		...(recoveryStatus !== undefined ? { recoveryStatus } : {}),
+		...(result.deferred === true ? { deferred: true } : {}),
+		...(result.deferredReasons
+			? { deferredReasons: result.deferredReasons }
+			: {}),
+		...(result.deferConfidence !== undefined
+			? { deferConfidence: result.deferConfidence }
+			: {}),
 	};
 	if (result.runtimeErrors && result.runtimeErrors.length > 0) {
 		evidence.runtimeErrors = result.runtimeErrors.map((e) => ({