feat(verification): auto-defer confidence policy for low-risk findings

Implements M003 S05: auto-deferral policy for low-risk validation findings.

- New verification-defer-policy.js: classifyCheck, computeDeferConfidence,
  decideAutoDefer — classifies failed checks as deferrable/blocking/unknown
- Patterns: style/format/deprecation-only → deferrable; error/fail/crash/fatal
  → blocking (always wins)
- Confidence scoring: 0.9 all-deferrable, 0.7 mixed, 0.5 unknown, 0.0 blocking
- Threshold preference: verification_auto_defer_threshold (default 0.75)
- Integration in uok/auto-verification.js: checks defer before retry/pause,
  does not consume retry attempts, writes deferred: true + reasons to evidence JSON
- verification-evidence.js: forwards deferred/deferredReasons/deferConfidence fields
- Preferences wired: validation, types, serializer
- Tests: 6 unit tests for classification, confidence, threshold, blocking dominance

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Mikael Hugo 2026-05-15 18:55:26 +02:00
parent 1b3dba6e51
commit f48a4cc7c5
11 changed files with 337 additions and 0 deletions

View file

@ -249,6 +249,8 @@ Setting `prefer_skills: []` does **not** disable skill discovery — it just mea
- `verification_max_retries`: number — maximum number of fix-and-retry cycles for verification failures. Default: `0` (no retries).
- `verification_auto_defer_threshold`: number — confidence threshold from `0` to `1` for treating low-risk style/format/deprecation-only verification failures as deferred instead of blocking. Default: `0.75`.
- `uat_dispatch`: boolean — when `true`, enables UAT (User Acceptance Testing) dispatch mode. Default: `false`.
- `post_unit_hooks`: array — hooks that fire after a unit completes. Each entry has:

View file

@ -125,6 +125,7 @@ export function serializePreferencesToFrontmatter(prefs) {
"verification_commands",
"verification_auto_fix",
"verification_max_retries",
"verification_auto_defer_threshold",
"search_provider",
"context_selection",
];

View file

@ -101,6 +101,7 @@ export const KNOWN_PREFERENCE_KEYS = new Set([
"verification_commands",
"verification_auto_fix",
"verification_max_retries",
"verification_auto_defer_threshold",
"search_provider",
"context_selection",
"widget_mode",

View file

@ -1398,6 +1398,21 @@ export function validatePreferences(preferences) {
errors.push("verification_max_retries must be a non-negative number");
}
}
if (preferences.verification_auto_defer_threshold !== undefined) {
const raw = preferences.verification_auto_defer_threshold;
if (
typeof raw === "number" &&
Number.isFinite(raw) &&
raw >= 0 &&
raw <= 1
) {
validated.verification_auto_defer_threshold = raw;
} else {
errors.push(
"verification_auto_defer_threshold must be a number between 0 and 1",
);
}
}
// ─── Git Preferences ───────────────────────────────────────────────────
if (preferences.git && typeof preferences.git === "object") {
const git = {};

View file

@ -418,6 +418,9 @@ function mergePreferences(base, override) {
override.verification_auto_fix ?? base.verification_auto_fix,
verification_max_retries:
override.verification_max_retries ?? base.verification_max_retries,
verification_auto_defer_threshold:
override.verification_auto_defer_threshold ??
base.verification_auto_defer_threshold,
enhanced_verification:
override.enhanced_verification ?? base.enhanced_verification,
enhanced_verification_pre:

View file

@ -97,6 +97,7 @@ parallel:
verification_commands: []
verification_auto_fix:
verification_max_retries:
verification_auto_defer_threshold:
notifications:
enabled:
on_complete:

View file

@ -0,0 +1,29 @@
/**
* preferences-verification-auto-defer.test.mjs — threshold preference contract.
*
* Purpose: ensure the auto-defer verification threshold survives preference
* validation instead of being silently dropped before autonomous verification.
*/
import assert from "node:assert/strict";
import { test } from "vitest";
import { validatePreferences } from "../preferences-validation.js";
// An in-range threshold must come back unchanged and without validation errors.
test("validatePreferences_when_auto_defer_threshold_is_between_zero_and_one_keeps_value", () => {
  const { errors, preferences } = validatePreferences({
    verification_auto_defer_threshold: 0.65,
  });
  assert.deepEqual(errors, []);
  assert.equal(preferences.verification_auto_defer_threshold, 0.65);
});
// A threshold above 1 must be dropped from the validated output and reported.
test("validatePreferences_when_auto_defer_threshold_is_out_of_range_reports_error", () => {
  const input = { verification_auto_defer_threshold: 1.5 };
  const { errors, preferences } = validatePreferences(input);
  assert.equal(preferences.verification_auto_defer_threshold, undefined);
  assert.deepEqual(errors, [
    "verification_auto_defer_threshold must be a number between 0 and 1",
  ]);
});

View file

@ -0,0 +1,102 @@
/**
* verification-defer-policy.test.mjs — auto-defer classification contracts.
*
* Purpose: keep autonomous verification from retrying low-risk style findings
* while still blocking real test/build/runtime failures.
*/
import assert from "node:assert/strict";
import { test } from "vitest";
import {
classifyCheck,
computeDeferConfidence,
decideAutoDefer,
} from "../verification-defer-policy.js";
/**
 * Wrap a list of check records in the minimal gate-result shape the policy
 * functions read (an object exposing `checks`).
 */
function result(checks) {
  const gateResult = { checks };
  return gateResult;
}
// A lone formatter failure is deferrable and clears the default threshold.
test("decideAutoDefer_when_all_failures_are_style_only_defers_above_threshold", () => {
  const formatterFailure = {
    command: "npx biome format --write src",
    exitCode: 1,
    stderr: "formatting changed",
  };
  const { defer, confidence, reasons } = decideAutoDefer(
    result([formatterFailure]),
  );
  assert.equal(defer, true);
  assert.equal(confidence, 0.9);
  assert.deepEqual(reasons, ["npx biome format --write src: deferrable"]);
});
// "error" in stderr must win even though the command itself looks like a style fixer.
test("decideAutoDefer_when_failure_contains_error_blocks_even_for_lint_command", () => {
  const check = {
    command: "eslint --fix src",
    exitCode: 1,
    stderr: "Parsing error: Unexpected token",
  };
  assert.equal(classifyCheck(check), "blocking");
  const expected = { defer: false, confidence: 0, reasons: [] };
  assert.deepEqual(decideAutoDefer(result([check])), expected);
});
// One deferrable plus one unclassifiable failure scores 0.7, under the 0.75 default.
test("computeDeferConfidence_when_deferrable_and_unknown_stays_below_default_threshold", () => {
  // Build a fresh fixture per call so each policy function gets its own input.
  const mixedFailures = () =>
    result([
      {
        command: "prettier --check .",
        exitCode: 1,
        stderr: "Code style issues found",
      },
      {
        command: "custom verify",
        exitCode: 2,
        stderr: "review output manually",
      },
    ]);
  const confidence = computeDeferConfidence(mixedFailures());
  assert.equal(confidence, 0.7);
  assert.equal(decideAutoDefer(mixedFailures()).defer, false);
});
// A blocking failure next to a style warning must keep the gate closed.
test("decideAutoDefer_when_blocking_failure_present_does_not_defer", () => {
  const styleWarning = { command: "lint", exitCode: 1, stderr: "style warning" };
  const testCrash = {
    command: "test",
    exitCode: 1,
    stderr: "fatal error: assertion failed",
  };
  const decision = decideAutoDefer(result([styleWarning, testCrash]));
  assert.equal(decision.defer, false);
});

View file

@ -27,6 +27,7 @@ import { isMilestoneComplete } from "../state.js";
import { isClosedStatus } from "../status-guards.js";
import { parseUnitId } from "../unit-id.js";
import { extractVerdict } from "../verdict-parser.js";
import { decideAutoDefer } from "../verification-defer-policy.js";
import { writeVerificationJSON } from "../verification-evidence.js";
import {
captureRuntimeErrors,
@ -748,6 +749,59 @@ export async function runPostUnitVerification(vctx, pauseAuto) {
if (postExecBlockingFailure) {
result.passed = false;
}
// ── Auto-defer confidence check ──
// Low-risk findings (style, format, deprecation-only) should not block flow.
const deferThreshold =
typeof prefs?.verification_auto_defer_threshold === "number"
? prefs.verification_auto_defer_threshold
: 0.75;
const deferDecision = decideAutoDefer(result, deferThreshold);
if (deferDecision.defer) {
// Do not consume a retry attempt — the finding is intentionally deferred.
s.verificationRetryCount.delete(s.currentUnit.id);
s.pendingVerificationRetry = null;
ctx.ui.notify(
`Verification deferred: low-risk findings (${deferDecision.reasons.join("; ")}) confidence=${deferDecision.confidence.toFixed(2)} >= threshold=${deferThreshold}`,
"info",
);
process.stderr.write(
`verification-gate: deferred ${deferDecision.reasons.length} low-risk check(s) — confidence=${deferDecision.confidence.toFixed(2)}\n`,
);
// Write evidence JSON with deferred flag
if (mid && sid && tid) {
try {
const sDir = resolveSlicePath(s.basePath, mid, sid);
if (sDir) {
const tasksDir = join(sDir, "tasks");
const deferredResult = {
...result,
passed: false,
deferred: true,
deferredReasons: deferDecision.reasons,
deferConfidence: deferDecision.confidence,
};
writeVerificationJSON(
deferredResult,
tasksDir,
tid,
s.currentUnit.id,
undefined,
undefined,
tokenCount,
memoryPressureMB,
gateOutcomes,
recoveryStatus,
);
}
} catch (evidenceErr) {
logWarning(
"engine",
`verification-evidence write error (defer): ${evidenceErr.message}`,
);
}
}
return "continue";
}
// ── Auto-fix retry logic ──
if (result.passed) {
s.verificationRetryCount.delete(s.currentUnit.id);
@ -847,6 +901,13 @@ function writeVerificationJSONWithPostExec(
...(gateOutcomes !== undefined ? { gateOutcomes } : {}),
...(recoveryStatus !== undefined ? { recoveryStatus } : {}),
postExecutionChecks,
...(result.deferred === true ? { deferred: true } : {}),
...(result.deferredReasons
? { deferredReasons: result.deferredReasons }
: {}),
...(result.deferConfidence !== undefined
? { deferConfidence: result.deferConfidence }
: {}),
};
if (result.runtimeErrors && result.runtimeErrors.length > 0) {
evidence.runtimeErrors = result.runtimeErrors.map((e) => ({

View file

@ -0,0 +1,115 @@
/**
* verification-defer-policy.js — classify verification failures as deferrable vs blocking.
*
* Purpose: prevent low-risk findings (style, format, deprecation-only warnings)
* from blocking autonomous flow. When decideAutoDefer reports that the failed
* checks are low-risk and its confidence meets or exceeds the threshold, the
* caller records the result as deferred and continues the loop instead of
* taking the retry/pause path, so no retry attempt is consumed.
*
* Consumer: uok/auto-verification.js after the verification gate runs but before
* retry/pause decision.
*/
// Whole-word markers of cosmetic findings in stderr. Deprecation wording is
// matched separately (as a prefix) by DEPRECATION_PREFIX below.
const DEFERRABLE_WORDS =
  /\b(style|format|prettier|eslint.*fix|lint.*warning|cosmetic|whitespace|trailing.*space|indent|semicolon|quote)\b/i;
// Matches "deprecated", "deprecation", "DeprecationWarning", ...
const DEPRECATION_PREFIX = /\bdeprecat/i;
// Markers of real failures. Anchored only at the START of a word so inflected
// forms ("failed", "failure", "errors", "crashed", "assertion", "thrown",
// "rejected") are caught too; a trailing \b would let all of those slip
// through and allow a genuine test failure to be classified as deferrable.
// This is deliberately conservative: summaries such as "0 errors" also block.
const BLOCKING_WORDS =
  /\b(error|fail|crash|fatal|exception|throw|reject|unhandled|assert|timeout|timed out|broken|invalid|syntax)/i;
// Commands whose only job is style/format checking or fixing.
const STYLE_COMMANDS = /\b(prettier|eslint.*fix|stylelint|biome.*format)\b/i;
// A style command is only trusted as "style-only" when its stderr is shorter
// than this many characters.
const MAX_STYLE_STDERR_LENGTH = 300;
/**
 * Classify a single failed verification check.
 *
 * Purpose: separate advisory verification noise from failures that should stop
 * autonomous execution.
 *
 * Consumer: decideAutoDefer before runPostUnitVerification spends a retry.
 *
 * Only `check.stderr`, `check.command` and `check.exitCode` are inspected;
 * a missing stderr or command is treated as an empty string.
 *
 * @param {{ command?: string, stderr?: string, exitCode?: number }} check
 * @returns {"blocking" | "deferrable" | "unknown"}
 *   - "blocking" when stderr contains a blocking indicator (always wins).
 *   - "deferrable" when the command is a style tool that exited 1 with short
 *     stderr, or stderr mentions style/format/deprecation wording.
 *   - "unknown" when nothing matched.
 */
export function classifyCheck(check) {
  const stderr = check.stderr ?? "";
  const command = check.command ?? "";
  // Blocking indicators override everything, regardless of the command name.
  if (BLOCKING_WORDS.test(stderr)) return "blocking";
  // Style/format-only commands that exit 1 with short stderr are deferrable.
  const isQuietStyleFailure =
    STYLE_COMMANDS.test(command) &&
    check.exitCode === 1 &&
    stderr.length < MAX_STYLE_STDERR_LENGTH;
  if (isQuietStyleFailure) return "deferrable";
  // Cosmetic or deprecation-only wording (blocking words were ruled out above).
  if (DEFERRABLE_WORDS.test(stderr) || DEPRECATION_PREFIX.test(stderr)) {
    return "deferrable";
  }
  return "unknown";
}
/**
 * Score how safe it is to auto-defer a verification result.
 *
 * Purpose: give autonomous verification an explicit confidence score before it
 * decides whether a failed check is safe to defer.
 *
 * Consumer: decideAutoDefer and focused policy tests.
 *
 * Only checks whose exitCode is not 0 are scored:
 *   - 1.0 when there are no failed checks,
 *   - 0.0 when any failed check is blocking,
 *   - 0.9 when deferrable failures exist and none are unknown,
 *   - 0.7 when deferrable and unknown failures are mixed,
 *   - 0.5 otherwise (no deferrable failure at all).
 *
 * @param result - verification gate result (must have .checks)
 * @returns {number} confidence between 0 and 1
 */
export function computeDeferConfidence(result) {
  const labels = [];
  for (const failed of result.checks.filter(({ exitCode }) => exitCode !== 0)) {
    labels.push(classifyCheck(failed));
  }
  if (labels.length === 0) return 1.0;
  // A single blocking failure zeroes the score no matter what else failed.
  if (labels.includes("blocking")) return 0.0;
  if (!labels.includes("deferrable")) return 0.5;
  return labels.includes("unknown") ? 0.7 : 0.9;
}
/**
 * Decide whether a failed verification result should be auto-deferred.
 *
 * Purpose: allow low-risk verification failures to continue without consuming
 * a retry while keeping blocking failures on the normal retry/pause path.
 *
 * Consumer: runPostUnitVerification after evidence has been collected.
 *
 * A result is deferred only when all of the following hold:
 *   - no failed check is classified "blocking" (hard veto, independent of the
 *     threshold — otherwise a threshold of 0 would let `0.0 >= 0` defer a
 *     blocking failure that happens to sit next to a style finding),
 *   - at least one failed check is classified "deferrable",
 *   - the confidence score is greater than or equal to `threshold`.
 *
 * @param result - verification gate result (must have .checks)
 * @param threshold - confidence threshold (default 0.75)
 * @returns { defer: boolean, confidence: number, reasons: string[] }
 *   `reasons` lists every deferrable failed check as "<command>: deferrable",
 *   whether or not the overall decision is to defer.
 */
export function decideAutoDefer(result, threshold = 0.75) {
  const confidence = computeDeferConfidence(result);
  const reasons = [];
  let hasBlockingFailure = false;
  for (const check of result.checks) {
    if (check.exitCode === 0) continue;
    const cls = classifyCheck(check);
    if (cls === "blocking") {
      hasBlockingFailure = true;
    } else if (cls === "deferrable") {
      reasons.push(`${check.command}: ${cls}`);
    }
  }
  return {
    defer: !hasBlockingFailure && reasons.length > 0 && confidence >= threshold,
    confidence,
    reasons,
  };
}

View file

@ -49,6 +49,13 @@ export function writeVerificationJSON(
...(memoryPressureMB !== undefined ? { memoryPressureMB } : {}),
...(gateOutcomes !== undefined ? { gateOutcomes } : {}),
...(recoveryStatus !== undefined ? { recoveryStatus } : {}),
...(result.deferred === true ? { deferred: true } : {}),
...(result.deferredReasons
? { deferredReasons: result.deferredReasons }
: {}),
...(result.deferConfidence !== undefined
? { deferConfidence: result.deferConfidence }
: {}),
};
if (result.runtimeErrors && result.runtimeErrors.length > 0) {
evidence.runtimeErrors = result.runtimeErrors.map((e) => ({