feat(verification): auto-defer confidence policy for low-risk findings
Implements M003 S05: auto-deferral policy for low-risk validation findings. - New verification-defer-policy.js: classifyCheck, computeDeferConfidence, decideAutoDefer — classifies failed checks as deferrable/blocking/unknown - Patterns: style/format/deprecation-only → deferrable; error/fail/crash/fatal → blocking (always wins) - Confidence scoring: 0.9 all-deferrable, 0.7 mixed, 0.5 unknown, 0.0 blocking - Threshold preference: verification_auto_defer_threshold (default 0.75) - Integration in uok/auto-verification.js: checks defer before retry/pause, does not consume retry attempts, writes deferred: true + reasons to evidence JSON - verification-evidence.js: forwards deferred/deferredReasons/deferConfidence fields - Preferences wired: validation, types, serializer - Tests: 6 unit tests for classification, confidence, threshold, blocking dominance Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
parent
1b3dba6e51
commit
f48a4cc7c5
11 changed files with 337 additions and 0 deletions
|
|
@ -249,6 +249,8 @@ Setting `prefer_skills: []` does **not** disable skill discovery — it just mea
|
|||
|
||||
- `verification_max_retries`: number — maximum number of fix-and-retry cycles for verification failures. Default: `0` (no retries).
|
||||
|
||||
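- `verification_auto_defer_threshold`: number — confidence threshold from `0` to `1` for treating low-risk style/format/deprecation-only verification failures as deferred instead of blocking. Failures are deferred only when the computed defer confidence is greater than or equal to this value, so lower values defer more readily. Default: `0.75`.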
|
||||
|
||||
- `uat_dispatch`: boolean — when `true`, enables UAT (User Acceptance Testing) dispatch mode. Default: `false`.
|
||||
|
||||
- `post_unit_hooks`: array — hooks that fire after a unit completes. Each entry has:
|
||||
|
|
|
|||
|
|
@ -125,6 +125,7 @@ export function serializePreferencesToFrontmatter(prefs) {
|
|||
"verification_commands",
|
||||
"verification_auto_fix",
|
||||
"verification_max_retries",
|
||||
"verification_auto_defer_threshold",
|
||||
"search_provider",
|
||||
"context_selection",
|
||||
];
|
||||
|
|
|
|||
|
|
@ -101,6 +101,7 @@ export const KNOWN_PREFERENCE_KEYS = new Set([
|
|||
"verification_commands",
|
||||
"verification_auto_fix",
|
||||
"verification_max_retries",
|
||||
"verification_auto_defer_threshold",
|
||||
"search_provider",
|
||||
"context_selection",
|
||||
"widget_mode",
|
||||
|
|
|
|||
|
|
@ -1398,6 +1398,21 @@ export function validatePreferences(preferences) {
|
|||
errors.push("verification_max_retries must be a non-negative number");
|
||||
}
|
||||
}
|
||||
if (preferences.verification_auto_defer_threshold !== undefined) {
|
||||
const raw = preferences.verification_auto_defer_threshold;
|
||||
if (
|
||||
typeof raw === "number" &&
|
||||
Number.isFinite(raw) &&
|
||||
raw >= 0 &&
|
||||
raw <= 1
|
||||
) {
|
||||
validated.verification_auto_defer_threshold = raw;
|
||||
} else {
|
||||
errors.push(
|
||||
"verification_auto_defer_threshold must be a number between 0 and 1",
|
||||
);
|
||||
}
|
||||
}
|
||||
// ─── Git Preferences ───────────────────────────────────────────────────
|
||||
if (preferences.git && typeof preferences.git === "object") {
|
||||
const git = {};
|
||||
|
|
|
|||
|
|
@ -418,6 +418,9 @@ function mergePreferences(base, override) {
|
|||
override.verification_auto_fix ?? base.verification_auto_fix,
|
||||
verification_max_retries:
|
||||
override.verification_max_retries ?? base.verification_max_retries,
|
||||
verification_auto_defer_threshold:
|
||||
override.verification_auto_defer_threshold ??
|
||||
base.verification_auto_defer_threshold,
|
||||
enhanced_verification:
|
||||
override.enhanced_verification ?? base.enhanced_verification,
|
||||
enhanced_verification_pre:
|
||||
|
|
|
|||
|
|
@ -97,6 +97,7 @@ parallel:
|
|||
verification_commands: []
|
||||
verification_auto_fix:
|
||||
verification_max_retries:
|
||||
verification_auto_defer_threshold:
|
||||
notifications:
|
||||
enabled:
|
||||
on_complete:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,29 @@
|
|||
/**
|
||||
* preferences-verification-auto-defer.test.mjs — threshold preference contract.
|
||||
*
|
||||
* Purpose: ensure the auto-defer verification threshold survives preference
|
||||
* validation instead of being silently dropped before autonomous verification.
|
||||
*/
|
||||
import assert from "node:assert/strict";
|
||||
import { test } from "vitest";
|
||||
import { validatePreferences } from "../preferences-validation.js";
|
||||
|
||||
// Contract: an in-range threshold passes validation without errors and is
// carried through to the validated preferences unchanged.
test("validatePreferences_when_auto_defer_threshold_is_between_zero_and_one_keeps_value", () => {
  const threshold = 0.65;
  const outcome = validatePreferences({
    verification_auto_defer_threshold: threshold,
  });

  assert.deepEqual(outcome.errors, []);
  assert.equal(
    outcome.preferences.verification_auto_defer_threshold,
    threshold,
  );
});
|
||||
|
||||
// Contract: a threshold above 1 is rejected — the value is not copied into the
// validated preferences and exactly one range error is reported.
test("validatePreferences_when_auto_defer_threshold_is_out_of_range_reports_error", () => {
  const outOfRange = { verification_auto_defer_threshold: 1.5 };
  const expectedErrors = [
    "verification_auto_defer_threshold must be a number between 0 and 1",
  ];

  const outcome = validatePreferences(outOfRange);

  assert.equal(
    outcome.preferences.verification_auto_defer_threshold,
    undefined,
  );
  assert.deepEqual(outcome.errors, expectedErrors);
});
|
||||
|
|
@ -0,0 +1,102 @@
|
|||
/**
|
||||
* verification-defer-policy.test.mjs — auto-defer classification contracts.
|
||||
*
|
||||
* Purpose: keep autonomous verification from retrying low-risk style findings
|
||||
* while still blocking real test/build/runtime failures.
|
||||
*/
|
||||
import assert from "node:assert/strict";
|
||||
import { test } from "vitest";
|
||||
import {
|
||||
classifyCheck,
|
||||
computeDeferConfidence,
|
||||
decideAutoDefer,
|
||||
} from "../verification-defer-policy.js";
|
||||
|
||||
/**
 * Wrap a list of checks in the minimal object shape the policy functions read
 * (an object exposing a `checks` property).
 */
function result(checks) {
  const wrapped = { checks: checks };
  return wrapped;
}
|
||||
|
||||
// Contract: a single failed formatter run with benign stderr is deferred at the
// default threshold, with confidence 0.9 and one "<command>: deferrable" reason.
test("decideAutoDefer_when_all_failures_are_style_only_defers_above_threshold", () => {
  const formatFailure = {
    command: "npx biome format --write src",
    exitCode: 1,
    stderr: "formatting changed",
  };

  const { defer, confidence, reasons } = decideAutoDefer(
    result([formatFailure]),
  );

  assert.equal(defer, true);
  assert.equal(confidence, 0.9);
  assert.deepEqual(reasons, ["npx biome format --write src: deferrable"]);
});
|
||||
|
||||
// Contract: the word "error" in stderr makes a check blocking even when the
// command is a style fixer, and the decision carries zero confidence and no reasons.
test("decideAutoDefer_when_failure_contains_error_blocks_even_for_lint_command", () => {
  const parseFailure = {
    command: "eslint --fix src",
    exitCode: 1,
    stderr: "Parsing error: Unexpected token",
  };
  const expectedDecision = { defer: false, confidence: 0, reasons: [] };

  const classification = classifyCheck(parseFailure);
  assert.equal(classification, "blocking");

  const decision = decideAutoDefer(result([parseFailure]));
  assert.deepEqual(decision, expectedDecision);
});
|
||||
|
||||
// Contract: one deferrable failure plus one unclassifiable failure scores 0.7,
// which is below the default threshold, so nothing is deferred.
test("computeDeferConfidence_when_deferrable_and_unknown_stays_below_default_threshold", () => {
  // Build a fresh fixture per call so each assertion gets its own objects.
  const buildMixedResult = () =>
    result([
      {
        command: "prettier --check .",
        exitCode: 1,
        stderr: "Code style issues found",
      },
      {
        command: "custom verify",
        exitCode: 2,
        stderr: "review output manually",
      },
    ]);

  const confidence = computeDeferConfidence(buildMixedResult());
  assert.equal(confidence, 0.7);

  const decision = decideAutoDefer(buildMixedResult());
  assert.equal(decision.defer, false);
});
|
||||
|
||||
// Contract: a blocking failure alongside a style-only failure prevents deferral.
test("decideAutoDefer_when_blocking_failure_present_does_not_defer", () => {
  const styleOnly = { command: "lint", exitCode: 1, stderr: "style warning" };
  const hardFailure = {
    command: "test",
    exitCode: 1,
    stderr: "fatal error: assertion failed",
  };

  const { defer } = decideAutoDefer(result([styleOnly, hardFailure]));

  assert.equal(defer, false);
});
|
||||
|
|
@ -27,6 +27,7 @@ import { isMilestoneComplete } from "../state.js";
|
|||
import { isClosedStatus } from "../status-guards.js";
|
||||
import { parseUnitId } from "../unit-id.js";
|
||||
import { extractVerdict } from "../verdict-parser.js";
|
||||
import { decideAutoDefer } from "../verification-defer-policy.js";
|
||||
import { writeVerificationJSON } from "../verification-evidence.js";
|
||||
import {
|
||||
captureRuntimeErrors,
|
||||
|
|
@ -748,6 +749,59 @@ export async function runPostUnitVerification(vctx, pauseAuto) {
|
|||
if (postExecBlockingFailure) {
|
||||
result.passed = false;
|
||||
}
|
||||
// ── Auto-defer confidence check ──
|
||||
// Low-risk findings (style, format, deprecation-only) should not block flow.
|
||||
const deferThreshold =
|
||||
typeof prefs?.verification_auto_defer_threshold === "number"
|
||||
? prefs.verification_auto_defer_threshold
|
||||
: 0.75;
|
||||
const deferDecision = decideAutoDefer(result, deferThreshold);
|
||||
if (deferDecision.defer) {
|
||||
// Do not consume a retry attempt — the finding is intentionally deferred.
|
||||
s.verificationRetryCount.delete(s.currentUnit.id);
|
||||
s.pendingVerificationRetry = null;
|
||||
ctx.ui.notify(
|
||||
`Verification deferred: low-risk findings (${deferDecision.reasons.join("; ")}) confidence=${deferDecision.confidence.toFixed(2)} >= threshold=${deferThreshold}`,
|
||||
"info",
|
||||
);
|
||||
process.stderr.write(
|
||||
`verification-gate: deferred ${deferDecision.reasons.length} low-risk check(s) — confidence=${deferDecision.confidence.toFixed(2)}\n`,
|
||||
);
|
||||
// Write evidence JSON with deferred flag
|
||||
if (mid && sid && tid) {
|
||||
try {
|
||||
const sDir = resolveSlicePath(s.basePath, mid, sid);
|
||||
if (sDir) {
|
||||
const tasksDir = join(sDir, "tasks");
|
||||
const deferredResult = {
|
||||
...result,
|
||||
passed: false,
|
||||
deferred: true,
|
||||
deferredReasons: deferDecision.reasons,
|
||||
deferConfidence: deferDecision.confidence,
|
||||
};
|
||||
writeVerificationJSON(
|
||||
deferredResult,
|
||||
tasksDir,
|
||||
tid,
|
||||
s.currentUnit.id,
|
||||
undefined,
|
||||
undefined,
|
||||
tokenCount,
|
||||
memoryPressureMB,
|
||||
gateOutcomes,
|
||||
recoveryStatus,
|
||||
);
|
||||
}
|
||||
} catch (evidenceErr) {
|
||||
logWarning(
|
||||
"engine",
|
||||
`verification-evidence write error (defer): ${evidenceErr.message}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
return "continue";
|
||||
}
|
||||
// ── Auto-fix retry logic ──
|
||||
if (result.passed) {
|
||||
s.verificationRetryCount.delete(s.currentUnit.id);
|
||||
|
|
@ -847,6 +901,13 @@ function writeVerificationJSONWithPostExec(
|
|||
...(gateOutcomes !== undefined ? { gateOutcomes } : {}),
|
||||
...(recoveryStatus !== undefined ? { recoveryStatus } : {}),
|
||||
postExecutionChecks,
|
||||
...(result.deferred === true ? { deferred: true } : {}),
|
||||
...(result.deferredReasons
|
||||
? { deferredReasons: result.deferredReasons }
|
||||
: {}),
|
||||
...(result.deferConfidence !== undefined
|
||||
? { deferConfidence: result.deferConfidence }
|
||||
: {}),
|
||||
};
|
||||
if (result.runtimeErrors && result.runtimeErrors.length > 0) {
|
||||
evidence.runtimeErrors = result.runtimeErrors.map((e) => ({
|
||||
|
|
|
|||
115
src/resources/extensions/sf/verification-defer-policy.js
Normal file
115
src/resources/extensions/sf/verification-defer-policy.js
Normal file
|
|
@ -0,0 +1,115 @@
|
|||
/**
|
||||
* verification-defer-policy.js — classify verification failures as deferrable vs blocking.
|
||||
*
|
||||
* Purpose: prevent low-risk findings (style, format, deprecation-only warnings)
|
||||
* from blocking autonomous flow. When every failed check is deferrable and
|
||||
* confidence meets or exceeds the threshold, the verification gate returns "continue"
|
||||
* instead of "retry"/"pause", allowing the loop to continue without consuming
|
||||
* a retry attempt.
|
||||
*
|
||||
* Consumer: uok/auto-verification.js after the verification gate runs but before
|
||||
* retry/pause decision.
|
||||
*/
|
||||
|
||||
// Tokens in stderr that mark a finding as cosmetic. The group is anchored with
// \b on both sides, so only the exact tokens match ("format", not "formatting").
const DEFERRABLE_WORDS =
  /\b(style|format|prettier|eslint.*fix|lint.*warning|deprecat|cosmetic|whitespace|trailing.*space|indent|semicolon|quote)\b/i;

// Stems in stderr that mark a finding as a real failure. Deliberately anchored
// only at the START of a word: a trailing \b would restrict the match to the
// bare stem and let "failed", "errors", "crashed", "assertion", "rejected", …
// slip through as non-blocking.
const BLOCKING_WORDS =
  /\b(error|fail|crash|fatal|exception|throw|reject|unhandled|assert|timeout|broken|invalid|syntax)/i;

// Command lines treated as style/format tooling.
const STYLE_COMMANDS = /\b(prettier|eslint.*fix|stylelint|biome.*format)\b/i;

// Any word starting with "deprecat" (deprecated, deprecation, …).
const DEPRECATION_NOTICE = /\bdeprecat/i;

// A style-command failure is only trusted as cosmetic when its stderr is shorter
// than this many characters.
const SHORT_STDERR_LIMIT = 300;

/**
 * Classify a single failed verification check.
 *
 * Only `check.stderr`, `check.command` and `check.exitCode` are inspected;
 * missing `stderr`/`command` are treated as empty strings.
 *
 * Decision order (first match wins):
 *   1. stderr contains a word starting with a blocking stem → "blocking".
 *   2. command is a style/format tool, exit code is exactly 1 and stderr is
 *      shorter than 300 characters → "deferrable".
 *   3. stderr contains a deferrable token or a deprecation notice → "deferrable".
 *   4. otherwise → "unknown".
 *
 * Purpose: separate advisory verification noise from failures that should stop
 * autonomous execution.
 *
 * Consumer: decideAutoDefer before runPostUnitVerification spends a retry.
 *
 * @param {{ command?: string, stderr?: string, exitCode?: number }} check
 * @returns {"blocking" | "deferrable" | "unknown"}
 */
export function classifyCheck(check) {
  const stderr = check.stderr ?? "";
  const command = check.command ?? "";

  // Blocking stems override everything — output mentioning an error, failure,
  // crash, etc. is never deferrable regardless of which command produced it.
  if (BLOCKING_WORDS.test(stderr)) return "blocking";

  const isShortStyleFailure =
    STYLE_COMMANDS.test(command) &&
    check.exitCode === 1 &&
    stderr.length < SHORT_STDERR_LIMIT;
  if (isShortStyleFailure) return "deferrable";

  // Blocking stems were ruled out above, so cosmetic wording or a
  // deprecation-only notice is enough to defer.
  if (DEFERRABLE_WORDS.test(stderr) || DEPRECATION_NOTICE.test(stderr)) {
    return "deferrable";
  }

  return "unknown";
}
|
||||
|
||||
/**
 * Compute auto-defer confidence for a verification result.
 *
 * Scoring, based on how the failed checks (exitCode !== 0) classify:
 *   - no failed checks (or no `checks` list at all) → 1.0
 *   - at least one blocking check                    → 0.0
 *   - only deferrable checks                         → 0.9
 *   - deferrable and unknown checks mixed            → 0.7
 *   - only unknown checks                            → 0.5
 *
 * Purpose: give autonomous verification an explicit confidence score before it
 * decides whether a failed check is safe to defer.
 *
 * Consumer: decideAutoDefer and focused policy tests.
 *
 * @param {{ checks?: Array<object> }} result - verification gate result
 * @returns {number} confidence from 0 to 1
 */
export function computeDeferConfidence(result) {
  // A result without a checks list has nothing to classify; treat it like an
  // empty list instead of throwing inside the verification flow.
  const checks = Array.isArray(result?.checks) ? result.checks : [];
  const failedChecks = checks.filter((c) => c.exitCode !== 0);
  if (failedChecks.length === 0) return 1.0;

  // Only the presence of each classification matters, not how often it occurs.
  const seen = new Set(failedChecks.map((check) => classifyCheck(check)));

  if (seen.has("blocking")) return 0.0;
  if (!seen.has("deferrable")) return 0.5;
  return seen.has("unknown") ? 0.7 : 0.9;
}
|
||||
|
||||
/**
 * Decide whether a failed verification result should be auto-deferred.
 *
 * A result is deferred only when ALL of the following hold:
 *   - no failed check classifies as "blocking" (hard veto, independent of the
 *     threshold — otherwise a threshold of 0 would defer blocking failures,
 *     because their confidence of 0 still satisfies `0 >= 0`);
 *   - at least one failed check classifies as "deferrable";
 *   - the computed confidence is greater than or equal to `threshold`.
 *
 * `reasons` lists every deferrable failed check as "<command>: deferrable",
 * whether or not the result ends up deferred.
 *
 * Purpose: allow low-risk verification failures to continue without consuming
 * a retry while keeping blocking failures on the normal retry/pause path.
 *
 * Consumer: runPostUnitVerification after evidence has been collected.
 *
 * @param result - verification gate result (expected to have .checks)
 * @param threshold - confidence threshold (default 0.75)
 * @returns { defer: boolean, confidence: number, reasons: string[] }
 */
export function decideAutoDefer(result, threshold = 0.75) {
  const confidence = computeDeferConfidence(result);

  const checks = Array.isArray(result?.checks) ? result.checks : [];
  // Classify each failed check exactly once and reuse the verdicts below.
  const verdicts = checks
    .filter((c) => c.exitCode !== 0)
    .map((check) => ({ check, cls: classifyCheck(check) }));

  const hasBlocking = verdicts.some(({ cls }) => cls === "blocking");
  const reasons = verdicts
    .filter(({ cls }) => cls === "deferrable")
    .map(({ check, cls }) => `${check.command}: ${cls}`);

  return {
    defer: !hasBlocking && reasons.length > 0 && confidence >= threshold,
    confidence,
    reasons,
  };
}
|
||||
|
|
@ -49,6 +49,13 @@ export function writeVerificationJSON(
|
|||
...(memoryPressureMB !== undefined ? { memoryPressureMB } : {}),
|
||||
...(gateOutcomes !== undefined ? { gateOutcomes } : {}),
|
||||
...(recoveryStatus !== undefined ? { recoveryStatus } : {}),
|
||||
...(result.deferred === true ? { deferred: true } : {}),
|
||||
...(result.deferredReasons
|
||||
? { deferredReasons: result.deferredReasons }
|
||||
: {}),
|
||||
...(result.deferConfidence !== undefined
|
||||
? { deferConfidence: result.deferConfidence }
|
||||
: {}),
|
||||
};
|
||||
if (result.runtimeErrors && result.runtimeErrors.length > 0) {
|
||||
evidence.runtimeErrors = result.runtimeErrors.map((e) => ({
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue