fix: verdict gate accepts PARTIAL for mixed/human-experience/live-runtime UATs
The verdict gate in auto-dispatch.ts now reads the UAT file to determine the UAT type. For mixed, human-experience, and live-runtime modes, PARTIAL is accepted as a valid verdict (all automatable checks passed, human-only checks documented as NEEDS-HUMAN).

The run-uat prompt is updated so that PASS is the correct verdict when all automatable checks succeed, even if human-only checks remain. PARTIAL is reserved for cases where the automatable checks themselves are inconclusive.

Fixes gsd-build/gsd-2#1400

Co-authored-by: glittercowboy <186001655+glittercowboy@users.noreply.github.com>
Agent-Logs-Url: https://github.com/gsd-build/gsd-2/sessions/5a619137-0710-4934-949f-bae63945bf70
This commit is contained in:
parent
abb8fe69dc
commit
f2283c9a30
3 changed files with 90 additions and 5 deletions
|
|
@ -190,7 +190,24 @@ export const DISPATCH_RULES: DispatchRule[] = [
|
|||
if (!content) continue;
|
||||
const verdictMatch = content.match(/verdict:\s*([\w-]+)/i);
|
||||
const verdict = verdictMatch?.[1]?.toLowerCase();
|
||||
if (verdict && verdict !== "pass" && verdict !== "passed") {
|
||||
|
||||
// Determine acceptable verdicts based on UAT type.
|
||||
// mixed / human-experience / live-runtime modes may legitimately
|
||||
// produce PARTIAL when all automatable checks pass but human-only
|
||||
// checks remain — this should not block progression.
|
||||
const acceptableVerdicts: string[] = ["pass", "passed"];
|
||||
const uatFile = resolveSliceFile(basePath, mid, sliceId, "UAT");
|
||||
if (uatFile) {
|
||||
const uatContent = await loadFile(uatFile);
|
||||
if (uatContent) {
|
||||
const uatType = extractUatType(uatContent);
|
||||
if (uatType === "mixed" || uatType === "human-experience" || uatType === "live-runtime") {
|
||||
acceptableVerdicts.push("partial");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (verdict && !acceptableVerdicts.includes(verdict)) {
|
||||
return {
|
||||
action: "stop" as const,
|
||||
reason: `UAT verdict for ${sliceId} is "${verdict}" — blocking progression until resolved.\nReview the UAT result and update the verdict to PASS, or re-run /gsd auto after fixing.`,
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ You are the UAT runner. Execute every check defined in `{{uatPath}}` as deeply a
|
|||
- `runtime-executable` — execute the specified command or script. Capture stdout/stderr as evidence. Record pass/fail based on exit code and output.
|
||||
- `live-runtime` — exercise the real runtime path. Start or connect to the app/service if needed, use browser/runtime/network checks, and verify observable behavior.
|
||||
- `mixed` — run all automatable artifact-driven and live-runtime checks. Separate any remaining human-only checks explicitly.
|
||||
- `human-experience` — automate setup, preconditions, screenshots, logs, and objective checks, but do **not** invent subjective PASS results. Mark taste-based, experiential, or purely human-judgment checks as `NEEDS-HUMAN` and use an overall verdict of `PARTIAL` unless every required check was objective and passed.
|
||||
- `human-experience` — automate setup, preconditions, screenshots, logs, and objective checks, but do **not** invent subjective PASS results. Mark taste-based, experiential, or purely human-judgment checks as `NEEDS-HUMAN`. Use an overall verdict of `PASS` when all automatable checks succeed (even if human-only checks remain as `NEEDS-HUMAN`). Use `PARTIAL` only when automatable checks themselves were inconclusive.
|
||||
|
||||
### Evidence tools
|
||||
|
||||
|
|
@ -51,9 +51,9 @@ For each check, record:
|
|||
- `PASS`, `FAIL`, or `NEEDS-HUMAN`
|
||||
|
||||
After running all checks, compute the **overall verdict**:
|
||||
- `PASS` — all required checks passed and no human-only checks remain
|
||||
- `FAIL` — one or more checks failed
|
||||
- `PARTIAL` — some checks passed, but one or more checks were skipped, inconclusive, or still require human judgment
|
||||
- `PASS` — all automatable checks passed. Any remaining checks that honestly require human judgment are marked `NEEDS-HUMAN` with clear instructions for the human reviewer. (This is the correct verdict for mixed/human-experience/live-runtime modes when all automatable checks succeed.)
|
||||
- `FAIL` — one or more automatable checks failed
|
||||
- `PARTIAL` — one or more automatable checks were skipped or returned inconclusive results (not the same as `NEEDS-HUMAN` — use PARTIAL only when the agent itself could not determine pass/fail for a check it was supposed to automate)
|
||||
|
||||
Call `gsd_summary_save` with `milestone_id: {{milestoneId}}`, `slice_id: {{sliceId}}`, `artifact_type: "ASSESSMENT"`, and the full UAT result markdown as `content` — the tool computes the file path and persists to both DB and disk. The content should follow this format:
|
||||
|
||||
|
|
|
|||
|
|
@ -343,6 +343,74 @@ test('(m) non-artifact UAT skip', async () => {
|
|||
}
|
||||
});
|
||||
|
||||
test('(o) verdict gate: PARTIAL is acceptable for mixed/human-experience/live-runtime UAT types', () => {
|
||||
// This test verifies the contract that extractUatType correctly identifies
|
||||
// the modes where PARTIAL should not block progression.
|
||||
// The verdict gate in auto-dispatch.ts uses this to build acceptableVerdicts.
|
||||
const mixedType = extractUatType(makeUatContent('mixed'));
|
||||
const humanExpType = extractUatType(makeUatContent('human-experience'));
|
||||
const liveRuntimeType = extractUatType(makeUatContent('live-runtime'));
|
||||
const artifactType = extractUatType(makeUatContent('artifact-driven'));
|
||||
const browserType = extractUatType(makeUatContent('browser-executable'));
|
||||
const runtimeExecType = extractUatType(makeUatContent('runtime-executable'));
|
||||
|
||||
// These modes should allow PARTIAL (non-fully-automatable)
|
||||
const partialAcceptableModes = ['mixed', 'human-experience', 'live-runtime'];
|
||||
assert.ok(
|
||||
partialAcceptableModes.includes(mixedType!),
|
||||
`mixed → "${mixedType}" is in partialAcceptableModes`,
|
||||
);
|
||||
assert.ok(
|
||||
partialAcceptableModes.includes(humanExpType!),
|
||||
`human-experience → "${humanExpType}" is in partialAcceptableModes`,
|
||||
);
|
||||
assert.ok(
|
||||
partialAcceptableModes.includes(liveRuntimeType!),
|
||||
`live-runtime → "${liveRuntimeType}" is in partialAcceptableModes`,
|
||||
);
|
||||
|
||||
// These modes should NOT allow PARTIAL (fully automatable)
|
||||
assert.ok(
|
||||
!partialAcceptableModes.includes(artifactType!),
|
||||
`artifact-driven → "${artifactType}" is NOT in partialAcceptableModes`,
|
||||
);
|
||||
assert.ok(
|
||||
!partialAcceptableModes.includes(browserType!),
|
||||
`browser-executable → "${browserType}" is NOT in partialAcceptableModes`,
|
||||
);
|
||||
assert.ok(
|
||||
!partialAcceptableModes.includes(runtimeExecType!),
|
||||
`runtime-executable → "${runtimeExecType}" is NOT in partialAcceptableModes`,
|
||||
);
|
||||
});
|
||||
|
||||
test('(p) run-uat prompt allows PASS when human-only checks remain as NEEDS-HUMAN', () => {
|
||||
const promptResult = loadPromptFromWorktree('run-uat', {
|
||||
workingDirectory: '/tmp/test-project',
|
||||
milestoneId: 'M001',
|
||||
sliceId: 'S01',
|
||||
uatPath: '.gsd/milestones/M001/slices/S01/S01-UAT.md',
|
||||
uatResultPath: '.gsd/milestones/M001/slices/S01/S01-UAT-RESULT.md',
|
||||
uatType: 'mixed',
|
||||
inlinedContext: '<!-- no context -->',
|
||||
});
|
||||
|
||||
// PASS verdict should be usable when automatable checks pass (even with NEEDS-HUMAN remaining)
|
||||
assert.ok(
|
||||
/PASS.*automatable checks passed/i.test(promptResult),
|
||||
'prompt defines PASS as valid when all automatable checks passed',
|
||||
);
|
||||
assert.ok(
|
||||
/PARTIAL.*automatable checks.*skipped|inconclusive/i.test(promptResult),
|
||||
'prompt reserves PARTIAL for when automatable checks themselves are inconclusive',
|
||||
);
|
||||
// human-experience mode should NOT force PARTIAL when automatable checks pass
|
||||
assert.ok(
|
||||
!promptResult.includes('use an overall verdict of `PARTIAL`'),
|
||||
'prompt does not force PARTIAL verdict for human-experience mode',
|
||||
);
|
||||
});
|
||||
|
||||
test('(n) stale replay guard', async () => {
|
||||
const base = createFixtureBase();
|
||||
try {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue