fix: verdict gate accepts PARTIAL for mixed/human-experience/live-runtime UATs

The verdict gate in auto-dispatch.ts now reads the UAT file to determine
the UAT type. For mixed, human-experience, and live-runtime modes,
PARTIAL is accepted as a valid verdict (all automatable checks passed,
human-only checks documented as NEEDS-HUMAN).

The run-uat prompt is updated so that PASS is the correct verdict when
all automatable checks succeed, even if human-only checks remain. PARTIAL
is reserved for when automatable checks themselves are inconclusive.

Fixes gsd-build/gsd-2#1400

Co-authored-by: glittercowboy <186001655+glittercowboy@users.noreply.github.com>
Agent-Logs-Url: https://github.com/gsd-build/gsd-2/sessions/5a619137-0710-4934-949f-bae63945bf70
This commit is contained in:
copilot-swe-agent[bot] 2026-03-25 22:34:30 +00:00
parent abb8fe69dc
commit f2283c9a30
3 changed files with 90 additions and 5 deletions

View file

@ -190,7 +190,24 @@ export const DISPATCH_RULES: DispatchRule[] = [
if (!content) continue;
const verdictMatch = content.match(/verdict:\s*([\w-]+)/i);
const verdict = verdictMatch?.[1]?.toLowerCase();
if (verdict && verdict !== "pass" && verdict !== "passed") {
// Determine acceptable verdicts based on UAT type.
// mixed / human-experience / live-runtime modes may legitimately
// produce PARTIAL when all automatable checks pass but human-only
// checks remain — this should not block progression.
const acceptableVerdicts: string[] = ["pass", "passed"];
const uatFile = resolveSliceFile(basePath, mid, sliceId, "UAT");
if (uatFile) {
const uatContent = await loadFile(uatFile);
if (uatContent) {
const uatType = extractUatType(uatContent);
if (uatType === "mixed" || uatType === "human-experience" || uatType === "live-runtime") {
acceptableVerdicts.push("partial");
}
}
}
if (verdict && !acceptableVerdicts.includes(verdict)) {
return {
action: "stop" as const,
reason: `UAT verdict for ${sliceId} is "${verdict}" — blocking progression until resolved.\nReview the UAT result and update the verdict to PASS, or re-run /gsd auto after fixing.`,

View file

@ -29,7 +29,7 @@ You are the UAT runner. Execute every check defined in `{{uatPath}}` as deeply a
- `runtime-executable` — execute the specified command or script. Capture stdout/stderr as evidence. Record pass/fail based on exit code and output.
- `live-runtime` — exercise the real runtime path. Start or connect to the app/service if needed, use browser/runtime/network checks, and verify observable behavior.
- `mixed` — run all automatable artifact-driven and live-runtime checks. Separate any remaining human-only checks explicitly.
- `human-experience` — automate setup, preconditions, screenshots, logs, and objective checks, but do **not** invent subjective PASS results. Mark taste-based, experiential, or purely human-judgment checks as `NEEDS-HUMAN` and use an overall verdict of `PARTIAL` unless every required check was objective and passed.
- `human-experience` — automate setup, preconditions, screenshots, logs, and objective checks, but do **not** invent subjective PASS results. Mark taste-based, experiential, or purely human-judgment checks as `NEEDS-HUMAN`. Use an overall verdict of `PASS` when all automatable checks succeed (even if human-only checks remain as `NEEDS-HUMAN`). Use `PARTIAL` only when automatable checks themselves were inconclusive.
### Evidence tools
@ -51,9 +51,9 @@ For each check, record:
- `PASS`, `FAIL`, or `NEEDS-HUMAN`
After running all checks, compute the **overall verdict**:
- `PASS` — all required checks passed and no human-only checks remain
- `FAIL` — one or more checks failed
- `PARTIAL` — some checks passed, but one or more checks were skipped, inconclusive, or still require human judgment
- `PASS` — all automatable checks passed. Any remaining checks that honestly require human judgment are marked `NEEDS-HUMAN` with clear instructions for the human reviewer. (This is the correct verdict for mixed/human-experience/live-runtime modes when all automatable checks succeed.)
- `FAIL` — one or more automatable checks failed
- `PARTIAL` — one or more automatable checks were skipped or returned inconclusive results (not the same as `NEEDS-HUMAN` — use PARTIAL only when the agent itself could not determine pass/fail for a check it was supposed to automate)
Call `gsd_summary_save` with `milestone_id: {{milestoneId}}`, `slice_id: {{sliceId}}`, `artifact_type: "ASSESSMENT"`, and the full UAT result markdown as `content` — the tool computes the file path and persists to both DB and disk. The content should follow this format:

View file

@ -343,6 +343,74 @@ test('(m) non-artifact UAT skip', async () => {
}
});
test('(o) verdict gate: PARTIAL is acceptable for mixed/human-experience/live-runtime UAT types', () => {
  // Contract check on extractUatType: the type it reports for each UAT mode
  // must land on the correct side of the "PARTIAL may pass the gate" split.
  // The verdict gate in auto-dispatch.ts relies on that split when it builds
  // its acceptableVerdicts list.
  const lenientProbes = ['mixed', 'human-experience', 'live-runtime'] as const;
  const strictProbes = ['artifact-driven', 'browser-executable', 'runtime-executable'] as const;

  // Resolve every probe before asserting anything, lenient modes first.
  const lenientDetected = lenientProbes.map(
    (mode) => [mode, extractUatType(makeUatContent(mode))] as const,
  );
  const strictDetected = strictProbes.map(
    (mode) => [mode, extractUatType(makeUatContent(mode))] as const,
  );

  // Modes that are not fully automatable and therefore may yield PARTIAL.
  const partialAcceptableModes = ['mixed', 'human-experience', 'live-runtime'];

  // Non-fully-automatable modes: detected type must be in the allow-list.
  for (const [mode, detected] of lenientDetected) {
    assert.ok(
      partialAcceptableModes.includes(detected!),
      `${mode} → "${detected}" is in partialAcceptableModes`,
    );
  }

  // Fully automatable modes: detected type must stay out of the allow-list.
  for (const [mode, detected] of strictDetected) {
    assert.ok(
      !partialAcceptableModes.includes(detected!),
      `${mode} → "${detected}" is NOT in partialAcceptableModes`,
    );
  }
});
// (p) Prompt-contract test for the `run-uat` template: verifies the wording
// that tells the UAT runner when to report PASS versus PARTIAL.
test('(p) run-uat prompt allows PASS when human-only checks remain as NEEDS-HUMAN', () => {
  // Load the `run-uat` prompt with representative substitution values for a
  // `mixed` UAT; the result is inspected below as plain text.
  const promptResult = loadPromptFromWorktree('run-uat', {
    workingDirectory: '/tmp/test-project',
    milestoneId: 'M001',
    sliceId: 'S01',
    uatPath: '.gsd/milestones/M001/slices/S01/S01-UAT.md',
    uatResultPath: '.gsd/milestones/M001/slices/S01/S01-UAT-RESULT.md',
    uatType: 'mixed',
    inlinedContext: '<!-- no context -->',
  });

  // PASS verdict should be usable when automatable checks pass (even with NEEDS-HUMAN remaining)
  assert.ok(
    /PASS.*automatable checks passed/i.test(promptResult),
    'prompt defines PASS as valid when all automatable checks passed',
  );

  // The alternation must be grouped: `|` has the lowest precedence in a regex,
  // so an ungrouped `...skipped|inconclusive` would match the bare word
  // "inconclusive" anywhere in the prompt and make this assertion near-vacuous.
  assert.ok(
    /PARTIAL.*automatable checks.*(?:skipped|inconclusive)/i.test(promptResult),
    'prompt reserves PARTIAL for when automatable checks themselves are inconclusive',
  );

  // human-experience mode should NOT force PARTIAL when automatable checks pass
  assert.ok(
    !promptResult.includes('use an overall verdict of `PARTIAL`'),
    'prompt does not force PARTIAL verdict for human-experience mode',
  );
});
test('(n) stale replay guard', async () => {
const base = createFixtureBase();
try {