From f2283c9a30f4da65052c332758b78427da6ffe2e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 25 Mar 2026 22:34:30 +0000 Subject: [PATCH] fix: verdict gate accepts PARTIAL for mixed/human-experience/live-runtime UATs The verdict gate in auto-dispatch.ts now reads the UAT file to determine the UAT type. For mixed, human-experience, and live-runtime modes, PARTIAL is accepted as a valid verdict (all automatable checks passed, human-only checks documented as NEEDS-HUMAN). The run-uat prompt is updated so that PASS is the correct verdict when all automatable checks succeed, even if human-only checks remain. PARTIAL is reserved for when automatable checks themselves are inconclusive. Fixes gsd-build/gsd-2#1400 Co-authored-by: glittercowboy <186001655+glittercowboy@users.noreply.github.com> Agent-Logs-Url: https://github.com/gsd-build/gsd-2/sessions/5a619137-0710-4934-949f-bae63945bf70 --- src/resources/extensions/gsd/auto-dispatch.ts | 19 +++++- .../extensions/gsd/prompts/run-uat.md | 8 +-- .../extensions/gsd/tests/run-uat.test.ts | 68 +++++++++++++++++++ 3 files changed, 90 insertions(+), 5 deletions(-) diff --git a/src/resources/extensions/gsd/auto-dispatch.ts b/src/resources/extensions/gsd/auto-dispatch.ts index f71fd71ad..a84739d70 100644 --- a/src/resources/extensions/gsd/auto-dispatch.ts +++ b/src/resources/extensions/gsd/auto-dispatch.ts @@ -190,7 +190,24 @@ export const DISPATCH_RULES: DispatchRule[] = [ if (!content) continue; const verdictMatch = content.match(/verdict:\s*([\w-]+)/i); const verdict = verdictMatch?.[1]?.toLowerCase(); - if (verdict && verdict !== "pass" && verdict !== "passed") { + + // Determine acceptable verdicts based on UAT type. + // mixed / human-experience / live-runtime modes may legitimately + // produce PARTIAL when all automatable checks pass but human-only + // checks remain — this should not block progression. 
+ const acceptableVerdicts: string[] = ["pass", "passed"]; + const uatFile = resolveSliceFile(basePath, mid, sliceId, "UAT"); + if (uatFile) { + const uatContent = await loadFile(uatFile); + if (uatContent) { + const uatType = extractUatType(uatContent); + if (uatType === "mixed" || uatType === "human-experience" || uatType === "live-runtime") { + acceptableVerdicts.push("partial"); + } + } + } + + if (verdict && !acceptableVerdicts.includes(verdict)) { return { action: "stop" as const, reason: `UAT verdict for ${sliceId} is "${verdict}" — blocking progression until resolved.\nReview the UAT result and update the verdict to PASS, or re-run /gsd auto after fixing.`, diff --git a/src/resources/extensions/gsd/prompts/run-uat.md b/src/resources/extensions/gsd/prompts/run-uat.md index 13c3e2ea0..207a9592c 100644 --- a/src/resources/extensions/gsd/prompts/run-uat.md +++ b/src/resources/extensions/gsd/prompts/run-uat.md @@ -29,7 +29,7 @@ You are the UAT runner. Execute every check defined in `{{uatPath}}` as deeply a - `runtime-executable` — execute the specified command or script. Capture stdout/stderr as evidence. Record pass/fail based on exit code and output. - `live-runtime` — exercise the real runtime path. Start or connect to the app/service if needed, use browser/runtime/network checks, and verify observable behavior. - `mixed` — run all automatable artifact-driven and live-runtime checks. Separate any remaining human-only checks explicitly. -- `human-experience` — automate setup, preconditions, screenshots, logs, and objective checks, but do **not** invent subjective PASS results. Mark taste-based, experiential, or purely human-judgment checks as `NEEDS-HUMAN` and use an overall verdict of `PARTIAL` unless every required check was objective and passed. +- `human-experience` — automate setup, preconditions, screenshots, logs, and objective checks, but do **not** invent subjective PASS results. 
Mark taste-based, experiential, or purely human-judgment checks as `NEEDS-HUMAN`. Use an overall verdict of `PASS` when all automatable checks succeed (even if human-only checks remain as `NEEDS-HUMAN`). Use `PARTIAL` only when automatable checks themselves were inconclusive. ### Evidence tools @@ -51,9 +51,9 @@ For each check, record: - `PASS`, `FAIL`, or `NEEDS-HUMAN` After running all checks, compute the **overall verdict**: -- `PASS` — all required checks passed and no human-only checks remain -- `FAIL` — one or more checks failed -- `PARTIAL` — some checks passed, but one or more checks were skipped, inconclusive, or still require human judgment +- `PASS` — all automatable checks passed. Any remaining checks that honestly require human judgment are marked `NEEDS-HUMAN` with clear instructions for the human reviewer. (This is the correct verdict for mixed/human-experience/live-runtime modes when all automatable checks succeed.) +- `FAIL` — one or more automatable checks failed +- `PARTIAL` — one or more automatable checks were skipped or returned inconclusive results (not the same as `NEEDS-HUMAN` — use PARTIAL only when the agent itself could not determine pass/fail for a check it was supposed to automate) Call `gsd_summary_save` with `milestone_id: {{milestoneId}}`, `slice_id: {{sliceId}}`, `artifact_type: "ASSESSMENT"`, and the full UAT result markdown as `content` — the tool computes the file path and persists to both DB and disk. 
The content should follow this format: diff --git a/src/resources/extensions/gsd/tests/run-uat.test.ts b/src/resources/extensions/gsd/tests/run-uat.test.ts index 8956c1342..fd1ecfdb2 100644 --- a/src/resources/extensions/gsd/tests/run-uat.test.ts +++ b/src/resources/extensions/gsd/tests/run-uat.test.ts @@ -343,6 +343,74 @@ test('(m) non-artifact UAT skip', async () => { } }); +test('(o) verdict gate: PARTIAL is acceptable for mixed/human-experience/live-runtime UAT types', () => { + // This test verifies the contract that extractUatType correctly identifies + // the modes where PARTIAL should not block progression. + // The verdict gate in auto-dispatch.ts uses this to build acceptableVerdicts. + const mixedType = extractUatType(makeUatContent('mixed')); + const humanExpType = extractUatType(makeUatContent('human-experience')); + const liveRuntimeType = extractUatType(makeUatContent('live-runtime')); + const artifactType = extractUatType(makeUatContent('artifact-driven')); + const browserType = extractUatType(makeUatContent('browser-executable')); + const runtimeExecType = extractUatType(makeUatContent('runtime-executable')); + + // These modes should allow PARTIAL (non-fully-automatable) + const partialAcceptableModes = ['mixed', 'human-experience', 'live-runtime']; + assert.ok( + partialAcceptableModes.includes(mixedType!), + `mixed → "${mixedType}" is in partialAcceptableModes`, + ); + assert.ok( + partialAcceptableModes.includes(humanExpType!), + `human-experience → "${humanExpType}" is in partialAcceptableModes`, + ); + assert.ok( + partialAcceptableModes.includes(liveRuntimeType!), + `live-runtime → "${liveRuntimeType}" is in partialAcceptableModes`, + ); + + // These modes should NOT allow PARTIAL (fully automatable) + assert.ok( + !partialAcceptableModes.includes(artifactType!), + `artifact-driven → "${artifactType}" is NOT in partialAcceptableModes`, + ); + assert.ok( + !partialAcceptableModes.includes(browserType!), + `browser-executable → 
"${browserType}" is NOT in partialAcceptableModes`,
+  );
+  assert.ok(
+    !partialAcceptableModes.includes(runtimeExecType!),
+    `runtime-executable → "${runtimeExecType}" is NOT in partialAcceptableModes`,
+  );
+});
+
+test('(p) run-uat prompt allows PASS when human-only checks remain as NEEDS-HUMAN', () => {
+  const promptResult = loadPromptFromWorktree('run-uat', {
+    workingDirectory: '/tmp/test-project',
+    milestoneId: 'M001',
+    sliceId: 'S01',
+    uatPath: '.gsd/milestones/M001/slices/S01/S01-UAT.md',
+    uatResultPath: '.gsd/milestones/M001/slices/S01/S01-UAT-RESULT.md',
+    uatType: 'mixed',
+    inlinedContext: '',
+  });
+
+  // PASS verdict should be usable when automatable checks pass (even with NEEDS-HUMAN remaining)
+  assert.ok(
+    /PASS.*automatable checks passed/i.test(promptResult),
+    'prompt defines PASS as valid when all automatable checks passed',
+  );
+  // NOTE: the alternation must be grouped — an ungrouped `…skipped|inconclusive`
+  // would match the bare word "inconclusive" anywhere, making this assertion vacuous.
+  assert.ok(
+    /PARTIAL.*automatable checks.*(?:skipped|inconclusive)/i.test(promptResult),
+    'prompt reserves PARTIAL for when automatable checks themselves are inconclusive',
+  );
+  // human-experience mode should NOT force PARTIAL when automatable checks pass
+  assert.ok(
+    !promptResult.includes('use an overall verdict of `PARTIAL`'),
+    'prompt does not force PARTIAL verdict for human-experience mode',
+  );
+});
+
 test('(n) stale replay guard', async () => {
   const base = createFixtureBase();
   try {