From f2283c9a30f4da65052c332758b78427da6ffe2e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 25 Mar 2026 22:34:30 +0000 Subject: [PATCH] fix: verdict gate accepts PARTIAL for mixed/human-experience/live-runtime UATs The verdict gate in auto-dispatch.ts now reads the UAT file to determine the UAT type. For mixed, human-experience, and live-runtime modes, PARTIAL is accepted as a valid verdict (all automatable checks passed, human-only checks documented as NEEDS-HUMAN). The run-uat prompt is updated so that PASS is the correct verdict when all automatable checks succeed, even if human-only checks remain. PARTIAL is reserved for when automatable checks themselves are inconclusive. Fixes gsd-build/gsd-2#1400 Co-authored-by: glittercowboy <186001655+glittercowboy@users.noreply.github.com> Agent-Logs-Url: https://github.com/gsd-build/gsd-2/sessions/5a619137-0710-4934-949f-bae63945bf70 --- src/resources/extensions/gsd/auto-dispatch.ts | 19 +++++- .../extensions/gsd/prompts/run-uat.md | 8 +-- .../extensions/gsd/tests/run-uat.test.ts | 68 +++++++++++++++++++ 3 files changed, 90 insertions(+), 5 deletions(-) diff --git a/src/resources/extensions/gsd/auto-dispatch.ts b/src/resources/extensions/gsd/auto-dispatch.ts index f71fd71ad..a84739d70 100644 --- a/src/resources/extensions/gsd/auto-dispatch.ts +++ b/src/resources/extensions/gsd/auto-dispatch.ts @@ -190,7 +190,24 @@ export const DISPATCH_RULES: DispatchRule[] = [ if (!content) continue; const verdictMatch = content.match(/verdict:\s*([\w-]+)/i); const verdict = verdictMatch?.[1]?.toLowerCase(); - if (verdict && verdict !== "pass" && verdict !== "passed") { + + // Determine acceptable verdicts based on UAT type. + // mixed / human-experience / live-runtime modes may legitimately + // produce PARTIAL when all automatable checks pass but human-only + // checks remain — this should not block progression. 
+ const acceptableVerdicts: string[] = ["pass", "passed"]; + const uatFile = resolveSliceFile(basePath, mid, sliceId, "UAT"); + if (uatFile) { + const uatContent = await loadFile(uatFile); + if (uatContent) { + const uatType = extractUatType(uatContent); + if (uatType === "mixed" || uatType === "human-experience" || uatType === "live-runtime") { + acceptableVerdicts.push("partial"); + } + } + } + + if (verdict && !acceptableVerdicts.includes(verdict)) { return { action: "stop" as const, reason: `UAT verdict for ${sliceId} is "${verdict}" — blocking progression until resolved.\nReview the UAT result and update the verdict to PASS, or re-run /gsd auto after fixing.`, diff --git a/src/resources/extensions/gsd/prompts/run-uat.md b/src/resources/extensions/gsd/prompts/run-uat.md index 13c3e2ea0..207a9592c 100644 --- a/src/resources/extensions/gsd/prompts/run-uat.md +++ b/src/resources/extensions/gsd/prompts/run-uat.md @@ -29,7 +29,7 @@ You are the UAT runner. Execute every check defined in `{{uatPath}}` as deeply a - `runtime-executable` — execute the specified command or script. Capture stdout/stderr as evidence. Record pass/fail based on exit code and output. - `live-runtime` — exercise the real runtime path. Start or connect to the app/service if needed, use browser/runtime/network checks, and verify observable behavior. - `mixed` — run all automatable artifact-driven and live-runtime checks. Separate any remaining human-only checks explicitly. -- `human-experience` — automate setup, preconditions, screenshots, logs, and objective checks, but do **not** invent subjective PASS results. Mark taste-based, experiential, or purely human-judgment checks as `NEEDS-HUMAN` and use an overall verdict of `PARTIAL` unless every required check was objective and passed. +- `human-experience` — automate setup, preconditions, screenshots, logs, and objective checks, but do **not** invent subjective PASS results. 
Mark taste-based, experiential, or purely human-judgment checks as `NEEDS-HUMAN`. Use an overall verdict of `PASS` when all automatable checks succeed (even if human-only checks remain as `NEEDS-HUMAN`). Use `PARTIAL` only when automatable checks themselves were inconclusive. ### Evidence tools @@ -51,9 +51,9 @@ For each check, record: - `PASS`, `FAIL`, or `NEEDS-HUMAN` After running all checks, compute the **overall verdict**: -- `PASS` — all required checks passed and no human-only checks remain -- `FAIL` — one or more checks failed -- `PARTIAL` — some checks passed, but one or more checks were skipped, inconclusive, or still require human judgment +- `PASS` — all automatable checks passed. Any remaining checks that honestly require human judgment are marked `NEEDS-HUMAN` with clear instructions for the human reviewer. (This is the correct verdict for mixed/human-experience/live-runtime modes when all automatable checks succeed.) +- `FAIL` — one or more automatable checks failed +- `PARTIAL` — one or more automatable checks were skipped or returned inconclusive results (not the same as `NEEDS-HUMAN` — use PARTIAL only when the agent itself could not determine pass/fail for a check it was supposed to automate) Call `gsd_summary_save` with `milestone_id: {{milestoneId}}`, `slice_id: {{sliceId}}`, `artifact_type: "ASSESSMENT"`, and the full UAT result markdown as `content` — the tool computes the file path and persists to both DB and disk. 
The content should follow this format: diff --git a/src/resources/extensions/gsd/tests/run-uat.test.ts b/src/resources/extensions/gsd/tests/run-uat.test.ts index 8956c1342..fd1ecfdb2 100644 --- a/src/resources/extensions/gsd/tests/run-uat.test.ts +++ b/src/resources/extensions/gsd/tests/run-uat.test.ts @@ -343,6 +343,74 @@ test('(m) non-artifact UAT skip', async () => { } }); +test('(o) verdict gate: PARTIAL is acceptable for mixed/human-experience/live-runtime UAT types', () => { + // This test verifies the contract that extractUatType correctly identifies + // the modes where PARTIAL should not block progression. + // The verdict gate in auto-dispatch.ts uses this to build acceptableVerdicts. + const mixedType = extractUatType(makeUatContent('mixed')); + const humanExpType = extractUatType(makeUatContent('human-experience')); + const liveRuntimeType = extractUatType(makeUatContent('live-runtime')); + const artifactType = extractUatType(makeUatContent('artifact-driven')); + const browserType = extractUatType(makeUatContent('browser-executable')); + const runtimeExecType = extractUatType(makeUatContent('runtime-executable')); + + // These modes should allow PARTIAL (non-fully-automatable) + const partialAcceptableModes = ['mixed', 'human-experience', 'live-runtime']; + assert.ok( + partialAcceptableModes.includes(mixedType!), + `mixed → "${mixedType}" is in partialAcceptableModes`, + ); + assert.ok( + partialAcceptableModes.includes(humanExpType!), + `human-experience → "${humanExpType}" is in partialAcceptableModes`, + ); + assert.ok( + partialAcceptableModes.includes(liveRuntimeType!), + `live-runtime → "${liveRuntimeType}" is in partialAcceptableModes`, + ); + + // These modes should NOT allow PARTIAL (fully automatable) + assert.ok( + !partialAcceptableModes.includes(artifactType!), + `artifact-driven → "${artifactType}" is NOT in partialAcceptableModes`, + ); + assert.ok( + !partialAcceptableModes.includes(browserType!), + `browser-executable → 
"${browserType}" is NOT in partialAcceptableModes`,
+  );
+  assert.ok(
+    !partialAcceptableModes.includes(runtimeExecType!),
+    `runtime-executable → "${runtimeExecType}" is NOT in partialAcceptableModes`,
+  );
+});
+
+test('(p) run-uat prompt allows PASS when human-only checks remain as NEEDS-HUMAN', () => {
+  const promptResult = loadPromptFromWorktree('run-uat', {
+    workingDirectory: '/tmp/test-project',
+    milestoneId: 'M001',
+    sliceId: 'S01',
+    uatPath: '.gsd/milestones/M001/slices/S01/S01-UAT.md',
+    uatResultPath: '.gsd/milestones/M001/slices/S01/S01-UAT-RESULT.md',
+    uatType: 'mixed',
+    inlinedContext: '',
+  });
+
+  // PASS verdict should be usable when automatable checks pass (even with NEEDS-HUMAN remaining)
+  assert.ok(
+    /PASS.*automatable checks passed/i.test(promptResult),
+    'prompt defines PASS as valid when all automatable checks passed',
+  );
+  // NOTE: the alternation must be grouped — an ungrouped `…skipped|inconclusive`
+  // would match the bare word "inconclusive" anywhere, making this assertion vacuous.
+  assert.ok(
+    /PARTIAL.*automatable checks.*(?:skipped|inconclusive)/i.test(promptResult),
+    'prompt reserves PARTIAL for when automatable checks themselves are inconclusive',
+  );
+  // human-experience mode should NOT force PARTIAL when automatable checks pass
+  assert.ok(
+    !promptResult.includes('use an overall verdict of `PARTIAL`'),
+    'prompt does not force PARTIAL verdict for human-experience mode',
+  );
+});
+
 test('(n) stale replay guard', async () => {
   const base = createFixtureBase();
   try {