From 639dcde71782e69980e48e581d06bfe216748bfd Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Thu, 14 May 2026 06:01:57 +0200 Subject: [PATCH] =?UTF-8?q?feat(self-feedback):=20outcomes-verification=20?= =?UTF-8?q?AC2=20=E2=80=94=20check=20commit=20touches=20AC-mentioned=20fil?= =?UTF-8?q?es?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses sf-mp4vxusa-pn2tnd. Completes the outcomes-verification chain filed as AC2 of the original sf-mp4rxkwn-jmp039 (AC1 was commit-exists, shipped 4af10ac1b). When an agent-fix resolution cites a commit_sha AND the entry has acceptanceCriteria mentioning specific file paths, verify the cited commit actually modifies at least one of those files. Without this check, an agent could stamp ANY existing commit (e.g. the most recent unrelated commit on main) as the fix evidence — the SHA exists but the commit has nothing to do with the entry. Implementation: extractFilesFromAcceptanceCriteria(acText) Two extraction strategies: 1. Backticked code spans (most reliable): `src/foo.js` 2. Bare path-like tokens (only when slash + dotted extension present, no whitespace, no http:// prefix, no leading digit) Returns [] when AC has no extractable paths — prose-only AC skips the check rather than rejecting (the silent-skip is the right failure mode here; we don't want to fabricate rejections when there's nothing to verify against). getCommitTouchedFiles(commitSha, basePath) Shell to git diff-tree --no-commit-id --name-only -r . 5-second timeout. Returns null on git failure or out-of-repo. Matching strategy: exact-path-set OR basename-set. The basename fallback tolerates the common operator informality where AC says "src/types.ts" but the actual change was at "packages/ai/src/types.ts". Exact match wins; basename match catches the typical case without over-trusting (still requires a file with that exact basename to be touched). Carve-out: skip the check when getCommitTouchedFiles returns null (git unavailable / not-a-repo) — same shape as AC1's "ungrokable" carve-out. The agent-fix-unverified evidence kind remains the explicit escape hatch for "I want agent-fix attribution but can't cite a verifiable commit." Tests (3 new, 19 total): - rejects_agent_fix_when_commit_does_not_touch_AC_files: real git init, commit touches src/unrelated.js, AC mentions src/expected.js → markResolved returns false. Then commit that DOES touch expected → markResolved returns true. - skips_AC_file_check_when_AC_has_no_extractable_paths: prose-only AC accepts any commit. - AC_file_check_tolerates_basename_match: AC says src/types.ts but commit touches packages/ai/src/types.ts — accepted via basename. 1619/1619 SF extension tests pass; typecheck clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/resources/extensions/sf/self-feedback.js | 134 +++++++++++++++ .../sf/tests/self-feedback-db.test.mjs | 160 ++++++++++++++++++ 2 files changed, 294 insertions(+) diff --git a/src/resources/extensions/sf/self-feedback.js b/src/resources/extensions/sf/self-feedback.js index 8570bf499..2570f3adb 100644 --- a/src/resources/extensions/sf/self-feedback.js +++ b/src/resources/extensions/sf/self-feedback.js @@ -573,6 +573,96 @@ function isAcceptedEvidence(evidence) { return typeof kind === "string" && ACCEPTED_EVIDENCE_KINDS.has(kind); } +/** + * Outcomes-verification AC2 (sf-mp4vxusa-pn2tnd): extract file/path + * mentions from a free-form acceptanceCriteria string. We accept two + * shapes that operators tend to use: + * + * 1. backticked code spans: `src/foo.js`, `tests/bar.test.mjs` + * 2. bare path-like tokens: src/foo.js, packages/ai/src/types.ts, + * docs/dev/ADR-019.md (any token containing a slash + dotted + * extension that doesn't start with a numeric digit) + * + * Returns an array of unique normalized paths (no leading ./, lowercased + * comparison is the caller's job — we keep original case here). Returns + * [] when no extractable paths found. Empty AC also returns []. + */ +function extractFilesFromAcceptanceCriteria(acText) { + if (typeof acText !== "string" || !acText.trim()) return []; + const found = new Set(); + // Backticked spans first — most reliable signal + const backtickRe = /`([^`\n]{1,200})`/g; + let m; + while ((m = backtickRe.exec(acText)) !== null) { + const candidate = m[1].trim(); + if (looksLikePath(candidate)) found.add(normalizePath(candidate)); + } + // Then bare path-like tokens. Word-boundary on either side; require at + // least one slash and a dotted extension to avoid sucking in plain + // english words. + const pathRe = + /(?:^|[\s,(\[])([a-zA-Z][a-zA-Z0-9._\-/]*\/[a-zA-Z0-9._\-/]+\.[a-zA-Z0-9]+)(?=[\s,)\]:;.]|$)/g; + while ((m = pathRe.exec(acText)) !== null) { + const candidate = m[1].trim(); + if (looksLikePath(candidate)) found.add(normalizePath(candidate)); + } + return Array.from(found); +} + +function looksLikePath(s) { + if (!s || s.length < 3) return false; + if (/\s/.test(s)) return false; + if (!s.includes("/")) return false; + if (!/\.[a-zA-Z0-9]+$/.test(s)) return false; + // Reject URLs and protocol-looking strings + if (/^https?:\/\//i.test(s)) return false; + // Reject leading digits (version-like strings) + if (/^\d/.test(s)) return false; + return true; +} + +function normalizePath(p) { + return p.replace(/^\.\//, "").replace(/^\/+/, ""); +} + +/** + * Outcomes-verification AC2 (sf-mp4vxusa-pn2tnd): return the list of files + * touched by a given commit. Best-effort — returns null when git is + * unavailable or the commit is missing, so callers downgrade gracefully. + * + * Consumer: markResolved when evidence.kind === "agent-fix" and the + * resolved entry has AC mentioning specific files. + */ +function getCommitTouchedFiles(commitSha, basePath) { + if (typeof commitSha !== "string" || !commitSha.trim()) return null; + let execFileSync; + try { + const cp = require("node:child_process"); + execFileSync = cp.execFileSync; + } catch { + return null; + } + try { + const out = execFileSync( + "git", + ["diff-tree", "--no-commit-id", "--name-only", "-r", commitSha.trim()], + { + cwd: basePath, + encoding: "utf-8", + stdio: ["ignore", "pipe", "pipe"], + timeout: 5000, + }, + ); + return out + .split("\n") + .map((s) => s.trim()) + .filter(Boolean) + .map(normalizePath); + } catch { + return null; + } +} + /** * Outcomes-verification AC1 (sf-mp4rxkwn-jmp039): when an agent-fix * resolution cites a commit_sha, verify the commit actually exists in the @@ -670,6 +760,50 @@ export function markResolved(entryId, resolution, basePath = process.cwd()) { basePath, ); if (status === "missing") return false; + // Outcomes-verification AC2 (sf-mp4vxusa-pn2tnd): if the entry has + // acceptanceCriteria mentioning specific files, the cited commit + // must touch at least one of them. Skip silently when AC has no + // extractable file mentions (operator AC was prose-only — there's + // nothing to verify against and we don't want false rejections). + // Skip when git can't enumerate the commit's files ("ungrokable" or + // out-of-repo) — same carve-out as AC1. + if (status === "verified" && isForgeRepo(basePath) && isDbAvailable()) { + try { + const entry = listSelfFeedbackEntries().find((e) => e.id === entryId); + const acText = + entry?.acceptanceCriteria ?? + entry?.full_json?.acceptanceCriteria ?? + ""; + const expectedFiles = extractFilesFromAcceptanceCriteria(acText); + if (expectedFiles.length > 0) { + const touched = getCommitTouchedFiles( + resolution.evidence.commitSha, + basePath, + ); + if (Array.isArray(touched) && touched.length > 0) { + const touchedSet = new Set(touched); + // Match by exact path OR by basename to tolerate the + // common AC pattern of saying "src/foo.js" when the + // actual change was in "packages/x/src/foo.js" (and vice + // versa). Exact match wins; basename match catches the + // usual operator informality without over-trusting. + const touchedBasenames = new Set( + touched.map((p) => p.split("/").pop()), + ); + const hit = expectedFiles.some((f) => { + if (touchedSet.has(f)) return true; + const basename = f.split("/").pop(); + return basename ? touchedBasenames.has(basename) : false; + }); + if (!hit) return false; + } + } + } catch { + // Verification is best-effort — never block resolution on a + // verification probe throwing. The AC1 commit-exists check + // has already accepted the SHA at this point. + } + } } if (isForgeRepo(basePath) && isDbAvailable()) { try { diff --git a/src/resources/extensions/sf/tests/self-feedback-db.test.mjs b/src/resources/extensions/sf/tests/self-feedback-db.test.mjs index f448735f6..7657f59fb 100644 --- a/src/resources/extensions/sf/tests/self-feedback-db.test.mjs +++ b/src/resources/extensions/sf/tests/self-feedback-db.test.mjs @@ -257,6 +257,166 @@ test("markResolved_rejects_agent_fix_with_nonexistent_commit_sha", async () => { assert.equal(okReal, true); }); +test("markResolved_rejects_agent_fix_when_commit_does_not_touch_AC_files", async () => { + const project = makeForgeProject(); + const { execFileSync } = await import("node:child_process"); + execFileSync("git", ["init", "-q", "-b", "main"], { cwd: project }); + execFileSync("git", ["config", "user.email", "test@example.com"], { + cwd: project, + }); + execFileSync("git", ["config", "user.name", "test"], { cwd: project }); + + // Two commits touching different files + const { writeFileSync: writeFs, mkdirSync: mkFs } = await import( + "node:fs" + ); + mkFs(join(project, "src"), { recursive: true }); + writeFs(join(project, "src", "expected.js"), "// initial\n"); + writeFs(join(project, "src", "unrelated.js"), "// initial\n"); + execFileSync("git", ["add", "-A"], { cwd: project }); + execFileSync("git", ["commit", "-m", "init", "-q"], { cwd: project }); + + // "Real fix" commit only touches src/unrelated.js + writeFs(join(project, "src", "unrelated.js"), "// updated\n"); + execFileSync("git", ["add", "-A"], { cwd: project }); + execFileSync("git", ["commit", "-m", "unrelated change", "-q"], { + cwd: project, + }); + const unrelatedSha = execFileSync("git", ["rev-parse", "HEAD"], { + cwd: project, + encoding: "utf-8", + }).trim(); + + // File an entry whose AC mentions src/expected.js — the unrelated + // commit must be rejected because it doesn't touch that file. + const filed = recordSelfFeedback( + { + kind: "gap:ac-mismatch-test", + severity: "high", + summary: "AC mentions a specific file", + acceptanceCriteria: + "AC1: `src/expected.js` is modified to use the new helper", + }, + project, + ); + + const okWrongFile = markResolved( + filed.entry.id, + { + reason: "claims fix but commit touched the wrong file", + evidence: { kind: "agent-fix", commitSha: unrelatedSha }, + }, + project, + ); + assert.equal(okWrongFile, false); + + // Now make a commit that DOES touch src/expected.js — accepted + writeFs(join(project, "src", "expected.js"), "// actually-fixed\n"); + execFileSync("git", ["add", "-A"], { cwd: project }); + execFileSync("git", ["commit", "-m", "real fix", "-q"], { cwd: project }); + const realSha = execFileSync("git", ["rev-parse", "HEAD"], { + cwd: project, + encoding: "utf-8", + }).trim(); + + const okRealFile = markResolved( + filed.entry.id, + { + reason: "real fix touches the AC-mentioned file", + evidence: { kind: "agent-fix", commitSha: realSha }, + }, + project, + ); + assert.equal(okRealFile, true); +}); + +test("markResolved_skips_AC_file_check_when_AC_has_no_extractable_paths", async () => { + const project = makeForgeProject(); + const { execFileSync } = await import("node:child_process"); + execFileSync("git", ["init", "-q", "-b", "main"], { cwd: project }); + execFileSync("git", ["config", "user.email", "test@example.com"], { + cwd: project, + }); + execFileSync("git", ["config", "user.name", "test"], { cwd: project }); + execFileSync("git", ["commit", "--allow-empty", "-m", "init", "-q"], { + cwd: project, + }); + const sha = execFileSync("git", ["rev-parse", "HEAD"], { + cwd: project, + encoding: "utf-8", + }).trim(); + + // AC is prose-only — no extractable file paths. Commit acceptance + // must succeed without file-match check (we don't fabricate + // rejections when there's nothing to compare against). + const filed = recordSelfFeedback( + { + kind: "gap:prose-only-ac-test", + severity: "low", + summary: "prose AC", + acceptanceCriteria: + "AC1: behavior is intuitive AC2: operators can recover quickly", + }, + project, + ); + + const ok = markResolved( + filed.entry.id, + { + reason: "prose AC accepts any commit", + evidence: { kind: "agent-fix", commitSha: sha }, + }, + project, + ); + assert.equal(ok, true); +}); + +test("markResolved_AC_file_check_tolerates_basename_match", async () => { + const project = makeForgeProject(); + const { execFileSync } = await import("node:child_process"); + const fsMod = await import("node:fs"); + const writeFs = fsMod.writeFileSync; + const mkFs = fsMod.mkdirSync; + execFileSync("git", ["init", "-q", "-b", "main"], { cwd: project }); + execFileSync("git", ["config", "user.email", "test@example.com"], { + cwd: project, + }); + execFileSync("git", ["config", "user.name", "test"], { cwd: project }); + mkFs(join(project, "packages", "ai", "src"), { recursive: true }); + writeFs( + join(project, "packages", "ai", "src", "types.ts"), + "export type X = {};\n", + ); + execFileSync("git", ["add", "-A"], { cwd: project }); + execFileSync("git", ["commit", "-m", "init", "-q"], { cwd: project }); + const sha = execFileSync("git", ["rev-parse", "HEAD"], { + cwd: project, + encoding: "utf-8", + }).trim(); + + // AC mentions "src/types.ts" but the actual commit touched + // "packages/ai/src/types.ts". Basename match should accept. + const filed = recordSelfFeedback( + { + kind: "gap:basename-tolerance-test", + severity: "low", + summary: "basename match", + acceptanceCriteria: "AC1: `src/types.ts` is extended with a new field", + }, + project, + ); + + const ok = markResolved( + filed.entry.id, + { + reason: "basename match across packages", + evidence: { kind: "agent-fix", commitSha: sha }, + }, + project, + ); + assert.equal(ok, true); +}); + test("markResolved_accepts_agent_fix_with_no_commit_sha_or_ungrokable_path", () => { const project = makeForgeProject(); // No git init → ungrokable → carve-out allows the resolution