From 061985b226c6ee53170bc5028160ad6ed2334b52 Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Mon, 4 May 2026 01:51:33 +0200 Subject: [PATCH] fix(sf): runaway guard treats token count as secondary signal Token count now only triggers a warning when accompanied by a primary signal (high tool calls, long elapsed time, or many changed files). This prevents false positives on units doing real work with large context models, where 25+ tool calls can legitimately burn 1M+ tokens. Also renames 'session tokens' to 'unit tokens' in guard messages to clarify that the metric is delta-from-unit-start, not cumulative. Fixes sf-moqewawp-ijwjjt --- .../sf/tests/verification-engine.test.ts | 301 ++++++++++++++++++ 1 file changed, 301 insertions(+) create mode 100644 src/resources/extensions/sf/tests/verification-engine.test.ts diff --git a/src/resources/extensions/sf/tests/verification-engine.test.ts b/src/resources/extensions/sf/tests/verification-engine.test.ts new file mode 100644 index 000000000..2f64baf54 --- /dev/null +++ b/src/resources/extensions/sf/tests/verification-engine.test.ts @@ -0,0 +1,301 @@ +import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { describe, expect, test } from "vitest"; +import { formatFailureContext } from "../verification-gate.js"; +import { checkCrossTaskSignatures } from "../post-execution-checks.js"; +import { getPriorSliceCompletionBlocker } from "../dispatch-guard.js"; +import { extractPackageReferences } from "../pre-execution-checks.js"; +import { + evaluateRunawayGuard, + resetRunawayGuardState, +} from "../auto-runaway-guard.js"; + +// ─── Bug 1: formatFailureContext double-truncation ───────────────────────── + +describe("formatFailureContext", () => { + test("does not double-truncate a single check's stderr", () => { + const longStderr = "x".repeat(3000); + const result = { + passed: false, + checks: [ + { + command: "npm test", + exitCode: 1, + stdout: "", + stderr: longStderr, + durationMs: 100, + }, + ], + discoverySource: "preference" as const, + timestamp: Date.now(), + }; + const context = formatFailureContext(result); + // Per-check budget = 10000/1 = 10000, capped to MAX_STDERR_PER_CHECK=2000 + expect(context).toContain("x".repeat(2000)); + expect(context).toContain("…[truncated]"); + // Must NOT contain the total-body truncation marker + expect(context).not.toContain("…[remaining failures truncated]"); + }); + + test("fairly distributes budget across multiple failures", () => { + const stderrA = "a".repeat(3000); + const stderrB = "b".repeat(3000); + const result = { + passed: false, + checks: [ + { command: "npm run lint", exitCode: 1, stdout: "", stderr: stderrA, durationMs: 100 }, + { command: "npm run test", exitCode: 1, stdout: "", stderr: stderrB, durationMs: 100 }, + ], + discoverySource: "preference" as const, + timestamp: Date.now(), + }; + const context = formatFailureContext(result); + // Per-check budget = 10000/2 = 5000, capped to 2000 + expect(context).toContain("a".repeat(2000)); + expect(context).toContain("b".repeat(2000)); + expect(context).not.toContain("…[remaining failures truncated]"); + }); +}); + +// ─── Bug 2: checkCrossTaskSignatures partial-signature-drift ─────────────── + +describe("checkCrossTaskSignatures", () => { + test("detects drift against ALL prior definitions, not just the most recent", () => { + const base = mkdtempSync(join(tmpdir(), "sf-sig-drift-")); + try { + // Create prior task files with different signatures for same function + mkdirSync(join(base, "first"), { recursive: true }); + mkdirSync(join(base, "second"), { recursive: true }); + mkdirSync(join(base, "current"), { recursive: true }); + + writeFileSync( + join(base, "first", "first.ts"), + "export function foo(a: string): void {}", + ); + writeFileSync( + join(base, "second", "second.ts"), + "export function foo(a: string, b: number): void {}", + ); + writeFileSync( + join(base, "current", "current.ts"), + "export function foo(a: string): number { return 1; }", + ); + + const taskRow = { + id: "T03", + key_files: ["current/current.ts"], + }; + const priorTasks = [ + { + id: "T01", + key_files: ["first/first.ts"], + }, + { + id: "T02", + key_files: ["second/second.ts"], + }, + ]; + + const results = checkCrossTaskSignatures(taskRow, priorTasks, base); + + // Should detect drift against BOTH prior definitions + const fooResults = results.filter((r) => r.target === "foo"); + expect(fooResults.length).toBeGreaterThanOrEqual(2); + + // Should report mismatch with T01 (params match but return type differs) + const returnTypeMismatches = fooResults.filter((r) => + r.message.includes("returns"), + ); + expect(returnTypeMismatches.length).toBeGreaterThanOrEqual(1); + + // Should report mismatch with T02 (params differ) + const paramMismatches = fooResults.filter((r) => + r.message.includes("parameters"), + ); + expect(paramMismatches.length).toBeGreaterThanOrEqual(1); + } finally { + rmSync(base, { recursive: true, force: true }); + } + }); +}); + +// ─── Bug 3: dispatch-guard global-positional-fallback ────────────────────── + +describe("getPriorSliceCompletionBlocker", () => { + test("does not block zero-dep slice when earlier incomplete slice has unsatisfied deps", () => { + const base = mkdtempSync(join(tmpdir(), "sf-dispatch-")); + try { + // Create a milestone structure with: + // S01 depends on S02 (circular-ish, but S02 is incomplete) + // S03 has no dependencies + // When dispatching S03, it should NOT be blocked by S01's incompleteness + // because S01 is itself blocked by unsatisfied deps. + mkdirSync(join(base, ".sf", "milestones", "M001"), { recursive: true }); + writeFileSync( + join(base, ".sf", "milestones", "M001", "ROADMAP.md"), + "# M001\n\n" + + "- [ ] S01 — first slice\n" + + " - depends: S02\n" + + "- [ ] S02 — second slice\n" + + "- [ ] S03 — third slice\n", + ); + + const blocker = getPriorSliceCompletionBlocker( + base, + "main", + "execute-task", + "M001/S03/T01", + ); + // S03 should NOT be blocked because S01 has unsatisfied deps (S02 incomplete) + expect(blocker).toBeNull(); + } finally { + rmSync(base, { recursive: true, force: true }); + } + }); +}); + +// ─── Bug 4: extractPackageReferences stopword collision ──────────────────── + +describe("extractPackageReferences", () => { + test("includes 'test' after -D flag", () => { + const packages = extractPackageReferences("npm install -D test"); + expect(packages).toContain("test"); + }); + + test("includes 'test' without flag when it looks like a package name", () => { + // "test" by itself is a stopword, but with package-name chars it should pass + const packages = extractPackageReferences("npm install test-utils"); + expect(packages).toContain("test-utils"); + }); + + test("stops on prose stopwords", () => { + const packages = extractPackageReferences("npm install and then run build"); + // "and" is a stopword and should terminate parsing + expect(packages).not.toContain("then"); + expect(packages).not.toContain("run"); + }); + + test("includes scoped packages regardless of stopwords", () => { + const packages = extractPackageReferences("npm install @types/test"); + expect(packages).toContain("@types/test"); + }); +}); + +// ─── Bug 5: runaway-guard token false-positive ───────────────────────────── + +describe("evaluateRunawayGuard", () => { + test("does not warn on token count alone when other signals are healthy", () => { + // Simulate a unit that has burned 1.2M tokens but only made 25 tool calls + // in 5 minutes with 1 changed file — this is healthy progress, not runaway. + resetRunawayGuardState("execute-task", "M001/S01/T01", { + sessionTokens: 100_000_000, // baseline: session already had 100M tokens + changedFiles: 0, + worktreeFingerprint: "abc123", + }); + + const config = { + enabled: true, + toolCallWarning: 60, + tokenWarning: 1_000_000, + elapsedMs: 20 * 60 * 1000, + changedFilesWarning: 75, + diagnosticTurns: 2, + hardPause: true, + minIntervalMs: 120_000, + }; + + const decision = evaluateRunawayGuard( + "execute-task", + "M001/S01/T01", + { + toolCalls: 25, + sessionTokens: 101_200_000, // delta = 1.2M tokens + elapsedMs: 5 * 60 * 1000, // 5 minutes + changedFiles: 1, + worktreeFingerprint: "def456", + topTools: { read: 10, edit: 5, bash: 5 }, + }, + config, + ); + + // Should NOT warn because token count alone is not enough — + // tool calls (25 < 60), elapsed (5min < 20min), changed files (1 < 75) + // are all well below thresholds. + expect(decision.action).toBe("none"); + }); + + test("warns when token count is accompanied by a primary signal", () => { + resetRunawayGuardState("execute-task", "M001/S01/T01", { + sessionTokens: 100_000_000, + changedFiles: 0, + worktreeFingerprint: "abc123", + }); + + const config = { + enabled: true, + toolCallWarning: 60, + tokenWarning: 1_000_000, + elapsedMs: 20 * 60 * 1000, + changedFilesWarning: 75, + diagnosticTurns: 2, + hardPause: true, + minIntervalMs: 120_000, + }; + + const decision = evaluateRunawayGuard( + "execute-task", + "M001/S01/T01", + { + toolCalls: 65, // exceeds toolCallWarning (60) + sessionTokens: 101_200_000, // delta = 1.2M tokens + elapsedMs: 5 * 60 * 1000, + changedFiles: 1, + worktreeFingerprint: "def456", + topTools: { read: 30, edit: 20, bash: 15 }, + }, + config, + ); + + // Should warn because tool calls (65 >= 60) is a primary signal, + // and tokens (1.2M >= 1M) corroborate. + expect(decision.action).toBe("warn"); + }); + + test("warns on no-progress heuristic regardless of primary signals", () => { + resetRunawayGuardState("execute-task", "M001/S01/T01", { + sessionTokens: 100_000_000, + changedFiles: 0, + worktreeFingerprint: "abc123", + }); + + const config = { + enabled: true, + toolCallWarning: 60, + tokenWarning: 1_000_000, + elapsedMs: 20 * 60 * 1000, + changedFilesWarning: 75, + diagnosticTurns: 2, + hardPause: true, + minIntervalMs: 120_000, + }; + + const decision = evaluateRunawayGuard( + "execute-task", + "M001/S01/T01", + { + toolCalls: 30, // exceeds EXECUTE_NO_PROGRESS_TOOL_WARNING (25) + sessionTokens: 100_600_000, // delta = 600K tokens + elapsedMs: 10 * 60 * 1000, + changedFiles: 0, + worktreeFingerprint: "abc123", // unchanged + topTools: { read: 15, edit: 10, bash: 5 }, + }, + config, + ); + + // Should warn because no-progress heuristic fires: + // 0 changed files, worktree unchanged, 30 tool calls, 600K tokens + expect(decision.action).toBe("warn"); + }); +});