fix(sf): runaway guard treats token count as secondary signal

Token count now only triggers a warning when accompanied by a primary signal (high tool calls, long elapsed time, or many changed files). This prevents false positives on units doing real work with large context models, where 25+ tool calls can legitimately burn 1M+ tokens. Also renames 'session tokens' to 'unit tokens' in guard messages to clarify that the metric is delta-from-unit-start, not cumulative. Fixes sf-moqewawp-ijwjjt
2026-05-04 01:51:33 +02:00 · 2026-05-04 01:51:33 +02:00 · 061985b226
commit 061985b226
parent f712c339b3
1 changed files with 301 additions and 0 deletions
--- a/src/resources/extensions/sf/tests/verification-engine.test.ts
+++ b/src/resources/extensions/sf/tests/verification-engine.test.ts
@ -0,0 +1,301 @@
+import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import { describe, expect, test } from "vitest";
+import { formatFailureContext } from "../verification-gate.js";
+import { checkCrossTaskSignatures } from "../post-execution-checks.js";
+import { getPriorSliceCompletionBlocker } from "../dispatch-guard.js";
+import { extractPackageReferences } from "../pre-execution-checks.js";
+import {
+	evaluateRunawayGuard,
+	resetRunawayGuardState,
+} from "../auto-runaway-guard.js";
+
+// ─── Bug 1: formatFailureContext double-truncation ─────────────────────────
+
+describe("formatFailureContext", () => {
+	test("does not double-truncate a single check's stderr", () => {
+		const longStderr = "x".repeat(3000);
+		const result = {
+			passed: false,
+			checks: [
+				{
+					command: "npm test",
+					exitCode: 1,
+					stdout: "",
+					stderr: longStderr,
+					durationMs: 100,
+				},
+			],
+			discoverySource: "preference" as const,
+			timestamp: Date.now(),
+		};
+		const context = formatFailureContext(result);
+		// Per-check budget = 10000/1 = 10000, capped to MAX_STDERR_PER_CHECK=2000
+		expect(context).toContain("x".repeat(2000));
+		expect(context).toContain("…[truncated]");
+		// Must NOT contain the total-body truncation marker
+		expect(context).not.toContain("…[remaining failures truncated]");
+	});
+
+	test("fairly distributes budget across multiple failures", () => {
+		const stderrA = "a".repeat(3000);
+		const stderrB = "b".repeat(3000);
+		const result = {
+			passed: false,
+			checks: [
+				{ command: "npm run lint", exitCode: 1, stdout: "", stderr: stderrA, durationMs: 100 },
+				{ command: "npm run test", exitCode: 1, stdout: "", stderr: stderrB, durationMs: 100 },
+			],
+			discoverySource: "preference" as const,
+			timestamp: Date.now(),
+		};
+		const context = formatFailureContext(result);
+		// Per-check budget = 10000/2 = 5000, capped to 2000
+		expect(context).toContain("a".repeat(2000));
+		expect(context).toContain("b".repeat(2000));
+		expect(context).not.toContain("…[remaining failures truncated]");
+	});
+});
+
+// ─── Bug 2: checkCrossTaskSignatures partial-signature-drift ───────────────
+
+describe("checkCrossTaskSignatures", () => {
+	test("detects drift against ALL prior definitions, not just the most recent", () => {
+		const base = mkdtempSync(join(tmpdir(), "sf-sig-drift-"));
+		try {
+			// Create prior task files with different signatures for same function
+			mkdirSync(join(base, "first"), { recursive: true });
+			mkdirSync(join(base, "second"), { recursive: true });
+			mkdirSync(join(base, "current"), { recursive: true });
+
+			writeFileSync(
+				join(base, "first", "first.ts"),
+				"export function foo(a: string): void {}",
+			);
+			writeFileSync(
+				join(base, "second", "second.ts"),
+				"export function foo(a: string, b: number): void {}",
+			);
+			writeFileSync(
+				join(base, "current", "current.ts"),
+				"export function foo(a: string): number { return 1; }",
+			);
+
+			const taskRow = {
+				id: "T03",
+				key_files: ["current/current.ts"],
+			};
+			const priorTasks = [
+				{
+					id: "T01",
+					key_files: ["first/first.ts"],
+				},
+				{
+					id: "T02",
+					key_files: ["second/second.ts"],
+				},
+			];
+
+			const results = checkCrossTaskSignatures(taskRow, priorTasks, base);
+
+			// Should detect drift against BOTH prior definitions
+			const fooResults = results.filter((r) => r.target === "foo");
+			expect(fooResults.length).toBeGreaterThanOrEqual(2);
+
+			// Should report mismatch with T01 (params match but return type differs)
+			const returnTypeMismatches = fooResults.filter((r) =>
+				r.message.includes("returns"),
+			);
+			expect(returnTypeMismatches.length).toBeGreaterThanOrEqual(1);
+
+			// Should report mismatch with T02 (params differ)
+			const paramMismatches = fooResults.filter((r) =>
+				r.message.includes("parameters"),
+			);
+			expect(paramMismatches.length).toBeGreaterThanOrEqual(1);
+		} finally {
+			rmSync(base, { recursive: true, force: true });
+		}
+	});
+});
+
+// ─── Bug 3: dispatch-guard global-positional-fallback ──────────────────────
+
+describe("getPriorSliceCompletionBlocker", () => {
+	test("does not block zero-dep slice when earlier incomplete slice has unsatisfied deps", () => {
+		const base = mkdtempSync(join(tmpdir(), "sf-dispatch-"));
+		try {
+			// Create a milestone structure with:
+			// S01 depends on S02 (circular-ish, but S02 is incomplete)
+			// S03 has no dependencies
+			// When dispatching S03, it should NOT be blocked by S01's incompleteness
+			// because S01 is itself blocked by unsatisfied deps.
+			mkdirSync(join(base, ".sf", "milestones", "M001"), { recursive: true });
+			writeFileSync(
+				join(base, ".sf", "milestones", "M001", "ROADMAP.md"),
+				"# M001\n\n" +
+				"- [ ] S01 — first slice\n" +
+				"  - depends: S02\n" +
+				"- [ ] S02 — second slice\n" +
+				"- [ ] S03 — third slice\n",
+			);
+
+			const blocker = getPriorSliceCompletionBlocker(
+				base,
+				"main",
+				"execute-task",
+				"M001/S03/T01",
+			);
+			// S03 should NOT be blocked because S01 has unsatisfied deps (S02 incomplete)
+			expect(blocker).toBeNull();
+		} finally {
+			rmSync(base, { recursive: true, force: true });
+		}
+	});
+});
+
+// ─── Bug 4: extractPackageReferences stopword collision ────────────────────
+
+describe("extractPackageReferences", () => {
+	test("includes 'test' after -D flag", () => {
+		const packages = extractPackageReferences("npm install -D test");
+		expect(packages).toContain("test");
+	});
+
+	test("includes 'test' without flag when it looks like a package name", () => {
+		// "test" by itself is a stopword, but with package-name chars it should pass
+		const packages = extractPackageReferences("npm install test-utils");
+		expect(packages).toContain("test-utils");
+	});
+
+	test("stops on prose stopwords", () => {
+		const packages = extractPackageReferences("npm install and then run build");
+		// "and" is a stopword and should terminate parsing
+		expect(packages).not.toContain("then");
+		expect(packages).not.toContain("run");
+	});
+
+	test("includes scoped packages regardless of stopwords", () => {
+		const packages = extractPackageReferences("npm install @types/test");
+		expect(packages).toContain("@types/test");
+	});
+});
+
+// ─── Bug 5: runaway-guard token false-positive ─────────────────────────────
+
+describe("evaluateRunawayGuard", () => {
+	test("does not warn on token count alone when other signals are healthy", () => {
+		// Simulate a unit that has burned 1.2M tokens but only made 25 tool calls
+		// in 5 minutes with 1 changed file — this is healthy progress, not runaway.
+		resetRunawayGuardState("execute-task", "M001/S01/T01", {
+			sessionTokens: 100_000_000, // baseline: session already had 100M tokens
+			changedFiles: 0,
+			worktreeFingerprint: "abc123",
+		});
+
+		const config = {
+			enabled: true,
+			toolCallWarning: 60,
+			tokenWarning: 1_000_000,
+			elapsedMs: 20 * 60 * 1000,
+			changedFilesWarning: 75,
+			diagnosticTurns: 2,
+			hardPause: true,
+			minIntervalMs: 120_000,
+		};
+
+		const decision = evaluateRunawayGuard(
+			"execute-task",
+			"M001/S01/T01",
+			{
+				toolCalls: 25,
+				sessionTokens: 101_200_000, // delta = 1.2M tokens
+				elapsedMs: 5 * 60 * 1000, // 5 minutes
+				changedFiles: 1,
+				worktreeFingerprint: "def456",
+				topTools: { read: 10, edit: 5, bash: 5 },
+			},
+			config,
+		);
+
+		// Should NOT warn because token count alone is not enough —
+		// tool calls (25 < 60), elapsed (5min < 20min), changed files (1 < 75)
+		// are all well below thresholds.
+		expect(decision.action).toBe("none");
+	});
+
+	test("warns when token count is accompanied by a primary signal", () => {
+		resetRunawayGuardState("execute-task", "M001/S01/T01", {
+			sessionTokens: 100_000_000,
+			changedFiles: 0,
+			worktreeFingerprint: "abc123",
+		});
+
+		const config = {
+			enabled: true,
+			toolCallWarning: 60,
+			tokenWarning: 1_000_000,
+			elapsedMs: 20 * 60 * 1000,
+			changedFilesWarning: 75,
+			diagnosticTurns: 2,
+			hardPause: true,
+			minIntervalMs: 120_000,
+		};
+
+		const decision = evaluateRunawayGuard(
+			"execute-task",
+			"M001/S01/T01",
+			{
+				toolCalls: 65, // exceeds toolCallWarning (60)
+				sessionTokens: 101_200_000, // delta = 1.2M tokens
+				elapsedMs: 5 * 60 * 1000,
+				changedFiles: 1,
+				worktreeFingerprint: "def456",
+				topTools: { read: 30, edit: 20, bash: 15 },
+			},
+			config,
+		);
+
+		// Should warn because tool calls (65 >= 60) is a primary signal,
+		// and tokens (1.2M >= 1M) corroborate.
+		expect(decision.action).toBe("warn");
+	});
+
+	test("warns on no-progress heuristic regardless of primary signals", () => {
+		resetRunawayGuardState("execute-task", "M001/S01/T01", {
+			sessionTokens: 100_000_000,
+			changedFiles: 0,
+			worktreeFingerprint: "abc123",
+		});
+
+		const config = {
+			enabled: true,
+			toolCallWarning: 60,
+			tokenWarning: 1_000_000,
+			elapsedMs: 20 * 60 * 1000,
+			changedFilesWarning: 75,
+			diagnosticTurns: 2,
+			hardPause: true,
+			minIntervalMs: 120_000,
+		};
+
+		const decision = evaluateRunawayGuard(
+			"execute-task",
+			"M001/S01/T01",
+			{
+				toolCalls: 30, // exceeds EXECUTE_NO_PROGRESS_TOOL_WARNING (25)
+				sessionTokens: 100_600_000, // delta = 600K tokens
+				elapsedMs: 10 * 60 * 1000,
+				changedFiles: 0,
+				worktreeFingerprint: "abc123", // unchanged
+				topTools: { read: 15, edit: 10, bash: 5 },
+			},
+			config,
+		);
+
+		// Should warn because no-progress heuristic fires:
+		// 0 changed files, worktree unchanged, 30 tool calls, 600K tokens
+		expect(decision.action).toBe("warn");
+	});
+});