diff --git a/src/resources/extensions/gsd/forensics.ts b/src/resources/extensions/gsd/forensics.ts
index 81cc69da2..ba2746f8b 100644
--- a/src/resources/extensions/gsd/forensics.ts
+++ b/src/resources/extensions/gsd/forensics.ts
@@ -38,7 +38,7 @@ import { ensurePreferencesFile, serializePreferencesToFrontmatter } from "./comm
 
 // ─── Types ────────────────────────────────────────────────────────────────────
 
-interface ForensicAnomaly {
+export interface ForensicAnomaly {
   type: "stuck-loop" | "cost-spike" | "timeout" | "missing-artifact" | "crash" | "doctor-issue" | "error-trace" | "journal-stuck" | "journal-guard-block" | "journal-rapid-iterations" | "journal-worktree-failure";
   severity: "info" | "warning" | "error";
   unitType?: string;
@@ -640,13 +640,29 @@ function getDbCompletionCounts(): DbCompletionCounts | null {
 
 // ─── Anomaly Detectors ───────────────────────────────────────────────────────
 
-function detectStuckLoops(units: UnitMetrics[], anomalies: ForensicAnomaly[]): void {
-  const counts = new Map<string, number>();
+/**
+ * Detect units that were dispatched multiple times (stuck in a loop).
+ *
+ * Counts distinct dispatches as the number of unique startedAt values per
+ * (type, id), so idle-watchdog duplicate snapshots of the same dispatch
+ * (#1943) collapse into a single dispatch rather than inflating the count.
+ *
+ * Exported for testability.
+ */
+export function detectStuckLoops(units: UnitMetrics[], anomalies: ForensicAnomaly[]): void {
+  // First, collect unique startedAt values per type/id key
+  const dispatchMap = new Map<string, Set<number>>();
   for (const u of units) {
     const key = `${u.type}/${u.id}`;
-    counts.set(key, (counts.get(key) ?? 0) + 1);
+    let starts = dispatchMap.get(key);
+    if (!starts) {
+      starts = new Set<number>();
+      dispatchMap.set(key, starts);
+    }
+    starts.add(u.startedAt);
   }
-  for (const [key, count] of counts) {
+  for (const [key, starts] of dispatchMap) {
+    const count = starts.size;
     if (count > 1) {
       const [unitType, ...idParts] = key.split("/");
       anomalies.push({
diff --git a/src/resources/extensions/gsd/metrics.ts b/src/resources/extensions/gsd/metrics.ts
index 1467499e4..a29d4f39d 100644
--- a/src/resources/extensions/gsd/metrics.ts
+++ b/src/resources/extensions/gsd/metrics.ts
@@ -567,7 +567,37 @@ export function loadLedgerFromDisk(base: string): MetricsLedger | null {
 }
 
 function loadLedger(base: string): MetricsLedger {
-  return loadJsonFile(metricsPath(base), isMetricsLedger, defaultLedger);
+  const raw = loadJsonFile(metricsPath(base), isMetricsLedger, defaultLedger);
+  const before = raw.units.length;
+  raw.units = deduplicateUnits(raw.units);
+  if (raw.units.length < before) {
+    // Persist the cleaned ledger so duplicates don't re-accumulate
+    saveLedger(base, raw);
+  }
+  return raw;
+}
+
+/**
+ * Collapse duplicate entries with the same (type, id, startedAt) triple.
+ * Keeps the entry with the highest finishedAt (the most complete snapshot).
+ *
+ * This is a defensive measure against idle-watchdog race conditions that can
+ * produce duplicate entries on disk despite the in-memory idempotency guard
+ * in snapshotUnitMetrics(). See #1943.
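+ *
+ * For example, two snapshots of the same dispatch with finishedAt 2000 and
+ * finishedAt 3500 collapse to the single entry whose finishedAt is 3500.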
+ */
+function deduplicateUnits(units: UnitMetrics[]): UnitMetrics[] {
+  const map = new Map<string, UnitMetrics>();
+  for (const u of units) {
+    const key = `${u.type}\0${u.id}\0${u.startedAt}`;
+    const existing = map.get(key);
+    if (!existing || u.finishedAt > existing.finishedAt) {
+      map.set(key, u);
+    }
+  }
+  return Array.from(map.values());
 }
 
 function saveLedger(base: string, data: MetricsLedger): void {
diff --git a/src/resources/extensions/gsd/tests/forensics-stuck-loops.test.ts b/src/resources/extensions/gsd/tests/forensics-stuck-loops.test.ts
new file mode 100644
index 000000000..555570bab
--- /dev/null
+++ b/src/resources/extensions/gsd/tests/forensics-stuck-loops.test.ts
@@ -0,0 +1,103 @@
+/**
+ * Forensics detectStuckLoops tests — #1943
+ *
+ * Verifies that detectStuckLoops counts distinct dispatches (unique startedAt
+ * values per type/id) instead of raw entry count, which produces false-positive
+ * stuck-loop anomalies when idle-watchdog duplicate metrics entries exist.
+ */
+
+import test from "node:test";
+import assert from "node:assert/strict";
+import type { UnitMetrics } from "../metrics.js";
+import { detectStuckLoops, type ForensicAnomaly } from "../forensics.js";
+
+// ── Helpers ──────────────────────────────────────────────────────────────────
+
+function makeUnit(overrides: Partial<UnitMetrics> = {}): UnitMetrics {
+  return {
+    type: "execute-task",
+    id: "M001/S01/T01",
+    model: "claude-sonnet-4-20250514",
+    startedAt: 1000,
+    finishedAt: 2000,
+    tokens: { input: 1000, output: 500, cacheRead: 200, cacheWrite: 100, total: 1800 },
+    cost: 0.05,
+    toolCalls: 3,
+    assistantMessages: 2,
+    userMessages: 1,
+    ...overrides,
+  };
+}
+
+
+// ── Tests ────────────────────────────────────────────────────────────────────
+
+test("#1943 detectStuckLoops does not flag idle-watchdog duplicates as stuck loops", () => {
+  const anomalies: ForensicAnomaly[] = [];
+  const startedAt = 1774011016218;
+
+  // 20 entries with the SAME startedAt — these are idle-watchdog duplicates,
+  // not real re-dispatches. They should count as 1 dispatch.
+  const units: UnitMetrics[] = [];
+  for (let i = 0; i < 20; i++) {
+    units.push(makeUnit({
+      type: "research-slice",
+      id: "M009/S02",
+      startedAt,
+      finishedAt: startedAt + (i + 1) * 15000,
+      cost: 1.50 + i * 0.05,
+      toolCalls: 0,
+    }));
+  }
+
+  detectStuckLoops(units, anomalies);
+
+  // A single dispatch (same startedAt) should NOT trigger a stuck-loop anomaly
+  assert.equal(
+    anomalies.length, 0,
+    `expected 0 anomalies for 20 watchdog snapshots of the same dispatch, got ${anomalies.length}: ${anomalies.map(a => a.summary).join(", ")}`,
+  );
+});
+
+test("#1943 detectStuckLoops correctly flags real re-dispatches", () => {
+  const anomalies: ForensicAnomaly[] = [];
+
+  // 3 entries with DIFFERENT startedAt values — these are real re-dispatches
+  const units: UnitMetrics[] = [
+    makeUnit({ type: "execute-task", id: "M001/S01/T01", startedAt: 1000, finishedAt: 2000, cost: 0.05 }),
+    makeUnit({ type: "execute-task", id: "M001/S01/T01", startedAt: 3000, finishedAt: 4000, cost: 0.06 }),
+    makeUnit({ type: "execute-task", id: "M001/S01/T01", startedAt: 5000, finishedAt: 6000, cost: 0.07 }),
+  ];
+
+  detectStuckLoops(units, anomalies);
+
+  assert.equal(anomalies.length, 1, "3 distinct dispatches of the same unit should flag 1 anomaly");
+  assert.equal(anomalies[0].type, "stuck-loop");
+  assert.ok(anomalies[0].summary.includes("3 times"), `summary should mention 3 dispatches: ${anomalies[0].summary}`);
+});
+
+test("#1943 detectStuckLoops ignores watchdog duplicates but flags real re-dispatches in mixed data", () => {
+  const anomalies: ForensicAnomaly[] = [];
+
+  const units: UnitMetrics[] = [
+    // 5 watchdog duplicates for dispatch 1 (same startedAt = 1000)
+    ...Array.from({ length: 5 }, (_, i) =>
+      makeUnit({ type: "execute-task", id: "M001/S01/T01", startedAt: 1000, finishedAt: 1000 + (i + 1) * 15000, cost: 0.05 + i * 0.01 }),
+    ),
+    // 3 watchdog duplicates for dispatch 2 (same startedAt = 100000)
+    ...Array.from({ length: 3 }, (_, i) =>
+      makeUnit({ type: "execute-task", id: "M001/S01/T01", startedAt: 100000, finishedAt: 100000 + (i + 1) * 15000, cost: 0.08 + i * 0.01 }),
+    ),
+    // 1 entry for dispatch 3 (startedAt = 200000)
+    makeUnit({ type: "execute-task", id: "M001/S01/T01", startedAt: 200000, finishedAt: 260000, cost: 0.10 }),
+    // Different unit — only 1 dispatch, should NOT be flagged
+    makeUnit({ type: "plan-slice", id: "M001/S01", startedAt: 500, finishedAt: 1500, cost: 0.02 }),
+  ];
+
+  detectStuckLoops(units, anomalies);
+
+  // M001/S01/T01 has 3 distinct dispatches (startedAt: 1000, 100000, 200000) — should be flagged
+  // M001/S01 has 1 dispatch — should NOT be flagged
+  assert.equal(anomalies.length, 1, `expected 1 anomaly (for the 3x dispatched task), got ${anomalies.length}`);
+  assert.ok(anomalies[0].summary.includes("3 times"));
+});
diff --git a/src/resources/extensions/gsd/tests/metrics.test.ts b/src/resources/extensions/gsd/tests/metrics.test.ts
index 98782460e..dc221531a 100644
--- a/src/resources/extensions/gsd/tests/metrics.test.ts
+++ b/src/resources/extensions/gsd/tests/metrics.test.ts
@@ -6,7 +6,7 @@
 import test from "node:test";
 import assert from "node:assert/strict";
-import { mkdtempSync, mkdirSync, readFileSync, rmSync } from "node:fs";
+import { mkdtempSync, mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs";
 import { join } from "node:path";
 import { tmpdir } from "node:os";
 
 import {
@@ -382,3 +382,118 @@ test("snapshotUnitMetrics counts toolCall blocks correctly (#1713)", () => {
     rmSync(tmpBase, { recursive: true, force: true });
   }
 });
+
+// ── #1943 — Duplicate metrics entries from idle watchdog ─────────────────────
+
+test("#1943 initMetrics deduplicates entries loaded from a corrupted disk ledger", () => {
+  const tmpBase = mkdtempSync(join(tmpdir(), "gsd-metrics-dedup-load-"));
+  mkdirSync(join(tmpBase, ".gsd"), { recursive: true });
+
+  try {
+    resetMetrics();
+
+    // Simulate a corrupted metrics.json with duplicate entries on disk
+    // (same type+id+startedAt but different finishedAt — idle watchdog pattern)
+    const corruptedLedger: MetricsLedger = {
+      version: 1,
+      projectStartedAt: 1700000000000,
+      units: [
+        makeUnit({ type: "research-slice", id: "M009/S02", startedAt: 1774011016218, finishedAt: 1774011031218, cost: 1.50, tokens: { input: 6600000, output: 100000, cacheRead: 0, cacheWrite: 0, total: 6700000 } }),
+        makeUnit({ type: "research-slice", id: "M009/S02", startedAt: 1774011016218, finishedAt: 1774011046218, cost: 1.55, tokens: { input: 6800000, output: 110000, cacheRead: 0, cacheWrite: 0, total: 6910000 } }),
+        makeUnit({ type: "research-slice", id: "M009/S02", startedAt: 1774011016218, finishedAt: 1774011061218, cost: 1.60, tokens: { input: 7000000, output: 120000, cacheRead: 0, cacheWrite: 0, total: 7120000 } }),
+        makeUnit({ type: "research-slice", id: "M009/S02", startedAt: 1774011016218, finishedAt: 1774011076218, cost: 1.65, tokens: { input: 7200000, output: 130000, cacheRead: 0, cacheWrite: 0, total: 7330000 } }),
+        // A different unit — should be preserved
+        makeUnit({ type: "execute-task", id: "M001/S01/T01", startedAt: 1774012000000, finishedAt: 1774012060000, cost: 0.50 }),
+      ],
+    };
+    writeFileSync(
+      join(tmpBase, ".gsd", "metrics.json"),
+      JSON.stringify(corruptedLedger, null, 2),
+    );
+
+    // Load the corrupted ledger — duplicates should be collapsed on load
+    initMetrics(tmpBase);
+    const ledger = getLedger();
+    assert.ok(ledger);
+
+    // The 4 entries with identical (type, id, startedAt) should collapse to 1,
+    // keeping the latest (highest finishedAt). Plus the 1 different unit = 2 total.
+    assert.equal(
+      ledger!.units.length, 2,
+      `expected 2 entries after dedup (1 collapsed group + 1 unique), got ${ledger!.units.length}`,
+    );
+
+    // The surviving duplicate should be the one with the latest finishedAt
+    const researchEntry = ledger!.units.find(u => u.type === "research-slice");
+    assert.ok(researchEntry);
+    assert.equal(researchEntry!.finishedAt, 1774011076218, "should keep the latest finishedAt");
+    assert.equal(researchEntry!.cost, 1.65, "should keep the latest cost");
+
+    // The on-disk file should also be deduplicated
+    const diskRaw = readFileSync(join(tmpBase, ".gsd", "metrics.json"), "utf-8");
+    const diskLedger: MetricsLedger = JSON.parse(diskRaw);
+    assert.equal(diskLedger.units.length, 2, "disk should also have deduplicated entries");
+  } finally {
+    resetMetrics();
+    rmSync(tmpBase, { recursive: true, force: true });
+  }
+});
+
+test("#1943 getProjectTotals reports correct cost after dedup (no 35% inflation)", () => {
+  // Simulate the exact scenario from the issue: 20 entries for a single dispatch
+  // with monotonically increasing token counts and 15s-apart finishedAt values
+  const startedAt = 1774011016218;
+  const baseCost = 1.50;
+  const duplicateUnits: UnitMetrics[] = [];
+
+  for (let i = 0; i < 20; i++) {
+    duplicateUnits.push(makeUnit({
+      type: "research-slice",
+      id: "M009/S02",
+      startedAt,
+      finishedAt: startedAt + (i + 1) * 15000,
+      cost: baseCost + i * 0.05,
+      toolCalls: 0,
+      tokens: {
+        input: 6600000 + i * 200000,
+        output: 100000 + i * 10000,
+        cacheRead: 0,
+        cacheWrite: 0,
+        total: 6700000 + i * 210000,
+      },
+    }));
+  }
+
+  // Without dedup, getProjectTotals would sum all 20 entries' costs
+  const rawTotals = getProjectTotals(duplicateUnits);
+  // With dedup (only last entry should count), cost should be the last entry's cost
+  const lastEntryCost = duplicateUnits[duplicateUnits.length - 1].cost;
+
+  // This test documents the bug: raw totals inflate cost by summing duplicates
+  assert.ok(
+    rawTotals.cost > lastEntryCost * 2,
+    "raw totals with duplicates inflate cost (bug demonstration)",
+  );
+
+  // After loading through initMetrics (which should dedup), totals should be correct
+  const tmpBase = mkdtempSync(join(tmpdir(), "gsd-metrics-cost-inflation-"));
+  mkdirSync(join(tmpBase, ".gsd"), { recursive: true });
+  try {
+    resetMetrics();
+    writeFileSync(
+      join(tmpBase, ".gsd", "metrics.json"),
+      JSON.stringify({ version: 1, projectStartedAt: 1700000000000, units: duplicateUnits }, null, 2),
+    );
+    initMetrics(tmpBase);
+    const ledger = getLedger()!;
+    const dedupedTotals = getProjectTotals(ledger.units);
+    assert.equal(ledger.units.length, 1, "20 duplicates should collapse to 1 entry");
+    assert.equal(
+      dedupedTotals.cost, lastEntryCost,
+      `deduped cost should be ${lastEntryCost}, not ${dedupedTotals.cost}`,
+    );
+  } finally {
+    resetMetrics();
+    rmSync(tmpBase, { recursive: true, force: true });
+  }
+});