fix(metrics): deduplicate idle-watchdog entries and fix forensics false-positives (#1973)

* fix(metrics): deduplicate idle-watchdog entries on ledger load and fix forensics false-positives

The idle watchdog creates duplicate metrics entries with the same
(type, id, startedAt) triple, inflating reported cost by ~35% and
causing false-positive stuck-loop anomalies in forensics.
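
For illustration, the on-disk pattern looks roughly like this (a minimal
sketch only; Snapshot stands in for the real UnitMetrics shape, and the
values are taken from the repro data in the tests below):

    // Watchdog snapshots of ONE dispatch: same (type, id, startedAt);
    // only finishedAt (and cost/tokens) advance between writes.
    type Snapshot = { type: string; id: string; startedAt: number; finishedAt: number };
    const snapshots: Snapshot[] = [
      { type: "research-slice", id: "M009/S02", startedAt: 1774011016218, finishedAt: 1774011031218 },
      { type: "research-slice", id: "M009/S02", startedAt: 1774011016218, finishedAt: 1774011046218 },
      { type: "research-slice", id: "M009/S02", startedAt: 1774011016218, finishedAt: 1774011061218 },
    ];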

Add a deduplicateUnits() pass in loadLedger() that collapses entries
sharing the same (type, id, startedAt) key, keeping the one with the
highest finishedAt. The cleaned ledger is persisted back to disk so
duplicates do not re-accumulate across sessions.
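
The collapse rule, as a standalone sketch (it mirrors deduplicateUnits() in
the diff below and reuses the illustrative Snapshot type above):

    // Keep one entry per (type, id, startedAt), preferring the highest finishedAt.
    function collapse(entries: Snapshot[]): Snapshot[] {
      const byKey = new Map<string, Snapshot>();
      for (const e of entries) {
        const key = `${e.type}\0${e.id}\0${e.startedAt}`;
        const prev = byKey.get(key);
        if (!prev || e.finishedAt > prev.finishedAt) byKey.set(key, e);
      }
      return [...byKey.values()];
    }

    collapse(snapshots); // one survivor: the finishedAt = 1774011061218 entry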

Fix detectStuckLoops() in forensics.ts to count distinct startedAt
values per type/id instead of raw entry count, so idle-watchdog
snapshots of the same dispatch are not flagged as stuck loops.
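
Sketched the same way (again on the illustrative Snapshot type), the new
counting rule is "distinct startedAt values per type/id", not raw entries:

    // More than one distinct startedAt for a type/id means a real re-dispatch.
    function dispatchCounts(entries: Snapshot[]): Map<string, number> {
      const starts = new Map<string, Set<number>>();
      for (const e of entries) {
        const key = `${e.type}/${e.id}`;
        let set = starts.get(key);
        if (!set) starts.set(key, (set = new Set()));
        set.add(e.startedAt);
      }
      return new Map([...starts].map(([k, s]) => [k, s.size]));
    }

    dispatchCounts(snapshots).get("research-slice/M009/S02"); // 1, so no anomaly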

Fixes #1943

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: export ForensicAnomaly type and use in test

The test declared a local ForensicAnomaly interface with `type: string`,
which is incompatible with the real union literal type, causing a CI
typecheck failure.
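
A minimal reproduction of the mismatch (hypothetical names, for reference only):

    interface LocalAnomaly { type: string }  // the test's old local declaration
    declare function check(anomalies: ForensicAnomaly[]): void;
    const anomalies: LocalAnomaly[] = [];
    // check(anomalies);
    //   rejected by tsc: 'string' is not assignable to the union of literal types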

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Tom Boucher 2026-04-05 00:54:44 -04:00 committed by GitHub
parent c8f11019f5
commit 3e7a8d8556
4 changed files with 268 additions and 7 deletions

@@ -38,7 +38,7 @@ import { ensurePreferencesFile, serializePreferencesToFrontmatter } from "./comm
 // ─── Types ────────────────────────────────────────────────────────────────────
-interface ForensicAnomaly {
+export interface ForensicAnomaly {
   type: "stuck-loop" | "cost-spike" | "timeout" | "missing-artifact" | "crash" | "doctor-issue" | "error-trace" | "journal-stuck" | "journal-guard-block" | "journal-rapid-iterations" | "journal-worktree-failure";
   severity: "info" | "warning" | "error";
   unitType?: string;
@@ -640,13 +640,29 @@ function getDbCompletionCounts(): DbCompletionCounts | null {
 // ─── Anomaly Detectors ───────────────────────────────────────────────────────
-function detectStuckLoops(units: UnitMetrics[], anomalies: ForensicAnomaly[]): void {
-  const counts = new Map<string, number>();
+/**
+ * Detect units that were dispatched multiple times (stuck in a loop).
+ *
+ * Counts distinct dispatches by grouping on (type, id, startedAt) first to
+ * collapse idle-watchdog duplicate snapshots (#1943), then counts unique
+ * startedAt values per type/id to determine actual dispatch count.
+ *
+ * Exported for testability.
+ */
+export function detectStuckLoops(units: UnitMetrics[], anomalies: ForensicAnomaly[]): void {
+  // First, collect unique startedAt values per type/id key
+  const dispatchMap = new Map<string, Set<number>>();
   for (const u of units) {
     const key = `${u.type}/${u.id}`;
-    counts.set(key, (counts.get(key) ?? 0) + 1);
+    let starts = dispatchMap.get(key);
+    if (!starts) {
+      starts = new Set();
+      dispatchMap.set(key, starts);
+    }
+    starts.add(u.startedAt);
   }
-  for (const [key, count] of counts) {
+  for (const [key, starts] of dispatchMap) {
+    const count = starts.size;
     if (count > 1) {
       const [unitType, ...idParts] = key.split("/");
       anomalies.push({

@@ -567,7 +567,34 @@ export function loadLedgerFromDisk(base: string): MetricsLedger | null {
 }
 
 function loadLedger(base: string): MetricsLedger {
-  return loadJsonFile(metricsPath(base), isMetricsLedger, defaultLedger);
+  const raw = loadJsonFile(metricsPath(base), isMetricsLedger, defaultLedger);
+  const before = raw.units.length;
+  raw.units = deduplicateUnits(raw.units);
+  if (raw.units.length < before) {
+    // Persist the cleaned ledger so duplicates don't re-accumulate
+    saveLedger(base, raw);
+  }
+  return raw;
 }
 
+/**
+ * Collapse duplicate entries with the same (type, id, startedAt) triple.
+ * Keeps the entry with the highest finishedAt (the most complete snapshot).
+ *
+ * This is a defensive measure against idle-watchdog race conditions that can
+ * produce duplicate entries on disk despite the in-memory idempotency guard
+ * in snapshotUnitMetrics(). See #1943.
+ */
+function deduplicateUnits(units: UnitMetrics[]): UnitMetrics[] {
+  const map = new Map<string, UnitMetrics>();
+  for (const u of units) {
+    const key = `${u.type}\0${u.id}\0${u.startedAt}`;
+    const existing = map.get(key);
+    if (!existing || u.finishedAt > existing.finishedAt) {
+      map.set(key, u);
+    }
+  }
+  return Array.from(map.values());
+}
+
 function saveLedger(base: string, data: MetricsLedger): void {

@@ -0,0 +1,103 @@
+/**
+ * Forensics detectStuckLoops tests #1943
+ *
+ * Verifies that detectStuckLoops counts distinct dispatches (unique startedAt
+ * values per type/id) instead of raw entry count, which produces false-positive
+ * stuck-loop anomalies when idle-watchdog duplicate metrics entries exist.
+ */
+
+import test from "node:test";
+import assert from "node:assert/strict";
+
+import type { UnitMetrics } from "../metrics.js";
+import { detectStuckLoops, type ForensicAnomaly } from "../forensics.js";
+
+// ── Helpers ──────────────────────────────────────────────────────────────────
+
+function makeUnit(overrides: Partial<UnitMetrics> = {}): UnitMetrics {
+  return {
+    type: "execute-task",
+    id: "M001/S01/T01",
+    model: "claude-sonnet-4-20250514",
+    startedAt: 1000,
+    finishedAt: 2000,
+    tokens: { input: 1000, output: 500, cacheRead: 200, cacheWrite: 100, total: 1800 },
+    cost: 0.05,
+    toolCalls: 3,
+    assistantMessages: 2,
+    userMessages: 1,
+    ...overrides,
+  };
+}
+
+// ── Tests ────────────────────────────────────────────────────────────────────
+
+test("#1943 detectStuckLoops does not flag idle-watchdog duplicates as stuck loops", () => {
+  const anomalies: ForensicAnomaly[] = [];
+  const startedAt = 1774011016218;
+
+  // 20 entries with the SAME startedAt — these are idle-watchdog duplicates,
+  // not real re-dispatches. They should count as 1 dispatch.
+  const units: UnitMetrics[] = [];
+  for (let i = 0; i < 20; i++) {
+    units.push(makeUnit({
+      type: "research-slice",
+      id: "M009/S02",
+      startedAt,
+      finishedAt: startedAt + (i + 1) * 15000,
+      cost: 1.50 + i * 0.05,
+      toolCalls: 0,
+    }));
+  }
+
+  detectStuckLoops(units, anomalies);
+
+  // A single dispatch (same startedAt) should NOT trigger a stuck-loop anomaly
+  assert.equal(
+    anomalies.length, 0,
+    `expected 0 anomalies for 20 watchdog snapshots of the same dispatch, got ${anomalies.length}: ${anomalies.map(a => a.summary).join(", ")}`,
+  );
+});
+
+test("#1943 detectStuckLoops correctly flags real re-dispatches", () => {
+  const anomalies: ForensicAnomaly[] = [];
+
+  // 3 entries with DIFFERENT startedAt values — these are real re-dispatches
+  const units: UnitMetrics[] = [
+    makeUnit({ type: "execute-task", id: "M001/S01/T01", startedAt: 1000, finishedAt: 2000, cost: 0.05 }),
+    makeUnit({ type: "execute-task", id: "M001/S01/T01", startedAt: 3000, finishedAt: 4000, cost: 0.06 }),
+    makeUnit({ type: "execute-task", id: "M001/S01/T01", startedAt: 5000, finishedAt: 6000, cost: 0.07 }),
+  ];
+
+  detectStuckLoops(units, anomalies);
+
+  assert.equal(anomalies.length, 1, "3 distinct dispatches of the same unit should flag 1 anomaly");
+  assert.equal(anomalies[0].type, "stuck-loop");
+  assert.ok(anomalies[0].summary.includes("3 times"), `summary should mention 3 dispatches: ${anomalies[0].summary}`);
+});
+
+test("#1943 detectStuckLoops ignores watchdog duplicates but flags real re-dispatches in mixed data", () => {
+  const anomalies: ForensicAnomaly[] = [];
+
+  const units: UnitMetrics[] = [
+    // 5 watchdog duplicates for dispatch 1 (same startedAt = 1000)
+    ...Array.from({ length: 5 }, (_, i) =>
+      makeUnit({ type: "execute-task", id: "M001/S01/T01", startedAt: 1000, finishedAt: 1000 + (i + 1) * 15000, cost: 0.05 + i * 0.01 }),
+    ),
+    // 3 watchdog duplicates for dispatch 2 (same startedAt = 100000)
+    ...Array.from({ length: 3 }, (_, i) =>
+      makeUnit({ type: "execute-task", id: "M001/S01/T01", startedAt: 100000, finishedAt: 100000 + (i + 1) * 15000, cost: 0.08 + i * 0.01 }),
+    ),
+    // 1 entry for dispatch 3 (startedAt = 200000)
+    makeUnit({ type: "execute-task", id: "M001/S01/T01", startedAt: 200000, finishedAt: 260000, cost: 0.10 }),
+    // Different unit — only 1 dispatch, should NOT be flagged
+    makeUnit({ type: "plan-slice", id: "M001/S01", startedAt: 500, finishedAt: 1500, cost: 0.02 }),
+  ];
+
+  detectStuckLoops(units, anomalies);
+
+  // M001/S01/T01 has 3 distinct dispatches (startedAt: 1000, 100000, 200000) — should be flagged
+  // M001/S01 has 1 dispatch — should NOT be flagged
+  assert.equal(anomalies.length, 1, `expected 1 anomaly (for the 3x dispatched task), got ${anomalies.length}`);
+  assert.ok(anomalies[0].summary.includes("3 times"));
+});

@@ -6,7 +6,7 @@
 import test from "node:test";
 import assert from "node:assert/strict";
-import { mkdtempSync, mkdirSync, readFileSync, rmSync } from "node:fs";
+import { mkdtempSync, mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs";
 import { join } from "node:path";
 import { tmpdir } from "node:os";
 import {
@@ -382,3 +382,118 @@ test("snapshotUnitMetrics counts toolCall blocks correctly (#1713)", () => {
     rmSync(tmpBase, { recursive: true, force: true });
   }
 });
+
+// ── #1943 — Duplicate metrics entries from idle watchdog ──────────────────────
+
+test("#1943 initMetrics deduplicates entries loaded from a corrupted disk ledger", () => {
+  const tmpBase = mkdtempSync(join(tmpdir(), "gsd-metrics-dedup-load-"));
+  mkdirSync(join(tmpBase, ".gsd"), { recursive: true });
+  try {
+    resetMetrics();
+
+    // Simulate a corrupted metrics.json with duplicate entries on disk
+    // (same type+id+startedAt but different finishedAt — idle watchdog pattern)
+    const corruptedLedger: MetricsLedger = {
+      version: 1,
+      projectStartedAt: 1700000000000,
+      units: [
+        makeUnit({ type: "research-slice", id: "M009/S02", startedAt: 1774011016218, finishedAt: 1774011031218, cost: 1.50, tokens: { input: 6600000, output: 100000, cacheRead: 0, cacheWrite: 0, total: 6700000 } }),
+        makeUnit({ type: "research-slice", id: "M009/S02", startedAt: 1774011016218, finishedAt: 1774011046218, cost: 1.55, tokens: { input: 6800000, output: 110000, cacheRead: 0, cacheWrite: 0, total: 6910000 } }),
+        makeUnit({ type: "research-slice", id: "M009/S02", startedAt: 1774011016218, finishedAt: 1774011061218, cost: 1.60, tokens: { input: 7000000, output: 120000, cacheRead: 0, cacheWrite: 0, total: 7120000 } }),
+        makeUnit({ type: "research-slice", id: "M009/S02", startedAt: 1774011016218, finishedAt: 1774011076218, cost: 1.65, tokens: { input: 7200000, output: 130000, cacheRead: 0, cacheWrite: 0, total: 7330000 } }),
+        // A different unit — should be preserved
+        makeUnit({ type: "execute-task", id: "M001/S01/T01", startedAt: 1774012000000, finishedAt: 1774012060000, cost: 0.50 }),
+      ],
+    };
+    writeFileSync(
+      join(tmpBase, ".gsd", "metrics.json"),
+      JSON.stringify(corruptedLedger, null, 2),
+    );
+
+    // Load the corrupted ledger — duplicates should be collapsed on load
+    initMetrics(tmpBase);
+    const ledger = getLedger();
+    assert.ok(ledger);
+
+    // The 4 entries with identical (type, id, startedAt) should collapse to 1,
+    // keeping the latest (highest finishedAt). Plus the 1 different unit = 2 total.
+    assert.equal(
+      ledger!.units.length, 2,
+      `expected 2 entries after dedup (1 collapsed group + 1 unique), got ${ledger!.units.length}`,
+    );
+
+    // The surviving duplicate should be the one with the latest finishedAt
+    const researchEntry = ledger!.units.find(u => u.type === "research-slice");
+    assert.ok(researchEntry);
+    assert.equal(researchEntry!.finishedAt, 1774011076218, "should keep the latest finishedAt");
+    assert.equal(researchEntry!.cost, 1.65, "should keep the latest cost");
+
+    // The on-disk file should also be deduplicated
+    const diskRaw = readFileSync(join(tmpBase, ".gsd", "metrics.json"), "utf-8");
+    const diskLedger: MetricsLedger = JSON.parse(diskRaw);
+    assert.equal(diskLedger.units.length, 2, "disk should also have deduplicated entries");
+  } finally {
+    resetMetrics();
+    rmSync(tmpBase, { recursive: true, force: true });
+  }
+});
+
+test("#1943 getProjectTotals reports correct cost after dedup (no 35% inflation)", () => {
+  // Simulate the exact scenario from the issue: 20 entries for a single dispatch
+  // with monotonically increasing token counts and 15s-apart finishedAt values
+  const startedAt = 1774011016218;
+  const baseCost = 1.50;
+  const duplicateUnits: UnitMetrics[] = [];
+  for (let i = 0; i < 20; i++) {
+    duplicateUnits.push(makeUnit({
+      type: "research-slice",
+      id: "M009/S02",
+      startedAt,
+      finishedAt: startedAt + (i + 1) * 15000,
+      cost: baseCost + i * 0.05,
+      toolCalls: 0,
+      tokens: {
+        input: 6600000 + i * 200000,
+        output: 100000 + i * 10000,
+        cacheRead: 0,
+        cacheWrite: 0,
+        total: 6700000 + i * 210000,
+      },
+    }));
+  }
+
+  // Without dedup, getProjectTotals would sum all 20 entries' costs
+  const rawTotals = getProjectTotals(duplicateUnits);
+  // With dedup (only last entry should count), cost should be the last entry's cost
+  const lastEntryCost = duplicateUnits[duplicateUnits.length - 1].cost;
+
+  // This test documents the bug: raw totals inflate cost by summing duplicates
+  assert.ok(
+    rawTotals.cost > lastEntryCost * 2,
+    "raw totals with duplicates inflate cost (bug demonstration)",
+  );
+
+  // After loading through initMetrics (which should dedup), totals should be correct
+  const tmpBase = mkdtempSync(join(tmpdir(), "gsd-metrics-cost-inflation-"));
+  mkdirSync(join(tmpBase, ".gsd"), { recursive: true });
+  try {
+    resetMetrics();
+
+    writeFileSync(
+      join(tmpBase, ".gsd", "metrics.json"),
+      JSON.stringify({ version: 1, projectStartedAt: 1700000000000, units: duplicateUnits }, null, 2),
+    );
+    initMetrics(tmpBase);
+    const ledger = getLedger()!;
+    const dedupedTotals = getProjectTotals(ledger.units);
+
+    assert.equal(ledger.units.length, 1, "20 duplicates should collapse to 1 entry");
+    assert.equal(
+      dedupedTotals.cost, lastEntryCost,
+      `deduped cost should be ${lastEntryCost}, not ${dedupedTotals.cost}`,
+    );
+  } finally {
+    resetMetrics();
+    rmSync(tmpBase, { recursive: true, force: true });
+  }
+});