fix(metrics): deduplicate idle-watchdog entries and fix forensics false-positives (#1973)

* fix(metrics): deduplicate idle-watchdog entries on ledger load and fix forensics false-positives

The idle watchdog creates duplicate metrics entries with the same
(type, id, startedAt) triple, inflating reported cost by ~35% and
causing false-positive stuck-loop anomalies in forensics.
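
For illustration, the on-disk pattern looks roughly like this (a minimal
sketch only; Snapshot stands in for the real UnitMetrics shape, and the
values are taken from the repro data in the tests below):

    // Watchdog snapshots of ONE dispatch: same (type, id, startedAt);
    // only finishedAt (and cost/tokens) advance between writes.
    type Snapshot = { type: string; id: string; startedAt: number; finishedAt: number };
    const snapshots: Snapshot[] = [
      { type: "research-slice", id: "M009/S02", startedAt: 1774011016218, finishedAt: 1774011031218 },
      { type: "research-slice", id: "M009/S02", startedAt: 1774011016218, finishedAt: 1774011046218 },
      { type: "research-slice", id: "M009/S02", startedAt: 1774011016218, finishedAt: 1774011061218 },
    ];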

Add a deduplicateUnits() pass in loadLedger() that collapses entries
sharing the same (type, id, startedAt) key, keeping the one with the
highest finishedAt. The cleaned ledger is persisted back to disk so
duplicates do not re-accumulate across sessions.
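
The collapse rule, as a standalone sketch (it mirrors deduplicateUnits() in
the diff below and reuses the illustrative Snapshot type above):

    // Keep one entry per (type, id, startedAt), preferring the highest finishedAt.
    function collapse(entries: Snapshot[]): Snapshot[] {
      const byKey = new Map<string, Snapshot>();
      for (const e of entries) {
        const key = `${e.type}\0${e.id}\0${e.startedAt}`;
        const prev = byKey.get(key);
        if (!prev || e.finishedAt > prev.finishedAt) byKey.set(key, e);
      }
      return [...byKey.values()];
    }

    collapse(snapshots); // one survivor: the finishedAt = 1774011061218 entry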

Fix detectStuckLoops() in forensics.ts to count distinct startedAt
values per type/id instead of raw entry count, so idle-watchdog
snapshots of the same dispatch are not flagged as stuck loops.
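
Sketched the same way (again on the illustrative Snapshot type), the new
counting rule is "distinct startedAt values per type/id", not raw entries:

    // More than one distinct startedAt for a type/id means a real re-dispatch.
    function dispatchCounts(entries: Snapshot[]): Map<string, number> {
      const starts = new Map<string, Set<number>>();
      for (const e of entries) {
        const key = `${e.type}/${e.id}`;
        let set = starts.get(key);
        if (!set) starts.set(key, (set = new Set()));
        set.add(e.startedAt);
      }
      return new Map([...starts].map(([k, s]) => [k, s.size]));
    }

    dispatchCounts(snapshots).get("research-slice/M009/S02"); // 1, so no anomaly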

Fixes #1943

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: export ForensicAnomaly type and use in test

The test declared a local ForensicAnomaly interface with `type: string`,
which is incompatible with the real union literal type, causing a CI
typecheck failure.
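
A minimal reproduction of the mismatch (hypothetical names, for reference only):

    interface LocalAnomaly { type: string }  // the test's old local declaration
    declare function check(anomalies: ForensicAnomaly[]): void;
    const anomalies: LocalAnomaly[] = [];
    // check(anomalies);
    //   rejected by tsc: 'string' is not assignable to the union of literal types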

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Tom Boucher 2026-04-05 00:54:44 -04:00 committed by GitHub
parent c8f11019f5
commit 3e7a8d8556
4 changed files with 268 additions and 7 deletions

@@ -38,7 +38,7 @@ import { ensurePreferencesFile, serializePreferencesToFrontmatter } from "./comm
 // ─── Types ────────────────────────────────────────────────────────────────────
-interface ForensicAnomaly {
+export interface ForensicAnomaly {
   type: "stuck-loop" | "cost-spike" | "timeout" | "missing-artifact" | "crash" | "doctor-issue" | "error-trace" | "journal-stuck" | "journal-guard-block" | "journal-rapid-iterations" | "journal-worktree-failure";
   severity: "info" | "warning" | "error";
   unitType?: string;
@@ -640,13 +640,29 @@ function getDbCompletionCounts(): DbCompletionCounts | null {
 // ─── Anomaly Detectors ───────────────────────────────────────────────────────
-function detectStuckLoops(units: UnitMetrics[], anomalies: ForensicAnomaly[]): void {
-  const counts = new Map<string, number>();
+/**
+ * Detect units that were dispatched multiple times (stuck in a loop).
+ *
+ * Counts distinct dispatches by grouping on (type, id, startedAt) first to
+ * collapse idle-watchdog duplicate snapshots (#1943), then counts unique
+ * startedAt values per type/id to determine actual dispatch count.
+ *
+ * Exported for testability.
+ */
+export function detectStuckLoops(units: UnitMetrics[], anomalies: ForensicAnomaly[]): void {
+  // First, collect unique startedAt values per type/id key
+  const dispatchMap = new Map<string, Set<number>>();
   for (const u of units) {
     const key = `${u.type}/${u.id}`;
-    counts.set(key, (counts.get(key) ?? 0) + 1);
+    let starts = dispatchMap.get(key);
+    if (!starts) {
+      starts = new Set();
+      dispatchMap.set(key, starts);
+    }
+    starts.add(u.startedAt);
   }
-  for (const [key, count] of counts) {
+  for (const [key, starts] of dispatchMap) {
+    const count = starts.size;
     if (count > 1) {
       const [unitType, ...idParts] = key.split("/");
       anomalies.push({

@@ -567,7 +567,34 @@ export function loadLedgerFromDisk(base: string): MetricsLedger | null {
 }
 
 function loadLedger(base: string): MetricsLedger {
-  return loadJsonFile(metricsPath(base), isMetricsLedger, defaultLedger);
+  const raw = loadJsonFile(metricsPath(base), isMetricsLedger, defaultLedger);
+  const before = raw.units.length;
+  raw.units = deduplicateUnits(raw.units);
+  if (raw.units.length < before) {
+    // Persist the cleaned ledger so duplicates don't re-accumulate
+    saveLedger(base, raw);
+  }
+  return raw;
 }
 
+/**
+ * Collapse duplicate entries with the same (type, id, startedAt) triple.
+ * Keeps the entry with the highest finishedAt (the most complete snapshot).
+ *
+ * This is a defensive measure against idle-watchdog race conditions that can
+ * produce duplicate entries on disk despite the in-memory idempotency guard
+ * in snapshotUnitMetrics(). See #1943.
+ */
+function deduplicateUnits(units: UnitMetrics[]): UnitMetrics[] {
+  const map = new Map<string, UnitMetrics>();
+  for (const u of units) {
+    const key = `${u.type}\0${u.id}\0${u.startedAt}`;
+    const existing = map.get(key);
+    if (!existing || u.finishedAt > existing.finishedAt) {
+      map.set(key, u);
+    }
+  }
+  return Array.from(map.values());
+}
+
 function saveLedger(base: string, data: MetricsLedger): void {

@@ -0,0 +1,103 @@
+/**
+ * Forensics detectStuckLoops tests #1943
+ *
+ * Verifies that detectStuckLoops counts distinct dispatches (unique startedAt
+ * values per type/id) instead of raw entry count, which produces false-positive
+ * stuck-loop anomalies when idle-watchdog duplicate metrics entries exist.
+ */
+
+import test from "node:test";
+import assert from "node:assert/strict";
+
+import type { UnitMetrics } from "../metrics.js";
+import { detectStuckLoops, type ForensicAnomaly } from "../forensics.js";
+
+// ── Helpers ──────────────────────────────────────────────────────────────────
+
+function makeUnit(overrides: Partial<UnitMetrics> = {}): UnitMetrics {
+  return {
+    type: "execute-task",
+    id: "M001/S01/T01",
+    model: "claude-sonnet-4-20250514",
+    startedAt: 1000,
+    finishedAt: 2000,
+    tokens: { input: 1000, output: 500, cacheRead: 200, cacheWrite: 100, total: 1800 },
+    cost: 0.05,
+    toolCalls: 3,
+    assistantMessages: 2,
+    userMessages: 1,
+    ...overrides,
+  };
+}
+
+// ── Tests ────────────────────────────────────────────────────────────────────
+
+test("#1943 detectStuckLoops does not flag idle-watchdog duplicates as stuck loops", () => {
+  const anomalies: ForensicAnomaly[] = [];
+  const startedAt = 1774011016218;
+
+  // 20 entries with the SAME startedAt — these are idle-watchdog duplicates,
+  // not real re-dispatches. They should count as 1 dispatch.
+  const units: UnitMetrics[] = [];
+  for (let i = 0; i < 20; i++) {
+    units.push(makeUnit({
+      type: "research-slice",
+      id: "M009/S02",
+      startedAt,
+      finishedAt: startedAt + (i + 1) * 15000,
+      cost: 1.50 + i * 0.05,
+      toolCalls: 0,
+    }));
+  }
+
+  detectStuckLoops(units, anomalies);
+
+  // A single dispatch (same startedAt) should NOT trigger a stuck-loop anomaly
+  assert.equal(
+    anomalies.length, 0,
+    `expected 0 anomalies for 20 watchdog snapshots of the same dispatch, got ${anomalies.length}: ${anomalies.map(a => a.summary).join(", ")}`,
+  );
+});
+
+test("#1943 detectStuckLoops correctly flags real re-dispatches", () => {
+  const anomalies: ForensicAnomaly[] = [];
+
+  // 3 entries with DIFFERENT startedAt values — these are real re-dispatches
+  const units: UnitMetrics[] = [
+    makeUnit({ type: "execute-task", id: "M001/S01/T01", startedAt: 1000, finishedAt: 2000, cost: 0.05 }),
+    makeUnit({ type: "execute-task", id: "M001/S01/T01", startedAt: 3000, finishedAt: 4000, cost: 0.06 }),
+    makeUnit({ type: "execute-task", id: "M001/S01/T01", startedAt: 5000, finishedAt: 6000, cost: 0.07 }),
+  ];
+
+  detectStuckLoops(units, anomalies);
+
+  assert.equal(anomalies.length, 1, "3 distinct dispatches of the same unit should flag 1 anomaly");
+  assert.equal(anomalies[0].type, "stuck-loop");
+  assert.ok(anomalies[0].summary.includes("3 times"), `summary should mention 3 dispatches: ${anomalies[0].summary}`);
+});
+
+test("#1943 detectStuckLoops ignores watchdog duplicates but flags real re-dispatches in mixed data", () => {
+  const anomalies: ForensicAnomaly[] = [];
+
+  const units: UnitMetrics[] = [
+    // 5 watchdog duplicates for dispatch 1 (same startedAt = 1000)
+    ...Array.from({ length: 5 }, (_, i) =>
+      makeUnit({ type: "execute-task", id: "M001/S01/T01", startedAt: 1000, finishedAt: 1000 + (i + 1) * 15000, cost: 0.05 + i * 0.01 }),
+    ),
+    // 3 watchdog duplicates for dispatch 2 (same startedAt = 100000)
+    ...Array.from({ length: 3 }, (_, i) =>
+      makeUnit({ type: "execute-task", id: "M001/S01/T01", startedAt: 100000, finishedAt: 100000 + (i + 1) * 15000, cost: 0.08 + i * 0.01 }),
+    ),
+    // 1 entry for dispatch 3 (startedAt = 200000)
+    makeUnit({ type: "execute-task", id: "M001/S01/T01", startedAt: 200000, finishedAt: 260000, cost: 0.10 }),
+    // Different unit — only 1 dispatch, should NOT be flagged
+    makeUnit({ type: "plan-slice", id: "M001/S01", startedAt: 500, finishedAt: 1500, cost: 0.02 }),
+  ];
+
+  detectStuckLoops(units, anomalies);
+
+  // M001/S01/T01 has 3 distinct dispatches (startedAt: 1000, 100000, 200000) — should be flagged
+  // M001/S01 has 1 dispatch — should NOT be flagged
+  assert.equal(anomalies.length, 1, `expected 1 anomaly (for the 3x dispatched task), got ${anomalies.length}`);
+  assert.ok(anomalies[0].summary.includes("3 times"));
+});

@@ -6,7 +6,7 @@
 import test from "node:test";
 import assert from "node:assert/strict";
-import { mkdtempSync, mkdirSync, readFileSync, rmSync } from "node:fs";
+import { mkdtempSync, mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs";
 import { join } from "node:path";
 import { tmpdir } from "node:os";
 import {
@@ -382,3 +382,118 @@ test("snapshotUnitMetrics counts toolCall blocks correctly (#1713)", () => {
     rmSync(tmpBase, { recursive: true, force: true });
   }
 });
+
+// ── #1943 — Duplicate metrics entries from idle watchdog ──────────────────────
+
+test("#1943 initMetrics deduplicates entries loaded from a corrupted disk ledger", () => {
+  const tmpBase = mkdtempSync(join(tmpdir(), "gsd-metrics-dedup-load-"));
+  mkdirSync(join(tmpBase, ".gsd"), { recursive: true });
+  try {
+    resetMetrics();
+
+    // Simulate a corrupted metrics.json with duplicate entries on disk
+    // (same type+id+startedAt but different finishedAt — idle watchdog pattern)
+    const corruptedLedger: MetricsLedger = {
+      version: 1,
+      projectStartedAt: 1700000000000,
+      units: [
+        makeUnit({ type: "research-slice", id: "M009/S02", startedAt: 1774011016218, finishedAt: 1774011031218, cost: 1.50, tokens: { input: 6600000, output: 100000, cacheRead: 0, cacheWrite: 0, total: 6700000 } }),
+        makeUnit({ type: "research-slice", id: "M009/S02", startedAt: 1774011016218, finishedAt: 1774011046218, cost: 1.55, tokens: { input: 6800000, output: 110000, cacheRead: 0, cacheWrite: 0, total: 6910000 } }),
+        makeUnit({ type: "research-slice", id: "M009/S02", startedAt: 1774011016218, finishedAt: 1774011061218, cost: 1.60, tokens: { input: 7000000, output: 120000, cacheRead: 0, cacheWrite: 0, total: 7120000 } }),
+        makeUnit({ type: "research-slice", id: "M009/S02", startedAt: 1774011016218, finishedAt: 1774011076218, cost: 1.65, tokens: { input: 7200000, output: 130000, cacheRead: 0, cacheWrite: 0, total: 7330000 } }),
+        // A different unit — should be preserved
+        makeUnit({ type: "execute-task", id: "M001/S01/T01", startedAt: 1774012000000, finishedAt: 1774012060000, cost: 0.50 }),
+      ],
+    };
+    writeFileSync(
+      join(tmpBase, ".gsd", "metrics.json"),
+      JSON.stringify(corruptedLedger, null, 2),
+    );
+
+    // Load the corrupted ledger — duplicates should be collapsed on load
+    initMetrics(tmpBase);
+    const ledger = getLedger();
+    assert.ok(ledger);
+
+    // The 4 entries with identical (type, id, startedAt) should collapse to 1,
+    // keeping the latest (highest finishedAt). Plus the 1 different unit = 2 total.
+    assert.equal(
+      ledger!.units.length, 2,
+      `expected 2 entries after dedup (1 collapsed group + 1 unique), got ${ledger!.units.length}`,
+    );
+
+    // The surviving duplicate should be the one with the latest finishedAt
+    const researchEntry = ledger!.units.find(u => u.type === "research-slice");
+    assert.ok(researchEntry);
+    assert.equal(researchEntry!.finishedAt, 1774011076218, "should keep the latest finishedAt");
+    assert.equal(researchEntry!.cost, 1.65, "should keep the latest cost");
+
+    // The on-disk file should also be deduplicated
+    const diskRaw = readFileSync(join(tmpBase, ".gsd", "metrics.json"), "utf-8");
+    const diskLedger: MetricsLedger = JSON.parse(diskRaw);
+    assert.equal(diskLedger.units.length, 2, "disk should also have deduplicated entries");
+  } finally {
+    resetMetrics();
+    rmSync(tmpBase, { recursive: true, force: true });
+  }
+});
+
+test("#1943 getProjectTotals reports correct cost after dedup (no 35% inflation)", () => {
+  // Simulate the exact scenario from the issue: 20 entries for a single dispatch
+  // with monotonically increasing token counts and 15s-apart finishedAt values
+  const startedAt = 1774011016218;
+  const baseCost = 1.50;
+  const duplicateUnits: UnitMetrics[] = [];
+  for (let i = 0; i < 20; i++) {
+    duplicateUnits.push(makeUnit({
+      type: "research-slice",
+      id: "M009/S02",
+      startedAt,
+      finishedAt: startedAt + (i + 1) * 15000,
+      cost: baseCost + i * 0.05,
+      toolCalls: 0,
+      tokens: {
+        input: 6600000 + i * 200000,
+        output: 100000 + i * 10000,
+        cacheRead: 0,
+        cacheWrite: 0,
+        total: 6700000 + i * 210000,
+      },
+    }));
+  }
+
+  // Without dedup, getProjectTotals would sum all 20 entries' costs
+  const rawTotals = getProjectTotals(duplicateUnits);
+  // With dedup (only last entry should count), cost should be the last entry's cost
+  const lastEntryCost = duplicateUnits[duplicateUnits.length - 1].cost;
+
+  // This test documents the bug: raw totals inflate cost by summing duplicates
+  assert.ok(
+    rawTotals.cost > lastEntryCost * 2,
+    "raw totals with duplicates inflate cost (bug demonstration)",
+  );
+
+  // After loading through initMetrics (which should dedup), totals should be correct
+  const tmpBase = mkdtempSync(join(tmpdir(), "gsd-metrics-cost-inflation-"));
+  mkdirSync(join(tmpBase, ".gsd"), { recursive: true });
+  try {
+    resetMetrics();
+
+    writeFileSync(
+      join(tmpBase, ".gsd", "metrics.json"),
+      JSON.stringify({ version: 1, projectStartedAt: 1700000000000, units: duplicateUnits }, null, 2),
+    );
+    initMetrics(tmpBase);
+    const ledger = getLedger()!;
+    const dedupedTotals = getProjectTotals(ledger.units);
+
+    assert.equal(ledger.units.length, 1, "20 duplicates should collapse to 1 entry");
+    assert.equal(
+      dedupedTotals.cost, lastEntryCost,
+      `deduped cost should be ${lastEntryCost}, not ${dedupedTotals.cost}`,
+    );
+  } finally {
+    resetMetrics();
+    rmSync(tmpBase, { recursive: true, force: true });
+  }
+});