feat(doctor): add 7 runtime health checks with auto-fix (#646)

* feat(doctor): add 7 runtime health checks with auto-fix

Add comprehensive runtime health monitoring to /gsd doctor:

- stale_crash_lock: detect dead auto.lock from crashed sessions, auto-clear
- orphaned_completed_units: find completed-unit keys referencing missing artifacts, auto-remove
- stale_hook_state: detect residual hook cycle counts with no running session, auto-clear
- activity_log_bloat: flag activity/ dir exceeding 500 files or 100MB, auto-prune (7-day retention)
- state_file_missing: detect missing STATE.md when milestones exist, auto-generate
- state_file_stale: detect STATE.md drift (wrong phase/milestone/slice), auto-rebuild
- gitignore_missing_patterns: detect missing critical GSD runtime patterns in .gitignore, auto-fix

All checks are non-fatal (gracefully degrade on read errors) and respect
the existing fix/fixLevel system. Includes 34 new test assertions across
9 test scenarios in doctor-runtime.test.ts.

* feat(doctor): add proactive healing layer for auto-mode

Three new mechanisms for automatic health monitoring:

1. Pre-dispatch health gate: runs before each unit dispatch in auto-mode.
   Checks for stale crash locks (auto-clears) and corrupt merge state
   (auto-heals via abortAndReset). Pauses auto-mode if critical issues
   can't be resolved.

2. Health score tracking: records error/warning/fix counts after each
   post-unit doctor run. Tracks trends (improving/stable/degrading)
   across a sliding window of 50 snapshots. Monitors consecutive
   error unit streaks.

3. Auto-heal escalation: when deterministic fixes can't resolve errors
   after 5 consecutive units AND health trend is not improving,
   automatically dispatches LLM-assisted heal (dispatchDoctorHeal).
   Single-fire per session to prevent spam. Defers escalation when
   trend is improving (fixes are working, just slowly).

Integration points in auto.ts:
- resetProactiveHealing() on start/stop
- preDispatchHealthGate() before deriveState in dispatchNextUnit
- recordHealthSnapshot() + checkHealEscalation() in post-unit hook
- formatHealthSummary() available for dashboard display

Includes 30 test assertions across 15 scenarios.
This commit is contained in:
Jeremy McSpadden 2026-03-16 11:34:26 -05:00 committed by GitHub
parent cb9191fa4f
commit 061d826a4e
6 changed files with 1174 additions and 4 deletions

View file

@ -64,7 +64,15 @@ import {
formatValidationIssues,
} from "./observability-validator.js";
import { ensureGitignore, untrackRuntimeFiles } from "./gitignore.js";
import { runGSDDoctor, rebuildState } from "./doctor.js";
import { runGSDDoctor, rebuildState, summarizeDoctorIssues } from "./doctor.js";
import {
preDispatchHealthGate,
recordHealthSnapshot,
checkHealEscalation,
resetProactiveHealing,
formatHealthSummary,
getConsecutiveErrorUnits,
} from "./doctor-proactive.js";
import { snapshotSkills, clearSkillSnapshot } from "./skill-discovery.js";
import { captureAvailableSkills, getAndClearSkills, resetSkillTelemetry } from "./skill-telemetry.js";
import {
@ -559,6 +567,7 @@ export async function stopAuto(ctx?: ExtensionContext, pi?: ExtensionAPI): Promi
completedUnits = [];
clearSliceProgressCache();
clearActivityLogState();
resetProactiveHealing();
pendingCrashRecovery = null;
_handlingAgentEnd = false;
ctx?.ui.setStatus("gsd-auto", undefined);
@ -858,6 +867,7 @@ export async function startAuto(
loadPersistedKeys(base, completedKeySet);
resetHookState();
restoreHookState(base);
resetProactiveHealing();
autoStartTime = Date.now();
resourceSyncedAtOnStart = readResourceSyncedAt();
completedUnits = [];
@ -1089,6 +1099,35 @@ export async function handleAgentEnd(
if (report.fixesApplied.length > 0) {
ctx.ui.notify(`Post-hook: applied ${report.fixesApplied.length} fix(es).`, "info");
}
// ── Proactive health tracking ──────────────────────────────────────
// Record health snapshot for trend analysis and escalation logic.
const summary = summarizeDoctorIssues(report.issues);
recordHealthSnapshot(summary.errors, summary.warnings, report.fixesApplied.length);
// Check if we should escalate to LLM-assisted heal
if (summary.errors > 0) {
const unresolvedErrors = report.issues
.filter(i => i.severity === "error" && !i.fixable)
.map(i => ({ code: i.code, message: i.message, unitId: i.unitId }));
const escalation = checkHealEscalation(summary.errors, unresolvedErrors);
if (escalation.shouldEscalate) {
ctx.ui.notify(
`Doctor heal escalation: ${escalation.reason}. Dispatching LLM-assisted heal.`,
"warning",
);
try {
const { formatDoctorIssuesForPrompt, formatDoctorReport } = await import("./doctor.js");
const { dispatchDoctorHeal } = await import("./commands.js");
const actionable = report.issues.filter(i => i.severity === "error");
const reportText = formatDoctorReport(report, { scope: doctorScope, includeWarnings: true });
const structuredIssues = formatDoctorIssuesForPrompt(actionable);
dispatchDoctorHeal(pi, doctorScope, reportText, structuredIssues);
} catch {
// Non-fatal — escalation dispatch failure
}
}
}
} catch {
// Non-fatal — doctor failure should never block dispatch
}
@ -1558,6 +1597,23 @@ async function dispatchNextUnit(
lastPromptCharCount = undefined;
lastBaselineCharCount = undefined;
// ── Pre-dispatch health gate ──────────────────────────────────────────
// Lightweight check for critical issues that would cause the next unit
// to fail or corrupt state. Auto-heals what it can, blocks on the rest.
try {
const healthGate = preDispatchHealthGate(basePath);
if (healthGate.fixesApplied.length > 0) {
ctx.ui.notify(`Pre-dispatch: ${healthGate.fixesApplied.join(", ")}`, "info");
}
if (!healthGate.proceed) {
ctx.ui.notify(healthGate.reason ?? "Pre-dispatch health check failed.", "error");
await pauseAuto(ctx, pi);
return;
}
} catch {
// Non-fatal — health gate failure should never block dispatch
}
let state = await deriveState(basePath);
let mid = state.activeMilestone?.id;
let midTitle = state.activeMilestone?.title;

View file

@ -41,7 +41,7 @@ import { handleUndo } from "./undo.js";
import { handleExport } from "./export.js";
import { nativeBranchList, nativeDetectMainBranch, nativeBranchListMerged, nativeBranchDelete, nativeForEachRef, nativeUpdateRef } from "./native-git-bridge.js";
function dispatchDoctorHeal(pi: ExtensionAPI, scope: string | undefined, reportText: string, structuredIssues: string): void {
export function dispatchDoctorHeal(pi: ExtensionAPI, scope: string | undefined, reportText: string, structuredIssues: string): void {
const workflowPath = process.env.GSD_WORKFLOW_PATH ?? join(process.env.HOME ?? "~", ".pi", "GSD-WORKFLOW.md");
const workflow = readFileSync(workflowPath, "utf-8");
const prompt = loadPrompt("doctor-heal", {

View file

@ -0,0 +1,286 @@
/**
* GSD Doctor Proactive Healing Layer
*
* Three mechanisms for automatic health monitoring during auto-mode:
*
* 1. Pre-dispatch health gate: lightweight check before each unit dispatch.
* Returns blocking issues that should pause auto-mode rather than
* dispatching into a broken state.
*
* 2. Health score tracking: tracks issue counts over time to detect
* degradation trends. If health is declining, surfaces a warning.
*
* 3. Auto-heal escalation: if deterministic fix can't resolve issues
* after N units, escalates to LLM-assisted heal dispatch.
*/
import { existsSync, readFileSync } from "node:fs";
import { join } from "node:path";
import { gsdRoot, resolveGsdRootFile } from "./paths.js";
import { readCrashLock, isLockProcessAlive, clearLock } from "./crash-recovery.js";
import { abortAndReset } from "./git-self-heal.js";
// ── Health Score Tracking ──────────────────────────────────────────────────
export interface HealthSnapshot {
timestamp: number;
errors: number;
warnings: number;
fixesApplied: number;
unitIndex: number; // which unit dispatch triggered this snapshot
}
/** In-memory health history for the current auto-mode session. */
let healthHistory: HealthSnapshot[] = [];
/** Count of consecutive units with unresolved errors. */
let consecutiveErrorUnits = 0;
/** Unit index counter for health tracking. */
let healthUnitIndex = 0;
/**
* Record a health snapshot after a doctor run.
* Called from the post-unit hook in auto.ts.
*/
export function recordHealthSnapshot(errors: number, warnings: number, fixesApplied: number): void {
healthUnitIndex++;
healthHistory.push({
timestamp: Date.now(),
errors,
warnings,
fixesApplied,
unitIndex: healthUnitIndex,
});
// Keep only the last 50 snapshots to bound memory
if (healthHistory.length > 50) {
healthHistory = healthHistory.slice(-50);
}
if (errors > 0) {
consecutiveErrorUnits++;
} else {
consecutiveErrorUnits = 0;
}
}
/**
* Get the current health trend.
* Returns "improving", "stable", "degrading", or "unknown" (not enough data).
*/
export function getHealthTrend(): "improving" | "stable" | "degrading" | "unknown" {
if (healthHistory.length < 3) return "unknown";
const recent = healthHistory.slice(-5);
const older = healthHistory.slice(-10, -5);
if (older.length === 0) return "unknown";
const recentAvg = recent.reduce((sum, s) => sum + s.errors + s.warnings, 0) / recent.length;
const olderAvg = older.reduce((sum, s) => sum + s.errors + s.warnings, 0) / older.length;
const delta = recentAvg - olderAvg;
if (delta > 1) return "degrading";
if (delta < -1) return "improving";
return "stable";
}
/**
 * Number of back-to-back units that finished with at least one
 * unresolved error. Resets to zero whenever a unit completes cleanly.
 */
export function getConsecutiveErrorUnits(): number {
  return consecutiveErrorUnits;
}
/**
 * Read-only view of the recorded snapshots (oldest first, newest last),
 * e.g. for a dashboard overlay. Bounded at 50 entries.
 */
export function getHealthHistory(): readonly HealthSnapshot[] {
  return healthHistory;
}
/**
 * Drop all health-tracking state: snapshot history, the
 * consecutive-error streak, and the unit counter.
 * Invoked when auto-mode starts or stops.
 */
export function resetHealthTracking(): void {
  healthUnitIndex = 0;
  consecutiveErrorUnits = 0;
  healthHistory = [];
}
// ── Pre-Dispatch Health Gate ───────────────────────────────────────────────
export interface PreDispatchHealthResult {
/** Whether the dispatch should proceed. */
proceed: boolean;
/** If blocked, the reason to show the user. */
reason?: string;
/** Issues found (for logging). */
issues: string[];
/** Whether fix was applied. */
fixesApplied: string[];
}
/**
 * Lightweight pre-dispatch health check. Runs fast checks that should
 * block dispatch if they fail — avoids dispatching into a broken state.
 *
 * This is NOT a full doctor run — it only checks critical, fast-to-evaluate
 * conditions that would cause the next unit to fail or corrupt state.
 *
 * @param basePath - project root containing `.git` and the GSD runtime dir
 * @returns `{ proceed: true }` when dispatch should continue. When blocked,
 *   `reason` carries a user-facing explanation. `issues` lists everything
 *   found (blocking or informational); `fixesApplied` lists auto-heals.
 */
export function preDispatchHealthGate(basePath: string): PreDispatchHealthResult {
  const issues: string[] = [];
  // Bug fix: previously any entry in `issues` blocked dispatch, so the
  // informational "STATE.md missing" note blocked despite the inline
  // comment saying it should not. Blocking problems are now tracked
  // separately; `issues` may also carry non-blocking notes.
  const blocking: string[] = [];
  const fixesApplied: string[] = [];
  // ── Stale crash lock blocks dispatch ──
  // If a stale lock exists, the crash recovery path should handle it,
  // not a new dispatch. This prevents double-dispatch after crashes.
  try {
    const lock = readCrashLock(basePath);
    if (lock && !isLockProcessAlive(lock)) {
      // Auto-clear it since we're about to dispatch anyway
      clearLock(basePath);
      fixesApplied.push("cleared stale auto.lock before dispatch");
    }
  } catch {
    // Non-fatal
  }
  // ── Corrupt merge/rebase state blocks dispatch ──
  // Dispatching a unit with MERGE_HEAD present will cause git operations to fail.
  try {
    const gitDir = join(basePath, ".git");
    if (existsSync(gitDir)) {
      const blockers = ["MERGE_HEAD", "rebase-apply", "rebase-merge"].filter(
        f => existsSync(join(gitDir, f)),
      );
      if (blockers.length > 0) {
        // Try to auto-heal
        try {
          const result = abortAndReset(basePath);
          fixesApplied.push(`pre-dispatch: cleaned merge state (${result.cleaned.join(", ")})`);
        } catch {
          const msg = `Corrupt git state: ${blockers.join(", ")}. Run /gsd doctor fix.`;
          issues.push(msg);
          blocking.push(msg);
        }
      }
    }
  } catch {
    // Non-fatal
  }
  // ── STATE.md existence check ──
  // If STATE.md is missing, deriveState will still work but the LLM
  // may get confused. Rebuild it silently.
  try {
    const stateFile = resolveGsdRootFile(basePath, "STATE");
    const milestonesDir = join(gsdRoot(basePath), "milestones");
    if (existsSync(milestonesDir) && !existsSync(stateFile)) {
      // Informational only — rebuilding happens in the post-hook, so this
      // must NOT block dispatch (hence not added to `blocking`).
      issues.push("STATE.md missing — will rebuild after this unit");
    }
  } catch {
    // Non-fatal
  }
  // If we had critical issues that couldn't be auto-healed, block dispatch
  if (blocking.length > 0) {
    return {
      proceed: false,
      reason: `Pre-dispatch health check failed:\n${blocking.map(i => ` - ${i}`).join("\n")}\nRun /gsd doctor fix to resolve.`,
      issues,
      fixesApplied,
    };
  }
  return { proceed: true, issues, fixesApplied };
}
// ── Auto-Heal Escalation ──────────────────────────────────────────────────
/** Threshold: escalate to LLM heal after this many consecutive error units. */
const ESCALATION_THRESHOLD = 5;
/** Whether an escalation has already been triggered this session (prevent spam). */
let escalationTriggered = false;
/**
 * Decide whether auto-heal should escalate from deterministic fixes to
 * an LLM-assisted heal. Called after each post-unit doctor run.
 *
 * Escalates at most once per session, and only when the error streak
 * has reached the threshold, errors remain, and the health trend is not
 * already improving on its own.
 */
export function checkHealEscalation(
  errors: number,
  unresolvedIssues: Array<{ code: string; message: string; unitId: string }>,
): { shouldEscalate: boolean; reason: string; issues: typeof unresolvedIssues } {
  const decline = (reason: string) => ({
    shouldEscalate: false,
    reason,
    issues: [] as typeof unresolvedIssues,
  });
  // Guard order matters: the single-fire latch wins over every other reason.
  if (escalationTriggered) {
    return decline("already escalated this session");
  }
  if (consecutiveErrorUnits < ESCALATION_THRESHOLD) {
    return decline(`${consecutiveErrorUnits}/${ESCALATION_THRESHOLD} consecutive error units`);
  }
  if (errors === 0) {
    return decline("no errors to escalate");
  }
  const trend = getHealthTrend();
  if (trend === "improving") {
    return decline("health is improving — deferring escalation");
  }
  // Latch so we fire only once per session (prevents dispatch spam).
  escalationTriggered = true;
  return {
    shouldEscalate: true,
    reason: `${consecutiveErrorUnits} consecutive units with unresolved errors (trend: ${trend})`,
    issues: unresolvedIssues,
  };
}
/**
 * Re-arm the once-per-session escalation latch.
 * Invoked when auto-mode starts or stops.
 */
export function resetEscalation(): void {
  escalationTriggered = false;
}
/**
 * One-line health summary for the auto-mode dashboard.
 *
 * Shows the latest error/warning counts, a trend indicator
 * (+ improving, - degrading, = stable, ? unknown), the total number of
 * fixes applied this session, and — when non-zero — the consecutive
 * error-unit streak relative to the escalation threshold.
 */
export function formatHealthSummary(): string {
  if (healthHistory.length === 0) return "No health data yet.";
  const latest = healthHistory[healthHistory.length - 1]!;
  const trend = getHealthTrend();
  // Fix: "unknown" previously rendered as "=", indistinguishable from
  // "stable"; it now renders as "?" so sparse data is visible as such.
  const trendIcon =
    trend === "improving" ? "+" :
    trend === "degrading" ? "-" :
    trend === "stable" ? "=" : "?";
  const totalFixes = healthHistory.reduce((sum, s) => sum + s.fixesApplied, 0);
  const parts = [
    `Health: ${latest.errors}E/${latest.warnings}W`,
    `trend:${trendIcon}`,
    `fixes:${totalFixes}`,
  ];
  if (consecutiveErrorUnits > 0) {
    parts.push(`streak:${consecutiveErrorUnits}/${ESCALATION_THRESHOLD}`);
  }
  return parts.join(" | ");
}
/**
 * Clear every piece of proactive-healing session state: health
 * tracking plus the escalation latch. Invoked on auto-mode start/stop.
 */
export function resetProactiveHealing(): void {
  resetHealthTracking();
  resetEscalation();
}

View file

@ -1,4 +1,4 @@
import { existsSync, mkdirSync } from "node:fs";
import { existsSync, mkdirSync, readdirSync, readFileSync, statSync, unlinkSync } from "node:fs";
import { join, sep } from "node:path";
import { loadFile, parsePlan, parseRoadmap, parseSummary, saveFile, parseTaskPlanMustHaves, countMustHavesMentionedInSummary } from "./files.js";
@ -9,6 +9,8 @@ import { listWorktrees } from "./worktree-manager.js";
import { abortAndReset } from "./git-self-heal.js";
import { RUNTIME_EXCLUSION_PATHS } from "./git-service.js";
import { nativeIsRepo, nativeWorktreeRemove, nativeBranchList, nativeBranchDelete, nativeLsFiles, nativeRmCached } from "./native-git-bridge.js";
import { readCrashLock, isLockProcessAlive, clearLock } from "./crash-recovery.js";
import { ensureGitignore } from "./gitignore.js";
export type DoctorSeverity = "info" | "warning" | "error";
export type DoctorIssueCode =
@ -32,7 +34,14 @@ export type DoctorIssueCode =
| "stale_milestone_branch"
| "corrupt_merge_state"
| "tracked_runtime_files"
| "legacy_slice_branches";
| "legacy_slice_branches"
| "stale_crash_lock"
| "orphaned_completed_units"
| "stale_hook_state"
| "activity_log_bloat"
| "state_file_stale"
| "state_file_missing"
| "gitignore_missing_patterns";
export interface DoctorIssue {
severity: DoctorSeverity;
@ -657,6 +666,275 @@ async function checkGitHealth(
}
}
// ── Runtime Health Checks ──────────────────────────────────────────────────
// Checks for stale crash locks, orphaned completed-units, stale hook state,
// activity log bloat, STATE.md drift, and gitignore drift.
/**
 * Run the runtime health checks and append findings to the accumulators.
 * Every check is wrapped in its own try/catch so a read failure in one
 * never suppresses the others (doctor must degrade gracefully).
 *
 * @param basePath - project root
 * @param issues - accumulator for detected issues (mutated in place)
 * @param fixesApplied - accumulator for fix descriptions (mutated in place)
 * @param shouldFix - predicate deciding whether a given issue code may be auto-fixed
 */
async function checkRuntimeHealth(
  basePath: string,
  issues: DoctorIssue[],
  fixesApplied: string[],
  shouldFix: (code: DoctorIssueCode) => boolean,
): Promise<void> {
  const root = gsdRoot(basePath);
  // ── Stale crash lock ──────────────────────────────────────────────────
  try {
    const lock = readCrashLock(basePath);
    if (lock) {
      const alive = isLockProcessAlive(lock);
      if (!alive) {
        issues.push({
          severity: "error",
          code: "stale_crash_lock",
          scope: "project",
          unitId: "project",
          message: `Stale auto.lock from PID ${lock.pid} (started ${lock.startedAt}, was executing ${lock.unitType} ${lock.unitId}) — process is no longer running`,
          file: ".gsd/auto.lock",
          fixable: true,
        });
        if (shouldFix("stale_crash_lock")) {
          clearLock(basePath);
          fixesApplied.push("cleared stale auto.lock");
        }
      }
    }
  } catch {
    // Non-fatal — crash lock check failed
  }
  // ── Orphaned completed-units keys ─────────────────────────────────────
  try {
    const completedKeysFile = join(root, "completed-units.json");
    if (existsSync(completedKeysFile)) {
      const raw = readFileSync(completedKeysFile, "utf-8");
      const keys: string[] = JSON.parse(raw);
      const orphaned: string[] = [];
      // Hoisted out of the loop: the dynamic import is loop-invariant
      // (previously it was awaited once per key, and again in the fix branch).
      const { verifyExpectedArtifact, removePersistedKey } = await import("./auto-recovery.js");
      for (const key of keys) {
        // Key format: "unitType/unitId" e.g. "execute-task/M001/S01/T01"
        const slashIdx = key.indexOf("/");
        if (slashIdx === -1) continue;
        const unitType = key.slice(0, slashIdx);
        const unitId = key.slice(slashIdx + 1);
        // Only validate artifact-producing unit types
        if (!verifyExpectedArtifact(unitType, unitId, basePath)) {
          orphaned.push(key);
        }
      }
      if (orphaned.length > 0) {
        issues.push({
          severity: "warning",
          code: "orphaned_completed_units",
          scope: "project",
          unitId: "project",
          message: `${orphaned.length} completed-unit key(s) reference missing artifacts: ${orphaned.slice(0, 3).join(", ")}${orphaned.length > 3 ? "..." : ""}`,
          file: ".gsd/completed-units.json",
          fixable: true,
        });
        if (shouldFix("orphaned_completed_units")) {
          for (const key of orphaned) {
            removePersistedKey(basePath, key);
          }
          fixesApplied.push(`removed ${orphaned.length} orphaned completed-unit key(s)`);
        }
      }
    }
  } catch {
    // Non-fatal — completed-units check failed
  }
  // ── Stale hook state ──────────────────────────────────────────────────
  try {
    const hookStateFile = join(root, "hook-state.json");
    if (existsSync(hookStateFile)) {
      const raw = readFileSync(hookStateFile, "utf-8");
      const state = JSON.parse(raw);
      const hasCycleCounts = state.cycleCounts && typeof state.cycleCounts === "object"
        && Object.keys(state.cycleCounts).length > 0;
      // Only flag if there are actual cycle counts AND no auto-mode is running
      if (hasCycleCounts) {
        const lock = readCrashLock(basePath);
        const autoRunning = lock ? isLockProcessAlive(lock) : false;
        if (!autoRunning) {
          issues.push({
            severity: "info",
            code: "stale_hook_state",
            scope: "project",
            unitId: "project",
            message: `hook-state.json has ${Object.keys(state.cycleCounts).length} residual cycle count(s) from a previous session`,
            file: ".gsd/hook-state.json",
            fixable: true,
          });
          if (shouldFix("stale_hook_state")) {
            const { clearPersistedHookState } = await import("./post-unit-hooks.js");
            clearPersistedHookState(basePath);
            fixesApplied.push("cleared stale hook-state.json");
          }
        }
      }
    }
  } catch {
    // Non-fatal — hook state check failed
  }
  // ── Activity log bloat ────────────────────────────────────────────────
  try {
    const activityDir = join(root, "activity");
    if (existsSync(activityDir)) {
      const files = readdirSync(activityDir);
      let totalSize = 0;
      for (const f of files) {
        try {
          totalSize += statSync(join(activityDir, f)).size;
        } catch {
          // stat failed — skip
        }
      }
      const totalMB = totalSize / (1024 * 1024);
      const BLOAT_FILE_THRESHOLD = 500;
      const BLOAT_SIZE_MB = 100;
      if (files.length > BLOAT_FILE_THRESHOLD || totalMB > BLOAT_SIZE_MB) {
        issues.push({
          severity: "warning",
          code: "activity_log_bloat",
          scope: "project",
          unitId: "project",
          message: `Activity logs: ${files.length} files, ${totalMB.toFixed(1)}MB (thresholds: ${BLOAT_FILE_THRESHOLD} files / ${BLOAT_SIZE_MB}MB)`,
          file: ".gsd/activity/",
          fixable: true,
        });
        if (shouldFix("activity_log_bloat")) {
          const { pruneActivityLogs } = await import("./activity-log.js");
          pruneActivityLogs(activityDir, 7); // 7-day retention
          fixesApplied.push("pruned activity logs (7-day retention)");
        }
      }
    }
  } catch {
    // Non-fatal — activity log check failed
  }
  // ── STATE.md health ───────────────────────────────────────────────────
  try {
    const stateFilePath = resolveGsdRootFile(basePath, "STATE");
    const milestonesPath = milestonesDir(basePath);
    if (existsSync(milestonesPath)) {
      if (!existsSync(stateFilePath)) {
        issues.push({
          severity: "warning",
          code: "state_file_missing",
          scope: "project",
          unitId: "project",
          message: "STATE.md is missing — state display will not work",
          file: ".gsd/STATE.md",
          fixable: true,
        });
        if (shouldFix("state_file_missing")) {
          const state = await deriveState(basePath);
          await saveFile(stateFilePath, buildStateMarkdown(state));
          fixesApplied.push("created STATE.md from derived state");
        }
      } else {
        // Check if STATE.md is stale by comparing active milestone/slice/phase
        const currentContent = readFileSync(stateFilePath, "utf-8");
        const state = await deriveState(basePath);
        const freshContent = buildStateMarkdown(state);
        // Extract key fields for comparison — don't compare full content
        // since timestamp/formatting differences are normal
        const extractFields = (content: string) => {
          const milestone = content.match(/\*\*Active Milestone:\*\*\s*(.+)/)?.[1]?.trim() ?? "";
          const slice = content.match(/\*\*Active Slice:\*\*\s*(.+)/)?.[1]?.trim() ?? "";
          const phase = content.match(/\*\*Phase:\*\*\s*(.+)/)?.[1]?.trim() ?? "";
          return { milestone, slice, phase };
        };
        const current = extractFields(currentContent);
        const fresh = extractFields(freshContent);
        if (current.milestone !== fresh.milestone || current.slice !== fresh.slice || current.phase !== fresh.phase) {
          issues.push({
            severity: "warning",
            code: "state_file_stale",
            scope: "project",
            unitId: "project",
            message: `STATE.md is stale — shows "${current.phase}" but derived state is "${fresh.phase}"`,
            file: ".gsd/STATE.md",
            fixable: true,
          });
          if (shouldFix("state_file_stale")) {
            await saveFile(stateFilePath, freshContent);
            fixesApplied.push("rebuilt STATE.md from derived state");
          }
        }
      }
    }
  } catch {
    // Non-fatal — STATE.md check failed
  }
  // ── Gitignore drift ───────────────────────────────────────────────────
  try {
    const gitignorePath = join(basePath, ".gitignore");
    if (existsSync(gitignorePath) && nativeIsRepo(basePath)) {
      const content = readFileSync(gitignorePath, "utf-8");
      const existingLines = new Set(
        content.split("\n").map(l => l.trim()).filter(l => l && !l.startsWith("#")),
      );
      // Check for critical runtime patterns that must be present
      const criticalPatterns = [
        ".gsd/activity/",
        ".gsd/runtime/",
        ".gsd/auto.lock",
        ".gsd/gsd.db",
        ".gsd/completed-units.json",
      ];
      // If blanket .gsd/ or .gsd is present, all patterns are covered
      const hasBlanketIgnore = existingLines.has(".gsd/") || existingLines.has(".gsd");
      if (!hasBlanketIgnore) {
        const missing = criticalPatterns.filter(p => !existingLines.has(p));
        if (missing.length > 0) {
          issues.push({
            severity: "warning",
            code: "gitignore_missing_patterns",
            scope: "project",
            unitId: "project",
            message: `${missing.length} critical GSD runtime pattern(s) missing from .gitignore: ${missing.join(", ")}`,
            file: ".gitignore",
            fixable: true,
          });
          if (shouldFix("gitignore_missing_patterns")) {
            ensureGitignore(basePath);
            fixesApplied.push("added missing GSD runtime patterns to .gitignore");
          }
        }
      }
    }
  } catch {
    // Non-fatal — gitignore check failed
  }
}
export async function runGSDDoctor(basePath: string, options?: { fix?: boolean; scope?: string; fixLevel?: "task" | "all" }): Promise<DoctorReport> {
const issues: DoctorIssue[] = [];
const fixesApplied: string[] = [];
@ -700,6 +978,9 @@ export async function runGSDDoctor(basePath: string, options?: { fix?: boolean;
// Git health checks (orphaned worktrees, stale branches, corrupt merge state, tracked runtime files)
await checkGitHealth(basePath, issues, fixesApplied, shouldFix);
// Runtime health checks (crash locks, completed-units, hook state, activity logs, STATE.md, gitignore)
await checkRuntimeHealth(basePath, issues, fixesApplied, shouldFix);
const milestonesPath = milestonesDir(basePath);
if (!existsSync(milestonesPath)) {
return { ok: issues.every(issue => issue.severity !== "error"), basePath, issues, fixesApplied };

View file

@ -0,0 +1,244 @@
/**
* doctor-proactive.test.ts — Tests for the proactive healing layer.
*
* Tests:
* - Pre-dispatch health gate (stale lock, merge state)
* - Health score tracking (snapshots, trends)
* - Auto-heal escalation (consecutive errors, threshold)
*/
import { mkdtempSync, mkdirSync, writeFileSync, rmSync, existsSync, realpathSync } from "node:fs";
import { join } from "node:path";
import { tmpdir } from "node:os";
import { execSync } from "node:child_process";
import {
preDispatchHealthGate,
recordHealthSnapshot,
getHealthTrend,
getConsecutiveErrorUnits,
getHealthHistory,
checkHealEscalation,
resetProactiveHealing,
formatHealthSummary,
} from "../doctor-proactive.ts";
import { createTestContext } from "./test-helpers.ts";
const { assertEq, assertTrue, report } = createTestContext();
/** Execute a shell command in `cwd` and return its trimmed stdout. */
function run(cmd: string, cwd: string): string {
  const stdout = execSync(cmd, {
    cwd,
    encoding: "utf-8",
    stdio: ["ignore", "pipe", "pipe"],
  });
  return stdout.trim();
}
/**
 * Create a throwaway git repository (one commit on `main`, plus an
 * empty `.gsd/` dir) under the OS temp directory. Caller removes it.
 */
function createGitRepo(): string {
  const dir = realpathSync(mkdtempSync(join(tmpdir(), "doc-proactive-")));
  const setup = [
    "git init",
    "git config user.email test@test.com",
    "git config user.name Test",
  ];
  for (const cmd of setup) {
    run(cmd, dir);
  }
  writeFileSync(join(dir, "README.md"), "# test\n");
  run("git add .", dir);
  run("git commit -m init", dir);
  run("git branch -M main", dir);
  mkdirSync(join(dir, ".gsd"), { recursive: true });
  return dir;
}
/**
 * Test driver for the proactive healing layer.
 *
 * NOTE(review): scenarios share the module-level state inside
 * doctor-proactive.ts, and ordering is load-bearing — the
 * "no double escalation" scenario deliberately relies on the latch set
 * by the preceding "at threshold" scenario. Do not reorder blocks.
 */
async function main(): Promise<void> {
  // Temp dirs created by filesystem scenarios; removed in `finally`.
  const cleanups: string[] = [];
  try {
    // ─── Health Score Tracking ─────────────────────────────────────────
    console.log("\n=== health tracking: initial state ===");
    {
      resetProactiveHealing();
      assertEq(getHealthTrend(), "unknown", "trend is unknown with no data");
      assertEq(getConsecutiveErrorUnits(), 0, "no consecutive errors initially");
      assertEq(getHealthHistory().length, 0, "no history initially");
    }
    console.log("\n=== health tracking: recording snapshots ===");
    {
      resetProactiveHealing();
      recordHealthSnapshot(0, 2, 1);
      recordHealthSnapshot(0, 1, 0);
      recordHealthSnapshot(0, 0, 0);
      assertEq(getHealthHistory().length, 3, "3 snapshots recorded");
      assertEq(getConsecutiveErrorUnits(), 0, "no consecutive errors after clean units");
    }
    console.log("\n=== health tracking: consecutive error counting ===");
    {
      resetProactiveHealing();
      recordHealthSnapshot(2, 1, 0); // errors
      recordHealthSnapshot(1, 0, 0); // errors
      recordHealthSnapshot(1, 0, 0); // errors
      assertEq(getConsecutiveErrorUnits(), 3, "3 consecutive error units");
      recordHealthSnapshot(0, 0, 0); // clean
      assertEq(getConsecutiveErrorUnits(), 0, "streak reset on clean unit");
    }
    console.log("\n=== health tracking: trend detection ===");
    {
      resetProactiveHealing();
      // Record 5 older snapshots with low issues
      for (let i = 0; i < 5; i++) {
        recordHealthSnapshot(0, 1, 0);
      }
      // Record 5 recent snapshots with high issues
      for (let i = 0; i < 5; i++) {
        recordHealthSnapshot(3, 5, 0);
      }
      assertEq(getHealthTrend(), "degrading", "detects degrading trend");
    }
    console.log("\n=== health tracking: improving trend ===");
    {
      resetProactiveHealing();
      // Record 5 older snapshots with high issues
      for (let i = 0; i < 5; i++) {
        recordHealthSnapshot(3, 5, 0);
      }
      // Record 5 recent snapshots with low issues
      for (let i = 0; i < 5; i++) {
        recordHealthSnapshot(0, 0, 0);
      }
      assertEq(getHealthTrend(), "improving", "detects improving trend");
    }
    console.log("\n=== health tracking: stable trend ===");
    {
      resetProactiveHealing();
      // Constant load across the whole window — delta is 0.
      for (let i = 0; i < 10; i++) {
        recordHealthSnapshot(1, 1, 0);
      }
      assertEq(getHealthTrend(), "stable", "detects stable trend");
    }
    // ─── Auto-Heal Escalation ─────────────────────────────────────────
    console.log("\n=== escalation: below threshold ===");
    {
      resetProactiveHealing();
      recordHealthSnapshot(1, 0, 0);
      recordHealthSnapshot(1, 0, 0);
      recordHealthSnapshot(1, 0, 0);
      const result = checkHealEscalation(1, [{ code: "test", message: "test error", unitId: "M001/S01" }]);
      assertEq(result.shouldEscalate, false, "no escalation below threshold");
      assertTrue(result.reason.includes("3/5"), "reason shows progress toward threshold");
    }
    console.log("\n=== escalation: at threshold ===");
    {
      resetProactiveHealing();
      // Need 5+ consecutive error units AND degrading/stable trend
      for (let i = 0; i < 5; i++) {
        recordHealthSnapshot(0, 0, 0); // older clean snapshots
      }
      for (let i = 0; i < 5; i++) {
        recordHealthSnapshot(2, 1, 0); // recent error snapshots
      }
      const result = checkHealEscalation(2, [{ code: "test", message: "test error", unitId: "M001/S01" }]);
      assertEq(result.shouldEscalate, true, "escalates at threshold with degrading trend");
      assertTrue(result.reason.includes("5 consecutive"), "reason mentions consecutive count");
    }
    console.log("\n=== escalation: no double escalation ===");
    {
      // Don't reset — should already be escalated from previous test
      recordHealthSnapshot(2, 0, 0);
      const result = checkHealEscalation(2, [{ code: "test", message: "test error", unitId: "M001/S01" }]);
      assertEq(result.shouldEscalate, false, "no double escalation in same session");
      assertTrue(result.reason.includes("already escalated"), "reason explains why no escalation");
    }
    console.log("\n=== escalation: deferred when improving ===");
    {
      resetProactiveHealing();
      // 5 older snapshots with high errors
      for (let i = 0; i < 5; i++) {
        recordHealthSnapshot(5, 5, 0);
      }
      // 5 recent snapshots with fewer errors (still > 0)
      for (let i = 0; i < 5; i++) {
        recordHealthSnapshot(1, 0, 0);
      }
      const result = checkHealEscalation(1, [{ code: "test", message: "test error", unitId: "M001/S01" }]);
      assertEq(result.shouldEscalate, false, "no escalation when trend is improving");
      assertTrue(result.reason.includes("improving"), "reason mentions improving trend");
    }
    // ─── Health Summary Formatting ────────────────────────────────────
    console.log("\n=== formatHealthSummary ===");
    {
      resetProactiveHealing();
      assertEq(formatHealthSummary(), "No health data yet.", "empty summary when no data");
      recordHealthSnapshot(2, 3, 1);
      const summary = formatHealthSummary();
      assertTrue(summary.includes("2E/3W"), "summary includes error/warning counts");
      assertTrue(summary.includes("fixes:1"), "summary includes fix count");
      assertTrue(summary.includes("streak:1/5"), "summary includes error streak");
    }
    // ─── Pre-Dispatch Health Gate ─────────────────────────────────────
    console.log("\n=== health gate: clean state ===");
    {
      const dir = realpathSync(mkdtempSync(join(tmpdir(), "doc-proactive-")));
      cleanups.push(dir);
      mkdirSync(join(dir, ".gsd"), { recursive: true });
      const result = preDispatchHealthGate(dir);
      assertTrue(result.proceed, "gate passes on clean state");
      assertEq(result.issues.length, 0, "no issues on clean state");
    }
    console.log("\n=== health gate: stale crash lock auto-cleared ===");
    {
      const dir = realpathSync(mkdtempSync(join(tmpdir(), "doc-proactive-")));
      cleanups.push(dir);
      mkdirSync(join(dir, ".gsd"), { recursive: true });
      // Write a stale lock
      writeFileSync(join(dir, ".gsd", "auto.lock"), JSON.stringify({
        pid: 9999999, startedAt: "2026-03-10T00:00:00Z",
        unitType: "execute-task", unitId: "M001/S01/T01",
        unitStartedAt: "2026-03-10T00:01:00Z", completedUnits: 3,
      }));
      const result = preDispatchHealthGate(dir);
      assertTrue(result.proceed, "gate passes after auto-clearing stale lock");
      assertTrue(result.fixesApplied.some(f => f.includes("cleared stale auto.lock")), "reports lock cleared");
      assertTrue(!existsSync(join(dir, ".gsd", "auto.lock")), "lock file removed");
    }
    console.log("\n=== health gate: corrupt merge state auto-healed ===");
    // NOTE(review): skipped on Windows — presumably because abortAndReset
    // relies on POSIX-style git cleanup behavior; confirm against git-self-heal.
    if (process.platform !== "win32") {
      {
        const dir = createGitRepo();
        cleanups.push(dir);
        // Inject MERGE_HEAD
        const headHash = run("git rev-parse HEAD", dir);
        writeFileSync(join(dir, ".git", "MERGE_HEAD"), headHash + "\n");
        const result = preDispatchHealthGate(dir);
        assertTrue(result.proceed, "gate passes after auto-healing merge state");
        assertTrue(result.fixesApplied.some(f => f.includes("cleaned merge state")), "reports merge state cleaned");
        assertTrue(!existsSync(join(dir, ".git", "MERGE_HEAD")), "MERGE_HEAD removed");
      }
    } else {
      console.log("  (skipped on Windows)");
    }
  } finally {
    // Always restore module state and remove temp dirs, even on failure.
    resetProactiveHealing();
    for (const dir of cleanups) {
      try { rmSync(dir, { recursive: true, force: true }); } catch { /* ignore */ }
    }
  }
  report();
}
main();

View file

@ -0,0 +1,303 @@
/**
* doctor-runtime.test.ts — Tests for the doctor runtime health checks.
*
* Tests detection and auto-fix of:
* stale_crash_lock, orphaned_completed_units, stale_hook_state,
* activity_log_bloat, state_file_missing, state_file_stale,
* gitignore_missing_patterns
*/
// Node stdlib helpers for building throwaway fixture projects on disk.
import { mkdtempSync, mkdirSync, writeFileSync, rmSync, existsSync, readFileSync, realpathSync } from "node:fs";
import { join } from "node:path";
import { tmpdir } from "node:os";
import { execSync } from "node:child_process";
// Unit under test plus the project's shared test harness.
import { runGSDDoctor } from "../doctor.ts";
import { createTestContext } from "./test-helpers.ts";
// Shared assertion helpers and the end-of-run reporter (see test-helpers.ts).
const { assertEq, assertTrue, report } = createTestContext();
/**
 * Run a shell command in the given working directory and return its
 * trimmed stdout. Stdin is ignored; stdout and stderr are captured.
 * Throws (via execSync) if the command exits nonzero.
 */
function run(cmd: string, cwd: string): string {
  const output = execSync(cmd, {
    cwd,
    stdio: ["ignore", "pipe", "pipe"],
    encoding: "utf-8",
  });
  return output.trim();
}
/** Create a minimal .gsd project with a milestone for STATE.md tests. */
function createMinimalProject(): string {
const dir = realpathSync(mkdtempSync(join(tmpdir(), "doc-runtime-test-")));
const msDir = join(dir, ".gsd", "milestones", "M001");
mkdirSync(msDir, { recursive: true });
writeFileSync(join(msDir, "M001-ROADMAP.md"), `# M001: Test
## Slices
- [ ] **S01: Demo** \`risk:low\` \`depends:[]\`
> After this: done
`);
const sDir = join(msDir, "slices", "S01", "tasks");
mkdirSync(sDir, { recursive: true });
writeFileSync(join(msDir, "slices", "S01", "S01-PLAN.md"), `# S01: Demo
**Goal:** Demo
## Tasks
- [ ] **T01: Do thing** \`est:10m\`
`);
return dir;
}
/** Create a minimal git repo with .gsd for gitignore tests. */
function createGitProject(): string {
const dir = realpathSync(mkdtempSync(join(tmpdir(), "doc-runtime-git-")));
run("git init", dir);
run("git config user.email test@test.com", dir);
run("git config user.name Test", dir);
writeFileSync(join(dir, "README.md"), "# test\n");
run("git add .", dir);
run("git commit -m init", dir);
run("git branch -M main", dir);
return dir;
}
/**
 * Entry point: runs all nine doctor-runtime scenarios in sequence.
 * Each scenario builds an isolated temp project, invokes runGSDDoctor in
 * detect mode and (where a fix exists) again with { fix: true }, then
 * asserts on the reported issues / fixesApplied. Every temp dir is
 * registered in `cleanups` so the finally block removes it even when an
 * assertion throws.
 */
async function main(): Promise<void> {
// Fixture directories created by the scenarios below; removed in finally.
const cleanups: string[] = [];
try {
// ─── Test 1: Stale crash lock detection & fix ─────────────────────
console.log("\n=== stale_crash_lock ===");
{
const dir = createMinimalProject();
cleanups.push(dir);
// Write a lock file with a PID that is definitely dead (use PID 1 million+)
const lockData = {
pid: 9999999,
startedAt: "2026-03-10T00:00:00Z",
unitType: "execute-task",
unitId: "M001/S01/T01",
unitStartedAt: "2026-03-10T00:01:00Z",
completedUnits: 3,
};
writeFileSync(join(dir, ".gsd", "auto.lock"), JSON.stringify(lockData, null, 2));
const detect = await runGSDDoctor(dir);
const lockIssues = detect.issues.filter(i => i.code === "stale_crash_lock");
assertTrue(lockIssues.length > 0, "detects stale crash lock");
assertTrue(lockIssues[0]?.message.includes("9999999"), "message includes PID");
assertTrue(lockIssues[0]?.fixable === true, "stale lock is fixable");
// Second run with fix enabled should remove the lock file on disk.
const fixed = await runGSDDoctor(dir, { fix: true });
assertTrue(fixed.fixesApplied.some(f => f.includes("cleared stale auto.lock")), "fix clears stale lock");
assertTrue(!existsSync(join(dir, ".gsd", "auto.lock")), "auto.lock removed after fix");
}
// ─── Test 2: No false positive for missing lock ───────────────────
console.log("\n=== stale_crash_lock — no false positive ===");
{
const dir = createMinimalProject();
cleanups.push(dir);
const detect = await runGSDDoctor(dir);
const lockIssues = detect.issues.filter(i => i.code === "stale_crash_lock");
assertEq(lockIssues.length, 0, "no stale lock issue when no lock file exists");
}
// ─── Test 3: Stale hook state detection & fix ─────────────────────
console.log("\n=== stale_hook_state ===");
{
const dir = createMinimalProject();
cleanups.push(dir);
// Write hook state with active cycle counts and no auto.lock (no running session)
const hookState = {
cycleCounts: {
"code-review/execute-task/M001/S01/T01": 2,
"lint-check/execute-task/M001/S01/T02": 1,
},
savedAt: "2026-03-10T00:00:00Z",
};
writeFileSync(join(dir, ".gsd", "hook-state.json"), JSON.stringify(hookState, null, 2));
const detect = await runGSDDoctor(dir);
const hookIssues = detect.issues.filter(i => i.code === "stale_hook_state");
assertTrue(hookIssues.length > 0, "detects stale hook state");
assertTrue(hookIssues[0]?.message.includes("2 residual cycle count"), "message includes count");
const fixed = await runGSDDoctor(dir, { fix: true });
assertTrue(fixed.fixesApplied.some(f => f.includes("cleared stale hook-state.json")), "fix clears hook state");
// Verify the file was cleaned
const content = JSON.parse(readFileSync(join(dir, ".gsd", "hook-state.json"), "utf-8"));
assertEq(Object.keys(content.cycleCounts).length, 0, "hook state cycle counts cleared");
}
// ─── Test 4: Activity log bloat detection ─────────────────────────
console.log("\n=== activity_log_bloat ===");
{
const dir = createMinimalProject();
cleanups.push(dir);
// Create an activity dir with > 500 files
const activityDir = join(dir, ".gsd", "activity");
mkdirSync(activityDir, { recursive: true });
for (let i = 0; i < 510; i++) {
writeFileSync(join(activityDir, `${String(i).padStart(3, "0")}-execute-task-M001-S01-T01.jsonl`), `{"test":${i}}\n`);
}
const detect = await runGSDDoctor(dir);
const bloatIssues = detect.issues.filter(i => i.code === "activity_log_bloat");
assertTrue(bloatIssues.length > 0, "detects activity log bloat");
assertTrue(bloatIssues[0]?.message.includes("510 files"), "message includes file count");
// Detection only — the auto-prune fix is not exercised in this scenario.
}
// ─── Test 5: STATE.md missing detection & fix ─────────────────────
console.log("\n=== state_file_missing ===");
{
const dir = createMinimalProject();
cleanups.push(dir);
// No STATE.md exists by default in our minimal setup
const stateFilePath = join(dir, ".gsd", "STATE.md");
assertTrue(!existsSync(stateFilePath), "STATE.md does not exist initially");
const detect = await runGSDDoctor(dir);
const stateIssues = detect.issues.filter(i => i.code === "state_file_missing");
assertTrue(stateIssues.length > 0, "detects missing STATE.md");
assertTrue(stateIssues[0]?.fixable === true, "missing STATE.md is fixable");
assertEq(stateIssues[0]?.severity, "warning", "missing STATE.md is a warning (derived file)");
const fixed = await runGSDDoctor(dir, { fix: true });
assertTrue(fixed.fixesApplied.some(f => f.includes("created STATE.md")), "fix creates STATE.md");
assertTrue(existsSync(stateFilePath), "STATE.md exists after fix");
// Verify content has expected structure
const content = readFileSync(stateFilePath, "utf-8");
assertTrue(content.includes("# GSD State"), "STATE.md has header");
assertTrue(content.includes("M001"), "STATE.md references milestone");
}
// ─── Test 6: STATE.md stale detection & fix ───────────────────────
console.log("\n=== state_file_stale ===");
{
const dir = createMinimalProject();
cleanups.push(dir);
// Write a STATE.md with wrong phase/milestone info
const stateFilePath = join(dir, ".gsd", "STATE.md");
writeFileSync(stateFilePath, `# GSD State
**Active Milestone:** None
**Active Slice:** None
**Phase:** idle
## Milestone Registry
## Recent Decisions
- None recorded
## Blockers
- None
## Next Action
None
`);
const detect = await runGSDDoctor(dir);
const staleIssues = detect.issues.filter(i => i.code === "state_file_stale");
assertTrue(staleIssues.length > 0, "detects stale STATE.md");
assertTrue(staleIssues[0]?.message.includes("idle"), "message references old phase");
const fixed = await runGSDDoctor(dir, { fix: true });
assertTrue(fixed.fixesApplied.some(f => f.includes("rebuilt STATE.md")), "fix rebuilds STATE.md");
// Verify updated content matches derived state
const content = readFileSync(stateFilePath, "utf-8");
assertTrue(content.includes("M001"), "rebuilt STATE.md references milestone");
}
// ─── Test 7: Gitignore missing patterns detection & fix ───────────
// NOTE(review): the gitignore scenarios are skipped on Windows —
// presumably due to git/path behavior differences there; confirm intent.
if (process.platform !== "win32") {
console.log("\n=== gitignore_missing_patterns ===");
{
const dir = createGitProject();
cleanups.push(dir);
// Create .gsd dir so checks can run
mkdirSync(join(dir, ".gsd"), { recursive: true });
// Write a .gitignore missing GSD runtime patterns
writeFileSync(join(dir, ".gitignore"), `node_modules/
.env
`);
const detect = await runGSDDoctor(dir);
const gitignoreIssues = detect.issues.filter(i => i.code === "gitignore_missing_patterns");
assertTrue(gitignoreIssues.length > 0, "detects missing gitignore patterns");
assertTrue(gitignoreIssues[0]?.message.includes(".gsd/activity/"), "message lists missing patterns");
const fixed = await runGSDDoctor(dir, { fix: true });
assertTrue(fixed.fixesApplied.some(f => f.includes("added missing GSD runtime patterns")), "fix adds patterns");
// Verify patterns were added
const content = readFileSync(join(dir, ".gitignore"), "utf-8");
assertTrue(content.includes(".gsd/activity/"), "gitignore now has activity pattern");
assertTrue(content.includes(".gsd/auto.lock"), "gitignore now has auto.lock pattern");
}
} else {
console.log("\n=== gitignore_missing_patterns (skipped on Windows) ===");
}
// ─── Test 8: No false positive when gitignore has blanket .gsd/ ───
if (process.platform !== "win32") {
console.log("\n=== gitignore — blanket .gsd/ ===");
{
const dir = createGitProject();
cleanups.push(dir);
mkdirSync(join(dir, ".gsd"), { recursive: true });
// A blanket .gsd/ ignore covers every runtime pattern already.
writeFileSync(join(dir, ".gitignore"), `.gsd/
node_modules/
`);
const detect = await runGSDDoctor(dir);
const gitignoreIssues = detect.issues.filter(i => i.code === "gitignore_missing_patterns");
assertEq(gitignoreIssues.length, 0, "no missing patterns when blanket .gsd/ present");
}
} else {
console.log("\n=== gitignore — blanket .gsd/ (skipped on Windows) ===");
}
// ─── Test 9: Orphaned completed-units detection & fix ─────────────
console.log("\n=== orphaned_completed_units ===");
{
const dir = createMinimalProject();
cleanups.push(dir);
// Write completed-units.json with keys that reference non-existent artifacts
const completedKeys = [
"execute-task/M001/S01/T99", // T99 doesn't exist
"complete-slice/M001/S99", // S99 doesn't exist
];
writeFileSync(join(dir, ".gsd", "completed-units.json"), JSON.stringify(completedKeys));
const detect = await runGSDDoctor(dir);
const orphanIssues = detect.issues.filter(i => i.code === "orphaned_completed_units");
assertTrue(orphanIssues.length > 0, "detects orphaned completed-unit keys");
assertTrue(orphanIssues[0]?.message.includes("2 completed-unit key"), "message includes count");
const fixed = await runGSDDoctor(dir, { fix: true });
assertTrue(fixed.fixesApplied.some(f => f.includes("removed") && f.includes("orphaned")), "fix removes orphaned keys");
// Verify keys were cleaned
const content = JSON.parse(readFileSync(join(dir, ".gsd", "completed-units.json"), "utf-8"));
assertEq(content.length, 0, "all orphaned keys removed");
}
} finally {
// Best-effort cleanup: a failed fixture removal must not mask a result.
for (const dir of cleanups) {
try { rmSync(dir, { recursive: true, force: true }); } catch { /* ignore */ }
}
}
// Summarize assertion results (reporter provided by createTestContext).
report();
}
// Run the suite. The explicit catch turns an unexpected rejection into a
// logged error plus a nonzero exit code, instead of leaving a floating
// promise and relying on Node's unhandled-rejection crash behavior.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});