/**
 * Timeout recovery logic for autonomous mode units.
 * Handles idle and hard timeout recovery with escalation, steering messages,
 * and blocker placeholder generation.
 */
import { existsSync, readFileSync } from "node:fs";
import { relative } from "node:path";

import { resolveAgentEnd } from "./auto-loop.js";
import {
  diagnoseExpectedArtifact,
  resolveExpectedArtifactPath,
  verifyExpectedArtifact,
  writeBlockerPlaceholder,
} from "./auto-recovery.js";
import { parseRoadmap } from "./parsers.js";
import {
  relMilestoneFile,
  relSliceFile,
  resolveMilestoneFile,
  resolveSliceFile,
} from "./paths.js";
import { getSlice, isDbAvailable } from "./sf-db.js";
import { parseUnitId } from "./unit-id.js";
import {
  formatExecuteTaskRecoveryStatus,
  inspectExecuteTaskDurability,
  readUnitRuntimeRecord,
  writeUnitRuntimeRecord,
} from "./uok/unit-runtime.js";
/**
 * Express `path` relative to `basePath` when the result stays inside the
 * base directory; otherwise fall back to the original path unchanged.
 *
 * @param {string} basePath - Directory to relativize against.
 * @param {string} path - Absolute or arbitrary path to shorten.
 * @returns {string} Relative path, or `path` when it escapes the base
 *   (result starts with "..") or equals the base (empty relative).
 */
function relToBase(basePath, path) {
  const candidate = relative(basePath, path);
  if (candidate === "" || candidate.startsWith("..")) {
    return path;
  }
  return candidate;
}
/**
 * Render a DB slice status as a human-readable mismatch note.
 *
 * @param {string|null|undefined} status - Status value from the DB row.
 * @returns {string} Descriptive note, or "" when no status is available.
 */
function formatSliceDbStatus(status) {
  if (!status) {
    return "";
  }
  return `DB slice status is "${status}" (expected complete)`;
}
/**
 * Inspect the durable on-disk/DB state of a unit after a timeout.
 *
 * Collects a human-readable list of durability gaps: missing primary
 * artifact, and — for "complete-slice" units — missing UAT file plus either
 * a non-complete DB status or (when the DB is unavailable) an unchecked
 * roadmap checkbox.
 *
 * @param {string} unitType - Unit kind (e.g. "complete-slice").
 * @param {string} unitId - Unit identifier; for slices parseUnitId extracts
 *   milestone and slice ids from it.
 * @param {string} basePath - Project root used to resolve artifact paths.
 * @returns {{ expected: string, verified: boolean, missing: string[] }}
 *   `expected` names the primary artifact, `verified` is the overall
 *   verifyExpectedArtifact result, `missing` lists gaps (empty when all
 *   durable artifacts are present).
 */
export function inspectUnitRecoveryStatus(unitType, unitId, basePath) {
  // Fallback label keeps messages readable when diagnosis returns nullish.
  const expected =
    diagnoseExpectedArtifact(unitType, unitId, basePath) ??
    "required durable artifact";
  const missing = [];
  const artifactPath = resolveExpectedArtifactPath(unitType, unitId, basePath);
  if (!artifactPath) {
    missing.push(`artifact path could not be resolved for ${expected}`);
  } else if (!existsSync(artifactPath)) {
    missing.push(`artifact missing (${relToBase(basePath, artifactPath)})`);
  }
  // complete-slice units additionally require a UAT file and a recorded
  // slice-complete state transition (DB row or roadmap checkbox).
  if (unitType === "complete-slice") {
    const { milestone: mid, slice: sid } = parseUnitId(unitId);
    if (mid && sid) {
      const uatPath = resolveSliceFile(basePath, mid, sid, "UAT");
      if (!uatPath || !existsSync(uatPath)) {
        missing.push(
          `UAT missing (${relSliceFile(basePath, mid, sid, "UAT")})`,
        );
      }
      const dbSlice = getSlice(mid, sid);
      if (dbSlice) {
        const dbStatus = formatSliceDbStatus(dbSlice.status);
        if (dbStatus && dbSlice.status !== "complete") missing.push(dbStatus);
      } else if (!isDbAvailable()) {
        // DB unavailable — fall back to parsing the roadmap file on disk.
        // NOTE(review): when the DB is available but has no row for this
        // slice, no gap is recorded here — presumably verifyExpectedArtifact
        // covers that case; confirm.
        const roadmapPath = resolveMilestoneFile(basePath, mid, "ROADMAP");
        if (!roadmapPath || !existsSync(roadmapPath)) {
          missing.push(
            `roadmap missing (${relMilestoneFile(basePath, mid, "ROADMAP")})`,
          );
        } else {
          try {
            const roadmap = parseRoadmap(readFileSync(roadmapPath, "utf-8"));
            const slice = roadmap.slices.find((s) => s.id === sid);
            // A slice absent from the roadmap is not treated as a gap;
            // only a present-but-unchecked entry is.
            if (slice && !slice.done) {
              missing.push(
                `roadmap checkbox not marked [x] (${relMilestoneFile(basePath, mid, "ROADMAP")})`,
              );
            }
          } catch {
            // Unparseable roadmap is itself a durability gap, not a crash.
            missing.push(
              `roadmap could not be parsed (${relMilestoneFile(basePath, mid, "ROADMAP")})`,
            );
          }
        }
      }
    }
  }
  const verified = verifyExpectedArtifact(unitType, unitId, basePath);
  // Ensure a failed verification is never silent: if no concrete gap was
  // found above, record the generic verification failure.
  if (!verified && missing.length === 0) {
    missing.push(`artifact verification failed for ${expected}`);
  }
  return { expected, verified, missing };
}
/**
 * Format an inspectUnitRecoveryStatus result as a single diagnostic line.
 *
 * @param {{ missing: string[] }} status - Result of inspectUnitRecoveryStatus.
 * @returns {string} Semicolon-joined gap list, or an all-clear message.
 */
export function formatUnitRecoveryStatus(status) {
  const { missing } = status;
  if (missing.length === 0) {
    return "all durable artifacts present";
  }
  return missing.join("; ");
}
/**
 * Recover a unit whose agent session hit an idle or hard timeout.
 *
 * Strategy, in order:
 *  1. If the unit's durable output already exists on disk/DB, finalize the
 *     runtime record and advance the pipeline.
 *  2. While per-session recovery attempts remain, steer the still-live agent
 *     with an (escalating) message telling it to finish durable output.
 *  3. Once attempts are exhausted, write a blocker placeholder and skip the
 *     unit — except complete-milestone, which pauses instead (#4175).
 *  4. If even the placeholder cannot be written, pause autonomous mode.
 *
 * @param {object} ctx - Host context; only ctx.ui.notify is used here.
 * @param {object} pi - Agent interface used to send steering messages.
 * @param {string} unitType - Unit kind ("execute-task", "complete-slice",
 *   "complete-milestone", ...).
 * @param {string} unitId - Identifier of the timed-out unit.
 * @param {string} reason - Timeout kind; "idle" allows 2 recovery attempts,
 *   anything else (hard timeout) allows 1.
 * @param {object} rctx - Recovery context: { basePath, verbose,
 *   currentUnitStartedAt, unitRecoveryCount }.
 * @returns {Promise<"recovered"|"paused">} "recovered" keeps autonomous mode
 *   going (unit steered, finalized, or skipped); "paused" stops it so a
 *   human can intervene.
 */
export async function recoverTimedOutUnit(
  ctx,
  pi,
  unitType,
  unitId,
  reason,
  rctx,
) {
  const { basePath, verbose, currentUnitStartedAt, unitRecoveryCount } = rctx;
  // Persisted runtime record tracks recovery attempts across sessions;
  // the in-memory unitRecoveryCount map tracks attempts in this process.
  const runtime = readUnitRuntimeRecord(basePath, unitType, unitId);
  const recoveryAttempts = runtime?.recoveryAttempts ?? 0;
  const maxRecoveryAttempts = reason === "idle" ? 2 : 1;
  const recoveryKey = `${unitType}/${unitId}`;
  const attemptNumber = (unitRecoveryCount.get(recoveryKey) ?? 0) + 1;
  unitRecoveryCount.set(recoveryKey, attemptNumber);
  if (attemptNumber > 1) {
    // Exponential backoff between in-process retries: 2^(attempt-2) seconds
    // (1s, 2s, 4s, ...), capped at 30s.
    const backoffMs = Math.min(1000 * 2 ** (attemptNumber - 2), 30000);
    ctx.ui.notify(
      `Recovery attempt ${attemptNumber} for ${unitType} ${unitId}. Waiting ${backoffMs / 1000}s before retry.`,
      "info",
    );
    await new Promise((r) => setTimeout(r, backoffMs));
  }
  // --- execute-task units have their own durability model (task summary,
  // plan checkbox, next-action pointer) inspected via uok/unit-runtime. ---
  if (unitType === "execute-task") {
    const status = await inspectExecuteTaskDurability(basePath, unitId);
    // Durability could not even be inspected — pause for manual help.
    if (!status) return "paused";
    writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnitStartedAt, {
      recovery: status,
    });
    const durableComplete =
      status.summaryExists && status.taskChecked && status.nextActionAdvanced;
    if (durableComplete) {
      // The work is already durable on disk — mark finalized and advance.
      writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnitStartedAt, {
        phase: "finalized",
        recovery: status,
      });
      ctx.ui.notify(
        `${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} already completed on disk. Continuing autonomous mode. (attempt ${attemptNumber})`,
        "info",
      );
      unitRecoveryCount.delete(recoveryKey);
      // Resolve the pending agent-end with a synthetic result so the auto
      // loop proceeds to the next unit.
      resolveAgentEnd({ messages: [], _synthetic: "timeout-recovery" });
      return "recovered";
    }
    if (recoveryAttempts < maxRecoveryAttempts) {
      // Attempts remain: steer the agent. The second attempt escalates tone.
      const isEscalation = recoveryAttempts > 0;
      writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnitStartedAt, {
        phase: "recovered",
        recovery: status,
        recoveryAttempts: recoveryAttempts + 1,
        lastRecoveryReason: reason,
        lastProgressAt: Date.now(),
        progressCount: (runtime?.progressCount ?? 0) + 1,
        lastProgressKind:
          reason === "idle" ? "idle-recovery-retry" : "hard-recovery-retry",
      });
      const steeringLines = isEscalation
        ? [
            `**FINAL ${reason === "idle" ? "IDLE" : "HARD TIMEOUT"} RECOVERY — last chance before this task is skipped.**`,
            `You are still executing ${unitType} ${unitId}.`,
            `Recovery attempt ${recoveryAttempts + 1} of ${maxRecoveryAttempts}.`,
            `Current durability status: ${formatExecuteTaskRecoveryStatus(status)}.`,
            "You MUST finish the durable output NOW, even if incomplete.",
            "Write the task summary with whatever you have accomplished so far.",
            "Mark the task [x] in the plan. Commit your work.",
            "A partial summary is infinitely better than no summary.",
          ]
        : [
            `**${reason === "idle" ? "IDLE" : "HARD TIMEOUT"} RECOVERY — do not stop.**`,
            `You are still executing ${unitType} ${unitId}.`,
            `Recovery attempt ${recoveryAttempts + 1} of ${maxRecoveryAttempts}.`,
            `Current durability status: ${formatExecuteTaskRecoveryStatus(status)}.`,
            "Do not keep exploring.",
            "Immediately finish the required durable output for this unit.",
            "If full completion is impossible, write the partial artifact/state needed for recovery and make the blocker explicit.",
          ];
      pi.sendMessage(
        {
          customType: "sf-auto-timeout-recovery",
          display: verbose,
          content: steeringLines.join("\n"),
        },
        { triggerTurn: true, deliverAs: "steer" },
      );
      ctx.ui.notify(
        `${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to finish durable output (attempt ${attemptNumber}, session ${recoveryAttempts + 1}/${maxRecoveryAttempts}).`,
        "warning",
      );
      return "recovered";
    }
    // Retries exhausted — write a blocker placeholder and advance.
    const diagnostic = formatExecuteTaskRecoveryStatus(status);
    const placeholder = writeBlockerPlaceholder(
      unitType,
      unitId,
      basePath,
      `${reason} recovery exhausted ${maxRecoveryAttempts} attempts. Status: ${diagnostic}`,
    );
    if (placeholder) {
      writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnitStartedAt, {
        phase: "skipped",
        recovery: status,
        recoveryAttempts: recoveryAttempts + 1,
        lastRecoveryReason: reason,
      });
      ctx.ui.notify(
        `${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts (${diagnostic}). Blocker artifacts written. Advancing pipeline. (attempt ${attemptNumber})`,
        "warning",
      );
      unitRecoveryCount.delete(recoveryKey);
      resolveAgentEnd({ messages: [], _synthetic: "timeout-recovery" });
      return "recovered";
    }
    // Fallback: couldn't write skip artifacts — pause as before.
    writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnitStartedAt, {
      phase: "paused",
      recovery: status,
      recoveryAttempts: recoveryAttempts + 1,
      lastRecoveryReason: reason,
    });
    ctx.ui.notify(
      `${reason === "idle" ? "Idle" : "Timeout"} recovery check for ${unitType} ${unitId}: ${diagnostic}`,
      "warning",
    );
    return "paused";
  }
  // --- All other unit types use the artifact-based durability inspection. ---
  const status = inspectUnitRecoveryStatus(unitType, unitId, basePath);
  const diagnostic = formatUnitRecoveryStatus(status);
  // Check full unit durability — the primary artifact alone is not enough for
  // units such as complete-slice, which also require UAT and state transition.
  if (status.verified) {
    writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnitStartedAt, {
      phase: "finalized",
      recoveryAttempts: recoveryAttempts + 1,
      lastRecoveryReason: reason,
    });
    ctx.ui.notify(
      `${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} durable state already verified. Advancing. (attempt ${attemptNumber})`,
      "info",
    );
    unitRecoveryCount.delete(recoveryKey);
    resolveAgentEnd({ messages: [], _synthetic: "timeout-recovery" });
    return "recovered";
  }
  if (recoveryAttempts < maxRecoveryAttempts) {
    // Attempts remain: steer the agent to repair only the missing items.
    const isEscalation = recoveryAttempts > 0;
    writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnitStartedAt, {
      phase: "recovered",
      recoveryAttempts: recoveryAttempts + 1,
      lastRecoveryReason: reason,
      lastProgressAt: Date.now(),
      progressCount: (runtime?.progressCount ?? 0) + 1,
      lastProgressKind:
        reason === "idle" ? "idle-recovery-retry" : "hard-recovery-retry",
    });
    const steeringLines = isEscalation
      ? [
          `**FINAL ${reason === "idle" ? "IDLE" : "HARD TIMEOUT"} RECOVERY — last chance before skip.**`,
          `You are still executing ${unitType} ${unitId}.`,
          `Recovery attempt ${recoveryAttempts + 1} of ${maxRecoveryAttempts} — next failure skips this unit.`,
          `Expected durable output: ${status.expected}.`,
          `Current durability status: ${diagnostic}.`,
          "Repair only the missing durability items listed above.",
          "You MUST write the missing artifact/state NOW, even if incomplete.",
          "Write whatever you have — partial research, preliminary findings, best-effort analysis — and record the remaining blocker explicitly.",
          "If you are truly blocked, write the file with a BLOCKER section explaining why.",
        ]
      : [
          `**${reason === "idle" ? "IDLE" : "HARD TIMEOUT"} RECOVERY — stay in autonomous mode.**`,
          `You are still executing ${unitType} ${unitId}.`,
          `Recovery attempt ${recoveryAttempts + 1} of ${maxRecoveryAttempts}.`,
          `Expected durable output: ${status.expected}.`,
          `Current durability status: ${diagnostic}.`,
          "Stop broad exploration and do not redo work that is already durable.",
          "Repair only the missing durability items listed above.",
          "If blocked, write the partial artifact and explicitly record the blocker instead of going silent.",
        ];
    pi.sendMessage(
      {
        customType: "sf-auto-timeout-recovery",
        display: verbose,
        content: steeringLines.join("\n"),
      },
      { triggerTurn: true, deliverAs: "steer" },
    );
    ctx.ui.notify(
      `${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to repair ${diagnostic} (attempt ${attemptNumber}, session ${recoveryAttempts + 1}/${maxRecoveryAttempts}).`,
      "warning",
    );
    return "recovered";
  }
  // #4175: For complete-milestone, never write a blocker placeholder — a stub
  // SUMMARY has no recovery value (milestone is terminal), it does not update
  // DB status, and downstream merge paths can treat the stub as a legitimate
  // completion signal. Pause instead so the worktree branch is preserved.
  if (unitType === "complete-milestone") {
    writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnitStartedAt, {
      phase: "paused",
      recoveryAttempts: recoveryAttempts + 1,
      lastRecoveryReason: reason,
    });
    ctx.ui.notify(
      `Milestone ${unitId} ${reason}-recovery exhausted ${maxRecoveryAttempts} attempt(s): ${diagnostic}. Worktree branch preserved. Re-run /autonomous once blockers are resolved.`,
      "error",
    );
    return "paused";
  }
  // Retries exhausted — write a blocker placeholder and advance the pipeline
  // instead of silently stalling.
  const placeholder = writeBlockerPlaceholder(
    unitType,
    unitId,
    basePath,
    `${reason} recovery exhausted ${maxRecoveryAttempts} attempts. Status: ${diagnostic}.`,
  );
  if (placeholder) {
    writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnitStartedAt, {
      phase: "skipped",
      recoveryAttempts: recoveryAttempts + 1,
      lastRecoveryReason: reason,
    });
    ctx.ui.notify(
      `${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts. Blocker placeholder written to ${placeholder}. Advancing pipeline. (attempt ${attemptNumber})`,
      "warning",
    );
    unitRecoveryCount.delete(recoveryKey);
    resolveAgentEnd({ messages: [], _synthetic: "timeout-recovery" });
    return "recovered";
  }
  // Fallback: couldn't resolve artifact path — pause as before.
  writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnitStartedAt, {
    phase: "paused",
    recoveryAttempts: recoveryAttempts + 1,
    lastRecoveryReason: reason,
  });
  return "paused";
}