singularity-forge/src/resources/extensions/sf/auto/loop.js
2026-05-08 01:34:07 +02:00

1171 lines
39 KiB
JavaScript

/**
* auto/loop.ts — Main autonomous mode execution loop.
*
* Iterates: derive → dispatch → guards → runUnit → finalize → repeat.
* Exits when s.active becomes false or a terminal condition is reached.
*
* Imports from: auto/types, auto/resolve, auto/phases
*/
import { randomUUID } from "node:crypto";
import { mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
import { join } from "node:path";
import { getHeapStatistics } from "node:v8";
import { atomicWriteSync } from "../atomic-write.js";
import { ModelPolicyDispatchBlockedError } from "../auto-model-selection.js";
import { runAutomaticAutonomousSolverEval } from "../autonomous-solver-eval.js";
import { debugLog } from "../debug-logger.js";
import { resolveEngine } from "../engine-resolver.js";
import { sfRoot } from "../paths.js";
import {
  ExecutionGraphScheduler,
  scheduleSidecarQueue,
} from "../uok/execution-graph.js";
import { resolveUokFlags } from "../uok/flags.js";
import { logWarning } from "../workflow-logger.js";
import {
  COOLDOWN_FALLBACK_WAIT_MS,
  getCooldownRetryAfterMs,
  isInfrastructureError,
  isTransientCooldownError,
  MAX_COOLDOWN_RETRIES,
} from "./infra-errors.js";
import {
  runDispatch,
  runFinalize,
  runGuards,
  runPreDispatch,
  runUnitPhase,
} from "./phases.js";
import { _clearCurrentResolve } from "./resolve.js";
import { MAX_LOOP_ITERATIONS } from "./types.js";
// ── Stuck detection persistence (#3704) ──────────────────────────────────
// Persist stuck detection state to disk so it survives session restarts.
// Without this, restarting autonomous mode resets all counters, allowing the
// same blocked unit to burn a full retry budget each session.

/** Absolute path of the persisted stuck-detection state file. */
const stuckStatePath = (basePath) =>
  join(sfRoot(basePath), "runtime", "stuck-state.json");
/**
 * Read persisted stuck-detection counters from disk.
 *
 * Returns an empty state when the file is missing/unparseable, when the
 * stored PID is this process (same-session write, e.g. an earlier test run),
 * or when the stored PID is still alive (a concurrent session — loading its
 * counters would cause cross-session pollution, #sf-moqv5o7h-vaabu6).
 */
function loadStuckState(basePath) {
  const emptyState = () => ({ recentUnits: [], stuckRecoveryAttempts: 0 });
  try {
    const raw = readFileSync(stuckStatePath(basePath), "utf-8");
    const data = JSON.parse(raw);
    // Only load state written by a DIFFERENT process (real session restart).
    // A matching PID means an earlier autoLoop call in this same process
    // wrote it — not a crashed session — so skip it.
    if (data.pid === process.pid) return emptyState();
    // Probe the stored PID with signal 0. Alive → concurrent session, skip.
    // Dead → the prior session crashed or was killed; its state is safe.
    if (typeof data.pid === "number" && Number.isFinite(data.pid)) {
      let ownerAlive = false;
      try {
        process.kill(data.pid, 0);
        ownerAlive = true;
      } catch {
        // signal 0 rejected — the owning process no longer exists.
      }
      if (ownerAlive) return emptyState();
    }
    const recentUnits = Array.isArray(data.recentUnits)
      ? data.recentUnits
      : [];
    const stuckRecoveryAttempts =
      typeof data.stuckRecoveryAttempts === "number"
        ? data.stuckRecoveryAttempts
        : 0;
    return { recentUnits, stuckRecoveryAttempts };
  } catch (err) {
    debugLog("autoLoop", {
      phase: "load-stuck-state-failed",
      error: err instanceof Error ? err.message : String(err),
    });
    return emptyState();
  }
}
/**
 * Persist stuck-detection counters (tagged with the owning PID) to disk.
 *
 * Fix: write via atomicWriteSync — consistent with
 * saveCustomVerifyRetryCounts below — instead of a plain writeFileSync.
 * A crash mid-write previously could leave a truncated JSON file, which
 * loadStuckState would discard on the next start, silently resetting the
 * very retry budget this persistence exists to protect (#3704).
 *
 * @param {string} basePath workspace root
 * @param {{recentUnits: unknown[], stuckRecoveryAttempts: number}} state
 */
function saveStuckState(basePath, state) {
  try {
    const filePath = stuckStatePath(basePath);
    mkdirSync(join(sfRoot(basePath), "runtime"), { recursive: true });
    atomicWriteSync(
      filePath,
      JSON.stringify({
        pid: process.pid,
        recentUnits: state.recentUnits.slice(-20), // keep last 20 entries
        stuckRecoveryAttempts: state.stuckRecoveryAttempts,
        updatedAt: new Date().toISOString(),
      }) + "\n",
    );
  } catch (err) {
    // Best-effort persistence — never let a disk error kill the loop.
    debugLog("autoLoop", {
      phase: "save-stuck-state-failed",
      error: err instanceof Error ? err.message : String(err),
    });
  }
}
// ── Custom workflow verification retry persistence ───────────────────────
// Custom workflow verifiers can request a retry after a step runs. Persisting
// retry counts under the run directory prevents restart loops from resetting
// the retry budget and repeatedly dispatching the same failing step.
const MAX_CUSTOM_ENGINE_VERIFY_RETRIES = 3;

/** Directory holding custom-verify retry state: the active run dir when one
 * exists, otherwise the workspace-level runtime directory. */
function customVerifyRetryStateDir(s) {
  if (s.activeRunDir) {
    return join(s.activeRunDir, "runtime");
  }
  return join(sfRoot(s.basePath), "runtime");
}

/** Full path of the persisted custom-verify retry-count file. */
function customVerifyRetryStatePath(s) {
  return join(customVerifyRetryStateDir(s), "custom-verify-retries.json");
}
/**
 * Ensure the in-memory custom-verify retry map is populated.
 *
 * In-memory counts always win; only a cold (empty) map is hydrated from the
 * persisted JSON file. Invalid entries are skipped; read/parse failures are
 * logged and the (possibly empty) map is returned unchanged.
 */
function hydrateCustomVerifyRetryCounts(s) {
  const counts = s.verificationRetryCount;
  if (counts.size > 0) return counts;
  try {
    const raw = JSON.parse(
      readFileSync(customVerifyRetryStatePath(s), "utf-8"),
    );
    const hasCountsObject =
      raw &&
      typeof raw === "object" &&
      raw.counts &&
      typeof raw.counts === "object";
    const stored = hasCountsObject ? raw.counts : {};
    for (const [key, value] of Object.entries(stored)) {
      const usable =
        typeof value === "number" && Number.isFinite(value) && value > 0;
      if (usable) counts.set(key, Math.floor(value));
    }
  } catch (err) {
    debugLog("autoLoop", {
      phase: "load-custom-verify-retries-failed",
      error: err instanceof Error ? err.message : String(err),
    });
  }
  return counts;
}
/**
 * Persist (or clear) the custom-verify retry counts.
 *
 * An empty map deletes the state file; otherwise the counts are written
 * atomically. ENOENT (deleting an already-absent file) is the expected
 * no-op; any other failure is logged and swallowed — persistence is
 * best-effort and must not abort the loop.
 */
function saveCustomVerifyRetryCounts(s) {
  const counts = s.verificationRetryCount;
  const filePath = customVerifyRetryStatePath(s);
  try {
    if (counts.size === 0) {
      unlinkSync(filePath);
      return;
    }
    mkdirSync(customVerifyRetryStateDir(s), { recursive: true });
    const payload = {
      counts: Object.fromEntries(counts),
      updatedAt: new Date().toISOString(),
    };
    atomicWriteSync(filePath, JSON.stringify(payload) + "\n");
  } catch (err) {
    const code =
      err && typeof err === "object" && "code" in err ? err.code : undefined;
    if (code === "ENOENT") return;
    debugLog("autoLoop", {
      phase: "save-custom-verify-retries-failed",
      error: err instanceof Error ? err.message : String(err),
    });
  }
}
// ── Memory pressure monitoring (#3331) ──────────────────────────────────
// Check heap usage every N iterations and trigger graceful shutdown before
// the OS OOM killer sends SIGKILL. The threshold is 85% of the V8 heap
// limit (--max-old-space-size or default ~1.5-4GB depending on platform).
const MEMORY_CHECK_INTERVAL = 5; // check every 5 iterations
const MEMORY_PRESSURE_THRESHOLD = 0.85; // 85% of heap limit
/**
 * Sample current V8 heap usage against the real heap limit.
 *
 * Fix: this file is an ES module, so the previous `require("node:v8")`
 * always threw ReferenceError and the limit silently fell back to the
 * 4096MB default — a heap limit configured via --max-old-space-size was
 * never honored. Uses the statically imported getHeapStatistics instead.
 *
 * @returns {{pressured: boolean, heapMB: number, limitMB: number, pct: number}}
 *   pressured is true when heap usage exceeds MEMORY_PRESSURE_THRESHOLD of
 *   the limit; pct is heapMB / limitMB.
 */
function checkMemoryPressure() {
  const { heapUsed } = process.memoryUsage();
  const heapMB = Math.round(heapUsed / 1024 / 1024);
  let limitMB = 4096; // conservative default if v8 stats are unavailable
  try {
    limitMB = Math.round(getHeapStatistics().heap_size_limit / 1024 / 1024);
  } catch {
    /* v8 stats unavailable — keep conservative default */
  }
  const pct = heapMB / limitMB;
  return { pressured: pct > MEMORY_PRESSURE_THRESHOLD, heapMB, limitMB, pct };
}
/**
 * Tracks the dangling phase promise from the most recent timeout so the next
 * iteration can drain it before proceeding. Promise.race() rejects on timeout
 * but does not cancel the underlying async work; draining here prevents the
 * timed-out phase from mutating state concurrently with the next iteration.
 */
let _danglingPhasePromise = null;
/**
 * Race a phase function against a timeout.
 *
 * On timeout, rejects with an Error whose message is "phase-timeout:<name>"
 * (the blanket catch in autoLoop keys off that prefix) and parks the
 * still-running phase promise in _danglingPhasePromise so the caller can
 * drain it before starting a new iteration. The timer is always cleared.
 */
async function withPhaseTimeout(name, fn, timeoutMs) {
  const phasePromise = fn();
  let timerHandle;
  const timeoutPromise = new Promise((_resolve, reject) => {
    timerHandle = setTimeout(() => {
      reject(new Error(`phase-timeout:${name}`));
    }, timeoutMs);
  });
  try {
    const winner = await Promise.race([phasePromise, timeoutPromise]);
    return winner;
  } catch (err) {
    const timedOut =
      err instanceof Error && err.message.startsWith("phase-timeout:");
    if (timedOut) {
      _danglingPhasePromise = phasePromise;
    }
    throw err;
  } finally {
    if (timerHandle !== undefined) clearTimeout(timerHandle);
  }
}
// ── Dispatch contract helpers ─────────────────────────────────────────────
/** Unit types that execute as verification nodes in the execution graph. */
const VERIFICATION_UNIT_TYPES = new Set([
  "gate-evaluate",
  "validate-milestone",
  "run-uat",
  "complete-slice",
]);
/**
 * Map a unit (and optional sidecar item) to its execution-graph node kind.
 * Sidecar kinds take precedence; unrecognized combinations fall back to
 * the generic "unit" kind.
 */
function resolveDispatchNodeKind(unitType, sidecarItem) {
  switch (sidecarItem?.kind) {
    case "hook":
      return "hook";
    case "triage":
      return "verification";
    case "quick-task":
      return "team-worker";
    default:
      break;
  }
  if (unitType.startsWith("hook/")) return "hook";
  if (unitType === "reactive-execute") return "subagent";
  if (VERIFICATION_UNIT_TYPES.has(unitType)) return "verification";
  if (unitType === "replan-slice" || unitType === "reassess-roadmap") {
    return "reprocess";
  }
  return "unit";
}
/**
 * Run the unit phase through the execution-graph scheduler as a single node.
 *
 * Every node kind shares the same handler (one closure capturing the phase
 * arguments), so whichever kind resolveDispatchNodeKind picks executes the
 * same runUnitPhase call. If the scheduler somehow completes without the
 * handler firing, a "break" outcome is synthesized.
 */
async function runUnitPhaseViaContract(ic, iterData, loopState, sidecarItem) {
  const scheduler = new ExecutionGraphScheduler();
  let phaseOutcome = null;
  const executeUnit = async () => {
    phaseOutcome = await runUnitPhase(ic, iterData, loopState, sidecarItem);
  };
  const nodeKinds = [
    "unit",
    "hook",
    "subagent",
    "team-worker",
    "verification",
    "reprocess",
  ];
  for (const kind of nodeKinds) {
    scheduler.registerHandler(kind, executeUnit);
  }
  const node = {
    id: `dispatch:${ic.iteration}:${iterData.unitType}:${iterData.unitId}`,
    kind: resolveDispatchNodeKind(iterData.unitType, sidecarItem),
    dependsOn: [],
    metadata: { unitType: iterData.unitType, unitId: iterData.unitId },
  };
  await scheduler.run([node], { parallel: false, maxWorkers: 1 });
  return (
    phaseOutcome ?? {
      action: "break",
      reason: "scheduler-dispatch-missing-result",
    }
  );
}
/**
 * Throttle dispatches to the configured minimum inter-request interval.
 * No-op when the preference is unset/zero or no prior request timestamp
 * exists; otherwise sleeps for the remaining portion of the interval.
 */
async function enforceMinRequestInterval(s, prefs) {
  const minInterval = prefs?.min_request_interval_ms ?? 0;
  if (minInterval <= 0) return;
  if (s.lastRequestTimestamp <= 0) return;
  const elapsed = Math.max(0, Date.now() - s.lastRequestTimestamp);
  const waitMs = minInterval - elapsed;
  if (waitMs <= 0) return;
  debugLog("autoLoop", { phase: "rate-limit-wait", waitMs });
  await new Promise((resolve) => setTimeout(resolve, waitMs));
}
/**
 * Best-effort post-exit hook: run the automatic autonomous solver eval and
 * surface its outcome in the UI. Journal events it emits are grouped under a
 * fresh flowId with a monotonically increasing seq. Any failure is reported
 * as a warning and logged — it never propagates to the caller.
 */
async function runExitSolverEval(ctx, s, deps, iteration) {
  try {
    const supervisor =
      deps.loadEffectiveSFPreferences()?.preferences?.auto_supervisor;
    const flowId = randomUUID();
    let seq = 0;
    // Stamp flow/seq onto every journal event unless the event already
    // carries its own values.
    const emitJournalEvent = (event) =>
      deps.emitJournalEvent({
        ...event,
        flowId: event.flowId ?? flowId,
        seq: event.seq ?? ++seq,
      });
    const result = await runAutomaticAutonomousSolverEval({
      basePath: s.basePath,
      enabled: supervisor?.solver_eval_on_autonomous_exit !== false,
      reason: "autonomous-exit",
      emitJournalEvent,
    });
    if (!result.ok) {
      ctx.ui.notify(
        `Autonomous solver eval did not record: ${result.error}`,
        "warning",
      );
    } else if (result.report?.dbRecorded) {
      ctx.ui.notify(
        `Autonomous solver eval recorded: ${result.report.reportPath}`,
        "info",
      );
    } else if (result.report) {
      ctx.ui.notify(
        `Autonomous solver eval wrote ${result.report.reportPath}, but DB evidence was not recorded.`,
        "warning",
      );
    }
    debugLog("autoLoop", {
      phase: "solver-eval-auto",
      iteration,
      ok: result.ok,
      skipped: result.skipped,
      runId: result.report?.runId,
      error: result.error,
    });
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    ctx.ui.notify(`Autonomous solver eval hook failed: ${message}`, "warning");
    debugLog("autoLoop", {
      phase: "solver-eval-auto-failed",
      iteration,
      error: message,
    });
  }
}
/**
 * Main autonomous mode execution loop. Iterates: derive → dispatch → guards →
 * runUnit → finalize → repeat. Exits when s.active becomes false or a
 * terminal condition is reached.
 *
 * This is the linear replacement for the recursive
 * dispatchNextUnit → handleAgentEnd → dispatchNextUnit chain.
 *
 * @param ctx  Extension context; used here for ctx.ui.notify messages.
 * @param pi   Handle forwarded unchanged to deps.stopAuto / deps.pauseAuto.
 * @param s    Mutable session state read/written here: active, basePath,
 *             cmdCtx, sidecarQueue, activeEngineId, activeRunDir,
 *             verificationRetryCount, currentTraceId/currentTurnId,
 *             lastRequestTimestamp, currentMilestoneId, currentUnit.
 * @param deps Injected collaborators: journal emitter, uokObserver, state
 *             derivation, session-lock validation, health gate, progress
 *             widget, cache invalidation, stop/pause controls, timeouts.
 */
export async function autoLoop(ctx, pi, s, deps) {
  debugLog("autoLoop", { phase: "enter" });
  let iteration = 0;
  // Load persisted stuck state so counters survive session restarts (#3704)
  const persisted = loadStuckState(s.basePath);
  const loopState = {
    recentUnits: persisted.recentUnits,
    stuckRecoveryAttempts: persisted.stuckRecoveryAttempts,
    consecutiveFinalizeTimeouts: 0,
  };
  // Graduated error recovery: reset on any successful iteration; the 2nd
  // consecutive error invalidates caches, the 3rd hard-stops the session.
  let consecutiveErrors = 0;
  let consecutiveCooldowns = 0;
  const recentErrorMessages = [];
  while (s.active) {
    iteration++;
    debugLog("autoLoop", { phase: "loop-top", iteration });
    // ── Journal: per-iteration flow grouping ──
    const flowId = randomUUID();
    let seqCounter = 0;
    const nextSeq = () => ++seqCounter;
    const turnId = randomUUID();
    s.currentTraceId = flowId;
    s.currentTurnId = turnId;
    const turnStartedAt = new Date().toISOString();
    let observedUnitType;
    let observedUnitId;
    let turnFinished = false;
    // Reports the turn result at most once per iteration; later calls are
    // no-ops, so every exit path may call this without double-reporting.
    const finishTurn = (status, failureClass = "none", error) => {
      if (turnFinished) return;
      turnFinished = true;
      deps.uokObserver?.onTurnResult({
        traceId: flowId,
        turnId,
        iteration,
        unitType: observedUnitType,
        unitId: observedUnitId,
        status,
        failureClass,
        phaseResults: [],
        error,
        startedAt: turnStartedAt,
        finishedAt: new Date().toISOString(),
      });
      s.currentTraceId = null;
      s.currentTurnId = null;
    };
    deps.uokObserver?.onTurnStart({
      traceId: flowId,
      turnId,
      iteration,
      basePath: s.basePath,
      startedAt: turnStartedAt,
    });
    if (iteration > MAX_LOOP_ITERATIONS) {
      debugLog("autoLoop", {
        phase: "exit",
        reason: "max-iterations",
        iteration,
      });
      await deps.stopAuto(
        ctx,
        pi,
        `Safety: loop exceeded ${MAX_LOOP_ITERATIONS} iterations — possible runaway`,
      );
      finishTurn("stopped", "manual-attention", "max-iterations");
      break;
    }
    // ── Memory pressure check (#3331) ──
    // Graceful shutdown before OOM killer sends SIGKILL.
    if (iteration % MEMORY_CHECK_INTERVAL === 0) {
      const mem = checkMemoryPressure();
      debugLog("autoLoop", { phase: "memory-check", ...mem });
      if (mem.pressured) {
        logWarning(
          "dispatch",
          `Memory pressure: ${mem.heapMB}MB / ${mem.limitMB}MB (${Math.round(mem.pct * 100)}%) — stopping autonomous mode to prevent OOM kill`,
        );
        await deps.stopAuto(
          ctx,
          pi,
          `Memory pressure: heap at ${mem.heapMB}MB / ${mem.limitMB}MB (${Math.round(mem.pct * 100)}%). ` +
            `Stopping gracefully to prevent OOM kill after ${iteration} iterations. ` +
            `Resume with /autonomous to continue from where you left off.`,
        );
        finishTurn("stopped", "timeout", "memory-pressure");
        break;
      }
    }
    if (!s.cmdCtx) {
      debugLog("autoLoop", { phase: "exit", reason: "no-cmdCtx" });
      finishTurn("stopped", "manual-attention", "missing-command-context");
      break;
    }
    // ── Drain any dangling phase promise before starting new work ──
    // Promise.race() on timeout does not cancel the underlying async fn; that
    // fn keeps running and may mutate state after the loop has advanced.
    // Awaiting its completion here ensures no concurrent state writes.
    if (_danglingPhasePromise !== null) {
      const dangling = _danglingPhasePromise;
      _danglingPhasePromise = null;
      try {
        await dangling;
      } catch {
        /* ignore — result is irrelevant */
      }
    }
    try {
      // ── Blanket try/catch: one bad iteration must not kill the session
      const prefs = deps.loadEffectiveSFPreferences()?.preferences;
      const uokFlags = resolveUokFlags(prefs);
      const phaseTimeoutMs =
        (prefs?.auto_supervisor?.phase_timeout_minutes ?? 10) * 60_000;
      // ── Check sidecar queue before deriveState ──
      let sidecarItem;
      if (s.sidecarQueue.length > 0) {
        if (uokFlags.executionGraph && s.sidecarQueue.length > 1) {
          try {
            s.sidecarQueue = await scheduleSidecarQueue(s.sidecarQueue);
          } catch (err) {
            logWarning(
              "dispatch",
              `sidecar queue scheduling failed: ${err instanceof Error ? err.message : String(err)}`,
            );
          }
        }
        sidecarItem = s.sidecarQueue.shift();
        debugLog("autoLoop", {
          phase: "sidecar-dequeue",
          kind: sidecarItem.kind,
          unitType: sidecarItem.unitType,
          unitId: sidecarItem.unitId,
        });
        deps.emitJournalEvent({
          ts: new Date().toISOString(),
          flowId,
          seq: nextSeq(),
          eventType: "sidecar-dequeue",
          data: {
            kind: sidecarItem.kind,
            unitType: sidecarItem.unitType,
            unitId: sidecarItem.unitId,
          },
        });
      }
      const sessionLockBase = deps.lockBase();
      if (sessionLockBase) {
        const lockStatus = deps.validateSessionLock(sessionLockBase);
        if (!lockStatus.valid) {
          debugLog("autoLoop", {
            phase: "session-lock-invalid",
            reason: lockStatus.failureReason ?? "unknown",
            existingPid: lockStatus.existingPid,
            expectedPid: lockStatus.expectedPid,
          });
          deps.handleLostSessionLock(ctx, lockStatus);
          debugLog("autoLoop", {
            phase: "exit",
            reason: "session-lock-lost",
            detail: lockStatus.failureReason ?? "unknown",
          });
          // NOTE(review): this break path does not call finishTurn — no
          // turn-result is reported for a lost session lock; confirm
          // whether that is intentional.
          break;
        }
      }
      const ic = {
        ctx,
        pi,
        s,
        deps,
        prefs,
        iteration,
        flowId,
        nextSeq,
      };
      deps.emitJournalEvent({
        ts: new Date().toISOString(),
        flowId,
        seq: nextSeq(),
        eventType: "iteration-start",
        data: { iteration },
      });
      let iterData;
      // ── Custom engine path ──────────────────────────────────────────────
      // When activeEngineId is a non-dev value, bypass runPreDispatch and
      // runDispatch entirely — the custom engine drives its own state via
      // GRAPH.yaml. Shares runGuards and runUnitPhase with the dev path.
      // After unit execution, verifies then reconciles via the engine layer.
      //
      // SF_ENGINE_BYPASS=1 skips the engine layer entirely — falls through
      // to the dev path below.
      if (
        s.activeEngineId != null &&
        s.activeEngineId !== "dev" &&
        !sidecarItem &&
        process.env.SF_ENGINE_BYPASS !== "1"
      ) {
        debugLog("autoLoop", {
          phase: "custom-engine-derive",
          iteration,
          engineId: s.activeEngineId,
        });
        const { engine, policy } = resolveEngine({
          activeEngineId: s.activeEngineId,
          activeRunDir: s.activeRunDir,
        });
        const engineState = await engine.deriveState(s.basePath);
        if (engineState.isComplete) {
          await deps.stopAuto(ctx, pi, "Workflow complete");
          break;
        }
        debugLog("autoLoop", { phase: "custom-engine-dispatch", iteration });
        const dispatch = await engine.resolveDispatch(engineState, {
          basePath: s.basePath,
        });
        if (dispatch.action === "stop") {
          await deps.stopAuto(ctx, pi, dispatch.reason ?? "Engine stopped");
          break;
        }
        if (dispatch.action === "skip") {
          // NOTE(review): skips to the next iteration without finishTurn —
          // the onTurnStart above gets no matching onTurnResult here.
          continue;
        }
        // dispatch.action === "dispatch"
        const step = dispatch.step;
        const sfState = await deps.deriveState(s.basePath);
        iterData = {
          unitType: step.unitType,
          unitId: step.unitId,
          prompt: step.prompt,
          finalPrompt: step.prompt,
          pauseAfterUatDispatch: false,
          state: sfState,
          mid: s.currentMilestoneId ?? "workflow",
          midTitle: "Workflow",
          isRetry: false,
          previousTier: undefined,
        };
        observedUnitType = iterData.unitType;
        observedUnitId = iterData.unitId;
        // ── Progress widget (mirrors dev path in runDispatch) ──
        deps.updateProgressWidget(
          ctx,
          iterData.unitType,
          iterData.unitId,
          iterData.state,
        );
        // ── Guards (shared with dev path) ──
        const guardsResult = await runGuards(
          ic,
          s.currentMilestoneId ?? "workflow",
          iterData.unitType,
          iterData.unitId,
          iterData.state?.activeSlice?.id,
        );
        deps.uokObserver?.onPhaseResult("guard", guardsResult.action, {
          unitType: iterData.unitType,
          unitId: iterData.unitId,
        });
        if (guardsResult.action === "break") {
          finishTurn("stopped", "manual-attention", "guard-break");
          break;
        }
        // ── Unit execution (shared with dev path) ──
        await enforceMinRequestInterval(s, ic.prefs);
        const unitPhaseResult = await runUnitPhaseViaContract(
          ic,
          iterData,
          loopState,
        );
        if (unitPhaseResult.action === "next") {
          // Record when the request actually went out so the next
          // enforceMinRequestInterval call can throttle against it.
          const d = unitPhaseResult.data;
          const requestTimestamp = d?.requestDispatchedAt ?? d?.unitStartedAt;
          if (typeof requestTimestamp === "number")
            s.lastRequestTimestamp = requestTimestamp;
        }
        deps.uokObserver?.onPhaseResult("unit", unitPhaseResult.action, {
          unitType: iterData.unitType,
          unitId: iterData.unitId,
        });
        if (unitPhaseResult.action === "break") {
          finishTurn("stopped", "execution", "unit-break");
          break;
        }
        if (unitPhaseResult.action === "continue") {
          finishTurn("retry");
          continue;
        }
        // ── Verify first, then reconcile (only mark complete on pass) ──
        debugLog("autoLoop", {
          phase: "custom-engine-verify",
          iteration,
          unitId: iterData.unitId,
        });
        const verifyResult = await policy.verify(
          iterData.unitType,
          iterData.unitId,
          { basePath: s.basePath },
        );
        if (verifyResult === "pause") {
          await deps.pauseAuto(ctx, pi);
          deps.uokObserver?.onPhaseResult("custom-engine", "pause", {
            unitType: iterData.unitType,
            unitId: iterData.unitId,
          });
          finishTurn(
            "paused",
            "manual-attention",
            "custom-engine-verify-pause",
          );
          break;
        }
        if (verifyResult === "retry") {
          // Bounded retry budget persisted under the run dir so restarts
          // cannot reset it (see hydrate/saveCustomVerifyRetryCounts).
          const recoveryKey = `${iterData.unitType}/${iterData.unitId}`;
          const retryCounts = hydrateCustomVerifyRetryCounts(s);
          const attempts = (retryCounts.get(recoveryKey) ?? 0) + 1;
          retryCounts.set(recoveryKey, attempts);
          saveCustomVerifyRetryCounts(s);
          debugLog("autoLoop", {
            phase: "custom-engine-verify-retry",
            iteration,
            unitId: iterData.unitId,
            attempts,
          });
          deps.uokObserver?.onPhaseResult("custom-engine", "retry", {
            unitType: iterData.unitType,
            unitId: iterData.unitId,
            attempts,
          });
          if (attempts > MAX_CUSTOM_ENGINE_VERIFY_RETRIES) {
            // Budget exhausted — ask the policy how to recover.
            const recovery = await policy.recover(
              iterData.unitType,
              iterData.unitId,
              { basePath: s.basePath },
            );
            if (recovery.outcome === "pause") {
              await deps.pauseAuto(ctx, pi);
              finishTurn(
                "paused",
                "manual-attention",
                recovery.reason ?? "custom-engine-verify-retry-exhausted",
              );
              break;
            }
            if (recovery.outcome === "skip") {
              await deps.stopAuto(
                ctx,
                pi,
                recovery.reason ??
                  `Custom workflow verification for ${iterData.unitId} requested skip after retry exhaustion, but the custom engine cannot reconcile skipped steps.`,
              );
              finishTurn(
                "stopped",
                "manual-attention",
                "custom-engine-verify-retry-exhausted",
              );
              break;
            }
            const exhaustedReason = `Custom workflow verification for ${iterData.unitId} requested retry ${attempts} times without passing.`;
            await deps.stopAuto(
              ctx,
              pi,
              recovery.outcome === "stop" && recovery.reason
                ? recovery.reason
                : exhaustedReason,
            );
            finishTurn(
              "stopped",
              "manual-attention",
              "custom-engine-verify-retry-exhausted",
            );
            break;
          }
          finishTurn("retry");
          continue;
        }
        // Verification passed — mark step complete
        s.verificationRetryCount.delete(
          `${iterData.unitType}/${iterData.unitId}`,
        );
        saveCustomVerifyRetryCounts(s);
        debugLog("autoLoop", {
          phase: "custom-engine-reconcile",
          iteration,
          unitId: iterData.unitId,
        });
        const reconcileResult = await engine.reconcile(engineState, {
          unitType: iterData.unitType,
          unitId: iterData.unitId,
          startedAt: s.currentUnit?.startedAt ?? Date.now(),
          finishedAt: Date.now(),
        });
        deps.clearUnitTimeout();
        // Successful iteration — reset the graduated-recovery counters.
        consecutiveErrors = 0;
        consecutiveCooldowns = 0;
        recentErrorMessages.length = 0;
        deps.emitJournalEvent({
          ts: new Date().toISOString(),
          flowId,
          seq: nextSeq(),
          eventType: "iteration-end",
          data: { iteration },
        });
        saveStuckState(s.basePath, loopState); // persist across session restarts (#3704)
        debugLog("autoLoop", { phase: "iteration-complete", iteration });
        if (reconcileResult.outcome === "milestone-complete") {
          await deps.stopAuto(ctx, pi, "Workflow complete");
          deps.uokObserver?.onPhaseResult(
            "custom-engine",
            "milestone-complete",
            {
              unitType: iterData.unitType,
              unitId: iterData.unitId,
            },
          );
          finishTurn("completed");
          break;
        }
        if (reconcileResult.outcome === "pause") {
          await deps.pauseAuto(ctx, pi);
          deps.uokObserver?.onPhaseResult("custom-engine", "pause", {
            unitType: iterData.unitType,
            unitId: iterData.unitId,
          });
          finishTurn("paused", "manual-attention");
          break;
        }
        if (reconcileResult.outcome === "stop") {
          await deps.stopAuto(
            ctx,
            pi,
            reconcileResult.reason ?? "Engine stopped",
          );
          deps.uokObserver?.onPhaseResult("custom-engine", "stop", {
            unitType: iterData.unitType,
            unitId: iterData.unitId,
            reason: reconcileResult.reason,
          });
          finishTurn("stopped", "manual-attention", reconcileResult.reason);
          break;
        }
        deps.uokObserver?.onPhaseResult("custom-engine", "continue", {
          unitType: iterData.unitType,
          unitId: iterData.unitId,
        });
        finishTurn("completed");
        continue;
      }
      if (!sidecarItem) {
        // ── P4-A: Doctor issues → reassess escalation ─────────────────────
        // If the health gate detects issues that mention slice IDs (state
        // inconsistencies that reassessment can fix), queue reassess instead
        // of pausing autonomous mode. This runs separately from the gate inside
        // runPreDispatch so we can intercept *before* the break path.
        try {
          const healthCheck = await deps.preDispatchHealthGate(s.basePath);
          if (
            !healthCheck.proceed &&
            healthCheck.issues &&
            healthCheck.issues.length > 0
          ) {
            // Slice IDs look like "S12" — a standalone capital S + digits.
            const sliceRefPattern = /\bS\d+\b/;
            const hasSliceRef = healthCheck.issues.some((issue) =>
              sliceRefPattern.test(issue),
            );
            if (hasSliceRef) {
              const sfState = await deps.deriveState(s.basePath);
              const mid = sfState.activeMilestone?.id;
              const midTitle = sfState.activeMilestone?.title ?? "";
              const sliceId = sfState.activeSlice?.id ?? "reassess";
              if (mid) {
                ctx.ui.notify(
                  `Health issues detected with slice references — queuing reassess-roadmap instead of pausing.`,
                  "warning",
                );
                const { buildReassessRoadmapPrompt } = await import(
                  "../auto-prompts.js"
                );
                const reassessPrompt = await buildReassessRoadmapPrompt(
                  mid,
                  midTitle,
                  sliceId,
                  s.basePath,
                );
                // Front of the sidecar queue so reassessment runs next.
                s.sidecarQueue.unshift({
                  kind: "hook",
                  unitType: "reassess-roadmap",
                  unitId: `${mid}/${sliceId}`,
                  prompt: `## Doctor Health Issues\n\n${healthCheck.issues.map((i) => `- ${i}`).join("\n")}\n\n${reassessPrompt}`,
                });
                finishTurn("retry");
                continue;
              }
            }
          }
        } catch {
          // Non-fatal — fall through to normal runPreDispatch path
        }
        // ── Phase 1: Pre-dispatch ─────────────────────────────────────────
        const preDispatchResult = await withPhaseTimeout(
          "preDispatch",
          () => runPreDispatch(ic, loopState),
          phaseTimeoutMs / 2,
        );
        deps.uokObserver?.onPhaseResult(
          "pre-dispatch",
          preDispatchResult.action,
        );
        if (preDispatchResult.action === "break") {
          finishTurn("stopped", "manual-attention", "pre-dispatch-break");
          break;
        }
        if (preDispatchResult.action === "continue") {
          finishTurn("skipped");
          continue;
        }
        const preData = preDispatchResult.data;
        // ── Phase 2: Dispatch ─────────────────────────────────────────────
        const dispatchResult = await withPhaseTimeout(
          "dispatch",
          () => runDispatch(ic, preData, loopState),
          phaseTimeoutMs,
        );
        deps.uokObserver?.onPhaseResult("dispatch", dispatchResult.action);
        if (dispatchResult.action === "break") {
          finishTurn("stopped", "manual-attention", "dispatch-break");
          break;
        }
        if (dispatchResult.action === "continue") {
          finishTurn("skipped");
          continue;
        }
        iterData = dispatchResult.data;
        observedUnitType = iterData.unitType;
        observedUnitId = iterData.unitId;
        // ── Phase 3: Guards ───────────────────────────────────────────────
        const guardsResult = await runGuards(
          ic,
          iterData.mid ?? preData.mid ?? "workflow",
          iterData.unitType,
          iterData.unitId,
          iterData.state?.activeSlice?.id,
        );
        deps.uokObserver?.onPhaseResult("guard", guardsResult.action);
        if (guardsResult.action === "break") {
          finishTurn("stopped", "manual-attention", "guard-break");
          break;
        }
      } else {
        // ── Sidecar path: use values from the sidecar item directly ──
        const sidecarState = await deps.deriveState(s.basePath);
        iterData = {
          unitType: sidecarItem.unitType,
          unitId: sidecarItem.unitId,
          prompt: sidecarItem.prompt,
          finalPrompt: sidecarItem.prompt,
          pauseAfterUatDispatch: false,
          state: sidecarState,
          mid: sidecarState.activeMilestone?.id,
          midTitle: sidecarState.activeMilestone?.title,
          isRetry: false,
          previousTier: undefined,
        };
        observedUnitType = iterData.unitType;
        observedUnitId = iterData.unitId;
        deps.uokObserver?.onPhaseResult("dispatch", "sidecar", {
          unitType: iterData.unitType,
          unitId: iterData.unitId,
          sidecarKind: sidecarItem.kind,
        });
      }
      // ── Phase 4: Unit execution ─────────────────────────────────────────
      await enforceMinRequestInterval(s, ic.prefs);
      const unitPhaseResult = await runUnitPhaseViaContract(
        ic,
        iterData,
        loopState,
        sidecarItem,
      );
      if (unitPhaseResult.action === "next") {
        // Record when the request actually went out so the next
        // enforceMinRequestInterval call can throttle against it.
        const d = unitPhaseResult.data;
        const requestTimestamp = d?.requestDispatchedAt ?? d?.unitStartedAt;
        if (typeof requestTimestamp === "number")
          s.lastRequestTimestamp = requestTimestamp;
      }
      deps.uokObserver?.onPhaseResult("unit", unitPhaseResult.action, {
        unitType: iterData.unitType,
        unitId: iterData.unitId,
      });
      if (unitPhaseResult.action === "break") {
        finishTurn("stopped", "execution", "unit-break");
        break;
      }
      if (unitPhaseResult.action === "continue") {
        finishTurn("retry");
        continue;
      }
      // ── Phase 5: Finalize ───────────────────────────────────────────────
      const finalizeResult = await withPhaseTimeout(
        "finalize",
        () => runFinalize(ic, iterData, loopState, sidecarItem),
        phaseTimeoutMs,
      );
      deps.uokObserver?.onPhaseResult("finalize", finalizeResult.action, {
        unitType: iterData.unitType,
        unitId: iterData.unitId,
      });
      if (finalizeResult.action === "break") {
        const finalizeFailureClass =
          finalizeResult.reason === "git-closeout-failure" ? "git" : "closeout";
        finishTurn("stopped", finalizeFailureClass, "finalize-break");
        break;
      }
      if (finalizeResult.action === "continue") {
        finishTurn("retry");
        continue;
      }
      consecutiveErrors = 0; // Iteration completed successfully
      consecutiveCooldowns = 0;
      recentErrorMessages.length = 0;
      deps.emitJournalEvent({
        ts: new Date().toISOString(),
        flowId,
        seq: nextSeq(),
        eventType: "iteration-end",
        data: { iteration },
      });
      saveStuckState(s.basePath, loopState); // persist across session restarts (#3704)
      debugLog("autoLoop", { phase: "iteration-complete", iteration });
      finishTurn("completed");
    } catch (loopErr) {
      // ── Blanket catch: absorb unexpected exceptions, apply graduated recovery ──
      const msg = loopErr instanceof Error ? loopErr.message : String(loopErr);
      debugLog("autoLoop", {
        phase: "iteration-error",
        message: msg,
        stack: loopErr instanceof Error ? loopErr.stack : undefined,
      });
      // Always emit iteration-end on error so the journal records iteration
      // completion even on failure (#2344). Without this, errors in
      // runFinalize leave the journal incomplete, making diagnosis harder.
      deps.emitJournalEvent({
        ts: new Date().toISOString(),
        flowId,
        seq: nextSeq(),
        eventType: "iteration-end",
        data: { iteration, error: msg },
      });
      // ── Pre-send model-policy block: not a retryable error (#4959 / #4850) ──
      // The model-policy gate runs before the prompt is sent. When every
      // candidate model is denied (cross-provider disabled + flat-rate
      // baseline + tool-policy denial), retrying the same unit produces the
      // same denial — burning the consecutive-error budget toward a 3-strike
      // hard stop and corrupting autonomous mode state. Pause for user attention
      // instead, with the per-model deny reasons surfaced from the typed error.
      if (loopErr instanceof ModelPolicyDispatchBlockedError) {
        debugLog("autoLoop", {
          phase: "model-policy-blocked",
          iteration,
          unitType: loopErr.unitType,
          unitId: loopErr.unitId,
          reasons: loopErr.reasons,
        });
        ctx.ui.notify(
          `Autonomous mode paused: model-policy denied dispatch for ${loopErr.unitType}/${loopErr.unitId}. ${msg}`,
          "error",
        );
        deps.emitJournalEvent({
          ts: new Date().toISOString(),
          flowId,
          seq: nextSeq(),
          eventType: "unit-end",
          data: {
            unitType: loopErr.unitType,
            unitId: loopErr.unitId,
            status: "blocked",
            reason: "model-policy-dispatch-blocked",
            reasons: loopErr.reasons,
          },
        });
        // Carry the blocked unit identity into the turn-result observer:
        // the throw originated inside dispatch, so observedUnitType/Id were
        // not assigned by the success path — but the typed error already names
        // the unit (#4959 / CodeRabbit).
        observedUnitType = loopErr.unitType;
        observedUnitId = loopErr.unitId;
        await deps.pauseAuto(ctx, pi);
        finishTurn("paused", "manual-attention", msg);
        // Do NOT increment consecutiveErrors — the failure is configuration,
        // not a transient runtime fault.
        break;
      }
      // ── Infrastructure errors: immediate stop, no retry ──
      // These are unrecoverable (disk full, OOM, etc.). Retrying just burns
      // LLM budget on guaranteed failures.
      const infraCode = isInfrastructureError(loopErr);
      if (infraCode) {
        debugLog("autoLoop", {
          phase: "infrastructure-error",
          iteration,
          code: infraCode,
          error: msg,
        });
        // NOTE(review): `${infraCode}${msg}` concatenates code and message
        // with no separator — possibly a missing ": "; confirm intended.
        ctx.ui.notify(
          `Autonomous mode stopped: infrastructure error ${infraCode}${msg}`,
          "error",
        );
        await deps.stopAuto(
          ctx,
          pi,
          `Infrastructure error (${infraCode}): not recoverable by retry`,
        );
        finishTurn("failed", "execution", msg);
        break;
      }
      // ── Phase timeout: log, increment counter, continue ──
      if (msg.startsWith("phase-timeout:")) {
        const phaseName = msg.slice("phase-timeout:".length);
        loopState.consecutiveFinalizeTimeouts++;
        ctx.ui.notify(
          `Phase "${phaseName}" timed out (${loopState.consecutiveFinalizeTimeouts} consecutive) — skipping iteration and continuing.`,
          "warning",
        );
        debugLog("autoLoop", {
          phase: "phase-timeout",
          phaseName,
          consecutiveFinalizeTimeouts: loopState.consecutiveFinalizeTimeouts,
          iteration,
        });
        finishTurn("retry", "timeout", msg);
        continue;
      }
      // ── Credential cooldown: wait and retry with bounded budget ──
      // A 429 triggers a 30s credential backoff in AuthStorage. If the SDK's
      // getApiKey() retries couldn't outlast the window, the error surfaces
      // here. Wait for the cooldown to clear rather than counting it as a
      // consecutive failure — but cap retries so we don't spin for hours
      // on persistent quota exhaustion.
      if (isTransientCooldownError(loopErr)) {
        consecutiveCooldowns++;
        const retryAfterMs = getCooldownRetryAfterMs(loopErr);
        debugLog("autoLoop", {
          phase: "cooldown-wait",
          iteration,
          consecutiveCooldowns,
          retryAfterMs,
          error: msg,
        });
        if (consecutiveCooldowns > MAX_COOLDOWN_RETRIES) {
          ctx.ui.notify(
            `Autonomous mode stopped: ${consecutiveCooldowns} consecutive credential cooldowns — rate limit or quota may be persistently exhausted.`,
            "error",
          );
          await deps.stopAuto(
            ctx,
            pi,
            `${consecutiveCooldowns} consecutive credential cooldowns exceeded retry budget`,
          );
          // NOTE(review): breaks without finishTurn — the turn-result
          // observer never fires for cooldown-budget exhaustion.
          break;
        }
        // Honor a structured retry-after hint only when plausible (≤60s);
        // otherwise fall back to the fixed cooldown wait.
        const waitMs =
          retryAfterMs !== undefined &&
          retryAfterMs > 0 &&
          retryAfterMs <= 60_000
            ? retryAfterMs + 500 // Use structured hint + small buffer
            : COOLDOWN_FALLBACK_WAIT_MS;
        ctx.ui.notify(
          `Credentials in cooldown (${consecutiveCooldowns}/${MAX_COOLDOWN_RETRIES}) — waiting ${Math.round(waitMs / 1000)}s before retrying.`,
          "warning",
        );
        await new Promise((resolve) => setTimeout(resolve, waitMs));
        finishTurn("retry", "timeout", msg);
        continue; // Retry iteration without incrementing consecutiveErrors
      }
      consecutiveErrors++;
      recentErrorMessages.push(
        msg.length > 120 ? msg.slice(0, 120) + "..." : msg,
      );
      debugLog("autoLoop", {
        phase: "iteration-error",
        iteration,
        consecutiveErrors,
        error: msg,
      });
      if (consecutiveErrors >= 3) {
        // 3+ consecutive: hard stop — something is fundamentally broken
        const errorHistory = recentErrorMessages
          .map((m, i) => `  ${i + 1}. ${m}`)
          .join("\n");
        ctx.ui.notify(
          `Autonomous mode stopped: ${consecutiveErrors} consecutive iteration failures:\n${errorHistory}`,
          "error",
        );
        await deps.stopAuto(
          ctx,
          pi,
          `${consecutiveErrors} consecutive iteration failures`,
        );
        finishTurn("failed", "execution", msg);
        break;
      } else if (consecutiveErrors === 2) {
        // 2nd consecutive: try invalidating caches + re-deriving state
        ctx.ui.notify(
          `Iteration error (attempt ${consecutiveErrors}): ${msg}. Invalidating caches and retrying.`,
          "warning",
        );
        deps.invalidateAllCaches();
      } else {
        // 1st error: log and retry — transient failures happen
        ctx.ui.notify(`Iteration error: ${msg}. Retrying.`, "warning");
      }
      finishTurn("retry", "execution", msg);
    }
  }
  // Loop exited (stop, pause, or s.active cleared) — run the exit-time
  // solver eval hook and clear the module-level resolve handle.
  await runExitSolverEval(ctx, s, deps, iteration);
  _clearCurrentResolve();
  debugLog("autoLoop", { phase: "exit", totalIterations: iteration });
}
// ── Dispatch-contract entry points ───────────────────────────────────────
/**
 * UOK-kernel entry point required by the dispatch contract. The kernel loop
 * is the autonomous loop itself, so this delegates directly to autoLoop with
 * the same arguments and returns its promise.
 */
export async function runUokKernelLoop(ctx, pi, s, deps) {
  return autoLoop(ctx, pi, s, deps);
}