singularity-forge/src/resources/extensions/sf/auto/loop.js
2026-05-08 01:34:07 +02:00

1171 lines
39 KiB
JavaScript

/**
* auto/loop.ts — Main autonomous mode execution loop.
*
* Iterates: derive → dispatch → guards → runUnit → finalize → repeat.
* Exits when s.active becomes false or a terminal condition is reached.
*
* Imports from: auto/types, auto/resolve, auto/phases
*/
import { randomUUID } from "node:crypto";
import { mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
import { join } from "node:path";
import { getHeapStatistics } from "node:v8";
import { atomicWriteSync } from "../atomic-write.js";
import { ModelPolicyDispatchBlockedError } from "../auto-model-selection.js";
import { runAutomaticAutonomousSolverEval } from "../autonomous-solver-eval.js";
import { debugLog } from "../debug-logger.js";
import { resolveEngine } from "../engine-resolver.js";
import { sfRoot } from "../paths.js";
import {
  ExecutionGraphScheduler,
  scheduleSidecarQueue,
} from "../uok/execution-graph.js";
import { resolveUokFlags } from "../uok/flags.js";
import { logWarning } from "../workflow-logger.js";
import {
  COOLDOWN_FALLBACK_WAIT_MS,
  getCooldownRetryAfterMs,
  isInfrastructureError,
  isTransientCooldownError,
  MAX_COOLDOWN_RETRIES,
} from "./infra-errors.js";
import {
  runDispatch,
  runFinalize,
  runGuards,
  runPreDispatch,
  runUnitPhase,
} from "./phases.js";
import { _clearCurrentResolve } from "./resolve.js";
import { MAX_LOOP_ITERATIONS } from "./types.js";
// ── Stuck detection persistence (#3704) ──────────────────────────────────
// Persist stuck detection state to disk so it survives session restarts.
// Without this, restarting autonomous mode resets all counters, allowing the
// same blocked unit to burn a full retry budget each session.

/** Absolute path of the persisted stuck-detection state file. */
const stuckStatePath = (basePath) =>
  join(sfRoot(basePath), "runtime", "stuck-state.json");
/**
 * Read persisted stuck-detection counters from disk.
 *
 * Returns an empty state when the file is missing/unparseable, when the
 * stored PID is this process (same-session write, e.g. an earlier test run),
 * or when the stored PID is still alive (a concurrent session — loading its
 * counters would cause cross-session pollution, #sf-moqv5o7h-vaabu6).
 */
function loadStuckState(basePath) {
  const emptyState = () => ({ recentUnits: [], stuckRecoveryAttempts: 0 });
  try {
    const raw = readFileSync(stuckStatePath(basePath), "utf-8");
    const data = JSON.parse(raw);
    // Only load state written by a DIFFERENT process (real session restart).
    // A matching PID means an earlier autoLoop call in this same process
    // wrote it — not a crashed session — so skip it.
    if (data.pid === process.pid) return emptyState();
    // Probe the stored PID with signal 0. Alive → concurrent session, skip.
    // Dead → the prior session crashed or was killed; its state is safe.
    if (typeof data.pid === "number" && Number.isFinite(data.pid)) {
      let ownerAlive = false;
      try {
        process.kill(data.pid, 0);
        ownerAlive = true;
      } catch {
        // signal 0 rejected — the owning process no longer exists.
      }
      if (ownerAlive) return emptyState();
    }
    const recentUnits = Array.isArray(data.recentUnits)
      ? data.recentUnits
      : [];
    const stuckRecoveryAttempts =
      typeof data.stuckRecoveryAttempts === "number"
        ? data.stuckRecoveryAttempts
        : 0;
    return { recentUnits, stuckRecoveryAttempts };
  } catch (err) {
    debugLog("autoLoop", {
      phase: "load-stuck-state-failed",
      error: err instanceof Error ? err.message : String(err),
    });
    return emptyState();
  }
}
/**
 * Persist stuck-detection counters (tagged with the owning PID) to disk.
 *
 * Fix: write via atomicWriteSync — consistent with
 * saveCustomVerifyRetryCounts below — instead of a plain writeFileSync.
 * A crash mid-write previously could leave a truncated JSON file, which
 * loadStuckState would discard on the next start, silently resetting the
 * very retry budget this persistence exists to protect (#3704).
 *
 * @param {string} basePath workspace root
 * @param {{recentUnits: unknown[], stuckRecoveryAttempts: number}} state
 */
function saveStuckState(basePath, state) {
  try {
    const filePath = stuckStatePath(basePath);
    mkdirSync(join(sfRoot(basePath), "runtime"), { recursive: true });
    atomicWriteSync(
      filePath,
      JSON.stringify({
        pid: process.pid,
        recentUnits: state.recentUnits.slice(-20), // keep last 20 entries
        stuckRecoveryAttempts: state.stuckRecoveryAttempts,
        updatedAt: new Date().toISOString(),
      }) + "\n",
    );
  } catch (err) {
    // Best-effort persistence — never let a disk error kill the loop.
    debugLog("autoLoop", {
      phase: "save-stuck-state-failed",
      error: err instanceof Error ? err.message : String(err),
    });
  }
}
// ── Custom workflow verification retry persistence ───────────────────────
// Custom workflow verifiers can request a retry after a step runs. Persisting
// retry counts under the run directory prevents restart loops from resetting
// the retry budget and repeatedly dispatching the same failing step.
const MAX_CUSTOM_ENGINE_VERIFY_RETRIES = 3;

/** Directory holding custom-verify retry state: the active run dir when one
 * exists, otherwise the workspace-level runtime directory. */
function customVerifyRetryStateDir(s) {
  if (s.activeRunDir) {
    return join(s.activeRunDir, "runtime");
  }
  return join(sfRoot(s.basePath), "runtime");
}

/** Full path of the persisted custom-verify retry-count file. */
function customVerifyRetryStatePath(s) {
  return join(customVerifyRetryStateDir(s), "custom-verify-retries.json");
}
/**
 * Ensure the in-memory custom-verify retry map is populated.
 *
 * In-memory counts always win; only a cold (empty) map is hydrated from the
 * persisted JSON file. Invalid entries are skipped; read/parse failures are
 * logged and the (possibly empty) map is returned unchanged.
 */
function hydrateCustomVerifyRetryCounts(s) {
  const counts = s.verificationRetryCount;
  if (counts.size > 0) return counts;
  try {
    const raw = JSON.parse(
      readFileSync(customVerifyRetryStatePath(s), "utf-8"),
    );
    const hasCountsObject =
      raw &&
      typeof raw === "object" &&
      raw.counts &&
      typeof raw.counts === "object";
    const stored = hasCountsObject ? raw.counts : {};
    for (const [key, value] of Object.entries(stored)) {
      const usable =
        typeof value === "number" && Number.isFinite(value) && value > 0;
      if (usable) counts.set(key, Math.floor(value));
    }
  } catch (err) {
    debugLog("autoLoop", {
      phase: "load-custom-verify-retries-failed",
      error: err instanceof Error ? err.message : String(err),
    });
  }
  return counts;
}
/**
 * Persist (or clear) the custom-verify retry counts.
 *
 * An empty map deletes the state file; otherwise the counts are written
 * atomically. ENOENT (deleting an already-absent file) is the expected
 * no-op; any other failure is logged and swallowed — persistence is
 * best-effort and must not abort the loop.
 */
function saveCustomVerifyRetryCounts(s) {
  const counts = s.verificationRetryCount;
  const filePath = customVerifyRetryStatePath(s);
  try {
    if (counts.size === 0) {
      unlinkSync(filePath);
      return;
    }
    mkdirSync(customVerifyRetryStateDir(s), { recursive: true });
    const payload = {
      counts: Object.fromEntries(counts),
      updatedAt: new Date().toISOString(),
    };
    atomicWriteSync(filePath, JSON.stringify(payload) + "\n");
  } catch (err) {
    const code =
      err && typeof err === "object" && "code" in err ? err.code : undefined;
    if (code === "ENOENT") return;
    debugLog("autoLoop", {
      phase: "save-custom-verify-retries-failed",
      error: err instanceof Error ? err.message : String(err),
    });
  }
}
// ── Memory pressure monitoring (#3331) ──────────────────────────────────
// Check heap usage every N iterations and trigger graceful shutdown before
// the OS OOM killer sends SIGKILL. The threshold is 85% of the V8 heap
// limit (--max-old-space-size or default ~1.5-4GB depending on platform).
const MEMORY_CHECK_INTERVAL = 5; // check every 5 iterations
const MEMORY_PRESSURE_THRESHOLD = 0.85; // 85% of heap limit
/**
 * Sample current V8 heap usage against the real heap limit.
 *
 * Fix: this file is an ES module, so the previous `require("node:v8")`
 * always threw ReferenceError and the limit silently fell back to the
 * 4096MB default — a heap limit configured via --max-old-space-size was
 * never honored. Uses the statically imported getHeapStatistics instead.
 *
 * @returns {{pressured: boolean, heapMB: number, limitMB: number, pct: number}}
 *   pressured is true when heap usage exceeds MEMORY_PRESSURE_THRESHOLD of
 *   the limit; pct is heapMB / limitMB.
 */
function checkMemoryPressure() {
  const { heapUsed } = process.memoryUsage();
  const heapMB = Math.round(heapUsed / 1024 / 1024);
  let limitMB = 4096; // conservative default if v8 stats are unavailable
  try {
    limitMB = Math.round(getHeapStatistics().heap_size_limit / 1024 / 1024);
  } catch {
    /* v8 stats unavailable — keep conservative default */
  }
  const pct = heapMB / limitMB;
  return { pressured: pct > MEMORY_PRESSURE_THRESHOLD, heapMB, limitMB, pct };
}
/**
 * Tracks the dangling phase promise from the most recent timeout so the next
 * iteration can drain it before proceeding. Promise.race() rejects on timeout
 * but does not cancel the underlying async work; draining here prevents the
 * timed-out phase from mutating state concurrently with the next iteration.
 */
let _danglingPhasePromise = null;
/**
 * Race a phase function against a timeout.
 *
 * On timeout, rejects with an Error whose message is "phase-timeout:<name>"
 * (the blanket catch in autoLoop keys off that prefix) and parks the
 * still-running phase promise in _danglingPhasePromise so the caller can
 * drain it before starting a new iteration. The timer is always cleared.
 */
async function withPhaseTimeout(name, fn, timeoutMs) {
  const phasePromise = fn();
  let timerHandle;
  const timeoutPromise = new Promise((_resolve, reject) => {
    timerHandle = setTimeout(() => {
      reject(new Error(`phase-timeout:${name}`));
    }, timeoutMs);
  });
  try {
    const winner = await Promise.race([phasePromise, timeoutPromise]);
    return winner;
  } catch (err) {
    const timedOut =
      err instanceof Error && err.message.startsWith("phase-timeout:");
    if (timedOut) {
      _danglingPhasePromise = phasePromise;
    }
    throw err;
  } finally {
    if (timerHandle !== undefined) clearTimeout(timerHandle);
  }
}
// ── Dispatch contract helpers ─────────────────────────────────────────────
/** Unit types that execute as verification nodes in the execution graph. */
const VERIFICATION_UNIT_TYPES = new Set([
  "gate-evaluate",
  "validate-milestone",
  "run-uat",
  "complete-slice",
]);
/**
 * Map a unit (and optional sidecar item) to its execution-graph node kind.
 * Sidecar kinds take precedence; unrecognized combinations fall back to
 * the generic "unit" kind.
 */
function resolveDispatchNodeKind(unitType, sidecarItem) {
  switch (sidecarItem?.kind) {
    case "hook":
      return "hook";
    case "triage":
      return "verification";
    case "quick-task":
      return "team-worker";
    default:
      break;
  }
  if (unitType.startsWith("hook/")) return "hook";
  if (unitType === "reactive-execute") return "subagent";
  if (VERIFICATION_UNIT_TYPES.has(unitType)) return "verification";
  if (unitType === "replan-slice" || unitType === "reassess-roadmap") {
    return "reprocess";
  }
  return "unit";
}
/**
 * Run the unit phase through the execution-graph scheduler as a single node.
 *
 * Every node kind shares the same handler (one closure capturing the phase
 * arguments), so whichever kind resolveDispatchNodeKind picks executes the
 * same runUnitPhase call. If the scheduler somehow completes without the
 * handler firing, a "break" outcome is synthesized.
 */
async function runUnitPhaseViaContract(ic, iterData, loopState, sidecarItem) {
  const scheduler = new ExecutionGraphScheduler();
  let phaseOutcome = null;
  const executeUnit = async () => {
    phaseOutcome = await runUnitPhase(ic, iterData, loopState, sidecarItem);
  };
  const nodeKinds = [
    "unit",
    "hook",
    "subagent",
    "team-worker",
    "verification",
    "reprocess",
  ];
  for (const kind of nodeKinds) {
    scheduler.registerHandler(kind, executeUnit);
  }
  const node = {
    id: `dispatch:${ic.iteration}:${iterData.unitType}:${iterData.unitId}`,
    kind: resolveDispatchNodeKind(iterData.unitType, sidecarItem),
    dependsOn: [],
    metadata: { unitType: iterData.unitType, unitId: iterData.unitId },
  };
  await scheduler.run([node], { parallel: false, maxWorkers: 1 });
  return (
    phaseOutcome ?? {
      action: "break",
      reason: "scheduler-dispatch-missing-result",
    }
  );
}
/**
 * Throttle dispatches to the configured minimum inter-request interval.
 * No-op when the preference is unset/zero or no prior request timestamp
 * exists; otherwise sleeps for the remaining portion of the interval.
 */
async function enforceMinRequestInterval(s, prefs) {
  const minInterval = prefs?.min_request_interval_ms ?? 0;
  if (minInterval <= 0) return;
  if (s.lastRequestTimestamp <= 0) return;
  const elapsed = Math.max(0, Date.now() - s.lastRequestTimestamp);
  const waitMs = minInterval - elapsed;
  if (waitMs <= 0) return;
  debugLog("autoLoop", { phase: "rate-limit-wait", waitMs });
  await new Promise((resolve) => setTimeout(resolve, waitMs));
}
/**
 * Best-effort post-exit hook: run the automatic autonomous solver eval and
 * surface its outcome in the UI. Journal events it emits are grouped under a
 * fresh flowId with a monotonically increasing seq. Any failure is reported
 * as a warning and logged — it never propagates to the caller.
 */
async function runExitSolverEval(ctx, s, deps, iteration) {
  try {
    const supervisor =
      deps.loadEffectiveSFPreferences()?.preferences?.auto_supervisor;
    const flowId = randomUUID();
    let seq = 0;
    // Stamp flow/seq onto every journal event unless the event already
    // carries its own values.
    const emitJournalEvent = (event) =>
      deps.emitJournalEvent({
        ...event,
        flowId: event.flowId ?? flowId,
        seq: event.seq ?? ++seq,
      });
    const result = await runAutomaticAutonomousSolverEval({
      basePath: s.basePath,
      enabled: supervisor?.solver_eval_on_autonomous_exit !== false,
      reason: "autonomous-exit",
      emitJournalEvent,
    });
    if (!result.ok) {
      ctx.ui.notify(
        `Autonomous solver eval did not record: ${result.error}`,
        "warning",
      );
    } else if (result.report?.dbRecorded) {
      ctx.ui.notify(
        `Autonomous solver eval recorded: ${result.report.reportPath}`,
        "info",
      );
    } else if (result.report) {
      ctx.ui.notify(
        `Autonomous solver eval wrote ${result.report.reportPath}, but DB evidence was not recorded.`,
        "warning",
      );
    }
    debugLog("autoLoop", {
      phase: "solver-eval-auto",
      iteration,
      ok: result.ok,
      skipped: result.skipped,
      runId: result.report?.runId,
      error: result.error,
    });
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    ctx.ui.notify(`Autonomous solver eval hook failed: ${message}`, "warning");
    debugLog("autoLoop", {
      phase: "solver-eval-auto-failed",
      iteration,
      error: message,
    });
  }
}
/**
 * Main autonomous mode execution loop. Iterates: derive → dispatch → guards →
 * runUnit → finalize → repeat. Exits when s.active becomes false or a
 * terminal condition is reached.
 *
 * This is the linear replacement for the recursive
 * dispatchNextUnit → handleAgentEnd → dispatchNextUnit chain.
 *
 * @param ctx  Extension context; used here for ctx.ui.notify messages.
 * @param pi   Handle forwarded unchanged to deps.stopAuto / deps.pauseAuto.
 * @param s    Mutable session state read/written here: active, basePath,
 *             cmdCtx, sidecarQueue, activeEngineId, activeRunDir,
 *             verificationRetryCount, currentTraceId/currentTurnId,
 *             lastRequestTimestamp, currentMilestoneId, currentUnit.
 * @param deps Injected collaborators: journal emitter, uokObserver, state
 *             derivation, session-lock validation, health gate, progress
 *             widget, cache invalidation, stop/pause controls, timeouts.
 */
export async function autoLoop(ctx, pi, s, deps) {
  debugLog("autoLoop", { phase: "enter" });
  let iteration = 0;
  // Load persisted stuck state so counters survive session restarts (#3704)
  const persisted = loadStuckState(s.basePath);
  const loopState = {
    recentUnits: persisted.recentUnits,
    stuckRecoveryAttempts: persisted.stuckRecoveryAttempts,
    consecutiveFinalizeTimeouts: 0,
  };
  // Graduated error recovery: reset on any successful iteration; the 2nd
  // consecutive error invalidates caches, the 3rd hard-stops the session.
  let consecutiveErrors = 0;
  let consecutiveCooldowns = 0;
  const recentErrorMessages = [];
  while (s.active) {
    iteration++;
    debugLog("autoLoop", { phase: "loop-top", iteration });
    // ── Journal: per-iteration flow grouping ──
    const flowId = randomUUID();
    let seqCounter = 0;
    const nextSeq = () => ++seqCounter;
    const turnId = randomUUID();
    s.currentTraceId = flowId;
    s.currentTurnId = turnId;
    const turnStartedAt = new Date().toISOString();
    let observedUnitType;
    let observedUnitId;
    let turnFinished = false;
    // Reports the turn result at most once per iteration; later calls are
    // no-ops, so every exit path may call this without double-reporting.
    const finishTurn = (status, failureClass = "none", error) => {
      if (turnFinished) return;
      turnFinished = true;
      deps.uokObserver?.onTurnResult({
        traceId: flowId,
        turnId,
        iteration,
        unitType: observedUnitType,
        unitId: observedUnitId,
        status,
        failureClass,
        phaseResults: [],
        error,
        startedAt: turnStartedAt,
        finishedAt: new Date().toISOString(),
      });
      s.currentTraceId = null;
      s.currentTurnId = null;
    };
    deps.uokObserver?.onTurnStart({
      traceId: flowId,
      turnId,
      iteration,
      basePath: s.basePath,
      startedAt: turnStartedAt,
    });
    if (iteration > MAX_LOOP_ITERATIONS) {
      debugLog("autoLoop", {
        phase: "exit",
        reason: "max-iterations",
        iteration,
      });
      await deps.stopAuto(
        ctx,
        pi,
        `Safety: loop exceeded ${MAX_LOOP_ITERATIONS} iterations — possible runaway`,
      );
      finishTurn("stopped", "manual-attention", "max-iterations");
      break;
    }
    // ── Memory pressure check (#3331) ──
    // Graceful shutdown before OOM killer sends SIGKILL.
    if (iteration % MEMORY_CHECK_INTERVAL === 0) {
      const mem = checkMemoryPressure();
      debugLog("autoLoop", { phase: "memory-check", ...mem });
      if (mem.pressured) {
        logWarning(
          "dispatch",
          `Memory pressure: ${mem.heapMB}MB / ${mem.limitMB}MB (${Math.round(mem.pct * 100)}%) — stopping autonomous mode to prevent OOM kill`,
        );
        await deps.stopAuto(
          ctx,
          pi,
          `Memory pressure: heap at ${mem.heapMB}MB / ${mem.limitMB}MB (${Math.round(mem.pct * 100)}%). ` +
            `Stopping gracefully to prevent OOM kill after ${iteration} iterations. ` +
            `Resume with /autonomous to continue from where you left off.`,
        );
        finishTurn("stopped", "timeout", "memory-pressure");
        break;
      }
    }
    if (!s.cmdCtx) {
      debugLog("autoLoop", { phase: "exit", reason: "no-cmdCtx" });
      finishTurn("stopped", "manual-attention", "missing-command-context");
      break;
    }
    // ── Drain any dangling phase promise before starting new work ──
    // Promise.race() on timeout does not cancel the underlying async fn; that
    // fn keeps running and may mutate state after the loop has advanced.
    // Awaiting its completion here ensures no concurrent state writes.
    if (_danglingPhasePromise !== null) {
      const dangling = _danglingPhasePromise;
      _danglingPhasePromise = null;
      try {
        await dangling;
      } catch {
        /* ignore — result is irrelevant */
      }
    }
    try {
      // ── Blanket try/catch: one bad iteration must not kill the session
      const prefs = deps.loadEffectiveSFPreferences()?.preferences;
      const uokFlags = resolveUokFlags(prefs);
      const phaseTimeoutMs =
        (prefs?.auto_supervisor?.phase_timeout_minutes ?? 10) * 60_000;
      // ── Check sidecar queue before deriveState ──
      let sidecarItem;
      if (s.sidecarQueue.length > 0) {
        if (uokFlags.executionGraph && s.sidecarQueue.length > 1) {
          try {
            s.sidecarQueue = await scheduleSidecarQueue(s.sidecarQueue);
          } catch (err) {
            logWarning(
              "dispatch",
              `sidecar queue scheduling failed: ${err instanceof Error ? err.message : String(err)}`,
            );
          }
        }
        sidecarItem = s.sidecarQueue.shift();
        debugLog("autoLoop", {
          phase: "sidecar-dequeue",
          kind: sidecarItem.kind,
          unitType: sidecarItem.unitType,
          unitId: sidecarItem.unitId,
        });
        deps.emitJournalEvent({
          ts: new Date().toISOString(),
          flowId,
          seq: nextSeq(),
          eventType: "sidecar-dequeue",
          data: {
            kind: sidecarItem.kind,
            unitType: sidecarItem.unitType,
            unitId: sidecarItem.unitId,
          },
        });
      }
      const sessionLockBase = deps.lockBase();
      if (sessionLockBase) {
        const lockStatus = deps.validateSessionLock(sessionLockBase);
        if (!lockStatus.valid) {
          debugLog("autoLoop", {
            phase: "session-lock-invalid",
            reason: lockStatus.failureReason ?? "unknown",
            existingPid: lockStatus.existingPid,
            expectedPid: lockStatus.expectedPid,
          });
          deps.handleLostSessionLock(ctx, lockStatus);
          debugLog("autoLoop", {
            phase: "exit",
            reason: "session-lock-lost",
            detail: lockStatus.failureReason ?? "unknown",
          });
          // NOTE(review): this break path does not call finishTurn — no
          // turn-result is reported for a lost session lock; confirm
          // whether that is intentional.
          break;
        }
      }
      const ic = {
        ctx,
        pi,
        s,
        deps,
        prefs,
        iteration,
        flowId,
        nextSeq,
      };
      deps.emitJournalEvent({
        ts: new Date().toISOString(),
        flowId,
        seq: nextSeq(),
        eventType: "iteration-start",
        data: { iteration },
      });
      let iterData;
      // ── Custom engine path ──────────────────────────────────────────────
      // When activeEngineId is a non-dev value, bypass runPreDispatch and
      // runDispatch entirely — the custom engine drives its own state via
      // GRAPH.yaml. Shares runGuards and runUnitPhase with the dev path.
      // After unit execution, verifies then reconciles via the engine layer.
      //
      // SF_ENGINE_BYPASS=1 skips the engine layer entirely — falls through
      // to the dev path below.
      if (
        s.activeEngineId != null &&
        s.activeEngineId !== "dev" &&
        !sidecarItem &&
        process.env.SF_ENGINE_BYPASS !== "1"
      ) {
        debugLog("autoLoop", {
          phase: "custom-engine-derive",
          iteration,
          engineId: s.activeEngineId,
        });
        const { engine, policy } = resolveEngine({
          activeEngineId: s.activeEngineId,
          activeRunDir: s.activeRunDir,
        });
        const engineState = await engine.deriveState(s.basePath);
        if (engineState.isComplete) {
          await deps.stopAuto(ctx, pi, "Workflow complete");
          break;
        }
        debugLog("autoLoop", { phase: "custom-engine-dispatch", iteration });
        const dispatch = await engine.resolveDispatch(engineState, {
          basePath: s.basePath,
        });
        if (dispatch.action === "stop") {
          await deps.stopAuto(ctx, pi, dispatch.reason ?? "Engine stopped");
          break;
        }
        if (dispatch.action === "skip") {
          // NOTE(review): skips to the next iteration without finishTurn —
          // the onTurnStart above gets no matching onTurnResult here.
          continue;
        }
        // dispatch.action === "dispatch"
        const step = dispatch.step;
        const sfState = await deps.deriveState(s.basePath);
        iterData = {
          unitType: step.unitType,
          unitId: step.unitId,
          prompt: step.prompt,
          finalPrompt: step.prompt,
          pauseAfterUatDispatch: false,
          state: sfState,
          mid: s.currentMilestoneId ?? "workflow",
          midTitle: "Workflow",
          isRetry: false,
          previousTier: undefined,
        };
        observedUnitType = iterData.unitType;
        observedUnitId = iterData.unitId;
        // ── Progress widget (mirrors dev path in runDispatch) ──
        deps.updateProgressWidget(
          ctx,
          iterData.unitType,
          iterData.unitId,
          iterData.state,
        );
        // ── Guards (shared with dev path) ──
        const guardsResult = await runGuards(
          ic,
          s.currentMilestoneId ?? "workflow",
          iterData.unitType,
          iterData.unitId,
          iterData.state?.activeSlice?.id,
        );
        deps.uokObserver?.onPhaseResult("guard", guardsResult.action, {
          unitType: iterData.unitType,
          unitId: iterData.unitId,
        });
        if (guardsResult.action === "break") {
          finishTurn("stopped", "manual-attention", "guard-break");
          break;
        }
        // ── Unit execution (shared with dev path) ──
        await enforceMinRequestInterval(s, ic.prefs);
        const unitPhaseResult = await runUnitPhaseViaContract(
          ic,
          iterData,
          loopState,
        );
        if (unitPhaseResult.action === "next") {
          // Record when the request actually went out so the next
          // enforceMinRequestInterval call can throttle against it.
          const d = unitPhaseResult.data;
          const requestTimestamp = d?.requestDispatchedAt ?? d?.unitStartedAt;
          if (typeof requestTimestamp === "number")
            s.lastRequestTimestamp = requestTimestamp;
        }
        deps.uokObserver?.onPhaseResult("unit", unitPhaseResult.action, {
          unitType: iterData.unitType,
          unitId: iterData.unitId,
        });
        if (unitPhaseResult.action === "break") {
          finishTurn("stopped", "execution", "unit-break");
          break;
        }
        if (unitPhaseResult.action === "continue") {
          finishTurn("retry");
          continue;
        }
        // ── Verify first, then reconcile (only mark complete on pass) ──
        debugLog("autoLoop", {
          phase: "custom-engine-verify",
          iteration,
          unitId: iterData.unitId,
        });
        const verifyResult = await policy.verify(
          iterData.unitType,
          iterData.unitId,
          { basePath: s.basePath },
        );
        if (verifyResult === "pause") {
          await deps.pauseAuto(ctx, pi);
          deps.uokObserver?.onPhaseResult("custom-engine", "pause", {
            unitType: iterData.unitType,
            unitId: iterData.unitId,
          });
          finishTurn(
            "paused",
            "manual-attention",
            "custom-engine-verify-pause",
          );
          break;
        }
        if (verifyResult === "retry") {
          // Bounded retry budget persisted under the run dir so restarts
          // cannot reset it (see hydrate/saveCustomVerifyRetryCounts).
          const recoveryKey = `${iterData.unitType}/${iterData.unitId}`;
          const retryCounts = hydrateCustomVerifyRetryCounts(s);
          const attempts = (retryCounts.get(recoveryKey) ?? 0) + 1;
          retryCounts.set(recoveryKey, attempts);
          saveCustomVerifyRetryCounts(s);
          debugLog("autoLoop", {
            phase: "custom-engine-verify-retry",
            iteration,
            unitId: iterData.unitId,
            attempts,
          });
          deps.uokObserver?.onPhaseResult("custom-engine", "retry", {
            unitType: iterData.unitType,
            unitId: iterData.unitId,
            attempts,
          });
          if (attempts > MAX_CUSTOM_ENGINE_VERIFY_RETRIES) {
            // Budget exhausted — ask the policy how to recover.
            const recovery = await policy.recover(
              iterData.unitType,
              iterData.unitId,
              { basePath: s.basePath },
            );
            if (recovery.outcome === "pause") {
              await deps.pauseAuto(ctx, pi);
              finishTurn(
                "paused",
                "manual-attention",
                recovery.reason ?? "custom-engine-verify-retry-exhausted",
              );
              break;
            }
            if (recovery.outcome === "skip") {
              await deps.stopAuto(
                ctx,
                pi,
                recovery.reason ??
                  `Custom workflow verification for ${iterData.unitId} requested skip after retry exhaustion, but the custom engine cannot reconcile skipped steps.`,
              );
              finishTurn(
                "stopped",
                "manual-attention",
                "custom-engine-verify-retry-exhausted",
              );
              break;
            }
            const exhaustedReason = `Custom workflow verification for ${iterData.unitId} requested retry ${attempts} times without passing.`;
            await deps.stopAuto(
              ctx,
              pi,
              recovery.outcome === "stop" && recovery.reason
                ? recovery.reason
                : exhaustedReason,
            );
            finishTurn(
              "stopped",
              "manual-attention",
              "custom-engine-verify-retry-exhausted",
            );
            break;
          }
          finishTurn("retry");
          continue;
        }
        // Verification passed — mark step complete
        s.verificationRetryCount.delete(
          `${iterData.unitType}/${iterData.unitId}`,
        );
        saveCustomVerifyRetryCounts(s);
        debugLog("autoLoop", {
          phase: "custom-engine-reconcile",
          iteration,
          unitId: iterData.unitId,
        });
        const reconcileResult = await engine.reconcile(engineState, {
          unitType: iterData.unitType,
          unitId: iterData.unitId,
          startedAt: s.currentUnit?.startedAt ?? Date.now(),
          finishedAt: Date.now(),
        });
        deps.clearUnitTimeout();
        // Successful iteration — reset the graduated-recovery counters.
        consecutiveErrors = 0;
        consecutiveCooldowns = 0;
        recentErrorMessages.length = 0;
        deps.emitJournalEvent({
          ts: new Date().toISOString(),
          flowId,
          seq: nextSeq(),
          eventType: "iteration-end",
          data: { iteration },
        });
        saveStuckState(s.basePath, loopState); // persist across session restarts (#3704)
        debugLog("autoLoop", { phase: "iteration-complete", iteration });
        if (reconcileResult.outcome === "milestone-complete") {
          await deps.stopAuto(ctx, pi, "Workflow complete");
          deps.uokObserver?.onPhaseResult(
            "custom-engine",
            "milestone-complete",
            {
              unitType: iterData.unitType,
              unitId: iterData.unitId,
            },
          );
          finishTurn("completed");
          break;
        }
        if (reconcileResult.outcome === "pause") {
          await deps.pauseAuto(ctx, pi);
          deps.uokObserver?.onPhaseResult("custom-engine", "pause", {
            unitType: iterData.unitType,
            unitId: iterData.unitId,
          });
          finishTurn("paused", "manual-attention");
          break;
        }
        if (reconcileResult.outcome === "stop") {
          await deps.stopAuto(
            ctx,
            pi,
            reconcileResult.reason ?? "Engine stopped",
          );
          deps.uokObserver?.onPhaseResult("custom-engine", "stop", {
            unitType: iterData.unitType,
            unitId: iterData.unitId,
            reason: reconcileResult.reason,
          });
          finishTurn("stopped", "manual-attention", reconcileResult.reason);
          break;
        }
        deps.uokObserver?.onPhaseResult("custom-engine", "continue", {
          unitType: iterData.unitType,
          unitId: iterData.unitId,
        });
        finishTurn("completed");
        continue;
      }
      if (!sidecarItem) {
        // ── P4-A: Doctor issues → reassess escalation ─────────────────────
        // If the health gate detects issues that mention slice IDs (state
        // inconsistencies that reassessment can fix), queue reassess instead
        // of pausing autonomous mode. This runs separately from the gate inside
        // runPreDispatch so we can intercept *before* the break path.
        try {
          const healthCheck = await deps.preDispatchHealthGate(s.basePath);
          if (
            !healthCheck.proceed &&
            healthCheck.issues &&
            healthCheck.issues.length > 0
          ) {
            // Slice IDs look like "S12" — a standalone capital S + digits.
            const sliceRefPattern = /\bS\d+\b/;
            const hasSliceRef = healthCheck.issues.some((issue) =>
              sliceRefPattern.test(issue),
            );
            if (hasSliceRef) {
              const sfState = await deps.deriveState(s.basePath);
              const mid = sfState.activeMilestone?.id;
              const midTitle = sfState.activeMilestone?.title ?? "";
              const sliceId = sfState.activeSlice?.id ?? "reassess";
              if (mid) {
                ctx.ui.notify(
                  `Health issues detected with slice references — queuing reassess-roadmap instead of pausing.`,
                  "warning",
                );
                const { buildReassessRoadmapPrompt } = await import(
                  "../auto-prompts.js"
                );
                const reassessPrompt = await buildReassessRoadmapPrompt(
                  mid,
                  midTitle,
                  sliceId,
                  s.basePath,
                );
                // Front of the sidecar queue so reassessment runs next.
                s.sidecarQueue.unshift({
                  kind: "hook",
                  unitType: "reassess-roadmap",
                  unitId: `${mid}/${sliceId}`,
                  prompt: `## Doctor Health Issues\n\n${healthCheck.issues.map((i) => `- ${i}`).join("\n")}\n\n${reassessPrompt}`,
                });
                finishTurn("retry");
                continue;
              }
            }
          }
        } catch {
          // Non-fatal — fall through to normal runPreDispatch path
        }
        // ── Phase 1: Pre-dispatch ─────────────────────────────────────────
        const preDispatchResult = await withPhaseTimeout(
          "preDispatch",
          () => runPreDispatch(ic, loopState),
          phaseTimeoutMs / 2,
        );
        deps.uokObserver?.onPhaseResult(
          "pre-dispatch",
          preDispatchResult.action,
        );
        if (preDispatchResult.action === "break") {
          finishTurn("stopped", "manual-attention", "pre-dispatch-break");
          break;
        }
        if (preDispatchResult.action === "continue") {
          finishTurn("skipped");
          continue;
        }
        const preData = preDispatchResult.data;
        // ── Phase 2: Dispatch ─────────────────────────────────────────────
        const dispatchResult = await withPhaseTimeout(
          "dispatch",
          () => runDispatch(ic, preData, loopState),
          phaseTimeoutMs,
        );
        deps.uokObserver?.onPhaseResult("dispatch", dispatchResult.action);
        if (dispatchResult.action === "break") {
          finishTurn("stopped", "manual-attention", "dispatch-break");
          break;
        }
        if (dispatchResult.action === "continue") {
          finishTurn("skipped");
          continue;
        }
        iterData = dispatchResult.data;
        observedUnitType = iterData.unitType;
        observedUnitId = iterData.unitId;
        // ── Phase 3: Guards ───────────────────────────────────────────────
        const guardsResult = await runGuards(
          ic,
          iterData.mid ?? preData.mid ?? "workflow",
          iterData.unitType,
          iterData.unitId,
          iterData.state?.activeSlice?.id,
        );
        deps.uokObserver?.onPhaseResult("guard", guardsResult.action);
        if (guardsResult.action === "break") {
          finishTurn("stopped", "manual-attention", "guard-break");
          break;
        }
      } else {
        // ── Sidecar path: use values from the sidecar item directly ──
        const sidecarState = await deps.deriveState(s.basePath);
        iterData = {
          unitType: sidecarItem.unitType,
          unitId: sidecarItem.unitId,
          prompt: sidecarItem.prompt,
          finalPrompt: sidecarItem.prompt,
          pauseAfterUatDispatch: false,
          state: sidecarState,
          mid: sidecarState.activeMilestone?.id,
          midTitle: sidecarState.activeMilestone?.title,
          isRetry: false,
          previousTier: undefined,
        };
        observedUnitType = iterData.unitType;
        observedUnitId = iterData.unitId;
        deps.uokObserver?.onPhaseResult("dispatch", "sidecar", {
          unitType: iterData.unitType,
          unitId: iterData.unitId,
          sidecarKind: sidecarItem.kind,
        });
      }
      // ── Phase 4: Unit execution ─────────────────────────────────────────
      await enforceMinRequestInterval(s, ic.prefs);
      const unitPhaseResult = await runUnitPhaseViaContract(
        ic,
        iterData,
        loopState,
        sidecarItem,
      );
      if (unitPhaseResult.action === "next") {
        // Record when the request actually went out so the next
        // enforceMinRequestInterval call can throttle against it.
        const d = unitPhaseResult.data;
        const requestTimestamp = d?.requestDispatchedAt ?? d?.unitStartedAt;
        if (typeof requestTimestamp === "number")
          s.lastRequestTimestamp = requestTimestamp;
      }
      deps.uokObserver?.onPhaseResult("unit", unitPhaseResult.action, {
        unitType: iterData.unitType,
        unitId: iterData.unitId,
      });
      if (unitPhaseResult.action === "break") {
        finishTurn("stopped", "execution", "unit-break");
        break;
      }
      if (unitPhaseResult.action === "continue") {
        finishTurn("retry");
        continue;
      }
      // ── Phase 5: Finalize ───────────────────────────────────────────────
      const finalizeResult = await withPhaseTimeout(
        "finalize",
        () => runFinalize(ic, iterData, loopState, sidecarItem),
        phaseTimeoutMs,
      );
      deps.uokObserver?.onPhaseResult("finalize", finalizeResult.action, {
        unitType: iterData.unitType,
        unitId: iterData.unitId,
      });
      if (finalizeResult.action === "break") {
        const finalizeFailureClass =
          finalizeResult.reason === "git-closeout-failure" ? "git" : "closeout";
        finishTurn("stopped", finalizeFailureClass, "finalize-break");
        break;
      }
      if (finalizeResult.action === "continue") {
        finishTurn("retry");
        continue;
      }
      consecutiveErrors = 0; // Iteration completed successfully
      consecutiveCooldowns = 0;
      recentErrorMessages.length = 0;
      deps.emitJournalEvent({
        ts: new Date().toISOString(),
        flowId,
        seq: nextSeq(),
        eventType: "iteration-end",
        data: { iteration },
      });
      saveStuckState(s.basePath, loopState); // persist across session restarts (#3704)
      debugLog("autoLoop", { phase: "iteration-complete", iteration });
      finishTurn("completed");
    } catch (loopErr) {
      // ── Blanket catch: absorb unexpected exceptions, apply graduated recovery ──
      const msg = loopErr instanceof Error ? loopErr.message : String(loopErr);
      debugLog("autoLoop", {
        phase: "iteration-error",
        message: msg,
        stack: loopErr instanceof Error ? loopErr.stack : undefined,
      });
      // Always emit iteration-end on error so the journal records iteration
      // completion even on failure (#2344). Without this, errors in
      // runFinalize leave the journal incomplete, making diagnosis harder.
      deps.emitJournalEvent({
        ts: new Date().toISOString(),
        flowId,
        seq: nextSeq(),
        eventType: "iteration-end",
        data: { iteration, error: msg },
      });
      // ── Pre-send model-policy block: not a retryable error (#4959 / #4850) ──
      // The model-policy gate runs before the prompt is sent. When every
      // candidate model is denied (cross-provider disabled + flat-rate
      // baseline + tool-policy denial), retrying the same unit produces the
      // same denial — burning the consecutive-error budget toward a 3-strike
      // hard stop and corrupting autonomous mode state. Pause for user attention
      // instead, with the per-model deny reasons surfaced from the typed error.
      if (loopErr instanceof ModelPolicyDispatchBlockedError) {
        debugLog("autoLoop", {
          phase: "model-policy-blocked",
          iteration,
          unitType: loopErr.unitType,
          unitId: loopErr.unitId,
          reasons: loopErr.reasons,
        });
        ctx.ui.notify(
          `Autonomous mode paused: model-policy denied dispatch for ${loopErr.unitType}/${loopErr.unitId}. ${msg}`,
          "error",
        );
        deps.emitJournalEvent({
          ts: new Date().toISOString(),
          flowId,
          seq: nextSeq(),
          eventType: "unit-end",
          data: {
            unitType: loopErr.unitType,
            unitId: loopErr.unitId,
            status: "blocked",
            reason: "model-policy-dispatch-blocked",
            reasons: loopErr.reasons,
          },
        });
        // Carry the blocked unit identity into the turn-result observer:
        // the throw originated inside dispatch, so observedUnitType/Id were
        // not assigned by the success path — but the typed error already names
        // the unit (#4959 / CodeRabbit).
        observedUnitType = loopErr.unitType;
        observedUnitId = loopErr.unitId;
        await deps.pauseAuto(ctx, pi);
        finishTurn("paused", "manual-attention", msg);
        // Do NOT increment consecutiveErrors — the failure is configuration,
        // not a transient runtime fault.
        break;
      }
      // ── Infrastructure errors: immediate stop, no retry ──
      // These are unrecoverable (disk full, OOM, etc.). Retrying just burns
      // LLM budget on guaranteed failures.
      const infraCode = isInfrastructureError(loopErr);
      if (infraCode) {
        debugLog("autoLoop", {
          phase: "infrastructure-error",
          iteration,
          code: infraCode,
          error: msg,
        });
        // NOTE(review): `${infraCode}${msg}` concatenates code and message
        // with no separator — possibly a missing ": "; confirm intended.
        ctx.ui.notify(
          `Autonomous mode stopped: infrastructure error ${infraCode}${msg}`,
          "error",
        );
        await deps.stopAuto(
          ctx,
          pi,
          `Infrastructure error (${infraCode}): not recoverable by retry`,
        );
        finishTurn("failed", "execution", msg);
        break;
      }
      // ── Phase timeout: log, increment counter, continue ──
      if (msg.startsWith("phase-timeout:")) {
        const phaseName = msg.slice("phase-timeout:".length);
        loopState.consecutiveFinalizeTimeouts++;
        ctx.ui.notify(
          `Phase "${phaseName}" timed out (${loopState.consecutiveFinalizeTimeouts} consecutive) — skipping iteration and continuing.`,
          "warning",
        );
        debugLog("autoLoop", {
          phase: "phase-timeout",
          phaseName,
          consecutiveFinalizeTimeouts: loopState.consecutiveFinalizeTimeouts,
          iteration,
        });
        finishTurn("retry", "timeout", msg);
        continue;
      }
      // ── Credential cooldown: wait and retry with bounded budget ──
      // A 429 triggers a 30s credential backoff in AuthStorage. If the SDK's
      // getApiKey() retries couldn't outlast the window, the error surfaces
      // here. Wait for the cooldown to clear rather than counting it as a
      // consecutive failure — but cap retries so we don't spin for hours
      // on persistent quota exhaustion.
      if (isTransientCooldownError(loopErr)) {
        consecutiveCooldowns++;
        const retryAfterMs = getCooldownRetryAfterMs(loopErr);
        debugLog("autoLoop", {
          phase: "cooldown-wait",
          iteration,
          consecutiveCooldowns,
          retryAfterMs,
          error: msg,
        });
        if (consecutiveCooldowns > MAX_COOLDOWN_RETRIES) {
          ctx.ui.notify(
            `Autonomous mode stopped: ${consecutiveCooldowns} consecutive credential cooldowns — rate limit or quota may be persistently exhausted.`,
            "error",
          );
          await deps.stopAuto(
            ctx,
            pi,
            `${consecutiveCooldowns} consecutive credential cooldowns exceeded retry budget`,
          );
          // NOTE(review): breaks without finishTurn — the turn-result
          // observer never fires for cooldown-budget exhaustion.
          break;
        }
        // Honor a structured retry-after hint only when plausible (≤60s);
        // otherwise fall back to the fixed cooldown wait.
        const waitMs =
          retryAfterMs !== undefined &&
          retryAfterMs > 0 &&
          retryAfterMs <= 60_000
            ? retryAfterMs + 500 // Use structured hint + small buffer
            : COOLDOWN_FALLBACK_WAIT_MS;
        ctx.ui.notify(
          `Credentials in cooldown (${consecutiveCooldowns}/${MAX_COOLDOWN_RETRIES}) — waiting ${Math.round(waitMs / 1000)}s before retrying.`,
          "warning",
        );
        await new Promise((resolve) => setTimeout(resolve, waitMs));
        finishTurn("retry", "timeout", msg);
        continue; // Retry iteration without incrementing consecutiveErrors
      }
      consecutiveErrors++;
      recentErrorMessages.push(
        msg.length > 120 ? msg.slice(0, 120) + "..." : msg,
      );
      debugLog("autoLoop", {
        phase: "iteration-error",
        iteration,
        consecutiveErrors,
        error: msg,
      });
      if (consecutiveErrors >= 3) {
        // 3+ consecutive: hard stop — something is fundamentally broken
        const errorHistory = recentErrorMessages
          .map((m, i) => `  ${i + 1}. ${m}`)
          .join("\n");
        ctx.ui.notify(
          `Autonomous mode stopped: ${consecutiveErrors} consecutive iteration failures:\n${errorHistory}`,
          "error",
        );
        await deps.stopAuto(
          ctx,
          pi,
          `${consecutiveErrors} consecutive iteration failures`,
        );
        finishTurn("failed", "execution", msg);
        break;
      } else if (consecutiveErrors === 2) {
        // 2nd consecutive: try invalidating caches + re-deriving state
        ctx.ui.notify(
          `Iteration error (attempt ${consecutiveErrors}): ${msg}. Invalidating caches and retrying.`,
          "warning",
        );
        deps.invalidateAllCaches();
      } else {
        // 1st error: log and retry — transient failures happen
        ctx.ui.notify(`Iteration error: ${msg}. Retrying.`, "warning");
      }
      finishTurn("retry", "execution", msg);
    }
  }
  // Loop exited (stop, pause, or s.active cleared) — run the exit-time
  // solver eval hook and clear the module-level resolve handle.
  await runExitSolverEval(ctx, s, deps, iteration);
  _clearCurrentResolve();
  debugLog("autoLoop", { phase: "exit", totalIterations: iteration });
}
// ── Dispatch-contract entry points ───────────────────────────────────────
/**
 * UOK-kernel entry point required by the dispatch contract. The kernel loop
 * is the autonomous loop itself, so this delegates directly to autoLoop with
 * the same arguments and returns its promise.
 */
export async function runUokKernelLoop(ctx, pi, s, deps) {
  return autoLoop(ctx, pi, s, deps);
}