/**
 * Timeout recovery logic for autonomous mode units.
 * Handles idle and hard timeout recovery with escalation, steering messages,
 * and blocker placeholder generation.
 */
import { existsSync, readFileSync } from "node:fs";
import { relative } from "node:path";

import { resolveAgentEnd } from "./auto-loop.js";
import {
  diagnoseExpectedArtifact,
  resolveExpectedArtifactPath,
  verifyExpectedArtifact,
  writeBlockerPlaceholder,
} from "./auto-recovery.js";
import { parseRoadmap } from "./parsers.js";
import {
  relMilestoneFile,
  relSliceFile,
  resolveMilestoneFile,
  resolveSliceFile,
} from "./paths.js";
import { getSlice, isDbAvailable } from "./sf-db.js";
import { parseUnitId } from "./unit-id.js";
import {
  formatExecuteTaskRecoveryStatus,
  inspectExecuteTaskDurability,
  readUnitRuntimeRecord,
  writeUnitRuntimeRecord,
} from "./uok/unit-runtime.js";
/**
 * Express `path` relative to `basePath` when the result stays inside the
 * base directory; otherwise fall back to the original path unchanged.
 *
 * @param {string} basePath - Directory to relativize against.
 * @param {string} path - Absolute or arbitrary path to shorten.
 * @returns {string} Relative path, or `path` when it escapes the base
 *   (result starts with "..") or equals the base (empty relative).
 */
function relToBase(basePath, path) {
  const candidate = relative(basePath, path);
  if (candidate === "" || candidate.startsWith("..")) {
    return path;
  }
  return candidate;
}
/**
 * Render a DB slice status as a human-readable mismatch note.
 *
 * @param {string|null|undefined} status - Status value from the DB row.
 * @returns {string} Descriptive note, or "" when no status is available.
 */
function formatSliceDbStatus(status) {
  if (!status) {
    return "";
  }
  return `DB slice status is "${status}" (expected complete)`;
}
/**
 * Inspect the durable on-disk/DB state of a unit after a timeout.
 *
 * Collects a human-readable list of durability gaps: missing primary
 * artifact, and — for "complete-slice" units — missing UAT file plus either
 * a non-complete DB status or (when the DB is unavailable) an unchecked
 * roadmap checkbox.
 *
 * @param {string} unitType - Unit kind (e.g. "complete-slice").
 * @param {string} unitId - Unit identifier; for slices parseUnitId extracts
 *   milestone and slice ids from it.
 * @param {string} basePath - Project root used to resolve artifact paths.
 * @returns {{ expected: string, verified: boolean, missing: string[] }}
 *   `expected` names the primary artifact, `verified` is the overall
 *   verifyExpectedArtifact result, `missing` lists gaps (empty when all
 *   durable artifacts are present).
 */
export function inspectUnitRecoveryStatus(unitType, unitId, basePath) {
  // Fallback label keeps messages readable when diagnosis returns nullish.
  const expected =
    diagnoseExpectedArtifact(unitType, unitId, basePath) ??
    "required durable artifact";
  const missing = [];
  const artifactPath = resolveExpectedArtifactPath(unitType, unitId, basePath);
  if (!artifactPath) {
    missing.push(`artifact path could not be resolved for ${expected}`);
  } else if (!existsSync(artifactPath)) {
    missing.push(`artifact missing (${relToBase(basePath, artifactPath)})`);
  }
  // complete-slice units additionally require a UAT file and a recorded
  // slice-complete state transition (DB row or roadmap checkbox).
  if (unitType === "complete-slice") {
    const { milestone: mid, slice: sid } = parseUnitId(unitId);
    if (mid && sid) {
      const uatPath = resolveSliceFile(basePath, mid, sid, "UAT");
      if (!uatPath || !existsSync(uatPath)) {
        missing.push(
          `UAT missing (${relSliceFile(basePath, mid, sid, "UAT")})`,
        );
      }
      const dbSlice = getSlice(mid, sid);
      if (dbSlice) {
        const dbStatus = formatSliceDbStatus(dbSlice.status);
        if (dbStatus && dbSlice.status !== "complete") missing.push(dbStatus);
      } else if (!isDbAvailable()) {
        // DB unavailable — fall back to parsing the roadmap file on disk.
        // NOTE(review): when the DB is available but has no row for this
        // slice, no gap is recorded here — presumably verifyExpectedArtifact
        // covers that case; confirm.
        const roadmapPath = resolveMilestoneFile(basePath, mid, "ROADMAP");
        if (!roadmapPath || !existsSync(roadmapPath)) {
          missing.push(
            `roadmap missing (${relMilestoneFile(basePath, mid, "ROADMAP")})`,
          );
        } else {
          try {
            const roadmap = parseRoadmap(readFileSync(roadmapPath, "utf-8"));
            const slice = roadmap.slices.find((s) => s.id === sid);
            // A slice absent from the roadmap is not treated as a gap;
            // only a present-but-unchecked entry is.
            if (slice && !slice.done) {
              missing.push(
                `roadmap checkbox not marked [x] (${relMilestoneFile(basePath, mid, "ROADMAP")})`,
              );
            }
          } catch {
            // Unparseable roadmap is itself a durability gap, not a crash.
            missing.push(
              `roadmap could not be parsed (${relMilestoneFile(basePath, mid, "ROADMAP")})`,
            );
          }
        }
      }
    }
  }
  const verified = verifyExpectedArtifact(unitType, unitId, basePath);
  // Ensure a failed verification is never silent: if no concrete gap was
  // found above, record the generic verification failure.
  if (!verified && missing.length === 0) {
    missing.push(`artifact verification failed for ${expected}`);
  }
  return { expected, verified, missing };
}
/**
 * Format an inspectUnitRecoveryStatus result as a single diagnostic line.
 *
 * @param {{ missing: string[] }} status - Result of inspectUnitRecoveryStatus.
 * @returns {string} Semicolon-joined gap list, or an all-clear message.
 */
export function formatUnitRecoveryStatus(status) {
  const { missing } = status;
  if (missing.length === 0) {
    return "all durable artifacts present";
  }
  return missing.join("; ");
}
/**
 * Recover a unit whose agent session hit an idle or hard timeout.
 *
 * Strategy, in order:
 *  1. If the unit's durable output already exists on disk/DB, finalize the
 *     runtime record and advance the pipeline.
 *  2. While per-session recovery attempts remain, steer the still-live agent
 *     with an (escalating) message telling it to finish durable output.
 *  3. Once attempts are exhausted, write a blocker placeholder and skip the
 *     unit — except complete-milestone, which pauses instead (#4175).
 *  4. If even the placeholder cannot be written, pause autonomous mode.
 *
 * @param {object} ctx - Host context; only ctx.ui.notify is used here.
 * @param {object} pi - Agent interface used to send steering messages.
 * @param {string} unitType - Unit kind ("execute-task", "complete-slice",
 *   "complete-milestone", ...).
 * @param {string} unitId - Identifier of the timed-out unit.
 * @param {string} reason - Timeout kind; "idle" allows 2 recovery attempts,
 *   anything else (hard timeout) allows 1.
 * @param {object} rctx - Recovery context: { basePath, verbose,
 *   currentUnitStartedAt, unitRecoveryCount }.
 * @returns {Promise<"recovered"|"paused">} "recovered" keeps autonomous mode
 *   going (unit steered, finalized, or skipped); "paused" stops it so a
 *   human can intervene.
 */
export async function recoverTimedOutUnit(
  ctx,
  pi,
  unitType,
  unitId,
  reason,
  rctx,
) {
  const { basePath, verbose, currentUnitStartedAt, unitRecoveryCount } = rctx;
  // Persisted runtime record tracks recovery attempts across sessions;
  // the in-memory unitRecoveryCount map tracks attempts in this process.
  const runtime = readUnitRuntimeRecord(basePath, unitType, unitId);
  const recoveryAttempts = runtime?.recoveryAttempts ?? 0;
  const maxRecoveryAttempts = reason === "idle" ? 2 : 1;
  const recoveryKey = `${unitType}/${unitId}`;
  const attemptNumber = (unitRecoveryCount.get(recoveryKey) ?? 0) + 1;
  unitRecoveryCount.set(recoveryKey, attemptNumber);
  if (attemptNumber > 1) {
    // Exponential backoff between in-process retries: 2^(attempt-2) seconds
    // (1s, 2s, 4s, ...), capped at 30s.
    const backoffMs = Math.min(1000 * 2 ** (attemptNumber - 2), 30000);
    ctx.ui.notify(
      `Recovery attempt ${attemptNumber} for ${unitType} ${unitId}. Waiting ${backoffMs / 1000}s before retry.`,
      "info",
    );
    await new Promise((r) => setTimeout(r, backoffMs));
  }
  // --- execute-task units have their own durability model (task summary,
  // plan checkbox, next-action pointer) inspected via uok/unit-runtime. ---
  if (unitType === "execute-task") {
    const status = await inspectExecuteTaskDurability(basePath, unitId);
    // Durability could not even be inspected — pause for manual help.
    if (!status) return "paused";
    writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnitStartedAt, {
      recovery: status,
    });
    const durableComplete =
      status.summaryExists && status.taskChecked && status.nextActionAdvanced;
    if (durableComplete) {
      // The work is already durable on disk — mark finalized and advance.
      writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnitStartedAt, {
        phase: "finalized",
        recovery: status,
      });
      ctx.ui.notify(
        `${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} already completed on disk. Continuing autonomous mode. (attempt ${attemptNumber})`,
        "info",
      );
      unitRecoveryCount.delete(recoveryKey);
      // Resolve the pending agent-end with a synthetic result so the auto
      // loop proceeds to the next unit.
      resolveAgentEnd({ messages: [], _synthetic: "timeout-recovery" });
      return "recovered";
    }
    if (recoveryAttempts < maxRecoveryAttempts) {
      // Attempts remain: steer the agent. The second attempt escalates tone.
      const isEscalation = recoveryAttempts > 0;
      writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnitStartedAt, {
        phase: "recovered",
        recovery: status,
        recoveryAttempts: recoveryAttempts + 1,
        lastRecoveryReason: reason,
        lastProgressAt: Date.now(),
        progressCount: (runtime?.progressCount ?? 0) + 1,
        lastProgressKind:
          reason === "idle" ? "idle-recovery-retry" : "hard-recovery-retry",
      });
      const steeringLines = isEscalation
        ? [
            `**FINAL ${reason === "idle" ? "IDLE" : "HARD TIMEOUT"} RECOVERY — last chance before this task is skipped.**`,
            `You are still executing ${unitType} ${unitId}.`,
            `Recovery attempt ${recoveryAttempts + 1} of ${maxRecoveryAttempts}.`,
            `Current durability status: ${formatExecuteTaskRecoveryStatus(status)}.`,
            "You MUST finish the durable output NOW, even if incomplete.",
            "Write the task summary with whatever you have accomplished so far.",
            "Mark the task [x] in the plan. Commit your work.",
            "A partial summary is infinitely better than no summary.",
          ]
        : [
            `**${reason === "idle" ? "IDLE" : "HARD TIMEOUT"} RECOVERY — do not stop.**`,
            `You are still executing ${unitType} ${unitId}.`,
            `Recovery attempt ${recoveryAttempts + 1} of ${maxRecoveryAttempts}.`,
            `Current durability status: ${formatExecuteTaskRecoveryStatus(status)}.`,
            "Do not keep exploring.",
            "Immediately finish the required durable output for this unit.",
            "If full completion is impossible, write the partial artifact/state needed for recovery and make the blocker explicit.",
          ];
      pi.sendMessage(
        {
          customType: "sf-auto-timeout-recovery",
          display: verbose,
          content: steeringLines.join("\n"),
        },
        { triggerTurn: true, deliverAs: "steer" },
      );
      ctx.ui.notify(
        `${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to finish durable output (attempt ${attemptNumber}, session ${recoveryAttempts + 1}/${maxRecoveryAttempts}).`,
        "warning",
      );
      return "recovered";
    }
    // Retries exhausted — write a blocker placeholder and advance.
    const diagnostic = formatExecuteTaskRecoveryStatus(status);
    const placeholder = writeBlockerPlaceholder(
      unitType,
      unitId,
      basePath,
      `${reason} recovery exhausted ${maxRecoveryAttempts} attempts. Status: ${diagnostic}`,
    );
    if (placeholder) {
      writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnitStartedAt, {
        phase: "skipped",
        recovery: status,
        recoveryAttempts: recoveryAttempts + 1,
        lastRecoveryReason: reason,
      });
      ctx.ui.notify(
        `${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts (${diagnostic}). Blocker artifacts written. Advancing pipeline. (attempt ${attemptNumber})`,
        "warning",
      );
      unitRecoveryCount.delete(recoveryKey);
      resolveAgentEnd({ messages: [], _synthetic: "timeout-recovery" });
      return "recovered";
    }
    // Fallback: couldn't write skip artifacts — pause as before.
    writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnitStartedAt, {
      phase: "paused",
      recovery: status,
      recoveryAttempts: recoveryAttempts + 1,
      lastRecoveryReason: reason,
    });
    ctx.ui.notify(
      `${reason === "idle" ? "Idle" : "Timeout"} recovery check for ${unitType} ${unitId}: ${diagnostic}`,
      "warning",
    );
    return "paused";
  }
  // --- All other unit types use the artifact-based durability inspection. ---
  const status = inspectUnitRecoveryStatus(unitType, unitId, basePath);
  const diagnostic = formatUnitRecoveryStatus(status);
  // Check full unit durability — the primary artifact alone is not enough for
  // units such as complete-slice, which also require UAT and state transition.
  if (status.verified) {
    writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnitStartedAt, {
      phase: "finalized",
      recoveryAttempts: recoveryAttempts + 1,
      lastRecoveryReason: reason,
    });
    ctx.ui.notify(
      `${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} durable state already verified. Advancing. (attempt ${attemptNumber})`,
      "info",
    );
    unitRecoveryCount.delete(recoveryKey);
    resolveAgentEnd({ messages: [], _synthetic: "timeout-recovery" });
    return "recovered";
  }
  if (recoveryAttempts < maxRecoveryAttempts) {
    // Attempts remain: steer the agent to repair only the missing items.
    const isEscalation = recoveryAttempts > 0;
    writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnitStartedAt, {
      phase: "recovered",
      recoveryAttempts: recoveryAttempts + 1,
      lastRecoveryReason: reason,
      lastProgressAt: Date.now(),
      progressCount: (runtime?.progressCount ?? 0) + 1,
      lastProgressKind:
        reason === "idle" ? "idle-recovery-retry" : "hard-recovery-retry",
    });
    const steeringLines = isEscalation
      ? [
          `**FINAL ${reason === "idle" ? "IDLE" : "HARD TIMEOUT"} RECOVERY — last chance before skip.**`,
          `You are still executing ${unitType} ${unitId}.`,
          `Recovery attempt ${recoveryAttempts + 1} of ${maxRecoveryAttempts} — next failure skips this unit.`,
          `Expected durable output: ${status.expected}.`,
          `Current durability status: ${diagnostic}.`,
          "Repair only the missing durability items listed above.",
          "You MUST write the missing artifact/state NOW, even if incomplete.",
          "Write whatever you have — partial research, preliminary findings, best-effort analysis — and record the remaining blocker explicitly.",
          "If you are truly blocked, write the file with a BLOCKER section explaining why.",
        ]
      : [
          `**${reason === "idle" ? "IDLE" : "HARD TIMEOUT"} RECOVERY — stay in autonomous mode.**`,
          `You are still executing ${unitType} ${unitId}.`,
          `Recovery attempt ${recoveryAttempts + 1} of ${maxRecoveryAttempts}.`,
          `Expected durable output: ${status.expected}.`,
          `Current durability status: ${diagnostic}.`,
          "Stop broad exploration and do not redo work that is already durable.",
          "Repair only the missing durability items listed above.",
          "If blocked, write the partial artifact and explicitly record the blocker instead of going silent.",
        ];
    pi.sendMessage(
      {
        customType: "sf-auto-timeout-recovery",
        display: verbose,
        content: steeringLines.join("\n"),
      },
      { triggerTurn: true, deliverAs: "steer" },
    );
    ctx.ui.notify(
      `${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to repair ${diagnostic} (attempt ${attemptNumber}, session ${recoveryAttempts + 1}/${maxRecoveryAttempts}).`,
      "warning",
    );
    return "recovered";
  }
  // #4175: For complete-milestone, never write a blocker placeholder — a stub
  // SUMMARY has no recovery value (milestone is terminal), it does not update
  // DB status, and downstream merge paths can treat the stub as a legitimate
  // completion signal. Pause instead so the worktree branch is preserved.
  if (unitType === "complete-milestone") {
    writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnitStartedAt, {
      phase: "paused",
      recoveryAttempts: recoveryAttempts + 1,
      lastRecoveryReason: reason,
    });
    ctx.ui.notify(
      `Milestone ${unitId} ${reason}-recovery exhausted ${maxRecoveryAttempts} attempt(s): ${diagnostic}. Worktree branch preserved. Re-run /autonomous once blockers are resolved.`,
      "error",
    );
    return "paused";
  }
  // Retries exhausted — write a blocker placeholder and advance the pipeline
  // instead of silently stalling.
  const placeholder = writeBlockerPlaceholder(
    unitType,
    unitId,
    basePath,
    `${reason} recovery exhausted ${maxRecoveryAttempts} attempts. Status: ${diagnostic}.`,
  );
  if (placeholder) {
    writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnitStartedAt, {
      phase: "skipped",
      recoveryAttempts: recoveryAttempts + 1,
      lastRecoveryReason: reason,
    });
    ctx.ui.notify(
      `${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts. Blocker placeholder written to ${placeholder}. Advancing pipeline. (attempt ${attemptNumber})`,
      "warning",
    );
    unitRecoveryCount.delete(recoveryKey);
    resolveAgentEnd({ messages: [], _synthetic: "timeout-recovery" });
    return "recovered";
  }
  // Fallback: couldn't resolve artifact path — pause as before.
  writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnitStartedAt, {
    phase: "paused",
    recoveryAttempts: recoveryAttempts + 1,
    lastRecoveryReason: reason,
  });
  return "paused";
}