fix(auto): abort swarm checkpoint loops
This commit is contained in:
parent
22760e03d5
commit
091168303c
4 changed files with 199 additions and 38 deletions
|
|
@ -80,18 +80,49 @@ if (hasTsFiles) {
|
|||
"bin",
|
||||
"tsgo.js",
|
||||
);
|
||||
const compile = spawnSync(
|
||||
process.execPath,
|
||||
[tsgoBin, "--project", resourcesTsconfig],
|
||||
{
|
||||
const compileWithTsgo = () =>
|
||||
spawnSync(process.execPath, [tsgoBin, "--project", resourcesTsconfig], {
|
||||
cwd: root,
|
||||
stdio: "inherit",
|
||||
},
|
||||
);
|
||||
});
|
||||
const compileWithTsc = () => {
|
||||
const tscBin = require.resolve("typescript/bin/tsc");
|
||||
return spawnSync(
|
||||
process.execPath,
|
||||
[tscBin, "--project", resourcesTsconfig],
|
||||
{
|
||||
cwd: root,
|
||||
stdio: "inherit",
|
||||
},
|
||||
);
|
||||
};
|
||||
|
||||
let compile = compileWithTsgo();
|
||||
|
||||
if (compile.status !== 0) {
|
||||
process.exit(compile.status ?? 1);
|
||||
}
|
||||
|
||||
// Native tsgo has reported successful emits while leaving .ts resources copied
|
||||
// without their .js runtime output. Verify a known TS-only resource before the
|
||||
// launcher tries to import it, and fall back to local tsc if needed.
|
||||
const requiredJs = join(
|
||||
distResources,
|
||||
"extensions",
|
||||
"sf",
|
||||
"model-registry.js",
|
||||
);
|
||||
try {
|
||||
readFileSync(requiredJs);
|
||||
} catch {
|
||||
console.warn(
|
||||
"[copy-resources] tsgo did not emit model-registry.js; retrying resources build with tsc",
|
||||
);
|
||||
compile = compileWithTsc();
|
||||
if (compile.status !== 0) {
|
||||
process.exit(compile.status ?? 1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// No .ts files — just create the dist/resources directory and copy .js files
|
||||
mkdirSync(distResources, { recursive: true });
|
||||
|
|
|
|||
|
|
@ -9,7 +9,10 @@
|
|||
* path. Default behavior is unchanged when the flag is unset.
|
||||
*/
|
||||
|
||||
import { appendAutonomousSolverCheckpoint } from "../autonomous-solver.js";
|
||||
import {
|
||||
appendAutonomousSolverCheckpoint,
|
||||
MAX_CHECKPOINTS_PER_ITERATION,
|
||||
} from "../autonomous-solver.js";
|
||||
import { scopeActiveToolsForUnitType } from "../constants.js";
|
||||
import { debugLog } from "../debug-logger.js";
|
||||
import { getErrorMessage } from "../error-utils.js";
|
||||
|
|
@ -298,6 +301,16 @@ async function runUnitViaSwarm(ctx, _pi, s, unitType, unitId, prompt, options) {
|
|||
? Math.floor(configuredNoOutputTimeoutMs)
|
||||
: 180_000,
|
||||
);
|
||||
const controller = new AbortController();
|
||||
if (options?.signal) {
|
||||
if (options.signal.aborted) {
|
||||
controller.abort();
|
||||
} else {
|
||||
options.signal.addEventListener("abort", () => controller.abort(), {
|
||||
once: true,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// ── Event collector: capture real tool calls and completion signal ──────────
|
||||
// The worker agent emits events as it runs. We intercept "toolcall_end"
|
||||
|
|
@ -311,6 +324,8 @@ async function runUnitViaSwarm(ctx, _pi, s, unitType, unitId, prompt, options) {
|
|||
let workerCompletedItems = null;
|
||||
let workerRemainingItems = null;
|
||||
let workerVerificationEvidence = null;
|
||||
let checkpointToolCallCount = 0;
|
||||
let checkpointAbortReason = null;
|
||||
|
||||
function onEvent(event) {
|
||||
if (
|
||||
|
|
@ -331,6 +346,7 @@ async function runUnitViaSwarm(ctx, _pi, s, unitType, unitId, prompt, options) {
|
|||
// This is the same tool inspected by the legacy runUnit path's solver pass
|
||||
// (phases-unit.js line ~989: activeToolsAllowlist: ["checkpoint"]).
|
||||
if (toolCall.name === "checkpoint") {
|
||||
checkpointToolCallCount += 1;
|
||||
const args = toolCall.arguments ?? {};
|
||||
const rawOutcome = String(args.outcome ?? "").toLowerCase();
|
||||
if (
|
||||
|
|
@ -359,6 +375,13 @@ async function runUnitViaSwarm(ctx, _pi, s, unitType, unitId, prompt, options) {
|
|||
if (Array.isArray(args.verificationEvidence)) {
|
||||
workerVerificationEvidence = args.verificationEvidence.map(String);
|
||||
}
|
||||
if (
|
||||
checkpointToolCallCount >= MAX_CHECKPOINTS_PER_ITERATION &&
|
||||
!checkpointAbortReason
|
||||
) {
|
||||
checkpointAbortReason = `checkpoint limit reached (${checkpointToolCallCount}/${MAX_CHECKPOINTS_PER_ITERATION})`;
|
||||
controller.abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -369,9 +392,41 @@ async function runUnitViaSwarm(ctx, _pi, s, unitType, unitId, prompt, options) {
|
|||
swarmResult = await swarmDispatchAndWait(basePath, envelope, {
|
||||
timeoutMs,
|
||||
noOutputTimeoutMs,
|
||||
signal: controller.signal,
|
||||
onEvent,
|
||||
});
|
||||
} catch (err) {
|
||||
if (checkpointAbortReason) {
|
||||
return {
|
||||
status: "completed",
|
||||
event: {
|
||||
messages: [
|
||||
{
|
||||
role: "assistant",
|
||||
content: [
|
||||
...collectedToolCalls,
|
||||
{
|
||||
type: "text",
|
||||
text: `Swarm worker stopped: ${checkpointAbortReason}.`,
|
||||
},
|
||||
],
|
||||
_swarm: true,
|
||||
targetAgent: "unknown",
|
||||
},
|
||||
],
|
||||
},
|
||||
requestDispatchedAt,
|
||||
_via: "swarm",
|
||||
_swarmResult: {
|
||||
error: checkpointAbortReason,
|
||||
reply: null,
|
||||
replyMessageId: null,
|
||||
},
|
||||
swarmToolCallCount: collectedToolCalls.filter(
|
||||
(tc) => tc.name !== "checkpoint",
|
||||
).length,
|
||||
};
|
||||
}
|
||||
const msg = `swarmDispatchAndWait threw: ${getErrorMessage(err)}`;
|
||||
debugLog("runUnit[swarm]", {
|
||||
phase: "dispatch-error",
|
||||
|
|
@ -395,6 +450,34 @@ async function runUnitViaSwarm(ctx, _pi, s, unitType, unitId, prompt, options) {
|
|||
|
||||
// Error from the swarm (no reply, runAgentTurn failure, etc.)
|
||||
if (swarmResult.error || swarmResult.reply === null) {
|
||||
if (checkpointAbortReason) {
|
||||
return {
|
||||
status: "completed",
|
||||
event: {
|
||||
messages: [
|
||||
{
|
||||
role: "assistant",
|
||||
content: [
|
||||
...collectedToolCalls,
|
||||
{
|
||||
type: "text",
|
||||
text: `Swarm worker stopped: ${checkpointAbortReason}.`,
|
||||
},
|
||||
],
|
||||
_swarm: true,
|
||||
replyMessageId: swarmResult.replyMessageId,
|
||||
targetAgent: swarmResult.targetAgent,
|
||||
},
|
||||
],
|
||||
},
|
||||
requestDispatchedAt,
|
||||
_via: "swarm",
|
||||
_swarmResult: swarmResult,
|
||||
swarmToolCallCount: collectedToolCalls.filter(
|
||||
(tc) => tc.name !== "checkpoint",
|
||||
).length,
|
||||
};
|
||||
}
|
||||
const reason = swarmResult.error ?? "swarm returned no reply";
|
||||
debugLog("runUnit[swarm]", {
|
||||
phase: "dispatch-no-reply",
|
||||
|
|
@ -505,8 +588,7 @@ async function runUnitViaSwarm(ctx, _pi, s, unitType, unitId, prompt, options) {
|
|||
failureBoundary:
|
||||
"If the loop's solver disagrees, it will override via its own assessment.",
|
||||
evidence: `Worker produced ${replyText.length} chars and ${collectedToolCalls.length} tool calls but no checkpoint.`,
|
||||
nonGoals:
|
||||
"Never claims completion from a synthetic checkpoint.",
|
||||
nonGoals: "Never claims completion from a synthetic checkpoint.",
|
||||
invariants:
|
||||
"Synthetic checkpoints are only injected when no real checkpoint exists.",
|
||||
assumptions:
|
||||
|
|
|
|||
|
|
@ -16,8 +16,8 @@ import {
|
|||
} from "node:fs";
|
||||
import { dirname, join } from "node:path";
|
||||
import { atomicWriteSync } from "./atomic-write.js";
|
||||
import { sfRoot } from "./paths.js";
|
||||
import { emitJournalEvent } from "./journal.js";
|
||||
import { sfRoot } from "./paths.js";
|
||||
import { loadPrompt } from "./prompt-loader.js";
|
||||
|
||||
export const AUTONOMOUS_SOLVER_OUTCOMES = [
|
||||
|
|
@ -32,7 +32,7 @@ const DEFAULT_SOLVER_MAX_ITERATIONS = 30000;
|
|||
const MIN_SOLVER_MAX_ITERATIONS = 1;
|
||||
const MAX_SOLVER_MAX_ITERATIONS = 100000;
|
||||
const DEFAULT_MISSING_CHECKPOINT_REPAIR_ATTEMPTS = 4;
|
||||
const MAX_CHECKPOINTS_PER_ITERATION = 5;
|
||||
export const MAX_CHECKPOINTS_PER_ITERATION = 5;
|
||||
const SOLVER_CHECKPOINT_SCHEMA_VERSION = 1;
|
||||
const SOLVER_STEERING_SCHEMA_VERSION = 1;
|
||||
const STALL_THRESHOLD_ITERATIONS = 3;
|
||||
|
|
@ -347,7 +347,7 @@ function buildAutonomousLoopVars(state) {
|
|||
close:
|
||||
`CLOSE PHASE (final ${CLOSE_PHASE_LOOKAHEAD} iterations): You are approaching the iteration budget.\n` +
|
||||
"Priority: verify all acceptance criteria, run the test suite, and confirm the unit is complete.\n" +
|
||||
"If the unit cannot be completed in the remaining iterations, checkpoint with outcome=\"blocked\" and a precise reason.\n" +
|
||||
'If the unit cannot be completed in the remaining iterations, checkpoint with outcome="blocked" and a precise reason.\n' +
|
||||
"Do NOT start new work — finish and verify existing work.",
|
||||
};
|
||||
|
||||
|
|
@ -534,25 +534,6 @@ export function isNoOpExecutorTranscript(messages) {
|
|||
if (block.type === "tool_use" && isWorkToolName(block.name)) {
|
||||
return false;
|
||||
}
|
||||
if (block.type === "tool_result") {
|
||||
// tool_result has no name on the block itself; presence of a
|
||||
// non-checkpoint tool_result implies a non-checkpoint tool_use
|
||||
// preceded it. The pair-match would require backward scan; for
|
||||
// robustness, treat ANY tool_result as evidence of work unless
|
||||
// it's a checkpoint result (which would have been emitted by
|
||||
// the assistant's checkpoint tool_use earlier in this same
|
||||
// transcript — but that's protocol, not work). Without the
|
||||
// block name we can't distinguish, so be conservative: a
|
||||
// tool_result is non-no-op work UNLESS the entire transcript's
|
||||
// only tool_use was `checkpoint`. We carry that check via the
|
||||
// tool_use scan above — if a non-checkpoint tool_use exists,
|
||||
// we've already returned false. If only `checkpoint` was used,
|
||||
// the tool_result here is the checkpoint reply and we should
|
||||
// keep scanning.
|
||||
// Simpler approach: ignore tool_result blocks for the
|
||||
// classification; the tool_use scan is authoritative.
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -610,8 +591,7 @@ export function appendAutonomousSolverCheckpoint(basePath, params) {
|
|||
persisted.unitId &&
|
||||
persisted.status !== "complete";
|
||||
const isMismatch =
|
||||
hasActiveIdentity &&
|
||||
!sameUnit(persisted, params.unitType, params.unitId);
|
||||
hasActiveIdentity && !sameUnit(persisted, params.unitType, params.unitId);
|
||||
if (isMismatch) {
|
||||
emitJournalEvent(basePath, {
|
||||
flowId: `${state.unitType}-${state.unitId}-${Date.now()}`,
|
||||
|
|
@ -714,8 +694,7 @@ export function appendAutonomousSolverCheckpoint(basePath, params) {
|
|||
checkpoint.summary,
|
||||
].slice(-ROLLING_SUMMARY_WINDOW),
|
||||
// Increment checkpoint count for this iteration (safety cap)
|
||||
checkpointCountThisIteration:
|
||||
(state.checkpointCountThisIteration || 0) + 1,
|
||||
checkpointCountThisIteration: (state.checkpointCountThisIteration || 0) + 1,
|
||||
};
|
||||
mkdirSync(dirname(historyPath(basePath)), { recursive: true });
|
||||
writeFileSync(historyPath(basePath), `${JSON.stringify(checkpoint)}\n`, {
|
||||
|
|
@ -1099,8 +1078,7 @@ export function assessAutonomousSolverTurn(
|
|||
}
|
||||
// Hard cap on excessive checkpoints within a single iteration
|
||||
if (
|
||||
(state.checkpointCountThisIteration || 0) >=
|
||||
MAX_CHECKPOINTS_PER_ITERATION
|
||||
(state.checkpointCountThisIteration || 0) >= MAX_CHECKPOINTS_PER_ITERATION
|
||||
) {
|
||||
return {
|
||||
action: "pause",
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@ const { mockAppendCheckpoint } = vi.hoisted(() => {
|
|||
|
||||
vi.mock("../autonomous-solver.js", () => ({
|
||||
appendAutonomousSolverCheckpoint: mockAppendCheckpoint,
|
||||
MAX_CHECKPOINTS_PER_ITERATION: 5,
|
||||
}));
|
||||
|
||||
// ─── Mock everything runUnit imports that touches DB / session infrastructure ─
|
||||
|
|
@ -911,10 +912,79 @@ describe("runUnit — Round 6: real tool calls captured from onEvent", () => {
|
|||
|
||||
expect(toolUseBlocks[0]).toMatchObject({ type: "tool_use", name: "Bash" });
|
||||
expect(toolUseBlocks[1]).toMatchObject({ type: "tool_use", name: "Read" });
|
||||
expect(toolUseBlocks[2]).toMatchObject({ type: "tool_use", name: "checkpoint" });
|
||||
expect(toolUseBlocks[2]).toMatchObject({
|
||||
type: "tool_use",
|
||||
name: "checkpoint",
|
||||
});
|
||||
expect(textBlocks[0].text).toBe(MOCK_REPLY);
|
||||
});
|
||||
|
||||
test("checkpoint loop aborts swarm turn at the per-iteration cap", async () => {
|
||||
process.env.SF_AUTONOMOUS_VIA_SWARM = "1";
|
||||
|
||||
mockSwarmDispatchAndWait.mockImplementationOnce(
|
||||
async (_basePath, envelope, opts = {}) => {
|
||||
for (let i = 0; i < 6; i++) {
|
||||
opts.onEvent?.({
|
||||
type: "message_update",
|
||||
assistantMessageEvent: {
|
||||
type: "toolcall_end",
|
||||
contentIndex: i,
|
||||
toolCall: {
|
||||
type: "toolCall",
|
||||
id: `tc-checkpoint-${i}`,
|
||||
name: "checkpoint",
|
||||
arguments: {
|
||||
outcome: "continue",
|
||||
unitType: "plan-milestone",
|
||||
unitId: "M002",
|
||||
summary: "still discovering",
|
||||
completedItems: ["read files"],
|
||||
remainingItems: ["keep planning"],
|
||||
verificationEvidence: ["read:file.py"],
|
||||
},
|
||||
},
|
||||
partial: {},
|
||||
},
|
||||
});
|
||||
if (opts.signal?.aborted) break;
|
||||
}
|
||||
return {
|
||||
messageId: "msg-dispatch-001",
|
||||
targetAgent: MOCK_TARGET,
|
||||
swarmName: "default",
|
||||
envelope,
|
||||
error: "aborted",
|
||||
reply: null,
|
||||
replyMessageId: null,
|
||||
};
|
||||
},
|
||||
);
|
||||
|
||||
const ctx = makeCtx("/proj");
|
||||
const pi = makePi();
|
||||
const s = makeS("/proj");
|
||||
|
||||
const result = await runUnit(
|
||||
ctx,
|
||||
pi,
|
||||
s,
|
||||
"plan-slice",
|
||||
"M002/S01",
|
||||
"plan it",
|
||||
{},
|
||||
);
|
||||
|
||||
expect(result.status).toBe("completed");
|
||||
expect(mockAppendCheckpoint).not.toHaveBeenCalled();
|
||||
const lastMsg = result.event.messages[result.event.messages.length - 1];
|
||||
const toolUseBlocks = lastMsg.content.filter((b) => b.type === "tool_use");
|
||||
expect(toolUseBlocks).toHaveLength(5);
|
||||
expect(toolUseBlocks.every((b) => b.name === "checkpoint")).toBe(true);
|
||||
expect(lastMsg.content.at(-1).text).toContain("checkpoint limit reached");
|
||||
expect(result.swarmToolCallCount).toBe(0);
|
||||
});
|
||||
|
||||
test("checkpoint tool call with outcome=complete → appendCheckpoint called with outcome=complete", async () => {
|
||||
// The canonical completion detection: when the worker calls checkpoint with
|
||||
// outcome='complete', runUnitViaSwarm should pass outcome='complete' to
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue