From 5e478d650664b0b9189f1b5b4b9f2942926c0d30 Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Fri, 15 May 2026 11:01:08 +0200 Subject: [PATCH] fix(auto): avoid duplicate swarm checkpoints --- src/resources/extensions/sf/auto/run-unit.js | 80 ++++++++++--------- .../sf/tests/run-unit-via-swarm.test.mjs | 17 +--- 2 files changed, 48 insertions(+), 49 deletions(-) diff --git a/src/resources/extensions/sf/auto/run-unit.js b/src/resources/extensions/sf/auto/run-unit.js index d91c05973..38bc64c85 100644 --- a/src/resources/extensions/sf/auto/run-unit.js +++ b/src/resources/extensions/sf/auto/run-unit.js @@ -619,45 +619,53 @@ async function runUnitViaSwarm(ctx, _pi, s, unitType, unitId, prompt, options) { ]; } - try { - appendAutonomousSolverCheckpoint(basePath, { + if (!hasCheckpointCall) { + try { + appendAutonomousSolverCheckpoint(basePath, { + unitType, + unitId, + outcome, + summary: summary || "Swarm agent completed unit turn.", + completedItems, + remainingItems, + verificationEvidence, + pdd: { + purpose: + "Synthetic checkpoint from swarm agent reply when the worker did not call the checkpoint tool.", + consumer: "phases-unit.js assessAutonomousSolverTurn", + contract: + "Falls back to 'continue' so the loop re-evaluates rather than incorrectly completing.", + failureBoundary: + "appendAutonomousSolverCheckpoint failure is swallowed — the loop will repair via its own missing-checkpoint retry path.", + evidence: `swarm-agent ${swarmResult.targetAgent} replied with ${replyText.length} chars; workerSignaledOutcome=${workerSignaledOutcome ?? "null"}; collectedToolCalls=${collectedToolCalls.length}`, + nonGoals: "Does not synthesize completion.", + invariants: + "Synthetic checkpoints are only written when the worker emitted no real checkpoint call.", + assumptions: + "The swarm agent processed the unit prompt and returned a non-empty reply.", + }, + }); + debugLog("runUnit[swarm]", { + phase: "synthesized-checkpoint", + unitType, + unitId, + outcome, + }); + } catch (cpErr) { + // Fail-open: if checkpoint synthesis fails, the repair loop will handle it. + debugLog("runUnit[swarm]", { + phase: "synthesized-checkpoint-error", + unitType, + unitId, + error: getErrorMessage(cpErr), + }); + } + } else { + debugLog("runUnit[swarm]", { + phase: "real-checkpoint-observed", unitType, unitId, outcome, - summary: summary || "Swarm agent completed unit turn.", - completedItems, - remainingItems, - verificationEvidence, - pdd: { - purpose: - "Checkpoint from swarm agent reply — real outcome when worker called checkpoint tool, conservative fallback otherwise.", - consumer: "phases-unit.js assessAutonomousSolverTurn", - contract: - "outcome reflects the worker's checkpoint call when available; falls back to 'continue' so the loop re-evaluates rather than incorrectly completing.", - failureBoundary: - "appendAutonomousSolverCheckpoint failure is swallowed — the loop will repair via its own missing-checkpoint retry path.", - evidence: `swarm-agent ${swarmResult.targetAgent} replied with ${replyText.length} chars; workerSignaledOutcome=${workerSignaledOutcome ?? "null"}; collectedToolCalls=${collectedToolCalls.length}`, - nonGoals: - "Does not synthesize fake tool calls — uses real ones when available.", - invariants: - "Never claims outcome=complete unless the worker explicitly called checkpoint with outcome='complete'.", - assumptions: - "The swarm agent processed the unit prompt and returned a non-empty reply.", - }, - }); - debugLog("runUnit[swarm]", { - phase: "synthesized-checkpoint", - unitType, - unitId, - outcome, - }); - } catch (cpErr) { - // Fail-open: if checkpoint synthesis fails, the repair loop will handle it. - debugLog("runUnit[swarm]", { - phase: "synthesized-checkpoint-error", - unitType, - unitId, - error: getErrorMessage(cpErr), }); } diff --git a/src/resources/extensions/sf/tests/run-unit-via-swarm.test.mjs b/src/resources/extensions/sf/tests/run-unit-via-swarm.test.mjs index 52f195f94..c4cc936bb 100644 --- a/src/resources/extensions/sf/tests/run-unit-via-swarm.test.mjs +++ b/src/resources/extensions/sf/tests/run-unit-via-swarm.test.mjs @@ -985,10 +985,10 @@ describe("runUnit — Round 6: real tool calls captured from onEvent", () => { expect(result.swarmToolCallCount).toBe(0); }); - test("checkpoint tool call with outcome=complete → appendCheckpoint called with outcome=complete", async () => { + test("checkpoint tool call with outcome=complete relies on the real checkpoint write", async () => { // The canonical completion detection: when the worker calls checkpoint with - // outcome='complete', runUnitViaSwarm should pass outcome='complete' to - // appendAutonomousSolverCheckpoint (not hardcode 'continue'). + // outcome='complete', runUnitViaSwarm must not append a duplicate parent + // checkpoint. The tool execution already updated solver state. process.env.SF_AUTONOMOUS_VIA_SWARM = "1"; mockWithToolCallEvents([ @@ -1030,16 +1030,7 @@ describe("runUnit — Round 6: real tool calls captured from onEvent", () => { expect(result.status).toBe("completed"); - // appendAutonomousSolverCheckpoint must have been called with outcome='complete' - expect(mockAppendCheckpoint).toHaveBeenCalledOnce(); - const [, params] = mockAppendCheckpoint.mock.calls[0]; - expect(params.outcome).toBe("complete"); - expect(params.completedItems).toEqual([ - "feature implemented", - "tests passing", - ]); - expect(params.remainingItems).toEqual([]); - expect(params.verificationEvidence).toEqual(["npm test: all green"]); + expect(mockAppendCheckpoint).not.toHaveBeenCalled(); // The real checkpoint tool_use block must appear in event.messages[last].content const lastMsg = result.event.messages[result.event.messages.length - 1];