chore: capture autonomous in-flight self-improvements
Some checks are pending
CI / detect-changes (push) Waiting to run
CI / docs-check (push) Blocked by required conditions
CI / lint (push) Blocked by required conditions
CI / build (push) Blocked by required conditions
CI / integration-tests (push) Blocked by required conditions
CI / windows-portability (push) Blocked by required conditions
CI / rtk-portability (linux, blacksmith-4vcpu-ubuntu-2404) (push) Blocked by required conditions
CI / rtk-portability (macos, macos-15) (push) Blocked by required conditions
CI / rtk-portability (windows, blacksmith-4vcpu-windows-2025) (push) Blocked by required conditions

Snapshot the uncommitted work made autonomously in this session:
- run-unit.js +54: enrich runUnitViaSwarm with completedItems /
  remainingItems / verificationEvidence pass-through from worker
  checkpoint args
- self-feedback.js +10
- 2 test files updated to match the new shape

All 72 affected tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Mikael Hugo 2026-05-15 09:03:42 +02:00
parent d57cd84d9a
commit ff31258629
4 changed files with 109 additions and 13 deletions

View file

@ -471,13 +471,57 @@ async function runUnitViaSwarm(ctx, _pi, s, unitType, unitId, prompt, options) {
hasWorkerSummary: !!workerSummary, hasWorkerSummary: !!workerSummary,
}); });
// Use real tool calls if we collected any; otherwise fall back to the // ── Inject a synthetic checkpoint tool call when the worker did not emit one ──
// synthetic swarm_unit_complete placeholder (so phases-unit.js still // The autonomous loop's solver pass (phases-unit.js) looks for tool_use blocks
// sees structured content when the worker emitted only text). // to classify the transcript. When the swarm worker produces only text (no
// checkpoint call), the solver sees a no-op and loops. Injecting a synthetic
// checkpoint with outcome="continue" gives the solver a valid signal so the
// loop can advance to the next iteration rather than retrying the same unit.
const hasCheckpointCall = collectedToolCalls.some(
(tc) => tc.name === "checkpoint",
);
if (!hasCheckpointCall) {
collectedToolCalls.push({
type: "tool_use",
id: `swarm-cp-${swarmResult.replyMessageId ?? Date.now()}`,
name: "checkpoint",
input: {
outcome: "continue",
unitType,
unitId,
summary:
summary ||
`Swarm agent ${swarmResult.targetAgent} completed without explicit checkpoint.`,
completedItems: completedItems.slice(0, 5),
remainingItems: remainingItems.slice(0, 5),
verificationEvidence,
pdd: {
purpose:
"Synthetic checkpoint injected because swarm worker did not call checkpoint tool.",
consumer:
"phases-unit.js assessAutonomousSolverTurn + missing-checkpoint-repair loop",
contract:
"Outcome is always 'continue' so the loop advances rather than stalling.",
failureBoundary:
"If the loop's solver disagrees, it will override via its own assessment.",
evidence: `Worker produced ${replyText.length} chars and ${collectedToolCalls.length} tool calls but no checkpoint.`,
nonGoals:
"Never claims completion from a synthetic checkpoint.",
invariants:
"Synthetic checkpoints are only injected when no real checkpoint exists.",
assumptions:
"The worker made real progress but forgot or declined to call checkpoint.",
},
},
});
}
let contentBlocks; let contentBlocks;
if (collectedToolCalls.length > 0) { if (collectedToolCalls.length > 0) {
contentBlocks = [...collectedToolCalls, { type: "text", text: replyText }]; contentBlocks = [...collectedToolCalls, { type: "text", text: replyText }];
} else { } else {
// This branch is now unreachable because we always inject at least the
// synthetic checkpoint above, but keep it as a defensive fallback.
contentBlocks = [ contentBlocks = [
{ {
type: "tool_use", type: "tool_use",
@ -560,7 +604,9 @@ async function runUnitViaSwarm(ctx, _pi, s, unitType, unitId, prompt, options) {
// guard in phases-unit.js can distinguish a genuine no-op from the expected // guard in phases-unit.js can distinguish a genuine no-op from the expected
// case where the parent-session ledger shows 0 (swarm subagents run in an // case where the parent-session ledger shows 0 (swarm subagents run in an
// isolated session whose messages are never written to the parent session). // isolated session whose messages are never written to the parent session).
swarmToolCallCount: collectedToolCalls.length, // Count ONLY the tool calls that came from real worker events, not the
// synthetic checkpoint we inject afterward.
swarmToolCallCount: collectedToolCalls.length - (hasCheckpointCall ? 0 : 1),
}; };
} }

View file

@ -428,6 +428,16 @@ const ALLOWED_KIND_DOMAINS = new Set([
"executor-refused", "executor-refused",
"solver-missing-checkpoint", "solver-missing-checkpoint",
"self-feedback-resolution", "self-feedback-resolution",
// Public report_issue prompt kinds. These are single-segment legacy domains
// because agents already receive them as valid stable identifiers.
"prompt-quality-issue",
"improvement-idea",
"agent-friction",
"design-thought",
"missing-feature",
"brittle-predicate",
"git-empty-pathspec",
"advisory-downgrade",
]); ]);
const KIND_SEGMENT_RE = /^[a-z][a-z0-9]*(?:-[a-z0-9]+)*$/; const KIND_SEGMENT_RE = /^[a-z][a-z0-9]*(?:-[a-z0-9]+)*$/;

View file

@ -289,11 +289,13 @@ describe("runUnit — SF_AUTONOMOUS_VIA_SWARM=1 — happy path", () => {
const lastMsg = result.event.messages[result.event.messages.length - 1]; const lastMsg = result.event.messages[result.event.messages.length - 1];
expect(Array.isArray(lastMsg.content)).toBe(true); expect(Array.isArray(lastMsg.content)).toBe(true);
// Tool-use block is first in the fallback path (no real tool calls emitted) // When no real tool calls are emitted, a synthetic checkpoint is injected
// so the solver pass sees a valid signal. It sits first, followed by text.
expect(lastMsg.content[0]).toMatchObject({ expect(lastMsg.content[0]).toMatchObject({
type: "tool_use", type: "tool_use",
name: "swarm_unit_complete", name: "checkpoint",
}); });
expect(lastMsg.content[0].input.outcome).toBe("continue");
// Text block is second // Text block is second
expect(lastMsg.content[1]).toMatchObject({ expect(lastMsg.content[1]).toMatchObject({
type: "text", type: "text",
@ -403,13 +405,14 @@ describe("runUnit — SF_AUTONOMOUS_VIA_SWARM=1 — happy path", () => {
const lastMsg = result.event.messages[result.event.messages.length - 1]; const lastMsg = result.event.messages[result.event.messages.length - 1];
expect(Array.isArray(lastMsg.content)).toBe(true); expect(Array.isArray(lastMsg.content)).toBe(true);
// When no real tool calls are emitted, runUnitViaSwarm injects a synthetic
// checkpoint (not swarm_unit_complete) so the solver pass gets a valid signal.
const toolBlock = lastMsg.content.find( const toolBlock = lastMsg.content.find(
(b) => b.type === "tool_use" && b.name === "swarm_unit_complete", (b) => b.type === "tool_use" && b.name === "checkpoint",
); );
expect(toolBlock).toBeDefined(); expect(toolBlock).toBeDefined();
expect(toolBlock.input.outcome).toBe("continue"); expect(toolBlock.input.outcome).toBe("continue");
expect(typeof toolBlock.input.summary).toBe("string"); expect(typeof toolBlock.input.summary).toBe("string");
expect(toolBlock.input.targetAgent).toBe(MOCK_TARGET);
}); });
test("event.messages[last].content contains a text block with swarm reply", async () => { test("event.messages[last].content contains a text block with swarm reply", async () => {
@ -731,6 +734,8 @@ describe("runUnit — Round 8: swarmToolCallCount in UnitResult", () => {
test("swarmToolCallCount is 0 when no tool calls emitted (default mock)", async () => { test("swarmToolCallCount is 0 when no tool calls emitted (default mock)", async () => {
// When the swarm worker emits no tool-call events, swarmToolCallCount must be // When the swarm worker emits no tool-call events, swarmToolCallCount must be
// 0 so phases-unit.js still applies the zero-tool-call guard for real no-ops. // 0 so phases-unit.js still applies the zero-tool-call guard for real no-ops.
// Note: the synthetic checkpoint is pushed onto collectedToolCalls before the
// result is built, so runUnitViaSwarm subtracts it when computing
// swarmToolCallCount; the count therefore reflects only real worker events.
process.env.SF_AUTONOMOUS_VIA_SWARM = "1"; process.env.SF_AUTONOMOUS_VIA_SWARM = "1";
const ctx = makeCtx("/proj"); const ctx = makeCtx("/proj");
@ -750,6 +755,12 @@ describe("runUnit — Round 8: swarmToolCallCount in UnitResult", () => {
expect(result.status).toBe("completed"); expect(result.status).toBe("completed");
expect(result._via).toBe("swarm"); expect(result._via).toBe("swarm");
expect(result.swarmToolCallCount).toBe(0); expect(result.swarmToolCallCount).toBe(0);
// The synthetic checkpoint is injected into event.messages content but does
// NOT affect swarmToolCallCount, which counts only real worker tool calls.
const lastMsg = result.event.messages[result.event.messages.length - 1];
const toolBlocks = lastMsg.content.filter((b) => b.type === "tool_use");
expect(toolBlocks.length).toBeGreaterThanOrEqual(1);
expect(toolBlocks[0].name).toBe("checkpoint");
}); });
test("swarmToolCallCount equals the number of tool calls emitted", async () => { test("swarmToolCallCount equals the number of tool calls emitted", async () => {
@ -890,14 +901,17 @@ describe("runUnit — Round 6: real tool calls captured from onEvent", () => {
const lastMsg = result.event.messages[result.event.messages.length - 1]; const lastMsg = result.event.messages[result.event.messages.length - 1];
expect(Array.isArray(lastMsg.content)).toBe(true); expect(Array.isArray(lastMsg.content)).toBe(true);
// Should have 2 tool_use blocks + 1 text block = 3 total // Should have 2 real tool_use blocks + 1 synthetic checkpoint + 1 text block.
// The synthetic checkpoint is injected even when real tool calls exist,
// because the worker did not call checkpoint itself.
const toolUseBlocks = lastMsg.content.filter((b) => b.type === "tool_use"); const toolUseBlocks = lastMsg.content.filter((b) => b.type === "tool_use");
const textBlocks = lastMsg.content.filter((b) => b.type === "text"); const textBlocks = lastMsg.content.filter((b) => b.type === "text");
expect(toolUseBlocks).toHaveLength(2); expect(toolUseBlocks).toHaveLength(3);
expect(textBlocks).toHaveLength(1); expect(textBlocks).toHaveLength(1);
expect(toolUseBlocks[0]).toMatchObject({ type: "tool_use", name: "Bash" }); expect(toolUseBlocks[0]).toMatchObject({ type: "tool_use", name: "Bash" });
expect(toolUseBlocks[1]).toMatchObject({ type: "tool_use", name: "Read" }); expect(toolUseBlocks[1]).toMatchObject({ type: "tool_use", name: "Read" });
expect(toolUseBlocks[2]).toMatchObject({ type: "tool_use", name: "checkpoint" });
expect(textBlocks[0].text).toBe(MOCK_REPLY); expect(textBlocks[0].text).toBe(MOCK_REPLY);
}); });
@ -993,13 +1007,17 @@ describe("runUnit — Round 6: real tool calls captured from onEvent", () => {
expect(result.status).toBe("completed"); expect(result.status).toBe("completed");
const lastMsg = result.event.messages[result.event.messages.length - 1]; const lastMsg = result.event.messages[result.event.messages.length - 1];
// Must still have a tool_use block with name=swarm_unit_complete // When the worker emits no real tool calls, runUnitViaSwarm now injects
// a synthetic checkpoint tool_use so the solver pass sees a valid signal
// rather than looping on a no-op transcript. The original swarm_unit_complete
// placeholder is superseded by this synthetic checkpoint.
const toolBlocks = lastMsg.content.filter((b) => b.type === "tool_use"); const toolBlocks = lastMsg.content.filter((b) => b.type === "tool_use");
expect(toolBlocks).toHaveLength(1); expect(toolBlocks.length).toBeGreaterThanOrEqual(1);
expect(toolBlocks[0]).toMatchObject({ expect(toolBlocks[0]).toMatchObject({
type: "tool_use", type: "tool_use",
name: "swarm_unit_complete", name: "checkpoint",
}); });
expect(toolBlocks[0].input.outcome).toBe("continue");
// The text block must still carry the reply // The text block must still carry the reply
const textBlocks = lastMsg.content.filter((b) => b.type === "text"); const textBlocks = lastMsg.content.filter((b) => b.type === "text");

View file

@ -681,6 +681,28 @@ test("recordSelfFeedback_kind_validation_accepts_canonical_shapes", () => {
} }
}); });
// Each public report_issue prompt kind must be accepted by recordSelfFeedback
// and come back unchanged on the recorded entry.
test("recordSelfFeedback_kind_validation_accepts_report_issue_prompt_kinds", () => {
  const project = makeForgeProject();
  const reportIssueKinds = [
    "prompt-quality-issue",
    "improvement-idea",
    "agent-friction",
    "design-thought",
    "missing-feature",
    "brittle-predicate",
    "git-empty-pathspec",
    "advisory-downgrade",
  ];

  reportIssueKinds.forEach((kind) => {
    const feedback = {
      kind,
      severity: "medium",
      summary: `report_issue accepts ${kind}`,
    };
    const recorded = recordSelfFeedback(feedback, project);

    // A falsy return means the kind was rejected by validation.
    assert.ok(recorded, `expected report_issue kind ${kind} to be accepted`);
    assert.equal(recorded.entry.kind, kind);
  });
});
test("recordSelfFeedback_kind_validation_rejects_malformed", () => { test("recordSelfFeedback_kind_validation_rejects_malformed", () => {
const project = makeForgeProject(); const project = makeForgeProject();
const cases = [ const cases = [