fix(agent-runner): retry inbox refresh + throw loud on missing message
Closes sf-mp8g4rcd-w01tkh (FINAL prompt-never-sent root cause) — the
agent-runner.js:182 silent early-return that has been causing 59+
runaway-loop:idle-halt feedback entries and the recurring "Autonomous
loop stuck — no heartbeat" cascade.
Root cause: when swarm-dispatch's bus delivers a message and SF
kernel marks the unit as dispatched, the consumer agent's inbox
sometimes doesn't see the message immediately (different MessageBus
instance, SQLite read-cache lag). Previous code returned
{turnsProcessed:0, response:null} silently — caller (swarm-dispatch
dispatchAndWait) swallowed it as "no work" — LLM never ran — unit
appeared cancelled with no diagnostic.
Fix: bounded retry on missing-message with exponential backoff:
50, 100, 200, 400, 800 ms (1.55s total max). If target message
appears during retry → log recovery event, proceed normally. If still
missing after the last retry → throw a loud error with full inbox
state in the message. The caller wraps in try/catch and surfaces it
as turnResult.error, so the autonomous loop sees a real failure
instead of phantom forward progress.
What this resolves:
- Earlier today: `sf headless triage --apply` timed out at 480000ms
because triage-decider subagent hit this bug. With retries, the
triage-decider has 1.55s of latency tolerance to receive its prompt.
- The 59 backlogged runaway-loop:idle-halt entries are symptoms of
the same root cause. Future occurrences will surface as loud errors,
not phantom "stuck" units — operator/auto-supervisor can react.
Validated:
- 578 tests pass (49 files) including agent-runner / swarm-dispatch /
inbox tests.
- runAgentTurn callers (auto/loop.js, agent-swarm.js, swarm-dispatch
dispatchAndWait) all already handle thrown errors via try/catch
with explicit error surfacing — the contract change is safe.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
7e882b56d0
commit
8ee4d83581
1 changed file with 52 additions and 10 deletions
|
|
@ -178,23 +178,65 @@ export async function runAgentTurn(agent, opts = {}) {
|
||||||
// expected dispatchResult.messageId.
|
// expected dispatchResult.messageId.
|
||||||
let messages;
|
let messages;
|
||||||
if (onlyMessageId) {
|
if (onlyMessageId) {
|
||||||
const allMessages = agent.receive(false); // all messages (read + unread)
|
// #sf-mp8g4rcd-w01tkh fix: bounded retry on missing-message.
|
||||||
const target = allMessages.find((m) => m.id === onlyMessageId && !m.read);
|
//
|
||||||
|
// Symptom: MessageBus dispatch returns success and the SF kernel marks
|
||||||
|
// the unit as dispatched, but the consumer agent's in-memory inbox
|
||||||
|
// doesn't yet see the new message (likely because the swarm-dispatch
|
||||||
|
// bus instance and this agent's inbox bus instance are different
|
||||||
|
// objects with their own SQLite read caches). Previously this triggered
|
||||||
|
// a silent early-return with {turnsProcessed:0,response:null} → caller
|
||||||
|
// swallowed it as "no work" → LLM never ran → autonomous-loop froze →
|
||||||
|
// 59 idle-halt feedback entries piled up.
|
||||||
|
//
|
||||||
|
// Fix: retry the inbox refresh with exponential backoff (50, 100, 200,
|
||||||
|
// 400, 800 ms; ~1.55s total). If found mid-retry, log a recovery event
|
||||||
|
// and proceed normally. If still missing after the last retry, throw a
|
||||||
|
// loud error — the caller (swarm-dispatch.js:462-484) wraps this in
|
||||||
|
// try/catch and surfaces it as turnResult.error, so the autonomous
|
||||||
|
// loop sees a real failure instead of phantom progress.
|
||||||
|
const MAX_INBOX_REFRESH_RETRIES = 5;
|
||||||
|
const INBOX_REFRESH_BACKOFFS_MS = [50, 100, 200, 400, 800];
|
||||||
|
let allMessages = agent.receive(false);
|
||||||
|
let target = allMessages.find((m) => m.id === onlyMessageId && !m.read);
|
||||||
|
let retryAttempt = 0;
|
||||||
|
let totalBackoffMs = 0;
|
||||||
|
while (!target && retryAttempt < MAX_INBOX_REFRESH_RETRIES) {
|
||||||
|
const backoffMs = INBOX_REFRESH_BACKOFFS_MS[retryAttempt];
|
||||||
|
retryAttempt++;
|
||||||
|
await new Promise((resolve) => setTimeout(resolve, backoffMs));
|
||||||
|
totalBackoffMs += backoffMs;
|
||||||
|
agent._inbox.refresh();
|
||||||
|
allMessages = agent.receive(false);
|
||||||
|
target = allMessages.find((m) => m.id === onlyMessageId && !m.read);
|
||||||
|
if (target) {
|
||||||
|
debugLog("agent-runner", {
|
||||||
|
event: "missing-message-recovered",
|
||||||
|
agentName: agent.identity?.name,
|
||||||
|
onlyMessageId,
|
||||||
|
retryAttempt,
|
||||||
|
totalBackoffMs,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
if (!target) {
|
if (!target) {
|
||||||
// #sf-mp8g4rcd-w01tkh: silent early-return when target isn't in inbox.
|
|
||||||
// This is the chronic prompt-never-sent failure mode — caller swallows
|
|
||||||
// {turnsProcessed:0,response:null} as 'no work' and the LLM never runs.
|
|
||||||
// Surface the inbox state so the bus-instance / refresh-timing bug
|
|
||||||
// becomes debuggable.
|
|
||||||
debugLog("agent-runner", {
|
debugLog("agent-runner", {
|
||||||
event: "silent-missing-message",
|
event: "missing-message-loud-failure",
|
||||||
phase: "target-not-found",
|
phase: "target-not-found-after-retries",
|
||||||
agentName: agent.identity?.name,
|
agentName: agent.identity?.name,
|
||||||
onlyMessageId,
|
onlyMessageId,
|
||||||
|
retryAttempt,
|
||||||
|
totalBackoffMs,
|
||||||
inboxSize: allMessages.length,
|
inboxSize: allMessages.length,
|
||||||
inboxIds: allMessages.map((m) => ({ id: m.id, read: m.read })),
|
inboxIds: allMessages.map((m) => ({ id: m.id, read: m.read })),
|
||||||
});
|
});
|
||||||
return { turnsProcessed: 0, response: null };
|
throw new Error(
|
||||||
|
`agent-runner: message ${onlyMessageId} not found in inbox of "${agent.identity?.name ?? "?"}" ` +
|
||||||
|
`after ${MAX_INBOX_REFRESH_RETRIES} refresh retries (~${totalBackoffMs}ms total). ` +
|
||||||
|
`MessageBus dispatch did not deliver to consumer inbox. ` +
|
||||||
|
`See sf-mp8g4rcd-w01tkh in self-feedback for root-cause analysis ` +
|
||||||
|
`(likely separate bus instances / SQLite read-cache lag).`,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
messages = [target];
|
messages = [target];
|
||||||
} else {
|
} else {
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue