From 8ee4d835818d479805ec7e87a9f0fdace104c0f0 Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Sat, 16 May 2026 20:30:58 +0200 Subject: [PATCH] fix(agent-runner): retry inbox refresh + throw loud on missing message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes sf-mp8g4rcd-w01tkh (FINAL prompt-never-sent root cause) — the agent-runner.js:182 silent early-return that has been causing 59+ runaway-loop:idle-halt feedback entries and the recurring "Autonomous loop stuck — no heartbeat" cascade. Root cause: when swarm-dispatch's bus delivers a message and SF kernel marks the unit as dispatched, the consumer agent's inbox sometimes doesn't see the message immediately (different MessageBus instance, SQLite read-cache lag). Previous code returned {turnsProcessed:0, response:null} silently — caller (swarm-dispatch dispatchAndWait) swallowed it as "no work" — LLM never ran — unit appeared cancelled with no diagnostic. Fix: bounded retry on missing-message with exponential backoff: 50, 100, 200, 400, 800 ms (1.55s total max). If target message appears during retry → log recovery event, proceed normally. If still missing after the last retry → throw a loud error with full inbox state in the message. The caller wraps in try/catch and surfaces it as turnResult.error, so the autonomous loop sees a real failure instead of phantom forward progress. What this resolves: - Earlier today: `sf headless triage --apply` timed out at 480000ms because triage-decider subagent hit this bug. With retries, the triage-decider has 1.55s of latency tolerance to receive its prompt. - The 59 backlogged runaway-loop:idle-halt entries are symptoms of the same root cause. Future occurrences will surface as loud errors, not phantom "stuck" units — operator/auto-supervisor can react. Validated: - 578 tests pass (49 files) including agent-runner / swarm-dispatch / inbox tests. - runAgentTurn callers (auto/loop.js, agent-swarm.js, swarm-dispatch dispatchAndWait) all already handle thrown errors via try/catch with explicit error surfacing — the contract change is safe. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../extensions/sf/uok/agent-runner.js | 62 ++++++++++++++++--- 1 file changed, 52 insertions(+), 10 deletions(-) diff --git a/src/resources/extensions/sf/uok/agent-runner.js b/src/resources/extensions/sf/uok/agent-runner.js index 6c1fa15f2..f8e485997 100644 --- a/src/resources/extensions/sf/uok/agent-runner.js +++ b/src/resources/extensions/sf/uok/agent-runner.js @@ -178,23 +178,65 @@ export async function runAgentTurn(agent, opts = {}) { // expected dispatchResult.messageId. let messages; if (onlyMessageId) { - const allMessages = agent.receive(false); // all messages (read + unread) - const target = allMessages.find((m) => m.id === onlyMessageId && !m.read); + // #sf-mp8g4rcd-w01tkh fix: bounded retry on missing-message. + // + // Symptom: MessageBus dispatch returns success and the SF kernel marks + // the unit as dispatched, but the consumer agent's in-memory inbox + // doesn't yet see the new message (likely because the swarm-dispatch + // bus instance and this agent's inbox bus instance are different + // objects with their own SQLite read caches). Previously this triggered + // a silent early-return with {turnsProcessed:0,response:null} → caller + // swallowed it as "no work" → LLM never ran → autonomous-loop froze → + // 59 idle-halt feedback entries piled up. + // + // Fix: retry the inbox refresh with exponential backoff (50, 100, 200, + // 400, 800 ms; ~1.55s total). If found mid-retry, log a recovery event + // and proceed normally. If still missing after the last retry, throw a + // loud error — the caller (swarm-dispatch.js:462-484) wraps this in + // try/catch and surfaces it as turnResult.error, so the autonomous + // loop sees a real failure instead of phantom progress. + const MAX_INBOX_REFRESH_RETRIES = 5; + const INBOX_REFRESH_BACKOFFS_MS = [50, 100, 200, 400, 800]; + let allMessages = agent.receive(false); + let target = allMessages.find((m) => m.id === onlyMessageId && !m.read); + let retryAttempt = 0; + let totalBackoffMs = 0; + while (!target && retryAttempt < MAX_INBOX_REFRESH_RETRIES) { + const backoffMs = INBOX_REFRESH_BACKOFFS_MS[retryAttempt]; + retryAttempt++; + await new Promise((resolve) => setTimeout(resolve, backoffMs)); + totalBackoffMs += backoffMs; + agent._inbox.refresh(); + allMessages = agent.receive(false); + target = allMessages.find((m) => m.id === onlyMessageId && !m.read); + if (target) { + debugLog("agent-runner", { + event: "missing-message-recovered", + agentName: agent.identity?.name, + onlyMessageId, + retryAttempt, + totalBackoffMs, + }); + } + } if (!target) { - // #sf-mp8g4rcd-w01tkh: silent early-return when target isn't in inbox. - // This is the chronic prompt-never-sent failure mode — caller swallows - // {turnsProcessed:0,response:null} as 'no work' and the LLM never runs. - // Surface the inbox state so the bus-instance / refresh-timing bug - // becomes debuggable. debugLog("agent-runner", { - event: "silent-missing-message", - phase: "target-not-found", + event: "missing-message-loud-failure", + phase: "target-not-found-after-retries", agentName: agent.identity?.name, onlyMessageId, + retryAttempt, + totalBackoffMs, inboxSize: allMessages.length, inboxIds: allMessages.map((m) => ({ id: m.id, read: m.read })), }); - return { turnsProcessed: 0, response: null }; + throw new Error( + `agent-runner: message ${onlyMessageId} not found in inbox of "${agent.identity?.name ?? "?"}" ` + + `after ${MAX_INBOX_REFRESH_RETRIES} refresh retries (~${totalBackoffMs}ms total). ` + + `MessageBus dispatch did not deliver to consumer inbox. ` + + `See sf-mp8g4rcd-w01tkh in self-feedback for root-cause analysis ` + + `(likely separate bus instances / SQLite read-cache lag).`, + ); } messages = [target]; } else {