diff --git a/src/resources/extensions/sf/uok/agent-runner.js b/src/resources/extensions/sf/uok/agent-runner.js index 6c1fa15f2..f8e485997 100644 --- a/src/resources/extensions/sf/uok/agent-runner.js +++ b/src/resources/extensions/sf/uok/agent-runner.js @@ -178,23 +178,65 @@ export async function runAgentTurn(agent, opts = {}) { // expected dispatchResult.messageId. let messages; if (onlyMessageId) { - const allMessages = agent.receive(false); // all messages (read + unread) - const target = allMessages.find((m) => m.id === onlyMessageId && !m.read); + // #sf-mp8g4rcd-w01tkh fix: bounded retry on missing-message. + // + // Symptom: MessageBus dispatch returns success and the SF kernel marks + // the unit as dispatched, but the consumer agent's in-memory inbox + // doesn't yet see the new message (likely because the swarm-dispatch + // bus instance and this agent's inbox bus instance are different + // objects with their own SQLite read caches). Previously this triggered + // a silent early-return with {turnsProcessed:0,response:null} → caller + // swallowed it as "no work" → LLM never ran → autonomous-loop froze → + // 59 idle-halt feedback entries piled up. + // + // Fix: retry the inbox refresh with exponential backoff (50, 100, 200, + // 400, 800 ms; ~1.55s total). If found mid-retry, log a recovery event + // and proceed normally. If still missing after the last retry, throw a + // loud error — the caller (swarm-dispatch.js:462-484) wraps this in + // try/catch and surfaces it as turnResult.error, so the autonomous + // loop sees a real failure instead of phantom progress. + const MAX_INBOX_REFRESH_RETRIES = 5; + const INBOX_REFRESH_BACKOFFS_MS = [50, 100, 200, 400, 800]; + let allMessages = agent.receive(false); + let target = allMessages.find((m) => m.id === onlyMessageId && !m.read); + let retryAttempt = 0; + let totalBackoffMs = 0; + while (!target && retryAttempt < MAX_INBOX_REFRESH_RETRIES) { + const backoffMs = INBOX_REFRESH_BACKOFFS_MS[retryAttempt]; + retryAttempt++; + await new Promise((resolve) => setTimeout(resolve, backoffMs)); + totalBackoffMs += backoffMs; + agent._inbox.refresh(); + allMessages = agent.receive(false); + target = allMessages.find((m) => m.id === onlyMessageId && !m.read); + if (target) { + debugLog("agent-runner", { + event: "missing-message-recovered", + agentName: agent.identity?.name, + onlyMessageId, + retryAttempt, + totalBackoffMs, + }); + } + } if (!target) { - // #sf-mp8g4rcd-w01tkh: silent early-return when target isn't in inbox. - // This is the chronic prompt-never-sent failure mode — caller swallows - // {turnsProcessed:0,response:null} as 'no work' and the LLM never runs. - // Surface the inbox state so the bus-instance / refresh-timing bug - // becomes debuggable. debugLog("agent-runner", { - event: "silent-missing-message", - phase: "target-not-found", + event: "missing-message-loud-failure", + phase: "target-not-found-after-retries", agentName: agent.identity?.name, onlyMessageId, + retryAttempt, + totalBackoffMs, inboxSize: allMessages.length, inboxIds: allMessages.map((m) => ({ id: m.id, read: m.read })), }); - return { turnsProcessed: 0, response: null }; + throw new Error( + `agent-runner: message ${onlyMessageId} not found in inbox of "${agent.identity?.name ?? "?"}" ` + + `after ${MAX_INBOX_REFRESH_RETRIES} refresh retries (~${totalBackoffMs}ms total). ` + + `MessageBus dispatch did not deliver to consumer inbox. ` + + `See sf-mp8g4rcd-w01tkh in self-feedback for root-cause analysis ` + + `(likely separate bus instances / SQLite read-cache lag).`, + ); } messages = [target]; } else {