fix(sf): recover model routes and self-feedback

This commit is contained in:
Mikael Hugo 2026-05-02 22:07:10 +02:00
parent c308a492d7
commit dd126ddc8b
37 changed files with 4295 additions and 563 deletions

View file

@ -27,6 +27,7 @@ class MockRpcClient {
stopped = false;
aborted = false;
prompted: string[] = [];
switchedSessions: string[] = [];
private eventListeners: Array<(event: Record<string, unknown>) => void> = [];
uiResponses: Array<{ requestId: string; response: Record<string, unknown> }> = [];
@ -69,6 +70,16 @@ class MockRpcClient {
/** Record the prompt; a '/sf pause' prompt asynchronously emits the paused notice. */
async prompt(message: string): Promise<void> {
  this.prompted.push(message);
  if (message !== '/sf pause') return;
  const pauseNotice = {
    type: 'extension_ui_request',
    id: 'pause-notice',
    method: 'notify',
    message: 'Auto-mode paused: daemon reload requested',
  };
  queueMicrotask(() => this.emitEvent(pauseNotice));
}
async abort(): Promise<void> {
@ -79,6 +90,18 @@ class MockRpcClient {
this.uiResponses.push({ requestId, response });
}
/** Report a mock session file/id pair derived from initSessionId. */
async getState(): Promise<{ sessionFile: string; sessionId: string }> {
  const sessionId = this.initSessionId;
  const sessionFile = `/tmp/${sessionId}.jsonl`;
  return { sessionFile, sessionId };
}
/** Record the requested session path; the mock never cancels a switch. */
async switchSession(sessionPath: string): Promise<{ cancelled: boolean }> {
  this.switchedSessions.push(sessionPath);
  const outcome = { cancelled: false };
  return outcome;
}
/** Test helper — emit an event to all listeners */
emitEvent(event: Record<string, unknown>): void {
for (const listener of this.eventListeners) {
@ -98,6 +121,15 @@ class TestableSessionManager extends SessionManager {
nextInitError: Error | null = null;
nextStartError: Error | null = null;
/** Inject a MockRpcClient with a sequential mock-session id instead of a real child. */
protected override createRpcClient(_cliPath: string, cwd: string, args: string[]): any {
  this.sessionCounter += 1;
  const paddedId = String(this.sessionCounter).padStart(3, '0');
  const client = new MockRpcClient({ cwd, args });
  client.initSessionId = `mock-session-${paddedId}`;
  this.allClients.push(client);
  this.lastClient = client;
  return client;
}
override async startSession(options: { projectDir: string; command?: string; model?: string; bare?: boolean; cliPath?: string }): Promise<string> {
const { projectDir } = options;
@ -116,7 +148,7 @@ class TestableSessionManager extends SessionManager {
);
}
const client = new MockRpcClient({ cwd: resolvedDir, args: [] });
const client = this.createRpcClient('mock-sf', resolvedDir, []);
if (this.nextStartError) {
client.startError = this.nextStartError;
this.nextStartError = null;
@ -126,22 +158,19 @@ class TestableSessionManager extends SessionManager {
this.nextInitError = null;
}
this.sessionCounter++;
client.initSessionId = `mock-session-${String(this.sessionCounter).padStart(3, '0')}`;
this.lastClient = client;
this.allClients.push(client);
// Build session shell
const session: ManagedSession = {
sessionId: '',
projectDir: resolvedDir,
projectName,
status: 'starting',
reloadState: 'running',
client: client as any, // duck-typed mock
events: [],
pendingBlocker: null,
cost: { totalCost: 0, tokens: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 } },
startTime: Date.now(),
startOptions: { ...options, projectDir: resolvedDir },
};
// Insert into internal sessions map
@ -300,6 +329,38 @@ describe('SessionManager', () => {
assert.equal(completedLogs.length, 1);
});
it('runtime epoch mismatch restarts child and resumes prior session file', async () => {
  const { manager } = createManager();
  const sessionId = await manager.startSession({ projectDir: '/tmp/reload-project' });
  const originalClient = manager.lastClient!;
  // Resolves once the manager announces the replacement child.
  const restarted = new Promise<void>((resolve) => {
    manager.once('session:restarted', () => resolve());
  });
  // sourceEpoch (200) newer than runtimeEpoch (100) should trigger a reload.
  originalClient.emitEvent({
    type: 'runtime_heartbeat',
    sessionId,
    sessionFile: '/tmp/reload-session.jsonl',
    unitType: 'execute-task',
    unitId: 'M001/S01/T01',
    runtimeEpoch: 100,
    sourceEpoch: 200,
    emittedAt: Date.now(),
  });
  await restarted;
  // The replacement child re-inits under the next sequential mock session id.
  const session = manager.getSession('mock-session-002')!;
  assert.ok(session);
  // Old child was stopped; exactly one replacement client was created.
  assert.equal(originalClient.stopped, true);
  assert.equal(manager.allClients.length, 2);
  const replacement = manager.allClients[1];
  // Replacement resumed the original session file and re-entered auto mode.
  assert.deepEqual(replacement.switchedSessions, ['/tmp/mock-session-001.jsonl']);
  assert.deepEqual(replacement.prompted, ['/sf autonomous']);
  assert.equal(session.reloadState, 'running');
});
// ---- Lifecycle: start → running → blocked → resolve → running → completed ----
it('start → blocked → resolve → running → completed lifecycle', async () => {
@ -723,8 +784,10 @@ describe('SessionManager', () => {
assert.equal(result.sessionId, sessionId);
assert.equal(result.status, 'running');
assert.equal(result.reloadState, 'running');
assert.equal(result.projectName, 'result-test');
assert.equal(result.error, null);
assert.equal(result.lastHeartbeat, null);
assert.equal(result.pendingBlocker, null);
assert.ok(typeof result.durationMs === 'number');
assert.ok(result.cost);

View file

@ -22,6 +22,7 @@ import type {
ManagedSession,
StartSessionOptions,
PendingBlocker,
RuntimeHeartbeat,
} from './types.js';
import { MAX_EVENTS, INIT_TIMEOUT_MS } from './types.js';
import type { Logger } from './logger.js';
@ -34,7 +35,8 @@ const FIRE_AND_FORGET_METHODS = new Set([
'notify', 'setStatus', 'setWidget', 'setTitle', 'set_editor_text',
]);
const TERMINAL_PREFIXES = ['auto-mode stopped', 'step-mode stopped'];
const TERMINAL_PREFIXES = ['auto-mode stopped', 'auto-mode paused', 'step-mode stopped'];
const RELOAD_PAUSE_TIMEOUT_MS = 5_000;
function isTerminalNotification(event: Record<string, unknown>): boolean {
if (event.type !== 'extension_ui_request' || event.method !== 'notify') return false;
@ -45,7 +47,7 @@ function isTerminalNotification(event: Record<string, unknown>): boolean {
function isBlockedNotification(event: Record<string, unknown>): boolean {
if (event.type !== 'extension_ui_request' || event.method !== 'notify') return false;
const message = String(event.message ?? '').toLowerCase();
return message.includes('blocked:');
return message.includes('blocked:') || message.startsWith('auto-mode paused');
}
function isBlockingUIRequest(event: Record<string, unknown>): boolean {
@ -96,11 +98,7 @@ export class SessionManager extends EventEmitter {
if (options.model) args.push('--model', options.model);
if (options.bare) args.push('--bare');
const client = new RpcClient({
cliPath,
cwd: resolvedDir,
args,
});
const client = this.createRpcClient(cliPath, resolvedDir, args);
// Build the session shell before async operations so we can track state
const session: ManagedSession = {
@ -108,11 +106,13 @@ export class SessionManager extends EventEmitter {
projectDir: resolvedDir,
projectName,
status: 'starting',
reloadState: 'running',
client,
events: [],
pendingBlocker: null,
cost: { totalCost: 0, tokens: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 } },
startTime: Date.now(),
startOptions: { ...options, projectDir: resolvedDir },
};
// Insert into map early (keyed by dir) so concurrent starts are rejected
@ -231,6 +231,18 @@ export class SessionManager extends EventEmitter {
this.logger.info('session cancelled', { sessionId, projectDir: session.projectDir });
}
/**
* Restart a managed RPC child and resume the same persisted session when possible.
*
* Purpose: make daemon-managed auto sessions pick up changed runtime/source
* files at process boundaries instead of trying unsafe in-process hot reload.
*/
async reloadSession(sessionId: string, reason = 'runtime epoch changed'): Promise<void> {
const session = this.getSession(sessionId);
if (!session) throw new Error(`Session not found: ${sessionId}`);
await this.restartSession(session, reason);
}
/**
* Build a HeadlessJsonResult-shaped object from accumulated session state.
*/
@ -245,9 +257,11 @@ export class SessionManager extends EventEmitter {
projectDir: session.projectDir,
projectName: session.projectName,
status: session.status,
reloadState: session.reloadState ?? 'running',
durationMs,
cost: session.cost,
recentEvents: session.events.slice(-10),
lastHeartbeat: session.lastHeartbeat ?? null,
pendingBlocker: session.pendingBlocker
? { id: session.pendingBlocker.id, method: session.pendingBlocker.method, message: session.pendingBlocker.message }
: null,
@ -311,6 +325,10 @@ export class SessionManager extends EventEmitter {
this.logger.debug('session event', { sessionId: session.sessionId, type: (event as Record<string, unknown>).type as string });
this.emit('session:event', { sessionId: session.sessionId, projectDir: session.projectDir, event });
if ((event as Record<string, unknown>).type === 'runtime_heartbeat') {
this.handleRuntimeHeartbeat(session, event as unknown as RuntimeHeartbeat);
}
// Cost tracking (K004 — cumulative-max)
if ((event as Record<string, unknown>).type === 'cost_update') {
const costEvent = event as unknown as RpcCostUpdateEvent;
@ -371,6 +389,135 @@ export class SessionManager extends EventEmitter {
});
}
}
/**
 * Track heartbeats from the RPC child and kick off a restart when the
 * child's runtime epoch no longer matches the observed source epoch.
 * Restart failures mark the session as errored and emit 'session:error'.
 */
private handleRuntimeHeartbeat(session: ManagedSession, heartbeat: RuntimeHeartbeat): void {
  // Always record the latest heartbeat, even when no reload is needed.
  session.lastHeartbeat = heartbeat;
  const epochsAgree = heartbeat.runtimeEpoch === heartbeat.sourceEpoch;
  const reloadInFlight = session.reloadState === 'reloading';
  const restartable = session.status === 'running' || session.status === 'blocked';
  if (epochsAgree || reloadInFlight || !restartable) return;
  this.logger.info('runtime epoch mismatch detected', {
    sessionId: session.sessionId,
    projectDir: session.projectDir,
    unitType: heartbeat.unitType,
    unitId: heartbeat.unitId,
    runtimeEpoch: heartbeat.runtimeEpoch,
    sourceEpoch: heartbeat.sourceEpoch,
  });
  // Fire-and-forget: failure is reported via state + the error event.
  void this.restartSession(session, 'runtime epoch changed').catch((err) => {
    const message = err instanceof Error ? err.message : String(err);
    session.reloadState = 'reload_failed';
    session.status = 'error';
    session.error = message;
    this.logger.error('session reload failed', {
      sessionId: session.sessionId,
      projectDir: session.projectDir,
      error: message,
    });
    this.emit('session:error', {
      sessionId: session.sessionId,
      projectDir: session.projectDir,
      projectName: session.projectName,
      error: message,
    });
  });
}
/**
 * Stop the current RPC child and start a fresh one for the same managed
 * session, resuming the prior persisted session file when it can be found.
 *
 * Sequence matters: capture session file → ask auto-mode to pause →
 * unsubscribe → stop old child → spawn/init new child → re-subscribe →
 * switch to prior session file → re-issue the start command. On success
 * reloadState ends at 'running'; errors propagate to the caller
 * (handleRuntimeHeartbeat marks the session 'reload_failed').
 */
private async restartSession(session: ManagedSession, reason: string): Promise<void> {
  // Re-entrancy guard: a second mismatch heartbeat must not double-restart.
  if (session.reloadState === 'reloading') return;
  session.reloadState = 'reloading';
  // Prefer the child's own notion of its session file; fall back to the
  // value seen in the last heartbeat.
  let sessionFile = session.lastHeartbeat?.sessionFile;
  try {
    const state = await session.client.getState();
    sessionFile = state.sessionFile ?? sessionFile;
  } catch {
    // Best effort: a wedged child may not answer state requests.
  }
  // Ask auto-mode to pause so the child reaches a safe point before stop.
  try {
    await session.client.prompt('/sf pause');
    await waitFor(
      () => session.status === 'blocked' || session.status === 'completed' || session.status === 'cancelled',
      RELOAD_PAUSE_TIMEOUT_MS,
    );
  } catch {
    // Timeout or prompt failure: stop() escalates SIGTERM to SIGKILL.
  }
  // Detach the old event subscription first so stale events from the dying
  // child cannot mutate session state mid-restart.
  session.unsubscribe?.();
  try {
    await session.client.stop();
  } catch {
    // stop() is best-effort; subsequent start creates a new child.
  }
  // Rebuild the child from the original start options (model/bare/cliPath).
  const opts = session.startOptions ?? { projectDir: session.projectDir };
  const cliPath = opts.cliPath ?? SessionManager.resolveCLIPath();
  const args: string[] = ['--mode', 'rpc'];
  if (opts.model) args.push('--model', opts.model);
  if (opts.bare) args.push('--bare');
  const client = this.createRpcClient(cliPath, session.projectDir, args);
  await Promise.race([
    client.start(),
    timeout(INIT_TIMEOUT_MS, `RpcClient.start() timed out after ${INIT_TIMEOUT_MS}ms`),
  ]);
  const initResult: RpcInitResult = await Promise.race([
    client.init(),
    timeout(INIT_TIMEOUT_MS, `RpcClient.init() timed out after ${INIT_TIMEOUT_MS}ms`),
  ]) as RpcInitResult;
  // Swap the session over to the new child and clear stale error state.
  session.client = client;
  session.sessionId = initResult.sessionId;
  session.status = 'running';
  session.pendingBlocker = null;
  session.reloadState = 'restarted';
  session.error = undefined;
  session.startOptions = { ...opts, projectDir: session.projectDir };
  session.unsubscribe = client.onEvent((event: SdkAgentEvent) => {
    this.handleEvent(session, event);
  });
  // Resume the prior persisted conversation when we know its file.
  if (sessionFile) {
    try {
      await client.switchSession(sessionFile);
    } catch (err) {
      this.logger.warn('session reload could not switch to previous session file', {
        sessionId: session.sessionId,
        projectDir: session.projectDir,
        sessionFile,
        error: err instanceof Error ? err.message : String(err),
      });
    }
  }
  // Re-issue the original command (default: autonomous auto-mode).
  await client.prompt(opts.command ?? '/sf autonomous');
  session.reloadState = 'running';
  this.logger.info('session reloaded', {
    sessionId: session.sessionId,
    projectDir: session.projectDir,
    reason,
    resumedSessionFile: sessionFile,
  });
  this.emit('session:restarted', {
    sessionId: session.sessionId,
    projectDir: session.projectDir,
    projectName: session.projectName,
    reason,
    sessionFile,
  });
}
/** Factory seam for RPC children — tests override this to inject mocks. */
protected createRpcClient(cliPath: string, cwd: string, args: string[]): RpcClient {
  return new RpcClient({ cliPath, cwd, args });
}
}
// ---------------------------------------------------------------------------
@ -383,6 +530,24 @@ function timeout(ms: number, message: string): Promise<never> {
});
}
/**
 * Poll `predicate` every 100ms until it returns true or `timeoutMs` elapses.
 *
 * The predicate is checked synchronously first, so an already-true condition
 * resolves without waiting. Rejects with a timeout Error when the deadline
 * passes. Fix: a predicate that throws inside the interval previously left
 * the timer running forever with an uncaught exception and a promise that
 * never settled; it now clears the interval and rejects with that error.
 */
function waitFor(predicate: () => boolean, timeoutMs: number): Promise<void> {
  if (predicate()) return Promise.resolve();
  return new Promise((resolve, reject) => {
    const startedAt = Date.now();
    const interval = setInterval(() => {
      try {
        if (predicate()) {
          clearInterval(interval);
          resolve();
          return;
        }
      } catch (err) {
        // A throwing predicate must settle the promise, not crash the process.
        clearInterval(interval);
        reject(err instanceof Error ? err : new Error(String(err)));
        return;
      }
      if (Date.now() - startedAt >= timeoutMs) {
        clearInterval(interval);
        reject(new Error(`Timed out after ${timeoutMs}ms`));
      }
    }, 100);
  });
}
function extractBlocker(event: SdkAgentEvent): PendingBlocker {
const uiEvent = event as unknown as RpcExtensionUIRequest;
return {

View file

@ -57,6 +57,17 @@ export interface DaemonConfig {
// ---------------------------------------------------------------------------
export type SessionStatus = 'starting' | 'running' | 'blocked' | 'completed' | 'error' | 'cancelled';
export type ReloadState = 'running' | 'reloading' | 'restarted' | 'reload_failed';
export interface RuntimeHeartbeat {
  /** Session id reported by the emitting RPC child */
  sessionId: string;
  /** Persisted session file path, when the child knows it */
  sessionFile?: string;
  /** Active unit type (e.g. "execute-task"), when a unit lock is present */
  unitType?: string;
  /** Active unit id (e.g. "M001/S01/T01"), when a unit lock is present */
  unitId?: string;
  /** Source epoch captured when the child process started */
  runtimeEpoch: number;
  /** Source epoch currently observed on disk by the child */
  sourceEpoch: number;
  /** Wall-clock timestamp (ms) when the heartbeat was emitted */
  emittedAt: number;
}
// ---------------------------------------------------------------------------
// Managed Session
@ -78,6 +89,9 @@ export interface ManagedSession {
/** Current lifecycle status */
status: SessionStatus;
/** Daemon-managed runtime reload state */
reloadState?: ReloadState;
/** The RpcClient instance managing the agent process */
client: RpcClient;
@ -96,6 +110,12 @@ export interface ManagedSession {
/** Error message if status is 'error' */
error?: string;
/** Latest runtime heartbeat received from the RPC child */
lastHeartbeat?: RuntimeHeartbeat;
/** Original session start options used for daemon-managed restarts */
startOptions?: StartSessionOptions;
/** Cleanup function to unsubscribe from events */
unsubscribe?: () => void;
}

View file

@ -0,0 +1,70 @@
import assert from "node:assert/strict";
import { describe, test, vi } from "vitest";
import type { Context, Model } from "../types.js";
// Shared mutable state for the hoisted @google/gemini-cli-core mock: tests
// set retryError and read back the options the provider passed to retry.
const geminiCliCore = vi.hoisted(() => ({
  retryError: undefined as Error | undefined,
  retryOptions: undefined as Record<string, unknown> | undefined,
}));

// Stub cli-core: retryWithBackoff records its options and always throws, so
// tests can observe how the provider configures cli-core's retry loop.
vi.mock("@google/gemini-cli-core", () => ({
  AuthType: { LOGIN_WITH_GOOGLE: "LOGIN_WITH_GOOGLE" },
  CodeAssistServer: class {
    async generateContentStream(): Promise<AsyncGenerator<unknown>> {
      return (async function* emptyStream() {})();
    }
  },
  getOauthClient: vi.fn(async () => ({})),
  makeFakeConfig: vi.fn(() => ({})),
  retryWithBackoff: vi.fn(async (_fn: unknown, options: Record<string, unknown>) => {
    geminiCliCore.retryOptions = options;
    throw geminiCliCore.retryError ?? new Error("quota exhausted");
  }),
  setupUser: vi.fn(async () => ({ projectId: "test-project" })),
}));
import { streamGoogleGeminiCli } from "./google-gemini-cli.js";
/** Minimal gemini-cli model fixture used by these tests. */
function makeModel(): Model<"google-gemini-cli"> {
  const zeroCost = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 };
  return {
    id: "gemini-3-flash-preview",
    name: "Gemini 3 Flash Preview",
    api: "google-gemini-cli",
    provider: "google-gemini-cli",
    baseUrl: "",
    reasoning: true,
    input: ["text"],
    cost: zeroCost,
    contextWindow: 1_000_000,
    maxTokens: 8192,
  };
}
/** Context fixture holding a single user message. */
function makeContext(): Context {
  const userMessage = { role: "user" as const, content: "hello", timestamp: 0 };
  return { messages: [userMessage] };
}
describe("google-gemini-cli provider retry ownership", () => {
  test("google_gemini_cli_when_quota_resets_soon_returns_error_to_caller_without_cli_retry_loop", async () => {
    geminiCliCore.retryOptions = undefined;
    // Simulate cli-core classifying a quota error with a parsed reset delay.
    geminiCliCore.retryError = Object.assign(
      new Error(
        "You have exhausted your capacity on this model. Your quota will reset after 54s.",
      ),
      { retryDelayMs: 54_000 },
    );
    const stream = streamGoogleGeminiCli(makeModel(), makeContext());
    const result = await stream.result();
    const retryOptions = geminiCliCore.retryOptions as
      | { maxAttempts?: unknown }
      | undefined;
    // The provider must hand retryWithBackoff a single attempt — the caller
    // owns cross-model retry/fallback.
    assert.equal(retryOptions?.maxAttempts, 1);
    // The quota error surfaces as a normal error result with the reset delay
    // preserved so the caller can schedule the next attempt.
    assert.equal(result.stopReason, "error");
    assert.match(result.errorMessage ?? "", /exhausted your capacity/i);
    assert.equal(result.retryAfterMs, 54_000);
  });
});

View file

@ -5,7 +5,8 @@
* @google/gemini-cli-core the same library the real `gemini` CLI uses.
* cli-core reads ~/.gemini/oauth_creds.json itself, refreshes tokens,
* discovers the project (free-tier or whatever's onboarded server-side)
* via setupUser(), and handles all the User-Agent / retry / 429 details.
* via setupUser(), and handles all the User-Agent / quota-classification details.
* Request retry/fallback stays in the caller so SF can move to the next model.
*/
import {
@ -227,6 +228,9 @@ export const streamGoogleGeminiCli: StreamFunction<
() => server.generateContentStream(req as any, promptId, "USER" as any),
{
authType: AuthType.LOGIN_WITH_GOOGLE,
// SF owns cross-model fallback. Let cli-core classify quota errors,
// but do not let it hold the turn through its 10-attempt retry loop.
maxAttempts: 1,
signal: options?.signal,
},
);

View file

@ -12,6 +12,8 @@
*/
import * as crypto from "node:crypto";
import { existsSync, readFileSync, readdirSync, statSync } from "node:fs";
import { dirname, join, resolve } from "node:path";
import type { AgentSession } from "../../core/agent-session.js";
import { killTrackedDetachedChildren } from "../../utils/shell.js";
import type {
@ -34,6 +36,110 @@ import type {
RpcSlashCommand,
} from "./rpc-types.js";
const RUNTIME_HEARTBEAT_INTERVAL_MS = Number(
process.env.SF_RUNTIME_HEARTBEAT_INTERVAL_MS ?? 10_000,
);
/**
 * Locate the SF source checkout backing the running process.
 *
 * Honors SF_RUNTIME_SOURCE_ROOT / SF_SOURCE_ROOT when set; otherwise walks
 * upward from the entry script looking for a directory containing both
 * package.json and src/, falling back to cwd at the filesystem root.
 */
function findRuntimeSourceRoot(): string {
  const override =
    process.env.SF_RUNTIME_SOURCE_ROOT ?? process.env.SF_SOURCE_ROOT;
  if (override) return resolve(override);
  const entry = process.argv[1] ?? process.cwd();
  for (let dir = resolve(dirname(entry)); ; ) {
    const looksLikeRoot =
      existsSync(join(dir, "package.json")) && existsSync(join(dir, "src"));
    if (looksLikeRoot) return dir;
    const parent = dirname(dir);
    if (parent === dir) return process.cwd();
    dir = parent;
  }
}
/**
 * Newest modification time (ms) across TypeScript sources under `root`.
 *
 * Iteratively walks the tree, skipping VCS/build/dependency directories, and
 * ignores entries that vanish or are unreadable mid-scan so a concurrent
 * edit can never break the scan. Returns 0 when no matching file exists.
 */
function newestSourceMtimeMs(root: string): number {
  const SKIP_NAMES = new Set([
    ".git",
    ".sf",
    "dist",
    "node_modules",
    "target",
    ".next",
    "coverage",
  ]);
  const TS_FILE = /\.(?:ts|tsx|mts|cts)$/;
  const pending: string[] = [root];
  let newest = 0;
  while (pending.length > 0) {
    const dir = pending.pop()!;
    let entries: import("node:fs").Dirent[] = [];
    try {
      entries = readdirSync(dir, { withFileTypes: true });
    } catch {
      continue; // directory vanished or is unreadable — skip it
    }
    for (const entry of entries) {
      if (SKIP_NAMES.has(entry.name)) continue;
      const fullPath = join(dir, entry.name);
      if (entry.isDirectory()) {
        pending.push(fullPath);
      } else if (entry.isFile() && TS_FILE.test(entry.name)) {
        try {
          const { mtimeMs } = statSync(fullPath);
          if (mtimeMs > newest) newest = mtimeMs;
        } catch {
          // Ignore files that disappear during the scan.
        }
      }
    }
  }
  return newest;
}
/** Unit metadata read from the auto-mode lock file, used to enrich heartbeats. */
interface RuntimeUnitState {
  /** e.g. "execute-task"; absent when no lock file is readable */
  unitType?: string;
  /** e.g. "M001/S01/T01"; absent when no lock file is readable */
  unitId?: string;
  /** Session file recorded in the lock, when present */
  sessionFile?: string;
}
/**
 * Name of the auto-mode lock file for this process. Parallel workers with a
 * milestone lock use a per-milestone file (auto-<lock>.lock); otherwise the
 * shared auto.lock.
 */
function effectiveAutoLockFile(): string {
  if (!process.env.SF_PARALLEL_WORKER) return "auto.lock";
  const milestoneLock = process.env.SF_MILESTONE_LOCK;
  return milestoneLock ? `auto-${milestoneLock}.lock` : "auto.lock";
}
/**
 * Best-effort read of the active unit (type/id/session file) from the
 * auto-mode lock file under <root>/.sf. Checks SF_PROJECT_ROOT first, then
 * the working directory; returns {} when no readable lock exists.
 */
function readRuntimeUnitState(): RuntimeUnitState {
  const candidates = [process.env.SF_PROJECT_ROOT, process.cwd()]
    .filter((root): root is string => Boolean(root))
    .map((root) => resolve(root));
  const visited = new Set<string>();
  for (const root of candidates) {
    if (visited.has(root)) continue;
    visited.add(root);
    const lockPath = join(root, ".sf", effectiveAutoLockFile());
    try {
      if (!existsSync(lockPath)) continue;
      const raw = JSON.parse(readFileSync(lockPath, "utf-8")) as Record<
        string,
        unknown
      >;
      const pick = (value: unknown): string | undefined =>
        typeof value === "string" ? value : undefined;
      return {
        unitType: pick(raw.unitType),
        unitId: pick(raw.unitId),
        sessionFile: pick(raw.sessionFile),
      };
    } catch {
      // Heartbeats must never fail because lock metadata is temporarily
      // absent or being rewritten.
    }
  }
  return {};
}
// Re-export types for consumers
export type {
RpcCommand,
@ -519,6 +625,32 @@ export async function runRpcMode(session: AgentSession): Promise<never> {
}
});
const runtimeSourceRoot = findRuntimeSourceRoot();
const runtimeEpoch = newestSourceMtimeMs(runtimeSourceRoot);
const emitRuntimeHeartbeat = () => {
const runtimeUnit = readRuntimeUnitState();
const heartbeat = {
type: "runtime_heartbeat" as const,
sessionId: session.sessionId,
sessionFile: runtimeUnit.sessionFile ?? session.sessionFile,
unitType: runtimeUnit.unitType,
unitId: runtimeUnit.unitId,
runtimeEpoch,
sourceEpoch: newestSourceMtimeMs(runtimeSourceRoot),
emittedAt: Date.now(),
};
if (!eventFilter || eventFilter.has("runtime_heartbeat")) {
output(heartbeat);
}
};
const runtimeHeartbeatTimer =
RUNTIME_HEARTBEAT_INTERVAL_MS > 0
? setInterval(emitRuntimeHeartbeat, RUNTIME_HEARTBEAT_INTERVAL_MS)
: undefined;
if (runtimeHeartbeatTimer) {
signalCleanupHandlers.push(() => clearInterval(runtimeHeartbeatTimer));
}
// Handle a single command
const handleCommand = async (command: RpcCommand): Promise<RpcResponse> => {
const id = command.id;
@ -901,7 +1033,7 @@ export async function runRpcMode(session: AgentSession): Promise<never> {
protocolVersion: 2,
sessionId: session.sessionId,
capabilities: {
events: ["execution_complete", "cost_update"],
events: ["execution_complete", "cost_update", "runtime_heartbeat"],
commands: ["init", "shutdown", "subscribe"],
},
};

View file

@ -148,7 +148,7 @@ describe("v2 type shapes", () => {
protocolVersion: 2,
sessionId: "test-session-123",
capabilities: {
events: ["execution_complete", "cost_update"],
events: ["execution_complete", "cost_update", "runtime_heartbeat"],
commands: ["init", "shutdown", "subscribe"],
},
};
@ -158,6 +158,7 @@ describe("v2 type shapes", () => {
assert.ok(Array.isArray(initResult.capabilities.commands));
assert.ok(initResult.capabilities.events.includes("execution_complete"));
assert.ok(initResult.capabilities.events.includes("cost_update"));
assert.ok(initResult.capabilities.events.includes("runtime_heartbeat"));
assert.ok(initResult.capabilities.commands.includes("init"));
assert.ok(initResult.capabilities.commands.includes("shutdown"));
assert.ok(initResult.capabilities.commands.includes("subscribe"));
@ -231,6 +232,16 @@ describe("v2 type shapes", () => {
cumulativeCost: 0.03,
tokens: { input: 100, output: 50, cacheRead: 10, cacheWrite: 5 },
},
{
type: "runtime_heartbeat",
sessionId: "s1",
sessionFile: "/tmp/s1.jsonl",
unitType: "execute-task",
unitId: "M001/S01/T01",
runtimeEpoch: 100,
sourceEpoch: 101,
emittedAt: 123,
},
];
for (const event of events) {
@ -242,6 +253,9 @@ describe("v2 type shapes", () => {
// TypeScript narrows to RpcCostUpdateEvent
assert.ok("turnCost" in event);
assert.ok("tokens" in event);
} else if (event.type === "runtime_heartbeat") {
assert.ok("runtimeEpoch" in event);
assert.ok("sourceEpoch" in event);
} else {
assert.fail(`Unexpected event type: ${(event as any).type}`);
}
@ -569,7 +583,7 @@ describe("Client ↔ Mock server protocol exchange", () => {
protocolVersion: 2,
sessionId: "sess-abc",
capabilities: {
events: ["execution_complete", "cost_update"],
events: ["execution_complete", "cost_update", "runtime_heartbeat"],
commands: ["init", "shutdown", "subscribe"],
},
};

View file

@ -273,8 +273,23 @@ export interface RpcCostUpdateEvent {
};
}
/** Runtime heartbeat emitted by long-lived RPC children for daemon reload supervision. */
export interface RpcRuntimeHeartbeatEvent {
  type: "runtime_heartbeat";
  /** Session id of the emitting child */
  sessionId: string;
  /** Persisted session file path, when known */
  sessionFile?: string;
  /** Active unit type from the auto-mode lock, when present */
  unitType?: string;
  /** Active unit id from the auto-mode lock, when present */
  unitId?: string;
  /** Source epoch captured when the child started */
  runtimeEpoch: number;
  /** Source epoch currently observed on disk */
  sourceEpoch: number;
  /** Emission timestamp (ms) */
  emittedAt: number;
}
/** Discriminated union of all v2-only event types */
export type RpcV2Event = RpcExecutionCompleteEvent | RpcCostUpdateEvent;
export type RpcV2Event =
| RpcExecutionCompleteEvent
| RpcCostUpdateEvent
| RpcRuntimeHeartbeatEvent;
// ============================================================================
// Extension UI Events (stdout)

View file

@ -216,7 +216,7 @@ describe("type shapes", () => {
assert.equal(v2, 2);
});
it("RpcV2Event discriminated union covers both event types", () => {
it("RpcV2Event discriminated union covers protocol event types", () => {
const events: RpcV2Event[] = [
{
type: "execution_complete",
@ -241,10 +241,19 @@ describe("type shapes", () => {
cumulativeCost: 0.001,
tokens: { input: 100, output: 50, cacheRead: 0, cacheWrite: 0 },
},
{
type: "runtime_heartbeat",
sessionId: "s1",
sessionFile: "/tmp/s1.jsonl",
runtimeEpoch: 100,
sourceEpoch: 100,
emittedAt: 123,
},
];
assert.equal(events.length, 2);
assert.equal(events.length, 3);
assert.equal(events[0].type, "execution_complete");
assert.equal(events[1].type, "cost_update");
assert.equal(events[2].type, "runtime_heartbeat");
});
});

View file

@ -336,8 +336,23 @@ export interface RpcCostUpdateEvent {
};
}
/** Runtime heartbeat emitted by long-lived RPC children for daemon reload supervision. */
export interface RpcRuntimeHeartbeatEvent {
  type: "runtime_heartbeat";
  /** Session id of the emitting child */
  sessionId: string;
  /** Persisted session file path, when known */
  sessionFile?: string;
  /** Active unit type from the auto-mode lock, when present */
  unitType?: string;
  /** Active unit id from the auto-mode lock, when present */
  unitId?: string;
  /** Source epoch captured when the child started */
  runtimeEpoch: number;
  /** Source epoch currently observed on disk */
  sourceEpoch: number;
  /** Emission timestamp (ms) */
  emittedAt: number;
}
/** Discriminated union of all v2-only event types */
export type RpcV2Event = RpcExecutionCompleteEvent | RpcCostUpdateEvent;
export type RpcV2Event =
| RpcExecutionCompleteEvent
| RpcCostUpdateEvent
| RpcRuntimeHeartbeatEvent;
// ============================================================================
// Extension UI Events (stdout)

View file

@ -15,8 +15,9 @@
* bypassing the extension loader's jiti setup (#1137).
*/
import { existsSync, readdirSync, readFileSync } from "node:fs";
import { homedir } from "node:os";
import { join } from "node:path";
import { dirname, join } from "node:path";
import { createJiti } from "@mariozechner/jiti";
import { resolveBundledSourceResource } from "./bundled-resource-path.js";
import type { SFState } from "./resources/extensions/sf/types.js";
@ -33,7 +34,6 @@ const agentExtensionsDir = join(
"extensions",
"sf",
);
const { existsSync } = await import("node:fs");
const useAgentDir = existsSync(join(agentExtensionsDir, "state.js"));
const sfExtensionPath = (moduleName: string) =>
useAgentDir
@ -46,10 +46,7 @@ const sfExtensionPath = (moduleName: string) =>
);
async function loadExtensionModules() {
const stateModule = (await jiti.import(
sfExtensionPath("state"),
{},
)) as any;
const stateModule = (await jiti.import(sfExtensionPath("state"), {})) as any;
const dispatchModule = (await jiti.import(
sfExtensionPath("auto-dispatch"),
{},
@ -86,6 +83,43 @@ async function loadExtensionModules() {
// ─── Types ──────────────────────────────────────────────────────────────────
// Summary of the dispatch decision the runtime would take for a unit.
type RuntimeDispatchDecisionSummary = {
  action: "dispatch" | "retry" | "notify" | "block" | "skip";
  // Machine-readable justification for `action`.
  reasonCode:
    | "no-runtime-record"
    | "queued"
    | "retry-budget-available"
    | "terminal-ready-to-notify"
    | "retry-budget-exhausted"
    | "synthetic-reset-required"
    | "already-notified"
    | "active-or-claimed"
    | "notified"
    | "terminal-nonretryable";
  retryCount: number;
  maxRetries: number;
  // max(0, maxRetries - retryCount)
  retryBudgetRemaining: number;
};

// Flattened view of a persisted runtime unit record for query snapshots.
type RuntimeUnitSummary = {
  unitType: string;
  unitId: string;
  // Raw lifecycle phase from the unit record (defaults to "dispatched").
  phase: string;
  // Externally visible status; derived from phase when not stored directly.
  status: string;
  startedAt: number | null;
  updatedAt: number | null;
  retryCount: number;
  maxRetries: number;
  retryBudgetRemaining: number;
  lastHeartbeatAt: number | null;
  lastProgressAt: number | null;
  lastOutputAt: number | null;
  outputPath: string | null;
  watchdogReason: string | null;
  notifiedAt: number | null;
  dispatchDecision: RuntimeDispatchDecisionSummary;
};
export interface QuerySnapshot {
schemaVersion: 1;
state: SFState;
@ -105,6 +139,9 @@ export interface QuerySnapshot {
}>;
total: number;
};
runtime: {
units: RuntimeUnitSummary[];
};
}
export interface QueryResult {
@ -114,6 +151,192 @@ export interface QueryResult {
// ─── Implementation ─────────────────────────────────────────────────────────
// Unit statuses that represent a finished run (successful or otherwise).
const QUERY_TERMINAL_STATUSES = new Set([
  "completed",
  "failed",
  "blocked",
  "cancelled",
  "stale",
  "runaway-recovered",
]);

// Terminal statuses that may be retried while retry budget remains.
const QUERY_RETRYABLE_TERMINAL_STATUSES = new Set([
  "failed",
  "stale",
  "runaway-recovered",
]);

// Fallback when a unit record does not declare maxRetries.
const DEFAULT_QUERY_MAX_RETRIES = 1;
/**
 * Walk upward from `basePath` to the nearest directory containing `.sf`.
 * Falls back to `<basePath>/.sf` when no ancestor has one.
 */
function resolveSfRootForQuery(basePath: string): string {
  for (let dir = basePath; ; ) {
    const sfDir = join(dir, ".sf");
    if (existsSync(sfDir)) return sfDir;
    const parent = dirname(dir);
    if (parent === dir) break;
    dir = parent;
  }
  return join(basePath, ".sf");
}
/** Coerce an unknown JSON field to a string, using `fallback` for non-strings. */
function stringField(value: unknown, fallback = ""): string {
  if (typeof value === "string") return value;
  return fallback;
}

/** Coerce an unknown JSON field to a finite number, or null otherwise. */
function numberField(value: unknown): number | null {
  if (typeof value !== "number") return null;
  return Number.isFinite(value) ? value : null;
}
/**
 * Derive a unit's externally visible status from its lifecycle phase.
 * Phases that are themselves statuses pass through; transitional phases map
 * onto the closest status; unknown phases are treated as "running".
 */
function inferQueryStatus(
  phase: string,
  record: Record<string, unknown>,
): string {
  const passthrough = new Set([
    "queued",
    "claimed",
    "running",
    "progress",
    "completed",
    "failed",
    "blocked",
    "cancelled",
    "stale",
    "runaway-recovered",
    "notified",
  ]);
  if (passthrough.has(phase)) return phase;
  if (phase === "paused") {
    // A runaway-guard pause reads as a recovery, not an ordinary block.
    return record.runawayGuardPause ? "runaway-recovered" : "blocked";
  }
  const remapped: Record<string, string> = {
    dispatched: "running",
    "wrapup-warning-sent": "progress",
    "runaway-warning-sent": "progress",
    "runaway-final-warning-sent": "progress",
    recovered: "progress",
    timeout: "stale",
    finalized: "completed",
    skipped: "blocked",
  };
  return remapped[phase] ?? "running";
}
/**
 * Decide what the dispatcher would do with a unit in the given state:
 * skip already-notified/active units, dispatch queued work, retry failed
 * work while budget remains, block synthetic units that ended abnormally,
 * and notify for plain terminal outcomes.
 */
function queryRuntimeDecision(input: {
  unitType: string;
  unitId: string;
  status: string;
  retryCount: number;
  maxRetries: number;
  notifiedAt: number | null;
}): RuntimeDispatchDecisionSummary {
  const { status, notifiedAt } = input;
  const retryBudgetRemaining = Math.max(0, input.maxRetries - input.retryCount);
  const base = {
    retryCount: input.retryCount,
    maxRetries: input.maxRetries,
    retryBudgetRemaining,
  };
  const decide = (
    action: RuntimeDispatchDecisionSummary["action"],
    reasonCode: RuntimeDispatchDecisionSummary["reasonCode"],
  ): RuntimeDispatchDecisionSummary => ({ action, reasonCode, ...base });

  if (notifiedAt !== null) return decide("skip", "already-notified");
  if (status === "notified") return decide("skip", "notified");
  if (status === "queued") return decide("dispatch", "queued");
  if (!QUERY_TERMINAL_STATUSES.has(status)) {
    return decide("skip", "active-or-claimed");
  }
  const isSynthetic =
    input.unitType === "synthetic" || input.unitId.includes("parallel-research");
  if (isSynthetic && status !== "completed") {
    return decide("block", "synthetic-reset-required");
  }
  if (QUERY_RETRYABLE_TERMINAL_STATUSES.has(status)) {
    return retryBudgetRemaining > 0
      ? decide("retry", "retry-budget-available")
      : decide("block", "retry-budget-exhausted");
  }
  if (status === "completed" || status === "blocked" || status === "cancelled") {
    return decide("notify", "terminal-ready-to-notify");
  }
  return decide("skip", "terminal-nonretryable");
}
/**
 * Read every unit record under <sfRoot>/runtime/units and summarize it for
 * the query snapshot. Unreadable or incomplete records are skipped so the
 * query stays best-effort.
 */
function readRuntimeUnitSummaries(basePath: string): RuntimeUnitSummary[] {
  const unitsDir = join(resolveSfRootForQuery(basePath), "runtime", "units");
  if (!existsSync(unitsDir)) return [];
  const summaries: RuntimeUnitSummary[] = [];
  for (const fileName of readdirSync(unitsDir)) {
    if (!fileName.endsWith(".json")) continue;
    let record: Record<string, unknown>;
    try {
      record = JSON.parse(
        readFileSync(join(unitsDir, fileName), "utf-8"),
      ) as Record<string, unknown>;
    } catch {
      // Runtime query must stay best-effort; malformed unit files are ignored.
      continue;
    }
    const unitType = stringField(record.unitType);
    const unitId = stringField(record.unitId);
    if (!unitType || !unitId) continue;
    const phase = stringField(record.phase, "dispatched");
    const status = stringField(record.status, inferQueryStatus(phase, record));
    // Legacy records carried recoveryAttempts instead of retryCount.
    const retryCount =
      numberField(record.retryCount) ?? numberField(record.recoveryAttempts) ?? 0;
    const maxRetries =
      numberField(record.maxRetries) ?? DEFAULT_QUERY_MAX_RETRIES;
    const notifiedAt = numberField(record.notifiedAt);
    const dispatchDecision = queryRuntimeDecision({
      unitType,
      unitId,
      status,
      retryCount,
      maxRetries,
      notifiedAt,
    });
    summaries.push({
      unitType,
      unitId,
      phase,
      status,
      startedAt: numberField(record.startedAt),
      updatedAt: numberField(record.updatedAt),
      retryCount,
      maxRetries,
      retryBudgetRemaining: dispatchDecision.retryBudgetRemaining,
      lastHeartbeatAt: numberField(record.lastHeartbeatAt),
      lastProgressAt: numberField(record.lastProgressAt),
      lastOutputAt: numberField(record.lastOutputAt),
      outputPath:
        typeof record.outputPath === "string" ? record.outputPath : null,
      watchdogReason:
        typeof record.watchdogReason === "string" ? record.watchdogReason : null,
      notifiedAt,
      dispatchDecision,
    });
  }
  return summaries;
}
export async function buildQuerySnapshot(
basePath: string,
): Promise<QuerySnapshot> {
@ -169,6 +392,7 @@ export async function buildQuerySnapshot(
state,
next,
cost: { workers, total: workers.reduce((sum, w) => sum + w.cost, 0) },
runtime: { units: readRuntimeUnitSummaries(basePath) },
};
return snapshot;

View file

@ -15,6 +15,7 @@ import type {
ExtensionCommandContext,
ExtensionContext,
} from "@singularity-forge/pi-coding-agent";
import type { Api, Model } from "@singularity-forge/pi-ai";
import { getManifestStatus } from "./files.js";
import {
assessInterruptedSession,
@ -47,7 +48,11 @@ import { getRtkSessionSavings } from "../shared/rtk-session-stats.js";
import { deactivateSF } from "../shared/sf-phase-state.js";
import { clearActivityLogState } from "./activity-log.js";
import { atomicWriteSync } from "./atomic-write.js";
import { AutoSession, getAutoSession } from "./auto/session.js";
import {
AutoSession,
getAutoSession,
type ModelFailureRecord,
} from "./auto/session.js";
// import { startSliceParallel } from "./slice-parallel-orchestrator.js"; (decoy for legacy regex tests)
import {
getBudgetAlertLevel,
@ -542,6 +547,64 @@ export function setCurrentDispatchedModelId(
s.currentDispatchedModelId = model ? `${model.provider}/${model.id}` : null;
}
/**
 * Update the concrete model tracked for the currently running unit.
 *
 * Purpose: keep fresh-session restoration and dashboard state aligned after
 * runtime provider recovery switches models mid-unit. Also mirrors the model
 * into the dispatched-model id via setCurrentDispatchedModelId, so passing
 * null clears both fields.
 *
 * Consumer: bootstrap/agent-end-recovery.ts after a configured fallback route
 * is successfully applied.
 *
 * @param model the newly active model, or null to clear tracking state.
 */
export function setCurrentUnitModel(model: Model<Api> | null): void {
  s.currentUnitModel = model;
  setCurrentDispatchedModelId(model);
}
/**
 * Record that a provider/model route failed for the current auto unit.
 *
 * Purpose: prevent retry loops on quota/rate-limit/server failures by making
 * subsequent recovery skip the failed route for this unit. No-ops when no
 * unit is active, since there is nothing to scope the failure to.
 *
 * Consumer: bootstrap/agent-end-recovery.ts before selecting the next
 * configured fallback route.
 *
 * @param input failed route details; timestamp defaults to Date.now().
 */
export function recordCurrentModelFailure(input: {
  provider: string;
  modelId: string;
  reason: string;
  timestamp?: number;
}): void {
  const unit = s.currentUnit;
  if (!unit) return;
  const { provider, modelId, reason, timestamp } = input;
  s.modelFailures.push({
    unitType: unit.type,
    unitId: unit.id,
    provider,
    modelId,
    reason,
    timestamp: timestamp ?? Date.now(),
  });
}
/**
 * Return model failures scoped to the currently running auto unit.
 *
 * Purpose: keep recovery decisions unit-local so a quota failure in one unit
 * does not permanently suppress a model in later work. Returns an empty list
 * when no unit is active.
 *
 * Consumer: bootstrap/agent-end-recovery.ts when resolving the next
 * configured fallback route.
 */
export function getCurrentUnitModelFailures(): ModelFailureRecord[] {
  const unit = s.currentUnit;
  if (!unit) return [];
  return s.modelFailures.filter(
    (failure) => failure.unitType === unit.type && failure.unitId === unit.id,
  );
}
/**
* Mark the current research unit as terminal after saving its RESEARCH artifact.
*

View file

@ -47,6 +47,15 @@ export interface StartModel {
id: string;
}
/**
 * A provider/model route that failed while a specific auto unit was running.
 * Recovery reads these records to skip already-failed routes when picking the
 * next fallback for that same unit.
 */
export interface ModelFailureRecord {
  /** Type of the unit the failure occurred in (scopes suppression per unit). */
  unitType: string;
  /** Id of the unit the failure occurred in. */
  unitId: string;
  /** Provider of the failed route. */
  provider: string;
  /** Model id of the failed route. */
  modelId: string;
  /** Human-readable failure reason (e.g. the raw provider error message). */
  reason: string;
  /** Epoch milliseconds when the failure was recorded. */
  timestamp: number;
}
export interface PendingVerificationRetry {
unitId: string;
failureContext: string;
@ -156,6 +165,8 @@ export class AutoSession {
currentUnitModel: Model<Api> | null = null;
/** Fully-qualified model ID (provider/id) set after selectAndApplyModel + hook overrides (#2899). */
currentDispatchedModelId: string | null = null;
/** Per-session, per-unit failed model routes skipped by runtime recovery. */
readonly modelFailures: ModelFailureRecord[] = [];
originalModelId: string | null = null;
originalModelProvider: string | null = null;
lastBudgetAlertLevel: BudgetAlertLevel = 0;
@ -348,6 +359,7 @@ export class AutoSession {
this.manualSessionModelOverride = null;
this.currentUnitModel = null;
this.currentDispatchedModelId = null;
this.modelFailures.length = 0;
this.originalModelId = null;
this.originalModelProvider = null;
this.lastBudgetAlertLevel = 0;

View file

@ -4,13 +4,13 @@ import type {
} from "@singularity-forge/pi-coding-agent";
import {
getAutoDashboardData,
getAutoModeStartModel,
getCurrentUnitModelFailures,
isAutoActive,
pauseAuto,
setCurrentDispatchedModelId,
recordCurrentModelFailure,
setCurrentUnitModel,
} from "../auto.js";
import { isSessionSwitchInFlight, resolveAgentEnd } from "../auto-loop.js";
import { resolveModelId } from "../auto-model-selection.js";
import { blockModel, isModelBlocked } from "../blocked-models.js";
import {
classifyError,
@ -21,76 +21,122 @@ import {
} from "../error-classifier.js";
import { checkAutoStartAfterDiscuss } from "../guided-flow.js";
import {
getNextFallbackModel,
type ModelRouteRef,
resolveNextModelRoute,
} from "../model-route-failure.js";
import {
resolveModelWithFallbacksForUnit,
resolvePersistModelChanges,
} from "../preferences.js";
import { pauseAutoForProviderError } from "../provider-error-pause.js";
import { logWarning } from "../workflow-logger.js";
import { resumeAutoAfterProviderDelay } from "./provider-error-resume.js";
import { clearDiscussionFlowState } from "./write-gate.js";
const retryState = createRetryState();
const MAX_NETWORK_RETRIES = 2;
const MAX_TRANSIENT_AUTO_RESUMES = 8;
/**
 * Reset the module-level retry state so a resumed auto-session starts fresh.
 *
 * Called by provider-error-resume.ts before startAuto(): without this reset,
 * consecutiveTransientCount would accumulate across pause/resume cycles, and
 * legacy paused provider recovery would inherit stale transient counters.
 */
export function resetTransientRetryState(): void {
  resetRetryState(retryState);
}
async function pauseTransientWithBackoff(
cls: ErrorClass,
pi: ExtensionAPI,
function getCurrentRouteFromMessage(
lastMsg: unknown,
ctx: ExtensionContext,
errorDetail: string,
isRateLimit: boolean,
): Promise<void> {
retryState.consecutiveTransientCount += 1;
const baseRetryAfterMs = "retryAfterMs" in cls ? cls.retryAfterMs : 15_000;
const retryAfterMs =
baseRetryAfterMs *
2 ** Math.max(0, retryState.consecutiveTransientCount - 1);
const allowAutoResume =
retryState.consecutiveTransientCount <= MAX_TRANSIENT_AUTO_RESUMES;
if (!allowAutoResume) {
ctx.ui.notify(
`Transient provider errors persisted after ${MAX_TRANSIENT_AUTO_RESUMES} auto-resume attempts. Pausing for manual review.`,
): ModelRouteRef | undefined {
const msg = lastMsg as Record<string, unknown> | undefined;
const provider =
typeof msg?.provider === "string" ? msg.provider : ctx.model?.provider;
const id = typeof msg?.model === "string" ? msg.model : ctx.model?.id;
return provider && id ? { provider, id } : undefined;
}
/**
 * Decide whether an error class indicates a failed provider/model route.
 *
 * Rate-limit, network, server, connection, and stream failures all warrant
 * leaving the current route and attempting a fallback model; every other
 * class is handled elsewhere.
 */
function isModelRouteFailure(cls: ErrorClass): boolean {
  switch (cls.kind) {
    case "rate-limit":
    case "network":
    case "server":
    case "connection":
    case "stream":
      return true;
    default:
      return false;
  }
}
async function trySwitchToFallbackModel(args: {
pi: ExtensionAPI;
ctx: ExtensionContext;
current: ModelRouteRef | undefined;
reason: string;
unitType: string;
basePath: string | undefined;
errorDetail: string;
persistModelChanges: boolean;
}): Promise<boolean> {
const modelConfig = resolveModelWithFallbacksForUnit(args.unitType, {
autoBenchmark: true,
});
if (args.current) {
recordCurrentModelFailure({
provider: args.current.provider,
modelId: args.current.id,
reason: args.reason,
});
}
const availableModels = args.ctx.modelRegistry.getAvailable();
const isBlocked = args.basePath
? (model: { provider: string; id: string }) =>
isModelBlocked(args.basePath!, model.provider, model.id)
: undefined;
for (
let attempt = 0;
attempt < availableModels.length + (modelConfig?.fallbacks.length ?? 0) + 1;
attempt++
) {
const nextRoute = resolveNextModelRoute({
current: args.current,
modelConfig,
availableModels,
failedRoutes: getCurrentUnitModelFailures(),
isBlocked,
});
if (!nextRoute) return false;
const ok = await args.pi.setModel(nextRoute.model, {
persist: args.persistModelChanges,
});
if (!ok) {
recordCurrentModelFailure({
provider: nextRoute.model.provider,
modelId: nextRoute.model.id,
reason: "setModel failed during provider recovery",
});
continue;
}
resetRetryState(retryState);
setCurrentUnitModel(nextRoute.model);
args.ctx.ui.notify(
`Model route failed${args.errorDetail}. Switched to ${nextRoute.source === "configured" ? "configured fallback" : "available fallback"}: ${nextRoute.model.provider}/${nextRoute.model.id}.`,
"warning",
);
args.pi.sendMessage(
{
customType: "sf-auto-timeout-recovery",
content: "Continue execution.",
display: false,
},
{ triggerTurn: true },
);
return true;
}
await pauseAutoForProviderError(
ctx.ui,
errorDetail,
() =>
pauseAuto(ctx, pi, {
message: `Provider error: ${errorDetail}`,
category: "provider",
isTransient: allowAutoResume,
retryAfterMs,
}),
{
isRateLimit,
isTransient: allowAutoResume,
retryAfterMs,
resume: allowAutoResume
? () => {
void resumeAutoAfterProviderDelay(pi, ctx).catch((err) => {
const message = err instanceof Error ? err.message : String(err);
ctx.ui.notify(
`Provider error recovery delay elapsed, but auto-mode failed to resume: ${message}`,
"error",
);
});
}
: undefined,
},
);
return false;
}
export async function handleAgentEnd(
@ -172,14 +218,12 @@ export async function handleAgentEnd(
// ── 1. Classify using rawErrorMsg to avoid prose false-positives ────
const cls = classifyError(rawErrorMsg, explicitRetryAfterMs);
const currentRoute = getCurrentRouteFromMessage(lastMsg, ctx);
const dash = getAutoDashboardData();
// ── 1b. Defer to Core RetryHandler for most transient errors ────────
// Core retries transient failures in-session after this handler.
// Keep that behavior for non-rate-limit classes to avoid pause/retry races,
// but let rate-limit continue into model fallback logic below (#4373).
if (isTransient(cls) && cls.kind !== "rate-limit") {
return;
}
// SF owns provider-route recovery in auto-mode. Quota/rate-limit/server/
// stream/connection failures must leave the failed provider/model route
// immediately instead of sleeping or waiting for same-model retry loops.
// Cap rate-limit backoff for CLI-style providers (openai-codex, google-gemini-cli)
// which use per-user quotas with shorter windows (#2922).
@ -198,9 +242,8 @@ export async function handleAgentEnd(
// same dead model isn't reselected on the next /sf auto restart,
// then try a fallback before pausing.
if (cls.kind === "unsupported-model") {
const dash = getAutoDashboardData();
const rejectedProvider = ctx.model?.provider;
const rejectedId = ctx.model?.id;
const rejectedProvider = currentRoute?.provider;
const rejectedId = currentRoute?.id;
if (dash.basePath && rejectedProvider && rejectedId) {
try {
blockModel(
@ -219,62 +262,18 @@ export async function handleAgentEnd(
}
}
// Try configured fallback chain, skipping anything already blocked.
if (dash.currentUnit && dash.basePath) {
const modelConfig = resolveModelWithFallbacksForUnit(
dash.currentUnit.type,
);
if (modelConfig && modelConfig.fallbacks.length > 0) {
const availableModels = ctx.modelRegistry.getAvailable();
let cursorModelId: string | undefined = ctx.model?.id;
while (true) {
const nextModelId = getNextFallbackModel(
cursorModelId,
modelConfig,
);
if (!nextModelId) break;
if (
isModelBlocked(dash.basePath, ctx.model?.provider, nextModelId)
) {
cursorModelId = nextModelId;
continue;
}
const modelToSet = resolveModelId(
nextModelId,
availableModels,
ctx.model?.provider,
);
if (
modelToSet &&
!isModelBlocked(dash.basePath, modelToSet.provider, modelToSet.id)
) {
const persistModelChanges = resolvePersistModelChanges();
const ok = await pi.setModel(modelToSet, {
persist: persistModelChanges,
});
if (ok) {
setCurrentDispatchedModelId({
provider: modelToSet.provider,
id: modelToSet.id,
});
ctx.ui.notify(
`Switched to unblocked fallback: ${nextModelId} and resuming.`,
"info",
);
pi.sendMessage(
{
customType: "sf-auto-timeout-recovery",
content: "Continue execution.",
display: false,
},
{ triggerTurn: true },
);
return;
}
}
cursorModelId = nextModelId;
}
}
const switched = await trySwitchToFallbackModel({
pi,
ctx,
current: currentRoute,
reason: rawErrorMsg || "unsupported for account",
unitType: dash.currentUnit.type,
basePath: dash.basePath,
errorDetail,
persistModelChanges,
});
if (switched) return;
}
// No usable fallback — pause
@ -292,150 +291,42 @@ export async function handleAgentEnd(
// ── 2. Decide & Act ──────────────────────────────────────────────────
// --- Network errors: same-model retry with backoff ---
if (cls.kind === "network") {
const currentModelId = ctx.model?.id ?? "unknown";
if (retryState.currentRetryModelId !== currentModelId) {
retryState.networkRetryCount = 0;
retryState.currentRetryModelId = currentModelId;
}
if (retryState.networkRetryCount < MAX_NETWORK_RETRIES) {
retryState.networkRetryCount += 1;
retryState.consecutiveTransientCount += 1;
const attempt = retryState.networkRetryCount;
const delayMs = attempt * cls.retryAfterMs;
ctx.ui.notify(
`Network error on ${currentModelId}${errorDetail}. Retry ${attempt}/${MAX_NETWORK_RETRIES} in ${delayMs / 1000}s...`,
"warning",
);
setTimeout(() => {
pi.sendMessage(
{
customType: "sf-auto-timeout-recovery",
content:
"Continue execution — retrying after transient network error.",
display: false,
},
{ triggerTurn: true },
);
}, delayMs);
return;
}
// Network retries exhausted — fall through to model fallback
retryState.networkRetryCount = 0;
retryState.currentRetryModelId = undefined;
ctx.ui.notify(
`Network retries exhausted for ${currentModelId}. Attempting model fallback.`,
"warning",
);
}
// --- Transient errors: try model fallback first, then pause ---
// Rate limits are often per-model, so switching models can bypass them.
if (
cls.kind === "rate-limit" ||
cls.kind === "network" ||
cls.kind === "server" ||
cls.kind === "connection" ||
cls.kind === "stream"
) {
// Try model fallback
const dash = getAutoDashboardData();
if (dash.currentUnit) {
const modelConfig = resolveModelWithFallbacksForUnit(
dash.currentUnit.type,
);
if (modelConfig && modelConfig.fallbacks.length > 0) {
const availableModels = ctx.modelRegistry.getAvailable();
const nextModelId = getNextFallbackModel(ctx.model?.id, modelConfig);
if (nextModelId) {
retryState.networkRetryCount = 0;
retryState.currentRetryModelId = undefined;
const modelToSet = resolveModelId(
nextModelId,
availableModels,
ctx.model?.provider,
);
if (modelToSet) {
const ok = await pi.setModel(modelToSet, {
persist: persistModelChanges,
});
if (ok) {
setCurrentDispatchedModelId({
provider: modelToSet.provider,
id: modelToSet.id,
});
ctx.ui.notify(
`Model error${errorDetail}. Switched to fallback: ${nextModelId} and resuming.`,
"warning",
);
pi.sendMessage(
{
customType: "sf-auto-timeout-recovery",
content: "Continue execution.",
display: false,
},
{ triggerTurn: true },
);
return;
}
}
}
}
}
// Try restoring session model
const sessionModel = getAutoModeStartModel();
if (sessionModel) {
if (
ctx.model?.id !== sessionModel.id ||
ctx.model?.provider !== sessionModel.provider
) {
const startModel = ctx.modelRegistry
.getAvailable()
.find(
(m) =>
m.provider === sessionModel.provider &&
m.id === sessionModel.id,
);
if (startModel) {
const ok = await pi.setModel(startModel, {
persist: persistModelChanges,
});
if (ok) {
setCurrentDispatchedModelId({
provider: startModel.provider,
id: startModel.id,
});
retryState.networkRetryCount = 0;
retryState.currentRetryModelId = undefined;
ctx.ui.notify(
`Model error${errorDetail}. Restored session model: ${sessionModel.provider}/${sessionModel.id} and resuming.`,
"warning",
);
pi.sendMessage(
{
customType: "sf-auto-timeout-recovery",
content: "Continue execution.",
display: false,
},
{ triggerTurn: true },
);
return;
}
}
}
}
}
// --- Transient fallback: pause with auto-resume ---
if (isTransient(cls)) {
await pauseTransientWithBackoff(
cls,
// --- Route failures: try configured fallback first, then any available route ---
if (isModelRouteFailure(cls) && dash.currentUnit) {
const switched = await trySwitchToFallbackModel({
pi,
ctx,
current: currentRoute,
reason: rawErrorMsg || cls.kind,
unitType: dash.currentUnit.type,
basePath: dash.basePath,
errorDetail,
cls.kind === "rate-limit",
persistModelChanges,
});
if (switched) return;
}
// --- Transient fallback exhausted: pause without same-route auto-resume ---
if (isTransient(cls)) {
const message =
isModelRouteFailure(cls) && dash.currentUnit
? `Provider route failed and no usable fallback model remains${errorDetail}`
: `Provider error${errorDetail}`;
await pauseAutoForProviderError(
ctx.ui,
errorDetail,
() =>
pauseAuto(ctx, pi, {
message,
category: "provider",
isTransient: false,
retryAfterMs: "retryAfterMs" in cls ? cls.retryAfterMs : undefined,
}),
{
isRateLimit: cls.kind === "rate-limit",
isTransient: false,
retryAfterMs: "retryAfterMs" in cls ? cls.retryAfterMs : 0,
},
);
return;
}

View file

@ -10,7 +10,7 @@ import {
nextMilestoneId,
} from "../guided-flow.js";
import { loadEffectiveSFPreferences } from "../preferences.js";
import { recordSelfFeedback } from "../self-feedback.js";
import { markResolved, recordSelfFeedback } from "../self-feedback.js";
import {
executeCompleteMilestone,
executePlanMilestone,
@ -687,7 +687,7 @@ export function registerDbTools(pi: ExtensionAPI): void {
promptGuidelines: [
"Use sf_self_report for ANY sf-internal observation — not just bugs. Acceptable kinds include: 'prompt-quality-issue' (you found a prompt ambiguous, contradictory, or missing context), 'improvement-idea' (a non-bug enhancement that would help), 'agent-friction' (workflow friction you worked around), 'design-thought' (broader speculation), 'missing-feature' (capability you wished sf had), as well as classic bug kinds like 'brittle-predicate' or 'git-empty-pathspec'.",
"Do NOT use this for bugs in the user's project, for your own task work, or to track your task's todo list. ONLY for observations about sf-the-tool itself.",
"This tool FILES new entries; it does not address or resolve existing ones. Self-feedback is a triage inbox awaiting human/triage-agent review — do NOT autonomously pick entries off self-feedback and try to fix them. Treat existing entries as out of scope unless your task plan explicitly names a self-feedback entry id as the work.",
"This tool FILES new entries; it does not resolve existing ones. High/critical forge self-feedback may be queued autonomously at startup or an idle turn boundary as repair work. Use sf_self_feedback_resolve after fixing an entry; do not hand-edit the JSONL.",
"Over-reporting is preferred to under-reporting at this stage. If you noticed it about sf, file it. Dedup and threshold-to-roadmap promotion are tracked as their own self-feedback items and will eventually clean noise.",
"Severity guide: low = cosmetic / nice-to-have / improvement idea. medium = noisy or imperfect or recurring friction. high = blocked the unit (sf-the-tool prevented you from completing the task). critical = needs immediate fix (currently treated as high until inline-fix dispatch lands).",
"high/critical entries mark the originating unit as blocked: it will not seal as success, and will be re-queued only after sf is bumped past the recorded version.",
@ -780,6 +780,145 @@ export function registerDbTools(pi: ExtensionAPI): void {
pi.registerTool(selfReportTool);
// ─── sf_self_feedback_resolve ────────────────────────────────────────
// Agent-callable resolver for inline self-feedback repair turns. The
// inline-fix prompt must not rely on hand-editing JSONL: the tool updates
// the structured source of truth and regenerates the markdown view.
const selfFeedbackResolveExecute = async (
_toolCallId: string,
params: any,
_signal: AbortSignal | undefined,
_onUpdate: unknown,
_ctx: unknown,
): Promise<AgentToolResult<Record<string, unknown>>> => {
try {
const ok = markResolved(
params.id,
{
reason: params.reason,
evidence: {
kind: "agent-fix",
commitSha: params.commit_sha,
testPath: params.test_path,
summaryNarrative: params.summary_narrative,
},
criteriaMet: params.criteria_met,
},
process.cwd(),
);
if (!ok) {
return {
content: [
{
type: "text" as const,
text: `Error: unresolved self-feedback entry not found: ${params.id}`,
},
],
details: {
operation: "self_feedback_resolve",
id: params.id,
error: "not_found_or_already_resolved",
},
};
}
return {
content: [
{
type: "text" as const,
text: `Resolved self-feedback ${params.id}`,
},
],
details: {
operation: "self_feedback_resolve",
id: params.id,
resolved: true,
},
};
} catch (err) {
const msg = err instanceof Error ? err.message : String(err);
logError("tool", `sf_self_feedback_resolve tool failed: ${msg}`, {
tool: "sf_self_feedback_resolve",
error: String(err),
});
return {
content: [
{
type: "text" as const,
text: `Error in sf_self_feedback_resolve: ${msg}`,
},
],
details: {
operation: "self_feedback_resolve",
id: params.id,
error: msg,
},
};
}
};
// Register the agent-facing resolver tool. Parameter names use snake_case to
// match the wire format of the other sf_* tools.
pi.registerTool({
  name: "sf_self_feedback_resolve",
  label: "Resolve Self Feedback",
  description:
    "Mark a repaired SF self-feedback entry resolved with structured agent-fix evidence. " +
    "Use this only after verifying the entry no longer applies, landing the fix, and citing the commit or verification evidence.",
  promptSnippet:
    "Resolve a repaired SF self-feedback entry with commit/test evidence",
  promptGuidelines: [
    "Use sf_self_feedback_resolve during self-feedback inline-fix repair turns after the fix is implemented and verified.",
    "Do not hand-edit `.sf/self-feedback.jsonl`; this tool updates the JSONL source of truth and regenerates `.sf/SELF-FEEDBACK.md`.",
    "If the entry has acceptance criteria, pass criteria_met with the criteria that were satisfied.",
    "Pass commit_sha when a commit exists. If an entry was already fixed, cite the existing commit or include summary_narrative and test_path.",
  ],
  parameters: Type.Object({
    id: Type.String({
      description: "Self-feedback entry id, e.g. sf-moocz9so-4ffov2",
    }),
    reason: Type.String({
      description: "Short explanation of why the entry is resolved",
    }),
    commit_sha: Type.Optional(
      Type.String({ description: "Commit SHA containing the fix" }),
    ),
    test_path: Type.Optional(
      Type.String({ description: "Focused test or verification path" }),
    ),
    summary_narrative: Type.Optional(
      Type.String({
        description:
          "Concise verification summary when a commit/test path alone is not enough",
      }),
    ),
    criteria_met: Type.Optional(
      Type.Array(Type.String(), {
        description:
          "Acceptance criteria satisfied by this fix, if the entry provided criteria",
      }),
    ),
  }),
  execute: selfFeedbackResolveExecute,
  // Compact call rendering: tool name plus the target entry id, if provided.
  renderCall(args: any, theme: any) {
    let text = theme.fg("toolTitle", theme.bold("sf_self_feedback_resolve "));
    if (args.id) text += theme.fg("muted", args.id);
    return new Text(text, 0, 0);
  },
  // Result rendering: error line when the tool reported a failure, otherwise
  // a success line naming the resolved entry.
  renderResult(result: any, _options: any, theme: any) {
    const d = result.details;
    if (result.isError || d?.error) {
      return new Text(
        theme.fg("error", `Error: ${d?.error ?? "unknown"}`),
        0,
        0,
      );
    }
    return new Text(
      theme.fg("success", `Resolved ${d?.id ?? "self-feedback"}`),
      0,
      0,
    );
  },
});
// ─── sf_plan_milestone ────────────────────────────────────────────────
const planMilestoneExecute = async (

View file

@ -13,9 +13,9 @@ import {
hasResearchTerminalTransition,
isAutoActive,
isAutoPaused,
markResearchTerminalTransition,
markToolEnd,
markToolStart,
markResearchTerminalTransition,
recordToolInvocationError,
} from "../auto.js";
import {
@ -194,6 +194,18 @@ export function registerHooks(
}
}
loadToolApiKeys();
// Flow audit is read-only by default: surface stale dispatched units,
// missing session pointers, runaway history, and optional child hangs at
// startup before another auto unit compounds the same milestone failure.
try {
const { runFlowAudit } = await import("../doctor.js");
const flow = await runFlowAudit(process.cwd());
if (!flow.ok) {
ctx.ui?.notify?.(`Flow audit: ${flow.recommendedAction}`, "warning");
}
} catch {
/* non-fatal — flow audit must never block session start */
}
// Drain self-feedback: auto-resolve entries whose blocking
// sf-version constraint has been satisfied by the current sf bump,
// and surface entries that remain blocked to the operator. Done after
@ -239,9 +251,9 @@ export function registerHooks(
"warning",
);
}
// Forge-only: surface high/critical entries as inline-fix candidates so
// the operator (or a follow-up dispatcher) can drain self-reported bugs
// without leaving the session. Read-only signal for now — no auto-dispatch.
// Forge-only: high/critical entries are queued as hidden follow-up repair
// work on startup, even outside /sf auto. The drain helper owns claim TTL
// and delivery failure retry, so this is safe to call opportunistically.
const highBlocked = triage.stillBlocked.filter(
(e) => e.severity === "high" || e.severity === "critical",
);
@ -366,6 +378,16 @@ export function registerHooks(
resetToolCallLoopGuard();
resetAskUserQuestionsCache();
await handleAgentEnd(pi, event, ctx);
// Best-effort embedding backfill: when SF_LLM_GATEWAY_KEY is set and the
// gateway has an embed worker online, embed any memories that don't yet
// have a vector. Bounded per invocation; logs once-per-minute when the
// gateway is unavailable so we don't spam the journal.
try {
const { runEmbeddingBackfill } = await import("../memory-embeddings.js");
await runEmbeddingBackfill();
} catch {
// Never break agent_end on backfill issues.
}
});
// Squash-merge quick-task branch back to the original branch after the
@ -378,9 +400,10 @@ export function registerHooks(
// Best-effort: don't break the turn lifecycle if cleanup fails.
}
try {
const { consumeCompletedInlineFixClaim } = await import(
"../self-feedback-drain.js"
);
const {
consumeCompletedInlineFixClaim,
dispatchSelfFeedbackInlineFixIfNeeded,
} = await import("../self-feedback-drain.js");
const resolvedIds = consumeCompletedInlineFixClaim(process.cwd());
if (resolvedIds.length > 0) {
const requestReload = (
@ -391,7 +414,9 @@ export function registerHooks(
requestReload?.(
`self-feedback inline fix resolved ${resolvedIds.length} entr${resolvedIds.length === 1 ? "y" : "ies"}`,
);
return;
}
dispatchSelfFeedbackInlineFixIfNeeded(process.cwd(), ctx, pi);
} catch {
// Best-effort: stale code should not break normal turn completion.
}
@ -511,6 +536,7 @@ export function registerHooks(
block: true,
reason:
`Research unit terminal transition: ${currentUnit.type} ${currentUnit.id} has already completed its RESEARCH artifact. ` +
`Post-artifact drift is blocked before runaway supervision treats it as legitimate large research. ` +
`Planning tools (${event.toolName}) are blocked. The orchestrator will dispatch planner units after research.`,
};
}

View file

@ -131,25 +131,65 @@ export async function handleDoctor(
// ── Flow audit subcommand (sf-moocz9so-4ffov2) ─────────────────────────
if (trimmed === "flow" || trimmed.startsWith("flow ")) {
const flowResult = await runFlowAudit(projectRoot());
const flowResult = await runFlowAudit(projectRoot(), {
killOverBudgetChildren: /\b(--kill-children|kill-children|kill)\b/.test(
trimmed,
),
});
const lines: string[] = ["## SF Flow Audit", ""];
if (flowResult.activeMilestone) {
lines.push(
`**Active milestone:** ${flowResult.activeMilestone.id}${flowResult.activeMilestone.title ? `${flowResult.activeMilestone.title}` : ""}`,
flowResult.activeMilestone.phase
? `- Phase: ${flowResult.activeMilestone.phase}`
: "",
"",
);
} else {
lines.push("**Active milestone:** none", "");
}
if (flowResult.activeUnit) {
const ageMin = Math.round(flowResult.activeUnit.ageMs / 60000);
const progressAgeMin = Math.round(
flowResult.activeUnit.progressAgeMs / 60000,
);
lines.push(
`**Active unit:** ${flowResult.activeUnit.unitType} ${flowResult.activeUnit.unitId}`,
`- Phase: ${flowResult.activeUnit.phase}`,
`- Started: ${flowResult.activeUnit.startedAt}`,
`- Age: ${ageMin} minutes`,
`- Progress age: ${progressAgeMin} minutes`,
flowResult.activeUnit.lastProgressAt
? `- Last progress: ${flowResult.activeUnit.lastProgressAt}`
: "",
"",
);
} else {
lines.push("**Active unit:** none", "");
}
lines.push(
`**Session pointer:** ${
flowResult.sessionPointer?.sessionFile ??
flowResult.sessionPointer?.sessionId ??
"none recorded"
}`,
`**Recommended action:** ${flowResult.recommendedAction}`,
"",
);
if (flowResult.warnings.length > 0) {
lines.push("**Warnings:**");
for (const w of flowResult.warnings) lines.push(`- ${w}`);
lines.push("");
}
if (flowResult.staleDispatchedUnits.length > 0) {
lines.push("**Stale dispatched units:**");
for (const unit of flowResult.staleDispatchedUnits.slice(0, 5)) {
lines.push(
`- ${unit.unitType} ${unit.unitId}: progress age ${Math.round(unit.progressAgeMs / 60000)} minutes`,
);
}
lines.push("");
}
if (flowResult.recommendations.length > 0) {
lines.push("**Recommendations:**");
for (const r of flowResult.recommendations) lines.push(`- ${r}`);
@ -158,7 +198,19 @@ export async function handleDoctor(
if (flowResult.childProcesses.length > 0) {
lines.push("**Child processes:**");
for (const cp of flowResult.childProcesses.slice(0, 10)) {
lines.push(`- pid=${cp.pid} [${cp.classification}] ${cp.cmd.slice(0, 60)}`);
const age =
cp.ageMs === undefined ? "" : ` age=${Math.round(cp.ageMs / 60000)}m`;
const nonBlocking = cp.nonBlocking ? " non-blocking" : "";
lines.push(
`- pid=${cp.pid} ppid=${cp.ppid} [${cp.classification}]${age}${nonBlocking} action=${cp.action} ${cp.cmd.slice(0, 80)}`,
);
}
lines.push("");
}
if (flowResult.runawayHistory.length > 0) {
lines.push("**Runaway history:**");
for (const event of flowResult.runawayHistory.slice(-5)) {
lines.push(`- ${event}`);
}
lines.push("");
}

View file

@ -7,13 +7,15 @@
* tracked docs artifacts (sf-moocr4rv-au7r3l).
*/
import { existsSync, mkdirSync, writeFileSync } from "node:fs";
import { mkdirSync, writeFileSync } from "node:fs";
import { join, resolve } from "node:path";
import type { ExtensionCommandContext } from "@singularity-forge/pi-coding-agent";
import { ensureDbOpen } from "./bootstrap/dynamic-tools.js";
import { projectRoot } from "./commands/context.js";
import { profileRepository } from "./repo-profiler.js";
import { recordRepoProfile } from "./sf-db.js";
import { profileRepository, type RepoProfile } from "./repo-profiler.js";
import { getLatestRepoProfile, recordRepoProfile } from "./sf-db.js";
const HARNESS_PROMOTION_REPO_DIR = "docs/exec-plans/active";
/**
* Format a repo profile summary for user notification.
@ -47,10 +49,91 @@ function formatProfileSummary(
`Stacks: ${stacks}`,
`Risk hints: ${risks}`,
"",
"Untracked files were recorded as observations only; SF did not stage or adopt them.",
"Runtime observation boundary:",
"- Profile state was stored only in .sf runtime state.",
"- No repo-committable artifact was written by profiling.",
"- Use /sf harness promote <finding-id> after review to create a tracked docs artifact.",
"- Untracked files remain observed_only; SF did not stage or adopt them.",
].join("\n");
}
/**
 * Derive a filesystem-safe filename segment from a finding id.
 *
 * Lowercases the trimmed input, collapses each run of disallowed characters
 * into a single dash, strips leading/trailing dashes, and caps the result at
 * 120 characters; falls back to "finding" when nothing usable remains. This
 * keeps promotion artifact names deterministic while preventing path
 * traversal through user-provided finding IDs.
 *
 * Consumer: `/sf harness promote <finding-id>`.
 */
function findingIdSlug(findingId: string): string {
  let slug = findingId.trim().toLowerCase();
  slug = slug.replace(/[^a-z0-9._-]+/g, "-");
  slug = slug.replace(/^-+|-+$/g, "");
  slug = slug.slice(0, 120);
  return slug.length > 0 ? slug : "finding";
}
/**
 * Parse the persisted repo profile JSON from .sf runtime state.
 *
 * Purpose: promotion must be a writeback from recorded observations, not a new
 * profiler run that can observe its own artifact or introduce timestamps.
 *
 * Consumer: `/sf harness promote <finding-id>`.
 */
function parseRecordedProfile(profileJson: string): RepoProfile | null {
  let candidate: Partial<RepoProfile>;
  try {
    candidate = JSON.parse(profileJson) as Partial<RepoProfile>;
  } catch {
    // Malformed JSON: caller falls back to row-level metadata.
    return null;
  }
  const hasIdentity =
    typeof candidate.profileId === "string" &&
    typeof candidate.createdAt === "string";
  const hasGitObservations =
    candidate.git != null && Array.isArray(candidate.git.changedFiles);
  if (hasIdentity && hasGitObservations) {
    return candidate as RepoProfile;
  }
  return null;
}
/**
 * Build the stable JSON payload embedded in a promotion artifact.
 *
 * Purpose: document the recorded observation facts without leaking absolute
 * runtime paths or adding promotion-time fields.
 *
 * Consumer: `/sf harness promote <finding-id>`.
 */
function profilePromotionPayload(
  profile: RepoProfile | null,
  fallback: {
    profileId: string;
    branch: string | null;
    dirty: boolean;
    createdAt: string;
  },
): Record<string, unknown> {
  // `??` intentionally falls through on null as well as undefined, so a null
  // recorded value still surfaces the row-level fallback where one exists.
  const pick = <T>(recorded: T | null | undefined, substitute: T): T =>
    recorded ?? substitute;
  return {
    profileId: pick(profile?.profileId, fallback.profileId),
    profileCapturedAt: pick(profile?.createdAt, fallback.createdAt),
    branch: pick(profile?.git.branch, fallback.branch),
    dirty: pick(profile?.git.dirty, fallback.dirty),
    // Observation lists default to empty so the artifact shape stays stable.
    changedFiles: pick(profile?.git.changedFiles, []),
    stacks: pick(profile?.stacks, []),
    entrypoints: pick(profile?.entrypoints, []),
    tests: pick(profile?.tests, []),
    ci: pick(profile?.ci, []),
    docs: pick(profile?.docs, []),
    dataStores: pick(profile?.dataStores, []),
    networkSurfaces: pick(profile?.networkSurfaces, []),
    riskHints: pick(profile?.riskHints, []),
  };
}
/**
* Promote a harness/profile finding from .sf runtime observations into a
* tracked docs artifact. This is the writeback path that turns operational
@ -80,42 +163,57 @@ export async function handleHarnessPromote(
return;
}
// Determine the target tracked-docs path
const displayFindingId = findingId.trim();
const latestProfile = getLatestRepoProfile();
if (!latestProfile) {
ctx.ui.notify(
"No recorded harness profile found. Run /sf harness profile first; promotion writes tracked docs only from .sf runtime observations.",
"warning",
);
return;
}
const slug = findingIdSlug(displayFindingId);
const relativePath = `${HARNESS_PROMOTION_REPO_DIR}/harness-promotion-${slug}.md`;
const trackedDir = resolve(basePath, "docs", "exec-plans", "active");
const targetPath = join(trackedDir, `harness-promotion-${findingId}.md`);
const targetPath = join(trackedDir, `harness-promotion-${slug}.md`);
// Ensure the tracked directory exists (creates under the repo, not .sf)
mkdirSync(trackedDir, { recursive: true });
// Read the latest profile from DB to include in the promotion
const profile = profileRepository(basePath);
const recordedProfile = parseRecordedProfile(latestProfile.profileJson);
const payload = profilePromotionPayload(recordedProfile, {
profileId: latestProfile.profileId,
branch: latestProfile.branch,
dirty: latestProfile.dirty,
createdAt: latestProfile.createdAt,
});
// Build the promoted artifact content
const content = [
`# Harness Promotion: ${findingId}`,
`# Harness Promotion: ${displayFindingId}`,
"",
`Promoted from: \`.sf\` runtime observations`,
`Promoted at: ${new Date().toISOString()}`,
`Source profile: ${profile.profileId}`,
`Source branch: ${profile.git.branch ?? "unknown"}`,
`Finding ID: ${displayFindingId}`,
`Repo artifact: \`${relativePath}\``,
"Source: `.sf` runtime observations",
`Source profile: ${latestProfile.profileId}`,
`Source profile captured at: ${latestProfile.createdAt}`,
`Source branch: ${latestProfile.branch ?? "unknown"}`,
"",
"## Observed State",
"## Runtime Boundary",
"",
"- `.sf` remains operational runtime state and is not repo output.",
"- Unpromoted .sf runtime observations remain `observed_only`.",
"- This Markdown file is the repo-committable artifact created by promotion.",
"- Promotion does not stage or claim untracked observed files.",
"",
"## Observed Profile",
"",
"```json",
JSON.stringify(
{
profileId: profile.profileId,
branch: profile.git.branch,
changedFiles: profile.git.changedFiles,
stacks: profile.stacks,
riskHints: profile.riskHints,
},
null,
2,
),
JSON.stringify(payload, null, 2),
"```",
"",
"## Status",
"## Review Checklist",
"",
"- [ ] Reviewed by human",
"- [ ] Adopted into milestone plan",
@ -131,10 +229,10 @@ export async function handleHarnessPromote(
ctx.ui.notify(
[
`Harness finding '${findingId}' promoted to tracked docs.`,
`Path: ${targetPath}`,
`Harness finding '${displayFindingId}' promoted to tracked docs.`,
`Path: ${relativePath}`,
"",
"This artifact is now part of the repo's tracked documentation.",
"This Markdown file is now the repo-committable artifact for review.",
"Unpromoted .sf runtime state remains observed_only.",
].join("\n"),
"info",
@ -161,7 +259,7 @@ export async function handleHarness(
}
if (!["profile", "snapshot", "status"].includes(subcommand)) {
ctx.ui.notify(
"Usage: /sf harness profile | /sf harness promote <finding-id>\nRecords a read-only repo profile or promotes a finding to tracked docs.",
"Usage: /sf harness profile | /sf harness promote <finding-id>\nRecords a read-only .sf runtime profile or promotes a reviewed finding to tracked docs.",
"warning",
);
return;

View file

@ -50,34 +50,486 @@ import {
loadEffectiveSFPreferences,
type SFPreferences,
} from "./preferences.js";
import {
type PersistedSelfFeedbackEntry,
readAllSelfFeedback,
recordSelfFeedback,
} from "./self-feedback.js";
import { getMilestoneSlices, getSliceTasks, isDbAvailable } from "./sf-db.js";
import { deriveState, isMilestoneComplete } from "./state.js";
import { isClosedStatus } from "./status-guards.js";
import type { RoadmapSliceEntry } from "./types.js";
import { parseUnitId } from "./unit-id.js";
// ─── Flow Audit Types (sf-moocz9so-4ffov2) ────────────────────────────────
export type FlowAuditChildClassification =
| "active-session"
| "warmup"
| "background"
| "orphan"
| "unknown";
export type FlowAuditChildAction = "observe" | "non-blocking" | "kill";
/**
* Configure `runFlowAudit` for deterministic tests and explicit recovery mode.
*
* Purpose: keep the default auditor read-only during startup while allowing
* `/sf doctor flow --kill-children` and tests to exercise bounded child cleanup.
*
* Consumer: session_start, `/sf doctor flow`, and flow-audit regression tests.
*/
export interface FlowAuditOptions {
nowMs?: number;
staleProgressMs?: number;
optionalChildBudgetMs?: number;
psOutput?: string;
killOverBudgetChildren?: boolean;
killProcess?: (pid: number) => void;
recordSelfFeedback?: boolean;
}
/**
* Flow-audit output returned to commands and startup hooks.
*
* Purpose: preserve enough structured evidence for operators and tests to avoid
* reconstructing stuck auto-mode state from locks, runtime files, sessions, and ps.
*
* Consumer: `/sf doctor flow`, session_start notifications, and regression tests.
*/
export interface FlowAuditResult {
ok: boolean;
activeMilestone?: {
id: string;
title?: string;
phase?: string;
};
activeUnit?: {
unitType: string;
unitId: string;
phase: string;
startedAt: string;
ageMs: number;
progressAgeMs: number;
lastProgressAt?: string;
};
sessionPointer?: {
sessionId?: string;
sessionFile?: string;
source: "auto.lock" | "runtime-unit";
};
recommendations: string[];
recommendedAction: string;
warnings: string[];
childProcesses: Array<{
pid: number;
ppid: number;
cmd: string;
classification: "active-session" | "warmup" | "orphan" | "unknown";
classification: FlowAuditChildClassification;
ageMs?: number;
nonBlocking: boolean;
overBudget: boolean;
action: FlowAuditChildAction;
killed?: boolean;
killError?: string;
}>;
lastErrors: string[];
staleDispatchedUnits: Array<{
unitType: string;
unitId: string;
phase: string;
progressAgeMs: number;
lastProgressAt?: string;
}>;
runawayHistory: string[];
loopEvidence?: {
milestoneId: string;
sliceId?: string;
taskId?: string;
completedPriorTasks: string[];
missingSummaries: string[];
};
repeatedFailureRollup?: {
filed: boolean;
milestoneId: string;
count: number;
entryId?: string;
};
}
// ─── Flow Audit Implementation ────────────────────────────────────────────
const DEFAULT_STALE_PROGRESS_MS = 20 * 60 * 1000;
const DEFAULT_OPTIONAL_CHILD_BUDGET_MS = 30 * 60 * 1000;
const REPEATED_FAILURE_THRESHOLD = 3;
const FLOW_AUDIT_ROLLUP_KIND = "flow-audit:repeated-milestone-failure";
interface AutoLockAuditRecord {
pid?: number;
unitType?: string;
unitId?: string;
startedAt?: string | number;
phase?: string;
sessionId?: string;
sessionFile?: string;
}
interface RuntimeUnitAuditRecord {
unitType?: string;
unitId?: string;
phase?: string;
startedAt?: number | string;
updatedAt?: number | string;
lastProgressAt?: number | string;
lastProgressKind?: string;
progressCount?: number;
sessionId?: string;
sessionFile?: string;
runawayGuardPause?: {
reason?: string;
unitType?: string;
unitId?: string;
pausedAt?: number;
};
}
interface PsAuditRow {
pid: number;
ppid: number;
ageMs?: number;
cmd: string;
}
/**
 * Normalize a persisted timestamp (epoch seconds, epoch millis, or a date
 * string) into epoch milliseconds, returning the fallback when unusable.
 */
function parseEpochMs(value: unknown, fallbackMs: number): number {
  if (typeof value === "number" && Number.isFinite(value)) {
    // Heuristic: numbers below 10^10 are epoch seconds, not milliseconds.
    if (value < 10_000_000_000) return value * 1000;
    return value;
  }
  if (typeof value === "string") {
    const nonEmpty = value.trim().length > 0;
    if (nonEmpty) {
      const parsedMs = new Date(value).getTime();
      if (Number.isFinite(parsedMs)) return parsedMs;
    }
  }
  return fallbackMs;
}
/** Render epoch millis as an ISO-8601 string; undefined when absent/invalid. */
function formatIso(ms: number | undefined): string | undefined {
  if (ms === undefined) return undefined;
  if (!Number.isFinite(ms)) return undefined;
  return new Date(ms).toISOString();
}
/** Convert milliseconds to whole minutes, clamped at zero for negatives. */
function minutes(ms: number): number {
  const wholeMinutes = Math.round(ms / 60_000);
  return wholeMinutes > 0 ? wholeMinutes : 0;
}
/**
 * Read and parse a JSON file, returning null on any I/O or parse failure.
 * Audit reads must never throw — missing/corrupt runtime files are expected.
 */
function readJsonFile<T>(path: string): T | null {
  try {
    if (!existsSync(path)) return null;
    const raw = readFileSync(path, "utf8");
    return JSON.parse(raw) as T;
  } catch {
    return null;
  }
}
/**
 * Load every runtime unit record (*.json) from the units directory.
 * Best-effort: an unreadable directory or malformed file is skipped silently
 * because the audit must never fail on damaged runtime state.
 */
function readRuntimeUnits(runtimeUnitsDir: string): RuntimeUnitAuditRecord[] {
  const units: RuntimeUnitAuditRecord[] = [];
  if (!existsSync(runtimeUnitsDir)) return units;
  try {
    const jsonFiles = readdirSync(runtimeUnitsDir).filter((name) =>
      name.endsWith(".json"),
    );
    for (const name of jsonFiles) {
      const parsed = readJsonFile<RuntimeUnitAuditRecord>(
        join(runtimeUnitsDir, name),
      );
      if (parsed) units.push(parsed);
    }
  } catch {
    // Runtime audit must stay best-effort.
  }
  return units;
}
/**
 * Parse `ps -eo pid,ppid[,etimes],cmd` output into structured rows.
 * Lines that do not match the expected column layout are dropped.
 */
function parsePsOutput(psOutput: string): PsAuditRow[] {
  const rowPattern = /^(\d+)\s+(\d+)(?:\s+(\d+))?\s+(.+)$/;
  const parsed: PsAuditRow[] = [];
  for (const rawLine of psOutput.split("\n")) {
    const line = rawLine.trim();
    if (!line) continue;
    const columns = line.match(rowPattern);
    if (!columns) continue;
    const pid = Number.parseInt(columns[1], 10);
    const ppid = Number.parseInt(columns[2], 10);
    if (!Number.isFinite(pid) || !Number.isFinite(ppid)) continue;
    // The etimes column is optional: some ps invocations omit elapsed seconds.
    let ageMs: number | undefined;
    if (columns[3] !== undefined) {
      const elapsedSeconds = Number.parseInt(columns[3], 10);
      if (Number.isFinite(elapsedSeconds)) ageMs = elapsedSeconds * 1000;
    }
    parsed.push({ pid, ppid, ageMs, cmd: columns[4] });
  }
  return parsed;
}
/**
 * Obtain process rows: injected psOutput (tests) or a live `ps` sweep.
 * Windows and any ps failure yield an empty list rather than throwing.
 */
async function readPsRows(options: FlowAuditOptions): Promise<PsAuditRow[]> {
  const injected = options.psOutput;
  if (injected !== undefined) return parsePsOutput(injected);
  if (process.platform === "win32") return [];
  try {
    const { execSync } = await import("node:child_process");
    const output = execSync("ps -eo pid,ppid,etimes,cmd --no-headers", {
      encoding: "utf8",
      timeout: 5000,
    });
    return parsePsOutput(output);
  } catch {
    // No usable ps (or it timed out): audit proceeds without process data.
    return [];
  }
}
/**
 * Classify a process row by its command line.
 *
 * Order matters: warmup wins over everything, a PID-1-parented next-server is
 * an orphan before it can be called background, and only then do we look for
 * SF/agent session processes.
 */
function classifyProcess(row: PsAuditRow): FlowAuditChildClassification {
  const command = row.cmd.toLowerCase();
  const matchesAny = (needles: string[]): boolean =>
    needles.some((needle) => command.includes(needle));
  if (matchesAny(["sift", "warmup"])) return "warmup";
  if (row.ppid === 1 && command.includes("next-server")) return "orphan";
  if (matchesAny(["next-server", "vite", "turbopack"])) return "background";
  const looksLikeRuntime = matchesAny(["node", "sf-run", "codex"]);
  const looksLikeSession = matchesAny([
    " sf",
    "/sf",
    "dist/loader",
    "tool-session",
    "headless",
  ]);
  if (looksLikeRuntime && looksLikeSession) return "active-session";
  return "unknown";
}
/** Optional children (warmup/background/orphan) never block the flow. */
function isOptionalChild(
  classification: FlowAuditChildClassification,
): boolean {
  switch (classification) {
    case "warmup":
    case "background":
    case "orphan":
      return true;
    default:
      return false;
  }
}
/**
 * Decide whether a process row belongs in the audit report.
 * Classified processes always appear; unknowns appear only when they are the
 * active unit's own pid or a direct child of it.
 */
function shouldIncludeProcess(
  row: PsAuditRow,
  classification: FlowAuditChildClassification,
  activePid: number | undefined,
): boolean {
  if (classification !== "unknown") return true;
  if (activePid === undefined) return false;
  const relatedToActiveUnit = row.pid === activePid || row.ppid === activePid;
  return relatedToActiveUnit;
}
/**
 * Pull recent error-like messages from the runtime notifications log.
 * Scans only the last 20 JSONL rows; malformed rows and read failures are
 * ignored because the audit is best-effort.
 */
function readRecentErrors(runtimeRoot: string): string[] {
  const logPath = join(runtimeRoot, "notifications.jsonl");
  if (!existsSync(logPath)) return [];
  const collected: string[] = [];
  try {
    const rows = readFileSync(logPath, "utf8")
      .split("\n")
      .filter((row) => row.trim());
    for (const row of rows.slice(-20)) {
      try {
        const record = JSON.parse(row) as {
          severity?: string;
          message?: string;
          text?: string;
        };
        const text = record.message ?? record.text ?? "";
        const lowered = text.toLowerCase();
        const errorLike =
          record.severity === "error" ||
          lowered.includes("error") ||
          lowered.includes("failed");
        if (errorLike) collected.push(text || "Unknown error");
      } catch {
        // skip malformed notification rows
      }
    }
  } catch {
    // non-fatal
  }
  return collected;
}
/**
 * Gather loop evidence for a dispatched execute-task unit: which prior tasks
 * in the slice PLAN are already done, and which SUMMARY artifacts are missing.
 * Completed work next to missing summaries is the classic dispatch-loop sign.
 *
 * Returns undefined when the unit is not an execute-task, its id cannot be
 * parsed into milestone/slice/task, the PLAN is unavailable, or parsing fails.
 */
function buildLoopEvidence(
  basePath: string,
  unitType: string,
  unitId: string,
): FlowAuditResult["loopEvidence"] | undefined {
  if (unitType !== "execute-task") return undefined;
  const { milestone, slice, task } = parseUnitId(unitId);
  if (!milestone || !slice || !task) return undefined;
  const planPath = resolveSliceFile(basePath, milestone, slice, "PLAN");
  if (!planPath || !existsSync(planPath)) return undefined;
  const completedPriorTasks: string[] = [];
  const missingSummaries: string[] = [];
  try {
    const plan = parsePlan(readFileSync(planPath, "utf8"));
    const taskIndex = plan.tasks.findIndex((entry) => entry.id === task);
    // Collect tasks before the current one that are already marked done.
    if (taskIndex > 0) {
      for (const earlier of plan.tasks.slice(0, taskIndex)) {
        if (earlier.done) completedPriorTasks.push(earlier.id);
      }
    }
    if (!resolveTaskFile(basePath, milestone, slice, task, "SUMMARY")) {
      missingSummaries.push(`${milestone}/${slice}/${task} task SUMMARY`);
    }
    // A slice with every task done should also have a slice-level SUMMARY.
    const sliceComplete =
      plan.tasks.length > 0 && plan.tasks.every((entry) => entry.done);
    if (
      sliceComplete &&
      !resolveSliceFile(basePath, milestone, slice, "SUMMARY")
    ) {
      missingSummaries.push(`${milestone}/${slice} slice SUMMARY`);
    }
  } catch {
    return undefined;
  }
  return {
    milestoneId: milestone,
    sliceId: slice,
    taskId: task,
    completedPriorTasks,
    missingSummaries,
  };
}
/**
 * Merge runaway evidence from runtime unit pause markers and unresolved
 * self-feedback entries, scoped to the active milestone when one is known.
 * Result is deduplicated (order-preserving) and capped to the last 10 items.
 */
function collectRunawayHistory(
  runtimeUnits: RuntimeUnitAuditRecord[],
  feedback: PersistedSelfFeedbackEntry[],
  milestoneId: string | undefined,
): string[] {
  const collected: string[] = [];
  for (const unit of runtimeUnits) {
    const pause = unit.runawayGuardPause;
    if (!pause) continue;
    const pausedUnitId = pause.unitId ?? unit.unitId ?? "unknown";
    // Scope to the active milestone when one is known.
    if (milestoneId && !pausedUnitId.startsWith(`${milestoneId}/`)) continue;
    collected.push(pause.reason ?? `Runaway guard paused ${pausedUnitId}`);
  }
  for (const entry of feedback) {
    if (entry.resolvedAt) continue;
    if (milestoneId && entry.occurredIn?.milestone !== milestoneId) continue;
    const mentionsRunaway =
      entry.kind.includes("runaway") ||
      entry.summary.toLowerCase().includes("runaway");
    if (mentionsRunaway) collected.push(`${entry.kind}: ${entry.summary}`);
  }
  return [...new Set(collected)].slice(-10);
}
/**
 * File (at most once) a rollup self-feedback item when a milestone accumulates
 * repeated unresolved flow failures.
 *
 * Dedup contract: if an open rollup already exists for the milestone, report
 * it instead of filing another. Returns undefined when below threshold, when
 * recording is disabled via options, or when persistence fails.
 */
function maybeRecordRepeatedFailureRollup(
  basePath: string,
  milestoneId: string | undefined,
  feedback: PersistedSelfFeedbackEntry[],
  options: FlowAuditOptions,
): FlowAuditResult["repeatedFailureRollup"] | undefined {
  if (!milestoneId || options.recordSelfFeedback === false) return undefined;
  const openForMilestone = (entry: PersistedSelfFeedbackEntry): boolean =>
    !entry.resolvedAt && entry.occurredIn?.milestone === milestoneId;
  const failures = feedback.filter(
    (entry) => openForMilestone(entry) && entry.kind !== FLOW_AUDIT_ROLLUP_KIND,
  );
  if (failures.length < REPEATED_FAILURE_THRESHOLD) return undefined;
  const existingRollup = feedback.find(
    (entry) => openForMilestone(entry) && entry.kind === FLOW_AUDIT_ROLLUP_KIND,
  );
  if (existingRollup) {
    return {
      filed: false,
      milestoneId,
      count: failures.length,
      entryId: existingRollup.id,
    };
  }
  // Cap evidence at the most recent 8 failures to keep the rollup readable.
  const evidence = failures
    .slice(-8)
    .map((entry) => {
      const location = [
        entry.occurredIn?.milestone,
        entry.occurredIn?.slice,
        entry.occurredIn?.task,
      ]
        .filter(Boolean)
        .join("/");
      return `[${entry.id}] ${entry.kind} ${location}: ${entry.summary}`;
    })
    .join("\n");
  const recorded = recordSelfFeedback(
    {
      kind: FLOW_AUDIT_ROLLUP_KIND,
      severity: "high",
      summary: `${failures.length} unresolved flow failures on ${milestoneId} need one recovery fix`,
      evidence,
      suggestedFix:
        "Fix the shared milestone-flow failure instead of filing one item per failed unit. Use the flow audit evidence to repair stale dispatch, missing summary, runaway, or child-process handling.",
      acceptanceCriteria:
        "AC1: flow audit reports the active milestone/unit and session pointer. AC2: stale dispatched unit with no progress is flagged. AC3: runaway history and child-process hang evidence are preserved. AC4: repeated same-milestone failures stay deduplicated into one open item.",
      source: "detector",
      occurredIn: { milestone: milestoneId, unitType: "flow-audit" },
    },
    basePath,
  );
  if (!recorded) return undefined;
  return {
    filed: true,
    milestoneId,
    count: failures.length,
    entryId: recorded.entry.id,
  };
}
/**
 * Pick the single most actionable recommendation from audit evidence.
 *
 * Priority: stale dispatched unit > over-budget optional child > recent
 * errors > idle milestone with no active unit > nothing to do.
 */
function chooseRecommendedAction(args: {
  activeUnit?: FlowAuditResult["activeUnit"];
  sessionPointer?: FlowAuditResult["sessionPointer"];
  staleDispatchedUnits: FlowAuditResult["staleDispatchedUnits"];
  childProcesses: FlowAuditResult["childProcesses"];
  lastErrors: string[];
  activeMilestone?: FlowAuditResult["activeMilestone"];
}): string {
  const [staleUnit] = args.staleDispatchedUnits;
  if (staleUnit) {
    // Prefer the session file over the id so operators can open it directly.
    const pointer = args.sessionPointer;
    let session = "";
    if (pointer?.sessionFile) session = ` ${pointer.sessionFile}`;
    else if (pointer?.sessionId) session = ` ${pointer.sessionId}`;
    return `Inspect session${session} for ${staleUnit.unitType} ${staleUnit.unitId}; if no new output exists, stop/requeue the stale dispatched unit before continuing.`;
  }
  const hungOptionalChild = args.childProcesses.find(
    (child) => child.nonBlocking && child.overBudget,
  );
  if (hungOptionalChild) {
    return `Optional ${hungOptionalChild.classification} child pid ${hungOptionalChild.pid} is over budget; it is non-blocking, or rerun with --kill-children to terminate it.`;
  }
  if (args.lastErrors.length > 0) {
    return "Review recent errors before dispatching another unit.";
  }
  if (args.activeMilestone && !args.activeUnit) {
    return `Dispatch or resume the next unit for ${args.activeMilestone.id}.`;
  }
  return "No flow-auditor action needed.";
}
/**
* Run a flow audit: inspect active unit state, auto.lock, runtime artifacts,
* and child processes to diagnose stuck milestones without human forensic work.
@ -86,165 +538,228 @@ export interface FlowAuditResult {
* milestone/unit, progress age, session pointer, child processes, last errors,
* and recommended action.
*
* Consumer: `/sf doctor flow` command.
* Consumer: `/sf doctor flow` command and session_start startup health sweep.
*/
export async function runFlowAudit(basePath: string): Promise<FlowAuditResult> {
export async function runFlowAudit(
basePath: string,
options: FlowAuditOptions = {},
): Promise<FlowAuditResult> {
const nowMs = options.nowMs ?? Date.now();
const staleProgressMs = options.staleProgressMs ?? DEFAULT_STALE_PROGRESS_MS;
const optionalChildBudgetMs =
options.optionalChildBudgetMs ?? DEFAULT_OPTIONAL_CHILD_BUDGET_MS;
const runtimeRoot = sfRoot(basePath);
const warnings: string[] = [];
const recommendations: string[] = [];
const childProcesses: FlowAuditResult["childProcesses"] = [];
const lastErrors: string[] = [];
const lastErrors = readRecentErrors(runtimeRoot);
const staleDispatchedUnits: FlowAuditResult["staleDispatchedUnits"] = [];
let sessionPointer: FlowAuditResult["sessionPointer"] | undefined;
let activeMilestone: FlowAuditResult["activeMilestone"] | undefined;
// Read auto.lock for active unit info
const autoLockPath = join(basePath, ".sf", "auto.lock");
const autoLockPath = join(runtimeRoot, "auto.lock");
let activeUnit: FlowAuditResult["activeUnit"] | undefined;
if (existsSync(autoLockPath)) {
try {
const lockContent = readFileSync(autoLockPath, "utf8");
const lockData = JSON.parse(lockContent) as {
unitType?: string;
unitId?: string;
startedAt?: string;
phase?: string;
let activePid: number | undefined;
const lockData = readJsonFile<AutoLockAuditRecord>(autoLockPath);
if (lockData) {
if (lockData.unitType && lockData.unitId) {
const startedAtMs = parseEpochMs(lockData.startedAt, nowMs);
const parsed = parseUnitId(lockData.unitId);
activeMilestone = { id: parsed.milestone };
activePid =
typeof lockData.pid === "number" && Number.isFinite(lockData.pid)
? lockData.pid
: undefined;
activeUnit = {
unitType: lockData.unitType,
unitId: lockData.unitId,
phase: lockData.phase ?? "unknown",
startedAt: formatIso(startedAtMs) ?? new Date(nowMs).toISOString(),
ageMs: Math.max(0, nowMs - startedAtMs),
progressAgeMs: Math.max(0, nowMs - startedAtMs),
};
if (lockData.unitType && lockData.unitId) {
const startedAt = lockData.startedAt
? new Date(lockData.startedAt).getTime()
: Date.now();
const ageMs = Date.now() - startedAt;
activeUnit = {
unitType: lockData.unitType,
unitId: lockData.unitId,
phase: lockData.phase ?? "unknown",
startedAt: lockData.startedAt ?? new Date().toISOString(),
ageMs,
if (lockData.sessionId || lockData.sessionFile) {
sessionPointer = {
sessionId: lockData.sessionId,
sessionFile: lockData.sessionFile,
source: "auto.lock",
};
if (ageMs > 30 * 60 * 1000) {
warnings.push(
`Active unit ${lockData.unitId} has been running for ${Math.round(ageMs / 60000)} minutes.`,
);
recommendations.push(
`Consider checking if ${lockData.unitId} is stuck or making progress.`,
);
}
}
} catch {
warnings.push("Could not parse .sf/auto.lock");
}
} else if (existsSync(autoLockPath)) {
warnings.push("Could not parse .sf/auto.lock");
}
// Read runtime units directory
const runtimeUnitsDir = join(basePath, ".sf", "runtime", "units");
if (existsSync(runtimeUnitsDir)) {
try {
const files = readdirSync(runtimeUnitsDir);
let dispatchedCount = 0;
for (const file of files) {
if (!file.endsWith(".json")) continue;
try {
const content = readFileSync(
join(runtimeUnitsDir, file),
"utf8",
);
const unit = JSON.parse(content) as {
phase?: string;
unitType?: string;
unitId?: string;
};
if (unit.phase === "dispatched") dispatchedCount++;
} catch {
// skip malformed
}
}
if (dispatchedCount > 1) {
warnings.push(
`${dispatchedCount} units are in dispatched phase simultaneously.`,
);
}
} catch {
// ignore
}
}
// Read notifications for recent errors
const notificationsPath = join(basePath, ".sf", "notifications.jsonl");
if (existsSync(notificationsPath)) {
try {
const lines = readFileSync(notificationsPath, "utf8")
.split("\n")
.filter((l) => l.trim());
const recentLines = lines.slice(-20);
for (const line of recentLines) {
try {
const entry = JSON.parse(line) as {
severity?: string;
message?: string;
};
if (
entry.severity === "error" ||
entry.message?.toLowerCase().includes("error")
) {
lastErrors.push(entry.message ?? "Unknown error");
}
} catch {
// skip malformed
}
}
} catch {
// ignore
}
}
// Scan child processes (Linux/macOS only)
if (process.platform !== "win32") {
try {
const { execSync } = await import("node:child_process");
const psOutput = execSync("ps -eo pid,ppid,cmd --no-headers", {
encoding: "utf8",
timeout: 5000,
const runtimeUnits = readRuntimeUnits(join(runtimeRoot, "runtime", "units"));
let dispatchedCount = 0;
for (const unit of runtimeUnits) {
if (unit.phase === "dispatched") dispatchedCount++;
if (!unit.unitType || !unit.unitId) continue;
const progressBaseMs = parseEpochMs(
unit.lastProgressAt ?? unit.updatedAt ?? unit.startedAt,
nowMs,
);
const progressAgeMs = Math.max(0, nowMs - progressBaseMs);
const lastProgressAt = formatIso(progressBaseMs);
const stale =
unit.phase === "dispatched" && progressAgeMs > staleProgressMs;
if (stale) {
staleDispatchedUnits.push({
unitType: unit.unitType,
unitId: unit.unitId,
phase: unit.phase ?? "unknown",
progressAgeMs,
lastProgressAt,
});
const lines = psOutput.split("\n").filter((l) => l.trim());
for (const line of lines) {
const parts = line.trim().split(/\s+/);
if (parts.length < 3) continue;
const pid = Number.parseInt(parts[0], 10);
const ppid = Number.parseInt(parts[1], 10);
const cmd = parts.slice(2).join(" ");
if (!Number.isFinite(pid)) continue;
// Classify processes
let classification: FlowAuditResult["childProcesses"][0]["classification"] = "unknown";
if (cmd.includes("sift") || cmd.includes("warmup")) {
classification = "warmup";
} else if (cmd.includes("node") && cmd.includes("sf")) {
classification = "active-session";
} else if (ppid === 1 && cmd.includes("next-server")) {
classification = "orphan";
}
childProcesses.push({ pid, cmd, classification });
warnings.push(
`Unit ${unit.unitId} has no progress for ${minutes(progressAgeMs)} minutes (phase=${unit.phase}).`,
);
}
if (
activeUnit &&
unit.unitType === activeUnit.unitType &&
unit.unitId === activeUnit.unitId
) {
activeUnit.phase = unit.phase ?? activeUnit.phase;
activeUnit.progressAgeMs = progressAgeMs;
activeUnit.lastProgressAt = lastProgressAt;
if (!sessionPointer && (unit.sessionId || unit.sessionFile)) {
sessionPointer = {
sessionId: unit.sessionId,
sessionFile: unit.sessionFile,
source: "runtime-unit",
};
}
} catch {
// ignore on platforms without ps
}
}
if (dispatchedCount > 1) {
warnings.push(
`${dispatchedCount} units are in dispatched phase simultaneously.`,
);
}
const psRows = await readPsRows(options);
for (const row of psRows) {
const classification = classifyProcess(row);
if (!shouldIncludeProcess(row, classification, activePid)) continue;
const nonBlocking = isOptionalChild(classification);
const overBudget =
nonBlocking &&
row.ageMs !== undefined &&
row.ageMs > optionalChildBudgetMs;
let action: FlowAuditChildAction = nonBlocking ? "non-blocking" : "observe";
let killed = false;
let killError: string | undefined;
if (overBudget) {
warnings.push(
`${classification} child pid ${row.pid} is over budget (${minutes(row.ageMs ?? 0)} minutes).`,
);
if (options.killOverBudgetChildren) {
action = "kill";
try {
if (options.killProcess) options.killProcess(row.pid);
else process.kill(row.pid, "SIGTERM");
killed = true;
} catch (err) {
killError = err instanceof Error ? err.message : String(err);
warnings.push(
`Failed to kill over-budget ${classification} child pid ${row.pid}: ${killError}`,
);
}
}
}
childProcesses.push({
pid: row.pid,
ppid: row.ppid,
cmd: row.cmd,
classification,
ageMs: row.ageMs,
nonBlocking,
overBudget,
action,
killed: killed || undefined,
killError,
});
}
// Derive state for milestone context
try {
const state = await deriveState(basePath);
if (state.activeMilestone) {
activeMilestone = {
id: state.activeMilestone.id,
title: state.activeMilestone.title,
phase: state.phase,
};
}
if (state.activeMilestone && !activeUnit) {
recommendations.push(
`No active unit detected, but milestone ${state.activeMilestone.id} is active. Consider dispatching the next unit.`,
);
}
} catch {
// ignore
// State derivation is useful context but not required for the audit.
}
const loopEvidence =
activeUnit &&
buildLoopEvidence(basePath, activeUnit.unitType, activeUnit.unitId);
if (
loopEvidence?.completedPriorTasks.length &&
loopEvidence.missingSummaries.length
) {
warnings.push(
`${loopEvidence.milestoneId}/${loopEvidence.sliceId} has ${loopEvidence.completedPriorTasks.length} completed prior tasks but missing final summary evidence for ${loopEvidence.missingSummaries.join(", ")}.`,
);
}
const feedback = readAllSelfFeedback(basePath);
const milestoneId = activeMilestone?.id;
const runawayHistory = collectRunawayHistory(
runtimeUnits,
feedback,
milestoneId,
);
const repeatedFailureRollup = maybeRecordRepeatedFailureRollup(
basePath,
milestoneId,
feedback,
options,
);
if (repeatedFailureRollup?.filed) {
recommendations.push(
`Filed ${FLOW_AUDIT_ROLLUP_KIND} for ${milestoneId} after ${repeatedFailureRollup.count} repeated failures.`,
);
}
const recommendedAction = chooseRecommendedAction({
activeUnit,
sessionPointer,
staleDispatchedUnits,
childProcesses,
lastErrors,
activeMilestone,
});
if (!recommendations.includes(recommendedAction)) {
recommendations.unshift(recommendedAction);
}
return {
ok: warnings.length === 0 && lastErrors.length === 0,
ok:
warnings.length === 0 &&
lastErrors.length === 0 &&
staleDispatchedUnits.length === 0,
activeMilestone,
activeUnit,
sessionPointer,
recommendations,
recommendedAction,
warnings,
childProcesses,
lastErrors,
staleDispatchedUnits,
runawayHistory,
loopEvidence,
repeatedFailureRollup,
};
}

View file

@ -15,7 +15,8 @@
"sf_summary_save",
"sf_requirement_update",
"sf_milestone_generate_id",
"sf_self_report"
"sf_self_report",
"sf_self_feedback_resolve"
],
"commands": ["sf", "kill", "worktree", "exit"],
"hooks": [
@ -25,6 +26,7 @@
"session_fork",
"before_agent_start",
"agent_end",
"turn_end",
"session_before_compact",
"session_shutdown",
"tool_call",

View file

@ -0,0 +1,179 @@
import type { Api, Model } from "@singularity-forge/pi-ai";
import type { ModelFailureRecord } from "./auto/session.js";
import { resolveModelId } from "./auto-model-selection.js";
import type { ResolvedModelConfig } from "./preferences.js";
export interface ModelRouteRef {
provider: string;
id: string;
}
export interface NextModelRouteResult {
model: Model<Api>;
route: string;
source: "configured" | "available";
}
/**
 * Build the stable identity key for a concrete provider route.
 *
 * Purpose: make fallback recovery compare full provider/model routes instead of
 * ambiguous bare model ids.
 *
 * Consumer: resolveNextConfiguredModelRoute() when skipping failed and current
 * runtime routes.
 */
export function modelRouteKey(route: ModelRouteRef): string {
  const provider = route.provider.toLowerCase();
  const modelId = route.id.toLowerCase();
  return [provider, modelId].join("/");
}
/**
 * Flatten primary + fallbacks into a unique, order-preserving route list.
 * Comparison is case-insensitive; the first spelling of a route wins.
 */
function dedupeConfiguredRoutes(modelConfig: ResolvedModelConfig): string[] {
  const unique = new Map<string, string>();
  for (const route of [modelConfig.primary, ...modelConfig.fallbacks]) {
    const key = route.toLowerCase();
    if (!unique.has(key)) unique.set(key, route);
  }
  return [...unique.values()];
}
/**
 * Resolve the next configured model route after a provider/model failure.
 *
 * Purpose: keep auto-mode recovery inside the user's explicit primary/fallback
 * chain, skip routes already failed for this unit, and avoid returning the same
 * provider/model again.
 *
 * Consumer: bootstrap/agent-end-recovery.ts when a provider returns quota,
 * rate-limit, server, stream, or connection failures during a unit.
 */
export function resolveNextConfiguredModelRoute(args: {
  current: ModelRouteRef | undefined;
  modelConfig: ResolvedModelConfig;
  availableModels: Model<Api>[];
  failedRoutes: readonly ModelFailureRecord[];
  isBlocked?: (model: Model<Api>) => boolean;
}): NextModelRouteResult | undefined {
  const currentKey = args.current ? modelRouteKey(args.current) : undefined;
  const failedKeys = new Set<string>();
  for (const failure of args.failedRoutes) {
    failedKeys.add(
      modelRouteKey({ provider: failure.provider, id: failure.modelId }),
    );
  }
  // Resolve every configured route against the live model list up front so
  // the scan below reasons about concrete provider/model identities.
  const resolvedRoutes = dedupeConfiguredRoutes(args.modelConfig).map(
    (configuredRoute) => ({
      configuredRoute,
      model: resolveModelId(
        configuredRoute,
        args.availableModels,
        args.current?.provider,
      ) as Model<Api> | undefined,
    }),
  );
  // Start scanning after the current route when it appears in the chain;
  // otherwise consider the whole chain.
  let startIndex = 0;
  if (currentKey !== undefined) {
    const position = resolvedRoutes.findIndex(
      (entry) => entry.model && modelRouteKey(entry.model) === currentKey,
    );
    if (position >= 0) startIndex = position + 1;
  }
  for (const candidate of resolvedRoutes.slice(startIndex)) {
    if (!candidate.model) continue;
    const candidateKey = modelRouteKey(candidate.model);
    if (candidateKey === currentKey) continue;
    if (failedKeys.has(candidateKey)) continue;
    if (args.isBlocked?.(candidate.model)) continue;
    return {
      model: candidate.model,
      route: candidate.configuredRoute,
      source: "configured",
    };
  }
  return undefined;
}
/**
 * Resolve another currently available provider/model route when configured
 * fallbacks are missing or exhausted.
 *
 * Purpose: keep auto-mode moving on quota/rate-limit/server failures instead
 * of pausing just because the configured fallback chain did not cover every
 * live provider route.
 *
 * Consumer: bootstrap/agent-end-recovery.ts after configured fallback lookup
 * fails for a model-route failure.
 */
export function resolveNextAvailableModelRoute(args: {
  current: ModelRouteRef | undefined;
  availableModels: Model<Api>[];
  failedRoutes: readonly ModelFailureRecord[];
  isBlocked?: (model: Model<Api>) => boolean;
}): NextModelRouteResult | undefined {
  const currentKey = args.current ? modelRouteKey(args.current) : undefined;
  const failedKeys = new Set(
    args.failedRoutes.map((failure) =>
      modelRouteKey({ provider: failure.provider, id: failure.modelId }),
    ),
  );
  const usable = (model: Model<Api>): boolean => {
    const key = modelRouteKey(model);
    if (key === currentKey) return false;
    if (failedKeys.has(key)) return false;
    return !args.isBlocked?.(model);
  };
  const candidates = args.availableModels.filter(usable);
  if (candidates.length === 0) return undefined;
  // Prefer a different provider than the one that just failed, when possible.
  let chosen = candidates[0];
  if (args.current) {
    const failedProvider = args.current.provider.toLowerCase();
    const crossProvider = candidates.find(
      (model) => model.provider.toLowerCase() !== failedProvider,
    );
    if (crossProvider) chosen = crossProvider;
  }
  return {
    model: chosen,
    route: `${chosen.provider}/${chosen.id}`,
    source: "available",
  };
}
/**
 * Resolve the next model route by trying configured policy first, then any
 * other live route.
 *
 * Purpose: preserve configured fallback ordering when it exists while still
 * enforcing the no-pause contract for transient provider/model failures.
 *
 * Consumer: bootstrap/agent-end-recovery.ts during provider-route recovery.
 */
export function resolveNextModelRoute(args: {
  current: ModelRouteRef | undefined;
  modelConfig: ResolvedModelConfig | undefined;
  availableModels: Model<Api>[];
  failedRoutes: readonly ModelFailureRecord[];
  isBlocked?: (model: Model<Api>) => boolean;
}): NextModelRouteResult | undefined {
  // Configured fallback ordering wins whenever a config is present and still
  // has an unfailed, unblocked route left.
  const configured = args.modelConfig
    ? resolveNextConfiguredModelRoute({
        current: args.current,
        modelConfig: args.modelConfig,
        availableModels: args.availableModels,
        failedRoutes: args.failedRoutes,
        isBlocked: args.isBlocked,
      })
    : undefined;
  if (configured) return configured;
  // Otherwise take any other live route rather than pausing.
  return resolveNextAvailableModelRoute({
    current: args.current,
    availableModels: args.availableModels,
    failedRoutes: args.failedRoutes,
    isBlocked: args.isBlocked,
  });
}

View file

@ -21,6 +21,7 @@ import type {
ExtensionAPI,
ExtensionContext,
} from "@singularity-forge/pi-coding-agent";
import { getErrorMessage } from "./error-utils.js";
import { sfRuntimeRoot } from "./paths.js";
import type { PersistedSelfFeedbackEntry } from "./self-feedback.js";
import {
@ -33,6 +34,7 @@ const CLAIM_TTL_MS = 30 * 60 * 1000;
// Persisted claim that marks which self-feedback entries an inline-fix
// dispatch currently owns, so concurrent idle checks do not double-dispatch.
interface InlineFixClaim {
  // Self-feedback entry ids covered by this dispatch.
  ids: string[];
  // ISO timestamp used with CLAIM_TTL_MS to decide when the claim expires.
  dispatchedAt: string;
  // Set when the last dispatch attempt failed; presumably surfaced for
  // diagnostics on retry — TODO confirm against readers of the claim file.
  lastDispatchError?: string;
}
function claimPath(basePath: string): string {
@ -63,6 +65,28 @@ function writeClaim(basePath: string, ids: string[]): void {
);
}
/**
 * Persist a claim for a failed inline-fix dispatch.
 *
 * The claim is backdated past CLAIM_TTL_MS so the next idle check treats it
 * as already expired and retries immediately, while lastDispatchError keeps
 * the failure reason on disk for diagnostics.
 */
function writeFailedClaim(
  basePath: string,
  ids: string[],
  error: string,
): void {
  const expiredDispatchedAt = new Date(
    Date.now() - CLAIM_TTL_MS - 1,
  ).toISOString();
  const claim = {
    ids,
    dispatchedAt: expiredDispatchedAt,
    lastDispatchError: error,
  };
  const target = claimPath(basePath);
  mkdirSync(dirname(target), { recursive: true });
  writeFileSync(target, JSON.stringify(claim, null, 2), "utf-8");
}
function clearClaim(basePath: string): void {
try {
unlinkSync(claimPath(basePath));
@ -147,10 +171,10 @@ function buildInlineFixPrompt(entries: PersistedSelfFeedbackEntry[]): string {
)
.join("\n\n");
return [
"You are executing SF self-feedback inline-fix mode.",
"",
"These high/critical self-feedback entries are unresolved sf defects. Do not only triage them; repair the current codebase directly.",
return [
"You are executing SF self-feedback inline-fix mode.",
"",
"These high/critical self-feedback entries are unresolved sf defects. Do not only triage them; repair the current codebase directly.",
"",
rendered,
"",
@ -159,8 +183,9 @@ function buildInlineFixPrompt(entries: PersistedSelfFeedbackEntry[]): string {
"2. Fix the smallest coherent set of code/docs/tests needed to satisfy the acceptance criteria.",
"3. Run focused verification and typecheck for touched areas.",
"4. Commit the fix with a conventional commit message.",
"5. Mark the repaired entries resolved in `.sf/self-feedback.jsonl` with agent-fix evidence and the commit SHA.",
"6. If an entry is already fixed, mark it resolved with agent-fix evidence and explain the verification.",
"5. Call `sf_self_feedback_resolve` for each repaired entry with agent-fix evidence and the commit SHA.",
"6. If an entry is already fixed, verify it and call `sf_self_feedback_resolve` with the verification evidence.",
"7. Do not hand-edit `.sf/self-feedback.jsonl`; use the resolver tool so markdown, JSONL, and reload detection stay consistent.",
"",
"When done, say: Self-feedback inline fix complete.",
].join("\n");
@ -195,17 +220,25 @@ export function dispatchSelfFeedbackInlineFixIfNeeded(
writeClaim(basePath, ids);
const prompt = buildInlineFixPrompt(candidates);
ctx.ui.notify(
`Dispatching self-feedback inline fix for ${ids.length} high/critical entr${ids.length === 1 ? "y" : "ies"}.`,
`Queueing self-feedback inline fix for ${ids.length} high/critical entr${ids.length === 1 ? "y" : "ies"}.`,
"warning",
);
pi.sendMessage(
const dispatch = pi.sendMessage(
{
customType: "sf-self-feedback-inline-fix",
content: prompt,
display: false,
},
{ triggerTurn: true },
{ triggerTurn: true, deliverAs: "followUp" },
);
void Promise.resolve(dispatch).catch((error) => {
const message = getErrorMessage(error);
writeFailedClaim(basePath, ids, message);
ctx.ui.notify(
`Self-feedback inline fix dispatch failed; will retry at the next idle point: ${message}`,
"warning",
);
});
return candidates.length;
}

View file

@ -0,0 +1,171 @@
import assert from "node:assert/strict";
import { execFileSync } from "node:child_process";
import {
appendFileSync,
existsSync,
mkdirSync,
mkdtempSync,
readFileSync,
realpathSync,
rmSync,
symlinkSync,
writeFileSync,
} from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, test } from "vitest";
import { handleHarness } from "../commands-harness.ts";
import { profileRepository } from "../repo-profiler.ts";
import {
closeDatabase,
getRepoFileObservations,
openDatabase,
recordRepoProfile,
} from "../sf-db.ts";
// Snapshot process-global state so each test can chdir and set
// SF_PROJECT_ROOT freely; afterEach restores both and removes temp roots.
const originalCwd = process.cwd();
const originalProjectRoot = process.env.SF_PROJECT_ROOT;
let roots: string[] = [];
afterEach(() => {
  process.chdir(originalCwd);
  closeDatabase();
  for (const root of roots) rmSync(root, { recursive: true, force: true });
  roots = [];
  // Restore (or clear) SF_PROJECT_ROOT exactly as it was before the test.
  if (originalProjectRoot === undefined) delete process.env.SF_PROJECT_ROOT;
  else process.env.SF_PROJECT_ROOT = originalProjectRoot;
});
/** Run a git command in `cwd` and return its trimmed stdout; throws on failure. */
function runGit(args: string[], cwd: string): string {
  const stdout = execFileSync("git", args, {
    cwd,
    encoding: "utf-8",
    stdio: ["ignore", "pipe", "pipe"],
  });
  return stdout.trim();
}
/**
 * Create a throwaway git repo with one commit (README + package.json) and
 * register it for afterEach cleanup. Returns the realpath of the repo root.
 */
function makeRepo(prefix: string): string {
  const repoRoot = realpathSync(mkdtempSync(join(tmpdir(), prefix)));
  roots.push(repoRoot);
  runGit(["init", "-b", "main"], repoRoot);
  runGit(["config", "user.email", "test@example.com"], repoRoot);
  runGit(["config", "user.name", "SF Test"], repoRoot);
  // Minimal committed baseline: a README plus a package.json test script.
  writeFileSync(join(repoRoot, "README.md"), "# Repo\n", "utf8");
  writeFileSync(
    join(repoRoot, "package.json"),
    '{"scripts":{"test":"node --test"}}\n',
    "utf8",
  );
  runGit(["add", "README.md", "package.json"], repoRoot);
  runGit(["commit", "-m", "init"], repoRoot);
  return repoRoot;
}
// Replace the repo's .sf directory with a symlink to an external state dir,
// mirroring setups where runtime state lives outside the working tree.
// NOTE(review): the "junction" symlink type only has effect on Windows and is
// ignored on POSIX — confirm that is intended for cross-platform test runs.
function makeExternalSfState(repo: string): string {
  const externalState = realpathSync(mkdtempSync(join(tmpdir(), "sf-state-")));
  roots.push(externalState);
  symlinkSync(externalState, join(repo, ".sf"), "junction");
  // Hide .sf from git status via the local exclude file rather than the
  // tracked .gitignore, so the repo's committed content stays untouched.
  appendFileSync(join(repo, ".git", "info", "exclude"), "\n.sf\n", "utf8");
  return externalState;
}
/**
 * Build a minimal UI context stub that records every notify() call so tests
 * can assert on the messages and severity levels the code under test emits.
 */
function makeMockCtx(): {
  notifications: Array<{ message: string; level?: string }>;
  ui: { notify(message: string, level?: string): void };
} {
  const notifications: Array<{ message: string; level?: string }> = [];
  const ui = {
    notify: (message: string, level?: string): void => {
      notifications.push({ message, level });
    },
  };
  return { notifications, ui };
}
test("harnessPromote_when_sf_is_external_symlink_writes_tracked_docs_not_runtime_target", async () => {
  // Arrange: repo whose .sf runtime state lives behind an external symlink,
  // with a recorded profile in the sf database for a fixed timestamp.
  const repo = makeRepo("sf-harness-promote-");
  const externalState = makeExternalSfState(repo);
  mkdirSync(join(repo, "notes"), { recursive: true });
  writeFileSync(join(repo, "notes", "local-finding.md"), "# Finding\n", "utf8");
  closeDatabase();
  assert.equal(openDatabase(join(repo, ".sf", "sf.db")), true);
  recordRepoProfile(
    profileRepository(repo, {
      now: () => "2026-05-02T10:00:00.000Z",
    }),
  );
  closeDatabase();
  delete process.env.SF_PROJECT_ROOT;
  process.chdir(repo);
  const ctx = makeMockCtx();
  // Act: promote a finding; the artifact must land in the repo's docs tree.
  await handleHarness("promote sf-moocr4rv-au7r3l", ctx as any);
  const relativeArtifact =
    "docs/exec-plans/active/harness-promotion-sf-moocr4rv-au7r3l.md";
  const artifact = join(repo, relativeArtifact);
  assert.ok(existsSync(artifact), "promotion writes a repo docs artifact");
  // The symlinked .sf target must not receive the promoted artifact.
  assert.ok(
    !existsSync(join(externalState, relativeArtifact)),
    "promotion must not write into the external .sf symlink target",
  );
  assert.equal(
    runGit(["status", "--short", "--", relativeArtifact], repo),
    `?? ${relativeArtifact}`,
    "promoted docs artifact is visible to git as repo output",
  );
  const firstContent = readFileSync(artifact, "utf8");
  // Promoting the same finding again must be idempotent (same bytes).
  await handleHarness("promote sf-moocr4rv-au7r3l", ctx as any);
  assert.equal(
    readFileSync(artifact, "utf8"),
    firstContent,
    "promotion content is deterministic for the same recorded profile",
  );
  assert.doesNotMatch(firstContent, /Promoted at:/);
  assert.match(
    firstContent,
    /Unpromoted \.sf runtime observations remain `observed_only`/,
  );
  assert.match(firstContent, /"ownership": "observed_only"/);
  assert.match(
    firstContent,
    new RegExp(`Repo artifact: \`${relativeArtifact}\``),
  );
  // The user-facing notification restates the observation boundary.
  assert.match(
    ctx.notifications.at(-1)?.message ?? "",
    /Unpromoted \.sf runtime state remains observed_only/,
  );
});
test("harnessProfile_when_recording_runtime_state_reports_no_repo_artifact", async () => {
  // Arrange: repo with external .sf state and one untracked scratch note.
  const repo = makeRepo("sf-harness-profile-");
  makeExternalSfState(repo);
  mkdirSync(join(repo, "notes"), { recursive: true });
  writeFileSync(join(repo, "notes", "scratch.md"), "# Scratch\n", "utf8");
  delete process.env.SF_PROJECT_ROOT;
  process.chdir(repo);
  const ctx = makeMockCtx();
  // Act: profiling records observations only; it must not emit repo docs.
  await handleHarness("profile", ctx as any);
  const observations = getRepoFileObservations();
  const scratch = observations.find((obs) => obs.path === "notes/scratch.md");
  assert.equal(scratch?.ownership, "observed_only");
  assert.ok(
    !existsSync(join(repo, "docs", "exec-plans", "active")),
    "profile does not create repo-committable docs output",
  );
  // The first notification states the boundary and points at promote.
  const notice = ctx.notifications[0]?.message ?? "";
  assert.match(notice, /Runtime observation boundary:/);
  assert.match(notice, /No repo-committable artifact was written/);
  assert.match(notice, /\/sf harness promote <finding-id>/);
  assert.doesNotMatch(notice, /tracked documentation artifact created/);
});

View file

@ -0,0 +1,15 @@
import assert from "node:assert/strict";
import { test } from "vitest";
import { classifyError } from "../error-classifier.ts";
// A quota message with a "reset after Ns" hint must classify as a rate-limit
// and carry the parsed delay in milliseconds.
test("quota_reset_after_seconds_is_rate_limit_with_retry_delay", () => {
  const message =
    "You have exhausted your capacity on this model. Your quota will reset after 33s.";
  const classified = classifyError(message);
  assert.equal(classified.kind, "rate-limit");
  // Narrow on the discriminant before reading the kind-specific field.
  if (classified.kind !== "rate-limit") return;
  assert.equal(classified.retryAfterMs, 33_000);
});

View file

@ -0,0 +1,255 @@
import assert from "node:assert/strict";
import {
mkdirSync,
mkdtempSync,
readFileSync,
rmSync,
writeFileSync,
} from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, describe, test } from "vitest";
import { runFlowAudit } from "../doctor.ts";
import { readAllSelfFeedback, recordSelfFeedback } from "../self-feedback.ts";
// Temp project roots created by the tests; removed after each test so the
// suite leaves no residue in the OS temp directory.
const roots: string[] = [];
afterEach(() => {
  for (const root of roots) rmSync(root, { recursive: true, force: true });
  roots.length = 0;
});
function makeForgeProject(): string {
const root = mkdtempSync(join(tmpdir(), "sf-flow-audit-"));
roots.push(root);
mkdirSync(join(root, ".sf"), { recursive: true });
writeFileSync(
join(root, "package.json"),
JSON.stringify({ name: "singularity-forge", version: "0.0.1" }),
"utf-8",
);
return root;
}
// Write a full on-disk fixture that reproduces a stuck M007 auto-mode loop:
// a dispatched unit with stale progress (31 min old), a prior runaway-guard
// pause, nine completed tasks with summaries, one error notification, and
// three open self-feedback entries. `nowMs` anchors all relative timestamps.
function writeM007LoopFixture(root: string, nowMs: number): void {
  const sf = join(root, ".sf");
  const unitId = "M007/S01/T10";
  // Unit started 45 min ago; last progress 31 min ago — past the 20 min
  // staleProgressMs threshold the audit test passes in.
  const startedAt = nowMs - 45 * 60 * 1000;
  const lastProgressAt = nowMs - 31 * 60 * 1000;
  const sliceDir = join(sf, "milestones", "M007", "slices", "S01");
  const tasksDir = join(sliceDir, "tasks");
  const unitsDir = join(sf, "runtime", "units");
  mkdirSync(tasksDir, { recursive: true });
  mkdirSync(unitsDir, { recursive: true });
  // auto.lock: the live dispatch pointer, including the session pointer the
  // audit is expected to surface.
  writeFileSync(
    join(sf, "auto.lock"),
    JSON.stringify(
      {
        pid: 5000,
        unitType: "execute-task",
        unitId,
        phase: "dispatched",
        startedAt: new Date(startedAt).toISOString(),
        sessionId: "sess-m007",
        sessionFile: "/tmp/sessions/m007.jsonl",
      },
      null,
      2,
    ),
    "utf-8",
  );
  // Unit state file: dispatched with no progress and an earlier
  // runaway-guard hard pause recorded for the previous task (T09).
  writeFileSync(
    join(unitsDir, "execute-task-M007-S01-T10.json"),
    JSON.stringify(
      {
        version: 1,
        unitType: "execute-task",
        unitId,
        startedAt,
        updatedAt: lastProgressAt,
        phase: "dispatched",
        wrapupWarningSent: false,
        continueHereFired: false,
        timeoutAt: null,
        lastProgressAt,
        progressCount: 0,
        lastProgressKind: "dispatch",
        runawayGuardPause: {
          reason: "Runaway guard paused execute-task M007/S01/T09",
          pausedAt: lastProgressAt - 60_000,
          unitType: "execute-task",
          unitId: "M007/S01/T09",
          diagnosticTurns: 2,
          warningsSent: 2,
          thresholdReasons: ["budget kept growing"],
          metrics: {
            toolCalls: 90,
            sessionTokens: 1_200_000,
            elapsedMs: 2_000_000,
            changedFiles: 0,
            worktreeChangedSinceStart: false,
            topTools: { read: 80, bash: 10 },
          },
          thresholds: {
            toolCallWarning: 60,
            tokenWarning: 1_000_000,
            elapsedMs: 1_200_000,
            changedFilesWarning: 75,
            minIntervalMs: 120_000,
          },
        },
      },
      null,
      2,
    ),
    "utf-8",
  );
  // Plan checklist: T01-T09 checked with summary files, T10 open with no
  // summary — the loop evidence the audit should report.
  const taskLines: string[] = [];
  for (let i = 1; i <= 10; i++) {
    const id = `T${String(i).padStart(2, "0")}`;
    taskLines.push(
      `- [${i < 10 ? "x" : " "}] **${id}: Task ${i}** \`est:10m\``,
    );
    if (i < 10) {
      writeFileSync(
        join(tasksDir, `${id}-SUMMARY.md`),
        `# ${id} summary\n\nDone.\n`,
        "utf-8",
      );
    }
  }
  writeFileSync(
    join(sliceDir, "S01-PLAN.md"),
    `# S01: Loop Evidence\n\n## Tasks\n\n${taskLines.join("\n")}\n`,
    "utf-8",
  );
  // One recent error notification the audit should surface as a last error.
  writeFileSync(
    join(sf, "notifications.jsonl"),
    JSON.stringify({
      severity: "error",
      message: "session creation failed before final summary",
    }) + "\n",
    "utf-8",
  );
  // Open self-feedback entries for the last three tasks, feeding the
  // repeated-milestone-failure rollup check.
  for (const task of ["T08", "T09", "T10"]) {
    recordSelfFeedback(
      {
        kind: "runaway-guard-hard-pause",
        severity: "medium",
        summary: `Runaway guard paused execute-task M007/S01/${task}`,
        evidence: `${task} had no final closure`,
        source: "detector",
        occurredIn: {
          milestone: "M007",
          slice: "S01",
          task,
          unitType: "execute-task",
        },
      },
      root,
    );
  }
}
describe("flow audit", () => {
  test("audit_when_m007_loop_evidence_exists_reports_actionable_stale_flow", async () => {
    const root = makeForgeProject();
    const nowMs = Date.UTC(2026, 4, 2, 13, 45, 0);
    writeM007LoopFixture(root, nowMs);
    // psOutput fakes the process table: the auto driver (5000), a long-lived
    // warmup child (5100), and a fresh tool-session child (5200).
    const result = await runFlowAudit(root, {
      nowMs,
      psOutput:
        "5000 1 2700 node dist/loader.js sf headless auto\n" +
        "5100 5000 2400 sift search --json --strategy page-index-hybrid warmup\n" +
        "5200 5000 120 node dist/loader.js sf tool-session\n",
      staleProgressMs: 20 * 60 * 1000,
      optionalChildBudgetMs: 30 * 60 * 1000,
    });
    assert.equal(result.ok, false);
    assert.equal(result.activeMilestone?.id, "M007");
    assert.equal(result.activeUnit?.unitId, "M007/S01/T10");
    // Fixture pinned last progress at exactly 31 minutes before nowMs.
    assert.equal(result.activeUnit?.progressAgeMs, 31 * 60 * 1000);
    assert.equal(result.sessionPointer?.sessionId, "sess-m007");
    assert.equal(
      result.sessionPointer?.sessionFile,
      "/tmp/sessions/m007.jsonl",
    );
    assert.equal(result.staleDispatchedUnits.length, 1);
    assert.match(result.warnings.join("\n"), /no progress for 31 minutes/);
    assert.deepEqual(result.loopEvidence?.completedPriorTasks.slice(-2), [
      "T08",
      "T09",
    ]);
    assert.match(result.loopEvidence?.missingSummaries.join("\n") ?? "", /T10/);
    assert.match(result.lastErrors.join("\n"), /session creation failed/);
    assert.match(result.runawayHistory.join("\n"), /M007\/S01\/T09/);
    assert.match(result.recommendedAction, /Inspect session/);
    // The warmup child is over its 30-minute budget but classified as
    // non-blocking, so the audit reports it without killing it.
    const warmup = result.childProcesses.find((p) => p.pid === 5100);
    assert.ok(warmup, "warmup child should be reported");
    assert.equal(warmup.classification, "warmup");
    assert.equal(warmup.nonBlocking, true);
    assert.equal(warmup.overBudget, true);
    assert.equal(warmup.action, "non-blocking");
    const active = result.childProcesses.find((p) => p.pid === 5200);
    assert.ok(active, "active tool child should be reported");
    assert.equal(active.classification, "active-session");
    assert.equal(active.nonBlocking, false);
    // A single high-severity rollup entry is recorded for the milestone …
    const entries = readAllSelfFeedback(root);
    const rollups = entries.filter(
      (e) =>
        e.kind === "flow-audit:repeated-milestone-failure" && !e.resolvedAt,
    );
    assert.equal(rollups.length, 1);
    assert.equal(rollups[0]?.severity, "high");
    assert.match(rollups[0]?.summary ?? "", /M007/);
    assert.match(rollups[0]?.acceptanceCriteria ?? "", /stale dispatched unit/);
    // … and a second audit run while it is open must not duplicate it.
    await runFlowAudit(root, { nowMs, psOutput: "" });
    assert.equal(
      readAllSelfFeedback(root).filter(
        (e) => e.kind === "flow-audit:repeated-milestone-failure",
      ).length,
      1,
      "same milestone rollup stays single while open",
    );
  });
  test("audit_when_optional_child_is_over_budget_can_kill_it_explicitly", async () => {
    const root = makeForgeProject();
    const killed: number[] = [];
    // With killOverBudgetChildren the injected killProcess must be invoked
    // for the over-budget optional child and the action recorded as "kill".
    const result = await runFlowAudit(root, {
      nowMs: Date.UTC(2026, 4, 2, 13, 45, 0),
      psOutput:
        "5100 5000 2400 sift search --json --strategy page-index-hybrid warmup\n",
      optionalChildBudgetMs: 60_000,
      killOverBudgetChildren: true,
      killProcess: (pid) => {
        killed.push(pid);
      },
    });
    assert.deepEqual(killed, [5100]);
    assert.equal(result.childProcesses[0]?.classification, "warmup");
    assert.equal(result.childProcesses[0]?.action, "kill");
    assert.equal(result.childProcesses[0]?.killed, true);
  });
  // Structural check: the session_start hook must wire in the flow auditor.
  test("session_start_when_registered_runs_flow_auditor", () => {
    const source = readFileSync(
      join(import.meta.dirname, "..", "bootstrap", "register-hooks.ts"),
      "utf-8",
    );
    assert.match(source, /pi\.on\("session_start"/);
    assert.match(source, /runFlowAudit/);
    assert.match(source, /Flow audit:/);
  });
});

View file

@ -0,0 +1,155 @@
import assert from "node:assert/strict";
import { describe, test } from "vitest";
import {
modelRouteKey,
resolveNextAvailableModelRoute,
resolveNextConfiguredModelRoute,
resolveNextModelRoute,
} from "../model-route-failure.ts";
// Shared fixture of live routes: the same model id appears under two
// providers (google-gemini-cli and google) so tests can verify that route
// identity is provider/model, not model id alone.
const models = [
  { provider: "google-gemini-cli", id: "gemini-3-flash-preview" },
  { provider: "google", id: "gemini-3-flash-preview" },
  { provider: "anthropic", id: "claude-sonnet-4-6" },
  { provider: "zai", id: "glm-5.1" },
] as any[];
describe("configured model route failure recovery", () => {
  test("quota_when_current_route_fails_returns_next_configured_fallback", () => {
    // Current route is the configured primary and has a recorded failure,
    // so recovery must advance to the first configured fallback.
    const next = resolveNextConfiguredModelRoute({
      current: {
        provider: "google-gemini-cli",
        id: "gemini-3-flash-preview",
      },
      modelConfig: {
        primary: "google-gemini-cli/gemini-3-flash-preview",
        fallbacks: ["anthropic/claude-sonnet-4-6", "zai/glm-5.1"],
      },
      availableModels: models,
      failedRoutes: [
        {
          unitType: "execute-task",
          unitId: "M001/S01/T01",
          provider: "google-gemini-cli",
          modelId: "gemini-3-flash-preview",
          reason: "quota reset after 33s",
          timestamp: 1,
        },
      ],
    });
    assert.equal(next?.model.provider, "anthropic");
    assert.equal(next?.model.id, "claude-sonnet-4-6");
  });
  test("current_model_not_in_config_starts_at_configured_primary", () => {
    // When the current route is outside the configured chain, the walk
    // starts at the configured primary rather than skipping it.
    const next = resolveNextConfiguredModelRoute({
      current: { provider: "google-gemini-cli", id: "gemini-3-flash-preview" },
      modelConfig: {
        primary: "anthropic/claude-sonnet-4-6",
        fallbacks: ["zai/glm-5.1"],
      },
      availableModels: models,
      failedRoutes: [],
    });
    assert.equal(next?.model.provider, "anthropic");
    assert.equal(next?.model.id, "claude-sonnet-4-6");
  });
  test("exhausted_chain_returns_undefined", () => {
    // Current route is the last configured entry and it already failed:
    // the configured resolver alone yields nothing.
    const next = resolveNextConfiguredModelRoute({
      current: { provider: "zai", id: "glm-5.1" },
      modelConfig: {
        primary: "anthropic/claude-sonnet-4-6",
        fallbacks: ["zai/glm-5.1"],
      },
      availableModels: models,
      failedRoutes: [
        {
          unitType: "execute-task",
          unitId: "M001/S01/T01",
          provider: "zai",
          modelId: "glm-5.1",
          reason: "server overloaded",
          timestamp: 1,
        },
      ],
    });
    assert.equal(next, undefined);
  });
  test("exhausted_configured_chain_uses_available_route_before_pause", () => {
    // Same exhausted chain, but the combined resolver must fall through to
    // an "available" route instead of returning undefined.
    const next = resolveNextModelRoute({
      current: { provider: "zai", id: "glm-5.1" },
      modelConfig: {
        primary: "anthropic/claude-sonnet-4-6",
        fallbacks: ["zai/glm-5.1"],
      },
      availableModels: models,
      failedRoutes: [
        {
          unitType: "execute-task",
          unitId: "M001/S01/T01",
          provider: "zai",
          modelId: "glm-5.1",
          reason: "server overloaded",
          timestamp: 1,
        },
      ],
    });
    assert.equal(next?.source, "available");
    assert.equal(next?.model.provider, "google-gemini-cli");
    assert.equal(next?.model.id, "gemini-3-flash-preview");
  });
  test("missing_config_uses_available_route_and_prefers_different_provider", () => {
    const next = resolveNextAvailableModelRoute({
      current: { provider: "google-gemini-cli", id: "gemini-3-flash-preview" },
      availableModels: models,
      failedRoutes: [
        {
          unitType: "execute-task",
          unitId: "M001/S01/T01",
          provider: "google-gemini-cli",
          modelId: "gemini-3-flash-preview",
          reason: "quota",
          timestamp: 1,
        },
      ],
    });
    assert.equal(next?.source, "available");
    assert.notEqual(next?.model.provider, "google-gemini-cli");
    assert.notEqual(
      modelRouteKey(next!.model),
      "google-gemini-cli/gemini-3-flash-preview",
    );
  });
  test("provider_model_identity_skips_only_the_failed_route", () => {
    // Same model id under a different provider is a distinct route and must
    // remain eligible when only the google-gemini-cli route failed.
    const next = resolveNextConfiguredModelRoute({
      current: { provider: "google-gemini-cli", id: "gemini-3-flash-preview" },
      modelConfig: {
        primary: "google-gemini-cli/gemini-3-flash-preview",
        fallbacks: ["google/gemini-3-flash-preview"],
      },
      availableModels: models,
      failedRoutes: [
        {
          unitType: "execute-task",
          unitId: "M001/S01/T01",
          provider: "google-gemini-cli",
          modelId: "gemini-3-flash-preview",
          reason: "quota",
          timestamp: 1,
        },
      ],
    });
    assert.equal(modelRouteKey(next!.model), "google/gemini-3-flash-preview");
  });
});

View file

@ -8,8 +8,8 @@
import assert from "node:assert/strict";
import { readFileSync } from "node:fs";
import { dirname, join } from "node:path";
import { test } from 'vitest';
import { fileURLToPath } from "node:url";
import { test } from "vitest";
import { resumeAutoAfterProviderDelay } from "../bootstrap/provider-error-resume.ts";
import {
classifyError,
@ -388,22 +388,18 @@ test("resumeAutoAfterProviderDelay restarts paused auto-mode from the recorded b
ui: { notify() {} },
newSession: async () => ({ cancelled: false }),
} as any;
const result = await resumeAutoAfterProviderDelay(
{} as any,
commandCtx,
{
getSnapshot: () => ({
active: false,
paused: true,
stepMode: true,
basePath: "/tmp/project",
}),
resetTransientRetryState: () => {},
startAuto: async (_ctx, _pi, base, verboseMode, options) => {
startCalls.push({ base, verboseMode, step: options?.step });
},
const result = await resumeAutoAfterProviderDelay({} as any, commandCtx, {
getSnapshot: () => ({
active: false,
paused: true,
stepMode: true,
basePath: "/tmp/project",
}),
resetTransientRetryState: () => {},
startAuto: async (_ctx, _pi, base, verboseMode, options) => {
startCalls.push({ base, verboseMode, step: options?.step });
},
);
});
assert.equal(result, "resumed");
assert.deepEqual(startCalls, [
@ -545,21 +541,21 @@ test("resumeAutoAfterProviderDelay leaves paused when no command context is avai
]);
});
// ── Escalating backoff for transient errors (#1166) ─────────────────────────
// ── Configured model-route recovery for provider failures ───────────────────
test("agent-end-recovery.ts tracks consecutive transient errors for escalating backoff", () => {
test("agent-end-recovery.ts records failed provider routes for configured fallback", () => {
const src = readFileSync(
join(__dirname, "..", "bootstrap", "agent-end-recovery.ts"),
"utf-8",
);
assert.ok(
src.includes("consecutiveTransientCount"),
"agent-end-recovery.ts must track consecutiveTransientCount for escalating backoff (#1166)",
src.includes("recordCurrentModelFailure"),
"agent-end-recovery.ts must record failed provider/model routes before resolving fallbacks",
);
assert.ok(
src.includes("MAX_TRANSIENT_AUTO_RESUMES"),
"agent-end-recovery.ts must define MAX_TRANSIENT_AUTO_RESUMES to cap infinite retries (#1166)",
src.includes("getCurrentUnitModelFailures"),
"agent-end-recovery.ts must skip routes already failed for the current unit",
);
});
@ -576,34 +572,35 @@ test("agent-end-recovery.ts resets retry state before resolveAgentEnd on success
);
});
test("agent-end-recovery.ts applies escalating delay for repeated transient errors", () => {
test("agent-end-recovery.ts does not sleep or same-route retry model-route failures", () => {
const src = readFileSync(
join(__dirname, "..", "bootstrap", "agent-end-recovery.ts"),
"utf-8",
);
// Must contain the exponential backoff formula (may span multiple lines)
assert.ok(
src.includes("2 ** Math.max(0, retryState.consecutiveTransientCount"),
"agent-end-recovery.ts must escalate retryAfterMs exponentially for consecutive transient errors (#1166)",
!src.includes("pauseTransientWithBackoff"),
"model-route failures must not enter same-model transient backoff",
);
assert.ok(
!src.includes("resumeAutoAfterProviderDelay"),
"model-route failures must not schedule same-model auto-resume",
);
});
test("agent-end-recovery.ts resumes transient provider pauses through startAuto instead of a hidden prompt", () => {
test("agent-end-recovery.ts sends hidden continue after any successful fallback switch", () => {
const src = readFileSync(
join(__dirname, "..", "bootstrap", "agent-end-recovery.ts"),
"utf-8",
);
assert.ok(
src.includes("resumeAutoAfterProviderDelay"),
"agent-end-recovery.ts must resume paused auto-mode through resumeAutoAfterProviderDelay (#2813)",
src.includes('customType: "sf-auto-timeout-recovery"'),
"successful fallback switches should continue the active unit with a hidden message",
);
assert.ok(
!src.includes(
"Continue execution — provider error recovery delay elapsed.",
),
"transient provider resume must not rely on a hidden continue prompt (#2813)",
src.includes("configured fallback") && src.includes("available fallback"),
"hidden continue must be tied to a successful model switch, whether configured or available",
);
});
@ -613,8 +610,9 @@ test("agent-end-recovery.ts does not defer rate-limit errors to core retry handl
"utf-8",
);
assert.ok(
src.includes('if (isTransient(cls) && cls.kind !== "rate-limit")'),
"rate-limit errors must bypass transient core-retry deferral so fallback can execute (#4373)",
src.includes("isModelRouteFailure(cls)") &&
src.includes('cls.kind === "rate-limit"'),
"rate-limit errors must enter model-route recovery before pausing (#4373)",
);
});
@ -624,8 +622,8 @@ test("agent-end-recovery.ts updates dashboard dispatched model after fallback sw
"utf-8",
);
assert.ok(
src.includes("setCurrentDispatchedModelId"),
"agent-end-recovery.ts should update currentDispatchedModelId when recovery switches model",
src.includes("setCurrentUnitModel"),
"agent-end-recovery.ts should update current unit/dashboard model state when recovery switches model",
);
});
@ -704,19 +702,17 @@ test("phases.ts handles timeout session-creation failures with pause instead of
);
});
// ── Fix 3: MAX_TRANSIENT_AUTO_RESUMES raised to 8 ───────────────────────────
// ── Fix 3: same-route transient retry cap removed for route failures ────────
test("MAX_TRANSIENT_AUTO_RESUMES is at least 8 for sustained overload resilience", () => {
test("agent-end-recovery.ts does not keep a same-route transient resume cap", () => {
const src = readFileSync(
join(__dirname, "..", "bootstrap", "agent-end-recovery.ts"),
"utf-8",
);
const match = src.match(/MAX_TRANSIENT_AUTO_RESUMES\s*=\s*(\d+)/);
assert.ok(match, "MAX_TRANSIENT_AUTO_RESUMES must be defined");
const value = Number(match![1]);
assert.ok(
value >= 8,
`MAX_TRANSIENT_AUTO_RESUMES must be >= 8 for sustained overload resilience, got ${value}`,
!src.includes("MAX_TRANSIENT_AUTO_RESUMES"),
"provider route failures should switch explicit routes or pause, not count same-route resumes",
);
});

View file

@ -1,15 +1,16 @@
/**
* rate-limit-model-fallback.test.ts Regression test for #2770.
*
* Rate-limit errors enter the model fallback path before falling through
* to pause. This verifies the structural contract in agent-end-recovery.ts.
* Rate-limit errors enter model-route fallback before pausing.
* Recovery must switch to configured fallbacks first, then any other available
* route before pausing.
*/
import assert from "node:assert/strict";
import { readFileSync } from "node:fs";
import { dirname, join } from "node:path";
import { test } from 'vitest';
import { fileURLToPath } from "node:url";
import { test } from "vitest";
const __dirname = dirname(fileURLToPath(import.meta.url));
const RECOVERY_PATH = join(
@ -28,16 +29,10 @@ function getRecoverySource(): string {
test("rate-limit errors enter the model fallback branch alongside other transient errors", () => {
const src = getRecoverySource();
// The condition that gates model fallback must include rate-limit.
// Match the if-condition that contains both "rate-limit" and fallback-related kinds.
const fallbackConditionRe =
/if\s*\([^)]*cls\.kind\s*===\s*"rate-limit"[^)]*cls\.kind\s*===\s*"network"/;
const fallbackConditionReAlt =
/if\s*\([^)]*cls\.kind\s*===\s*"network"[^)]*cls\.kind\s*===\s*"rate-limit"/;
assert.ok(
fallbackConditionRe.test(src) || fallbackConditionReAlt.test(src),
"rate-limit must appear in the same if-condition as network/server for model fallback (#2770)",
src.includes('cls.kind === "rate-limit"') &&
src.includes("isModelRouteFailure(cls)"),
"rate-limit must enter the configured model-route failure path (#2770)",
);
});
@ -54,23 +49,50 @@ test("rate-limit errors are NOT short-circuited to pause before model fallback",
);
});
test("rate-limit errors fall through to pause if no fallback model is available", () => {
test("model fallback uses configured routes first then automatic available routes", () => {
const src = getRecoverySource();
// After the fallback block, the transient fallback pause must still fire for rate-limit.
// The isTransient check covers rate-limit (verified by error-classifier tests).
// Verify pauseTransientWithBackoff is called with isRateLimit derived from cls.kind.
assert.ok(
src.includes('cls.kind === "rate-limit"'),
'agent-end-recovery.ts must reference cls.kind === "rate-limit" for fallback and pause paths (#2770)',
src.includes("resolveNextModelRoute"),
"agent-end-recovery.ts must route through the configured-or-available route helper",
);
// The transient fallback pause must pass the isRateLimit flag correctly.
const pauseCallRe =
/pauseTransientWithBackoff\([^)]*cls\.kind\s*===\s*"rate-limit"/;
assert.ok(
pauseCallRe.test(src),
'pauseTransientWithBackoff must receive isRateLimit based on cls.kind === "rate-limit" (#2770)',
src.includes("autoBenchmark: true"),
"runtime recovery must allow benchmark-provided fallbacks when preferences do not pin the full chain",
);
assert.ok(
!src.includes("getAutoModeStartModel"),
"runtime recovery must not restore a session/system model as an inferred fallback",
);
});
test("rate-limit errors pause only when no configured_or_available fallback remains", () => {
const src = getRecoverySource();
assert.ok(
src.includes("available fallback"),
"exhausted configured fallback chain should try another available model before pausing",
);
assert.ok(
src.includes("no usable fallback model remains"),
"only complete fallback exhaustion should pause with a clear provider error",
);
assert.ok(
/isTransient:\s*false/.test(src),
"complete provider route exhaustion must not same-route auto-resume",
);
});
test("setModel failure advances to the next configured fallback", () => {
const src = getRecoverySource();
assert.ok(
src.includes('reason: "setModel failed during provider recovery"'),
"failed fallback routes should be recorded so the next configured route can be tried",
);
assert.ok(
/if\s*\(!ok\)\s*\{[\s\S]{0,300}continue;/.test(src),
"setModel failure should continue walking the configured fallback chain",
);
});

View file

@ -4,14 +4,12 @@ import { mkdirSync, rmSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { test } from "vitest";
import {
getAutoSession,
resetAutoSession,
} from "../auto/session.js";
import { getAutoSession } from "../auto/session.js";
import {
hasResearchTerminalTransition,
markResearchTerminalTransition,
} from "../auto.js";
import { registerHooks } from "../bootstrap/register-hooks.ts";
function makeTmpBase(): string {
const base = join(tmpdir(), `sf-research-terminal-${randomUUID()}`);
@ -96,6 +94,69 @@ test("research terminal transition blocks planning tools", async () => {
}
});
// After a terminal research summary is saved, any planning-tool call must be
// blocked in place — without sendMessage queueing an extra agent turn.
test("post_summary_planning_tool_attempt_is_blocked_without_followup_turn", async () => {
  // Arrange: put the auto session into an active research-slice unit.
  const session = getAutoSession();
  session.reset();
  session.active = true;
  session.currentUnit = {
    type: "research-slice",
    id: "M001/S01",
    startedAt: Date.now(),
  };
  const sentMessages: unknown[] = [];
  // Minimal plugin-interface stub: records handlers per event name and
  // captures any messages the hooks try to enqueue.
  const handlers = new Map<string, Array<(event: any, ctx?: any) => any>>();
  const pi = {
    on(event: string, handler: (event: any, ctx?: any) => any) {
      const existing = handlers.get(event) ?? [];
      existing.push(handler);
      handlers.set(event, existing);
    },
    sendMessage(message: unknown) {
      sentMessages.push(message);
    },
  } as any;
  registerHooks(pi);
  const toolResultHandlers = handlers.get("tool_result") ?? [];
  const toolCallHandlers = handlers.get("tool_call") ?? [];
  assert.ok(toolResultHandlers.length, "tool_result handler should register");
  assert.ok(toolCallHandlers.length, "tool_call handler should register");
  // Act 1: simulate the summary-save tool result that marks the research
  // unit's terminal transition.
  for (const handler of toolResultHandlers) {
    await handler({
      toolName: "sf_summary_save",
      content: [{ type: "text", text: "Saved RESEARCH" }],
      details: {
        terminal_transition: true,
        unit_type: "research",
      },
    });
  }
  assert.equal(hasResearchTerminalTransition(), true);
  // Act 2: attempt a planning tool after the terminal transition.
  const planningAttempt = {
    toolName: "sf_plan_milestone",
    input: {},
  };
  const results = [];
  for (const handler of toolCallHandlers) {
    results.push(await handler(planningAttempt));
  }
  // Assert: the call is blocked with a drift explanation naming the tool,
  // and no follow-up agent turn was enqueued by the block.
  const block = results.find((result) => result?.block === true);
  assert.ok(block, "post-summary planning attempt should be blocked");
  assert.match(block.reason, /Post-artifact drift/);
  assert.match(block.reason, /sf_plan_milestone/);
  assert.equal(
    sentMessages.length,
    0,
    "blocking the tool call must not enqueue another agent turn",
  );
  session.reset();
});
test("research terminal transition does not block non-planning tools", () => {
const session = getAutoSession();
// Reset to known state
@ -113,7 +174,7 @@ test("research terminal transition does not block non-planning tools", () => {
// Non-planning tools should not be blocked by the research terminal transition
// (the actual blocking logic only checks planning tools)
const nonPlanningTools = [
const _nonPlanningTools = [
"read",
"write",
"edit",

View file

@ -1,6 +1,12 @@
import assert from "node:assert/strict";
import { execFileSync } from "node:child_process";
import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs";
import {
mkdirSync,
mkdtempSync,
readFileSync,
rmSync,
writeFileSync,
} from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, describe, it } from "vitest";
@ -90,7 +96,7 @@ describe("self-feedback inline drain", () => {
root,
);
const messages: unknown[] = [];
const messages: Array<{ message: unknown; options: unknown }> = [];
const notifications: string[] = [];
const ctx = {
ui: {
@ -100,18 +106,72 @@ describe("self-feedback inline drain", () => {
},
} as any;
const pi = {
sendMessage(message: unknown, options: unknown) {
messages.push({ message, options });
},
} as any;
assert.equal(dispatchSelfFeedbackInlineFixIfNeeded(root, ctx, pi), 1);
assert.equal(dispatchSelfFeedbackInlineFixIfNeeded(root, ctx, pi), 0);
assert.equal(messages.length, 1);
assert.equal(notifications.length, 2);
assert.match(
JSON.stringify(messages[0]?.message),
/sf-self-feedback-inline-fix/,
);
assert.match(
JSON.stringify(messages[0]?.message),
/sf_self_feedback_resolve/,
);
assert.deepEqual(messages[0]?.options, {
triggerTurn: true,
deliverAs: "followUp",
});
assert.match(notifications[1], /already dispatched/);
});
it("dispatch_failure_expires_claim_so_next_idle_turn_can_retry", async () => {
const root = makeForgeProject();
recordSelfFeedback(
{
kind: "startup-dispatch-race",
severity: "critical",
summary: "Startup dispatch can fail before the turn is accepted",
source: "detector",
},
root,
);
const notifications: string[] = [];
const ctx = {
ui: {
notify(message: string) {
notifications.push(message);
},
},
} as any;
const failingPi = {
sendMessage() {
return Promise.reject(new Error("agent busy"));
},
} as any;
assert.equal(
dispatchSelfFeedbackInlineFixIfNeeded(root, ctx, failingPi),
1,
);
await Promise.resolve();
await Promise.resolve();
const messages: unknown[] = [];
const retryPi = {
sendMessage(message: unknown) {
messages.push(message);
},
} as any;
assert.equal(dispatchSelfFeedbackInlineFixIfNeeded(root, ctx, pi), 1);
assert.equal(dispatchSelfFeedbackInlineFixIfNeeded(root, ctx, pi), 0);
assert.equal(messages.length, 1);
assert.equal(notifications.length, 2);
assert.match(JSON.stringify(messages[0]), /sf-self-feedback-inline-fix/);
assert.match(notifications[1], /already dispatched/);
});
assert.equal(dispatchSelfFeedbackInlineFixIfNeeded(root, ctx, retryPi), 1);
assert.equal(messages.length, 1);
assert.match(notifications.join("\n"), /will retry at the next idle point/);
});
it("consumes the claim after the inline-fix entries are resolved", () => {
const root = makeForgeProject();
@ -162,7 +222,11 @@ describe("self-feedback inline drain", () => {
const ctx = { ui: { notify() {} } } as any;
const pi = { sendMessage() {} } as any;
assert.equal(dispatchSelfFeedbackInlineFixIfNeeded(root, ctx, pi), 1);
writeFileSync(join(root, "dirty.ts"), "export const dirty = true;\n", "utf-8");
writeFileSync(
join(root, "dirty.ts"),
"export const dirty = true;\n",
"utf-8",
);
assert.equal(
markResolved(
recorded.entry.id,
@ -199,4 +263,20 @@ describe("self-feedback inline drain", () => {
);
assert.equal(selected[0]?.repoIdentity, "external");
});
// Source-level check on register-hooks.ts: the session_start handler must
// actually dispatch the inline fix (not merely warn about pending feedback).
it("session_start_hook_queues_inline_fix_followup_not_only_warning", () => {
  const source = readFileSync(
    join(import.meta.dirname, "..", "bootstrap", "register-hooks.ts"),
    "utf-8",
  );
  // Scope the scan to the session_start registration up to where the
  // before-agent-start result is built.
  const start = source.indexOf('pi.on("session_start"');
  const end = source.indexOf("return buildBeforeAgentStartResult", start);
  assert.notEqual(start, -1);
  assert.notEqual(end, -1);
  const sessionStartBlock = source.slice(start, end);
  assert.match(sessionStartBlock, /dispatchSelfFeedbackInlineFixIfNeeded/);
  assert.match(sessionStartBlock, /even outside \/sf auto/);
  // The old "no auto-dispatch" comment marked the warning-only behavior;
  // its presence would indicate a regression.
  assert.doesNotMatch(sessionStartBlock, /no auto-dispatch/);
});
});

View file

@ -0,0 +1,106 @@
import assert from "node:assert/strict";
import {
mkdirSync,
mkdtempSync,
readFileSync,
rmSync,
writeFileSync,
} from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, describe, test } from "vitest";
import { registerDbTools } from "../bootstrap/db-tools.ts";
import { readAllSelfFeedback, recordSelfFeedback } from "../self-feedback.ts";
// Captured process-level state so each test can mutate cwd/SF_HOME freely.
const originalCwd = process.cwd();
const originalSfHome = process.env.SF_HOME;
// Temp project roots created during a test; removed in afterEach.
let roots: string[] = [];
afterEach(() => {
  // Restore cwd first so the temp roots can be deleted safely.
  process.chdir(originalCwd);
  for (const root of roots) rmSync(root, { recursive: true, force: true });
  roots = [];
  // Restore SF_HOME exactly (including the "was unset" case).
  if (originalSfHome === undefined) delete process.env.SF_HOME;
  else process.env.SF_HOME = originalSfHome;
});
function makeForgeProject(): string {
const root = mkdtempSync(join(tmpdir(), "sf-self-feedback-resolve-"));
roots.push(root);
mkdirSync(join(root, ".sf"), { recursive: true });
process.env.SF_HOME = join(root, "sf-home");
writeFileSync(
join(root, "package.json"),
JSON.stringify({ name: "singularity-forge", version: "0.0.1" }),
"utf-8",
);
return root;
}
/**
 * Builds a minimal plugin-interface stub that records every registered tool
 * on an exposed `tools` array for later inspection.
 */
function makeMockPi() {
  const registered: any[] = [];
  const registerTool = (tool: any): void => {
    registered.push(tool);
  };
  return { registerTool, tools: registered } as any;
}
describe("sf_self_feedback_resolve", () => {
  // End-to-end: record an entry, resolve it via the registered tool, then
  // verify the JSONL entry and the regenerated markdown both reflect it.
  test("resolve_when_entry_is_fixed_sets_resolved_evidence_and_regenerates_markdown", async () => {
    const root = makeForgeProject();
    const recorded = recordSelfFeedback(
      {
        kind: "inline-fix-resolution-gap",
        severity: "high",
        summary: "Inline fix landed but entry stayed unresolved",
        acceptanceCriteria: "1. Resolver tool exists. 2. JSONL is updated.",
        source: "detector",
      },
      root,
    );
    assert.ok(recorded);
    // The tool resolves the project from cwd, so chdir into the fixture.
    process.chdir(root);
    const pi = makeMockPi();
    registerDbTools(pi);
    const tool = pi.tools.find(
      (t: any) => t.name === "sf_self_feedback_resolve",
    );
    assert.ok(tool, "resolver tool should be registered");
    // Invoke the tool directly; trailing undefineds fill the optional
    // execute() context arguments the real runtime would supply.
    const result = await tool.execute(
      "call-1",
      {
        id: recorded.entry.id,
        reason: "resolver tool verified",
        commit_sha: "abc1234",
        test_path:
          "src/resources/extensions/sf/tests/self-feedback-resolve-tool.test.ts",
        criteria_met: ["Resolver tool exists", "JSONL is updated"],
      },
      undefined,
      undefined,
      undefined,
    );
    assert.equal(result.details?.resolved, true);
    // The persisted entry carries resolution timestamp, evidence, and the
    // criteria the agent claimed were met.
    const [entry] = readAllSelfFeedback(root).filter(
      (e) => e.id === recorded.entry.id,
    );
    assert.ok(entry?.resolvedAt);
    assert.equal(entry.resolvedEvidence?.kind, "agent-fix");
    assert.equal(entry.resolvedEvidence?.commitSha, "abc1234");
    assert.deepEqual(entry.resolvedCriteriaMet, [
      "Resolver tool exists",
      "JSONL is updated",
    ]);
    // Resolution should also regenerate the human-readable markdown view.
    const markdown = readFileSync(
      join(root, ".sf", "SELF-FEEDBACK.md"),
      "utf-8",
    );
    assert.match(markdown, /Recently Resolved/);
    assert.match(markdown, /inline-fix-resolution-gap/);
  });
});

View file

@ -26,6 +26,7 @@ const CANONICAL_DB_TOOLS = [
"sf_summary_save",
"sf_milestone_generate_id",
"sf_self_report",
"sf_self_feedback_resolve",
"sf_plan_milestone",
"sf_plan_slice",
"sf_plan_task",

View file

@ -0,0 +1,401 @@
/**
* Triage protocol registry integration tests.
*
* Purpose: Validate that every finding in the M008 bug registry conforms to
* the triage protocol definitions (severity, status, cluster routing), and
* that the systematic-debugging skill correctly references the protocol.
*
* Consumer: CI gate that blocks milestone completion when registry and
* protocol drift out of sync.
*/
import assert from "node:assert/strict";
import { readFileSync } from "node:fs";
import { dirname, join } from "node:path";
import { describe, test } from "vitest";
import { fileURLToPath } from "node:url";
const __dirname = dirname(fileURLToPath(import.meta.url));
const repoRoot = join(__dirname, "..", "..", "..", "..", "..");
// ─── Load canonical artifacts ────────────────────────────────────────────────
// Paths to the three artifacts this suite keeps in sync: the bug registry,
// the triage protocol, and the systematic-debugging skill.
const registryPath = join(repoRoot, ".sf", "milestones", "M008", "bugs", "bug-registry.json");
const protocolPath = join(repoRoot, ".sf", "milestones", "M008", "triage-protocol.md");
const skillPath = join(repoRoot, "src", "resources", "extensions", "sf", "skills", "systematic-debugging", "SKILL.md");
// NOTE(review): the parse result is cast, not validated — a malformed
// registry would surface as confusing assertion failures rather than a
// schema error. Acceptable for a CI gate; the shape below is the contract.
const registry = JSON.parse(readFileSync(registryPath, "utf-8")) as {
  schema_version: string;
  meta: {
    source: string;
    date: string;
    totalFindings: number;
    clusters: string[];
  };
  findings: Array<{
    id: string;
    file: string;
    lines: string;
    category: string;
    severity: string;
    status: string;
    description: string;
    suggestedFix: string;
    cluster: string;
    fixedByTaskId?: string;
  }>;
  summary: {
    severity: Record<string, number>;
    status: Record<string, number>;
    cluster: Record<string, number>;
  };
};
const protocol = readFileSync(protocolPath, "utf-8");
// The skill file is optional: if missing, skill-related tests assert against
// the empty string and fail with their own messages.
const skill = (() => {
  try {
    return readFileSync(skillPath, "utf-8");
  } catch {
    return "";
  }
})();
// ─── Severity definitions from protocol ──────────────────────────────────────
// Canonical severity and status vocabularies; every registry finding must use
// one of these values.
const VALID_SEVERITIES = ["HIGH", "MEDIUM", "LOW", "FALSE_POSITIVE"] as const;
const VALID_STATUSES = ["CONFIRMED", "FALSE_POSITIVE", "FIXED", "WONTFIX", "IN_PROGRESS"] as const;
// Cluster routing table from protocol
const PROTOCOL_CLUSTERS = [
  "engine + verification",
  "scaffold + doctor",
  "worktree + git",
  "memory + state + cache",
  "bootstrap + workflow",
  "notification + detection + headless",
] as const;
// ─── Helpers ─────────────────────────────────────────────────────────────────
/**
 * Asserts a per-finding invariant, prefixing the failure message with the
 * finding id so registry violations are easy to locate.
 */
function assertFinding(
  condition: boolean,
  findingId: string,
  message: string,
): void {
  if (!condition) {
    assert.fail(`Finding ${findingId}: ${message}`);
  }
}
// ─── Registry structural validity ────────────────────────────────────────────
describe("triage-protocol-registry", () => {
  // Registry-level structural checks.
  test("registry_schema_version_is_1_0_0", () => {
    assert.strictEqual(registry.schema_version, "1.0.0", "schema_version must be 1.0.0");
  });
  test("registry_meta_totalFindings_matches_actual_count", () => {
    assert.strictEqual(
      registry.meta.totalFindings,
      registry.findings.length,
      `meta.totalFindings (${registry.meta.totalFindings}) must equal actual findings count (${registry.findings.length})`,
    );
  });
  test("registry_meta_clusters_match_protocol_clusters", () => {
    // Set comparison ignores ordering and duplicates on both sides.
    const registryClusters = new Set(registry.meta.clusters);
    const protocolClusterSet = new Set(PROTOCOL_CLUSTERS);
    assert.deepStrictEqual(
      registryClusters,
      protocolClusterSet,
      "registry meta.clusters must exactly match protocol cluster routing table",
    );
  });
  // ─── Per-finding validation ──────────────────────────────────────────────
  test("every_finding_has_valid_severity", () => {
    for (const f of registry.findings) {
      assertFinding(
        VALID_SEVERITIES.includes(f.severity as (typeof VALID_SEVERITIES)[number]),
        f.id,
        `severity "${f.severity}" is not one of ${VALID_SEVERITIES.join(", ")}`,
      );
    }
  });
  test("every_finding_has_valid_status", () => {
    for (const f of registry.findings) {
      assertFinding(
        VALID_STATUSES.includes(f.status as (typeof VALID_STATUSES)[number]),
        f.id,
        `status "${f.status}" is not one of ${VALID_STATUSES.join(", ")}`,
      );
    }
  });
  test("every_finding_belongs_to_protocol_cluster", () => {
    for (const f of registry.findings) {
      assertFinding(
        PROTOCOL_CLUSTERS.includes(f.cluster as (typeof PROTOCOL_CLUSTERS)[number]),
        f.id,
        `cluster "${f.cluster}" is not in the protocol routing table`,
      );
    }
  });
  test("every_finding_has_non_empty_id", () => {
    for (const f of registry.findings) {
      assertFinding(
        f.id.length > 0,
        f.id,
        "finding id must not be empty",
      );
    }
  });
  test("every_finding_has_non_empty_description", () => {
    for (const f of registry.findings) {
      assertFinding(
        f.description.length > 0,
        f.id,
        "description must not be empty",
      );
    }
  });
  test("every_finding_has_non_empty_suggestedFix", () => {
    for (const f of registry.findings) {
      assertFinding(
        f.suggestedFix.length > 0,
        f.id,
        "suggestedFix must not be empty",
      );
    }
  });
  // ─── Severity / status consistency rules ─────────────────────────────────
  // FALSE_POSITIVE must be consistent across both fields (checked both ways).
  test("severity_FALSE_POSITIVE_implies_status_FALSE_POSITIVE", () => {
    for (const f of registry.findings) {
      if (f.severity === "FALSE_POSITIVE") {
        assertFinding(
          f.status === "FALSE_POSITIVE",
          f.id,
          `severity=FALSE_POSITIVE requires status=FALSE_POSITIVE, got status=${f.status}`,
        );
      }
    }
  });
  test("status_FALSE_POSITIVE_implies_severity_FALSE_POSITIVE", () => {
    for (const f of registry.findings) {
      if (f.status === "FALSE_POSITIVE") {
        assertFinding(
          f.severity === "FALSE_POSITIVE",
          f.id,
          `status=FALSE_POSITIVE requires severity=FALSE_POSITIVE, got severity=${f.severity}`,
        );
      }
    }
  });
  // fixedByTaskId and status=FIXED must imply each other.
  test("status_FIXED_implies_fixedByTaskId_present", () => {
    for (const f of registry.findings) {
      if (f.status === "FIXED") {
        assertFinding(
          f.fixedByTaskId !== undefined && f.fixedByTaskId.length > 0,
          f.id,
          `status=FIXED requires fixedByTaskId to be set`,
        );
      }
    }
  });
  test("fixedByTaskId_present_only_when_status_FIXED", () => {
    for (const f of registry.findings) {
      if (f.fixedByTaskId !== undefined) {
        assertFinding(
          f.status === "FIXED",
          f.id,
          `fixedByTaskId (${f.fixedByTaskId}) should only be present when status=FIXED, got status=${f.status}`,
        );
      }
    }
  });
  // ─── Summary statistics accuracy ─────────────────────────────────────────
  test("summary_severity_counts_match_actual", () => {
    const actual: Record<string, number> = {};
    for (const f of registry.findings) {
      actual[f.severity] = (actual[f.severity] ?? 0) + 1;
    }
    assert.deepStrictEqual(
      registry.summary.severity,
      actual,
      "summary.severity counts must match actual finding severities",
    );
  });
  test("summary_status_counts_match_actual", () => {
    const actual: Record<string, number> = {};
    for (const f of registry.findings) {
      actual[f.status] = (actual[f.status] ?? 0) + 1;
    }
    // Compare only keys that exist in either object; zero-count keys in summary are allowed
    const allKeys = new Set([...Object.keys(registry.summary.status), ...Object.keys(actual)]);
    for (const key of allKeys) {
      const expectedCount = registry.summary.status[key] ?? 0;
      const actualCount = actual[key] ?? 0;
      assert.strictEqual(
        actualCount,
        expectedCount,
        `summary.status["${key}"]: expected ${expectedCount}, got ${actualCount}`,
      );
    }
  });
  test("summary_cluster_counts_match_actual", () => {
    const actual: Record<string, number> = {};
    for (const f of registry.findings) {
      actual[f.cluster] = (actual[f.cluster] ?? 0) + 1;
    }
    assert.deepStrictEqual(
      registry.summary.cluster,
      actual,
      "summary.cluster counts must match actual finding clusters",
    );
  });
  // ─── Protocol content validation ─────────────────────────────────────────
  // These are substring checks against the markdown, so they are resilient
  // to formatting changes but not to renamed vocabulary.
  test("protocol_defines_all_severity_levels", () => {
    for (const sev of VALID_SEVERITIES) {
      assert.ok(
        protocol.includes(sev),
        `triage-protocol.md must mention severity level ${sev}`,
      );
    }
  });
  test("protocol_defines_all_status_values", () => {
    for (const st of VALID_STATUSES) {
      assert.ok(
        protocol.includes(st),
        `triage-protocol.md must mention status value ${st}`,
      );
    }
  });
  test("protocol_defines_all_clusters_in_routing_table", () => {
    for (const cluster of PROTOCOL_CLUSTERS) {
      assert.ok(
        protocol.includes(cluster),
        `triage-protocol.md cluster routing table must include "${cluster}"`,
      );
    }
  });
  test("protocol_contains_confidence_gate_table", () => {
    assert.ok(
      protocol.includes("Confidence Gate Requirements"),
      "protocol must contain Confidence Gate Requirements section",
    );
    assert.ok(
      protocol.includes("0.90") || protocol.includes("0.95") || protocol.includes("0.80"),
      "protocol must list numeric confidence thresholds",
    );
  });
  test("protocol_contains_escalation_rules", () => {
    assert.ok(
      protocol.includes("Escalation Rules"),
      "protocol must contain Escalation Rules section",
    );
  });
  // ─── Skill references protocol correctly ─────────────────────────────────
  test("skill_references_triage_protocol_file", () => {
    assert.ok(
      skill.includes("triage-protocol.md") || skill.includes("triage protocol"),
      "systematic-debugging SKILL.md must reference the triage protocol",
    );
  });
  test("skill_references_bug_registry", () => {
    assert.ok(
      skill.includes("bug-registry.json"),
      "systematic-debugging SKILL.md must reference bug-registry.json",
    );
  });
  test("skill_lists_severity_values", () => {
    // Accept either JSON-quoted or markdown-backtick forms.
    assert.ok(
      (skill.includes('"HIGH"') || skill.includes('`HIGH`')) &&
      (skill.includes('"MEDIUM"') || skill.includes('`MEDIUM`')) &&
      (skill.includes('"LOW"') || skill.includes('`LOW`')),
      "systematic-debugging SKILL.md must list HIGH / MEDIUM / LOW severity values",
    );
  });
  test("skill_mentions_confidence_gate_thresholds", () => {
    assert.ok(
      skill.includes("0.80") || skill.includes("0.85") || skill.includes("0.90") || skill.includes("0.95"),
      "systematic-debugging SKILL.md must mention confidence gate thresholds",
    );
  });
  test("skill_mentions_cluster_aware_fixes", () => {
    assert.ok(
      skill.includes("cluster-aware") || skill.includes("Cluster-aware"),
      "systematic-debugging SKILL.md must mention cluster-aware fixes",
    );
  });
  test("skill_mentions_registry_update_after_fix", () => {
    assert.ok(
      skill.includes("Update the registry") || skill.includes("update the registry") || skill.includes("bug-registry.json"),
      "systematic-debugging SKILL.md must instruct updating registry after fix",
    );
  });
  // ─── Protocol decision flow integrity ────────────────────────────────────
  test("protocol_decision_flow_has_all_severity_branches", () => {
    // The decision flow should branch on HIGH, MEDIUM, and LOW
    assert.ok(
      protocol.includes("severity = HIGH") || protocol.includes("Is severity = HIGH"),
      "protocol decision flow must branch on HIGH severity",
    );
    assert.ok(
      protocol.includes("severity = MEDIUM") || protocol.includes("Is severity = MEDIUM"),
      "protocol decision flow must branch on MEDIUM severity",
    );
    assert.ok(
      protocol.includes("severity = LOW") || protocol.includes("Is severity = LOW"),
      "protocol decision flow must branch on LOW severity",
    );
  });
  test("protocol_high_severity_requires_regression_test", () => {
    // Scan only from the HIGH branch onward (indexOf -1 would scan the whole
    // doc; the earlier branch test already guards against that case).
    const highSection = protocol.slice(protocol.indexOf("severity = HIGH"));
    assert.ok(
      highSection.includes("regression test") || protocol.includes("Require regression test"),
      "protocol must require regression test for HIGH severity",
    );
  });
  test("protocol_medium_severity_has_confidence_gate_0_85", () => {
    assert.ok(
      protocol.includes("0.85"),
      "protocol must specify 0.85 confidence gate for MEDIUM severity",
    );
  });
  test("protocol_low_severity_has_confidence_gate_0_80", () => {
    assert.ok(
      protocol.includes("0.80"),
      "protocol must specify 0.80 confidence gate for LOW severity",
    );
  });
});

View file

@ -0,0 +1,264 @@
import assert from "node:assert/strict";
import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, test } from "vitest";
import { buildQuerySnapshot } from "../../../../headless-query.ts";
import { resolveDispatch } from "../auto-dispatch.ts";
import {
clearUnitRuntimeRecord,
decideUnitRuntimeDispatch,
readUnitRuntimeRecord,
UNIT_RUNTIME_STATUSES,
UNIT_RUNTIME_TERMINAL_STATUSES,
UNIT_RUNTIME_TRANSITIONS,
writeUnitRuntimeRecord,
} from "../unit-runtime.ts";
// Temp directories created during a test; removed in afterEach.
const tmpDirs: string[] = [];
/**
 * Creates a temp project base with an empty `.sf/milestones` tree and
 * registers it for cleanup.
 */
function makeTmpBase(prefix = "sf-unit-runtime-fsm-"): string {
  const base = mkdtempSync(join(tmpdir(), prefix));
  tmpDirs.push(base);
  mkdirSync(join(base, ".sf", "milestones"), { recursive: true });
  return base;
}
/**
 * Creates a temp project containing an M001 roadmap with two independent
 * (`depends:[]`) low-risk slices, suitable for parallel-research dispatch.
 */
function makeParallelResearchProject(): string {
  const base = makeTmpBase("sf-unit-runtime-parallel-");
  const milestoneDir = join(base, ".sf", "milestones", "M001");
  mkdirSync(milestoneDir, { recursive: true });
  // Fixture roadmap: the exact markdown shape the dispatcher parses.
  writeFileSync(
    join(milestoneDir, "M001-ROADMAP.md"),
    [
      "# M001: Parallel Research Milestone",
      "",
      "**Vision:** Research-ready slices.",
      "",
      "## Slices",
      "",
      "- [ ] **S01: Alpha** `risk:low` `depends:[]`",
      "- [ ] **S02: Beta** `risk:low` `depends:[]`",
      "",
    ].join("\n"),
    "utf-8",
  );
  return base;
}
/**
 * Runs resolveDispatch for the fixture milestone in the planning phase with
 * slice S01 active and no registry entries or blockers.
 */
async function resolvePlanningDispatch(base: string) {
  return resolveDispatch({
    basePath: base,
    mid: "M001",
    midTitle: "Parallel Research Milestone",
    // Minimal state object; cast because the tests only populate the fields
    // the dispatcher reads for this scenario.
    state: {
      phase: "planning",
      activeMilestone: {
        id: "M001",
        title: "Parallel Research Milestone",
        status: "active",
      },
      activeSlice: { id: "S01", title: "Alpha" },
      activeTask: null,
      registry: [],
      blockers: [],
    } as any,
    prefs: undefined,
  });
}
// Remove every temp directory a test created and reset the registry.
afterEach(() => {
  for (const dir of tmpDirs) {
    rmSync(dir, { recursive: true, force: true });
  }
  tmpDirs.length = 0;
});
// Pins the exact FSM vocabulary and transition table: any change to the
// runtime state machine must be made deliberately, here and in the module.
test("unit_runtime_transitions_when_enumerated_cover_all_statuses", () => {
  assert.deepEqual(UNIT_RUNTIME_STATUSES, [
    "queued",
    "claimed",
    "running",
    "progress",
    "completed",
    "failed",
    "blocked",
    "cancelled",
    "stale",
    "runaway-recovered",
    "notified",
  ]);
  assert.deepEqual(UNIT_RUNTIME_TERMINAL_STATUSES, [
    "completed",
    "failed",
    "blocked",
    "cancelled",
    "stale",
    "runaway-recovered",
  ]);
  assert.deepEqual(UNIT_RUNTIME_TRANSITIONS, {
    queued: ["claimed", "cancelled"],
    claimed: ["running", "stale", "cancelled"],
    running: [
      "progress",
      "completed",
      "failed",
      "blocked",
      "cancelled",
      "stale",
      "runaway-recovered",
    ],
    progress: [
      "running",
      "completed",
      "failed",
      "blocked",
      "cancelled",
      "stale",
      "runaway-recovered",
    ],
    completed: ["notified"],
    failed: ["queued", "notified"],
    blocked: ["notified"],
    cancelled: ["notified"],
    stale: ["queued", "notified"],
    "runaway-recovered": ["queued", "notified"],
    notified: ["queued"],
  });
});
// A failed synthetic unit must block redispatch of that unit until its
// runtime record is explicitly cleared; dispatch then resumes.
test("synthetic_failed_unit_when_not_reset_cannot_redispatch", async () => {
  const base = makeParallelResearchProject();
  writeUnitRuntimeRecord(
    base,
    "research-slice",
    "M001/parallel-research",
    1000,
    {
      status: "failed",
      retryCount: 0,
      maxRetries: 2,
    },
  );
  const record = readUnitRuntimeRecord(
    base,
    "research-slice",
    "M001/parallel-research",
  );
  const decision = decideUnitRuntimeDispatch(record);
  assert.equal(decision.action, "block");
  assert.equal(decision.reasonCode, "synthetic-reset-required");
  assert.equal(decision.retryCount, 0);
  assert.equal(decision.maxRetries, 2);
  // While the synthetic unit is blocked, the planner falls back to
  // dispatching the concrete slice (M001/S01) instead.
  const blockedDispatch = await resolvePlanningDispatch(base);
  assert.equal(blockedDispatch.action, "dispatch");
  if (blockedDispatch.action === "dispatch") {
    assert.equal(blockedDispatch.unitType, "research-slice");
    assert.equal(blockedDispatch.unitId, "M001/S01");
  }
  // Clearing the record resets the decision to a fresh dispatch, and the
  // planner returns to the parallel-research unit.
  clearUnitRuntimeRecord(base, "research-slice", "M001/parallel-research");
  const resetDecision = decideUnitRuntimeDispatch(
    readUnitRuntimeRecord(base, "research-slice", "M001/parallel-research"),
  );
  assert.equal(resetDecision.action, "dispatch");
  assert.equal(resetDecision.reasonCode, "no-runtime-record");
  const resetDispatch = await resolvePlanningDispatch(base);
  assert.equal(resetDispatch.action, "dispatch");
  if (resetDispatch.action === "dispatch") {
    assert.equal(resetDispatch.unitType, "research-slice");
    assert.equal(resetDispatch.unitId, "M001/parallel-research");
  }
});
// Table-driven: with retry budget remaining, retryable terminal statuses map
// to "retry" and non-retryable ones to "notify".
test("terminal_status_when_budget_available_produces_expected_dispatch_decision", () => {
  const base = makeTmpBase();
  const cases = [
    ["completed", "notify", "terminal-ready-to-notify"],
    ["failed", "retry", "retry-budget-available"],
    ["blocked", "notify", "terminal-ready-to-notify"],
    ["cancelled", "notify", "terminal-ready-to-notify"],
    ["stale", "retry", "retry-budget-available"],
    ["runaway-recovered", "retry", "retry-budget-available"],
  ] as const;
  for (const [status, expectedAction, expectedReason] of cases) {
    // Distinct unit id per status so records do not overwrite each other.
    writeUnitRuntimeRecord(base, "execute-task", `M001/S01/${status}`, 1000, {
      status,
      retryCount: 0,
      maxRetries: 2,
    });
    const record = readUnitRuntimeRecord(
      base,
      "execute-task",
      `M001/S01/${status}`,
    );
    const decision = decideUnitRuntimeDispatch(record);
    assert.equal(decision.action, expectedAction, status);
    assert.equal(decision.reasonCode, expectedReason, status);
  }
});
// Once retryCount reaches maxRetries, retryable terminal statuses must block
// instead of retrying.
test("retryable_terminal_status_when_budget_exhausted_blocks_dispatch", () => {
  const base = makeTmpBase();
  for (const status of ["failed", "stale", "runaway-recovered"] as const) {
    writeUnitRuntimeRecord(base, "execute-task", `M001/S01/${status}`, 1000, {
      status,
      retryCount: 2,
      maxRetries: 2,
    });
    const decision = decideUnitRuntimeDispatch(
      readUnitRuntimeRecord(base, "execute-task", `M001/S01/${status}`),
    );
    assert.equal(decision.action, "block", status);
    assert.equal(decision.reasonCode, "retry-budget-exhausted", status);
    assert.equal(decision.retryCount, 2, status);
    assert.equal(decision.maxRetries, 2, status);
  }
});
// A terminal unit that already carries notifiedAt is skipped — no duplicate
// retry or notification.
test("terminal_status_when_already_notified_skips_dispatch", () => {
  const base = makeTmpBase();
  writeUnitRuntimeRecord(base, "execute-task", "M001/S01/T01", 1000, {
    status: "failed",
    retryCount: 0,
    maxRetries: 2,
    notifiedAt: 2000,
  });
  const decision = decideUnitRuntimeDispatch(
    readUnitRuntimeRecord(base, "execute-task", "M001/S01/T01"),
  );
  assert.equal(decision.action, "skip");
  assert.equal(decision.reasonCode, "already-notified");
});
// The headless query snapshot must surface the runtime record's retry
// budget, dispatch decision, and diagnostic fields verbatim.
test("headless_query_when_runtime_record_exists_shows_retry_budget", async () => {
  const base = makeTmpBase();
  writeUnitRuntimeRecord(base, "execute-task", "M001/S01/T01", 1000, {
    status: "failed",
    retryCount: 1,
    maxRetries: 2,
    watchdogReason: "no heartbeat",
    outputPath: ".sf/runtime/units/M001-S01-T01.log",
  });
  const snapshot = await buildQuerySnapshot(base);
  const unit = snapshot.runtime.units.find(
    (item) =>
      item.unitType === "execute-task" && item.unitId === "M001/S01/T01",
  );
  assert.ok(unit);
  assert.equal(unit.status, "failed");
  assert.equal(unit.retryCount, 1);
  assert.equal(unit.maxRetries, 2);
  // Budget remaining is derived (maxRetries - retryCount).
  assert.equal(unit.retryBudgetRemaining, 1);
  assert.equal(unit.dispatchDecision.action, "retry");
  assert.equal(unit.dispatchDecision.reasonCode, "retry-budget-available");
  assert.equal(unit.watchdogReason, "no heartbeat");
  assert.equal(unit.outputPath, ".sf/runtime/units/M001-S01-T01.log");
});

View file

@ -22,7 +22,126 @@ import {
} from "./paths.js";
import { parseUnitId } from "./unit-id.js";
/**
* Lists every durable unit runtime status in FSM order.
*
* Purpose: give dispatch, recovery, and query surfaces one canonical state
* vocabulary so terminal units cannot be redispatched by ambiguous legacy phases.
*
* Consumer: auto runtime persistence, unit-runtime tests, headless query summaries.
*/
export const UNIT_RUNTIME_STATUSES = [
  // Pre-execution
  "queued",
  "claimed",
  // Active
  "running",
  "progress",
  // Terminal
  "completed",
  "failed",
  "blocked",
  "cancelled",
  "stale",
  "runaway-recovered",
  // Post-terminal acknowledgement
  "notified",
] as const;
/**
 * Names the unit statuses that end an execution attempt.
 *
 * Purpose: centralize the terminal-state union so retry and notification policy
 * does not drift between watchdog recovery and dispatch preview logic.
 *
 * Consumer: decideUnitRuntimeDispatch and operator-facing query summaries.
 */
export const UNIT_RUNTIME_TERMINAL_STATUSES = [
  "completed",
  "failed",
  "blocked",
  "cancelled",
  "stale",
  "runaway-recovered",
] as const;
/**
 * Describes the explicit unit runtime finite-state-machine transitions.
 *
 * Purpose: make retry, notification, and reset transitions reviewable as data
 * instead of implied by ad hoc marker files or legacy phase strings.
 *
 * Consumer: unit runtime tests, future dispatch/reconciler guards.
 */
export const UNIT_RUNTIME_TRANSITIONS = {
  queued: ["claimed", "cancelled"],
  claimed: ["running", "stale", "cancelled"],
  running: [
    "progress",
    "completed",
    "failed",
    "blocked",
    "cancelled",
    "stale",
    "runaway-recovered",
  ],
  // "progress" can return to "running"; otherwise it shares running's exits.
  progress: [
    "running",
    "completed",
    "failed",
    "blocked",
    "cancelled",
    "stale",
    "runaway-recovered",
  ],
  completed: ["notified"],
  // Retryable terminal states may re-enter the queue or be notified.
  failed: ["queued", "notified"],
  blocked: ["notified"],
  cancelled: ["notified"],
  stale: ["queued", "notified"],
  "runaway-recovered": ["queued", "notified"],
  // After notification the only way forward is a fresh queue cycle.
  notified: ["queued"],
} as const satisfies Record<UnitRuntimeStatus, readonly UnitRuntimeStatus[]>;
/**
 * Enumerates every durable unit runtime status.
 *
 * Purpose: let persistence and dispatch decisions share one exhaustive status
 * type while legacy `phase` remains available for older call sites.
 *
 * Consumer: AutoUnitRuntimeRecord.status, retry decisions, query summaries.
 */
export type UnitRuntimeStatus = (typeof UNIT_RUNTIME_STATUSES)[number];
/**
 * Enumerates statuses that end a unit execution attempt.
 *
 * Purpose: distinguish states that need notify/retry/block policy from active
 * states that should not start a second copy of the same unit.
 *
 * Consumer: decideUnitRuntimeDispatch.
 */
export type UnitRuntimeTerminalStatus =
  (typeof UNIT_RUNTIME_TERMINAL_STATUSES)[number];
/**
 * Captures the durable FSM state embedded in a unit runtime record.
 *
 * Purpose: expose retry budget, liveness, and notification fields together so
 * callers can decide whether a unit should run, retry, block, or notify.
 *
 * Consumer: writeUnitRuntimeRecord, decideUnitRuntimeDispatch, headless query.
 */
export interface UnitRuntimeState {
  // Current FSM status of the unit.
  status: UnitRuntimeStatus;
  // Retry budget: attempts consumed vs. allowed.
  retryCount: number;
  maxRetries: number;
  // Liveness timestamps (epoch ms); null when never observed.
  lastHeartbeatAt: number | null;
  lastProgressAt: number;
  lastOutputAt: number | null;
  // Path to the unit's captured output, if any.
  outputPath: string | null;
  // Why the watchdog intervened, if it did.
  watchdogReason: string | null;
  // When the terminal outcome was surfaced to the operator; null if pending.
  notifiedAt: number | null;
}
export type UnitRuntimePhase =
| UnitRuntimeStatus
| "dispatched"
| "wrapup-warning-sent"
| "runaway-warning-sent"
@ -33,6 +152,14 @@ export type UnitRuntimePhase =
| "paused"
| "skipped";
// Default retry budget when a record does not specify maxRetries.
const DEFAULT_UNIT_RUNTIME_MAX_RETRIES = 1;
// Terminal statuses that may consume retry budget and re-enter the queue;
// the remaining terminal statuses (completed/blocked/cancelled) only notify.
const RETRYABLE_TERMINAL_STATUSES = new Set<UnitRuntimeStatus>([
  "failed",
  "stale",
  "runaway-recovered",
]);
export interface ExecuteTaskRecoveryStatus {
planPath: string;
summaryPath: string;
@ -50,18 +177,263 @@ export interface AutoUnitRuntimeRecord {
startedAt: number;
updatedAt: number;
phase: UnitRuntimePhase;
status: UnitRuntimeStatus;
wrapupWarningSent: boolean;
continueHereFired: boolean;
timeoutAt: number | null;
lastHeartbeatAt?: number | null;
lastProgressAt: number;
progressCount: number;
lastProgressKind: string;
lastOutputAt?: number | null;
outputPath?: string | null;
watchdogReason?: string | null;
notifiedAt?: number | null;
recovery?: ExecuteTaskRecoveryStatus;
recoveryAttempts?: number;
retryCount?: number;
maxRetries?: number;
lastRecoveryReason?: "idle" | "hard";
runawayGuardPause?: RunawayGuardPauseMetadata;
}
/**
 * Describes whether dispatch may run a unit from its runtime record.
 *
 * Purpose: surface the same retry-budget decision to tests, dispatch preview,
 * and operator diagnostics without reinterpreting terminal states ad hoc.
 *
 * Every variant carries the same retry-budget triple so callers can log or
 * display budget state regardless of the chosen action.
 *
 * Consumer: unit-runtime FSM tests and headless query runtime summaries.
 */
export type UnitRuntimeDispatchDecision =
  // Start the unit: it has never run, or its record is explicitly queued.
  | {
      action: "dispatch";
      reasonCode: "no-runtime-record" | "queued";
      retryCount: number;
      maxRetries: number;
      retryBudgetRemaining: number;
    }
  // Re-run a retryable terminal unit while budget remains.
  | {
      action: "retry";
      reasonCode: "retry-budget-available";
      retryCount: number;
      maxRetries: number;
      retryBudgetRemaining: number;
    }
  // Terminal outcome that should be surfaced to the operator.
  | {
      action: "notify";
      reasonCode: "terminal-ready-to-notify";
      retryCount: number;
      maxRetries: number;
      retryBudgetRemaining: number;
    }
  // Hold the unit: budget exhausted, or a synthetic unit needs manual reset.
  | {
      action: "block";
      reasonCode: "retry-budget-exhausted" | "synthetic-reset-required";
      retryCount: number;
      maxRetries: number;
      retryBudgetRemaining: number;
    }
  // Take no action: already notified, still active, or nothing to do.
  | {
      action: "skip";
      reasonCode:
        | "already-notified"
        | "active-or-claimed"
        | "notified"
        | "terminal-nonretryable";
      retryCount: number;
      maxRetries: number;
      retryBudgetRemaining: number;
    };
/**
 * Reports whether `updates` explicitly carries `key` as an own property.
 *
 * Purpose: distinguish "caller set this field (possibly to undefined/null)"
 * from "caller omitted this field" when merging partial runtime updates.
 */
function hasUpdate<K extends keyof AutoUnitRuntimeRecord>(
  updates: Partial<AutoUnitRuntimeRecord>,
  key: K,
): boolean {
  return Object.prototype.hasOwnProperty.call(updates, key);
}
/**
 * Maps a durable FSM status onto the legacy `phase` field.
 *
 * Purpose: keep older call sites that still read `phase` coherent when a
 * record is written with only the new `status` value.
 */
function phaseForStatus(status: UnitRuntimeStatus): UnitRuntimePhase {
  if (status === "queued" || status === "claimed" || status === "running") {
    return "dispatched";
  }
  if (status === "progress") {
    return "wrapup-warning-sent";
  }
  if (status === "completed") {
    return "finalized";
  }
  // Remaining statuses share their name with a legacy phase value.
  return status;
}
/**
 * Derives an FSM status from a legacy `phase` value.
 *
 * Purpose: let records written before `status` existed still participate in
 * retry and dispatch policy.
 *
 * `record` supplies runaway-guard context so a paused unit can be classified
 * as retryable ("runaway-recovered") rather than blocked.
 */
function inferStatusFromPhase(
  phase: UnitRuntimePhase,
  record?: Pick<AutoUnitRuntimeRecord, "runawayGuardPause"> | null,
): UnitRuntimeStatus {
  // Phases that already name a status pass through unchanged.
  if ((UNIT_RUNTIME_STATUSES as readonly string[]).includes(phase)) {
    return phase as UnitRuntimeStatus;
  }
  if (
    phase === "wrapup-warning-sent" ||
    phase === "runaway-warning-sent" ||
    phase === "runaway-final-warning-sent" ||
    phase === "recovered"
  ) {
    return "progress";
  }
  if (phase === "timeout") {
    return "stale";
  }
  if (phase === "finalized") {
    return "completed";
  }
  if (phase === "paused") {
    // A runaway-guard pause is retryable; a plain pause is blocked.
    return record?.runawayGuardPause ? "runaway-recovered" : "blocked";
  }
  if (phase === "skipped") {
    return "blocked";
  }
  // "dispatched" and any unrecognized phase count as actively running.
  return "running";
}
/** Remaining retry attempts, clamped so an overspent budget reads as zero. */
function retryBudgetRemaining(retryCount: number, maxRetries: number): number {
  const remaining = maxRetries - retryCount;
  return remaining > 0 ? remaining : 0;
}
/**
 * Returns true when a runtime status is terminal for one execution attempt.
 *
 * Purpose: keep terminal-state checks exhaustive against the exported terminal
 * union rather than hard-coded differently at each caller.
 *
 * Consumer: decideUnitRuntimeDispatch and query summary generation.
 */
export function isTerminalUnitRuntimeStatus(
  status: UnitRuntimeStatus,
): status is UnitRuntimeTerminalStatus {
  const terminal: readonly string[] = UNIT_RUNTIME_TERMINAL_STATUSES;
  return terminal.includes(status);
}
/**
 * Returns the normalized FSM state embedded in a runtime record.
 *
 * Purpose: let legacy records with only `phase` still participate in retry and
 * query policy while new records persist explicit FSM fields.
 *
 * Fallbacks: `status` is inferred from `phase` when absent, `retryCount`
 * falls back to the older `recoveryAttempts` counter, and `maxRetries`
 * defaults to the module-wide budget.
 *
 * Consumer: decideUnitRuntimeDispatch and headless query summaries.
 */
export function getUnitRuntimeState(
  record: AutoUnitRuntimeRecord,
): UnitRuntimeState {
  const {
    status: explicitStatus,
    retryCount: explicitRetries,
    recoveryAttempts,
    maxRetries,
    lastHeartbeatAt,
    lastProgressAt,
    lastOutputAt,
    outputPath,
    watchdogReason,
    notifiedAt,
  } = record;
  return {
    status: explicitStatus ?? inferStatusFromPhase(record.phase, record),
    retryCount: explicitRetries ?? recoveryAttempts ?? 0,
    maxRetries: maxRetries ?? DEFAULT_UNIT_RUNTIME_MAX_RETRIES,
    lastHeartbeatAt: lastHeartbeatAt ?? null,
    lastProgressAt,
    lastOutputAt: lastOutputAt ?? null,
    outputPath: outputPath ?? null,
    watchdogReason: watchdogReason ?? null,
    notifiedAt: notifiedAt ?? null,
  };
}
/**
 * Returns true for synthetic units that must be reset before rerun.
 *
 * Purpose: prevent synthetic orchestration units such as parallel research from
 * looping after failure while preserving normal task retry behavior.
 *
 * Consumer: decideUnitRuntimeDispatch.
 */
export function isSyntheticUnitRuntime(record: AutoUnitRuntimeRecord): boolean {
  if (record.unitType === "synthetic") {
    return true;
  }
  // Parallel-research units are synthetic even when typed as regular units.
  return record.unitId.includes("parallel-research");
}
/**
 * Decides whether a unit runtime record permits dispatch, retry, notify, or block.
 *
 * Purpose: enforce retry budgets and explicit reset requirements before callers
 * schedule another copy of a failed or stale unit.
 *
 * Guard order matters: notification state is checked before status, queued
 * before terminal classification, and synthetic reset before retry budget.
 *
 * Consumer: unit-runtime FSM tests and headless query runtime summaries.
 */
export function decideUnitRuntimeDispatch(
  record: AutoUnitRuntimeRecord | null,
  options: { synthetic?: boolean } = {},
): UnitRuntimeDispatchDecision {
  if (record == null) {
    // Never-run units get a fresh budget and dispatch immediately.
    return {
      action: "dispatch",
      reasonCode: "no-runtime-record",
      retryCount: 0,
      maxRetries: DEFAULT_UNIT_RUNTIME_MAX_RETRIES,
      retryBudgetRemaining: DEFAULT_UNIT_RUNTIME_MAX_RETRIES,
    };
  }
  const state = getUnitRuntimeState(record);
  // Every decision variant carries the same budget triple.
  const budget = {
    retryCount: state.retryCount,
    maxRetries: state.maxRetries,
    retryBudgetRemaining: retryBudgetRemaining(
      state.retryCount,
      state.maxRetries,
    ),
  };
  // Notification already happened, either recorded as a timestamp or as the
  // status itself — nothing left to do.
  if (state.notifiedAt !== null) {
    return { action: "skip", reasonCode: "already-notified", ...budget };
  }
  if (state.status === "notified") {
    return { action: "skip", reasonCode: "notified", ...budget };
  }
  // Queued units are always eligible to start.
  if (state.status === "queued") {
    return { action: "dispatch", reasonCode: "queued", ...budget };
  }
  // Active or claimed units must not get a second concurrent copy.
  if (!isTerminalUnitRuntimeStatus(state.status)) {
    return { action: "skip", reasonCode: "active-or-claimed", ...budget };
  }
  // Synthetic orchestration units require an explicit reset after any
  // non-completed terminal state instead of an automatic rerun.
  const synthetic = options.synthetic ?? isSyntheticUnitRuntime(record);
  if (synthetic && state.status !== "completed") {
    return {
      action: "block",
      reasonCode: "synthetic-reset-required",
      ...budget,
    };
  }
  // Retryable failures consume budget; once exhausted they block.
  if (RETRYABLE_TERMINAL_STATUSES.has(state.status)) {
    return budget.retryBudgetRemaining > 0
      ? { action: "retry", reasonCode: "retry-budget-available", ...budget }
      : { action: "block", reasonCode: "retry-budget-exhausted", ...budget };
  }
  const readyToNotify =
    state.status === "completed" ||
    state.status === "blocked" ||
    state.status === "cancelled";
  if (readyToNotify) {
    return {
      action: "notify",
      reasonCode: "terminal-ready-to-notify",
      ...budget,
    };
  }
  // Defensive fall-through for any future terminal status.
  return { action: "skip", reasonCode: "terminal-nonretryable", ...budget };
}
function runtimeDir(basePath: string): string {
return join(sfRoot(basePath), "runtime", "units");
}
@ -105,25 +477,68 @@ export function writeUnitRuntimeRecord(
mkdirSync(dir, { recursive: true });
const path = runtimePath(basePath, unitType, unitId);
const prev = _runtimeCache.get(path) ?? null;
const phase =
updates.phase ??
(updates.status ? phaseForStatus(updates.status) : prev?.phase) ??
"dispatched";
const status =
updates.status ??
(updates.phase || !prev?.status
? inferStatusFromPhase(phase, {
runawayGuardPause:
updates.runawayGuardPause ?? prev?.runawayGuardPause,
})
: prev.status);
const recoveryAttempts = hasUpdate(updates, "recoveryAttempts")
? (updates.recoveryAttempts ?? 0)
: (prev?.recoveryAttempts ?? 0);
const retryCount = hasUpdate(updates, "retryCount")
? (updates.retryCount ?? 0)
: hasUpdate(updates, "recoveryAttempts")
? (updates.recoveryAttempts ?? 0)
: (prev?.retryCount ?? recoveryAttempts ?? 0);
const next: AutoUnitRuntimeRecord = {
version: 1,
unitType,
unitId,
startedAt,
updatedAt: Date.now(),
phase: updates.phase ?? prev?.phase ?? "dispatched",
phase,
status,
wrapupWarningSent:
updates.wrapupWarningSent ?? prev?.wrapupWarningSent ?? false,
continueHereFired:
updates.continueHereFired ?? prev?.continueHereFired ?? false,
timeoutAt: updates.timeoutAt ?? prev?.timeoutAt ?? null,
timeoutAt: hasUpdate(updates, "timeoutAt")
? (updates.timeoutAt ?? null)
: (prev?.timeoutAt ?? null),
lastHeartbeatAt: hasUpdate(updates, "lastHeartbeatAt")
? (updates.lastHeartbeatAt ?? null)
: (prev?.lastHeartbeatAt ?? startedAt),
lastProgressAt:
updates.lastProgressAt ?? prev?.lastProgressAt ?? Date.now(),
progressCount: updates.progressCount ?? prev?.progressCount ?? 0,
lastProgressKind:
updates.lastProgressKind ?? prev?.lastProgressKind ?? "dispatch",
lastOutputAt: hasUpdate(updates, "lastOutputAt")
? (updates.lastOutputAt ?? null)
: (prev?.lastOutputAt ?? null),
outputPath: hasUpdate(updates, "outputPath")
? (updates.outputPath ?? null)
: (prev?.outputPath ?? null),
watchdogReason: hasUpdate(updates, "watchdogReason")
? (updates.watchdogReason ?? null)
: (prev?.watchdogReason ?? null),
notifiedAt: hasUpdate(updates, "notifiedAt")
? (updates.notifiedAt ?? null)
: (prev?.notifiedAt ?? null),
recovery: updates.recovery ?? prev?.recovery,
recoveryAttempts: updates.recoveryAttempts ?? prev?.recoveryAttempts ?? 0,
recoveryAttempts,
retryCount,
maxRetries:
updates.maxRetries ??
prev?.maxRetries ??
DEFAULT_UNIT_RUNTIME_MAX_RETRIES,
lastRecoveryReason: updates.lastRecoveryReason ?? prev?.lastRecoveryReason,
runawayGuardPause: updates.runawayGuardPause ?? prev?.runawayGuardPause,
};

View file

@ -8,7 +8,7 @@ import {
} from "node:fs";
import { tmpdir } from "node:os";
import { join, resolve } from "node:path";
import { test, afterEach } from 'vitest';
import { afterEach, test } from "vitest";
const projectRoot = process.cwd();
@ -954,3 +954,42 @@ test("reapOrphanedNextServerProcesses returns zero reaped on non-Linux platforms
test("reapOrphanedNextServerProcesses is exported and callable", () => {
  // Guards against the export being renamed or dropped during refactors.
  const reap = webMode.reapOrphanedNextServerProcesses;
  assert.equal(typeof reap, "function");
});
test("reapOrphanedNextServerProcesses kills orphaned standalone next-server", () => {
  const killCalls: Array<{ pid: number; signal: string }> = [];
  const stderrText: string[] = [];
  const fakePackageRoot = "/tmp/sf-package";
  // Fake `ps` output: pid 123 is an orphan (ppid 1) inside our standalone
  // dir, pid 124 has a live parent, pid 125 runs elsewhere entirely.
  const psLines = [
    "123 1 node /tmp/sf-package/dist/web/standalone/node_modules/next/dist/server/next-server.js node",
    "124 999 node /tmp/sf-package/dist/web/standalone/node_modules/next/dist/server/next-server.js node",
    "125 1 node /elsewhere/next-server.js node",
  ].join("\n");
  const fakeStderr = {
    write: (chunk: string) => {
      stderrText.push(chunk);
      return true;
    },
  };
  const result = webMode.reapOrphanedNextServerProcesses(
    fakeStderr,
    fakePackageRoot,
    {
      platform: "linux",
      execSync: (() => psLines) as any,
      readlinkSync: ((path: string) => {
        if (path === "/proc/123/cwd")
          return "/tmp/sf-package/dist/web/standalone";
        if (path === "/proc/124/cwd")
          return "/tmp/sf-package/dist/web/standalone";
        return "/elsewhere";
      }) as any,
      kill: ((pid: number, signal: string) => {
        killCalls.push({ pid, signal });
        return true;
      }) as any,
    },
  );
  // Only the orphaned process in the standalone dir should be reaped.
  assert.equal(result.reaped, 1);
  assert.deepEqual(killCalls, [{ pid: 123, signal: "SIGTERM" }]);
  assert.match(stderrText.join(""), /Reaped orphaned next-server/);
});

View file

@ -6,7 +6,13 @@ import {
spawn,
} from "node:child_process";
import { randomBytes } from "node:crypto";
import { existsSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
import {
existsSync,
readFileSync,
readlinkSync,
unlinkSync,
writeFileSync,
} from "node:fs";
import { request as httpRequest } from "node:http";
import { createServer } from "node:net";
import { join, resolve } from "node:path";
@ -16,10 +22,7 @@ import {
} from "./app-paths.js";
const DEFAULT_HOST = "127.0.0.1";
const DEFAULT_PACKAGE_ROOT = resolve(
import.meta.dirname,
"..",
);
const DEFAULT_PACKAGE_ROOT = resolve(import.meta.dirname, "..");
/** Open a URL in the user's default browser. */
function openBrowser(url: string): void {
@ -685,10 +688,17 @@ function cleanupStaleInstance(
export function reapOrphanedNextServerProcesses(
stderr: WritableLike,
packageRoot = DEFAULT_PACKAGE_ROOT,
deps: {
execSync?: typeof execSync;
readlinkSync?: typeof readlinkSync;
kill?: typeof process.kill;
platform?: NodeJS.Platform;
} = {},
): { reaped: number; errors: string[] } {
const errors: string[] = [];
let reaped = 0;
if (process.platform === "win32") {
const platform = deps.platform ?? process.platform;
if (platform === "win32") {
// Windows orphan detection not implemented; rely on port-kill fallback
return { reaped: 0, errors: [] };
}
@ -696,10 +706,10 @@ export function reapOrphanedNextServerProcesses(
// Find next-server processes with cwd matching our standalone host path
const standalonePath = resolve(packageRoot, "dist", "web", "standalone");
// Use ps to find node processes with next-server in their command line
const psOutput = execSync(
const psOutput = (deps.execSync ?? execSync)(
"ps -eo pid,ppid,cmd,comm --no-headers",
{ encoding: "utf8", timeout: 5000 },
);
) as string;
const lines = psOutput.split("\n").filter((line) => line.trim());
for (const line of lines) {
const parts = line.trim().split(/\s+/);
@ -715,7 +725,7 @@ export function reapOrphanedNextServerProcesses(
// Check if the process cwd matches our standalone path (or deleted variant)
let cwd: string | null = null;
try {
cwd = readFileSync(`/proc/${pid}/cwd`, "utf8").trim();
cwd = (deps.readlinkSync ?? readlinkSync)(`/proc/${pid}/cwd`);
} catch {
// Process may have exited between ps and readlink
continue;
@ -728,7 +738,7 @@ export function reapOrphanedNextServerProcesses(
const isOrphan = ppid === 1;
if (isOrphan) {
try {
process.kill(pid, "SIGTERM");
(deps.kill ?? process.kill)(pid, "SIGTERM");
reaped++;
stderr.write(
`[forge] Reaped orphaned next-server (pid=${pid}, cwd=${cwd})\n`,