From 73a464f574c365e28c8b143b32bda3a5bbd6ceb6 Mon Sep 17 00:00:00 2001
From: Mikael Hugo <mikkihugo@users.noreply.github.com>
Date: Sun, 17 May 2026 03:39:08 +0200
Subject: [PATCH] feat(ops): SF autonomous watchdog for continuous unattended
 dispatch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

scripts/sf-autonomous-watchdog.sh — bash daemon that supervises
`sf headless autonomous` across crashes/timeouts. Per-cycle:
  1. Cleans stale state (lock + zombie inline-fix dispatch)
  2. Kills orphan sf processes from prior runs
  3. Launches sf with 30-min hard timeout (longest sf accepts cleanly)
  4. On exit (timeout / dispatch-stop / crash), logs and restarts after
     15s cooldown (10min cooldown if all milestones complete)

Run: nohup bash scripts/sf-autonomous-watchdog.sh > .sf/watchdog.log 2>&1 &
Stop: pkill -f sf-autonomous-watchdog

This is the operational mode for the 2-4 week delivery horizon — SF
runs continuously, the watchdog catches all exit conditions, and
progress accumulates across many autonomous cycles.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 scripts/sf-autonomous-watchdog.sh | 52 +++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100755 scripts/sf-autonomous-watchdog.sh

diff --git a/scripts/sf-autonomous-watchdog.sh b/scripts/sf-autonomous-watchdog.sh
new file mode 100755
index 000000000..f58b9494a
--- /dev/null
+++ b/scripts/sf-autonomous-watchdog.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# sf-autonomous-watchdog.sh — keep `sf headless autonomous` running across crashes / timeouts.
+#
+# Run as: nohup bash scripts/sf-autonomous-watchdog.sh > .sf/watchdog.log 2>&1 &
+# Stop:   pkill -f sf-autonomous-watchdog
+#
+# Each SF run uses a 30-minute hard timeout (the longest sf accepts cleanly).
+# When it exits (timeout, dispatch-stop, or crash), the watchdog:
+#   1. Logs the exit code and elapsed time
+#   2. Waits a cooldown
+#   3. Cleans stale state (lock, unit files, inline-fix dispatch) and kills orphan sf processes
+#   4. Restarts SF
+# Loops forever until killed.
+
+set -u
+cd "$(dirname "$0")/.." || exit 1  # parent of this script's directory, so the relative .sf/ paths resolve
+
+LOG=".sf/watchdog.log"
+COOLDOWN_S=15
+TIMEOUT_MS=1800000  # 30 min per SF cycle
+
+echo "[$(date -u +%FT%TZ)] watchdog start, pid=$$" >> "$LOG"
+
+while true; do
+    # Clean stale state from prior crash / lock holder
+    rm -f .sf/sf.lock
+    rm -f .sf/runtime/units/*.json 2>/dev/null
+    echo '{"ids":[],"dispatchedAt":null}' > .sf/runtime/self-feedback-inline-fix.json
+
+    # Make sure no orphan sf processes hold resources.
+    # NOTE(review): this matches "sf-from-source" while the launch below runs `sf` — confirm the name.
+    pgrep -f "sf-from-source headless autonomous" | xargs -r kill -9 2>/dev/null
+    sleep 2
+
+    # Each cycle writes its own log; keep the path so the check below reads only this run.
+    started_at=$(date +%s)
+    run_log=".sf/watchdog-run-$(date -u +%Y%m%dT%H%M%SZ).log"
+    echo "[$(date -u +%FT%TZ)] starting sf headless autonomous (timeout=${TIMEOUT_MS}ms)" >> "$LOG"
+    # `timeout 1900` (seconds) is an outer backstop, slightly longer than sf's own 30-min --timeout.
+    SF_DEBUG=1 timeout 1900 sf headless autonomous --timeout "$TIMEOUT_MS" --json > "$run_log" 2>&1
+    exit_code=$?
+    elapsed=$(( $(date +%s) - started_at ))
+    echo "[$(date -u +%FT%TZ)] sf exited code=${exit_code} elapsed=${elapsed}s" >> "$LOG"
+
+    # If this run's log reports dispatch-stop / all-complete, there's nothing to do — sleep longer.
+    if grep -q -e "All milestones complete" -e "dispatch-stop" "$run_log" 2>/dev/null; then
+        echo "[$(date -u +%FT%TZ)] all milestones complete — long sleep before next check" >> "$LOG"
+        sleep 600
+    else
+        sleep "$COOLDOWN_S"
+    fi
+done