From 73a464f574c365e28c8b143b32bda3a5bbd6ceb6 Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Sun, 17 May 2026 03:39:08 +0200 Subject: [PATCH] feat(ops): SF autonomous watchdog for continuous unattended dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scripts/sf-autonomous-watchdog.sh — bash daemon that supervises `sf headless autonomous` across crashes/timeouts. Per-cycle: 1. Cleans stale state (lock + zombie inline-fix dispatch) 2. Kills orphan sf processes from prior runs 3. Launches sf with 30-min hard timeout (longest sf accepts cleanly) 4. On exit (timeout / dispatch-stop / crash), logs and restarts after 15s cooldown (10min cooldown if all milestones complete) Run: nohup bash scripts/sf-autonomous-watchdog.sh > .sf/watchdog.log 2>&1 & Stop: pkill -f sf-autonomous-watchdog This is the operational mode for the 2-4 week delivery horizon — SF runs continuously, the watchdog catches all exit conditions, and progress accumulates across many autonomous cycles. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/sf-autonomous-watchdog.sh | 52 +++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100755 scripts/sf-autonomous-watchdog.sh diff --git a/scripts/sf-autonomous-watchdog.sh b/scripts/sf-autonomous-watchdog.sh new file mode 100755 index 000000000..f58b9494a --- /dev/null +++ b/scripts/sf-autonomous-watchdog.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# sf-autonomous-watchdog.sh — keep `sf headless autonomous` running across crashes / timeouts. +# +# Run as: nohup bash scripts/sf-autonomous-watchdog.sh > .sf/watchdog.log 2>&1 & +# Stop: pkill -f sf-autonomous-watchdog +# +# Each SF run uses a 30-minute hard timeout (the longest sf accepts cleanly). +# When it exits (timeout, dispatch-stop, or crash), the watchdog: +# 1. Logs the exit reason +# 2. Cleans stale state (lock + zombie inline-fix dispatch) +# 3. Waits a short cooldown +# 4. Restarts SF +# Loops forever until killed. 
# Only -u is enabled. -e is deliberately left off: a supervisor has to survive
# failures of its own housekeeping commands and keep looping.
set -u
cd "$(dirname "$0")/.." || exit 1

readonly LOG=".sf/watchdog.log"
readonly COOLDOWN_S=15        # pause between ordinary restarts
readonly IDLE_COOLDOWN_S=600  # pause when the last run reported nothing to do
readonly TIMEOUT_MS=1800000   # 30 min per SF cycle (passed to `sf --timeout`)
# Outer safety net enforced by timeout(1): sf's own timeout plus 100 s of slack.
readonly OUTER_TIMEOUT_S=$(( TIMEOUT_MS / 1000 + 100 ))
# Grace period between timeout(1)'s SIGTERM and its follow-up SIGKILL.
readonly KILL_GRACE_S=30
# Markers in a run log that mean "nothing left to dispatch" (extended regex).
readonly IDLE_PATTERN='All milestones complete|dispatch-stop'
# How many trailing lines of the run log are searched for IDLE_PATTERN.
# NOTE(review): 5 mirrors the original `tail -5`; widen it if sf prints debug
# output after the marker.
readonly IDLE_TAIL_LINES=5

#######################################
# Append one UTC-timestamped line to the watchdog log.
# Globals:   LOG (read)
# Arguments: message words
#######################################
log() {
  printf '[%s] %s\n' "$(date -u +%FT%TZ)" "$*" >> "$LOG"
}

#######################################
# Translate the exit status of `timeout ... sf ...` into a short label.
# Arguments: $1 - numeric exit status
# Outputs:   label on stdout (no trailing newline)
#######################################
describe_exit() {
  local -r code=$1
  case "$code" in
    0)   printf 'clean-exit' ;;
    124) printf 'watchdog-timeout' ;;   # timeout(1) had to send SIGTERM
    125) printf 'timeout-failed' ;;     # timeout(1) itself failed
    126) printf 'sf-not-executable' ;;
    127) printf 'sf-not-found' ;;
    137) printf 'sigkill' ;;            # 128+9: escalated kill or external KILL
    *)
      if (( code > 128 )); then
        printf 'signal-%d' "$(( code - 128 ))"
      else
        printf 'error'
      fi
      ;;
  esac
}

#######################################
# Terminate leftover sf processes from a previous cycle: SIGTERM first, then
# SIGKILL for anything still alive two seconds later.
# NOTE(review): the pattern is "sf-from-source ..." while this script launches
# `sf ...`; presumably `sf` is a wrapper around sf-from-source. Confirm.
#######################################
stop_orphans() {
  local -r pattern='sf-from-source headless autonomous'
  if pkill -TERM -f "$pattern" 2>/dev/null; then
    sleep 2
    if pkill -KILL -f "$pattern" 2>/dev/null; then
      sleep 1  # give the kernel a moment to release what they held
    fi
  fi
}

#######################################
# Reset on-disk state left behind by a crashed or killed run.
# Must run AFTER stop_orphans so no dying process can rewrite it.
#######################################
clean_stale_state() {
  mkdir -p .sf/runtime
  rm -f -- .sf/sf.lock
  rm -f -- .sf/runtime/units/*.json 2>/dev/null
  printf '%s\n' '{"ids":[],"dispatchedAt":null}' \
    > .sf/runtime/self-feedback-inline-fix.json
}

#######################################
# Report whether the given run log ended with an idle marker.
# Only the log of the run that just finished is inspected. Scanning every
# historical watchdog-run-*.log would latch onto one old match forever.
# Arguments: $1 - path to the run log
# Returns:   0 if an idle marker is present, non-zero otherwise
#######################################
run_was_idle() {
  local -r run_log=$1
  tail -n "$IDLE_TAIL_LINES" -- "$run_log" 2>/dev/null \
    | grep -Eq -- "$IDLE_PATTERN"
}

#######################################
# Supervisor loop: clean up, launch sf, log the outcome, cool down, repeat.
# Never returns; stop it by killing the watchdog process.
#######################################
main() {
  local run_log started_at exit_code elapsed

  log "watchdog start, pid=$$"

  while true; do
    stop_orphans
    clean_stale_state

    run_log=".sf/watchdog-run-$(date -u +%Y%m%dT%H%M%SZ).log"
    started_at=$(date +%s)
    log "starting sf headless autonomous (timeout=${TIMEOUT_MS}ms)"

    # --kill-after escalates to SIGKILL when sf ignores the SIGTERM sent at
    # OUTER_TIMEOUT_S, so a wedged sf can never stall the watchdog.
    SF_DEBUG=1 timeout --kill-after="$KILL_GRACE_S" "$OUTER_TIMEOUT_S" \
      sf headless autonomous --timeout "$TIMEOUT_MS" --json \
      > "$run_log" 2>&1
    exit_code=$?
    elapsed=$(( $(date +%s) - started_at ))

    log "sf exited code=${exit_code} elapsed=${elapsed}s reason=$(describe_exit "$exit_code")"

    # If dispatch-stop / all-complete, sleep longer before the next poll,
    # because there is nothing to do.
    if run_was_idle "$run_log"; then
      log "all milestones complete — long sleep before next check"
      sleep "$IDLE_COOLDOWN_S"
    else
      sleep "$COOLDOWN_S"
    fi
  done
}

main "$@"