fix(watchdog): pre-flight smoke + crash-loop backoff

Two guards added after today's 2-hour crash-loop on missing DEFAULT_STALE_TIMEOUT_MS export:

1. Pre-flight smoke test: `sf --version` must succeed before each cycle. If dist is broken (missing export, syntax error), pause 5min + log loudly instead of immediately respawning into the same crash.
2. Crash-loop detection: 3 consecutive <90s failure exits → assume crash-loop, back off 5min before retry. Prevents the "100 crashes in 2 hours, 0 useful work" pattern we just hit.

Together: a broken dist causes ONE crash + a 5min pause, not a 2-hour CPU burn. Operator notices the pause in .sf/watchdog.log and intervenes; in the meantime no resources are wasted.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-17 08:31:07 +02:00 · 2026-05-17 08:31:07 +02:00 · 8122a2b6c7
commit 8122a2b6c7
parent 80ede48f06
1 changed files with 33 additions and 0 deletions
--- a/scripts/sf-autonomous-watchdog.sh
+++ b/scripts/sf-autonomous-watchdog.sh
@ -21,6 +21,12 @@ TIMEOUT_MS=1800000  # 30 min per SF cycle

 echo "[$(date -u +%FT%TZ)] watchdog start, pid=$$" >> "$LOG"

+CRASHLOOP_THRESHOLD=3      # consecutive <90s failures that trigger a back-off
+CRASHLOOP_BACKOFF_S=300    # 5min pause; used by both the pre-flight failure path and the crash-loop path
+CRASHLOOP_COUNT=0          # running streak of short failed runs; reset when the streak breaks or after a back-off
+LAST_EXIT_CODE=0           # exit code of the previous sf run; 0 until the first run completes
+LAST_ELAPSED=999           # seconds the previous sf run lasted; initial value is above the 90s cutoff tested in the loop
+
 while true; do
    # Clean stale state from prior crash / lock holder.
    # Also clear active.json — a stuck "in-progress" unit from a crashed
@ -30,6 +36,31 @@ while true; do
    rm -f .sf/runtime/autonomous-solver/active.json 2>/dev/null
    echo '{"ids":[],"dispatchedAt":null}' > .sf/runtime/self-feedback-inline-fix.json

+    # #wiggums: pre-flight smoke test — `sf --version` must succeed before
+    # starting an autonomous cycle. If it fails, dist is broken (e.g.
+    # missing export, syntax error) and there's no point looping. Pause
+    # for 5min + log loudly so operator notices.
+    if ! sf --version >/dev/null 2>&1; then  # output discarded; only the exit status matters
+        echo "[$(date -u +%FT%TZ)] PRE-FLIGHT FAIL: sf --version errored — dist may be broken. Pausing ${CRASHLOOP_BACKOFF_S}s." >> "$LOG"
+        sleep "$CRASHLOOP_BACKOFF_S"
+        continue  # back to the top of the loop; the crash-loop check below is skipped this iteration
+    fi
+
+    # Crash-loop detection: 3 consecutive <90s exits = SF is crashing at
+    # startup. Back off 5min instead of hammering it. Two hours of
+    # 65s/cycle crashes on 2026-05-17 (DEFAULT_STALE_TIMEOUT_MS missing
+    # export) is what motivated this guard.
+    if [ "$LAST_EXIT_CODE" != "0" ] && [ "$LAST_ELAPSED" -lt 90 ]; then  # previous run failed AND lasted under 90s
+        CRASHLOOP_COUNT=$((CRASHLOOP_COUNT + 1))  # extend the streak of short failures
+        if [ "$CRASHLOOP_COUNT" -ge "$CRASHLOOP_THRESHOLD" ]; then
+            echo "[$(date -u +%FT%TZ)] CRASH-LOOP DETECTED ($CRASHLOOP_COUNT consecutive <90s failures). Backing off ${CRASHLOOP_BACKOFF_S}s before retry." >> "$LOG"
+            sleep "$CRASHLOOP_BACKOFF_S"
+            CRASHLOOP_COUNT=0  # start a fresh streak after backing off
+        fi
+    else
+        CRASHLOOP_COUNT=0  # streak broken: previous run exited 0 or lasted at least 90s
+    fi
+
    # Make sure no orphan sf processes hold resources
    pgrep -f "sf-from-source headless autonomous" | xargs -r kill -9 2>/dev/null
    sleep 2
@ -48,6 +79,8 @@ while true; do

    exit_code=$?
    elapsed=$(( $(date +%s) - started_at ))
+    LAST_EXIT_CODE="$exit_code"  # read by the crash-loop check near the top of the loop
+    LAST_ELAPSED="$elapsed"      # whole seconds; compared against the 90s cutoff in that check

    echo "[$(date -u +%FT%TZ)] sf exited code=${exit_code} elapsed=${elapsed}s" >> "$LOG"