From 8122a2b6c72f58183ed8446bfbc4cd9402a86457 Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Sun, 17 May 2026 08:31:07 +0200 Subject: [PATCH] fix(watchdog): pre-flight smoke + crash-loop backoff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two guards added after today's 2-hour crash-loop on missing DEFAULT_STALE_TIMEOUT_MS export: 1. Pre-flight smoke test: `sf --version` must succeed before each cycle. If dist is broken (missing export, syntax error), pause 5min + log loudly instead of immediately respawning into the same crash. 2. Crash-loop detection: 3 consecutive <90s failure exits → assume crash-loop, back off 5min before retry. Prevents the "100 crashes in 2 hours, 0 useful work" pattern we just hit. Together: a broken dist causes ONE crash + a 5min pause, not a 2-hour CPU burn. Operator notices the pause in .sf/watchdog.log and intervenes; in the meantime no resources are wasted. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/sf-autonomous-watchdog.sh | 33 +++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/scripts/sf-autonomous-watchdog.sh b/scripts/sf-autonomous-watchdog.sh index 24a4bf748..44fdd1ef1 100755 --- a/scripts/sf-autonomous-watchdog.sh +++ b/scripts/sf-autonomous-watchdog.sh @@ -21,6 +21,12 @@ TIMEOUT_MS=1800000 # 30 min per SF cycle echo "[$(date -u +%FT%TZ)] watchdog start, pid=$$" >> "$LOG" +CRASHLOOP_THRESHOLD=3 # consecutive <90s failures +CRASHLOOP_BACKOFF_S=300 # 5min pause after crash-loop detected +CRASHLOOP_COUNT=0 +LAST_EXIT_CODE=0 +LAST_ELAPSED=999 + while true; do # Clean stale state from prior crash / lock holder. 
# Also clear active.json — a stuck "in-progress" unit from a crashed @@ -30,6 +36,31 @@ while true; do rm -f .sf/runtime/autonomous-solver/active.json 2>/dev/null echo '{"ids":[],"dispatchedAt":null}' > .sf/runtime/self-feedback-inline-fix.json + # #wiggums: pre-flight smoke test — `sf --version` must succeed before + # starting an autonomous cycle. If it fails, dist is broken (e.g. + # missing export, syntax error) and there's no point looping. Pause + # for 5min + log loudly so operator notices. + if ! sf --version >/dev/null 2>&1; then + echo "[$(date -u +%FT%TZ)] PRE-FLIGHT FAIL: sf --version errored — dist may be broken. Pausing ${CRASHLOOP_BACKOFF_S}s." >> "$LOG" + sleep "$CRASHLOOP_BACKOFF_S" + continue + fi + + # Crash-loop detection: 3 consecutive <90s exits = SF is crashing at + # startup. Back off 5min instead of hammering it. Two hours of + # 65s/cycle crashes on 2026-05-17 (DEFAULT_STALE_TIMEOUT_MS missing + # export) is what motivated this guard. + if [ "$LAST_EXIT_CODE" != "0" ] && [ "$LAST_ELAPSED" -lt 90 ]; then + CRASHLOOP_COUNT=$((CRASHLOOP_COUNT + 1)) + if [ "$CRASHLOOP_COUNT" -ge "$CRASHLOOP_THRESHOLD" ]; then + echo "[$(date -u +%FT%TZ)] CRASH-LOOP DETECTED ($CRASHLOOP_COUNT consecutive <90s failures). Backing off ${CRASHLOOP_BACKOFF_S}s before retry." >> "$LOG" + sleep "$CRASHLOOP_BACKOFF_S" + CRASHLOOP_COUNT=0 + fi + else + CRASHLOOP_COUNT=0 + fi + # Make sure no orphan sf processes hold resources pgrep -f "sf-from-source headless autonomous" | xargs -r kill -9 2>/dev/null sleep 2 @@ -48,6 +79,8 @@ while true; do exit_code=$? elapsed=$(( $(date +%s) - started_at )) + LAST_EXIT_CODE="$exit_code" + LAST_ELAPSED="$elapsed" echo "[$(date -u +%FT%TZ)] sf exited code=${exit_code} elapsed=${elapsed}s" >> "$LOG"