fix(watchdog): pre-flight smoke + crash-loop backoff
Two guards added after today's 2-hour crash-loop on missing DEFAULT_STALE_TIMEOUT_MS export: 1. Pre-flight smoke test: `sf --version` must succeed before each cycle. If dist is broken (missing export, syntax error), pause 5min + log loudly instead of immediately respawning into the same crash. 2. Crash-loop detection: 3 consecutive <90s failure exits → assume crash-loop, back off 5min before retry. Prevents the "100 crashes in 2 hours, 0 useful work" pattern we just hit. Together: a broken dist causes ONE crash + a 5min pause, not a 2-hour CPU burn. Operator notices the pause in .sf/watchdog.log and intervenes; in the meantime no resources wasted. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
80ede48f06
commit
8122a2b6c7
1 changed files with 33 additions and 0 deletions
|
|
@@ -21,6 +21,12 @@ TIMEOUT_MS=1800000 # 30 min per SF cycle
|
|||
|
||||
echo "[$(date -u +%FT%TZ)] watchdog start, pid=$$" >> "$LOG"
|
||||
|
||||
CRASHLOOP_THRESHOLD=3 # consecutive <90s failures
|
||||
CRASHLOOP_BACKOFF_S=300 # 5min pause after crash-loop detected
|
||||
CRASHLOOP_COUNT=0
|
||||
LAST_EXIT_CODE=0
|
||||
LAST_ELAPSED=999
|
||||
|
||||
while true; do
|
||||
# Clean stale state from prior crash / lock holder.
|
||||
# Also clear active.json — a stuck "in-progress" unit from a crashed
|
||||
|
|
@@ -30,6 +36,31 @@ while true; do
|
|||
rm -f .sf/runtime/autonomous-solver/active.json 2>/dev/null
|
||||
echo '{"ids":[],"dispatchedAt":null}' > .sf/runtime/self-feedback-inline-fix.json
|
||||
|
||||
# #wiggums: pre-flight smoke test — `sf --version` must succeed before
|
||||
# starting an autonomous cycle. If it fails, dist is broken (e.g.
|
||||
# missing export, syntax error) and there's no point looping. Pause
|
||||
# for 5min + log loudly so operator notices.
|
||||
if ! sf --version >/dev/null 2>&1; then
|
||||
echo "[$(date -u +%FT%TZ)] PRE-FLIGHT FAIL: sf --version errored — dist may be broken. Pausing ${CRASHLOOP_BACKOFF_S}s." >> "$LOG"
|
||||
sleep "$CRASHLOOP_BACKOFF_S"
|
||||
continue
|
||||
fi
|
||||
|
||||
# Crash-loop detection: 3 consecutive <90s exits = SF is crashing at
|
||||
# startup. Back off 5min instead of hammering it. Two hours of
|
||||
# 65s/cycle crashes on 2026-05-17 (DEFAULT_STALE_TIMEOUT_MS missing
|
||||
# export) is what motivated this guard.
|
||||
if [ "$LAST_EXIT_CODE" != "0" ] && [ "$LAST_ELAPSED" -lt 90 ]; then
|
||||
CRASHLOOP_COUNT=$((CRASHLOOP_COUNT + 1))
|
||||
if [ "$CRASHLOOP_COUNT" -ge "$CRASHLOOP_THRESHOLD" ]; then
|
||||
echo "[$(date -u +%FT%TZ)] CRASH-LOOP DETECTED ($CRASHLOOP_COUNT consecutive <90s failures). Backing off ${CRASHLOOP_BACKOFF_S}s before retry." >> "$LOG"
|
||||
sleep "$CRASHLOOP_BACKOFF_S"
|
||||
CRASHLOOP_COUNT=0
|
||||
fi
|
||||
else
|
||||
CRASHLOOP_COUNT=0
|
||||
fi
|
||||
|
||||
# Make sure no orphan sf processes hold resources
|
||||
pgrep -f "sf-from-source headless autonomous" | xargs -r kill -9 2>/dev/null
|
||||
sleep 2
|
||||
|
|
@@ -48,6 +79,8 @@ while true; do
|
|||
|
||||
exit_code=$?
|
||||
elapsed=$(( $(date +%s) - started_at ))
|
||||
LAST_EXIT_CODE="$exit_code"
|
||||
LAST_ELAPSED="$elapsed"
|
||||
|
||||
echo "[$(date -u +%FT%TZ)] sf exited code=${exit_code} elapsed=${elapsed}s" >> "$LOG"
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue