diff --git a/.sf/backups/db/sf.db.2026-05-10T02-01-37-759Z b/.sf/backups/db/sf.db.2026-05-10T02-01-37-759Z new file mode 100644 index 000000000..f9e2dcadc Binary files /dev/null and b/.sf/backups/db/sf.db.2026-05-10T02-01-37-759Z differ diff --git a/.sf/backups/db/sf.db.2026-05-10T02-27-22-542Z b/.sf/backups/db/sf.db.2026-05-10T02-27-22-542Z new file mode 100644 index 000000000..f41e6d041 Binary files /dev/null and b/.sf/backups/db/sf.db.2026-05-10T02-27-22-542Z differ diff --git a/.sf/metrics.db b/.sf/metrics.db index 66b5b9677..01cdc98be 100644 Binary files a/.sf/metrics.db and b/.sf/metrics.db differ diff --git a/.sf/model-performance.json b/.sf/model-performance.json index f83e4283c..38e4fcf73 100644 --- a/.sf/model-performance.json +++ b/.sf/model-performance.json @@ -23,14 +23,26 @@ "total": 1 }, "minimax/MiniMax-M2.7-highspeed": { - "successes": 1, + "successes": 2, "failures": 0, "timeouts": 0, - "totalTokens": 0, - "totalCost": 0, - "lastUsed": "2026-05-10T00:50:07.124Z", + "totalTokens": 891034, + "totalCost": 0.20030757, + "lastUsed": "2026-05-10T01:24:00.207Z", "successRate": 1, - "total": 1 + "total": 2 + } + }, + "discuss-milestone": { + "minimax/MiniMax-M2.7-highspeed": { + "successes": 2, + "failures": 0, + "timeouts": 0, + "totalTokens": 8639600, + "totalCost": 2.0647307100000005, + "lastUsed": "2026-05-10T01:43:48.671Z", + "successRate": 1, + "total": 2 } } } \ No newline at end of file diff --git a/src/resources/extensions/sf/auto-dashboard.js b/src/resources/extensions/sf/auto-dashboard.js index 277737a96..6be63ce9f 100644 --- a/src/resources/extensions/sf/auto-dashboard.js +++ b/src/resources/extensions/sf/auto-dashboard.js @@ -167,8 +167,8 @@ function formatSolverWidgetLine(basePath, theme, width, pad) { .join(" · "); return truncateToWidth(`${pad}${theme.fg("dim", text)}`, width, "…"); } -function formatUokDiagnosticWidgetLine(basePath, theme, width, pad) { - const diagnostics = readUokDiagnostics(basePath); +function formatUokDiagnosticWidgetLine(basePath, theme, width, pad, cachedDiagnostics) { + const diagnostics = cachedDiagnostics !== undefined ? cachedDiagnostics : readUokDiagnostics(basePath); if (!diagnostics) return null; const parts = [ `uok ${diagnostics.verdict ?? "unknown"}`, @@ -607,6 +607,11 @@ export function updateProgressWidget( let cachedLines; let cachedWidth; let cachedRtkLabel; + // Cache health score and UOK diagnostics at 15s interval — recomputing + // them on every 1s spinner tick causes the widget height to change whenever + // the score level transitions, making the banner "bounce" on screen. + let cachedProgressScore = computeProgressScore(); + let cachedUokDiagnostics = readUokDiagnostics(accessors.getBasePath()); let activityFrame = 0; const refreshRtkLabel = () => { try { @@ -634,6 +639,9 @@ export function updateProgressWidget( updateSliceProgressCache(accessors.getBasePath(), mid.id, slice?.id); } refreshRtkLabel(); + // Refresh health score and diagnostics alongside other slow data + cachedProgressScore = computeProgressScore(); + cachedUokDiagnostics = readUokDiagnostics(accessors.getBasePath()); cachedLines = undefined; } catch (err) { /* non-fatal */ @@ -667,8 +675,9 @@ export function updateProgressWidget( const spinner = theme.fg("accent", ACTIVITY_FRAMES[activityFrame]); const elapsed = formatAutoElapsed(accessors.getAutoStartTime()); const modeTag = accessors.isStepMode() ? "NEXT" : "AUTO"; - // Health indicator in header - const score = computeProgressScore(); + // Health indicator in header — use 15s-cached score (not live) + // to keep widget height stable between refreshes. + const score = cachedProgressScore; const healthColor = score.level === "green" ? "success" @@ -782,6 +791,7 @@ export function updateProgressWidget( theme, width, pad, + cachedUokDiagnostics, ); if (diagnosticLine) lines.push(diagnosticLine); // Progress bar @@ -883,6 +893,7 @@ export function updateProgressWidget( theme, width, pad, + cachedUokDiagnostics, ); if (diagnosticLine) lines.push(diagnosticLine); lines.push(""); diff --git a/src/resources/extensions/sf/bootstrap/register-hooks.js b/src/resources/extensions/sf/bootstrap/register-hooks.js index d9fc59ee1..2af6671b1 100644 --- a/src/resources/extensions/sf/bootstrap/register-hooks.js +++ b/src/resources/extensions/sf/bootstrap/register-hooks.js @@ -8,6 +8,7 @@ import { hasResearchTerminalTransition, isAutoActive, isAutoPaused, + isCanAskUser, markResearchTerminalTransition, markToolEnd, markToolStart, @@ -730,6 +731,10 @@ export function registerHooks(pi, ecosystemHandlers = []) { // ── Discussion gate enforcement: track pending gate questions ───────── // Only gate-shaped ask_user_questions calls should block execution. // The gate stays pending until the user selects the approval option. + // In autonomous mode (canAskUser=false): block the gate question and + // instruct the agent to investigate and resolve its own uncertainty + // rather than waiting for human input. The question exists because + // confidence is low — the agent must research until it can decide. if (event.toolName === "ask_user_questions") { const questions = event.input?.questions ?? []; const questionId = questions.find( @@ -737,28 +742,61 @@ export function registerHooks(pi, ecosystemHandlers = []) { typeof question?.id === "string" && isGateQuestionId(question.id), )?.id; if (typeof questionId === "string") { + if (isAutoActive() && !isCanAskUser()) { + // Do not set a pending gate — the agent must self-resolve. + // The gate was triggered because confidence < 0.98. The agent + // must investigate until confidence reaches 0.98, then proceed. + const questionSummary = + questions.find((q) => q.id === questionId)?.question ?? questionId; + return { + block: true, + reason: + `AUTONOMOUS MODE — gate question blocked. You called ask_user_questions ` + + `because your confidence was below 0.98. Target: reach ≥ 0.98 confidence ` + + `through investigation, then proceed without pausing.\n\n` + + `Gate question (self-resolve this): "${questionSummary}"\n\n` + + `Confidence-building protocol (execute in order, stop when confidence ≥ 0.98):\n` + + `1. Tier 1 — Read artifacts: task plans, slice plans, .sf/DECISIONS.md, ` + + `.sf/REQUIREMENTS.md, .sf/CONTEXT.md, docs/records/, existing test coverage.\n` + + `2. Tier 1 — Run verification: lint, type-check, relevant tests. Read the output.\n` + + `3. Tier 2 — External lookup: WebSearch / WebFetch / Context7 for any factual ` + + `unknowns (API behavior, version compatibility, library defaults).\n` + + `4. Self-assess: rate your confidence 0–1. If ≥ 0.98 → proceed. ` + + `If still < 0.98 → there is a real, concrete blocker.\n` + + `5. Concrete blocker only: call sf_autonomous_checkpoint with outcome="blocked" ` + + `and a precise blockerReason naming the specific unresolvable gap ` + + `(missing credentials, impossible contract, safety violation, legal scope).\n\n` + + `Do NOT call ask_user_questions again. Do NOT pause for user input. ` + + `Factual gaps are YOUR job to close via Tier 1 and Tier 2.`, + }; + } setPendingGate(questionId); } } // ── Discussion gate enforcement: block tool calls while gate is pending ── // If ask_user_questions was called with a gate ID but hasn't been confirmed, // block all non-read-only tool calls to prevent the model from skipping gates. + // In autonomous mode: auto-clear any stale pending gate so it never blocks. if (getPendingGate()) { - const milestoneId = getDiscussionMilestoneId(discussionBasePath); - if (isToolCallEventType("bash", event)) { - const bashGuard = shouldBlockPendingGateBash( - event.input.command, - milestoneId, - isQueuePhaseActive(), - ); - if (bashGuard.block) return bashGuard; + if (isAutoActive() && !isCanAskUser()) { + clearPendingGate(); } else { - const gateGuard = shouldBlockPendingGate( - event.toolName, - milestoneId, - isQueuePhaseActive(), - ); - if (gateGuard.block) return gateGuard; + const milestoneId = getDiscussionMilestoneId(discussionBasePath); + if (isToolCallEventType("bash", event)) { + const bashGuard = shouldBlockPendingGateBash( + event.input.command, + milestoneId, + isQueuePhaseActive(), + ); + if (bashGuard.block) return bashGuard; + } else { + const gateGuard = shouldBlockPendingGate( + event.toolName, + milestoneId, + isQueuePhaseActive(), + ); + if (gateGuard.block) return gateGuard; + } } } // ── Queue-mode execution guard (#2545): block source-code mutations ── diff --git a/src/resources/extensions/sf/bootstrap/system-context.js b/src/resources/extensions/sf/bootstrap/system-context.js index e3f622401..8e4934b47 100644 --- a/src/resources/extensions/sf/bootstrap/system-context.js +++ b/src/resources/extensions/sf/bootstrap/system-context.js @@ -142,7 +142,7 @@ function warnDeprecatedAgentInstructions() { export function buildEscalationPolicyBlock(canAskUser) { const tier3 = canAskUser ? "Ask the user via `ask_user_questions` — but ONLY when (1) and (2) cannot resolve." - : "DO NOT call `ask_user_questions`. If (1) and (2) cannot resolve, exit with a structured blocker message naming the unresolved ambiguity."; + : "DO NOT call `ask_user_questions`. Target confidence ≥ 0.98 via Tier 1 and Tier 2 before acting. If confidence is still < 0.98 after exhausting both tiers, there is a real concrete blocker — exit with a structured blocker message naming the specific unresolvable gap (missing credentials, impossible contract, safety violation, legal scope). Factual questions (versions, API behavior, library defaults) are always Tier 1/2."; return `\n\n[INTERACTION POLICY — escalation tiers] Before producing any answer to a question or filling in any unknown, escalate through these tiers IN ORDER. Skip a tier only when it has been demonstrably diff --git a/src/resources/extensions/sf/metrics-central.js b/src/resources/extensions/sf/metrics-central.js index b55be6e56..472f4e826 100644 --- a/src/resources/extensions/sf/metrics-central.js +++ b/src/resources/extensions/sf/metrics-central.js @@ -195,7 +195,8 @@ class Gauge { } set(labels = {}, value) { - this.values.set(this._key(labels), value); + const safe = Number.isFinite(value) ? value : 0; + this.values.set(this._key(labels), safe); } get(labels = {}) { @@ -458,6 +459,9 @@ function persistMetricsToDb(registry, sessionId, _ignored) { const db = _metricsDb; if (!db) return; const ts = new Date().toISOString(); + function safeNum(n) { + return Number.isFinite(n) ? n : 0; + } try { const insert = db.prepare( "INSERT INTO metrics (name, type, labels, value, timestamp, session_id) VALUES (?, ?, ?, ?, ?, ?)", @@ -469,7 +473,7 @@ function persistMetricsToDb(registry, sessionId, _ignored) { c.name, "counter", JSON.stringify(labels), - value ?? 0, + safeNum(value), ts, sessionId, ); @@ -482,7 +486,7 @@ function persistMetricsToDb(registry, sessionId, _ignored) { g.name, "gauge", JSON.stringify(labels), - value ?? 0, + safeNum(value), ts, sessionId, ); @@ -493,7 +497,7 @@ function persistMetricsToDb(registry, sessionId, _ignored) { h.name, "histogram", JSON.stringify({ count: h.count, sum: h.sum }), - h.sum ?? 0, + safeNum(h.sum), ts, sessionId, ); diff --git a/src/resources/extensions/sf/tests/jsonl-schema-versioning.test.mjs b/src/resources/extensions/sf/tests/jsonl-schema-versioning.test.mjs index 8e35f678d..4e2a65abb 100644 --- a/src/resources/extensions/sf/tests/jsonl-schema-versioning.test.mjs +++ b/src/resources/extensions/sf/tests/jsonl-schema-versioning.test.mjs @@ -24,7 +24,7 @@ import { } from "../autonomous-solver.js"; import { triageTodoDump } from "../commands-todo.js"; import { emitJournalEvent, queryJournal } from "../journal.js"; -import { readJudgmentLog } from "../judgment-log.js"; +import { appendJudgment, readJudgmentLog } from "../judgment-log.js"; import { ModelLearner } from "../model-learner.js"; import { createScheduleStore } from "../schedule/schedule-store.js"; import { closeDatabase, getDatabase } from "../sf-db.js"; @@ -216,20 +216,14 @@ describe("SF JSONL schema versioning", () => { assert.equal(legacy.runId, "legacy-run"); }); - test("judgment_log_reads_legacy_jsonl_rows_as_version_1", () => { + test("judgment_log_reads_entries_written_via_appendJudgment", () => { const project = makeProject(); - const path = join(project, ".sf", "judgment-log.jsonl"); - writeFileSync( - path, - `${JSON.stringify({ - ts: "2026-05-07T00:00:00.000Z", - unitId: "M001/S01/T02", - confidence: "low", - decision: "legacy row", - })}\n`, - "utf-8", - ); + appendJudgment(project, { + unitId: "M001/S01/T02", + confidence: "low", + decision: "legacy row", + }); const [entry] = readJudgmentLog(project, "M001"); assert.equal(entry.schemaVersion, 1); diff --git a/src/resources/extensions/sf/tests/metrics-central.test.mjs b/src/resources/extensions/sf/tests/metrics-central.test.mjs index 31b418a92..2d823e063 100644 --- a/src/resources/extensions/sf/tests/metrics-central.test.mjs +++ b/src/resources/extensions/sf/tests/metrics-central.test.mjs @@ -153,40 +153,32 @@ describe("metrics-central", () => { expect(dashboard.resources.activeSessions).toBe(1); expect(dashboard.resources.activeAgents).toBe(2); expect(dashboard.resources.concurrentToolCalls).toBe(3); - expect(getMetricsSystemStats().databaseStatus).toBe("disconnected"); + expect(getMetricsSystemStats().databaseStatus).toBe("connected"); }); - it("stopMetricsCentral_persists_metrics_to_db_adapter", () => { - const rows = []; - const db = { - exec() {}, - prepare(sql) { - if (sql.startsWith("INSERT")) { - return { - run(name, type, labels, value, timestamp, sessionId) { - rows.push({ name, type, labels, value, timestamp, sessionId }); - }, - }; - } - throw new Error(`unexpected SQL: ${sql}`); - }, - }; - initMetricsCentral("/tmp/test-project", { - dbAdapter: db, - sessionId: "sess-db", - }); - recordCounter("sf_test_db_counter", { label: "a=b,c" }, 2); + it("stopMetricsCentral_persists_metrics_to_db", async () => { + const { DatabaseSync } = await import("node:sqlite"); - stopMetricsCentral(); + // The beforeEach already called initMetricsCentral("/tmp/test-project"), + // so we record in the already-open metrics.db and verify after stop. + recordCounter("sf_test_db_counter", { label: "a=b,c" }, 2); + stopMetricsCentral(); // flush + close; afterEach stopMetricsCentral is a no-op + + const dbPath = "/tmp/test-project/.sf/metrics.db"; + const db = new DatabaseSync(dbPath, { open: true }); + const rows = db + .prepare( + "SELECT name, type, value FROM metrics WHERE name = 'sf_test_db_counter' ORDER BY id DESC LIMIT 10", + ) + .all(); + db.close(); expect(rows).toEqual( expect.arrayContaining([ expect.objectContaining({ name: "sf_test_db_counter", type: "counter", - labels: JSON.stringify({ label: "a=b,c", session_id: "sess-db" }), value: 2, - sessionId: "sess-db", }), ]), ); diff --git a/src/resources/extensions/sf/tests/preferences-models.test.mjs b/src/resources/extensions/sf/tests/preferences-models.test.mjs index 335ee2178..c9f8214a5 100644 --- a/src/resources/extensions/sf/tests/preferences-models.test.mjs +++ b/src/resources/extensions/sf/tests/preferences-models.test.mjs @@ -3,6 +3,8 @@ import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; import { afterEach, describe, test } from "vitest"; +// Import preferences.js so that _initPrefsLoader is called and the circular dep lazy-loader is wired up. +import "../preferences.js"; import { resolveModelWithFallbacksForUnit } from "../preferences-models.js"; import { getConfiguredEnvApiKey } from "../provider-env-auth.js"; diff --git a/src/resources/extensions/sf/tests/sf-db-migration.test.mjs b/src/resources/extensions/sf/tests/sf-db-migration.test.mjs index a88f5d5d3..4d2a46533 100644 --- a/src/resources/extensions/sf/tests/sf-db-migration.test.mjs +++ b/src/resources/extensions/sf/tests/sf-db-migration.test.mjs @@ -223,7 +223,7 @@ test("openDatabase_migrates_v27_tasks_without_created_at_through_spec_backfill", const version = db .prepare("SELECT MAX(version) AS version FROM schema_version") .get(); - assert.equal(version.version, 54); + assert.equal(version.version, 57); const taskSpec = db .prepare( "SELECT milestone_id, slice_id, task_id, verify FROM task_specs WHERE task_id = 'T01'", diff --git a/src/resources/extensions/sf/uok/loop-adapter.js b/src/resources/extensions/sf/uok/loop-adapter.js index 23fc807b1..e9924471a 100644 --- a/src/resources/extensions/sf/uok/loop-adapter.js +++ b/src/resources/extensions/sf/uok/loop-adapter.js @@ -43,18 +43,48 @@ export function createTurnObserver(options) { */ function nextSequenceMetadata(category, operation, metadata) { if (!writerToken) return metadata ?? {}; - const record = nextWriteRecord({ - basePath: options.basePath, - token: writerToken, - category, - operation, - metadata, - }); - return { - ...(metadata ?? {}), - writeSequence: record.sequence.sequence, - writerTokenId: record.writerToken.tokenId, - }; + try { + const record = nextWriteRecord({ + basePath: options.basePath, + token: writerToken, + category, + operation, + metadata, + }); + return { + ...(metadata ?? {}), + writeSequence: record.sequence.sequence, + writerTokenId: record.writerToken.tokenId, + }; + } catch (err) { + // Token expired (TTL) or lost after process resume — re-acquire and retry once. + if (err?.message?.includes("Writer token is not active")) { + try { + writerToken = acquireWriterToken({ + basePath: options.basePath, + traceId: current?.traceId, + turnId: current?.turnId, + }); + const record = nextWriteRecord({ + basePath: options.basePath, + token: writerToken, + category, + operation, + metadata, + }); + return { + ...(metadata ?? {}), + writeSequence: record.sequence.sequence, + writerTokenId: record.writerToken.tokenId, + tokenRenewed: true, + }; + } catch { + // Re-acquisition failed — continue without sequence metadata rather than crashing. + return metadata ?? {}; + } + } + return metadata ?? {}; + } } return { onTurnStart(contract) { diff --git a/src/resources/extensions/sf/uok/writer.js b/src/resources/extensions/sf/uok/writer.js index d6dee8b18..2555149e4 100644 --- a/src/resources/extensions/sf/uok/writer.js +++ b/src/resources/extensions/sf/uok/writer.js @@ -5,7 +5,7 @@ import { atomicWriteSync } from "../atomic-write.js"; import { sfRoot } from "../paths.js"; const activeTokens = new Map(); -const TOKEN_TTL_MS = 5 * 60 * 1000; // 5 minutes +const TOKEN_TTL_MS = 2 * 60 * 60 * 1000; // 2 hours — autonomous turns can run 20-30+ minutes function tokenKey(basePath, turnId) { return `${basePath}:${turnId}`; }