fix(sf-db,autonomous-solver): resolve schema-drift and checkpoint runaway loop
- sf-db-schema.js: per-migration transaction boundaries (runMigrationStep) so a late migration failure does not roll back earlier successful ones. Post-migration assertion recreates routing_history if missing.
- routing-history.js: catch missing routing_history table at init and latch _dbTableAvailable=false so auto-start does not crash.
- autonomous-solver.js: sticky identity guard in appendAutonomousSolverCheckpoint pins to orchestrator's unitType/unitId instead of trusting the agent's claim. Emit journal event on identity mismatch. Record mismatchedIdentity diagnostic. Hard cap MAX_CHECKPOINTS_PER_ITERATION=5 in assessAutonomousSolverTurn.
- Tests: add v52 DB smoke test with auto-start path; add sticky identity tests (4 cases); add excessive-checkpoint pause test.

Fixes: sf-mp36kfqm-rjrzju, sf-mp37kjmo-1mfuru
parent a49ea1da87
commit 1ed505669b
5 changed files with 1254 additions and 646 deletions
@@ -17,6 +17,7 @@ import {
import { dirname, join } from "node:path";
import { atomicWriteSync } from "./atomic-write.js";
import { sfRoot } from "./paths.js";
import { emitJournalEvent } from "./journal.js";

export const AUTONOMOUS_SOLVER_OUTCOMES = [
"continue",
@@ -30,6 +31,7 @@ const DEFAULT_SOLVER_MAX_ITERATIONS = 30000;
const MIN_SOLVER_MAX_ITERATIONS = 1;
const MAX_SOLVER_MAX_ITERATIONS = 100000;
const DEFAULT_MISSING_CHECKPOINT_REPAIR_ATTEMPTS = 4;
const MAX_CHECKPOINTS_PER_ITERATION = 5;
const SOLVER_CHECKPOINT_SCHEMA_VERSION = 1;
const SOLVER_STEERING_SCHEMA_VERSION = 1;
const STALL_THRESHOLD_ITERATIONS = 3;
@@ -265,6 +267,8 @@ export function beginAutonomousSolverIteration(
? existing.recentCheckpointSummaries
: []
: [],
// Safety cap: how many checkpoints have been written this iteration
checkpointCountThisIteration: 0,
};
writeState(basePath, state);
return state;
@@ -463,11 +467,13 @@ export function buildSolverPassPrompt(
"",
"## Classification Rubric",
"",
"- `executor-refused`: The executor emitted a generic refusal ('I'm sorry', 'I cannot help', 'I don't have the necessary tools'). → checkpoint outcome=`blocked`, blockerReason=`executor-refused`.",
"- `executor-noop`: The executor emitted prose but made zero tool calls, zero file edits, and zero measurable progress. → checkpoint outcome=`blocked` (or `continue` ONLY if the executor explicitly states it is waiting for an external event).",
"- `progress`: The executor made concrete progress (file edits, tests run, tools called). → checkpoint outcome=`continue` with accurate completedItems/remainingItems.",
"- `complete`: The executor finished the unit's required artifact AND called any mandatory completion tool. → checkpoint outcome=`complete`.",
"- `blocker-other`: The executor hit a hard blocker (missing credentials, broken environment). → checkpoint outcome=`blocked` with a precise blockerReason.",
"Apply these in order; emit the FIRST one that matches.",
"",
"1. `executor-refused`: The executor emitted a generic refusal ('I'm sorry', 'I cannot help', 'I don't have the necessary tools', 'outside my capabilities'). → checkpoint outcome=`blocked`, blockerReason=`executor-refused`.",
"2. `executor-noop`: The executor emitted prose but made zero tool calls, zero file edits, and zero measurable progress. → checkpoint outcome=`blocked`, blockerReason=`executor-noop`. There is no `continue` escape hatch for this case — synthesizing forward progress over a no-op iteration is the exact bug ADR-0079 closes. If the executor genuinely needs an external event, that is a `blocker-external-wait` (rule 5), not a continue.",
"3. `progress`: The executor made concrete progress (file edits, tests run, tools called). → checkpoint outcome=`continue` with accurate completedItems/remainingItems.",
"4. `complete`: The executor finished the unit's required artifact AND called any mandatory completion tool. → checkpoint outcome=`complete`.",
"5. `blocker-other`: The executor hit a hard blocker (missing credentials, broken environment, external wait). → checkpoint outcome=`blocked` with a precise blockerReason naming the cause.",
"",
"## Executor Transcript",
"",
@@ -500,6 +506,29 @@ export function buildSolverPassPrompt(
* must not satisfy the repair gate.
*
* Consumer: assessAutonomousSolverTurn to reject no-op continues.
*
* Implementation: structural inspection only. We look for evidence that the
* executor actually invoked tools, in either of the two message shapes used
* across SF's provider runtimes:
*
* 1. Anthropic-style: `msg.content` is an array of blocks; tool activity
*    shows as `{ type: "tool_use", name: ... }` (assistant) or
*    `{ type: "tool_result", ... }` (user/tool role). This is the shape
*    Claude messages take when stored in pi's agent_end events (see
*    undo.js:431-447 which uses the same pattern to extract tool_result
*    content).
* 2. OpenAI-style: `msg.tool_calls` array on the assistant message and
*    `msg.role === "tool"` (or "tool_result") with `msg.name` on the
*    reply. Used by OpenAI-compatible providers.
*
* A `checkpoint` tool call by itself doesn't count as work — that's the
* protocol step, not the unit deliverable. Any other named tool counts.
*
* We deliberately do NOT grep prose ("File edited", "```diff", …). Prose
* patterns are runtime-specific and produce false negatives that mark real
* work as no-op, which would synthesize a blocker over completed iterations.
* If a transcript has zero structural tool activity, it really is a no-op
* even if its prose is plausible.
*/
export function isNoOpExecutorTranscript(messages) {
if (!Array.isArray(messages) || messages.length === 0) return true;
@@ -507,38 +536,57 @@ export function isNoOpExecutorTranscript(messages) {
// Refusal is always a no-op
if (classifyExecutorRefusal(messages)) return true;

const isWorkToolName = (name) => {
if (!name || typeof name !== "string") return false;
// `checkpoint` is the protocol; the executor calling it is not unit work.
// (Per ADR-0079 the executor isn't even supposed to call it.) Anything
// else — reads, writes, bash, complete_task, save_summary — counts.
return name !== "checkpoint";
};

for (const msg of messages) {
if (!msg || typeof msg !== "object") continue;

// Assistant requested non-checkpoint tool calls
if (Array.isArray(msg.tool_calls)) {
for (const tc of msg.tool_calls) {
const name = tc?.function?.name ?? tc?.name ?? "";
if (name && name !== "checkpoint") {
// ── Anthropic-style: content is an array of typed blocks ──
if (Array.isArray(msg.content)) {
for (const block of msg.content) {
if (!block || typeof block !== "object") continue;
if (block.type === "tool_use" && isWorkToolName(block.name)) {
return false;
}
if (block.type === "tool_result") {
// tool_result has no name on the block itself; presence of a
// non-checkpoint tool_result implies a non-checkpoint tool_use
// preceded it. The pair-match would require backward scan; for
// robustness, treat ANY tool_result as evidence of work unless
// it's a checkpoint result (which would have been emitted by
// the assistant's checkpoint tool_use earlier in this same
// transcript — but that's protocol, not work). Without the
// block name we can't distinguish, so be conservative: a
// tool_result is non-no-op work UNLESS the entire transcript's
// only tool_use was `checkpoint`. We carry that check via the
// tool_use scan above — if a non-checkpoint tool_use exists,
// we've already returned false. If only `checkpoint` was used,
// the tool_result here is the checkpoint reply and we should
// keep scanning.
// Simpler approach: ignore tool_result blocks for the
// classification; the tool_use scan is authoritative.
continue;
}
}
}

// Tool results from non-checkpoint tools
if (msg.role === "tool" || msg.role === "tool_result") {
const name = msg.name ?? "";
if (name && name !== "checkpoint") {
return false;
// ── OpenAI-style: msg.tool_calls on assistant ──
if (Array.isArray(msg.tool_calls)) {
for (const tc of msg.tool_calls) {
const name = tc?.function?.name ?? tc?.name ?? "";
if (isWorkToolName(name)) return false;
}
}

// Content that shows concrete work was done
const content = typeof msg.content === "string" ? msg.content : "";
if (
content.includes("File edited") ||
content.includes("File written") ||
content.includes("File created") ||
content.includes("```diff") ||
content.includes("--- a/") ||
content.includes("+++ b/")
) {
return false;
// ── OpenAI-style: tool reply rows ──
if (msg.role === "tool" || msg.role === "tool_result") {
if (isWorkToolName(msg.name)) return false;
}
}
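A quick usage sketch of the detector above, illustrating the two transcript shapes its doc comment describes. The message fixtures and the relative import path are assumptions for illustration, not code from this commit.

import { isNoOpExecutorTranscript } from "./autonomous-solver.js";

// Anthropic-style: a non-checkpoint tool_use block counts as real work.
const anthropicTranscript = [
  {
    role: "assistant",
    content: [
      { type: "text", text: "Editing the config now." },
      { type: "tool_use", name: "write_file", input: { path: "config.json" } },
    ],
  },
];

// OpenAI-style: the only tool call is `checkpoint`, which is protocol, not work.
const checkpointOnlyTranscript = [
  {
    role: "assistant",
    content: "Checkpointing.",
    tool_calls: [{ function: { name: "checkpoint" } }],
  },
];

isNoOpExecutorTranscript(anthropicTranscript); // expected false — structural tool work found
isNoOpExecutorTranscript(checkpointOnlyTranscript); // expected true — zero non-checkpoint tool activity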
@@ -554,17 +602,61 @@ export function isNoOpExecutorTranscript(messages) {
* Consumer: checkpoint tool.
*/
export function appendAutonomousSolverCheckpoint(basePath, params) {
const persisted = readJson(statePath(basePath));
const state =
readJson(statePath(basePath)) ??
persisted ??
beginAutonomousSolverIteration(basePath, params.unitType, params.unitId);
// ── Sticky identity guard ──
// The orchestrator owns the active unit identity (it called
// beginAutonomousSolverIteration with the canonical unitType/unitId).
// If the agent's checkpoint call passes a *different* unitType/unitId
// because it guessed wrong (real-world: minimax/M2.1 stuck at
// 2026-05-13 calling checkpoint with `parallel-research` /
// `1-ci-build-pipeline/parallel-research` / `research-slice 1-...`
// — three different strings — none matching the orchestrator's
// active identity), the previous implementation silently overwrote
// state.unitType/unitId with the wrong claim. assessAutonomousSolverTurn
// then failed sameUnit() against the orchestrator's identity, fired
// missing-checkpoint-retry, the agent re-checkpointed with another
// wrong guess, and the loop ran indefinitely (60+ wasted calls).
//
// Fix: when there is an active running/paused state, pin the
// checkpoint to the active state's identity instead of trusting the
// agent's claim. Surface the mismatch on the checkpoint payload so it
// is visible in traces.
const hasActiveIdentity =
persisted &&
persisted.unitType &&
persisted.unitId &&
persisted.status !== "complete";
const isMismatch =
hasActiveIdentity &&
!sameUnit(persisted, params.unitType, params.unitId);
if (isMismatch) {
emitJournalEvent(basePath, {
flowId: `${state.unitType}-${state.unitId}-${Date.now()}`,
seq: 1,
ts: nowIso(),
eventType: "checkpoint-identity-mismatch",
data: {
claimedUnitType: params.unitType,
claimedUnitId: params.unitId,
pinnedToUnitType: state.unitType,
pinnedToUnitId: state.unitId,
},
});
}
const effectiveUnitType = isMismatch ? state.unitType : params.unitType;
const effectiveUnitId = isMismatch ? state.unitId : params.unitId;
const checkpoint = {
schemaVersion: SOLVER_CHECKPOINT_SCHEMA_VERSION,
ts: nowIso(),
unitType: params.unitType,
unitId: params.unitId,
iteration: sameUnit(state, params.unitType, params.unitId)
? state.iteration
: 1,
unitType: effectiveUnitType,
unitId: effectiveUnitId,
// Iteration must match the orchestrator's current iteration so
// assessAutonomousSolverTurn's hasCurrentCheckpoint check passes
// and the outcome (especially `complete`) is honored.
iteration: state.iteration,
outcome: params.outcome,
summary: String(params.summary ?? "").trim(),
completedItems: sanitizeList(params.completedItems),
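`sameUnit` is referenced by the guard above but is not part of this diff. A minimal sketch of what it presumably checks, included only so the hunk reads on its own; the real definition lives elsewhere in autonomous-solver.js and may differ.

// Assumed shape, not the actual source: a strict identity comparison.
function sameUnit(state, unitType, unitId) {
  return Boolean(state) && state.unitType === unitType && state.unitId === unitId;
}

Under that reading, the guard pins the checkpoint whenever the persisted state still carries an active (non-complete) identity and the agent's claimed unitType/unitId differs from it.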
@@ -586,11 +678,22 @@ export function appendAutonomousSolverCheckpoint(basePath, params) {
invariants: String(params.pdd?.invariants ?? "").trim(),
assumptions: String(params.pdd?.assumptions ?? "").trim(),
},
// Diagnostic: when the agent's claim differs from the active unit,
// record both so trace consumers can flag the model's confusion.
...(isMismatch
? {
mismatchedIdentity: {
claimedUnitType: String(params.unitType ?? ""),
claimedUnitId: String(params.unitId ?? ""),
pinnedToActive: { unitType: state.unitType, unitId: state.unitId },
},
}
: {}),
};
const nextState = {
...state,
unitType: params.unitType,
unitId: params.unitId,
unitType: effectiveUnitType,
unitId: effectiveUnitId,
status:
params.outcome === "complete"
? "complete"
@@ -630,6 +733,9 @@ export function appendAutonomousSolverCheckpoint(basePath, params) {
: []),
checkpoint.summary,
].slice(-ROLLING_SUMMARY_WINDOW),
// Increment checkpoint count for this iteration (safety cap)
checkpointCountThisIteration:
(state.checkpointCountThisIteration || 0) + 1,
};
mkdirSync(dirname(historyPath(basePath)), { recursive: true });
writeFileSync(historyPath(basePath), `${JSON.stringify(checkpoint)}\n`, {
@@ -985,6 +1091,20 @@ export function assessAutonomousSolverTurn(
maxRepairAttempts: DEFAULT_MISSING_CHECKPOINT_REPAIR_ATTEMPTS,
};
}
// Hard cap on excessive checkpoints within a single iteration
if (
(state.checkpointCountThisIteration || 0) >=
MAX_CHECKPOINTS_PER_ITERATION
) {
return {
action: "pause",
reason: "solver-excessive-checkpoints",
state,
checkpoint,
checkpointCount: state.checkpointCountThisIteration,
maxCheckpointCount: MAX_CHECKPOINTS_PER_ITERATION,
};
}
if (
state.iteration >= state.maxIterations &&
checkpoint.outcome !== "complete"
@@ -15,15 +15,45 @@ const FAILURE_THRESHOLD = 0.2; // >20% failure rate triggers tier bump
const FEEDBACK_WEIGHT = 2; // feedback signals count 2x vs automatic
// ─── In-Memory State ─────────────────────────────────────────────────────────
let history = null;
// Latches to false when the `routing_history` table is observed missing at
// init time. Subsequent DB writes from recordOutcome/recordFeedback are then
// skipped so a stale-schema project doesn't repeatedly throw on every
// dispatch. The next openDatabase that successfully runs the v53 migration
// will let a later initRoutingHistory call flip this back to true.
let _dbTableAvailable = true;
// ─── Public API ──────────────────────────────────────────────────────────────
/**
* Initialize routing history for a project.
*
* Resilient to a missing `routing_history` table: a project DB whose schema
* version predates the routing_history migration (v53) will throw `no such
* table: routing_history` on the underlying SELECT. We swallow that one
* specific case so auto-start is not blocked by a stale schema; the
* in-memory history simply starts empty and accumulates from scratch.
* Anything else (corrupt DB, permission errors) re-throws so it remains
* visible.
*/
export function initRoutingHistory(_base) {
history = createEmptyHistory();
const db = getDatabase();
if (!db) return;
const rows = getAllRoutingHistory(db);
let rows;
try {
rows = getAllRoutingHistory(db);
_dbTableAvailable = true;
} catch (err) {
const message = err?.message ? String(err.message) : "";
if (/no such table:\s*routing_history/i.test(message)) {
// Schema lags the code — fresh project, or a project whose DB never
// migrated past v52. Start with empty in-memory state and latch the
// flag so recordOutcome/recordFeedback skip their DB writes for the
// remainder of the session instead of crashing on every dispatch.
rows = [];
_dbTableAvailable = false;
} else {
throw err;
}
}
for (const row of rows) {
if (!history.patterns[row.pattern]) {
history.patterns[row.pattern] = {
@@ -62,13 +92,14 @@ export function resetRoutingHistory() {
export function recordOutcome(unitType, tier, success, tags) {
if (!history) return;
const db = getDatabase();
const canWriteDb = db && _dbTableAvailable;
// Record for the base unit type
const basePattern = unitType;
ensurePattern(basePattern);
const outcome = history.patterns[basePattern][tier];
if (success) outcome.success++;
else outcome.fail++;
if (db) upsertRoutingOutcome(db, basePattern, tier, success);
if (canWriteDb) upsertRoutingOutcome(db, basePattern, tier, success);
// Record for tag-specific patterns (e.g. "execute-task:docs")
if (tags && tags.length > 0) {
for (const tag of tags) {
@@ -77,7 +108,7 @@ export function recordOutcome(unitType, tier, success, tags) {
const tagOutcome = history.patterns[tagPattern][tier];
if (success) tagOutcome.success++;
else tagOutcome.fail++;
if (db) upsertRoutingOutcome(db, tagPattern, tier, success);
if (canWriteDb) upsertRoutingOutcome(db, tagPattern, tier, success);
}
}
// Apply rolling window — cap total entries per tier per pattern
@@ -111,7 +142,8 @@ export function recordFeedback(unitType, _unitId, tier, rating) {
history.feedback = history.feedback.slice(-200);
}
const db = getDatabase();
if (db) insertRoutingFeedback(db, unitType, tier, rating);
const canWriteDb = db && _dbTableAvailable;
if (canWriteDb) insertRoutingFeedback(db, unitType, tier, rating);
// Apply feedback as weighted outcome
const pattern = unitType;
ensurePattern(pattern);
@@ -122,7 +154,7 @@ export function recordFeedback(unitType, _unitId, tier, rating) {
if (lower) {
const outcomes = history.patterns[pattern][lower];
outcomes.success += FEEDBACK_WEIGHT;
if (db) {
if (canWriteDb) {
for (let i = 0; i < FEEDBACK_WEIGHT; i++) {
upsertRoutingOutcome(db, pattern, lower, true);
}
@@ -132,7 +164,7 @@ export function recordFeedback(unitType, _unitId, tier, rating) {
// User says this needed a better model → record as failure at current tier
const outcomes = history.patterns[pattern][tier];
outcomes.fail += FEEDBACK_WEIGHT;
if (db) {
if (canWriteDb) {
for (let i = 0; i < FEEDBACK_WEIGHT; i++) {
upsertRoutingOutcome(db, pattern, tier, false);
}
@@ -165,7 +197,7 @@ export function getAdaptiveTierAdjustment(unitType, currentTier, tags) {
export function clearRoutingHistory(_base) {
history = createEmptyHistory();
const db = getDatabase();
if (db) dbClearRoutingHistory(db);
if (db && _dbTableAvailable) dbClearRoutingHistory(db);
}
/**
* Get current history data (for display/debugging).
File diff suppressed because it is too large
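The suppressed diff is presumably sf-db-schema.js (the one changed file not shown elsewhere), which the commit message describes as adding per-migration transaction boundaries (runMigrationStep) plus a post-migration assertion that recreates routing_history if missing. A minimal sketch of that shape, assuming the node:sqlite DatabaseSync API used elsewhere in this commit; helper names other than runMigrationStep are hypothetical and the real code may differ.

// Each migration commits on its own, so a failure in a late step does not
// roll back earlier steps that already succeeded.
function runMigrationStep(db, step) {
  db.exec("BEGIN");
  try {
    step.up(db); // the migration body
    db.prepare(
      "INSERT INTO schema_version (version, applied_at) VALUES (?, ?)",
    ).run(step.version, new Date().toISOString());
    db.exec("COMMIT");
  } catch (err) {
    db.exec("ROLLBACK");
    throw err; // stop here; already-committed steps are kept
  }
}

// Post-migration assertion: recreate routing_history on a DB that stalled
// before the v53 migration. createRoutingHistorySql would be the same
// CREATE TABLE statement the v53 migration uses (defined in the suppressed file).
function assertRoutingHistoryTable(db, createRoutingHistorySql) {
  const row = db
    .prepare(
      "SELECT name FROM sqlite_master WHERE type='table' AND name='routing_history'",
    )
    .get();
  if (!row) db.exec(createRoutingHistorySql);
}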
@@ -317,6 +317,36 @@ describe("autonomous solver", () => {
expect(result.reason).toBe("solver-max-iterations");
});

test("assessAutonomousSolverTurn_excessive_checkpoints_pauses_after_cap", () => {
// Fail-fast when the agent calls checkpoint 5+ times within a single
// iteration without making other tool progress. Prevents the 60+
// no-op checkpoint loop from sf-mp37kjmo-1mfuru.
const project = makeProject();
beginAutonomousSolverIteration(project, "execute-task", "M001/S01/T01");

for (let i = 0; i < 5; i++) {
appendAutonomousSolverCheckpoint(project, {
unitType: "execute-task",
unitId: "M001/S01/T01",
outcome: "continue",
summary: `Checkpoint ${i + 1} — still stuck.`,
completedItems: [],
remainingItems: ["need help"],
verificationEvidence: [],
pdd: pdd(),
});
}

const result = assessAutonomousSolverTurn(
project,
"execute-task",
"M001/S01/T01",
);
expect(result.action).toBe("pause");
expect(result.reason).toBe("solver-excessive-checkpoints");
expect(result.checkpointCount).toBe(5);
});

test("steering_append_consume_is_idempotent", () => {
const project = makeProject();
appendAutonomousSolverSteering(project, "Prefer runtime enforcement.");
@@ -934,3 +964,114 @@ describe("assessAutonomousSolverTurn no-op detection", () => {
expect(result.reason).toBe("solver-noop-continue");
});
});

describe("appendAutonomousSolverCheckpoint sticky identity", () => {
test("pins to orchestrator unit identity when agent passes a different unitId", () => {
// Real-world bug (2026-05-13): minimax/M2.1 stuck in 60+ checkpoint
// loop because each call passed a guessed unitId
// ("parallel-research" / "research-slice 1-ci-build-pipeline/..."),
// silently overwriting state.unitId. assessAutonomousSolverTurn then
// failed sameUnit() against the orchestrator's identity and re-fired
// repair forever. The active state's identity must be sticky.
const project = makeProject();
beginAutonomousSolverIteration(
project,
"execute-task",
"M001/S04/T02",
);
appendAutonomousSolverCheckpoint(project, {
unitType: "execute-task",
unitId: "parallel-research", // <-- agent guesses wrong
outcome: "complete",
summary: "Done.",
completedItems: ["work"],
remainingItems: [],
verificationEvidence: ["ls -la"],
pdd: pdd(),
});
const state = readAutonomousSolverState(project);
// State identity must NOT shift to the agent's wrong claim.
expect(state.unitType).toBe("execute-task");
expect(state.unitId).toBe("M001/S04/T02");
// Checkpoint payload itself is pinned to the orchestrator's identity.
expect(state.latestCheckpoint.unitType).toBe("execute-task");
expect(state.latestCheckpoint.unitId).toBe("M001/S04/T02");
// Mismatch is surfaced diagnostically.
expect(state.latestCheckpoint.mismatchedIdentity).toEqual({
claimedUnitType: "execute-task",
claimedUnitId: "parallel-research",
pinnedToActive: {
unitType: "execute-task",
unitId: "M001/S04/T02",
},
});
});

test("assessAutonomousSolverTurn honors complete after sticky-pin rescue", () => {
// End-to-end: agent passes wrong unitId, checkpoint sticks to the
// orchestrator's identity, assess sees outcome=complete and returns
// action=complete (NOT missing-checkpoint-retry).
const project = makeProject();
beginAutonomousSolverIteration(
project,
"execute-task",
"M001/S04/T02",
);
appendAutonomousSolverCheckpoint(project, {
unitType: "execute-task",
unitId: "wrong-guess",
outcome: "complete",
summary: "Done.",
completedItems: ["work"],
remainingItems: [],
verificationEvidence: ["ls -la"],
pdd: pdd(),
});
const assessment = assessAutonomousSolverTurn(
project,
"execute-task",
"M001/S04/T02",
);
expect(assessment.action).toBe("complete");
});

test("matching unitId does not flag mismatch", () => {
const project = makeProject();
beginAutonomousSolverIteration(
project,
"execute-task",
"M001/S04/T02",
);
appendAutonomousSolverCheckpoint(project, {
unitType: "execute-task",
unitId: "M001/S04/T02",
outcome: "continue",
summary: "Progress",
completedItems: ["read files"],
remainingItems: ["edit code"],
verificationEvidence: ["grep -n"],
pdd: pdd(),
});
const state = readAutonomousSolverState(project);
expect(state.latestCheckpoint.mismatchedIdentity).toBeUndefined();
});

test("fresh project with no active state accepts agent-provided identity", () => {
// Bootstrap case: state is null on first call; the agent's claim
// initializes the state. (Same behavior as before the sticky fix.)
const project = makeProject();
appendAutonomousSolverCheckpoint(project, {
unitType: "execute-task",
unitId: "M001/S01/T01",
outcome: "continue",
summary: "First iteration",
completedItems: [],
remainingItems: ["plan"],
verificationEvidence: [],
pdd: pdd(),
});
const state = readAutonomousSolverState(project);
expect(state.unitId).toBe("M001/S01/T01");
expect(state.latestCheckpoint.mismatchedIdentity).toBeUndefined();
});
});
@@ -29,6 +29,7 @@ import {
openDatabase,
reconcileWorktreeDb,
} from "../sf-db.js";
import { initRoutingHistory } from "../routing-history.js";

const tmpDirs = [];
@@ -149,6 +150,56 @@ function makeLegacyV27Db() {
return dbPath;
}

function makeLegacyV52Db() {
const dir = mkdtempSync(join(tmpdir(), "sf-legacy-v52-"));
tmpDirs.push(dir);
const sfDir = join(dir, ".sf");
mkdirSync(sfDir, { recursive: true });
const dbPath = join(sfDir, "sf.db");
const db = new DatabaseSync(dbPath);
db.exec(`
CREATE TABLE schema_version (
version INTEGER NOT NULL,
applied_at TEXT NOT NULL
);
INSERT INTO schema_version (version, applied_at)
VALUES (52, '2026-05-06T00:00:00.000Z');

CREATE TABLE milestones (
id TEXT PRIMARY KEY,
title TEXT NOT NULL DEFAULT '',
status TEXT NOT NULL DEFAULT 'active',
depends_on TEXT NOT NULL DEFAULT '[]',
created_at TEXT NOT NULL DEFAULT '',
completed_at TEXT DEFAULT NULL
);

CREATE TABLE slices (
milestone_id TEXT NOT NULL,
id TEXT NOT NULL,
title TEXT NOT NULL DEFAULT '',
status TEXT NOT NULL DEFAULT 'pending',
risk TEXT NOT NULL DEFAULT 'medium',
depends TEXT NOT NULL DEFAULT '[]',
demo TEXT NOT NULL DEFAULT '',
created_at TEXT NOT NULL DEFAULT '',
completed_at TEXT DEFAULT NULL,
PRIMARY KEY (milestone_id, id)
);

CREATE TABLE tasks (
milestone_id TEXT NOT NULL,
slice_id TEXT NOT NULL,
id TEXT NOT NULL,
title TEXT NOT NULL DEFAULT '',
status TEXT NOT NULL DEFAULT 'pending',
PRIMARY KEY (milestone_id, slice_id, id)
);
`);
db.close();
return dbPath;
}

function makeLegacyV35GateRunsDb() {
const dir = mkdtempSync(join(tmpdir(), "sf-legacy-v35-gates-"));
tmpDirs.push(dir);
@@ -262,6 +313,35 @@ test("openDatabase_migrates_v27_tasks_without_created_at_through_spec_backfill",
assert.deepEqual(schedulerRow, { status: "queued" });
});

test("openDatabase_v52_db_heals_routing_history_and_auto_start_path_works", () => {
const dbPath = makeLegacyV52Db();

assert.equal(openDatabase(dbPath), true);
const db = getDatabase();

// ensurePostBootstrapTables should have created routing_history
const routingTable = db
.prepare(
"SELECT name FROM sqlite_master WHERE type='table' AND name='routing_history'",
)
.get();
assert.ok(
routingTable,
"routing_history table should exist after ensurePostBootstrapTables",
);

// initRoutingHistory (auto-start path) must not crash on a v52 DB
assert.doesNotThrow(() => {
initRoutingHistory(dbPath);
}, "initRoutingHistory should not throw on a v52 DB");

// Schema should have migrated to v62
const version = db
.prepare("SELECT MAX(version) AS version FROM schema_version")
.get();
assert.equal(version.version, 62);
});

test("openDatabase_when_fresh_db_supports_schedule_entries", () => {
assert.equal(openDatabase(":memory:"), true);