diff --git a/src/resources/extensions/sf/auto/phases-unit.js b/src/resources/extensions/sf/auto/phases-unit.js index e32b1c96c..4999f8bcf 100644 --- a/src/resources/extensions/sf/auto/phases-unit.js +++ b/src/resources/extensions/sf/auto/phases-unit.js @@ -35,8 +35,10 @@ import { classifyExecutorRefusal, consumePendingAutonomousSolverSteering, getConfiguredAutonomousSolverMaxIterations, + MAX_EXECUTOR_REFUSAL_ESCALATIONS, readAutonomousSolverState, recordAutonomousSolverMissingCheckpointRetry, + recordExecutorRefusalEscalation, } from "../autonomous-solver.js"; import { blockModel } from "../blocked-models.js"; import { resumeAutoAfterProviderDelay } from "../bootstrap/provider-error-resume.js"; @@ -745,8 +747,13 @@ export async function runUnitPhase(ic, iterData, loopState, sidecarItem) { ? { action: "none" } : { action: "pending" }; - // Refusal short-circuit: when the executor model returned a generic refusal, - // synthesize a blocked checkpoint immediately and skip the solver pass. + // Refusal handling: when the executor model returned a generic refusal, the + // model is capability-mismatched for this unit. Block it (so selectAndApplyModel + // excludes it on the next dispatch), evict slice routing, file self-feedback, + // and re-dispatch with a tier-escalated model — bounded by MAX_EXECUTOR_REFUSAL_ + // ESCALATIONS so a fallback chain of refusing models cannot loop forever. Only + // when the escalation budget is exhausted do we fall back to a blocked + // checkpoint that pauses the loop for operator intervention. if (unitResult.status !== "cancelled" && refusal) { const executorModel = s.currentUnitModel?.provider && s.currentUnitModel?.id @@ -760,9 +767,8 @@ export async function runUnitPhase(ic, iterData, loopState, sidecarItem) { } catch { // best-effort } - // Temporarily block the refusing model so the router skips it on retry. 
- // This satisfies AC1 of sf-mp3bm6u0-2fskt8: the executor model is - // escalated because the blocked model will be excluded from selection. + // Block the refusing model so the router skips it on retry. The next + // selectAndApplyModel call will pick a higher-tier fallback. try { const refusedProvider = s.currentUnitModel?.provider ?? ""; const refusedId = s.currentUnitModel?.id ?? ""; @@ -778,46 +784,14 @@ export async function runUnitPhase(ic, iterData, loopState, sidecarItem) { } catch { // best-effort — blocking must not break the refusal handler } - try { - appendAutonomousSolverCheckpoint(s.basePath, { - unitType, - unitId, - outcome: "blocked", - summary: `Executor (${executorModel}) refused the task. Pattern: ${refusal.pattern}. The model has been temporarily blocked and will be skipped on retry; escalate the executor model or unblock this unit manually.`, - completedItems: [], - remainingItems: [ - `Re-run ${unitType} ${unitId} with a more capable executor model — current routing selected an incapable model.`, - ], - verificationEvidence: [ - `executor-refusal-pattern=${refusal.pattern}`, - `executor-model=${executorModel}`, - ], - blockerReason: `executor-refused (${refusal.pattern})`, - pdd: { - purpose: - "Surface executor refusals as protocol-level blockers instead of synthesizing fake progress.", - consumer: "autonomous loop pause-handler", - contract: - "On `executor-refused`, the loop pauses and self-feedback is filed; the operator must escalate the executor model.", - failureBoundary: - "If the operator does not escalate, the same refusal will recur on next dispatch.", - evidence: "classifyExecutorRefusal matched a refusal pattern", - nonGoals: - "This does not retry the unit automatically — capability mismatches require operator judgement (or a future automatic escalation policy).", - invariants: "Refusal never silently synthesizes a continue.", - assumptions: - "The refusal pattern set in classifyExecutorRefusal is conservative — false 
positives are rare and require operator review.", - }, - }); - } catch { - // If synthesis fails, fall through to solver pass - } + // File self-feedback for observability (operator-visible signal that + // this unit type is being routed to capability-mismatched models). try { const feedback = recordSelfFeedback( { kind: "executor-refused", severity: "high", - summary: `Executor ${executorModel} refused ${unitType} ${unitId} with pattern ${refusal.pattern}; loop paused to prevent fake-progress synthesis.`, + summary: `Executor ${executorModel} refused ${unitType} ${unitId} with pattern ${refusal.pattern}; model blocked and re-dispatching with tier escalation.`, evidence: [ `unit=${unitType} ${unitId}`, `executor=${executorModel}`, @@ -826,9 +800,9 @@ export async function runUnitPhase(ic, iterData, loopState, sidecarItem) { refusal.evidence ?? "", ].join("\n"), suggestedFix: - "Escalate the executor model for this unit (or unit type) — the currently routed model lacks the agentic capabilities required. Long-term: separate the executor and autonomous-solver roles per ADR-0079 and pin the solver to a stable agentic model.", + "Routing repeatedly selects a capability-mismatched executor for this unit type. 
Update the router's tier-floor for this unit type so the refusing tier is excluded by default, or add the refusing model to a permanent block list.", acceptanceCriteria: [ - "Executor model for this unit type is escalated to a model that passes the refusal-resistant tier.", + "Router's effective tier-floor for this unit type excludes the refusing model class without requiring a runtime block.", "Refusal pattern is added to classifyExecutorRefusal if a novel phrasing slipped through.", ], occurredIn: { unitType, unitId }, @@ -853,8 +827,120 @@ export async function runUnitPhase(ic, iterData, loopState, sidecarItem) { } catch { // self-feedback is observability; never block loop progression on it } + // Bounded re-dispatch: if the escalation was recorded (a 0 return means the + // solver state is not for this unit, so the budget cannot be enforced) and + // the per-unit budget is not yet exhausted, synthesize a `continue` + // checkpoint and return the re-dispatch action directly. We bypass + // assessAutonomousSolverTurn because its no-op detector would (correctly) + // reject a continue over a refusal transcript; the retry changes the model. + const escalation = recordExecutorRefusalEscalation( + s.basePath, + unitType, + unitId, + ); + if (escalation > 0 && escalation <= MAX_EXECUTOR_REFUSAL_ESCALATIONS) { + try { + appendAutonomousSolverCheckpoint(s.basePath, { + unitType, + unitId, + outcome: "continue", + summary: `Executor (${executorModel}) refused. Pattern: ${refusal.pattern}. 
Model blocked; re-dispatching with model excluded (escalation ${escalation}/${MAX_EXECUTOR_REFUSAL_ESCALATIONS}).`, + completedItems: [], + remainingItems: [ + `Re-dispatch ${unitType} ${unitId}; refusing model is blocked so selectAndApplyModel will pick a higher-tier fallback.`, + ], + verificationEvidence: [ + `executor-refusal-pattern=${refusal.pattern}`, + `executor-model=${executorModel}`, + `refusal-escalation=${escalation}/${MAX_EXECUTOR_REFUSAL_ESCALATIONS}`, + ], + pdd: { + purpose: + "Auto-escalate capability-mismatched executor selections by blocking the refusing model and re-dispatching, so SF completes work without operator intervention when a higher-tier model exists in the fallback chain.", + consumer: "autonomous loop continue handler", + contract: + "On `executor-refused`, block the refusing model and emit `continue` so the loop re-dispatches with the blocked model excluded — bounded by MAX_EXECUTOR_REFUSAL_ESCALATIONS to prevent runaway loops over an all-refusing fallback chain.", + failureBoundary: + "After MAX_EXECUTOR_REFUSAL_ESCALATIONS refusals on the same unit, fall back to the legacy blocked-and-pause path so the operator can intervene.", + evidence: + "classifyExecutorRefusal matched a refusal pattern; the responsible model is now in blocked-models.json with a 1-hour TTL.", + nonGoals: + "This does not change the router's tier-floor — repeated refusals across units indicate the router still needs tuning (filed via self-feedback).", + invariants: + "The refusing model is never silently retried; it is always blocked before re-dispatch.", + assumptions: + "The fallback chain in effectiveModelConfig contains at least one higher-tier model that does not also refuse.", + }, + }); + } catch { + // Best-effort: a failed checkpoint write does not stop the re-dispatch; + // the journal event, notification and `continue` return below still run. 
+ } + deps.emitJournalEvent({ + ts: new Date().toISOString(), + flowId: ic.flowId, + seq: ic.nextSeq(), + eventType: "executor-refused-redispatch", + data: { + unitType, + unitId, + executorModel, + pattern: refusal.pattern, + escalation, + maxEscalations: MAX_EXECUTOR_REFUSAL_ESCALATIONS, + }, + }); + ctx.ui.notify( + `Executor ${executorModel} refused ${unitType} ${unitId} (${refusal.pattern}); blocked and re-dispatching (escalation ${escalation}/${MAX_EXECUTOR_REFUSAL_ESCALATIONS}).`, + "warning", + ); + return { + action: "continue", + data: { + unitStartedAt: s.currentUnit?.startedAt, + requestDispatchedAt: unitResult.requestDispatchedAt, + }, + }; + } + // Escalation budget exhausted: emit the legacy blocked checkpoint and + // let the existing pause path take over so the operator can intervene. + try { + appendAutonomousSolverCheckpoint(s.basePath, { + unitType, + unitId, + outcome: "blocked", + summary: `Executor (${executorModel}) refused the task. Pattern: ${refusal.pattern}. Refusal-escalation budget exhausted (${escalation}/${MAX_EXECUTOR_REFUSAL_ESCALATIONS}) — every model tried in the fallback chain refused. 
Operator must escalate routing or add a permanent block.`, + completedItems: [], + remainingItems: [ + `Re-run ${unitType} ${unitId} with a more capable executor model — the entire ${MAX_EXECUTOR_REFUSAL_ESCALATIONS}-step fallback chain refused.`, + ], + verificationEvidence: [ + `executor-refusal-pattern=${refusal.pattern}`, + `executor-model=${executorModel}`, + `refusal-escalations-exhausted=${escalation}`, + ], + blockerReason: `executor-refused-budget-exhausted (${refusal.pattern})`, + pdd: { + purpose: + "Surface executor refusals as protocol-level blockers when bounded auto-escalation has been exhausted.", + consumer: "autonomous loop pause-handler", + contract: + "After MAX_EXECUTOR_REFUSAL_ESCALATIONS refusals on the same unit, pause the loop and require operator intervention.", + failureBoundary: + "If the operator does not escalate, the same refusal will recur on next dispatch.", + evidence: `${escalation} consecutive executor refusals on this unit`, + nonGoals: + "This does not retry the unit automatically beyond the budget — capability mismatches that defeat the entire fallback chain require operator judgement.", + invariants: "Refusal never silently synthesizes a continue.", + assumptions: + "The refusal pattern set in classifyExecutorRefusal is conservative — false positives are rare and require operator review.", + }, + }); + } catch { + // If synthesis fails, fall through to solver pass + } ctx.ui.notify( - `Executor ${executorModel} refused ${unitType} ${unitId} (${refusal.pattern}); autonomous loop pausing instead of synthesizing fake progress. See SELF-FEEDBACK.md for escalation guidance.`, + `Executor refused ${unitType} ${unitId} after ${MAX_EXECUTOR_REFUSAL_ESCALATIONS} tier escalations; pausing for operator review. 
See SELF-FEEDBACK.md.`, "error", ); solverAssessment = assessAutonomousSolverTurn(s.basePath, unitType, unitId); diff --git a/src/resources/extensions/sf/autonomous-solver.js b/src/resources/extensions/sf/autonomous-solver.js index 934be6631..c5c3da9b6 100644 --- a/src/resources/extensions/sf/autonomous-solver.js +++ b/src/resources/extensions/sf/autonomous-solver.js @@ -269,11 +269,51 @@ export function beginAutonomousSolverIteration( : [], // Safety cap: how many checkpoints have been written this iteration checkpointCountThisIteration: 0, + // Per-unit count of executor-refusal-driven re-dispatches used so far. + // Carried over while sameUnit() accepts the existing state (so a resumed + // run does not silently get a fresh budget); otherwise it restarts at 0. + executorRefusalEscalations: sameUnit(existing, unitType, unitId) + ? Number(existing.executorRefusalEscalations) || 0 + : 0, }; writeState(basePath, state); return state; } +/** + * Maximum number of executor-refusal-driven re-dispatches for a single unit + * before the loop falls back to the legacy blocked-and-pause behavior. The + * caller re-dispatches while the recorded count is <= this value, so three + * allows up to three re-dispatches after the initial refusal; the fourth + * refusal on the same unit pauses the loop instead of retrying again. + */ +export const MAX_EXECUTOR_REFUSAL_ESCALATIONS = 3; + +/** + * Increment the per-unit executor-refusal escalation counter and return the + * new count; returns 0 without writing when sameUnit() rejects the state. + * + * Purpose: when classifyExecutorRefusal fires, the refusal handler blocks the + * refusing model and re-dispatches the unit so selectAndApplyModel picks a + * higher-tier alternative. This counter bounds that retry loop so an entire + * fallback chain of refusing models cannot loop forever — once the budget is + * exhausted, the unit pauses for operator intervention via the legacy + * blocked-checkpoint path. + * + * Consumer: runUnitPhase refusal branch in auto/phases-unit.js. 
+ */ +export function recordExecutorRefusalEscalation(basePath, unitType, unitId) { + const state = readJson(statePath(basePath)); + if (!sameUnit(state, unitType, unitId)) return 0; + const next = (Number(state.executorRefusalEscalations) || 0) + 1; + writeState(basePath, { + ...state, + executorRefusalEscalations: next, + updatedAt: nowIso(), + }); + return next; +} + /** * Build the PDD autonomous solver prompt block appended to unit prompts. * diff --git a/src/resources/extensions/sf/tests/autonomous-solver.test.mjs b/src/resources/extensions/sf/tests/autonomous-solver.test.mjs index a48aabad7..409b0f63d 100644 --- a/src/resources/extensions/sf/tests/autonomous-solver.test.mjs +++ b/src/resources/extensions/sf/tests/autonomous-solver.test.mjs @@ -18,9 +18,11 @@ import { getConfiguredAutonomousSolverMaxIterations, getSolverPhase, isNoOpExecutorTranscript, + MAX_EXECUTOR_REFUSAL_ESCALATIONS, readAutonomousSolverState, readLatestAutonomousSolverCheckpoint, recordAutonomousSolverMissingCheckpointRetry, + recordExecutorRefusalEscalation, } from "../autonomous-solver.js"; let tempDirs = []; @@ -84,6 +86,50 @@ describe("autonomous solver", () => { expect(next.iteration).toBe(1); }); + test("recordExecutorRefusalEscalation_increments_per_unit_and_resets_on_new_unit", () => { + const project = makeProject(); + beginAutonomousSolverIteration(project, "execute-task", "M001/S01/T01"); + + expect( + recordExecutorRefusalEscalation(project, "execute-task", "M001/S01/T01"), + ).toBe(1); + expect( + recordExecutorRefusalEscalation(project, "execute-task", "M001/S01/T01"), + ).toBe(2); + + const stateAfterT01 = readAutonomousSolverState(project); + expect(stateAfterT01.executorRefusalEscalations).toBe(2); + + // Same iteration advance preserves the counter (refusal budget is + // per-unit, not per-iteration). 
+ beginAutonomousSolverIteration(project, "execute-task", "M001/S01/T01"); + expect(readAutonomousSolverState(project).executorRefusalEscalations).toBe( + 2, + ); + + // Switching to a new unit resets the counter — a fresh unit gets a + // fresh budget. + beginAutonomousSolverIteration(project, "execute-task", "M001/S01/T02"); + expect(readAutonomousSolverState(project).executorRefusalEscalations).toBe( + 0, + ); + + // Mismatched unit ids do not mutate state and return 0. + expect( + recordExecutorRefusalEscalation(project, "execute-task", "M001/S01/T01"), + ).toBe(0); + expect(readAutonomousSolverState(project).executorRefusalEscalations).toBe( + 0, + ); + }); + + test("MAX_EXECUTOR_REFUSAL_ESCALATIONS_is_a_positive_integer", () => { + // Budget must be > 0 (otherwise the refusal handler never re-dispatches) + // and must be a finite integer (used in comparisons). + expect(Number.isInteger(MAX_EXECUTOR_REFUSAL_ESCALATIONS)).toBe(true); + expect(MAX_EXECUTOR_REFUSAL_ESCALATIONS).toBeGreaterThan(0); + }); + test("appendAutonomousSolverCheckpoint_writes_pdd_projection_and_history", () => { const project = makeProject(); beginAutonomousSolverIteration(project, "execute-task", "M001/S01/T01");