fix(auto): re-dispatch on executor refusal instead of pausing

The autonomous solver was designed precisely to handle executor refusals
(per its own docstring: "the solver role MUST stay on a stable, agentic,
refusal-resistant model independent of any per-unit routing choices"),
but the refusal handler short-circuited past it and emitted a `blocked`
checkpoint, which assessAutonomousSolverTurn unconditionally turns into
a `pause` — defeating autonomous mode every time the router selects a
capability-mismatched executor.

The 1h model-block added in 3f2babb5d was the right primitive but had no
consumer: nothing actually re-dispatched the unit after the model was
blocked, so the block only mattered if the operator manually unpaused
and retried.

This change wires the missing consumer:

- Add per-unit `executorRefusalEscalations` counter to solver state plus
  a `recordExecutorRefusalEscalation` helper. Counter persists across
  iterations of the same unit and resets on unit change.
- On `executor-refused`: block the refusing model and slice-routing entry
  (unchanged), file self-feedback (unchanged), then synthesize a
  `continue` checkpoint and return `{ action: "continue" }` directly so
  the auto loop re-dispatches the unit. selectAndApplyModel will skip
  the now-blocked model and pick a higher-tier fallback.
- Bounded by `MAX_EXECUTOR_REFUSAL_ESCALATIONS=3`. When the budget is
  exhausted (an entire fallback chain refused on the same unit), fall
  back to the legacy blocked-and-pause path so the operator can review.
- Bypass `assessAutonomousSolverTurn` on the refusal-continue path
  because its no-op detector would (correctly) reject a continue over a
  refusal transcript — but here the "no-op" is the whole point: we are
  explicitly swapping the routed model.

Tests cover the new state field's init/persistence/reset semantics and
the constant's invariants. Full SF extension suite (1369 tests) passes.

Refs: sf-mp3bm6u0-2fskt8 (now fully addressed, not just AC1)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Mikael Hugo 2026-05-13 21:49:51 +02:00
parent 288a2a5fd7
commit 5a2618c05d
3 changed files with 215 additions and 43 deletions

View file

@ -35,8 +35,10 @@ import {
classifyExecutorRefusal,
consumePendingAutonomousSolverSteering,
getConfiguredAutonomousSolverMaxIterations,
MAX_EXECUTOR_REFUSAL_ESCALATIONS,
readAutonomousSolverState,
recordAutonomousSolverMissingCheckpointRetry,
recordExecutorRefusalEscalation,
} from "../autonomous-solver.js";
import { blockModel } from "../blocked-models.js";
import { resumeAutoAfterProviderDelay } from "../bootstrap/provider-error-resume.js";
@ -745,8 +747,13 @@ export async function runUnitPhase(ic, iterData, loopState, sidecarItem) {
? { action: "none" }
: { action: "pending" };
// Refusal short-circuit: when the executor model returned a generic refusal,
// synthesize a blocked checkpoint immediately and skip the solver pass.
// Refusal handling: when the executor model returned a generic refusal, the
// model is capability-mismatched for this unit. Block it (so selectAndApplyModel
// excludes it on the next dispatch), evict slice routing, file self-feedback,
// and re-dispatch with a tier-escalated model — bounded by MAX_EXECUTOR_REFUSAL_
// ESCALATIONS so a fallback chain of refusing models cannot loop forever. Only
// when the escalation budget is exhausted do we fall back to a blocked
// checkpoint that pauses the loop for operator intervention.
if (unitResult.status !== "cancelled" && refusal) {
const executorModel =
s.currentUnitModel?.provider && s.currentUnitModel?.id
@ -760,9 +767,8 @@ export async function runUnitPhase(ic, iterData, loopState, sidecarItem) {
} catch {
// best-effort
}
// Temporarily block the refusing model so the router skips it on retry.
// This satisfies AC1 of sf-mp3bm6u0-2fskt8: the executor model is
// escalated because the blocked model will be excluded from selection.
// Block the refusing model so the router skips it on retry. The next
// selectAndApplyModel call will pick a higher-tier fallback.
try {
const refusedProvider = s.currentUnitModel?.provider ?? "";
const refusedId = s.currentUnitModel?.id ?? "";
@ -778,46 +784,14 @@ export async function runUnitPhase(ic, iterData, loopState, sidecarItem) {
} catch {
// best-effort — blocking must not break the refusal handler
}
try {
appendAutonomousSolverCheckpoint(s.basePath, {
unitType,
unitId,
outcome: "blocked",
summary: `Executor (${executorModel}) refused the task. Pattern: ${refusal.pattern}. The model has been temporarily blocked and will be skipped on retry; escalate the executor model or unblock this unit manually.`,
completedItems: [],
remainingItems: [
`Re-run ${unitType} ${unitId} with a more capable executor model — current routing selected an incapable model.`,
],
verificationEvidence: [
`executor-refusal-pattern=${refusal.pattern}`,
`executor-model=${executorModel}`,
],
blockerReason: `executor-refused (${refusal.pattern})`,
pdd: {
purpose:
"Surface executor refusals as protocol-level blockers instead of synthesizing fake progress.",
consumer: "autonomous loop pause-handler",
contract:
"On `executor-refused`, the loop pauses and self-feedback is filed; the operator must escalate the executor model.",
failureBoundary:
"If the operator does not escalate, the same refusal will recur on next dispatch.",
evidence: "classifyExecutorRefusal matched a refusal pattern",
nonGoals:
"This does not retry the unit automatically — capability mismatches require operator judgement (or a future automatic escalation policy).",
invariants: "Refusal never silently synthesizes a continue.",
assumptions:
"The refusal pattern set in classifyExecutorRefusal is conservative — false positives are rare and require operator review.",
},
});
} catch {
// If synthesis fails, fall through to solver pass
}
// File self-feedback for observability (operator-visible signal that
// this unit type is being routed to capability-mismatched models).
try {
const feedback = recordSelfFeedback(
{
kind: "executor-refused",
severity: "high",
summary: `Executor ${executorModel} refused ${unitType} ${unitId} with pattern ${refusal.pattern}; loop paused to prevent fake-progress synthesis.`,
summary: `Executor ${executorModel} refused ${unitType} ${unitId} with pattern ${refusal.pattern}; model blocked and re-dispatching with tier escalation.`,
evidence: [
`unit=${unitType} ${unitId}`,
`executor=${executorModel}`,
@ -826,9 +800,9 @@ export async function runUnitPhase(ic, iterData, loopState, sidecarItem) {
refusal.evidence ?? "",
].join("\n"),
suggestedFix:
"Escalate the executor model for this unit (or unit type) — the currently routed model lacks the agentic capabilities required. Long-term: separate the executor and autonomous-solver roles per ADR-0079 and pin the solver to a stable agentic model.",
"Routing repeatedly selects a capability-mismatched executor for this unit type. Update the router's tier-floor for this unit type so the refusing tier is excluded by default, or add the refusing model to a permanent block list.",
acceptanceCriteria: [
"Executor model for this unit type is escalated to a model that passes the refusal-resistant tier.",
"Router's effective tier-floor for this unit type excludes the refusing model class without requiring a runtime block.",
"Refusal pattern is added to classifyExecutorRefusal if a novel phrasing slipped through.",
],
occurredIn: { unitType, unitId },
@ -853,8 +827,120 @@ export async function runUnitPhase(ic, iterData, loopState, sidecarItem) {
} catch {
// self-feedback is observability; never block loop progression on it
}
// Bounded re-dispatch: if the per-unit escalation budget is not yet
// exhausted, synthesize a `continue` checkpoint and return the
// re-dispatch action directly. We bypass assessAutonomousSolverTurn
// here because the no-op detector would (correctly) reject a continue
// over a refusal transcript — but in this case the "no-op" is the whole
// point, since we are explicitly changing the routed model on retry.
const escalation = recordExecutorRefusalEscalation(
s.basePath,
unitType,
unitId,
);
if (escalation <= MAX_EXECUTOR_REFUSAL_ESCALATIONS) {
try {
appendAutonomousSolverCheckpoint(s.basePath, {
unitType,
unitId,
outcome: "continue",
summary: `Executor (${executorModel}) refused. Pattern: ${refusal.pattern}. Model blocked; re-dispatching with model excluded (escalation ${escalation}/${MAX_EXECUTOR_REFUSAL_ESCALATIONS}).`,
completedItems: [],
remainingItems: [
`Re-dispatch ${unitType} ${unitId}; refusing model is blocked so selectAndApplyModel will pick a higher-tier fallback.`,
],
verificationEvidence: [
`executor-refusal-pattern=${refusal.pattern}`,
`executor-model=${executorModel}`,
`refusal-escalation=${escalation}/${MAX_EXECUTOR_REFUSAL_ESCALATIONS}`,
],
pdd: {
purpose:
"Auto-escalate capability-mismatched executor selections by blocking the refusing model and re-dispatching, so SF completes work without operator intervention when a higher-tier model exists in the fallback chain.",
consumer: "autonomous loop continue handler",
contract:
"On `executor-refused`, block the refusing model and emit `continue` so the loop re-dispatches with the blocked model excluded — bounded by MAX_EXECUTOR_REFUSAL_ESCALATIONS to prevent runaway loops over an all-refusing fallback chain.",
failureBoundary:
"After MAX_EXECUTOR_REFUSAL_ESCALATIONS refusals on the same unit, fall back to the legacy blocked-and-pause path so the operator can intervene.",
evidence:
"classifyExecutorRefusal matched a refusal pattern; the responsible model is now in blocked-models.json with a 1-hour TTL.",
nonGoals:
"This does not change the router's tier-floor — repeated refusals across units indicate the router still needs tuning (filed via self-feedback).",
invariants:
"The refusing model is never silently retried; it is always blocked before re-dispatch.",
assumptions:
"The fallback chain in effectiveModelConfig contains at least one higher-tier model that does not also refuse.",
},
});
} catch {
// If synthesis fails, fall through to the budget-exhausted branch
// below so the loop still has a defined outcome.
}
deps.emitJournalEvent({
ts: new Date().toISOString(),
flowId: ic.flowId,
seq: ic.nextSeq(),
eventType: "executor-refused-redispatch",
data: {
unitType,
unitId,
executorModel,
pattern: refusal.pattern,
escalation,
maxEscalations: MAX_EXECUTOR_REFUSAL_ESCALATIONS,
},
});
ctx.ui.notify(
`Executor ${executorModel} refused ${unitType} ${unitId} (${refusal.pattern}); blocked and re-dispatching (escalation ${escalation}/${MAX_EXECUTOR_REFUSAL_ESCALATIONS}).`,
"warning",
);
return {
action: "continue",
data: {
unitStartedAt: s.currentUnit?.startedAt,
requestDispatchedAt: unitResult.requestDispatchedAt,
},
};
}
// Escalation budget exhausted: emit the legacy blocked checkpoint and
// let the existing pause path take over so the operator can intervene.
try {
appendAutonomousSolverCheckpoint(s.basePath, {
unitType,
unitId,
outcome: "blocked",
summary: `Executor (${executorModel}) refused the task. Pattern: ${refusal.pattern}. Refusal-escalation budget exhausted (${escalation}/${MAX_EXECUTOR_REFUSAL_ESCALATIONS}) — every model tried in the fallback chain refused. Operator must escalate routing or add a permanent block.`,
completedItems: [],
remainingItems: [
`Re-run ${unitType} ${unitId} with a more capable executor model — the entire ${MAX_EXECUTOR_REFUSAL_ESCALATIONS}-step fallback chain refused.`,
],
verificationEvidence: [
`executor-refusal-pattern=${refusal.pattern}`,
`executor-model=${executorModel}`,
`refusal-escalations-exhausted=${escalation}`,
],
blockerReason: `executor-refused-budget-exhausted (${refusal.pattern})`,
pdd: {
purpose:
"Surface executor refusals as protocol-level blockers when bounded auto-escalation has been exhausted.",
consumer: "autonomous loop pause-handler",
contract:
"After MAX_EXECUTOR_REFUSAL_ESCALATIONS refusals on the same unit, pause the loop and require operator intervention.",
failureBoundary:
"If the operator does not escalate, the same refusal will recur on next dispatch.",
evidence: `${escalation} consecutive executor refusals on this unit`,
nonGoals:
"This does not retry the unit automatically beyond the budget — capability mismatches that defeat the entire fallback chain require operator judgement.",
invariants: "Refusal never silently synthesizes a continue.",
assumptions:
"The refusal pattern set in classifyExecutorRefusal is conservative — false positives are rare and require operator review.",
},
});
} catch {
// If synthesis fails, fall through to solver pass
}
ctx.ui.notify(
`Executor ${executorModel} refused ${unitType} ${unitId} (${refusal.pattern}); autonomous loop pausing instead of synthesizing fake progress. See SELF-FEEDBACK.md for escalation guidance.`,
`Executor refused ${unitType} ${unitId} after ${MAX_EXECUTOR_REFUSAL_ESCALATIONS} tier escalations; pausing for operator review. See SELF-FEEDBACK.md.`,
"error",
);
solverAssessment = assessAutonomousSolverTurn(s.basePath, unitType, unitId);

View file

@ -269,11 +269,51 @@ export function beginAutonomousSolverIteration(
: [],
// Safety cap: how many checkpoints have been written this iteration
checkpointCountThisIteration: 0,
// Per-unit budget for executor-refusal-driven re-dispatches. Reset when
// the unit changes; persists across iterations of the same unit so a
// resumed run does not silently get a fresh budget.
executorRefusalEscalations: sameUnit(existing, unitType, unitId)
? Number(existing.executorRefusalEscalations) || 0
: 0,
};
writeState(basePath, state);
return state;
}
/**
 * Maximum number of executor-refusal-driven re-dispatches for a single unit
 * before the loop falls back to the legacy blocked-and-pause behavior.
 *
 * The refusal branch of runUnitPhase compares the post-increment counter
 * returned by recordExecutorRefusalEscalation against this value with `<=`,
 * so with a budget of three the first three refusals on a unit each trigger
 * a re-dispatch and the fourth refusal pauses the loop. That is intended to
 * cover a typical tier ladder depth without enabling runaway loops if every
 * fallback model also refuses.
 *
 * NOTE(review): an earlier version of this comment described the budget as
 * "two tier escalations after the initial refusal"; confirm whether the
 * caller's `<=` comparison (three re-dispatches) is the intended bound.
 */
export const MAX_EXECUTOR_REFUSAL_ESCALATIONS = 3;
/**
 * Bump the per-unit executor-refusal escalation counter held in the persisted
 * solver state and report the updated value.
 *
 * Purpose: when classifyExecutorRefusal fires, the refusal handler blocks the
 * refusing model and re-dispatches the unit so selectAndApplyModel picks a
 * higher-tier alternative. This counter bounds that retry loop so an entire
 * fallback chain of refusing models cannot loop forever; once the budget is
 * exhausted, the unit pauses for operator intervention via the legacy
 * blocked-checkpoint path.
 *
 * Consumer: runUnitPhase refusal branch in auto/phases-unit.js.
 *
 * Behavior:
 * - The state is loaded with readJson(statePath(basePath)).
 * - If the loaded state does not belong to the given unit (per sameUnit),
 *   nothing is written and 0 is returned.
 * - Otherwise the stored counter (a missing or non-numeric value counts as
 *   0) is incremented, written back together with a fresh `updatedAt`
 *   stamp, and the new count is returned.
 *
 * @param basePath Location handed to statePath/writeState (presumably the
 *   project root — confirm against callers).
 * @param unitType Unit type to match against the stored state.
 * @param unitId Unit id to match against the stored state.
 * @returns The incremented counter, or 0 when the stored state is for a
 *   different unit.
 */
export function recordExecutorRefusalEscalation(basePath, unitType, unitId) {
  const persisted = readJson(statePath(basePath));
  if (sameUnit(persisted, unitType, unitId)) {
    // Coerce defensively: anything that is not a usable number starts at 0.
    const previousCount = Number(persisted.executorRefusalEscalations) || 0;
    const bumpedCount = previousCount + 1;
    const updatedState = {
      ...persisted,
      executorRefusalEscalations: bumpedCount,
      updatedAt: nowIso(),
    };
    writeState(basePath, updatedState);
    return bumpedCount;
  }
  // State belongs to another unit (or is absent): leave it untouched.
  return 0;
}
/**
* Build the PDD autonomous solver prompt block appended to unit prompts.
*

View file

@ -18,9 +18,11 @@ import {
getConfiguredAutonomousSolverMaxIterations,
getSolverPhase,
isNoOpExecutorTranscript,
MAX_EXECUTOR_REFUSAL_ESCALATIONS,
readAutonomousSolverState,
readLatestAutonomousSolverCheckpoint,
recordAutonomousSolverMissingCheckpointRetry,
recordExecutorRefusalEscalation,
} from "../autonomous-solver.js";
let tempDirs = [];
@ -84,6 +86,50 @@ describe("autonomous solver", () => {
expect(next.iteration).toBe(1);
});
test("recordExecutorRefusalEscalation_increments_per_unit_and_resets_on_new_unit", () => {
  const projectDir = makeProject();
  const unitKind = "execute-task";
  const firstTask = "M001/S01/T01";
  const secondTask = "M001/S01/T02";

  beginAutonomousSolverIteration(projectDir, unitKind, firstTask);

  // Each refusal recorded against the active unit bumps the counter by one.
  const afterFirstRefusal = recordExecutorRefusalEscalation(
    projectDir,
    unitKind,
    firstTask,
  );
  expect(afterFirstRefusal).toBe(1);
  const afterSecondRefusal = recordExecutorRefusalEscalation(
    projectDir,
    unitKind,
    firstTask,
  );
  expect(afterSecondRefusal).toBe(2);

  // The returned value is also what ends up in the persisted state.
  const persistedAfterRefusals = readAutonomousSolverState(projectDir);
  expect(persistedAfterRefusals.executorRefusalEscalations).toBe(2);

  // Starting another iteration of the same unit keeps the counter: the
  // refusal budget is scoped to the unit, not to a single iteration.
  beginAutonomousSolverIteration(projectDir, unitKind, firstTask);
  const persistedSameUnit = readAutonomousSolverState(projectDir);
  expect(persistedSameUnit.executorRefusalEscalations).toBe(2);

  // Moving on to a different unit hands that unit a fresh budget.
  beginAutonomousSolverIteration(projectDir, unitKind, secondTask);
  const persistedNewUnit = readAutonomousSolverState(projectDir);
  expect(persistedNewUnit.executorRefusalEscalations).toBe(0);

  // Recording against a unit that is no longer active returns 0 and leaves
  // the persisted counter untouched.
  const staleResult = recordExecutorRefusalEscalation(
    projectDir,
    unitKind,
    firstTask,
  );
  expect(staleResult).toBe(0);
  const persistedAfterStale = readAutonomousSolverState(projectDir);
  expect(persistedAfterStale.executorRefusalEscalations).toBe(0);
});
test("MAX_EXECUTOR_REFUSAL_ESCALATIONS_is_a_positive_integer", () => {
  // A zero or negative budget would mean the refusal handler never
  // re-dispatches, and the value is compared numerically, so it has to be a
  // finite integer strictly above zero.
  const budget = MAX_EXECUTOR_REFUSAL_ESCALATIONS;
  expect(Number.isInteger(budget)).toBe(true);
  expect(budget).toBeGreaterThan(0);
});
test("appendAutonomousSolverCheckpoint_writes_pdd_projection_and_history", () => {
const project = makeProject();
beginAutonomousSolverIteration(project, "execute-task", "M001/S01/T01");