diff --git a/src/resources/extensions/sf/auto-model-selection.ts b/src/resources/extensions/sf/auto-model-selection.ts index db9cdf6fb..c5a27c2ec 100644 --- a/src/resources/extensions/sf/auto-model-selection.ts +++ b/src/resources/extensions/sf/auto-model-selection.ts @@ -18,6 +18,38 @@ import { getSessionModelOverride } from "./session-model-override.js"; import { logWarning } from "./workflow-logger.js"; import { resolveUokFlags } from "./uok/flags.js"; import { applyModelPolicyFilter } from "./uok/model-policy.js"; +import { getRequiredWorkflowToolsForAutoUnit } from "./workflow-mcp.js"; + +/** + * Thrown when the model-policy gate rejects every candidate model for a unit + * dispatch (#4959 / #4681 / #4850). The auto-loop catches this specifically + * to classify the unit as `blocked` rather than counting it as a retryable + * iteration error — pre-send policy denial is a configuration problem, not a + * transient runtime failure, so retrying just burns the consecutive-error + * budget toward a hard stop. + */ +export class ModelPolicyDispatchBlockedError extends Error { + readonly unitType: string; + readonly unitId: string; + readonly reasons: ReadonlyArray<{ provider: string; modelId: string; reason: string }>; + constructor( + unitType: string, + unitId: string, + reasons: ReadonlyArray<{ provider: string; modelId: string; reason: string }>, + ) { + const summary = reasons.length === 0 + ? "no candidate models" + : reasons + .slice(0, 4) + .map((r) => `${r.provider}/${r.modelId} (${r.reason})`) + .join("; "); + super(`Model policy denied dispatch for ${unitType}/${unitId} before prompt send. Rejected: ${summary}`); + this.name = "ModelPolicyDispatchBlockedError"; + this.unitType = unitType; + this.unitId = unitId; + this.reasons = reasons; + } +} export interface ModelSelectionResult { /** Routing metadata for metrics recording */ @@ -26,6 +58,51 @@ export interface ModelSelectionResult { appliedModel: Model | null; } +// Baseline active-tool set per-`pi` instance, captured the first time +// `selectAndApplyModel` runs against that instance during an auto session +// and re-applied before each subsequent dispatch. WeakMap so that test +// fakes / disposed sessions are garbage-collected normally. See +// #4959 / #4681 cross-unit poisoning notes at the call site below. +// +// LIFECYCLE: the baseline is tied to a single auto session, NOT to the +// lifetime of the `pi` instance (which can outlive many auto runs and have +// the user mutate tools between them). `clearToolBaseline` MUST be called +// at auto start AND auto stop so that a second `/sf auto` run on the same +// `pi` does not silently restore a stale snapshot from the prior run and +// undo any tool changes the user made between sessions. +const TOOL_BASELINE = new WeakMap(); + +/** + * Drop the captured tool baseline for `pi` so the next `selectAndApplyModel` + * call re-captures from the live active set. Wired into `startAuto` and + * `stopAuto` in `auto.ts` to bound the baseline to a single auto session. + * + * Safe to call when no baseline is recorded (no-op). + */ +export function clearToolBaseline(pi: ExtensionAPI | object): void { + TOOL_BASELINE.delete(pi as unknown as object); +} + +function restoreToolBaseline(pi: ExtensionAPI): void { + const key = pi as unknown as object; + const baseline = TOOL_BASELINE.get(key); + if (baseline === undefined) { + // First call: capture the canonical pre-dispatch tool set. At auto-mode + // start the active set has not yet been narrowed for any provider. + // Guarded against test fakes that omit getActiveTools — record an empty + // baseline so subsequent calls don't keep re-probing. + const initial = typeof pi.getActiveTools === "function" ? pi.getActiveTools() : []; + TOOL_BASELINE.set(key, [...initial]); + return; + } + // Restore baseline before the next unit reads getActiveTools / applies + // post-selection adjustToolSet. Older fakes that omit setActiveTools are + // tolerated — the test asserts call order on real fakes. + if (typeof pi.setActiveTools === "function") { + pi.setActiveTools([...baseline]); + } +} + export function resolvePreferredModelConfig( unitType: string, autoModeStartModel: { provider: string; id: string; flatRateCtx?: FlatRateContext } | null, @@ -77,6 +154,21 @@ export async function selectAndApplyModel( /** Explicit /sf model pin captured at bootstrap for long-running auto loops. */ sessionModelOverride?: { provider: string; id: string } | null, ): Promise { + // ── Restore active-tool baseline before policy evaluation (#4959, #4681, #4850) ── + // Per-unit narrowing at the bottom of this function calls + // `pi.setActiveTools(finalToolNames)` and monotonically narrows the active + // set across units. Without restoration, a previously-dispatched unit on a + // narrow-API provider (e.g. openai-completions) leaves the active set + // missing tools that the next unit's selected model fully supports, but + // `pi.getActiveTools()` snapshot-as-hard-gate (the old behaviour) blocked + // dispatch with "tool policy denied" anyway. + // + // The baseline is captured once per `pi` instance via a WeakMap and + // re-applied here so each unit starts from a clean slate. Soft adaptation + // (adjustToolSet at the bottom of this function) still trims for the + // selected model. + restoreToolBaseline(pi); + const uokFlags = resolveUokFlags(prefs); const persistModelChanges = resolvePersistModelChanges(); const effectiveSessionModelOverride = sessionModelOverride === undefined @@ -143,7 +235,16 @@ export async function selectAndApplyModel( ? extractTaskMetadata(unitId, basePath) : undefined; + let policyDenyReasons: Array<{ provider: string; modelId: string; reason: string }> = []; if (uokFlags.modelPolicy) { + // Use the workflow-spec required-tool subset for the unit type rather + // than the live `pi.getActiveTools()` snapshot (#4959). The active set + // is poisoned by per-unit narrowing for narrow-API providers — using it + // as a hard gate promotes soft adaptation (adjustToolSet) into a layering + // violation that throws before dispatch. The smaller workflow-required + // subset reflects what the unit actually needs; soft adaptation post- + // selection still trims provider-incompatible tools. + const requiredTools = getRequiredWorkflowToolsForAutoUnit(unitType); const policy = applyModelPolicyFilter( availableModels, { @@ -154,15 +255,18 @@ export async function selectAndApplyModel( taskMetadata: taskMetadataForPolicy, currentProvider: ctx.model?.provider, allowCrossProvider: routingConfig.cross_provider !== false, - requiredTools: pi.getActiveTools(), + requiredTools, }, ); routingEligibleModels = policy.eligible; policyAllowedModelKeys = new Set( policy.eligible.map((m) => `${m.provider.toLowerCase()}/${m.id.toLowerCase()}`), ); + policyDenyReasons = policy.decisions + .filter((d) => !d.allowed) + .map((d) => ({ provider: d.provider, modelId: d.modelId, reason: d.reason })); if (routingEligibleModels.length === 0) { - throw new Error(`Model policy denied all candidate models for ${unitType}/${unitId}`); + throw new ModelPolicyDispatchBlockedError(unitType, unitId, policyDenyReasons); } } @@ -411,7 +515,7 @@ export async function selectAndApplyModel( } if (uokFlags.modelPolicy && policyAllowedModelKeys && !attemptedPolicyEligible) { - throw new Error(`Model policy denied dispatch for ${unitType}/${unitId} before prompt send`); + throw new ModelPolicyDispatchBlockedError(unitType, unitId, policyDenyReasons); } } else if (autoModeStartModel) { // No model preference for this unit type — re-apply the model captured diff --git a/src/resources/extensions/sf/auto.ts b/src/resources/extensions/sf/auto.ts index a314e96f8..cf37dd5f3 100644 --- a/src/resources/extensions/sf/auto.ts +++ b/src/resources/extensions/sf/auto.ts @@ -87,7 +87,7 @@ import { } from "./auto-tool-tracking.js"; import { closeoutUnit } from "./auto-unit-closeout.js"; import { recoverTimedOutUnit } from "./auto-timeout-recovery.js"; -import { selectAndApplyModel, resolveModelId } from "./auto-model-selection.js"; +import { selectAndApplyModel, resolveModelId, clearToolBaseline } from "./auto-model-selection.js"; import { resetRoutingHistory, recordOutcome } from "./routing-history.js"; import { checkPostUnitHooks, @@ -992,6 +992,12 @@ export async function stopAuto( restoreProjectRootEnv(); restoreMilestoneLockEnv(); + // Drop the active-tool baseline so a subsequent /sf auto run on the + // same `pi` instance recaptures from the live tool set rather than + // restoring this session's snapshot and silently undoing any tool + // changes the user made between sessions (#4959 / CodeRabbit). + if (pi) clearToolBaseline(pi); + // Reset all session state in one call s.reset(); } @@ -1278,6 +1284,15 @@ export async function startAuto( return; } + // On a *fresh* start, drop any stale active-tool baseline left by a prior + // auto session that didn't run stopAuto cleanly. Skip on resume: pauseAuto + // leaves the last provider-trimmed active tools in place, so clearing here + // would let the next selectAndApplyModel recapture that already-narrowed + // set as the new baseline — exactly the cross-unit poisoning this PR is + // fixing (#4959 / CodeRabbit Major). The pre-pause baseline survives in + // the WeakMap keyed by `pi`. + if (!s.paused) clearToolBaseline(pi); + const requestedStepMode = options?.step ?? false; const interruptedAssessment = options?.interrupted ?? null; if (options?.milestoneLock !== undefined) { diff --git a/src/resources/extensions/sf/auto/loop.ts b/src/resources/extensions/sf/auto/loop.ts index edf3b8231..146aed292 100644 --- a/src/resources/extensions/sf/auto/loop.ts +++ b/src/resources/extensions/sf/auto/loop.ts @@ -28,6 +28,7 @@ import { } from "./phases.js"; import { debugLog } from "../debug-logger.js"; import { isInfrastructureError, isTransientCooldownError, getCooldownRetryAfterMs, COOLDOWN_FALLBACK_WAIT_MS, MAX_COOLDOWN_RETRIES } from "./infra-errors.js"; +import { ModelPolicyDispatchBlockedError } from "../auto-model-selection.js"; import { resolveEngine } from "../engine-resolver.js"; import { logWarning } from "../workflow-logger.js"; import { sfRoot } from "../paths.js"; @@ -597,6 +598,51 @@ export async function autoLoop( // runFinalize leave the journal incomplete, making diagnosis harder. deps.emitJournalEvent({ ts: new Date().toISOString(), flowId, seq: nextSeq(), eventType: "iteration-end", data: { iteration, error: msg } }); + // ── Pre-send model-policy block: not a retryable error (#4959 / #4850) ── + // The model-policy gate runs before the prompt is sent. When every + // candidate model is denied (cross-provider disabled + flat-rate + // baseline + tool-policy denial), retrying the same unit produces the + // same denial — burning the consecutive-error budget toward a 3-strike + // hard stop and corrupting auto-mode state. Pause for user attention + // instead, with the per-model deny reasons surfaced from the typed error. + if (loopErr instanceof ModelPolicyDispatchBlockedError) { + debugLog("autoLoop", { + phase: "model-policy-blocked", + iteration, + unitType: loopErr.unitType, + unitId: loopErr.unitId, + reasons: loopErr.reasons, + }); + ctx.ui.notify( + `Auto-mode paused: model-policy denied dispatch for ${loopErr.unitType}/${loopErr.unitId}. ${msg}`, + "error", + ); + deps.emitJournalEvent({ + ts: new Date().toISOString(), + flowId, + seq: nextSeq(), + eventType: "unit-end", + data: { + unitType: loopErr.unitType, + unitId: loopErr.unitId, + status: "blocked", + reason: "model-policy-dispatch-blocked", + reasons: loopErr.reasons, + }, + }); + // Carry the blocked unit identity into the turn-result observer: + // the throw originated inside dispatch, so observedUnitType/Id were + // not assigned by the success path — but the typed error already names + // the unit (#4959 / CodeRabbit). + observedUnitType = loopErr.unitType; + observedUnitId = loopErr.unitId; + await deps.pauseAuto(ctx, pi); + finishTurn("paused", "manual-attention", msg); + // Do NOT increment consecutiveErrors — the failure is configuration, + // not a transient runtime fault. + break; + } + // ── Infrastructure errors: immediate stop, no retry ── // These are unrecoverable (disk full, OOM, etc.). Retrying just burns // LLM budget on guaranteed failures.