port gsd2 #4961: stop using active-tool snapshot as model-policy gate
Fixes a bug where per-unit tool narrowing poisoned the policy gate for subsequent units, causing "Model policy denied dispatch before prompt send" errors on complete-slice and discuss-milestone (100% Win repro). Four-part port from gsd2@817031b2a: - ModelPolicyDispatchBlockedError class with per-model deny reasons - TOOL_BASELINE WeakMap + clearToolBaseline/restoreToolBaseline lifecycle - auto-model-selection: use getRequiredWorkflowToolsForAutoUnit as requiredTools - auto/loop: catch ModelPolicyDispatchBlockedError as non-retryable (pause) - auto.ts: wire clearToolBaseline at startAuto (fresh only) and stopAuto Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
4fdd8700a3
commit
6c36d62f35
3 changed files with 169 additions and 4 deletions
|
|
@ -18,6 +18,38 @@ import { getSessionModelOverride } from "./session-model-override.js";
|
|||
import { logWarning } from "./workflow-logger.js";
|
||||
import { resolveUokFlags } from "./uok/flags.js";
|
||||
import { applyModelPolicyFilter } from "./uok/model-policy.js";
|
||||
import { getRequiredWorkflowToolsForAutoUnit } from "./workflow-mcp.js";
|
||||
|
||||
/**
|
||||
* Thrown when the model-policy gate rejects every candidate model for a unit
|
||||
* dispatch (#4959 / #4681 / #4850). The auto-loop catches this specifically
|
||||
* to classify the unit as `blocked` rather than counting it as a retryable
|
||||
* iteration error — pre-send policy denial is a configuration problem, not a
|
||||
* transient runtime failure, so retrying just burns the consecutive-error
|
||||
* budget toward a hard stop.
|
||||
*/
|
||||
export class ModelPolicyDispatchBlockedError extends Error {
|
||||
readonly unitType: string;
|
||||
readonly unitId: string;
|
||||
readonly reasons: ReadonlyArray<{ provider: string; modelId: string; reason: string }>;
|
||||
constructor(
|
||||
unitType: string,
|
||||
unitId: string,
|
||||
reasons: ReadonlyArray<{ provider: string; modelId: string; reason: string }>,
|
||||
) {
|
||||
const summary = reasons.length === 0
|
||||
? "no candidate models"
|
||||
: reasons
|
||||
.slice(0, 4)
|
||||
.map((r) => `${r.provider}/${r.modelId} (${r.reason})`)
|
||||
.join("; ");
|
||||
super(`Model policy denied dispatch for ${unitType}/${unitId} before prompt send. Rejected: ${summary}`);
|
||||
this.name = "ModelPolicyDispatchBlockedError";
|
||||
this.unitType = unitType;
|
||||
this.unitId = unitId;
|
||||
this.reasons = reasons;
|
||||
}
|
||||
}
|
||||
|
||||
export interface ModelSelectionResult {
|
||||
/** Routing metadata for metrics recording */
|
||||
|
|
@ -26,6 +58,51 @@ export interface ModelSelectionResult {
|
|||
appliedModel: Model<Api> | null;
|
||||
}
|
||||
|
||||
// Baseline active-tool set per-`pi` instance, captured the first time
|
||||
// `selectAndApplyModel` runs against that instance during an auto session
|
||||
// and re-applied before each subsequent dispatch. WeakMap so that test
|
||||
// fakes / disposed sessions are garbage-collected normally. See
|
||||
// #4959 / #4681 cross-unit poisoning notes at the call site below.
|
||||
//
|
||||
// LIFECYCLE: the baseline is tied to a single auto session, NOT to the
|
||||
// lifetime of the `pi` instance (which can outlive many auto runs and have
|
||||
// the user mutate tools between them). `clearToolBaseline` MUST be called
|
||||
// at auto start AND auto stop so that a second `/sf auto` run on the same
|
||||
// `pi` does not silently restore a stale snapshot from the prior run and
|
||||
// undo any tool changes the user made between sessions.
|
||||
const TOOL_BASELINE = new WeakMap<object, string[]>();
|
||||
|
||||
/**
|
||||
* Drop the captured tool baseline for `pi` so the next `selectAndApplyModel`
|
||||
* call re-captures from the live active set. Wired into `startAuto` and
|
||||
* `stopAuto` in `auto.ts` to bound the baseline to a single auto session.
|
||||
*
|
||||
* Safe to call when no baseline is recorded (no-op).
|
||||
*/
|
||||
export function clearToolBaseline(pi: ExtensionAPI | object): void {
|
||||
TOOL_BASELINE.delete(pi as unknown as object);
|
||||
}
|
||||
|
||||
function restoreToolBaseline(pi: ExtensionAPI): void {
|
||||
const key = pi as unknown as object;
|
||||
const baseline = TOOL_BASELINE.get(key);
|
||||
if (baseline === undefined) {
|
||||
// First call: capture the canonical pre-dispatch tool set. At auto-mode
|
||||
// start the active set has not yet been narrowed for any provider.
|
||||
// Guarded against test fakes that omit getActiveTools — record an empty
|
||||
// baseline so subsequent calls don't keep re-probing.
|
||||
const initial = typeof pi.getActiveTools === "function" ? pi.getActiveTools() : [];
|
||||
TOOL_BASELINE.set(key, [...initial]);
|
||||
return;
|
||||
}
|
||||
// Restore baseline before the next unit reads getActiveTools / applies
|
||||
// post-selection adjustToolSet. Older fakes that omit setActiveTools are
|
||||
// tolerated — the test asserts call order on real fakes.
|
||||
if (typeof pi.setActiveTools === "function") {
|
||||
pi.setActiveTools([...baseline]);
|
||||
}
|
||||
}
|
||||
|
||||
export function resolvePreferredModelConfig(
|
||||
unitType: string,
|
||||
autoModeStartModel: { provider: string; id: string; flatRateCtx?: FlatRateContext } | null,
|
||||
|
|
@ -77,6 +154,21 @@ export async function selectAndApplyModel(
|
|||
/** Explicit /sf model pin captured at bootstrap for long-running auto loops. */
|
||||
sessionModelOverride?: { provider: string; id: string } | null,
|
||||
): Promise<ModelSelectionResult> {
|
||||
// ── Restore active-tool baseline before policy evaluation (#4959, #4681, #4850) ──
|
||||
// Per-unit narrowing at the bottom of this function calls
|
||||
// `pi.setActiveTools(finalToolNames)` and monotonically narrows the active
|
||||
// set across units. Without restoration, a previously-dispatched unit on a
|
||||
// narrow-API provider (e.g. openai-completions) leaves the active set
|
||||
// missing tools that the next unit's selected model fully supports, but
|
||||
// `pi.getActiveTools()` snapshot-as-hard-gate (the old behaviour) blocked
|
||||
// dispatch with "tool policy denied" anyway.
|
||||
//
|
||||
// The baseline is captured once per `pi` instance via a WeakMap and
|
||||
// re-applied here so each unit starts from a clean slate. Soft adaptation
|
||||
// (adjustToolSet at the bottom of this function) still trims for the
|
||||
// selected model.
|
||||
restoreToolBaseline(pi);
|
||||
|
||||
const uokFlags = resolveUokFlags(prefs);
|
||||
const persistModelChanges = resolvePersistModelChanges();
|
||||
const effectiveSessionModelOverride = sessionModelOverride === undefined
|
||||
|
|
@ -143,7 +235,16 @@ export async function selectAndApplyModel(
|
|||
? extractTaskMetadata(unitId, basePath)
|
||||
: undefined;
|
||||
|
||||
let policyDenyReasons: Array<{ provider: string; modelId: string; reason: string }> = [];
|
||||
if (uokFlags.modelPolicy) {
|
||||
// Use the workflow-spec required-tool subset for the unit type rather
|
||||
// than the live `pi.getActiveTools()` snapshot (#4959). The active set
|
||||
// is poisoned by per-unit narrowing for narrow-API providers — using it
|
||||
// as a hard gate promotes soft adaptation (adjustToolSet) into a layering
|
||||
// violation that throws before dispatch. The smaller workflow-required
|
||||
// subset reflects what the unit actually needs; soft adaptation post-
|
||||
// selection still trims provider-incompatible tools.
|
||||
const requiredTools = getRequiredWorkflowToolsForAutoUnit(unitType);
|
||||
const policy = applyModelPolicyFilter(
|
||||
availableModels,
|
||||
{
|
||||
|
|
@ -154,15 +255,18 @@ export async function selectAndApplyModel(
|
|||
taskMetadata: taskMetadataForPolicy,
|
||||
currentProvider: ctx.model?.provider,
|
||||
allowCrossProvider: routingConfig.cross_provider !== false,
|
||||
requiredTools: pi.getActiveTools(),
|
||||
requiredTools,
|
||||
},
|
||||
);
|
||||
routingEligibleModels = policy.eligible;
|
||||
policyAllowedModelKeys = new Set(
|
||||
policy.eligible.map((m) => `${m.provider.toLowerCase()}/${m.id.toLowerCase()}`),
|
||||
);
|
||||
policyDenyReasons = policy.decisions
|
||||
.filter((d) => !d.allowed)
|
||||
.map((d) => ({ provider: d.provider, modelId: d.modelId, reason: d.reason }));
|
||||
if (routingEligibleModels.length === 0) {
|
||||
throw new Error(`Model policy denied all candidate models for ${unitType}/${unitId}`);
|
||||
throw new ModelPolicyDispatchBlockedError(unitType, unitId, policyDenyReasons);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -411,7 +515,7 @@ export async function selectAndApplyModel(
|
|||
}
|
||||
|
||||
if (uokFlags.modelPolicy && policyAllowedModelKeys && !attemptedPolicyEligible) {
|
||||
throw new Error(`Model policy denied dispatch for ${unitType}/${unitId} before prompt send`);
|
||||
throw new ModelPolicyDispatchBlockedError(unitType, unitId, policyDenyReasons);
|
||||
}
|
||||
} else if (autoModeStartModel) {
|
||||
// No model preference for this unit type — re-apply the model captured
|
||||
|
|
|
|||
|
|
@ -87,7 +87,7 @@ import {
|
|||
} from "./auto-tool-tracking.js";
|
||||
import { closeoutUnit } from "./auto-unit-closeout.js";
|
||||
import { recoverTimedOutUnit } from "./auto-timeout-recovery.js";
|
||||
import { selectAndApplyModel, resolveModelId } from "./auto-model-selection.js";
|
||||
import { selectAndApplyModel, resolveModelId, clearToolBaseline } from "./auto-model-selection.js";
|
||||
import { resetRoutingHistory, recordOutcome } from "./routing-history.js";
|
||||
import {
|
||||
checkPostUnitHooks,
|
||||
|
|
@ -992,6 +992,12 @@ export async function stopAuto(
|
|||
restoreProjectRootEnv();
|
||||
restoreMilestoneLockEnv();
|
||||
|
||||
// Drop the active-tool baseline so a subsequent /sf auto run on the
|
||||
// same `pi` instance recaptures from the live tool set rather than
|
||||
// restoring this session's snapshot and silently undoing any tool
|
||||
// changes the user made between sessions (#4959 / CodeRabbit).
|
||||
if (pi) clearToolBaseline(pi);
|
||||
|
||||
// Reset all session state in one call
|
||||
s.reset();
|
||||
}
|
||||
|
|
@ -1278,6 +1284,15 @@ export async function startAuto(
|
|||
return;
|
||||
}
|
||||
|
||||
// On a *fresh* start, drop any stale active-tool baseline left by a prior
|
||||
// auto session that didn't run stopAuto cleanly. Skip on resume: pauseAuto
|
||||
// leaves the last provider-trimmed active tools in place, so clearing here
|
||||
// would let the next selectAndApplyModel recapture that already-narrowed
|
||||
// set as the new baseline — exactly the cross-unit poisoning this PR is
|
||||
// fixing (#4959 / CodeRabbit Major). The pre-pause baseline survives in
|
||||
// the WeakMap keyed by `pi`.
|
||||
if (!s.paused) clearToolBaseline(pi);
|
||||
|
||||
const requestedStepMode = options?.step ?? false;
|
||||
const interruptedAssessment = options?.interrupted ?? null;
|
||||
if (options?.milestoneLock !== undefined) {
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@ import {
|
|||
} from "./phases.js";
|
||||
import { debugLog } from "../debug-logger.js";
|
||||
import { isInfrastructureError, isTransientCooldownError, getCooldownRetryAfterMs, COOLDOWN_FALLBACK_WAIT_MS, MAX_COOLDOWN_RETRIES } from "./infra-errors.js";
|
||||
import { ModelPolicyDispatchBlockedError } from "../auto-model-selection.js";
|
||||
import { resolveEngine } from "../engine-resolver.js";
|
||||
import { logWarning } from "../workflow-logger.js";
|
||||
import { sfRoot } from "../paths.js";
|
||||
|
|
@ -597,6 +598,51 @@ export async function autoLoop(
|
|||
// runFinalize leave the journal incomplete, making diagnosis harder.
|
||||
deps.emitJournalEvent({ ts: new Date().toISOString(), flowId, seq: nextSeq(), eventType: "iteration-end", data: { iteration, error: msg } });
|
||||
|
||||
// ── Pre-send model-policy block: not a retryable error (#4959 / #4850) ──
|
||||
// The model-policy gate runs before the prompt is sent. When every
|
||||
// candidate model is denied (cross-provider disabled + flat-rate
|
||||
// baseline + tool-policy denial), retrying the same unit produces the
|
||||
// same denial — burning the consecutive-error budget toward a 3-strike
|
||||
// hard stop and corrupting auto-mode state. Pause for user attention
|
||||
// instead, with the per-model deny reasons surfaced from the typed error.
|
||||
if (loopErr instanceof ModelPolicyDispatchBlockedError) {
|
||||
debugLog("autoLoop", {
|
||||
phase: "model-policy-blocked",
|
||||
iteration,
|
||||
unitType: loopErr.unitType,
|
||||
unitId: loopErr.unitId,
|
||||
reasons: loopErr.reasons,
|
||||
});
|
||||
ctx.ui.notify(
|
||||
`Auto-mode paused: model-policy denied dispatch for ${loopErr.unitType}/${loopErr.unitId}. ${msg}`,
|
||||
"error",
|
||||
);
|
||||
deps.emitJournalEvent({
|
||||
ts: new Date().toISOString(),
|
||||
flowId,
|
||||
seq: nextSeq(),
|
||||
eventType: "unit-end",
|
||||
data: {
|
||||
unitType: loopErr.unitType,
|
||||
unitId: loopErr.unitId,
|
||||
status: "blocked",
|
||||
reason: "model-policy-dispatch-blocked",
|
||||
reasons: loopErr.reasons,
|
||||
},
|
||||
});
|
||||
// Carry the blocked unit identity into the turn-result observer:
|
||||
// the throw originated inside dispatch, so observedUnitType/Id were
|
||||
// not assigned by the success path — but the typed error already names
|
||||
// the unit (#4959 / CodeRabbit).
|
||||
observedUnitType = loopErr.unitType;
|
||||
observedUnitId = loopErr.unitId;
|
||||
await deps.pauseAuto(ctx, pi);
|
||||
finishTurn("paused", "manual-attention", msg);
|
||||
// Do NOT increment consecutiveErrors — the failure is configuration,
|
||||
// not a transient runtime fault.
|
||||
break;
|
||||
}
|
||||
|
||||
// ── Infrastructure errors: immediate stop, no retry ──
|
||||
// These are unrecoverable (disk full, OOM, etc.). Retrying just burns
|
||||
// LLM budget on guaranteed failures.
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue