fix(auto): timeout silent swarm turns despite heartbeats
This commit is contained in:
parent
85f6650852
commit
5f92320c7d
5 changed files with 289 additions and 14 deletions
|
|
@ -3,6 +3,9 @@
|
|||
* Handles complexity-based routing, model resolution across providers,
|
||||
* and fallback chains.
|
||||
*/
|
||||
import { readFileSync } from "node:fs";
|
||||
import { homedir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { unitPhaseLabel } from "./auto-dashboard.js";
|
||||
import { isModelBlocked } from "./blocked-models.js";
|
||||
import {
|
||||
|
|
@ -10,6 +13,7 @@ import {
|
|||
extractTaskMetadata,
|
||||
tierLabel,
|
||||
} from "./complexity-classifier.js";
|
||||
import { getErrorMessage } from "./error-utils.js";
|
||||
import { getLedger, getProjectTotals } from "./metrics.js";
|
||||
import { routesFor } from "./model-registry.js";
|
||||
import {
|
||||
|
|
@ -19,9 +23,9 @@ import {
|
|||
loadCapabilityOverrides,
|
||||
resolveModelForComplexity,
|
||||
} from "./model-router.js";
|
||||
import { readStickyModelForUnit } from "./slice-routing-cache.js";
|
||||
import {
|
||||
filterModelsByProviderModelAllow,
|
||||
isModelInEnabledList,
|
||||
isProviderAllowedByLists,
|
||||
isProviderAllowedForAdvisor,
|
||||
resolveDynamicRoutingConfig,
|
||||
|
|
@ -29,6 +33,7 @@ import {
|
|||
resolvePersistModelChanges,
|
||||
} from "./preferences-models.js";
|
||||
import { getSessionModelOverride } from "./session-model-override.js";
|
||||
import { readStickyModelForUnit } from "./slice-routing-cache.js";
|
||||
import { resolveUokFlags } from "./uok/flags.js";
|
||||
import { applyModelPolicyFilter } from "./uok/model-policy.js";
|
||||
import {
|
||||
|
|
@ -37,7 +42,43 @@ import {
|
|||
} from "./uok/model-role-policy.js";
|
||||
import { logWarning } from "./workflow-logger.js";
|
||||
import { getRequiredWorkflowToolsForAutoUnit } from "./workflow-tools.js";
|
||||
import { getErrorMessage } from "./error-utils.js";
|
||||
|
||||
/**
|
||||
* Read the operator's enabledModels allowlist from ~/.sf/agent/settings.json.
|
||||
*
|
||||
* Returns undefined when:
|
||||
* - the file is missing or unreadable (→ no constraint)
|
||||
* - enabledModels is absent or empty (→ no constraint)
|
||||
*
|
||||
* IO errors are silently swallowed — a missing settings.json must never
|
||||
* prevent autonomous dispatch.
|
||||
*
|
||||
* Set SF_BYPASS_ENABLED_MODELS=1 to disable the allowlist check entirely
|
||||
* (emergency escape hatch when the allowlist gets misconfigured).
|
||||
*/
|
||||
function readEnabledModels() {
|
||||
if (process.env.SF_BYPASS_ENABLED_MODELS === "1") return undefined;
|
||||
try {
|
||||
const settingsPath = join(homedir(), ".sf", "agent", "settings.json");
|
||||
const settings = JSON.parse(readFileSync(settingsPath, "utf-8"));
|
||||
if (process.env.SF_DEBUG_ENABLED_MODELS === "1") {
|
||||
process.stderr.write(
|
||||
`[readEnabledModels] HOME=${process.env.HOME} homedir=${homedir()} path=${settingsPath} enabledModels=${JSON.stringify(settings?.enabledModels)}\n`,
|
||||
);
|
||||
}
|
||||
return Array.isArray(settings?.enabledModels) &&
|
||||
settings.enabledModels.length > 0
|
||||
? settings.enabledModels
|
||||
: undefined;
|
||||
} catch (err) {
|
||||
if (process.env.SF_DEBUG_ENABLED_MODELS === "1") {
|
||||
process.stderr.write(
|
||||
`[readEnabledModels] error: ${err?.message}\n`,
|
||||
);
|
||||
}
|
||||
return undefined; // settings missing or unreadable → no allowlist constraint
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Thrown when the model-policy gate rejects every candidate model for a unit
|
||||
* dispatch (#4959 / #4681 / #4850). The auto-loop catches this specifically
|
||||
|
|
@ -273,9 +314,7 @@ export function applyLineageDiverseFilter(candidates, workerModelId) {
|
|||
if (!workerModelId || String(workerModelId).length === 0) {
|
||||
return { filtered: candidates, fellBack: false };
|
||||
}
|
||||
const kept = candidates.filter(
|
||||
(m) => !isSameRootVendor(m.id, workerModelId),
|
||||
);
|
||||
const kept = candidates.filter((m) => !isSameRootVendor(m.id, workerModelId));
|
||||
if (kept.length === 0) {
|
||||
return { filtered: candidates, fellBack: true };
|
||||
}
|
||||
|
|
@ -505,8 +544,7 @@ export async function selectAndApplyModel(
|
|||
// Build an allow-set of keys to enforce in the resolution loop.
|
||||
const allowedKeys = new Set(
|
||||
filtered.map(
|
||||
(m) =>
|
||||
`${m.provider.toLowerCase()}/${m.id.toLowerCase()}`,
|
||||
(m) => `${m.provider.toLowerCase()}/${m.id.toLowerCase()}`,
|
||||
),
|
||||
);
|
||||
lineageAllowedKeys = allowedKeys;
|
||||
|
|
@ -664,11 +702,7 @@ export async function selectAndApplyModel(
|
|||
// on a sibling unit in this slice when its capability score is
|
||||
// within window of the winner. Cleared on executor refusal so a
|
||||
// failing model does not re-attach to the slice.
|
||||
const stickyHint = readStickyModelForUnit(
|
||||
basePath,
|
||||
unitType,
|
||||
unitId,
|
||||
);
|
||||
const stickyHint = readStickyModelForUnit(basePath, unitType, unitId);
|
||||
routingResult = resolveModelForComplexity(
|
||||
classification,
|
||||
modelConfig,
|
||||
|
|
@ -718,6 +752,14 @@ export async function selectAndApplyModel(
|
|||
effectiveModelConfig.primary,
|
||||
...effectiveModelConfig.fallbacks,
|
||||
];
|
||||
// ── enabledModels allowlist (operator gate) ──────────────────────────
|
||||
// Read once per unit dispatch; undefined = no constraint (open).
|
||||
// SF_BYPASS_ENABLED_MODELS=1 disables the check entirely — emergency
|
||||
// escape hatch when the allowlist gets misconfigured and autonomous
|
||||
// can't find a viable fallback.
|
||||
const enabledModels = readEnabledModels();
|
||||
let enabledModelsFilteredCount = 0;
|
||||
let enabledModelsResolvedCount = 0; // models that existed in the pool (with or without allowlist)
|
||||
let attemptedPolicyEligible = false;
|
||||
for (const modelId of modelsToTry) {
|
||||
const resolutionPool = uokFlags.modelPolicy
|
||||
|
|
@ -733,6 +775,22 @@ export async function selectAndApplyModel(
|
|||
ctx.ui.notify(`Model ${modelId} not found, trying fallback.`, "info");
|
||||
continue;
|
||||
}
|
||||
enabledModelsResolvedCount++;
|
||||
// ── Enforce operator enabledModels allowlist ──────────────────────
|
||||
// Applied as the first model-level filter so disallowed providers
|
||||
// never reach policy/routing checks. enabledModels is only set when
|
||||
// the operator has an explicit allowlist; otherwise this is a no-op.
|
||||
if (
|
||||
enabledModels !== undefined &&
|
||||
!isModelInEnabledList(model.provider, model.id, enabledModels)
|
||||
) {
|
||||
ctx.ui.notify(
|
||||
`Model ${model.provider}/${model.id} not in enabledModels allowlist; trying next fallback.`,
|
||||
"info",
|
||||
);
|
||||
enabledModelsFilteredCount++;
|
||||
continue;
|
||||
}
|
||||
// Enforce lineage-diverse-from-worker: skip candidates whose root vendor
|
||||
// matches the worker's. lineageAllowedKeys is populated only when the
|
||||
// constraint applies (adversary / reviewer with a known worker model id).
|
||||
|
|
@ -873,6 +931,22 @@ export async function selectAndApplyModel(
|
|||
}
|
||||
}
|
||||
}
|
||||
// ── enabledModels exhaustion error ───────────────────────────────────
|
||||
// When every resolved candidate was filtered out by the enabledModels
|
||||
// allowlist (and no model was applied), surface a clear operator-
|
||||
// actionable error rather than silently falling back to the session default.
|
||||
if (
|
||||
appliedModel === null &&
|
||||
enabledModels !== undefined &&
|
||||
enabledModelsResolvedCount > 0 &&
|
||||
enabledModelsFilteredCount === enabledModelsResolvedCount
|
||||
) {
|
||||
throw new Error(
|
||||
`All fallback candidates filtered by enabledModels allowlist. ` +
|
||||
`Either add a provider/model to enabledModels in ~/.sf/agent/settings.json ` +
|
||||
`or unset enabledModels. Set SF_BYPASS_ENABLED_MODELS=1 to disable the check.`,
|
||||
);
|
||||
}
|
||||
if (
|
||||
uokFlags.modelPolicy &&
|
||||
policyAllowedModelKeys &&
|
||||
|
|
@ -909,6 +983,7 @@ export async function selectAndApplyModel(
|
|||
} else if (autoModeStartModel) {
|
||||
// No model preference for this unit type — re-apply the model captured
|
||||
// at autonomous mode start to prevent bleed from shared global settings.json (#650).
|
||||
const startEnabledModels = readEnabledModels();
|
||||
const availableModels = filterModelsByProviderModelAllow(
|
||||
ctx.modelRegistry
|
||||
.getAvailable()
|
||||
|
|
@ -918,6 +993,9 @@ export async function selectAndApplyModel(
|
|||
prefs?.allowed_providers,
|
||||
prefs?.blocked_providers,
|
||||
),
|
||||
)
|
||||
.filter((m) =>
|
||||
isModelInEnabledList(m.provider, m.id, startEnabledModels),
|
||||
),
|
||||
prefs?.provider_model_allow,
|
||||
prefs?.provider_model_block,
|
||||
|
|
|
|||
|
|
@ -195,6 +195,36 @@ export function filterModelsByProviderModelAllow(
|
|||
),
|
||||
);
|
||||
}
|
||||
/**
|
||||
* Check whether a provider/modelId combination is in the operator's
|
||||
* enabledModels allowlist from ~/.sf/agent/settings.json.
|
||||
*
|
||||
* Returns true when:
|
||||
* - enabledModels is undefined/empty (no filter — all allowed)
|
||||
* - any pattern matches "<provider>/<modelId>"
|
||||
*
|
||||
* Pattern syntax: "provider/*" (all models in a provider) or
|
||||
* "provider/specific-model" (exact). Wildcards only at the end of the
|
||||
* pattern; intentional to keep policy simple and auditable.
|
||||
*
|
||||
* @param {string} provider - e.g. "mistral"
|
||||
* @param {string} modelId - e.g. "mistral-large-2512"
|
||||
* @param {string[]|undefined} enabledModels
|
||||
* @returns {boolean}
|
||||
*/
|
||||
export function isModelInEnabledList(provider, modelId, enabledModels) {
|
||||
if (!Array.isArray(enabledModels) || enabledModels.length === 0) return true;
|
||||
const candidate = `${provider}/${modelId}`;
|
||||
for (const pattern of enabledModels) {
|
||||
if (typeof pattern !== "string") continue;
|
||||
if (pattern === candidate) return true;
|
||||
if (pattern.endsWith("/*")) {
|
||||
const prefix = pattern.slice(0, -1); // e.g. "mistral/"
|
||||
if (candidate.startsWith(prefix)) return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
/**
|
||||
* Check if a provider is in the allowed list and not in the blocked list.
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -5,7 +5,10 @@ import { join } from "node:path";
|
|||
import { afterEach, describe, test } from "vitest";
|
||||
// Import preferences.js so that _initPrefsLoader is called and the circular dep lazy-loader is wired up.
|
||||
import "../preferences.js";
|
||||
import { resolveModelWithFallbacksForUnit } from "../preferences-models.js";
|
||||
import {
|
||||
isModelInEnabledList,
|
||||
resolveModelWithFallbacksForUnit,
|
||||
} from "../preferences-models.js";
|
||||
|
||||
const originalCwd = process.cwd();
|
||||
const originalEnv = { ...process.env };
|
||||
|
|
@ -107,4 +110,33 @@ describe("preferences model resolution", () => {
|
|||
fallbacks: [],
|
||||
});
|
||||
});
|
||||
|
||||
test("isModelInEnabledList_when_list_empty_allows_any_model", () => {
|
||||
assert.equal(isModelInEnabledList("kimi-coding", "kimi-k2.6", []), true);
|
||||
assert.equal(
|
||||
isModelInEnabledList("kimi-coding", "kimi-k2.6", undefined),
|
||||
true,
|
||||
);
|
||||
});
|
||||
|
||||
test("isModelInEnabledList_when_exact_or_provider_wildcard_matches_returns_true", () => {
|
||||
assert.equal(
|
||||
isModelInEnabledList("kimi-coding", "kimi-k2.6", [
|
||||
"openai/gpt-5.2",
|
||||
"kimi-coding/kimi-k2.6",
|
||||
]),
|
||||
true,
|
||||
);
|
||||
assert.equal(
|
||||
isModelInEnabledList("kimi-coding", "kimi-k2.6", ["kimi-coding/*"]),
|
||||
true,
|
||||
);
|
||||
});
|
||||
|
||||
test("isModelInEnabledList_when_no_pattern_matches_returns_false", () => {
|
||||
assert.equal(
|
||||
isModelInEnabledList("kimi-coding", "kimi-k2.6", ["openai/*"]),
|
||||
false,
|
||||
);
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -581,6 +581,45 @@ describe("SwarmDispatchLayer.dispatchAndWait — Round 7: executor config forwar
|
|||
expect(capturedOpts.noOutputTimeoutMs).toBe(45_000);
|
||||
});
|
||||
|
||||
test("noOutputTimeoutMs aborts a silent turn even when runtime heartbeats continue", async () => {
|
||||
vi.useFakeTimers();
|
||||
const { runAgentTurn } = await import("../uok/agent-runner.js");
|
||||
|
||||
runAgentTurn.mockImplementationOnce(async (_agent, opts = {}) => {
|
||||
setTimeout(() => {
|
||||
opts.onEvent?.({ type: "runtime_heartbeat" });
|
||||
}, 5);
|
||||
setTimeout(() => {
|
||||
opts.onEvent?.({ type: "runtime_heartbeat" });
|
||||
}, 9);
|
||||
return new Promise(() => {
|
||||
// Simulate setup/model code that ignores AbortSignal. The outer watchdog
|
||||
// must still resolve dispatchAndWait instead of waiting for this promise.
|
||||
});
|
||||
});
|
||||
|
||||
const root = makeProject();
|
||||
const layer = new SwarmDispatchLayer(root);
|
||||
const pending = layer.dispatchAndWait(
|
||||
{
|
||||
unitId: "task-heartbeat-only-timeout",
|
||||
unitType: "execute-task",
|
||||
workMode: "build",
|
||||
payload: "edit files",
|
||||
priority: 5,
|
||||
scope: "scope-heartbeat-only-timeout",
|
||||
},
|
||||
{ noOutputTimeoutMs: 10, timeoutMs: 1_000 },
|
||||
);
|
||||
|
||||
await vi.advanceTimersByTimeAsync(20);
|
||||
const result = await pending;
|
||||
vi.useRealTimers();
|
||||
|
||||
expect(result.reply).toBeNull();
|
||||
expect(result.error).toContain("produced no output for 10ms");
|
||||
});
|
||||
|
||||
test("envelope without executorSystemPrompt does not forward systemPromptOverride", async () => {
|
||||
// Envelopes without the optional fields must not pass undefined opts to runAgentTurn.
|
||||
const { runAgentTurn } = await import("../uok/agent-runner.js");
|
||||
|
|
|
|||
|
|
@ -41,6 +41,102 @@ async function getRunAgentTurn() {
|
|||
return _runAgentTurnFn;
|
||||
}
|
||||
|
||||
function isMeaningfulAgentTurnEvent(event) {
|
||||
if (String(event?.type) === "runtime_heartbeat") return false;
|
||||
if (event?.type === "message_update") {
|
||||
const streamEvent = event.assistantMessageEvent;
|
||||
if (streamEvent?.type === "text_delta") {
|
||||
return String(streamEvent.delta ?? "").length > 0;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
async function runAgentTurnWithOuterWatchdogs(runAgentTurn, agent, opts = {}) {
|
||||
const { timeoutMs = 480_000, noOutputTimeoutMs, signal, onEvent } = opts;
|
||||
const controller = new AbortController();
|
||||
let hardTimer = null;
|
||||
let noOutputTimer = null;
|
||||
let settled = false;
|
||||
let resolveWatchdog = null;
|
||||
const watchdogResult = new Promise((resolve) => {
|
||||
resolveWatchdog = resolve;
|
||||
});
|
||||
|
||||
function abort(reason) {
|
||||
if (settled) return;
|
||||
try {
|
||||
controller.abort(reason);
|
||||
} catch {
|
||||
controller.abort();
|
||||
}
|
||||
resolveWatchdog?.({
|
||||
turnsProcessed: 0,
|
||||
response: null,
|
||||
error: reason,
|
||||
});
|
||||
}
|
||||
|
||||
function cleanup() {
|
||||
settled = true;
|
||||
if (hardTimer) clearTimeout(hardTimer);
|
||||
if (noOutputTimer) clearTimeout(noOutputTimer);
|
||||
}
|
||||
|
||||
function armNoOutputTimer() {
|
||||
if (!noOutputTimeoutMs || noOutputTimeoutMs <= 0) return;
|
||||
if (noOutputTimer) clearTimeout(noOutputTimer);
|
||||
noOutputTimer = setTimeout(() => {
|
||||
abort(`agent turn produced no output for ${noOutputTimeoutMs}ms`);
|
||||
}, noOutputTimeoutMs);
|
||||
}
|
||||
|
||||
if (signal) {
|
||||
if (signal.aborted) abort("agent turn cancelled");
|
||||
else
|
||||
signal.addEventListener("abort", () => abort("agent turn cancelled"), {
|
||||
once: true,
|
||||
});
|
||||
}
|
||||
if (timeoutMs > 0) {
|
||||
hardTimer = setTimeout(() => {
|
||||
abort(`agent turn timed out after ${timeoutMs}ms`);
|
||||
}, timeoutMs);
|
||||
}
|
||||
armNoOutputTimer();
|
||||
|
||||
const turnResult = runAgentTurn(agent, {
|
||||
...opts,
|
||||
signal: controller.signal,
|
||||
onEvent: (event) => {
|
||||
if (isMeaningfulAgentTurnEvent(event)) armNoOutputTimer();
|
||||
onEvent?.(event);
|
||||
},
|
||||
}).catch((err) => {
|
||||
if (controller.signal.aborted) {
|
||||
return {
|
||||
turnsProcessed: 0,
|
||||
response: null,
|
||||
error:
|
||||
typeof controller.signal.reason === "string"
|
||||
? controller.signal.reason
|
||||
: "agent turn aborted",
|
||||
};
|
||||
}
|
||||
throw err;
|
||||
});
|
||||
|
||||
try {
|
||||
const result = await Promise.race([turnResult, watchdogResult]);
|
||||
cleanup();
|
||||
return result;
|
||||
} catch (err) {
|
||||
cleanup();
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
|
||||
// Module-level cache keyed by `${basePath}:${swarmName}`
|
||||
const _cache = new Map();
|
||||
|
||||
|
|
@ -334,7 +430,7 @@ export class SwarmDispatchLayer {
|
|||
const runAgentTurn = await getRunAgentTurn();
|
||||
let turnResult;
|
||||
try {
|
||||
turnResult = await runAgentTurn(agent, {
|
||||
turnResult = await runAgentTurnWithOuterWatchdogs(runAgentTurn, agent, {
|
||||
timeoutMs,
|
||||
...(noOutputTimeoutMs ? { noOutputTimeoutMs } : {}),
|
||||
signal,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue