feat: Created draft mapping of SF patterns to ACE reference draft

SF-Task: S05/T01
This commit is contained in:
Mikael Hugo 2026-05-13 02:01:41 +02:00
parent 1ed505669b
commit 65e195a9fd
49 changed files with 2263 additions and 272 deletions

13
.gitignore vendored
View file

@ -106,4 +106,17 @@ repowise.db
.sf/scaffold-manifest.json .sf/scaffold-manifest.json
.sf/interactive.lock .sf/interactive.lock
.sf/interactive.lock.d/ .sf/interactive.lock.d/
# SQLite WAL/SHM are ephemeral checkpoint files — only the .db is durable.
.sf/metrics.db-wal
.sf/metrics.db-shm
.sf/sf.db-wal
.sf/sf.db-shm
# Per-dispatch trace files accumulate one-per-request and are runtime-only.
# Consumers (sf-db-gates, adaptive verification policy) read by mtime window
# (24h30d) — on-disk retention is needed, but git tracking is not.
.sf/traces/pre-dispatch:*.jsonl
.sf/traces/finalize:*.jsonl
.sf/traces/guard:*.jsonl
# `latest` is a symlink retargeted on every dispatch — pure git noise.
.sf/traces/latest
test_output.log test_output.log

View file

@ -1,3 +1,3 @@
{ {
"lastFullVacuumAt": "2026-05-12T13:59:07.765Z" "lastFullVacuumAt": "2026-05-12T20:58:28.744Z"
} }

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -60,5 +60,5 @@
"confidence": "EXTRACTED" "confidence": "EXTRACTED"
} }
], ],
"builtAt": "2026-05-12T15:26:43.252Z" "builtAt": "2026-05-12T23:53:23.408Z"
} }

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -1 +1 @@
{"fetchedAt":"2026-05-12T14:54:31.656Z","modelIds":["mistral-medium-2505","mistral-medium-2508","mistral-medium-latest","mistral-medium","mistral-vibe-cli-with-tools","open-mistral-nemo","open-mistral-nemo-2407","mistral-tiny-2407","mistral-tiny-latest","codestral-2508","codestral-latest","devstral-2512","devstral-medium-latest","devstral-latest","mistral-small-2603","mistral-small-latest","mistral-vibe-cli-fast","magistral-small-latest","magistral-medium-2509","magistral-medium-latest","labs-leanstral-2603","mistral-large-2512","mistral-large-latest","mistral-large-2512","mistral-large-latest","ministral-3b-2512","ministral-3b-latest","ministral-8b-2512","ministral-8b-latest","ministral-14b-2512","ministral-14b-latest","mistral-medium-3-5","mistral-medium-3.5","mistral-medium-3","mistral-medium-2604","mistral-medium-c21211-r0-75","mistral-vibe-cli-latest","mistral-large-2411","pixtral-large-2411","pixtral-large-latest","mistral-large-pixtral-2411","devstral-small-2507","devstral-medium-2507","magistral-small-2509","mistral-small-2506"]} 
{"fetchedAt":"2026-05-12T21:25:20.919Z","modelIds":["mistral-medium-2505","mistral-medium-2508","mistral-medium-latest","mistral-medium","mistral-vibe-cli-with-tools","open-mistral-nemo","open-mistral-nemo-2407","mistral-tiny-2407","mistral-tiny-latest","codestral-2508","codestral-latest","devstral-2512","devstral-medium-latest","devstral-latest","mistral-small-2603","mistral-small-latest","mistral-vibe-cli-fast","magistral-small-latest","magistral-medium-2509","magistral-medium-latest","labs-leanstral-2603","mistral-large-2512","mistral-large-latest","mistral-large-2512","mistral-large-latest","ministral-3b-2512","ministral-3b-latest","ministral-8b-2512","ministral-8b-latest","ministral-14b-2512","ministral-14b-latest","mistral-medium-3-5","mistral-medium-3.5","mistral-medium-3","mistral-medium-2604","mistral-medium-c21211-r0-75","mistral-vibe-cli-latest","mistral-large-2411","pixtral-large-2411","pixtral-large-latest","mistral-large-pixtral-2411","devstral-small-2507","devstral-medium-2507","magistral-small-2509","mistral-small-2506"]}

File diff suppressed because one or more lines are too long

View file

@ -109,26 +109,26 @@
"total": 1 "total": 1
}, },
"kimi-coding/kimi-k2.6": { "kimi-coding/kimi-k2.6": {
"successes": 1, "successes": 2,
"failures": 0, "failures": 0,
"timeouts": 0, "timeouts": 0,
"totalTokens": 1821480, "totalTokens": 1892068,
"totalCost": 0, "totalCost": 0.030715552,
"lastUsed": "2026-05-12T20:57:45.179Z", "lastUsed": "2026-05-12T23:58:57.132Z",
"successRate": 1, "successRate": 1,
"total": 1 "total": 2
} }
}, },
"complete-slice": { "complete-slice": {
"kimi-coding/kimi-k2.6": { "kimi-coding/kimi-k2.6": {
"successes": 1, "successes": 2,
"failures": 0, "failures": 0,
"timeouts": 0, "timeouts": 0,
"totalTokens": 719526, "totalTokens": 814376,
"totalCost": 0.026709, "totalCost": 0.053080319800000005,
"lastUsed": "2026-05-12T15:26:57.708Z", "lastUsed": "2026-05-12T23:54:01.143Z",
"successRate": 1, "successRate": 1,
"total": 1 "total": 2
} }
} }
} }

View file

@ -1,16 +0,0 @@
[
{
"kind": "write",
"toolCallId": "write_1778619443353_32",
"path": ".sf/milestones/M001-6377a4/slices/S04/VERIFICATION_MATRIX.md",
"timestamp": 1778619443535
},
{
"kind": "bash",
"toolCallId": "bash_1778619447339_33",
"command": "test -f .sf/milestones/M001-6377a4/slices/S04/VERIFICATION_MATRIX.md && grep -q \"status\" .sf/milestones/M001-6377a4/slices/S04/VERIFICATION_MATRIX.md && echo \"Matrix exists and contains status command info.\"",
"exitCode": 0,
"outputSnippet": "Matrix exists and contains status command info.\n",
"timestamp": 1778619447544
}
]

View file

@ -0,0 +1 @@
[]

View file

@ -0,0 +1,16 @@
[
{
"kind": "write",
"toolCallId": "DgPnxQEen",
"path": "docs/dev/sf-ace-patterns.md.draft",
"timestamp": 1778630297060
},
{
"kind": "bash",
"toolCallId": "8FjDDZSlA",
"command": "test -f docs/dev/sf-ace-patterns.md.draft && grep -c \"SF Implementation\" docs/dev/sf-ace-patterns.md.draft | grep -q \"6\"",
"exitCode": 0,
"outputSnippet": "(no output)",
"timestamp": 1778630298077
}
]

16
.sf/slice-routing.json Normal file
View file

@ -0,0 +1,16 @@
{
"M001-6377a4/S04": {
"provider": "minimax",
"id": "MiniMax-M2.1",
"ts": "2026-05-12T23:54:01.079Z",
"lastUnitType": "complete-slice",
"lastUnitId": "M001-6377a4/S04"
},
"M001-6377a4/S05": {
"provider": "mistral",
"id": "codestral-latest",
"ts": "2026-05-12T23:58:57.088Z",
"lastUnitType": "execute-task",
"lastUnitId": "M001-6377a4/S05/T01"
}
}

View file

@ -1 +1 @@
guard:76c7c307-91b4-426e-8fad-4ff951d5a52e.jsonl guard:b8cbf9df-9fe8-4203-9c63-79fc7264d74e.jsonl

36
TODO.md
View file

@ -3,3 +3,39 @@
Dump anything here. Dump anything here.
--- ---
## Self-Feedback Inbox
### [prompt-modularization] Phase 3 — migrate remaining builders to `composeUnitContext` v2
**Context:** Phase 1 (fragment infrastructure, 17-prompt Working Directory deduplication) and
Phase 2 (5 stub manifests for deploy/smoke-production/release/rollback/challenge) shipped in
commit `ca5d869e3`. 9 of 26 unit types are now fully manifest-driven via `composeInlinedContext`.
**What's blocked and why:**
Migrating the remaining 17 builders to `composeInlinedContext` (v1) is the wrong path because:
1. `inlineKnowledgeScoped` and `inlineGraphSubgraph` are NOT in `ARTIFACT_KEYS` — these
artifacts would remain imperative and undeclared in every manifest, making manifests
structurally unreliable descriptions of actual builder behavior.
2. Injecting knowledge/graph at the right position in the composed string requires fragile
sentinel-string searches (e.g., `body.lastIndexOf("### Task Summary:")`). This pattern
is already untested in the 2 migrated complex builders (`research-milestone`, `complete-slice`).
3. `composeUnitContext` (v2) in `unit-context-composer.js` already has `computed`, `prepend`,
and `excerpt` support — knowledge and graph inlining maps cleanly to `computed` entries.
Migrating to v1 now creates a half-migration state that must be undone when v2 lands.
**Recommended next slice:**
1. Add `"knowledge"` and `"graph"` to `ARTIFACT_KEYS` in `unit-context-manifest.js`.
2. Register them as `computed` entries in relevant `UNIT_MANIFESTS` entries.
3. Wire one builder (e.g., `buildResearchSlicePrompt`) through `composeUnitContext` v2 as pilot.
4. Add position-assertion tests to already-migrated complex builders (`research-milestone`,
`complete-slice`) to guard against silent ordering degradation.
5. Then migrate remaining builders in batches: slice builders → milestone builders → execute-task.
**Note on `prompt-cache-optimizer.js`:** Entirely dead code — `optimizeForCaching()`,
`estimateCacheSavings()`, `computeCacheHitRate()` have zero importers. `reorderForCaching()`
is wired at `phases-unit.js:519` but no `cache_control` markers are written to outgoing
requests. Remove the file or wire it in the same slice that adds `cache_control` breakpoints.
---

View file

@ -0,0 +1,29 @@
# SF Patterns to ACE Reference Draft Mapping
## Preferences
**SF Implementation:** `src/resources/extensions/sf/preferences.js`
## PDD
**SF Implementation:** `src/resources/extensions/sf/uok/unit-runtime.js`
## UOK Gates
**SF Implementation:** `src/resources/extensions/sf/uok/gate-runner.js`
## Notifications
**SF Implementation:** `src/resources/extensions/sf/skills/frontmatter.js`
## Skills-as-Contracts
**SF Implementation:** `src/resources/extensions/sf/steerable-autonomous-panel.js`
## Idempotency
**SF Implementation:** `src/resources/extensions/sf/uok/unit-runtime.js`
## Verification
- All 6 patterns have verified file paths in this document.

View file

@ -0,0 +1,85 @@
# SF Product Surface Capabilities
This document defines the command and feature availability across SF's three product surfaces: **CLI / Headless**, **TUI**, and **Web**. It records intentional gaps so they are not mistaken for bugs.
## Surface Definitions
| Surface | Description | Primary Consumer |
| :--- | :--- | :--- |
| **CLI / Headless** | Non-interactive command-line interface and machine-surface protocol (`sf headless`). | Scripts, CI/CD, editor integrations, autonomous dispatch. |
| **TUI** | Interactive Terminal User Interface with dashboards, visualizers, and live overlays. | Developers working locally who prefer keyboard-driven interaction. |
| **Web** | Browser-based interface (Next.js) with panels, command surfaces, and visual tools. | Developers who prefer a GUI, remote access, or power-mode workflows. |
## Feature Matrix
| Command / Feature | CLI / Headless | TUI | Web | Notes |
| :--- | :--- | :--- | :--- | :--- |
| `/status` | ✅ | ✅ | ✅ | Text in CLI/Headless; dashboard overlay in TUI; terminal or `sf-status` panel in Web. |
| `/plan` | ✅ | ✅ | ❌ **Intentional Gap** | See [Intentional Gaps](#intentional-gaps) below. |
| `/run` (`/next`, `/autonomous`) | ✅ | ✅ | ❌ **Intentional Gap** | See [Intentional Gaps](#intentional-gaps) below. |
| `/steer` | ✅ | ✅ | ✅ | Web exposes via `sf-steer` panel. |
| `/undo` | ✅ | ✅ | ✅ | Web exposes via `sf-undo` panel. |
| `/history` | ✅ | ✅ | ✅ | Web exposes via `sf-history` panel. |
| `/doctor` | ✅ | ✅ | ✅ | Web exposes via `sf-doctor` panel. |
| `/forensics` | ✅ | ✅ | ✅ | Web exposes via `sf-forensics` panel. |
| `/skills` | ✅ | ✅ | ✅ | Web exposes via `sf-skill-health` panel. |
| `/capture` | ✅ | ✅ | ✅ | Web exposes via `sf-capture` panel. |
| `/triage` | ✅ | ✅ | ✅ | Web exposes via `sf-triage` panel. |
| `/inspect` | ✅ | ✅ | ✅ | Web exposes via `sf-inspect` panel. |
| `/hooks` | ✅ | ✅ | ✅ | Web exposes via `sf-hooks` panel. |
| `/cleanup` | ✅ | ✅ | ✅ | Web exposes via `sf-cleanup` panel. |
| `/export` | ✅ | ✅ | ✅ | Web exposes via `sf-export` panel. |
| `/queue` | ✅ | ✅ | ✅ | Web exposes via `sf-queue` panel. |
| `/visualize` | ✅ | ✅ | ✅ | Web exposes via `sf-visualize` panel. |
| `/prefs` | ✅ | ✅ | ✅ | Web exposes via `sf-prefs` panel. |
| `/config` | ✅ | ✅ | ✅ | Web exposes via `sf-config` panel. |
| `/mode` | ✅ | ✅ | ✅ | Web exposes via `sf-mode` panel. |
| `/model` | ✅ | ✅ | ✅ | Web exposes via dedicated **Model** command surface. |
| `/thinking` | ✅ | ✅ | ✅ | Web exposes via dedicated **Thinking** command surface. |
| `/git` | ✅ | ✅ | ✅ | Web exposes via dedicated **Git** command surface. |
| `/settings` | ✅ | ✅ | ✅ | Web exposes via dedicated **Settings** command surface (general, recovery, auth, admin, experimental). |
| `/resume` | ✅ | ✅ | ✅ | Web exposes via dedicated **Resume** command surface. |
| `/name` | ✅ | ✅ | ✅ | Web exposes via dedicated **Name** command surface. |
| `/fork` | ✅ | ✅ | ✅ | Web exposes via dedicated **Fork** command surface. |
| `/session` | ✅ | ✅ | ✅ | Web exposes via dedicated **Session** command surface. |
| `/compact` | ✅ | ✅ | ✅ | Web exposes via dedicated **Compact** command surface. |
| `/tasks` | ✅ | ✅ | ✅ | Web exposes via Dashboard and Activity views. |
| `/research` | ✅ | ✅ | ✅ | Web terminal supports typing the command. |
| `/implement` | ✅ | ✅ | ✅ | Web terminal supports typing the command. |
## Intentional Gaps
### `/plan` is not available as a first-class Web UI workflow
**Why:** The web UI uses a different, browser-native planning and execution model. Planning artifacts are promoted through CLI-first workflows (`sf plan promote`) that require filesystem access, Git operations, and markdown rendering pipelines that are optimized for terminal and editor surfaces. The web surface focuses on higher-level UI interactions (roadmap views, milestone explorers, visual planning tools) rather than raw slash-command promotion.
**What web users do instead:**
- Use the **Roadmap** and **Milestone Explorer** views to inspect and navigate planning state.
- Type `/plan` in the embedded terminal if needed; the command executes but the full promotion workflow is CLI-first.
### `/run` (`/next`, `/autonomous`) is not available as a first-class Web UI workflow
**Why:** The web UI uses a different, browser-native execution model. Backend execution is managed via specific API routes and WebSocket/bridge communication rather than a `/run` command dispatch. The web surface prioritizes supervised, click-driven execution (e.g., **Power Mode**, action buttons, workflow steppers) over autonomous terminal-style dispatch.
**What web users do instead:**
- Use **Power Mode** for guided, step-by-step unit execution.
- Use **Chat Mode** for conversational task dispatch.
- Type `/autonomous` or `/next` in the embedded terminal if needed; execution proceeds via the PTY bridge.
## Design Principle
> **Behavioral coherence, not visual parity.**
>
> Every surface must expose the *same underlying state* (via `deriveState()`, UOK diagnostics, and bridge data) but may present it through different interaction models. A gap is intentional only when the surface provides an equivalent or superior alternative workflow for the same user goal.
## Verification
This matrix is verified against:
- `src/resources/extensions/sf/commands/handlers/core.js` — CLI/TUI `status` handler.
- `src/resources/extensions/sf/commands/handlers/ops.js` — CLI/TUI `plan` and `run` handlers.
- `src/headless.ts` — Headless status and execution entrypoints.
- `web/components/sf/command-surface.tsx` — Web command surface registry.
- `web/lib/command-surface-contract.ts` — Web command surface type definitions.
- `web/components/sf/sidebar.tsx` — Web navigation and exposed commands.
For the full behavioral audit, see `.sf/milestones/M001-6377a4/slices/S04/VERIFICATION_MATRIX.md`.

View file

@ -0,0 +1,75 @@
import assert from "node:assert/strict";
import { describe, it } from "vitest";
import type { Context, Model, OpenAICompletionsCompat } from "../types.js";
import { convertMessages } from "./openai-completions.js";
const compat = {
supportsDeveloperRole: false,
requiresAssistantAfterToolResult: false,
requiresThinkingAsText: false,
} as Required<OpenAICompletionsCompat>;
function model(provider: string, id: string): Model<"openai-completions"> {
return {
id,
name: id,
api: "openai-completions",
provider,
baseUrl:
provider === "openrouter"
? "https://openrouter.ai/api/v1"
: "https://api.openai.com/v1",
reasoning: false,
input: ["text"],
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
contextWindow: 128_000,
maxTokens: 4096,
};
}
function contextWithCacheControl(): Context {
return {
messages: [
{
role: "user",
content: [
{
type: "text",
text: "stable prefix",
cache_control: { type: "ephemeral" },
} as any,
{ type: "text", text: "dynamic suffix" },
],
timestamp: Date.now(),
},
],
};
}
describe("convertMessages cache_control", () => {
it("preserves_cache_control_when_openrouter_anthropic_model", () => {
const messages = convertMessages(
model("openrouter", "anthropic/claude-sonnet-4.5"),
contextWithCacheControl(),
compat,
);
const content = messages[0].content;
assert.ok(Array.isArray(content));
assert.deepEqual((content[0] as any).cache_control, {
type: "ephemeral",
});
});
it("strips_cache_control_when_openai_compatible_model_does_not_support_it", () => {
const messages = convertMessages(
model("openai", "gpt-5.3-chat-latest"),
contextWithCacheControl(),
compat,
);
const content = messages[0].content;
assert.ok(Array.isArray(content));
assert.equal((content[0] as any).cache_control, undefined);
});
});

View file

@ -493,6 +493,12 @@ function maybeAddOpenRouterAnthropicToolCacheControl(
} }
} }
function supportsOpenRouterAnthropicCacheControl(
model: Model<"openai-completions">,
): boolean {
return model.provider === "openrouter" && model.id.startsWith("anthropic/");
}
function mapReasoningEffort( function mapReasoningEffort(
effort: NonNullable<OpenAICompletionsOptions["reasoningEffort"]>, effort: NonNullable<OpenAICompletionsOptions["reasoningEffort"]>,
reasoningEffortMap: Partial< reasoningEffortMap: Partial<
@ -506,8 +512,7 @@ function maybeAddOpenRouterAnthropicCacheControl(
model: Model<"openai-completions">, model: Model<"openai-completions">,
messages: ChatCompletionMessageParam[], messages: ChatCompletionMessageParam[],
): void { ): void {
if (model.provider !== "openrouter" || !model.id.startsWith("anthropic/")) if (!supportsOpenRouterAnthropicCacheControl(model)) return;
return;
// Anthropic-style caching requires cache_control on a text part. Add a breakpoint // Anthropic-style caching requires cache_control on a text part. Add a breakpoint
// on the last user/assistant message (walking backwards until we find text content). // on the last user/assistant message (walking backwards until we find text content).
@ -622,9 +627,11 @@ export function convertMessages(
// Preserve cache_control if present (set upstream for Anthropic prompt caching). // Preserve cache_control if present (set upstream for Anthropic prompt caching).
// The property is not in the OpenAI SDK type but is accepted by providers // The property is not in the OpenAI SDK type but is accepted by providers
// that support Anthropic-style caching (openrouter/anthropic/*). // that support Anthropic-style caching (openrouter/anthropic/*).
const cacheControl = ( const cacheControl = supportsOpenRouterAnthropicCacheControl(
item as unknown as Record<string, unknown> model,
).cache_control; )
? (item as unknown as Record<string, unknown>).cache_control
: undefined;
if (cacheControl) { if (cacheControl) {
(part as unknown as Record<string, unknown>).cache_control = (part as unknown as Record<string, unknown>).cache_control =
cacheControl; cacheControl;

View file

@ -0,0 +1,64 @@
---
name: rubber-duck
description: Constructive pre-implementation critic — catches design flaws, missing edge cases, and gaps before code is written
model: sonnet
tools: read, grep, find, ls, bash
---
You are a constructive critic. Your job is to identify real problems in a plan, design, or code change **before** implementation is committed to — when course corrections are still cheap.
You are **read-only**. Do not edit files. Do not run commands that change the environment.
## What you review
You receive a plan, a design proposal, a code diff, or a task description. You review it for:
- **Logic errors** — incorrect assumptions, wrong control flow, missing invariants
- **Missing edge cases** — inputs/states the plan doesn't account for
- **Design flaws** — abstractions that won't hold, coupling that will hurt, missing separation of concerns
- **Security issues** — unvalidated inputs, exposed secrets, auth gaps
- **Test gaps** — behavior that will be untested or untestable with the proposed approach
- **Spec contradictions** — where the plan conflicts with stated requirements or existing behavior
## What you do NOT comment on
- Code style, formatting, naming conventions
- Grammar or wording in comments/docs
- Best practices that don't cause an actual problem
- Refactoring that doesn't change correctness
- Minor improvements that don't affect the task outcome
If something is fine, say so. Do not manufacture findings to seem thorough. A short report with two real findings beats a long report with ten nitpicks.
## Output format
For each finding:
```
## [Blocking|Non-blocking|Suggestion] — <title>
**What:** <the specific problem, stated precisely>
**Why it matters:** <the actual impact what breaks, under what condition>
**Fix:** <concrete change to address it>
```
Then a final verdict:
```
## Verdict
READY / NEEDS-REVISION
One sentence: overall assessment.
```
- `READY` — no blocking findings; the plan/code can proceed as-is
- `NEEDS-REVISION` — at least one blocking finding must be addressed first
## Severity guide
- **Blocking** — will cause a bug, data loss, security issue, or test failure if not fixed
- **Non-blocking** — should be fixed for quality but won't break the task
- **Suggestion** — worth considering; low priority
Lead with blocking findings. If there are none, say so explicitly before the non-blocking ones.

View file

@ -18,6 +18,7 @@ import {
loadCapabilityOverrides, loadCapabilityOverrides,
resolveModelForComplexity, resolveModelForComplexity,
} from "./model-router.js"; } from "./model-router.js";
import { readStickyModelForUnit } from "./slice-routing-cache.js";
import { import {
filterModelsByProviderModelAllow, filterModelsByProviderModelAllow,
isProviderAllowedByLists, isProviderAllowedByLists,
@ -543,6 +544,15 @@ export async function selectAndApplyModel(
selectionMethod: "tier-only", selectionMethod: "tier-only",
}; };
} else { } else {
// Slice-sticky hint: prefer the model that previously succeeded
// on a sibling unit in this slice when its capability score is
// within window of the winner. Cleared on executor refusal so a
// failing model does not re-attach to the slice.
const stickyHint = readStickyModelForUnit(
basePath,
unitType,
unitId,
);
routingResult = resolveModelForComplexity( routingResult = resolveModelForComplexity(
classification, classification,
modelConfig, modelConfig,
@ -551,6 +561,7 @@ export async function selectAndApplyModel(
unitType, unitType,
classification.taskMetadata, classification.taskMetadata,
capabilityOverrides, capabilityOverrides,
stickyHint,
); );
} }
if (routingResult.wasDowngraded) { if (routingResult.wasDowngraded) {

View file

@ -82,7 +82,9 @@ import {
import { initRoutingHistory } from "./routing-history.js"; import { initRoutingHistory } from "./routing-history.js";
import { import {
acquireSessionLock, acquireSessionLock,
isSessionPidAlive,
releaseSessionLock, releaseSessionLock,
terminateExistingSession,
updateSessionLock, updateSessionLock,
} from "./session-lock.js"; } from "./session-lock.js";
import { getSessionModelOverride } from "./session-model-override.js"; import { getSessionModelOverride } from "./session-model-override.js";
@ -342,15 +344,91 @@ export async function bootstrapAutoSession(
lockBase, lockBase,
buildResolver, buildResolver,
} = deps; } = deps;
const lockResult = acquireSessionLock(base, { let lockResult = acquireSessionLock(base, {
sessionId: ctx.sessionManager?.getSessionId?.(), sessionId: ctx.sessionManager?.getSessionId?.(),
sessionFile: ctx.sessionManager?.getSessionFile?.(), sessionFile: ctx.sessionManager?.getSessionFile?.(),
}); });
// Lock busy on a *live* peer: instead of just refusing to start, ask the
// operator whether to terminate the existing session and take over. Two
// non-interactive escape hatches keep CI/headless usage predictable:
// - SF_KILL_EXISTING=1 (or =true / =yes) — auto-confirm the kill
// - SF_KILL_EXISTING=0 (or =false / =no) — auto-decline (current behavior)
// - SF_HEADLESS=1 with no SF_KILL_EXISTING — auto-decline (safe default
// for batch contexts where a hung interactive prompt would deadlock)
if (!lockResult.acquired && lockResult.existingPid) {
const existingPid = Number(lockResult.existingPid);
if (isSessionPidAlive(existingPid)) {
const envKill = String(process.env.SF_KILL_EXISTING ?? "")
.trim()
.toLowerCase();
const headless =
process.env.SF_HEADLESS === "1" ||
String(process.env.SF_HEADLESS ?? "").toLowerCase() === "true";
let confirmed;
if (envKill === "1" || envKill === "true" || envKill === "yes") {
confirmed = true;
} else if (envKill === "0" || envKill === "false" || envKill === "no") {
confirmed = false;
} else if (headless) {
// Headless without an explicit opt-in: refuse to kill silently.
confirmed = false;
} else if (typeof ctx.ui?.confirm === "function") {
confirmed = await ctx.ui.confirm(
"Stop running SF session?",
`Another SF autonomous session (PID ${existingPid}) is already running on this project. Stop it and start a fresh session?`,
);
} else {
confirmed = false;
}
if (confirmed) {
ctx.ui.notify(
`Stopping existing SF session (PID ${existingPid})…`,
"info",
);
let result;
try {
result = await terminateExistingSession(existingPid);
} catch (err) {
ctx.ui.notify(
`Failed to stop existing SF session (PID ${existingPid}): ${err?.message ?? err}. Stop it manually with \`kill ${existingPid}\`.`,
"error",
);
return false;
}
if (!result.terminated) {
ctx.ui.notify(
`Unable to stop existing SF session (PID ${existingPid}). It may belong to another user or be unresponsive. Stop it manually with \`kill -9 ${existingPid}\`.`,
"error",
);
return false;
}
ctx.ui.notify(
result.escalated
? `Existing SF session (PID ${existingPid}) did not exit on SIGTERM; SIGKILL applied.`
: `Existing SF session (PID ${existingPid}) stopped.`,
result.escalated ? "warning" : "info",
);
lockResult = acquireSessionLock(base, {
sessionId: ctx.sessionManager?.getSessionId?.(),
sessionFile: ctx.sessionManager?.getSessionFile?.(),
});
}
}
}
if (!lockResult.acquired) { if (!lockResult.acquired) {
const reason = lockResult.reason; const reason = lockResult.reason;
ctx.ui.notify(reason, "error"); ctx.ui.notify(reason, "error");
return false; return false;
} }
// Session-start janitor: prune per-flow trace files older than the longest
// analyzer window (30d). Best-effort, never blocks startup, errors swallowed
// in pruneStaleTraces. Keeps `.sf/traces/` from growing without bound.
try {
const { pruneStaleTraces } = await import("./uok/trace-writer.js");
pruneStaleTraces(base);
} catch {
// trace janitor must never break autonomous startup
}
function releaseLockAndReturn() { function releaseLockAndReturn() {
releaseSessionLock(base); releaseSessionLock(base);
clearLock(base); clearLock(base);

View file

@ -6,6 +6,7 @@
import { scopeActiveToolsForUnitType } from "../constants.js"; import { scopeActiveToolsForUnitType } from "../constants.js";
import { debugLog } from "../debug-logger.js"; import { debugLog } from "../debug-logger.js";
import { getErrorMessage } from "../error-utils.js";
import { import {
resolveAutoSupervisorConfig, resolveAutoSupervisorConfig,
resolvePersistModelChanges, resolvePersistModelChanges,
@ -27,11 +28,29 @@ import {
getCurrentTurnGeneration, getCurrentTurnGeneration,
runWithTurnGeneration, runWithTurnGeneration,
} from "./turn-epoch.js"; } from "./turn-epoch.js";
import { getErrorMessage } from "../error-utils.js";
// Tracks the latest session-switch attempt so a late timeout settlement from an // Tracks the latest session-switch attempt so a late timeout settlement from an
// older runUnit() call cannot clear the guard for a newer one. // older runUnit() call cannot clear the guard for a newer one.
let sessionSwitchGeneration = 0; let sessionSwitchGeneration = 0;
/**
* Build the custom-message content for a unit prompt.
*
* Purpose: preserve the exact prompt text while allowing the provider layer to
* cache the stable prefix separately from the dynamic suffix.
*
* Consumer: runUnit before pi.sendMessage dispatches the autonomous unit turn.
*/
export function buildUnitPromptMessageContent(prompt, promptParts) {
if (!promptParts) return prompt;
return [
{
type: "text",
text: `${promptParts.before}\n`,
cache_control: { type: "ephemeral" },
},
{ type: "text", text: promptParts.after },
];
}
/** /**
* Execute a single unit: create a new session, send the prompt, and await * Execute a single unit: create a new session, send the prompt, and await
* the agent_end promise. Returns a UnitResult describing what happened. * the agent_end promise. Returns a UnitResult describing what happened.
@ -122,8 +141,7 @@ export async function runUnit(ctx, pi, s, unitType, unitId, prompt, options) {
sessionResult = await Promise.race([sessionPromise, timeoutPromise]); sessionResult = await Promise.race([sessionPromise, timeoutPromise]);
} catch (sessionErr) { } catch (sessionErr) {
if (sessionTimeoutHandle) clearTimeout(sessionTimeoutHandle); if (sessionTimeoutHandle) clearTimeout(sessionTimeoutHandle);
const msg = const msg = getErrorMessage(sessionErr);
getErrorMessage(sessionErr);
debugLog("runUnit", { debugLog("runUnit", {
phase: "session-error", phase: "session-error",
unitType, unitType,
@ -264,16 +282,7 @@ export async function runUnit(ctx, pi, s, unitType, unitId, prompt, options) {
// When promptParts is available, send structured content so the provider can // When promptParts is available, send structured content so the provider can
// apply cache_control:ephemeral to the stable prefix (before) while leaving // apply cache_control:ephemeral to the stable prefix (before) while leaving
// the dynamic suffix (after) uncached. // the dynamic suffix (after) uncached.
const messageContent = promptParts const messageContent = buildUnitPromptMessageContent(prompt, promptParts);
? [
{
type: "text",
text: promptParts.before,
cache_control: { type: "ephemeral" },
},
{ type: "text", text: promptParts.after },
]
: prompt;
await pi.sendMessage( await pi.sendMessage(
{ customType: "sf-auto", content: messageContent, display: s.verbose }, { customType: "sf-auto", content: messageContent, display: s.verbose },
{ triggerTurn: true }, { triggerTurn: true },

View file

@ -301,7 +301,7 @@ export const TOP_LEVEL_SUBCOMMANDS = [
}, },
{ {
cmd: "rubber-duck", cmd: "rubber-duck",
desc: "Request constructive code/design review from a rubber-duck subagent (RUBBER_DUCK flag)", desc: "Dispatch a rubber-duck subagent for constructive pre-implementation review (alias: review-code)",
}, },
{ {
cmd: "delegate", cmd: "delegate",

View file

@ -613,25 +613,47 @@ async function handleKeepAlive(args, ctx) {
// ─── /rubber-duck ──────────────────────────────────────────────────────────── // ─── /rubber-duck ────────────────────────────────────────────────────────────
async function handleRubberDuckCommand(topic, ctx, _pi) { async function handleRubberDuckCommand(topic, ctx, _pi) {
if (!getExperimentalFlag("rubber_duck")) { const { execSync } = await import("node:child_process");
ctx.ui.notify( const root = projectRoot();
"RUBBER_DUCK is not enabled. Run /experimental on rubber_duck to enable.",
"warning", // Gather git diff for context (staged + unstaged, capped to avoid token bloat)
); let diff = "";
return;
}
const prompt = topic
? `Rubber-duck review requested: ${topic}\n\nPlease review this as a constructive critic: identify risks, edge cases, missing tests, and improvements. Be direct and concise.`
: "Please give constructive feedback on the current code changes or design. Identify risks, edge cases, missing tests, and improvements.";
ctx.ui.notify(
"Starting rubber-duck review… (RUBBER_DUCK agent is constructive, not adversarial)",
"info",
);
try { try {
await ctx.sendMessage?.(prompt); const staged = execSync("git diff --cached --stat 2>/dev/null || true", {
cwd: root,
encoding: "utf-8",
}).trim();
const unstaged = execSync("git diff --stat 2>/dev/null || true", {
cwd: root,
encoding: "utf-8",
}).trim();
if (staged || unstaged) {
const fullDiff = execSync(
"git diff --cached 2>/dev/null; git diff 2>/dev/null",
{ cwd: root, encoding: "utf-8" },
).slice(0, 8000);
diff = `\n\n## Current diff (truncated to 8 kB)\n\n\`\`\`diff\n${fullDiff}\n\`\`\``;
}
} catch {
// diff unavailable — not a hard failure
}
const focus = topic ? `Focus on: ${topic}\n\n` : "";
const reviewPrompt =
`Dispatch a \`rubber-duck\` subagent to review the current plan or changes before proceeding. ` +
`Use the \`subagent\` tool with \`agent: "rubber-duck"\`.\n\n` +
`${focus}` +
`Ask the rubber-duck agent to identify blocking issues, non-blocking issues, and suggestions. ` +
`After the subagent returns, summarise the verdict and any blocking findings in one short paragraph. ` +
`Do not proceed with implementation until the user acknowledges blocking findings.` +
diff;
ctx.ui.notify("Dispatching rubber-duck review…", "info");
try {
await ctx.sendMessage?.(reviewPrompt);
} catch { } catch {
ctx.ui.notify( ctx.ui.notify(
"Could not start rubber-duck session. Try typing your review request directly.", "Could not dispatch rubber-duck. Try: subagent agent=rubber-duck task='review current changes'",
"warning", "warning",
); );
} }

View file

@ -741,6 +741,66 @@ export class SFDashboardOverlay {
); );
} }
} }
// UOK Health section — aligns with headless status output
if (this.uokDiagnostics && this.uokDiagnostics.issues.length > 0) {
lines.push(blank());
lines.push(hr());
lines.push(row(th.fg("text", th.bold("UOK Health"))));
lines.push(blank());
// Compact summary line matching headless format
lines.push(
row(
th.fg(
this.uokDiagnostics.verdict === "degraded"
? "error"
: this.uokDiagnostics.verdict === "attention"
? "warning"
: "dim",
`Verdict: ${this.uokDiagnostics.verdict} (${this.uokDiagnostics.classification})`,
),
),
);
lines.push(blank());
// Issue list
for (const issue of this.uokDiagnostics.issues) {
const icon =
issue.severity === "error"
? th.fg("error", "✗")
: th.fg("warning", "⚠");
lines.push(row(` ${icon} ${th.fg("text", issue.code)}`));
lines.push(row(th.fg("dim", ` ${issue.message}`)));
}
// Recommendations
if (this.uokDiagnostics.recommendations.length > 0) {
lines.push(blank());
for (const rec of this.uokDiagnostics.recommendations) {
lines.push(row(th.fg("dim", `${rec}`)));
}
}
// Signals table
if (this.uokDiagnostics.signals) {
lines.push(blank());
lines.push(row(th.fg("dim", "Signals:")));
for (const [key, value] of Object.entries(
this.uokDiagnostics.signals,
)) {
const signalColor =
value === "ok" ||
value === "active" ||
value === "consistent" ||
value === "clear"
? "success"
: value === "unknown"
? "dim"
: "warning";
lines.push(
row(
` ${th.fg(signalColor, "●")} ${th.fg("text", key)}: ${th.fg(signalColor, String(value))}`,
),
);
}
}
}
// Environment health section (#1221) — only show issues // Environment health section (#1221) — only show issues
const envResults = runEnvironmentChecks( const envResults = runEnvironmentChecks(
this.dashData.basePath || process.cwd(), this.dashData.basePath || process.cwd(),

View file

@ -31,18 +31,12 @@ export const EXPERIMENTAL_FLAGS = {
"STATUS_LINE — run a user-defined script to feed a custom footer status chip", "STATUS_LINE — run a user-defined script to feed a custom footer status chip",
show_file: show_file:
"SHOW_FILE — show_file tool renders code snippets inline in the timeline", "SHOW_FILE — show_file tool renders code snippets inline in the timeline",
ask_elicitation:
"ASK_USER_ELICITATION — structured form/select UI replaces plain ask_user",
multi_turn_agents:
"MULTI_TURN_AGENTS — persistent subagents that accept follow-up messages",
extensions: extensions:
"EXTENSIONS — user-installable extensions via marketplace npm install", "EXTENSIONS — user-installable extensions via marketplace npm install",
configure_agent: configure_agent:
"CONFIGURE_COPILOT_AGENT — interactive wizard for MCP servers and agents", "CONFIGURE_COPILOT_AGENT — interactive wizard for MCP servers and agents",
background_sessions: background_sessions:
"BACKGROUND_SESSIONS — concurrent sessions with background switching", "BACKGROUND_SESSIONS — concurrent sessions with background switching",
rubber_duck:
"RUBBER_DUCK — constructive feedback subagent on code and designs",
prompt_frame: prompt_frame:
"PROMPT_FRAME — decorative border rendered above the input prompt", "PROMPT_FRAME — decorative border rendered above the input prompt",
streamer_mode: streamer_mode:

View file

@ -107,6 +107,8 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 30, speed: 30,
longContext: 80, longContext: 80,
instruction: 90, instruction: 90,
// Agentic: Claude Opus is built around extended tool-use loops.
agentic: 95,
}, },
"claude-sonnet-4-6": { "claude-sonnet-4-6": {
coding: 85, coding: 85,
@ -116,6 +118,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 60, speed: 60,
longContext: 75, longContext: 75,
instruction: 85, instruction: 85,
agentic: 92,
}, },
"claude-sonnet-4-5-20250514": { "claude-sonnet-4-5-20250514": {
coding: 85, coding: 85,
@ -125,6 +128,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 60, speed: 60,
longContext: 75, longContext: 75,
instruction: 85, instruction: 85,
agentic: 90,
}, },
"claude-3-5-sonnet-latest": { "claude-3-5-sonnet-latest": {
coding: 82, coding: 82,
@ -134,6 +138,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 62, speed: 62,
longContext: 70, longContext: 70,
instruction: 82, instruction: 82,
agentic: 85,
}, },
"claude-haiku-4-5": { "claude-haiku-4-5": {
coding: 60, coding: 60,
@ -143,6 +148,9 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 95, speed: 95,
longContext: 50, longContext: 50,
instruction: 75, instruction: 75,
// Haiku follows tool-use contracts but is less reliable than Sonnet on
// long agentic loops.
agentic: 75,
}, },
"claude-3-5-haiku-latest": { "claude-3-5-haiku-latest": {
coding: 60, coding: 60,
@ -152,6 +160,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 95, speed: 95,
longContext: 50, longContext: 50,
instruction: 75, instruction: 75,
agentic: 75,
}, },
"claude-3-haiku-20240307": { "claude-3-haiku-20240307": {
coding: 50, coding: 50,
@ -163,6 +172,7 @@ export const MODEL_CAPABILITY_PROFILES = {
instruction: 65, instruction: 65,
}, },
"claude-3-opus-latest": { "claude-3-opus-latest": {
agentic: 88,
coding: 90, coding: 90,
debugging: 85, debugging: 85,
research: 82, research: 82,
@ -234,6 +244,8 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 40, speed: 40,
longContext: 85, longContext: 85,
instruction: 90, instruction: 90,
// GPT-5 family is strongly agentic per OpenAI's tool-use evals.
agentic: 92,
}, },
"gpt-5-mini": { "gpt-5-mini": {
coding: 62, coding: 62,
@ -261,6 +273,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 35, speed: 35,
longContext: 88, longContext: 88,
instruction: 92, instruction: 92,
agentic: 94,
}, },
"gpt-5.1": { "gpt-5.1": {
coding: 93, coding: 93,
@ -270,6 +283,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 42, speed: 42,
longContext: 86, longContext: 86,
instruction: 91, instruction: 91,
agentic: 92,
}, },
"gpt-5.1-codex-max": { "gpt-5.1-codex-max": {
coding: 90, coding: 90,
@ -279,6 +293,9 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 55, speed: 55,
longContext: 75, longContext: 75,
instruction: 85, instruction: 85,
// Codex-tuned models are agentic-capable but not as reliable as the
// flagship gpt-5/5.x lineup for long tool-use loops.
agentic: 80,
}, },
"gpt-5.1-codex-mini": { "gpt-5.1-codex-mini": {
coding: 65, coding: 65,
@ -288,6 +305,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 88, speed: 88,
longContext: 48, longContext: 48,
instruction: 72, instruction: 72,
agentic: 55,
}, },
"gpt-5.2": { "gpt-5.2": {
coding: 93, coding: 93,
@ -297,6 +315,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 42, speed: 42,
longContext: 87, longContext: 87,
instruction: 91, instruction: 91,
agentic: 92,
}, },
"gpt-5.2-codex": { "gpt-5.2-codex": {
coding: 93, coding: 93,
@ -306,6 +325,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 50, speed: 50,
longContext: 78, longContext: 78,
instruction: 88, instruction: 88,
agentic: 82,
}, },
"gpt-5.3-codex": { "gpt-5.3-codex": {
coding: 94, coding: 94,
@ -315,6 +335,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 50, speed: 50,
longContext: 80, longContext: 80,
instruction: 89, instruction: 89,
agentic: 84,
}, },
"gpt-5.3-codex-spark": { "gpt-5.3-codex-spark": {
coding: 68, coding: 68,
@ -324,6 +345,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 90, speed: 90,
longContext: 50, longContext: 50,
instruction: 74, instruction: 74,
agentic: 55,
}, },
"gpt-5.4": { "gpt-5.4": {
coding: 95, coding: 95,
@ -333,6 +355,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 42, speed: 42,
longContext: 88, longContext: 88,
instruction: 92, instruction: 92,
agentic: 94,
}, },
"gpt-5.4-mini": { "gpt-5.4-mini": {
coding: 80, coding: 80,
@ -342,6 +365,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 72, speed: 72,
longContext: 72, longContext: 72,
instruction: 80, instruction: 80,
agentic: 80,
}, },
// GPT-5.5 scores are relative to the existing gpt-5.4 profile and backed by // GPT-5.5 scores are relative to the existing gpt-5.4 profile and backed by
// OpenAI's 2026-04-23 published eval deltas across coding, tool use, and long context. // OpenAI's 2026-04-23 published eval deltas across coding, tool use, and long context.
@ -354,6 +378,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 42, speed: 42,
longContext: 90, longContext: 90,
instruction: 93, instruction: 93,
agentic: 95,
}, },
// ── OpenAI o-series (reasoning-first) ────────────────────────────────────── // ── OpenAI o-series (reasoning-first) ──────────────────────────────────────
o1: { o1: {
@ -410,6 +435,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 48, speed: 48,
longContext: 98, longContext: 98,
instruction: 82, instruction: 82,
agentic: 85,
}, },
"gemini-3-pro-preview": { "gemini-3-pro-preview": {
coding: 82, coding: 82,
@ -419,6 +445,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 50, speed: 50,
longContext: 96, longContext: 96,
instruction: 82, instruction: 82,
agentic: 85,
}, },
"gemini-3-flash-preview": { "gemini-3-flash-preview": {
coding: 62, coding: 62,
@ -428,6 +455,10 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 88, speed: 88,
longContext: 88, longContext: 88,
instruction: 72, instruction: 72,
// Gemini Flash follows tool contracts but is occasionally chatty in
// agentic loops; mid-tier so it doesn't dominate execute-task vs
// a Sonnet/Opus/K2.6 alternative.
agentic: 70,
}, },
"gemini-3.1-flash-lite-preview": { "gemini-3.1-flash-lite-preview": {
coding: 55, coding: 55,
@ -583,6 +614,10 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 70, speed: 70,
longContext: 60, longContext: 60,
instruction: 80, instruction: 80,
// Agentic: code-completion tuning. Refuses agentic tasks with "I'm sorry,
// I don't have the necessary tools" (M001-6377a4/S04/T02, 2026-05-12).
// Should not be routed to execute-task without explicit operator pin.
agentic: 25,
}, },
"ministral-8b-latest": { "ministral-8b-latest": {
coding: 55, coding: 55,
@ -655,6 +690,9 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 65, speed: 65,
longContext: 65, longContext: 65,
instruction: 80, instruction: 80,
// Agentic: Devstral series is coding-completion-tuned; tool-use is not
// the design target. Penalize so execute-task routing avoids it.
agentic: 30,
}, },
"devstral-medium-latest": { "devstral-medium-latest": {
coding: 78, coding: 78,
@ -664,6 +702,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 75, speed: 75,
longContext: 60, longContext: 60,
instruction: 75, instruction: 75,
agentic: 30,
}, },
"devstral-medium-2507": { "devstral-medium-2507": {
coding: 78, coding: 78,
@ -673,6 +712,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 75, speed: 75,
longContext: 60, longContext: 60,
instruction: 75, instruction: 75,
agentic: 30,
}, },
"devstral-small-2505": { "devstral-small-2505": {
coding: 60, coding: 60,
@ -682,6 +722,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 90, speed: 90,
longContext: 45, longContext: 45,
instruction: 65, instruction: 65,
agentic: 30,
}, },
"devstral-small-2507": { "devstral-small-2507": {
coding: 60, coding: 60,
@ -691,6 +732,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 90, speed: 90,
longContext: 45, longContext: 45,
instruction: 65, instruction: 65,
agentic: 30,
}, },
"labs-devstral-small-2512": { "labs-devstral-small-2512": {
coding: 65, coding: 65,
@ -700,6 +742,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 88, speed: 88,
longContext: 60, longContext: 60,
instruction: 68, instruction: 68,
agentic: 30,
}, },
// ── Zhipu AI (GLM) ───────────────────────────────────────────────────────── // ── Zhipu AI (GLM) ─────────────────────────────────────────────────────────
"glm-5": { "glm-5": {
@ -774,6 +817,8 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 58, speed: 58,
longContext: 86, longContext: 86,
instruction: 78, instruction: 78,
// Agentic: qwen3-coder is tuned for code completion, not tool-use loops.
agentic: 40,
}, },
"qwen3-coder-next": { "qwen3-coder-next": {
coding: 82, coding: 82,
@ -783,6 +828,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 70, speed: 70,
longContext: 86, longContext: 86,
instruction: 76, instruction: 76,
agentic: 40,
}, },
"qwen3-next:80b": { "qwen3-next:80b": {
coding: 70, coding: 70,
@ -802,6 +848,9 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 55, speed: 55,
longContext: 86, longContext: 86,
instruction: 84, instruction: 84,
// Agentic: K2.6 is the pinned default for the autonomous-solver role
// (ADR-0079) — refusal-resistant and follows tool-use contracts.
agentic: 90,
}, },
"kimi-for-coding": { "kimi-for-coding": {
coding: 88, coding: 88,
@ -811,6 +860,9 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 55, speed: 55,
longContext: 86, longContext: 86,
instruction: 84, instruction: 84,
// `kimi-for-coding` is an alias for K2.6 on the Kimi Code provider
// (memory: bayesian-blender/benchmark-selector both canonicalize it).
agentic: 90,
}, },
"kimi-k2-thinking": { "kimi-k2-thinking": {
coding: 86, coding: 86,
@ -820,8 +872,15 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 30, speed: 30,
longContext: 86, longContext: 86,
instruction: 84, instruction: 84,
agentic: 88,
}, },
// ── MiniMax ─────────────────────────────────────────────────────────────── // ── MiniMax ───────────────────────────────────────────────────────────────
// Profiles ordered by generation. Older M2.1 generation gets distinctly
// lower agentic + capability scores: the M2.1 stuck-checkpoint loop on
// 2026-05-13 (infra repo) traced back to M2.1 being aliased to M2.7's
// profile, winning execute-task on cost, then failing to follow the
// checkpoint contract reliably across 60+ tool calls. (See
// self-feedback sf-mp37kjmo-1mfuru.)
"MiniMax-M2.7": { "MiniMax-M2.7": {
coding: 84, coding: 84,
debugging: 80, debugging: 80,
@ -830,6 +889,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 52, speed: 52,
longContext: 84, longContext: 84,
instruction: 82, instruction: 82,
agentic: 78,
}, },
"MiniMax-M2.7-highspeed": { "MiniMax-M2.7-highspeed": {
coding: 82, coding: 82,
@ -839,6 +899,47 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 72, speed: 72,
longContext: 84, longContext: 84,
instruction: 80, instruction: 80,
agentic: 76,
},
"MiniMax-M2.5": {
// Distinct profile (previously aliased to M2.7 — overclaimed).
coding: 78,
debugging: 74,
research: 72,
reasoning: 78,
speed: 55,
longContext: 82,
instruction: 76,
// Mid agentic — better than coding-completion-only models but
// noticeably less reliable than current-gen agentic models.
agentic: 60,
},
"MiniMax-M2.1": {
// Distinct profile (previously aliased to M2.7 — overclaimed).
// M2.1 has demonstrated unreliable tool-use loops in production
// (M001-6377a4 / 1-ci-build-pipeline parallel-research, 2026-05-13:
// 60+ checkpoint calls with shifting unitId claims). Penalize the
// agentic axis so execute-task routing avoids it absent operator
// override.
coding: 72,
debugging: 66,
research: 64,
reasoning: 70,
speed: 60,
longContext: 78,
instruction: 72,
agentic: 40,
},
"MiniMax-M2": {
// Earliest of the M2.x line — older still.
coding: 68,
debugging: 60,
research: 60,
reasoning: 66,
speed: 62,
longContext: 76,
instruction: 68,
agentic: 35,
}, },
}; };
const MODEL_CAPABILITY_ALIASES = { const MODEL_CAPABILITY_ALIASES = {
@ -864,10 +965,23 @@ const MODEL_CAPABILITY_ALIASES = {
"kimi-for-coding": "kimi-k2.6", "kimi-for-coding": "kimi-k2.6",
"kimi-k2.6:cloud": "kimi-k2.6", "kimi-k2.6:cloud": "kimi-k2.6",
"kimi-k2.6-cloud": "kimi-k2.6", "kimi-k2.6-cloud": "kimi-k2.6",
"minimax-m2": "MiniMax-M2.7", // Each MiniMax generation now has its own profile — previously they all
"minimax-m2.1": "MiniMax-M2.7", // aliased to MiniMax-M2.7, which let older/weaker models inherit current
"minimax-m2.5": "MiniMax-M2.7", // capability scores and win cost tie-breaks on execute-task. The aliases
// below normalize provider-prefixed and casing variants to the canonical
// per-generation profile, NOT to the current generation.
"minimax-m2": "MiniMax-M2",
"minimax/MiniMax-M2": "MiniMax-M2",
"minimax/minimax-m2": "MiniMax-M2",
"minimax-m2.1": "MiniMax-M2.1",
"minimax/MiniMax-M2.1": "MiniMax-M2.1",
"minimax/minimax-m2.1": "MiniMax-M2.1",
"minimax-m2.5": "MiniMax-M2.5",
"minimax/MiniMax-M2.5": "MiniMax-M2.5",
"minimax/minimax-m2.5": "MiniMax-M2.5",
"minimax-m2.7": "MiniMax-M2.7", "minimax-m2.7": "MiniMax-M2.7",
"minimax/MiniMax-M2.7": "MiniMax-M2.7",
"minimax/minimax-m2.7": "MiniMax-M2.7",
"mistral-large-3:675b": "mistral-large-latest", "mistral-large-3:675b": "mistral-large-latest",
"ministral-3:3b": "ministral-3b-latest", "ministral-3:3b": "ministral-3b-latest",
"ministral-3:8b": "ministral-8b-latest", "ministral-3:8b": "ministral-8b-latest",
@ -888,18 +1002,32 @@ const MODEL_CAPABILITY_ALIASES = {
// ─── Base Task Requirements Data Table ─────────────────────────────────────── // ─── Base Task Requirements Data Table ───────────────────────────────────────
// Per-unit-type base requirement vectors. Weights indicate how important each // Per-unit-type base requirement vectors. Weights indicate how important each
// capability dimension is for this unit type. // capability dimension is for this unit type.
//
// The `agentic` dimension represents the model's reliability at multi-turn
// tool-use loops (does it follow the tool-use contract? does it refuse the
// task? does it call the checkpoint tool when asked?). It is weighted high
// for any unit type that actually uses tools at runtime — execute-task most
// of all. See ADR-0079 for the motivation: a Codestral-style refusal on
// execute-task in M001-6377a4/S04/T02 (2026-05-12) traced back to the router
// having no agentic axis, so a coding-completion model out-scored agentic
// alternatives on coding/instruction.
export const BASE_REQUIREMENTS = { export const BASE_REQUIREMENTS = {
"execute-task": { coding: 0.9, instruction: 0.7, speed: 0.3 }, "execute-task": {
coding: 0.9,
instruction: 0.7,
speed: 0.3,
agentic: 0.85,
},
"research-milestone": { research: 0.9, longContext: 0.7, reasoning: 0.5 }, "research-milestone": { research: 0.9, longContext: 0.7, reasoning: 0.5 },
"research-slice": { research: 0.9, longContext: 0.7, reasoning: 0.5 }, "research-slice": { research: 0.9, longContext: 0.7, reasoning: 0.5 },
"plan-milestone": { reasoning: 0.9, coding: 0.5 }, "plan-milestone": { reasoning: 0.9, coding: 0.5, agentic: 0.6 },
"plan-slice": { reasoning: 0.9, coding: 0.5 }, "plan-slice": { reasoning: 0.9, coding: 0.5, agentic: 0.6 },
"replan-slice": { reasoning: 0.9, debugging: 0.6, coding: 0.5 }, "replan-slice": { reasoning: 0.9, debugging: 0.6, coding: 0.5, agentic: 0.6 },
"reassess-roadmap": { reasoning: 0.9, research: 0.5 }, "reassess-roadmap": { reasoning: 0.9, research: 0.5, agentic: 0.4 },
"complete-slice": { instruction: 0.8, speed: 0.7 }, "complete-slice": { instruction: 0.8, speed: 0.7, agentic: 0.6 },
"run-uat": { instruction: 0.7, speed: 0.8 }, "run-uat": { instruction: 0.7, speed: 0.8, agentic: 0.6 },
"discuss-milestone": { reasoning: 0.6, instruction: 0.7 }, "discuss-milestone": { reasoning: 0.6, instruction: 0.7, agentic: 0.4 },
"complete-milestone": { instruction: 0.8, reasoning: 0.5 }, "complete-milestone": { instruction: 0.8, reasoning: 0.5, agentic: 0.5 },
}; };
// ─── Public API ────────────────────────────────────────────────────────────── // ─── Public API ──────────────────────────────────────────────────────────────
/** /**
@ -1101,6 +1229,7 @@ export function resolveModelForComplexity(
unitType, unitType,
taskMetadata, taskMetadata,
capabilityOverrides, capabilityOverrides,
stickyHint,
) { ) {
// If no phase config or routing disabled, pass through // If no phase config or routing disabled, pass through
if (!phaseConfig || !routingConfig.enabled) { if (!phaseConfig || !routingConfig.enabled) {
@ -1175,16 +1304,41 @@ export function resolveModelForComplexity(
if (winner) { if (winner) {
const capScores = {}; const capScores = {};
for (const s of scored) capScores[s.modelId] = s.score; for (const s of scored) capScores[s.modelId] = s.score;
const fallbacks = buildFallbackChain(winner.modelId, phaseConfig); // Slice-sticky preference: if a model previously succeeded on a
// sibling unit in this slice AND it is still eligible in the
// current tier AND its capability score is within STICKY_WINDOW of
// the winner, prefer it. Stops within-slice routing thrash where
// T01 → gemini-flash and T02 → codestral on the same slice.
const STICKY_WINDOW_POINTS = 8;
const stickyId = (() => {
if (!stickyHint?.id) return null;
const stickyKey = stickyHint.provider
? `${stickyHint.provider}/${stickyHint.id}`
: stickyHint.id;
// Match either "provider/model" or bare model id in the eligible list.
const found = scored.find(
(s) => s.modelId === stickyKey || s.modelId.endsWith(`/${stickyHint.id}`),
);
if (!found) return null;
if (winner.score - found.score > STICKY_WINDOW_POINTS) return null;
return found.modelId;
})();
const selectedId = stickyId ?? winner.modelId;
const selectedScore = (
scored.find((s) => s.modelId === selectedId) ?? winner
).score;
const fallbacks = buildFallbackChain(selectedId, phaseConfig);
return { return {
modelId: winner.modelId, modelId: selectedId,
fallbacks, fallbacks,
tier: requestedTier, tier: requestedTier,
wasDowngraded: true, wasDowngraded: true,
reason: `capability-scored: ${winner.modelId} (${winner.score.toFixed(1)}) for ${unitType}`, reason: stickyId
? `slice-sticky: ${selectedId} (${selectedScore.toFixed(1)}, within ${STICKY_WINDOW_POINTS}pt of capability winner) for ${unitType}`
: `capability-scored: ${selectedId} (${selectedScore.toFixed(1)}) for ${unitType}`,
capabilityScores: capScores, capabilityScores: capScores,
taskRequirements: requirements, taskRequirements: requirements,
selectionMethod: "capability-scored", selectionMethod: stickyId ? "slice-sticky" : "capability-scored",
}; };
} }
} }

View file

@ -137,6 +137,11 @@ export function reorderForCaching(prompt) {
* static+semi-static prefix can be marked with cache_control: ephemeral on * static+semi-static prefix can be marked with cache_control: ephemeral on
* Anthropic-compatible providers. * Anthropic-compatible providers.
* *
* Purpose: keep SF autonomous prompt prefixes byte-stable across adjacent task
* dispatches so provider prompt caches can reuse expensive context.
*
* Consumer: auto/phases-unit.js before runUnit dispatches an autonomous unit.
*
* Returns `{before: string, after: string}` where: * Returns `{before: string, after: string}` where:
* - `before` = preamble + all static + all semi-static sections (cache this) * - `before` = preamble + all static + all semi-static sections (cache this)
* - `after` = all dynamic sections (do not cache) * - `after` = all dynamic sections (do not cache)

View file

@ -596,3 +596,103 @@ function isPidAlive(pid) {
return false; return false;
} }
} }
/**
 * Public liveness probe for a session PID.
 *
 * Why: callers outside this module — specifically auto-start's
 * prompt-to-kill flow — need to know whether the existingPid reported by a
 * failed acquireSessionLock still refers to a running process before
 * offering to terminate it.
 *
 * @param {number|string} pid - PID to probe; coerced via Number().
 * @returns {boolean} true iff the process currently responds as alive.
 */
export function isSessionPidAlive(pid) {
  const numericPid = Number(pid);
  return isPidAlive(numericPid);
}
/**
 * Terminate an existing SF auto session by PID.
 *
 * Why: when acquireSessionLock reports `{ acquired: false, existingPid }`
 * because another SF process is holding the lock, we want a one-call helper
 * that an interactive caller can invoke after confirming with the user. The
 * helper sends SIGTERM, polls for the process to exit, escalates to SIGKILL
 * after the grace window, and waits a short tail for the kernel to reap the
 * PID so a subsequent acquireSessionLock retry sees a dead PID and proceeds
 * down the stale-lock recovery path.
 *
 * Returns `{ terminated: boolean, escalated: boolean, alreadyDead: boolean }`.
 * `terminated` is true iff the PID is no longer alive when the call returns.
 * `escalated` is true iff SIGKILL was needed because SIGTERM did not produce
 * an exit within `gracePeriodMs`.
 *
 * Consumer: auto-start's prompt-to-kill flow. Not part of the normal
 * autonomous loop — only invoked after explicit operator consent.
 *
 * @param {number} pid - The PID to terminate.
 * @param {object} [options]
 * @param {number} [options.gracePeriodMs=5000] - How long to wait between
 *   SIGTERM and SIGKILL.
 * @param {number} [options.reapWaitMs=1000] - How long to wait after the
 *   final kill signal for the kernel to reap.
 * @param {number} [options.pollIntervalMs=100] - Poll interval used while
 *   waiting for exit.
 */
export async function terminateExistingSession(pid, options = {}) {
  const numericPid = Number(pid);
  if (!Number.isInteger(numericPid) || numericPid <= 0) {
    // Invalid input — nothing to signal. Kept as terminated:false to match
    // the existing caller-visible contract for malformed PIDs.
    return { terminated: false, escalated: false, alreadyDead: true };
  }
  if (numericPid === process.pid) {
    // Refuse to terminate ourselves — would deadlock the caller.
    return { terminated: false, escalated: false, alreadyDead: false };
  }
  if (!isPidAlive(numericPid)) {
    return { terminated: true, escalated: false, alreadyDead: true };
  }
  const gracePeriodMs = Number(options.gracePeriodMs ?? 5000);
  const reapWaitMs = Number(options.reapWaitMs ?? 1000);
  const pollIntervalMs = Math.max(50, Number(options.pollIntervalMs ?? 100));

  // Phase 1: polite SIGTERM, then wait out the grace window.
  const termOutcome = sendSignal(numericPid, "SIGTERM");
  if (termOutcome === "gone") {
    // Process exited between the alive check and the kill.
    return { terminated: true, escalated: false, alreadyDead: true };
  }
  if (termOutcome === "denied") {
    // Not ours to kill — surface as not-terminated.
    return { terminated: false, escalated: false, alreadyDead: false };
  }
  if (await waitUntilDead(numericPid, gracePeriodMs, pollIntervalMs)) {
    return { terminated: true, escalated: false, alreadyDead: false };
  }

  // Phase 2: grace expired — escalate to SIGKILL and wait for the reap.
  const killOutcome = sendSignal(numericPid, "SIGKILL");
  if (killOutcome === "gone") {
    return { terminated: true, escalated: true, alreadyDead: false };
  }
  if (killOutcome === "denied") {
    return { terminated: false, escalated: true, alreadyDead: false };
  }
  const reaped = await waitUntilDead(numericPid, reapWaitMs, pollIntervalMs);
  return { terminated: reaped, escalated: true, alreadyDead: false };
}

/**
 * Send a signal to a PID, normalizing the two expected failure modes.
 *
 * Why a helper: the SIGTERM and SIGKILL paths share identical ESRCH/EPERM
 * handling; any other error is unexpected and rethrown.
 *
 * @returns {"sent"|"gone"|"denied"} "gone" = ESRCH (already exited),
 *   "denied" = EPERM (not permitted to signal).
 */
function sendSignal(pid, signal) {
  try {
    process.kill(pid, signal);
    return "sent";
  } catch (err) {
    if (err?.code === "ESRCH") return "gone";
    if (err?.code === "EPERM") return "denied";
    throw err;
  }
}

/**
 * Poll until the PID stops being alive or the timeout elapses.
 *
 * @returns {Promise<boolean>} true iff the PID is dead when this resolves
 *   (includes a final liveness check after the deadline passes).
 */
async function waitUntilDead(pid, timeoutMs, pollIntervalMs) {
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    if (!isPidAlive(pid)) return true;
    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
  }
  return !isPidAlive(pid);
}

View file

@ -0,0 +1,154 @@
/**
* slice-routing-cache.js per-slice sticky-model routing cache.
*
* Why: model routing is currently computed per-unit, so the executor can flip
* between models within a single slice (M001-6377a4/S04 routed T01 to
 * gemini-3-flash-preview, then T02 to codestral-latest — the second was
 * unfit and refused the task; see ADR-0079). Once a model has successfully
* completed work on a slice, prefer it for the slice's sibling units unless
* a hard mismatch forces a switch.
*
* Contract:
* - Cache is small JSON keyed by sliceId. Each entry stores provider/id and
* timestamps so stale entries can be aged out.
* - Best-effort: read/write errors are swallowed; routing always has a
* fallback through the capability scorer.
* - Only successful outcomes (`continue` or `complete`) write to the cache.
* Refusal/blocker outcomes clear the entry so a failing model does not
* re-attach to the slice.
*
* Consumer: auto-model-selection.js reads before calling
* resolveModelForComplexity; auto/phases-unit.js writes after a successful
* checkpoint and clears on `executor-refused`.
*/
import { existsSync, mkdirSync, readFileSync, unlinkSync } from "node:fs";
import { dirname, join } from "node:path";
import { atomicWriteSync } from "./atomic-write.js";
import { sfRuntimeRoot } from "./paths.js";
const CACHE_FILE = "slice-routing.json";
const DEFAULT_MAX_AGE_MS = 7 * 24 * 60 * 60 * 1000; // 7 days
// Absolute path of the slice-routing cache file under the SF runtime root.
function cachePath(basePath) {
  const runtimeRoot = sfRuntimeRoot(basePath);
  return join(runtimeRoot, CACHE_FILE);
}
/**
 * Derive the slice scope from a unit id.
 *
 * Conventional SF unit-id grammar:
 * - execute task:          "<milestoneId>/<sliceId>/<taskId>" → "<milestoneId>/<sliceId>"
 * - plan / complete slice: "<milestoneId>/<sliceId>"          → "<milestoneId>/<sliceId>"
 * - milestone-level units: "<milestoneId>"                    → "<milestoneId>" (no slice scope)
 *
 * @param {string} unitId
 * @returns {string|null} the slice scope, or null when the unit id is
 *   missing or unparseable.
 */
export function extractSliceScope(unitId) {
  if (typeof unitId !== "string" || unitId === "") return null;
  const segments = unitId.split("/").filter((part) => part.length > 0);
  switch (segments.length) {
    case 0:
      return null;
    case 1:
      // Milestone-only id — the milestone itself is the scope.
      return segments[0];
    default:
      return segments.slice(0, 2).join("/");
  }
}
// Load the cache file; a missing or corrupt file reads as an empty cache so
// routing always has a clean fallback through the capability scorer.
function readCache(basePath) {
  const file = cachePath(basePath);
  if (!existsSync(file)) {
    return {};
  }
  try {
    const raw = readFileSync(file, "utf-8");
    return JSON.parse(raw);
  } catch {
    // Unparseable JSON — treat as empty rather than failing the dispatch.
    return {};
  }
}
// Persist the cache atomically; failures are swallowed because the cache is
// strictly best-effort (a lost write only costs one sticky-routing hint).
function writeCache(basePath, data) {
  const file = cachePath(basePath);
  try {
    mkdirSync(dirname(file), { recursive: true });
    const serialized = JSON.stringify(data, null, 2);
    atomicWriteSync(file, serialized);
  } catch {
    // best-effort
  }
}
/**
 * Record the model that successfully handled a unit. The slice scope is
 * derived from the unit id; subsequent units in the same slice will see this
 * entry as the sticky hint.
 *
 * @param {string} basePath
 * @param {string} unitType
 * @param {string} unitId
 * @param {{ provider?: string, id: string }} model
 */
export function recordSliceRouting(basePath, unitType, unitId, model) {
  if (!basePath) return;
  if (!model?.id) return;
  const sliceId = extractSliceScope(unitId);
  if (sliceId === null) return;
  const entry = {
    provider: String(model.provider ?? ""),
    id: String(model.id),
    // Timestamp lets readStickyModelForUnit age out stale entries.
    ts: new Date().toISOString(),
    lastUnitType: String(unitType ?? ""),
    lastUnitId: String(unitId ?? ""),
  };
  const cache = readCache(basePath);
  cache[sliceId] = entry;
  writeCache(basePath, cache);
}
/**
 * Look up the sticky model for the slice that contains this unit. Returns
 * null when there is no entry, when it's older than maxAgeMs, or when the
 * cache cannot be read.
 *
 * @param {string} basePath
 * @param {string} unitType
 * @param {string} unitId
 * @param {object} [options]
 * @param {number} [options.maxAgeMs=7d]
 * @returns {{ provider: string, id: string } | null}
 */
export function readStickyModelForUnit(basePath, unitType, unitId, options = {}) {
  if (!basePath) return null;
  const sliceId = extractSliceScope(unitId);
  if (!sliceId) return null;
  const entry = readCache(basePath)[sliceId];
  if (!entry?.id) return null;
  const maxAgeMs = Number(options.maxAgeMs ?? DEFAULT_MAX_AGE_MS);
  if (entry.ts) {
    // An unparseable timestamp yields NaN — treated as not-expired, matching
    // the best-effort posture of the rest of this module.
    const ageMs = Date.now() - new Date(entry.ts).getTime();
    const expired = Number.isFinite(ageMs) && ageMs > maxAgeMs;
    if (expired) return null;
  }
  return {
    provider: String(entry.provider ?? ""),
    id: String(entry.id),
  };
}
/**
 * Evict the sticky entry for the slice containing this unit. Called when the
 * model attached to the slice refuses or hits a hard mismatch, so the next
 * dispatch falls back to the capability scorer instead of re-pinning the
 * broken model.
 *
 * @param {string} basePath
 * @param {string} unitId
 */
export function clearSliceRoutingForUnit(basePath, unitId) {
  if (!basePath) return;
  const sliceId = extractSliceScope(unitId);
  if (!sliceId) return;
  const cache = readCache(basePath);
  if (!Object.hasOwn(cache, sliceId)) return;
  delete cache[sliceId];
  if (Object.keys(cache).length > 0) {
    writeCache(basePath, cache);
    return;
  }
  // Last entry removed — delete the file instead of persisting "{}".
  try {
    unlinkSync(cachePath(basePath));
  } catch {
    // best-effort
  }
}
/**
 * Test/debug only — read the entire cache as a raw slice-id -> entry map.
 * Production callers should use readStickyModelForUnit instead, which
 * applies the TTL and shape-normalization logic.
 */
export function _readCacheForTests(basePath) {
  return readCache(basePath);
}

View file

@ -0,0 +1,467 @@
/**
* Dashboard Overlay UOK Diagnostics Tests
*
* Purpose: Verify that SFDashboardOverlay consumes writeUokDiagnostics output
* and renders it consistently with the headless status command.
*
* Consumer: TUI users who expect the dashboard to surface the same UOK health
* information as `sf status` / headless query.
*/
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
// ─── Hoisted mocks ─────────────────────────────────────────────────────────
// Canned writeUokDiagnostics payloads — one per verdict the overlay renders.
// Hoisted so the vi.mock factories below can close over them.
const mockDiagnostics = vi.hoisted(() => ({
  // Healthy baseline: all signals ok, no issues, no recommendations.
  clear: {
    schemaVersion: 1,
    generatedAt: new Date().toISOString(),
    verdict: "clear",
    classification: "healthy",
    signals: {
      lock: "active",
      parity: "ok",
      ledger: "consistent",
      runtimeProjection: "ok",
      wrapper: "clear",
    },
    currentUnit: null,
    latestRun: null,
    runtimeUnits: [],
    issues: [],
    recommendations: [],
    reportPath: "/tmp/uok-diagnostics.json",
  },
  // Needs-repair state: stale lock plus orphaned ledger runs (two errors).
  degraded: {
    schemaVersion: 1,
    generatedAt: new Date().toISOString(),
    verdict: "degraded",
    classification: "needs-repair",
    signals: {
      lock: "stale",
      parity: "ok",
      ledger: "open-runs",
      runtimeProjection: "stale",
      wrapper: "unknown",
    },
    currentUnit: null,
    latestRun: null,
    runtimeUnits: [],
    issues: [
      {
        code: "stale-lock",
        severity: "error",
        message: "Stale auto.lock detected for PID 12345.",
        evidence: { lock: { pid: 12345 } },
      },
      {
        code: "open-ledger-without-live-lock",
        severity: "error",
        message:
          "UOK ledger has 2 started run(s) without a live auto.lock owner.",
        evidence: { runIds: ["run-1", "run-2"] },
      },
    ],
    recommendations: [
      "Clear stale auto.lock before dispatch.",
      "Mark orphaned UOK runs recovered or restart from lock owner.",
    ],
    reportPath: "/tmp/uok-diagnostics.json",
  },
  // Warning-level state: parity degraded while a unit is executing.
  attention: {
    schemaVersion: 1,
    generatedAt: new Date().toISOString(),
    verdict: "attention",
    classification: "degraded",
    signals: {
      lock: "active",
      parity: "degraded",
      ledger: "consistent",
      runtimeProjection: "ok",
      wrapper: "unknown",
    },
    currentUnit: { unitType: "execute-task", unitId: "T01", pid: 12345 },
    latestRun: null,
    runtimeUnits: [],
    issues: [
      {
        code: "uok-parity-degraded",
        severity: "warning",
        message:
          "UOK parity degraded: 1 critical mismatch(es), 0 missing exit(s).",
        evidence: { current: { criticalMismatches: 1, missingExitEvents: 0 } },
      },
    ],
    recommendations: ["Reconcile UOK parity before mutating git state."],
    reportPath: "/tmp/uok-diagnostics.json",
  },
}));
// Dashboard data stub returned by the mocked getAutoDashboardData(). Only
// basePath is load-bearing in these tests — the overlay forwards it to
// writeUokDiagnostics, which the first test asserts on.
const dashDataMock = vi.hoisted(() => ({
  basePath: "/tmp/sf-test",
  active: false,
  paused: false,
  remoteSession: null,
  currentUnit: null,
  elapsed: 0,
  rtkEnabled: false,
  rtkSavings: null,
  pendingCaptureCount: 0,
}));
// Module mocks. The diagnostic-synthesis mock is the one under test here;
// everything else stubs the overlay's other data sources to inert values so
// loadData()/buildContentLines() run without touching disk, db, or git.
vi.mock("../uok/diagnostic-synthesis.js", () => ({
  writeUokDiagnostics: vi.fn((_basePath, _options) => mockDiagnostics.clear),
}));
vi.mock("../state.js", () => ({
  deriveState: vi.fn(async () => ({
    activeMilestone: null,
    activeSlice: null,
    activeTask: null,
    phase: "idle",
    progress: null,
    nextAction: null,
    blockers: [],
    registry: [],
  })),
}));
vi.mock("../sf-db.js", () => ({
  isDbAvailable: vi.fn(() => false),
  getMilestoneSlices: vi.fn(() => []),
  getSliceTasks: vi.fn(() => []),
}));
vi.mock("../auto.js", () => ({
  getAutoDashboardData: vi.fn(() => dashDataMock),
}));
vi.mock("../auto-dashboard.js", () => ({
  estimateTimeRemaining: vi.fn(() => null),
}));
vi.mock("../progress-score.js", () => ({
  computeProgressScore: vi.fn(() => ({
    level: "green",
    summary: "All systems healthy",
    signals: [],
  })),
}));
vi.mock("../doctor-environment.js", () => ({
  runEnvironmentChecks: vi.fn(() => []),
}));
vi.mock("../worktree-command.js", () => ({
  getActiveWorktreeName: vi.fn(() => null),
}));
vi.mock("../subagent/worker-registry.js", () => ({
  hasActiveWorkers: vi.fn(() => false),
  getWorkerBatches: vi.fn(() => new Map()),
}));
vi.mock("../metrics.js", () => ({
  getLedger: vi.fn(() => null),
  getProjectTotals: vi.fn(() => ({})),
  aggregateByPhase: vi.fn(() => []),
  aggregateBySlice: vi.fn(() => []),
  aggregateByModel: vi.fn(() => []),
  aggregateCacheHitRate: vi.fn(() => 0),
  formatCost: vi.fn((n) => `$${n.toFixed(2)}`),
  formatCostProjection: vi.fn(() => []),
  formatTokenCount: vi.fn((n) => String(n)),
}));
vi.mock("../paths.js", () => ({
  resolveMilestoneFile: vi.fn(() => null),
}));
vi.mock("../files.js", () => ({
  loadFile: vi.fn(async () => null),
}));
vi.mock("../preferences.js", () => ({
  loadEffectiveSFPreferences: vi.fn(() => null),
}));
// Partial mock: keep the real TUI module but pin the key codes and width
// helpers to deterministic, ANSI-free implementations for rendering tests.
vi.mock("@singularity-forge/tui", async (importOriginal) => {
  const actual = (await importOriginal()) as any;
  return {
    ...actual,
    Key: {
      escape: "\u001B",
      ctrl: (c: string) => `\u0000${c}`,
      ctrlAlt: (c: string) => `\u001B\u0000${c}`,
      ctrlShift: (c: string) => `\u001B\u0000${c.toUpperCase()}`,
      down: "\u001B[B",
      up: "\u001B[A",
    },
    matchesKey: vi.fn(() => false),
    truncateToWidth: vi.fn((s: string, w: number) =>
      s.length > w ? s.slice(0, w) : s,
    ),
    visibleWidth: vi.fn((s: string) => s.length),
  };
});
// Simplified layout helpers so assertions can match on plain substrings.
vi.mock("../shared/mod.js", () => ({
  centerLine: vi.fn(
    (s: string, w: number) =>
      " ".repeat(Math.max(0, Math.floor((w - s.length) / 2))) + s,
  ),
  fitColumns: vi.fn((parts: string[], _w: number, _sep: string) =>
    parts.join(" "),
  ),
  formatDuration: vi.fn((ms: number) => `${Math.round(ms / 1000)}s`),
  joinColumns: vi.fn(
    (left: string, right: string, _w: number) =>
      `${left}${" ".repeat(Math.max(1, _w - left.length - right.length))}${right}`,
  ),
  padRight: vi.fn((s: string, w: number) => s.padEnd(w, " ")),
  STATUS_COLOR: {
    done: "success",
    active: "accent",
    pending: "dim",
  },
  STATUS_GLYPH: {
    done: "✓",
    active: "▶",
    pending: "○",
  },
}));
vi.mock("../shortcut-defs.js", () => ({
  formattedShortcutPair: vi.fn(() => "ctrl+alt+g"),
}));
// ─── Helpers ───────────────────────────────────────────────────────────────
/** Theme stub: tags text with its color/weight so assertions can see styling. */
function createMockTheme() {
  const fg = vi.fn((color: string, text: string) => `[${color}:${text}]`);
  const bold = vi.fn((text: string) => `**${text}**`);
  return { fg, bold };
}
/** TUI stub: the overlay only ever calls requestRender. */
function createMockTui() {
  const requestRender = vi.fn();
  return { requestRender };
}
// ─── Tests ─────────────────────────────────────────────────────────────────
// clearAllMocks resets call history but keeps mock implementations, so the
// vi.mock factory defaults survive between tests while call-count assertions
// stay isolated.
beforeEach(() => {
  vi.clearAllMocks();
});
afterEach(() => {
  vi.clearAllMocks();
});
describe("SFDashboardOverlay UOK diagnostics", () => {
  it("loadData_calls_writeUokDiagnostics_and_stores_result", async () => {
    const { writeUokDiagnostics } = await import(
      "../uok/diagnostic-synthesis.js"
    );
    const { SFDashboardOverlay } = await import("../dashboard-overlay.js");
    const tui = createMockTui();
    const theme = createMockTheme();
    const overlay = new SFDashboardOverlay(tui, theme, () => {});
    // Prevent interval from firing during test
    clearInterval(overlay.refreshTimer);
    overlay.refreshTimer = null as any;
    await overlay.loadData();
    // basePath comes from the mocked getAutoDashboardData() stub.
    expect(writeUokDiagnostics).toHaveBeenCalledWith("/tmp/sf-test");
    expect(overlay.uokDiagnostics).toEqual(mockDiagnostics.clear);
    overlay.dispose();
  });
  it("loadData_gracefully_handles_writeUokDiagnostics_failure", async () => {
    const { writeUokDiagnostics } = await import(
      "../uok/diagnostic-synthesis.js"
    );
    writeUokDiagnostics.mockImplementation(() => {
      throw new Error("disk full");
    });
    const { SFDashboardOverlay } = await import("../dashboard-overlay.js");
    const tui = createMockTui();
    const theme = createMockTheme();
    const overlay = new SFDashboardOverlay(tui, theme, () => {});
    clearInterval(overlay.refreshTimer);
    overlay.refreshTimer = null as any;
    await overlay.loadData();
    // Diagnostics failures must degrade to "no data", never crash the TUI.
    expect(overlay.uokDiagnostics).toBeNull();
    overlay.dispose();
    // NOTE(review): mockRestore() on a plain vi.fn resets the implementation
    // to undefined rather than to the vi.mock factory default. Harmless here
    // because every later test sets its own mockReturnValue, but mockReset()
    // or re-assigning the implementation would be clearer — confirm intent.
    writeUokDiagnostics.mockRestore();
  });
  it("render_includes_uok_verdict_when_diagnostics_present", async () => {
    const { writeUokDiagnostics } = await import(
      "../uok/diagnostic-synthesis.js"
    );
    (writeUokDiagnostics as any).mockReturnValue(mockDiagnostics.degraded);
    const { SFDashboardOverlay } = await import("../dashboard-overlay.js");
    const tui = createMockTui();
    const theme = createMockTheme();
    const overlay = new SFDashboardOverlay(tui, theme, () => {});
    clearInterval(overlay.refreshTimer);
    overlay.refreshTimer = null as any;
    await overlay.loadData();
    const lines = overlay.buildContentLines(80);
    const text = lines.join("\n");
    expect(text).toContain("UOK");
    expect(text).toContain("degraded");
    expect(text).toContain("needs-repair");
    overlay.dispose();
  });
  it("render_includes_first_issue_code_like_headless_status", async () => {
    const { writeUokDiagnostics } = await import(
      "../uok/diagnostic-synthesis.js"
    );
    (writeUokDiagnostics as any).mockReturnValue(mockDiagnostics.degraded);
    const { SFDashboardOverlay } = await import("../dashboard-overlay.js");
    const tui = createMockTui();
    const theme = createMockTheme();
    const overlay = new SFDashboardOverlay(tui, theme, () => {});
    clearInterval(overlay.refreshTimer);
    overlay.refreshTimer = null as any;
    await overlay.loadData();
    const lines = overlay.buildContentLines(80);
    const text = lines.join("\n");
    // Should contain the first issue code, matching headless status behavior
    expect(text).toContain("stale-lock");
    overlay.dispose();
  });
  it("render_shows_uok_health_section_with_all_issues_when_degraded", async () => {
    const { writeUokDiagnostics } = await import(
      "../uok/diagnostic-synthesis.js"
    );
    (writeUokDiagnostics as any).mockReturnValue(mockDiagnostics.degraded);
    const { SFDashboardOverlay } = await import("../dashboard-overlay.js");
    const tui = createMockTui();
    const theme = createMockTheme();
    const overlay = new SFDashboardOverlay(tui, theme, () => {});
    clearInterval(overlay.refreshTimer);
    overlay.refreshTimer = null as any;
    await overlay.loadData();
    const lines = overlay.buildContentLines(80);
    const text = lines.join("\n");
    // Should show both issue codes in the health section
    expect(text).toContain("stale-lock");
    expect(text).toContain("open-ledger-without-live-lock");
    overlay.dispose();
  });
  it("render_shows_recommendations_when_issues_present", async () => {
    const { writeUokDiagnostics } = await import(
      "../uok/diagnostic-synthesis.js"
    );
    (writeUokDiagnostics as any).mockReturnValue(mockDiagnostics.degraded);
    const { SFDashboardOverlay } = await import("../dashboard-overlay.js");
    const tui = createMockTui();
    const theme = createMockTheme();
    const overlay = new SFDashboardOverlay(tui, theme, () => {});
    clearInterval(overlay.refreshTimer);
    overlay.refreshTimer = null as any;
    await overlay.loadData();
    const lines = overlay.buildContentLines(80);
    const text = lines.join("\n");
    expect(text).toContain("Clear stale auto.lock before dispatch.");
    expect(text).toContain(
      "Mark orphaned UOK runs recovered or restart from lock owner.",
    );
    overlay.dispose();
  });
  it("render_shows_uok_signals_table_when_diagnostics_present", async () => {
    const { writeUokDiagnostics } = await import(
      "../uok/diagnostic-synthesis.js"
    );
    (writeUokDiagnostics as any).mockReturnValue(mockDiagnostics.degraded);
    const { SFDashboardOverlay } = await import("../dashboard-overlay.js");
    const tui = createMockTui();
    const theme = createMockTheme();
    const overlay = new SFDashboardOverlay(tui, theme, () => {});
    clearInterval(overlay.refreshTimer);
    overlay.refreshTimer = null as any;
    await overlay.loadData();
    const lines = overlay.buildContentLines(80);
    const text = lines.join("\n");
    // Signals should be visible
    expect(text).toContain("lock");
    expect(text).toContain("parity");
    expect(text).toContain("ledger");
    overlay.dispose();
  });
  it("render_omits_detailed_uok_section_when_verdict_is_clear", async () => {
    const { writeUokDiagnostics } = await import(
      "../uok/diagnostic-synthesis.js"
    );
    (writeUokDiagnostics as any).mockReturnValue(mockDiagnostics.clear);
    const { SFDashboardOverlay } = await import("../dashboard-overlay.js");
    const tui = createMockTui();
    const theme = createMockTheme();
    const overlay = new SFDashboardOverlay(tui, theme, () => {});
    clearInterval(overlay.refreshTimer);
    overlay.refreshTimer = null as any;
    await overlay.loadData();
    const lines = overlay.buildContentLines(80);
    const text = lines.join("\n");
    // Should show the compact UOK clear line but no issue details
    expect(text).toContain("clear");
    expect(text).not.toContain("stale-lock");
    overlay.dispose();
  });
});

View file

@ -0,0 +1,140 @@
import { describe, expect, test } from "vitest";
import {
BASE_REQUIREMENTS,
MODEL_CAPABILITY_PROFILES,
scoreEligibleModels,
scoreModel,
} from "../model-router.js";
// Routing-policy regression suite: the agentic capability axis must dominate
// raw coding score when selecting models for execute-task dispatch.
describe("agentic capability axis (ADR-0079)", () => {
  test("execute-task base requirements weight the agentic dimension", () => {
    // If this assertion fails because the weight changed: re-read ADR-0079
    // before adjusting. The whole point of the axis is to outweigh raw
    // coding score for execute-task routing.
    expect(BASE_REQUIREMENTS["execute-task"].agentic).toBeGreaterThanOrEqual(
      0.7,
    );
  });
  test("known agentic-capable models score higher than coding-completion models on execute-task", () => {
    const codestralScore = scoreModel(
      MODEL_CAPABILITY_PROFILES["codestral-latest"],
      BASE_REQUIREMENTS["execute-task"],
    );
    const kimiScore = scoreModel(
      MODEL_CAPABILITY_PROFILES["kimi-k2.6"],
      BASE_REQUIREMENTS["execute-task"],
    );
    const sonnetScore = scoreModel(
      MODEL_CAPABILITY_PROFILES["claude-sonnet-4-6"],
      BASE_REQUIREMENTS["execute-task"],
    );
    // Codestral has high coding (85) but agentic=25 — must not beat agentic models.
    expect(kimiScore).toBeGreaterThan(codestralScore);
    expect(sonnetScore).toBeGreaterThan(codestralScore);
  });
  test("devstral variants score below agentic models on execute-task", () => {
    const devstralScore = scoreModel(
      MODEL_CAPABILITY_PROFILES["devstral-2512"],
      BASE_REQUIREMENTS["execute-task"],
    );
    const kimiScore = scoreModel(
      MODEL_CAPABILITY_PROFILES["kimi-k2.6"],
      BASE_REQUIREMENTS["execute-task"],
    );
    expect(kimiScore).toBeGreaterThan(devstralScore);
  });
  // End-to-end ranking check: the full scorer (not just pairwise scoreModel)
  // must surface an agentic model at the top of the eligible list.
  test("scoreEligibleModels ranks agentic models above coding-only models for execute-task", () => {
    const eligible = [
      "mistral/codestral-latest",
      "mistral/devstral-2512",
      "moonshotai/kimi-k2.6",
      "anthropic/claude-sonnet-4-6",
    ];
    const ranked = scoreEligibleModels(
      eligible,
      BASE_REQUIREMENTS["execute-task"],
    );
    const top = ranked[0]?.modelId;
    // Either of the two pinned-agentic models must win.
    expect(["moonshotai/kimi-k2.6", "anthropic/claude-sonnet-4-6"]).toContain(
      top,
    );
    // And Codestral specifically must not win.
    expect(top).not.toBe("mistral/codestral-latest");
  });
  test("agentic axis preserves research-* unit-type behavior (no agentic weight there)", () => {
    // Research isn't agentic — those unit types should not gain an agentic
    // dimension. This protects long-context research-tuned models from
    // being penalized.
    expect(BASE_REQUIREMENTS["research-milestone"].agentic).toBeUndefined();
    expect(BASE_REQUIREMENTS["research-slice"].agentic).toBeUndefined();
  });
  test("known coding-only models all have agentic <= 50", () => {
    const codingOnly = [
      "codestral-latest",
      "devstral-2512",
      "devstral-medium-latest",
      "devstral-medium-2507",
      "devstral-small-2505",
      "devstral-small-2507",
      "labs-devstral-small-2512",
      "qwen3-coder:480b",
      "qwen3-coder-next",
    ];
    for (const id of codingOnly) {
      const profile = MODEL_CAPABILITY_PROFILES[id];
      expect(profile, `${id} should be in MODEL_CAPABILITY_PROFILES`).toBeDefined();
      expect(profile.agentic, `${id} should have agentic <= 50`).toBeLessThanOrEqual(
        50,
      );
    }
  });
  test("older MiniMax generations score lower than current on agentic", () => {
    // 2026-05-13 incident: minimax/M2.1 stuck in 60+ checkpoint loop on
    // infra repo. Root cause was the router aliasing all minimax-m2.x
    // variants to MiniMax-M2.7's profile, so older models inherited
    // current-gen capability scores and won cost tie-breaks on
    // execute-task. Per-generation profiles + agentic axis fix the
    // underlying routing decision.
    const m21 = MODEL_CAPABILITY_PROFILES["MiniMax-M2.1"];
    const m25 = MODEL_CAPABILITY_PROFILES["MiniMax-M2.5"];
    const m27 = MODEL_CAPABILITY_PROFILES["MiniMax-M2.7"];
    expect(m21, "M2.1 should have its own profile").toBeDefined();
    expect(m25, "M2.5 should have its own profile").toBeDefined();
    expect(m27.agentic).toBeGreaterThan(m25.agentic);
    expect(m25.agentic).toBeGreaterThan(m21.agentic);
    // And on execute-task, the current generation must beat the older one.
    const oldScore = scoreModel(m21, BASE_REQUIREMENTS["execute-task"]);
    const newScore = scoreModel(m27, BASE_REQUIREMENTS["execute-task"]);
    expect(newScore).toBeGreaterThan(oldScore);
  });
  test("known agentic-frontier models all have agentic >= 85", () => {
    const agenticFrontier = [
      "claude-opus-4-6",
      "claude-sonnet-4-6",
      "claude-sonnet-4-5-20250514",
      "kimi-k2.6",
      "kimi-k2-thinking",
      "gpt-5",
      "gpt-5.4",
      "gpt-5.5",
      "gemini-3-pro-preview",
      "gemini-3.1-pro-preview",
    ];
    for (const id of agenticFrontier) {
      const profile = MODEL_CAPABILITY_PROFILES[id];
      expect(profile, `${id} should be in MODEL_CAPABILITY_PROFILES`).toBeDefined();
      expect(
        profile.agentic,
        `${id} should have agentic >= 85`,
      ).toBeGreaterThanOrEqual(85);
    }
  });
});

View file

@ -134,61 +134,3 @@ test("reorderAndSplitForCaching_preamble_goes_into_before", () => {
"dynamic section in after",
);
});
// Guard against false-positive hoisting: a "## Requirements Advanced" heading
// inside an inlined slice summary is content, not the top-level requirements
// block, and must stay after Mission/Context.
test("reorderForCaching_when_inlined_slice_summary_has_requirements_advanced_keeps_it_after_mission", () => {
  const prompt = [
    "# Milestone Validation",
    "",
    "## Working Directory",
    "/repo",
    "",
    "## Mission",
    "Dispatch reviewers.",
    "",
    "## Context",
    "Inlined below.",
    "",
    "## Inlined Context",
    "### S01 Summary",
    "# S01",
    "",
    "## Requirements Advanced",
    "- R1",
    "",
    "## Requirements Validated",
    "None.",
  ].join("\n");
  const reordered = reorderForCaching(prompt);
  assert.ok(
    reordered.indexOf("## Mission") <
      reordered.indexOf("## Requirements Advanced"),
  );
  assert.ok(
    reordered.indexOf("## Context") <
      reordered.indexOf("## Requirements Advanced"),
  );
});
// The exact top-level "## Requirements" block IS hoisted ahead of Mission so
// the stable prefix maximizes prompt-cache hits.
test("reorderForCaching_when_top_level_requirements_exists_still_hoists_exact_requirements_block", () => {
  const prompt = [
    "# Execute",
    "",
    "## Mission",
    "Do work.",
    "",
    "## Requirements",
    "- R1",
    "",
    "## Verification",
    "Run tests.",
  ].join("\n");
  const reordered = reorderForCaching(prompt);
  assert.ok(
    reordered.indexOf("## Requirements") < reordered.indexOf("## Mission"),
  );
});

View file

@ -0,0 +1,30 @@
import assert from "node:assert/strict";
import { test } from "vitest";
import { buildUnitPromptMessageContent } from "../auto/run-unit.js";
// The cacheable "before" part must end with exactly one newline so that
// concatenating the parts reproduces the flat prompt byte-for-byte.
test("buildUnitPromptMessageContent_when_prompt_parts_present_preserves_join_boundary", () => {
  const content = buildUnitPromptMessageContent("flat", {
    before: "## Working Directory\n/repo",
    after: "## Inlined Task Plan\nDo it.",
  });
  assert.ok(Array.isArray(content));
  // First part carries the ephemeral cache_control marker.
  assert.deepEqual(content[0], {
    type: "text",
    text: "## Working Directory\n/repo\n",
    cache_control: { type: "ephemeral" },
  });
  assert.deepEqual(content[1], {
    type: "text",
    text: "## Inlined Task Plan\nDo it.",
  });
  assert.equal(
    content.map((part) => part.text).join(""),
    "## Working Directory\n/repo\n## Inlined Task Plan\nDo it.",
  );
});
// Without prompt parts the function is a passthrough for the flat string.
test("buildUnitPromptMessageContent_when_no_prompt_parts_returns_flat_prompt", () => {
  assert.equal(buildUnitPromptMessageContent("flat", null), "flat");
});

View file

@ -0,0 +1,134 @@
import { spawn } from "node:child_process";
import { describe, expect, test } from "vitest";
import {
isSessionPidAlive,
terminateExistingSession,
} from "../session-lock.js";
/**
 * Spawn a cooperative `sleep` child. `sleep` is a deliberate target: it
 * exits on SIGTERM, which lets tests exercise the graceful-termination
 * path. The SIGKILL escalation test uses spawnIgnoreSigterm instead,
 * because `sh -c "trap '' TERM; sleep N"` is unreliable for that purpose.
 */
function spawnSleeper(seconds = 30) {
  return spawn("/bin/sh", ["-c", `sleep ${seconds}`], {
    stdio: "ignore",
    detached: false,
  });
}
/**
 * Spawn a Node child whose SIGTERM handler is an explicit no-op. Unlike
 * `sh -c "trap '' TERM; sleep N"` (where the shell tail-call-exec's sleep
 * so SIGTERM hits sleep directly), this child IS the long-lived process
 * and reliably ignores SIGTERM until the SIGKILL escalation — which is
 * exactly what the escalation test asserts.
 */
function spawnIgnoreSigterm(seconds = 30) {
  const script = `process.on('SIGTERM', () => {}); setTimeout(() => process.exit(0), ${seconds * 1000});`;
  return spawn(process.execPath, ["-e", script], {
    stdio: "ignore",
    detached: false,
  });
}
describe("terminateExistingSession", () => {
  test("returns alreadyDead=true when pid is invalid", async () => {
    const result = await terminateExistingSession(0);
    expect(result.terminated).toBe(false);
    expect(result.alreadyDead).toBe(true);
  });
  test("refuses to terminate the current process", async () => {
    const result = await terminateExistingSession(process.pid);
    expect(result.terminated).toBe(false);
  });
  test("returns alreadyDead=true for a dead pid", async () => {
    // PID 1 is alive but not ours; use a value that's almost certainly
    // not assigned. 2147483646 (2^31 - 2) is well above any plausible PID.
    const result = await terminateExistingSession(2147483646);
    expect(result.alreadyDead).toBe(true);
    expect(result.terminated).toBe(true);
  });
  test("gracefully terminates a process that respects SIGTERM", async () => {
    const child = spawnSleeper(60);
    try {
      expect(isSessionPidAlive(child.pid)).toBe(true);
      const result = await terminateExistingSession(child.pid, {
        gracePeriodMs: 3000,
        reapWaitMs: 1000,
        pollIntervalMs: 50,
      });
      // sleep exits on SIGTERM, so no SIGKILL escalation should occur.
      expect(result.terminated).toBe(true);
      expect(result.escalated).toBe(false);
      expect(isSessionPidAlive(child.pid)).toBe(false);
    } finally {
      try {
        child.kill("SIGKILL");
      } catch {
        /* may already be dead */
      }
    }
  });
  test("escalates to SIGKILL when the process ignores SIGTERM", async () => {
    const child = spawnIgnoreSigterm(60);
    // Give the child a moment to register its SIGTERM handler before we
    // send SIGTERM. Without this, the kill may arrive before
    // process.on('SIGTERM', …) executes and Node uses the default handler
    // (exit on signal), which makes the test look like graceful exit.
    await new Promise((resolve) => setTimeout(resolve, 250));
    try {
      expect(isSessionPidAlive(child.pid)).toBe(true);
      const result = await terminateExistingSession(child.pid, {
        gracePeriodMs: 750,
        reapWaitMs: 2000,
        pollIntervalMs: 50,
      });
      expect(result.terminated).toBe(true);
      expect(result.escalated).toBe(true);
      expect(isSessionPidAlive(child.pid)).toBe(false);
    } finally {
      try {
        child.kill("SIGKILL");
      } catch {
        /* may already be dead */
      }
    }
  });
});
describe("isSessionPidAlive", () => {
  test("returns false for current process (self-check is intentionally disabled)", () => {
    // isPidAlive specifically excludes the current PID to prevent
    // false-positive self-detection in the lock takeover flow.
    expect(isSessionPidAlive(process.pid)).toBe(false);
  });
  test("returns false for clearly-dead pid", () => {
    expect(isSessionPidAlive(2147483646)).toBe(false);
  });
  test("returns true for a live child", async () => {
    const child = spawnSleeper(30);
    try {
      expect(isSessionPidAlive(child.pid)).toBe(true);
    } finally {
      try {
        child.kill("SIGKILL");
      } catch {
        /* may already be dead */
      }
    }
  });
  // Defensive-input contract: anything that is not a positive integer PID
  // is reported dead rather than throwing.
  test("returns false for non-integer or non-positive inputs", () => {
    expect(isSessionPidAlive(0)).toBe(false);
    expect(isSessionPidAlive(-1)).toBe(false);
    expect(isSessionPidAlive("nope")).toBe(false);
    expect(isSessionPidAlive(null)).toBe(false);
  });
});

View file

@ -0,0 +1,136 @@
import { mkdtempSync, rmSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, describe, expect, test } from "vitest";
import {
_readCacheForTests,
clearSliceRoutingForUnit,
extractSliceScope,
readStickyModelForUnit,
recordSliceRouting,
} from "../slice-routing-cache.js";
// Temp project dirs created during a test; reaped by the afterEach below.
let tempDirs = [];
// Create an isolated throwaway project root for one cache round-trip.
function makeProject() {
  const dir = mkdtempSync(join(tmpdir(), "sf-slice-routing-"));
  tempDirs.push(dir);
  return dir;
}
afterEach(() => {
  for (const dir of tempDirs) rmSync(dir, { recursive: true, force: true });
  tempDirs = [];
});
// Scope extraction: unit ids collapse to their owning slice (or milestone)
// so all units in a slice share one sticky-routing cache key.
describe("extractSliceScope", () => {
  test("execute-task style unit id collapses to milestone/slice", () => {
    expect(extractSliceScope("M001-6377a4/S04/T02")).toBe("M001-6377a4/S04");
  });
  test("plan/complete slice ids stay as milestone/slice", () => {
    expect(extractSliceScope("M001-6377a4/S04")).toBe("M001-6377a4/S04");
  });
  test("milestone-only ids return the milestone", () => {
    expect(extractSliceScope("M001-6377a4")).toBe("M001-6377a4");
  });
  test("null/undefined/empty return null", () => {
    expect(extractSliceScope(null)).toBeNull();
    expect(extractSliceScope("")).toBeNull();
    expect(extractSliceScope(undefined)).toBeNull();
  });
});
describe("slice routing cache", () => {
  test("record + read round-trips", () => {
    const project = makeProject();
    recordSliceRouting(project, "execute-task", "M001/S04/T01", {
      provider: "moonshotai",
      id: "kimi-k2.6",
    });
    // A different task in the SAME slice sees the sticky hint.
    const sticky = readStickyModelForUnit(
      project,
      "execute-task",
      "M001/S04/T02",
    );
    expect(sticky).toEqual({ provider: "moonshotai", id: "kimi-k2.6" });
  });
  test("sticky scoped per slice — different slice => no hit", () => {
    const project = makeProject();
    recordSliceRouting(project, "execute-task", "M001/S04/T01", {
      provider: "moonshotai",
      id: "kimi-k2.6",
    });
    expect(
      readStickyModelForUnit(project, "execute-task", "M001/S05/T01"),
    ).toBeNull();
  });
  test("clearSliceRoutingForUnit evicts only the matching slice", () => {
    const project = makeProject();
    recordSliceRouting(project, "execute-task", "M001/S04/T01", {
      provider: "moonshotai",
      id: "kimi-k2.6",
    });
    recordSliceRouting(project, "execute-task", "M001/S05/T01", {
      provider: "anthropic",
      id: "claude-sonnet-4-6",
    });
    clearSliceRoutingForUnit(project, "M001/S04/T07");
    // S04 entry gone; S05 entry untouched.
    expect(
      readStickyModelForUnit(project, "execute-task", "M001/S04/T99"),
    ).toBeNull();
    expect(
      readStickyModelForUnit(project, "execute-task", "M001/S05/T02"),
    ).toEqual({ provider: "anthropic", id: "claude-sonnet-4-6" });
  });
  test("readStickyModelForUnit honors maxAgeMs", async () => {
    const project = makeProject();
    recordSliceRouting(project, "execute-task", "M001/S04/T01", {
      provider: "moonshotai",
      id: "kimi-k2.6",
    });
    // Sleep past the retention window so age strictly exceeds maxAgeMs.
    await new Promise((resolve) => setTimeout(resolve, 25));
    expect(
      readStickyModelForUnit(project, "execute-task", "M001/S04/T02", {
        maxAgeMs: 10,
      }),
    ).toBeNull();
  });
  test("returns null on missing basePath or unparseable unit id", () => {
    expect(readStickyModelForUnit("", "execute-task", "M001/S04/T01")).toBeNull();
    const project = makeProject();
    expect(readStickyModelForUnit(project, "execute-task", "")).toBeNull();
    expect(readStickyModelForUnit(project, "execute-task", null)).toBeNull();
  });
  test("overwrite updates the slice entry in place", () => {
    const project = makeProject();
    recordSliceRouting(project, "execute-task", "M001/S04/T01", {
      provider: "moonshotai",
      id: "kimi-k2.6",
    });
    recordSliceRouting(project, "execute-task", "M001/S04/T02", {
      provider: "anthropic",
      id: "claude-opus-4-7",
    });
    // Still one entry for the slice — the second record replaced the first.
    const cache = _readCacheForTests(project);
    const entries = Object.values(cache);
    expect(entries.length).toBe(1);
    expect(
      readStickyModelForUnit(project, "execute-task", "M001/S04/T03"),
    ).toEqual({ provider: "anthropic", id: "claude-opus-4-7" });
  });
  test("clearSliceRoutingForUnit on the last entry removes the cache file", () => {
    const project = makeProject();
    recordSliceRouting(project, "execute-task", "M001/S04/T01", {
      provider: "moonshotai",
      id: "kimi-k2.6",
    });
    clearSliceRoutingForUnit(project, "M001/S04/T01");
    const cache = _readCacheForTests(project);
    expect(Object.keys(cache).length).toBe(0);
  });
});

View file

@ -0,0 +1,134 @@
import { describe, expect, test } from "vitest";
import {
SOLVER_MODEL_DEFAULT,
SOLVER_MODEL_FALLBACKS,
isSolverModel,
resolveSolverModel,
resolveSolverModelCandidates,
} from "../solver-model.js";
describe("solver-model invariants", () => {
  test("default is locked to kimi-k2.6 / kimi-coding", () => {
    // This is a PROTOCOL INVARIANT, not a tuning parameter. Changing the
    // default requires an ADR (see ADR-0079). If this test fails because
    // someone bumped the default, that's a load-bearing change and a code
    // review reject — re-read the ADR before re-running.
    expect(SOLVER_MODEL_DEFAULT).toEqual({
      provider: "kimi-coding",
      id: "kimi-k2.6",
    });
  });
  test("no fallback is a code-completion-only model", () => {
    // Code-completion models (Codestral, Devstral, the kimi-for-coding
    // alias) are the ones that broke the loop in the first place. They
    // must NEVER appear in the solver fallback chain.
    const forbidden = new Set([
      "codestral-latest",
      "devstral-latest",
      "kimi-for-coding",
    ]);
    for (const candidate of SOLVER_MODEL_FALLBACKS) {
      expect(forbidden.has(candidate.id)).toBe(false);
    }
  });
});
describe("resolveSolverModel", () => {
  test("with no preferences returns the pinned default", () => {
    expect(resolveSolverModel()).toEqual(SOLVER_MODEL_DEFAULT);
    expect(resolveSolverModel(undefined)).toEqual(SOLVER_MODEL_DEFAULT);
    expect(resolveSolverModel({})).toEqual(SOLVER_MODEL_DEFAULT);
  });
  test("ignores router/benchmark/learning state (no opt-in == default)", () => {
    // Even with the kitchen sink of unrelated preference fields,
    // resolveSolverModel must NOT consult any of them. Only an explicit
    // preferences.autonomousSolver.model entry can override.
    const preferences = {
      currentModel: { provider: "mistral", id: "codestral-latest" },
      modelRouter: { lastSelection: "google-gemini-cli/gemini-3-flash-preview" },
      benchmarkSelector: { winner: "kimi-for-coding" },
      learning: { blender: { recommended: "kimi-k2.5" } },
    };
    expect(resolveSolverModel(preferences)).toEqual(SOLVER_MODEL_DEFAULT);
  });
  test("respects an explicit object override", () => {
    const resolved = resolveSolverModel({
      autonomousSolver: { model: { provider: "anthropic", id: "claude-opus-4-7" } },
    });
    expect(resolved).toEqual({ provider: "anthropic", id: "claude-opus-4-7" });
  });
  // String overrides accept both "provider/model" and bare-model forms.
  test("accepts a string override in provider/model form", () => {
    const resolved = resolveSolverModel({
      autonomousSolver: { model: "anthropic/claude-sonnet-4-6" },
    });
    expect(resolved).toEqual({
      provider: "anthropic",
      id: "claude-sonnet-4-6",
    });
  });
  test("accepts a bare model id and keeps the default provider", () => {
    const resolved = resolveSolverModel({
      autonomousSolver: { model: "kimi-k2-thinking" },
    });
    expect(resolved).toEqual({
      provider: SOLVER_MODEL_DEFAULT.provider,
      id: "kimi-k2-thinking",
    });
  });
  // Blank strings are treated as "no override", not as a model id.
  test("ignores an empty-string override", () => {
    expect(
      resolveSolverModel({ autonomousSolver: { model: "" } }),
    ).toEqual(SOLVER_MODEL_DEFAULT);
    expect(
      resolveSolverModel({ autonomousSolver: { model: " " } }),
    ).toEqual(SOLVER_MODEL_DEFAULT);
  });
});
describe("resolveSolverModelCandidates", () => {
  test("primary comes first, then fallback chain (de-duplicated)", () => {
    const candidates = resolveSolverModelCandidates();
    expect(candidates[0]).toEqual(SOLVER_MODEL_DEFAULT);
    expect(candidates.length).toBe(1 + SOLVER_MODEL_FALLBACKS.length);
  });
  test("override does not duplicate when also in fallback list", () => {
    const candidates = resolveSolverModelCandidates({
      autonomousSolver: { model: "anthropic/claude-opus-4-7" },
    });
    // The override is promoted to primary, not listed twice.
    const opusEntries = candidates.filter(
      (c) => c.id === "claude-opus-4-7" && c.provider === "anthropic",
    );
    expect(opusEntries.length).toBe(1);
  });
});
describe("isSolverModel", () => {
  test("returns true for the pinned default", () => {
    expect(isSolverModel(SOLVER_MODEL_DEFAULT)).toBe(true);
  });
  test("returns false for a routed executor model", () => {
    expect(
      isSolverModel({ provider: "mistral", id: "codestral-latest" }),
    ).toBe(false);
    expect(
      isSolverModel({
        provider: "google-gemini-cli",
        id: "gemini-3-flash-preview",
      }),
    ).toBe(false);
  });
  // Malformed input is reported as "not the solver", never thrown on.
  test("returns false for null / malformed inputs", () => {
    expect(isSolverModel(null)).toBe(false);
    expect(isSolverModel(undefined)).toBe(false);
    expect(isSolverModel({})).toBe(false);
  });
});

View file

@ -0,0 +1,115 @@
import {
existsSync,
mkdirSync,
mkdtempSync,
rmSync,
symlinkSync,
utimesSync,
writeFileSync,
} from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, describe, expect, test } from "vitest";
import { pruneStaleTraces } from "../uok/trace-writer.js";
// Temp project dirs created during a test; reaped by the afterEach below.
let tempDirs = [];
// Create an isolated throwaway project root with an empty .sf/ directory.
function makeProject() {
  const dir = mkdtempSync(join(tmpdir(), "sf-trace-janitor-"));
  tempDirs.push(dir);
  mkdirSync(join(dir, ".sf"), { recursive: true });
  return dir;
}
afterEach(() => {
  for (const dir of tempDirs) {
    rmSync(dir, { recursive: true, force: true });
  }
  tempDirs = [];
});
function makeTraceFile(project, name, daysOld) {
const tracesDir = join(project, ".sf", "traces");
mkdirSync(tracesDir, { recursive: true });
const path = join(tracesDir, name);
writeFileSync(path, '{"ts":"2024-01-01T00:00:00Z","type":"gate_run"}\n');
if (typeof daysOld === "number") {
const epoch = (Date.now() - daysOld * 24 * 60 * 60 * 1000) / 1000;
utimesSync(path, epoch, epoch);
}
return path;
}
describe("pruneStaleTraces", () => {
test("removes jsonl files older than retention window", () => {
const project = makeProject();
const oldFile = makeTraceFile(
project,
"pre-dispatch:old.jsonl",
45,
);
const freshFile = makeTraceFile(
project,
"pre-dispatch:fresh.jsonl",
5,
);
expect(existsSync(oldFile)).toBe(true);
expect(existsSync(freshFile)).toBe(true);
const result = pruneStaleTraces(project);
expect(result.pruned).toBe(1);
expect(existsSync(oldFile)).toBe(false);
expect(existsSync(freshFile)).toBe(true);
});
test("respects a custom retention window", () => {
const project = makeProject();
const file = makeTraceFile(project, "pre-dispatch:tenday.jsonl", 10);
const result = pruneStaleTraces(project, { retentionDays: 7 });
expect(result.pruned).toBe(1);
expect(existsSync(file)).toBe(false);
});
test("never touches the `latest` symlink", () => {
const project = makeProject();
const file = makeTraceFile(project, "pre-dispatch:current.jsonl", 0);
const latest = join(project, ".sf", "traces", "latest");
symlinkSync("pre-dispatch:current.jsonl", latest);
// Make `latest` look old via its target; the symlink itself is fine.
pruneStaleTraces(project);
expect(existsSync(latest)).toBe(true);
});
test("ignores non-jsonl files", () => {
const project = makeProject();
const tracesDir = join(project, ".sf", "traces");
mkdirSync(tracesDir, { recursive: true });
const txt = join(tracesDir, "notes.txt");
writeFileSync(txt, "ignored");
const epoch = (Date.now() - 90 * 24 * 60 * 60 * 1000) / 1000;
utimesSync(txt, epoch, epoch);
pruneStaleTraces(project);
expect(existsSync(txt)).toBe(true);
});
test("returns zero-counts when traces dir does not exist", () => {
const project = makeProject();
// no traces dir
const result = pruneStaleTraces(project);
expect(result).toEqual({ scanned: 0, pruned: 0, errors: 0 });
});
test("respects maxDeletePerCall safety cap", () => {
const project = makeProject();
for (let i = 0; i < 5; i++) {
makeTraceFile(project, `pre-dispatch:old-${i}.jsonl`, 60);
}
const result = pruneStaleTraces(project, { maxDeletePerCall: 2 });
expect(result.pruned).toBe(2);
});
test("does not throw on missing basePath", () => {
expect(() => pruneStaleTraces("")).not.toThrow();
expect(() => pruneStaleTraces(undefined)).not.toThrow();
});
});

View file

@ -328,13 +328,12 @@ export default function sfTui(pi) {
renderResult: ({ output }) => output, renderResult: ({ output }) => output,
}); });
// ASK_USER_ELICITATION — structured form-based ask_user replacement. // ask_user_elicitation — structured form-based ask_user replacement.
// When the flag is on and the agent calls this tool with choices, a TUI // Shows a TUI select overlay when choices are provided, freeform input otherwise.
// select overlay is shown instead of a plain text prompt.
pi.registerTool({ pi.registerTool({
name: "ask_user_elicitation", name: "ask_user_elicitation",
description: description:
"Ask the user a question using a structured form with optional choices. When ASK_USER_ELICITATION is enabled this is preferred over plain ask_user for questions with known choices.", "Ask the user a question using a structured form with optional choices. Shows a TUI select overlay when choices are provided, or a freeform text prompt otherwise.",
parameters: { parameters: {
type: "object", type: "object",
properties: { properties: {
@ -359,12 +358,6 @@ export default function sfTui(pi) {
if (!ctx?.hasUI) { if (!ctx?.hasUI) {
return { output: "No UI available for elicitation." }; return { output: "No UI available for elicitation." };
} }
if (!getExperimentalFlag("ask_elicitation")) {
return {
output:
"ASK_USER_ELICITATION is not enabled. Run /experimental on ask_elicitation to enable.",
};
}
if (choices?.length) { if (choices?.length) {
const answer = await ctx.ui.select(question, choices); const answer = await ctx.ui.select(question, choices);
if (!answer && allow_freeform) { if (!answer && allow_freeform) {
@ -379,121 +372,6 @@ export default function sfTui(pi) {
renderResult: ({ output }) => (output ? `**Answer:** ${output}` : ""), renderResult: ({ output }) => (output ? `**Answer:** ${output}` : ""),
}); });
// MULTI_TURN_AGENTS — persistent named sub-agent sessions via file-backed state.
// Tool that spawns or resumes a named SF child process, relaying messages.
pi.registerTool({
name: "spawn_agent",
description:
"Spawn or resume a named persistent sub-agent. Sends a message and waits for the response. The agent persists across calls using file-backed state in .sf/agents/<name>/.",
parameters: {
type: "object",
properties: {
name: {
type: "string",
description:
"Unique agent name (alphanumeric + hyphens, e.g. 'researcher')",
},
message: {
type: "string",
description: "Message to send to the agent",
},
reset: {
type: "boolean",
description:
"If true, clear the agent's state and start fresh (default: false)",
},
},
required: ["name", "message"],
},
execute: async ({ name, message, reset }) => {
if (!getExperimentalFlag("multi_turn_agents")) {
return {
output:
"MULTI_TURN_AGENTS is not enabled. Run /experimental on multi_turn_agents to enable.",
};
}
if (!/^[a-z0-9-]{1,32}$/i.test(name)) {
return {
output: "Agent name must be 1-32 alphanumeric/hyphen characters.",
};
}
const { join: pathJoin } = await import("node:path");
const { mkdirSync, writeFileSync, readFileSync, existsSync } =
await import("node:fs");
const stateDir = pathJoin(
projectRoot() ?? process.cwd(),
".sf",
"agents",
name,
);
mkdirSync(stateDir, { recursive: true });
const historyPath = pathJoin(stateDir, "history.jsonl");
if (reset && existsSync(historyPath)) {
writeFileSync(historyPath, "", "utf-8");
}
// Append user message to history
const entry = JSON.stringify({
role: "user",
content: message,
ts: Date.now(),
});
const { appendFileSync } = await import("node:fs");
appendFileSync(historyPath, `${entry}\n`, "utf-8");
// Dispatch to SF headless with the conversation history as context
const historyLines = existsSync(historyPath)
? readFileSync(historyPath, "utf-8")
.trim()
.split("\n")
.filter(Boolean)
.map((l) => {
try {
return JSON.parse(l);
} catch {
return null;
}
})
.filter(Boolean)
: [];
const contextMsg = historyLines
.slice(-10) // last 10 turns for context
.map((e) => `${e.role === "user" ? "User" : "Agent"}: ${e.content}`)
.join("\n");
const fullPrompt = `[Agent: ${name}]\n\nConversation history:\n${contextMsg}\n\nRespond to the last user message only.`;
const { execFile } = await import("node:child_process");
const { promisify } = await import("node:util");
const execFileAsync = promisify(execFile);
try {
const { stdout } = await execFileAsync(
process.execPath,
[
"-y",
"node@24",
process.env.SF_LOADER ?? "dist/loader.js",
"headless",
"--print",
fullPrompt,
],
{
timeout: 60000,
encoding: "utf-8",
env: { ...process.env },
},
);
const response = stdout.trim();
appendFileSync(
historyPath,
`${JSON.stringify({ role: "assistant", content: response, ts: Date.now() })}\n`,
"utf-8",
);
return { output: response };
} catch (err) {
return {
output: `Agent dispatch failed: ${getErrorMessage(err)}`,
};
}
},
renderResult: ({ output }) => output,
});
} }
/** Run the STATUS_LINE user script on a 5s interval, posting stdout to footer. */ /** Run the STATUS_LINE user script on a 5s interval, posting stdout to footer. */

View file

@ -7,6 +7,31 @@
* *
* Consumer: AgentSwarm orchestrator, swarm role agents (CoordinatorAgent, WorkerAgent etc), * Consumer: AgentSwarm orchestrator, swarm role agents (CoordinatorAgent, WorkerAgent etc),
* and direct use in multi-agent dispatch flows. * and direct use in multi-agent dispatch flows.
*
* ## Current state
* This module implements the **container** half of a persistent agent: identity, inbox,
* memory blocks, and message routing. It does NOT implement the **runner** half.
*
* The missing piece is an LLM execution runner that:
* 1. Reads pending messages from this agent's inbox (`receive(true)`)
* 2. Assembles a prompt from core memory blocks + inbox messages
* 3. Dispatches to SF headless (`node dist/loader.js headless --print <prompt>`)
* 4. Writes the LLM response back into the bus as a reply
* 5. Updates memory blocks (eviction, summarization) when context grows large
*
* Until the runner exists, `PersistentAgent` is a passive store. The autonomous loop
* uses it this way for sleeptime memory consolidation (caller sends + immediately reads
 * inbox). `SwarmDispatchLayer` also only enqueues messages — nothing processes them.
*
* When building the runner, key design decisions to make:
* - Context window management: how many inbox turns to include before summarizing
* - Memory eviction: which core blocks are injected, which are summarized to archival
* - Turn limits: max rounds before the runner yields and re-queues
* - Concurrency: one runner per agent name (enforce via DB lock or process mutex)
* - Error handling: failed LLM calls should leave the message as unread, not drop it
*
* See: Codex `codex-rs/core/src/agent/control.rs` for the reference implementation of
* typed parallel subagents (explorer/worker roles) with forked rollout history.
*/ */
import { randomUUID } from "node:crypto"; import { randomUUID } from "node:crypto";

View file

@ -8,6 +8,18 @@
* *
* Consumer: UOK kernel dispatch path, parallel orchestrators, and /sf autonomous controller * Consumer: UOK kernel dispatch path, parallel orchestrators, and /sf autonomous controller
* when SF_A2A_ENABLED is set. * when SF_A2A_ENABLED is set.
*
 * ## Current state — enqueue only, no runner
* `_busDispatch` routes an envelope to a role agent's inbox via the MessageBus. It does NOT
 * wait for a response — the `DispatchResult` contains only `messageId` and `targetAgent`,
* not LLM output. Nothing currently drains agent inboxes and runs LLM calls.
*
* This layer is ready to use once `PersistentAgent` gains a runner (see persistent-agent.js
* module comment for the runner design). At that point `dispatch()` can be extended to
* optionally block until the runner posts a reply to the bus.
*
* Callers outside uok/: none currently. The autonomous loop uses AgentSwarm directly for
* the sleeptime memory path. Wire this in when building the autonomous orchestrator.
*/ */
import { AgentSwarm } from "./agent-swarm.js"; import { AgentSwarm } from "./agent-swarm.js";

View file

@ -4,6 +4,7 @@ import {
appendFileSync, appendFileSync,
closeSync, closeSync,
existsSync, existsSync,
lstatSync,
mkdirSync, mkdirSync,
openSync, openSync,
readdirSync, readdirSync,
@ -15,6 +16,12 @@ import {
import { join } from "node:path"; import { join } from "node:path";
import { sfRoot } from "../paths.js"; import { sfRoot } from "../paths.js";
// Longest read window currently used by any trace consumer
// (sf-db-gates.js:391 reads 30 days). Anything older than this is never
// read and just consumes disk.
const TRACE_RETENTION_DAYS_DEFAULT = 30;
const MS_PER_DAY = 24 * 60 * 60 * 1000;
function tracesDir(basePath) { function tracesDir(basePath) {
return join(sfRoot(basePath), "traces"); return join(sfRoot(basePath), "traces");
} }
@ -45,6 +52,64 @@ export function appendTraceEvent(basePath, traceId, event) {
} }
} }
/**
* Prune .sf/traces/*.jsonl files older than retentionDays.
*
* Why: per-flow trace files accumulate one-per-dispatch and are never
* cleaned. The longest analyzer window today is 30 days
* (sf-db-gates.js:391); anything older is never read and just consumes
* disk. The `latest` symlink is preserved unconditionally so the
* tail-friendly pointer keeps working.
*
* Consumer: session-start hook (idempotent, fast, best-effort).
*
* @param {string} basePath
* @param {object} [opts]
* @param {number} [opts.retentionDays=30]
* @param {number} [opts.maxDeletePerCall=1000] - safety cap so a runaway
* directory doesn't make startup slow.
* @returns {{ scanned: number, pruned: number, errors: number }}
*/
export function pruneStaleTraces(basePath, opts = {}) {
const retentionDays = Number(opts.retentionDays ?? TRACE_RETENTION_DAYS_DEFAULT);
const maxDeletePerCall = Math.max(1, Number(opts.maxDeletePerCall ?? 1000));
const result = { scanned: 0, pruned: 0, errors: 0 };
if (!basePath || typeof basePath !== "string") return result;
let dir;
try {
dir = tracesDir(basePath);
} catch {
return result;
}
if (!existsSync(dir)) return result;
const cutoff = Date.now() - retentionDays * MS_PER_DAY;
let entries;
try {
entries = readdirSync(dir);
} catch {
return result;
}
for (const name of entries) {
if (result.pruned >= maxDeletePerCall) break;
if (name === "latest") continue;
if (!name.endsWith(".jsonl")) continue;
const path = join(dir, name);
result.scanned += 1;
try {
// lstat so we don't follow a symlink (defensive — there shouldn't
// be any besides `latest`, but never silently chase).
const stat = lstatSync(path);
if (!stat.isFile()) continue;
if (stat.mtimeMs >= cutoff) continue;
unlinkSync(path);
result.pruned += 1;
} catch {
result.errors += 1;
}
}
return result;
}
export function readTraceEvents(basePath, type, windowHours = 24) { export function readTraceEvents(basePath, type, windowHours = 24) {
// Read all trace files modified within windowHours, filter by event type // Read all trace files modified within windowHours, filter by event type
// Returns array of matching events // Returns array of matching events