feat: Created draft mapping of SF patterns to ACE reference draft
SF-Task: S05/T01
This commit is contained in:
parent
1ed505669b
commit
65e195a9fd
49 changed files with 2263 additions and 272 deletions
13
.gitignore
vendored
13
.gitignore
vendored
|
|
@ -106,4 +106,17 @@ repowise.db
|
||||||
.sf/scaffold-manifest.json
|
.sf/scaffold-manifest.json
|
||||||
.sf/interactive.lock
|
.sf/interactive.lock
|
||||||
.sf/interactive.lock.d/
|
.sf/interactive.lock.d/
|
||||||
|
# SQLite WAL/SHM are ephemeral checkpoint files — only the .db is durable.
|
||||||
|
.sf/metrics.db-wal
|
||||||
|
.sf/metrics.db-shm
|
||||||
|
.sf/sf.db-wal
|
||||||
|
.sf/sf.db-shm
|
||||||
|
# Per-dispatch trace files accumulate one-per-request and are runtime-only.
|
||||||
|
# Consumers (sf-db-gates, adaptive verification policy) read by mtime window
|
||||||
|
# (24h–30d) — on-disk retention is needed, but git tracking is not.
|
||||||
|
.sf/traces/pre-dispatch:*.jsonl
|
||||||
|
.sf/traces/finalize:*.jsonl
|
||||||
|
.sf/traces/guard:*.jsonl
|
||||||
|
# `latest` is a symlink retargeted on every dispatch — pure git noise.
|
||||||
|
.sf/traces/latest
|
||||||
test_output.log
|
test_output.log
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,3 @@
|
||||||
{
|
{
|
||||||
"lastFullVacuumAt": "2026-05-12T13:59:07.765Z"
|
"lastFullVacuumAt": "2026-05-12T20:58:28.744Z"
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
.sf/backups/db/sf.db.2026-05-12T20-58-28-491Z
Normal file
BIN
.sf/backups/db/sf.db.2026-05-12T20-58-28-491Z
Normal file
Binary file not shown.
BIN
.sf/backups/db/sf.db.2026-05-12T21-15-56-990Z
Normal file
BIN
.sf/backups/db/sf.db.2026-05-12T21-15-56-990Z
Normal file
Binary file not shown.
BIN
.sf/backups/db/sf.db.2026-05-12T23-50-31-488Z
Normal file
BIN
.sf/backups/db/sf.db.2026-05-12T23-50-31-488Z
Normal file
Binary file not shown.
|
|
@ -60,5 +60,5 @@
|
||||||
"confidence": "EXTRACTED"
|
"confidence": "EXTRACTED"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"builtAt": "2026-05-12T15:26:43.252Z"
|
"builtAt": "2026-05-12T23:53:23.408Z"
|
||||||
}
|
}
|
||||||
BIN
.sf/metrics.db
BIN
.sf/metrics.db
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -1 +1 @@
|
||||||
{"fetchedAt":"2026-05-12T14:54:31.656Z","modelIds":["mistral-medium-2505","mistral-medium-2508","mistral-medium-latest","mistral-medium","mistral-vibe-cli-with-tools","open-mistral-nemo","open-mistral-nemo-2407","mistral-tiny-2407","mistral-tiny-latest","codestral-2508","codestral-latest","devstral-2512","devstral-medium-latest","devstral-latest","mistral-small-2603","mistral-small-latest","mistral-vibe-cli-fast","magistral-small-latest","magistral-medium-2509","magistral-medium-latest","labs-leanstral-2603","mistral-large-2512","mistral-large-latest","mistral-large-2512","mistral-large-latest","ministral-3b-2512","ministral-3b-latest","ministral-8b-2512","ministral-8b-latest","ministral-14b-2512","ministral-14b-latest","mistral-medium-3-5","mistral-medium-3.5","mistral-medium-3","mistral-medium-2604","mistral-medium-c21211-r0-75","mistral-vibe-cli-latest","mistral-large-2411","pixtral-large-2411","pixtral-large-latest","mistral-large-pixtral-2411","devstral-small-2507","devstral-medium-2507","magistral-small-2509","mistral-small-2506"]}
|
{"fetchedAt":"2026-05-12T21:25:20.919Z","modelIds":["mistral-medium-2505","mistral-medium-2508","mistral-medium-latest","mistral-medium","mistral-vibe-cli-with-tools","open-mistral-nemo","open-mistral-nemo-2407","mistral-tiny-2407","mistral-tiny-latest","codestral-2508","codestral-latest","devstral-2512","devstral-medium-latest","devstral-latest","mistral-small-2603","mistral-small-latest","mistral-vibe-cli-fast","magistral-small-latest","magistral-medium-2509","magistral-medium-latest","labs-leanstral-2603","mistral-large-2512","mistral-large-latest","mistral-large-2512","mistral-large-latest","ministral-3b-2512","ministral-3b-latest","ministral-8b-2512","ministral-8b-latest","ministral-14b-2512","ministral-14b-latest","mistral-medium-3-5","mistral-medium-3.5","mistral-medium-3","mistral-medium-2604","mistral-medium-c21211-r0-75","mistral-vibe-cli-latest","mistral-large-2411","pixtral-large-2411","pixtral-large-latest","mistral-large-pixtral-2411","devstral-small-2507","devstral-medium-2507","magistral-small-2509","mistral-small-2506"]}
|
||||||
File diff suppressed because one or more lines are too long
|
|
@ -109,26 +109,26 @@
|
||||||
"total": 1
|
"total": 1
|
||||||
},
|
},
|
||||||
"kimi-coding/kimi-k2.6": {
|
"kimi-coding/kimi-k2.6": {
|
||||||
"successes": 1,
|
"successes": 2,
|
||||||
"failures": 0,
|
"failures": 0,
|
||||||
"timeouts": 0,
|
"timeouts": 0,
|
||||||
"totalTokens": 1821480,
|
"totalTokens": 1892068,
|
||||||
"totalCost": 0,
|
"totalCost": 0.030715552,
|
||||||
"lastUsed": "2026-05-12T20:57:45.179Z",
|
"lastUsed": "2026-05-12T23:58:57.132Z",
|
||||||
"successRate": 1,
|
"successRate": 1,
|
||||||
"total": 1
|
"total": 2
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"complete-slice": {
|
"complete-slice": {
|
||||||
"kimi-coding/kimi-k2.6": {
|
"kimi-coding/kimi-k2.6": {
|
||||||
"successes": 1,
|
"successes": 2,
|
||||||
"failures": 0,
|
"failures": 0,
|
||||||
"timeouts": 0,
|
"timeouts": 0,
|
||||||
"totalTokens": 719526,
|
"totalTokens": 814376,
|
||||||
"totalCost": 0.026709,
|
"totalCost": 0.053080319800000005,
|
||||||
"lastUsed": "2026-05-12T15:26:57.708Z",
|
"lastUsed": "2026-05-12T23:54:01.143Z",
|
||||||
"successRate": 1,
|
"successRate": 1,
|
||||||
"total": 1
|
"total": 2
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -1,16 +0,0 @@
|
||||||
[
|
|
||||||
{
|
|
||||||
"kind": "write",
|
|
||||||
"toolCallId": "write_1778619443353_32",
|
|
||||||
"path": ".sf/milestones/M001-6377a4/slices/S04/VERIFICATION_MATRIX.md",
|
|
||||||
"timestamp": 1778619443535
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"kind": "bash",
|
|
||||||
"toolCallId": "bash_1778619447339_33",
|
|
||||||
"command": "test -f .sf/milestones/M001-6377a4/slices/S04/VERIFICATION_MATRIX.md && grep -q \"status\" .sf/milestones/M001-6377a4/slices/S04/VERIFICATION_MATRIX.md && echo \"Matrix exists and contains status command info.\"",
|
|
||||||
"exitCode": 0,
|
|
||||||
"outputSnippet": "Matrix exists and contains status command info.\n",
|
|
||||||
"timestamp": 1778619447544
|
|
||||||
}
|
|
||||||
]
|
|
||||||
1
.sf/safety/evidence-M001-6377a4-S04-T02.json
Normal file
1
.sf/safety/evidence-M001-6377a4-S04-T02.json
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
[]
|
||||||
16
.sf/safety/evidence-M001-6377a4-S05-T01.json
Normal file
16
.sf/safety/evidence-M001-6377a4-S05-T01.json
Normal file
|
|
@ -0,0 +1,16 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"kind": "write",
|
||||||
|
"toolCallId": "DgPnxQEen",
|
||||||
|
"path": "docs/dev/sf-ace-patterns.md.draft",
|
||||||
|
"timestamp": 1778630297060
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"kind": "bash",
|
||||||
|
"toolCallId": "8FjDDZSlA",
|
||||||
|
"command": "test -f docs/dev/sf-ace-patterns.md.draft && grep -c \"SF Implementation\" docs/dev/sf-ace-patterns.md.draft | grep -q \"6\"",
|
||||||
|
"exitCode": 0,
|
||||||
|
"outputSnippet": "(no output)",
|
||||||
|
"timestamp": 1778630298077
|
||||||
|
}
|
||||||
|
]
|
||||||
16
.sf/slice-routing.json
Normal file
16
.sf/slice-routing.json
Normal file
|
|
@ -0,0 +1,16 @@
|
||||||
|
{
|
||||||
|
"M001-6377a4/S04": {
|
||||||
|
"provider": "minimax",
|
||||||
|
"id": "MiniMax-M2.1",
|
||||||
|
"ts": "2026-05-12T23:54:01.079Z",
|
||||||
|
"lastUnitType": "complete-slice",
|
||||||
|
"lastUnitId": "M001-6377a4/S04"
|
||||||
|
},
|
||||||
|
"M001-6377a4/S05": {
|
||||||
|
"provider": "mistral",
|
||||||
|
"id": "codestral-latest",
|
||||||
|
"ts": "2026-05-12T23:58:57.088Z",
|
||||||
|
"lastUnitType": "execute-task",
|
||||||
|
"lastUnitId": "M001-6377a4/S05/T01"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -1 +1 @@
|
||||||
guard:76c7c307-91b4-426e-8fad-4ff951d5a52e.jsonl
|
guard:b8cbf9df-9fe8-4203-9c63-79fc7264d74e.jsonl
|
||||||
36
TODO.md
36
TODO.md
|
|
@ -3,3 +3,39 @@
|
||||||
Dump anything here.
|
Dump anything here.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
## Self-Feedback Inbox
|
||||||
|
|
||||||
|
### [prompt-modularization] Phase 3 — migrate remaining builders to `composeUnitContext` v2
|
||||||
|
|
||||||
|
**Context:** Phase 1 (fragment infrastructure, 17-prompt Working Directory deduplication) and
|
||||||
|
Phase 2 (5 stub manifests for deploy/smoke-production/release/rollback/challenge) shipped in
|
||||||
|
commit `ca5d869e3`. 9 of 26 unit types are now fully manifest-driven via `composeInlinedContext`.
|
||||||
|
|
||||||
|
**What's blocked and why:**
|
||||||
|
|
||||||
|
Migrating the remaining 17 builders to `composeInlinedContext` (v1) is the wrong path because:
|
||||||
|
1. `inlineKnowledgeScoped` and `inlineGraphSubgraph` are NOT in `ARTIFACT_KEYS` — these
|
||||||
|
artifacts would remain imperative and undeclared in every manifest, making manifests
|
||||||
|
structurally unreliable descriptions of actual builder behavior.
|
||||||
|
2. Injecting knowledge/graph at the right position in the composed string requires fragile
|
||||||
|
sentinel-string searches (e.g., `body.lastIndexOf("### Task Summary:")`). This pattern
|
||||||
|
is currently untested in the 2 already-migrated complex builders (`research-milestone`, `complete-slice`).
|
||||||
|
3. `composeUnitContext` (v2) in `unit-context-composer.js` already has `computed`, `prepend`,
|
||||||
|
and `excerpt` support — knowledge and graph inlining maps cleanly to `computed` entries.
|
||||||
|
Migrating to v1 now creates a half-migration state that must be undone when v2 lands.
|
||||||
|
|
||||||
|
**Recommended next slice:**
|
||||||
|
1. Add `"knowledge"` and `"graph"` to `ARTIFACT_KEYS` in `unit-context-manifest.js`.
|
||||||
|
2. Register them as `computed` entries in the relevant per-unit manifests in `UNIT_MANIFESTS`.
|
||||||
|
3. Wire one builder (e.g., `buildResearchSlicePrompt`) through `composeUnitContext` v2 as pilot.
|
||||||
|
4. Add position-assertion tests to already-migrated complex builders (`research-milestone`,
|
||||||
|
`complete-slice`) to guard against silent ordering degradation.
|
||||||
|
5. Then migrate remaining builders in batches: slice builders → milestone builders → execute-task.
|
||||||
|
|
||||||
|
**Note on `prompt-cache-optimizer.js`:** Almost entirely dead code — `optimizeForCaching()`,
|
||||||
|
`estimateCacheSavings()`, `computeCacheHitRate()` have zero importers. `reorderForCaching()`
|
||||||
|
is wired at `phases-unit.js:519` but no `cache_control` markers are written to outgoing
|
||||||
|
requests. Remove the file or wire it in the same slice that adds `cache_control` breakpoints.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
|
||||||
29
docs/dev/sf-ace-patterns.md.draft
Normal file
29
docs/dev/sf-ace-patterns.md.draft
Normal file
|
|
@ -0,0 +1,29 @@
|
||||||
|
# SF Patterns to ACE Reference Draft Mapping
|
||||||
|
|
||||||
|
## Preferences
|
||||||
|
|
||||||
|
**SF Implementation:** `src/resources/extensions/sf/preferences.js`
|
||||||
|
|
||||||
|
## PDD
|
||||||
|
|
||||||
|
**SF Implementation:** `src/resources/extensions/sf/uok/unit-runtime.js`
|
||||||
|
|
||||||
|
## UOK Gates
|
||||||
|
|
||||||
|
**SF Implementation:** `src/resources/extensions/sf/uok/gate-runner.js`
|
||||||
|
|
||||||
|
## Notifications
|
||||||
|
|
||||||
|
**SF Implementation:** `src/resources/extensions/sf/skills/frontmatter.js`
|
||||||
|
|
||||||
|
## Skills-as-Contracts
|
||||||
|
|
||||||
|
**SF Implementation:** `src/resources/extensions/sf/steerable-autonomous-panel.js`
|
||||||
|
|
||||||
|
## Idempotency
|
||||||
|
|
||||||
|
**SF Implementation:** `src/resources/extensions/sf/uok/unit-runtime.js`
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
|
||||||
|
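- All 6 patterns have an `SF Implementation` file path listed in this document. Draft caveat: the Notifications and Skills-as-Contracts paths look swapped or mismatched — re-verify each mapping before promoting this draft.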
|
||||||
85
docs/product/SURFACE_CAPABILITIES.md
Normal file
85
docs/product/SURFACE_CAPABILITIES.md
Normal file
|
|
@ -0,0 +1,85 @@
|
||||||
|
# SF Product Surface Capabilities
|
||||||
|
|
||||||
|
This document defines the command and feature availability across SF's three product surfaces: **CLI / Headless**, **TUI**, and **Web**. It records intentional gaps so they are not mistaken for bugs.
|
||||||
|
|
||||||
|
## Surface Definitions
|
||||||
|
|
||||||
|
| Surface | Description | Primary Consumer |
|
||||||
|
| :--- | :--- | :--- |
|
||||||
|
| **CLI / Headless** | Non-interactive command-line interface and machine-surface protocol (`sf headless`). | Scripts, CI/CD, editor integrations, autonomous dispatch. |
|
||||||
|
| **TUI** | Interactive Terminal User Interface with dashboards, visualizers, and live overlays. | Developers working locally who prefer keyboard-driven interaction. |
|
||||||
|
| **Web** | Browser-based interface (Next.js) with panels, command surfaces, and visual tools. | Developers who prefer a GUI, remote access, or power-mode workflows. |
|
||||||
|
|
||||||
|
## Feature Matrix
|
||||||
|
|
||||||
|
| Command / Feature | CLI / Headless | TUI | Web | Notes |
|
||||||
|
| :--- | :--- | :--- | :--- | :--- |
|
||||||
|
| `/status` | ✅ | ✅ | ✅ | Text in CLI/Headless; dashboard overlay in TUI; terminal or `sf-status` panel in Web. |
|
||||||
|
| `/plan` | ✅ | ✅ | ❌ **Intentional Gap** | See [Intentional Gaps](#intentional-gaps) below. |
|
||||||
|
| `/run` (`/next`, `/autonomous`) | ✅ | ✅ | ❌ **Intentional Gap** | See [Intentional Gaps](#intentional-gaps) below. |
|
||||||
|
| `/steer` | ✅ | ✅ | ✅ | Web exposes via `sf-steer` panel. |
|
||||||
|
| `/undo` | ✅ | ✅ | ✅ | Web exposes via `sf-undo` panel. |
|
||||||
|
| `/history` | ✅ | ✅ | ✅ | Web exposes via `sf-history` panel. |
|
||||||
|
| `/doctor` | ✅ | ✅ | ✅ | Web exposes via `sf-doctor` panel. |
|
||||||
|
| `/forensics` | ✅ | ✅ | ✅ | Web exposes via `sf-forensics` panel. |
|
||||||
|
| `/skills` | ✅ | ✅ | ✅ | Web exposes via `sf-skill-health` panel. |
|
||||||
|
| `/capture` | ✅ | ✅ | ✅ | Web exposes via `sf-capture` panel. |
|
||||||
|
| `/triage` | ✅ | ✅ | ✅ | Web exposes via `sf-triage` panel. |
|
||||||
|
| `/inspect` | ✅ | ✅ | ✅ | Web exposes via `sf-inspect` panel. |
|
||||||
|
| `/hooks` | ✅ | ✅ | ✅ | Web exposes via `sf-hooks` panel. |
|
||||||
|
| `/cleanup` | ✅ | ✅ | ✅ | Web exposes via `sf-cleanup` panel. |
|
||||||
|
| `/export` | ✅ | ✅ | ✅ | Web exposes via `sf-export` panel. |
|
||||||
|
| `/queue` | ✅ | ✅ | ✅ | Web exposes via `sf-queue` panel. |
|
||||||
|
| `/visualize` | ✅ | ✅ | ✅ | Web exposes via `sf-visualize` panel. |
|
||||||
|
| `/prefs` | ✅ | ✅ | ✅ | Web exposes via `sf-prefs` panel. |
|
||||||
|
| `/config` | ✅ | ✅ | ✅ | Web exposes via `sf-config` panel. |
|
||||||
|
| `/mode` | ✅ | ✅ | ✅ | Web exposes via `sf-mode` panel. |
|
||||||
|
| `/model` | ✅ | ✅ | ✅ | Web exposes via dedicated **Model** command surface. |
|
||||||
|
| `/thinking` | ✅ | ✅ | ✅ | Web exposes via dedicated **Thinking** command surface. |
|
||||||
|
| `/git` | ✅ | ✅ | ✅ | Web exposes via dedicated **Git** command surface. |
|
||||||
|
| `/settings` | ✅ | ✅ | ✅ | Web exposes via dedicated **Settings** command surface (general, recovery, auth, admin, experimental). |
|
||||||
|
| `/resume` | ✅ | ✅ | ✅ | Web exposes via dedicated **Resume** command surface. |
|
||||||
|
| `/name` | ✅ | ✅ | ✅ | Web exposes via dedicated **Name** command surface. |
|
||||||
|
| `/fork` | ✅ | ✅ | ✅ | Web exposes via dedicated **Fork** command surface. |
|
||||||
|
| `/session` | ✅ | ✅ | ✅ | Web exposes via dedicated **Session** command surface. |
|
||||||
|
| `/compact` | ✅ | ✅ | ✅ | Web exposes via dedicated **Compact** command surface. |
|
||||||
|
| `/tasks` | ✅ | ✅ | ✅ | Web exposes via Dashboard and Activity views. |
|
||||||
|
| `/research` | ✅ | ✅ | ✅ | Web terminal supports typing the command. |
|
||||||
|
| `/implement` | ✅ | ✅ | ✅ | Web terminal supports typing the command. |
|
||||||
|
|
||||||
|
## Intentional Gaps
|
||||||
|
|
||||||
|
### `/plan` is not available as a first-class Web UI workflow
|
||||||
|
|
||||||
|
**Why:** The web UI uses a different, browser-native planning and execution model. Planning artifacts are promoted through CLI-first workflows (`sf plan promote`) that require filesystem access, Git operations, and markdown rendering pipelines optimized for terminal and editor surfaces. The web surface focuses on higher-level UI interactions (roadmap views, milestone explorers, visual planning tools) rather than raw slash-command promotion.
|
||||||
|
|
||||||
|
**What web users do instead:**
|
||||||
|
- Use the **Roadmap** and **Milestone Explorer** views to inspect and navigate planning state.
|
||||||
|
- Type `/plan` in the embedded terminal if needed; the command executes but the full promotion workflow is CLI-first.
|
||||||
|
|
||||||
|
### `/run` (`/next`, `/autonomous`) is not available as a first-class Web UI workflow
|
||||||
|
|
||||||
|
**Why:** The web UI uses a different, browser-native execution model. Backend execution is managed via specific API routes and WebSocket/bridge communication rather than a `/run` command dispatch. The web surface prioritizes supervised, click-driven execution (e.g., **Power Mode**, action buttons, workflow steppers) over autonomous terminal-style dispatch.
|
||||||
|
|
||||||
|
**What web users do instead:**
|
||||||
|
- Use **Power Mode** for guided, step-by-step unit execution.
|
||||||
|
- Use **Chat Mode** for conversational task dispatch.
|
||||||
|
- Type `/autonomous` or `/next` in the embedded terminal if needed; execution proceeds via the PTY bridge.
|
||||||
|
|
||||||
|
## Design Principle
|
||||||
|
|
||||||
|
> **Behavioral coherence, not visual parity.**
|
||||||
|
>
|
||||||
|
> Every surface must expose the *same underlying state* (via `deriveState()`, UOK diagnostics, and bridge data) but may present it through different interaction models. A gap is intentional only when the surface provides an equivalent or superior alternative workflow for the same user goal.
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
|
||||||
|
This matrix is verified against:
|
||||||
|
- `src/resources/extensions/sf/commands/handlers/core.js` — CLI/TUI `status` handler.
|
||||||
|
- `src/resources/extensions/sf/commands/handlers/ops.js` — CLI/TUI `plan` and `run` handlers.
|
||||||
|
- `src/headless.ts` — Headless status and execution entrypoints.
|
||||||
|
- `web/components/sf/command-surface.tsx` — Web command surface registry.
|
||||||
|
- `web/lib/command-surface-contract.ts` — Web command surface type definitions.
|
||||||
|
- `web/components/sf/sidebar.tsx` — Web navigation and exposed commands.
|
||||||
|
|
||||||
|
For the full behavioral audit, see `.sf/milestones/M001-6377a4/slices/S04/VERIFICATION_MATRIX.md`.
|
||||||
75
packages/ai/src/providers/openai-completions.test.ts
Normal file
75
packages/ai/src/providers/openai-completions.test.ts
Normal file
|
|
@ -0,0 +1,75 @@
|
||||||
|
import assert from "node:assert/strict";
|
||||||
|
import { describe, it } from "vitest";
|
||||||
|
import type { Context, Model, OpenAICompletionsCompat } from "../types.js";
|
||||||
|
import { convertMessages } from "./openai-completions.js";
|
||||||
|
|
||||||
|
// Compat flags handed to convertMessages by every test in this file.
// All three listed flags are switched off; the cast asserts the literal
// as the fully-required compat shape.
const compat = {
	supportsDeveloperRole: false,
	requiresAssistantAfterToolResult: false,
	requiresThinkingAsText: false,
} as Required<OpenAICompletionsCompat>;
|
||||||
|
|
||||||
|
/**
 * Builds a minimal "openai-completions" model fixture for the given provider and id.
 *
 * The base URL is the OpenRouter endpoint when `provider` is "openrouter" and the
 * OpenAI endpoint otherwise; every other field is a fixed placeholder value.
 */
function model(provider: string, id: string): Model<"openai-completions"> {
	let baseUrl = "https://api.openai.com/v1";
	if (provider === "openrouter") {
		baseUrl = "https://openrouter.ai/api/v1";
	}

	const fixture: Model<"openai-completions"> = {
		id,
		name: id,
		api: "openai-completions",
		provider,
		baseUrl,
		reasoning: false,
		input: ["text"],
		cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
		contextWindow: 128_000,
		maxTokens: 4096,
	};
	return fixture;
}
|
||||||
|
|
||||||
|
/**
 * Builds a context holding a single user message with two text parts:
 * a "stable prefix" part carrying `cache_control: { type: "ephemeral" }`
 * and a plain "dynamic suffix" part.
 */
function contextWithCacheControl(): Context {
	// cache_control is not part of the declared text-part type, hence the `any` cast.
	const cachedPrefix = {
		type: "text",
		text: "stable prefix",
		cache_control: { type: "ephemeral" },
	} as any;

	return {
		messages: [
			{
				role: "user",
				content: [cachedPrefix, { type: "text", text: "dynamic suffix" }],
				timestamp: Date.now(),
			},
		],
	};
}
|
||||||
|
|
||||||
|
describe("convertMessages cache_control", () => {
	// Converts the shared cache_control fixture for the given model and
	// returns the content of the first converted message.
	const firstMessageContent = (provider: string, id: string) => {
		const converted = convertMessages(
			model(provider, id),
			contextWithCacheControl(),
			compat,
		);
		return converted[0].content;
	};

	it("preserves_cache_control_when_openrouter_anthropic_model", () => {
		const parts = firstMessageContent(
			"openrouter",
			"anthropic/claude-sonnet-4.5",
		);

		assert.ok(Array.isArray(parts));
		// The marker set on the first text part must survive conversion.
		const expectedMarker = { type: "ephemeral" };
		assert.deepEqual((parts[0] as any).cache_control, expectedMarker);
	});

	it("strips_cache_control_when_openai_compatible_model_does_not_support_it", () => {
		const parts = firstMessageContent("openai", "gpt-5.3-chat-latest");

		assert.ok(Array.isArray(parts));
		// For a non-OpenRouter/Anthropic model the marker must be absent.
		assert.equal((parts[0] as any).cache_control, undefined);
	});
});
|
||||||
|
|
@ -493,6 +493,12 @@ function maybeAddOpenRouterAnthropicToolCacheControl(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Returns true only when the model's provider is "openrouter" and its id
 * starts with "anthropic/"; returns false for every other model.
 */
function supportsOpenRouterAnthropicCacheControl(
	model: Model<"openai-completions">,
): boolean {
	if (model.provider !== "openrouter") return false;
	return model.id.startsWith("anthropic/");
}
|
||||||
|
|
||||||
function mapReasoningEffort(
|
function mapReasoningEffort(
|
||||||
effort: NonNullable<OpenAICompletionsOptions["reasoningEffort"]>,
|
effort: NonNullable<OpenAICompletionsOptions["reasoningEffort"]>,
|
||||||
reasoningEffortMap: Partial<
|
reasoningEffortMap: Partial<
|
||||||
|
|
@ -506,8 +512,7 @@ function maybeAddOpenRouterAnthropicCacheControl(
|
||||||
model: Model<"openai-completions">,
|
model: Model<"openai-completions">,
|
||||||
messages: ChatCompletionMessageParam[],
|
messages: ChatCompletionMessageParam[],
|
||||||
): void {
|
): void {
|
||||||
if (model.provider !== "openrouter" || !model.id.startsWith("anthropic/"))
|
if (!supportsOpenRouterAnthropicCacheControl(model)) return;
|
||||||
return;
|
|
||||||
|
|
||||||
// Anthropic-style caching requires cache_control on a text part. Add a breakpoint
|
// Anthropic-style caching requires cache_control on a text part. Add a breakpoint
|
||||||
// on the last user/assistant message (walking backwards until we find text content).
|
// on the last user/assistant message (walking backwards until we find text content).
|
||||||
|
|
@ -622,9 +627,11 @@ export function convertMessages(
|
||||||
// Preserve cache_control if present (set upstream for Anthropic prompt caching).
|
// Preserve cache_control if present (set upstream for Anthropic prompt caching).
|
||||||
// The property is not in the OpenAI SDK type but is accepted by providers
|
// The property is not in the OpenAI SDK type but is accepted by providers
|
||||||
// that support Anthropic-style caching (openrouter/anthropic/*).
|
// that support Anthropic-style caching (openrouter/anthropic/*); it is dropped for all other models.
|
||||||
const cacheControl = (
|
const cacheControl = supportsOpenRouterAnthropicCacheControl(
|
||||||
item as unknown as Record<string, unknown>
|
model,
|
||||||
).cache_control;
|
)
|
||||||
|
? (item as unknown as Record<string, unknown>).cache_control
|
||||||
|
: undefined;
|
||||||
if (cacheControl) {
|
if (cacheControl) {
|
||||||
(part as unknown as Record<string, unknown>).cache_control =
|
(part as unknown as Record<string, unknown>).cache_control =
|
||||||
cacheControl;
|
cacheControl;
|
||||||
|
|
|
||||||
64
src/resources/agents/rubber-duck.md
Normal file
64
src/resources/agents/rubber-duck.md
Normal file
|
|
@ -0,0 +1,64 @@
|
||||||
|
---
|
||||||
|
name: rubber-duck
|
||||||
|
description: Constructive pre-implementation critic — catches design flaws, missing edge cases, and gaps before code is written
|
||||||
|
model: sonnet
|
||||||
|
tools: read, grep, find, ls, bash
|
||||||
|
---
|
||||||
|
|
||||||
|
You are a constructive critic. Your job is to identify real problems in a plan, design, or code change **before** implementation is committed to — when course corrections are still cheap.
|
||||||
|
|
||||||
|
You are **read-only**. Do not edit files. Do not run commands that change the environment.
|
||||||
|
|
||||||
|
## What you review
|
||||||
|
|
||||||
|
You receive a plan, a design proposal, a code diff, or a task description. You review it for:
|
||||||
|
|
||||||
|
- **Logic errors** — incorrect assumptions, wrong control flow, missing invariants
|
||||||
|
- **Missing edge cases** — inputs/states the plan doesn't account for
|
||||||
|
- **Design flaws** — abstractions that won't hold, coupling that will hurt, missing separation of concerns
|
||||||
|
- **Security issues** — unvalidated inputs, exposed secrets, auth gaps
|
||||||
|
- **Test gaps** — behavior that will be untested or untestable with the proposed approach
|
||||||
|
- **Spec contradictions** — where the plan conflicts with stated requirements or existing behavior
|
||||||
|
|
||||||
|
## What you do NOT comment on
|
||||||
|
|
||||||
|
- Code style, formatting, naming conventions
|
||||||
|
- Grammar or wording in comments/docs
|
||||||
|
- Best practices that don't cause an actual problem
|
||||||
|
- Refactoring that doesn't change correctness
|
||||||
|
- Minor improvements that don't affect the task outcome
|
||||||
|
|
||||||
|
If something is fine, say so. Do not manufacture findings to seem thorough. A short report with two real findings beats a long report with ten nitpicks.
|
||||||
|
|
||||||
|
## Output format
|
||||||
|
|
||||||
|
For each finding:
|
||||||
|
|
||||||
|
```
|
||||||
|
## [Blocking|Non-blocking|Suggestion] — <title>
|
||||||
|
|
||||||
|
**What:** <the specific problem, stated precisely>
|
||||||
|
**Why it matters:** <the actual impact — what breaks, under what condition>
|
||||||
|
**Fix:** <concrete change to address it>
|
||||||
|
```
|
||||||
|
|
||||||
|
Then a final verdict:
|
||||||
|
|
||||||
|
```
|
||||||
|
## Verdict
|
||||||
|
|
||||||
|
READY / NEEDS-REVISION
|
||||||
|
|
||||||
|
One sentence: overall assessment.
|
||||||
|
```
|
||||||
|
|
||||||
|
- `READY` — no blocking findings; the plan/code can proceed as-is
|
||||||
|
- `NEEDS-REVISION` — at least one blocking finding must be addressed first
|
||||||
|
|
||||||
|
## Severity guide
|
||||||
|
|
||||||
|
- **Blocking** — will cause a bug, data loss, security issue, or test failure if not fixed
|
||||||
|
- **Non-blocking** — should be fixed for quality but won't break the task
|
||||||
|
- **Suggestion** — worth considering; low priority
|
||||||
|
|
||||||
|
Lead with blocking findings. If there are none, say so explicitly before the non-blocking ones.
|
||||||
|
|
@ -18,6 +18,7 @@ import {
|
||||||
loadCapabilityOverrides,
|
loadCapabilityOverrides,
|
||||||
resolveModelForComplexity,
|
resolveModelForComplexity,
|
||||||
} from "./model-router.js";
|
} from "./model-router.js";
|
||||||
|
import { readStickyModelForUnit } from "./slice-routing-cache.js";
|
||||||
import {
|
import {
|
||||||
filterModelsByProviderModelAllow,
|
filterModelsByProviderModelAllow,
|
||||||
isProviderAllowedByLists,
|
isProviderAllowedByLists,
|
||||||
|
|
@ -543,6 +544,15 @@ export async function selectAndApplyModel(
|
||||||
selectionMethod: "tier-only",
|
selectionMethod: "tier-only",
|
||||||
};
|
};
|
||||||
} else {
|
} else {
|
||||||
|
// Slice-sticky hint: prefer the model that previously succeeded
|
||||||
|
// on a sibling unit in this slice when its capability score is
|
||||||
|
// within window of the winner. Cleared on executor refusal so a
|
||||||
|
// failing model does not re-attach to the slice.
|
||||||
|
const stickyHint = readStickyModelForUnit(
|
||||||
|
basePath,
|
||||||
|
unitType,
|
||||||
|
unitId,
|
||||||
|
);
|
||||||
routingResult = resolveModelForComplexity(
|
routingResult = resolveModelForComplexity(
|
||||||
classification,
|
classification,
|
||||||
modelConfig,
|
modelConfig,
|
||||||
|
|
@ -551,6 +561,7 @@ export async function selectAndApplyModel(
|
||||||
unitType,
|
unitType,
|
||||||
classification.taskMetadata,
|
classification.taskMetadata,
|
||||||
capabilityOverrides,
|
capabilityOverrides,
|
||||||
|
stickyHint,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
if (routingResult.wasDowngraded) {
|
if (routingResult.wasDowngraded) {
|
||||||
|
|
|
||||||
|
|
@ -82,7 +82,9 @@ import {
|
||||||
import { initRoutingHistory } from "./routing-history.js";
|
import { initRoutingHistory } from "./routing-history.js";
|
||||||
import {
|
import {
|
||||||
acquireSessionLock,
|
acquireSessionLock,
|
||||||
|
isSessionPidAlive,
|
||||||
releaseSessionLock,
|
releaseSessionLock,
|
||||||
|
terminateExistingSession,
|
||||||
updateSessionLock,
|
updateSessionLock,
|
||||||
} from "./session-lock.js";
|
} from "./session-lock.js";
|
||||||
import { getSessionModelOverride } from "./session-model-override.js";
|
import { getSessionModelOverride } from "./session-model-override.js";
|
||||||
|
|
@ -342,15 +344,91 @@ export async function bootstrapAutoSession(
|
||||||
lockBase,
|
lockBase,
|
||||||
buildResolver,
|
buildResolver,
|
||||||
} = deps;
|
} = deps;
|
||||||
const lockResult = acquireSessionLock(base, {
|
let lockResult = acquireSessionLock(base, {
|
||||||
sessionId: ctx.sessionManager?.getSessionId?.(),
|
sessionId: ctx.sessionManager?.getSessionId?.(),
|
||||||
sessionFile: ctx.sessionManager?.getSessionFile?.(),
|
sessionFile: ctx.sessionManager?.getSessionFile?.(),
|
||||||
});
|
});
|
||||||
|
// Lock busy on a *live* peer: instead of just refusing to start, ask the
|
||||||
|
// operator whether to terminate the existing session and take over. Three
|
||||||
|
// non-interactive escape hatches keep CI/headless usage predictable:
|
||||||
|
// - SF_KILL_EXISTING=1 (or =true / =yes) — auto-confirm the kill
|
||||||
|
// - SF_KILL_EXISTING=0 (or =false / =no) — auto-decline (refuse to start, as before this prompt existed)
|
||||||
|
// - SF_HEADLESS=1 with no SF_KILL_EXISTING — auto-decline (safe default
|
||||||
|
// for batch contexts where a hung interactive prompt would deadlock)
|
||||||
|
if (!lockResult.acquired && lockResult.existingPid) {
|
||||||
|
const existingPid = Number(lockResult.existingPid);
|
||||||
|
if (isSessionPidAlive(existingPid)) {
|
||||||
|
const envKill = String(process.env.SF_KILL_EXISTING ?? "")
|
||||||
|
.trim()
|
||||||
|
.toLowerCase();
|
||||||
|
const headless =
|
||||||
|
process.env.SF_HEADLESS === "1" ||
|
||||||
|
String(process.env.SF_HEADLESS ?? "").toLowerCase() === "true";
|
||||||
|
let confirmed;
|
||||||
|
if (envKill === "1" || envKill === "true" || envKill === "yes") {
|
||||||
|
confirmed = true;
|
||||||
|
} else if (envKill === "0" || envKill === "false" || envKill === "no") {
|
||||||
|
confirmed = false;
|
||||||
|
} else if (headless) {
|
||||||
|
// Headless without an explicit opt-in: refuse to kill silently.
|
||||||
|
confirmed = false;
|
||||||
|
} else if (typeof ctx.ui?.confirm === "function") {
|
||||||
|
confirmed = await ctx.ui.confirm(
|
||||||
|
"Stop running SF session?",
|
||||||
|
`Another SF autonomous session (PID ${existingPid}) is already running on this project. Stop it and start a fresh session?`,
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
confirmed = false;
|
||||||
|
}
|
||||||
|
if (confirmed) {
|
||||||
|
ctx.ui.notify(
|
||||||
|
`Stopping existing SF session (PID ${existingPid})…`,
|
||||||
|
"info",
|
||||||
|
);
|
||||||
|
let result;
|
||||||
|
try {
|
||||||
|
result = await terminateExistingSession(existingPid);
|
||||||
|
} catch (err) {
|
||||||
|
ctx.ui.notify(
|
||||||
|
`Failed to stop existing SF session (PID ${existingPid}): ${err?.message ?? err}. Stop it manually with \`kill ${existingPid}\`.`,
|
||||||
|
"error",
|
||||||
|
);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (!result.terminated) {
|
||||||
|
ctx.ui.notify(
|
||||||
|
`Unable to stop existing SF session (PID ${existingPid}). It may belong to another user or be unresponsive. Stop it manually with \`kill -9 ${existingPid}\`.`,
|
||||||
|
"error",
|
||||||
|
);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
ctx.ui.notify(
|
||||||
|
result.escalated
|
||||||
|
? `Existing SF session (PID ${existingPid}) did not exit on SIGTERM; SIGKILL applied.`
|
||||||
|
: `Existing SF session (PID ${existingPid}) stopped.`,
|
||||||
|
result.escalated ? "warning" : "info",
|
||||||
|
);
|
||||||
|
lockResult = acquireSessionLock(base, {
|
||||||
|
sessionId: ctx.sessionManager?.getSessionId?.(),
|
||||||
|
sessionFile: ctx.sessionManager?.getSessionFile?.(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
if (!lockResult.acquired) {
|
if (!lockResult.acquired) {
|
||||||
const reason = lockResult.reason;
|
const reason = lockResult.reason;
|
||||||
ctx.ui.notify(reason, "error");
|
ctx.ui.notify(reason, "error");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
// Session-start janitor: prune per-flow trace files older than the longest
|
||||||
|
// analyzer window (30d). Best-effort, never blocks startup, errors swallowed
|
||||||
|
// in pruneStaleTraces. Keeps `.sf/traces/` from growing without bound.
|
||||||
|
try {
|
||||||
|
const { pruneStaleTraces } = await import("./uok/trace-writer.js");
|
||||||
|
pruneStaleTraces(base);
|
||||||
|
} catch {
|
||||||
|
// trace janitor must never break autonomous startup
|
||||||
|
}
|
||||||
function releaseLockAndReturn() {
|
function releaseLockAndReturn() {
|
||||||
releaseSessionLock(base);
|
releaseSessionLock(base);
|
||||||
clearLock(base);
|
clearLock(base);
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,7 @@
|
||||||
|
|
||||||
import { scopeActiveToolsForUnitType } from "../constants.js";
|
import { scopeActiveToolsForUnitType } from "../constants.js";
|
||||||
import { debugLog } from "../debug-logger.js";
|
import { debugLog } from "../debug-logger.js";
|
||||||
|
import { getErrorMessage } from "../error-utils.js";
|
||||||
import {
|
import {
|
||||||
resolveAutoSupervisorConfig,
|
resolveAutoSupervisorConfig,
|
||||||
resolvePersistModelChanges,
|
resolvePersistModelChanges,
|
||||||
|
|
@ -27,11 +28,29 @@ import {
|
||||||
getCurrentTurnGeneration,
|
getCurrentTurnGeneration,
|
||||||
runWithTurnGeneration,
|
runWithTurnGeneration,
|
||||||
} from "./turn-epoch.js";
|
} from "./turn-epoch.js";
|
||||||
import { getErrorMessage } from "../error-utils.js";
|
|
||||||
|
|
||||||
// Tracks the latest session-switch attempt so a late timeout settlement from an
|
// Tracks the latest session-switch attempt so a late timeout settlement from an
|
||||||
// older runUnit() call cannot clear the guard for a newer one.
|
// older runUnit() call cannot clear the guard for a newer one.
|
||||||
let sessionSwitchGeneration = 0;
|
let sessionSwitchGeneration = 0;
|
||||||
|
/**
 * Build the custom-message content for a unit prompt.
 *
 * Purpose: allow the provider layer to cache the stable prefix separately from
 * the dynamic suffix, while falling back to the plain prompt text whenever no
 * usable split is supplied.
 *
 * Consumer: runUnit before pi.sendMessage dispatches the autonomous unit turn.
 *
 * @param {string} prompt - Full prompt text; returned unchanged when
 *   `promptParts` is missing or malformed.
 * @param {{ before: string, after: string } | null | undefined} promptParts -
 *   Optional split of the prompt into a stable prefix (`before`) and a dynamic
 *   suffix (`after`).
 * @returns {string | Array<object>} The plain prompt, or two text parts where
 *   only the first carries `cache_control: { type: "ephemeral" }`.
 */
export function buildUnitPromptMessageContent(prompt, promptParts) {
  if (!promptParts) return prompt;

  const { before, after } = promptParts;
  // A partially populated split would otherwise be interpolated as the literal
  // text "undefined\n" (and a suffix part with `text: undefined`), silently
  // corrupting the prompt. Send the intact prompt instead.
  if (typeof before !== "string" || typeof after !== "string") return prompt;

  return [
    {
      type: "text",
      // NOTE(review): a newline is appended to the prefix — presumably the
      // prompt was split at a line break that belongs to neither half; confirm
      // against the code that produces promptParts.
      text: `${before}\n`,
      cache_control: { type: "ephemeral" },
    },
    { type: "text", text: after },
  ];
}
|
||||||
/**
|
/**
|
||||||
* Execute a single unit: create a new session, send the prompt, and await
|
* Execute a single unit: create a new session, send the prompt, and await
|
||||||
* the agent_end promise. Returns a UnitResult describing what happened.
|
* the agent_end promise. Returns a UnitResult describing what happened.
|
||||||
|
|
@ -122,8 +141,7 @@ export async function runUnit(ctx, pi, s, unitType, unitId, prompt, options) {
|
||||||
sessionResult = await Promise.race([sessionPromise, timeoutPromise]);
|
sessionResult = await Promise.race([sessionPromise, timeoutPromise]);
|
||||||
} catch (sessionErr) {
|
} catch (sessionErr) {
|
||||||
if (sessionTimeoutHandle) clearTimeout(sessionTimeoutHandle);
|
if (sessionTimeoutHandle) clearTimeout(sessionTimeoutHandle);
|
||||||
const msg =
|
const msg = getErrorMessage(sessionErr);
|
||||||
getErrorMessage(sessionErr);
|
|
||||||
debugLog("runUnit", {
|
debugLog("runUnit", {
|
||||||
phase: "session-error",
|
phase: "session-error",
|
||||||
unitType,
|
unitType,
|
||||||
|
|
@ -264,16 +282,7 @@ export async function runUnit(ctx, pi, s, unitType, unitId, prompt, options) {
|
||||||
// When promptParts is available, send structured content so the provider can
|
// When promptParts is available, send structured content so the provider can
|
||||||
// apply cache_control:ephemeral to the stable prefix (before) while leaving
|
// apply cache_control:ephemeral to the stable prefix (before) while leaving
|
||||||
// the dynamic suffix (after) uncached.
|
// the dynamic suffix (after) uncached.
|
||||||
const messageContent = promptParts
|
const messageContent = buildUnitPromptMessageContent(prompt, promptParts);
|
||||||
? [
|
|
||||||
{
|
|
||||||
type: "text",
|
|
||||||
text: promptParts.before,
|
|
||||||
cache_control: { type: "ephemeral" },
|
|
||||||
},
|
|
||||||
{ type: "text", text: promptParts.after },
|
|
||||||
]
|
|
||||||
: prompt;
|
|
||||||
await pi.sendMessage(
|
await pi.sendMessage(
|
||||||
{ customType: "sf-auto", content: messageContent, display: s.verbose },
|
{ customType: "sf-auto", content: messageContent, display: s.verbose },
|
||||||
{ triggerTurn: true },
|
{ triggerTurn: true },
|
||||||
|
|
|
||||||
|
|
@ -301,7 +301,7 @@ export const TOP_LEVEL_SUBCOMMANDS = [
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
cmd: "rubber-duck",
|
cmd: "rubber-duck",
|
||||||
desc: "Request constructive code/design review from a rubber-duck subagent (RUBBER_DUCK flag)",
|
desc: "Dispatch a rubber-duck subagent for constructive pre-implementation review (alias: review-code)",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
cmd: "delegate",
|
cmd: "delegate",
|
||||||
|
|
|
||||||
|
|
@ -613,25 +613,47 @@ async function handleKeepAlive(args, ctx) {
|
||||||
// ─── /rubber-duck ────────────────────────────────────────────────────────────
|
// ─── /rubber-duck ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
async function handleRubberDuckCommand(topic, ctx, _pi) {
|
async function handleRubberDuckCommand(topic, ctx, _pi) {
|
||||||
if (!getExperimentalFlag("rubber_duck")) {
|
const { execSync } = await import("node:child_process");
|
||||||
ctx.ui.notify(
|
const root = projectRoot();
|
||||||
"RUBBER_DUCK is not enabled. Run /experimental on rubber_duck to enable.",
|
|
||||||
"warning",
|
// Gather git diff for context (staged + unstaged, capped to avoid token bloat)
|
||||||
);
|
let diff = "";
|
||||||
return;
|
|
||||||
}
|
|
||||||
const prompt = topic
|
|
||||||
? `Rubber-duck review requested: ${topic}\n\nPlease review this as a constructive critic: identify risks, edge cases, missing tests, and improvements. Be direct and concise.`
|
|
||||||
: "Please give constructive feedback on the current code changes or design. Identify risks, edge cases, missing tests, and improvements.";
|
|
||||||
ctx.ui.notify(
|
|
||||||
"Starting rubber-duck review… (RUBBER_DUCK agent is constructive, not adversarial)",
|
|
||||||
"info",
|
|
||||||
);
|
|
||||||
try {
|
try {
|
||||||
await ctx.sendMessage?.(prompt);
|
const staged = execSync("git diff --cached --stat 2>/dev/null || true", {
|
||||||
|
cwd: root,
|
||||||
|
encoding: "utf-8",
|
||||||
|
}).trim();
|
||||||
|
const unstaged = execSync("git diff --stat 2>/dev/null || true", {
|
||||||
|
cwd: root,
|
||||||
|
encoding: "utf-8",
|
||||||
|
}).trim();
|
||||||
|
if (staged || unstaged) {
|
||||||
|
const fullDiff = execSync(
|
||||||
|
"git diff --cached 2>/dev/null; git diff 2>/dev/null",
|
||||||
|
{ cwd: root, encoding: "utf-8" },
|
||||||
|
).slice(0, 8000);
|
||||||
|
diff = `\n\n## Current diff (truncated to 8 kB)\n\n\`\`\`diff\n${fullDiff}\n\`\`\``;
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// diff unavailable — not a hard failure
|
||||||
|
}
|
||||||
|
|
||||||
|
const focus = topic ? `Focus on: ${topic}\n\n` : "";
|
||||||
|
const reviewPrompt =
|
||||||
|
`Dispatch a \`rubber-duck\` subagent to review the current plan or changes before proceeding. ` +
|
||||||
|
`Use the \`subagent\` tool with \`agent: "rubber-duck"\`.\n\n` +
|
||||||
|
`${focus}` +
|
||||||
|
`Ask the rubber-duck agent to identify blocking issues, non-blocking issues, and suggestions. ` +
|
||||||
|
`After the subagent returns, summarise the verdict and any blocking findings in one short paragraph. ` +
|
||||||
|
`Do not proceed with implementation until the user acknowledges blocking findings.` +
|
||||||
|
diff;
|
||||||
|
|
||||||
|
ctx.ui.notify("Dispatching rubber-duck review…", "info");
|
||||||
|
try {
|
||||||
|
await ctx.sendMessage?.(reviewPrompt);
|
||||||
} catch {
|
} catch {
|
||||||
ctx.ui.notify(
|
ctx.ui.notify(
|
||||||
"Could not start rubber-duck session. Try typing your review request directly.",
|
"Could not dispatch rubber-duck. Try: subagent agent=rubber-duck task='review current changes'",
|
||||||
"warning",
|
"warning",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -741,6 +741,66 @@ export class SFDashboardOverlay {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// UOK Health section — aligns with headless status output
|
||||||
|
if (this.uokDiagnostics && this.uokDiagnostics.issues.length > 0) {
|
||||||
|
lines.push(blank());
|
||||||
|
lines.push(hr());
|
||||||
|
lines.push(row(th.fg("text", th.bold("UOK Health"))));
|
||||||
|
lines.push(blank());
|
||||||
|
// Compact summary line matching headless format
|
||||||
|
lines.push(
|
||||||
|
row(
|
||||||
|
th.fg(
|
||||||
|
this.uokDiagnostics.verdict === "degraded"
|
||||||
|
? "error"
|
||||||
|
: this.uokDiagnostics.verdict === "attention"
|
||||||
|
? "warning"
|
||||||
|
: "dim",
|
||||||
|
`Verdict: ${this.uokDiagnostics.verdict} (${this.uokDiagnostics.classification})`,
|
||||||
|
),
|
||||||
|
),
|
||||||
|
);
|
||||||
|
lines.push(blank());
|
||||||
|
// Issue list
|
||||||
|
for (const issue of this.uokDiagnostics.issues) {
|
||||||
|
const icon =
|
||||||
|
issue.severity === "error"
|
||||||
|
? th.fg("error", "✗")
|
||||||
|
: th.fg("warning", "⚠");
|
||||||
|
lines.push(row(` ${icon} ${th.fg("text", issue.code)}`));
|
||||||
|
lines.push(row(th.fg("dim", ` ${issue.message}`)));
|
||||||
|
}
|
||||||
|
// Recommendations
|
||||||
|
if (this.uokDiagnostics.recommendations.length > 0) {
|
||||||
|
lines.push(blank());
|
||||||
|
for (const rec of this.uokDiagnostics.recommendations) {
|
||||||
|
lines.push(row(th.fg("dim", ` → ${rec}`)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Signals table
|
||||||
|
if (this.uokDiagnostics.signals) {
|
||||||
|
lines.push(blank());
|
||||||
|
lines.push(row(th.fg("dim", "Signals:")));
|
||||||
|
for (const [key, value] of Object.entries(
|
||||||
|
this.uokDiagnostics.signals,
|
||||||
|
)) {
|
||||||
|
const signalColor =
|
||||||
|
value === "ok" ||
|
||||||
|
value === "active" ||
|
||||||
|
value === "consistent" ||
|
||||||
|
value === "clear"
|
||||||
|
? "success"
|
||||||
|
: value === "unknown"
|
||||||
|
? "dim"
|
||||||
|
: "warning";
|
||||||
|
lines.push(
|
||||||
|
row(
|
||||||
|
` ${th.fg(signalColor, "●")} ${th.fg("text", key)}: ${th.fg(signalColor, String(value))}`,
|
||||||
|
),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
// Environment health section (#1221) — only show issues
|
// Environment health section (#1221) — only show issues
|
||||||
const envResults = runEnvironmentChecks(
|
const envResults = runEnvironmentChecks(
|
||||||
this.dashData.basePath || process.cwd(),
|
this.dashData.basePath || process.cwd(),
|
||||||
|
|
|
||||||
|
|
@ -31,18 +31,12 @@ export const EXPERIMENTAL_FLAGS = {
|
||||||
"STATUS_LINE — run a user-defined script to feed a custom footer status chip",
|
"STATUS_LINE — run a user-defined script to feed a custom footer status chip",
|
||||||
show_file:
|
show_file:
|
||||||
"SHOW_FILE — show_file tool renders code snippets inline in the timeline",
|
"SHOW_FILE — show_file tool renders code snippets inline in the timeline",
|
||||||
ask_elicitation:
|
|
||||||
"ASK_USER_ELICITATION — structured form/select UI replaces plain ask_user",
|
|
||||||
multi_turn_agents:
|
|
||||||
"MULTI_TURN_AGENTS — persistent subagents that accept follow-up messages",
|
|
||||||
extensions:
|
extensions:
|
||||||
"EXTENSIONS — user-installable extensions via marketplace npm install",
|
"EXTENSIONS — user-installable extensions via marketplace npm install",
|
||||||
configure_agent:
|
configure_agent:
|
||||||
"CONFIGURE_COPILOT_AGENT — interactive wizard for MCP servers and agents",
|
"CONFIGURE_COPILOT_AGENT — interactive wizard for MCP servers and agents",
|
||||||
background_sessions:
|
background_sessions:
|
||||||
"BACKGROUND_SESSIONS — concurrent sessions with background switching",
|
"BACKGROUND_SESSIONS — concurrent sessions with background switching",
|
||||||
rubber_duck:
|
|
||||||
"RUBBER_DUCK — constructive feedback subagent on code and designs",
|
|
||||||
prompt_frame:
|
prompt_frame:
|
||||||
"PROMPT_FRAME — decorative border rendered above the input prompt",
|
"PROMPT_FRAME — decorative border rendered above the input prompt",
|
||||||
streamer_mode:
|
streamer_mode:
|
||||||
|
|
|
||||||
|
|
@ -107,6 +107,8 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 30,
|
speed: 30,
|
||||||
longContext: 80,
|
longContext: 80,
|
||||||
instruction: 90,
|
instruction: 90,
|
||||||
|
// Agentic: Claude Opus is built around extended tool-use loops.
|
||||||
|
agentic: 95,
|
||||||
},
|
},
|
||||||
"claude-sonnet-4-6": {
|
"claude-sonnet-4-6": {
|
||||||
coding: 85,
|
coding: 85,
|
||||||
|
|
@ -116,6 +118,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 60,
|
speed: 60,
|
||||||
longContext: 75,
|
longContext: 75,
|
||||||
instruction: 85,
|
instruction: 85,
|
||||||
|
agentic: 92,
|
||||||
},
|
},
|
||||||
"claude-sonnet-4-5-20250514": {
|
"claude-sonnet-4-5-20250514": {
|
||||||
coding: 85,
|
coding: 85,
|
||||||
|
|
@ -125,6 +128,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 60,
|
speed: 60,
|
||||||
longContext: 75,
|
longContext: 75,
|
||||||
instruction: 85,
|
instruction: 85,
|
||||||
|
agentic: 90,
|
||||||
},
|
},
|
||||||
"claude-3-5-sonnet-latest": {
|
"claude-3-5-sonnet-latest": {
|
||||||
coding: 82,
|
coding: 82,
|
||||||
|
|
@ -134,6 +138,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 62,
|
speed: 62,
|
||||||
longContext: 70,
|
longContext: 70,
|
||||||
instruction: 82,
|
instruction: 82,
|
||||||
|
agentic: 85,
|
||||||
},
|
},
|
||||||
"claude-haiku-4-5": {
|
"claude-haiku-4-5": {
|
||||||
coding: 60,
|
coding: 60,
|
||||||
|
|
@ -143,6 +148,9 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 95,
|
speed: 95,
|
||||||
longContext: 50,
|
longContext: 50,
|
||||||
instruction: 75,
|
instruction: 75,
|
||||||
|
// Haiku follows tool-use contracts but is less reliable than Sonnet on
|
||||||
|
// long agentic loops.
|
||||||
|
agentic: 75,
|
||||||
},
|
},
|
||||||
"claude-3-5-haiku-latest": {
|
"claude-3-5-haiku-latest": {
|
||||||
coding: 60,
|
coding: 60,
|
||||||
|
|
@ -152,6 +160,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 95,
|
speed: 95,
|
||||||
longContext: 50,
|
longContext: 50,
|
||||||
instruction: 75,
|
instruction: 75,
|
||||||
|
agentic: 75,
|
||||||
},
|
},
|
||||||
"claude-3-haiku-20240307": {
|
"claude-3-haiku-20240307": {
|
||||||
coding: 50,
|
coding: 50,
|
||||||
|
|
@ -163,6 +172,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
instruction: 65,
|
instruction: 65,
|
||||||
},
|
},
|
||||||
"claude-3-opus-latest": {
|
"claude-3-opus-latest": {
|
||||||
|
agentic: 88,
|
||||||
coding: 90,
|
coding: 90,
|
||||||
debugging: 85,
|
debugging: 85,
|
||||||
research: 82,
|
research: 82,
|
||||||
|
|
@ -234,6 +244,8 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 40,
|
speed: 40,
|
||||||
longContext: 85,
|
longContext: 85,
|
||||||
instruction: 90,
|
instruction: 90,
|
||||||
|
// GPT-5 family is strongly agentic per OpenAI's tool-use evals.
|
||||||
|
agentic: 92,
|
||||||
},
|
},
|
||||||
"gpt-5-mini": {
|
"gpt-5-mini": {
|
||||||
coding: 62,
|
coding: 62,
|
||||||
|
|
@ -261,6 +273,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 35,
|
speed: 35,
|
||||||
longContext: 88,
|
longContext: 88,
|
||||||
instruction: 92,
|
instruction: 92,
|
||||||
|
agentic: 94,
|
||||||
},
|
},
|
||||||
"gpt-5.1": {
|
"gpt-5.1": {
|
||||||
coding: 93,
|
coding: 93,
|
||||||
|
|
@ -270,6 +283,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 42,
|
speed: 42,
|
||||||
longContext: 86,
|
longContext: 86,
|
||||||
instruction: 91,
|
instruction: 91,
|
||||||
|
agentic: 92,
|
||||||
},
|
},
|
||||||
"gpt-5.1-codex-max": {
|
"gpt-5.1-codex-max": {
|
||||||
coding: 90,
|
coding: 90,
|
||||||
|
|
@ -279,6 +293,9 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 55,
|
speed: 55,
|
||||||
longContext: 75,
|
longContext: 75,
|
||||||
instruction: 85,
|
instruction: 85,
|
||||||
|
// Codex-tuned models are agentic-capable but not as reliable as the
|
||||||
|
// flagship gpt-5/5.x lineup for long tool-use loops.
|
||||||
|
agentic: 80,
|
||||||
},
|
},
|
||||||
"gpt-5.1-codex-mini": {
|
"gpt-5.1-codex-mini": {
|
||||||
coding: 65,
|
coding: 65,
|
||||||
|
|
@ -288,6 +305,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 88,
|
speed: 88,
|
||||||
longContext: 48,
|
longContext: 48,
|
||||||
instruction: 72,
|
instruction: 72,
|
||||||
|
agentic: 55,
|
||||||
},
|
},
|
||||||
"gpt-5.2": {
|
"gpt-5.2": {
|
||||||
coding: 93,
|
coding: 93,
|
||||||
|
|
@ -297,6 +315,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 42,
|
speed: 42,
|
||||||
longContext: 87,
|
longContext: 87,
|
||||||
instruction: 91,
|
instruction: 91,
|
||||||
|
agentic: 92,
|
||||||
},
|
},
|
||||||
"gpt-5.2-codex": {
|
"gpt-5.2-codex": {
|
||||||
coding: 93,
|
coding: 93,
|
||||||
|
|
@ -306,6 +325,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 50,
|
speed: 50,
|
||||||
longContext: 78,
|
longContext: 78,
|
||||||
instruction: 88,
|
instruction: 88,
|
||||||
|
agentic: 82,
|
||||||
},
|
},
|
||||||
"gpt-5.3-codex": {
|
"gpt-5.3-codex": {
|
||||||
coding: 94,
|
coding: 94,
|
||||||
|
|
@ -315,6 +335,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 50,
|
speed: 50,
|
||||||
longContext: 80,
|
longContext: 80,
|
||||||
instruction: 89,
|
instruction: 89,
|
||||||
|
agentic: 84,
|
||||||
},
|
},
|
||||||
"gpt-5.3-codex-spark": {
|
"gpt-5.3-codex-spark": {
|
||||||
coding: 68,
|
coding: 68,
|
||||||
|
|
@ -324,6 +345,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 90,
|
speed: 90,
|
||||||
longContext: 50,
|
longContext: 50,
|
||||||
instruction: 74,
|
instruction: 74,
|
||||||
|
agentic: 55,
|
||||||
},
|
},
|
||||||
"gpt-5.4": {
|
"gpt-5.4": {
|
||||||
coding: 95,
|
coding: 95,
|
||||||
|
|
@ -333,6 +355,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 42,
|
speed: 42,
|
||||||
longContext: 88,
|
longContext: 88,
|
||||||
instruction: 92,
|
instruction: 92,
|
||||||
|
agentic: 94,
|
||||||
},
|
},
|
||||||
"gpt-5.4-mini": {
|
"gpt-5.4-mini": {
|
||||||
coding: 80,
|
coding: 80,
|
||||||
|
|
@ -342,6 +365,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 72,
|
speed: 72,
|
||||||
longContext: 72,
|
longContext: 72,
|
||||||
instruction: 80,
|
instruction: 80,
|
||||||
|
agentic: 80,
|
||||||
},
|
},
|
||||||
// GPT-5.5 scores are relative to the existing gpt-5.4 profile and backed by
|
// GPT-5.5 scores are relative to the existing gpt-5.4 profile and backed by
|
||||||
// OpenAI's 2026-04-23 published eval deltas across coding, tool use, and long context.
|
// OpenAI's 2026-04-23 published eval deltas across coding, tool use, and long context.
|
||||||
|
|
@ -354,6 +378,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 42,
|
speed: 42,
|
||||||
longContext: 90,
|
longContext: 90,
|
||||||
instruction: 93,
|
instruction: 93,
|
||||||
|
agentic: 95,
|
||||||
},
|
},
|
||||||
// ── OpenAI o-series (reasoning-first) ──────────────────────────────────────
|
// ── OpenAI o-series (reasoning-first) ──────────────────────────────────────
|
||||||
o1: {
|
o1: {
|
||||||
|
|
@ -410,6 +435,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 48,
|
speed: 48,
|
||||||
longContext: 98,
|
longContext: 98,
|
||||||
instruction: 82,
|
instruction: 82,
|
||||||
|
agentic: 85,
|
||||||
},
|
},
|
||||||
"gemini-3-pro-preview": {
|
"gemini-3-pro-preview": {
|
||||||
coding: 82,
|
coding: 82,
|
||||||
|
|
@ -419,6 +445,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 50,
|
speed: 50,
|
||||||
longContext: 96,
|
longContext: 96,
|
||||||
instruction: 82,
|
instruction: 82,
|
||||||
|
agentic: 85,
|
||||||
},
|
},
|
||||||
"gemini-3-flash-preview": {
|
"gemini-3-flash-preview": {
|
||||||
coding: 62,
|
coding: 62,
|
||||||
|
|
@ -428,6 +455,10 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 88,
|
speed: 88,
|
||||||
longContext: 88,
|
longContext: 88,
|
||||||
instruction: 72,
|
instruction: 72,
|
||||||
|
// Gemini Flash follows tool contracts but is occasionally chatty in
|
||||||
|
// agentic loops; mid-tier so it doesn't dominate execute-task vs
|
||||||
|
// a Sonnet/Opus/K2.6 alternative.
|
||||||
|
agentic: 70,
|
||||||
},
|
},
|
||||||
"gemini-3.1-flash-lite-preview": {
|
"gemini-3.1-flash-lite-preview": {
|
||||||
coding: 55,
|
coding: 55,
|
||||||
|
|
@ -583,6 +614,10 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 70,
|
speed: 70,
|
||||||
longContext: 60,
|
longContext: 60,
|
||||||
instruction: 80,
|
instruction: 80,
|
||||||
|
// Agentic: code-completion tuning. Refuses agentic tasks with "I'm sorry,
|
||||||
|
// I don't have the necessary tools" (M001-6377a4/S04/T02, 2026-05-12).
|
||||||
|
// Should not be routed to execute-task without explicit operator pin.
|
||||||
|
agentic: 25,
|
||||||
},
|
},
|
||||||
"ministral-8b-latest": {
|
"ministral-8b-latest": {
|
||||||
coding: 55,
|
coding: 55,
|
||||||
|
|
@ -655,6 +690,9 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 65,
|
speed: 65,
|
||||||
longContext: 65,
|
longContext: 65,
|
||||||
instruction: 80,
|
instruction: 80,
|
||||||
|
// Agentic: Devstral series is coding-completion-tuned; tool-use is not
|
||||||
|
// the design target. Penalize so execute-task routing avoids it.
|
||||||
|
agentic: 30,
|
||||||
},
|
},
|
||||||
"devstral-medium-latest": {
|
"devstral-medium-latest": {
|
||||||
coding: 78,
|
coding: 78,
|
||||||
|
|
@ -664,6 +702,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 75,
|
speed: 75,
|
||||||
longContext: 60,
|
longContext: 60,
|
||||||
instruction: 75,
|
instruction: 75,
|
||||||
|
agentic: 30,
|
||||||
},
|
},
|
||||||
"devstral-medium-2507": {
|
"devstral-medium-2507": {
|
||||||
coding: 78,
|
coding: 78,
|
||||||
|
|
@ -673,6 +712,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 75,
|
speed: 75,
|
||||||
longContext: 60,
|
longContext: 60,
|
||||||
instruction: 75,
|
instruction: 75,
|
||||||
|
agentic: 30,
|
||||||
},
|
},
|
||||||
"devstral-small-2505": {
|
"devstral-small-2505": {
|
||||||
coding: 60,
|
coding: 60,
|
||||||
|
|
@ -682,6 +722,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 90,
|
speed: 90,
|
||||||
longContext: 45,
|
longContext: 45,
|
||||||
instruction: 65,
|
instruction: 65,
|
||||||
|
agentic: 30,
|
||||||
},
|
},
|
||||||
"devstral-small-2507": {
|
"devstral-small-2507": {
|
||||||
coding: 60,
|
coding: 60,
|
||||||
|
|
@ -691,6 +732,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 90,
|
speed: 90,
|
||||||
longContext: 45,
|
longContext: 45,
|
||||||
instruction: 65,
|
instruction: 65,
|
||||||
|
agentic: 30,
|
||||||
},
|
},
|
||||||
"labs-devstral-small-2512": {
|
"labs-devstral-small-2512": {
|
||||||
coding: 65,
|
coding: 65,
|
||||||
|
|
@ -700,6 +742,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 88,
|
speed: 88,
|
||||||
longContext: 60,
|
longContext: 60,
|
||||||
instruction: 68,
|
instruction: 68,
|
||||||
|
agentic: 30,
|
||||||
},
|
},
|
||||||
// ── Zhipu AI (GLM) ─────────────────────────────────────────────────────────
|
// ── Zhipu AI (GLM) ─────────────────────────────────────────────────────────
|
||||||
"glm-5": {
|
"glm-5": {
|
||||||
|
|
@ -774,6 +817,8 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 58,
|
speed: 58,
|
||||||
longContext: 86,
|
longContext: 86,
|
||||||
instruction: 78,
|
instruction: 78,
|
||||||
|
// Agentic: qwen3-coder is tuned for code completion, not tool-use loops.
|
||||||
|
agentic: 40,
|
||||||
},
|
},
|
||||||
"qwen3-coder-next": {
|
"qwen3-coder-next": {
|
||||||
coding: 82,
|
coding: 82,
|
||||||
|
|
@ -783,6 +828,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 70,
|
speed: 70,
|
||||||
longContext: 86,
|
longContext: 86,
|
||||||
instruction: 76,
|
instruction: 76,
|
||||||
|
agentic: 40,
|
||||||
},
|
},
|
||||||
"qwen3-next:80b": {
|
"qwen3-next:80b": {
|
||||||
coding: 70,
|
coding: 70,
|
||||||
|
|
@ -802,6 +848,9 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 55,
|
speed: 55,
|
||||||
longContext: 86,
|
longContext: 86,
|
||||||
instruction: 84,
|
instruction: 84,
|
||||||
|
// Agentic: K2.6 is the pinned default for the autonomous-solver role
|
||||||
|
// (ADR-0079) — refusal-resistant and follows tool-use contracts.
|
||||||
|
agentic: 90,
|
||||||
},
|
},
|
||||||
"kimi-for-coding": {
|
"kimi-for-coding": {
|
||||||
coding: 88,
|
coding: 88,
|
||||||
|
|
@ -811,6 +860,9 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 55,
|
speed: 55,
|
||||||
longContext: 86,
|
longContext: 86,
|
||||||
instruction: 84,
|
instruction: 84,
|
||||||
|
// `kimi-for-coding` is an alias for K2.6 on the Kimi Code provider
|
||||||
|
// (memory: bayesian-blender/benchmark-selector both canonicalize it).
|
||||||
|
agentic: 90,
|
||||||
},
|
},
|
||||||
"kimi-k2-thinking": {
|
"kimi-k2-thinking": {
|
||||||
coding: 86,
|
coding: 86,
|
||||||
|
|
@ -820,8 +872,15 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 30,
|
speed: 30,
|
||||||
longContext: 86,
|
longContext: 86,
|
||||||
instruction: 84,
|
instruction: 84,
|
||||||
|
agentic: 88,
|
||||||
},
|
},
|
||||||
// ── MiniMax ───────────────────────────────────────────────────────────────
|
// ── MiniMax ───────────────────────────────────────────────────────────────
|
||||||
|
// Profiles ordered by generation. Older M2.1 generation gets distinctly
|
||||||
|
// lower agentic + capability scores: the M2.1 stuck-checkpoint loop on
|
||||||
|
// 2026-05-13 (infra repo) traced back to M2.1 being aliased to M2.7's
|
||||||
|
// profile, winning execute-task on cost, then failing to follow the
|
||||||
|
// checkpoint contract reliably across 60+ tool calls. (See
|
||||||
|
// self-feedback sf-mp37kjmo-1mfuru.)
|
||||||
"MiniMax-M2.7": {
|
"MiniMax-M2.7": {
|
||||||
coding: 84,
|
coding: 84,
|
||||||
debugging: 80,
|
debugging: 80,
|
||||||
|
|
@ -830,6 +889,7 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 52,
|
speed: 52,
|
||||||
longContext: 84,
|
longContext: 84,
|
||||||
instruction: 82,
|
instruction: 82,
|
||||||
|
agentic: 78,
|
||||||
},
|
},
|
||||||
"MiniMax-M2.7-highspeed": {
|
"MiniMax-M2.7-highspeed": {
|
||||||
coding: 82,
|
coding: 82,
|
||||||
|
|
@ -839,6 +899,47 @@ export const MODEL_CAPABILITY_PROFILES = {
|
||||||
speed: 72,
|
speed: 72,
|
||||||
longContext: 84,
|
longContext: 84,
|
||||||
instruction: 80,
|
instruction: 80,
|
||||||
|
agentic: 76,
|
||||||
|
},
|
||||||
|
"MiniMax-M2.5": {
|
||||||
|
// Distinct profile (previously aliased to M2.7 — overclaimed).
|
||||||
|
coding: 78,
|
||||||
|
debugging: 74,
|
||||||
|
research: 72,
|
||||||
|
reasoning: 78,
|
||||||
|
speed: 55,
|
||||||
|
longContext: 82,
|
||||||
|
instruction: 76,
|
||||||
|
// Mid agentic — better than coding-completion-only models but
|
||||||
|
// noticeably less reliable than current-gen agentic models.
|
||||||
|
agentic: 60,
|
||||||
|
},
|
||||||
|
"MiniMax-M2.1": {
|
||||||
|
// Distinct profile (previously aliased to M2.7 — overclaimed).
|
||||||
|
// M2.1 has demonstrated unreliable tool-use loops in production
|
||||||
|
// (M001-6377a4 / 1-ci-build-pipeline parallel-research, 2026-05-13:
|
||||||
|
// 60+ checkpoint calls with shifting unitId claims). Penalize the
|
||||||
|
// agentic axis so execute-task routing avoids it absent operator
|
||||||
|
// override.
|
||||||
|
coding: 72,
|
||||||
|
debugging: 66,
|
||||||
|
research: 64,
|
||||||
|
reasoning: 70,
|
||||||
|
speed: 60,
|
||||||
|
longContext: 78,
|
||||||
|
instruction: 72,
|
||||||
|
agentic: 40,
|
||||||
|
},
|
||||||
|
"MiniMax-M2": {
|
||||||
|
// Earliest of the M2.x line — older still.
|
||||||
|
coding: 68,
|
||||||
|
debugging: 60,
|
||||||
|
research: 60,
|
||||||
|
reasoning: 66,
|
||||||
|
speed: 62,
|
||||||
|
longContext: 76,
|
||||||
|
instruction: 68,
|
||||||
|
agentic: 35,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
const MODEL_CAPABILITY_ALIASES = {
|
const MODEL_CAPABILITY_ALIASES = {
|
||||||
|
|
@ -864,10 +965,23 @@ const MODEL_CAPABILITY_ALIASES = {
|
||||||
"kimi-for-coding": "kimi-k2.6",
|
"kimi-for-coding": "kimi-k2.6",
|
||||||
"kimi-k2.6:cloud": "kimi-k2.6",
|
"kimi-k2.6:cloud": "kimi-k2.6",
|
||||||
"kimi-k2.6-cloud": "kimi-k2.6",
|
"kimi-k2.6-cloud": "kimi-k2.6",
|
||||||
"minimax-m2": "MiniMax-M2.7",
|
// Each MiniMax generation now has its own profile — previously they all
|
||||||
"minimax-m2.1": "MiniMax-M2.7",
|
// aliased to MiniMax-M2.7, which let older/weaker models inherit current
|
||||||
"minimax-m2.5": "MiniMax-M2.7",
|
// capability scores and win cost tie-breaks on execute-task. The aliases
|
||||||
|
// below normalize provider-prefixed and casing variants to the canonical
|
||||||
|
// per-generation profile, NOT to the current generation.
|
||||||
|
"minimax-m2": "MiniMax-M2",
|
||||||
|
"minimax/MiniMax-M2": "MiniMax-M2",
|
||||||
|
"minimax/minimax-m2": "MiniMax-M2",
|
||||||
|
"minimax-m2.1": "MiniMax-M2.1",
|
||||||
|
"minimax/MiniMax-M2.1": "MiniMax-M2.1",
|
||||||
|
"minimax/minimax-m2.1": "MiniMax-M2.1",
|
||||||
|
"minimax-m2.5": "MiniMax-M2.5",
|
||||||
|
"minimax/MiniMax-M2.5": "MiniMax-M2.5",
|
||||||
|
"minimax/minimax-m2.5": "MiniMax-M2.5",
|
||||||
"minimax-m2.7": "MiniMax-M2.7",
|
"minimax-m2.7": "MiniMax-M2.7",
|
||||||
|
"minimax/MiniMax-M2.7": "MiniMax-M2.7",
|
||||||
|
"minimax/minimax-m2.7": "MiniMax-M2.7",
|
||||||
"mistral-large-3:675b": "mistral-large-latest",
|
"mistral-large-3:675b": "mistral-large-latest",
|
||||||
"ministral-3:3b": "ministral-3b-latest",
|
"ministral-3:3b": "ministral-3b-latest",
|
||||||
"ministral-3:8b": "ministral-8b-latest",
|
"ministral-3:8b": "ministral-8b-latest",
|
||||||
|
|
@ -888,18 +1002,32 @@ const MODEL_CAPABILITY_ALIASES = {
|
||||||
// ─── Base Task Requirements Data Table ───────────────────────────────────────
|
// ─── Base Task Requirements Data Table ───────────────────────────────────────
|
||||||
// Per-unit-type base requirement vectors. Weights indicate how important each
|
// Per-unit-type base requirement vectors. Weights indicate how important each
|
||||||
// capability dimension is for this unit type.
|
// capability dimension is for this unit type.
|
||||||
|
//
|
||||||
|
// The `agentic` dimension represents the model's reliability at multi-turn
|
||||||
|
// tool-use loops (does it follow the tool-use contract? does it refuse the
|
||||||
|
// task? does it call the checkpoint tool when asked?). It is weighted high
|
||||||
|
// for any unit type that actually uses tools at runtime — execute-task most
|
||||||
|
// of all. See ADR-0079 for the motivation: a Codestral-style refusal on
|
||||||
|
// execute-task in M001-6377a4/S04/T02 (2026-05-12) traced back to the router
|
||||||
|
// having no agentic axis, so a coding-completion model out-scored agentic
|
||||||
|
// alternatives on coding/instruction.
|
||||||
export const BASE_REQUIREMENTS = {
|
export const BASE_REQUIREMENTS = {
|
||||||
"execute-task": { coding: 0.9, instruction: 0.7, speed: 0.3 },
|
"execute-task": {
|
||||||
|
coding: 0.9,
|
||||||
|
instruction: 0.7,
|
||||||
|
speed: 0.3,
|
||||||
|
agentic: 0.85,
|
||||||
|
},
|
||||||
"research-milestone": { research: 0.9, longContext: 0.7, reasoning: 0.5 },
|
"research-milestone": { research: 0.9, longContext: 0.7, reasoning: 0.5 },
|
||||||
"research-slice": { research: 0.9, longContext: 0.7, reasoning: 0.5 },
|
"research-slice": { research: 0.9, longContext: 0.7, reasoning: 0.5 },
|
||||||
"plan-milestone": { reasoning: 0.9, coding: 0.5 },
|
"plan-milestone": { reasoning: 0.9, coding: 0.5, agentic: 0.6 },
|
||||||
"plan-slice": { reasoning: 0.9, coding: 0.5 },
|
"plan-slice": { reasoning: 0.9, coding: 0.5, agentic: 0.6 },
|
||||||
"replan-slice": { reasoning: 0.9, debugging: 0.6, coding: 0.5 },
|
"replan-slice": { reasoning: 0.9, debugging: 0.6, coding: 0.5, agentic: 0.6 },
|
||||||
"reassess-roadmap": { reasoning: 0.9, research: 0.5 },
|
"reassess-roadmap": { reasoning: 0.9, research: 0.5, agentic: 0.4 },
|
||||||
"complete-slice": { instruction: 0.8, speed: 0.7 },
|
"complete-slice": { instruction: 0.8, speed: 0.7, agentic: 0.6 },
|
||||||
"run-uat": { instruction: 0.7, speed: 0.8 },
|
"run-uat": { instruction: 0.7, speed: 0.8, agentic: 0.6 },
|
||||||
"discuss-milestone": { reasoning: 0.6, instruction: 0.7 },
|
"discuss-milestone": { reasoning: 0.6, instruction: 0.7, agentic: 0.4 },
|
||||||
"complete-milestone": { instruction: 0.8, reasoning: 0.5 },
|
"complete-milestone": { instruction: 0.8, reasoning: 0.5, agentic: 0.5 },
|
||||||
};
|
};
|
||||||
// ─── Public API ──────────────────────────────────────────────────────────────
|
// ─── Public API ──────────────────────────────────────────────────────────────
|
||||||
/**
|
/**
|
||||||
|
|
@ -1101,6 +1229,7 @@ export function resolveModelForComplexity(
|
||||||
unitType,
|
unitType,
|
||||||
taskMetadata,
|
taskMetadata,
|
||||||
capabilityOverrides,
|
capabilityOverrides,
|
||||||
|
stickyHint,
|
||||||
) {
|
) {
|
||||||
// If no phase config or routing disabled, pass through
|
// If no phase config or routing disabled, pass through
|
||||||
if (!phaseConfig || !routingConfig.enabled) {
|
if (!phaseConfig || !routingConfig.enabled) {
|
||||||
|
|
@ -1175,16 +1304,41 @@ export function resolveModelForComplexity(
|
||||||
if (winner) {
|
if (winner) {
|
||||||
const capScores = {};
|
const capScores = {};
|
||||||
for (const s of scored) capScores[s.modelId] = s.score;
|
for (const s of scored) capScores[s.modelId] = s.score;
|
||||||
const fallbacks = buildFallbackChain(winner.modelId, phaseConfig);
|
// Slice-sticky preference: if a model previously succeeded on a
|
||||||
|
// sibling unit in this slice AND it is still eligible in the
|
||||||
|
// current tier AND its capability score is within STICKY_WINDOW of
|
||||||
|
// the winner, prefer it. Stops within-slice routing thrash where
|
||||||
|
// T01 → gemini-flash and T02 → codestral on the same slice.
|
||||||
|
const STICKY_WINDOW_POINTS = 8;
|
||||||
|
const stickyId = (() => {
|
||||||
|
if (!stickyHint?.id) return null;
|
||||||
|
const stickyKey = stickyHint.provider
|
||||||
|
? `${stickyHint.provider}/${stickyHint.id}`
|
||||||
|
: stickyHint.id;
|
||||||
|
// Match either "provider/model" or bare model id in the eligible list.
|
||||||
|
const found = scored.find(
|
||||||
|
(s) => s.modelId === stickyKey || s.modelId.endsWith(`/${stickyHint.id}`),
|
||||||
|
);
|
||||||
|
if (!found) return null;
|
||||||
|
if (winner.score - found.score > STICKY_WINDOW_POINTS) return null;
|
||||||
|
return found.modelId;
|
||||||
|
})();
|
||||||
|
const selectedId = stickyId ?? winner.modelId;
|
||||||
|
const selectedScore = (
|
||||||
|
scored.find((s) => s.modelId === selectedId) ?? winner
|
||||||
|
).score;
|
||||||
|
const fallbacks = buildFallbackChain(selectedId, phaseConfig);
|
||||||
return {
|
return {
|
||||||
modelId: winner.modelId,
|
modelId: selectedId,
|
||||||
fallbacks,
|
fallbacks,
|
||||||
tier: requestedTier,
|
tier: requestedTier,
|
||||||
wasDowngraded: true,
|
wasDowngraded: true,
|
||||||
reason: `capability-scored: ${winner.modelId} (${winner.score.toFixed(1)}) for ${unitType}`,
|
reason: stickyId
|
||||||
|
? `slice-sticky: ${selectedId} (${selectedScore.toFixed(1)}, within ${STICKY_WINDOW_POINTS}pt of capability winner) for ${unitType}`
|
||||||
|
: `capability-scored: ${selectedId} (${selectedScore.toFixed(1)}) for ${unitType}`,
|
||||||
capabilityScores: capScores,
|
capabilityScores: capScores,
|
||||||
taskRequirements: requirements,
|
taskRequirements: requirements,
|
||||||
selectionMethod: "capability-scored",
|
selectionMethod: stickyId ? "slice-sticky" : "capability-scored",
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -137,6 +137,11 @@ export function reorderForCaching(prompt) {
|
||||||
* static+semi-static prefix can be marked with cache_control: ephemeral on
|
* static+semi-static prefix can be marked with cache_control: ephemeral on
|
||||||
* Anthropic-compatible providers.
|
* Anthropic-compatible providers.
|
||||||
*
|
*
|
||||||
|
* Purpose: keep SF autonomous prompt prefixes byte-stable across adjacent task
|
||||||
|
* dispatches so provider prompt caches can reuse expensive context.
|
||||||
|
*
|
||||||
|
* Consumer: auto/phases-unit.js before runUnit dispatches an autonomous unit.
|
||||||
|
*
|
||||||
* Returns `{before: string, after: string}` where:
|
* Returns `{before: string, after: string}` where:
|
||||||
* - `before` = preamble + all static + all semi-static sections (cache this)
|
* - `before` = preamble + all static + all semi-static sections (cache this)
|
||||||
* - `after` = all dynamic sections (do not cache)
|
* - `after` = all dynamic sections (do not cache)
|
||||||
|
|
|
||||||
|
|
@ -596,3 +596,103 @@ function isPidAlive(pid) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Exported liveness probe for a session PID.
 *
 * Coerces `pid` with Number() and hands the result to the module-private
 * isPidAlive check, so callers outside this module can ask the same question
 * without reaching into the helper directly.
 *
 * Consumer: auto-start's prompt-to-kill flow, which must decide whether the
 * existingPid from acquireSessionLock's failure result is still running
 * before offering to terminate it.
 *
 * @param {number|string} pid - PID to probe; coerced with Number().
 * @returns {*} Whatever isPidAlive reports for the coerced PID.
 */
export function isSessionPidAlive(pid) {
  const numericPid = Number(pid);
  return isPidAlive(numericPid);
}
|
||||||
|
|
||||||
|
/**
 * Coerce a caller-supplied timing option to milliseconds.
 *
 * A missing or non-numeric value (NaN after coercion) falls back to
 * `fallbackMs`; the result is then clamped to at least `minMs`.
 */
function resolveTerminationDelayMs(value, fallbackMs, minMs) {
  const ms = Number(value ?? fallbackMs);
  return Math.max(minMs, Number.isNaN(ms) ? fallbackMs : ms);
}

/**
 * Send `signal` to `pid` and classify the outcome.
 *
 * Returns "sent" on success, "gone" when the process no longer exists
 * (ESRCH), and "denied" when we are not allowed to signal it (EPERM).
 * Any other error is rethrown unchanged.
 */
function signalSessionPid(pid, signal) {
  try {
    process.kill(pid, signal);
    return "sent";
  } catch (err) {
    if (err?.code === "ESRCH") return "gone";
    if (err?.code === "EPERM") return "denied";
    throw err;
  }
}

/**
 * Poll isPidAlive until the PID is gone or `timeoutMs` has elapsed.
 * Resolves true as soon as the PID is observed dead, false on timeout.
 */
async function waitForSessionPidExit(pid, timeoutMs, pollIntervalMs) {
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    if (!isPidAlive(pid)) return true;
    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
  }
  return false;
}

/**
 * Terminate an existing SF auto session by PID.
 *
 * Why: when acquireSessionLock reports `{ acquired: false, existingPid }`
 * because another SF process is holding the lock, we want a one-call helper
 * that an interactive caller can invoke after confirming with the user. The
 * helper sends SIGTERM, polls for the process to exit, escalates to SIGKILL
 * after the grace window, and waits a short tail for the kernel to reap the
 * PID so a subsequent acquireSessionLock retry sees a dead PID and proceeds
 * down the stale-lock recovery path.
 *
 * Returns `{ terminated: boolean, escalated: boolean, alreadyDead: boolean }`.
 * `terminated` is true iff the PID is no longer alive when the call returns.
 * `escalated` is true iff SIGKILL was needed because SIGTERM did not produce
 * an exit within `gracePeriodMs`.
 * A PID that is not a positive integer yields
 * `{ terminated: false, escalated: false, alreadyDead: true }`; our own PID
 * and PIDs we lack permission to signal yield `terminated: false`.
 *
 * Timing options that are missing or not numeric fall back to their defaults
 * instead of collapsing the grace window (which would SIGKILL immediately) or
 * the poll interval.
 *
 * Consumer: auto-start's prompt-to-kill flow. Not part of the normal
 * autonomous loop — only invoked after explicit operator consent.
 *
 * @param {number} pid - The PID to terminate.
 * @param {object} [options]
 * @param {number} [options.gracePeriodMs=5000] - How long to wait between
 *   SIGTERM and SIGKILL.
 * @param {number} [options.reapWaitMs=1000] - How long to wait after the
 *   final kill signal for the kernel to reap.
 * @param {number} [options.pollIntervalMs=100] - Poll interval used while
 *   waiting for exit (floored at 50 ms).
 * @returns {Promise<{ terminated: boolean, escalated: boolean, alreadyDead: boolean }>}
 * @throws Rethrows any process.kill error other than ESRCH / EPERM.
 */
export async function terminateExistingSession(pid, options = {}) {
  const numericPid = Number(pid);
  if (!Number.isInteger(numericPid) || numericPid <= 0) {
    return { terminated: false, escalated: false, alreadyDead: true };
  }
  if (numericPid === process.pid) {
    // Refuse to terminate ourselves — would deadlock the caller.
    return { terminated: false, escalated: false, alreadyDead: false };
  }
  if (!isPidAlive(numericPid)) {
    return { terminated: true, escalated: false, alreadyDead: true };
  }

  const gracePeriodMs = resolveTerminationDelayMs(options?.gracePeriodMs, 5000, 0);
  const reapWaitMs = resolveTerminationDelayMs(options?.reapWaitMs, 1000, 0);
  const pollIntervalMs = resolveTerminationDelayMs(options?.pollIntervalMs, 100, 50);

  const termOutcome = signalSessionPid(numericPid, "SIGTERM");
  if (termOutcome === "gone") {
    // Process exited between the alive check and the kill.
    return { terminated: true, escalated: false, alreadyDead: true };
  }
  if (termOutcome === "denied") {
    // Not ours to kill — surface as not-terminated.
    return { terminated: false, escalated: false, alreadyDead: false };
  }

  if (await waitForSessionPidExit(numericPid, gracePeriodMs, pollIntervalMs)) {
    return { terminated: true, escalated: false, alreadyDead: false };
  }

  // Grace expired — escalate to SIGKILL.
  const killOutcome = signalSessionPid(numericPid, "SIGKILL");
  if (killOutcome === "gone") {
    return { terminated: true, escalated: true, alreadyDead: false };
  }
  if (killOutcome === "denied") {
    return { terminated: false, escalated: true, alreadyDead: false };
  }

  if (await waitForSessionPidExit(numericPid, reapWaitMs, pollIntervalMs)) {
    return { terminated: true, escalated: true, alreadyDead: false };
  }
  // Reap window elapsed: report whatever a final liveness probe says.
  return {
    terminated: !isPidAlive(numericPid),
    escalated: true,
    alreadyDead: false,
  };
}
|
||||||
|
|
|
||||||
154
src/resources/extensions/sf/slice-routing-cache.js
Normal file
154
src/resources/extensions/sf/slice-routing-cache.js
Normal file
|
|
@ -0,0 +1,154 @@
|
||||||
|
/**
|
||||||
|
* slice-routing-cache.js — per-slice sticky-model routing cache.
|
||||||
|
*
|
||||||
|
* Why: model routing is currently computed per-unit, so the executor can flip
|
||||||
|
* between models within a single slice (M001-6377a4/S04 routed T01 to
|
||||||
|
* gemini-3-flash-preview, then T02 to codestral-latest — the second was
|
||||||
|
* unfit and refused the task, see ADR-0079). Once a model has successfully
|
||||||
|
* completed work on a slice, prefer it for the slice's sibling units unless
|
||||||
|
* a hard mismatch forces a switch.
|
||||||
|
*
|
||||||
|
* Contract:
|
||||||
|
* - Cache is small JSON keyed by sliceId. Each entry stores provider/id and
|
||||||
|
* timestamps so stale entries can be aged out.
|
||||||
|
* - Best-effort: read/write errors are swallowed; routing always has a
|
||||||
|
* fallback through the capability scorer.
|
||||||
|
* - Only successful outcomes (`continue` or `complete`) write to the cache.
|
||||||
|
* Refusal/blocker outcomes clear the entry so a failing model does not
|
||||||
|
* re-attach to the slice.
|
||||||
|
*
|
||||||
|
* Consumer: auto-model-selection.js reads before calling
|
||||||
|
* resolveModelForComplexity; auto/phases-unit.js writes after a successful
|
||||||
|
* checkpoint and clears on `executor-refused`.
|
||||||
|
*/
|
||||||
|
import { existsSync, mkdirSync, readFileSync, unlinkSync } from "node:fs";
|
||||||
|
import { dirname, join } from "node:path";
|
||||||
|
import { atomicWriteSync } from "./atomic-write.js";
|
||||||
|
import { sfRuntimeRoot } from "./paths.js";
|
||||||
|
|
||||||
|
const CACHE_FILE = "slice-routing.json";
|
||||||
|
const DEFAULT_MAX_AGE_MS = 7 * 24 * 60 * 60 * 1000; // 7 days
|
||||||
|
|
||||||
|
/**
 * Location of the slice-routing cache file for a project: CACHE_FILE inside
 * the directory returned by sfRuntimeRoot(basePath).
 */
function cachePath(basePath) {
  const runtimeRoot = sfRuntimeRoot(basePath);
  return join(runtimeRoot, CACHE_FILE);
}
|
||||||
|
|
||||||
|
/**
 * Derive the slice scope key from a unit id.
 *
 * The id is split on "/" and empty segments are dropped, then:
 *   - "<milestoneId>/<sliceId>/<taskId>" → "<milestoneId>/<sliceId>"
 *   - "<milestoneId>/<sliceId>"          → "<milestoneId>/<sliceId>"
 *   - "<milestoneId>"                    → "<milestoneId>" (no slice segment)
 *
 * @param {string} unitId
 * @returns {string|null} The scope key, or null when `unitId` is not a
 *   non-empty string or contains no non-empty segment.
 */
export function extractSliceScope(unitId) {
  if (typeof unitId !== "string" || unitId === "") return null;
  const [milestoneId, sliceId] = unitId.split("/").filter(Boolean);
  if (milestoneId === undefined) return null;
  // A lone segment is a milestone-level unit; return it unchanged.
  return sliceId === undefined ? milestoneId : `${milestoneId}/${sliceId}`;
}
|
||||||
|
|
||||||
|
/**
 * Load the slice-routing cache from disk.
 *
 * Best-effort: a missing file, a read failure, malformed JSON, or JSON whose
 * top-level value is not a plain object (null, array, string, number) all
 * yield an empty cache, so callers can always index the result by slice id.
 *
 * @param {string} basePath - Project base path used to locate the cache file.
 * @returns {object} The parsed cache map, or `{}` when nothing usable exists.
 */
function readCache(basePath) {
  const path = cachePath(basePath);
  if (!existsSync(path)) return {};
  try {
    const parsed = JSON.parse(readFileSync(path, "utf-8"));
    // Valid JSON is not necessarily a map: `null` would make `data[sliceId]`
    // throw in callers, and arrays/primitives are not a usable cache either.
    const isPlainObject =
      parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
    return isPlainObject ? parsed : {};
  } catch {
    // Unreadable or corrupt cache — treat as empty rather than failing routing.
    return {};
  }
}
|
||||||
|
|
||||||
|
/**
 * Persist the slice-routing cache as pretty-printed JSON.
 *
 * Creates the parent directory if needed, then writes through
 * atomicWriteSync. Any failure along the way is deliberately ignored: the
 * cache is an optimisation and routing works without it.
 */
function writeCache(basePath, data) {
  const target = cachePath(basePath);
  try {
    const parentDir = dirname(target);
    mkdirSync(parentDir, { recursive: true });
    const payload = JSON.stringify(data, null, 2);
    atomicWriteSync(target, payload);
  } catch {
    // best-effort
  }
}
|
||||||
|
|
||||||
|
/**
 * Remember which model handled a unit, keyed by the unit's slice scope.
 *
 * The entry stores the model's provider and id, the current ISO timestamp,
 * and the unit type / unit id that produced it. Later lookups for units in
 * the same slice scope will return this model as the sticky hint. Does
 * nothing when `basePath` is falsy, the model has no id, or no scope can be
 * derived from `unitId`.
 *
 * @param {string} basePath
 * @param {string} unitType
 * @param {string} unitId
 * @param {{ provider?: string, id: string }} model
 * @returns {void}
 */
export function recordSliceRouting(basePath, unitType, unitId, model) {
  if (!(basePath && model?.id)) return;
  const scope = extractSliceScope(unitId);
  if (!scope) return;

  const cache = readCache(basePath);
  const entry = {
    provider: String(model.provider ?? ""),
    id: String(model.id),
    ts: new Date().toISOString(),
    lastUnitType: String(unitType ?? ""),
    lastUnitId: String(unitId ?? ""),
  };
  cache[scope] = entry;
  writeCache(basePath, cache);
}
|
||||||
|
|
||||||
|
/**
 * Fetch the sticky model recorded for the slice scope of this unit.
 *
 * Yields null when `basePath` is falsy, when no scope can be derived from
 * `unitId`, when the cache holds no entry with an id for that scope, or when
 * the entry carries a parseable timestamp older than `maxAgeMs`. Entries
 * without a timestamp, or with one that does not parse, are not aged out.
 *
 * @param {string} basePath
 * @param {string} unitType - Currently unused by the lookup.
 * @param {string} unitId
 * @param {object} [options]
 * @param {number} [options.maxAgeMs=7d]
 * @returns {{ provider: string, id: string } | null}
 */
export function readStickyModelForUnit(basePath, unitType, unitId, options = {}) {
  if (!basePath) return null;
  const scope = extractSliceScope(unitId);
  if (!scope) return null;

  const cached = readCache(basePath)[scope];
  if (!cached?.id) return null;

  const maxAgeMs = Number(options.maxAgeMs ?? DEFAULT_MAX_AGE_MS);
  if (cached.ts) {
    const ageMs = Date.now() - new Date(cached.ts).getTime();
    const expired = Number.isFinite(ageMs) && ageMs > maxAgeMs;
    if (expired) return null;
  }

  const { provider, id } = cached;
  return { provider: String(provider ?? ""), id: String(id) };
}
|
||||||
|
|
||||||
|
/**
 * Drop the sticky entry for the slice scope of this unit.
 *
 * Intended for the case where the model attached to the slice refuses or
 * hits a hard mismatch, so the next dispatch goes back through the
 * capability scorer instead of re-pinning the broken model. When the removed
 * entry was the last one, the cache file itself is deleted (best-effort);
 * otherwise the remaining entries are written back.
 *
 * @param {string} basePath
 * @param {string} unitId
 * @returns {void}
 */
export function clearSliceRoutingForUnit(basePath, unitId) {
  if (!basePath) return;
  const scope = extractSliceScope(unitId);
  if (!scope) return;

  const cache = readCache(basePath);
  if (!(scope in cache)) return;
  delete cache[scope];

  const remaining = Object.keys(cache).length;
  if (remaining > 0) {
    writeCache(basePath, cache);
    return;
  }
  // Nothing left to store — remove the file rather than keep an empty map.
  try {
    unlinkSync(cachePath(basePath));
  } catch {
    // best-effort
  }
}
|
||||||
|
|
||||||
|
/**
 * Test/debug hook that returns the whole cache map as readCache loads it.
 * Production code should go through readStickyModelForUnit instead.
 *
 * @param {string} basePath
 * @returns {object}
 */
export function _readCacheForTests(basePath) {
  const snapshot = readCache(basePath);
  return snapshot;
}
|
||||||
467
src/resources/extensions/sf/tests/dashboard-overlay.test.ts
Normal file
467
src/resources/extensions/sf/tests/dashboard-overlay.test.ts
Normal file
|
|
@ -0,0 +1,467 @@
|
||||||
|
/**
|
||||||
|
* Dashboard Overlay UOK Diagnostics Tests
|
||||||
|
*
|
||||||
|
* Purpose: Verify that SFDashboardOverlay consumes writeUokDiagnostics output
|
||||||
|
* and renders it consistently with the headless status command.
|
||||||
|
*
|
||||||
|
* Consumer: TUI users who expect the dashboard to surface the same UOK health
|
||||||
|
* information as `sf status` / headless query.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||||
|
|
||||||
|
// ─── Hoisted mocks ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
const mockDiagnostics = vi.hoisted(() => ({
|
||||||
|
clear: {
|
||||||
|
schemaVersion: 1,
|
||||||
|
generatedAt: new Date().toISOString(),
|
||||||
|
verdict: "clear",
|
||||||
|
classification: "healthy",
|
||||||
|
signals: {
|
||||||
|
lock: "active",
|
||||||
|
parity: "ok",
|
||||||
|
ledger: "consistent",
|
||||||
|
runtimeProjection: "ok",
|
||||||
|
wrapper: "clear",
|
||||||
|
},
|
||||||
|
currentUnit: null,
|
||||||
|
latestRun: null,
|
||||||
|
runtimeUnits: [],
|
||||||
|
issues: [],
|
||||||
|
recommendations: [],
|
||||||
|
reportPath: "/tmp/uok-diagnostics.json",
|
||||||
|
},
|
||||||
|
degraded: {
|
||||||
|
schemaVersion: 1,
|
||||||
|
generatedAt: new Date().toISOString(),
|
||||||
|
verdict: "degraded",
|
||||||
|
classification: "needs-repair",
|
||||||
|
signals: {
|
||||||
|
lock: "stale",
|
||||||
|
parity: "ok",
|
||||||
|
ledger: "open-runs",
|
||||||
|
runtimeProjection: "stale",
|
||||||
|
wrapper: "unknown",
|
||||||
|
},
|
||||||
|
currentUnit: null,
|
||||||
|
latestRun: null,
|
||||||
|
runtimeUnits: [],
|
||||||
|
issues: [
|
||||||
|
{
|
||||||
|
code: "stale-lock",
|
||||||
|
severity: "error",
|
||||||
|
message: "Stale auto.lock detected for PID 12345.",
|
||||||
|
evidence: { lock: { pid: 12345 } },
|
||||||
|
},
|
||||||
|
{
|
||||||
|
code: "open-ledger-without-live-lock",
|
||||||
|
severity: "error",
|
||||||
|
message:
|
||||||
|
"UOK ledger has 2 started run(s) without a live auto.lock owner.",
|
||||||
|
evidence: { runIds: ["run-1", "run-2"] },
|
||||||
|
},
|
||||||
|
],
|
||||||
|
recommendations: [
|
||||||
|
"Clear stale auto.lock before dispatch.",
|
||||||
|
"Mark orphaned UOK runs recovered or restart from lock owner.",
|
||||||
|
],
|
||||||
|
reportPath: "/tmp/uok-diagnostics.json",
|
||||||
|
},
|
||||||
|
attention: {
|
||||||
|
schemaVersion: 1,
|
||||||
|
generatedAt: new Date().toISOString(),
|
||||||
|
verdict: "attention",
|
||||||
|
classification: "degraded",
|
||||||
|
signals: {
|
||||||
|
lock: "active",
|
||||||
|
parity: "degraded",
|
||||||
|
ledger: "consistent",
|
||||||
|
runtimeProjection: "ok",
|
||||||
|
wrapper: "unknown",
|
||||||
|
},
|
||||||
|
currentUnit: { unitType: "execute-task", unitId: "T01", pid: 12345 },
|
||||||
|
latestRun: null,
|
||||||
|
runtimeUnits: [],
|
||||||
|
issues: [
|
||||||
|
{
|
||||||
|
code: "uok-parity-degraded",
|
||||||
|
severity: "warning",
|
||||||
|
message:
|
||||||
|
"UOK parity degraded: 1 critical mismatch(es), 0 missing exit(s).",
|
||||||
|
evidence: { current: { criticalMismatches: 1, missingExitEvents: 0 } },
|
||||||
|
},
|
||||||
|
],
|
||||||
|
recommendations: ["Reconcile UOK parity before mutating git state."],
|
||||||
|
reportPath: "/tmp/uok-diagnostics.json",
|
||||||
|
},
|
||||||
|
}));
|
||||||
|
|
||||||
|
const dashDataMock = vi.hoisted(() => ({
|
||||||
|
basePath: "/tmp/sf-test",
|
||||||
|
active: false,
|
||||||
|
paused: false,
|
||||||
|
remoteSession: null,
|
||||||
|
currentUnit: null,
|
||||||
|
elapsed: 0,
|
||||||
|
rtkEnabled: false,
|
||||||
|
rtkSavings: null,
|
||||||
|
pendingCaptureCount: 0,
|
||||||
|
}));
|
||||||
|
|
||||||
|
vi.mock("../uok/diagnostic-synthesis.js", () => ({
|
||||||
|
writeUokDiagnostics: vi.fn((_basePath, _options) => mockDiagnostics.clear),
|
||||||
|
}));
|
||||||
|
|
||||||
|
vi.mock("../state.js", () => ({
|
||||||
|
deriveState: vi.fn(async () => ({
|
||||||
|
activeMilestone: null,
|
||||||
|
activeSlice: null,
|
||||||
|
activeTask: null,
|
||||||
|
phase: "idle",
|
||||||
|
progress: null,
|
||||||
|
nextAction: null,
|
||||||
|
blockers: [],
|
||||||
|
registry: [],
|
||||||
|
})),
|
||||||
|
}));
|
||||||
|
|
||||||
|
vi.mock("../sf-db.js", () => ({
|
||||||
|
isDbAvailable: vi.fn(() => false),
|
||||||
|
getMilestoneSlices: vi.fn(() => []),
|
||||||
|
getSliceTasks: vi.fn(() => []),
|
||||||
|
}));
|
||||||
|
|
||||||
|
vi.mock("../auto.js", () => ({
|
||||||
|
getAutoDashboardData: vi.fn(() => dashDataMock),
|
||||||
|
}));
|
||||||
|
|
||||||
|
vi.mock("../auto-dashboard.js", () => ({
|
||||||
|
estimateTimeRemaining: vi.fn(() => null),
|
||||||
|
}));
|
||||||
|
|
||||||
|
vi.mock("../progress-score.js", () => ({
|
||||||
|
computeProgressScore: vi.fn(() => ({
|
||||||
|
level: "green",
|
||||||
|
summary: "All systems healthy",
|
||||||
|
signals: [],
|
||||||
|
})),
|
||||||
|
}));
|
||||||
|
|
||||||
|
vi.mock("../doctor-environment.js", () => ({
|
||||||
|
runEnvironmentChecks: vi.fn(() => []),
|
||||||
|
}));
|
||||||
|
|
||||||
|
vi.mock("../worktree-command.js", () => ({
|
||||||
|
getActiveWorktreeName: vi.fn(() => null),
|
||||||
|
}));
|
||||||
|
|
||||||
|
vi.mock("../subagent/worker-registry.js", () => ({
|
||||||
|
hasActiveWorkers: vi.fn(() => false),
|
||||||
|
getWorkerBatches: vi.fn(() => new Map()),
|
||||||
|
}));
|
||||||
|
|
||||||
|
vi.mock("../metrics.js", () => ({
|
||||||
|
getLedger: vi.fn(() => null),
|
||||||
|
getProjectTotals: vi.fn(() => ({})),
|
||||||
|
aggregateByPhase: vi.fn(() => []),
|
||||||
|
aggregateBySlice: vi.fn(() => []),
|
||||||
|
aggregateByModel: vi.fn(() => []),
|
||||||
|
aggregateCacheHitRate: vi.fn(() => 0),
|
||||||
|
formatCost: vi.fn((n) => `$${n.toFixed(2)}`),
|
||||||
|
formatCostProjection: vi.fn(() => []),
|
||||||
|
formatTokenCount: vi.fn((n) => String(n)),
|
||||||
|
}));
|
||||||
|
|
||||||
|
vi.mock("../paths.js", () => ({
|
||||||
|
resolveMilestoneFile: vi.fn(() => null),
|
||||||
|
}));
|
||||||
|
|
||||||
|
vi.mock("../files.js", () => ({
|
||||||
|
loadFile: vi.fn(async () => null),
|
||||||
|
}));
|
||||||
|
|
||||||
|
vi.mock("../preferences.js", () => ({
|
||||||
|
loadEffectiveSFPreferences: vi.fn(() => null),
|
||||||
|
}));
|
||||||
|
|
||||||
|
vi.mock("@singularity-forge/tui", async (importOriginal) => {
|
||||||
|
const actual = (await importOriginal()) as any;
|
||||||
|
return {
|
||||||
|
...actual,
|
||||||
|
Key: {
|
||||||
|
escape: "\u001B",
|
||||||
|
ctrl: (c: string) => `\u0000${c}`,
|
||||||
|
ctrlAlt: (c: string) => `\u001B\u0000${c}`,
|
||||||
|
ctrlShift: (c: string) => `\u001B\u0000${c.toUpperCase()}`,
|
||||||
|
down: "\u001B[B",
|
||||||
|
up: "\u001B[A",
|
||||||
|
},
|
||||||
|
matchesKey: vi.fn(() => false),
|
||||||
|
truncateToWidth: vi.fn((s: string, w: number) =>
|
||||||
|
s.length > w ? s.slice(0, w) : s,
|
||||||
|
),
|
||||||
|
visibleWidth: vi.fn((s: string) => s.length),
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
vi.mock("../shared/mod.js", () => ({
|
||||||
|
centerLine: vi.fn(
|
||||||
|
(s: string, w: number) =>
|
||||||
|
" ".repeat(Math.max(0, Math.floor((w - s.length) / 2))) + s,
|
||||||
|
),
|
||||||
|
fitColumns: vi.fn((parts: string[], _w: number, _sep: string) =>
|
||||||
|
parts.join(" "),
|
||||||
|
),
|
||||||
|
formatDuration: vi.fn((ms: number) => `${Math.round(ms / 1000)}s`),
|
||||||
|
joinColumns: vi.fn(
|
||||||
|
(left: string, right: string, _w: number) =>
|
||||||
|
`${left}${" ".repeat(Math.max(1, _w - left.length - right.length))}${right}`,
|
||||||
|
),
|
||||||
|
padRight: vi.fn((s: string, w: number) => s.padEnd(w, " ")),
|
||||||
|
STATUS_COLOR: {
|
||||||
|
done: "success",
|
||||||
|
active: "accent",
|
||||||
|
pending: "dim",
|
||||||
|
},
|
||||||
|
STATUS_GLYPH: {
|
||||||
|
done: "✓",
|
||||||
|
active: "▶",
|
||||||
|
pending: "○",
|
||||||
|
},
|
||||||
|
}));
|
||||||
|
|
||||||
|
vi.mock("../shortcut-defs.js", () => ({
|
||||||
|
formattedShortcutPair: vi.fn(() => "ctrl+alt+g"),
|
||||||
|
}));
|
||||||
|
|
||||||
|
// ─── Helpers ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Build a stand-in theme whose `fg` and `bold` are vitest mocks that wrap
 * their input in visible markers (`[color:text]` and `**text**`), so rendered
 * output can be asserted on as plain strings.
 */
function createMockTheme() {
  const fg = vi.fn((color: string, text: string) => `[${color}:${text}]`);
  const bold = vi.fn((text: string) => `**${text}**`);
  return { fg, bold };
}
|
||||||
|
|
||||||
|
/**
 * Build a stand-in TUI handle exposing only a mocked `requestRender`.
 */
function createMockTui() {
  const requestRender = vi.fn();
  return { requestRender };
}
|
||||||
|
|
||||||
|
// ─── Tests ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
beforeEach(() => {
|
||||||
|
vi.clearAllMocks();
|
||||||
|
});
|
||||||
|
|
||||||
|
afterEach(() => {
|
||||||
|
vi.clearAllMocks();
|
||||||
|
});
|
||||||
|
|
||||||
|
describe("SFDashboardOverlay UOK diagnostics", () => {
|
||||||
|
it("loadData_calls_writeUokDiagnostics_and_stores_result", async () => {
|
||||||
|
const { writeUokDiagnostics } = await import(
|
||||||
|
"../uok/diagnostic-synthesis.js"
|
||||||
|
);
|
||||||
|
const { SFDashboardOverlay } = await import("../dashboard-overlay.js");
|
||||||
|
|
||||||
|
const tui = createMockTui();
|
||||||
|
const theme = createMockTheme();
|
||||||
|
const overlay = new SFDashboardOverlay(tui, theme, () => {});
|
||||||
|
|
||||||
|
// Prevent interval from firing during test
|
||||||
|
clearInterval(overlay.refreshTimer);
|
||||||
|
overlay.refreshTimer = null as any;
|
||||||
|
|
||||||
|
await overlay.loadData();
|
||||||
|
|
||||||
|
expect(writeUokDiagnostics).toHaveBeenCalledWith("/tmp/sf-test");
|
||||||
|
expect(overlay.uokDiagnostics).toEqual(mockDiagnostics.clear);
|
||||||
|
|
||||||
|
overlay.dispose();
|
||||||
|
});
|
||||||
|
|
||||||
|
it("loadData_gracefully_handles_writeUokDiagnostics_failure", async () => {
|
||||||
|
const { writeUokDiagnostics } = await import(
|
||||||
|
"../uok/diagnostic-synthesis.js"
|
||||||
|
);
|
||||||
|
writeUokDiagnostics.mockImplementation(() => {
|
||||||
|
throw new Error("disk full");
|
||||||
|
});
|
||||||
|
|
||||||
|
const { SFDashboardOverlay } = await import("../dashboard-overlay.js");
|
||||||
|
|
||||||
|
const tui = createMockTui();
|
||||||
|
const theme = createMockTheme();
|
||||||
|
const overlay = new SFDashboardOverlay(tui, theme, () => {});
|
||||||
|
|
||||||
|
clearInterval(overlay.refreshTimer);
|
||||||
|
overlay.refreshTimer = null as any;
|
||||||
|
|
||||||
|
await overlay.loadData();
|
||||||
|
|
||||||
|
expect(overlay.uokDiagnostics).toBeNull();
|
||||||
|
|
||||||
|
overlay.dispose();
|
||||||
|
writeUokDiagnostics.mockRestore();
|
||||||
|
});
|
||||||
|
|
||||||
|
it("render_includes_uok_verdict_when_diagnostics_present", async () => {
|
||||||
|
const { writeUokDiagnostics } = await import(
|
||||||
|
"../uok/diagnostic-synthesis.js"
|
||||||
|
);
|
||||||
|
(writeUokDiagnostics as any).mockReturnValue(mockDiagnostics.degraded);
|
||||||
|
|
||||||
|
const { SFDashboardOverlay } = await import("../dashboard-overlay.js");
|
||||||
|
|
||||||
|
const tui = createMockTui();
|
||||||
|
const theme = createMockTheme();
|
||||||
|
const overlay = new SFDashboardOverlay(tui, theme, () => {});
|
||||||
|
|
||||||
|
clearInterval(overlay.refreshTimer);
|
||||||
|
overlay.refreshTimer = null as any;
|
||||||
|
|
||||||
|
await overlay.loadData();
|
||||||
|
const lines = overlay.buildContentLines(80);
|
||||||
|
const text = lines.join("\n");
|
||||||
|
|
||||||
|
expect(text).toContain("UOK");
|
||||||
|
expect(text).toContain("degraded");
|
||||||
|
expect(text).toContain("needs-repair");
|
||||||
|
|
||||||
|
overlay.dispose();
|
||||||
|
});
|
||||||
|
|
||||||
|
it("render_includes_first_issue_code_like_headless_status", async () => {
|
||||||
|
const { writeUokDiagnostics } = await import(
|
||||||
|
"../uok/diagnostic-synthesis.js"
|
||||||
|
);
|
||||||
|
(writeUokDiagnostics as any).mockReturnValue(mockDiagnostics.degraded);
|
||||||
|
|
||||||
|
const { SFDashboardOverlay } = await import("../dashboard-overlay.js");
|
||||||
|
|
||||||
|
const tui = createMockTui();
|
||||||
|
const theme = createMockTheme();
|
||||||
|
const overlay = new SFDashboardOverlay(tui, theme, () => {});
|
||||||
|
|
||||||
|
clearInterval(overlay.refreshTimer);
|
||||||
|
overlay.refreshTimer = null as any;
|
||||||
|
|
||||||
|
await overlay.loadData();
|
||||||
|
const lines = overlay.buildContentLines(80);
|
||||||
|
const text = lines.join("\n");
|
||||||
|
|
||||||
|
// Should contain the first issue code, matching headless status behavior
|
||||||
|
expect(text).toContain("stale-lock");
|
||||||
|
|
||||||
|
overlay.dispose();
|
||||||
|
});
|
||||||
|
|
||||||
|
it("render_shows_uok_health_section_with_all_issues_when_degraded", async () => {
  // Make the (mocked) diagnostics writer return the degraded fixture.
  const { writeUokDiagnostics } = await import(
    "../uok/diagnostic-synthesis.js"
  );
  (writeUokDiagnostics as any).mockReturnValue(mockDiagnostics.degraded);

  const { SFDashboardOverlay } = await import("../dashboard-overlay.js");

  const tui = createMockTui();
  const theme = createMockTheme();
  const overlay = new SFDashboardOverlay(tui, theme, () => {});

  try {
    // Cancel the overlay's refresh interval so the test drives loading itself.
    clearInterval(overlay.refreshTimer);
    overlay.refreshTimer = null as any;

    await overlay.loadData();
    const text = overlay.buildContentLines(80).join("\n");

    // Should show both issue codes in the health section
    expect(text).toContain("stale-lock");
    expect(text).toContain("open-ledger-without-live-lock");
  } finally {
    // Always dispose, even when loadData() or an assertion throws.
    overlay.dispose();
  }
});
|
||||||
|
|
||||||
|
it("render_shows_recommendations_when_issues_present", async () => {
  // Make the (mocked) diagnostics writer return the degraded fixture.
  const { writeUokDiagnostics } = await import(
    "../uok/diagnostic-synthesis.js"
  );
  (writeUokDiagnostics as any).mockReturnValue(mockDiagnostics.degraded);

  const { SFDashboardOverlay } = await import("../dashboard-overlay.js");

  const tui = createMockTui();
  const theme = createMockTheme();
  const overlay = new SFDashboardOverlay(tui, theme, () => {});

  try {
    // Cancel the overlay's refresh interval so the test drives loading itself.
    clearInterval(overlay.refreshTimer);
    overlay.refreshTimer = null as any;

    await overlay.loadData();
    const text = overlay.buildContentLines(80).join("\n");

    // Both recommendation strings must be rendered verbatim.
    expect(text).toContain("Clear stale auto.lock before dispatch.");
    expect(text).toContain(
      "Mark orphaned UOK runs recovered or restart from lock owner.",
    );
  } finally {
    // Always dispose, even when loadData() or an assertion throws.
    overlay.dispose();
  }
});
|
||||||
|
|
||||||
|
it("render_shows_uok_signals_table_when_diagnostics_present", async () => {
  // Make the (mocked) diagnostics writer return the degraded fixture.
  const { writeUokDiagnostics } = await import(
    "../uok/diagnostic-synthesis.js"
  );
  (writeUokDiagnostics as any).mockReturnValue(mockDiagnostics.degraded);

  const { SFDashboardOverlay } = await import("../dashboard-overlay.js");

  const tui = createMockTui();
  const theme = createMockTheme();
  const overlay = new SFDashboardOverlay(tui, theme, () => {});

  try {
    // Cancel the overlay's refresh interval so the test drives loading itself.
    clearInterval(overlay.refreshTimer);
    overlay.refreshTimer = null as any;

    await overlay.loadData();
    const text = overlay.buildContentLines(80).join("\n");

    // Signals should be visible
    // NOTE(review): "lock" is also a substring of issue codes such as
    // "stale-lock", so this check may pass without a signals table being
    // rendered — consider asserting a signals-specific label. TODO confirm.
    expect(text).toContain("lock");
    expect(text).toContain("parity");
    expect(text).toContain("ledger");
  } finally {
    // Always dispose, even when loadData() or an assertion throws.
    overlay.dispose();
  }
});
|
||||||
|
|
||||||
|
it("render_omits_detailed_uok_section_when_verdict_is_clear", async () => {
  // Make the (mocked) diagnostics writer return the clear fixture.
  const { writeUokDiagnostics } = await import(
    "../uok/diagnostic-synthesis.js"
  );
  (writeUokDiagnostics as any).mockReturnValue(mockDiagnostics.clear);

  const { SFDashboardOverlay } = await import("../dashboard-overlay.js");

  const tui = createMockTui();
  const theme = createMockTheme();
  const overlay = new SFDashboardOverlay(tui, theme, () => {});

  try {
    // Cancel the overlay's refresh interval so the test drives loading itself.
    clearInterval(overlay.refreshTimer);
    overlay.refreshTimer = null as any;

    await overlay.loadData();
    const text = overlay.buildContentLines(80).join("\n");

    // Should show the compact UOK clear line but no issue details
    expect(text).toContain("clear");
    expect(text).not.toContain("stale-lock");
  } finally {
    // Always dispose, even when loadData() or an assertion throws.
    overlay.dispose();
  }
});
|
||||||
|
});
|
||||||
140
src/resources/extensions/sf/tests/model-router-agentic.test.mjs
Normal file
140
src/resources/extensions/sf/tests/model-router-agentic.test.mjs
Normal file
|
|
@ -0,0 +1,140 @@
|
||||||
|
import { describe, expect, test } from "vitest";
|
||||||
|
import {
|
||||||
|
BASE_REQUIREMENTS,
|
||||||
|
MODEL_CAPABILITY_PROFILES,
|
||||||
|
scoreEligibleModels,
|
||||||
|
scoreModel,
|
||||||
|
} from "../model-router.js";
|
||||||
|
|
||||||
|
describe("agentic capability axis (ADR-0079)", () => {
  // Score one known capability profile against the execute-task requirements.
  const executeTaskScore = (modelId) =>
    scoreModel(
      MODEL_CAPABILITY_PROFILES[modelId],
      BASE_REQUIREMENTS["execute-task"],
    );

  test("execute-task base requirements weight the agentic dimension", () => {
    // If this assertion fails because the weight changed: re-read ADR-0079
    // before adjusting. The whole point of the axis is to outweigh raw
    // coding score for execute-task routing.
    expect(BASE_REQUIREMENTS["execute-task"].agentic).toBeGreaterThanOrEqual(
      0.7,
    );
  });

  test("known agentic-capable models score higher than coding-completion models on execute-task", () => {
    const codestralScore = executeTaskScore("codestral-latest");
    const kimiScore = executeTaskScore("kimi-k2.6");
    const sonnetScore = executeTaskScore("claude-sonnet-4-6");
    // Codestral has high coding (85) but agentic=25 — must not beat agentic models.
    expect(kimiScore).toBeGreaterThan(codestralScore);
    expect(sonnetScore).toBeGreaterThan(codestralScore);
  });

  test("devstral variants score below agentic models on execute-task", () => {
    const devstralScore = executeTaskScore("devstral-2512");
    const kimiScore = executeTaskScore("kimi-k2.6");
    expect(kimiScore).toBeGreaterThan(devstralScore);
  });

  test("scoreEligibleModels ranks agentic models above coding-only models for execute-task", () => {
    const eligible = [
      "mistral/codestral-latest",
      "mistral/devstral-2512",
      "moonshotai/kimi-k2.6",
      "anthropic/claude-sonnet-4-6",
    ];
    const ranked = scoreEligibleModels(
      eligible,
      BASE_REQUIREMENTS["execute-task"],
    );
    const top = ranked[0]?.modelId;
    // Either of the two pinned-agentic models must win.
    expect(["moonshotai/kimi-k2.6", "anthropic/claude-sonnet-4-6"]).toContain(
      top,
    );
    // And Codestral specifically must not win.
    expect(top).not.toBe("mistral/codestral-latest");
  });

  test("agentic axis preserves research-* unit-type behavior (no agentic weight there)", () => {
    // Research isn't agentic — those unit types should not gain an agentic
    // dimension. This protects long-context research-tuned models from
    // being penalized.
    expect(BASE_REQUIREMENTS["research-milestone"].agentic).toBeUndefined();
    expect(BASE_REQUIREMENTS["research-slice"].agentic).toBeUndefined();
  });

  test("known coding-only models all have agentic <= 50", () => {
    const codingOnly = [
      "codestral-latest",
      "devstral-2512",
      "devstral-medium-latest",
      "devstral-medium-2507",
      "devstral-small-2505",
      "devstral-small-2507",
      "labs-devstral-small-2512",
      "qwen3-coder:480b",
      "qwen3-coder-next",
    ];
    for (const id of codingOnly) {
      const profile = MODEL_CAPABILITY_PROFILES[id];
      // Existence is asserted first so a missing profile fails with a clear message.
      expect(profile, `${id} should be in MODEL_CAPABILITY_PROFILES`).toBeDefined();
      expect(profile.agentic, `${id} should have agentic <= 50`).toBeLessThanOrEqual(
        50,
      );
    }
  });

  test("older MiniMax generations score lower than current on agentic", () => {
    // 2026-05-13 incident: minimax/M2.1 stuck in 60+ checkpoint loop on
    // infra repo. Root cause was the router aliasing all minimax-m2.x
    // variants to MiniMax-M2.7's profile, so older models inherited
    // current-gen capability scores and won cost tie-breaks on
    // execute-task. Per-generation profiles + agentic axis fix the
    // underlying routing decision.
    const m21 = MODEL_CAPABILITY_PROFILES["MiniMax-M2.1"];
    const m25 = MODEL_CAPABILITY_PROFILES["MiniMax-M2.5"];
    const m27 = MODEL_CAPABILITY_PROFILES["MiniMax-M2.7"];
    expect(m21, "M2.1 should have its own profile").toBeDefined();
    expect(m25, "M2.5 should have its own profile").toBeDefined();
    // Guard the current-generation profile too, so a missing entry fails with
    // a readable message instead of a TypeError on `m27.agentic`.
    expect(m27, "M2.7 should have its own profile").toBeDefined();
    expect(m27.agentic).toBeGreaterThan(m25.agentic);
    expect(m25.agentic).toBeGreaterThan(m21.agentic);
    // And on execute-task, the current generation must beat the older one.
    const oldScore = scoreModel(m21, BASE_REQUIREMENTS["execute-task"]);
    const newScore = scoreModel(m27, BASE_REQUIREMENTS["execute-task"]);
    expect(newScore).toBeGreaterThan(oldScore);
  });

  test("known agentic-frontier models all have agentic >= 85", () => {
    const agenticFrontier = [
      "claude-opus-4-6",
      "claude-sonnet-4-6",
      "claude-sonnet-4-5-20250514",
      "kimi-k2.6",
      "kimi-k2-thinking",
      "gpt-5",
      "gpt-5.4",
      "gpt-5.5",
      "gemini-3-pro-preview",
      "gemini-3.1-pro-preview",
    ];
    for (const id of agenticFrontier) {
      const profile = MODEL_CAPABILITY_PROFILES[id];
      expect(profile, `${id} should be in MODEL_CAPABILITY_PROFILES`).toBeDefined();
      expect(
        profile.agentic,
        `${id} should have agentic >= 85`,
      ).toBeGreaterThanOrEqual(85);
    }
  });
});
|
||||||
|
|
@ -134,61 +134,3 @@ test("reorderAndSplitForCaching_preamble_goes_into_before", () => {
|
||||||
"dynamic section in after",
|
"dynamic section in after",
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
test("reorderForCaching_when_inlined_slice_summary_has_requirements_advanced_keeps_it_after_mission", () => {
  // A prompt whose inlined slice summary carries its own
  // "## Requirements Advanced" heading nested under "## Inlined Context".
  const prompt = [
    "# Milestone Validation",
    "",
    "## Working Directory",
    "/repo",
    "",
    "## Mission",
    "Dispatch reviewers.",
    "",
    "## Context",
    "Inlined below.",
    "",
    "## Inlined Context",
    "### S01 Summary",
    "# S01",
    "",
    "## Requirements Advanced",
    "- R1",
    "",
    "## Requirements Validated",
    "None.",
  ].join("\n");

  const reordered = reorderForCaching(prompt);
  const missionAt = reordered.indexOf("## Mission");
  const contextAt = reordered.indexOf("## Context");
  const advancedAt = reordered.indexOf("## Requirements Advanced");

  // indexOf yields -1 for a missing heading, which would make the ordering
  // comparisons below pass vacuously — so require every heading to be present.
  assert.ok(missionAt >= 0, "## Mission heading should survive reordering");
  assert.ok(contextAt >= 0, "## Context heading should survive reordering");
  assert.ok(
    advancedAt >= 0,
    "## Requirements Advanced heading should survive reordering",
  );

  assert.ok(missionAt < advancedAt);
  assert.ok(contextAt < advancedAt);
});
|
|
||||||
|
|
||||||
test("reorderForCaching_when_top_level_requirements_exists_still_hoists_exact_requirements_block", () => {
  // A prompt with a genuine top-level "## Requirements" section after Mission.
  const prompt = [
    "# Execute",
    "",
    "## Mission",
    "Do work.",
    "",
    "## Requirements",
    "- R1",
    "",
    "## Verification",
    "Run tests.",
  ].join("\n");

  const reordered = reorderForCaching(prompt);
  const requirementsAt = reordered.indexOf("## Requirements");
  const missionAt = reordered.indexOf("## Mission");

  // indexOf yields -1 for a missing heading; without these checks a dropped
  // "## Requirements" section would satisfy the `<` comparison vacuously.
  assert.ok(
    requirementsAt >= 0,
    "## Requirements heading should survive reordering",
  );
  assert.ok(missionAt >= 0, "## Mission heading should survive reordering");

  assert.ok(requirementsAt < missionAt);
});
|
|
||||||
|
|
|
||||||
30
src/resources/extensions/sf/tests/run-unit.test.mjs
Normal file
30
src/resources/extensions/sf/tests/run-unit.test.mjs
Normal file
|
|
@ -0,0 +1,30 @@
|
||||||
|
import assert from "node:assert/strict";
|
||||||
|
import { test } from "vitest";
|
||||||
|
|
||||||
|
import { buildUnitPromptMessageContent } from "../auto/run-unit.js";
|
||||||
|
|
||||||
|
test("buildUnitPromptMessageContent_when_prompt_parts_present_preserves_join_boundary", () => {
  // Split prompt: `before` is the first part, `after` the second.
  const promptParts = {
    before: "## Working Directory\n/repo",
    after: "## Inlined Task Plan\nDo it.",
  };
  const content = buildUnitPromptMessageContent("flat", promptParts);

  assert.ok(Array.isArray(content));

  // First part: gains a trailing newline and carries the cache_control marker.
  const expectedBefore = {
    type: "text",
    text: "## Working Directory\n/repo\n",
    cache_control: { type: "ephemeral" },
  };
  assert.deepEqual(content[0], expectedBefore);

  // Second part: passed through as plain text.
  const expectedAfter = {
    type: "text",
    text: "## Inlined Task Plan\nDo it.",
  };
  assert.deepEqual(content[1], expectedAfter);

  // Concatenating the parts must reproduce the two sections joined by "\n".
  const rejoined = content.map(({ text }) => text).join("");
  assert.equal(
    rejoined,
    "## Working Directory\n/repo\n## Inlined Task Plan\nDo it.",
  );
});
|
||||||
|
|
||||||
|
test("buildUnitPromptMessageContent_when_no_prompt_parts_returns_flat_prompt", () => {
  // With no split parts supplied, the flat prompt string is returned as-is.
  const content = buildUnitPromptMessageContent("flat", null);
  assert.equal(content, "flat");
});
|
||||||
|
|
@ -0,0 +1,134 @@
|
||||||
|
import { spawn } from "node:child_process";
|
||||||
|
import { describe, expect, test } from "vitest";
|
||||||
|
import {
|
||||||
|
isSessionPidAlive,
|
||||||
|
terminateExistingSession,
|
||||||
|
} from "../session-lock.js";
|
||||||
|
|
||||||
|
/**
 * Spawn a cooperative child process that sleeps for `seconds`.
 *
 * `sleep` exits on SIGTERM, which lets tests exercise the graceful
 * termination path. (The SIGKILL escalation path uses a different helper
 * whose child installs a no-op SIGTERM handler.)
 *
 * @param {number} [seconds=30] How long the child sleeps before exiting.
 * @returns {import("node:child_process").ChildProcess} The spawned child.
 */
function spawnSleeper(seconds = 30) {
  // `exec` replaces the shell with `sleep`, so the returned pid is the
  // sleeping process itself on every /bin/sh implementation. Without it, a
  // shell that forks would leave an orphaned `sleep` behind when signalled.
  return spawn("/bin/sh", ["-c", `exec sleep ${seconds}`], {
    stdio: "ignore",
    detached: false,
  });
}
|
||||||
|
|
||||||
|
/**
 * Spawn a Node child that registers a do-nothing SIGTERM handler and exits
 * on its own after `seconds`.
 *
 * Unlike `sh -c "trap '' TERM; sleep N"` (where the shell tail-call-exec's
 * sleep so SIGTERM hits sleep directly), this child IS the long-lived
 * process, so it keeps running after SIGTERM until it is SIGKILLed. Tests
 * use it to assert the escalation path.
 *
 * @param {number} [seconds=30] Lifetime of the child before it exits by itself.
 * @returns {import("node:child_process").ChildProcess} The spawned child.
 */
function spawnIgnoreSigterm(seconds = 30) {
  const script = `process.on('SIGTERM', () => {}); setTimeout(() => process.exit(0), ${seconds * 1000});`;
  const spawnOptions = { stdio: "ignore", detached: false };
  return spawn(process.execPath, ["-e", script], spawnOptions);
}
|
||||||
|
|
||||||
|
describe("terminateExistingSession", () => {
  // Best-effort cleanup so a failed assertion never leaves a child running.
  const forceKill = (proc) => {
    try {
      proc.kill("SIGKILL");
    } catch {
      /* may already be dead */
    }
  };

  test("returns alreadyDead=true when pid is invalid", async () => {
    const outcome = await terminateExistingSession(0);
    expect(outcome.terminated).toBe(false);
    expect(outcome.alreadyDead).toBe(true);
  });

  test("refuses to terminate the current process", async () => {
    const outcome = await terminateExistingSession(process.pid);
    expect(outcome.terminated).toBe(false);
  });

  test("returns alreadyDead=true for a dead pid", async () => {
    // PID 1 is alive but not ours, so pick a value that is almost certainly
    // unassigned instead: 2147483646 (2^31 - 2) is far above any plausible PID.
    const unassignedPid = 2147483646;
    const outcome = await terminateExistingSession(unassignedPid);
    expect(outcome.alreadyDead).toBe(true);
    expect(outcome.terminated).toBe(true);
  });

  test("gracefully terminates a process that respects SIGTERM", async () => {
    const sleeper = spawnSleeper(60);
    try {
      expect(isSessionPidAlive(sleeper.pid)).toBe(true);
      const timings = {
        gracePeriodMs: 3000,
        reapWaitMs: 1000,
        pollIntervalMs: 50,
      };
      const outcome = await terminateExistingSession(sleeper.pid, timings);
      expect(outcome.terminated).toBe(true);
      expect(outcome.escalated).toBe(false);
      expect(isSessionPidAlive(sleeper.pid)).toBe(false);
    } finally {
      forceKill(sleeper);
    }
  });

  test("escalates to SIGKILL when the process ignores SIGTERM", async () => {
    const stubborn = spawnIgnoreSigterm(60);
    // Let the child install its SIGTERM handler first. If SIGTERM arrives
    // before process.on('SIGTERM', …) has run, Node still applies the default
    // action (exit on signal) and the run would look like a graceful exit.
    await new Promise((resolve) => setTimeout(resolve, 250));
    try {
      expect(isSessionPidAlive(stubborn.pid)).toBe(true);
      const timings = {
        gracePeriodMs: 750,
        reapWaitMs: 2000,
        pollIntervalMs: 50,
      };
      const outcome = await terminateExistingSession(stubborn.pid, timings);
      expect(outcome.terminated).toBe(true);
      expect(outcome.escalated).toBe(true);
      expect(isSessionPidAlive(stubborn.pid)).toBe(false);
    } finally {
      forceKill(stubborn);
    }
  });
});
|
||||||
|
|
||||||
|
describe("isSessionPidAlive", () => {
  test("returns false for current process (self-check is intentionally disabled)", () => {
    // The current PID is deliberately reported as not alive so the lock
    // takeover flow cannot falsely detect itself as a competing session.
    const selfAlive = isSessionPidAlive(process.pid);
    expect(selfAlive).toBe(false);
  });

  test("returns false for clearly-dead pid", () => {
    const unassignedPid = 2147483646;
    expect(isSessionPidAlive(unassignedPid)).toBe(false);
  });

  test("returns true for a live child", async () => {
    const sleeper = spawnSleeper(30);
    try {
      expect(isSessionPidAlive(sleeper.pid)).toBe(true);
    } finally {
      try {
        sleeper.kill("SIGKILL");
      } catch {
        /* may already be dead */
      }
    }
  });

  test("returns false for non-integer or non-positive inputs", () => {
    // Each malformed pid must be reported as not alive rather than throwing.
    for (const badPid of [0, -1, "nope", null]) {
      expect(isSessionPidAlive(badPid)).toBe(false);
    }
  });
});
|
||||||
136
src/resources/extensions/sf/tests/slice-routing-cache.test.mjs
Normal file
136
src/resources/extensions/sf/tests/slice-routing-cache.test.mjs
Normal file
|
|
@ -0,0 +1,136 @@
|
||||||
|
import { mkdtempSync, rmSync } from "node:fs";
|
||||||
|
import { tmpdir } from "node:os";
|
||||||
|
import { join } from "node:path";
|
||||||
|
import { afterEach, describe, expect, test } from "vitest";
|
||||||
|
import {
|
||||||
|
_readCacheForTests,
|
||||||
|
clearSliceRoutingForUnit,
|
||||||
|
extractSliceScope,
|
||||||
|
readStickyModelForUnit,
|
||||||
|
recordSliceRouting,
|
||||||
|
} from "../slice-routing-cache.js";
|
||||||
|
|
||||||
|
// Temp project roots created by the current test; emptied after each test.
let tempDirs = [];

/**
 * Create a fresh temporary project directory and register it for cleanup.
 *
 * @returns {string} Absolute path of the new directory.
 */
function makeProject() {
  const projectDir = mkdtempSync(join(tmpdir(), "sf-slice-routing-"));
  tempDirs.push(projectDir);
  return projectDir;
}

// Remove every directory created during the test, then reset the registry.
afterEach(() => {
  tempDirs.forEach((dir) => {
    rmSync(dir, { recursive: true, force: true });
  });
  tempDirs = [];
});
|
||||||
|
|
||||||
|
describe("extractSliceScope", () => {
  test("execute-task style unit id collapses to milestone/slice", () => {
    // The trailing task segment (T02) is dropped.
    const scope = extractSliceScope("M001-6377a4/S04/T02");
    expect(scope).toBe("M001-6377a4/S04");
  });

  test("plan/complete slice ids stay as milestone/slice", () => {
    const scope = extractSliceScope("M001-6377a4/S04");
    expect(scope).toBe("M001-6377a4/S04");
  });

  test("milestone-only ids return the milestone", () => {
    const scope = extractSliceScope("M001-6377a4");
    expect(scope).toBe("M001-6377a4");
  });

  test("null/undefined/empty return null", () => {
    // Same order as always: null, empty string, undefined.
    for (const missing of [null, "", undefined]) {
      expect(extractSliceScope(missing)).toBeNull();
    }
  });
});
|
||||||
|
|
||||||
|
describe("slice routing cache", () => {
  // Fresh model descriptors per call so no test shares an object instance.
  const kimi = () => ({ provider: "moonshotai", id: "kimi-k2.6" });
  const sonnet = () => ({ provider: "anthropic", id: "claude-sonnet-4-6" });
  const opus = () => ({ provider: "anthropic", id: "claude-opus-4-7" });

  test("record + read round-trips", () => {
    const root = makeProject();
    recordSliceRouting(root, "execute-task", "M001/S04/T01", kimi());
    // A different task in the same slice reads back the recorded model.
    const sticky = readStickyModelForUnit(
      root,
      "execute-task",
      "M001/S04/T02",
    );
    expect(sticky).toEqual({ provider: "moonshotai", id: "kimi-k2.6" });
  });

  test("sticky scoped per slice — different slice => no hit", () => {
    const root = makeProject();
    recordSliceRouting(root, "execute-task", "M001/S04/T01", kimi());
    const otherSlice = readStickyModelForUnit(
      root,
      "execute-task",
      "M001/S05/T01",
    );
    expect(otherSlice).toBeNull();
  });

  test("clearSliceRoutingForUnit evicts only the matching slice", () => {
    const root = makeProject();
    recordSliceRouting(root, "execute-task", "M001/S04/T01", kimi());
    recordSliceRouting(root, "execute-task", "M001/S05/T01", sonnet());

    clearSliceRoutingForUnit(root, "M001/S04/T07");

    const cleared = readStickyModelForUnit(
      root,
      "execute-task",
      "M001/S04/T99",
    );
    expect(cleared).toBeNull();
    const untouched = readStickyModelForUnit(
      root,
      "execute-task",
      "M001/S05/T02",
    );
    expect(untouched).toEqual({
      provider: "anthropic",
      id: "claude-sonnet-4-6",
    });
  });

  test("readStickyModelForUnit honors maxAgeMs", async () => {
    const root = makeProject();
    recordSliceRouting(root, "execute-task", "M001/S04/T01", kimi());
    // Sleep past the retention window so age strictly exceeds maxAgeMs.
    await new Promise((resolve) => setTimeout(resolve, 25));
    const expired = readStickyModelForUnit(
      root,
      "execute-task",
      "M001/S04/T02",
      { maxAgeMs: 10 },
    );
    expect(expired).toBeNull();
  });

  test("returns null on missing basePath or unparseable unit id", () => {
    expect(readStickyModelForUnit("", "execute-task", "M001/S04/T01")).toBeNull();
    const root = makeProject();
    for (const badUnitId of ["", null]) {
      expect(readStickyModelForUnit(root, "execute-task", badUnitId)).toBeNull();
    }
  });

  test("overwrite updates the slice entry in place", () => {
    const root = makeProject();
    recordSliceRouting(root, "execute-task", "M001/S04/T01", kimi());
    recordSliceRouting(root, "execute-task", "M001/S04/T02", opus());

    // Two writes for the same slice must leave a single cache entry.
    const entryCount = Object.values(_readCacheForTests(root)).length;
    expect(entryCount).toBe(1);

    const sticky = readStickyModelForUnit(
      root,
      "execute-task",
      "M001/S04/T03",
    );
    expect(sticky).toEqual({ provider: "anthropic", id: "claude-opus-4-7" });
  });

  test("clearSliceRoutingForUnit on the last entry removes the cache file", () => {
    const root = makeProject();
    recordSliceRouting(root, "execute-task", "M001/S04/T01", kimi());
    clearSliceRoutingForUnit(root, "M001/S04/T01");
    // NOTE(review): only the parsed cache is checked for emptiness here; the
    // file's absence on disk is not asserted — confirm that is intended.
    const remainingKeys = Object.keys(_readCacheForTests(root));
    expect(remainingKeys.length).toBe(0);
  });
});
|
||||||
134
src/resources/extensions/sf/tests/solver-model.test.mjs
Normal file
134
src/resources/extensions/sf/tests/solver-model.test.mjs
Normal file
|
|
@ -0,0 +1,134 @@
|
||||||
|
import { describe, expect, test } from "vitest";
|
||||||
|
import {
|
||||||
|
SOLVER_MODEL_DEFAULT,
|
||||||
|
SOLVER_MODEL_FALLBACKS,
|
||||||
|
isSolverModel,
|
||||||
|
resolveSolverModel,
|
||||||
|
resolveSolverModelCandidates,
|
||||||
|
} from "../solver-model.js";
|
||||||
|
|
||||||
|
describe("solver-model invariants", () => {
  test("default is locked to kimi-k2.6 / kimi-coding", () => {
    // This is a PROTOCOL INVARIANT, not a tuning parameter. Changing the
    // default requires an ADR (see ADR-0079). A failure here caused by a
    // bumped default is a load-bearing change and a code review reject —
    // re-read the ADR before re-running.
    const pinned = { provider: "kimi-coding", id: "kimi-k2.6" };
    expect(SOLVER_MODEL_DEFAULT).toEqual(pinned);
  });

  test("no fallback is a code-completion-only model", () => {
    // Code-completion models (Codestral, Devstral, the kimi-for-coding
    // alias) are the ones that broke the loop in the first place. They
    // must NEVER appear in the solver fallback chain.
    const forbidden = new Set([
      "codestral-latest",
      "devstral-latest",
      "kimi-for-coding",
    ]);
    for (const { id } of SOLVER_MODEL_FALLBACKS) {
      const isForbidden = forbidden.has(id);
      expect(isForbidden).toBe(false);
    }
  });
});
|
||||||
|
|
||||||
|
describe("resolveSolverModel", () => {
  test("with no preferences returns the pinned default", () => {
    // No argument, explicit undefined, and an empty object all yield the default.
    expect(resolveSolverModel()).toEqual(SOLVER_MODEL_DEFAULT);
    expect(resolveSolverModel(undefined)).toEqual(SOLVER_MODEL_DEFAULT);
    expect(resolveSolverModel({})).toEqual(SOLVER_MODEL_DEFAULT);
  });

  test("ignores router/benchmark/learning state (no opt-in == default)", () => {
    // Even with the kitchen sink of unrelated preference fields,
    // resolveSolverModel must NOT consult any of them. Only an explicit
    // preferences.autonomousSolver.model entry can override.
    const unrelatedState = {
      currentModel: { provider: "mistral", id: "codestral-latest" },
      modelRouter: { lastSelection: "google-gemini-cli/gemini-3-flash-preview" },
      benchmarkSelector: { winner: "kimi-for-coding" },
      learning: { blender: { recommended: "kimi-k2.5" } },
    };
    const resolved = resolveSolverModel(unrelatedState);
    expect(resolved).toEqual(SOLVER_MODEL_DEFAULT);
  });

  test("respects an explicit object override", () => {
    const override = { provider: "anthropic", id: "claude-opus-4-7" };
    const resolved = resolveSolverModel({
      autonomousSolver: { model: override },
    });
    expect(resolved).toEqual({ provider: "anthropic", id: "claude-opus-4-7" });
  });

  test("accepts a string override in provider/model form", () => {
    const preferences = {
      autonomousSolver: { model: "anthropic/claude-sonnet-4-6" },
    };
    expect(resolveSolverModel(preferences)).toEqual({
      provider: "anthropic",
      id: "claude-sonnet-4-6",
    });
  });

  test("accepts a bare model id and keeps the default provider", () => {
    const preferences = { autonomousSolver: { model: "kimi-k2-thinking" } };
    expect(resolveSolverModel(preferences)).toEqual({
      provider: SOLVER_MODEL_DEFAULT.provider,
      id: "kimi-k2-thinking",
    });
  });

  test("ignores an empty-string override", () => {
    // Both an empty and a whitespace-only string fall back to the default.
    for (const blank of ["", " "]) {
      const resolved = resolveSolverModel({ autonomousSolver: { model: blank } });
      expect(resolved).toEqual(SOLVER_MODEL_DEFAULT);
    }
  });
});
|
||||||
|
|
||||||
|
describe("resolveSolverModelCandidates", () => {
  test("primary comes first, then fallback chain (de-duplicated)", () => {
    const [primary, ...rest] = resolveSolverModelCandidates();
    expect(primary).toEqual(SOLVER_MODEL_DEFAULT);
    // Total length is the primary plus every fallback.
    expect(1 + rest.length).toBe(1 + SOLVER_MODEL_FALLBACKS.length);
  });

  test("override does not duplicate when also in fallback list", () => {
    const candidates = resolveSolverModelCandidates({
      autonomousSolver: { model: "anthropic/claude-opus-4-7" },
    });
    // Count how many candidates are exactly anthropic/claude-opus-4-7.
    const isOpus = (candidate) =>
      candidate.id === "claude-opus-4-7" && candidate.provider === "anthropic";
    const opusCount = candidates.filter(isOpus).length;
    expect(opusCount).toBe(1);
  });
});
|
||||||
|
|
||||||
|
describe("isSolverModel", () => {
  test("returns true for the pinned default", () => {
    const verdict = isSolverModel(SOLVER_MODEL_DEFAULT);
    expect(verdict).toBe(true);
  });

  test("returns false for a routed executor model", () => {
    // Neither of these provider/id pairs may be treated as the solver model.
    const executors = [
      { provider: "mistral", id: "codestral-latest" },
      { provider: "google-gemini-cli", id: "gemini-3-flash-preview" },
    ];
    for (const executor of executors) {
      expect(isSolverModel(executor)).toBe(false);
    }
  });

  test("returns false for null / malformed inputs", () => {
    for (const malformed of [null, undefined, {}]) {
      expect(isSolverModel(malformed)).toBe(false);
    }
  });
});
|
||||||
115
src/resources/extensions/sf/tests/trace-janitor.test.mjs
Normal file
115
src/resources/extensions/sf/tests/trace-janitor.test.mjs
Normal file
|
|
@ -0,0 +1,115 @@
|
||||||
|
import {
|
||||||
|
existsSync,
|
||||||
|
mkdirSync,
|
||||||
|
mkdtempSync,
|
||||||
|
rmSync,
|
||||||
|
symlinkSync,
|
||||||
|
utimesSync,
|
||||||
|
writeFileSync,
|
||||||
|
} from "node:fs";
|
||||||
|
import { tmpdir } from "node:os";
|
||||||
|
import { join } from "node:path";
|
||||||
|
import { afterEach, describe, expect, test } from "vitest";
|
||||||
|
import { pruneStaleTraces } from "../uok/trace-writer.js";
|
||||||
|
|
||||||
|
// Temp project roots created by the current test; emptied after each test.
let tempDirs = [];

/**
 * Create a temporary project directory containing an empty `.sf` folder and
 * register it for cleanup.
 *
 * @returns {string} Absolute path of the new project root.
 */
function makeProject() {
  const projectDir = mkdtempSync(join(tmpdir(), "sf-trace-janitor-"));
  tempDirs.push(projectDir);
  const sfDir = join(projectDir, ".sf");
  mkdirSync(sfDir, { recursive: true });
  return projectDir;
}

// Remove every directory created during the test, then reset the registry.
afterEach(() => {
  tempDirs.forEach((dir) => {
    rmSync(dir, { recursive: true, force: true });
  });
  tempDirs = [];
});
|
||||||
|
|
||||||
|
/**
 * Write a one-line trace file under `<project>/.sf/traces/`, optionally
 * back-dating its access and modification times.
 *
 * @param {string} project - project root directory
 * @param {string} name - file name to create inside the traces directory
 * @param {number} [daysOld] - when a number, both timestamps are set this
 *   many days before now; otherwise they are left as written
 * @returns {string} path of the file that was written
 */
function makeTraceFile(project, name, daysOld) {
	const dir = join(project, ".sf", "traces");
	mkdirSync(dir, { recursive: true });

	const target = join(dir, name);
	writeFileSync(target, '{"ts":"2024-01-01T00:00:00Z","type":"gate_run"}\n');

	if (typeof daysOld !== "number") return target;

	// utimesSync takes numeric timestamps in seconds, hence the / 1000.
	const ageMs = daysOld * 24 * 60 * 60 * 1000;
	const stampSeconds = (Date.now() - ageMs) / 1000;
	utimesSync(target, stampSeconds, stampSeconds);
	return target;
}
|
||||||
|
|
||||||
|
// Behavioural coverage for pruneStaleTraces (the .sf/traces janitor):
// age-based pruning, custom windows, `latest` preservation, suffix
// filtering, a missing traces directory, the per-call delete cap, and
// missing basePath input.
describe("pruneStaleTraces", () => {
	test("removes jsonl files older than retention window", () => {
		const project = makeProject();
		// No options are passed, so the default retention window applies.
		const oldFile = makeTraceFile(project, "pre-dispatch:old.jsonl", 45);
		const freshFile = makeTraceFile(project, "pre-dispatch:fresh.jsonl", 5);
		expect(existsSync(oldFile)).toBe(true);
		expect(existsSync(freshFile)).toBe(true);

		const result = pruneStaleTraces(project);
		expect(result.pruned).toBe(1);
		expect(existsSync(oldFile)).toBe(false);
		expect(existsSync(freshFile)).toBe(true);
	});

	test("respects a custom retention window", () => {
		const project = makeProject();
		const file = makeTraceFile(project, "pre-dispatch:tenday.jsonl", 10);
		const result = pruneStaleTraces(project, { retentionDays: 7 });
		expect(result.pruned).toBe(1);
		expect(existsSync(file)).toBe(false);
	});

	test("never touches the `latest` symlink", () => {
		const project = makeProject();
		const target = makeTraceFile(project, "pre-dispatch:current.jsonl", 0);
		const latest = join(project, ".sf", "traces", "latest");
		symlinkSync("pre-dispatch:current.jsonl", latest);
		// The target is fresh (0 days old), so neither it nor the link may be
		// removed. existsSync resolves the link, so a pruned target would make
		// the first assertion fail as well.
		pruneStaleTraces(project);
		expect(existsSync(latest)).toBe(true);
		expect(existsSync(target)).toBe(true);
	});

	test("ignores non-jsonl files", () => {
		const project = makeProject();
		const tracesDir = join(project, ".sf", "traces");
		mkdirSync(tracesDir, { recursive: true });
		const txt = join(tracesDir, "notes.txt");
		writeFileSync(txt, "ignored");
		// 90 days old: stale by age, so only the suffix filter can protect it.
		const epoch = (Date.now() - 90 * 24 * 60 * 60 * 1000) / 1000;
		utimesSync(txt, epoch, epoch);
		pruneStaleTraces(project);
		expect(existsSync(txt)).toBe(true);
	});

	test("returns zero-counts when traces dir does not exist", () => {
		const project = makeProject();
		// no traces dir
		const result = pruneStaleTraces(project);
		expect(result).toEqual({ scanned: 0, pruned: 0, errors: 0 });
	});

	test("respects maxDeletePerCall safety cap", () => {
		const project = makeProject();
		const staleFiles = [];
		for (let i = 0; i < 5; i++) {
			staleFiles.push(
				makeTraceFile(project, `pre-dispatch:old-${i}.jsonl`, 60),
			);
		}
		const result = pruneStaleTraces(project, { maxDeletePerCall: 2 });
		expect(result.pruned).toBe(2);
		// The cap must leave the other stale files on disk for a later call.
		const survivors = staleFiles.filter((file) => existsSync(file));
		expect(survivors.length).toBe(3);
	});

	test("does not throw on missing basePath", () => {
		expect(() => pruneStaleTraces("")).not.toThrow();
		expect(() => pruneStaleTraces(undefined)).not.toThrow();
	});
});
|
||||||
|
|
@ -328,13 +328,12 @@ export default function sfTui(pi) {
|
||||||
renderResult: ({ output }) => output,
|
renderResult: ({ output }) => output,
|
||||||
});
|
});
|
||||||
|
|
||||||
// ASK_USER_ELICITATION — structured form-based ask_user replacement.
|
// ask_user_elicitation — structured form-based ask_user replacement.
|
||||||
// When the flag is on and the agent calls this tool with choices, a TUI
|
// Shows a TUI select overlay when choices are provided, freeform input otherwise.
|
||||||
// select overlay is shown instead of a plain text prompt.
|
|
||||||
pi.registerTool({
|
pi.registerTool({
|
||||||
name: "ask_user_elicitation",
|
name: "ask_user_elicitation",
|
||||||
description:
|
description:
|
||||||
"Ask the user a question using a structured form with optional choices. When ASK_USER_ELICITATION is enabled this is preferred over plain ask_user for questions with known choices.",
|
"Ask the user a question using a structured form with optional choices. Shows a TUI select overlay when choices are provided, or a freeform text prompt otherwise.",
|
||||||
parameters: {
|
parameters: {
|
||||||
type: "object",
|
type: "object",
|
||||||
properties: {
|
properties: {
|
||||||
|
|
@ -359,12 +358,6 @@ export default function sfTui(pi) {
|
||||||
if (!ctx?.hasUI) {
|
if (!ctx?.hasUI) {
|
||||||
return { output: "No UI available for elicitation." };
|
return { output: "No UI available for elicitation." };
|
||||||
}
|
}
|
||||||
if (!getExperimentalFlag("ask_elicitation")) {
|
|
||||||
return {
|
|
||||||
output:
|
|
||||||
"ASK_USER_ELICITATION is not enabled. Run /experimental on ask_elicitation to enable.",
|
|
||||||
};
|
|
||||||
}
|
|
||||||
if (choices?.length) {
|
if (choices?.length) {
|
||||||
const answer = await ctx.ui.select(question, choices);
|
const answer = await ctx.ui.select(question, choices);
|
||||||
if (!answer && allow_freeform) {
|
if (!answer && allow_freeform) {
|
||||||
|
|
@ -379,121 +372,6 @@ export default function sfTui(pi) {
|
||||||
renderResult: ({ output }) => (output ? `**Answer:** ${output}` : ""),
|
renderResult: ({ output }) => (output ? `**Answer:** ${output}` : ""),
|
||||||
});
|
});
|
||||||
|
|
||||||
// MULTI_TURN_AGENTS — persistent named sub-agent sessions via file-backed state.
|
|
||||||
// Tool that spawns or resumes a named SF child process, relaying messages.
|
|
||||||
pi.registerTool({
|
|
||||||
name: "spawn_agent",
|
|
||||||
description:
|
|
||||||
"Spawn or resume a named persistent sub-agent. Sends a message and waits for the response. The agent persists across calls using file-backed state in .sf/agents/<name>/.",
|
|
||||||
parameters: {
|
|
||||||
type: "object",
|
|
||||||
properties: {
|
|
||||||
name: {
|
|
||||||
type: "string",
|
|
||||||
description:
|
|
||||||
"Unique agent name (alphanumeric + hyphens, e.g. 'researcher')",
|
|
||||||
},
|
|
||||||
message: {
|
|
||||||
type: "string",
|
|
||||||
description: "Message to send to the agent",
|
|
||||||
},
|
|
||||||
reset: {
|
|
||||||
type: "boolean",
|
|
||||||
description:
|
|
||||||
"If true, clear the agent's state and start fresh (default: false)",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
required: ["name", "message"],
|
|
||||||
},
|
|
||||||
execute: async ({ name, message, reset }) => {
|
|
||||||
if (!getExperimentalFlag("multi_turn_agents")) {
|
|
||||||
return {
|
|
||||||
output:
|
|
||||||
"MULTI_TURN_AGENTS is not enabled. Run /experimental on multi_turn_agents to enable.",
|
|
||||||
};
|
|
||||||
}
|
|
||||||
if (!/^[a-z0-9-]{1,32}$/i.test(name)) {
|
|
||||||
return {
|
|
||||||
output: "Agent name must be 1-32 alphanumeric/hyphen characters.",
|
|
||||||
};
|
|
||||||
}
|
|
||||||
const { join: pathJoin } = await import("node:path");
|
|
||||||
const { mkdirSync, writeFileSync, readFileSync, existsSync } =
|
|
||||||
await import("node:fs");
|
|
||||||
const stateDir = pathJoin(
|
|
||||||
projectRoot() ?? process.cwd(),
|
|
||||||
".sf",
|
|
||||||
"agents",
|
|
||||||
name,
|
|
||||||
);
|
|
||||||
mkdirSync(stateDir, { recursive: true });
|
|
||||||
const historyPath = pathJoin(stateDir, "history.jsonl");
|
|
||||||
if (reset && existsSync(historyPath)) {
|
|
||||||
writeFileSync(historyPath, "", "utf-8");
|
|
||||||
}
|
|
||||||
// Append user message to history
|
|
||||||
const entry = JSON.stringify({
|
|
||||||
role: "user",
|
|
||||||
content: message,
|
|
||||||
ts: Date.now(),
|
|
||||||
});
|
|
||||||
const { appendFileSync } = await import("node:fs");
|
|
||||||
appendFileSync(historyPath, `${entry}\n`, "utf-8");
|
|
||||||
// Dispatch to SF headless with the conversation history as context
|
|
||||||
const historyLines = existsSync(historyPath)
|
|
||||||
? readFileSync(historyPath, "utf-8")
|
|
||||||
.trim()
|
|
||||||
.split("\n")
|
|
||||||
.filter(Boolean)
|
|
||||||
.map((l) => {
|
|
||||||
try {
|
|
||||||
return JSON.parse(l);
|
|
||||||
} catch {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.filter(Boolean)
|
|
||||||
: [];
|
|
||||||
const contextMsg = historyLines
|
|
||||||
.slice(-10) // last 10 turns for context
|
|
||||||
.map((e) => `${e.role === "user" ? "User" : "Agent"}: ${e.content}`)
|
|
||||||
.join("\n");
|
|
||||||
const fullPrompt = `[Agent: ${name}]\n\nConversation history:\n${contextMsg}\n\nRespond to the last user message only.`;
|
|
||||||
const { execFile } = await import("node:child_process");
|
|
||||||
const { promisify } = await import("node:util");
|
|
||||||
const execFileAsync = promisify(execFile);
|
|
||||||
try {
|
|
||||||
const { stdout } = await execFileAsync(
|
|
||||||
process.execPath,
|
|
||||||
[
|
|
||||||
"-y",
|
|
||||||
"node@24",
|
|
||||||
process.env.SF_LOADER ?? "dist/loader.js",
|
|
||||||
"headless",
|
|
||||||
"--print",
|
|
||||||
fullPrompt,
|
|
||||||
],
|
|
||||||
{
|
|
||||||
timeout: 60000,
|
|
||||||
encoding: "utf-8",
|
|
||||||
env: { ...process.env },
|
|
||||||
},
|
|
||||||
);
|
|
||||||
const response = stdout.trim();
|
|
||||||
appendFileSync(
|
|
||||||
historyPath,
|
|
||||||
`${JSON.stringify({ role: "assistant", content: response, ts: Date.now() })}\n`,
|
|
||||||
"utf-8",
|
|
||||||
);
|
|
||||||
return { output: response };
|
|
||||||
} catch (err) {
|
|
||||||
return {
|
|
||||||
output: `Agent dispatch failed: ${getErrorMessage(err)}`,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
},
|
|
||||||
renderResult: ({ output }) => output,
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Run the STATUS_LINE user script on a 5s interval, posting stdout to footer. */
|
/** Run the STATUS_LINE user script on a 5s interval, posting stdout to footer. */
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,31 @@
|
||||||
*
|
*
|
||||||
* Consumer: AgentSwarm orchestrator, swarm role agents (CoordinatorAgent, WorkerAgent etc),
|
* Consumer: AgentSwarm orchestrator, swarm role agents (CoordinatorAgent, WorkerAgent etc),
|
||||||
* and direct use in multi-agent dispatch flows.
|
* and direct use in multi-agent dispatch flows.
|
||||||
|
*
|
||||||
|
* ## Current state
|
||||||
|
* This module implements the **container** half of a persistent agent: identity, inbox,
|
||||||
|
* memory blocks, and message routing. It does NOT implement the **runner** half.
|
||||||
|
*
|
||||||
|
* The missing piece is an LLM execution runner that:
|
||||||
|
* 1. Reads pending messages from this agent's inbox (`receive(true)`)
|
||||||
|
* 2. Assembles a prompt from core memory blocks + inbox messages
|
||||||
|
* 3. Dispatches to SF headless (`node dist/loader.js headless --print <prompt>`)
|
||||||
|
* 4. Writes the LLM response back into the bus as a reply
|
||||||
|
* 5. Updates memory blocks (eviction, summarization) when context grows large
|
||||||
|
*
|
||||||
|
* Until the runner exists, `PersistentAgent` is a passive store. The autonomous loop
|
||||||
|
* uses it this way for sleeptime memory consolidation (caller sends + immediately reads
|
||||||
|
* inbox). `SwarmDispatchLayer` also only enqueues messages — nothing processes them.
|
||||||
|
*
|
||||||
|
* When building the runner, key design decisions to make:
|
||||||
|
* - Context window management: how many inbox turns to include before summarizing
|
||||||
|
* - Memory eviction: which core blocks are injected, which are summarized to archival
|
||||||
|
* - Turn limits: max rounds before the runner yields and re-queues
|
||||||
|
* - Concurrency: one runner per agent name (enforce via DB lock or process mutex)
|
||||||
|
* - Error handling: failed LLM calls should leave the message as unread, not drop it
|
||||||
|
*
|
||||||
|
* See: Codex `codex-rs/core/src/agent/control.rs` for the reference implementation of
|
||||||
|
* typed parallel subagents (explorer/worker roles) with forked rollout history.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { randomUUID } from "node:crypto";
|
import { randomUUID } from "node:crypto";
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,18 @@
|
||||||
*
|
*
|
||||||
* Consumer: UOK kernel dispatch path, parallel orchestrators, and /sf autonomous controller
|
* Consumer: UOK kernel dispatch path, parallel orchestrators, and /sf autonomous controller
|
||||||
* when SF_A2A_ENABLED is set.
|
* when SF_A2A_ENABLED is set.
|
||||||
|
*
|
||||||
|
* ## Current state — enqueue only, no runner
|
||||||
|
* `_busDispatch` routes an envelope to a role agent's inbox via the MessageBus. It does NOT
|
||||||
|
* wait for a response — the `DispatchResult` contains only `messageId` and `targetAgent`,
|
||||||
|
* not LLM output. Nothing currently drains agent inboxes and runs LLM calls.
|
||||||
|
*
|
||||||
|
* This layer is ready to use once `PersistentAgent` gains a runner (see persistent-agent.js
|
||||||
|
* module comment for the runner design). At that point `dispatch()` can be extended to
|
||||||
|
* optionally block until the runner posts a reply to the bus.
|
||||||
|
*
|
||||||
|
* Callers outside uok/: none currently. The autonomous loop uses AgentSwarm directly for
|
||||||
|
* the sleeptime memory path. Wire this in when building the autonomous orchestrator.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { AgentSwarm } from "./agent-swarm.js";
|
import { AgentSwarm } from "./agent-swarm.js";
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@ import {
|
||||||
appendFileSync,
|
appendFileSync,
|
||||||
closeSync,
|
closeSync,
|
||||||
existsSync,
|
existsSync,
|
||||||
|
lstatSync,
|
||||||
mkdirSync,
|
mkdirSync,
|
||||||
openSync,
|
openSync,
|
||||||
readdirSync,
|
readdirSync,
|
||||||
|
|
@ -15,6 +16,12 @@ import {
|
||||||
import { join } from "node:path";
|
import { join } from "node:path";
|
||||||
import { sfRoot } from "../paths.js";
|
import { sfRoot } from "../paths.js";
|
||||||
|
|
||||||
|
// Longest read window currently used by any trace consumer
|
||||||
|
// (sf-db-gates.js:391 reads 30 days). Anything older than this is never
|
||||||
|
// read and just consumes disk.
|
||||||
|
const TRACE_RETENTION_DAYS_DEFAULT = 30;
|
||||||
|
const MS_PER_DAY = 24 * 60 * 60 * 1000;
|
||||||
|
|
||||||
function tracesDir(basePath) {
|
function tracesDir(basePath) {
|
||||||
return join(sfRoot(basePath), "traces");
|
return join(sfRoot(basePath), "traces");
|
||||||
}
|
}
|
||||||
|
|
@ -45,6 +52,64 @@ export function appendTraceEvent(basePath, traceId, event) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Prune .sf/traces/*.jsonl files older than retentionDays.
 *
 * Why: per-flow trace files accumulate one-per-dispatch and are never
 * cleaned. The longest analyzer window today is 30 days
 * (sf-db-gates.js:391); anything older is never read and just consumes
 * disk. The `latest` symlink is preserved unconditionally so the
 * tail-friendly pointer keeps working.
 *
 * Best-effort by design: it never throws. Unusable input yields zero
 * counts and per-file failures are tallied in `errors`.
 *
 * Consumer: session-start hook (idempotent, fast, best-effort).
 *
 * @param {string} basePath - project root; empty or non-string is a no-op.
 * @param {object} [opts] - `null` is treated like an empty object.
 * @param {number} [opts.retentionDays=30] - non-finite or negative values
 *   fall back to the default instead of producing a cutoff that would
 *   delete every trace file.
 * @param {number} [opts.maxDeletePerCall=1000] - safety cap so a runaway
 *   directory doesn't make startup slow. Clamped to at least 1; NaN falls
 *   back to the default.
 * @returns {{ scanned: number, pruned: number, errors: number }}
 */
export function pruneStaleTraces(basePath, opts = {}) {
	const result = { scanned: 0, pruned: 0, errors: 0 };
	if (!basePath || typeof basePath !== "string") return result;

	// The `= {}` default only covers `undefined`; an explicit null must not throw.
	const { retentionDays: rawRetention, maxDeletePerCall: rawCap } = opts ?? {};

	// A NaN cutoff makes `mtimeMs >= cutoff` always false, which would delete
	// every trace file; a negative window puts the cutoff in the future with
	// the same effect. Both are treated as "use the default".
	const requestedDays = Number(rawRetention ?? TRACE_RETENTION_DAYS_DEFAULT);
	const retentionDays =
		Number.isFinite(requestedDays) && requestedDays >= 0
			? requestedDays
			: TRACE_RETENTION_DAYS_DEFAULT;

	// Math.max(1, NaN) is NaN, and `pruned >= NaN` never trips the cap.
	const defaultMaxDelete = 1000;
	const requestedCap = Number(rawCap ?? defaultMaxDelete);
	const maxDeletePerCall = Number.isNaN(requestedCap)
		? defaultMaxDelete
		: Math.max(1, requestedCap);

	let dir;
	try {
		dir = tracesDir(basePath);
	} catch {
		// Cannot resolve the traces directory: nothing to prune.
		return result;
	}
	if (!existsSync(dir)) return result;

	const cutoff = Date.now() - retentionDays * MS_PER_DAY;

	let entries;
	try {
		entries = readdirSync(dir);
	} catch {
		// Directory vanished or is unreadable: report zero counts.
		return result;
	}

	for (const name of entries) {
		if (result.pruned >= maxDeletePerCall) break;
		if (name === "latest") continue;
		if (!name.endsWith(".jsonl")) continue;

		const path = join(dir, name);
		result.scanned += 1;
		try {
			// lstat so we don't follow a symlink (defensive — there shouldn't
			// be any besides `latest`, but never silently chase).
			const stat = lstatSync(path);
			if (!stat.isFile()) continue;
			if (stat.mtimeMs >= cutoff) continue;
			unlinkSync(path);
			result.pruned += 1;
		} catch {
			// Count the failure and keep going; one bad entry must not stop
			// the sweep.
			result.errors += 1;
		}
	}
	return result;
}
|
||||||
|
|
||||||
export function readTraceEvents(basePath, type, windowHours = 24) {
|
export function readTraceEvents(basePath, type, windowHours = 24) {
|
||||||
// Read all trace files modified within windowHours, filter by event type
|
// Read all trace files modified within windowHours, filter by event type
|
||||||
// Returns array of matching events
|
// Returns array of matching events
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue