diff --git a/.gitignore b/.gitignore index 9269d43a0..7e8e1874d 100644 --- a/.gitignore +++ b/.gitignore @@ -106,4 +106,17 @@ repowise.db .sf/scaffold-manifest.json .sf/interactive.lock .sf/interactive.lock.d/ +# SQLite WAL/SHM are ephemeral checkpoint files — only the .db is durable. +.sf/metrics.db-wal +.sf/metrics.db-shm +.sf/sf.db-wal +.sf/sf.db-shm +# Per-dispatch trace files accumulate one-per-request and are runtime-only. +# Consumers (sf-db-gates, adaptive verification policy) read by mtime window +# (24h–30d) — on-disk retention is needed, but git tracking is not. +.sf/traces/pre-dispatch:*.jsonl +.sf/traces/finalize:*.jsonl +.sf/traces/guard:*.jsonl +# `latest` is a symlink retargeted on every dispatch — pure git noise. +.sf/traces/latest test_output.log diff --git a/.sf/backups/db/maintenance.json b/.sf/backups/db/maintenance.json index 7c66cbff5..054f1799b 100644 --- a/.sf/backups/db/maintenance.json +++ b/.sf/backups/db/maintenance.json @@ -1,3 +1,3 @@ { - "lastFullVacuumAt": "2026-05-12T13:59:07.765Z" + "lastFullVacuumAt": "2026-05-12T20:58:28.744Z" } diff --git a/.sf/backups/db/sf.db.2026-05-10T05-22-28-577Z b/.sf/backups/db/sf.db.2026-05-10T05-22-28-577Z deleted file mode 100644 index 8ace58971..000000000 Binary files a/.sf/backups/db/sf.db.2026-05-10T05-22-28-577Z and /dev/null differ diff --git a/.sf/backups/db/sf.db.2026-05-10T05-37-52-529Z b/.sf/backups/db/sf.db.2026-05-10T05-37-52-529Z deleted file mode 100644 index f18e61f46..000000000 Binary files a/.sf/backups/db/sf.db.2026-05-10T05-37-52-529Z and /dev/null differ diff --git a/.sf/backups/db/sf.db.2026-05-10T05-57-58-732Z b/.sf/backups/db/sf.db.2026-05-10T05-57-58-732Z deleted file mode 100644 index 43ca86cfe..000000000 Binary files a/.sf/backups/db/sf.db.2026-05-10T05-57-58-732Z and /dev/null differ diff --git a/.sf/backups/db/sf.db.2026-05-12T20-58-28-491Z b/.sf/backups/db/sf.db.2026-05-12T20-58-28-491Z new file mode 100644 index 000000000..91e5bc7c8 Binary files /dev/null and 
b/.sf/backups/db/sf.db.2026-05-12T20-58-28-491Z differ diff --git a/.sf/backups/db/sf.db.2026-05-12T21-15-56-990Z b/.sf/backups/db/sf.db.2026-05-12T21-15-56-990Z new file mode 100644 index 000000000..b8c23051f Binary files /dev/null and b/.sf/backups/db/sf.db.2026-05-12T21-15-56-990Z differ diff --git a/.sf/backups/db/sf.db.2026-05-12T23-50-31-488Z b/.sf/backups/db/sf.db.2026-05-12T23-50-31-488Z new file mode 100644 index 000000000..5f65b2570 Binary files /dev/null and b/.sf/backups/db/sf.db.2026-05-12T23-50-31-488Z differ diff --git a/.sf/graphs/graph.json b/.sf/graphs/graph.json index 34f04f93b..5a422b3e7 100644 --- a/.sf/graphs/graph.json +++ b/.sf/graphs/graph.json @@ -60,5 +60,5 @@ "confidence": "EXTRACTED" } ], - "builtAt": "2026-05-12T15:26:43.252Z" + "builtAt": "2026-05-12T23:53:23.408Z" } \ No newline at end of file diff --git a/.sf/metrics.db b/.sf/metrics.db index ef8afc4bb..1c383bba1 100644 Binary files a/.sf/metrics.db and b/.sf/metrics.db differ diff --git a/.sf/metrics.db-shm b/.sf/metrics.db-shm index 20c56f6c4..0cd251d62 100644 Binary files a/.sf/metrics.db-shm and b/.sf/metrics.db-shm differ diff --git a/.sf/metrics.db-wal b/.sf/metrics.db-wal index 0a32c4de8..fe258b65e 100644 Binary files a/.sf/metrics.db-wal and b/.sf/metrics.db-wal differ diff --git a/.sf/model-catalog/mistral.json b/.sf/model-catalog/mistral.json index 11aef6ccf..0ff5ab62c 100644 --- a/.sf/model-catalog/mistral.json +++ b/.sf/model-catalog/mistral.json @@ -1 +1 @@ 
-{"fetchedAt":"2026-05-12T14:54:31.656Z","modelIds":["mistral-medium-2505","mistral-medium-2508","mistral-medium-latest","mistral-medium","mistral-vibe-cli-with-tools","open-mistral-nemo","open-mistral-nemo-2407","mistral-tiny-2407","mistral-tiny-latest","codestral-2508","codestral-latest","devstral-2512","devstral-medium-latest","devstral-latest","mistral-small-2603","mistral-small-latest","mistral-vibe-cli-fast","magistral-small-latest","magistral-medium-2509","magistral-medium-latest","labs-leanstral-2603","mistral-large-2512","mistral-large-latest","mistral-large-2512","mistral-large-latest","ministral-3b-2512","ministral-3b-latest","ministral-8b-2512","ministral-8b-latest","ministral-14b-2512","ministral-14b-latest","mistral-medium-3-5","mistral-medium-3.5","mistral-medium-3","mistral-medium-2604","mistral-medium-c21211-r0-75","mistral-vibe-cli-latest","mistral-large-2411","pixtral-large-2411","pixtral-large-latest","mistral-large-pixtral-2411","devstral-small-2507","devstral-medium-2507","magistral-small-2509","mistral-small-2506"]} \ No newline at end of file 
+{"fetchedAt":"2026-05-12T21:25:20.919Z","modelIds":["mistral-medium-2505","mistral-medium-2508","mistral-medium-latest","mistral-medium","mistral-vibe-cli-with-tools","open-mistral-nemo","open-mistral-nemo-2407","mistral-tiny-2407","mistral-tiny-latest","codestral-2508","codestral-latest","devstral-2512","devstral-medium-latest","devstral-latest","mistral-small-2603","mistral-small-latest","mistral-vibe-cli-fast","magistral-small-latest","magistral-medium-2509","magistral-medium-latest","labs-leanstral-2603","mistral-large-2512","mistral-large-latest","mistral-large-2512","mistral-large-latest","ministral-3b-2512","ministral-3b-latest","ministral-8b-2512","ministral-8b-latest","ministral-14b-2512","ministral-14b-latest","mistral-medium-3-5","mistral-medium-3.5","mistral-medium-3","mistral-medium-2604","mistral-medium-c21211-r0-75","mistral-vibe-cli-latest","mistral-large-2411","pixtral-large-2411","pixtral-large-latest","mistral-large-pixtral-2411","devstral-small-2507","devstral-medium-2507","magistral-small-2509","mistral-small-2506"]} \ No newline at end of file diff --git a/.sf/model-catalog/openrouter.json b/.sf/model-catalog/openrouter.json index a4361cf81..d4e7045e4 100644 --- a/.sf/model-catalog/openrouter.json +++ b/.sf/model-catalog/openrouter.json @@ -1 +1 @@ 
-{"fetchedAt":"2026-05-12T14:47:40.438Z","modelIds":["inclusionai/ring-2.6-1t:free","google/gemini-3.1-flash-lite","baidu/cobuddy:free","openai/gpt-chat-latest","x-ai/grok-4.3","ibm-granite/granite-4.1-8b","mistralai/mistral-medium-3-5","openrouter/owl-alpha","nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free","poolside/laguna-xs.2:free","poolside/laguna-m.1:free","~anthropic/claude-haiku-latest","~openai/gpt-mini-latest","~google/gemini-pro-latest","~moonshotai/kimi-latest","~google/gemini-flash-latest","~anthropic/claude-sonnet-latest","~openai/gpt-latest","qwen/qwen3.5-plus-20260420","qwen/qwen3.6-flash","qwen/qwen3.6-35b-a3b","qwen/qwen3.6-max-preview","qwen/qwen3.6-27b","openai/gpt-5.5-pro","openai/gpt-5.5","deepseek/deepseek-v4-pro","deepseek/deepseek-v4-flash","inclusionai/ling-2.6-1t","tencent/hy3-preview","xiaomi/mimo-v2.5-pro","xiaomi/mimo-v2.5","openai/gpt-5.4-image-2","inclusionai/ling-2.6-flash","~anthropic/claude-opus-latest","openrouter/pareto-code","baidu/qianfan-ocr-fast:free","moonshotai/kimi-k2.6","anthropic/claude-opus-4.7","anthropic/claude-opus-4.6-fast","z-ai/glm-5.1","google/gemma-4-26b-a4b-it:free","google/gemma-4-26b-a4b-it","google/gemma-4-31b-it:free","google/gemma-4-31b-it","qwen/qwen3.6-plus","z-ai/glm-5v-turbo","arcee-ai/trinity-large-thinking:free","arcee-ai/trinity-large-thinking","x-ai/grok-4.20-multi-agent","x-ai/grok-4.20","google/lyria-3-pro-preview","google/lyria-3-clip-preview","kwaipilot/kat-coder-pro-v2","rekaai/reka-edge","xiaomi/mimo-v2-omni","xiaomi/mimo-v2-pro","minimax/minimax-m2.7","openai/gpt-5.4-nano","openai/gpt-5.4-mini","mistralai/mistral-small-2603","z-ai/glm-5-turbo","nvidia/nemotron-3-super-120b-a12b:free","nvidia/nemotron-3-super-120b-a12b","bytedance-seed/seed-2.0-lite","qwen/qwen3.5-9b","openai/gpt-5.4-pro","openai/gpt-5.4","inception/mercury-2","openai/gpt-5.3-chat","google/gemini-3.1-flash-lite-preview","bytedance-seed/seed-2.0-mini","google/gemini-3.1-flash-image-preview","qwen/qwen3.5-35b-a3b","qwen/qwen
3.5-27b","qwen/qwen3.5-122b-a10b","qwen/qwen3.5-flash-02-23","liquid/lfm-2-24b-a2b","google/gemini-3.1-pro-preview-customtools","openai/gpt-5.3-codex","aion-labs/aion-2.0","google/gemini-3.1-pro-preview","anthropic/claude-sonnet-4.6","qwen/qwen3.5-plus-02-15","qwen/qwen3.5-397b-a17b","minimax/minimax-m2.5:free","minimax/minimax-m2.5","z-ai/glm-5","qwen/qwen3-max-thinking","anthropic/claude-opus-4.6","qwen/qwen3-coder-next","openrouter/free","stepfun/step-3.5-flash","arcee-ai/trinity-large-preview","moonshotai/kimi-k2.5","upstage/solar-pro-3","minimax/minimax-m2-her","writer/palmyra-x5","liquid/lfm-2.5-1.2b-thinking:free","liquid/lfm-2.5-1.2b-instruct:free","openai/gpt-audio","openai/gpt-audio-mini","z-ai/glm-4.7-flash","openai/gpt-5.2-codex","bytedance-seed/seed-1.6-flash","bytedance-seed/seed-1.6","minimax/minimax-m2.1","z-ai/glm-4.7","google/gemini-3-flash-preview","xiaomi/mimo-v2-flash","nvidia/nemotron-3-nano-30b-a3b:free","nvidia/nemotron-3-nano-30b-a3b","openai/gpt-5.2-chat","openai/gpt-5.2-pro","openai/gpt-5.2","mistralai/devstral-2512","relace/relace-search","z-ai/glm-4.6v","nex-agi/deepseek-v3.1-nex-n1","essentialai/rnj-1-instruct","openrouter/bodybuilder","openai/gpt-5.1-codex-max","amazon/nova-2-lite-v1","mistralai/ministral-14b-2512","mistralai/ministral-8b-2512","mistralai/ministral-3b-2512","mistralai/mistral-large-2512","arcee-ai/trinity-mini","deepseek/deepseek-v3.2-speciale","deepseek/deepseek-v3.2","prime-intellect/intellect-3","anthropic/claude-opus-4.5","allenai/olmo-3-32b-think","google/gemini-3-pro-image-preview","x-ai/grok-4.1-fast","deepcogito/cogito-v2.1-671b","openai/gpt-5.1","openai/gpt-5.1-chat","openai/gpt-5.1-codex","openai/gpt-5.1-codex-mini","moonshotai/kimi-k2-thinking","amazon/nova-premier-v1","perplexity/sonar-pro-search","mistralai/voxtral-small-24b-2507","openai/gpt-oss-safeguard-20b","nvidia/nemotron-nano-12b-v2-vl:free","minimax/minimax-m2","qwen/qwen3-vl-32b-instruct","ibm-granite/granite-4.0-h-micro","microsoft/phi-4-mini-ins
truct","openai/gpt-5-image-mini","anthropic/claude-haiku-4.5","qwen/qwen3-vl-8b-thinking","qwen/qwen3-vl-8b-instruct","openai/gpt-5-image","openai/o3-deep-research","openai/o4-mini-deep-research","nvidia/llama-3.3-nemotron-super-49b-v1.5","baidu/ernie-4.5-21b-a3b-thinking","google/gemini-2.5-flash-image","qwen/qwen3-vl-30b-a3b-thinking","qwen/qwen3-vl-30b-a3b-instruct","openai/gpt-5-pro","z-ai/glm-4.6","anthropic/claude-sonnet-4.5","deepseek/deepseek-v3.2-exp","thedrummer/cydonia-24b-v4.1","relace/relace-apply-3","google/gemini-2.5-flash-lite-preview-09-2025","qwen/qwen3-vl-235b-a22b-thinking","qwen/qwen3-vl-235b-a22b-instruct","qwen/qwen3-max","qwen/qwen3-coder-plus","openai/gpt-5-codex","deepseek/deepseek-v3.1-terminus","x-ai/grok-4-fast","alibaba/tongyi-deepresearch-30b-a3b","qwen/qwen3-coder-flash","qwen/qwen3-next-80b-a3b-thinking","qwen/qwen3-next-80b-a3b-instruct:free","qwen/qwen3-next-80b-a3b-instruct","qwen/qwen-plus-2025-07-28:thinking","qwen/qwen-plus-2025-07-28","nvidia/nemotron-nano-9b-v2:free","nvidia/nemotron-nano-9b-v2","moonshotai/kimi-k2-0905","qwen/qwen3-30b-a3b-thinking-2507","x-ai/grok-code-fast-1","nousresearch/hermes-4-70b","nousresearch/hermes-4-405b","deepseek/deepseek-chat-v3.1","openai/gpt-4o-audio-preview","mistralai/mistral-medium-3.1","baidu/ernie-4.5-21b-a3b","baidu/ernie-4.5-vl-28b-a3b","z-ai/glm-4.5v","ai21/jamba-large-1.7","openai/gpt-5-chat","openai/gpt-5","openai/gpt-5-mini","openai/gpt-5-nano","openai/gpt-oss-120b:free","openai/gpt-oss-120b","openai/gpt-oss-20b:free","openai/gpt-oss-20b","anthropic/claude-opus-4.1","mistralai/codestral-2508","qwen/qwen3-coder-30b-a3b-instruct","qwen/qwen3-30b-a3b-instruct-2507","z-ai/glm-4.5","z-ai/glm-4.5-air:free","z-ai/glm-4.5-air","qwen/qwen3-235b-a22b-thinking-2507","z-ai/glm-4-32b","qwen/qwen3-coder:free","qwen/qwen3-coder","bytedance/ui-tars-1.5-7b","google/gemini-2.5-flash-lite","qwen/qwen3-235b-a22b-2507","switchpoint/router","moonshotai/kimi-k2","mistralai/devstral-medium","mistralai/de
vstral-small","cognitivecomputations/dolphin-mistral-24b-venice-edition:free","x-ai/grok-4","tencent/hunyuan-a13b-instruct","morph/morph-v3-large","morph/morph-v3-fast","baidu/ernie-4.5-vl-424b-a47b","baidu/ernie-4.5-300b-a47b","mistralai/mistral-small-3.2-24b-instruct","minimax/minimax-m1","google/gemini-2.5-flash","google/gemini-2.5-pro","openai/o3-pro","x-ai/grok-3-mini","x-ai/grok-3","google/gemini-2.5-pro-preview","deepseek/deepseek-r1-0528","anthropic/claude-opus-4","anthropic/claude-sonnet-4","google/gemma-3n-e4b-it","mistralai/mistral-medium-3","google/gemini-2.5-pro-preview-05-06","arcee-ai/spotlight","arcee-ai/maestro-reasoning","arcee-ai/virtuoso-large","arcee-ai/coder-large","meta-llama/llama-guard-4-12b","qwen/qwen3-30b-a3b","qwen/qwen3-8b","qwen/qwen3-14b","qwen/qwen3-32b","qwen/qwen3-235b-a22b","openai/o4-mini-high","openai/o3","openai/o4-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","alfredpros/codellama-7b-instruct-solidity","x-ai/grok-3-mini-beta","x-ai/grok-3-beta","meta-llama/llama-4-maverick","meta-llama/llama-4-scout","deepseek/deepseek-chat-v3-0324","openai/o1-pro","mistralai/mistral-small-3.1-24b-instruct","google/gemma-3-4b-it","google/gemma-3-12b-it","cohere/command-a","openai/gpt-4o-mini-search-preview","openai/gpt-4o-search-preview","rekaai/reka-flash-3","google/gemma-3-27b-it","thedrummer/skyfall-36b-v2","perplexity/sonar-reasoning-pro","perplexity/sonar-pro","perplexity/sonar-deep-research","google/gemini-2.0-flash-lite-001","mistralai/mistral-saba","meta-llama/llama-guard-3-8b","openai/o3-mini-high","google/gemini-2.0-flash-001","qwen/qwen-vl-plus","aion-labs/aion-1.0","aion-labs/aion-1.0-mini","aion-labs/aion-rp-llama-3.1-8b","qwen/qwen-vl-max","qwen/qwen-turbo","qwen/qwen2.5-vl-72b-instruct","qwen/qwen-plus","qwen/qwen-max","openai/o3-mini","mistralai/mistral-small-24b-instruct-2501","deepseek/deepseek-r1-distill-qwen-32b","perplexity/sonar","deepseek/deepseek-r1-distill-llama-70b","deepseek/deepseek-r1","minimax
/minimax-01","microsoft/phi-4","sao10k/l3.1-70b-hanami-x1","deepseek/deepseek-chat","sao10k/l3.3-euryale-70b","openai/o1","cohere/command-r7b-12-2024","meta-llama/llama-3.3-70b-instruct:free","meta-llama/llama-3.3-70b-instruct","amazon/nova-lite-v1","amazon/nova-micro-v1","amazon/nova-pro-v1","openai/gpt-4o-2024-11-20","mistralai/mistral-large-2411","mistralai/mistral-large-2407","mistralai/pixtral-large-2411","qwen/qwen-2.5-coder-32b-instruct","thedrummer/unslopnemo-12b","anthropic/claude-3.5-haiku","anthracite-org/magnum-v4-72b","qwen/qwen-2.5-7b-instruct","inflection/inflection-3-pi","inflection/inflection-3-productivity","thedrummer/rocinante-12b","meta-llama/llama-3.2-3b-instruct:free","meta-llama/llama-3.2-3b-instruct","meta-llama/llama-3.2-1b-instruct","meta-llama/llama-3.2-11b-vision-instruct","qwen/qwen-2.5-72b-instruct","cohere/command-r-08-2024","cohere/command-r-plus-08-2024","sao10k/l3.1-euryale-70b","nousresearch/hermes-3-llama-3.1-70b","nousresearch/hermes-3-llama-3.1-405b:free","nousresearch/hermes-3-llama-3.1-405b","sao10k/l3-lunaris-8b","openai/gpt-4o-2024-08-06","meta-llama/llama-3.1-70b-instruct","meta-llama/llama-3.1-8b-instruct","mistralai/mistral-nemo","openai/gpt-4o-mini-2024-07-18","openai/gpt-4o-mini","google/gemma-2-27b-it","sao10k/l3-euryale-70b","nousresearch/hermes-2-pro-llama-3-8b","openai/gpt-4o-2024-05-13","openai/gpt-4o","meta-llama/llama-3-8b-instruct","meta-llama/llama-3-70b-instruct","mistralai/mixtral-8x22b-instruct","microsoft/wizardlm-2-8x22b","openai/gpt-4-turbo","anthropic/claude-3-haiku","mistralai/mistral-large","openai/gpt-3.5-turbo-0613","openai/gpt-4-turbo-preview","openrouter/auto","openai/gpt-4-1106-preview","openai/gpt-3.5-turbo-instruct","mistralai/mistral-7b-instruct-v0.1","openai/gpt-3.5-turbo-16k","mancer/weaver","undi95/remm-slerp-l2-13b","gryphe/mythomax-l2-13b","openai/gpt-4","openai/gpt-3.5-turbo","openai/gpt-4-0314"]} \ No newline at end of file 
+{"fetchedAt":"2026-05-12T21:25:21.288Z","modelIds":["anthropic/claude-opus-4.7-fast","perceptron/perceptron-mk1","inclusionai/ring-2.6-1t:free","google/gemini-3.1-flash-lite","baidu/cobuddy:free","openai/gpt-chat-latest","x-ai/grok-4.3","ibm-granite/granite-4.1-8b","mistralai/mistral-medium-3-5","openrouter/owl-alpha","nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free","poolside/laguna-xs.2:free","poolside/laguna-m.1:free","~anthropic/claude-haiku-latest","~openai/gpt-mini-latest","~google/gemini-pro-latest","~moonshotai/kimi-latest","~google/gemini-flash-latest","~anthropic/claude-sonnet-latest","~openai/gpt-latest","qwen/qwen3.5-plus-20260420","qwen/qwen3.6-flash","qwen/qwen3.6-35b-a3b","qwen/qwen3.6-max-preview","qwen/qwen3.6-27b","openai/gpt-5.5-pro","openai/gpt-5.5","deepseek/deepseek-v4-pro","deepseek/deepseek-v4-flash","inclusionai/ling-2.6-1t","tencent/hy3-preview","xiaomi/mimo-v2.5-pro","xiaomi/mimo-v2.5","openai/gpt-5.4-image-2","inclusionai/ling-2.6-flash","~anthropic/claude-opus-latest","openrouter/pareto-code","baidu/qianfan-ocr-fast:free","moonshotai/kimi-k2.6","anthropic/claude-opus-4.7","anthropic/claude-opus-4.6-fast","z-ai/glm-5.1","google/gemma-4-26b-a4b-it:free","google/gemma-4-26b-a4b-it","google/gemma-4-31b-it:free","google/gemma-4-31b-it","qwen/qwen3.6-plus","z-ai/glm-5v-turbo","arcee-ai/trinity-large-thinking:free","arcee-ai/trinity-large-thinking","x-ai/grok-4.20-multi-agent","x-ai/grok-4.20","google/lyria-3-pro-preview","google/lyria-3-clip-preview","kwaipilot/kat-coder-pro-v2","rekaai/reka-edge","xiaomi/mimo-v2-omni","xiaomi/mimo-v2-pro","minimax/minimax-m2.7","openai/gpt-5.4-nano","openai/gpt-5.4-mini","mistralai/mistral-small-2603","z-ai/glm-5-turbo","nvidia/nemotron-3-super-120b-a12b:free","nvidia/nemotron-3-super-120b-a12b","bytedance-seed/seed-2.0-lite","qwen/qwen3.5-9b","openai/gpt-5.4-pro","openai/gpt-5.4","inception/mercury-2","openai/gpt-5.3-chat","google/gemini-3.1-flash-lite-preview","bytedance-seed/seed-2.0-mini","google/gemi
ni-3.1-flash-image-preview","qwen/qwen3.5-35b-a3b","qwen/qwen3.5-27b","qwen/qwen3.5-122b-a10b","qwen/qwen3.5-flash-02-23","liquid/lfm-2-24b-a2b","google/gemini-3.1-pro-preview-customtools","openai/gpt-5.3-codex","aion-labs/aion-2.0","google/gemini-3.1-pro-preview","anthropic/claude-sonnet-4.6","qwen/qwen3.5-plus-02-15","qwen/qwen3.5-397b-a17b","minimax/minimax-m2.5:free","minimax/minimax-m2.5","z-ai/glm-5","qwen/qwen3-max-thinking","anthropic/claude-opus-4.6","qwen/qwen3-coder-next","openrouter/free","stepfun/step-3.5-flash","arcee-ai/trinity-large-preview","moonshotai/kimi-k2.5","upstage/solar-pro-3","minimax/minimax-m2-her","writer/palmyra-x5","liquid/lfm-2.5-1.2b-thinking:free","liquid/lfm-2.5-1.2b-instruct:free","openai/gpt-audio","openai/gpt-audio-mini","z-ai/glm-4.7-flash","openai/gpt-5.2-codex","bytedance-seed/seed-1.6-flash","bytedance-seed/seed-1.6","minimax/minimax-m2.1","z-ai/glm-4.7","google/gemini-3-flash-preview","xiaomi/mimo-v2-flash","nvidia/nemotron-3-nano-30b-a3b:free","nvidia/nemotron-3-nano-30b-a3b","openai/gpt-5.2-chat","openai/gpt-5.2-pro","openai/gpt-5.2","mistralai/devstral-2512","relace/relace-search","z-ai/glm-4.6v","nex-agi/deepseek-v3.1-nex-n1","essentialai/rnj-1-instruct","openrouter/bodybuilder","openai/gpt-5.1-codex-max","amazon/nova-2-lite-v1","mistralai/ministral-14b-2512","mistralai/ministral-8b-2512","mistralai/ministral-3b-2512","mistralai/mistral-large-2512","arcee-ai/trinity-mini","deepseek/deepseek-v3.2-speciale","deepseek/deepseek-v3.2","prime-intellect/intellect-3","anthropic/claude-opus-4.5","allenai/olmo-3-32b-think","google/gemini-3-pro-image-preview","x-ai/grok-4.1-fast","deepcogito/cogito-v2.1-671b","openai/gpt-5.1","openai/gpt-5.1-chat","openai/gpt-5.1-codex","openai/gpt-5.1-codex-mini","moonshotai/kimi-k2-thinking","amazon/nova-premier-v1","perplexity/sonar-pro-search","mistralai/voxtral-small-24b-2507","openai/gpt-oss-safeguard-20b","nvidia/nemotron-nano-12b-v2-vl:free","minimax/minimax-m2","qwen/qwen3-vl-32b-instruct
","ibm-granite/granite-4.0-h-micro","microsoft/phi-4-mini-instruct","openai/gpt-5-image-mini","anthropic/claude-haiku-4.5","qwen/qwen3-vl-8b-thinking","qwen/qwen3-vl-8b-instruct","openai/gpt-5-image","openai/o3-deep-research","openai/o4-mini-deep-research","nvidia/llama-3.3-nemotron-super-49b-v1.5","baidu/ernie-4.5-21b-a3b-thinking","google/gemini-2.5-flash-image","qwen/qwen3-vl-30b-a3b-thinking","qwen/qwen3-vl-30b-a3b-instruct","openai/gpt-5-pro","z-ai/glm-4.6","anthropic/claude-sonnet-4.5","deepseek/deepseek-v3.2-exp","thedrummer/cydonia-24b-v4.1","relace/relace-apply-3","google/gemini-2.5-flash-lite-preview-09-2025","qwen/qwen3-vl-235b-a22b-thinking","qwen/qwen3-vl-235b-a22b-instruct","qwen/qwen3-max","qwen/qwen3-coder-plus","openai/gpt-5-codex","deepseek/deepseek-v3.1-terminus","x-ai/grok-4-fast","alibaba/tongyi-deepresearch-30b-a3b","qwen/qwen3-coder-flash","qwen/qwen3-next-80b-a3b-thinking","qwen/qwen3-next-80b-a3b-instruct:free","qwen/qwen3-next-80b-a3b-instruct","qwen/qwen-plus-2025-07-28:thinking","qwen/qwen-plus-2025-07-28","nvidia/nemotron-nano-9b-v2:free","nvidia/nemotron-nano-9b-v2","moonshotai/kimi-k2-0905","qwen/qwen3-30b-a3b-thinking-2507","x-ai/grok-code-fast-1","nousresearch/hermes-4-70b","nousresearch/hermes-4-405b","deepseek/deepseek-chat-v3.1","openai/gpt-4o-audio-preview","mistralai/mistral-medium-3.1","baidu/ernie-4.5-21b-a3b","baidu/ernie-4.5-vl-28b-a3b","z-ai/glm-4.5v","ai21/jamba-large-1.7","openai/gpt-5-chat","openai/gpt-5","openai/gpt-5-mini","openai/gpt-5-nano","openai/gpt-oss-120b:free","openai/gpt-oss-120b","openai/gpt-oss-20b:free","openai/gpt-oss-20b","anthropic/claude-opus-4.1","mistralai/codestral-2508","qwen/qwen3-coder-30b-a3b-instruct","qwen/qwen3-30b-a3b-instruct-2507","z-ai/glm-4.5","z-ai/glm-4.5-air:free","z-ai/glm-4.5-air","qwen/qwen3-235b-a22b-thinking-2507","z-ai/glm-4-32b","qwen/qwen3-coder:free","qwen/qwen3-coder","bytedance/ui-tars-1.5-7b","google/gemini-2.5-flash-lite","qwen/qwen3-235b-a22b-2507","switchpoint/router","
moonshotai/kimi-k2","mistralai/devstral-medium","mistralai/devstral-small","cognitivecomputations/dolphin-mistral-24b-venice-edition:free","x-ai/grok-4","tencent/hunyuan-a13b-instruct","morph/morph-v3-large","morph/morph-v3-fast","baidu/ernie-4.5-vl-424b-a47b","baidu/ernie-4.5-300b-a47b","mistralai/mistral-small-3.2-24b-instruct","minimax/minimax-m1","google/gemini-2.5-flash","google/gemini-2.5-pro","openai/o3-pro","x-ai/grok-3-mini","x-ai/grok-3","google/gemini-2.5-pro-preview","deepseek/deepseek-r1-0528","anthropic/claude-opus-4","anthropic/claude-sonnet-4","google/gemma-3n-e4b-it","mistralai/mistral-medium-3","google/gemini-2.5-pro-preview-05-06","arcee-ai/spotlight","arcee-ai/maestro-reasoning","arcee-ai/virtuoso-large","arcee-ai/coder-large","meta-llama/llama-guard-4-12b","qwen/qwen3-30b-a3b","qwen/qwen3-8b","qwen/qwen3-14b","qwen/qwen3-32b","qwen/qwen3-235b-a22b","openai/o4-mini-high","openai/o3","openai/o4-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","alfredpros/codellama-7b-instruct-solidity","x-ai/grok-3-mini-beta","x-ai/grok-3-beta","meta-llama/llama-4-maverick","meta-llama/llama-4-scout","deepseek/deepseek-chat-v3-0324","openai/o1-pro","mistralai/mistral-small-3.1-24b-instruct","google/gemma-3-4b-it","google/gemma-3-12b-it","cohere/command-a","openai/gpt-4o-mini-search-preview","openai/gpt-4o-search-preview","rekaai/reka-flash-3","google/gemma-3-27b-it","thedrummer/skyfall-36b-v2","perplexity/sonar-reasoning-pro","perplexity/sonar-pro","perplexity/sonar-deep-research","google/gemini-2.0-flash-lite-001","mistralai/mistral-saba","meta-llama/llama-guard-3-8b","openai/o3-mini-high","google/gemini-2.0-flash-001","qwen/qwen-vl-plus","aion-labs/aion-1.0","aion-labs/aion-1.0-mini","aion-labs/aion-rp-llama-3.1-8b","qwen/qwen-vl-max","qwen/qwen-turbo","qwen/qwen2.5-vl-72b-instruct","qwen/qwen-plus","qwen/qwen-max","openai/o3-mini","mistralai/mistral-small-24b-instruct-2501","deepseek/deepseek-r1-distill-qwen-32b","perplexity/sonar","deepseek/d
eepseek-r1-distill-llama-70b","deepseek/deepseek-r1","minimax/minimax-01","microsoft/phi-4","sao10k/l3.1-70b-hanami-x1","deepseek/deepseek-chat","sao10k/l3.3-euryale-70b","openai/o1","cohere/command-r7b-12-2024","meta-llama/llama-3.3-70b-instruct:free","meta-llama/llama-3.3-70b-instruct","amazon/nova-lite-v1","amazon/nova-micro-v1","amazon/nova-pro-v1","openai/gpt-4o-2024-11-20","mistralai/mistral-large-2411","mistralai/mistral-large-2407","mistralai/pixtral-large-2411","qwen/qwen-2.5-coder-32b-instruct","thedrummer/unslopnemo-12b","anthropic/claude-3.5-haiku","anthracite-org/magnum-v4-72b","qwen/qwen-2.5-7b-instruct","inflection/inflection-3-pi","inflection/inflection-3-productivity","thedrummer/rocinante-12b","meta-llama/llama-3.2-1b-instruct","meta-llama/llama-3.2-3b-instruct:free","meta-llama/llama-3.2-3b-instruct","meta-llama/llama-3.2-11b-vision-instruct","qwen/qwen-2.5-72b-instruct","cohere/command-r-plus-08-2024","cohere/command-r-08-2024","sao10k/l3.1-euryale-70b","nousresearch/hermes-3-llama-3.1-70b","nousresearch/hermes-3-llama-3.1-405b:free","nousresearch/hermes-3-llama-3.1-405b","sao10k/l3-lunaris-8b","openai/gpt-4o-2024-08-06","meta-llama/llama-3.1-8b-instruct","meta-llama/llama-3.1-70b-instruct","mistralai/mistral-nemo","openai/gpt-4o-mini","openai/gpt-4o-mini-2024-07-18","google/gemma-2-27b-it","sao10k/l3-euryale-70b","nousresearch/hermes-2-pro-llama-3-8b","openai/gpt-4o","openai/gpt-4o-2024-05-13","meta-llama/llama-3-8b-instruct","meta-llama/llama-3-70b-instruct","mistralai/mixtral-8x22b-instruct","microsoft/wizardlm-2-8x22b","openai/gpt-4-turbo","anthropic/claude-3-haiku","mistralai/mistral-large","openai/gpt-3.5-turbo-0613","openai/gpt-4-turbo-preview","openrouter/auto","openai/gpt-4-1106-preview","mistralai/mistral-7b-instruct-v0.1","openai/gpt-3.5-turbo-instruct","openai/gpt-3.5-turbo-16k","mancer/weaver","undi95/remm-slerp-l2-13b","gryphe/mythomax-l2-13b","openai/gpt-4","openai/gpt-4-0314","openai/gpt-3.5-turbo"]} \ No newline at end of file 
diff --git a/.sf/model-performance.json b/.sf/model-performance.json index 686ca334c..8c5225755 100644 --- a/.sf/model-performance.json +++ b/.sf/model-performance.json @@ -109,26 +109,26 @@ "total": 1 }, "kimi-coding/kimi-k2.6": { - "successes": 1, + "successes": 2, "failures": 0, "timeouts": 0, - "totalTokens": 1821480, - "totalCost": 0, - "lastUsed": "2026-05-12T20:57:45.179Z", + "totalTokens": 1892068, + "totalCost": 0.030715552, + "lastUsed": "2026-05-12T23:58:57.132Z", "successRate": 1, - "total": 1 + "total": 2 } }, "complete-slice": { "kimi-coding/kimi-k2.6": { - "successes": 1, + "successes": 2, "failures": 0, "timeouts": 0, - "totalTokens": 719526, - "totalCost": 0.026709, - "lastUsed": "2026-05-12T15:26:57.708Z", + "totalTokens": 814376, + "totalCost": 0.053080319800000005, + "lastUsed": "2026-05-12T23:54:01.143Z", "successRate": 1, - "total": 1 + "total": 2 } } } \ No newline at end of file diff --git a/.sf/safety/evidence-M001-6377a4-S04-T01.json b/.sf/safety/evidence-M001-6377a4-S04-T01.json deleted file mode 100644 index 53f1c2f0d..000000000 --- a/.sf/safety/evidence-M001-6377a4-S04-T01.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "kind": "write", - "toolCallId": "write_1778619443353_32", - "path": ".sf/milestones/M001-6377a4/slices/S04/VERIFICATION_MATRIX.md", - "timestamp": 1778619443535 - }, - { - "kind": "bash", - "toolCallId": "bash_1778619447339_33", - "command": "test -f .sf/milestones/M001-6377a4/slices/S04/VERIFICATION_MATRIX.md && grep -q \"status\" .sf/milestones/M001-6377a4/slices/S04/VERIFICATION_MATRIX.md && echo \"Matrix exists and contains status command info.\"", - "exitCode": 0, - "outputSnippet": "Matrix exists and contains status command info.\n", - "timestamp": 1778619447544 - } -] diff --git a/.sf/safety/evidence-M001-6377a4-S04-T02.json b/.sf/safety/evidence-M001-6377a4-S04-T02.json new file mode 100644 index 000000000..fe51488c7 --- /dev/null +++ b/.sf/safety/evidence-M001-6377a4-S04-T02.json @@ -0,0 +1 @@ +[] diff --git 
a/.sf/safety/evidence-M001-6377a4-S05-T01.json b/.sf/safety/evidence-M001-6377a4-S05-T01.json new file mode 100644 index 000000000..cb4efcb9d --- /dev/null +++ b/.sf/safety/evidence-M001-6377a4-S05-T01.json @@ -0,0 +1,16 @@ +[ + { + "kind": "write", + "toolCallId": "DgPnxQEen", + "path": "docs/dev/sf-ace-patterns.md.draft", + "timestamp": 1778630297060 + }, + { + "kind": "bash", + "toolCallId": "8FjDDZSlA", + "command": "test -f docs/dev/sf-ace-patterns.md.draft && grep -c \"SF Implementation\" docs/dev/sf-ace-patterns.md.draft | grep -q \"6\"", + "exitCode": 0, + "outputSnippet": "(no output)", + "timestamp": 1778630298077 + } +] diff --git a/.sf/slice-routing.json b/.sf/slice-routing.json new file mode 100644 index 000000000..885c8fcd8 --- /dev/null +++ b/.sf/slice-routing.json @@ -0,0 +1,16 @@ +{ + "M001-6377a4/S04": { + "provider": "minimax", + "id": "MiniMax-M2.1", + "ts": "2026-05-12T23:54:01.079Z", + "lastUnitType": "complete-slice", + "lastUnitId": "M001-6377a4/S04" + }, + "M001-6377a4/S05": { + "provider": "mistral", + "id": "codestral-latest", + "ts": "2026-05-12T23:58:57.088Z", + "lastUnitType": "execute-task", + "lastUnitId": "M001-6377a4/S05/T01" + } +} \ No newline at end of file diff --git a/.sf/traces/latest b/.sf/traces/latest index 97b3a76cb..4444841d8 120000 --- a/.sf/traces/latest +++ b/.sf/traces/latest @@ -1 +1 @@ -guard:76c7c307-91b4-426e-8fad-4ff951d5a52e.jsonl \ No newline at end of file +guard:b8cbf9df-9fe8-4203-9c63-79fc7264d74e.jsonl \ No newline at end of file diff --git a/TODO.md b/TODO.md index 578e3715e..d70aefca9 100644 --- a/TODO.md +++ b/TODO.md @@ -3,3 +3,39 @@ Dump anything here. --- + +## Self-Feedback Inbox + +### [prompt-modularization] Phase 3 — migrate remaining builders to `composeUnitContext` v2 + +**Context:** Phase 1 (fragment infrastructure, 17-prompt Working Directory deduplication) and +Phase 2 (5 stub manifests for deploy/smoke-production/release/rollback/challenge) shipped in +commit `ca5d869e3`. 
9 of 26 unit types are now fully manifest-driven via `composeInlinedContext`. + +**What's blocked and why:** + +Migrating the remaining 17 builders to `composeInlinedContext` (v1) is the wrong path because: +1. `inlineKnowledgeScoped` and `inlineGraphSubgraph` are NOT in `ARTIFACT_KEYS` — these + artifacts would remain imperative and undeclared in every manifest, making manifests + structurally unreliable descriptions of actual builder behavior. +2. Injecting knowledge/graph at the right position in the composed string requires fragile + sentinel-string searches (e.g., `body.lastIndexOf("### Task Summary:")`). This pattern + is currently untested in the 2 migrated complex builders (`research-milestone`, `complete-slice`). +3. `composeUnitContext` (v2) in `unit-context-composer.js` already has `computed`, `prepend`, + and `excerpt` support — knowledge and graph inlining maps cleanly to `computed` entries. + Migrating to v1 now creates a half-migration state that must be undone when v2 lands. + +**Recommended next slice:** +1. Add `"knowledge"` and `"graph"` to `ARTIFACT_KEYS` in `unit-context-manifest.js`. +2. Register them as `computed` entries in relevant `UNIT_MANIFESTS` entries. +3. Wire one builder (e.g., `buildResearchSlicePrompt`) through `composeUnitContext` v2 as pilot. +4. Add position-assertion tests to already-migrated complex builders (`research-milestone`, + `complete-slice`) to guard against silent ordering degradation. +5. Then migrate remaining builders in batches: slice builders → milestone builders → execute-task. + +**Note on `prompt-cache-optimizer.js`:** Effectively dead code — `optimizeForCaching()`, +`estimateCacheSavings()`, `computeCacheHitRate()` have zero importers. `reorderForCaching()` +is wired at `phases-unit.js:519` but has no caching effect, because no `cache_control` markers are written to outgoing +requests. Remove the file or wire it in the same slice that adds `cache_control` breakpoints. 
+ +--- diff --git a/docs/dev/sf-ace-patterns.md.draft b/docs/dev/sf-ace-patterns.md.draft new file mode 100644 index 000000000..9acddb55b --- /dev/null +++ b/docs/dev/sf-ace-patterns.md.draft @@ -0,0 +1,29 @@ +# SF Patterns to ACE Reference Draft Mapping + +## Preferences + +**SF Implementation:** `src/resources/extensions/sf/preferences.js` + +## PDD + +**SF Implementation:** `src/resources/extensions/sf/uok/unit-runtime.js` + +## UOK Gates + +**SF Implementation:** `src/resources/extensions/sf/uok/gate-runner.js` + +## Notifications + +**SF Implementation:** `src/resources/extensions/sf/skills/frontmatter.js` + +## Skills-as-Contracts + +**SF Implementation:** `src/resources/extensions/sf/steerable-autonomous-panel.js` + +## Idempotency + +**SF Implementation:** `src/resources/extensions/sf/uok/unit-runtime.js` + +## Verification + +- All 6 patterns have verified file paths in this document. \ No newline at end of file diff --git a/docs/product/SURFACE_CAPABILITIES.md b/docs/product/SURFACE_CAPABILITIES.md new file mode 100644 index 000000000..71cd95e5b --- /dev/null +++ b/docs/product/SURFACE_CAPABILITIES.md @@ -0,0 +1,85 @@ +# SF Product Surface Capabilities + +This document defines the command and feature availability across SF's three product surfaces: **CLI / Headless**, **TUI**, and **Web**. It records intentional gaps so they are not mistaken for bugs. + +## Surface Definitions + +| Surface | Description | Primary Consumer | +| :--- | :--- | :--- | +| **CLI / Headless** | Non-interactive command-line interface and machine-surface protocol (`sf headless`). | Scripts, CI/CD, editor integrations, autonomous dispatch. | +| **TUI** | Interactive Terminal User Interface with dashboards, visualizers, and live overlays. | Developers working locally who prefer keyboard-driven interaction. | +| **Web** | Browser-based interface (Next.js) with panels, command surfaces, and visual tools. | Developers who prefer a GUI, remote access, or power-mode workflows. 
| + +## Feature Matrix + +| Command / Feature | CLI / Headless | TUI | Web | Notes | +| :--- | :--- | :--- | :--- | :--- | +| `/status` | ✅ | ✅ | ✅ | Text in CLI/Headless; dashboard overlay in TUI; terminal or `sf-status` panel in Web. | +| `/plan` | ✅ | ✅ | ❌ **Intentional Gap** | See [Intentional Gaps](#intentional-gaps) below. | +| `/run` (`/next`, `/autonomous`) | ✅ | ✅ | ❌ **Intentional Gap** | See [Intentional Gaps](#intentional-gaps) below. | +| `/steer` | ✅ | ✅ | ✅ | Web exposes via `sf-steer` panel. | +| `/undo` | ✅ | ✅ | ✅ | Web exposes via `sf-undo` panel. | +| `/history` | ✅ | ✅ | ✅ | Web exposes via `sf-history` panel. | +| `/doctor` | ✅ | ✅ | ✅ | Web exposes via `sf-doctor` panel. | +| `/forensics` | ✅ | ✅ | ✅ | Web exposes via `sf-forensics` panel. | +| `/skills` | ✅ | ✅ | ✅ | Web exposes via `sf-skill-health` panel. | +| `/capture` | ✅ | ✅ | ✅ | Web exposes via `sf-capture` panel. | +| `/triage` | ✅ | ✅ | ✅ | Web exposes via `sf-triage` panel. | +| `/inspect` | ✅ | ✅ | ✅ | Web exposes via `sf-inspect` panel. | +| `/hooks` | ✅ | ✅ | ✅ | Web exposes via `sf-hooks` panel. | +| `/cleanup` | ✅ | ✅ | ✅ | Web exposes via `sf-cleanup` panel. | +| `/export` | ✅ | ✅ | ✅ | Web exposes via `sf-export` panel. | +| `/queue` | ✅ | ✅ | ✅ | Web exposes via `sf-queue` panel. | +| `/visualize` | ✅ | ✅ | ✅ | Web exposes via `sf-visualize` panel. | +| `/prefs` | ✅ | ✅ | ✅ | Web exposes via `sf-prefs` panel. | +| `/config` | ✅ | ✅ | ✅ | Web exposes via `sf-config` panel. | +| `/mode` | ✅ | ✅ | ✅ | Web exposes via `sf-mode` panel. | +| `/model` | ✅ | ✅ | ✅ | Web exposes via dedicated **Model** command surface. | +| `/thinking` | ✅ | ✅ | ✅ | Web exposes via dedicated **Thinking** command surface. | +| `/git` | ✅ | ✅ | ✅ | Web exposes via dedicated **Git** command surface. | +| `/settings` | ✅ | ✅ | ✅ | Web exposes via dedicated **Settings** command surface (general, recovery, auth, admin, experimental). 
| +| `/resume` | ✅ | ✅ | ✅ | Web exposes via dedicated **Resume** command surface. | +| `/name` | ✅ | ✅ | ✅ | Web exposes via dedicated **Name** command surface. | +| `/fork` | ✅ | ✅ | ✅ | Web exposes via dedicated **Fork** command surface. | +| `/session` | ✅ | ✅ | ✅ | Web exposes via dedicated **Session** command surface. | +| `/compact` | ✅ | ✅ | ✅ | Web exposes via dedicated **Compact** command surface. | +| `/tasks` | ✅ | ✅ | ✅ | Web exposes via Dashboard and Activity views. | +| `/research` | ✅ | ✅ | ✅ | Web terminal supports typing the command. | +| `/implement` | ✅ | ✅ | ✅ | Web terminal supports typing the command. | + +## Intentional Gaps + +### `/plan` is not available as a first-class Web UI workflow + +**Why:** The web UI uses a different, browser-native planning and execution model. Planning artifacts are promoted through CLI-first workflows (`sf plan promote`) that require filesystem access, Git operations, and markdown rendering pipelines that are optimized for terminal and editor surfaces. The web surface focuses on higher-level UI interactions (roadmap views, milestone explorers, visual planning tools) rather than raw slash-command promotion. + +**What web users do instead:** +- Use the **Roadmap** and **Milestone Explorer** views to inspect and navigate planning state. +- Type `/plan` in the embedded terminal if needed; the command executes but the full promotion workflow is CLI-first. + +### `/run` (`/next`, `/autonomous`) is not available as a first-class Web UI workflow + +**Why:** The web UI uses a different, browser-native execution model. Backend execution is managed via specific API routes and WebSocket/bridge communication rather than a `/run` command dispatch. The web surface prioritizes supervised, click-driven execution (e.g., **Power Mode**, action buttons, workflow steppers) over autonomous terminal-style dispatch. + +**What web users do instead:** +- Use **Power Mode** for guided, step-by-step unit execution. 
+- Use **Chat Mode** for conversational task dispatch. +- Type `/autonomous` or `/next` in the embedded terminal if needed; execution proceeds via the PTY bridge. + +## Design Principle + +> **Behavioral coherence, not visual parity.** +> +> Every surface must expose the *same underlying state* (via `deriveState()`, UOK diagnostics, and bridge data) but may present it through different interaction models. A gap is intentional only when the surface provides an equivalent or superior alternative workflow for the same user goal. + +## Verification + +This matrix is verified against: +- `src/resources/extensions/sf/commands/handlers/core.js` — CLI/TUI `status` handler. +- `src/resources/extensions/sf/commands/handlers/ops.js` — CLI/TUI `plan` and `run` handlers. +- `src/headless.ts` — Headless status and execution entrypoints. +- `web/components/sf/command-surface.tsx` — Web command surface registry. +- `web/lib/command-surface-contract.ts` — Web command surface type definitions. +- `web/components/sf/sidebar.tsx` — Web navigation and exposed commands. + +For the full behavioral audit, see `.sf/milestones/M001-6377a4/slices/S04/VERIFICATION_MATRIX.md`. diff --git a/packages/ai/src/providers/openai-completions.test.ts b/packages/ai/src/providers/openai-completions.test.ts new file mode 100644 index 000000000..6042f5eed --- /dev/null +++ b/packages/ai/src/providers/openai-completions.test.ts @@ -0,0 +1,75 @@ +import assert from "node:assert/strict"; +import { describe, it } from "vitest"; +import type { Context, Model, OpenAICompletionsCompat } from "../types.js"; +import { convertMessages } from "./openai-completions.js"; + +const compat = { + supportsDeveloperRole: false, + requiresAssistantAfterToolResult: false, + requiresThinkingAsText: false, +} as Required; + +function model(provider: string, id: string): Model<"openai-completions"> { + return { + id, + name: id, + api: "openai-completions", + provider, + baseUrl: + provider === "openrouter" + ? 
"https://openrouter.ai/api/v1" + : "https://api.openai.com/v1", + reasoning: false, + input: ["text"], + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, + contextWindow: 128_000, + maxTokens: 4096, + }; +} + +function contextWithCacheControl(): Context { + return { + messages: [ + { + role: "user", + content: [ + { + type: "text", + text: "stable prefix", + cache_control: { type: "ephemeral" }, + } as any, + { type: "text", text: "dynamic suffix" }, + ], + timestamp: Date.now(), + }, + ], + }; +} + +describe("convertMessages cache_control", () => { + it("preserves_cache_control_when_openrouter_anthropic_model", () => { + const messages = convertMessages( + model("openrouter", "anthropic/claude-sonnet-4.5"), + contextWithCacheControl(), + compat, + ); + + const content = messages[0].content; + assert.ok(Array.isArray(content)); + assert.deepEqual((content[0] as any).cache_control, { + type: "ephemeral", + }); + }); + + it("strips_cache_control_when_openai_compatible_model_does_not_support_it", () => { + const messages = convertMessages( + model("openai", "gpt-5.3-chat-latest"), + contextWithCacheControl(), + compat, + ); + + const content = messages[0].content; + assert.ok(Array.isArray(content)); + assert.equal((content[0] as any).cache_control, undefined); + }); +}); diff --git a/packages/ai/src/providers/openai-completions.ts b/packages/ai/src/providers/openai-completions.ts index 3fe2861bd..3e6583b75 100644 --- a/packages/ai/src/providers/openai-completions.ts +++ b/packages/ai/src/providers/openai-completions.ts @@ -493,6 +493,12 @@ function maybeAddOpenRouterAnthropicToolCacheControl( } } +function supportsOpenRouterAnthropicCacheControl( + model: Model<"openai-completions">, +): boolean { + return model.provider === "openrouter" && model.id.startsWith("anthropic/"); +} + function mapReasoningEffort( effort: NonNullable, reasoningEffortMap: Partial< @@ -506,8 +512,7 @@ function maybeAddOpenRouterAnthropicCacheControl( model: 
Model<"openai-completions">, messages: ChatCompletionMessageParam[], ): void { - if (model.provider !== "openrouter" || !model.id.startsWith("anthropic/")) - return; + if (!supportsOpenRouterAnthropicCacheControl(model)) return; // Anthropic-style caching requires cache_control on a text part. Add a breakpoint // on the last user/assistant message (walking backwards until we find text content). @@ -622,9 +627,11 @@ export function convertMessages( // Preserve cache_control if present (set upstream for Anthropic prompt caching). // The property is not in the OpenAI SDK type but is accepted by providers // that support Anthropic-style caching (openrouter/anthropic/*). - const cacheControl = ( - item as unknown as Record - ).cache_control; + const cacheControl = supportsOpenRouterAnthropicCacheControl( + model, + ) + ? (item as unknown as Record).cache_control + : undefined; if (cacheControl) { (part as unknown as Record).cache_control = cacheControl; diff --git a/src/resources/agents/rubber-duck.md b/src/resources/agents/rubber-duck.md new file mode 100644 index 000000000..6a7f29786 --- /dev/null +++ b/src/resources/agents/rubber-duck.md @@ -0,0 +1,64 @@ +--- +name: rubber-duck +description: Constructive pre-implementation critic — catches design flaws, missing edge cases, and gaps before code is written +model: sonnet +tools: read, grep, find, ls, bash +--- + +You are a constructive critic. Your job is to identify real problems in a plan, design, or code change **before** implementation is committed to — when course corrections are still cheap. + +You are **read-only**. Do not edit files. Do not run commands that change the environment. + +## What you review + +You receive a plan, a design proposal, a code diff, or a task description. 
You review it for: + +- **Logic errors** — incorrect assumptions, wrong control flow, missing invariants +- **Missing edge cases** — inputs/states the plan doesn't account for +- **Design flaws** — abstractions that won't hold, coupling that will hurt, missing separation of concerns +- **Security issues** — unvalidated inputs, exposed secrets, auth gaps +- **Test gaps** — behavior that will be untested or untestable with the proposed approach +- **Spec contradictions** — where the plan conflicts with stated requirements or existing behavior + +## What you do NOT comment on + +- Code style, formatting, naming conventions +- Grammar or wording in comments/docs +- Best practices that don't cause an actual problem +- Refactoring that doesn't change correctness +- Minor improvements that don't affect the task outcome + +If something is fine, say so. Do not manufacture findings to seem thorough. A short report with two real findings beats a long report with ten nitpicks. + +## Output format + +For each finding: + +``` +## [Blocking|Non-blocking|Suggestion] — + +**What:** <the specific problem, stated precisely> +**Why it matters:** <the actual impact — what breaks, under what condition> +**Fix:** <concrete change to address it> +``` + +Then a final verdict: + +``` +## Verdict + +READY / NEEDS-REVISION + +One sentence: overall assessment. +``` + +- `READY` — no blocking findings; the plan/code can proceed as-is +- `NEEDS-REVISION` — at least one blocking finding must be addressed first + +## Severity guide + +- **Blocking** — will cause a bug, data loss, security issue, or test failure if not fixed +- **Non-blocking** — should be fixed for quality but won't break the task +- **Suggestion** — worth considering; low priority + +Lead with blocking findings. If there are none, say so explicitly before the non-blocking ones. 
diff --git a/src/resources/extensions/sf/auto-model-selection.js b/src/resources/extensions/sf/auto-model-selection.js index ebc6f3f2a..dd9593255 100644 --- a/src/resources/extensions/sf/auto-model-selection.js +++ b/src/resources/extensions/sf/auto-model-selection.js @@ -18,6 +18,7 @@ import { loadCapabilityOverrides, resolveModelForComplexity, } from "./model-router.js"; +import { readStickyModelForUnit } from "./slice-routing-cache.js"; import { filterModelsByProviderModelAllow, isProviderAllowedByLists, @@ -543,6 +544,15 @@ export async function selectAndApplyModel( selectionMethod: "tier-only", }; } else { + // Slice-sticky hint: prefer the model that previously succeeded + // on a sibling unit in this slice when its capability score is + // within window of the winner. Cleared on executor refusal so a + // failing model does not re-attach to the slice. + const stickyHint = readStickyModelForUnit( + basePath, + unitType, + unitId, + ); routingResult = resolveModelForComplexity( classification, modelConfig, @@ -551,6 +561,7 @@ export async function selectAndApplyModel( unitType, classification.taskMetadata, capabilityOverrides, + stickyHint, ); } if (routingResult.wasDowngraded) { diff --git a/src/resources/extensions/sf/auto-start.js b/src/resources/extensions/sf/auto-start.js index 5e4a37c67..c55f58996 100644 --- a/src/resources/extensions/sf/auto-start.js +++ b/src/resources/extensions/sf/auto-start.js @@ -82,7 +82,9 @@ import { import { initRoutingHistory } from "./routing-history.js"; import { acquireSessionLock, + isSessionPidAlive, releaseSessionLock, + terminateExistingSession, updateSessionLock, } from "./session-lock.js"; import { getSessionModelOverride } from "./session-model-override.js"; @@ -342,15 +344,91 @@ export async function bootstrapAutoSession( lockBase, buildResolver, } = deps; - const lockResult = acquireSessionLock(base, { + let lockResult = acquireSessionLock(base, { sessionId: ctx.sessionManager?.getSessionId?.(), sessionFile: 
ctx.sessionManager?.getSessionFile?.(), }); + // Lock busy on a *live* peer: instead of just refusing to start, ask the + // operator whether to terminate the existing session and take over. Two + // non-interactive escape hatches keep CI/headless usage predictable: + // - SF_KILL_EXISTING=1 (or =true / =yes) — auto-confirm the kill + // - SF_KILL_EXISTING=0 (or =false / =no) — auto-decline (current behavior) + // - SF_HEADLESS=1 with no SF_KILL_EXISTING — auto-decline (safe default + // for batch contexts where a hung interactive prompt would deadlock) + if (!lockResult.acquired && lockResult.existingPid) { + const existingPid = Number(lockResult.existingPid); + if (isSessionPidAlive(existingPid)) { + const envKill = String(process.env.SF_KILL_EXISTING ?? "") + .trim() + .toLowerCase(); + const headless = + process.env.SF_HEADLESS === "1" || + String(process.env.SF_HEADLESS ?? "").toLowerCase() === "true"; + let confirmed; + if (envKill === "1" || envKill === "true" || envKill === "yes") { + confirmed = true; + } else if (envKill === "0" || envKill === "false" || envKill === "no") { + confirmed = false; + } else if (headless) { + // Headless without an explicit opt-in: refuse to kill silently. + confirmed = false; + } else if (typeof ctx.ui?.confirm === "function") { + confirmed = await ctx.ui.confirm( + "Stop running SF session?", + `Another SF autonomous session (PID ${existingPid}) is already running on this project. Stop it and start a fresh session?`, + ); + } else { + confirmed = false; + } + if (confirmed) { + ctx.ui.notify( + `Stopping existing SF session (PID ${existingPid})…`, + "info", + ); + let result; + try { + result = await terminateExistingSession(existingPid); + } catch (err) { + ctx.ui.notify( + `Failed to stop existing SF session (PID ${existingPid}): ${err?.message ?? err}. 
Stop it manually with \`kill ${existingPid}\`.`, + "error", + ); + return false; + } + if (!result.terminated) { + ctx.ui.notify( + `Unable to stop existing SF session (PID ${existingPid}). It may belong to another user or be unresponsive. Stop it manually with \`kill -9 ${existingPid}\`.`, + "error", + ); + return false; + } + ctx.ui.notify( + result.escalated + ? `Existing SF session (PID ${existingPid}) did not exit on SIGTERM; SIGKILL applied.` + : `Existing SF session (PID ${existingPid}) stopped.`, + result.escalated ? "warning" : "info", + ); + lockResult = acquireSessionLock(base, { + sessionId: ctx.sessionManager?.getSessionId?.(), + sessionFile: ctx.sessionManager?.getSessionFile?.(), + }); + } + } + } if (!lockResult.acquired) { const reason = lockResult.reason; ctx.ui.notify(reason, "error"); return false; } + // Session-start janitor: prune per-flow trace files older than the longest + // analyzer window (30d). Best-effort, never blocks startup, errors swallowed + // in pruneStaleTraces. Keeps `.sf/traces/` from growing without bound. 
+ try { + const { pruneStaleTraces } = await import("./uok/trace-writer.js"); + pruneStaleTraces(base); + } catch { + // trace janitor must never break autonomous startup + } function releaseLockAndReturn() { releaseSessionLock(base); clearLock(base); diff --git a/src/resources/extensions/sf/auto/run-unit.js b/src/resources/extensions/sf/auto/run-unit.js index c914607a6..87539dd40 100644 --- a/src/resources/extensions/sf/auto/run-unit.js +++ b/src/resources/extensions/sf/auto/run-unit.js @@ -6,6 +6,7 @@ import { scopeActiveToolsForUnitType } from "../constants.js"; import { debugLog } from "../debug-logger.js"; +import { getErrorMessage } from "../error-utils.js"; import { resolveAutoSupervisorConfig, resolvePersistModelChanges, @@ -27,11 +28,29 @@ import { getCurrentTurnGeneration, runWithTurnGeneration, } from "./turn-epoch.js"; -import { getErrorMessage } from "../error-utils.js"; // Tracks the latest session-switch attempt so a late timeout settlement from an // older runUnit() call cannot clear the guard for a newer one. let sessionSwitchGeneration = 0; +/** + * Build the custom-message content for a unit prompt. + * + * Purpose: preserve the exact prompt text while allowing the provider layer to + * cache the stable prefix separately from the dynamic suffix. + * + * Consumer: runUnit before pi.sendMessage dispatches the autonomous unit turn. + */ +export function buildUnitPromptMessageContent(prompt, promptParts) { + if (!promptParts) return prompt; + return [ + { + type: "text", + text: `${promptParts.before}\n`, + cache_control: { type: "ephemeral" }, + }, + { type: "text", text: promptParts.after }, + ]; +} /** * Execute a single unit: create a new session, send the prompt, and await * the agent_end promise. Returns a UnitResult describing what happened. 
@@ -122,8 +141,7 @@ export async function runUnit(ctx, pi, s, unitType, unitId, prompt, options) { sessionResult = await Promise.race([sessionPromise, timeoutPromise]); } catch (sessionErr) { if (sessionTimeoutHandle) clearTimeout(sessionTimeoutHandle); - const msg = - getErrorMessage(sessionErr); + const msg = getErrorMessage(sessionErr); debugLog("runUnit", { phase: "session-error", unitType, @@ -264,16 +282,7 @@ export async function runUnit(ctx, pi, s, unitType, unitId, prompt, options) { // When promptParts is available, send structured content so the provider can // apply cache_control:ephemeral to the stable prefix (before) while leaving // the dynamic suffix (after) uncached. - const messageContent = promptParts - ? [ - { - type: "text", - text: promptParts.before, - cache_control: { type: "ephemeral" }, - }, - { type: "text", text: promptParts.after }, - ] - : prompt; + const messageContent = buildUnitPromptMessageContent(prompt, promptParts); await pi.sendMessage( { customType: "sf-auto", content: messageContent, display: s.verbose }, { triggerTurn: true }, diff --git a/src/resources/extensions/sf/commands/catalog.js b/src/resources/extensions/sf/commands/catalog.js index f71840e61..68f8336f6 100644 --- a/src/resources/extensions/sf/commands/catalog.js +++ b/src/resources/extensions/sf/commands/catalog.js @@ -301,7 +301,7 @@ export const TOP_LEVEL_SUBCOMMANDS = [ }, { cmd: "rubber-duck", - desc: "Request constructive code/design review from a rubber-duck subagent (RUBBER_DUCK flag)", + desc: "Dispatch a rubber-duck subagent for constructive pre-implementation review (alias: review-code)", }, { cmd: "delegate", diff --git a/src/resources/extensions/sf/commands/handlers/ops.js b/src/resources/extensions/sf/commands/handlers/ops.js index 80162194a..3703454f6 100644 --- a/src/resources/extensions/sf/commands/handlers/ops.js +++ b/src/resources/extensions/sf/commands/handlers/ops.js @@ -613,25 +613,47 @@ async function handleKeepAlive(args, ctx) { // ─── 
/rubber-duck ──────────────────────────────────────────────────────────── async function handleRubberDuckCommand(topic, ctx, _pi) { - if (!getExperimentalFlag("rubber_duck")) { - ctx.ui.notify( - "RUBBER_DUCK is not enabled. Run /experimental on rubber_duck to enable.", - "warning", - ); - return; - } - const prompt = topic - ? `Rubber-duck review requested: ${topic}\n\nPlease review this as a constructive critic: identify risks, edge cases, missing tests, and improvements. Be direct and concise.` - : "Please give constructive feedback on the current code changes or design. Identify risks, edge cases, missing tests, and improvements."; - ctx.ui.notify( - "Starting rubber-duck review… (RUBBER_DUCK agent is constructive, not adversarial)", - "info", - ); + const { execSync } = await import("node:child_process"); + const root = projectRoot(); + + // Gather git diff for context (staged + unstaged, capped to avoid token bloat) + let diff = ""; try { - await ctx.sendMessage?.(prompt); + const staged = execSync("git diff --cached --stat 2>/dev/null || true", { + cwd: root, + encoding: "utf-8", + }).trim(); + const unstaged = execSync("git diff --stat 2>/dev/null || true", { + cwd: root, + encoding: "utf-8", + }).trim(); + if (staged || unstaged) { + const fullDiff = execSync( + "git diff --cached 2>/dev/null; git diff 2>/dev/null", + { cwd: root, encoding: "utf-8" }, + ).slice(0, 8000); + diff = `\n\n## Current diff (truncated to 8 kB)\n\n\`\`\`diff\n${fullDiff}\n\`\`\``; + } + } catch { + // diff unavailable — not a hard failure + } + + const focus = topic ? `Focus on: ${topic}\n\n` : ""; + const reviewPrompt = + `Dispatch a \`rubber-duck\` subagent to review the current plan or changes before proceeding. ` + + `Use the \`subagent\` tool with \`agent: "rubber-duck"\`.\n\n` + + `${focus}` + + `Ask the rubber-duck agent to identify blocking issues, non-blocking issues, and suggestions. 
` + + `After the subagent returns, summarise the verdict and any blocking findings in one short paragraph. ` + + `Do not proceed with implementation until the user acknowledges blocking findings.` + + diff; + + ctx.ui.notify("Dispatching rubber-duck review…", "info"); + try { + await ctx.sendMessage?.(reviewPrompt); } catch { ctx.ui.notify( - "Could not start rubber-duck session. Try typing your review request directly.", + "Could not dispatch rubber-duck. Try: subagent agent=rubber-duck task='review current changes'", "warning", ); } diff --git a/src/resources/extensions/sf/dashboard-overlay.js b/src/resources/extensions/sf/dashboard-overlay.js index 3a51f3b2e..7017196e9 100644 --- a/src/resources/extensions/sf/dashboard-overlay.js +++ b/src/resources/extensions/sf/dashboard-overlay.js @@ -741,6 +741,66 @@ export class SFDashboardOverlay { ); } } + // UOK Health section — aligns with headless status output + if (this.uokDiagnostics && this.uokDiagnostics.issues.length > 0) { + lines.push(blank()); + lines.push(hr()); + lines.push(row(th.fg("text", th.bold("UOK Health")))); + lines.push(blank()); + // Compact summary line matching headless format + lines.push( + row( + th.fg( + this.uokDiagnostics.verdict === "degraded" + ? "error" + : this.uokDiagnostics.verdict === "attention" + ? "warning" + : "dim", + `Verdict: ${this.uokDiagnostics.verdict} (${this.uokDiagnostics.classification})`, + ), + ), + ); + lines.push(blank()); + // Issue list + for (const issue of this.uokDiagnostics.issues) { + const icon = + issue.severity === "error" + ? 
th.fg("error", "✗") + : th.fg("warning", "⚠"); + lines.push(row(` ${icon} ${th.fg("text", issue.code)}`)); + lines.push(row(th.fg("dim", ` ${issue.message}`))); + } + // Recommendations + if (this.uokDiagnostics.recommendations.length > 0) { + lines.push(blank()); + for (const rec of this.uokDiagnostics.recommendations) { + lines.push(row(th.fg("dim", ` → ${rec}`))); + } + } + // Signals table + if (this.uokDiagnostics.signals) { + lines.push(blank()); + lines.push(row(th.fg("dim", "Signals:"))); + for (const [key, value] of Object.entries( + this.uokDiagnostics.signals, + )) { + const signalColor = + value === "ok" || + value === "active" || + value === "consistent" || + value === "clear" + ? "success" + : value === "unknown" + ? "dim" + : "warning"; + lines.push( + row( + ` ${th.fg(signalColor, "●")} ${th.fg("text", key)}: ${th.fg(signalColor, String(value))}`, + ), + ); + } + } + } // Environment health section (#1221) — only show issues const envResults = runEnvironmentChecks( this.dashData.basePath || process.cwd(), diff --git a/src/resources/extensions/sf/experimental.js b/src/resources/extensions/sf/experimental.js index b254d1c58..eb9f122e3 100644 --- a/src/resources/extensions/sf/experimental.js +++ b/src/resources/extensions/sf/experimental.js @@ -31,18 +31,12 @@ export const EXPERIMENTAL_FLAGS = { "STATUS_LINE — run a user-defined script to feed a custom footer status chip", show_file: "SHOW_FILE — show_file tool renders code snippets inline in the timeline", - ask_elicitation: - "ASK_USER_ELICITATION — structured form/select UI replaces plain ask_user", - multi_turn_agents: - "MULTI_TURN_AGENTS — persistent subagents that accept follow-up messages", extensions: "EXTENSIONS — user-installable extensions via marketplace npm install", configure_agent: "CONFIGURE_COPILOT_AGENT — interactive wizard for MCP servers and agents", background_sessions: "BACKGROUND_SESSIONS — concurrent sessions with background switching", - rubber_duck: - "RUBBER_DUCK — 
constructive feedback subagent on code and designs", prompt_frame: "PROMPT_FRAME — decorative border rendered above the input prompt", streamer_mode: diff --git a/src/resources/extensions/sf/model-router.js b/src/resources/extensions/sf/model-router.js index df54025d1..060f122d4 100644 --- a/src/resources/extensions/sf/model-router.js +++ b/src/resources/extensions/sf/model-router.js @@ -107,6 +107,8 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 30, longContext: 80, instruction: 90, + // Agentic: Claude Opus is built around extended tool-use loops. + agentic: 95, }, "claude-sonnet-4-6": { coding: 85, @@ -116,6 +118,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 60, longContext: 75, instruction: 85, + agentic: 92, }, "claude-sonnet-4-5-20250514": { coding: 85, @@ -125,6 +128,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 60, longContext: 75, instruction: 85, + agentic: 90, }, "claude-3-5-sonnet-latest": { coding: 82, @@ -134,6 +138,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 62, longContext: 70, instruction: 82, + agentic: 85, }, "claude-haiku-4-5": { coding: 60, @@ -143,6 +148,9 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 95, longContext: 50, instruction: 75, + // Haiku follows tool-use contracts but is less reliable than Sonnet on + // long agentic loops. + agentic: 75, }, "claude-3-5-haiku-latest": { coding: 60, @@ -152,6 +160,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 95, longContext: 50, instruction: 75, + agentic: 75, }, "claude-3-haiku-20240307": { coding: 50, @@ -163,6 +172,7 @@ export const MODEL_CAPABILITY_PROFILES = { instruction: 65, }, "claude-3-opus-latest": { + agentic: 88, coding: 90, debugging: 85, research: 82, @@ -234,6 +244,8 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 40, longContext: 85, instruction: 90, + // GPT-5 family is strongly agentic per OpenAI's tool-use evals. 
+ agentic: 92, }, "gpt-5-mini": { coding: 62, @@ -261,6 +273,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 35, longContext: 88, instruction: 92, + agentic: 94, }, "gpt-5.1": { coding: 93, @@ -270,6 +283,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 42, longContext: 86, instruction: 91, + agentic: 92, }, "gpt-5.1-codex-max": { coding: 90, @@ -279,6 +293,9 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 55, longContext: 75, instruction: 85, + // Codex-tuned models are agentic-capable but not as reliable as the + // flagship gpt-5/5.x lineup for long tool-use loops. + agentic: 80, }, "gpt-5.1-codex-mini": { coding: 65, @@ -288,6 +305,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 88, longContext: 48, instruction: 72, + agentic: 55, }, "gpt-5.2": { coding: 93, @@ -297,6 +315,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 42, longContext: 87, instruction: 91, + agentic: 92, }, "gpt-5.2-codex": { coding: 93, @@ -306,6 +325,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 50, longContext: 78, instruction: 88, + agentic: 82, }, "gpt-5.3-codex": { coding: 94, @@ -315,6 +335,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 50, longContext: 80, instruction: 89, + agentic: 84, }, "gpt-5.3-codex-spark": { coding: 68, @@ -324,6 +345,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 90, longContext: 50, instruction: 74, + agentic: 55, }, "gpt-5.4": { coding: 95, @@ -333,6 +355,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 42, longContext: 88, instruction: 92, + agentic: 94, }, "gpt-5.4-mini": { coding: 80, @@ -342,6 +365,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 72, longContext: 72, instruction: 80, + agentic: 80, }, // GPT-5.5 scores are relative to the existing gpt-5.4 profile and backed by // OpenAI's 2026-04-23 published eval deltas across coding, tool use, and long context. 
@@ -354,6 +378,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 42, longContext: 90, instruction: 93, + agentic: 95, }, // ── OpenAI o-series (reasoning-first) ────────────────────────────────────── o1: { @@ -410,6 +435,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 48, longContext: 98, instruction: 82, + agentic: 85, }, "gemini-3-pro-preview": { coding: 82, @@ -419,6 +445,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 50, longContext: 96, instruction: 82, + agentic: 85, }, "gemini-3-flash-preview": { coding: 62, @@ -428,6 +455,10 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 88, longContext: 88, instruction: 72, + // Gemini Flash follows tool contracts but is occasionally chatty in + // agentic loops; mid-tier so it doesn't dominate execute-task vs + // a Sonnet/Opus/K2.6 alternative. + agentic: 70, }, "gemini-3.1-flash-lite-preview": { coding: 55, @@ -583,6 +614,10 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 70, longContext: 60, instruction: 80, + // Agentic: code-completion tuning. Refuses agentic tasks with "I'm sorry, + // I don't have the necessary tools" (M001-6377a4/S04/T02, 2026-05-12). + // Should not be routed to execute-task without explicit operator pin. + agentic: 25, }, "ministral-8b-latest": { coding: 55, @@ -655,6 +690,9 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 65, longContext: 65, instruction: 80, + // Agentic: Devstral series is coding-completion-tuned; tool-use is not + // the design target. Penalize so execute-task routing avoids it. 
+ agentic: 30, }, "devstral-medium-latest": { coding: 78, @@ -664,6 +702,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 75, longContext: 60, instruction: 75, + agentic: 30, }, "devstral-medium-2507": { coding: 78, @@ -673,6 +712,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 75, longContext: 60, instruction: 75, + agentic: 30, }, "devstral-small-2505": { coding: 60, @@ -682,6 +722,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 90, longContext: 45, instruction: 65, + agentic: 30, }, "devstral-small-2507": { coding: 60, @@ -691,6 +732,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 90, longContext: 45, instruction: 65, + agentic: 30, }, "labs-devstral-small-2512": { coding: 65, @@ -700,6 +742,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 88, longContext: 60, instruction: 68, + agentic: 30, }, // ── Zhipu AI (GLM) ───────────────────────────────────────────────────────── "glm-5": { @@ -774,6 +817,8 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 58, longContext: 86, instruction: 78, + // Agentic: qwen3-coder is tuned for code completion, not tool-use loops. + agentic: 40, }, "qwen3-coder-next": { coding: 82, @@ -783,6 +828,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 70, longContext: 86, instruction: 76, + agentic: 40, }, "qwen3-next:80b": { coding: 70, @@ -802,6 +848,9 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 55, longContext: 86, instruction: 84, + // Agentic: K2.6 is the pinned default for the autonomous-solver role + // (ADR-0079) — refusal-resistant and follows tool-use contracts. + agentic: 90, }, "kimi-for-coding": { coding: 88, @@ -811,6 +860,9 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 55, longContext: 86, instruction: 84, + // `kimi-for-coding` is an alias for K2.6 on the Kimi Code provider + // (memory: bayesian-blender/benchmark-selector both canonicalize it). 
+ agentic: 90, }, "kimi-k2-thinking": { coding: 86, @@ -820,8 +872,15 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 30, longContext: 86, instruction: 84, + agentic: 88, }, // ── MiniMax ─────────────────────────────────────────────────────────────── + // Profiles ordered by generation. Older M2.1 generation gets distinctly + // lower agentic + capability scores: the M2.1 stuck-checkpoint loop on + // 2026-05-13 (infra repo) traced back to M2.1 being aliased to M2.7's + // profile, winning execute-task on cost, then failing to follow the + // checkpoint contract reliably across 60+ tool calls. (See + // self-feedback sf-mp37kjmo-1mfuru.) "MiniMax-M2.7": { coding: 84, debugging: 80, @@ -830,6 +889,7 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 52, longContext: 84, instruction: 82, + agentic: 78, }, "MiniMax-M2.7-highspeed": { coding: 82, @@ -839,6 +899,47 @@ export const MODEL_CAPABILITY_PROFILES = { speed: 72, longContext: 84, instruction: 80, + agentic: 76, + }, + "MiniMax-M2.5": { + // Distinct profile (previously aliased to M2.7 — overclaimed). + coding: 78, + debugging: 74, + research: 72, + reasoning: 78, + speed: 55, + longContext: 82, + instruction: 76, + // Mid agentic — better than coding-completion-only models but + // noticeably less reliable than current-gen agentic models. + agentic: 60, + }, + "MiniMax-M2.1": { + // Distinct profile (previously aliased to M2.7 — overclaimed). + // M2.1 has demonstrated unreliable tool-use loops in production + // (M001-6377a4 / 1-ci-build-pipeline parallel-research, 2026-05-13: + // 60+ checkpoint calls with shifting unitId claims). Penalize the + // agentic axis so execute-task routing avoids it absent operator + // override. + coding: 72, + debugging: 66, + research: 64, + reasoning: 70, + speed: 60, + longContext: 78, + instruction: 72, + agentic: 40, + }, + "MiniMax-M2": { + // Earliest of the M2.x line — older still. 
+ coding: 68, + debugging: 60, + research: 60, + reasoning: 66, + speed: 62, + longContext: 76, + instruction: 68, + agentic: 35, }, }; const MODEL_CAPABILITY_ALIASES = { @@ -864,10 +965,23 @@ const MODEL_CAPABILITY_ALIASES = { "kimi-for-coding": "kimi-k2.6", "kimi-k2.6:cloud": "kimi-k2.6", "kimi-k2.6-cloud": "kimi-k2.6", - "minimax-m2": "MiniMax-M2.7", - "minimax-m2.1": "MiniMax-M2.7", - "minimax-m2.5": "MiniMax-M2.7", + // Each MiniMax generation now has its own profile — previously they all + // aliased to MiniMax-M2.7, which let older/weaker models inherit current + // capability scores and win cost tie-breaks on execute-task. The aliases + // below normalize provider-prefixed and casing variants to the canonical + // per-generation profile, NOT to the current generation. + "minimax-m2": "MiniMax-M2", + "minimax/MiniMax-M2": "MiniMax-M2", + "minimax/minimax-m2": "MiniMax-M2", + "minimax-m2.1": "MiniMax-M2.1", + "minimax/MiniMax-M2.1": "MiniMax-M2.1", + "minimax/minimax-m2.1": "MiniMax-M2.1", + "minimax-m2.5": "MiniMax-M2.5", + "minimax/MiniMax-M2.5": "MiniMax-M2.5", + "minimax/minimax-m2.5": "MiniMax-M2.5", "minimax-m2.7": "MiniMax-M2.7", + "minimax/MiniMax-M2.7": "MiniMax-M2.7", + "minimax/minimax-m2.7": "MiniMax-M2.7", "mistral-large-3:675b": "mistral-large-latest", "ministral-3:3b": "ministral-3b-latest", "ministral-3:8b": "ministral-8b-latest", @@ -888,18 +1002,32 @@ const MODEL_CAPABILITY_ALIASES = { // ─── Base Task Requirements Data Table ─────────────────────────────────────── // Per-unit-type base requirement vectors. Weights indicate how important each // capability dimension is for this unit type. +// +// The `agentic` dimension represents the model's reliability at multi-turn +// tool-use loops (does it follow the tool-use contract? does it refuse the +// task? does it call the checkpoint tool when asked?). It is weighted high +// for any unit type that actually uses tools at runtime — execute-task most +// of all. 
See ADR-0079 for the motivation: a Codestral-style refusal on +// execute-task in M001-6377a4/S04/T02 (2026-05-12) traced back to the router +// having no agentic axis, so a coding-completion model out-scored agentic +// alternatives on coding/instruction. export const BASE_REQUIREMENTS = { - "execute-task": { coding: 0.9, instruction: 0.7, speed: 0.3 }, + "execute-task": { + coding: 0.9, + instruction: 0.7, + speed: 0.3, + agentic: 0.85, + }, "research-milestone": { research: 0.9, longContext: 0.7, reasoning: 0.5 }, "research-slice": { research: 0.9, longContext: 0.7, reasoning: 0.5 }, - "plan-milestone": { reasoning: 0.9, coding: 0.5 }, - "plan-slice": { reasoning: 0.9, coding: 0.5 }, - "replan-slice": { reasoning: 0.9, debugging: 0.6, coding: 0.5 }, - "reassess-roadmap": { reasoning: 0.9, research: 0.5 }, - "complete-slice": { instruction: 0.8, speed: 0.7 }, - "run-uat": { instruction: 0.7, speed: 0.8 }, - "discuss-milestone": { reasoning: 0.6, instruction: 0.7 }, - "complete-milestone": { instruction: 0.8, reasoning: 0.5 }, + "plan-milestone": { reasoning: 0.9, coding: 0.5, agentic: 0.6 }, + "plan-slice": { reasoning: 0.9, coding: 0.5, agentic: 0.6 }, + "replan-slice": { reasoning: 0.9, debugging: 0.6, coding: 0.5, agentic: 0.6 }, + "reassess-roadmap": { reasoning: 0.9, research: 0.5, agentic: 0.4 }, + "complete-slice": { instruction: 0.8, speed: 0.7, agentic: 0.6 }, + "run-uat": { instruction: 0.7, speed: 0.8, agentic: 0.6 }, + "discuss-milestone": { reasoning: 0.6, instruction: 0.7, agentic: 0.4 }, + "complete-milestone": { instruction: 0.8, reasoning: 0.5, agentic: 0.5 }, }; // ─── Public API ────────────────────────────────────────────────────────────── /** @@ -1101,6 +1229,7 @@ export function resolveModelForComplexity( unitType, taskMetadata, capabilityOverrides, + stickyHint, ) { // If no phase config or routing disabled, pass through if (!phaseConfig || !routingConfig.enabled) { @@ -1175,16 +1304,41 @@ export function resolveModelForComplexity( if 
(winner) { const capScores = {}; for (const s of scored) capScores[s.modelId] = s.score; - const fallbacks = buildFallbackChain(winner.modelId, phaseConfig); + // Slice-sticky preference: if a model previously succeeded on a + // sibling unit in this slice AND it is still eligible in the + // current tier AND its capability score is within STICKY_WINDOW of + // the winner, prefer it. Stops within-slice routing thrash where + // T01 → gemini-flash and T02 → codestral on the same slice. + const STICKY_WINDOW_POINTS = 8; + const stickyId = (() => { + if (!stickyHint?.id) return null; + const stickyKey = stickyHint.provider + ? `${stickyHint.provider}/${stickyHint.id}` + : stickyHint.id; + // Match either "provider/model" or bare model id in the eligible list. + const found = scored.find( + (s) => s.modelId === stickyKey || s.modelId.endsWith(`/${stickyHint.id}`), + ); + if (!found) return null; + if (winner.score - found.score > STICKY_WINDOW_POINTS) return null; + return found.modelId; + })(); + const selectedId = stickyId ?? winner.modelId; + const selectedScore = ( + scored.find((s) => s.modelId === selectedId) ?? winner + ).score; + const fallbacks = buildFallbackChain(selectedId, phaseConfig); return { - modelId: winner.modelId, + modelId: selectedId, fallbacks, tier: requestedTier, wasDowngraded: true, - reason: `capability-scored: ${winner.modelId} (${winner.score.toFixed(1)}) for ${unitType}`, + reason: stickyId + ? `slice-sticky: ${selectedId} (${selectedScore.toFixed(1)}, within ${STICKY_WINDOW_POINTS}pt of capability winner) for ${unitType}` + : `capability-scored: ${selectedId} (${selectedScore.toFixed(1)}) for ${unitType}`, capabilityScores: capScores, taskRequirements: requirements, - selectionMethod: "capability-scored", + selectionMethod: stickyId ? 
"slice-sticky" : "capability-scored", }; } } diff --git a/src/resources/extensions/sf/prompt-ordering.js b/src/resources/extensions/sf/prompt-ordering.js index 357776fde..8c8442084 100644 --- a/src/resources/extensions/sf/prompt-ordering.js +++ b/src/resources/extensions/sf/prompt-ordering.js @@ -137,6 +137,11 @@ export function reorderForCaching(prompt) { * static+semi-static prefix can be marked with cache_control: ephemeral on * Anthropic-compatible providers. * + * Purpose: keep SF autonomous prompt prefixes byte-stable across adjacent task + * dispatches so provider prompt caches can reuse expensive context. + * + * Consumer: auto/phases-unit.js before runUnit dispatches an autonomous unit. + * * Returns `{before: string, after: string}` where: * - `before` = preamble + all static + all semi-static sections (cache this) * - `after` = all dynamic sections (do not cache) diff --git a/src/resources/extensions/sf/session-lock.js b/src/resources/extensions/sf/session-lock.js index 836695d45..ef5ead37d 100644 --- a/src/resources/extensions/sf/session-lock.js +++ b/src/resources/extensions/sf/session-lock.js @@ -596,3 +596,103 @@ function isPidAlive(pid) { return false; } } + +/** + * Public wrapper around isPidAlive for callers outside this module. + * + * Consumer: auto-start's prompt-to-kill flow needs to decide whether the + * existingPid from acquireSessionLock's failure result is still alive before + * offering to terminate it. + */ +export function isSessionPidAlive(pid) { + return isPidAlive(Number(pid)); +} + +/** + * Terminate an existing SF auto session by PID. + * + * Why: when acquireSessionLock reports `{ acquired: false, existingPid }` + * because another SF process is holding the lock, we want a one-call helper + * that an interactive caller can invoke after confirming with the user. 
The + * helper sends SIGTERM, polls for the process to exit, escalates to SIGKILL + * after the grace window, and waits a short tail for the kernel to reap the + * PID so a subsequent acquireSessionLock retry sees a dead PID and proceeds + * down the stale-lock recovery path. + * + * Returns `{ terminated: boolean, escalated: boolean, alreadyDead: boolean }`. + * `terminated` is true iff the PID is no longer alive when the call returns. + * `escalated` is true iff SIGKILL was needed because SIGTERM did not produce + * an exit within `gracePeriodMs`. + * + * Consumer: auto-start's prompt-to-kill flow. Not part of the normal + * autonomous loop — only invoked after explicit operator consent. + * + * @param {number} pid - The PID to terminate. + * @param {object} [options] + * @param {number} [options.gracePeriodMs=5000] - How long to wait between + * SIGTERM and SIGKILL. + * @param {number} [options.reapWaitMs=1000] - How long to wait after the + * final kill signal for the kernel to reap. + * @param {number} [options.pollIntervalMs=100] - Poll interval used while + * waiting for exit. + */ +export async function terminateExistingSession(pid, options = {}) { + const numericPid = Number(pid); + if (!Number.isInteger(numericPid) || numericPid <= 0) { + return { terminated: false, escalated: false, alreadyDead: true }; + } + if (numericPid === process.pid) { + // Refuse to terminate ourselves — would deadlock the caller. + return { terminated: false, escalated: false, alreadyDead: false }; + } + if (!isPidAlive(numericPid)) { + return { terminated: true, escalated: false, alreadyDead: true }; + } + const gracePeriodMs = Number(options.gracePeriodMs ?? 5000); + const reapWaitMs = Number(options.reapWaitMs ?? 1000); + const pollIntervalMs = Math.max(50, Number(options.pollIntervalMs ?? 100)); + try { + process.kill(numericPid, "SIGTERM"); + } catch (err) { + // ESRCH: process already gone between the alive check and the kill. 
+ // EPERM: not ours to kill — surface as not-terminated. + if (err?.code === "ESRCH") { + return { terminated: true, escalated: false, alreadyDead: true }; + } + if (err?.code === "EPERM") { + return { terminated: false, escalated: false, alreadyDead: false }; + } + throw err; + } + const deadline = Date.now() + gracePeriodMs; + while (Date.now() < deadline) { + if (!isPidAlive(numericPid)) { + return { terminated: true, escalated: false, alreadyDead: false }; + } + await new Promise((resolve) => setTimeout(resolve, pollIntervalMs)); + } + // Grace expired — escalate to SIGKILL. + try { + process.kill(numericPid, "SIGKILL"); + } catch (err) { + if (err?.code === "ESRCH") { + return { terminated: true, escalated: true, alreadyDead: false }; + } + if (err?.code === "EPERM") { + return { terminated: false, escalated: true, alreadyDead: false }; + } + throw err; + } + const reapDeadline = Date.now() + reapWaitMs; + while (Date.now() < reapDeadline) { + if (!isPidAlive(numericPid)) { + return { terminated: true, escalated: true, alreadyDead: false }; + } + await new Promise((resolve) => setTimeout(resolve, pollIntervalMs)); + } + return { + terminated: !isPidAlive(numericPid), + escalated: true, + alreadyDead: false, + }; +} diff --git a/src/resources/extensions/sf/slice-routing-cache.js b/src/resources/extensions/sf/slice-routing-cache.js new file mode 100644 index 000000000..e3ba00cd7 --- /dev/null +++ b/src/resources/extensions/sf/slice-routing-cache.js @@ -0,0 +1,154 @@ +/** + * slice-routing-cache.js — per-slice sticky-model routing cache. + * + * Why: model routing is currently computed per-unit, so the executor can flip + * between models within a single slice (M001-6377a4/S04 routed T01 to + * gemini-3-flash-preview, then T02 to codestral-latest — the second was + * unfit and refused the task, see ADR-0079). Once a model has successfully + * completed work on a slice, prefer it for the slice's sibling units unless + * a hard mismatch forces a switch. 
+ * + * Contract: + * - Cache is small JSON keyed by sliceId. Each entry stores provider/id and + * timestamps so stale entries can be aged out. + * - Best-effort: read/write errors are swallowed; routing always has a + * fallback through the capability scorer. + * - Only successful outcomes (`continue` or `complete`) write to the cache. + * Refusal/blocker outcomes clear the entry so a failing model does not + * re-attach to the slice. + * + * Consumer: auto-model-selection.js reads before calling + * resolveModelForComplexity; auto/phases-unit.js writes after a successful + * checkpoint and clears on `executor-refused`. + */ +import { existsSync, mkdirSync, readFileSync, unlinkSync } from "node:fs"; +import { dirname, join } from "node:path"; +import { atomicWriteSync } from "./atomic-write.js"; +import { sfRuntimeRoot } from "./paths.js"; + +const CACHE_FILE = "slice-routing.json"; +const DEFAULT_MAX_AGE_MS = 7 * 24 * 60 * 60 * 1000; // 7 days + +function cachePath(basePath) { + return join(sfRuntimeRoot(basePath), CACHE_FILE); +} + +/** + * Extract the slice scope from a unit id. + * + * Supports the conventional SF unit-id grammar: + * - Execute task: "<milestoneId>/<sliceId>/<taskId>" → "<milestoneId>/<sliceId>" + * - Plan / complete slice: "<milestoneId>/<sliceId>" → "<milestoneId>/<sliceId>" (already a slice) + * - Milestone-level units: "<milestoneId>" → "<milestoneId>" (no slice scope) + * + * Returns null when the unit id is missing or unparseable. 
+ */ +export function extractSliceScope(unitId) { + if (!unitId || typeof unitId !== "string") return null; + const parts = unitId.split("/").filter(Boolean); + if (parts.length === 0) return null; + if (parts.length === 1) return parts[0]; // milestone-only + return `${parts[0]}/${parts[1]}`; +} + +function readCache(basePath) { + const path = cachePath(basePath); + if (!existsSync(path)) return {}; + try { + return JSON.parse(readFileSync(path, "utf-8")); + } catch { + return {}; + } +} + +function writeCache(basePath, data) { + const path = cachePath(basePath); + try { + mkdirSync(dirname(path), { recursive: true }); + atomicWriteSync(path, JSON.stringify(data, null, 2)); + } catch { + // best-effort + } +} + +/** + * Record the model that successfully handled a unit. The slice scope is + * derived from the unit id. Subsequent units in the same slice will see this + * as the sticky hint. + */ +export function recordSliceRouting(basePath, unitType, unitId, model) { + if (!basePath || !model?.id) return; + const sliceId = extractSliceScope(unitId); + if (!sliceId) return; + const data = readCache(basePath); + data[sliceId] = { + provider: String(model.provider ?? ""), + id: String(model.id), + ts: new Date().toISOString(), + lastUnitType: String(unitType ?? ""), + lastUnitId: String(unitId ?? ""), + }; + writeCache(basePath, data); +} + +/** + * Look up the sticky model for the slice that contains this unit. Returns + * null when there is no entry, when it's older than maxAgeMs, or when the + * cache cannot be read. 
+ * + * @param {string} basePath + * @param {string} unitType + * @param {string} unitId + * @param {object} [options] + * @param {number} [options.maxAgeMs=7d] + * @returns {{ provider: string, id: string } | null} + */ +export function readStickyModelForUnit(basePath, unitType, unitId, options = {}) { + if (!basePath) return null; + const sliceId = extractSliceScope(unitId); + if (!sliceId) return null; + const data = readCache(basePath); + const entry = data[sliceId]; + if (!entry?.id) return null; + const maxAgeMs = Number(options.maxAgeMs ?? DEFAULT_MAX_AGE_MS); + if (entry.ts) { + const age = Date.now() - new Date(entry.ts).getTime(); + if (Number.isFinite(age) && age > maxAgeMs) return null; + } + return { + provider: String(entry.provider ?? ""), + id: String(entry.id), + }; +} + +/** + * Evict the sticky entry for the slice containing this unit. Called when the + * model attached to the slice refuses or hits a hard mismatch, so the next + * dispatch falls back to the capability scorer instead of re-pinning the + * broken model. + */ +export function clearSliceRoutingForUnit(basePath, unitId) { + if (!basePath) return; + const sliceId = extractSliceScope(unitId); + if (!sliceId) return; + const data = readCache(basePath); + if (!(sliceId in data)) return; + delete data[sliceId]; + if (Object.keys(data).length === 0) { + try { + unlinkSync(cachePath(basePath)); + } catch { + // best-effort + } + return; + } + writeCache(basePath, data); +} + +/** + * Test/debug only — read the entire cache. Production callers should use + * readStickyModelForUnit instead. 
+ */ +export function _readCacheForTests(basePath) { + return readCache(basePath); +} diff --git a/src/resources/extensions/sf/tests/dashboard-overlay.test.ts b/src/resources/extensions/sf/tests/dashboard-overlay.test.ts new file mode 100644 index 000000000..1cc06bec1 --- /dev/null +++ b/src/resources/extensions/sf/tests/dashboard-overlay.test.ts @@ -0,0 +1,467 @@ +/** + * Dashboard Overlay UOK Diagnostics Tests + * + * Purpose: Verify that SFDashboardOverlay consumes writeUokDiagnostics output + * and renders it consistently with the headless status command. + * + * Consumer: TUI users who expect the dashboard to surface the same UOK health + * information as `sf status` / headless query. + */ + +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; + +// ─── Hoisted mocks ───────────────────────────────────────────────────────── + +const mockDiagnostics = vi.hoisted(() => ({ + clear: { + schemaVersion: 1, + generatedAt: new Date().toISOString(), + verdict: "clear", + classification: "healthy", + signals: { + lock: "active", + parity: "ok", + ledger: "consistent", + runtimeProjection: "ok", + wrapper: "clear", + }, + currentUnit: null, + latestRun: null, + runtimeUnits: [], + issues: [], + recommendations: [], + reportPath: "/tmp/uok-diagnostics.json", + }, + degraded: { + schemaVersion: 1, + generatedAt: new Date().toISOString(), + verdict: "degraded", + classification: "needs-repair", + signals: { + lock: "stale", + parity: "ok", + ledger: "open-runs", + runtimeProjection: "stale", + wrapper: "unknown", + }, + currentUnit: null, + latestRun: null, + runtimeUnits: [], + issues: [ + { + code: "stale-lock", + severity: "error", + message: "Stale auto.lock detected for PID 12345.", + evidence: { lock: { pid: 12345 } }, + }, + { + code: "open-ledger-without-live-lock", + severity: "error", + message: + "UOK ledger has 2 started run(s) without a live auto.lock owner.", + evidence: { runIds: ["run-1", "run-2"] }, + }, + ], + recommendations: [ + 
"Clear stale auto.lock before dispatch.", + "Mark orphaned UOK runs recovered or restart from lock owner.", + ], + reportPath: "/tmp/uok-diagnostics.json", + }, + attention: { + schemaVersion: 1, + generatedAt: new Date().toISOString(), + verdict: "attention", + classification: "degraded", + signals: { + lock: "active", + parity: "degraded", + ledger: "consistent", + runtimeProjection: "ok", + wrapper: "unknown", + }, + currentUnit: { unitType: "execute-task", unitId: "T01", pid: 12345 }, + latestRun: null, + runtimeUnits: [], + issues: [ + { + code: "uok-parity-degraded", + severity: "warning", + message: + "UOK parity degraded: 1 critical mismatch(es), 0 missing exit(s).", + evidence: { current: { criticalMismatches: 1, missingExitEvents: 0 } }, + }, + ], + recommendations: ["Reconcile UOK parity before mutating git state."], + reportPath: "/tmp/uok-diagnostics.json", + }, +})); + +const dashDataMock = vi.hoisted(() => ({ + basePath: "/tmp/sf-test", + active: false, + paused: false, + remoteSession: null, + currentUnit: null, + elapsed: 0, + rtkEnabled: false, + rtkSavings: null, + pendingCaptureCount: 0, +})); + +vi.mock("../uok/diagnostic-synthesis.js", () => ({ + writeUokDiagnostics: vi.fn((_basePath, _options) => mockDiagnostics.clear), +})); + +vi.mock("../state.js", () => ({ + deriveState: vi.fn(async () => ({ + activeMilestone: null, + activeSlice: null, + activeTask: null, + phase: "idle", + progress: null, + nextAction: null, + blockers: [], + registry: [], + })), +})); + +vi.mock("../sf-db.js", () => ({ + isDbAvailable: vi.fn(() => false), + getMilestoneSlices: vi.fn(() => []), + getSliceTasks: vi.fn(() => []), +})); + +vi.mock("../auto.js", () => ({ + getAutoDashboardData: vi.fn(() => dashDataMock), +})); + +vi.mock("../auto-dashboard.js", () => ({ + estimateTimeRemaining: vi.fn(() => null), +})); + +vi.mock("../progress-score.js", () => ({ + computeProgressScore: vi.fn(() => ({ + level: "green", + summary: "All systems healthy", + signals: [], + })), 
+})); + +vi.mock("../doctor-environment.js", () => ({ + runEnvironmentChecks: vi.fn(() => []), +})); + +vi.mock("../worktree-command.js", () => ({ + getActiveWorktreeName: vi.fn(() => null), +})); + +vi.mock("../subagent/worker-registry.js", () => ({ + hasActiveWorkers: vi.fn(() => false), + getWorkerBatches: vi.fn(() => new Map()), +})); + +vi.mock("../metrics.js", () => ({ + getLedger: vi.fn(() => null), + getProjectTotals: vi.fn(() => ({})), + aggregateByPhase: vi.fn(() => []), + aggregateBySlice: vi.fn(() => []), + aggregateByModel: vi.fn(() => []), + aggregateCacheHitRate: vi.fn(() => 0), + formatCost: vi.fn((n) => `$${n.toFixed(2)}`), + formatCostProjection: vi.fn(() => []), + formatTokenCount: vi.fn((n) => String(n)), +})); + +vi.mock("../paths.js", () => ({ + resolveMilestoneFile: vi.fn(() => null), +})); + +vi.mock("../files.js", () => ({ + loadFile: vi.fn(async () => null), +})); + +vi.mock("../preferences.js", () => ({ + loadEffectiveSFPreferences: vi.fn(() => null), +})); + +vi.mock("@singularity-forge/tui", async (importOriginal) => { + const actual = (await importOriginal()) as any; + return { + ...actual, + Key: { + escape: "\u001B", + ctrl: (c: string) => `\u0000${c}`, + ctrlAlt: (c: string) => `\u001B\u0000${c}`, + ctrlShift: (c: string) => `\u001B\u0000${c.toUpperCase()}`, + down: "\u001B[B", + up: "\u001B[A", + }, + matchesKey: vi.fn(() => false), + truncateToWidth: vi.fn((s: string, w: number) => + s.length > w ? 
s.slice(0, w) : s, + ), + visibleWidth: vi.fn((s: string) => s.length), + }; +}); + +vi.mock("../shared/mod.js", () => ({ + centerLine: vi.fn( + (s: string, w: number) => + " ".repeat(Math.max(0, Math.floor((w - s.length) / 2))) + s, + ), + fitColumns: vi.fn((parts: string[], _w: number, _sep: string) => + parts.join(" "), + ), + formatDuration: vi.fn((ms: number) => `${Math.round(ms / 1000)}s`), + joinColumns: vi.fn( + (left: string, right: string, _w: number) => + `${left}${" ".repeat(Math.max(1, _w - left.length - right.length))}${right}`, + ), + padRight: vi.fn((s: string, w: number) => s.padEnd(w, " ")), + STATUS_COLOR: { + done: "success", + active: "accent", + pending: "dim", + }, + STATUS_GLYPH: { + done: "✓", + active: "▶", + pending: "○", + }, +})); + +vi.mock("../shortcut-defs.js", () => ({ + formattedShortcutPair: vi.fn(() => "ctrl+alt+g"), +})); + +// ─── Helpers ─────────────────────────────────────────────────────────────── + +function createMockTheme() { + return { + fg: vi.fn((color: string, text: string) => `[${color}:${text}]`), + bold: vi.fn((text: string) => `**${text}**`), + }; +} + +function createMockTui() { + return { + requestRender: vi.fn(), + }; +} + +// ─── Tests ───────────────────────────────────────────────────────────────── + +beforeEach(() => { + vi.clearAllMocks(); +}); + +afterEach(() => { + vi.clearAllMocks(); +}); + +describe("SFDashboardOverlay UOK diagnostics", () => { + it("loadData_calls_writeUokDiagnostics_and_stores_result", async () => { + const { writeUokDiagnostics } = await import( + "../uok/diagnostic-synthesis.js" + ); + const { SFDashboardOverlay } = await import("../dashboard-overlay.js"); + + const tui = createMockTui(); + const theme = createMockTheme(); + const overlay = new SFDashboardOverlay(tui, theme, () => {}); + + // Prevent interval from firing during test + clearInterval(overlay.refreshTimer); + overlay.refreshTimer = null as any; + + await overlay.loadData(); + + 
expect(writeUokDiagnostics).toHaveBeenCalledWith("/tmp/sf-test"); + expect(overlay.uokDiagnostics).toEqual(mockDiagnostics.clear); + + overlay.dispose(); + }); + + it("loadData_gracefully_handles_writeUokDiagnostics_failure", async () => { + const { writeUokDiagnostics } = await import( + "../uok/diagnostic-synthesis.js" + ); + writeUokDiagnostics.mockImplementation(() => { + throw new Error("disk full"); + }); + + const { SFDashboardOverlay } = await import("../dashboard-overlay.js"); + + const tui = createMockTui(); + const theme = createMockTheme(); + const overlay = new SFDashboardOverlay(tui, theme, () => {}); + + clearInterval(overlay.refreshTimer); + overlay.refreshTimer = null as any; + + await overlay.loadData(); + + expect(overlay.uokDiagnostics).toBeNull(); + + overlay.dispose(); + writeUokDiagnostics.mockRestore(); + }); + + it("render_includes_uok_verdict_when_diagnostics_present", async () => { + const { writeUokDiagnostics } = await import( + "../uok/diagnostic-synthesis.js" + ); + (writeUokDiagnostics as any).mockReturnValue(mockDiagnostics.degraded); + + const { SFDashboardOverlay } = await import("../dashboard-overlay.js"); + + const tui = createMockTui(); + const theme = createMockTheme(); + const overlay = new SFDashboardOverlay(tui, theme, () => {}); + + clearInterval(overlay.refreshTimer); + overlay.refreshTimer = null as any; + + await overlay.loadData(); + const lines = overlay.buildContentLines(80); + const text = lines.join("\n"); + + expect(text).toContain("UOK"); + expect(text).toContain("degraded"); + expect(text).toContain("needs-repair"); + + overlay.dispose(); + }); + + it("render_includes_first_issue_code_like_headless_status", async () => { + const { writeUokDiagnostics } = await import( + "../uok/diagnostic-synthesis.js" + ); + (writeUokDiagnostics as any).mockReturnValue(mockDiagnostics.degraded); + + const { SFDashboardOverlay } = await import("../dashboard-overlay.js"); + + const tui = createMockTui(); + const theme = 
createMockTheme(); + const overlay = new SFDashboardOverlay(tui, theme, () => {}); + + clearInterval(overlay.refreshTimer); + overlay.refreshTimer = null as any; + + await overlay.loadData(); + const lines = overlay.buildContentLines(80); + const text = lines.join("\n"); + + // Should contain the first issue code, matching headless status behavior + expect(text).toContain("stale-lock"); + + overlay.dispose(); + }); + + it("render_shows_uok_health_section_with_all_issues_when_degraded", async () => { + const { writeUokDiagnostics } = await import( + "../uok/diagnostic-synthesis.js" + ); + (writeUokDiagnostics as any).mockReturnValue(mockDiagnostics.degraded); + + const { SFDashboardOverlay } = await import("../dashboard-overlay.js"); + + const tui = createMockTui(); + const theme = createMockTheme(); + const overlay = new SFDashboardOverlay(tui, theme, () => {}); + + clearInterval(overlay.refreshTimer); + overlay.refreshTimer = null as any; + + await overlay.loadData(); + const lines = overlay.buildContentLines(80); + const text = lines.join("\n"); + + // Should show both issue codes in the health section + expect(text).toContain("stale-lock"); + expect(text).toContain("open-ledger-without-live-lock"); + + overlay.dispose(); + }); + + it("render_shows_recommendations_when_issues_present", async () => { + const { writeUokDiagnostics } = await import( + "../uok/diagnostic-synthesis.js" + ); + (writeUokDiagnostics as any).mockReturnValue(mockDiagnostics.degraded); + + const { SFDashboardOverlay } = await import("../dashboard-overlay.js"); + + const tui = createMockTui(); + const theme = createMockTheme(); + const overlay = new SFDashboardOverlay(tui, theme, () => {}); + + clearInterval(overlay.refreshTimer); + overlay.refreshTimer = null as any; + + await overlay.loadData(); + const lines = overlay.buildContentLines(80); + const text = lines.join("\n"); + + expect(text).toContain("Clear stale auto.lock before dispatch."); + expect(text).toContain( + "Mark orphaned UOK 
runs recovered or restart from lock owner.", + ); + + overlay.dispose(); + }); + + it("render_shows_uok_signals_table_when_diagnostics_present", async () => { + const { writeUokDiagnostics } = await import( + "../uok/diagnostic-synthesis.js" + ); + (writeUokDiagnostics as any).mockReturnValue(mockDiagnostics.degraded); + + const { SFDashboardOverlay } = await import("../dashboard-overlay.js"); + + const tui = createMockTui(); + const theme = createMockTheme(); + const overlay = new SFDashboardOverlay(tui, theme, () => {}); + + clearInterval(overlay.refreshTimer); + overlay.refreshTimer = null as any; + + await overlay.loadData(); + const lines = overlay.buildContentLines(80); + const text = lines.join("\n"); + + // Signals should be visible + expect(text).toContain("lock"); + expect(text).toContain("parity"); + expect(text).toContain("ledger"); + + overlay.dispose(); + }); + + it("render_omits_detailed_uok_section_when_verdict_is_clear", async () => { + const { writeUokDiagnostics } = await import( + "../uok/diagnostic-synthesis.js" + ); + (writeUokDiagnostics as any).mockReturnValue(mockDiagnostics.clear); + + const { SFDashboardOverlay } = await import("../dashboard-overlay.js"); + + const tui = createMockTui(); + const theme = createMockTheme(); + const overlay = new SFDashboardOverlay(tui, theme, () => {}); + + clearInterval(overlay.refreshTimer); + overlay.refreshTimer = null as any; + + await overlay.loadData(); + const lines = overlay.buildContentLines(80); + const text = lines.join("\n"); + + // Should show the compact UOK clear line but no issue details + expect(text).toContain("clear"); + expect(text).not.toContain("stale-lock"); + + overlay.dispose(); + }); +}); diff --git a/src/resources/extensions/sf/tests/model-router-agentic.test.mjs b/src/resources/extensions/sf/tests/model-router-agentic.test.mjs new file mode 100644 index 000000000..e3e0d98c7 --- /dev/null +++ b/src/resources/extensions/sf/tests/model-router-agentic.test.mjs @@ -0,0 +1,140 @@ 
+import { describe, expect, test } from "vitest"; +import { + BASE_REQUIREMENTS, + MODEL_CAPABILITY_PROFILES, + scoreEligibleModels, + scoreModel, +} from "../model-router.js"; + +describe("agentic capability axis (ADR-0079)", () => { + test("execute-task base requirements weight the agentic dimension", () => { + // If this assertion fails because the weight changed: re-read ADR-0079 + // before adjusting. The whole point of the axis is to outweigh raw + // coding score for execute-task routing. + expect(BASE_REQUIREMENTS["execute-task"].agentic).toBeGreaterThanOrEqual( + 0.7, + ); + }); + + test("known agentic-capable models score higher than coding-completion models on execute-task", () => { + const codestralScore = scoreModel( + MODEL_CAPABILITY_PROFILES["codestral-latest"], + BASE_REQUIREMENTS["execute-task"], + ); + const kimiScore = scoreModel( + MODEL_CAPABILITY_PROFILES["kimi-k2.6"], + BASE_REQUIREMENTS["execute-task"], + ); + const sonnetScore = scoreModel( + MODEL_CAPABILITY_PROFILES["claude-sonnet-4-6"], + BASE_REQUIREMENTS["execute-task"], + ); + // Codestral has high coding (85) but agentic=25 — must not beat agentic models. 
+ expect(kimiScore).toBeGreaterThan(codestralScore); + expect(sonnetScore).toBeGreaterThan(codestralScore); + }); + + test("devstral variants score below agentic models on execute-task", () => { + const devstralScore = scoreModel( + MODEL_CAPABILITY_PROFILES["devstral-2512"], + BASE_REQUIREMENTS["execute-task"], + ); + const kimiScore = scoreModel( + MODEL_CAPABILITY_PROFILES["kimi-k2.6"], + BASE_REQUIREMENTS["execute-task"], + ); + expect(kimiScore).toBeGreaterThan(devstralScore); + }); + + test("scoreEligibleModels ranks agentic models above coding-only models for execute-task", () => { + const eligible = [ + "mistral/codestral-latest", + "mistral/devstral-2512", + "moonshotai/kimi-k2.6", + "anthropic/claude-sonnet-4-6", + ]; + const ranked = scoreEligibleModels( + eligible, + BASE_REQUIREMENTS["execute-task"], + ); + const top = ranked[0]?.modelId; + // Either of the two pinned-agentic models must win. + expect(["moonshotai/kimi-k2.6", "anthropic/claude-sonnet-4-6"]).toContain( + top, + ); + // And Codestral specifically must not win. + expect(top).not.toBe("mistral/codestral-latest"); + }); + + test("agentic axis preserves research-* unit-type behavior (no agentic weight there)", () => { + // Research isn't agentic — those unit types should not gain an agentic + // dimension. This protects long-context research-tuned models from + // being penalized. 
+ expect(BASE_REQUIREMENTS["research-milestone"].agentic).toBeUndefined(); + expect(BASE_REQUIREMENTS["research-slice"].agentic).toBeUndefined(); + }); + + test("known coding-only models all have agentic <= 50", () => { + const codingOnly = [ + "codestral-latest", + "devstral-2512", + "devstral-medium-latest", + "devstral-medium-2507", + "devstral-small-2505", + "devstral-small-2507", + "labs-devstral-small-2512", + "qwen3-coder:480b", + "qwen3-coder-next", + ]; + for (const id of codingOnly) { + const profile = MODEL_CAPABILITY_PROFILES[id]; + expect(profile, `${id} should be in MODEL_CAPABILITY_PROFILES`).toBeDefined(); + expect(profile.agentic, `${id} should have agentic <= 50`).toBeLessThanOrEqual( + 50, + ); + } + }); + + test("older MiniMax generations score lower than current on agentic", () => { + // 2026-05-13 incident: minimax/M2.1 stuck in 60+ checkpoint loop on + // infra repo. Root cause was the router aliasing all minimax-m2.x + // variants to MiniMax-M2.7's profile, so older models inherited + // current-gen capability scores and won cost tie-breaks on + // execute-task. Per-generation profiles + agentic axis fix the + // underlying routing decision. + const m21 = MODEL_CAPABILITY_PROFILES["MiniMax-M2.1"]; + const m25 = MODEL_CAPABILITY_PROFILES["MiniMax-M2.5"]; + const m27 = MODEL_CAPABILITY_PROFILES["MiniMax-M2.7"]; + expect(m21, "M2.1 should have its own profile").toBeDefined(); + expect(m25, "M2.5 should have its own profile").toBeDefined(); + expect(m27.agentic).toBeGreaterThan(m25.agentic); + expect(m25.agentic).toBeGreaterThan(m21.agentic); + // And on execute-task, the current generation must beat the older one. 
+ const oldScore = scoreModel(m21, BASE_REQUIREMENTS["execute-task"]); + const newScore = scoreModel(m27, BASE_REQUIREMENTS["execute-task"]); + expect(newScore).toBeGreaterThan(oldScore); + }); + + test("known agentic-frontier models all have agentic >= 85", () => { + const agenticFrontier = [ + "claude-opus-4-6", + "claude-sonnet-4-6", + "claude-sonnet-4-5-20250514", + "kimi-k2.6", + "kimi-k2-thinking", + "gpt-5", + "gpt-5.4", + "gpt-5.5", + "gemini-3-pro-preview", + "gemini-3.1-pro-preview", + ]; + for (const id of agenticFrontier) { + const profile = MODEL_CAPABILITY_PROFILES[id]; + expect(profile, `${id} should be in MODEL_CAPABILITY_PROFILES`).toBeDefined(); + expect( + profile.agentic, + `${id} should have agentic >= 85`, + ).toBeGreaterThanOrEqual(85); + } + }); +}); diff --git a/src/resources/extensions/sf/tests/prompt-ordering.test.mjs b/src/resources/extensions/sf/tests/prompt-ordering.test.mjs index 75710953e..f14d1b8d7 100644 --- a/src/resources/extensions/sf/tests/prompt-ordering.test.mjs +++ b/src/resources/extensions/sf/tests/prompt-ordering.test.mjs @@ -134,61 +134,3 @@ test("reorderAndSplitForCaching_preamble_goes_into_before", () => { "dynamic section in after", ); }); - - -test("reorderForCaching_when_inlined_slice_summary_has_requirements_advanced_keeps_it_after_mission", () => { - const prompt = [ - "# Milestone Validation", - "", - "## Working Directory", - "/repo", - "", - "## Mission", - "Dispatch reviewers.", - "", - "## Context", - "Inlined below.", - "", - "## Inlined Context", - "### S01 Summary", - "# S01", - "", - "## Requirements Advanced", - "- R1", - "", - "## Requirements Validated", - "None.", - ].join("\n"); - - const reordered = reorderForCaching(prompt); - - assert.ok( - reordered.indexOf("## Mission") < - reordered.indexOf("## Requirements Advanced"), - ); - assert.ok( - reordered.indexOf("## Context") < - reordered.indexOf("## Requirements Advanced"), - ); -}); - 
-test("reorderForCaching_when_top_level_requirements_exists_still_hoists_exact_requirements_block", () => { - const prompt = [ - "# Execute", - "", - "## Mission", - "Do work.", - "", - "## Requirements", - "- R1", - "", - "## Verification", - "Run tests.", - ].join("\n"); - - const reordered = reorderForCaching(prompt); - - assert.ok( - reordered.indexOf("## Requirements") < reordered.indexOf("## Mission"), - ); -}); diff --git a/src/resources/extensions/sf/tests/run-unit.test.mjs b/src/resources/extensions/sf/tests/run-unit.test.mjs new file mode 100644 index 000000000..5d95e296c --- /dev/null +++ b/src/resources/extensions/sf/tests/run-unit.test.mjs @@ -0,0 +1,30 @@ +import assert from "node:assert/strict"; +import { test } from "vitest"; + +import { buildUnitPromptMessageContent } from "../auto/run-unit.js"; + +test("buildUnitPromptMessageContent_when_prompt_parts_present_preserves_join_boundary", () => { + const content = buildUnitPromptMessageContent("flat", { + before: "## Working Directory\n/repo", + after: "## Inlined Task Plan\nDo it.", + }); + + assert.ok(Array.isArray(content)); + assert.deepEqual(content[0], { + type: "text", + text: "## Working Directory\n/repo\n", + cache_control: { type: "ephemeral" }, + }); + assert.deepEqual(content[1], { + type: "text", + text: "## Inlined Task Plan\nDo it.", + }); + assert.equal( + content.map((part) => part.text).join(""), + "## Working Directory\n/repo\n## Inlined Task Plan\nDo it.", + ); +}); + +test("buildUnitPromptMessageContent_when_no_prompt_parts_returns_flat_prompt", () => { + assert.equal(buildUnitPromptMessageContent("flat", null), "flat"); +}); diff --git a/src/resources/extensions/sf/tests/session-lock-terminate.test.mjs b/src/resources/extensions/sf/tests/session-lock-terminate.test.mjs new file mode 100644 index 000000000..288c2705a --- /dev/null +++ b/src/resources/extensions/sf/tests/session-lock-terminate.test.mjs @@ -0,0 +1,134 @@ +import { spawn } from "node:child_process"; +import { 
describe, expect, test } from "vitest"; +import { + isSessionPidAlive, + terminateExistingSession, +} from "../session-lock.js"; + +function spawnSleeper(seconds = 30) { + // `sleep` is a deliberate cooperative target: it exits on SIGTERM, which + // lets us exercise the graceful path. For the SIGKILL escalation test see + // spawnIgnoreSigterm, a Node child with a no-op SIGTERM handler. + const child = spawn("/bin/sh", ["-c", `sleep ${seconds}`], { + stdio: "ignore", + detached: false, + }); + return child; +} + +function spawnIgnoreSigterm(seconds = 30) { + // A Node child that installs an explicit SIGTERM handler that does + // nothing. Unlike `sh -c "trap '' TERM; sleep N"` (where the shell + // tail-call-exec's sleep so SIGTERM hits sleep directly), this child + // IS the long-lived process and reliably ignores SIGTERM until the + // SIGKILL escalation. This lets us assert the escalation path. + const child = spawn( + process.execPath, + [ + "-e", + `process.on('SIGTERM', () => {}); setTimeout(() => process.exit(0), ${seconds * 1000});`, + ], + { stdio: "ignore", detached: false }, + ); + return child; +} + +describe("terminateExistingSession", () => { + test("returns alreadyDead=true when pid is invalid", async () => { + const result = await terminateExistingSession(0); + expect(result.terminated).toBe(false); + expect(result.alreadyDead).toBe(true); + }); + + test("refuses to terminate the current process", async () => { + const result = await terminateExistingSession(process.pid); + expect(result.terminated).toBe(false); + }); + + test("returns alreadyDead=true for a dead pid", async () => { + // PID 1 is alive but not ours; use a value that's almost certainly + // not assigned. 2^31 - 2 is well above any plausible PID. 
+ const result = await terminateExistingSession(2147483646); + expect(result.alreadyDead).toBe(true); + expect(result.terminated).toBe(true); + }); + + test("gracefully terminates a process that respects SIGTERM", async () => { + const child = spawnSleeper(60); + try { + expect(isSessionPidAlive(child.pid)).toBe(true); + const result = await terminateExistingSession(child.pid, { + gracePeriodMs: 3000, + reapWaitMs: 1000, + pollIntervalMs: 50, + }); + expect(result.terminated).toBe(true); + expect(result.escalated).toBe(false); + expect(isSessionPidAlive(child.pid)).toBe(false); + } finally { + try { + child.kill("SIGKILL"); + } catch { + /* may already be dead */ + } + } + }); + + test("escalates to SIGKILL when the process ignores SIGTERM", async () => { + const child = spawnIgnoreSigterm(60); + // Give the child a moment to register its SIGTERM handler before we + // send SIGTERM. Without this, the kill may arrive before + // process.on('SIGTERM', …) executes and Node uses the default handler + // (exit on signal), which makes the test look like graceful exit. + await new Promise((resolve) => setTimeout(resolve, 250)); + try { + expect(isSessionPidAlive(child.pid)).toBe(true); + const result = await terminateExistingSession(child.pid, { + gracePeriodMs: 750, + reapWaitMs: 2000, + pollIntervalMs: 50, + }); + expect(result.terminated).toBe(true); + expect(result.escalated).toBe(true); + expect(isSessionPidAlive(child.pid)).toBe(false); + } finally { + try { + child.kill("SIGKILL"); + } catch { + /* may already be dead */ + } + } + }); +}); + +describe("isSessionPidAlive", () => { + test("returns false for current process (self-check is intentionally disabled)", () => { + // isPidAlive specifically excludes the current PID to prevent + // false-positive self-detection in the lock takeover flow. 
+ expect(isSessionPidAlive(process.pid)).toBe(false); + }); + + test("returns false for clearly-dead pid", () => { + expect(isSessionPidAlive(2147483646)).toBe(false); + }); + + test("returns true for a live child", async () => { + const child = spawnSleeper(30); + try { + expect(isSessionPidAlive(child.pid)).toBe(true); + } finally { + try { + child.kill("SIGKILL"); + } catch { + /* may already be dead */ + } + } + }); + + test("returns false for non-integer or non-positive inputs", () => { + expect(isSessionPidAlive(0)).toBe(false); + expect(isSessionPidAlive(-1)).toBe(false); + expect(isSessionPidAlive("nope")).toBe(false); + expect(isSessionPidAlive(null)).toBe(false); + }); +}); diff --git a/src/resources/extensions/sf/tests/slice-routing-cache.test.mjs b/src/resources/extensions/sf/tests/slice-routing-cache.test.mjs new file mode 100644 index 000000000..7f59236f7 --- /dev/null +++ b/src/resources/extensions/sf/tests/slice-routing-cache.test.mjs @@ -0,0 +1,136 @@ +import { mkdtempSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, describe, expect, test } from "vitest"; +import { + _readCacheForTests, + clearSliceRoutingForUnit, + extractSliceScope, + readStickyModelForUnit, + recordSliceRouting, +} from "../slice-routing-cache.js"; + +let tempDirs = []; +function makeProject() { + const dir = mkdtempSync(join(tmpdir(), "sf-slice-routing-")); + tempDirs.push(dir); + return dir; +} +afterEach(() => { + for (const dir of tempDirs) rmSync(dir, { recursive: true, force: true }); + tempDirs = []; +}); + +describe("extractSliceScope", () => { + test("execute-task style unit id collapses to milestone/slice", () => { + expect(extractSliceScope("M001-6377a4/S04/T02")).toBe("M001-6377a4/S04"); + }); + test("plan/complete slice ids stay as milestone/slice", () => { + expect(extractSliceScope("M001-6377a4/S04")).toBe("M001-6377a4/S04"); + }); + test("milestone-only ids return the milestone", () => { + 
expect(extractSliceScope("M001-6377a4")).toBe("M001-6377a4"); + }); + test("null/undefined/empty return null", () => { + expect(extractSliceScope(null)).toBeNull(); + expect(extractSliceScope("")).toBeNull(); + expect(extractSliceScope(undefined)).toBeNull(); + }); +}); + +describe("slice routing cache", () => { + test("record + read round-trips", () => { + const project = makeProject(); + recordSliceRouting(project, "execute-task", "M001/S04/T01", { + provider: "moonshotai", + id: "kimi-k2.6", + }); + const sticky = readStickyModelForUnit( + project, + "execute-task", + "M001/S04/T02", + ); + expect(sticky).toEqual({ provider: "moonshotai", id: "kimi-k2.6" }); + }); + + test("sticky scoped per slice — different slice => no hit", () => { + const project = makeProject(); + recordSliceRouting(project, "execute-task", "M001/S04/T01", { + provider: "moonshotai", + id: "kimi-k2.6", + }); + expect( + readStickyModelForUnit(project, "execute-task", "M001/S05/T01"), + ).toBeNull(); + }); + + test("clearSliceRoutingForUnit evicts only the matching slice", () => { + const project = makeProject(); + recordSliceRouting(project, "execute-task", "M001/S04/T01", { + provider: "moonshotai", + id: "kimi-k2.6", + }); + recordSliceRouting(project, "execute-task", "M001/S05/T01", { + provider: "anthropic", + id: "claude-sonnet-4-6", + }); + clearSliceRoutingForUnit(project, "M001/S04/T07"); + expect( + readStickyModelForUnit(project, "execute-task", "M001/S04/T99"), + ).toBeNull(); + expect( + readStickyModelForUnit(project, "execute-task", "M001/S05/T02"), + ).toEqual({ provider: "anthropic", id: "claude-sonnet-4-6" }); + }); + + test("readStickyModelForUnit honors maxAgeMs", async () => { + const project = makeProject(); + recordSliceRouting(project, "execute-task", "M001/S04/T01", { + provider: "moonshotai", + id: "kimi-k2.6", + }); + // Sleep past the retention window so age strictly exceeds maxAgeMs. 
+ await new Promise((resolve) => setTimeout(resolve, 25)); + expect( + readStickyModelForUnit(project, "execute-task", "M001/S04/T02", { + maxAgeMs: 10, + }), + ).toBeNull(); + }); + + test("returns null on missing basePath or unparseable unit id", () => { + expect(readStickyModelForUnit("", "execute-task", "M001/S04/T01")).toBeNull(); + const project = makeProject(); + expect(readStickyModelForUnit(project, "execute-task", "")).toBeNull(); + expect(readStickyModelForUnit(project, "execute-task", null)).toBeNull(); + }); + + test("overwrite updates the slice entry in place", () => { + const project = makeProject(); + recordSliceRouting(project, "execute-task", "M001/S04/T01", { + provider: "moonshotai", + id: "kimi-k2.6", + }); + recordSliceRouting(project, "execute-task", "M001/S04/T02", { + provider: "anthropic", + id: "claude-opus-4-7", + }); + const cache = _readCacheForTests(project); + const entries = Object.values(cache); + expect(entries.length).toBe(1); + expect( + readStickyModelForUnit(project, "execute-task", "M001/S04/T03"), + ).toEqual({ provider: "anthropic", id: "claude-opus-4-7" }); + }); + + test("clearSliceRoutingForUnit on the last entry removes the cache file", () => { + const project = makeProject(); + recordSliceRouting(project, "execute-task", "M001/S04/T01", { + provider: "moonshotai", + id: "kimi-k2.6", + }); + clearSliceRoutingForUnit(project, "M001/S04/T01"); + const cache = _readCacheForTests(project); + expect(Object.keys(cache).length).toBe(0); + }); +}); diff --git a/src/resources/extensions/sf/tests/solver-model.test.mjs b/src/resources/extensions/sf/tests/solver-model.test.mjs new file mode 100644 index 000000000..25da7ea40 --- /dev/null +++ b/src/resources/extensions/sf/tests/solver-model.test.mjs @@ -0,0 +1,134 @@ +import { describe, expect, test } from "vitest"; +import { + SOLVER_MODEL_DEFAULT, + SOLVER_MODEL_FALLBACKS, + isSolverModel, + resolveSolverModel, + resolveSolverModelCandidates, +} from "../solver-model.js"; + 
+describe("solver-model invariants", () => { + test("default is locked to kimi-k2.6 / kimi-coding", () => { + // This is a PROTOCOL INVARIANT, not a tuning parameter. Changing the + // default requires an ADR (see ADR-0079). If this test fails because + // someone bumped the default, that's a load-bearing change and a code + // review reject — re-read the ADR before re-running. + expect(SOLVER_MODEL_DEFAULT).toEqual({ + provider: "kimi-coding", + id: "kimi-k2.6", + }); + }); + + test("no fallback is a code-completion-only model", () => { + // Code-completion models (Codestral, Devstral, the kimi-for-coding + // alias) are the ones that broke the loop in the first place. They + // must NEVER appear in the solver fallback chain. + const forbidden = new Set([ + "codestral-latest", + "devstral-latest", + "kimi-for-coding", + ]); + for (const candidate of SOLVER_MODEL_FALLBACKS) { + expect(forbidden.has(candidate.id)).toBe(false); + } + }); +}); + +describe("resolveSolverModel", () => { + test("with no preferences returns the pinned default", () => { + expect(resolveSolverModel()).toEqual(SOLVER_MODEL_DEFAULT); + expect(resolveSolverModel(undefined)).toEqual(SOLVER_MODEL_DEFAULT); + expect(resolveSolverModel({})).toEqual(SOLVER_MODEL_DEFAULT); + }); + + test("ignores router/benchmark/learning state (no opt-in == default)", () => { + // Even with the kitchen sink of unrelated preference fields, + // resolveSolverModel must NOT consult any of them. Only an explicit + // preferences.autonomousSolver.model entry can override. 
+ const preferences = { + currentModel: { provider: "mistral", id: "codestral-latest" }, + modelRouter: { lastSelection: "google-gemini-cli/gemini-3-flash-preview" }, + benchmarkSelector: { winner: "kimi-for-coding" }, + learning: { blender: { recommended: "kimi-k2.5" } }, + }; + expect(resolveSolverModel(preferences)).toEqual(SOLVER_MODEL_DEFAULT); + }); + + test("respects an explicit object override", () => { + const resolved = resolveSolverModel({ + autonomousSolver: { model: { provider: "anthropic", id: "claude-opus-4-7" } }, + }); + expect(resolved).toEqual({ provider: "anthropic", id: "claude-opus-4-7" }); + }); + + test("accepts a string override in provider/model form", () => { + const resolved = resolveSolverModel({ + autonomousSolver: { model: "anthropic/claude-sonnet-4-6" }, + }); + expect(resolved).toEqual({ + provider: "anthropic", + id: "claude-sonnet-4-6", + }); + }); + + test("accepts a bare model id and keeps the default provider", () => { + const resolved = resolveSolverModel({ + autonomousSolver: { model: "kimi-k2-thinking" }, + }); + expect(resolved).toEqual({ + provider: SOLVER_MODEL_DEFAULT.provider, + id: "kimi-k2-thinking", + }); + }); + + test("ignores an empty-string override", () => { + expect( + resolveSolverModel({ autonomousSolver: { model: "" } }), + ).toEqual(SOLVER_MODEL_DEFAULT); + expect( + resolveSolverModel({ autonomousSolver: { model: " " } }), + ).toEqual(SOLVER_MODEL_DEFAULT); + }); +}); + +describe("resolveSolverModelCandidates", () => { + test("primary comes first, then fallback chain (de-duplicated)", () => { + const candidates = resolveSolverModelCandidates(); + expect(candidates[0]).toEqual(SOLVER_MODEL_DEFAULT); + expect(candidates.length).toBe(1 + SOLVER_MODEL_FALLBACKS.length); + }); + + test("override does not duplicate when also in fallback list", () => { + const candidates = resolveSolverModelCandidates({ + autonomousSolver: { model: "anthropic/claude-opus-4-7" }, + }); + const opusEntries = candidates.filter( + 
(c) => c.id === "claude-opus-4-7" && c.provider === "anthropic", + ); + expect(opusEntries.length).toBe(1); + }); +}); + +describe("isSolverModel", () => { + test("returns true for the pinned default", () => { + expect(isSolverModel(SOLVER_MODEL_DEFAULT)).toBe(true); + }); + + test("returns false for a routed executor model", () => { + expect( + isSolverModel({ provider: "mistral", id: "codestral-latest" }), + ).toBe(false); + expect( + isSolverModel({ + provider: "google-gemini-cli", + id: "gemini-3-flash-preview", + }), + ).toBe(false); + }); + + test("returns false for null / malformed inputs", () => { + expect(isSolverModel(null)).toBe(false); + expect(isSolverModel(undefined)).toBe(false); + expect(isSolverModel({})).toBe(false); + }); +}); diff --git a/src/resources/extensions/sf/tests/trace-janitor.test.mjs b/src/resources/extensions/sf/tests/trace-janitor.test.mjs new file mode 100644 index 000000000..de9cd6d18 --- /dev/null +++ b/src/resources/extensions/sf/tests/trace-janitor.test.mjs @@ -0,0 +1,115 @@ +import { + existsSync, + mkdirSync, + mkdtempSync, + rmSync, + symlinkSync, + utimesSync, + writeFileSync, +} from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, describe, expect, test } from "vitest"; +import { pruneStaleTraces } from "../uok/trace-writer.js"; + +let tempDirs = []; + +function makeProject() { + const dir = mkdtempSync(join(tmpdir(), "sf-trace-janitor-")); + tempDirs.push(dir); + mkdirSync(join(dir, ".sf"), { recursive: true }); + return dir; +} + +afterEach(() => { + for (const dir of tempDirs) { + rmSync(dir, { recursive: true, force: true }); + } + tempDirs = []; +}); + +function makeTraceFile(project, name, daysOld) { + const tracesDir = join(project, ".sf", "traces"); + mkdirSync(tracesDir, { recursive: true }); + const path = join(tracesDir, name); + writeFileSync(path, '{"ts":"2024-01-01T00:00:00Z","type":"gate_run"}\n'); + if (typeof daysOld === "number") { + const epoch = 
(Date.now() - daysOld * 24 * 60 * 60 * 1000) / 1000; + utimesSync(path, epoch, epoch); + } + return path; +} + +describe("pruneStaleTraces", () => { + test("removes jsonl files older than retention window", () => { + const project = makeProject(); + const oldFile = makeTraceFile( + project, + "pre-dispatch:old.jsonl", + 45, + ); + const freshFile = makeTraceFile( + project, + "pre-dispatch:fresh.jsonl", + 5, + ); + expect(existsSync(oldFile)).toBe(true); + expect(existsSync(freshFile)).toBe(true); + + const result = pruneStaleTraces(project); + expect(result.pruned).toBe(1); + expect(existsSync(oldFile)).toBe(false); + expect(existsSync(freshFile)).toBe(true); + }); + + test("respects a custom retention window", () => { + const project = makeProject(); + const file = makeTraceFile(project, "pre-dispatch:tenday.jsonl", 10); + const result = pruneStaleTraces(project, { retentionDays: 7 }); + expect(result.pruned).toBe(1); + expect(existsSync(file)).toBe(false); + }); + + test("never touches the `latest` symlink", () => { + const project = makeProject(); + const file = makeTraceFile(project, "pre-dispatch:current.jsonl", 0); + const latest = join(project, ".sf", "traces", "latest"); + symlinkSync("pre-dispatch:current.jsonl", latest); + // The target is fresh (0 days old), so `latest` must still resolve after pruning. 
+ pruneStaleTraces(project); + expect(existsSync(latest)).toBe(true); + }); + + test("ignores non-jsonl files", () => { + const project = makeProject(); + const tracesDir = join(project, ".sf", "traces"); + mkdirSync(tracesDir, { recursive: true }); + const txt = join(tracesDir, "notes.txt"); + writeFileSync(txt, "ignored"); + const epoch = (Date.now() - 90 * 24 * 60 * 60 * 1000) / 1000; + utimesSync(txt, epoch, epoch); + pruneStaleTraces(project); + expect(existsSync(txt)).toBe(true); + }); + + test("returns zero-counts when traces dir does not exist", () => { + const project = makeProject(); + // no traces dir + const result = pruneStaleTraces(project); + expect(result).toEqual({ scanned: 0, pruned: 0, errors: 0 }); + }); + + test("respects maxDeletePerCall safety cap", () => { + const project = makeProject(); + for (let i = 0; i < 5; i++) { + makeTraceFile(project, `pre-dispatch:old-${i}.jsonl`, 60); + } + const result = pruneStaleTraces(project, { maxDeletePerCall: 2 }); + expect(result.pruned).toBe(2); + }); + + test("does not throw on missing basePath", () => { + expect(() => pruneStaleTraces("")).not.toThrow(); + expect(() => pruneStaleTraces(undefined)).not.toThrow(); + }); +}); diff --git a/src/resources/extensions/sf/ui/index.js b/src/resources/extensions/sf/ui/index.js index 1ce31513b..4e5ffd071 100644 --- a/src/resources/extensions/sf/ui/index.js +++ b/src/resources/extensions/sf/ui/index.js @@ -328,13 +328,12 @@ export default function sfTui(pi) { renderResult: ({ output }) => output, }); - // ASK_USER_ELICITATION — structured form-based ask_user replacement. - // When the flag is on and the agent calls this tool with choices, a TUI - // select overlay is shown instead of a plain text prompt. + // ask_user_elicitation — structured form-based ask_user replacement. + // Shows a TUI select overlay when choices are provided, freeform input otherwise. 
pi.registerTool({ name: "ask_user_elicitation", description: - "Ask the user a question using a structured form with optional choices. When ASK_USER_ELICITATION is enabled this is preferred over plain ask_user for questions with known choices.", + "Ask the user a question using a structured form with optional choices. Shows a TUI select overlay when choices are provided, or a freeform text prompt otherwise.", parameters: { type: "object", properties: { @@ -359,12 +358,6 @@ export default function sfTui(pi) { if (!ctx?.hasUI) { return { output: "No UI available for elicitation." }; } - if (!getExperimentalFlag("ask_elicitation")) { - return { - output: - "ASK_USER_ELICITATION is not enabled. Run /experimental on ask_elicitation to enable.", - }; - } if (choices?.length) { const answer = await ctx.ui.select(question, choices); if (!answer && allow_freeform) { @@ -379,121 +372,6 @@ export default function sfTui(pi) { renderResult: ({ output }) => (output ? `**Answer:** ${output}` : ""), }); - // MULTI_TURN_AGENTS — persistent named sub-agent sessions via file-backed state. - // Tool that spawns or resumes a named SF child process, relaying messages. - pi.registerTool({ - name: "spawn_agent", - description: - "Spawn or resume a named persistent sub-agent. Sends a message and waits for the response. The agent persists across calls using file-backed state in .sf/agents/<name>/.", - parameters: { - type: "object", - properties: { - name: { - type: "string", - description: - "Unique agent name (alphanumeric + hyphens, e.g. 'researcher')", - }, - message: { - type: "string", - description: "Message to send to the agent", - }, - reset: { - type: "boolean", - description: - "If true, clear the agent's state and start fresh (default: false)", - }, - }, - required: ["name", "message"], - }, - execute: async ({ name, message, reset }) => { - if (!getExperimentalFlag("multi_turn_agents")) { - return { - output: - "MULTI_TURN_AGENTS is not enabled. 
Run /experimental on multi_turn_agents to enable.", - }; - } - if (!/^[a-z0-9-]{1,32}$/i.test(name)) { - return { - output: "Agent name must be 1-32 alphanumeric/hyphen characters.", - }; - } - const { join: pathJoin } = await import("node:path"); - const { mkdirSync, writeFileSync, readFileSync, existsSync } = - await import("node:fs"); - const stateDir = pathJoin( - projectRoot() ?? process.cwd(), - ".sf", - "agents", - name, - ); - mkdirSync(stateDir, { recursive: true }); - const historyPath = pathJoin(stateDir, "history.jsonl"); - if (reset && existsSync(historyPath)) { - writeFileSync(historyPath, "", "utf-8"); - } - // Append user message to history - const entry = JSON.stringify({ - role: "user", - content: message, - ts: Date.now(), - }); - const { appendFileSync } = await import("node:fs"); - appendFileSync(historyPath, `${entry}\n`, "utf-8"); - // Dispatch to SF headless with the conversation history as context - const historyLines = existsSync(historyPath) - ? readFileSync(historyPath, "utf-8") - .trim() - .split("\n") - .filter(Boolean) - .map((l) => { - try { - return JSON.parse(l); - } catch { - return null; - } - }) - .filter(Boolean) - : []; - const contextMsg = historyLines - .slice(-10) // last 10 turns for context - .map((e) => `${e.role === "user" ? "User" : "Agent"}: ${e.content}`) - .join("\n"); - const fullPrompt = `[Agent: ${name}]\n\nConversation history:\n${contextMsg}\n\nRespond to the last user message only.`; - const { execFile } = await import("node:child_process"); - const { promisify } = await import("node:util"); - const execFileAsync = promisify(execFile); - try { - const { stdout } = await execFileAsync( - process.execPath, - [ - "-y", - "node@24", - process.env.SF_LOADER ?? 
"dist/loader.js", - "headless", - "--print", - fullPrompt, - ], - { - timeout: 60000, - encoding: "utf-8", - env: { ...process.env }, - }, - ); - const response = stdout.trim(); - appendFileSync( - historyPath, - `${JSON.stringify({ role: "assistant", content: response, ts: Date.now() })}\n`, - "utf-8", - ); - return { output: response }; - } catch (err) { - return { - output: `Agent dispatch failed: ${getErrorMessage(err)}`, - }; - } - }, - renderResult: ({ output }) => output, - }); } /** Run the STATUS_LINE user script on a 5s interval, posting stdout to footer. */ diff --git a/src/resources/extensions/sf/uok/persistent-agent.js b/src/resources/extensions/sf/uok/persistent-agent.js index cd53c0d21..8c822a305 100644 --- a/src/resources/extensions/sf/uok/persistent-agent.js +++ b/src/resources/extensions/sf/uok/persistent-agent.js @@ -7,6 +7,31 @@ * * Consumer: AgentSwarm orchestrator, swarm role agents (CoordinatorAgent, WorkerAgent etc), * and direct use in multi-agent dispatch flows. + * + * ## Current state + * This module implements the **container** half of a persistent agent: identity, inbox, + * memory blocks, and message routing. It does NOT implement the **runner** half. + * + * The missing piece is an LLM execution runner that: + * 1. Reads pending messages from this agent's inbox (`receive(true)`) + * 2. Assembles a prompt from core memory blocks + inbox messages + * 3. Dispatches to SF headless (`node dist/loader.js headless --print <prompt>`) + * 4. Writes the LLM response back into the bus as a reply + * 5. Updates memory blocks (eviction, summarization) when context grows large + * + * Until the runner exists, `PersistentAgent` is a passive store. The autonomous loop + * uses it this way for sleeptime memory consolidation (caller sends + immediately reads + * inbox). `SwarmDispatchLayer` also only enqueues messages — nothing processes them. 
+ * + * When building the runner, key design decisions to make: + * - Context window management: how many inbox turns to include before summarizing + * - Memory eviction: which core blocks are injected, which are summarized to archival + * - Turn limits: max rounds before the runner yields and re-queues + * - Concurrency: one runner per agent name (enforce via DB lock or process mutex) + * - Error handling: failed LLM calls should leave the message as unread, not drop it + * + * See: Codex `codex-rs/core/src/agent/control.rs` for the reference implementation of + * typed parallel subagents (explorer/worker roles) with forked rollout history. */ import { randomUUID } from "node:crypto"; diff --git a/src/resources/extensions/sf/uok/swarm-dispatch.js b/src/resources/extensions/sf/uok/swarm-dispatch.js index 3a7e66723..b6e1f107a 100644 --- a/src/resources/extensions/sf/uok/swarm-dispatch.js +++ b/src/resources/extensions/sf/uok/swarm-dispatch.js @@ -8,6 +8,18 @@ * * Consumer: UOK kernel dispatch path, parallel orchestrators, and /sf autonomous controller * when SF_A2A_ENABLED is set. + * + * ## Current state — enqueue only, no runner + * `_busDispatch` routes an envelope to a role agent's inbox via the MessageBus. It does NOT + * wait for a response — the `DispatchResult` contains only `messageId` and `targetAgent`, + * not LLM output. Nothing currently drains agent inboxes and runs LLM calls. + * + * This layer is ready to use once `PersistentAgent` gains a runner (see persistent-agent.js + * module comment for the runner design). At that point `dispatch()` can be extended to + * optionally block until the runner posts a reply to the bus. + * + * Callers outside uok/: none currently. The autonomous loop uses AgentSwarm directly for + * the sleeptime memory path. Wire this in when building the autonomous orchestrator. 
/**
 * Prune .sf/traces/*.jsonl files older than retentionDays.
 *
 * Why: per-flow trace files accumulate one-per-dispatch and are never
 * cleaned. The longest analyzer window today is 30 days
 * (sf-db-gates.js:391); anything older is never read and just consumes
 * disk. The `latest` entry is skipped unconditionally so the
 * tail-friendly pointer keeps working.
 *
 * Safety: this function deletes files, so option values that cannot be
 * interpreted are never allowed to widen what gets deleted. A
 * `retentionDays` that is NaN or negative falls back to the default
 * retention (a NaN cutoff would otherwise make the `mtime >= cutoff`
 * check false for every file and delete all traces). A `maxDeletePerCall`
 * that is NaN falls back to 1000 (NaN would otherwise disable the cap).
 *
 * Consumer: session-start hook (idempotent, fast, best-effort).
 *
 * @param {string} basePath - project base path; a missing or non-string
 *   value yields an all-zero result without touching the filesystem.
 * @param {object} [opts]
 * @param {number} [opts.retentionDays=30] - files whose mtime is older
 *   than this many days are deleted; `0` prunes every regular .jsonl file.
 * @param {number} [opts.maxDeletePerCall=1000] - safety cap so a runaway
 *   directory doesn't make startup slow; values below 1 are raised to 1.
 * @returns {{ scanned: number, pruned: number, errors: number }}
 *   `scanned` counts .jsonl entries examined, `pruned` counts files
 *   deleted, `errors` counts entries whose stat/unlink threw.
 */
export function pruneStaleTraces(basePath, opts = {}) {
  const result = { scanned: 0, pruned: 0, errors: 0 };
  if (!basePath || typeof basePath !== "string") return result;

  // Validate before computing the cutoff: NaN or a negative number would
  // turn "prune old files" into "prune everything".
  const requestedDays = Number(
    opts?.retentionDays ?? TRACE_RETENTION_DAYS_DEFAULT,
  );
  const retentionDays =
    Number.isNaN(requestedDays) || requestedDays < 0
      ? TRACE_RETENTION_DAYS_DEFAULT
      : requestedDays;

  // Math.max(1, NaN) is NaN and `pruned >= NaN` is never true, so an
  // unparseable cap must be replaced rather than passed through.
  const requestedCap = Number(opts?.maxDeletePerCall ?? 1000);
  const maxDeletePerCall = Number.isNaN(requestedCap)
    ? 1000
    : Math.max(1, requestedCap);

  let dir;
  try {
    dir = tracesDir(basePath);
  } catch {
    return result;
  }
  if (!existsSync(dir)) return result;

  const cutoff = Date.now() - retentionDays * MS_PER_DAY;
  let entries;
  try {
    entries = readdirSync(dir);
  } catch {
    return result;
  }

  for (const name of entries) {
    if (result.pruned >= maxDeletePerCall) break;
    if (name === "latest") continue;
    if (!name.endsWith(".jsonl")) continue;
    const path = join(dir, name);
    result.scanned += 1;
    try {
      // lstat so we don't follow a symlink (defensive — there shouldn't
      // be any besides `latest`, but never silently chase).
      const stat = lstatSync(path);
      if (!stat.isFile()) continue;
      if (stat.mtimeMs >= cutoff) continue;
      unlinkSync(path);
      result.pruned += 1;
    } catch {
      // Best-effort: a file that vanished or cannot be removed is counted
      // and skipped so one bad entry doesn't abort the sweep.
      result.errors += 1;
    }
  }
  return result;
}