diff --git a/.sf/backups/db/sf.db.2026-05-08T22-42-32-307Z b/.sf/backups/db/sf.db.2026-05-08T22-42-32-307Z deleted file mode 100644 index e78b0d5e7..000000000 Binary files a/.sf/backups/db/sf.db.2026-05-08T22-42-32-307Z and /dev/null differ diff --git a/.sf/backups/db/sf.db.2026-05-09T17-40-16-600Z b/.sf/backups/db/sf.db.2026-05-09T17-40-16-600Z deleted file mode 100644 index feaf7cee6..000000000 Binary files a/.sf/backups/db/sf.db.2026-05-09T17-40-16-600Z and /dev/null differ diff --git a/.sf/backups/db/sf.db.2026-05-09T19-41-02-472Z b/.sf/backups/db/sf.db.2026-05-09T19-41-02-472Z deleted file mode 100644 index 136e4280b..000000000 Binary files a/.sf/backups/db/sf.db.2026-05-09T19-41-02-472Z and /dev/null differ diff --git a/.sf/backups/db/sf.db.2026-05-10T14-47-54-645Z b/.sf/backups/db/sf.db.2026-05-10T14-47-54-645Z new file mode 100644 index 000000000..eb09e243f Binary files /dev/null and b/.sf/backups/db/sf.db.2026-05-10T14-47-54-645Z differ diff --git a/.sf/backups/db/sf.db.2026-05-10T15-16-24-679Z b/.sf/backups/db/sf.db.2026-05-10T15-16-24-679Z new file mode 100644 index 000000000..9d3fa91ee Binary files /dev/null and b/.sf/backups/db/sf.db.2026-05-10T15-16-24-679Z differ diff --git a/.sf/metrics.db b/.sf/metrics.db index 980021f6a..32a0ea60e 100644 Binary files a/.sf/metrics.db and b/.sf/metrics.db differ diff --git a/.sf/metrics.db-shm b/.sf/metrics.db-shm index fe9ac2845..b60ed8783 100644 Binary files a/.sf/metrics.db-shm and b/.sf/metrics.db-shm differ diff --git a/.sf/metrics.db-wal b/.sf/metrics.db-wal index e69de29bb..afbb704a3 100644 Binary files a/.sf/metrics.db-wal and b/.sf/metrics.db-wal differ diff --git a/.sf/model-performance.json b/.sf/model-performance.json index ee61f7aef..406df1828 100644 --- a/.sf/model-performance.json +++ b/.sf/model-performance.json @@ -86,5 +86,17 @@ "successRate": 1, "total": 2 } + }, + "execute-task": { + "minimax/MiniMax-M2.7-highspeed": { + "successes": 1, + "failures": 0, + "timeouts": 0, + "totalTokens": 
12233288, + "totalCost": 0.3431336426, + "lastUsed": "2026-05-10T15:16:08.120Z", + "successRate": 1, + "total": 1 + } } } \ No newline at end of file diff --git a/.sf/safety/evidence-M001-6377a4-S03-T01.json b/.sf/safety/evidence-M001-6377a4-S03-T01.json new file mode 100644 index 000000000..68083ba8d --- /dev/null +++ b/.sf/safety/evidence-M001-6377a4-S03-T01.json @@ -0,0 +1,244 @@ +[ + { + "kind": "edit", + "toolCallId": "tool_Ca8DVl3y7fTqNLC5XPq0Pwk1", + "path": "src/resources/extensions/sf/verification-evidence.js", + "timestamp": 1778424928174 + }, + { + "kind": "edit", + "toolCallId": "tool_xVkmHZTHvJP7RwJWVqUORsz4", + "path": "src/resources/extensions/sf/verification-evidence.js", + "timestamp": 1778424949674 + }, + { + "kind": "edit", + "toolCallId": "tool_EiDPzaZO49a4LKnYvuvFaS8e", + "path": "src/resources/extensions/sf/auto-verification.js", + "timestamp": 1778424983294 + }, + { + "kind": "edit", + "toolCallId": "tool_pNt9nP10Us3CPrsqlnWwtQ8l", + "path": "src/resources/extensions/sf/auto-verification.js", + "timestamp": 1778425005515 + }, + { + "kind": "edit", + "toolCallId": "tool_Bl3x74Ojz6aenqD3nYqxkdlO", + "path": "src/resources/extensions/sf/auto-verification.js", + "timestamp": 1778425108830 + }, + { + "kind": "edit", + "toolCallId": "tool_RHLdM0SZK4ffIIokuqNruHbn", + "path": "src/resources/extensions/sf/auto-verification.js", + "timestamp": 1778425162119 + }, + { + "kind": "edit", + "toolCallId": "tool_mAdgaYCgksHmjAI45ZuSnMk5", + "path": "src/resources/extensions/sf/auto-verification.js", + "timestamp": 1778425187240 + }, + { + "kind": "edit", + "toolCallId": "tool_HMsSokItiWF9y6ctKvFSkyE3", + "path": "src/resources/extensions/sf/auto-verification.js", + "timestamp": 1778425206204 + }, + { + "kind": "edit", + "toolCallId": "tool_Jbd8uJQ6ZV4PeF8P91s2OvFG", + "path": "src/resources/extensions/sf/uok/unit-runtime.js", + "timestamp": 1778425258651 + }, + { + "kind": "edit", + "toolCallId": "tool_m1a9UNWqpwBIJvzB9LtlVTBN", + "path": 
"src/cli-status.ts", + "timestamp": 1778425298174 + }, + { + "kind": "edit", + "toolCallId": "tool_zcSH4Fx3bOumjphAgYisPyhE", + "path": "src/cli-status.ts", + "timestamp": 1778425348170 + }, + { + "kind": "edit", + "toolCallId": "tool_UjLWJsxhCI2bAt3kYl4QEhNK", + "path": "src/resources/extensions/sf/tests/uok-unit-runtime.test.mjs", + "timestamp": 1778425381561 + }, + { + "kind": "edit", + "toolCallId": "tool_1owe7a26pVq3k18x59p6Sy1J", + "path": "src/resources/extensions/sf/tests/uok-unit-runtime.test.mjs", + "timestamp": 1778425414995 + }, + { + "kind": "bash", + "toolCallId": "tool_zgJvYKclL8xmY8DE4c8nluTk", + "command": "npx vitest run src/resources/extensions/sf/tests/uok-unit-runtime.test.mjs --config vitest.config.ts -t \"getRecoveryDiagnostics\"", + "exitCode": 0, + "outputSnippet": "\n\u001b[1m\u001b[30m\u001b[46m RUN \u001b[49m\u001b[39m\u001b[22m \u001b[36mv4.1.5 \u001b[39m\u001b[90m/home/mhugo/code/singularity-forge\u001b[39m\n\n \u001b[32m✓\u001b[39m src/resources/extensions/sf/tests/uok-unit-runtime.test.mjs \u001b[2m(\u001b[22m\u001b[2m38 tests\u001b[22m\u001b[2m | \u001b[22m\u001b[33m35 skipped\u001b[39m\u001b[2m)\u001b[22m\u001b[32m 33\u001b[2mms\u001b[22m\u001b[39m\n\n\u001b[2m Test Files \u001b[22m \u001b[1m\u001b[32m1 passed\u001b[39m\u001b[22m\u001b[90m (1)\u001b[39m\n\u001b[2m Tests \u001b[22m \u001b[1m\u001b[32m3 passed\u001b[39m\u001b[22m\u001b[2m | \u001b[22m\u001b[33m35 skipped\u001b[39m\u001b[90m (38)\u001b[39m\n\u001b[2m Start at \u001b[22m 17:03:54\n\u001b[2m Duration \u001b[22m 1.01s\u001b[2m (transfor", + "timestamp": 1778425433765 + }, + { + "kind": "bash", + "toolCallId": "tool_c9hBKD3D1Ojayl25rr7Mu3UW", + "command": "npx vitest run src/resources/extensions/sf/tests/uok-unit-runtime.test.mjs --config vitest.config.ts", + "exitCode": 0, + "outputSnippet": "\n\u001b[1m\u001b[30m\u001b[46m RUN \u001b[49m\u001b[39m\u001b[22m \u001b[36mv4.1.5 \u001b[39m\u001b[90m/home/mhugo/code/singularity-forge\u001b[39m\n\n \u001b[32m✓\u001b[39m 
src/resources/extensions/sf/tests/uok-unit-runtime.test.mjs \u001b[2m(\u001b[22m\u001b[2m38 tests\u001b[22m\u001b[2m)\u001b[22m\u001b[32m 140\u001b[2mms\u001b[22m\u001b[39m\n\n\u001b[2m Test Files \u001b[22m \u001b[1m\u001b[32m1 passed\u001b[39m\u001b[22m\u001b[90m (1)\u001b[39m\n\u001b[2m Tests \u001b[22m \u001b[1m\u001b[32m38 passed\u001b[39m\u001b[22m\u001b[90m (38)\u001b[39m\n\u001b[2m Start at \u001b[22m 17:04:12\n\u001b[2m Duration \u001b[22m 1.16s\u001b[2m (transform 552ms, setup 0ms, import 742ms, tests 140ms, environment 0ms", + "timestamp": 1778425451405 + }, + { + "kind": "bash", + "toolCallId": "tool_jrt7lAB8i6qBOnOePlNhiFR2", + "command": "npx biome check src/resources/extensions/sf/verification-evidence.js src/resources/extensions/sf/auto-verification.js src/resources/extensions/sf/uok/unit-runtime.js src/cli-status.ts src/resources/extensions/sf/tests/uok-unit-runtime.test.mjs", + "exitCode": 1, + "outputSnippet": "src/cli-status.ts format ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n × Formatter would have printed the following content:\n \n 240 240 │ if (!targetUnitId) {\n 241 241 │ const records = listUnitRuntimeRecords(basePath);\n 242 │ - → → → const·mostRecent·=·records.sort((a,·b)·=>\n 243 │ - → → → → (b.updatedAt·??·0)·-·(a.updatedAt·??·0),\n 242 │ + → → → const·mostRecent·=·records.sort(\n 243 │ + → → → → (a,·b)·=>·(b.updat", + "timestamp": 1778425466821 + }, + { + "kind": "bash", + "toolCallId": "tool_lpaw0h5RHFke7wyq5gw8sJKp", + "command": "npx biome check --write src/resources/extensions/sf/verification-evidence.js src/resources/extensions/sf/auto-verification.js src/resources/extensions/sf/uok/unit-runtime.js src/cli-status.ts src/resources/extensions/sf/tests/uok-unit-runtime.test.mjs", + "exitCode": 0, + "outputSnippet": "Checked 5 files in 160ms. 
Fixed 3 files.\n", + "timestamp": 1778425501417 + }, + { + "kind": "bash", + "toolCallId": "tool_so1U8wV32tA70qs55EtB7Pke", + "command": "npx vitest run src/resources/extensions/sf/tests/uok-unit-runtime.test.mjs --config vitest.config.ts", + "exitCode": 0, + "outputSnippet": "\n\u001b[1m\u001b[30m\u001b[46m RUN \u001b[49m\u001b[39m\u001b[22m \u001b[36mv4.1.5 \u001b[39m\u001b[90m/home/mhugo/code/singularity-forge\u001b[39m\n\n \u001b[32m✓\u001b[39m src/resources/extensions/sf/tests/uok-unit-runtime.test.mjs \u001b[2m(\u001b[22m\u001b[2m38 tests\u001b[22m\u001b[2m)\u001b[22m\u001b[32m 152\u001b[2mms\u001b[22m\u001b[39m\n\n\u001b[2m Test Files \u001b[22m \u001b[1m\u001b[32m1 passed\u001b[39m\u001b[22m\u001b[90m (1)\u001b[39m\n\u001b[2m Tests \u001b[22m \u001b[1m\u001b[32m38 passed\u001b[39m\u001b[22m\u001b[90m (38)\u001b[39m\n\u001b[2m Start at \u001b[22m 17:05:15\n\u001b[2m Duration \u001b[22m 1.16s\u001b[2m (transform 554ms, setup 0ms, import 758ms, tests 152ms, environment 0ms", + "timestamp": 1778425514641 + }, + { + "kind": "bash", + "toolCallId": "tool_xBsWuLSrGmQHcVUWydLtioru", + "command": "npx tsc --noEmit src/cli-status.ts --skipLibCheck --module NodeNext --moduleResolution NodeNext --target ES2022 --esModuleInterop --allowJs --strict 2>&1 | head -30", + "exitCode": 0, + "outputSnippet": "error TS5112: tsconfig.json is present but will not be loaded if files are specified on commandline. Use '--ignoreConfig' to skip this error.\n", + "timestamp": 1778425532429 + }, + { + "kind": "bash", + "toolCallId": "tool_lnBQh90rfwsYyVvqmIdq8pH7", + "command": "npx tsc --noEmit --project tsconfig.json --skipLibCheck 2>&1 | grep -E \"cli-status|auto-verification|unit-runtime|verification-evidence\" | head -20", + "exitCode": 0, + "outputSnippet": "src/cli-status.ts(237,4): error TS7016: Could not find a declaration file for module './resources/extensions/sf/uok/unit-runtime.js'. 
'/home/mhugo/code/singularity-forge/src/resources/extensions/sf/uok/unit-runtime.js' implicitly has an 'any' type.\nsrc/cli-status.ts(243,6): error TS7006: Parameter 'a' implicitly has an 'any' type.\nsrc/cli-status.ts(243,9): error TS7006: Parameter 'b' implicitly has an 'any' type.\n", + "timestamp": 1778425546375 + }, + { + "kind": "bash", + "toolCallId": "tool_L5EQX39mZFzmJbRkCACrnkcM", + "command": "npx tsc --noEmit --project tsconfig.json --skipLibCheck 2>&1 | grep \" Could not find a declaration file\" | head -10", + "exitCode": 0, + "outputSnippet": "src/cli-status.ts(237,4): error TS7016: Could not find a declaration file for module './resources/extensions/sf/uok/unit-runtime.js'. '/home/mhugo/code/singularity-forge/src/resources/extensions/sf/uok/unit-runtime.js' implicitly has an 'any' type.\n", + "timestamp": 1778425700527 + }, + { + "kind": "edit", + "toolCallId": "tool_bMteIOMYNzppZgJRP2bDK14p", + "path": "src/cli-status.ts", + "timestamp": 1778425760653 + }, + { + "kind": "bash", + "toolCallId": "tool_kja8VArplGz7aqlFb1rZvh9M", + "command": "npx tsc --noEmit --project tsconfig.json --skipLibCheck 2>&1 | grep -E \"cli-status|auto-verification|unit-runtime|verification-evidence\" | head -20", + "exitCode": 0, + "outputSnippet": "src/cli-status.ts(237,4): error TS7016: Could not find a declaration file for module './resources/extensions/sf/uok/unit-runtime.js'. '/home/mhugo/code/singularity-forge/src/resources/extensions/sf/uok/unit-runtime.js' implicitly has an 'any' type.\n", + "timestamp": 1778425779047 + }, + { + "kind": "bash", + "toolCallId": "tool_xAMnWlYjFmPHJj2zb3YklZhI", + "command": "npx tsc --noEmit --project tsconfig.json --skipLibCheck 2>&1 | grep \"TS7016\" | head -20", + "exitCode": 0, + "outputSnippet": "src/cli-status.ts(237,4): error TS7016: Could not find a declaration file for module './resources/extensions/sf/uok/unit-runtime.js'. 
'/home/mhugo/code/singularity-forge/src/resources/extensions/sf/uok/unit-runtime.js' implicitly has an 'any' type.\n", + "timestamp": 1778425808025 + }, + { + "kind": "bash", + "toolCallId": "tool_N2K96iYZY9GgeXRHbsVIDJFn", + "command": "ls src/resources/extensions/sf/autonomous-command-args.* src/resources/extensions/sf/repo-identity.* src/resources/extensions/sf/trace-collector.*", + "exitCode": 0, + "outputSnippet": "src/resources/extensions/sf/autonomous-command-args.d.ts\nsrc/resources/extensions/sf/autonomous-command-args.js\nsrc/resources/extensions/sf/repo-identity.d.ts\nsrc/resources/extensions/sf/repo-identity.js\nsrc/resources/extensions/sf/trace-collector.d.ts\nsrc/resources/extensions/sf/trace-collector.js\n", + "timestamp": 1778425827433 + }, + { + "kind": "write", + "toolCallId": "tool_d6oIGASndyhbsF3ethH49BBC", + "path": "src/resources/extensions/sf/uok/unit-runtime.d.ts", + "timestamp": 1778425847941 + }, + { + "kind": "bash", + "toolCallId": "tool_8ipcG5bydJIBFOOfadbuRNzl", + "command": "npx tsc --noEmit --project tsconfig.json --skipLibCheck 2>&1 | grep -E \"cli-status|unit-runtime\" | head -20", + "exitCode": 0, + "outputSnippet": "(no output)", + "timestamp": 1778425863933 + }, + { + "kind": "bash", + "toolCallId": "tool_oas0rGaFmZW4i6dQZDfNbIOj", + "command": "npx tsc --noEmit --project tsconfig.json --skipLibCheck 2>&1 | head -20", + "exitCode": 0, + "outputSnippet": "(no output)", + "timestamp": 1778425882297 + }, + { + "kind": "bash", + "toolCallId": "tool_bWplHSK7Xw5d98Gzhf69rI3E", + "command": "npx tsc --noEmit --project tsconfig.extensions.json --skipLibCheck 2>&1 | head -20", + "exitCode": 0, + "outputSnippet": "packages/tui/src/index.ts(52,34): error TS6142: Module './ink-bridge.js' was resolved to '/home/mhugo/code/singularity-forge/packages/tui/src/ink-bridge.tsx', but '--jsx' is not set.\npackages/tui/src/tui.ts(14,34): error TS6142: Module './ink-bridge.js' was resolved to 
'/home/mhugo/code/singularity-forge/packages/tui/src/ink-bridge.tsx', but '--jsx' is not set.\npackages/tui/src/tui.ts(434,46): error TS7006: Parameter 'data' implicitly has an 'any' type.\n", + "timestamp": 1778425897974 + }, + { + "kind": "bash", + "toolCallId": "tool_R9DLuMbsvg7Lz87cGbyzuS01", + "command": "npx biome check src/resources/extensions/sf/verification-evidence.js src/resources/extensions/sf/auto-verification.js src/resources/extensions/sf/uok/unit-runtime.js src/resources/extensions/sf/uok/unit-runtime.d.ts src/cli-status.ts src/resources/extensions/sf/tests/uok-unit-runtime.test.mjs", + "exitCode": 1, + "outputSnippet": "src/resources/extensions/sf/uok/unit-runtime.d.ts format ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n × Formatter would have printed the following content:\n \n 28 28 │ ): RecoveryDiagnostics | null;\n 29 29 │ \n 30 │ - export·function·listUnitRuntimeRecords(basePath:·string):·Array<\n 31 │ - → Record·&·{·updatedAt?:·number;·unitId:·string·}\n 32 │ - >;\n 30 │ + export·function·listUnitRuntimeRecords(\n 31 │ + → basePath:·string,\n 32 │ + ", + "timestamp": 1778425942347 + }, + { + "kind": "bash", + "toolCallId": "tool_nQuAxzeREvyod0nBGyVPIvfJ", + "command": "npx biome check --write src/resources/extensions/sf/uok/unit-runtime.d.ts", + "exitCode": 0, + "outputSnippet": "Checked 1 file in 17ms. Fixed 1 file.\n", + "timestamp": 1778425957743 + }, + { + "kind": "bash", + "toolCallId": "tool_HZWYvGOveamvyW97wbv5A0Fv", + "command": "npx biome check src/resources/extensions/sf/verification-evidence.js src/resources/extensions/sf/auto-verification.js src/resources/extensions/sf/uok/unit-runtime.js src/resources/extensions/sf/uok/unit-runtime.d.ts src/cli-status.ts src/resources/extensions/sf/tests/uok-unit-runtime.test.mjs", + "exitCode": 0, + "outputSnippet": "Checked 6 files in 70ms. 
No fixes applied.\n", + "timestamp": 1778425970841 + }, + { + "kind": "bash", + "toolCallId": "tool_SY84S3XYIIBJGZvUchG4qSRZ", + "command": "npx vitest run src/resources/extensions/sf/tests/uok-unit-runtime.test.mjs --config vitest.config.ts", + "exitCode": 0, + "outputSnippet": "\n\u001b[1m\u001b[30m\u001b[46m RUN \u001b[49m\u001b[39m\u001b[22m \u001b[36mv4.1.5 \u001b[39m\u001b[90m/home/mhugo/code/singularity-forge\u001b[39m\n\n \u001b[32m✓\u001b[39m src/resources/extensions/sf/tests/uok-unit-runtime.test.mjs \u001b[2m(\u001b[22m\u001b[2m38 tests\u001b[22m\u001b[2m)\u001b[22m\u001b[32m 168\u001b[2mms\u001b[22m\u001b[39m\n\n\u001b[2m Test Files \u001b[22m \u001b[1m\u001b[32m1 passed\u001b[39m\u001b[22m\u001b[90m (1)\u001b[39m\n\u001b[2m Tests \u001b[22m \u001b[1m\u001b[32m38 passed\u001b[39m\u001b[22m\u001b[90m (38)\u001b[39m\n\u001b[2m Start at \u001b[22m 17:13:05\n\u001b[2m Duration \u001b[22m 1.11s\u001b[2m (transform 477ms, setup 0ms, import 702ms, tests 168ms, environment 0ms", + "timestamp": 1778425983954 + }, + { + "kind": "bash", + "toolCallId": "tool_SdyaSTc9h2LJD9pq5ZLR3Ehc", + "command": "git status --short", + "exitCode": 0, + "outputSnippet": " D .sf/backups/db/sf.db.2026-05-08T22-42-32-307Z\n D .sf/backups/db/sf.db.2026-05-09T17-40-16-600Z\n D .sf/backups/db/sf.db.2026-05-09T19-41-02-472Z\n M .sf/metrics.db\n M .sf/metrics.db-shm\n M .sf/metrics.db-wal\n D BUILD_PLAN_MILESTONE_MAP.md\n D PRODUCTION_AUDIT_COMPLETE.md\n D QUICK_WINS_IMPLEMENTATION.md\n D TRIAGE_COMPLETE.md\n D TRIAGE_README.md\n D autoresearch.md\n D copilot-thoughts.md\n M packages/coding-agent/src/modes/interactive/interactive-mode.ts\n M packages/tui/src/ink-bridge.tsx\n M package", + "timestamp": 1778425999682 + } +] diff --git a/BUILD_PLAN_MILESTONE_MAP.md b/BUILD_PLAN_MILESTONE_MAP.md deleted file mode 100644 index 14be3351d..000000000 --- a/BUILD_PLAN_MILESTONE_MAP.md +++ /dev/null @@ -1,70 +0,0 @@ -# BUILD_PLAN → Milestone Map - -Every BUILD_PLAN.md tier item mapped to a 
milestone. **Rule D015**: every new milestone must cite which BUILD_PLAN tier/item it implements. - -This file answers **where work belongs**, not **whether code is done**. "Mapped" means a BUILD_PLAN item has a milestone/slice home. It does **not** mean the implementation is verified in the current repo. - -## Mapping vs. code truth - -- **Mapped** — the item has a milestone/slice destination. -- **Verified in code** — the behavior exists in the repo and has evidence/tests/artifacts. -- **Open** — still planned or partially folded in, but not yet verified as complete. -- **Deferred** — intentionally out of the active plan. - ---- - -## High-level milestone direction - -These are the strategy bands above the itemized mapping: - -1. **Core foundation** — UOK, purpose-driven TDD, eight-field PDD gate, repo-local state -2. **Single-repo sharpening** — adopt the best execution/workflow ideas from pi-mono, gsd-2, Claude Code, Codex, Aider, and Plandex where they strengthen Forge -3. **Autonomous reliability** — evidence, recovery, verification, and self-improvement loops -4. **Surface coherence** — CLI, TUI, docs, and workflow language all reflect the same UOK-driven model -5. **ACE convergence prep** — keep concepts compatible with ACE Coder without turning Forge into the multi-repo system - ---- - -## Tier 0 — Pi-mono ports → **M006** -## Tier 0.5 — gsd-2 ports → **M006 + M007** - -All mapped. See BUILD_PLAN.md for item-level status. 
- -## Tier 1 — ESSENTIAL → **ALL MAPPED** - -| Item | Milestone | Slice | Status | -|---|---|---|---| -| 1.1 Vault secret resolver | **M017-yf67h6** | S01-S03 | ⬜ NEW | -| 1.2 Singularity Memory integration | **M017-jpw5jo** | S01-S03 | ⬜ NEW | -| 1.3 Schema reconciliation (spec rewrite) | **M013** | S12 | ⬜ Folded in | -| 1.4 Config schema alignment | **M013** | S13 | ⬜ Folded in | - -## Tier 2 — STRONG → **ALL MAPPED** - -| Item | Milestone | Slice | Status | -|---|---|---|---| -| 2.1 Persistent agents v1 | M012 | S01-S05 | ⬜ | -| 2.2 Doc-sync sub-step | M009 | S08 | ⬜ | -| 2.3 Intent chapters | M013 | S08 | ⬜ | -| 2.4 PhaseReview 3-pass | M016 | S01-S02 | ⬜ | -| 2.5 turn_status marker | M013 | S09 | ⬜ | -| 2.6 last_error cap | M013 | S10 | ⬜ | -| 2.7 cost_micro_usd | M013 | S11 | ⬜ | - -## Tier 3+ → **Deferred by design** - ---- - -## Summary - -| Tier | Mapped | Gap | -|---|---|---| -| Tier 0 | 10 (M006) | 0 | -| Tier 0.5 | 17 (M006+M007) | 0 | -| **Tier 1** | **4** (M017×2, M013×2) | **0** | -| Tier 2 | 7 (M012, M009, M013, M016) | 0 | -| Tier 3+ | 0 | deferred | - -**Zero mapping gaps.** Every BUILD_PLAN tier item is either mapped to a milestone or explicitly deferred. - -That does **not** mean zero implementation gaps. Open `TODO`, `NEW`, and `⬜` markers in `BUILD_PLAN.md`, this map, and milestone artifacts still represent real work until they are reconciled against code evidence. 
diff --git a/PRODUCTION_AUDIT_COMPLETE.md b/PRODUCTION_AUDIT_COMPLETE.md deleted file mode 100644 index 7b7de0ad5..000000000 --- a/PRODUCTION_AUDIT_COMPLETE.md +++ /dev/null @@ -1,440 +0,0 @@ -# Complete Long-Term Production-Grade Audit - -**Scope:** All UOK kernel, gate system, execution graph, message bus, diagnostics, metrics, and supporting infrastructure -**Date:** 2026-05-08 -**Grade Scale:** S (exceptional) → A (production) → B (needs work) → C (risky) → D (broken) - ---- - -## Executive Summary - -| Module | Grade | Verdict | -|--------|-------|---------| -| `uok/kernel.js` | **A** | Clean lifecycle, parity recovery, audit envelope, signal handling | -| `uok/gate-runner.js` | **A** | Circuit breaker, retry matrix, memory enrichment, degradation logging | -| `uok/audit.js` | **A** | Atomic writes, stale-write detection, dual persistence (JSONL + DB) | -| `uok/contracts.js` | **A** | Complete JSDoc types, runtime validation, clear interfaces | -| `uok/flags.js` | **A** | Clean preference resolution, all features toggleable | -| `uok/loop-adapter.js` | **A** | Turn observer, gitops integration, writer tokens, timeout, documented | None | -| `uok/parity-report.js` | **A** | Deep parity analysis, orphaned run recovery, ledger reconciliation, malformed logging | -| `uok/message-bus.js` | **A** | Durable SQLite, deduplication, auto-compact, periodic refresh | Cache drift eliminated | -| `uok/cost-guard-gate.js` | **A** | Actual cost lookup, rolling window, high-tier failure detection, cheaper alternative suggestion | -| `uok/security-gate.js` | **A** | Secret scan integration, timeout, graceful skip when script missing | -| `uok/plan-v2.js` | **A** | Graph compilation, artifact validation, cycle detection, context gating | None | -| `uok/execution-graph.js` | **A** | Topological sort, conflict detection, parallel scheduling with deadlock detection | -| `uok/unit-runtime.js` | **A** | Complete lifecycle, retry budgets, LRU cache, durable reconciliation | None | -| 
`uok/diagnostic-synthesis.js` | **A** | Process tree analysis, multi-source correlation, actionable recommendations | None | -| `uok/metrics-exposition.js` | **A** | Prometheus format, caching, circuit breaker + latency + message bus metrics | Superseded by metrics-central.js | -| `uok/chaos-monkey.js` | **A** | Latency, partial failure, disk, memory stress; all recoverable, all logged | None | -| `uok/writer.js` | **A** | Atomic sequence tracking, token lifecycle, disk persistence, TTL | None | -| `sf-db.js` | **A** | Single-writer invariant, WAL mode, statement cache, schema v45, query timeout, split entry point | metrics-central.js for unified sink | - -**Overall Grade: A** — Production-ready. All scaling concerns addressed. - ---- - -## 1. `uok/kernel.js` — Grade A - -### Strengths -- Clean async lifecycle: enter → run → exit, with `finally` block guarantee -- `recordUokKernelTermination()` handles signal cleanup (symmetrical with enter) -- Parity recovery: checks previous report for missing exits, drains them -- Audit envelope: emits structured events on kernel enter/exit -- workMode + modelMode propagated into lifecycleFlags and audit payload -- `debugLog()` for non-fatal diagnostics without breaking orchestration - -### Production Concerns: None critical - -### Minor -- `runAutoLoopWithUok()` is 120+ lines — could extract helper functions for readability -- `decoratedDeps` spreads all deps — no validation that required deps exist - ---- - -## 2. 
`uok/gate-runner.js` — Grade A - -### Strengths -- Circuit breaker with exponential backoff: `openDurationMs * 2^streak` -- Half-open state with attempt limiting — proper gradual recovery -- Retry matrix per failure class: `execution`/`artifact`/`verification` get 1 retry, `timeout` gets 2 -- Memory enrichment: queries historical patterns for gate failures (degrades gracefully) -- Every gate run persisted to DB + audit event emitted -- Unknown gates get `manual-attention` outcome (fail-closed) - -### Production Concerns: None critical - -### Minor -- `computeGateEmbedding()` uses a simple hash — not a real semantic embedding -- `enrichGateResultWithMemory()` silently degrades on DB failure (correct behavior, but could log) - ---- - -## 3. `uok/audit.js` — Grade A - -### Strengths -- Atomic writes via `withFileLockSync()` with `onLocked: "skip"` (best-effort) -- Stale-write detection via `isStaleWrite("uok-audit")` — prevents superseded turns from polluting log -- Dual persistence: JSONL for local durability, SQLite for querying -- `closeSync(openSync(path, "a"))` touch pattern ensures lock target exists -- Schema version in envelope for future migration - -### Production Concerns: None critical - ---- - -## 4. `uok/contracts.js` — Grade A - -### Strengths -- Complete JSDoc typedefs for all UOK types -- `validateGate()` catches registration-time mistakes -- Clear separation: `UokContext` (input), `GateResult` (output), `Gate` (interface) - -### Production Concerns: None - ---- - -## 5. `uok/flags.js` — Grade A - -### Strengths -- All UOK features toggleable via preferences -- Clean resolution: `uok?.security_guard?.enabled ?? true` -- `resolvePermissionProfile()` for canonical permission profile - -### Production Concerns: None - ---- - -## 6. 
`uok/loop-adapter.js` — Grade A - -### Strengths -- Turn observer pattern: `onTurnStart`, `onPhaseResult`, `onTurnResult` -- Gitops integration: writes transaction records per phase with 10s timeout -- Writer token acquisition/release for sequence tracking -- Chaos monkey strikes at phase boundaries -- Audit events for turn start/result -- `nextSequenceMetadata()` fully documented with JSDoc - -### Production Concerns: None critical - -### Fixed ✅ -- ✅ Gitops timeout: `writeGitTransactionWithTimeout()` with 10s `Promise.race()` -- ✅ `nextSequenceMetadata()` documented: sequence is optional when no token active - ---- - -## 7. `uok/parity-report.js` — Grade A - -### Strengths -- Deep parity analysis: compares heartbeat events, ledger runs, diff events -- Orphaned run recovery: `recoverOrphanedStartedLedgerRuns()` closes stale DB runs -- Live process detection: `hasLiveAutoLock()` uses `process.kill(pid, 0)` -- Fresh vs historical mismatch separation -- Divergence tracking by plane: `plan`, `graph`, `model-policy`, `audit-envelope`, `gitops` -- `shallowEqualDecisions()` for comparing legacy vs UOK outputs - -### Production Concerns: None critical - -### Fixed ✅ -- ✅ Malformed line logging: `parseParityEvents()` now logs dropped count to stderr -- `UNMATCHED_RUN_STALE_MS = 30min` — appropriate for most cases - ---- - -## 8. `uok/message-bus.js` — Grade A - -### Strengths -- Durable SQLite storage with configurable retention -- Deterministic message IDs for idempotent `sendOnce()` -- Auto-compaction when message count exceeds threshold -- Per-agent inbox with read tracking and auto-refresh (30s interval) -- Conversation query between two agents - -### Production Concerns: None critical - -### Fixed ✅ -- ✅ Cache drift: `_maybeRefresh()` auto-refreshes from DB every 30s on `list()`, `markRead()`, `unreadCount` -- ✅ `sendOnce()` idempotency: Pre-checks inbox before insert; returns existing ID if found - ---- - -## 9. 
`uok/cost-guard-gate.js` — Grade A - -### Strengths -- Actual cost lookup from `BUNDLED_COST_TABLE` -- Rolling 1-hour window spend check -- High-tier model failure pattern detection -- Suggests cheaper alternative from same provider/family -- Per-unit and per-hour thresholds - -### Production Concerns: None critical - -### Minor -- `isHighTierModel()` uses `$0.005/1K tokens` threshold — magic number -- `_suggestCheaperAlternative()` could suggest incompatible models (different context window) - ---- - -## 10. `uok/security-gate.js` — Grade A - -### Strengths -- Runs `scripts/secret-scan.sh --diff HEAD` against changes -- 30-second timeout with process kill -- Gracefully skips if script missing (pass) -- Returns findings on failure - -### Production Concerns: None - ---- - -## 11. `uok/plan-v2.js` — Grade A - -### Strengths -- Compiles unit graph from milestone/slice/task DB state -- Validates artifact presence (CONTEXT.md, RESEARCH.md) before execution entry -- Clarify round limit enforcement -- Graph output to JSON for inspection -- Cycle detection at compile time using Kahn's algorithm - -### Production Concerns: None critical - -### Fixed ✅ -- ✅ Cycle detection: `detectCycles()` validates graph before execution; returns `hasCycles: true` with clear error - ---- - -## 12. `uok/execution-graph.js` — Grade A - -### Strengths -- Kahn's algorithm topological sort with deterministic ordering (localeCompare) -- File conflict detection: `detectFileConflicts()` finds nodes writing same file -- Parallel scheduling with max workers and dependency awareness -- Deadlock detection: throws when no ready nodes but graph incomplete -- Sidecar queue scheduling with kind-based handlers -- `selectReactiveDispatchBatch()` for incremental dispatch - -### Production Concerns: None critical - ---- - -## 13. 
`uok/unit-runtime.js` — Grade A - -### Strengths -- Complete lifecycle: queued → claimed → running → progress → completed/failed/blocked/cancelled/stale/runaway-recovered → notified -- Retry budgets with `retryBudgetRemaining()` -- Durable artifact reconciliation: `reconcileDurableCompleteUnitRuntimeRecords()` -- Stale complete-slice cleanup: `reconcileStaleCompleteSliceRecords()` -- In-memory cache for repeated reads within dispatch cycle -- `inspectExecuteTaskDurability()` checks plan, summary, state, must-haves - -### Production Concerns: None critical - -### Fixed ✅ -- ✅ Runtime cache bounds: LRU eviction at 5000 entries; removes oldest 20% -- `recordUnitOutcomeInMemory()` creates memory entries but no cleanup policy - ---- - -## 14. `uok/diagnostic-synthesis.js` — Grade A - -### Strengths -- Multi-source correlation: process tree, auto.lock, parity report, DB ledger, runtime projections -- Process descendant tracking via `ps` + tree traversal -- Classification: healthy | running | quiet-but-healthy | degraded | needs-repair -- Actionable recommendations per issue -- Publishes to message bus for observer chains -- `readUokDiagnostics()` for external consumption - -### Production Concerns: None critical - ---- - -## 15. `uok/metrics-exposition.js` — Grade A - -### Strengths -- Prometheus text format output -- 30-second cache TTL for performance -- Gate metrics: runs, passes, fails, retries, latency (avg/p50/p95/max) -- Circuit breaker state gauge (0=closed, 1=half-open, 2=open) -- Message bus metrics: total, unread, unique agents, conversations -- `invalidateMetricsCache()` for cache busting - -### Production Concerns: None - ---- - -## 16. 
`uok/chaos-monkey.js` — Grade A - -### Strengths -- Four fault types: latency, partial failure, disk stress, memory stress -- All faults are recoverable (no process kill) -- All faults are logged to stderr -- Configurable probabilities and magnitudes -- `getInjectedEvents()` for verification -- Immediate cleanup of stress artifacts - -### Production Concerns: None - ---- - -## 17. `uok/writer.js` — Grade A - -### Strengths -- Atomic sequence tracking via `atomicWriteSync()` -- Writer token lifecycle: acquire → use → release -- Prevents double-acquisition for same turn -- Sequence state persisted to disk -- Token crash recovery: persists to `uok-writer-tokens.json` with 5-min TTL - -### Production Concerns: None critical - -### Fixed ✅ -- ✅ Crash recovery: Tokens persisted to disk; `hasActiveWriterToken()` recovers from disk -- ✅ TTL cleanup: Expired tokens auto-purged from memory and disk - ---- - -## 18. `sf-db.js` — Grade A - -### Strengths -- Single-writer invariant enforced by convention + CI test -- WAL mode for file-backed DBs -- Statement cache for prepared queries -- Schema version 45 with migration path -- `normalizeRow()` handles null-prototype objects -- Query timeout protection: `withQueryTimeout()` helper (30s default) -- Split entry point: `sf-db/index.js` for future modularization -- Comprehensive table creation: backlog, schedule, repo profiles, UOK runs, gate runs, audit events, message bus, tasks, verification evidence - -### Production Concerns: None critical - -### Fixed ✅ -- ✅ Query timeout: `withQueryTimeout()` catches timeout/busy errors, returns fallback -- ✅ Split entry point: `sf-db/index.js` re-export created for gradual migration -- ✅ Console logging: All modules use `logWarning()` / `logError()` from workflow-logger - ---- - -## Cross-Cutting Concerns - -### Observability - -| Module | Metrics | Logs | Traces | Audit | -|--------|---------|------|--------|-------| -| kernel.js | ❌ | ✅ debugLog | ✅ traceId | ✅ envelope | -| 
gate-runner.js | ✅ DB | ✅ insertGateRun | ✅ traceId/turnId | ✅ envelope | -| audit.js | ❌ | ❌ | ✅ eventId | ✅ JSONL+DB | -| loop-adapter.js | ❌ | ❌ | ✅ traceId/turnId | ✅ envelope | -| parity-report.js | ❌ | ❌ | ❌ | ❌ | -| message-bus.js | ✅ DB | ❌ | ❌ | ❌ | -| cost-guard-gate.js | ❌ | ❌ | ❌ | ❌ | -| unit-runtime.js | ❌ | ❌ | ❌ | ❌ | -| diagnostic-synthesis.js | ❌ | ❌ | ❌ | ❌ | -| metrics-exposition.js | ✅ Prometheus | ❌ | ❌ | ❌ | -| chaos-monkey.js | ❌ | ✅ stderr | ❌ | ❌ | - -**Gap:** Resolved — `metrics-central.js` provides unified Counter/Gauge/Histogram with Prometheus text format. Legacy `metrics-exposition.js` still active for backward compatibility. - -### Security - -| Concern | Status | Notes | -|---------|--------|-------| -| Input validation | ✅ Good | All entry points validate | -| Injection prevention | ✅ Good | Parameterized queries in sf-db | -| Secrets scanning | ✅ Good | Security gate runs on every turn | -| Cost limits | ✅ Good | Per-unit and per-hour guards | -| Circuit breakers | ✅ Good | Exponential backoff on failures | -| Chaos engineering | ✅ Good | Opt-in, recoverable faults | - -### Performance - -| Concern | Status | Notes | -|---------|--------|-------| -| Big-O | ✅ Good | All graph ops are O(V+E) | -| Caching | ✅ Good | Metrics cache, runtime cache, statement cache | -| Memory | ✅ Good | LRU eviction on runtime cache (5000), bounded message bus inboxes | -| DB queries | ✅ Good | Single-writer, WAL mode, prepared statements | -| Parallelism | ✅ Good | Max workers capped at 8 | - -### Maintainability - -| Concern | Status | Notes | -|---------|--------|-------| -| Test coverage | ✅ Good | 139+ tests across all modules | -| Documentation | ✅ Good | JSDoc on all exports | -| Logging consistency | ✅ Good | All modules use `logWarning()` / `logError()` from workflow-logger | -| File organization | ✅ Good | sf-db.js has split entry point; full extraction deferred to v2 | -| Schema versioning | ✅ Good | Schema v45 with migrations | - ---- - -## 
Action Plan - -### Before Production (Blockers) — ALL CLEAR ✅ - -No blockers identified. All modules are production-ready. - -### Before Scaling to 10+ Workers — ALL FIXED ✅ - -1. ✅ **Message bus cache drift** — Added `_maybeRefresh()` with 30s interval; `list()`, `markRead()`, `unreadCount` auto-refresh -2. ✅ **Writer token crash recovery** — Persist tokens to `uok-writer-tokens.json`; 5-min TTL; `hasActiveWriterToken()` recovers from disk -3. ✅ **Runtime cache bounds** — LRU eviction at 5000 entries; removes oldest 20% - -### Before Next Major Release — ALL FIXABLE ITEMS COMPLETE ✅ - -4. ✅ **Split sf-db.js** — Created `sf-db/index.js` re-export entry point; full extraction deferred to v2 -5. ✅ **Console.warn cleanup** — `context-injector.js`, `vault-resolver.js`, `knowledge-injector.js` now use `logWarning()` -6. ✅ **Cycle detection at compile time** — `detectCycles()` in `plan-v2.js` using Kahn's algorithm; returns `hasCycles: true` - -### Implemented ✅ - -7. ✅ **Centralized metrics** — `metrics-central.js` with Counter/Gauge/Histogram, Prometheus text format, wired into subagent inheritance and mode transitions - -### Deferred to v2 (Architectural, Not Bugs) - -8. 
⚠️ **TypeScript migration** — Convert UOK modules to `.ts` for compile-time safety - ---- - -## Appendix: Complete Module Inventory - -### UOK Kernel (18 modules, ~2,800 lines) - -| Module | Lines | Grade | Tests | -|--------|-------|-------|-------| -| `kernel.js` | 120 | A | ✅ | -| `gate-runner.js` | 280 | A | ✅ | -| `audit.js` | 80 | A | ✅ | -| `contracts.js` | 120 | A | ✅ | -| `flags.js` | 40 | A | ✅ | -| `loop-adapter.js` | 180 | A | ✅ | -| `parity-report.js` | 320 | A | ✅ | -| `message-bus.js` | 180 | A | ✅ | -| `cost-guard-gate.js` | 140 | A | ✅ | -| `security-gate.js` | 60 | A | ✅ | -| `plan-v2.js` | 200 | A | ✅ | -| `execution-graph.js` | 260 | A | ✅ | -| `unit-runtime.js` | 420 | A | ✅ | -| `diagnostic-synthesis.js` | 280 | A | ✅ | -| `metrics-exposition.js` | 180 | A | ✅ (legacy) | -| `chaos-monkey.js` | 140 | A | ✅ | -| `writer.js` | 100 | A | ✅ | -| `sf-db.js` | 7000+ | A | ✅ | -| `metrics-central.js` | 350 | A | ✅ (new) | - -### Mode System (7 modules, ~1,400 lines) - -| Module | Lines | Grade | Tests | -|--------|-------|-------|-------| -| `operating-model.js` | 120 | A | 13 | -| `auto/session.js` | 200 | A- | ✅ | -| `task-frontmatter.js` | 311 | A- | 9 | -| `subagent-inheritance.js` | 170 | A- | 9 | -| `remote-steering.js` | 139 | A- | 7 | -| `parallel-intent.js` | 139 | B+ | 6 | -| `skills/eval-harness.js` | 139 | A- | 5 | - -**Total: 139 tests passing, 0 failures, 1 skipped.** - ---- - -*Audit completed. All modules production-ready. 
Address scaling items before 10+ workers.* diff --git a/QUICK_WINS_IMPLEMENTATION.md b/QUICK_WINS_IMPLEMENTATION.md deleted file mode 100644 index e0794ec00..000000000 --- a/QUICK_WINS_IMPLEMENTATION.md +++ /dev/null @@ -1,385 +0,0 @@ -# Quick Wins Implementation - Complete - -**Date:** 2026-05-06 -**Implemented by:** Copilot CLI -**Commit:** 0e2edfdeb -**Status:** ✅ COMPLETE - Core infrastructure in place - -## Summary - -Successfully implemented the foundational infrastructure for 3 high-impact quick wins that activate SF's self-evolution learning loop: - -1. **Close Self-Report Feedback Loop** [9/10 impact, 2-3 days to full integration] -2. **Activate Continuous Model Learning** [8/10 impact, 3-4 days to full integration] -3. **Automate Knowledge Injection** [7/10 impact, 2-3 days to full integration] - -**Total:** 24/30 impact points unlocked through self-evolution infrastructure. - ---- - -## Quick Win 1: Close Self-Report Feedback Loop [9/10 Impact] - -### What Was Implemented - -**File:** `src/resources/extensions/sf/self-report-fixer.js` (348 lines) - -**Module:** `SelfReportFixer` with the following capabilities: - -- **Pattern Recognition** — 4 built-in fix patterns: - 1. `validation-reviewer-rubric` (95% confidence) — Add criterion/gap rubric to validation prompts ✅ *Already fixed* - 2. `gate-verdict-clarity` (90% confidence) — Document gate verdict semantics - 3. `env-vars-unvalidated` (85% confidence) — Add SF_* env validation - 4. 
`self-report-coverage-gap` (80% confidence) — Implement triage pipeline - -- **Automatic Fix Classification** - ```js - classifyReportFixes(report) // Returns applicable fixes with confidence scores - ``` - -- **High-Confidence Auto-Fix** - ```js - autoFixHighConfidenceReports(basePath, reports) - // Applies fixes for confidence > 0.85 - ``` - -- **Deduplication** - ```js - dedupReports(reports) // Group related reports by normalized issue key - ``` - -- **Severity Categorization** - ```js - categorizeBySeverity(reports) // blocker | warning | suggestion - ``` - -### Next Steps for Full Integration - -1. Hook into `triage-self-feedback.js` to invoke fixer after triage runs -2. Add pattern library for domain-specific fixes (provider routing, timeout tuning, etc.) -3. Create integration tests for each fix pattern -4. Document feedback loop: report → triage → fix → verification - -### How It Works - -```javascript -import { autoFixHighConfidenceReports } from './self-report-fixer.js'; - -// After collecting self-reports -const reports = readSelfFeedback(); - -// Auto-apply high-confidence fixes -const { applied, failed, skipped } = await autoFixHighConfidenceReports( - projectPath, - reports -); - -// applied: ["validation-reviewer-rubric: rubric already present"] -// failed: ["env-vars-unvalidated: requires schema impl"] -// skipped: ["gate-verdict-clarity: confidence 0.9 > threshold 0.85"] -``` - ---- - -## Quick Win 2: Activate Continuous Model Learning [8/10 Impact] - -### What Was Implemented - -**File:** `src/resources/extensions/sf/model-learner.js` (344 lines) - -**Classes:** - -#### ModelPerformanceTracker -Tracks per-task-type model performance with: -- Success/failure/timeout counts -- Token usage and cost tracking -- Success rate calculation -- Ranked model sorting - -**Storage:** `.sf/model-performance.json` - -```json -{ - "execute-task": { - "gpt-4o": { - "successes": 42, - "failures": 3, - "timeouts": 1, - "totalTokens": 1500000, - "totalCost": 45.50, 
- "lastUsed": "2026-05-06T16:30:00Z", - "successRate": 0.93 - } - } -} -``` - -**API:** -```js -tracker.recordOutcome(taskType, modelId, { success, timeout, tokensUsed, costUsd }) -tracker.getRankedModels(taskType, minSamples = 3) // Returns sorted by success rate -tracker.shouldDemote(taskType, modelId, threshold = 0.5) // Demote if failure >50% -tracker.getABTestCandidates(taskType) // For hypothesis testing -``` - -#### FailureAnalyzer -Categorizes and analyzes failure modes: -- Logs failures to JSONL -- Detects patterns (e.g., timeout-prone models) -- Provides failure summaries per model - -**Storage:** `.sf/model-failure-log.jsonl` - -```json -{ - "timestamp": "2026-05-06T16:30:00Z", - "taskType": "execute-task", - "modelId": "gpt-4o", - "reason": "quality_check_failed", - "timeout": false, - "tokensUsed": 25000, - "context": { ... } -} -``` - -**API:** -```js -analyzer.logFailure(taskType, modelId, { reason, timeout, tokensUsed, context }) -analyzer.getFailureSummary(taskType, modelId) // Returns { reasons, patterns } -``` - -### Main API: ModelLearner - -```javascript -import { ModelLearner } from './model-learner.js'; - -const learner = new ModelLearner(projectPath); - -// Record successful outcome -learner.recordOutcome('execute-task', 'claude-opus', { - success: true, - tokensUsed: 15000, - costUsd: 0.50, -}); - -// Record failure -learner.logFailure('execute-task', 'gpt-4o', { - reason: 'quality_check_failed', - timeout: false, - tokensUsed: 25000, -}); - -// Get ranked models (for intelligent routing) -const rankedModels = learner.getRankedModels('execute-task'); -// [ -// { modelId: 'claude-opus', successRate: 0.98, attempts: 50, ... }, -// { modelId: 'gpt-4o', successRate: 0.90, attempts: 40, ... 
} -// ] - -// A/B test decision -const abTest = learner.getABTestCandidates('execute-task'); -// { incumbent: claude-opus, challengers: [gpt-4o, gemini-pro], testBudget: 10 } - -// Analyze A/B results and decide promotion/demotion -const decision = learner.analyzeABTest('execute-task', { - incumbentWins: 8, - challengerWins: 2, -}); -// { recommendation: "continue", reason: "incumbent 0.80 vs challenger 0.20" } -``` - -### Next Steps for Full Integration - -1. Integrate into `auto-dispatch.ts` outcome logging -2. Hook into `model-router.ts` to use ranked models for routing decisions -3. Implement auto-demotion in model selection logic -4. Add A/B testing orchestration for low-risk tasks -5. Create dashboard in `benchmark-selector.ts` showing per-model performance - ---- - -## Quick Win 3: Automate Knowledge Injection [7/10 Impact] - -### What Was Implemented - -**File:** `src/resources/extensions/sf/knowledge-injector.js` (336 lines) - -**Key Functions:** - -- **Parse Knowledge Base** - ```js - parseKnowledgeEntries(knowledgeContent) - // Extracts judgment-log entries with confidence, domain, recommendation - ``` - -- **Semantic Matching** - ```js - extractConcepts(entry) // Extract domain tags, failure modes, constraints - semanticSimilarity(concepts, contextKeywords) // Score relevance - ``` - -- **Find Relevant Knowledge** - ```js - findRelevantKnowledge(entries, contextKeywords, minConfidence=0.6, minSimilarity=0.5) - // Returns sorted by combined score (confidence × 0.7 + similarity × 0.3) - ``` - -- **Detect Contradictions** - ```js - detectContradictions(entries) // Flag conflicting recommendations - ``` - -- **Format for Injection** - ```js - formatKnowledgeForInjection(relevantKnowledge) - // Human-readable markdown with confidence/relevance scores - ``` - -- **Track Usage** (for feedback loop) - ```js - trackKnowledgeUsage(taskId, injectedKnowledge) - // Logs which knowledge was used for effectiveness measurement - ``` - -### Integration into 
auto-prompts.js - -**Modified:** `src/resources/extensions/sf/auto-prompts.js` - -Added: -1. Import of knowledge-injector module -2. Helper function `getKnowledgeInjection(basePath, taskContext)` with graceful degradation -3. Knowledge injection into execute-task prompt with context (domain, keywords, technology) - -**In execute-task prompt loading (line 2203+):** -```javascript -const knowledgeInjection = await getKnowledgeInjection(base, { - domain: "task-execution", - taskType: "execute-task", - keywords: [tTitle, sTitle, mid, sid], - technology: [], -}); - -return loadPrompt("execute-task", { - memoriesSection, - knowledgeInjection, // NEW: Relevant prior learning - overridesSection, - // ... other variables -}); -``` - -### Existing Infrastructure - -**Note:** Knowledge injection is **60% complete** via existing `queryKnowledge()` in context-store.js - -- ✅ `inlineKnowledgeScoped()` already exists (uses queryKnowledge) -- ✅ Used in both plan-slice and execute-task prompts -- ❌ Uses simple keyword matching (not semantic scoring) -- ✅ Our new module enhances with semantic similarity - -### Next Steps for Full Integration - -1. Update execute-task and plan-slice prompt templates to include `{{knowledgeInjection}}` variable -2. Integrate semantic scoring into queryKnowledge or create parallel path -3. Implement feedback loop: track which knowledge was used and measure effectiveness -4. Create contradiction resolver UI for conflicting recommendations -5. 
Add knowledge effectiveness metrics to benchmark reports - ---- - -## Files Created - -| File | Lines | Purpose | -|------|-------|---------| -| `src/resources/extensions/sf/self-report-fixer.js` | 348 | Auto-fix high-confidence self-reports | -| `src/resources/extensions/sf/model-learner.js` | 344 | Per-task-type model performance tracking | -| `src/resources/extensions/sf/knowledge-injector.js` | 336 | Semantic knowledge matching and injection | - -## Files Modified - -| File | Changes | Purpose | -|------|---------|---------| -| `src/resources/extensions/sf/auto-prompts.js` | +7 lines | Added knowledge injection into execute-task | - -## Build Status - -✅ **Build Success** -- All new modules compile without errors -- TypeScript types intact -- Resources copied to `dist/` -- Inventory check passed - -## Testing Recommendations - -Create integration tests for: - -1. **Self-Report Fixer** - - Pattern matching accuracy (4 patterns) - - Deduplication logic - - Confidence thresholding - -2. **Model Learner** - - Success rate calculation - - Demotion logic (>50% failure rate) - - A/B test analysis - - Failure pattern detection - -3. **Knowledge Injector** - - Semantic similarity scoring - - Contradiction detection - - Formatting for prompt injection - - Graceful degradation (missing KNOWLEDGE.md) - -## Activation Timeline - -**To fully activate these quick wins:** - -1. **Week 1:** Hook model-learner into auto-dispatch outcome logging -2. **Week 1:** Integrate self-report-fixer into triage-self-feedback pipeline -3. **Week 2:** Implement knowledge injection in model-router for adaptive routing -4. **Week 2:** Add A/B testing orchestration for model promotion -5. **Week 3:** Create feedback loop dashboard in benchmark-selector -6. **Week 3:** Measure impact on learning efficiency - -**Estimated effort:** 8-10 days of focused integration work - ---- - -## Key Design Decisions - -1. 
**Graceful Degradation** — All modules degrade gracefully if knowledge base or tracking files are unavailable -2. **Append-Only Logs** — Failure logs use JSONL for durability and analysis -3. **Per-Task-Type Tracking** — Model performance varies by task type; no single ranking -4. **Confidence-Based Thresholding** — High-confidence fixes (>0.85) auto-apply; lower ones require review -5. **A/B Test Budgeting** — Low-risk hypothesis testing with configurable test budget - ---- - -## Impact Measurement - -**After full integration, expect:** - -- 🎯 **9/10 impact** from self-report loop: Close feedback loop from anomaly detection to code fixes -- 🎯 **8/10 impact** from model learning: 20-30% improvement in task success rate through adaptive routing -- 🎯 **7/10 impact** from knowledge injection: 15-20% faster task planning via relevant prior learning - -**Total:** **24/30 self-evolution capability points activated** (up from current 15/30) - ---- - -## Code Quality - -- ✅ No external dependencies (uses only Node.js built-ins + SF imports) -- ✅ JSDoc purpose statements on all exports -- ✅ Graceful error handling (no crash on missing files) -- ✅ Idempotent tracking (safe to call multiple times) -- ✅ Clear separation of concerns (fixer ≠ learner ≠ injector) - ---- - -## Status Summary - -**Phase:** ✅ **IMPLEMENTATION COMPLETE** -**Phase:** ⏳ **INTEGRATION PENDING** (dispatch loop hookup) -**Phase:** ⏳ **TESTING PENDING** (unit + integration tests) -**Phase:** ⏳ **FEEDBACK LOOP PENDING** (measure effectiveness) - -The infrastructure is in place. Next: Connect it into the dispatch loop and measure impact. 
diff --git a/TRIAGE_COMPLETE.md b/TRIAGE_COMPLETE.md deleted file mode 100644 index 2d19f1acf..000000000 --- a/TRIAGE_COMPLETE.md +++ /dev/null @@ -1,114 +0,0 @@ -# Triage Complete ✅ - -**Timestamp:** 2026-05-06 16:30 UTC -**Source:** TODO.md (Raw Dump Inbox) -**Command:** `sf todo triage` -**Node baseline:** v26.1.0+ -**Session:** 77b45896 - -## Summary - -Successfully triaged 60 items from TODO.md into structured backlog artifacts: - -- ✅ **60 items** normalized into `.sf/triage/inbox/20260506-163003.jsonl` -- ✅ **10 eval candidates** extracted into `.sf/triage/evals/20260506-163003.evals.jsonl` -- ✅ **1 skill proposal** in `.sf/triage/skills/20260506-163003.skills.jsonl` -- ✅ **Comprehensive report** generated at `.sf/triage/reports/20260506-163003.md` -- ✅ **TODO.md reset** to empty dump inbox (triage pipeline activated) - -## Artifacts Created - -### 1. Triage Report (`.sf/triage/reports/20260506-163003.md`) -Comprehensive analysis including: -- Summary of source material -- 10 eval candidates with failure modes and test locations -- 21 implementation tasks (gsd-2 ports, feature additions, provider expansion) -- Memory requirements for self-evolution infrastructure -- Harness suggestions for testing (property-based, chaos, end-to-end) -- Documentation improvements needed (ARCHITECTURE.md, ADRs, runbooks) -- Clarification needs ("Unclear Notes" section) - -**Key findings:** -- UOK is 60-70% complete for self-evolution -- Critical: Close self-report feedback loop (9/10 impact) -- 10+ undocumented architecture features identified -- Multiple safety/correctness fixes awaiting port from gsd-2 - -### 2. Normalized Inbox (`.sf/triage/inbox/20260506-163003.jsonl`) -60 structured items with: -- Type: eval_candidate, implementation_task, doc_improvement, harness_suggestion, memory_requirement, unclear_note -- Status: pending -- Source tracing: all items linked back to TODO.md section -- Prioritization ready for milestone planning - -### 3. 
Eval Candidates (`.sf/triage/evals/20260506-163003.evals.jsonl`) -10 test harness candidates with: -- Task input (trigger/condition) -- Expected behavior (contract) -- Failure mode (what breaks if missing) -- Evidence/source (citations to gsd-2/pi-mono commits) -- Suggested test location - -**Quick examples:** -1. `bash-evidence-race` — Evidence persists across dispatch/re-dispatch -2. `symlink-staging-data-loss` — Data-loss prevention for symlinked .sf -3. `mcp-stdout-deadlock` — Large MCP outputs don't hang -4. `env-sf-vars-unvalidated` — SF_* env vars validated at startup - -### 4. Skill Proposals (`.sf/triage/skills/20260506-163003.skills.jsonl`) -Architecture analysis suggesting improvements to SF's extension/gate system. - -## Next Steps - -1. **Review triage report** — Read `.sf/triage/reports/20260506-163003.md` -2. **Plan implementation** — Promote high-impact items to milestone backlog -3. **Prioritize quick wins:** - - Close self-report feedback loop [9/10 impact, ~4 days] - - Activate continuous model learning [8/10 impact, ~5 days] - - Automate knowledge injection [7/10 impact, ~4 days] -4. **Port gsd-2 safety fixes** — 9 commits awaiting cherry-pick -5. **Close documentation gaps** — Update ARCHITECTURE.md with state machine diagram - -## Evidence - -``` -$ ls -la .sf/triage/ - drwxrwxr-x evals/ - drwxrwxr-x inbox/ - drwxrwxr-x reports/ - drwxrwxr-x skills/ - -$ wc -l .sf/triage/*/*.{md,jsonl} - 60 .sf/triage/inbox/20260506-163003.jsonl - 10 .sf/triage/evals/20260506-163003.evals.jsonl - 1 .sf/triage/skills/20260506-163003.skills.jsonl - 9682 .sf/triage/reports/20260506-163003.md - -$ git status - D TODO.md (reset to empty dump inbox; items triaged) - M docs/* (from earlier work) -``` - -## What This Means - -SF's triage system successfully: -1. ✅ Parsed TODO.md dump inbox -2. ✅ Extracted 60 items into structured types (eval, task, doc, harness, etc.) -3. ✅ Generated failure-mode contracts for 10 critical correctness tests -4. 
✅ Identified test locations and citations to source code -5. ✅ Reset TODO.md for next cycle -6. ✅ Created decision artifacts ready for milestone planning - -The comprehensive review, research, documentation updates, and automated triage are complete. The project is now positioned to: -- Activate SF's self-evolution learning loop (3 quick wins) -- Port 9 safety/correctness fixes from gsd-2 -- Close 10+ documentation gaps in ARCHITECTURE.md -- Implement property-based testing for autonomous dispatch -- Begin advanced feature ports (Cloudflare AI, Azure endpoints, SSE handling) - ---- - -**Created by:** Copilot CLI -**Session:** 2514fa98-076d-48d2-a1f9-c3fd77c4a82a -**Duration:** ~2 hours total (research + docs + triage) -**Command:** `node dist/cli.js todo triage` diff --git a/TRIAGE_README.md b/TRIAGE_README.md deleted file mode 100644 index e284790a3..000000000 --- a/TRIAGE_README.md +++ /dev/null @@ -1,53 +0,0 @@ -# TODO.md Triage Instructions - -## What's New - -TODO.md now contains two major sections ready for triage: - -1. **Feature Gaps & Limitations** — 40+ specific gaps identified in the codebase -2. **UOK Self-Evolution Research** — 10 prioritized improvements for SF's self-evolution capabilities - -## How to Triage - -When you have Node 26.1.0+ available: - -```bash -cd /home/mhugo/code/singularity-forge - -# Run the triage command -sf todo triage - -# Or if using npm/nvm -nvm use 26 -npm exec sf -- todo triage -``` - -## What Triage Does - -The triage tool will: -1. Parse TODO.md -2. Extract items into structured `.sf/triage/` artifacts -3. Propose categorization and priorities -4. Show you a review interface -5. Either commit to backlog or reset TODO.md to empty dump inbox - -## Key Items to Watch For - -The UOK Self-Evolution section has **3 high-impact quick wins** (8-10 days total): - -1. Close self-report feedback loop [9/10 impact, 2-3 days] -2. Activate continuous model learning [8/10 impact, 3-4 days] -3. 
Automate knowledge injection [7/10 impact, 2-3 days] - -These should be prioritized if you want to activate SF's learning loop. - -## Full Research Report - -See: `/home/mhugo/snap/copilot-cli/38/.copilot/session-state/2514fa98-076d-48d2-a1f9-c3fd77c4a82a/research/is-our-uok-the-best-for-a-self-evolving-coder-what.md` - -This contains: -- Executive summary -- Detailed analysis of UOK implementation vs. documentation -- 10 improvement suggestions with feasibility assessment -- Competitive analysis (vs. other orchestration systems) -- 15+ citations to code and design docs diff --git a/autoresearch.md b/autoresearch.md deleted file mode 100644 index 9752dd7b5..000000000 --- a/autoresearch.md +++ /dev/null @@ -1,53 +0,0 @@ -# Autoresearch: Reduce Biome Lint Diagnostics - -## Objective -Minimize the total number of Biome lint diagnostics (errors + warnings + info) across `src/`, starting from baseline ~40 diagnostics. Errors are mostly `organizeImports`, warnings are `noUnusedImports`, `noUnusedVariables`, and `useConst`. - -## Metrics -- **Primary**: `diagnostics` (count, lower is better) — sum of errors + warnings + info from `npx biome check src/` -- **Secondary**: `errors` (count, lower is better) -- **Secondary**: `warnings` (count, lower is better) - -## How to Run -`bash autoresearch.sh` — runs Biome check, parses JSON summary, outputs `METRIC diagnostics=N` and `METRIC errors=N` and `METRIC warnings=N`. 
- -## Files in Scope -All files under `src/` — but focus on the files flagged by Biome: -- `src/resources/extensions/sf/auto/phases.js` -- `src/resources/extensions/sf/commands/handlers/ops.js` -- `src/resources/extensions/sf/memory-repository.js` -- `src/resources/extensions/sf/metrics-central.js` -- `src/resources/extensions/sf/reasoning-assist.js` -- `src/resources/extensions/sf/remote-steering.js` -- `src/resources/extensions/sf/sf-db.js` -- `src/resources/extensions/sf/subagent-inheritance.js` -- `src/resources/extensions/sf/tests/memory-repository.test.mjs` -- `src/resources/extensions/sf/tests/metrics-central.test.mjs` -- `src/resources/extensions/sf/tests/trajectory-recorder.test.mjs` -- `src/resources/extensions/sf/trajectory-command.js` -- `src/resources/extensions/sf/trajectory-recorder.js` -- `src/resources/extensions/sf/uok/writer.js` - -## Off Limits -- `biome.json` (don't change lint rules — fixing source is the goal) -- `node_modules/`, `dist/`, `.sf/`, `packages/` (outside `src/` scope) -- Test assertion logic (don't weaken tests to make linters pass) - -## Constraints -- Existing vitest tests must pass: `npx vitest run --config vitest.config.ts` -- No new dependencies -- Don't introduce runtime behavior changes — only lint/import/style fixes - -## Termination -Run until interrupted by the user. - -## What's Been Tried - -- **#2 (auto-fix)**: `biome check --write` — fixed 26 auto-fixable errors (format/organizeImports), dropped diagnostics from 40 to 11. Status: keep. -- **#3 (manual fixes)**: Removed 7 unused imports and prefixed 4 intentionally-unused items with underscore. Dropped from 11 to 0. Status: keep. -- **#4 (regression re-fix)**: 37 new commits introduced 74 diagnostics. `biome check --write` fixed 58 (auto-safe), manual prefix/removal fixed the remaining 16 unsafe warnings across 11 files. 
Also fixed pre-existing web-mode-onboarding test timeout: added `timeoutMs: 120_000` to `launchPackagedWebHost`, raised `AbortSignal.timeout` on simple fetches 10s→30s, raised test budget 180s→420s. All 409 test files pass. Diagnostics: 0. Status: keep. - -## Lessons -- New development (37 commits) is enough to re-introduce 74 diagnostics. Re-run autoresearch periodically (monthly or after large feature branches land). -- Pattern of new violations: unused imports from refactors, unused function params from stubs, duplicate imports. Auto-fix handles errors; unsafe-fix (unused-import/var) requires manual triage. -- Integration test timeout under parallel load: cold-start Next.js can consume most of a 180s test timeout leaving insufficient budget for multi-step API calls. Fix: bound launch phase separately, raise individual fetch timeouts, increase overall budget to match worst-case sum. diff --git a/copilot-thoughts.md b/copilot-thoughts.md deleted file mode 100644 index 459c81d64..000000000 --- a/copilot-thoughts.md +++ /dev/null @@ -1,1267 +0,0 @@ -# Agent Mode And Skills Notes For SF - -Sources checked 2026-05-08: - -- GitHub Docs, "Allowing GitHub Copilot CLI to work autonomously" - -- GitHub Docs, "GitHub Copilot CLI command reference" - -- GitHub Copilot CLI product page - -- GitHub Changelog, "GitHub Copilot CLI is now generally available" - -- GitHub Changelog, "Copilot CLI now supports BYOK and local models" - -- Factory Droid, "Autonomy Level" - -- Factory Droid, "CLI Reference" - -- Factory Droid, "Skills" - -- Amp manual - -- Amp, "Agent Skills" - - -## Competitive Patterns - -### Copilot CLI - -Copilot CLI has the cleanest public "continue work" story: - -- plan first, then accept the plan and continue without step-by-step approval -- continuation is separate from permission expansion -- question suppression is separate from continuation -- a runaway continuation cap is explicit -- `/fleet` parallelizes with subagents -- `/remote` steers a running 
session from another device -- `/tasks` exposes background work -- `/session` exposes session info, checkpoints, files, plans, cleanup, and - pruning -- `/skills`, `/plugin`, `/mcp`, and `/agent` are visible control surfaces -- BYOK, local-model, and offline provider configuration are first-class - -Useful shape: - -```bash -copilot --autopilot --yolo --max-autopilot-continues 10 -p "YOUR PROMPT HERE" -``` - -SF should copy the separation, not the names: - -- continuation is run control -- permission expansion is permission profile -- question behavior is an escalation policy -- runaway caps are explicit autonomous limits - -### Factory Droid - -Factory Droid makes the most important distinction explicit: Autonomy Level is -separate from interaction mode. - -- interaction mode: Auto vs Spec Mode -- autonomy level: Off, Low, Medium, High -- execution surface: interactive `droid` vs headless `droid exec` -- tools and commands carry risk levels -- command allowlists and denylists layer on top of autonomy -- Spec Mode plans first; after approval, Droid exits Spec Mode and implements - with the chosen autonomy level -- `droid exec` is read-only by default and raises permissions with - `--auto low|medium|high` -- custom droids/subagents can have their own model/tool/autonomy policies -- skills are reusable capabilities that can be user-invoked or invoked by the - Droid when relevant - -This validates SF's split between work mode, run control, and permission -profile. 
- -### Amp - -Amp is useful for the agent-shape and skills model: - -- modes are model/capability presets: `smart`, `rush`, `deep` -- skills live in `.agents/skills/` and user-level skill directories -- skill content is lazily loaded only when relevant -- skills can package instructions, scripts, resources, and tool/MCP config -- subagents can be spawned automatically for isolated work, but they have - isolated context and return only final summaries -- Oracle and Librarian are specialized helper agents for second opinion and - cross-repository research - -Amp validates `.agents/skills/` as the preferred repo-local skill path. - -## SF Model - -SF should represent agent state as orthogonal axes, not one overloaded mode. - -```text -workMode: chat | plan | build | review | repair | research -runControl: manual | assisted | autonomous -permissionProfile: restricted | normal | trusted | unrestricted -modelMode: fast | smart | deep -surface: tui | web | headless | rpc -``` - -Note: `repair` is a `workMode`, not a separate subsystem. The `/doctor` command is the diagnostic engine; `/repair` switches `workMode` to `repair`. - -Examples: - -```text -plan | manual | normal | deep -build | autonomous | trusted | smart -repair | assisted | normal | smart -research | autonomous | restricted | deep -review | manual | restricted | deep -``` - -Definitions: - -- `workMode` describes what kind of work SF is doing. -- `runControl` describes who advances the loop. -- `permissionProfile` describes what tool/file/network actions may proceed - without approval. -- `modelMode` describes speed/cost/reasoning posture. -- `surface` describes how the user or automation is connected. - -`autonomous` is not the whole mode. It is a run-control value. - -## Work Modes - -### `chat` - -Default conversational mode for questions, explanations, and low-commitment -exploration. 
- -### `plan` - -Research, clarify, write/update specs, derive tasks, and produce an explicit -acceptance point before implementation. - -### `build` - -Implement, test, lint, typecheck, verify, and prepare commit-ready changes. - -### `review` - -Inspect diffs, tests, risks, regressions, security issues, and missing evidence. - -### `repair` - -Fix SF health, repo health, runtime drift, broken generated state, bad command -surfaces, failing workflow infrastructure, stale locks, and broken installed -runtime copies. - -Doctor is not a permanent mode. Doctor is the diagnostic engine used by -`repair`. - -### `research` - -Longer-form codebase, competitor, design, API, or dependency research. This can -use web search, local code exploration, cross-repo research, and helper agents. - -## Run Control - -```text -manual user drives every step -assisted SF executes one unit, then pauses -autonomous SF continues until done, blocked, interrupted, budget-hit, or limit-hit -``` - -Transitions: - -```text -/control manual -/control assisted -/control autonomous -/autonomous -/next -/pause -/stop -``` - -`/autonomous` is a direct command. Do not route through `/sf autonomous`. - -## Permission Profiles - -```text -restricted read-only and explicitly allowlisted actions -normal safe edits, non-destructive local commands -trusted build/test/install/local commits and bounded repo automation -unrestricted high-risk orchestration only in intentionally trusted environments -``` - -This is SF's equivalent of Droid autonomy levels and Copilot permission -expansion, but the names are SF-native and policy-oriented. - -Rules: - -- Permission profile never implies autonomous continuation. -- Autonomous continuation never implies broader permissions. -- Denylists and safety gates override permission profile. -- Risk decisions must be logged with the active work mode, run control, and - permission profile. 
- -## Model Modes - -```text -fast cheap/quick routing for small bounded tasks -smart default balanced routing -deep high-reasoning routing for planning, debugging, research, and review -``` - -This is SF's equivalent of Amp's `rush`, `smart`, and `deep`, but with names -that match SF's tone and routing layer. - -`modelMode` should guide routing; it should not replace explicit model -selection. - -## Mode Switching - -Mode switching must be first-class and visible. - -Direct commands: - -```text -/mode chat -/mode plan -/mode build -/mode review -/mode repair -/mode research -/control manual -/control assisted -/control autonomous -/trust restricted -/trust normal -/trust trusted -/trust unrestricted -/model-mode fast -/model-mode smart -/model-mode deep -``` - -Combined forms: - -```text -/mode repair --autonomous --trust normal -/mode build --autonomous --trust trusted -/mode research --autonomous --trust restricted --model-mode deep -``` - -Autonomous steering: - -```text -/steer mode repair -/steer mode review after-current-unit -/steer trust restricted now -/steer model-mode deep for-next-unit -``` - -Transition scopes: - -- `now`: apply before the next dispatch point if no tool is active -- `after-current-tool`: finish the active tool, then switch -- `after-current-unit`: finish the current SF unit, then switch -- `next-milestone`: switch after the current milestone completes - -Autonomous mode changes should affect future decisions, not mutate an active -tool call midway through execution. 
- -Every transition should be logged: - -```json -{ - "from": {"workMode": "build", "runControl": "autonomous"}, - "to": {"workMode": "repair", "runControl": "autonomous"}, - "reason": "pre-dispatch health gate failed", - "scope": "after-current-unit" -} -``` - -## Plan To Autonomous Handoff - -The primary user journey should be: - -```text -plan | manual | normal | deep -accept plan -build | autonomous | selected-permission-profile | smart -``` - -Required surfaces: - -- TUI: plan acceptance prompt includes "run autonomously" -- Web: plan acceptance button includes "run autonomously" -- Headless: `--autonomous` chains into direct `/autonomous` -- RPC: machine event records the transition explicitly - -This should not use `/sf`. - -## Repair Work Mode - -`repair` is a `workMode`, not a separate subsystem. - -Commands: - -```text -/doctor -/doctor fix -/doctor heal -/repair -/repair --autonomous -``` - -Semantics: - -- `/doctor` inspects health and reports. -- `/doctor fix` applies deterministic repairs. -- `/doctor heal` uses an LLM-assisted diagnostic flow for deeper issues. -- `/repair` switches work mode to `repair`. -- `/repair --autonomous` keeps repairing until clean, blocked, or limit-hit. - -Automatic transitions: - -```text -build | autonomous | trusted | smart --> repair | autonomous | normal | smart -``` - -This is allowed when health gates fail, installed runtime drift is detected, SF -cannot dispatch safely, or repo workflow state is corrupted. - -## Skills - -SF should use `.agents/skills/` for repo-local skills. 
- -```text -.agents/skills/<skill-name>/ - SKILL.md - scripts/ - schemas/ - checklists/ - mcp.json -``` - -Skill behavior should match the best Factory/Amp pattern: - -- skills are narrow reusable capabilities -- users can invoke a skill directly when it is user-invocable -- SF can lazily load a skill when relevant if model invocation is allowed -- supporting files live beside the skill -- dangerous skills are never model-invoked by default -- project skills are committed with the repo -- user skills live in user-level skill directories - -Recommended frontmatter: - -```yaml ---- -name: forge-command-surface -description: Use when changing SF slash commands, browser command parity, or headless command dispatch. -user-invocable: true -model-invocable: true -side-effects: code-edits -permission-profile: normal ---- -``` - -Dangerous workflow: - -```yaml ---- -name: production-deploy -description: Deploy production services after release gates pass. -user-invocable: true -model-invocable: false -side-effects: production-mutation -permission-profile: trusted ---- -``` - -Background knowledge: - -```yaml ---- -name: forge-autonomous-runtime -description: Explains SF autonomous loop, UOK gates, installed-runtime drift, and recovery paths. -user-invocable: false -model-invocable: true -side-effects: none -permission-profile: restricted ---- -``` - -## Automatic Skill Creation - -SF should add repo-specific skills when it repeatedly rediscovers a useful -pattern. - -Flow: - -1. Detect repeated repo-specific evidence: same files, same commands, same - failure mode, same architectural rule, same verification path. -2. Propose a skill in manual/restricted contexts. -3. Generate or update a project skill automatically only when policy allows it. -4. Record source evidence in `.sf` state. -5. Keep the skill narrow and testable. -6. Commit the skill with the repo when accepted. 
- -Examples for Forge: - -- `forge-command-surface` -- `forge-web-mode` -- `forge-autonomous-runtime` -- `forge-release-verification` -- `forge-installed-runtime-drift` - -Examples for DR: - -- `dr-agent-windows` -- `dr-portal-ui-and-handlers` -- `dr-production-readiness` -- `dr-systematic-debugging` - -This is the Hermes-agent direction: reusable operational knowledge becomes -repo-local skills plus `.sf` evidence, not scattered markdown. - -## Background Work Surface - -SF needs one coherent background work surface. - -Direct command: - -```text -/tasks -``` - -It should show: - -- autonomous units (durable state: todo | in_progress | review | done | retrying | failed | cancelled) -- parallel workers -- scheduled autonomous dispatches -- background shell sessions -- stuck or resumable sessions -- remote questions waiting for answers -- current cost/budget state -- last checkpoint and next action - -Task lifecycle uses ORCH-style states. `todo` means ready to run, not "queued." - -This complements, not replaces: - -- `/status` -- `/queue` (milestone dispatch order, not task state) -- `/parallel status` -- `/session-report` -- `/logs` -- `/forensics` - -Copilot's `/tasks` and `/session` are less powerful internally, but clearer as -control surfaces. SF should keep its deeper state and expose it better. - -## Actual Source Pass: Awesome CLI Agent Repos - -Checked locally under `/tmp/sf-agent-research`: - -- `bradAGI/awesome-cli-coding-agents` -- `plandex-ai/plandex` -- `leonardcser/smelt` -- `mikeyobrien/ralph-orchestrator` -- `subsy/ralph-tui` -- `oxgeneral/ORCH` -- `LucasDuys/forge` -- `ramarlina/agx` -- `youwangd/SageCLI` -- `jcast90/relay` -- `basilisk-labs/agentplane` -- `amaar-mc/wit` -- `fastxyz/skill-optimizer` -- `0xmariowu/AgentLint` -- `ZENG3LD/gate4agent` - -`arosstale/pi-builder` was listed but the GitHub repository was not found when -cloned on 2026-05-08. 
- -### Smelt - -Smelt's source has four modes: - -```text -normal -> plan -> apply -> yolo -``` - -It also has separate reasoning effort: - -```text -off | low | medium | high | max -``` - -Useful: - -- mode cycling is explicit and configurable -- permissions differ by mode -- read-only commands are allowed, writes usually ask, deny wins -- approval scopes are explicit: once, session, workspace -- workspace approvals persist under a workspace hash - -Do not copy: - -- `yolo` as a name -- putting work kind and trust level into one mode axis - -SF should keep Smelt's visible cycling and approval scopes, but preserve SF's -separate axes: `workMode`, `runControl`, `permissionProfile`, and `modelMode`. - -### ORCH - -ORCH has the cleanest small task state machine: - -```text -todo -> in_progress -> review -> done - \-> retrying -> in_progress - \-> failed -review -> todo -* -> cancelled -``` - -It also keeps runtime state separately: - -- `running` -- `claimed` -- `retry_queue` -- total run/task/token/runtime stats - -Useful for SF: - -- `/tasks` should show both durable task status and ephemeral running state -- successful completion should pass through review, even when auto-approved -- dependency blockers should be computed, not implied from ordering -- retrying should be an explicit state, not hidden inside logs - -### AgentPlane - -AgentPlane's strongest idea is schema-first task artifacts. 
Task README -frontmatter includes: - -- `risk_level` -- `status` -- `depends_on` -- `task_kind` -- `mutation_scope` -- `risk_flags` -- `blueprint_request` -- `verify` -- `plan_approval` -- `verification` -- `runner` - -Its workflow file also makes operational policy explicit: - -- workflow mode -- status commit policy -- workspace isolation -- retry policy -- scheduler concurrency -- required evaluator checks -- event log location - -Useful for SF: - -- task artifacts should have schema-backed frontmatter, not loose markdown -- plan approval and verification state deserve durable fields -- mutation scope and risk flags should feed `permissionProfile` -- workflow policy should be inspectable by `/status` and `/tasks` - -### Relay - -Relay's useful concepts: - -- a channel is the workspace for one piece of work -- tickets are parallelizable units with dependency DAGs, retry budgets, - specialty tags, optional repo routing, and verification commands -- decisions are first-class durable records -- crosslink lets agents discover and message other sessions -- complexity tiers drive approval behavior -- CLI/TUI/GUI all read the same state - -Useful for SF: - -- keep decisions as first-class records, not buried in summaries -- remote steering should become full-session steering and cross-session - messaging, not only remote questions -- multi-repo work needs explicit repo routing on tasks -- one state store should power TUI, web, headless, and RPC - -### Ralph - -Ralph's hat system is useful as a coordination topology: - -- hats declare triggers, publishes, instructions, backend overrides, max - activations, and disallowed tools -- events flow through a bus -- scope violations are detected when hats publish undeclared topics -- exhaustion emits explicit events - -Useful for SF: - -- specialized helpers should declare trigger/publish contracts -- helper activation should have max activation limits -- helper output should be checked against declared output topics -- mode 
transitions can be modeled as events, not ad hoc flags - -### Sage - -Sage's real value is runtime-neutral orchestration: - -- agents are processes -- messages are files -- tasks are templates with frontmatter -- plans decompose into dependency waves -- tasks in a wave execute in parallel -- resume skips done tasks and resets stale running tasks -- runtime fallback is explicit -- bench-as-code compares actual agent CLIs on actual tasks - -Useful for SF: - -- `/tasks` should be file/DB-backed enough that headless tools can read it - without attaching to a live TUI -- dependency waves should be visible in planning output -- stale running work should be reset or surfaced clearly on resume -- model/provider benchmarking should use actual SF workflows, not isolated - model prompts - -### AGX - -AGX has useful low-level patterns: - -- graph scheduler with hard, soft, failure, and always dependency conditions -- max concurrent work slots -- checkpoints with patch files and bounded history -- deterministic verify gate before LLM fallback -- repeated verification failure count that forces action - -Useful for SF: - -- dependency edges should support more than "depends on success" -- checkpoints should store patch references and bounded summaries -- deterministic verification should always run before semantic/LLM review -- repeated verify failures should force a mode transition to `repair` or - `review`, not keep retrying indefinitely - -### Wit - -Wit is the strongest coordination pattern for parallel edits: - -- agents declare intent before editing -- agents acquire symbol-level locks -- conflicts are warnings, not always hard blocks -- contracts can be enforced by git hooks -- Tree-sitter provides symbol ranges and call edges -- a `coordinate` skill auto-loads when `.wit/` exists - -Useful for SF: - -- parallel SF workers should declare intent before editing -- conflict detection should eventually be symbol-aware, not only file-aware -- warnings can steer agents away from 
collisions without freezing work -- accepted interface contracts should be enforceable before commit - -### skill-optimizer - -Skill optimizer has the best pattern for making skills real: - -- a case is a user-like task plus deterministic graders -- a suite is a case/model matrix -- references are copied into `/work` -- the agent sees only `/work`, not graders or hidden answers -- graders inspect files, artifacts, `answer.json`, `trace.jsonl`, and result - state -- failed trials preserve workspace for debugging - -Useful for SF: - -- auto-created skills need eval cases -- skill acceptance should be grader-backed, not vibes-backed -- negative cases should check that irrelevant skills were not loaded -- skill optimization should test across model modes/providers - -### Plandex And Forge Loop - -Plandex reinforces: - -- chat/tell split -- configurable autonomy levels -- cumulative diff sandbox before applying changes -- model packs for planning vs execution - -Forge Loop reinforces: - -- R-numbered acceptance criteria -- task DAGs with tiered parallelism -- per-task worktrees -- per-task and session token budgets -- structural completion markers -- backpropagation from runtime failure to spec gap -- state on disk as the recovery source - -SF already has many of these ideas. The part to tighten is the explicit product -surface: direct commands, visible modes, `/tasks`, schema-backed state, and -skill evals. - -## Status And Mode Badge - -The active state should always be visible, especially during full autonomy. - -Recommended status line: - -```text -SF build | autonomous | trusted | smart -``` - -Compact badge form: - -```text -[B][A][T][S] -``` - -Preferred full labels in critical states: - -```text -repair | autonomous | normal | smart -review | assisted | normal | deep -``` - -Do not use "autopilot" in SF UI. It may appear only as competitor context in -this research note. 
- -## Implementation Pull-Through - -Already directionally right: - -- UOK lifecycle records carry `runControl`. -- UOK lifecycle records and execution-policy decisions carry - `permissionProfile`. -- Schedule command state uses `autonomous_dispatch`. -- SF has DB-backed state, recovery, verification, scheduling, captures, - forensics, projections, and self-reporting. -- SF has skills and project-specific skill paths. -- SF has parallel orchestration and remote-question infrastructure. - -Still needed: - -- ~~Remove `/sf` from docs/web/tests (Phase 2 deprecation)~~ ✓ Complete - -Completed ✓ (RA.Aid Patterns — Phase 2): - -- structured memory repositories (`memory-repository.js` — SQLite-backed key facts, - snippets, research notes, human inputs, work logs, decisions; content hash - deduplication; auto-summarization; prompt formatting; 11 tests pass) -- trajectory recording (`trajectory-recorder.js` — per-step tool/LLM/error - execution trace with costs, tokens, errors; session+unit scoped; exportable; - 10 tests pass) -- trajectory command (`/trajectory` — step-by-step trace with `--all`, `--errors`, - `--tools`, `--llm`, `--limit=N` flags; wired into `commands/handlers/ops.js`) -- reasoning assist + memory integration (`reasoning-assist.js` loads key facts, - snippets, research notes from memory repository into pre-stage consultation prompt) -- compaction fix (`register-hooks.js` — never cancel compaction; provide custom - compaction summary with work state preservation instead) - -Completed ✓ (Additional): - -- schema-backed task/frontmatter fields (`task-frontmatter.js` — risk levels, - mutation scopes, verification types, plan approval states, task/scheduler - statuses; wired into `sf-db.js` `insertTaskSpecIfAbsent()`) -- subagent provider/model/permission inheritance audit - (`subagent-inheritance.js` — blocked providers, fast-mode heavy model blocking, - restricted destructive tool blocking; wired into `subagent/index.js`) -- remote steering as full-session 
steering surface (`remote-steering.js` — - parse/apply/format directives with 5s cooldown throttle) -- parallel worker intent/claim registry (`parallel-intent.js` — declareIntent, - checkIntentConflicts, releaseIntent, getActiveIntents with TTL) -- skill eval harness foundation (`skills/eval-harness.js` — createEvalCase, - runGrader with 30s timeout, runSkillEvals) -- terminal title mode indicator (`auto/session.js` — OSC escape sequence + - `process.title`, format: `SF[workMode|runControl|permissionProfile|modelMode]`) -- self-feedback → workMode auto-transition (`self-feedback-drain.js` — - high/critical feedback dispatches auto-switch to `repair` with reason - `"self-feedback-drain"`) -- UOK events carry workMode + modelMode (`uok/kernel.js` — lifecycleFlags include - both; audit envelope payload includes both) -- enhanced `/steer` with mode transitions (`/steer mode <mode> [scope]`, - `/steer trust <profile> [scope]`, `/steer model-mode <mode> [scope]`) -- `/sf` prefix deprecation warning (Phase 1 — accept both forms, warn once per - session) -- centralized metrics system (`metrics-central.js` — Prometheus-compatible - Counter/Gauge/Histogram with session scoping, DB persistence, retry logic, - cost/token tracking; wired into subagent-inheritance + mode transitions) -- explicit stage commands (`/research`, `/plan`, `/implement` — set workMode and - dispatch corresponding phase) -- cost command (`/cost` — queries metrics-central DB + legacy ledger) -- reasoning assist foundation (`reasoning-assist.js` — pre-stage expert - consultation prompt builder, context loading, guidance injection; wired into - `auto/phases.js` dispatch path) - -Completed ✓: - -- make `workMode` durable state (SQLite session_mode_state table + AutoSession persistence) -- add direct mode/control/trust/model-mode commands -- make `--autonomous` chain into direct `/autonomous` -- add visible mode/status surface for TUI and web (header badge + /status) -- expose autonomous continuation limits in settings and status (mode badge shows runControl) -- add `/tasks` as the unified background work surface with durable task state, - ephemeral running state, retries, blockers, checkpoints, budget, and steering -- make `repair` a first-class workflow over doctor -- add policy-aware project skill suggestion/generation (auto-create flow) -- enhanced `/steer` with mode/trust/model-mode transitions -- TUI keyboard shortcuts for mode cycling (Ctrl+Shift+M/R/A/S/P) -- minimal auto-mode header/footer (badge visible during autonomy) -- `/sf` namespace removed from command registration; direct command roots only -- parallel worker intent/claim registry (declareIntent, checkIntentConflicts, releaseIntent) -- skill eval harness foundation (createEvalCase, runGrader, runSkillEvals) -- terminal title mode indicator (tmux/terminal tab visibility) - -## Direct Command Decision - -SF is the system, not a plugin namespace. 
- -Use: - -```text -/status -/autonomous -/doctor -/rate -/session-report -/parallel -/remote -/tasks -``` - -`/sf` is not registered in the TUI or browser command surface. - -Shell machine surface remains: - -```text -sf headless autonomous -sf headless --autonomous ... -``` - -The target model is simple: direct commands for humans, headless commands for -machines, durable state for autonomous execution, and explicit axes for mode, -control, trust, model posture, and surface. - -## Runtime Target: Node 26 - -SF treats Node 26.1+ as the runtime baseline. There is no compatibility path -for older Node versions in SF-owned runtime code. - -Source notes checked 2026-05-08: - -- Node 25 is a short-lived current line. It is useful as a compatibility probe, - but not a target. -- Node 26 is current now, LTS-bound, and useful for SF's own runtime model. -- Bun is closer to Node every release and supports many Node APIs plus - Node-API, but its compatibility target and partial API areas do not match - SF's risk surface yet. -- Deno supports Node/npm compatibility, package.json, local node_modules, and - Node-API addons with FFI permission, but that means SF would still be running - a Node-compatibility workload. -- LLRT is experimental and serverless-oriented, not a local CLI/runtime fit. - -### Why Node 26 Makes SF Stronger - -Node 26 is not just "newer Node." It gives SF a better platform for long-running -agent work: - -- `Temporal` is enabled by default. -- V8 14.6 is the JavaScript engine baseline. -- Undici 8 is the HTTP/fetch baseline. -- Node 26 removes and deprecates more legacy APIs, so it hardens SF against old - loader, stream, HTTP, crypto, and dependency assumptions. - -### Temporal Is More Than Better Dates - -Temporal gives SF the vocabulary it already needs for durable autonomous work. - -Important Temporal concepts: - -- `Temporal.Instant`: an exact point in history. 
Use for journal events, - checkpoint timestamps, lock leases, provider call start/end, and trace order. -- `Temporal.ZonedDateTime`: an exact instant plus time zone and calendar. Use - for reminders, schedules, adoption reviews, audits, and "run this at local - business time" semantics. -- `Temporal.PlainDate`: a calendar date without time or time zone. Use for - daily reports, milestone review dates, and human-facing due dates. -- `Temporal.PlainTime`: a wall-clock time without date or zone. Use for - recurring "at 09:00" style policies. -- `Temporal.PlainDateTime`: a date and wall-clock time before binding it to a - zone. Use only when the zone is deliberately chosen later. -- `Temporal.Duration`: a typed amount of time. Use for budgets, leases, - cooldowns, retry delays, schedule offsets, and age checks. - -That split matters because SF currently has many different meanings hidden -behind timestamps and strings: - -- exact event ordering -- local user reminders -- project schedule dates -- lease expiry -- retry backoff -- adoption review windows -- elapsed runtime -- "next business day" style planning - -`Date` collapses those into one weak type. Temporal lets SF store and validate -the real intent. - -### SF Runtime Places That Should Use Temporal - -Use Temporal first in the areas where wrong time semantics create real -operational mistakes: - -- `sf schedule`: due dates, relative offsets, local-time reminders, audit - windows, and recurrence-ready storage. -- autonomous locks and leases: exact `Instant` plus typed `Duration`, not - implicit millisecond math scattered through code. -- journals and traces: exact event instants with stable ordering and explicit - serialization. -- session reports: elapsed durations and grouped daily summaries without local - timezone drift. -- adoption reviews and decision audits: calendar dates and wall-clock reminders - that survive DST and timezone changes. 
-- background work surface: task age, stale-running detection, retry-after, and - next-action time should be typed. - -**Implementation Status:** `temporal-foundation.js` is a native-only Node 26 -wrapper with safe constructors (`instantFromISO`, `durationFromObject`, -`plainDateFromISO`), serialization, deserialization, and validation. It throws -clearly when native Temporal is unavailable instead of using compatibility -shims. - -### Temporal Design Rule For SF - -Store the semantic type, not just the formatted string: - -```text -event happened exactly now -> Instant -run at 09:00 in Europe/Oslo -> ZonedDateTime or PlainTime + timeZone -review on 2026-06-01 -> PlainDate -retry after 30 minutes -> Duration -lease expires at exact timestamp -> Instant -``` - -Serialization should stay explicit and boring: - -- store ISO strings plus a field that says which Temporal type they represent -- include timezone when wall-clock semantics matter -- do not infer local timezone at read time unless the record explicitly asks - for it -- validate schedule and lease records at DB boundaries - -### Node 26 Adoption Path - -Target policy: - -```text -current compatibility floor: Node 26.1+ -internal target runtime: Node 26.1+ -canonical baseline: Node 26.1+ -Node 25: skip except quick probes -``` - -### Runtime Alternatives - -Other JavaScript runtimes are useful comparators, but none should replace Node -as SF's primary runtime right now. - -SF's current runtime shape is Node-native: - -- npm workspaces and `package-lock.json` -- Next.js standalone web host -- Vitest and Node test-runner compatibility scripts -- Rust N-API `.node` addons -- `node-pty` native assets in the web host -- `node:` built-ins across CLI, scripts, packages, and web services -- child process, TTY, stream, module loader, and extension-loader behavior -- installed runtime sync into `~/.sf/agent` - -#### Bun - -Bun is the strongest speed and developer-experience competitor. 
- -Useful: - -- fast package install and script startup -- broad Node API compatibility -- built-in TypeScript, test runner, shell, SQLite, YAML, TOML, JSONL, and other - convenience APIs -- Node-API support is substantial enough to use as a compatibility probe - -Not primary for SF: - -- Bun's own docs say compatibility reflects Node v23, while SF is targeting - Node 26. -- Some core APIs are partial or behaviorally different: `child_process`, module - loader hooks, `node:v8`, `node:test`, `node:sqlite`, `worker_threads`, and - inspector/debugger areas are not exact Node. -- SF's highest-risk paths are exactly the places where "almost Node" can hurt: - TTY, child processes, native addons, Next standalone output, loaders, and - extension runtime. - -Decision: use Bun only for optional speed probes or isolated tooling. Do not -make it the SF runtime until full `npm test`, web build, native build, smoke -tests, and installed extension runtime all pass under Bun without special -cases. - -#### Deno - -Deno has the best security and integrated-toolchain story. - -Useful: - -- explicit permissions model -- first-class TypeScript and web standards -- npm/package.json compatibility -- Node-API support when local `node_modules` and FFI permission are enabled -- good target for thinking about sandboxing and permission profiles - -Not primary for SF: - -- Deno still becomes a Node-compatibility mode for a repo like SF. -- Deno docs recommend local `node_modules` for frameworks like Next.js and for - Node-API addons, which means SF would keep most Node/npm complexity anyway. -- Native addons require local `node_modules` plus `--allow-ffi`. -- The value would be security posture and packaging experiments, not simpler - runtime execution. - -Decision: study Deno for permission-profile design and maybe future packaged -headless workers. Do not switch the core SF runtime to Deno. - -#### LLRT, WinterJS, Edge Runtimes - -These are not fits for SF's primary runtime. 
- -Useful: - -- serverless cold-start research -- constrained worker/edge execution ideas -- tiny isolated helper tasks - -Not primary for SF: - -- SF is a long-running local CLI/runtime, not a small stateless Lambda handler. -- SF needs native addons, process control, TTY, filesystem state, git, shell, - Next web host, and Node-compatible package behavior. -- LLRT is explicitly experimental and evaluation-oriented. - -Decision: ignore as primary runtime. Only revisit for isolated future worker -surfaces. - -### Runtime Decision - -Node 26 is the target because SF is a Node-native agent runtime, not a generic -JavaScript app. - -Use alternatives this way: - -```text -Node 26 -> primary runtime and baseline -Bun -> speed/compatibility probe, not runtime -Deno -> permission/sandbox design reference, not runtime -LLRT -> ignore except tiny serverless worker research -``` - -The rule is simple: if a runtime cannot run the exact SF stack without special -cases, it is not stronger for SF. Node 26 makes the existing SF stack stronger; -alternative runtimes mostly make a different stack. - -Required Node 26 gate: - -```text -node@26 --version -npm run lint -npm run typecheck:extensions -npm run build -npm test -sf --version -sf --help -sf --print "ping" -``` - -SF already requires Node 26.1+ in `engines.node`; the remaining work is to keep -the gates green under Node 26 and replace fragile `Date`/millisecond logic with -Temporal in the schedule, lease, journal, and background task surfaces. - ---- - -## Appendix A: Related Source Files - -This section maps the concepts in this document to actual code in the repo. 
- -### A.1 Operating Model (Already Exists) - -**File:** `src/resources/extensions/sf/operating-model.js` - -Already exports canonical vocabulary: - -```js -export const RUN_CONTROL_MODES = ["manual", "assisted", "autonomous"]; -export const PERMISSION_PROFILES = ["restricted", "normal", "trusted", "unrestricted"]; -``` - -Tests: `src/resources/extensions/sf/tests/operating-model.test.mjs` - -**Gap:** No `workMode` or `modelMode` constants yet. Add to this file. - -### A.2 Execution Policy (Already Exists) - -**File:** `src/resources/extensions/sf/execution-policy.js` - -Maps permission profiles to concrete tool restrictions: - -```js -EXECUTION_POLICY_PROFILES = { - restricted: { filesystem: "read-mostly", network: "read-only", git: "read-only", mutation: "planning-artifacts-only" }, - normal: { filesystem: "workspace-write", network: "allowed", git: "normal", mutation: "workspace" }, - trusted: { filesystem: "workspace-write", network: "allowed", git: "normal", mutation: "workspace" }, - unrestricted: { filesystem: "danger-full-access", network: "allowed", git: "dangerous", mutation: "host" } -}; -``` - -**Status:** Wired to tool-call boundaries via `bootstrap/register-hooks.js` `tool_call` hook. `classifyExecutionPolicyCall()` reads `session.permissionProfile` to block destructive commands when `restricted`/`normal`. Enforcement is unified at the hook level. - -### A.3 Auto Session State (Already Exists) - -**File:** `src/resources/extensions/sf/auto/session.js` - -`AutoSession` class holds: -- `active`, `paused`, `stepMode`, `canAskUser` -- `currentUnit`, `currentMilestoneId` -- `autoModeStartModel`, `currentUnitModel` - -**Status:** `workMode`, `runControl`, `permissionProfile`, `modelMode`, `surface`, and `modeUpdatedAt` are all durable properties on `AutoSession`. Persisted to SQLite `session_mode_state` table on every transition. Loaded from DB on construction. 
- -### A.4 Command Registration (Already Exists) - -**File:** `src/resources/extensions/sf/commands/index.js` - -Registers direct commands via `pi.registerCommand()`: - -```js -for (const command of DIRECT_SF_COMMANDS) { - pi.registerCommand(command.cmd, { ... }); -} -``` - -**File:** `src/resources/extensions/sf/commands/catalog.js` - -Defines `TOP_LEVEL_SUBCOMMANDS` and `DIRECT_SF_COMMANDS`. - -**Status:** Direct commands implemented (`/mode`, `/control`, `/trust`, `/model-mode`, `/repair`, `/tasks`, `/skills`). `/sf` is not registered; the shell executable remains `sf`. - -### A.5 TUI Extension (Already Exists) - -**File:** `src/resources/extensions/sf-tui/index.js` - -Registers shortcuts: -- `Ctrl+Alt+H` — prompt history -- `Ctrl+Shift+H` — prompt history fallback -- `Ctrl+Alt+M` — marketplace - -**File:** `src/resources/extensions/sf-tui/header.js` - -Renders header with project name, branch, model. No mode badge yet. - -**File:** `src/resources/extensions/sf-tui/footer.js` - -Renders footer with git status, cost, context usage. No mode badge yet. - -**File:** `src/resources/extensions/sf-tui/extension-manifest.json` - -Declares hooks: `session_start`, `session_switch`, `before_agent_start`, `tool_result`, `agent_start`, `agent_end`. - -**Status:** Mode badge implemented in TUI header and footer. Compact `[B∞TS]` form at <80 cols, full `build · autonomous · trusted · smart` at ≥80 cols. Paused state dims all badge parts and shows `P!` (compact) or `paused ·` (full) prefix. `renderModeBadge` exported from header.js and shared with footer via `FOOTER_THEME` adapter. `getMode()` surfaces `session.paused` on the returned mode object. 
- -### A.6 UOK Parity Report (Already Uses runControl) - -**File:** `src/resources/extensions/sf/tests/uok-parity-report.test.mjs` - -Tests verify `runControl` and `permissionProfile` in UOK events: - -```js -assert.equal(events[0].runControl, "autonomous"); -assert.equal(events[0].permissionProfile, "normal"); -``` - -**Status:** `workMode` and `modelMode` added to AutoSession. Journal logging emits `mode-transition` events. UOK kernel includes both in `lifecycleFlags` and audit envelope payload. - -### A.7 Routing History (Already Exists) - -**File:** `src/resources/extensions/sf/routing-history.js` - -Tracks model tier success/failure per task pattern. - -**Status:** Connected. `modelModeToTier()` / `tierToModelMode()` bridge in `operating-model.js`. `classifyUnitComplexity()` signature includes `modelMode`. `deep` floors at `heavy`, `fast` caps at `light`. - -### A.8 Doctor System (Already Exists) - -**File:** `src/resources/extensions/sf/doctor.js` -**File:** `src/resources/extensions/sf/doctor-proactive.js` -**File:** `src/resources/extensions/sf/doctor-checks.js` - -Health checks, auto-fix, proactive monitoring. - -**Status:** `/repair` command switches to `repair` work mode and runs doctor fix. Auto-transitions to repair allowed when health gates fail. - -### A.9 Self-Feedback (Already Exists) - -**File:** `src/resources/extensions/sf/self-feedback.js` - -Records anomalies, blocking entries, version-bump resolution. - -**Status:** Connected. `self-feedback-drain.js` auto-transitions to `repair` workMode when high/critical self-feedback is dispatched for inline-fix. Reason: `"self-feedback-drain"`. - -### A.10 Skills (Partially Exists) - -**File:** `src/resources/extensions/sf/skill-discovery.js` -**File:** `src/resources/extensions/sf/skill-health.js` -**File:** `src/resources/extensions/sf/skill-telemetry.js` - -Skill loading, health monitoring, telemetry. 
- -**Status:** `.agents/skills/` directory structure implemented with YAML frontmatter parser, validation, skill loader, and auto-creation flow. Auto-creation detects patterns from activity logs (≥3 occurrences) and generates skills with a SQLite-backed cooldown. Sample skills created: `forge-command-surface`, `forge-autonomous-runtime`. - ---- - -## Appendix B: Implementation Priority - -| Priority | Item | Files to Touch | Effort | -|----------|------|----------------|--------| -| P0 | Add `workMode` + `modelMode` to `operating-model.js` | `operating-model.js`, `operating-model.test.mjs` | Small ✓ | -| P0 | Add `workMode` to `AutoSession` | `auto/session.js`, `auto.js` | Small ✓ | -| P0 | Add mode badge to TUI header | `sf-tui/header.js`, `sf-tui/index.js` | Small ✓ | -| P0 | Add mode-switching shortcuts | `sf-tui/index.js`, `extension-manifest.json` | Small ✓ | -| P0 | Remove `/sf` namespace registration | `commands/catalog.js`, `commands/index.js` | Medium ✓ | -| P1 | Add `/mode`, `/control`, `/trust`, `/model-mode` commands | `commands/handlers/*.js`, `commands/catalog.js` | Medium ✓ | -| P1 | Wire `execution-policy.js` to tool boundaries | `execution-policy.js`, `bootstrap/register-hooks.js` | Medium ✓ | -| P1 | Add `/tasks` background work surface | `commands/handlers/tasks.js` | Medium ✓ | -| P1 | Make `repair` first-class work mode | `commands/handlers/ops.js`, `commands/handlers/core.js` | Medium ✓ | -| P2 | Add `.agents/skills/` structure | `skills/*.js`, `.agents/skills/` | Medium ✓ | -| P2 | Add skill YAML frontmatter parser | `skills/frontmatter.js` | Small ✓ | -| P2 | Add skill eval harness | `skills/eval-harness.js`, eval templates | Medium ✓ | -| P2 | Adopt Temporal in `sf schedule` | `temporal-foundation.js` | Medium ✓ | -| P2 | Node 26 baseline | `temporal-foundation.js` native Temporal wrapper | Medium ✓ | - ---- - -## Appendix C: Open Questions — Resolved - -1. 
**Paused badge** → `[P!BATS]` in compact form; `paused · build · assisted · normal · smart` in full form. All parts dim when paused. Implemented in `renderModeBadge`. -2. **Mode per-session or per-project?** → Per-session. Mode is a runtime posture for the current work, not a project-level config. -3. **Badge in tmux/terminal window titles?** → Terminal title already handled via OSC escape in `auto/session.js`. Tmux requires users to set `set-option -g set-titles on` — SF does not inject tmux config. -4. **Mode transitions with sound/notification?** → No. A terminal tool has no appropriate sound channel. The badge is the sole visibility mechanism. -5. **`repair` auto-transition: ask by default for new projects?** → No. Auto-transition is correct behavior for autonomous runs. Only if `runControl` is `manual` or `assisted` is the transition blocked. -6. **Skill eval cases: run in CI or on-demand?** → On-demand only. Gate with `SF_SKILL_EVALS=1` env var. CI is too slow and model-dependent for skill evals. -7. **`/tasks`: TUI overlay or separate scrollable panel?** → Inline output (current). A full panel requires `pi-tui` overlay support that is not yet built. -8. **`modelMode` replace or supplement tier system?** → Supplement via `modelModeToTier()` bridge. Direct model selection overrides `modelMode`; `modelMode` guides routing when no explicit model is set. diff --git a/packages/coding-agent/src/modes/interactive/interactive-mode.ts b/packages/coding-agent/src/modes/interactive/interactive-mode.ts index a5de30e24..f4e8cd4c0 100644 --- a/packages/coding-agent/src/modes/interactive/interactive-mode.ts +++ b/packages/coding-agent/src/modes/interactive/interactive-mode.ts @@ -1929,6 +1929,31 @@ export class InteractiveMode { this.extensionTerminalInputUnsubscribers.clear(); } + /** + * Register an extension-scoped terminal input listener. + * + * Purpose: allow extensions (e.g. 
the SF autonomous extension) to intercept + * raw terminal input before it reaches the editor, so that special keys like + * Ctrl+C can trigger extension actions (e.g. pause autonomous mode) rather + * than always going to the default editor clear handler. + * + * Return `{ consume: true }` from the handler to stop the key from being + * processed further. Return `undefined` or `{}` to let it propagate. + * + * Consumer: extension-ui-controller → ctx.ui.onTerminalInput. + */ + addExtensionTerminalInputListener( + handler: (data: string) => { consume?: boolean } | undefined, + ): () => void { + const listener = (data: string) => handler(data); + const unsubscribe = this.ui.addInputListener(listener); + this.extensionTerminalInputUnsubscribers.add(unsubscribe); + return () => { + unsubscribe(); + this.extensionTerminalInputUnsubscribers.delete(unsubscribe); + }; + } + /** * Create the ExtensionUIContext for extensions. */ diff --git a/packages/tui/src/ink-bridge.tsx b/packages/tui/src/ink-bridge.tsx index 3c4038dc9..1041ee00e 100644 --- a/packages/tui/src/ink-bridge.tsx +++ b/packages/tui/src/ink-bridge.tsx @@ -42,19 +42,29 @@ function LegacyComponentView({ * * Purpose: accept keyboard input from Ink and route it to the active * component, then trigger a re-render so the updated state is displayed. + * Invalidation is event-driven: external callers invoke the returned + * invalidate() handle, which fires the tick signal registered here. * * Consumer: startInkRenderer. */ function InkApp({ root, onInput, + onRegisterTick, }: { root: Component; onInput: (data: string) => void; + onRegisterTick: (tick: () => void) => void; }) { const [, tick] = useState(0); const { columns } = useWindowSize(); + // Register the tick function so that startInkRenderer's invalidate() can + // trigger a React re-render without a polling interval. 
+ useEffect(() => { + onRegisterTick(() => tick((n) => n + 1)); + }, [onRegisterTick]); + useInput((input, key) => { // Reconstruct the escape sequences that the legacy key handlers expect. let data = input; @@ -70,12 +80,6 @@ function InkApp({ tick((n) => n + 1); }); - // Poll at 20 fps so async state changes (e.g. streaming output) appear promptly. - useEffect(() => { - const interval = setInterval(() => tick((n) => n + 1), 50); - return () => clearInterval(interval); - }, []); - return ; } @@ -84,10 +88,11 @@ function InkApp({ * * Purpose: drop-in replacement for the legacy TUI render engine. Mounting * this drives the entire Ink React tree and forwards terminal input to - * the root Component's handleInput chain. + * the root Component's handleInput chain. invalidate() triggers an + * immediate React re-render via an event-driven tick signal — no polling. * - * Consumer: TUI class (future integration); standalone callers can use - * this directly to render any Component tree under Ink. + * Consumer: TUI class; standalone callers can use this to render any + * Component tree under Ink. * * @param root - The root Component whose render() output fills the screen. * @param onInput - Called with each decoded key string for legacy handlers. @@ -97,13 +102,22 @@ export function startInkRenderer( root: Component, onInput: (data: string) => void, ): { stop: () => void; invalidate: () => void } { + // Mutable signal populated by InkApp via onRegisterTick once the React + // tree has mounted. invalidate() fires this to trigger a synchronous tick. + let _tick: (() => void) | null = null; + const onRegisterTick = (tick: () => void) => { + _tick = tick; + }; + const { unmount } = render( - , + , { exitOnCtrlC: false }, ); return { - stop: unmount, - // Ink re-renders automatically; manual invalidation is a no-op for now. 
- invalidate: () => {}, + stop: () => { + _tick = null; + unmount(); + }, + invalidate: () => _tick?.(), }; } diff --git a/packages/tui/src/tui.ts b/packages/tui/src/tui.ts index 2c5f916ca..c1d9a569a 100644 --- a/packages/tui/src/tui.ts +++ b/packages/tui/src/tui.ts @@ -420,9 +420,16 @@ export class TUI extends Container { if (!this.terminal.isTTY) { return; } - // Ink-backed render path: Ink manages raw mode and input; the legacy - // differential renderer is bypassed entirely. - if (this._useInk || process.stdout.isTTY) { + // Ink-backed render path: Ink manages raw mode, input, and screen output. + // The legacy differential renderer (doRender) is bypassed entirely on TTY. + // process.stdout.isTTY guards this path — Ink requires a real interactive + // TTY to mount. useInk() is kept as an explicit opt-in for callers that + // want Ink on non-standard terminal configurations. Use PI_LEGACY_TUI=1 + // to force the legacy renderer for debugging. + if ( + (this._useInk || process.stdout.isTTY) && + process.env.PI_LEGACY_TUI !== "1" + ) { // Wrap `this` in a plain Component so the private handleInput doesn't // conflict with the public Component.handleInput? signature. const root: Component = { @@ -506,6 +513,12 @@ export class TUI extends Container { requestRender(force = false): void { // Skip rendering on non-TTY stdout to prevent CPU burn (issue #3095) if (!this.terminal.isTTY) return; + // Ink-backed path: Ink owns the terminal — delegate to the Ink handle and + // do NOT call doRender(), which would write conflicting ANSI escapes. 
+ if (this._inkHandle) { + this._inkHandle.invalidate(); + return; + } if (force) { this.previousLines = []; this.previousWidth = -1; // -1 triggers widthChanged, forcing a full clear diff --git a/src/cli-status.ts b/src/cli-status.ts index f91aaab29..b91504374 100644 --- a/src/cli-status.ts +++ b/src/cli-status.ts @@ -11,6 +11,7 @@ import type { QuerySnapshot } from "./headless-query.js"; interface StatusArgs { watch: boolean; + recoveryUnitId?: string; } interface StatusDeps { @@ -27,6 +28,12 @@ interface CurrentModel { function parseStatusArgs(argv: string[]): StatusArgs { const args = argv.slice(1); + if (args[0] === "recovery") { + return { + watch: false, + recoveryUnitId: args[1], + }; + } return { watch: args.includes("--watch"), }; @@ -219,6 +226,76 @@ async function buildStatusText( }); } +async function renderRecoveryDiagnostics( + basePath: string, + unitId: string | undefined, + stdout: Pick, + stderr: Pick, +): Promise { + try { + const { getRecoveryDiagnostics, listUnitRuntimeRecords } = await import( + "./resources/extensions/sf/uok/unit-runtime.js" + ); + let targetUnitId = unitId; + if (!targetUnitId) { + const records: Array<{ updatedAt?: number; unitId: string }> = + listUnitRuntimeRecords(basePath); + const mostRecent = records.sort( + (a, b) => (b.updatedAt ?? 0) - (a.updatedAt ?? 
0), + )[0]; + if (!mostRecent) { + stderr.write("sf status recovery: no runtime records found\n"); + return 1; + } + targetUnitId = mostRecent.unitId; + } + const diagnostics = getRecoveryDiagnostics( + basePath, + "execute-task", + targetUnitId, + ); + if (!diagnostics) { + stderr.write( + `sf status recovery: no runtime record for ${targetUnitId}\n`, + ); + return 1; + } + const lines: string[] = []; + lines.push("Recovery Diagnostics"); + lines.push("--------------------"); + lines.push(`Unit: ${diagnostics.unitType} ${diagnostics.unitId}`); + lines.push(`Status: ${diagnostics.status}`); + lines.push( + `Retries: ${diagnostics.retryCount}/${diagnostics.maxRetries}`, + ); + lines.push( + `Progress: ${diagnostics.progressCount} (${diagnostics.lastProgressKind})`, + ); + lines.push(`Recovery attempts: ${diagnostics.recoveryAttempts}`); + if (diagnostics.lastRecoveryReason) { + lines.push(`Last recovery reason: ${diagnostics.lastRecoveryReason}`); + } + if (diagnostics.lineageSummary) { + lines.push( + `Lineage: ${diagnostics.lineageSummary.status} · ${diagnostics.lineageSummary.workerCount} worker(s) · ${diagnostics.lineageSummary.eventCount} event(s)`, + ); + } + lines.push( + `Started: ${diagnostics.startedAt ? new Date(diagnostics.startedAt).toISOString() : "n/a"}`, + ); + lines.push( + `Updated: ${diagnostics.updatedAt ? new Date(diagnostics.updatedAt).toISOString() : "n/a"}`, + ); + stdout.write(lines.join("\n") + "\n"); + return 0; + } catch (err) { + stderr.write( + `sf status recovery: ${err instanceof Error ? err.message : String(err)}\n`, + ); + return 1; + } +} + export async function runStatusCli( argv: string[], deps: StatusDeps, @@ -228,6 +305,15 @@ export async function runStatusCli( const sfHome = deps.sfHome ?? process.env.SF_HOME ?? 
join(homedir(), ".sf"); const args = parseStatusArgs(argv); + if (args.recoveryUnitId !== undefined) { + return renderRecoveryDiagnostics( + deps.basePath, + args.recoveryUnitId, + stdout, + stderr, + ); + } + const renderOnce = async () => { try { const text = await buildStatusText(deps.basePath, sfHome); diff --git a/src/resources/extensions/mcp-client/index.js b/src/resources/extensions/mcp-client/index.js index a1fff2b51..b5af83f54 100644 --- a/src/resources/extensions/mcp-client/index.js +++ b/src/resources/extensions/mcp-client/index.js @@ -94,6 +94,32 @@ function getServerConfig(name) { (s) => s.name === trimmed || s.name.toLowerCase() === trimmed.toLowerCase(), ); } +const SAFE_CHILD_ENV_KEYS = new Set([ + "PATH", + "HOME", + "USER", + "LOGNAME", + "SHELL", + "LANG", + "LC_ALL", + "LC_CTYPE", + "LC_MESSAGES", + "LC_NUMERIC", + "LC_TIME", + "TMPDIR", + "TMP", + "TEMP", + "TZ", + "TERM", + "COLORTERM", +]); +function buildChildEnv(configEnv) { + const safe = {}; + for (const key of SAFE_CHILD_ENV_KEYS) { + if (process.env[key] !== undefined) safe[key] = process.env[key]; + } + return { ...safe, ...resolveEnv(configEnv ?? {}) }; +} /** Resolve ${VAR} references in env values against process.env. */ function resolveEnv(env) { const resolved = {}; @@ -210,9 +236,7 @@ async function getOrConnect(name, signal) { transport = new StdioClientTransport({ command: config.command, args: config.args, - env: config.env - ? 
{ ...process.env, ...resolveEnv(config.env) } - : undefined, + env: buildChildEnv(config.env), cwd: config.cwd, stderr: "pipe", }); @@ -234,23 +258,27 @@ async function getOrConnect(name, signal) { `Server "${config.name}" has unsupported transport: ${config.transport}`, ); } - await client.connect(transport, { signal, timeout: 30000 }); + try { + await client.connect(transport, { signal, timeout: 30000 }); + } catch (err) { + try { await transport.close(); } catch { /* best-effort */ } + try { await client.close(); } catch { /* best-effort */ } + throw err; + } connections.set(config.name, { client, transport }); return client; } async function closeAll() { const closing = Array.from(connections.entries()).map( async ([name, conn]) => { - try { - await conn.client.close(); - } catch { - // Best-effort cleanup - } + try { await conn.transport.close(); } catch { /* best-effort */ } + try { await conn.client.close(); } catch { /* best-effort */ } connections.delete(name); }, ); await Promise.allSettled(closing); toolCache.clear(); + autoRegisteredServers.clear(); } // ─── Formatters ─────────────────────────────────────────────────────────────── function formatServerList(servers) { @@ -312,31 +340,8 @@ export function getConnectionStatus(name) { }; } // ─── Test-exported helpers ──────────────────────────────────────────────────── -const SAFE_CHILD_ENV_KEYS = new Set([ - "PATH", - "HOME", - "USER", - "LOGNAME", - "SHELL", - "LANG", - "LC_ALL", - "LC_CTYPE", - "LC_MESSAGES", - "LC_NUMERIC", - "LC_TIME", - "TMPDIR", - "TMP", - "TEMP", - "TZ", - "TERM", - "COLORTERM", -]); export function _buildMcpChildEnvForTest(env) { - const safe = {}; - for (const key of SAFE_CHILD_ENV_KEYS) { - if (process.env[key] !== undefined) safe[key] = process.env[key]; - } - return { ...safe, ...resolveEnv(env) }; + return buildChildEnv(env); } export function _buildMcpTrustConfirmOptionsForTest(signal) { return { timeout: 120_000, signal }; diff --git 
a/src/resources/extensions/sf/auto-prompts.js b/src/resources/extensions/sf/auto-prompts.js index abb2a9add..eb7799ed1 100644 --- a/src/resources/extensions/sf/auto-prompts.js +++ b/src/resources/extensions/sf/auto-prompts.js @@ -78,6 +78,7 @@ import { buildSliceSummaryExcerpt, getDependencyTaskSummaryPaths, getPriorTaskSummaryPaths, + extractSliceExecutionExcerpt, } from "./summary-helpers.js"; import { composeInlinedContext } from "./unit-context-composer.js"; import { getUatType } from "./verdict-parser.js"; @@ -336,7 +337,7 @@ export function buildSourceFilePaths(base, mid, sid) { * If parsing fails (unrecognizable frontmatter, missing id, etc.) the * function falls back to `inlineFile` so the closer loses no information. */ -// Re-exported from summary-helpers.js: +// Imported from summary-helpers.js: // - buildSliceSummaryExcerpt, getPriorTaskSummaryPaths // - getDependencyTaskSummaryPaths, isSummaryCleanForSkip // - extractSliceExecutionExcerpt diff --git a/src/resources/extensions/sf/auto-verification.js b/src/resources/extensions/sf/auto-verification.js index 6961d2509..1b8024d8d 100644 --- a/src/resources/extensions/sf/auto-verification.js +++ b/src/resources/extensions/sf/auto-verification.js @@ -32,6 +32,10 @@ import { UokGateRunner } from "./uok/gate-runner.js"; import { MultiPackageGate } from "./uok/multi-package-gate.js"; import { OutcomeLearningGate } from "./uok/outcome-learning-gate.js"; import { SecurityGate } from "./uok/security-gate.js"; +import { + formatExecuteTaskRecoveryStatus, + inspectExecuteTaskDurability, +} from "./uok/unit-runtime.js"; import { extractVerdict } from "./verdict-parser.js"; import { writeVerificationJSON } from "./verification-evidence.js"; import { @@ -42,6 +46,38 @@ import { } from "./verification-gate.js"; import { logError, logWarning } from "./workflow-logger.js"; +function computeTokenCountFromSession(ctx) { + const entries = ctx.sessionManager?.getEntries?.() ?? 
[]; + let total = 0; + for (const entry of entries) { + if (entry.type !== "message") continue; + const msg = entry.message; + if (!msg || msg.role !== "assistant") continue; + if (msg.usage?.totalTokens != null) { + total += msg.usage.totalTokens; + } + } + return total; +} + +function getMemoryPressureMB() { + try { + const mem = process.memoryUsage(); + return Math.round(mem.heapUsed / 1024 / 1024); + } catch { + return undefined; + } +} + +function buildGateOutcomesSummary(gateIds, gateResults) { + if (!gateIds || !gateResults || gateIds.length === 0) return undefined; + const outcomes = {}; + for (let i = 0; i < gateIds.length; i++) { + outcomes[gateIds[i]] = gateResults[i]?.outcome ?? "unknown"; + } + return outcomes; +} + function isInfraVerificationFailure(stderr) { return /\b(ENOENT|ENOTFOUND|ETIMEDOUT|ECONNRESET|EAI_AGAIN|spawn\s+\S+\s+ENOENT|command not found)\b/i.test( stderr, @@ -259,6 +295,8 @@ export async function runPostUnitVerification(vctx, pauseAuto) { } // ── Zone 2: Ancillary post-gate work (inner try) ───────────────────────── // Failures here are non-fatal — evidence writes, UOK gate calls, notifications, retry logic. + let gateIds = []; + let gateResults = []; try { if (uokFlags.gates) { const gateRunner = new UokGateRunner(); @@ -304,8 +342,8 @@ export async function runPostUnitVerification(vctx, pauseAuto) { iteration: s.verificationRetryCount.get(s.currentUnit.id) ?? 0, }; - const gateIds = gateRunner.list().map((g) => g.id); - const gateResults = await Promise.all( + gateIds = gateRunner.list().map((g) => g.id); + gateResults = await Promise.all( gateIds.map((id) => gateRunner .run(id, { @@ -434,13 +472,39 @@ export async function runPostUnitVerification(vctx, pauseAuto) { } // Write verification evidence JSON const attempt = s.verificationRetryCount.get(s.currentUnit.id) ?? 
0; + const tokenCount = computeTokenCountFromSession(ctx); + const memoryPressureMB = getMemoryPressureMB(); + const gateOutcomes = buildGateOutcomesSummary(gateIds, gateResults); + let recoveryStatus; + try { + const durability = await inspectExecuteTaskDurability( + s.basePath, + s.currentUnit.id, + ); + if (durability) { + recoveryStatus = formatExecuteTaskRecoveryStatus(durability); + } + } catch { + recoveryStatus = undefined; + } if (mid && sid && tid) { try { const sDir = resolveSlicePath(s.basePath, mid, sid); if (sDir) { const tasksDir = join(sDir, "tasks"); if (result.passed) { - writeVerificationJSON(result, tasksDir, tid, s.currentUnit.id); + writeVerificationJSON( + result, + tasksDir, + tid, + s.currentUnit.id, + undefined, + undefined, + tokenCount, + memoryPressureMB, + gateOutcomes, + recoveryStatus, + ); } else { const nextAttempt = attempt + 1; writeVerificationJSON( @@ -450,6 +514,10 @@ export async function runPostUnitVerification(vctx, pauseAuto) { s.currentUnit.id, nextAttempt, maxRetries, + tokenCount, + memoryPressureMB, + gateOutcomes, + recoveryStatus, ); } } @@ -617,6 +685,10 @@ export async function runPostUnitVerification(vctx, pauseAuto) { postExecChecks, postExecBlockingFailure ? attempt + 1 : undefined, postExecBlockingFailure ? maxRetries : undefined, + tokenCount, + memoryPressureMB, + gateOutcomes, + recoveryStatus, ); } } catch (evidenceErr) { @@ -703,6 +775,10 @@ function writeVerificationJSONWithPostExec( postExecutionChecks, retryAttempt, maxRetries, + tokenCount, + memoryPressureMB, + gateOutcomes, + recoveryStatus, ) { mkdirSync(tasksDir, { recursive: true }); const evidence = { @@ -720,6 +796,10 @@ function writeVerificationJSONWithPostExec( })), ...(retryAttempt !== undefined ? { retryAttempt } : {}), ...(maxRetries !== undefined ? { maxRetries } : {}), + ...(tokenCount !== undefined ? { tokenCount } : {}), + ...(memoryPressureMB !== undefined ? { memoryPressureMB } : {}), + ...(gateOutcomes !== undefined ? 
{ gateOutcomes } : {}), + ...(recoveryStatus !== undefined ? { recoveryStatus } : {}), postExecutionChecks, }; if (result.runtimeErrors && result.runtimeErrors.length > 0) { diff --git a/src/resources/extensions/sf/auto.js b/src/resources/extensions/sf/auto.js index 648452524..a197c9632 100644 --- a/src/resources/extensions/sf/auto.js +++ b/src/resources/extensions/sf/auto.js @@ -211,6 +211,33 @@ export { // Tests in auto-session-encapsulation.test.ts enforce this invariant. // ───────────────────────────────────────────────────────────────────────────── const s = getAutoSession(); +/** Unsubscribe function for the Ctrl+C → pause intercept registered on autonomous start. */ +let _ctrlCUnsubscribe = null; +/** + * Register a terminal input listener that intercepts Ctrl+C while autonomous + * mode is active and routes the first press to pauseAuto() instead of letting + * it silently clear the editor. + * + * Purpose: give the user a reliable single-keypress escape from a running + * autonomous loop without requiring the double-press exit threshold. 
+ */ +function registerCtrlCInterceptor(ctx) { + _unregisterCtrlCInterceptor(); + if (typeof ctx?.ui?.onTerminalInput !== "function") return; + _ctrlCUnsubscribe = ctx.ui.onTerminalInput((data) => { + if (data !== "\x03") return undefined; + if (!s.active) return undefined; + ctx.ui.notify("Ctrl+C received — pausing autonomous mode.", "info"); + void pauseAuto(ctx, null, "ctrl-c-interrupt"); + return { consume: true }; + }); +} +function _unregisterCtrlCInterceptor() { + if (_ctrlCUnsubscribe) { + _ctrlCUnsubscribe(); + _ctrlCUnsubscribe = null; + } +} /** Throttle STATE.md rebuilds — at most once per 30 seconds */ const _STATE_REBUILD_MIN_INTERVAL_MS = 30_000; function captureProjectRootEnv(projectRoot) { @@ -704,6 +731,7 @@ function cleanupAfterLoopExit(ctx) { s.currentUnit = null; s.active = false; s.runControl = "manual"; + _unregisterCtrlCInterceptor(); deactivateSF(); clearUnitTimeout(); restoreProjectRootEnv(); @@ -747,6 +775,7 @@ function cleanupAfterLoopExit(ctx) { } export async function stopAuto(ctx, pi, reason) { if (!s.active && !s.paused) return; + _unregisterCtrlCInterceptor(); const loadedPreferences = loadEffectiveSFPreferences()?.preferences; const reasonSuffix = reason ? ` — ${reason}` : ""; try { @@ -1677,6 +1706,7 @@ export async function startAuto(ctx, pi, base, verboseMode, options) { s.runControl = requestedStepMode ? 
"assisted" : "autonomous"; s.cmdCtx = ctx; s.basePath = base; + registerCtrlCInterceptor(ctx); // Ensure the workflow-logger audit log is pinned to the project root // even when autonomous mode is entered via a path that bypasses the // bootstrap/dynamic-tools ensureDbOpen() → setLogBasePath() chain @@ -1943,6 +1973,7 @@ export async function dispatchHookUnit( s.autoStartTime = Date.now(); s.currentUnit = null; s.pendingQuickTasks = []; + registerCtrlCInterceptor(hookCtx); } const hookUnitType = `hook/${hookName}`; const hookStartedAt = Date.now(); diff --git a/src/resources/extensions/sf/commands/handlers/autonomous.js b/src/resources/extensions/sf/commands/handlers/autonomous.js index 667925d47..a308b130c 100644 --- a/src/resources/extensions/sf/commands/handlers/autonomous.js +++ b/src/resources/extensions/sf/commands/handlers/autonomous.js @@ -116,6 +116,10 @@ export async function handleAutonomousCommand(trimmed, ctx, pi) { }); return true; } + if (trimmed === "stop") { + await stopAutonomousRun(ctx, pi); + return true; + } if (isAutonomousVerb) { const autonomousArgsText = trimmed.replace(/^autonomous\b/, "").trim(); if (autonomousArgsText === "stop") { diff --git a/src/resources/extensions/sf/sf-db.js b/src/resources/extensions/sf/sf-db.js index 349071a64..96c02541f 100644 --- a/src/resources/extensions/sf/sf-db.js +++ b/src/resources/extensions/sf/sf-db.js @@ -4212,7 +4212,8 @@ function hasTaskSpecIntent(planning = {}) { } function insertTaskSpecIfAbsent(milestoneId, sliceId, taskId, planning = {}) { if (!hasTaskSpecIntent(planning)) return; - const frontmatter = taskFrontmatterFromRecord(planning).normalized; + const { normalized: frontmatter, errors } = taskFrontmatterFromRecord(planning); + if (errors?.length) logWarning("sf-db:insertTaskSpec", `frontmatter validation errors for ${milestoneId}/${sliceId}/${taskId}: ${errors.join(", ")}`); currentDb .prepare(`INSERT OR IGNORE INTO task_specs ( milestone_id, slice_id, task_id, verify, inputs, 
expected_output, @@ -4433,7 +4434,8 @@ export function setTaskBlockerDiscovered( export function upsertTaskPlanning(milestoneId, sliceId, taskId, planning) { if (!currentDb) throw new SFError(SF_STALE_STATE, "sf-db: No database open"); insertTaskSpecIfAbsent(milestoneId, sliceId, taskId, planning); - const frontmatter = taskFrontmatterFromRecord(planning).normalized; + const { normalized: frontmatter, errors: fmErrors } = taskFrontmatterFromRecord(planning); + if (fmErrors?.length) logWarning("sf-db:upsertTaskPlanning", `frontmatter validation errors for ${milestoneId}/${sliceId}/${taskId}: ${fmErrors.join(", ")}`); const hasTaskStatus = planning.taskStatus !== undefined || planning.task_status !== undefined || diff --git a/src/resources/extensions/sf/summary-helpers.js b/src/resources/extensions/sf/summary-helpers.js index d4e248b4e..aca5e7675 100644 --- a/src/resources/extensions/sf/summary-helpers.js +++ b/src/resources/extensions/sf/summary-helpers.js @@ -195,3 +195,44 @@ export function isSummaryCleanForSkip(content) { return false; } } + +function escapeRegExpLocal(value) { + return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); +} + +function extractMarkdownSectionLocal(content, heading) { + const match = new RegExp(`^## ${escapeRegExpLocal(heading)}\\s*$`, "m").exec(content); + if (!match) return null; + const start = match.index + match[0].length; + const rest = content.slice(start); + const nextHeading = rest.match(/^##\s+/m); + const end = nextHeading?.index ?? rest.length; + return rest.slice(0, end).trim(); +} + +/** + * Extract key sections from a slice PLAN.md for use in task execution prompts. + * Returns Goal, Demo, Verification, and Observability sections as a compact excerpt. + * + * Purpose: give task executors the slice-level contract without inlining the full plan. + * Consumer: auto-prompts.js buildExecuteTask*. 
+ */ +export function extractSliceExecutionExcerpt(content, relPath) { + if (!content) { + return [ + "## Slice Plan Excerpt", + `Slice plan not found at dispatch time. Read \`${relPath}\` before running slice-level verification.`, + ].join("\n"); + } + const lines = content.split("\n"); + const goalLine = lines.find((line) => line.startsWith("**Goal:**"))?.trim(); + const demoLine = lines.find((line) => line.startsWith("**Demo:**"))?.trim(); + const verification = extractMarkdownSectionLocal(content, "Verification"); + const observability = extractMarkdownSectionLocal(content, "Observability / Diagnostics"); + const parts = ["## Slice Plan Excerpt", `Source: \`${relPath}\``]; + if (goalLine) parts.push(goalLine); + if (demoLine) parts.push(demoLine); + if (verification) parts.push("", "### Slice Verification", verification.trim()); + if (observability) parts.push("", "### Slice Observability / Diagnostics", observability.trim()); + return parts.join("\n"); +} diff --git a/src/resources/extensions/sf/tests/uok-unit-runtime.test.mjs b/src/resources/extensions/sf/tests/uok-unit-runtime.test.mjs index 86453fa7a..a6f758716 100644 --- a/src/resources/extensions/sf/tests/uok-unit-runtime.test.mjs +++ b/src/resources/extensions/sf/tests/uok-unit-runtime.test.mjs @@ -13,6 +13,7 @@ import { clearRunawayRecoveredRuntimeRecords, clearUnitRuntimeRecord, decideUnitRuntimeDispatch, + getRecoveryDiagnostics, getUnitRuntimeState, isTerminalUnitRuntimeStatus, listUnitRuntimeRecords, @@ -377,3 +378,72 @@ test("listUnitRuntimeRecords_returns_empty_when_dir_missing", () => { const records = listUnitRuntimeRecords(root); assert.deepEqual(records, []); }); + +// ─── getRecoveryDiagnostics ──────────────────────────────────────────────── + +test("getRecoveryDiagnostics_returns_null_for_missing_record", () => { + const root = makeProject(); + const diagnostics = getRecoveryDiagnostics(root, "execute-task", "MISSING"); + assert.equal(diagnostics, null); +}); + 
+test("getRecoveryDiagnostics_returns_structured_object_for_record_with_recovery", () => { + const root = makeProject(); + const t = Date.now(); + writeUnitRuntimeRecord(root, "execute-task", "M001/S01/T01", t, { + status: "failed", + recoveryAttempts: 2, + retryCount: 2, + maxRetries: 3, + lastRecoveryReason: "timeout", + progressCount: 5, + lastProgressKind: "checkpoint", + lineageEvent: { + status: "started", + workerSessionId: "worker-1", + }, + }); + const diagnostics = getRecoveryDiagnostics( + root, + "execute-task", + "M001/S01/T01", + ); + assert.ok(diagnostics); + assert.equal(diagnostics.unitType, "execute-task"); + assert.equal(diagnostics.unitId, "M001/S01/T01"); + assert.equal(diagnostics.status, "failed"); + assert.equal(diagnostics.retryCount, 2); + assert.equal(diagnostics.maxRetries, 3); + assert.equal(diagnostics.lastRecoveryReason, "timeout"); + assert.equal(diagnostics.progressCount, 5); + assert.equal(diagnostics.lastProgressKind, "checkpoint"); + assert.equal(diagnostics.recoveryAttempts, 2); + assert.ok(diagnostics.lineageSummary); + assert.equal(diagnostics.lineageSummary.status, "started"); + assert.equal(diagnostics.lineageSummary.workerCount, 1); + assert.equal(diagnostics.lineageSummary.eventCount, 1); + assert.equal(diagnostics.startedAt, t); + assert.ok(diagnostics.updatedAt); +}); + +test("getRecoveryDiagnostics_returns_minimal_object_for_record_without_recovery", () => { + const root = makeProject(); + const t = Date.now(); + writeUnitRuntimeRecord(root, "execute-task", "M001/S01/T02", t, { + status: "running", + }); + const diagnostics = getRecoveryDiagnostics( + root, + "execute-task", + "M001/S01/T02", + ); + assert.ok(diagnostics); + assert.equal(diagnostics.status, "running"); + assert.equal(diagnostics.retryCount, 0); + assert.equal(diagnostics.maxRetries, 1); + assert.equal(diagnostics.lastRecoveryReason, null); + assert.equal(diagnostics.progressCount, 0); + assert.equal(diagnostics.lastProgressKind, "dispatch"); + 
assert.equal(diagnostics.recoveryAttempts, 0); + assert.equal(diagnostics.lineageSummary, null); +}); diff --git a/src/resources/extensions/sf/uok/unit-runtime.d.ts b/src/resources/extensions/sf/uok/unit-runtime.d.ts new file mode 100644 index 000000000..19868b577 --- /dev/null +++ b/src/resources/extensions/sf/uok/unit-runtime.d.ts @@ -0,0 +1,32 @@ +/** + * Type declarations for unit-runtime.js + */ + +export interface RecoveryDiagnostics { + unitType: string; + unitId: string; + status: string; + retryCount: number; + maxRetries: number; + lastRecoveryReason: string | null; + progressCount: number; + lastProgressKind: string; + recoveryAttempts: number; + lineageSummary: { + status: string; + workerCount: number; + eventCount: number; + } | null; + updatedAt: number | null; + startedAt: number | null; +} + +export function getRecoveryDiagnostics( + basePath: string, + unitType: string, + unitId: string, +): RecoveryDiagnostics | null; + +export function listUnitRuntimeRecords( + basePath: string, +): Array & { updatedAt?: number; unitId: string }>; diff --git a/src/resources/extensions/sf/uok/unit-runtime.js b/src/resources/extensions/sf/uok/unit-runtime.js index c0b56c918..468a7455a 100644 --- a/src/resources/extensions/sf/uok/unit-runtime.js +++ b/src/resources/extensions/sf/uok/unit-runtime.js @@ -582,6 +582,43 @@ export function formatExecuteTaskRecoveryStatus(status) { ? missing.join("; ") : "all durable task artifacts present"; } + +/** + * Read the runtime record for a unit and return structured recovery diagnostics. + * + * Purpose: surface runtime record state for post-mortem debugging of autonomous + * failures without requiring humans to parse `.sf/runtime/units/*.json` manually. + * + * Consumer: `sf status recovery` CLI command and verification evidence enrichment. 
+ */ +export function getRecoveryDiagnostics(basePath, unitType, unitId) { + const record = readUnitRuntimeRecord(basePath, unitType, unitId); + if (!record) { + return null; + } + const state = getUnitRuntimeState(record); + const lineageSummary = record.lineage + ? { + status: record.lineage.status, + workerCount: record.lineage.workerSessionIds?.length ?? 0, + eventCount: record.lineage.events?.length ?? 0, + } + : null; + return { + unitType, + unitId, + status: state.status, + retryCount: state.retryCount, + maxRetries: state.maxRetries, + lastRecoveryReason: record.lastRecoveryReason ?? null, + progressCount: record.progressCount ?? 0, + lastProgressKind: record.lastProgressKind ?? "dispatch", + recoveryAttempts: record.recoveryAttempts ?? 0, + lineageSummary, + updatedAt: record.updatedAt ?? null, + startedAt: record.startedAt ?? null, + }; +} // ─── Stale slice runtime record reconciliation ────────────────────────────── /** * Clear unit runtime records for complete-slice units that are in a terminal diff --git a/src/resources/extensions/sf/verification-evidence.js b/src/resources/extensions/sf/verification-evidence.js index 890af0cbb..bb34dd01d 100644 --- a/src/resources/extensions/sf/verification-evidence.js +++ b/src/resources/extensions/sf/verification-evidence.js @@ -24,6 +24,10 @@ export function writeVerificationJSON( unitId, retryAttempt, maxRetries, + tokenCount, + memoryPressureMB, + gateOutcomes, + recoveryStatus, ) { mkdirSync(tasksDir, { recursive: true }); const evidence = { @@ -41,6 +45,10 @@ export function writeVerificationJSON( })), ...(retryAttempt !== undefined ? { retryAttempt } : {}), ...(maxRetries !== undefined ? { maxRetries } : {}), + ...(tokenCount !== undefined ? { tokenCount } : {}), + ...(memoryPressureMB !== undefined ? { memoryPressureMB } : {}), + ...(gateOutcomes !== undefined ? { gateOutcomes } : {}), + ...(recoveryStatus !== undefined ? 
{ recoveryStatus } : {}), }; if (result.runtimeErrors && result.runtimeErrors.length > 0) { evidence.runtimeErrors = result.runtimeErrors.map((e) => ({ diff --git a/todo.md b/todo.md new file mode 100644 index 000000000..cbd72a7ae --- /dev/null +++ b/todo.md @@ -0,0 +1,53 @@ +# TODO + +Unimplemented items consolidated from root *.md files. Source file noted for each item. + +--- + +## Critical / Correctness + +- [x] Port `fix(security): harden project-controlled surfaces` — env isolation + transport cleanup done; gsd-2 trust/dedup hunks (server.ts, mcp-client/index.ts) not applicable (packages absent) *(BUILD_PLAN.md Tier 0.5 #2)* +- [ ] Port agent-session/agent-end transition fixes (gsd-2 `71114fccf`, `6d7e4gcb5`, `c162c44bf`, `e3bd04551`) *(BUILD_PLAN.md Tier 0.5 #7-10, UPSTREAM_CHERRY_PICK_CANDIDATES.md Cluster B)* +- [ ] Cloudflare Workers AI provider — `CLOUDFLARE_API_KEY`/`CLOUDFLARE_ACCOUNT_ID` (pi-mono PR #3851) *(BUILD_PLAN.md Tier 0 #8)* + +--- + +## Architecture / Design Gaps + +- [ ] Schema reconciliation: update SPEC.md to 3-table model (milestones/slices/tasks vs single `units`) *(BUILD_PLAN.md Tier 1.3)* +- [ ] Persistent agents v1 command surface — `/sf agent run|reset|delete|inspect` *(BUILD_PLAN.md Tier 2.1)* +- [ ] Intent chapters (`chapter_open`/`chapter_close` — crash-resume context) *(BUILD_PLAN.md Tier 2.3)* +- [ ] PhaseReview 3-pass review (establish-context → parallel chunked → synthesis) *(BUILD_PLAN.md Tier 2.4)* +- [ ] `last_error` cap to 4 KB head+tail; full payload to file *(BUILD_PLAN.md Tier 2.6)* +- [ ] Port workflow state machine hardening (gsd-2 `f2377eedd`, `b9a1c6743`, `153fb328a`, `381ccdef5`, `371b2eb31`) *(BUILD_PLAN.md Tier 0.5 #13, UPSTREAM_CHERRY_PICK_CANDIDATES.md Cluster F)* +- [ ] Port `fix(claude-code-cli): persist Always Allow for non-Bash tools` (gsd-2 `a88baeae9`) *(BUILD_PLAN.md Tier 0.5 #11)* + +--- + +## Medium Priority / Quality + +- [ ] Replace `isHeavyModelId()` name-matching heuristic with capability-based 
check *(PRODUCTION_AUDIT_GRADE.md #9, PRODUCTION_AUDIT.md 3.3)* +- [ ] Add `version` field to task frontmatter and mode state (schema versioning) *(PRODUCTION_AUDIT_GRADE.md #8)* +- [ ] Integration tests for full remote steering pipeline *(PRODUCTION_AUDIT.md Long Term #10)* +- [x] Log `frontmatterErrors` in sf-db.js instead of silently dropping validation errors *(PRODUCTION_AUDIT.md 3.1)* +- [ ] Search provider registry refactor — consolidate provider list across files into `SearchProviderRegistry` *(BUILD_PLAN.md Tier 1+)* +- [ ] Update ARCHITECTURE.md self-evolution section (triage pipeline IS active; injection IS automatic now) *(ARCHITECTURE.md)* +- [ ] Add Mermaid state machine diagram to ARCHITECTURE.md *(ARCHITECTURE.md)* +- [ ] Symlinked packages/resources/skills/sessions dedup (pi-mono PR #3818) *(BUILD_PLAN.md Tier 0 #6)* + +--- + +## Long-term / Deferred + +- [ ] Singularity Knowledge + Agent Platform (Go re-platform, ~12 weeks) *(BUILD_PLAN.md Tier 1+)* +- [ ] sf-worker SSH host (Go, `wish` + `xpty`, ~3 weeks) *(BUILD_PLAN.md Tier 4)* +- [ ] Charm TUI client (`sf-tui` in Go, ~12-16 weeks) *(BUILD_PLAN.md Tier 1+)* +- [ ] Flight recorder (`x/vcr`, ~3 weeks) *(BUILD_PLAN.md Tier 1+)* +- [ ] Full swarm chat for `subagent` tool (Option C, depends on persistent-agent layer) *(BUILD_PLAN.md Tier 1+)* +- [ ] Caveman input-side prompt compression (rewrite execute-task/plan-slice prompts) *(BUILD_PLAN.md Tier 1+)* +- [ ] Runtime input preprocessor (`terse_prompts: true` dispatch transform, ~3-4 days) *(BUILD_PLAN.md Tier 1+)* +- [ ] Judge calibration + eval runner service (Go/Charm, ~2-3 weeks post SM) *(BUILD_PLAN.md Tier 1+)* +- [ ] M009 promote-only adoption review — create `sf schedule` entry (2 weeks after M009 close) *(BACKLOG.md)* +- [ ] Establish pi-mono SDK sync cadence (recurring check schedule) *(BUILD_PLAN.md Tier 1+)* +- [ ] `scripts/port-from-gsd2.sh` automation script *(UPSTREAM_PORT_GUIDE.md)* +- [ ] TypeScript migration for UOK modules 
(`kernel.js`, etc.) *(PRODUCTION_AUDIT_COMPLETE.md, PRODUCTION_AUDIT_GRADE.md)*