diff --git a/.sf/backups/db/maintenance.json b/.sf/backups/db/maintenance.json new file mode 100644 index 000000000..31c31263f --- /dev/null +++ b/.sf/backups/db/maintenance.json @@ -0,0 +1,3 @@ +{ + "lastFullVacuumAt": "2026-05-08T20:15:21.317Z" +} diff --git a/.sf/backups/db/sf.db.2026-05-08T20-20-34-822Z b/.sf/backups/db/sf.db.2026-05-08T20-20-34-822Z new file mode 100644 index 000000000..eac12c10d Binary files /dev/null and b/.sf/backups/db/sf.db.2026-05-08T20-20-34-822Z differ diff --git a/.sf/backups/db/sf.db.2026-05-08T20-44-13-669Z b/.sf/backups/db/sf.db.2026-05-08T20-44-13-669Z new file mode 100644 index 000000000..eac12c10d Binary files /dev/null and b/.sf/backups/db/sf.db.2026-05-08T20-44-13-669Z differ diff --git a/.sf/backups/db/sf.db.2026-05-08T22-14-57-817Z b/.sf/backups/db/sf.db.2026-05-08T22-14-57-817Z new file mode 100644 index 000000000..c6a3e98de Binary files /dev/null and b/.sf/backups/db/sf.db.2026-05-08T22-14-57-817Z differ diff --git a/.sf/backups/db/sf.db.2026-05-08T22-42-32-307Z b/.sf/backups/db/sf.db.2026-05-08T22-42-32-307Z new file mode 100644 index 000000000..e78b0d5e7 Binary files /dev/null and b/.sf/backups/db/sf.db.2026-05-08T22-42-32-307Z differ diff --git a/.sf/backups/db/sf.db.20260508-220250 b/.sf/backups/db/sf.db.20260508-220250 new file mode 100644 index 000000000..c8eda44e7 Binary files /dev/null and b/.sf/backups/db/sf.db.20260508-220250 differ diff --git a/.sf/recovery/sf.db-shm.corrupt-20260508-220021 b/.sf/recovery/sf.db-shm.corrupt-20260508-220021 new file mode 100644 index 000000000..fe9ac2845 Binary files /dev/null and b/.sf/recovery/sf.db-shm.corrupt-20260508-220021 differ diff --git a/.sf/recovery/sf.db-shm.replaced-corrupt-20260508-220115 b/.sf/recovery/sf.db-shm.replaced-corrupt-20260508-220115 new file mode 100644 index 000000000..3e1e8c30a Binary files /dev/null and b/.sf/recovery/sf.db-shm.replaced-corrupt-20260508-220115 differ diff --git a/.sf/recovery/sf.db-wal.corrupt-20260508-220021 
b/.sf/recovery/sf.db-wal.corrupt-20260508-220021 new file mode 100644 index 000000000..e69de29bb diff --git a/.sf/recovery/sf.db-wal.replaced-corrupt-20260508-220115 b/.sf/recovery/sf.db-wal.replaced-corrupt-20260508-220115 new file mode 100644 index 000000000..e69de29bb diff --git a/.sf/recovery/sf.db.corrupt-20260508-220021 b/.sf/recovery/sf.db.corrupt-20260508-220021 new file mode 100644 index 000000000..0e5cd30da Binary files /dev/null and b/.sf/recovery/sf.db.corrupt-20260508-220021 differ diff --git a/.sf/recovery/sf.db.recovered-20260508-220103 b/.sf/recovery/sf.db.recovered-20260508-220103 new file mode 100644 index 000000000..fc304921f Binary files /dev/null and b/.sf/recovery/sf.db.recovered-20260508-220103 differ diff --git a/.sf/recovery/sf.db.replaced-corrupt-20260508-220115 b/.sf/recovery/sf.db.replaced-corrupt-20260508-220115 new file mode 100644 index 000000000..0e5cd30da Binary files /dev/null and b/.sf/recovery/sf.db.replaced-corrupt-20260508-220115 differ diff --git a/.sf/recovery/stray-root-20260509-023724/backups/db/maintenance.json b/.sf/recovery/stray-root-20260509-023724/backups/db/maintenance.json new file mode 100644 index 000000000..7fb4dbdd2 --- /dev/null +++ b/.sf/recovery/stray-root-20260509-023724/backups/db/maintenance.json @@ -0,0 +1,3 @@ +{ + "lastFullVacuumAt": "2026-05-08T20:29:49.200Z" +} diff --git a/.sf/recovery/stray-root-20260509-023724/backups/db/sf.db.2026-05-08T20-29-49-149Z b/.sf/recovery/stray-root-20260509-023724/backups/db/sf.db.2026-05-08T20-29-49-149Z new file mode 100644 index 000000000..1d1f0d452 Binary files /dev/null and b/.sf/recovery/stray-root-20260509-023724/backups/db/sf.db.2026-05-08T20-29-49-149Z differ diff --git a/.sf/recovery/stray-root-20260509-023724/global b/.sf/recovery/stray-root-20260509-023724/global new file mode 100644 index 000000000..ef76ebfb4 Binary files /dev/null and b/.sf/recovery/stray-root-20260509-023724/global differ diff --git a/package-lock.json b/package-lock.json index 
8296d27d3..e26f919ff 100644 --- a/package-lock.json +++ b/package-lock.json @@ -5708,6 +5708,10 @@ "node_modules/@singularity-forge/engine-win32-x64-msvc": { "optional": true }, + "node_modules/@singularity-forge/google-gemini-cli-provider": { + "resolved": "packages/google-gemini-cli-provider", + "link": true + }, "node_modules/@singularity-forge/native": { "resolved": "packages/native", "link": true @@ -14618,6 +14622,16 @@ "url": "https://github.com/sponsors/colinhacks" } }, + "packages/google-gemini-cli-provider": { + "name": "@singularity-forge/google-gemini-cli-provider", + "version": "2.75.3", + "dependencies": { + "@google/gemini-cli-core": "0.40.1" + }, + "engines": { + "node": ">=26.1.0" + } + }, "packages/native": { "name": "@singularity-forge/native", "version": "2.75.3", @@ -14651,6 +14665,7 @@ "@google/genai": "^1.40.0", "@mistralai/mistralai": "^2.2.1", "@sinclair/typebox": "^0.34.41", + "@singularity-forge/google-gemini-cli-provider": "^2.75.3", "ajv": "^8.17.1", "ajv-formats": "^3.0.1", "chalk": "^5.6.2", diff --git a/package.json b/package.json index a5f110eb4..ff9e3a2d1 100644 --- a/package.json +++ b/package.json @@ -48,7 +48,8 @@ "build:pi-coding-agent": "npm --workspace @singularity-forge/pi-coding-agent run build", "build:native-pkg": "npm --workspace @singularity-forge/native run build", "build:rpc-client": "npm --workspace @singularity-forge/rpc-client run build", - "build:pi": "npm run build:native-pkg && npm run build:pi-tui && npm run build:pi-ai && npm run build:pi-agent-core && npm run build:pi-coding-agent", + "build:google-gemini-cli-provider": "npm --workspace @singularity-forge/google-gemini-cli-provider run build", + "build:pi": "npm run build:native-pkg && npm run build:pi-tui && npm run build:google-gemini-cli-provider && npm run build:pi-ai && npm run build:pi-agent-core && npm run build:pi-coding-agent", "build:daemon": "npm --workspace @singularity-forge/daemon run build", "build:core": "npm run build:pi && npm run 
build:rpc-client && npm run build:daemon && npm run check:versioned-json && tsc && npm run copy-resources && npm run copy-themes && npm run copy-export-html", "build": "npm run build:core && node scripts/build-web-if-stale.cjs", diff --git a/packages/google-gemini-cli-provider/package.json b/packages/google-gemini-cli-provider/package.json new file mode 100644 index 000000000..263a0d7c1 --- /dev/null +++ b/packages/google-gemini-cli-provider/package.json @@ -0,0 +1,23 @@ +{ + "name": "@singularity-forge/google-gemini-cli-provider", + "version": "2.75.3", + "description": "Gemini CLI Core transport helper for SF providers", + "type": "module", + "main": "./dist/index.js", + "types": "./dist/index.d.ts", + "exports": { + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js" + } + }, + "scripts": { + "build": "tsc -p tsconfig.json" + }, + "dependencies": { + "@google/gemini-cli-core": "0.40.1" + }, + "engines": { + "node": ">=26.1.0" + } +} diff --git a/packages/google-gemini-cli-provider/src/index.test.ts b/packages/google-gemini-cli-provider/src/index.test.ts new file mode 100644 index 000000000..e270a22f2 --- /dev/null +++ b/packages/google-gemini-cli-provider/src/index.test.ts @@ -0,0 +1,40 @@ +import assert from "node:assert/strict"; +import { describe, test, vi } from "vitest"; + +const helperState = vi.hoisted(() => ({ + authType: undefined as unknown, + configParams: undefined as Record | undefined, +})); + +vi.mock("@google/gemini-cli-core", () => ({ + AuthType: { LOGIN_WITH_GOOGLE: "LOGIN_WITH_GOOGLE" }, + makeFakeConfig: vi.fn((params: Record) => { + helperState.configParams = params; + return { params }; + }), +})); + +vi.mock("@google/gemini-cli-core/dist/src/core/contentGenerator.js", () => ({ + createContentGeneratorConfig: vi.fn(async (_config, authType) => { + helperState.authType = authType; + return { authType }; + }), + createContentGenerator: vi.fn(async () => ({ + async generateContentStream(): Promise> { + return (async function* 
emptyStream() {})(); + }, + })), +})); + +import { createGeminiCliContentGenerator } from "./index.js"; + +describe("google-gemini-cli-provider", () => { + test("createGeminiCliContentGenerator_uses_google_login_auth", async () => { + await createGeminiCliContentGenerator({ modelId: "gemini-3-pro" }); + + assert.equal(helperState.authType, "LOGIN_WITH_GOOGLE"); + assert.equal(helperState.configParams?.model, "gemini-3-pro"); + assert.equal(helperState.configParams?.cwd, process.cwd()); + assert.equal(helperState.configParams?.targetDir, process.cwd()); + }); +}); diff --git a/packages/google-gemini-cli-provider/src/index.ts b/packages/google-gemini-cli-provider/src/index.ts new file mode 100644 index 000000000..0672a551b --- /dev/null +++ b/packages/google-gemini-cli-provider/src/index.ts @@ -0,0 +1,48 @@ +/** + * Google Gemini CLI transport helper. + * + * Purpose: keep the Gemini CLI Core auth and content-generator wiring in a + * dedicated workspace package so provider code can depend on one small helper + * instead of embedding the upstream integration inline. + * + * Consumer: `@singularity-forge/pi-ai` Google Gemini provider. + */ +import { + AuthType, + makeFakeConfig, +} from "@google/gemini-cli-core"; +import { + createContentGenerator, + createContentGeneratorConfig, + type ContentGenerator, +} from "@google/gemini-cli-core/dist/src/core/contentGenerator.js"; + +export interface GeminiCliContentGeneratorOptions { + modelId: string; + cwd?: string; + targetDir?: string; +} + +/** + * Create a Gemini CLI Core content generator for a model. + * + * Purpose: centralize the Code Assist setup and OAuth bootstrap logic in a + * reusable package so SF's Gemini provider can stay focused on stream shaping. + * + * Consumer: the Google Gemini provider in pi-ai. + */ +export async function createGeminiCliContentGenerator( + options: GeminiCliContentGeneratorOptions, +): Promise { + const cwd = options.cwd ?? 
process.cwd(); + const config = makeFakeConfig({ + model: options.modelId, + cwd, + targetDir: options.targetDir ?? cwd, + }); + const generatorConfig = await createContentGeneratorConfig( + config, + AuthType.LOGIN_WITH_GOOGLE, + ); + return createContentGenerator(generatorConfig, config); +} diff --git a/packages/google-gemini-cli-provider/tsconfig.json b/packages/google-gemini-cli-provider/tsconfig.json new file mode 100644 index 000000000..4aca0ff22 --- /dev/null +++ b/packages/google-gemini-cli-provider/tsconfig.json @@ -0,0 +1,28 @@ +{ + "compilerOptions": { + "target": "ES2024", + "module": "Node16", + "lib": ["ES2024"], + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "incremental": true, + "forceConsistentCasingInFileNames": true, + "declaration": true, + "declarationMap": true, + "sourceMap": true, + "inlineSources": true, + "inlineSourceMap": false, + "moduleResolution": "Node16", + "resolveJsonModule": true, + "allowImportingTsExtensions": false, + "experimentalDecorators": true, + "emitDecoratorMetadata": true, + "useDefineForClassFields": false, + "types": ["node"], + "outDir": "./dist", + "rootDir": "./src" + }, + "include": ["src/**/*.ts"], + "exclude": ["node_modules", "dist", "**/*.d.ts", "src/**/*.d.ts"] +} diff --git a/packages/pi-ai/package.json b/packages/pi-ai/package.json index 9733cf511..1d9594378 100644 --- a/packages/pi-ai/package.json +++ b/packages/pi-ai/package.json @@ -29,6 +29,7 @@ "@google/gemini-cli-core": "0.40.1", "@google/genai": "^1.40.0", "@mistralai/mistralai": "^2.2.1", + "@singularity-forge/google-gemini-cli-provider": "^2.75.3", "@sinclair/typebox": "^0.34.41", "ajv": "^8.17.1", "ajv-formats": "^3.0.1", diff --git a/packages/pi-ai/src/providers/google-gemini-cli.test.ts b/packages/pi-ai/src/providers/google-gemini-cli.test.ts index e49d9f25e..509ff1b3b 100644 --- a/packages/pi-ai/src/providers/google-gemini-cli.test.ts +++ b/packages/pi-ai/src/providers/google-gemini-cli.test.ts @@ -5,41 +5,34 @@ 
import type { Context, Model } from "../types.js"; const geminiCliCore = vi.hoisted(() => ({ retryError: undefined as Error | undefined, retryOptions: undefined as Record | undefined, - fakeConfigParams: undefined as Record | undefined, - generatorAuthType: undefined as unknown, + helperArgs: undefined as Record | undefined, })); vi.mock("@google/gemini-cli-core", () => ({ - AuthType: { LOGIN_WITH_GOOGLE: "LOGIN_WITH_GOOGLE" }, CodeAssistServer: class { async generateContentStream(): Promise> { return (async function* emptyStream() {})(); } }, - getOauthClient: vi.fn(async () => ({})), - makeFakeConfig: vi.fn((params: Record) => { - geminiCliCore.fakeConfigParams = params; - return { params }; - }), retryWithBackoff: vi.fn( async (_fn: unknown, options: Record) => { geminiCliCore.retryOptions = options; throw geminiCliCore.retryError ?? new Error("quota exhausted"); }, ), - setupUser: vi.fn(async () => ({ projectId: "test-project" })), })); -vi.mock("@google/gemini-cli-core/dist/src/core/contentGenerator.js", () => ({ - createContentGeneratorConfig: vi.fn(async (_config, authType) => { - geminiCliCore.generatorAuthType = authType; - return { authType }; - }), - createContentGenerator: vi.fn(async () => ({ - async generateContentStream(): Promise> { - return (async function* emptyStream() {})(); +vi.mock("@singularity-forge/google-gemini-cli-provider", () => ({ + createGeminiCliContentGenerator: vi.fn( + async (args: Record) => { + geminiCliCore.helperArgs = args; + return { + async generateContentStream(): Promise> { + return (async function* emptyStream() {})(); + }, + }; }, - })), + ), })); import { streamGoogleGeminiCli } from "./google-gemini-cli.js"; @@ -82,12 +75,7 @@ describe("google-gemini-cli provider retry ownership", () => { | { maxAttempts?: unknown } | undefined; assert.equal(retryOptions?.maxAttempts, 1); - assert.equal( - geminiCliCore.fakeConfigParams?.model, - "gemini-3-flash-preview", - ); - assert.equal(geminiCliCore.fakeConfigParams?.clientName, 
undefined); - assert.equal(geminiCliCore.generatorAuthType, "LOGIN_WITH_GOOGLE"); + assert.equal(geminiCliCore.helperArgs?.modelId, "gemini-3-flash-preview"); assert.equal(result.stopReason, "error"); assert.match(result.errorMessage ?? "", /exhausted your capacity/i); assert.equal(result.retryAfterMs, 54_000); diff --git a/packages/pi-ai/src/providers/google-gemini-cli.ts b/packages/pi-ai/src/providers/google-gemini-cli.ts index ee6599682..f87e73bc1 100644 --- a/packages/pi-ai/src/providers/google-gemini-cli.ts +++ b/packages/pi-ai/src/providers/google-gemini-cli.ts @@ -1,24 +1,12 @@ /** * Google Gemini CLI provider. * - * Delegates auth, project discovery, and the Code Assist transport to - * @google/gemini-cli-core — the library behind Google's Gemini tooling. - * cli-core reads ~/.gemini/oauth_creds.json itself when present, refreshes tokens, - * discovers the project (free-tier or whatever's onboarded server-side) - * via setupUser(), and handles all the User-Agent / quota-classification details. + * Delegates auth, project discovery, and the Code Assist transport setup to + * the dedicated google-gemini-cli-provider package. * Request retry/fallback stays in the caller so SF can move to the next model. */ -import { - AuthType, - makeFakeConfig, - retryWithBackoff, -} from "@google/gemini-cli-core"; -import type { ContentGenerator } from "@google/gemini-cli-core/dist/src/core/contentGenerator.js"; -import { - createContentGenerator, - createContentGeneratorConfig, -} from "@google/gemini-cli-core/dist/src/core/contentGenerator.js"; +import { retryWithBackoff } from "@google/gemini-cli-core"; import type { Content, GenerateContentParameters, @@ -55,6 +43,7 @@ import { isAutoReasoning, resolveReasoningLevel, } from "./simple-options.js"; +import { createGeminiCliContentGenerator } from "@singularity-forge/google-gemini-cli-provider"; /** * Thinking level for Gemini 3 models. 
@@ -73,7 +62,8 @@ export type GoogleThinkingLevel = /** * Options for `streamGoogleGeminiCli()`. * - * Delegates auth to cli-core (reads ~/.gemini/oauth_creds.json via `getOauthClient()`); + * Delegates auth to the helper package (reads ~/.gemini/oauth_creds.json via + * Gemini CLI Core's transport setup); * `projectId` is auto-discovered and not used by this provider (apiKey is ignored). * Thinking is configured separately from base `StreamOptions` because Gemini 2 and 3 * models use incompatible enum formats (budgetTokens vs. level). @@ -100,30 +90,6 @@ export interface GoogleGeminiCliOptions extends StreamOptions { // Counter for generating unique tool call IDs let toolCallCounter = 0; -/** - * Build a Code Assist content generator using cli-core's official content-generator path. - * - * Upstream Gemini CLI does not instantiate CodeAssistServer directly from the - * caller. It creates a ContentGeneratorConfig, lets createContentGenerator() - * build the GeminiCLI User-Agent and transport headers, then delegates to - * createCodeAssistContentGenerator() for OAuth, setupUser(), and Code Assist. - * - * Both calls memoize internally inside cli-core — repeat invocations are - * cheap. - */ -async function getCodeAssistServer(modelId: string): Promise { - const config = makeFakeConfig({ - model: modelId, - cwd: process.cwd(), - targetDir: process.cwd(), - }); - const generatorConfig = await createContentGeneratorConfig( - config, - AuthType.LOGIN_WITH_GOOGLE, - ); - return createContentGenerator(generatorConfig, config); -} - function parseDurationMs(value: string): number | undefined { const match = value.match(/(?:(\d+)h)?(?:(\d+)m)?(?:(\d+)s)?/i); if (!match || !match[0]) return undefined; @@ -178,14 +144,14 @@ function isGemini3Model(modelId: string): boolean { } /** - * Stream a chat completion from Google Gemini via the cli-core transport. + * Stream a chat completion from Google Gemini via the helper package and cli-core transport. 
* - * Auth is handled transparently by cli-core (`getCodeAssistServer()` reads OAuth creds from - * ~/.gemini/oauth_creds.json and triggers browser OAuth on first run). Project ID is auto-discovered - * from the Code Assist API; `apiKey` is ignored. Casting the request as `any` works around the fact - * that cli-core bundles its own nested `@google/genai` copy (nominal type split at packaging time; - * runtime shapes are byte-identical). Returns a real-time stream emitting start, delta, end, and - * error events that accumulate into an `AssistantMessage`. + * The helper package owns the OAuth/bootstrap path against `@google/gemini-cli-core`, including + * `~/.gemini/oauth_creds.json` and Gemini Code Assist project discovery. `apiKey` is ignored. + * Casting the request as `any` works around the fact that cli-core bundles its own nested + * `@google/genai` copy (nominal type split at packaging time; runtime shapes are byte-identical). + * Returns a real-time stream emitting start, delta, end, and error events that accumulate into + * an `AssistantMessage`. */ export const streamGoogleGeminiCli: StreamFunction< "google-gemini-cli", @@ -222,9 +188,10 @@ export const streamGoogleGeminiCli: StreamFunction< if (nextReq !== undefined) { req = nextReq as GenerateContentParameters; } - // cli-core handles auth + project discovery. SF uses cli-core directly - // and does not spawn a separate provider CLI process. - const server = await getCodeAssistServer(req.model); + // cli-core handles auth + project discovery through the helper package. + const server = await createGeminiCliContentGenerator({ + modelId: req.model, + }); const promptId = `pi-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`; // Cast through `any` — cli-core bundles its own nested @google/genai copy, // so TypeScript sees two structurally-identical-but-distinct Content types. 
@@ -233,7 +200,6 @@ export const streamGoogleGeminiCli: StreamFunction< const streamGen = await retryWithBackoff( () => server.generateContentStream(req as any, promptId, "USER" as any), { - authType: AuthType.LOGIN_WITH_GOOGLE, // SF owns cross-model fallback. Let cli-core classify quota errors, // but do not let it hold the turn through its 10-attempt retry loop. maxAttempts: 1, diff --git a/packages/pi-coding-agent/src/cli/args.test.ts b/packages/pi-coding-agent/src/cli/args.test.ts index 23c10a142..2d83b9f90 100644 --- a/packages/pi-coding-agent/src/cli/args.test.ts +++ b/packages/pi-coding-agent/src/cli/args.test.ts @@ -5,15 +5,15 @@ import { parseArgs } from "./args.js"; describe("parseArgs", () => { it("parses optional-value extension flags with implicit and explicit values", () => { const extensionFlags = new Map([ - ["genai-proxy", { type: "string" as const, allowNoValue: true }], + ["demo-flag", { type: "string" as const, allowNoValue: true }], ]); - const defaultFlagArgs = parseArgs(["--genai-proxy"], extensionFlags); - const explicitFlagArgs = parseArgs(["--genai-proxy=8080"], extensionFlags); + const defaultFlagArgs = parseArgs(["--demo-flag"], extensionFlags); + const explicitFlagArgs = parseArgs(["--demo-flag=8080"], extensionFlags); assert.deepEqual( [ - defaultFlagArgs.unknownFlags.get("genai-proxy"), - explicitFlagArgs.unknownFlags.get("genai-proxy"), + defaultFlagArgs.unknownFlags.get("demo-flag"), + explicitFlagArgs.unknownFlags.get("demo-flag"), ], [true, "8080"], ); diff --git a/packages/pi-tui/src/components/editor.ts b/packages/pi-tui/src/components/editor.ts index 574e69ad8..9bd6f9d04 100644 --- a/packages/pi-tui/src/components/editor.ts +++ b/packages/pi-tui/src/components/editor.ts @@ -190,7 +190,7 @@ export class Editor implements Component, Focusable { private autocompleteDebounceTimer: ReturnType | null = null; private lastAutocompleteLookupPrefix: string | null = null; - private static readonly AUTOCOMPLETE_DEBOUNCE_MS = 150; + 
private static readonly AUTOCOMPLETE_DEBOUNCE_MS = 50; // Paste tracking for large pastes private pastes: Map = new Map(); diff --git a/scripts/ensure-workspace-builds.cjs b/scripts/ensure-workspace-builds.cjs index a90958d7a..0b9e86816 100644 --- a/scripts/ensure-workspace-builds.cjs +++ b/scripts/ensure-workspace-builds.cjs @@ -94,6 +94,7 @@ if (require.main === module) { const WORKSPACE_PACKAGES = [ "native", "pi-tui", + "google-gemini-cli-provider", "pi-ai", "pi-agent-core", "pi-coding-agent", diff --git a/scripts/install-pi-global.js b/scripts/install-pi-global.js index 49cda21d0..1c33f571e 100644 --- a/scripts/install-pi-global.js +++ b/scripts/install-pi-global.js @@ -28,6 +28,7 @@ mkdirSync(piAgentDir, { recursive: true }); const copied = []; if (copyDir("extensions")) copied.push("extensions"); if (copyDir("skills")) copied.push("skills"); +if (copyDir("workflow-skills")) copied.push("workflow-skills"); if (copyDir("agents")) copied.push("agents"); const agentsMdSrc = join(resourcesDir, "AGENTS.md"); diff --git a/scripts/link-workspace-packages.cjs b/scripts/link-workspace-packages.cjs index 3db70a22a..33834cc67 100644 --- a/scripts/link-workspace-packages.cjs +++ b/scripts/link-workspace-packages.cjs @@ -36,6 +36,7 @@ const scopeDir = join(root, "node_modules", scope); const packageDirs = [ "native", "pi-agent-core", + "google-gemini-cli-provider", "pi-ai", "pi-coding-agent", "pi-tui", diff --git a/scripts/uninstall-pi-global.js b/scripts/uninstall-pi-global.js index 26377389a..beba7806d 100644 --- a/scripts/uninstall-pi-global.js +++ b/scripts/uninstall-pi-global.js @@ -60,6 +60,7 @@ function removeIfContentMatches(targetPath, sourcePath, label) { removeResourceEntries("extensions"); removeResourceEntries("skills"); +removeResourceEntries("workflow-skills"); removeResourceEntries("agents"); removeIfContentMatches( join(piAgentDir, "AGENTS.md"), diff --git a/src/resources/extensions/sf/commands/catalog.js b/src/resources/extensions/sf/commands/catalog.js 
index faabf54f3..24a12abc2 100644 --- a/src/resources/extensions/sf/commands/catalog.js +++ b/src/resources/extensions/sf/commands/catalog.js @@ -69,6 +69,7 @@ export const TOP_LEVEL_SUBCOMMANDS = [ { cmd: "queue", desc: "Queue and reorder future milestones" }, { cmd: "quick", desc: "Execute a quick task without full planning overhead" }, { cmd: "discuss", desc: "Discuss architecture and decisions" }, + { cmd: "steer", desc: "Steerable autonomous panel (Shift+Tab)" }, { cmd: "capture", desc: "Fire-and-forget thought capture" }, { cmd: "debug", desc: "Create and inspect persistent /debug sessions" }, { cmd: "scan", desc: "Run source and project scans" }, diff --git a/src/resources/extensions/sf/index.js b/src/resources/extensions/sf/index.js index a28b0cfa9..cc84c3531 100644 --- a/src/resources/extensions/sf/index.js +++ b/src/resources/extensions/sf/index.js @@ -19,6 +19,13 @@ export default async function registerExtension(pi) { // tools, hooks) fails — e.g. due to a Windows-specific import error. const { registerSFCommands } = await import("./commands/index.js"); registerSFCommands(pi); + + // Register steerable autonomous extension for Copilot Auto-style controls + const { default: steerableAutonomousExtension } = await import( + "./steerable-autonomous-extension.js" + ); + steerableAutonomousExtension(pi); + // Full setup (shortcuts, tools, hooks) in a separate try/catch so that // any platform-specific load failure doesn't take out the core command. try { diff --git a/src/resources/extensions/sf/skills/directory.js b/src/resources/extensions/sf/skills/directory.js index 6edd42764..32fe38ec3 100644 --- a/src/resources/extensions/sf/skills/directory.js +++ b/src/resources/extensions/sf/skills/directory.js @@ -11,7 +11,9 @@ import { dirname, join } from "node:path"; import { fileURLToPath } from "node:url"; const SKILL_FILENAME = "SKILL.md"; +export { SKILL_FILENAME }; const USER_SKILL_DIR = join(process.env.HOME ?? 
"", ".sf", "skills"); +export { USER_SKILL_DIR }; const BUNDLED_SKILL_DIR = join( dirname(fileURLToPath(import.meta.url)), "..", @@ -19,6 +21,15 @@ const BUNDLED_SKILL_DIR = join( "..", "skills", ); +export { BUNDLED_SKILL_DIR }; +const WORKFLOW_SKILL_DIR = join( + dirname(fileURLToPath(import.meta.url)), + "..", + "..", + "..", + "workflow-skills", +); +export { WORKFLOW_SKILL_DIR }; /** * Find all skill directories under a base path. @@ -41,12 +52,12 @@ export function discoverSkillDirs(basePath) { } /** - * Discover skills from all sources: project, user, and built-in. + * Discover skills from all sources: project, user, built-in, and workflow-internal. */ export function discoverAllSkills(projectPath, options = {}) { const sources = []; - // Bundled SF skills + // Bundled SF skills (user-facing, shown in /skills catalog) if (options.includeBundled && existsSync(BUNDLED_SKILL_DIR)) { const bundledSkills = discoverSkillDirsInRoot(BUNDLED_SKILL_DIR); for (const s of bundledSkills) { @@ -54,6 +65,14 @@ export function discoverAllSkills(projectPath, options = {}) { } } + // Workflow-internal skills (hidden from users, injected by the runtime) + if (options.includeWorkflow !== false && existsSync(WORKFLOW_SKILL_DIR)) { + const workflowSkills = discoverSkillDirsInRoot(WORKFLOW_SKILL_DIR); + for (const s of workflowSkills) { + sources.push({ ...s, source: "workflow" }); + } + } + // Project skills if (projectPath) { const projectSkills = discoverSkillDirs(projectPath); diff --git a/src/resources/extensions/sf/skills/index.js b/src/resources/extensions/sf/skills/index.js index a05e6da6c..a07614bf5 100644 --- a/src/resources/extensions/sf/skills/index.js +++ b/src/resources/extensions/sf/skills/index.js @@ -18,6 +18,7 @@ export { readSkillFile, SKILL_FILENAME, USER_SKILL_DIR, + WORKFLOW_SKILL_DIR, } from "./directory.js"; export { createEvalCase, diff --git a/src/resources/extensions/sf/skills/loader.js b/src/resources/extensions/sf/skills/loader.js index 
2bfa696bf..57e1882bf 100644 --- a/src/resources/extensions/sf/skills/loader.js +++ b/src/resources/extensions/sf/skills/loader.js @@ -48,7 +48,7 @@ export function loadSkills(projectPath, options = {}) { } const validation = - source === "bundled" + source === "bundled" || source === "workflow" ? validateBundledSkillFrontmatter(parsed.frontmatter) : validateSkillFrontmatter(parsed.frontmatter); if (!validation.valid) { @@ -64,7 +64,10 @@ export function loadSkills(projectPath, options = {}) { } const record = buildSkillRecord(path, parsed.frontmatter, parsed.body); - if ( + if (source === "workflow") { + // Workflow-internal skills are never user-invocable regardless of frontmatter + record.userInvocable = false; + } else if ( source === "bundled" && parsed.frontmatter["user-invocable"] === undefined ) { @@ -132,7 +135,8 @@ export function getPermittedSkills(skills, activeProfile) { */ export function getUserInvocableSkills(skills) { return skills.filter( - (s) => s.source === "bundled" && s.valid && s.userInvocable, + (s) => + s.source !== "workflow" && s.source === "bundled" && s.valid && s.userInvocable, ); } diff --git a/src/resources/workflow-skills/assumption-log/SKILL.md b/src/resources/workflow-skills/assumption-log/SKILL.md new file mode 100644 index 000000000..6617ba8fd --- /dev/null +++ b/src/resources/workflow-skills/assumption-log/SKILL.md @@ -0,0 +1,92 @@ +--- +name: assumption-log +description: Document assumptions, proceed with sensible defaults, surface for review at milestones. Use in research and planning workflows where context is incomplete. Blocks the "ask the user every 5 minutes" pattern and the "guess silently and break something" pattern. Every assumption becomes a named, reviewable artifact. +user-invocable: false +model-invocable: true +side-effects: none +permission-profile: normal +triggers: + - plan + - research + - "*" +--- + +# Assumption Log + +## Iron Law + +``` +NEVER GUESS SILENTLY. +NEVER ASK FOR EVERY MISSING DETAIL. 
+DOCUMENT THE ASSUMPTION, PICK A SENSIBLE DEFAULT, SURFACE FOR REVIEW. +``` + +Silent guessing produces invisible errors. Asking for every missing detail breaks autonomous flow. The correct middle path: make the assumption explicit, pick a defensible default, continue, and surface the log at review gates. + +## Recognize Your Own Rationalizations + +- "I'll just ask the user." → Ask only when the decision is irreversible or the cost of a wrong assumption is high. For everything else: document and proceed. +- "I know what they meant." → If you know, document the inference explicitly. If you don't know, document the assumption and the default you chose. +- "It's obvious — I don't need to write it down." → What is obvious to you during planning is invisible to the reviewer and to your future self. Write it down. +- "I'll address it when it comes up." → When it comes up, you won't remember what assumption you made. The log is the memory. + +## When to Run + +- At the start of any research or planning phase with incomplete context +- When a planning decision depends on information that isn't in the codebase or spec +- When a scope decision must be made without explicit instruction +- Before each irreversible op (combine with `irreversible-ops` skill) + +## Assumption Entry Format + +For each assumption, record: + +``` +Assumption ID: A- +Category: +Statement: +Basis: +Default chosen: +Confidence: +Falsifier: +Review gate: +Impact if wrong: +``` + +**Confidence guidelines:** +- `high` — strong evidence from code, docs, or established convention; probably correct +- `medium` — inferred from partial evidence; plausible but should be confirmed +- `low` — no evidence; pure default; must be confirmed before the affected code ships + +## Assumption Categories + +**Scope** — what is in/out of this task +> "Assumption: the email notification feature is out of scope for this slice. Basis: spec says 'user profile update' with no mention of notifications. Default: skip. 
Review at slice completion." + +**Design** — how something should be structured +> "Assumption: use SQLite for local state storage rather than JSON files. Basis: project uses SQLite everywhere else. Default: SQLite. Confidence: high." + +**Dependency** — which version, API, or external behaviour to rely on +> "Assumption: the gateway API responds within 5 seconds. Basis: no SLA documented; 5s is standard for synchronous APIs. Default: 5s timeout. Confidence: medium." + +**Behaviour** — what the system should do in an edge case +> "Assumption: on parse error, return empty array not null. Basis: existing code uses empty arrays for not-found cases. Default: []. Confidence: high." + +**Constraint** — limits on resources, permissions, or side effects +> "Assumption: this migration is safe to run without a maintenance window. Basis: adds a nullable column, no lock required. Default: proceed without window. Confidence: medium. Falsifier: if table > 10M rows, lock time may matter." + +## Review Gate Protocol + +At each milestone or slice completion, surface all `medium` and `low` confidence assumptions: + +1. List all logged assumptions for the current slice +2. Mark each: `CONFIRMED` (user or evidence validated it), `REVISED` (different default chosen), or `OPEN` (still unconfirmed) +3. Any `low` confidence assumption that remains `OPEN` blocks slice completion +4. 
Any `medium` confidence assumption that remains `OPEN` is a known risk — document it in the slice evidence + +## Completion Criteria + +- [ ] All assumptions made during the workflow are logged with full entry format +- [ ] All `low` confidence assumptions are confirmed or revised before the slice ships +- [ ] All `medium` confidence assumptions are surfaced at the milestone gate +- [ ] The assumption log is attached to the slice/task artifacts in `.sf/active/{unit-id}/assumptions.md` diff --git a/src/resources/workflow-skills/context-lean/SKILL.md b/src/resources/workflow-skills/context-lean/SKILL.md new file mode 100644 index 000000000..7219d61b3 --- /dev/null +++ b/src/resources/workflow-skills/context-lean/SKILL.md @@ -0,0 +1,116 @@ +--- +name: context-lean +description: Prune context before each LLM call. Use in any multi-step workflow that accumulates context across iterations. Less but more relevant context produces better outputs. Prevents context bloat — the single biggest silent quality degrader in long autonomous runs. +user-invocable: false +model-invocable: true +side-effects: none +permission-profile: normal +triggers: + - "*" +--- + +# Context Lean + +## Iron Law + +``` +CONTEXT IS A BUDGET, NOT A DUMP. +EVERY TOKEN IN CONTEXT MUST EARN ITS PLACE. +``` + +Adding more context is not safer than adding less. Irrelevant context degrades output quality by diluting signal. When in doubt, leave it out. + +## Recognize Your Own Rationalizations + +- "More context can't hurt — it gives the model more to work with." → Wrong. Noise degrades recall. The model attends to everything; irrelevant context steals attention from relevant context. +- "I'll include the whole file to be safe." → Include only the functions you're actually modifying. The rest is noise. +- "I need to include the history so the model understands the situation." → Include the summary, not the transcript. Summaries are signal; raw transcripts are noise. 
+- "The token limit isn't hit yet, so it's fine." → Token limits are not quality thresholds. Quality degrades well before the limit. + +## When to Run + +Before any LLM call in a multi-step workflow. Especially: +- Before each autonomous iteration +- Before a planning call that synthesizes many inputs +- After completing a phase (prune phase artifacts before the next phase) +- When the context window is more than 50% full + +## Skill Chain + +Inline skill. Run as a pre-call gate before each significant LLM invocation. + +``` +← prev: any skill, before its LLM call +→ next: return to the invoking skill with pruned context +``` + +## Pruning Protocol + +Apply in order. Stop when the context is lean. + +### Step 1 — Remove completed work + +Anything that was needed to get to the current state but is not needed to proceed: +- Completed task details (keep the summary, drop the steps) +- Resolved errors (keep the fix, drop the stack trace) +- Superseded plans (keep the current plan, drop the draft) + +### Step 2 — Summarize transcripts + +Raw conversation history is always worse than a summary. For any context block older than the current phase: +1. Write a 3-5 sentence summary: what was decided, what was built, what failed +2. Replace the transcript block with the summary +3. Keep only the last 2-3 turns verbatim (for continuity) + +### Step 3 — Scope file content + +Never include entire files when you only need parts of them: +- Include only the functions/methods being modified +- Include only the test cases for the current behaviour +- Include only the error output relevant to the current failure + +If a file must be included whole (e.g., a small config), it must be ≤ 50 lines or explicitly justified. + +### Step 4 — Audit includes + +For every block of context, ask: **if this were removed, would the model's output be worse?** If the answer is "maybe not," remove it. 
+ +Keep: +- The current task/goal (always) +- The specific code being modified (always) +- The error message or test failure driving the current step (always) +- The contract/spec for the current slice (always) +- Recent decisions that constrain the current step + +Remove: +- Earlier phases' full output (summarize) +- Files not touched in the current step +- Passing test output (keep only failures) +- Dependency documentation (link, don't include) +- Comment threads and discussion (summarize conclusions) + +### Step 5 — Verify budget + +After pruning: +- Context should fit in < 30% of the token budget for simple tasks, < 60% for complex ones +- If still over budget after pruning, the task is too large for one call — split it + +## Context Composition Rules + +| Source | Include | Format | +|--------|---------|--------| +| Current task | Always | Full | +| Current file being edited | Only changed functions | Snippet | +| Current error / test failure | Always | Full | +| Previous phase output | Summary only | 3-5 sentences | +| Related file (not being edited) | Only the contract/signature | Snippet | +| Conversation history | Last 2-3 turns + summary of rest | Mixed | +| Documentation | Never inline | Reference by path | + +## Completion Criteria + +Context is lean when: +- [ ] No completed phase artifacts in full (only summaries) +- [ ] No entire files included when snippets suffice +- [ ] Every included block answers "yes" to the audit question +- [ ] Token budget is within target diff --git a/src/resources/workflow-skills/error-routing/SKILL.md b/src/resources/workflow-skills/error-routing/SKILL.md new file mode 100644 index 000000000..b48caf377 --- /dev/null +++ b/src/resources/workflow-skills/error-routing/SKILL.md @@ -0,0 +1,130 @@ +--- +name: error-routing +description: Route errors by type, not severity. Use in any workflow with retry or error-handling steps. Maps error classes (transient, semantic, auth, infra, logic, contract) to their correct handlers. 
Prevents the two most common agent failure modes — retrying logic errors, and ignoring transient failures. +user-invocable: false +model-invocable: true +side-effects: none +permission-profile: normal +triggers: + - build + - repair + - "*" +--- + +# Error Routing + +## Iron Law + +``` +ROUTE BY CLASS FIRST, SEVERITY SECOND. +NEVER RETRY A LOGIC ERROR. +NEVER ABANDON A TRANSIENT ERROR WITHOUT RETRY. +``` + +Retrying a logic error wastes time and can cause data corruption. Abandoning a transient error causes false failures. Routing by severity ("it's a 500, must be important") misclassifies both. + +## Recognize Your Own Rationalizations + +- "It failed, so I'll try a different approach." → Different approach to what? Classify the error first. A different approach to a transient failure is wrong — you need the same approach with a wait. +- "It's a 500 error — must be a server problem." → HTTP 500s include logic errors, auth errors, and transient failures. Read the body. +- "Let me retry with exponential backoff." → Exponential backoff is for transient errors only. Applying it to logic errors just slows down the failure. +- "The test is flaky — I'll just retry it." → Flaky tests are infrastructure errors or race conditions. Classify and fix, don't retry blindly. + +## Error Class Taxonomy + +### Transient + +**Definition:** Will resolve without code change, given time or retry. + +**Examples:** network timeout, rate limit (429), service temporarily unavailable (503), lock contention, resource temporarily exhausted. + +**Handler:** Retry with wait. Use Retry-After header if present; otherwise exponential backoff (1s, 2s, 4s, max 30s). Max 3 retries. If still failing after 3 retries, escalate to infra error. + +**Do NOT:** change code, change approach, or report as a bug. + +--- + +### Auth / Credential + +**Definition:** Request rejected due to missing or invalid credentials. + +**Examples:** 401, 403, expired token, invalid API key, insufficient permissions. 
+ +**Handler:** Do NOT retry. Surface immediately with the exact credential or permission required. Never attempt to infer or work around missing auth — escalate to the human. + +**Do NOT:** retry, change approach, or attempt alternative auth methods. + +--- + +### Logic / Contract + +**Definition:** Code does the wrong thing. The error is in the logic, not the environment. + +**Examples:** wrong output, failing assertion, type error, invariant violation, business rule violation, test failure (not flaky). + +**Handler:** Debug, find root cause, fix. Follow `systematic-debugging` skill protocol. Do NOT retry or use a workaround. + +**Do NOT:** retry, add a workaround, suppress the error. + +--- + +### Infra / Environment + +**Definition:** The execution environment is broken in a way that requires external action. + +**Examples:** disk full, out of memory, missing required tool, corrupt DB, missing env var that cannot be inferred. + +**Handler:** Surface immediately. Describe exactly what is missing and what the minimum fix is. Do NOT attempt to work around infra failures in code. + +**Do NOT:** retry, assume it will resolve, add fallback code. + +--- + +### Semantic / Integration + +**Definition:** Two components disagree on a contract — schema mismatch, API version mismatch, unexpected data shape. + +**Examples:** JSON parse error on valid-looking response, unexpected null where required, field name changed in dependency. + +**Handler:** Investigate the contract. Identify which side is wrong (caller or callee). Fix the contract mismatch, not the symptom. + +**Do NOT:** add nil-guards without understanding why the nil is there. + +--- + +### Scope / Ambiguity + +**Definition:** Cannot proceed because the task is not well-defined enough to make a correct decision. + +**Examples:** conflicting requirements, missing spec, ambiguous acceptance criteria. + +**Handler:** Surface the ambiguity with the specific decision that is blocked. 
Follow `assumption-log` protocol — document the assumption, pick a sensible default, mark for review. + +**Do NOT:** guess silently. + +## Routing Decision Tree + +``` +Error occurs + │ + ├─ Is it a network/rate-limit/timeout? → TRANSIENT → retry with wait + │ + ├─ Is it auth/403/401/credential? → AUTH → surface, do not retry + │ + ├─ Is it a test failure or wrong output? → LOGIC → debug + fix + │ + ├─ Is the environment broken? → INFRA → surface, external action needed + │ + ├─ Is it a contract/schema mismatch? → SEMANTIC → investigate contract + │ + └─ Is the task underspecified? → SCOPE → assumption-log protocol +``` + +## Completion Criteria + +For each error encountered in the workflow: +- [ ] Error classified by type (not severity) +- [ ] Handler applied per classification +- [ ] Resolution recorded (what the error was, what fixed it) +- [ ] No logic errors suppressed or worked around +- [ ] No transient errors abandoned without retry diff --git a/src/resources/workflow-skills/handoff-readability/SKILL.md b/src/resources/workflow-skills/handoff-readability/SKILL.md new file mode 100644 index 000000000..75bebb58e --- /dev/null +++ b/src/resources/workflow-skills/handoff-readability/SKILL.md @@ -0,0 +1,132 @@ +--- +name: handoff-readability +description: Enforce boring code, why-comments on non-obvious decisions, and clean interface contracts. Use in code-generation workflows. Makes rewrites cheap, reduces onboarding time, and prevents the "only the original author understands this" failure mode. +user-invocable: false +model-invocable: true +side-effects: none +permission-profile: normal +triggers: + - build + - review + - "*" +--- + +# Handoff Readability + +## Iron Law + +``` +CODE IS READ 10X MORE THAN IT IS WRITTEN. +WRITE FOR THE READER WHO HAS ZERO CONTEXT. +BORING CODE IS A FEATURE. +``` + +Clever code that only the author can read is a liability. Every non-obvious decision is a future debugging session waiting to happen. 
Every missing comment on a "why" is a future misunderstanding that will produce a silent regression. + +## Recognize Your Own Rationalizations + +- "It's obvious what this does." → Obvious to you, now, with context. Not obvious at 2am during an incident to someone who didn't write it. +- "Comments are noise." → Implementation comments are often noise. *Why* comments are always signal. +- "The code is self-documenting." → Function names document *what*. Only comments document *why*. +- "I'll clean it up later." → Later is when you're two milestones ahead and the context is gone. Clean it now. + +## When to Run + +- During code generation (inline, as you write) +- During code review (check existing code for violations) +- Before marking a slice complete (final readability pass) + +## The Three Rules + +### Rule 1: Boring over clever + +Prefer the solution a junior developer can read and modify. If you face a choice between: +- An elegant one-liner and a readable 5-liner → use the 5-liner +- A clever abstraction and a repeated-but-obvious pattern → repeat it until repetition is clearly worth abstracting +- A performance micro-optimization and readable code → readable code, unless the performance requirement is proven + +**Exception:** performance-critical paths (must be documented with a benchmark that proves the optimization is necessary). + +### Rule 2: Why-comments on every non-obvious decision + +A comment is required when: +- The code does something that looks wrong but is intentional +- The code uses a non-standard approach for a reason +- A value or constant was chosen for a specific reason (not arbitrary) +- The code handles an edge case that isn't obvious from the types + +Format: +```ts +// WHY: +``` + +Examples: +```ts +// WHY: SQLite WAL mode is required here — the default journal mode causes +// write contention when multiple processes access the same DB file. +db.pragma("journal_mode = WAL"); + +// WHY: Retry up to 3 times with 1s backoff. 
The gateway has a 500ms cold-start +// window after idle; the first call will often fail. +const result = await retry(call, { times: 3, waitMs: 1000 }); + +// WHY: Empty array not null — callers use .length checks without null guards. +if (!data) return []; +``` + +### Rule 3: Clean interface contracts + +Every exported function needs a contract that answers: +- **What does it return** (type + what null/undefined/empty means) +- **What are the preconditions** (what must be true for it to work) +- **What are the side effects** (writes, events, mutations) + +Bad: +```ts +export function processUser(user) { ... } +``` + +Good: +```ts +/** + * Validate and normalize a user record for DB insertion. + * Returns null if the record fails validation (caller decides whether to throw). + * Side effects: none. Pure function. + * Precondition: user.id must be a non-empty string. + */ +export function processUser(user: RawUser): NormalizedUser | null { ... } +``` + +## Rewrites-Cheap Test + +Before submitting a slice, ask: + +1. **Could a new team member understand each function without reading its callers?** + If no → add why-comments or simplify. + +2. **Could the core logic be replaced without touching the interface?** + If no → the interface is coupled to the implementation. Separate them. + +3. **Are there any "magic" values without a named constant and a why-comment?** + If yes → name the constant and explain the value. + +4. **Does every exported symbol have a contract (JSDoc with purpose + consumer)?** + If no → add it before marking the slice done. 
+ +## Anti-Patterns + +| Pattern | Problem | Fix | +|---------|---------|-----| +| `// do the thing` | Describes what, not why | Replace with a why-comment or delete | +| `const x = 42` | Magic number | `const MAX_RETRIES = 3; // WHY: ...` | +| One-letter variables outside loops | Forces reader to track mental state | Use descriptive names | +| Deeply nested conditionals | Hard to follow control flow | Extract to named functions | +| Side effects in getters | Violates principle of least surprise | Separate reads from writes | + +## Completion Criteria + +- [ ] No magic values without named constants and why-comments +- [ ] Every non-obvious decision has a `// WHY:` comment +- [ ] Every exported symbol has a purpose + consumer JSDoc +- [ ] Core logic is replaceable without changing the interface +- [ ] A new team member can understand each function without external context diff --git a/src/resources/workflow-skills/irreversible-ops/SKILL.md b/src/resources/workflow-skills/irreversible-ops/SKILL.md new file mode 100644 index 000000000..6b279055e --- /dev/null +++ b/src/resources/workflow-skills/irreversible-ops/SKILL.md @@ -0,0 +1,96 @@ +--- +name: irreversible-ops +description: Human-review gate for irreversible operations — deploys, database migrations, published artifact pushes, force pushes, and destructive deletes. Use in any workflow that touches infra, DB schema, or published artifacts. Classifies reversibility, injects a mandatory verification step, and blocks autonomous progression past the gate. +user-invocable: false +model-invocable: true +side-effects: none +permission-profile: trusted +triggers: + - build + - repair + - "*" +--- + +# Irreversible Ops + +## Iron Law + +``` +BEFORE AN IRREVERSIBLE OP: STOP, CLASSIFY, GATE. +NO AUTONOMOUS AGENT CROSSES AN IRREVERSIBLE BOUNDARY WITHOUT AN EXPLICIT HUMAN GATE. +``` + +An operation is irreversible if rolling it back requires more than running one command. If you are not certain, treat it as irreversible. 
+ +## Recognize Your Own Rationalizations + +- "It's a dev environment — I can always recreate it." → Development data and schemas that are not in source control are irreversible. Assume production semantics until proven otherwise. +- "The migration is small and I've done this before." → Size and familiarity do not reduce irreversibility. The gate is about the op class, not the op size. +- "Autonomous mode is enabled, so I can proceed." → Autonomous mode governs pace and interaction style. It does not remove irreversibility gates. +- "I'll add a rollback plan after." → Rollback plan comes first, before the gate can be passed. + +## Irreversible Op Classification + +### Class A — Always requires human gate + +| Operation | Why irreversible | +|-----------|-----------------| +| Database migration (schema change) | Column drops, type changes, constraint adds — data loss risk | +| Published package version bump | npm/PyPI/GitHub Releases — cannot be un-published cleanly | +| Force push to protected branch | Rewrites shared history | +| Production deploy | Live traffic impact; rollback window may close | +| Secret/credential rotation | Old credentials may already be in use | +| Mass delete (files, records, buckets) | Data loss if incorrect | +| External service configuration change | May affect other consumers | + +### Class B — Requires gate in autonomous mode, can proceed in assisted/manual + +| Operation | Condition | +|-----------|-----------| +| Database migration (data backfill) | If revert is a compensating migration | +| Git tag creation | If CI/CD triggers on tags | +| API endpoint removal | If consumers may exist | +| Config change affecting behaviour | If not behind a feature flag | + +### Class C — No gate required + +- Adding new columns (no existing data affected) +- Creating new tables +- Adding new endpoints +- Adding new feature flags (not yet enabled) +- Writing tests +- Modifying local dev config + +## Gate Protocol + +Before any Class A or Class B 
op, produce in writing: + +``` +Op class: +Operation: +Affected scope: +Reversibility: +Rollback plan: +Verification: +Gate: BLOCKED — requires human confirmation before proceeding +``` + +Do NOT proceed until the human confirms. "Confirmed" means explicit approval of the exact operation described above, not a general "go ahead." + +## Post-Gate Checklist + +After the human gate passes: +- [ ] Backup taken (or confirmed unnecessary with reason) +- [ ] Rollback plan is still valid +- [ ] Monitoring/alerting is in place +- [ ] Operation executed exactly as described in the gate record +- [ ] Verification result recorded + +If the actual operation deviates from the gate description, stop and re-gate. + +## Completion Criteria + +- [ ] Every irreversible op in the workflow has been classified +- [ ] All Class A ops have a gate record + human confirmation +- [ ] All Class B ops in autonomous mode have a gate record + human confirmation +- [ ] Post-gate checklist complete for each executed op diff --git a/src/resources/workflow-skills/observe-first/SKILL.md b/src/resources/workflow-skills/observe-first/SKILL.md new file mode 100644 index 000000000..79f9b6898 --- /dev/null +++ b/src/resources/workflow-skills/observe-first/SKILL.md @@ -0,0 +1,119 @@ +--- +name: observe-first +description: Enforce read-map-understand before any edit. Use at the start of any workflow that modifies existing code in an unfamiliar or partially-familiar codebase. Prevents the "Junior Refactor" failure mode — making changes without knowing what the code does or how it's used. Side-chain skill that gates the modify phase. +user-invocable: false +model-invocable: true +side-effects: none +permission-profile: normal +triggers: + - build + - repair + - review + - "*" +--- + +# Observe First + +## Iron Law + +``` +NO EDIT WITHOUT A MENTAL MODEL. +NO MENTAL MODEL WITHOUT EVIDENCE. +``` + +If you have not completed Phase 1 (Structure) and Phase 2 (Usage), you are not permitted to modify any file. 
The modification phase is blocked until both phases produce written findings.
+
+## Recognize Your Own Rationalizations
+
+These are the exact shortcuts you will reach for. Each is wrong:
+
+- "I can see what it does from the name." → Names lie. Read the body.
+- "I only need to change one line." → You don't know which one yet without reading the callers.
+- "I've seen this pattern before." → Familiarity is not analysis. This codebase may use the pattern differently.
+- "I'll figure it out as I go." → Going is the wrong order. Understand first, then go.
+- "The tests will catch mistakes." → Tests catch regressions you knew about. They don't catch structural misunderstandings.
+
+## When to Run
+
+- Any workflow that modifies existing code you haven't read end-to-end in this session.
+- Planning phases that require accurate impact analysis before choosing an approach.
+- Whenever the scope of a change is unclear.
+
+Do NOT skip this skill for "small" changes — small changes with wrong mental models cause the most silent bugs.
+
+## Skill Chain
+
+Side-chain gate. Blocks the modify phase until both observe phases complete.
+
+```
+← prev: plan, spec-first-tdd, or any workflow beginning a modify phase
+→ next: return to the invoking workflow once Phase 1 + Phase 2 are in writing
+```
+
+## Phase 1 — Structure Map
+
+Map the file/module being modified before touching it.
+
+```bash
+# Who owns the symbol?
+rg -n "export.*<symbol>|function <symbol>|class <symbol>" src/ packages/
+
+# What does the file do?
+cat <file> | head -80 # module header, imports, exports
+rg -n "export " <file> # public surface
+
+# What are its dependencies?
+rg -n "^import " <file> # what it imports
+rg -n "from.*<module>" src/ # who imports this module
+```
+
+Produce written output:
+1. **Module purpose** — one sentence: why does this module exist?
+2. **Exports** — list every exported symbol and its type
+3. **Callers** — list every file that imports this module
+4. 
**Dependencies** — list what this module imports from elsewhere
+
+Do NOT proceed to Phase 2 until this list exists in writing.
+
+## Phase 2 — Usage Analysis
+
+For each symbol you intend to modify, trace how it is called.
+
+```bash
+# All call sites
+rg -n "<symbol>" src/ packages/ --type ts --type js
+
+# Test coverage
+rg -n "<symbol>" src/ -g "*.test.*"
+
+# Recent history
+git log --oneline -10 -- <file>
+git log --oneline -10 -S "<symbol>" # commits that changed this symbol
+```
+
+Produce written output for each symbol:
+1. **Call sites** — file:line for every caller, with the argument values passed
+2. **Contract** — what callers expect in return (inferred from usage)
+3. **Invariants** — what must be true before/after this symbol runs
+4. **Change blast radius** — which callers break if you change the signature or behaviour
+
+Do NOT write any code until this list exists in writing.
+
+## Phase 3 — Modification (Unblocked)
+
+Only after Phases 1 and 2 are documented:
+
+1. Make the **smallest** change that satisfies the contract.
+2. Keep changes inside the blast radius you mapped — no scope creep.
+3. If the blast radius is larger than expected, surface it before continuing.
+4. Update callers in the order dictated by the dependency map, not alphabetically.
+
+## Completion Criteria
+
+You may exit this skill and return to the invoking workflow when:
+
+- [ ] Phase 1 findings written (module purpose, exports, callers, deps)
+- [ ] Phase 2 findings written (call sites, contract, invariants, blast radius) for every symbol to be modified
+- [ ] The modification is bounded to the mapped blast radius
+
+If Phase 1 or Phase 2 reveals that the change is larger than originally scoped, **stop and surface the new scope** before modifying anything. 
diff --git a/src/resources/workflow-skills/state-discipline/SKILL.md b/src/resources/workflow-skills/state-discipline/SKILL.md new file mode 100644 index 000000000..26dbab392 --- /dev/null +++ b/src/resources/workflow-skills/state-discipline/SKILL.md @@ -0,0 +1,134 @@ +--- +name: state-discipline +description: Enforce structured, deterministic state management in long-running workflows. Use in any multi-step workflow that persists state across iterations. Prevents LLM-managed state, in-memory-only state, and unstructured file-based state — the three failure modes that cause autonomous loops to lose track of where they are. +user-invocable: false +model-invocable: true +side-effects: none +permission-profile: normal +triggers: + - build + - plan + - "*" +--- + +# State Discipline + +## Iron Law + +``` +STATE LIVES IN SQLITE OR ON DISK AS STRUCTURED FILES. +NEVER IN THE LLM'S CONTEXT WINDOW. +NEVER IN MEMORY ACROSS STEPS. +``` + +Context-window state is lost on restart, summarization, and context compaction. In-memory state is lost on crash. Only SQLite and structured files survive restarts, crashes, and context rotation. + +## Recognize Your Own Rationalizations + +- "I'll track the progress in my context." → Context is summarized and truncated. Progress state in context is lost exactly when you need it most — after a crash or a long run. +- "I'll use a JSON object in a variable." → In-memory variables don't survive the tool call boundary. Each tool invocation is a fresh execution context. +- "It's simpler to just write to a text file." → Unstructured text files can't be queried, can't be joined, and produce parse errors under concurrent access. Use SQLite. +- "I'll write the state management after the feature works." → State management is not a feature — it is the foundation. Without it, you can't resume, can't retry, and can't verify. 
+ +## When to Run + +- Before designing any multi-step workflow that must survive restart +- When a workflow has been running for more than 2 iterations +- When implementing retry logic that requires tracking attempts +- When implementing any lock, queue, or work-item pattern + +## The Four State Rules + +### Rule 1: SQLite for structured state + +Use `.sf/sf.db` (or a task-scoped DB) for any state with schema, ordering, priority, joins, or queries. + +**Use SQLite when:** +- Tracking work items (pending/in-progress/done) +- Recording retry counts +- Storing key-value configuration that persists across steps +- Any state that needs to be queried or filtered + +**Use structured files when:** +- The state is a single document (a plan, a spec, an evidence file) +- The state is append-only and never queried (logs) +- The state must be human-readable and is the primary artifact + +**Never use:** +- In-memory variables for state that crosses step boundaries +- Free-form text files for state that needs to be queried +- LLM context window for state that must survive restart + +### Rule 2: Schema before data + +Define the schema explicitly before inserting any rows. The schema is the contract: + +```sql +CREATE TABLE IF NOT EXISTS workflow_units ( + id TEXT PRIMARY KEY, + status TEXT NOT NULL DEFAULT 'pending' -- pending | in_progress | done | blocked + CHECK(status IN ('pending','in_progress','done','blocked')), + created_at TEXT NOT NULL DEFAULT (datetime('now')), + updated_at TEXT NOT NULL DEFAULT (datetime('now')), + error TEXT -- last error if status = blocked +); +``` + +Never add rows to an undefined table. Never use a table whose schema you haven't verified. + +### Rule 3: Atomic transitions + +State transitions must be atomic. 
Use SQLite transactions for multi-step transitions: + +```sql +BEGIN; +UPDATE workflow_units SET status = 'in_progress', updated_at = datetime('now') + WHERE id = :id AND status = 'pending'; -- conditional: only if still pending +-- do the work +UPDATE workflow_units SET status = 'done', updated_at = datetime('now') + WHERE id = :id; +COMMIT; +``` + +Never set status to 'in_progress' in one statement and 'done' in another without a transaction — a crash between the two leaves inconsistent state. + +### Rule 4: Resume from state, not from memory + +Every workflow step must be resumable from the DB alone: + +```sql +-- Find the next pending unit (resumable from cold start) +SELECT * FROM workflow_units +WHERE status = 'pending' + AND NOT EXISTS ( + SELECT 1 FROM workflow_units dep + JOIN unit_deps d ON d.depends_on = dep.id + WHERE d.unit_id = workflow_units.id AND dep.status != 'done' + ) +ORDER BY priority DESC, created_at ASC +LIMIT 1; +``` + +If you cannot reconstruct "where the workflow is" from a single SQL query, the state model is wrong. + +## State Inventory Checklist + +Before implementing a multi-step workflow, produce this inventory: + +``` +State item: +Lifetime: +Schema: +Read pattern: +Write pattern: +Conflict rule: +Recovery: +``` + +## Completion Criteria + +- [ ] All cross-step state is in SQLite or structured files +- [ ] Schema is defined before any data is written +- [ ] All state transitions are atomic (transactions for multi-step) +- [ ] The workflow is resumable from the DB alone after a cold restart +- [ ] No state stored only in context or in-memory variables diff --git a/src/resources/workflow-skills/vertical-slice/SKILL.md b/src/resources/workflow-skills/vertical-slice/SKILL.md new file mode 100644 index 000000000..bdb6c67d6 --- /dev/null +++ b/src/resources/workflow-skills/vertical-slice/SKILL.md @@ -0,0 +1,91 @@ +--- +name: vertical-slice +description: Enforce end-to-end working increments at each workflow step. 
Use during planning and decomposition phases. Prevents "horizontal layers" — building all models, then all services, then all tests — which produces nothing shippable until the very end. Every slice must be testable and deployable in isolation. +user-invocable: false +model-invocable: true +side-effects: none +permission-profile: normal +triggers: + - plan + - build + - "*" +--- + +# Vertical Slice + +## Iron Law + +``` +EVERY SLICE MUST BE INDEPENDENTLY TESTABLE AND DEPLOYABLE. +NO SLICE IS DONE UNTIL ITS CONSUMER PATH WORKS END-TO-END. +``` + +A slice that produces "partial infrastructure" is not a slice — it is a layer. Layers are not shippable. If the slice cannot be verified in isolation, it is too large or wrongly cut. + +## Recognize Your Own Rationalizations + +- "I'll wire it up in the next slice." → If it can't be verified now, you can't confirm the first slice worked. Bugs compound invisibly. +- "It's more efficient to build all the DB tables first." → It is more efficient to ship nothing until the very end. Horizontal layers guarantee integration surprises. +- "The consumer isn't built yet." → Then build a stub consumer in this slice. The slice defines its own consumer path. +- "I'll test it all together when it's complete." → "Together" is where integration bugs hide. Test each slice independently. + +## When to Run + +- Planning or decomposition: before breaking a milestone into tasks. +- Slice review: before starting a new slice, confirm the previous one is truly end-to-end. +- When an autonomous loop has been running for more than two slices without a shippable increment. + +## Skill Chain + +Planning-phase skill. Inline with the main delivery chain. 
+ +``` +← prev: architecture-planning, pm-planning, or any planning phase +→ next: spec-first-tdd (write the failing test for the first slice) +``` + +## Slice Definition Protocol + +For each slice, define **before writing any code**: + +``` +Slice ID: +Purpose: +Entry point: +Done state: +Verifier: +Stub strategy: +``` + +A slice without a `Verifier` is not a valid slice. Stop and define one before proceeding. + +## Anti-Patterns to Detect and Reject + +| Pattern | Problem | Correct Cut | +|---------|---------|-------------| +| "Add all DB tables" | No consumer, not testable alone | "Add one table + one read + one test" | +| "Build the service layer" | No entry point, no verifier | "Add one endpoint that returns real data from DB" | +| "Implement the model" | Model without integration is not slice | "Add model + minimal handler + test that calls handler" | +| "Set up infrastructure" | Infrastructure without behaviour is scaffolding | Include the first real use in the same slice | +| "Refactor X" | Refactors with no consumer test are invisible | Include the test that proves behaviour unchanged | + +## Slice Sizing + +**Right-sized slice:** completes in a single autonomous iteration, has one clear verifier, can be described in one sentence. + +**Too large:** "Build the authentication system." Cut it: login endpoint → token validation → logout → password reset. + +**Too small:** "Add an import statement." Merge it with the first meaningful use. + +**Boundary check:** If a slice takes more than one session to complete, it is too large. Cut it. + +## Completion Criteria + +Each slice is done when: + +- [ ] `Verifier` command runs and passes +- [ ] The consumer path works end-to-end (not "the model is ready") +- [ ] No "temporary stubs" left in production paths (test stubs are fine) +- [ ] The done state matches what was defined before coding started + +If the verifier passes but the done state wasn't defined upfront, you completed something — you just don't know what. 
Define the done state first next time.