/**
 * This test validates that the provider preflush pattern in sdk.ts clears
 * pendingProviderRegistrations after iterating, so bindCore() doesn't
 * re-register the same providers.
 *
 * The bug: createAgentSession() iterated pendingProviderRegistrations but
 * did not clear the array. Later, bindCore() replayed and registered the
 * same providers again, stacking wrappers.
 *
 * NOTE(review): these cases simulate the flush loops locally; they do not
 * import sdk.ts, so they pin the pattern rather than the production code.
 */

/** One queued provider registration: a provider name plus its opaque config. */
interface ProviderEntry {
  name: string;
  config: Record<string, unknown>;
}

/** Minimal stand-in for the extension runtime; only the pending queue is modelled. */
interface MockRuntime {
  pendingProviderRegistrations: ProviderEntry[];
}

/**
 * Simulate one flush pass over the pending queue.
 *
 * @param runtime - Mock runtime whose queue is replayed.
 * @param registered - Accumulator receiving every provider name "registered" by this pass.
 * @param clearAfter - `true` replaces the queue with an empty array afterwards (the fixed
 *   behaviour); `false` leaves the queue untouched (the pre-fix behaviour).
 */
function flushPending(runtime: MockRuntime, registered: string[], clearAfter: boolean): void {
  for (const { name } of runtime.pendingProviderRegistrations) {
    registered.push(name);
  }
  if (clearAfter) {
    runtime.pendingProviderRegistrations = [];
  }
}

describe("provider registration preflush", () => {
  it("clears pending registrations after preflush so bindCore does not replay", () => {
    const registered: string[] = [];
    const runtime: MockRuntime = {
      pendingProviderRegistrations: [
        { name: "ollama", config: { type: "ollama" } },
        { name: "custom-provider", config: { type: "custom" } },
      ],
    };

    // Simulate the sdk.ts preflush, including the fix: clear after iterating.
    flushPending(runtime, registered, true);

    // Simulate the bindCore() flush, which now sees an empty queue.
    flushPending(runtime, registered, true);

    assert.deepEqual(
      registered,
      ["ollama", "custom-provider"],
      "each provider should be registered exactly once",
    );
  });

  it("without the fix, providers are registered twice", () => {
    const registered: string[] = [];
    const runtime: MockRuntime = {
      pendingProviderRegistrations: [
        { name: "ollama", config: { type: "ollama" } },
      ],
    };

    // Old behavior: preflush without clearing.
    flushPending(runtime, registered, false);

    // bindCore() replays the same, still-populated queue.
    flushPending(runtime, registered, false);

    assert.deepEqual(
      registered,
      ["ollama", "ollama"],
      "without clearing, providers are registered twice (demonstrating the bug)",
    );
  });
});
+ extensionsForModelResolution.runtime.pendingProviderRegistrations = []; // If still no model, use findInitialModel (checks settings default, then provider defaults) if (!model) { diff --git a/src/resources/extensions/gsd/auto-post-unit.ts b/src/resources/extensions/gsd/auto-post-unit.ts index 869f4e45d..20f1faed2 100644 --- a/src/resources/extensions/gsd/auto-post-unit.ts +++ b/src/resources/extensions/gsd/auto-post-unit.ts @@ -52,6 +52,13 @@ import { hasPendingCaptures, loadPendingCaptures, revertExecutorResolvedCaptures import { debugLog } from "./debug-logger.js"; import { runSafely } from "./auto-utils.js"; import type { AutoSession, SidecarItem } from "./auto/session.js"; +import { getEvidence } from "./safety/evidence-collector.js"; +import { validateFileChanges } from "./safety/file-change-validator.js"; +// crossReferenceEvidence available for future use when verification_evidence is stored in DB +// import { crossReferenceEvidence, type ClaimedEvidence } from "./safety/evidence-cross-ref.js"; +import { validateContent } from "./safety/content-validator.js"; +import { resolveSafetyHarnessConfig } from "./safety/safety-harness.js"; +import { resolveExpectedArtifactPath as resolveArtifactForContent } from "./auto-artifact-paths.js"; /** Maximum verification retry attempts before escalating to blocker placeholder (#2653). 
*/ const MAX_VERIFICATION_RETRIES = 3; @@ -437,6 +444,87 @@ export async function postUnitPreVerification(pctx: PostUnitContext, opts?: PreV debugLog("postUnit", { phase: "rogue-detection", error: String(e) }); } + // ── Safety harness: post-unit validation ── + try { + const { loadEffectiveGSDPreferences } = await import("./preferences.js"); + const prefs = loadEffectiveGSDPreferences()?.preferences; + const safetyConfig = resolveSafetyHarnessConfig( + prefs?.safety_harness as Record | undefined, + ); + + if (safetyConfig.enabled) { + const { milestone: sMid, slice: sSid, task: sTid } = parseUnitId(s.currentUnit.id); + + // File change validation (execute-task only, after auto-commit) + if (safetyConfig.file_change_validation && s.currentUnit.type === "execute-task" && sMid && sSid && sTid && isDbAvailable()) { + try { + const taskRow = getTask(sMid, sSid, sTid); + if (taskRow) { + const expectedOutput = taskRow.expected_output ?? []; + const plannedFiles = taskRow.files ?? []; + const audit = validateFileChanges(s.basePath, expectedOutput, plannedFiles); + if (audit && audit.violations.length > 0) { + const warnings = audit.violations.filter(v => v.severity === "warning"); + for (const v of warnings) { + logWarning("safety", `file-change: ${v.file} — ${v.reason}`); + } + if (warnings.length > 0) { + ctx.ui.notify( + `Safety: ${warnings.length} unexpected file change(s) outside task plan`, + "warning", + ); + } + } + } + } catch (e) { + debugLog("postUnit", { phase: "safety-file-change", error: String(e) }); + } + } + + // Evidence cross-reference (execute-task only) + // Verification evidence is passed via the complete-task tool call and + // stored in the SUMMARY.md on disk — not available as structured data + // in the DB. The evidence collector tracks actual bash tool calls, so + // we can still detect units that claimed success but ran no commands. 
+ if (safetyConfig.evidence_cross_reference && s.currentUnit.type === "execute-task") { + try { + const actual = getEvidence(); + const bashCalls = actual.filter(e => e.kind === "bash"); + // If the task is marked complete but zero bash commands were run, + // it's suspicious — the LLM may have fabricated results. + if (sMid && sSid && sTid && isDbAvailable()) { + const taskRow = getTask(sMid, sSid, sTid); + if (taskRow?.status === "complete" && taskRow.verify && bashCalls.length === 0) { + logWarning("safety", "task marked complete with verification commands but no bash calls were executed"); + ctx.ui.notify( + `Safety: task ${sTid} has verification commands but no bash calls were recorded`, + "warning", + ); + } + } + } catch (e) { + debugLog("postUnit", { phase: "safety-evidence-xref", error: String(e) }); + } + } + + // Content validation (plan-slice, plan-milestone) + if (safetyConfig.content_validation) { + try { + const artifactPath = resolveArtifactForContent(s.currentUnit.type, s.currentUnit.id, s.basePath); + const contentViolations = validateContent(s.currentUnit.type, artifactPath); + for (const v of contentViolations) { + logWarning("safety", `content: ${v.reason}`); + ctx.ui.notify(`Content validation: ${v.reason}`, "warning"); + } + } catch (e) { + debugLog("postUnit", { phase: "safety-content-validation", error: String(e) }); + } + } + } + } catch (e) { + debugLog("postUnit", { phase: "safety-harness", error: String(e) }); + } + // Artifact verification let triggerArtifactVerified = false; if (!s.currentUnit.type.startsWith("hook/")) { diff --git a/src/resources/extensions/gsd/auto-timers.ts b/src/resources/extensions/gsd/auto-timers.ts index b2faa0c44..3b7b11f81 100644 --- a/src/resources/extensions/gsd/auto-timers.ts +++ b/src/resources/extensions/gsd/auto-timers.ts @@ -106,8 +106,9 @@ export function startUnitSupervision(sctx: SupervisionContext): void { } } const estimateMinutes = taskEstimate ? 
parseEstimateMinutes(taskEstimate) : null; + const MAX_TIMEOUT_SCALE = 6; // Cap at 6x (60min task). Prevents 2h+ tasks from creating 120min+ timeout windows. const timeoutScale = estimateMinutes && estimateMinutes > 0 - ? Math.max(1, estimateMinutes / 10) // 10min task = 1x, 30min = 3x, 2h = 12x + ? Math.min(MAX_TIMEOUT_SCALE, Math.max(1, estimateMinutes / 10)) : 1; const softTimeoutMs = (supervisor.soft_timeout_minutes ?? 0) * 60 * 1000 * timeoutScale; diff --git a/src/resources/extensions/gsd/auto/phases.ts b/src/resources/extensions/gsd/auto/phases.ts index 35ecad194..a5da2519c 100644 --- a/src/resources/extensions/gsd/auto/phases.ts +++ b/src/resources/extensions/gsd/auto/phases.ts @@ -37,6 +37,9 @@ import { withTimeout, FINALIZE_POST_TIMEOUT_MS } from "./finalize-timeout.js"; import { getEligibleSlices } from "../slice-parallel-eligibility.js"; import { startSliceParallel } from "../slice-parallel-orchestrator.js"; import { isDbAvailable, getMilestoneSlices } from "../gsd-db.js"; +import { resetEvidence } from "../safety/evidence-collector.js"; +import { createCheckpoint, cleanupCheckpoint, rollbackToCheckpoint } from "../safety/git-checkpoint.js"; +import { resolveSafetyHarnessConfig } from "../safety/safety-harness.js"; // ─── generateMilestoneReport ────────────────────────────────────────────────── @@ -1079,6 +1082,21 @@ export async function runUnitPhase( if (mid) deps.updateSliceProgressCache(s.basePath, mid, state.activeSlice?.id); + // ── Safety harness: reset evidence + create checkpoint ── + const safetyConfig = resolveSafetyHarnessConfig( + prefs?.safety_harness as Record | undefined, + ); + if (safetyConfig.enabled && safetyConfig.evidence_collection) { + resetEvidence(); + } + // Only checkpoint code-executing units (not lifecycle/planning units) + if (safetyConfig.enabled && safetyConfig.checkpoints && unitType === "execute-task") { + s.checkpointSha = createCheckpoint(s.basePath, unitId); + if (s.checkpointSha) { + debugLog("runUnitPhase", { 
phase: "checkpoint-created", unitId, sha: s.checkpointSha.slice(0, 8) }); + } + } + // Prompt injection let finalPrompt = prompt; @@ -1376,6 +1394,27 @@ export async function runUnitPhase( deps.emitJournalEvent({ ts: new Date().toISOString(), flowId: ic.flowId, seq: ic.nextSeq(), eventType: "unit-end", data: { unitType, unitId, status: unitResult.status, artifactVerified, ...(unitResult.errorContext ? { errorContext: unitResult.errorContext } : {}) }, causedBy: { flowId: ic.flowId, seq: unitStartSeq } }); + // ── Safety harness: checkpoint cleanup or rollback ── + if (s.checkpointSha) { + if (unitResult.status === "error" && safetyConfig.auto_rollback) { + const rolled = rollbackToCheckpoint(s.basePath, unitId, s.checkpointSha); + if (rolled) { + ctx.ui.notify(`Rolled back to pre-unit checkpoint for ${unitId}`, "info"); + debugLog("runUnitPhase", { phase: "checkpoint-rollback", unitId }); + } + } else if (unitResult.status === "error") { + ctx.ui.notify( + `Unit ${unitId} failed. Pre-unit checkpoint available at ${s.checkpointSha.slice(0, 8)}`, + "warning", + ); + } else { + // Success — clean up checkpoint ref + cleanupCheckpoint(s.basePath, unitId); + debugLog("runUnitPhase", { phase: "checkpoint-cleaned", unitId }); + } + s.checkpointSha = null; + } + return { action: "next", data: { unitStartedAt: s.currentUnit?.startedAt } }; } diff --git a/src/resources/extensions/gsd/auto/session.ts b/src/resources/extensions/gsd/auto/session.ts index 011e9c159..7cb991511 100644 --- a/src/resources/extensions/gsd/auto/session.ts +++ b/src/resources/extensions/gsd/auto/session.ts @@ -145,6 +145,10 @@ export class AutoSession { lastBaselineCharCount: number | undefined; pendingQuickTasks: CaptureEntry[] = []; + // ── Safety harness ─────────────────────────────────────────────────────── + /** SHA of the pre-unit git checkpoint ref. Cleared on success or rollback. 
*/ + checkpointSha: string | null = null; + // ── Signal handler ─────────────────────────────────────────────────────── sigtermHandler: (() => void) | null = null; @@ -223,6 +227,7 @@ export class AutoSession { this.lastToolInvocationError = null; this.isolationDegraded = false; this.milestoneMergedInPhases = false; + this.checkpointSha = null; // Signal handler this.sigtermHandler = null; diff --git a/src/resources/extensions/gsd/bootstrap/register-hooks.ts b/src/resources/extensions/gsd/bootstrap/register-hooks.ts index 49cb7072f..910e91b9e 100644 --- a/src/resources/extensions/gsd/bootstrap/register-hooks.ts +++ b/src/resources/extensions/gsd/bootstrap/register-hooks.ts @@ -18,6 +18,9 @@ import { isParallelActive, shutdownParallel } from "../parallel-orchestrator.js" import { checkToolCallLoop, resetToolCallLoopGuard } from "./tool-call-loop-guard.js"; import { saveActivityLog } from "../activity-log.js"; import { resetAskUserQuestionsCache } from "../../ask-user-questions.js"; +import { recordToolCall as safetyRecordToolCall, recordToolResult as safetyRecordToolResult } from "../safety/evidence-collector.js"; +import { classifyCommand } from "../safety/destructive-guard.js"; +import { logWarning as safetyLogWarning } from "../workflow-logger.js"; // Skip the welcome screen on the very first session_start — cli.ts already // printed it before the TUI launched. Only re-print on /clear (subsequent sessions). 
@@ -203,6 +206,26 @@ export function registerHooks(pi: ExtensionAPI): void { if (result.block) return result; }); + // ── Safety harness: evidence collection + destructive command warnings ── + pi.on("tool_call", async (event, ctx) => { + if (!isAutoActive()) return; + safetyRecordToolCall(event.toolName, event.input as Record); + + // Destructive command classification (warn only, never block) + if (isToolCallEventType("bash", event)) { + const classification = classifyCommand(event.input.command); + if (classification.destructive) { + safetyLogWarning("safety", `destructive command: ${classification.labels.join(", ")}`, { + command: String(event.input.command).slice(0, 200), + }); + ctx.ui.notify( + `Destructive command detected: ${classification.labels.join(", ")}`, + "warning", + ); + } + } + }); + pi.on("tool_result", async (event) => { if (event.toolName !== "ask_user_questions") return; const milestoneId = getDiscussionMilestoneId(); @@ -268,6 +291,10 @@ export function registerHooks(pi: ExtensionAPI): void { : (typeof event.result?.content?.[0]?.text === "string" ? event.result.content[0].text : String(event.result)); recordToolInvocationError(event.toolName, errorText); } + // Safety harness: record tool execution results for evidence cross-referencing + if (isAutoActive()) { + safetyRecordToolResult(event.toolCallId, event.toolName, event.result, event.isError); + } }); pi.on("model_select", async (_event, ctx) => { diff --git a/src/resources/extensions/gsd/preferences-types.ts b/src/resources/extensions/gsd/preferences-types.ts index 043bb4055..a5013c18c 100644 --- a/src/resources/extensions/gsd/preferences-types.ts +++ b/src/resources/extensions/gsd/preferences-types.ts @@ -105,6 +105,7 @@ export const KNOWN_PREFERENCE_KEYS = new Set([ "experimental", "codebase", "slice_parallel", + "safety_harness", ]); /** Canonical list of all dispatch unit types. 
// ─── Types ──────────────────────────────────────────────────────────────────

/** A single content-quality finding. Only warnings are produced by this module. */
export interface ContentViolation {
  severity: "warning";
  reason: string;
}

/** Signature shared by every per-unit-type validator: artifact text in, findings out. */
type ContentValidatorFn = (content: string) => ContentViolation[];

// ─── Validators ─────────────────────────────────────────────────────────────

/**
 * Unit type → validator lookup.
 * A Map (rather than an object literal) so that unit types which happen to
 * collide with Object.prototype keys, e.g. "constructor", resolve to "no validator".
 */
const VALIDATORS: ReadonlyMap<string, ContentValidatorFn> = new Map<string, ContentValidatorFn>([
  ["plan-slice", validatePlanSlice],
  ["plan-milestone", validatePlanMilestone],
]);

// ─── Public API ─────────────────────────────────────────────────────────────

/**
 * Validate content quality for a completed unit.
 * Returns an array of violations. Empty array = content looks acceptable,
 * or nothing could be checked (no validator for the unit type, no artifact
 * path, file missing, or the read failed — the last case is also logged).
 *
 * @param unitType - The type of unit that completed (e.g. "plan-slice")
 * @param artifactPath - Path to the primary artifact file, or null when there is none
 * @returns The findings of the matching validator, possibly empty
 */
export function validateContent(
  unitType: string,
  artifactPath: string | null,
): ContentViolation[] {
  // Look the validator up first so unit types without one cost no filesystem call.
  const validator = VALIDATORS.get(unitType);
  if (!validator) return [];
  if (!artifactPath || !existsSync(artifactPath)) return [];

  try {
    return validator(readFileSync(artifactPath, "utf-8"));
  } catch (e) {
    const message = e instanceof Error ? e.message : String(e);
    logWarning("safety", `content validation read failed: ${message}`);
    return [];
  }
}

/**
 * Check a slice plan for the minimum expected structure: at least two task
 * checkboxes, a files section, and some mention of verification.
 */
function validatePlanSlice(content: string): ContentViolation[] {
  const violations: ContentViolation[] = [];

  // Must have at least 2 task entries (checkbox pattern; "[x]" and "[X]" both count as checked).
  const taskCount = (content.match(/- \[[ xX]\] \*\*T\d+/g) ?? []).length;
  if (taskCount < 2) {
    violations.push({
      severity: "warning",
      reason: `Slice plan has only ${taskCount} task(s) — expected at least 2`,
    });
  }

  // Should have a files section. "## Files" also covers "## Files Likely Touched".
  if (!content.includes("## Files")) {
    violations.push({
      severity: "warning",
      reason: "Slice plan missing 'Files Likely Touched' section",
    });
  }

  // Should mention verification in some form. Case-insensitive and stem-based so a
  // "## Verification" heading counts ("Verification" does not contain "Verify").
  if (!/verif(?:y|ies|ied|ication)/i.test(content)) {
    violations.push({
      severity: "warning",
      reason: "Slice plan has no verification instructions",
    });
  }

  return violations;
}

/** Check a milestone roadmap for at least one slice heading of the form "## S<digits>". */
function validatePlanMilestone(content: string): ContentViolation[] {
  const violations: ContentViolation[] = [];

  // Must have at least 1 slice entry
  const sliceCount = (content.match(/##\s+S\d+/g) ?? []).length;
  if (sliceCount < 1) {
    violations.push({
      severity: "warning",
      reason: `Milestone roadmap has ${sliceCount} slice(s) — expected at least 1`,
    });
  }

  return violations;
}
// ─── Pattern Definitions ────────────────────────────────────────────────────

/** A regex that flags a command, plus the human-readable label reported for it. */
interface DestructivePattern {
  pattern: RegExp;
  label: string;
}

/**
 * Heuristic patterns, checked in order. None of the regexes carries the `g`
 * flag, so `test()` keeps no state between calls.
 */
const DESTRUCTIVE_PATTERNS: readonly DestructivePattern[] = [
  { pattern: /\brm\s+(-[^\s]*[rfRF][^\s]*\s+|.*\s+-[^\s]*[rfRF])/, label: "recursive delete" },
  { pattern: /\bgit\s+push\s+.*--force/, label: "force push" },
  // "-f" may follow the remote/branch ("git push origin main -f"), not only "push" itself.
  // It must be its own argument, so a branch named "feature-f" does not match.
  { pattern: /\bgit\s+push\s+(?:.*\s)?-f\b/, label: "force push" },
  { pattern: /\bgit\s+reset\s+--hard/, label: "hard reset" },
  { pattern: /\bgit\s+clean\s+-[^\s]*[fdxFDX]/, label: "git clean" },
  { pattern: /\bgit\s+checkout\s+--\s+\./, label: "discard all changes" },
  { pattern: /\bdrop\s+(database|table|index)\b/i, label: "SQL drop" },
  { pattern: /\btruncate\s+table\b/i, label: "SQL truncate" },
  { pattern: /\bchmod\s+777\b/, label: "world-writable permissions" },
  // Covers curl and wget, with an optional "sudo" in front of the shell.
  { pattern: /\b(?:curl|wget)\s.*\|\s*(?:sudo\s+)?(?:bash|sh|zsh)\b/, label: "pipe to shell" },
];

// ─── Public API ─────────────────────────────────────────────────────────────

/** Result of classifying one command: whether anything matched, and which labels. */
export interface CommandClassification {
  destructive: boolean;
  labels: string[];
}

/**
 * Classify a bash command for destructive operations.
 * Purely a classifier: it neither blocks nor logs.
 *
 * @param command - The shell command text to inspect
 * @returns `labels` holds each matched pattern label once, in pattern order;
 *   `destructive` is true when at least one pattern matched
 */
export function classifyCommand(command: string): CommandClassification {
  // A Set deduplicates labels shared by several patterns (e.g. the two force-push
  // patterns) while preserving first-match order.
  const matched = new Set<string>();
  for (const { pattern, label } of DESTRUCTIVE_PATTERNS) {
    if (pattern.test(command)) matched.add(label);
  }
  return { destructive: matched.size > 0, labels: [...matched] };
}
// ─── Types ──────────────────────────────────────────────────────────────────

/** A recorded bash invocation. `exitCode` is -1 and `outputSnippet` empty until a result arrives. */
export interface BashEvidence {
  kind: "bash";
  toolCallId: string;
  command: string;
  exitCode: number;
  outputSnippet: string;
  timestamp: number;
}

/** A recorded file write. */
export interface FileWriteEvidence {
  kind: "write";
  toolCallId: string;
  path: string;
  timestamp: number;
}

/** A recorded file edit. */
export interface FileEditEvidence {
  kind: "edit";
  toolCallId: string;
  path: string;
  timestamp: number;
}

export type EvidenceEntry = BashEvidence | FileWriteEvidence | FileEditEvidence;

// ─── Module State ───────────────────────────────────────────────────────────

/** Evidence for the unit currently executing; replaced wholesale by resetEvidence(). */
let unitEvidence: EvidenceEntry[] = [];

// ─── Public API ─────────────────────────────────────────────────────────────

/** Reset all evidence for a new unit. Call at unit start. */
export function resetEvidence(): void {
  unitEvidence = [];
}

/** Get a read-only view of all evidence collected for the current unit. */
export function getEvidence(): readonly EvidenceEntry[] {
  return unitEvidence;
}

/** Get only bash evidence entries. */
export function getBashEvidence(): readonly BashEvidence[] {
  return unitEvidence.filter((e): e is BashEvidence => e.kind === "bash");
}

/** Get all file paths touched (write + edit), in recording order. */
export function getFilePaths(): string[] {
  return unitEvidence
    .filter((e): e is FileWriteEvidence | FileEditEvidence => e.kind === "write" || e.kind === "edit")
    .map(e => e.path);
}

// ─── Recording ──────────────────────────────────────────────────────────────

/**
 * Record a tool call at dispatch time (before execution).
 * Exit codes and output are filled in by recordToolResult after execution.
 * Tool names are matched case-insensitively, the same way recordToolResult
 * matches them; tools other than bash/write/edit are ignored.
 *
 * @param toolName - Name of the tool being dispatched
 * @param input - Tool input; `command` is read for bash, `file_path` (falling back to `path`) for write/edit
 * @param toolCallId - Optional id of the call when the caller knows it at dispatch time.
 *   Supplying it lets recordToolResult pair results by id instead of by recency,
 *   which matters when several calls of the same kind are in flight at once.
 */
export function recordToolCall(
  toolName: string,
  input: Record<string, unknown>,
  toolCallId = "",
): void {
  const timestamp = Date.now();
  switch (toolName.toLowerCase()) {
    case "bash":
      unitEvidence.push({
        kind: "bash",
        toolCallId,
        command: String(input.command ?? ""),
        exitCode: -1,
        outputSnippet: "",
        timestamp,
      });
      break;
    case "write":
      unitEvidence.push({
        kind: "write",
        toolCallId,
        path: String(input.file_path ?? input.path ?? ""),
        timestamp,
      });
      break;
    case "edit":
      unitEvidence.push({
        kind: "edit",
        toolCallId,
        path: String(input.file_path ?? input.path ?? ""),
        timestamp,
      });
      break;
    default:
      break;
  }
}

/**
 * Record a tool execution result.
 *
 * Pairing: an entry of the same kind already carrying `toolCallId` wins; otherwise
 * the most recent entry of that kind with no id yet is used (the only strategy
 * available when the call was recorded without an id). For bash entries the
 * output snippet (first 500 chars) and exit code are filled in; the exit code is
 * parsed from "Command exited with code N" when present, else 1 on error and 0 otherwise.
 *
 * @param toolCallId - Id of the completed call
 * @param toolName - Name of the tool (case-insensitive)
 * @param result - Raw tool result; text is extracted from a string, `content[].text`, or `text`
 * @param isError - Whether the tool reported an error
 */
export function recordToolResult(
  toolCallId: string,
  toolName: string,
  result: unknown,
  isError: boolean,
): void {
  const kind = toolName.toLowerCase();
  if (kind !== "bash" && kind !== "write" && kind !== "edit") return;

  const entry = findByCallId(kind, toolCallId) ?? findLastUnresolved(kind);
  if (!entry) return;

  entry.toolCallId = toolCallId;
  if (entry.kind !== "bash") return;

  const text = extractResultText(result);
  entry.outputSnippet = text.slice(0, 500);
  const exitMatch = text.match(/Command exited with code (\d+)/);
  entry.exitCode = exitMatch ? Number(exitMatch[1]) : (isError ? 1 : 0);
}

// ─── Internals ──────────────────────────────────────────────────────────────

/** Newest entry of `kind` that was recorded with exactly this (non-empty) call id. */
function findByCallId(kind: EvidenceEntry["kind"], toolCallId: string): EvidenceEntry | undefined {
  if (toolCallId === "") return undefined;
  for (let i = unitEvidence.length - 1; i >= 0; i--) {
    const entry = unitEvidence[i];
    if (entry.kind === kind && entry.toolCallId === toolCallId) return entry;
  }
  return undefined;
}

/** Newest entry of `kind` that has no call id yet (an empty id marks "unresolved"). */
function findLastUnresolved(kind: EvidenceEntry["kind"]): EvidenceEntry | undefined {
  for (let i = unitEvidence.length - 1; i >= 0; i--) {
    const entry = unitEvidence[i];
    if (entry.kind === kind && entry.toolCallId === "") return entry;
  }
  return undefined;
}

/**
 * Pull display text out of a tool result: the string itself, the first
 * `{ type: "text" }` block of `content`, a top-level `text` field, or
 * `String(result)` as a last resort (empty string for null/undefined).
 */
function extractResultText(result: unknown): string {
  if (typeof result === "string") return result;
  if (result && typeof result === "object") {
    const r = result as Record<string, unknown>;
    if (Array.isArray(r.content)) {
      const textBlock = r.content.find(
        (c: unknown) => typeof c === "object" && c !== null && (c as Record<string, unknown>).type === "text",
      ) as Record<string, unknown> | undefined;
      if (textBlock && typeof textBlock.text === "string") return textBlock.text;
    }
    if (typeof r.text === "string") return r.text;
  }
  return String(result ?? "");
}
// ─── Types ──────────────────────────────────────────────────────────────────

/** One verification claim made by the LLM: the command it says it ran and the outcome. */
export interface ClaimedEvidence {
  command: string;
  exitCode: number;
  verdict: string;
}

/** A claim that could not be reconciled with the recorded bash calls. */
export interface EvidenceMismatch {
  severity: "warning" | "error";
  claimed: ClaimedEvidence;
  actual: BashEvidence | null;
  reason: string;
}

// ─── Public API ─────────────────────────────────────────────────────────────

/**
 * Cross-reference claimed verification evidence against actual bash tool calls.
 *
 * Returns an array of mismatches. Empty array = no claim was contradicted.
 * Claims are skipped when they were coerced from strings (verdict contains
 * "coerced from string", or exitCode is -1) or when the trimmed command is
 * shorter than 3 characters.
 *
 * When a command was run more than once, the most recent run is the one
 * compared, so a failing run followed by a passing re-run does not contradict
 * a claimed success.
 *
 * @param claimedEvidence - Claims to check
 * @param actualEvidence - Recorded evidence; only `kind === "bash"` entries are considered
 * @returns One "warning" per claim with no matching call, one "error" per claim of
 *   exitCode 0 whose matched call has a non-zero exit code
 */
export function crossReferenceEvidence(
  claimedEvidence: readonly ClaimedEvidence[],
  actualEvidence: readonly EvidenceEntry[],
): EvidenceMismatch[] {
  const bashCalls = actualEvidence.filter(
    (e): e is BashEvidence => e.kind === "bash",
  );
  const mismatches: EvidenceMismatch[] = [];

  for (const claimed of claimedEvidence) {
    // Skip coerced entries — they are identified by verdict text or exitCode -1.
    if (claimed.verdict?.includes("coerced from string")) continue;
    if (claimed.exitCode === -1) continue;

    // Skip entries with empty or generic commands (whitespace does not count).
    if (!claimed.command || claimed.command.trim().length < 3) continue;

    const match = findBestMatch(claimed.command, bashCalls);

    if (!match) {
      mismatches.push({
        severity: "warning",
        claimed,
        actual: null,
        reason: `No bash tool call found matching "${claimed.command.slice(0, 80)}"`,
      });
      continue;
    }

    // Exit code mismatch: LLM claims success but actual command failed
    if (claimed.exitCode === 0 && match.exitCode !== 0) {
      mismatches.push({
        severity: "error",
        claimed,
        actual: match,
        reason: `Claimed exitCode=0 but actual exitCode=${match.exitCode}`,
      });
    }
  }

  return mismatches;
}

// ─── Internals ──────────────────────────────────────────────────────────────

/**
 * Find the best matching bash evidence entry for a claimed command.
 *
 * Tiers, each searched newest-first: exact (trimmed) equality, then substring
 * containment in either direction, then whitespace-token overlap of at least
 * 50% of the claim's tokens longer than 2 characters.
 *
 * Recorded calls with an empty command never match by substring: the empty
 * string is contained in every string, so it would "match" any claim.
 */
function findBestMatch(
  claimedCommand: string,
  bashCalls: readonly BashEvidence[],
): BashEvidence | null {
  const normalized = claimedCommand.trim();
  if (normalized === "") return null;

  // Latest run first, so a re-run supersedes earlier runs of the same command.
  const newestFirst = [...bashCalls].reverse();

  const exact = newestFirst.find(b => b.command.trim() === normalized);
  if (exact) return exact;

  const substring = newestFirst.find(b => {
    const actual = b.command.trim();
    return actual !== "" && (actual.includes(normalized) || normalized.includes(actual));
  });
  if (substring) return substring;

  // Token match: split on whitespace and check significant overlap
  const claimedTokens = normalized.split(/\s+/).filter(t => t.length > 2);
  if (claimedTokens.length === 0) return null;

  let bestMatch: BashEvidence | null = null;
  let bestScore = 0;

  for (const call of newestFirst) {
    const callTokens = new Set(call.command.split(/\s+/));
    const matchCount = claimedTokens.filter(t => callTokens.has(t)).length;
    const score = matchCount / claimedTokens.length;
    // Strict ">" while iterating newest-first keeps the newest call on ties.
    if (score > bestScore && score >= 0.5) {
      bestScore = score;
      bestMatch = call;
    }
  }

  return bestMatch;
}
+ * Compares actual git diff against the task plan's expected output files. + * + * Uses tasks.expected_output (DB column, populated from per-task ## Expected Output) + * and tasks.files (from slice PLAN.md - Files: subline) as the expected set. + * Compares against git diff HEAD~1 --name-only after auto-commit. + * + * Copyright (c) 2026 Jeremy McSpadden + */ + +import { execFileSync } from "node:child_process"; +import { logWarning } from "../workflow-logger.js"; + +// ─── Types ────────────────────────────────────────────────────────────────── + +export interface FileViolation { + severity: "info" | "warning"; + file: string; + reason: string; +} + +export interface FileChangeAudit { + expectedFiles: string[]; + actualFiles: string[]; + unexpectedFiles: string[]; + missingFiles: string[]; + violations: FileViolation[]; +} + +// ─── Public API ───────────────────────────────────────────────────────────── + +/** + * Validate file changes after auto-commit for an execute-task unit. + * Returns null if task data is unavailable or DB is not loaded. 
/**
 * Validate file changes after auto-commit for an execute-task unit.
 *
 * Expected paths are normalized (trimmed, backslashes converted to forward
 * slashes, one leading "./" and one leading "/" stripped) before anything else,
 * so blank entries do not count as a plan. The changed files reported for the
 * last commit are then compared against that set, ignoring anything under
 * `.gsd/`.
 *
 * @param basePath - Working directory (worktree or project root)
 * @param expectedOutput - Paths from the tasks.expected_output DB column
 * @param plannedFiles - Paths from the tasks.files DB column
 * @returns The audit, or null when no usable expected paths were supplied or
 *          the changed-file list could not be obtained.
 */
export function validateFileChanges(
  basePath: string,
  expectedOutput: string[],
  plannedFiles: string[],
): FileChangeAudit | null {
  const normalizePath = (path: string): string =>
    path.trim().replace(/\\/g, "/").replace(/^\.\//, "").replace(/^\//, "");

  // Normalize first: an entry that is empty after normalization is not a plan.
  const normalizedExpected = new Set<string>();
  for (const raw of [...expectedOutput, ...plannedFiles]) {
    const path = normalizePath(raw);
    if (path !== "") normalizedExpected.add(path);
  }

  // If no expected files were planned, skip validation
  if (normalizedExpected.size === 0) return null;

  // Get actual changed files from last commit
  const actualFiles = getChangedFilesFromLastCommit(basePath);
  if (!actualFiles) return null;

  // Filter out .gsd/ internal files — only validate project source files
  const projectFiles = actualFiles.filter(
    file => !file.startsWith(".gsd/") && !file.startsWith(".gsd\\"),
  );
  const changed = new Set(projectFiles);

  // Symmetric difference, using set lookups in both directions.
  const unexpectedFiles = projectFiles.filter(file => !normalizedExpected.has(file));
  const missingFiles = [...normalizedExpected].filter(file => !changed.has(file));

  const violations: FileViolation[] = [
    ...unexpectedFiles.map((file): FileViolation => ({
      severity: "warning",
      file,
      reason: "Modified but not in task plan's expected output",
    })),
    ...missingFiles.map((file): FileViolation => ({
      severity: "info",
      file,
      reason: "Listed in task plan but not modified",
    })),
  ];

  return {
    expectedFiles: [...normalizedExpected],
    actualFiles: projectFiles,
    unexpectedFiles,
    missingFiles,
    violations,
  };
}
["ignore", "pipe", "pipe"], encoding: "utf-8" }, + ).trim(); + return result ? result.split("\n").filter(Boolean) : []; + } catch (e) { + logWarning("safety", `git diff failed in file-change-validator: ${(e as Error).message}`); + return null; + } +} diff --git a/src/resources/extensions/gsd/safety/git-checkpoint.ts b/src/resources/extensions/gsd/safety/git-checkpoint.ts new file mode 100644 index 000000000..4f66b6dbb --- /dev/null +++ b/src/resources/extensions/gsd/safety/git-checkpoint.ts @@ -0,0 +1,106 @@ +/** + * Pre-unit git checkpoint and rollback for auto-mode safety harness. + * Uses the existing refs/gsd/ namespace (already pruned by doctor). + * + * Creates a lightweight ref at HEAD before unit execution. On failure, + * the ref can be used to rollback the branch to the pre-unit state. + * + * Copyright (c) 2026 Jeremy McSpadden + */ + +import { execFileSync } from "node:child_process"; +import { logWarning } from "../workflow-logger.js"; + +// ─── Constants ────────────────────────────────────────────────────────────── + +const CHECKPOINT_PREFIX = "refs/gsd/checkpoints/"; + +// ─── Public API ───────────────────────────────────────────────────────────── + +/** + * Create a checkpoint ref at the current HEAD for the given unit. + * Returns the SHA of HEAD, or null if the operation fails. 
/**
 * Turn a unit ID into a single path component that git accepts in a ref name.
 *
 * "/" still maps to "-", so IDs that only needed that replacement resolve to
 * the same ref as before. Characters and sequences that git rejects in ref
 * names (control characters, space, ~ ^ : ? * [ \, "@{", "..", a leading or
 * trailing ".", a trailing ".lock") are replaced as well, so such IDs no longer
 * make `git update-ref` fail.
 */
function toSafeRefComponent(unitId: string): string {
  const safe = unitId
    .replace(/[\x00-\x20\x7f~^:?*[\\\/]/g, "-")
    .replace(/@\{/g, "-{")
    .replace(/\.{2,}/g, "-")
    .replace(/^\./, "-")
    .replace(/\.$/, "-")
    .replace(/\.lock$/, "-lock");
  // An empty component would leave the ref name ending in "/".
  return safe === "" ? "_" : safe;
}

/** Extract a printable message from a caught value of unknown type. */
function describeError(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}

/**
 * Create a checkpoint ref at the current HEAD for the given unit.
 *
 * @param basePath - Directory in which git is run.
 * @param unitId - Unit identifier; sanitized before use in the ref name.
 * @returns The SHA of HEAD, or null if the operation fails.
 */
export function createCheckpoint(basePath: string, unitId: string): string | null {
  try {
    const sha = execFileSync("git", ["rev-parse", "HEAD"], {
      cwd: basePath,
      stdio: ["ignore", "pipe", "pipe"],
      encoding: "utf-8",
    }).trim();

    if (!sha || sha.length < 7) return null;

    execFileSync("git", ["update-ref", `${CHECKPOINT_PREFIX}${toSafeRefComponent(unitId)}`, sha], {
      cwd: basePath,
      stdio: ["ignore", "pipe", "pipe"],
    });

    return sha;
  } catch (e) {
    logWarning("safety", `checkpoint creation failed: ${describeError(e)}`);
    return null;
  }
}

/**
 * Rollback the current branch to a checkpoint SHA.
 *
 * WARNING: This is a destructive operation — it discards all changes
 * since the checkpoint. Only call when the user has opted in via
 * safety_harness.auto_rollback or an explicit manual trigger.
 *
 * @param basePath - Directory in which git is run.
 * @param unitId - Unit whose checkpoint ref is removed after a successful reset.
 * @param sha - Commit to reset to; must be 7 to 64 hexadecimal characters.
 * @returns true on success, false on failure (including detached HEAD or a
 *          value for `sha` that is not a hexadecimal object name).
 */
export function rollbackToCheckpoint(
  basePath: string,
  unitId: string,
  sha: string,
): boolean {
  try {
    // Refuse anything that is not a plain object name, so the value can never
    // be read by git as an option (for example a string starting with "-").
    if (!/^[0-9a-f]{7,64}$/i.test(sha)) {
      logWarning("safety", "rollback: invalid checkpoint SHA, cannot rollback");
      return false;
    }

    // Get current branch name
    const branch = execFileSync("git", ["rev-parse", "--abbrev-ref", "HEAD"], {
      cwd: basePath,
      stdio: ["ignore", "pipe", "pipe"],
      encoding: "utf-8",
    }).trim();

    if (!branch || branch === "HEAD") {
      logWarning("safety", "rollback: detached HEAD state, cannot rollback");
      return false;
    }

    // Reset branch pointer and working tree to the checkpoint in one step.
    // `git reset --hard` works on the currently checked-out branch
    // (unlike `git branch -f`, which is rejected for checked-out branches).
    execFileSync("git", ["reset", "--hard", sha], {
      cwd: basePath,
      stdio: ["ignore", "pipe", "pipe"],
    });

    // Cleanup checkpoint ref
    cleanupCheckpoint(basePath, unitId);

    return true;
  } catch (e) {
    logWarning("safety", `rollback failed: ${describeError(e)}`);
    return false;
  }
}

/**
 * Remove a checkpoint ref after successful unit completion.
 * Failures are ignored: the ref may already have been cleaned up.
 *
 * @param basePath - Directory in which git is run.
 * @param unitId - Unit identifier; sanitized the same way as at creation.
 */
export function cleanupCheckpoint(basePath: string, unitId: string): void {
  try {
    execFileSync("git", ["update-ref", "-d", `${CHECKPOINT_PREFIX}${toSafeRefComponent(unitId)}`], {
      cwd: basePath,
      stdio: ["ignore", "pipe", "pipe"],
    });
  } catch {
    // Non-fatal — ref may already have been cleaned up
  }
}
/** Resolved safety harness settings; every field is always present. */
export interface SafetyHarnessConfig {
  enabled: boolean;
  evidence_collection: boolean;
  file_change_validation: boolean;
  evidence_cross_reference: boolean;
  destructive_command_warnings: boolean;
  content_validation: boolean;
  checkpoints: boolean;
  auto_rollback: boolean;
  timeout_scale_cap: number;
}

/** Values used for any field that is missing or has the wrong type. */
const DEFAULTS: SafetyHarnessConfig = {
  enabled: true,
  evidence_collection: true,
  file_change_validation: true,
  evidence_cross_reference: true,
  destructive_command_warnings: true,
  content_validation: true,
  checkpoints: true,
  auto_rollback: false,
  timeout_scale_cap: 6,
};

/** Return `value` when it is a boolean, otherwise `fallback`. */
function booleanOr(value: unknown, fallback: boolean): boolean {
  return typeof value === "boolean" ? value : fallback;
}

/**
 * Resolve safety harness configuration from raw preferences.
 * Missing or wrongly-typed fields fall back to defaults.
 *
 * @param raw - Untyped preference object, or undefined when none was supplied.
 * @returns A new config object; the defaults object itself is never returned.
 */
export function resolveSafetyHarnessConfig(
  raw: Record<string, unknown> | undefined,
): SafetyHarnessConfig {
  if (!raw) return { ...DEFAULTS };

  const cap = raw.timeout_scale_cap;

  return {
    enabled: booleanOr(raw.enabled, DEFAULTS.enabled),
    evidence_collection: booleanOr(raw.evidence_collection, DEFAULTS.evidence_collection),
    file_change_validation: booleanOr(raw.file_change_validation, DEFAULTS.file_change_validation),
    evidence_cross_reference: booleanOr(raw.evidence_cross_reference, DEFAULTS.evidence_cross_reference),
    destructive_command_warnings: booleanOr(raw.destructive_command_warnings, DEFAULTS.destructive_command_warnings),
    content_validation: booleanOr(raw.content_validation, DEFAULTS.content_validation),
    checkpoints: booleanOr(raw.checkpoints, DEFAULTS.checkpoints),
    auto_rollback: booleanOr(raw.auto_rollback, DEFAULTS.auto_rollback),
    // typeof NaN is "number", so NaN has to be rejected explicitly.
    timeout_scale_cap:
      typeof cap === "number" && !Number.isNaN(cap) ? cap : DEFAULTS.timeout_scale_cap,
  };
}

/**
 * Check if the safety harness is enabled.
 * Used as a fast gate at hook registration and phase integration points.
 *
 * @param raw - Untyped preference object, or undefined when none was supplied.
 * @returns `raw.enabled` when it is a boolean, otherwise the default.
 */
export function isHarnessEnabled(
  raw: Record<string, unknown> | undefined,
): boolean {
  return booleanOr(raw?.enabled, DEFAULTS.enabled);
}
"./content-validator.js"; diff --git a/src/resources/extensions/gsd/tests/git-checkpoint.test.ts b/src/resources/extensions/gsd/tests/git-checkpoint.test.ts new file mode 100644 index 000000000..33cd3829f --- /dev/null +++ b/src/resources/extensions/gsd/tests/git-checkpoint.test.ts @@ -0,0 +1,94 @@ +// GSD2 — Regression tests for git-checkpoint rollback (#3576) +// Copyright (c) 2026 Jeremy McSpadden + +import { describe, it } from "node:test"; +import assert from "node:assert/strict"; +import { mkdtempSync, writeFileSync, rmSync } from "node:fs"; +import { join } from "node:path"; +import { tmpdir } from "node:os"; +import { execFileSync } from "node:child_process"; +import { createCheckpoint, rollbackToCheckpoint, cleanupCheckpoint } from "../safety/git-checkpoint.js"; + +function git(args: string[], cwd: string): string { + return execFileSync("git", args, { cwd, stdio: ["ignore", "pipe", "pipe"], encoding: "utf-8" }).trim(); +} + +function createTempRepo(): string { + const dir = mkdtempSync(join(tmpdir(), "ckpt-test-")); + git(["init"], dir); + git(["config", "user.email", "test@test.com"], dir); + git(["config", "user.name", "Test"], dir); + writeFileSync(join(dir, "file.txt"), "initial\n"); + git(["add", "."], dir); + git(["commit", "-m", "init"], dir); + git(["branch", "-M", "main"], dir); + return dir; +} + +describe("git-checkpoint rollback", () => { + it("rolls back to checkpoint on checked-out branch", (t) => { + const repo = createTempRepo(); + t.after(() => rmSync(repo, { recursive: true, force: true })); + + // Create checkpoint at initial commit + const sha = createCheckpoint(repo, "unit-1"); + assert.ok(sha, "checkpoint should return a SHA"); + + // Make a second commit + writeFileSync(join(repo, "file.txt"), "modified\n"); + git(["add", "."], repo); + git(["commit", "-m", "second"], repo); + + const headBefore = git(["rev-parse", "HEAD"], repo); + assert.notEqual(headBefore, sha, "HEAD should have advanced"); + + // Rollback — this must work on 
the checked-out branch + const result = rollbackToCheckpoint(repo, "unit-1", sha); + assert.equal(result, true, "rollback should succeed"); + + const headAfter = git(["rev-parse", "HEAD"], repo); + assert.equal(headAfter, sha, "HEAD should match checkpoint SHA after rollback"); + }); + + it("returns false on detached HEAD", (t) => { + const repo = createTempRepo(); + t.after(() => rmSync(repo, { recursive: true, force: true })); + + const sha = git(["rev-parse", "HEAD"], repo); + git(["checkout", "--detach", sha], repo); + + const result = rollbackToCheckpoint(repo, "unit-2", sha); + assert.equal(result, false, "rollback should fail on detached HEAD"); + }); + + it("cleans up checkpoint ref after rollback", (t) => { + const repo = createTempRepo(); + t.after(() => rmSync(repo, { recursive: true, force: true })); + + const sha = createCheckpoint(repo, "unit-3"); + assert.ok(sha); + + // Ref should exist + const refBefore = git(["for-each-ref", "refs/gsd/checkpoints/unit-3", "--format=%(objectname)"], repo); + assert.equal(refBefore, sha); + + rollbackToCheckpoint(repo, "unit-3", sha); + + // Ref should be cleaned up + const refAfter = git(["for-each-ref", "refs/gsd/checkpoints/unit-3", "--format=%(objectname)"], repo); + assert.equal(refAfter, "", "checkpoint ref should be removed after rollback"); + }); + + it("cleanupCheckpoint removes the ref without error", (t) => { + const repo = createTempRepo(); + t.after(() => rmSync(repo, { recursive: true, force: true })); + + const sha = createCheckpoint(repo, "unit-4"); + assert.ok(sha); + + cleanupCheckpoint(repo, "unit-4"); + + const ref = git(["for-each-ref", "refs/gsd/checkpoints/unit-4", "--format=%(objectname)"], repo); + assert.equal(ref, "", "ref should be gone"); + }); +}); diff --git a/src/resources/extensions/gsd/workflow-logger.ts b/src/resources/extensions/gsd/workflow-logger.ts index 1f0a7d163..3e135ab5c 100644 --- a/src/resources/extensions/gsd/workflow-logger.ts +++ 
b/src/resources/extensions/gsd/workflow-logger.ts @@ -48,7 +48,8 @@ export type LogComponent = | "bootstrap" // Extension bootstrap (system-context, agent-end) | "guided" // Guided flow (discuss, plan wizards) | "registry" // Rule registry hook state - | "renderer"; // Markdown renderer and projections + | "renderer" // Markdown renderer and projections + | "safety"; // LLM safety harness export interface LogEntry { ts: string; diff --git a/src/resources/extensions/ollama/ollama-chat-provider.ts b/src/resources/extensions/ollama/ollama-chat-provider.ts index f02361622..81e1de6f4 100644 --- a/src/resources/extensions/ollama/ollama-chat-provider.ts +++ b/src/resources/extensions/ollama/ollama-chat-provider.ts @@ -149,7 +149,7 @@ export function streamOllamaChat( // Handle text content — process independently of tool_calls // (a chunk may contain both content and tool_calls) const content = chunk.message?.content ?? ""; - if (content && !chunk.done) { + if (content) { if (thinkParser) { processChunks(thinkParser.push(content)); } else { diff --git a/src/resources/extensions/ollama/tests/ollama-chat-provider-stream.test.ts b/src/resources/extensions/ollama/tests/ollama-chat-provider-stream.test.ts new file mode 100644 index 000000000..bc3982c6e --- /dev/null +++ b/src/resources/extensions/ollama/tests/ollama-chat-provider-stream.test.ts @@ -0,0 +1,82 @@ +// GSD2 — Regression test: Ollama streaming must not drop content on done:true chunks (#3576) +// Copyright (c) 2026 Jeremy McSpadden + +import { describe, it } from "node:test"; +import assert from "node:assert/strict"; + +/** + * This test validates the streaming logic pattern used in ollama-chat-provider.ts. + * The bug: content on the terminal done:true chunk was silently dropped because + * the stream loop only emitted content when `!chunk.done`. + * + * The fix: process chunk.message.content regardless of chunk.done, then handle + * done metadata. This test exercises that logic path with a simulated chunk stream. 
/** Minimal shape of one streamed chunk, limited to the fields this file declares. */
interface OllamaChunk {
  done: boolean;
  done_reason?: string;
  message?: { content?: string; tool_calls?: unknown[] };
  prompt_eval_count?: number;
  eval_count?: number;
}

/**
 * Concatenate the text content of a chunk sequence.
 *
 * Content is collected from every chunk up to and including the first one
 * marked `done`; chunks after that are not read.
 */
function simulateStreamLoop(chunks: OllamaChunk[]): string {
  const pieces: string[] = [];

  for (const { done, message } of chunks) {
    // Collect content before looking at `done`, so a terminal chunk's text is kept.
    const text = message?.content;
    if (text) {
      pieces.push(text);
    }

    if (done) {
      break;
    }
  }

  return pieces.join("");
}