feat(ollama): native /api/chat provider with full option exposure

Replace the OpenAI-compat shim with a native Ollama /api/chat streaming
provider that exposes all commonly used Ollama options and surfaces
inference performance metrics.

Key changes:
- Native NDJSON streaming from /api/chat (no more OpenAI shim)
- Known models send num_ctx from the capability table; unknown models defer
  to Ollama's default to avoid OOM on constrained hosts
- Exposes top_p, top_k, repeat_penalty, seed, num_gpu, keep_alive, and num_ctx
  via per-model providerOptions (example below); temperature and num_predict
  map from the standard stream options (temperature, maxTokens)
- Extracts <think>...</think> blocks for reasoning models (deepseek-r1, qwq)
- Surfaces InferenceMetrics (tokens/sec, durations) on AssistantMessage
- Adds remove and show actions to ollama_manage LLM tool
- Adds "ollama-chat" to KnownApi, providerOptions to Model<TApi>
- NDJSON parser uses strict mode for chat (fails on malformed frames)
- Mixed content+tool_call chunks handled independently
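
Example per-model providerOptions entry (illustrative values; keys follow the
new OllamaChatOptions type in types.ts):

  providerOptions: {
    num_ctx: 32768,        // explicit context window; only sent when set
    keep_alive: "10m",     // keep the model resident between requests
    num_gpu: -1,           // offload all layers to the GPU
    top_p: 0.9,
    repeat_penalty: 1.1,
    seed: 42
  }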

Closes #3544
Jeremy 2026-04-05 08:49:48 -05:00
parent dcf41154b8
commit 4ba2d5a219
13 changed files with 832 additions and 112 deletions


@@ -13,7 +13,8 @@ export type KnownApi =
| "bedrock-converse-stream"
| "google-generative-ai"
| "google-gemini-cli"
| "google-vertex";
| "google-vertex"
| "ollama-chat";
export type Api = KnownApi | (string & {});
@@ -212,9 +213,23 @@ export interface AssistantMessage {
errorMessage?: string;
/** Server-requested retry delay in milliseconds (from Retry-After or rate limit headers). */
retryAfterMs?: number;
/** Provider inference performance metrics (e.g. tokens/sec from local models). */
inferenceMetrics?: InferenceMetrics;
timestamp: number; // Unix timestamp in milliseconds
}
/** Inference performance metrics reported by providers that support it (e.g. Ollama). */
export interface InferenceMetrics {
/** Tokens generated per second during eval phase. */
tokensPerSecond: number;
/** Wall-clock duration of the full request in milliseconds. */
totalDurationMs: number;
/** Duration of the eval (generation) phase in milliseconds. */
evalDurationMs: number;
/** Duration of the prompt eval phase in milliseconds. */
promptEvalDurationMs: number;
}
export interface ToolResultMessage<TDetails = any> {
role: "toolResult";
toolCallId: string;
@@ -374,4 +389,6 @@ export interface Model<TApi extends Api> {
* Read these fields instead of pattern-matching on model IDs or provider names.
*/
capabilities?: ModelCapabilities;
/** Opaque provider-specific options. Cast to the appropriate type in the provider's stream handler. */
providerOptions?: Record<string, unknown>;
}


@@ -1341,6 +1341,8 @@ export interface ProviderModelConfig {
headers?: Record<string, string>;
/** OpenAI compatibility settings. */
compat?: Model<Api>["compat"];
/** Opaque provider-specific options (e.g. Ollama keep_alive, num_gpu). */
providerOptions?: Record<string, unknown>;
}
/** Extension factory function type. Supports both sync and async initialization. */


@@ -742,6 +742,7 @@ export class ModelRegistry {
maxTokens: modelDef.maxTokens,
headers,
compat: modelDef.compat,
providerOptions: modelDef.providerOptions,
} as Model<Api>);
}
@@ -917,5 +918,6 @@ export interface ProviderConfigInput {
maxTokens: number;
headers?: Record<string, string>;
compat?: Model<Api>["compat"];
providerOptions?: Record<string, unknown>;
}>;
}


@@ -17,19 +17,10 @@
*/
import { importExtensionModule, type ExtensionAPI } from "@gsd/pi-coding-agent";
import type { OpenAICompletionsCompat } from "@gsd/pi-ai";
import * as client from "./ollama-client.js";
import { discoverModels, getOllamaOpenAIBaseUrl } from "./ollama-discovery.js";
import { discoverModels } from "./ollama-discovery.js";
import { registerOllamaCommands } from "./ollama-commands.js";
/** Default compat settings for Ollama models via OpenAI-compat endpoint */
const OLLAMA_COMPAT: OpenAICompletionsCompat = {
supportsDeveloperRole: false,
supportsReasoningEffort: false,
supportsUsageInStreaming: false,
maxTokensField: "max_tokens",
supportsStore: false,
};
import { streamOllamaChat } from "./ollama-chat-provider.js";
let toolsPromise: Promise<void> | null = null;
@@ -68,12 +59,13 @@ async function probeAndRegister(pi: ExtensionAPI): Promise<boolean> {
const models = await discoverModels();
if (models.length === 0) return true; // Running but no models pulled
const baseUrl = getOllamaOpenAIBaseUrl();
const baseUrl = client.getOllamaHost();
pi.registerProvider("ollama", {
authMode: "none",
baseUrl,
api: "openai-completions",
api: "ollama-chat",
streamSimple: streamOllamaChat,
isReady: () => true,
models: models.map((m) => ({
id: m.id,
@@ -83,7 +75,7 @@ async function probeAndRegister(pi: ExtensionAPI): Promise<boolean> {
cost: m.cost,
contextWindow: m.contextWindow,
maxTokens: m.maxTokens,
compat: OLLAMA_COMPAT,
providerOptions: (m.ollamaOptions ?? {}) as Record<string, unknown>,
})),
});


@@ -8,11 +8,15 @@
* Fallback: estimate from parameter count if model isn't in the table.
*/
import type { OllamaChatOptions } from "./types.js";
export interface ModelCapability {
contextWindow?: number;
maxTokens?: number;
input?: ("text" | "image")[];
reasoning?: boolean;
/** Ollama-specific default inference options for this model family. */
ollamaOptions?: OllamaChatOptions;
}
/**
@@ -20,58 +24,61 @@ export interface ModelCapability {
* Keys are matched as prefixes against the model name (before the colon/tag).
* More specific entries should appear first.
*/
// Note: ollamaOptions.num_ctx is set for known model families where the context
// window is authoritative. For unknown/estimated models, num_ctx is NOT sent
// to avoid OOM risk — Ollama uses its own safe default instead.
const KNOWN_MODELS: Array<[pattern: string, caps: ModelCapability]> = [
// ─── Reasoning models ───────────────────────────────────────────────
["deepseek-r1", { contextWindow: 131072, reasoning: true }],
["qwq", { contextWindow: 131072, reasoning: true }],
["deepseek-r1", { contextWindow: 131072, reasoning: true, ollamaOptions: { num_ctx: 131072 } }],
["qwq", { contextWindow: 131072, reasoning: true, ollamaOptions: { num_ctx: 131072 } }],
// ─── Vision models ──────────────────────────────────────────────────
["llava", { contextWindow: 4096, input: ["text", "image"] }],
["bakllava", { contextWindow: 4096, input: ["text", "image"] }],
["moondream", { contextWindow: 8192, input: ["text", "image"] }],
["llama3.2-vision", { contextWindow: 131072, input: ["text", "image"] }],
["minicpm-v", { contextWindow: 4096, input: ["text", "image"] }],
["llava", { contextWindow: 4096, input: ["text", "image"], ollamaOptions: { num_ctx: 4096 } }],
["bakllava", { contextWindow: 4096, input: ["text", "image"], ollamaOptions: { num_ctx: 4096 } }],
["moondream", { contextWindow: 8192, input: ["text", "image"], ollamaOptions: { num_ctx: 8192 } }],
["llama3.2-vision", { contextWindow: 131072, input: ["text", "image"], ollamaOptions: { num_ctx: 131072 } }],
["minicpm-v", { contextWindow: 4096, input: ["text", "image"], ollamaOptions: { num_ctx: 4096 } }],
// ─── Code models ────────────────────────────────────────────────────
["codestral", { contextWindow: 262144, maxTokens: 32768 }],
["qwen2.5-coder", { contextWindow: 131072, maxTokens: 32768 }],
["deepseek-coder-v2", { contextWindow: 131072, maxTokens: 16384 }],
["starcoder2", { contextWindow: 16384, maxTokens: 8192 }],
["codegemma", { contextWindow: 8192, maxTokens: 8192 }],
["codellama", { contextWindow: 16384, maxTokens: 8192 }],
["devstral", { contextWindow: 131072, maxTokens: 32768 }],
["codestral", { contextWindow: 262144, maxTokens: 32768, ollamaOptions: { num_ctx: 262144 } }],
["qwen2.5-coder", { contextWindow: 131072, maxTokens: 32768, ollamaOptions: { num_ctx: 131072 } }],
["deepseek-coder-v2", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
["starcoder2", { contextWindow: 16384, maxTokens: 8192, ollamaOptions: { num_ctx: 16384 } }],
["codegemma", { contextWindow: 8192, maxTokens: 8192, ollamaOptions: { num_ctx: 8192 } }],
["codellama", { contextWindow: 16384, maxTokens: 8192, ollamaOptions: { num_ctx: 16384 } }],
["devstral", { contextWindow: 131072, maxTokens: 32768, ollamaOptions: { num_ctx: 131072 } }],
// ─── Llama family ───────────────────────────────────────────────────
["llama3.3", { contextWindow: 131072, maxTokens: 16384 }],
["llama3.2", { contextWindow: 131072, maxTokens: 16384 }],
["llama3.1", { contextWindow: 131072, maxTokens: 16384 }],
["llama3", { contextWindow: 8192, maxTokens: 8192 }],
["llama2", { contextWindow: 4096, maxTokens: 4096 }],
["llama3.3", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
["llama3.2", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
["llama3.1", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
["llama3", { contextWindow: 8192, maxTokens: 8192, ollamaOptions: { num_ctx: 8192 } }],
["llama2", { contextWindow: 4096, maxTokens: 4096, ollamaOptions: { num_ctx: 4096 } }],
// ─── Qwen family ────────────────────────────────────────────────────
["qwen3", { contextWindow: 131072, maxTokens: 32768 }],
["qwen2.5", { contextWindow: 131072, maxTokens: 32768 }],
["qwen2", { contextWindow: 131072, maxTokens: 32768 }],
["qwen3", { contextWindow: 131072, maxTokens: 32768, ollamaOptions: { num_ctx: 131072 } }],
["qwen2.5", { contextWindow: 131072, maxTokens: 32768, ollamaOptions: { num_ctx: 131072 } }],
["qwen2", { contextWindow: 131072, maxTokens: 32768, ollamaOptions: { num_ctx: 131072 } }],
// ─── Gemma family ───────────────────────────────────────────────────
["gemma3", { contextWindow: 131072, maxTokens: 16384 }],
["gemma2", { contextWindow: 8192, maxTokens: 8192 }],
["gemma3", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
["gemma2", { contextWindow: 8192, maxTokens: 8192, ollamaOptions: { num_ctx: 8192 } }],
// ─── Mistral family ─────────────────────────────────────────────────
["mistral-large", { contextWindow: 131072, maxTokens: 16384 }],
["mistral-small", { contextWindow: 131072, maxTokens: 16384 }],
["mistral-nemo", { contextWindow: 131072, maxTokens: 16384 }],
["mistral", { contextWindow: 32768, maxTokens: 8192 }],
["mixtral", { contextWindow: 32768, maxTokens: 8192 }],
["mistral-large", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
["mistral-small", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
["mistral-nemo", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
["mistral", { contextWindow: 32768, maxTokens: 8192, ollamaOptions: { num_ctx: 32768 } }],
["mixtral", { contextWindow: 32768, maxTokens: 8192, ollamaOptions: { num_ctx: 32768 } }],
// ─── Phi family ─────────────────────────────────────────────────────
["phi4", { contextWindow: 16384, maxTokens: 16384 }],
["phi3.5", { contextWindow: 131072, maxTokens: 16384 }],
["phi3", { contextWindow: 131072, maxTokens: 4096 }],
["phi4", { contextWindow: 16384, maxTokens: 16384, ollamaOptions: { num_ctx: 16384 } }],
["phi3.5", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
["phi3", { contextWindow: 131072, maxTokens: 4096, ollamaOptions: { num_ctx: 131072 } }],
// ─── Command R ──────────────────────────────────────────────────────
["command-r-plus", { contextWindow: 131072, maxTokens: 16384 }],
["command-r", { contextWindow: 131072, maxTokens: 16384 }],
["command-r-plus", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
["command-r", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
];
/**


@@ -0,0 +1,63 @@
// GSD2 — Ollama Extension: NDJSON streaming parser
/**
* Parses a streaming NDJSON (newline-delimited JSON) response body into
* typed objects. Used for Ollama's /api/chat and /api/pull endpoints.
*
* @param strict When true, malformed JSON lines throw instead of being skipped.
* Use strict mode for inference streams where silent data loss is unacceptable.
* Use permissive mode (default) for progress endpoints like /api/pull.
*/
export async function* parseNDJsonStream<T>(
body: ReadableStream<Uint8Array>,
signal?: AbortSignal,
strict = false,
): AsyncGenerator<T> {
const reader = body.getReader();
const decoder = new TextDecoder();
let buffer = "";
try {
while (true) {
if (signal?.aborted) break;
const { done, value } = await reader.read();
if (done) break;
buffer += decoder.decode(value, { stream: true });
const lines = buffer.split("\n");
buffer = lines.pop() ?? "";
for (const line of lines) {
const trimmed = line.trim();
if (!trimmed) continue;
try {
yield JSON.parse(trimmed) as T;
} catch (err) {
if (strict) {
throw new Error(
`Malformed NDJSON line from Ollama: ${trimmed.slice(0, 200)}`,
);
}
// Permissive mode: skip malformed lines
}
}
}
// Flush remaining buffer (skip if aborted)
if (buffer.trim() && !signal?.aborted) {
try {
yield JSON.parse(buffer.trim()) as T;
} catch (err) {
if (strict) {
throw new Error(
`Malformed NDJSON line from Ollama: ${buffer.trim().slice(0, 200)}`,
);
}
}
}
} finally {
reader.releaseLock();
}
}
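
// Usage sketch (illustrative; not part of this change): permissive mode suits
// progress endpoints like /api/pull, while strict mode is reserved for inference
// streams where a silently dropped frame would mean data loss.
async function exampleChatStream(body: ReadableStream<Uint8Array>, signal?: AbortSignal) {
  for await (const chunk of parseNDJsonStream<{ done: boolean }>(body, signal, /* strict */ true)) {
    if (chunk.done) break; // the final frame carries done_reason and timing fields
  }
}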


@@ -0,0 +1,459 @@
// GSD2 — Ollama Extension: Native /api/chat stream provider
/**
* Implements the "ollama-chat" API provider, streaming responses directly
* from Ollama's native /api/chat endpoint instead of the OpenAI compatibility
* shim. This exposes Ollama-specific options (num_ctx, keep_alive, num_gpu,
* sampling parameters) and surfaces inference performance metrics.
*/
import {
type Api,
type AssistantMessage,
type AssistantMessageEvent,
type AssistantMessageEventStream,
type Context,
type ImageContent,
type InferenceMetrics,
type Message,
type Model,
type SimpleStreamOptions,
type StopReason,
type TextContent,
type ThinkingContent,
type Tool,
type ToolCall,
type Usage,
EventStream,
} from "@gsd/pi-ai";
import { chat } from "./ollama-client.js";
import type {
OllamaChatMessage,
OllamaChatOptions,
OllamaChatRequest,
OllamaChatResponse,
OllamaTool,
OllamaToolCall,
} from "./types.js";
import { ThinkingTagParser, type ParsedChunk } from "./thinking-parser.js";
/** Create an AssistantMessageEventStream using the base EventStream class. */
function createStream(): AssistantMessageEventStream {
return new EventStream<AssistantMessageEvent, AssistantMessage>(
(event) => event.type === "done" || event.type === "error",
(event) => {
if (event.type === "done") return event.message;
if (event.type === "error") return event.error;
throw new Error("Unexpected event type for final result");
},
) as AssistantMessageEventStream;
}
// ─── Stream handler ─────────────────────────────────────────────────────────
export function streamOllamaChat(
model: Model<Api>,
context: Context,
options?: SimpleStreamOptions,
): AssistantMessageEventStream {
const stream = createStream();
(async () => {
const output = buildInitialOutput(model);
try {
const request = buildRequest(model, context, options);
stream.push({ type: "start", partial: output });
const useThinkingParser = model.reasoning;
const thinkParser = useThinkingParser ? new ThinkingTagParser() : null;
let contentIndex = -1;
let currentBlockType: "text" | "thinking" | null = null;
function startBlock(type: "text" | "thinking") {
contentIndex++;
currentBlockType = type;
if (type === "text") {
output.content.push({ type: "text", text: "" });
stream.push({ type: "text_start", contentIndex, partial: output });
} else {
output.content.push({ type: "thinking", thinking: "" });
stream.push({ type: "thinking_start", contentIndex, partial: output });
}
}
function endBlock() {
if (currentBlockType === null) return;
if (currentBlockType === "text") {
const block = output.content[contentIndex] as TextContent;
stream.push({ type: "text_end", contentIndex, content: block.text, partial: output });
} else {
const block = output.content[contentIndex] as ThinkingContent;
stream.push({ type: "thinking_end", contentIndex, content: block.thinking, partial: output });
}
currentBlockType = null;
}
function emitDelta(type: "text" | "thinking", text: string) {
if (!text) return;
if (currentBlockType !== type) {
endBlock();
startBlock(type);
}
if (type === "text") {
(output.content[contentIndex] as TextContent).text += text;
stream.push({ type: "text_delta", contentIndex, delta: text, partial: output });
} else {
(output.content[contentIndex] as ThinkingContent).thinking += text;
stream.push({ type: "thinking_delta", contentIndex, delta: text, partial: output });
}
}
function processChunks(chunks: ParsedChunk[]) {
for (const chunk of chunks) {
emitDelta(chunk.type, chunk.text);
}
}
function processToolCalls(toolCalls: OllamaToolCall[]) {
endBlock();
for (const tc of toolCalls) {
contentIndex++;
const toolCall: ToolCall = {
type: "toolCall",
id: `ollama_tc_${contentIndex}`,
name: tc.function.name,
arguments: tc.function.arguments,
};
output.content.push(toolCall);
stream.push({ type: "toolcall_start", contentIndex, partial: output });
// Emit a delta with the serialized arguments (convention: start/delta/end)
stream.push({
type: "toolcall_delta",
contentIndex,
delta: JSON.stringify(tc.function.arguments),
partial: output,
});
stream.push({
type: "toolcall_end",
contentIndex,
toolCall,
partial: output,
});
}
output.stopReason = "toolUse";
}
for await (const chunk of chat(request, options?.signal)) {
// Handle text content — process independently of tool_calls
// (a chunk may contain both content and tool_calls)
const content = chunk.message?.content ?? "";
if (content && !chunk.done) {
if (thinkParser) {
processChunks(thinkParser.push(content));
} else {
emitDelta("text", content);
}
}
// Handle tool calls (Ollama sends them complete, may be on done:true chunk)
if (chunk.message?.tool_calls?.length) {
processToolCalls(chunk.message.tool_calls);
}
if (chunk.done) {
// Final chunk — extract metrics and usage
if (thinkParser) processChunks(thinkParser.flush());
endBlock();
output.usage = buildUsage(chunk);
output.inferenceMetrics = extractMetrics(chunk);
// Preserve "toolUse" if tool calls were processed
if (output.stopReason !== "toolUse") {
output.stopReason = mapStopReason(chunk.done_reason);
}
break;
}
}
assertStreamSuccess(output, options?.signal);
finalizeStream(stream, output);
} catch (error) {
handleStreamError(stream, output, error, options?.signal);
}
})();
return stream;
}
// ─── Request building ───────────────────────────────────────────────────────
function buildRequest(
model: Model<Api>,
context: Context,
options?: SimpleStreamOptions,
): OllamaChatRequest {
const ollamaOpts = (model.providerOptions ?? {}) as OllamaChatOptions;
const request: OllamaChatRequest = {
model: model.id,
messages: convertMessages(context),
stream: true,
};
// Build options block with all Ollama-specific parameters
const reqOptions: NonNullable<OllamaChatRequest["options"]> = {};
// Context window — only sent when explicitly configured via providerOptions.
// Sending inferred/estimated values risks OOM on constrained hosts.
// Users can set num_ctx per-model in models.json ollamaOptions or the
// capability table can provide it for known model families.
if (ollamaOpts.num_ctx !== undefined && ollamaOpts.num_ctx > 0) {
reqOptions.num_ctx = ollamaOpts.num_ctx;
}
// Max output tokens
const maxTokens = options?.maxTokens ?? model.maxTokens;
if (maxTokens > 0) {
reqOptions.num_predict = maxTokens;
}
// Temperature
if (options?.temperature !== undefined) {
reqOptions.temperature = options.temperature;
}
// Per-model sampling options from providerOptions
if (ollamaOpts.top_p !== undefined) reqOptions.top_p = ollamaOpts.top_p;
if (ollamaOpts.top_k !== undefined) reqOptions.top_k = ollamaOpts.top_k;
if (ollamaOpts.repeat_penalty !== undefined) reqOptions.repeat_penalty = ollamaOpts.repeat_penalty;
if (ollamaOpts.seed !== undefined) reqOptions.seed = ollamaOpts.seed;
if (ollamaOpts.num_gpu !== undefined) reqOptions.num_gpu = ollamaOpts.num_gpu;
if (Object.keys(reqOptions).length > 0) {
request.options = reqOptions;
}
// Keep alive
if (ollamaOpts.keep_alive !== undefined) {
request.keep_alive = ollamaOpts.keep_alive;
}
// Tools
if (context.tools?.length) {
request.tools = convertTools(context.tools);
}
return request;
}
// ─── Message conversion ─────────────────────────────────────────────────────
function convertMessages(context: Context): OllamaChatMessage[] {
const messages: OllamaChatMessage[] = [];
// System prompt
if (context.systemPrompt) {
messages.push({ role: "system", content: context.systemPrompt });
}
for (const msg of context.messages) {
switch (msg.role) {
case "user":
messages.push(convertUserMessage(msg));
break;
case "assistant":
messages.push(convertAssistantMessage(msg));
break;
case "toolResult":
messages.push({
role: "tool",
content: msg.content
.filter((c): c is TextContent => c.type === "text")
.map((c) => c.text)
.join("\n"),
name: msg.toolName,
});
break;
}
}
return messages;
}
function convertUserMessage(msg: Message & { role: "user" }): OllamaChatMessage {
if (typeof msg.content === "string") {
return { role: "user", content: msg.content };
}
const textParts: string[] = [];
const images: string[] = [];
for (const part of msg.content) {
if (part.type === "text") {
textParts.push(part.text);
} else if (part.type === "image") {
// Strip data URI prefix if present
let data = (part as ImageContent).data;
const commaIdx = data.indexOf(",");
if (commaIdx !== -1 && data.startsWith("data:")) {
data = data.slice(commaIdx + 1);
}
images.push(data);
}
}
const result: OllamaChatMessage = {
role: "user",
content: textParts.join("\n"),
};
if (images.length > 0) {
result.images = images;
}
return result;
}
function convertAssistantMessage(msg: Message & { role: "assistant" }): OllamaChatMessage {
let content = "";
const toolCalls: OllamaChatMessage["tool_calls"] = [];
for (const block of msg.content) {
if (block.type === "thinking") {
// Serialize thinking back inline for round-trip with Ollama
content += `<think>${(block as ThinkingContent).thinking}</think>`;
} else if (block.type === "text") {
content += (block as TextContent).text;
} else if (block.type === "toolCall") {
const tc = block as ToolCall;
toolCalls.push({
function: {
name: tc.name,
arguments: tc.arguments,
},
});
}
}
const result: OllamaChatMessage = { role: "assistant", content };
if (toolCalls.length > 0) {
result.tool_calls = toolCalls;
}
return result;
}
// ─── Tool conversion ────────────────────────────────────────────────────────
function convertTools(tools: Tool[]): OllamaTool[] {
return tools.map((tool) => {
const params = tool.parameters as Record<string, unknown>;
return {
type: "function" as const,
function: {
name: tool.name,
description: tool.description,
parameters: {
type: "object" as const,
required: params.required as string[] | undefined,
properties: (params.properties as Record<string, unknown>) ?? {},
},
},
};
});
}
// ─── Response mapping ───────────────────────────────────────────────────────
function mapStopReason(doneReason?: string): StopReason {
switch (doneReason) {
case "stop":
return "stop";
case "length":
return "length";
default:
return "stop";
}
}
function buildUsage(chunk: OllamaChatResponse): Usage {
const input = chunk.prompt_eval_count ?? 0;
const outputTokens = chunk.eval_count ?? 0;
return {
input,
output: outputTokens,
cacheRead: 0,
cacheWrite: 0,
totalTokens: input + outputTokens,
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
};
}
function extractMetrics(chunk: OllamaChatResponse): InferenceMetrics | undefined {
if (!chunk.eval_duration && !chunk.total_duration) return undefined;
const evalCount = chunk.eval_count ?? 0;
const evalDurationNs = chunk.eval_duration ?? 0;
const evalDurationMs = evalDurationNs / 1e6;
const tokensPerSecond = evalDurationNs > 0 ? evalCount / (evalDurationNs / 1e9) : 0;
return {
tokensPerSecond,
totalDurationMs: (chunk.total_duration ?? 0) / 1e6,
evalDurationMs,
promptEvalDurationMs: (chunk.prompt_eval_duration ?? 0) / 1e6,
};
}
// ─── Stream lifecycle helpers ───────────────────────────────────────────────
// Replicated from openai-shared.ts (not exported from @gsd/pi-ai)
function buildInitialOutput(model: Model<Api>): AssistantMessage {
return {
role: "assistant",
content: [],
api: model.api as Api,
provider: model.provider,
model: model.id,
usage: {
input: 0,
output: 0,
cacheRead: 0,
cacheWrite: 0,
totalTokens: 0,
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
},
stopReason: "stop",
timestamp: Date.now(),
};
}
function assertStreamSuccess(output: AssistantMessage, signal?: AbortSignal): void {
if (signal?.aborted) {
throw new Error("Request was aborted");
}
if (output.stopReason === "aborted" || output.stopReason === "error") {
throw new Error("An unknown error occurred");
}
}
function finalizeStream(stream: AssistantMessageEventStream, output: AssistantMessage): void {
stream.push({
type: "done",
reason: output.stopReason as Extract<StopReason, "stop" | "length" | "toolUse" | "pauseTurn">,
message: output,
});
stream.end();
}
function handleStreamError(
stream: AssistantMessageEventStream,
output: AssistantMessage,
error: unknown,
signal?: AbortSignal,
): void {
for (const block of output.content) delete (block as { index?: number }).index;
output.stopReason = signal?.aborted ? "aborted" : "error";
output.errorMessage = error instanceof Error ? error.message : JSON.stringify(error);
stream.push({ type: "error", reason: output.stopReason, error: output });
stream.end();
}


@@ -8,12 +8,15 @@
*/
import type {
OllamaChatRequest,
OllamaChatResponse,
OllamaPsResponse,
OllamaPullProgress,
OllamaShowResponse,
OllamaTagsResponse,
OllamaVersionResponse,
} from "./types.js";
import { parseNDJsonStream } from "./ndjson-stream.js";
const DEFAULT_HOST = "http://localhost:11434";
const PROBE_TIMEOUT_MS = 1500;
@@ -130,39 +133,36 @@ export async function pullModel(
throw new Error("Ollama /api/pull returned no body");
}
const reader = response.body.getReader();
const decoder = new TextDecoder();
let buffer = "";
for await (const progress of parseNDJsonStream<OllamaPullProgress>(response.body, signal)) {
onProgress?.(progress);
}
}
while (true) {
const { done, value } = await reader.read();
if (done) break;
/**
* Stream a chat completion via /api/chat.
* Returns an async generator yielding each NDJSON response chunk.
*/
export async function* chat(
request: OllamaChatRequest,
signal?: AbortSignal,
): AsyncGenerator<OllamaChatResponse> {
const response = await fetch(`${getOllamaHost()}/api/chat`, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify(request),
signal,
});
buffer += decoder.decode(value, { stream: true });
const lines = buffer.split("\n");
buffer = lines.pop() ?? "";
for (const line of lines) {
const trimmed = line.trim();
if (!trimmed) continue;
try {
const progress = JSON.parse(trimmed) as OllamaPullProgress;
onProgress?.(progress);
} catch {
// Skip malformed lines
}
}
if (!response.ok) {
const text = await response.text();
throw new Error(`Ollama /api/chat returned ${response.status}: ${text}`);
}
// Process remaining buffer
if (buffer.trim()) {
try {
const progress = JSON.parse(buffer.trim()) as OllamaPullProgress;
onProgress?.(progress);
} catch {
// Ignore
}
if (!response.body) {
throw new Error("Ollama /api/chat returned no body");
}
yield* parseNDJsonStream<OllamaChatResponse>(response.body, signal, true);
}
/**


@@ -8,14 +8,14 @@
* Returns models in the format expected by pi.registerProvider().
*/
import { listModels, getOllamaHost } from "./ollama-client.js";
import { listModels } from "./ollama-client.js";
import {
estimateContextFromParams,
formatModelSize,
getModelCapabilities,
humanizeModelName,
} from "./model-capabilities.js";
import type { OllamaModelInfo } from "./types.js";
import type { OllamaChatOptions, OllamaModelInfo } from "./types.js";
export interface DiscoveredOllamaModel {
id: string;
@@ -29,6 +29,8 @@ export interface DiscoveredOllamaModel {
sizeBytes: number;
/** Parameter size string from Ollama (e.g. "7B") */
parameterSize: string;
/** Ollama-specific inference options for this model */
ollamaOptions?: OllamaChatOptions;
}
const ZERO_COST = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 };
@@ -64,6 +66,7 @@ function enrichModel(info: OllamaModelInfo): DiscoveredOllamaModel {
maxTokens,
sizeBytes: info.size,
parameterSize,
ollamaOptions: caps.ollamaOptions,
};
}
@@ -98,9 +101,3 @@ export function formatModelForDisplay(model: DiscoveredOllamaModel): string {
return parts.join(" ");
}
/**
* Build the OpenAI-compat base URL for Ollama.
*/
export function getOllamaOpenAIBaseUrl(): string {
return `${getOllamaHost()}/v1`;
}


@@ -31,6 +31,8 @@ export function registerOllamaTool(pi: ExtensionAPI): void {
promptGuidelines: [
"Use 'list' to see what models are available locally before trying to use one.",
"Use 'pull' to download a model that isn't available yet.",
"Use 'remove' to delete a local model that is no longer needed.",
"Use 'show' to get detailed info about a model (parameters, quantization, families).",
"Use 'status' to check if Ollama is running.",
"Use 'ps' to see which models are loaded in memory and VRAM usage.",
"Common models: llama3.1:8b, qwen2.5-coder:7b, deepseek-r1:8b, codestral:22b",
@@ -40,6 +42,8 @@ export function registerOllamaTool(pi: ExtensionAPI): void {
[
Type.Literal("list"),
Type.Literal("pull"),
Type.Literal("remove"),
Type.Literal("show"),
Type.Literal("status"),
Type.Literal("ps"),
],
@@ -164,6 +168,71 @@ export function registerOllamaTool(pi: ExtensionAPI): void {
};
}
case "remove": {
if (!model) {
return {
content: [{ type: "text", text: "Error: 'model' parameter is required for remove action." }],
isError: true,
details: { action, durationMs: Date.now() - startTime, error: "missing_model" } as OllamaToolDetails,
};
}
const running = await client.isRunning();
if (!running) {
return {
content: [{ type: "text", text: "Ollama is not running." }],
isError: true,
details: { action, model, durationMs: Date.now() - startTime, error: "not_running" } as OllamaToolDetails,
};
}
await client.deleteModel(model);
return {
content: [{ type: "text", text: `Successfully removed ${model}` }],
details: { action, model, durationMs: Date.now() - startTime } as OllamaToolDetails,
};
}
case "show": {
if (!model) {
return {
content: [{ type: "text", text: "Error: 'model' parameter is required for show action." }],
isError: true,
details: { action, durationMs: Date.now() - startTime, error: "missing_model" } as OllamaToolDetails,
};
}
const running = await client.isRunning();
if (!running) {
return {
content: [{ type: "text", text: "Ollama is not running." }],
isError: true,
details: { action, model, durationMs: Date.now() - startTime, error: "not_running" } as OllamaToolDetails,
};
}
const info = await client.showModel(model);
const details = info.details;
const infoLines = [
`Model: ${model}`,
`Family: ${details.family}`,
`Parameters: ${details.parameter_size}`,
`Quantization: ${details.quantization_level}`,
`Format: ${details.format}`,
];
if (details.families?.length) {
infoLines.push(`Families: ${details.families.join(", ")}`);
}
if (info.parameters) {
infoLines.push(`\nModelfile parameters:\n${info.parameters}`);
}
return {
content: [{ type: "text", text: infoLines.join("\n") }],
details: { action, model, durationMs: Date.now() - startTime } as OllamaToolDetails,
};
}
default:
return {
content: [{ type: "text", text: `Unknown action: ${action}` }],


@@ -1,28 +1 @@
// GSD2 — Tests for Ollama model discovery and enrichment
import { describe, it, afterEach } from "node:test";
import assert from "node:assert/strict";
import { getOllamaOpenAIBaseUrl } from "../ollama-discovery.js";
// ─── getOllamaOpenAIBaseUrl ─────────────────────────────────────────────────
describe("getOllamaOpenAIBaseUrl", () => {
const originalHost = process.env.OLLAMA_HOST;
afterEach(() => {
if (originalHost === undefined) {
delete process.env.OLLAMA_HOST;
} else {
process.env.OLLAMA_HOST = originalHost;
}
});
it("returns default OpenAI-compat URL", () => {
delete process.env.OLLAMA_HOST;
assert.equal(getOllamaOpenAIBaseUrl(), "http://localhost:11434/v1");
});
it("appends /v1 to custom OLLAMA_HOST", () => {
process.env.OLLAMA_HOST = "http://remote:9999";
assert.equal(getOllamaOpenAIBaseUrl(), "http://remote:9999/v1");
});
});


@@ -0,0 +1,116 @@
// GSD2 — Ollama Extension: Stateful <think> tag stream parser
/**
* Extracts <think>...</think> thinking blocks from a streaming text response.
* Handles the case where a tag boundary spans multiple chunks by buffering a
* possible partial tag (at most 7 characters, one short of "</think>") at
* chunk boundaries.
*
* Used for reasoning models like deepseek-r1 and qwq that embed thinking
* inline in their text output.
*/
export type ParsedChunk =
| { type: "thinking"; text: string }
| { type: "text"; text: string };
const OPEN_TAG = "<think>";
const CLOSE_TAG = "</think>";
const MAX_TAG_LEN = Math.max(OPEN_TAG.length, CLOSE_TAG.length);
export class ThinkingTagParser {
private buffer = "";
private inThinking = false;
/**
* Feed a chunk of text and get back parsed segments.
* May return zero or more segments depending on tag boundaries.
*/
push(chunk: string): ParsedChunk[] {
const results: ParsedChunk[] = [];
let input = this.buffer + chunk;
this.buffer = "";
while (input.length > 0) {
if (this.inThinking) {
const closeIdx = input.indexOf(CLOSE_TAG);
if (closeIdx !== -1) {
// Found close tag — emit thinking content before it
const thinking = input.slice(0, closeIdx);
if (thinking) results.push({ type: "thinking", text: thinking });
this.inThinking = false;
input = input.slice(closeIdx + CLOSE_TAG.length);
} else if (this.couldBePartialTag(input, CLOSE_TAG)) {
// Possible partial close tag at end — buffer only the matching tail
const tailLen = this.getPartialTagTailLength(input, CLOSE_TAG);
const safe = input.slice(0, input.length - tailLen);
if (safe) results.push({ type: "thinking", text: safe });
this.buffer = input.slice(-tailLen);
break;
} else {
// No close tag — emit all as thinking
results.push({ type: "thinking", text: input });
break;
}
} else {
const openIdx = input.indexOf(OPEN_TAG);
if (openIdx !== -1) {
// Found open tag — emit text before it
const text = input.slice(0, openIdx);
if (text) results.push({ type: "text", text });
this.inThinking = true;
input = input.slice(openIdx + OPEN_TAG.length);
} else if (this.couldBePartialTag(input, OPEN_TAG)) {
// Possible partial open tag at end — buffer only the matching tail
const tailLen = this.getPartialTagTailLength(input, OPEN_TAG);
const safe = input.slice(0, input.length - tailLen);
if (safe) results.push({ type: "text", text: safe });
this.buffer = input.slice(-tailLen);
break;
} else {
// No open tag — emit all as text
results.push({ type: "text", text: input });
break;
}
}
}
return results;
}
/**
* Flush any remaining buffered content. Call at end of stream.
*/
flush(): ParsedChunk[] {
if (!this.buffer) return [];
const result: ParsedChunk = {
type: this.inThinking ? "thinking" : "text",
text: this.buffer,
};
this.buffer = "";
return [result];
}
/**
* Check if the end of input could be the start of a partial tag.
* Only buffers when the tail of input matches a prefix of the tag.
*/
private couldBePartialTag(input: string, tag: string): boolean {
return this.getPartialTagTailLength(input, tag) > 0;
}
/**
* Get the length of the tail of input that matches a prefix of the tag.
* Returns 0 if no partial match.
*/
private getPartialTagTailLength(input: string, tag: string): number {
const maxCheck = Math.min(input.length, tag.length - 1);
for (let len = maxCheck; len >= 1; len--) {
const tail = input.slice(-len);
if (tag.startsWith(tail)) {
return len;
}
}
return 0;
}
}
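
// Usage sketch (illustrative; not part of this change): tag boundaries may
// split across chunks, and the parser buffers the possible partial tag.
const exampleParser = new ThinkingTagParser();
exampleParser.push("Hello <thi");            // [{ type: "text", text: "Hello " }]  ("<thi" is buffered)
exampleParser.push("nk>reasoning</think>!"); // [{ type: "thinking", text: "reasoning" }, { type: "text", text: "!" }]
exampleParser.flush();                       // []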


@@ -72,11 +72,31 @@ export interface OllamaVersionResponse {
// ─── /api/chat ──────────────────────────────────────────────────────────────
/** Per-model Ollama inference options carried via Model.providerOptions. */
export interface OllamaChatOptions {
/** How long to keep the model loaded after the last request. e.g. "5m", "0" to unload. */
keep_alive?: string;
/** Number of GPU layers to offload. -1 = all. */
num_gpu?: number;
/** Override the context window for Ollama requests. Only sent when explicitly set. */
num_ctx?: number;
/** Sampling: top-k most likely tokens. Default: 40 */
top_k?: number;
/** Sampling: nucleus sampling threshold. */
top_p?: number;
/** Sampling: penalize repeating tokens. Default: 1.1 */
repeat_penalty?: number;
/** Sampling: fixed seed for reproducibility. */
seed?: number;
}
export interface OllamaChatMessage {
role: "system" | "user" | "assistant" | "tool";
content: string;
images?: string[];
tool_calls?: OllamaToolCall[];
/** Tool name — required for role: "tool" messages to correlate results with calls. */
name?: string;
}
export interface OllamaToolCall {
@@ -110,7 +130,10 @@ export interface OllamaChatRequest {
temperature?: number;
top_p?: number;
top_k?: number;
repeat_penalty?: number;
seed?: number;
stop?: string[];
num_gpu?: number;
};
keep_alive?: string;
}