diff --git a/packages/pi-ai/src/types.ts b/packages/pi-ai/src/types.ts index 42a6b3478..661b58b57 100644 --- a/packages/pi-ai/src/types.ts +++ b/packages/pi-ai/src/types.ts @@ -13,7 +13,8 @@ export type KnownApi = | "bedrock-converse-stream" | "google-generative-ai" | "google-gemini-cli" - | "google-vertex"; + | "google-vertex" + | "ollama-chat"; export type Api = KnownApi | (string & {}); @@ -212,9 +213,23 @@ export interface AssistantMessage { errorMessage?: string; /** Server-requested retry delay in milliseconds (from Retry-After or rate limit headers). */ retryAfterMs?: number; + /** Provider inference performance metrics (e.g. tokens/sec from local models). */ + inferenceMetrics?: InferenceMetrics; timestamp: number; // Unix timestamp in milliseconds } +/** Inference performance metrics reported by providers that support it (e.g. Ollama). */ +export interface InferenceMetrics { + /** Tokens generated per second during eval phase. */ + tokensPerSecond: number; + /** Wall-clock duration of the full request in milliseconds. */ + totalDurationMs: number; + /** Duration of the eval (generation) phase in milliseconds. */ + evalDurationMs: number; + /** Duration of the prompt eval phase in milliseconds. */ + promptEvalDurationMs: number; +} + export interface ToolResultMessage { role: "toolResult"; toolCallId: string; @@ -374,4 +389,6 @@ export interface Model { * Read these fields instead of pattern-matching on model IDs or provider names. */ capabilities?: ModelCapabilities; + /** Opaque provider-specific options. Cast to the appropriate type in the provider's stream handler. */ + providerOptions?: Record; } diff --git a/packages/pi-coding-agent/src/core/extensions/types.ts b/packages/pi-coding-agent/src/core/extensions/types.ts index 037e9718c..f4c153992 100644 --- a/packages/pi-coding-agent/src/core/extensions/types.ts +++ b/packages/pi-coding-agent/src/core/extensions/types.ts @@ -1341,6 +1341,8 @@ export interface ProviderModelConfig { headers?: Record; /** OpenAI compatibility settings. */ compat?: Model["compat"]; + /** Opaque provider-specific options (e.g. Ollama keep_alive, num_gpu). */ + providerOptions?: Record; } /** Extension factory function type. Supports both sync and async initialization. 
*/ diff --git a/packages/pi-coding-agent/src/core/model-registry.ts b/packages/pi-coding-agent/src/core/model-registry.ts index 42714560c..5049a3f7f 100644 --- a/packages/pi-coding-agent/src/core/model-registry.ts +++ b/packages/pi-coding-agent/src/core/model-registry.ts @@ -742,6 +742,7 @@ export class ModelRegistry { maxTokens: modelDef.maxTokens, headers, compat: modelDef.compat, + providerOptions: modelDef.providerOptions, } as Model); } @@ -917,5 +918,6 @@ export interface ProviderConfigInput { maxTokens: number; headers?: Record; compat?: Model["compat"]; + providerOptions?: Record; }>; } diff --git a/src/resources/extensions/ollama/index.ts b/src/resources/extensions/ollama/index.ts index 3117fdd54..550771232 100644 --- a/src/resources/extensions/ollama/index.ts +++ b/src/resources/extensions/ollama/index.ts @@ -17,19 +17,10 @@ */ import { importExtensionModule, type ExtensionAPI } from "@gsd/pi-coding-agent"; -import type { OpenAICompletionsCompat } from "@gsd/pi-ai"; import * as client from "./ollama-client.js"; -import { discoverModels, getOllamaOpenAIBaseUrl } from "./ollama-discovery.js"; +import { discoverModels } from "./ollama-discovery.js"; import { registerOllamaCommands } from "./ollama-commands.js"; - -/** Default compat settings for Ollama models via OpenAI-compat endpoint */ -const OLLAMA_COMPAT: OpenAICompletionsCompat = { - supportsDeveloperRole: false, - supportsReasoningEffort: false, - supportsUsageInStreaming: false, - maxTokensField: "max_tokens", - supportsStore: false, -}; +import { streamOllamaChat } from "./ollama-chat-provider.js"; let toolsPromise: Promise | null = null; @@ -68,12 +59,13 @@ async function probeAndRegister(pi: ExtensionAPI): Promise { const models = await discoverModels(); if (models.length === 0) return true; // Running but no models pulled - const baseUrl = getOllamaOpenAIBaseUrl(); + const baseUrl = client.getOllamaHost(); pi.registerProvider("ollama", { authMode: "none", baseUrl, - api: "openai-completions", + api: "ollama-chat", + streamSimple: streamOllamaChat, isReady: () => true, models: models.map((m) => ({ id: m.id, @@ -83,7 +75,7 @@ async function probeAndRegister(pi: ExtensionAPI): Promise { cost: m.cost, contextWindow: m.contextWindow, maxTokens: m.maxTokens, - compat: OLLAMA_COMPAT, + providerOptions: (m.ollamaOptions ?? {}) as Record, })), }); diff --git a/src/resources/extensions/ollama/model-capabilities.ts b/src/resources/extensions/ollama/model-capabilities.ts index 8209946c3..f44506fbf 100644 --- a/src/resources/extensions/ollama/model-capabilities.ts +++ b/src/resources/extensions/ollama/model-capabilities.ts @@ -8,11 +8,15 @@ * Fallback: estimate from parameter count if model isn't in the table. */ +import type { OllamaChatOptions } from "./types.js"; + export interface ModelCapability { contextWindow?: number; maxTokens?: number; input?: ("text" | "image")[]; reasoning?: boolean; + /** Ollama-specific default inference options for this model family. */ + ollamaOptions?: OllamaChatOptions; } /** @@ -20,58 +24,61 @@ export interface ModelCapability { * Keys are matched as prefixes against the model name (before the colon/tag). * More specific entries should appear first. */ +// Note: ollamaOptions.num_ctx is set for known model families where the context +// window is authoritative. For unknown/estimated models, num_ctx is NOT sent +// to avoid OOM risk — Ollama uses its own safe default instead. 
const KNOWN_MODELS: Array<[pattern: string, caps: ModelCapability]> = [ // ─── Reasoning models ─────────────────────────────────────────────── - ["deepseek-r1", { contextWindow: 131072, reasoning: true }], - ["qwq", { contextWindow: 131072, reasoning: true }], + ["deepseek-r1", { contextWindow: 131072, reasoning: true, ollamaOptions: { num_ctx: 131072 } }], + ["qwq", { contextWindow: 131072, reasoning: true, ollamaOptions: { num_ctx: 131072 } }], // ─── Vision models ────────────────────────────────────────────────── - ["llava", { contextWindow: 4096, input: ["text", "image"] }], - ["bakllava", { contextWindow: 4096, input: ["text", "image"] }], - ["moondream", { contextWindow: 8192, input: ["text", "image"] }], - ["llama3.2-vision", { contextWindow: 131072, input: ["text", "image"] }], - ["minicpm-v", { contextWindow: 4096, input: ["text", "image"] }], + ["llava", { contextWindow: 4096, input: ["text", "image"], ollamaOptions: { num_ctx: 4096 } }], + ["bakllava", { contextWindow: 4096, input: ["text", "image"], ollamaOptions: { num_ctx: 4096 } }], + ["moondream", { contextWindow: 8192, input: ["text", "image"], ollamaOptions: { num_ctx: 8192 } }], + ["llama3.2-vision", { contextWindow: 131072, input: ["text", "image"], ollamaOptions: { num_ctx: 131072 } }], + ["minicpm-v", { contextWindow: 4096, input: ["text", "image"], ollamaOptions: { num_ctx: 4096 } }], // ─── Code models ──────────────────────────────────────────────────── - ["codestral", { contextWindow: 262144, maxTokens: 32768 }], - ["qwen2.5-coder", { contextWindow: 131072, maxTokens: 32768 }], - ["deepseek-coder-v2", { contextWindow: 131072, maxTokens: 16384 }], - ["starcoder2", { contextWindow: 16384, maxTokens: 8192 }], - ["codegemma", { contextWindow: 8192, maxTokens: 8192 }], - ["codellama", { contextWindow: 16384, maxTokens: 8192 }], - ["devstral", { contextWindow: 131072, maxTokens: 32768 }], + ["codestral", { contextWindow: 262144, maxTokens: 32768, ollamaOptions: { num_ctx: 262144 } }], + ["qwen2.5-coder", { contextWindow: 131072, maxTokens: 32768, ollamaOptions: { num_ctx: 131072 } }], + ["deepseek-coder-v2", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }], + ["starcoder2", { contextWindow: 16384, maxTokens: 8192, ollamaOptions: { num_ctx: 16384 } }], + ["codegemma", { contextWindow: 8192, maxTokens: 8192, ollamaOptions: { num_ctx: 8192 } }], + ["codellama", { contextWindow: 16384, maxTokens: 8192, ollamaOptions: { num_ctx: 16384 } }], + ["devstral", { contextWindow: 131072, maxTokens: 32768, ollamaOptions: { num_ctx: 131072 } }], // ─── Llama family ─────────────────────────────────────────────────── - ["llama3.3", { contextWindow: 131072, maxTokens: 16384 }], - ["llama3.2", { contextWindow: 131072, maxTokens: 16384 }], - ["llama3.1", { contextWindow: 131072, maxTokens: 16384 }], - ["llama3", { contextWindow: 8192, maxTokens: 8192 }], - ["llama2", { contextWindow: 4096, maxTokens: 4096 }], + ["llama3.3", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }], + ["llama3.2", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }], + ["llama3.1", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }], + ["llama3", { contextWindow: 8192, maxTokens: 8192, ollamaOptions: { num_ctx: 8192 } }], + ["llama2", { contextWindow: 4096, maxTokens: 4096, ollamaOptions: { num_ctx: 4096 } }], // ─── Qwen family ──────────────────────────────────────────────────── - ["qwen3", { contextWindow: 131072, maxTokens: 32768 }], - 
["qwen2.5", { contextWindow: 131072, maxTokens: 32768 }], - ["qwen2", { contextWindow: 131072, maxTokens: 32768 }], + ["qwen3", { contextWindow: 131072, maxTokens: 32768, ollamaOptions: { num_ctx: 131072 } }], + ["qwen2.5", { contextWindow: 131072, maxTokens: 32768, ollamaOptions: { num_ctx: 131072 } }], + ["qwen2", { contextWindow: 131072, maxTokens: 32768, ollamaOptions: { num_ctx: 131072 } }], // ─── Gemma family ─────────────────────────────────────────────────── - ["gemma3", { contextWindow: 131072, maxTokens: 16384 }], - ["gemma2", { contextWindow: 8192, maxTokens: 8192 }], + ["gemma3", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }], + ["gemma2", { contextWindow: 8192, maxTokens: 8192, ollamaOptions: { num_ctx: 8192 } }], // ─── Mistral family ───────────────────────────────────────────────── - ["mistral-large", { contextWindow: 131072, maxTokens: 16384 }], - ["mistral-small", { contextWindow: 131072, maxTokens: 16384 }], - ["mistral-nemo", { contextWindow: 131072, maxTokens: 16384 }], - ["mistral", { contextWindow: 32768, maxTokens: 8192 }], - ["mixtral", { contextWindow: 32768, maxTokens: 8192 }], + ["mistral-large", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }], + ["mistral-small", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }], + ["mistral-nemo", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }], + ["mistral", { contextWindow: 32768, maxTokens: 8192, ollamaOptions: { num_ctx: 32768 } }], + ["mixtral", { contextWindow: 32768, maxTokens: 8192, ollamaOptions: { num_ctx: 32768 } }], // ─── Phi family ───────────────────────────────────────────────────── - ["phi4", { contextWindow: 16384, maxTokens: 16384 }], - ["phi3.5", { contextWindow: 131072, maxTokens: 16384 }], - ["phi3", { contextWindow: 131072, maxTokens: 4096 }], + ["phi4", { contextWindow: 16384, maxTokens: 16384, ollamaOptions: { num_ctx: 16384 } }], + ["phi3.5", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }], + ["phi3", { contextWindow: 131072, maxTokens: 4096, ollamaOptions: { num_ctx: 131072 } }], // ─── Command R ────────────────────────────────────────────────────── - ["command-r-plus", { contextWindow: 131072, maxTokens: 16384 }], - ["command-r", { contextWindow: 131072, maxTokens: 16384 }], + ["command-r-plus", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }], + ["command-r", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }], ]; /** diff --git a/src/resources/extensions/ollama/ndjson-stream.ts b/src/resources/extensions/ollama/ndjson-stream.ts new file mode 100644 index 000000000..32065aa4e --- /dev/null +++ b/src/resources/extensions/ollama/ndjson-stream.ts @@ -0,0 +1,63 @@ +// GSD2 — Ollama Extension: NDJSON streaming parser + +/** + * Parses a streaming NDJSON (newline-delimited JSON) response body into + * typed objects. Used for Ollama's /api/chat and /api/pull endpoints. + * + * @param strict When true, malformed JSON lines throw instead of being skipped. + * Use strict mode for inference streams where silent data loss is unacceptable. + * Use permissive mode (default) for progress endpoints like /api/pull. 
+ */ + +export async function* parseNDJsonStream( + body: ReadableStream, + signal?: AbortSignal, + strict = false, +): AsyncGenerator { + const reader = body.getReader(); + const decoder = new TextDecoder(); + let buffer = ""; + + try { + while (true) { + if (signal?.aborted) break; + + const { done, value } = await reader.read(); + if (done) break; + + buffer += decoder.decode(value, { stream: true }); + const lines = buffer.split("\n"); + buffer = lines.pop() ?? ""; + + for (const line of lines) { + const trimmed = line.trim(); + if (!trimmed) continue; + try { + yield JSON.parse(trimmed) as T; + } catch (err) { + if (strict) { + throw new Error( + `Malformed NDJSON line from Ollama: ${trimmed.slice(0, 200)}`, + ); + } + // Permissive mode: skip malformed lines + } + } + } + + // Flush remaining buffer (skip if aborted) + if (buffer.trim() && !signal?.aborted) { + try { + yield JSON.parse(buffer.trim()) as T; + } catch (err) { + if (strict) { + throw new Error( + `Malformed NDJSON line from Ollama: ${buffer.trim().slice(0, 200)}`, + ); + } + } + } + } finally { + reader.releaseLock(); + } +} diff --git a/src/resources/extensions/ollama/ollama-chat-provider.ts b/src/resources/extensions/ollama/ollama-chat-provider.ts new file mode 100644 index 000000000..f02361622 --- /dev/null +++ b/src/resources/extensions/ollama/ollama-chat-provider.ts @@ -0,0 +1,459 @@ +// GSD2 — Ollama Extension: Native /api/chat stream provider + +/** + * Implements the "ollama-chat" API provider, streaming responses directly + * from Ollama's native /api/chat endpoint instead of the OpenAI compatibility + * shim. This exposes Ollama-specific options (num_ctx, keep_alive, num_gpu, + * sampling parameters) and surfaces inference performance metrics. + */ + +import { + type Api, + type AssistantMessage, + type AssistantMessageEvent, + type AssistantMessageEventStream, + type Context, + type ImageContent, + type InferenceMetrics, + type Message, + type Model, + type SimpleStreamOptions, + type StopReason, + type TextContent, + type ThinkingContent, + type Tool, + type ToolCall, + type Usage, + EventStream, +} from "@gsd/pi-ai"; +import { chat } from "./ollama-client.js"; +import type { + OllamaChatMessage, + OllamaChatOptions, + OllamaChatRequest, + OllamaChatResponse, + OllamaTool, + OllamaToolCall, +} from "./types.js"; +import { ThinkingTagParser, type ParsedChunk } from "./thinking-parser.js"; + +/** Create an AssistantMessageEventStream using the base EventStream class. */ +function createStream(): AssistantMessageEventStream { + return new EventStream( + (event) => event.type === "done" || event.type === "error", + (event) => { + if (event.type === "done") return event.message; + if (event.type === "error") return event.error; + throw new Error("Unexpected event type for final result"); + }, + ) as AssistantMessageEventStream; +} + +// ─── Stream handler ───────────────────────────────────────────────────────── + +export function streamOllamaChat( + model: Model, + context: Context, + options?: SimpleStreamOptions, +): AssistantMessageEventStream { + const stream = createStream(); + + (async () => { + const output = buildInitialOutput(model); + + try { + const request = buildRequest(model, context, options); + stream.push({ type: "start", partial: output }); + + const useThinkingParser = model.reasoning; + const thinkParser = useThinkingParser ? 
new ThinkingTagParser() : null; + + let contentIndex = -1; + let currentBlockType: "text" | "thinking" | null = null; + + function startBlock(type: "text" | "thinking") { + contentIndex++; + currentBlockType = type; + if (type === "text") { + output.content.push({ type: "text", text: "" }); + stream.push({ type: "text_start", contentIndex, partial: output }); + } else { + output.content.push({ type: "thinking", thinking: "" }); + stream.push({ type: "thinking_start", contentIndex, partial: output }); + } + } + + function endBlock() { + if (currentBlockType === null) return; + if (currentBlockType === "text") { + const block = output.content[contentIndex] as TextContent; + stream.push({ type: "text_end", contentIndex, content: block.text, partial: output }); + } else { + const block = output.content[contentIndex] as ThinkingContent; + stream.push({ type: "thinking_end", contentIndex, content: block.thinking, partial: output }); + } + currentBlockType = null; + } + + function emitDelta(type: "text" | "thinking", text: string) { + if (!text) return; + if (currentBlockType !== type) { + endBlock(); + startBlock(type); + } + if (type === "text") { + (output.content[contentIndex] as TextContent).text += text; + stream.push({ type: "text_delta", contentIndex, delta: text, partial: output }); + } else { + (output.content[contentIndex] as ThinkingContent).thinking += text; + stream.push({ type: "thinking_delta", contentIndex, delta: text, partial: output }); + } + } + + function processChunks(chunks: ParsedChunk[]) { + for (const chunk of chunks) { + emitDelta(chunk.type, chunk.text); + } + } + + function processToolCalls(toolCalls: OllamaToolCall[]) { + endBlock(); + for (const tc of toolCalls) { + contentIndex++; + const toolCall: ToolCall = { + type: "toolCall", + id: `ollama_tc_${contentIndex}`, + name: tc.function.name, + arguments: tc.function.arguments, + }; + output.content.push(toolCall); + stream.push({ type: "toolcall_start", contentIndex, partial: output }); + // Emit a delta with the serialized arguments (convention: start/delta/end) + stream.push({ + type: "toolcall_delta", + contentIndex, + delta: JSON.stringify(tc.function.arguments), + partial: output, + }); + stream.push({ + type: "toolcall_end", + contentIndex, + toolCall, + partial: output, + }); + } + output.stopReason = "toolUse"; + } + + for await (const chunk of chat(request, options?.signal)) { + // Handle text content — process independently of tool_calls + // (a chunk may contain both content and tool_calls) + const content = chunk.message?.content ?? 
""; + if (content && !chunk.done) { + if (thinkParser) { + processChunks(thinkParser.push(content)); + } else { + emitDelta("text", content); + } + } + + // Handle tool calls (Ollama sends them complete, may be on done:true chunk) + if (chunk.message?.tool_calls?.length) { + processToolCalls(chunk.message.tool_calls); + } + + if (chunk.done) { + // Final chunk — extract metrics and usage + if (thinkParser) processChunks(thinkParser.flush()); + endBlock(); + + output.usage = buildUsage(chunk); + output.inferenceMetrics = extractMetrics(chunk); + // Preserve "toolUse" if tool calls were processed + if (output.stopReason !== "toolUse") { + output.stopReason = mapStopReason(chunk.done_reason); + } + break; + } + } + + assertStreamSuccess(output, options?.signal); + finalizeStream(stream, output); + } catch (error) { + handleStreamError(stream, output, error, options?.signal); + } + })(); + + return stream; +} + +// ─── Request building ─────────────────────────────────────────────────────── + +function buildRequest( + model: Model, + context: Context, + options?: SimpleStreamOptions, +): OllamaChatRequest { + const ollamaOpts = (model.providerOptions ?? {}) as OllamaChatOptions; + + const request: OllamaChatRequest = { + model: model.id, + messages: convertMessages(context), + stream: true, + }; + + // Build options block with all Ollama-specific parameters + const reqOptions: NonNullable = {}; + + // Context window — only sent when explicitly configured via providerOptions. + // Sending inferred/estimated values risks OOM on constrained hosts. + // Users can set num_ctx per-model in models.json ollamaOptions or the + // capability table can provide it for known model families. + if (ollamaOpts.num_ctx !== undefined && ollamaOpts.num_ctx > 0) { + reqOptions.num_ctx = ollamaOpts.num_ctx; + } + + // Max output tokens + const maxTokens = options?.maxTokens ?? 
model.maxTokens; + if (maxTokens > 0) { + reqOptions.num_predict = maxTokens; + } + + // Temperature + if (options?.temperature !== undefined) { + reqOptions.temperature = options.temperature; + } + + // Per-model sampling options from providerOptions + if (ollamaOpts.top_p !== undefined) reqOptions.top_p = ollamaOpts.top_p; + if (ollamaOpts.top_k !== undefined) reqOptions.top_k = ollamaOpts.top_k; + if (ollamaOpts.repeat_penalty !== undefined) reqOptions.repeat_penalty = ollamaOpts.repeat_penalty; + if (ollamaOpts.seed !== undefined) reqOptions.seed = ollamaOpts.seed; + if (ollamaOpts.num_gpu !== undefined) reqOptions.num_gpu = ollamaOpts.num_gpu; + + if (Object.keys(reqOptions).length > 0) { + request.options = reqOptions; + } + + // Keep alive + if (ollamaOpts.keep_alive !== undefined) { + request.keep_alive = ollamaOpts.keep_alive; + } + + // Tools + if (context.tools?.length) { + request.tools = convertTools(context.tools); + } + + return request; +} + +// ─── Message conversion ───────────────────────────────────────────────────── + +function convertMessages(context: Context): OllamaChatMessage[] { + const messages: OllamaChatMessage[] = []; + + // System prompt + if (context.systemPrompt) { + messages.push({ role: "system", content: context.systemPrompt }); + } + + for (const msg of context.messages) { + switch (msg.role) { + case "user": + messages.push(convertUserMessage(msg)); + break; + case "assistant": + messages.push(convertAssistantMessage(msg)); + break; + case "toolResult": + messages.push({ + role: "tool", + content: msg.content + .filter((c): c is TextContent => c.type === "text") + .map((c) => c.text) + .join("\n"), + name: msg.toolName, + }); + break; + } + } + + return messages; +} + +function convertUserMessage(msg: Message & { role: "user" }): OllamaChatMessage { + if (typeof msg.content === "string") { + return { role: "user", content: msg.content }; + } + + const textParts: string[] = []; + const images: string[] = []; + + for (const part of msg.content) { + if (part.type === "text") { + textParts.push(part.text); + } else if (part.type === "image") { + // Strip data URI prefix if present + let data = (part as ImageContent).data; + const commaIdx = data.indexOf(","); + if (commaIdx !== -1 && data.startsWith("data:")) { + data = data.slice(commaIdx + 1); + } + images.push(data); + } + } + + const result: OllamaChatMessage = { + role: "user", + content: textParts.join("\n"), + }; + if (images.length > 0) { + result.images = images; + } + return result; +} + +function convertAssistantMessage(msg: Message & { role: "assistant" }): OllamaChatMessage { + let content = ""; + const toolCalls: OllamaChatMessage["tool_calls"] = []; + + for (const block of msg.content) { + if (block.type === "thinking") { + // Serialize thinking back inline for round-trip with Ollama + content += `<think>${(block as ThinkingContent).thinking}</think>`; + } else if (block.type === "text") { + content += (block as TextContent).text; + } else if (block.type === "toolCall") { + const tc = block as ToolCall; + toolCalls.push({ + function: { + name: tc.name, + arguments: tc.arguments, + }, + }); + } + } + + const result: OllamaChatMessage = { role: "assistant", content }; + if (toolCalls.length > 0) { + result.tool_calls = toolCalls; + } + return result; +} + +// ─── Tool conversion ──────────────────────────────────────────────────────── + +function convertTools(tools: Tool[]): OllamaTool[] { + return tools.map((tool) => { + const params = tool.parameters as Record<string, unknown>; + return { + type: "function" as const, +
function: { + name: tool.name, + description: tool.description, + parameters: { + type: "object" as const, + required: params.required as string[] | undefined, + properties: (params.properties as Record) ?? {}, + }, + }, + }; + }); +} + +// ─── Response mapping ─────────────────────────────────────────────────────── + +function mapStopReason(doneReason?: string): StopReason { + switch (doneReason) { + case "stop": + return "stop"; + case "length": + return "length"; + default: + return "stop"; + } +} + +function buildUsage(chunk: OllamaChatResponse): Usage { + const input = chunk.prompt_eval_count ?? 0; + const outputTokens = chunk.eval_count ?? 0; + return { + input, + output: outputTokens, + cacheRead: 0, + cacheWrite: 0, + totalTokens: input + outputTokens, + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 }, + }; +} + +function extractMetrics(chunk: OllamaChatResponse): InferenceMetrics | undefined { + if (!chunk.eval_duration && !chunk.total_duration) return undefined; + + const evalCount = chunk.eval_count ?? 0; + const evalDurationNs = chunk.eval_duration ?? 0; + const evalDurationMs = evalDurationNs / 1e6; + const tokensPerSecond = evalDurationNs > 0 ? evalCount / (evalDurationNs / 1e9) : 0; + + return { + tokensPerSecond, + totalDurationMs: (chunk.total_duration ?? 0) / 1e6, + evalDurationMs, + promptEvalDurationMs: (chunk.prompt_eval_duration ?? 0) / 1e6, + }; +} + +// ─── Stream lifecycle helpers ─────────────────────────────────────────────── +// Replicated from openai-shared.ts (not exported from @gsd/pi-ai) + +function buildInitialOutput(model: Model): AssistantMessage { + return { + role: "assistant", + content: [], + api: model.api as Api, + provider: model.provider, + model: model.id, + usage: { + input: 0, + output: 0, + cacheRead: 0, + cacheWrite: 0, + totalTokens: 0, + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 }, + }, + stopReason: "stop", + timestamp: Date.now(), + }; +} + +function assertStreamSuccess(output: AssistantMessage, signal?: AbortSignal): void { + if (signal?.aborted) { + throw new Error("Request was aborted"); + } + if (output.stopReason === "aborted" || output.stopReason === "error") { + throw new Error("An unknown error occurred"); + } +} + +function finalizeStream(stream: AssistantMessageEventStream, output: AssistantMessage): void { + stream.push({ + type: "done", + reason: output.stopReason as Extract, + message: output, + }); + stream.end(); +} + +function handleStreamError( + stream: AssistantMessageEventStream, + output: AssistantMessage, + error: unknown, + signal?: AbortSignal, +): void { + for (const block of output.content) delete (block as { index?: number }).index; + output.stopReason = signal?.aborted ? "aborted" : "error"; + output.errorMessage = error instanceof Error ? 
error.message : JSON.stringify(error); + stream.push({ type: "error", reason: output.stopReason, error: output }); + stream.end(); +} diff --git a/src/resources/extensions/ollama/ollama-client.ts b/src/resources/extensions/ollama/ollama-client.ts index d881fd013..4738c09da 100644 --- a/src/resources/extensions/ollama/ollama-client.ts +++ b/src/resources/extensions/ollama/ollama-client.ts @@ -8,12 +8,15 @@ */ import type { + OllamaChatRequest, + OllamaChatResponse, OllamaPsResponse, OllamaPullProgress, OllamaShowResponse, OllamaTagsResponse, OllamaVersionResponse, } from "./types.js"; +import { parseNDJsonStream } from "./ndjson-stream.js"; const DEFAULT_HOST = "http://localhost:11434"; const PROBE_TIMEOUT_MS = 1500; @@ -130,39 +133,36 @@ export async function pullModel( throw new Error("Ollama /api/pull returned no body"); } - const reader = response.body.getReader(); - const decoder = new TextDecoder(); - let buffer = ""; + for await (const progress of parseNDJsonStream(response.body, signal)) { + onProgress?.(progress); + } +} - while (true) { - const { done, value } = await reader.read(); - if (done) break; +/** + * Stream a chat completion via /api/chat. + * Returns an async generator yielding each NDJSON response chunk. + */ +export async function* chat( + request: OllamaChatRequest, + signal?: AbortSignal, +): AsyncGenerator { + const response = await fetch(`${getOllamaHost()}/api/chat`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(request), + signal, + }); - buffer += decoder.decode(value, { stream: true }); - const lines = buffer.split("\n"); - buffer = lines.pop() ?? ""; - - for (const line of lines) { - const trimmed = line.trim(); - if (!trimmed) continue; - try { - const progress = JSON.parse(trimmed) as OllamaPullProgress; - onProgress?.(progress); - } catch { - // Skip malformed lines - } - } + if (!response.ok) { + const text = await response.text(); + throw new Error(`Ollama /api/chat returned ${response.status}: ${text}`); } - // Process remaining buffer - if (buffer.trim()) { - try { - const progress = JSON.parse(buffer.trim()) as OllamaPullProgress; - onProgress?.(progress); - } catch { - // Ignore - } + if (!response.body) { + throw new Error("Ollama /api/chat returned no body"); } + + yield* parseNDJsonStream(response.body, signal, true); } /** diff --git a/src/resources/extensions/ollama/ollama-discovery.ts b/src/resources/extensions/ollama/ollama-discovery.ts index eb6916b9e..29fb1bc77 100644 --- a/src/resources/extensions/ollama/ollama-discovery.ts +++ b/src/resources/extensions/ollama/ollama-discovery.ts @@ -8,14 +8,14 @@ * Returns models in the format expected by pi.registerProvider(). */ -import { listModels, getOllamaHost } from "./ollama-client.js"; +import { listModels } from "./ollama-client.js"; import { estimateContextFromParams, formatModelSize, getModelCapabilities, humanizeModelName, } from "./model-capabilities.js"; -import type { OllamaModelInfo } from "./types.js"; +import type { OllamaChatOptions, OllamaModelInfo } from "./types.js"; export interface DiscoveredOllamaModel { id: string; @@ -29,6 +29,8 @@ export interface DiscoveredOllamaModel { sizeBytes: number; /** Parameter size string from Ollama (e.g. 
"7B") */ parameterSize: string; + /** Ollama-specific inference options for this model */ + ollamaOptions?: OllamaChatOptions; } const ZERO_COST = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }; @@ -64,6 +66,7 @@ function enrichModel(info: OllamaModelInfo): DiscoveredOllamaModel { maxTokens, sizeBytes: info.size, parameterSize, + ollamaOptions: caps.ollamaOptions, }; } @@ -98,9 +101,3 @@ export function formatModelForDisplay(model: DiscoveredOllamaModel): string { return parts.join(" "); } -/** - * Build the OpenAI-compat base URL for Ollama. - */ -export function getOllamaOpenAIBaseUrl(): string { - return `${getOllamaHost()}/v1`; -} diff --git a/src/resources/extensions/ollama/ollama-tool.ts b/src/resources/extensions/ollama/ollama-tool.ts index ad5af5885..e3a5d7535 100644 --- a/src/resources/extensions/ollama/ollama-tool.ts +++ b/src/resources/extensions/ollama/ollama-tool.ts @@ -31,6 +31,8 @@ export function registerOllamaTool(pi: ExtensionAPI): void { promptGuidelines: [ "Use 'list' to see what models are available locally before trying to use one.", "Use 'pull' to download a model that isn't available yet.", + "Use 'remove' to delete a local model that is no longer needed.", + "Use 'show' to get detailed info about a model (parameters, quantization, families).", "Use 'status' to check if Ollama is running.", "Use 'ps' to see which models are loaded in memory and VRAM usage.", "Common models: llama3.1:8b, qwen2.5-coder:7b, deepseek-r1:8b, codestral:22b", @@ -40,6 +42,8 @@ export function registerOllamaTool(pi: ExtensionAPI): void { [ Type.Literal("list"), Type.Literal("pull"), + Type.Literal("remove"), + Type.Literal("show"), Type.Literal("status"), Type.Literal("ps"), ], @@ -164,6 +168,71 @@ export function registerOllamaTool(pi: ExtensionAPI): void { }; } + case "remove": { + if (!model) { + return { + content: [{ type: "text", text: "Error: 'model' parameter is required for remove action." }], + isError: true, + details: { action, durationMs: Date.now() - startTime, error: "missing_model" } as OllamaToolDetails, + }; + } + + const running = await client.isRunning(); + if (!running) { + return { + content: [{ type: "text", text: "Ollama is not running." }], + isError: true, + details: { action, model, durationMs: Date.now() - startTime, error: "not_running" } as OllamaToolDetails, + }; + } + + await client.deleteModel(model); + return { + content: [{ type: "text", text: `Successfully removed ${model}` }], + details: { action, model, durationMs: Date.now() - startTime } as OllamaToolDetails, + }; + } + + case "show": { + if (!model) { + return { + content: [{ type: "text", text: "Error: 'model' parameter is required for show action." }], + isError: true, + details: { action, durationMs: Date.now() - startTime, error: "missing_model" } as OllamaToolDetails, + }; + } + + const running = await client.isRunning(); + if (!running) { + return { + content: [{ type: "text", text: "Ollama is not running." 
}], + isError: true, + details: { action, model, durationMs: Date.now() - startTime, error: "not_running" } as OllamaToolDetails, + }; + } + + const info = await client.showModel(model); + const details = info.details; + const infoLines = [ + `Model: ${model}`, + `Family: ${details.family}`, + `Parameters: ${details.parameter_size}`, + `Quantization: ${details.quantization_level}`, + `Format: ${details.format}`, + ]; + if (details.families?.length) { + infoLines.push(`Families: ${details.families.join(", ")}`); + } + if (info.parameters) { + infoLines.push(`\nModelfile parameters:\n${info.parameters}`); + } + + return { + content: [{ type: "text", text: infoLines.join("\n") }], + details: { action, model, durationMs: Date.now() - startTime } as OllamaToolDetails, + }; + } + default: return { content: [{ type: "text", text: `Unknown action: ${action}` }], diff --git a/src/resources/extensions/ollama/tests/ollama-discovery.test.ts b/src/resources/extensions/ollama/tests/ollama-discovery.test.ts index b69cf84e1..a228bf663 100644 --- a/src/resources/extensions/ollama/tests/ollama-discovery.test.ts +++ b/src/resources/extensions/ollama/tests/ollama-discovery.test.ts @@ -1,28 +1 @@ // GSD2 — Tests for Ollama model discovery and enrichment -import { describe, it, afterEach } from "node:test"; -import assert from "node:assert/strict"; -import { getOllamaOpenAIBaseUrl } from "../ollama-discovery.js"; - -// ─── getOllamaOpenAIBaseUrl ───────────────────────────────────────────────── - -describe("getOllamaOpenAIBaseUrl", () => { const originalHost = process.env.OLLAMA_HOST; - - afterEach(() => { if (originalHost === undefined) { delete process.env.OLLAMA_HOST; } else { process.env.OLLAMA_HOST = originalHost; } }); - - it("returns default OpenAI-compat URL", () => { - delete process.env.OLLAMA_HOST; - assert.equal(getOllamaOpenAIBaseUrl(), "http://localhost:11434/v1"); - }); - - it("appends /v1 to custom OLLAMA_HOST", () => { - process.env.OLLAMA_HOST = "http://remote:9999"; - assert.equal(getOllamaOpenAIBaseUrl(), "http://remote:9999/v1"); - }); -}); diff --git a/src/resources/extensions/ollama/thinking-parser.ts b/src/resources/extensions/ollama/thinking-parser.ts new file mode 100644 index 000000000..9c060761c --- /dev/null +++ b/src/resources/extensions/ollama/thinking-parser.ts @@ -0,0 +1,116 @@ +// GSD2 — Ollama Extension: Stateful <think> tag stream parser + +/** + * Extracts <think>...</think> thinking blocks from a streaming text response. + * Handles the case where tag boundaries span multiple chunks by buffering + * up to 8 characters (length of "</think>") at chunk boundaries. + * + * Used for reasoning models like deepseek-r1 and qwq that embed thinking + * inline in their text output. + */ + +export type ParsedChunk = + | { type: "thinking"; text: string } + | { type: "text"; text: string }; + +const OPEN_TAG = "<think>"; +const CLOSE_TAG = "</think>"; +const MAX_TAG_LEN = Math.max(OPEN_TAG.length, CLOSE_TAG.length); + +export class ThinkingTagParser { + private buffer = ""; + private inThinking = false; + + /** + * Feed a chunk of text and get back parsed segments. + * May return zero or more segments depending on tag boundaries.
+ */ + push(chunk: string): ParsedChunk[] { + const results: ParsedChunk[] = []; + let input = this.buffer + chunk; + this.buffer = ""; + + while (input.length > 0) { + if (this.inThinking) { + const closeIdx = input.indexOf(CLOSE_TAG); + if (closeIdx !== -1) { + // Found close tag — emit thinking content before it + const thinking = input.slice(0, closeIdx); + if (thinking) results.push({ type: "thinking", text: thinking }); + this.inThinking = false; + input = input.slice(closeIdx + CLOSE_TAG.length); + } else if (this.couldBePartialTag(input, CLOSE_TAG)) { + // Possible partial close tag at end — buffer only the matching tail + const tailLen = this.getPartialTagTailLength(input, CLOSE_TAG); + const safe = input.slice(0, input.length - tailLen); + if (safe) results.push({ type: "thinking", text: safe }); + this.buffer = input.slice(-tailLen); + break; + } else { + // No close tag — emit all as thinking + results.push({ type: "thinking", text: input }); + break; + } + } else { + const openIdx = input.indexOf(OPEN_TAG); + if (openIdx !== -1) { + // Found open tag — emit text before it + const text = input.slice(0, openIdx); + if (text) results.push({ type: "text", text }); + this.inThinking = true; + input = input.slice(openIdx + OPEN_TAG.length); + } else if (this.couldBePartialTag(input, OPEN_TAG)) { + // Possible partial open tag at end — buffer only the matching tail + const tailLen = this.getPartialTagTailLength(input, OPEN_TAG); + const safe = input.slice(0, input.length - tailLen); + if (safe) results.push({ type: "text", text: safe }); + this.buffer = input.slice(-tailLen); + break; + } else { + // No open tag — emit all as text + results.push({ type: "text", text: input }); + break; + } + } + } + + return results; + } + + /** + * Flush any remaining buffered content. Call at end of stream. + */ + flush(): ParsedChunk[] { + if (!this.buffer) return []; + + const result: ParsedChunk = { + type: this.inThinking ? "thinking" : "text", + text: this.buffer, + }; + this.buffer = ""; + return [result]; + } + + /** + * Check if the end of input could be the start of a partial tag. + * Only buffers when the tail of input matches a prefix of the tag. + */ + private couldBePartialTag(input: string, tag: string): boolean { + return this.getPartialTagTailLength(input, tag) > 0; + } + + /** + * Get the length of the tail of input that matches a prefix of the tag. + * Returns 0 if no partial match. + */ + private getPartialTagTailLength(input: string, tag: string): number { + const maxCheck = Math.min(input.length, tag.length - 1); + for (let len = maxCheck; len >= 1; len--) { + const tail = input.slice(-len); + if (tag.startsWith(tail)) { + return len; + } + } + return 0; + } +} diff --git a/src/resources/extensions/ollama/types.ts b/src/resources/extensions/ollama/types.ts index 5f2c88705..51e9beb01 100644 --- a/src/resources/extensions/ollama/types.ts +++ b/src/resources/extensions/ollama/types.ts @@ -72,11 +72,31 @@ export interface OllamaVersionResponse { // ─── /api/chat ────────────────────────────────────────────────────────────── +/** Per-model Ollama inference options carried via Model.providerOptions. */ +export interface OllamaChatOptions { + /** How long to keep the model loaded after the last request. e.g. "5m", "0" to unload. */ + keep_alive?: string; + /** Number of GPU layers to offload. -1 = all. */ + num_gpu?: number; + /** Override the context window for Ollama requests. Only sent when explicitly set. */ + num_ctx?: number; + /** Sampling: top-k most likely tokens. 
Default: 40 */ + top_k?: number; + /** Sampling: nucleus sampling threshold. */ + top_p?: number; + /** Sampling: penalize repeating tokens. Default: 1.1 */ + repeat_penalty?: number; + /** Sampling: fixed seed for reproducibility. */ + seed?: number; +} + export interface OllamaChatMessage { role: "system" | "user" | "assistant" | "tool"; content: string; images?: string[]; tool_calls?: OllamaToolCall[]; + /** Tool name — required for role: "tool" messages to correlate results with calls. */ + name?: string; } export interface OllamaToolCall { @@ -110,7 +130,10 @@ export interface OllamaChatRequest { temperature?: number; top_p?: number; top_k?: number; + repeat_penalty?: number; + seed?: number; stop?: string[]; + num_gpu?: number; }; keep_alive?: string; }
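To illustrate the new `providerOptions` plumbing end to end, here is a minimal sketch of a per-model entry and how the stream handler narrows it back to `OllamaChatOptions`. The entry shape loosely mirrors `ProviderModelConfig`; the `codestralEntry` name and the field values are illustrative, not part of this change.

```ts
import type { OllamaChatOptions } from "./types.js";

// Hypothetical per-model entry; only providerOptions and its keys come from this change.
const codestralEntry = {
  id: "codestral:22b",
  contextWindow: 262144,
  maxTokens: 32768,
  providerOptions: {
    num_ctx: 262144,   // explicit, so buildRequest() forwards it to /api/chat
    keep_alive: "30m", // keep the model loaded between requests
    num_gpu: -1,       // offload all layers to the GPU
    seed: 7,           // reproducible sampling
  } as Record<string, unknown>,
};

// Inside the stream handler the opaque bag is narrowed back to the typed shape,
// mirroring what buildRequest() in ollama-chat-provider.ts does:
const ollamaOpts = (codestralEntry.providerOptions ?? {}) as OllamaChatOptions;
```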
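The capability table is keyed by name prefixes, but `getModelCapabilities()` itself is outside this diff. Below is a minimal sketch of the documented matching rule (prefix of the name before the colon, first entry wins), assuming it sits next to `KNOWN_MODELS` in model-capabilities.ts.

```ts
// Sketch only: the real getModelCapabilities() is not shown in this diff.
function lookupCapability(modelName: string): ModelCapability | undefined {
  const base = modelName.split(":")[0].toLowerCase(); // "qwen2.5-coder:7b" -> "qwen2.5-coder"
  const hit = KNOWN_MODELS.find(([pattern]) => base.startsWith(pattern));
  return hit?.[1];
}

// lookupCapability("qwen2.5-coder:7b")
//   -> { contextWindow: 131072, maxTokens: 32768, ollamaOptions: { num_ctx: 131072 } }
// Ordering matters: "qwen2.5-coder" is listed above "qwen2.5", so the coder entry wins.
```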
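On the request side, this is roughly what `buildRequest()` assembles for a known model family; the concrete values are illustrative. `num_ctx` appears only because the capability table (or user config) supplied it; for an unrecognized model the options block simply omits it and Ollama falls back to its own default.

```ts
import type { OllamaChatRequest } from "./types.js";

const request: OllamaChatRequest = {
  model: "qwen2.5-coder:7b",
  messages: [
    { role: "system", content: "You are a coding assistant." },
    { role: "user", content: "Write a binary search in TypeScript." },
  ],
  stream: true,
  options: {
    num_ctx: 131072,    // from KNOWN_MODELS ollamaOptions; never inferred
    num_predict: 32768, // maxTokens
    temperature: 0.2,   // only when the caller passed one
  },
  keep_alive: "5m",     // only when set in providerOptions
};
```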
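Message conversion, user side: multimodal content is flattened into one text string plus a raw-base64 `images` array. The input shape below is simplified; only the `data` field of `ImageContent` is relied on here, matching what `convertUserMessage()` reads.

```ts
// Input content blocks (simplified, values illustrative):
const userContent = [
  { type: "text", text: "What is in this screenshot?" },
  { type: "image", data: "data:image/png;base64,iVBORw0KGgo..." },
];

// convertUserMessage() produces the Ollama-native shape, stripping the data-URI header:
// {
//   role: "user",
//   content: "What is in this screenshot?",
//   images: ["iVBORw0KGgo..."]
// }
```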
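On the assistant side, thinking blocks are folded back into the content string so the model sees its own reasoning on the next turn. Assuming the same `<think>` markers the parser strips on the way out, a prior assistant turn round-trips roughly as shown; the tool name and arguments are illustrative.

```ts
// What convertAssistantMessage() emits for a turn with thinking, text, and one tool call:
const assistantMsg = {
  role: "assistant",
  content: "<think>User wants the file list.</think>I'll list the directory.",
  tool_calls: [
    { function: { name: "list_dir", arguments: { path: "." } } }, // illustrative tool
  ],
};
```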
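The `ThinkingTagParser` buffers a possible partial tag at each chunk boundary, so a `</think>` split across chunks never leaks into visible text. A small usage sketch with the expected return values in comments:

```ts
import { ThinkingTagParser } from "./thinking-parser.js";

const parser = new ThinkingTagParser();

// deepseek-r1 style output arriving in three chunks, with the close tag split as "</th" + "ink>":
console.log(parser.push("<think>Let me chec"));    // [{ type: "thinking", text: "Let me chec" }]
console.log(parser.push("k the edge cases.</th")); // [{ type: "thinking", text: "k the edge cases." }]
console.log(parser.push("ink>The answer is 42.")); // [{ type: "text", text: "The answer is 42." }]
console.log(parser.flush());                       // [] (nothing left buffered)
```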
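`parseNDJsonStream()` is used in two modes: permissive for `/api/pull` progress, strict for `/api/chat`. A self-contained sketch of the difference; the `streamOf` helper just wraps a fixed string in a `ReadableStream` for the demo.

```ts
import { parseNDJsonStream } from "./ndjson-stream.js";

function streamOf(text: string): ReadableStream<Uint8Array> {
  return new ReadableStream<Uint8Array>({
    start(controller) {
      controller.enqueue(new TextEncoder().encode(text));
      controller.close();
    },
  });
}

async function demo() {
  const ndjson = '{"status":"pulling"}\n{not json}\n{"status":"success"}\n';

  // Permissive (default): the malformed middle line is skipped.
  for await (const obj of parseNDJsonStream(streamOf(ndjson))) {
    console.log(obj); // { status: "pulling" } then { status: "success" }
  }

  // Strict: the first object is still yielded, then the malformed line throws
  // instead of being silently dropped.
  try {
    for await (const obj of parseNDJsonStream(streamOf(ndjson), undefined, true)) {
      console.log(obj);
    }
  } catch (err) {
    console.error((err as Error).message); // "Malformed NDJSON line from Ollama: {not json}"
  }
}
```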
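Finally, the new `InferenceMetrics` surface: Ollama reports nanosecond counters on the final chunk, and `extractMetrics()` converts them once, when `done` is true. A worked example with illustrative numbers:

```ts
// Field names match the final /api/chat chunk; figures are illustrative.
const evalCount = 256;                    // eval_count: generated tokens
const evalDurationNs = 4_000_000_000;     // eval_duration: 4 s generating
const promptEvalDurationNs = 500_000_000; // prompt_eval_duration: 0.5 s prompt eval
const totalDurationNs = 4_700_000_000;    // total_duration

const metrics = {
  tokensPerSecond: evalCount / (evalDurationNs / 1e9), // 64
  evalDurationMs: evalDurationNs / 1e6,                // 4000
  promptEvalDurationMs: promptEvalDurationNs / 1e6,    // 500
  totalDurationMs: totalDurationNs / 1e6,              // 4700
}; // same shape as the InferenceMetrics interface added to @gsd/pi-ai
```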