feat(ollama): native /api/chat provider with full option exposure

Replace the OpenAI-compat shim with a native Ollama /api/chat streaming
provider that exposes all commonly used Ollama options and surfaces
inference performance metrics.

Key changes:
- Native NDJSON streaming from /api/chat (no more OpenAI shim)
- Known models send num_ctx from the capability table; unknown models defer
  to Ollama's default to avoid OOM on constrained hosts
- Exposes top_p, top_k, repeat_penalty, seed, num_gpu, keep_alive, and num_ctx
  via per-model providerOptions (example below); temperature and num_predict
  map from the standard stream options (temperature, maxTokens)
- Extracts <think>...</think> blocks for reasoning models (deepseek-r1, qwq)
- Surfaces InferenceMetrics (tokens/sec, durations) on AssistantMessage
- Adds remove and show actions to ollama_manage LLM tool
- Adds "ollama-chat" to KnownApi, providerOptions to Model<TApi>
- NDJSON parser uses strict mode for chat (fails on malformed frames)
- Mixed content+tool_call chunks handled independently
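
Example per-model providerOptions entry (illustrative values; keys follow the
new OllamaChatOptions type in types.ts):

  providerOptions: {
    num_ctx: 32768,        // explicit context window; only sent when set
    keep_alive: "10m",     // keep the model resident between requests
    num_gpu: -1,           // offload all layers to the GPU
    top_p: 0.9,
    repeat_penalty: 1.1,
    seed: 42
  }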

Closes #3544
Jeremy 2026-04-05 08:49:48 -05:00
parent dcf41154b8
commit 4ba2d5a219
13 changed files with 832 additions and 112 deletions


@@ -13,7 +13,8 @@ export type KnownApi =
| "bedrock-converse-stream"
| "google-generative-ai"
| "google-gemini-cli"
| "google-vertex";
| "google-vertex"
| "ollama-chat";
export type Api = KnownApi | (string & {});
@@ -212,9 +213,23 @@ export interface AssistantMessage {
errorMessage?: string;
/** Server-requested retry delay in milliseconds (from Retry-After or rate limit headers). */
retryAfterMs?: number;
/** Provider inference performance metrics (e.g. tokens/sec from local models). */
inferenceMetrics?: InferenceMetrics;
timestamp: number; // Unix timestamp in milliseconds
}
/** Inference performance metrics reported by providers that support it (e.g. Ollama). */
export interface InferenceMetrics {
/** Tokens generated per second during eval phase. */
tokensPerSecond: number;
/** Wall-clock duration of the full request in milliseconds. */
totalDurationMs: number;
/** Duration of the eval (generation) phase in milliseconds. */
evalDurationMs: number;
/** Duration of the prompt eval phase in milliseconds. */
promptEvalDurationMs: number;
}
export interface ToolResultMessage<TDetails = any> {
role: "toolResult";
toolCallId: string;
@@ -374,4 +389,6 @@ export interface Model<TApi extends Api> {
* Read these fields instead of pattern-matching on model IDs or provider names.
*/
capabilities?: ModelCapabilities;
/** Opaque provider-specific options. Cast to the appropriate type in the provider's stream handler. */
providerOptions?: Record<string, unknown>;
}


@@ -1341,6 +1341,8 @@ export interface ProviderModelConfig {
headers?: Record<string, string>;
/** OpenAI compatibility settings. */
compat?: Model<Api>["compat"];
/** Opaque provider-specific options (e.g. Ollama keep_alive, num_gpu). */
providerOptions?: Record<string, unknown>;
}
/** Extension factory function type. Supports both sync and async initialization. */


@@ -742,6 +742,7 @@ export class ModelRegistry {
maxTokens: modelDef.maxTokens,
headers,
compat: modelDef.compat,
providerOptions: modelDef.providerOptions,
} as Model<Api>);
}
@@ -917,5 +918,6 @@ export interface ProviderConfigInput {
maxTokens: number;
headers?: Record<string, string>;
compat?: Model<Api>["compat"];
providerOptions?: Record<string, unknown>;
}>;
}


@@ -17,19 +17,10 @@
*/
import { importExtensionModule, type ExtensionAPI } from "@gsd/pi-coding-agent";
import type { OpenAICompletionsCompat } from "@gsd/pi-ai";
import * as client from "./ollama-client.js";
import { discoverModels, getOllamaOpenAIBaseUrl } from "./ollama-discovery.js";
import { discoverModels } from "./ollama-discovery.js";
import { registerOllamaCommands } from "./ollama-commands.js";
/** Default compat settings for Ollama models via OpenAI-compat endpoint */
const OLLAMA_COMPAT: OpenAICompletionsCompat = {
supportsDeveloperRole: false,
supportsReasoningEffort: false,
supportsUsageInStreaming: false,
maxTokensField: "max_tokens",
supportsStore: false,
};
import { streamOllamaChat } from "./ollama-chat-provider.js";
let toolsPromise: Promise<void> | null = null;
@@ -68,12 +59,13 @@ async function probeAndRegister(pi: ExtensionAPI): Promise<boolean> {
const models = await discoverModels();
if (models.length === 0) return true; // Running but no models pulled
const baseUrl = getOllamaOpenAIBaseUrl();
const baseUrl = client.getOllamaHost();
pi.registerProvider("ollama", {
authMode: "none",
baseUrl,
api: "openai-completions",
api: "ollama-chat",
streamSimple: streamOllamaChat,
isReady: () => true,
models: models.map((m) => ({
id: m.id,
@@ -83,7 +75,7 @@ async function probeAndRegister(pi: ExtensionAPI): Promise<boolean> {
cost: m.cost,
contextWindow: m.contextWindow,
maxTokens: m.maxTokens,
compat: OLLAMA_COMPAT,
providerOptions: (m.ollamaOptions ?? {}) as Record<string, unknown>,
})),
});


@@ -8,11 +8,15 @@
* Fallback: estimate from parameter count if model isn't in the table.
*/
import type { OllamaChatOptions } from "./types.js";
export interface ModelCapability {
contextWindow?: number;
maxTokens?: number;
input?: ("text" | "image")[];
reasoning?: boolean;
/** Ollama-specific default inference options for this model family. */
ollamaOptions?: OllamaChatOptions;
}
/**
@@ -20,58 +24,61 @@ export interface ModelCapability {
* Keys are matched as prefixes against the model name (before the colon/tag).
* More specific entries should appear first.
*/
// Note: ollamaOptions.num_ctx is set for known model families where the context
// window is authoritative. For unknown/estimated models, num_ctx is NOT sent
// to avoid OOM risk — Ollama uses its own safe default instead.
const KNOWN_MODELS: Array<[pattern: string, caps: ModelCapability]> = [
// ─── Reasoning models ───────────────────────────────────────────────
["deepseek-r1", { contextWindow: 131072, reasoning: true }],
["qwq", { contextWindow: 131072, reasoning: true }],
["deepseek-r1", { contextWindow: 131072, reasoning: true, ollamaOptions: { num_ctx: 131072 } }],
["qwq", { contextWindow: 131072, reasoning: true, ollamaOptions: { num_ctx: 131072 } }],
// ─── Vision models ──────────────────────────────────────────────────
["llava", { contextWindow: 4096, input: ["text", "image"] }],
["bakllava", { contextWindow: 4096, input: ["text", "image"] }],
["moondream", { contextWindow: 8192, input: ["text", "image"] }],
["llama3.2-vision", { contextWindow: 131072, input: ["text", "image"] }],
["minicpm-v", { contextWindow: 4096, input: ["text", "image"] }],
["llava", { contextWindow: 4096, input: ["text", "image"], ollamaOptions: { num_ctx: 4096 } }],
["bakllava", { contextWindow: 4096, input: ["text", "image"], ollamaOptions: { num_ctx: 4096 } }],
["moondream", { contextWindow: 8192, input: ["text", "image"], ollamaOptions: { num_ctx: 8192 } }],
["llama3.2-vision", { contextWindow: 131072, input: ["text", "image"], ollamaOptions: { num_ctx: 131072 } }],
["minicpm-v", { contextWindow: 4096, input: ["text", "image"], ollamaOptions: { num_ctx: 4096 } }],
// ─── Code models ────────────────────────────────────────────────────
["codestral", { contextWindow: 262144, maxTokens: 32768 }],
["qwen2.5-coder", { contextWindow: 131072, maxTokens: 32768 }],
["deepseek-coder-v2", { contextWindow: 131072, maxTokens: 16384 }],
["starcoder2", { contextWindow: 16384, maxTokens: 8192 }],
["codegemma", { contextWindow: 8192, maxTokens: 8192 }],
["codellama", { contextWindow: 16384, maxTokens: 8192 }],
["devstral", { contextWindow: 131072, maxTokens: 32768 }],
["codestral", { contextWindow: 262144, maxTokens: 32768, ollamaOptions: { num_ctx: 262144 } }],
["qwen2.5-coder", { contextWindow: 131072, maxTokens: 32768, ollamaOptions: { num_ctx: 131072 } }],
["deepseek-coder-v2", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
["starcoder2", { contextWindow: 16384, maxTokens: 8192, ollamaOptions: { num_ctx: 16384 } }],
["codegemma", { contextWindow: 8192, maxTokens: 8192, ollamaOptions: { num_ctx: 8192 } }],
["codellama", { contextWindow: 16384, maxTokens: 8192, ollamaOptions: { num_ctx: 16384 } }],
["devstral", { contextWindow: 131072, maxTokens: 32768, ollamaOptions: { num_ctx: 131072 } }],
// ─── Llama family ───────────────────────────────────────────────────
["llama3.3", { contextWindow: 131072, maxTokens: 16384 }],
["llama3.2", { contextWindow: 131072, maxTokens: 16384 }],
["llama3.1", { contextWindow: 131072, maxTokens: 16384 }],
["llama3", { contextWindow: 8192, maxTokens: 8192 }],
["llama2", { contextWindow: 4096, maxTokens: 4096 }],
["llama3.3", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
["llama3.2", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
["llama3.1", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
["llama3", { contextWindow: 8192, maxTokens: 8192, ollamaOptions: { num_ctx: 8192 } }],
["llama2", { contextWindow: 4096, maxTokens: 4096, ollamaOptions: { num_ctx: 4096 } }],
// ─── Qwen family ────────────────────────────────────────────────────
["qwen3", { contextWindow: 131072, maxTokens: 32768 }],
["qwen2.5", { contextWindow: 131072, maxTokens: 32768 }],
["qwen2", { contextWindow: 131072, maxTokens: 32768 }],
["qwen3", { contextWindow: 131072, maxTokens: 32768, ollamaOptions: { num_ctx: 131072 } }],
["qwen2.5", { contextWindow: 131072, maxTokens: 32768, ollamaOptions: { num_ctx: 131072 } }],
["qwen2", { contextWindow: 131072, maxTokens: 32768, ollamaOptions: { num_ctx: 131072 } }],
// ─── Gemma family ───────────────────────────────────────────────────
["gemma3", { contextWindow: 131072, maxTokens: 16384 }],
["gemma2", { contextWindow: 8192, maxTokens: 8192 }],
["gemma3", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
["gemma2", { contextWindow: 8192, maxTokens: 8192, ollamaOptions: { num_ctx: 8192 } }],
// ─── Mistral family ─────────────────────────────────────────────────
["mistral-large", { contextWindow: 131072, maxTokens: 16384 }],
["mistral-small", { contextWindow: 131072, maxTokens: 16384 }],
["mistral-nemo", { contextWindow: 131072, maxTokens: 16384 }],
["mistral", { contextWindow: 32768, maxTokens: 8192 }],
["mixtral", { contextWindow: 32768, maxTokens: 8192 }],
["mistral-large", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
["mistral-small", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
["mistral-nemo", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
["mistral", { contextWindow: 32768, maxTokens: 8192, ollamaOptions: { num_ctx: 32768 } }],
["mixtral", { contextWindow: 32768, maxTokens: 8192, ollamaOptions: { num_ctx: 32768 } }],
// ─── Phi family ─────────────────────────────────────────────────────
["phi4", { contextWindow: 16384, maxTokens: 16384 }],
["phi3.5", { contextWindow: 131072, maxTokens: 16384 }],
["phi3", { contextWindow: 131072, maxTokens: 4096 }],
["phi4", { contextWindow: 16384, maxTokens: 16384, ollamaOptions: { num_ctx: 16384 } }],
["phi3.5", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
["phi3", { contextWindow: 131072, maxTokens: 4096, ollamaOptions: { num_ctx: 131072 } }],
// ─── Command R ──────────────────────────────────────────────────────
["command-r-plus", { contextWindow: 131072, maxTokens: 16384 }],
["command-r", { contextWindow: 131072, maxTokens: 16384 }],
["command-r-plus", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
["command-r", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
];
/**


@@ -0,0 +1,63 @@
// GSD2 — Ollama Extension: NDJSON streaming parser
/**
* Parses a streaming NDJSON (newline-delimited JSON) response body into
* typed objects. Used for Ollama's /api/chat and /api/pull endpoints.
*
* @param strict When true, malformed JSON lines throw instead of being skipped.
* Use strict mode for inference streams where silent data loss is unacceptable.
* Use permissive mode (default) for progress endpoints like /api/pull.
*/
export async function* parseNDJsonStream<T>(
body: ReadableStream<Uint8Array>,
signal?: AbortSignal,
strict = false,
): AsyncGenerator<T> {
const reader = body.getReader();
const decoder = new TextDecoder();
let buffer = "";
try {
while (true) {
if (signal?.aborted) break;
const { done, value } = await reader.read();
if (done) break;
buffer += decoder.decode(value, { stream: true });
const lines = buffer.split("\n");
buffer = lines.pop() ?? "";
for (const line of lines) {
const trimmed = line.trim();
if (!trimmed) continue;
try {
yield JSON.parse(trimmed) as T;
} catch (err) {
if (strict) {
throw new Error(
`Malformed NDJSON line from Ollama: ${trimmed.slice(0, 200)}`,
);
}
// Permissive mode: skip malformed lines
}
}
}
// Flush remaining buffer (skip if aborted)
if (buffer.trim() && !signal?.aborted) {
try {
yield JSON.parse(buffer.trim()) as T;
} catch (err) {
if (strict) {
throw new Error(
`Malformed NDJSON line from Ollama: ${buffer.trim().slice(0, 200)}`,
);
}
}
}
} finally {
reader.releaseLock();
}
}
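
// Usage sketch (illustrative; not part of this change): permissive mode suits
// progress endpoints like /api/pull, while strict mode is reserved for inference
// streams where a silently dropped frame would mean data loss.
async function exampleChatStream(body: ReadableStream<Uint8Array>, signal?: AbortSignal) {
  for await (const chunk of parseNDJsonStream<{ done: boolean }>(body, signal, /* strict */ true)) {
    if (chunk.done) break; // the final frame carries done_reason and timing fields
  }
}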


@@ -0,0 +1,459 @@
// GSD2 — Ollama Extension: Native /api/chat stream provider
/**
* Implements the "ollama-chat" API provider, streaming responses directly
* from Ollama's native /api/chat endpoint instead of the OpenAI compatibility
* shim. This exposes Ollama-specific options (num_ctx, keep_alive, num_gpu,
* sampling parameters) and surfaces inference performance metrics.
*/
import {
type Api,
type AssistantMessage,
type AssistantMessageEvent,
type AssistantMessageEventStream,
type Context,
type ImageContent,
type InferenceMetrics,
type Message,
type Model,
type SimpleStreamOptions,
type StopReason,
type TextContent,
type ThinkingContent,
type Tool,
type ToolCall,
type Usage,
EventStream,
} from "@gsd/pi-ai";
import { chat } from "./ollama-client.js";
import type {
OllamaChatMessage,
OllamaChatOptions,
OllamaChatRequest,
OllamaChatResponse,
OllamaTool,
OllamaToolCall,
} from "./types.js";
import { ThinkingTagParser, type ParsedChunk } from "./thinking-parser.js";
/** Create an AssistantMessageEventStream using the base EventStream class. */
function createStream(): AssistantMessageEventStream {
return new EventStream<AssistantMessageEvent, AssistantMessage>(
(event) => event.type === "done" || event.type === "error",
(event) => {
if (event.type === "done") return event.message;
if (event.type === "error") return event.error;
throw new Error("Unexpected event type for final result");
},
) as AssistantMessageEventStream;
}
// ─── Stream handler ─────────────────────────────────────────────────────────
export function streamOllamaChat(
model: Model<Api>,
context: Context,
options?: SimpleStreamOptions,
): AssistantMessageEventStream {
const stream = createStream();
(async () => {
const output = buildInitialOutput(model);
try {
const request = buildRequest(model, context, options);
stream.push({ type: "start", partial: output });
const useThinkingParser = model.reasoning;
const thinkParser = useThinkingParser ? new ThinkingTagParser() : null;
let contentIndex = -1;
let currentBlockType: "text" | "thinking" | null = null;
function startBlock(type: "text" | "thinking") {
contentIndex++;
currentBlockType = type;
if (type === "text") {
output.content.push({ type: "text", text: "" });
stream.push({ type: "text_start", contentIndex, partial: output });
} else {
output.content.push({ type: "thinking", thinking: "" });
stream.push({ type: "thinking_start", contentIndex, partial: output });
}
}
function endBlock() {
if (currentBlockType === null) return;
if (currentBlockType === "text") {
const block = output.content[contentIndex] as TextContent;
stream.push({ type: "text_end", contentIndex, content: block.text, partial: output });
} else {
const block = output.content[contentIndex] as ThinkingContent;
stream.push({ type: "thinking_end", contentIndex, content: block.thinking, partial: output });
}
currentBlockType = null;
}
function emitDelta(type: "text" | "thinking", text: string) {
if (!text) return;
if (currentBlockType !== type) {
endBlock();
startBlock(type);
}
if (type === "text") {
(output.content[contentIndex] as TextContent).text += text;
stream.push({ type: "text_delta", contentIndex, delta: text, partial: output });
} else {
(output.content[contentIndex] as ThinkingContent).thinking += text;
stream.push({ type: "thinking_delta", contentIndex, delta: text, partial: output });
}
}
function processChunks(chunks: ParsedChunk[]) {
for (const chunk of chunks) {
emitDelta(chunk.type, chunk.text);
}
}
function processToolCalls(toolCalls: OllamaToolCall[]) {
endBlock();
for (const tc of toolCalls) {
contentIndex++;
const toolCall: ToolCall = {
type: "toolCall",
id: `ollama_tc_${contentIndex}`,
name: tc.function.name,
arguments: tc.function.arguments,
};
output.content.push(toolCall);
stream.push({ type: "toolcall_start", contentIndex, partial: output });
// Emit a delta with the serialized arguments (convention: start/delta/end)
stream.push({
type: "toolcall_delta",
contentIndex,
delta: JSON.stringify(tc.function.arguments),
partial: output,
});
stream.push({
type: "toolcall_end",
contentIndex,
toolCall,
partial: output,
});
}
output.stopReason = "toolUse";
}
for await (const chunk of chat(request, options?.signal)) {
// Handle text content — process independently of tool_calls
// (a chunk may contain both content and tool_calls)
const content = chunk.message?.content ?? "";
if (content && !chunk.done) {
if (thinkParser) {
processChunks(thinkParser.push(content));
} else {
emitDelta("text", content);
}
}
// Handle tool calls (Ollama sends them complete, may be on done:true chunk)
if (chunk.message?.tool_calls?.length) {
processToolCalls(chunk.message.tool_calls);
}
if (chunk.done) {
// Final chunk — extract metrics and usage
if (thinkParser) processChunks(thinkParser.flush());
endBlock();
output.usage = buildUsage(chunk);
output.inferenceMetrics = extractMetrics(chunk);
// Preserve "toolUse" if tool calls were processed
if (output.stopReason !== "toolUse") {
output.stopReason = mapStopReason(chunk.done_reason);
}
break;
}
}
assertStreamSuccess(output, options?.signal);
finalizeStream(stream, output);
} catch (error) {
handleStreamError(stream, output, error, options?.signal);
}
})();
return stream;
}
// ─── Request building ───────────────────────────────────────────────────────
function buildRequest(
model: Model<Api>,
context: Context,
options?: SimpleStreamOptions,
): OllamaChatRequest {
const ollamaOpts = (model.providerOptions ?? {}) as OllamaChatOptions;
const request: OllamaChatRequest = {
model: model.id,
messages: convertMessages(context),
stream: true,
};
// Build options block with all Ollama-specific parameters
const reqOptions: NonNullable<OllamaChatRequest["options"]> = {};
// Context window — only sent when explicitly configured via providerOptions.
// Sending inferred/estimated values risks OOM on constrained hosts.
// Users can set num_ctx per-model in models.json ollamaOptions or the
// capability table can provide it for known model families.
if (ollamaOpts.num_ctx !== undefined && ollamaOpts.num_ctx > 0) {
reqOptions.num_ctx = ollamaOpts.num_ctx;
}
// Max output tokens
const maxTokens = options?.maxTokens ?? model.maxTokens;
if (maxTokens > 0) {
reqOptions.num_predict = maxTokens;
}
// Temperature
if (options?.temperature !== undefined) {
reqOptions.temperature = options.temperature;
}
// Per-model sampling options from providerOptions
if (ollamaOpts.top_p !== undefined) reqOptions.top_p = ollamaOpts.top_p;
if (ollamaOpts.top_k !== undefined) reqOptions.top_k = ollamaOpts.top_k;
if (ollamaOpts.repeat_penalty !== undefined) reqOptions.repeat_penalty = ollamaOpts.repeat_penalty;
if (ollamaOpts.seed !== undefined) reqOptions.seed = ollamaOpts.seed;
if (ollamaOpts.num_gpu !== undefined) reqOptions.num_gpu = ollamaOpts.num_gpu;
if (Object.keys(reqOptions).length > 0) {
request.options = reqOptions;
}
// Keep alive
if (ollamaOpts.keep_alive !== undefined) {
request.keep_alive = ollamaOpts.keep_alive;
}
// Tools
if (context.tools?.length) {
request.tools = convertTools(context.tools);
}
return request;
}
// ─── Message conversion ─────────────────────────────────────────────────────
function convertMessages(context: Context): OllamaChatMessage[] {
const messages: OllamaChatMessage[] = [];
// System prompt
if (context.systemPrompt) {
messages.push({ role: "system", content: context.systemPrompt });
}
for (const msg of context.messages) {
switch (msg.role) {
case "user":
messages.push(convertUserMessage(msg));
break;
case "assistant":
messages.push(convertAssistantMessage(msg));
break;
case "toolResult":
messages.push({
role: "tool",
content: msg.content
.filter((c): c is TextContent => c.type === "text")
.map((c) => c.text)
.join("\n"),
name: msg.toolName,
});
break;
}
}
return messages;
}
function convertUserMessage(msg: Message & { role: "user" }): OllamaChatMessage {
if (typeof msg.content === "string") {
return { role: "user", content: msg.content };
}
const textParts: string[] = [];
const images: string[] = [];
for (const part of msg.content) {
if (part.type === "text") {
textParts.push(part.text);
} else if (part.type === "image") {
// Strip data URI prefix if present
let data = (part as ImageContent).data;
const commaIdx = data.indexOf(",");
if (commaIdx !== -1 && data.startsWith("data:")) {
data = data.slice(commaIdx + 1);
}
images.push(data);
}
}
const result: OllamaChatMessage = {
role: "user",
content: textParts.join("\n"),
};
if (images.length > 0) {
result.images = images;
}
return result;
}
function convertAssistantMessage(msg: Message & { role: "assistant" }): OllamaChatMessage {
let content = "";
const toolCalls: OllamaChatMessage["tool_calls"] = [];
for (const block of msg.content) {
if (block.type === "thinking") {
// Serialize thinking back inline for round-trip with Ollama
content += `<think>${(block as ThinkingContent).thinking}</think>`;
} else if (block.type === "text") {
content += (block as TextContent).text;
} else if (block.type === "toolCall") {
const tc = block as ToolCall;
toolCalls.push({
function: {
name: tc.name,
arguments: tc.arguments,
},
});
}
}
const result: OllamaChatMessage = { role: "assistant", content };
if (toolCalls.length > 0) {
result.tool_calls = toolCalls;
}
return result;
}
// ─── Tool conversion ────────────────────────────────────────────────────────
function convertTools(tools: Tool[]): OllamaTool[] {
return tools.map((tool) => {
const params = tool.parameters as Record<string, unknown>;
return {
type: "function" as const,
function: {
name: tool.name,
description: tool.description,
parameters: {
type: "object" as const,
required: params.required as string[] | undefined,
properties: (params.properties as Record<string, unknown>) ?? {},
},
},
};
});
}
// ─── Response mapping ───────────────────────────────────────────────────────
function mapStopReason(doneReason?: string): StopReason {
switch (doneReason) {
case "stop":
return "stop";
case "length":
return "length";
default:
return "stop";
}
}
function buildUsage(chunk: OllamaChatResponse): Usage {
const input = chunk.prompt_eval_count ?? 0;
const outputTokens = chunk.eval_count ?? 0;
return {
input,
output: outputTokens,
cacheRead: 0,
cacheWrite: 0,
totalTokens: input + outputTokens,
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
};
}
function extractMetrics(chunk: OllamaChatResponse): InferenceMetrics | undefined {
if (!chunk.eval_duration && !chunk.total_duration) return undefined;
const evalCount = chunk.eval_count ?? 0;
const evalDurationNs = chunk.eval_duration ?? 0;
const evalDurationMs = evalDurationNs / 1e6;
const tokensPerSecond = evalDurationNs > 0 ? evalCount / (evalDurationNs / 1e9) : 0;
return {
tokensPerSecond,
totalDurationMs: (chunk.total_duration ?? 0) / 1e6,
evalDurationMs,
promptEvalDurationMs: (chunk.prompt_eval_duration ?? 0) / 1e6,
};
}
// ─── Stream lifecycle helpers ───────────────────────────────────────────────
// Replicated from openai-shared.ts (not exported from @gsd/pi-ai)
function buildInitialOutput(model: Model<Api>): AssistantMessage {
return {
role: "assistant",
content: [],
api: model.api as Api,
provider: model.provider,
model: model.id,
usage: {
input: 0,
output: 0,
cacheRead: 0,
cacheWrite: 0,
totalTokens: 0,
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
},
stopReason: "stop",
timestamp: Date.now(),
};
}
function assertStreamSuccess(output: AssistantMessage, signal?: AbortSignal): void {
if (signal?.aborted) {
throw new Error("Request was aborted");
}
if (output.stopReason === "aborted" || output.stopReason === "error") {
throw new Error("An unknown error occurred");
}
}
function finalizeStream(stream: AssistantMessageEventStream, output: AssistantMessage): void {
stream.push({
type: "done",
reason: output.stopReason as Extract<StopReason, "stop" | "length" | "toolUse" | "pauseTurn">,
message: output,
});
stream.end();
}
function handleStreamError(
stream: AssistantMessageEventStream,
output: AssistantMessage,
error: unknown,
signal?: AbortSignal,
): void {
for (const block of output.content) delete (block as { index?: number }).index;
output.stopReason = signal?.aborted ? "aborted" : "error";
output.errorMessage = error instanceof Error ? error.message : JSON.stringify(error);
stream.push({ type: "error", reason: output.stopReason, error: output });
stream.end();
}


@@ -8,12 +8,15 @@
*/
import type {
OllamaChatRequest,
OllamaChatResponse,
OllamaPsResponse,
OllamaPullProgress,
OllamaShowResponse,
OllamaTagsResponse,
OllamaVersionResponse,
} from "./types.js";
import { parseNDJsonStream } from "./ndjson-stream.js";
const DEFAULT_HOST = "http://localhost:11434";
const PROBE_TIMEOUT_MS = 1500;
@@ -130,39 +133,36 @@ export async function pullModel(
throw new Error("Ollama /api/pull returned no body");
}
const reader = response.body.getReader();
const decoder = new TextDecoder();
let buffer = "";
for await (const progress of parseNDJsonStream<OllamaPullProgress>(response.body, signal)) {
onProgress?.(progress);
}
}
while (true) {
const { done, value } = await reader.read();
if (done) break;
/**
* Stream a chat completion via /api/chat.
* Returns an async generator yielding each NDJSON response chunk.
*/
export async function* chat(
request: OllamaChatRequest,
signal?: AbortSignal,
): AsyncGenerator<OllamaChatResponse> {
const response = await fetch(`${getOllamaHost()}/api/chat`, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify(request),
signal,
});
buffer += decoder.decode(value, { stream: true });
const lines = buffer.split("\n");
buffer = lines.pop() ?? "";
for (const line of lines) {
const trimmed = line.trim();
if (!trimmed) continue;
try {
const progress = JSON.parse(trimmed) as OllamaPullProgress;
onProgress?.(progress);
} catch {
// Skip malformed lines
}
}
if (!response.ok) {
const text = await response.text();
throw new Error(`Ollama /api/chat returned ${response.status}: ${text}`);
}
// Process remaining buffer
if (buffer.trim()) {
try {
const progress = JSON.parse(buffer.trim()) as OllamaPullProgress;
onProgress?.(progress);
} catch {
// Ignore
}
if (!response.body) {
throw new Error("Ollama /api/chat returned no body");
}
yield* parseNDJsonStream<OllamaChatResponse>(response.body, signal, true);
}
/**


@@ -8,14 +8,14 @@
* Returns models in the format expected by pi.registerProvider().
*/
import { listModels, getOllamaHost } from "./ollama-client.js";
import { listModels } from "./ollama-client.js";
import {
estimateContextFromParams,
formatModelSize,
getModelCapabilities,
humanizeModelName,
} from "./model-capabilities.js";
import type { OllamaModelInfo } from "./types.js";
import type { OllamaChatOptions, OllamaModelInfo } from "./types.js";
export interface DiscoveredOllamaModel {
id: string;
@@ -29,6 +29,8 @@ export interface DiscoveredOllamaModel {
sizeBytes: number;
/** Parameter size string from Ollama (e.g. "7B") */
parameterSize: string;
/** Ollama-specific inference options for this model */
ollamaOptions?: OllamaChatOptions;
}
const ZERO_COST = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 };
@@ -64,6 +66,7 @@ function enrichModel(info: OllamaModelInfo): DiscoveredOllamaModel {
maxTokens,
sizeBytes: info.size,
parameterSize,
ollamaOptions: caps.ollamaOptions,
};
}
@@ -98,9 +101,3 @@ export function formatModelForDisplay(model: DiscoveredOllamaModel): string {
return parts.join(" ");
}
/**
* Build the OpenAI-compat base URL for Ollama.
*/
export function getOllamaOpenAIBaseUrl(): string {
return `${getOllamaHost()}/v1`;
}


@@ -31,6 +31,8 @@ export function registerOllamaTool(pi: ExtensionAPI): void {
promptGuidelines: [
"Use 'list' to see what models are available locally before trying to use one.",
"Use 'pull' to download a model that isn't available yet.",
"Use 'remove' to delete a local model that is no longer needed.",
"Use 'show' to get detailed info about a model (parameters, quantization, families).",
"Use 'status' to check if Ollama is running.",
"Use 'ps' to see which models are loaded in memory and VRAM usage.",
"Common models: llama3.1:8b, qwen2.5-coder:7b, deepseek-r1:8b, codestral:22b",
@@ -40,6 +42,8 @@ export function registerOllamaTool(pi: ExtensionAPI): void {
[
Type.Literal("list"),
Type.Literal("pull"),
Type.Literal("remove"),
Type.Literal("show"),
Type.Literal("status"),
Type.Literal("ps"),
],
@@ -164,6 +168,71 @@ export function registerOllamaTool(pi: ExtensionAPI): void {
};
}
case "remove": {
if (!model) {
return {
content: [{ type: "text", text: "Error: 'model' parameter is required for remove action." }],
isError: true,
details: { action, durationMs: Date.now() - startTime, error: "missing_model" } as OllamaToolDetails,
};
}
const running = await client.isRunning();
if (!running) {
return {
content: [{ type: "text", text: "Ollama is not running." }],
isError: true,
details: { action, model, durationMs: Date.now() - startTime, error: "not_running" } as OllamaToolDetails,
};
}
await client.deleteModel(model);
return {
content: [{ type: "text", text: `Successfully removed ${model}` }],
details: { action, model, durationMs: Date.now() - startTime } as OllamaToolDetails,
};
}
case "show": {
if (!model) {
return {
content: [{ type: "text", text: "Error: 'model' parameter is required for show action." }],
isError: true,
details: { action, durationMs: Date.now() - startTime, error: "missing_model" } as OllamaToolDetails,
};
}
const running = await client.isRunning();
if (!running) {
return {
content: [{ type: "text", text: "Ollama is not running." }],
isError: true,
details: { action, model, durationMs: Date.now() - startTime, error: "not_running" } as OllamaToolDetails,
};
}
const info = await client.showModel(model);
const details = info.details;
const infoLines = [
`Model: ${model}`,
`Family: ${details.family}`,
`Parameters: ${details.parameter_size}`,
`Quantization: ${details.quantization_level}`,
`Format: ${details.format}`,
];
if (details.families?.length) {
infoLines.push(`Families: ${details.families.join(", ")}`);
}
if (info.parameters) {
infoLines.push(`\nModelfile parameters:\n${info.parameters}`);
}
return {
content: [{ type: "text", text: infoLines.join("\n") }],
details: { action, model, durationMs: Date.now() - startTime } as OllamaToolDetails,
};
}
default:
return {
content: [{ type: "text", text: `Unknown action: ${action}` }],


@@ -1,28 +1 @@
// GSD2 — Tests for Ollama model discovery and enrichment
import { describe, it, afterEach } from "node:test";
import assert from "node:assert/strict";
import { getOllamaOpenAIBaseUrl } from "../ollama-discovery.js";
// ─── getOllamaOpenAIBaseUrl ─────────────────────────────────────────────────
describe("getOllamaOpenAIBaseUrl", () => {
const originalHost = process.env.OLLAMA_HOST;
afterEach(() => {
if (originalHost === undefined) {
delete process.env.OLLAMA_HOST;
} else {
process.env.OLLAMA_HOST = originalHost;
}
});
it("returns default OpenAI-compat URL", () => {
delete process.env.OLLAMA_HOST;
assert.equal(getOllamaOpenAIBaseUrl(), "http://localhost:11434/v1");
});
it("appends /v1 to custom OLLAMA_HOST", () => {
process.env.OLLAMA_HOST = "http://remote:9999";
assert.equal(getOllamaOpenAIBaseUrl(), "http://remote:9999/v1");
});
});


@@ -0,0 +1,116 @@
// GSD2 — Ollama Extension: Stateful <think> tag stream parser
/**
* Extracts <think>...</think> thinking blocks from a streaming text response.
* Handles the case where a tag boundary spans multiple chunks by buffering a
* possible partial tag (at most 7 characters, one short of "</think>") at
* chunk boundaries.
*
* Used for reasoning models like deepseek-r1 and qwq that embed thinking
* inline in their text output.
*/
export type ParsedChunk =
| { type: "thinking"; text: string }
| { type: "text"; text: string };
const OPEN_TAG = "<think>";
const CLOSE_TAG = "</think>";
const MAX_TAG_LEN = Math.max(OPEN_TAG.length, CLOSE_TAG.length);
export class ThinkingTagParser {
private buffer = "";
private inThinking = false;
/**
* Feed a chunk of text and get back parsed segments.
* May return zero or more segments depending on tag boundaries.
*/
push(chunk: string): ParsedChunk[] {
const results: ParsedChunk[] = [];
let input = this.buffer + chunk;
this.buffer = "";
while (input.length > 0) {
if (this.inThinking) {
const closeIdx = input.indexOf(CLOSE_TAG);
if (closeIdx !== -1) {
// Found close tag — emit thinking content before it
const thinking = input.slice(0, closeIdx);
if (thinking) results.push({ type: "thinking", text: thinking });
this.inThinking = false;
input = input.slice(closeIdx + CLOSE_TAG.length);
} else if (this.couldBePartialTag(input, CLOSE_TAG)) {
// Possible partial close tag at end — buffer only the matching tail
const tailLen = this.getPartialTagTailLength(input, CLOSE_TAG);
const safe = input.slice(0, input.length - tailLen);
if (safe) results.push({ type: "thinking", text: safe });
this.buffer = input.slice(-tailLen);
break;
} else {
// No close tag — emit all as thinking
results.push({ type: "thinking", text: input });
break;
}
} else {
const openIdx = input.indexOf(OPEN_TAG);
if (openIdx !== -1) {
// Found open tag — emit text before it
const text = input.slice(0, openIdx);
if (text) results.push({ type: "text", text });
this.inThinking = true;
input = input.slice(openIdx + OPEN_TAG.length);
} else if (this.couldBePartialTag(input, OPEN_TAG)) {
// Possible partial open tag at end — buffer only the matching tail
const tailLen = this.getPartialTagTailLength(input, OPEN_TAG);
const safe = input.slice(0, input.length - tailLen);
if (safe) results.push({ type: "text", text: safe });
this.buffer = input.slice(-tailLen);
break;
} else {
// No open tag — emit all as text
results.push({ type: "text", text: input });
break;
}
}
}
return results;
}
/**
* Flush any remaining buffered content. Call at end of stream.
*/
flush(): ParsedChunk[] {
if (!this.buffer) return [];
const result: ParsedChunk = {
type: this.inThinking ? "thinking" : "text",
text: this.buffer,
};
this.buffer = "";
return [result];
}
/**
* Check if the end of input could be the start of a partial tag.
* Only buffers when the tail of input matches a prefix of the tag.
*/
private couldBePartialTag(input: string, tag: string): boolean {
return this.getPartialTagTailLength(input, tag) > 0;
}
/**
* Get the length of the tail of input that matches a prefix of the tag.
* Returns 0 if no partial match.
*/
private getPartialTagTailLength(input: string, tag: string): number {
const maxCheck = Math.min(input.length, tag.length - 1);
for (let len = maxCheck; len >= 1; len--) {
const tail = input.slice(-len);
if (tag.startsWith(tail)) {
return len;
}
}
return 0;
}
}
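
// Usage sketch (illustrative; not part of this change): tag boundaries may
// split across chunks, and the parser buffers the possible partial tag.
const exampleParser = new ThinkingTagParser();
exampleParser.push("Hello <thi");            // [{ type: "text", text: "Hello " }]  ("<thi" is buffered)
exampleParser.push("nk>reasoning</think>!"); // [{ type: "thinking", text: "reasoning" }, { type: "text", text: "!" }]
exampleParser.flush();                       // []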


@@ -72,11 +72,31 @@ export interface OllamaVersionResponse {
// ─── /api/chat ──────────────────────────────────────────────────────────────
/** Per-model Ollama inference options carried via Model.providerOptions. */
export interface OllamaChatOptions {
/** How long to keep the model loaded after the last request. e.g. "5m", "0" to unload. */
keep_alive?: string;
/** Number of GPU layers to offload. -1 = all. */
num_gpu?: number;
/** Override the context window for Ollama requests. Only sent when explicitly set. */
num_ctx?: number;
/** Sampling: top-k most likely tokens. Default: 40 */
top_k?: number;
/** Sampling: nucleus sampling threshold. */
top_p?: number;
/** Sampling: penalize repeating tokens. Default: 1.1 */
repeat_penalty?: number;
/** Sampling: fixed seed for reproducibility. */
seed?: number;
}
export interface OllamaChatMessage {
role: "system" | "user" | "assistant" | "tool";
content: string;
images?: string[];
tool_calls?: OllamaToolCall[];
/** Tool name — required for role: "tool" messages to correlate results with calls. */
name?: string;
}
export interface OllamaToolCall {
@@ -110,7 +130,10 @@ export interface OllamaChatRequest {
temperature?: number;
top_p?: number;
top_k?: number;
repeat_penalty?: number;
seed?: number;
stop?: string[];
num_gpu?: number;
};
keep_alive?: string;
}