Merge pull request #3545 from jeremymcs/feat/ollama-native-chat-provider
feat(ollama): native /api/chat provider with full option exposure
Commit: a6b7febc5e
13 changed files with 832 additions and 112 deletions
@@ -13,7 +13,8 @@ export type KnownApi =
   | "bedrock-converse-stream"
   | "google-generative-ai"
   | "google-gemini-cli"
-  | "google-vertex";
+  | "google-vertex"
+  | "ollama-chat";
 
 export type Api = KnownApi | (string & {});
|
@@ -212,9 +213,23 @@ export interface AssistantMessage {
   errorMessage?: string;
   /** Server-requested retry delay in milliseconds (from Retry-After or rate limit headers). */
   retryAfterMs?: number;
+  /** Provider inference performance metrics (e.g. tokens/sec from local models). */
+  inferenceMetrics?: InferenceMetrics;
   timestamp: number; // Unix timestamp in milliseconds
 }
 
+/** Inference performance metrics reported by providers that support it (e.g. Ollama). */
+export interface InferenceMetrics {
+  /** Tokens generated per second during eval phase. */
+  tokensPerSecond: number;
+  /** Wall-clock duration of the full request in milliseconds. */
+  totalDurationMs: number;
+  /** Duration of the eval (generation) phase in milliseconds. */
+  evalDurationMs: number;
+  /** Duration of the prompt eval phase in milliseconds. */
+  promptEvalDurationMs: number;
+}
+
 export interface ToolResultMessage<TDetails = any> {
   role: "toolResult";
   toolCallId: string;

@@ -374,4 +389,6 @@ export interface Model<TApi extends Api> {
    * Read these fields instead of pattern-matching on model IDs or provider names.
    */
   capabilities?: ModelCapabilities;
+  /** Opaque provider-specific options. Cast to the appropriate type in the provider's stream handler. */
+  providerOptions?: Record<string, unknown>;
 }
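InferenceMetrics is optional on AssistantMessage, so consumers have to guard for its absence. A minimal sketch of surfacing the numbers in a status line (formatMetrics is a hypothetical helper, not part of this commit):

import type { AssistantMessage } from "@gsd/pi-ai";

// Hypothetical display helper: "42.3 tok/s in 3.1s", or undefined when the
// provider reported no metrics (e.g. any non-Ollama backend).
function formatMetrics(msg: AssistantMessage): string | undefined {
  const m = msg.inferenceMetrics;
  if (!m) return undefined;
  return `${m.tokensPerSecond.toFixed(1)} tok/s in ${(m.totalDurationMs / 1000).toFixed(1)}s`;
}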
@@ -1341,6 +1341,8 @@ export interface ProviderModelConfig {
   headers?: Record<string, string>;
   /** OpenAI compatibility settings. */
   compat?: Model<Api>["compat"];
+  /** Opaque provider-specific options (e.g. Ollama keep_alive, num_gpu). */
+  providerOptions?: Record<string, unknown>;
 }
 
 /** Extension factory function type. Supports both sync and async initialization. */
@@ -742,6 +742,7 @@ export class ModelRegistry {
         maxTokens: modelDef.maxTokens,
         headers,
         compat: modelDef.compat,
+        providerOptions: modelDef.providerOptions,
       } as Model<Api>);
     }
@@ -917,5 +918,6 @@ export interface ProviderConfigInput {
     maxTokens: number;
     headers?: Record<string, string>;
     compat?: Model<Api>["compat"];
+    providerOptions?: Record<string, unknown>;
   }>;
 }
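providerOptions is deliberately opaque at this layer; only the Ollama stream handler interprets it. A sketch of what a per-model entry might carry, with field names taken from the OllamaChatOptions interface later in this diff and illustrative values:

const providerOptions: Record<string, unknown> = {
  num_ctx: 32768,    // explicit context window; only sent when set
  keep_alive: "10m", // keep the model resident after the last request
  num_gpu: -1,       // offload all layers to GPU
};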
@@ -17,19 +17,10 @@
  */
 
 import { importExtensionModule, type ExtensionAPI } from "@gsd/pi-coding-agent";
-import type { OpenAICompletionsCompat } from "@gsd/pi-ai";
-import { discoverModels, getOllamaOpenAIBaseUrl } from "./ollama-discovery.js";
+import * as client from "./ollama-client.js";
+import { discoverModels } from "./ollama-discovery.js";
 import { registerOllamaCommands } from "./ollama-commands.js";
-
-/** Default compat settings for Ollama models via OpenAI-compat endpoint */
-const OLLAMA_COMPAT: OpenAICompletionsCompat = {
-  supportsDeveloperRole: false,
-  supportsReasoningEffort: false,
-  supportsUsageInStreaming: false,
-  maxTokensField: "max_tokens",
-  supportsStore: false,
-};
+import { streamOllamaChat } from "./ollama-chat-provider.js";
 
 let toolsPromise: Promise<void> | null = null;
@@ -68,12 +59,13 @@ async function probeAndRegister(pi: ExtensionAPI): Promise<boolean> {
   const models = await discoverModels();
   if (models.length === 0) return true; // Running but no models pulled
 
-  const baseUrl = getOllamaOpenAIBaseUrl();
+  const baseUrl = client.getOllamaHost();
 
   pi.registerProvider("ollama", {
     authMode: "none",
     baseUrl,
-    api: "openai-completions",
+    api: "ollama-chat",
+    streamSimple: streamOllamaChat,
     isReady: () => true,
     models: models.map((m) => ({
       id: m.id,
@@ -83,7 +75,7 @@ async function probeAndRegister(pi: ExtensionAPI): Promise<boolean> {
       cost: m.cost,
       contextWindow: m.contextWindow,
       maxTokens: m.maxTokens,
-      compat: OLLAMA_COMPAT,
+      providerOptions: (m.ollamaOptions ?? {}) as Record<string, unknown>,
     })),
   });
@@ -8,11 +8,15 @@
  * Fallback: estimate from parameter count if model isn't in the table.
  */
 
+import type { OllamaChatOptions } from "./types.js";
+
 export interface ModelCapability {
   contextWindow?: number;
   maxTokens?: number;
   input?: ("text" | "image")[];
   reasoning?: boolean;
+  /** Ollama-specific default inference options for this model family. */
+  ollamaOptions?: OllamaChatOptions;
 }
 
 /**
@@ -20,58 +24,61 @@ export interface ModelCapability {
  * Keys are matched as prefixes against the model name (before the colon/tag).
  * More specific entries should appear first.
  */
+// Note: ollamaOptions.num_ctx is set for known model families where the context
+// window is authoritative. For unknown/estimated models, num_ctx is NOT sent
+// to avoid OOM risk — Ollama uses its own safe default instead.
 const KNOWN_MODELS: Array<[pattern: string, caps: ModelCapability]> = [
   // ─── Reasoning models ───────────────────────────────────────────────
-  ["deepseek-r1", { contextWindow: 131072, reasoning: true }],
-  ["qwq", { contextWindow: 131072, reasoning: true }],
+  ["deepseek-r1", { contextWindow: 131072, reasoning: true, ollamaOptions: { num_ctx: 131072 } }],
+  ["qwq", { contextWindow: 131072, reasoning: true, ollamaOptions: { num_ctx: 131072 } }],
 
   // ─── Vision models ──────────────────────────────────────────────────
-  ["llava", { contextWindow: 4096, input: ["text", "image"] }],
-  ["bakllava", { contextWindow: 4096, input: ["text", "image"] }],
-  ["moondream", { contextWindow: 8192, input: ["text", "image"] }],
-  ["llama3.2-vision", { contextWindow: 131072, input: ["text", "image"] }],
-  ["minicpm-v", { contextWindow: 4096, input: ["text", "image"] }],
+  ["llava", { contextWindow: 4096, input: ["text", "image"], ollamaOptions: { num_ctx: 4096 } }],
+  ["bakllava", { contextWindow: 4096, input: ["text", "image"], ollamaOptions: { num_ctx: 4096 } }],
+  ["moondream", { contextWindow: 8192, input: ["text", "image"], ollamaOptions: { num_ctx: 8192 } }],
+  ["llama3.2-vision", { contextWindow: 131072, input: ["text", "image"], ollamaOptions: { num_ctx: 131072 } }],
+  ["minicpm-v", { contextWindow: 4096, input: ["text", "image"], ollamaOptions: { num_ctx: 4096 } }],
 
   // ─── Code models ────────────────────────────────────────────────────
-  ["codestral", { contextWindow: 262144, maxTokens: 32768 }],
-  ["qwen2.5-coder", { contextWindow: 131072, maxTokens: 32768 }],
-  ["deepseek-coder-v2", { contextWindow: 131072, maxTokens: 16384 }],
-  ["starcoder2", { contextWindow: 16384, maxTokens: 8192 }],
-  ["codegemma", { contextWindow: 8192, maxTokens: 8192 }],
-  ["codellama", { contextWindow: 16384, maxTokens: 8192 }],
-  ["devstral", { contextWindow: 131072, maxTokens: 32768 }],
+  ["codestral", { contextWindow: 262144, maxTokens: 32768, ollamaOptions: { num_ctx: 262144 } }],
+  ["qwen2.5-coder", { contextWindow: 131072, maxTokens: 32768, ollamaOptions: { num_ctx: 131072 } }],
+  ["deepseek-coder-v2", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
+  ["starcoder2", { contextWindow: 16384, maxTokens: 8192, ollamaOptions: { num_ctx: 16384 } }],
+  ["codegemma", { contextWindow: 8192, maxTokens: 8192, ollamaOptions: { num_ctx: 8192 } }],
+  ["codellama", { contextWindow: 16384, maxTokens: 8192, ollamaOptions: { num_ctx: 16384 } }],
+  ["devstral", { contextWindow: 131072, maxTokens: 32768, ollamaOptions: { num_ctx: 131072 } }],
 
   // ─── Llama family ───────────────────────────────────────────────────
-  ["llama3.3", { contextWindow: 131072, maxTokens: 16384 }],
-  ["llama3.2", { contextWindow: 131072, maxTokens: 16384 }],
-  ["llama3.1", { contextWindow: 131072, maxTokens: 16384 }],
-  ["llama3", { contextWindow: 8192, maxTokens: 8192 }],
-  ["llama2", { contextWindow: 4096, maxTokens: 4096 }],
+  ["llama3.3", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
+  ["llama3.2", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
+  ["llama3.1", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
+  ["llama3", { contextWindow: 8192, maxTokens: 8192, ollamaOptions: { num_ctx: 8192 } }],
+  ["llama2", { contextWindow: 4096, maxTokens: 4096, ollamaOptions: { num_ctx: 4096 } }],
 
   // ─── Qwen family ────────────────────────────────────────────────────
-  ["qwen3", { contextWindow: 131072, maxTokens: 32768 }],
-  ["qwen2.5", { contextWindow: 131072, maxTokens: 32768 }],
-  ["qwen2", { contextWindow: 131072, maxTokens: 32768 }],
+  ["qwen3", { contextWindow: 131072, maxTokens: 32768, ollamaOptions: { num_ctx: 131072 } }],
+  ["qwen2.5", { contextWindow: 131072, maxTokens: 32768, ollamaOptions: { num_ctx: 131072 } }],
+  ["qwen2", { contextWindow: 131072, maxTokens: 32768, ollamaOptions: { num_ctx: 131072 } }],
 
   // ─── Gemma family ───────────────────────────────────────────────────
-  ["gemma3", { contextWindow: 131072, maxTokens: 16384 }],
-  ["gemma2", { contextWindow: 8192, maxTokens: 8192 }],
+  ["gemma3", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
+  ["gemma2", { contextWindow: 8192, maxTokens: 8192, ollamaOptions: { num_ctx: 8192 } }],
 
   // ─── Mistral family ─────────────────────────────────────────────────
-  ["mistral-large", { contextWindow: 131072, maxTokens: 16384 }],
-  ["mistral-small", { contextWindow: 131072, maxTokens: 16384 }],
-  ["mistral-nemo", { contextWindow: 131072, maxTokens: 16384 }],
-  ["mistral", { contextWindow: 32768, maxTokens: 8192 }],
-  ["mixtral", { contextWindow: 32768, maxTokens: 8192 }],
+  ["mistral-large", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
+  ["mistral-small", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
+  ["mistral-nemo", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
+  ["mistral", { contextWindow: 32768, maxTokens: 8192, ollamaOptions: { num_ctx: 32768 } }],
+  ["mixtral", { contextWindow: 32768, maxTokens: 8192, ollamaOptions: { num_ctx: 32768 } }],
 
   // ─── Phi family ─────────────────────────────────────────────────────
-  ["phi4", { contextWindow: 16384, maxTokens: 16384 }],
-  ["phi3.5", { contextWindow: 131072, maxTokens: 16384 }],
-  ["phi3", { contextWindow: 131072, maxTokens: 4096 }],
+  ["phi4", { contextWindow: 16384, maxTokens: 16384, ollamaOptions: { num_ctx: 16384 } }],
+  ["phi3.5", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
+  ["phi3", { contextWindow: 131072, maxTokens: 4096, ollamaOptions: { num_ctx: 131072 } }],
 
   // ─── Command R ──────────────────────────────────────────────────────
-  ["command-r-plus", { contextWindow: 131072, maxTokens: 16384 }],
-  ["command-r", { contextWindow: 131072, maxTokens: 16384 }],
+  ["command-r-plus", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
+  ["command-r", { contextWindow: 131072, maxTokens: 16384, ollamaOptions: { num_ctx: 131072 } }],
 ];
 
 /**
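The table is consulted by prefix match on the model name before the colon/tag, first entry wins, which is why more specific patterns (e.g. "qwen2.5-coder" before "qwen2.5") must come first. A sketch of that lookup, assuming a helper of this shape:

// Hypothetical lookup: "qwen2.5-coder:7b" → base "qwen2.5-coder" → first
// table entry whose pattern is a prefix of the base name.
function lookupCaps(modelName: string): ModelCapability | undefined {
  const base = modelName.split(":")[0];
  for (const [pattern, caps] of KNOWN_MODELS) {
    if (base.startsWith(pattern)) return caps;
  }
  return undefined; // caller falls back to parameter-count estimation
}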
src/resources/extensions/ollama/ndjson-stream.ts (new file, 63 lines)
@@ -0,0 +1,63 @@
// GSD2 — Ollama Extension: NDJSON streaming parser

/**
 * Parses a streaming NDJSON (newline-delimited JSON) response body into
 * typed objects. Used for Ollama's /api/chat and /api/pull endpoints.
 *
 * @param strict When true, malformed JSON lines throw instead of being skipped.
 * Use strict mode for inference streams where silent data loss is unacceptable.
 * Use permissive mode (default) for progress endpoints like /api/pull.
 */
export async function* parseNDJsonStream<T>(
  body: ReadableStream<Uint8Array>,
  signal?: AbortSignal,
  strict = false,
): AsyncGenerator<T> {
  const reader = body.getReader();
  const decoder = new TextDecoder();
  let buffer = "";

  try {
    while (true) {
      if (signal?.aborted) break;

      const { done, value } = await reader.read();
      if (done) break;

      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split("\n");
      buffer = lines.pop() ?? "";

      for (const line of lines) {
        const trimmed = line.trim();
        if (!trimmed) continue;
        try {
          yield JSON.parse(trimmed) as T;
        } catch (err) {
          if (strict) {
            throw new Error(
              `Malformed NDJSON line from Ollama: ${trimmed.slice(0, 200)}`,
            );
          }
          // Permissive mode: skip malformed lines
        }
      }
    }

    // Flush remaining buffer (skip if aborted)
    if (buffer.trim() && !signal?.aborted) {
      try {
        yield JSON.parse(buffer.trim()) as T;
      } catch (err) {
        if (strict) {
          throw new Error(
            `Malformed NDJSON line from Ollama: ${buffer.trim().slice(0, 200)}`,
          );
        }
      }
    }
  } finally {
    reader.releaseLock();
  }
}
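A usage sketch of the permissive mode (endpoint URL and chunk shape are illustrative, run as an ES module):

const res = await fetch("http://localhost:11434/api/pull", {
  method: "POST",
  body: JSON.stringify({ model: "llama3.1:8b" }),
});
if (res.body) {
  // Permissive mode (default): a malformed progress line is skipped, not thrown.
  for await (const progress of parseNDJsonStream<{ status: string }>(res.body)) {
    console.log(progress.status);
  }
}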
src/resources/extensions/ollama/ollama-chat-provider.ts (new file, 459 lines)
@@ -0,0 +1,459 @@
// GSD2 — Ollama Extension: Native /api/chat stream provider

/**
 * Implements the "ollama-chat" API provider, streaming responses directly
 * from Ollama's native /api/chat endpoint instead of the OpenAI compatibility
 * shim. This exposes Ollama-specific options (num_ctx, keep_alive, num_gpu,
 * sampling parameters) and surfaces inference performance metrics.
 */

import {
  type Api,
  type AssistantMessage,
  type AssistantMessageEvent,
  type AssistantMessageEventStream,
  type Context,
  type ImageContent,
  type InferenceMetrics,
  type Message,
  type Model,
  type SimpleStreamOptions,
  type StopReason,
  type TextContent,
  type ThinkingContent,
  type Tool,
  type ToolCall,
  type Usage,
  EventStream,
} from "@gsd/pi-ai";
import { chat } from "./ollama-client.js";
import type {
  OllamaChatMessage,
  OllamaChatOptions,
  OllamaChatRequest,
  OllamaChatResponse,
  OllamaTool,
  OllamaToolCall,
} from "./types.js";
import { ThinkingTagParser, type ParsedChunk } from "./thinking-parser.js";

/** Create an AssistantMessageEventStream using the base EventStream class. */
function createStream(): AssistantMessageEventStream {
  return new EventStream<AssistantMessageEvent, AssistantMessage>(
    (event) => event.type === "done" || event.type === "error",
    (event) => {
      if (event.type === "done") return event.message;
      if (event.type === "error") return event.error;
      throw new Error("Unexpected event type for final result");
    },
  ) as AssistantMessageEventStream;
}

// ─── Stream handler ─────────────────────────────────────────────────────────

export function streamOllamaChat(
  model: Model<Api>,
  context: Context,
  options?: SimpleStreamOptions,
): AssistantMessageEventStream {
  const stream = createStream();

  (async () => {
    const output = buildInitialOutput(model);

    try {
      const request = buildRequest(model, context, options);
      stream.push({ type: "start", partial: output });

      const useThinkingParser = model.reasoning;
      const thinkParser = useThinkingParser ? new ThinkingTagParser() : null;

      let contentIndex = -1;
      let currentBlockType: "text" | "thinking" | null = null;

      function startBlock(type: "text" | "thinking") {
        contentIndex++;
        currentBlockType = type;
        if (type === "text") {
          output.content.push({ type: "text", text: "" });
          stream.push({ type: "text_start", contentIndex, partial: output });
        } else {
          output.content.push({ type: "thinking", thinking: "" });
          stream.push({ type: "thinking_start", contentIndex, partial: output });
        }
      }

      function endBlock() {
        if (currentBlockType === null) return;
        if (currentBlockType === "text") {
          const block = output.content[contentIndex] as TextContent;
          stream.push({ type: "text_end", contentIndex, content: block.text, partial: output });
        } else {
          const block = output.content[contentIndex] as ThinkingContent;
          stream.push({ type: "thinking_end", contentIndex, content: block.thinking, partial: output });
        }
        currentBlockType = null;
      }

      function emitDelta(type: "text" | "thinking", text: string) {
        if (!text) return;
        if (currentBlockType !== type) {
          endBlock();
          startBlock(type);
        }
        if (type === "text") {
          (output.content[contentIndex] as TextContent).text += text;
          stream.push({ type: "text_delta", contentIndex, delta: text, partial: output });
        } else {
          (output.content[contentIndex] as ThinkingContent).thinking += text;
          stream.push({ type: "thinking_delta", contentIndex, delta: text, partial: output });
        }
      }

      function processChunks(chunks: ParsedChunk[]) {
        for (const chunk of chunks) {
          emitDelta(chunk.type, chunk.text);
        }
      }

      function processToolCalls(toolCalls: OllamaToolCall[]) {
        endBlock();
        for (const tc of toolCalls) {
          contentIndex++;
          const toolCall: ToolCall = {
            type: "toolCall",
            id: `ollama_tc_${contentIndex}`,
            name: tc.function.name,
            arguments: tc.function.arguments,
          };
          output.content.push(toolCall);
          stream.push({ type: "toolcall_start", contentIndex, partial: output });
          // Emit a delta with the serialized arguments (convention: start/delta/end)
          stream.push({
            type: "toolcall_delta",
            contentIndex,
            delta: JSON.stringify(tc.function.arguments),
            partial: output,
          });
          stream.push({
            type: "toolcall_end",
            contentIndex,
            toolCall,
            partial: output,
          });
        }
        output.stopReason = "toolUse";
      }

      for await (const chunk of chat(request, options?.signal)) {
        // Handle text content — process independently of tool_calls
        // (a chunk may contain both content and tool_calls)
        const content = chunk.message?.content ?? "";
        if (content && !chunk.done) {
          if (thinkParser) {
            processChunks(thinkParser.push(content));
          } else {
            emitDelta("text", content);
          }
        }

        // Handle tool calls (Ollama sends them complete, may be on done:true chunk)
        if (chunk.message?.tool_calls?.length) {
          processToolCalls(chunk.message.tool_calls);
        }

        if (chunk.done) {
          // Final chunk — extract metrics and usage
          if (thinkParser) processChunks(thinkParser.flush());
          endBlock();

          output.usage = buildUsage(chunk);
          output.inferenceMetrics = extractMetrics(chunk);
          // Preserve "toolUse" if tool calls were processed
          if (output.stopReason !== "toolUse") {
            output.stopReason = mapStopReason(chunk.done_reason);
          }
          break;
        }
      }

      assertStreamSuccess(output, options?.signal);
      finalizeStream(stream, output);
    } catch (error) {
      handleStreamError(stream, output, error, options?.signal);
    }
  })();

  return stream;
}

// ─── Request building ───────────────────────────────────────────────────────

function buildRequest(
  model: Model<Api>,
  context: Context,
  options?: SimpleStreamOptions,
): OllamaChatRequest {
  const ollamaOpts = (model.providerOptions ?? {}) as OllamaChatOptions;

  const request: OllamaChatRequest = {
    model: model.id,
    messages: convertMessages(context),
    stream: true,
  };

  // Build options block with all Ollama-specific parameters
  const reqOptions: NonNullable<OllamaChatRequest["options"]> = {};

  // Context window — only sent when explicitly configured via providerOptions.
  // Sending inferred/estimated values risks OOM on constrained hosts.
  // Users can set num_ctx per-model in models.json ollamaOptions or the
  // capability table can provide it for known model families.
  if (ollamaOpts.num_ctx !== undefined && ollamaOpts.num_ctx > 0) {
    reqOptions.num_ctx = ollamaOpts.num_ctx;
  }

  // Max output tokens
  const maxTokens = options?.maxTokens ?? model.maxTokens;
  if (maxTokens > 0) {
    reqOptions.num_predict = maxTokens;
  }

  // Temperature
  if (options?.temperature !== undefined) {
    reqOptions.temperature = options.temperature;
  }

  // Per-model sampling options from providerOptions
  if (ollamaOpts.top_p !== undefined) reqOptions.top_p = ollamaOpts.top_p;
  if (ollamaOpts.top_k !== undefined) reqOptions.top_k = ollamaOpts.top_k;
  if (ollamaOpts.repeat_penalty !== undefined) reqOptions.repeat_penalty = ollamaOpts.repeat_penalty;
  if (ollamaOpts.seed !== undefined) reqOptions.seed = ollamaOpts.seed;
  if (ollamaOpts.num_gpu !== undefined) reqOptions.num_gpu = ollamaOpts.num_gpu;

  if (Object.keys(reqOptions).length > 0) {
    request.options = reqOptions;
  }

  // Keep alive
  if (ollamaOpts.keep_alive !== undefined) {
    request.keep_alive = ollamaOpts.keep_alive;
  }

  // Tools
  if (context.tools?.length) {
    request.tools = convertTools(context.tools);
  }

  return request;
}

// ─── Message conversion ─────────────────────────────────────────────────────

function convertMessages(context: Context): OllamaChatMessage[] {
  const messages: OllamaChatMessage[] = [];

  // System prompt
  if (context.systemPrompt) {
    messages.push({ role: "system", content: context.systemPrompt });
  }

  for (const msg of context.messages) {
    switch (msg.role) {
      case "user":
        messages.push(convertUserMessage(msg));
        break;
      case "assistant":
        messages.push(convertAssistantMessage(msg));
        break;
      case "toolResult":
        messages.push({
          role: "tool",
          content: msg.content
            .filter((c): c is TextContent => c.type === "text")
            .map((c) => c.text)
            .join("\n"),
          name: msg.toolName,
        });
        break;
    }
  }

  return messages;
}

function convertUserMessage(msg: Message & { role: "user" }): OllamaChatMessage {
  if (typeof msg.content === "string") {
    return { role: "user", content: msg.content };
  }

  const textParts: string[] = [];
  const images: string[] = [];

  for (const part of msg.content) {
    if (part.type === "text") {
      textParts.push(part.text);
    } else if (part.type === "image") {
      // Strip data URI prefix if present
      let data = (part as ImageContent).data;
      const commaIdx = data.indexOf(",");
      if (commaIdx !== -1 && data.startsWith("data:")) {
        data = data.slice(commaIdx + 1);
      }
      images.push(data);
    }
  }

  const result: OllamaChatMessage = {
    role: "user",
    content: textParts.join("\n"),
  };
  if (images.length > 0) {
    result.images = images;
  }
  return result;
}

function convertAssistantMessage(msg: Message & { role: "assistant" }): OllamaChatMessage {
  let content = "";
  const toolCalls: OllamaChatMessage["tool_calls"] = [];

  for (const block of msg.content) {
    if (block.type === "thinking") {
      // Serialize thinking back inline for round-trip with Ollama
      content += `<think>${(block as ThinkingContent).thinking}</think>`;
    } else if (block.type === "text") {
      content += (block as TextContent).text;
    } else if (block.type === "toolCall") {
      const tc = block as ToolCall;
      toolCalls.push({
        function: {
          name: tc.name,
          arguments: tc.arguments,
        },
      });
    }
  }

  const result: OllamaChatMessage = { role: "assistant", content };
  if (toolCalls.length > 0) {
    result.tool_calls = toolCalls;
  }
  return result;
}

// ─── Tool conversion ────────────────────────────────────────────────────────

function convertTools(tools: Tool[]): OllamaTool[] {
  return tools.map((tool) => {
    const params = tool.parameters as Record<string, unknown>;
    return {
      type: "function" as const,
      function: {
        name: tool.name,
        description: tool.description,
        parameters: {
          type: "object" as const,
          required: params.required as string[] | undefined,
          properties: (params.properties as Record<string, unknown>) ?? {},
        },
      },
    };
  });
}

// ─── Response mapping ───────────────────────────────────────────────────────

function mapStopReason(doneReason?: string): StopReason {
  switch (doneReason) {
    case "stop":
      return "stop";
    case "length":
      return "length";
    default:
      return "stop";
  }
}

function buildUsage(chunk: OllamaChatResponse): Usage {
  const input = chunk.prompt_eval_count ?? 0;
  const outputTokens = chunk.eval_count ?? 0;
  return {
    input,
    output: outputTokens,
    cacheRead: 0,
    cacheWrite: 0,
    totalTokens: input + outputTokens,
    cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
  };
}

function extractMetrics(chunk: OllamaChatResponse): InferenceMetrics | undefined {
  if (!chunk.eval_duration && !chunk.total_duration) return undefined;

  const evalCount = chunk.eval_count ?? 0;
  const evalDurationNs = chunk.eval_duration ?? 0;
  const evalDurationMs = evalDurationNs / 1e6;
  const tokensPerSecond = evalDurationNs > 0 ? evalCount / (evalDurationNs / 1e9) : 0;

  return {
    tokensPerSecond,
    totalDurationMs: (chunk.total_duration ?? 0) / 1e6,
    evalDurationMs,
    promptEvalDurationMs: (chunk.prompt_eval_duration ?? 0) / 1e6,
  };
}

// ─── Stream lifecycle helpers ───────────────────────────────────────────────
// Replicated from openai-shared.ts (not exported from @gsd/pi-ai)

function buildInitialOutput(model: Model<Api>): AssistantMessage {
  return {
    role: "assistant",
    content: [],
    api: model.api as Api,
    provider: model.provider,
    model: model.id,
    usage: {
      input: 0,
      output: 0,
      cacheRead: 0,
      cacheWrite: 0,
      totalTokens: 0,
      cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
    },
    stopReason: "stop",
    timestamp: Date.now(),
  };
}

function assertStreamSuccess(output: AssistantMessage, signal?: AbortSignal): void {
  if (signal?.aborted) {
    throw new Error("Request was aborted");
  }
  if (output.stopReason === "aborted" || output.stopReason === "error") {
    throw new Error("An unknown error occurred");
  }
}

function finalizeStream(stream: AssistantMessageEventStream, output: AssistantMessage): void {
  stream.push({
    type: "done",
    reason: output.stopReason as Extract<StopReason, "stop" | "length" | "toolUse" | "pauseTurn">,
    message: output,
  });
  stream.end();
}

function handleStreamError(
  stream: AssistantMessageEventStream,
  output: AssistantMessage,
  error: unknown,
  signal?: AbortSignal,
): void {
  for (const block of output.content) delete (block as { index?: number }).index;
  output.stopReason = signal?.aborted ? "aborted" : "error";
  output.errorMessage = error instanceof Error ? error.message : JSON.stringify(error);
  stream.push({ type: "error", reason: output.stopReason, error: output });
  stream.end();
}
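Ollama reports durations in nanoseconds; extractMetrics above converts them to the milliseconds and tokens-per-second fields that InferenceMetrics expects. A worked check with round numbers:

// Suppose the final chunk reports eval_count = 120 tokens over
// eval_duration = 4_000_000_000 ns (4 seconds):
const evalCount = 120;
const evalDurationNs = 4_000_000_000;
console.assert(evalCount / (evalDurationNs / 1e9) === 30); // tokensPerSecond
console.assert(evalDurationNs / 1e6 === 4000);             // evalDurationMs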
@@ -8,12 +8,15 @@
  */
 
 import type {
+  OllamaChatRequest,
+  OllamaChatResponse,
   OllamaPsResponse,
   OllamaPullProgress,
   OllamaShowResponse,
   OllamaTagsResponse,
   OllamaVersionResponse,
 } from "./types.js";
+import { parseNDJsonStream } from "./ndjson-stream.js";
 
 const DEFAULT_HOST = "http://localhost:11434";
 const PROBE_TIMEOUT_MS = 1500;
@@ -130,39 +133,36 @@ export async function pullModel(
     throw new Error("Ollama /api/pull returned no body");
   }
 
-  const reader = response.body.getReader();
-  const decoder = new TextDecoder();
-  let buffer = "";
-
-  while (true) {
-    const { done, value } = await reader.read();
-    if (done) break;
-
-    buffer += decoder.decode(value, { stream: true });
-    const lines = buffer.split("\n");
-    buffer = lines.pop() ?? "";
-
-    for (const line of lines) {
-      const trimmed = line.trim();
-      if (!trimmed) continue;
-      try {
-        const progress = JSON.parse(trimmed) as OllamaPullProgress;
-        onProgress?.(progress);
-      } catch {
-        // Skip malformed lines
-      }
-    }
-  }
-
-  // Process remaining buffer
-  if (buffer.trim()) {
-    try {
-      const progress = JSON.parse(buffer.trim()) as OllamaPullProgress;
-      onProgress?.(progress);
-    } catch {
-      // Ignore
-    }
-  }
+  for await (const progress of parseNDJsonStream<OllamaPullProgress>(response.body, signal)) {
+    onProgress?.(progress);
+  }
 }
 
+/**
+ * Stream a chat completion via /api/chat.
+ * Returns an async generator yielding each NDJSON response chunk.
+ */
+export async function* chat(
+  request: OllamaChatRequest,
+  signal?: AbortSignal,
+): AsyncGenerator<OllamaChatResponse> {
+  const response = await fetch(`${getOllamaHost()}/api/chat`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify(request),
+    signal,
+  });
+
+  if (!response.ok) {
+    const text = await response.text();
+    throw new Error(`Ollama /api/chat returned ${response.status}: ${text}`);
+  }
+
+  if (!response.body) {
+    throw new Error("Ollama /api/chat returned no body");
+  }
+
+  yield* parseNDJsonStream<OllamaChatResponse>(response.body, signal, true);
+}
 
 /**
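A sketch of driving chat() directly, using the request/response types from this diff (model name illustrative, run as an ES module):

import { chat } from "./ollama-client.js";
import type { OllamaChatRequest } from "./types.js";

const request: OllamaChatRequest = {
  model: "llama3.1:8b",
  messages: [{ role: "user", content: "Say hello." }],
  stream: true,
};

for await (const chunk of chat(request)) {
  if (chunk.message?.content) process.stdout.write(chunk.message.content);
  if (chunk.done) break; // final chunk carries the eval counts and durations
}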
@@ -8,14 +8,14 @@
  * Returns models in the format expected by pi.registerProvider().
  */
 
-import { listModels, getOllamaHost } from "./ollama-client.js";
+import { listModels } from "./ollama-client.js";
 import {
   estimateContextFromParams,
   formatModelSize,
   getModelCapabilities,
   humanizeModelName,
 } from "./model-capabilities.js";
-import type { OllamaModelInfo } from "./types.js";
+import type { OllamaChatOptions, OllamaModelInfo } from "./types.js";
 
 export interface DiscoveredOllamaModel {
   id: string;
@@ -29,6 +29,8 @@ export interface DiscoveredOllamaModel {
   sizeBytes: number;
   /** Parameter size string from Ollama (e.g. "7B") */
   parameterSize: string;
+  /** Ollama-specific inference options for this model */
+  ollamaOptions?: OllamaChatOptions;
 }
 
 const ZERO_COST = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 };
@@ -64,6 +66,7 @@ function enrichModel(info: OllamaModelInfo): DiscoveredOllamaModel {
     maxTokens,
     sizeBytes: info.size,
     parameterSize,
+    ollamaOptions: caps.ollamaOptions,
   };
 }
@@ -98,9 +101,3 @@ export function formatModelForDisplay(model: DiscoveredOllamaModel): string {
   return parts.join(" ");
 }
-
-/**
- * Build the OpenAI-compat base URL for Ollama.
- */
-export function getOllamaOpenAIBaseUrl(): string {
-  return `${getOllamaHost()}/v1`;
-}
@@ -31,6 +31,8 @@ export function registerOllamaTool(pi: ExtensionAPI): void {
     promptGuidelines: [
       "Use 'list' to see what models are available locally before trying to use one.",
       "Use 'pull' to download a model that isn't available yet.",
+      "Use 'remove' to delete a local model that is no longer needed.",
+      "Use 'show' to get detailed info about a model (parameters, quantization, families).",
       "Use 'status' to check if Ollama is running.",
       "Use 'ps' to see which models are loaded in memory and VRAM usage.",
       "Common models: llama3.1:8b, qwen2.5-coder:7b, deepseek-r1:8b, codestral:22b",
@@ -40,6 +42,8 @@ export function registerOllamaTool(pi: ExtensionAPI): void {
     [
       Type.Literal("list"),
       Type.Literal("pull"),
+      Type.Literal("remove"),
+      Type.Literal("show"),
       Type.Literal("status"),
       Type.Literal("ps"),
     ],
@@ -164,6 +168,71 @@ export function registerOllamaTool(pi: ExtensionAPI): void {
         };
       }
 
+      case "remove": {
+        if (!model) {
+          return {
+            content: [{ type: "text", text: "Error: 'model' parameter is required for remove action." }],
+            isError: true,
+            details: { action, durationMs: Date.now() - startTime, error: "missing_model" } as OllamaToolDetails,
+          };
+        }
+
+        const running = await client.isRunning();
+        if (!running) {
+          return {
+            content: [{ type: "text", text: "Ollama is not running." }],
+            isError: true,
+            details: { action, model, durationMs: Date.now() - startTime, error: "not_running" } as OllamaToolDetails,
+          };
+        }
+
+        await client.deleteModel(model);
+        return {
+          content: [{ type: "text", text: `Successfully removed ${model}` }],
+          details: { action, model, durationMs: Date.now() - startTime } as OllamaToolDetails,
+        };
+      }
+
+      case "show": {
+        if (!model) {
+          return {
+            content: [{ type: "text", text: "Error: 'model' parameter is required for show action." }],
+            isError: true,
+            details: { action, durationMs: Date.now() - startTime, error: "missing_model" } as OllamaToolDetails,
+          };
+        }
+
+        const running = await client.isRunning();
+        if (!running) {
+          return {
+            content: [{ type: "text", text: "Ollama is not running." }],
+            isError: true,
+            details: { action, model, durationMs: Date.now() - startTime, error: "not_running" } as OllamaToolDetails,
+          };
+        }
+
+        const info = await client.showModel(model);
+        const details = info.details;
+        const infoLines = [
+          `Model: ${model}`,
+          `Family: ${details.family}`,
+          `Parameters: ${details.parameter_size}`,
+          `Quantization: ${details.quantization_level}`,
+          `Format: ${details.format}`,
+        ];
+        if (details.families?.length) {
+          infoLines.push(`Families: ${details.families.join(", ")}`);
+        }
+        if (info.parameters) {
+          infoLines.push(`\nModelfile parameters:\n${info.parameters}`);
+        }
+
+        return {
+          content: [{ type: "text", text: infoLines.join("\n") }],
+          details: { action, model, durationMs: Date.now() - startTime } as OllamaToolDetails,
+        };
+      }
+
       default:
         return {
           content: [{ type: "text", text: `Unknown action: ${action}` }],
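Example invocations the schema above accepts (payloads illustrative):

// { action: "show", model: "qwen2.5-coder:7b" } → family, parameters, quantization
// { action: "remove", model: "llama2:7b" }      → deletes the local model
// { action: "remove" }                          → isError with "missing_model"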
@@ -1,28 +1 @@
 // GSD2 — Tests for Ollama model discovery and enrichment
-import { describe, it, afterEach } from "node:test";
-import assert from "node:assert/strict";
-import { getOllamaOpenAIBaseUrl } from "../ollama-discovery.js";
-
-// ─── getOllamaOpenAIBaseUrl ─────────────────────────────────────────────────
-
-describe("getOllamaOpenAIBaseUrl", () => {
-  const originalHost = process.env.OLLAMA_HOST;
-
-  afterEach(() => {
-    if (originalHost === undefined) {
-      delete process.env.OLLAMA_HOST;
-    } else {
-      process.env.OLLAMA_HOST = originalHost;
-    }
-  });
-
-  it("returns default OpenAI-compat URL", () => {
-    delete process.env.OLLAMA_HOST;
-    assert.equal(getOllamaOpenAIBaseUrl(), "http://localhost:11434/v1");
-  });
-
-  it("appends /v1 to custom OLLAMA_HOST", () => {
-    process.env.OLLAMA_HOST = "http://remote:9999";
-    assert.equal(getOllamaOpenAIBaseUrl(), "http://remote:9999/v1");
-  });
-});
src/resources/extensions/ollama/thinking-parser.ts (new file, 116 lines)
@@ -0,0 +1,116 @@
// GSD2 — Ollama Extension: Stateful <think> tag stream parser

/**
 * Extracts <think>...</think> thinking blocks from a streaming text response.
 * Handles the case where tag boundaries span multiple chunks by buffering
 * up to 8 characters (length of "</think>") at chunk boundaries.
 *
 * Used for reasoning models like deepseek-r1 and qwq that embed thinking
 * inline in their text output.
 */

export type ParsedChunk =
  | { type: "thinking"; text: string }
  | { type: "text"; text: string };

const OPEN_TAG = "<think>";
const CLOSE_TAG = "</think>";
const MAX_TAG_LEN = Math.max(OPEN_TAG.length, CLOSE_TAG.length);

export class ThinkingTagParser {
  private buffer = "";
  private inThinking = false;

  /**
   * Feed a chunk of text and get back parsed segments.
   * May return zero or more segments depending on tag boundaries.
   */
  push(chunk: string): ParsedChunk[] {
    const results: ParsedChunk[] = [];
    let input = this.buffer + chunk;
    this.buffer = "";

    while (input.length > 0) {
      if (this.inThinking) {
        const closeIdx = input.indexOf(CLOSE_TAG);
        if (closeIdx !== -1) {
          // Found close tag — emit thinking content before it
          const thinking = input.slice(0, closeIdx);
          if (thinking) results.push({ type: "thinking", text: thinking });
          this.inThinking = false;
          input = input.slice(closeIdx + CLOSE_TAG.length);
        } else if (this.couldBePartialTag(input, CLOSE_TAG)) {
          // Possible partial close tag at end — buffer only the matching tail
          const tailLen = this.getPartialTagTailLength(input, CLOSE_TAG);
          const safe = input.slice(0, input.length - tailLen);
          if (safe) results.push({ type: "thinking", text: safe });
          this.buffer = input.slice(-tailLen);
          break;
        } else {
          // No close tag — emit all as thinking
          results.push({ type: "thinking", text: input });
          break;
        }
      } else {
        const openIdx = input.indexOf(OPEN_TAG);
        if (openIdx !== -1) {
          // Found open tag — emit text before it
          const text = input.slice(0, openIdx);
          if (text) results.push({ type: "text", text });
          this.inThinking = true;
          input = input.slice(openIdx + OPEN_TAG.length);
        } else if (this.couldBePartialTag(input, OPEN_TAG)) {
          // Possible partial open tag at end — buffer only the matching tail
          const tailLen = this.getPartialTagTailLength(input, OPEN_TAG);
          const safe = input.slice(0, input.length - tailLen);
          if (safe) results.push({ type: "text", text: safe });
          this.buffer = input.slice(-tailLen);
          break;
        } else {
          // No open tag — emit all as text
          results.push({ type: "text", text: input });
          break;
        }
      }
    }

    return results;
  }

  /**
   * Flush any remaining buffered content. Call at end of stream.
   */
  flush(): ParsedChunk[] {
    if (!this.buffer) return [];

    const result: ParsedChunk = {
      type: this.inThinking ? "thinking" : "text",
      text: this.buffer,
    };
    this.buffer = "";
    return [result];
  }

  /**
   * Check if the end of input could be the start of a partial tag.
   * Only buffers when the tail of input matches a prefix of the tag.
   */
  private couldBePartialTag(input: string, tag: string): boolean {
    return this.getPartialTagTailLength(input, tag) > 0;
  }

  /**
   * Get the length of the tail of input that matches a prefix of the tag.
   * Returns 0 if no partial match.
   */
  private getPartialTagTailLength(input: string, tag: string): number {
    const maxCheck = Math.min(input.length, tag.length - 1);
    for (let len = maxCheck; len >= 1; len--) {
      const tail = input.slice(-len);
      if (tag.startsWith(tail)) {
        return len;
      }
    }
    return 0;
  }
}
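A usage sketch showing an open tag split across chunks (chunk boundaries illustrative):

const parser = new ThinkingTagParser();
parser.push("Hello <thi");
// → [{ type: "text", text: "Hello " }]  ("<thi" stays buffered)
parser.push("nk>step one</think> done");
// → [{ type: "thinking", text: "step one" }, { type: "text", text: " done" }]
parser.flush();
// → []  (nothing left buffered)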
@@ -72,11 +72,31 @@ export interface OllamaVersionResponse {
 
 // ─── /api/chat ──────────────────────────────────────────────────────────────
 
+/** Per-model Ollama inference options carried via Model.providerOptions. */
+export interface OllamaChatOptions {
+  /** How long to keep the model loaded after the last request. e.g. "5m", "0" to unload. */
+  keep_alive?: string;
+  /** Number of GPU layers to offload. -1 = all. */
+  num_gpu?: number;
+  /** Override the context window for Ollama requests. Only sent when explicitly set. */
+  num_ctx?: number;
+  /** Sampling: top-k most likely tokens. Default: 40 */
+  top_k?: number;
+  /** Sampling: nucleus sampling threshold. */
+  top_p?: number;
+  /** Sampling: penalize repeating tokens. Default: 1.1 */
+  repeat_penalty?: number;
+  /** Sampling: fixed seed for reproducibility. */
+  seed?: number;
+}
+
 export interface OllamaChatMessage {
   role: "system" | "user" | "assistant" | "tool";
   content: string;
   images?: string[];
   tool_calls?: OllamaToolCall[];
+  /** Tool name — required for role: "tool" messages to correlate results with calls. */
+  name?: string;
 }
 
 export interface OllamaToolCall {

@@ -110,7 +130,10 @@ export interface OllamaChatRequest {
     temperature?: number;
     top_p?: number;
+    top_k?: number;
+    repeat_penalty?: number;
+    seed?: number;
     stop?: string[];
     num_gpu?: number;
   };
   keep_alive?: string;
 }
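Putting the types together: a sketch of the request body buildRequest produces for a known model whose num_ctx comes from the capability table, assuming a single user turn and temperature 0.2 (values follow the qwen2.5-coder table entry above):

const body: OllamaChatRequest = {
  model: "qwen2.5-coder:7b",
  messages: [{ role: "user", content: "Write a haiku about NDJSON." }],
  stream: true,
  options: {
    num_ctx: 131072,    // from the capability table via providerOptions
    num_predict: 32768, // model.maxTokens
    temperature: 0.2,   // from SimpleStreamOptions
  },
};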