From 98fe3b605dfad266dc024f39993e3d91b0ddfa98 Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Sat, 2 May 2026 13:20:10 +0200 Subject: [PATCH] fix(gemini): route cli retry and quota through core --- .../pi-ai/src/providers/google-gemini-cli.ts | 151 +++++++++++++++--- .../extensions/sf-usage-bar/index.ts | 105 +++++------- 2 files changed, 170 insertions(+), 86 deletions(-) diff --git a/packages/pi-ai/src/providers/google-gemini-cli.ts b/packages/pi-ai/src/providers/google-gemini-cli.ts index f07ba91a0..c815227cc 100644 --- a/packages/pi-ai/src/providers/google-gemini-cli.ts +++ b/packages/pi-ai/src/providers/google-gemini-cli.ts @@ -8,8 +8,19 @@ * via setupUser(), and handles all the User-Agent / retry / 429 details. */ -import type { Content, GenerateContentParameters, GenerateContentResponse, ThinkingConfig } from "@google/genai"; -import { AuthType, CodeAssistServer, getOauthClient, makeFakeConfig, setupUser } from "@google/gemini-cli-core"; +import { + AuthType, + CodeAssistServer, + getOauthClient, + makeFakeConfig, + retryWithBackoff, + setupUser, +} from "@google/gemini-cli-core"; +import type { + Content, + GenerateContentParameters, + ThinkingConfig, +} from "@google/genai"; import { calculateCost } from "../models.js"; import type { Api, @@ -35,7 +46,12 @@ import { mapToolChoice, retainThoughtSignature, } from "./google-shared.js"; -import { buildBaseOptions, clampReasoning, isAutoReasoning, resolveReasoningLevel } from "./simple-options.js"; +import { + buildBaseOptions, + clampReasoning, + isAutoReasoning, + resolveReasoningLevel, +} from "./simple-options.js"; /** * Thinking level for Gemini 3 models. @@ -44,7 +60,12 @@ import { buildBaseOptions, clampReasoning, isAutoReasoning, resolveReasoningLeve * These are the wire format values for `ThinkingConfig.thinkingLevel` sent to cli-core's * `CodeAssistServer.generateContentStream()`. */ -export type GoogleThinkingLevel = "THINKING_LEVEL_UNSPECIFIED" | "MINIMAL" | "LOW" | "MEDIUM" | "HIGH"; +export type GoogleThinkingLevel = + | "THINKING_LEVEL_UNSPECIFIED" + | "MINIMAL" + | "LOW" + | "MEDIUM" + | "HIGH"; /** * Options for `streamGoogleGeminiCli()`. @@ -96,6 +117,34 @@ async function getCodeAssistServer(): Promise { return new CodeAssistServer(authClient, userData.projectId, { headers: {} }); } +function parseDurationMs(value: string): number | undefined { + const match = value.match(/(?:(\d+)h)?(?:(\d+)m)?(?:(\d+)s)?/i); + if (!match || !match[0]) return undefined; + const hours = Number(match[1] ?? 0); + const minutes = Number(match[2] ?? 0); + const seconds = Number(match[3] ?? 0); + const totalMs = ((hours * 60 + minutes) * 60 + seconds) * 1000; + return totalMs > 0 ? totalMs : undefined; +} + +function extractRetryAfterMs(error: unknown): number | undefined { + if (typeof error === "object" && error !== null && "retryDelayMs" in error) { + const retryDelayMs = (error as { retryDelayMs?: unknown }).retryDelayMs; + if ( + typeof retryDelayMs === "number" && + Number.isFinite(retryDelayMs) && + retryDelayMs > 0 + ) { + return retryDelayMs; + } + } + const message = + error instanceof Error ? error.message : JSON.stringify(error); + const resetMatch = message.match( + /(?:quota will reset|reset) after ([0-9hms]+)/i, + ); + return resetMatch?.[1] ? parseDurationMs(resetMatch[1]) : undefined; +} /** * Check if the model is a Gemini 3 Pro variant (gemini-3*-pro). @@ -131,7 +180,10 @@ function isGemini3Model(modelId: string): boolean { * runtime shapes are byte-identical). 
Returns a real-time stream emitting start, delta, end, and
 * error events that accumulate into an `AssistantMessage`.
 */
-export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGeminiCliOptions> = (
+export const streamGoogleGeminiCli: StreamFunction<
+  "google-gemini-cli",
+  GoogleGeminiCliOptions
+> = (
   model: Model<"google-gemini-cli">,
   context: Context,
   options?: GoogleGeminiCliOptions,
@@ -171,8 +223,13 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
     // so TypeScript sees two structurally-identical-but-distinct Content types.
     // The runtime shapes are byte-identical; the nominal split is a packaging
     // artefact.
-    // biome-ignore lint/suspicious/noExplicitAny: see above
-    const streamGen = await server.generateContentStream(req as any, promptId, "USER" as any);
+    const streamGen = await retryWithBackoff(
+      () => server.generateContentStream(req as any, promptId, "USER" as any),
+      {
+        authType: AuthType.LOGIN_WITH_GOOGLE,
+        signal: options?.signal,
+      },
+    );
 
     let started = false;
     const ensureStarted = () => {
@@ -220,7 +277,11 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
           }
         }
         if (isThinking) {
-          currentBlock = { type: "thinking", thinking: "", thinkingSignature: undefined };
+          currentBlock = {
+            type: "thinking",
+            thinking: "",
+            thinkingSignature: undefined,
+          };
           output.content.push(currentBlock);
           ensureStarted();
           stream.push({
@@ -232,7 +293,11 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
           currentBlock = { type: "text", text: "" };
           output.content.push(currentBlock);
           ensureStarted();
-          stream.push({ type: "text_start", contentIndex: blockIndex(), partial: output });
+          stream.push({
+            type: "text_start",
+            contentIndex: blockIndex(),
+            partial: output,
+          });
         }
       }
       if (currentBlock.type === "thinking") {
@@ -285,7 +350,10 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
           const providedId = part.functionCall.id;
           const needsNewId =
-            !providedId || output.content.some((b) => b.type === "toolCall" && b.id === providedId);
+            !providedId ||
+            output.content.some(
+              (b) => b.type === "toolCall" && b.id === providedId,
+            );
           const toolCallId = needsNewId
             ? `${part.functionCall.name}_${Date.now()}_${++toolCallCounter}`
             : providedId;
@@ -294,13 +362,20 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
             type: "toolCall",
             id: toolCallId,
             name: part.functionCall.name || "",
-            arguments: (part.functionCall.args as Record<string, unknown>) ?? {},
-            ...(part.thoughtSignature && { thoughtSignature: part.thoughtSignature }),
+            arguments:
+              (part.functionCall.args as Record<string, unknown>) ?? {},
+            ...(part.thoughtSignature && {
+              thoughtSignature: part.thoughtSignature,
+            }),
           };
           output.content.push(toolCall);
           ensureStarted();
-          stream.push({ type: "toolcall_start", contentIndex: blockIndex(), partial: output });
+          stream.push({
+            type: "toolcall_start",
+            contentIndex: blockIndex(),
+            partial: output,
+          });
           stream.push({
             type: "toolcall_delta",
             contentIndex: blockIndex(),
@@ -326,7 +401,8 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
 
       if (chunk?.usageMetadata) {
         const promptTokens = chunk.usageMetadata.promptTokenCount || 0;
-        const cacheReadTokens = chunk.usageMetadata.cachedContentTokenCount || 0;
+        const cacheReadTokens =
+          chunk.usageMetadata.cachedContentTokenCount || 0;
         output.usage = {
           input: promptTokens - cacheReadTokens,
           output:
@@ -335,7 +411,13 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
           cacheRead: cacheReadTokens,
           cacheWrite: 0,
           totalTokens: chunk.usageMetadata.totalTokenCount || 0,
-          cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
+          cost: {
+            input: 0,
+            output: 0,
+            cacheRead: 0,
+            cacheWrite: 0,
+            total: 0,
+          },
         };
         calculateCost(model, output.usage);
       }
@@ -377,7 +459,12 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
       }
     }
     output.stopReason = options?.signal?.aborted ? "aborted" : "error";
-    output.errorMessage = error instanceof Error ? error.message : JSON.stringify(error);
+    output.errorMessage =
+      error instanceof Error ? error.message : JSON.stringify(error);
+    const retryAfterMs = extractRetryAfterMs(error);
+    if (retryAfterMs !== undefined) {
+      output.retryAfterMs = retryAfterMs;
+    }
     stream.push({ type: "error", reason: output.stopReason, error: output });
     stream.end();
   }
@@ -395,7 +482,10 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
 * Auth is still handled by cli-core (apiKey is ignored). Returns the same `AssistantMessageEventStream`
 * as `streamGoogleGeminiCli()` after delegating with appropriate `thinking` config.
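 * Editor's illustration (not part of this patch): a minimal call sketch; the
 * option values below are assumptions rather than values taken from the
 * repository.
 * @example
 * const stream = streamSimpleGoogleGeminiCli(model, context, {
 *   reasoning: "medium", // resolved via resolveReasoningLevel() + clampReasoning()
 *   maxTokens: 4096, // combined with the thinking budget and clamped to model.maxTokens
 * });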
 */
-export const streamSimpleGoogleGeminiCli: StreamFunction<"google-gemini-cli", SimpleStreamOptions> = (
+export const streamSimpleGoogleGeminiCli: StreamFunction<
+  "google-gemini-cli",
+  SimpleStreamOptions
+> = (
   model: Model<"google-gemini-cli">,
   context: Context,
   options?: SimpleStreamOptions,
@@ -429,7 +519,9 @@ export const streamSimpleGoogleGeminiCli: StreamFunction<"google-gemini-cli", Si
     } satisfies GoogleGeminiCliOptions);
   }
 
-  const effort = clampReasoning(resolveReasoningLevel(model, options.reasoning))!;
+  const effort = clampReasoning(
+    resolveReasoningLevel(model, options.reasoning),
+  )!;
   if (isGemini3Model(model.id)) {
     return streamGoogleGeminiCli(model, context, {
       ...base,
@@ -450,7 +542,10 @@ export const streamSimpleGoogleGeminiCli: StreamFunction<"google-gemini-cli", Si
   const minOutputTokens = 1024;
   let thinkingBudget = budgets[effort]!;
 
-  const maxTokens = Math.min((base.maxTokens || 0) + thinkingBudget, model.maxTokens);
+  const maxTokens = Math.min(
+    (base.maxTokens || 0) + thinkingBudget,
+    model.maxTokens,
+  );
 
   if (maxTokens <= thinkingBudget) {
     thinkingBudget = Math.max(0, maxTokens - minOutputTokens);
@@ -483,15 +578,18 @@ function buildRequest(
   const contents = convertMessages(model, context);
 
   const config: NonNullable<GenerateContentParameters["config"]> = {};
-  if (options.temperature !== undefined) config.temperature = options.temperature;
-  if (options.maxTokens !== undefined) config.maxOutputTokens = options.maxTokens;
+  if (options.temperature !== undefined)
+    config.temperature = options.temperature;
+  if (options.maxTokens !== undefined)
+    config.maxOutputTokens = options.maxTokens;
 
   // Thinking config
   if (options.thinking?.enabled && model.reasoning) {
     const thinkingConfig: ThinkingConfig = { includeThoughts: true };
     // Gemini 3 models use thinkingLevel, older models use thinkingBudget
     if (options.thinking.level !== undefined) {
-      thinkingConfig.thinkingLevel = options.thinking.level as ThinkingConfig["thinkingLevel"];
+      thinkingConfig.thinkingLevel = options.thinking
+        .level as ThinkingConfig["thinkingLevel"];
     } else if (options.thinking.budgetTokens !== undefined) {
       thinkingConfig.thinkingBudget = options.thinking.budgetTokens;
     }
@@ -509,7 +607,9 @@ function buildRequest(
   // Claude via gemini-cli is no longer supported (Antigravity was the
   // only path). Keep the useParameters=false default.
   const useParameters = false;
-  config.tools = convertTools(context.tools, useParameters) as NonNullable<GenerateContentParameters["config"]>["tools"];
+  config.tools = convertTools(context.tools, useParameters) as NonNullable<
+    GenerateContentParameters["config"]
+  >["tools"];
   if (options.toolChoice) {
     config.toolConfig = {
       functionCallingConfig: {
@@ -535,7 +635,10 @@ type ClampedThinkingLevel = Exclude;
 * Gemini 3 Flash supports all four (MINIMAL/LOW/MEDIUM/HIGH one-to-one).
 * Used when `options.thinking.level` is set for Gemini 3 models.
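 * Editor's illustration (not part of this patch; the model id below is a
 * placeholder): for a Flash model the mapping is one-to-one, so
 * getGeminiCliThinkingLevel("medium", "gemini-3-flash-preview") yields "MEDIUM",
 * while Pro ids take the separate branch shown below.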
 */
-function getGeminiCliThinkingLevel(effort: ClampedThinkingLevel, modelId: string): GoogleThinkingLevel {
+function getGeminiCliThinkingLevel(
+  effort: ClampedThinkingLevel,
+  modelId: string,
+): GoogleThinkingLevel {
   if (isGemini3ProModel(modelId)) {
     switch (effort) {
       case "minimal":
diff --git a/src/resources/extensions/sf-usage-bar/index.ts b/src/resources/extensions/sf-usage-bar/index.ts
index f253212aa..b408ee46d 100644
--- a/src/resources/extensions/sf-usage-bar/index.ts
+++ b/src/resources/extensions/sf-usage-bar/index.ts
@@ -12,6 +12,14 @@ import { execSync, spawnSync } from "node:child_process";
 import * as fs from "node:fs";
 import * as os from "node:os";
 import * as path from "node:path";
+import {
+  AuthType,
+  CodeAssistServer,
+  getOauthClient,
+  makeFakeConfig,
+  type RetrieveUserQuotaResponse,
+  setupUser,
+} from "@google/gemini-cli-core";
 import type { ExtensionAPI } from "@singularity-forge/pi-coding-agent";
 import { visibleWidth } from "@singularity-forge/pi-tui";
@@ -382,92 +390,65 @@ async function fetchCopilotUsage(_modelRegistry: any): Promise {
 // ============================================================================
 
 async function fetchGeminiUsage(_modelRegistry: any): Promise {
-  let token: string | undefined;
-
-  // Read directly from sf/pi auth.json
-  const data = loadAuthJson();
-  if (data) {
-    token = data["google-gemini-cli"]?.access;
-  }
-
-  // Fallback to ~/.gemini/oauth_creds.json
-  if (!token) {
-    const credPath = path.join(os.homedir(), ".gemini", "oauth_creds.json");
-    try {
-      if (fs.existsSync(credPath)) {
-        const geminiData = JSON.parse(fs.readFileSync(credPath, "utf-8"));
-        token = geminiData.access_token;
-      }
-    } catch {} // missing or invalid JSON → continue
-  }
-
-  if (!token) {
+  const credPath = path.join(os.homedir(), ".gemini", "oauth_creds.json");
+  if (!fs.existsSync(credPath)) {
     return {
       provider: "gemini",
       displayName: "Gemini",
       windows: [],
-      error: "No credentials",
+      error: "No ~/.gemini credentials",
     };
   }
 
   try {
-    const controller = new AbortController();
-    setTimeout(() => controller.abort(), 5000);
-
-    const res = await fetch(
-      "https://cloudcode-pa.googleapis.com/v1internal:retrieveUserQuota",
-      {
-        method: "POST",
-        headers: {
-          Authorization: `Bearer ${token}`,
-          "Content-Type": "application/json",
-        },
-        body: "{}",
-        signal: controller.signal,
-      },
-    );
-
-    if (!res.ok) {
+    const config = makeFakeConfig();
+    const authClient = await getOauthClient(AuthType.LOGIN_WITH_GOOGLE, config);
+    const userData = await setupUser(authClient, config);
+    const projectId = userData.projectId;
+    if (!projectId) {
       return {
         provider: "gemini",
         displayName: "Gemini",
         windows: [],
-        error: `HTTP ${res.status}`,
+        error: "No Code Assist project",
       };
     }
 
-    const data = (await res.json()) as any;
-    const quotas: Record<string, number> = {};
+    const server = new CodeAssistServer(authClient, projectId, { headers: {} });
+    const data: RetrieveUserQuotaResponse = await server.retrieveUserQuota({
+      project: projectId,
+    });
+    const quotas: Record<
+      string,
+      { remainingFraction: number; resetTime?: string }
+    > = {};
 
     for (const bucket of data.buckets || []) {
       const model = bucket.modelId || "unknown";
       const frac = bucket.remainingFraction ?? 1;
-      if (!quotas[model] || frac < quotas[model]) quotas[model] = frac;
+      if (!quotas[model] || frac < quotas[model].remainingFraction) {
+        quotas[model] = {
+          remainingFraction: frac,
+          resetTime: bucket.resetTime,
+        };
+      }
     }
 
     const windows: RateWindow[] = [];
-    let proMin = 1,
-      flashMin = 1;
-    let hasProModel = false,
-      hasFlashModel = false;
-
-    for (const [model, frac] of Object.entries(quotas)) {
-      if (model.toLowerCase().includes("pro")) {
-        hasProModel = true;
-        if (frac < proMin) proMin = frac;
-      }
-      if (model.toLowerCase().includes("flash")) {
-        hasFlashModel = true;
-        if (frac < flashMin) flashMin = frac;
-      }
+    for (const [model, quota] of Object.entries(quotas).sort(([a], [b]) =>
+      a.localeCompare(b),
+    )) {
+      const resetDate = quota.resetTime ? new Date(quota.resetTime) : undefined;
+      windows.push({
+        label: model.replace(/^gemini-/, "").slice(0, 7),
+        usedPercent: (1 - quota.remainingFraction) * 100,
+        resetDescription:
+          resetDate && !Number.isNaN(resetDate.getTime())
+            ? formatReset(resetDate)
            : undefined,
+      });
    }
 
-    // Always show windows if model exists (even at 0% usage)
-    if (hasProModel)
-      windows.push({ label: "Pro", usedPercent: (1 - proMin) * 100 });
-    if (hasFlashModel)
-      windows.push({ label: "Flash", usedPercent: (1 - flashMin) * 100 });
-
     return { provider: "gemini", displayName: "Gemini", windows };
   } catch (e) {
     return {
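For reference, a rough sketch of what the new per-model window mapping does with a single quota bucket. The bucket shape and values below are made up for illustration; the real code iterates data.buckets from retrieveUserQuota() and formats the reset time with the extension's formatReset() helper.

// Editor's sketch (assumed bucket values, not taken from the patch):
const bucket = {
  modelId: "gemini-2.5-pro",
  remainingFraction: 0.25, // 25% of the quota left
  resetTime: "2026-05-02T15:00:00Z",
};

const window = {
  // "gemini-2.5-pro" -> "2.5-pro": strip the "gemini-" prefix, keep at most 7 chars
  label: bucket.modelId.replace(/^gemini-/, "").slice(0, 7),
  // (1 - 0.25) * 100 -> 75% of the window used
  usedPercent: (1 - bucket.remainingFraction) * 100,
  // the patch renders this with formatReset(new Date(bucket.resetTime))
  resetDescription: new Date(bucket.resetTime).toUTCString(),
};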