fix(gemini): route cli retry and quota through core

This commit is contained in:
Mikael Hugo 2026-05-02 13:20:10 +02:00
parent 14c0412ee4
commit 98fe3b605d
2 changed files with 170 additions and 86 deletions

View file

@ -8,8 +8,19 @@
* via setupUser(), and handles all the User-Agent / retry / 429 details.
*/
import type { Content, GenerateContentParameters, GenerateContentResponse, ThinkingConfig } from "@google/genai";
import { AuthType, CodeAssistServer, getOauthClient, makeFakeConfig, setupUser } from "@google/gemini-cli-core";
import {
AuthType,
CodeAssistServer,
getOauthClient,
makeFakeConfig,
retryWithBackoff,
setupUser,
} from "@google/gemini-cli-core";
import type {
Content,
GenerateContentParameters,
ThinkingConfig,
} from "@google/genai";
import { calculateCost } from "../models.js";
import type {
Api,
@ -35,7 +46,12 @@ import {
mapToolChoice,
retainThoughtSignature,
} from "./google-shared.js";
import { buildBaseOptions, clampReasoning, isAutoReasoning, resolveReasoningLevel } from "./simple-options.js";
import {
buildBaseOptions,
clampReasoning,
isAutoReasoning,
resolveReasoningLevel,
} from "./simple-options.js";
/**
* Thinking level for Gemini 3 models.
@ -44,7 +60,12 @@ import { buildBaseOptions, clampReasoning, isAutoReasoning, resolveReasoningLeve
* These are the wire format values for `ThinkingConfig.thinkingLevel` sent to cli-core's
* `CodeAssistServer.generateContentStream()`.
*/
export type GoogleThinkingLevel = "THINKING_LEVEL_UNSPECIFIED" | "MINIMAL" | "LOW" | "MEDIUM" | "HIGH";
export type GoogleThinkingLevel =
| "THINKING_LEVEL_UNSPECIFIED"
| "MINIMAL"
| "LOW"
| "MEDIUM"
| "HIGH";
/**
* Options for `streamGoogleGeminiCli()`.
@ -96,6 +117,34 @@ async function getCodeAssistServer(): Promise<CodeAssistServer> {
return new CodeAssistServer(authClient, userData.projectId, { headers: {} });
}
/**
 * Parse a compact duration string like "1h30m15s" into milliseconds.
 *
 * The match is anchored to the full string so ambiguous inputs such as
 * "500ms" are rejected instead of being misread as 500 minutes (the
 * previous unanchored pattern matched the "500m" prefix); an explicit
 * trailing "<n>ms" milliseconds component is supported instead.
 *
 * @param value - Duration text, e.g. "2h", "45s", "1h5m", "250ms".
 * @returns Total duration in milliseconds, or `undefined` when the string
 *   is not a recognizable duration or works out to zero.
 */
function parseDurationMs(value: string): number | undefined {
  const match = value.match(/^(?:(\d+)h)?(?:(\d+)m)?(?:(\d+)s)?(?:(\d+)ms)?$/i);
  if (!match || !match[0]) return undefined;
  const hours = Number(match[1] ?? 0);
  const minutes = Number(match[2] ?? 0);
  const seconds = Number(match[3] ?? 0);
  const millis = Number(match[4] ?? 0);
  const totalMs = ((hours * 60 + minutes) * 60 + seconds) * 1000 + millis;
  // Zero (e.g. "0s") carries no useful retry hint — treat as unparseable.
  return totalMs > 0 ? totalMs : undefined;
}
/**
 * Derive a retry delay (in milliseconds) from a provider error.
 *
 * Preference order: a structured, positive, finite `retryDelayMs` field on
 * the error object wins; otherwise the error text is scanned for a
 * "(quota will) reset after <duration>" phrase and that duration is parsed.
 *
 * @param error - Whatever was thrown by the streaming call.
 * @returns Milliseconds to wait before retrying, or `undefined` when the
 *   error carries no usable hint.
 */
function extractRetryAfterMs(error: unknown): number | undefined {
  // Structured hint first: some errors carry retryDelayMs directly.
  if (typeof error === "object" && error !== null && "retryDelayMs" in error) {
    const { retryDelayMs } = error as { retryDelayMs?: unknown };
    const isUsable =
      typeof retryDelayMs === "number" &&
      Number.isFinite(retryDelayMs) &&
      retryDelayMs > 0;
    if (isUsable) return retryDelayMs;
  }
  // Fall back to scraping the human-readable message for a reset duration.
  const text = error instanceof Error ? error.message : JSON.stringify(error);
  const found = /(?:quota will reset|reset) after ([0-9hms]+)/i.exec(text);
  const duration = found?.[1];
  return duration ? parseDurationMs(duration) : undefined;
}
/**
* Check if the model is a Gemini 3 Pro variant (gemini-3*-pro).
@ -131,7 +180,10 @@ function isGemini3Model(modelId: string): boolean {
* runtime shapes are byte-identical). Returns a real-time stream emitting start, delta, end, and
* error events that accumulate into an `AssistantMessage`.
*/
export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGeminiCliOptions> = (
export const streamGoogleGeminiCli: StreamFunction<
"google-gemini-cli",
GoogleGeminiCliOptions
> = (
model: Model<"google-gemini-cli">,
context: Context,
options?: GoogleGeminiCliOptions,
@ -171,8 +223,13 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
// so TypeScript sees two structurally-identical-but-distinct Content types.
// The runtime shapes are byte-identical; the nominal split is a packaging
// artefact.
// biome-ignore lint/suspicious/noExplicitAny: see above
const streamGen = await server.generateContentStream(req as any, promptId, "USER" as any);
const streamGen = await retryWithBackoff(
() => server.generateContentStream(req as any, promptId, "USER" as any),
{
authType: AuthType.LOGIN_WITH_GOOGLE,
signal: options?.signal,
},
);
let started = false;
const ensureStarted = () => {
@ -220,7 +277,11 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
}
}
if (isThinking) {
currentBlock = { type: "thinking", thinking: "", thinkingSignature: undefined };
currentBlock = {
type: "thinking",
thinking: "",
thinkingSignature: undefined,
};
output.content.push(currentBlock);
ensureStarted();
stream.push({
@ -232,7 +293,11 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
currentBlock = { type: "text", text: "" };
output.content.push(currentBlock);
ensureStarted();
stream.push({ type: "text_start", contentIndex: blockIndex(), partial: output });
stream.push({
type: "text_start",
contentIndex: blockIndex(),
partial: output,
});
}
}
if (currentBlock.type === "thinking") {
@ -285,7 +350,10 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
const providedId = part.functionCall.id;
const needsNewId =
!providedId || output.content.some((b) => b.type === "toolCall" && b.id === providedId);
!providedId ||
output.content.some(
(b) => b.type === "toolCall" && b.id === providedId,
);
const toolCallId = needsNewId
? `${part.functionCall.name}_${Date.now()}_${++toolCallCounter}`
: providedId;
@ -294,13 +362,20 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
type: "toolCall",
id: toolCallId,
name: part.functionCall.name || "",
arguments: (part.functionCall.args as Record<string, unknown>) ?? {},
...(part.thoughtSignature && { thoughtSignature: part.thoughtSignature }),
arguments:
(part.functionCall.args as Record<string, unknown>) ?? {},
...(part.thoughtSignature && {
thoughtSignature: part.thoughtSignature,
}),
};
output.content.push(toolCall);
ensureStarted();
stream.push({ type: "toolcall_start", contentIndex: blockIndex(), partial: output });
stream.push({
type: "toolcall_start",
contentIndex: blockIndex(),
partial: output,
});
stream.push({
type: "toolcall_delta",
contentIndex: blockIndex(),
@ -326,7 +401,8 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
if (chunk?.usageMetadata) {
const promptTokens = chunk.usageMetadata.promptTokenCount || 0;
const cacheReadTokens = chunk.usageMetadata.cachedContentTokenCount || 0;
const cacheReadTokens =
chunk.usageMetadata.cachedContentTokenCount || 0;
output.usage = {
input: promptTokens - cacheReadTokens,
output:
@ -335,7 +411,13 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
cacheRead: cacheReadTokens,
cacheWrite: 0,
totalTokens: chunk.usageMetadata.totalTokenCount || 0,
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
cost: {
input: 0,
output: 0,
cacheRead: 0,
cacheWrite: 0,
total: 0,
},
};
calculateCost(model, output.usage);
}
@ -377,7 +459,12 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
}
}
output.stopReason = options?.signal?.aborted ? "aborted" : "error";
output.errorMessage = error instanceof Error ? error.message : JSON.stringify(error);
output.errorMessage =
error instanceof Error ? error.message : JSON.stringify(error);
const retryAfterMs = extractRetryAfterMs(error);
if (retryAfterMs !== undefined) {
output.retryAfterMs = retryAfterMs;
}
stream.push({ type: "error", reason: output.stopReason, error: output });
stream.end();
}
@ -395,7 +482,10 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
* Auth is still handled by cli-core (apiKey is ignored). Returns the same `AssistantMessageEventStream`
* as `streamGoogleGeminiCli()` after delegating with appropriate `thinking` config.
*/
export const streamSimpleGoogleGeminiCli: StreamFunction<"google-gemini-cli", SimpleStreamOptions> = (
export const streamSimpleGoogleGeminiCli: StreamFunction<
"google-gemini-cli",
SimpleStreamOptions
> = (
model: Model<"google-gemini-cli">,
context: Context,
options?: SimpleStreamOptions,
@ -429,7 +519,9 @@ export const streamSimpleGoogleGeminiCli: StreamFunction<"google-gemini-cli", Si
} satisfies GoogleGeminiCliOptions);
}
const effort = clampReasoning(resolveReasoningLevel(model, options.reasoning))!;
const effort = clampReasoning(
resolveReasoningLevel(model, options.reasoning),
)!;
if (isGemini3Model(model.id)) {
return streamGoogleGeminiCli(model, context, {
...base,
@ -450,7 +542,10 @@ export const streamSimpleGoogleGeminiCli: StreamFunction<"google-gemini-cli", Si
const minOutputTokens = 1024;
let thinkingBudget = budgets[effort]!;
const maxTokens = Math.min((base.maxTokens || 0) + thinkingBudget, model.maxTokens);
const maxTokens = Math.min(
(base.maxTokens || 0) + thinkingBudget,
model.maxTokens,
);
if (maxTokens <= thinkingBudget) {
thinkingBudget = Math.max(0, maxTokens - minOutputTokens);
@ -483,15 +578,18 @@ function buildRequest(
const contents = convertMessages(model, context);
const config: NonNullable<GenerateContentParameters["config"]> = {};
if (options.temperature !== undefined) config.temperature = options.temperature;
if (options.maxTokens !== undefined) config.maxOutputTokens = options.maxTokens;
if (options.temperature !== undefined)
config.temperature = options.temperature;
if (options.maxTokens !== undefined)
config.maxOutputTokens = options.maxTokens;
// Thinking config
if (options.thinking?.enabled && model.reasoning) {
const thinkingConfig: ThinkingConfig = { includeThoughts: true };
// Gemini 3 models use thinkingLevel, older models use thinkingBudget
if (options.thinking.level !== undefined) {
thinkingConfig.thinkingLevel = options.thinking.level as ThinkingConfig["thinkingLevel"];
thinkingConfig.thinkingLevel = options.thinking
.level as ThinkingConfig["thinkingLevel"];
} else if (options.thinking.budgetTokens !== undefined) {
thinkingConfig.thinkingBudget = options.thinking.budgetTokens;
}
@ -509,7 +607,9 @@ function buildRequest(
// Claude via gemini-cli is no longer supported (Antigravity was the
// only path). Keep the useParameters=false default.
const useParameters = false;
config.tools = convertTools(context.tools, useParameters) as NonNullable<GenerateContentParameters["config"]>["tools"];
config.tools = convertTools(context.tools, useParameters) as NonNullable<
GenerateContentParameters["config"]
>["tools"];
if (options.toolChoice) {
config.toolConfig = {
functionCallingConfig: {
@ -535,7 +635,10 @@ type ClampedThinkingLevel = Exclude<ThinkingLevel, "xhigh">;
* Gemini 3 Flash supports all four (MINIMAL/LOW/MEDIUM/HIGH one-to-one).
* Used when `options.thinking.level` is set for Gemini 3 models.
*/
function getGeminiCliThinkingLevel(effort: ClampedThinkingLevel, modelId: string): GoogleThinkingLevel {
function getGeminiCliThinkingLevel(
effort: ClampedThinkingLevel,
modelId: string,
): GoogleThinkingLevel {
if (isGemini3ProModel(modelId)) {
switch (effort) {
case "minimal":

View file

@ -12,6 +12,14 @@ import { execSync, spawnSync } from "node:child_process";
import * as fs from "node:fs";
import * as os from "node:os";
import * as path from "node:path";
import {
AuthType,
CodeAssistServer,
getOauthClient,
makeFakeConfig,
type RetrieveUserQuotaResponse,
setupUser,
} from "@google/gemini-cli-core";
import type { ExtensionAPI } from "@singularity-forge/pi-coding-agent";
import { visibleWidth } from "@singularity-forge/pi-tui";
@ -382,92 +390,65 @@ async function fetchCopilotUsage(_modelRegistry: any): Promise<UsageSnapshot> {
// ============================================================================
async function fetchGeminiUsage(_modelRegistry: any): Promise<UsageSnapshot> {
let token: string | undefined;
// Read directly from sf/pi auth.json
const data = loadAuthJson();
if (data) {
token = data["google-gemini-cli"]?.access;
}
// Fallback to ~/.gemini/oauth_creds.json
if (!token) {
const credPath = path.join(os.homedir(), ".gemini", "oauth_creds.json");
try {
if (fs.existsSync(credPath)) {
const geminiData = JSON.parse(fs.readFileSync(credPath, "utf-8"));
token = geminiData.access_token;
}
} catch {} // missing or invalid JSON → continue
}
if (!token) {
const credPath = path.join(os.homedir(), ".gemini", "oauth_creds.json");
if (!fs.existsSync(credPath)) {
return {
provider: "gemini",
displayName: "Gemini",
windows: [],
error: "No credentials",
error: "No ~/.gemini credentials",
};
}
try {
const controller = new AbortController();
setTimeout(() => controller.abort(), 5000);
const res = await fetch(
"https://cloudcode-pa.googleapis.com/v1internal:retrieveUserQuota",
{
method: "POST",
headers: {
Authorization: `Bearer ${token}`,
"Content-Type": "application/json",
},
body: "{}",
signal: controller.signal,
},
);
if (!res.ok) {
const config = makeFakeConfig();
const authClient = await getOauthClient(AuthType.LOGIN_WITH_GOOGLE, config);
const userData = await setupUser(authClient, config);
const projectId = userData.projectId;
if (!projectId) {
return {
provider: "gemini",
displayName: "Gemini",
windows: [],
error: `HTTP ${res.status}`,
error: "No Code Assist project",
};
}
const data = (await res.json()) as any;
const quotas: Record<string, number> = {};
const server = new CodeAssistServer(authClient, projectId, { headers: {} });
const data: RetrieveUserQuotaResponse = await server.retrieveUserQuota({
project: projectId,
});
const quotas: Record<
string,
{ remainingFraction: number; resetTime?: string }
> = {};
for (const bucket of data.buckets || []) {
const model = bucket.modelId || "unknown";
const frac = bucket.remainingFraction ?? 1;
if (!quotas[model] || frac < quotas[model]) quotas[model] = frac;
if (!quotas[model] || frac < quotas[model].remainingFraction) {
quotas[model] = {
remainingFraction: frac,
resetTime: bucket.resetTime,
};
}
}
const windows: RateWindow[] = [];
let proMin = 1,
flashMin = 1;
let hasProModel = false,
hasFlashModel = false;
for (const [model, frac] of Object.entries(quotas)) {
if (model.toLowerCase().includes("pro")) {
hasProModel = true;
if (frac < proMin) proMin = frac;
}
if (model.toLowerCase().includes("flash")) {
hasFlashModel = true;
if (frac < flashMin) flashMin = frac;
}
for (const [model, quota] of Object.entries(quotas).sort(([a], [b]) =>
a.localeCompare(b),
)) {
const resetDate = quota.resetTime ? new Date(quota.resetTime) : undefined;
windows.push({
label: model.replace(/^gemini-/, "").slice(0, 7),
usedPercent: (1 - quota.remainingFraction) * 100,
resetDescription:
resetDate && !Number.isNaN(resetDate.getTime())
? formatReset(resetDate)
: undefined,
});
}
// Always show windows if model exists (even at 0% usage)
if (hasProModel)
windows.push({ label: "Pro", usedPercent: (1 - proMin) * 100 });
if (hasFlashModel)
windows.push({ label: "Flash", usedPercent: (1 - flashMin) * 100 });
return { provider: "gemini", displayName: "Gemini", windows };
} catch (e) {
return {