fix(gemini): route cli retry and quota through core
This commit is contained in:
parent
14c0412ee4
commit
98fe3b605d
2 changed files with 170 additions and 86 deletions
|
|
@ -8,8 +8,19 @@
|
|||
* via setupUser(), and handles all the User-Agent / retry / 429 details.
|
||||
*/
|
||||
|
||||
import type { Content, GenerateContentParameters, GenerateContentResponse, ThinkingConfig } from "@google/genai";
|
||||
import { AuthType, CodeAssistServer, getOauthClient, makeFakeConfig, setupUser } from "@google/gemini-cli-core";
|
||||
import {
|
||||
AuthType,
|
||||
CodeAssistServer,
|
||||
getOauthClient,
|
||||
makeFakeConfig,
|
||||
retryWithBackoff,
|
||||
setupUser,
|
||||
} from "@google/gemini-cli-core";
|
||||
import type {
|
||||
Content,
|
||||
GenerateContentParameters,
|
||||
ThinkingConfig,
|
||||
} from "@google/genai";
|
||||
import { calculateCost } from "../models.js";
|
||||
import type {
|
||||
Api,
|
||||
|
|
@ -35,7 +46,12 @@ import {
|
|||
mapToolChoice,
|
||||
retainThoughtSignature,
|
||||
} from "./google-shared.js";
|
||||
import { buildBaseOptions, clampReasoning, isAutoReasoning, resolveReasoningLevel } from "./simple-options.js";
|
||||
import {
|
||||
buildBaseOptions,
|
||||
clampReasoning,
|
||||
isAutoReasoning,
|
||||
resolveReasoningLevel,
|
||||
} from "./simple-options.js";
|
||||
|
||||
/**
|
||||
* Thinking level for Gemini 3 models.
|
||||
|
|
@ -44,7 +60,12 @@ import { buildBaseOptions, clampReasoning, isAutoReasoning, resolveReasoningLeve
|
|||
* These are the wire format values for `ThinkingConfig.thinkingLevel` sent to cli-core's
|
||||
* `CodeAssistServer.generateContentStream()`.
|
||||
*/
|
||||
export type GoogleThinkingLevel = "THINKING_LEVEL_UNSPECIFIED" | "MINIMAL" | "LOW" | "MEDIUM" | "HIGH";
|
||||
export type GoogleThinkingLevel =
|
||||
| "THINKING_LEVEL_UNSPECIFIED"
|
||||
| "MINIMAL"
|
||||
| "LOW"
|
||||
| "MEDIUM"
|
||||
| "HIGH";
|
||||
|
||||
/**
|
||||
* Options for `streamGoogleGeminiCli()`.
|
||||
|
|
@ -96,6 +117,34 @@ async function getCodeAssistServer(): Promise<CodeAssistServer> {
|
|||
return new CodeAssistServer(authClient, userData.projectId, { headers: {} });
|
||||
}
|
||||
|
||||
/**
 * Parse a compact duration string such as `"1h2m3s"`, `"45s"` or `"500ms"`
 * into milliseconds.
 *
 * The whole (trimmed) string must be a duration: components are optional but
 * must appear in the order hours, minutes, seconds, milliseconds, and nothing
 * else may follow. Matching is case-insensitive.
 *
 * @param value - Duration text, e.g. the token captured from a quota message.
 * @returns The total duration in milliseconds, or `undefined` when the text
 *   is empty, malformed, or adds up to zero.
 */
function parseDurationMs(value: string): number | undefined {
	// Anchored so trailing junk ("1m30") is rejected instead of being silently
	// truncated. `m(?!s)` keeps the minutes group from swallowing the "m" of a
	// millisecond suffix, which would turn "500ms" into 500 minutes.
	const match = value
		.trim()
		.match(/^(?:(\d+)h)?(?:(\d+)m(?!s))?(?:(\d+)s)?(?:(\d+)ms)?$/i);
	// Every component is optional, so an empty input matches with an empty
	// match[0]; treat that as "no duration".
	if (!match || !match[0]) return undefined;

	const hours = Number(match[1] ?? 0);
	const minutes = Number(match[2] ?? 0);
	const seconds = Number(match[3] ?? 0);
	const millis = Number(match[4] ?? 0);
	const totalMs = ((hours * 60 + minutes) * 60 + seconds) * 1000 + millis;

	// Absurdly long digit runs overflow to Infinity; never report that as a delay.
	return Number.isFinite(totalMs) && totalMs > 0 ? totalMs : undefined;
}
|
||||
|
||||
/**
 * Best-effort extraction of a retry delay from a thrown value.
 *
 * Two sources are consulted, in order:
 *  1. A numeric `retryDelayMs` property on the error object, accepted only
 *     when it is a finite, positive number.
 *  2. A "reset after <duration>" / "quota will reset after <duration>" phrase
 *     in the error text, whose duration token is handed to `parseDurationMs`.
 *
 * This function never throws: it is called from an error handler, so any
 * value it cannot interpret simply yields `undefined`.
 *
 * @param error - Whatever was thrown; may be a non-Error value.
 * @returns The delay in milliseconds, or `undefined` when none can be found.
 */
function extractRetryAfterMs(error: unknown): number | undefined {
	if (typeof error === "object" && error !== null && "retryDelayMs" in error) {
		const retryDelayMs = (error as { retryDelayMs?: unknown }).retryDelayMs;
		if (
			typeof retryDelayMs === "number" &&
			Number.isFinite(retryDelayMs) &&
			retryDelayMs > 0
		) {
			return retryDelayMs;
		}
	}

	let message: unknown;
	if (error instanceof Error) {
		message = error.message;
	} else {
		try {
			// JSON.stringify returns undefined for undefined/functions/symbols and
			// throws on circular structures or BigInt values.
			message = JSON.stringify(error);
		} catch {
			message = undefined;
		}
	}
	// Without a string there is nothing to scan; calling .match on undefined
	// would raise a TypeError inside the caller's error handler.
	if (typeof message !== "string") return undefined;

	const resetMatch = message.match(
		/(?:quota will reset|reset) after ([0-9hms]+)/i,
	);
	return resetMatch?.[1] ? parseDurationMs(resetMatch[1]) : undefined;
}
|
||||
|
||||
/**
|
||||
* Check if the model is a Gemini 3 Pro variant (gemini-3*-pro).
|
||||
|
|
@ -131,7 +180,10 @@ function isGemini3Model(modelId: string): boolean {
|
|||
* runtime shapes are byte-identical). Returns a real-time stream emitting start, delta, end, and
|
||||
* error events that accumulate into an `AssistantMessage`.
|
||||
*/
|
||||
export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGeminiCliOptions> = (
|
||||
export const streamGoogleGeminiCli: StreamFunction<
|
||||
"google-gemini-cli",
|
||||
GoogleGeminiCliOptions
|
||||
> = (
|
||||
model: Model<"google-gemini-cli">,
|
||||
context: Context,
|
||||
options?: GoogleGeminiCliOptions,
|
||||
|
|
@ -171,8 +223,13 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
|
|||
// so TypeScript sees two structurally-identical-but-distinct Content types.
|
||||
// The runtime shapes are byte-identical; the nominal split is a packaging
|
||||
// artefact.
|
||||
// biome-ignore lint/suspicious/noExplicitAny: see above
|
||||
const streamGen = await server.generateContentStream(req as any, promptId, "USER" as any);
|
||||
const streamGen = await retryWithBackoff(
|
||||
() => server.generateContentStream(req as any, promptId, "USER" as any),
|
||||
{
|
||||
authType: AuthType.LOGIN_WITH_GOOGLE,
|
||||
signal: options?.signal,
|
||||
},
|
||||
);
|
||||
|
||||
let started = false;
|
||||
const ensureStarted = () => {
|
||||
|
|
@ -220,7 +277,11 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
|
|||
}
|
||||
}
|
||||
if (isThinking) {
|
||||
currentBlock = { type: "thinking", thinking: "", thinkingSignature: undefined };
|
||||
currentBlock = {
|
||||
type: "thinking",
|
||||
thinking: "",
|
||||
thinkingSignature: undefined,
|
||||
};
|
||||
output.content.push(currentBlock);
|
||||
ensureStarted();
|
||||
stream.push({
|
||||
|
|
@ -232,7 +293,11 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
|
|||
currentBlock = { type: "text", text: "" };
|
||||
output.content.push(currentBlock);
|
||||
ensureStarted();
|
||||
stream.push({ type: "text_start", contentIndex: blockIndex(), partial: output });
|
||||
stream.push({
|
||||
type: "text_start",
|
||||
contentIndex: blockIndex(),
|
||||
partial: output,
|
||||
});
|
||||
}
|
||||
}
|
||||
if (currentBlock.type === "thinking") {
|
||||
|
|
@ -285,7 +350,10 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
|
|||
|
||||
const providedId = part.functionCall.id;
|
||||
const needsNewId =
|
||||
!providedId || output.content.some((b) => b.type === "toolCall" && b.id === providedId);
|
||||
!providedId ||
|
||||
output.content.some(
|
||||
(b) => b.type === "toolCall" && b.id === providedId,
|
||||
);
|
||||
const toolCallId = needsNewId
|
||||
? `${part.functionCall.name}_${Date.now()}_${++toolCallCounter}`
|
||||
: providedId;
|
||||
|
|
@ -294,13 +362,20 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
|
|||
type: "toolCall",
|
||||
id: toolCallId,
|
||||
name: part.functionCall.name || "",
|
||||
arguments: (part.functionCall.args as Record<string, unknown>) ?? {},
|
||||
...(part.thoughtSignature && { thoughtSignature: part.thoughtSignature }),
|
||||
arguments:
|
||||
(part.functionCall.args as Record<string, unknown>) ?? {},
|
||||
...(part.thoughtSignature && {
|
||||
thoughtSignature: part.thoughtSignature,
|
||||
}),
|
||||
};
|
||||
|
||||
output.content.push(toolCall);
|
||||
ensureStarted();
|
||||
stream.push({ type: "toolcall_start", contentIndex: blockIndex(), partial: output });
|
||||
stream.push({
|
||||
type: "toolcall_start",
|
||||
contentIndex: blockIndex(),
|
||||
partial: output,
|
||||
});
|
||||
stream.push({
|
||||
type: "toolcall_delta",
|
||||
contentIndex: blockIndex(),
|
||||
|
|
@ -326,7 +401,8 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
|
|||
|
||||
if (chunk?.usageMetadata) {
|
||||
const promptTokens = chunk.usageMetadata.promptTokenCount || 0;
|
||||
const cacheReadTokens = chunk.usageMetadata.cachedContentTokenCount || 0;
|
||||
const cacheReadTokens =
|
||||
chunk.usageMetadata.cachedContentTokenCount || 0;
|
||||
output.usage = {
|
||||
input: promptTokens - cacheReadTokens,
|
||||
output:
|
||||
|
|
@ -335,7 +411,13 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
|
|||
cacheRead: cacheReadTokens,
|
||||
cacheWrite: 0,
|
||||
totalTokens: chunk.usageMetadata.totalTokenCount || 0,
|
||||
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
|
||||
cost: {
|
||||
input: 0,
|
||||
output: 0,
|
||||
cacheRead: 0,
|
||||
cacheWrite: 0,
|
||||
total: 0,
|
||||
},
|
||||
};
|
||||
calculateCost(model, output.usage);
|
||||
}
|
||||
|
|
@ -377,7 +459,12 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
|
|||
}
|
||||
}
|
||||
output.stopReason = options?.signal?.aborted ? "aborted" : "error";
|
||||
output.errorMessage = error instanceof Error ? error.message : JSON.stringify(error);
|
||||
output.errorMessage =
|
||||
error instanceof Error ? error.message : JSON.stringify(error);
|
||||
const retryAfterMs = extractRetryAfterMs(error);
|
||||
if (retryAfterMs !== undefined) {
|
||||
output.retryAfterMs = retryAfterMs;
|
||||
}
|
||||
stream.push({ type: "error", reason: output.stopReason, error: output });
|
||||
stream.end();
|
||||
}
|
||||
|
|
@ -395,7 +482,10 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
|
|||
* Auth is still handled by cli-core (apiKey is ignored). Returns the same `AssistantMessageEventStream`
|
||||
* as `streamGoogleGeminiCli()` after delegating with appropriate `thinking` config.
|
||||
*/
|
||||
export const streamSimpleGoogleGeminiCli: StreamFunction<"google-gemini-cli", SimpleStreamOptions> = (
|
||||
export const streamSimpleGoogleGeminiCli: StreamFunction<
|
||||
"google-gemini-cli",
|
||||
SimpleStreamOptions
|
||||
> = (
|
||||
model: Model<"google-gemini-cli">,
|
||||
context: Context,
|
||||
options?: SimpleStreamOptions,
|
||||
|
|
@ -429,7 +519,9 @@ export const streamSimpleGoogleGeminiCli: StreamFunction<"google-gemini-cli", Si
|
|||
} satisfies GoogleGeminiCliOptions);
|
||||
}
|
||||
|
||||
const effort = clampReasoning(resolveReasoningLevel(model, options.reasoning))!;
|
||||
const effort = clampReasoning(
|
||||
resolveReasoningLevel(model, options.reasoning),
|
||||
)!;
|
||||
if (isGemini3Model(model.id)) {
|
||||
return streamGoogleGeminiCli(model, context, {
|
||||
...base,
|
||||
|
|
@ -450,7 +542,10 @@ export const streamSimpleGoogleGeminiCli: StreamFunction<"google-gemini-cli", Si
|
|||
|
||||
const minOutputTokens = 1024;
|
||||
let thinkingBudget = budgets[effort]!;
|
||||
const maxTokens = Math.min((base.maxTokens || 0) + thinkingBudget, model.maxTokens);
|
||||
const maxTokens = Math.min(
|
||||
(base.maxTokens || 0) + thinkingBudget,
|
||||
model.maxTokens,
|
||||
);
|
||||
|
||||
if (maxTokens <= thinkingBudget) {
|
||||
thinkingBudget = Math.max(0, maxTokens - minOutputTokens);
|
||||
|
|
@ -483,15 +578,18 @@ function buildRequest(
|
|||
const contents = convertMessages(model, context);
|
||||
|
||||
const config: NonNullable<GenerateContentParameters["config"]> = {};
|
||||
if (options.temperature !== undefined) config.temperature = options.temperature;
|
||||
if (options.maxTokens !== undefined) config.maxOutputTokens = options.maxTokens;
|
||||
if (options.temperature !== undefined)
|
||||
config.temperature = options.temperature;
|
||||
if (options.maxTokens !== undefined)
|
||||
config.maxOutputTokens = options.maxTokens;
|
||||
|
||||
// Thinking config
|
||||
if (options.thinking?.enabled && model.reasoning) {
|
||||
const thinkingConfig: ThinkingConfig = { includeThoughts: true };
|
||||
// Gemini 3 models use thinkingLevel, older models use thinkingBudget
|
||||
if (options.thinking.level !== undefined) {
|
||||
thinkingConfig.thinkingLevel = options.thinking.level as ThinkingConfig["thinkingLevel"];
|
||||
thinkingConfig.thinkingLevel = options.thinking
|
||||
.level as ThinkingConfig["thinkingLevel"];
|
||||
} else if (options.thinking.budgetTokens !== undefined) {
|
||||
thinkingConfig.thinkingBudget = options.thinking.budgetTokens;
|
||||
}
|
||||
|
|
@ -509,7 +607,9 @@ function buildRequest(
|
|||
// Claude via gemini-cli is no longer supported (Antigravity was the
|
||||
// only path). Keep the useParameters=false default.
|
||||
const useParameters = false;
|
||||
config.tools = convertTools(context.tools, useParameters) as NonNullable<GenerateContentParameters["config"]>["tools"];
|
||||
config.tools = convertTools(context.tools, useParameters) as NonNullable<
|
||||
GenerateContentParameters["config"]
|
||||
>["tools"];
|
||||
if (options.toolChoice) {
|
||||
config.toolConfig = {
|
||||
functionCallingConfig: {
|
||||
|
|
@ -535,7 +635,10 @@ type ClampedThinkingLevel = Exclude<ThinkingLevel, "xhigh">;
|
|||
* Gemini 3 Flash supports all four (MINIMAL/LOW/MEDIUM/HIGH one-to-one).
|
||||
* Used when `options.thinking.level` is set for Gemini 3 models.
|
||||
*/
|
||||
function getGeminiCliThinkingLevel(effort: ClampedThinkingLevel, modelId: string): GoogleThinkingLevel {
|
||||
function getGeminiCliThinkingLevel(
|
||||
effort: ClampedThinkingLevel,
|
||||
modelId: string,
|
||||
): GoogleThinkingLevel {
|
||||
if (isGemini3ProModel(modelId)) {
|
||||
switch (effort) {
|
||||
case "minimal":
|
||||
|
|
|
|||
|
|
@ -12,6 +12,14 @@ import { execSync, spawnSync } from "node:child_process";
|
|||
import * as fs from "node:fs";
|
||||
import * as os from "node:os";
|
||||
import * as path from "node:path";
|
||||
import {
|
||||
AuthType,
|
||||
CodeAssistServer,
|
||||
getOauthClient,
|
||||
makeFakeConfig,
|
||||
type RetrieveUserQuotaResponse,
|
||||
setupUser,
|
||||
} from "@google/gemini-cli-core";
|
||||
import type { ExtensionAPI } from "@singularity-forge/pi-coding-agent";
|
||||
import { visibleWidth } from "@singularity-forge/pi-tui";
|
||||
|
||||
|
|
@ -382,92 +390,65 @@ async function fetchCopilotUsage(_modelRegistry: any): Promise<UsageSnapshot> {
|
|||
// ============================================================================
|
||||
|
||||
async function fetchGeminiUsage(_modelRegistry: any): Promise<UsageSnapshot> {
|
||||
let token: string | undefined;
|
||||
|
||||
// Read directly from sf/pi auth.json
|
||||
const data = loadAuthJson();
|
||||
if (data) {
|
||||
token = data["google-gemini-cli"]?.access;
|
||||
}
|
||||
|
||||
// Fallback to ~/.gemini/oauth_creds.json
|
||||
if (!token) {
|
||||
const credPath = path.join(os.homedir(), ".gemini", "oauth_creds.json");
|
||||
try {
|
||||
if (fs.existsSync(credPath)) {
|
||||
const geminiData = JSON.parse(fs.readFileSync(credPath, "utf-8"));
|
||||
token = geminiData.access_token;
|
||||
}
|
||||
} catch {} // missing or invalid JSON → continue
|
||||
}
|
||||
|
||||
if (!token) {
|
||||
const credPath = path.join(os.homedir(), ".gemini", "oauth_creds.json");
|
||||
if (!fs.existsSync(credPath)) {
|
||||
return {
|
||||
provider: "gemini",
|
||||
displayName: "Gemini",
|
||||
windows: [],
|
||||
error: "No credentials",
|
||||
error: "No ~/.gemini credentials",
|
||||
};
|
||||
}
|
||||
|
||||
try {
|
||||
const controller = new AbortController();
|
||||
setTimeout(() => controller.abort(), 5000);
|
||||
|
||||
const res = await fetch(
|
||||
"https://cloudcode-pa.googleapis.com/v1internal:retrieveUserQuota",
|
||||
{
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Bearer ${token}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: "{}",
|
||||
signal: controller.signal,
|
||||
},
|
||||
);
|
||||
|
||||
if (!res.ok) {
|
||||
const config = makeFakeConfig();
|
||||
const authClient = await getOauthClient(AuthType.LOGIN_WITH_GOOGLE, config);
|
||||
const userData = await setupUser(authClient, config);
|
||||
const projectId = userData.projectId;
|
||||
if (!projectId) {
|
||||
return {
|
||||
provider: "gemini",
|
||||
displayName: "Gemini",
|
||||
windows: [],
|
||||
error: `HTTP ${res.status}`,
|
||||
error: "No Code Assist project",
|
||||
};
|
||||
}
|
||||
|
||||
const data = (await res.json()) as any;
|
||||
const quotas: Record<string, number> = {};
|
||||
const server = new CodeAssistServer(authClient, projectId, { headers: {} });
|
||||
const data: RetrieveUserQuotaResponse = await server.retrieveUserQuota({
|
||||
project: projectId,
|
||||
});
|
||||
const quotas: Record<
|
||||
string,
|
||||
{ remainingFraction: number; resetTime?: string }
|
||||
> = {};
|
||||
|
||||
for (const bucket of data.buckets || []) {
|
||||
const model = bucket.modelId || "unknown";
|
||||
const frac = bucket.remainingFraction ?? 1;
|
||||
if (!quotas[model] || frac < quotas[model]) quotas[model] = frac;
|
||||
if (!quotas[model] || frac < quotas[model].remainingFraction) {
|
||||
quotas[model] = {
|
||||
remainingFraction: frac,
|
||||
resetTime: bucket.resetTime,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
const windows: RateWindow[] = [];
|
||||
let proMin = 1,
|
||||
flashMin = 1;
|
||||
let hasProModel = false,
|
||||
hasFlashModel = false;
|
||||
|
||||
for (const [model, frac] of Object.entries(quotas)) {
|
||||
if (model.toLowerCase().includes("pro")) {
|
||||
hasProModel = true;
|
||||
if (frac < proMin) proMin = frac;
|
||||
}
|
||||
if (model.toLowerCase().includes("flash")) {
|
||||
hasFlashModel = true;
|
||||
if (frac < flashMin) flashMin = frac;
|
||||
}
|
||||
for (const [model, quota] of Object.entries(quotas).sort(([a], [b]) =>
|
||||
a.localeCompare(b),
|
||||
)) {
|
||||
const resetDate = quota.resetTime ? new Date(quota.resetTime) : undefined;
|
||||
windows.push({
|
||||
label: model.replace(/^gemini-/, "").slice(0, 7),
|
||||
usedPercent: (1 - quota.remainingFraction) * 100,
|
||||
resetDescription:
|
||||
resetDate && !Number.isNaN(resetDate.getTime())
|
||||
? formatReset(resetDate)
|
||||
: undefined,
|
||||
});
|
||||
}
|
||||
|
||||
// Always show windows if model exists (even at 0% usage)
|
||||
if (hasProModel)
|
||||
windows.push({ label: "Pro", usedPercent: (1 - proMin) * 100 });
|
||||
if (hasFlashModel)
|
||||
windows.push({ label: "Flash", usedPercent: (1 - flashMin) * 100 });
|
||||
|
||||
return { provider: "gemini", displayName: "Gemini", windows };
|
||||
} catch (e) {
|
||||
return {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue