From 98fe3b605dfad266dc024f39993e3d91b0ddfa98 Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Sat, 2 May 2026 13:20:10 +0200 Subject: [PATCH] fix(gemini): route cli retry and quota through core --- .../pi-ai/src/providers/google-gemini-cli.ts | 151 +++++++++++++++--- .../extensions/sf-usage-bar/index.ts | 105 +++++------- 2 files changed, 170 insertions(+), 86 deletions(-) diff --git a/packages/pi-ai/src/providers/google-gemini-cli.ts b/packages/pi-ai/src/providers/google-gemini-cli.ts index f07ba91a0..c815227cc 100644 --- a/packages/pi-ai/src/providers/google-gemini-cli.ts +++ b/packages/pi-ai/src/providers/google-gemini-cli.ts @@ -8,8 +8,19 @@ * via setupUser(), and handles all the User-Agent / retry / 429 details. */ -import type { Content, GenerateContentParameters, GenerateContentResponse, ThinkingConfig } from "@google/genai"; -import { AuthType, CodeAssistServer, getOauthClient, makeFakeConfig, setupUser } from "@google/gemini-cli-core"; +import { + AuthType, + CodeAssistServer, + getOauthClient, + makeFakeConfig, + retryWithBackoff, + setupUser, +} from "@google/gemini-cli-core"; +import type { + Content, + GenerateContentParameters, + ThinkingConfig, +} from "@google/genai"; import { calculateCost } from "../models.js"; import type { Api, @@ -35,7 +46,12 @@ import { mapToolChoice, retainThoughtSignature, } from "./google-shared.js"; -import { buildBaseOptions, clampReasoning, isAutoReasoning, resolveReasoningLevel } from "./simple-options.js"; +import { + buildBaseOptions, + clampReasoning, + isAutoReasoning, + resolveReasoningLevel, +} from "./simple-options.js"; /** * Thinking level for Gemini 3 models. @@ -44,7 +60,12 @@ import { buildBaseOptions, clampReasoning, isAutoReasoning, resolveReasoningLeve * These are the wire format values for `ThinkingConfig.thinkingLevel` sent to cli-core's * `CodeAssistServer.generateContentStream()`. */ -export type GoogleThinkingLevel = "THINKING_LEVEL_UNSPECIFIED" | "MINIMAL" | "LOW" | "MEDIUM" | "HIGH"; +export type GoogleThinkingLevel = + | "THINKING_LEVEL_UNSPECIFIED" + | "MINIMAL" + | "LOW" + | "MEDIUM" + | "HIGH"; /** * Options for `streamGoogleGeminiCli()`. @@ -96,6 +117,34 @@ async function getCodeAssistServer(): Promise { return new CodeAssistServer(authClient, userData.projectId, { headers: {} }); } +function parseDurationMs(value: string): number | undefined { + const match = value.match(/(?:(\d+)h)?(?:(\d+)m)?(?:(\d+)s)?/i); + if (!match || !match[0]) return undefined; + const hours = Number(match[1] ?? 0); + const minutes = Number(match[2] ?? 0); + const seconds = Number(match[3] ?? 0); + const totalMs = ((hours * 60 + minutes) * 60 + seconds) * 1000; + return totalMs > 0 ? totalMs : undefined; +} + +function extractRetryAfterMs(error: unknown): number | undefined { + if (typeof error === "object" && error !== null && "retryDelayMs" in error) { + const retryDelayMs = (error as { retryDelayMs?: unknown }).retryDelayMs; + if ( + typeof retryDelayMs === "number" && + Number.isFinite(retryDelayMs) && + retryDelayMs > 0 + ) { + return retryDelayMs; + } + } + const message = + error instanceof Error ? error.message : JSON.stringify(error); + const resetMatch = message.match( + /(?:quota will reset|reset) after ([0-9hms]+)/i, + ); + return resetMatch?.[1] ? parseDurationMs(resetMatch[1]) : undefined; +} /** * Check if the model is a Gemini 3 Pro variant (gemini-3*-pro). @@ -131,7 +180,10 @@ function isGemini3Model(modelId: string): boolean { * runtime shapes are byte-identical). 
Returns a real-time stream emitting start, delta, end, and
 * error events that accumulate into an `AssistantMessage`.
 */
-export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGeminiCliOptions> = (
+export const streamGoogleGeminiCli: StreamFunction<
+  "google-gemini-cli",
+  GoogleGeminiCliOptions
+> = (
   model: Model<"google-gemini-cli">,
   context: Context,
   options?: GoogleGeminiCliOptions,
@@ -171,8 +223,13 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
     // so TypeScript sees two structurally-identical-but-distinct Content types.
     // The runtime shapes are byte-identical; the nominal split is a packaging
     // artefact.
-    // biome-ignore lint/suspicious/noExplicitAny: see above
-    const streamGen = await server.generateContentStream(req as any, promptId, "USER" as any);
+    const streamGen = await retryWithBackoff(
+      () => server.generateContentStream(req as any, promptId, "USER" as any),
+      {
+        authType: AuthType.LOGIN_WITH_GOOGLE,
+        signal: options?.signal,
+      },
+    );
 
     let started = false;
     const ensureStarted = () => {
@@ -220,7 +277,11 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
           }
         }
         if (isThinking) {
-          currentBlock = { type: "thinking", thinking: "", thinkingSignature: undefined };
+          currentBlock = {
+            type: "thinking",
+            thinking: "",
+            thinkingSignature: undefined,
+          };
           output.content.push(currentBlock);
           ensureStarted();
           stream.push({
@@ -232,7 +293,11 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
           currentBlock = { type: "text", text: "" };
           output.content.push(currentBlock);
           ensureStarted();
-          stream.push({ type: "text_start", contentIndex: blockIndex(), partial: output });
+          stream.push({
+            type: "text_start",
+            contentIndex: blockIndex(),
+            partial: output,
+          });
         }
       }
       if (currentBlock.type === "thinking") {
@@ -285,7 +350,10 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
           const providedId = part.functionCall.id;
           const needsNewId =
-            !providedId || output.content.some((b) => b.type === "toolCall" && b.id === providedId);
+            !providedId ||
+            output.content.some(
+              (b) => b.type === "toolCall" && b.id === providedId,
+            );
           const toolCallId = needsNewId
             ? `${part.functionCall.name}_${Date.now()}_${++toolCallCounter}`
             : providedId;
@@ -294,13 +362,20 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
             type: "toolCall",
             id: toolCallId,
             name: part.functionCall.name || "",
-            arguments: (part.functionCall.args as Record<string, unknown>) ?? {},
-            ...(part.thoughtSignature && { thoughtSignature: part.thoughtSignature }),
+            arguments:
+              (part.functionCall.args as Record<string, unknown>) ?? {},
+            ...(part.thoughtSignature && {
+              thoughtSignature: part.thoughtSignature,
+            }),
           };
           output.content.push(toolCall);
           ensureStarted();
-          stream.push({ type: "toolcall_start", contentIndex: blockIndex(), partial: output });
+          stream.push({
+            type: "toolcall_start",
+            contentIndex: blockIndex(),
+            partial: output,
+          });
           stream.push({
             type: "toolcall_delta",
             contentIndex: blockIndex(),
@@ -326,7 +401,8 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
 
       if (chunk?.usageMetadata) {
         const promptTokens = chunk.usageMetadata.promptTokenCount || 0;
-        const cacheReadTokens = chunk.usageMetadata.cachedContentTokenCount || 0;
+        const cacheReadTokens =
+          chunk.usageMetadata.cachedContentTokenCount || 0;
         output.usage = {
           input: promptTokens - cacheReadTokens,
           output:
@@ -335,7 +411,13 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
           cacheRead: cacheReadTokens,
           cacheWrite: 0,
           totalTokens: chunk.usageMetadata.totalTokenCount || 0,
-          cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
+          cost: {
+            input: 0,
+            output: 0,
+            cacheRead: 0,
+            cacheWrite: 0,
+            total: 0,
+          },
         };
         calculateCost(model, output.usage);
       }
@@ -377,7 +459,12 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
       }
     }
     output.stopReason = options?.signal?.aborted ? "aborted" : "error";
-    output.errorMessage = error instanceof Error ? error.message : JSON.stringify(error);
+    output.errorMessage =
+      error instanceof Error ? error.message : JSON.stringify(error);
+    const retryAfterMs = extractRetryAfterMs(error);
+    if (retryAfterMs !== undefined) {
+      output.retryAfterMs = retryAfterMs;
+    }
     stream.push({ type: "error", reason: output.stopReason, error: output });
     stream.end();
   }
@@ -395,7 +482,10 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli", GoogleGe
 * Auth is still handled by cli-core (apiKey is ignored). Returns the same `AssistantMessageEventStream`
 * as `streamGoogleGeminiCli()` after delegating with appropriate `thinking` config.
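 * Editor's illustration (not part of this patch): a minimal call sketch; the
 * option values below are assumptions rather than values taken from the
 * repository.
 * @example
 * const stream = streamSimpleGoogleGeminiCli(model, context, {
 *   reasoning: "medium", // resolved via resolveReasoningLevel() + clampReasoning()
 *   maxTokens: 4096, // combined with the thinking budget and clamped to model.maxTokens
 * });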
 */
-export const streamSimpleGoogleGeminiCli: StreamFunction<"google-gemini-cli", SimpleStreamOptions> = (
+export const streamSimpleGoogleGeminiCli: StreamFunction<
+  "google-gemini-cli",
+  SimpleStreamOptions
+> = (
   model: Model<"google-gemini-cli">,
   context: Context,
   options?: SimpleStreamOptions,
@@ -429,7 +519,9 @@ export const streamSimpleGoogleGeminiCli: StreamFunction<"google-gemini-cli", Si
     } satisfies GoogleGeminiCliOptions);
   }
 
-  const effort = clampReasoning(resolveReasoningLevel(model, options.reasoning))!;
+  const effort = clampReasoning(
+    resolveReasoningLevel(model, options.reasoning),
+  )!;
   if (isGemini3Model(model.id)) {
     return streamGoogleGeminiCli(model, context, {
       ...base,
@@ -450,7 +542,10 @@ export const streamSimpleGoogleGeminiCli: StreamFunction<"google-gemini-cli", Si
   const minOutputTokens = 1024;
   let thinkingBudget = budgets[effort]!;
 
-  const maxTokens = Math.min((base.maxTokens || 0) + thinkingBudget, model.maxTokens);
+  const maxTokens = Math.min(
+    (base.maxTokens || 0) + thinkingBudget,
+    model.maxTokens,
+  );
 
   if (maxTokens <= thinkingBudget) {
     thinkingBudget = Math.max(0, maxTokens - minOutputTokens);
@@ -483,15 +578,18 @@ function buildRequest(
   const contents = convertMessages(model, context);
 
   const config: NonNullable<GenerateContentParameters["config"]> = {};
-  if (options.temperature !== undefined) config.temperature = options.temperature;
-  if (options.maxTokens !== undefined) config.maxOutputTokens = options.maxTokens;
+  if (options.temperature !== undefined)
+    config.temperature = options.temperature;
+  if (options.maxTokens !== undefined)
+    config.maxOutputTokens = options.maxTokens;
 
   // Thinking config
   if (options.thinking?.enabled && model.reasoning) {
     const thinkingConfig: ThinkingConfig = { includeThoughts: true };
     // Gemini 3 models use thinkingLevel, older models use thinkingBudget
     if (options.thinking.level !== undefined) {
-      thinkingConfig.thinkingLevel = options.thinking.level as ThinkingConfig["thinkingLevel"];
+      thinkingConfig.thinkingLevel = options.thinking
+        .level as ThinkingConfig["thinkingLevel"];
     } else if (options.thinking.budgetTokens !== undefined) {
       thinkingConfig.thinkingBudget = options.thinking.budgetTokens;
     }
@@ -509,7 +607,9 @@ function buildRequest(
   // Claude via gemini-cli is no longer supported (Antigravity was the
   // only path). Keep the useParameters=false default.
   const useParameters = false;
-  config.tools = convertTools(context.tools, useParameters) as NonNullable<GenerateContentParameters["config"]>["tools"];
+  config.tools = convertTools(context.tools, useParameters) as NonNullable<
+    GenerateContentParameters["config"]
+  >["tools"];
   if (options.toolChoice) {
     config.toolConfig = {
       functionCallingConfig: {
@@ -535,7 +635,10 @@ type ClampedThinkingLevel = Exclude;
 * Gemini 3 Flash supports all four (MINIMAL/LOW/MEDIUM/HIGH one-to-one).
 * Used when `options.thinking.level` is set for Gemini 3 models.
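 * Editor's illustration (not part of this patch; the model id below is a
 * placeholder): for a Flash model the mapping is one-to-one, so
 * getGeminiCliThinkingLevel("medium", "gemini-3-flash-preview") yields "MEDIUM",
 * while Pro ids take the separate branch shown below.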
 */
-function getGeminiCliThinkingLevel(effort: ClampedThinkingLevel, modelId: string): GoogleThinkingLevel {
+function getGeminiCliThinkingLevel(
+  effort: ClampedThinkingLevel,
+  modelId: string,
+): GoogleThinkingLevel {
   if (isGemini3ProModel(modelId)) {
     switch (effort) {
       case "minimal":
diff --git a/src/resources/extensions/sf-usage-bar/index.ts b/src/resources/extensions/sf-usage-bar/index.ts
index f253212aa..b408ee46d 100644
--- a/src/resources/extensions/sf-usage-bar/index.ts
+++ b/src/resources/extensions/sf-usage-bar/index.ts
@@ -12,6 +12,14 @@ import { execSync, spawnSync } from "node:child_process";
 import * as fs from "node:fs";
 import * as os from "node:os";
 import * as path from "node:path";
+import {
+  AuthType,
+  CodeAssistServer,
+  getOauthClient,
+  makeFakeConfig,
+  type RetrieveUserQuotaResponse,
+  setupUser,
+} from "@google/gemini-cli-core";
 import type { ExtensionAPI } from "@singularity-forge/pi-coding-agent";
 import { visibleWidth } from "@singularity-forge/pi-tui";
@@ -382,92 +390,65 @@ async function fetchCopilotUsage(_modelRegistry: any): Promise {
 // ============================================================================
 
 async function fetchGeminiUsage(_modelRegistry: any): Promise {
-  let token: string | undefined;
-
-  // Read directly from sf/pi auth.json
-  const data = loadAuthJson();
-  if (data) {
-    token = data["google-gemini-cli"]?.access;
-  }
-
-  // Fallback to ~/.gemini/oauth_creds.json
-  if (!token) {
-    const credPath = path.join(os.homedir(), ".gemini", "oauth_creds.json");
-    try {
-      if (fs.existsSync(credPath)) {
-        const geminiData = JSON.parse(fs.readFileSync(credPath, "utf-8"));
-        token = geminiData.access_token;
-      }
-    } catch {} // missing or invalid JSON → continue
-  }
-
-  if (!token) {
+  const credPath = path.join(os.homedir(), ".gemini", "oauth_creds.json");
+  if (!fs.existsSync(credPath)) {
     return {
       provider: "gemini",
       displayName: "Gemini",
       windows: [],
-      error: "No credentials",
+      error: "No ~/.gemini credentials",
     };
   }
 
   try {
-    const controller = new AbortController();
-    setTimeout(() => controller.abort(), 5000);
-
-    const res = await fetch(
-      "https://cloudcode-pa.googleapis.com/v1internal:retrieveUserQuota",
-      {
-        method: "POST",
-        headers: {
-          Authorization: `Bearer ${token}`,
-          "Content-Type": "application/json",
-        },
-        body: "{}",
-        signal: controller.signal,
-      },
-    );
-
-    if (!res.ok) {
+    const config = makeFakeConfig();
+    const authClient = await getOauthClient(AuthType.LOGIN_WITH_GOOGLE, config);
+    const userData = await setupUser(authClient, config);
+    const projectId = userData.projectId;
+    if (!projectId) {
       return {
         provider: "gemini",
         displayName: "Gemini",
         windows: [],
-        error: `HTTP ${res.status}`,
+        error: "No Code Assist project",
       };
     }
 
-    const data = (await res.json()) as any;
-    const quotas: Record<string, number> = {};
+    const server = new CodeAssistServer(authClient, projectId, { headers: {} });
+    const data: RetrieveUserQuotaResponse = await server.retrieveUserQuota({
+      project: projectId,
+    });
+    const quotas: Record<
+      string,
+      { remainingFraction: number; resetTime?: string }
+    > = {};
 
     for (const bucket of data.buckets || []) {
       const model = bucket.modelId || "unknown";
       const frac = bucket.remainingFraction ?? 1;
-      if (!quotas[model] || frac < quotas[model]) quotas[model] = frac;
+      if (!quotas[model] || frac < quotas[model].remainingFraction) {
+        quotas[model] = {
+          remainingFraction: frac,
+          resetTime: bucket.resetTime,
+        };
+      }
     }
 
     const windows: RateWindow[] = [];
-    let proMin = 1,
-      flashMin = 1;
-    let hasProModel = false,
-      hasFlashModel = false;
-
-    for (const [model, frac] of Object.entries(quotas)) {
-      if (model.toLowerCase().includes("pro")) {
-        hasProModel = true;
-        if (frac < proMin) proMin = frac;
-      }
-      if (model.toLowerCase().includes("flash")) {
-        hasFlashModel = true;
-        if (frac < flashMin) flashMin = frac;
-      }
+    for (const [model, quota] of Object.entries(quotas).sort(([a], [b]) =>
+      a.localeCompare(b),
+    )) {
+      const resetDate = quota.resetTime ? new Date(quota.resetTime) : undefined;
+      windows.push({
+        label: model.replace(/^gemini-/, "").slice(0, 7),
+        usedPercent: (1 - quota.remainingFraction) * 100,
+        resetDescription:
+          resetDate && !Number.isNaN(resetDate.getTime())
+            ? formatReset(resetDate)
            : undefined,
+      });
    }
 
-    // Always show windows if model exists (even at 0% usage)
-    if (hasProModel)
-      windows.push({ label: "Pro", usedPercent: (1 - proMin) * 100 });
-    if (hasFlashModel)
-      windows.push({ label: "Flash", usedPercent: (1 - flashMin) * 100 });
-
     return { provider: "gemini", displayName: "Gemini", windows };
   } catch (e) {
     return {
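For reference, a rough sketch of what the new per-model window mapping does with a single quota bucket. The bucket shape and values below are made up for illustration; the real code iterates data.buckets from retrieveUserQuota() and formats the reset time with the extension's formatReset() helper.

// Editor's sketch (assumed bucket values, not taken from the patch):
const bucket = {
  modelId: "gemini-2.5-pro",
  remainingFraction: 0.25, // 25% of the quota left
  resetTime: "2026-05-02T15:00:00Z",
};

const window = {
  // "gemini-2.5-pro" -> "2.5-pro": strip the "gemini-" prefix, keep at most 7 chars
  label: bucket.modelId.replace(/^gemini-/, "").slice(0, 7),
  // (1 - 0.25) * 100 -> 75% of the window used
  usedPercent: (1 - bucket.remainingFraction) * 100,
  // the patch renders this with formatReset(new Date(bucket.resetTime))
  resetDescription: new Date(bucket.resetTime).toUTCString(),
};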