diff --git a/packages/pi-coding-agent/src/core/fallback-resolver.test.ts b/packages/pi-coding-agent/src/core/fallback-resolver.test.ts
index 4e9ca7cd9..c40a6462b 100644
--- a/packages/pi-coding-agent/src/core/fallback-resolver.test.ts
+++ b/packages/pi-coding-agent/src/core/fallback-resolver.test.ts
@@ -82,7 +82,7 @@ describe("FallbackResolver — findFallback", () => {
 		assert.equal(result!.chainName, "coding");
 	});
 
-	it("marks current provider as exhausted", async () => {
+	it("marks current provider as exhausted for rate_limit errors", async () => {
 		const { resolver, authStorage } = createResolver();
 		await resolver.findFallback(zaiModel, "rate_limit");
 
@@ -92,6 +92,18 @@ describe("FallbackResolver — findFallback", () => {
 		assert.equal(fn.mock.calls[0][1], "rate_limit");
 	});
 
+	it("does NOT mark provider as exhausted for quota_exhausted (per-model quota)", async () => {
+		const { resolver, authStorage } = createResolver();
+		await resolver.findFallback(zaiModel, "quota_exhausted");
+
+		const fn = authStorage.markProviderExhausted as any;
+		assert.equal(
+			fn.mock.calls.length,
+			0,
+			"quota_exhausted should not mark entire provider exhausted — other models may have quota",
+		);
+	});
+
 	it("skips backed-off providers", async () => {
 		const { resolver } = createResolver({
 			isProviderAvailable: (provider: string) => provider !== "alibaba",
diff --git a/packages/pi-coding-agent/src/core/fallback-resolver.ts b/packages/pi-coding-agent/src/core/fallback-resolver.ts
index 690dca75d..7220b819a 100644
--- a/packages/pi-coding-agent/src/core/fallback-resolver.ts
+++ b/packages/pi-coding-agent/src/core/fallback-resolver.ts
@@ -44,8 +44,13 @@ export class FallbackResolver {
 		const { enabled, chains } = this.settingsManager.getFallbackSettings();
 		if (!enabled) return null;
 
-		// Mark the current provider as exhausted at the provider level
-		this.authStorage.markProviderExhausted(currentModel.provider, errorType);
+		// Mark the current provider as exhausted at the provider level.
+		// Skip for quota_exhausted — quotas are typically per-model (e.g.
+		// google-gemini-cli's Code Assist per-model limits), so other models
+		// from the same provider may still be available.
+		if (errorType !== "quota_exhausted") {
+			this.authStorage.markProviderExhausted(currentModel.provider, errorType);
+		}
 
 		// Search all chains for one containing the current model
 		for (const [chainName, entries] of Object.entries(chains)) {
diff --git a/packages/pi-coding-agent/src/core/retry-handler.test.ts b/packages/pi-coding-agent/src/core/retry-handler.test.ts
index 71086f202..1272955ed 100644
--- a/packages/pi-coding-agent/src/core/retry-handler.test.ts
+++ b/packages/pi-coding-agent/src/core/retry-handler.test.ts
@@ -513,6 +513,19 @@ describe("RetryHandler — long-context entitlement 429 (#2803)", () => {
 			);
 			assert.equal(handler.isRetryableError(msg), true);
 		});
+
+		it("considers 'no capacity' provider errors as retryable", () => {
+			const { deps } = createMockDeps();
+			const handler = new RetryHandler(deps);
+			const msg = errorMessage(
+				"No capacity available for model gemini-2.5-pro on the server",
+			);
+			assert.equal(
+				handler.isRetryableError(msg),
+				true,
+				"no capacity errors should be retryable (triggers fallback)",
+			);
+		});
 	});
 
 	describe("third-party block claude-code fallback (#3772)", () => {
@@ -608,7 +621,7 @@ describe("RetryHandler — long-context entitlement 429 (#2803)", () => {
 	});
 
 	describe("quota wait before fallback", () => {
-		it("waits for retryAfterMs before retrying same provider on quota error", async () => {
+		it("waits for short retryAfterMs before retrying same provider on quota error", async () => {
 			const { deps, emittedEvents, continueFn } = createMockDeps({
 				model: createMockModel("google-gemini-cli", "gemini-2.5-pro"),
 				fallbackResult: null,
@@ -617,14 +630,14 @@ describe("RetryHandler — long-context entitlement 429 (#2803)", () => {
 
 			const handler = new RetryHandler(deps);
 			const msg = errorMessage(
-				"You have exhausted your capacity on this model. Your quota will reset after 59s.",
+				"You have exhausted your capacity on this model. Your quota will reset after 3s.",
 			);
-			(msg as any).retryAfterMs = 59000;
+			(msg as any).retryAfterMs = 3000;
 
 			const result = await handler.handleRetryableError(msg);
 
-			// Should wait and retry, not immediately fail
-			assert.equal(result, true, "should wait and retry on quota reset");
+			// Should wait and retry for short resets (<= 5s threshold)
+			assert.equal(result, true, "should wait and retry on short quota reset");
 
 			const retryStart = emittedEvents.find(
 				(e) => e.type === "auto_retry_start",
@@ -632,7 +645,7 @@ describe("RetryHandler — long-context entitlement 429 (#2803)", () => {
 			assert.ok(retryStart, "should emit auto_retry_start with wait");
 			assert.equal(
 				retryStart!.delayMs,
-				59000,
+				3000,
 				"should use provider's retry-after delay",
 			);
 			assert.ok(
@@ -651,7 +664,7 @@ describe("RetryHandler — long-context entitlement 429 (#2803)", () => {
 			);
 		});
 
-		it("falls through to fallback when retryAfterMs exceeds maxDelayMs", async () => {
+		it("falls through to fallback when retryAfterMs exceeds short threshold", async () => {
 			const fallbackModel = createMockModel("openai", "gpt-4o");
 			const { deps, emittedEvents } = createMockDeps({
 				model: createMockModel("google-gemini-cli", "gemini-2.5-pro"),
@@ -659,18 +672,18 @@ describe("RetryHandler — long-context entitlement 429 (#2803)", () => {
 					model: fallbackModel,
 					reason: "cross-provider fallback",
 				},
-				retrySettings: { maxRetries: 5, baseDelayMs: 1000, maxDelayMs: 30000 },
+				retrySettings: { maxRetries: 5, baseDelayMs: 1000, maxDelayMs: 60000 },
 			});
 
 			const handler = new RetryHandler(deps);
 			const msg = errorMessage(
-				"You have exhausted your capacity on this model. Your quota will reset after 5m.",
+				"You have exhausted your capacity on this model. Your quota will reset after 59s.",
 			);
-			(msg as any).retryAfterMs = 300000; // 5 minutes, exceeds maxDelayMs
+			(msg as any).retryAfterMs = 59000;
 
 			const result = await handler.handleRetryableError(msg);
 
-			// Should fall through to fallback since wait is too long
+			// Should fall through to fallback since wait > 5s threshold
 			assert.equal(result, true, "should fallback when quota reset is too long");
 
 			const switchEvent = emittedEvents.find(
@@ -764,6 +777,39 @@ describe("RetryHandler — long-context entitlement 429 (#2803)", () => {
 			);
 		});
 
+		it("triggers fallback for 'no capacity' server errors", async () => {
+			// "No capacity available" is a provider-side capacity issue rather
+			// than a credential problem or a true rate limit. It is nevertheless
+			// classified as rate_limit so that the fallback chain is triggered.
+			const fallbackModel = createMockModel("openai", "gpt-4o");
+			const { deps, emittedEvents, findFallback } = createMockDeps({
+				model: createMockModel("google-gemini-cli", "gemini-2.5-pro"),
+				markUsageLimitReachedResult: false,
+				fallbackResult: {
+					model: fallbackModel,
+					reason: "free-selection fallback",
+				},
+			});
+
+			const handler = new RetryHandler(deps);
+			const msg = errorMessage(
+				"No capacity available for model gemini-2.5-pro on the server",
+			);
+
+			const result = await handler.handleRetryableError(msg);
+
+			assert.equal(result, true, "should retry with fallback provider");
+			assert.equal(
+				findFallback.mock.calls.length,
+				1,
+				"should invoke fallback resolver for capacity errors",
+			);
+			assert.ok(
+				emittedEvents.some((e) => e.type === "fallback_provider_switch"),
+				"should emit fallback_provider_switch",
+			);
+		});
+
 		it("still tries cross-provider fallback for quota_exhausted without credential backoff", async () => {
 			const fallbackModel = createMockModel("openai", "gpt-4o");
 			const { deps, markUsageLimitReached, continueFn } = createMockDeps({
diff --git a/packages/pi-coding-agent/src/core/retry-handler.ts b/packages/pi-coding-agent/src/core/retry-handler.ts
index 50261b89a..725312c38 100644
--- a/packages/pi-coding-agent/src/core/retry-handler.ts
+++ b/packages/pi-coding-agent/src/core/retry-handler.ts
@@ -119,7 +119,7 @@ export class RetryHandler {
 		// generated error from getApiKey() when credentials are in a backoff window.
 		// Re-entering the retry handler for that message creates a cascade of empty
 		// error entries in the session file, breaking resume (#3429).
-		return /overloaded|rate.?limit|too many requests|402|429|500|502|503|504|service.?unavailable|server.?error|internal.?error|connection.?error|connection.?refused|connection.?lost|other side closed|fetch failed|upstream.?connect|reset before headers|terminated|retry delay|network.?(?:is\s+)?unavailable|credentials.*expired|requires more credits|can only afford|insufficient credits|not enough credits|extra usage is required|(?:out of|no) extra usage|third.party.*draw from extra|third.party.*not.*available/i.test(
+		return /overloaded|rate.?limit|too many requests|402|429|500|502|503|504|service.?unavailable|server.?error|internal.?error|connection.?error|connection.?refused|connection.?lost|other side closed|fetch failed|upstream.?connect|reset before headers|terminated|retry delay|network.?(?:is\s+)?unavailable|credentials.*expired|requires more credits|can only afford|insufficient credits|not enough credits|extra usage is required|(?:out of|no) extra usage|third.party.*draw from extra|third.party.*not.*available|no capacity|capacity.*available/i.test(
 			err,
 		);
 	}
@@ -211,9 +211,10 @@ export class RetryHandler {
 			const isAuthError = errorType === "auth_error";
 			if (isRateLimit || isQuotaError || isAuthError) {
 				// For quota errors with a retry-after hint, wait before switching providers.
-				// The quota may reset quickly (e.g. 59s), so waiting is often better than
-				// switching to a potentially worse model.
-				if (isQuotaError && message.retryAfterMs !== undefined && message.retryAfterMs > 0) {
+				// Only wait if the reset is very short (<= 5s); otherwise falling back to
+				// another provider is faster and keeps auto-mode throughput up.
+				const QUOTA_WAIT_THRESHOLD_MS = 5_000;
+				if (isQuotaError && message.retryAfterMs !== undefined && message.retryAfterMs > 0 && message.retryAfterMs <= QUOTA_WAIT_THRESHOLD_MS) {
 					const cap = settings.maxDelayMs > 0 ? settings.maxDelayMs : Infinity;
 					if (message.retryAfterMs <= cap) {
 						this._deps.emit({
@@ -486,6 +487,11 @@ export class RetryHandler {
 			return "quota_exhausted";
 		if (/rate.?limit|too many requests|429|529|overloaded/i.test(err))
 			return "rate_limit";
+		// Provider-side capacity/server load — the server has no available
+		// capacity for this model (e.g. "No capacity available for model X").
+		// Treat as rate_limit so the fallback chain kicks in immediately.
+		if (/no capacity|capacity.*available|server.*busy|too busy/i.test(err))
+			return "rate_limit";
 		if (
 			/500|502|503|504|server.?error|internal.?error|service.?unavailable/i.test(
 				err,