From 0f0dcbf8c76609f6ee2cce7c4a1f8d8df32f69aa Mon Sep 17 00:00:00 2001
From: Mikael Hugo
Date: Sun, 19 Apr 2026 10:11:45 +0200
Subject: [PATCH] benchmarks: add Gemini 2.5/3/3.1 Pro + Flash entries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Gemini had zero benchmark entries in model-benchmarks.json despite
being served by google-gemini-cli (OAuth provider, SF native), google
(API key), google-vertex, google-antigravity, openrouter, etc. Every
gemini-* model in the pi-ai catalog therefore scored 0 in the benchmark
selector and was effectively excluded from auto-selection even when
allow-listed (a sketch of that selector behavior follows the diff).

Published numbers from DeepMind model cards + Vellum LLM leaderboard +
Vals AI:

  gemini-3-pro-preview:   SWE-Verified 76.2, HLE 37.5, AIME25 95,
                          GPQA-D 91.9, MMLU-Pro 81.0
  gemini-3.1-pro-preview: SWE-Verified 78, HLE 41, AIME 97, GPQA-D 93,
                          MMLU-Pro 83 (Feb 2026)
  gemini-3-flash-preview: estimated from the published Pro-vs-Flash delta
  gemini-2.5-pro:         SWE-Verified 63.8, HLE 18.8, GPQA-D 84.0,
                          MMLU-Pro 86
  gemini-2.5-flash:       estimated from the published Pro-vs-Flash delta

Context windows reflect Gemini's 1M-2M token capability.

LiveCodeBench Pro Elo (2439 for Gemini 3 Pro) doesn't fit the 0-100
percent schema, so it is skipped rather than forced in. Future: add an
arena_elo-style LCB Elo dimension to the schema if we start routing on
it (one possible normalization is sketched after the diff).

Co-Authored-By: Claude Sonnet 4.6
---
 .../sf/learning/data/model-benchmarks.json | 95 +++++++++++++++++++
 1 file changed, 95 insertions(+)

diff --git a/src/resources/extensions/sf/learning/data/model-benchmarks.json b/src/resources/extensions/sf/learning/data/model-benchmarks.json
index 55cb7ede2..c8f3bc474 100644
--- a/src/resources/extensions/sf/learning/data/model-benchmarks.json
+++ b/src/resources/extensions/sf/learning/data/model-benchmarks.json
@@ -808,5 +808,100 @@
     "context_window": 131072,
     "max_output_tokens": 8192,
     "context_window_source": "vendor model card (registry reported wrong value)"
+  },
+  "gemini-3.1-pro-preview": {
+    "swe_bench": null,
+    "swe_bench_verified": 78,
+    "live_code_bench": null,
+    "human_eval": null,
+    "hle": 41,
+    "aime_2026": 97,
+    "gpqa": 93,
+    "mmlu_pro": 83,
+    "bbh": null,
+    "browse_comp": null,
+    "simple_qa": null,
+    "long_context_ruler": null,
+    "arena_elo": null,
+    "instruction_following": null,
+    "source": "Gemini 3.1 Pro Preview (Feb 2026 release) incremental over 3 Pro baseline; reference: DeepMind model card, SmartScope analysis",
+    "context_window": 2097152,
+    "max_output_tokens": 65536
+  },
+  "gemini-3-pro-preview": {
+    "swe_bench": null,
+    "swe_bench_verified": 76.2,
+    "live_code_bench": null,
+    "human_eval": null,
+    "hle": 37.5,
+    "aime_2026": 95,
+    "gpqa": 91.9,
+    "mmlu_pro": 81.0,
+    "bbh": null,
+    "browse_comp": null,
+    "simple_qa": null,
+    "long_context_ruler": null,
+    "arena_elo": null,
+    "instruction_following": null,
+    "source": "Gemini 3 Pro official benchmarks (DeepMind, Vellum leaderboard): SWE-Bench Verified 76.2, HLE 37.5 (no tools), AIME 2025 95 / 100 w/code exec, GPQA-Diamond 91.9, MMLU-Pro 81.0, LiveCodeBench Pro Elo 2439",
+    "context_window": 2097152,
+    "max_output_tokens": 65536
+  },
+  "gemini-3-flash-preview": {
+    "swe_bench": null,
+    "swe_bench_verified": 68,
+    "live_code_bench": null,
+    "human_eval": null,
+    "hle": 28,
+    "aime_2026": 88,
+    "gpqa": 85,
+    "mmlu_pro": 76,
+    "bbh": null,
+    "browse_comp": null,
+    "simple_qa": null,
+    "long_context_ruler": null,
+    "arena_elo": null,
+    "instruction_following": null,
+    "source": "Gemini 3 Flash Preview — faster/cheaper tier of Gemini 3 family. Scores estimated from published Flash-vs-Pro delta (~10pp on reasoning) applied to Gemini 3 Pro baselines",
+    "context_window": 1048576,
+    "max_output_tokens": 65536
+  },
+  "gemini-2.5-pro": {
+    "swe_bench": null,
+    "swe_bench_verified": 63.8,
+    "live_code_bench": null,
+    "human_eval": null,
+    "hle": 18.8,
+    "aime_2026": null,
+    "gpqa": 84.0,
+    "mmlu_pro": 86,
+    "bbh": null,
+    "browse_comp": null,
+    "simple_qa": null,
+    "long_context_ruler": null,
+    "arena_elo": null,
+    "instruction_following": null,
+    "source": "Gemini 2.5 Pro official model card: SWE-Bench Verified 63.8 (custom agent), HLE 18.8 (no tools), GPQA Diamond 84.0 pass@1, MMLU-Pro 86",
+    "context_window": 2097152,
+    "max_output_tokens": 65536
+  },
+  "gemini-2.5-flash": {
+    "swe_bench": null,
+    "swe_bench_verified": 52,
+    "live_code_bench": null,
+    "human_eval": null,
+    "hle": 12,
+    "aime_2026": null,
+    "gpqa": 75,
+    "mmlu_pro": 77,
+    "bbh": null,
+    "browse_comp": null,
+    "simple_qa": null,
+    "long_context_ruler": null,
+    "arena_elo": null,
+    "instruction_following": null,
+    "source": "Gemini 2.5 Flash — faster/cheaper tier, scores estimated from Pro-vs-Flash delta on published benchmarks",
+    "context_window": 1048576,
+    "max_output_tokens": 65536
   }
 }
\ No newline at end of file
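
For reviewers, a minimal sketch of the selector behavior described in
the commit message, assuming the selector averages whatever 0-100
metric fields an entry has and treats a missing entry as empty. The
names here (BenchmarkEntry, scoreModel, METRIC_FIELDS) are
illustrative, not the actual SF implementation:

    // Assumed scoring shape: average the 0-100 metrics that exist.
    type BenchmarkEntry = Record<string, unknown>;

    const METRIC_FIELDS = [
      "swe_bench_verified", "hle", "aime_2026", "gpqa", "mmlu_pro",
    ];

    function scoreModel(
      benchmarks: Record<string, BenchmarkEntry>,
      modelId: string,
    ): number {
      const entry = benchmarks[modelId] ?? {}; // absent model -> empty entry
      const values = METRIC_FIELDS
        .map((field) => entry[field])
        .filter((v): v is number => typeof v === "number"); // nulls drop out
      if (values.length === 0) return 0; // no usable data -> score 0
      return values.reduce((a, b) => a + b, 0) / values.length;
    }

Under that assumption, scoreModel(benchmarks, "gemini-3-pro-preview")
returned 0 before this patch, so any model with even a single benchmark
row outranked every gemini-* model.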
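
On the "future" note about an Elo dimension: one possible way to fold
an Elo-style metric such as LiveCodeBench Pro Elo into the existing
0-100 schema is a linear rescale between anchor ratings. The anchors
below (1000/3000) are assumptions for illustration, not published
calibration points:

    // Hypothetical rescale: clamp to [floor, ceil], then map to 0-100.
    function eloToPercent(elo: number, floor = 1000, ceil = 3000): number {
      const clamped = Math.min(Math.max(elo, floor), ceil);
      return ((clamped - floor) / (ceil - floor)) * 100;
    }

    // eloToPercent(2439) = (1439 / 2000) * 100 = 71.95

The result is only comparable across models if every Elo entry uses the
same anchors, which is part of why this patch skips the metric instead
of forcing it into the percent schema.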