benchmarks: add Gemini 2.5/3/3.1 Pro + Flash entries

Gemini had zero benchmark entries in model-benchmarks.json despite being served by google-gemini-cli (OAuth provider, SF native), google (API key), google-vertex, google-antigravity, openrouter, etc. Every gemini-* model in the pi-ai catalog scored 0 in the benchmark selector — effectively excluded from auto-selection even when allow-listed.

Published numbers from DeepMind model cards + Vellum LLM leaderboard + Vals AI:

  gemini-3-pro-preview:   SWE-Verified 76.2, HLE 37.5, AIME25 95, GPQA-D 91.9, MMLU-Pro 81.0
  gemini-3.1-pro-preview: SWE-Verified 78, HLE 41, AIME 97, GPQA-D 93, MMLU-Pro 83 (Feb 2026)
  gemini-3-flash-preview: estimated from Pro-vs-Flash delta
  gemini-2.5-pro:         SWE-Verified 63.8, HLE 18.8, GPQA-D 84.0, MMLU-Pro 86
  gemini-2.5-flash:       estimated from Pro-vs-Flash delta

Context windows reflect Gemini's 1M-2M token capability.

LiveCodeBench Pro Elo (2439 for Gemini 3 Pro) isn't in the 0-100 percent schema — skipped rather than forced. Future: add an arena_elo-style LCB Elo dimension to the schema if we start routing on it.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-19 10:11:45 +02:00 · 2026-04-19 10:11:45 +02:00 · 0f0dcbf8c7
commit 0f0dcbf8c7
parent e413cf4a3f
1 changed file with 95 additions and 0 deletions
--- a/src/resources/extensions/sf/learning/data/model-benchmarks.json
+++ b/src/resources/extensions/sf/learning/data/model-benchmarks.json
@@ -808,5 +808,100 @@
    "context_window": 131072,
    "max_output_tokens": 8192,
    "context_window_source": "vendor model card (registry reported wrong value)"
+  },
+  "gemini-3.1-pro-preview": {
+    "swe_bench": null,
+    "swe_bench_verified": 78,
+    "live_code_bench": null,
+    "human_eval": null,
+    "hle": 41,
+    "aime_2026": 97,
+    "gpqa": 93,
+    "mmlu_pro": 83,
+    "bbh": null,
+    "browse_comp": null,
+    "simple_qa": null,
+    "long_context_ruler": null,
+    "arena_elo": null,
+    "instruction_following": null,
+    "source": "Gemini 3.1 Pro Preview (Feb 2026 release): incremental improvement over the Gemini 3 Pro baseline; references: DeepMind model card, SmartScope analysis",
+    "context_window": 2097152,
+    "max_output_tokens": 65536
+  },
+  "gemini-3-pro-preview": {
+    "swe_bench": null,
+    "swe_bench_verified": 76.2,
+    "live_code_bench": null,
+    "human_eval": null,
+    "hle": 37.5,
+    "aime_2026": 95,
+    "gpqa": 91.9,
+    "mmlu_pro": 81.0,
+    "bbh": null,
+    "browse_comp": null,
+    "simple_qa": null,
+    "long_context_ruler": null,
+    "arena_elo": null,
+    "instruction_following": null,
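+    "source": "Gemini 3 Pro official benchmarks (DeepMind, Vellum leaderboard): SWE-Bench Verified 76.2, HLE 37.5 (no tools), AIME 2025 95 without tools / 100 with code execution (the no-tools AIME 2025 score is recorded under aime_2026), GPQA-Diamond 91.9, MMLU-Pro 81.0, LiveCodeBench Pro Elo 2439 (not recorded: outside the 0-100 schema)",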
+    "context_window": 2097152,
+    "max_output_tokens": 65536
+  },
+  "gemini-3-flash-preview": {
+    "swe_bench": null,
+    "swe_bench_verified": 68,
+    "live_code_bench": null,
+    "human_eval": null,
+    "hle": 28,
+    "aime_2026": 88,
+    "gpqa": 85,
+    "mmlu_pro": 76,
+    "bbh": null,
+    "browse_comp": null,
+    "simple_qa": null,
+    "long_context_ruler": null,
+    "arena_elo": null,
+    "instruction_following": null,
+    "source": "Gemini 3 Flash Preview — faster/cheaper tier of Gemini 3 family. Scores estimated from published Flash-vs-Pro delta (~10pp on reasoning) applied to Gemini 3 Pro baselines",
+    "context_window": 1048576,
+    "max_output_tokens": 65536
+  },
+  "gemini-2.5-pro": {
+    "swe_bench": null,
+    "swe_bench_verified": 63.8,
+    "live_code_bench": null,
+    "human_eval": null,
+    "hle": 18.8,
+    "aime_2026": null,
+    "gpqa": 84.0,
+    "mmlu_pro": 86,
+    "bbh": null,
+    "browse_comp": null,
+    "simple_qa": null,
+    "long_context_ruler": null,
+    "arena_elo": null,
+    "instruction_following": null,
+    "source": "Gemini 2.5 Pro official model card: SWE-Bench Verified 63.8 (custom agent), HLE 18.8 (no tools), GPQA Diamond 84.0 pass@1, MMLU-Pro 86",
+    "context_window": 2097152,
+    "max_output_tokens": 65536
+  },
+  "gemini-2.5-flash": {
+    "swe_bench": null,
+    "swe_bench_verified": 52,
+    "live_code_bench": null,
+    "human_eval": null,
+    "hle": 12,
+    "aime_2026": null,
+    "gpqa": 75,
+    "mmlu_pro": 77,
+    "bbh": null,
+    "browse_comp": null,
+    "simple_qa": null,
+    "long_context_ruler": null,
+    "arena_elo": null,
+    "instruction_following": null,
+    "source": "Gemini 2.5 Flash — faster/cheaper tier, scores estimated from Pro-vs-Flash delta on published benchmarks",
+    "context_window": 1048576,
+    "max_output_tokens": 65536
  }
 }