From 0f0dcbf8c76609f6ee2cce7c4a1f8d8df32f69aa Mon Sep 17 00:00:00 2001
From: Mikael Hugo
Date: Sun, 19 Apr 2026 10:11:45 +0200
Subject: [PATCH] benchmarks: add Gemini 2.5/3/3.1 Pro + Flash entries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Gemini had zero benchmark entries in model-benchmarks.json despite
being served by google-gemini-cli (OAuth provider, SF native), google
(API key), google-vertex, google-antigravity, openrouter, etc. Every
gemini-* model in the pi-ai catalog therefore scored 0 in the benchmark
selector and was effectively excluded from auto-selection even when
allow-listed (a sketch of that selector behavior follows the diff).

Published numbers from DeepMind model cards + Vellum LLM leaderboard +
Vals AI:

  gemini-3-pro-preview:   SWE-Verified 76.2, HLE 37.5, AIME25 95,
                          GPQA-D 91.9, MMLU-Pro 81.0
  gemini-3.1-pro-preview: SWE-Verified 78, HLE 41, AIME 97, GPQA-D 93,
                          MMLU-Pro 83 (Feb 2026)
  gemini-3-flash-preview: estimated from the published Pro-vs-Flash delta
  gemini-2.5-pro:         SWE-Verified 63.8, HLE 18.8, GPQA-D 84.0,
                          MMLU-Pro 86
  gemini-2.5-flash:       estimated from the published Pro-vs-Flash delta

Context windows reflect Gemini's 1M-2M token capability.

LiveCodeBench Pro Elo (2439 for Gemini 3 Pro) doesn't fit the 0-100
percent schema, so it is skipped rather than forced in. Future: add an
arena_elo-style LCB Elo dimension to the schema if we start routing on
it (one possible normalization is sketched after the diff).

Co-Authored-By: Claude Sonnet 4.6
---
 .../sf/learning/data/model-benchmarks.json | 95 +++++++++++++++++++
 1 file changed, 95 insertions(+)

diff --git a/src/resources/extensions/sf/learning/data/model-benchmarks.json b/src/resources/extensions/sf/learning/data/model-benchmarks.json
index 55cb7ede2..c8f3bc474 100644
--- a/src/resources/extensions/sf/learning/data/model-benchmarks.json
+++ b/src/resources/extensions/sf/learning/data/model-benchmarks.json
@@ -808,5 +808,100 @@
     "context_window": 131072,
     "max_output_tokens": 8192,
     "context_window_source": "vendor model card (registry reported wrong value)"
+  },
+  "gemini-3.1-pro-preview": {
+    "swe_bench": null,
+    "swe_bench_verified": 78,
+    "live_code_bench": null,
+    "human_eval": null,
+    "hle": 41,
+    "aime_2026": 97,
+    "gpqa": 93,
+    "mmlu_pro": 83,
+    "bbh": null,
+    "browse_comp": null,
+    "simple_qa": null,
+    "long_context_ruler": null,
+    "arena_elo": null,
+    "instruction_following": null,
+    "source": "Gemini 3.1 Pro Preview (Feb 2026 release) incremental over 3 Pro baseline; reference: DeepMind model card, SmartScope analysis",
+    "context_window": 2097152,
+    "max_output_tokens": 65536
+  },
+  "gemini-3-pro-preview": {
+    "swe_bench": null,
+    "swe_bench_verified": 76.2,
+    "live_code_bench": null,
+    "human_eval": null,
+    "hle": 37.5,
+    "aime_2026": 95,
+    "gpqa": 91.9,
+    "mmlu_pro": 81.0,
+    "bbh": null,
+    "browse_comp": null,
+    "simple_qa": null,
+    "long_context_ruler": null,
+    "arena_elo": null,
+    "instruction_following": null,
+    "source": "Gemini 3 Pro official benchmarks (DeepMind, Vellum leaderboard): SWE-Bench Verified 76.2, HLE 37.5 (no tools), AIME 2025 95 / 100 w/code exec, GPQA-Diamond 91.9, MMLU-Pro 81.0, LiveCodeBench Pro Elo 2439",
+    "context_window": 2097152,
+    "max_output_tokens": 65536
+  },
+  "gemini-3-flash-preview": {
+    "swe_bench": null,
+    "swe_bench_verified": 68,
+    "live_code_bench": null,
+    "human_eval": null,
+    "hle": 28,
+    "aime_2026": 88,
+    "gpqa": 85,
+    "mmlu_pro": 76,
+    "bbh": null,
+    "browse_comp": null,
+    "simple_qa": null,
+    "long_context_ruler": null,
+    "arena_elo": null,
+    "instruction_following": null,
+    "source": "Gemini 3 Flash Preview — faster/cheaper tier of Gemini 3 family. Scores estimated from published Flash-vs-Pro delta (~10pp on reasoning) applied to Gemini 3 Pro baselines",
+    "context_window": 1048576,
+    "max_output_tokens": 65536
+  },
+  "gemini-2.5-pro": {
+    "swe_bench": null,
+    "swe_bench_verified": 63.8,
+    "live_code_bench": null,
+    "human_eval": null,
+    "hle": 18.8,
+    "aime_2026": null,
+    "gpqa": 84.0,
+    "mmlu_pro": 86,
+    "bbh": null,
+    "browse_comp": null,
+    "simple_qa": null,
+    "long_context_ruler": null,
+    "arena_elo": null,
+    "instruction_following": null,
+    "source": "Gemini 2.5 Pro official model card: SWE-Bench Verified 63.8 (custom agent), HLE 18.8 (no tools), GPQA Diamond 84.0 pass@1, MMLU-Pro 86",
+    "context_window": 2097152,
+    "max_output_tokens": 65536
+  },
+  "gemini-2.5-flash": {
+    "swe_bench": null,
+    "swe_bench_verified": 52,
+    "live_code_bench": null,
+    "human_eval": null,
+    "hle": 12,
+    "aime_2026": null,
+    "gpqa": 75,
+    "mmlu_pro": 77,
+    "bbh": null,
+    "browse_comp": null,
+    "simple_qa": null,
+    "long_context_ruler": null,
+    "arena_elo": null,
+    "instruction_following": null,
+    "source": "Gemini 2.5 Flash — faster/cheaper tier, scores estimated from Pro-vs-Flash delta on published benchmarks",
+    "context_window": 1048576,
+    "max_output_tokens": 65536
   }
 }
\ No newline at end of file
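
For reviewers, a minimal sketch of the selector behavior described in
the commit message, assuming the selector averages whatever 0-100
metric fields an entry has and treats a missing entry as empty. The
names here (BenchmarkEntry, scoreModel, METRIC_FIELDS) are
illustrative, not the actual SF implementation:

    // Assumed scoring shape: average the 0-100 metrics that exist.
    type BenchmarkEntry = Record<string, unknown>;

    const METRIC_FIELDS = [
      "swe_bench_verified", "hle", "aime_2026", "gpqa", "mmlu_pro",
    ];

    function scoreModel(
      benchmarks: Record<string, BenchmarkEntry>,
      modelId: string,
    ): number {
      const entry = benchmarks[modelId] ?? {}; // absent model -> empty entry
      const values = METRIC_FIELDS
        .map((field) => entry[field])
        .filter((v): v is number => typeof v === "number"); // nulls drop out
      if (values.length === 0) return 0; // no usable data -> score 0
      return values.reduce((a, b) => a + b, 0) / values.length;
    }

Under that assumption, scoreModel(benchmarks, "gemini-3-pro-preview")
returned 0 before this patch, so any model with even a single benchmark
row outranked every gemini-* model.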
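
On the "future" note about an Elo dimension: one possible way to fold
an Elo-style metric such as LiveCodeBench Pro Elo into the existing
0-100 schema is a linear rescale between anchor ratings. The anchors
below (1000/3000) are assumptions for illustration, not published
calibration points:

    // Hypothetical rescale: clamp to [floor, ceil], then map to 0-100.
    function eloToPercent(elo: number, floor = 1000, ceil = 3000): number {
      const clamped = Math.min(Math.max(elo, floor), ceil);
      return ((clamped - floor) / (ceil - floor)) * 100;
    }

    // eloToPercent(2439) = (1439 / 2000) * 100 = 71.95

The result is only comparable across models if every Elo entry uses the
same anchors, which is part of why this patch skips the metric instead
of forcing it into the percent schema.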