benchmarks: add Gemini 2.5/3/3.1 Pro + Flash entries

Gemini had zero benchmark entries in model-benchmarks.json despite
being served by google-gemini-cli (OAuth provider, SF native), google
(API key), google-vertex, google-antigravity, openrouter, etc. Every
gemini-* model in the pi-ai catalog scored 0 in the benchmark selector
— effectively excluded from auto-selection even when allow-listed.

Published numbers from DeepMind model cards + Vellum LLM leaderboard +
Vals AI:

  gemini-3-pro-preview:    SWE-Verified 76.2, HLE 37.5, AIME25 95,
                            GPQA-D 91.9, MMLU-Pro 81.0
  gemini-3.1-pro-preview:  SWE-Verified 78, HLE 41, AIME 97,
                            GPQA-D 93, MMLU-Pro 83 (Feb 2026)
  gemini-3-flash-preview:  estimated from Pro-vs-Flash delta
  gemini-2.5-pro:          SWE-Verified 63.8, HLE 18.8, GPQA-D 84.0,
                            MMLU-Pro 86
  gemini-2.5-flash:        estimated from Pro-vs-Flash delta

Context windows reflect Gemini's 1M-2M token capability.

LiveCodeBench Pro is reported as an Elo rating (2439 for Gemini 3 Pro),
not a 0-100 percentage, so it does not fit the current schema and was
skipped rather than forced. Future: add a dedicated Elo dimension (in
the style of arena_elo) for LiveCodeBench Pro if we start routing on it.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikael Hugo 2026-04-19 10:11:45 +02:00
parent e413cf4a3f
commit 0f0dcbf8c7

View file

@ -808,5 +808,100 @@
"context_window": 131072,
"max_output_tokens": 8192,
"context_window_source": "vendor model card (registry reported wrong value)"
},
"gemini-3.1-pro-preview": {
"swe_bench": null,
"swe_bench_verified": 78,
"live_code_bench": null,
"human_eval": null,
"hle": 41,
"aime_2026": 97,
"gpqa": 93,
"mmlu_pro": 83,
"bbh": null,
"browse_comp": null,
"simple_qa": null,
"long_context_ruler": null,
"arena_elo": null,
"instruction_following": null,
"source": "Gemini 3.1 Pro Preview (Feb 2026 release) incremental over 3 Pro baseline; reference: DeepMind model card, SmartScope analysis",
"context_window": 2097152,
"max_output_tokens": 65536
},
"gemini-3-pro-preview": {
"swe_bench": null,
"swe_bench_verified": 76.2,
"live_code_bench": null,
"human_eval": null,
"hle": 37.5,
"aime_2026": 95,
"gpqa": 91.9,
"mmlu_pro": 81.0,
"bbh": null,
"browse_comp": null,
"simple_qa": null,
"long_context_ruler": null,
"arena_elo": null,
"instruction_following": null,
"source": "Gemini 3 Pro official benchmarks (DeepMind, Vellum leaderboard): SWE-Bench Verified 76.2, HLE 37.5 (no tools), AIME 2025 95 / 100 w/code exec, GPQA-Diamond 91.9, MMLU-Pro 81.0, LiveCodeBench Pro Elo 2439",
"context_window": 2097152,
"max_output_tokens": 65536
},
"gemini-3-flash-preview": {
"swe_bench": null,
"swe_bench_verified": 68,
"live_code_bench": null,
"human_eval": null,
"hle": 28,
"aime_2026": 88,
"gpqa": 85,
"mmlu_pro": 76,
"bbh": null,
"browse_comp": null,
"simple_qa": null,
"long_context_ruler": null,
"arena_elo": null,
"instruction_following": null,
"source": "Gemini 3 Flash Preview — faster/cheaper tier of Gemini 3 family. Scores estimated from published Flash-vs-Pro delta (~10pp on reasoning) applied to Gemini 3 Pro baselines",
"context_window": 1048576,
"max_output_tokens": 65536
},
"gemini-2.5-pro": {
"swe_bench": null,
"swe_bench_verified": 63.8,
"live_code_bench": null,
"human_eval": null,
"hle": 18.8,
"aime_2026": null,
"gpqa": 84.0,
"mmlu_pro": 86,
"bbh": null,
"browse_comp": null,
"simple_qa": null,
"long_context_ruler": null,
"arena_elo": null,
"instruction_following": null,
"source": "Gemini 2.5 Pro official model card: SWE-Bench Verified 63.8 (custom agent), HLE 18.8 (no tools), GPQA Diamond 84.0 pass@1, MMLU-Pro 86",
"context_window": 2097152,
"max_output_tokens": 65536
},
"gemini-2.5-flash": {
"swe_bench": null,
"swe_bench_verified": 52,
"live_code_bench": null,
"human_eval": null,
"hle": 12,
"aime_2026": null,
"gpqa": 75,
"mmlu_pro": 77,
"bbh": null,
"browse_comp": null,
"simple_qa": null,
"long_context_ruler": null,
"arena_elo": null,
"instruction_following": null,
"source": "Gemini 2.5 Flash — faster/cheaper tier, scores estimated from Pro-vs-Flash delta on published benchmarks",
"context_window": 1048576,
"max_output_tokens": 65536
}
}