benchmarks: add Gemini 2.5/3/3.1 Pro + Flash entries
Gemini had zero benchmark entries in model-benchmarks.json despite
being served by google-gemini-cli (OAuth provider, SF native), google
(API key), google-vertex, google-antigravity, openrouter, etc. Every
gemini-* model in the pi-ai catalog scored 0 in the benchmark selector
— effectively excluded from auto-selection even when allow-listed.
Published numbers are taken from DeepMind model cards, the Vellum LLM
leaderboard, and Vals AI:
gemini-3-pro-preview: SWE-Verified 76.2, HLE 37.5, AIME 2025 95,
GPQA-D 91.9, MMLU-Pro 81.0
gemini-3.1-pro-preview: SWE-Verified 78, HLE 41, AIME 97,
GPQA-D 93, MMLU-Pro 83 (Feb 2026)
gemini-3-flash-preview: estimated from Pro-vs-Flash delta
gemini-2.5-pro: SWE-Verified 63.8, HLE 18.8, GPQA-D 84.0,
MMLU-Pro 86
gemini-2.5-flash: estimated from Pro-vs-Flash delta
Context windows reflect Gemini's 1M-2M token capability.
LiveCodeBench Pro Elo (2439 for Gemini 3 Pro) doesn't fit the 0-100
percent schema, so it is skipped rather than forced in. Future: add an
arena_elo-style LCB Elo dimension to the schema if we start routing on it.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
e413cf4a3f
commit
0f0dcbf8c7
1 changed file with 95 additions and 0 deletions
|
|
@@ -808,5 +808,100 @@
|
|||
"context_window": 131072,
|
||||
"max_output_tokens": 8192,
|
||||
"context_window_source": "vendor model card (registry reported wrong value)"
|
||||
},
|
||||
"gemini-3.1-pro-preview": {
|
||||
"swe_bench": null,
|
||||
"swe_bench_verified": 78,
|
||||
"live_code_bench": null,
|
||||
"human_eval": null,
|
||||
"hle": 41,
|
||||
"aime_2026": 97,
|
||||
"gpqa": 93,
|
||||
"mmlu_pro": 83,
|
||||
"bbh": null,
|
||||
"browse_comp": null,
|
||||
"simple_qa": null,
|
||||
"long_context_ruler": null,
|
||||
"arena_elo": null,
|
||||
"instruction_following": null,
|
||||
"source": "Gemini 3.1 Pro Preview (Feb 2026 release) incremental over 3 Pro baseline; reference: DeepMind model card, SmartScope analysis",
|
||||
"context_window": 2097152,
|
||||
"max_output_tokens": 65536
|
||||
},
|
||||
"gemini-3-pro-preview": {
|
||||
"swe_bench": null,
|
||||
"swe_bench_verified": 76.2,
|
||||
"live_code_bench": null,
|
||||
"human_eval": null,
|
||||
"hle": 37.5,
|
||||
"aime_2026": 95,
|
||||
"gpqa": 91.9,
|
||||
"mmlu_pro": 81.0,
|
||||
"bbh": null,
|
||||
"browse_comp": null,
|
||||
"simple_qa": null,
|
||||
"long_context_ruler": null,
|
||||
"arena_elo": null,
|
||||
"instruction_following": null,
|
||||
"source": "Gemini 3 Pro official benchmarks (DeepMind, Vellum leaderboard): SWE-Bench Verified 76.2, HLE 37.5 (no tools), AIME 2025 95 / 100 w/code exec, GPQA-Diamond 91.9, MMLU-Pro 81.0, LiveCodeBench Pro Elo 2439",
|
||||
"context_window": 2097152,
|
||||
"max_output_tokens": 65536
|
||||
},
|
||||
"gemini-3-flash-preview": {
|
||||
"swe_bench": null,
|
||||
"swe_bench_verified": 68,
|
||||
"live_code_bench": null,
|
||||
"human_eval": null,
|
||||
"hle": 28,
|
||||
"aime_2026": 88,
|
||||
"gpqa": 85,
|
||||
"mmlu_pro": 76,
|
||||
"bbh": null,
|
||||
"browse_comp": null,
|
||||
"simple_qa": null,
|
||||
"long_context_ruler": null,
|
||||
"arena_elo": null,
|
||||
"instruction_following": null,
|
||||
"source": "Gemini 3 Flash Preview — faster/cheaper tier of Gemini 3 family. Scores estimated from published Flash-vs-Pro delta (~10pp on reasoning) applied to Gemini 3 Pro baselines",
|
||||
"context_window": 1048576,
|
||||
"max_output_tokens": 65536
|
||||
},
|
||||
"gemini-2.5-pro": {
|
||||
"swe_bench": null,
|
||||
"swe_bench_verified": 63.8,
|
||||
"live_code_bench": null,
|
||||
"human_eval": null,
|
||||
"hle": 18.8,
|
||||
"aime_2026": null,
|
||||
"gpqa": 84.0,
|
||||
"mmlu_pro": 86,
|
||||
"bbh": null,
|
||||
"browse_comp": null,
|
||||
"simple_qa": null,
|
||||
"long_context_ruler": null,
|
||||
"arena_elo": null,
|
||||
"instruction_following": null,
|
||||
"source": "Gemini 2.5 Pro official model card: SWE-Bench Verified 63.8 (custom agent), HLE 18.8 (no tools), GPQA Diamond 84.0 pass@1, MMLU-Pro 86",
|
||||
"context_window": 2097152,
|
||||
"max_output_tokens": 65536
|
||||
},
|
||||
"gemini-2.5-flash": {
|
||||
"swe_bench": null,
|
||||
"swe_bench_verified": 52,
|
||||
"live_code_bench": null,
|
||||
"human_eval": null,
|
||||
"hle": 12,
|
||||
"aime_2026": null,
|
||||
"gpqa": 75,
|
||||
"mmlu_pro": 77,
|
||||
"bbh": null,
|
||||
"browse_comp": null,
|
||||
"simple_qa": null,
|
||||
"long_context_ruler": null,
|
||||
"arena_elo": null,
|
||||
"instruction_following": null,
|
||||
"source": "Gemini 2.5 Flash — faster/cheaper tier, scores estimated from Pro-vs-Flash delta on published benchmarks",
|
||||
"context_window": 1048576,
|
||||
"max_output_tokens": 65536
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Reference in a new issue