diff --git a/src/resources/extensions/sf/learning/data/model-benchmarks.json b/src/resources/extensions/sf/learning/data/model-benchmarks.json index 55cb7ede2..c8f3bc474 100644 --- a/src/resources/extensions/sf/learning/data/model-benchmarks.json +++ b/src/resources/extensions/sf/learning/data/model-benchmarks.json @@ -808,5 +808,100 @@ "context_window": 131072, "max_output_tokens": 8192, "context_window_source": "vendor model card (registry reported wrong value)" + }, + "gemini-3.1-pro-preview": { + "swe_bench": null, + "swe_bench_verified": 78, + "live_code_bench": null, + "human_eval": null, + "hle": 41, + "aime_2026": 97, + "gpqa": 93, + "mmlu_pro": 83, + "bbh": null, + "browse_comp": null, + "simple_qa": null, + "long_context_ruler": null, + "arena_elo": null, + "instruction_following": null, + "source": "Gemini 3.1 Pro Preview (Feb 2026 release) incremental over 3 Pro baseline; reference: DeepMind model card, SmartScope analysis", + "context_window": 2097152, + "max_output_tokens": 65536 + }, + "gemini-3-pro-preview": { + "swe_bench": null, + "swe_bench_verified": 76.2, + "live_code_bench": null, + "human_eval": null, + "hle": 37.5, + "aime_2026": 95, + "gpqa": 91.9, + "mmlu_pro": 81.0, + "bbh": null, + "browse_comp": null, + "simple_qa": null, + "long_context_ruler": null, + "arena_elo": null, + "instruction_following": null, + "source": "Gemini 3 Pro official benchmarks (DeepMind, Vellum leaderboard): SWE-Bench Verified 76.2, HLE 37.5 (no tools), AIME 2025 95 / 100 w/code exec, GPQA-Diamond 91.9, MMLU-Pro 81.0, LiveCodeBench Pro Elo 2439", + "context_window": 2097152, + "max_output_tokens": 65536 + }, + "gemini-3-flash-preview": { + "swe_bench": null, + "swe_bench_verified": 68, + "live_code_bench": null, + "human_eval": null, + "hle": 28, + "aime_2026": 88, + "gpqa": 85, + "mmlu_pro": 76, + "bbh": null, + "browse_comp": null, + "simple_qa": null, + "long_context_ruler": null, + "arena_elo": null, + "instruction_following": null, + "source": "Gemini 3 Flash Preview — faster/cheaper tier of Gemini 3 family. Scores estimated from published Flash-vs-Pro delta (~10pp on reasoning) applied to Gemini 3 Pro baselines", + "context_window": 1048576, + "max_output_tokens": 65536 + }, + "gemini-2.5-pro": { + "swe_bench": null, + "swe_bench_verified": 63.8, + "live_code_bench": null, + "human_eval": null, + "hle": 18.8, + "aime_2026": null, + "gpqa": 84.0, + "mmlu_pro": 86, + "bbh": null, + "browse_comp": null, + "simple_qa": null, + "long_context_ruler": null, + "arena_elo": null, + "instruction_following": null, + "source": "Gemini 2.5 Pro official model card: SWE-Bench Verified 63.8 (custom agent), HLE 18.8 (no tools), GPQA Diamond 84.0 pass@1, MMLU-Pro 86", + "context_window": 2097152, + "max_output_tokens": 65536 + }, + "gemini-2.5-flash": { + "swe_bench": null, + "swe_bench_verified": 52, + "live_code_bench": null, + "human_eval": null, + "hle": 12, + "aime_2026": null, + "gpqa": 75, + "mmlu_pro": 77, + "bbh": null, + "browse_comp": null, + "simple_qa": null, + "long_context_ruler": null, + "arena_elo": null, + "instruction_following": null, + "source": "Gemini 2.5 Flash — faster/cheaper tier, scores estimated from Pro-vs-Flash delta on published benchmarks", + "context_window": 1048576, + "max_output_tokens": 65536 } } \ No newline at end of file