#!/usr/bin/env node
import { spawnSync } from "node:child_process";
import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
import { homedir } from "node:os";
import { dirname, join, resolve } from "node:path";
import { performance } from "node:perf_hooks";
// Repository root: the parent of the directory that contains this script.
const repoRoot = resolve(import.meta.dirname, "..");

// Default report path: <repo>/.sf/model-benchmarks/<ISO timestamp>.json.
// ":" and "." in the timestamp are replaced with "-" for the file name.
const defaultOutputPath = resolve(
  repoRoot,
  ".sf",
  "model-benchmarks",
  `${new Date().toISOString().replace(/[:.]/g, "-")}.json`,
);

// Command-line options. Both camelCase and kebab-case spellings are accepted
// for the numeric limits; --models and --model are synonyms.
const args = parseArgs(process.argv.slice(2));
const modelsArg = args.models ?? args.model;
const outputPath = resolve(args.output ?? defaultOutputPath);

// May be NaN when the option is not numeric; the consumer of maxModels is
// expected to handle that case.
const maxModels = Number.parseInt(
  args.maxModels ?? args["max-models"] ?? "8",
  10,
);

// A non-numeric or non-positive --max-tokens would otherwise reach the
// request options as NaN/<=0, so fall back to the default instead.
const DEFAULT_MAX_TOKENS = 420;
const requestedMaxTokens = Number.parseInt(
  args.maxTokens ?? args["max-tokens"] ?? String(DEFAULT_MAX_TOKENS),
  10,
);
if (!(requestedMaxTokens > 0)) {
  console.warn(
    `ignoring invalid --max-tokens value; using ${DEFAULT_MAX_TOKENS}`,
  );
}
const maxTokens =
  requestedMaxTokens > 0 ? requestedMaxTokens : DEFAULT_MAX_TOKENS;

// The scoped environment is loaded before the AI package is imported.
// NOTE(review): presumably so provider credentials are in process.env when the
// package is evaluated — confirm before reordering.
await loadSfScopedEnv();
const { getModel, streamSimpleOpenAICompletions } = await import(
  "../packages/ai/src/index.ts"
);

// Models to benchmark, as "provider/model" ids: either the comma-separated
// list from the command line or the built-in default set.
const modelIds = modelsArg
  ? modelsArg
      .split(",")
      .map((s) => s.trim())
      .filter(Boolean)
  : [
      "kimi-coding/kimi-k2.6",
      "minimax/MiniMax-M2.7-highspeed",
      "zai/glm-4.5",
      "mistral/devstral-latest",
      "alibaba-coding-plan/qwen3-coder-plus",
      "xiaomi/mimo-v2-pro",
      "opencode-go/minimax-m2.7",
      "openrouter/inclusionai/ling-2.6-1t:free",
    ];
// Benchmark tasks. Each task has:
//   id        - short name used in the report and the console output
//   maxTokens - completion token limit passed with the request
//   prompt    - the user message sent to the model
//   check     - predicate over the full response text; true means PASS
const tasks = [
  {
    id: "json-repair",
    // This task needs only a short answer, so it is capped at 280 tokens.
    maxTokens: Math.min(maxTokens, 280),
    prompt: `Return ONLY valid JSON matching { "bug": string, "fix": string, "tests": string[] }.
Broken payload: { "bug": "path traversal\\n- accepts ../foo", "fix": 123, "tests": "none" }.
Normalize it semantically; no markdown.`,
    check: (text) => {
      try {
        const parsed = JSON.parse(text);
        return (
          typeof parsed.bug === "string" &&
          typeof parsed.fix === "string" &&
          Array.isArray(parsed.tests)
        );
      } catch {
        // Not parseable as JSON (or not an object): the task is failed.
        return false;
      }
    },
  },
  {
    id: "path-debug",
    maxTokens,
    prompt: `Find the bug and propose the minimal patch. Code:
function isSafe(base, target) {
const resolved = path.resolve(base, target)
return resolved.startsWith(base)
}
Explain why it is unsafe in <= 8 bullets, then provide a corrected JS function.`,
    // Keyword heuristic: the answer must mention the prefix check and one of
    // the usual remedies.
    check: (text) =>
      /startsWith|prefix/i.test(text) &&
      /path\.sep|relative|normalize|resolve/i.test(text),
  },
  {
    id: "routing-plan",
    maxTokens,
    prompt: `Produce a concise implementation plan with risks and verification for migrating an LLM routing table from alias k2p5 to semantic ids kimi-k2.5 and kimi-k2.6.`,
    // Keyword heuristic: both target ids and some form of verification must
    // appear in the answer.
    check: (text) =>
      /kimi-k2\.5/.test(text) &&
      /kimi-k2\.6/.test(text) &&
      /test|verify|validation/i.test(text),
  },
];
// Keep at most maxModels entries; when maxModels is not a finite number the
// whole list is used.
const selectedModels = modelIds.slice(
  0,
  Number.isFinite(maxModels) ? maxModels : modelIds.length,
);

// One record per (model, task) pair, plus one record per model that could not
// be run at all.
const results = [];
for (const fullId of selectedModels) {
  // Ids are "provider/model"; only the first "/" separates the provider, so
  // the model part may itself contain slashes.
  const slash = fullId.indexOf("/");
  if (slash === -1) {
    results.push({
      model: fullId,
      ok: false,
      error: "expected provider/model id",
    });
    console.log(`SKIP ${fullId} expected provider/model id`);
    continue;
  }
  const provider = fullId.slice(0, slash);
  const modelId = fullId.slice(slash + 1);
  const model = getModel(provider, modelId);
  if (!model) {
    results.push({
      model: fullId,
      ok: false,
      error: "model not found in registry",
    });
    console.log(`SKIP ${fullId} model not found in registry`);
    continue;
  }

  // Tasks run one after another so each elapsed time covers a single request.
  for (const task of tasks) {
    const started = performance.now();
    let text = "";
    let result;
    try {
      const stream = streamSimpleOpenAICompletions(
        model,
        {
          systemPrompt:
            "You are a precise software engineering benchmark model. Follow requested output formats exactly.",
          messages: [
            { role: "user", content: task.prompt, timestamp: Date.now() },
          ],
        },
        { temperature: 0, maxTokens: task.maxTokens },
      );
      // Only text deltas are accumulated; other event types are ignored.
      for await (const event of stream) {
        if (event.type === "text_delta") text += event.delta;
      }
      result = await stream.result();
    } catch (error) {
      // A thrown error fails this task only; the run continues.
      const elapsedMs = Math.round(performance.now() - started);
      const message = error instanceof Error ? error.message : String(error);
      results.push({
        model: fullId,
        task: task.id,
        ok: false,
        elapsedMs,
        error: message,
      });
      console.log(`FAIL ${fullId} ${task.id} ${elapsedMs}ms error: ${message}`);
      continue;
    }

    const elapsedMs = Math.round(performance.now() - started);
    // A response that ended in an error never passes, whatever its text.
    const passed = result.stopReason !== "error" && task.check(text);
    results.push({
      model: fullId,
      task: task.id,
      ok: passed,
      stopReason: result.stopReason,
      errorMessage: result.errorMessage,
      elapsedMs,
      chars: text.length,
      usage: result.usage,
      // Keep only the beginning of the response in the report.
      sample: text.slice(0, 700),
    });
    console.log(
      `${passed ? "PASS" : "FAIL"} ${fullId} ${task.id} ${elapsedMs}ms ${result.stopReason}`,
    );
  }
}

// Write the report as pretty-printed JSON, creating the directory if needed.
const report = {
  createdAt: new Date().toISOString(),
  models: selectedModels,
  tasks: tasks.map((t) => t.id),
  results,
};
mkdirSync(dirname(outputPath), { recursive: true });
writeFileSync(outputPath, `${JSON.stringify(report, null, 2)}\n`);
console.log(`wrote ${outputPath}`);
/**
 * Parse long command-line options into a plain object of strings.
 *
 * Supported forms:
 *   --key value   -> { key: "value" }
 *   --key=value   -> { key: "value" }
 *   --flag        -> { flag: "true" } (no value, or the next token is an option)
 *
 * Tokens that do not start with "--" and are not consumed as a value are
 * ignored. When an option is repeated, the last occurrence wins.
 *
 * @param {string[]} argv - Arguments, typically `process.argv.slice(2)`.
 * @returns {Record<string, string>} Option names (without "--") mapped to
 *   their string values.
 */
function parseArgs(argv) {
  const parsed = {};
  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    if (!arg.startsWith("--")) continue;
    const key = arg.slice(2);

    // Inline form: split on the first "=" so the value may contain "=".
    const eq = key.indexOf("=");
    if (eq > 0) {
      parsed[key.slice(0, eq)] = key.slice(eq + 1);
      continue;
    }

    const next = argv[i + 1];
    // Compare against undefined rather than using truthiness so that an
    // explicit empty-string value is kept instead of becoming "true".
    if (next === undefined || next.startsWith("--")) {
      parsed[key] = "true";
    } else {
      parsed[key] = next;
      i++; // the value token has been consumed
    }
  }
  return parsed;
}
/**
 * Replace the sf-scoped environment variables of this process with the values
 * stored in the sops-encrypted secrets file.
 *
 * Steps:
 *  1. Delete from `process.env` every name listed by `readSfScopedEnvNames`
 *     for the `~/.local/bin/sf` wrapper.
 *  2. Decrypt `~/.dotfiles/secrets/api-keys.yaml` with `sops`.
 *  3. Run `yq -r` over the decrypted document to print `KEY=VALUE` lines.
 *  4. Copy each line whose key is a valid identifier into `process.env`.
 *
 * Best effort: if `sops` or `yq` fails or prints nothing, the function returns
 * after step 1 without setting anything. Stderr of both tools is discarded.
 *
 * @returns {Promise<void>}
 */
async function loadSfScopedEnv() {
  const home = homedir();
  const secretsFile = join(home, ".dotfiles", "secrets", "api-keys.yaml");
  const sopsConfig = join(home, ".dotfiles", ".sops.yaml");
  const wrapperPath = join(home, ".local", "bin", "sf");

  // Clear the scoped names first so stale values never survive a failed load.
  const envNames = readSfScopedEnvNames(wrapperPath);
  for (const name of envNames) delete process.env[name];

  const decrypted = spawnSync(
    "sops",
    ["--config", sopsConfig, "-d", secretsFile],
    {
      encoding: "utf8",
      stdio: ["ignore", "pipe", "ignore"],
    },
  );
  if (decrypted.status !== 0 || !decrypted.stdout) return;

  // jq-style filter emitting one KEY=VALUE line per entry from three places:
  //   .sf                   - scalar (string/number/boolean) entries only
  //   .sf.env               - every non-empty entry
  //   .sf.providers[*].env  - every non-empty entry of each provider
  // "\\(" in this template literal reaches yq as "\(" (string interpolation).
  const filter = `(
  (.sf // {} | to_entries[]
    | select((.value | type) == "string" or (.value | type) == "number" or (.value | type) == "boolean")
    | select(.value != null and .value != "")
    | "\\(.key)=\\(.value)"),
  (.sf.env // {} | to_entries[]
    | select(.value != null and .value != "")
    | "\\(.key)=\\(.value)"),
  (.sf.providers // {} | to_entries[]
    | (.value.env // {})
    | to_entries[]
    | select(.value != null and .value != "")
    | "\\(.key)=\\(.value)")
)`;
  const extracted = spawnSync("yq", ["-r", filter], {
    input: decrypted.stdout,
    encoding: "utf8",
    stdio: ["pipe", "pipe", "ignore"],
  });
  if (extracted.status !== 0 || !extracted.stdout) return;

  for (const line of extracted.stdout.split(/\r?\n/)) {
    // Split on the first "=" only; the value may itself contain "=".
    const idx = line.indexOf("=");
    if (idx <= 0) continue;
    const key = line.slice(0, idx);
    const value = line.slice(idx + 1);
    // Only identifier-shaped keys with a non-empty value are exported.
    if (/^[A-Za-z_][A-Za-z0-9_]*$/.test(key) && value) process.env[key] = value;
  }
}
/**
 * Read the variable names listed in the `sf_scoped_env=( ... )` array of a
 * shell wrapper script.
 *
 * The array must open with `sf_scoped_env=(` followed by a newline and close
 * with `)` at the start of a line. Each line in between is trimmed and kept
 * only if it consists solely of `A-Z`, `0-9` and `_`.
 *
 * @param {string} wrapperPath - Path of the wrapper script to read.
 * @returns {string[]} The names in file order; an empty array when the file
 *   cannot be read or contains no such array.
 */
function readSfScopedEnvNames(wrapperPath) {
  let source;
  try {
    source = readFileSync(wrapperPath, "utf8");
  } catch {
    // Best effort: a missing or unreadable wrapper means nothing is scoped.
    return [];
  }

  // Accept both LF and CRLF line endings around the array body.
  const match = source.match(/sf_scoped_env=\(\r?\n([\s\S]*?)\r?\n\)/);
  if (!match) return [];

  return match[1]
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => /^[A-Z0-9_]+$/.test(line));
}