2026-04-30 10:19:18 +02:00
#!/usr/bin/env node
import { readFileSync , writeFileSync , mkdirSync } from "node:fs" ;
import { homedir } from "node:os" ;
import { dirname , resolve } from "node:path" ;
import { spawnSync } from "node:child_process" ;
import { performance } from "node:perf_hooks" ;
// ---------------------------------------------------------------------------
// CLI configuration and one-time setup.
//
// Recognised flags (parsed by parseArgs):
//   --models / --model            comma-separated "provider/model" ids
//   --output                      report path (default: timestamped file under
//                                 <repo>/.sf/model-benchmarks/)
//   --maxModels / --max-models    cap on the number of models (default "8")
//   --maxTokens / --max-tokens    per-request token budget (default "420")
// ---------------------------------------------------------------------------
const repoRoot = resolve(import.meta.dirname, "..");

// ":" and "." in the ISO timestamp are replaced with "-" to form the file name.
const defaultOutputPath = resolve(
  repoRoot,
  ".sf",
  "model-benchmarks",
  `${new Date().toISOString().replace(/[:.]/g, "-")}.json`,
);

const args = parseArgs(process.argv.slice(2));
const modelsArg = args.models ?? args.model;
const outputPath = resolve(args.output ?? defaultOutputPath);

// A non-numeric --max-models yields NaN here on purpose: the model selection
// further down treats a non-finite cap as "no cap".
const maxModels = Number.parseInt(args.maxModels ?? args["max-models"] ?? "8", 10);

// The token budget is forwarded to every request, so reject values that do not
// parse to a positive integer instead of letting NaN reach the request options.
const rawMaxTokens = args.maxTokens ?? args["max-tokens"] ?? "420";
const maxTokens = Number.parseInt(rawMaxTokens, 10);
if (!Number.isInteger(maxTokens) || maxTokens <= 0) {
  throw new Error(`--max-tokens must be a positive integer, got "${rawMaxTokens}"`);
}

// Environment is populated before the model library is imported.
// NOTE(review): presumably the library reads provider credentials from
// process.env — confirm against packages/pi-ai.
await loadSfScopedEnv();
const { getModel, streamSimpleOpenAICompletions } = await import("../packages/pi-ai/src/index.ts");
// Built-in benchmark targets, each written as "provider/model".
const defaultModelIds = [
  "kimi-coding/kimi-k2.6",
  "minimax/MiniMax-M2.7-highspeed",
  "zai/glm-4.5",
  "mistral/devstral-latest",
  "alibaba-coding-plan/qwen3-coder-plus",
  "xiaomi/mimo-v2-pro",
  "opencode-go/minimax-m2.7",
  "openrouter/inclusionai/ling-2.6-1t:free",
];

// A non-empty --models/--model value replaces the built-in list. It is split on
// commas, each entry is trimmed, and empty entries are dropped.
const modelIds = modelsArg
  ? modelsArg.split(",").flatMap((entry) => {
      const id = entry.trim();
      return id ? [id] : [];
    })
  : defaultModelIds;
// Benchmark tasks. Each entry has:
//   id        - short name used in console output and in the report
//   maxTokens - token budget passed to the request for this task
//   prompt    - user message sent to the model
//   check     - (text) => boolean; heuristic pass/fail test on the response text
const tasks = [
  {
    id: "json-repair",
    // This task never gets more than 280 tokens, even with a larger global budget.
    maxTokens: Math.min(maxTokens, 280),
    prompt: `Return ONLY valid JSON matching { "bug": string, "fix": string, "tests": string[] }.
Broken payload: { "bug": "path traversal\\n- accepts ../foo", "fix": 123, "tests": "none" }.
Normalize it semantically; no markdown.`,
    // Passes only when the whole response parses as JSON with the requested
    // shape, including that every element of `tests` is a string.
    check: (text) => {
      try {
        const parsed = JSON.parse(text);
        return (
          parsed !== null &&
          typeof parsed === "object" &&
          typeof parsed.bug === "string" &&
          typeof parsed.fix === "string" &&
          Array.isArray(parsed.tests) &&
          parsed.tests.every((entry) => typeof entry === "string")
        );
      } catch {
        // Not valid JSON (e.g. wrapped in markdown or prose) counts as a failure.
        return false;
      }
    },
  },
  {
    id: "path-debug",
    maxTokens,
    prompt: `Find the bug and propose the minimal patch. Code:
function isSafe(base, target) {
  const resolved = path.resolve(base, target)
  return resolved.startsWith(base)
}
Explain why it is unsafe in <= 8 bullets, then provide a corrected JS function.`,
    // Keyword heuristic: the answer must mention the prefix check and at least
    // one of the path-handling terms listed in the second pattern.
    check: (text) =>
      /startsWith|prefix/i.test(text) && /path\.sep|relative|normalize|resolve/i.test(text),
  },
  {
    id: "routing-plan",
    maxTokens,
    prompt: `Produce a concise implementation plan with risks and verification for migrating an LLM routing table from alias k2p5 to semantic ids kimi-k2.5 and kimi-k2.6.`,
    // Keyword heuristic: both target ids must appear, plus some verification wording.
    check: (text) =>
      /kimi-k2\.5/.test(text) && /kimi-k2\.6/.test(text) && /test|verify|validation/i.test(text),
  },
];
// Benchmark driver: runs every task against every selected model, one request
// at a time. Each outcome appends one record to `results` and prints one line,
// so skipped models and thrown requests are as visible on the console as
// PASS/FAIL outcomes.

// Cap the run at --max-models entries; a non-numeric cap means "all models".
const selectedModels = modelIds.slice(0, Number.isFinite(maxModels) ? maxModels : modelIds.length);
const results = [];

for (const fullId of selectedModels) {
  // Ids are "provider/model"; only the first "/" separates the two parts, so
  // the model part may itself contain slashes.
  const slash = fullId.indexOf("/");
  if (slash === -1) {
    const error = "expected provider/model id";
    results.push({ model: fullId, ok: false, error });
    console.log(`SKIP ${fullId} ${error}`);
    continue;
  }

  const provider = fullId.slice(0, slash);
  const modelId = fullId.slice(slash + 1);
  const model = getModel(provider, modelId);
  if (!model) {
    const error = "model not found in registry";
    results.push({ model: fullId, ok: false, error });
    console.log(`SKIP ${fullId} ${error}`);
    continue;
  }

  for (const task of tasks) {
    const started = performance.now();
    let text = "";
    let result;
    try {
      const stream = streamSimpleOpenAICompletions(
        model,
        {
          systemPrompt: "You are a precise software engineering benchmark model. Follow requested output formats exactly.",
          messages: [{ role: "user", content: task.prompt, timestamp: Date.now() }],
        },
        { temperature: 0, maxTokens: task.maxTokens },
      );
      // Accumulate only text deltas; every other event type is ignored.
      for await (const event of stream) {
        if (event.type === "text_delta") text += event.delta;
      }
      result = await stream.result();
    } catch (error) {
      // A thrown request is recorded as a failed attempt and the run continues
      // with the next task.
      const elapsedMs = Math.round(performance.now() - started);
      const message = error instanceof Error ? error.message : String(error);
      results.push({
        model: fullId,
        task: task.id,
        ok: false,
        elapsedMs,
        error: message,
      });
      console.log(`FAIL ${fullId} ${task.id} ${elapsedMs}ms error: ${message}`);
      continue;
    }

    const elapsedMs = Math.round(performance.now() - started);
    // A response passes only if the stream did not end in an error and the
    // task's own check accepts the accumulated text.
    const passed = result.stopReason !== "error" && task.check(text);
    results.push({
      model: fullId,
      task: task.id,
      ok: passed,
      stopReason: result.stopReason,
      errorMessage: result.errorMessage,
      elapsedMs,
      chars: text.length,
      usage: result.usage,
      // Keep only the first 700 characters of the response in the report.
      sample: text.slice(0, 700),
    });
    console.log(`${passed ? "PASS" : "FAIL"} ${fullId} ${task.id} ${elapsedMs}ms ${result.stopReason}`);
  }
}
// Write the collected results as pretty-printed JSON (2-space indent, trailing
// newline), creating the output directory first if it does not exist.
const taskIds = [];
for (const { id } of tasks) taskIds.push(id);

const report = {
  createdAt: new Date().toISOString(),
  models: selectedModels,
  tasks: taskIds,
  results,
};

const reportDir = dirname(outputPath);
const reportJson = JSON.stringify(report, null, 2);
mkdirSync(reportDir, { recursive: true });
writeFileSync(outputPath, `${reportJson}\n`);
console.log(`wrote ${outputPath}`);
/**
 * Minimal `--flag` parser for this script.
 *
 * Supported forms:
 *   --key value   -> { key: "value" }
 *   --key=value   -> { key: "value" }   (split at the first "=")
 *   --key         -> { key: "true" }    (no value, or the next token is a flag)
 *
 * Tokens that do not start with "--" and are not consumed as a value are
 * ignored. All values are strings; later occurrences of a key overwrite
 * earlier ones.
 *
 * @param {string[]} argv - Arguments without the node/script prefix.
 * @returns {Record<string, string>} Map of flag name (without "--") to value.
 */
function parseArgs(argv) {
  const parsed = {};
  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    if (!arg.startsWith("--")) continue;
    const key = arg.slice(2);

    // Inline form: everything after the first "=" is the value, so values may
    // themselves contain "=". A leading "=" (empty key) is not treated as inline.
    const eq = key.indexOf("=");
    if (eq > 0) {
      parsed[key.slice(0, eq)] = key.slice(eq + 1);
      continue;
    }

    const next = argv[i + 1];
    if (!next || next.startsWith("--")) {
      // Bare flag: no following value to consume.
      parsed[key] = "true";
    } else {
      parsed[key] = next;
      i++; // Skip the token just consumed as this flag's value.
    }
  }
  return parsed;
}
/**
 * Populate process.env from the sops-encrypted secrets file, best effort.
 *
 * Steps:
 *   1. Remove from process.env every name listed in the `sf` wrapper script's
 *      `sf_scoped_env` array (see readSfScopedEnvNames).
 *   2. Decrypt ~/.dotfiles/secrets/api-keys.yaml with `sops`.
 *   3. Pipe the decrypted YAML through `yq` to emit KEY=VALUE lines.
 *   4. Assign each line with a valid identifier key and non-empty value to
 *      process.env.
 *
 * If sops or yq fails or prints nothing, the function returns after step 1
 * without setting anything. A warning is printed in that case, because the
 * scoped variables have already been removed and later requests may fail for
 * lack of credentials. Secret values are never logged.
 *
 * @returns {Promise<void>}
 */
async function loadSfScopedEnv() {
  const home = homedir();
  const secretsFile = resolve(home, ".dotfiles", "secrets", "api-keys.yaml");
  const sopsConfig = resolve(home, ".dotfiles", ".sops.yaml");
  const wrapperPath = resolve(home, ".local", "bin", "sf");

  // Clear the scoped names before loading the secrets file.
  // NOTE(review): presumably this keeps ambient values from leaking into the
  // run — confirm that clearing before a successful decrypt is intended.
  const envNames = readSfScopedEnvNames(wrapperPath);
  for (const name of envNames) delete process.env[name];

  // The child's stderr is discarded; only its stdout (the plaintext YAML) is captured.
  const decrypted = spawnSync("sops", ["--config", sopsConfig, "-d", secretsFile], {
    encoding: "utf8",
    stdio: ["ignore", "pipe", "ignore"],
  });
  if (decrypted.status !== 0 || !decrypted.stdout) {
    console.warn(`loadSfScopedEnv: could not decrypt ${secretsFile} with sops; no secrets loaded`);
    return;
  }

  // The filter emits KEY=VALUE for three places in the document:
  //   - scalar (string/number/boolean) entries directly under .sf
  //   - every entry under .sf.env
  //   - every entry under .sf.providers.<name>.env
  // Null and empty-string values are skipped in all three.
  const filter = `(
  (.sf // {} | to_entries[]
    | select((.value | type) == "string" or (.value | type) == "number" or (.value | type) == "boolean")
    | select(.value != null and .value != "")
    | "\\(.key)=\\(.value)"),
  (.sf.env // {} | to_entries[]
    | select(.value != null and .value != "")
    | "\\(.key)=\\(.value)"),
  (.sf.providers // {} | to_entries[]
    | (.value.env // {})
    | to_entries[]
    | select(.value != null and .value != "")
    | "\\(.key)=\\(.value)")
)`;
  // The plaintext is passed on stdin rather than written to disk.
  const extracted = spawnSync("yq", ["-r", filter], {
    input: decrypted.stdout,
    encoding: "utf8",
    stdio: ["pipe", "pipe", "ignore"],
  });
  if (extracted.status !== 0 || !extracted.stdout) {
    console.warn("loadSfScopedEnv: yq produced no KEY=VALUE output; no secrets loaded");
    return;
  }

  for (const line of extracted.stdout.split(/\r?\n/)) {
    // Split at the first "=" so values may contain "=" themselves.
    const idx = line.indexOf("=");
    if (idx <= 0) continue;
    const key = line.slice(0, idx);
    const value = line.slice(idx + 1);
    // Only identifier-shaped keys with a non-empty value are exported.
    if (/^[A-Za-z_][A-Za-z0-9_]*$/.test(key) && value) process.env[key] = value;
  }
}
/**
 * Read the environment variable names listed in a shell script's multi-line
 * `sf_scoped_env=( ... )` array.
 *
 * The array must open with `sf_scoped_env=(` followed by a line break and close
 * with `)` on its own line (optionally indented). LF and CRLF line endings are
 * both accepted, matching the line splitter below. Only lines that, once
 * trimmed, consist solely of A-Z, 0-9 and "_" are returned; anything else
 * (comments, quoted entries, blanks) is skipped.
 *
 * @param {string} wrapperPath - Path of the script to scan.
 * @returns {string[]} Names in file order; [] if the file cannot be read or
 *   contains no such array.
 */
function readSfScopedEnvNames(wrapperPath) {
  let source;
  try {
    source = readFileSync(wrapperPath, "utf8");
  } catch {
    // Best effort: a missing or unreadable wrapper means nothing to report.
    return [];
  }

  // Lazy capture: stop at the first line that holds only the closing paren.
  const match = source.match(/sf_scoped_env=\(\r?\n([\s\S]*?)\r?\n[ \t]*\)/);
  if (!match) return [];

  return match[1]
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => /^[A-Z0-9_]+$/.test(line));
}