#!/usr/bin/env node
/**
 * Model benchmark runner.
 *
 * Streams a fixed set of prompts through each selected model, applies a cheap
 * textual check to every answer, prints PASS/FAIL per (model, task) pair and
 * writes a JSON report.
 *
 * Flags (either `--key value` or `--key=value`):
 *   --models / --model          comma-separated `provider/model` ids
 *   --output                    report path (default: .sf/model-benchmarks/<timestamp>.json)
 *   --maxModels / --max-models  cap on the number of models taken from the list (default 8)
 *   --maxTokens / --max-tokens  per-task completion token limit (default 420)
 */
import { spawnSync } from "node:child_process";
import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
import { homedir } from "node:os";
import { dirname, resolve } from "node:path";
import { performance } from "node:perf_hooks";

const DEFAULT_MAX_TOKENS = 420;

const repoRoot = resolve(import.meta.dirname, "..");
const defaultOutputPath = resolve(
  repoRoot,
  ".sf",
  "model-benchmarks",
  // ':' and '.' are replaced so the ISO timestamp is usable as a file name.
  `${new Date().toISOString().replace(/[:.]/g, "-")}.json`,
);

const args = parseArgs(process.argv.slice(2));
const modelsArg = args.models ?? args.model;
const outputPath = resolve(args.output ?? defaultOutputPath);
const maxModels = Number.parseInt(
  args.maxModels ?? args["max-models"] ?? "8",
  10,
);
const maxTokens = readPositiveInt(
  args.maxTokens ?? args["max-tokens"],
  DEFAULT_MAX_TOKENS,
  "max-tokens",
);

// Populate process.env before the AI package is imported.
// NOTE(review): presumably the package reads provider keys from the
// environment — confirm against packages/ai.
await loadSfScopedEnv();
const { getModel, streamSimpleOpenAICompletions } = await import(
  "../packages/ai/src/index.ts"
);

const modelIds = modelsArg
  ? modelsArg
      .split(",")
      .map((s) => s.trim())
      .filter(Boolean)
  : [
      "kimi-coding/kimi-k2.6",
      "minimax/MiniMax-M2.7-highspeed",
      "zai/glm-4.5",
      "mistral/devstral-latest",
      "alibaba-coding-plan/qwen3-coder-plus",
      "xiaomi/mimo-v2-pro",
      "opencode-go/minimax-m2.7",
      "openrouter/inclusionai/ling-2.6-1t:free",
    ];

// Each task carries its prompt, its own token limit and a boolean check that
// is applied to the full streamed text.
const tasks = [
  {
    id: "json-repair",
    maxTokens: Math.min(maxTokens, 280),
    prompt: `Return ONLY valid JSON matching { "bug": string, "fix": string, "tests": string[] }. Broken payload: {"bug":"path traversal\\n- accepts ../foo","fix":123,"tests":"none"}. Normalize it semantically; no markdown.`,
    check: (text) => {
      try {
        const parsed = JSON.parse(text);
        return (
          typeof parsed.bug === "string" &&
          typeof parsed.fix === "string" &&
          Array.isArray(parsed.tests)
        );
      } catch {
        // Non-JSON output (or a non-object such as `null`) counts as a failure.
        return false;
      }
    },
  },
  {
    id: "path-debug",
    maxTokens,
    prompt: `Find the bug and propose the minimal patch. \nCode: function isSafe(base, target) { const resolved = path.resolve(base, target) return resolved.startsWith(base) } Explain why it is unsafe in <= 8 bullets, then provide a corrected JS function.`,
    check: (text) =>
      /startsWith|prefix/i.test(text) &&
      /path\.sep|relative|normalize|resolve/i.test(text),
  },
  {
    id: "routing-plan",
    maxTokens,
    prompt: `Produce a concise implementation plan with risks and verification for migrating an LLM routing table from alias k2p5 to semantic ids kimi-k2.5 and kimi-k2.6.`,
    check: (text) =>
      /kimi-k2\.5/.test(text) &&
      /kimi-k2\.6/.test(text) &&
      /test|verify|validation/i.test(text),
  },
];

// A non-numeric or negative cap means "no cap"; a negative number must not
// reach slice(), where it would drop models from the end of the list.
const modelLimit =
  Number.isFinite(maxModels) && maxModels >= 0 ? maxModels : modelIds.length;
const selectedModels = modelIds.slice(0, modelLimit);

const results = [];
for (const fullId of selectedModels) {
  // Split on the first slash only: the model part may itself contain slashes.
  const slash = fullId.indexOf("/");
  if (slash <= 0 || slash === fullId.length - 1) {
    results.push({
      model: fullId,
      ok: false,
      error: "expected provider/model id",
    });
    continue;
  }
  const provider = fullId.slice(0, slash);
  const modelId = fullId.slice(slash + 1);
  const model = getModel(provider, modelId);
  if (!model) {
    results.push({
      model: fullId,
      ok: false,
      error: "model not found in registry",
    });
    continue;
  }
  // Tasks run one after another so the elapsed time of one request is not
  // measured while another request is in flight.
  for (const task of tasks) {
    results.push(await runTask(model, fullId, task));
  }
}

const report = {
  createdAt: new Date().toISOString(),
  models: selectedModels,
  tasks: tasks.map((t) => t.id),
  results,
};
mkdirSync(dirname(outputPath), { recursive: true });
writeFileSync(outputPath, `${JSON.stringify(report, null, 2)}\n`);
console.log(`wrote ${outputPath}`);

/**
 * Runs one task against one model and returns the result record for the report.
 *
 * A thrown error yields a record with `ok: false` and the error text. A
 * completed stream yields a record with the stop reason, usage and the first
 * 700 characters of output, and also prints a PASS/FAIL line.
 *
 * @param {object} model - Model object returned by `getModel`.
 * @param {string} fullId - The `provider/model` id, used for reporting only.
 * @param {{ id: string, maxTokens: number, prompt: string, check: (text: string) => boolean }} task
 * @returns {Promise<object>} The result record.
 */
async function runTask(model, fullId, task) {
  const started = performance.now();
  let text = "";
  let result;
  try {
    const stream = streamSimpleOpenAICompletions(
      model,
      {
        systemPrompt:
          "You are a precise software engineering benchmark model. Follow requested output formats exactly.",
        messages: [
          { role: "user", content: task.prompt, timestamp: Date.now() },
        ],
      },
      { temperature: 0, maxTokens: task.maxTokens },
    );
    for await (const event of stream) {
      if (event.type === "text_delta") text += event.delta;
    }
    result = await stream.result();
  } catch (error) {
    return {
      model: fullId,
      task: task.id,
      ok: false,
      elapsedMs: Math.round(performance.now() - started),
      error: error instanceof Error ? error.message : String(error),
    };
  }

  const elapsedMs = Math.round(performance.now() - started);
  // A stream that ended with stopReason "error" fails regardless of its text.
  const passed = result.stopReason !== "error" && task.check(text);
  console.log(
    `${passed ? "PASS" : "FAIL"} ${fullId} ${task.id} ${elapsedMs}ms ${result.stopReason}`,
  );
  return {
    model: fullId,
    task: task.id,
    ok: passed,
    stopReason: result.stopReason,
    errorMessage: result.errorMessage,
    elapsedMs,
    chars: text.length,
    usage: result.usage,
    sample: text.slice(0, 700),
  };
}

/**
 * Parses a flag value as a positive base-10 integer.
 *
 * @param {string | undefined} raw - Raw flag value; `undefined` when the flag is absent.
 * @param {number} fallback - Value used when the flag is absent or invalid.
 * @param {string} flagName - Flag name without dashes, used in the warning.
 * @returns {number} The parsed value, or `fallback` (with a warning on stderr)
 *   when `raw` is not a positive integer.
 */
function readPositiveInt(raw, fallback, flagName) {
  if (raw === undefined) return fallback;
  const value = Number.parseInt(raw, 10);
  if (Number.isFinite(value) && value > 0) return value;
  console.warn(
    `ignoring invalid --${flagName} value "${raw}"; using ${fallback}`,
  );
  return fallback;
}

/**
 * Minimal `--key value` / `--key=value` parser.
 *
 * Tokens that do not start with `--` are ignored unless consumed as the value
 * of the preceding flag. A flag followed by another flag, or by nothing, gets
 * the string "true".
 *
 * @param {string[]} argv - Arguments without the node binary and script path.
 * @returns {Record<string, string>} Flag names (without dashes) mapped to string values.
 */
function parseArgs(argv) {
  const parsed = {};
  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    if (!arg.startsWith("--")) continue;
    const key = arg.slice(2);
    const eq = key.indexOf("=");
    if (eq > 0) {
      // Inline form: everything after the first '=' is the value.
      parsed[key.slice(0, eq)] = key.slice(eq + 1);
      continue;
    }
    const next = argv[i + 1];
    if (!next || next.startsWith("--")) {
      parsed[key] = "true";
    } else {
      parsed[key] = next;
      i++; // the value token has been consumed
    }
  }
  return parsed;
}

/**
 * Best-effort loader that copies secrets from a sops-encrypted YAML file into
 * `process.env`.
 *
 * Steps:
 *   1. Delete every variable named in the `sf` wrapper's `sf_scoped_env` list.
 *   2. Decrypt the secrets file with `sops`.
 *   3. Flatten `.sf`, `.sf.env` and `.sf.providers.*.env` into `KEY=value`
 *      lines with `yq`.
 *   4. Assign each line whose key is a valid identifier and whose value is
 *      non-empty.
 *
 * If `sops` or `yq` is missing or fails, the function returns silently and the
 * variables deleted in step 1 stay deleted. stderr of both tools is discarded.
 *
 * @returns {Promise<void>}
 */
async function loadSfScopedEnv() {
  const secretsFile = `${homedir()}/.dotfiles/secrets/api-keys.yaml`;
  const sopsConfig = `${homedir()}/.dotfiles/.sops.yaml`;
  const wrapperPath = `${homedir()}/.local/bin/sf`;

  const envNames = readSfScopedEnvNames(wrapperPath);
  for (const name of envNames) delete process.env[name];

  const decrypted = spawnSync(
    "sops",
    ["--config", sopsConfig, "-d", secretsFile],
    {
      encoding: "utf8",
      stdio: ["ignore", "pipe", "ignore"],
    },
  );
  // A spawn failure leaves status null, which also lands here.
  if (decrypted.status !== 0 || !decrypted.stdout) return;

  // Shared filter fragments. The joined text is the same filter string the
  // script has always passed to yq, including the embedded newline.
  const scalarOnly = `select((.value | type) == "string" or (.value | type) == "number" or (.value | type) == "boolean")`;
  const nonEmptyPair = `select(.value != null and .value != "") | "\\(.key)=\\(.value)"`;
  const yqFilter = [
    `( (.sf // {} | to_entries[] | ${scalarOnly} | ${nonEmptyPair}), `,
    `(.sf.env // {} | to_entries[] | ${nonEmptyPair}), `,
    `(.sf.providers // {} | to_entries[] | (.value.env // {}) | \nto_entries[] | ${nonEmptyPair}) )`,
  ].join("");

  const extracted = spawnSync("yq", ["-r", yqFilter], {
    input: decrypted.stdout,
    encoding: "utf8",
    stdio: ["pipe", "pipe", "ignore"],
  });
  if (extracted.status !== 0 || !extracted.stdout) return;

  for (const line of extracted.stdout.split(/\r?\n/)) {
    // Split on the first '=' only; the value may itself contain '='.
    const idx = line.indexOf("=");
    if (idx <= 0) continue;
    const key = line.slice(0, idx);
    const value = line.slice(idx + 1);
    if (/^[A-Za-z_][A-Za-z0-9_]*$/.test(key) && value) process.env[key] = value;
  }
}

/**
 * Reads the variable names listed between `sf_scoped_env=(` and the closing
 * `)` line of the wrapper script (presumably a shell array — verify against
 * the wrapper).
 *
 * Only lines consisting solely of `A-Z`, `0-9` and `_` are kept. LF and CRLF
 * line endings are both accepted.
 *
 * @param {string} wrapperPath - Path of the wrapper script to scan.
 * @returns {string[]} The names found; empty when the file cannot be read or
 *   contains no such list.
 */
function readSfScopedEnvNames(wrapperPath) {
  try {
    const source = readFileSync(wrapperPath, "utf8");
    const match = source.match(/sf_scoped_env=\(\r?\n([\s\S]*?)\r?\n\)/);
    if (!match) return [];
    return match[1]
      .split(/\r?\n/)
      .map((line) => line.trim())
      .filter((line) => /^[A-Z0-9_]+$/.test(line));
  } catch {
    // Deliberate best-effort: an absent or unreadable wrapper means there is
    // nothing to clear.
    return [];
  }
}