#!/usr/bin/env node
import { spawnSync } from "node:child_process";
import { readFileSync, writeFileSync, mkdirSync } from "node:fs";
import { homedir } from "node:os";
import { dirname, join, resolve } from "node:path";
import { performance } from "node:perf_hooks";

// Parent of this script's directory; used as the base for the default report location.
const repoRoot = resolve(import.meta.dirname, "..");

// Default report file: <repoRoot>/.sf/model-benchmarks/<ISO timestamp>.json, with ":" and "."
// in the timestamp replaced by "-".
const defaultOutputPath = resolve(repoRoot, ".sf", "model-benchmarks", `${new Date().toISOString().replace(/[:.]/g, "-")}.json`);

// Command-line options. Both camelCase and kebab-case spellings are accepted for the numeric flags.
const args = parseArgs(process.argv.slice(2));
const modelsArg = args.models ?? args.model;
const outputPath = resolve(args.output ?? defaultOutputPath);

// A non-numeric value yields NaN here; the model-selection step treats a non-finite cap as "no cap".
const maxModels = Number.parseInt(args.maxModels ?? args["max-models"] ?? "8", 10);

// Token budget per task. A non-numeric or non-positive value (e.g. a bare `--max-tokens`, which
// parses to "true") would otherwise become NaN and be forwarded to the provider, so fall back
// to the default of 420 instead.
const requestedMaxTokens = Number.parseInt(args.maxTokens ?? args["max-tokens"] ?? "420", 10);
const maxTokens = Number.isInteger(requestedMaxTokens) && requestedMaxTokens > 0 ? requestedMaxTokens : 420;

// Populate process.env before the provider library is imported.
// NOTE(review): presumably the library reads API keys from the environment — confirm whether
// that happens at import time or per request.
await loadSfScopedEnv();

// Dynamic import so it runs after the environment has been prepared above.
const { getModel, streamSimpleOpenAICompletions } = await import("../packages/pi-ai/src/index.ts");

// Model ids to benchmark, each in "provider/model" form. A comma-separated --models/--model
// value replaces the built-in list; blank entries are dropped.
const modelIds = modelsArg
  ? modelsArg.split(",").map((s) => s.trim()).filter(Boolean)
  : [
      "kimi-coding/kimi-k2.6",
      "minimax/MiniMax-M2.7-highspeed",
      "zai/glm-4.5",
      "mistral/devstral-latest",
      "alibaba-coding-plan/qwen3-coder-plus",
      "xiaomi/mimo-v2-pro",
      "opencode-go/minimax-m2.7",
      "openrouter/inclusionai/ling-2.6-1t:free",
    ];

// Benchmark tasks. Each task has an id, a token budget, a prompt, and a `check(text)` predicate
// that decides PASS/FAIL from the model's full text output.
const tasks = [
  {
    id: "json-repair",
    // Capped at 280 tokens regardless of the global budget.
    maxTokens: Math.min(maxTokens, 280),
    prompt: `Return ONLY valid JSON matching { "bug": string, "fix": string, "tests": string[] }.
Broken payload: {"bug":"path traversal\\n- accepts ../foo","fix":123,"tests":"none"}.
Normalize it semantically; no markdown.`,
    // Passes when the output parses as JSON with string `bug`/`fix` and an array `tests`.
    // Any parse or property-access failure counts as a fail.
    check: (text) => {
      try {
        const parsed = JSON.parse(text);
        return typeof parsed.bug === "string" && typeof parsed.fix === "string" && Array.isArray(parsed.tests);
      } catch {
        return false;
      }
    },
  },
  {
    id: "path-debug",
    maxTokens,
    prompt: `Find the bug and propose the minimal patch. Code:
function isSafe(base, target) {
const resolved = path.resolve(base, target)
return resolved.startsWith(base)
}
Explain why it is unsafe in <= 8 bullets, then provide a corrected JS function.`,
    // Keyword heuristic: the answer must mention the prefix check and one of the usual remedies.
    check: (text) => /startsWith|prefix/i.test(text) && /path\.sep|relative|normalize|resolve/i.test(text),
  },
  {
    id: "routing-plan",
    maxTokens,
    prompt: `Produce a concise implementation plan with risks and verification for migrating an LLM routing table from alias k2p5 to semantic ids kimi-k2.5 and kimi-k2.6.`,
    // Keyword heuristic: both target ids and some form of verification must be mentioned.
    check: (text) => /kimi-k2\.5/.test(text) && /kimi-k2\.6/.test(text) && /test|verify|validation/i.test(text),
  },
];

// Apply the --max-models cap. A non-numeric (NaN) or negative cap means "no cap"; a negative
// value passed straight to slice() would silently drop models from the end of the list.
const modelCap = Number.isFinite(maxModels) && maxModels >= 0 ? maxModels : modelIds.length;
const selectedModels = modelIds.slice(0, modelCap);

// One entry per (model, task) run, plus one entry per model that could not be resolved.
const results = [];

// Run every task against every selected model, one request at a time, appending an entry to
// `results` for each run and printing a one-line PASS/FAIL summary.
for (const fullId of selectedModels) {
  // Ids are "provider/model"; only the first "/" separates the two, so model ids may contain "/".
  const slash = fullId.indexOf("/");
  if (slash === -1) {
    results.push({ model: fullId, ok: false, error: "expected provider/model id" });
    console.log(`FAIL ${fullId} expected provider/model id`);
    continue;
  }
  const provider = fullId.slice(0, slash);
  const modelId = fullId.slice(slash + 1);
  const model = getModel(provider, modelId);
  if (!model) {
    results.push({ model: fullId, ok: false, error: "model not found in registry" });
    console.log(`FAIL ${fullId} model not found in registry`);
    continue;
  }

  for (const task of tasks) {
    const started = performance.now();
    let text = "";
    let result;
    try {
      const stream = streamSimpleOpenAICompletions(
        model,
        {
          systemPrompt: "You are a precise software engineering benchmark model. Follow requested output formats exactly.",
          messages: [{ role: "user", content: task.prompt, timestamp: Date.now() }],
        },
        { temperature: 0, maxTokens: task.maxTokens },
      );
      // Accumulate only text deltas; other event types are ignored.
      for await (const event of stream) {
        if (event.type === "text_delta") text += event.delta;
      }
      result = await stream.result();
    } catch (error) {
      const elapsedMs = Math.round(performance.now() - started);
      const message = error instanceof Error ? error.message : String(error);
      results.push({
        model: fullId,
        task: task.id,
        ok: false,
        elapsedMs,
        error: message,
      });
      // Surface thrown failures on the console too, not only in the report.
      console.log(`FAIL ${fullId} ${task.id} ${elapsedMs}ms error: ${message}`);
      continue;
    }

    const elapsedMs = Math.round(performance.now() - started);
    // A run passes only if the stream did not end in an error and the task's check accepts the text.
    const passed = result.stopReason !== "error" && task.check(text);
    results.push({
      model: fullId,
      task: task.id,
      ok: passed,
      stopReason: result.stopReason,
      errorMessage: result.errorMessage,
      elapsedMs,
      chars: text.length,
      usage: result.usage,
      // Keep only the first 700 characters of output in the report.
      sample: text.slice(0, 700),
    });
    console.log(`${passed ? "PASS" : "FAIL"} ${fullId} ${task.id} ${elapsedMs}ms ${result.stopReason}`);
  }
}

// Assemble the report and write it as pretty-printed JSON, creating the output directory if needed.
const report = {
  createdAt: new Date().toISOString(),
  models: selectedModels,
  tasks: tasks.map((t) => t.id),
  results,
};

mkdirSync(dirname(outputPath), { recursive: true });
writeFileSync(outputPath, `${JSON.stringify(report, null, 2)}\n`);
console.log(`wrote ${outputPath}`);

/**
 * Parse `--key value`, `--key=value` and bare `--flag` options into a plain object.
 *
 * - `--key value` stores `value` under `key` and consumes both tokens.
 * - `--key=value` stores everything after the first "=" under `key`.
 * - `--flag` with no following value (or followed by another `--option`) stores the string "true".
 * - Tokens that do not start with "--" and were not consumed as a value are ignored.
 *
 * @param {string[]} argv - Argument tokens, e.g. `process.argv.slice(2)`.
 * @returns {Record<string, string>} Option names (without the leading "--") mapped to string values.
 */
function parseArgs(argv) {
  const parsed = {};
  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    if (!arg.startsWith("--")) continue;
    const key = arg.slice(2);

    // Inline form: split on the first "=" so values may themselves contain "=".
    const eq = key.indexOf("=");
    if (eq > 0) {
      parsed[key.slice(0, eq)] = key.slice(eq + 1);
      continue;
    }

    const next = argv[i + 1];
    if (!next || next.startsWith("--")) {
      parsed[key] = "true";
    } else {
      parsed[key] = next;
      i++; // the value token has been consumed
    }
  }
  return parsed;
}

/**
 * Replace the sf-scoped environment variables in `process.env` with values decrypted from the
 * user's sops-encrypted secrets file.
 *
 * Steps:
 *  1. Read the variable names listed in the `sf` wrapper script and delete them from `process.env`.
 *  2. Decrypt `~/.dotfiles/secrets/api-keys.yaml` with `sops`.
 *  3. Use `yq` to emit `KEY=value` lines from scalar entries under `.sf`, entries under `.sf.env`,
 *     and entries under each `.sf.providers.*.env`.
 *  4. Assign every line whose key is a valid identifier and whose value is non-empty.
 *
 * Best-effort: if `sops` or `yq` fails, a warning is printed and the function returns without
 * setting anything (step 1 has already run by then).
 *
 * @returns {Promise<void>}
 */
async function loadSfScopedEnv() {
  const home = homedir();
  const secretsFile = join(home, ".dotfiles", "secrets", "api-keys.yaml");
  const sopsConfig = join(home, ".dotfiles", ".sops.yaml");
  const wrapperPath = join(home, ".local", "bin", "sf");

  // Clear any inherited values first so only the decrypted ones are used.
  const envNames = readSfScopedEnvNames(wrapperPath);
  for (const name of envNames) delete process.env[name];

  const decrypted = spawnSync("sops", ["--config", sopsConfig, "-d", secretsFile], {
    encoding: "utf8",
    stdio: ["ignore", "pipe", "ignore"],
  });
  if (decrypted.status !== 0 || !decrypted.stdout) {
    console.warn(`loadSfScopedEnv: could not decrypt ${secretsFile} with sops; continuing without sf-scoped secrets`);
    return;
  }

  // jq-style filter; "\\(" in this template literal becomes jq's "\(" string interpolation.
  const filter = `(
  (.sf // {} | to_entries[]
    | select((.value | type) == "string" or (.value | type) == "number" or (.value | type) == "boolean")
    | select(.value != null and .value != "")
    | "\\(.key)=\\(.value)"),
  (.sf.env // {} | to_entries[]
    | select(.value != null and .value != "")
    | "\\(.key)=\\(.value)"),
  (.sf.providers // {} | to_entries[]
    | (.value.env // {})
    | to_entries[]
    | select(.value != null and .value != "")
    | "\\(.key)=\\(.value)")
)`;

  const extracted = spawnSync("yq", ["-r", filter], {
    input: decrypted.stdout,
    encoding: "utf8",
    stdio: ["pipe", "pipe", "ignore"],
  });
  if (extracted.status !== 0 || !extracted.stdout) {
    console.warn("loadSfScopedEnv: yq produced no sf-scoped variables; continuing without sf-scoped secrets");
    return;
  }

  for (const line of extracted.stdout.split(/\r?\n/)) {
    // Split on the first "=" only; values may contain "=".
    const idx = line.indexOf("=");
    if (idx <= 0) continue;
    const key = line.slice(0, idx);
    const value = line.slice(idx + 1);
    if (/^[A-Za-z_][A-Za-z0-9_]*$/.test(key) && value) process.env[key] = value;
  }
}

/**
 * Extract the variable names listed in the `sf_scoped_env=( ... )` array of a wrapper script.
 *
 * The array must open with `sf_scoped_env=(` at the end of a line and close with `)` at the start
 * of a line; both LF and CRLF line endings are accepted. Only entries consisting solely of
 * uppercase letters, digits and underscores (after trimming) are returned.
 *
 * @param {string} wrapperPath - Path of the wrapper script to read.
 * @returns {string[]} The names found, or an empty array if the file cannot be read or has no
 *   such array.
 */
function readSfScopedEnvNames(wrapperPath) {
  try {
    const source = readFileSync(wrapperPath, "utf8");
    // `\r?\n` keeps the match consistent with the CRLF-tolerant split below.
    const match = source.match(/sf_scoped_env=\(\r?\n([\s\S]*?)\r?\n\)/);
    if (!match) return [];
    return match[1]
      .split(/\r?\n/)
      .map((line) => line.trim())
      .filter((line) => /^[A-Z0-9_]+$/.test(line));
  } catch {
    // Best-effort: a missing or unreadable wrapper simply means there is nothing to clear.
    return [];
  }
}