singularity-forge/scripts/model-smoke-benchmark.mjs

#!/usr/bin/env node

import { readFileSync, writeFileSync, mkdirSync } from "node:fs";
import { homedir } from "node:os";
import { dirname, resolve } from "node:path";
import { spawnSync } from "node:child_process";
import { performance } from "node:perf_hooks";

// Repository root: the parent of the directory that contains this script.
const repoRoot = resolve(import.meta.dirname, "..");
// Default report path under <repo>/.sf/model-benchmarks/. The ISO timestamp has its
// ':' and '.' characters replaced with '-' before being used as the file name.
const defaultOutputPath = resolve(repoRoot, ".sf", "model-benchmarks", `${new Date().toISOString().replace(/[:.]/g, "-")}.json`);

// Command-line options. Every parsed value is a string.
const args = parseArgs(process.argv.slice(2));
// Comma-separated "provider/model" ids; undefined when neither --models nor --model was given.
const modelsArg = args.models ?? args.model;
const outputPath = resolve(args.output ?? defaultOutputPath);
// NaN when the option is not numeric; the model selection further down treats a
// non-finite limit as "no limit".
const maxModels = Number.parseInt(args.maxModels ?? args["max-models"] ?? "8", 10);
// A non-numeric or non-positive token budget would otherwise reach the completion
// request as NaN/0, so fall back to the default instead of sending a bogus limit.
const requestedMaxTokens = Number.parseInt(args.maxTokens ?? args["max-tokens"] ?? "420", 10);
if (!(requestedMaxTokens > 0)) {
	console.warn("ignoring invalid --max-tokens value; using 420");
}
const maxTokens = requestedMaxTokens > 0 ? requestedMaxTokens : 420;

// Environment variables are populated first, then the provider package is imported.
// NOTE(review): presumably the package needs the credentials in place when it loads or
// when requests are made — confirm before reordering these two statements.
await loadSfScopedEnv();

const { getModel, streamSimpleOpenAICompletions } = await import("../packages/pi-ai/src/index.ts");

// Models benchmarked when no --models/--model option is supplied.
const defaultModelIds = [
	"kimi-coding/kimi-k2.6",
	"minimax/MiniMax-M2.7-highspeed",
	"zai/glm-4.5",
	"mistral/devstral-latest",
	"alibaba-coding-plan/qwen3-coder-plus",
	"xiaomi/mimo-v2-pro",
	"opencode-go/minimax-m2.7",
	"openrouter/inclusionai/ling-2.6-1t:free",
];

// An explicit comma-separated list wins over the defaults; entries are trimmed and
// empty entries are dropped.
const requestedModelIds = modelsArg
	? modelsArg
			.split(",")
			.map((entry) => entry.trim())
			.filter((entry) => entry.length > 0)
	: null;
const modelIds = requestedModelIds ?? defaultModelIds;

// Smoke tasks run against every selected model. Each entry has:
//   id        - label used in the report and in console output
//   maxTokens - completion budget passed along with the request for this task
//   prompt    - user message sent verbatim
//   check     - heuristic predicate over the collected response text; true means pass
const tasks = [
	{
		id: "json-repair",
		// Never more than 280 tokens, even when the configured budget is larger.
		maxTokens: Math.min(maxTokens, 280),
		prompt: `Return ONLY valid JSON matching { "bug": string, "fix": string, "tests": string[] }.
Broken payload: {"bug":"path traversal\\n- accepts ../foo","fix":123,"tests":"none"}.
Normalize it semantically; no markdown.`,
		// Passes only when the whole response parses as JSON and has the requested shape,
		// including that every element of `tests` is a string (the prompt asks for string[]).
		// Text that is not JSON, or that parses to null, ends up in the catch and fails.
		check: (text) => {
			try {
				const parsed = JSON.parse(text);
				return (
					typeof parsed.bug === "string" &&
					typeof parsed.fix === "string" &&
					Array.isArray(parsed.tests) &&
					parsed.tests.every((entry) => typeof entry === "string")
				);
			} catch {
				return false;
			}
		},
	},
	{
		id: "path-debug",
		maxTokens,
		prompt: `Find the bug and propose the minimal patch. Code:
function isSafe(base, target) {
  const resolved = path.resolve(base, target)
  return resolved.startsWith(base)
}
Explain why it is unsafe in <= 8 bullets, then provide a corrected JS function.`,
		// Keyword heuristic: the answer must mention the prefix comparison and at least one
		// path-handling term.
		// NOTE(review): "startsWith" and "resolve" both occur in the prompt's own code, so a
		// response that merely echoes the snippet also passes — confirm this is acceptable.
		check: (text) => /startsWith|prefix/i.test(text) && /path\.sep|relative|normalize|resolve/i.test(text),
	},
	{
		id: "routing-plan",
		maxTokens,
		prompt: `Produce a concise implementation plan with risks and verification for migrating an LLM routing table from alias k2p5 to semantic ids kimi-k2.5 and kimi-k2.6.`,
		// Both target ids must appear (case-sensitive) along with some verification wording.
		check: (text) => /kimi-k2\.5/.test(text) && /kimi-k2\.6/.test(text) && /test|verify|validation/i.test(text),
	},
];

// Limit the run to the first `maxModels` ids. A non-numeric or negative limit means
// "no limit": a negative end index would otherwise make slice() trim models from the tail.
const modelLimit = Number.isFinite(maxModels) && maxModels >= 0 ? maxModels : modelIds.length;
const selectedModels = modelIds.slice(0, modelLimit);
// One record per (model, task) attempt, plus one record per model that could not be set up.
const results = [];

for (const fullId of selectedModels) {
	// Ids look like "<provider>/<model>". Only the first slash splits them, so the model
	// part may itself contain slashes.
	const slash = fullId.indexOf("/");
	if (slash === -1) {
		results.push({ model: fullId, ok: false, error: "expected provider/model id" });
		console.log(`FAIL ${fullId} expected provider/model id`);
		continue;
	}
	const provider = fullId.slice(0, slash);
	const modelId = fullId.slice(slash + 1);
	const model = getModel(provider, modelId);
	if (!model) {
		results.push({ model: fullId, ok: false, error: "model not found in registry" });
		console.log(`FAIL ${fullId} model not found in registry`);
		continue;
	}

	// Tasks run one after another so each elapsed time covers a single request.
	for (const task of tasks) {
		const started = performance.now();
		let text = "";
		let result;
		try {
			const stream = streamSimpleOpenAICompletions(
				model,
				{
					systemPrompt: "You are a precise software engineering benchmark model. Follow requested output formats exactly.",
					messages: [{ role: "user", content: task.prompt, timestamp: Date.now() }],
				},
				{ temperature: 0, maxTokens: task.maxTokens },
			);
			// Only text_delta events contribute to the collected text; other event types are ignored.
			for await (const event of stream) {
				if (event.type === "text_delta") text += event.delta;
			}
			result = await stream.result();
		} catch (error) {
			// A thrown error fails this task only; the remaining tasks and models still run.
			const elapsedMs = Math.round(performance.now() - started);
			const message = error instanceof Error ? error.message : String(error);
			results.push({
				model: fullId,
				task: task.id,
				ok: false,
				elapsedMs,
				error: message,
			});
			console.log(`FAIL ${fullId} ${task.id} ${elapsedMs}ms error: ${message}`);
			continue;
		}

		const elapsedMs = Math.round(performance.now() - started);
		// Pass requires both a stop reason other than "error" and an accepted response text.
		const passed = result.stopReason !== "error" && task.check(text);
		results.push({
			model: fullId,
			task: task.id,
			ok: passed,
			stopReason: result.stopReason,
			errorMessage: result.errorMessage,
			elapsedMs,
			chars: text.length,
			usage: result.usage,
			// Keep the report small: store only the first 700 characters of the response.
			sample: text.slice(0, 700),
		});
		console.log(`${passed ? "PASS" : "FAIL"} ${fullId} ${task.id} ${elapsedMs}ms ${result.stopReason}`);
	}
}

// Assemble the run summary: creation time, the models that were selected, the task ids,
// and every collected result record.
const report = {
	createdAt: new Date().toISOString(),
	models: selectedModels,
	tasks: tasks.map(({ id }) => id),
	results,
};

// Create the destination directory if needed, then write the report as 2-space
// indented JSON followed by a trailing newline.
const reportDirectory = dirname(outputPath);
mkdirSync(reportDirectory, { recursive: true });
const reportJson = JSON.stringify(report, null, 2);
writeFileSync(outputPath, `${reportJson}\n`);
console.log(`wrote ${outputPath}`);

/**
 * Parse `--key value`, `--key=value` and bare `--flag` command-line options.
 *
 * - `--key value` stores `value` under `key`.
 * - `--key=value` stores everything after the first `=` under `key`.
 * - `--flag` with no usable value (end of input, an empty string, or another
 *   `--option` next) stores the string "true".
 * - Tokens that do not start with `--` and were not consumed as a value are ignored.
 * - When an option is repeated, the last occurrence wins.
 *
 * @param {string[]} argv - Arguments without the node/script prefix.
 * @returns {Record<string, string>} Option names (without the leading `--`) mapped to string values.
 */
function parseArgs(argv) {
	const parsed = {};
	for (let i = 0; i < argv.length; i++) {
		const arg = argv[i];
		if (!arg.startsWith("--")) continue;
		const key = arg.slice(2);

		// Inline form. Without this branch "--models=a,b" would be stored under the
		// key "models=a,b" and the option would be silently ignored by its reader.
		const equalsAt = key.indexOf("=");
		if (equalsAt > 0) {
			parsed[key.slice(0, equalsAt)] = key.slice(equalsAt + 1);
			continue;
		}

		const next = argv[i + 1];
		if (!next || next.startsWith("--")) {
			parsed[key] = "true";
		} else {
			parsed[key] = next;
			i++; // the following token was consumed as this option's value
		}
	}
	return parsed;
}

/**
 * Populate `process.env` from the sops-encrypted secrets file under the user's home
 * directory.
 *
 * Steps:
 *  1. Delete from `process.env` every variable name listed by the `sf` wrapper script.
 *  2. Decrypt `~/.dotfiles/secrets/api-keys.yaml` with `sops`.
 *  3. Use `yq` to emit `KEY=VALUE` lines from scalar entries of `.sf`, from `.sf.env`,
 *     and from each `.sf.providers.*.env`.
 *  4. Assign every line whose key is a valid identifier and whose value is non-empty.
 *
 * Best effort: if either tool exits non-zero or prints nothing, the function returns
 * without setting anything (variables removed in step 1 stay removed). The stderr of
 * both tools is discarded.
 *
 * @returns {Promise<void>}
 */
async function loadSfScopedEnv() {
	const home = homedir();
	const secretsFile = `${home}/.dotfiles/secrets/api-keys.yaml`;
	const sopsConfig = `${home}/.dotfiles/.sops.yaml`;
	const wrapperPath = `${home}/.local/bin/sf`;

	// Clear the wrapper-listed variables before loading fresh values.
	readSfScopedEnvNames(wrapperPath).forEach((name) => {
		delete process.env[name];
	});

	const sopsRun = spawnSync("sops", ["--config", sopsConfig, "-d", secretsFile], {
		encoding: "utf8",
		stdio: ["ignore", "pipe", "ignore"],
	});
	const decryptedOk = sopsRun.status === 0 && Boolean(sopsRun.stdout);
	if (!decryptedOk) return;

	// The decrypted YAML is piped to yq on stdin; it is never written to disk here.
	const yqRun = spawnSync("yq", [
		"-r",
		`(
			(.sf // {} | to_entries[]
				| select((.value | type) == "string" or (.value | type) == "number" or (.value | type) == "boolean")
				| select(.value != null and .value != "")
				| "\\(.key)=\\(.value)"),
			(.sf.env // {} | to_entries[]
				| select(.value != null and .value != "")
				| "\\(.key)=\\(.value)"),
			(.sf.providers // {} | to_entries[]
				| (.value.env // {})
				| to_entries[]
				| select(.value != null and .value != "")
				| "\\(.key)=\\(.value)")
		)`,
	], {
		input: sopsRun.stdout,
		encoding: "utf8",
		stdio: ["pipe", "pipe", "ignore"],
	});
	const extractedOk = yqRun.status === 0 && Boolean(yqRun.stdout);
	if (!extractedOk) return;

	const envNamePattern = /^[A-Za-z_][A-Za-z0-9_]*$/;
	for (const entry of yqRun.stdout.split(/\r?\n/)) {
		// Split on the first '=' only, so values may themselves contain '='.
		const separatorAt = entry.indexOf("=");
		if (separatorAt <= 0) continue;
		const name = entry.slice(0, separatorAt);
		const secret = entry.slice(separatorAt + 1);
		if (!envNamePattern.test(name) || !secret) continue;
		process.env[name] = secret;
	}
}

/**
 * Read the variable names listed in the `sf_scoped_env=( ... )` array of a wrapper script.
 *
 * The array must open with `sf_scoped_env=(` directly followed by a newline and close with
 * a `)` at the start of a line. Each line in between is trimmed and kept only when it
 * consists solely of upper-case letters, digits and underscores.
 *
 * @param {string} wrapperPath - Path of the wrapper script to inspect.
 * @returns {string[]} The names in file order; empty when the file cannot be read or
 *   contains no such array.
 */
function readSfScopedEnvNames(wrapperPath) {
	let wrapperSource;
	try {
		wrapperSource = readFileSync(wrapperPath, "utf8");
	} catch {
		// A missing or unreadable wrapper simply means there is nothing to report.
		return [];
	}

	const arrayBody = /sf_scoped_env=\(\n([\s\S]*?)\n\)/.exec(wrapperSource);
	if (arrayBody === null) return [];

	const names = [];
	for (const rawLine of arrayBody[1].split(/\r?\n/)) {
		const candidate = rawLine.trim();
		if (/^[A-Z0-9_]+$/.test(candidate)) names.push(candidate);
	}
	return names;
}