singularity-forge/scripts/model-smoke-benchmark.mjs

#!/usr/bin/env node

import { readFileSync, writeFileSync, mkdirSync } from "node:fs";
import { homedir } from "node:os";
import { dirname, resolve } from "node:path";
import { spawnSync } from "node:child_process";
import { performance } from "node:perf_hooks";

// Repository root: one directory above the directory containing this script.
const repoRoot = resolve(import.meta.dirname, "..");
// Default report path: <repo>/.sf/model-benchmarks/<ISO timestamp with ':' and '.' replaced by '-'>.json
const defaultOutputPath = resolve(repoRoot, ".sf", "model-benchmarks", `${new Date().toISOString().replace(/[:.]/g, "-")}.json`);

// Recognised options: --models/--model, --output, --maxModels/--max-models, --maxTokens/--max-tokens.
const args = parseArgs(process.argv.slice(2));
const modelsArg = args.models ?? args.model;
const outputPath = resolve(args.output ?? defaultOutputPath);
// May be NaN when the flag has no numeric value; the model selection further down
// checks Number.isFinite and then runs every model, so it is left unvalidated here.
const maxModels = Number.parseInt(args.maxModels ?? args["max-models"] ?? "8", 10);

// A bare `--max-tokens` (parsed as "true"), a non-numeric value, zero or a negative
// number must not reach the providers: NaN would also poison Math.min(maxTokens, 280)
// in the task table. Fall back to the default and say so.
const defaultMaxTokens = 420;
const requestedMaxTokens = Number.parseInt(args.maxTokens ?? args["max-tokens"] ?? String(defaultMaxTokens), 10);
const hasUsableMaxTokens = requestedMaxTokens > 0;
if (!hasUsableMaxTokens) {
	console.warn(`ignoring invalid --max-tokens value; using ${defaultMaxTokens}`);
}
const maxTokens = hasUsableMaxTokens ? requestedMaxTokens : defaultMaxTokens;

// Credentials are loaded into process.env before the provider package is imported.
// NOTE(review): presumably the package reads API keys from the environment — confirm
// whether it does so at import time or per request before reordering these two lines.
await loadSfScopedEnv();

const { getModel, streamSimpleOpenAICompletions } = await import("../packages/pi-ai/src/index.ts");

// Models exercised when neither --models nor --model is given, as "provider/model" ids.
const defaultModelIds = [
	"kimi-coding/kimi-k2.6",
	"minimax/MiniMax-M2.7-highspeed",
	"zai/glm-4.5",
	"mistral/devstral-latest",
	"alibaba-coding-plan/qwen3-coder-plus",
	"xiaomi/mimo-v2-pro",
	"opencode-go/minimax-m2.7",
	"openrouter/inclusionai/ling-2.6-1t:free",
];

/** Split a comma-separated id list, trimming whitespace and dropping empty entries. */
const splitModelList = (csv) => {
	const ids = [];
	for (const piece of csv.split(",")) {
		const id = piece.trim();
		if (id) ids.push(id);
	}
	return ids;
};

// An absent or empty option selects the defaults; any other value is used as given,
// even if it parses to an empty list.
const modelIds = modelsArg ? splitModelList(modelsArg) : defaultModelIds;

/** True when every pattern matches somewhere in `text`. */
const matchesAll = (text, patterns) => patterns.every((pattern) => pattern.test(text));

/**
 * Benchmark tasks. Each entry carries an id, a per-task token budget, the user prompt
 * sent to the model, and a `check(text)` predicate applied to the streamed reply.
 * The checks are deliberately loose keyword/shape heuristics, not full graders.
 */
const tasks = [
	{
		id: "json-repair",
		// This task's budget is capped at 280 tokens regardless of --max-tokens.
		maxTokens: Math.min(maxTokens, 280),
		prompt: `Return ONLY valid JSON matching { "bug": string, "fix": string, "tests": string[] }.
Broken payload: {"bug":"path traversal\\n- accepts ../foo","fix":123,"tests":"none"}.
Normalize it semantically; no markdown.`,
		// Passes when the whole reply parses as JSON with string `bug`/`fix` and an array
		// `tests`. Unparseable replies (and a top-level `null`) fail via the catch.
		check: (text) => {
			try {
				const { bug, fix, tests } = JSON.parse(text);
				return [bug, fix].every((field) => typeof field === "string") && Array.isArray(tests);
			} catch {
				return false;
			}
		},
	},
	{
		id: "path-debug",
		maxTokens,
		prompt: `Find the bug and propose the minimal patch. Code:
function isSafe(base, target) {
  const resolved = path.resolve(base, target)
  return resolved.startsWith(base)
}
Explain why it is unsafe in <= 8 bullets, then provide a corrected JS function.`,
		// Passes when the reply names the prefix check and at least one path-handling term.
		check: (text) => matchesAll(text, [/startsWith|prefix/i, /path\.sep|relative|normalize|resolve/i]),
	},
	{
		id: "routing-plan",
		maxTokens,
		prompt: `Produce a concise implementation plan with risks and verification for migrating an LLM routing table from alias k2p5 to semantic ids kimi-k2.5 and kimi-k2.6.`,
		// Passes when both target ids appear along with some mention of testing/verification.
		check: (text) => matchesAll(text, [/kimi-k2\.5/, /kimi-k2\.6/, /test|verify|validation/i]),
	},
];

// Apply the --max-models cap. A non-finite cap (flag given without a number) selects every model.
const selectedModels = modelIds.slice(0, Number.isFinite(maxModels) ? maxModels : modelIds.length);
// One record per (model, task) attempt, plus one per model that could not be resolved.
const results = [];

/**
 * Store a failure record and print a FAIL line for it, so that unresolved models and
 * thrown stream errors show up on the console like every other outcome.
 *
 * @param {{ model: string, task?: string, ok: false, elapsedMs?: number, error: string }} record
 *   Record to append to `results` unchanged.
 */
function recordFailure(record) {
	results.push(record);
	const target = record.task === undefined ? record.model : `${record.model} ${record.task}`;
	const timing = record.elapsedMs === undefined ? "" : ` ${record.elapsedMs}ms`;
	console.log(`FAIL ${target}${timing} error: ${record.error}`);
}

for (const fullId of selectedModels) {
	// Ids are "provider/model". Only the first slash separates the two parts, so the
	// model part may itself contain slashes (e.g. "openrouter/inclusionai/...").
	const slash = fullId.indexOf("/");
	if (slash === -1) {
		recordFailure({ model: fullId, ok: false, error: "expected provider/model id" });
		continue;
	}
	const provider = fullId.slice(0, slash);
	const modelId = fullId.slice(slash + 1);
	const model = getModel(provider, modelId);
	if (!model) {
		recordFailure({ model: fullId, ok: false, error: "model not found in registry" });
		continue;
	}

	// Tasks run one after another so each elapsed time covers a single request.
	for (const task of tasks) {
		const started = performance.now();
		let text = "";
		let result;
		try {
			const stream = streamSimpleOpenAICompletions(
				model,
				{
					systemPrompt: "You are a precise software engineering benchmark model. Follow requested output formats exactly.",
					messages: [{ role: "user", content: task.prompt, timestamp: Date.now() }],
				},
				{ temperature: 0, maxTokens: task.maxTokens },
			);
			// Only text deltas are accumulated; every other event type is ignored.
			for await (const event of stream) {
				if (event.type === "text_delta") text += event.delta;
			}
			result = await stream.result();
		} catch (error) {
			// A thrown error fails this task only; the remaining tasks and models still run.
			recordFailure({
				model: fullId,
				task: task.id,
				ok: false,
				elapsedMs: Math.round(performance.now() - started),
				error: error instanceof Error ? error.message : String(error),
			});
			continue;
		}

		const elapsedMs = Math.round(performance.now() - started);
		// A reply passes only if the stream did not stop with "error" and the task's check accepts it.
		const passed = result.stopReason !== "error" && task.check(text);
		results.push({
			model: fullId,
			task: task.id,
			ok: passed,
			stopReason: result.stopReason,
			errorMessage: result.errorMessage,
			elapsedMs,
			chars: text.length,
			usage: result.usage,
			// Keep the report small: store only the first 700 characters of the reply.
			sample: text.slice(0, 700),
		});
		console.log(`${passed ? "PASS" : "FAIL"} ${fullId} ${task.id} ${elapsedMs}ms ${result.stopReason}`);
	}
}

// Summary of the run: when it was written, which models and tasks were selected,
// and every collected result record.
const taskIds = tasks.map(({ id }) => id);
const report = {
	createdAt: new Date().toISOString(),
	models: selectedModels,
	tasks: taskIds,
	results,
};

// Create the destination directory (and any missing parents) before writing the
// pretty-printed JSON report, terminated by a newline.
const reportDirectory = dirname(outputPath);
mkdirSync(reportDirectory, { recursive: true });
const reportText = `${JSON.stringify(report, null, 2)}\n`;
writeFileSync(outputPath, reportText);
console.log(`wrote ${outputPath}`);

/**
 * Parse long-form command-line options into a flat object of strings.
 *
 * Supported forms:
 *   --key value   -> { key: "value" }
 *   --key=value   -> { key: "value" }  (everything after the first "=" is the value,
 *                                        kept verbatim, including an empty string)
 *   --flag        -> { flag: "true" }  (no following value, or the next token is an option)
 *
 * Tokens that do not start with "--" and are not consumed as a value are ignored.
 * When an option is repeated, the last occurrence wins.
 *
 * @param {string[]} argv - Arguments, typically `process.argv.slice(2)`.
 * @returns {Record<string, string>} Option names (without the leading "--") mapped to values.
 */
function parseArgs(argv) {
	const parsed = {};
	for (let i = 0; i < argv.length; i++) {
		const arg = argv[i];
		if (!arg.startsWith("--")) continue;
		const key = arg.slice(2);

		// Inline form: without this, "--max-models=2" would be stored under the key
		// "max-models=2" and the real option would silently keep its default.
		const equalsAt = key.indexOf("=");
		if (equalsAt > 0) {
			parsed[key.slice(0, equalsAt)] = key.slice(equalsAt + 1);
			continue;
		}

		const next = argv[i + 1];
		if (!next || next.startsWith("--")) {
			parsed[key] = "true";
		} else {
			parsed[key] = next;
			// The value token has been consumed; skip it.
			i++;
		}
	}
	return parsed;
}

/**
 * Replace the sf-scoped environment variables of this process with values taken
 * from the sops-encrypted secrets file.
 *
 * Steps:
 *  1. Delete every variable named in the `sf` wrapper's `sf_scoped_env` array.
 *  2. Run `sops -d` on the secrets file.
 *  3. Pipe the decrypted YAML through `yq -r` to emit `KEY=value` lines drawn from
 *     scalar entries directly under `.sf`, from `.sf.env`, and from each
 *     `.sf.providers.<name>.env`.
 *  4. Copy each line whose key looks like an environment variable name into `process.env`.
 *
 * Best effort: if either command fails or prints nothing, the function returns
 * without reporting anything, and the variables removed in step 1 stay removed.
 *
 * @returns {Promise<void>}
 */
async function loadSfScopedEnv() {
	const home = homedir();
	const wrapperPath = `${home}/.local/bin/sf`;
	const sopsConfig = `${home}/.dotfiles/.sops.yaml`;
	const secretsFile = `${home}/.dotfiles/secrets/api-keys.yaml`;

	// Clear the scoped variables up front so only values from the secrets file remain.
	readSfScopedEnvNames(wrapperPath).forEach((name) => {
		delete process.env[name];
	});

	// stderr is discarded for both child processes.
	const sops = spawnSync("sops", ["--config", sopsConfig, "-d", secretsFile], {
		encoding: "utf8",
		stdio: ["ignore", "pipe", "ignore"],
	});
	const sopsProducedOutput = sops.status === 0 && Boolean(sops.stdout);
	if (!sopsProducedOutput) return;

	const yq = spawnSync("yq", [
		"-r",
		`(
			(.sf // {} | to_entries[]
				| select((.value | type) == "string" or (.value | type) == "number" or (.value | type) == "boolean")
				| select(.value != null and .value != "")
				| "\\(.key)=\\(.value)"),
			(.sf.env // {} | to_entries[]
				| select(.value != null and .value != "")
				| "\\(.key)=\\(.value)"),
			(.sf.providers // {} | to_entries[]
				| (.value.env // {})
				| to_entries[]
				| select(.value != null and .value != "")
				| "\\(.key)=\\(.value)")
		)`,
	], {
		input: sops.stdout,
		encoding: "utf8",
		stdio: ["pipe", "pipe", "ignore"],
	});
	const yqProducedOutput = yq.status === 0 && Boolean(yq.stdout);
	if (!yqProducedOutput) return;

	const envNamePattern = /^[A-Za-z_][A-Za-z0-9_]*$/;
	for (const entry of yq.stdout.split(/\r?\n/)) {
		// Split on the first "=" only, so values may themselves contain "=".
		const separatorAt = entry.indexOf("=");
		if (separatorAt < 1) continue;
		const name = entry.slice(0, separatorAt);
		const value = entry.slice(separatorAt + 1);
		if (value === "" || !envNamePattern.test(name)) continue;
		process.env[name] = value;
	}
}

/**
 * Extract the variable names listed in the `sf_scoped_env=( ... )` array of the
 * wrapper script at `wrapperPath`.
 *
 * The array must open with `sf_scoped_env=(` followed by a newline and close with
 * `)` at the start of a line. Inside it, only lines that (after trimming) consist
 * solely of upper-case letters, digits and underscores are returned; anything else
 * is skipped.
 *
 * @param {string} wrapperPath - Path of the wrapper script to read.
 * @returns {string[]} The names in file order, or an empty array when the file
 *   cannot be read or contains no such array.
 */
function readSfScopedEnvNames(wrapperPath) {
	const envNamePattern = /^[A-Z0-9_]+$/;
	try {
		const arrayLiteral = /sf_scoped_env=\(\n([\s\S]*?)\n\)/.exec(readFileSync(wrapperPath, "utf8"));
		if (arrayLiteral === null) return [];

		const names = [];
		for (const rawLine of arrayLiteral[1].split(/\r?\n/)) {
			const candidate = rawLine.trim();
			if (envNamePattern.test(candidate)) names.push(candidate);
		}
		return names;
	} catch {
		// A missing or unreadable wrapper simply means there is nothing to scope.
		return [];
	}
}
Add provider smoke benchmark and headless updates 2026-04-30 10:19:18 +02:00			`#!/usr/bin/env node`

			`import { readFileSync, writeFileSync, mkdirSync } from "node:fs";`
			`import { homedir } from "node:os";`
			`import { dirname, resolve } from "node:path";`
			`import { spawnSync } from "node:child_process";`
			`import { performance } from "node:perf_hooks";`

			`const repoRoot = resolve(import.meta.dirname, "..");`
			const defaultOutputPath = resolve(repoRoot, ".sf", "model-benchmarks", `${new Date().toISOString().replace(/[:.]/g, "-")}.json`);

			`const args = parseArgs(process.argv.slice(2));`
			`const modelsArg = args.models ?? args.model;`
			`const outputPath = resolve(args.output ?? defaultOutputPath);`
			`const maxModels = Number.parseInt(args.maxModels ?? args["max-models"] ?? "8", 10);`
			`const maxTokens = Number.parseInt(args.maxTokens ?? args["max-tokens"] ?? "420", 10);`

			`await loadSfScopedEnv();`

			`const { getModel, streamSimpleOpenAICompletions } = await import("../packages/pi-ai/src/index.ts");`

			`const modelIds = modelsArg`
			`? modelsArg.split(",").map((s) => s.trim()).filter(Boolean)`
			`: [`
sf snapshot: pre-dispatch, uncommitted changes after 97m inactivity 2026-04-30 15:11:45 +02:00			`"kimi-coding/kimi-k2.6",`
Add provider smoke benchmark and headless updates 2026-04-30 10:19:18 +02:00			`"minimax/MiniMax-M2.7-highspeed",`
Use GLM 4.5 for Zai smoke benchmark 2026-04-30 10:39:17 +02:00			`"zai/glm-4.5",`
Add provider smoke benchmark and headless updates 2026-04-30 10:19:18 +02:00			`"mistral/devstral-latest",`
			`"alibaba-coding-plan/qwen3-coder-plus",`
			`"xiaomi/mimo-v2-pro",`
			`"opencode-go/minimax-m2.7",`
			`"openrouter/inclusionai/ling-2.6-1t:free",`
			`];`

			`const tasks = [`
			`{`
			`id: "json-repair",`
			`maxTokens: Math.min(maxTokens, 280),`
			prompt: `Return ONLY valid JSON matching { "bug": string, "fix": string, "tests": string[] }.
			`Broken payload: {"bug":"path traversal\\n- accepts ../foo","fix":123,"tests":"none"}.`
			Normalize it semantically; no markdown.`,
			`check: (text) => {`
			`try {`
			`const parsed = JSON.parse(text);`
			`return typeof parsed.bug === "string" && typeof parsed.fix === "string" && Array.isArray(parsed.tests);`
			`} catch {`
			`return false;`
			`}`
			`},`
			`},`
			`{`
			`id: "path-debug",`
			`maxTokens,`
			prompt: `Find the bug and propose the minimal patch. Code:
			`function isSafe(base, target) {`
			`const resolved = path.resolve(base, target)`
			`return resolved.startsWith(base)`
			`}`
			Explain why it is unsafe in <= 8 bullets, then provide a corrected JS function.`,
			`check: (text) => /startsWith\|prefix/i.test(text) && /path\.sep\|relative\|normalize\|resolve/i.test(text),`
			`},`
			`{`
			`id: "routing-plan",`
			`maxTokens,`
			prompt: `Produce a concise implementation plan with risks and verification for migrating an LLM routing table from alias k2p5 to semantic ids kimi-k2.5 and kimi-k2.6.`,
			`check: (text) => /kimi-k2\.5/.test(text) && /kimi-k2\.6/.test(text) && /test\|verify\|validation/i.test(text),`
			`},`
			`];`

			`const selectedModels = modelIds.slice(0, Number.isFinite(maxModels) ? maxModels : modelIds.length);`
			`const results = [];`

			`for (const fullId of selectedModels) {`
			`const slash = fullId.indexOf("/");`
			`if (slash === -1) {`
			`results.push({ model: fullId, ok: false, error: "expected provider/model id" });`
			`continue;`
			`}`
			`const provider = fullId.slice(0, slash);`
			`const modelId = fullId.slice(slash + 1);`
			`const model = getModel(provider, modelId);`
			`if (!model) {`
			`results.push({ model: fullId, ok: false, error: "model not found in registry" });`
			`continue;`
			`}`

			`for (const task of tasks) {`
			`const started = performance.now();`
			`let text = "";`
			`let result;`
			`try {`
			`const stream = streamSimpleOpenAICompletions(`
			`model,`
			`{`
			`systemPrompt: "You are a precise software engineering benchmark model. Follow requested output formats exactly.",`
			`messages: [{ role: "user", content: task.prompt, timestamp: Date.now() }],`
			`},`
			`{ temperature: 0, maxTokens: task.maxTokens },`
			`);`
			`for await (const event of stream) {`
			`if (event.type === "text_delta") text += event.delta;`
			`}`
			`result = await stream.result();`
			`} catch (error) {`
			`results.push({`
			`model: fullId,`
			`task: task.id,`
			`ok: false,`
			`elapsedMs: Math.round(performance.now() - started),`
			`error: error instanceof Error ? error.message : String(error),`
			`});`
			`continue;`
			`}`

			`const elapsedMs = Math.round(performance.now() - started);`
			`const passed = result.stopReason !== "error" && task.check(text);`
			`results.push({`
			`model: fullId,`
			`task: task.id,`
			`ok: passed,`
			`stopReason: result.stopReason,`
			`errorMessage: result.errorMessage,`
			`elapsedMs,`
			`chars: text.length,`
			`usage: result.usage,`
			`sample: text.slice(0, 700),`
			`});`
			console.log(`${passed ? "PASS" : "FAIL"} ${fullId} ${task.id} ${elapsedMs}ms ${result.stopReason}`);
			`}`
			`}`

			`const report = {`
			`createdAt: new Date().toISOString(),`
			`models: selectedModels,`
			`tasks: tasks.map((t) => t.id),`
			`results,`
			`};`

			`mkdirSync(dirname(outputPath), { recursive: true });`
			writeFileSync(outputPath, `${JSON.stringify(report, null, 2)}\n`);
			console.log(`wrote ${outputPath}`);

			`function parseArgs(argv) {`
			`const parsed = {};`
			`for (let i = 0; i < argv.length; i++) {`
			`const arg = argv[i];`
			`if (!arg.startsWith("--")) continue;`
			`const key = arg.slice(2);`
			`const next = argv[i + 1];`
			`if (!next \|\| next.startsWith("--")) {`
			`parsed[key] = "true";`
			`} else {`
			`parsed[key] = next;`
			`i++;`
			`}`
			`}`
			`return parsed;`
			`}`

			`async function loadSfScopedEnv() {`
			const secretsFile = `${homedir()}/.dotfiles/secrets/api-keys.yaml`;
			const sopsConfig = `${homedir()}/.dotfiles/.sops.yaml`;
			const wrapperPath = `${homedir()}/.local/bin/sf`;
			`const envNames = readSfScopedEnvNames(wrapperPath);`
			`for (const name of envNames) delete process.env[name];`

			`const decrypted = spawnSync("sops", ["--config", sopsConfig, "-d", secretsFile], {`
			`encoding: "utf8",`
			`stdio: ["ignore", "pipe", "ignore"],`
			`});`
			`if (decrypted.status !== 0 \|\| !decrypted.stdout) return;`

			`const extracted = spawnSync("yq", [`
			`"-r",`
			`(
			`(.sf // {} \| to_entries[]`
			`\| select((.value \| type) == "string" or (.value \| type) == "number" or (.value \| type) == "boolean")`
			`\| select(.value != null and .value != "")`
			`\| "\\(.key)=\\(.value)"),`
			`(.sf.env // {} \| to_entries[]`
			`\| select(.value != null and .value != "")`
			`\| "\\(.key)=\\(.value)"),`
			`(.sf.providers // {} \| to_entries[]`
			`\| (.value.env // {})`
			`\| to_entries[]`
			`\| select(.value != null and .value != "")`
			`\| "\\(.key)=\\(.value)")`
			)`,
			`], {`
			`input: decrypted.stdout,`
			`encoding: "utf8",`
			`stdio: ["pipe", "pipe", "ignore"],`
			`});`
			`if (extracted.status !== 0 \|\| !extracted.stdout) return;`

			`for (const line of extracted.stdout.split(/\r?\n/)) {`
			`const idx = line.indexOf("=");`
			`if (idx <= 0) continue;`
			`const key = line.slice(0, idx);`
			`const value = line.slice(idx + 1);`
			`if (/^[A-Za-z_][A-Za-z0-9_]*$/.test(key) && value) process.env[key] = value;`
			`}`
			`}`

			`function readSfScopedEnvNames(wrapperPath) {`
			`try {`
			`const source = readFileSync(wrapperPath, "utf8");`
			`const match = source.match(/sf_scoped_env=\(\n([\s\S]*?)\n\)/);`
			`if (!match) return [];`
			`return match[1]`
			`.split(/\r?\n/)`
			`.map((line) => line.trim())`
			`.filter((line) => /^[A-Z0-9_]+$/.test(line));`
			`} catch {`
			`return [];`
			`}`
			`}`