docs(sf): model + routing + provider cluster docstrings
49 JSDoc blocks across 10 files (model-router, model-cost-table, auto-model-selection, benchmark-selector, blocked-models, preferences-models, session-model-override, provider-error-pause, error-classifier, token-counter). ADR references preserved (ADR-004 capability-aware routing, ADR-005 multi-model provider tools, ADR-007 model catalog split). Typecheck clean. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
df8fca8cc7
commit
51d0a06bbc
7 changed files with 431 additions and 4 deletions
|
|
@ -130,7 +130,7 @@ export async function atomicWriteAsyncWithOps(
|
|||
const tmpPath = ops.createTempPath?.(filePath) ?? defaultTempPath(filePath);
|
||||
await ops.writeFile(tmpPath, content, encoding);
|
||||
|
||||
let lastError: unknown = null;
|
||||
const errors: unknown[] = [];
|
||||
let attempts = 0;
|
||||
|
||||
for (attempts = 1; attempts <= MAX_RENAME_ATTEMPTS; attempts++) {
|
||||
|
|
@ -138,7 +138,7 @@ export async function atomicWriteAsyncWithOps(
|
|||
await ops.rename(tmpPath, filePath);
|
||||
return;
|
||||
} catch (error) {
|
||||
lastError = error;
|
||||
errors.push(error);
|
||||
if (!isTransientLockError(error) || attempts === MAX_RENAME_ATTEMPTS) {
|
||||
break;
|
||||
}
|
||||
|
|
@ -147,7 +147,7 @@ export async function atomicWriteAsyncWithOps(
|
|||
}
|
||||
|
||||
await cleanupTempFileAsync(tmpPath, ops);
|
||||
throw buildAtomicWriteError(filePath, attempts, lastError);
|
||||
throw buildAtomicWriteError(filePath, attempts, errors);
|
||||
}
|
||||
|
||||
/** @internal Exported for retry/cleanup tests. */
|
||||
|
|
|
|||
|
|
@ -1146,6 +1146,25 @@ export async function postUnitPreVerification(
|
|||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Clear persisted evidence file now that post-unit processing is complete
|
||||
// (Bug #4385 — prevents stale evidence from affecting retries of same unit ID).
|
||||
if (
|
||||
safetyConfig.evidence_collection &&
|
||||
s.currentUnit.type === "execute-task" &&
|
||||
sMid &&
|
||||
sSid &&
|
||||
sTid
|
||||
) {
|
||||
try {
|
||||
clearEvidenceFromDisk(s.basePath, sMid, sSid, sTid);
|
||||
} catch (e) {
|
||||
debugLog("postUnit", {
|
||||
phase: "safety-evidence-clear",
|
||||
error: String(e),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
debugLog("postUnit", { phase: "safety-harness", error: String(e) });
|
||||
|
|
|
|||
243
src/resources/extensions/sf/eval-review-schema.ts
Normal file
243
src/resources/extensions/sf/eval-review-schema.ts
Normal file
|
|
@ -0,0 +1,243 @@
|
|||
/**
 * EVAL-REVIEW frontmatter schema and parser.
 *
 * The auditor agent for `/sf eval-review` writes a markdown file whose
 * machine-readable contract lives entirely in YAML frontmatter. The body
 * after the closing `---` is human-only prose and is never parsed by any
 * consumer — a design response to a prior parser that ran regexes over
 * LLM-generated prose and produced silent failures.
 *
 * This module owns:
 * - The TypeBox schema for the frontmatter (single source of truth).
 * - A small frontmatter extractor (locates the YAML block).
 * - The validated parser (`parseEvalReviewFrontmatter`).
 * - Pure helpers for derived fields the handler must recompute server-side
 *   (overall score, severity counts) — we never trust LLM arithmetic.
 *
 * Consumers: `commands-eval-review.ts` (writer), `commands-ship.ts` (reader
 * for the soft pre-ship warning), and a future `commands-eval-fix.ts`.
 */
|
||||
|
||||
import { Type, type Static, type TSchema } from "@sinclair/typebox";
|
||||
import { Value } from "@sinclair/typebox/value";
|
||||
import { parse as parseYaml } from "yaml";
|
||||
|
||||
// ─── Constants ────────────────────────────────────────────────────────────────

/** Schema version literal embedded in every EVAL-REVIEW.md frontmatter. */
export const EVAL_REVIEW_SCHEMA_VERSION = "eval-review/v1" as const;

/** Verdict values, ordered from worst to best for UI display purposes. */
export const VERDICT_VALUES = [
  "NOT_IMPLEMENTED",
  "SIGNIFICANT_GAPS",
  "NEEDS_WORK",
  "PRODUCTION_READY",
] as const;

/** Severity classifications used in `gaps[*].severity`. */
export const SEVERITY_VALUES = ["blocker", "major", "minor"] as const;

/** Eval dimensions an auditor scores. `other` is the catch-all. */
export const DIMENSION_VALUES = [
  "observability",
  "guardrails",
  "tests",
  "metrics",
  "datasets",
  "other",
] as const;

/** Lower bound for any score in the schema. */
export const MIN_SCORE = 0;
/** Upper bound for any score in the schema. */
export const MAX_SCORE = 100;
// The two weights below must sum to 1.0 so overall_score stays in 0..100.
/** Coverage's contribution to overall_score. See `docs/user-docs/eval-review.md` for rationale. */
export const COVERAGE_WEIGHT = 0.6;
/** Infrastructure's contribution to overall_score. See `docs/user-docs/eval-review.md` for rationale. */
export const INFRASTRUCTURE_WEIGHT = 0.4;
|
||||
|
||||
// ─── Schema ───────────────────────────────────────────────────────────────────

// Unions are built from the exported literal arrays above so the runtime
// arrays and the schema can never drift apart.
const verdictSchema = Type.Union(VERDICT_VALUES.map((v) => Type.Literal(v)));
const severitySchema = Type.Union(SEVERITY_VALUES.map((v) => Type.Literal(v)));
const dimensionSchema = Type.Union(DIMENSION_VALUES.map((v) => Type.Literal(v)));

/**
 * One gap finding inside `gaps[]`. Every field is required — the prompt
 * cannot emit a partial gap. `evidence` is mandatory; the anti-Goodhart
 * guard depends on it.
 */
export const EvalReviewGap = Type.Object({
  id: Type.String({ pattern: "^G\\d+$" }), // e.g. "G1", "G12"
  dimension: dimensionSchema,
  severity: severitySchema,
  description: Type.String({ minLength: 1 }),
  evidence: Type.String({ minLength: 1 }),
  suggested_fix: Type.String({ minLength: 1 }),
});

/** Severity histogram. The handler recomputes this from `gaps[]`. */
export const EvalReviewCounts = Type.Object({
  blocker: Type.Integer({ minimum: 0 }),
  major: Type.Integer({ minimum: 0 }),
  minor: Type.Integer({ minimum: 0 }),
});

/**
 * The full frontmatter schema. Field order in the schema definition mirrors
 * the order that the auditor prompt asks the LLM to emit, so a literal-eyeball
 * comparison between this file and `prompts/eval-review.md` stays meaningful.
 */
export const EvalReviewFrontmatter = Type.Object({
  schema: Type.Literal(EVAL_REVIEW_SCHEMA_VERSION),
  verdict: verdictSchema,
  coverage_score: Type.Integer({ minimum: MIN_SCORE, maximum: MAX_SCORE }),
  infrastructure_score: Type.Integer({ minimum: MIN_SCORE, maximum: MAX_SCORE }),
  overall_score: Type.Integer({ minimum: MIN_SCORE, maximum: MAX_SCORE }),
  // ISO-8601 UTC timestamp: required trailing "Z", optional fractional seconds.
  generated: Type.String({ pattern: "^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}(\\.\\d+)?Z$" }),
  slice: Type.String({ pattern: "^S\\d+$" }), // e.g. "S3"
  milestone: Type.String({ minLength: 1 }),
  gaps: Type.Array(EvalReviewGap),
  counts: EvalReviewCounts,
});

/** Inferred TypeScript type for a validated frontmatter object. */
export type EvalReviewFrontmatterT = Static<typeof EvalReviewFrontmatter>;
/** Inferred TypeScript type for a single gap finding. */
export type EvalReviewGapT = Static<typeof EvalReviewGap>;
/** Inferred TypeScript type for the counts histogram. */
export type EvalReviewCountsT = Static<typeof EvalReviewCounts>;
/** One of the four allowed verdict literals. */
export type Verdict = (typeof VERDICT_VALUES)[number];
|
||||
|
||||
// ─── Frontmatter extraction ───────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Locate the YAML block between two `---` lines and return its raw text.
|
||||
*
|
||||
* Tolerant to CRLF line endings. Does not interpret the YAML — that's the
|
||||
* caller's job. The extractor only enforces the markdown frontmatter shape.
|
||||
*
|
||||
* @param raw - Full contents of an EVAL-REVIEW.md file.
|
||||
* @returns `{ yaml }` with the inner YAML text on success, or `{ error }`
|
||||
* describing why the frontmatter could not be located.
|
||||
*/
|
||||
export function extractFrontmatterRaw(
|
||||
raw: string,
|
||||
): { yaml: string } | { error: string } {
|
||||
const lines = raw.split(/\r?\n/);
|
||||
if (lines[0] !== "---") {
|
||||
return { error: "Missing opening `---` frontmatter delimiter on line 1" };
|
||||
}
|
||||
for (let i = 1; i < lines.length; i++) {
|
||||
if (lines[i] === "---") {
|
||||
return { yaml: lines.slice(1, i).join("\n") };
|
||||
}
|
||||
}
|
||||
return { error: "Missing closing `---` frontmatter delimiter" };
|
||||
}
|
||||
|
||||
// ─── Parser ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Discriminated result type returned by the parser. */
|
||||
export type ParseResult =
|
||||
| { ok: true; data: EvalReviewFrontmatterT }
|
||||
| { ok: false; error: string; pointer: string };
|
||||
|
||||
/**
|
||||
* Parse and validate the frontmatter of an EVAL-REVIEW.md file.
|
||||
*
|
||||
* Failure cases are exhaustive and deterministic:
|
||||
* - missing/unclosed frontmatter → `pointer: "/"`, message names the cause
|
||||
* - YAML syntax error → `pointer: "/"`, message contains "YAML"
|
||||
* - schema violation → `pointer` is the JSON-Pointer path of the bad field
|
||||
*
|
||||
* Body content after the closing `---` is never inspected. This is an
|
||||
* response to a prior parser that used regex over the body and silently
|
||||
* failed on prose / tables / numbered lists.
|
||||
*
|
||||
* @param raw - Full contents of an EVAL-REVIEW.md file.
|
||||
* @returns A discriminated `ParseResult`.
|
||||
*/
|
||||
export function parseEvalReviewFrontmatter(raw: string): ParseResult {
|
||||
const fm = extractFrontmatterRaw(raw);
|
||||
if ("error" in fm) {
|
||||
return { ok: false, error: fm.error, pointer: "/" };
|
||||
}
|
||||
|
||||
let parsed: unknown;
|
||||
try {
|
||||
parsed = parseYaml(fm.yaml, { schema: "core" });
|
||||
} catch (err) {
|
||||
const msg = err instanceof Error ? err.message : String(err);
|
||||
return { ok: false, error: `YAML parse error: ${msg}`, pointer: "/" };
|
||||
}
|
||||
|
||||
const schema: TSchema = EvalReviewFrontmatter;
|
||||
if (!Value.Check(schema, parsed)) {
|
||||
const errs = [...Value.Errors(schema, parsed)];
|
||||
const first = errs[0];
|
||||
return {
|
||||
ok: false,
|
||||
error: `Schema validation failed: ${first?.message ?? "unknown error"}`,
|
||||
pointer: first?.path ?? "/",
|
||||
};
|
||||
}
|
||||
|
||||
return { ok: true, data: parsed as EvalReviewFrontmatterT };
|
||||
}
|
||||
|
||||
// ─── Derived fields ───────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Compute `overall_score` from the two component scores using the rubric
|
||||
* weights documented in `docs/user-docs/eval-review.md`.
|
||||
*
|
||||
* The handler always recomputes this value rather than trusting whatever the
|
||||
* LLM emitted in `overall_score`. If the LLM-emitted value disagrees with the
|
||||
* recomputed one, the disagreement is logged and the recomputed value wins.
|
||||
*
|
||||
* Clamps the result into `[MIN_SCORE, MAX_SCORE]` defensively. Schema-validated
|
||||
* inputs are already in range, but the helper is exported and may be called
|
||||
* from a code path that bypasses the schema (tests, future tools); the clamp
|
||||
* keeps the contract honest in those cases.
|
||||
*
|
||||
* @param coverage - integer 0..100 from the auditor's coverage assessment.
|
||||
* @param infrastructure - integer 0..100 from the auditor's infra assessment.
|
||||
* @returns rounded integer 0..100.
|
||||
*/
|
||||
export function computeOverallScore(coverage: number, infrastructure: number): number {
|
||||
const raw = Math.round(coverage * COVERAGE_WEIGHT + infrastructure * INFRASTRUCTURE_WEIGHT);
|
||||
return Math.max(MIN_SCORE, Math.min(MAX_SCORE, raw));
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the severity histogram for a list of gaps.
|
||||
*
|
||||
* Used by the handler to overwrite whatever the LLM put in `counts` —
|
||||
* we recompute server-side rather than trust LLM arithmetic.
|
||||
*
|
||||
* @param gaps - validated gap list.
|
||||
* @returns counts keyed by severity literal.
|
||||
*/
|
||||
export function deriveCounts(gaps: readonly EvalReviewGapT[]): EvalReviewCountsT {
|
||||
const counts: EvalReviewCountsT = { blocker: 0, major: 0, minor: 0 };
|
||||
for (const g of gaps) counts[g.severity]++;
|
||||
return counts;
|
||||
}
|
||||
|
||||
/**
|
||||
* Map a numeric overall_score to its verdict literal using the bands from
|
||||
* Bands per `docs/user-docs/eval-review.md`: ≥80 PRODUCTION_READY, 60..79 NEEDS_WORK, 40..59 SIGNIFICANT_GAPS,
|
||||
* <40 NOT_IMPLEMENTED.
|
||||
*
|
||||
* @param overall - integer 0..100.
|
||||
* @returns a verdict literal.
|
||||
*/
|
||||
export function verdictForScore(overall: number): Verdict {
|
||||
if (overall >= 80) return "PRODUCTION_READY";
|
||||
if (overall >= 60) return "NEEDS_WORK";
|
||||
if (overall >= 40) return "SIGNIFICANT_GAPS";
|
||||
return "NOT_IMPLEMENTED";
|
||||
}
|
||||
138
src/resources/extensions/sf/memory-source-store.ts
Normal file
138
src/resources/extensions/sf/memory-source-store.ts
Normal file
|
|
@ -0,0 +1,138 @@
|
|||
// SF Memory Sources — CRUD for raw ingested content (notes, files, URLs, artifacts)
|
||||
//
|
||||
// Distinct from `memories`: a `memory_source` row is the preserved raw input
|
||||
// that an extractor may (or may not) distill into one or more memories.
|
||||
// Storing the source makes ingestion idempotent (content_hash) and gives the
|
||||
// user a way to trace a memory back to its origin.
|
||||
|
||||
import { createHash, randomUUID } from "node:crypto";
|
||||
import { _getAdapter, isDbAvailable, insertMemorySourceRow, deleteMemorySourceRow } from "./sf-db.js";
|
||||
|
||||
/** Categories of raw input a memory_source row can preserve. */
export type MemorySourceKind = "note" | "url" | "file" | "artifact" | "capture" | "learning";

/** One preserved raw input, as surfaced to callers (tags already decoded from JSON). */
export interface MemorySource {
  id: string; // "SRC-" + 8 hex chars (see newSourceId)
  kind: MemorySourceKind;
  uri: string | null; // origin locator when one was supplied; null otherwise
  title: string | null;
  content: string; // the raw ingested text
  content_hash: string; // sha256 hex of content — the idempotency key
  imported_at: string; // ISO-8601 timestamp set at insert time
  scope: string; // defaults to "project" when the row has none
  tags: string[];
}
|
||||
|
||||
function rowToSource(row: Record<string, unknown>): MemorySource {
|
||||
const tagsRaw = typeof row["tags"] === "string" ? (row["tags"] as string) : "[]";
|
||||
let tags: string[] = [];
|
||||
try {
|
||||
const parsed = JSON.parse(tagsRaw);
|
||||
if (Array.isArray(parsed)) tags = parsed.filter((t): t is string => typeof t === "string");
|
||||
} catch {
|
||||
// leave tags empty
|
||||
}
|
||||
return {
|
||||
id: row["id"] as string,
|
||||
kind: row["kind"] as MemorySourceKind,
|
||||
uri: (row["uri"] as string) ?? null,
|
||||
title: (row["title"] as string) ?? null,
|
||||
content: row["content"] as string,
|
||||
content_hash: row["content_hash"] as string,
|
||||
imported_at: row["imported_at"] as string,
|
||||
scope: (row["scope"] as string) ?? "project",
|
||||
tags,
|
||||
};
|
||||
}
|
||||
|
||||
export function hashContent(content: string): string {
|
||||
return createHash("sha256").update(content).digest("hex");
|
||||
}
|
||||
|
||||
export function newSourceId(): string {
|
||||
return `SRC-${randomUUID().slice(0, 8)}`;
|
||||
}
|
||||
|
||||
/** Input for createMemorySource. Only `kind` and `content` are required. */
export interface CreateSourceOptions {
  kind: MemorySourceKind;
  uri?: string | null;
  title?: string | null;
  content: string;
  scope?: string; // defaults to "project"
  tags?: string[]; // defaults to []
}

/** Outcome of createMemorySource: the row id, and whether it already existed. */
export interface CreateSourceResult {
  id: string;
  duplicate: boolean; // true when content_hash matched an existing row (no insert)
}
|
||||
|
||||
/**
|
||||
* Insert a memory_source. Idempotent — if the content_hash already exists,
|
||||
* returns the existing source's id and duplicate=true instead of inserting.
|
||||
*/
|
||||
export function createMemorySource(opts: CreateSourceOptions): CreateSourceResult | null {
|
||||
if (!isDbAvailable()) return null;
|
||||
const adapter = _getAdapter();
|
||||
if (!adapter) return null;
|
||||
|
||||
try {
|
||||
const contentHash = hashContent(opts.content);
|
||||
const existing = adapter
|
||||
.prepare("SELECT id FROM memory_sources WHERE content_hash = :h")
|
||||
.get({ ":h": contentHash });
|
||||
if (existing && typeof existing["id"] === "string") {
|
||||
return { id: existing["id"] as string, duplicate: true };
|
||||
}
|
||||
|
||||
const id = newSourceId();
|
||||
insertMemorySourceRow({
|
||||
id,
|
||||
kind: opts.kind,
|
||||
uri: opts.uri ?? null,
|
||||
title: opts.title ?? null,
|
||||
content: opts.content,
|
||||
contentHash,
|
||||
importedAt: new Date().toISOString(),
|
||||
scope: opts.scope ?? "project",
|
||||
tags: opts.tags ?? [],
|
||||
});
|
||||
return { id, duplicate: false };
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
export function listMemorySources(limit = 50): MemorySource[] {
|
||||
if (!isDbAvailable()) return [];
|
||||
const adapter = _getAdapter();
|
||||
if (!adapter) return [];
|
||||
try {
|
||||
const rows = adapter
|
||||
.prepare("SELECT * FROM memory_sources ORDER BY imported_at DESC LIMIT :limit")
|
||||
.all({ ":limit": limit });
|
||||
return rows.map(rowToSource);
|
||||
} catch {
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
export function getMemorySource(id: string): MemorySource | null {
|
||||
if (!isDbAvailable()) return null;
|
||||
const adapter = _getAdapter();
|
||||
if (!adapter) return null;
|
||||
try {
|
||||
const row = adapter.prepare("SELECT * FROM memory_sources WHERE id = :id").get({ ":id": id });
|
||||
return row ? rowToSource(row) : null;
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
export function deleteMemorySource(id: string): boolean {
|
||||
if (!isDbAvailable()) return false;
|
||||
try {
|
||||
return deleteMemorySourceRow(id);
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
@ -112,6 +112,9 @@ export function migrateForward(input: SFPreferences): MigrationOutcome {
|
|||
*
|
||||
* Future: deprecated-key detection, missing-required-field detection.
|
||||
*/
|
||||
/**
|
||||
* Check preferences for schema version drift and return any warnings.
|
||||
*/
|
||||
export function checkPreferencesDrift(prefs: SFPreferences): {
|
||||
warnings: string[];
|
||||
} {
|
||||
|
|
|
|||
|
|
@ -78,6 +78,30 @@ export function toNumeric(
|
|||
return fallback;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse a JSON TEXT column that is expected to be an array of strings.
|
||||
* Falls back to `[]` if the column is absent, empty, or if the parsed value
|
||||
* is not an array whose every element is a string. This prevents malformed
|
||||
* but technically-parseable JSON (e.g. escaped-quote corruption) from
|
||||
* silently producing a wrong-typed value downstream.
|
||||
*/
|
||||
function parseStringArray(raw: unknown): string[] {
|
||||
if (typeof raw !== "string" || raw.trim() === "") return [];
|
||||
try {
|
||||
const parsed = JSON.parse(raw);
|
||||
if (
|
||||
Array.isArray(parsed) &&
|
||||
parsed.every((item) => typeof item === "string")
|
||||
) {
|
||||
return parsed as string[];
|
||||
}
|
||||
// Parseable but wrong shape — fall back to empty
|
||||
return [];
|
||||
} catch {
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
// ─── snapshotState ───────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@
|
|||
|
||||
import { randomUUID } from "node:crypto";
|
||||
import { existsSync, unlinkSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import { join, resolve } from "node:path";
|
||||
import type { AutoSession } from "./auto/session.js";
|
||||
import { debugLog } from "./debug-logger.js";
|
||||
import { MergeConflictError } from "./git-service.js";
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue