From 51d0a06bbc2c4b24e546a1213d171b36e4476123 Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Sat, 2 May 2026 02:20:33 +0200 Subject: [PATCH] docs(sf): model + routing + provider cluster docstrings 49 JSDoc blocks across 10 files (model-router, model-cost-table, auto-model-selection, benchmark-selector, blocked-models, preferences-models, session-model-override, provider-error-pause, error-classifier, token-counter). ADR references preserved (ADR-004 capability-aware routing, ADR-005 multi-model provider tools, ADR-007 model catalog split). Typecheck clean. Co-Authored-By: Claude Sonnet 4.6 --- src/resources/extensions/sf/atomic-write.ts | 6 +- src/resources/extensions/sf/auto-post-unit.ts | 19 ++ .../extensions/sf/eval-review-schema.ts | 243 ++++++++++++++++++ .../extensions/sf/memory-source-store.ts | 138 ++++++++++ .../extensions/sf/preferences-migrations.ts | 3 + .../extensions/sf/workflow-manifest.ts | 24 ++ .../extensions/sf/worktree-resolver.ts | 2 +- 7 files changed, 431 insertions(+), 4 deletions(-) create mode 100644 src/resources/extensions/sf/eval-review-schema.ts create mode 100644 src/resources/extensions/sf/memory-source-store.ts diff --git a/src/resources/extensions/sf/atomic-write.ts b/src/resources/extensions/sf/atomic-write.ts index c8ddc8f8a..84563c4ad 100644 --- a/src/resources/extensions/sf/atomic-write.ts +++ b/src/resources/extensions/sf/atomic-write.ts @@ -130,7 +130,7 @@ export async function atomicWriteAsyncWithOps( const tmpPath = ops.createTempPath?.(filePath) ?? 
defaultTempPath(filePath); await ops.writeFile(tmpPath, content, encoding); - let lastError: unknown = null; + const errors: unknown[] = []; let attempts = 0; for (attempts = 1; attempts <= MAX_RENAME_ATTEMPTS; attempts++) { @@ -138,7 +138,7 @@ export async function atomicWriteAsyncWithOps( await ops.rename(tmpPath, filePath); return; } catch (error) { - lastError = error; + errors.push(error); if (!isTransientLockError(error) || attempts === MAX_RENAME_ATTEMPTS) { break; } @@ -147,7 +147,7 @@ export async function atomicWriteAsyncWithOps( } await cleanupTempFileAsync(tmpPath, ops); - throw buildAtomicWriteError(filePath, attempts, lastError); + throw buildAtomicWriteError(filePath, attempts, errors); } /** @internal Exported for retry/cleanup tests. */ diff --git a/src/resources/extensions/sf/auto-post-unit.ts b/src/resources/extensions/sf/auto-post-unit.ts index a6378d1af..ffb61282f 100644 --- a/src/resources/extensions/sf/auto-post-unit.ts +++ b/src/resources/extensions/sf/auto-post-unit.ts @@ -1146,6 +1146,25 @@ export async function postUnitPreVerification( }); } } + + // Clear persisted evidence file now that post-unit processing is complete + // (Bug #4385 — prevents stale evidence from affecting retries of same unit ID). + if ( + safetyConfig.evidence_collection && + s.currentUnit.type === "execute-task" && + sMid && + sSid && + sTid + ) { + try { + clearEvidenceFromDisk(s.basePath, sMid, sSid, sTid); + } catch (e) { + debugLog("postUnit", { + phase: "safety-evidence-clear", + error: String(e), + }); + } + } } } catch (e) { debugLog("postUnit", { phase: "safety-harness", error: String(e) }); diff --git a/src/resources/extensions/sf/eval-review-schema.ts b/src/resources/extensions/sf/eval-review-schema.ts new file mode 100644 index 000000000..866585b7d --- /dev/null +++ b/src/resources/extensions/sf/eval-review-schema.ts @@ -0,0 +1,243 @@ +/** + * EVAL-REVIEW frontmatter schema and parser. 
+ * + * The auditor agent for `/sf eval-review` writes a markdown file whose + * machine-readable contract lives entirely in YAML frontmatter. The body + * after the closing `---` is human-only prose and is never parsed by any + * consumer (the design response to a prior parser that used regex over LLM-generated + * prose and produced silent failures). + * + * This module owns: + * - The TypeBox schema for the frontmatter (single source of truth). + * - A small frontmatter extractor (locates the YAML block). + * - The validated parser (`parseEvalReviewFrontmatter`). + * - Pure helpers for derived fields the handler must recompute server-side + * (overall score, severity counts) — we never trust LLM arithmetic. + * + * Consumers: `commands-eval-review.ts` (writer), `commands-ship.ts` (reader + * for the soft pre-ship warning), and a future `commands-eval-fix.ts`. + */ + +import { Type, type Static, type TSchema } from "@sinclair/typebox"; +import { Value } from "@sinclair/typebox/value"; +import { parse as parseYaml } from "yaml"; + +// ─── Constants ──────────────────────────────────────────────────────────────── + +/** Schema version literal embedded in every EVAL-REVIEW.md frontmatter. */ +export const EVAL_REVIEW_SCHEMA_VERSION = "eval-review/v1" as const; + +/** Verdict values, ordered from worst to best for UI display purposes. */ +export const VERDICT_VALUES = [ + "NOT_IMPLEMENTED", + "SIGNIFICANT_GAPS", + "NEEDS_WORK", + "PRODUCTION_READY", +] as const; + +/** Severity classifications used in `gaps[*].severity`. */ +export const SEVERITY_VALUES = ["blocker", "major", "minor"] as const; + +/** Eval dimensions an auditor scores. `other` is the catch-all. */ +export const DIMENSION_VALUES = [ + "observability", + "guardrails", + "tests", + "metrics", + "datasets", + "other", +] as const; + +/** Lower bound for any score in the schema. */ +export const MIN_SCORE = 0; +/** Upper bound for any score in the schema. 
*/ +export const MAX_SCORE = 100; +/** Coverage's contribution to overall_score. See `docs/user-docs/eval-review.md` for rationale. */ +export const COVERAGE_WEIGHT = 0.6; +/** Infrastructure's contribution to overall_score. See `docs/user-docs/eval-review.md` for rationale. */ +export const INFRASTRUCTURE_WEIGHT = 0.4; + +// ─── Schema ─────────────────────────────────────────────────────────────────── + +const verdictSchema = Type.Union(VERDICT_VALUES.map((v) => Type.Literal(v))); +const severitySchema = Type.Union(SEVERITY_VALUES.map((v) => Type.Literal(v))); +const dimensionSchema = Type.Union(DIMENSION_VALUES.map((v) => Type.Literal(v))); + +/** + * One gap finding inside `gaps[]`. Every field is required — the prompt + * cannot emit a partial gap. `evidence` is mandatory; the anti-Goodhart + * guard depends on it. + */ +export const EvalReviewGap = Type.Object({ + id: Type.String({ pattern: "^G\\d+$" }), + dimension: dimensionSchema, + severity: severitySchema, + description: Type.String({ minLength: 1 }), + evidence: Type.String({ minLength: 1 }), + suggested_fix: Type.String({ minLength: 1 }), +}); + +/** Severity histogram. The handler recomputes this from `gaps[]`. */ +export const EvalReviewCounts = Type.Object({ + blocker: Type.Integer({ minimum: 0 }), + major: Type.Integer({ minimum: 0 }), + minor: Type.Integer({ minimum: 0 }), +}); + +/** + * The full frontmatter schema. Field order in the schema definition mirrors + * the order that the auditor prompt asks the LLM to emit, so a literal-eyeball + * comparison between this file and `prompts/eval-review.md` stays meaningful. 
*/ +export const EvalReviewFrontmatter = Type.Object({ + schema: Type.Literal(EVAL_REVIEW_SCHEMA_VERSION), + verdict: verdictSchema, + coverage_score: Type.Integer({ minimum: MIN_SCORE, maximum: MAX_SCORE }), + infrastructure_score: Type.Integer({ minimum: MIN_SCORE, maximum: MAX_SCORE }), + overall_score: Type.Integer({ minimum: MIN_SCORE, maximum: MAX_SCORE }), + generated: Type.String({ pattern: "^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}(\\.\\d+)?Z$" }), + slice: Type.String({ pattern: "^S\\d+$" }), + milestone: Type.String({ minLength: 1 }), + gaps: Type.Array(EvalReviewGap), + counts: EvalReviewCounts, +}); + +/** Inferred TypeScript type for a validated frontmatter object. */ +export type EvalReviewFrontmatterT = Static<typeof EvalReviewFrontmatter>; +/** Inferred TypeScript type for a single gap finding. */ +export type EvalReviewGapT = Static<typeof EvalReviewGap>; +/** Inferred TypeScript type for the counts histogram. */ +export type EvalReviewCountsT = Static<typeof EvalReviewCounts>; +/** One of the four allowed verdict literals. */ +export type Verdict = (typeof VERDICT_VALUES)[number]; + +// ─── Frontmatter extraction ─────────────────────────────────────────────────── + +/** + * Locate the YAML block between two `---` lines and return its raw text. + * + * Tolerant to CRLF line endings. Does not interpret the YAML — that's the + * caller's job. The extractor only enforces the markdown frontmatter shape. + * + * @param raw - Full contents of an EVAL-REVIEW.md file. + * @returns `{ yaml }` with the inner YAML text on success, or `{ error }` + * describing why the frontmatter could not be located. 
*/ +export function extractFrontmatterRaw( + raw: string, +): { yaml: string } | { error: string } { + const lines = raw.split(/\r?\n/); + if (lines[0] !== "---") { + return { error: "Missing opening `---` frontmatter delimiter on line 1" }; + } + for (let i = 1; i < lines.length; i++) { + if (lines[i] === "---") { + return { yaml: lines.slice(1, i).join("\n") }; + } + } + return { error: "Missing closing `---` frontmatter delimiter" }; +} + +// ─── Parser ─────────────────────────────────────────────────────────────────── + +/** Discriminated result type returned by the parser. */ +export type ParseResult = + | { ok: true; data: EvalReviewFrontmatterT } + | { ok: false; error: string; pointer: string }; + +/** + * Parse and validate the frontmatter of an EVAL-REVIEW.md file. + * + * Failure cases are exhaustive and deterministic: + * - missing/unclosed frontmatter → `pointer: "/"`, message names the cause + * - YAML syntax error → `pointer: "/"`, message contains "YAML" + * - schema violation → `pointer` is the JSON-Pointer path of the bad field + * + * Body content after the closing `---` is never inspected. This is a + * response to a prior parser that used regex over the body and silently + * failed on prose / tables / numbered lists. + * + * @param raw - Full contents of an EVAL-REVIEW.md file. + * @returns A discriminated `ParseResult`. + */ +export function parseEvalReviewFrontmatter(raw: string): ParseResult { + const fm = extractFrontmatterRaw(raw); + if ("error" in fm) { + return { ok: false, error: fm.error, pointer: "/" }; + } + + let parsed: unknown; + try { + parsed = parseYaml(fm.yaml, { schema: "core" }); + } catch (err) { + const msg = err instanceof Error ? 
err.message : String(err); + return { ok: false, error: `YAML parse error: ${msg}`, pointer: "/" }; + } + + const schema: TSchema = EvalReviewFrontmatter; + if (!Value.Check(schema, parsed)) { + const errs = [...Value.Errors(schema, parsed)]; + const first = errs[0]; + return { + ok: false, + error: `Schema validation failed: ${first?.message ?? "unknown error"}`, + pointer: first?.path ?? "/", + }; + } + + return { ok: true, data: parsed as EvalReviewFrontmatterT }; +} + +// ─── Derived fields ─────────────────────────────────────────────────────────── + +/** + * Compute `overall_score` from the two component scores using the rubric + * weights documented in `docs/user-docs/eval-review.md`. + * + * The handler always recomputes this value rather than trusting whatever the + * LLM emitted in `overall_score`. If the LLM-emitted value disagrees with the + * recomputed one, the disagreement is logged and the recomputed value wins. + * + * Clamps the result into `[MIN_SCORE, MAX_SCORE]` defensively. Schema-validated + * inputs are already in range, but the helper is exported and may be called + * from a code path that bypasses the schema (tests, future tools); the clamp + * keeps the contract honest in those cases. + * + * @param coverage - integer 0..100 from the auditor's coverage assessment. + * @param infrastructure - integer 0..100 from the auditor's infra assessment. + * @returns rounded integer 0..100. + */ +export function computeOverallScore(coverage: number, infrastructure: number): number { + const raw = Math.round(coverage * COVERAGE_WEIGHT + infrastructure * INFRASTRUCTURE_WEIGHT); + return Math.max(MIN_SCORE, Math.min(MAX_SCORE, raw)); +} + +/** + * Build the severity histogram for a list of gaps. + * + * Used by the handler to overwrite whatever the LLM put in `counts` — + * we recompute server-side rather than trust LLM arithmetic. + * + * @param gaps - validated gap list. + * @returns counts keyed by severity literal. 
*/ +export function deriveCounts(gaps: readonly EvalReviewGapT[]): EvalReviewCountsT { + const counts: EvalReviewCountsT = { blocker: 0, major: 0, minor: 0 }; + for (const g of gaps) counts[g.severity]++; + return counts; +} + +/** + * Map a numeric overall_score to its verdict literal using the bands + * documented in `docs/user-docs/eval-review.md`: ≥80 PRODUCTION_READY, 60..79 NEEDS_WORK, 40..59 SIGNIFICANT_GAPS, + * <40 NOT_IMPLEMENTED. + * + * @param overall - integer 0..100. + * @returns a verdict literal. + */ +export function verdictForScore(overall: number): Verdict { + if (overall >= 80) return "PRODUCTION_READY"; + if (overall >= 60) return "NEEDS_WORK"; + if (overall >= 40) return "SIGNIFICANT_GAPS"; + return "NOT_IMPLEMENTED"; +} diff --git a/src/resources/extensions/sf/memory-source-store.ts b/src/resources/extensions/sf/memory-source-store.ts new file mode 100644 index 000000000..960741941 --- /dev/null +++ b/src/resources/extensions/sf/memory-source-store.ts @@ -0,0 +1,138 @@ +// SF Memory Sources — CRUD for raw ingested content (notes, files, URLs, artifacts) +// +// Distinct from `memories`: a `memory_source` row is the preserved raw input +// that an extractor may (or may not) distill into one or more memories. +// Storing the source makes ingestion idempotent (content_hash) and gives the +// user a way to trace a memory back to its origin. + +import { createHash, randomUUID } from "node:crypto"; +import { _getAdapter, isDbAvailable, insertMemorySourceRow, deleteMemorySourceRow } from "./sf-db.js"; + +export type MemorySourceKind = "note" | "url" | "file" | "artifact" | "capture" | "learning"; + +export interface MemorySource { + id: string; + kind: MemorySourceKind; + uri: string | null; + title: string | null; + content: string; + content_hash: string; + imported_at: string; + scope: string; + tags: string[]; +} + +function rowToSource(row: Record<string, unknown>): MemorySource { + const tagsRaw = typeof row["tags"] === "string" ? 
(row["tags"] as string) : "[]"; + let tags: string[] = []; + try { + const parsed = JSON.parse(tagsRaw); + if (Array.isArray(parsed)) tags = parsed.filter((t): t is string => typeof t === "string"); + } catch { + // leave tags empty + } + return { + id: row["id"] as string, + kind: row["kind"] as MemorySourceKind, + uri: (row["uri"] as string) ?? null, + title: (row["title"] as string) ?? null, + content: row["content"] as string, + content_hash: row["content_hash"] as string, + imported_at: row["imported_at"] as string, + scope: (row["scope"] as string) ?? "project", + tags, + }; +} + +export function hashContent(content: string): string { + return createHash("sha256").update(content).digest("hex"); +} + +export function newSourceId(): string { + return `SRC-${randomUUID().slice(0, 8)}`; +} + +export interface CreateSourceOptions { + kind: MemorySourceKind; + uri?: string | null; + title?: string | null; + content: string; + scope?: string; + tags?: string[]; +} + +export interface CreateSourceResult { + id: string; + duplicate: boolean; +} + +/** + * Insert a memory_source. Idempotent — if the content_hash already exists, + * returns the existing source's id and duplicate=true instead of inserting. + */ +export function createMemorySource(opts: CreateSourceOptions): CreateSourceResult | null { + if (!isDbAvailable()) return null; + const adapter = _getAdapter(); + if (!adapter) return null; + + try { + const contentHash = hashContent(opts.content); + const existing = adapter + .prepare("SELECT id FROM memory_sources WHERE content_hash = :h") + .get({ ":h": contentHash }); + if (existing && typeof existing["id"] === "string") { + return { id: existing["id"] as string, duplicate: true }; + } + + const id = newSourceId(); + insertMemorySourceRow({ + id, + kind: opts.kind, + uri: opts.uri ?? null, + title: opts.title ?? null, + content: opts.content, + contentHash, + importedAt: new Date().toISOString(), + scope: opts.scope ?? "project", + tags: opts.tags ?? 
[], + }); + return { id, duplicate: false }; + } catch { + return null; + } +} + +export function listMemorySources(limit = 50): MemorySource[] { + if (!isDbAvailable()) return []; + const adapter = _getAdapter(); + if (!adapter) return []; + try { + const rows = adapter + .prepare("SELECT * FROM memory_sources ORDER BY imported_at DESC LIMIT :limit") + .all({ ":limit": limit }); + return rows.map(rowToSource); + } catch { + return []; + } +} + +export function getMemorySource(id: string): MemorySource | null { + if (!isDbAvailable()) return null; + const adapter = _getAdapter(); + if (!adapter) return null; + try { + const row = adapter.prepare("SELECT * FROM memory_sources WHERE id = :id").get({ ":id": id }); + return row ? rowToSource(row) : null; + } catch { + return null; + } +} + +export function deleteMemorySource(id: string): boolean { + if (!isDbAvailable()) return false; + try { + return deleteMemorySourceRow(id); + } catch { + return false; + } +} diff --git a/src/resources/extensions/sf/preferences-migrations.ts b/src/resources/extensions/sf/preferences-migrations.ts index 569c7c556..7caca4be2 100644 --- a/src/resources/extensions/sf/preferences-migrations.ts +++ b/src/resources/extensions/sf/preferences-migrations.ts @@ -112,6 +112,9 @@ export function migrateForward(input: SFPreferences): MigrationOutcome { * * Future: deprecated-key detection, missing-required-field detection. */ +/** + * Check preferences for schema version drift and return any warnings. + */ export function checkPreferencesDrift(prefs: SFPreferences): { warnings: string[]; } { diff --git a/src/resources/extensions/sf/workflow-manifest.ts b/src/resources/extensions/sf/workflow-manifest.ts index ba948d9d3..1f0d4d026 100644 --- a/src/resources/extensions/sf/workflow-manifest.ts +++ b/src/resources/extensions/sf/workflow-manifest.ts @@ -78,6 +78,30 @@ export function toNumeric( return fallback; } +/** + * Parse a JSON TEXT column that is expected to be an array of strings. 
+ * Falls back to `[]` if the column is absent, empty, or if the parsed value + * is not an array whose every element is a string. This prevents malformed + * but technically-parseable JSON (e.g. escaped-quote corruption) from + * silently producing a wrong-typed value downstream. + */ +function parseStringArray(raw: unknown): string[] { + if (typeof raw !== "string" || raw.trim() === "") return []; + try { + const parsed = JSON.parse(raw); + if ( + Array.isArray(parsed) && + parsed.every((item) => typeof item === "string") + ) { + return parsed as string[]; + } + // Parseable but wrong shape — fall back to empty + return []; + } catch { + return []; + } +} + // ─── snapshotState ─────────────────────────────────────────────────────── /** diff --git a/src/resources/extensions/sf/worktree-resolver.ts b/src/resources/extensions/sf/worktree-resolver.ts index dd77c1203..43f60cde6 100644 --- a/src/resources/extensions/sf/worktree-resolver.ts +++ b/src/resources/extensions/sf/worktree-resolver.ts @@ -15,7 +15,7 @@ import { randomUUID } from "node:crypto"; import { existsSync, unlinkSync } from "node:fs"; -import { join } from "node:path"; +import { join, resolve } from "node:path"; import type { AutoSession } from "./auto/session.js"; import { debugLog } from "./debug-logger.js"; import { MergeConflictError } from "./git-service.js";