feat(sf): query-aware memory ranking — embeddings now actually matter

Previous commit populated memory_embeddings rows but no consumer read
them — the read path (getActiveMemoriesRanked) used pure static score
(confidence × hit_count). Embeddings were silent.

This wires the read side:
- rankMemoriesByEmbedding (pure, in memory-embeddings.ts) blends static
  score with cosine similarity: combined = static * (1 + α * cosine).
  Defaults to α=0.6 — a perfect-static + zero-similarity hit ties roughly
  with a low-static + perfect-similarity hit, so semantically relevant
  cold memories can surface above stale-but-popular ones.
- embedQueryViaGateway + loadEmbeddingMap — supporting helpers.
- getRelevantMemoriesRanked (memory-store.ts) — async query-aware ranker.
  Oversamples the static pool 5×, embeds the query, blends, returns top-K.
  Falls back cleanly to static ranking when:
    - query empty
    - no SF_LLM_GATEWAY_KEY (gateway not configured)
    - gateway request fails (500/network)
    - no embeddings exist yet (fresh DB / worker offline)
- auto-prompts.ts: execute-task injection now uses sliceTitle + taskTitle
  as the query so memories relevant to the current work surface first.

10 new tests lock the contract — pure ranker math, fallback chain, and
the gateway-mocked promotion case.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Mikael Hugo 2026-05-02 22:18:45 +02:00
parent 56ee89a946
commit eb5f7ef7b6
4 changed files with 367 additions and 2 deletions

View file

@ -37,6 +37,7 @@ import { inlineGraphSubgraph } from "./graph-context.js";
import {
formatMemoriesForPrompt,
getActiveMemoriesRanked,
getRelevantMemoriesRanked,
} from "./memory-store.js";
import { parseRoadmap } from "./parsers.js";
import {
@ -2479,9 +2480,17 @@ export async function buildExecuteTaskPrompt(
{ pending: new Set(etPending.map((g) => g.gate_id)), allowOmit: true },
);
const memoriesSection = (() => {
// Query-aware memory ranking: build a short query from the active task
// context so embeddings can promote semantically-relevant memories above
// the cold static-rank top. Falls back to pure static ranking when no
// gateway is configured or no embeddings exist yet — see
// getRelevantMemoriesRanked for the fallback chain.
const memoryQuery = `${sTitle} ${tTitle}`.trim();
const memoriesSection = await (async () => {
try {
const memories = getActiveMemoriesRanked(10);
const memories = memoryQuery
? await getRelevantMemoriesRanked(memoryQuery, 10)
: getActiveMemoriesRanked(10);
if (memories.length === 0) return "## Project Memories\n(none yet)";
return `## Project Memories\n${formatMemoriesForPrompt(memories)}`;
} catch {

View file

@ -234,6 +234,73 @@ export async function embedMemories(
}
}
// ─── Query-aware ranking ──────────────────────────────────────────────────
/**
 * Blend a static rank (confidence × hit-count boost) with semantic similarity
 * to an embedded query. When the query vector is missing or no memory has an
 * embedding row, the input list is returned in its original order so callers
 * can rely on the static ranking as a fallback.
 */
export function rankMemoriesByEmbedding(
  memories: Array<{ id: string; staticScore: number }>,
  queryVector: Float32Array | null,
  memoryEmbeddings: Map<string, Float32Array>,
  options?: { semanticWeight?: number },
): Array<{ id: string; combinedScore: number; cosine: number }> {
  const semanticWeight = options?.semanticWeight ?? 0.6;

  // Fallback: nothing to compare against — preserve the static order as-is.
  if (queryVector === null || memoryEmbeddings.size === 0) {
    return memories.map(({ id, staticScore }) => ({
      id,
      combinedScore: staticScore,
      cosine: 0,
    }));
  }

  const scored = memories.map(({ id, staticScore }) => {
    const vector = memoryEmbeddings.get(id);
    // A memory without an embedding row keeps its pure static score (cosine 0).
    const cosine = vector !== undefined ? cosineSimilarity(queryVector, vector) : 0;
    // Static score in [0, ~1.5+]; cosine in [-1, 1] but typically [0, 1].
    // Blend so a perfect static + perfect cosine ≈ 2× a static-only top hit.
    return {
      id,
      combinedScore: staticScore * (1 + semanticWeight * cosine),
      cosine,
    };
  });

  scored.sort((left, right) => right.combinedScore - left.combinedScore);
  return scored;
}
/**
 * Best-effort: embed `query` through the configured gateway and resolve to its
 * Float32Array. Resolves to null when the query is blank, no gateway is
 * configured, or the embed call throws (logged as a warning, never rethrown).
 */
export async function embedQueryViaGateway(
  query: string,
): Promise<Float32Array | null> {
  const trimmed = query.trim();
  if (trimmed === "") return null;
  try {
    // Lazy-load the gateway client so callers without one pay no import cost.
    const gateway = await import("./memory-embeddings-llm-gateway.js");
    const config = gateway.loadGatewayConfigFromEnv();
    if (!config) return null;
    const embed = gateway.createGatewayEmbedFn(config);
    const [vector] = await embed([query]);
    return vector ?? null;
  } catch (err) {
    logWarning(
      "memory-embeddings",
      `query embed failed: ${(err as Error).message}`,
    );
    return null;
  }
}
/**
 * Build a memoryId → vector map covering every stored embedding row, letting
 * the ranker resolve vectors with O(1) lookups instead of N+1 queries.
 */
export function loadEmbeddingMap(): Map<string, Float32Array> {
  return new Map(
    loadAllEmbeddings().map(
      (row): [string, Float32Array] => [row.memoryId, row.vector],
    ),
  );
}
// ─── Auto-engagement / backfill driver ────────────────────────────────────
/** Find active memories (not superseded) that don't yet have an embedding row.

View file

@ -138,6 +138,58 @@ export function getActiveMemoriesRanked(limit = 30): Memory[] {
}
}
/**
 * Query-aware ranking: when an embedding gateway is configured and at least
 * some memories have vectors, rerank the top static-pool by combining the
 * static score with cosine similarity to the embedded query. Falls back
 * cleanly to pure static ranking when:
 *   - query is empty
 *   - no SF_LLM_GATEWAY_KEY (or gateway unreachable)
 *   - no memories have vectors yet (fresh DB or worker offline)
 *
 * The pool oversample (`limit * 5`, capped at 50 but never below `limit`)
 * ensures the embedding rerank can promote a relevant-but-static-cold memory
 * into the top-K.
 *
 * @param query free-text query used for semantic similarity (may be empty)
 * @param limit maximum number of memories returned
 * @returns up to `limit` memories, best first; `[]` when the DB is unavailable
 */
export async function getRelevantMemoriesRanked(
  query: string,
  limit = 10,
): Promise<Memory[]> {
  if (!isDbAvailable()) return [];
  // Oversample so the rerank has cold candidates to promote. Never request
  // fewer than `limit`: a plain Math.min(50, limit * 5) would under-fill the
  // result when a caller asks for more than 50.
  const poolSize = Math.max(limit, Math.min(50, limit * 5));
  const pool = getActiveMemoriesRanked(poolSize);
  if (pool.length === 0 || !query.trim()) {
    return pool.slice(0, limit);
  }
  try {
    const { embedQueryViaGateway, loadEmbeddingMap, rankMemoriesByEmbedding } =
      await import("./memory-embeddings.js");
    // Check the (synchronous) embedding map first: when nothing is embedded
    // yet there is no point paying a gateway round-trip to embed the query.
    const embeddingMap = loadEmbeddingMap();
    if (embeddingMap.size === 0) return pool.slice(0, limit);
    const queryVec = await embedQueryViaGateway(query);
    if (!queryVec) return pool.slice(0, limit);
    const ranked = rankMemoriesByEmbedding(
      pool.map((m) => ({
        id: m.id,
        // Mirror the static ranking formula: confidence boosted by hit count.
        staticScore: m.confidence * (1 + m.hit_count * 0.1),
      })),
      queryVec,
      embeddingMap,
    );
    // Map ranked ids back to full Memory rows, keeping only the top `limit`.
    const byId = new Map(pool.map((m) => [m.id, m]));
    const out: Memory[] = [];
    for (const r of ranked) {
      const mem = byId.get(r.id);
      if (mem) out.push(mem);
      if (out.length >= limit) break;
    }
    return out;
  } catch {
    // Best-effort by design: any failure (import, gateway, ranking) degrades
    // to the static ranking rather than breaking prompt construction.
    return pool.slice(0, limit);
  }
}
/**
* Generate the next memory ID: MEM + zero-padded 3-digit from MAX(seq).
* Returns MEM001 if no memories exist.

View file

@ -0,0 +1,237 @@
/**
* Query-aware memory ranking combines static rank with embedding cosine
* similarity. Tests the pure ranker (no I/O) and the end-to-end async path
* with a mocked gateway.
*
* The contract being locked here:
* - empty query / no gateway / no embeddings static order preserved
* - query + gateway + embeddings semantically relevant memory promoted
* even when its static score is lower
*/
import assert from "node:assert/strict";
import { mkdtempSync, rmSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, beforeEach, describe, test, vi } from "vitest";
import {
loadEmbeddingMap,
rankMemoriesByEmbedding,
saveEmbedding,
} from "../memory-embeddings.ts";
import { closeDatabase, openDatabase } from "../sf-db.ts";
import {
createMemory,
getActiveMemoriesRanked,
getRelevantMemoriesRanked,
} from "../memory-store.ts";
// Per-test scratch directory holding an isolated SQLite DB.
let tmpDir: string;
// Snapshot of the process environment taken at module load; each test starts
// from (and is restored to) this baseline so env mutations cannot leak.
const envSnapshot = { ...process.env };

beforeEach(() => {
  tmpDir = mkdtempSync(join(tmpdir(), "sf-memory-rank-"));
  openDatabase(join(tmpDir, "sf.db"));
  process.env = { ...envSnapshot };
});

afterEach(() => {
  closeDatabase();
  rmSync(tmpDir, { recursive: true, force: true });
  vi.restoreAllMocks();
  process.env = { ...envSnapshot };
});
describe("rankMemoriesByEmbedding (pure)", () => {
  // Shared fixture: "a" leads on static score, "b" trails it.
  const staticPair = () => [
    { id: "a", staticScore: 1.0 },
    { id: "b", staticScore: 0.5 },
  ];

  test("returns static order unchanged when query vector is null", () => {
    const ranked = rankMemoriesByEmbedding(staticPair(), null, new Map());
    assert.deepEqual(
      ranked.map((entry) => entry.id),
      ["a", "b"],
    );
  });

  test("returns static order unchanged when embedding map is empty", () => {
    const queryVec = Float32Array.from([1, 0, 0]);
    const ranked = rankMemoriesByEmbedding(staticPair(), queryVec, new Map());
    assert.deepEqual(
      ranked.map((entry) => entry.id),
      ["a", "b"],
    );
  });

  test("promotes semantically aligned memory above lower-similarity higher-static", () => {
    const queryVec = Float32Array.from([1, 0, 0]);
    const vectors = new Map<string, Float32Array>([
      // "a" is static-strong but orthogonal to the query.
      ["a", Float32Array.from([0, 1, 0])],
      // "b" is static-weak but perfectly aligned with the query.
      ["b", Float32Array.from([1, 0, 0])],
    ]);
    const ranked = rankMemoriesByEmbedding(
      [
        { id: "a", staticScore: 1.0 },
        { id: "b", staticScore: 0.7 },
      ],
      queryVec,
      vectors,
      { semanticWeight: 0.6 },
    );
    // b: 0.7 * (1 + 0.6 * 1.0) = 1.12 beats a: 1.0 * (1 + 0.6 * 0.0) = 1.00.
    assert.equal(ranked[0].id, "b");
    assert.equal(ranked[1].id, "a");
  });

  test("memory without an embedding row falls back to pure static score", () => {
    const queryVec = Float32Array.from([1, 0, 0]);
    const vectors = new Map<string, Float32Array>([
      ["a", Float32Array.from([1, 0, 0])],
    ]);
    const ranked = rankMemoriesByEmbedding(
      [
        { id: "a", staticScore: 0.5 },
        { id: "b", staticScore: 0.4 },
      ],
      queryVec,
      vectors,
      { semanticWeight: 0.6 },
    );
    // a: 0.5 * (1 + 0.6 * 1.0) = 0.80; b has no vector → 0.4 * (1 + 0) = 0.40.
    assert.equal(ranked[0].id, "a");
    assert.equal(ranked[0].cosine, 1);
    assert.equal(ranked[1].id, "b");
    assert.equal(ranked[1].cosine, 0);
  });
});
describe("loadEmbeddingMap", () => {
  test("returns vectors keyed by memoryId for active memories", () => {
    const first = createMemory({ category: "architecture", content: "alpha" });
    const second = createMemory({ category: "architecture", content: "beta" });
    assert.ok(first && second);
    saveEmbedding(first, Float32Array.from([1, 2, 3]), "test-model");
    saveEmbedding(second, Float32Array.from([4, 5, 6]), "test-model");
    const vectorsById = loadEmbeddingMap();
    assert.equal(vectorsById.size, 2);
    assert.deepEqual(Array.from(vectorsById.get(first)!), [1, 2, 3]);
    assert.deepEqual(Array.from(vectorsById.get(second)!), [4, 5, 6]);
  });
});
describe("getRelevantMemoriesRanked (async, mocked gateway)", () => {
  test("falls back to static ranking when SF_LLM_GATEWAY_KEY unset", async () => {
    // Without a gateway key the ranker must return static order and never
    // touch the network.
    delete process.env.SF_LLM_GATEWAY_KEY;
    createMemory({ category: "architecture", content: "alpha" });
    createMemory({ category: "architecture", content: "beta" });
    const out = await getRelevantMemoriesRanked("anything", 10);
    assert.equal(out.length, 2);
    // Stub fetch only AFTER the first call: the zero-call assertion below
    // therefore covers the second invocation, proving the fallback path
    // performs no fetch.
    const fetchMock = vi.fn();
    vi.stubGlobal("fetch", fetchMock);
    // Re-run to confirm no fetch happens on subsequent calls either.
    await getRelevantMemoriesRanked("anything", 10);
    assert.equal(fetchMock.mock.calls.length, 0);
  });

  test("falls back to static ranking when query is empty", async () => {
    // A configured key alone is not enough — an empty query must
    // short-circuit before any embedding work (and thus any fetch).
    process.env.SF_LLM_GATEWAY_KEY = "x";
    const a = createMemory({ category: "architecture", content: "alpha" });
    assert.ok(a);
    const fetchMock = vi.fn();
    vi.stubGlobal("fetch", fetchMock);
    const out = await getRelevantMemoriesRanked("", 10);
    assert.equal(out.length, 1);
    assert.equal(fetchMock.mock.calls.length, 0);
  });

  test("uses gateway embedding to promote relevant memory", async () => {
    process.env.SF_LLM_GATEWAY_KEY = "x";
    process.env.SF_LLM_GATEWAY_URL = "https://gateway.test/v1";
    // createMemory's return value is used both as the saveEmbedding key and
    // compared against out[].id below, i.e. it is the new memory's id.
    const a = createMemory({
      category: "architecture",
      content: "completely unrelated topic",
    });
    const b = createMemory({
      category: "architecture",
      content: "task plan format",
    });
    assert.ok(a && b);
    // Static order: both 0.8 confidence, hit_count 0 → tie. DB insert order
    // breaks the tie (a first). Pre-seed embeddings:
    // a is orthogonal to query (cosine 0); b is aligned (cosine 1).
    saveEmbedding(a, Float32Array.from([0, 1, 0]), "test-model");
    saveEmbedding(b, Float32Array.from([1, 0, 0]), "test-model");
    // Mock gateway → returns query vector aligned with b. The JSON body is an
    // OpenAI-style embeddings payload — presumably the shape the gateway
    // client parses; verify against memory-embeddings-llm-gateway.
    vi.stubGlobal(
      "fetch",
      vi.fn(async () =>
        new Response(
          JSON.stringify({
            object: "list",
            data: [
              {
                object: "embedding",
                index: 0,
                embedding: [1, 0, 0],
              },
            ],
          }),
          { status: 200 },
        ),
      ),
    );
    const out = await getRelevantMemoriesRanked("plan format query", 10);
    assert.equal(out.length, 2);
    assert.equal(out[0].id, b, "semantically relevant memory must rank first");
    assert.equal(out[1].id, a);
  });

  test("falls back to static ranking when gateway fails", async () => {
    // A 500 from the gateway must degrade to static ranking, not throw.
    process.env.SF_LLM_GATEWAY_KEY = "x";
    process.env.SF_LLM_GATEWAY_URL = "https://gateway.test/v1";
    const a = createMemory({ category: "architecture", content: "alpha" });
    assert.ok(a);
    vi.stubGlobal(
      "fetch",
      vi.fn(async () => new Response("boom", { status: 500 })),
    );
    const out = await getRelevantMemoriesRanked("query", 10);
    assert.equal(out.length, 1);
  });

  test("static ranking still works with no embeddings table populated", async () => {
    // No gateway key and no embedding rows: exercises the pure-static path.
    const fallbackOrder = getActiveMemoriesRanked(10).length;
    assert.equal(fallbackOrder, 0); // pre-condition: empty
    const a = createMemory({
      category: "architecture",
      content: "high-confidence",
      confidence: 0.95,
    });
    const b = createMemory({
      category: "architecture",
      content: "low-confidence",
      confidence: 0.5,
    });
    assert.ok(a && b);
    const out = await getRelevantMemoriesRanked("anything", 10);
    assert.equal(out[0].id, a, "high-confidence memory ranks first by static score");
  });
});