feat(learning): weight failure_mode in Bayesian blender — rate_limit=0.7, quota=0.2, auth=0.0

- AGGREGATE_ONE/GROUPED_SQL: compute effective_success_rate with CASE WHEN failure_mode
- AggregatedStats: add effective_success_rate, hard_failure_count fields
- computeObservedScore: uses effective_success_rate when available; 0.5x penalty if >50% hard failures
- Tests: verify rate_limit is ranked above quota_exhausted, and that the hard-failure penalty is applied

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Mikael Hugo 2026-05-10 23:20:33 +02:00
parent 2dea73398d
commit b228bc9f5c
3 changed files with 166 additions and 9 deletions

View file

@ -115,17 +115,23 @@ export function ucbBonus(modelSampleCount, totalSamples, c = DEFAULT_UCB_C) {
* weighted combination. Score is 0-100.
*
* Components:
* success: success_rate
* success: effective_success_rate (falls back to success_rate if unavailable)
* retry: 1 - min(avg_retries / maxRetries, 1) (fewer retries higher)
* verify: verification_pass_rate (or success_rate if null)
* verify: verification_pass_rate (or effective/success rate if null)
* blocker: 1 - blocker_rate (fewer blockers higher)
*
* Hard-failure penalty: if hard_failure_count / sample_count > 0.5
* (more than half of failures are quota_exhausted or auth_error), the
* final scaled score is multiplied by 0.5.
*
* @param {Object} stats - from outcome-aggregator.aggregateOutcomes
* @param {number} [stats.sample_count]
* @param {number} stats.success_rate - 0.0 to 1.0
* @param {number} stats.avg_retries - float
* @param {number} stats.success_rate - 0.0 to 1.0
* @param {number} [stats.effective_success_rate] - failure-mode-weighted rate, 0.0 to 1.0
* @param {number} [stats.hard_failure_count] - count of quota_exhausted or auth_error failures
* @param {number} stats.avg_retries - float
* @param {number|null} stats.verification_pass_rate - 0.0 to 1.0 or null
* @param {number} stats.blocker_rate - 0.0 to 1.0
* @param {number} stats.blocker_rate - 0.0 to 1.0
* @param {Object} [weights=DEFAULT_OBSERVED_WEIGHTS]
* @param {number} [maxRetries=5] - retries above this contribute 0 to retry component
* @returns {number} observed score 0 to 100
@ -141,7 +147,10 @@ export function computeObservedScore(
return NEUTRAL_OBSERVED_SCORE;
}
const successRate = stats.success_rate ?? 0;
const successRate =
stats.effective_success_rate != null
? stats.effective_success_rate
: (stats.success_rate ?? 0);
const avgRetries = stats.avg_retries ?? 0;
const verifyRate = stats.verification_pass_rate ?? successRate;
const blockerRate = stats.blocker_rate ?? 0;
@ -155,7 +164,14 @@ export function computeObservedScore(
weights.verify * verifyRate +
weights.blocker * blockerComponent;
const scaled = weighted * SCORE_SCALE;
let scaled = weighted * SCORE_SCALE;
const sampleCount = stats.sample_count ?? 0;
const hardFailureCount = stats.hard_failure_count ?? 0;
if (hardFailureCount > 0 && sampleCount > 0 && hardFailureCount / sampleCount > 0.5) {
scaled *= 0.5;
}
return Math.max(0, Math.min(SCORE_SCALE, scaled));
}

View file

@ -37,7 +37,15 @@ const AGGREGATE_ONE_SQL = `
AVG(CAST(escalated AS REAL)) AS escalation_rate,
AVG(CAST(duration_ms AS REAL)) AS avg_duration_ms,
AVG(CAST(tokens_total AS REAL)) AS avg_tokens,
AVG(CAST(cost_usd AS REAL)) AS avg_cost_usd
AVG(CAST(cost_usd AS REAL)) AS avg_cost_usd,
SUM(CASE
WHEN succeeded = 1 THEN 1.0
WHEN failure_mode = 'rate_limit' THEN 0.7
WHEN failure_mode = 'quota_exhausted' THEN 0.2
WHEN failure_mode = 'auth_error' THEN 0.0
ELSE 0.5
END) / COUNT(*) AS effective_success_rate,
SUM(CASE WHEN failure_mode IN ('quota_exhausted', 'auth_error') THEN 1 ELSE 0 END) AS hard_failure_count
FROM llm_task_outcomes
WHERE model_id = ?
AND unit_type = ?
@ -55,7 +63,15 @@ const AGGREGATE_GROUPED_SQL = `
AVG(CAST(escalated AS REAL)) AS escalation_rate,
AVG(CAST(duration_ms AS REAL)) AS avg_duration_ms,
AVG(CAST(tokens_total AS REAL)) AS avg_tokens,
AVG(CAST(cost_usd AS REAL)) AS avg_cost_usd
AVG(CAST(cost_usd AS REAL)) AS avg_cost_usd,
SUM(CASE
WHEN succeeded = 1 THEN 1.0
WHEN failure_mode = 'rate_limit' THEN 0.7
WHEN failure_mode = 'quota_exhausted' THEN 0.2
WHEN failure_mode = 'auth_error' THEN 0.0
ELSE 0.5
END) / COUNT(*) AS effective_success_rate,
SUM(CASE WHEN failure_mode IN ('quota_exhausted', 'auth_error') THEN 1 ELSE 0 END) AS hard_failure_count
FROM llm_task_outcomes
WHERE unit_type = ?
AND recorded_at > ?
@ -76,6 +92,8 @@ const TOTAL_SAMPLES_SQL = `
* @property {string} unitType
* @property {number} sample_count
* @property {number} success_rate 0.0 to 1.0
* @property {number} effective_success_rate failure-mode-weighted success rate, 0.0 to 1.0
* @property {number} hard_failure_count count of quota_exhausted or auth_error failures
* @property {number} avg_retries
* @property {number|null} verification_pass_rate 0.0 to 1.0 or null if no verification data
* @property {number} blocker_rate 0.0 to 1.0
@ -100,6 +118,8 @@ function emptyStats(modelId, unitType, windowDays) {
unitType,
sample_count: 0,
success_rate: 0,
effective_success_rate: 0,
hard_failure_count: 0,
avg_retries: 0,
verification_pass_rate: null,
blocker_rate: 0,
@ -148,6 +168,8 @@ function rowToStats(row, modelId, unitType, windowDays) {
unitType,
sample_count: toNumber(row.sample_count),
success_rate: toNumber(row.success_rate),
effective_success_rate: toNumber(row.effective_success_rate),
hard_failure_count: toNumber(row.hard_failure_count),
avg_retries: toNumber(row.avg_retries),
verification_pass_rate:
row.verification_pass_rate === null ||

View file

@ -20,6 +20,7 @@ import {
recordOutcomeBatch,
validateOutcome,
} from "./outcome-recorder.mjs";
import { computeObservedScore } from "./bayesian-blender.mjs";
// ---------------------------------------------------------------------------
// Minimal in-memory fake of the SQLite surface consumed by sf-learning.
@ -167,11 +168,22 @@ function runGroupedAggregate(_sql, params, rows) {
return out;
}
// Mirror of the SQL CASE expression: map one outcome row to its
// failure-mode-weighted success contribution (1.0 for success, partial
// credit for soft failures, zero for auth errors, 0.5 for unknown modes).
function effectiveWeight(row) {
  if (row.succeeded === 1) return 1.0;
  switch (row.failure_mode ?? null) {
    case "rate_limit":
      return 0.7;
    case "quota_exhausted":
      return 0.2;
    case "auth_error":
      return 0.0;
    default:
      return 0.5;
  }
}
function summarize(rows) {
if (rows.length === 0) {
return {
sample_count: 0,
success_rate: null,
effective_success_rate: null,
hard_failure_count: 0,
avg_retries: null,
verification_pass_rate: null,
blocker_rate: null,
@ -198,9 +210,18 @@ function summarize(rows) {
? null
: verificationVals.reduce((a, b) => a + b, 0) / verificationVals.length;
const effective_success_rate =
rows.reduce((sum, r) => sum + effectiveWeight(r), 0) / rows.length;
const hard_failure_count = rows.filter(
(r) => r.failure_mode === "quota_exhausted" || r.failure_mode === "auth_error",
).length;
return {
sample_count: rows.length,
success_rate: avg("succeeded"),
effective_success_rate,
hard_failure_count,
avg_retries: avg("retries"),
verification_pass_rate,
blocker_rate: avg("blocker_discovered"),
@ -583,3 +604,101 @@ test("recentOutcomes respects limit and filters", () => {
assert.equal(filtered[0].model_id, "a");
assert.equal(filtered[0].unit_type, "execute-task");
});
// ---------------------------------------------------------------------------
// effective_success_rate and hard_failure_count
// ---------------------------------------------------------------------------
test("aggregateOutcomes_rate_limit_failures_rank_higher_than_quota_exhausted", () => {
  const now = Date.now();
  // Factory for an outcome row that failed with the given mode.
  const failedRow = (id, unitId, failureMode, ageMs) => ({
    id,
    model_id: "model-x",
    provider: "p",
    unit_type: "execute-task",
    unit_id: unitId,
    succeeded: 0,
    failure_mode: failureMode,
    retries: 0,
    escalated: 0,
    verification_passed: null,
    blocker_discovered: 0,
    duration_ms: 100,
    tokens_total: 10,
    cost_usd: 0,
    recorded_at: now - ageMs,
  });
  // model with only rate_limit failures (weight 0.7)
  const dbRateLimit = createFakeDb();
  dbRateLimit._rows.push(
    failedRow(1, "T01", "rate_limit", 1000),
    failedRow(2, "T02", "rate_limit", 2000),
  );
  // model with only quota_exhausted failures (weight 0.2)
  const dbQuota = createFakeDb();
  dbQuota._rows.push(
    failedRow(1, "T01", "quota_exhausted", 1000),
    failedRow(2, "T02", "quota_exhausted", 2000),
  );
  const statsRateLimit = aggregateOutcomes(dbRateLimit, "model-x", "execute-task", { now });
  const statsQuota = aggregateOutcomes(dbQuota, "model-x", "execute-task", { now });
  assert.ok(
    statsRateLimit.effective_success_rate > statsQuota.effective_success_rate,
    `rate_limit (${statsRateLimit.effective_success_rate}) should exceed quota_exhausted (${statsQuota.effective_success_rate})`,
  );
  assert.ok(Math.abs(statsRateLimit.effective_success_rate - 0.7) < 1e-9);
  assert.ok(Math.abs(statsQuota.effective_success_rate - 0.2) < 1e-9);
});
test("computeObservedScore_uses_effective_success_rate_when_available", () => {
  // Shared baseline: zero raw successes, no retries/blockers/verification.
  const baseline = {
    sample_count: 10,
    success_rate: 0.0,
    avg_retries: 0,
    verification_pass_rate: null,
    blocker_rate: 0,
  };
  // Same raw success_rate, but rate_limit failures give a high weighted rate.
  const statsWithEffective = {
    ...baseline,
    effective_success_rate: 0.7,
    hard_failure_count: 0,
  };
  const scoreWith = computeObservedScore(statsWithEffective);
  const scoreWithout = computeObservedScore(baseline);
  assert.ok(
    scoreWith > scoreWithout,
    `score with effective_success_rate (${scoreWith}) should exceed score without (${scoreWithout})`,
  );
});
test("computeObservedScore_applies_0_5x_penalty_when_hard_failure_rate_exceeds_50_percent", () => {
  // Factory: identical stats except for the hard-failure count under test.
  const statsWithHardFailures = (hardFailureCount) => ({
    sample_count: 10,
    success_rate: 0.3,
    effective_success_rate: 0.3,
    hard_failure_count: hardFailureCount,
    avg_retries: 0,
    verification_pass_rate: null,
    blocker_rate: 0,
  });
  const baseScore = computeObservedScore(statsWithHardFailures(0));
  // 6/10 = 60% > 50% -> penalty applies
  const penalizedScore = computeObservedScore(statsWithHardFailures(6));
  // 5/10 = 50%, not > 50%, no penalty
  const borderlineScore = computeObservedScore(statsWithHardFailures(5));
  assert.ok(
    penalizedScore < baseScore,
    `penalized score (${penalizedScore}) should be less than base score (${baseScore})`,
  );
  assert.ok(
    Math.abs(penalizedScore - baseScore * 0.5) < 1e-9,
    `penalized score (${penalizedScore}) should be exactly half of base score (${baseScore})`,
  );
  // Exactly 50% is NOT > 50%, so no penalty
  assert.ok(
    Math.abs(borderlineScore - baseScore) < 1e-9,
    `borderline (50%) should not be penalized: ${borderlineScore} vs ${baseScore}`,
  );
});