feat(learning): weight failure_mode in Bayesian blender — rate_limit=0.7, quota=0.2, auth=0.0

- AGGREGATE_ONE/GROUPED_SQL: compute effective_success_rate with CASE WHEN failure_mode
- AggregatedStats: add effective_success_rate, hard_failure_count fields
- computeObservedScore: uses effective_success_rate when available; 0.5x penalty if >50% hard failures
- Tests: verify rate_limit is ranked above quota_exhausted, and that the hard-failure penalty is applied

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Mikael Hugo 2026-05-10 23:20:33 +02:00
parent 2dea73398d
commit b228bc9f5c
3 changed files with 166 additions and 9 deletions

View file

@ -115,17 +115,23 @@ export function ucbBonus(modelSampleCount, totalSamples, c = DEFAULT_UCB_C) {
* weighted combination. Score is 0-100.
*
* Components:
* success: success_rate
* success: effective_success_rate (falls back to success_rate if unavailable)
* retry: 1 - min(avg_retries / maxRetries, 1) (fewer retries higher)
* verify: verification_pass_rate (or success_rate if null)
* verify: verification_pass_rate (or effective/success rate if null)
* blocker: 1 - blocker_rate (fewer blockers higher)
*
* Hard-failure penalty: if hard_failure_count / sample_count > 0.5
* (more than half of failures are quota_exhausted or auth_error), the
* final scaled score is multiplied by 0.5.
*
* @param {Object} stats - from outcome-aggregator.aggregateOutcomes
* @param {number} [stats.sample_count]
* @param {number} stats.success_rate - 0.0 to 1.0
* @param {number} stats.avg_retries - float
* @param {number} stats.success_rate - 0.0 to 1.0
* @param {number} [stats.effective_success_rate] - failure-mode-weighted rate, 0.0 to 1.0
* @param {number} [stats.hard_failure_count] - count of quota_exhausted or auth_error failures
* @param {number} stats.avg_retries - float
* @param {number|null} stats.verification_pass_rate - 0.0 to 1.0 or null
* @param {number} stats.blocker_rate - 0.0 to 1.0
* @param {number} stats.blocker_rate - 0.0 to 1.0
* @param {Object} [weights=DEFAULT_OBSERVED_WEIGHTS]
* @param {number} [maxRetries=5] - retries above this contribute 0 to retry component
* @returns {number} observed score 0 to 100
@ -141,7 +147,10 @@ export function computeObservedScore(
return NEUTRAL_OBSERVED_SCORE;
}
const successRate = stats.success_rate ?? 0;
const successRate =
stats.effective_success_rate != null
? stats.effective_success_rate
: (stats.success_rate ?? 0);
const avgRetries = stats.avg_retries ?? 0;
const verifyRate = stats.verification_pass_rate ?? successRate;
const blockerRate = stats.blocker_rate ?? 0;
@ -155,7 +164,14 @@ export function computeObservedScore(
weights.verify * verifyRate +
weights.blocker * blockerComponent;
const scaled = weighted * SCORE_SCALE;
let scaled = weighted * SCORE_SCALE;
const sampleCount = stats.sample_count ?? 0;
const hardFailureCount = stats.hard_failure_count ?? 0;
if (hardFailureCount > 0 && sampleCount > 0 && hardFailureCount / sampleCount > 0.5) {
scaled *= 0.5;
}
return Math.max(0, Math.min(SCORE_SCALE, scaled));
}

View file

@ -37,7 +37,15 @@ const AGGREGATE_ONE_SQL = `
AVG(CAST(escalated AS REAL)) AS escalation_rate,
AVG(CAST(duration_ms AS REAL)) AS avg_duration_ms,
AVG(CAST(tokens_total AS REAL)) AS avg_tokens,
AVG(CAST(cost_usd AS REAL)) AS avg_cost_usd
AVG(CAST(cost_usd AS REAL)) AS avg_cost_usd,
SUM(CASE
WHEN succeeded = 1 THEN 1.0
WHEN failure_mode = 'rate_limit' THEN 0.7
WHEN failure_mode = 'quota_exhausted' THEN 0.2
WHEN failure_mode = 'auth_error' THEN 0.0
ELSE 0.5
END) / COUNT(*) AS effective_success_rate,
SUM(CASE WHEN failure_mode IN ('quota_exhausted', 'auth_error') THEN 1 ELSE 0 END) AS hard_failure_count
FROM llm_task_outcomes
WHERE model_id = ?
AND unit_type = ?
@ -55,7 +63,15 @@ const AGGREGATE_GROUPED_SQL = `
AVG(CAST(escalated AS REAL)) AS escalation_rate,
AVG(CAST(duration_ms AS REAL)) AS avg_duration_ms,
AVG(CAST(tokens_total AS REAL)) AS avg_tokens,
AVG(CAST(cost_usd AS REAL)) AS avg_cost_usd
AVG(CAST(cost_usd AS REAL)) AS avg_cost_usd,
SUM(CASE
WHEN succeeded = 1 THEN 1.0
WHEN failure_mode = 'rate_limit' THEN 0.7
WHEN failure_mode = 'quota_exhausted' THEN 0.2
WHEN failure_mode = 'auth_error' THEN 0.0
ELSE 0.5
END) / COUNT(*) AS effective_success_rate,
SUM(CASE WHEN failure_mode IN ('quota_exhausted', 'auth_error') THEN 1 ELSE 0 END) AS hard_failure_count
FROM llm_task_outcomes
WHERE unit_type = ?
AND recorded_at > ?
@ -76,6 +92,8 @@ const TOTAL_SAMPLES_SQL = `
* @property {string} unitType
* @property {number} sample_count
* @property {number} success_rate 0.0 to 1.0
* @property {number} effective_success_rate failure-mode-weighted success rate, 0.0 to 1.0
* @property {number} hard_failure_count count of quota_exhausted or auth_error failures
* @property {number} avg_retries
* @property {number|null} verification_pass_rate 0.0 to 1.0 or null if no verification data
* @property {number} blocker_rate 0.0 to 1.0
@ -100,6 +118,8 @@ function emptyStats(modelId, unitType, windowDays) {
unitType,
sample_count: 0,
success_rate: 0,
effective_success_rate: 0,
hard_failure_count: 0,
avg_retries: 0,
verification_pass_rate: null,
blocker_rate: 0,
@ -148,6 +168,8 @@ function rowToStats(row, modelId, unitType, windowDays) {
unitType,
sample_count: toNumber(row.sample_count),
success_rate: toNumber(row.success_rate),
effective_success_rate: toNumber(row.effective_success_rate),
hard_failure_count: toNumber(row.hard_failure_count),
avg_retries: toNumber(row.avg_retries),
verification_pass_rate:
row.verification_pass_rate === null ||

View file

@ -20,6 +20,7 @@ import {
recordOutcomeBatch,
validateOutcome,
} from "./outcome-recorder.mjs";
import { computeObservedScore } from "./bayesian-blender.mjs";
// ---------------------------------------------------------------------------
// Minimal in-memory fake of the SQLite surface consumed by sf-learning.
@ -167,11 +168,22 @@ function runGroupedAggregate(_sql, params, rows) {
return out;
}
// Mirror of the SQL CASE expression: map one outcome row to its
// failure-mode-weighted success contribution (1.0 for success, partial
// credit for soft failures, zero for auth errors, 0.5 for unknown modes).
function effectiveWeight(row) {
  if (row.succeeded === 1) return 1.0;
  switch (row.failure_mode ?? null) {
    case "rate_limit":
      return 0.7;
    case "quota_exhausted":
      return 0.2;
    case "auth_error":
      return 0.0;
    default:
      return 0.5;
  }
}
function summarize(rows) {
if (rows.length === 0) {
return {
sample_count: 0,
success_rate: null,
effective_success_rate: null,
hard_failure_count: 0,
avg_retries: null,
verification_pass_rate: null,
blocker_rate: null,
@ -198,9 +210,18 @@ function summarize(rows) {
? null
: verificationVals.reduce((a, b) => a + b, 0) / verificationVals.length;
const effective_success_rate =
rows.reduce((sum, r) => sum + effectiveWeight(r), 0) / rows.length;
const hard_failure_count = rows.filter(
(r) => r.failure_mode === "quota_exhausted" || r.failure_mode === "auth_error",
).length;
return {
sample_count: rows.length,
success_rate: avg("succeeded"),
effective_success_rate,
hard_failure_count,
avg_retries: avg("retries"),
verification_pass_rate,
blocker_rate: avg("blocker_discovered"),
@ -583,3 +604,101 @@ test("recentOutcomes respects limit and filters", () => {
assert.equal(filtered[0].model_id, "a");
assert.equal(filtered[0].unit_type, "execute-task");
});
// ---------------------------------------------------------------------------
// effective_success_rate and hard_failure_count
// ---------------------------------------------------------------------------
test("aggregateOutcomes_rate_limit_failures_rank_higher_than_quota_exhausted", () => {
  const now = Date.now();
  // Factory for an outcome row that failed with the given mode.
  const failedRow = (id, unitId, failureMode, ageMs) => ({
    id,
    model_id: "model-x",
    provider: "p",
    unit_type: "execute-task",
    unit_id: unitId,
    succeeded: 0,
    failure_mode: failureMode,
    retries: 0,
    escalated: 0,
    verification_passed: null,
    blocker_discovered: 0,
    duration_ms: 100,
    tokens_total: 10,
    cost_usd: 0,
    recorded_at: now - ageMs,
  });
  // model with only rate_limit failures (weight 0.7)
  const dbRateLimit = createFakeDb();
  dbRateLimit._rows.push(
    failedRow(1, "T01", "rate_limit", 1000),
    failedRow(2, "T02", "rate_limit", 2000),
  );
  // model with only quota_exhausted failures (weight 0.2)
  const dbQuota = createFakeDb();
  dbQuota._rows.push(
    failedRow(1, "T01", "quota_exhausted", 1000),
    failedRow(2, "T02", "quota_exhausted", 2000),
  );
  const statsRateLimit = aggregateOutcomes(dbRateLimit, "model-x", "execute-task", { now });
  const statsQuota = aggregateOutcomes(dbQuota, "model-x", "execute-task", { now });
  assert.ok(
    statsRateLimit.effective_success_rate > statsQuota.effective_success_rate,
    `rate_limit (${statsRateLimit.effective_success_rate}) should exceed quota_exhausted (${statsQuota.effective_success_rate})`,
  );
  assert.ok(Math.abs(statsRateLimit.effective_success_rate - 0.7) < 1e-9);
  assert.ok(Math.abs(statsQuota.effective_success_rate - 0.2) < 1e-9);
});
test("computeObservedScore_uses_effective_success_rate_when_available", () => {
  // Shared baseline: zero raw successes, no retries/blockers/verification.
  const baseline = {
    sample_count: 10,
    success_rate: 0.0,
    avg_retries: 0,
    verification_pass_rate: null,
    blocker_rate: 0,
  };
  // Same raw success_rate, but rate_limit failures give a high weighted rate.
  const statsWithEffective = {
    ...baseline,
    effective_success_rate: 0.7,
    hard_failure_count: 0,
  };
  const scoreWith = computeObservedScore(statsWithEffective);
  const scoreWithout = computeObservedScore(baseline);
  assert.ok(
    scoreWith > scoreWithout,
    `score with effective_success_rate (${scoreWith}) should exceed score without (${scoreWithout})`,
  );
});
test("computeObservedScore_applies_0_5x_penalty_when_hard_failure_rate_exceeds_50_percent", () => {
  // Factory: identical stats except for the hard-failure count under test.
  const statsWithHardFailures = (hardFailureCount) => ({
    sample_count: 10,
    success_rate: 0.3,
    effective_success_rate: 0.3,
    hard_failure_count: hardFailureCount,
    avg_retries: 0,
    verification_pass_rate: null,
    blocker_rate: 0,
  });
  const baseScore = computeObservedScore(statsWithHardFailures(0));
  // 6/10 = 60% > 50% -> penalty applies
  const penalizedScore = computeObservedScore(statsWithHardFailures(6));
  // 5/10 = 50%, not > 50%, no penalty
  const borderlineScore = computeObservedScore(statsWithHardFailures(5));
  assert.ok(
    penalizedScore < baseScore,
    `penalized score (${penalizedScore}) should be less than base score (${baseScore})`,
  );
  assert.ok(
    Math.abs(penalizedScore - baseScore * 0.5) < 1e-9,
    `penalized score (${penalizedScore}) should be exactly half of base score (${baseScore})`,
  );
  // Exactly 50% is NOT > 50%, so no penalty
  assert.ok(
    Math.abs(borderlineScore - baseScore) < 1e-9,
    `borderline (50%) should not be penalized: ${borderlineScore} vs ${baseScore}`,
  );
});