feat(learning): weight failure_mode in Bayesian blender — rate_limit=0.7, quota=0.2, auth=0.0
- AGGREGATE_ONE/GROUPED_SQL: compute effective_success_rate with a CASE WHEN over failure_mode
- AggregatedStats: add effective_success_rate and hard_failure_count fields
- computeObservedScore: uses effective_success_rate when available; 0.5x penalty if >50% of samples are hard failures
- Tests: verify rate_limit ranked above quota_exhausted; hard-failure penalty verified

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
parent
2dea73398d
commit
b228bc9f5c
3 changed files with 166 additions and 9 deletions
|
|
@ -115,17 +115,23 @@ export function ucbBonus(modelSampleCount, totalSamples, c = DEFAULT_UCB_C) {
|
|||
* weighted combination. Score is 0-100.
|
||||
*
|
||||
* Components:
|
||||
* success: success_rate
|
||||
* success: effective_success_rate (falls back to success_rate if unavailable)
|
||||
* retry: 1 - min(avg_retries / maxRetries, 1) (fewer retries → higher)
|
||||
* verify: verification_pass_rate (or success_rate if null)
|
||||
* verify: verification_pass_rate (or effective/success rate if null)
|
||||
* blocker: 1 - blocker_rate (fewer blockers → higher)
|
||||
*
|
||||
* Hard-failure penalty: if hard_failure_count / sample_count > 0.5
|
||||
* (more than half of failures are quota_exhausted or auth_error), the
|
||||
* final scaled score is multiplied by 0.5.
|
||||
*
|
||||
* @param {Object} stats - from outcome-aggregator.aggregateOutcomes
|
||||
* @param {number} [stats.sample_count]
|
||||
* @param {number} stats.success_rate - 0.0 to 1.0
|
||||
* @param {number} stats.avg_retries - float
|
||||
* @param {number} stats.success_rate - 0.0 to 1.0
|
||||
* @param {number} [stats.effective_success_rate] - failure-mode-weighted rate, 0.0 to 1.0
|
||||
* @param {number} [stats.hard_failure_count] - count of quota_exhausted or auth_error failures
|
||||
* @param {number} stats.avg_retries - float
|
||||
* @param {number|null} stats.verification_pass_rate - 0.0 to 1.0 or null
|
||||
* @param {number} stats.blocker_rate - 0.0 to 1.0
|
||||
* @param {number} stats.blocker_rate - 0.0 to 1.0
|
||||
* @param {Object} [weights=DEFAULT_OBSERVED_WEIGHTS]
|
||||
* @param {number} [maxRetries=5] - retries above this contribute 0 to retry component
|
||||
* @returns {number} observed score 0 to 100
|
||||
|
|
@ -141,7 +147,10 @@ export function computeObservedScore(
|
|||
return NEUTRAL_OBSERVED_SCORE;
|
||||
}
|
||||
|
||||
const successRate = stats.success_rate ?? 0;
|
||||
const successRate =
|
||||
stats.effective_success_rate != null
|
||||
? stats.effective_success_rate
|
||||
: (stats.success_rate ?? 0);
|
||||
const avgRetries = stats.avg_retries ?? 0;
|
||||
const verifyRate = stats.verification_pass_rate ?? successRate;
|
||||
const blockerRate = stats.blocker_rate ?? 0;
|
||||
|
|
@ -155,7 +164,14 @@ export function computeObservedScore(
|
|||
weights.verify * verifyRate +
|
||||
weights.blocker * blockerComponent;
|
||||
|
||||
const scaled = weighted * SCORE_SCALE;
|
||||
let scaled = weighted * SCORE_SCALE;
|
||||
|
||||
const sampleCount = stats.sample_count ?? 0;
|
||||
const hardFailureCount = stats.hard_failure_count ?? 0;
|
||||
if (hardFailureCount > 0 && sampleCount > 0 && hardFailureCount / sampleCount > 0.5) {
|
||||
scaled *= 0.5;
|
||||
}
|
||||
|
||||
return Math.max(0, Math.min(SCORE_SCALE, scaled));
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -37,7 +37,15 @@ const AGGREGATE_ONE_SQL = `
|
|||
AVG(CAST(escalated AS REAL)) AS escalation_rate,
|
||||
AVG(CAST(duration_ms AS REAL)) AS avg_duration_ms,
|
||||
AVG(CAST(tokens_total AS REAL)) AS avg_tokens,
|
||||
AVG(CAST(cost_usd AS REAL)) AS avg_cost_usd
|
||||
AVG(CAST(cost_usd AS REAL)) AS avg_cost_usd,
|
||||
SUM(CASE
|
||||
WHEN succeeded = 1 THEN 1.0
|
||||
WHEN failure_mode = 'rate_limit' THEN 0.7
|
||||
WHEN failure_mode = 'quota_exhausted' THEN 0.2
|
||||
WHEN failure_mode = 'auth_error' THEN 0.0
|
||||
ELSE 0.5
|
||||
END) / COUNT(*) AS effective_success_rate,
|
||||
SUM(CASE WHEN failure_mode IN ('quota_exhausted', 'auth_error') THEN 1 ELSE 0 END) AS hard_failure_count
|
||||
FROM llm_task_outcomes
|
||||
WHERE model_id = ?
|
||||
AND unit_type = ?
|
||||
|
|
@ -55,7 +63,15 @@ const AGGREGATE_GROUPED_SQL = `
|
|||
AVG(CAST(escalated AS REAL)) AS escalation_rate,
|
||||
AVG(CAST(duration_ms AS REAL)) AS avg_duration_ms,
|
||||
AVG(CAST(tokens_total AS REAL)) AS avg_tokens,
|
||||
AVG(CAST(cost_usd AS REAL)) AS avg_cost_usd
|
||||
AVG(CAST(cost_usd AS REAL)) AS avg_cost_usd,
|
||||
SUM(CASE
|
||||
WHEN succeeded = 1 THEN 1.0
|
||||
WHEN failure_mode = 'rate_limit' THEN 0.7
|
||||
WHEN failure_mode = 'quota_exhausted' THEN 0.2
|
||||
WHEN failure_mode = 'auth_error' THEN 0.0
|
||||
ELSE 0.5
|
||||
END) / COUNT(*) AS effective_success_rate,
|
||||
SUM(CASE WHEN failure_mode IN ('quota_exhausted', 'auth_error') THEN 1 ELSE 0 END) AS hard_failure_count
|
||||
FROM llm_task_outcomes
|
||||
WHERE unit_type = ?
|
||||
AND recorded_at > ?
|
||||
|
|
@ -76,6 +92,8 @@ const TOTAL_SAMPLES_SQL = `
|
|||
* @property {string} unitType
|
||||
* @property {number} sample_count
|
||||
* @property {number} success_rate 0.0 to 1.0
|
||||
* @property {number} effective_success_rate failure-mode-weighted success rate, 0.0 to 1.0
|
||||
* @property {number} hard_failure_count count of quota_exhausted or auth_error failures
|
||||
* @property {number} avg_retries
|
||||
* @property {number|null} verification_pass_rate 0.0 to 1.0 or null if no verification data
|
||||
* @property {number} blocker_rate 0.0 to 1.0
|
||||
|
|
@ -100,6 +118,8 @@ function emptyStats(modelId, unitType, windowDays) {
|
|||
unitType,
|
||||
sample_count: 0,
|
||||
success_rate: 0,
|
||||
effective_success_rate: 0,
|
||||
hard_failure_count: 0,
|
||||
avg_retries: 0,
|
||||
verification_pass_rate: null,
|
||||
blocker_rate: 0,
|
||||
|
|
@ -148,6 +168,8 @@ function rowToStats(row, modelId, unitType, windowDays) {
|
|||
unitType,
|
||||
sample_count: toNumber(row.sample_count),
|
||||
success_rate: toNumber(row.success_rate),
|
||||
effective_success_rate: toNumber(row.effective_success_rate),
|
||||
hard_failure_count: toNumber(row.hard_failure_count),
|
||||
avg_retries: toNumber(row.avg_retries),
|
||||
verification_pass_rate:
|
||||
row.verification_pass_rate === null ||
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ import {
|
|||
recordOutcomeBatch,
|
||||
validateOutcome,
|
||||
} from "./outcome-recorder.mjs";
|
||||
import { computeObservedScore } from "./bayesian-blender.mjs";
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Minimal in-memory fake of the SQLite surface consumed by sf-learning.
|
||||
|
|
@ -167,11 +168,22 @@ function runGroupedAggregate(_sql, params, rows) {
|
|||
return out;
|
||||
}
|
||||
|
||||
/**
 * Per-row weight mirroring the SQL CASE expression in the aggregator:
 * a success counts fully, while each failure mode earns partial credit
 * according to how recoverable it is.
 *
 * @param {Object} row - outcome row with `succeeded` and `failure_mode`
 * @returns {number} weight between 0.0 and 1.0
 */
function effectiveWeight(row) {
  if (row.succeeded === 1) return 1.0;
  const partialCredit = new Map([
    ["rate_limit", 0.7], // transient — likely to succeed on retry
    ["quota_exhausted", 0.2], // hard failure
    ["auth_error", 0.0], // hard failure
  ]);
  // Unknown or absent failure modes fall back to a neutral 0.5.
  return partialCredit.get(row.failure_mode) ?? 0.5;
}
|
||||
|
||||
function summarize(rows) {
|
||||
if (rows.length === 0) {
|
||||
return {
|
||||
sample_count: 0,
|
||||
success_rate: null,
|
||||
effective_success_rate: null,
|
||||
hard_failure_count: 0,
|
||||
avg_retries: null,
|
||||
verification_pass_rate: null,
|
||||
blocker_rate: null,
|
||||
|
|
@ -198,9 +210,18 @@ function summarize(rows) {
|
|||
? null
|
||||
: verificationVals.reduce((a, b) => a + b, 0) / verificationVals.length;
|
||||
|
||||
const effective_success_rate =
|
||||
rows.reduce((sum, r) => sum + effectiveWeight(r), 0) / rows.length;
|
||||
|
||||
const hard_failure_count = rows.filter(
|
||||
(r) => r.failure_mode === "quota_exhausted" || r.failure_mode === "auth_error",
|
||||
).length;
|
||||
|
||||
return {
|
||||
sample_count: rows.length,
|
||||
success_rate: avg("succeeded"),
|
||||
effective_success_rate,
|
||||
hard_failure_count,
|
||||
avg_retries: avg("retries"),
|
||||
verification_pass_rate,
|
||||
blocker_rate: avg("blocker_discovered"),
|
||||
|
|
@ -583,3 +604,101 @@ test("recentOutcomes respects limit and filters", () => {
|
|||
assert.equal(filtered[0].model_id, "a");
|
||||
assert.equal(filtered[0].unit_type, "execute-task");
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// effective_success_rate and hard_failure_count
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
test("aggregateOutcomes_rate_limit_failures_rank_higher_than_quota_exhausted", () => {
  const now = Date.now();

  // Build an all-failures outcome row for model-x; only the failure mode
  // and the age of the record vary between rows.
  const makeRow = (id, failureMode, ageMs) => ({
    id,
    model_id: "model-x",
    provider: "p",
    unit_type: "execute-task",
    unit_id: `T0${id}`,
    succeeded: 0,
    failure_mode: failureMode,
    retries: 0,
    escalated: 0,
    verification_passed: null,
    blocker_discovered: 0,
    duration_ms: 100,
    tokens_total: 10,
    cost_usd: 0,
    recorded_at: now - ageMs,
  });

  // model with only rate_limit failures (weight 0.7)
  const dbRateLimit = createFakeDb();
  dbRateLimit._rows.push(makeRow(1, "rate_limit", 1000), makeRow(2, "rate_limit", 2000));

  // model with only quota_exhausted failures (weight 0.2)
  const dbQuota = createFakeDb();
  dbQuota._rows.push(makeRow(1, "quota_exhausted", 1000), makeRow(2, "quota_exhausted", 2000));

  const statsRateLimit = aggregateOutcomes(dbRateLimit, "model-x", "execute-task", { now });
  const statsQuota = aggregateOutcomes(dbQuota, "model-x", "execute-task", { now });

  assert.ok(
    statsRateLimit.effective_success_rate > statsQuota.effective_success_rate,
    `rate_limit (${statsRateLimit.effective_success_rate}) should exceed quota_exhausted (${statsQuota.effective_success_rate})`,
  );
  assert.ok(Math.abs(statsRateLimit.effective_success_rate - 0.7) < 1e-9);
  assert.ok(Math.abs(statsQuota.effective_success_rate - 0.2) < 1e-9);
});
|
||||
|
||||
test("computeObservedScore_uses_effective_success_rate_when_available", () => {
  // Shared raw stats: zero successes across ten samples.
  const common = {
    sample_count: 10,
    success_rate: 0.0,
    avg_retries: 0,
    verification_pass_rate: null,
    blocker_rate: 0,
  };

  // Low raw success_rate but high effective_success_rate (rate_limit failures).
  const statsWithEffective = {
    ...common,
    effective_success_rate: 0.7,
    hard_failure_count: 0,
  };
  // Same raw stats, but no effective_success_rate field to prefer.
  const statsWithoutEffective = { ...common };

  const scoreWith = computeObservedScore(statsWithEffective);
  const scoreWithout = computeObservedScore(statsWithoutEffective);

  assert.ok(
    scoreWith > scoreWithout,
    `score with effective_success_rate (${scoreWith}) should exceed score without (${scoreWithout})`,
  );
});
|
||||
|
||||
test("computeObservedScore_applies_0_5x_penalty_when_hard_failure_rate_exceeds_50_percent", () => {
  const baseStats = {
    sample_count: 10,
    success_rate: 0.3,
    effective_success_rate: 0.3,
    hard_failure_count: 0,
    avg_retries: 0,
    verification_pass_rate: null,
    blocker_rate: 0,
  };

  const baseScore = computeObservedScore(baseStats);
  // 6/10 = 60% hard failures — strictly above the 50% threshold.
  const penalizedScore = computeObservedScore({ ...baseStats, hard_failure_count: 6 });
  // 5/10 = 50% exactly — threshold is strict (> 0.5), so no penalty.
  const borderlineScore = computeObservedScore({ ...baseStats, hard_failure_count: 5 });

  assert.ok(
    penalizedScore < baseScore,
    `penalized score (${penalizedScore}) should be less than base score (${baseScore})`,
  );
  assert.ok(
    Math.abs(penalizedScore - baseScore * 0.5) < 1e-9,
    `penalized score (${penalizedScore}) should be exactly half of base score (${baseScore})`,
  );
  // Exactly 50% is NOT > 50%, so no penalty
  assert.ok(
    Math.abs(borderlineScore - baseScore) < 1e-9,
    `borderline (50%) should not be penalized: ${borderlineScore} vs ${baseScore}`,
  );
});
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue