diff --git a/src/resources/extensions/sf/learning/bayesian-blender.mjs b/src/resources/extensions/sf/learning/bayesian-blender.mjs index a15373fe5..d8d232b6b 100644 --- a/src/resources/extensions/sf/learning/bayesian-blender.mjs +++ b/src/resources/extensions/sf/learning/bayesian-blender.mjs @@ -115,17 +115,23 @@ export function ucbBonus(modelSampleCount, totalSamples, c = DEFAULT_UCB_C) { * weighted combination. Score is 0-100. * * Components: - * success: success_rate + * success: effective_success_rate (falls back to success_rate if unavailable) * retry: 1 - min(avg_retries / maxRetries, 1) (fewer retries → higher) - * verify: verification_pass_rate (or success_rate if null) + * verify: verification_pass_rate (or effective/success rate if null) * blocker: 1 - blocker_rate (fewer blockers → higher) * + * Hard-failure penalty: if hard_failure_count / sample_count > 0.5 + * (more than half of all recorded outcomes are quota_exhausted or auth_error failures), the + * final scaled score is multiplied by 0.5. + * * @param {Object} stats - from outcome-aggregator.aggregateOutcomes * @param {number} [stats.sample_count] - * @param {number} stats.success_rate - 0.0 to 1.0 - * @param {number} stats.avg_retries - float + * @param {number} stats.success_rate - 0.0 to 1.0 + * @param {number} [stats.effective_success_rate] - failure-mode-weighted rate, 0.0 to 1.0 + * @param {number} [stats.hard_failure_count] - count of quota_exhausted or auth_error failures + * @param {number} stats.avg_retries - float * @param {number|null} stats.verification_pass_rate - 0.0 to 1.0 or null - * @param {number} stats.blocker_rate - 0.0 to 1.0 + * @param {number} stats.blocker_rate - 0.0 to 1.0 * @param {Object} [weights=DEFAULT_OBSERVED_WEIGHTS] * @param {number} [maxRetries=5] - retries above this contribute 0 to retry component * @returns {number} observed score 0 to 100 @@ -141,7 +147,10 @@ export function computeObservedScore( return NEUTRAL_OBSERVED_SCORE; } - const successRate = stats.success_rate ?? 
0; + const successRate = + stats.effective_success_rate != null + ? stats.effective_success_rate + : (stats.success_rate ?? 0); const avgRetries = stats.avg_retries ?? 0; const verifyRate = stats.verification_pass_rate ?? successRate; const blockerRate = stats.blocker_rate ?? 0; @@ -155,7 +164,14 @@ export function computeObservedScore( weights.verify * verifyRate + weights.blocker * blockerComponent; - const scaled = weighted * SCORE_SCALE; + let scaled = weighted * SCORE_SCALE; + + const sampleCount = stats.sample_count ?? 0; + const hardFailureCount = stats.hard_failure_count ?? 0; + if (hardFailureCount > 0 && sampleCount > 0 && hardFailureCount / sampleCount > 0.5) { + scaled *= 0.5; + } + return Math.max(0, Math.min(SCORE_SCALE, scaled)); } diff --git a/src/resources/extensions/sf/learning/outcome-aggregator.mjs b/src/resources/extensions/sf/learning/outcome-aggregator.mjs index bee2c539b..8350bcde8 100644 --- a/src/resources/extensions/sf/learning/outcome-aggregator.mjs +++ b/src/resources/extensions/sf/learning/outcome-aggregator.mjs @@ -37,7 +37,15 @@ const AGGREGATE_ONE_SQL = ` AVG(CAST(escalated AS REAL)) AS escalation_rate, AVG(CAST(duration_ms AS REAL)) AS avg_duration_ms, AVG(CAST(tokens_total AS REAL)) AS avg_tokens, - AVG(CAST(cost_usd AS REAL)) AS avg_cost_usd + AVG(CAST(cost_usd AS REAL)) AS avg_cost_usd, + SUM(CASE + WHEN succeeded = 1 THEN 1.0 + WHEN failure_mode = 'rate_limit' THEN 0.7 + WHEN failure_mode = 'quota_exhausted' THEN 0.2 + WHEN failure_mode = 'auth_error' THEN 0.0 + ELSE 0.5 + END) / COUNT(*) AS effective_success_rate, + SUM(CASE WHEN failure_mode IN ('quota_exhausted', 'auth_error') THEN 1 ELSE 0 END) AS hard_failure_count FROM llm_task_outcomes WHERE model_id = ? AND unit_type = ? 
@@ -55,7 +63,15 @@ const AGGREGATE_GROUPED_SQL = ` AVG(CAST(escalated AS REAL)) AS escalation_rate, AVG(CAST(duration_ms AS REAL)) AS avg_duration_ms, AVG(CAST(tokens_total AS REAL)) AS avg_tokens, - AVG(CAST(cost_usd AS REAL)) AS avg_cost_usd + AVG(CAST(cost_usd AS REAL)) AS avg_cost_usd, + SUM(CASE + WHEN succeeded = 1 THEN 1.0 + WHEN failure_mode = 'rate_limit' THEN 0.7 + WHEN failure_mode = 'quota_exhausted' THEN 0.2 + WHEN failure_mode = 'auth_error' THEN 0.0 + ELSE 0.5 + END) / COUNT(*) AS effective_success_rate, + SUM(CASE WHEN failure_mode IN ('quota_exhausted', 'auth_error') THEN 1 ELSE 0 END) AS hard_failure_count FROM llm_task_outcomes WHERE unit_type = ? AND recorded_at > ? @@ -76,6 +92,8 @@ const TOTAL_SAMPLES_SQL = ` * @property {string} unitType * @property {number} sample_count * @property {number} success_rate 0.0 to 1.0 + * @property {number} effective_success_rate failure-mode-weighted success rate, 0.0 to 1.0 + * @property {number} hard_failure_count count of quota_exhausted or auth_error failures * @property {number} avg_retries * @property {number|null} verification_pass_rate 0.0 to 1.0 or null if no verification data * @property {number} blocker_rate 0.0 to 1.0 @@ -100,6 +118,8 @@ function emptyStats(modelId, unitType, windowDays) { unitType, sample_count: 0, success_rate: 0, + effective_success_rate: 0, + hard_failure_count: 0, avg_retries: 0, verification_pass_rate: null, blocker_rate: 0, @@ -148,6 +168,8 @@ function rowToStats(row, modelId, unitType, windowDays) { unitType, sample_count: toNumber(row.sample_count), success_rate: toNumber(row.success_rate), + effective_success_rate: toNumber(row.effective_success_rate), + hard_failure_count: toNumber(row.hard_failure_count), avg_retries: toNumber(row.avg_retries), verification_pass_rate: row.verification_pass_rate === null || diff --git a/src/resources/extensions/sf/learning/outcome-recorder.test.mjs b/src/resources/extensions/sf/learning/outcome-recorder.test.mjs index ffffc650d..5586d20f0 
100644 --- a/src/resources/extensions/sf/learning/outcome-recorder.test.mjs +++ b/src/resources/extensions/sf/learning/outcome-recorder.test.mjs @@ -20,6 +20,7 @@ import { recordOutcomeBatch, validateOutcome, } from "./outcome-recorder.mjs"; +import { computeObservedScore } from "./bayesian-blender.mjs"; // --------------------------------------------------------------------------- // Minimal in-memory fake of the SQLite surface consumed by sf-learning. @@ -167,11 +168,22 @@ function runGroupedAggregate(_sql, params, rows) { return out; } +function effectiveWeight(row) { + if (row.succeeded === 1) return 1.0; + const fm = row.failure_mode ?? null; + if (fm === "rate_limit") return 0.7; + if (fm === "quota_exhausted") return 0.2; + if (fm === "auth_error") return 0.0; + return 0.5; +} + function summarize(rows) { if (rows.length === 0) { return { sample_count: 0, success_rate: null, + effective_success_rate: null, + hard_failure_count: 0, avg_retries: null, verification_pass_rate: null, blocker_rate: null, @@ -198,9 +210,18 @@ function summarize(rows) { ? 
null : verificationVals.reduce((a, b) => a + b, 0) / verificationVals.length; + const effective_success_rate = + rows.reduce((sum, r) => sum + effectiveWeight(r), 0) / rows.length; + + const hard_failure_count = rows.filter( + (r) => r.failure_mode === "quota_exhausted" || r.failure_mode === "auth_error", + ).length; + return { sample_count: rows.length, success_rate: avg("succeeded"), + effective_success_rate, + hard_failure_count, avg_retries: avg("retries"), verification_pass_rate, blocker_rate: avg("blocker_discovered"), @@ -583,3 +604,101 @@ test("recentOutcomes respects limit and filters", () => { assert.equal(filtered[0].model_id, "a"); assert.equal(filtered[0].unit_type, "execute-task"); }); + +// --------------------------------------------------------------------------- +// effective_success_rate and hard_failure_count +// --------------------------------------------------------------------------- + +test("aggregateOutcomes_rate_limit_failures_rank_higher_than_quota_exhausted", () => { + const now = Date.now(); + const dbRateLimit = createFakeDb(); + const dbQuota = createFakeDb(); + + // model with only rate_limit failures (weight 0.7) + dbRateLimit._rows.push( + { id: 1, model_id: "model-x", provider: "p", unit_type: "execute-task", unit_id: "T01", succeeded: 0, failure_mode: "rate_limit", retries: 0, escalated: 0, verification_passed: null, blocker_discovered: 0, duration_ms: 100, tokens_total: 10, cost_usd: 0, recorded_at: now - 1000 }, + { id: 2, model_id: "model-x", provider: "p", unit_type: "execute-task", unit_id: "T02", succeeded: 0, failure_mode: "rate_limit", retries: 0, escalated: 0, verification_passed: null, blocker_discovered: 0, duration_ms: 100, tokens_total: 10, cost_usd: 0, recorded_at: now - 2000 }, + ); + // model with only quota_exhausted failures (weight 0.2) + dbQuota._rows.push( + { id: 1, model_id: "model-x", provider: "p", unit_type: "execute-task", unit_id: "T01", succeeded: 0, failure_mode: "quota_exhausted", retries: 0, 
escalated: 0, verification_passed: null, blocker_discovered: 0, duration_ms: 100, tokens_total: 10, cost_usd: 0, recorded_at: now - 1000 }, + { id: 2, model_id: "model-x", provider: "p", unit_type: "execute-task", unit_id: "T02", succeeded: 0, failure_mode: "quota_exhausted", retries: 0, escalated: 0, verification_passed: null, blocker_discovered: 0, duration_ms: 100, tokens_total: 10, cost_usd: 0, recorded_at: now - 2000 }, + ); + + const statsRateLimit = aggregateOutcomes(dbRateLimit, "model-x", "execute-task", { now }); + const statsQuota = aggregateOutcomes(dbQuota, "model-x", "execute-task", { now }); + + assert.ok( + statsRateLimit.effective_success_rate > statsQuota.effective_success_rate, + `rate_limit (${statsRateLimit.effective_success_rate}) should exceed quota_exhausted (${statsQuota.effective_success_rate})`, + ); + assert.ok(Math.abs(statsRateLimit.effective_success_rate - 0.7) < 1e-9); + assert.ok(Math.abs(statsQuota.effective_success_rate - 0.2) < 1e-9); +}); + +test("computeObservedScore_uses_effective_success_rate_when_available", () => { + // Stats with low success_rate but high effective_success_rate (rate_limit failures) + const statsWithEffective = { + sample_count: 10, + success_rate: 0.0, + effective_success_rate: 0.7, + hard_failure_count: 0, + avg_retries: 0, + verification_pass_rate: null, + blocker_rate: 0, + }; + // Stats with same success_rate but no effective_success_rate + const statsWithoutEffective = { + sample_count: 10, + success_rate: 0.0, + avg_retries: 0, + verification_pass_rate: null, + blocker_rate: 0, + }; + + const scoreWith = computeObservedScore(statsWithEffective); + const scoreWithout = computeObservedScore(statsWithoutEffective); + + assert.ok( + scoreWith > scoreWithout, + `score with effective_success_rate (${scoreWith}) should exceed score without (${scoreWithout})`, + ); +}); + +test("computeObservedScore_applies_0_5x_penalty_when_hard_failure_rate_exceeds_50_percent", () => { + const baseStats = { + 
sample_count: 10, + success_rate: 0.3, + effective_success_rate: 0.3, + hard_failure_count: 0, + avg_retries: 0, + verification_pass_rate: null, + blocker_rate: 0, + }; + const hardFailureStats = { + ...baseStats, + hard_failure_count: 6, // 6/10 = 60% > 50% + }; + const borderlineStats = { + ...baseStats, + hard_failure_count: 5, // 5/10 = 50%, not > 50%, no penalty + }; + + const baseScore = computeObservedScore(baseStats); + const penalizedScore = computeObservedScore(hardFailureStats); + const borderlineScore = computeObservedScore(borderlineStats); + + assert.ok( + penalizedScore < baseScore, + `penalized score (${penalizedScore}) should be less than base score (${baseScore})`, + ); + assert.ok( + Math.abs(penalizedScore - baseScore * 0.5) < 1e-9, + `penalized score (${penalizedScore}) should be exactly half of base score (${baseScore})`, + ); + // Exactly 50% is NOT > 50%, so no penalty + assert.ok( + Math.abs(borderlineScore - baseScore) < 1e-9, + `borderline (50%) should not be penalized: ${borderlineScore} vs ${baseScore}`, + ); +});