/**
 * Centralized Metrics Collector — Unified metrics sink for all SF subsystems.
 *
 * Purpose: Replace scattered metrics emission (DB, Prometheus, stderr, JSONL)
 * with a single collector that aggregates counters, gauges, and histograms,
 * then exposes them in Prometheus text format AND persists to SQLite for
 * queryable historical analysis.
 *
 * Consumer: /uok status, health widgets, external Prometheus scrapers,
 * TUI cost/context overlay, and programmatic queries via sf-db.
 *
 * Design:
 * - In-memory aggregation with configurable flush interval
 * - Prometheus text format output (compatible with existing exposition)
 * - SQLite persistence for historical queries (session-scoped)
 * - Cost/token metrics alongside operational metrics
 * - Retry with exponential backoff on flush failures
 * - Zero external dependencies
 */
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
import { join } from "node:path";
import { DatabaseSync } from "node:sqlite";
import { sfRoot } from "./paths.js";
import { logWarning } from "./workflow-logger.js";

const FLUSH_INTERVAL_MS = 60_000; // 1 minute
const MAX_HISTOGRAM_BUCKETS = 10;
const FLUSH_RETRY_MAX = 3;
const FLUSH_RETRY_BASE_MS = 1000;
const METRIC_NAME_PATTERN = /^[a-zA-Z_:][a-zA-Z0-9_:]*$/;
const METRICS_DB_ROW_CAP = 10_000; // keep newest N rows; prune on flush when exceeded

// ─── Metrics System Performance Monitoring ──────────────────────────────────

let _metricsSystemStartTime = Date.now();
let _flushCount = 0;
let _flushSuccessCount = 0;
let _flushFailureCount = 0;
let _lastFlushDuration = 0;
let _lastFlushTimestamp = 0;
let _totalFlushDuration = 0;

/**
 * Get metrics system performance stats.
 *
 * @returns {object} uptime, flush counters, success rate (formatted as a
 *   percentage string), flush durations in milliseconds, and whether the
 *   dedicated metrics database is currently open.
 */
export function getMetricsSystemStats() {
  const uptime = Date.now() - _metricsSystemStartTime;
  return {
    uptimeMs: uptime,
    uptimeSeconds: Math.floor(uptime / 1000),
    flushCount: _flushCount,
    flushSuccessCount: _flushSuccessCount,
    flushFailureCount: _flushFailureCount,
    successRate:
      _flushCount > 0
        ? `${((_flushSuccessCount / _flushCount) * 100).toFixed(1)}%`
        : "0%",
    lastFlushDuration: _lastFlushDuration,
    lastFlushTimestamp: _lastFlushTimestamp,
    averageFlushDuration:
      _flushSuccessCount > 0
        ? Math.round(_totalFlushDuration / _flushSuccessCount)
        : 0,
    databaseStatus: _metricsDb ? "connected" : "disconnected",
  };
}

/**
 * Get system performance dashboard metrics.
 * Returns a formatted summary of key performance indicators.
 *
 * @returns {object} snapshot built from the in-memory registry and the
 *   metrics system stats; missing metrics are reported as 0.
 */
export function getSystemPerformanceDashboard() {
  const systemStats = getMetricsSystemStats();
  const registry = getRegistry();
  return {
    uptime: systemStats.uptimeSeconds,
    metricsSystemHealth: {
      status: systemStats.databaseStatus,
      successRate: systemStats.successRate,
      flushCount: systemStats.flushCount,
      averageFlushDuration: `${systemStats.averageFlushDuration}ms`,
    },
    cost: extractMetricValue(registry, "sf_cost_total"),
    tokens: {
      input: extractMetricValue(registry, "sf_tokens_input_total"),
      output: extractMetricValue(registry, "sf_tokens_output_total"),
    },
    performance: {
      averageToolExecution: extractMetricHistogramMean(
        registry,
        "sf_tool_execution_duration_ms",
      ),
      averageModelRequest: extractMetricHistogramMean(
        registry,
        "sf_model_request_duration_ms",
      ),
      averageDatabaseQuery: extractMetricHistogramMean(
        registry,
        "sf_database_query_duration_ms",
      ),
    },
    errors: {
      tool: extractMetricValue(registry, "sf_tool_errors_total"),
      model: extractMetricValue(registry, "sf_model_errors_total"),
      database: extractMetricValue(registry, "sf_database_errors_total"),
      system: extractMetricValue(registry, "sf_system_warnings_total"),
    },
    resources: {
      activeSessions: extractMetricGaugeValue(
        registry,
        "sf_active_sessions_count",
      ),
      activeAgents: extractMetricGaugeValue(registry, "sf_active_agents_count"),
      concurrentToolCalls: extractMetricGaugeValue(
        registry,
        "sf_concurrent_tool_calls",
      ),
    },
  };
}

/**
 * Extract a counter total from the registry, summed across all label sets.
 * Returns 0 when the counter has never been recorded.
 */
function extractMetricValue(registry, metricName) {
  const metric = registry.counters.get(metricName);
  if (!metric) return 0;
  let total = 0;
  for (const value of metric.values.values()) total += value;
  return total;
}

/**
 * Extract histogram mean value (rounded), or 0 when there are no observations.
 */
function extractMetricHistogramMean(registry, metricName) {
  const hist = registry.histograms.get(metricName);
  if (!hist || hist.count === 0) return 0;
  return Math.round(hist.sum / hist.count);
}

/**
 * Extract gauge value: the value most recently written to the gauge,
 * regardless of which label set it was written under. Returns 0 when the
 * gauge has never been set.
 */
function extractMetricGaugeValue(registry, metricName) {
  const gauge = registry.gauges.get(metricName);
  if (!gauge || gauge.values.size === 0) return 0;
  return gauge.latest() ?? 0;
}

// ─── Metric Types ───────────────────────────────────────────────────────────

/** Accumulating counter, keyed by serialized label set. */
class Counter {
  constructor(name, help, labelNames = []) {
    this.name = name;
    this.help = help;
    this.labelNames = labelNames;
    this.values = new Map(); // key → number
  }

  /**
   * Add `amount` to the series identified by `labels`.
   * Non-finite amounts (NaN, ±Infinity, non-numbers) are ignored so a single
   * bad sample cannot poison the running total.
   */
  inc(labels = {}, amount = 1) {
    if (!Number.isFinite(amount)) return;
    const key = this._key(labels);
    this.values.set(key, (this.values.get(key) ?? 0) + amount);
  }

  get(labels = {}) {
    return this.values.get(this._key(labels)) ?? 0;
  }

  _key(labels) {
    return _buildLabelKey(labels);
  }

  /** Yield the Prometheus exposition lines for this counter. */
  *lines() {
    yield `# HELP ${this.name} ${this.help}`;
    yield `# TYPE ${this.name} counter`;
    for (const [key, value] of this.values) {
      const labels = _parseLabelKey(key);
      yield fmtLine(this.name, value, labels);
    }
  }
}

/** Point-in-time gauge, keyed by serialized label set. */
class Gauge {
  constructor(name, help, labelNames = []) {
    this.name = name;
    this.help = help;
    this.labelNames = labelNames;
    this.values = new Map();
    // Key of the series written most recently. Map iteration order reflects
    // first insertion, not last update, so recency is tracked explicitly.
    this.lastKey = null;
  }

  /** Set the series identified by `labels`; non-finite values are stored as 0. */
  set(labels = {}, value) {
    const safe = Number.isFinite(value) ? value : 0;
    const key = this._key(labels);
    this.values.set(key, safe);
    this.lastKey = key;
  }

  get(labels = {}) {
    return this.values.get(this._key(labels)) ?? 0;
  }

  /** Value written by the most recent `set` call, or undefined if never set. */
  latest() {
    if (this.lastKey === null) return undefined;
    return this.values.get(this.lastKey);
  }

  _key(labels) {
    return _buildLabelKey(labels);
  }

  /** Yield the Prometheus exposition lines for this gauge. */
  *lines() {
    yield `# HELP ${this.name} ${this.help}`;
    yield `# TYPE ${this.name} gauge`;
    for (const [key, value] of this.values) {
      const labels = _parseLabelKey(key);
      yield fmtLine(this.name, value, labels);
    }
  }
}

/** Unlabelled histogram with cumulative bucket counts. */
class Histogram {
  constructor(
    name,
    help,
    buckets = [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
  ) {
    this.name = name;
    this.help = help;
    // Sort ascending, then keep only the smallest MAX_HISTOGRAM_BUCKETS bounds;
    // larger observations are still counted by the implicit +Inf bucket.
    const capped = [...buckets]
      .sort((a, b) => a - b)
      .slice(0, MAX_HISTOGRAM_BUCKETS);
    this.buckets = capped;
    this.counts = new Map(); // bucket → count
    this.sum = 0;
    this.count = 0;
  }

  /**
   * Record one observation. Non-finite values are ignored so that `sum`
   * (and every mean derived from it) stays a usable number.
   */
  observe(value) {
    if (!Number.isFinite(value)) return;
    this.sum += value;
    this.count++;
    // Every bucket whose upper bound covers the value is incremented, which
    // keeps the stored counts cumulative.
    for (const bucket of this.buckets) {
      if (value <= bucket) {
        this.counts.set(bucket, (this.counts.get(bucket) ?? 0) + 1);
      }
    }
  }

  /** Yield the Prometheus exposition lines for this histogram. */
  *lines() {
    yield `# HELP ${this.name} ${this.help}`;
    yield `# TYPE ${this.name} histogram`;
    for (const bucket of this.buckets) {
      yield fmtLine(`${this.name}_bucket`, this.counts.get(bucket) ?? 0, {
        le: String(bucket),
      });
    }
    yield fmtLine(`${this.name}_bucket`, this.count, { le: "+Inf" });
    yield fmtLine(`${this.name}_sum`, this.sum);
    yield fmtLine(`${this.name}_count`, this.count);
  }
}

// ─── Label Escaping ─────────────────────────────────────────────────────────

/** Escape a label value for use inside an internal `k=v,k=v` label key. */
function _escapeLabel(v) {
  return String(v)
    .replace(/\\/g, "\\\\")
    .replace(/=/g, "\\=")
    .replace(/,/g, "\\,");
}

function _unescapeLabel(v) {
  return v.replace(/\\,/g, ",").replace(/\\=/g, "=").replace(/\\\\/g, "\\");
}

// ─── Label Key Builder (escapes values, stable ordering) ────────────────────

/** Serialize a label object into a key with sorted names and escaped values. */
function _buildLabelKey(labels) {
  const keys = Object.keys(labels).sort();
  return keys.map((k) => `${k}=${_escapeLabel(labels[k] ?? "")}`).join(",");
}

/** Inverse of `_buildLabelKey`: parse a label key back into a label object. */
function _parseLabelKey(key) {
  const labels = {};
  let i = 0;
  while (i < key.length) {
    // Find the '=' separator for this label
    const eqIdx = key.indexOf("=", i);
    if (eqIdx === -1) break;
    const k = key.slice(i, eqIdx);
    // Parse the value, handling escapes
    let v = "";
    let j = eqIdx + 1;
    while (j < key.length) {
      const ch = key[j];
      if (ch === "\\" && j + 1 < key.length) {
        const next = key[j + 1];
        if (next === "\\" || next === "=" || next === ",") {
          v += next;
          j += 2;
          continue;
        }
      }
      if (ch === ",") {
        break;
      }
      v += ch;
      j++;
    }
    labels[k] = v;
    i = j + 1; // skip the ','
  }
  return labels;
}

// ─── Formatter ──────────────────────────────────────────────────────────────

/**
 * Escape a label value for the Prometheus text exposition format, where
 * backslash, double quote and line feed must be written as \\, \" and \n.
 */
function _escapeExpositionValue(v) {
  return String(v)
    .replace(/\\/g, "\\\\")
    .replace(/"/g, '\\"')
    .replace(/\n/g, "\\n");
}

/** Format one sample line: `name{label="value",...} value`. */
function fmtLine(name, value, labels = {}) {
  const labelStr = Object.entries(labels)
    .map(([k, v]) => `${k}="${_escapeExpositionValue(v)}"`)
    .join(",");
  const suffix = labelStr ? `{${labelStr}}` : "";
  return `${name}${suffix} ${value}`;
}

// ─── Validation ─────────────────────────────────────────────────────────────

/**
 * Validate a metric name.
 *
 * @throws {TypeError} when the name is empty or not a string
 * @throws {Error} when the name does not match METRIC_NAME_PATTERN
 */
function validateMetricName(name) {
  if (!name || typeof name !== "string") {
    throw new TypeError(
      `Metric name must be a non-empty string, got: ${typeof name}`,
    );
  }
  if (!METRIC_NAME_PATTERN.test(name)) {
    throw new Error(
      `Invalid metric name "${name}". Must match Prometheus naming convention: ` +
        `^[a-zA-Z_:][a-zA-Z0-9_:]*$`,
    );
  }
}

// ─── Central Registry ───────────────────────────────────────────────────────

/** In-memory store of all metrics; each accessor creates the metric on first use. */
class MetricsRegistry {
  counters = new Map();
  gauges = new Map();
  histograms = new Map();
  _metadata = new Map();

  counter(name, help, labelNames) {
    if (!this.counters.has(name)) {
      this.counters.set(name, new Counter(name, help, labelNames));
    }
    return this.counters.get(name);
  }

  gauge(name, help, labelNames) {
    if (!this.gauges.has(name)) {
      this.gauges.set(name, new Gauge(name, help, labelNames));
    }
    return this.gauges.get(name);
  }

  histogram(name, help, buckets) {
    if (!this.histograms.has(name)) {
      this.histograms.set(name, new Histogram(name, help, buckets));
    }
    return this.histograms.get(name);
  }

  /** Render every metric as Prometheus text, terminated by a newline. */
  buildText() {
    const lines = [];
    for (const c of this.counters.values()) {
      lines.push(...c.lines());
    }
    for (const g of this.gauges.values()) {
      lines.push(...g.lines());
    }
    for (const h of this.histograms.values()) {
      lines.push(...h.lines());
    }
    return lines.join("\n") + "\n";
  }

  clear() {
    this.counters.clear();
    this.gauges.clear();
    this.histograms.clear();
  }
}

// ─── Singleton ──────────────────────────────────────────────────────────────

let _registry = null;
let _flushTimer = null;
let _flushRetryTimer = null; // pending backoff retry, at most one at a time
let _metricsHealthTimer = null;
let _basePath = "";
let _sessionId = "";
let _dbAdapter = null; // kept for API compat but no longer used for metrics writes
let _metricsDb = null; // dedicated metrics.db connection
let _metricsDbPath = ""; // file backing _metricsDb, used to detect basePath changes
let _flushFailures = 0;

function getRegistry() {
  if (!_registry) _registry = new MetricsRegistry();
  return _registry;
}

function metricsFilePath(basePath) {
  return join(sfRoot(basePath), "runtime", "sf-metrics.prom");
}

// ─── DB Persistence ─────────────────────────────────────────────────────────

function metricsDbPath(basePath) {
  return join(sfRoot(basePath), "metrics.db");
}

/**
 * Open (or reuse) the dedicated metrics database for `basePath`.
 *
 * An already-open connection is reused only when it points at the same file;
 * otherwise it is closed and a new one is opened, so re-initialising with a
 * different project root does not keep writing to the previous database.
 * Failures are logged and leave the module without a database connection.
 */
function openMetricsDb(basePath) {
  let db = null;
  try {
    const dbPath = metricsDbPath(basePath);
    if (_metricsDb) {
      if (_metricsDbPath === dbPath) return;
      closeMetricsDb();
    }
    mkdirSync(sfRoot(basePath), { recursive: true });
    db = new DatabaseSync(dbPath);
    db.exec("PRAGMA journal_mode=WAL");
    db.exec("PRAGMA synchronous=NORMAL");
    db.exec(`
      CREATE TABLE IF NOT EXISTS metrics (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        name TEXT NOT NULL,
        type TEXT NOT NULL CHECK(type IN ('counter', 'gauge', 'histogram')),
        labels TEXT,
        value REAL NOT NULL,
        timestamp TEXT NOT NULL DEFAULT (datetime('now')),
        session_id TEXT
      )
    `);
    db.exec(`CREATE INDEX IF NOT EXISTS idx_metrics_name ON metrics(name)`);
    db.exec(
      `CREATE INDEX IF NOT EXISTS idx_metrics_session ON metrics(session_id)`,
    );
    db.exec(
      `CREATE INDEX IF NOT EXISTS idx_metrics_name_ts ON metrics(name, timestamp DESC)`,
    );
    _metricsDb = db;
    _metricsDbPath = dbPath;
  } catch (err) {
    // Do not leak a handle that was opened but could not be fully set up.
    try {
      db?.close();
    } catch {
      // swallow — the handle is being discarded anyway
    }
    logWarning("metrics-central", `Failed to open metrics.db: ${err.message}`);
  }
}

function closeMetricsDb() {
  if (!_metricsDb) return;
  try {
    _metricsDb.close();
  } catch {
    // swallow
  }
  _metricsDb = null;
  _metricsDbPath = "";
}

function _ensureMetricsTable(db) {
  // no-op — metrics.db is set up by openMetricsDb
  void db;
}

/**
 * Write a snapshot of every metric in `registry` to the metrics database.
 *
 * All rows of one snapshot share a timestamp and are inserted inside a single
 * transaction, so a snapshot is stored completely or not at all. Afterwards
 * the table is pruned to the newest METRICS_DB_ROW_CAP rows (best effort).
 *
 * @param {MetricsRegistry} registry — source of the snapshot
 * @param {string} sessionId — stored in the session_id column
 * @param {*} _ignored — unused; kept so existing call sites stay valid
 */
function persistMetricsToDb(registry, sessionId, _ignored) {
  const db = _metricsDb;
  if (!db) return;
  const ts = new Date().toISOString();
  const safeNum = (n) => (Number.isFinite(n) ? n : 0);
  try {
    db.exec("BEGIN");
    try {
      const insert = db.prepare(
        "INSERT INTO metrics (name, type, labels, value, timestamp, session_id) VALUES (?, ?, ?, ?, ?, ?)",
      );
      for (const c of registry.counters.values()) {
        for (const [key, value] of c.values) {
          const labels = _parseLabelKey(key);
          insert.run(
            c.name,
            "counter",
            JSON.stringify(labels),
            safeNum(value),
            ts,
            sessionId,
          );
        }
      }
      for (const g of registry.gauges.values()) {
        for (const [key, value] of g.values) {
          const labels = _parseLabelKey(key);
          insert.run(
            g.name,
            "gauge",
            JSON.stringify(labels),
            safeNum(value),
            ts,
            sessionId,
          );
        }
      }
      for (const h of registry.histograms.values()) {
        insert.run(
          h.name,
          "histogram",
          JSON.stringify({ count: h.count, sum: h.sum }),
          safeNum(h.sum),
          ts,
          sessionId,
        );
      }
      db.exec("COMMIT");
    } catch (err) {
      try {
        db.exec("ROLLBACK");
      } catch {
        // swallow — the original error below is the one worth reporting
      }
      throw err;
    }
  } catch (err) {
    if (err.message?.includes("database is not open")) {
      closeMetricsDb();
      return;
    }
    logWarning("metrics-central", `DB persist failed: ${err.message}`);
  }
  // Prune if the table has grown beyond the cap (best-effort; never block flush)
  try {
    const row = _metricsDb?.prepare("SELECT count(*) as n FROM metrics").get();
    if (row && row.n > METRICS_DB_ROW_CAP) {
      _metricsDb
        .prepare(
          `DELETE FROM metrics WHERE rowid NOT IN (
            SELECT rowid FROM metrics ORDER BY timestamp DESC LIMIT ${METRICS_DB_ROW_CAP}
          )`,
        )
        .run();
    }
  } catch (_) {
    // swallow — prune failure must never surface to the user
  }
}

// ─── Flush with Retry ───────────────────────────────────────────────────────

/** Cancel a pending backoff retry, if any. */
function cancelFlushRetry() {
  if (!_flushRetryTimer) return;
  clearTimeout(_flushRetryTimer);
  _flushRetryTimer = null;
}

/**
 * Schedule one backoff retry of flushMetrics. At most one retry is pending at
 * a time, and the timer is unref'd so it never keeps the process alive.
 */
function scheduleFlushRetry() {
  if (_flushRetryTimer) return;
  const delay = FLUSH_RETRY_BASE_MS * 2 ** (_flushFailures - 1);
  _flushRetryTimer = setTimeout(() => {
    _flushRetryTimer = null;
    flushMetrics();
  }, delay);
  if (_flushRetryTimer.unref) _flushRetryTimer.unref();
}

/**
 * Write the current registry to the Prometheus text file and to metrics.db.
 *
 * No-op until initMetricsCentral has set a base path. On failure the flush is
 * retried with exponential backoff until FLUSH_RETRY_MAX consecutive failures
 * have occurred; after that the failure is recorded as a metric instead.
 */
function flushMetrics() {
  if (!_basePath) return;
  const flushStartTime = Date.now();
  _flushCount++;
  try {
    const text = getRegistry().buildText();
    const path = metricsFilePath(_basePath);
    mkdirSync(join(sfRoot(_basePath), "runtime"), { recursive: true });
    writeFileSync(path, text, "utf-8");
    // Persist to dedicated metrics.db
    persistMetricsToDb(getRegistry(), _sessionId, null);
    // Update performance metrics
    _flushSuccessCount++;
    _lastFlushDuration = Date.now() - flushStartTime;
    _lastFlushTimestamp = Date.now();
    _totalFlushDuration += _lastFlushDuration;
    _flushFailures = 0;
    // Record flush performance metrics
    try {
      getRegistry()
        .counter(
          "sf_metrics_flush_success_total",
          "Total successful metrics flushes",
          [],
        )
        .inc({}, 1);
      getRegistry()
        .gauge(
          "sf_metrics_flush_duration_ms",
          "Duration of last metrics flush in milliseconds",
          [],
        )
        .set({}, _lastFlushDuration);
    } catch {
      // Best effort - don't let metrics recording break the flush
    }
  } catch (err) {
    _flushFailureCount++;
    _flushFailures++;
    logWarning(
      "metrics-central",
      `Flush failed (attempt ${_flushFailures}): ${err.message}`,
    );
    if (_flushFailures < FLUSH_RETRY_MAX) {
      scheduleFlushRetry();
    } else {
      // Record flush failure as a metric
      try {
        getRegistry()
          .counter(
            "sf_metrics_flush_failed_total",
            "Total metrics flush failures",
            [],
          )
          .inc({}, 1);
      } catch {
        // Best effort
      }
    }
  }
}

// ─── Public API ─────────────────────────────────────────────────────────────

/**
 * Initialize the centralized metrics system.
 *
 * Safe to call again: the flush timer is replaced, and the metrics database
 * is reopened when `basePath` now resolves to a different file.
 *
 * @param {string} basePath — project root
 * @param {object} [opts] — { flushIntervalMs, sessionId, dbAdapter }
 * @param {number} [opts.flushIntervalMs] — flush period in milliseconds;
 *   anything that is not a positive finite number falls back to the default
 */
export function initMetricsCentral(basePath, opts = {}) {
  _basePath = basePath;
  _sessionId = opts.sessionId ?? "";
  _dbAdapter = opts.dbAdapter ?? null; // accepted but no longer used for metrics writes
  // A zero, negative or NaN interval would make setInterval fire continuously.
  const requestedInterval = opts.flushIntervalMs;
  const interval =
    Number.isFinite(requestedInterval) && requestedInterval > 0
      ? requestedInterval
      : FLUSH_INTERVAL_MS;
  // Reset metrics system stats on fresh init
  if (!_flushTimer) {
    _metricsSystemStartTime = Date.now();
    _flushCount = 0;
    _flushSuccessCount = 0;
    _flushFailureCount = 0;
    _lastFlushDuration = 0;
    _lastFlushTimestamp = 0;
    _totalFlushDuration = 0;
  }
  if (_flushTimer) clearInterval(_flushTimer);
  _flushTimer = setInterval(flushMetrics, interval);
  // Ensure timer doesn't keep process alive
  if (_flushTimer.unref) _flushTimer.unref();
  // Open dedicated metrics.db (separate from main sf.db to avoid WAL pressure)
  openMetricsDb(basePath);
  // Start periodic metrics system health reporting
  if (!_metricsHealthTimer) {
    _metricsHealthTimer = setInterval(() => {
      try {
        updateMetricsSystemHealth();
      } catch {
        // Non-fatal
      }
    }, 300000); // Every 5 minutes
    if (_metricsHealthTimer.unref) _metricsHealthTimer.unref();
  }
}

/**
 * Update metrics system health metrics (uptime, database status, and the
 * number of metrics currently held in memory).
 */
function updateMetricsSystemHealth() {
  const registry = getRegistry();
  try {
    // Record system uptime
    const uptime = Math.floor((Date.now() - _metricsSystemStartTime) / 1000);
    registry
      .gauge(
        "sf_metrics_system_uptime_seconds",
        "Metrics system uptime in seconds",
        [],
      )
      .set({}, uptime);
    // Record database status
    registry
      .gauge(
        "sf_metrics_database_status",
        "Database connection status (1=connected, 0=disconnected)",
        ["project_path"],
      )
      .set({ project_path: _basePath || "unknown" }, _metricsDb ? 1 : 0);
    // Record in-memory metrics count
    let totalMetrics = 0;
    totalMetrics += registry.counters.size;
    totalMetrics += registry.gauges.size;
    totalMetrics += registry.histograms.size;
    registry
      .gauge(
        "sf_metrics_active_count",
        "Number of active metrics in memory",
        [],
      )
      .set({}, totalMetrics);
  } catch (err) {
    logWarning(
      "metrics-central",
      `Failed to update health metrics: ${err.message}`,
    );
  }
}

/**
 * Stop the metrics collector.
 *
 * Cancels all timers, performs one final flush, then clears the session state
 * and closes the metrics database.
 */
export function stopMetricsCentral() {
  if (_flushTimer) {
    clearInterval(_flushTimer);
    _flushTimer = null;
  }
  if (_metricsHealthTimer) {
    clearInterval(_metricsHealthTimer);
    _metricsHealthTimer = null;
  }
  cancelFlushRetry();
  // Final flush attempt
  flushMetrics();
  // A failed final flush may have scheduled a retry; it could do nothing
  // useful once the base path is cleared, so drop it.
  cancelFlushRetry();
  _flushFailures = 0;
  _basePath = "";
  _sessionId = "";
  _dbAdapter = null;
  closeMetricsDb();
}

/**
 * Record a counter increment.
 *
 * @param {string} name — metric name (sf_ prefix recommended)
 * @param {object} [labels] — label key-value pairs
 * @param {number} [amount] — increment amount (default 1); non-finite
 *   amounts are ignored
 * @throws {TypeError|Error} when `name` is not a valid metric name
 */
export function recordCounter(name, labels = {}, amount = 1) {
  validateMetricName(name);
  const meta = getMetricMeta(name);
  // Inject session_id into labels if available
  if (_sessionId && !labels.session_id) {
    labels = { ...labels, session_id: _sessionId };
  }
  getRegistry()
    .counter(name, meta.help, Object.keys(labels))
    .inc(labels, amount);
}

/**
 * Record a gauge value.
 *
 * @param {string} name — metric name
 * @param {number} value — gauge value
 * @param {object} [labels] — label key-value pairs
 * @throws {TypeError|Error} when `name` is not a valid metric name
 */
export function recordGauge(name, value, labels = {}) {
  validateMetricName(name);
  const meta = getMetricMeta(name);
  if (_sessionId && !labels.session_id) {
    labels = { ...labels, session_id: _sessionId };
  }
  getRegistry().gauge(name, meta.help, Object.keys(labels)).set(labels, value);
}

/**
 * Record a histogram observation.
 *
 * @param {string} name — metric name
 * @param {number} value — observed value; non-finite values are ignored
 * @throws {TypeError|Error} when `name` is not a valid metric name
 */
export function recordHistogram(name, value) {
  validateMetricName(name);
  const meta = getMetricMeta(name);
  getRegistry().histogram(name, meta.help, meta.buckets).observe(value);
}

/**
 * Record cost and token usage for a unit.
 *
 * Missing or non-numeric amounts are recorded as 0. They must not be passed
 * through as `undefined`, because recordCounter would then apply its default
 * increment of 1 and invent cost or tokens that were never spent.
 *
 * @param {string} unitId — unit identifier
 * @param {string} modelId — model identifier
 * @param {number} inputTokens — input token count
 * @param {number} outputTokens — output token count
 * @param {number} cost — cost in USD
 * @param {string} [workMode] — current work mode
 */
export function recordCost(
  unitId,
  modelId,
  inputTokens,
  outputTokens,
  cost,
  workMode = "",
) {
  const toAmount = (raw) => {
    const n = Number(raw);
    return Number.isFinite(n) ? n : 0;
  };
  const safeCost = toAmount(cost);
  const labels = { unit_id: unitId, model_id: modelId };
  if (workMode) labels.work_mode = workMode;
  recordCounter("sf_cost_total", labels, safeCost);
  recordCounter(
    "sf_tokens_input_total",
    { model_id: modelId },
    toAmount(inputTokens),
  );
  recordCounter(
    "sf_tokens_output_total",
    { model_id: modelId },
    toAmount(outputTokens),
  );
  recordGauge("sf_cost_last", safeCost, { unit_id: unitId, model_id: modelId });
}

/**
 * Record tool execution performance.
 *
 * @param {string} toolName — name of the tool
 * @param {number} durationMs — execution duration in milliseconds
 * @param {boolean} [isError] — whether the execution resulted in an error
 * @param {string} [errorType] — type of error if isError is true
 */
export function recordToolExecution(
  toolName,
  durationMs,
  isError = false,
  errorType = "",
) {
  recordHistogram("sf_tool_execution_duration_ms", durationMs);
  if (isError) {
    recordCounter(
      "sf_tool_errors_total",
      { tool_name: toolName, error_type: errorType || "unknown" },
      1,
    );
  }
}

/**
 * Record model request performance.
 *
 * @param {string} modelId — model identifier
 * @param {number} durationMs — request duration in milliseconds
 * @param {boolean} [isError] — whether the request resulted in an error
 * @param {string} [errorType] — type of error if isError is true
 */
export function recordModelRequest(
  modelId,
  durationMs,
  isError = false,
  errorType = "",
) {
  recordHistogram("sf_model_request_duration_ms", durationMs);
  if (isError) {
    recordCounter(
      "sf_model_errors_total",
      { model_id: modelId, error_type: errorType || "unknown" },
      1,
    );
  }
}

/**
 * Record database operation performance.
 *
 * @param {string} operation — database operation name
 * @param {number} durationMs — query duration in milliseconds
 * @param {boolean} [isError] — whether the operation resulted in an error
 * @param {string} [errorType] — type of error if isError is true
 */
export function recordDatabaseOperation(
  operation,
  durationMs,
  isError = false,
  errorType = "",
) {
  recordHistogram("sf_database_query_duration_ms", durationMs);
  if (isError) {
    recordCounter(
      "sf_database_errors_total",
      { operation, error_type: errorType || "unknown" },
      1,
    );
  }
}

/**
 * Record system warning.
 *
 * @param {string} component — system component that issued the warning
 * @param {string} warningType — type of warning
 */
export function recordSystemWarning(component, warningType) {
  recordCounter(
    "sf_system_warnings_total",
    { component, warning_type: warningType },
    1,
  );
}

/**
 * Update resource usage gauges. Only the fields present on `resources` are
 * written; absent fields leave their gauges untouched.
 *
 * @param {object} resources — resource usage data
 * @param {number} [resources.activeSessions] — number of active sessions
 * @param {number} [resources.activeAgents] — number of active agents
 * @param {number} [resources.concurrentToolCalls] — number of concurrent tool calls
 */
export function updateResourceGauges(resources = {}) {
  if (resources.activeSessions !== undefined) {
    recordGauge("sf_active_sessions_count", resources.activeSessions);
  }
  if (resources.activeAgents !== undefined) {
    recordGauge("sf_active_agents_count", resources.activeAgents);
  }
  if (resources.concurrentToolCalls !== undefined) {
    recordGauge("sf_concurrent_tool_calls", resources.concurrentToolCalls);
  }
}

/**
 * Get current metrics text in Prometheus format.
 */
export function getMetricsText() {
  return getRegistry().buildText();
}

/**
 * Read persisted metrics from disk.
 *
 * @param {string} basePath — project root
 * @returns {string|null} file contents, or null when the file is missing or
 *   cannot be read
 */
export function readMetricsFile(basePath) {
  const path = metricsFilePath(basePath);
  if (!existsSync(path)) return null;
  try {
    return readFileSync(path, "utf-8");
  } catch {
    return null;
  }
}

/**
 * Query metrics from the dedicated metrics database, newest first.
 *
 * @param {object} _db — unused; kept so existing call sites stay valid
 * @param {string} [sessionId] — session to filter by
 * @param {string} [name] — metric name to filter by
 * @param {number} [limit] — max rows to return (default 1000); fractional
 *   values are truncated and non-numeric values fall back to the default
 * @returns {Array} — metric rows; empty when the database is not open or the
 *   query fails
 */
export function queryMetrics(_db, sessionId = null, name = null, limit = 1000) {
  if (!_metricsDb) return [];
  try {
    let sql = "SELECT * FROM metrics WHERE 1=1";
    const params = [];
    if (sessionId) {
      sql += " AND session_id = ?";
      params.push(sessionId);
    }
    if (name) {
      sql += " AND name = ?";
      params.push(name);
    }
    sql += " ORDER BY timestamp DESC LIMIT ?";
    // LIMIT needs an integer; a fractional or NaN bind value makes the query fail.
    params.push(Number.isFinite(limit) ? Math.trunc(limit) : 1000);
    const stmt = _metricsDb.prepare(sql);
    return stmt.all(...params);
  } catch (err) {
    logWarning("metrics-central", `Query failed: ${err.message}`);
    return [];
  }
}

// ─── Metric Metadata Registry ───────────────────────────────────────────────

const METRIC_META = {
  // Subagent inheritance
  sf_subagent_dispatch_total: {
    help: "Total subagent dispatch attempts",
    labels: ["work_mode", "permission_profile"],
  },
  sf_subagent_dispatch_blocked: {
    help: "Subagent dispatches blocked by inheritance policy",
    labels: ["reason", "work_mode", "permission_profile"],
  },
  sf_subagent_dispatch_allowed: {
    help: "Subagent dispatches allowed after inheritance check",
    labels: ["work_mode", "permission_profile"],
  },
  // Mode transitions
  sf_mode_transition_total: {
    help: "Total mode transitions",
    labels: ["axis", "from", "to", "reason"],
  },
  // Task frontmatter
  sf_task_created_total: {
    help: "Total tasks created with frontmatter",
    labels: ["risk_level", "mutation_scope"],
  },
  sf_task_parallel_blocked: {
    help: "Tasks blocked from parallel execution by frontmatter",
    labels: ["reason"],
  },
  // Parallel intent
  sf_parallel_intent_declared: {
    help: "Parallel worker intents declared",
    labels: ["milestone_id"],
  },
  sf_parallel_intent_conflict: {
    help: "Parallel intent conflicts detected",
    labels: ["milestone_id"],
  },
  // Remote steering
  sf_remote_steering_applied: {
    help: "Remote steering directives applied",
    labels: ["directive_type", "source"],
  },
  sf_remote_steering_rejected: {
    help: "Remote steering directives rejected (throttle/invalid)",
    labels: ["reason"],
  },
  // Skill eval
  sf_skill_eval_runs_total: {
    help: "Total skill evaluation runs",
    labels: ["skill_name", "passed"],
  },
  sf_skill_eval_duration_ms: {
    help: "Skill evaluation duration in milliseconds",
    buckets: [100, 500, 1000, 5000, 10000, 30000],
  },
  // Cost guard
  sf_cost_guard_blocked: {
    help: "Units blocked by cost guard",
    labels: ["reason", "model_id"],
  },
  sf_cost_guard_hourly_spend: {
    help: "Current hourly spend in USD",
  },
  // Gate runner
  sf_gate_runs_total: {
    help: "Total gate executions",
    labels: ["gate_id", "outcome"],
  },
  sf_gate_latency_ms: {
    help: "Gate execution latency in milliseconds",
    buckets: [10, 50, 100, 250, 500, 1000, 2500, 5000],
  },
  // Message bus
  sf_message_bus_messages_total: {
    help: "Total messages in bus",
    labels: ["agent_id"],
  },
  sf_message_bus_unread_total: {
    help: "Unread messages in bus",
    labels: ["agent_id"],
  },
  // Cost tracking
  sf_cost_total: {
    help: "Total cost in USD",
    labels: ["unit_id", "model_id", "work_mode"],
  },
  sf_tokens_input_total: {
    help: "Total input tokens",
    labels: ["model_id"],
  },
  sf_tokens_output_total: {
    help: "Total output tokens",
    labels: ["model_id"],
  },
  sf_cost_last: {
    help: "Last recorded cost in USD",
    labels: ["unit_id", "model_id"],
  },
  // Performance tracking
  sf_session_start_duration_ms: {
    help: "Session start duration in milliseconds",
    buckets: [100, 250, 500, 1000, 2000, 5000],
  },
  sf_tool_execution_duration_ms: {
    help: "Tool execution duration in milliseconds",
    buckets: [10, 50, 100, 250, 500, 1000, 2500, 5000, 10000],
  },
  sf_model_request_duration_ms: {
    help: "Model request duration in milliseconds",
    buckets: [100, 500, 1000, 2500, 5000, 10000, 30000, 60000],
  },
  sf_database_query_duration_ms: {
    help: "Database query duration in milliseconds",
    buckets: [1, 5, 10, 25, 50, 100, 250, 500],
  },
  // Resource usage
  sf_active_sessions_count: {
    help: "Number of active sessions",
  },
  sf_active_agents_count: {
    help: "Number of active agents",
  },
  sf_concurrent_tool_calls: {
    help: "Number of concurrent tool calls",
  },
  // Error tracking
  sf_tool_errors_total: {
    help: "Total tool execution errors",
    labels: ["tool_name", "error_type"],
  },
  sf_model_errors_total: {
    help: "Total model request errors",
    labels: ["model_id", "error_type"],
  },
  sf_database_errors_total: {
    help: "Total database operation errors",
    labels: ["operation", "error_type"],
  },
  sf_system_warnings_total: {
    help: "Total system warnings",
    labels: ["component", "warning_type"],
  },
  // Internal
  sf_metrics_flush_failed_total: {
    help: "Total metrics flush failures",
  },
  sf_metrics_flush_success_total: {
    help: "Total successful metrics flushes",
  },
  sf_metrics_flush_duration_ms: {
    help: "Duration of last metrics flush in milliseconds",
  },
  sf_metrics_system_uptime_seconds: {
    help: "Metrics system uptime in seconds",
  },
  sf_metrics_database_status: {
    help: "Database connection status (1=connected, 0=disconnected)",
    labels: ["project_path"],
  },
};

/**
 * Look up metadata for a metric, falling back to a generic entry whose help
 * text is the metric name. Only own properties count, so names that happen to
 * match inherited Object members (e.g. "constructor") get the fallback too.
 */
function getMetricMeta(name) {
  return Object.hasOwn(METRIC_META, name)
    ? METRIC_META[name]
    : { help: name, labels: [] };
}

/**
 * Register custom metric metadata, replacing any existing entry for `name`.
 *
 * @param {string} name — metric name
 * @param {string} help — help text used in the exposition output
 * @param {string[]} [labels] — label names
 * @param {number[]} [buckets] — histogram bucket upper bounds
 */
export function registerMetricMeta(name, help, labels = [], buckets) {
  METRIC_META[name] = { help, labels, buckets };
}