diff --git a/src/resources/extensions/gsd/auto.ts b/src/resources/extensions/gsd/auto.ts index c2bcfe8f4..3f2df4967 100644 --- a/src/resources/extensions/gsd/auto.ts +++ b/src/resources/extensions/gsd/auto.ts @@ -66,6 +66,7 @@ import { import { ensureGitignore, untrackRuntimeFiles } from "./gitignore.js"; import { runGSDDoctor, rebuildState } from "./doctor.js"; import { snapshotSkills, clearSkillSnapshot } from "./skill-discovery.js"; +import { captureAvailableSkills, getAndClearSkills, resetSkillTelemetry } from "./skill-telemetry.js"; import { initMetrics, resetMetrics, snapshotUnitMetrics, getLedger, getProjectTotals, formatCost, formatTokenCount, @@ -480,6 +481,7 @@ export async function stopAuto(ctx?: ExtensionContext, pi?: ExtensionAPI): Promi clearUnitTimeout(); if (lockBase()) clearLock(lockBase()); clearSkillSnapshot(); + resetSkillTelemetry(); _dispatching = false; _skipDepth = 0; @@ -2210,6 +2212,7 @@ async function dispatchNextUnit( } } currentUnit = { type: unitType, id: unitId, startedAt: Date.now() }; + captureAvailableSkills(); // Capture skill telemetry at dispatch time (#599) writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnit.startedAt, { phase: "dispatched", wrapupWarningSent: false, diff --git a/src/resources/extensions/gsd/commands.ts b/src/resources/extensions/gsd/commands.ts index 17fb3de2b..b320a7159 100644 --- a/src/resources/extensions/gsd/commands.ts +++ b/src/resources/extensions/gsd/commands.ts @@ -66,13 +66,13 @@ function projectRoot(): string { export function registerGSDCommand(pi: ExtensionAPI): void { pi.registerCommand("gsd", { - description: "GSD — Get Shit Done: /gsd help|next|auto|stop|pause|status|visualize|queue|capture|triage|history|undo|skip|export|cleanup|prefs|config|hooks|run-hook|doctor|migrate|remote|steer|knowledge", + description: "GSD — Get Shit Done: /gsd 
help|next|auto|stop|pause|status|visualize|queue|capture|triage|history|undo|skip|export|cleanup|prefs|config|hooks|run-hook|skill-health|doctor|migrate|remote|steer|knowledge", getArgumentCompletions: (prefix: string) => { const subcommands = [ "help", "next", "auto", "stop", "pause", "status", "visualize", "queue", "discuss", "capture", "triage", "history", "undo", "skip", "export", "cleanup", "prefs", - "config", "hooks", "run-hook", "doctor", "migrate", "remote", "steer", "inspect", "knowledge", + "config", "hooks", "run-hook", "skill-health", "doctor", "migrate", "remote", "steer", "inspect", "knowledge", ]; const parts = prefix.trim().split(/\s+/); @@ -293,6 +293,12 @@ export function registerGSDCommand(pi: ExtensionAPI): void { return; } + // ─── Skill Health ──────────────────────────────────────────── + if (trimmed === "skill-health" || trimmed.startsWith("skill-health ")) { + await handleSkillHealth(trimmed.replace(/^skill-health\s*/, "").trim(), ctx); + return; + } + if (trimmed.startsWith("run-hook ")) { await handleRunHook(trimmed.replace(/^run-hook\s*/, "").trim(), ctx, pi); return; @@ -629,6 +635,47 @@ async function handleInspect(ctx: ExtensionCommandContext): Promise { } } +// ─── Skill Health ───────────────────────────────────────────────────────────── + +async function handleSkillHealth(args: string, ctx: ExtensionCommandContext): Promise { + const { + generateSkillHealthReport, + formatSkillHealthReport, + formatSkillDetail, + } = await import("./skill-health.js"); + + const basePath = projectRoot(); + + // /gsd skill-health — detail view + if (args && !args.startsWith("--")) { + const detail = formatSkillDetail(basePath, args); + ctx.ui.notify(detail, "info"); + return; + } + + // Parse flags + const staleMatch = args.match(/--stale\s+(\d+)/); + const staleDays = staleMatch ? 
parseInt(staleMatch[1], 10) : undefined; + const decliningOnly = args.includes("--declining"); + + const report = generateSkillHealthReport(basePath, staleDays); + + if (decliningOnly) { + if (report.decliningSkills.length === 0) { + ctx.ui.notify("No skills flagged for declining performance.", "info"); + return; + } + const filtered = { + ...report, + skills: report.skills.filter(s => s.flagged), + }; + ctx.ui.notify(formatSkillHealthReport(filtered), "info"); + return; + } + + ctx.ui.notify(formatSkillHealthReport(report), "info"); +} + // ─── Preferences Wizard ─────────────────────────────────────────────────────── /** Build short summary strings for each preference category. */ diff --git a/src/resources/extensions/gsd/metrics.ts b/src/resources/extensions/gsd/metrics.ts index ad48d614e..8f0daa34a 100644 --- a/src/resources/extensions/gsd/metrics.ts +++ b/src/resources/extensions/gsd/metrics.ts @@ -17,6 +17,7 @@ import { readFileSync, writeFileSync, mkdirSync } from "node:fs"; import { join } from "node:path"; import type { ExtensionContext } from "@gsd/pi-coding-agent"; import { gsdRoot } from "./paths.js"; +import { getAndClearSkills } from "./skill-telemetry.js"; // ─── Types ──────────────────────────────────────────────────────────────────── @@ -43,6 +44,7 @@ export interface UnitMetrics { baselineCharCount?: number; tier?: string; // complexity tier (light/standard/heavy) if dynamic routing active modelDowngraded?: boolean; // true if dynamic routing used a cheaper model + skills?: string[]; // skill names available/loaded during this unit (#599) } export interface MetricsLedger { @@ -167,6 +169,12 @@ export function snapshotUnitMetrics( ...(opts?.modelDowngraded !== undefined ? 
{ modelDowngraded: opts.modelDowngraded } : {}), }; + // Auto-capture skill telemetry (#599) + const skills = getAndClearSkills(); + if (skills.length > 0) { + unit.skills = skills; + } + ledger.units.push(unit); saveLedger(basePath, ledger); diff --git a/src/resources/extensions/gsd/preferences.ts b/src/resources/extensions/gsd/preferences.ts index f408c7763..86dfea6e4 100644 --- a/src/resources/extensions/gsd/preferences.ts +++ b/src/resources/extensions/gsd/preferences.ts @@ -28,6 +28,7 @@ const KNOWN_PREFERENCE_KEYS = new Set([ "custom_instructions", "models", "skill_discovery", + "skill_staleness_days", "auto_supervisor", "uat_dispatch", "unique_milestone_ids", @@ -122,6 +123,7 @@ export interface GSDPreferences { custom_instructions?: string[]; models?: GSDModelConfig | GSDModelConfigV2; skill_discovery?: SkillDiscoveryMode; + skill_staleness_days?: number; // Skills unused for N days get deprioritized (#599). 0 = disabled. Default: 60. auto_supervisor?: AutoSupervisorConfig; uat_dispatch?: boolean; unique_milestone_ids?: boolean; @@ -453,6 +455,15 @@ export function resolveSkillDiscoveryMode(): SkillDiscoveryMode { return prefs?.preferences.skill_discovery ?? "suggest"; } +/** + * Resolve the skill staleness threshold in days. + * Returns 0 if disabled, default 60 if not configured. + */ +export function resolveSkillStalenessDays(): number { + const prefs = loadEffectiveGSDPreferences(); + return prefs?.preferences.skill_staleness_days ?? 60; +} + /** * Resolve which model ID to use for a given auto-mode unit type. * Returns undefined if no model preference is set for this unit type. @@ -658,6 +669,7 @@ function mergePreferences(base: GSDPreferences, override: GSDPreferences): GSDPr custom_instructions: mergeStringLists(base.custom_instructions, override.custom_instructions), models: { ...(base.models ?? {}), ...(override.models ?? {}) }, skill_discovery: override.skill_discovery ?? 
base.skill_discovery, + skill_staleness_days: override.skill_staleness_days ?? base.skill_staleness_days, auto_supervisor: { ...(base.auto_supervisor ?? {}), ...(override.auto_supervisor ?? {}) }, uat_dispatch: override.uat_dispatch ?? base.uat_dispatch, unique_milestone_ids: override.unique_milestone_ids ?? base.unique_milestone_ids, @@ -718,6 +730,15 @@ export function validatePreferences(preferences: GSDPreferences): { } } + if (preferences.skill_staleness_days !== undefined) { + const days = Number(preferences.skill_staleness_days); + if (Number.isFinite(days) && days >= 0) { + validated.skill_staleness_days = Math.floor(days); + } else { + errors.push(`invalid skill_staleness_days: must be a non-negative number`); + } + } + validated.always_use_skills = normalizeStringList(preferences.always_use_skills); validated.prefer_skills = normalizeStringList(preferences.prefer_skills); validated.avoid_skills = normalizeStringList(preferences.avoid_skills); diff --git a/src/resources/extensions/gsd/prompts/heal-skill.md b/src/resources/extensions/gsd/prompts/heal-skill.md new file mode 100644 index 000000000..6388bfb9b --- /dev/null +++ b/src/resources/extensions/gsd/prompts/heal-skill.md @@ -0,0 +1,45 @@ +## Skill Heal Analysis + +Analyze the just-completed unit ({{unitId}}) for skill drift. + +### Steps + +1. **Identify loaded skill**: Check which SKILL.md file was read during this unit by examining recent tool calls. If no skill was explicitly loaded (no `read` call to a SKILL.md path), write "No skill loaded — skipping heal analysis" to {{healArtifact}} and stop. + +2. **Read the skill**: Load the SKILL.md that was used during this unit. + +3. **Compare execution to skill guidance**: Review what the agent actually did vs what the skill recommended. 
Look for: + - API patterns the skill recommended that the agent did differently + - Error handling approaches the skill specified but the agent bypassed + - Conventions the skill documented that the agent ignored + - Outdated instructions in the skill that caused errors, retries, or workarounds + - Commands or tools the skill referenced that no longer exist or have changed + +4. **Assess drift severity**: + - **None**: Agent followed skill correctly → write "No drift detected" to {{healArtifact}} and stop + - **Minor**: Agent found a better approach but skill isn't wrong → append a note to `.gsd/KNOWLEDGE.md` and stop + - **Significant**: Skill has outdated or incorrect guidance → continue to step 5 + +5. **If significant drift found**, append a heal suggestion to `.gsd/skill-review-queue.md`: + +```markdown +### {{skillName}} (flagged {{date}}) +- **Unit:** {{unitId}} +- **Issue:** {1-2 sentence description of what was wrong} +- **Root cause:** {outdated API / incorrect pattern / missing context / etc.} +- **Discovery method:** {how the agent discovered the skill was wrong — error message, trial and error, docs lookup, etc.} +- **Proposed fix:** + - File: {relative path to the file in the skill directory} + - Section: {section heading or line range} + - Current: {quote the incorrect/outdated text} + - Suggested: {the corrected text} +- **Action:** [ ] Reviewed [ ] Updated [ ] Dismissed +``` + +Then write a brief summary of the finding to {{healArtifact}}. + +**Critical rules:** +- Do NOT modify any skill files directly. Only write to the review queue. +- The SkillsBench research (Feb 2026) shows curated skills beat auto-generated ones by +16.2pp. Human review is what makes this valuable. +- Keep the analysis focused — don't flag stylistic preferences, only genuine errors or outdated content. +- If multiple issues found, write one entry per issue. 
/*
 * diff --git a/src/resources/extensions/gsd/skill-health.ts b/src/resources/extensions/gsd/skill-health.ts
 * new file mode 100644
 * index 000000000..e08ce3352
 * --- /dev/null
 * +++ b/src/resources/extensions/gsd/skill-health.ts
 * @@ -0,0 +1,417 @@
 */

/**
 * GSD Skill Health — Dashboard, Staleness, and Heal-Skill Integration (#599)
 *
 * Aggregates skill telemetry from metrics.json to surface:
 * - Per-skill pass/fail rates, token usage, and trends
 * - Staleness warnings for unused skills
 * - Declining performance flags
 * - Heal-skill suggestions (inspired by glittercowboy's heal-skill command)
 *
 * The heal-skill concept: when an agent deviates from what a skill recommends
 * during execution, detect the drift and propose specific fixes with user
 * approval before applying. This closes the feedback loop that SkillsBench
 * research identified as critical for skill quality.
 */

import { existsSync, readFileSync, readdirSync, statSync } from "node:fs";
import { join } from "node:path";
import { getAgentDir } from "@gsd/pi-coding-agent";
import type { UnitMetrics, MetricsLedger } from "./metrics.js";
import { formatCost, formatTokenCount, loadLedgerFromDisk } from "./metrics.js";
import { getSkillLastUsed, detectStaleSkills } from "./skill-telemetry.js";

// ─── Types ────────────────────────────────────────────────────────────────────

/** Aggregated health figures for a single skill. */
export interface SkillHealthEntry {
  name: string;
  totalUses: number;
  /** Fraction (0–1) of uses counted as successful by the aggregation heuristic */
  successRate: number;
  /** Average tokens per unit when this skill is loaded */
  avgTokens: number;
  /** Token trend over recent uses */
  tokenTrend: "stable" | "rising" | "declining";
  /** Timestamp (ms) of most recent use */
  lastUsed: number;
  /** Whole days since last use */
  staleDays: number;
  /** Average cost per unit when this skill is loaded */
  avgCost: number;
  /** Whether this skill is flagged for review */
  flagged: boolean;
  /** Reason for flag, if any */
  flagReason?: string;
}

/** Everything the `/gsd skill-health` dashboard renders. */
export interface SkillHealthReport {
  generatedAt: string;
  totalUnitsWithSkills: number;
  skills: SkillHealthEntry[];
  staleSkills: string[];
  decliningSkills: string[];
  suggestions: SkillHealSuggestion[];
  /**
   * Staleness threshold (days) that produced `staleSkills`.
   * Optional so reports built by older callers still type-check; the
   * formatter falls back to the default threshold when it is absent.
   */
  staleThresholdDays?: number;
}

/** One actionable hint shown in the "Heal Suggestions" section. */
export interface SkillHealSuggestion {
  skillName: string;
  trigger: "declining_success" | "rising_tokens" | "high_retry_rate" | "stale";
  message: string;
  severity: "info" | "warning" | "critical";
}

// ─── Constants ────────────────────────────────────────────────────────────────

/** Default staleness threshold in days */
const DEFAULT_STALE_DAYS = 60;

/** Success rate below this triggers a flag */
const SUCCESS_RATE_THRESHOLD = 0.70;

/** Token increase percentage that triggers a "rising" flag */
const TOKEN_RISE_THRESHOLD = 0.20;

/** Minimum uses before trend analysis kicks in */
const MIN_USES_FOR_TREND = 5;

/** Window size for trend comparison (compare last N to previous N) */
const TREND_WINDOW = 5;

/** Column widths shared by the table header and every data row, so they cannot drift apart */
const TABLE_COLUMNS = { name: 24, uses: 5, success: 9, tokens: 11, trend: 10 } as const;

/** Gap inserted between the right-aligned token column and the left-aligned trend column */
const TABLE_GAP = "  ";

// ─── Public API ───────────────────────────────────────────────────────────────

/**
 * Generate a full skill health report from metrics data.
 *
 * @param basePath  Project root handed to `loadLedgerFromDisk`.
 * @param staleDays Staleness threshold in days; `DEFAULT_STALE_DAYS` when omitted.
 * @returns The aggregated report. Only ledger units that recorded at least
 *          one skill contribute to it.
 */
export function generateSkillHealthReport(basePath: string, staleDays?: number): SkillHealthReport {
  const ledger = loadLedgerFromDisk(basePath);
  const unitsWithSkills = (ledger?.units ?? []).filter(u => u.skills && u.skills.length > 0);
  const threshold = staleDays ?? DEFAULT_STALE_DAYS;

  const skills = Array.from(aggregateBySkill(unitsWithSkills).values())
    .sort((a, b) => b.totalUses - a.totalUses); // most-used skills first
  const staleSkills = detectStaleSkills(unitsWithSkills, threshold);
  const decliningSkills = skills.filter(s => s.flagged).map(s => s.name);
  const suggestions = generateSuggestions(skills, staleSkills);

  return {
    generatedAt: new Date().toISOString(),
    totalUnitsWithSkills: unitsWithSkills.length,
    skills,
    staleSkills,
    decliningSkills,
    suggestions,
    // Carried along so the formatter can print the threshold that was really used.
    staleThresholdDays: threshold,
  };
}

/**
 * Format a skill health report for terminal display.
 *
 * @param report Report produced by `generateSkillHealthReport` (or a filtered copy of one).
 * @returns A newline-joined, human-readable dashboard.
 */
export function formatSkillHealthReport(report: SkillHealthReport): string {
  const lines: string[] = [
    "Skill Health Report",
    "═".repeat(60),
    `Generated: ${report.generatedAt}`,
    `Units with skill data: ${report.totalUnitsWithSkills}`,
    "",
  ];

  if (report.skills.length === 0) {
    lines.push("No skill telemetry data yet. Run auto-mode to start collecting.");
    lines.push("Skill usage is recorded per-unit in metrics.json.");
    return lines.join("\n");
  }

  // Main table — the header is built from the same widths as the rows.
  lines.push(
    "Skill".padEnd(TABLE_COLUMNS.name) +
      "Uses".padStart(TABLE_COLUMNS.uses) +
      "Success%".padStart(TABLE_COLUMNS.success) +
      "Avg Tokens".padStart(TABLE_COLUMNS.tokens) +
      TABLE_GAP +
      "Trend".padEnd(TABLE_COLUMNS.trend) +
      "Last Used",
  );
  lines.push("─".repeat(80));

  for (const s of report.skills) {
    lines.push(
      s.name.padEnd(TABLE_COLUMNS.name).slice(0, TABLE_COLUMNS.name) +
        String(s.totalUses).padStart(TABLE_COLUMNS.uses) +
        `${Math.round(s.successRate * 100)}%`.padStart(TABLE_COLUMNS.success) +
        formatTokenCount(s.avgTokens).padStart(TABLE_COLUMNS.tokens) +
        TABLE_GAP +
        s.tokenTrend.padEnd(TABLE_COLUMNS.trend) +
        describeLastUsed(s.staleDays) +
        (s.flagged ? " ⚠" : ""),
    );
  }

  // Stale skills — report the threshold that was applied, not a fixed "60".
  if (report.staleSkills.length > 0) {
    const thresholdDays = report.staleThresholdDays ?? DEFAULT_STALE_DAYS;
    lines.push("");
    lines.push(`Stale Skills (unused for ${thresholdDays}+ days):`);
    for (const name of report.staleSkills) {
      lines.push(`  ⏸ ${name}`);
    }
  }

  // Declining skills
  if (report.decliningSkills.length > 0) {
    lines.push("");
    lines.push("Declining Skills (flagged for review):");
    for (const name of report.decliningSkills) {
      const entry = report.skills.find(s => s.name === name);
      if (entry?.flagReason) {
        lines.push(`  ⚠ ${name}: ${entry.flagReason}`);
      }
    }
  }

  // Suggestions
  if (report.suggestions.length > 0) {
    lines.push("");
    lines.push("Heal Suggestions:");
    for (const sug of report.suggestions) {
      const icon = sug.severity === "critical" ? "🔴" : sug.severity === "warning" ? "🟡" : "🔵";
      lines.push(`  ${icon} ${sug.skillName}: ${sug.message}`);
    }
  }

  return lines.join("\n");
}

/**
 * Format a detailed health view for a single skill.
 *
 * @param basePath  Project root handed to `loadLedgerFromDisk`.
 * @param skillName Skill to look up in each unit's recorded `skills` list.
 * @returns Usage totals, the ten most recent uses and, when the skill's
 *          SKILL.md exists under the agent directory, its path and mtime.
 */
export function formatSkillDetail(basePath: string, skillName: string): string {
  const ledger = loadLedgerFromDisk(basePath);
  const units = (ledger?.units ?? []).filter(u => u.skills?.includes(skillName));
  const lines: string[] = [`Skill Detail: ${skillName}`, "═".repeat(50)];

  if (units.length === 0) {
    lines.push("No usage data recorded for this skill.");
    return lines.join("\n");
  }

  const totalTokens = units.reduce((sum, u) => sum + u.tokens.total, 0);
  const totalCost = units.reduce((sum, u) => sum + u.cost, 0);
  const avgTokens = Math.round(totalTokens / units.length);
  const avgCost = totalCost / units.length;

  // Pad labels to one width so the values line up in a column.
  const field = (label: string, value: string | number): string => `${label.padEnd(16)}${value}`;
  lines.push(field("Total uses:", units.length));
  lines.push(field("Total tokens:", formatTokenCount(totalTokens)));
  lines.push(field("Total cost:", formatCost(totalCost)));
  lines.push(field("Avg tokens/use:", formatTokenCount(avgTokens)));
  lines.push(field("Avg cost/use:", formatCost(avgCost)));
  lines.push("");

  // Recent uses — last ten ledger entries, newest first.
  lines.push("Recent uses:");
  for (const u of units.slice(-10).reverse()) {
    const date = new Date(u.finishedAt).toISOString().slice(0, 10);
    lines.push(
      `  ${date}  ${u.id.padEnd(20)}  ${formatTokenCount(u.tokens.total).padStart(8)} tokens  ${formatCost(u.cost)}`,
    );
  }

  // Check for SKILL.md existence. statSync comes from the static node:fs import:
  // `require` is not defined inside an ES module.
  const skillPath = join(getAgentDir(), "skills", skillName, "SKILL.md");
  if (existsSync(skillPath)) {
    const stat = statSync(skillPath);
    lines.push("");
    lines.push(`SKILL.md: ${skillPath}`);
    lines.push(`Last modified: ${stat.mtime.toISOString().slice(0, 10)}`);
  }

  return lines.join("\n");
}

/** Render the "Last Used" cell; zero (or a negative, clock-skewed) day count reads as "today". */
function describeLastUsed(staleDays: number): string {
  if (staleDays <= 0) return "today";
  return staleDays === 1 ? "1 day ago" : `${staleDays} days ago`;
}

/**
 * Build the heal-skill prompt for a post-unit hook.
 * This is the GSD-integrated version of glittercowboy's heal-skill concept.
 *
 * The prompt instructs the agent to:
 * 1. Detect which skill was loaded during the completed unit
 * 2. Analyze whether the agent deviated from the skill's instructions
 * 3. If deviations found, propose specific fixes (not auto-apply)
 * 4.
 */
/**
 * Build the heal-skill prompt for a post-unit hook.
 * This is the GSD-integrated version of glittercowboy's heal-skill concept.
 *
 * The prompt instructs the agent to:
 * 1. Detect which skill was loaded during the completed unit
 * 2. Analyze whether the agent deviated from the skill's instructions
 * 3. If deviations found, propose specific fixes (not auto-apply)
 * 4. Write suggestions to a review queue for human approval
 *
 * The wording is kept in step with `prompts/heal-skill.md`: suggestions are
 * appended to the review queue (never written over it), the agent is asked
 * how it discovered the problem, and each issue gets its own entry.
 *
 * @param unitId Identifier of the unit that just completed; interpolated
 *               verbatim into the prompt text (twice).
 * @returns The complete Markdown prompt.
 */
export function buildHealSkillPrompt(unitId: string): string {
  return `## Skill Heal Analysis

Analyze the just-completed unit (${unitId}) for skill drift.

### Steps

1. **Identify loaded skill**: Check which SKILL.md file was read during this unit.
   If no skill was loaded, write "No skill loaded — skipping heal analysis" and stop.

2. **Read the skill**: Load the SKILL.md that was used.

3. **Compare execution to skill guidance**: Review what the agent actually did vs what
   the skill recommended. Look for:
   - API patterns the skill recommended that the agent did differently
   - Error handling approaches the skill specified but the agent bypassed
   - Conventions the skill documented that the agent ignored
   - Outdated instructions in the skill that caused errors, retries, or workarounds
   - Commands or tools the skill referenced that no longer exist or have changed

4. **Assess drift severity**:
   - **None**: Agent followed skill correctly → write "No drift detected" to the summary and stop
   - **Minor**: Agent found a better approach but skill isn't wrong → append a note to \`.gsd/KNOWLEDGE.md\` and stop
   - **Significant**: Skill has outdated or incorrect guidance → continue to step 5

5. **If significant drift found**, append a heal suggestion to \`.gsd/skill-review-queue.md\`:

\`\`\`markdown
### {skill-name} (flagged {date})
- **Unit:** ${unitId}
- **Issue:** {1-2 sentence description}
- **Root cause:** {outdated API / incorrect pattern / missing context}
- **Discovery method:** {error message / trial and error / docs lookup}
- **Proposed fix:**
  - File: SKILL.md
  - Section: {section name}
  - Current: {quote the incorrect text}
  - Suggested: {the corrected text}
- **Action:** [ ] Reviewed [ ] Updated [ ] Dismissed
\`\`\`

**Important:** Do NOT modify the skill directly. Append the suggestion to the review queue.
If multiple issues are found, write one entry per issue.
The SkillsBench research shows that human-curated skills outperform auto-generated ones by +16.2pp.
The human review step is what makes this valuable.`;
}

/**
 * Compute stale skills that should be added to avoid_skills.
 */
/**
 * Compute stale skills that should be added to avoid_skills.
 * Returns only skills not already in the avoid list.
 *
 * @param basePath         Project root handed to `loadLedgerFromDisk`.
 * @param currentAvoidList Skill names that are already being avoided.
 * @param staleDays        Staleness threshold in days; `DEFAULT_STALE_DAYS` when omitted.
 */
export function computeStaleAvoidList(
  basePath: string,
  currentAvoidList: string[],
  staleDays?: number,
): string[] {
  const ledger = loadLedgerFromDisk(basePath);
  const units = (ledger?.units ?? []).filter(u => u.skills && u.skills.length > 0);
  const stale = detectStaleSkills(units, staleDays ?? DEFAULT_STALE_DAYS);
  const alreadyAvoided = new Set(currentAvoidList);

  return stale.filter(name => !alreadyAvoided.has(name));
}

// ─── Internals ────────────────────────────────────────────────────────────────

/**
 * Group units by skill name and derive one health entry per skill.
 *
 * @param units Ledger units; those without a `skills` list are ignored.
 * @returns Map from skill name to its aggregated entry.
 */
function aggregateBySkill(units: UnitMetrics[]): Map<string, SkillHealthEntry> {
  const usesBySkill = new Map<string, UnitMetrics[]>();

  for (const unit of units) {
    if (!unit.skills) continue;
    // A Set keeps a skill that is listed twice on one unit from counting as two uses.
    for (const skill of new Set(unit.skills)) {
      const bucket = usesBySkill.get(skill);
      if (bucket) {
        bucket.push(unit);
      } else {
        usesBySkill.set(skill, [unit]);
      }
    }
  }

  const result = new Map<string, SkillHealthEntry>();
  const now = Date.now();
  const msPerDay = 24 * 60 * 60 * 1000;

  for (const [name, uses] of usesBySkill) {
    const totalTokens = uses.reduce((sum, u) => sum + u.tokens.total, 0);
    const totalCost = uses.reduce((sum, u) => sum + u.cost, 0);
    const avgTokens = Math.round(totalTokens / uses.length);
    const avgCost = totalCost / uses.length;

    // Success rate: units that didn't have excessive retries.
    // Without direct retry tracking, use a heuristic: success if toolCalls < assistantMessages * 20
    const successCount = uses.filter(u => u.toolCalls < u.assistantMessages * 20).length;
    const successRate = uses.length > 0 ? successCount / uses.length : 1;

    // Token trend
    const tokenTrend = computeTokenTrend(uses);

    // Last used — folded rather than spread into Math.max, which has an argument-count limit.
    const lastUsed = uses.reduce((latest, u) => Math.max(latest, u.finishedAt), -Infinity);
    // Clamp at zero so a finishedAt slightly in the future never yields negative days.
    const staleDays = Math.max(0, Math.floor((now - lastUsed) / msPerDay));

    // Flag conditions — only once there is enough data to judge.
    let flagged = false;
    let flagReason: string | undefined;

    if (uses.length >= MIN_USES_FOR_TREND) {
      if (successRate < SUCCESS_RATE_THRESHOLD) {
        flagged = true;
        flagReason = `Success rate ${Math.round(successRate * 100)}% (below ${Math.round(SUCCESS_RATE_THRESHOLD * 100)}% threshold)`;
      } else if (tokenTrend === "rising") {
        flagged = true;
        flagReason = `Token usage trending upward (${Math.round(TOKEN_RISE_THRESHOLD * 100)}%+ increase)`;
      }
    }

    result.set(name, {
      name,
      totalUses: uses.length,
      successRate,
      avgTokens,
      tokenTrend,
      lastUsed,
      staleDays,
      avgCost,
      flagged,
      flagReason,
    });
  }

  return result;
}

/**
 * Compare average token usage of the most recent window of uses with the
 * window before it.
 *
 * @returns "stable" when there are fewer than `MIN_USES_FOR_TREND * 2` uses or
 *          the earlier window averaged zero tokens; otherwise "rising" or
 *          "declining" when the relative change exceeds `TOKEN_RISE_THRESHOLD`.
 */
function computeTokenTrend(uses: UnitMetrics[]): "stable" | "rising" | "declining" {
  if (uses.length < MIN_USES_FOR_TREND * 2) return "stable";

  // Sort a copy by start time so the caller's array is left untouched.
  const chronological = [...uses].sort((a, b) => a.startedAt - b.startedAt);
  const window = Math.min(TREND_WINDOW, Math.floor(chronological.length / 2));

  const averageTokens = (slice: UnitMetrics[]): number =>
    slice.reduce((sum, u) => sum + u.tokens.total, 0) / slice.length;

  const recentAvg = averageTokens(chronological.slice(-window));
  const previousAvg = averageTokens(chronological.slice(-window * 2, -window));

  // No baseline to compare against.
  if (previousAvg === 0) return "stable";

  const change = (recentAvg - previousAvg) / previousAvg;

  if (change > TOKEN_RISE_THRESHOLD) return "rising";
  if (change < -TOKEN_RISE_THRESHOLD) return "declining";
  return "stable";
}

/**
 * Turn aggregated entries and the stale list into user-facing suggestions.
 *
 * @param skills      Aggregated per-skill entries.
 * @param staleSkills Names reported as stale.
 * @returns Per-skill suggestions in input order, followed by one "stale"
 *          suggestion per stale skill.
 */
function generateSuggestions(skills: SkillHealthEntry[], staleSkills: string[]): SkillHealSuggestion[] {
  const suggestions: SkillHealSuggestion[] = [];

  for (const skill of skills) {
    if (skill.totalUses >= MIN_USES_FOR_TREND && skill.successRate < SUCCESS_RATE_THRESHOLD) {
      suggestions.push({
        skillName: skill.name,
        trigger: "declining_success",
        message: `Success rate dropped to ${Math.round(skill.successRate * 100)}% over ${skill.totalUses} uses. Review SKILL.md for outdated patterns.`,
        severity: skill.successRate < 0.5 ? "critical" : "warning",
      });
    }

    if (skill.tokenTrend === "rising" && skill.totalUses >= MIN_USES_FOR_TREND * 2) {
      suggestions.push({
        skillName: skill.name,
        trigger: "rising_tokens",
        message: `Token usage trending upward. Skill may be causing inefficient execution patterns.`,
        severity: "info",
      });
    }
  }

  for (const name of staleSkills) {
    suggestions.push({
      skillName: name,
      trigger: "stale",
      message: `Not used in ${DEFAULT_STALE_DAYS}+ days. Consider archiving or updating.`,
      severity: "info",
    });
  }

  return suggestions;
}

/*
 * diff --git a/src/resources/extensions/gsd/skill-telemetry.ts b/src/resources/extensions/gsd/skill-telemetry.ts
 * new file mode 100644
 * index 000000000..ac99e4e83
 * --- /dev/null
 * +++ b/src/resources/extensions/gsd/skill-telemetry.ts
 * @@ -0,0 +1,127 @@
 */

/**
 * GSD Skill Telemetry — Track which skills are loaded per unit (#599)
 *
 * Captures skill names at dispatch time for inclusion in UnitMetrics.
 * Distinguishes between "available" skills (in system prompt) and
 * "actively loaded" skills (read via tool calls during execution).
 *
 * Data flow:
 * 1. At dispatch, captureAvailableSkills() records skills from the system prompt
 * 2. During execution, recordSkillRead() tracks explicit SKILL.md reads
 * 3.
 */
/**
 * GSD Skill Telemetry — Track which skills are loaded per unit (#599)
 *
 * Captures skill names at dispatch time for inclusion in UnitMetrics.
 * Distinguishes between "available" skills (in system prompt) and
 * "actively loaded" skills (read via tool calls during execution).
 *
 * Data flow:
 * 1. At dispatch, captureAvailableSkills() records skills from the system prompt
 * 2. During execution, recordSkillRead() tracks explicit SKILL.md reads
 * 3. At unit completion, getAndClearSkills() returns the loaded list for metrics
 */

import { existsSync, readdirSync, readFileSync, statSync } from "node:fs";
import { join } from "node:path";
import { getAgentDir } from "@gsd/pi-coding-agent";

// ─── In-memory state ──────────────────────────────────────────────────────────

/** Skill names found under the agent's `skills` directory when the current unit was dispatched */
let availableSkills: string[] = [];

/** Skills explicitly read (SKILL.md loaded) during the current unit */
const activelyLoadedSkills = new Set<string>();

// ─── Public API ───────────────────────────────────────────────────────────────

/**
 * Capture the list of available skill names at dispatch time.
 * Called before each unit starts; also discards any reads recorded for the
 * previous unit.
 */
export function captureAvailableSkills(): void {
  availableSkills = listSkillNames(join(getAgentDir(), "skills"));
  activelyLoadedSkills.clear();
}

/**
 * Record that a skill was actively loaded (its SKILL.md was read).
 * Call this when the agent reads a SKILL.md file.
 *
 * @param skillName Name of the skill; surrounding whitespace is trimmed and a
 *                  blank name is ignored so it never reaches the metrics ledger.
 */
export function recordSkillRead(skillName: string): void {
  const name = skillName.trim();
  if (name.length > 0) {
    activelyLoadedSkills.add(name);
  }
}

/**
 * Get the skill names for the current unit and clear state.
 * Returns actively loaded skills if any, otherwise available skills.
 * This gives the most useful signal: if the agent read specific skills,
 * report those; otherwise report what was available.
 *
 * @returns A fresh array; mutating it does not affect telemetry state.
 */
export function getAndClearSkills(): string[] {
  const skills = activelyLoadedSkills.size > 0
    ? Array.from(activelyLoadedSkills)
    : [...availableSkills];
  resetSkillTelemetry();
  return skills;
}

/**
 * Reset all telemetry state. Called when auto-mode stops.
 */
export function resetSkillTelemetry(): void {
  availableSkills = [];
  activelyLoadedSkills.clear();
}

/**
 * Get last-used timestamps for all skills from metrics data.
 * Returns a Map from skill name to most recent ms timestamp.
 */
/**
 * Get last-used timestamps for all skills from metrics data.
 *
 * @param units Ledger units; entries without a `skills` list are skipped.
 * @returns A Map from skill name to the largest `finishedAt` (ms) among the
 *          units that list it. Only positive timestamps are recorded.
 */
export function getSkillLastUsed(
  units: Array<{ finishedAt: number; skills?: string[] }>,
): Map<string, number> {
  const lastUsed = new Map<string, number>();
  for (const unit of units) {
    if (!unit.skills) continue;
    for (const skill of unit.skills) {
      const previous = lastUsed.get(skill) ?? 0;
      if (unit.finishedAt > previous) {
        lastUsed.set(skill, unit.finishedAt);
      }
    }
  }
  return lastUsed;
}

/**
 * Detect stale skills — those not used within the given threshold (in days).
 * Returns skill names that should be deprioritized.
 *
 * Every installed skill (a directory with a SKILL.md under the agent's
 * `skills` directory) is checked, so a skill with no usage data at all is
 * reported as stale too.
 *
 * @param units         Ledger units used to derive last-used timestamps.
 * @param thresholdDays Maximum age in days. Zero, negative or non-finite
 *                      values disable the check and yield an empty list.
 */
export function detectStaleSkills(
  units: Array<{ finishedAt: number; skills?: string[] }>,
  thresholdDays: number,
): string[] {
  // `!(x > 0)` also rejects NaN, which `x <= 0` would let through and turn the cutoff into NaN.
  if (!(thresholdDays > 0) || !Number.isFinite(thresholdDays)) return [];

  const lastUsed = getSkillLastUsed(units);
  const cutoff = Date.now() - thresholdDays * 24 * 60 * 60 * 1000;

  // Check all installed skills, not just those with usage data
  const installed = listSkillNames(join(getAgentDir(), "skills"));

  return installed.filter(skill => {
    const lastTs = lastUsed.get(skill);
    return lastTs === undefined || lastTs < cutoff;
  });
}

// ─── Internals ────────────────────────────────────────────────────────────────

/**
 * List the names of sub-directories of `skillsDir` that contain a SKILL.md.
 * Dot-prefixed directories are skipped. A missing or unreadable directory
 * yields an empty list (deliberately best-effort: telemetry must not break a run).
 */
function listSkillNames(skillsDir: string): string[] {
  if (!existsSync(skillsDir)) return [];
  try {
    return readdirSync(skillsDir, { withFileTypes: true })
      .filter(entry => entry.isDirectory() && !entry.name.startsWith("."))
      .filter(entry => existsSync(join(skillsDir, entry.name, "SKILL.md")))
      .map(entry => entry.name);
  } catch {
    return [];
  }
}

/*
 * diff --git a/src/resources/extensions/gsd/tests/skill-lifecycle.test.ts b/src/resources/extensions/gsd/tests/skill-lifecycle.test.ts
 * new file mode 100644
 * index 000000000..ec97d1a02
 * --- /dev/null
 * +++ b/src/resources/extensions/gsd/tests/skill-lifecycle.test.ts
 * @@ -0,0 +1,126 @@
 */

/**
 * Tests for skill telemetry and skill health (#599).
 * Tests the pure functions — no file I/O, no extension context.
 */
/**
 * Tests for skill telemetry and skill health (#599).
 * Focuses on the pure functions — no extension context. The
 * computeStaleAvoidList check is the one case that goes through the
 * ledger loader and skill-directory lookup.
 */

import { describe, it, beforeEach } from "node:test";
import assert from "node:assert/strict";
import type { UnitMetrics } from "../metrics.js";

// ─── Test helpers ─────────────────────────────────────────────────────────────

/** Build a UnitMetrics fixture; `overrides` replaces any of the default fields. */
function makeUnit(overrides: Partial<UnitMetrics> = {}): UnitMetrics {
  return {
    type: "execute-task",
    id: "M001/S01/T01",
    model: "claude-sonnet-4-20250514",
    startedAt: 1000,
    finishedAt: 2000,
    tokens: { input: 1000, output: 500, cacheRead: 200, cacheWrite: 100, total: 1800 },
    cost: 0.05,
    toolCalls: 3,
    assistantMessages: 5,
    userMessages: 2,
    ...overrides,
  };
}

// ─── Skill Telemetry ──────────────────────────────────────────────────────────

describe("skill-telemetry", () => {
  // Note: captureAvailableSkills/getAndClearSkills depend on filesystem (getAgentDir)
  // so we test the data flow via getSkillLastUsed, and detectStaleSkills only on
  // its early-return path, which never reaches the filesystem.

  it("getSkillLastUsed returns most recent timestamp per skill", async () => {
    const { getSkillLastUsed } = await import("../skill-telemetry.js");

    const units = [
      makeUnit({ finishedAt: 1000, skills: ["rust-core", "axum-web-framework"] }),
      makeUnit({ finishedAt: 2000, skills: ["rust-core"] }),
      makeUnit({ finishedAt: 3000, skills: ["axum-web-framework"] }),
    ];

    const result = getSkillLastUsed(units);
    assert.equal(result.get("rust-core"), 2000);
    assert.equal(result.get("axum-web-framework"), 3000);
  });

  it("getSkillLastUsed returns empty map for units without skills", async () => {
    const { getSkillLastUsed } = await import("../skill-telemetry.js");

    const units = [makeUnit(), makeUnit()];
    const result = getSkillLastUsed(units);
    assert.equal(result.size, 0);
  });

  it("detectStaleSkills reports nothing when the threshold is zero (disabled)", async () => {
    const { detectStaleSkills } = await import("../skill-telemetry.js");

    const units = [makeUnit({ skills: ["rust-core"] })];
    assert.deepEqual(detectStaleSkills(units, 0), []);
  });
});

// ─── Skill Health ─────────────────────────────────────────────────────────────

describe("skill-health", () => {
  it("buildHealSkillPrompt includes unit ID", async () => {
    const { buildHealSkillPrompt } = await import("../skill-health.js");
    const prompt = buildHealSkillPrompt("M001/S01/T01");
    assert.ok(prompt.includes("M001/S01/T01"));
    assert.ok(prompt.includes("Skill Heal Analysis"));
    assert.ok(prompt.includes("skill-review-queue.md"));
  });

  it("computeStaleAvoidList excludes already-avoided skills", async () => {
    const { computeStaleAvoidList } = await import("../skill-health.js");

    // The stale list is derived from whatever skills are installed for the
    // agent, so its exact contents are environment-dependent. What must hold
    // everywhere: the result is an array and never repeats a name that is
    // already on the avoid list.
    const result = computeStaleAvoidList("/nonexistent/path", ["some-skill"]);
    assert.ok(Array.isArray(result));
    assert.ok(!result.includes("some-skill"));
  });
});

// ─── UnitMetrics skills field ─────────────────────────────────────────────────

describe("UnitMetrics skills field", () => {
  it("skills field is optional and accepts string array", () => {
    const unit = makeUnit({ skills: ["rust-core", "axum-web-framework"] });
    assert.deepEqual(unit.skills, ["rust-core", "axum-web-framework"]);
  });

  it("skills field is undefined when not provided", () => {
    const unit = makeUnit();
    assert.equal(unit.skills, undefined);
  });
});

// ─── Preferences ──────────────────────────────────────────────────────────────

describe("skill_staleness_days preference", () => {
  it("validates valid staleness days", async () => {
    const { validatePreferences } = await import("../preferences.js");

    const result = validatePreferences({ skill_staleness_days: 30 });
    assert.equal(result.preferences.skill_staleness_days, 30);
    assert.equal(result.errors.length, 0);
  });

  it("validates zero (disabled) staleness days", async () => {
    const { validatePreferences } = await import("../preferences.js");

    const result = validatePreferences({ skill_staleness_days: 0 });
    assert.equal(result.preferences.skill_staleness_days, 0);
    assert.equal(result.errors.length, 0);
  });

  it("rejects negative staleness days", async () => {
    const { validatePreferences } = await import("../preferences.js");

    const result = validatePreferences({ skill_staleness_days: -5 });
    assert.equal(result.preferences.skill_staleness_days, undefined);
    assert.ok(result.errors.some(e => e.includes("skill_staleness_days")));
  });

  it("floors fractional days", async () => {
    const { validatePreferences } = await import("../preferences.js");

    const result = validatePreferences({ skill_staleness_days: 30.7 });
    assert.equal(result.preferences.skill_staleness_days, 30);
  });
});