#!/usr/bin/env python3
"""
Hermes Incident Commander — Interactive Demo
============================================
Run this to see Incident Commander in action without a full Hermes
installation. Uses the Anthropic API directly to simulate Hermes's
tool-calling agent loop.

Requirements:
    pip install anthropic rich

Usage:
    export ANTHROPIC_API_KEY=sk-ant-...
    python demo/demo_incident.py

    # Or run a specific scenario:
    python demo/demo_incident.py --scenario disk-full-logs
    python demo/demo_incident.py --scenario svc-crash-nginx
    python demo/demo_incident.py --scenario cpu-runaway-process
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
# ── Rich for pretty terminal output ──────────────────────────────────────────
|
|
# Rich is optional: without it the demo falls back to plain print() output.
try:
    from rich.console import Console
    from rich.panel import Panel
    from rich.markdown import Markdown
    from rich.progress import Progress, SpinnerColumn, TextColumn
    from rich.rule import Rule
    from rich.syntax import Syntax
    from rich.table import Table
except ImportError:
    RICH_AVAILABLE = False
    print("Tip: pip install rich — for beautiful output")
else:
    RICH_AVAILABLE = True

# ── Anthropic SDK ─────────────────────────────────────────────────────────────
# The SDK is mandatory: the agent loop cannot run without it, so exit early.
try:
    import anthropic
except ImportError:
    print("Error: pip install anthropic")
    sys.exit(1)

# Shared Rich console; None when Rich is not installed.
if RICH_AVAILABLE:
    console = Console()
else:
    console = None
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Tool implementations (simulated system tools for demo safety)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Where the agent is told to write post-incident reports (~/.hermes/incidents).
INCIDENT_DIR = Path.home().joinpath(".hermes", "incidents")
# Where the agent is told to write prevention skills (~/.hermes/skills).
SKILLS_DIR = Path.home().joinpath(".hermes", "skills")
|
|
|
|
def _run(cmd: str, timeout: int = 15) -> Dict[str, Any]:
|
|
"""Run a shell command and return structured output."""
|
|
try:
|
|
result = subprocess.run(
|
|
cmd, shell=True, capture_output=True, text=True, timeout=timeout
|
|
)
|
|
return {
|
|
"exit_code": result.returncode,
|
|
"output": result.stdout[:4000],
|
|
"error": result.stderr[:1000] if result.stderr else None,
|
|
"success": result.returncode == 0,
|
|
}
|
|
except subprocess.TimeoutExpired:
|
|
return {"exit_code": -1, "output": "", "error": "Command timed out", "success": False}
|
|
except Exception as exc:
|
|
return {"exit_code": -1, "output": "", "error": str(exc), "success": False}
|
|
|
|
|
|
# Schema for the shell tool: one required command plus an optional timeout.
_TERMINAL_TOOL = {
    "name": "terminal",
    "description": (
        "Execute a shell command on the server. "
        "Use for system diagnostics, service management, "
        "log inspection, and remediation. "
        "Commands run as the current user."
    ),
    "input_schema": {
        "type": "object",
        "properties": {
            "command": {
                "type": "string",
                "description": "The shell command to execute",
            },
            "timeout": {
                "type": "integer",
                "description": "Timeout in seconds (default 15)",
                "default": 15,
            },
        },
        "required": ["command"],
    },
}

# Schema for the file-writing tool: both path and content are required.
_WRITE_FILE_TOOL = {
    "name": "write_file",
    "description": "Write content to a file. Use to create incident reports and prevention skills.",
    "input_schema": {
        "type": "object",
        "properties": {
            "path": {"type": "string", "description": "Absolute or ~ path"},
            "content": {"type": "string", "description": "File content"},
        },
        "required": ["path", "content"],
    },
}

# Schema for the file-reading tool: a single required path.
_READ_FILE_TOOL = {
    "name": "read_file",
    "description": "Read a file's contents.",
    "input_schema": {
        "type": "object",
        "properties": {
            "path": {"type": "string"},
        },
        "required": ["path"],
    },
}

# Tool schemas passed as ``tools=`` on every model request; the names here
# are the ones dispatch_tool() routes on.
TOOL_DEFINITIONS = [_TERMINAL_TOOL, _WRITE_FILE_TOOL, _READ_FILE_TOOL]
|
|
|
|
|
|
def dispatch_tool(tool_name: str, tool_input: Dict[str, Any]) -> str:
    """Route a tool call to its implementation and return a string result.

    Problems with the call itself (missing parameter, unreadable file,
    failed write) are returned as an ``Error ...`` string rather than raised,
    so the model can see what went wrong and try something else.

    Args:
        tool_name: ``"terminal"``, ``"write_file"`` or ``"read_file"``.
        tool_input: Arguments supplied by the model for that tool.

    Returns:
        Text to hand back to the model as the tool result. Terminal output
        ends with an ``[exit_code=N]`` line; ``read_file`` returns at most
        the first 4000 characters of the file.
    """
    if tool_name == "terminal":
        command = tool_input.get("command")
        if not command:
            return "Error: missing required parameter 'command'"
        result = _run(command, tool_input.get("timeout", 15))
        parts = []
        if result["output"]:
            parts.append(result["output"])
        if result["error"]:
            parts.append(f"STDERR: {result['error']}")
        parts.append(f"[exit_code={result['exit_code']}]")
        return "\n".join(parts)

    elif tool_name in ("write_file", "read_file"):
        raw_path = tool_input.get("path")
        if not raw_path:
            return "Error: missing required parameter 'path'"
        try:
            # expanduser() only expands a *leading* "~"; a blanket
            # str.replace would also corrupt names such as "notes~1.md".
            path = Path(raw_path).expanduser()
        except (RuntimeError, TypeError) as exc:
            return f"Error: cannot resolve path {raw_path!r}: {exc}"

        if tool_name == "write_file":
            content = tool_input.get("content")
            if not isinstance(content, str):
                return "Error: missing required parameter 'content'"
            try:
                path.parent.mkdir(parents=True, exist_ok=True)
                path.write_text(content, encoding="utf-8")
                # Report the real on-disk size, not the character count.
                size = len(content.encode("utf-8"))
            except (OSError, UnicodeError) as exc:
                return f"Error writing {path}: {exc}"
            return f"Written {size} bytes to {path}"

        if not path.exists():
            return f"File not found: {path}"
        try:
            # Read only the part that is returned instead of loading the
            # whole file; undecodable bytes are replaced so binary files
            # do not raise.
            with path.open(encoding="utf-8", errors="replace") as handle:
                return handle.read(4000)
        except OSError as exc:
            return f"Error reading {path}: {exc}"

    else:
        return f"Unknown tool: {tool_name}"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Incident Scenarios (subset, self-contained for demo)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Demo incidents, keyed by the value accepted by --scenario. Each entry has:
#   title    – headline shown when the incident starts
#   severity – label shown in the final summary table
#   setup    – shell commands that create the broken state
#   cleanup  – shell commands that undo the setup
#   prompt   – the alert text sent to the agent as the first user message
DEMO_SCENARIOS: Dict[str, Dict[str, Any]] = {
    "disk-full-logs": {
        "title": "🚨 Disk 95% full — Log files exploded",
        "severity": "P1",
        "setup": [
            "mkdir -p /tmp/hermes_demo_logs",
            "dd if=/dev/urandom of=/tmp/hermes_demo_logs/app.log.old bs=1M count=30 2>/dev/null",
            "dd if=/dev/urandom of=/tmp/hermes_demo_logs/debug.log.old bs=1M count=20 2>/dev/null",
            "echo 'DISK_INCIDENT_ACTIVE=1' > /tmp/hermes_incident_marker",
        ],
        "cleanup": [
            "rm -rf /tmp/hermes_demo_logs /tmp/hermes_incident_marker",
        ],
        "prompt": (
            "ALERT: Disk usage just hit 95% on our application server. "
            "Services are failing because they can't write to disk. "
            "Log rotation hasn't been running properly. "
            "There are huge log files in /tmp/hermes_demo_logs consuming all our space. "
            "Find them, clean up disk space, and document what you did. "
            "Write a post-incident report to ~/.hermes/incidents/ "
            "and create a prevention skill in ~/.hermes/skills/disk-monitor/"
        ),
    },
    "svc-crash-nginx": {
        "title": "🚨 nginx crashed — Website unreachable",
        "severity": "P0",
        "setup": [
            "echo 'SERVICE_INCIDENT_ACTIVE=1' > /tmp/hermes_incident_marker",
        ],
        "cleanup": [
            "rm -f /tmp/hermes_incident_marker",
        ],
        "prompt": (
            "ALERT: Our website is down! Users are getting connection refused. "
            "nginx was running 10 minutes ago but now it's not responding. "
            "This is a P0 incident — we're losing revenue every minute. "
            "Investigate the system, check what services are running or failing, "
            "identify the problem, attempt to fix it, "
            "and write a detailed post-incident report to ~/.hermes/incidents/."
        ),
    },
    "cpu-runaway-process": {
        "title": "🚨 CPU at 95% — Runaway computation detected",
        "severity": "P2",
        "setup": [
            # Spin up a single-core CPU burner in the background.
            #  * Output goes to /dev/null: setup commands are run with their
            #    stdout/stderr captured, and a background child that keeps
            #    those pipes open would make the setup call block until the
            #    child itself exits.
            #  * The loop is time-bounded (5 minutes) and allocates nothing,
            #    so the process is still running when the agent looks for it
            #    and disappears on its own if cleanup never runs.
            "python3 -c \""
            "import os, time\n"
            "open('/tmp/hermes_cpu_hog.pid','w').write(str(os.getpid()))\n"
            "deadline = time.time() + 300\n"
            "while time.time() < deadline: pass"
            "\" >/dev/null 2>&1 &",
            # Give the burner a moment to write its pid file.
            "sleep 0.5",
            "echo 'CPU_INCIDENT_ACTIVE=1' > /tmp/hermes_incident_marker",
        ],
        "cleanup": [
            "kill $(cat /tmp/hermes_cpu_hog.pid 2>/dev/null) 2>/dev/null || true",
            "rm -f /tmp/hermes_cpu_hog.pid /tmp/hermes_incident_marker",
        ],
        "prompt": (
            "ALERT: CPU utilization has been at 90%+ for the last 10 minutes. "
            "Server response times are severely degraded. "
            "Something is doing heavy computation that shouldn't be. "
            "Find the runaway process (check /tmp/hermes_cpu_hog.pid), "
            "identify what it is, terminate it safely, "
            "verify the fix, and write a post-incident report to ~/.hermes/incidents/."
        ),
    },
}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Agent loop
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# System prompt sent with every model request. It fixes the agent's persona
# and the seven-step incident workflow (diagnose, classify, find root cause,
# remediate, verify, report, create a prevention skill).
SYSTEM_PROMPT = """You are Hermes Incident Commander — an autonomous Site Reliability Engineer.

When you receive an incident alert:
1. Immediately run diagnostics (uptime, df -h, free -h, ps aux, systemctl list-units --failed)
2. Classify severity (P0/P1/P2/P3) and announce it
3. Find the root cause through systematic investigation
4. Apply the safest effective remediation
5. Verify the fix worked
6. Write a structured post-incident report to ~/.hermes/incidents/<timestamp>-<slug>.md
7. Create a prevention skill SKILL.md in ~/.hermes/skills/<category>-prevention/

Be autonomous and thorough. Do not ask for permission for safe diagnostic operations.
Speed matters — every minute of downtime costs money."""
|
|
|
|
|
|
def run_incident_agent(
    scenario: Dict[str, Any],
    api_key: str,
    max_turns: int = 20,
    model: str = "claude-sonnet-4-20250514",
) -> Dict[str, Any]:
    """Run the agent loop for one incident scenario.

    Sends the scenario prompt to the model, executes every tool call the
    model requests through ``dispatch_tool``, feeds the results back, and
    repeats until the model stops requesting tools or ``max_turns`` is
    reached. Progress and a closing summary are printed, using Rich when it
    is available and plain ``print`` otherwise.

    Args:
        scenario: Scenario mapping. ``"title"`` and ``"prompt"`` are
            required; ``"severity"`` is optional and shown in the summary.
        api_key: Anthropic API key used to build the client.
        max_turns: Upper bound on model round-trips.
        model: Anthropic model identifier to query.

    Returns:
        Dict with ``turns``, ``tool_calls``, ``elapsed_seconds``,
        ``reports_written`` and ``skills_created``. The last two count only
        entries that appeared in ``INCIDENT_DIR`` / ``SKILLS_DIR`` during
        this run.

    Raises:
        Exceptions raised by the Anthropic client propagate unchanged.
    """

    def list_reports() -> set:
        """Return the current set of report files (empty if the dir is missing)."""
        return set(INCIDENT_DIR.glob("*.md")) if INCIDENT_DIR.exists() else set()

    def list_skills() -> set:
        """Return the current set of skill entries (empty if the dir is missing)."""
        return set(SKILLS_DIR.iterdir()) if SKILLS_DIR.exists() else set()

    def request_turn() -> Any:
        """Send the conversation so far to the model and return its response."""
        return client.messages.create(
            model=model,
            max_tokens=4096,
            system=SYSTEM_PROMPT,
            tools=TOOL_DEFINITIONS,
            messages=messages,
        )

    client = anthropic.Anthropic(api_key=api_key)
    messages: List[Dict[str, Any]] = [
        {"role": "user", "content": scenario["prompt"]}
    ]
    turn = 0
    tool_calls_made: List[str] = []
    start_time = time.time()

    # Snapshot what already exists so the summary reports only what this run
    # produced, not leftovers from earlier runs.
    reports_before = list_reports()
    skills_before = list_skills()

    if RICH_AVAILABLE:
        # Imported here because Rich is optional at module level.
        from rich.markup import escape

        console.print(Rule(f"[bold red]{scenario['title']}[/]"))
        console.print(Panel(
            scenario["prompt"],
            title="[yellow]📟 Incident Alert[/]",
            border_style="red",
        ))
    else:
        print(f"\n{'='*60}")
        print(scenario["title"])
        print(scenario["prompt"])
        print('='*60)

    while turn < max_turns:
        turn += 1

        if RICH_AVAILABLE:
            # Transient spinner shown while the request is in flight.
            with Progress(
                SpinnerColumn("dots"),
                TextColumn(f"[cyan]Hermes thinking... (turn {turn}/{max_turns})[/]"),
                transient=True,
                console=console,
            ) as progress:
                progress.add_task("")
                response = request_turn()
        else:
            print(f"\n[Turn {turn}] Hermes thinking...")
            response = request_turn()

        # Extract text and tool calls
        tool_calls = [b for b in response.content if b.type == "tool_use"]
        text_blocks = [b for b in response.content if b.type == "text" and b.text.strip()]

        # Show agent's reasoning
        for tb in text_blocks:
            if RICH_AVAILABLE:
                console.print(Panel(
                    Markdown(tb.text),
                    title="[green]🤖 Hermes[/]",
                    border_style="green",
                ))
            else:
                print(f"\n[Hermes]: {tb.text}")

        # If no tool calls, agent is done
        if not tool_calls or response.stop_reason == "end_turn":
            break

        # Execute each tool call
        tool_results = []
        for tc in tool_calls:
            tool_calls_made.append(tc.name)
            cmd_preview = str(
                tc.input.get("command", tc.input.get("path", ""))
            )[:80]

            if RICH_AVAILABLE:
                # Shell commands often contain "[...]"; escape them so Rich
                # does not parse them as markup tags.
                console.print(
                    f" [yellow]⚡ {tc.name}[/] [dim]{escape(cmd_preview)}[/]"
                )
            else:
                print(f"\n > {tc.name}: {cmd_preview}")

            tool_result: Dict[str, Any] = {
                "type": "tool_result",
                "tool_use_id": tc.id,
            }
            try:
                result_text = dispatch_tool(tc.name, tc.input)
            except Exception as exc:
                # One failing tool must not abort the incident: report the
                # failure to the model so it can choose another approach.
                result_text = f"Tool error: {type(exc).__name__}: {exc}"
                tool_result["is_error"] = True
            tool_result["content"] = result_text

            if RICH_AVAILABLE and tc.name == "terminal" and len(result_text) < 2000:
                console.print(Syntax(result_text, "bash", theme="monokai", line_numbers=False))
            elif not RICH_AVAILABLE:
                print(f" OUTPUT: {result_text[:500]}")

            tool_results.append(tool_result)

        # Add assistant + tool results to conversation
        messages.append({"role": "assistant", "content": response.content})
        messages.append({"role": "user", "content": tool_results})

    elapsed = time.time() - start_time

    # ── Summary ──────────────────────────────────────────────────────────────
    new_reports = list_reports() - reports_before
    new_skills = list_skills() - skills_before

    if RICH_AVAILABLE:
        console.print(Rule("[bold green]Incident Resolution Summary[/]"))
        t = Table(show_header=True, header_style="bold cyan")
        t.add_column("Metric", style="dim")
        t.add_column("Value")
        t.add_row("Severity", scenario.get("severity", "?"))
        t.add_row("Turns used", str(turn))
        t.add_row("Tool calls", str(len(tool_calls_made)))
        t.add_row("Elapsed time", f"{elapsed:.1f}s")
        t.add_row("Reports written", str(len(new_reports)))
        t.add_row("Skills created", str(len(new_skills)))
        t.add_row("Tools used", ", ".join(sorted(set(tool_calls_made))))
        console.print(t)
    else:
        print(f"\nSUMMARY: {turn} turns, {len(tool_calls_made)} tool calls, {elapsed:.1f}s")

    return {
        "turns": turn,
        "tool_calls": len(tool_calls_made),
        "elapsed_seconds": elapsed,
        "reports_written": len(new_reports),
        "skills_created": len(new_skills),
    }
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def main() -> None:
    """Command-line entry point for the demo.

    Parses the arguments, checks that ``ANTHROPIC_API_KEY`` is set, creates
    the output directories, runs the chosen scenario's setup commands
    (unless ``--no-setup``), runs the agent, and always runs the scenario's
    cleanup commands afterwards. Exits with status 1 when the API key is
    missing and with an argparse usage error when ``--max-turns`` is below 1.
    """
    parser = argparse.ArgumentParser(
        description="Hermes Incident Commander — Interactive Demo"
    )
    parser.add_argument(
        "--scenario",
        choices=list(DEMO_SCENARIOS.keys()),
        default="disk-full-logs",
        help="Which incident to simulate",
    )
    parser.add_argument(
        "--no-setup",
        action="store_true",
        help="Skip environment setup (use if environment is already broken)",
    )
    parser.add_argument(
        "--max-turns",
        type=int,
        default=20,
        help="Maximum agent turns",
    )
    args = parser.parse_args()

    if args.max_turns < 1:
        parser.error("--max-turns must be at least 1")

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print("Error: Set ANTHROPIC_API_KEY environment variable")
        print(" export ANTHROPIC_API_KEY=sk-ant-...")
        sys.exit(1)

    scenario = DEMO_SCENARIOS[args.scenario]

    # Create output directories
    INCIDENT_DIR.mkdir(parents=True, exist_ok=True)
    SKILLS_DIR.mkdir(parents=True, exist_ok=True)

    try:
        # Setup runs inside the try so that an interrupt or error part-way
        # through still triggers the cleanup below.
        if not args.no_setup:
            if RICH_AVAILABLE:
                console.print(f"\n[dim]Setting up incident environment for: {args.scenario}[/]")
            else:
                print(f"\nSetting up incident environment for: {args.scenario}")
            for cmd in scenario.get("setup", []):
                outcome = _run(cmd)
                if not outcome["success"]:
                    # Plain print: the command text may contain characters
                    # that Rich would treat as markup.
                    print(
                        f"Warning: setup command failed "
                        f"(exit {outcome['exit_code']}): {cmd}",
                        file=sys.stderr,
                    )

        run_incident_agent(scenario, api_key, args.max_turns)
    finally:
        # Cleanup
        for cmd in scenario.get("cleanup", []):
            _run(cmd)

    if RICH_AVAILABLE:
        console.print("\n[bold green]✅ Demo complete![/]")
        console.print("Check [cyan]~/.hermes/incidents/[/] for incident reports")
        console.print("Check [cyan]~/.hermes/skills/[/] for auto-created prevention skills")
    else:
        print("\nDemo complete!")
        print("Check ~/.hermes/incidents/ for incident reports")
        print("Check ~/.hermes/skills/ for auto-created prevention skills")


if __name__ == "__main__":
    main()
|