singularity-forge/scripts/secret-scan.sh
Mikael Hugo fa9baf71d5 feat(secret-scan): SF_SECURITY_FAST contract for the regex-only fast path
Codifies AC4 of sf-mp4w2dij-xm6cwj: the regex-only path is the
today-default fast mode. SF_SECURITY_FAST=1 is the explicit opt-in for
callers that want to assert "regex-only, no LLM escalation, sub-100ms"
regardless of any future tiered reviewer landing in the script.

Today the env var changes only the trailing status line so operators
can verify the contract is observable. When the LLM-backed review hook
(AC1) lands, the absence of SF_SECURITY_FAST becomes the trigger for
escalation; setting it=1 keeps offline / pre-commit callers on the
fast path. Locked in by tests in both the .sh and .mjs scanners.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-14 07:57:02 +02:00

227 lines
7.8 KiB
Bash
Executable file

#!/usr/bin/env bash
# Secret scanner — detects hardcoded credentials in staged/changed files.
# Usage:
# scripts/secret-scan.sh # scan staged files (pre-commit mode)
# scripts/secret-scan.sh --diff HEAD # scan diff against HEAD (CI mode)
# scripts/secret-scan.sh --file path # scan a specific file
#
# Works on macOS (BSD grep) and Linux (GNU grep) — uses only ERE patterns.
set -euo pipefail
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
IGNOREFILE=".secretscanignore"
EXIT_CODE=0
# ── Pattern definitions ────────────────────────────────────────────────
# Each entry: "LABEL:::FLAGS:::REGEX"
# FLAGS: "" for default, "i" for case-insensitive (-i flag)
# All patterns use POSIX ERE (grep -E), no PCRE required.
PATTERNS=(
# AWS
"AWS Access Key::::::AKIA[0-9A-Z]{16}"
# Generic API keys / tokens (quoted strings that look like keys)
"Generic API Key:::i:::(api[_-]?key|apikey|api[_-]?secret)[[:space:]]*[:=][[:space:]]*['\"][0-9a-zA-Z_./-]{20,}['\"]"
"Generic Secret:::i:::(secret|token|password|passwd|pwd|credential)[[:space:]]*[:=][[:space:]]*['\"][^[:space:]'\"]{8,}['\"]"
"Authorization Header:::i:::(authorization|bearer)[[:space:]]*[:=][[:space:]]*['\"][^[:space:]'\"]{8,}['\"]"
# Private keys
"Private Key::::::-----BEGIN[[:space:]]+(RSA|DSA|EC|OPENSSH|PGP)[[:space:]]+PRIVATE[[:space:]]+KEY-----"
# Connection strings
"Database URL:::i:::(mysql|postgres|postgresql|mongodb|redis|amqp|mssql)://[^[:space:]'\"]{8,}"
# GitHub / GitLab tokens
"GitHub Token::::::gh[pousr]_[0-9a-zA-Z]{36,}"
"GitLab Token::::::glpat-[0-9a-zA-Z-]{20,}"
# Slack
"Slack Token::::::xox[baprs]-[0-9a-zA-Z-]{10,}"
"Slack Webhook::::::hooks\.slack\.com/services/T[0-9A-Z]{8,}/B[0-9A-Z]{8,}/[0-9a-zA-Z]{20,}"
# Google
"Google API Key::::::AIza[0-9A-Za-z_-]{35}"
# Stripe
"Stripe Key::::::[sr]k_(live|test)_[0-9a-zA-Z]{20,}"
# npm token
"npm Token::::::npm_[0-9a-zA-Z]{36,}"
# Hex-encoded secrets (high-entropy, 32+ hex chars assigned to a variable)
"Hex Secret:::i:::(secret|key|token|password)[[:space:]]*[:=][[:space:]]*['\"]?[0-9a-f]{32,}['\"]?"
# Hardcoded passwords in config-like files
"Hardcoded Password:::i:::password[[:space:]]*[:=][[:space:]]*['\"][^'\"]{4,}['\"]"
)
# ── Load ignorefile ────────────────────────────────────────────────────
load_ignore_patterns() {
local ignore_patterns=()
if [[ -f "$IGNOREFILE" ]]; then
while IFS= read -r line; do
# skip blank lines and comments
[[ -z "$line" || "$line" =~ ^# ]] && continue
ignore_patterns+=("$line")
done < "$IGNOREFILE"
fi
echo "${ignore_patterns[@]+"${ignore_patterns[@]}"}"
}
is_ignored() {
local file="$1" line_content="$2"
local ignore_patterns
read -ra ignore_patterns <<< "$(load_ignore_patterns)"
for pattern in "${ignore_patterns[@]+"${ignore_patterns[@]}"}"; do
# Pattern can be "filepath:pattern" or just "pattern"
if [[ "$pattern" == *:* ]]; then
local ignore_file="${pattern%%:*}"
local ignore_regex="${pattern#*:}"
if [[ "$file" == $ignore_file ]] && echo "$line_content" | grep -qiE "$ignore_regex" 2>/dev/null; then
return 0
fi
else
if echo "$line_content" | grep -qiE "$pattern" 2>/dev/null; then
return 0
fi
fi
done
return 1
}
# ── Determine files to scan ───────────────────────────────────────────
get_files() {
if [[ "${1:-}" == "--diff" ]]; then
local ref="${2:-HEAD}"
git diff --name-only --diff-filter=ACMR "$ref" 2>/dev/null || true
elif [[ "${1:-}" == "--file" ]]; then
echo "${2:-}"
else
# Pre-commit mode: staged files only
git diff --cached --name-only --diff-filter=ACMR 2>/dev/null || true
fi
}
# ── File-type filter (skip binaries and known safe files) ─────────────
should_scan() {
local file="$1"
# Skip binary extensions
case "$file" in
*.png|*.jpg|*.jpeg|*.gif|*.ico|*.svg|*.woff|*.woff2|*.ttf|*.eot|\
*.zip|*.tar|*.gz|*.tgz|*.bz2|*.7z|*.rar|\
*.exe|*.dll|*.so|*.dylib|*.o|*.a|\
*.pdf|*.doc|*.docx|*.xls|*.xlsx|\
*.lock|package-lock.json|pnpm-lock.yaml|\
*.min.js|*.min.css|*.map|\
*.node|*.wasm)
return 1 ;;
esac
# Skip known non-secret files
case "$file" in
.secretscanignore|.gitignore|.gitattributes|LICENSE*|CHANGELOG*|*.md)
return 1 ;;
esac
# Skip node_modules, dist, coverage
case "$file" in
node_modules/*|dist/*|coverage/*|.sf/*)
return 1 ;;
esac
return 0
}
# ── Get content to scan ───────────────────────────────────────────────
get_content() {
local file="$1"
if [[ "${SCAN_MODE:-staged}" == "staged" ]]; then
# For pre-commit, scan the staged version
git show ":$file" 2>/dev/null || cat "$file" 2>/dev/null || true
else
cat "$file" 2>/dev/null || true
fi
}
# ── Main scan ─────────────────────────────────────────────────────────
SCAN_MODE="staged"
FILES_ARG=()
while [[ $# -gt 0 ]]; do
case "$1" in
--diff) SCAN_MODE="diff"; FILES_ARG=("--diff" "${2:-HEAD}"); shift 2 ;;
--file) SCAN_MODE="file"; FILES_ARG=("--file" "$2"); shift 2 ;;
*) shift ;;
esac
done
FILES=$(get_files "${FILES_ARG[@]+"${FILES_ARG[@]}"}")
FINDINGS=0
if [[ -z "$FILES" ]]; then
echo "secret-scan: no files to scan"
exit 0
fi
while IFS= read -r file; do
[[ -z "$file" ]] && continue
should_scan "$file" || continue
content=$(get_content "$file")
[[ -z "$content" ]] && continue
for entry in "${PATTERNS[@]}"; do
label="${entry%%:::*}"
rest="${entry#*:::}"
flags="${rest%%:::*}"
regex="${rest#*:::}"
# Build grep flags
grep_flags="-nE"
if [[ "$flags" == *i* ]]; then
grep_flags="-niE"
fi
matches=$(echo "$content" | grep $grep_flags -e "$regex" 2>/dev/null || true)
if [[ -n "$matches" ]]; then
while IFS= read -r match_line; do
[[ -z "$match_line" ]] && continue
line_num="${match_line%%:*}"
line_content="${match_line#*:}"
# Check ignorefile
if is_ignored "$file" "$line_content"; then
continue
fi
# Mask the actual secret value in output
echo -e "${RED}[SECRET DETECTED]${NC} ${YELLOW}${label}${NC}"
echo " File: $file:$line_num"
echo " Line: $(echo "$line_content" | head -c 120)..."
echo ""
FINDINGS=$((FINDINGS + 1))
EXIT_CODE=1
done <<< "$matches"
fi
done
done <<< "$FILES"
if [[ $FINDINGS -gt 0 ]]; then
echo -e "${RED}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${RED}Found $FINDINGS potential secret(s) in staged files.${NC}"
echo -e "${RED}Commit blocked. Remove the secrets or add exceptions${NC}"
echo -e "${RED}to .secretscanignore if these are false positives.${NC}"
echo -e "${RED}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
elif [[ "${SF_SECURITY_FAST:-0}" == "1" ]]; then
# AC4 contract (sf-mp4w2dij-xm6cwj): SF_SECURITY_FAST=1 is the explicit
# opt-in for the regex-only fast path. When LLM escalation lands, this
# env var becomes the documented opt-out for offline/pre-commit callers.
echo "secret-scan: fast mode (regex-only, no LLM escalation) ✓"
else
echo "secret-scan: no secrets detected ✓"
fi
exit $EXIT_CODE