From 5ad53a829f8baf45a9b791e0bb0ec800ea7076c8 Mon Sep 17 00:00:00 2001 From: JacobPEvans <20714140+JacobPEvans@users.noreply.github.com> Date: Sat, 23 May 2026 22:11:05 -0400 Subject: [PATCH 1/5] feat(content-guards): add no-real-ips PreToolUse Write/Edit hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Blocks IPv4 literals in Write content and Edit new_string when they fall outside the allowlist: 192.168.0.0/24 (sanctioned sample CIDR), loopback, 0.0.0.0, broadcast, and 169.254.169.254 (cloud metadata). Skips lines matching pre-commit version-pin shape ("rev: v0.10.0.1"). First-block / second-allow flow: the first attempt to write a non-allowed IP into a given file blocks with a clear warning explaining the risk and the allowed alternatives. A retry within 5 minutes (same file + same IP) is treated as the agent's acknowledgment and is allowed through — for legitimate uses like private repos, .gitignored files, or scratch buffers. Per-(file, IP) tracking: a new IP on the second write still blocks; the same IP in a different file blocks anew. State lives in $XDG_CACHE_HOME/content-guards/no-real-ips-state.json with a 300s TTL and prune-on-read. Wired into content-guards/hooks/hooks.json alongside validate-token-limits under the existing PreToolUse Write|Edit matcher. Motivated by a real leak in JacobPEvans/orbstack-kubernetes PR #234, where an agent iterating on a failing test pasted the live Splunk IP (observed in Cribl Stream's outputs.yml output) verbatim into two new test cases. The repo's existing pre-commit no-real-ips hook missed it because it only scanned *.yaml/*.sh under k8s/, scripts/, docker/. This PreToolUse hook catches the same class of leak at write time, before it ever lands on disk, and covers every Claude-managed repo automatically. Coverage: 16 bats tests (tool filtering, allowlist, version-pin skip, first-block / second-allow flow, per-file tracking, multi-IP partial acknowledgment). 
Assisted-by: Claude --- content-guards/README.md | 7 + content-guards/hooks/hooks.json | 5 + .../scripts/validate-no-real-ips.py | 171 ++++++++++++++++++ .../no-real-ips/no-real-ips.bats | 167 +++++++++++++++++ 4 files changed, 350 insertions(+) create mode 100755 content-guards/scripts/validate-no-real-ips.py create mode 100644 tests/content-guards/no-real-ips/no-real-ips.bats diff --git a/content-guards/README.md b/content-guards/README.md index de08a4a..9ccf5f0 100644 --- a/content-guards/README.md +++ b/content-guards/README.md @@ -8,6 +8,7 @@ See [ARCHITECTURE.md](ARCHITECTURE.md) for integration diagrams. - **markdown-validator**: Validates markdown with markdownlint - **token-validator**: Enforces configurable file token limits +- **no-real-ips**: Blocks non-allowed IPv4 literals in Write/Edit content; first-block, second-allow flow - **webfetch-guard**: Blocks outdated year references in web queries - **readme-validator**: Checks README files for required sections and badge health - **issue-limiter**: Prevents GitHub issue backlog overflow with 24h rate limiting @@ -17,6 +18,12 @@ See [ARCHITECTURE.md](ARCHITECTURE.md) for integration diagrams. No manual invocation required. All hooks activate automatically: - **token-validator** — blocks files exceeding token limits (PreToolUse: Write, Edit) +- **no-real-ips** — blocks IPv4 literals outside the allowlist + (`192.168.0.0/24`, loopback, `0.0.0.0`, broadcast, link-local metadata). + First attempt blocks with a clear warning; a retry within 5 minutes is + treated as the agent's acknowledgment and allowed through (PreToolUse: + Write, Edit). State persists in + `$XDG_CACHE_HOME/content-guards/no-real-ips-state.json`. 
- **webfetch-guard** — blocks outdated year references in web queries (PreToolUse: WebFetch, WebSearch) - **issue-limiter** — rate limits `gh issue create` and `gh pr create` (PreToolUse: Bash) - **branch-limiter** — limits concurrent open branches (PreToolUse: Bash) diff --git a/content-guards/hooks/hooks.json b/content-guards/hooks/hooks.json index 7435660..9e30d4f 100644 --- a/content-guards/hooks/hooks.json +++ b/content-guards/hooks/hooks.json @@ -8,6 +8,11 @@ "type": "command", "command": "${CLAUDE_PLUGIN_ROOT}/scripts/validate-token-limits.py", "timeout": 30 + }, + { + "type": "command", + "command": "${CLAUDE_PLUGIN_ROOT}/scripts/validate-no-real-ips.py", + "timeout": 30 } ] }, diff --git a/content-guards/scripts/validate-no-real-ips.py b/content-guards/scripts/validate-no-real-ips.py new file mode 100755 index 0000000..0234d02 --- /dev/null +++ b/content-guards/scripts/validate-no-real-ips.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +"""Claude Code PreToolUse hook: block non-allowed IP addresses in Write/Edit. + +Scans Write `content` and Edit `new_string` for IPv4 literals outside the +allowlist. First attempt blocks with a clear warning explaining the risk +and the allowed alternatives. Second attempt within TTL (same file + IP) +passes through — the retry IS the agent's acknowledgment that the use is +legitimate (private repo, .gitignored file, scratch buffer). + +Allowed IPv4 values: + 192.168.0.0/24 sanctioned example CIDR (the ONLY sample range) + 127.0.0.0, .1 loopback + 0.0.0.0 wildcard bind + 255.255.255.x broadcast + 169.254.169.254 cloud metadata service + +State lives in $XDG_CACHE_HOME/content-guards/no-real-ips-state.json +(falls back to ~/.cache/...). Per-(file, ip) timestamps; entries expire +after TTL_SECONDS. +""" + +from __future__ import annotations + +import json +import os +import re +import sys +import time +from pathlib import Path + +IP_PATTERN = re.compile(r"(? 
bool: + return any(p.match(ip) for p in ALLOWED_PATTERNS) + + +def find_violations(content: str) -> list[str]: + """Return non-allowed IPv4 literals in content, deduped, in first-seen order. + + Skips lines that look like pre-commit version pins ("rev: v1.2.3.4"). + """ + found: list[str] = [] + seen: set[str] = set() + for line in content.splitlines(): + if VERSION_PIN_PATTERN.search(line): + continue + for match in IP_PATTERN.finditer(line): + ip = match.group(0) + if is_allowed(ip) or ip in seen: + continue + seen.add(ip) + found.append(ip) + return found + + +def load_state() -> dict[str, float]: + try: + data = json.loads(STATE_FILE.read_text()) + except (OSError, json.JSONDecodeError): + return {} + return {k: float(v) for k, v in data.items() if isinstance(v, (int, float))} + + +def save_state(state: dict[str, float]) -> None: + try: + STATE_FILE.parent.mkdir(parents=True, exist_ok=True) + STATE_FILE.write_text(json.dumps(state)) + except OSError: + pass + + +def prune(state: dict[str, float], now: float) -> dict[str, float]: + return {k: ts for k, ts in state.items() if now - ts < TTL_SECONDS} + + +def extract_content(tool_name: str, tool_input: dict) -> str: + if tool_name == "Write": + return str(tool_input.get("content") or "") + if tool_name == "Edit": + return str(tool_input.get("new_string") or "") + return "" + + +def emit(decision: str, reason: str) -> None: + print(json.dumps({ + "hookSpecificOutput": { + "hookEventName": "PreToolUse", + "permissionDecision": decision, + "permissionDecisionReason": reason, + } + })) + + +def main() -> int: + try: + hook_input = json.load(sys.stdin) + except (json.JSONDecodeError, ValueError): + return 0 + + tool_name = hook_input.get("tool_name", "") + if tool_name not in ("Write", "Edit"): + return 0 + + tool_input = hook_input.get("tool_input") or {} + file_path = str(tool_input.get("file_path") or "") + content = extract_content(tool_name, tool_input) + + if not content: + return 0 + + violations = 
find_violations(content) + if not violations: + return 0 + + now = time.time() + state = prune(load_state(), now) + + keys = {ip: f"{file_path}:{ip}" for ip in violations} + unwarned = [ip for ip in violations if keys[ip] not in state] + + if not unwarned: + emit("allow", ( + f"WARNING (acknowledged): non-allowed IP(s) in {tool_name} of {file_path}: " + f"{', '.join(violations)}. Proceeding because this is a retry within the " + f"{TTL_SECONDS // 60}-min acknowledgment window. Confirm the file is not " + "committed publicly, or use a value from tests/fixtures.py / 192.168.0.x." + )) + return 0 + + for ip in unwarned: + state[keys[ip]] = now + save_state(state) + + emit("deny", ( + f"BLOCKED (first attempt): non-allowed IP(s) in {tool_name} of {file_path}: " + f"{', '.join(unwarned)}.\n\n" + "These IPs look like live network artifacts (often pasted from kubectl/cribl " + "tool output) and will leak into the repo if committed.\n\n" + "Preferred fixes:\n" + " 1. Replace with 192.168.0.x (the only sanctioned sample CIDR), or import " + "a constant from the repo's tests/fixtures.py if one exists.\n" + " 2. Reference the real value via a secret / env var.\n\n" + "If this use is legitimate (private repo, .gitignored file, scratch buffer), " + f"retry the same {tool_name} within {TTL_SECONDS // 60} minutes. The retry IS " + "your acknowledgment that you accept the risk." + )) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/content-guards/no-real-ips/no-real-ips.bats b/tests/content-guards/no-real-ips/no-real-ips.bats new file mode 100644 index 0000000..bb51c2e --- /dev/null +++ b/tests/content-guards/no-real-ips/no-real-ips.bats @@ -0,0 +1,167 @@ +#!/usr/bin/env bats +# Test suite for content-guards/scripts/validate-no-real-ips.py +# +# Tests tool name filtering, IP allowlist, version-pin skip, and the +# first-block / second-allow acknowledgment flow. 
+# +# Run with: bats tests/content-guards/no-real-ips/no-real-ips.bats + +setup() { + REPO_ROOT="$(cd "$(dirname "$BATS_TEST_FILENAME")/../../.." && pwd)" + SCRIPT="$REPO_ROOT/content-guards/scripts/validate-no-real-ips.py" + STATE_FILE="$(mktemp)" + rm -f "$STATE_FILE" + export NO_REAL_IPS_STATE_FILE="$STATE_FILE" + + if [[ ! -f "$SCRIPT" ]]; then + echo "ERROR: Script not found at $SCRIPT" >&2 + return 1 + fi +} + +teardown() { + rm -f "$STATE_FILE" +} + +run_hook() { + run python3 "$SCRIPT" <<< "$1" +} + +# --------------------------------------------------------------------------- +# TC1: Non Write/Edit tools are silently allowed (exit 0, no output) +# --------------------------------------------------------------------------- + +@test "TC1: Bash tool is allowed silently" { + run_hook '{"tool_name":"Bash","tool_input":{"command":"echo 10.0.1.200"}}' + [ "$status" -eq 0 ] + [ -z "$output" ] +} + +@test "TC1b: Read tool is allowed silently" { + run_hook '{"tool_name":"Read","tool_input":{"file_path":"/x"}}' + [ "$status" -eq 0 ] + [ -z "$output" ] +} + +@test "TC1c: invalid JSON is allowed silently (fail-open)" { + run python3 "$SCRIPT" <<< "not json" + [ "$status" -eq 0 ] + [ -z "$output" ] +} + +# --------------------------------------------------------------------------- +# TC2: Allowlist — every sanctioned range passes +# --------------------------------------------------------------------------- + +@test "TC2a: 192.168.0.x sample range allowed" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":"url=\"192.168.0.200\""}}' + [ "$status" -eq 0 ] + [ -z "$output" ] +} + +@test "TC2b: loopback 127.0.0.1 allowed" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":"host=\"127.0.0.1\""}}' + [ "$status" -eq 0 ] + [ -z "$output" ] +} + +@test "TC2c: 0.0.0.0 wildcard allowed" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.yml","content":"bind: 0.0.0.0"}}' + [ "$status" -eq 0 ] + [ -z "$output" ] 
+} + +@test "TC2d: 169.254.169.254 metadata allowed" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.sh","content":"curl 169.254.169.254/latest"}}' + [ "$status" -eq 0 ] + [ -z "$output" ] +} + +@test "TC2e: version pin rev: v0.10.0.1 is skipped" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.yml","content":"rev: v0.10.0.1"}}' + [ "$status" -eq 0 ] + [ -z "$output" ] +} + +@test "TC2f: 192.168.1.x (not 192.168.0.x) is BLOCKED" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":"x=\"192.168.1.50\""}}' + [ "$status" -eq 0 ] + echo "$output" | grep -q '"permissionDecision": "deny"' +} + +# --------------------------------------------------------------------------- +# TC3: First attempt blocks +# --------------------------------------------------------------------------- + +@test "TC3a: Write with 10.0.1.200 blocked on first attempt" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/repo/tests/test.py","content":"url=\"10.0.1.200\""}}' + [ "$status" -eq 0 ] + echo "$output" | grep -q '"permissionDecision": "deny"' + echo "$output" | grep -q 'BLOCKED (first attempt)' + echo "$output" | grep -q '10.0.1.200' +} + +@test "TC3b: Edit with new_string containing 172.16.0.5 blocked" { + run_hook '{"tool_name":"Edit","tool_input":{"file_path":"/x.yml","old_string":"old","new_string":"host: 172.16.0.5"}}' + [ "$status" -eq 0 ] + echo "$output" | grep -q '"permissionDecision": "deny"' + echo "$output" | grep -q '172.16.0.5' +} + +# --------------------------------------------------------------------------- +# TC4: Second attempt within TTL allows (acknowledgment) +# --------------------------------------------------------------------------- + +@test "TC4a: same file + same IP retried allows" { + INPUT='{"tool_name":"Write","tool_input":{"file_path":"/repo/scratch.py","content":"x=\"10.0.1.200\""}}' + run python3 "$SCRIPT" <<< "$INPUT" + echo "$output" | grep -q '"permissionDecision": "deny"' + + run 
python3 "$SCRIPT" <<< "$INPUT" + echo "$output" | grep -q '"permissionDecision": "allow"' + echo "$output" | grep -q 'WARNING (acknowledged)' +} + +@test "TC4b: same IP in a different file blocks again (per-file tracking)" { + FIRST='{"tool_name":"Write","tool_input":{"file_path":"/a.py","content":"x=\"10.0.1.200\""}}' + SECOND='{"tool_name":"Write","tool_input":{"file_path":"/b.py","content":"x=\"10.0.1.200\""}}' + run python3 "$SCRIPT" <<< "$FIRST" + echo "$output" | grep -q '"permissionDecision": "deny"' + + run python3 "$SCRIPT" <<< "$SECOND" + echo "$output" | grep -q '"permissionDecision": "deny"' +} + +@test "TC4c: same file, NEW IP on second write blocks the new one" { + FIRST='{"tool_name":"Write","tool_input":{"file_path":"/c.py","content":"x=\"10.0.1.200\""}}' + SECOND='{"tool_name":"Write","tool_input":{"file_path":"/c.py","content":"y=\"172.16.0.5\""}}' + run python3 "$SCRIPT" <<< "$FIRST" + echo "$output" | grep -q '"permissionDecision": "deny"' + + run python3 "$SCRIPT" <<< "$SECOND" + echo "$output" | grep -q '"permissionDecision": "deny"' + echo "$output" | grep -q '172.16.0.5' +} + +@test "TC4d: same file, retry with BOTH old (acknowledged) + new IP blocks only new" { + FIRST='{"tool_name":"Write","tool_input":{"file_path":"/d.py","content":"x=\"10.0.1.200\""}}' + SECOND='{"tool_name":"Write","tool_input":{"file_path":"/d.py","content":"a=\"10.0.1.200\"\nb=\"172.16.0.5\""}}' + run python3 "$SCRIPT" <<< "$FIRST" + echo "$output" | grep -q '"permissionDecision": "deny"' + + run python3 "$SCRIPT" <<< "$SECOND" + echo "$output" | grep -q '"permissionDecision": "deny"' + # The block message should name the new IP, not the acknowledged one + echo "$output" | grep -q '172.16.0.5' + echo "$output" | grep -vq 'BLOCKED.*10\.0\.1\.200,' +} + +# --------------------------------------------------------------------------- +# TC5: Empty content allowed silently +# --------------------------------------------------------------------------- + +@test "TC5: Write 
with no content is allowed silently" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":""}}' + [ "$status" -eq 0 ] + [ -z "$output" ] +} From 4f6d3ffb34ee7062ca20c3b5de50c2fa69b18fc1 Mon Sep 17 00:00:00 2001 From: JacobPEvans <20714140+JacobPEvans@users.noreply.github.com> Date: Sat, 23 May 2026 22:19:39 -0400 Subject: [PATCH 2/5] fix(content-guards): tighten octet regex, atomic state writes, normalize paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses gemini-code-assist review feedback on PR #319. - IP_PATTERN and ALLOWED_PATTERNS now use a strict 0-255 octet sub-pattern (_OCTET) so values like 999.999.999.999 no longer match as IPs at all. Reduces false positives. - save_state writes to a sibling .tmp file and os.replace's into place. Atomic against concurrent hook invocations during parallel tool execution. - file_path is normalized via os.path.realpath (stronger than the suggested os.path.abspath — also resolves symlinks). On macOS the /var -> /private/var symlink would otherwise cause the same file to be tracked under two state keys depending on how the agent spelled the path. realpath collapses both spellings to the same canonical path. Adds 4 bats tests (TC6a/b/c, TC7) covering the new behaviors. Assisted-by: Claude --- .../scripts/validate-no-real-ips.py | 14 ++++-- .../no-real-ips/no-real-ips.bats | 48 +++++++++++++++++++ 2 files changed, 57 insertions(+), 5 deletions(-) diff --git a/content-guards/scripts/validate-no-real-ips.py b/content-guards/scripts/validate-no-real-ips.py index 0234d02..8136803 100755 --- a/content-guards/scripts/validate-no-real-ips.py +++ b/content-guards/scripts/validate-no-real-ips.py @@ -28,13 +28,14 @@ import time from pathlib import Path -IP_PATTERN = re.compile(r"(? 
dict[str, float]: def save_state(state: dict[str, float]) -> None: try: STATE_FILE.parent.mkdir(parents=True, exist_ok=True) - STATE_FILE.write_text(json.dumps(state)) + tmp = STATE_FILE.with_suffix(STATE_FILE.suffix + ".tmp") + tmp.write_text(json.dumps(state)) + os.replace(tmp, STATE_FILE) except OSError: pass @@ -122,7 +125,8 @@ def main() -> int: return 0 tool_input = hook_input.get("tool_input") or {} - file_path = str(tool_input.get("file_path") or "") + raw_file_path = str(tool_input.get("file_path") or "") + file_path = os.path.realpath(raw_file_path) if raw_file_path else "" content = extract_content(tool_name, tool_input) if not content: diff --git a/tests/content-guards/no-real-ips/no-real-ips.bats b/tests/content-guards/no-real-ips/no-real-ips.bats index bb51c2e..3cd94af 100644 --- a/tests/content-guards/no-real-ips/no-real-ips.bats +++ b/tests/content-guards/no-real-ips/no-real-ips.bats @@ -165,3 +165,51 @@ run_hook() { [ "$status" -eq 0 ] [ -z "$output" ] } + +# --------------------------------------------------------------------------- +# TC6: Strict octet regex — out-of-range octets are NOT treated as IPs +# --------------------------------------------------------------------------- + +@test "TC6a: 999.999.999.999 is not matched (each octet > 255)" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":"x=\"999.999.999.999\""}}' + [ "$status" -eq 0 ] + [ -z "$output" ] +} + +@test "TC6b: 256.1.1.1 is not matched (first octet > 255)" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":"x=\"256.1.1.1\""}}' + [ "$status" -eq 0 ] + [ -z "$output" ] +} + +@test "TC6c: 192.168.0.256 is not matched as allowed-range (last octet > 255)" { + # If matched, it would BLOCK (since 256 isn't in 192.168.0.0/24). + # With the strict regex it's not an IP at all, so the line is skipped. 
+ run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":"x=\"192.168.0.256\""}}' + [ "$status" -eq 0 ] + [ -z "$output" ] +} + +# --------------------------------------------------------------------------- +# TC7: Path normalization — relative and absolute paths to the same file +# share the same acknowledgment slot. +# --------------------------------------------------------------------------- + +@test "TC7: relative and absolute paths to the same file share state" { + TMPDIR_TEST="$(mktemp -d)" + ABS_PATH="$TMPDIR_TEST/foo.py" + REL_PATH="./foo.py" + FIRST="{\"tool_name\":\"Write\",\"tool_input\":{\"file_path\":\"$ABS_PATH\",\"content\":\"x=\\\"10.0.1.200\\\"\"}}" + SECOND="{\"tool_name\":\"Write\",\"tool_input\":{\"file_path\":\"$REL_PATH\",\"content\":\"x=\\\"10.0.1.200\\\"\"}}" + + # First attempt blocks + run python3 "$SCRIPT" <<< "$FIRST" + echo "$output" | grep -q '"permissionDecision": "deny"' + + # Same IP, this time as a relative path resolved from the same cwd → allow + cd "$TMPDIR_TEST" + run python3 "$SCRIPT" <<< "$SECOND" + echo "$output" | grep -q '"permissionDecision": "allow"' + + rm -rf "$TMPDIR_TEST" +} From 3ad7257189fe10b687a82e8bc25e1ead96082ac6 Mon Sep 17 00:00:00 2001 From: JacobPEvans <20714140+JacobPEvans@users.noreply.github.com> Date: Sun, 24 May 2026 13:04:59 -0400 Subject: [PATCH 3/5] feat(content-guards)!: generalize no-real-ips to sensitive-content-guard Renames the IPv4-only hook to a general sensitive-content guard covering 7 detector categories with clean regexes and low false-positive rates. Each detector has its own allowlist and shares the first-block / second- allow UX so legitimate uses (private repos, scratch files, .gitignored paths) can proceed on retry. 
Detectors: - ipv4: existing behavior preserved (192.168.0.0/24, loopback, 0.0.0.0, broadcast, link-local metadata) - ipv6: outside ::, ::1, fe80::, fc00::/7, 2001:db8::, ff00:: - email: outside noreply@github.com, *.users.noreply.github.com, *@example.{com,org,net,local}, *@test, *@localhost, - absolute_user_path: hard-coded /Users// or /home// outside ${USER}/$USER/ placeholders - private_key_header: always blocked - aws_account_id: line-context-gated 12-digit numbers, allows AWS's documented 123456789012 sample - real_domain: FQDN-shaped tokens outside *.example.*, *.test, *.localhost, *.invalid, *.local, and a short explicit allowlist (github.com, docs.jacobpevans.com, runs-on.com, healthchecks.io) State key is (file, detector, value) so acknowledging one IPv4 does not pre-allow an unrelated email or domain. Bats tests split into sensitive-content.bats (IPv4 regression: 25 cases) and detectors.bats (per-detector + isolation: 30 cases). All 55 tests pass. BREAKING CHANGE: renames validate-no-real-ips.py to validate-sensitive-content.py, state file no-real-ips-state.json to sensitive-content-state.json, env var NO_REAL_IPS_STATE_FILE to SENSITIVE_CONTENT_STATE_FILE. 
Assisted-by: Claude --- content-guards/README.md | 39 ++- content-guards/hooks/hooks.json | 2 +- .../scripts/validate-no-real-ips.py | 175 ----------- .../scripts/validate-sensitive-content.py | 291 ++++++++++++++++++ .../sensitive-content/detectors.bats | 237 ++++++++++++++ .../sensitive-content.bats} | 104 +++---- 6 files changed, 602 insertions(+), 246 deletions(-) delete mode 100755 content-guards/scripts/validate-no-real-ips.py create mode 100755 content-guards/scripts/validate-sensitive-content.py create mode 100644 tests/content-guards/sensitive-content/detectors.bats rename tests/content-guards/{no-real-ips/no-real-ips.bats => sensitive-content/sensitive-content.bats} (61%) diff --git a/content-guards/README.md b/content-guards/README.md index 9ccf5f0..6a2fe22 100644 --- a/content-guards/README.md +++ b/content-guards/README.md @@ -8,7 +8,9 @@ See [ARCHITECTURE.md](ARCHITECTURE.md) for integration diagrams. - **markdown-validator**: Validates markdown with markdownlint - **token-validator**: Enforces configurable file token limits -- **no-real-ips**: Blocks non-allowed IPv4 literals in Write/Edit content; first-block, second-allow flow +- **sensitive-content-guard**: Blocks 7 categories of sensitive literals + (IPv4, IPv6, emails, user paths, private keys, AWS account IDs, real + domains) in Write/Edit content; first-block, second-allow flow per detector - **webfetch-guard**: Blocks outdated year references in web queries - **readme-validator**: Checks README files for required sections and badge health - **issue-limiter**: Prevents GitHub issue backlog overflow with 24h rate limiting @@ -18,12 +20,35 @@ See [ARCHITECTURE.md](ARCHITECTURE.md) for integration diagrams. No manual invocation required. All hooks activate automatically: - **token-validator** — blocks files exceeding token limits (PreToolUse: Write, Edit) -- **no-real-ips** — blocks IPv4 literals outside the allowlist - (`192.168.0.0/24`, loopback, `0.0.0.0`, broadcast, link-local metadata). 
- First attempt blocks with a clear warning; a retry within 5 minutes is - treated as the agent's acknowledgment and allowed through (PreToolUse: - Write, Edit). State persists in - `$XDG_CACHE_HOME/content-guards/no-real-ips-state.json`. +- **sensitive-content-guard** — blocks 7 categories of sensitive literals + in Write/Edit content (PreToolUse: Write, Edit). First attempt blocks + with a per-detector hint; a retry within 5 minutes for the same + `(file, detector, value)` is treated as the agent's acknowledgment and + allowed through. State persists in + `$XDG_CACHE_HOME/content-guards/sensitive-content-state.json`. + + Detectors and their allowlist anchors: + - **`ipv4`** — IPv4 outside `192.168.0.0/24`, loopback, `0.0.0.0`, + broadcast (`255.255.255.x`), link-local metadata (`169.254.169.254`). + - **`ipv6`** — IPv6 outside `::`/`::1`, `fe80::*` (link-local), + `fc00::/7` (ULA), `2001:db8::*` (RFC 3849 doc prefix), `ff00::*` + (multicast). + - **`email`** — real email addresses outside `noreply@github.com`, + `*@users.noreply.github.com`, `*@example.{com,org,net,local}`, + `*@test`, `*@localhost`, and `` shapes. + - **`absolute_user_path`** — hard-coded `/Users//` or + `/home//` outside `${USER}`, `$USER`, or `` placeholder + shapes. + - **`private_key_header`** — PEM private key markers + (`-----BEGIN … PRIVATE KEY-----`); always blocked. + - **`aws_account_id`** — bare 12-digit numbers on lines mentioning + `account_id`, `arn:aws:`, `aws_account_id`, or `:account:`; allows + `123456789012` (AWS's documented sample) and repeated-digit shapes. + - **`real_domain`** — FQDN-shaped tokens outside `*.example.*`, + `*.test`, `*.localhost`, `*.invalid`, `*.local`, the project's + short explicit allowlist (`github.com`, `api.github.com`, + `raw.githubusercontent.com`, `docs.jacobpevans.com`, `runs-on.com`, + `healthchecks.io`), and known file-extension shapes. 
- **webfetch-guard** — blocks outdated year references in web queries (PreToolUse: WebFetch, WebSearch) - **issue-limiter** — rate limits `gh issue create` and `gh pr create` (PreToolUse: Bash) - **branch-limiter** — limits concurrent open branches (PreToolUse: Bash) diff --git a/content-guards/hooks/hooks.json b/content-guards/hooks/hooks.json index 9e30d4f..1a141cc 100644 --- a/content-guards/hooks/hooks.json +++ b/content-guards/hooks/hooks.json @@ -11,7 +11,7 @@ }, { "type": "command", - "command": "${CLAUDE_PLUGIN_ROOT}/scripts/validate-no-real-ips.py", + "command": "${CLAUDE_PLUGIN_ROOT}/scripts/validate-sensitive-content.py", "timeout": 30 } ] diff --git a/content-guards/scripts/validate-no-real-ips.py b/content-guards/scripts/validate-no-real-ips.py deleted file mode 100755 index 8136803..0000000 --- a/content-guards/scripts/validate-no-real-ips.py +++ /dev/null @@ -1,175 +0,0 @@ -#!/usr/bin/env python3 -"""Claude Code PreToolUse hook: block non-allowed IP addresses in Write/Edit. - -Scans Write `content` and Edit `new_string` for IPv4 literals outside the -allowlist. First attempt blocks with a clear warning explaining the risk -and the allowed alternatives. Second attempt within TTL (same file + IP) -passes through — the retry IS the agent's acknowledgment that the use is -legitimate (private repo, .gitignored file, scratch buffer). - -Allowed IPv4 values: - 192.168.0.0/24 sanctioned example CIDR (the ONLY sample range) - 127.0.0.0, .1 loopback - 0.0.0.0 wildcard bind - 255.255.255.x broadcast - 169.254.169.254 cloud metadata service - -State lives in $XDG_CACHE_HOME/content-guards/no-real-ips-state.json -(falls back to ~/.cache/...). Per-(file, ip) timestamps; entries expire -after TTL_SECONDS. -""" - -from __future__ import annotations - -import json -import os -import re -import sys -import time -from pathlib import Path - -_OCTET = r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)" -IP_PATTERN = re.compile(rf"(? 
bool: - return any(p.match(ip) for p in ALLOWED_PATTERNS) - - -def find_violations(content: str) -> list[str]: - """Return non-allowed IPv4 literals in content, deduped, in first-seen order. - - Skips lines that look like pre-commit version pins ("rev: v1.2.3.4"). - """ - found: list[str] = [] - seen: set[str] = set() - for line in content.splitlines(): - if VERSION_PIN_PATTERN.search(line): - continue - for match in IP_PATTERN.finditer(line): - ip = match.group(0) - if is_allowed(ip) or ip in seen: - continue - seen.add(ip) - found.append(ip) - return found - - -def load_state() -> dict[str, float]: - try: - data = json.loads(STATE_FILE.read_text()) - except (OSError, json.JSONDecodeError): - return {} - return {k: float(v) for k, v in data.items() if isinstance(v, (int, float))} - - -def save_state(state: dict[str, float]) -> None: - try: - STATE_FILE.parent.mkdir(parents=True, exist_ok=True) - tmp = STATE_FILE.with_suffix(STATE_FILE.suffix + ".tmp") - tmp.write_text(json.dumps(state)) - os.replace(tmp, STATE_FILE) - except OSError: - pass - - -def prune(state: dict[str, float], now: float) -> dict[str, float]: - return {k: ts for k, ts in state.items() if now - ts < TTL_SECONDS} - - -def extract_content(tool_name: str, tool_input: dict) -> str: - if tool_name == "Write": - return str(tool_input.get("content") or "") - if tool_name == "Edit": - return str(tool_input.get("new_string") or "") - return "" - - -def emit(decision: str, reason: str) -> None: - print(json.dumps({ - "hookSpecificOutput": { - "hookEventName": "PreToolUse", - "permissionDecision": decision, - "permissionDecisionReason": reason, - } - })) - - -def main() -> int: - try: - hook_input = json.load(sys.stdin) - except (json.JSONDecodeError, ValueError): - return 0 - - tool_name = hook_input.get("tool_name", "") - if tool_name not in ("Write", "Edit"): - return 0 - - tool_input = hook_input.get("tool_input") or {} - raw_file_path = str(tool_input.get("file_path") or "") - file_path = 
os.path.realpath(raw_file_path) if raw_file_path else "" - content = extract_content(tool_name, tool_input) - - if not content: - return 0 - - violations = find_violations(content) - if not violations: - return 0 - - now = time.time() - state = prune(load_state(), now) - - keys = {ip: f"{file_path}:{ip}" for ip in violations} - unwarned = [ip for ip in violations if keys[ip] not in state] - - if not unwarned: - emit("allow", ( - f"WARNING (acknowledged): non-allowed IP(s) in {tool_name} of {file_path}: " - f"{', '.join(violations)}. Proceeding because this is a retry within the " - f"{TTL_SECONDS // 60}-min acknowledgment window. Confirm the file is not " - "committed publicly, or use a value from tests/fixtures.py / 192.168.0.x." - )) - return 0 - - for ip in unwarned: - state[keys[ip]] = now - save_state(state) - - emit("deny", ( - f"BLOCKED (first attempt): non-allowed IP(s) in {tool_name} of {file_path}: " - f"{', '.join(unwarned)}.\n\n" - "These IPs look like live network artifacts (often pasted from kubectl/cribl " - "tool output) and will leak into the repo if committed.\n\n" - "Preferred fixes:\n" - " 1. Replace with 192.168.0.x (the only sanctioned sample CIDR), or import " - "a constant from the repo's tests/fixtures.py if one exists.\n" - " 2. Reference the real value via a secret / env var.\n\n" - "If this use is legitimate (private repo, .gitignored file, scratch buffer), " - f"retry the same {tool_name} within {TTL_SECONDS // 60} minutes. The retry IS " - "your acknowledgment that you accept the risk." - )) - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/content-guards/scripts/validate-sensitive-content.py b/content-guards/scripts/validate-sensitive-content.py new file mode 100755 index 0000000..29d9d92 --- /dev/null +++ b/content-guards/scripts/validate-sensitive-content.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python3 +"""PreToolUse hook: block sensitive content in Write/Edit. 
See README.""" +from __future__ import annotations + +import json +import os +import re +import sys +import time +from dataclasses import dataclass, field +from pathlib import Path +from typing import Callable, Optional + +VERSION_PIN = re.compile(r"\brev:\s*v?\d") +HASH_LINE = re.compile(r"\b(?:sha\d+|md5|cas)[-: ]", re.IGNORECASE) +EMAIL_PLACEHOLDER = re.compile(r"<[A-Za-z][A-Za-z0-9._-]*@[A-Za-z0-9._-]*>") +LINK_REF = re.compile(r"^\s*\[[^\]]+\]:\s") +REPO_LINE = re.compile(r"^\s*repo:\s") +IMAGE_LINE = re.compile(r"^\s*image:\s") + +_OCT = r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)" +IPV4 = re.compile(rf"(? bool: + return any(p.match(v) for p in _IPV4_OK) + + +def _ipv6_allowed(v: str) -> bool: + v = v.lower() + if v in ("::", "::1"): + return True + if v.startswith(("fe80:", "fe80::", "2001:db8:", "2001:db8::")): + return True + return bool(re.match(r"^(?:f[cd][0-9a-f]{0,2}|ff[0-9a-f]{2}):", v)) + + +def _email_allowed(v: str) -> bool: + v = v.lower() + return (v == "noreply@github.com" + or v.endswith(_EMAIL_SUFFIX) or v.startswith(_EMAIL_PREFIX)) + + +def _user_path_allowed(v: str) -> bool: + lower = v.lower() + if "" in lower or "$user" in lower or "${user}" in lower: + return True + return bool(_REAL_USER) and v.endswith(f"/{_REAL_USER}/") + + +def _aws_allowed(v: str) -> bool: + return v in {"123456789012", "000000000000"} or len(set(v)) == 1 + + +def _domain_allowed(v: str) -> bool: + v = v.lower() + if v in _DOMAIN_EXACT or v.endswith(_DOMAIN_SUFFIX): + return True + return v.rsplit(".", 1)[-1] in FILE_EXTENSION_TLDS + + +def _domain_skip(line: str) -> bool: + return bool( + REPO_LINE.match(line) or IMAGE_LINE.match(line) + or LINK_REF.match(line) or EMAIL_PLACEHOLDER.search(line) + ) + + +def _ip_skip(line: str) -> bool: + return bool(VERSION_PIN.search(line) or HASH_LINE.search(line)) + + +@dataclass +class Detector: + name: str + pattern: re.Pattern + is_allowed: Callable[[str], bool] + message_hint: str + skip_line: Optional[Callable[[str], bool]] = 
None + line_context: Optional[Callable[[str], bool]] = None + normalize: Callable[[str], str] = field(default=lambda v: v) + + +DETECTORS: list[Detector] = [ + Detector("ipv4", IPV4, _ipv4_allowed, + "use 192.168.0.x sample CIDR or env/secret.", + skip_line=lambda l: bool(VERSION_PIN.search(l))), + Detector("ipv6", IPV6, _ipv6_allowed, + "use 2001:db8:: (RFC 3849 doc prefix), fe80::, ::1, or env var.", + skip_line=_ip_skip), + Detector("email", EMAIL, _email_allowed, + "use ``, *@example.com, or a GitHub no-reply variant.", + skip_line=lambda l: bool(EMAIL_PLACEHOLDER.search(l))), + Detector("absolute_user_path", USER_PATH, _user_path_allowed, + "use ${HOME}, ~, ${USER}, or placeholders."), + Detector("private_key_header", PRIVATE_KEY, lambda _v: False, + "private keys belong in keychain/SOPS/Doppler, never a file."), + Detector("aws_account_id", AWS_ACCT, _aws_allowed, + "use 123456789012 (AWS sample) or ${AWS_ACCOUNT_ID}.", + line_context=lambda l: bool(AWS_CTX.search(l))), + Detector("real_domain", DOMAIN, _domain_allowed, + "use example.com, *.test, *.localhost, or env var.", + skip_line=_domain_skip, normalize=str.lower), +] + +_CACHE = Path(os.environ.get("XDG_CACHE_HOME") or (Path.home() / ".cache")) +STATE_FILE = _CACHE / "content-guards" / "sensitive-content-state.json" +TTL_SECONDS = 300 +if (_override := os.environ.get("SENSITIVE_CONTENT_STATE_FILE")): + STATE_FILE = Path(_override) + + +def find_violations(content: str) -> list[tuple[Detector, str]]: + found: list[tuple[Detector, str]] = [] + seen: set[tuple[str, str]] = set() + for line in content.splitlines(): + for det in DETECTORS: + if det.skip_line and det.skip_line(line): + continue + if det.line_context and not det.line_context(line): + continue + for match in det.pattern.finditer(line): + value = det.normalize(match.group(0)) + if det.is_allowed(value): + continue + key = (det.name, value) + if key in seen: + continue + seen.add(key) + found.append((det, value)) + return found + + +def 
load_state() -> dict[str, float]: + try: + data = json.loads(STATE_FILE.read_text()) + except (OSError, json.JSONDecodeError): + return {} + return {k: float(v) for k, v in data.items() if isinstance(v, (int, float))} + + +def save_state(state: dict[str, float]) -> None: + try: + STATE_FILE.parent.mkdir(parents=True, exist_ok=True) + tmp = STATE_FILE.with_suffix(STATE_FILE.suffix + ".tmp") + tmp.write_text(json.dumps(state)) + os.replace(tmp, STATE_FILE) + except OSError: + pass + + +def prune(state: dict[str, float], now: float) -> dict[str, float]: + return {k: ts for k, ts in state.items() if now - ts < TTL_SECONDS} + + +def extract_content(tool_name: str, tool_input: dict) -> str: + if tool_name == "Write": + return str(tool_input.get("content") or "") + if tool_name == "Edit": + return str(tool_input.get("new_string") or "") + return "" + + +def emit(decision: str, reason: str) -> None: + print(json.dumps({"hookSpecificOutput": { + "hookEventName": "PreToolUse", + "permissionDecision": decision, + "permissionDecisionReason": reason, + }})) + + +def _group(violations: list[tuple[Detector, str]]) -> list[tuple[Detector, list[str]]]: + order: list[Detector] = [] + by_name: dict[str, list[str]] = {} + for det, value in violations: + if det.name not in by_name: + by_name[det.name] = [] + order.append(det) + by_name[det.name].append(value) + return [(det, by_name[det.name]) for det in order] + + +def _format(groups: list[tuple[Detector, list[str]]], hint: bool) -> str: + lines = [] + for det, values in groups: + head = f" [{det.name}] {', '.join(values)}" + if hint and det.message_hint: + head += f"\n -> {det.message_hint}" + lines.append(head) + return "\n".join(lines) + + +def main() -> int: + try: + hook_input = json.load(sys.stdin) + except (json.JSONDecodeError, ValueError): + return 0 + tool_name = hook_input.get("tool_name", "") + if tool_name not in ("Write", "Edit"): + return 0 + tool_input = hook_input.get("tool_input") or {} + raw_path = 
str(tool_input.get("file_path") or "") + file_path = os.path.realpath(raw_path) if raw_path else "" + content = extract_content(tool_name, tool_input) + if not content: + return 0 + violations = find_violations(content) + if not violations: + return 0 + + now = time.time() + state = prune(load_state(), now) + keys = {(d.name, v): f"{file_path}:{d.name}:{v}" for d, v in violations} + unwarned = [(d, v) for d, v in violations if keys[(d.name, v)] not in state] + + if not unwarned: + emit("allow", ( + f"WARNING (acknowledged): sensitive content in {tool_name} of " + f"{file_path}:\n{_format(_group(violations), hint=False)}\n\n" + f"Proceeding because this is a retry within the {TTL_SECONDS // 60}-min " + "window. Confirm the file is not committed publicly." + )) + return 0 + + for det, value in unwarned: + state[keys[(det.name, value)]] = now + save_state(state) + emit("deny", ( + f"BLOCKED (first attempt): sensitive content in {tool_name} of " + f"{file_path}:\n{_format(_group(unwarned), hint=True)}\n\n" + "These values look like real artifacts and would leak if committed.\n" + f"Retry within {TTL_SECONDS // 60} min to acknowledge and proceed." + )) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/content-guards/sensitive-content/detectors.bats b/tests/content-guards/sensitive-content/detectors.bats new file mode 100644 index 0000000..e2f2792 --- /dev/null +++ b/tests/content-guards/sensitive-content/detectors.bats @@ -0,0 +1,237 @@ +#!/usr/bin/env bats +# Per-detector cases for validate-sensitive-content.py: ipv6, email, +# absolute_user_path, private_key_header, aws_account_id, real_domain, +# and cross-detector state isolation. +# +# Run with: bats tests/content-guards/sensitive-content/detectors.bats + +setup() { + REPO_ROOT="$(cd "$(dirname "$BATS_TEST_FILENAME")/../../.." 
&& pwd)" + SCRIPT="$REPO_ROOT/content-guards/scripts/validate-sensitive-content.py" + STATE_FILE="$(mktemp)" + rm -f "$STATE_FILE" + export SENSITIVE_CONTENT_STATE_FILE="$STATE_FILE" +} + +teardown() { rm -f "$STATE_FILE"; } + +run_hook() { run python3 "$SCRIPT" <<< "$1"; } + +# --- ipv6 ------------------------------------------------------------------- + +@test "ipv6 allow: ::1 loopback" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":"h=\"::1\""}}' + [ "$status" -eq 0 ] && [ -z "$output" ] +} + +@test "ipv6 allow: 2001:db8:: documentation prefix" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":"a=\"2001:db8::1\""}}' + [ "$status" -eq 0 ] && [ -z "$output" ] +} + +@test "ipv6 allow: fe80:: link-local" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":"a=\"fe80::1234\""}}' + [ "$status" -eq 0 ] && [ -z "$output" ] +} + +@test "ipv6 block: 2620:0:860::1 (real cloudflare-ish)" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":"a=\"2620:0:860::1\""}}' + echo "$output" | grep -q '"permissionDecision": "deny"' + echo "$output" | grep -q 'ipv6' +} + +@test "ipv6 skip: cas-sha256 hash not matched" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.json","content":"cas-sha256:abcd:1234:5678:beef"}}' + [ "$status" -eq 0 ] && [ -z "$output" ] +} + +@test "ipv6 first-block-second-allow" { + IN='{"tool_name":"Write","tool_input":{"file_path":"/i6s.py","content":"a=\"2620:0:860::1\""}}' + run python3 "$SCRIPT" <<< "$IN" + echo "$output" | grep -q '"permissionDecision": "deny"' + run python3 "$SCRIPT" <<< "$IN" + echo "$output" | grep -q '"permissionDecision": "allow"' +} + +# --- email ------------------------------------------------------------------ + +@test "email allow: noreply@github.com" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":"a=\"noreply@github.com\""}}' + [ "$status" -eq 0 ] && [ -z "$output" ] +} 
+ +@test "email allow: foo@users.noreply.github.com" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":"e=\"foo@users.noreply.github.com\""}}' + [ "$status" -eq 0 ] && [ -z "$output" ] +} + +@test "email allow: bar@example.com" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":"e=\"bar@example.com\""}}' + [ "$status" -eq 0 ] && [ -z "$output" ] +} + +@test "email allow: placeholder shape " { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.md","content":"contact "}}' + [ "$status" -eq 0 ] && [ -z "$output" ] +} + +@test "email block: real alice@realdomain.io" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":"e=\"alice@realdomain.io\""}}' + echo "$output" | grep -q '"permissionDecision": "deny"' + echo "$output" | grep -q 'email' +} + +@test "email first-block-second-allow" { + IN='{"tool_name":"Write","tool_input":{"file_path":"/es.py","content":"e=\"bob@realcompany.io\""}}' + run python3 "$SCRIPT" <<< "$IN" + echo "$output" | grep -q '"permissionDecision": "deny"' + run python3 "$SCRIPT" <<< "$IN" + echo "$output" | grep -q '"permissionDecision": "allow"' +} + +# --- absolute_user_path ----------------------------------------------------- + +@test "user_path allow: placeholder" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.md","content":"cd /Users//p"}}' + [ "$status" -eq 0 ] && [ -z "$output" ] +} + +@test "user_path allow: \\$USER var" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.sh","content":"cd /home/$USER/w"}}' + [ "$status" -eq 0 ] && [ -z "$output" ] +} + +@test "user_path block: hard-coded /Users/alice/" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.md","content":"cd /Users/alice/p"}}' + echo "$output" | grep -q '"permissionDecision": "deny"' + echo "$output" | grep -q 'absolute_user_path' +} + +@test "user_path first-block-second-allow" { + 
IN='{"tool_name":"Write","tool_input":{"file_path":"/ps.md","content":"cd /Users/carol/p"}}' + run python3 "$SCRIPT" <<< "$IN" + echo "$output" | grep -q '"permissionDecision": "deny"' + run python3 "$SCRIPT" <<< "$IN" + echo "$output" | grep -q '"permissionDecision": "allow"' +} + +# --- private_key_header ----------------------------------------------------- + +@test "private_key block: BEGIN RSA PRIVATE KEY" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.pem","content":"-----BEGIN RSA PRIVATE KEY-----"}}' + echo "$output" | grep -q '"permissionDecision": "deny"' + echo "$output" | grep -q 'private_key_header' +} + +@test "private_key block: bare BEGIN PRIVATE KEY (PKCS8)" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.pem","content":"-----BEGIN PRIVATE KEY-----"}}' + echo "$output" | grep -q '"permissionDecision": "deny"' +} + +@test "private_key allow: no header literal text" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.md","content":"do not commit private keys"}}' + [ "$status" -eq 0 ] && [ -z "$output" ] +} + +@test "private_key first-block-second-allow" { + IN='{"tool_name":"Write","tool_input":{"file_path":"/ks.pem","content":"-----BEGIN OPENSSH PRIVATE KEY-----"}}' + run python3 "$SCRIPT" <<< "$IN" + echo "$output" | grep -q '"permissionDecision": "deny"' + run python3 "$SCRIPT" <<< "$IN" + echo "$output" | grep -q '"permissionDecision": "allow"' +} + +# --- aws_account_id --------------------------------------------------------- + +@test "aws allow: 123456789012 sample" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.tf","content":"account_id = \"123456789012\""}}' + [ "$status" -eq 0 ] && [ -z "$output" ] +} + +@test "aws allow: 12-digit with no AWS context" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.txt","content":"phone: 555123456789"}}' + [ "$status" -eq 0 ] && [ -z "$output" ] +} + +@test "aws block: 987654321098 with account_id" { + run_hook 
'{"tool_name":"Write","tool_input":{"file_path":"/x.tf","content":"account_id = \"987654321098\""}}' + echo "$output" | grep -q '"permissionDecision": "deny"' + echo "$output" | grep -q 'aws_account_id' +} + +@test "aws block: ARN with real account id" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.tf","content":"arn:aws:iam::246813579246:role/x"}}' + echo "$output" | grep -q '"permissionDecision": "deny"' +} + +@test "aws first-block-second-allow" { + IN='{"tool_name":"Write","tool_input":{"file_path":"/as.tf","content":"aws_account_id = 555444333222"}}' + run python3 "$SCRIPT" <<< "$IN" + echo "$output" | grep -q '"permissionDecision": "deny"' + run python3 "$SCRIPT" <<< "$IN" + echo "$output" | grep -q '"permissionDecision": "allow"' +} + +# --- real_domain ------------------------------------------------------------ + +@test "domain allow: example.com" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":"u=\"https://example.com/a\""}}' + [ "$status" -eq 0 ] && [ -z "$output" ] +} + +@test "domain allow: github.com (allowlist)" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":"u=\"https://github.com/f/b\""}}' + [ "$status" -eq 0 ] && [ -z "$output" ] +} + +@test "domain allow: docs.jacobpevans.com" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.md","content":"see https://docs.jacobpevans.com/f"}}' + [ "$status" -eq 0 ] && [ -z "$output" ] +} + +@test "domain allow: db.foo.test" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":"h=\"db.foo.test\""}}' + [ "$status" -eq 0 ] && [ -z "$output" ] +} + +@test "domain allow: filename foo.md is not a domain" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":"p=\"docs/foo.md\""}}' + [ "$status" -eq 0 ] && [ -z "$output" ] +} + +@test "domain allow: repo: line skipped" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.yaml","content":"repo: 
https://realdomain.io/f"}}' + [ "$status" -eq 0 ] && [ -z "$output" ] +} + +@test "domain allow: image: line skipped" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.yaml","content":"image: docker.io/library/nginx:1.25"}}' + [ "$status" -eq 0 ] && [ -z "$output" ] +} + +@test "domain block: realbusiness.io" { + run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":"u=\"https://realbusiness.io/a\""}}' + echo "$output" | grep -q '"permissionDecision": "deny"' + echo "$output" | grep -q 'real_domain' +} + +@test "domain first-block-second-allow" { + IN='{"tool_name":"Write","tool_input":{"file_path":"/ds.py","content":"u=\"https://realbusiness.io\""}}' + run python3 "$SCRIPT" <<< "$IN" + echo "$output" | grep -q '"permissionDecision": "deny"' + run python3 "$SCRIPT" <<< "$IN" + echo "$output" | grep -q '"permissionDecision": "allow"' +} + +# --- state-key isolation ---------------------------------------------------- + +@test "state isolation: acknowledged IPv4 does not pre-allow new email" { + F='{"tool_name":"Write","tool_input":{"file_path":"/mix.py","content":"x=\"10.0.1.200\""}}' + S='{"tool_name":"Write","tool_input":{"file_path":"/mix.py","content":"x=\"10.0.1.200\"\ne=\"al@realdom.io\""}}' + run python3 "$SCRIPT" <<< "$F" + echo "$output" | grep -q '"permissionDecision": "deny"' + run python3 "$SCRIPT" <<< "$S" + echo "$output" | grep -q '"permissionDecision": "deny"' + echo "$output" | grep -q 'email' + echo "$output" | grep -q 'al@realdom.io' +} diff --git a/tests/content-guards/no-real-ips/no-real-ips.bats b/tests/content-guards/sensitive-content/sensitive-content.bats similarity index 61% rename from tests/content-guards/no-real-ips/no-real-ips.bats rename to tests/content-guards/sensitive-content/sensitive-content.bats index 3cd94af..3f38599 100644 --- a/tests/content-guards/no-real-ips/no-real-ips.bats +++ b/tests/content-guards/sensitive-content/sensitive-content.bats @@ -1,17 +1,18 @@ #!/usr/bin/env bats -# Test 
suite for content-guards/scripts/validate-no-real-ips.py +# Test suite for content-guards/scripts/validate-sensitive-content.py +# IPv4 detector + tool filtering + state machine (regression preservation +# from the original no-real-ips hook). Per-detector cases for ipv6, email, +# absolute_user_path, private_key_header, aws_account_id, real_domain live +# in detectors.bats in the same directory. # -# Tests tool name filtering, IP allowlist, version-pin skip, and the -# first-block / second-allow acknowledgment flow. -# -# Run with: bats tests/content-guards/no-real-ips/no-real-ips.bats +# Run with: bats tests/content-guards/sensitive-content/sensitive-content.bats setup() { REPO_ROOT="$(cd "$(dirname "$BATS_TEST_FILENAME")/../../.." && pwd)" - SCRIPT="$REPO_ROOT/content-guards/scripts/validate-no-real-ips.py" + SCRIPT="$REPO_ROOT/content-guards/scripts/validate-sensitive-content.py" STATE_FILE="$(mktemp)" rm -f "$STATE_FILE" - export NO_REAL_IPS_STATE_FILE="$STATE_FILE" + export SENSITIVE_CONTENT_STATE_FILE="$STATE_FILE" if [[ ! 
-f "$SCRIPT" ]]; then echo "ERROR: Script not found at $SCRIPT" >&2 @@ -50,7 +51,7 @@ run_hook() { } # --------------------------------------------------------------------------- -# TC2: Allowlist — every sanctioned range passes +# TC2: IPv4 allowlist — every sanctioned range passes # --------------------------------------------------------------------------- @test "TC2a: 192.168.0.x sample range allowed" { @@ -85,79 +86,68 @@ run_hook() { @test "TC2f: 192.168.1.x (not 192.168.0.x) is BLOCKED" { run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":"x=\"192.168.1.50\""}}' - [ "$status" -eq 0 ] echo "$output" | grep -q '"permissionDecision": "deny"' + echo "$output" | grep -q 'ipv4' } # --------------------------------------------------------------------------- -# TC3: First attempt blocks +# TC3-TC4: First-block / second-allow flow # --------------------------------------------------------------------------- @test "TC3a: Write with 10.0.1.200 blocked on first attempt" { run_hook '{"tool_name":"Write","tool_input":{"file_path":"/repo/tests/test.py","content":"url=\"10.0.1.200\""}}' - [ "$status" -eq 0 ] echo "$output" | grep -q '"permissionDecision": "deny"' echo "$output" | grep -q 'BLOCKED (first attempt)' echo "$output" | grep -q '10.0.1.200' } @test "TC3b: Edit with new_string containing 172.16.0.5 blocked" { - run_hook '{"tool_name":"Edit","tool_input":{"file_path":"/x.yml","old_string":"old","new_string":"host: 172.16.0.5"}}' - [ "$status" -eq 0 ] + run_hook '{"tool_name":"Edit","tool_input":{"file_path":"/x.yml","old_string":"o","new_string":"host: 172.16.0.5"}}' echo "$output" | grep -q '"permissionDecision": "deny"' echo "$output" | grep -q '172.16.0.5' } -# --------------------------------------------------------------------------- -# TC4: Second attempt within TTL allows (acknowledgment) -# --------------------------------------------------------------------------- - @test "TC4a: same file + same IP retried allows" { - 
INPUT='{"tool_name":"Write","tool_input":{"file_path":"/repo/scratch.py","content":"x=\"10.0.1.200\""}}' - run python3 "$SCRIPT" <<< "$INPUT" + IN='{"tool_name":"Write","tool_input":{"file_path":"/repo/scratch.py","content":"x=\"10.0.1.200\""}}' + run python3 "$SCRIPT" <<< "$IN" echo "$output" | grep -q '"permissionDecision": "deny"' - - run python3 "$SCRIPT" <<< "$INPUT" + run python3 "$SCRIPT" <<< "$IN" echo "$output" | grep -q '"permissionDecision": "allow"' echo "$output" | grep -q 'WARNING (acknowledged)' } -@test "TC4b: same IP in a different file blocks again (per-file tracking)" { - FIRST='{"tool_name":"Write","tool_input":{"file_path":"/a.py","content":"x=\"10.0.1.200\""}}' - SECOND='{"tool_name":"Write","tool_input":{"file_path":"/b.py","content":"x=\"10.0.1.200\""}}' - run python3 "$SCRIPT" <<< "$FIRST" +@test "TC4b: same IP in a different file blocks again" { + F='{"tool_name":"Write","tool_input":{"file_path":"/a.py","content":"x=\"10.0.1.200\""}}' + S='{"tool_name":"Write","tool_input":{"file_path":"/b.py","content":"x=\"10.0.1.200\""}}' + run python3 "$SCRIPT" <<< "$F" echo "$output" | grep -q '"permissionDecision": "deny"' - - run python3 "$SCRIPT" <<< "$SECOND" + run python3 "$SCRIPT" <<< "$S" echo "$output" | grep -q '"permissionDecision": "deny"' } @test "TC4c: same file, NEW IP on second write blocks the new one" { - FIRST='{"tool_name":"Write","tool_input":{"file_path":"/c.py","content":"x=\"10.0.1.200\""}}' - SECOND='{"tool_name":"Write","tool_input":{"file_path":"/c.py","content":"y=\"172.16.0.5\""}}' - run python3 "$SCRIPT" <<< "$FIRST" + F='{"tool_name":"Write","tool_input":{"file_path":"/c.py","content":"x=\"10.0.1.200\""}}' + S='{"tool_name":"Write","tool_input":{"file_path":"/c.py","content":"y=\"172.16.0.5\""}}' + run python3 "$SCRIPT" <<< "$F" echo "$output" | grep -q '"permissionDecision": "deny"' - - run python3 "$SCRIPT" <<< "$SECOND" + run python3 "$SCRIPT" <<< "$S" echo "$output" | grep -q '"permissionDecision": "deny"' echo 
"$output" | grep -q '172.16.0.5' } -@test "TC4d: same file, retry with BOTH old (acknowledged) + new IP blocks only new" { - FIRST='{"tool_name":"Write","tool_input":{"file_path":"/d.py","content":"x=\"10.0.1.200\""}}' - SECOND='{"tool_name":"Write","tool_input":{"file_path":"/d.py","content":"a=\"10.0.1.200\"\nb=\"172.16.0.5\""}}' - run python3 "$SCRIPT" <<< "$FIRST" +@test "TC4d: same file, retry with both old (ack) + new IP blocks only new" { + F='{"tool_name":"Write","tool_input":{"file_path":"/d.py","content":"x=\"10.0.1.200\""}}' + S='{"tool_name":"Write","tool_input":{"file_path":"/d.py","content":"a=\"10.0.1.200\"\nb=\"172.16.0.5\""}}' + run python3 "$SCRIPT" <<< "$F" echo "$output" | grep -q '"permissionDecision": "deny"' - - run python3 "$SCRIPT" <<< "$SECOND" + run python3 "$SCRIPT" <<< "$S" echo "$output" | grep -q '"permissionDecision": "deny"' - # The block message should name the new IP, not the acknowledged one echo "$output" | grep -q '172.16.0.5' echo "$output" | grep -vq 'BLOCKED.*10\.0\.1\.200,' } # --------------------------------------------------------------------------- -# TC5: Empty content allowed silently +# TC5-TC6: Edge cases — empty content, strict octet regex # --------------------------------------------------------------------------- @test "TC5: Write with no content is allowed silently" { @@ -166,10 +156,6 @@ run_hook() { [ -z "$output" ] } -# --------------------------------------------------------------------------- -# TC6: Strict octet regex — out-of-range octets are NOT treated as IPs -# --------------------------------------------------------------------------- - @test "TC6a: 999.999.999.999 is not matched (each octet > 255)" { run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":"x=\"999.999.999.999\""}}' [ "$status" -eq 0 ] @@ -182,34 +168,26 @@ run_hook() { [ -z "$output" ] } -@test "TC6c: 192.168.0.256 is not matched as allowed-range (last octet > 255)" { - # If matched, it would BLOCK (since 256 
isn't in 192.168.0.0/24). - # With the strict regex it's not an IP at all, so the line is skipped. +@test "TC6c: 192.168.0.256 is not matched (last octet > 255)" { run_hook '{"tool_name":"Write","tool_input":{"file_path":"/x.py","content":"x=\"192.168.0.256\""}}' [ "$status" -eq 0 ] [ -z "$output" ] } # --------------------------------------------------------------------------- -# TC7: Path normalization — relative and absolute paths to the same file -# share the same acknowledgment slot. +# TC7: Path normalization # --------------------------------------------------------------------------- @test "TC7: relative and absolute paths to the same file share state" { - TMPDIR_TEST="$(mktemp -d)" - ABS_PATH="$TMPDIR_TEST/foo.py" - REL_PATH="./foo.py" - FIRST="{\"tool_name\":\"Write\",\"tool_input\":{\"file_path\":\"$ABS_PATH\",\"content\":\"x=\\\"10.0.1.200\\\"\"}}" - SECOND="{\"tool_name\":\"Write\",\"tool_input\":{\"file_path\":\"$REL_PATH\",\"content\":\"x=\\\"10.0.1.200\\\"\"}}" - - # First attempt blocks - run python3 "$SCRIPT" <<< "$FIRST" + T="$(mktemp -d)" + ABS="$T/foo.py" + REL="./foo.py" + F="{\"tool_name\":\"Write\",\"tool_input\":{\"file_path\":\"$ABS\",\"content\":\"x=\\\"10.0.1.200\\\"\"}}" + S="{\"tool_name\":\"Write\",\"tool_input\":{\"file_path\":\"$REL\",\"content\":\"x=\\\"10.0.1.200\\\"\"}}" + run python3 "$SCRIPT" <<< "$F" echo "$output" | grep -q '"permissionDecision": "deny"' - - # Same IP, this time as a relative path resolved from the same cwd → allow - cd "$TMPDIR_TEST" - run python3 "$SCRIPT" <<< "$SECOND" + cd "$T" + run python3 "$SCRIPT" <<< "$S" echo "$output" | grep -q '"permissionDecision": "allow"' - - rm -rf "$TMPDIR_TEST" + rm -rf "$T" } From 54cc8e5939304e201511033d0d75deef4bb92b46 Mon Sep 17 00:00:00 2001 From: JacobPEvans <20714140+JacobPEvans@users.noreply.github.com> Date: Sun, 24 May 2026 13:06:18 -0400 Subject: [PATCH 4/5] chore(content-guards): silence pyright unused-arg on private_key lambda The detector's is_allowed is always 
False (private keys never have a legitimate allowlist), so the argument is intentionally unused. Rename `_v` to `_` to match the Pyright convention for ignored args. Assisted-by: Claude --- content-guards/scripts/validate-sensitive-content.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content-guards/scripts/validate-sensitive-content.py b/content-guards/scripts/validate-sensitive-content.py index 29d9d92..12c86d2 100755 --- a/content-guards/scripts/validate-sensitive-content.py +++ b/content-guards/scripts/validate-sensitive-content.py @@ -146,7 +146,7 @@ class Detector: skip_line=lambda l: bool(EMAIL_PLACEHOLDER.search(l))), Detector("absolute_user_path", USER_PATH, _user_path_allowed, "use ${HOME}, ~, ${USER}, or placeholders."), - Detector("private_key_header", PRIVATE_KEY, lambda _v: False, + Detector("private_key_header", PRIVATE_KEY, lambda _: False, "private keys belong in keychain/SOPS/Doppler, never a file."), Detector("aws_account_id", AWS_ACCT, _aws_allowed, "use 123456789012 (AWS sample) or ${AWS_ACCOUNT_ID}.", From 7bc3435880872b9f18e49202497c8dc242f165ab Mon Sep 17 00:00:00 2001 From: JacobPEvans <20714140+JacobPEvans@users.noreply.github.com> Date: Sun, 24 May 2026 16:32:59 -0400 Subject: [PATCH 5/5] refactor(content-guards): flip real_domain to small popular-TLD allowlist Replace the 86-entry file-extension skip set with a focused 31-entry set of popular real TLDs (com, net, org, io, ai, dev, app, co, cloud, gov, edu, mil, info, biz, me, tv, fm, ly, us, uk, de, jp, ca, au, fr, cn, eu, tech, xyz, online, sh). Only candidates whose TLD is in this set are even considered for flagging; everything else (filenames, version strings, anything ending in an unfamiliar suffix) is allowed by default. Lower false-positive risk and far easier to audit than enumerating every possible non-TLD suffix. Verified domain logic against 14 representative cases (filename foo.py allowed, real .io/.ai/.dev blocked, allowlist exacts preserved). 
Assisted-by: Claude --- content-guards/README.md | 14 ++++++++----- .../scripts/validate-sensitive-content.py | 21 +++++++++++-------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/content-guards/README.md b/content-guards/README.md index 6a2fe22..ce483c9 100644 --- a/content-guards/README.md +++ b/content-guards/README.md @@ -44,11 +44,15 @@ No manual invocation required. All hooks activate automatically: - **`aws_account_id`** — bare 12-digit numbers on lines mentioning `account_id`, `arn:aws:`, `aws_account_id`, or `:account:`; allows `123456789012` (AWS's documented sample) and repeated-digit shapes. - - **`real_domain`** — FQDN-shaped tokens outside `*.example.*`, - `*.test`, `*.localhost`, `*.invalid`, `*.local`, the project's - short explicit allowlist (`github.com`, `api.github.com`, - `raw.githubusercontent.com`, `docs.jacobpevans.com`, `runs-on.com`, - `healthchecks.io`), and known file-extension shapes. + - **`real_domain`** — only flags tokens whose TLD is in a focused + set of about 30 popular public TLDs (`com`, `net`, `org`, `io`, + `ai`, `dev`, `app`, `co`, `cloud`, `gov`, `edu`, etc. — see + `REAL_TLDS` in the script). Anything outside that set + (filenames like `foo.py`, version strings) is left alone. + Also allows `*.example.*`, `*.test`, `*.localhost`, `*.invalid`, + `*.local`, and the project's short explicit allowlist + (`github.com`, `api.github.com`, `raw.githubusercontent.com`, + `docs.jacobpevans.com`, `runs-on.com`, `healthchecks.io`). 
- **webfetch-guard** — blocks outdated year references in web queries (PreToolUse: WebFetch, WebSearch) - **issue-limiter** — rate limits `gh issue create` and `gh pr create` (PreToolUse: Bash) - **branch-limiter** — limits concurrent open branches (PreToolUse: Bash) diff --git a/content-guards/scripts/validate-sensitive-content.py b/content-guards/scripts/validate-sensitive-content.py index 12c86d2..cc12a75 100755 --- a/content-guards/scripts/validate-sensitive-content.py +++ b/content-guards/scripts/validate-sensitive-content.py @@ -52,14 +52,16 @@ DOMAIN = re.compile( r"\b(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z]{2,24}\b", re.IGNORECASE ) -# Explicit; keeps the false-positive surface auditable. -FILE_EXTENSION_TLDS = frozenset(( - "md mdx rst tex bib py js ts tsx jsx json yaml yml toml lock lockb sh " - "bash zsh fish nix tf hcl go rs rb html css scss svg png jpg jpeg gif " - "pdf txt csv xml zip tar gz bz2 7z mp3 mp4 mov dockerignore gitignore " - "log ini cfg conf env example sample template j2 ipynb sql proto " - "graphql gql vue svelte c h cpp hpp cc cxx java kt swift m mm php pl " - "lua r jl dart elm ex exs erl bats" +# Only flag candidates whose TLD is plausibly a real public TLD. Anything +# outside this set (filenames like foo.py, version strings like v1.2.3) is +# left alone. Keep this list focused on TLDs we'd actually see in this org's +# work; add sparingly. +REAL_TLDS = frozenset(( + "com net org info biz " + "io ai dev app co cloud tech xyz online sh " + "gov edu mil " + "me tv fm ly us " + "uk de jp ca au fr cn eu" ).split()) _DOMAIN_SUFFIX = ( ".example.com", ".example.org", ".example.net", ".example.local", @@ -109,7 +111,8 @@ def _domain_allowed(v: str) -> bool: v = v.lower() if v in _DOMAIN_EXACT or v.endswith(_DOMAIN_SUFFIX): return True - return v.rsplit(".", 1)[-1] in FILE_EXTENSION_TLDS + # TLD not in REAL_TLDS = not a domain we care about (filename, version, etc.) 
+ return v.rsplit(".", 1)[-1] not in REAL_TLDS def _domain_skip(line: str) -> bool: