dryvist · JacobPEvans · May 24, 2026 · May 24, 2026 · May 24, 2026 · May 24, 2026
diff --git a/content-guards/README.md b/content-guards/README.md
@@ -8,6 +8,9 @@ See [ARCHITECTURE.md](ARCHITECTURE.md) for integration diagrams.
 
 - **markdown-validator**: Validates markdown with markdownlint
 - **token-validator**: Enforces configurable file token limits
+- **sensitive-content-guard**: Blocks 7 categories of sensitive literals
+  (IPv4, IPv6, emails, user paths, private keys, AWS account IDs, real
+  domains) in Write/Edit content; first-block, second-allow flow per
+  `(file, detector, value)`
 - **webfetch-guard**: Blocks outdated year references in web queries
 - **readme-validator**: Checks README files for required sections and badge health
 - **issue-limiter**: Prevents GitHub issue backlog overflow with 24h rate limiting
@@ -17,6 +20,39 @@ See [ARCHITECTURE.md](ARCHITECTURE.md) for integration diagrams.
 No manual invocation required. All hooks activate automatically:
 
 - **token-validator** — blocks files exceeding token limits (PreToolUse: Write, Edit)
+- **sensitive-content-guard** — blocks 7 categories of sensitive literals
+  in Write/Edit content (PreToolUse: Write, Edit). First attempt blocks
+  with a per-detector hint; a retry within 5 minutes for the same
+  `(file, detector, value)` is treated as the agent's acknowledgment and
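+  allowed through. State persists in
+  `$XDG_CACHE_HOME/content-guards/sensitive-content-state.json`
+  (falling back to `~/.cache/content-guards/…` when `XDG_CACHE_HOME` is
+  unset); set `SENSITIVE_CONTENT_STATE_FILE` to override the path.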
+
+  Detectors and their allowlist anchors:
+  - **`ipv4`** — IPv4 outside `192.168.0.0/24`, loopback (`127.0.0.0` and
+    `127.0.0.1` only), `0.0.0.0`, broadcast (`255.255.255.x`), link-local
+    metadata (`169.254.169.254`).
+  - **`ipv6`** — IPv6 outside `::`/`::1`, `fe80::*` (link-local),
+    `fc00::/7` (ULA), `2001:db8::*` (RFC 3849 doc prefix), `ff00::*`
+    (multicast).
+  - **`email`** — real email addresses outside `noreply@github.com`,
+    `*@users.noreply.github.com`, `*@example.{com,org,net,local}`,
+    `*@test`, `*@localhost`, and `<placeholder@…>` shapes.
+  - **`absolute_user_path`** — hard-coded `/Users/<name>/` or
+    `/home/<name>/` outside `${USER}`, `$USER`, or `<user>` placeholder
+    shapes.
+  - **`private_key_header`** — PEM private key markers
+    (`-----BEGIN … PRIVATE KEY-----`); always blocked.
+  - **`aws_account_id`** — bare 12-digit numbers on lines mentioning
+    `account_id`, `arn:aws:`, `aws_account_id`, or `:account:`; allows
+    `123456789012` (AWS's documented sample) and repeated-digit shapes.
+  - **`real_domain`** — only flags tokens whose TLD is in a focused
+    set of 31 popular public TLDs (`com`, `net`, `org`, `io`,
+    `ai`, `dev`, `app`, `co`, `cloud`, `gov`, `edu`, etc. — see
+    `REAL_TLDS` in the script). Anything whose TLD is outside that set
+    (filenames like `foo.py`, version strings) is left alone.
+    Also allows `*.example.*`, `*.test`, `*.localhost`, `*.invalid`,
+    `*.local`, and the project's short explicit allowlist
+    (`github.com`, `api.github.com`, `raw.githubusercontent.com`,
+    `docs.jacobpevans.com`, `runs-on.com`, `healthchecks.io`).
 - **webfetch-guard** — blocks outdated year references in web queries (PreToolUse: WebFetch, WebSearch)
 - **issue-limiter** — rate limits `gh issue create` and `gh pr create` (PreToolUse: Bash)
 - **branch-limiter** — limits concurrent open branches (PreToolUse: Bash)

diff --git a/content-guards/hooks/hooks.json b/content-guards/hooks/hooks.json
@@ -8,6 +8,11 @@
             "type": "command",
             "command": "${CLAUDE_PLUGIN_ROOT}/scripts/validate-token-limits.py",
             "timeout": 30
+          },
+          {
+            "type": "command",
+            "command": "${CLAUDE_PLUGIN_ROOT}/scripts/validate-sensitive-content.py",
+            "timeout": 30
           }
         ]
       },

diff --git a/content-guards/scripts/validate-sensitive-content.py b/content-guards/scripts/validate-sensitive-content.py
@@ -0,0 +1,294 @@
+#!/usr/bin/env python3
+"""PreToolUse hook: block sensitive content in Write/Edit. See README."""
+from __future__ import annotations
+
+import json
+import os
+import re
+import sys
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Callable, Optional
+
+# Line-level context patterns. They are evaluated against a whole line and
+# used by the skip_line callbacks (_domain_skip, _ip_skip, DETECTORS) to
+# exempt that line from a detector.
+# `rev:` followed by optional whitespace, an optional "v", and a digit.
+VERSION_PIN = re.compile(r"\brev:\s*v?\d")
+# "sha<digits>", "md5" or "cas" followed by "-", ":" or a space (any case).
+HASH_LINE = re.compile(r"\b(?:sha\d+|md5|cas)[-: ]", re.IGNORECASE)
+# Angle-bracket placeholder such as <name@...>; the part after "@" may be empty.
+EMAIL_PLACEHOLDER = re.compile(r"<[A-Za-z][A-Za-z0-9._-]*@[A-Za-z0-9._-]*>")
+# Line starting with "[label]: " (the shape of a markdown link reference).
+LINK_REF = re.compile(r"^\s*\[[^\]]+\]:\s")
+# Lines whose first token is "repo:" / "image:" followed by whitespace.
+REPO_LINE = re.compile(r"^\s*repo:\s")
+IMAGE_LINE = re.compile(r"^\s*image:\s")
+
+# One dotted-quad octet, 0-255 (leading zeros such as "01" are accepted).
+_OCT = r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
+# Four octets that are not directly preceded or followed by a word character
+# or a dot, so longer dotted runs are not matched piecemeal.
+IPV4 = re.compile(rf"(?<![\w.]){_OCT}(?:\.{_OCT}){{3}}(?![\w.])")
+# Anchored allowlist consulted by _ipv4_allowed: 192.168.0.x, 127.0.0.0/1,
+# 0.0.0.0, 255.255.255.x and 169.254.169.254.
+_IPV4_OK = [re.compile(p) for p in (
+    rf"^192\.168\.0\.{_OCT}$", r"^127\.0\.0\.[01]$", r"^0\.0\.0\.0$",
+    rf"^255\.255\.255\.{_OCT}$", r"^169\.254\.169\.254$",
+)]
+
+# One IPv6 group: 1-4 hex digits.
+_H = r"[0-9A-Fa-f]{1,4}"
+# Alternatives, in order: full 8-group form; "head::tail" with 1-7 leading
+# groups and an optional tail; "::tail" with 1-7 groups; bare "::".
+# The look-arounds reject matches adjacent to a word character or a colon.
+IPV6 = re.compile(
+    rf"(?<![\w:])(?:{_H}(?::{_H}){{7}}"
+    rf"|(?:{_H}:){{1,7}}:(?:{_H}(?::{_H}){{0,6}})?"
+    rf"|:(?::{_H}){{1,7}}|::)(?![\w:])"
+)
+
+# local@domain.tld where the final label is at least two letters.
+EMAIL = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
+# Lower-case suffixes / prefixes that _email_allowed treats as placeholders.
+_EMAIL_SUFFIX = (
+    "@users.noreply.github.com", "@example.com", "@example.org",
+    "@example.net", "@example.local", "@test", "@localhost",
+)
+_EMAIL_PREFIX = ("your-email@", "email@example.", "user@example.")
+
+# "/Users/<name>/" or "/home/<name>/" where <name> starts with a letter, so
+# shapes like "$USER" or "<user>" never match in the first place.
+USER_PATH = re.compile(r"(?:/Users|/home)/[A-Za-z][A-Za-z0-9._-]*/")
+# Value of $USER captured once at import time ("" when unset).
+_REAL_USER = os.environ.get("USER", "")
+
+# PEM "BEGIN ... PRIVATE KEY" header with an optional algorithm qualifier.
+PRIVATE_KEY = re.compile(
+    r"-----BEGIN (?:RSA |DSA |EC |OPENSSH |PGP |ENCRYPTED )?PRIVATE KEY-----"
+)
+
+# Exactly 12 digits, not adjacent to another digit or a dot.
+AWS_ACCT = re.compile(r"(?<![\d.])\d{12}(?![\d.])")
+# Line-level context required before AWS_ACCT is even tried (see DETECTORS).
+AWS_CTX = re.compile(r"account[_ ]?id|arn:aws:|aws_account_id|:account:", re.IGNORECASE)
+
+# One or more dot-terminated labels (1-63 chars, no leading/trailing hyphen)
+# followed by a 2-24 letter final label. Deliberately broad: it also matches
+# file names such as "foo.py"; _domain_allowed filters those out by TLD.
+DOMAIN = re.compile(
+    r"\b(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z]{2,24}\b", re.IGNORECASE
+)
+# Only flag candidates whose TLD is plausibly a real public TLD. Anything
+# outside this set (filenames like foo.py, version strings like v1.2.3) is
+# left alone. Keep this list focused on TLDs we'd actually see in this org's
+# work; add sparingly.
+REAL_TLDS = frozenset((
+    "com net org info biz "
+    "io ai dev app co cloud tech xyz online sh "
+    "gov edu mil "
+    "me tv fm ly us "
+    "uk de jp ca au fr cn eu"
+).split())
+# Lower-case suffixes that mark a candidate as a placeholder / reserved name.
+_DOMAIN_SUFFIX = (
+    ".example.com", ".example.org", ".example.net", ".example.local",
+    ".example", ".test", ".localhost", ".invalid", ".local",
+    ".users.noreply.github.com",
+)
+# Lower-case host names that are always allowed verbatim.
+_DOMAIN_EXACT = frozenset((
+    "example.com", "example.org", "example.net", "example.local",
+    "your-domain.com", "your-domain.example",
+    "github.com", "api.github.com", "raw.githubusercontent.com",
+    "docs.jacobpevans.com", "runs-on.com", "healthchecks.io",
+    "noreply.github.com", "users.noreply.github.com",
+))
+
+
+def _ipv4_allowed(v: str) -> bool:
+    """Return True when *v* matches one of the `_IPV4_OK` allowlist patterns."""
+    for allowed in _IPV4_OK:
+        if allowed.match(v):
+            return True
+    return False
+
+
+def _ipv6_allowed(v: str) -> bool:
+    """Return True when IPv6 literal *v* falls in a non-sensitive range.
+
+    Allowed: unspecified (`::`), loopback (`::1`), `fe80:` link-local,
+    the `2001:db8:` documentation prefix (also spelled `2001:0db8:`),
+    unique-local `fc00::/7`, and multicast `ff00::/8`. Matching is
+    case-insensitive.
+    """
+    v = v.lower()
+    if v in ("::", "::1"):
+        return True
+    # "2001:0db8:" is the zero-padded spelling of the same documentation prefix.
+    if v.startswith(("fe80:", "2001:db8:", "2001:0db8:")):
+        return True
+    # ULA / multicast are decided by the top bits of the FIRST hextet, so it
+    # must be written with all four digits: "fc::1" is 00fc::1, which is
+    # outside fc00::/7 and must not be allowed.
+    return bool(re.match(r"(?:f[cd]|ff)[0-9a-f]{2}:", v))
+
+
+def _email_allowed(v: str) -> bool:
+    """Return True when *v* (compared lower-cased) is a placeholder address.
+
+    Accepts `noreply@github.com`, any address ending in one of
+    `_EMAIL_SUFFIX`, and any address starting with one of `_EMAIL_PREFIX`.
+    """
+    address = v.lower()
+    if address == "noreply@github.com":
+        return True
+    if address.endswith(_EMAIL_SUFFIX):
+        return True
+    return address.startswith(_EMAIL_PREFIX)
+
+
+def _user_path_allowed(v: str) -> bool:
+    """Return True when home-directory path *v* may be written as-is.
+
+    A path is allowed when it contains a `<user>`, `$user` or `${user}`
+    placeholder (case-insensitive), or when it ends in `/<$USER>/` for the
+    user running the hook.
+    """
+    # NOTE(review): the second rule exempts the current user's own home path;
+    # the README does not list that exemption - confirm it is intended.
+    folded = v.lower()
+    if any(marker in folded for marker in ("<user>", "$user", "${user}")):
+        return True
+    if not _REAL_USER:
+        return False
+    return v.endswith(f"/{_REAL_USER}/")
+
+
+def _aws_allowed(v: str) -> bool:
+    """Return True for sample account IDs and single-repeated-digit strings."""
+    if v in ("123456789012", "000000000000"):
+        return True
+    # Exactly one distinct character, e.g. "111111111111".
+    return len(set(v)) == 1
+
+
+def _domain_allowed(v: str) -> bool:
+    """Return True when domain candidate *v* should not be flagged.
+
+    Allowed when the lower-cased value is in `_DOMAIN_EXACT`, ends with one
+    of `_DOMAIN_SUFFIX`, or has a final label that is not in `REAL_TLDS`
+    (file names, version strings and the like).
+    """
+    host = v.lower()
+    if host in _DOMAIN_EXACT:
+        return True
+    if host.endswith(_DOMAIN_SUFFIX):
+        return True
+    _, _, tld = host.rpartition(".")
+    return tld not in REAL_TLDS
+
+
+def _domain_skip(line: str) -> bool:
+    """Return True when *line* is exempt from the domain detector.
+
+    Exempt lines start with `repo:`, `image:` or a `[label]:` link
+    reference, or contain an `<name@...>` e-mail placeholder anywhere.
+    """
+    for anchored in (REPO_LINE, IMAGE_LINE, LINK_REF):
+        if anchored.match(line):
+            return True
+    return EMAIL_PLACEHOLDER.search(line) is not None
+
+
+def _ip_skip(line: str) -> bool:
+    """Return True when *line* matches `VERSION_PIN` or `HASH_LINE`."""
+    if VERSION_PIN.search(line):
+        return True
+    return HASH_LINE.search(line) is not None
+
+
+@dataclass
+class Detector:
+    """One sensitive-content rule: a pattern plus its allow/skip callbacks.
+
+    Field order is part of the interface: the DETECTORS table builds
+    instances with the first four fields passed positionally.
+    """
+
+    # Identifier used in state keys and in the "[name]" prefix of messages.
+    name: str
+    # Regex applied per line with finditer(); each match is one candidate.
+    pattern: re.Pattern
+    # Receives the normalized match; True means "do not report".
+    is_allowed: Callable[[str], bool]
+    # Remediation text appended to the block message when hints are enabled.
+    message_hint: str
+    # Optional: True makes the detector ignore the whole line.
+    skip_line: Optional[Callable[[str], bool]] = None
+    # Optional: the detector only runs on lines for which this returns True.
+    line_context: Optional[Callable[[str], bool]] = None
+    # Applied to the raw match text before is_allowed and de-duplication;
+    # identity by default.
+    normalize: Callable[[str], str] = field(default=lambda v: v)
+
+
+# Detector table, evaluated in this order for every line of content.
+DETECTORS: list[Detector] = [
+    Detector(
+        name="ipv4",
+        pattern=IPV4,
+        is_allowed=_ipv4_allowed,
+        message_hint="use 192.168.0.x sample CIDR or env/secret.",
+        skip_line=lambda line: bool(VERSION_PIN.search(line)),
+    ),
+    Detector(
+        name="ipv6",
+        pattern=IPV6,
+        is_allowed=_ipv6_allowed,
+        message_hint="use 2001:db8:: (RFC 3849 doc prefix), fe80::, ::1, or env var.",
+        skip_line=_ip_skip,
+    ),
+    Detector(
+        name="email",
+        pattern=EMAIL,
+        is_allowed=_email_allowed,
+        message_hint="use `<email>`, *@example.com, or a GitHub no-reply variant.",
+        skip_line=lambda line: bool(EMAIL_PLACEHOLDER.search(line)),
+    ),
+    Detector(
+        name="absolute_user_path",
+        pattern=USER_PATH,
+        is_allowed=_user_path_allowed,
+        message_hint="use ${HOME}, ~, ${USER}, or <user> placeholders.",
+    ),
+    # Private key headers are never allowlisted.
+    Detector(
+        name="private_key_header",
+        pattern=PRIVATE_KEY,
+        is_allowed=lambda _value: False,
+        message_hint="private keys belong in keychain/SOPS/Doppler, never a file.",
+    ),
+    # Only runs on lines that mention an AWS account context.
+    Detector(
+        name="aws_account_id",
+        pattern=AWS_ACCT,
+        is_allowed=_aws_allowed,
+        message_hint="use 123456789012 (AWS sample) or ${AWS_ACCOUNT_ID}.",
+        line_context=lambda line: bool(AWS_CTX.search(line)),
+    ),
+    Detector(
+        name="real_domain",
+        pattern=DOMAIN,
+        is_allowed=_domain_allowed,
+        message_hint="use example.com, *.test, *.localhost, or env var.",
+        skip_line=_domain_skip,
+        normalize=str.lower,
+    ),
+]
+
+# Cache root: $XDG_CACHE_HOME when set and non-empty, otherwise ~/.cache.
+_CACHE = Path(os.environ.get("XDG_CACHE_HOME") or (Path.home() / ".cache"))
+# Seconds during which a recorded warning counts as acknowledged on retry.
+TTL_SECONDS = 300
+# A non-empty SENSITIVE_CONTENT_STATE_FILE replaces the default location.
+_override = os.environ.get("SENSITIVE_CONTENT_STATE_FILE")
+STATE_FILE = (
+    Path(_override)
+    if _override
+    else _CACHE / "content-guards" / "sensitive-content-state.json"
+)
+
+
+def find_violations(content: str) -> list[tuple[Detector, str]]:
+    """Scan *content* line by line and collect disallowed matches.
+
+    Every detector in `DETECTORS` is tried on every line unless its
+    `skip_line` accepts the line or its `line_context` rejects it. Each
+    `(detector name, normalized value)` pair is reported once, in order of
+    first appearance.
+    """
+    violations = []
+    reported = set()
+    for text in content.splitlines():
+        for detector in DETECTORS:
+            if detector.skip_line and detector.skip_line(text):
+                continue
+            if detector.line_context and not detector.line_context(text):
+                continue
+            candidates = (
+                detector.normalize(hit.group(0))
+                for hit in detector.pattern.finditer(text)
+            )
+            for value in candidates:
+                marker = (detector.name, value)
+                if marker in reported or detector.is_allowed(value):
+                    continue
+                reported.add(marker)
+                violations.append((detector, value))
+    return violations
+
+
+def load_state() -> dict[str, float]:
+    """Load the warning timestamps stored in `STATE_FILE`.
+
+    Returns a mapping of state key to timestamp. Any problem reading or
+    decoding the file, or a JSON document that is not an object, yields an
+    empty mapping so the hook falls back to "nothing acknowledged yet".
+    Entries whose value is not a real number are dropped.
+    """
+    try:
+        data = json.loads(STATE_FILE.read_text())
+    except (OSError, ValueError):
+        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+        return {}
+    if not isinstance(data, dict):
+        # e.g. a file containing `[]` or `null`: valid JSON, unusable state.
+        return {}
+    return {
+        key: float(stamp)
+        for key, stamp in data.items()
+        # bool is a subclass of int; `true` is not a timestamp.
+        if isinstance(stamp, (int, float)) and not isinstance(stamp, bool)
+    }
+
+
+def save_state(state: dict[str, float]) -> None:
+    """Persist *state* to `STATE_FILE`, best effort.
+
+    The JSON is written to a sibling `.tmp` file and then moved over the
+    real file with `os.replace`. Any `OSError` is swallowed: failing to
+    record a warning must not break the hook.
+    """
+    try:
+        STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
+        scratch = STATE_FILE.with_suffix(STATE_FILE.suffix + ".tmp")
+        serialized = json.dumps(state)
+        scratch.write_text(serialized)
+        os.replace(scratch, STATE_FILE)
+    except OSError:
+        return
+
+
+def prune(state: dict[str, float], now: float) -> dict[str, float]:
+    """Return a copy of *state* without entries aged `TTL_SECONDS` or more."""
+    fresh = {}
+    for key, stamp in state.items():
+        if now - stamp < TTL_SECONDS:
+            fresh[key] = stamp
+    return fresh
+
+
+def extract_content(tool_name: str, tool_input: dict) -> str:
+    """Return the text a tool call is about to write.
+
+    `Write` supplies it in `content`, `Edit` in `new_string`; any other
+    tool, or a missing/empty field, yields an empty string.
+    """
+    if tool_name == "Write":
+        payload_key = "content"
+    elif tool_name == "Edit":
+        payload_key = "new_string"
+    else:
+        return ""
+    return str(tool_input.get(payload_key) or "")
+
+
+def emit(decision: str, reason: str) -> None:
+    """Print a PreToolUse hook decision as a single JSON line on stdout."""
+    hook_output = {
+        "hookEventName": "PreToolUse",
+        "permissionDecision": decision,
+        "permissionDecisionReason": reason,
+    }
+    print(json.dumps({"hookSpecificOutput": hook_output}))
+
+
+def _group(violations: list[tuple[Detector, str]]) -> list[tuple[Detector, list[str]]]:
+    """Bucket violation values by detector name, keeping first-seen order.
+
+    Each bucket is paired with the first detector object seen under that
+    name; values keep their input order within a bucket.
+    """
+    buckets = {}
+    for detector, value in violations:
+        entry = buckets.get(detector.name)
+        if entry is None:
+            entry = buckets[detector.name] = (detector, [])
+        entry[1].append(value)
+    return list(buckets.values())
+
+
+def _format(groups: list[tuple[Detector, list[str]]], hint: bool) -> str:
+    """Render grouped violations as indented text, one entry per detector.
+
+    With *hint* true, a detector's non-empty `message_hint` is added on a
+    following `->` line.
+    """
+    def render(det, values):
+        text = f"  [{det.name}] {', '.join(values)}"
+        if hint and det.message_hint:
+            text = f"{text}\n    -> {det.message_hint}"
+        return text
+
+    return "\n".join(render(det, values) for det, values in groups)
+
+
+def main() -> int:
+    """Run the hook: read a PreToolUse payload from stdin, decide, report.
+
+    Nothing is printed when the payload is unusable, the tool is not
+    Write/Edit, there is no content, or no violation is found. Otherwise a
+    decision is printed via `emit`:
+
+    * "deny" when at least one violation has no unexpired warning recorded
+      for this file; those violations are recorded before denying.
+    * "allow" when every violation was already warned about within
+      `TTL_SECONDS`.
+
+    Returns 0 on every path; the decision travels only in the JSON output.
+    """
+    try:
+        hook_input = json.load(sys.stdin)
+    except ValueError:  # includes json.JSONDecodeError
+        return 0
+    # Valid JSON that is not an object (`[]`, `null`, ...) carries no tool call.
+    if not isinstance(hook_input, dict):
+        return 0
+    tool_name = hook_input.get("tool_name", "")
+    if tool_name not in ("Write", "Edit"):
+        return 0
+    tool_input = hook_input.get("tool_input")
+    if not isinstance(tool_input, dict):
+        # Missing or malformed input: nothing to inspect.
+        return 0
+    raw_path = str(tool_input.get("file_path") or "")
+    # Resolve symlinks/relative segments so retries hit the same state key.
+    file_path = os.path.realpath(raw_path) if raw_path else ""
+    content = extract_content(tool_name, tool_input)
+    if not content:
+        return 0
+    violations = find_violations(content)
+    if not violations:
+        return 0
+
+    now = time.time()
+    state = prune(load_state(), now)
+    keys = {(d.name, v): f"{file_path}:{d.name}:{v}" for d, v in violations}
+    unwarned = [(d, v) for d, v in violations if keys[(d.name, v)] not in state]
+
+    if not unwarned:
+        # NOTE(review): an explicit "allow" may bypass the host's normal
+        # permission prompt for this tool call - confirm that is intended.
+        emit("allow", (
+            f"WARNING (acknowledged): sensitive content in {tool_name} of "
+            f"{file_path}:\n{_format(_group(violations), hint=False)}\n\n"
+            f"Proceeding because this is a retry within the {TTL_SECONDS // 60}-min "
+            "window. Confirm the file is not committed publicly."
+        ))
+        return 0
+
+    # Record the new warnings first so an identical retry is acknowledged.
+    for det, value in unwarned:
+        state[keys[(det.name, value)]] = now
+    save_state(state)
+    emit("deny", (
+        f"BLOCKED (first attempt): sensitive content in {tool_name} of "
+        f"{file_path}:\n{_format(_group(unwarned), hint=True)}\n\n"
+        "These values look like real artifacts and would leak if committed.\n"
+        f"Retry within {TTL_SECONDS // 60} min to acknowledge and proceed."
+    ))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())