# Source: rag/scripts/encoding_coverage.py (EndogenAI/rag, branch main)
"""scripts/encoding_coverage.py

Checks MANIFESTO.md and AGENTS.md to determine whether each named principle
and axiom has all four [4,1] encoding forms present.

Encoding forms:
    F1 = verbal description   — at least one substantive paragraph in the
                                principle's section body
    F2 = canonical example    — labeled ``**Canonical example**:`` block
    F3 = anti-pattern         — labeled ``**Anti-pattern**`` block
    F4 = programmatic gate    — labeled ``**Programmatic gate**:`` OR an
                                explicit reference to a script/hook/CI mechanism

Purpose:
    Produce a Markdown coverage table as a baseline for tracking encoding
    completeness of every MANIFESTO principle.  Gaps in F2–F4 signal
    principles where knowledge is verbally described but has not been
    concretized into examples, anti-patterns, or enforcement mechanisms.

Inputs:
    --manifesto PATH   Path to MANIFESTO.md (default: MANIFESTO.md)
    --agents PATH      Path to AGENTS.md    (default: AGENTS.md)

Outputs:
    Markdown table written to stdout.  Exits 0 on success, 1 on a missing
    input file.

Exit codes:
    0  Table generated successfully.
    1  One or more input files not found — error written to stderr.

Usage:
    uv run python scripts/encoding_coverage.py --manifesto MANIFESTO.md --agents AGENTS.md
"""

from __future__ import annotations

import argparse
import re
import sys
from dataclasses import dataclass
from pathlib import Path

# ---------------------------------------------------------------------------
# Principle registry
# ---------------------------------------------------------------------------


@dataclass
class Principle:
    """A named MANIFESTO principle or axiom tracked by the coverage table."""

    # Title text looked up (as a substring) in MANIFESTO.md headings and in
    # AGENTS.md mentions; also printed in the "Principle" column.
    name: str
    # Grouping label printed in the "Layer" column, e.g. "Axiom 1" or
    # "Cross-cutting".
    layer: str


#: Registry of every named MANIFESTO.md principle, kept in document order.
#: Each entry pairs the principle title with the layer it belongs to.
PRINCIPLES: list[Principle] = [
    Principle(name=title, layer=group)
    for title, group in (
        ("Endogenous-First", "Axiom 1"),
        ("Algorithms Before Tokens", "Axiom 2"),
        ("Local Compute-First", "Axiom 3"),
        ("Programmatic-First", "Cross-cutting"),
        ("Documentation-First", "Cross-cutting"),
        ("Adopt Over Author", "Cross-cutting"),
        ("Self-Governance & Guardrails", "Cross-cutting"),
        ("Compress Context, Not Content", "Cross-cutting"),
        ("Isolate Invocations, Parallelize Safely", "Cross-cutting"),
        ("Validate & Gate, Always", "Cross-cutting"),
        ("Minimal Posture", "Cross-cutting"),
        ("Testing-First", "Cross-cutting"),
    )
]


# ---------------------------------------------------------------------------
# Detection patterns
# ---------------------------------------------------------------------------

# F1 — a prose line: it must start at column 0 with an ASCII letter, "(", a
#       double quote or an apostrophe, and be at least 40 characters long.
#       The negative lookahead spells out the Markdown markers being excluded
#       (blockquote, table row, heading, list item, code fence); the leading
#       character class additionally rules out indented and blank lines.
_F1_PARA_RE = re.compile(r"^(?![ \t]*[>|#\-\*`])[A-Za-z\(\"'].{39,}$", flags=re.M)

# F2 — the bold "canonical example" label, matched case-insensitively.
_CANONICAL_EXAMPLE_RE = re.compile(r"\*\*canonical example\*\*", flags=re.I)

# F3 — the opening of a bold "anti-pattern" label, matched case-insensitively.
#       The closing "**" is deliberately not required so a parenthesised
#       sub-label may follow inside the bold span.
_ANTI_PATTERN_RE = re.compile(r"\*\*anti-pattern", flags=re.I)

# F4 — the bold "programmatic gate" label, or any of several phrases that
#       point at an enforcement mechanism (script path, git hook, CI step,
#       `uv run` invocation).  Alternatives are tried in the order listed.
_PROGRAMMATIC_GATE_RE = re.compile(
    "|".join(
        (
            r"\*\*programmatic gate\*\*",
            r"scripts/\S+\.py",
            r"pre-commit hook",
            r"pre-push hook",
            r"`uv run pytest",
            r"CI step",
            r"`uv run python",
        )
    ),
    flags=re.I,
)


# ---------------------------------------------------------------------------
# Section extraction
# ---------------------------------------------------------------------------


def extract_h3_section(text: str, principle_name: str) -> str:
    """Return the text that follows the first H3 heading mentioning *principle_name*.

    The heading is located with a case-insensitive substring match on lines
    that begin with ``###``, so titles carrying a numbering prefix
    ("1. Endogenous-First") or a parenthesised suffix
    ("Adopt Over Author (Avoid Reinventing the Wheel)") are still found.

    The returned body starts immediately after the heading line (so it
    normally begins with a newline) and stops just before the next ``##`` or
    ``###`` heading, or at the end of *text* when there is none.  Deeper
    headings (``####`` and below) stay inside the body.

    An empty string is returned when no heading matches.
    """
    title_pattern = rf"^###\s+.*{re.escape(principle_name)}.*$"
    heading = re.search(title_pattern, text, re.MULTILINE | re.IGNORECASE)
    if heading is None:
        return ""

    body_start = heading.end()
    # A compiled pattern is needed here because only Pattern.search accepts a
    # start offset while keeping "^" anchored to real line starts.
    following = re.compile(r"^#{2,3}\s+", re.MULTILINE).search(text, body_start)
    body_end = len(text) if following is None else following.start()
    return text[body_start:body_end]


def _agents_context(agents_text: str, principle_name: str, window: int = 600) -> str:
    """Return the parts of AGENTS.md that surround each mention of *principle_name*.

    Every case-insensitive occurrence of *principle_name* contributes the
    text from *window* characters before it to *window* characters after it
    (clamped to the bounds of *agents_text*).  Windows that overlap or touch
    are merged into one contiguous excerpt, so text around closely spaced
    mentions is not repeated.  The remaining excerpts are joined with
    newlines, in document order.

    There is no overall size cap: the result grows with the number of
    separate mentions.  An empty string is returned when the name does not
    occur at all.  The result is intended as a supplementary F2–F4 source.
    """
    pattern = re.compile(re.escape(principle_name), re.IGNORECASE)
    text_len = len(agents_text)

    # (start, end) slices of agents_text; kept sorted and non-overlapping.
    spans: list[tuple[int, int]] = []
    for mention in pattern.finditer(agents_text):
        start = max(0, mention.start() - window)
        end = min(text_len, mention.end() + window)
        if spans and start <= spans[-1][1]:
            # finditer yields matches left to right, so only the most recent
            # window can overlap this one: extend it instead of duplicating.
            prev_start, prev_end = spans[-1]
            spans[-1] = (prev_start, max(prev_end, end))
        else:
            spans.append((start, end))

    return "\n".join(agents_text[start:end] for start, end in spans)


# ---------------------------------------------------------------------------
# Coverage checks
# ---------------------------------------------------------------------------


def check_coverage(
    section_body: str,
    agents_text: str,
    principle_name: str,
) -> tuple[bool, bool, bool, bool]:
    """Compute the (F1, F2, F3, F4) coverage flags for *principle_name*.

    F1 (verbal description) looks only at *section_body*, the MANIFESTO
    section for the principle; an empty body never satisfies it.

    F2 (canonical example), F3 (anti-pattern) and F4 (programmatic gate) are
    satisfied by a match in *section_body* or, failing that, in the excerpts
    of *agents_text* surrounding mentions of the principle, so that encoding
    spread across both documents is credited.
    """
    agents_ctx = _agents_context(agents_text, principle_name)

    def _found(pattern) -> bool:
        # The MANIFESTO section is consulted first; AGENTS.md is the fallback.
        if pattern.search(section_body):
            return True
        return bool(pattern.search(agents_ctx))

    has_description = bool(section_body) and bool(_F1_PARA_RE.search(section_body))
    return (
        has_description,
        _found(_CANONICAL_EXAMPLE_RE),
        _found(_ANTI_PATTERN_RE),
        _found(_PROGRAMMATIC_GATE_RE),
    )


# ---------------------------------------------------------------------------
# Table rendering
# ---------------------------------------------------------------------------

_TICK = "✓"  # glyph for an encoding form that is present
_CROSS = "✗"  # glyph for an encoding form that is missing


def _cell(flag: bool) -> str:
    """Return the table glyph for *flag*: the tick when truthy, else the cross."""
    if flag:
        return _TICK
    return _CROSS


def build_table(manifesto_text: str, agents_text: str) -> str:
    """Render the F1–F4 coverage report as a Markdown table.

    One row is produced per entry of ``PRINCIPLES``, in registry order.  Each
    row shows the principle name, its layer, one tick/cross cell per encoding
    form and a ``n/4`` score counting the forms that are present.  The lines
    are joined with newlines; there is no trailing newline.
    """
    lines: list[str] = [
        "| Principle | Layer | F1 Desc | F2 Canonical | F3 Anti-pattern | F4 Programmatic | Score |",
        "|-----------|-------|---------|--------------|-----------------|-----------------|-------|",
    ]

    for principle in PRINCIPLES:
        body = extract_h3_section(manifesto_text, principle.name)
        flags = check_coverage(body, agents_text, principle.name)
        marks = " | ".join(_cell(flag) for flag in flags)
        # Booleans sum as 0/1, so the total is the number of forms present.
        lines.append(f"| {principle.name} | {principle.layer} | {marks} | {sum(flags)}/4 |")

    return "\n".join(lines)


# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------


def main(argv: list[str] | None = None) -> int:
    """Parse the command line, read both documents and print the coverage table.

    *argv* is the argument list to parse; ``None`` lets argparse fall back to
    ``sys.argv[1:]``.

    Returns the process exit code: ``0`` after the table has been printed to
    stdout, ``1`` when either input could not be read.  Every failing input
    is reported on stderr before returning, so one run surfaces all of them.
    """
    parser = argparse.ArgumentParser(description="Check MANIFESTO F1-F4 encoding coverage for named principles.")
    parser.add_argument(
        "--manifesto",
        default="MANIFESTO.md",
        help="Path to MANIFESTO.md (default: MANIFESTO.md)",
    )
    parser.add_argument(
        "--agents",
        default="AGENTS.md",
        help="Path to AGENTS.md (default: AGENTS.md)",
    )
    args = parser.parse_args(argv)

    # Read first and handle the failure, rather than testing exists() up
    # front: a path that exists but is a directory or is unreadable would
    # otherwise escape the check and crash with a traceback.
    texts: list[str] = []
    failed = False
    for path in (Path(args.manifesto), Path(args.agents)):
        try:
            texts.append(path.read_text(encoding="utf-8"))
        except (FileNotFoundError, NotADirectoryError):
            print(f"Error: file not found: {path}", file=sys.stderr)
            failed = True
        except OSError as exc:
            print(f"Error: cannot read {path}: {exc}", file=sys.stderr)
            failed = True
    if failed:
        return 1

    manifesto_text, agents_text = texts
    print(build_table(manifesto_text, agents_text))
    return 0


# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())