diff --git a/.agents/skills/protocol-migration/SKILL.md b/.agents/skills/protocol-migration/SKILL.md index 55ad0a5..7b806de 100644 --- a/.agents/skills/protocol-migration/SKILL.md +++ b/.agents/skills/protocol-migration/SKILL.md @@ -1,75 +1,113 @@ --- name: protocol-migration -description: Convert legacy/source.txt into README.md using the repository template, preserving scientific meaning and marking uncertainty with CHECK:. +description: Convert legacy/source.md into README.md using the repository template, using source.txt only as fallback and marking uncertainty with CHECK:. --- Use this skill when migrating a legacy protocol into this repository template. Goal: -Convert `legacy/source.txt` into `README.md`, using the existing `README.md` as the target template and structure. +Convert `legacy/source.md` into `README.md`, using the existing `README.md` as the target template and structure. -Primary sources: -- `legacy/source.txt` is the main source -- also consult the PDF in `legacy/` for tables, layout-dependent content, and anything unclear +## Primary rule +Do not change protocol meaning. +Use `legacy/source.md` as the primary source when rewriting `README.md`. +Use `legacy/source.txt` only as a fallback when `legacy/source.md` looks malformed, incomplete, or unclear. +Use the PDF file in `legacy/` as the final reference source of truth for tables, figures, layout-dependent content, and anything still ambiguous. +If `legacy/source.md` and `legacy/source.txt` disagree, prefer `legacy/source.md` for general structure and prose, but use the original PDF as the final tie-breaker. 
-Core rules: -- Do not change protocol meaning -- Do not invent missing information -- Do not delete source content -- Do not silently summarize, compress, or merge steps -- Preserve exact reagent names, quantities, timings, temperatures, and conditions unless only formatting is being normalized -- Preserve step order unless the source clearly indicates otherwise -- If anything is uncertain, mark it with `CHECK:` instead of guessing +## Migration behavior +When converting legacy protocol content into the repository template: -If content does not fit cleanly: -- place it under `# Migration notes` or `## Unplaced content` +- Preserve all protocol content. +- Preserve all procedural content, warnings, notes, reagent names, quantities, timings, temperatures, and conditions, preserving their location. +- Do not change scientific meaning. +- Do not invent missing information. +- Do not invent missing values or steps. +- Do not delete any content from the source. +- Do not silently summarize, compress, or merge steps. +- Keep exact reagent names, quantities, temperatures, timings, and conditions unless only formatting is being normalized. +- Preserve exact reagent and equipment names unless only formatting is changing. +- Preserve the step order from the source unless the source clearly indicates otherwise. +- Do not delete repeated warnings or notes. +- If any text does not fit cleanly into the template, place it under `# Migration notes` or `## Unplaced content`. +- Mark uncertainty with `CHECK:` instead of guessing. 
-Allowed formatting normalization only when meaning is unchanged: -- add a space between numbers and units -- standardize temperature formatting to `37 °C` -- standardize volumes to `µL`, `mL`, `L` -- standardize concentrations to `mM`, `µM`, `nM`, `% (w/v)` -- standardize time units to `seconds`, `minutes`, `hours` -- standardize pH formatting to `pH 7.4` -- normalize bullets, headings, and markdown tables to match the template -- use tables for reaction mixes and other tabular content -- use HTML subscripts for chemical formulas where needed -- normalize note-like text to blockquote style, e.g. `> **Note**` +## Allowed formatting normalization +You may normalize formatting only when the meaning is unchanged and unambiguous: -Do not: -- infer omitted values -- replace vague wording like `overnight` or `room temperature` with precise values -- reorder steps unless clearly justified by the source -- remove repeated warnings or notes -- replace one reagent with another -- omit unmapped text +- Add a space between numbers and units. +- Standardize temperature formatting to `37 °C`. +- Standardize volume units to `µL`, `mL`, `L`, using the micro sign `µ` consistently. +- Standardize concentration units to `mM`, `µM`, `nM`, `% (w/v)`, etc., using the micro sign `µ` consistently. +- Standardize time units to full words: `seconds`, `minutes`, `hours`. +- Standardize chemical names to match the source but with consistent formatting (e.g. `Tris-HCl` instead of `Tris HCl`). +- Standardize pH formatting to `pH 7.4`. +- Standardize chemical formulas with HTML subscripts, for example H2O to H<sub>2</sub>O. Similarly for other chemical formulas (e.g. MgCl2 to MgCl<sub>2</sub>). +- Do not use Unicode subscript characters such as `₂`. +- Standardize `RNAseq` or `RNA-Seq` to `RNA-seq`. Same for `ChIP-seq`, `ATAC-seq`, etc. +- Normalize bullet formatting and markdown table formatting. +- Normalize heading structure to match the repository template. 
+- For reaction mixes and anything tabular, place them inside a table as in template. +- Normalize markdown headings, bullets, and tables. +- "Note" or "NOTE" or "Optional" or "Recommended" or "Warning" are normalized to start with `>` (example `> **Note**`) and are placed immediately after the step they refer to, or at the end of the protocol if they clearly refer to the whole protocol. +- Remove empty columns from tables. +- Synchronize `Contents` with actual headings in the protocol. -Output requirements: +## Disallowed changes +- Do not infer omitted concentrations, times, temperatures, or volumes. +- Do not infer values for missing quantities. +- Do not try to calculate or infer values that are not explicitly stated. +- Do not convert `overnight`, `RT`, `briefly`, `room temperature` or similar vague language into precise values. +- Do not replace vague language with precise values. +- Do not reorder steps unless the source clearly numbers them in that order. +- Do not remove duplicate-looking content unless it is truly identical and both copies are preserved in review notes. +- Do not rewrite scientific wording for style if that risks changing meaning. +- Do not fill in table cells with values that are missing from the source. +- Do not replace one reagent name with another. +- Do not remove repeated warnings or notes. +- Do not omit unmapped text. + +## Output requirements - edit `README.md` -- use the template headings -- keep the template badge at the top -- remove the template instruction note +- use the template headings exactly +- use the template headings in `README.md` +- keep all source content +- add `CHECK:` markers for uncertainty +- use `CHECK:` only for genuine unresolved uncertainty. 
If no uncertainty remains, do not mention `CHECK:` at all +- add `# Migration notes` +- after drafting, add a short summary in `# Migration notes` covering: + - formatting normalizations performed + - ambiguities and uncertainty flagged + - content placed in `## Unplaced content` - add `# Migration notes` including: - imported protocol metadata from `source-metadata.yml` if present + - imported protocol metadata from `source-metadata.yml` using only the non-blank lines - template metadata from `template-metadata.yml` - ambiguous mappings - - formatting normalizations performed + - normalized formatting changes - content copied verbatim but not confidently placed +- keep the template badge at the top +- keep ![Created with ulelab Protocol Template](https://img.shields.io/badge/created%20with-ulelab%20Protocol%20Template-blue) at the top of the file +- remove the template instruction note +- delete the "Template repository: Click `Use this template` to create a new protocol repo..." note +## Verification After drafting, verify the migration against the source: -- compare the migrated `README.md` against `legacy/source.txt` -- - compare the migrated `README.md` against the PDF in `legacy/` +- compare the migrated `README.md` against `legacy/source.md` +- compare any malformed, incomplete, or ambiguous passages against `legacy/source.txt` +- compare the migrated `README.md` against the PDF in `legacy/` for tables, figures, layout-dependent content, and any remaining ambiguity - check that all protocol steps, notes, warnings, reagent names, quantities, temperatures, timings, and conditions are still present - check that no source content has been silently omitted, merged, or reordered without justification - check any tables, layout-dependent content, or ambiguous sections against the PDF in `legacy/` - leave `CHECK:` anywhere the mapping is uncertain rather than guessing Verification checklist: -- `README.md` still matches the scientific content of `legacy/source.txt` 
+- `README.md` still matches the scientific content of `legacy/source.md` +- any malformed, incomplete, or ambiguous passages were cross-checked against `legacy/source.txt` - no protocol steps or warnings were omitted - no values were invented or made more precise than in the source - tables and layout-dependent content were checked against the PDF in `legacy/` - any uncertain mappings are marked with `CHECK:` - any meaningful normalization choices are noted in `# Migration notes` -Prefer preserving meaning over making the output prettier. \ No newline at end of file +Prefer preserving meaning over making the output prettier. diff --git a/.claude/.claude/skills/protocol-migration/SKILL.md b/.claude/.claude/skills/protocol-migration/SKILL.md index 55ad0a5..7b806de 100644 --- a/.claude/.claude/skills/protocol-migration/SKILL.md +++ b/.claude/.claude/skills/protocol-migration/SKILL.md @@ -1,75 +1,113 @@ --- name: protocol-migration -description: Convert legacy/source.txt into README.md using the repository template, preserving scientific meaning and marking uncertainty with CHECK:. +description: Convert legacy/source.md into README.md using the repository template, using source.txt only as fallback and marking uncertainty with CHECK:. --- Use this skill when migrating a legacy protocol into this repository template. Goal: -Convert `legacy/source.txt` into `README.md`, using the existing `README.md` as the target template and structure. +Convert `legacy/source.md` into `README.md`, using the existing `README.md` as the target template and structure. -Primary sources: -- `legacy/source.txt` is the main source -- also consult the PDF in `legacy/` for tables, layout-dependent content, and anything unclear +## Primary rule +Do not change protocol meaning. +Use `legacy/source.md` as the primary source when rewriting `README.md`. +Use `legacy/source.txt` only as a fallback when `legacy/source.md` looks malformed, incomplete, or unclear. 
+Use the PDF file in `legacy/` as the final reference source of truth for tables, figures, layout-dependent content, and anything still ambiguous. +If `legacy/source.md` and `legacy/source.txt` disagree, prefer `legacy/source.md` for general structure and prose, but use the original PDF as the final tie-breaker. -Core rules: -- Do not change protocol meaning -- Do not invent missing information -- Do not delete source content -- Do not silently summarize, compress, or merge steps -- Preserve exact reagent names, quantities, timings, temperatures, and conditions unless only formatting is being normalized -- Preserve step order unless the source clearly indicates otherwise -- If anything is uncertain, mark it with `CHECK:` instead of guessing +## Migration behavior +When converting legacy protocol content into the repository template: -If content does not fit cleanly: -- place it under `# Migration notes` or `## Unplaced content` +- Preserve all protocol content. +- Preserve all procedural content, warnings, notes, reagent names, quantities, timings, temperatures, and conditions, preserving their location. +- Do not change scientific meaning. +- Do not invent missing information. +- Do not invent missing values or steps. +- Do not delete any content from the source. +- Do not silently summarize, compress, or merge steps. +- Keep exact reagent names, quantities, temperatures, timings, and conditions unless only formatting is being normalized. +- Preserve exact reagent and equipment names unless only formatting is changing. +- Preserve the step order from the source unless the source clearly indicates otherwise. +- Do not delete repeated warnings or notes. +- If any text does not fit cleanly into the template, place it under `# Migration notes` or `## Unplaced content`. +- Mark uncertainty with `CHECK:` instead of guessing. 
-Allowed formatting normalization only when meaning is unchanged: -- add a space between numbers and units -- standardize temperature formatting to `37 °C` -- standardize volumes to `µL`, `mL`, `L` -- standardize concentrations to `mM`, `µM`, `nM`, `% (w/v)` -- standardize time units to `seconds`, `minutes`, `hours` -- standardize pH formatting to `pH 7.4` -- normalize bullets, headings, and markdown tables to match the template -- use tables for reaction mixes and other tabular content -- use HTML subscripts for chemical formulas where needed -- normalize note-like text to blockquote style, e.g. `> **Note**` +## Allowed formatting normalization +You may normalize formatting only when the meaning is unchanged and unambiguous: -Do not: -- infer omitted values -- replace vague wording like `overnight` or `room temperature` with precise values -- reorder steps unless clearly justified by the source -- remove repeated warnings or notes -- replace one reagent with another -- omit unmapped text +- Add a space between numbers and units. +- Standardize temperature formatting to `37 °C`. +- Standardize volume units to `µL`, `mL`, `L`, using the micro sign `µ` consistently. +- Standardize concentration units to `mM`, `µM`, `nM`, `% (w/v)`, etc., using the micro sign `µ` consistently. +- Standardize time units to full words: `seconds`, `minutes`, `hours`. +- Standardize chemical names to match the source but with consistent formatting (e.g. `Tris-HCl` instead of `Tris HCl`). +- Standardize pH formatting to `pH 7.4`. +- Standardize chemical formulas with HTML subscripts, for example H2O to H<sub>2</sub>O. Similarly for other chemical formulas (e.g. MgCl2 to MgCl<sub>2</sub>). +- Do not use Unicode subscript characters such as `₂`. +- Standardize `RNAseq` or `RNA-Seq` to `RNA-seq`. Same for `ChIP-seq`, `ATAC-seq`, etc. +- Normalize bullet formatting and markdown table formatting. +- Normalize heading structure to match the repository template. 
+- For reaction mixes and anything tabular, place them inside a table as in template. +- Normalize markdown headings, bullets, and tables. +- "Note" or "NOTE" or "Optional" or "Recommended" or "Warning" are normalized to start with `>` (example `> **Note**`) and are placed immediately after the step they refer to, or at the end of the protocol if they clearly refer to the whole protocol. +- Remove empty columns from tables. +- Synchronize `Contents` with actual headings in the protocol. -Output requirements: +## Disallowed changes +- Do not infer omitted concentrations, times, temperatures, or volumes. +- Do not infer values for missing quantities. +- Do not try to calculate or infer values that are not explicitly stated. +- Do not convert `overnight`, `RT`, `briefly`, `room temperature` or similar vague language into precise values. +- Do not replace vague language with precise values. +- Do not reorder steps unless the source clearly numbers them in that order. +- Do not remove duplicate-looking content unless it is truly identical and both copies are preserved in review notes. +- Do not rewrite scientific wording for style if that risks changing meaning. +- Do not fill in table cells with values that are missing from the source. +- Do not replace one reagent name with another. +- Do not remove repeated warnings or notes. +- Do not omit unmapped text. + +## Output requirements - edit `README.md` -- use the template headings -- keep the template badge at the top -- remove the template instruction note +- use the template headings exactly +- use the template headings in `README.md` +- keep all source content +- add `CHECK:` markers for uncertainty +- use `CHECK:` only for genuine unresolved uncertainty. 
If no uncertainty remains, do not mention `CHECK:` at all +- add `# Migration notes` +- after drafting, add a short summary in `# Migration notes` covering: + - formatting normalizations performed + - ambiguities and uncertainty flagged + - content placed in `## Unplaced content` - add `# Migration notes` including: - imported protocol metadata from `source-metadata.yml` if present + - imported protocol metadata from `source-metadata.yml` using only the non-blank lines - template metadata from `template-metadata.yml` - ambiguous mappings - - formatting normalizations performed + - normalized formatting changes - content copied verbatim but not confidently placed +- keep the template badge at the top +- keep ![Created with ulelab Protocol Template](https://img.shields.io/badge/created%20with-ulelab%20Protocol%20Template-blue) at the top of the file +- remove the template instruction note +- delete the "Template repository: Click `Use this template` to create a new protocol repo..." note +## Verification After drafting, verify the migration against the source: -- compare the migrated `README.md` against `legacy/source.txt` -- - compare the migrated `README.md` against the PDF in `legacy/` +- compare the migrated `README.md` against `legacy/source.md` +- compare any malformed, incomplete, or ambiguous passages against `legacy/source.txt` +- compare the migrated `README.md` against the PDF in `legacy/` for tables, figures, layout-dependent content, and any remaining ambiguity - check that all protocol steps, notes, warnings, reagent names, quantities, temperatures, timings, and conditions are still present - check that no source content has been silently omitted, merged, or reordered without justification - check any tables, layout-dependent content, or ambiguous sections against the PDF in `legacy/` - leave `CHECK:` anywhere the mapping is uncertain rather than guessing Verification checklist: -- `README.md` still matches the scientific content of `legacy/source.txt` 
+- `README.md` still matches the scientific content of `legacy/source.md` +- any malformed, incomplete, or ambiguous passages were cross-checked against `legacy/source.txt` - no protocol steps or warnings were omitted - no values were invented or made more precise than in the source - tables and layout-dependent content were checked against the PDF in `legacy/` - any uncertain mappings are marked with `CHECK:` - any meaningful normalization choices are noted in `# Migration notes` -Prefer preserving meaning over making the output prettier. \ No newline at end of file +Prefer preserving meaning over making the output prettier. diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 4026df8..f312e0e 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -4,18 +4,26 @@ This repository stores laboratory protocols in Markdown in `README.md`. ## Primary rule Do not change protocol meaning. -Use `legacy/source.txt` as the primary source when rewriting `README.md`. -Also consult the PDF file in `legacy/` as the reference source for tables, layout-dependent content, and anything unclear. +Use `legacy/source.md` as the primary source when rewriting `README.md`. +Use `legacy/source.txt` only as a fallback when `legacy/source.md` looks malformed, incomplete, or unclear. +Use the PDF file in `legacy/` as the final reference source of truth for tables, figures, layout-dependent content, and anything still ambiguous. +If `legacy/source.md` and `legacy/source.txt` disagree, prefer `legacy/source.md` for general structure and prose, but use the original PDF as the final tie-breaker. ## Migration behavior -When converting legacy protocol text into the repository template: +When converting legacy protocol content into the repository template: +- Preserve all protocol content. - Preserve all procedural content, warnings, notes, reagent names, quantities, timings, temperatures, and conditions, preserving their location. 
+- Do not change scientific meaning. +- Do not invent missing information. - Do not invent missing values or steps. - Do not delete any content from the source. - Do not silently summarize, compress, or merge steps. +- Keep exact reagent names, quantities, temperatures, timings, and conditions unless only formatting is being normalized. - If text does not map cleanly into the template, place it under `# Migration notes` or `## Unplaced content`. - If any interpretation is uncertain, mark it with `CHECK:` rather than guessing. +- Do not delete repeated warnings or notes. +- Preserve the step order from the source unless the source clearly indicates otherwise. - Preserve exact reagent and equipment names unless only formatting is changing. ## Allowed formatting normalization @@ -25,15 +33,15 @@ You may normalize formatting only when the meaning is unchanged and unambiguous: - Standardize volume units to `µL`, `mL`, `L`, using the micro sign `µ` consistently. - Standardize concentration units to `mM`, `µM`, `nM`, `% (w/v)`, etc., using the micro sign `µ` consistently. - Standardize time units to full words: `seconds`, `minutes`, `hours`. -- Standarize chemical names to match the source but with consistent formatting (e.g. `Tris-HCl` instead of `Tris HCl`). +- Standardize chemical names to match the source but with consistent formatting (e.g. `Tris-HCl` instead of `Tris HCl`). - Standardize pH formatting to `pH 7.4`. - Standardize chemical formulas with HTML subscripts, for example H2O to H<sub>2</sub>O. Similarly for other chemical formulas (e.g. MgCl2 to MgCl<sub>2</sub>). - Do not use Unicode subscript characters such as `₂`. -- Standardize `RNAseq` or `RNA-Seq` to `RNA-seq`.Same for `ChIP-seq`, `ATAC-seq`, etc. +- Standardize `RNAseq` or `RNA-Seq` to `RNA-seq`. Same for `ChIP-seq`, `ATAC-seq`, etc. - Normalize bullet formatting and markdown table formatting. - Normalize heading structure to match the repository template. 
- For reaction mixes and anything tabular, place them inside a table as in template. -- Normalize markdown headings, bullets, and tables +- Normalize markdown headings, bullets, and tables. - "Note" or "NOTE" or "Optional" or "Recommended" or "Warning" are normalized to start with `>` (example `> **Note**`) and are placed immediately after the step they refer to, or at the end of the protocol if they clearly refer to the whole protocol. - Remove empty columns from tables. - Synchronize `Contents` with actual headings in the protocol. @@ -55,17 +63,42 @@ You may normalize formatting only when the meaning is unchanged and unambiguous: ## Output requirements When drafting a migrated protocol: - Use the template headings exactly. -- Use the template headings in `README.md` +- Use the template headings in `README.md`. - Keep all source content. - Add `CHECK:` markers for uncertainty. - Use `CHECK:` only for genuine unresolved uncertainty. If no uncertainty remains, do not mention `CHECK:` at all. - Add an `# Migration notes` section listing: + - formatting normalizations performed + - ambiguities and uncertainty flagged + - content placed in `## Unplaced content` - Imported protocol metadata from `source-metadata.yml` (only the non-blank lines). - - template metadata from `template-metadata.yml` - - ambiguous mappings - - normalized formatting changes - - content copied verbatim but not confidently placed + - Imported protocol metadata from `source-metadata.yml` if present. + - template metadata from `template-metadata.yml`. + - ambiguous mappings. + - normalized formatting changes. + - content copied verbatim but not confidently placed. - Keep ![Created with ulelab Protocol Template](https://img.shields.io/badge/created%20with-ulelab%20Protocol%20Template-blue) at the top of the file. - Delete the "Template repository: Click `Use this template` to create a new protocol repo..." note. +- Remove the template instruction note. 
+## Verification +After drafting, verify the migration against the source: +- compare the migrated `README.md` against `legacy/source.md` +- compare any malformed, incomplete, or ambiguous passages against `legacy/source.txt` +- compare the migrated `README.md` against the PDF in `legacy/` for tables, figures, layout-dependent content, and any remaining ambiguity +- check that all protocol steps, notes, warnings, reagent names, quantities, temperatures, timings, and conditions are still present +- check that no source content has been silently omitted, merged, or reordered without justification +- check any tables, layout-dependent content, or ambiguous sections against the PDF in `legacy/` +- leave `CHECK:` anywhere the mapping is uncertain rather than guessing + +Verification checklist: +- `README.md` still matches the scientific content of `legacy/source.md` +- any malformed, incomplete, or ambiguous passages were cross-checked against `legacy/source.txt` +- no protocol steps or warnings were omitted +- no values were invented or made more precise than in the source +- tables and layout-dependent content were checked against the PDF in `legacy/` +- any uncertain mappings are marked with `CHECK:` +- any meaningful normalization choices are noted in `# Migration notes` + +Prefer preserving meaning over making the output prettier. 
diff --git a/.github/workflows/pdf-to-markdown.yml b/.github/workflows/pdf-to-markdown.yml new file mode 100644 index 0000000..cf04edb --- /dev/null +++ b/.github/workflows/pdf-to-markdown.yml @@ -0,0 +1,101 @@ +name: pdf-to-markdown + +on: + push: + branches-ignore: + - main + paths: + - legacy/*.pdf + - legacy/*.PDF + workflow_dispatch: + +permissions: + contents: write + +jobs: + convert: + runs-on: ubuntu-latest + env: + LEGACY_DIR: legacy + + steps: + - name: Check out repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ github.ref }} + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Check whether conversion should run + id: pdf_gate + run: | + set -euo pipefail + + if [ ! -d "$LEGACY_DIR" ]; then + echo "should_convert=false" >> "$GITHUB_OUTPUT" + echo "Conversion skipped: $LEGACY_DIR/ does not exist." + exit 0 + fi + + mapfile -t pdf_files < <(find "$LEGACY_DIR" -maxdepth 1 -type f \( -iname '*.pdf' \) | sort) + pdf_count="${#pdf_files[@]}" + + if [ "$pdf_count" -eq 0 ]; then + echo "should_convert=false" >> "$GITHUB_OUTPUT" + echo "Conversion skipped: no PDF files found in $LEGACY_DIR/." + exit 0 + fi + + if [ "$pdf_count" -gt 1 ]; then + printf 'Found %s PDF files in %s/:\n' "$pdf_count" "$LEGACY_DIR" + printf ' - %s\n' "${pdf_files[@]}" + echo "Expected exactly one PDF file in $LEGACY_DIR/." 
+ exit 1 + fi + + echo "should_convert=true" >> "$GITHUB_OUTPUT" + echo "pdf_path=${pdf_files[0]}" >> "$GITHUB_OUTPUT" + echo "Using PDF: ${pdf_files[0]}" + + - name: Install fast-mode dependencies + if: steps.pdf_gate.outputs.should_convert == 'true' + run: | + python -m pip install --upgrade pip + pip install pymupdf pymupdf4llm + + - name: Convert PDF to legacy/source.md + if: steps.pdf_gate.outputs.should_convert == 'true' + run: | + set -euo pipefail + python scripts/pdf_to_md/pdf_to_md.py "$LEGACY_DIR" "$LEGACY_DIR/source.md" --no-progress + + - name: Show generated files + if: steps.pdf_gate.outputs.should_convert == 'true' + run: | + find "$LEGACY_DIR" -maxdepth 3 \( -name "source.md" -o -path "$LEGACY_DIR/images/*" \) | sort + + - name: Commit and push generated files + if: steps.pdf_gate.outputs.should_convert == 'true' + run: | + set -euo pipefail + + branch_name="${GITHUB_REF_NAME}" + + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + + git add -A "$LEGACY_DIR" + + if git diff --cached --quiet; then + echo "No changes to commit" + exit 0 + fi + + git commit -m "Prepare migration: extract markdown from legacy PDF" + git fetch origin "$branch_name" + git rebase "origin/$branch_name" + git push origin "HEAD:$branch_name" diff --git a/.github/workflows/prepare_migration.yml b/.github/workflows/pdf-to-text.yml similarity index 67% rename from .github/workflows/prepare_migration.yml rename to .github/workflows/pdf-to-text.yml index 54c0ae6..71ec8d9 100644 --- a/.github/workflows/prepare_migration.yml +++ b/.github/workflows/pdf-to-text.yml @@ -1,4 +1,4 @@ -name: Prepare migration from PDF +name: pdf-to-text on: workflow_dispatch: @@ -18,6 +18,9 @@ jobs: steps: - name: Check out repo uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ github.ref }} - name: Set up Python uses: actions/setup-python@v5 @@ -40,8 +43,18 @@ jobs: - name: Commit extracted text run: | + set -euo 
pipefail + + branch_name="${GITHUB_REF_NAME}" + git config user.name "github-actions[bot]" git config user.email "41898282+github-actions[bot]@users.noreply.github.com" git add legacy/source.txt - git diff --cached --quiet || git commit -m "Prepare migration: extract text from legacy PDF" - git push + if git diff --cached --quiet; then + echo "No changes to commit" + exit 0 + fi + git commit -m "Prepare migration: extract text from legacy PDF" + git fetch origin "$branch_name" + git rebase "origin/$branch_name" + git push origin "HEAD:$branch_name" diff --git a/.github/workflows/readme_to_pdf.yml b/.github/workflows/readme-to-pdf.yml similarity index 81% rename from .github/workflows/readme_to_pdf.yml rename to .github/workflows/readme-to-pdf.yml index 97bacb4..964112f 100644 --- a/.github/workflows/readme_to_pdf.yml +++ b/.github/workflows/readme-to-pdf.yml @@ -1,4 +1,4 @@ -name: Build README PDF +name: README-to-pdf on: push: @@ -42,14 +42,37 @@ jobs: set -euo pipefail cp README.md README._pdf.md - - name: Trim README preamble for PDF + - name: Drop leading YAML front matter for PDF run: | set -euo pipefail awk ' - found || /^# / { - found = 1 + BEGIN { in_front_matter = 0 } + NR == 1 && /^---[[:space:]]*$/ { + in_front_matter = 1 + next + } + in_front_matter && (/^---[[:space:]]*$/ || /^\.\.\.[[:space:]]*$/) { + in_front_matter = 0 + next + } + in_front_matter { next } + { print } + ' README._pdf.md > README._pdf.tmp && mv README._pdf.tmp README._pdf.md + + - name: Remove top template preamble for PDF + run: | + set -euo pipefail + awk ' + BEGIN { started = 0 } + !started { + if (/^\[!\[/) next + if (/^> Template repository:/) next + if (/^[[:space:]]*$/) next + started = 1 print + next } + { print } ' README._pdf.md > README._pdf.tmp && mv README._pdf.tmp README._pdf.md - name: Exclude migration notes from PDF @@ -107,6 +130,7 @@ jobs: pandoc README._pdf.md \ -o "${repo_name}.pdf" \ + --from=markdown-yaml_metadata_block \ --pdf-engine=xelatex \ -V 
colorlinks=true \ -V linkcolor=blue \ diff --git a/.github/workflows/validate_protocol.yml b/.github/workflows/validate-protocol.yml similarity index 98% rename from .github/workflows/validate_protocol.yml rename to .github/workflows/validate-protocol.yml index 8af38c8..bd4ea95 100644 --- a/.github/workflows/validate_protocol.yml +++ b/.github/workflows/validate-protocol.yml @@ -1,4 +1,4 @@ -name: Validate README protocol +name: validate-protocol-README on: pull_request: diff --git a/THIRD_PARTY_NOTICES.md b/THIRD_PARTY_NOTICES.md new file mode 100644 index 0000000..acde86c --- /dev/null +++ b/THIRD_PARTY_NOTICES.md @@ -0,0 +1,22 @@ +# Third-Party Notices + +This repository is primarily licensed under GPL-3.0, but it also contains +specific files adapted from third-party code under different terms. + +## `pdf_to_md` scripts + +Files: +- `scripts/pdf_to_md/pdf_to_md.py` +- `scripts/pdf_to_md/extractor.py` + +These files include code adapted from: +- Repository: `aliceisjustplaying/claude-skill-pdf-to-markdown` +- Source: <https://github.com/aliceisjustplaying/claude-skill-pdf-to-markdown> + +Upstream licensing note: +- The upstream repository README states `MIT` as the license. +- Source checked: <https://github.com/aliceisjustplaying/claude-skill-pdf-to-markdown> + +Attribution note: +- This repository records the upstream source and preserves attribution for the + adapted files listed above. \ No newline at end of file diff --git a/docs/PROMPT.md b/docs/PROMPT.md index 03f22a6..b991a6d 100644 --- a/docs/PROMPT.md +++ b/docs/PROMPT.md @@ -1,30 +1,109 @@ -Convert `legacy/source.txt` into `README.md`. +Convert `legacy/source.md` into `README.md`. Use the existing `README.md` as the target template and structure. Also read and follow `.github/copilot-instructions.md`. Apply those instructions even if you are not GitHub Copilot. -Use `legacy/source.txt` as the primary source. -Also check the PDF file in the `legacy/` folder as the reference source, especially for tables, layout-dependent content, and anything unclear. - -Requirements: -1. Preserve all protocol content. -2. 
Do not change scientific meaning. -3. Do not invent missing information. -4. Keep exact reagent names, quantities, temperatures, timings, and conditions unless only formatting is being normalized. -5. Normalize only safe formatting, such as: - - adding a space between numbers and units - - using `seconds`, `minutes`, `hours` - - using `µL`, `mL`, `L` - - using `37 °C` style temperature formatting -6. Preserve the step order from the source. -7. Do not delete repeated warnings or notes. -8. If any text does not fit cleanly into the template, place it under `# Migration notes` or `## Unplaced content`. -9. Mark uncertainty with `CHECK:` instead of guessing. -10. After drafting, add a short summary in `# Migration notes` covering: - - formatting normalizations performed - - ambiguities and uncertainty flagged - - content placed in `## Unplaced content` - -Only edit `README.md`. +## Primary rule +Do not change protocol meaning. +Use `legacy/source.md` as the primary source when rewriting `README.md`. +Use `legacy/source.txt` only as a fallback when `legacy/source.md` looks malformed, incomplete, or unclear. +Use the PDF file in `legacy/` as the final reference source of truth for tables, figures, layout-dependent content, and anything still ambiguous after checking the generated text sources. +If `legacy/source.md` and `legacy/source.txt` disagree, prefer `legacy/source.md` for general structure and prose, but use the original PDF as the final tie-breaker. + +## Migration behavior +When converting legacy protocol content into the repository template: + +- Preserve all protocol content. +- Preserve all procedural content, warnings, notes, reagent names, quantities, timings, temperatures, and conditions, preserving their location. +- Do not change scientific meaning. +- Do not invent missing information. +- Do not invent missing values or steps. +- Do not delete any content from the source. +- Do not silently summarize, compress, or merge steps. 
+- Keep exact reagent names, quantities, temperatures, timings, and conditions unless only formatting is being normalized. +- Preserve exact reagent and equipment names unless only formatting is changing. +- Preserve the step order from the source unless the source clearly indicates otherwise. +- Do not delete repeated warnings or notes. +- If any text does not fit cleanly into the template, place it under `# Migration notes` or `## Unplaced content`. +- Mark uncertainty with `CHECK:` instead of guessing. + +## Allowed formatting normalization +Normalize formatting only when the meaning is unchanged and unambiguous: + +- add a space between numbers and units +- use `seconds`, `minutes`, `hours` +- use `µL`, `mL`, `L` +- use `37 °C` style temperature formatting +- standardize concentration units to `mM`, `µM`, `nM`, `% (w/v)`, etc., using the micro sign `µ` consistently +- standardize pH formatting to `pH 7.4` +- standardize chemical names to match the source but with consistent formatting, for example `Tris-HCl` instead of `Tris HCl` +- standardize chemical formulas with HTML subscripts, for example `H2O` to `H<sub>2</sub>O` and `MgCl2` to `MgCl<sub>2</sub>` +- do not use Unicode subscript characters such as `₂` +- standardize `RNAseq` or `RNA-Seq` to `RNA-seq`, and similarly for `ChIP-seq`, `ATAC-seq`, and related names +- normalize bullets, headings, and markdown tables to match the repository template +- use tables for reaction mixes and other tabular content +- normalize note-like text to blockquote style, for example `> **Note**` +- place note-like text immediately after the step it refers to, or at the end of the protocol if it clearly refers to the whole protocol +- remove empty columns from tables +- synchronize `Contents` with the actual headings in the protocol + +## Disallowed changes +- do not infer omitted concentrations, times, temperatures, or volumes +- do not infer values for missing quantities +- do not try to calculate or infer values that are not explicitly stated +- do not 
convert `overnight`, `RT`, `briefly`, `room temperature`, or similar vague language into precise values +- do not replace vague language with precise values +- do not reorder steps unless the source clearly numbers them in that order +- do not remove duplicate-looking content unless it is truly identical and both copies are preserved in review notes +- do not rewrite scientific wording for style if that risks changing meaning +- do not fill in table cells with values that are missing from the source +- do not replace one reagent name with another +- do not remove repeated warnings or notes +- do not omit unmapped text + +## Output requirements +- Only edit `README.md`. +- Use the template headings exactly. +- Use the template headings in `README.md`. +- Keep all source content. +- Add `CHECK:` markers for uncertainty. +- Use `CHECK:` only for genuine unresolved uncertainty. If no uncertainty remains, do not mention `CHECK:` at all. +- Add an `# Migration notes` section. +- After drafting, add a short summary in `# Migration notes` covering: + - formatting normalizations performed + - ambiguities and uncertainty flagged + - content placed in `## Unplaced content` +- Include the following in `# Migration notes`: + - imported protocol metadata from `source-metadata.yml` if present + - imported protocol metadata from `source-metadata.yml` using only the non-blank lines + - template metadata from `template-metadata.yml` + - ambiguous mappings + - normalized formatting changes + - content copied verbatim but not confidently placed +- Keep ![Created with ulelab Protocol Template](https://img.shields.io/badge/created%20with-ulelab%20Protocol%20Template-blue) at the top of the file. +- Remove the template instruction note. +- Delete the "Template repository: Click `Use this template` to create a new protocol repo..." note. 
+ +## Verification +After drafting, verify the migration against the source: + +- compare the migrated `README.md` against `legacy/source.md` +- compare any malformed, incomplete, or ambiguous passages against `legacy/source.txt` +- compare the migrated `README.md` against the PDF in `legacy/` for tables, figures, layout-dependent content, and any remaining ambiguity +- check that all protocol steps, notes, warnings, reagent names, quantities, temperatures, timings, and conditions are still present +- check that no source content has been silently omitted, merged, or reordered without justification +- check any tables, layout-dependent content, or ambiguous sections against the PDF in `legacy/` +- leave `CHECK:` anywhere the mapping is uncertain rather than guessing + +Verification checklist: +- `README.md` still matches the scientific content of `legacy/source.md` +- any malformed, incomplete, or ambiguous passages were cross-checked against `legacy/source.txt` +- no protocol steps or warnings were omitted +- no values were invented or made more precise than in the source +- tables and layout-dependent content were checked against the PDF in `legacy/` +- any uncertain mappings are marked with `CHECK:` +- any meaningful normalization choices are noted in `# Migration notes` + +Prefer preserving meaning over making the output prettier. diff --git a/docs/USING_THIS_TEMPLATE.md b/docs/USING_THIS_TEMPLATE.md index 833102a..0f81379 100644 --- a/docs/USING_THIS_TEMPLATE.md +++ b/docs/USING_THIS_TEMPLATE.md @@ -101,11 +101,11 @@ This route can save time. It helps keep the template structure consistent, norma > **Recommended**: Also fill in the `source-metadata.yml`, even if not fully. Helps track source protocol provenance. 4. Keep exactly one PDF in the `legacy` folder, otherwise the process will fail. -5. Once you push a PDF change in the `legacy` folder to a non-`main` branch, the `Prepare migration from PDF` GitHub Action will run. 
This extracts the PDF text and writes `legacy/source.txt`. Check that this file was created before the next step. +5. Once you push a PDF change in the `legacy` folder to a non-`main` branch, the migration GitHub Actions will run. `pdf-to-text` writes `legacy/source.txt`, and `pdf-to-markdown` writes `legacy/source.md`. Check that these files were created before the next step. 6. Clone the repo locally, and switch to `import-protocol` branch. If you already have a local clone, run `git pull` to get the latest changes locally. > **Note**: Alternatively, you can complete steps 6-15 in GitHub Codespaces. On GitHub.com select the branch you want to work on, click **Code**, go to **Codespaces** tab and click **Create codespace on import-protocol**. This will open VS Code in a new browser tab, with all files loaded automatically. Note that this uses GitHub-hosted compute, and free usage is limited. 7. Open the repo folder in a code editor and use GitHub Copilot or another LLM assistant. We recommend [VS Code](https://code.visualstudio.com/). -8. Use the `protocol-migration` skill (or if you prefer, paste the prompt in `docs/PROMPT.md`) to ask GitHub Copilot or another LLM to rewrite `README.md`. The model will also follow the repository instructions in [`.github/copilot-instructions.md`](.github/copilot-instructions.md). This will edit the `README.md` file in-place, using `legacy/source.txt` and the legacy PDF as sources. +8. Use the `protocol-migration` skill (or if you prefer, paste the prompt in `docs/PROMPT.md`) to ask GitHub Copilot or another LLM to rewrite `README.md`. The model will also follow the repository instructions in [`.github/copilot-instructions.md`](.github/copilot-instructions.md). This will edit the `README.md` file in-place, using `legacy/source.md` as the primary source, `legacy/source.txt` as a fallback when needed, and the legacy PDF as the final tie-breaker for tables, figures, and unclear layout-dependent content. 
> **Note**: Use the best model you have access to. We tested capability with the Copilot Free Usage plan, and it works reasonably well, but advanced models will likely work even better. **In VS Code**: diff --git a/protocol-template.pdf b/protocol-template.pdf index 808d2c0..dc56efa 100644 Binary files a/protocol-template.pdf and b/protocol-template.pdf differ diff --git a/scripts/pdf_to_md/extractor.py b/scripts/pdf_to_md/extractor.py new file mode 100755 index 0000000..1be2722 --- /dev/null +++ b/scripts/pdf_to_md/extractor.py @@ -0,0 +1,297 @@ +# Adapted in part from: +# https://github.com/aliceisjustplaying/claude-skill-pdf-to-markdown +# Original upstream license: MIT +# See THIRD_PARTY_NOTICES.md for attribution details. + +""" +PDF extraction with multiple backends: +- Fast mode: PyMuPDF with multi-strategy table detection (good for simple tables) +- Accurate mode: IBM Docling with TableFormer AI (better for complex/borderless tables) +""" + +import os +import sys +from pathlib import Path + +# Suppress PyMuPDF's "Consider using pymupdf_layout" recommendation +# This prints to stdout and pollutes --stdout output +os.environ.setdefault("PYMUPDF_SUGGEST_LAYOUT_ANALYZER", "0") + +# Version for cache invalidation - increment when extraction logic changes +# Format: major.minor.patch +# 3.1.0: Page separators now use instead of ----- +# Image extraction includes nested XObjects (full=True) +# 3.2.0: Fast mode now includes image references in markdown (write_images=True) +# Cache keys now include no_images flag to avoid contamination +# 3.3.0: Image paths in cached markdown now use relative 'images/' prefix +# (fixes broken temp directory references in cached output) +EXTRACTOR_VERSION = "3.3.0" + + +def check_docling_models(): + """Check if Docling models are downloaded.""" + try: + from huggingface_hub import scan_cache_dir + + cache_info = scan_cache_dir() + # Check for docling models in HF cache + docling_repos = [r for r in cache_info.repos if "docling" in 
def extract_pdf_fast(
    pdf_path: str, image_dir: str = None, show_progress: bool = False
) -> str:
    """
    Convert a PDF to markdown with pymupdf4llm (the "fast" backend).

    Tables are detected with the 'text' strategy, which copes better with
    borderless, whitespace-aligned tables in mixed documents.

    Args:
        pdf_path: Path to the PDF file
        image_dir: Directory that receives extracted images; None turns
            image writing off
        show_progress: When True, announce the extraction on stderr and
            forward the flag to pymupdf4llm

    Returns:
        Markdown text for the whole document, with image references when
        image_dir was given
    """
    import pymupdf4llm

    want_images = image_dir is not None

    if show_progress:
        print("Extracting with PyMuPDF (fast mode)...", file=sys.stderr)

    raw_markdown = pymupdf4llm.to_markdown(
        pdf_path,
        show_progress=show_progress,
        table_strategy="text",
        write_images=want_images,
        image_path=image_dir,
    )

    # pymupdf4llm separates pages with a "-----" line; rewrite that line so
    # literal "-----" inside a document (horizontal rules, ASCII tables) is
    # not mistaken for a page boundary later on.
    # NOTE(review): the replacement below is only blank lines, whereas the
    # module's version notes describe an explicit page-separator token.
    # Confirm the intended sentinel against the code that splits pages.
    return raw_markdown.replace("\n-----\n", "\n\n")
def extract_pdf_docling(
    pdf_path: str,
    output_dir: str = None,
    images_scale: float = 4.0,
    show_progress: bool = False,
) -> tuple:
    """
    Extract a PDF with Docling: TableFormer tables plus optional images.

    Args:
        pdf_path: Path to the PDF file
        output_dir: Directory to save extracted images (None = skip images)
        images_scale: Image resolution multiplier (default: 4.0 for high-res)
        show_progress: Whether to print progress messages to stderr

    Returns:
        tuple: (markdown: str, image_paths: list[str]). image_paths is empty
        when output_dir is None.
    """
    from docling.datamodel.base_models import ConversionStatus, InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling_core.types.doc.base import ImageRefMode

    # Marker Docling writes for each picture in PLACEHOLDER mode. It is passed
    # to the exporter explicitly so the substitution below always searches for
    # exactly the text that was emitted.
    image_placeholder = "<!-- image -->"

    # Check if this is first run (models need downloading)
    if not check_docling_models():
        print(
            "First run: downloading Docling AI models (one-time setup, ~2-3 minutes)...",
            file=sys.stderr,
        )

    if show_progress:
        print(
            "Processing PDF with Docling (accurate mode, ~1 sec/page)...",
            file=sys.stderr,
        )

    # Configure pipeline for accurate tables + image extraction
    pipeline_options = PdfPipelineOptions(
        do_table_structure=True,
        generate_picture_images=output_dir is not None,
        images_scale=images_scale,
    )
    pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    result = converter.convert(pdf_path)

    # Conversion problems are reported but do not abort: a partial result is
    # still exported below.
    if hasattr(result, "errors") and result.errors:
        for error in result.errors:
            print(f"WARNING: Docling conversion error: {error}", file=sys.stderr)

    if hasattr(result, "status") and result.status != ConversionStatus.SUCCESS:
        print(
            f"WARNING: Docling conversion status: {result.status.name}",
            file=sys.stderr,
        )

    # Save images to output directory (order matters for placeholder replacement)
    image_paths = []
    if output_dir:
        image_paths = _save_docling_images(result, Path(output_dir))
        if show_progress and image_paths:
            print(
                f"Extracted {len(image_paths)} images at {images_scale}x resolution",
                file=sys.stderr,
            )

    md = result.document.export_to_markdown(
        image_mode=ImageRefMode.PLACEHOLDER,
        image_placeholder=image_placeholder,
    )

    # Replace placeholders one at a time, in document order, with the saved
    # files. Searching for the placeholder text (not the empty string) is
    # essential: str.replace("", x, 1) would just prepend x to the document.
    # NOTE(review): assumes every placeholder has a saved image; a picture
    # without image data would shift later references by one. Confirm.
    for img_path in image_paths:
        md = md.replace(
            image_placeholder, f"![Figure](images/{Path(img_path).name})", 1
        )

    return md, image_paths
def get_page_count(pdf_path: str) -> int:
    """Return the number of pages in the PDF at *pdf_path*.

    Uses pymupdf rather than Docling because only the page count is needed.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        Number of pages in the document
    """
    import pymupdf

    # The context manager closes the document even if reading the page count
    # raises, so the file handle is never leaked.
    with pymupdf.open(pdf_path) as doc:
        return len(doc)
f"WARNING: Failed to extract image {img_index} on page {page_num + 1}: {e}", + file=sys.stderr, + ) + continue + + doc.close() + + if show_progress and extracted: + print(f"Extracted {len(extracted)} unique images", file=sys.stderr) + + return extracted diff --git a/scripts/pdf_to_md/pdf_to_md.py b/scripts/pdf_to_md/pdf_to_md.py new file mode 100755 index 0000000..bf58069 --- /dev/null +++ b/scripts/pdf_to_md/pdf_to_md.py @@ -0,0 +1,867 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +# Adapted in part from: +# https://github.com/aliceisjustplaying/claude-skill-pdf-to-markdown +# Original upstream license: MIT +# See THIRD_PARTY_NOTICES.md for attribution details. + +""" +PDF to Markdown Converter for LLM Context + +Extracts entire PDF content as clean, structured markdown. +Images are extracted to cache directory and copied to output location. + +Features: +- High-accuracy table extraction using IBM Docling (TableFormer AI model) +- Aggressive persistent caching (extracts once, reuses forever) +- Cache only cleared on explicit request or source file change + +Usage: + python pdf_to_md.py [output.md] + python pdf_to_md.py --docling # Accurate tables (slower) + python pdf_to_md.py --clear-cache # Re-extract + python pdf_to_md.py --clear-all-cache # Clear entire cache + +Dependencies: + uv pip install pymupdf pymupdf4llm # Fast mode + uv pip install docling docling-core # Docling mode (optional) +""" + +import argparse +import sys +import os +import re +import json +import hashlib +import shutil +import tempfile +from dataclasses import dataclass +from pathlib import Path +from datetime import datetime + + +# ============================================================================= +# DATACLASSES +# ============================================================================= + + +@dataclass +class ExtractionConfig: + """Configuration for PDF extraction.""" + + pdf_path: str + docling: bool = False + images_scale: float = 4.0 + + +@dataclass 
def resolve_input_pdf(input_arg: str) -> Path:
    """Resolve either a direct file path or a folder containing exactly one PDF.

    Args:
        input_arg: Path to a file, or to a directory to search
            (non-recursively) for ``.pdf`` files, matched case-insensitively.

    Returns:
        ``input_arg`` as a Path when it is an existing file (its suffix is
        not checked), otherwise the single PDF found in the directory.

    Raises:
        FileNotFoundError: The path does not exist, or the directory holds
            no PDF.
        ValueError: The directory holds more than one PDF.
    """
    input_path = Path(input_arg)

    if input_path.is_file():
        return input_path

    if input_path.is_dir():
        # Sorted so the "multiple files" message lists names deterministically.
        pdf_files = sorted(
            p for p in input_path.iterdir() if p.is_file() and p.suffix.lower() == ".pdf"
        )

        if not pdf_files:
            raise FileNotFoundError(
                f"No PDF files found in folder: {input_path}. Expected exactly one .pdf file."
            )

        if len(pdf_files) > 1:
            names = ", ".join(p.name for p in pdf_files)
            raise ValueError(
                f"Multiple PDF files found in folder: {input_path}. "
                f"Expected exactly one .pdf file, found {len(pdf_files)}: {names}"
            )

        return pdf_files[0]

    raise FileNotFoundError(f"Input path does not exist: {input_path}")
): + return False, cache_key + + if metadata.get("extractor_version") != EXTRACTOR_VERSION: + return False, cache_key + + return True, cache_key + except (json.JSONDecodeError, KeyError, OSError): + return False, cache_key + + def load(self, cache_key: str) -> ExtractionResult | None: + """Load markdown from cache.""" + cache_dir = self._get_dir(cache_key) + + try: + full_md = (cache_dir / "full_output.md").read_text(encoding="utf-8") + with open(cache_dir / "metadata.json") as f: + metadata = json.load(f) + total_pages = metadata.get("total_pages", 0) + except (FileNotFoundError, IOError, json.JSONDecodeError, OSError) as e: + print( + f"WARNING: Cache corrupted ({e.__class__.__name__}), regenerating...", + file=sys.stderr, + ) + try: + if cache_dir.exists(): + shutil.rmtree(cache_dir) + except OSError: + pass + return None + + # Check if markdown references images + has_image_refs = bool(re.search(r"!\[[^\]]*\]\([^)]+\)", full_md)) + + # Get cached images directory + cached_image_dir = cache_dir / "images" + has_images = cached_image_dir.exists() and any(cached_image_dir.iterdir()) + + # If markdown expects images but they're missing, invalidate cache + if has_image_refs and not has_images: + print( + "WARNING: Cache missing images, regenerating...", + file=sys.stderr, + ) + try: + shutil.rmtree(cache_dir) + except OSError: + pass + return None + + image_dir = cached_image_dir if has_images else None + + return ExtractionResult( + markdown=full_md, + image_dir=image_dir, + total_pages=total_pages, + from_cache=True, + ) + + def _normalize_image_paths(self, markdown: str, source_image_dir: Path) -> str: + """Normalize image paths in markdown to use relative 'images/' prefix.""" + if not source_image_dir: + return markdown + + source_image_dir = Path(source_image_dir) + + def normalize_ref(match): + alt_text = match.group(1) + filename_raw = match.group(2) + filename = Path(filename_raw).name + if (source_image_dir / filename).exists(): + return 
f"![{alt_text}](images/{filename})" + return match.group(0) + + pattern = r"!\[([^\]]*)\]\(([^)]+)\)" + return re.sub(pattern, normalize_ref, markdown) + + def save(self, cache_key: str, result: ExtractionResult, config: ExtractionConfig): + """Save full extraction to cache using atomic writes.""" + from extractor import EXTRACTOR_VERSION + + cache_dir = self._get_dir(cache_key) + cache_dir.mkdir(parents=True, exist_ok=True) + + markdown = result.markdown + if result.image_dir: + markdown = self._normalize_image_paths(markdown, result.image_dir) + + p = Path(config.pdf_path).resolve() + stat = p.stat() + mode = f"docling_{config.images_scale}" if config.docling else "fast" + + metadata = { + "source_path": str(p), + "source_mtime": stat.st_mtime, + "source_size": stat.st_size, + "cache_key": cache_key, + "cached_at": datetime.now().isoformat(), + "total_pages": result.total_pages, + "extractor_version": EXTRACTOR_VERSION, + "mode": mode, + "images_scale": config.images_scale if config.docling else None, + } + + temp_md = None + temp_json = None + try: + with tempfile.NamedTemporaryFile( + mode="w", + dir=cache_dir, + suffix=".md.tmp", + delete=False, + encoding="utf-8", + ) as f: + f.write(markdown) + temp_md = f.name + + with tempfile.NamedTemporaryFile( + mode="w", dir=cache_dir, suffix=".json.tmp", delete=False + ) as f: + json.dump(metadata, f, indent=2) + temp_json = f.name + + os.replace(temp_md, cache_dir / "full_output.md") + temp_md = None + os.replace(temp_json, cache_dir / "metadata.json") + temp_json = None + + if result.image_dir and Path(result.image_dir).exists(): + temp_images = cache_dir / "images.tmp" + final_images = cache_dir / "images" + + if temp_images.exists(): + shutil.rmtree(temp_images) + + shutil.copytree(result.image_dir, temp_images) + + if final_images.exists(): + shutil.rmtree(final_images) + os.rename(temp_images, final_images) + + finally: + if temp_md and os.path.exists(temp_md): + os.unlink(temp_md) + if temp_json and 
os.path.exists(temp_json): + os.unlink(temp_json) + + def clear(self, pdf_path: str = None) -> bool: + """Clear cache for specific PDF (both fast and docling modes) or entire cache.""" + if pdf_path: + # Clear BOTH fast and docling caches for this PDF + cleared = False + for docling_mode in [False, True]: + try: + config = ExtractionConfig(pdf_path=pdf_path, docling=docling_mode) + cache_key = self.get_key(config) + cache_dir = self._get_dir(cache_key) + if cache_dir.exists(): + shutil.rmtree(cache_dir) + cleared = True + except (FileNotFoundError, OSError): + pass + return cleared + else: + if self.cache_dir.exists(): + shutil.rmtree(self.cache_dir) + return True + return False + + def get_stats(self) -> dict: + """Get statistics about the cache.""" + if not self.cache_dir.exists(): + return {"entries": 0, "total_size_mb": 0, "cache_dir": str(self.cache_dir)} + + entries = 0 + total_size = 0 + + for entry in self.cache_dir.iterdir(): + if entry.is_dir(): + entries += 1 + for f in entry.rglob("*"): + if f.is_file(): + total_size += f.stat().st_size + + return { + "entries": entries, + "total_size_mb": round(total_size / (1024 * 1024), 2), + "cache_dir": str(self.cache_dir), + } + + +# ============================================================================= +# IMAGE MANAGER +# ============================================================================= + + +class ImageManager: + """Manages image extraction and cleanup.""" + + def __init__(self): + self._temp_dirs: list[Path] = [] + + def create_temp_dir(self, pdf_path: str) -> Path: + """Create tracked temp directory for image extraction.""" + pdf_name = Path(pdf_path).stem + safe_name = re.sub(r"[^\w\-_]", "_", pdf_name) + temp_dir = Path(tempfile.mkdtemp(prefix=f"pdf_images_{safe_name}_")) + self._temp_dirs.append(temp_dir) + return temp_dir + + def cleanup(self): + """Clean up all tracked temp directories.""" + for temp_dir in self._temp_dirs: + if temp_dir.exists(): + shutil.rmtree(temp_dir) + 
self._temp_dirs.clear() + + def extract_references(self, markdown: str) -> set: + """Extract the set of image filenames referenced in markdown.""" + pattern = r"!\[[^\]]*\]\(([^)]+)\)" + matches = re.findall(pattern, markdown) + return {Path(m).name for m in matches} + + def get_info(self, image_dir: Path, referenced_only: set = None) -> list: + """Get information about extracted images.""" + if not image_dir or not Path(image_dir).exists(): + return [] + + image_dir = Path(image_dir) + images = [] + + for img_path in sorted(image_dir.glob("*")): + if img_path.suffix.lower() in (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp"): + if referenced_only is not None and img_path.name not in referenced_only: + continue + + try: + size_bytes = img_path.stat().st_size + size_kb = size_bytes / 1024 + + try: + import pymupdf + pix = pymupdf.Pixmap(str(img_path)) + dimensions = f"{pix.width}x{pix.height}" + pix = None + except Exception: + dimensions = "unknown" + + images.append({ + "filename": img_path.name, + "path": str(img_path), + "size_kb": round(size_kb, 1), + "dimensions": dimensions, + }) + except Exception: + pass + + return images + + def enhance_markdown(self, markdown: str, image_dir: Path) -> str: + """Rewrite image references to use relative paths (portable, Windows-safe).""" + if not image_dir: + return markdown + + image_dir = Path(image_dir) + + def replace_image_ref(match): + alt_text = match.group(1) + filename_raw = match.group(2) + filename = Path(filename_raw).name + full_path = image_dir / filename + + # Use relative path for portability (POSIX format for Windows compatibility) + relative_path = Path("images") / filename + + if full_path.exists(): + try: + size_kb = round(full_path.stat().st_size / 1024, 1) + try: + import pymupdf + pix = pymupdf.Pixmap(str(full_path)) + dims = f"{pix.width}x{pix.height}" + pix = None + except Exception: + dims = "?" 
+ + return f"![{alt_text}]({relative_path.as_posix()})\n\n**[Image: {filename} ({dims}, {size_kb}KB)]**" + except Exception: + return f"![{alt_text}]({relative_path.as_posix()})\n\n**[Image: {filename}]**" + + return match.group(0) + + pattern = r"!\[([^\]]*)\]\(([^)]+)\)" + return re.sub(pattern, replace_image_ref, markdown) + + def create_summary(self, images: list) -> str: + """Create a summary section listing all extracted images.""" + if not images: + return "" + + lines = [ + "", + "---", + "", + "## Extracted Images", + "", + "| # | File | Dimensions | Size |", + "|---|------|------------|------|", + ] + + for i, img in enumerate(images, 1): + lines.append( + f"| {i} | {img['filename']} | {img['dimensions']} | {img['size_kb']}KB |" + ) + + lines.append("") + return "\n".join(lines) + + def finalize_images( + self, temp_dir: Path, cache_dir: Path, output_path: Path, show_progress: bool = False + ) -> Path | None: + """Finalize image directory after extraction. + + Copies images from cache to output location (next to the markdown file). + Cleans up temp directories. + + Returns the final image directory (next to output) for reference. 
+ """ + if not temp_dir: + return None + + temp_dir = Path(temp_dir) + output_path = Path(output_path) + output_images_dir = ( + output_path.parent / "images" if output_path.suffix else output_path / "images" + ) + + # Clean up empty temp directories + if not temp_dir.exists() or not any(temp_dir.iterdir()): + if temp_dir.exists(): + shutil.rmtree(temp_dir) + if temp_dir in self._temp_dirs: + self._temp_dirs.remove(temp_dir) + if output_images_dir.exists(): + shutil.rmtree(output_images_dir) + return None + + # Clean up temp directory (images are saved to cache) + if temp_dir.exists(): + shutil.rmtree(temp_dir) + if temp_dir in self._temp_dirs: + self._temp_dirs.remove(temp_dir) + + # Copy images from cache to output location + if cache_dir: + cached_image_dir = cache_dir / "images" + if cached_image_dir.exists() and any(cached_image_dir.iterdir()): + return self._copy_images_to_output(cached_image_dir, output_path, show_progress) + + if output_images_dir.exists(): + shutil.rmtree(output_images_dir) + + return None + + def _copy_images_to_output( + self, source_dir: Path, output_path: Path, show_progress: bool = False + ) -> Path | None: + """Copy images from cache to output location (next to markdown file).""" + output_path = Path(output_path) + + # Determine output images directory (sibling to markdown file) + if output_path.suffix: # It's a file path like "output.md" + output_images_dir = output_path.parent / "images" + else: # It's a directory + output_images_dir = output_path / "images" + + # Don't copy if already at output location + if output_images_dir.resolve() == Path(source_dir).resolve(): + return output_images_dir + + # Replace previously generated content so stale images do not linger. 
+ if output_images_dir.exists(): + shutil.rmtree(output_images_dir) + + output_images_dir.mkdir(parents=True, exist_ok=True) + copied_count = 0 + for img in source_dir.iterdir(): + if img.is_file(): + shutil.copy2(img, output_images_dir / img.name) + copied_count += 1 + + if show_progress and copied_count > 0: + print(f"Copied {copied_count} images to: {output_images_dir}", file=sys.stderr) + + return output_images_dir + + +# ============================================================================= +# PDF PROCESSING +# ============================================================================= + + +def check_dependencies(docling_mode: bool = False): + """Check if required packages are installed.""" + missing = [] + + try: + import pymupdf + except ImportError: + missing.append("pymupdf") + + if docling_mode: + try: + import docling + except ImportError: + missing.append("docling") + + try: + import docling_core + except ImportError: + missing.append("docling-core") + + install_cmd = "uv pip install pymupdf docling docling-core" + else: + try: + import pymupdf4llm + except ImportError: + missing.append("pymupdf4llm") + + install_cmd = "uv pip install pymupdf pymupdf4llm" + + if missing: + print(f"ERROR: Missing dependencies: {', '.join(missing)}", file=sys.stderr) + print(f"Install with: {install_cmd}", file=sys.stderr) + return False + + return True + + +def convert_pdf(pdf_path, image_dir, show_progress=False, docling=False, images_scale=4.0): + """Convert PDF to markdown.""" + if docling: + from extractor import extract_pdf_docling + + markdown, _image_paths = extract_pdf_docling( + pdf_path, + output_dir=image_dir, + images_scale=images_scale, + show_progress=show_progress, + ) + return markdown + else: + from extractor import extract_pdf_fast + + markdown = extract_pdf_fast( + pdf_path, + image_dir=image_dir, + show_progress=show_progress, + ) + return markdown + + +def add_metadata_header(markdown, pdf_path, total_pages, image_dir=None, cached=False): + 
"""Add metadata header to markdown output.""" + filename = os.path.basename(pdf_path) + + header_lines = [ + "---", + f"source: {filename}", + f"total_pages: {total_pages}", + f"extracted_at: {datetime.now().isoformat()}", + ] + + if cached: + header_lines.append("from_cache: true") + + if image_dir: + # Use relative path for portability + header_lines.append("images_dir: images") + + header_lines.extend(["---", "", ""]) + + return "\n".join(header_lines) + markdown + + +# ============================================================================= +# MAIN +# ============================================================================= + + +def main(): + parser = argparse.ArgumentParser( + description="Convert PDF to Markdown for LLM context (with persistent caching)", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python pdf_to_md.py document.pdf # Output to document.md (cached) + python pdf_to_md.py document.pdf output.md # Custom output path + python pdf_to_md.py document.pdf --docling # Accurate tables (slower) + python pdf_to_md.py document.pdf --clear-cache # Clear cache and re-extract + python pdf_to_md.py --clear-all-cache # Clear entire cache + +Caching: + PDFs are cached in ~/.cache/pdf-to-markdown/ + Cache is keyed by file content hash + extraction mode. + Cache persists until explicitly cleared or source PDF changes. 
+ """, + ) + + parser.add_argument( + "input", nargs="?", help="Input PDF file path or folder containing one PDF" + ) + parser.add_argument( + "output", nargs="?", help="Output markdown file path (default: .md)" + ) + parser.add_argument( + "--docling", + "--accurate", + action="store_true", + dest="docling", + help="Use Docling AI for complex/borderless tables (slower, ~1 sec/page)", + ) + parser.add_argument("--no-progress", action="store_true", help="Disable progress indicator") + + # Cache options + parser.add_argument( + "--clear-cache", + action="store_true", + help="Clear cache for this PDF before processing", + ) + parser.add_argument( + "--clear-all-cache", + action="store_true", + help="Clear entire cache directory and exit", + ) + parser.add_argument("--cache-stats", action="store_true", help="Show cache statistics and exit") + + args = parser.parse_args() + + cache_mgr = CacheManager() + + # Handle cache management commands + if args.clear_all_cache: + if cache_mgr.clear(): + print(f"Cache cleared: {cache_mgr.cache_dir}", file=sys.stderr) + else: + print("Cache was already empty.", file=sys.stderr) + sys.exit(0) + + if args.cache_stats: + stats = cache_mgr.get_stats() + print(f"Cache directory: {stats['cache_dir']}", file=sys.stderr) + print(f"Cached PDFs: {stats['entries']}", file=sys.stderr) + print(f"Total size: {stats['total_size_mb']} MB", file=sys.stderr) + sys.exit(0) + + # Require input for all other operations + if not args.input: + parser.error("the following arguments are required: input") + + # Handle --clear-cache + if args.clear_cache: + try: + resolved_input = resolve_input_pdf(args.input) + except (FileNotFoundError, ValueError) as exc: + print(f"ERROR: {exc}", file=sys.stderr) + sys.exit(1) + + if cache_mgr.clear(str(resolved_input)): + print(f"Cache cleared for: {resolved_input}", file=sys.stderr) + else: + print(f"No cache found for: {resolved_input}", file=sys.stderr) + + try: + input_pdf = resolve_input_pdf(args.input) + except 
(FileNotFoundError, ValueError) as exc: + print(f"ERROR: {exc}", file=sys.stderr) + sys.exit(1) + + if input_pdf.suffix.lower() != ".pdf": + print(f"WARNING: File may not be a PDF: {input_pdf}", file=sys.stderr) + + show_progress = sys.stderr.isatty() and not args.no_progress + + # Check cache + config = ExtractionConfig(pdf_path=str(input_pdf), docling=args.docling) + valid, cache_key = cache_mgr.is_valid(config) + + result = None + image_dir = None + cache_hit = False + + if valid: + if show_progress: + mode = "docling" if args.docling else "fast" + print(f"Loading from cache ({mode} mode)...", file=sys.stderr) + + cache_result = cache_mgr.load(cache_key) + if cache_result: + result = cache_result.markdown + total_pages = cache_result.total_pages + cache_hit = True + + # Copy images from cache to output location + if cache_result.image_dir: + output_path = args.output or str(input_pdf.with_suffix(".md")) + img_mgr = ImageManager() + image_dir = img_mgr._copy_images_to_output( + cache_result.image_dir, output_path, show_progress + ) + + # Extract if no cache hit + if not cache_hit: + if not check_dependencies(docling_mode=args.docling): + sys.exit(1) + + from extractor import get_page_count + + total_pages = get_page_count(str(input_pdf)) + + if not cache_key: + cache_key = cache_mgr.get_key(config) + + img_mgr = ImageManager() + temp_image_dir = img_mgr.create_temp_dir(str(input_pdf)) + + try: + if show_progress: + if args.docling: + print( + f"Extracting {total_pages} pages with Docling AI (~1 sec/page)...", + file=sys.stderr, + ) + else: + print( + f"Extracting {total_pages} pages with PyMuPDF (fast mode)...", + file=sys.stderr, + ) + + result = convert_pdf( + str(input_pdf), + image_dir=temp_image_dir, + show_progress=show_progress, + docling=args.docling, + ) + except Exception as e: + img_mgr.cleanup() + print(f"ERROR: Conversion failed: {e}", file=sys.stderr) + sys.exit(1) + + # Save to cache + extraction_result = ExtractionResult( + markdown=result, + 
image_dir=temp_image_dir, + total_pages=total_pages, + ) + cache_mgr.save(cache_key, extraction_result, config) + if show_progress: + print(f"Cached: {cache_mgr._get_dir(cache_key)}", file=sys.stderr) + + # Finalize images + output_path = args.output or str(input_pdf.with_suffix(".md")) + image_dir = img_mgr.finalize_images( + temp_dir=temp_image_dir, + cache_dir=cache_mgr._get_dir(cache_key), + output_path=output_path, + show_progress=show_progress, + ) + + # Format output + output = result + img_mgr_for_output = ImageManager() # Fresh instance for output processing + + referenced_images = img_mgr_for_output.extract_references(result) if result else set() + + if image_dir: + output = img_mgr_for_output.enhance_markdown(output, image_dir) + images = img_mgr_for_output.get_info(image_dir, referenced_only=referenced_images) + if images: + output += img_mgr_for_output.create_summary(images) + + output = add_metadata_header( + output, str(input_pdf), total_pages, image_dir, cached=cache_hit + ) + + # Write output + output_path = args.output or str(input_pdf.with_suffix(".md")) + with open(output_path, "w", encoding="utf-8") as f: + f.write(output) + + msg = f"Converted {total_pages} pages to: {output_path}" + if cache_hit: + msg += " (from cache)" + if image_dir: + images = img_mgr_for_output.get_info(image_dir, referenced_only=referenced_images) + if images: + msg += f" ({len(images)} images)" + print(msg, file=sys.stderr) + + +if __name__ == "__main__": + main()