Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
160 changes: 113 additions & 47 deletions pkg/parser/pdf.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,15 +152,17 @@

reader, err := pdflib.NewReader(bytes.NewReader(docBytes), int64(len(docBytes)))
if err != nil {
// ledongthuc/pdf can fail on PDFs pdftable accepts (e.g. some
// xref-stream variants). Outline access is optional, so a
// failure here is not fatal — we just skip the outline path.
// Log at debug level and carry on with the heuristic flow.
slog.Debug("pdf: outline backend unavailable", "err", err)
reader = nil
// ledongthuc/pdf failed on a PDF pdftable accepted (some
// xref-stream variants are pdftable-only). Without ledongthuc
// we lose both outline access AND the primitive layer the
// row extractor uses — so we bail with a clear message rather
// than emit garbled text from pdftable.Words() (its word
// grouping concatenates words on standard-14 fonts without
// AFM metrics; see v0.4.x followup).
return nil, fmt.Errorf("pdf: open: ledongthuc/pdf backend rejected the document: %w", err)
}

rows, err := extractPDFRows(pdoc)
rows, err := extractPDFRows(reader)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -526,33 +528,37 @@
// The bucket tolerance (Y1 within 2pt) matches what the previous
// ledongthuc-backed implementation used; word-level Y1 jitter is the
// same scale as the per-glyph jitter it replaced.
func extractPDFRows(doc pdftable.Document) ([]pdfRow, error) {
numPages := doc.NumPages()
// extractPDFRows walks each page, grouping glyphs into rows by Y-position
// and recording the dominant font size + bold ratio per row.
//
// We use ledongthuc/pdf's Content() as the primitive source rather than
// pdftable.Words() because pdftable v0.3.0's word grouping silently
// concatenates adjacent words on the standard-14 fonts SEC filings use
// (no bundled AFM widths → glyph X-advance estimated wrong → word
// X-gaps collapse → "Currentmarketablesecurities"). The X-gap-into-spaces
// heuristic below is robust to that because we never trust pdftable's
// word boundaries — we re-derive them from raw glyph X positions.
//
// Once pdftable bundles standard-14 AFM metrics (v0.4.x goal) we can
// swap back to its Words() output.
func extractPDFRows(reader *pdflib.Reader) ([]pdfRow, error) {
numPages := reader.NumPage()
var out []pdfRow

for pageNum := 1; pageNum <= numPages; pageNum++ {
page, err := doc.Page(pageNum)
if err != nil {
// A bad page shouldn't take down the document — pdftable
// can fail page-by-page on malformed content streams. Skip.
continue
}
words, err := page.Words(pdftable.DefaultWordOpts())
if err != nil {
continue
}
if len(words) == 0 {
page := reader.Page(pageNum)
if page.V.IsNull() {
continue
}
content := page.Content()
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (bug_risk): Page.Content() return value looks like it might be ignoring a possible error

In ledongthuc/pdf, Content() typically returns (Content, error). Here it’s treated as value-only, so any error would be silently ignored and could lead to mis-parsing or panics when accessing content.Text. Please either handle the error (e.g., skip the page on failure, consistent with prior behavior) or confirm this API cannot fail and document that assumption clearly.


// Group words by visual top (Y1). Values within 2pt are
// considered the same row — pdftable already clusters chars
// into words by its own YTolerance, so this is just the next
// step up: words at near-identical baselines become a row.
// Group letters by (approximate) baseline Y. Values within 2pt
// are considered the same row — PDFs frequently jitter Y by a
// fraction.
type rowBucket struct {
y float64
maxFS float64
words []pdftable.Word
chars []pdflib.Text
}
var buckets []*rowBucket
find := func(y float64) *rowBucket {
Expand All @@ -565,55 +571,115 @@
buckets = append(buckets, b)
return b
}
for _, w := range words {
b := find(w.Y1)
b.words = append(b.words, w)
if w.FontSize > b.maxFS {
b.maxFS = w.FontSize
for _, t := range content.Text {
b := find(t.Y)
b.chars = append(b.chars, t)
if t.FontSize > b.maxFS {
b.maxFS = t.FontSize
}
}
// Sort rows top-to-bottom (higher Y = higher on page in PDF
// user space).
sort.Slice(buckets, func(i, j int) bool { return buckets[i].y > buckets[j].y })

for _, b := range buckets {
sort.Slice(b.words, func(i, j int) bool { return b.words[i].X0 < b.words[j].X0 })
sort.Slice(b.chars, func(i, j int) bool { return b.chars[i].X < b.chars[j].X })
var sb strings.Builder
boldWords, totalWords := 0, 0
for i, w := range b.words {
if i > 0 {
var lastX float64
boldGlyphs, totalGlyphs := 0, 0
for i, ch := range b.chars {
// Insert a space when the gap between the previous
// glyph's end and this glyph's start exceeds 0.20 of
// the font size. Tuned against real PDFs (arXiv +
// SEC 10-Ks): word-boundary gaps land around
// 0.20-0.30·fontSize; intra-word kerning stays well
// below 0.10.
if i > 0 && ch.X-lastX > ch.FontSize*0.20 {
sb.WriteString(" ")
}
sb.WriteString(w.Text)
if strings.TrimSpace(w.Text) != "" {
totalWords++
if isBoldFont(w.FontName) {
boldWords++
sb.WriteString(ch.S)
lastX = ch.X + ch.W
if strings.TrimSpace(ch.S) != "" {
totalGlyphs++
if isBoldFont(ch.Font) {
boldGlyphs++
}
}
}
text := strings.TrimSpace(sb.String())
// Wide letter-tracking — common on filing cover pages and
// bold section headers — makes every glyph gap exceed the
// space threshold, yielding "U N I T E D S T A T E S".
// Re-join those runs into real words.
text := collapseLetterSpacing(strings.TrimSpace(sb.String()))
if text == "" {
continue
}
// Drop publisher/preprint boilerplate (e.g. the rotated
// arXiv license stamp in the left margin). Left in, it
// pollutes the structure with junk top-level "headings"
// and the document title.
if isBoilerplateLine(text) {
continue
}
out = append(out, pdfRow{
page: pageNum,
fontSize: b.maxFS,
bold: totalWords > 0 && boldWords*2 > totalWords,
bold: totalGlyphs > 0 && boldGlyphs*2 > totalGlyphs,
text: text,
})
}
}
return out, nil
}

// multiSpaceRe matches two or more consecutive whitespace characters.
var multiSpaceRe = regexp.MustCompile(`\s{2,}`)

// looksLetterSpaced reports whether s appears to have been rendered with
// wide letter-tracking — a chain of single characters separated by
// spaces ("U N I T E D"). Used to detect cover-page / heading rows that
// the glyph-spacing heuristic over-split.
func looksLetterSpaced(s string) bool {
s = strings.TrimSpace(s)
if len(s) < 5 {
return false
}
parts := strings.Fields(s)
if len(parts) < 4 {
return false
}
singles := 0
for _, p := range parts {
if len(p) <= 1 {
singles++
}
}
// At least half of the parts must be single characters for us to
// call the run letter-spaced.
return singles*2 >= len(parts)
}

// collapseLetterSpacing rejoins runs of single-character "words"
// (the artifact of wide letter-tracking) into real words. Multi-space
// gaps between letter-spaced groups become regular word boundaries.
func collapseLetterSpacing(s string) string {
if !looksLetterSpaced(s) {
return s
}
// Split on runs of 2+ spaces to identify word groups, then collapse
// each group's single-character chain.
groups := multiSpaceRe.Split(s, -1)
for i, g := range groups {
parts := strings.Fields(g)
// If every part is a single character, glue them.
allSingles := true
for _, p := range parts {
if len(p) > 1 {
Comment on lines +670 to +671
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion (bug_risk): Using byte length to detect single-character tokens can misclassify multi-byte runes

In collapseLetterSpacing, len(p) is used for this check, which counts bytes, so non-ASCII glyphs (e.g., Japanese or accented Latin) will appear as length > 1 and be treated as multi-character. If you need to support non-ASCII PDFs, use utf8.RuneCountInString(p) so the condition reflects rune count instead of byte length.

Suggested implementation:

		for _, p := range parts {
			if utf8.RuneCountInString(p) > 1 {
				allSingles = false
				break
			}
		}

To compile successfully, ensure that pkg/parser/pdf.go imports the utf8 package:

  • Add import "unicode/utf8" to the existing import block (or include utf8 in an existing grouped import).

allSingles = false
break
}
}
if allSingles {
groups[i] = strings.Join(parts, "")
}
}
return strings.Join(groups, " ")
}

// buildHeadingLevelMap returns a map from rounded-font-size → heading level
// (1 = largest = h1). Only sizes above headingFloor are considered.
// Levels are capped at 6.
Expand Down Expand Up @@ -957,7 +1023,7 @@
// isEncryptedPDFError reports whether the given error from
// ledongthuc/pdf indicates the document is encrypted. The library
// has no proper error type for this, so we match on the message.
func isEncryptedPDFError(err error) bool {

Check failure on line 1026 in pkg/parser/pdf.go

View workflow job for this annotation

GitHub Actions / lint

func isEncryptedPDFError is unused (U1000)
if err == nil {
return false
}
Expand Down
Loading