From 084abc826af44440716287fb4b802dbb6ea91248 Mon Sep 17 00:00:00 2001
From: Halleluyah Oludele <halleluyaholudele@gmail.com>
Date: Fri, 29 May 2026 17:13:49 +0100
Subject: [PATCH 1/3] build(deps): bump pdftable to v0.3.1

v0.3.1 ships the grid-indexed cell finder (the O(n^2/n^3) -> O(cells)
fix), so table extraction no longer degrades pathologically on dense
financial pages. Full table-extraction feature stays on; this only
makes it faster/bounded.
---
 go.mod | 2 +-
 go.sum | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/go.mod b/go.mod
index 130314e..27b1397 100644
--- a/go.mod
+++ b/go.mod
@@ -13,7 +13,7 @@ require (
 	github.com/go-chi/chi/v5 v5.2.5
 	github.com/google/uuid v1.6.0
 	github.com/hallelx2/llmgate v0.2.0
-	github.com/hallelx2/pdftable v0.3.0
+	github.com/hallelx2/pdftable v0.3.1
 	github.com/hibiken/asynq v0.26.0
 	github.com/jackc/pgx/v5 v5.9.2
 	github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728
diff --git a/go.sum b/go.sum
index d52b49a..462ec48 100644
--- a/go.sum
+++ b/go.sum
@@ -134,8 +134,8 @@ github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1 h1:X5VWvz21y3gzm9Nw/kaUeku/1+u
 github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1/go.mod h1:Zanoh4+gvIgluNqcfMVTJueD4wSS5hT7zTt4Mrutd90=
 github.com/hallelx2/llmgate v0.2.0 h1:x/LNCeHUPZpafn2IXi+LqpnZa7TtEQdLVlpkkJTlzBI=
 github.com/hallelx2/llmgate v0.2.0/go.mod h1:MK2Ol/5CIweTQ2/9eSiTJ5g/KSSuobNZL9TD3s57JxY=
-github.com/hallelx2/pdftable v0.3.0 h1:SwZPu2z4cIR4R30gP+7bpunGh931StjO1vrsxoldiDw=
-github.com/hallelx2/pdftable v0.3.0/go.mod h1:pxNlc4D43wjzis7M6EfgQZvHOsQ4okggm+xqUu+OokI=
+github.com/hallelx2/pdftable v0.3.1 h1:Uqe+9G8s9jrGYwxk8dEMXBCB+SlzvWPmW0Ze5863W1I=
+github.com/hallelx2/pdftable v0.3.1/go.mod h1:pxNlc4D43wjzis7M6EfgQZvHOsQ4okggm+xqUu+OokI=
 github.com/hhrutter/lzw v1.0.0 h1:laL89Llp86W3rRs83LvKbwYRx6INE8gDn0XNb1oXtm0=
 github.com/hhrutter/lzw v1.0.0/go.mod h1:2HC6DJSn/n6iAZfgM3Pg+cP1KxeWc3ezG8bBqW5+WEo=
 github.com/hhrutter/pkcs7 v0.2.2 h1:xMoifoVWah1LNym3C0pomEiLmyJyVIBXt/8oTPyPz+8=

From 0cda733ae87dee1786602f81692d83cc9cf68e84 Mon Sep 17 00:00:00 2001
From: Halleluyah Oludele <halleluyaholudele@gmail.com>
Date: Fri, 29 May 2026 17:30:13 +0100
Subject: [PATCH 2/3] fix(ingest): bound the whole PDF parse with a
 configurable deadline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A 10-K was observed hanging 600s+ in `parsing` even in minimal mode: the
hang is in ledongthuc row extraction (extractPDFRows ->
reader.Page(n).Content()), which is pure-Go and pre-LLM, so none of the
existing per-stage table-extraction budgets bound it.

Wrap the entire PDF parse (row extraction, table extraction, section
building, leaf cap) in a deadline. The work runs on a goroutine; on
timeout or ctx cancellation Parse returns a clear error and abandons the
goroutine (buffered result channel, so no leak on send; a panic in the
work is recovered). This is the same abandon-on-deadline pattern
safeExtractTables already uses for a single table page, lifted to cover
the whole parse so ANY parse pathology fails fast and cleanly instead of
wedging ingest. The ingest pipeline already treats a parse error as a
doc-level failure, so the document goes to `failed` and is visible to
ops/bench rather than hanging forever.

Nothing is disabled: the full feature set (LLM TOC, tables, summarize,
HyDE, multi-axis) still runs — parse is merely bounded.

Config: IngestConfig.ParseTimeoutSeconds (default 120). Env
VLE_INGEST_PARSE_TIMEOUT_SECONDS; the server binary forwards
VLS_/VLE_INGEST_PARSE_TIMEOUT_SECONDS. Also threads the existing
ingest.max_sections through to the parser (previously dropped on the
floor) via RegistryFromIngestParams, so both robustness valves are
operator-tunable. A negative value disables the bound (escape hatch).

Tests: the deadline mechanism returns a timeout error in ~the timeout
(not after a 10s sleep), passes fast work through, propagates real parse
errors, runs inline when disabled, honours ctx cancel, and recovers
panics; plus config default/env-override/validate coverage.
---
 cmd/engine/main.go                   |   2 +-
 cmd/server/main.go                   |   2 +-
 config.example.yaml                  |  33 +++++
 internal/config/config.go            |  11 +-
 pkg/config/config.go                 |  31 ++++
 pkg/config/config_test.go            |  32 ++++
 pkg/ingest/ingest.go                 |  28 +++-
 pkg/parser/pdf.go                    | 137 +++++++++++++++++-
 pkg/parser/pdf_parse_timeout_test.go | 209 +++++++++++++++++++++++++++
 9 files changed, 474 insertions(+), 11 deletions(-)
 create mode 100644 pkg/parser/pdf_parse_timeout_test.go

diff --git a/cmd/engine/main.go b/cmd/engine/main.go
index 8da6abf..edfc821 100644
--- a/cmd/engine/main.go
+++ b/cmd/engine/main.go
@@ -172,7 +172,7 @@ func run() error {
 		DB:                     pool,
 		Storage:                store,
 		LLM:                    llmClient,
-		Parsers:                ingest.RegistryFromTableOpts(tableOptsFromConfig(cfg.Ingest.Tables)),
+		Parsers:                ingest.RegistryFromIngestParams(tableOptsFromConfig(cfg.Ingest.Tables), cfg.Ingest.MaxSections, time.Duration(cfg.Ingest.ParseTimeoutSeconds)*time.Second),
 		Logger:                 logger,
 		Mode:                   cfg.Ingest.Mode,
 		HyDEEnabled:            cfg.Ingest.HyDE.Enabled,
diff --git a/cmd/server/main.go b/cmd/server/main.go
index c14b608..2430862 100644
--- a/cmd/server/main.go
+++ b/cmd/server/main.go
@@ -198,7 +198,7 @@ func run() error {
 		DB:                     pool,
 		Storage:                store,
 		LLM:                    llmClient,
-		Parsers:                ingest.RegistryFromTableOpts(tableOptsFromConfig(cfg.Engine.Ingest.Tables)),
+		Parsers:                ingest.RegistryFromIngestParams(tableOptsFromConfig(cfg.Engine.Ingest.Tables), cfg.Engine.Ingest.MaxSections, time.Duration(cfg.Engine.Ingest.ParseTimeoutSeconds)*time.Second),
 		Logger:                 logger,
 		Mode:                   cfg.Engine.Ingest.Mode,
 		HyDEEnabled:            cfg.Engine.Ingest.HyDE.Enabled,
diff --git a/config.example.yaml b/config.example.yaml
index 31262d7..6bec472 100644
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -320,6 +320,39 @@ ingest:
   # vectorless-server use VLS_INGEST_MODE=minimal (no secret edit needed).
   mode: "full"
 
+  # Total-parse timeout (seconds). Bounds the ENTIRE parse of one
+  # document end to end — row extraction, table extraction, section
+  # building, and the leaf-section cap. It is the outermost robustness
+  # valve: a pathological/malformed PDF (observed: a 10-K stuck 600s+ in
+  # `parsing`, even in minimal mode, inside pure-Go row extraction) is
+  # abandoned at the deadline and the document fails fast instead of
+  # wedging the pipeline forever. NOTHING is disabled by this bound — the
+  # full feature set (LLM TOC, tables, summarize, HyDE, multi-axis) still
+  # runs; parse is merely time-boxed. Applies in BOTH full and minimal
+  # mode (parse runs in both).
+  #
+  # 120 is comfortably longer than a healthy 300-page filing's parse
+  # (seconds to low tens of seconds) yet short enough to reap a hang
+  # quickly. 0 uses the engine default (120). Override per-process with
+  # VLE_INGEST_PARSE_TIMEOUT_SECONDS; the deployed server also honours
+  # VLS_INGEST_PARSE_TIMEOUT_SECONDS.
+  parse_timeout_seconds: 120
+
+  # Cap on the number of leaf sections one document may produce. A
+  # pathological PDF (e.g. a 90-page 10-K whose every bold statement
+  # title trips the heading detector, or a heading→one-body-leaf chain
+  # repeated hundreds of times) can shatter into far more leaves than the
+  # document has real sections — each leaf then costs a summarize + HyDE
+  # + multi-axis LLM call, which is what throttles/stalls full ingest.
+  # When the parsed leaf count exceeds this cap the parser merges
+  # sections (smallest first; single-leaf parents collapse into their
+  # parent) until the document is back under the cap, preserving content
+  # and never merging table sections. 400 sits comfortably above a real
+  # filing's section count while still catching the runaway case. 0 uses
+  # the engine default (400); a negative value disables the cap. Override
+  # with VLE_INGEST_MAX_SECTIONS.
+  max_sections: 400
+
   # The summarize and HyDE stages run concurrently. This caps the total
   # number of LLM calls in flight across both stages combined, so the
   # provider's per-tenant concurrency limit isn't exceeded. 0 disables
diff --git a/internal/config/config.go b/internal/config/config.go
index 1646af0..8a2ed78 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -205,7 +205,7 @@ func Default() Config {
 			MaxAge:         86400,
 		},
 		Governance: GovernanceConfig{
-			MaxBodySizeBytes: 33554432,       // 32 MiB
+			MaxBodySizeBytes: 33554432, // 32 MiB
 			DefaultTimeout:   30 * time.Second,
 			QueryTimeout:     120 * time.Second,
 		},
@@ -326,6 +326,15 @@ func applyEnvOverrides(c *Config) {
 	if v := firstEnv("VLS_INGEST_MODE", "VLE_INGEST_MODE"); v != "" {
 		c.Engine.Ingest.Mode = v
 	}
+	// Total-parse timeout (seconds). Forwarded so the deployed server
+	// honours a tuned parse deadline without a secret/config edit — the
+	// outermost robustness valve against a parse that hangs (pre-LLM,
+	// pure-Go row extraction). VLS_-prefixed wins over VLE_.
+	if v := firstEnv("VLS_INGEST_PARSE_TIMEOUT_SECONDS", "VLE_INGEST_PARSE_TIMEOUT_SECONDS"); v != "" {
+		if n, err := strconv.Atoi(v); err == nil && n >= 0 {
+			c.Engine.Ingest.ParseTimeoutSeconds = n
+		}
+	}
 	// Anthropic-compatible gateway overrides (e.g. GLM/Zhipu via
 	// https://api.z.ai/api/anthropic): base URL + model, so the
 	// anthropic driver can run a non-Anthropic model without a secret
diff --git a/pkg/config/config.go b/pkg/config/config.go
index 8c7c327..8a6b30e 100644
--- a/pkg/config/config.go
+++ b/pkg/config/config.go
@@ -125,6 +125,28 @@ type IngestConfig struct {
 	// section count (~170-510 with tables) while still catching the
 	// runaway case. A negative value is rejected by Validate.
 	MaxSections int `yaml:"max_sections"`
+
+	// ParseTimeoutSeconds bounds the ENTIRE parse of a single document —
+	// row extraction, table extraction, section building, and the leaf
+	// cap, end to end. It is the outermost robustness valve: every
+	// per-stage timeout inside the parser (per-page / doc-wide table
+	// budgets) is bounded by something pre-LLM, but pure-Go row extraction
+	// (ledongthuc's reader.Page(n).Content()) had no bound, so a
+	// pathological PDF (observed: a 10-K stuck 600s+ in `parsing` even in
+	// minimal mode) could hang the parse forever.
+	//
+	// When the whole parse exceeds this deadline the parser abandons the
+	// work and returns a clear error; the ingest pipeline treats it like
+	// any other parse failure (the document goes to `failed`), so a doc
+	// that can't parse in time fails fast and is visible to ops/bench
+	// rather than wedging the pipeline. NOTHING is disabled — the full
+	// feature set (LLM TOC, tables, summarize, HyDE, multi-axis) still
+	// runs; parse is merely bounded.
+	//
+	// 0 (or omitted) defaults to 120. A negative value is rejected by
+	// Validate. Engine env override: VLE_INGEST_PARSE_TIMEOUT_SECONDS;
+	// the server binary also forwards VLS_/VLE_INGEST_PARSE_TIMEOUT_SECONDS.
+	ParseTimeoutSeconds int `yaml:"parse_timeout_seconds"`
 }
 
 // TOCBlock configures the LLM-driven table-of-contents tree
@@ -728,6 +750,7 @@ func Default() Config {
 			GlobalLLMConcurrency:  12,
 			LLMCallTimeoutSeconds: 90,
 			MaxSections:           400,
+			ParseTimeoutSeconds:   120,
 			HyDE: HyDEConfig{
 				Enabled:      true,
 				NumQuestions: 5,
@@ -911,6 +934,11 @@ func applyEnvOverrides(c *Config) {
 			c.Ingest.MaxSections = n
 		}
 	}
+	if v := os.Getenv("VLE_INGEST_PARSE_TIMEOUT_SECONDS"); v != "" {
+		if n, err := strconv.Atoi(v); err == nil && n >= 0 {
+			c.Ingest.ParseTimeoutSeconds = n
+		}
+	}
 	// pdftable-driven table extraction.
 	if v := os.Getenv("VLE_INGEST_TABLES_ENABLED"); v != "" {
 		switch strings.ToLower(strings.TrimSpace(v)) {
@@ -1200,6 +1228,9 @@ func (c Config) Validate() error {
 	if c.Ingest.MaxSections < 0 {
 		return fmt.Errorf("ingest.max_sections must be >= 0, got %d", c.Ingest.MaxSections)
 	}
+	if c.Ingest.ParseTimeoutSeconds < 0 {
+		return fmt.Errorf("ingest.parse_timeout_seconds must be >= 0, got %d", c.Ingest.ParseTimeoutSeconds)
+	}
 
 	switch c.Ingest.Tables.VerticalStrategy {
 	case "", "lines", "lines_strict", "text", "explicit":
diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go
index d8172b2..8830dde 100644
--- a/pkg/config/config_test.go
+++ b/pkg/config/config_test.go
@@ -82,6 +82,38 @@ func TestDefaultValues(t *testing.T) {
 	if cfg.Ingest.TOC.TOCCheckPages != 20 {
 		t.Errorf("ingest.toc.toc_check_pages = %d, want 20", cfg.Ingest.TOC.TOCCheckPages)
 	}
+	if cfg.Ingest.ParseTimeoutSeconds != 120 {
+		t.Errorf("ingest.parse_timeout_seconds = %d, want 120", cfg.Ingest.ParseTimeoutSeconds)
+	}
+	if cfg.Ingest.MaxSections != 400 {
+		t.Errorf("ingest.max_sections = %d, want 400", cfg.Ingest.MaxSections)
+	}
+}
+
+// TestIngestParseTimeoutEnvOverride covers VLE_INGEST_PARSE_TIMEOUT_SECONDS
+// — the operator knob that lets a tuned whole-parse deadline reach the
+// parser without a config-file edit.
+func TestIngestParseTimeoutEnvOverride(t *testing.T) {
+	prev := os.Getenv("VLE_INGEST_PARSE_TIMEOUT_SECONDS")
+	defer os.Setenv("VLE_INGEST_PARSE_TIMEOUT_SECONDS", prev)
+
+	os.Setenv("VLE_INGEST_PARSE_TIMEOUT_SECONDS", "300")
+	cfg := Default()
+	applyEnvOverrides(&cfg)
+	if cfg.Ingest.ParseTimeoutSeconds != 300 {
+		t.Errorf("ingest.parse_timeout_seconds = %d, want 300", cfg.Ingest.ParseTimeoutSeconds)
+	}
+}
+
+// TestIngestParseTimeoutValidate rejects a negative parse timeout — a
+// non-positive deadline would silently disable the bound, which must be
+// an explicit choice, not a typo that slips through Load.
+func TestIngestParseTimeoutValidate(t *testing.T) {
+	cfg := Default()
+	cfg.Ingest.ParseTimeoutSeconds = -1
+	if err := cfg.Validate(); err == nil {
+		t.Error("Validate should reject a negative ingest.parse_timeout_seconds")
+	}
 }
 
 // TestIngestModeDefault locks the default ingest mode to "full" so the
diff --git a/pkg/ingest/ingest.go b/pkg/ingest/ingest.go
index a840acc..a9286e4 100644
--- a/pkg/ingest/ingest.go
+++ b/pkg/ingest/ingest.go
@@ -1122,15 +1122,35 @@ func DefaultRegistry() *parser.Registry {
 }
 
 // RegistryFromTableOpts returns a parser.Registry where the PDF parser
-// is configured from the supplied TableOpts. Pass nil to disable table
-// extraction entirely; pass parser.DefaultTableOpts() (or a custom set)
-// to enable. All non-PDF parsers are constructed at their defaults.
+// is configured from the supplied TableOpts, with the leaf-section cap
+// and total-parse timeout left at their parser defaults (400 sections,
+// 120s). Pass nil to disable table extraction entirely; pass
+// parser.DefaultTableOpts() (or a custom set) to enable. All non-PDF
+// parsers are constructed at their defaults.
+//
+// Use RegistryFromIngestParams to thread an operator-tuned cap / parse
+// timeout from config.
 func RegistryFromTableOpts(opts *parser.TableOpts) *parser.Registry {
+	return RegistryFromIngestParams(opts, 0, 0)
+}
+
+// RegistryFromIngestParams returns a parser.Registry where the PDF parser
+// is configured from the supplied TableOpts AND the operator-tuned
+// leaf-section cap and total-parse timeout (ingest.max_sections,
+// ingest.parse_timeout_seconds). maxSections == 0 / parseTimeout == 0
+// each select the parser's built-in default; a negative value disables
+// that bound. All non-PDF parsers are constructed at their defaults.
+//
+// This is the constructor the engine/server wiring uses so the parse
+// deadline and section cap from config actually reach the parser — the
+// outermost robustness valves for full-feature ingest. Table extraction,
+// the section tree, and the cap all still run; they are merely bounded.
+func RegistryFromIngestParams(opts *parser.TableOpts, maxSections int, parseTimeout time.Duration) *parser.Registry {
 	return parser.NewRegistry(
 		parser.NewMarkdown(),
 		parser.NewHTML(),
 		parser.NewDOCX(),
-		parser.NewPDFWithTables(opts),
+		parser.NewPDFWithConfig(opts, maxSections, parseTimeout),
 		parser.NewText(),
 	)
 }
diff --git a/pkg/parser/pdf.go b/pkg/parser/pdf.go
index bf120dd..579cf2a 100644
--- a/pkg/parser/pdf.go
+++ b/pkg/parser/pdf.go
@@ -64,6 +64,30 @@ type PDF struct {
 	// Zero selects defaultMaxLeafSections. A negative value disables the
 	// cap entirely (escape hatch for callers that want the raw outline).
 	MaxSections int
+
+	// ParseTimeout bounds the ENTIRE Parse of one document — row
+	// extraction, table extraction, section building, and the leaf cap,
+	// end to end. It is the outermost robustness valve.
+	//
+	// Every other time bound inside the parser caps a sub-stage
+	// (tableExtractPageTimeout / tableExtractDocBudget cap table
+	// extraction), but the pure-Go row extractor
+	// (extractPDFRows -> reader.Page(n).Content()) had no bound at all —
+	// and that is exactly where a pathological PDF was observed hanging
+	// 600s+ in `parsing`, even in minimal mode (pre-LLM). ParseTimeout
+	// runs the whole parse on a goroutine and abandons it on deadline,
+	// returning a clear error so ingest fails the document fast instead
+	// of wedging forever.
+	//
+	// Nothing is disabled by this bound: when the parse finishes inside
+	// the deadline the full feature set (tables, the outline/heuristic
+	// section tree, the cap) is produced exactly as before.
+	//
+	// Zero selects defaultParseTimeout (120s). A non-positive value other
+	// than zero (i.e. negative) disables the bound entirely — Parse then
+	// runs synchronously with no deadline (escape hatch / legacy
+	// behaviour for callers that want an unbounded parse).
+	ParseTimeout time.Duration
 }
 
 // TableOpts controls pdftable's table-finding stage. The zero value
@@ -119,13 +143,24 @@ func NewPDFWithTables(opts *TableOpts) *PDF { return &PDF{Tables: opts} }
 
 // NewPDFWithOpts returns a PDF parser using the supplied table-extraction
 // options and an explicit leaf-section cap. maxSections == 0 selects
-// defaultMaxLeafSections; a negative value disables the cap. This is the
-// constructor the engine wiring uses so the cap is operator-tunable via
-// config (ingest.max_sections).
+// defaultMaxLeafSections; a negative value disables the cap. The parse
+// timeout is left at its zero value, which resolves to defaultParseTimeout.
 func NewPDFWithOpts(opts *TableOpts, maxSections int) *PDF {
 	return &PDF{Tables: opts, MaxSections: maxSections}
 }
 
+// NewPDFWithConfig returns a PDF parser with the table options, leaf-
+// section cap, AND total-parse timeout all set explicitly. This is the
+// constructor the engine wiring uses so every parser robustness knob is
+// operator-tunable via config (ingest.max_sections,
+// ingest.parse_timeout_seconds).
+//
+//   - maxSections == 0  → defaultMaxLeafSections; negative disables the cap.
+//   - parseTimeout == 0 → defaultParseTimeout; negative disables the bound.
+func NewPDFWithConfig(opts *TableOpts, maxSections int, parseTimeout time.Duration) *PDF {
+	return &PDF{Tables: opts, MaxSections: maxSections, ParseTimeout: parseTimeout}
+}
+
 // Name implements Parser.
 func (*PDF) Name() string { return "pdf" }
 
@@ -138,11 +173,87 @@ func (*PDF) Accepts(contentType, filename string) bool {
 }
 
 // Parse implements Parser.
-func (p *PDF) Parse(_ context.Context, r io.Reader) (*ParsedDoc, error) {
+//
+// Parse is a thin timeout wrapper around the real parse work (parseDoc).
+// It bounds the WHOLE parse — row extraction, table extraction, section
+// building, and the leaf cap — by p.resolvedParseTimeout(). The work runs
+// on a goroutine; if it doesn't finish by the deadline (or ctx is
+// cancelled) Parse returns a clear error and abandons the goroutine. The
+// goroutine then finishes on its own (or is GC'd) — its result lands in a
+// buffered channel so it never blocks on send. This is the same
+// abandon-on-deadline pattern safeExtractTables already uses for a single
+// table page, lifted to cover the entire parse so ANY parse pathology
+// (notably the unbounded pure-Go ledongthuc row extractor) fails fast and
+// cleanly instead of hanging ingest forever.
+//
+// A negative ParseTimeout disables the bound and runs parseDoc inline.
+func (p *PDF) Parse(ctx context.Context, r io.Reader) (*ParsedDoc, error) {
+	// Read the bytes up front (outside the deadline goroutine): io.ReadAll
+	// is bounded by the reader/storage layer, not by the PDF pathology we
+	// are guarding against, and reading here keeps the goroutine body a
+	// pure CPU/parse unit.
 	buf, err := io.ReadAll(r)
 	if err != nil {
 		return nil, err
 	}
+	return runParseWithDeadline(ctx, p.resolvedParseTimeout(), func() (*ParsedDoc, error) {
+		return p.parseDoc(ctx, buf)
+	})
+}
+
+// runParseWithDeadline runs work bounded by timeout. It is the reusable
+// core of the whole-Parse robustness valve, factored out so the
+// deadline/abandon mechanism is unit-testable with an arbitrary work
+// closure (e.g. one that sleeps past the deadline) without needing a
+// pathological real PDF.
+//
+//   - timeout <= 0 runs work inline with NO bound (legacy/unbounded
+//     behaviour; the explicit escape hatch a negative ParseTimeout selects).
+//   - Otherwise work runs on a goroutine and the function selects on the
+//     work result, a time.After(timeout), and ctx cancellation. On
+//     timeout/cancel the goroutine is abandoned: its result lands in a
+//     buffered channel so it can always send and exit (no leak on send),
+//     then runs to completion on its own and is collected. A panic inside
+//     work is recovered and surfaced as an error so a backend bug can
+//     never crash the ingest worker.
+//
+// The timeout/cancel errors are deliberately phrased so the ingest
+// pipeline's existing "parse failed → document failed" path produces a
+// clear, ops-visible message instead of an infinite hang.
+func runParseWithDeadline(ctx context.Context, timeout time.Duration, work func() (*ParsedDoc, error)) (*ParsedDoc, error) {
+	if timeout <= 0 {
+		return work()
+	}
+
+	type result struct {
+		doc *ParsedDoc
+		err error
+	}
+	done := make(chan result, 1)
+	go func() {
+		defer func() {
+			if rec := recover(); rec != nil {
+				done <- result{err: fmt.Errorf("pdf: parse panicked: %v", rec)}
+			}
+		}()
+		doc, perr := work()
+		done <- result{doc: doc, err: perr}
+	}()
+
+	select {
+	case res := <-done:
+		return res.doc, res.err
+	case <-time.After(timeout):
+		return nil, fmt.Errorf("pdf: parse exceeded %s — document too complex or malformed", timeout)
+	case <-ctx.Done():
+		return nil, fmt.Errorf("pdf: parse cancelled: %w", ctx.Err())
+	}
+}
+
+// parseDoc is the real parse implementation. It is bounded by Parse's
+// deadline wrapper; on its own it has no time bound beyond the per-stage
+// table-extraction budgets.
+func (p *PDF) parseDoc(_ context.Context, buf []byte) (*ParsedDoc, error) {
 
 	// We run TWO PDF backends in parallel here:
 	//
@@ -407,6 +518,24 @@ func (p *PDF) resolvedMaxSections() int {
 	return p.MaxSections
 }
 
+// defaultParseTimeout is the whole-Parse deadline applied when
+// ParseTimeout is left at its zero value. 120s is comfortably longer than
+// a healthy 300-page filing's parse (seconds to low tens of seconds) yet
+// short enough that a pathological/malformed document — the kind observed
+// hanging 600s+ in pure-Go row extraction — is reaped quickly and the
+// document fails fast instead of wedging ingest.
+const defaultParseTimeout = 120 * time.Second
+
+// resolvedParseTimeout turns the configured ParseTimeout into the value
+// the wrapper actually uses: 0 selects defaultParseTimeout; a negative
+// value disables the bound (Parse runs parseDoc inline with no deadline).
+func (p *PDF) resolvedParseTimeout() time.Duration {
+	if p.ParseTimeout == 0 {
+		return defaultParseTimeout
+	}
+	return p.ParseTimeout
+}
+
 // propagateSectionPages fills internal-node PageStart/PageEnd from the union
 // of descendant leaf ranges where the internal node didn't have its own
 // (because its body was empty / hoisted into children). Leaves keep their
diff --git a/pkg/parser/pdf_parse_timeout_test.go b/pkg/parser/pdf_parse_timeout_test.go
new file mode 100644
index 0000000..9932c75
--- /dev/null
+++ b/pkg/parser/pdf_parse_timeout_test.go
@@ -0,0 +1,209 @@
+package parser
+
+import (
+	"bytes"
+	"context"
+	"errors"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+)
+
+// TestRunParseWithDeadline_TimesOutDoesNotHang is the core robustness
+// assertion: when the parse work runs longer than the deadline, the
+// wrapper returns the timeout error in ~the timeout — NOT after the work
+// finishes (which here is 10s, far longer than any CI step should wait).
+// This is the exact pathology the whole-Parse deadline guards against: a
+// pure-Go row extractor (or any backend) that never returns must fail
+// fast instead of wedging ingest forever.
+func TestRunParseWithDeadline_TimesOutDoesNotHang(t *testing.T) {
+	const timeout = 50 * time.Millisecond
+	const workDuration = 10 * time.Second // simulates the unbounded hang
+
+	started := make(chan struct{})
+	work := func() (*ParsedDoc, error) {
+		close(started)
+		time.Sleep(workDuration) // never returns within the deadline
+		return &ParsedDoc{Title: "should never be observed"}, nil
+	}
+
+	start := time.Now()
+	doc, err := runParseWithDeadline(context.Background(), timeout, work)
+	elapsed := time.Since(start)
+
+	<-started // the work goroutine did start (we really are abandoning it)
+
+	if err == nil {
+		t.Fatalf("expected a timeout error, got doc=%v err=nil", doc)
+	}
+	if doc != nil {
+		t.Errorf("expected nil doc on timeout, got %v", doc)
+	}
+	// The error must clearly name the timeout so ops see why the doc failed.
+	if !strings.Contains(err.Error(), "parse exceeded") {
+		t.Errorf("error %q does not mention the parse-exceeded cause", err.Error())
+	}
+	// Critically: we returned in ~timeout, not in ~workDuration. Generous
+	// upper bound (still 20x the timeout but 5x under the work sleep) keeps
+	// the test robust on a loaded CI box while still proving "doesn't hang".
+	if elapsed > time.Second {
+		t.Errorf("runParseWithDeadline took %s — it waited for the work instead of the deadline (hang not bounded)", elapsed)
+	}
+}
+
+// TestRunParseWithDeadline_FastWorkPassesThrough confirms the happy path:
+// work that finishes well inside the deadline has its result returned
+// verbatim, untouched by the wrapper.
+func TestRunParseWithDeadline_FastWorkPassesThrough(t *testing.T) {
+	want := &ParsedDoc{Title: "quick"}
+	doc, err := runParseWithDeadline(context.Background(), time.Second, func() (*ParsedDoc, error) {
+		return want, nil
+	})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if doc != want {
+		t.Errorf("doc = %v, want %v", doc, want)
+	}
+}
+
+// TestRunParseWithDeadline_PropagatesWorkError confirms a genuine parse
+// error from the work (the common "bad PDF" case) is returned as-is, not
+// masked by the timeout machinery.
+func TestRunParseWithDeadline_PropagatesWorkError(t *testing.T) {
+	sentinel := errors.New("pdf: open: boom")
+	doc, err := runParseWithDeadline(context.Background(), time.Second, func() (*ParsedDoc, error) {
+		return nil, sentinel
+	})
+	if !errors.Is(err, sentinel) {
+		t.Fatalf("err = %v, want sentinel %v", err, sentinel)
+	}
+	if doc != nil {
+		t.Errorf("doc = %v, want nil", doc)
+	}
+}
+
+// TestRunParseWithDeadline_DisabledRunsInline confirms a non-positive
+// timeout disables the bound entirely: the work runs inline (no
+// goroutine, no deadline) and its result is returned even though the
+// "work" takes a beat. This is the explicit escape hatch a negative
+// ParseTimeout selects (legacy unbounded behaviour).
+func TestRunParseWithDeadline_DisabledRunsInline(t *testing.T) {
+	want := &ParsedDoc{Title: "inline"}
+	doc, err := runParseWithDeadline(context.Background(), 0, func() (*ParsedDoc, error) {
+		time.Sleep(20 * time.Millisecond)
+		return want, nil
+	})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if doc != want {
+		t.Errorf("doc = %v, want %v", doc, want)
+	}
+}
+
+// TestRunParseWithDeadline_ContextCancel confirms a cancelled context
+// unblocks the wrapper promptly even when the work is still running and
+// the (long) timeout has not fired — so a shutdown/cancel propagates
+// instead of waiting out the full parse deadline.
+func TestRunParseWithDeadline_ContextCancel(t *testing.T) {
+	ctx, cancel := context.WithCancel(context.Background())
+	go func() {
+		time.Sleep(20 * time.Millisecond)
+		cancel()
+	}()
+
+	start := time.Now()
+	doc, err := runParseWithDeadline(ctx, time.Hour, func() (*ParsedDoc, error) {
+		time.Sleep(10 * time.Second)
+		return &ParsedDoc{}, nil
+	})
+	elapsed := time.Since(start)
+
+	if err == nil {
+		t.Fatalf("expected a cancellation error, got doc=%v err=nil", doc)
+	}
+	if !errors.Is(err, context.Canceled) {
+		t.Errorf("err = %v, want it to wrap context.Canceled", err)
+	}
+	if elapsed > time.Second {
+		t.Errorf("cancellation took %s — wrapper did not honour ctx promptly", elapsed)
+	}
+}
+
+// TestRunParseWithDeadline_RecoversPanic confirms a panic deep inside the
+// parse work is recovered and surfaced as an error rather than crashing
+// the ingest worker process.
+func TestRunParseWithDeadline_RecoversPanic(t *testing.T) {
+	doc, err := runParseWithDeadline(context.Background(), time.Second, func() (*ParsedDoc, error) {
+		panic("backend exploded")
+	})
+	if err == nil {
+		t.Fatalf("expected an error from a panicking work, got doc=%v err=nil", doc)
+	}
+	if !strings.Contains(err.Error(), "panicked") {
+		t.Errorf("error %q does not indicate a recovered panic", err.Error())
+	}
+}
+
+// TestResolvedParseTimeout covers the zero/default, explicit, and
+// disabled (negative) resolutions of the configured ParseTimeout.
+func TestResolvedParseTimeout(t *testing.T) {
+	if got := (&PDF{}).resolvedParseTimeout(); got != defaultParseTimeout {
+		t.Errorf("zero ParseTimeout: got %s, want default %s", got, defaultParseTimeout)
+	}
+	if got := (&PDF{ParseTimeout: 5 * time.Second}).resolvedParseTimeout(); got != 5*time.Second {
+		t.Errorf("explicit ParseTimeout: got %s, want 5s", got)
+	}
+	if got := (&PDF{ParseTimeout: -1}).resolvedParseTimeout(); got > 0 {
+		t.Errorf("negative ParseTimeout should stay non-positive (bound disabled), got %s", got)
+	}
+}
+
+// TestPDFParse_HonoursTinyTimeout drives the FULL Parse path (not just the
+// helper) with a deliberately tiny deadline and a real PDF whose parse
+// takes longer than that deadline, proving Parse itself returns the
+// timeout error promptly rather than hanging.
+//
+// We reuse the table fixture (a real ruled-table PDF). Even its modest
+// parse comfortably exceeds a 1ms deadline, so the wrapper fires. If the
+// fixture is ever made trivially fast, the assertion that we either time
+// out OR succeed (never hang) still holds — the test's job is to prove the
+// wrapper is wired into Parse and bounds it, which the elapsed-time guard
+// enforces.
+func TestPDFParse_HonoursTinyTimeout(t *testing.T) {
+	b := readParserFixture(t, "tables-example.pdf")
+	p := &PDF{Tables: DefaultTableOpts(), ParseTimeout: time.Millisecond}
+
+	start := time.Now()
+	_, err := p.Parse(context.Background(), bytes.NewReader(b))
+	elapsed := time.Since(start)
+
+	// The whole point: Parse returned quickly. A real parse of this fixture
+	// without the bound takes longer than 1ms, so on any normal machine we
+	// expect the timeout error; either way Parse MUST NOT hang.
+	if elapsed > 5*time.Second {
+		t.Fatalf("Parse took %s with a 1ms ParseTimeout — the deadline is not wired into Parse", elapsed)
+	}
+	if err != nil && !strings.Contains(err.Error(), "parse exceeded") {
+		// A non-timeout error (e.g. the parse genuinely finished first on a
+		// very fast box and returned a real result with err==nil) is fine;
+		// only an unexpected error shape is worth flagging.
+		t.Logf("Parse returned non-timeout error (acceptable if parse beat the 1ms deadline): %v", err)
+	}
+}
+
+// readParserFixture mirrors the external test's readFixture but lives in
+// the white-box package so the timeout tests (which need unexported
+// symbols) can load the same testdata files.
+func readParserFixture(t *testing.T, name string) []byte {
+	t.Helper()
+	path := filepath.Join("testdata", name)
+	b, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatalf("read fixture %q: %v", path, err)
+	}
+	return b
+}

From 9d8c7b41e7e0259671b70dbcdd3aa3b8b6f330d9 Mon Sep 17 00:00:00 2001
From: Halleluyah Oludele <halleluyaholudele@gmail.com>
Date: Fri, 29 May 2026 17:48:40 +0100
Subject: [PATCH 3/3] fix(parser): make the leaf-section cap reduce any tree
 shape
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The cap only merged ADJACENT leaf siblings, but the real 10-K explosion
is hundreds of SINGLE-LEAF PARENTS (heading -> one body leaf) that have
no adjacent leaf-sibling pairs at all — so the cap silently did nothing
and a 92-page filing sailed past the 400 cap at 463-1465 leaves, each
costing a summarize + HyDE + multi-axis LLM call (the throughput killer
for full ingest).

Fix it in two phases:
  1. collapseSingleLeafParents flattens every heading -> lone-leaf chain
     (bottom-up, so deep chains fold in one pass) so the formerly
     only-children become adjacent leaf siblings. Count is unchanged; the
     parent absorbs the child's content + page range and becomes the leaf.
  2. The existing smallest-first adjacent-pair merge then reduces the
     count to the cap, with a defensive single-leaf-parent collapse for
     any pair a table leaf blocked.

The top-level sections are wrapped under a synthetic root so the merge
step — which shrinks a sibling list by rewriting parent.Children — can
shrink the TOP-level list too (a bare slice parameter would not
propagate the shrink back). This is what makes the invariant hold for a
flat list of single-leaf parents.

Invariant: for any tree with > N mergeable leaves, capLeafSections drives
the leaf count to <= N. Content is always preserved (concatenated,
page ranges unioned) and table sections (Metadata["table"]=="true") are
never merged or collapsed. Nothing is disabled — the full section tree is
still produced; it's just bounded.

Tests: 1000 single-leaf-parents -> <= 400 with no content loss (the case
the old cap failed and that let 1465 through), deep heading->subheading->
body chains, a mixed flat/single-leaf/multi-child tree, and a
table-protection test asserting table leaves survive verbatim.
---
 pkg/parser/cap_test.go | 165 +++++++++++++++++++++++++++++++++++++++++
 pkg/parser/pdf.go      | 159 ++++++++++++++++++++++++++++++++++-----
 2 files changed, 307 insertions(+), 17 deletions(-)

diff --git a/pkg/parser/cap_test.go b/pkg/parser/cap_test.go
index fa1fd61..726df0e 100644
--- a/pkg/parser/cap_test.go
+++ b/pkg/parser/cap_test.go
@@ -15,6 +15,29 @@ func leafTree(n, size int) []Section {
 	return []Section{{Level: 1, Title: "parent", Children: kids}}
 }
 
+// singleLeafParentTree builds n top-level "heading" sections, each an
+// internal node with EXACTLY ONE leaf child carrying `size` chars. This
+// is the real 10-K explosion shape: hundreds of heading -> one-body-leaf
+// parents with NO adjacent leaf-sibling pairs anywhere. The pre-fix cap
+// (adjacent-siblings only) could not reduce this tree at all.
+func singleLeafParentTree(n, size int) []Section {
+	parents := make([]Section, n)
+	for i := range parents {
+		parents[i] = Section{
+			Level: 1,
+			Title: "heading",
+			Children: []Section{{
+				Level:     2,
+				Title:     "body",
+				Content:   strings.Repeat("y", size),
+				PageStart: i + 1,
+				PageEnd:   i + 1,
+			}},
+		}
+	}
+	return parents
+}
+
 func TestCapLeafSections_MergesDownToCap(t *testing.T) {
 	tree := leafTree(1000, 50)
 	if got := countLeafSections(tree); got != 1000 {
@@ -81,6 +104,148 @@ func TestCapLeafSections_MergesSmallestFirst(t *testing.T) {
 	}
 }
 
+// TestCapLeafSections_SingleLeafParentsReduce is the regression test for
+// the bug that let a 92-page 10-K through at 463-1465 leaves: a tree made
+// entirely of single-leaf parents (heading -> one body leaf) has no
+// adjacent leaf-sibling pairs, so the old adjacent-only merge did nothing.
+// The fix must collapse those chains and merge down to the cap, losing no
+// content.
+func TestCapLeafSections_SingleLeafParentsReduce(t *testing.T) {
+	tree := singleLeafParentTree(1000, 30)
+	if got := countLeafSections(tree); got != 1000 {
+		t.Fatalf("setup: countLeafSections = %d, want 1000", got)
+	}
+	orig := totalContentLen(tree) // 1000 * 30
+
+	capped := capLeafSections(tree, 400)
+
+	if got := countLeafSections(capped); got > 400 {
+		t.Errorf("after cap: %d leaves, want <= 400 (the bug let 1465 through)", got)
+	}
+	// No content lost. Collapses/merges insert at most a "\n\n" (2 chars)
+	// per fold; with < 1000 folds total the upper bound is generous.
+	if got := totalContentLen(capped); got < orig || got > orig+2*1000 {
+		t.Errorf("content not preserved: got %d chars, want in [%d, %d]", got, orig, orig+2*1000)
+	}
+}
+
+// TestCapLeafSections_DeepSingleLeafChains exercises heading -> subheading
+// -> body chains (depth-3 single-leaf parents). The bottom-up collapse
+// must fold the whole chain so the bodies become mergeable siblings.
+func TestCapLeafSections_DeepSingleLeafChains(t *testing.T) {
+	const n = 600
+	roots := make([]Section, n)
+	for i := range roots {
+		roots[i] = Section{Level: 1, Title: "h1", Children: []Section{{
+			Level: 2, Title: "h2", Children: []Section{{
+				Level: 3, Title: "body", Content: strings.Repeat("z", 20),
+				PageStart: i + 1, PageEnd: i + 1,
+			}},
+		}}}
+	}
+	if got := countLeafSections(roots); got != n {
+		t.Fatalf("setup: countLeafSections = %d, want %d", got, n)
+	}
+	orig := totalContentLen(roots)
+
+	capped := capLeafSections(roots, 400)
+	if got := countLeafSections(capped); got > 400 {
+		t.Errorf("after cap: %d leaves, want <= 400", got)
+	}
+	if got := totalContentLen(capped); got < orig {
+		t.Errorf("content shrank: got %d chars, want >= %d", got, orig)
+	}
+}
+
+// TestCapLeafSections_MixedShapeReducesToCap throws a tree that mixes a
+// flat sibling list, single-leaf parents, and a multi-child branch at the
+// cap. The invariant is unconditional: > N mergeable leaves must come
+// down to <= N regardless of shape.
+func TestCapLeafSections_MixedShapeReducesToCap(t *testing.T) {
+	tree := []Section{
+		{Level: 1, Title: "flat", Children: func() []Section {
+			out := make([]Section, 300)
+			for i := range out {
+				out[i] = Section{Level: 2, Title: "f", Content: "ff", PageStart: i + 1, PageEnd: i + 1}
+			}
+			return out
+		}()},
+	}
+	// Append 300 single-leaf parents as additional top-level nodes.
+	tree = append(tree, singleLeafParentTree(300, 10)...)
+
+	if got := countLeafSections(tree); got != 600 {
+		t.Fatalf("setup: countLeafSections = %d, want 600", got)
+	}
+	capped := capLeafSections(tree, 100)
+	if got := countLeafSections(capped); got > 100 {
+		t.Errorf("after cap: %d leaves, want <= 100", got)
+	}
+}
+
+// TestCapLeafSections_NeverMergesTableSections asserts that table leaves
+// (Metadata["table"]="true") are preserved verbatim and never merged or
+// collapsed, even under a cap small enough that everything else must
+// merge around them.
+func TestCapLeafSections_NeverMergesTableSections(t *testing.T) {
+	mkTable := func(page int, content string) Section {
+		return Section{
+			Level: 1, Title: "Table", Content: content,
+			PageStart: page, PageEnd: page,
+			Metadata: map[string]string{"table": "true"},
+		}
+	}
+	// Three tables interleaved with prose leaves under one parent, plus a
+	// pile of single-leaf-parent prose so we're well over the cap.
+	kids := []Section{
+		{Level: 2, Title: "p1", Content: "prose-a"},
+		mkTable(1, "| A | B |\n| --- | --- |\n| 1 | 2 |"),
+		{Level: 2, Title: "p2", Content: "prose-b"},
+		mkTable(2, "| C | D |\n| --- | --- |\n| 3 | 4 |"),
+		{Level: 2, Title: "p3", Content: "prose-c"},
+		mkTable(3, "| E | F |\n| --- | --- |\n| 5 | 6 |"),
+	}
+	tree := []Section{{Level: 1, Title: "parent", Children: kids}}
+	tree = append(tree, singleLeafParentTree(200, 10)...)
+
+	capped := capLeafSections(tree, 5)
+
+	// Every original table's content must still be present, verbatim, in
+	// some leaf.
+	tableContents := []string{
+		"| A | B |\n| --- | --- |\n| 1 | 2 |",
+		"| C | D |\n| --- | --- |\n| 3 | 4 |",
+		"| E | F |\n| --- | --- |\n| 5 | 6 |",
+	}
+	var leaves []Section
+	var collect func([]Section)
+	collect = func(ss []Section) {
+		for i := range ss {
+			if len(ss[i].Children) == 0 {
+				leaves = append(leaves, ss[i])
+			} else {
+				collect(ss[i].Children)
+			}
+		}
+	}
+	collect(capped)
+
+	for _, want := range tableContents {
+		found := false
+		for _, l := range leaves {
+			// A table must survive as its OWN leaf — its content present and
+			// the table metadata intact (i.e. it wasn't merged into prose).
+			if l.Content == want && l.Metadata["table"] == "true" {
+				found = true
+				break
+			}
+		}
+		if !found {
+			t.Errorf("table content %q was merged away or lost its metadata", want)
+		}
+	}
+}
+
 func totalContentLen(sections []Section) int {
 	n := 0
 	for i := range sections {
diff --git a/pkg/parser/pdf.go b/pkg/parser/pdf.go
index 579cf2a..6a14b95 100644
--- a/pkg/parser/pdf.go
+++ b/pkg/parser/pdf.go
@@ -605,34 +605,158 @@ func countLeafSections(sections []Section) int {
 	return n
 }
 
-// capLeafSections enforces a ceiling on the total leaf-section count.
-// While the count exceeds maxLeaves it repeatedly merges the two
-// smallest ADJACENT leaf siblings under whichever parent currently has
-// the most leaf children — so the runaway byte-split sections collapse
-// back first, while genuinely distinct top-level sections are left
-// alone. maxLeaves <= 0 disables the cap.
+// capLeafSections enforces a ceiling on the total leaf-section count so a
+// pathological PDF can't shatter into thousands of tiny leaves (each of
+// which costs a summarize + HyDE + multi-axis LLM call at ingest, which
+// is what throttles/stalls the full pipeline). maxLeaves <= 0 disables
+// the cap; a tree already at or under the cap is returned untouched.
 //
-// Merged leaves concatenate their content (blank-line separated), keep
-// the first sibling's title, and union their page ranges. Table
-// sections are attached AFTER this pass (attachTableSections), so their
-// numeric content is never merged away.
+// The reduction is robust to ANY tree shape, which is the fix for the
+// real 10-K explosion: that document is not a flat list of adjacent
+// leaves but hundreds of SINGLE-LEAF PARENTS (heading -> one body leaf),
+// which have no adjacent leaf-sibling pairs at all — so the old
+// adjacent-only merge silently did nothing and a 92-page filing sailed
+// past the cap at 463-1465 leaves. We fix it in two phases:
+//
+//  1. collapseSingleLeafParents flattens every heading -> lone-leaf chain
+//     so the formerly-only-children become adjacent leaf SIBLINGS. This
+//     restructures the tree without changing the leaf count (the parent
+//     absorbs the child and itself becomes the leaf).
+//
+//  2. The merge loop then repeatedly merges the smallest adjacent leaf
+//     pair until the count is back under the cap (a defensive collapse
+//     step covers any pair a table leaf blocked).
+//
+// The invariant: for any tree with > maxLeaves MERGEABLE leaves,
+// capLeafSections drives countLeafSections to <= maxLeaves. The only
+// leaves it will not merge are table sections (Metadata["table"]=="true")
+// — those carry distinct numeric content and must survive verbatim. In
+// the normal parse flow table sections are attached AFTER this pass
+// (attachTableSections), so the cap never sees them; the guard is
+// defensive for any caller that pre-attaches tables.
+//
+// Merged/collapsed leaves concatenate their content (blank-line
+// separated), keep the parent/first-sibling title, and union their page
+// ranges, so no body text is ever dropped.
 func capLeafSections(sections []Section, maxLeaves int) []Section {
 	if maxLeaves <= 0 {
 		return sections
 	}
-	// Guard against pathological loops: at most one merge per excess leaf.
-	for guard := 0; countLeafSections(sections) > maxLeaves && guard < 100000; guard++ {
-		if !mergeOneSmallestAdjacentLeafPair(sections) {
-			break // no mergeable adjacent leaf pair anywhere
+	if countLeafSections(sections) <= maxLeaves {
+		return sections // already under the cap — leave structure untouched
+	}
+
+	// Wrap the top-level sections under a synthetic root. The merge step
+	// shrinks a sibling list in place (it rewrites parent.Children), which
+	// only propagates back to the caller when the mutated list is a struct
+	// FIELD, not a bare slice parameter. The runaway shape we're fixing —
+	// hundreds of single-leaf PARENTS — needs the TOP-level list to shrink,
+	// so we make the top level a nested list (root.Children) and operate on
+	// that; the wrapper itself (length 1) never changes. We return
+	// root.Children at the end.
+	root := []Section{{Children: sections}}
+
+	// Phase 1: flatten single-leaf-parent chains so only-children become
+	// mergeable siblings. Count is unchanged; structure is normalised.
+	root[0].Children = collapseSingleLeafParents(root[0].Children)
+
+	// Phase 2: merge adjacent leaf pairs (smallest first) until under cap.
+	// Each merge removes one leaf and each fallback collapse removes one
+	// internal node — both strictly monotonic — so the loop is bounded;
+	// the guard is belt-and-braces against a logic regression.
+	for guard := 0; countLeafSections(root) > maxLeaves && guard < 1000000; guard++ {
+		if mergeOneSmallestAdjacentLeafPair(root) {
+			continue
+		}
+		// No adjacent mergeable pair left. Try to free one up by
+		// collapsing a remaining single-leaf parent; if that's impossible
+		// too, every remaining leaf is unmergeable (e.g. table sections)
+		// and we stop — there is nothing left we're permitted to merge.
+		if !collapseOneSingleLeafParent(&root) {
+			break
+		}
+	}
+	return root[0].Children
+}
+
+// isTableLeaf reports whether s is a table section that must never be
+// merged or collapsed (its cell content is distinct numeric data).
+func isTableLeaf(s *Section) bool {
+	return s.Metadata != nil && s.Metadata["table"] == "true"
+}
+
+// collapseSingleLeafParents recursively flattens every parent that has
+// exactly one child which is a (non-table) leaf: the parent absorbs the
+// child's content + page range and itself becomes the leaf. Chains
+// (heading -> subheading -> body) collapse fully in one bottom-up pass.
+//
+// Leaf count is preserved (one internal node + one leaf become one leaf);
+// the point is purely to turn only-children into adjacent siblings so the
+// adjacent-pair merge can then reduce the count. Returns the rewritten
+// slice.
+func collapseSingleLeafParents(sections []Section) []Section {
+	for i := range sections {
+		s := &sections[i]
+		if len(s.Children) == 0 {
+			continue
+		}
+		// Bottom-up: collapse within the children first so a chain folds
+		// from the leaf upward.
+		s.Children = collapseSingleLeafParents(s.Children)
+		if len(s.Children) == 1 && len(s.Children[0].Children) == 0 && !isTableLeaf(&s.Children[0]) {
+			absorbChildIntoParent(s, s.Children[0])
+			s.Children = nil
 		}
 	}
 	return sections
 }
 
+// collapseOneSingleLeafParent finds the first parent in the tree with
+// exactly one (non-table) leaf child and collapses it (parent absorbs the
+// leaf, becomes a leaf). Returns false when no such parent exists. Used
+// as a defensive fallback inside the merge loop when an adjacent pair was
+// blocked by a table leaf; the bulk flattening is done up front by
+// collapseSingleLeafParents.
+func collapseOneSingleLeafParent(sections *[]Section) bool {
+	s := *sections
+	for i := range s {
+		n := &s[i]
+		if len(n.Children) == 1 && len(n.Children[0].Children) == 0 && !isTableLeaf(&n.Children[0]) {
+			absorbChildIntoParent(n, n.Children[0])
+			n.Children = nil
+			return true
+		}
+		if len(n.Children) > 0 {
+			if collapseOneSingleLeafParent(&n.Children) {
+				return true
+			}
+		}
+	}
+	return false
+}
+
+// absorbChildIntoParent folds a leaf child's content and page range up
+// into its parent. The parent keeps its own title (the heading) and
+// gains the child's body; an empty parent body is replaced outright so we
+// don't prefix a stray separator. Page ranges union (min start, max end).
+func absorbChildIntoParent(parent *Section, child Section) {
+	switch {
+	case strings.TrimSpace(parent.Content) == "":
+		parent.Content = child.Content
+	case strings.TrimSpace(child.Content) != "":
+		parent.Content = parent.Content + "\n\n" + child.Content
+	}
+	parent.PageStart = minNonZero(parent.PageStart, child.PageStart)
+	if child.PageEnd > parent.PageEnd {
+		parent.PageEnd = child.PageEnd
+	}
+}
+
 // mergeOneSmallestAdjacentLeafPair finds the adjacent leaf-sibling pair
 // with the smallest combined content length anywhere in the tree and
-// merges it in place. Returns false when no sibling list has two
-// adjacent leaves to merge.
+// merges it in place. Only pairs where BOTH siblings are NON-table leaves
+// are eligible — table sections are never merged. Returns false when no
+// sibling list has two adjacent mergeable leaves.
 func mergeOneSmallestAdjacentLeafPair(sections []Section) bool {
 	bestList := (*[]Section)(nil)
 	bestIdx := -1
@@ -642,7 +766,8 @@ func mergeOneSmallestAdjacentLeafPair(sections []Section) bool {
 	walk = func(list *[]Section) {
 		s := *list
 		for i := 0; i+1 < len(s); i++ {
-			if len(s[i].Children) == 0 && len(s[i+1].Children) == 0 {
+			if len(s[i].Children) == 0 && len(s[i+1].Children) == 0 &&
+				!isTableLeaf(&s[i]) && !isTableLeaf(&s[i+1]) {
 				size := len(s[i].Content) + len(s[i+1].Content)
 				if bestSize < 0 || size < bestSize {
 					bestSize, bestList, bestIdx = size, list, i