From 084abc826af44440716287fb4b802dbb6ea91248 Mon Sep 17 00:00:00 2001 From: Halleluyah Oludele Date: Fri, 29 May 2026 17:13:49 +0100 Subject: [PATCH 1/3] build(deps): bump pdftable to v0.3.1 v0.3.1 ships the grid-indexed cell finder (the O(n^2/n^3) -> O(cells) fix), so table extraction no longer degrades pathologically on dense financial pages. Full table-extraction feature stays on; this only makes it faster/bounded. --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 130314e..27b1397 100644 --- a/go.mod +++ b/go.mod @@ -13,7 +13,7 @@ require ( github.com/go-chi/chi/v5 v5.2.5 github.com/google/uuid v1.6.0 github.com/hallelx2/llmgate v0.2.0 - github.com/hallelx2/pdftable v0.3.0 + github.com/hallelx2/pdftable v0.3.1 github.com/hibiken/asynq v0.26.0 github.com/jackc/pgx/v5 v5.9.2 github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 diff --git a/go.sum b/go.sum index d52b49a..462ec48 100644 --- a/go.sum +++ b/go.sum @@ -134,8 +134,8 @@ github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1 h1:X5VWvz21y3gzm9Nw/kaUeku/1+u github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1/go.mod h1:Zanoh4+gvIgluNqcfMVTJueD4wSS5hT7zTt4Mrutd90= github.com/hallelx2/llmgate v0.2.0 h1:x/LNCeHUPZpafn2IXi+LqpnZa7TtEQdLVlpkkJTlzBI= github.com/hallelx2/llmgate v0.2.0/go.mod h1:MK2Ol/5CIweTQ2/9eSiTJ5g/KSSuobNZL9TD3s57JxY= -github.com/hallelx2/pdftable v0.3.0 h1:SwZPu2z4cIR4R30gP+7bpunGh931StjO1vrsxoldiDw= -github.com/hallelx2/pdftable v0.3.0/go.mod h1:pxNlc4D43wjzis7M6EfgQZvHOsQ4okggm+xqUu+OokI= +github.com/hallelx2/pdftable v0.3.1 h1:Uqe+9G8s9jrGYwxk8dEMXBCB+SlzvWPmW0Ze5863W1I= +github.com/hallelx2/pdftable v0.3.1/go.mod h1:pxNlc4D43wjzis7M6EfgQZvHOsQ4okggm+xqUu+OokI= github.com/hhrutter/lzw v1.0.0 h1:laL89Llp86W3rRs83LvKbwYRx6INE8gDn0XNb1oXtm0= github.com/hhrutter/lzw v1.0.0/go.mod h1:2HC6DJSn/n6iAZfgM3Pg+cP1KxeWc3ezG8bBqW5+WEo= github.com/hhrutter/pkcs7 v0.2.2 h1:xMoifoVWah1LNym3C0pomEiLmyJyVIBXt/8oTPyPz+8= From 0cda733ae87dee1786602f81692d83cc9cf68e84 Mon Sep 17 00:00:00 2001 From: Halleluyah Oludele Date: Fri, 29 May 2026 17:30:13 +0100 Subject: [PATCH 2/3] fix(ingest): bound the whole PDF parse with a configurable deadline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A 10-K was observed hanging 600s+ in `parsing` even in minimal mode: the hang is in ledongthuc row extraction (extractPDFRows -> reader.Page(n).Content()), which is pure-Go and pre-LLM, so none of the existing per-stage table-extraction budgets bound it. Wrap the entire PDF parse (row extraction, table extraction, section building, leaf cap) in a deadline. The work runs on a goroutine; on timeout or ctx cancellation Parse returns a clear error and abandons the goroutine (buffered result channel, so no leak on send; a panic in the work is recovered). This is the same abandon-on-deadline pattern safeExtractTables already uses for a single table page, lifted to cover the whole parse so ANY parse pathology fails fast and cleanly instead of wedging ingest. The ingest pipeline already treats a parse error as a doc-level failure, so the document goes to `failed` and is visible to ops/bench rather than hanging forever. Nothing is disabled: the full feature set (LLM TOC, tables, summarize, HyDE, multi-axis) still runs — parse is merely bounded. Config: IngestConfig.ParseTimeoutSeconds (default 120). Env VLE_INGEST_PARSE_TIMEOUT_SECONDS; the server binary forwards VLS_/VLE_INGEST_PARSE_TIMEOUT_SECONDS. Also threads the existing ingest.max_sections through to the parser (previously dropped on the floor) via RegistryFromIngestParams, so both robustness valves are operator-tunable. A negative value disables the bound (escape hatch). Tests: the deadline mechanism returns a timeout error in ~the timeout (not after a 10s sleep), passes fast work through, propagates real parse errors, runs inline when disabled, honours ctx cancel, and recovers panics; plus config default/env-override/validate coverage. --- cmd/engine/main.go | 2 +- cmd/server/main.go | 2 +- config.example.yaml | 33 +++++ internal/config/config.go | 11 +- pkg/config/config.go | 31 ++++ pkg/config/config_test.go | 32 ++++ pkg/ingest/ingest.go | 28 +++- pkg/parser/pdf.go | 137 +++++++++++++++++- pkg/parser/pdf_parse_timeout_test.go | 209 +++++++++++++++++++++++++++ 9 files changed, 474 insertions(+), 11 deletions(-) create mode 100644 pkg/parser/pdf_parse_timeout_test.go diff --git a/cmd/engine/main.go b/cmd/engine/main.go index 8da6abf..edfc821 100644 --- a/cmd/engine/main.go +++ b/cmd/engine/main.go @@ -172,7 +172,7 @@ func run() error { DB: pool, Storage: store, LLM: llmClient, - Parsers: ingest.RegistryFromTableOpts(tableOptsFromConfig(cfg.Ingest.Tables)), + Parsers: ingest.RegistryFromIngestParams(tableOptsFromConfig(cfg.Ingest.Tables), cfg.Ingest.MaxSections, time.Duration(cfg.Ingest.ParseTimeoutSeconds)*time.Second), Logger: logger, Mode: cfg.Ingest.Mode, HyDEEnabled: cfg.Ingest.HyDE.Enabled, diff --git a/cmd/server/main.go b/cmd/server/main.go index c14b608..2430862 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -198,7 +198,7 @@ func run() error { DB: pool, Storage: store, LLM: llmClient, - Parsers: ingest.RegistryFromTableOpts(tableOptsFromConfig(cfg.Engine.Ingest.Tables)), + Parsers: ingest.RegistryFromIngestParams(tableOptsFromConfig(cfg.Engine.Ingest.Tables), cfg.Engine.Ingest.MaxSections, time.Duration(cfg.Engine.Ingest.ParseTimeoutSeconds)*time.Second), Logger: logger, Mode: cfg.Engine.Ingest.Mode, HyDEEnabled: cfg.Engine.Ingest.HyDE.Enabled, diff --git a/config.example.yaml b/config.example.yaml index 31262d7..6bec472 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -320,6 +320,39 @@ ingest: # vectorless-server use VLS_INGEST_MODE=minimal (no secret edit needed). mode: "full" + # Total-parse timeout (seconds). Bounds the ENTIRE parse of one + # document end to end — row extraction, table extraction, section + # building, and the leaf-section cap. It is the outermost robustness + # valve: a pathological/malformed PDF (observed: a 10-K stuck 600s+ in + # `parsing`, even in minimal mode, inside pure-Go row extraction) is + # abandoned at the deadline and the document fails fast instead of + # wedging the pipeline forever. NOTHING is disabled by this bound — the + # full feature set (LLM TOC, tables, summarize, HyDE, multi-axis) still + # runs; parse is merely time-boxed. Applies in BOTH full and minimal + # mode (parse runs in both). + # + # 120 is comfortably longer than a healthy 300-page filing's parse + # (seconds to low tens of seconds) yet short enough to reap a hang + # quickly. 0 uses the engine default (120). Override per-process with + # VLE_INGEST_PARSE_TIMEOUT_SECONDS; the deployed server also honours + # VLS_INGEST_PARSE_TIMEOUT_SECONDS. + parse_timeout_seconds: 120 + + # Cap on the number of leaf sections one document may produce. A + # pathological PDF (e.g. a 90-page 10-K whose every bold statement + # title trips the heading detector, or a heading→one-body-leaf chain + # repeated hundreds of times) can shatter into far more leaves than the + # document has real sections — each leaf then costs a summarize + HyDE + # + multi-axis LLM call, which is what throttles/stalls full ingest. + # When the parsed leaf count exceeds this cap the parser merges + # sections (smallest first; single-leaf parents collapse into their + # parent) until the document is back under the cap, preserving content + # and never merging table sections. 400 sits comfortably above a real + # filing's section count while still catching the runaway case. 0 uses + # the engine default (400); a negative value disables the cap. Override + # with VLE_INGEST_MAX_SECTIONS. + max_sections: 400 + # The summarize and HyDE stages run concurrently. This caps the total # number of LLM calls in flight across both stages combined, so the # provider's per-tenant concurrency limit isn't exceeded. 0 disables diff --git a/internal/config/config.go b/internal/config/config.go index 1646af0..8a2ed78 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -205,7 +205,7 @@ func Default() Config { MaxAge: 86400, }, Governance: GovernanceConfig{ - MaxBodySizeBytes: 33554432, // 32 MiB + MaxBodySizeBytes: 33554432, // 32 MiB DefaultTimeout: 30 * time.Second, QueryTimeout: 120 * time.Second, }, @@ -326,6 +326,15 @@ func applyEnvOverrides(c *Config) { if v := firstEnv("VLS_INGEST_MODE", "VLE_INGEST_MODE"); v != "" { c.Engine.Ingest.Mode = v } + // Total-parse timeout (seconds). Forwarded so the deployed server + // honours a tuned parse deadline without a secret/config edit — the + // outermost robustness valve against a parse that hangs (pre-LLM, + // pure-Go row extraction). VLS_-prefixed wins over VLE_. + if v := firstEnv("VLS_INGEST_PARSE_TIMEOUT_SECONDS", "VLE_INGEST_PARSE_TIMEOUT_SECONDS"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n >= 0 { + c.Engine.Ingest.ParseTimeoutSeconds = n + } + } // Anthropic-compatible gateway overrides (e.g. GLM/Zhipu via // https://api.z.ai/api/anthropic): base URL + model, so the // anthropic driver can run a non-Anthropic model without a secret diff --git a/pkg/config/config.go b/pkg/config/config.go index 8c7c327..8a6b30e 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -125,6 +125,28 @@ type IngestConfig struct { // section count (~170-510 with tables) while still catching the // runaway case. A negative value is rejected by Validate. MaxSections int `yaml:"max_sections"` + + // ParseTimeoutSeconds bounds the ENTIRE parse of a single document — + // row extraction, table extraction, section building, and the leaf + // cap, end to end. It is the outermost robustness valve: every + // per-stage timeout inside the parser (per-page / doc-wide table + // budgets) is bounded by something pre-LLM, but pure-Go row extraction + // (ledongthuc's reader.Page(n).Content()) had no bound, so a + // pathological PDF (observed: a 10-K stuck 600s+ in `parsing` even in + // minimal mode) could hang the parse forever. + // + // When the whole parse exceeds this deadline the parser abandons the + // work and returns a clear error; the ingest pipeline treats it like + // any other parse failure (the document goes to `failed`), so a doc + // that can't parse in time fails fast and is visible to ops/bench + // rather than wedging the pipeline. NOTHING is disabled — the full + // feature set (LLM TOC, tables, summarize, HyDE, multi-axis) still + // runs; parse is merely bounded. + // + // 0 (or omitted) defaults to 120. A negative value is rejected by + // Validate. Engine env override: VLE_INGEST_PARSE_TIMEOUT_SECONDS; + // the server binary also forwards VLS_/VLE_INGEST_PARSE_TIMEOUT_SECONDS. + ParseTimeoutSeconds int `yaml:"parse_timeout_seconds"` } // TOCBlock configures the LLM-driven table-of-contents tree @@ -728,6 +750,7 @@ func Default() Config { GlobalLLMConcurrency: 12, LLMCallTimeoutSeconds: 90, MaxSections: 400, + ParseTimeoutSeconds: 120, HyDE: HyDEConfig{ Enabled: true, NumQuestions: 5, @@ -911,6 +934,11 @@ func applyEnvOverrides(c *Config) { c.Ingest.MaxSections = n } } + if v := os.Getenv("VLE_INGEST_PARSE_TIMEOUT_SECONDS"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n >= 0 { + c.Ingest.ParseTimeoutSeconds = n + } + } // pdftable-driven table extraction. if v := os.Getenv("VLE_INGEST_TABLES_ENABLED"); v != "" { switch strings.ToLower(strings.TrimSpace(v)) { @@ -1200,6 +1228,9 @@ func (c Config) Validate() error { if c.Ingest.MaxSections < 0 { return fmt.Errorf("ingest.max_sections must be >= 0, got %d", c.Ingest.MaxSections) } + if c.Ingest.ParseTimeoutSeconds < 0 { + return fmt.Errorf("ingest.parse_timeout_seconds must be >= 0, got %d", c.Ingest.ParseTimeoutSeconds) + } switch c.Ingest.Tables.VerticalStrategy { case "", "lines", "lines_strict", "text", "explicit": diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index d8172b2..8830dde 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -82,6 +82,38 @@ func TestDefaultValues(t *testing.T) { if cfg.Ingest.TOC.TOCCheckPages != 20 { t.Errorf("ingest.toc.toc_check_pages = %d, want 20", cfg.Ingest.TOC.TOCCheckPages) } + if cfg.Ingest.ParseTimeoutSeconds != 120 { + t.Errorf("ingest.parse_timeout_seconds = %d, want 120", cfg.Ingest.ParseTimeoutSeconds) + } + if cfg.Ingest.MaxSections != 400 { + t.Errorf("ingest.max_sections = %d, want 400", cfg.Ingest.MaxSections) + } +} + +// TestIngestParseTimeoutEnvOverride covers VLE_INGEST_PARSE_TIMEOUT_SECONDS +// — the operator knob that lets a tuned whole-parse deadline reach the +// parser without a config-file edit. +func TestIngestParseTimeoutEnvOverride(t *testing.T) { + prev := os.Getenv("VLE_INGEST_PARSE_TIMEOUT_SECONDS") + defer os.Setenv("VLE_INGEST_PARSE_TIMEOUT_SECONDS", prev) + + os.Setenv("VLE_INGEST_PARSE_TIMEOUT_SECONDS", "300") + cfg := Default() + applyEnvOverrides(&cfg) + if cfg.Ingest.ParseTimeoutSeconds != 300 { + t.Errorf("ingest.parse_timeout_seconds = %d, want 300", cfg.Ingest.ParseTimeoutSeconds) + } +} + +// TestIngestParseTimeoutValidate rejects a negative parse timeout — a +// non-positive deadline would silently disable the bound, which must be +// an explicit choice, not a typo that slips through Load. +func TestIngestParseTimeoutValidate(t *testing.T) { + cfg := Default() + cfg.Ingest.ParseTimeoutSeconds = -1 + if err := cfg.Validate(); err == nil { + t.Error("Validate should reject a negative ingest.parse_timeout_seconds") + } } // TestIngestModeDefault locks the default ingest mode to "full" so the diff --git a/pkg/ingest/ingest.go b/pkg/ingest/ingest.go index a840acc..a9286e4 100644 --- a/pkg/ingest/ingest.go +++ b/pkg/ingest/ingest.go @@ -1122,15 +1122,35 @@ func DefaultRegistry() *parser.Registry { } // RegistryFromTableOpts returns a parser.Registry where the PDF parser -// is configured from the supplied TableOpts. Pass nil to disable table -// extraction entirely; pass parser.DefaultTableOpts() (or a custom set) -// to enable. All non-PDF parsers are constructed at their defaults. +// is configured from the supplied TableOpts, with the leaf-section cap +// and total-parse timeout left at their parser defaults (400 sections, +// 120s). Pass nil to disable table extraction entirely; pass +// parser.DefaultTableOpts() (or a custom set) to enable. All non-PDF +// parsers are constructed at their defaults. +// +// Use RegistryFromIngestParams to thread an operator-tuned cap / parse +// timeout from config. func RegistryFromTableOpts(opts *parser.TableOpts) *parser.Registry { + return RegistryFromIngestParams(opts, 0, 0) +} + +// RegistryFromIngestParams returns a parser.Registry where the PDF parser +// is configured from the supplied TableOpts AND the operator-tuned +// leaf-section cap and total-parse timeout (ingest.max_sections, +// ingest.parse_timeout_seconds). maxSections == 0 / parseTimeout == 0 +// each select the parser's built-in default; a negative value disables +// that bound. All non-PDF parsers are constructed at their defaults. +// +// This is the constructor the engine/server wiring uses so the parse +// deadline and section cap from config actually reach the parser — the +// outermost robustness valves for full-feature ingest. Table extraction, +// the section tree, and the cap all still run; they are merely bounded. +func RegistryFromIngestParams(opts *parser.TableOpts, maxSections int, parseTimeout time.Duration) *parser.Registry { return parser.NewRegistry( parser.NewMarkdown(), parser.NewHTML(), parser.NewDOCX(), - parser.NewPDFWithTables(opts), + parser.NewPDFWithConfig(opts, maxSections, parseTimeout), parser.NewText(), ) } diff --git a/pkg/parser/pdf.go b/pkg/parser/pdf.go index bf120dd..579cf2a 100644 --- a/pkg/parser/pdf.go +++ b/pkg/parser/pdf.go @@ -64,6 +64,30 @@ type PDF struct { // Zero selects defaultMaxLeafSections. A negative value disables the // cap entirely (escape hatch for callers that want the raw outline). MaxSections int + + // ParseTimeout bounds the ENTIRE Parse of one document — row + // extraction, table extraction, section building, and the leaf cap, + // end to end. It is the outermost robustness valve. + // + // Every other time bound inside the parser caps a sub-stage + // (tableExtractPageTimeout / tableExtractDocBudget cap table + // extraction), but the pure-Go row extractor + // (extractPDFRows -> reader.Page(n).Content()) had no bound at all — + // and that is exactly where a pathological PDF was observed hanging + // 600s+ in `parsing`, even in minimal mode (pre-LLM). ParseTimeout + // runs the whole parse on a goroutine and abandons it on deadline, + // returning a clear error so ingest fails the document fast instead + // of wedging forever. + // + // Nothing is disabled by this bound: when the parse finishes inside + // the deadline the full feature set (tables, the outline/heuristic + // section tree, the cap) is produced exactly as before. + // + // Zero selects defaultParseTimeout (120s). A non-positive value other + // than zero (i.e. negative) disables the bound entirely — Parse then + // runs synchronously with no deadline (escape hatch / legacy + // behaviour for callers that want an unbounded parse). + ParseTimeout time.Duration } // TableOpts controls pdftable's table-finding stage. The zero value @@ -119,13 +143,24 @@ func NewPDFWithTables(opts *TableOpts) *PDF { return &PDF{Tables: opts} } // NewPDFWithOpts returns a PDF parser using the supplied table-extraction // options and an explicit leaf-section cap. maxSections == 0 selects -// defaultMaxLeafSections; a negative value disables the cap. This is the -// constructor the engine wiring uses so the cap is operator-tunable via -// config (ingest.max_sections). +// defaultMaxLeafSections; a negative value disables the cap. The parse +// timeout is left at its zero value, which resolves to defaultParseTimeout. func NewPDFWithOpts(opts *TableOpts, maxSections int) *PDF { return &PDF{Tables: opts, MaxSections: maxSections} } +// NewPDFWithConfig returns a PDF parser with the table options, leaf- +// section cap, AND total-parse timeout all set explicitly. This is the +// constructor the engine wiring uses so every parser robustness knob is +// operator-tunable via config (ingest.max_sections, +// ingest.parse_timeout_seconds). +// +// - maxSections == 0 → defaultMaxLeafSections; negative disables the cap. +// - parseTimeout == 0 → defaultParseTimeout; negative disables the bound. +func NewPDFWithConfig(opts *TableOpts, maxSections int, parseTimeout time.Duration) *PDF { + return &PDF{Tables: opts, MaxSections: maxSections, ParseTimeout: parseTimeout} +} + // Name implements Parser. func (*PDF) Name() string { return "pdf" } @@ -138,11 +173,87 @@ func (*PDF) Accepts(contentType, filename string) bool { } // Parse implements Parser. -func (p *PDF) Parse(_ context.Context, r io.Reader) (*ParsedDoc, error) { +// +// Parse is a thin timeout wrapper around the real parse work (parseDoc). +// It bounds the WHOLE parse — row extraction, table extraction, section +// building, and the leaf cap — by p.resolvedParseTimeout(). The work runs +// on a goroutine; if it doesn't finish by the deadline (or ctx is +// cancelled) Parse returns a clear error and abandons the goroutine. The +// goroutine then finishes on its own (or is GC'd) — its result lands in a +// buffered channel so it never blocks on send. This is the same +// abandon-on-deadline pattern safeExtractTables already uses for a single +// table page, lifted to cover the entire parse so ANY parse pathology +// (notably the unbounded pure-Go ledongthuc row extractor) fails fast and +// cleanly instead of hanging ingest forever. +// +// A negative ParseTimeout disables the bound and runs parseDoc inline. +func (p *PDF) Parse(ctx context.Context, r io.Reader) (*ParsedDoc, error) { + // Read the bytes up front (outside the deadline goroutine): io.ReadAll + // is bounded by the reader/storage layer, not by the PDF pathology we + // are guarding against, and reading here keeps the goroutine body a + // pure CPU/parse unit. buf, err := io.ReadAll(r) if err != nil { return nil, err } + return runParseWithDeadline(ctx, p.resolvedParseTimeout(), func() (*ParsedDoc, error) { + return p.parseDoc(ctx, buf) + }) +} + +// runParseWithDeadline runs work bounded by timeout. It is the reusable +// core of the whole-Parse robustness valve, factored out so the +// deadline/abandon mechanism is unit-testable with an arbitrary work +// closure (e.g. one that sleeps past the deadline) without needing a +// pathological real PDF. +// +// - timeout <= 0 runs work inline with NO bound (legacy/unbounded +// behaviour; the explicit escape hatch a negative ParseTimeout selects). +// - Otherwise work runs on a goroutine and the function selects on the +// work result, a time.After(timeout), and ctx cancellation. On +// timeout/cancel the goroutine is abandoned: its result lands in a +// buffered channel so it can always send and exit (no leak on send), +// then runs to completion on its own and is collected. A panic inside +// work is recovered and surfaced as an error so a backend bug can +// never crash the ingest worker. +// +// The timeout/cancel errors are deliberately phrased so the ingest +// pipeline's existing "parse failed → document failed" path produces a +// clear, ops-visible message instead of an infinite hang. +func runParseWithDeadline(ctx context.Context, timeout time.Duration, work func() (*ParsedDoc, error)) (*ParsedDoc, error) { + if timeout <= 0 { + return work() + } + + type result struct { + doc *ParsedDoc + err error + } + done := make(chan result, 1) + go func() { + defer func() { + if rec := recover(); rec != nil { + done <- result{err: fmt.Errorf("pdf: parse panicked: %v", rec)} + } + }() + doc, perr := work() + done <- result{doc: doc, err: perr} + }() + + select { + case res := <-done: + return res.doc, res.err + case <-time.After(timeout): + return nil, fmt.Errorf("pdf: parse exceeded %s — document too complex or malformed", timeout) + case <-ctx.Done(): + return nil, fmt.Errorf("pdf: parse cancelled: %w", ctx.Err()) + } +} + +// parseDoc is the real parse implementation. It is bounded by Parse's +// deadline wrapper; on its own it has no time bound beyond the per-stage +// table-extraction budgets. +func (p *PDF) parseDoc(_ context.Context, buf []byte) (*ParsedDoc, error) { // We run TWO PDF backends in parallel here: // @@ -407,6 +518,24 @@ func (p *PDF) resolvedMaxSections() int { return p.MaxSections } +// defaultParseTimeout is the whole-Parse deadline applied when +// ParseTimeout is left at its zero value. 120s is comfortably longer than +// a healthy 300-page filing's parse (seconds to low tens of seconds) yet +// short enough that a pathological/malformed document — the kind observed +// hanging 600s+ in pure-Go row extraction — is reaped quickly and the +// document fails fast instead of wedging ingest. +const defaultParseTimeout = 120 * time.Second + +// resolvedParseTimeout turns the configured ParseTimeout into the value +// the wrapper actually uses: 0 selects defaultParseTimeout; a negative +// value disables the bound (Parse runs parseDoc inline with no deadline). +func (p *PDF) resolvedParseTimeout() time.Duration { + if p.ParseTimeout == 0 { + return defaultParseTimeout + } + return p.ParseTimeout +} + // propagateSectionPages fills internal-node PageStart/PageEnd from the union // of descendant leaf ranges where the internal node didn't have its own // (because its body was empty / hoisted into children). Leaves keep their diff --git a/pkg/parser/pdf_parse_timeout_test.go b/pkg/parser/pdf_parse_timeout_test.go new file mode 100644 index 0000000..9932c75 --- /dev/null +++ b/pkg/parser/pdf_parse_timeout_test.go @@ -0,0 +1,209 @@ +package parser + +import ( + "bytes" + "context" + "errors" + "os" + "path/filepath" + "strings" + "testing" + "time" +) + +// TestRunParseWithDeadline_TimesOutDoesNotHang is the core robustness +// assertion: when the parse work runs longer than the deadline, the +// wrapper returns the timeout error in ~the timeout — NOT after the work +// finishes (which here is 10s, far longer than any CI step should wait). +// This is the exact pathology the whole-Parse deadline guards against: a +// pure-Go row extractor (or any backend) that never returns must fail +// fast instead of wedging ingest forever. +func TestRunParseWithDeadline_TimesOutDoesNotHang(t *testing.T) { + const timeout = 50 * time.Millisecond + const workDuration = 10 * time.Second // simulates the unbounded hang + + started := make(chan struct{}) + work := func() (*ParsedDoc, error) { + close(started) + time.Sleep(workDuration) // never returns within the deadline + return &ParsedDoc{Title: "should never be observed"}, nil + } + + start := time.Now() + doc, err := runParseWithDeadline(context.Background(), timeout, work) + elapsed := time.Since(start) + + <-started // the work goroutine did start (we really are abandoning it) + + if err == nil { + t.Fatalf("expected a timeout error, got doc=%v err=nil", doc) + } + if doc != nil { + t.Errorf("expected nil doc on timeout, got %v", doc) + } + // The error must clearly name the timeout so ops see why the doc failed. + if !strings.Contains(err.Error(), "parse exceeded") { + t.Errorf("error %q does not mention the parse-exceeded cause", err.Error()) + } + // Critically: we returned in ~timeout, not in ~workDuration. Generous + // upper bound (still 20x the timeout but 5x under the work sleep) keeps + // the test robust on a loaded CI box while still proving "doesn't hang". + if elapsed > time.Second { + t.Errorf("runParseWithDeadline took %s — it waited for the work instead of the deadline (hang not bounded)", elapsed) + } +} + +// TestRunParseWithDeadline_FastWorkPassesThrough confirms the happy path: +// work that finishes well inside the deadline has its result returned +// verbatim, untouched by the wrapper. +func TestRunParseWithDeadline_FastWorkPassesThrough(t *testing.T) { + want := &ParsedDoc{Title: "quick"} + doc, err := runParseWithDeadline(context.Background(), time.Second, func() (*ParsedDoc, error) { + return want, nil + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if doc != want { + t.Errorf("doc = %v, want %v", doc, want) + } +} + +// TestRunParseWithDeadline_PropagatesWorkError confirms a genuine parse +// error from the work (the common "bad PDF" case) is returned as-is, not +// masked by the timeout machinery. +func TestRunParseWithDeadline_PropagatesWorkError(t *testing.T) { + sentinel := errors.New("pdf: open: boom") + doc, err := runParseWithDeadline(context.Background(), time.Second, func() (*ParsedDoc, error) { + return nil, sentinel + }) + if !errors.Is(err, sentinel) { + t.Fatalf("err = %v, want sentinel %v", err, sentinel) + } + if doc != nil { + t.Errorf("doc = %v, want nil", doc) + } +} + +// TestRunParseWithDeadline_DisabledRunsInline confirms a non-positive +// timeout disables the bound entirely: the work runs inline (no +// goroutine, no deadline) and its result is returned even though the +// "work" takes a beat. This is the explicit escape hatch a negative +// ParseTimeout selects (legacy unbounded behaviour). +func TestRunParseWithDeadline_DisabledRunsInline(t *testing.T) { + want := &ParsedDoc{Title: "inline"} + doc, err := runParseWithDeadline(context.Background(), 0, func() (*ParsedDoc, error) { + time.Sleep(20 * time.Millisecond) + return want, nil + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if doc != want { + t.Errorf("doc = %v, want %v", doc, want) + } +} + +// TestRunParseWithDeadline_ContextCancel confirms a cancelled context +// unblocks the wrapper promptly even when the work is still running and +// the (long) timeout has not fired — so a shutdown/cancel propagates +// instead of waiting out the full parse deadline. +func TestRunParseWithDeadline_ContextCancel(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + go func() { + time.Sleep(20 * time.Millisecond) + cancel() + }() + + start := time.Now() + doc, err := runParseWithDeadline(ctx, time.Hour, func() (*ParsedDoc, error) { + time.Sleep(10 * time.Second) + return &ParsedDoc{}, nil + }) + elapsed := time.Since(start) + + if err == nil { + t.Fatalf("expected a cancellation error, got doc=%v err=nil", doc) + } + if !errors.Is(err, context.Canceled) { + t.Errorf("err = %v, want it to wrap context.Canceled", err) + } + if elapsed > time.Second { + t.Errorf("cancellation took %s — wrapper did not honour ctx promptly", elapsed) + } +} + +// TestRunParseWithDeadline_RecoversPanic confirms a panic deep inside the +// parse work is recovered and surfaced as an error rather than crashing +// the ingest worker process. +func TestRunParseWithDeadline_RecoversPanic(t *testing.T) { + doc, err := runParseWithDeadline(context.Background(), time.Second, func() (*ParsedDoc, error) { + panic("backend exploded") + }) + if err == nil { + t.Fatalf("expected an error from a panicking work, got doc=%v err=nil", doc) + } + if !strings.Contains(err.Error(), "panicked") { + t.Errorf("error %q does not indicate a recovered panic", err.Error()) + } +} + +// TestResolvedParseTimeout covers the zero/default, explicit, and +// disabled (negative) resolutions of the configured ParseTimeout. +func TestResolvedParseTimeout(t *testing.T) { + if got := (&PDF{}).resolvedParseTimeout(); got != defaultParseTimeout { + t.Errorf("zero ParseTimeout: got %s, want default %s", got, defaultParseTimeout) + } + if got := (&PDF{ParseTimeout: 5 * time.Second}).resolvedParseTimeout(); got != 5*time.Second { + t.Errorf("explicit ParseTimeout: got %s, want 5s", got) + } + if got := (&PDF{ParseTimeout: -1}).resolvedParseTimeout(); got > 0 { + t.Errorf("negative ParseTimeout should stay non-positive (bound disabled), got %s", got) + } +} + +// TestPDFParse_HonoursTinyTimeout drives the FULL Parse path (not just the +// helper) with a deliberately tiny deadline and a real PDF whose parse +// takes longer than that deadline, proving Parse itself returns the +// timeout error promptly rather than hanging. +// +// We reuse the table fixture (a real ruled-table PDF). Even its modest +// parse comfortably exceeds a 1ms deadline, so the wrapper fires. If the +// fixture is ever made trivially fast, the assertion that we either time +// out OR succeed (never hang) still holds — the test's job is to prove the +// wrapper is wired into Parse and bounds it, which the elapsed-time guard +// enforces. +func TestPDFParse_HonoursTinyTimeout(t *testing.T) { + b := readParserFixture(t, "tables-example.pdf") + p := &PDF{Tables: DefaultTableOpts(), ParseTimeout: time.Millisecond} + + start := time.Now() + _, err := p.Parse(context.Background(), bytes.NewReader(b)) + elapsed := time.Since(start) + + // The whole point: Parse returned quickly. A real parse of this fixture + // without the bound takes longer than 1ms, so on any normal machine we + // expect the timeout error; either way Parse MUST NOT hang. + if elapsed > 5*time.Second { + t.Fatalf("Parse took %s with a 1ms ParseTimeout — the deadline is not wired into Parse", elapsed) + } + if err != nil && !strings.Contains(err.Error(), "parse exceeded") { + // A non-timeout error (e.g. the parse genuinely finished first on a + // very fast box and returned a real result with err==nil) is fine; + // only an unexpected error shape is worth flagging. + t.Logf("Parse returned non-timeout error (acceptable if parse beat the 1ms deadline): %v", err) + } +} + +// readParserFixture mirrors the external test's readFixture but lives in +// the white-box package so the timeout tests (which need unexported +// symbols) can load the same testdata files. +func readParserFixture(t *testing.T, name string) []byte { + t.Helper() + path := filepath.Join("testdata", name) + b, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read fixture %q: %v", path, err) + } + return b +} From 9d8c7b41e7e0259671b70dbcdd3aa3b8b6f330d9 Mon Sep 17 00:00:00 2001 From: Halleluyah Oludele Date: Fri, 29 May 2026 17:48:40 +0100 Subject: [PATCH 3/3] fix(parser): make the leaf-section cap reduce any tree shape MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cap only merged ADJACENT leaf siblings, but the real 10-K explosion is hundreds of SINGLE-LEAF PARENTS (heading -> one body leaf) that have no adjacent leaf-sibling pairs at all — so the cap silently did nothing and a 92-page filing sailed past the 400 cap at 463-1465 leaves, each costing a summarize + HyDE + multi-axis LLM call (the throughput killer for full ingest). Fix it in two phases: 1. collapseSingleLeafParents flattens every heading -> lone-leaf chain (bottom-up, so deep chains fold in one pass) so the formerly only-children become adjacent leaf siblings. Count is unchanged; the parent absorbs the child's content + page range and becomes the leaf. 2. The existing smallest-first adjacent-pair merge then reduces the count to the cap, with a defensive single-leaf-parent collapse for any pair a table leaf blocked. The top-level sections are wrapped under a synthetic root so the merge step — which shrinks a sibling list by rewriting parent.Children — can shrink the TOP-level list too (a bare slice parameter would not propagate the shrink back). This is what makes the invariant hold for a flat list of single-leaf parents. Invariant: for any tree with > N mergeable leaves, capLeafSections drives the leaf count to <= N. Content is always preserved (concatenated, page ranges unioned) and table sections (Metadata["table"]=="true") are never merged or collapsed. Nothing is disabled — the full section tree is still produced; it's just bounded. Tests: 1000 single-leaf-parents -> <= 400 with no content loss (the case the old cap failed and that let 1465 through), deep heading->subheading-> body chains, a mixed flat/single-leaf/multi-child tree, and a table-protection test asserting table leaves survive verbatim. --- pkg/parser/cap_test.go | 165 +++++++++++++++++++++++++++++++++++++++++ pkg/parser/pdf.go | 159 ++++++++++++++++++++++++++++++++++----- 2 files changed, 307 insertions(+), 17 deletions(-) diff --git a/pkg/parser/cap_test.go b/pkg/parser/cap_test.go index fa1fd61..726df0e 100644 --- a/pkg/parser/cap_test.go +++ b/pkg/parser/cap_test.go @@ -15,6 +15,29 @@ func leafTree(n, size int) []Section { return []Section{{Level: 1, Title: "parent", Children: kids}} } +// singleLeafParentTree builds n top-level "heading" sections, each an +// internal node with EXACTLY ONE leaf child carrying `size` chars. This +// is the real 10-K explosion shape: hundreds of heading -> one-body-leaf +// parents with NO adjacent leaf-sibling pairs anywhere. The pre-fix cap +// (adjacent-siblings only) could not reduce this tree at all. +func singleLeafParentTree(n, size int) []Section { + parents := make([]Section, n) + for i := range parents { + parents[i] = Section{ + Level: 1, + Title: "heading", + Children: []Section{{ + Level: 2, + Title: "body", + Content: strings.Repeat("y", size), + PageStart: i + 1, + PageEnd: i + 1, + }}, + } + } + return parents +} + func TestCapLeafSections_MergesDownToCap(t *testing.T) { tree := leafTree(1000, 50) if got := countLeafSections(tree); got != 1000 { @@ -81,6 +104,148 @@ func TestCapLeafSections_MergesSmallestFirst(t *testing.T) { } } +// TestCapLeafSections_SingleLeafParentsReduce is the regression test for +// the bug that let a 92-page 10-K through at 463-1465 leaves: a tree made +// entirely of single-leaf parents (heading -> one body leaf) has no +// adjacent leaf-sibling pairs, so the old adjacent-only merge did nothing. +// The fix must collapse those chains and merge down to the cap, losing no +// content. +func TestCapLeafSections_SingleLeafParentsReduce(t *testing.T) { + tree := singleLeafParentTree(1000, 30) + if got := countLeafSections(tree); got != 1000 { + t.Fatalf("setup: countLeafSections = %d, want 1000", got) + } + orig := totalContentLen(tree) // 1000 * 30 + + capped := capLeafSections(tree, 400) + + if got := countLeafSections(capped); got > 400 { + t.Errorf("after cap: %d leaves, want <= 400 (the bug let 1465 through)", got) + } + // No content lost. Collapses/merges insert at most a "\n\n" (2 chars) + // per fold; with < 1000 folds total the upper bound is generous. + if got := totalContentLen(capped); got < orig || got > orig+2*1000 { + t.Errorf("content not preserved: got %d chars, want in [%d, %d]", got, orig, orig+2*1000) + } +} + +// TestCapLeafSections_DeepSingleLeafChains exercises heading -> subheading +// -> body chains (depth-3 single-leaf parents). The bottom-up collapse +// must fold the whole chain so the bodies become mergeable siblings. +func TestCapLeafSections_DeepSingleLeafChains(t *testing.T) { + const n = 600 + roots := make([]Section, n) + for i := range roots { + roots[i] = Section{Level: 1, Title: "h1", Children: []Section{{ + Level: 2, Title: "h2", Children: []Section{{ + Level: 3, Title: "body", Content: strings.Repeat("z", 20), + PageStart: i + 1, PageEnd: i + 1, + }}, + }}} + } + if got := countLeafSections(roots); got != n { + t.Fatalf("setup: countLeafSections = %d, want %d", got, n) + } + orig := totalContentLen(roots) + + capped := capLeafSections(roots, 400) + if got := countLeafSections(capped); got > 400 { + t.Errorf("after cap: %d leaves, want <= 400", got) + } + if got := totalContentLen(capped); got < orig { + t.Errorf("content shrank: got %d chars, want >= %d", got, orig) + } +} + +// TestCapLeafSections_MixedShapeReducesToCap throws a tree that mixes a +// flat sibling list, single-leaf parents, and a multi-child branch at the +// cap. The invariant is unconditional: > N mergeable leaves must come +// down to <= N regardless of shape. +func TestCapLeafSections_MixedShapeReducesToCap(t *testing.T) { + tree := []Section{ + {Level: 1, Title: "flat", Children: func() []Section { + out := make([]Section, 300) + for i := range out { + out[i] = Section{Level: 2, Title: "f", Content: "ff", PageStart: i + 1, PageEnd: i + 1} + } + return out + }()}, + } + // Append 300 single-leaf parents as additional top-level nodes. + tree = append(tree, singleLeafParentTree(300, 10)...) + + if got := countLeafSections(tree); got != 600 { + t.Fatalf("setup: countLeafSections = %d, want 600", got) + } + capped := capLeafSections(tree, 100) + if got := countLeafSections(capped); got > 100 { + t.Errorf("after cap: %d leaves, want <= 100", got) + } +} + +// TestCapLeafSections_NeverMergesTableSections asserts that table leaves +// (Metadata["table"]="true") are preserved verbatim and never merged or +// collapsed, even under a cap small enough that everything else must +// merge around them. +func TestCapLeafSections_NeverMergesTableSections(t *testing.T) { + mkTable := func(page int, content string) Section { + return Section{ + Level: 1, Title: "Table", Content: content, + PageStart: page, PageEnd: page, + Metadata: map[string]string{"table": "true"}, + } + } + // Three tables interleaved with prose leaves under one parent, plus a + // pile of single-leaf-parent prose so we're well over the cap. + kids := []Section{ + {Level: 2, Title: "p1", Content: "prose-a"}, + mkTable(1, "| A | B |\n| --- | --- |\n| 1 | 2 |"), + {Level: 2, Title: "p2", Content: "prose-b"}, + mkTable(2, "| C | D |\n| --- | --- |\n| 3 | 4 |"), + {Level: 2, Title: "p3", Content: "prose-c"}, + mkTable(3, "| E | F |\n| --- | --- |\n| 5 | 6 |"), + } + tree := []Section{{Level: 1, Title: "parent", Children: kids}} + tree = append(tree, singleLeafParentTree(200, 10)...) + + capped := capLeafSections(tree, 5) + + // Every original table's content must still be present, verbatim, in + // some leaf. + tableContents := []string{ + "| A | B |\n| --- | --- |\n| 1 | 2 |", + "| C | D |\n| --- | --- |\n| 3 | 4 |", + "| E | F |\n| --- | --- |\n| 5 | 6 |", + } + var leaves []Section + var collect func([]Section) + collect = func(ss []Section) { + for i := range ss { + if len(ss[i].Children) == 0 { + leaves = append(leaves, ss[i]) + } else { + collect(ss[i].Children) + } + } + } + collect(capped) + + for _, want := range tableContents { + found := false + for _, l := range leaves { + // A table must survive as its OWN leaf — its content present and + // the table metadata intact (i.e. it wasn't merged into prose). + if l.Content == want && l.Metadata["table"] == "true" { + found = true + break + } + } + if !found { + t.Errorf("table content %q was merged away or lost its metadata", want) + } + } +} + func totalContentLen(sections []Section) int { n := 0 for i := range sections { diff --git a/pkg/parser/pdf.go b/pkg/parser/pdf.go index 579cf2a..6a14b95 100644 --- a/pkg/parser/pdf.go +++ b/pkg/parser/pdf.go @@ -605,34 +605,158 @@ func countLeafSections(sections []Section) int { return n } -// capLeafSections enforces a ceiling on the total leaf-section count. -// While the count exceeds maxLeaves it repeatedly merges the two -// smallest ADJACENT leaf siblings under whichever parent currently has -// the most leaf children — so the runaway byte-split sections collapse -// back first, while genuinely distinct top-level sections are left -// alone. maxLeaves <= 0 disables the cap. +// capLeafSections enforces a ceiling on the total leaf-section count so a +// pathological PDF can't shatter into thousands of tiny leaves (each of +// which costs a summarize + HyDE + multi-axis LLM call at ingest, which +// is what throttles/stalls the full pipeline). maxLeaves <= 0 disables +// the cap; a tree already at or under the cap is returned untouched. // -// Merged leaves concatenate their content (blank-line separated), keep -// the first sibling's title, and union their page ranges. Table -// sections are attached AFTER this pass (attachTableSections), so their -// numeric content is never merged away. +// The reduction is robust to ANY tree shape, which is the fix for the +// real 10-K explosion: that document is not a flat list of adjacent +// leaves but hundreds of SINGLE-LEAF PARENTS (heading -> one body leaf), +// which have no adjacent leaf-sibling pairs at all — so the old +// adjacent-only merge silently did nothing and a 92-page filing sailed +// past the cap at 463-1465 leaves. We fix it in two phases: +// +// 1. collapseSingleLeafParents flattens every heading -> lone-leaf chain +// so the formerly-only-children become adjacent leaf SIBLINGS. This +// restructures the tree without changing the leaf count (the parent +// absorbs the child and itself becomes the leaf). +// +// 2. The merge loop then repeatedly merges the smallest adjacent leaf +// pair until the count is back under the cap (a defensive collapse +// step covers any pair a table leaf blocked). +// +// The invariant: for any tree with > maxLeaves MERGEABLE leaves, +// capLeafSections drives countLeafSections to <= maxLeaves. The only +// leaves it will not merge are table sections (Metadata["table"]=="true") +// — those carry distinct numeric content and must survive verbatim. In +// the normal parse flow table sections are attached AFTER this pass +// (attachTableSections), so the cap never sees them; the guard is +// defensive for any caller that pre-attaches tables. +// +// Merged/collapsed leaves concatenate their content (blank-line +// separated), keep the parent/first-sibling title, and union their page +// ranges, so no body text is ever dropped. func capLeafSections(sections []Section, maxLeaves int) []Section { if maxLeaves <= 0 { return sections } - // Guard against pathological loops: at most one merge per excess leaf. - for guard := 0; countLeafSections(sections) > maxLeaves && guard < 100000; guard++ { - if !mergeOneSmallestAdjacentLeafPair(sections) { - break // no mergeable adjacent leaf pair anywhere + if countLeafSections(sections) <= maxLeaves { + return sections // already under the cap — leave structure untouched + } + + // Wrap the top-level sections under a synthetic root. The merge step + // shrinks a sibling list in place (it rewrites parent.Children), which + // only propagates back to the caller when the mutated list is a struct + // FIELD, not a bare slice parameter. The runaway shape we're fixing — + // hundreds of single-leaf PARENTS — needs the TOP-level list to shrink, + // so we make the top level a nested list (root.Children) and operate on + // that; the wrapper itself (length 1) never changes. We return + // root.Children at the end. + root := []Section{{Children: sections}} + + // Phase 1: flatten single-leaf-parent chains so only-children become + // mergeable siblings. Count is unchanged; structure is normalised. + root[0].Children = collapseSingleLeafParents(root[0].Children) + + // Phase 2: merge adjacent leaf pairs (smallest first) until under cap. + // Each merge removes one leaf and each fallback collapse removes one + // internal node — both strictly monotonic — so the loop is bounded; + // the guard is belt-and-braces against a logic regression. + for guard := 0; countLeafSections(root) > maxLeaves && guard < 1000000; guard++ { + if mergeOneSmallestAdjacentLeafPair(root) { + continue + } + // No adjacent mergeable pair left. Try to free one up by + // collapsing a remaining single-leaf parent; if that's impossible + // too, every remaining leaf is unmergeable (e.g. table sections) + // and we stop — there is nothing left we're permitted to merge. + if !collapseOneSingleLeafParent(&root) { + break + } + } + return root[0].Children +} + +// isTableLeaf reports whether s is a table section that must never be +// merged or collapsed (its cell content is distinct numeric data). +func isTableLeaf(s *Section) bool { + return s.Metadata != nil && s.Metadata["table"] == "true" +} + +// collapseSingleLeafParents recursively flattens every parent that has +// exactly one child which is a (non-table) leaf: the parent absorbs the +// child's content + page range and itself becomes the leaf. Chains +// (heading -> subheading -> body) collapse fully in one bottom-up pass. +// +// Leaf count is preserved (one internal node + one leaf become one leaf); +// the point is purely to turn only-children into adjacent siblings so the +// adjacent-pair merge can then reduce the count. Returns the rewritten +// slice. +func collapseSingleLeafParents(sections []Section) []Section { + for i := range sections { + s := §ions[i] + if len(s.Children) == 0 { + continue + } + // Bottom-up: collapse within the children first so a chain folds + // from the leaf upward. + s.Children = collapseSingleLeafParents(s.Children) + if len(s.Children) == 1 && len(s.Children[0].Children) == 0 && !isTableLeaf(&s.Children[0]) { + absorbChildIntoParent(s, s.Children[0]) + s.Children = nil } } return sections } +// collapseOneSingleLeafParent finds the first parent in the tree with +// exactly one (non-table) leaf child and collapses it (parent absorbs the +// leaf, becomes a leaf). Returns false when no such parent exists. Used +// as a defensive fallback inside the merge loop when an adjacent pair was +// blocked by a table leaf; the bulk flattening is done up front by +// collapseSingleLeafParents. +func collapseOneSingleLeafParent(sections *[]Section) bool { + s := *sections + for i := range s { + n := &s[i] + if len(n.Children) == 1 && len(n.Children[0].Children) == 0 && !isTableLeaf(&n.Children[0]) { + absorbChildIntoParent(n, n.Children[0]) + n.Children = nil + return true + } + if len(n.Children) > 0 { + if collapseOneSingleLeafParent(&n.Children) { + return true + } + } + } + return false +} + +// absorbChildIntoParent folds a leaf child's content and page range up +// into its parent. The parent keeps its own title (the heading) and +// gains the child's body; an empty parent body is replaced outright so we +// don't prefix a stray separator. Page ranges union (min start, max end). +func absorbChildIntoParent(parent *Section, child Section) { + switch { + case strings.TrimSpace(parent.Content) == "": + parent.Content = child.Content + case strings.TrimSpace(child.Content) != "": + parent.Content = parent.Content + "\n\n" + child.Content + } + parent.PageStart = minNonZero(parent.PageStart, child.PageStart) + if child.PageEnd > parent.PageEnd { + parent.PageEnd = child.PageEnd + } +} + // mergeOneSmallestAdjacentLeafPair finds the adjacent leaf-sibling pair // with the smallest combined content length anywhere in the tree and -// merges it in place. Returns false when no sibling list has two -// adjacent leaves to merge. +// merges it in place. Only pairs where BOTH siblings are NON-table leaves +// are eligible — table sections are never merged. Returns false when no +// sibling list has two adjacent mergeable leaves. func mergeOneSmallestAdjacentLeafPair(sections []Section) bool { bestList := (*[]Section)(nil) bestIdx := -1 @@ -642,7 +766,8 @@ func mergeOneSmallestAdjacentLeafPair(sections []Section) bool { walk = func(list *[]Section) { s := *list for i := 0; i+1 < len(s); i++ { - if len(s[i].Children) == 0 && len(s[i+1].Children) == 0 { + if len(s[i].Children) == 0 && len(s[i+1].Children) == 0 && + !isTableLeaf(&s[i]) && !isTableLeaf(&s[i+1]) { size := len(s[i].Content) + len(s[i+1].Content) if bestSize < 0 || size < bestSize { bestSize, bestList, bestIdx = size, list, i