diff --git a/cmd/server/main.go b/cmd/server/main.go index 83714db..4c9b284 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -169,6 +169,10 @@ func run() error { SummaryAxesMaxTopics: cfg.Engine.Ingest.SummaryAxes.MaxTopics, SummaryAxesMaxEntities: cfg.Engine.Ingest.SummaryAxes.MaxEntities, SummaryAxesMaxNumbers: cfg.Engine.Ingest.SummaryAxes.MaxNumbers, + TOCEnabled: cfg.Engine.Ingest.TOC.Enabled, + TOCModel: cfg.Engine.Ingest.TOC.Model, + TOCConcurrency: cfg.Engine.Ingest.TOC.Concurrency, + TOCCheckPages: cfg.Engine.Ingest.TOC.TOCCheckPages, GlobalLLMConcurrency: cfg.Engine.Ingest.GlobalLLMConcurrency, }) if cfg.Engine.Ingest.Tables.Enabled { diff --git a/config.example.yaml b/config.example.yaml index 7de0368..0ab32e2 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -297,6 +297,32 @@ ingest: max_entities: 8 max_numbers: 6 + # LLM-built table-of-contents tree (PageIndex-style). Runs after + # summarize+HyDE on PDF inputs and persists a hierarchical TOC on + # documents.toc_tree (JSONB). The tree is small (a few KB even + # for 300-page filings) and is intended as a higher-level map + # retrieval strategies can reason over before drilling into the + # parser-derived sections tree. + # + # ENABLED BY DEFAULT for PDFs. Non-PDF documents skip the stage + # unconditionally. Builder failures are non-fatal — the document + # remains fully retrievable via the existing sections tree. + toc: + enabled: true + # Override the LLM model used by the builder; empty inherits + # the summary model. Point this at a reasoning-capable model — + # the no-TOC generator has to find hierarchy in raw body text, + # which a small/fast model often botches. + model: "" + # Cap on parallel LLM calls during the verification phase + # (one call per leaf node). + concurrency: 4 + # The detector scans the first N pages for a table of + # contents. 
PageIndex defaults this to 20 — financial filings + # put their TOC inside the first dozen pages and a document + # without one by page 20 almost never has one further in. + toc_check_pages: 20 + log: level: "info" # debug | info | warn | error format: "json" # json | console diff --git a/pkg/config/config.go b/pkg/config/config.go index d4f2723..c5fa0e4 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -48,6 +48,13 @@ type IngestConfig struct { // populate it). SummaryAxes SummaryAxesBlock `yaml:"summary_axes"` + // TOC configures the PageIndex-style LLM-built table-of-contents + // tree stage. Enabled by default for PDF inputs; the resulting + // tree is persisted on documents.toc_tree (JSONB). Failures are + // non-fatal — they leave the column NULL and the document fully + // retrievable via the existing sections tree. + TOC TOCBlock `yaml:"toc"` + // GlobalLLMConcurrency caps the total number of LLM calls in flight // across the summarize and HyDE stages combined, which now run // concurrently. Each stage still respects its own per-stage cap @@ -60,6 +67,39 @@ type IngestConfig struct { GlobalLLMConcurrency int `yaml:"global_llm_concurrency"` } +// TOCBlock configures the LLM-driven table-of-contents tree +// builder. The builder reads page-by-page text from a freshly- +// ingested PDF and emits a hierarchical TOC (PageIndex-style), +// persisted on documents.toc_tree (JSONB). +// +// Enabled by default for PDF inputs; non-PDF documents skip the +// stage unconditionally. Builder failures never break ingest — +// the document remains fully retrievable via the existing +// sections tree. +type TOCBlock struct { + // Enabled toggles the stage. Default: true. Flip to false to + // skip the extra LLM round-trip when ingest budget matters + // more than having a TOC tree for retrieval to reason over. + Enabled bool `yaml:"enabled"` + + // Model overrides the LLM model used by the builder. Empty + // inherits the engine's configured default. 
Point this at a + // reasoning-capable model — the no-TOC generator has to find + // hierarchy in raw body text, which a small/fast model often + // botches. + Model string `yaml:"model"` + + // Concurrency caps parallel LLM calls during the verification + // phase (one call per leaf node). Default: 4. + Concurrency int `yaml:"concurrency"` + + // TOCCheckPages bounds the leading prefix the detector scans + // for a table of contents. Default: 20 — financial filings + // put their TOC inside the first dozen pages and a document + // without one by page 20 almost never has one further in. + TOCCheckPages int `yaml:"toc_check_pages"` +} + // SummaryAxesBlock configures the Phase 2.5 structured summarizer. // // When enabled, the summarize stage runs in JSON mode and produces @@ -584,6 +624,11 @@ func Default() Config { MaxEntities: 8, MaxNumbers: 6, }, + TOC: TOCBlock{ + Enabled: true, + Concurrency: 4, + TOCCheckPages: 20, + }, }, Log: LogConfig{Level: "info", Format: "json"}, } @@ -767,6 +812,30 @@ func applyEnvOverrides(c *Config) { c.Ingest.SummaryAxes.MaxNumbers = n } } + // LLM-built TOC tree (PageIndex-style). Same truthy-string set + // as the other ingest toggles; numeric overrides require a + // positive int so a typo doesn't silently flip the default. 
+ if v := os.Getenv("VLE_INGEST_TOC_ENABLED"); v != "" { + switch strings.ToLower(strings.TrimSpace(v)) { + case "1", "true", "yes", "on": + c.Ingest.TOC.Enabled = true + case "0", "false", "no", "off": + c.Ingest.TOC.Enabled = false + } + } + if v := os.Getenv("VLE_INGEST_TOC_MODEL"); v != "" { + c.Ingest.TOC.Model = v + } + if v := os.Getenv("VLE_INGEST_TOC_CONCURRENCY"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n > 0 { + c.Ingest.TOC.Concurrency = n + } + } + if v := os.Getenv("VLE_INGEST_TOC_TOC_CHECK_PAGES"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n > 0 { + c.Ingest.TOC.TOCCheckPages = n + } + } if v := os.Getenv("VLE_RETRIEVAL_ANSWER_SPAN_ENABLED"); v != "" { switch strings.ToLower(strings.TrimSpace(v)) { case "1", "true", "yes", "on": @@ -978,6 +1047,13 @@ func (c Config) Validate() error { return fmt.Errorf("ingest.summary_axes.max_numbers must be >= 0, got %d", c.Ingest.SummaryAxes.MaxNumbers) } + if c.Ingest.TOC.Concurrency < 0 { + return fmt.Errorf("ingest.toc.concurrency must be >= 0, got %d", c.Ingest.TOC.Concurrency) + } + if c.Ingest.TOC.TOCCheckPages < 0 { + return fmt.Errorf("ingest.toc.toc_check_pages must be >= 0, got %d", c.Ingest.TOC.TOCCheckPages) + } + if c.Retrieval.Planning.CacheSize < 0 { return fmt.Errorf("retrieval.planning.cache_size must be >= 0, got %d", c.Retrieval.Planning.CacheSize) } diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index e4ba3a6..0936e1f 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -73,6 +73,55 @@ func TestDefaultValues(t *testing.T) { if cfg.Log.Level != "info" { t.Errorf("log.level = %q, want info", cfg.Log.Level) } + if !cfg.Ingest.TOC.Enabled { + t.Error("ingest.toc.enabled should default to true (opt-out)") + } + if cfg.Ingest.TOC.Concurrency != 4 { + t.Errorf("ingest.toc.concurrency = %d, want 4", cfg.Ingest.TOC.Concurrency) + } + if cfg.Ingest.TOC.TOCCheckPages != 20 { + t.Errorf("ingest.toc.toc_check_pages = %d, want 20", 
cfg.Ingest.TOC.TOCCheckPages) + } +} + +func TestTOCEnvOverride(t *testing.T) { + // Mutates env — restore on exit. Not parallel. + keys := []string{ + "VLE_INGEST_TOC_ENABLED", + "VLE_INGEST_TOC_MODEL", + "VLE_INGEST_TOC_CONCURRENCY", + "VLE_INGEST_TOC_TOC_CHECK_PAGES", + } + prev := make(map[string]string, len(keys)) + for _, k := range keys { + prev[k] = os.Getenv(k) + } + defer func() { + for k, v := range prev { + os.Setenv(k, v) + } + }() + + os.Setenv("VLE_INGEST_TOC_ENABLED", "false") + os.Setenv("VLE_INGEST_TOC_MODEL", "gemini-2.5-pro") + os.Setenv("VLE_INGEST_TOC_CONCURRENCY", "12") + os.Setenv("VLE_INGEST_TOC_TOC_CHECK_PAGES", "30") + + cfg := Default() + applyEnvOverrides(&cfg) + + if cfg.Ingest.TOC.Enabled { + t.Error("VLE_INGEST_TOC_ENABLED=false should disable the stage") + } + if cfg.Ingest.TOC.Model != "gemini-2.5-pro" { + t.Errorf("VLE_INGEST_TOC_MODEL not applied, got %q", cfg.Ingest.TOC.Model) + } + if cfg.Ingest.TOC.Concurrency != 12 { + t.Errorf("VLE_INGEST_TOC_CONCURRENCY=12 not applied, got %d", cfg.Ingest.TOC.Concurrency) + } + if cfg.Ingest.TOC.TOCCheckPages != 30 { + t.Errorf("VLE_INGEST_TOC_TOC_CHECK_PAGES=30 not applied, got %d", cfg.Ingest.TOC.TOCCheckPages) + } } func TestAbstainEnvOverride(t *testing.T) { diff --git a/pkg/db/documents.go b/pkg/db/documents.go index 68719da..67cbea2 100644 --- a/pkg/db/documents.go +++ b/pkg/db/documents.go @@ -43,6 +43,18 @@ type Document struct { Metadata map[string]string CreatedAt time.Time UpdatedAt time.Time + + // TOCTree is the JSONB blob persisted by the ingest pipeline's + // LLM-driven TOC builder ([]tree.TOCNode marshalled). nil + // (NULL in DB) means "not yet generated" — the expected state + // for non-PDF documents, for documents ingested before the + // 0006 migration, and when the builder failed (builder + // failures are non-fatal and leave this column NULL). + // + // Stored raw so the column round-trips byte-identically + // regardless of slice-element ordering inside the encoder. 
+ // Callers that need the typed shape unmarshal at read time. + TOCTree []byte } // NewDocument inserts a fresh document row in the "pending" state. @@ -83,7 +95,7 @@ func (p *Pool) GetDocument(ctx context.Context, id tree.DocumentID, orgID, store } q := ` SELECT id, org_id, store_id, title, content_type, source_ref, status, error_message, - byte_size, metadata, created_at, updated_at + byte_size, metadata, created_at, updated_at, toc_tree FROM documents WHERE id = $1 AND org_id = $2` args := []any{string(id), orgID} if storeID != "" { @@ -94,13 +106,14 @@ func (p *Pool) GetDocument(ctx context.Context, id tree.DocumentID, orgID, store var d Document var status string - var rawMeta []byte + var rawMeta, rawTOC []byte if err := row.Scan(&d.ID, &d.OrgID, &d.StoreID, &d.Title, &d.ContentType, &d.SourceRef, &status, - &d.ErrorMessage, &d.ByteSize, &rawMeta, &d.CreatedAt, &d.UpdatedAt); err != nil { + &d.ErrorMessage, &d.ByteSize, &rawMeta, &d.CreatedAt, &d.UpdatedAt, &rawTOC); err != nil { return nil, mapErr(err) } d.Status = DocumentStatus(status) d.Metadata = unmarshalMeta(rawMeta) + d.TOCTree = rawTOC return &d, nil } @@ -111,18 +124,19 @@ func (p *Pool) GetDocument(ctx context.Context, id tree.DocumentID, orgID, store func (p *Pool) GetDocumentForWorker(ctx context.Context, id tree.DocumentID) (*Document, error) { row := p.QueryRow(ctx, ` SELECT id, org_id, store_id, title, content_type, source_ref, status, error_message, - byte_size, metadata, created_at, updated_at + byte_size, metadata, created_at, updated_at, toc_tree FROM documents WHERE id = $1`, string(id)) var d Document var status string - var rawMeta []byte + var rawMeta, rawTOC []byte if err := row.Scan(&d.ID, &d.OrgID, &d.StoreID, &d.Title, &d.ContentType, &d.SourceRef, &status, - &d.ErrorMessage, &d.ByteSize, &rawMeta, &d.CreatedAt, &d.UpdatedAt); err != nil { + &d.ErrorMessage, &d.ByteSize, &rawMeta, &d.CreatedAt, &d.UpdatedAt, &rawTOC); err != nil { return nil, mapErr(err) } d.Status = 
DocumentStatus(status) d.Metadata = unmarshalMeta(rawMeta) + d.TOCTree = rawTOC return &d, nil } @@ -143,6 +157,24 @@ func (p *Pool) SetDocumentTitle(ctx context.Context, id tree.DocumentID, title s return mapErr(err) } +// UpdateDocumentTOCTree persists the LLM-built table-of-contents +// tree onto the documents.toc_tree column. treeJSON is the already +// JSON-marshalled []tree.TOCNode; pass a nil slice to clear (writes +// SQL NULL — the "not yet generated" state). Mirrors +// UpdateSectionSummaryAxes so the column can be patched +// independently of the rest of the document row. +func (p *Pool) UpdateDocumentTOCTree(ctx context.Context, id tree.DocumentID, treeJSON []byte) error { + var arg any + if len(treeJSON) > 0 { + arg = treeJSON + } + _, err := p.Exec(ctx, ` + UPDATE documents + SET toc_tree = $2, updated_at = now() + WHERE id = $1`, string(id), arg) + return mapErr(err) +} + // ListDocumentsOpts controls pagination + filtering for ListDocuments. type ListDocumentsOpts struct { // OrgID restricts the listing to a single tenant. Required. 
@@ -197,7 +229,7 @@ func (p *Pool) ListDocuments(ctx context.Context, o ListDocumentsOpts) ([]Docume q := ` SELECT id, org_id, store_id, title, content_type, source_ref, status, error_message, - byte_size, metadata, created_at, updated_at + byte_size, metadata, created_at, updated_at, toc_tree FROM documents ` + where + ` ORDER BY created_at DESC LIMIT $` + itoa(next) @@ -212,13 +244,14 @@ func (p *Pool) ListDocuments(ctx context.Context, o ListDocumentsOpts) ([]Docume for rows.Next() { var d Document var status string - var rawMeta []byte + var rawMeta, rawTOC []byte if err := rows.Scan(&d.ID, &d.OrgID, &d.StoreID, &d.Title, &d.ContentType, &d.SourceRef, &status, - &d.ErrorMessage, &d.ByteSize, &rawMeta, &d.CreatedAt, &d.UpdatedAt); err != nil { + &d.ErrorMessage, &d.ByteSize, &rawMeta, &d.CreatedAt, &d.UpdatedAt, &rawTOC); err != nil { return nil, time.Time{}, err } d.Status = DocumentStatus(status) d.Metadata = unmarshalMeta(rawMeta) + d.TOCTree = rawTOC out = append(out, d) } if err := rows.Err(); err != nil { diff --git a/pkg/db/documents_marshal_test.go b/pkg/db/documents_marshal_test.go new file mode 100644 index 0000000..0260a77 --- /dev/null +++ b/pkg/db/documents_marshal_test.go @@ -0,0 +1,111 @@ +package db + +import ( + "bytes" + "encoding/json" + "testing" + + "github.com/hallelx2/vectorless-engine/pkg/tree" +) + +// TestTOCTreeRoundTrip confirms a []tree.TOCNode marshals to JSON +// bytes that, when shoved through Document.TOCTree and pulled back +// out, decode to the same shape. The DB column stores the bytes +// verbatim so this is really a guard on the JSON tag contract — +// dropping a tag or renaming a field breaks downstream consumers +// that depend on the stable wire shape. 
+func TestTOCTreeRoundTrip(t *testing.T) { + in := []tree.TOCNode{ + { + NodeID: "toc_1", + Structure: "1", + Title: "Business", + StartPage: 1, + EndPage: 12, + Nodes: []tree.TOCNode{ + {NodeID: "toc_1_1", Structure: "1.1", Title: "Overview", StartPage: 1, EndPage: 4}, + {NodeID: "toc_1_2", Structure: "1.2", Title: "Strategy", StartPage: 5, EndPage: 12}, + }, + }, + { + NodeID: "toc_2", + Structure: "2", + Title: "Risk Factors", + StartPage: 13, + EndPage: 38, + }, + } + + raw, err := json.Marshal(in) + if err != nil { + t.Fatalf("marshal: %v", err) + } + + var out []tree.TOCNode + if err := json.Unmarshal(raw, &out); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if len(out) != len(in) { + t.Fatalf("top-level len: got %d want %d", len(out), len(in)) + } + for i := range in { + assertTOCNodeEq(t, &out[i], &in[i]) + } + + // Re-marshal and check byte-stable form so persisting and + // re-reading never quietly changes content. JSON encoding is + // deterministic for a fixed key order; our struct tags fix that. + raw2, err := json.Marshal(out) + if err != nil { + t.Fatalf("re-marshal: %v", err) + } + if !bytes.Equal(raw, raw2) { + t.Errorf("round-trip changed bytes\n first: %s\n second: %s", raw, raw2) + } +} + +// TestTOCTreeOmitsZeroFields guards the wire contract: optional +// fields (EndPage, Summary, Nodes) drop out of the serialised form +// when zero, so the persisted blob stays small and free of noise. 
+func TestTOCTreeOmitsZeroFields(t *testing.T) { + in := []tree.TOCNode{{NodeID: "toc_x", Structure: "1", Title: "Stub", StartPage: 7}} + raw, err := json.Marshal(in) + if err != nil { + t.Fatalf("marshal: %v", err) + } + s := string(raw) + for _, banned := range []string{"end_page", "summary", "nodes"} { + if bytes.Contains(raw, []byte(banned)) { + t.Errorf("expected %q to be omitted, got %s", banned, s) + } + } +} + +func assertTOCNodeEq(t *testing.T, got, want *tree.TOCNode) { + t.Helper() + if got.NodeID != want.NodeID { + t.Errorf("NodeID: got %q want %q", got.NodeID, want.NodeID) + } + if got.Structure != want.Structure { + t.Errorf("Structure: got %q want %q", got.Structure, want.Structure) + } + if got.Title != want.Title { + t.Errorf("Title: got %q want %q", got.Title, want.Title) + } + if got.StartPage != want.StartPage { + t.Errorf("StartPage: got %d want %d", got.StartPage, want.StartPage) + } + if got.EndPage != want.EndPage { + t.Errorf("EndPage: got %d want %d", got.EndPage, want.EndPage) + } + if got.Summary != want.Summary { + t.Errorf("Summary: got %q want %q", got.Summary, want.Summary) + } + if len(got.Nodes) != len(want.Nodes) { + t.Errorf("Nodes len: got %d want %d", len(got.Nodes), len(want.Nodes)) + return + } + for i := range want.Nodes { + assertTOCNodeEq(t, &got.Nodes[i], &want.Nodes[i]) + } +} diff --git a/pkg/db/migrations/0006_documents_toc_tree.down.sql b/pkg/db/migrations/0006_documents_toc_tree.down.sql new file mode 100644 index 0000000..8eb4315 --- /dev/null +++ b/pkg/db/migrations/0006_documents_toc_tree.down.sql @@ -0,0 +1,2 @@ +ALTER TABLE documents + DROP COLUMN IF EXISTS toc_tree; diff --git a/pkg/db/migrations/0006_documents_toc_tree.up.sql b/pkg/db/migrations/0006_documents_toc_tree.up.sql new file mode 100644 index 0000000..f090819 --- /dev/null +++ b/pkg/db/migrations/0006_documents_toc_tree.up.sql @@ -0,0 +1,19 @@ +-- 0006_documents_toc_tree.up.sql — LLM-built table-of-contents tree. 
+-- +-- PR-A of the PageIndex-style redesign. The ingest pipeline runs an +-- LLM-driven TOC builder on PDFs (between summarize and StatusReady) +-- and persists the result here. The tree is small (a few KB even for +-- 300-page filings) and is read back at retrieval time by strategies +-- that want a hierarchical map of the document independent of the +-- parser's heading detection. +-- +-- toc_tree +-- JSONB blob carrying []tree.TOCNode. NULL for documents ingested +-- before this migration, for non-PDF inputs, or when the TOC +-- builder failed (failures are non-fatal — the document remains +-- fully retrievable via the existing sections tree). +-- +-- Not indexed: JSONB queries on this column aren't on the hot path. +-- Reads load the blob inline alongside the document row. +ALTER TABLE documents + ADD COLUMN IF NOT EXISTS toc_tree JSONB; diff --git a/pkg/ingest/ingest.go b/pkg/ingest/ingest.go index d4c174d..71d6e8d 100644 --- a/pkg/ingest/ingest.go +++ b/pkg/ingest/ingest.go @@ -134,6 +134,29 @@ type Pipeline struct { // per-stage semaphore). Default applied by NewPipeline: 12. GlobalLLMConcurrency int + // TOCEnabled toggles the LLM-built table-of-contents stage. The + // stage runs after summarize+HyDE on PDF inputs and persists the + // resulting tree on documents.toc_tree (JSONB). Failures are + // non-fatal — they leave the column NULL. + // + // Defaulted to true by config wiring; left as the Go zero value + // (false) when Pipeline is constructed directly, so unit tests + // with no LLM can opt out by simply not setting it. + TOCEnabled bool + + // TOCModel overrides the LLM model used by the TOC builder. + // Empty inherits SummaryModel (which itself falls back to the + // client default). + TOCModel string + + // TOCConcurrency caps parallel LLM calls during the TOC + // verification phase. Default: 4. + TOCConcurrency int + + // TOCCheckPages bounds the leading prefix the detector scans + // for a table of contents. Default: 20. 
+ TOCCheckPages int + // globalLLMSem is the lazily-initialized shared semaphore enforcing // GlobalLLMConcurrency. nil means "no global cap" — callers fall back // to per-stage limits only. @@ -168,6 +191,12 @@ func NewPipeline(p Pipeline) *Pipeline { if p.HyDEConcurrency <= 0 { p.HyDEConcurrency = 4 } + if p.TOCConcurrency <= 0 { + p.TOCConcurrency = 4 + } + if p.TOCCheckPages <= 0 { + p.TOCCheckPages = 20 + } // Default the global cap to a value that comfortably exceeds the // sum of the two default per-stage caps (4 + 4 = 8) while leaving // some headroom — but stays well below typical provider per-tenant @@ -265,6 +294,17 @@ func (p *Pipeline) Run(ctx context.Context, pl Payload) error { } log.Info("ingest: summarize+hyde complete", "elapsed", time.Since(stageStart)) + // LLM-built TOC tree (PageIndex-style). PDF-only because it + // relies on the parser's PageStart/PageEnd attribution to + // reconstruct per-page text. Non-fatal: a builder failure + // leaves documents.toc_tree NULL and the document remains + // fully retrievable via the sections tree above. + if p.TOCEnabled && pl.ContentType == "application/pdf" { + if err := p.runTOCBuilder(ctx, pl.DocumentID, parsed, log); err != nil { + log.Warn("ingest: toc-builder failed; falling back to NULL toc_tree", "err", err) + } + } + if err := p.DB.SetDocumentStatus(ctx, pl.DocumentID, db.StatusReady, ""); err != nil { return err } @@ -272,6 +312,113 @@ func (p *Pipeline) Run(ctx context.Context, pl Payload) error { return nil } +// runTOCBuilder assembles per-page text from the parsed PDF, runs +// the LLM-driven TOC builder over it, and persists the result. +// Returns an error only on a transport-level builder failure or a +// JSON-marshal blip; the caller logs and continues either way. +// +// A nil-result (no usable nodes) is treated as success and writes +// SQL NULL to documents.toc_tree (which is the column's default, +// so this is also the no-op). 
+func (p *Pipeline) runTOCBuilder(ctx context.Context, docID tree.DocumentID, parsed *parser.ParsedDoc, log *slog.Logger) error { + pages := assemblePagesFromSections(parsed.Sections) + if len(pages) == 0 { + log.Info("ingest: toc-builder skipped; no per-page text available") + return nil + } + model := p.TOCModel + if model == "" { + model = p.SummaryModel + } + builder := &TOCBuilder{ + LLM: p.LLM, + Model: model, + Concurrency: p.TOCConcurrency, + TOCCheckPages: p.TOCCheckPages, + } + nodes, usage, err := builder.Build(ctx, pages) + if err != nil { + return err + } + log.Info("ingest: toc-builder done", + "top_level_nodes", len(nodes), + "llm_calls", usage.LLMCalls, + "input_tokens", usage.InputTokens, + "output_tokens", usage.OutputTokens, + ) + if len(nodes) == 0 { + return nil + } + treeJSON, err := json.Marshal(nodes) + if err != nil { + return fmt.Errorf("marshal toc tree: %w", err) + } + if err := p.DB.UpdateDocumentTOCTree(ctx, docID, treeJSON); err != nil { + return fmt.Errorf("persist toc tree: %w", err) + } + return nil +} + +// assemblePagesFromSections groups the parsed sections' text by +// their PageStart, producing PageText entries the TOC builder can +// reason over. Sections that span multiple pages collapse onto +// their starting page — perfect page reconstruction would need +// raw glyph-level coordinates the parser doesn't currently +// surface, but the title-on-claimed-page heuristic still works +// because section starts (where the LLM looks for titles) live +// on PageStart. +// +// Sections with PageStart == 0 are skipped (the parser couldn't +// place them) so the builder never sees ambiguous page numbers. 
+func assemblePagesFromSections(secs []parser.Section) []PageText { + pageText := map[int]*strings.Builder{} + pages := []int{} + var walk func([]parser.Section) + walk = func(ss []parser.Section) { + for _, s := range ss { + if s.PageStart > 0 { + b, ok := pageText[s.PageStart] + if !ok { + b = &strings.Builder{} + pageText[s.PageStart] = b + pages = append(pages, s.PageStart) + } + if title := strings.TrimSpace(s.Title); title != "" { + if b.Len() > 0 { + b.WriteByte('\n') + } + b.WriteString(title) + b.WriteByte('\n') + } + if body := strings.TrimSpace(s.Content); body != "" { + b.WriteString(body) + b.WriteByte('\n') + } + } + walk(s.Children) + } + } + walk(secs) + // Sort the page-number index in place. + sortIntsAscending(pages) + out := make([]PageText, 0, len(pages)) + for _, p := range pages { + out = append(out, PageText{PageNumber: p, Text: pageText[p].String()}) + } + return out +} + +// sortIntsAscending sorts a slice of ints in place. Insertion sort +// is fine here — pages slice is typically a few hundred items +// at most. +func sortIntsAscending(xs []int) { + for i := 1; i < len(xs); i++ { + for j := i; j > 0 && xs[j-1] > xs[j]; j-- { + xs[j-1], xs[j] = xs[j], xs[j-1] + } + } +} + // runParallelStages runs summarize and HyDE concurrently, returning each // stage's error independently so callers can log them separately. A nil // hydeFn skips the HyDE stage (returns nil for hydeErr). diff --git a/pkg/ingest/toc_builder.go b/pkg/ingest/toc_builder.go new file mode 100644 index 0000000..3466cc5 --- /dev/null +++ b/pkg/ingest/toc_builder.go @@ -0,0 +1,826 @@ +package ingest + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "log" + "strings" + "sync" + + "golang.org/x/sync/errgroup" + + "github.com/hallelx2/llmgate" + + "github.com/hallelx2/vectorless-engine/pkg/tree" +) + +// PageText pairs a 1-indexed PDF page number with its extracted +// text. 
The TOC builder reasons over a slice of these in page order +// — it never sees raw PDF bytes, so it works equally well over the +// pages produced by the existing parser pipeline and over synthetic +// fixtures used in tests. +type PageText struct { + PageNumber int + Text string +} + +// TOCBuilder builds an LLM-derived table-of-contents tree for a +// document. The shape mirrors PageIndex's three-phase pipeline: +// +// 1. detect — scan the first TOCCheckPages pages and ask the LLM +// whether any of them looks like a real TOC. +// 2. extract — if a TOC page was found, ask the LLM to parse it +// into structured nodes; otherwise call the no-TOC +// path that generates a TOC straight from body +// text (the LLM is given the full page text tagged +// with markers it copies back as +// the start page). +// 3. verify — concurrently re-check each leaf node: does its +// title actually appear at the start of the claimed +// page? Mismatches are repaired by clearing the +// page back to zero; downstream readers treat zero +// as "open / unknown" rather than a wrong answer. +// +// EndPage is derived from sibling ordering once verification is +// done. The builder is deliberately tolerant of LLM parse blips +// (the same retry-then-degrade pattern the rest of the ingest path +// uses) — a single bad response never fails ingest. +type TOCBuilder struct { + // LLM is the provider client. Required. + LLM llmgate.Client + + // Model overrides the client's default. Empty inherits. + Model string + + // Concurrency caps parallel LLM calls during the verification + // phase. The detect + extract phases run sequentially because + // each page-by-page detector call is short and the no-TOC + // generator is one big call. Default: 4. + Concurrency int + + // TOCCheckPages bounds the prefix the detector scans for a + // table of contents. 
PageIndex defaults this to 20 — financial + // filings put their TOC inside the first dozen pages and a + // document with no TOC by page 20 almost never has one + // further in. Default: 20. + TOCCheckPages int +} + +// Usage is the cumulative LLM accounting returned by Build. Mirrors +// the retrieval.Usage shape so callers can fold it into the same +// per-document cost ledger that the retrieval path uses. +type Usage struct { + InputTokens int + OutputTokens int + TotalTokens int + CostUSD float64 + LLMCalls int +} + +// add folds the per-response usage from one LLM call into the +// running total. Keeps the call sites short. +func (u *Usage) add(r *llmgate.Response) { + if r == nil { + return + } + u.InputTokens += r.Usage.InputTokens + u.OutputTokens += r.Usage.OutputTokens + u.TotalTokens += r.Usage.TotalTokens + u.CostUSD += r.Usage.CostUSD + u.LLMCalls++ +} + +// Build runs the three-phase pipeline on pages and returns a +// flat-ish top-level TOC tree (children inside Nodes form the +// nested levels). Always returns a non-nil error chain only on a +// hard transport failure — LLM parse blips degrade to "empty +// result with logged warning" so the caller's ingest job never +// dies on a formatting glitch. +// +// pages must be in page order (PageNumber strictly ascending and +// 1-based). Build does not sort or de-duplicate. +func (b *TOCBuilder) Build(ctx context.Context, pages []PageText) ([]tree.TOCNode, Usage, error) { + var usage Usage + if len(pages) == 0 { + return nil, usage, nil + } + concurrency := b.Concurrency + if concurrency <= 0 { + concurrency = 4 + } + tocCheck := b.TOCCheckPages + if tocCheck <= 0 { + tocCheck = 20 + } + + // Phase 1: detect. Scan the leading pages for a TOC. + tocPages := b.detectTOCPages(ctx, pages, tocCheck, &usage) + + // Phase 2: extract. 
+ var nodes []tree.TOCNode + var err error + if len(tocPages) > 0 { + nodes, err = b.extractFromTOCPages(ctx, pages, tocPages, &usage) + } else { + nodes, err = b.generateNoTOC(ctx, pages, &usage) + } + if err != nil { + return nil, usage, err + } + if len(nodes) == 0 { + return nil, usage, nil + } + + // Phase 3: verify each leaf's claimed start page actually + // starts the section. Mismatches clear the page (set to 0) + // rather than making one up — downstream treats zero as + // open/unknown. + b.verifyTitlesConcurrent(ctx, nodes, pages, concurrency, &usage) + + // Derive end pages from sibling order. Done last so verified + // start pages drive the derivation. + deriveEndPages(nodes, lastPage(pages)) + + // Stamp stable node IDs onto every node so callers / external + // consumers have an opaque handle independent of position. + stampNodeIDs(nodes, "") + + return nodes, usage, nil +} + +// detectTOCPages scans the first tocCheck pages with the +// PageIndex-style single-page detector. Returns the 1-indexed page +// numbers (in order) the LLM judged as table-of-contents pages. +// +// Detection failures (transport / parse) silently fall back to +// "no TOC found here" so the caller transitions to the no-TOC path. +// This matches the PageIndex contract — the no-TOC generator is +// strictly more general than the TOC-extraction path. +func (b *TOCBuilder) detectTOCPages(ctx context.Context, pages []PageText, tocCheck int, usage *Usage) []int { + limit := tocCheck + if limit > len(pages) { + limit = len(pages) + } + var found []int + for i := 0; i < limit; i++ { + if ctx.Err() != nil { + return found + } + page := pages[i] + text := strings.TrimSpace(page.Text) + if text == "" { + continue + } + isTOC, err := b.runTOCDetector(ctx, text, usage) + if err != nil { + // Transport / ErrNotImplemented — abandon detection and + // let the caller fall back to the no-TOC path. 
+ return found + } + if isTOC { + found = append(found, page.PageNumber) + } + } + return found +} + +// runTOCDetector asks the LLM whether the supplied page text reads +// like a table of contents. Mirrors PageIndex's +// toc_detector_single_page. +func (b *TOCBuilder) runTOCDetector(ctx context.Context, pageText string, usage *Usage) (bool, error) { + prompt := fmt.Sprintf(`Your job is to detect if there is a table of contents provided in the given text. + +Given text: %s + +return the following JSON format: +{ + "thinking": "", + "toc_detected": "" +} + +Directly return the final JSON structure. Do not output anything else. +Please note: abstract, summary, notation list, figure list, table list, etc. are not tables of contents.`, truncate(pageText, tocDetectorMaxChars)) + + req := llmgate.Request{ + Model: b.Model, + Temperature: 0.0, + MaxTokens: 400, + Messages: []llmgate.Message{ + {Role: llmgate.RoleSystem, Content: tocDetectorSystemPrompt}, + {Role: llmgate.RoleUser, Content: prompt}, + }, + JSONMode: true, + JSONSchema: []byte(tocDetectorJSONSchema), + } + raw, err := runTOCJSONWithRetry(ctx, b.LLM, req, defaultTOCRetries, usage) + if err != nil { + return false, err + } + if raw == "" { + return false, nil + } + var p tocDetectorPayload + if err := unmarshalLenient([]byte(raw), &p); err != nil { + return false, nil + } + return strings.EqualFold(strings.TrimSpace(p.TOCDetected), "yes"), nil +} + +// extractFromTOCPages joins the detected TOC pages and asks the +// LLM to parse them into structured nodes. The path used when a +// TOC page was found — the structure on the page is the structure +// the LLM is asked to reproduce, just with start_page resolved. +// +// On parse failure or transport blip, returns nil — the caller +// degrades to an empty tree (still useful: the document remains +// retrievable via the existing sections tree). 
+func (b *TOCBuilder) extractFromTOCPages(ctx context.Context, pages []PageText, tocPages []int, usage *Usage) ([]tree.TOCNode, error) { + tocText := joinTOCPagesText(pages, tocPages) + bodyText := buildPhysicalIndexedText(pages, tocDetectorMaxChars*4) + + prompt := fmt.Sprintf(`You are an expert in extracting hierarchical tree structure. Given a raw table-of-contents block and the document's body text (tagged with markers), produce the hierarchical TOC as a JSON array of nodes. + +For each node: +- structure: dotted hierarchical index ("1", "1.1", "1.1.2") matching the heading depth. +- title: the original section title, only fixing space inconsistency. +- physical_index: the tag where the section begins. Look at the body text to resolve the page; if you cannot confidently locate it, use null. + +Raw table of contents: +%s + +Body text (with markers): +%s + +Return ONLY a JSON object: {"nodes": [{"structure": "1", "title": "...", "physical_index": ""}, ...]}. Do not output anything else.`, + truncate(tocText, tocExtractorMaxChars), + truncate(bodyText, tocExtractorMaxBody), + ) + + req := llmgate.Request{ + Model: b.Model, + Temperature: 0.0, + MaxTokens: 4096, + Messages: []llmgate.Message{ + {Role: llmgate.RoleSystem, Content: tocExtractorSystemPrompt}, + {Role: llmgate.RoleUser, Content: prompt}, + }, + JSONMode: true, + JSONSchema: []byte(tocNodesJSONSchema), + } + raw, err := runTOCJSONWithRetry(ctx, b.LLM, req, defaultTOCRetries, usage) + if err != nil { + return nil, err + } + flat := parseTOCNodesPayload(raw) + return assembleHierarchy(flat), nil +} + +// generateNoTOC is the PageIndex-style process_no_toc driver: when +// no TOC page was found, page content (tagged with +// markers) is fed to the LLM with instructions +// to emit a TOC straight from headings in the body. 
+func (b *TOCBuilder) generateNoTOC(ctx context.Context, pages []PageText, usage *Usage) ([]tree.TOCNode, error) { + body := buildPhysicalIndexedText(pages, noTOCMaxBody) + prompt := fmt.Sprintf(`You are an expert in extracting hierarchical tree structure; your task is to generate the table-of-contents tree of the document below from its body text. + +The structure variable is the dotted hierarchical index ("1", "1.1", "1.1.2") representing the section's position in the outline. + +For the title, extract the original heading verbatim; only fix space inconsistency. + +The text contains markers indicating the start and end of page X. For each section's physical_index, return the tag where the section starts (keep the format). + +Body text: +%s + +Return ONLY a JSON object: {"nodes": [{"structure": "1", "title": "...", "physical_index": ""}, ...]}. Do not output anything else.`, body) + + req := llmgate.Request{ + Model: b.Model, + Temperature: 0.0, + MaxTokens: 4096, + Messages: []llmgate.Message{ + {Role: llmgate.RoleSystem, Content: tocExtractorSystemPrompt}, + {Role: llmgate.RoleUser, Content: prompt}, + }, + JSONMode: true, + JSONSchema: []byte(tocNodesJSONSchema), + } + raw, err := runTOCJSONWithRetry(ctx, b.LLM, req, defaultTOCRetries, usage) + if err != nil { + return nil, err + } + flat := parseTOCNodesPayload(raw) + return assembleHierarchy(flat), nil +} + +// verifyTitlesConcurrent runs PageIndex's check_title_appearance_in_start +// over every node whose StartPage is set, with bounded concurrency. +// Mismatches set StartPage back to zero — the downstream contract +// is "zero means unknown / open" — so a misclaimed page never +// pretends to be authoritative. 
+func (b *TOCBuilder) verifyTitlesConcurrent(ctx context.Context, nodes []tree.TOCNode, pages []PageText, concurrency int, usage *Usage) { + pageByNumber := indexByPage(pages) + flat := flattenForVerify(nodes) + if len(flat) == 0 { + return + } + + sem := make(chan struct{}, concurrency) + g, gctx := errgroup.WithContext(ctx) + var ( + mu sync.Mutex + localUse Usage + ) + + type result struct { + node *tree.TOCNode + ok bool + } + results := make([]result, len(flat)) + + for i, n := range flat { + i, n := i, n + if n.StartPage <= 0 { + continue + } + pageText, ok := pageByNumber[n.StartPage] + if !ok { + // claimed a page we don't have — clear it. + results[i] = result{node: n, ok: false} + continue + } + g.Go(func() error { + select { + case sem <- struct{}{}: + defer func() { <-sem }() + case <-gctx.Done(): + return nil + } + startsHere, err := b.runVerifyTitleAtPageStart(gctx, n.Title, pageText, &localUse) + if err != nil { + // Transport / stub LLM — treat as "not verified" but + // don't clear the page; the LLM never weighed in. + results[i] = result{node: n, ok: true} + return nil + } + results[i] = result{node: n, ok: startsHere} + return nil + }) + } + _ = g.Wait() + + // Fold per-call usage into the caller's accumulator under the lock + // so concurrent additions stay coherent. + mu.Lock() + usage.InputTokens += localUse.InputTokens + usage.OutputTokens += localUse.OutputTokens + usage.TotalTokens += localUse.TotalTokens + usage.CostUSD += localUse.CostUSD + usage.LLMCalls += localUse.LLMCalls + mu.Unlock() + + for _, r := range results { + if r.node == nil { + continue + } + if !r.ok { + r.node.StartPage = 0 + } + } +} + +// runVerifyTitleAtPageStart mirrors PageIndex's +// check_title_appearance_in_start: does this section's title appear +// at the beginning of the supplied page? 
+func (b *TOCBuilder) runVerifyTitleAtPageStart(ctx context.Context, title, pageText string, usage *Usage) (bool, error) { + prompt := fmt.Sprintf(`You will be given a section title and a page's text. +Your job is to check if the section starts at the beginning of the given page text. +If there are other contents before the section title, then the section does NOT start at the beginning of the page text. +If the section title is the first meaningful content in the page text, then the section starts at the beginning. + +Note: do fuzzy matching; ignore space inconsistency. + +Section title: %s +Page text: %s + +Reply format: +{ + "thinking": "", + "start_begin": "" +} +Directly return the final JSON structure. Do not output anything else.`, title, truncate(pageText, verifyMaxChars)) + + req := llmgate.Request{ + Model: b.Model, + Temperature: 0.0, + MaxTokens: 400, + Messages: []llmgate.Message{ + {Role: llmgate.RoleSystem, Content: tocVerifySystemPrompt}, + {Role: llmgate.RoleUser, Content: prompt}, + }, + JSONMode: true, + JSONSchema: []byte(tocVerifyJSONSchema), + } + raw, err := runTOCJSONWithRetry(ctx, b.LLM, req, defaultTOCRetries, usage) + if err != nil { + return false, err + } + if raw == "" { + return false, nil + } + var p tocVerifyPayload + if err := unmarshalLenient([]byte(raw), &p); err != nil { + // Couldn't parse — keep the page (don't clear). The LLM had + // no clear say, so the safer move is "trust the extractor". + return true, nil + } + return strings.EqualFold(strings.TrimSpace(p.StartBegin), "yes"), nil +} + +// --- prompt + schema constants --- + +const ( + tocDetectorSystemPrompt = "You are a precise document-structure analyser. Decide whether a single page of text is a table of contents." + tocExtractorSystemPrompt = "You are an expert in extracting hierarchical tree structures from documents. You output strict JSON only." + tocVerifySystemPrompt = "You are a precise verifier. Decide whether a section title starts a page's text." 
+ defaultTOCRetries = 2 + tocDetectorMaxChars = 12000 + tocExtractorMaxChars = 16000 + tocExtractorMaxBody = 60000 + noTOCMaxBody = 80000 + verifyMaxChars = 4000 + tocDetectorJSONSchema = `{"type":"object","properties":{"thinking":{"type":"string"},"toc_detected":{"type":"string"}},"required":["toc_detected"]}` + tocVerifyJSONSchema = `{"type":"object","properties":{"thinking":{"type":"string"},"start_begin":{"type":"string"}},"required":["start_begin"]}` + tocNodesJSONSchema = `{"type":"object","properties":{"nodes":{"type":"array","items":{"type":"object","properties":{"structure":{"type":"string"},"title":{"type":"string"},"physical_index":{"type":["string","null"]}},"required":["title"]}}},"required":["nodes"]}` +) + +// --- JSON payload types --- + +type tocDetectorPayload struct { + Thinking string `json:"thinking"` + TOCDetected string `json:"toc_detected"` +} + +type tocVerifyPayload struct { + Thinking string `json:"thinking"` + StartBegin string `json:"start_begin"` +} + +type tocNodePayload struct { + Structure string `json:"structure"` + Title string `json:"title"` + PhysicalIndex *string `json:"physical_index"` +} + +type tocNodesPayload struct { + Nodes []tocNodePayload `json:"nodes"` +} + +// --- shared helpers --- + +// runTOCJSONWithRetry runs a JSON-mode TOC LLM call, retrying up to +// maxRetries additional times if the response can't be parsed. +// Mirrors the runSelectionWithRetry contract from +// pkg/retrieval/single_pass.go — copied here rather than imported +// because the retrieval package owns its own per-domain version of +// the same idea and we want the TOC builder to be importable by +// any future consumer without dragging retrieval in. +// +// Returns the final raw response text (empty on transport / stub +// failure). Caller decodes; a final parse failure degrades to "no +// usable response" rather than an error. 
+func runTOCJSONWithRetry(ctx context.Context, client llmgate.Client, baseReq llmgate.Request, maxRetries int, usage *Usage) (string, error) { + if maxRetries < 0 { + maxRetries = 0 + } + var lastRaw string + for attempt := 0; attempt <= maxRetries; attempt++ { + req := baseReq + if attempt > 0 { + msgs := make([]llmgate.Message, len(baseReq.Messages)) + copy(msgs, baseReq.Messages) + tail := len(msgs) - 1 + msgs[tail] = llmgate.Message{ + Role: msgs[tail].Role, + Content: msgs[tail].Content + "\n\nIMPORTANT: respond with ONLY a JSON object matching the schema. No prose, no markdown fences.", + } + req.Messages = msgs + } + resp, err := client.Complete(ctx, req) + if err != nil { + // Stub LLM (ErrNotImplemented) is a soft failure — the + // caller will degrade. Transport errors do the same so + // ingest never dies on a transient blip. + if errors.Is(err, llmgate.ErrNotImplemented) { + return "", nil + } + return "", err + } + usage.add(resp) + lastRaw = resp.Content + if looksLikeJSON(resp.Content) { + return resp.Content, nil + } + } + log.Printf("toc-builder: response did not parse after %d attempts; degrading to empty", maxRetries+1) + return lastRaw, nil +} + +// looksLikeJSON is a cheap probe so the retry loop can stop once +// the model returns something that at least textually resembles a +// JSON object. The real parser may still reject — strict parsing +// happens at the caller — but this avoids burning retries on +// obvious non-JSON ("Sure, here is the TOC: ..."). +func looksLikeJSON(s string) bool { + s = strings.TrimSpace(s) + if s == "" { + return false + } + if strings.HasPrefix(s, "```") { + s = strings.TrimPrefix(s, "```json") + s = strings.TrimPrefix(s, "```") + s = strings.TrimSpace(s) + } + return strings.HasPrefix(s, "{") || strings.HasPrefix(s, "[") +} + +// unmarshalLenient strips code fences and any prose around the +// first { / last } before decoding, matching the parser pattern +// used in pkg/retrieval and pkg/ingest/summary_axes.go. 
+func unmarshalLenient(raw []byte, dst any) error { + s := strings.TrimSpace(string(raw)) + if strings.HasPrefix(s, "```") { + if i := strings.Index(s, "\n"); i >= 0 { + s = s[i+1:] + } + s = strings.TrimSuffix(s, "```") + s = strings.TrimSpace(s) + } + if i := strings.Index(s, "{"); i > 0 { + s = s[i:] + } + if j := strings.LastIndex(s, "}"); j >= 0 && j < len(s)-1 { + s = s[:j+1] + } + return json.Unmarshal([]byte(s), dst) +} + +// parseTOCNodesPayload decodes the raw nodes JSON. Returns an empty +// slice on any parse failure — the builder caller treats "no +// usable nodes" as "leave TOC NULL" and proceeds with ingest. +func parseTOCNodesPayload(raw string) []tocNodePayload { + if raw == "" { + return nil + } + var p tocNodesPayload + if err := unmarshalLenient([]byte(raw), &p); err != nil { + return nil + } + return p.Nodes +} + +// --- shape helpers --- + +// joinTOCPagesText collects the text of the supplied TOC pages, in +// order, separated by newlines so the LLM sees them as one +// coherent block. +func joinTOCPagesText(pages []PageText, tocPages []int) string { + idx := indexByPage(pages) + var b strings.Builder + for _, p := range tocPages { + text, ok := idx[p] + if !ok || text == "" { + continue + } + if b.Len() > 0 { + b.WriteString("\n\n") + } + b.WriteString(text) + } + return b.String() +} + +// buildPhysicalIndexedText renders pages with +// markers around each page's text — the literal format the LLM is +// told to reproduce as the section's start page. budget caps the +// total characters so we never blow past the model's context. +func buildPhysicalIndexedText(pages []PageText, budget int) string { + var b strings.Builder + for _, p := range pages { + seg := fmt.Sprintf("\n%s\n\n\n", p.PageNumber, p.Text, p.PageNumber) + if budget > 0 && b.Len()+len(seg) > budget { + break + } + b.WriteString(seg) + } + return b.String() +} + +// indexByPage returns a map of page number to page text. 
+func indexByPage(pages []PageText) map[int]string { + out := make(map[int]string, len(pages)) + for _, p := range pages { + out[p.PageNumber] = p.Text + } + return out +} + +// lastPage returns the highest PageNumber in pages, or zero if +// empty. Used as the default upper bound when deriving end pages. +func lastPage(pages []PageText) int { + if len(pages) == 0 { + return 0 + } + last := pages[0].PageNumber + for _, p := range pages[1:] { + if p.PageNumber > last { + last = p.PageNumber + } + } + return last +} + +// truncate caps s at max characters, appending an ellipsis when +// it had to cut. A non-positive max disables the cap. +func truncate(s string, max int) string { + if max <= 0 || len(s) <= max { + return s + } + return s[:max] + "…" +} + +// physicalIndexRE-like helper without regexp: parses the integer X +// out of "". Returns 0 when the input doesn't +// match — the verify phase treats zero as unknown. +func parsePhysicalIndex(s string) int { + const prefix = " '9' { + return 0 + } + n = n*10 + int(r-'0') + } + return n +} + +// assembleHierarchy turns a flat list of TOC node payloads into a +// nested tree based on the dotted structure ("1", "1.1", "1.1.2"). +// Missing intermediate parents are tolerated — orphans land at the +// top level so a misnumbered LLM response doesn't drop nodes +// silently. +func assembleHierarchy(flat []tocNodePayload) []tree.TOCNode { + if len(flat) == 0 { + return nil + } + // First materialise every payload as a TOCNode with its claimed + // start page resolved. 
+ nodes := make([]tree.TOCNode, 0, len(flat)) + for _, n := range flat { + title := strings.TrimSpace(n.Title) + if title == "" { + continue + } + page := 0 + if n.PhysicalIndex != nil { + page = parsePhysicalIndex(*n.PhysicalIndex) + } + nodes = append(nodes, tree.TOCNode{ + Structure: strings.TrimSpace(n.Structure), + Title: title, + StartPage: page, + }) + } + if len(nodes) == 0 { + return nil + } + + // Build a sentinel root; nest by counting dots in Structure. + // "1" → depth 1, "1.2" → depth 2, "1.2.3" → depth 3. + type ref struct { + node *tree.TOCNode + structure string + } + var ( + out []tree.TOCNode + path []ref + ) + for i := range nodes { + n := &nodes[i] + depth := depthOf(n.Structure) + if depth <= 0 { + depth = 1 + } + // Pop the path stack down to depth-1 so a "1.2" inserts + // under whatever last touched depth 1. + for len(path) >= depth { + path = path[:len(path)-1] + } + if len(path) == 0 { + out = append(out, *n) + path = append(path, ref{node: &out[len(out)-1], structure: n.Structure}) + continue + } + parent := path[len(path)-1].node + parent.Nodes = append(parent.Nodes, *n) + path = append(path, ref{node: &parent.Nodes[len(parent.Nodes)-1], structure: n.Structure}) + } + return out +} + +// depthOf returns the depth implied by a dotted structure string +// ("1" → 1, "1.2" → 2, "" → 0). A malformed structure ("1..2", +// "a.b") still returns the number of dot-separated tokens — we'd +// rather group than crash. +func depthOf(structure string) int { + if structure == "" { + return 0 + } + return strings.Count(structure, ".") + 1 +} + +// flattenForVerify returns pointers to every node in the tree in +// depth-first pre-order so the verification phase can mutate +// StartPage in place. 
+func flattenForVerify(nodes []tree.TOCNode) []*tree.TOCNode { + var out []*tree.TOCNode + var walk func(ns []tree.TOCNode) + walk = func(ns []tree.TOCNode) { + for i := range ns { + out = append(out, &ns[i]) + walk(ns[i].Nodes) + } + } + walk(nodes) + return out +} + +// deriveEndPages walks the tree and fills each node's EndPage from +// the next sibling at the same depth (StartPage - 1) or the +// supplied docLastPage when no later sibling exists. Children's +// end pages cap at their parent's, which is what readers expect +// for a TOC. +func deriveEndPages(nodes []tree.TOCNode, docLastPage int) { + deriveEndPagesIn(nodes, docLastPage) +} + +func deriveEndPagesIn(nodes []tree.TOCNode, ceiling int) { + for i := range nodes { + n := &nodes[i] + // Find the next sibling's start page that is strictly + // greater than this one — that's our end. Skip sibling + // entries whose StartPage was cleared (zero) by + // verification so a single bad page doesn't sink the + // rest of the row. + end := 0 + for j := i + 1; j < len(nodes); j++ { + if nodes[j].StartPage > n.StartPage { + end = nodes[j].StartPage - 1 + break + } + } + if end <= 0 { + end = ceiling + } + // EndPage can never precede StartPage; clear to zero when + // the data conflicts. + if n.StartPage > 0 && end >= n.StartPage { + n.EndPage = end + } + // Recurse with the child ceiling = this node's EndPage (or + // the parent's ceiling if EndPage is unknown). + childCeiling := n.EndPage + if childCeiling == 0 { + childCeiling = ceiling + } + deriveEndPagesIn(n.Nodes, childCeiling) + } +} + +// stampNodeIDs assigns deterministic NodeIDs based on the dotted +// structure (with a prefix), recursing into children. IDs are +// stable across runs given the same structure, which is handy for +// callers that diff trees across re-ingestions. 
+func stampNodeIDs(nodes []tree.TOCNode, prefix string) { + for i := range nodes { + n := &nodes[i] + base := n.Structure + if base == "" { + base = fmt.Sprintf("n%d", i+1) + } + if prefix == "" { + n.NodeID = "toc_" + base + } else { + n.NodeID = prefix + "_" + base + } + stampNodeIDs(n.Nodes, n.NodeID) + } +} diff --git a/pkg/ingest/toc_builder_test.go b/pkg/ingest/toc_builder_test.go new file mode 100644 index 0000000..4dfe22a --- /dev/null +++ b/pkg/ingest/toc_builder_test.go @@ -0,0 +1,466 @@ +package ingest + +import ( + "context" + "strings" + "sync" + "sync/atomic" + "testing" + + "github.com/hallelx2/llmgate" + + "github.com/hallelx2/vectorless-engine/pkg/parser" + "github.com/hallelx2/vectorless-engine/pkg/tree" +) + +// scriptedLLM is a minimal inline mock — kept inside the ingest +// package so it doesn't leak into the public API surface. Each +// call walks a script keyed by phase ("detect", "extract", +// "verify"), returning the next canned response. Mirrors the +// pattern used in pkg/retrieval/retrieval_test.go's mockLLM but +// scoped narrower so individual tests can wire bespoke behaviour +// without dragging the retrieval test fixture in. +type scriptedLLM struct { + mu sync.Mutex + calls int32 + + // route returns the response for a given prompt. Tests inject + // behaviour here; falls back to a permissive "no" detector + + // empty extractor when nil so unrelated test paths don't have + // to script every prompt. + route func(userPrompt string) string + + // captured holds every user prompt seen, in order. Tests + // assert phase ordering and prompt content from this. 
+ captured []string +} + +func (m *scriptedLLM) Complete(_ context.Context, req llmgate.Request) (*llmgate.Response, error) { + atomic.AddInt32(&m.calls, 1) + var user string + for _, msg := range req.Messages { + if msg.Role == llmgate.RoleUser { + user = msg.Content + } + } + m.mu.Lock() + m.captured = append(m.captured, user) + m.mu.Unlock() + + content := "" + if m.route != nil { + content = m.route(user) + } + if content == "" { + content = `{"toc_detected":"no"}` + } + return &llmgate.Response{ + Content: content, + Usage: llmgate.Usage{InputTokens: 100, OutputTokens: 50, TotalTokens: 150}, + }, nil +} + +func (m *scriptedLLM) CountTokens(_ context.Context, s string) (int, error) { + return len(s) / 4, nil +} + +// TestBuildTOCFoundPath walks the happy path where the detector +// finds a TOC page, the extractor parses it into nested nodes, +// and verification leaves the start pages intact. +func TestBuildTOCFoundPath(t *testing.T) { + llm := &scriptedLLM{} + llm.route = func(prompt string) string { + switch { + case strings.Contains(prompt, "table of contents provided in the given text"): + // Detector: yes only when the page actually contains + // "Table of Contents". + if strings.Contains(prompt, "Table of Contents") { + return `{"toc_detected":"yes"}` + } + return `{"toc_detected":"no"}` + case strings.Contains(prompt, "hierarchical tree structure"): + // Extractor: return a small 10-K outline. + return `{"nodes":[ + {"structure":"1","title":"Business","physical_index":""}, + {"structure":"1.1","title":"Overview","physical_index":""}, + {"structure":"2","title":"Risk Factors","physical_index":""}, + {"structure":"3","title":"MD&A","physical_index":""} + ]}` + case strings.Contains(prompt, "section starts at the beginning"): + return `{"start_begin":"yes"}` + } + return `{"toc_detected":"no"}` + } + + pages := []PageText{ + {PageNumber: 1, Text: "Cover Page\nForm 10-K\n"}, + {PageNumber: 2, Text: "Table of Contents\n1. Business ... 3\n1.1 Overview ... 
3\n2. Risk Factors ... 10\n3. MD&A ... 20"}, + {PageNumber: 3, Text: "Business\nWe are a company that does things."}, + {PageNumber: 10, Text: "Risk Factors\nVarious risks apply."}, + {PageNumber: 20, Text: "MD&A\nDiscussion of operations."}, + } + + b := &TOCBuilder{LLM: llm, TOCCheckPages: 5, Concurrency: 2} + nodes, usage, err := b.Build(context.Background(), pages) + if err != nil { + t.Fatalf("Build: %v", err) + } + if len(nodes) != 3 { + t.Fatalf("top-level nodes: got %d want 3 (Business, Risk Factors, MD&A) — got: %+v", len(nodes), nodes) + } + if nodes[0].Title != "Business" || nodes[0].StartPage != 3 { + t.Errorf("nodes[0]: got %+v", nodes[0]) + } + if len(nodes[0].Nodes) != 1 || nodes[0].Nodes[0].Title != "Overview" { + t.Errorf("nodes[0].Nodes: got %+v", nodes[0].Nodes) + } + if nodes[1].Title != "Risk Factors" || nodes[1].StartPage != 10 { + t.Errorf("nodes[1]: got %+v", nodes[1]) + } + if nodes[1].EndPage != 19 { + t.Errorf("nodes[1].EndPage: got %d want 19 (one before MD&A's start)", nodes[1].EndPage) + } + if nodes[2].EndPage != 20 { + t.Errorf("nodes[2].EndPage (last sibling): got %d want 20 (doc last page)", nodes[2].EndPage) + } + if usage.LLMCalls < 2 { + t.Errorf("expected at least 2 LLM calls (detector + extractor), got %d", usage.LLMCalls) + } + if usage.InputTokens == 0 { + t.Errorf("usage should track input tokens; got 0") + } + // NodeIDs are stamped deterministically from structure. + if nodes[0].NodeID != "toc_1" || nodes[0].Nodes[0].NodeID != "toc_1_1.1" { + t.Errorf("node IDs not stamped: top=%q child=%q", nodes[0].NodeID, nodes[0].Nodes[0].NodeID) + } +} + +// TestBuildNoTOCPath drives the generateTOCInit branch — the +// detector replies "no" for every page, so the builder falls +// through to the body-text TOC generator. 
+func TestBuildNoTOCPath(t *testing.T) { + llm := &scriptedLLM{} + var extractorCalled atomic.Int32 + var noTOCCalled atomic.Int32 + llm.route = func(prompt string) string { + switch { + case strings.Contains(prompt, "table of contents provided in the given text"): + return `{"toc_detected":"no"}` + case strings.Contains(prompt, "hierarchical tree structure"): + // The no-TOC and extractor prompts share the same + // system prompt + JSON shape; the user prompt body + // differs. We distinguish by the "raw table of + // contents" marker which only the extractor uses. + if strings.Contains(prompt, "Raw table of contents") { + extractorCalled.Add(1) + } else { + noTOCCalled.Add(1) + } + return `{"nodes":[ + {"structure":"1","title":"Introduction","physical_index":""}, + {"structure":"2","title":"Methods","physical_index":""} + ]}` + case strings.Contains(prompt, "section starts at the beginning"): + return `{"start_begin":"yes"}` + } + return `{"toc_detected":"no"}` + } + + pages := []PageText{ + {PageNumber: 1, Text: "Cover page with no TOC."}, + {PageNumber: 2, Text: "Introduction\nWe study X."}, + {PageNumber: 5, Text: "Methods\nWe used Y."}, + } + + b := &TOCBuilder{LLM: llm, TOCCheckPages: 5, Concurrency: 2} + nodes, _, err := b.Build(context.Background(), pages) + if err != nil { + t.Fatalf("Build: %v", err) + } + if extractorCalled.Load() != 0 { + t.Errorf("extractor should NOT run when no TOC page was detected") + } + if noTOCCalled.Load() == 0 { + t.Errorf("no-TOC generator should have been invoked") + } + if len(nodes) != 2 || nodes[0].Title != "Introduction" || nodes[1].Title != "Methods" { + t.Fatalf("got nodes %+v", nodes) + } + if nodes[0].StartPage != 2 || nodes[1].StartPage != 5 { + t.Errorf("page numbers not lifted from : got %+v", nodes) + } +} + +// TestVerificationRepairsWrongPage scripts a verifier that says +// "no" for a node whose claimed page doesn't match — the start +// page should be cleared back to zero. 
Downstream consumers treat +// zero as "open / unknown" rather than a lie. +func TestVerificationRepairsWrongPage(t *testing.T) { + llm := &scriptedLLM{} + llm.route = func(prompt string) string { + switch { + case strings.Contains(prompt, "table of contents provided"): + if strings.Contains(prompt, "Table of Contents") { + return `{"toc_detected":"yes"}` + } + return `{"toc_detected":"no"}` + case strings.Contains(prompt, "hierarchical tree structure"): + return `{"nodes":[ + {"structure":"1","title":"Foo","physical_index":""}, + {"structure":"2","title":"Bar","physical_index":""} + ]}` + case strings.Contains(prompt, "section starts at the beginning"): + // Only Foo's claim is valid; Bar's is a lie. + if strings.Contains(prompt, "Section title: Foo") { + return `{"start_begin":"yes"}` + } + return `{"start_begin":"no"}` + } + return `{"toc_detected":"no"}` + } + + pages := []PageText{ + {PageNumber: 1, Text: "Table of Contents\nFoo ... 4\nBar ... 7"}, + {PageNumber: 4, Text: "Foo\nbody of foo"}, + {PageNumber: 7, Text: "Some other content, not Bar"}, + } + + b := &TOCBuilder{LLM: llm, TOCCheckPages: 5, Concurrency: 2} + nodes, _, err := b.Build(context.Background(), pages) + if err != nil { + t.Fatalf("Build: %v", err) + } + if len(nodes) != 2 { + t.Fatalf("nodes: got %d want 2", len(nodes)) + } + if nodes[0].StartPage != 4 { + t.Errorf("verified node Foo should keep page 4, got %d", nodes[0].StartPage) + } + if nodes[1].StartPage != 0 { + t.Errorf("repaired node Bar should have StartPage=0 (cleared), got %d", nodes[1].StartPage) + } +} + +// TestRetryOnBadJSON exercises the retry path: the first +// extractor response is plain prose, the second is valid JSON. +// The builder should retry and end up with usable nodes. 
+func TestRetryOnBadJSON(t *testing.T) { + llm := &scriptedLLM{} + var extractorCalls atomic.Int32 + llm.route = func(prompt string) string { + if strings.Contains(prompt, "table of contents provided") { + if strings.Contains(prompt, "Table of Contents") { + return `{"toc_detected":"yes"}` + } + return `{"toc_detected":"no"}` + } + if strings.Contains(prompt, "hierarchical tree structure") { + n := extractorCalls.Add(1) + if n == 1 { + // First try: plain prose. Retry loop should kick in. + return "Sure, here is the structure: I will explain it ..." + } + return `{"nodes":[{"structure":"1","title":"Solo","physical_index":""}]}` + } + if strings.Contains(prompt, "section starts at the beginning") { + return `{"start_begin":"yes"}` + } + return `{"toc_detected":"no"}` + } + + pages := []PageText{ + {PageNumber: 1, Text: "Table of Contents\nSolo ... 2"}, + {PageNumber: 2, Text: "Solo\nbody"}, + } + + b := &TOCBuilder{LLM: llm, TOCCheckPages: 5, Concurrency: 2} + nodes, usage, err := b.Build(context.Background(), pages) + if err != nil { + t.Fatalf("Build: %v", err) + } + if len(nodes) != 1 || nodes[0].Title != "Solo" { + t.Fatalf("nodes: %+v", nodes) + } + if extractorCalls.Load() < 2 { + t.Errorf("expected the retry loop to fire (>=2 extractor calls), got %d", extractorCalls.Load()) + } + // Retry adds an extra LLM call beyond the minimum (detector + extractor + verify). + if usage.LLMCalls < 4 { + t.Errorf("expected >=4 LLM calls (detector + extractor*2 + verify), got %d", usage.LLMCalls) + } +} + +// TestEndPageDerivationFromSiblings asserts the post-verification +// pass fills EndPage from the next sibling's StartPage - 1 and +// the document's last page for the final sibling. 
+func TestEndPageDerivationFromSiblings(t *testing.T) { + root := []tree.TOCNode{ + {Structure: "1", Title: "A", StartPage: 5}, + {Structure: "2", Title: "B", StartPage: 12}, + {Structure: "3", Title: "C", StartPage: 30}, + } + deriveEndPages(root, 50) + if root[0].EndPage != 11 { + t.Errorf("A.EndPage: got %d want 11", root[0].EndPage) + } + if root[1].EndPage != 29 { + t.Errorf("B.EndPage: got %d want 29", root[1].EndPage) + } + if root[2].EndPage != 50 { + t.Errorf("C.EndPage (last): got %d want 50", root[2].EndPage) + } +} + +// TestAssembleHierarchyNestsByStructure makes sure dotted +// structure indices group correctly. "1.1" nests under "1", +// "2.1.1" three levels deep, etc. +func TestAssembleHierarchyNestsByStructure(t *testing.T) { + flat := []tocNodePayload{ + {Structure: "1", Title: "Top"}, + {Structure: "1.1", Title: "Sub-A"}, + {Structure: "1.1.1", Title: "Leaf-1"}, + {Structure: "1.2", Title: "Sub-B"}, + {Structure: "2", Title: "Sibling"}, + } + out := assembleHierarchy(flat) + if len(out) != 2 { + t.Fatalf("top-level: got %d want 2", len(out)) + } + if out[0].Title != "Top" || len(out[0].Nodes) != 2 { + t.Fatalf("Top children: %+v", out[0].Nodes) + } + if out[0].Nodes[0].Title != "Sub-A" || len(out[0].Nodes[0].Nodes) != 1 { + t.Fatalf("Sub-A: %+v", out[0].Nodes[0]) + } + if out[0].Nodes[0].Nodes[0].Title != "Leaf-1" { + t.Errorf("Leaf-1 missing under Sub-A; got %+v", out[0].Nodes[0].Nodes) + } +} + +// TestParsePhysicalIndex covers the tag parser used by +// assembleHierarchy. Malformed tags should return 0 (the +// "unknown" sentinel) rather than panic. 
+func TestParsePhysicalIndex(t *testing.T) { + cases := []struct { + in string + want int + }{ + {"", 5}, + {"", 42}, + {"", 0}, + {"not a tag", 0}, + {"", 0}, + {" ", 7}, + } + for _, c := range cases { + if got := parsePhysicalIndex(c.in); got != c.want { + t.Errorf("parsePhysicalIndex(%q): got %d want %d", c.in, got, c.want) + } + } +} + +// TestBuildEmptyPages should return cleanly with no usage and no nodes. +func TestBuildEmptyPages(t *testing.T) { + b := &TOCBuilder{LLM: &scriptedLLM{}} + nodes, usage, err := b.Build(context.Background(), nil) + if err != nil { + t.Fatalf("Build: %v", err) + } + if nodes != nil { + t.Errorf("empty input should yield nil nodes, got %v", nodes) + } + if usage.LLMCalls != 0 { + t.Errorf("empty input should make no LLM calls, got %d", usage.LLMCalls) + } +} + +// TestAssemblePagesFromSections covers the bridge between the +// parser's section-tree output and the TOC builder's per-page +// input. Sections sharing a starting page collapse into one +// PageText entry; sections with PageStart==0 are skipped so the +// builder never sees ambiguous page numbers. 
+func TestAssemblePagesFromSections(t *testing.T) { + secs := []parser.Section{ + { + Level: 1, + Title: "Business", + Content: "We do business.", + PageStart: 3, + PageEnd: 8, + Children: []parser.Section{ + {Level: 2, Title: "Overview", Content: "An overview.", PageStart: 3, PageEnd: 4}, + }, + }, + {Level: 1, Title: "Risk Factors", Content: "Risks here.", PageStart: 10, PageEnd: 12}, + {Level: 1, Title: "No-page section", Content: "Skipped.", PageStart: 0}, + } + pages := assemblePagesFromSections(secs) + if len(pages) != 2 { + t.Fatalf("want 2 distinct pages (3 + 10), got %d: %+v", len(pages), pages) + } + if pages[0].PageNumber != 3 || pages[1].PageNumber != 10 { + t.Errorf("pages out of order or wrong: %+v", pages) + } + if !strings.Contains(pages[0].Text, "Business") || !strings.Contains(pages[0].Text, "Overview") { + t.Errorf("page 3 missing expected titles: %q", pages[0].Text) + } + if !strings.Contains(pages[1].Text, "Risk Factors") { + t.Errorf("page 10 missing Risk Factors title: %q", pages[1].Text) + } + if strings.Contains(pages[0].Text, "Skipped.") || strings.Contains(pages[1].Text, "Skipped.") { + t.Errorf("PageStart=0 section should be skipped; got %+v", pages) + } +} + +// TestSynthetic10KFourTopLevelNodes drives a tiny but realistic +// 10-K-flavoured fixture and asserts the builder lands four +// top-level nodes (Business / Risk Factors / MD&A / Financial +// Statements). The fixture matches the example used in the PR +// reporting section. +func TestSynthetic10KFourTopLevelNodes(t *testing.T) { + llm := &scriptedLLM{} + llm.route = func(prompt string) string { + if strings.Contains(prompt, "table of contents provided in the given text") { + if strings.Contains(prompt, "TABLE OF CONTENTS") { + return `{"toc_detected":"yes"}` + } + return `{"toc_detected":"no"}` + } + if strings.Contains(prompt, "hierarchical tree structure") { + return `{"nodes":[ + {"structure":"1","title":"Item 1. 
Business","physical_index":""}, + {"structure":"2","title":"Item 1A. Risk Factors","physical_index":""}, + {"structure":"3","title":"Item 7. MD&A","physical_index":""}, + {"structure":"4","title":"Item 8. Financial Statements","physical_index":""} + ]}` + } + if strings.Contains(prompt, "section starts at the beginning") { + return `{"start_begin":"yes"}` + } + return `{"toc_detected":"no"}` + } + + pages := []PageText{ + {PageNumber: 1, Text: "Cover Page\nForm 10-K\n"}, + {PageNumber: 2, Text: "TABLE OF CONTENTS\nItem 1. Business ... 5\nItem 1A. Risk Factors ... 15\nItem 7. MD&A ... 40\nItem 8. Financial Statements ... 60"}, + {PageNumber: 5, Text: "Item 1. Business\nWe operate ..."}, + {PageNumber: 15, Text: "Item 1A. Risk Factors\nRisks include ..."}, + {PageNumber: 40, Text: "Item 7. MD&A\nDiscussion of operations."}, + {PageNumber: 60, Text: "Item 8. Financial Statements\nBalance sheet ..."}, + } + + b := &TOCBuilder{LLM: llm, TOCCheckPages: 10, Concurrency: 4} + nodes, _, err := b.Build(context.Background(), pages) + if err != nil { + t.Fatalf("Build: %v", err) + } + if len(nodes) != 4 { + t.Fatalf("synthetic 10-K should yield 4 top-level nodes, got %d: %+v", len(nodes), nodes) + } + wantTitles := []string{"Item 1. Business", "Item 1A. Risk Factors", "Item 7. MD&A", "Item 8. Financial Statements"} + for i, want := range wantTitles { + if nodes[i].Title != want { + t.Errorf("nodes[%d].Title = %q, want %q", i, nodes[i].Title, want) + } + } +} diff --git a/pkg/tree/tree.go b/pkg/tree/tree.go index edf36ba..9df9f74 100644 --- a/pkg/tree/tree.go +++ b/pkg/tree/tree.go @@ -110,6 +110,59 @@ func (s *Section) IsLeaf() bool { return len(s.Children) == 0 } +// TOCNode is one node in the LLM-built table-of-contents tree +// persisted on Document.toc_tree. Distinct from Section because +// it represents the document's logical outline (headings the LLM +// recovered or invented from body text) rather than the parser's +// chunked content tree. 
// Used by the PageIndex-style retrieval
// strategy that reasons over the TOC before drilling into sections.
//
// Structure carries the PageIndex-style hierarchical index ("1",
// "1.1", "1.1.2"). Title is the original heading verbatim (spacing
// fixed). StartPage is 1-indexed and refers to the source PDF's
// physical page. EndPage is derived from the next sibling's
// StartPage at build time (when known); zero means "unknown / open"
// and downstream readers should treat the node as running until
// either the next sibling at the same depth or the document end.
//
// The shape mirrors PageIndex's tree-output JSON (start_page /
// end_page / nodes) so external tooling that expects that
// vocabulary can interop without translation.
//
// JSON encoding, as fixed by the struct tags: node_id, structure,
// title and start_page are always emitted, even when zero-valued;
// end_page, summary and nodes carry omitempty and are dropped when
// zero or empty.
type TOCNode struct {
	// NodeID is a stable identifier for this TOC node within its
	// owning document. Generated by the builder; opaque to clients.
	NodeID string `json:"node_id"`

	// Structure is the dotted hierarchical index ("1", "1.1",
	// "1.1.2"). Empty for roots that the builder couldn't number;
	// the key is still serialized in that case (no omitempty).
	Structure string `json:"structure"`

	// Title is the section's heading text. Always populated.
	Title string `json:"title"`

	// StartPage is the 1-indexed PDF page where this section
	// begins. The verification phase checks that the title
	// actually appears at the start of this page; mismatches are
	// repaired before persistence.
	StartPage int `json:"start_page"`

	// EndPage is the 1-indexed inclusive end page derived from
	// sibling ordering. Zero means "unknown / open" and should be
	// interpreted as running to the next sibling's StartPage - 1
	// (or document end). A zero value is omitted from the JSON, so
	// a missing end_page key carries the same "unknown / open"
	// meaning.
	EndPage int `json:"end_page,omitempty"`

	// Summary is an optional one-line description of the
	// subsection's content. Populated only when the builder runs
	// with summary-generation enabled (a follow-up PR; left blank
	// here so the JSON shape is forward-compatible).
	Summary string `json:"summary,omitempty"`

	// Nodes is the recursive list of child TOC nodes in document
	// order. Omitted from the JSON when the node has no children.
	Nodes []TOCNode `json:"nodes,omitempty"`
}

// Walk visits every section in depth-first, pre-order. Traversal stops if
// visit returns false.
func (s *Section) Walk(visit func(*Section) bool) {