diff --git a/cmd/server/main.go b/cmd/server/main.go
index 83714db..4c9b284 100644
--- a/cmd/server/main.go
+++ b/cmd/server/main.go
@@ -169,6 +169,10 @@ func run() error {
 		SummaryAxesMaxTopics:   cfg.Engine.Ingest.SummaryAxes.MaxTopics,
 		SummaryAxesMaxEntities: cfg.Engine.Ingest.SummaryAxes.MaxEntities,
 		SummaryAxesMaxNumbers:  cfg.Engine.Ingest.SummaryAxes.MaxNumbers,
+		TOCEnabled:             cfg.Engine.Ingest.TOC.Enabled,
+		TOCModel:               cfg.Engine.Ingest.TOC.Model,
+		TOCConcurrency:         cfg.Engine.Ingest.TOC.Concurrency,
+		TOCCheckPages:          cfg.Engine.Ingest.TOC.TOCCheckPages,
 		GlobalLLMConcurrency:   cfg.Engine.Ingest.GlobalLLMConcurrency,
 	})
 	if cfg.Engine.Ingest.Tables.Enabled {
diff --git a/config.example.yaml b/config.example.yaml
index 7de0368..0ab32e2 100644
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -297,6 +297,32 @@ ingest:
     max_entities: 8
     max_numbers: 6
 
+  # LLM-built table-of-contents tree (PageIndex-style). Runs after
+  # summarize+HyDE on PDF inputs and persists a hierarchical TOC on
+  # documents.toc_tree (JSONB). The tree is small (a few KB even
+  # for 300-page filings) and is intended as a higher-level map
+  # retrieval strategies can reason over before drilling into the
+  # parser-derived sections tree.
+  #
+  # ENABLED BY DEFAULT for PDFs. Non-PDF documents skip the stage
+  # unconditionally. Builder failures are non-fatal — the document
+  # remains fully retrievable via the existing sections tree.
+  toc:
+    enabled: true
+    # Override the LLM model used by the builder; empty inherits
+    # the summary model. Point this at a reasoning-capable model —
+    # the no-TOC generator has to find hierarchy in raw body text,
+    # which a small/fast model often botches.
+    model: ""
+    # Cap on parallel LLM calls during the verification phase
+    # (one call per leaf node).
+    concurrency: 4
+    # The detector scans the first N pages for a table of
+    # contents. PageIndex defaults this to 20 — financial filings
+    # put their TOC inside the first dozen pages and a document
+    # without one by page 20 almost never has one further in.
+    toc_check_pages: 20
+
 log:
   level: "info"            # debug | info | warn | error
   format: "json"           # json | console
diff --git a/pkg/config/config.go b/pkg/config/config.go
index d4f2723..c5fa0e4 100644
--- a/pkg/config/config.go
+++ b/pkg/config/config.go
@@ -48,6 +48,13 @@ type IngestConfig struct {
 	// populate it).
 	SummaryAxes SummaryAxesBlock `yaml:"summary_axes"`
 
+	// TOC configures the PageIndex-style LLM-built table-of-contents
+	// tree stage. Enabled by default for PDF inputs; the resulting
+	// tree is persisted on documents.toc_tree (JSONB). Failures are
+	// non-fatal — they leave the column NULL and the document fully
+	// retrievable via the existing sections tree.
+	TOC TOCBlock `yaml:"toc"`
+
 	// GlobalLLMConcurrency caps the total number of LLM calls in flight
 	// across the summarize and HyDE stages combined, which now run
 	// concurrently. Each stage still respects its own per-stage cap
@@ -60,6 +67,39 @@ type IngestConfig struct {
 	GlobalLLMConcurrency int `yaml:"global_llm_concurrency"`
 }
 
+// TOCBlock configures the LLM-driven table-of-contents tree
+// builder. The builder reads page-by-page text from a freshly-
+// ingested PDF and emits a hierarchical TOC (PageIndex-style),
+// persisted on documents.toc_tree (JSONB).
+//
+// Enabled by default for PDF inputs; non-PDF documents skip the
+// stage unconditionally. Builder failures never break ingest —
+// the document remains fully retrievable via the existing
+// sections tree.
+type TOCBlock struct {
+	// Enabled toggles the stage. Default: true. Flip to false to
+	// skip the extra LLM calls when ingest budget matters
+	// more than having a TOC tree for retrieval to reason over.
+	Enabled bool `yaml:"enabled"`
+
+	// Model overrides the LLM model used by the builder. Empty
+	// inherits the summarize stage's model. Point this at a
+	// reasoning-capable model — the no-TOC generator has to find
+	// hierarchy in raw body text, which a small/fast model often
+	// botches.
+	Model string `yaml:"model"`
+
+	// Concurrency caps parallel LLM calls during the verification
+	// phase (one call per leaf node). Default: 4 (0 falls back to 4).
+	Concurrency int `yaml:"concurrency"`
+
+	// TOCCheckPages bounds the leading prefix the detector scans
+	// for a table of contents. Default: 20 — financial filings
+	// put their TOC inside the first dozen pages and a document
+	// without one by page 20 almost never has one further in.
+	TOCCheckPages int `yaml:"toc_check_pages"`
+}
+
 // SummaryAxesBlock configures the Phase 2.5 structured summarizer.
 //
 // When enabled, the summarize stage runs in JSON mode and produces
@@ -584,6 +624,11 @@ func Default() Config {
 				MaxEntities: 8,
 				MaxNumbers:  6,
 			},
+			TOC: TOCBlock{
+				Enabled:       true,
+				Concurrency:   4,
+				TOCCheckPages: 20,
+			},
 		},
 		Log: LogConfig{Level: "info", Format: "json"},
 	}
@@ -767,6 +812,30 @@ func applyEnvOverrides(c *Config) {
 			c.Ingest.SummaryAxes.MaxNumbers = n
 		}
 	}
+	// LLM-built TOC tree (PageIndex-style). Same truthy-string set
+	// as the other ingest toggles; numeric overrides require a
+	// positive int so a typo doesn't silently flip the default.
+	if v := os.Getenv("VLE_INGEST_TOC_ENABLED"); v != "" {
+		switch strings.ToLower(strings.TrimSpace(v)) {
+		case "1", "true", "yes", "on":
+			c.Ingest.TOC.Enabled = true
+		case "0", "false", "no", "off":
+			c.Ingest.TOC.Enabled = false
+		}
+	}
+	if v := os.Getenv("VLE_INGEST_TOC_MODEL"); v != "" {
+		c.Ingest.TOC.Model = v
+	}
+	if v := os.Getenv("VLE_INGEST_TOC_CONCURRENCY"); v != "" {
+		if n, err := strconv.Atoi(v); err == nil && n > 0 {
+			c.Ingest.TOC.Concurrency = n
+		}
+	}
+	if v := os.Getenv("VLE_INGEST_TOC_TOC_CHECK_PAGES"); v != "" {
+		if n, err := strconv.Atoi(v); err == nil && n > 0 {
+			c.Ingest.TOC.TOCCheckPages = n
+		}
+	}
 	if v := os.Getenv("VLE_RETRIEVAL_ANSWER_SPAN_ENABLED"); v != "" {
 		switch strings.ToLower(strings.TrimSpace(v)) {
 		case "1", "true", "yes", "on":
@@ -978,6 +1047,13 @@ func (c Config) Validate() error {
 		return fmt.Errorf("ingest.summary_axes.max_numbers must be >= 0, got %d", c.Ingest.SummaryAxes.MaxNumbers)
 	}
 
+	if c.Ingest.TOC.Concurrency < 0 {
+		return fmt.Errorf("ingest.toc.concurrency must be >= 0, got %d", c.Ingest.TOC.Concurrency)
+	}
+	if c.Ingest.TOC.TOCCheckPages < 0 {
+		return fmt.Errorf("ingest.toc.toc_check_pages must be >= 0, got %d", c.Ingest.TOC.TOCCheckPages)
+	}
+
 	if c.Retrieval.Planning.CacheSize < 0 {
 		return fmt.Errorf("retrieval.planning.cache_size must be >= 0, got %d", c.Retrieval.Planning.CacheSize)
 	}
diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go
index e4ba3a6..0936e1f 100644
--- a/pkg/config/config_test.go
+++ b/pkg/config/config_test.go
@@ -73,6 +73,55 @@ func TestDefaultValues(t *testing.T) {
 	if cfg.Log.Level != "info" {
 		t.Errorf("log.level = %q, want info", cfg.Log.Level)
 	}
+	if !cfg.Ingest.TOC.Enabled {
+		t.Error("ingest.toc.enabled should default to true (opt-out)")
+	}
+	if cfg.Ingest.TOC.Concurrency != 4 {
+		t.Errorf("ingest.toc.concurrency = %d, want 4", cfg.Ingest.TOC.Concurrency)
+	}
+	if cfg.Ingest.TOC.TOCCheckPages != 20 {
+		t.Errorf("ingest.toc.toc_check_pages = %d, want 20", cfg.Ingest.TOC.TOCCheckPages)
+	}
+}
+
+func TestTOCEnvOverride(t *testing.T) {
+	// Mutates env; prior values are restored on exit (unset keys come back as ""). Not parallel.
+	keys := []string{
+		"VLE_INGEST_TOC_ENABLED",
+		"VLE_INGEST_TOC_MODEL",
+		"VLE_INGEST_TOC_CONCURRENCY",
+		"VLE_INGEST_TOC_TOC_CHECK_PAGES",
+	}
+	prev := make(map[string]string, len(keys))
+	for _, k := range keys {
+		prev[k] = os.Getenv(k)
+	}
+	defer func() {
+		for k, v := range prev {
+			os.Setenv(k, v)
+		}
+	}()
+
+	os.Setenv("VLE_INGEST_TOC_ENABLED", "false")
+	os.Setenv("VLE_INGEST_TOC_MODEL", "gemini-2.5-pro")
+	os.Setenv("VLE_INGEST_TOC_CONCURRENCY", "12")
+	os.Setenv("VLE_INGEST_TOC_TOC_CHECK_PAGES", "30")
+
+	cfg := Default()
+	applyEnvOverrides(&cfg)
+
+	if cfg.Ingest.TOC.Enabled {
+		t.Error("VLE_INGEST_TOC_ENABLED=false should disable the stage")
+	}
+	if cfg.Ingest.TOC.Model != "gemini-2.5-pro" {
+		t.Errorf("VLE_INGEST_TOC_MODEL not applied, got %q", cfg.Ingest.TOC.Model)
+	}
+	if cfg.Ingest.TOC.Concurrency != 12 {
+		t.Errorf("VLE_INGEST_TOC_CONCURRENCY=12 not applied, got %d", cfg.Ingest.TOC.Concurrency)
+	}
+	if cfg.Ingest.TOC.TOCCheckPages != 30 {
+		t.Errorf("VLE_INGEST_TOC_TOC_CHECK_PAGES=30 not applied, got %d", cfg.Ingest.TOC.TOCCheckPages)
+	}
 }
 
 func TestAbstainEnvOverride(t *testing.T) {
diff --git a/pkg/db/documents.go b/pkg/db/documents.go
index 68719da..67cbea2 100644
--- a/pkg/db/documents.go
+++ b/pkg/db/documents.go
@@ -43,6 +43,18 @@ type Document struct {
 	Metadata     map[string]string
 	CreatedAt    time.Time
 	UpdatedAt    time.Time
+
+	// TOCTree is the JSONB blob persisted by the ingest pipeline's
+	// LLM-driven TOC builder ([]tree.TOCNode marshalled). nil
+	// (NULL in DB) means "not yet generated" — the expected state
+	// for non-PDF documents, for documents ingested before the
+	// 0006 migration, and when the builder failed (builder
+	// failures are non-fatal and leave this column NULL).
+	//
+	// Held as raw bytes so reads skip decoding. JSONB normalizes
+	// whitespace and key order, so bytes read back may differ from
+	// those written. Typed consumers unmarshal at read time.
+	TOCTree []byte
 }
 
 // NewDocument inserts a fresh document row in the "pending" state.
@@ -83,7 +95,7 @@ func (p *Pool) GetDocument(ctx context.Context, id tree.DocumentID, orgID, store
 	}
 	q := `
         SELECT id, org_id, store_id, title, content_type, source_ref, status, error_message,
-               byte_size, metadata, created_at, updated_at
+               byte_size, metadata, created_at, updated_at, toc_tree
         FROM documents WHERE id = $1 AND org_id = $2`
 	args := []any{string(id), orgID}
 	if storeID != "" {
@@ -94,13 +106,14 @@ func (p *Pool) GetDocument(ctx context.Context, id tree.DocumentID, orgID, store
 
 	var d Document
 	var status string
-	var rawMeta []byte
+	var rawMeta, rawTOC []byte
 	if err := row.Scan(&d.ID, &d.OrgID, &d.StoreID, &d.Title, &d.ContentType, &d.SourceRef, &status,
-		&d.ErrorMessage, &d.ByteSize, &rawMeta, &d.CreatedAt, &d.UpdatedAt); err != nil {
+		&d.ErrorMessage, &d.ByteSize, &rawMeta, &d.CreatedAt, &d.UpdatedAt, &rawTOC); err != nil {
 		return nil, mapErr(err)
 	}
 	d.Status = DocumentStatus(status)
 	d.Metadata = unmarshalMeta(rawMeta)
+	d.TOCTree = rawTOC
 	return &d, nil
 }
 
@@ -111,18 +124,19 @@ func (p *Pool) GetDocument(ctx context.Context, id tree.DocumentID, orgID, store
 func (p *Pool) GetDocumentForWorker(ctx context.Context, id tree.DocumentID) (*Document, error) {
 	row := p.QueryRow(ctx, `
         SELECT id, org_id, store_id, title, content_type, source_ref, status, error_message,
-               byte_size, metadata, created_at, updated_at
+               byte_size, metadata, created_at, updated_at, toc_tree
         FROM documents WHERE id = $1`, string(id))
 
 	var d Document
 	var status string
-	var rawMeta []byte
+	var rawMeta, rawTOC []byte
 	if err := row.Scan(&d.ID, &d.OrgID, &d.StoreID, &d.Title, &d.ContentType, &d.SourceRef, &status,
-		&d.ErrorMessage, &d.ByteSize, &rawMeta, &d.CreatedAt, &d.UpdatedAt); err != nil {
+		&d.ErrorMessage, &d.ByteSize, &rawMeta, &d.CreatedAt, &d.UpdatedAt, &rawTOC); err != nil {
 		return nil, mapErr(err)
 	}
 	d.Status = DocumentStatus(status)
 	d.Metadata = unmarshalMeta(rawMeta)
+	d.TOCTree = rawTOC
 	return &d, nil
 }
 
@@ -143,6 +157,24 @@ func (p *Pool) SetDocumentTitle(ctx context.Context, id tree.DocumentID, title s
 	return mapErr(err)
 }
 
+// UpdateDocumentTOCTree persists the LLM-built table-of-contents
+// tree onto the documents.toc_tree column. treeJSON is the already
+// JSON-marshalled []tree.TOCNode; pass a nil or empty slice to
+// clear (writes SQL NULL — the "not yet generated" state). Mirrors
+// UpdateSectionSummaryAxes so the column can be patched
+// independently of the rest of the document row.
+func (p *Pool) UpdateDocumentTOCTree(ctx context.Context, id tree.DocumentID, treeJSON []byte) error {
+	var arg any
+	if len(treeJSON) > 0 {
+		arg = treeJSON
+	}
+	_, err := p.Exec(ctx, `
+        UPDATE documents
+        SET toc_tree = $2, updated_at = now()
+        WHERE id = $1`, string(id), arg)
+	return mapErr(err)
+}
+
 // ListDocumentsOpts controls pagination + filtering for ListDocuments.
 type ListDocumentsOpts struct {
 	// OrgID restricts the listing to a single tenant. Required.
@@ -197,7 +229,7 @@ func (p *Pool) ListDocuments(ctx context.Context, o ListDocumentsOpts) ([]Docume
 
 	q := `
         SELECT id, org_id, store_id, title, content_type, source_ref, status, error_message,
-               byte_size, metadata, created_at, updated_at
+               byte_size, metadata, created_at, updated_at, toc_tree
         FROM documents ` + where + `
         ORDER BY created_at DESC
         LIMIT $` + itoa(next)
@@ -212,13 +244,14 @@ func (p *Pool) ListDocuments(ctx context.Context, o ListDocumentsOpts) ([]Docume
 	for rows.Next() {
 		var d Document
 		var status string
-		var rawMeta []byte
+		var rawMeta, rawTOC []byte
 		if err := rows.Scan(&d.ID, &d.OrgID, &d.StoreID, &d.Title, &d.ContentType, &d.SourceRef, &status,
-			&d.ErrorMessage, &d.ByteSize, &rawMeta, &d.CreatedAt, &d.UpdatedAt); err != nil {
+			&d.ErrorMessage, &d.ByteSize, &rawMeta, &d.CreatedAt, &d.UpdatedAt, &rawTOC); err != nil {
 			return nil, time.Time{}, err
 		}
 		d.Status = DocumentStatus(status)
 		d.Metadata = unmarshalMeta(rawMeta)
+		d.TOCTree = rawTOC
 		out = append(out, d)
 	}
 	if err := rows.Err(); err != nil {
diff --git a/pkg/db/documents_marshal_test.go b/pkg/db/documents_marshal_test.go
new file mode 100644
index 0000000..0260a77
--- /dev/null
+++ b/pkg/db/documents_marshal_test.go
@@ -0,0 +1,111 @@
+package db
+
+import (
+	"bytes"
+	"encoding/json"
+	"testing"
+
+	"github.com/hallelx2/vectorless-engine/pkg/tree"
+)
+
+// TestTOCTreeRoundTrip confirms a []tree.TOCNode marshals to JSON
+// bytes that decode back to the same shape — the form stored in
+// Document.TOCTree. No database is involved: this is purely a
+// guard on the JSON tag contract — dropping a tag or renaming a
+// field breaks downstream consumers that depend on the stable
+// wire shape.
+func TestTOCTreeRoundTrip(t *testing.T) {
+	in := []tree.TOCNode{
+		{
+			NodeID:    "toc_1",
+			Structure: "1",
+			Title:     "Business",
+			StartPage: 1,
+			EndPage:   12,
+			Nodes: []tree.TOCNode{
+				{NodeID: "toc_1_1", Structure: "1.1", Title: "Overview", StartPage: 1, EndPage: 4},
+				{NodeID: "toc_1_2", Structure: "1.2", Title: "Strategy", StartPage: 5, EndPage: 12},
+			},
+		},
+		{
+			NodeID:    "toc_2",
+			Structure: "2",
+			Title:     "Risk Factors",
+			StartPage: 13,
+			EndPage:   38,
+		},
+	}
+
+	raw, err := json.Marshal(in)
+	if err != nil {
+		t.Fatalf("marshal: %v", err)
+	}
+
+	var out []tree.TOCNode
+	if err := json.Unmarshal(raw, &out); err != nil {
+		t.Fatalf("unmarshal: %v", err)
+	}
+	if len(out) != len(in) {
+		t.Fatalf("top-level len: got %d want %d", len(out), len(in))
+	}
+	for i := range in {
+		assertTOCNodeEq(t, &out[i], &in[i])
+	}
+
+	// Re-marshal and check the encoding is byte-stable, so a
+	// decode/encode cycle never quietly changes content. encoding/json
+	// emits struct fields in declaration order, which keeps it stable.
+	raw2, err := json.Marshal(out)
+	if err != nil {
+		t.Fatalf("re-marshal: %v", err)
+	}
+	if !bytes.Equal(raw, raw2) {
+		t.Errorf("round-trip changed bytes\n  first:  %s\n  second: %s", raw, raw2)
+	}
+}
+
+// TestTOCTreeOmitsZeroFields guards the wire contract: optional
+// fields (EndPage, Summary, Nodes) are omitted when zero, keeping the
+// persisted blob small. Checked by substring search on the raw JSON.
+func TestTOCTreeOmitsZeroFields(t *testing.T) {
+	in := []tree.TOCNode{{NodeID: "toc_x", Structure: "1", Title: "Stub", StartPage: 7}}
+	raw, err := json.Marshal(in)
+	if err != nil {
+		t.Fatalf("marshal: %v", err)
+	}
+	s := string(raw)
+	for _, banned := range []string{"end_page", "summary", "nodes"} {
+		if bytes.Contains(raw, []byte(banned)) {
+			t.Errorf("expected %q to be omitted, got %s", banned, s)
+		}
+	}
+}
+
+func assertTOCNodeEq(t *testing.T, got, want *tree.TOCNode) {
+	t.Helper() // attribute failures to the calling test line
+	if got.NodeID != want.NodeID {
+		t.Errorf("NodeID: got %q want %q", got.NodeID, want.NodeID)
+	}
+	if got.Structure != want.Structure {
+		t.Errorf("Structure: got %q want %q", got.Structure, want.Structure)
+	}
+	if got.Title != want.Title {
+		t.Errorf("Title: got %q want %q", got.Title, want.Title)
+	}
+	if got.StartPage != want.StartPage {
+		t.Errorf("StartPage: got %d want %d", got.StartPage, want.StartPage)
+	}
+	if got.EndPage != want.EndPage {
+		t.Errorf("EndPage: got %d want %d", got.EndPage, want.EndPage)
+	}
+	if got.Summary != want.Summary {
+		t.Errorf("Summary: got %q want %q", got.Summary, want.Summary)
+	}
+	if len(got.Nodes) != len(want.Nodes) {
+		t.Errorf("Nodes len: got %d want %d", len(got.Nodes), len(want.Nodes))
+		return // lengths differ; indexing got.Nodes below could go out of range
+	}
+	for i := range want.Nodes {
+		assertTOCNodeEq(t, &got.Nodes[i], &want.Nodes[i])
+	}
+}
diff --git a/pkg/db/migrations/0006_documents_toc_tree.down.sql b/pkg/db/migrations/0006_documents_toc_tree.down.sql
new file mode 100644
index 0000000..8eb4315
--- /dev/null
+++ b/pkg/db/migrations/0006_documents_toc_tree.down.sql
@@ -0,0 +1,2 @@
+ALTER TABLE documents
+    DROP COLUMN IF EXISTS toc_tree;
diff --git a/pkg/db/migrations/0006_documents_toc_tree.up.sql b/pkg/db/migrations/0006_documents_toc_tree.up.sql
new file mode 100644
index 0000000..f090819
--- /dev/null
+++ b/pkg/db/migrations/0006_documents_toc_tree.up.sql
@@ -0,0 +1,19 @@
+-- 0006_documents_toc_tree.up.sql — LLM-built table-of-contents tree.
+--
+-- PR-A of the PageIndex-style redesign. The ingest pipeline runs an
+-- LLM-driven TOC builder on PDFs (between summarize and StatusReady)
+-- and persists the result here. The tree is small (a few KB even for
+-- 300-page filings) and is read back at retrieval time by strategies
+-- that want a hierarchical map of the document independent of the
+-- parser's heading detection.
+--
+-- toc_tree
+--     JSONB blob carrying []tree.TOCNode. NULL for documents ingested
+--     before this migration, for non-PDF inputs, or when the TOC
+--     builder failed (failures are non-fatal — the document remains
+--     fully retrievable via the existing sections tree).
+--
+-- Not indexed: JSONB queries on this column aren't on the hot path.
+-- Reads load the blob inline alongside the document row.
+ALTER TABLE documents
+    ADD COLUMN IF NOT EXISTS toc_tree JSONB;
diff --git a/pkg/ingest/ingest.go b/pkg/ingest/ingest.go
index d4c174d..71d6e8d 100644
--- a/pkg/ingest/ingest.go
+++ b/pkg/ingest/ingest.go
@@ -134,6 +134,29 @@ type Pipeline struct {
 	// per-stage semaphore). Default applied by NewPipeline: 12.
 	GlobalLLMConcurrency int
 
+	// TOCEnabled toggles the LLM-built table-of-contents stage. The
+	// stage runs after summarize+HyDE on PDF inputs and persists the
+	// resulting tree on documents.toc_tree (JSONB). Failures are
+	// non-fatal — they leave the column NULL.
+	//
+	// Defaulted to true by config wiring; left as the Go zero value
+	// (false) when Pipeline is constructed directly, so unit tests
+	// with no LLM can opt out by simply not setting it.
+	TOCEnabled bool
+
+	// TOCModel overrides the LLM model used by the TOC builder.
+	// Empty inherits SummaryModel (which itself falls back to the
+	// client default).
+	TOCModel string
+
+	// TOCConcurrency caps parallel LLM calls during the TOC
+	// verification phase. Default: 4.
+	TOCConcurrency int
+
+	// TOCCheckPages bounds the leading prefix the detector scans
+	// for a table of contents. Default: 20.
+	TOCCheckPages int
+
 	// globalLLMSem is the lazily-initialized shared semaphore enforcing
 	// GlobalLLMConcurrency. nil means "no global cap" — callers fall back
 	// to per-stage limits only.
@@ -168,6 +191,12 @@ func NewPipeline(p Pipeline) *Pipeline {
 	if p.HyDEConcurrency <= 0 {
 		p.HyDEConcurrency = 4
 	}
+	if p.TOCConcurrency <= 0 {
+		p.TOCConcurrency = 4
+	}
+	if p.TOCCheckPages <= 0 {
+		p.TOCCheckPages = 20
+	}
 	// Default the global cap to a value that comfortably exceeds the
 	// sum of the two default per-stage caps (4 + 4 = 8) while leaving
 	// some headroom — but stays well below typical provider per-tenant
@@ -265,6 +294,17 @@ func (p *Pipeline) Run(ctx context.Context, pl Payload) error {
 	}
 	log.Info("ingest: summarize+hyde complete", "elapsed", time.Since(stageStart))
 
+	// LLM-built TOC tree (PageIndex-style). PDF-only because it
+	// relies on the parser's PageStart/PageEnd attribution to
+	// reconstruct per-page text. Non-fatal: a builder failure
+	// leaves documents.toc_tree NULL and the document remains
+	// fully retrievable via the sections tree above.
+	if p.TOCEnabled && pl.ContentType == "application/pdf" {
+		if err := p.runTOCBuilder(ctx, pl.DocumentID, parsed, log); err != nil {
+			log.Warn("ingest: toc-builder failed; falling back to NULL toc_tree", "err", err)
+		}
+	}
+
 	if err := p.DB.SetDocumentStatus(ctx, pl.DocumentID, db.StatusReady, ""); err != nil {
 		return err
 	}
@@ -272,6 +312,113 @@ func (p *Pipeline) Run(ctx context.Context, pl Payload) error {
 	return nil
 }
 
+// runTOCBuilder assembles per-page text from the parsed PDF, runs
+// the LLM-driven TOC builder over it, and persists the result.
+// Returns an error when the builder, the JSON marshal, or the DB
+// write fails; the caller logs and continues either way.
+//
+// An empty result (no pages or no usable nodes) is treated as
+// success and skips the DB write, leaving documents.toc_tree as it
+// was (NULL for a freshly ingested document).
+func (p *Pipeline) runTOCBuilder(ctx context.Context, docID tree.DocumentID, parsed *parser.ParsedDoc, log *slog.Logger) error {
+	pages := assemblePagesFromSections(parsed.Sections)
+	if len(pages) == 0 {
+		log.Info("ingest: toc-builder skipped; no per-page text available")
+		return nil
+	}
+	model := p.TOCModel
+	if model == "" {
+		model = p.SummaryModel
+	}
+	builder := &TOCBuilder{
+		LLM:           p.LLM,
+		Model:         model,
+		Concurrency:   p.TOCConcurrency,
+		TOCCheckPages: p.TOCCheckPages,
+	}
+	nodes, usage, err := builder.Build(ctx, pages)
+	if err != nil {
+		return err
+	}
+	log.Info("ingest: toc-builder done",
+		"top_level_nodes", len(nodes),
+		"llm_calls", usage.LLMCalls,
+		"input_tokens", usage.InputTokens,
+		"output_tokens", usage.OutputTokens,
+	)
+	if len(nodes) == 0 {
+		return nil
+	}
+	treeJSON, err := json.Marshal(nodes)
+	if err != nil {
+		return fmt.Errorf("marshal toc tree: %w", err)
+	}
+	if err := p.DB.UpdateDocumentTOCTree(ctx, docID, treeJSON); err != nil {
+		return fmt.Errorf("persist toc tree: %w", err)
+	}
+	return nil
+}
+
+// assemblePagesFromSections groups the parsed sections' text by
+// their PageStart, producing PageText entries the TOC builder can
+// reason over. Sections that span multiple pages collapse onto
+// their starting page — perfect page reconstruction would need
+// raw glyph-level coordinates the parser doesn't currently
+// surface, but the title-on-claimed-page heuristic still works
+// because section starts (where the LLM looks for titles) live
+// on PageStart.
+//
+// Sections with PageStart == 0 are skipped (the parser couldn't
+// place them) so the builder never sees ambiguous page numbers.
+func assemblePagesFromSections(secs []parser.Section) []PageText {
+	pageText := map[int]*strings.Builder{}
+	pages := []int{}
+	var walk func([]parser.Section)
+	walk = func(ss []parser.Section) {
+		for _, s := range ss {
+			if s.PageStart > 0 {
+				b, ok := pageText[s.PageStart]
+				if !ok {
+					b = &strings.Builder{}
+					pageText[s.PageStart] = b
+					pages = append(pages, s.PageStart)
+				}
+				if title := strings.TrimSpace(s.Title); title != "" {
+					if b.Len() > 0 {
+						b.WriteByte('\n')
+					}
+					b.WriteString(title)
+					b.WriteByte('\n')
+				}
+				if body := strings.TrimSpace(s.Content); body != "" {
+					b.WriteString(body)
+					b.WriteByte('\n')
+				}
+			}
+			walk(s.Children)
+		}
+	}
+	walk(secs)
+	// pages is in first-seen order; sort so output ascends by page number.
+	sortIntsAscending(pages)
+	out := make([]PageText, 0, len(pages))
+	for _, p := range pages {
+		out = append(out, PageText{PageNumber: p, Text: pageText[p].String()})
+	}
+	return out
+}
+
+// sortIntsAscending sorts a slice of ints in place, ascending.
+// Hand-rolled insertion sort (O(n²)) — acceptable because the pages
+// slice is typically a few hundred items at most.
+func sortIntsAscending(xs []int) {
+	for i := 1; i < len(xs); i++ {
+		for j := i; j > 0 && xs[j-1] > xs[j]; j-- {
+			xs[j-1], xs[j] = xs[j], xs[j-1]
+		}
+	}
+}
+
 // runParallelStages runs summarize and HyDE concurrently, returning each
 // stage's error independently so callers can log them separately. A nil
 // hydeFn skips the HyDE stage (returns nil for hydeErr).
diff --git a/pkg/ingest/toc_builder.go b/pkg/ingest/toc_builder.go
new file mode 100644
index 0000000..3466cc5
--- /dev/null
+++ b/pkg/ingest/toc_builder.go
@@ -0,0 +1,826 @@
+package ingest
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"log"
+	"strings"
+	"sync"
+
+	"golang.org/x/sync/errgroup"
+
+	"github.com/hallelx2/llmgate"
+
+	"github.com/hallelx2/vectorless-engine/pkg/tree"
+)
+
+// PageText pairs a 1-indexed PDF page number with its extracted
+// text. The TOC builder reasons over a slice of these in page order
+// — it never sees raw PDF bytes, so it works equally well over the
+// pages assemblePagesFromSections derives from parsed sections and
+// over synthetic fixtures used in tests.
+type PageText struct {
+	PageNumber int
+	Text       string
+}
+
+// TOCBuilder builds an LLM-derived table-of-contents tree for a
+// document. The shape mirrors PageIndex's three-phase pipeline:
+//
+//  1. detect    — scan the first TOCCheckPages pages and ask the LLM
+//                 whether any of them looks like a real TOC.
+//  2. extract   — if a TOC page was found, ask the LLM to parse it
+//                 into structured nodes; otherwise call the no-TOC
+//                 path that generates a TOC straight from body
+//                 text (the LLM is given the full page text tagged
+//                 with <physical_index_X> markers it copies back as
+//                 the start page).
+//  3. verify    — concurrently re-check each leaf node: does its
+//                 title actually appear at the start of the claimed
+//                 page? Mismatches are repaired by clearing the
+//                 page back to zero; downstream readers treat zero
+//                 as "open / unknown" rather than a wrong answer.
+//
+// EndPage is derived from sibling ordering once verification is
+// done. The builder is deliberately tolerant of LLM parse blips
+// (the same retry-then-degrade pattern the rest of the ingest path
+// uses) — a single bad response never fails ingest.
+type TOCBuilder struct {
+	// LLM is the provider client. Required.
+	LLM llmgate.Client
+
+	// Model overrides the client's default. Empty inherits.
+	Model string
+
+	// Concurrency caps parallel LLM calls during the verification
+	// phase. The detect + extract phases run sequentially because
+	// each page-by-page detector call is short and the no-TOC
+	// generator is one big call. Build treats values <= 0 as 4.
+	Concurrency int
+
+	// TOCCheckPages bounds the prefix the detector scans for a
+	// table of contents. PageIndex defaults this to 20 — financial
+	// filings put their TOC inside the first dozen pages and a
+	// document with no TOC by page 20 almost never has one
+	// further in. Build treats values <= 0 as 20.
+	TOCCheckPages int
+}
+
+// Usage is the cumulative LLM accounting returned by Build; each
+// response folded in via add bumps LLMCalls by one. Mirrors the
+// retrieval.Usage shape so callers can share one cost ledger.
+type Usage struct {
+	InputTokens  int
+	OutputTokens int
+	TotalTokens  int
+	CostUSD      float64
+	LLMCalls     int
+}
+
+// add folds one LLM response's usage into the running total; a nil
+// response is ignored. Does no locking — concurrent callers must sync.
+func (u *Usage) add(r *llmgate.Response) {
+	if r == nil {
+		return
+	}
+	u.InputTokens += r.Usage.InputTokens
+	u.OutputTokens += r.Usage.OutputTokens
+	u.TotalTokens += r.Usage.TotalTokens
+	u.CostUSD += r.Usage.CostUSD
+	u.LLMCalls++
+}
+
+// Build runs the three-phase pipeline on pages and returns a
+// flat-ish top-level TOC tree (children inside Nodes form the
+// nested levels). A non-nil error is returned only when the
+// extract phase reports one (intended for hard transport
+// failures) — LLM parse blips degrade to an empty result so the
+// caller's ingest job never dies on a formatting glitch.
+//
+// pages must be in page order (PageNumber strictly ascending and
+// 1-based). Build does not sort or de-duplicate.
+func (b *TOCBuilder) Build(ctx context.Context, pages []PageText) ([]tree.TOCNode, Usage, error) {
+	var usage Usage
+	if len(pages) == 0 {
+		return nil, usage, nil
+	}
+	concurrency := b.Concurrency
+	if concurrency <= 0 {
+		concurrency = 4
+	}
+	tocCheck := b.TOCCheckPages
+	if tocCheck <= 0 {
+		tocCheck = 20
+	}
+
+	// Phase 1: detect. Scan the leading pages for a TOC.
+	tocPages := b.detectTOCPages(ctx, pages, tocCheck, &usage)
+
+	// Phase 2: extract.
+	var nodes []tree.TOCNode
+	var err error
+	if len(tocPages) > 0 {
+		nodes, err = b.extractFromTOCPages(ctx, pages, tocPages, &usage)
+	} else {
+		nodes, err = b.generateNoTOC(ctx, pages, &usage)
+	}
+	if err != nil {
+		return nil, usage, err
+	}
+	if len(nodes) == 0 {
+		return nil, usage, nil
+	}
+
+	// Phase 3: verify each leaf's claimed start page actually
+	// starts the section. Mismatches clear the page (set to 0)
+	// rather than making one up — downstream treats zero as
+	// open/unknown.
+	b.verifyTitlesConcurrent(ctx, nodes, pages, concurrency, &usage)
+
+	// Derive end pages from sibling order. Done last so verified
+	// start pages drive the derivation.
+	deriveEndPages(nodes, lastPage(pages))
+
+	// Stamp stable node IDs onto every node so callers / external
+	// consumers have an opaque handle independent of position.
+	stampNodeIDs(nodes, "")
+
+	return nodes, usage, nil
+}
+
+// detectTOCPages runs the PageIndex-style single-page detector over
+// the first tocCheck entries of pages (blank pages are skipped) and
+// returns the page numbers, in order, the LLM judged to be TOC pages.
+//
+// A detector error or cancelled ctx stops the scan early and returns
+// whatever was found so far; with nothing found the caller moves on
+// to the no-TOC path. This matches the PageIndex contract — the
+// no-TOC generator is strictly more general than TOC extraction.
+func (b *TOCBuilder) detectTOCPages(ctx context.Context, pages []PageText, tocCheck int, usage *Usage) []int {
+	limit := tocCheck
+	if limit > len(pages) {
+		limit = len(pages)
+	}
+	var found []int
+	for i := 0; i < limit; i++ {
+		if ctx.Err() != nil {
+			return found
+		}
+		page := pages[i]
+		text := strings.TrimSpace(page.Text)
+		if text == "" {
+			continue
+		}
+		isTOC, err := b.runTOCDetector(ctx, text, usage)
+		if err != nil {
+			// Detector error — abandon the scan and return what was
+			// found so far (the no-TOC path if that is nothing).
+			return found
+		}
+		if isTOC {
+			found = append(found, page.PageNumber)
+		}
+	}
+	return found
+}
+
+// runTOCDetector asks the LLM whether pageText reads like a table of
+// contents (PageIndex's toc_detector_single_page). Empty or unparsable
+// replies yield (false, nil); only the retry helper's error propagates.
+func (b *TOCBuilder) runTOCDetector(ctx context.Context, pageText string, usage *Usage) (bool, error) {
+	prompt := fmt.Sprintf(`Your job is to detect if there is a table of contents provided in the given text.
+
+Given text: %s
+
+return the following JSON format:
+{
+    "thinking": "<why do you think there is a table of contents in the given text>",
+    "toc_detected": "<yes or no>"
+}
+
+Directly return the final JSON structure. Do not output anything else.
+Please note: abstract, summary, notation list, figure list, table list, etc. are not tables of contents.`, truncate(pageText, tocDetectorMaxChars))
+
+	req := llmgate.Request{
+		Model:       b.Model,
+		Temperature: 0.0,
+		MaxTokens:   400,
+		Messages: []llmgate.Message{
+			{Role: llmgate.RoleSystem, Content: tocDetectorSystemPrompt},
+			{Role: llmgate.RoleUser, Content: prompt},
+		},
+		JSONMode:   true,
+		JSONSchema: []byte(tocDetectorJSONSchema),
+	}
+	raw, err := runTOCJSONWithRetry(ctx, b.LLM, req, defaultTOCRetries, usage)
+	if err != nil {
+		return false, err
+	}
+	if raw == "" {
+		return false, nil
+	}
+	var p tocDetectorPayload
+	if err := unmarshalLenient([]byte(raw), &p); err != nil {
+		return false, nil
+	}
+	return strings.EqualFold(strings.TrimSpace(p.TOCDetected), "yes"), nil
+}
+
+// extractFromTOCPages joins the detected TOC pages and asks the
+// LLM to parse them into structured nodes. This is the path taken
+// when a TOC page was found: the LLM reproduces the structure
+// printed on the page, with each start page resolved against body
+// text tagged with <physical_index_X> markers.
+//
+// An unparseable reply yields nil nodes with a nil error; an error
+// from the LLM call itself is returned unchanged.
+func (b *TOCBuilder) extractFromTOCPages(ctx context.Context, pages []PageText, tocPages []int, usage *Usage) ([]tree.TOCNode, error) {
+	tocText := joinTOCPagesText(pages, tocPages)
+	bodyText := buildPhysicalIndexedText(pages, tocDetectorMaxChars*4)
+
+	prompt := fmt.Sprintf(`You are an expert in extracting hierarchical tree structure. Given a raw table-of-contents block and the document's body text (tagged with <physical_index_X> markers), produce the hierarchical TOC as a JSON array of nodes.
+
+For each node:
+- structure: dotted hierarchical index ("1", "1.1", "1.1.2") matching the heading depth.
+- title: the original section title, only fixing space inconsistency.
+- physical_index: the <physical_index_X> tag where the section begins. Look at the body text to resolve the page; if you cannot confidently locate it, use null.
+
+Raw table of contents:
+%s
+
+Body text (with <physical_index_X> markers):
+%s
+
+Return ONLY a JSON object: {"nodes": [{"structure": "1", "title": "...", "physical_index": "<physical_index_3>"}, ...]}. Do not output anything else.`,
+		truncate(tocText, tocExtractorMaxChars),
+		truncate(bodyText, tocExtractorMaxBody),
+	)
+
+	req := llmgate.Request{
+		Model:       b.Model,
+		Temperature: 0.0,
+		MaxTokens:   4096,
+		Messages: []llmgate.Message{
+			{Role: llmgate.RoleSystem, Content: tocExtractorSystemPrompt},
+			{Role: llmgate.RoleUser, Content: prompt},
+		},
+		JSONMode:   true,
+		JSONSchema: []byte(tocNodesJSONSchema),
+	}
+	raw, err := runTOCJSONWithRetry(ctx, b.LLM, req, defaultTOCRetries, usage)
+	if err != nil {
+		return nil, err
+	}
+	flat := parseTOCNodesPayload(raw)
+	return assembleHierarchy(flat), nil
+}
+
+// generateNoTOC is the PageIndex-style process_no_toc driver: when
+// no TOC page was found, page text wrapped in <physical_index_X>
+// markers (budgeted by noTOCMaxBody) is sent to the LLM, which emits
+// a TOC from body headings. LLM call errors are returned unchanged.
+func (b *TOCBuilder) generateNoTOC(ctx context.Context, pages []PageText, usage *Usage) ([]tree.TOCNode, error) {
+	body := buildPhysicalIndexedText(pages, noTOCMaxBody)
+	prompt := fmt.Sprintf(`You are an expert in extracting hierarchical tree structure; your task is to generate the table-of-contents tree of the document below from its body text.
+
+The structure variable is the dotted hierarchical index ("1", "1.1", "1.1.2") representing the section's position in the outline.
+
+For the title, extract the original heading verbatim; only fix space inconsistency.
+
+The text contains <physical_index_X> markers indicating the start and end of page X. For each section's physical_index, return the <physical_index_X> tag where the section starts (keep the format).
+
+Body text:
+%s
+
+Return ONLY a JSON object: {"nodes": [{"structure": "1", "title": "...", "physical_index": "<physical_index_3>"}, ...]}. Do not output anything else.`, body)
+
+	req := llmgate.Request{
+		Model:       b.Model,
+		Temperature: 0.0,
+		MaxTokens:   4096,
+		Messages: []llmgate.Message{
+			{Role: llmgate.RoleSystem, Content: tocExtractorSystemPrompt},
+			{Role: llmgate.RoleUser, Content: prompt},
+		},
+		JSONMode:   true,
+		JSONSchema: []byte(tocNodesJSONSchema),
+	}
+	raw, err := runTOCJSONWithRetry(ctx, b.LLM, req, defaultTOCRetries, usage)
+	if err != nil {
+		return nil, err
+	}
+	flat := parseTOCNodesPayload(raw)
+	return assembleHierarchy(flat), nil
+}
+
+// verifyTitlesConcurrent runs PageIndex's check_title_appearance_in_start
+// over every node whose StartPage is set, keeping at most concurrency
+// LLM calls in flight (values below 1 are treated as 1).
+//
+// A node's StartPage is reset to zero — downstream reads zero as
+// "unknown / open" — when its page is missing from pages or the
+// verifier answers "no". When the LLM call fails, or ctx is cancelled
+// before the call starts, the claimed page is kept because the LLM
+// never weighed in. Token usage of every call is added to usage.
+func (b *TOCBuilder) verifyTitlesConcurrent(ctx context.Context, nodes []tree.TOCNode, pages []PageText, concurrency int, usage *Usage) {
+	pageByNumber := indexByPage(pages)
+	flat := flattenForVerify(nodes)
+	if len(flat) == 0 {
+		return
+	}
+	if concurrency < 1 {
+		// A zero-capacity semaphore would park every worker forever
+		// and g.Wait below would never return.
+		concurrency = 1
+	}
+
+	sem := make(chan struct{}, concurrency)
+	g, gctx := errgroup.WithContext(ctx)
+	var mu sync.Mutex // serialises the writes to usage below
+
+	// rejected[i] reports that flat[i] must lose its StartPage. Each
+	// slot is written by at most one goroutine (or by this loop) and
+	// is only read after g.Wait, so no extra locking is needed.
+	rejected := make([]bool, len(flat))
+
+	for i, n := range flat {
+		i, n := i, n // per-iteration copies for pre-1.22 closures
+		if n.StartPage <= 0 {
+			continue
+		}
+		pageText, ok := pageByNumber[n.StartPage]
+		if !ok {
+			// Claimed a page we don't have — clear it.
+			rejected[i] = true
+			continue
+		}
+		g.Go(func() error {
+			select {
+			case sem <- struct{}{}:
+				defer func() { <-sem }()
+			case <-gctx.Done():
+				return nil
+			}
+			// Record into a goroutine-local Usage: handing one shared
+			// accumulator to concurrent calls would be a data race.
+			var callUse Usage
+			startsHere, err := b.runVerifyTitleAtPageStart(gctx, n.Title, pageText, &callUse)
+			mu.Lock()
+			usage.InputTokens += callUse.InputTokens
+			usage.OutputTokens += callUse.OutputTokens
+			usage.TotalTokens += callUse.TotalTokens
+			usage.CostUSD += callUse.CostUSD
+			usage.LLMCalls += callUse.LLMCalls
+			mu.Unlock()
+			// On error the LLM never weighed in, so the claimed page
+			// stays; only an explicit "no" rejects it.
+			if err == nil && !startsHere {
+				rejected[i] = true
+			}
+			return nil
+		})
+	}
+	_ = g.Wait() // workers always return nil; nothing to handle
+
+	for i, n := range flat {
+		if rejected[i] {
+			n.StartPage = 0
+		}
+	}
+}
+
+// runVerifyTitleAtPageStart mirrors PageIndex's
+// check_title_appearance_in_start: does the section named title
+// begin at the very start of pageText (capped at verifyMaxChars)?
+// It reports true on an explicit "yes" and also when the reply is
+// empty or unparseable, so a page is only rejected by a definite
+// answer other than "yes". LLM call errors are returned unchanged.
+func (b *TOCBuilder) runVerifyTitleAtPageStart(ctx context.Context, title, pageText string, usage *Usage) (bool, error) {
+	prompt := fmt.Sprintf(`You will be given a section title and a page's text.
+Your job is to check if the section starts at the beginning of the given page text.
+If there are other contents before the section title, then the section does NOT start at the beginning of the page text.
+If the section title is the first meaningful content in the page text, then the section starts at the beginning.
+
+Note: do fuzzy matching; ignore space inconsistency.
+
+Section title: %s
+Page text: %s
+
+Reply format:
+{
+    "thinking": "<why you think the section appears or starts in the page text>",
+    "start_begin": "<yes or no>"
+}
+Directly return the final JSON structure. Do not output anything else.`, title, truncate(pageText, verifyMaxChars))
+
+	req := llmgate.Request{
+		Model:       b.Model,
+		Temperature: 0.0,
+		MaxTokens:   400,
+		Messages: []llmgate.Message{
+			{Role: llmgate.RoleSystem, Content: tocVerifySystemPrompt},
+			{Role: llmgate.RoleUser, Content: prompt},
+		},
+		JSONMode:   true,
+		JSONSchema: []byte(tocVerifyJSONSchema),
+	}
+	raw, err := runTOCJSONWithRetry(ctx, b.LLM, req, defaultTOCRetries, usage)
+	if err != nil {
+		return false, err
+	}
+	var p tocVerifyPayload
+	if raw == "" || unmarshalLenient([]byte(raw), &p) != nil {
+		// Empty (e.g. stub LLM) or unparseable reply — the LLM had no
+		// clear say, so keep the page and trust the extractor.
+		return true, nil
+	}
+	return strings.EqualFold(strings.TrimSpace(p.StartBegin), "yes"), nil
+}
+
+// --- prompt + schema constants ---
+
+const (
+	tocDetectorSystemPrompt  = "You are a precise document-structure analyser. Decide whether a single page of text is a table of contents."
+	tocExtractorSystemPrompt = "You are an expert in extracting hierarchical tree structures from documents. You output strict JSON only."
+	tocVerifySystemPrompt    = "You are a precise verifier. Decide whether a section title starts a page's text."
+	defaultTOCRetries        = 2     // extra attempts after the first call
+	tocDetectorMaxChars      = 12000 // detector prompt: page text cap, in bytes
+	tocExtractorMaxChars     = 16000 // extractor prompt: TOC text cap, in bytes
+	tocExtractorMaxBody      = 60000 // extractor prompt: body text cap, in bytes
+	noTOCMaxBody             = 80000 // no-TOC prompt: body text budget, in bytes
+	verifyMaxChars           = 4000  // verifier prompt: page text cap, in bytes
+	tocDetectorJSONSchema    = `{"type":"object","properties":{"thinking":{"type":"string"},"toc_detected":{"type":"string"}},"required":["toc_detected"]}`
+	tocVerifyJSONSchema      = `{"type":"object","properties":{"thinking":{"type":"string"},"start_begin":{"type":"string"}},"required":["start_begin"]}`
+	tocNodesJSONSchema       = `{"type":"object","properties":{"nodes":{"type":"array","items":{"type":"object","properties":{"structure":{"type":"string"},"title":{"type":"string"},"physical_index":{"type":["string","null"]}},"required":["title"]}}},"required":["nodes"]}`
+)
+
+// --- JSON payload types ---
+
+type tocDetectorPayload struct {
+	Thinking    string `json:"thinking"`     // model's stated reasoning
+	TOCDetected string `json:"toc_detected"` // "yes" (any case) means the page is a TOC
+}
+
+type tocVerifyPayload struct {
+	Thinking   string `json:"thinking"`    // model's stated reasoning
+	StartBegin string `json:"start_begin"` // "yes" (any case) means the title opens the page
+}
+
+type tocNodePayload struct {
+	Structure     string  `json:"structure"`      // dotted outline index, e.g. "1.2"
+	Title         string  `json:"title"`          // section heading; blank entries are dropped
+	PhysicalIndex *string `json:"physical_index"` // "<physical_index_X>" tag, nil when JSON null
+}
+
+type tocNodesPayload struct {
+	Nodes []tocNodePayload `json:"nodes"` // flat node list, later nested by assembleHierarchy
+}
+
+// --- shared helpers ---
+
+// runTOCJSONWithRetry runs a JSON-mode TOC LLM call, re-issuing it up
+// to maxRetries additional times (negative means none) while the reply
+// does not look like JSON. Retries append a "JSON only" reminder to
+// the last message. Mirrors the runSelectionWithRetry contract from
+// pkg/retrieval/single_pass.go — copied rather than imported so the
+// TOC builder stays importable without dragging retrieval in.
+//
+// Returns the first JSON-looking reply, or the last reply seen once
+// attempts run out; the caller decodes and treats a parse failure as
+// "no usable response". llmgate.ErrNotImplemented (stub LLM) yields
+// ("", nil); any other client error is returned to the caller.
+func runTOCJSONWithRetry(ctx context.Context, client llmgate.Client, baseReq llmgate.Request, maxRetries int, usage *Usage) (string, error) {
+	if maxRetries < 0 {
+		maxRetries = 0
+	}
+	var lastRaw string
+	for attempt := 0; attempt <= maxRetries; attempt++ {
+		req := baseReq
+		// Guard on len: a request without messages has no tail to
+		// amend, and indexing it would panic.
+		if attempt > 0 && len(baseReq.Messages) > 0 {
+			msgs := make([]llmgate.Message, len(baseReq.Messages))
+			copy(msgs, baseReq.Messages)
+			tail := len(msgs) - 1
+			msgs[tail] = llmgate.Message{
+				Role:    msgs[tail].Role,
+				Content: msgs[tail].Content + "\n\nIMPORTANT: respond with ONLY a JSON object matching the schema. No prose, no markdown fences.",
+			}
+			req.Messages = msgs
+		}
+		resp, err := client.Complete(ctx, req)
+		if err != nil {
+			// A stub LLM is a soft failure: report "no response".
+			if errors.Is(err, llmgate.ErrNotImplemented) {
+				return "", nil
+			}
+			return "", err
+		}
+		usage.add(resp)
+		lastRaw = resp.Content
+		if looksLikeJSON(resp.Content) {
+			return resp.Content, nil
+		}
+	}
+	log.Printf("toc-builder: response did not parse after %d attempts; degrading to empty", maxRetries+1)
+	return lastRaw, nil
+}
+
+// looksLikeJSON is the retry loop's cheap "is this even JSON?"
+// probe: after trimming whitespace and an opening code fence
+// (``` or ```json) it reports whether the text begins with '{' or
+// '['. It says nothing about validity — the caller's decoder has
+// the final word — it only screens out obvious prose replies.
+func looksLikeJSON(s string) bool {
+	body := strings.TrimSpace(s)
+	if strings.HasPrefix(body, "```") {
+		unfenced := strings.TrimPrefix(strings.TrimPrefix(body, "```json"), "```")
+		body = strings.TrimSpace(unfenced)
+	}
+	if body == "" {
+		return false
+	}
+	first := body[0]
+	return first == '{' || first == '['
+}
+
+// unmarshalLenient decodes a JSON object out of an LLM reply that may
+// be fenced or wrapped in prose: a leading fence line and trailing ```
+// are dropped, then text outside the first '{' / last '}' is cut.
+// Same parser pattern as pkg/retrieval and pkg/ingest/summary_axes.go.
+func unmarshalLenient(raw []byte, dst any) error {
+	text := strings.TrimSpace(string(raw))
+	if strings.HasPrefix(text, "```") {
+		// Drop the fence's opening line (it may carry a language tag).
+		if _, rest, ok := strings.Cut(text, "\n"); ok {
+			text = rest
+		}
+		text = strings.TrimSpace(strings.TrimSuffix(text, "```"))
+	}
+	if open := strings.Index(text, "{"); open > 0 {
+		text = text[open:]
+	}
+	if end := strings.LastIndex(text, "}"); end >= 0 {
+		text = text[:end+1]
+	}
+	return json.Unmarshal([]byte(text), dst)
+}
+
+// parseTOCNodesPayload decodes the raw nodes JSON. Returns an empty
+// slice on any parse failure — the builder caller treats "no
+// usable nodes" as "leave TOC NULL" and proceeds with ingest.
+func parseTOCNodesPayload(raw string) []tocNodePayload {
+	if raw == "" {
+		return nil
+	}
+	var p tocNodesPayload
+	if err := unmarshalLenient([]byte(raw), &p); err != nil {
+		return nil
+	}
+	return p.Nodes
+}
+
+// --- shape helpers ---
+
+// joinTOCPagesText concatenates the text of the pages listed in
+// tocPages, in the order given and separated by a blank line, so
+// the LLM sees one block. Unknown or empty pages are skipped.
+func joinTOCPagesText(pages []PageText, tocPages []int) string {
+	textByPage := indexByPage(pages)
+	var joined strings.Builder
+	for _, num := range tocPages {
+		body := textByPage[num]
+		if body == "" {
+			continue
+		}
+		if joined.Len() != 0 {
+			joined.WriteString("\n\n")
+		}
+		joined.WriteString(body)
+	}
+	return joined.String()
+}
+
+// buildPhysicalIndexedText wraps each page's text between a pair of
+// <physical_index_N> tags — the literal format the LLM is asked to
+// echo back as a start page. Whole pages are appended in order until
+// the next would exceed budget bytes; budget <= 0 disables the cap.
+func buildPhysicalIndexedText(pages []PageText, budget int) string {
+	var out strings.Builder
+	for i := range pages {
+		tag := fmt.Sprintf("<physical_index_%d>", pages[i].PageNumber)
+		chunk := tag + "\n" + pages[i].Text + "\n" + tag + "\n\n"
+		if budget > 0 && out.Len()+len(chunk) > budget {
+			break
+		}
+		out.WriteString(chunk)
+	}
+	return out.String()
+}
+
+// indexByPage maps each PageNumber to its Text; on duplicates the later page wins.
+func indexByPage(pages []PageText) map[int]string {
+	byNumber := make(map[int]string, len(pages))
+	for i := range pages {
+		byNumber[pages[i].PageNumber] = pages[i].Text
+	}
+	return byNumber
+}
+
+// lastPage reports the largest PageNumber found in pages, or zero
+// when pages is empty; it is the default bound for end-page derivation.
+func lastPage(pages []PageText) int {
+	if len(pages) == 0 {
+		return 0
+	}
+	highest := pages[0].PageNumber
+	for i := 1; i < len(pages); i++ {
+		if num := pages[i].PageNumber; num > highest {
+			highest = num
+		}
+	}
+	return highest
+}
+
+// truncate caps s at max bytes and appends "…" when it cuts, dropping
+// a rune split by the cut (and other invalid UTF-8). max <= 0 means no cap.
+func truncate(s string, max int) string {
+	if max <= 0 || len(s) <= max {
+		return s
+	}
+	return strings.ToValidUTF8(s[:max], "") + "…"
+}
+
+// parsePhysicalIndex extracts the integer X from "<physical_index_X>"
+// (surrounding whitespace ignored). It returns 0 — "unknown" to the
+// verify phase — for any other shape or an X too large for an int.
+func parsePhysicalIndex(s string) int {
+	const prefix, suffix = "<physical_index_", ">"
+	const overflowGuard = 1 << 27 // n*10+9 still fits a 32-bit int below this
+	s = strings.TrimSpace(s)
+	if !strings.HasPrefix(s, prefix) || !strings.HasSuffix(s, suffix) {
+		return 0
+	}
+	mid := s[len(prefix) : len(s)-len(suffix)]
+	n := 0
+	for _, r := range mid {
+		if r < '0' || r > '9' || n >= overflowGuard {
+			return 0
+		}
+		n = n*10 + int(r-'0')
+	}
+	return n
+}
+
+// assembleHierarchy nests a flat, document-ordered list of TOC node
+// payloads by dotted structure ("1", "1.1", "1.1.2"), dropping
+// blank-titled entries. Each node hangs under the closest preceding
+// node of smaller depth, so a skipped level ("1" then "1.1.1") yields
+// siblings, and a node with no such ancestor lands at the top level.
+func assembleHierarchy(flat []tocNodePayload) []tree.TOCNode {
+	if len(flat) == 0 {
+		return nil
+	}
+	// Materialise each payload; a missing or malformed page becomes 0.
+	nodes := make([]tree.TOCNode, 0, len(flat))
+	for _, n := range flat {
+		title := strings.TrimSpace(n.Title)
+		if title == "" {
+			continue
+		}
+		page := 0
+		if n.PhysicalIndex != nil {
+			page = parsePhysicalIndex(*n.PhysicalIndex)
+		}
+		nodes = append(nodes, tree.TOCNode{
+			Structure: strings.TrimSpace(n.Structure),
+			Title:     title,
+			StartPage: page,
+		})
+	}
+	if len(nodes) == 0 {
+		return nil
+	}
+
+	// path holds the open ancestors, shallowest first. Every ref points
+	// at the last element of a slice that is only appended to once all
+	// deeper refs are popped, so the pointers never go stale.
+	type ref struct {
+		node  *tree.TOCNode
+		depth int
+	}
+	var (
+		out  []tree.TOCNode
+		path []ref
+	)
+	for i := range nodes {
+		n := &nodes[i]
+		depth := depthOf(n.Structure)
+		if depth <= 0 {
+			depth = 1
+		}
+		// Close open nodes at this depth or deeper. Comparing recorded
+		// depths (not stack height) keeps siblings flat across a gap.
+		for len(path) > 0 && path[len(path)-1].depth >= depth {
+			path = path[:len(path)-1]
+		}
+		if len(path) == 0 {
+			out = append(out, *n)
+			path = append(path, ref{node: &out[len(out)-1], depth: depth})
+			continue
+		}
+		parent := path[len(path)-1].node
+		parent.Nodes = append(parent.Nodes, *n)
+		path = append(path, ref{node: &parent.Nodes[len(parent.Nodes)-1], depth: depth})
+	}
+	return out
+}
+
+// depthOf reports how deep a dotted structure string sits: the
+// number of dot-separated tokens ("1" → 1, "1.2" → 2), with 0 for
+// the empty string. Tokens are not validated, so malformed input
+// such as "1..2" or "a.b" still gets a depth instead of an error.
+func depthOf(structure string) int {
+	if len(structure) == 0 {
+		return 0
+	}
+	return len(strings.Split(structure, "."))
+}
+
+// flattenForVerify lists a pointer to every node of the tree in
+// depth-first pre-order (parent before its children). The pointers
+// alias the tree, so verification can rewrite StartPage in place.
+func flattenForVerify(nodes []tree.TOCNode) []*tree.TOCNode {
+	var collected []*tree.TOCNode
+	var visit func(level []tree.TOCNode)
+	visit = func(level []tree.TOCNode) {
+		for idx := range level {
+			current := &level[idx]
+			collected = append(collected, current)
+			visit(current.Nodes)
+		}
+	}
+	visit(nodes)
+	return collected
+}
+
+// deriveEndPages fills EndPage across the whole tree. A node ends on
+// the page before the next sibling that starts later; with no such
+// sibling it ends at the enclosing bound — docLastPage at the top
+// level, the parent's EndPage (when known) further down. A node
+// whose StartPage is unset does not get an EndPage assigned.
+func deriveEndPages(nodes []tree.TOCNode, docLastPage int) {
+	deriveEndPagesIn(nodes, docLastPage)
+}
+
+// deriveEndPagesIn fills EndPage for nodes and, recursively, their
+// children; ceiling is the last page the enclosing scope may reach.
+func deriveEndPagesIn(nodes []tree.TOCNode, ceiling int) {
+	for i := range nodes {
+		n := &nodes[i]
+		// The section ends just before the next sibling that starts
+		// later; siblings cleared to zero by verification never match.
+		end := 0
+		for j := i + 1; j < len(nodes); j++ {
+			if nodes[j].StartPage > n.StartPage {
+				end = nodes[j].StartPage - 1
+				break
+			}
+		}
+		// No later sibling, or one past the enclosing scope: use the
+		// ceiling so children never outrun their parent.
+		if end <= 0 || (ceiling > 0 && end > ceiling) {
+			end = ceiling
+		}
+		// An end before the start is a conflict: leave EndPage zero (unknown).
+		n.EndPage = 0
+		if n.StartPage > 0 && end >= n.StartPage {
+			n.EndPage = end
+		}
+		// Children are bounded by this node's EndPage, else by our ceiling.
+		childCeiling := n.EndPage
+		if childCeiling == 0 {
+			childCeiling = ceiling
+		}
+		deriveEndPagesIn(n.Nodes, childCeiling)
+	}
+}
+
+// stampNodeIDs gives every node a deterministic NodeID derived from
+// its dotted structure (or "n<position>" when the structure is
+// empty). Top-level IDs are "toc_<structure>"; a child ID is its
+// parent's ID plus "_<structure>", e.g. "toc_1_1.1". The same input
+// structure always produces the same IDs.
+func stampNodeIDs(nodes []tree.TOCNode, prefix string) {
+	if prefix == "" {
+		// Top level: IDs hang off the fixed "toc" root.
+		prefix = "toc"
+	}
+	for idx := range nodes {
+		label := nodes[idx].Structure
+		if label == "" {
+			label = fmt.Sprintf("n%d", idx+1)
+		}
+		nodes[idx].NodeID = prefix + "_" + label
+		stampNodeIDs(nodes[idx].Nodes, nodes[idx].NodeID)
+	}
+}
diff --git a/pkg/ingest/toc_builder_test.go b/pkg/ingest/toc_builder_test.go
new file mode 100644
index 0000000..4dfe22a
--- /dev/null
+++ b/pkg/ingest/toc_builder_test.go
@@ -0,0 +1,466 @@
+package ingest
+
+import (
+	"context"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"testing"
+
+	"github.com/hallelx2/llmgate"
+
+	"github.com/hallelx2/vectorless-engine/pkg/parser"
+	"github.com/hallelx2/vectorless-engine/pkg/tree"
+)
+
+// scriptedLLM is a minimal inline LLM mock — kept inside the ingest
+// package so it doesn't leak into the public API surface. Every
+// Complete call hands the request's last user prompt to route and
+// returns whatever it answers, so a test scripts each phase
+// (detect / extract / verify) by matching on prompt text. Narrower
+// than pkg/retrieval/retrieval_test.go's mockLLM, so tests wire
+// bespoke behaviour without dragging that fixture in.
+type scriptedLLM struct {
+	mu    sync.Mutex // guards captured
+	calls int32      // Complete invocations; updated atomically
+
+	// route returns the response for a given prompt. Tests inject
+	// behaviour here; when it is nil or returns "", Complete
+	// answers `{"toc_detected":"no"}` so unrelated test paths
+	// don't have to script every prompt.
+	route func(userPrompt string) string
+
+	// captured holds every user prompt seen, in call order;
+	// Complete appends to it while holding mu.
+	captured []string
+}
+
+func (m *scriptedLLM) Complete(_ context.Context, req llmgate.Request) (*llmgate.Response, error) {
+	atomic.AddInt32(&m.calls, 1)
+	var user string
+	for _, msg := range req.Messages {
+		if msg.Role == llmgate.RoleUser {
+			user = msg.Content
+		}
+	}
+	m.mu.Lock()
+	m.captured = append(m.captured, user)
+	m.mu.Unlock()
+
+	content := ""
+	if m.route != nil {
+		content = m.route(user)
+	}
+	if content == "" {
+		content = `{"toc_detected":"no"}`
+	}
+	return &llmgate.Response{
+		Content: content,
+		Usage:   llmgate.Usage{InputTokens: 100, OutputTokens: 50, TotalTokens: 150},
+	}, nil
+}
+
+func (m *scriptedLLM) CountTokens(_ context.Context, s string) (int, error) {
+	return len(s) / 4, nil // crude estimate: one token per 4 bytes, never fails
+}
+
+// TestBuildTOCFoundPath walks the happy path: the detector flags
+// page 2 as a TOC page, the extractor's flat reply is nested into
+// three top-level nodes, and verification keeps every start page.
+func TestBuildTOCFoundPath(t *testing.T) {
+	llm := &scriptedLLM{}
+	llm.route = func(prompt string) string {
+		switch {
+		case strings.Contains(prompt, "table of contents provided in the given text"):
+			// Detector: yes only when the page text itself contains
+			// "Table of Contents" (capitalised, unlike the prompt).
+			if strings.Contains(prompt, "Table of Contents") {
+				return `{"toc_detected":"yes"}`
+			}
+			return `{"toc_detected":"no"}`
+		case strings.Contains(prompt, "hierarchical tree structure"):
+			// Extractor: return a small 10-K outline as flat nodes.
+			return `{"nodes":[
+				{"structure":"1","title":"Business","physical_index":"<physical_index_3>"},
+				{"structure":"1.1","title":"Overview","physical_index":"<physical_index_3>"},
+				{"structure":"2","title":"Risk Factors","physical_index":"<physical_index_10>"},
+				{"structure":"3","title":"MD&A","physical_index":"<physical_index_20>"}
+			]}`
+		case strings.Contains(prompt, "section starts at the beginning"):
+			return `{"start_begin":"yes"}`
+		}
+		return `{"toc_detected":"no"}`
+	}
+
+	pages := []PageText{
+		{PageNumber: 1, Text: "Cover Page\nForm 10-K\n"},
+		{PageNumber: 2, Text: "Table of Contents\n1. Business ... 3\n1.1 Overview ... 3\n2. Risk Factors ... 10\n3. MD&A ... 20"},
+		{PageNumber: 3, Text: "Business\nWe are a company that does things."},
+		{PageNumber: 10, Text: "Risk Factors\nVarious risks apply."},
+		{PageNumber: 20, Text: "MD&A\nDiscussion of operations."},
+	}
+
+	b := &TOCBuilder{LLM: llm, TOCCheckPages: 5, Concurrency: 2}
+	nodes, usage, err := b.Build(context.Background(), pages)
+	if err != nil {
+		t.Fatalf("Build: %v", err)
+	}
+	if len(nodes) != 3 {
+		t.Fatalf("top-level nodes: got %d want 3 (Business, Risk Factors, MD&A) — got: %+v", len(nodes), nodes)
+	}
+	if nodes[0].Title != "Business" || nodes[0].StartPage != 3 {
+		t.Errorf("nodes[0]: got %+v", nodes[0])
+	}
+	if len(nodes[0].Nodes) != 1 || nodes[0].Nodes[0].Title != "Overview" {
+		t.Errorf("nodes[0].Nodes: got %+v", nodes[0].Nodes)
+	}
+	if nodes[1].Title != "Risk Factors" || nodes[1].StartPage != 10 {
+		t.Errorf("nodes[1]: got %+v", nodes[1])
+	}
+	if nodes[1].EndPage != 19 {
+		t.Errorf("nodes[1].EndPage: got %d want 19 (one before MD&A's start)", nodes[1].EndPage)
+	}
+	if nodes[2].EndPage != 20 {
+		t.Errorf("nodes[2].EndPage (last sibling): got %d want 20 (doc last page)", nodes[2].EndPage)
+	}
+	if usage.LLMCalls < 2 {
+		t.Errorf("expected at least 2 LLM calls (detector + extractor), got %d", usage.LLMCalls)
+	}
+	if usage.InputTokens == 0 {
+		t.Errorf("usage should track input tokens; got 0")
+	}
+	// NodeIDs chain the parent's ID and the node's structure.
+	if nodes[0].NodeID != "toc_1" || nodes[0].Nodes[0].NodeID != "toc_1_1.1" {
+		t.Errorf("node IDs not stamped: top=%q child=%q", nodes[0].NodeID, nodes[0].Nodes[0].NodeID)
+	}
+}
+
+// TestBuildNoTOCPath drives the generateNoTOC branch — the
+// detector replies "no" for every page, so the builder falls
+// through to the body-text TOC generator.
+func TestBuildNoTOCPath(t *testing.T) {
+	llm := &scriptedLLM{}
+	var extractorCalled atomic.Int32
+	var noTOCCalled atomic.Int32
+	llm.route = func(prompt string) string {
+		switch {
+		case strings.Contains(prompt, "table of contents provided in the given text"):
+			return `{"toc_detected":"no"}`
+		case strings.Contains(prompt, "hierarchical tree structure"):
+			// The no-TOC and extractor prompts share the same
+			// system prompt + JSON shape; the user prompt body
+			// differs. We distinguish by the "Raw table of
+			// contents" marker which only the extractor uses.
+			if strings.Contains(prompt, "Raw table of contents") {
+				extractorCalled.Add(1)
+			} else {
+				noTOCCalled.Add(1)
+			}
+			return `{"nodes":[
+				{"structure":"1","title":"Introduction","physical_index":"<physical_index_2>"},
+				{"structure":"2","title":"Methods","physical_index":"<physical_index_5>"}
+			]}`
+		case strings.Contains(prompt, "section starts at the beginning"):
+			return `{"start_begin":"yes"}`
+		}
+		return `{"toc_detected":"no"}`
+	}
+
+	pages := []PageText{
+		{PageNumber: 1, Text: "Cover page with no TOC."},
+		{PageNumber: 2, Text: "Introduction\nWe study X."},
+		{PageNumber: 5, Text: "Methods\nWe used Y."},
+	}
+
+	b := &TOCBuilder{LLM: llm, TOCCheckPages: 5, Concurrency: 2}
+	nodes, _, err := b.Build(context.Background(), pages)
+	if err != nil {
+		t.Fatalf("Build: %v", err)
+	}
+	if extractorCalled.Load() != 0 {
+		t.Errorf("extractor should NOT run when no TOC page was detected")
+	}
+	if noTOCCalled.Load() == 0 {
+		t.Errorf("no-TOC generator should have been invoked")
+	}
+	if len(nodes) != 2 || nodes[0].Title != "Introduction" || nodes[1].Title != "Methods" {
+		t.Fatalf("got nodes %+v", nodes)
+	}
+	if nodes[0].StartPage != 2 || nodes[1].StartPage != 5 {
+		t.Errorf("page numbers not lifted from <physical_index>: got %+v", nodes)
+	}
+}
+
+// TestVerificationRepairsWrongPage scripts a verifier that says
+// "no" for Bar, whose claimed page 7 does not open with its title:
+// Bar's start page must be cleared to zero while Foo keeps page 4.
+// Downstream treats zero as "open / unknown" rather than a lie.
+func TestVerificationRepairsWrongPage(t *testing.T) {
+	llm := &scriptedLLM{}
+	llm.route = func(prompt string) string {
+		switch {
+		case strings.Contains(prompt, "table of contents provided"):
+			if strings.Contains(prompt, "Table of Contents") {
+				return `{"toc_detected":"yes"}`
+			}
+			return `{"toc_detected":"no"}`
+		case strings.Contains(prompt, "hierarchical tree structure"):
+			return `{"nodes":[
+				{"structure":"1","title":"Foo","physical_index":"<physical_index_4>"},
+				{"structure":"2","title":"Bar","physical_index":"<physical_index_7>"}
+			]}`
+		case strings.Contains(prompt, "section starts at the beginning"):
+			// Only Foo's claim is confirmed; any other title gets "no".
+			if strings.Contains(prompt, "Section title: Foo") {
+				return `{"start_begin":"yes"}`
+			}
+			return `{"start_begin":"no"}`
+		}
+		return `{"toc_detected":"no"}`
+	}
+
+	pages := []PageText{
+		{PageNumber: 1, Text: "Table of Contents\nFoo ... 4\nBar ... 7"},
+		{PageNumber: 4, Text: "Foo\nbody of foo"},
+		{PageNumber: 7, Text: "Some other content, not Bar"},
+	}
+
+	b := &TOCBuilder{LLM: llm, TOCCheckPages: 5, Concurrency: 2}
+	nodes, _, err := b.Build(context.Background(), pages)
+	if err != nil {
+		t.Fatalf("Build: %v", err)
+	}
+	if len(nodes) != 2 {
+		t.Fatalf("nodes: got %d want 2", len(nodes))
+	}
+	if nodes[0].StartPage != 4 {
+		t.Errorf("verified node Foo should keep page 4, got %d", nodes[0].StartPage)
+	}
+	if nodes[1].StartPage != 0 {
+		t.Errorf("repaired node Bar should have StartPage=0 (cleared), got %d", nodes[1].StartPage)
+	}
+}
+
+// TestRetryOnBadJSON exercises the retry path: the extractor's
+// first reply is plain prose, its second is valid JSON, so the
+// builder must re-ask and still end up with the "Solo" node.
+func TestRetryOnBadJSON(t *testing.T) {
+	llm := &scriptedLLM{}
+	var extractorCalls atomic.Int32
+	llm.route = func(prompt string) string {
+		if strings.Contains(prompt, "table of contents provided") {
+			if strings.Contains(prompt, "Table of Contents") {
+				return `{"toc_detected":"yes"}`
+			}
+			return `{"toc_detected":"no"}`
+		}
+		if strings.Contains(prompt, "hierarchical tree structure") {
+			n := extractorCalls.Add(1)
+			if n == 1 {
+				// First try: plain prose, which is not JSON-looking.
+				return "Sure, here is the structure: I will explain it ..."
+			}
+			return `{"nodes":[{"structure":"1","title":"Solo","physical_index":"<physical_index_2>"}]}`
+		}
+		if strings.Contains(prompt, "section starts at the beginning") {
+			return `{"start_begin":"yes"}`
+		}
+		return `{"toc_detected":"no"}`
+	}
+
+	pages := []PageText{
+		{PageNumber: 1, Text: "Table of Contents\nSolo ... 2"},
+		{PageNumber: 2, Text: "Solo\nbody"},
+	}
+
+	b := &TOCBuilder{LLM: llm, TOCCheckPages: 5, Concurrency: 2}
+	nodes, usage, err := b.Build(context.Background(), pages)
+	if err != nil {
+		t.Fatalf("Build: %v", err)
+	}
+	if len(nodes) != 1 || nodes[0].Title != "Solo" {
+		t.Fatalf("nodes: %+v", nodes)
+	}
+	if extractorCalls.Load() < 2 {
+		t.Errorf("expected the retry loop to fire (>=2 extractor calls), got %d", extractorCalls.Load())
+	}
+	// The retry costs one LLM call on top of detector, extractor and verify.
+	if usage.LLMCalls < 4 {
+		t.Errorf("expected >=4 LLM calls (detector + extractor*2 + verify), got %d", usage.LLMCalls)
+	}
+}
+
+// TestEndPageDerivationFromSiblings checks deriveEndPages on three
+// flat siblings: each ends one page before the next sibling starts
+// and the final one runs to the document's last page.
+func TestEndPageDerivationFromSiblings(t *testing.T) {
+	root := []tree.TOCNode{
+		{Structure: "1", Title: "A", StartPage: 5},
+		{Structure: "2", Title: "B", StartPage: 12},
+		{Structure: "3", Title: "C", StartPage: 30},
+	}
+	deriveEndPages(root, 50)
+
+	// Expected EndPage per sibling, in order.
+	wantEnd := []int{11, 29, 50}
+	labels := []string{"A.EndPage", "B.EndPage", "C.EndPage (last)"}
+	for i, want := range wantEnd {
+		if got := root[i].EndPage; got != want {
+			t.Errorf("%s: got %d want %d", labels[i], got, want)
+		}
+	}
+}
+
+// TestAssembleHierarchyNestsByStructure makes sure dotted
+// structure indices group correctly: "1.1" nests under "1" and "1.1.1"
+// lands three levels deep. "1.2" and "2" are checked only via node counts.
+func TestAssembleHierarchyNestsByStructure(t *testing.T) {
+	flat := []tocNodePayload{
+		{Structure: "1", Title: "Top"},
+		{Structure: "1.1", Title: "Sub-A"},
+		{Structure: "1.1.1", Title: "Leaf-1"},
+		{Structure: "1.2", Title: "Sub-B"},
+		{Structure: "2", Title: "Sibling"},
+	}
+	out := assembleHierarchy(flat)
+	if len(out) != 2 { // expected roots: "1" and "2"
+		t.Fatalf("top-level: got %d want 2", len(out))
+	}
+	if out[0].Title != "Top" || len(out[0].Nodes) != 2 {
+		t.Fatalf("Top children: %+v", out[0].Nodes)
+	}
+	if out[0].Nodes[0].Title != "Sub-A" || len(out[0].Nodes[0].Nodes) != 1 {
+		t.Fatalf("Sub-A: %+v", out[0].Nodes[0])
+	}
+	if out[0].Nodes[0].Nodes[0].Title != "Leaf-1" {
+		t.Errorf("Leaf-1 missing under Sub-A; got %+v", out[0].Nodes[0].Nodes)
+	}
+}
+
+// TestParsePhysicalIndex covers the tag parser used by
+// assembleHierarchy. Malformed tags should return 0 (the
+// "unknown" sentinel) rather than panic.
+func TestParsePhysicalIndex(t *testing.T) {
+	cases := []struct {
+		in   string
+		want int
+	}{
+		{"<physical_index_5>", 5},
+		{"<physical_index_42>", 42},
+		{"<physical_index_>", 0}, // well-formed wrapper but no digits
+		{"not a tag", 0},
+		{"<physical_index_abc>", 0},
+		{"  <physical_index_7>  ", 7}, // surrounding whitespace must be tolerated
+	}
+	for _, c := range cases {
+		if got := parsePhysicalIndex(c.in); got != c.want {
+			t.Errorf("parsePhysicalIndex(%q): got %d want %d", c.in, got, c.want)
+		}
+	}
+}
+
+// TestBuildEmptyPages: Build on nil pages must return no error, nil nodes, and zero LLM calls.
+func TestBuildEmptyPages(t *testing.T) {
+	b := &TOCBuilder{LLM: &scriptedLLM{}} // route left unset; the test expects zero LLM calls
+	nodes, usage, err := b.Build(context.Background(), nil)
+	if err != nil {
+		t.Fatalf("Build: %v", err)
+	}
+	if nodes != nil {
+		t.Errorf("empty input should yield nil nodes, got %v", nodes)
+	}
+	if usage.LLMCalls != 0 {
+		t.Errorf("empty input should make no LLM calls, got %d", usage.LLMCalls)
+	}
+}
+
+// TestAssemblePagesFromSections covers the bridge between the
+// parser's section-tree output and the TOC builder's per-page
+// input. Sections sharing a starting page (here a parent and its
+// child, both on page 3) collapse into one PageText entry; sections with
+// PageStart==0 are skipped so the builder never sees ambiguous page numbers.
+func TestAssemblePagesFromSections(t *testing.T) {
+	secs := []parser.Section{
+		{
+			Level:     1,
+			Title:     "Business",
+			Content:   "We do business.",
+			PageStart: 3,
+			PageEnd:   8,
+			Children: []parser.Section{
+				{Level: 2, Title: "Overview", Content: "An overview.", PageStart: 3, PageEnd: 4},
+			},
+		},
+		{Level: 1, Title: "Risk Factors", Content: "Risks here.", PageStart: 10, PageEnd: 12},
+		{Level: 1, Title: "No-page section", Content: "Skipped.", PageStart: 0}, // must not surface in any page
+	}
+	pages := assemblePagesFromSections(secs)
+	if len(pages) != 2 {
+		t.Fatalf("want 2 distinct pages (3 + 10), got %d: %+v", len(pages), pages)
+	}
+	if pages[0].PageNumber != 3 || pages[1].PageNumber != 10 {
+		t.Errorf("pages out of order or wrong: %+v", pages)
+	}
+	if !strings.Contains(pages[0].Text, "Business") || !strings.Contains(pages[0].Text, "Overview") {
+		t.Errorf("page 3 missing expected titles: %q", pages[0].Text)
+	}
+	if !strings.Contains(pages[1].Text, "Risk Factors") {
+		t.Errorf("page 10 missing Risk Factors title: %q", pages[1].Text)
+	}
+	if strings.Contains(pages[0].Text, "Skipped.") || strings.Contains(pages[1].Text, "Skipped.") {
+		t.Errorf("PageStart=0 section should be skipped; got %+v", pages)
+	}
+}
+
+// TestSynthetic10KFourTopLevelNodes drives a tiny but realistic
+// 10-K-flavoured fixture and asserts the builder lands four
+// top-level nodes, in order (Business / Risk Factors / MD&A /
+// Financial Statements). The fixture matches the example used in the
+// PR reporting section.
+func TestSynthetic10KFourTopLevelNodes(t *testing.T) {
+	llm := &scriptedLLM{}
+	llm.route = func(prompt string) string {
+		if strings.Contains(prompt, "table of contents provided in the given text") {
+			if strings.Contains(prompt, "TABLE OF CONTENTS") {
+				return `{"toc_detected":"yes"}`
+			}
+			return `{"toc_detected":"no"}`
+		}
+		if strings.Contains(prompt, "hierarchical tree structure") {
+			return `{"nodes":[
+				{"structure":"1","title":"Item 1. Business","physical_index":"<physical_index_5>"},
+				{"structure":"2","title":"Item 1A. Risk Factors","physical_index":"<physical_index_15>"},
+				{"structure":"3","title":"Item 7. MD&A","physical_index":"<physical_index_40>"},
+				{"structure":"4","title":"Item 8. Financial Statements","physical_index":"<physical_index_60>"}
+			]}`
+		}
+		if strings.Contains(prompt, "section starts at the beginning") {
+			return `{"start_begin":"yes"}`
+		}
+		return `{"toc_detected":"no"}` // unrecognised prompt: fall back to "no TOC"
+	}
+
+	pages := []PageText{ // sparse fixture: cover, TOC page, and the four pages the scripted nodes point at
+		{PageNumber: 1, Text: "Cover Page\nForm 10-K\n"},
+		{PageNumber: 2, Text: "TABLE OF CONTENTS\nItem 1. Business ... 5\nItem 1A. Risk Factors ... 15\nItem 7. MD&A ... 40\nItem 8. Financial Statements ... 60"},
+		{PageNumber: 5, Text: "Item 1. Business\nWe operate ..."},
+		{PageNumber: 15, Text: "Item 1A. Risk Factors\nRisks include ..."},
+		{PageNumber: 40, Text: "Item 7. MD&A\nDiscussion of operations."},
+		{PageNumber: 60, Text: "Item 8. Financial Statements\nBalance sheet ..."},
+	}
+
+	b := &TOCBuilder{LLM: llm, TOCCheckPages: 10, Concurrency: 4}
+	nodes, _, err := b.Build(context.Background(), pages) // usage is not asserted in this test
+	if err != nil {
+		t.Fatalf("Build: %v", err)
+	}
+	if len(nodes) != 4 {
+		t.Fatalf("synthetic 10-K should yield 4 top-level nodes, got %d: %+v", len(nodes), nodes)
+	}
+	wantTitles := []string{"Item 1. Business", "Item 1A. Risk Factors", "Item 7. MD&A", "Item 8. Financial Statements"}
+	for i, want := range wantTitles {
+		if nodes[i].Title != want {
+			t.Errorf("nodes[%d].Title = %q, want %q", i, nodes[i].Title, want)
+		}
+	}
+}
diff --git a/pkg/tree/tree.go b/pkg/tree/tree.go
index edf36ba..9df9f74 100644
--- a/pkg/tree/tree.go
+++ b/pkg/tree/tree.go
@@ -110,6 +110,59 @@ func (s *Section) IsLeaf() bool {
 	return len(s.Children) == 0
 }
 
+// TOCNode is one node in the LLM-built table-of-contents tree
+// persisted on Document.toc_tree. Distinct from Section because
+// it represents the document's logical outline (headings the LLM
+// recovered or invented from body text) rather than the parser's
+// chunked content tree. Used by the PageIndex-style retrieval
+// strategy that reasons over the TOC before drilling into sections.
+//
+// Structure carries the PageIndex-style hierarchical index ("1",
+// "1.1", "1.1.2"). Title is the original heading verbatim (spacing
+// fixed). StartPage is 1-indexed and refers to the source PDF's
+// physical page. EndPage is inclusive and derived at build time as
+// the next sibling's StartPage - 1 (when known); zero means "unknown /
+// open" and downstream readers should treat the node as running until
+// either the next sibling at the same depth or the document end.
+//
+// The shape mirrors PageIndex's tree-output JSON (start_page /
+// end_page / nodes) so external tooling that expects that
+// vocabulary can interop without translation.
+type TOCNode struct {
+	// NodeID is a stable identifier for this TOC node within its
+	// owning document. Generated by the builder; opaque to clients.
+	NodeID string `json:"node_id"`
+
+	// Structure is the dotted hierarchical index ("1", "1.1",
+	// "1.1.2"). Empty for roots that the builder couldn't number.
+	Structure string `json:"structure"`
+
+	// Title is the section's heading text. Always populated.
+	Title string `json:"title"`
+
+	// StartPage is the 1-indexed PDF page where this section
+	// begins. The verification phase checks that the title
+	// actually appears at the start of this page; mismatches are
+	// repaired before persistence.
+	StartPage int `json:"start_page"`
+
+	// EndPage is the 1-indexed inclusive end page derived from
+	// sibling ordering. Zero means "unknown / open" (and is omitted
+	// from the JSON); interpret it as running to the next sibling's
+	// StartPage - 1 (or document end).
+	EndPage int `json:"end_page,omitempty"`
+
+	// Summary is an optional one-line description of the
+	// subsection's content. Populated only when the builder runs
+	// with summary-generation enabled (a follow-up PR; left blank
+	// here so the JSON shape is forward-compatible).
+	Summary string `json:"summary,omitempty"`
+
+	// Nodes is the recursive list of child TOC nodes in document
+	// order; when empty (a leaf) the key is omitted from the JSON.
+	Nodes []TOCNode `json:"nodes,omitempty"`
+}
+
 // Walk visits every section in depth-first, pre-order. Traversal stops if
 // visit returns false.
 func (s *Section) Walk(visit func(*Section) bool) {