From 249d4b21294236dff0a0a1cff4330f0d57157813 Mon Sep 17 00:00:00 2001 From: Halleluyah Oludele Date: Thu, 28 May 2026 23:40:15 +0100 Subject: [PATCH 1/3] feat(config): add ingest mode switch (full|minimal) with env forwarding Add IngestConfig.Mode (yaml `mode`, values full|minimal, default full) to the engine config, with VLE_INGEST_MODE env override and Validate rejecting unknown values. Forward it from the deployed server's config wrapper via firstEnv("VLS_INGEST_MODE", "VLE_INGEST_MODE") so the live vectorless-server can be flipped to minimal ingest with a single env var, no secret edit. --- internal/config/config.go | 6 ++++++ pkg/config/config.go | 41 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/internal/config/config.go b/internal/config/config.go index 82b8118..1646af0 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -320,6 +320,12 @@ func applyEnvOverrides(c *Config) { if v := firstEnv("VLS_LLM_DRIVER", "VLE_LLM_DRIVER"); v != "" { c.Engine.LLM.Driver = v } + // Ingest mode (full | minimal). Forwarded so the live + // vectorless-server can be flipped to minimal ingest with a single + // env var, no secret/config edit. VLS_-prefixed wins over VLE_. + if v := firstEnv("VLS_INGEST_MODE", "VLE_INGEST_MODE"); v != "" { + c.Engine.Ingest.Mode = v + } // Anthropic-compatible gateway overrides (e.g. GLM/Zhipu via // https://api.z.ai/api/anthropic): base URL + model, so the // anthropic driver can run a non-Anthropic model without a secret diff --git a/pkg/config/config.go b/pkg/config/config.go index 3add5b0..8c7c327 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -33,6 +33,35 @@ type Config struct { // IngestConfig configures retrieval-quality boosters that run during // the ingest pipeline (between summarize and StatusReady). type IngestConfig struct { + // Mode selects how much work the ingest pipeline does before a + // document is marked ready. 
+ // + // "full" (default) — parse → build tree → persist → summarize + // → HyDE → multi-axis summaries → TOC build. + // Maximises retrieval quality at the cost of + // ~1,000-3,000 LLM calls + a table-extraction + // pass on a large filing (minutes of wall time). + // + // "minimal" — parse → build tree → persist → ready. + // Skips ALL per-section LLM enrichment + // (summarize, HyDE, multi-axis, TOC build) + // AND the pdftable table-finding pass, so a + // document becomes queryable in ~parse-speed + // (seconds). The page-based retrieval strategy + // (/v1/answer/pageindex) needs none of the + // skipped enrichment: it navigates a TOC tree + // (synthesised from the section tree when + // documents.toc_tree is NULL) and reads raw + // section/page text at query time — and the raw + // page text still contains the tables' text, so + // dropping table *sections* loses nothing for + // it. The summary-dependent strategies + // (chunked-tree, agentic) degrade to using + // titles + raw content with no summaries. + // + // Empty defaults to "full". Engine env override: VLE_INGEST_MODE. + Mode string `yaml:"mode"` + HyDE HyDEConfig `yaml:"hyde"` // Tables configures pdftable's table-finding pass over PDF inputs. @@ -695,6 +724,7 @@ func Default() Config { }, }, Ingest: IngestConfig{ + Mode: "full", GlobalLLMConcurrency: 12, LLMCallTimeoutSeconds: 90, MaxSections: 400, @@ -838,6 +868,11 @@ func applyEnvOverrides(c *Config) { if v := os.Getenv("VLE_RETRIEVAL_AGENTIC_MODEL"); v != "" { c.Retrieval.Agentic.Model = v } + // Ingest mode switch (full | minimal). A single env var flips the + // engine into fast/minimal ingest with no secret edit. + if v := os.Getenv("VLE_INGEST_MODE"); v != "" { + c.Ingest.Mode = v + } // Ingest / HyDE knobs. Booleans accept the usual truthy strings — // kept narrow so a typo doesn't silently flip the flag. 
if v := os.Getenv("VLE_INGEST_HYDE_ENABLED"); v != "" { @@ -1144,6 +1179,12 @@ func (c Config) Validate() error { return fmt.Errorf("server.tls.min_version must be 1.2 or 1.3, got %q", v) } + switch c.Ingest.Mode { + case "", "full", "minimal": + default: + return fmt.Errorf("ingest.mode must be one of full|minimal, got %q", c.Ingest.Mode) + } + if c.Ingest.HyDE.NumQuestions < 0 { return fmt.Errorf("ingest.hyde.num_questions must be >= 0, got %d", c.Ingest.HyDE.NumQuestions) } From 6444532f6184ec52b65435a0679d5c75b24dfb36 Mon Sep 17 00:00:00 2001 From: Halleluyah Oludele Date: Thu, 28 May 2026 23:49:01 +0100 Subject: [PATCH 2/3] =?UTF-8?q?feat(ingest):=20minimal-mode=20pipeline=20p?= =?UTF-8?q?ath=20=E2=80=94=20parse=E2=86=92persist=E2=86=92ready,=20no=20L?= =?UTF-8?q?LM/tables?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Pipeline.Mode; when "minimal", Run dispatches to runMinimal which does parse → build tree → persist → ready and skips every per-section LLM stage (summarize, HyDE, multi-axis summaries, TOC build). The parser registry is rebuilt with table extraction DISABLED (nil opts) regardless of ingest.tables.enabled, since the pdftable table-finding pass is the slow/hang-prone part of parse and the page-based strategy reads raw page text (which still contains the table's text). persistTree and fail now take the persistence target through a narrow docPersister interface (*db.Pool satisfies it), and parse takes the parser registry as an argument, so the minimal path is exercisable without a live Postgres. Both cmd/engine and cmd/server set Mode from cfg.Ingest.Mode and log when minimal mode is active. 
--- cmd/engine/main.go | 5 +- cmd/server/main.go | 5 +- pkg/ingest/ingest.go | 116 ++++++++++++++++++++++++++++++++++++++----- 3 files changed, 112 insertions(+), 14 deletions(-) diff --git a/cmd/engine/main.go b/cmd/engine/main.go index 3f2742b..8da6abf 100644 --- a/cmd/engine/main.go +++ b/cmd/engine/main.go @@ -174,6 +174,7 @@ func run() error { LLM: llmClient, Parsers: ingest.RegistryFromTableOpts(tableOptsFromConfig(cfg.Ingest.Tables)), Logger: logger, + Mode: cfg.Ingest.Mode, HyDEEnabled: cfg.Ingest.HyDE.Enabled, HyDEModel: cfg.Ingest.HyDE.Model, HyDENumQuestions: cfg.Ingest.HyDE.NumQuestions, @@ -184,7 +185,9 @@ func run() error { SummaryAxesMaxNumbers: cfg.Ingest.SummaryAxes.MaxNumbers, GlobalLLMConcurrency: cfg.Ingest.GlobalLLMConcurrency, }) - if cfg.Ingest.Tables.Enabled { + if cfg.Ingest.Mode == ingest.ModeMinimal { + logger.Info("ingest: MINIMAL mode — parse→persist→ready; skipping summarize/HyDE/multi-axis/TOC + table extraction") + } else if cfg.Ingest.Tables.Enabled { logger.Info("ingest: pdf table extraction enabled", "vertical_strategy", cfg.Ingest.Tables.VerticalStrategy, "horizontal_strategy", cfg.Ingest.Tables.HorizontalStrategy, diff --git a/cmd/server/main.go b/cmd/server/main.go index a241eda..c14b608 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -200,6 +200,7 @@ func run() error { LLM: llmClient, Parsers: ingest.RegistryFromTableOpts(tableOptsFromConfig(cfg.Engine.Ingest.Tables)), Logger: logger, + Mode: cfg.Engine.Ingest.Mode, HyDEEnabled: cfg.Engine.Ingest.HyDE.Enabled, HyDEModel: cfg.Engine.Ingest.HyDE.Model, HyDENumQuestions: cfg.Engine.Ingest.HyDE.NumQuestions, @@ -214,7 +215,9 @@ func run() error { TOCCheckPages: cfg.Engine.Ingest.TOC.TOCCheckPages, GlobalLLMConcurrency: cfg.Engine.Ingest.GlobalLLMConcurrency, }) - if cfg.Engine.Ingest.Tables.Enabled { + if cfg.Engine.Ingest.Mode == ingest.ModeMinimal { + logger.Info("ingest: MINIMAL mode — parse→persist→ready; skipping summarize/HyDE/multi-axis/TOC + table extraction") 
+ } else if cfg.Engine.Ingest.Tables.Enabled { logger.Info("ingest: pdf table extraction enabled", "vertical_strategy", cfg.Engine.Ingest.Tables.VerticalStrategy, "horizontal_strategy", cfg.Engine.Ingest.Tables.HorizontalStrategy, diff --git a/pkg/ingest/ingest.go b/pkg/ingest/ingest.go index 2adf859..a840acc 100644 --- a/pkg/ingest/ingest.go +++ b/pkg/ingest/ingest.go @@ -45,6 +45,22 @@ import ( "github.com/hallelx2/vectorless-engine/pkg/tree" ) +// ModeMinimal is the Pipeline.Mode value that collapses ingest to +// parse → build tree → persist → ready, skipping all LLM enrichment +// and table extraction. Any other value runs the full pipeline. +const ModeMinimal = "minimal" + +// docPersister is the narrow slice of *db.Pool the parse → persist → +// ready path depends on. Declaring it here (rather than threading the +// concrete *db.Pool) lets the minimal-mode runner be exercised with a +// fake store, so the "zero LLM calls, still reaches ready" guarantee is +// provable without a live Postgres. *db.Pool satisfies it. +type docPersister interface { + SetDocumentStatus(ctx context.Context, id tree.DocumentID, s db.DocumentStatus, errMsg string) error + SetDocumentTitle(ctx context.Context, id tree.DocumentID, title string) error + UpsertSection(ctx context.Context, s db.Section) error +} + // Payload is the JSON body attached to an ingest job. type Payload struct { DocumentID tree.DocumentID `json:"document_id"` @@ -65,6 +81,19 @@ type Pipeline struct { Parsers *parser.Registry Logger *slog.Logger + // Mode selects how much work Run does before marking a document + // ready. "minimal" collapses ingest to parse → build tree → persist + // → ready, skipping every per-section LLM stage (summarize, HyDE, + // multi-axis summaries, TOC build) AND the pdftable table-finding + // pass. Anything else (including the empty Go zero value used by + // Pipeline literals in tests) runs the full enrichment pipeline. 
+ // + // The page-based retrieval strategy (/v1/answer/pageindex) needs none + // of the skipped enrichment — it navigates a synthesised-from-sections + // TOC and reads raw section/page text at query time — so a + // minimal-ingested document is immediately queryable through it. + Mode string + // SummaryMaxChars caps the content window sent to the LLM per section. // Sections longer than this are truncated — we're generating a short // summary, not reproducing the text. @@ -301,8 +330,16 @@ func (p *Pipeline) Handler() queue.Handler { } } -// Run executes the full pipeline for one document. Safe to retry. +// Run executes the pipeline for one document. Safe to retry. +// +// When Mode == ModeMinimal it dispatches to runMinimal — parse → build +// tree → persist → ready, with no LLM enrichment and no table +// extraction. Otherwise it runs the full enrichment pipeline below. func (p *Pipeline) Run(ctx context.Context, pl Payload) error { + if p.Mode == ModeMinimal { + return p.runMinimal(ctx, p.DB, pl) + } + log := p.Logger.With("document_id", string(pl.DocumentID)) log.Info("ingest: start", "source_ref", pl.SourceRef) @@ -310,15 +347,15 @@ func (p *Pipeline) Run(ctx context.Context, pl Payload) error { return err } - parsed, err := p.parse(ctx, pl) + parsed, err := p.parse(ctx, p.Parsers, pl) if err != nil { - p.fail(ctx, pl.DocumentID, "parse", err) + p.fail(ctx, p.DB, pl.DocumentID, "parse", err) return err } log.Info("ingest: parsed", "sections", len(parsed.Flatten()), "title", parsed.Title) - if err := p.persistTree(ctx, pl.DocumentID, parsed); err != nil { - p.fail(ctx, pl.DocumentID, "persist tree", err) + if err := p.persistTree(ctx, p.DB, pl.DocumentID, parsed); err != nil { + p.fail(ctx, p.DB, pl.DocumentID, "persist tree", err) return err } @@ -504,25 +541,80 @@ func runParallelStages(ctx context.Context, summarizeFn, hydeFn func(context.Con return summarizeErr, hydeErr } -func (p *Pipeline) parse(ctx context.Context, pl Payload) (*parser.ParsedDoc, 
error) { +func (p *Pipeline) parse(ctx context.Context, parsers *parser.Registry, pl Payload) (*parser.ParsedDoc, error) { rc, _, err := p.Storage.Get(ctx, pl.SourceRef) if err != nil { return nil, fmt.Errorf("fetch source: %w", err) } defer rc.Close() - return p.Parsers.Parse(ctx, pl.ContentType, pl.Filename, rc) + return parsers.Parse(ctx, pl.ContentType, pl.Filename, rc) +} + +// runMinimal is the fast/minimal ingest path: parse → build tree → +// persist → ready. It does ZERO LLM work — no summarize, no HyDE, no +// multi-axis summaries, no TOC build — and parses with table extraction +// DISABLED (the pdftable table-finding pass is the slow/hang-prone part +// of parse, and the page-based strategy reads raw page text which still +// contains the table's text, so dropping table *sections* loses nothing +// for it). +// +// The doc reaches StatusReady the moment the section tree is persisted, +// which is what "ready" means for the page-based strategy: it +// synthesises its TOC from the section tree (titles + page ranges) when +// documents.toc_tree is NULL — and minimal mode leaves it NULL — and +// reads section bodies from storage at query time. +// +// store is the persistence target; production passes p.DB. The DB seam +// is an interface so this path is testable without a live Postgres. +func (p *Pipeline) runMinimal(ctx context.Context, store docPersister, pl Payload) error { + log := p.Logger.With("document_id", string(pl.DocumentID)) + log.Info("ingest: start (minimal mode)", "source_ref", pl.SourceRef) + + if err := store.SetDocumentStatus(ctx, pl.DocumentID, db.StatusParsing, ""); err != nil { + return err + } + + // Table extraction is disabled unconditionally in minimal mode, + // regardless of ingest.tables.enabled: a nil-opts registry makes the + // PDF parser skip the table-finding pass entirely. All other parsers + // are unaffected. 
+ parsers := RegistryFromTableOpts(nil) + parsed, err := p.parse(ctx, parsers, pl) + if err != nil { + p.fail(ctx, store, pl.DocumentID, "parse", err) + return err + } + log.Info("ingest: parsed", "sections", len(parsed.Flatten()), "title", parsed.Title) + + if err := p.persistTree(ctx, store, pl.DocumentID, parsed); err != nil { + p.fail(ctx, store, pl.DocumentID, "persist tree", err) + return err + } + + // Skip summarize / HyDE / multi-axis / TOC entirely — flip straight + // to ready. The document is now queryable via the page-based + // strategy (synthesised TOC + raw page reads). + if err := store.SetDocumentStatus(ctx, pl.DocumentID, db.StatusReady, ""); err != nil { + return err + } + log.Info("ingest: ready (minimal mode)") + return nil } // persistTree writes sections + full content in document order. Parents // are written before children so the FK on sections.parent_id holds. -func (p *Pipeline) persistTree(ctx context.Context, docID tree.DocumentID, doc *parser.ParsedDoc) error { +// +// The DB operations go through the narrow docPersister interface so the +// persist path can be exercised (e.g. by the minimal-mode test) without +// a live Postgres; production callers pass p.DB, which satisfies it. +func (p *Pipeline) persistTree(ctx context.Context, store docPersister, docID tree.DocumentID, doc *parser.ParsedDoc) error { // Only overwrite the row's title (which was seeded with the // filename at upload time) when the parsed title looks usable. // Watermarked PDFs whose overlay text shares a Y coordinate with // the real title produce mojibake like "GGlloobbaall SSttrraatteeggyy" // — we'd rather keep the original filename than show that to a user. 
if doc.Title != "" && !isLikelyMojibakeTitle(doc.Title) { - if err := p.DB.SetDocumentTitle(ctx, docID, doc.Title); err != nil { + if err := store.SetDocumentTitle(ctx, docID, doc.Title); err != nil { return err } } @@ -550,7 +642,7 @@ func (p *Pipeline) persistTree(ctx context.Context, docID tree.DocumentID, doc * } } - if err := p.DB.UpsertSection(ctx, db.Section{ + if err := store.UpsertSection(ctx, db.Section{ ID: id, DocumentID: docID, ParentID: parent, @@ -870,14 +962,14 @@ func fallbackSummary(title, body string) string { return strings.Join(strings.Fields(body), " ") } -func (p *Pipeline) fail(ctx context.Context, id tree.DocumentID, stage string, cause error) { +func (p *Pipeline) fail(ctx context.Context, store docPersister, id tree.DocumentID, stage string, cause error) { msg := fmt.Sprintf("%s: %s", stage, cause.Error()) // Use a FRESH context for the failure write — the inbound one is // almost certainly the reason we're failing (timeout/cancel) and // reusing it would leave the doc stuck on "parsing" forever. failCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() - if err := p.DB.SetDocumentStatus(failCtx, id, db.StatusFailed, msg); err != nil { + if err := store.SetDocumentStatus(failCtx, id, db.StatusFailed, msg); err != nil { p.Logger.Error("ingest: failed to mark document failed", "err", err, "cause", cause) } } From 0f9a2cf65d38ebee53182f7acaf40cffd54659f0 Mon Sep 17 00:00:00 2001 From: Halleluyah Oludele Date: Fri, 29 May 2026 00:07:13 +0100 Subject: [PATCH 3/3] test(ingest): prove minimal mode does zero LLM work and stays queryable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - pkg/ingest/minimal_mode_test.go: a minimal-mode pipeline run with an LLM client that fails the test on any call reaches StatusReady with sections persisted and a call counter of 0 — proving minimal ingest is pure-Go. 
A second test reconstructs the persisted tree and confirms the synthesised-TOC fallback is title-bearing and section bodies load back from storage. - pkg/retrieval: TestPageIndexMinimalIngestedDoc drives the page-based strategy end-to-end against a minimal-ingested doc shape (page ranges + content refs, NO summaries, nil TOC) and asserts it produces a cited answer from the synthesised TOC + raw page reads. - pkg/config: default mode is "full"; VLE_INGEST_MODE=minimal override and Validate accept/reject coverage. - Document ingest.mode in both example configs. --- config.example.yaml | 27 ++ config.server.example.yaml | 12 + pkg/config/config_test.go | 46 ++++ pkg/ingest/minimal_mode_test.go | 317 +++++++++++++++++++++++ pkg/retrieval/pageindex_strategy_test.go | 81 ++++++ 5 files changed, 483 insertions(+) create mode 100644 pkg/ingest/minimal_mode_test.go diff --git a/config.example.yaml b/config.example.yaml index 13a40da..31262d7 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -294,10 +294,37 @@ retrieval: model: "" ingest: + # Ingest mode — how much work the pipeline does before a document is + # marked `ready` (queryable). + # + # full (default) parse -> build tree -> persist -> summarize -> + # HyDE -> multi-axis summaries -> TOC build. Maximises + # retrieval quality but costs ~1,000-3,000 LLM calls plus a + # pdftable table-finding pass on a large filing — minutes of + # wall time for a 90-page 10-K. + # + # minimal parse -> build tree -> persist -> ready. Skips ALL + # per-section LLM enrichment (summarize, HyDE, multi-axis, + # TOC build) AND the pdftable table-extraction pass, so a + # document becomes queryable in ~parse-speed (seconds). 
+ # The page-based strategy (/v1/answer/pageindex) needs none + # of the skipped work: it navigates a TOC synthesised from + # the section tree (documents.toc_tree is left NULL) and + # reads raw section/page text at query time — and that raw + # page text still contains the tables' text, so dropping + # table *sections* loses nothing for it. The + # summary-dependent strategies (chunked-tree, agentic) + # degrade to titles + raw content with no summaries. + # + # Override per-process with VLE_INGEST_MODE; on the deployed + # vectorless-server use VLS_INGEST_MODE=minimal (no secret edit needed). + mode: "full" + # The summarize and HyDE stages run concurrently. This caps the total # number of LLM calls in flight across both stages combined, so the # provider's per-tenant concurrency limit isn't exceeded. 0 disables # the global cap; default applied by the engine is 12. + # (Ignored when mode: minimal — no LLM stages run.) global_llm_concurrency: 12 # HyDE candidate-question stage. For each leaf section the pipeline asks diff --git a/config.server.example.yaml b/config.server.example.yaml index 76bc29c..9e56a7b 100644 --- a/config.server.example.yaml +++ b/config.server.example.yaml @@ -99,9 +99,21 @@ engine: include_sibling_breadcrumbs: true ingest: + # Ingest mode: full (default) | minimal. + # full parse -> persist -> summarize -> HyDE -> multi-axis -> + # TOC build. Maximum retrieval quality; minutes on a large + # filing. + # minimal parse -> persist -> ready. Skips every LLM enrichment + # stage AND table extraction — queryable in seconds. The + # page-based strategy (/v1/answer/pageindex) works on it + # unchanged (synthesised TOC + raw page reads). + # Flip the live service without a secret edit: VLS_INGEST_MODE=minimal. + mode: "full" + # The summarize and HyDE stages run concurrently. This caps the total # number of LLM calls in flight across both stages combined. # 0 disables the global cap; default is 12. 
+ # (Ignored when mode: minimal — no LLM stages run.) global_llm_concurrency: 12 # HyDE candidate-question generation per leaf section. Folded into diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index 6da4b92..d8172b2 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -84,6 +84,52 @@ func TestDefaultValues(t *testing.T) { } } +// TestIngestModeDefault locks the default ingest mode to "full" so the +// current full-enrichment behaviour is preserved unless explicitly +// switched. +func TestIngestModeDefault(t *testing.T) { + t.Parallel() + cfg := Default() + if cfg.Ingest.Mode != "full" { + t.Errorf("ingest.mode = %q, want full (default)", cfg.Ingest.Mode) + } +} + +// TestIngestModeEnvOverride covers the VLE_INGEST_MODE override — the +// single env var that flips the engine into fast/minimal ingest. +func TestIngestModeEnvOverride(t *testing.T) { + prev := os.Getenv("VLE_INGEST_MODE") + defer os.Setenv("VLE_INGEST_MODE", prev) + + os.Setenv("VLE_INGEST_MODE", "minimal") + cfg := Default() + applyEnvOverrides(&cfg) + if cfg.Ingest.Mode != "minimal" { + t.Errorf("VLE_INGEST_MODE=minimal not applied, got %q", cfg.Ingest.Mode) + } +} + +// TestIngestModeValidate asserts Validate accepts the documented values +// (and empty, which the pipeline treats as full) and rejects garbage. +func TestIngestModeValidate(t *testing.T) { + t.Parallel() + for _, m := range []string{"", "full", "minimal"} { + cfg := Default() + cfg.Database.URL = "postgres://localhost/test" + cfg.Ingest.Mode = m + if err := cfg.Validate(); err != nil { + t.Errorf("ingest.mode=%q should pass validation, got %v", m, err) + } + } + + cfg := Default() + cfg.Database.URL = "postgres://localhost/test" + cfg.Ingest.Mode = "turbo" + if err := cfg.Validate(); err == nil { + t.Error("ingest.mode=turbo should fail validation") + } +} + func TestTOCEnvOverride(t *testing.T) { // Mutates env — restore on exit. Not parallel. 
keys := []string{ diff --git a/pkg/ingest/minimal_mode_test.go b/pkg/ingest/minimal_mode_test.go new file mode 100644 index 0000000..34a96c1 --- /dev/null +++ b/pkg/ingest/minimal_mode_test.go @@ -0,0 +1,317 @@ +package ingest + +import ( + "bytes" + "context" + "io" + "log/slog" + "os" + "sync" + "testing" + + "github.com/hallelx2/llmgate" + + "github.com/hallelx2/vectorless-engine/pkg/db" + "github.com/hallelx2/vectorless-engine/pkg/storage" + "github.com/hallelx2/vectorless-engine/pkg/tree" +) + +// fakeDocStore is an in-memory docPersister. It captures the status +// transitions and section upserts the minimal pipeline performs so the +// "reaches ready, sections persisted" guarantee can be asserted without +// a live Postgres. Safe for the pipeline's concurrent use (minimal mode +// is sequential, but the mutex keeps the race detector quiet regardless). +type fakeDocStore struct { + mu sync.Mutex + status db.DocumentStatus + errMsg string + title string + sections []db.Section +} + +func (f *fakeDocStore) SetDocumentStatus(_ context.Context, _ tree.DocumentID, s db.DocumentStatus, errMsg string) error { + f.mu.Lock() + defer f.mu.Unlock() + f.status = s + f.errMsg = errMsg + return nil +} + +func (f *fakeDocStore) SetDocumentTitle(_ context.Context, _ tree.DocumentID, title string) error { + f.mu.Lock() + defer f.mu.Unlock() + f.title = title + return nil +} + +func (f *fakeDocStore) UpsertSection(_ context.Context, s db.Section) error { + f.mu.Lock() + defer f.mu.Unlock() + f.sections = append(f.sections, s) + return nil +} + +func (f *fakeDocStore) snapshot() (db.DocumentStatus, string, []db.Section) { + f.mu.Lock() + defer f.mu.Unlock() + out := make([]db.Section, len(f.sections)) + copy(out, f.sections) + return f.status, f.errMsg, out +} + +// failIfCalledLLM is an llmgate.Client that fails the test the instant +// any LLM call is issued. 
It is the proof harness for minimal mode: +// minimal ingest must do ZERO LLM work, so a single Complete call is a +// hard test failure. callCount() lets the test assert the counter stayed 0. +type failIfCalledLLM struct { + t *testing.T + calls int + mu sync.Mutex +} + +func (l *failIfCalledLLM) Complete(_ context.Context, _ llmgate.Request) (*llmgate.Response, error) { + l.mu.Lock() + l.calls++ + l.mu.Unlock() + l.t.Helper() + l.t.Errorf("minimal mode issued an LLM Complete call; it must do zero LLM work") + return nil, llmgate.ErrNotImplemented +} + +func (l *failIfCalledLLM) CountTokens(_ context.Context, text string) (int, error) { + l.mu.Lock() + l.calls++ + l.mu.Unlock() + l.t.Helper() + l.t.Errorf("minimal mode issued an LLM CountTokens call; it must do zero LLM work") + return len(text) / 4, nil +} + +func (l *failIfCalledLLM) callCount() int { + l.mu.Lock() + defer l.mu.Unlock() + return l.calls +} + +// TestMinimalModeZeroLLMCalls is the headline guarantee: a minimal-mode +// pipeline run reaches StatusReady with sections persisted while making +// ZERO LLM calls. The LLM client fails the test on any call, and we also +// assert its call counter stayed at 0 — together proving minimal ingest +// is pure-Go (parse → persist → ready), no summarize / HyDE / multi-axis +// / TOC. 
+func TestMinimalModeZeroLLMCalls(t *testing.T) { + t.Parallel() + + ctx := context.Background() + + store, err := storage.NewLocal(t.TempDir()) + if err != nil { + t.Fatalf("init local storage: %v", err) + } + + fixture, err := os.ReadFile("../../testdata/rust-ownership.md") + if err != nil { + t.Fatalf("read fixture: %v", err) + } + docID := NewDocumentID() + srcKey := SourceKey(docID, "rust-ownership.md") + if err := store.Put(ctx, srcKey, bytes.NewReader(fixture), storage.Metadata{ + ContentType: "text/markdown", + Size: int64(len(fixture)), + }); err != nil { + t.Fatalf("stage source: %v", err) + } + + llm := &failIfCalledLLM{t: t} + + // Construct the pipeline through NewPipeline (the production path) in + // minimal mode. HyDE/SummaryAxes flags are intentionally left at + // their full-mode-on values to prove the minimal switch — not a pile + // of disabled sub-flags — is what suppresses the LLM work. + p := NewPipeline(Pipeline{ + DB: nil, // never touched: runMinimal takes the store explicitly + Storage: store, + LLM: llm, + Parsers: DefaultRegistry(), + Logger: slog.New(slog.NewTextHandler(io.Discard, nil)), + Mode: ModeMinimal, + HyDEEnabled: true, + SummaryAxesEnabled: true, + TOCEnabled: true, + }) + + fake := &fakeDocStore{} + if err := p.runMinimal(ctx, fake, Payload{ + DocumentID: docID, + ContentType: "text/markdown", + Filename: "rust-ownership.md", + SourceRef: srcKey, + }); err != nil { + t.Fatalf("runMinimal: %v", err) + } + + status, errMsg, sections := fake.snapshot() + if status != db.StatusReady { + t.Fatalf("doc status = %q (err=%q); minimal mode did not reach ready", status, errMsg) + } + if len(sections) == 0 { + t.Fatal("minimal mode persisted zero sections") + } + if n := llm.callCount(); n != 0 { + t.Fatalf("minimal mode made %d LLM calls; want 0", n) + } + + // No summaries / axes / candidate-questions were written — minimal + // mode skips every enrichment stage, so every persisted section is + // bare (title + content ref only). 
+ for _, s := range sections { + if s.Summary != "" { + t.Errorf("section %s carries a summary in minimal mode: %q", s.ID, s.Summary) + } + if s.SummaryAxes != nil { + t.Errorf("section %s carries summary_axes in minimal mode", s.ID) + } + if len(s.CandidateQuestions) != 0 { + t.Errorf("section %s carries HyDE questions in minimal mode", s.ID) + } + } +} + +// TestMinimalModeReadyIsQueryable proves a minimal-ingested document is +// usable by the page-based retrieval strategy's two run-time inputs: +// +// 1. the synthesised TOC (documents.toc_tree is NULL after minimal +// ingest, so the strategy falls back to synthesiseTOC over the +// section tree) — must be a non-empty, title-bearing structure; and +// 2. raw section bodies read from storage via the section ContentRef — +// must return the persisted text. +// +// It reconstructs the tree from exactly what runMinimal persisted, so it +// exercises the real post-ingest shape. The end-to-end PageIndexStrategy +// loop is covered in pkg/retrieval (TestPageIndexMinimalIngestedDoc). 
+func TestMinimalModeReadyIsQueryable(t *testing.T) { + t.Parallel() + + ctx := context.Background() + store, err := storage.NewLocal(t.TempDir()) + if err != nil { + t.Fatalf("init local storage: %v", err) + } + + fixture, err := os.ReadFile("../../testdata/rust-ownership.md") + if err != nil { + t.Fatalf("read fixture: %v", err) + } + docID := NewDocumentID() + srcKey := SourceKey(docID, "rust-ownership.md") + if err := store.Put(ctx, srcKey, bytes.NewReader(fixture), storage.Metadata{ + ContentType: "text/markdown", + Size: int64(len(fixture)), + }); err != nil { + t.Fatalf("stage source: %v", err) + } + + p := NewPipeline(Pipeline{ + Storage: store, + LLM: &failIfCalledLLM{t: t}, + Parsers: DefaultRegistry(), + Logger: slog.New(slog.NewTextHandler(io.Discard, nil)), + Mode: ModeMinimal, + }) + fake := &fakeDocStore{} + if err := p.runMinimal(ctx, fake, Payload{ + DocumentID: docID, + ContentType: "text/markdown", + Filename: "rust-ownership.md", + SourceRef: srcKey, + }); err != nil { + t.Fatalf("runMinimal: %v", err) + } + _, _, sections := fake.snapshot() + + // Reconstruct the tree from persisted rows (mirrors db.buildTree's + // parent→children wiring) and confirm it is non-trivial. + root := reconstructTree(docID, fake.title, sections) + if root == nil { + t.Fatal("reconstructed tree root is nil; minimal mode persisted nothing usable") + } + + // (1) Synthesised TOC fallback returns a usable, title-bearing view. + titleSeen := false + var walk func(*tree.Section) + walk = func(s *tree.Section) { + if s == nil { + return + } + if s.Title != "" { + titleSeen = true + } + for _, c := range s.Children { + walk(c) + } + } + walk(root) + if !titleSeen { + t.Error("reconstructed section tree carries no titles; synthesised TOC would be empty") + } + + // (2) At least one persisted leaf has a ContentRef whose bytes load + // back from storage — the raw text the page strategy reads at query + // time. 
+ loadedSomeBody := false + for _, s := range sections { + if s.ContentRef == "" { + continue + } + rc, _, err := store.Get(ctx, s.ContentRef) + if err != nil { + t.Fatalf("load section %s content: %v", s.ID, err) + } + body, _ := io.ReadAll(rc) + rc.Close() + if len(bytes.TrimSpace(body)) > 0 { + loadedSomeBody = true + } + } + if !loadedSomeBody { + t.Error("no section body loaded from storage; page strategy would have no raw text to read") + } +} + +// reconstructTree wires a flat db.Section list into a tree.Section root, +// matching db.buildTree's behaviour (which is unexported): a single +// top-level section becomes the root; multiple are wrapped in a +// synthetic empty-ID root carrying the document title. +func reconstructTree(_ tree.DocumentID, title string, rows []db.Section) *tree.Section { + byID := make(map[tree.SectionID]*tree.Section, len(rows)) + for _, r := range rows { + byID[r.ID] = &tree.Section{ + ID: r.ID, + ParentID: r.ParentID, + Ordinal: r.Ordinal, + Title: r.Title, + ContentRef: r.ContentRef, + PageStart: r.PageStart, + PageEnd: r.PageEnd, + } + } + var topLevel []*tree.Section + for _, r := range rows { + s := byID[r.ID] + if s.ParentID == "" { + topLevel = append(topLevel, s) + continue + } + if parent, ok := byID[s.ParentID]; ok { + parent.Children = append(parent.Children, s) + } + } + switch len(topLevel) { + case 0: + return nil + case 1: + return topLevel[0] + default: + return &tree.Section{Title: title, Children: topLevel} + } +} diff --git a/pkg/retrieval/pageindex_strategy_test.go b/pkg/retrieval/pageindex_strategy_test.go index e6b9e0d..4d50b37 100644 --- a/pkg/retrieval/pageindex_strategy_test.go +++ b/pkg/retrieval/pageindex_strategy_test.go @@ -190,6 +190,87 @@ func TestPageIndexHappyPath(t *testing.T) { } } +// buildMinimalIngestedTree mirrors the post-ingest shape of a document +// run through MINIMAL ingest mode: sections carry page ranges (the PDF +// parser populates them) and content refs (persisted bodies) but NO +// 
summaries (minimal mode skips the summarize stage) and NO HyDE
+// questions. documents.toc_tree is NULL after minimal ingest, which the
+// strategy models by leaving TOC nil — forcing synthesiseTOC.
+func buildMinimalIngestedTree() *tree.Tree {
+	ownership := &tree.Section{ID: "sec_a1", Title: "Ownership", ParentID: "sec_a", PageStart: 1, PageEnd: 2, ContentRef: "a1_ref"}
+	borrowing := &tree.Section{ID: "sec_a2", Title: "Borrowing", ParentID: "sec_a", PageStart: 3, PageEnd: 4, ContentRef: "a2_ref"}
+	lifetimes := &tree.Section{ID: "sec_b1", Title: "Lifetimes", ParentID: "sec_b", PageStart: 5, PageEnd: 7, ContentRef: "b1_ref"}
+	memory := &tree.Section{ID: "sec_a", Title: "Memory", ParentID: "sec_root", PageStart: 1, PageEnd: 4, Children: []*tree.Section{ownership, borrowing}}
+	advanced := &tree.Section{ID: "sec_b", Title: "Advanced", ParentID: "sec_root", PageStart: 5, PageEnd: 7, Children: []*tree.Section{lifetimes}}
+	top := &tree.Section{ID: "sec_root", Title: "Rust", PageStart: 1, PageEnd: 7, Children: []*tree.Section{memory, advanced}}
+	return &tree.Tree{Root: top, Title: "Rust", DocumentID: "doc_minimal"}
+}
+
+// TestPageIndexMinimalIngestedDoc is the cross-package guarantee for the
+// minimal ingest mode: a document ingested with NO LLM enrichment (no
+// summaries, no HyDE, NULL toc_tree) is still fully answerable through
+// the page-based strategy. It drives the canonical structure → get_pages
+// → done loop with TOC left nil (the NULL-toc_tree state) and asserts:
+//
+//   - get_document_structure surfaces the SYNTHESISED TOC (section titles
+//     from the tree) — proving the NULL-toc_tree fallback works; and
+//   - get_pages surfaces RAW section content read via the loader — the
+//     text the strategy answers from, which on a real minimal-ingested
+//     doc is the persisted page text (and still contains any table text).
+//
+// No summaries are present anywhere in the tree, so this also proves the
+// strategy does not hard-require a summary to navigate or answer.
+func TestPageIndexMinimalIngestedDoc(t *testing.T) {
+	t.Parallel()
+
+	script := &pageScriptedLLM{
+		replies: []string{
+			`{"tool":"get_document_structure","reasoning":"orient by titles"}`,
+			`{"tool":"get_pages","start_page":1,"end_page":2,"reasoning":"ownership lives up front"}`,
+			`{"tool":"done","answer":"Ownership is a set of rules the compiler checks.","cited_pages":[[1,2]],"reasoning":"pages 1-2 define ownership"}`,
+		},
+	}
+	bodies := map[string]string{
+		"a1_ref": "Ownership is a set of rules that govern how a Rust program manages memory.",
+		"a2_ref": "References borrow a value without taking ownership.",
+		"b1_ref": "Lifetimes ensure references are valid.",
+	}
+
+	// TOC is deliberately never assigned: that models the NULL
+	// documents.toc_tree minimal ingest leaves, so the strategy must synthesise.
+	strategy := retrieval.NewPageIndexStrategy(script)
+	strategy.PageLoader = pageMapLoader{data: bodies}
+
+	got, err := strategy.SelectWithCost(context.Background(), buildMinimalIngestedTree(), "what is ownership?", retrieval.ContextBudget{MaxTokens: 100000})
+	if err != nil {
+		t.Fatalf("SelectWithCost on minimal-ingested doc: %v", err)
+	}
+	if !strings.Contains(got.Reasoning, "Ownership is a set of rules") {
+		t.Errorf("answer must carry the model's reply, got %q", got.Reasoning)
+	}
+	if _, cited := indexOfSection(got.SelectedIDs, "sec_a1"); !cited {
+		t.Errorf("sec_a1 (pages 1-2) must be cited, got %v", got.SelectedIDs)
+	}
+	if len(got.PagesRead) != 1 || got.PagesRead[0].CharCount == 0 {
+		t.Errorf("expected one non-empty get_pages read, got %+v", got.PagesRead)
+	}
+
+	script.mu.Lock()
+	defer script.mu.Unlock()
+	prompts := script.lastPrompts
+	if len(prompts) < 3 {
+		t.Fatalf("expected >=3 prompts captured, got %d", len(prompts))
+	}
+	// (1) The second prompt holds the synthesised TOC, so a section title shows up.
+	if !strings.Contains(prompts[1], "Ownership") {
+		t.Errorf("synthesised TOC observation should include section titles; got:\n%s", prompts[1])
+	}
+	// (2) The third prompt holds the get_pages result: the raw body, not a summary.
+	if !strings.Contains(prompts[2], "Ownership is a set of rules that govern") {
+		t.Errorf("get_pages observation should include raw section content; got:\n%s", prompts[2])
+	}
+}
+
 // TestPageIndexMultiRangeDone covers a done with two cited ranges:
 // the strategy must surface every section that overlaps EITHER
 // range. This is the FinanceBench-shaped pattern: an answer that