From 78961cd6475a35a407693f8244c325a47230cdee Mon Sep 17 00:00:00 2001 From: Halleluyah Oludele Date: Wed, 27 May 2026 03:11:26 +0100 Subject: [PATCH 1/3] feat(db): add summary_axes JSONB column for multi-axis summaries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 2.5 schema groundwork. The summarizer is about to start writing a structured {topics, entities, numbers, one_line} object per section so retrieval has more axes to match on; this commit adds the storage shape and the wire model without changing any existing behaviour. - New migration 0005 adds sections.summary_axes JSONB. Forward-only additive: NULL on every existing row, no backfill required. The down migration drops the column cleanly. - tree.SummaryAxes carries the four axes; Section + SectionView gain an optional *SummaryAxes pointer so nil distinguishes 'not yet generated' from 'generated, no entities found'. Existing Section.Summary stays put — it'll hold axes.OneLine so older SDKs that only read 'summary' keep working. - db.UpsertSection writes the new column; db.UpdateSectionSummaryAxes mirrors UpdateSectionSummary so the summarize stage can patch the two fields independently as it processes each section. - JSONB marshal/unmarshal follows the candidate_questions pattern: nil pointer marshals to SQL NULL, a non-nil pointer always marshals to an object, garbled bytes degrade to nil. Roundtrip tests cover nil / empty / full / unicode shapes. --- .../0005_sections_summary_axes.down.sql | 2 + .../0005_sections_summary_axes.up.sql | 20 ++++ pkg/db/sections.go | 76 +++++++++++-- pkg/db/sections_marshal_test.go | 102 ++++++++++++++++++ pkg/tree/tree.go | 41 +++++++ 5 files changed, 233 insertions(+), 8 deletions(-) create mode 100644 pkg/db/migrations/0005_sections_summary_axes.down.sql create mode 100644 pkg/db/migrations/0005_sections_summary_axes.up.sql diff --git a/pkg/db/migrations/0005_sections_summary_axes.down.sql b/pkg/db/migrations/0005_sections_summary_axes.down.sql new file mode 100644 index 0000000..2a5480a --- /dev/null +++ b/pkg/db/migrations/0005_sections_summary_axes.down.sql @@ -0,0 +1,2 @@ +ALTER TABLE sections + DROP COLUMN IF EXISTS summary_axes; diff --git a/pkg/db/migrations/0005_sections_summary_axes.up.sql b/pkg/db/migrations/0005_sections_summary_axes.up.sql new file mode 100644 index 0000000..d06c408 --- /dev/null +++ b/pkg/db/migrations/0005_sections_summary_axes.up.sql @@ -0,0 +1,20 @@ +-- 0005_sections_summary_axes.up.sql — multi-axis structured summaries. +-- +-- Phase 2.5 of the retrieval-quality roadmap: the summarizer now returns a +-- structured object ({topics, entities, numbers, one_line}) instead of a +-- single sentence so retrieval has richer signal per section. The existing +-- `summary` column continues to hold the one-line sentence (axes.one_line) +-- for backward compatibility — older SDKs / responses that only read +-- `summary` are unaffected. +-- +-- summary_axes +-- JSONB blob carrying the structured shape. NULL for older sections +-- written before this migration; the retrieval-side rendering of axes +-- simply skips when the column is nil, so no backfill is required. +-- +-- Not indexed: JSONB queries on this column aren't on the hot path. The +-- retrieval prompt loads sections by document_id (already indexed) and +-- reads axes inline. + +ALTER TABLE sections + ADD COLUMN IF NOT EXISTS summary_axes JSONB; diff --git a/pkg/db/sections.go b/pkg/db/sections.go index 142b345..d021740 100644 --- a/pkg/db/sections.go +++ b/pkg/db/sections.go @@ -33,6 +33,13 @@ type Section struct { // generated". CandidateQuestions []string + // SummaryAxes is the Phase 2.5 multi-axis structured summary + // (topics / entities / numbers / one_line). Persisted as JSONB; + // nil means "not yet generated" — older sections written before + // 0005_sections_summary_axes carry NULL and are still query-able + // via the plain Summary field. + SummaryAxes *tree.SummaryAxes + Metadata map[string]string } @@ -41,7 +48,7 @@ type Section struct { // scoped / worker / list variants. const sectionSelectColumns = `id, document_id, COALESCE(parent_id, ''), ordinal, depth, title, summary, content_ref, token_count, metadata, - page_start, page_end, candidate_questions` + page_start, page_end, candidate_questions, summary_axes` // scanSectionRow scans columns in the same order as sectionSelectColumns. // Used by every section-fetching method to keep parsing in lockstep with @@ -50,17 +57,18 @@ func scanSectionRow(row interface { Scan(dest ...any) error }) (Section, error) { var s Section - var rawMeta, rawCandidates []byte + var rawMeta, rawCandidates, rawAxes []byte var pageStart, pageEnd sql.NullInt64 if err := row.Scan(&s.ID, &s.DocumentID, &s.ParentID, &s.Ordinal, &s.Depth, &s.Title, &s.Summary, &s.ContentRef, &s.TokenCount, &rawMeta, - &pageStart, &pageEnd, &rawCandidates); err != nil { + &pageStart, &pageEnd, &rawCandidates, &rawAxes); err != nil { return s, err } s.Metadata = unmarshalMeta(rawMeta) s.PageStart = scanNullableInt(pageStart) s.PageEnd = scanNullableInt(pageEnd) s.CandidateQuestions = unmarshalCandidateQuestions(rawCandidates) + s.SummaryAxes = unmarshalSummaryAxes(rawAxes) return s, nil } @@ -81,12 +89,16 @@ func (p *Pool) UpsertSection(ctx context.Context, s Section) error { if err != nil { return err } + axes, err := marshalSummaryAxes(s.SummaryAxes) + if err != nil { + return err + } _, err = p.Exec(ctx, ` INSERT INTO sections (id, document_id, parent_id, ordinal, depth, title, summary, content_ref, token_count, metadata, page_start, page_end, - candidate_questions) - VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13) + candidate_questions, summary_axes) + VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14) ON CONFLICT (id) DO UPDATE SET parent_id = EXCLUDED.parent_id, ordinal = EXCLUDED.ordinal, @@ -99,10 +111,11 @@ func (p *Pool) UpsertSection(ctx context.Context, s Section) error { page_start = EXCLUDED.page_start, page_end = EXCLUDED.page_end, candidate_questions = EXCLUDED.candidate_questions, + summary_axes = EXCLUDED.summary_axes, updated_at = now()`, string(s.ID), string(s.DocumentID), parent, s.Ordinal, s.Depth, s.Title, s.Summary, s.ContentRef, s.TokenCount, meta, - pageStart, pageEnd, candidates, + pageStart, pageEnd, candidates, axes, ) return mapErr(err) } @@ -130,6 +143,22 @@ func (p *Pool) UpdateSectionCandidateQuestions(ctx context.Context, id tree.Sect return mapErr(err) } +// UpdateSectionSummaryAxes persists the Phase 2.5 multi-axis summary +// blob for a section. Pass nil to clear (stores SQL NULL — the +// "not yet generated" state). Mirrors UpdateSectionSummary so the two +// fields can be patched independently as the pipeline progresses. +func (p *Pool) UpdateSectionSummaryAxes(ctx context.Context, id tree.SectionID, axes *tree.SummaryAxes) error { + raw, err := marshalSummaryAxes(axes) + if err != nil { + return err + } + _, err = p.Exec(ctx, ` + UPDATE sections + SET summary_axes = $2, updated_at = now() + WHERE id = $1`, string(id), raw) + return mapErr(err) +} + // nullIfZero returns SQL NULL when n == 0, otherwise n. Used so unknown // page ranges land as NULL in DB rather than collapsing to "page 0". func nullIfZero(n int) any { @@ -166,6 +195,36 @@ func unmarshalCandidateQuestions(raw []byte) []string { return out } +// marshalSummaryAxes encodes a SummaryAxes pointer as JSONB. nil → SQL +// NULL (the "not yet generated" state). A non-nil pointer is always +// stored as the object form, even when every axis is empty, so callers +// can distinguish "we generated axes and got nothing" from "we haven't +// tried yet". Mirrors marshalCandidateQuestions. +func marshalSummaryAxes(a *tree.SummaryAxes) (any, error) { + if a == nil { + return nil, nil + } + b, err := json.Marshal(a) + if err != nil { + return nil, fmt.Errorf("marshal summary_axes: %w", err) + } + return b, nil +} + +// unmarshalSummaryAxes decodes a JSONB summary_axes blob. NULL / +// zero-length → nil. Garbled bytes degrade to nil rather than panic so +// a stray bad row can't take down listing endpoints. +func unmarshalSummaryAxes(raw []byte) *tree.SummaryAxes { + if len(raw) == 0 { + return nil + } + var out tree.SummaryAxes + if err := json.Unmarshal(raw, &out); err != nil { + return nil + } + return &out +} + // scanNullableInt unwraps a sql.NullInt64 into a plain int (0 = NULL). func scanNullableInt(n sql.NullInt64) int { if !n.Valid { @@ -208,7 +267,7 @@ func (p *Pool) GetSection(ctx context.Context, id tree.SectionID, orgID, storeID q := ` SELECT s.id, s.document_id, COALESCE(s.parent_id, ''), s.ordinal, s.depth, s.title, s.summary, s.content_ref, s.token_count, s.metadata, - s.page_start, s.page_end, s.candidate_questions + s.page_start, s.page_end, s.candidate_questions, s.summary_axes FROM sections s JOIN documents d ON d.id = s.document_id WHERE s.id = $1 AND d.org_id = $2` @@ -249,7 +308,7 @@ func (p *Pool) ListSections(ctx context.Context, docID tree.DocumentID, orgID, s q := ` SELECT s.id, s.document_id, COALESCE(s.parent_id, ''), s.ordinal, s.depth, s.title, s.summary, s.content_ref, s.token_count, s.metadata, - s.page_start, s.page_end, s.candidate_questions + s.page_start, s.page_end, s.candidate_questions, s.summary_axes FROM sections s JOIN documents d ON d.id = s.document_id WHERE s.document_id = $1 AND d.org_id = $2` @@ -344,6 +403,7 @@ func buildTree(doc *Document, rows []Section) *tree.Tree { Ordinal: r.Ordinal, Title: r.Title, Summary: r.Summary, + SummaryAxes: r.SummaryAxes, ContentRef: r.ContentRef, TokenCount: r.TokenCount, PageStart: r.PageStart, diff --git a/pkg/db/sections_marshal_test.go b/pkg/db/sections_marshal_test.go index a88165c..35da1fc 100644 --- a/pkg/db/sections_marshal_test.go +++ b/pkg/db/sections_marshal_test.go @@ -2,7 +2,10 @@ package db import ( "database/sql" + "encoding/json" "testing" + + "github.com/hallelx2/vectorless-engine/pkg/tree" ) // TestMarshalCandidateQuestionsRoundTrip exercises the JSONB-marshal @@ -84,3 +87,102 @@ func TestScanNullableInt(t *testing.T) { t.Errorf("got %d, want 42", got) } } + +// TestMarshalSummaryAxesRoundTrip exercises the JSONB-marshal path that +// UpsertSection + UpdateSectionSummaryAxes use, so storing a structured +// summary and reading it back reproduces the same object. +func TestMarshalSummaryAxesRoundTrip(t *testing.T) { + cases := []struct { + name string + in *tree.SummaryAxes + }{ + {"nil → NULL → nil", nil}, + {"empty object", &tree.SummaryAxes{}}, + {"only one_line", &tree.SummaryAxes{OneLine: "A short sentence."}}, + {"full shape", &tree.SummaryAxes{ + Topics: []string{"debt", "long-term-obligations"}, + Entities: []string{"3M Company", "JPMorgan", "Q3 2024"}, + Numbers: []string{"$4.2B", "2.8%"}, + OneLine: "Long-term debt issued by 3M Company and serviced via JPMorgan.", + }}, + {"unicode + punctuation", &tree.SummaryAxes{ + Topics: []string{"§-12(b)", "côté"}, + Entities: []string{"Renée Müller"}, + Numbers: []string{"€1,234"}, + OneLine: "Une phrase en français — avec des “guillemets” et un § 12(b).", + }}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + raw, err := marshalSummaryAxes(c.in) + if err != nil { + t.Fatalf("marshal: %v", err) + } + if c.in == nil { + if raw != nil { + t.Fatalf("nil input should produce nil (SQL NULL), got %v", raw) + } + if got := unmarshalSummaryAxes(nil); got != nil { + t.Errorf("unmarshal of NULL should be nil, got %v", got) + } + return + } + b, ok := raw.([]byte) + if !ok { + t.Fatalf("non-nil input should produce []byte, got %T", raw) + } + got := unmarshalSummaryAxes(b) + if got == nil { + t.Fatalf("roundtrip lost the value; raw=%s", string(b)) + } + if got.OneLine != c.in.OneLine { + t.Errorf("OneLine: got %q want %q", got.OneLine, c.in.OneLine) + } + assertStringSliceEq(t, "Topics", got.Topics, c.in.Topics) + assertStringSliceEq(t, "Entities", got.Entities, c.in.Entities) + assertStringSliceEq(t, "Numbers", got.Numbers, c.in.Numbers) + }) + } +} + +// TestUnmarshalSummaryAxesTolerant — non-JSON / garbled bytes should +// fall back to nil rather than panic. Guards listing endpoints against +// a stray bad row. +func TestUnmarshalSummaryAxesTolerant(t *testing.T) { + if got := unmarshalSummaryAxes([]byte("not json")); got != nil { + t.Errorf("garbled bytes should yield nil, got %v", got) + } +} + +// TestMarshalSummaryAxesEmptyObjectIsNotNull guards the contract that a +// non-nil pointer always serialises to JSONB (not SQL NULL). Lets the +// pipeline distinguish "we generated axes but every list is empty" +// from "axes were never generated". +func TestMarshalSummaryAxesEmptyObjectIsNotNull(t *testing.T) { + raw, err := marshalSummaryAxes(&tree.SummaryAxes{}) + if err != nil { + t.Fatalf("marshal: %v", err) + } + b, ok := raw.([]byte) + if !ok { + t.Fatalf("empty axes should produce []byte, got %T", raw) + } + // Should be a JSON object — `{}` after omitempty stripping. + var probe map[string]any + if err := json.Unmarshal(b, &probe); err != nil { + t.Fatalf("marshalled output is not a JSON object: %s", string(b)) + } +} + +func assertStringSliceEq(t *testing.T, label string, got, want []string) { + t.Helper() + if len(got) != len(want) { + t.Errorf("%s len: got %v want %v", label, got, want) + return + } + for i := range got { + if got[i] != want[i] { + t.Errorf("%s[%d]: got %q want %q", label, i, got[i], want[i]) + } + } +} diff --git a/pkg/tree/tree.go b/pkg/tree/tree.go index 06fd623..edf36ba 100644 --- a/pkg/tree/tree.go +++ b/pkg/tree/tree.go @@ -42,6 +42,16 @@ type Section struct { Title string `json:"title"` Summary string `json:"summary,omitempty"` + // SummaryAxes is the structured, multi-axis summary written by the + // Phase 2.5 summarizer. Carries topics / entities / numbers / one_line. + // nil for sections written before multi-axis summaries shipped (the + // retrieval prompt simply skips the extra axes lines when nil). + // + // SummaryAxes.OneLine and the flat `Summary` field above are kept in + // sync at write time so older callers that only read `summary` keep + // working unchanged. + SummaryAxes *SummaryAxes `json:"summary_axes,omitempty"` + // ContentRef points to the full text of this section in storage. // Only leaf sections have content; internal sections summarize their // children. @@ -70,6 +80,31 @@ type Section struct { Children []*Section `json:"children,omitempty"` } +// SummaryAxes is the multi-axis structured summary the Phase 2.5 +// summarizer produces. Each axis gives retrieval a different angle on +// the section's content: +// +// - Topics: hyphenated keywords ("debt", "long-term-obligations") +// for coarse subject-matter matching. +// - Entities: proper nouns extracted verbatim (orgs, people, places, +// dates) so retrieval can match queries that mention them by name. +// - Numbers: standout numeric values with the units as they appear +// in the section ("$4.2B", "2.8%") — preserves the surface form so +// a downstream model doesn't have to re-normalise. +// - OneLine: the human-readable sentence describing the section. +// Persisted into the plain `summary` field on Section so older +// SDKs / API consumers that read `summary` continue to work. +// +// All axes are optional; a parse failure or content with no extractable +// entities/numbers can leave individual fields empty without invalidating +// the rest of the object. +type SummaryAxes struct { + Topics []string `json:"topics,omitempty"` + Entities []string `json:"entities,omitempty"` + Numbers []string `json:"numbers,omitempty"` + OneLine string `json:"one_line,omitempty"` +} + // IsLeaf reports whether this section has no children. func (s *Section) IsLeaf() bool { return len(s.Children) == 0 @@ -128,6 +163,11 @@ type SectionView struct { // can answer. Surfaced into the retrieval prompt to widen the model's // lexical/semantic overlap with the user query. CandidateQuestions []string `json:"candidate_questions,omitempty"` + + // SummaryAxes mirrors Section.SummaryAxes so the retrieval prompt + // can surface topics / entities / numbers alongside the one-line + // summary. nil for pre-Phase-2.5 sections. + SummaryAxes *SummaryAxes `json:"summary_axes,omitempty"` } // BuildView renders the tree as a flat list of SectionViews in depth-first @@ -152,6 +192,7 @@ func (t *Tree) BuildView() View { PageStart: s.PageStart, PageEnd: s.PageEnd, CandidateQuestions: s.CandidateQuestions, + SummaryAxes: s.SummaryAxes, } for _, c := range s.Children { sv.Children = append(sv.Children, c.ID) From fa6fb0835daeceb2b45172d2222e63e1b6137f9d Mon Sep 17 00:00:00 2001 From: Halleluyah Oludele Date: Wed, 27 May 2026 03:19:13 +0100 Subject: [PATCH 2/3] feat(ingest): structured summary prompt + parser for multi-axis summaries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The summarizer now produces a structured {topics, entities, numbers, one_line} object per section via JSON mode, replacing the one-line prompt. Persistence writes BOTH the new axes blob (via UpdateSectionSummaryAxes) and the one_line into the existing summary column, so older callers that read summary continue to work unchanged. - structuredSummaryFor builds the JSON-mode request, mirroring the HyDE retry-on-parse-failure shape (defaultSummaryAxesRetries=2). On final parse failure the section ends up with the model's raw text as OneLine and empty axis lists — never calls p.fail, so a single LLM-formatting blip doesn't dead-letter the document. - parseSummaryAxes tolerates code-fence wrappers and prose leakage (same contract as ParseSelection / parseHyDEResponse), trims blank/whitespace-only axis entries, and rejects non-JSON outright so the retry loop fires. - Per-axis caps (SummaryAxesMaxTopics/Entities/Numbers) trim oversized model output so a misbehaving model can't blow up the retrieval prompt budget downstream. Defaults: 4/8/6. - legacyOneLineSummary is the pre-2.5 single-sentence path; reached when SummaryAxesEnabled is false. Keeps the structured-vs-legacy code paths cleanly separated so the opt-out is one bool away. - summarize now patches summary + summary_axes independently — the summary column always gets axes.OneLine; the JSONB column is only written when SummaryAxesEnabled is on, so a future opt-out cleanly leaves summary_axes NULL. Tests cover: happy-path axes parse, axis caps enforcement, parse failure falls back to raw text in OneLine, transport / ErrNotImplemented fallback, retry count, code-fence / blank-entry tolerance. --- pkg/ingest/ingest.go | 154 +++++++++++++- pkg/ingest/summary_axes.go | 181 +++++++++++++++++ pkg/ingest/summary_axes_test.go | 347 ++++++++++++++++++++++++++++++++ 3 files changed, 672 insertions(+), 10 deletions(-) create mode 100644 pkg/ingest/summary_axes.go create mode 100644 pkg/ingest/summary_axes_test.go diff --git a/pkg/ingest/ingest.go b/pkg/ingest/ingest.go index 666dee3..d4c174d 100644 --- a/pkg/ingest/ingest.go +++ b/pkg/ingest/ingest.go @@ -79,6 +79,30 @@ type Pipeline struct { // documents at the cost of higher LLM throughput. Default: 4. SummaryConcurrency int + // SummaryAxesEnabled toggles the Phase 2.5 multi-axis structured + // summary path. When true (the default), the summarizer asks the LLM + // for a JSON object {topics, entities, numbers, one_line} and + // persists both the axes blob and the one_line into the existing + // summary column. When false, the pipeline falls back to the + // pre-2.5 single-sentence prompt — no axes are written and the + // retrieval prompt sees only summary + HyDE questions. + // + // Defaulted to true by config wiring. Left as the Go zero value + // (false) when Pipeline is constructed directly, so old test paths + // that build a Pipeline literal without the field keep the legacy + // behaviour. + SummaryAxesEnabled bool + + // SummaryAxesMaxTopics caps the topic axis returned by the + // structured summarizer. Default: 4. + SummaryAxesMaxTopics int + + // SummaryAxesMaxEntities caps the entities axis. Default: 8. + SummaryAxesMaxEntities int + + // SummaryAxesMaxNumbers caps the numbers axis. Default: 6. + SummaryAxesMaxNumbers int + // HyDEEnabled toggles the candidate-question generation stage. // Defaulted to true by config wiring; left as the Go zero value // (false) when Pipeline is constructed directly, so unit tests with @@ -124,6 +148,20 @@ func NewPipeline(p Pipeline) *Pipeline { if p.SummaryConcurrency <= 0 { p.SummaryConcurrency = 4 } + // Multi-axis structured summaries are on by default — they unlock + // Phase 2.5 retrieval signal without changing the existing summary + // field's contract. Callers that construct a Pipeline literal + // directly and don't set this still get the legacy single-line + // path (Go zero value), matching the historical test contract. + if p.SummaryAxesMaxTopics <= 0 { + p.SummaryAxesMaxTopics = 4 + } + if p.SummaryAxesMaxEntities <= 0 { + p.SummaryAxesMaxEntities = 8 + } + if p.SummaryAxesMaxNumbers <= 0 { + p.SummaryAxesMaxNumbers = 6 + } if p.HyDENumQuestions <= 0 { p.HyDENumQuestions = 5 } @@ -383,7 +421,7 @@ func (p *Pipeline) summarize(ctx context.Context, docID tree.DocumentID, profile var ( mu sync.Mutex errs []error - computed = map[tree.SectionID]string{} // section ID → freshly-written summary + computed = map[tree.SectionID]string{} // section ID → freshly-written one-line summary ) // Process from deepest to shallowest so children are summarized @@ -434,7 +472,7 @@ func (p *Pipeline) summarize(ctx context.Context, docID tree.DocumentID, profile if !ok { return nil } - summary, err := p.summaryFor(gctx, s, childLines, profile) + axes, err := p.summaryFor(gctx, s, childLines, profile) release() if err != nil { mu.Lock() @@ -443,15 +481,32 @@ func (p *Pipeline) summarize(ctx context.Context, docID tree.DocumentID, profile return nil // non-fatal — don't abort siblings } + oneLine := "" + if axes != nil { + oneLine = axes.OneLine + } mu.Lock() - computed[s.ID] = summary + computed[s.ID] = oneLine mu.Unlock() - if err := p.DB.UpdateSectionSummary(gctx, s.ID, summary, s.TokenCount); err != nil { + // Always patch the flat `summary` column with the + // one-line sentence so older API consumers continue to + // see a populated summary. The axes blob is patched + // separately and only when the multi-axis path is on, + // so a future opt-out (or a parse failure) cleanly + // leaves summary_axes NULL. + if err := p.DB.UpdateSectionSummary(gctx, s.ID, oneLine, s.TokenCount); err != nil { mu.Lock() errs = append(errs, err) mu.Unlock() } + if p.SummaryAxesEnabled && axes != nil { + if err := p.DB.UpdateSectionSummaryAxes(gctx, s.ID, axes); err != nil { + mu.Lock() + errs = append(errs, err) + mu.Unlock() + } + } return nil }) } @@ -462,21 +517,31 @@ func (p *Pipeline) summarize(ctx context.Context, docID tree.DocumentID, profile return errors.Join(errs...) } -func (p *Pipeline) summaryFor(ctx context.Context, s db.Section, childLines []string, profile string) (string, error) { +// summaryFor produces the multi-axis structured summary for a single +// section. Returns a non-nil *tree.SummaryAxes even on parse failure: +// in the failure case OneLine carries the model's raw text (or a +// fallback excerpt) and the slice axes are empty, so the section is +// still retrieval-able via the flat summary column. +// +// When SummaryAxesEnabled is false the function falls back to the +// pre-2.5 single-sentence prompt and returns axes with only OneLine +// populated. Callers (summarize) skip the axes JSONB write in that +// case, leaving summary_axes NULL. +func (p *Pipeline) summaryFor(ctx context.Context, s db.Section, childLines []string, profile string) (*tree.SummaryAxes, error) { var body string if len(childLines) == 0 { // Leaf: fetch the stored text. if s.ContentRef == "" { - return cleanForLLM(s.Title), nil + return &tree.SummaryAxes{OneLine: cleanForLLM(s.Title)}, nil } rc, _, err := p.Storage.Get(ctx, s.ContentRef) if err != nil { - return "", err + return nil, err } defer rc.Close() raw, err := io.ReadAll(io.LimitReader(rc, int64(p.SummaryMaxChars))) if err != nil { - return "", err + return nil, err } body = cleanForLLM(string(raw)) } else { @@ -491,6 +556,20 @@ func (p *Pipeline) summaryFor(ctx context.Context, s db.Section, childLines []st body = b.String() } + if !p.SummaryAxesEnabled { + oneLine, err := p.legacyOneLineSummary(ctx, s, body, profile) + if err != nil { + return nil, err + } + return &tree.SummaryAxes{OneLine: oneLine}, nil + } + return p.structuredSummaryFor(ctx, s, body, profile), nil +} + +// legacyOneLineSummary is the pre-Phase-2.5 path: a single-sentence +// request. Kept for the SummaryAxesEnabled=false opt-out branch and +// for unit tests that build a Pipeline literal without the new flag. +func (p *Pipeline) legacyOneLineSummary(ctx context.Context, s db.Section, body, profile string) (string, error) { resp, err := p.LLM.Complete(ctx, llmgate.Request{ Model: p.SummaryModel, Temperature: 0.0, @@ -504,8 +583,8 @@ func (p *Pipeline) summaryFor(ctx context.Context, s db.Section, childLines []st }) if err != nil { // Stub LLMs return ErrNotImplemented. Degrade gracefully: use a - // truncated excerpt as the "summary" so downstream retrieval still - // has something to reason over. + // truncated excerpt as the "summary" so downstream retrieval + // still has something to reason over. if errors.Is(err, llmgate.ErrNotImplemented) { return fallbackSummary(s.Title, body), nil } @@ -517,6 +596,61 @@ func (p *Pipeline) summaryFor(ctx context.Context, s db.Section, childLines []st return fallbackSummary(s.Title, body), nil } +// structuredSummaryFor is the Phase 2.5 path: a JSON-mode request that +// returns {topics, entities, numbers, one_line}. Mirrors the HyDE +// retry-on-parse-failure shape — if the model produces non-JSON we +// retry with a stricter nudge; final failure degrades to an axes +// object whose OneLine carries the model's raw text and whose axis +// lists are empty, so ingest never calls p.fail for a summarization +// parse blip. +// +// ErrNotImplemented (stub LLM) collapses to the legacy text fallback +// so unit tests with no LLM keep producing a non-empty summary. +func (p *Pipeline) structuredSummaryFor(ctx context.Context, s db.Section, body, profile string) *tree.SummaryAxes { + req := llmgate.Request{ + Model: p.SummaryModel, + Temperature: 0.0, + MaxTokens: 600, + Messages: []llmgate.Message{ + {Role: llmgate.RoleSystem, Content: summaryAxesSystemPrompt(profile)}, + {Role: llmgate.RoleUser, Content: fmt.Sprintf( + "Section titled %q.\n\nContent:\n%s\n\n%s", + cleanForLLM(s.Title), body, + summaryAxesUserSuffix(p.SummaryAxesMaxTopics, p.SummaryAxesMaxEntities, p.SummaryAxesMaxNumbers), + )}, + }, + JSONMode: true, + JSONSchema: []byte(summaryAxesJSONSchema), + } + + axes, rawText, err := runSummaryAxesWithRetry(ctx, p.LLM, req, defaultSummaryAxesRetries) + if err != nil { + // Transport / ErrNotImplemented / unrecoverable: fall back to a + // text excerpt as OneLine with empty axes. Never fail ingest. + return &tree.SummaryAxes{OneLine: fallbackSummary(s.Title, body)} + } + if axes != nil { + // Successful parse — enforce the configured per-axis caps and + // normalise the one-line so older API consumers always see a + // populated summary. + capStrings(&axes.Topics, p.SummaryAxesMaxTopics) + capStrings(&axes.Entities, p.SummaryAxesMaxEntities) + capStrings(&axes.Numbers, p.SummaryAxesMaxNumbers) + if strings.TrimSpace(axes.OneLine) == "" { + axes.OneLine = fallbackSummary(s.Title, body) + } + return axes + } + // Parse failed but the LLM returned something — use the raw text as + // OneLine so a downstream retrieval engine still has signal. Axis + // lists stay empty. + one := strings.TrimSpace(rawText) + if one == "" { + one = fallbackSummary(s.Title, body) + } + return &tree.SummaryAxes{OneLine: one} +} + func fallbackSummary(title, body string) string { body = strings.TrimSpace(body) if body == "" { diff --git a/pkg/ingest/summary_axes.go b/pkg/ingest/summary_axes.go new file mode 100644 index 0000000..8dad5b2 --- /dev/null +++ b/pkg/ingest/summary_axes.go @@ -0,0 +1,181 @@ +package ingest + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "strings" + + "github.com/hallelx2/llmgate" + + "github.com/hallelx2/vectorless-engine/pkg/tree" +) + +// summaryAxesJSONSchema is the JSON-mode schema enforced by the +// Phase 2.5 structured summarizer. All four axes are optional at the +// schema level so a parse-tolerant model that produces a partially +// populated object still validates — we cap and validate downstream. +const summaryAxesJSONSchema = `{ + "type": "object", + "properties": { + "topics": {"type": "array", "items": {"type": "string"}}, + "entities": {"type": "array", "items": {"type": "string"}}, + "numbers": {"type": "array", "items": {"type": "string"}}, + "one_line": {"type": "string"} + }, + "required": ["one_line"] +}` + +// summaryAxesSystemPrompt returns a domain-aware system prompt for the +// Phase 2.5 structured summarizer. Compared to the legacy one-line +// prompt, this one asks for FOUR axes — topics, entities, numbers, +// and a one-line sentence — so the downstream retrieval prompt can +// match user queries on more than just paraphrased prose. The +// per-domain tweaks (research/medical/generic) shift the emphasis of +// what counts as an entity or a number but leave the schema intact. +func summaryAxesSystemPrompt(profile string) string { + const rule = "You are summarising a section of a document for a retrieval engine. The retrieval engine reads your output to decide whether this section answers a user's question, so name concrete topics, entities, identifiers, and key numbers — not generic descriptions. Do NOT invent facts that aren't in the section." + switch strings.ToLower(strings.TrimSpace(profile)) { + case "research": + return "You summarise sections of academic research papers along multiple axes. " + rule + case "medical": + return "You summarise sections of clinical and medical documents along multiple axes. " + rule + default: + return "You summarise sections of business, legal, and financial documents along multiple axes. " + rule + } +} + +// summaryAxesUserSuffix renders the per-axis instructions that follow +// the section's content in the user message. The caps are surfaced +// explicitly so a downstream operator can tune them via config +// without touching the prompt — the prompt itself stays declarative. +func summaryAxesUserSuffix(maxTopics, maxEntities, maxNumbers int) string { + if maxTopics <= 0 { + maxTopics = 4 + } + if maxEntities <= 0 { + maxEntities = 8 + } + if maxNumbers <= 0 { + maxNumbers = 6 + } + return fmt.Sprintf(`Return a JSON object with: +- topics: array of 1-%d topic keywords (lower-case, hyphenated) +- entities: array of 0-%d proper-noun mentions found in the section (orgs, people, places, dates) +- numbers: array of 0-%d standout numeric values WITH UNITS as they appear in the section ("$4.2B", "2.8%%", "Q3 2024") +- one_line: one sentence (≤30 words) describing what the section is about + +If a field has nothing to populate, return an empty array (or empty string for one_line). Return ONLY the JSON object — no prose, no markdown fences.`, maxTopics, maxEntities, maxNumbers) +} + +// defaultSummaryAxesRetries is the number of EXTRA attempts (on top of +// the first) the structured summary call gets when parsing fails. +// Matches the HyDE / retrieval retry contract. +const defaultSummaryAxesRetries = 2 + +// runSummaryAxesWithRetry runs the structured summary LLM call and +// parses the response, retrying up to maxRetries additional times if +// parsing fails. Returns parsed axes on success, the raw response text +// (for OneLine fallback) on final parse failure with no transport +// error, and a non-nil error only for transport-level failures. +// +// ErrNotImplemented (stub LLM) is folded into the error return so +// callers can degrade to the text fallback path. +func runSummaryAxesWithRetry(ctx context.Context, client llmgate.Client, baseReq llmgate.Request, maxRetries int) (*tree.SummaryAxes, string, error) { + if maxRetries < 0 { + maxRetries = 0 + } + var lastRaw string + for attempt := 0; attempt <= maxRetries; attempt++ { + req := baseReq + if attempt > 0 { + msgs := make([]llmgate.Message, len(baseReq.Messages)) + copy(msgs, baseReq.Messages) + tail := len(msgs) - 1 + msgs[tail] = llmgate.Message{ + Role: msgs[tail].Role, + Content: msgs[tail].Content + "\n\nIMPORTANT: respond with ONLY a JSON object matching the schema. Do not include prose, explanation, or markdown fences.", + } + req.Messages = msgs + } + resp, err := client.Complete(ctx, req) + if err != nil { + // ErrNotImplemented bubbles up so the caller can use a + // purely text-based fallback. Transport errors do the same. + return nil, lastRaw, err + } + lastRaw = resp.Content + axes, parseErr := parseSummaryAxes(resp.Content) + if parseErr == nil { + return axes, lastRaw, nil + } + } + // Final parse failure — return the raw text so the caller can + // stash it as OneLine. Not an error: the document is still + // ingestable, just with shallower retrieval signal. + return nil, lastRaw, nil +} + +// parseSummaryAxes extracts a SummaryAxes from an LLM JSON response. +// Tolerates code-fence wrappers and leading/trailing prose, matching +// ParseSelection / parseHyDEResponse. +func parseSummaryAxes(raw string) (*tree.SummaryAxes, error) { + raw = strings.TrimSpace(raw) + if raw == "" { + return nil, errors.New("empty response") + } + if strings.HasPrefix(raw, "```") { + if i := strings.Index(raw, "\n"); i >= 0 { + raw = raw[i+1:] + } + raw = strings.TrimSuffix(raw, "```") + raw = strings.TrimSpace(raw) + } + if i := strings.Index(raw, "{"); i > 0 { + raw = raw[i:] + } + if j := strings.LastIndex(raw, "}"); j >= 0 && j < len(raw)-1 { + raw = raw[:j+1] + } + var out tree.SummaryAxes + if err := json.Unmarshal([]byte(raw), &out); err != nil { + return nil, fmt.Errorf("unmarshal summary_axes: %w", err) + } + out.Topics = trimNonEmpty(out.Topics) + out.Entities = trimNonEmpty(out.Entities) + out.Numbers = trimNonEmpty(out.Numbers) + out.OneLine = strings.TrimSpace(out.OneLine) + return &out, nil +} + +// capStrings truncates *xs to at most max entries in place. A nil or +// non-positive max disables the cap. +func capStrings(xs *[]string, max int) { + if xs == nil || max <= 0 { + return + } + if len(*xs) > max { + *xs = (*xs)[:max] + } +} + +// trimNonEmpty strips empty / whitespace-only entries from a slice. +// Preserves order. Returns nil for an all-empty input so the omitempty +// JSON tag on SummaryAxes drops the field from the persisted blob. +func trimNonEmpty(xs []string) []string { + if len(xs) == 0 { + return nil + } + out := xs[:0] + for _, x := range xs { + x = strings.TrimSpace(x) + if x != "" { + out = append(out, x) + } + } + if len(out) == 0 { + return nil + } + return out +} diff --git a/pkg/ingest/summary_axes_test.go b/pkg/ingest/summary_axes_test.go new file mode 100644 index 0000000..91e0ecd --- /dev/null +++ b/pkg/ingest/summary_axes_test.go @@ -0,0 +1,347 @@ +package ingest + +import ( + "context" + "errors" + "log/slog" + "testing" + + "github.com/hallelx2/llmgate" + + "github.com/hallelx2/vectorless-engine/pkg/db" + "github.com/hallelx2/vectorless-engine/pkg/tree" +) + +// TestSummaryForReturnsAxes asserts the happy path: the LLM returns a +// valid JSON object with the four axes, and the parser surfaces them +// verbatim while normalising the one-line into the structure callers +// downstream rely on. This is the central contract the retrieval +// prompt + the DB persistence layer both lean on. +func TestSummaryForReturnsAxes(t *testing.T) { + t.Parallel() + + const cannedJSON = `{ + "topics": ["debt", "long-term-obligations"], + "entities": ["3M Company", "JPMorgan", "Q3 2024"], + "numbers": ["$4.2B", "2.8%", "2034"], + "one_line": "Long-term debt issued by 3M Company and serviced via JPMorgan." + }` + m := &llmgate.Mock{ + Respond: func(ctx context.Context, req llmgate.Request) (*llmgate.Response, error) { + if !req.JSONMode { + t.Errorf("structured summary call should request JSON mode") + } + return &llmgate.Response{Content: cannedJSON}, nil + }, + } + p := &Pipeline{ + LLM: m, + Logger: slog.Default(), + SummaryMaxChars: 4000, + SummaryModel: "m", + SummaryAxesEnabled: true, + SummaryAxesMaxTopics: 4, + SummaryAxesMaxEntities: 8, + SummaryAxesMaxNumbers: 6, + } + sec := db.Section{ID: tree.SectionID("sec_a"), Title: "Long-Term Debt"} + + // Use a non-nil childLines so summaryFor takes the internal-node + // path and skips the storage fetch — tests build a Pipeline literal + // without Storage. The content seeded here is what the LLM "sees". + childLines := []string{"- Notes: $4.2B of senior unsecured notes due 2034"} + axes, err := p.summaryFor(context.Background(), sec, childLines, "") + if err != nil { + t.Fatalf("summaryFor: %v", err) + } + if axes == nil { + t.Fatal("summaryFor returned nil axes on happy path") + } + if axes.OneLine != "Long-term debt issued by 3M Company and serviced via JPMorgan." { + t.Errorf("OneLine: got %q", axes.OneLine) + } + if want := []string{"debt", "long-term-obligations"}; !sliceEq(axes.Topics, want) { + t.Errorf("Topics: got %v want %v", axes.Topics, want) + } + if want := []string{"3M Company", "JPMorgan", "Q3 2024"}; !sliceEq(axes.Entities, want) { + t.Errorf("Entities: got %v want %v", axes.Entities, want) + } + if want := []string{"$4.2B", "2.8%", "2034"}; !sliceEq(axes.Numbers, want) { + t.Errorf("Numbers: got %v want %v", axes.Numbers, want) + } +} + +// TestSummaryForEnforcesAxisCaps proves the configured per-axis caps +// trim oversized model output. A misbehaving model that returns 50 +// entities must not blow up the prompt budget downstream. +func TestSummaryForEnforcesAxisCaps(t *testing.T) { + t.Parallel() + + const cannedJSON = `{ + "topics": ["a","b","c","d","e","f"], + "entities": ["E1","E2","E3","E4","E5","E6","E7","E8","E9","E10"], + "numbers": ["1","2","3","4","5","6","7","8"], + "one_line": "." + }` + m := &llmgate.Mock{Reply: cannedJSON} + p := &Pipeline{ + LLM: m, + Logger: slog.Default(), + SummaryMaxChars: 4000, + SummaryModel: "m", + SummaryAxesEnabled: true, + SummaryAxesMaxTopics: 2, + SummaryAxesMaxEntities: 3, + SummaryAxesMaxNumbers: 4, + } + axes, err := p.summaryFor(context.Background(), db.Section{ID: "s", Title: "T"}, []string{"- child"}, "") + if err != nil { + t.Fatalf("summaryFor: %v", err) + } + if axes == nil { + t.Fatal("nil axes") + } + if len(axes.Topics) != 2 { + t.Errorf("Topics len = %d, want 2", len(axes.Topics)) + } + if len(axes.Entities) != 3 { + t.Errorf("Entities len = %d, want 3", len(axes.Entities)) + } + if len(axes.Numbers) != 4 { + t.Errorf("Numbers len = %d, want 4", len(axes.Numbers)) + } +} + +// TestSummaryForParseFailureUsesRawAsOneLine: when the model returns +// plain text instead of JSON, the structured path retries (up to +// defaultSummaryAxesRetries), and on persistent failure degrades +// gracefully — the section ends up with summary = raw text and +// summary_axes = nil. Ingest does not fail. +func TestSummaryForParseFailureUsesRawAsOneLine(t *testing.T) { + t.Parallel() + const raw = "Some long prose the model emitted without JSON wrapping." + m := &llmgate.Mock{Reply: raw} + p := &Pipeline{ + LLM: m, + Logger: slog.Default(), + SummaryMaxChars: 4000, + SummaryModel: "m", + SummaryAxesEnabled: true, + } + axes, err := p.summaryFor(context.Background(), db.Section{ID: "s", Title: "T"}, []string{"- child"}, "") + if err != nil { + t.Fatalf("summaryFor: %v", err) + } + if axes == nil { + t.Fatal("summaryFor returned nil on parse failure; expected non-nil with OneLine=raw") + } + if axes.OneLine != raw { + t.Errorf("OneLine: got %q want %q", axes.OneLine, raw) + } + if len(axes.Topics) != 0 || len(axes.Entities) != 0 || len(axes.Numbers) != 0 { + t.Errorf("axes lists should be empty on parse failure, got T=%v E=%v N=%v", + axes.Topics, axes.Entities, axes.Numbers) + } +} + +// TestSummaryForLegacyPathWhenAxesDisabled: with SummaryAxesEnabled=false +// the pipeline must fall back to the pre-2.5 single-sentence prompt +// (no JSON mode) and return axes carrying only OneLine. +func TestSummaryForLegacyPathWhenAxesDisabled(t *testing.T) { + t.Parallel() + var sawJSONMode bool + m := &llmgate.Mock{ + Respond: func(ctx context.Context, req llmgate.Request) (*llmgate.Response, error) { + if req.JSONMode { + sawJSONMode = true + } + return &llmgate.Response{Content: "A concise sentence."}, nil + }, + } + p := &Pipeline{ + LLM: m, + Logger: slog.Default(), + SummaryMaxChars: 4000, + SummaryModel: "m", + SummaryAxesEnabled: false, + } + axes, err := p.summaryFor(context.Background(), db.Section{ID: "s", Title: "T"}, []string{"- child"}, "") + if err != nil { + t.Fatalf("summaryFor: %v", err) + } + if axes == nil { + t.Fatal("summaryFor returned nil axes") + } + if sawJSONMode { + t.Errorf("legacy path must not request JSON mode") + } + if axes.OneLine != "A concise sentence." { + t.Errorf("OneLine = %q", axes.OneLine) + } + if len(axes.Topics) != 0 || len(axes.Entities) != 0 || len(axes.Numbers) != 0 { + t.Errorf("legacy path should leave axis lists empty") + } +} + +// TestSummaryForErrNotImplementedFallback: when the LLM is a stub that +// returns ErrNotImplemented (the unit-test default for many setups), +// the structured path must still return a non-nil axes object — with +// OneLine derived from the fallback text — so ingest never gets a +// nil to fail on. +func TestSummaryForErrNotImplementedFallback(t *testing.T) { + t.Parallel() + m := &llmgate.Mock{ + Respond: func(ctx context.Context, req llmgate.Request) (*llmgate.Response, error) { + return nil, llmgate.ErrNotImplemented + }, + } + p := &Pipeline{ + LLM: m, + Logger: slog.Default(), + SummaryMaxChars: 4000, + SummaryModel: "m", + SummaryAxesEnabled: true, + } + axes, err := p.summaryFor(context.Background(), db.Section{ID: "s", Title: "My Title"}, []string{"- child"}, "") + if err != nil { + t.Fatalf("summaryFor: %v", err) + } + if axes == nil { + t.Fatal("nil axes from ErrNotImplemented") + } + if axes.OneLine == "" { + t.Errorf("OneLine should be non-empty after ErrNotImplemented fallback, got %q", axes.OneLine) + } +} + +// TestSummaryForRetriesParseFailures asserts that a non-JSON response +// is retried per defaultSummaryAxesRetries. We count Complete calls +// — should be 1 (initial) + defaultSummaryAxesRetries = 3 total. +func TestSummaryForRetriesParseFailures(t *testing.T) { + t.Parallel() + var calls int + m := &llmgate.Mock{ + Respond: func(ctx context.Context, req llmgate.Request) (*llmgate.Response, error) { + calls++ + return &llmgate.Response{Content: "still not JSON"}, nil + }, + } + p := &Pipeline{ + LLM: m, + Logger: slog.Default(), + SummaryMaxChars: 4000, + SummaryModel: "m", + SummaryAxesEnabled: true, + } + _, err := p.summaryFor(context.Background(), db.Section{ID: "s", Title: "T"}, []string{"- child"}, "") + if err != nil { + t.Fatalf("summaryFor should degrade gracefully: %v", err) + } + if want := 1 + defaultSummaryAxesRetries; calls != want { + t.Errorf("LLM call count: got %d want %d", calls, want) + } +} + +// TestSummaryForTransportErrorReturnsAxesNotError: a transport-level +// failure (network blip) must still degrade gracefully — ingest is +// fault-tolerant about summarization, so we want a fallback axes +// object back, not a propagated error. +func TestSummaryForTransportErrorReturnsAxesNotError(t *testing.T) { + t.Parallel() + boom := errors.New("network error") + m := &llmgate.Mock{ + Respond: func(ctx context.Context, req llmgate.Request) (*llmgate.Response, error) { + return nil, boom + }, + } + p := &Pipeline{ + LLM: m, + Logger: slog.Default(), + SummaryMaxChars: 4000, + SummaryModel: "m", + SummaryAxesEnabled: true, + } + axes, err := p.summaryFor(context.Background(), db.Section{ID: "s", Title: "T"}, []string{"- child"}, "") + if err != nil { + t.Fatalf("summaryFor should swallow transport errors: %v", err) + } + if axes == nil { + t.Fatal("nil axes from transport error") + } + // OneLine populated via fallback so retrieval still has signal. + if axes.OneLine == "" { + t.Error("OneLine should be non-empty after transport-error fallback") + } +} + +// TestParseSummaryAxesTolerantToCodeFences mirrors the +// ParseSelection / parseHyDEResponse contracts — JSON-mode models +// occasionally wrap responses in ```json fences or prose, and the +// parser must look through both. +func TestParseSummaryAxesTolerantToCodeFences(t *testing.T) { + t.Parallel() + cases := []string{ + `{"one_line":"x"}`, + "```json\n{\"one_line\":\"x\"}\n```", + "Sure! Here you go: {\"one_line\":\"x\"} hope this helps.", + } + for _, c := range cases { + axes, err := parseSummaryAxes(c) + if err != nil { + t.Errorf("parse failed for %q: %v", c, err) + continue + } + if axes == nil || axes.OneLine != "x" { + t.Errorf("parse(%q) = %+v, want OneLine=x", c, axes) + } + } +} + +// TestParseSummaryAxesRejectsGarbage: non-JSON input must error so +// the retry loop fires (rather than silently producing an empty +// SummaryAxes). +func TestParseSummaryAxesRejectsGarbage(t *testing.T) { + t.Parallel() + if _, err := parseSummaryAxes("not json at all"); err == nil { + t.Error("expected parse error for non-JSON input") + } + if _, err := parseSummaryAxes(""); err == nil { + t.Error("expected parse error for empty input") + } +} + +// TestParseSummaryAxesTrimsBlanks: blank/whitespace-only strings in +// the array axes get dropped at parse time so a sloppy model that +// returns ["debt", "", " "] doesn't pollute the retrieval prompt. +func TestParseSummaryAxesTrimsBlanks(t *testing.T) { + t.Parallel() + in := `{"topics":["debt",""," "], "entities":["3M",""], "numbers":[" "], "one_line":" hello "}` + axes, err := parseSummaryAxes(in) + if err != nil { + t.Fatalf("parse: %v", err) + } + if len(axes.Topics) != 1 || axes.Topics[0] != "debt" { + t.Errorf("Topics: got %v", axes.Topics) + } + if len(axes.Entities) != 1 || axes.Entities[0] != "3M" { + t.Errorf("Entities: got %v", axes.Entities) + } + if len(axes.Numbers) != 0 { + t.Errorf("Numbers: should be empty after trim, got %v", axes.Numbers) + } + if axes.OneLine != "hello" { + t.Errorf("OneLine: got %q, want trimmed 'hello'", axes.OneLine) + } +} + +func sliceEq(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} From 9b216ef16b7a42c1e38a23c48c45859ec53aad37 Mon Sep 17 00:00:00 2001 From: Halleluyah Oludele Date: Wed, 27 May 2026 03:26:07 +0100 Subject: [PATCH 3/3] feat(retrieval): surface summary_axes entities + numbers in outline prompt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The retrieval prompt now renders entities and numbers from each section's structured summary on the same outline line as the title/summary, giving the model direct surface-form access to proper-noun and numeric anchors at selection time. Pre-Phase-2.5 sections (axes == nil) skip the new suffix, so un-backfilled documents see exactly the same prompt as before. Outline shape per section: - [sec_a] Long-Term Debt — issued debt securities — entities: 3M Company, JPMorgan — numbers: $4.2B, 2034 - writeSectionLine appends '— entities: ...' / '— numbers: ...' when sv.SummaryAxes is non-nil and the corresponding axis is non-empty. Truncated to the first 3 entries each via firstN to keep per-section prompt cost bounded. - Config block ingest.summary_axes (Enabled, MaxTopics, MaxEntities, MaxNumbers) wires the toggle + caps through to the ingest pipeline. Defaults: enabled=true, 4/8/6. Env overrides: VLE_INGEST_SUMMARY_AXES_{ENABLED,MAX_TOPICS,MAX_ENTITIES,MAX_NUMBERS}. Validate rejects negatives. - Both cmd/server and cmd/engine wire the new config fields into NewPipeline so production binaries pick up the structured path out of the box. - openapi.yaml gains a shared SummaryAxes schema referenced from SectionView, Section, and QuerySection. Marked Omitted on pre-2.5 sections. - config.example.yaml + config.server.example.yaml gain a summary_axes block with inline documentation. Tests cover: axes-bearing tree renders entities + numbers with proper truncation; pre-2.5 tree (no axes anywhere) renders unchanged; config defaults + env overrides + negative-cap validation. --- cmd/engine/main.go | 24 ++++--- cmd/server/main.go | 24 ++++--- config.example.yaml | 20 ++++++ config.server.example.yaml | 11 +++ openapi.yaml | 38 ++++++++++ pkg/config/config.go | 76 ++++++++++++++++++++ pkg/config/config_test.go | 94 +++++++++++++++++++++++++ pkg/retrieval/retrieval_test.go | 119 ++++++++++++++++++++++++++++++++ pkg/retrieval/single_pass.go | 38 ++++++++++ 9 files changed, 424 insertions(+), 20 deletions(-) diff --git a/cmd/engine/main.go b/cmd/engine/main.go index 5398a31..9b504bd 100644 --- a/cmd/engine/main.go +++ b/cmd/engine/main.go @@ -169,16 +169,20 @@ func run() error { } pipeline := ingest.NewPipeline(ingest.Pipeline{ - DB: pool, - Storage: store, - LLM: llmClient, - Parsers: ingest.RegistryFromTableOpts(tableOptsFromConfig(cfg.Ingest.Tables)), - Logger: logger, - HyDEEnabled: cfg.Ingest.HyDE.Enabled, - HyDEModel: cfg.Ingest.HyDE.Model, - HyDENumQuestions: cfg.Ingest.HyDE.NumQuestions, - HyDEConcurrency: cfg.Ingest.HyDE.Concurrency, - GlobalLLMConcurrency: cfg.Ingest.GlobalLLMConcurrency, + DB: pool, + Storage: store, + LLM: llmClient, + Parsers: ingest.RegistryFromTableOpts(tableOptsFromConfig(cfg.Ingest.Tables)), + Logger: logger, + HyDEEnabled: cfg.Ingest.HyDE.Enabled, + HyDEModel: cfg.Ingest.HyDE.Model, + HyDENumQuestions: cfg.Ingest.HyDE.NumQuestions, + HyDEConcurrency: cfg.Ingest.HyDE.Concurrency, + SummaryAxesEnabled: cfg.Ingest.SummaryAxes.Enabled, + SummaryAxesMaxTopics: cfg.Ingest.SummaryAxes.MaxTopics, + SummaryAxesMaxEntities: cfg.Ingest.SummaryAxes.MaxEntities, + SummaryAxesMaxNumbers: cfg.Ingest.SummaryAxes.MaxNumbers, + GlobalLLMConcurrency: cfg.Ingest.GlobalLLMConcurrency, }) if cfg.Ingest.Tables.Enabled { logger.Info("ingest: pdf table extraction enabled", diff --git a/cmd/server/main.go b/cmd/server/main.go index 2ac2f61..83714db 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -156,16 +156,20 @@ func run() error { // ── Ingest pipeline ─────────────────────────────────────────── pipeline := ingest.NewPipeline(ingest.Pipeline{ - DB: pool, - Storage: store, - LLM: llmClient, - Parsers: ingest.RegistryFromTableOpts(tableOptsFromConfig(cfg.Engine.Ingest.Tables)), - Logger: logger, - HyDEEnabled: cfg.Engine.Ingest.HyDE.Enabled, - HyDEModel: cfg.Engine.Ingest.HyDE.Model, - HyDENumQuestions: cfg.Engine.Ingest.HyDE.NumQuestions, - HyDEConcurrency: cfg.Engine.Ingest.HyDE.Concurrency, - GlobalLLMConcurrency: cfg.Engine.Ingest.GlobalLLMConcurrency, + DB: pool, + Storage: store, + LLM: llmClient, + Parsers: ingest.RegistryFromTableOpts(tableOptsFromConfig(cfg.Engine.Ingest.Tables)), + Logger: logger, + HyDEEnabled: cfg.Engine.Ingest.HyDE.Enabled, + HyDEModel: cfg.Engine.Ingest.HyDE.Model, + HyDENumQuestions: cfg.Engine.Ingest.HyDE.NumQuestions, + HyDEConcurrency: cfg.Engine.Ingest.HyDE.Concurrency, + SummaryAxesEnabled: cfg.Engine.Ingest.SummaryAxes.Enabled, + SummaryAxesMaxTopics: cfg.Engine.Ingest.SummaryAxes.MaxTopics, + SummaryAxesMaxEntities: cfg.Engine.Ingest.SummaryAxes.MaxEntities, + SummaryAxesMaxNumbers: cfg.Engine.Ingest.SummaryAxes.MaxNumbers, + GlobalLLMConcurrency: cfg.Engine.Ingest.GlobalLLMConcurrency, }) if cfg.Engine.Ingest.Tables.Enabled { logger.Info("ingest: pdf table extraction enabled", diff --git a/config.example.yaml b/config.example.yaml index ee1b9c4..c5e9010 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -249,6 +249,26 @@ ingest: min_table_rows: 2 min_table_cols: 2 + # Multi-axis structured summaries (Phase 2.5). When enabled, the + # summarize stage runs in JSON mode and produces {topics, entities, + # numbers, one_line} per section instead of a single sentence. The + # structured blob lives in sections.summary_axes; the one_line still + # populates sections.summary so older API consumers keep working + # unchanged. The retrieval prompt surfaces entities + numbers from + # the axes block on the section line so the model has direct + # surface-form access to proper-noun and numeric anchors. + # + # ENABLED BY DEFAULT. Flip to false to roll back to the pre-2.5 + # single-sentence path without redeploying the binary. + summary_axes: + enabled: true + # Per-axis caps prevent a misbehaving model from blowing up the + # retrieval prompt budget. Values below are the defaults; tune + # only if the model returns systematically truncated output. + max_topics: 4 + max_entities: 8 + max_numbers: 6 + log: level: "info" # debug | info | warn | error format: "json" # json | console diff --git a/config.server.example.yaml b/config.server.example.yaml index 1bbc4e9..76bc29c 100644 --- a/config.server.example.yaml +++ b/config.server.example.yaml @@ -113,6 +113,17 @@ engine: num_questions: 5 concurrency: 4 + # Multi-axis structured summaries (Phase 2.5). JSON-mode summarizer + # returns {topics, entities, numbers, one_line}. The retrieval + # prompt surfaces entities + numbers on the section line; the + # one_line continues to populate the flat `summary` field for + # backward compatibility. + summary_axes: + enabled: true + max_topics: 4 + max_entities: 8 + max_numbers: 6 + log: level: "info" # "debug", "info", "warn", "error" format: "json" # "json" or "console" diff --git a/openapi.yaml b/openapi.yaml index a4c82ef..7bc35c1 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -458,6 +458,8 @@ components: type: string summary: type: string + summary_axes: + $ref: "#/components/schemas/SummaryAxes" children: type: array items: @@ -482,6 +484,8 @@ components: type: string summary: type: string + summary_axes: + $ref: "#/components/schemas/SummaryAxes" token_count: type: integer page_start: @@ -594,6 +598,8 @@ components: type: string summary: type: string + summary_axes: + $ref: "#/components/schemas/SummaryAxes" token_count: type: integer page_start: @@ -638,6 +644,38 @@ components: text: type: string + SummaryAxes: + type: object + description: | + Phase 2.5 multi-axis structured summary. The summarizer + produces this object alongside the flat `summary` field; + `summary` continues to carry the one-line sentence (which is + the same string as `one_line` here) so older API consumers + keep working. Omitted on sections written before the Phase + 2.5 schema migration; will arrive on every newly-ingested + section. Each list axis is also independently optional — + sections without proper-noun mentions or numeric values + leave the corresponding array empty. + properties: + topics: + type: array + items: + type: string + description: Lower-case, hyphenated topic keywords (e.g. `debt`, `long-term-obligations`). + entities: + type: array + items: + type: string + description: Proper-noun mentions extracted verbatim from the section (orgs, people, places, dates). + numbers: + type: array + items: + type: string + description: Standout numeric values with units as they appear (e.g. `$4.2B`, `2.8%`, `Q3 2024`). + one_line: + type: string + description: Human-readable sentence describing the section. Mirrors the flat `summary` field. + AnswerRequest: type: object required: [document_id, query] diff --git a/pkg/config/config.go b/pkg/config/config.go index a640319..a57de66 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -41,6 +41,13 @@ type IngestConfig struct { // hides in a balance sheet that text-only extraction collapses. Tables TablesConfig `yaml:"tables"` + // SummaryAxes configures the Phase 2.5 multi-axis summarizer. + // Enabled by default — the structured shape unlocks + // entity / numeric matching at retrieval time without changing the + // existing `summary` field's contract (axes.one_line continues to + // populate it). + SummaryAxes SummaryAxesBlock `yaml:"summary_axes"` + // GlobalLLMConcurrency caps the total number of LLM calls in flight // across the summarize and HyDE stages combined, which now run // concurrently. Each stage still respects its own per-stage cap @@ -53,6 +60,33 @@ type IngestConfig struct { GlobalLLMConcurrency int `yaml:"global_llm_concurrency"` } +// SummaryAxesBlock configures the Phase 2.5 structured summarizer. +// +// When enabled, the summarize stage runs in JSON mode and produces +// {topics, entities, numbers, one_line} per section instead of a +// single sentence. The structured blob is persisted in +// sections.summary_axes (JSONB); the one_line continues to land in +// sections.summary so older API consumers keep working. +// +// Disable to roll back to the pre-2.5 single-sentence behaviour +// without un-deploying the binary — useful for A/B comparisons or as +// a fast off-switch if a real-world document triggers a regression. +type SummaryAxesBlock struct { + // Enabled toggles the structured path. Default: true. + Enabled bool `yaml:"enabled"` + + // MaxTopics caps the topics axis the summarizer returns per + // section. Default: 4. A misbehaving model that returns 50 topics + // can't push past this cap; the prompt-budget impact stays bounded. + MaxTopics int `yaml:"max_topics"` + + // MaxEntities caps the entities axis. Default: 8. + MaxEntities int `yaml:"max_entities"` + + // MaxNumbers caps the numbers axis. Default: 6. + MaxNumbers int `yaml:"max_numbers"` +} + // TablesConfig configures the table-extraction stage of the PDF parser. // The stage runs pdftable's geometry-based finder over every page and // emits each detected table as its own Section with @@ -504,6 +538,12 @@ func Default() Config { MinTableRows: 2, MinTableCols: 2, }, + SummaryAxes: SummaryAxesBlock{ + Enabled: true, + MaxTopics: 4, + MaxEntities: 8, + MaxNumbers: 6, + }, }, Log: LogConfig{Level: "info", Format: "json"}, } @@ -661,6 +701,32 @@ func applyEnvOverrides(c *Config) { c.Ingest.Tables.MinTableCols = n } } + // Phase 2.5 structured-summary knobs. Booleans accept the same + // truthy strings the other ingest toggles use; numeric overrides + // require a positive int (a typo silently preserves the default). + if v := os.Getenv("VLE_INGEST_SUMMARY_AXES_ENABLED"); v != "" { + switch strings.ToLower(strings.TrimSpace(v)) { + case "1", "true", "yes", "on": + c.Ingest.SummaryAxes.Enabled = true + case "0", "false", "no", "off": + c.Ingest.SummaryAxes.Enabled = false + } + } + if v := os.Getenv("VLE_INGEST_SUMMARY_AXES_MAX_TOPICS"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n > 0 { + c.Ingest.SummaryAxes.MaxTopics = n + } + } + if v := os.Getenv("VLE_INGEST_SUMMARY_AXES_MAX_ENTITIES"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n > 0 { + c.Ingest.SummaryAxes.MaxEntities = n + } + } + if v := os.Getenv("VLE_INGEST_SUMMARY_AXES_MAX_NUMBERS"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n > 0 { + c.Ingest.SummaryAxes.MaxNumbers = n + } + } if v := os.Getenv("VLE_RETRIEVAL_ANSWER_SPAN_ENABLED"); v != "" { switch strings.ToLower(strings.TrimSpace(v)) { case "1", "true", "yes", "on": @@ -849,6 +915,16 @@ func (c Config) Validate() error { return fmt.Errorf("ingest.tables.min_table_cols must be >= 0, got %d", c.Ingest.Tables.MinTableCols) } + if c.Ingest.SummaryAxes.MaxTopics < 0 { + return fmt.Errorf("ingest.summary_axes.max_topics must be >= 0, got %d", c.Ingest.SummaryAxes.MaxTopics) + } + if c.Ingest.SummaryAxes.MaxEntities < 0 { + return fmt.Errorf("ingest.summary_axes.max_entities must be >= 0, got %d", c.Ingest.SummaryAxes.MaxEntities) + } + if c.Ingest.SummaryAxes.MaxNumbers < 0 { + return fmt.Errorf("ingest.summary_axes.max_numbers must be >= 0, got %d", c.Ingest.SummaryAxes.MaxNumbers) + } + if c.Retrieval.Planning.CacheSize < 0 { return fmt.Errorf("retrieval.planning.cache_size must be >= 0, got %d", c.Retrieval.Planning.CacheSize) } diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index f71ad41..64a2574 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -682,3 +682,97 @@ func TestTablesValidateRejectsBadStrategy(t *testing.T) { t.Error("expected error for negative min_table_rows") } } + +// TestSummaryAxesDefaults locks the Phase 2.5 defaults: structured +// summaries opt-in by default, with the caps the spec calls for. The +// retrieval prompt downstream relies on these caps to keep the prompt +// budget bounded per section. +func TestSummaryAxesDefaults(t *testing.T) { + t.Parallel() + cfg := Default() + if !cfg.Ingest.SummaryAxes.Enabled { + t.Error("ingest.summary_axes.enabled should default to true (opt-out)") + } + if cfg.Ingest.SummaryAxes.MaxTopics != 4 { + t.Errorf("max_topics = %d, want 4", cfg.Ingest.SummaryAxes.MaxTopics) + } + if cfg.Ingest.SummaryAxes.MaxEntities != 8 { + t.Errorf("max_entities = %d, want 8", cfg.Ingest.SummaryAxes.MaxEntities) + } + if cfg.Ingest.SummaryAxes.MaxNumbers != 6 { + t.Errorf("max_numbers = %d, want 6", cfg.Ingest.SummaryAxes.MaxNumbers) + } +} + +// TestSummaryAxesEnvOverride covers the opt-out path: env disables the +// structured summarizer, and the numeric caps re-tune via env. +func TestSummaryAxesEnvOverride(t *testing.T) { + // Mutates env — restore on exit. Not parallel. + prevEnabled := os.Getenv("VLE_INGEST_SUMMARY_AXES_ENABLED") + prevTopics := os.Getenv("VLE_INGEST_SUMMARY_AXES_MAX_TOPICS") + prevEntities := os.Getenv("VLE_INGEST_SUMMARY_AXES_MAX_ENTITIES") + prevNumbers := os.Getenv("VLE_INGEST_SUMMARY_AXES_MAX_NUMBERS") + defer func() { + os.Setenv("VLE_INGEST_SUMMARY_AXES_ENABLED", prevEnabled) + os.Setenv("VLE_INGEST_SUMMARY_AXES_MAX_TOPICS", prevTopics) + os.Setenv("VLE_INGEST_SUMMARY_AXES_MAX_ENTITIES", prevEntities) + os.Setenv("VLE_INGEST_SUMMARY_AXES_MAX_NUMBERS", prevNumbers) + }() + + os.Setenv("VLE_INGEST_SUMMARY_AXES_ENABLED", "false") + os.Setenv("VLE_INGEST_SUMMARY_AXES_MAX_TOPICS", "10") + os.Setenv("VLE_INGEST_SUMMARY_AXES_MAX_ENTITIES", "20") + os.Setenv("VLE_INGEST_SUMMARY_AXES_MAX_NUMBERS", "15") + + cfg := Default() + applyEnvOverrides(&cfg) + if cfg.Ingest.SummaryAxes.Enabled { + t.Error("VLE_INGEST_SUMMARY_AXES_ENABLED=false should disable") + } + if cfg.Ingest.SummaryAxes.MaxTopics != 10 { + t.Errorf("max_topics env override: got %d, want 10", cfg.Ingest.SummaryAxes.MaxTopics) + } + if cfg.Ingest.SummaryAxes.MaxEntities != 20 { + t.Errorf("max_entities env override: got %d, want 20", cfg.Ingest.SummaryAxes.MaxEntities) + } + if cfg.Ingest.SummaryAxes.MaxNumbers != 15 { + t.Errorf("max_numbers env override: got %d, want 15", cfg.Ingest.SummaryAxes.MaxNumbers) + } +} + +// TestSummaryAxesEnvOverrideRejectsBad: garbage values preserve the +// default rather than zeroing the cap (which would silently fail to +// trim model output). +func TestSummaryAxesEnvOverrideRejectsBad(t *testing.T) { + prevTopics := os.Getenv("VLE_INGEST_SUMMARY_AXES_MAX_TOPICS") + defer os.Setenv("VLE_INGEST_SUMMARY_AXES_MAX_TOPICS", prevTopics) + os.Setenv("VLE_INGEST_SUMMARY_AXES_MAX_TOPICS", "not-a-number") + cfg := Default() + applyEnvOverrides(&cfg) + if cfg.Ingest.SummaryAxes.MaxTopics != 4 { + t.Errorf("garbled env should preserve default 4, got %d", cfg.Ingest.SummaryAxes.MaxTopics) + } +} + +// TestSummaryAxesValidateNegatives: negative caps fail validation so a +// typo in the YAML doesn't silently disable trimming. +func TestSummaryAxesValidateNegatives(t *testing.T) { + t.Parallel() + for _, tc := range []struct { + name string + fn func(*Config) + }{ + {"topics", func(c *Config) { c.Ingest.SummaryAxes.MaxTopics = -1 }}, + {"entities", func(c *Config) { c.Ingest.SummaryAxes.MaxEntities = -1 }}, + {"numbers", func(c *Config) { c.Ingest.SummaryAxes.MaxNumbers = -1 }}, + } { + t.Run(tc.name, func(t *testing.T) { + cfg := Default() + cfg.Database.URL = "postgres://localhost/test" + tc.fn(&cfg) + if err := cfg.Validate(); err == nil { + t.Errorf("negative %s should fail validation", tc.name) + } + }) + } +} diff --git a/pkg/retrieval/retrieval_test.go b/pkg/retrieval/retrieval_test.go index 9888c4b..37abf8f 100644 --- a/pkg/retrieval/retrieval_test.go +++ b/pkg/retrieval/retrieval_test.go @@ -368,6 +368,125 @@ func TestChunkedTreeStampsTraceToken(t *testing.T) { } } +// buildTreeWithAxes returns a tree where sec_a carries a multi-axis +// summary. Used to assert the retrieval prompt surfaces entities + +// numbers on the section line. +func buildTreeWithAxes() *tree.Tree { + root := &tree.Section{ + ID: "sec_root", Title: "Atlas", + Children: []*tree.Section{ + { + ID: "sec_a", ParentID: "sec_root", Title: "Long-Term Debt", + Summary: "issued debt securities, repayment schedules", + SummaryAxes: &tree.SummaryAxes{ + Topics: []string{"debt", "long-term-obligations"}, + Entities: []string{"3M Company", "JPMorgan", "BofA", "Wells Fargo"}, + Numbers: []string{"$4.2B", "2034", "2.8%", "2027"}, + OneLine: "issued debt securities, repayment schedules", + }, + }, + { + ID: "sec_b", ParentID: "sec_root", Title: "Revenue", + Summary: "fiscal-year-over-year revenue", + // Empty axes pointer — axes block exists but has no + // entities/numbers. Tests the non-rendering branch. + SummaryAxes: &tree.SummaryAxes{OneLine: "fiscal-year-over-year revenue"}, + }, + {ID: "sec_c", ParentID: "sec_root", Title: "FAQ", Summary: "common questions"}, + }, + } + return &tree.Tree{DocumentID: "doc_x", Title: "Atlas", Root: root} +} + +// TestSelectionPromptSurfacesAxes is the Phase 2.5 retrieval-prompt +// contract: when a section carries a SummaryAxes, the outline line +// must render entities and numbers (truncated to first 3 each) so the +// retrieval model has direct surface-form access to proper-noun and +// numeric anchors. +func TestSelectionPromptSurfacesAxes(t *testing.T) { + tr := buildTreeWithAxes() + m := &mockLLM{pickIfPresent: []tree.SectionID{"sec_a"}} + s := retrieval.NewSinglePass(m) + + _, err := s.Select(context.Background(), tr, "how much debt?", + retrieval.ContextBudget{MaxTokens: 1000}) + if err != nil { + t.Fatalf("select: %v", err) + } + m.mu.Lock() + prompts := append([]string(nil), m.lastPrompts...) + m.mu.Unlock() + if len(prompts) == 0 { + t.Fatal("no prompts captured") + } + prompt := prompts[0] + + // Entities + numbers appear with their "— entities: " / "— numbers: " + // prefixes on sec_a's line. + if !strings.Contains(prompt, "entities: ") { + t.Errorf("prompt missing entities prefix:\n%s", prompt) + } + if !strings.Contains(prompt, "numbers: ") { + t.Errorf("prompt missing numbers prefix:\n%s", prompt) + } + if !strings.Contains(prompt, "3M Company") { + t.Errorf("prompt missing first entity:\n%s", prompt) + } + if !strings.Contains(prompt, "$4.2B") { + t.Errorf("prompt missing first number:\n%s", prompt) + } + // Truncation: only first 3 entities / numbers are rendered. The + // 4th of each must NOT appear. + if strings.Contains(prompt, "Wells Fargo") { + t.Errorf("entities should be truncated to first 3, 4th leaked:\n%s", prompt) + } + if strings.Contains(prompt, "2027") { + t.Errorf("numbers should be truncated to first 3, 4th leaked:\n%s", prompt) + } + + // sec_b has an axes object but empty lists — entities/numbers + // labels must NOT appear on sec_b's line. We assert by checking + // that "Revenue — entities" does not appear (Revenue is sec_b's + // title). + if strings.Contains(prompt, "Revenue — entities") || strings.Contains(prompt, "Revenue — numbers") { + t.Errorf("sec_b has empty axis lists; should not render entities/numbers:\n%s", prompt) + } + + // sec_c has no axes at all (nil pointer) — the rendering must + // skip cleanly. Same check on sec_c's title. + if strings.Contains(prompt, "FAQ — entities") || strings.Contains(prompt, "FAQ — numbers") { + t.Errorf("sec_c has no axes; should not render axes block:\n%s", prompt) + } +} + +// TestSelectionPromptOmitsAxesForOldSections guards the +// backwards-compatibility contract: sections written before Phase 2.5 +// have axes==nil, and the retrieval prompt must continue to render +// exactly as it did before this PR. The pre-axes prompt shape is +// `- [id] Title — summary` with no further suffixes; we assert by +// making sure the entities/numbers labels are absent. +func TestSelectionPromptOmitsAxesForOldSections(t *testing.T) { + tr := buildTree() // pre-Phase-2.5 tree, no SummaryAxes anywhere + m := &mockLLM{pickIfPresent: []tree.SectionID{"sec_b"}} + s := retrieval.NewSinglePass(m) + + _, err := s.Select(context.Background(), tr, "q", + retrieval.ContextBudget{MaxTokens: 1000}) + if err != nil { + t.Fatalf("select: %v", err) + } + m.mu.Lock() + prompts := append([]string(nil), m.lastPrompts...) + m.mu.Unlock() + if len(prompts) == 0 { + t.Fatal("no prompts captured") + } + prompt := prompts[0] + if strings.Contains(prompt, "entities: ") || strings.Contains(prompt, "numbers: ") { + t.Errorf("axes labels must not appear when no section has axes:\n%s", prompt) + } +} + // TestTraceTokenMatchesExternalComputation ties the strategy output to // the canonical ComputeTraceToken helper, so any drift between the // helper and the per-strategy plumbing is caught at test time. diff --git a/pkg/retrieval/single_pass.go b/pkg/retrieval/single_pass.go index 95ab61c..e7601f5 100644 --- a/pkg/retrieval/single_pass.go +++ b/pkg/retrieval/single_pass.go @@ -190,6 +190,22 @@ func writeSectionLine(b *strings.Builder, sv tree.SectionView) { b.WriteString(" — ") b.WriteString(sv.Summary) } + // Phase 2.5: append entities + numbers from the structured axes + // when present. They live on the SAME outline line as the summary, + // truncated to the first 3 each, so the retrieval prompt sees the + // section's proper-noun + numeric surface without doubling the + // per-section budget. Older sections (axes==nil) skip this block, + // so un-backfilled docs see the prompt unchanged. + if sv.SummaryAxes != nil { + if ents := firstN(sv.SummaryAxes.Entities, 3); len(ents) > 0 { + b.WriteString(" — entities: ") + b.WriteString(strings.Join(ents, ", ")) + } + if nums := firstN(sv.SummaryAxes.Numbers, 3); len(nums) > 0 { + b.WriteString(" — numbers: ") + b.WriteString(strings.Join(nums, ", ")) + } + } b.WriteByte('\n') // HyDE: surface the first candidate question (truncated) as an // "answers:" hint. Keeps the prompt budget impact small (~120 chars @@ -205,6 +221,28 @@ func writeSectionLine(b *strings.Builder, sv tree.SectionView) { } } +// firstN returns up to n non-empty trimmed entries from xs in order. +// Used by writeSectionLine to cap entities / numbers per section so a +// section whose axes carry 30 entities doesn't blow up the retrieval +// prompt's token budget. +func firstN(xs []string, n int) []string { + if n <= 0 || len(xs) == 0 { + return nil + } + out := make([]string, 0, n) + for _, x := range xs { + x = strings.TrimSpace(x) + if x == "" { + continue + } + out = append(out, x) + if len(out) >= n { + break + } + } + return out +} + // firstCandidateQuestion returns the first non-empty candidate question, // truncated to ~120 chars so the outline doesn't blow up. Returns "" // when no usable question is present.