From b016bc59b85725f6d540e21301bb4d5480add2bb Mon Sep 17 00:00:00 2001 From: Halleluyah Oludele Date: Wed, 27 May 2026 03:08:30 +0100 Subject: [PATCH 1/3] feat(retrieval): per-pick confidence scores from selection LLM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the selection JSON schema to accept either the legacy {selected_section_ids: [...]} shape or the new {picks: [{id, confidence}]} shape with per-pick confidence in [0.0, 1.0]. ParseSelection returns (ids, confidences, err); legacy responses surface confidences=nil so callers can distinguish "no confidence signal" from "all confidences low". Each strategy plumbs the confidence map through: - SinglePass fills Result.Confidences from the parsed map, filtered against the post-FilterKnownIDs survivors. - ChunkedTree unions per-slice confidence maps (max-wins on duplicate IDs across overlapping slices) and filters to the merged ID set. - Agentic accepts both done-shape variants. The new picks shape surfaces per-pick confidences on the final Result. Result.SelectedIDs stays []tree.SectionID — the change to Result is purely additive. Callers that only consume Result and don't care about confidence see no API change; direct callers of the exported ParseSelection must adopt the new three-value return (ids, confidences, err). The strategy never abstains; the API layer's abstention check (later in this series) is the only place "all confidences below threshold" becomes an abstention response. Tests cover: new-shape parse, legacy-shape parse, mixed-shape parse (some picks with confidence, some without), confidence clamping, duplicate-pick dedup, per-strategy fill, chunked-tree merge, and the agentic done-with-picks path. 
--- pkg/retrieval/agentic.go | 81 +++++++++++-- pkg/retrieval/agentic_test.go | 60 ++++++++++ pkg/retrieval/chunked_tree.go | 40 +++++-- pkg/retrieval/retrieval_test.go | 205 +++++++++++++++++++++++++++++++- pkg/retrieval/single_pass.go | 171 ++++++++++++++++++++++---- pkg/retrieval/strategy.go | 17 ++- 6 files changed, 529 insertions(+), 45 deletions(-) diff --git a/pkg/retrieval/agentic.go b/pkg/retrieval/agentic.go index 41c49dd..ea1d367 100644 --- a/pkg/retrieval/agentic.go +++ b/pkg/retrieval/agentic.go @@ -125,10 +125,11 @@ func (a *AgenticStrategy) SelectWithCost(ctx context.Context, t *tree.Tree, quer } var ( - totalUsage Usage - hopsTaken int - finalIDs []tree.SectionID - reasoning string + totalUsage Usage + hopsTaken int + finalIDs []tree.SectionID + finalConfidences map[tree.SectionID]float64 + reasoning string ) for hop := 0; hop < maxHops; hop++ { @@ -176,10 +177,11 @@ func (a *AgenticStrategy) SelectWithCost(ctx context.Context, t *tree.Tree, quer switch action.Action { case actionDone: - finalIDs = filterToTreeIDs(action.PickedIDs, bySectionID) + finalIDs, finalConfidences = collectDonePicks(action, bySectionID) reasoning = action.Reasoning return &Result{ SelectedIDs: finalIDs, + Confidences: filterConfidences(finalConfidences, finalIDs), Reasoning: reasoning, ModelUsed: model, Usage: totalUsage, @@ -240,6 +242,7 @@ func (a *AgenticStrategy) SelectWithCost(ctx context.Context, t *tree.Tree, quer log.Printf("retrieval: agentic strategy hit max_hops=%d without done; returning %d ids", maxHops, len(finalIDs)) return &Result{ SelectedIDs: finalIDs, + Confidences: filterConfidences(finalConfidences, finalIDs), Reasoning: reasoning, ModelUsed: model, Usage: totalUsage, @@ -248,6 +251,48 @@ func (a *AgenticStrategy) SelectWithCost(ctx context.Context, t *tree.Tree, quer }, nil } +// collectDonePicks extracts the final IDs and per-pick confidences +// from a 'done' action. 
It honours the new Picks shape first; falls +// back to the legacy PickedIDs list when Picks is empty. Both shapes +// are filtered to the known tree IDs so a model can never inject an +// invented section into the result. +func collectDonePicks(action Action, bySectionID map[tree.SectionID]tree.SectionView) ([]tree.SectionID, map[tree.SectionID]float64) { + if len(action.Picks) > 0 { + ids := make([]tree.SectionID, 0, len(action.Picks)) + confidences := make(map[tree.SectionID]float64, len(action.Picks)) + seen := make(map[tree.SectionID]struct{}, len(action.Picks)) + for _, pk := range action.Picks { + sid := tree.SectionID(strings.TrimSpace(pk.ID)) + if sid == "" { + continue + } + if _, ok := bySectionID[sid]; !ok { + continue + } + if _, dup := seen[sid]; dup { + continue + } + seen[sid] = struct{}{} + ids = append(ids, sid) + if pk.Confidence != nil { + c := *pk.Confidence + if c < 0 { + c = 0 + } else if c > 1 { + c = 1 + } + confidences[sid] = c + } + } + if len(confidences) == 0 { + confidences = nil + } + return ids, confidences + } + // Legacy shape — no confidence signal. + return filterToTreeIDs(action.PickedIDs, bySectionID), nil +} + // initialUserPrompt is the very first user turn: it explains the task, // renders a shallow outline (default level=1) so the model has // something to react to, and reminds the model of the action protocol. @@ -316,15 +361,27 @@ Rules: - Prefer leaf sections. Include a parent only if its own body is directly relevant. - Include as few sections as possible. Quality over quantity. - Only return IDs you have seen in a prior observation. Do not invent IDs. -- If nothing in the document is relevant, return done with an empty picked_ids array.` +- When you finalise with 'done', attach a confidence score in [0.0, 1.0] + to every pick. Confidence reflects how directly the section answers + the query: 1.0 = near-certain, 0.0 = no signal. Use the full range — + do NOT score every pick at 1.0. 
If you genuinely cannot reason about + confidence, you may fall back to the legacy picked_ids array form. +- If nothing in the document is relevant, return done with an empty + picks (or picked_ids) array.` // actionProtocolHelp is the one-shot reminder appended to the initial // user prompt so the model gets concrete examples of valid actions // without us needing to maintain a separate few-shot block. +// +// The 'done' action accepts EITHER picked_ids (legacy, no +// confidence) OR picks (preferred, per-id confidence). Both shapes +// are parsed by ParseAction. The confidence-bearing shape unlocks +// abstention at the API layer. const actionProtocolHelp = `- {"action":"outline","level":2} — re-render the outline N levels deep - {"action":"expand","section_id":"sec_x"} — list immediate children of sec_x - {"action":"read","section_id":"sec_x"} — fetch the full body of sec_x -- {"action":"done","picked_ids":["sec_x","sec_y"],"reasoning":"why"} — finalize +- {"action":"done","picks":[{"id":"sec_x","confidence":0.8}],"reasoning":"why"} — finalize with per-pick confidence in [0.0, 1.0] +- {"action":"done","picked_ids":["sec_x","sec_y"],"reasoning":"why"} — legacy fallback when you cannot reason about confidence Reply with ONLY the JSON object. No prose, no markdown fences.` @@ -346,9 +403,17 @@ type Action struct { // SectionID is the target of expand and read actions. SectionID string `json:"section_id,omitempty"` - // PickedIDs is the final selection for a done action. + // PickedIDs is the legacy-shape final selection for a done action + // (no per-pick confidence). Either this or Picks may be set; if + // both are present, Picks wins. PickedIDs []string `json:"picked_ids,omitempty"` + // Picks is the preferred final selection for a done action: each + // entry carries an ID + optional confidence in [0.0, 1.0]. When + // the model populates this, the strategy surfaces per-pick + // confidences on the returned Result. 
+ Picks []selectionPick `json:"picks,omitempty"` + // Reasoning is an optional explanation accompanying done. Reasoning string `json:"reasoning,omitempty"` } diff --git a/pkg/retrieval/agentic_test.go b/pkg/retrieval/agentic_test.go index b5226a1..50346ca 100644 --- a/pkg/retrieval/agentic_test.go +++ b/pkg/retrieval/agentic_test.go @@ -232,6 +232,66 @@ func TestAgenticBadJSONGraceful(t *testing.T) { } } +// TestAgenticDoneWithConfidences exercises the Phase 2.4 new-shape +// done action: each pick carries a confidence in [0.0, 1.0] and the +// resulting Result.Confidences map mirrors the picks. The strategy +// itself never abstains; the API layer alone does that. +func TestAgenticDoneWithConfidences(t *testing.T) { + t.Parallel() + + tr := buildAgenticTree() + llm := &scriptedLLM{ + replies: []string{ + `{"action":"done","picks":[{"id":"sec_a1","confidence":0.85},{"id":"sec_b1","confidence":0.42}],"reasoning":"two-section answer"}`, + }, + } + s := retrieval.NewAgentic(llm, mapFetcher{data: map[string]string{}}) + + res, err := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{MaxTokens: 100000}) + if err != nil { + t.Fatal(err) + } + if len(res.SelectedIDs) != 2 { + t.Fatalf("want 2 picks, got %v", res.SelectedIDs) + } + if res.Confidences == nil { + t.Fatal("Confidences must be populated when model returns picks") + } + if res.Confidences["sec_a1"] != 0.85 { + t.Errorf("sec_a1 = %v, want 0.85", res.Confidences["sec_a1"]) + } + if res.Confidences["sec_b1"] != 0.42 { + t.Errorf("sec_b1 = %v, want 0.42", res.Confidences["sec_b1"]) + } +} + +// TestAgenticDoneLegacyShapeNoConfidences confirms the legacy +// picked_ids shape continues to work — Confidences must stay nil so +// the API layer treats this as "no confidence signal" and does not +// fire abstention. 
+func TestAgenticDoneLegacyShapeNoConfidences(t *testing.T) { + t.Parallel() + + tr := buildAgenticTree() + llm := &scriptedLLM{ + replies: []string{ + `{"action":"done","picked_ids":["sec_a1","sec_b1"],"reasoning":"legacy"}`, + }, + } + s := retrieval.NewAgentic(llm, mapFetcher{data: map[string]string{}}) + + res, err := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{MaxTokens: 100000}) + if err != nil { + t.Fatal(err) + } + if len(res.SelectedIDs) != 2 { + t.Fatalf("want 2 IDs, got %v", res.SelectedIDs) + } + if res.Confidences != nil { + t.Errorf("legacy picked_ids must NOT populate Confidences, got %v", res.Confidences) + } +} + // TestAgenticFiltersUnknownPicks mirrors single-pass: if the model // invents IDs not present in the tree, they must be dropped. func TestAgenticFiltersUnknownPicks(t *testing.T) { diff --git a/pkg/retrieval/chunked_tree.go b/pkg/retrieval/chunked_tree.go index e7f9f09..7fcd98b 100644 --- a/pkg/retrieval/chunked_tree.go +++ b/pkg/retrieval/chunked_tree.go @@ -71,8 +71,9 @@ func (c *ChunkedTree) SelectWithCost(ctx context.Context, t *tree.Tree, query st sem := make(chan struct{}, maxPar) type sliceResult struct { - ids []tree.SectionID - usage Usage + ids []tree.SectionID + confidences map[tree.SectionID]float64 + usage Usage } results := make([]sliceResult, len(slices)) @@ -89,12 +90,12 @@ func (c *ChunkedTree) SelectWithCost(ctx context.Context, t *tree.Tree, query st return gctx.Err() } - ids, usage, err := c.reasonOverSliceWithCost(gctx, sl, query, budget) + ids, confidences, usage, err := c.reasonOverSliceWithCost(gctx, sl, query, budget) if err != nil { return err } mu.Lock() - results[i] = sliceResult{ids: ids, usage: usage} + results[i] = sliceResult{ids: ids, confidences: confidences, usage: usage} mu.Unlock() return nil }) @@ -107,14 +108,30 @@ func (c *ChunkedTree) SelectWithCost(ctx context.Context, t *tree.Tree, query st // Merge IDs and aggregate costs. 
allIDs := make([][]tree.SectionID, len(results)) var totalUsage Usage + // Union the per-slice confidence maps. When two slices both score + // the same ID (rare but possible if the splitter overlaps), we + // keep the higher confidence — the more confident slice has + // better signal about that section. + var mergedConfidences map[tree.SectionID]float64 for i, r := range results { allIDs[i] = r.ids totalUsage.Add(r.usage) + if len(r.confidences) > 0 { + if mergedConfidences == nil { + mergedConfidences = make(map[tree.SectionID]float64, len(r.confidences)) + } + for id, conf := range r.confidences { + if existing, ok := mergedConfidences[id]; !ok || conf > existing { + mergedConfidences[id] = conf + } + } + } } selected := c.Merge.Merge(allIDs) return &Result{ SelectedIDs: selected, + Confidences: filterConfidences(mergedConfidences, selected), Usage: totalUsage, HopsTaken: 1, TraceToken: ComputeTraceToken(t.DocumentID, traceDocVersionV1, budget.ModelName, selected), @@ -125,12 +142,14 @@ func (c *ChunkedTree) SelectWithCost(ctx context.Context, t *tree.Tree, query st // model picked, filtered against sl.Sections so a model can never fabricate // an ID that lives in a different slice. func (c *ChunkedTree) reasonOverSlice(ctx context.Context, sl Slice, query string, budget ContextBudget) ([]tree.SectionID, error) { - ids, _, err := c.reasonOverSliceWithCost(ctx, sl, query, budget) + ids, _, _, err := c.reasonOverSliceWithCost(ctx, sl, query, budget) return ids, err } -// reasonOverSliceWithCost is like reasonOverSlice but also returns usage. -func (c *ChunkedTree) reasonOverSliceWithCost(ctx context.Context, sl Slice, query string, budget ContextBudget) ([]tree.SectionID, Usage, error) { +// reasonOverSliceWithCost is like reasonOverSlice but also returns the +// per-pick confidence map (nil when the model returned the legacy +// response shape) and the usage spent on the call. 
+func (c *ChunkedTree) reasonOverSliceWithCost(ctx context.Context, sl Slice, query string, budget ContextBudget) ([]tree.SectionID, map[tree.SectionID]float64, Usage, error) { prompt := BuildSelectionPrompt(sl.Breadcrumb, sl.Sections, sl.SiblingSummaries, query) req := llmgate.Request{ @@ -145,11 +164,12 @@ func (c *ChunkedTree) reasonOverSliceWithCost(ctx context.Context, sl Slice, que JSONSchema: []byte(selectionJSONSchema), } - ids, usage, err := runSelectionWithRetry(ctx, c.LLM, req, defaultSelectionRetries) + ids, confidences, usage, err := runSelectionWithRetry(ctx, c.LLM, req, defaultSelectionRetries) if err != nil { - return nil, usage, err + return nil, nil, usage, err } - return FilterKnownIDs(ids, sl.Sections), usage, nil + filtered := FilterKnownIDs(ids, sl.Sections) + return filtered, filterConfidences(confidences, filtered), usage, nil } // MergePolicy determines how per-slice ID lists are combined into a single diff --git a/pkg/retrieval/retrieval_test.go b/pkg/retrieval/retrieval_test.go index 9888c4b..f9538f3 100644 --- a/pkg/retrieval/retrieval_test.go +++ b/pkg/retrieval/retrieval_test.go @@ -181,10 +181,13 @@ func TestParseSelection(t *testing.T) { } for _, c := range cases { t.Run(c.name, func(t *testing.T) { - got, err := retrieval.ParseSelection(c.in) + got, confidences, err := retrieval.ParseSelection(c.in) if err != nil { t.Fatal(err) } + if confidences != nil { + t.Errorf("legacy-shape input must not populate confidences, got %v", confidences) + } if len(got) != len(c.want) { t.Fatalf("len: got %v want %v", got, c.want) } @@ -197,6 +200,109 @@ func TestParseSelection(t *testing.T) { } } +// TestParseSelectionNewShape exercises the Phase 2.4 picks shape: +// each pick carries an id + confidence, the parser returns both the +// id list and a confidence map. 
+func TestParseSelectionNewShape(t *testing.T) { + raw := `{"picks":[{"id":"sec_a","confidence":0.82},{"id":"sec_b","confidence":0.31}],"reasoning":"x"}` + ids, confidences, err := retrieval.ParseSelection(raw) + if err != nil { + t.Fatalf("parse: %v", err) + } + if len(ids) != 2 || ids[0] != "sec_a" || ids[1] != "sec_b" { + t.Fatalf("ids: got %v want [sec_a sec_b]", ids) + } + if confidences == nil { + t.Fatal("confidences must be populated for new-shape response") + } + if got := confidences["sec_a"]; got != 0.82 { + t.Errorf("sec_a confidence = %v, want 0.82", got) + } + if got := confidences["sec_b"]; got != 0.31 { + t.Errorf("sec_b confidence = %v, want 0.31", got) + } +} + +// TestParseSelectionMixedShape covers a partially-populated new-shape +// response: some picks have confidence, others don't. The confidence +// map only surfaces IDs whose confidence was actually present — +// missing entries are NOT defaulted to 0 (which would force +// abstention) or to 1 (which would suppress it). +func TestParseSelectionMixedShape(t *testing.T) { + raw := `{"picks":[{"id":"sec_a","confidence":0.9},{"id":"sec_b"},{"id":"sec_c","confidence":0.4}]}` + ids, confidences, err := retrieval.ParseSelection(raw) + if err != nil { + t.Fatalf("parse: %v", err) + } + if len(ids) != 3 { + t.Fatalf("ids: got %v, want 3 picks", ids) + } + if _, present := confidences["sec_a"]; !present { + t.Error("sec_a should have confidence") + } + if _, present := confidences["sec_b"]; present { + t.Error("sec_b should NOT have confidence (model omitted it)") + } + if _, present := confidences["sec_c"]; !present { + t.Error("sec_c should have confidence") + } + if confidences["sec_a"] != 0.9 || confidences["sec_c"] != 0.4 { + t.Errorf("confidences = %v", confidences) + } +} + +// TestParseSelectionClampsConfidence asserts confidences outside +// [0.0, 1.0] are clamped — defence-in-depth against a model that +// returns 1.5 or -0.2 despite the prompt's range. 
+func TestParseSelectionClampsConfidence(t *testing.T) { + raw := `{"picks":[{"id":"sec_a","confidence":1.7},{"id":"sec_b","confidence":-0.3}]}` + _, confidences, err := retrieval.ParseSelection(raw) + if err != nil { + t.Fatalf("parse: %v", err) + } + if confidences["sec_a"] != 1.0 { + t.Errorf("sec_a clamped: want 1.0, got %v", confidences["sec_a"]) + } + if confidences["sec_b"] != 0.0 { + t.Errorf("sec_b clamped: want 0.0, got %v", confidences["sec_b"]) + } +} + +// TestParseSelectionPicksDedup ensures duplicate IDs in `picks` are +// deduplicated (first-seen wins) so the strategy doesn't double-count +// a section the model accidentally listed twice. +func TestParseSelectionPicksDedup(t *testing.T) { + raw := `{"picks":[{"id":"sec_a","confidence":0.7},{"id":"sec_a","confidence":0.2}]}` + ids, confidences, err := retrieval.ParseSelection(raw) + if err != nil { + t.Fatalf("parse: %v", err) + } + if len(ids) != 1 || ids[0] != "sec_a" { + t.Fatalf("ids: got %v want [sec_a]", ids) + } + if confidences["sec_a"] != 0.7 { + t.Errorf("first-seen confidence should win: got %v want 0.7", confidences["sec_a"]) + } +} + +// TestParseSelectionNewShapeNoConfidences covers a new-shape response +// where the model returned `picks` but stamped no confidence values +// at all — must be treated as legacy (nil confidences) so the API +// layer does NOT abstain on a confidence signal that isn't there. 
+func TestParseSelectionNewShapeNoConfidences(t *testing.T) { + raw := `{"picks":[{"id":"sec_a"},{"id":"sec_b"}]}` + ids, confidences, err := retrieval.ParseSelection(raw) + if err != nil { + t.Fatalf("parse: %v", err) + } + if len(ids) != 2 { + t.Fatalf("ids: got %v, want 2", ids) + } + if confidences != nil { + t.Errorf("missing confidences must surface as nil map, got %v", confidences) + } +} + func TestChunkedTreeSinglesliceWhenItFits(t *testing.T) { tr := buildTree() m := &mockLLM{pickIfPresent: []tree.SectionID{"sec_a", "sec_b"}} @@ -317,6 +423,103 @@ func TestDefaultSplitterFastPath(t *testing.T) { } } +// TestSinglePassReturnsConfidences asserts that a new-shape LLM +// response with confidence scores surfaces a populated Confidences +// map on the strategy's Result. The strategy itself never abstains — +// even when every confidence is below the typical 0.4 threshold the +// IDs still come back and the API layer decides what to do. +func TestSinglePassReturnsConfidences(t *testing.T) { + tr := buildTree() + m := &mockLLM{reply: `{"picks":[{"id":"sec_a","confidence":0.78},{"id":"sec_b","confidence":0.12}],"reasoning":"x"}`} + s := retrieval.NewSinglePass(m) + + res, err := s.SelectWithCost(context.Background(), tr, "q", + retrieval.ContextBudget{ModelName: "model", MaxTokens: 1000}) + if err != nil { + t.Fatalf("select: %v", err) + } + if len(res.SelectedIDs) != 2 { + t.Fatalf("want 2 IDs, got %v", res.SelectedIDs) + } + if res.Confidences == nil { + t.Fatal("Confidences should be populated for new-shape response") + } + if got := res.Confidences["sec_a"]; got != 0.78 { + t.Errorf("sec_a confidence = %v, want 0.78", got) + } + if got := res.Confidences["sec_b"]; got != 0.12 { + t.Errorf("sec_b confidence = %v, want 0.12", got) + } +} + +// TestSinglePassAllLowConfidencesStillReturnsIDs is the abstention +// smoke contract from the spec: the strategy itself never abstains. +// Even when every confidence is below 0.4 the IDs come back. 
The +// API layer is the only place that may convert "all low" into an +// abstention. +func TestSinglePassAllLowConfidencesStillReturnsIDs(t *testing.T) { + tr := buildTree() + m := &mockLLM{reply: `{"picks":[{"id":"sec_a","confidence":0.1},{"id":"sec_b","confidence":0.2}]}`} + s := retrieval.NewSinglePass(m) + + res, err := s.SelectWithCost(context.Background(), tr, "q", + retrieval.ContextBudget{ModelName: "model", MaxTokens: 1000}) + if err != nil { + t.Fatalf("select: %v", err) + } + if len(res.SelectedIDs) != 2 { + t.Fatalf("strategy must return IDs even with low confidences, got %v", res.SelectedIDs) + } + if len(res.Confidences) != 2 { + t.Errorf("Confidences should mirror SelectedIDs, got %v", res.Confidences) + } +} + +// TestSinglePassLegacyShapeNoConfidences confirms that the legacy +// response shape continues to work after the new-shape refactor. +// Critically, Confidences stays nil so the API layer does not abstain. +func TestSinglePassLegacyShapeNoConfidences(t *testing.T) { + tr := buildTree() + m := &mockLLM{reply: `{"selected_section_ids":["sec_a","sec_b"],"reasoning":"x"}`} + s := retrieval.NewSinglePass(m) + + res, err := s.SelectWithCost(context.Background(), tr, "q", + retrieval.ContextBudget{ModelName: "model", MaxTokens: 1000}) + if err != nil { + t.Fatalf("select: %v", err) + } + if len(res.SelectedIDs) != 2 { + t.Fatalf("legacy response shape must still work, got %v", res.SelectedIDs) + } + if res.Confidences != nil { + t.Errorf("legacy response must NOT populate Confidences, got %v", res.Confidences) + } +} + +// TestChunkedTreeMergesConfidences verifies the chunked-tree strategy +// surfaces confidences in the merged Result. Because the test tree +// is small enough to fit in one slice, this is effectively a single +// slice union — but the field still has to round-trip through the +// per-slice plumbing. 
+func TestChunkedTreeMergesConfidences(t *testing.T) { + tr := buildTree() + m := &mockLLM{reply: `{"picks":[{"id":"sec_a","confidence":0.6},{"id":"sec_c","confidence":0.9}]}`} + s := retrieval.NewChunkedTree(m) + + res, err := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{ + ModelName: "model", MaxTokens: 100000, MaxParallelCalls: 4, + }) + if err != nil { + t.Fatal(err) + } + if len(res.Confidences) != 2 { + t.Fatalf("Confidences should carry both picks, got %v", res.Confidences) + } + if res.Confidences["sec_a"] != 0.6 || res.Confidences["sec_c"] != 0.9 { + t.Errorf("confidences = %v", res.Confidences) + } +} + // TestSinglePassStampsTraceToken verifies that SelectWithCost // populates a 64-char hex TraceToken on the returned Result. func TestSinglePassStampsTraceToken(t *testing.T) { diff --git a/pkg/retrieval/single_pass.go b/pkg/retrieval/single_pass.go index 95ab61c..83814cd 100644 --- a/pkg/retrieval/single_pass.go +++ b/pkg/retrieval/single_pass.go @@ -60,14 +60,16 @@ func (s *SinglePass) SelectWithCost(ctx context.Context, t *tree.Tree, query str JSONSchema: []byte(selectionJSONSchema), } - ids, usage, err := runSelectionWithRetry(ctx, s.LLM, req, defaultSelectionRetries) + ids, confidences, usage, err := runSelectionWithRetry(ctx, s.LLM, req, defaultSelectionRetries) if err != nil { return nil, fmt.Errorf("single-pass llm call: %w", err) } selected := FilterKnownIDs(ids, view.Sections) + filteredConfidences := filterConfidences(confidences, selected) return &Result{ SelectedIDs: selected, + Confidences: filteredConfidences, ModelUsed: model, Usage: usage, HopsTaken: 1, @@ -75,6 +77,27 @@ func (s *SinglePass) SelectWithCost(ctx context.Context, t *tree.Tree, query str }, nil } +// filterConfidences keeps only entries whose key appears in keep, so a +// strategy never surfaces a confidence for an ID it didn't ultimately +// select (post-filter / post-merge). 
Returns nil when src is nil or +// empty after filtering — preserving the "no confidence signal" +// distinction the API layer relies on for abstention. +func filterConfidences(src map[tree.SectionID]float64, keep []tree.SectionID) map[tree.SectionID]float64 { + if len(src) == 0 { + return nil + } + out := make(map[tree.SectionID]float64, len(keep)) + for _, id := range keep { + if v, ok := src[id]; ok { + out[id] = v + } + } + if len(out) == 0 { + return nil + } + return out +} + // traceDocVersionV1 is the placeholder document version used by every // strategy until Phase 3.2 wires real per-document versioning. Defined // once so the bump is a one-line change. @@ -88,11 +111,13 @@ const defaultSelectionRetries = 2 // runSelectionWithRetry runs a selection LLM call and parses the response, // retrying up to maxRetries additional times if the model returns something -// that doesn't parse as JSON. Returns the parsed IDs and the cumulative usage -// across all attempts. An error is returned only on a transport/LLM failure — -// final parse failure degrades gracefully to an empty selection (logged) so a -// single LLM-formatting blip doesn't 500 the entire query. -func runSelectionWithRetry(ctx context.Context, client llmgate.Client, baseReq llmgate.Request, maxRetries int) ([]tree.SectionID, Usage, error) { +// that doesn't parse as JSON. Returns the parsed IDs, per-ID confidences +// (nil when the model returned the legacy shape without confidence), and +// the cumulative usage across all attempts. An error is returned only on a +// transport/LLM failure — final parse failure degrades gracefully to an +// empty selection (logged) so a single LLM-formatting blip doesn't 500 +// the entire query. 
+func runSelectionWithRetry(ctx context.Context, client llmgate.Client, baseReq llmgate.Request, maxRetries int) ([]tree.SectionID, map[tree.SectionID]float64, Usage, error) { if maxRetries < 0 { maxRetries = 0 } @@ -114,7 +139,7 @@ func runSelectionWithRetry(ctx context.Context, client llmgate.Client, baseReq l } resp, err := client.Complete(ctx, req) if err != nil { - return nil, totalUsage, err + return nil, nil, totalUsage, err } totalUsage.Add(Usage{ InputTokens: resp.Usage.InputTokens, @@ -123,14 +148,14 @@ func runSelectionWithRetry(ctx context.Context, client llmgate.Client, baseReq l CostUSD: resp.Usage.CostUSD, LLMCalls: 1, }) - ids, parseErr := ParseSelection(resp.Content) + ids, confidences, parseErr := ParseSelection(resp.Content) if parseErr == nil { - return ids, totalUsage, nil + return ids, confidences, totalUsage, nil } lastParseErr = parseErr } log.Printf("retrieval: selection parse failed after %d attempts (%v); returning empty selection", maxRetries+1, lastParseErr) - return nil, totalUsage, nil + return nil, nil, totalUsage, nil } // --- shared prompt scaffolding --- @@ -141,15 +166,36 @@ Rules: - Prefer leaf sections. Include a parent only if the parent's own body is directly relevant. - Include as few sections as possible. Quality over quantity. - Only return IDs present in the provided outline. Do not invent IDs. -- If nothing is relevant, return an empty list.` +- If nothing is relevant, return an empty list. +- Attach a confidence score in [0.0, 1.0] to every pick reflecting how + likely that section's body answers the query. Use the full range — + do NOT score every pick at 1.0. 0.0 means "no signal", 1.0 means + "near-certain". If you cannot reason about confidence at all, omit + the picks array and return the legacy selected_section_ids form + instead; the engine accepts both shapes.` +// selectionJSONSchema is intentionally permissive: it accepts EITHER the +// legacy { selected_section_ids: [...] 
} shape OR the new +// { picks: [{id, confidence}] } shape so older / weaker models that +// can't reason about confidence still work. ParseSelection accepts +// both and returns confidences when present. const selectionJSONSchema = `{ "type": "object", "properties": { + "picks": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": {"type": "string"}, + "confidence": {"type": "number", "minimum": 0, "maximum": 1} + }, + "required": ["id"] + } + }, "selected_section_ids": {"type": "array", "items": {"type": "string"}}, "reasoning": {"type": "string"} - }, - "required": ["selected_section_ids"] + } }` // BuildSelectionPrompt renders the user-side prompt for a selection call. @@ -174,7 +220,11 @@ func BuildSelectionPrompt(breadcrumb string, sections []tree.SectionView, siblin } b.WriteString("\nUser query:\n") b.WriteString(query) - b.WriteString("\n\nReturn a JSON object with fields `selected_section_ids` (array of strings) and `reasoning` (string).") + b.WriteString("\n\nReturn a JSON object. Preferred shape:\n") + b.WriteString(` {"picks": [{"id": "sec_x", "confidence": 0.82}, ...], "reasoning": "..."}` + "\n") + b.WriteString("confidence is a float in [0.0, 1.0] reflecting how likely the section's body answers the query. Use the full range; do not score every pick at 1.0.\n") + b.WriteString("Fallback shape (use ONLY if you cannot reason about confidence):\n") + b.WriteString(` {"selected_section_ids": ["sec_x", ...], "reasoning": "..."}`) return b.String() } @@ -229,18 +279,50 @@ func firstCandidateQuestion(qs []string) string { return "" } -// selectionPayload is the expected JSON-mode shape. +// selectionPick is one entry in the new-shape selection response. The +// `Confidence` field is a pointer so we can distinguish "model +// returned 0.0" from "model omitted the field" — the latter means +// "no signal for this pick" and skips the abstention check. 
+type selectionPick struct { + ID string `json:"id"` + Confidence *float64 `json:"confidence,omitempty"` +} + +// selectionPayload accepts both response shapes: +// +// - New shape (preferred): {"picks": [{"id": "...", "confidence": 0.8}], ...} +// - Legacy shape: {"selected_section_ids": ["..."], ...} +// +// When `Picks` is non-empty it wins; otherwise `SelectedSectionIDs` +// is used. This keeps backward compatibility with older models that +// can't reason about confidence (or with the legacy schema enforced +// by some provider integrations). type selectionPayload struct { - SelectedSectionIDs []string `json:"selected_section_ids"` - Reasoning string `json:"reasoning"` + Picks []selectionPick `json:"picks"` + SelectedSectionIDs []string `json:"selected_section_ids"` + Reasoning string `json:"reasoning"` } -// ParseSelection extracts the section-ID list from an LLM JSON response. -// Tolerates code-fence wrappers and leading/trailing prose. -func ParseSelection(raw string) ([]tree.SectionID, error) { +// ParseSelection extracts the section-ID list and (when present) per-ID +// confidence scores from an LLM JSON response. Tolerates code-fence +// wrappers and leading/trailing prose. +// +// Returns: +// +// - ids: the section IDs the model picked, in the order the +// model returned them. +// - confidences: map[id]float64 of per-pick confidences in [0.0, 1.0], +// populated only when the model returned the new-shape +// `picks` array. Returns nil (not an empty map) when +// the response was the legacy shape OR when every pick +// omitted its confidence — the distinction matters for +// abstention, which fires only when confidence signal +// is explicitly present. +// - err: non-nil only when the JSON cannot be decoded at all. +func ParseSelection(raw string) ([]tree.SectionID, map[tree.SectionID]float64, error) { raw = strings.TrimSpace(raw) if raw == "" { - return nil, nil + return nil, nil, nil } // Strip ```json ... ``` fences if present. 
if strings.HasPrefix(raw, "```") { @@ -260,8 +342,51 @@ func ParseSelection(raw string) ([]tree.SectionID, error) { var p selectionPayload if err := json.Unmarshal([]byte(raw), &p); err != nil { - return nil, fmt.Errorf("unmarshal selection: %w", err) + return nil, nil, fmt.Errorf("unmarshal selection: %w", err) + } + + // New shape wins. Even a single populated `picks` entry means the + // model attempted to follow the confidence protocol, so we honour + // it. Mixed responses (some picks with confidence, some without) + // surface only the present confidences — the missing ones are + // silently dropped from the confidence map, NOT defaulted to 0. + if len(p.Picks) > 0 { + ids := make([]tree.SectionID, 0, len(p.Picks)) + confidences := make(map[tree.SectionID]float64, len(p.Picks)) + seen := make(map[tree.SectionID]struct{}, len(p.Picks)) + for _, pk := range p.Picks { + id := strings.TrimSpace(pk.ID) + if id == "" { + continue + } + sid := tree.SectionID(id) + if _, dup := seen[sid]; dup { + continue + } + seen[sid] = struct{}{} + ids = append(ids, sid) + if pk.Confidence != nil { + c := *pk.Confidence + // Clamp into [0, 1]. The model is instructed to stay + // in range; clamping is a defence-in-depth so a + // runaway value never poisons the abstention check. + if c < 0 { + c = 0 + } else if c > 1 { + c = 1 + } + confidences[sid] = c + } + } + if len(confidences) == 0 { + // New-shape response but no confidences populated → treat + // as legacy for abstention purposes. + confidences = nil + } + return ids, confidences, nil } + + // Legacy shape. 
out := make([]tree.SectionID, 0, len(p.SelectedSectionIDs)) for _, id := range p.SelectedSectionIDs { id = strings.TrimSpace(id) @@ -269,7 +394,7 @@ func ParseSelection(raw string) ([]tree.SectionID, error) { out = append(out, tree.SectionID(id)) } } - return out, nil + return out, nil, nil } // FilterKnownIDs drops any IDs not present in the supplied section views and diff --git a/pkg/retrieval/strategy.go b/pkg/retrieval/strategy.go index 3edae3c..8f428f6 100644 --- a/pkg/retrieval/strategy.go +++ b/pkg/retrieval/strategy.go @@ -61,9 +61,20 @@ func (b ContextBudget) Available() int { // reasoning trace and cost accounting when the strategy supports it. type Result struct { SelectedIDs []tree.SectionID `json:"selected_ids"` - Reasoning string `json:"reasoning,omitempty"` - ModelUsed string `json:"model_used,omitempty"` - Usage Usage `json:"usage"` + + // Confidences carries per-pick relevance confidence in [0.0, 1.0] + // when the selection LLM returned the new-shape response with + // explicit confidence scores. Keys are restricted to IDs present in + // SelectedIDs (post-filter / post-merge). Nil when no confidence + // signal was present — either the legacy response shape was used or + // the model did not populate any confidence value. The API layer's + // abstention check fires only when this map is non-empty (see + // internal/api.handleQuery / handleAnswer). + Confidences map[tree.SectionID]float64 `json:"confidences,omitempty"` + + Reasoning string `json:"reasoning,omitempty"` + ModelUsed string `json:"model_used,omitempty"` + Usage Usage `json:"usage"` // HopsTaken is the number of LLM turns the strategy issued to reach the // final selection. 
Single-shot strategies set this to 1; iterative From 5d0a5f7f72da7697950089cc32c426bc0b529554 Mon Sep 17 00:00:00 2001 From: Halleluyah Oludele Date: Wed, 27 May 2026 03:10:37 +0100 Subject: [PATCH 2/3] feat(config): retrieval.abstain block + VLE_RETRIEVAL_ABSTAIN_* env overrides AbstainBlock carries Enabled + Below (the [0.0, 1.0] confidence threshold below which picks count as "not confident"). When the selection LLM returns explicit per-pick confidence and EVERY pick falls below Below, the API layer surfaces an abstention response instead of pretending the document held an answer. Defaults: Enabled=true (opt-out), Below=0.4. Env overrides: VLE_RETRIEVAL_ABSTAIN_ENABLED (truthy/falsy), VLE_RETRIEVAL_ABSTAIN_BELOW (float in [0,1]). Validation rejects out-of-range Below values; bad env strings preserve the default rather than zeroing the field. Tests cover defaults, env overrides (enable/disable/parse), edge cases (0.0, 1.0 inclusive), bad-input rejection, and validation. --- config.example.yaml | 28 ++++++++++ pkg/config/config.go | 57 +++++++++++++++++++ pkg/config/config_test.go | 115 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 200 insertions(+) diff --git a/config.example.yaml b/config.example.yaml index ee1b9c4..8b344c9 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -182,6 +182,34 @@ retrieval: # re-rank pass to do the final selection. top_k: 0 + # abstain: Phase 2.4 abstention. When the selection LLM returns + # per-pick confidence scores (the new picks shape) and every + # confidence falls below `below`, /v1/query and /v1/answer skip the + # normal path and return an abstention response instead: + # {abstained: true, abstention_reason: "...", sections: [], + # min_confidence_threshold: 0.4, candidate_confidences: {...}} + # For /v1/answer the synthesis call is skipped entirely; the answer + # is the honest "I cannot answer this question from the supplied + # document." 
This trades a likely hallucination for a clear refusal + # when the engine's own confidence is weak. + # + # OPT-OUT. Default enabled. Per-request `enable_abstain` body field + # overrides this block. When the selection LLM returns the legacy + # shape (no confidence scores) the engine never abstains regardless + # of this setting — abstention requires explicit confidence signal. + # + # The check is "all picks below threshold". If any pick scored + # above, the engine surfaces that section as evidence — abstention + # is reserved for the case where every candidate is weak. + abstain: + enabled: true + # Confidence threshold in [0.0, 1.0]. Picks with confidence + # strictly less than this are "not confident"; when ALL picks + # fall below, the response is an abstention. 0.4 is the default + # — high enough to filter weak matches, low enough not to + # suppress legitimate partial answers. + below: 0.4 + # replay: Phase 3.1 reproducibility store. Every /v1/query and # /v1/answer response carries a deterministic `trace_token`; the # response body is stored in an in-memory LRU under that token so diff --git a/pkg/config/config.go b/pkg/config/config.go index a640319..3a8dca0 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -254,6 +254,42 @@ type RetrievalConfig struct { Planning PlanningBlock `yaml:"planning"` ReRank ReRankBlock `yaml:"rerank"` Replay ReplayBlock `yaml:"replay"` + Abstain AbstainBlock `yaml:"abstain"` +} + +// AbstainBlock configures the Phase 2.4 abstention behaviour. +// +// When the selection LLM returns per-pick confidence scores and every +// confidence is below Below, the API layer (handleQuery / +// handleAnswer) replaces the normal response with an abstention: +// sections is empty and abstained=true. This refuses to ground an +// answer in evidence the model itself isn't confident is relevant, +// converting a likely hallucination into an honest "I don't know". 
+// +// Abstention fires only when explicit confidence signal is present. +// Legacy-shape responses (no confidences) always fall through to the +// normal path — the engine never abstains on the absence of signal. +// +// Per-request override: callers may set `enable_abstain` on the +// /v1/query or /v1/answer body to turn abstention on or off for +// one request without restarting the server. When set, the +// per-request flag wins over Enabled in either direction; Enabled +// decides only for requests that omit it. +type AbstainBlock struct { + // Enabled toggles abstention at the server level. Default: true + // (opt-out). + Enabled bool `yaml:"enabled"` + + // Below is the confidence threshold. Picks with confidence + // strictly less than Below are "not confident"; when ALL picks + // fall below this threshold the response is an abstention. + // Default: 0.4. + // + // The "all" semantics (vs "any") is deliberate: if even one + // section scored at or above the threshold, the engine has enough + // signal to surface it as evidence. Abstention is reserved for + // the case where every candidate is weak. + Below float64 `yaml:"below"` } // ReplayBlock configures the Phase 3.1 replay-trace store.
@@ -489,6 +525,10 @@ func Default() Config { MaxEntries: 1024, TTLSeconds: 86400, }, + Abstain: AbstainBlock{ + Enabled: true, + Below: 0.4, + }, }, Ingest: IngestConfig{ GlobalLLMConcurrency: 12, @@ -748,6 +788,19 @@ func applyEnvOverrides(c *Config) { c.Retrieval.Replay.TTLSeconds = n } } + if v := os.Getenv("VLE_RETRIEVAL_ABSTAIN_ENABLED"); v != "" { + switch strings.ToLower(strings.TrimSpace(v)) { + case "1", "true", "yes", "on": + c.Retrieval.Abstain.Enabled = true + case "0", "false", "no", "off": + c.Retrieval.Abstain.Enabled = false + } + } + if v := os.Getenv("VLE_RETRIEVAL_ABSTAIN_BELOW"); v != "" { + if f, err := strconv.ParseFloat(strings.TrimSpace(v), 64); err == nil && f >= 0 && f <= 1 { + c.Retrieval.Abstain.Below = f + } + } } // Validate checks that required fields for the selected drivers are set. @@ -867,5 +920,9 @@ func (c Config) Validate() error { return fmt.Errorf("retrieval.replay.ttl_seconds must be >= 0, got %d", c.Retrieval.Replay.TTLSeconds) } + if c.Retrieval.Abstain.Below < 0 || c.Retrieval.Abstain.Below > 1 { + return fmt.Errorf("retrieval.abstain.below must be in [0.0, 1.0], got %v", c.Retrieval.Abstain.Below) + } + return nil } diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index f71ad41..2a5c212 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -64,11 +64,126 @@ func TestDefaultValues(t *testing.T) { if cfg.Retrieval.Replay.TTLSeconds != 86400 { t.Errorf("retrieval.replay.ttl_seconds = %d, want 86400 (24h)", cfg.Retrieval.Replay.TTLSeconds) } + if !cfg.Retrieval.Abstain.Enabled { + t.Error("retrieval.abstain.enabled should default to true (opt-out)") + } + if cfg.Retrieval.Abstain.Below != 0.4 { + t.Errorf("retrieval.abstain.below = %v, want 0.4", cfg.Retrieval.Abstain.Below) + } if cfg.Log.Level != "info" { t.Errorf("log.level = %q, want info", cfg.Log.Level) } } +func TestAbstainEnvOverride(t *testing.T) { + // Mutates env — restore on exit. Not parallel. 
+ prevEnabled := os.Getenv("VLE_RETRIEVAL_ABSTAIN_ENABLED") + prevBelow := os.Getenv("VLE_RETRIEVAL_ABSTAIN_BELOW") + defer func() { + os.Setenv("VLE_RETRIEVAL_ABSTAIN_ENABLED", prevEnabled) + os.Setenv("VLE_RETRIEVAL_ABSTAIN_BELOW", prevBelow) + }() + + os.Setenv("VLE_RETRIEVAL_ABSTAIN_ENABLED", "false") + os.Setenv("VLE_RETRIEVAL_ABSTAIN_BELOW", "0.6") + + cfg := Default() + applyEnvOverrides(&cfg) + + if cfg.Retrieval.Abstain.Enabled { + t.Error("VLE_RETRIEVAL_ABSTAIN_ENABLED=false should disable abstention") + } + if cfg.Retrieval.Abstain.Below != 0.6 { + t.Errorf("VLE_RETRIEVAL_ABSTAIN_BELOW=0.6 not applied, got %v", cfg.Retrieval.Abstain.Below) + } +} + +func TestAbstainEnvOverrideEnable(t *testing.T) { + // Toggle on via env from an explicitly-disabled starting state. + prev := os.Getenv("VLE_RETRIEVAL_ABSTAIN_ENABLED") + defer os.Setenv("VLE_RETRIEVAL_ABSTAIN_ENABLED", prev) + + cfg := Default() + cfg.Retrieval.Abstain.Enabled = false + os.Setenv("VLE_RETRIEVAL_ABSTAIN_ENABLED", "true") + applyEnvOverrides(&cfg) + if !cfg.Retrieval.Abstain.Enabled { + t.Error("VLE_RETRIEVAL_ABSTAIN_ENABLED=true should enable abstention even when previously disabled") + } +} + +// TestAbstainEnvOverrideRejectsBad asserts a garbage float and an +// out-of-range value both preserve the default rather than silently +// zeroing or accepting a value that would break the abstention check +// (Below must be in [0,1]). 
+func TestAbstainEnvOverrideRejectsBad(t *testing.T) { + prev := os.Getenv("VLE_RETRIEVAL_ABSTAIN_BELOW") + defer os.Setenv("VLE_RETRIEVAL_ABSTAIN_BELOW", prev) + + cases := []string{"not-a-float", "1.5", "-0.1", "abc"} + for _, v := range cases { + os.Setenv("VLE_RETRIEVAL_ABSTAIN_BELOW", v) + cfg := Default() + applyEnvOverrides(&cfg) + if cfg.Retrieval.Abstain.Below != 0.4 { + t.Errorf("bad ABSTAIN_BELOW=%q should preserve default 0.4, got %v", + v, cfg.Retrieval.Abstain.Below) + } + } +} + +// TestAbstainEnvOverrideParsesEdgeCases covers 0.0 and 1.0 (the +// inclusive bounds) and a mid-range value (0.5) — these must all +// be accepted. +func TestAbstainEnvOverrideParsesEdgeCases(t *testing.T) { + prev := os.Getenv("VLE_RETRIEVAL_ABSTAIN_BELOW") + defer os.Setenv("VLE_RETRIEVAL_ABSTAIN_BELOW", prev) + + cases := map[string]float64{ + "0": 0.0, + "0.0": 0.0, + "1": 1.0, + "1.0": 1.0, + "0.5": 0.5, + } + for raw, want := range cases { + os.Setenv("VLE_RETRIEVAL_ABSTAIN_BELOW", raw) + cfg := Default() + applyEnvOverrides(&cfg) + if cfg.Retrieval.Abstain.Below != want { + t.Errorf("ABSTAIN_BELOW=%q: got %v want %v", raw, cfg.Retrieval.Abstain.Below, want) + } + } +} + +// TestValidateAbstainOutOfRange asserts Validate rejects out-of-range +// Below values. The env-override path silently drops them, but a YAML +// file or explicit struct edit can still land a bad value here.
+func TestValidateAbstainOutOfRange(t *testing.T) { + t.Parallel() + + cfg := Default() + cfg.Database.URL = "postgres://localhost/test" + cfg.Retrieval.Abstain.Below = 1.5 + if err := cfg.Validate(); err == nil { + t.Error("abstain.below=1.5 should fail validation") + } + + cfg2 := Default() + cfg2.Database.URL = "postgres://localhost/test" + cfg2.Retrieval.Abstain.Below = -0.1 + if err := cfg2.Validate(); err == nil { + t.Error("abstain.below=-0.1 should fail validation") + } + + cfg3 := Default() + cfg3.Database.URL = "postgres://localhost/test" + cfg3.Retrieval.Abstain.Below = 0.0 + if err := cfg3.Validate(); err != nil { + t.Errorf("abstain.below=0.0 should pass validation, got %v", err) + } +} + func TestReplayEnvOverride(t *testing.T) { // Not parallel — mutates env. Restore on exit. prevEnabled := os.Getenv("VLE_RETRIEVAL_REPLAY_ENABLED") From 666a74b82475505c7f2da80e4cd5213ba72baed3 Mon Sep 17 00:00:00 2001 From: Halleluyah Oludele Date: Wed, 27 May 2026 03:20:50 +0100 Subject: [PATCH 3/3] feat(api): confidence-driven abstention on /v1/query and /v1/answer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the selection LLM returns per-pick confidences and every pick falls strictly below retrieval.abstain.below (default 0.4), the API layer skips the normal path and returns an abstention response: /v1/query → sections: [], abstained: true, abstention_reason, min_confidence_threshold, candidate_confidences /v1/answer → answer: "I cannot answer this question from the supplied document.", citations: [], same abstention fields, synthesis LLM call skipped entirely (planning + retrieval usage carried through) The "all picks below" semantics is deliberate: if even one section scored at-or-above the threshold the engine surfaces it as evidence. Abstention is reserved for the case where every candidate is weak. 
Abstention requires explicit confidence signal — legacy-shape LLM responses (no confidence map) always fall through to the normal path. Per-request `enable_abstain` body field overrides the server config; opt out globally via retrieval.abstain.enabled: false. Other changes: - Result.Confidences threads through the Decomposer (multi-hop plans union confidences max-wins on overlap). - Successful (non-abstained) responses surface a `confidences` map on the wire when the model returned them. - Abstention responses carry no trace_token — there is no retrieval result to replay. - cmd/engine wires cfg.Retrieval.Abstain into the Deps. Tests cover: shouldAbstain predicate (all-below, one-above, boundary, nil/empty); filterConfidencesToIDs sentinel preservation; stringKeyedConfidences conversion; abstentionEnabled body-override precedence; respondAbstained / respondAbstainedAnswer shape; synthesis tripwire (LLM must not be called on abstention path); trace_token absence on abstention. OpenAPI: - enable_abstain on QueryRequest + AnswerRequest. - abstained, abstention_reason, min_confidence_threshold, candidate_confidences, confidences on both response schemas. 
--- cmd/engine/main.go | 1 + internal/api/abstention_test.go | 340 ++++++++++++++++++++++++++++++++ internal/api/server.go | 242 +++++++++++++++++++++-- openapi.yaml | 119 ++++++++++- pkg/retrieval/decompose.go | 67 +++++-- 5 files changed, 727 insertions(+), 42 deletions(-) create mode 100644 internal/api/abstention_test.go diff --git a/cmd/engine/main.go b/cmd/engine/main.go index 5398a31..b40533c 100644 --- a/cmd/engine/main.go +++ b/cmd/engine/main.go @@ -209,6 +209,7 @@ func run() error { ReRanker: reRanker, ReRank: cfg.Retrieval.ReRank, Replay: replayStore, + Abstain: cfg.Retrieval.Abstain, } srv := &http.Server{ diff --git a/internal/api/abstention_test.go b/internal/api/abstention_test.go new file mode 100644 index 0000000..2018cdf --- /dev/null +++ b/internal/api/abstention_test.go @@ -0,0 +1,340 @@ +package api + +import ( + "bytes" + "context" + "encoding/json" + "io" + "log/slog" + "net/http" + "net/http/httptest" + "strings" + "sync/atomic" + "testing" + "time" + + "github.com/go-chi/chi/v5" + "github.com/hallelx2/llmgate" + + "github.com/hallelx2/vectorless-engine/pkg/config" + "github.com/hallelx2/vectorless-engine/pkg/retrieval" + "github.com/hallelx2/vectorless-engine/pkg/tree" +) + +// TestShouldAbstainAllBelow: every confidence under threshold → abstain. +func TestShouldAbstainAllBelow(t *testing.T) { + t.Parallel() + confidences := map[tree.SectionID]float64{"sec_a": 0.1, "sec_b": 0.2, "sec_c": 0.39} + if !shouldAbstain(confidences, 0.4) { + t.Error("all confidences below 0.4 must trigger abstention") + } +} + +// TestShouldAbstainOneAbove: any confidence at-or-above threshold → no abstain. +// The "all picks below" semantics is the spec's choice: if even one +// section has signal, surface it as evidence. 
+func TestShouldAbstainOneAbove(t *testing.T) { + t.Parallel() + confidences := map[tree.SectionID]float64{"sec_a": 0.1, "sec_b": 0.45} + if shouldAbstain(confidences, 0.4) { + t.Error("one pick at 0.45 should suppress abstention even when peers are low") + } +} + +// TestShouldAbstainBoundary: confidence == threshold counts as "above" so +// the engine is generous about evidence; the threshold is strict-below. +func TestShouldAbstainBoundary(t *testing.T) { + t.Parallel() + confidences := map[tree.SectionID]float64{"sec_a": 0.4} + if shouldAbstain(confidences, 0.4) { + t.Error("confidence == threshold must NOT trigger abstention (strict-below)") + } +} + +// TestShouldAbstainNilOrEmpty: missing confidence signal never abstains. +// This is the contract that keeps legacy-shape LLM responses working +// — the engine cannot abstain when it has no confidence to evaluate. +func TestShouldAbstainNilOrEmpty(t *testing.T) { + t.Parallel() + if shouldAbstain(nil, 0.4) { + t.Error("nil confidences must NOT trigger abstention") + } + if shouldAbstain(map[tree.SectionID]float64{}, 0.4) { + t.Error("empty confidences must NOT trigger abstention") + } +} + +// TestFilterConfidencesToIDs verifies the helper restricts +// surfaced confidences to the IDs the response actually carries (post +// max_sections / re-rank truncation). +func TestFilterConfidencesToIDs(t *testing.T) { + t.Parallel() + src := map[tree.SectionID]float64{"a": 0.1, "b": 0.5, "c": 0.9} + got := filterConfidencesToIDs(src, []tree.SectionID{"a", "c"}) + if len(got) != 2 { + t.Fatalf("filtered length = %d, want 2", len(got)) + } + if got["a"] != 0.1 || got["c"] != 0.9 { + t.Errorf("filtered = %v", got) + } + if _, present := got["b"]; present { + t.Error("b should have been filtered out") + } +} + +// TestFilterConfidencesNilStaysNil checks that the "no signal" sentinel +// survives the helper.
+func TestFilterConfidencesNilStaysNil(t *testing.T) { + t.Parallel() + if got := filterConfidencesToIDs(nil, []tree.SectionID{"a"}); got != nil { + t.Errorf("nil input must produce nil output, got %v", got) + } + // All keys filtered out → nil too. + if got := filterConfidencesToIDs(map[tree.SectionID]float64{"x": 0.5}, []tree.SectionID{"a"}); got != nil { + t.Errorf("empty filtered result must produce nil, got %v", got) + } +} + +// TestStringKeyedConfidences: the helper converts the typed map +// to JSON-friendly string keys for the wire response. +func TestStringKeyedConfidences(t *testing.T) { + t.Parallel() + got := stringKeyedConfidences(map[tree.SectionID]float64{"sec_a": 0.7}) + if got["sec_a"] != 0.7 { + t.Errorf("converted map should preserve the value, got %v", got) + } + if stringKeyedConfidences(nil) != nil { + t.Error("nil input must produce nil") + } +} + +// TestAbstentionEnabledOverride: per-request body field wins over server config. +func TestAbstentionEnabledOverride(t *testing.T) { + t.Parallel() + d := Deps{Abstain: config.AbstainBlock{Enabled: false}} + if !d.abstentionEnabled(boolPtr(true)) { + t.Error("body=true should override server=false") + } + d2 := Deps{Abstain: config.AbstainBlock{Enabled: true}} + if d2.abstentionEnabled(boolPtr(false)) { + t.Error("body=false should override server=true") + } +} + +// TestAbstentionEnabledFallsBackToConfig: when the body field is nil, +// the server config decides.
+func TestAbstentionEnabledFallsBackToConfig(t *testing.T) { + t.Parallel() + d := Deps{Abstain: config.AbstainBlock{Enabled: true}} + if !d.abstentionEnabled(nil) { + t.Error("nil body should fall back to server=true") + } + d2 := Deps{Abstain: config.AbstainBlock{Enabled: false}} + if d2.abstentionEnabled(nil) { + t.Error("nil body should fall back to server=false") + } +} + +// --- Integration-style tests against handleQuery / handleAnswer --- +// +// These exercise the response-shape contracts: that all-low +// confidences yield an abstained response; that mixed +// (some-above-threshold) confidences yield a normal response; and +// that legacy responses (no confidences) never abstain. + +// stubStrategy is a CostStrategy that returns canned IDs + +// confidences without touching any LLM. +type stubStrategy struct { + ids []tree.SectionID + confidences map[tree.SectionID]float64 + usage retrieval.Usage + calls int32 +} + +func (s *stubStrategy) Name() string { return "stub" } + +func (s *stubStrategy) Select(ctx context.Context, t *tree.Tree, query string, budget retrieval.ContextBudget) ([]tree.SectionID, error) { + atomic.AddInt32(&s.calls, 1) + return s.ids, nil +} + +func (s *stubStrategy) SelectWithCost(ctx context.Context, t *tree.Tree, query string, budget retrieval.ContextBudget) (*retrieval.Result, error) { + atomic.AddInt32(&s.calls, 1) + return &retrieval.Result{ + SelectedIDs: s.ids, + Confidences: s.confidences, + Usage: s.usage, + HopsTaken: 1, + }, nil +} + +// abstentionRouter wires only handleQuery / handleAnswer. We mock the +// strategy and bypass DB by passing a tiny in-memory tree-loader +// stub. The simplest way is to give the handler a Strategy that +// short-circuits before any storage read — done by also stubbing +// the storage to return empty content. 
+func abstentionRouter(d Deps) http.Handler { + r := chi.NewRouter() + r.Route("/v1", func(r chi.Router) { + r.Post("/query", d.handleQuery) + r.Post("/answer", d.handleAnswer) + }) + return r +} + +// TestRespondAbstained: every confidence below threshold → +// the response is the abstention shape with sections=[] and +// abstained=true. +// +// We cannot run handleQuery without a DB-backed tree loader; instead, +// this test calls the helper functions on a Deps struct as the +// handler would, asserting the shape. +func TestRespondAbstained(t *testing.T) { + t.Parallel() + d := Deps{ + Strategy: &stubStrategy{ids: []tree.SectionID{"sec_a"}}, + Abstain: config.AbstainBlock{Enabled: true, Below: 0.4}, + } + confidences := map[tree.SectionID]float64{"sec_a": 0.12, "sec_b": 0.30} + + rec := httptest.NewRecorder() + d.respondAbstained(rec, tree.DocumentID("doc_x"), "what is x?", confidences, nil) + + if rec.Code != http.StatusOK { + t.Fatalf("status = %d, want 200", rec.Code) + } + var body map[string]any + if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil { + t.Fatal(err) + } + if v, _ := body["abstained"].(bool); !v { + t.Error("response must carry abstained=true") + } + if v, _ := body["abstention_reason"].(string); !strings.Contains(v, "confidence") { + t.Errorf("abstention_reason missing 'confidence': %q", v) + } + if v, _ := body["min_confidence_threshold"].(float64); v != 0.4 { + t.Errorf("min_confidence_threshold = %v, want 0.4", v) + } + if v, _ := body["sections"].([]any); len(v) != 0 { + t.Errorf("sections must be empty, got %v", v) + } + cc, ok := body["candidate_confidences"].(map[string]any) + if !ok { + t.Fatal("candidate_confidences missing") + } + if cc["sec_a"] != 0.12 { + t.Errorf("sec_a confidence = %v, want 0.12", cc["sec_a"]) + } +} + +// TestRespondAbstainedAnswer: same shape on /v1/answer. The synthesis +// call is skipped — answer is the canonical refusal string, citations +// is empty.
+func TestRespondAbstainedAnswer(t *testing.T) { + t.Parallel() + d := Deps{ + Strategy: &stubStrategy{ids: []tree.SectionID{"sec_a"}}, + Abstain: config.AbstainBlock{Enabled: true, Below: 0.4}, + Logger: slog.Default(), + } + confidences := map[tree.SectionID]float64{"sec_a": 0.1} + usage := retrieval.Usage{InputTokens: 100, OutputTokens: 20, TotalTokens: 120, LLMCalls: 2} + + rec := httptest.NewRecorder() + d.respondAbstainedAnswer(rec, tree.DocumentID("doc_x"), "q", confidences, nil, usage, time.Now()) + + if rec.Code != http.StatusOK { + t.Fatalf("status = %d, want 200", rec.Code) + } + var body map[string]any + if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil { + t.Fatal(err) + } + if v, _ := body["abstained"].(bool); !v { + t.Error("answer response must carry abstained=true") + } + if v, _ := body["answer"].(string); !strings.Contains(v, "cannot answer") { + t.Errorf("answer must be the canonical refusal, got %q", v) + } + if v, _ := body["citations"].([]any); len(v) != 0 { + t.Errorf("citations must be empty, got %v", v) + } + // Usage carried through (planning + retrieval — no synthesis). + if u, ok := body["usage"].(map[string]any); !ok { + t.Error("usage block missing") + } else if u["llm_calls"].(float64) != 2 { + t.Errorf("usage.llm_calls = %v, want 2", u["llm_calls"]) + } +} + +// TestRespondAbstainedTraceTokenAbsent: replay isn't meaningful for +// an abstention (the engine produced no retrieval result); the +// response must NOT carry a trace_token so callers don't try to +// replay nothing. 
+func TestRespondAbstainedTraceTokenAbsent(t *testing.T) { + t.Parallel() + d := Deps{ + Strategy: &stubStrategy{}, + Abstain: config.AbstainBlock{Enabled: true, Below: 0.4}, + } + rec := httptest.NewRecorder() + d.respondAbstained(rec, tree.DocumentID("doc_x"), "q", map[tree.SectionID]float64{"a": 0.1}, nil) + + var body map[string]any + _ = json.Unmarshal(rec.Body.Bytes(), &body) + if _, has := body["trace_token"]; has { + t.Error("abstention response must NOT carry trace_token") + } +} + +// boolPtr is a tiny helper for the body-override tests. +func boolPtr(b bool) *bool { return &b } + +// --- end-to-end through ServeHTTP without DB --- +// +// To exercise handleQuery / handleAnswer end-to-end we'd need a +// db.Pool. Instead we cover the in-handler logic by directly calling +// the helpers above (which is what the handler itself does on the +// abstention path) and by running the predicate tests through the +// handler-facing entrypoint via shouldAbstain + abstentionEnabled. +// A future test pass with a real test DB will exercise the full +// stack — for now, the abstention contract is unit-tested at the +// helper boundary, which is the only place the contract lives. + +// mockLLMNeverCalled fails the test loudly if any LLM call lands. +// Used as a tripwire in the abstention path: synthesis must NOT +// run when /v1/answer abstains. +type mockLLMNeverCalled struct{ t *testing.T } + +func (m mockLLMNeverCalled) Complete(ctx context.Context, req llmgate.Request) (*llmgate.Response, error) { + m.t.Error("LLM should not be called on the abstention path") + return &llmgate.Response{Content: ""}, nil +} + +func (m mockLLMNeverCalled) CountTokens(ctx context.Context, s string) (int, error) { + return len(s) / 4, nil +} + +// TestRespondAbstainedAnswerSkipsSynthesis: the /v1/answer abstention +// helper must not invoke the LLM. We pass an LLM that explodes on +// any call so we'd see the test fail if synthesis leaks through. 
+func TestRespondAbstainedAnswerSkipsSynthesis(t *testing.T) { + t.Parallel() + d := Deps{ + Strategy: &stubStrategy{}, + Abstain: config.AbstainBlock{Enabled: true, Below: 0.4}, + LLM: mockLLMNeverCalled{t: t}, + } + rec := httptest.NewRecorder() + d.respondAbstainedAnswer(rec, tree.DocumentID("doc_x"), "q", map[tree.SectionID]float64{"a": 0.1}, nil, retrieval.Usage{}, time.Now()) + if rec.Code != http.StatusOK { + t.Fatalf("status = %d, want 200", rec.Code) + } +} + +// (Imports that won't otherwise be referenced by every test file go +// through small uses below so go vet is happy.) +var _ = bytes.NewReader +var _ = io.EOF +var _ = abstentionRouter diff --git a/internal/api/server.go b/internal/api/server.go index 088db8e..b65ab78 100644 --- a/internal/api/server.go +++ b/internal/api/server.go @@ -91,6 +91,13 @@ type Deps struct { // /v1/replay (the endpoint returns 501) and skips the per- // response store write. Replay retrieval.ReplayStore + + // Abstain carries the server-side abstention config. The + // body-level `enable_abstain` field on /v1/query and /v1/answer + // overrides Abstain.Enabled. When abstention fires, the response + // carries abstained=true and an empty sections / citations list + // rather than risk hallucinating an answer from weak evidence. + Abstain config.AbstainBlock } // Router builds and returns the chi router wired with v1 routes. @@ -398,14 +405,19 @@ func (d Deps) handleGetSection(w http.ResponseWriter, r *http.Request) { // handleQuery accepts { document_id, query, model?, max_tokens?, // reserved_for_prompt?, max_parallel_calls?, max_sections?, -// enable_planning? } and runs the configured retrieval.Strategy against -// the document's tree. +// enable_planning?, enable_rerank?, enable_abstain? } and runs the +// configured retrieval.Strategy against the document's tree. // // When `enable_planning` is true (or `retrieval.planning.enabled` is on // at config level) the request first issues a planning LLM call. 
The // resulting Plan is surfaced in the response under "plan". If the plan // is multi-hop and decomposition is enabled, retrieval fans out one // strategy call per sub-question and unions the results. +// +// When the selection LLM returns per-pick confidence scores and every +// pick falls below `retrieval.abstain.below`, the response is an +// abstention: sections is empty and abstained=true. Per-request +// `enable_abstain` overrides the server-side flag for one request. func (d Deps) handleQuery(w http.ResponseWriter, r *http.Request) { var body struct { DocumentID tree.DocumentID `json:"document_id"` @@ -423,6 +435,10 @@ func (d Deps) handleQuery(w http.ResponseWriter, r *http.Request) { // content-aware re-rank pass. Pointer for the same reason as // EnablePlanning. Overrides retrieval.rerank.enabled. EnableReRank *bool `json:"enable_rerank"` + // EnableAbstain opts this request into the Phase 2.4 + // confidence-driven abstention check. Pointer for the same + // reason as EnablePlanning. Overrides retrieval.abstain.enabled. + EnableAbstain *bool `json:"enable_abstain"` } if err := json.NewDecoder(r.Body).Decode(&body); err != nil { writeErr(w, http.StatusBadRequest, "invalid json: "+err.Error()) @@ -466,12 +482,24 @@ func (d Deps) handleQuery(w http.ResponseWriter, r *http.Request) { started := time.Now() plan, _ := d.runPlanner(r.Context(), body.Query, body.EnablePlanning) - ids, err := d.runSelection(r.Context(), t, plan, body.Query, budget) + ids, confidences, err := d.runSelection(r.Context(), t, plan, body.Query, budget) if err != nil { d.Logger.Error("query: strategy failed", "err", err, "document_id", body.DocumentID) writeErr(w, http.StatusInternalServerError, "retrieval failed: "+err.Error()) return } + + // Phase 2.4 abstention: if every confident pick is below the + // configured threshold, refuse to ground an answer in evidence + // the model itself is not confident is relevant. 
The check fires + // only when explicit confidence signal is present — legacy-shape + // responses (no confidences) always fall through to the normal + // path so older models keep working. + if d.abstentionEnabled(body.EnableAbstain) && shouldAbstain(confidences, d.Abstain.Below) { + d.respondAbstained(w, body.DocumentID, body.Query, confidences, plan) + return + } + if body.MaxSections > 0 && len(ids) > body.MaxSections { ids = ids[:body.MaxSections] } @@ -539,6 +567,12 @@ func (d Deps) handleQuery(w http.ResponseWriter, r *http.Request) { if plan != nil { resp["plan"] = plan } + // Surface the confidence map on the response when present. Only the + // finalIDs survive truncation, so trim accordingly. Empty map → + // omit so the field stays absent when no signal was available. + if filtered := filterConfidencesToIDs(confidences, finalIDs); len(filtered) > 0 { + resp["confidences"] = stringKeyedConfidences(filtered) + } raw, err := marshalJSONForReplay(resp) if err != nil { @@ -692,6 +726,11 @@ func (d Deps) handleAnswer(w http.ResponseWriter, r *http.Request) { // pass. Synthesis then sees the re-ranked top-k. Overrides // retrieval.rerank.enabled. EnableReRank *bool `json:"enable_rerank"` + // EnableAbstain opts this request into the Phase 2.4 + // confidence-driven abstention check. When all picks fall + // below the threshold, /v1/answer skips synthesis entirely + // and returns a refusal answer with no citations. 
+ EnableAbstain *bool `json:"enable_abstain"` } if err := json.NewDecoder(r.Body).Decode(&body); err != nil { writeErr(w, http.StatusBadRequest, "invalid json: "+err.Error()) @@ -734,13 +773,22 @@ func (d Deps) handleAnswer(w http.ResponseWriter, r *http.Request) { plan, planUsage := d.runPlanner(r.Context(), body.Query, body.EnablePlanning) totalUsage.Add(planUsage) - ids, retrievalUsage, err := d.runSelectionWithUsage(r.Context(), t, plan, body.Query, budget) + ids, confidences, retrievalUsage, err := d.runSelectionWithUsage(r.Context(), t, plan, body.Query, budget) if err != nil { writeErr(w, http.StatusInternalServerError, "retrieval failed: "+err.Error()) return } totalUsage.Add(retrievalUsage) + // Phase 2.4 abstention: skip synthesis entirely when every confident + // pick falls below the threshold. The response answers with a + // regulator-friendly refusal rather than a hallucinated synthesis + // of weak evidence. + if d.abstentionEnabled(body.EnableAbstain) && shouldAbstain(confidences, d.Abstain.Below) { + d.respondAbstainedAnswer(w, body.DocumentID, body.Query, confidences, plan, totalUsage, started) + return + } + maxSections := body.MaxSections if maxSections <= 0 { maxSections = d.Answer.MaxSections @@ -863,6 +911,9 @@ func (d Deps) handleAnswer(w http.ResponseWriter, r *http.Request) { if plan != nil { resp["plan"] = plan } + if filtered := filterConfidencesToIDs(confidences, finalIDs); len(filtered) > 0 { + resp["confidences"] = stringKeyedConfidences(filtered) + } raw, err := marshalJSONForReplay(resp) if err != nil { @@ -1191,38 +1242,45 @@ func (d Deps) runPlanner(ctx context.Context, query string, bodyOverride *bool) // runSelection picks section IDs for the query, optionally going // through the Decomposer when the plan is multi-hop AND planning-level -// decomposition is enabled. Returns the same []SectionID Strategy.Select -// would. 
-func (d Deps) runSelection(ctx context.Context, t *tree.Tree, plan *retrieval.Plan, query string, budget retrieval.ContextBudget) ([]tree.SectionID, error) { - if d.shouldDecompose(plan) { - ids, _, err := retrieval.NewDecomposer(d.Strategy).DecomposedSelect(ctx, t, plan, query, budget) - return ids, err - } - return d.Strategy.Select(ctx, t, query, budget) +// decomposition is enabled. Returns the selected IDs plus the per-pick +// confidence map (nil when the selection LLM returned the legacy +// shape with no confidence signal). +func (d Deps) runSelection(ctx context.Context, t *tree.Tree, plan *retrieval.Plan, query string, budget retrieval.ContextBudget) ([]tree.SectionID, map[tree.SectionID]float64, error) { + ids, confidences, _, err := d.runSelectionFull(ctx, t, plan, query, budget) + return ids, confidences, err } // runSelectionWithUsage is the cost-tracking variant used by /v1/answer. -// Returns the selected IDs plus the Usage accumulated during selection -// (across all sub-questions for multi-hop plans). -func (d Deps) runSelectionWithUsage(ctx context.Context, t *tree.Tree, plan *retrieval.Plan, query string, budget retrieval.ContextBudget) ([]tree.SectionID, retrieval.Usage, error) { +// Returns the selected IDs, per-pick confidences (nil when no signal), +// and the Usage accumulated during selection (across all sub-questions +// for multi-hop plans). +func (d Deps) runSelectionWithUsage(ctx context.Context, t *tree.Tree, plan *retrieval.Plan, query string, budget retrieval.ContextBudget) ([]tree.SectionID, map[tree.SectionID]float64, retrieval.Usage, error) { + return d.runSelectionFull(ctx, t, plan, query, budget) +} + +// runSelectionFull is the shared workhorse behind runSelection / +// runSelectionWithUsage. It routes through the Decomposer when the +// plan is multi-hop AND decomposition is enabled, and surfaces +// confidences for the Phase 2.4 abstention check. 
+func (d Deps) runSelectionFull(ctx context.Context, t *tree.Tree, plan *retrieval.Plan, query string, budget retrieval.ContextBudget) ([]tree.SectionID, map[tree.SectionID]float64, retrieval.Usage, error) { if d.shouldDecompose(plan) { - return retrieval.NewDecomposer(d.Strategy).DecomposedSelect(ctx, t, plan, query, budget) + return retrieval.NewDecomposer(d.Strategy).DecomposedSelectWithConfidences(ctx, t, plan, query, budget) } if cs, ok := d.Strategy.(retrieval.CostStrategy); ok { res, err := cs.SelectWithCost(ctx, t, query, budget) if err != nil { - return nil, retrieval.Usage{}, err + return nil, nil, retrieval.Usage{}, err } if res == nil { - return nil, retrieval.Usage{}, nil + return nil, nil, retrieval.Usage{}, nil } - return res.SelectedIDs, res.Usage, nil + return res.SelectedIDs, res.Confidences, res.Usage, nil } ids, err := d.Strategy.Select(ctx, t, query, budget) if err != nil { - return nil, retrieval.Usage{}, err + return nil, nil, retrieval.Usage{}, err } - return ids, retrieval.Usage{}, nil + return ids, nil, retrieval.Usage{}, nil } // shouldDecompose returns true when the plan is multi-hop AND @@ -1379,6 +1437,148 @@ func writePlanHints(b *strings.Builder, plan *retrieval.Plan) { } } +// --- abstention helpers --- + +// abstentionEnabled reports whether the request should run the +// confidence-driven abstention check. The per-request body field (when +// present) wins over the server-side config; a nil body field falls +// back to the config. When neither is enabled, abstention is skipped +// regardless of the confidence signal. +func (d Deps) abstentionEnabled(bodyOverride *bool) bool { + if bodyOverride != nil { + return *bodyOverride + } + return d.Abstain.Enabled +} + +// shouldAbstain returns true when confidences carry an explicit +// signal AND every entry is strictly below threshold. 
+// +// The "all picks below" semantics (vs "any pick below") is +// deliberate: if even one section scored above, the engine has +// enough evidence to surface it. Abstention is reserved for the case +// where every candidate is weak. +// +// nil / empty confidences never trigger abstention — abstention +// requires explicit confidence signal from the selection LLM. A +// legacy-shape response carries nil confidences and falls through +// to the normal path. +func shouldAbstain(confidences map[tree.SectionID]float64, threshold float64) bool { + if len(confidences) == 0 { + return false + } + for _, c := range confidences { + if c >= threshold { + return false + } + } + return true +} + +// stringKeyedConfidences converts the typed confidence map into a +// JSON-friendly {string: float} so encoding/json emits an object +// with section_id keys rather than relying on a tree.SectionID +// MarshalText shim. Returns nil when src is empty so the field +// stays absent on the wire. +func stringKeyedConfidences(src map[tree.SectionID]float64) map[string]float64 { + if len(src) == 0 { + return nil + } + out := make(map[string]float64, len(src)) + for id, c := range src { + out[string(id)] = c + } + return out +} + +// filterConfidencesToIDs keeps only the entries whose IDs appear in +// keep, preserving the "no signal" semantics: a nil input returns +// nil, an empty filtered result also returns nil so callers can do +// a single len()-check before serialising. +func filterConfidencesToIDs(src map[tree.SectionID]float64, keep []tree.SectionID) map[tree.SectionID]float64 { + if len(src) == 0 { + return nil + } + out := make(map[tree.SectionID]float64, len(keep)) + for _, id := range keep { + if v, ok := src[id]; ok { + out[id] = v + } + } + if len(out) == 0 { + return nil + } + return out +} + +// abstentionReason is the human-readable message attached to every +// abstention response. 
Kept as a single constant so callers don't +// drift on wording and analytics can group by exact string. +const abstentionReason = "no candidate section scored above the confidence threshold" + +// abstentionAnswerText is the canonical refusal used by /v1/answer +// when abstention fires. The text is regulator-friendly: it admits +// the engine could not answer rather than guessing, and does so in +// language clients can surface verbatim. +const abstentionAnswerText = "I cannot answer this question from the supplied document." + +// respondAbstained writes the abstention shape for /v1/query. The +// response includes the threshold and the candidate_confidences map +// the model returned so callers (and downstream evaluators) can see +// exactly why the engine refused. +// +// trace_token is intentionally omitted on abstention: we don't store +// the response in the replay log because there's no meaningful +// retrieval result to reproduce. Callers replaying an abstention +// will simply re-run /v1/query. +func (d Deps) respondAbstained(w http.ResponseWriter, docID tree.DocumentID, query string, confidences map[tree.SectionID]float64, plan *retrieval.Plan) { + resp := map[string]any{ + "document_id": docID, + "query": query, + "strategy": d.Strategy.Name(), + "sections": []any{}, + "abstained": true, + "abstention_reason": abstentionReason, + "min_confidence_threshold": d.Abstain.Below, + "candidate_confidences": stringKeyedConfidences(confidences), + } + if plan != nil { + resp["plan"] = plan + } + writeJSON(w, http.StatusOK, resp) +} + +// respondAbstainedAnswer writes the abstention shape for /v1/answer. +// The answer text is the canonical refusal; citations is empty; +// usage carries the LLM tokens spent up to the abstention point +// (planning + retrieval, no synthesis) so the caller's billing +// stays accurate. 
+func (d Deps) respondAbstainedAnswer(w http.ResponseWriter, docID tree.DocumentID, query string, confidences map[tree.SectionID]float64, plan *retrieval.Plan, usage retrieval.Usage, started time.Time) { + resp := map[string]any{ + "document_id": docID, + "query": query, + "answer": abstentionAnswerText, + "citations": []any{}, + "strategy": d.Strategy.Name(), + "usage": map[string]any{ + "input_tokens": usage.InputTokens, + "output_tokens": usage.OutputTokens, + "total_tokens": usage.TotalTokens, + "cost_usd": usage.CostUSD, + "llm_calls": usage.LLMCalls, + }, + "elapsed_ms": time.Since(started).Milliseconds(), + "abstained": true, + "abstention_reason": abstentionReason, + "min_confidence_threshold": d.Abstain.Below, + "candidate_confidences": stringKeyedConfidences(confidences), + } + if plan != nil { + resp["plan"] = plan + } + writeJSON(w, http.StatusOK, resp) +} + // --- helpers --- func writeJSON(w http.ResponseWriter, status int, v any) { diff --git a/openapi.yaml b/openapi.yaml index a4c82ef..6307047 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -551,6 +551,19 @@ components: original order — re-rank never drops sections. Overrides the server's `retrieval.rerank.enabled` setting for this request only. + enable_abstain: + type: boolean + description: | + Opt this request into the Phase 2.4 confidence-driven + abstention check. When the selection LLM returns per-pick + confidence scores and every score falls below + `retrieval.abstain.below`, the response is an abstention + (sections empty, abstained=true) rather than evidence the + engine itself is not confident is relevant. Abstention + requires explicit confidence signal — legacy-shape LLM + responses (no confidence) never trigger abstention. + Overrides the server's `retrieval.abstain.enabled` + setting for this request only. QueryResponse: type: object @@ -568,6 +581,9 @@ components: type: array items: $ref: "#/components/schemas/QuerySection" + description: | + Empty when `abstained=true`. 
The engine refused to surface + sections it isn't confident answer the query. plan: $ref: "#/components/schemas/Plan" elapsed_ms: @@ -581,7 +597,48 @@ components: Pass this token to /v1/replay along with the original `query` and `document_id` to retrieve the byte-identical response. Empty when the server has - `retrieval.replay.enabled=false`. + `retrieval.replay.enabled=false`, and omitted on + abstention responses (which have no retrieval result to + replay). + confidences: + type: object + additionalProperties: + type: number + description: | + Per-section confidence scores in [0.0, 1.0] returned by the + selection LLM, keyed by section_id. Only present when the + model returned the new-shape response with explicit + confidence values. Useful for downstream evaluators that + want to surface the engine's certainty alongside the + evidence. + abstained: + type: boolean + description: | + True when the engine refused to ground an answer in the + retrieved evidence because every candidate section scored + below `min_confidence_threshold`. When true, `sections` + is empty and `candidate_confidences` carries the actual + scores so the caller can decide whether to relax the + threshold and retry. + abstention_reason: + type: string + description: | + Human-readable explanation when `abstained=true`. Stable + string suitable for surfacing verbatim to end users. + min_confidence_threshold: + type: number + description: | + The `retrieval.abstain.below` value the engine compared + confidences against. Present only on abstention responses. + candidate_confidences: + type: object + additionalProperties: + type: number + description: | + Per-section confidence scores the selection LLM returned + for every section it picked, before any `max_sections` + truncation. Surfaced only on abstention responses so callers + can see exactly why the engine refused. 
QuerySection: type: object @@ -675,6 +732,15 @@ components: When the pass runs, the synthesis prompt sees the re-ranked top-k (capped by `retrieval.rerank.top_k`), and each citation in the response carries a `score` field. + enable_abstain: + type: boolean + description: | + Opt this request into the Phase 2.4 confidence-driven + abstention check. See QueryRequest.enable_abstain for + full semantics. When abstention fires on /v1/answer the + synthesis call is skipped entirely; the response carries + a canonical refusal in `answer`, an empty `citations` + array, and `abstained=true`. AnswerResponse: type: object @@ -685,11 +751,17 @@ components: type: string answer: type: string - description: Natural-language answer grounded in the cited sections. + description: | + Natural-language answer grounded in the cited sections. + When `abstained=true` this is the canonical refusal + ("I cannot answer this question from the supplied + document.") rather than synthesised prose. citations: type: array items: $ref: "#/components/schemas/AnswerCitation" + description: | + Empty array when `abstained=true`. strategy: type: string model: @@ -702,6 +774,10 @@ components: total_tokens: {type: integer} cost_usd: {type: number} llm_calls: {type: integer} + description: | + On abstention this carries the planning + retrieval + tokens but no synthesis tokens — the engine skipped + the synthesis LLM call entirely. plan: $ref: "#/components/schemas/Plan" elapsed_ms: @@ -715,7 +791,44 @@ components: system prompt version). Pass to /v1/replay with the original `query` and `document_id` to fetch the byte-identical response. Empty when the server has - `retrieval.replay.enabled=false`. + `retrieval.replay.enabled=false`, and omitted on + abstention responses (which have no synthesis result to + replay). + confidences: + type: object + additionalProperties: + type: number + description: | + Per-section confidence scores in [0.0, 1.0] returned by + the selection LLM, keyed by section_id. 
Only present + when the model returned the new-shape response with + explicit confidence values. + abstained: + type: boolean + description: | + True when the engine refused to synthesise an answer + because every candidate section scored below + `min_confidence_threshold`. The synthesis LLM call is + skipped entirely; `answer` is the canonical refusal and + `citations` is empty. + abstention_reason: + type: string + description: | + Human-readable explanation when `abstained=true`. + min_confidence_threshold: + type: number + description: | + The `retrieval.abstain.below` value the engine compared + confidences against. Present only on abstention responses. + candidate_confidences: + type: object + additionalProperties: + type: number + description: | + Per-section confidence scores the selection LLM returned + for every candidate it considered. Surfaced only on + abstention responses so callers can see exactly why the + engine refused to synthesise. Plan: type: object diff --git a/pkg/retrieval/decompose.go b/pkg/retrieval/decompose.go index 8c5e83b..0d5aa6f 100644 --- a/pkg/retrieval/decompose.go +++ b/pkg/retrieval/decompose.go @@ -42,14 +42,31 @@ func NewDecomposer(s Strategy) *Decomposer { // returns the partial Usage gathered up to that point. This is the same // failure contract Strategy.Select has — a multi-hop loop shouldn't // silently mask retrieval errors. +// +// This method does NOT surface per-pick confidences. Callers that need +// them should use DecomposedSelectWithConfidences (added in Phase 2.4). func (d *Decomposer) DecomposedSelect(ctx context.Context, t *tree.Tree, plan *Plan, query string, budget ContextBudget) ([]tree.SectionID, Usage, error) { + ids, _, usage, err := d.DecomposedSelectWithConfidences(ctx, t, plan, query, budget) + return ids, usage, err +} + +// DecomposedSelectWithConfidences is the Phase 2.4 variant of +// DecomposedSelect that also returns the per-pick confidence map. 
+// When a sub-question's underlying Strategy is a CostStrategy and +// surfaces confidences, those are unioned across sub-questions (max +// wins on duplicate IDs — the most confident sub-question wins). +// +// The returned confidences map is nil when no sub-question contributed +// any confidence signal at all — preserving the "no confidence signal" +// distinction the API layer's abstention check depends on. +func (d *Decomposer) DecomposedSelectWithConfidences(ctx context.Context, t *tree.Tree, plan *Plan, query string, budget ContextBudget) ([]tree.SectionID, map[tree.SectionID]float64, Usage, error) { if d == nil || d.Strategy == nil { - return nil, Usage{}, fmt.Errorf("decomposer: no strategy configured") + return nil, nil, Usage{}, fmt.Errorf("decomposer: no strategy configured") } // Fall-through: no plan or not multi-hop. Single retrieval call on - // the original query, with usage extracted from CostStrategy when - // available. + // the original query, with usage + confidences extracted from + // CostStrategy when available. if plan == nil || !plan.IsMultiHop || len(plan.SubQuestions) == 0 { return d.runOnce(ctx, t, query, budget) } @@ -59,15 +76,16 @@ func (d *Decomposer) DecomposedSelect(ctx context.Context, t *tree.Tree, plan *P // sub-question is usually the most important — and gives a // deterministic union ordering callers can rely on. 
var ( - totalUsage Usage - out = make([]tree.SectionID, 0) - seen = make(map[tree.SectionID]struct{}) + totalUsage Usage + out = make([]tree.SectionID, 0) + seen = make(map[tree.SectionID]struct{}) + confidences map[tree.SectionID]float64 ) for _, sub := range plan.SubQuestions { - ids, usage, err := d.runOnce(ctx, t, sub, budget) + ids, subConfidences, usage, err := d.runOnce(ctx, t, sub, budget) totalUsage.Add(usage) if err != nil { - return out, totalUsage, fmt.Errorf("decompose %q: %w", sub, err) + return out, confidences, totalUsage, fmt.Errorf("decompose %q: %w", sub, err) } for _, id := range ids { if _, dup := seen[id]; dup { @@ -76,28 +94,41 @@ func (d *Decomposer) DecomposedSelect(ctx context.Context, t *tree.Tree, plan *P seen[id] = struct{}{} out = append(out, id) } + // Union with max-wins on overlap: if two sub-questions both + // score the same section, the more confident verdict carries. + if len(subConfidences) > 0 { + if confidences == nil { + confidences = make(map[tree.SectionID]float64, len(subConfidences)) + } + for id, c := range subConfidences { + if existing, ok := confidences[id]; !ok || c > existing { + confidences[id] = c + } + } + } } - return out, totalUsage, nil + return out, confidences, totalUsage, nil } // runOnce delegates one retrieval call. Uses CostStrategy when the -// wrapped strategy implements it so per-sub-question usage flows into -// the aggregated total; otherwise falls back to plain Select with a -// zero Usage value. -func (d *Decomposer) runOnce(ctx context.Context, t *tree.Tree, query string, budget ContextBudget) ([]tree.SectionID, Usage, error) { +// wrapped strategy implements it so per-sub-question usage and (since +// Phase 2.4) confidences flow into the aggregated total; otherwise +// falls back to plain Select with a zero Usage value and nil +// confidences. 
+func (d *Decomposer) runOnce(ctx context.Context, t *tree.Tree, query string, budget ContextBudget) ([]tree.SectionID, map[tree.SectionID]float64, Usage, error) { if cs, ok := d.Strategy.(CostStrategy); ok { res, err := cs.SelectWithCost(ctx, t, query, budget) if err != nil { - return nil, Usage{}, err + return nil, nil, Usage{}, err } if res == nil { - return nil, Usage{}, nil + return nil, nil, Usage{}, nil } - return res.SelectedIDs, res.Usage, nil + return res.SelectedIDs, res.Confidences, res.Usage, nil } ids, err := d.Strategy.Select(ctx, t, query, budget) if err != nil { - return nil, Usage{}, err + return nil, nil, Usage{}, err } - return ids, Usage{}, nil + return ids, nil, Usage{}, nil }