From b016bc59b85725f6d540e21301bb4d5480add2bb Mon Sep 17 00:00:00 2001
From: Halleluyah Oludele <halleluyaholudele@gmail.com>
Date: Wed, 27 May 2026 03:08:30 +0100
Subject: [PATCH 1/3] feat(retrieval): per-pick confidence scores from
 selection LLM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend the selection JSON schema to accept either the legacy
{selected_section_ids: [...]} shape or the new
{picks: [{id, confidence}]} shape with per-pick confidence in
[0.0, 1.0]. ParseSelection now returns (ids, confidences, err).
Legacy responses, and picks responses in which no pick carries a
confidence, surface confidences=nil so callers can distinguish "no
confidence signal" from "all confidences low".

Each strategy plumbs the confidence map through:
- SinglePass fills Result.Confidences from the parsed map, filtered
  against the post-FilterKnownIDs survivors.
- ChunkedTree unions per-slice confidence maps (max-wins on duplicate
  IDs across overlapping slices) and filters to the merged ID set.
- Agentic accepts both done-shape variants. The new picks shape
  surfaces per-pick confidences on the final Result.

Result.SelectedIDs stays []tree.SectionID — the change is purely
additive. Callers that don't care about confidence see no API change.
The strategy never abstains; the API layer's abstention check (next
commit) is the only place "all confidences below threshold" becomes
an abstention response.

Tests cover: new-shape parse, legacy-shape parse, mixed-shape parse
(some picks with confidence, some without), confidence clamping,
duplicate-pick dedup, per-strategy fill, chunked-tree merge, and the
agentic done-with-picks path.
---
 pkg/retrieval/agentic.go        |  81 +++++++++++--
 pkg/retrieval/agentic_test.go   |  60 ++++++++++
 pkg/retrieval/chunked_tree.go   |  40 +++++--
 pkg/retrieval/retrieval_test.go | 205 +++++++++++++++++++++++++++++++-
 pkg/retrieval/single_pass.go    | 171 ++++++++++++++++++++++----
 pkg/retrieval/strategy.go       |  17 ++-
 6 files changed, 529 insertions(+), 45 deletions(-)

diff --git a/pkg/retrieval/agentic.go b/pkg/retrieval/agentic.go
index 41c49dd..ea1d367 100644
--- a/pkg/retrieval/agentic.go
+++ b/pkg/retrieval/agentic.go
@@ -125,10 +125,11 @@ func (a *AgenticStrategy) SelectWithCost(ctx context.Context, t *tree.Tree, quer
 	}
 
 	var (
-		totalUsage Usage
-		hopsTaken  int
-		finalIDs   []tree.SectionID
-		reasoning  string
+		totalUsage       Usage
+		hopsTaken        int
+		finalIDs         []tree.SectionID
+		finalConfidences map[tree.SectionID]float64
+		reasoning        string
 	)
 
 	for hop := 0; hop < maxHops; hop++ {
@@ -176,10 +177,11 @@ func (a *AgenticStrategy) SelectWithCost(ctx context.Context, t *tree.Tree, quer
 
 		switch action.Action {
 		case actionDone:
-			finalIDs = filterToTreeIDs(action.PickedIDs, bySectionID)
+			finalIDs, finalConfidences = collectDonePicks(action, bySectionID)
 			reasoning = action.Reasoning
 			return &Result{
 				SelectedIDs: finalIDs,
+				Confidences: filterConfidences(finalConfidences, finalIDs),
 				Reasoning:   reasoning,
 				ModelUsed:   model,
 				Usage:       totalUsage,
@@ -240,6 +242,7 @@ func (a *AgenticStrategy) SelectWithCost(ctx context.Context, t *tree.Tree, quer
 	log.Printf("retrieval: agentic strategy hit max_hops=%d without done; returning %d ids", maxHops, len(finalIDs))
 	return &Result{
 		SelectedIDs: finalIDs,
+		Confidences: filterConfidences(finalConfidences, finalIDs),
 		Reasoning:   reasoning,
 		ModelUsed:   model,
 		Usage:       totalUsage,
@@ -248,6 +251,48 @@ func (a *AgenticStrategy) SelectWithCost(ctx context.Context, t *tree.Tree, quer
 	}, nil
 }
 
+// collectDonePicks extracts the final IDs and per-pick confidences
+// from a 'done' action. It honours the new Picks shape first and falls
+// back to the legacy PickedIDs list when Picks is empty. Both shapes
+// are filtered to the known tree IDs so a model can never inject an
+// invented section into the result.
+func collectDonePicks(action Action, bySectionID map[tree.SectionID]tree.SectionView) ([]tree.SectionID, map[tree.SectionID]float64) {
+	if len(action.Picks) > 0 {
+		ids := make([]tree.SectionID, 0, len(action.Picks))
+		confidences := make(map[tree.SectionID]float64, len(action.Picks))
+		seen := make(map[tree.SectionID]struct{}, len(action.Picks))
+		for _, pk := range action.Picks {
+			sid := tree.SectionID(strings.TrimSpace(pk.ID))
+			if sid == "" {
+				continue
+			}
+			if _, ok := bySectionID[sid]; !ok {
+				continue
+			}
+			if _, dup := seen[sid]; dup {
+				continue
+			}
+			seen[sid] = struct{}{}
+			ids = append(ids, sid)
+			if pk.Confidence != nil {
+				c := *pk.Confidence
+				if c < 0 {
+					c = 0
+				} else if c > 1 {
+					c = 1
+				}
+				confidences[sid] = c
+			}
+		}
+		if len(confidences) == 0 {
+			confidences = nil
+		}
+		return ids, confidences
+	}
+	// Legacy shape — no confidence signal.
+	return filterToTreeIDs(action.PickedIDs, bySectionID), nil
+}
+
 // initialUserPrompt is the very first user turn: it explains the task,
 // renders a shallow outline (default level=1) so the model has
 // something to react to, and reminds the model of the action protocol.
@@ -316,15 +361,27 @@ Rules:
 - Prefer leaf sections. Include a parent only if its own body is directly relevant.
 - Include as few sections as possible. Quality over quantity.
 - Only return IDs you have seen in a prior observation. Do not invent IDs.
-- If nothing in the document is relevant, return done with an empty picked_ids array.`
+- When you finalise with 'done', attach a confidence score in [0.0, 1.0]
+  to every pick. Confidence reflects how directly the section answers
+  the query: 1.0 = near-certain, 0.0 = no signal. Use the full range —
+  do NOT score every pick at 1.0. If you genuinely cannot reason about
+  confidence, you may fall back to the legacy picked_ids array form.
+- If nothing in the document is relevant, return done with an empty
+  picks (or picked_ids) array.`
 
 // actionProtocolHelp is the one-shot reminder appended to the initial
 // user prompt so the model gets concrete examples of valid actions
 // without us needing to maintain a separate few-shot block.
+//
+// The 'done' action accepts EITHER picked_ids (legacy, no
+// confidence) OR picks (preferred, per-id confidence). Both shapes
+// are parsed by ParseAction. The confidence-bearing shape unlocks
+// abstention at the API layer.
 const actionProtocolHelp = `- {"action":"outline","level":2} — re-render the outline N levels deep
 - {"action":"expand","section_id":"sec_x"} — list immediate children of sec_x
 - {"action":"read","section_id":"sec_x"} — fetch the full body of sec_x
-- {"action":"done","picked_ids":["sec_x","sec_y"],"reasoning":"why"} — finalize
+- {"action":"done","picks":[{"id":"sec_x","confidence":0.8}],"reasoning":"why"} — finalize with per-pick confidence in [0.0, 1.0]
+- {"action":"done","picked_ids":["sec_x","sec_y"],"reasoning":"why"} — legacy fallback when you cannot reason about confidence
 
 Reply with ONLY the JSON object. No prose, no markdown fences.`
 
@@ -346,9 +403,17 @@ type Action struct {
 	// SectionID is the target of expand and read actions.
 	SectionID string `json:"section_id,omitempty"`
 
-	// PickedIDs is the final selection for a done action.
+	// PickedIDs is the legacy-shape final selection for a done action
+	// (no per-pick confidence). Either this or Picks may be set; if
+	// both are present, Picks wins.
 	PickedIDs []string `json:"picked_ids,omitempty"`
 
+	// Picks is the preferred final selection for a done action: each
+	// entry carries an ID + optional confidence in [0.0, 1.0]. When
+	// the model populates this, the strategy surfaces per-pick
+	// confidences on the returned Result.
+	Picks []selectionPick `json:"picks,omitempty"`
+
 	// Reasoning is an optional explanation accompanying done.
 	Reasoning string `json:"reasoning,omitempty"`
 }
diff --git a/pkg/retrieval/agentic_test.go b/pkg/retrieval/agentic_test.go
index b5226a1..50346ca 100644
--- a/pkg/retrieval/agentic_test.go
+++ b/pkg/retrieval/agentic_test.go
@@ -232,6 +232,66 @@ func TestAgenticBadJSONGraceful(t *testing.T) {
 	}
 }
 
+// TestAgenticDoneWithConfidences exercises the Phase 2.4 new-shape
+// done action: each pick carries a confidence in [0.0, 1.0] and the
+// resulting Result.Confidences map mirrors the picks. The strategy
+// itself never abstains; the API layer alone does that.
+func TestAgenticDoneWithConfidences(t *testing.T) {
+	t.Parallel()
+
+	tr := buildAgenticTree()
+	llm := &scriptedLLM{
+		replies: []string{
+			`{"action":"done","picks":[{"id":"sec_a1","confidence":0.85},{"id":"sec_b1","confidence":0.42}],"reasoning":"two-section answer"}`,
+		},
+	}
+	s := retrieval.NewAgentic(llm, mapFetcher{data: map[string]string{}})
+
+	res, err := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{MaxTokens: 100000})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(res.SelectedIDs) != 2 {
+		t.Fatalf("want 2 picks, got %v", res.SelectedIDs)
+	}
+	if res.Confidences == nil {
+		t.Fatal("Confidences must be populated when model returns picks")
+	}
+	if res.Confidences["sec_a1"] != 0.85 {
+		t.Errorf("sec_a1 = %v, want 0.85", res.Confidences["sec_a1"])
+	}
+	if res.Confidences["sec_b1"] != 0.42 {
+		t.Errorf("sec_b1 = %v, want 0.42", res.Confidences["sec_b1"])
+	}
+}
+
+// TestAgenticDoneLegacyShapeNoConfidences confirms the legacy
+// picked_ids shape continues to work — Confidences must stay nil so
+// the API layer treats this as "no confidence signal" and does not
+// fire abstention.
+func TestAgenticDoneLegacyShapeNoConfidences(t *testing.T) {
+	t.Parallel()
+
+	tr := buildAgenticTree()
+	llm := &scriptedLLM{
+		replies: []string{
+			`{"action":"done","picked_ids":["sec_a1","sec_b1"],"reasoning":"legacy"}`,
+		},
+	}
+	s := retrieval.NewAgentic(llm, mapFetcher{data: map[string]string{}})
+
+	res, err := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{MaxTokens: 100000})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(res.SelectedIDs) != 2 {
+		t.Fatalf("want 2 IDs, got %v", res.SelectedIDs)
+	}
+	if res.Confidences != nil {
+		t.Errorf("legacy picked_ids must NOT populate Confidences, got %v", res.Confidences)
+	}
+}
+
 // TestAgenticFiltersUnknownPicks mirrors single-pass: if the model
 // invents IDs not present in the tree, they must be dropped.
 func TestAgenticFiltersUnknownPicks(t *testing.T) {
diff --git a/pkg/retrieval/chunked_tree.go b/pkg/retrieval/chunked_tree.go
index e7f9f09..7fcd98b 100644
--- a/pkg/retrieval/chunked_tree.go
+++ b/pkg/retrieval/chunked_tree.go
@@ -71,8 +71,9 @@ func (c *ChunkedTree) SelectWithCost(ctx context.Context, t *tree.Tree, query st
 
 	sem := make(chan struct{}, maxPar)
 	type sliceResult struct {
-		ids   []tree.SectionID
-		usage Usage
+		ids         []tree.SectionID
+		confidences map[tree.SectionID]float64
+		usage       Usage
 	}
 	results := make([]sliceResult, len(slices))
 
@@ -89,12 +90,12 @@ func (c *ChunkedTree) SelectWithCost(ctx context.Context, t *tree.Tree, query st
 				return gctx.Err()
 			}
 
-			ids, usage, err := c.reasonOverSliceWithCost(gctx, sl, query, budget)
+			ids, confidences, usage, err := c.reasonOverSliceWithCost(gctx, sl, query, budget)
 			if err != nil {
 				return err
 			}
 			mu.Lock()
-			results[i] = sliceResult{ids: ids, usage: usage}
+			results[i] = sliceResult{ids: ids, confidences: confidences, usage: usage}
 			mu.Unlock()
 			return nil
 		})
@@ -107,14 +108,30 @@ func (c *ChunkedTree) SelectWithCost(ctx context.Context, t *tree.Tree, query st
 	// Merge IDs and aggregate costs.
 	allIDs := make([][]tree.SectionID, len(results))
 	var totalUsage Usage
+	// Union the per-slice confidence maps. When two slices both score
+	// the same ID (rare but possible if the splitter overlaps), we
+	// keep the higher confidence — the more confident slice has
+	// better signal about that section.
+	var mergedConfidences map[tree.SectionID]float64
 	for i, r := range results {
 		allIDs[i] = r.ids
 		totalUsage.Add(r.usage)
+		if len(r.confidences) > 0 {
+			if mergedConfidences == nil {
+				mergedConfidences = make(map[tree.SectionID]float64, len(r.confidences))
+			}
+			for id, conf := range r.confidences {
+				if existing, ok := mergedConfidences[id]; !ok || conf > existing {
+					mergedConfidences[id] = conf
+				}
+			}
+		}
 	}
 
 	selected := c.Merge.Merge(allIDs)
 	return &Result{
 		SelectedIDs: selected,
+		Confidences: filterConfidences(mergedConfidences, selected),
 		Usage:       totalUsage,
 		HopsTaken:   1,
 		TraceToken:  ComputeTraceToken(t.DocumentID, traceDocVersionV1, budget.ModelName, selected),
@@ -125,12 +142,14 @@ func (c *ChunkedTree) SelectWithCost(ctx context.Context, t *tree.Tree, query st
 // model picked, filtered against sl.Sections so a model can never fabricate
 // an ID that lives in a different slice.
 func (c *ChunkedTree) reasonOverSlice(ctx context.Context, sl Slice, query string, budget ContextBudget) ([]tree.SectionID, error) {
-	ids, _, err := c.reasonOverSliceWithCost(ctx, sl, query, budget)
+	ids, _, _, err := c.reasonOverSliceWithCost(ctx, sl, query, budget)
 	return ids, err
 }
 
-// reasonOverSliceWithCost is like reasonOverSlice but also returns usage.
-func (c *ChunkedTree) reasonOverSliceWithCost(ctx context.Context, sl Slice, query string, budget ContextBudget) ([]tree.SectionID, Usage, error) {
+// reasonOverSliceWithCost is like reasonOverSlice but also returns the
+// per-pick confidence map (nil when no confidence is available for the
+// returned IDs, e.g. the legacy response shape) and the usage spent.
+func (c *ChunkedTree) reasonOverSliceWithCost(ctx context.Context, sl Slice, query string, budget ContextBudget) ([]tree.SectionID, map[tree.SectionID]float64, Usage, error) {
 	prompt := BuildSelectionPrompt(sl.Breadcrumb, sl.Sections, sl.SiblingSummaries, query)
 
 	req := llmgate.Request{
@@ -145,11 +164,12 @@ func (c *ChunkedTree) reasonOverSliceWithCost(ctx context.Context, sl Slice, que
 		JSONSchema:  []byte(selectionJSONSchema),
 	}
 
-	ids, usage, err := runSelectionWithRetry(ctx, c.LLM, req, defaultSelectionRetries)
+	ids, confidences, usage, err := runSelectionWithRetry(ctx, c.LLM, req, defaultSelectionRetries)
 	if err != nil {
-		return nil, usage, err
+		return nil, nil, usage, err
 	}
-	return FilterKnownIDs(ids, sl.Sections), usage, nil
+	filtered := FilterKnownIDs(ids, sl.Sections)
+	return filtered, filterConfidences(confidences, filtered), usage, nil
 }
 
 // MergePolicy determines how per-slice ID lists are combined into a single
diff --git a/pkg/retrieval/retrieval_test.go b/pkg/retrieval/retrieval_test.go
index 9888c4b..f9538f3 100644
--- a/pkg/retrieval/retrieval_test.go
+++ b/pkg/retrieval/retrieval_test.go
@@ -181,10 +181,13 @@ func TestParseSelection(t *testing.T) {
 	}
 	for _, c := range cases {
 		t.Run(c.name, func(t *testing.T) {
-			got, err := retrieval.ParseSelection(c.in)
+			got, confidences, err := retrieval.ParseSelection(c.in)
 			if err != nil {
 				t.Fatal(err)
 			}
+			if confidences != nil {
+				t.Errorf("legacy-shape input must not populate confidences, got %v", confidences)
+			}
 			if len(got) != len(c.want) {
 				t.Fatalf("len: got %v want %v", got, c.want)
 			}
@@ -197,6 +200,109 @@ func TestParseSelection(t *testing.T) {
 	}
 }
 
+// TestParseSelectionNewShape exercises the Phase 2.4 picks shape:
+// each pick carries an id + confidence, the parser returns both the
+// id list and a confidence map.
+func TestParseSelectionNewShape(t *testing.T) {
+	raw := `{"picks":[{"id":"sec_a","confidence":0.82},{"id":"sec_b","confidence":0.31}],"reasoning":"x"}`
+	ids, confidences, err := retrieval.ParseSelection(raw)
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	if len(ids) != 2 || ids[0] != "sec_a" || ids[1] != "sec_b" {
+		t.Fatalf("ids: got %v want [sec_a sec_b]", ids)
+	}
+	if confidences == nil {
+		t.Fatal("confidences must be populated for new-shape response")
+	}
+	if got := confidences["sec_a"]; got != 0.82 {
+		t.Errorf("sec_a confidence = %v, want 0.82", got)
+	}
+	if got := confidences["sec_b"]; got != 0.31 {
+		t.Errorf("sec_b confidence = %v, want 0.31", got)
+	}
+}
+
+// TestParseSelectionMixedShape covers a partially-populated new-shape
+// response: some picks have confidence, others don't. The confidence
+// map only surfaces IDs whose confidence was actually present —
+// missing entries are NOT defaulted to 0 (which would force
+// abstention) or to 1 (which would suppress it).
+func TestParseSelectionMixedShape(t *testing.T) {
+	raw := `{"picks":[{"id":"sec_a","confidence":0.9},{"id":"sec_b"},{"id":"sec_c","confidence":0.4}]}`
+	ids, confidences, err := retrieval.ParseSelection(raw)
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	if len(ids) != 3 {
+		t.Fatalf("ids: got %v, want 3 picks", ids)
+	}
+	if _, present := confidences["sec_a"]; !present {
+		t.Error("sec_a should have confidence")
+	}
+	if _, present := confidences["sec_b"]; present {
+		t.Error("sec_b should NOT have confidence (model omitted it)")
+	}
+	if _, present := confidences["sec_c"]; !present {
+		t.Error("sec_c should have confidence")
+	}
+	if confidences["sec_a"] != 0.9 || confidences["sec_c"] != 0.4 {
+		t.Errorf("confidences = %v", confidences)
+	}
+}
+
+// TestParseSelectionClampsConfidence asserts confidences outside
+// [0.0, 1.0] are clamped — defence-in-depth against a model that
+// returns 1.5 or -0.2 despite the prompt's range.
+func TestParseSelectionClampsConfidence(t *testing.T) {
+	raw := `{"picks":[{"id":"sec_a","confidence":1.7},{"id":"sec_b","confidence":-0.3}]}`
+	_, confidences, err := retrieval.ParseSelection(raw)
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	if confidences["sec_a"] != 1.0 {
+		t.Errorf("sec_a clamped: want 1.0, got %v", confidences["sec_a"])
+	}
+	if confidences["sec_b"] != 0.0 {
+		t.Errorf("sec_b clamped: want 0.0, got %v", confidences["sec_b"])
+	}
+}
+
+// TestParseSelectionPicksDedup ensures duplicate IDs in `picks` are
+// deduplicated (first-seen wins) so the strategy doesn't double-count
+// a section the model accidentally listed twice.
+func TestParseSelectionPicksDedup(t *testing.T) {
+	raw := `{"picks":[{"id":"sec_a","confidence":0.7},{"id":"sec_a","confidence":0.2}]}`
+	ids, confidences, err := retrieval.ParseSelection(raw)
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	if len(ids) != 1 || ids[0] != "sec_a" {
+		t.Fatalf("ids: got %v want [sec_a]", ids)
+	}
+	if confidences["sec_a"] != 0.7 {
+		t.Errorf("first-seen confidence should win: got %v want 0.7", confidences["sec_a"])
+	}
+}
+
+// TestParseSelectionNewShapeNoConfidences covers a new-shape response
+// where the model returned `picks` but stamped no confidence values
+// at all — must be treated as legacy (nil confidences) so the API
+// layer does NOT abstain on a confidence signal that isn't there.
+func TestParseSelectionNewShapeNoConfidences(t *testing.T) {
+	raw := `{"picks":[{"id":"sec_a"},{"id":"sec_b"}]}`
+	ids, confidences, err := retrieval.ParseSelection(raw)
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	if len(ids) != 2 {
+		t.Fatalf("ids: got %v, want 2", ids)
+	}
+	if confidences != nil {
+		t.Errorf("missing confidences must surface as nil map, got %v", confidences)
+	}
+}
+
 func TestChunkedTreeSinglesliceWhenItFits(t *testing.T) {
 	tr := buildTree()
 	m := &mockLLM{pickIfPresent: []tree.SectionID{"sec_a", "sec_b"}}
@@ -317,6 +423,103 @@ func TestDefaultSplitterFastPath(t *testing.T) {
 	}
 }
 
+// TestSinglePassReturnsConfidences asserts that a new-shape LLM
+// response with confidence scores surfaces a populated Confidences
+// map on the strategy's Result. The strategy itself never abstains —
+// even when every confidence is below the typical 0.4 threshold the
+// IDs still come back and the API layer decides what to do.
+func TestSinglePassReturnsConfidences(t *testing.T) {
+	tr := buildTree()
+	m := &mockLLM{reply: `{"picks":[{"id":"sec_a","confidence":0.78},{"id":"sec_b","confidence":0.12}],"reasoning":"x"}`}
+	s := retrieval.NewSinglePass(m)
+
+	res, err := s.SelectWithCost(context.Background(), tr, "q",
+		retrieval.ContextBudget{ModelName: "model", MaxTokens: 1000})
+	if err != nil {
+		t.Fatalf("select: %v", err)
+	}
+	if len(res.SelectedIDs) != 2 {
+		t.Fatalf("want 2 IDs, got %v", res.SelectedIDs)
+	}
+	if res.Confidences == nil {
+		t.Fatal("Confidences should be populated for new-shape response")
+	}
+	if got := res.Confidences["sec_a"]; got != 0.78 {
+		t.Errorf("sec_a confidence = %v, want 0.78", got)
+	}
+	if got := res.Confidences["sec_b"]; got != 0.12 {
+		t.Errorf("sec_b confidence = %v, want 0.12", got)
+	}
+}
+
+// TestSinglePassAllLowConfidencesStillReturnsIDs is the abstention
+// smoke contract from the spec: the strategy itself never abstains.
+// Even when every confidence is below 0.4 the IDs come back. The
+// API layer is the only place that may convert "all low" into an
+// abstention.
+func TestSinglePassAllLowConfidencesStillReturnsIDs(t *testing.T) {
+	tr := buildTree()
+	m := &mockLLM{reply: `{"picks":[{"id":"sec_a","confidence":0.1},{"id":"sec_b","confidence":0.2}]}`}
+	s := retrieval.NewSinglePass(m)
+
+	res, err := s.SelectWithCost(context.Background(), tr, "q",
+		retrieval.ContextBudget{ModelName: "model", MaxTokens: 1000})
+	if err != nil {
+		t.Fatalf("select: %v", err)
+	}
+	if len(res.SelectedIDs) != 2 {
+		t.Fatalf("strategy must return IDs even with low confidences, got %v", res.SelectedIDs)
+	}
+	if len(res.Confidences) != 2 {
+		t.Errorf("Confidences should mirror SelectedIDs, got %v", res.Confidences)
+	}
+}
+
+// TestSinglePassLegacyShapeNoConfidences confirms that the legacy
+// response shape continues to work after the new-shape refactor.
+// Critically, Confidences stays nil so the API layer does not abstain.
+func TestSinglePassLegacyShapeNoConfidences(t *testing.T) {
+	tr := buildTree()
+	m := &mockLLM{reply: `{"selected_section_ids":["sec_a","sec_b"],"reasoning":"x"}`}
+	s := retrieval.NewSinglePass(m)
+
+	res, err := s.SelectWithCost(context.Background(), tr, "q",
+		retrieval.ContextBudget{ModelName: "model", MaxTokens: 1000})
+	if err != nil {
+		t.Fatalf("select: %v", err)
+	}
+	if len(res.SelectedIDs) != 2 {
+		t.Fatalf("legacy response shape must still work, got %v", res.SelectedIDs)
+	}
+	if res.Confidences != nil {
+		t.Errorf("legacy response must NOT populate Confidences, got %v", res.Confidences)
+	}
+}
+
+// TestChunkedTreeMergesConfidences verifies the chunked-tree strategy
+// surfaces confidences in the merged Result. Because the test tree
+// is small enough to fit in one slice, this is effectively a single
+// slice union — but the field still has to round-trip through the
+// per-slice plumbing.
+func TestChunkedTreeMergesConfidences(t *testing.T) {
+	tr := buildTree()
+	m := &mockLLM{reply: `{"picks":[{"id":"sec_a","confidence":0.6},{"id":"sec_c","confidence":0.9}]}`}
+	s := retrieval.NewChunkedTree(m)
+
+	res, err := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{
+		ModelName: "model", MaxTokens: 100000, MaxParallelCalls: 4,
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(res.Confidences) != 2 {
+		t.Fatalf("Confidences should carry both picks, got %v", res.Confidences)
+	}
+	if res.Confidences["sec_a"] != 0.6 || res.Confidences["sec_c"] != 0.9 {
+		t.Errorf("confidences = %v", res.Confidences)
+	}
+}
+
 // TestSinglePassStampsTraceToken verifies that SelectWithCost
 // populates a 64-char hex TraceToken on the returned Result.
 func TestSinglePassStampsTraceToken(t *testing.T) {
diff --git a/pkg/retrieval/single_pass.go b/pkg/retrieval/single_pass.go
index 95ab61c..83814cd 100644
--- a/pkg/retrieval/single_pass.go
+++ b/pkg/retrieval/single_pass.go
@@ -60,14 +60,16 @@ func (s *SinglePass) SelectWithCost(ctx context.Context, t *tree.Tree, query str
 		JSONSchema:  []byte(selectionJSONSchema),
 	}
 
-	ids, usage, err := runSelectionWithRetry(ctx, s.LLM, req, defaultSelectionRetries)
+	ids, confidences, usage, err := runSelectionWithRetry(ctx, s.LLM, req, defaultSelectionRetries)
 	if err != nil {
 		return nil, fmt.Errorf("single-pass llm call: %w", err)
 	}
 
 	selected := FilterKnownIDs(ids, view.Sections)
+	filteredConfidences := filterConfidences(confidences, selected)
 	return &Result{
 		SelectedIDs: selected,
+		Confidences: filteredConfidences,
 		ModelUsed:   model,
 		Usage:       usage,
 		HopsTaken:   1,
@@ -75,6 +77,27 @@ func (s *SinglePass) SelectWithCost(ctx context.Context, t *tree.Tree, query str
 	}, nil
 }
 
+// filterConfidences keeps only entries whose key appears in keep, so a
+// strategy never surfaces a confidence for an ID it didn't ultimately
+// select (post-filter / post-merge). Returns nil when src is nil or
+// empty, or when no entry survives filtering — preserving the "no
+// confidence signal" distinction the API layer relies on for abstention.
+func filterConfidences(src map[tree.SectionID]float64, keep []tree.SectionID) map[tree.SectionID]float64 {
+	if len(src) == 0 {
+		return nil
+	}
+	out := make(map[tree.SectionID]float64, len(keep))
+	for _, id := range keep {
+		if v, ok := src[id]; ok {
+			out[id] = v
+		}
+	}
+	if len(out) == 0 {
+		return nil
+	}
+	return out
+}
+
 // traceDocVersionV1 is the placeholder document version used by every
 // strategy until Phase 3.2 wires real per-document versioning. Defined
 // once so the bump is a one-line change.
@@ -88,11 +111,13 @@ const defaultSelectionRetries = 2
 
 // runSelectionWithRetry runs a selection LLM call and parses the response,
 // retrying up to maxRetries additional times if the model returns something
-// that doesn't parse as JSON. Returns the parsed IDs and the cumulative usage
-// across all attempts. An error is returned only on a transport/LLM failure —
-// final parse failure degrades gracefully to an empty selection (logged) so a
-// single LLM-formatting blip doesn't 500 the entire query.
-func runSelectionWithRetry(ctx context.Context, client llmgate.Client, baseReq llmgate.Request, maxRetries int) ([]tree.SectionID, Usage, error) {
+// that doesn't parse as JSON. Returns the parsed IDs, per-ID confidences
+// (nil when the model returned the legacy shape without confidence), and
+// the cumulative usage across all attempts. An error is returned only on a
+// transport/LLM failure — final parse failure degrades gracefully to an
+// empty selection (logged) so a single LLM-formatting blip doesn't 500
+// the entire query.
+func runSelectionWithRetry(ctx context.Context, client llmgate.Client, baseReq llmgate.Request, maxRetries int) ([]tree.SectionID, map[tree.SectionID]float64, Usage, error) {
 	if maxRetries < 0 {
 		maxRetries = 0
 	}
@@ -114,7 +139,7 @@ func runSelectionWithRetry(ctx context.Context, client llmgate.Client, baseReq l
 		}
 		resp, err := client.Complete(ctx, req)
 		if err != nil {
-			return nil, totalUsage, err
+			return nil, nil, totalUsage, err
 		}
 		totalUsage.Add(Usage{
 			InputTokens:  resp.Usage.InputTokens,
@@ -123,14 +148,14 @@ func runSelectionWithRetry(ctx context.Context, client llmgate.Client, baseReq l
 			CostUSD:      resp.Usage.CostUSD,
 			LLMCalls:     1,
 		})
-		ids, parseErr := ParseSelection(resp.Content)
+		ids, confidences, parseErr := ParseSelection(resp.Content)
 		if parseErr == nil {
-			return ids, totalUsage, nil
+			return ids, confidences, totalUsage, nil
 		}
 		lastParseErr = parseErr
 	}
 	log.Printf("retrieval: selection parse failed after %d attempts (%v); returning empty selection", maxRetries+1, lastParseErr)
-	return nil, totalUsage, nil
+	return nil, nil, totalUsage, nil
 }
 
 // --- shared prompt scaffolding ---
@@ -141,15 +166,36 @@ Rules:
 - Prefer leaf sections. Include a parent only if the parent's own body is directly relevant.
 - Include as few sections as possible. Quality over quantity.
 - Only return IDs present in the provided outline. Do not invent IDs.
-- If nothing is relevant, return an empty list.`
+- If nothing is relevant, return an empty list.
+- Attach a confidence score in [0.0, 1.0] to every pick reflecting how
+  likely that section's body answers the query. Use the full range —
+  do NOT score every pick at 1.0. 0.0 means "no signal", 1.0 means
+  "near-certain". If you cannot reason about confidence at all, omit
+  the picks array and return the legacy selected_section_ids form
+  instead; the engine accepts both shapes.`
 
+// selectionJSONSchema is intentionally permissive: it accepts EITHER the
+// legacy { selected_section_ids: [...] } shape OR the new
+// { picks: [{id, confidence}] } shape so older / weaker models that
+// can't reason about confidence still work. ParseSelection accepts
+// both and returns confidences when present.
 const selectionJSONSchema = `{
   "type": "object",
   "properties": {
+    "picks": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "id": {"type": "string"},
+          "confidence": {"type": "number", "minimum": 0, "maximum": 1}
+        },
+        "required": ["id"]
+      }
+    },
     "selected_section_ids": {"type": "array", "items": {"type": "string"}},
     "reasoning": {"type": "string"}
-  },
-  "required": ["selected_section_ids"]
+  }
 }`
 
 // BuildSelectionPrompt renders the user-side prompt for a selection call.
@@ -174,7 +220,11 @@ func BuildSelectionPrompt(breadcrumb string, sections []tree.SectionView, siblin
 	}
 	b.WriteString("\nUser query:\n")
 	b.WriteString(query)
-	b.WriteString("\n\nReturn a JSON object with fields `selected_section_ids` (array of strings) and `reasoning` (string).")
+	b.WriteString("\n\nReturn a JSON object. Preferred shape:\n")
+	b.WriteString(`  {"picks": [{"id": "sec_x", "confidence": 0.82}, ...], "reasoning": "..."}` + "\n")
+	b.WriteString("confidence is a float in [0.0, 1.0] reflecting how likely the section's body answers the query. Use the full range; do not score every pick at 1.0.\n")
+	b.WriteString("Fallback shape (use ONLY if you cannot reason about confidence):\n")
+	b.WriteString(`  {"selected_section_ids": ["sec_x", ...], "reasoning": "..."}`)
 	return b.String()
 }
 
@@ -229,18 +279,50 @@ func firstCandidateQuestion(qs []string) string {
 	return ""
 }
 
-// selectionPayload is the expected JSON-mode shape.
+// selectionPick is one entry in the new-shape selection response. The
+// `Confidence` field is a pointer so we can distinguish "model
+// returned 0.0" from "model omitted the field" — the latter means
+// "no signal for this pick" and skips the abstention check.
+type selectionPick struct {
+	ID         string   `json:"id"`
+	Confidence *float64 `json:"confidence,omitempty"`
+}
+
+// selectionPayload accepts both response shapes:
+//
+//   - New shape (preferred): {"picks": [{"id": "...", "confidence": 0.8}], ...}
+//   - Legacy shape: {"selected_section_ids": ["..."], ...}
+//
+// When `Picks` is non-empty it wins; otherwise `SelectedSectionIDs`
+// is used. This keeps backward compatibility with older models that
+// can't reason about confidence (or with the legacy schema enforced
+// by some provider integrations).
 type selectionPayload struct {
-	SelectedSectionIDs []string `json:"selected_section_ids"`
-	Reasoning          string   `json:"reasoning"`
+	Picks              []selectionPick `json:"picks"`
+	SelectedSectionIDs []string        `json:"selected_section_ids"`
+	Reasoning          string          `json:"reasoning"`
 }
 
-// ParseSelection extracts the section-ID list from an LLM JSON response.
-// Tolerates code-fence wrappers and leading/trailing prose.
-func ParseSelection(raw string) ([]tree.SectionID, error) {
+// ParseSelection extracts the section-ID list and (when present) per-ID
+// confidence scores from an LLM JSON response. Tolerates code-fence
+// wrappers and leading/trailing prose.
+//
+// Returns:
+//
+//   - ids:         the section IDs the model picked, in the order the
+//                  model returned them.
+//   - confidences: map[id]float64 of per-pick confidences in [0.0, 1.0],
+//                  populated only when the model returned the new-shape
+//                  `picks` array. Returns nil (not an empty map) when
+//                  the response was the legacy shape OR when every pick
+//                  omitted its confidence — the distinction matters for
+//                  abstention, which fires only when confidence signal
+//                  is explicitly present.
+//   - err:         non-nil only when the JSON cannot be decoded at all.
+func ParseSelection(raw string) ([]tree.SectionID, map[tree.SectionID]float64, error) {
 	raw = strings.TrimSpace(raw)
 	if raw == "" {
-		return nil, nil
+		return nil, nil, nil
 	}
 	// Strip ```json ... ``` fences if present.
 	if strings.HasPrefix(raw, "```") {
@@ -260,8 +342,51 @@ func ParseSelection(raw string) ([]tree.SectionID, error) {
 
 	var p selectionPayload
 	if err := json.Unmarshal([]byte(raw), &p); err != nil {
-		return nil, fmt.Errorf("unmarshal selection: %w", err)
+		return nil, nil, fmt.Errorf("unmarshal selection: %w", err)
+	}
+
+	// New shape wins. Even a single populated `picks` entry means the
+	// model attempted to follow the confidence protocol, so we honour
+	// it. Mixed responses (some picks with confidence, some without)
+	// surface only the present confidences — the missing ones are
+	// silently dropped from the confidence map, NOT defaulted to 0.
+	if len(p.Picks) > 0 {
+		ids := make([]tree.SectionID, 0, len(p.Picks))
+		confidences := make(map[tree.SectionID]float64, len(p.Picks))
+		seen := make(map[tree.SectionID]struct{}, len(p.Picks))
+		for _, pk := range p.Picks {
+			id := strings.TrimSpace(pk.ID)
+			if id == "" {
+				continue
+			}
+			sid := tree.SectionID(id)
+			if _, dup := seen[sid]; dup {
+				continue
+			}
+			seen[sid] = struct{}{}
+			ids = append(ids, sid)
+			if pk.Confidence != nil {
+				c := *pk.Confidence
+				// Clamp into [0, 1]. The model is instructed to stay
+				// in range; clamping is a defence-in-depth so a
+				// runaway value never poisons the abstention check.
+				if c < 0 {
+					c = 0
+				} else if c > 1 {
+					c = 1
+				}
+				confidences[sid] = c
+			}
+		}
+		if len(confidences) == 0 {
+			// New-shape response but no confidences populated → treat
+			// as legacy for abstention purposes.
+			confidences = nil
+		}
+		return ids, confidences, nil
 	}
+
+	// Legacy shape.
 	out := make([]tree.SectionID, 0, len(p.SelectedSectionIDs))
 	for _, id := range p.SelectedSectionIDs {
 		id = strings.TrimSpace(id)
@@ -269,7 +394,7 @@ func ParseSelection(raw string) ([]tree.SectionID, error) {
 			out = append(out, tree.SectionID(id))
 		}
 	}
-	return out, nil
+	return out, nil, nil
 }
 
 // FilterKnownIDs drops any IDs not present in the supplied section views and
diff --git a/pkg/retrieval/strategy.go b/pkg/retrieval/strategy.go
index 3edae3c..8f428f6 100644
--- a/pkg/retrieval/strategy.go
+++ b/pkg/retrieval/strategy.go
@@ -61,9 +61,20 @@ func (b ContextBudget) Available() int {
 // reasoning trace and cost accounting when the strategy supports it.
 type Result struct {
 	SelectedIDs []tree.SectionID `json:"selected_ids"`
-	Reasoning   string           `json:"reasoning,omitempty"`
-	ModelUsed   string           `json:"model_used,omitempty"`
-	Usage       Usage            `json:"usage"`
+
+	// Confidences carries per-pick relevance confidence in [0.0, 1.0]
+	// when the selection LLM returned the new-shape response with
+	// explicit confidence scores. Keys are restricted to IDs present in
+	// SelectedIDs (post-filter / post-merge). Nil when no confidence
+	// signal was present — either the legacy response shape was used or
+	// the model did not populate any confidence value. The API layer's
+	// abstention check fires only when this map is non-empty (see
+	// internal/api.handleQuery / handleAnswer).
+	Confidences map[tree.SectionID]float64 `json:"confidences,omitempty"`
+
+	Reasoning string `json:"reasoning,omitempty"`
+	ModelUsed string `json:"model_used,omitempty"`
+	Usage     Usage  `json:"usage"`
 
 	// HopsTaken is the number of LLM turns the strategy issued to reach the
 	// final selection. Single-shot strategies set this to 1; iterative

From 5d0a5f7f72da7697950089cc32c426bc0b529554 Mon Sep 17 00:00:00 2001
From: Halleluyah Oludele <halleluyaholudele@gmail.com>
Date: Wed, 27 May 2026 03:10:37 +0100
Subject: [PATCH 2/3] feat(config): retrieval.abstain block +
 VLE_RETRIEVAL_ABSTAIN_* env overrides

AbstainBlock carries Enabled + Below (the [0.0, 1.0] confidence
threshold below which picks count as "not confident"). When the
selection LLM returns explicit per-pick confidence and EVERY pick
falls below Below, the API layer surfaces an abstention response
instead of pretending the document held an answer.

Defaults: Enabled=true (opt-out), Below=0.4. Env overrides:
VLE_RETRIEVAL_ABSTAIN_ENABLED (truthy/falsy), VLE_RETRIEVAL_ABSTAIN_BELOW
(float in [0,1]). Validation rejects out-of-range Below values; bad
env strings preserve the default rather than zeroing the field.

Tests cover defaults, env overrides (enable/disable/parse), edge
cases (0.0, 1.0 inclusive), bad-input rejection, and validation.
---
 config.example.yaml       |  28 ++++++++++
 pkg/config/config.go      |  57 +++++++++++++++++++
 pkg/config/config_test.go | 115 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 200 insertions(+)

diff --git a/config.example.yaml b/config.example.yaml
index ee1b9c4..8b344c9 100644
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -182,6 +182,34 @@ retrieval:
     # re-rank pass to do the final selection.
     top_k: 0
 
+  # abstain: Phase 2.4 abstention. When the selection LLM returns
+  # per-pick confidence scores (the new picks shape) and every
+  # confidence falls below `below`, /v1/query and /v1/answer skip the
+  # normal path and return an abstention response instead:
+  #   {abstained: true, abstention_reason: "...", sections: [],
+  #    min_confidence_threshold: 0.4, candidate_confidences: {...}}
+  # For /v1/answer the synthesis call is skipped entirely; the answer
+  # is the honest "I cannot answer this question from the supplied
+  # document." This trades a likely hallucination for a clear refusal
+  # when the engine's own confidence is weak.
+  #
+  # OPT-OUT. Default enabled. Per-request `enable_abstain` body field
+  # overrides this block. When the selection LLM returns the legacy
+  # shape (no confidence scores) the engine never abstains regardless
+  # of this setting — abstention requires explicit confidence signal.
+  #
+  # The check is "all picks below threshold". If any pick scored at
+  # or above it, the engine surfaces that section as evidence —
+  # abstention is reserved for the case where every candidate is weak.
+  abstain:
+    enabled: true
+    # Confidence threshold in [0.0, 1.0]. Picks with confidence
+    # strictly less than this are "not confident"; when ALL picks
+    # fall below, the response is an abstention. 0.4 is the default
+    # — high enough to filter weak matches, low enough not to
+    # suppress legitimate partial answers.
+    below: 0.4
+
   # replay: Phase 3.1 reproducibility store. Every /v1/query and
   # /v1/answer response carries a deterministic `trace_token`; the
   # response body is stored in an in-memory LRU under that token so
diff --git a/pkg/config/config.go b/pkg/config/config.go
index a640319..3a8dca0 100644
--- a/pkg/config/config.go
+++ b/pkg/config/config.go
@@ -254,6 +254,42 @@ type RetrievalConfig struct {
 	Planning    PlanningBlock    `yaml:"planning"`
 	ReRank      ReRankBlock      `yaml:"rerank"`
 	Replay      ReplayBlock      `yaml:"replay"`
+	Abstain     AbstainBlock     `yaml:"abstain"`
+}
+
+// AbstainBlock configures the Phase 2.4 abstention behaviour.
+//
+// When the selection LLM returns per-pick confidence scores and every
+// confidence is below Below, the API layer (handleQuery /
+// handleAnswer) replaces the normal response with an abstention:
+// sections is empty and abstained=true. This refuses to ground an
+// answer in evidence the model itself isn't confident is relevant,
+// converting a likely hallucination into an honest "I don't know".
+//
+// Abstention fires only when explicit confidence signal is present.
+// Legacy-shape responses (no confidences) always fall through to the
+// normal path — the engine never abstains on the absence of signal.
+//
+// Per-request override: callers may set `enable_abstain` on the
+// /v1/query or /v1/answer body to opt in to or out of abstention
+// for one request without restarting the server. The body field
+// takes precedence over Enabled in both directions; Enabled is
+// consulted only when the request omits the field.
+type AbstainBlock struct {
+	// Enabled toggles abstention at the server level. Default: true
+	// (opt-out).
+	Enabled bool `yaml:"enabled"`
+
+	// Below is the confidence threshold. Picks with confidence
+	// strictly less than Below are "not confident"; when ALL picks
+	// fall below this threshold the response is an abstention.
+	// Default: 0.4.
+	//
+	// The "all" semantics (vs "any") is deliberate: if even one
+	// section scored at or above the threshold, the engine has
+	// enough signal to surface it as evidence. Abstention is
+	// reserved for the case where every candidate is weak.
+	Below float64 `yaml:"below"`
 }
 
 // ReplayBlock configures the Phase 3.1 replay-trace store.
@@ -489,6 +525,10 @@ func Default() Config {
 				MaxEntries: 1024,
 				TTLSeconds: 86400,
 			},
+			Abstain: AbstainBlock{
+				Enabled: true,
+				Below:   0.4,
+			},
 		},
 		Ingest: IngestConfig{
 			GlobalLLMConcurrency: 12,
@@ -748,6 +788,19 @@ func applyEnvOverrides(c *Config) {
 			c.Retrieval.Replay.TTLSeconds = n
 		}
 	}
+	if v := os.Getenv("VLE_RETRIEVAL_ABSTAIN_ENABLED"); v != "" {
+		switch strings.ToLower(strings.TrimSpace(v)) {
+		case "1", "true", "yes", "on":
+			c.Retrieval.Abstain.Enabled = true
+		case "0", "false", "no", "off":
+			c.Retrieval.Abstain.Enabled = false
+		}
+	}
+	if v := os.Getenv("VLE_RETRIEVAL_ABSTAIN_BELOW"); v != "" {
+		if f, err := strconv.ParseFloat(strings.TrimSpace(v), 64); err == nil && f >= 0 && f <= 1 {
+			c.Retrieval.Abstain.Below = f
+		}
+	}
 }
 
 // Validate checks that required fields for the selected drivers are set.
@@ -867,5 +920,9 @@ func (c Config) Validate() error {
 		return fmt.Errorf("retrieval.replay.ttl_seconds must be >= 0, got %d", c.Retrieval.Replay.TTLSeconds)
 	}
 
+	if c.Retrieval.Abstain.Below < 0 || c.Retrieval.Abstain.Below > 1 {
+		return fmt.Errorf("retrieval.abstain.below must be in [0.0, 1.0], got %v", c.Retrieval.Abstain.Below)
+	}
+
 	return nil
 }
diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go
index f71ad41..2a5c212 100644
--- a/pkg/config/config_test.go
+++ b/pkg/config/config_test.go
@@ -64,11 +64,126 @@ func TestDefaultValues(t *testing.T) {
 	if cfg.Retrieval.Replay.TTLSeconds != 86400 {
 		t.Errorf("retrieval.replay.ttl_seconds = %d, want 86400 (24h)", cfg.Retrieval.Replay.TTLSeconds)
 	}
+	if !cfg.Retrieval.Abstain.Enabled {
+		t.Error("retrieval.abstain.enabled should default to true (opt-out)")
+	}
+	if cfg.Retrieval.Abstain.Below != 0.4 {
+		t.Errorf("retrieval.abstain.below = %v, want 0.4", cfg.Retrieval.Abstain.Below)
+	}
 	if cfg.Log.Level != "info" {
 		t.Errorf("log.level = %q, want info", cfg.Log.Level)
 	}
 }
 
+func TestAbstainEnvOverride(t *testing.T) {
+	// Mutates env — restore on exit. Not parallel.
+	prevEnabled := os.Getenv("VLE_RETRIEVAL_ABSTAIN_ENABLED")
+	prevBelow := os.Getenv("VLE_RETRIEVAL_ABSTAIN_BELOW")
+	defer func() {
+		os.Setenv("VLE_RETRIEVAL_ABSTAIN_ENABLED", prevEnabled)
+		os.Setenv("VLE_RETRIEVAL_ABSTAIN_BELOW", prevBelow)
+	}()
+
+	os.Setenv("VLE_RETRIEVAL_ABSTAIN_ENABLED", "false")
+	os.Setenv("VLE_RETRIEVAL_ABSTAIN_BELOW", "0.6")
+
+	cfg := Default()
+	applyEnvOverrides(&cfg)
+
+	if cfg.Retrieval.Abstain.Enabled {
+		t.Error("VLE_RETRIEVAL_ABSTAIN_ENABLED=false should disable abstention")
+	}
+	if cfg.Retrieval.Abstain.Below != 0.6 {
+		t.Errorf("VLE_RETRIEVAL_ABSTAIN_BELOW=0.6 not applied, got %v", cfg.Retrieval.Abstain.Below)
+	}
+}
+
+func TestAbstainEnvOverrideEnable(t *testing.T) {
+	// Toggle on via env from an explicitly-disabled starting state.
+	prev := os.Getenv("VLE_RETRIEVAL_ABSTAIN_ENABLED")
+	defer os.Setenv("VLE_RETRIEVAL_ABSTAIN_ENABLED", prev)
+
+	cfg := Default()
+	cfg.Retrieval.Abstain.Enabled = false
+	os.Setenv("VLE_RETRIEVAL_ABSTAIN_ENABLED", "true")
+	applyEnvOverrides(&cfg)
+	if !cfg.Retrieval.Abstain.Enabled {
+		t.Error("VLE_RETRIEVAL_ABSTAIN_ENABLED=true should enable abstention even when previously disabled")
+	}
+}
+
+// TestAbstainEnvOverrideRejectsBad asserts a garbage float and an
+// out-of-range value both preserve the default rather than silently
+// zeroing or accepting a value that would break the abstention check
+// (Below must be in [0,1]).
+func TestAbstainEnvOverrideRejectsBad(t *testing.T) {
+	prev := os.Getenv("VLE_RETRIEVAL_ABSTAIN_BELOW")
+	defer os.Setenv("VLE_RETRIEVAL_ABSTAIN_BELOW", prev)
+
+	cases := []string{"not-a-float", "1.5", "-0.1", "abc"}
+	for _, v := range cases {
+		os.Setenv("VLE_RETRIEVAL_ABSTAIN_BELOW", v)
+		cfg := Default()
+		applyEnvOverrides(&cfg)
+		if cfg.Retrieval.Abstain.Below != 0.4 {
+			t.Errorf("bad ABSTAIN_BELOW=%q should preserve default 0.4, got %v",
+				v, cfg.Retrieval.Abstain.Below)
+		}
+	}
+}
+
+// TestAbstainEnvOverrideParsesEdgeCases covers 0.0 and 1.0 (the
+// inclusive bounds) and an interior value (0.5, chosen to differ
+// from the 0.4 default) — these must all be accepted.
+func TestAbstainEnvOverrideParsesEdgeCases(t *testing.T) {
+	prev := os.Getenv("VLE_RETRIEVAL_ABSTAIN_BELOW")
+	defer os.Setenv("VLE_RETRIEVAL_ABSTAIN_BELOW", prev)
+
+	cases := map[string]float64{
+		"0":   0.0,
+		"0.0": 0.0,
+		"1":   1.0,
+		"1.0": 1.0,
+		"0.5": 0.5,
+	}
+	for raw, want := range cases {
+		os.Setenv("VLE_RETRIEVAL_ABSTAIN_BELOW", raw)
+		cfg := Default()
+		applyEnvOverrides(&cfg)
+		if cfg.Retrieval.Abstain.Below != want {
+			t.Errorf("ABSTAIN_BELOW=%q: got %v want %v", raw, cfg.Retrieval.Abstain.Below, want)
+		}
+	}
+}
+
+// TestValidateAbstainOutOfRange asserts Validate rejects out-of-range
+// Below values. The env-override path silently drops them, but a YAML
+// file or explicit struct edit can still land a bad value here.
+func TestValidateAbstainOutOfRange(t *testing.T) {
+	t.Parallel()
+
+	cfg := Default()
+	cfg.Database.URL = "postgres://localhost/test"
+	cfg.Retrieval.Abstain.Below = 1.5
+	if err := cfg.Validate(); err == nil {
+		t.Error("abstain.below=1.5 should fail validation")
+	}
+
+	cfg2 := Default()
+	cfg2.Database.URL = "postgres://localhost/test"
+	cfg2.Retrieval.Abstain.Below = -0.1
+	if err := cfg2.Validate(); err == nil {
+		t.Error("abstain.below=-0.1 should fail validation")
+	}
+
+	cfg3 := Default()
+	cfg3.Database.URL = "postgres://localhost/test"
+	cfg3.Retrieval.Abstain.Below = 0.0
+	if err := cfg3.Validate(); err != nil {
+		t.Errorf("abstain.below=0.0 should pass validation, got %v", err)
+	}
+}
+
 func TestReplayEnvOverride(t *testing.T) {
 	// Not parallel — mutates env. Restore on exit.
 	prevEnabled := os.Getenv("VLE_RETRIEVAL_REPLAY_ENABLED")

From 666a74b82475505c7f2da80e4cd5213ba72baed3 Mon Sep 17 00:00:00 2001
From: Halleluyah Oludele <halleluyaholudele@gmail.com>
Date: Wed, 27 May 2026 03:20:50 +0100
Subject: [PATCH 3/3] feat(api): confidence-driven abstention on /v1/query and
 /v1/answer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the selection LLM returns per-pick confidences and every pick
falls strictly below retrieval.abstain.below (default 0.4), the API
layer skips the normal path and returns an abstention response:

  /v1/query  → sections: [], abstained: true,
               abstention_reason, min_confidence_threshold,
               candidate_confidences
  /v1/answer → answer: "I cannot answer this question from the
               supplied document.", citations: [],
               same abstention fields, synthesis LLM call skipped
               entirely (planning + retrieval usage carried through)

The "all picks below" semantics is deliberate: if even one section
scored at-or-above the threshold the engine surfaces it as evidence.
Abstention is reserved for the case where every candidate is weak.

Abstention requires explicit confidence signal — legacy-shape LLM
responses (no confidence map) always fall through to the normal
path. Per-request `enable_abstain` body field overrides the server
config; opt out globally via retrieval.abstain.enabled: false.

Other changes:
- Result.Confidences threads through the Decomposer (multi-hop
  plans union confidences max-wins on overlap).
- Successful (non-abstained) responses surface a `confidences` map
  on the wire when the model returned them.
- Abstention responses carry no trace_token — there is no retrieval
  result to replay.
- cmd/engine wires cfg.Retrieval.Abstain into the Deps.

Tests cover: shouldAbstain predicate (all-below, one-above,
boundary, nil/empty); filterConfidencesToIDs sentinel preservation;
stringKeyedConfidences conversion; abstentionEnabled body-override
precedence; respondAbstained / respondAbstainedAnswer shape;
synthesis tripwire (LLM must not be called on abstention path);
trace_token absence on abstention.

OpenAPI:
- enable_abstain on QueryRequest + AnswerRequest.
- abstained, abstention_reason, min_confidence_threshold,
  candidate_confidences, confidences on both response schemas.
---
 cmd/engine/main.go              |   1 +
 internal/api/abstention_test.go | 340 ++++++++++++++++++++++++++++++++
 internal/api/server.go          | 242 +++++++++++++++++++++--
 openapi.yaml                    | 119 ++++++++++-
 pkg/retrieval/decompose.go      |  67 +++++--
 5 files changed, 727 insertions(+), 42 deletions(-)
 create mode 100644 internal/api/abstention_test.go

diff --git a/cmd/engine/main.go b/cmd/engine/main.go
index 5398a31..b40533c 100644
--- a/cmd/engine/main.go
+++ b/cmd/engine/main.go
@@ -209,6 +209,7 @@ func run() error {
 		ReRanker:   reRanker,
 		ReRank:     cfg.Retrieval.ReRank,
 		Replay:     replayStore,
+		Abstain:    cfg.Retrieval.Abstain,
 	}
 
 	srv := &http.Server{
diff --git a/internal/api/abstention_test.go b/internal/api/abstention_test.go
new file mode 100644
index 0000000..2018cdf
--- /dev/null
+++ b/internal/api/abstention_test.go
@@ -0,0 +1,340 @@
+package api
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"io"
+	"log/slog"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/go-chi/chi/v5"
+	"github.com/hallelx2/llmgate"
+
+	"github.com/hallelx2/vectorless-engine/pkg/config"
+	"github.com/hallelx2/vectorless-engine/pkg/retrieval"
+	"github.com/hallelx2/vectorless-engine/pkg/tree"
+)
+
+// TestShouldAbstainAllBelow: every confidence under threshold → abstain.
+func TestShouldAbstainAllBelow(t *testing.T) {
+	t.Parallel()
+	confidences := map[tree.SectionID]float64{"sec_a": 0.1, "sec_b": 0.2, "sec_c": 0.39}
+	if !shouldAbstain(confidences, 0.4) {
+		t.Error("all confidences below 0.4 must trigger abstention")
+	}
+}
+
+// TestShouldAbstainOneAbove: any confidence at-or-above threshold → no abstain.
+// The "all picks below" semantics is the spec's choice: if even one
+// section has signal, surface it as evidence.
+func TestShouldAbstainOneAbove(t *testing.T) {
+	t.Parallel()
+	confidences := map[tree.SectionID]float64{"sec_a": 0.1, "sec_b": 0.45}
+	if shouldAbstain(confidences, 0.4) {
+		t.Error("one pick at 0.45 should suppress abstention even when peers are low")
+	}
+}
+
+// TestShouldAbstainBoundary: confidence == threshold counts as "above" so
+// the engine is generous about evidence; the threshold is strict-below.
+func TestShouldAbstainBoundary(t *testing.T) {
+	t.Parallel()
+	confidences := map[tree.SectionID]float64{"sec_a": 0.4}
+	if shouldAbstain(confidences, 0.4) {
+		t.Error("confidence == threshold must NOT trigger abstention (strict-below)")
+	}
+}
+
+// TestShouldAbstainNilOrEmpty: missing confidence signal never abstains.
+// This is the contract that keeps legacy-shape LLM responses working
+// — the engine cannot abstain when it has no confidence to evaluate.
+func TestShouldAbstainNilOrEmpty(t *testing.T) {
+	t.Parallel()
+	if shouldAbstain(nil, 0.4) {
+		t.Error("nil confidences must NOT trigger abstention")
+	}
+	if shouldAbstain(map[tree.SectionID]float64{}, 0.4) {
+		t.Error("empty confidences must NOT trigger abstention")
+	}
+}
+
+// TestFilterConfidencesToIDs verifies the helper restricts surfaced
+// confidences to the IDs the response actually carries (post
+// max_sections / re-rank truncation).
+func TestFilterConfidencesToIDs(t *testing.T) {
+	t.Parallel()
+	src := map[tree.SectionID]float64{"a": 0.1, "b": 0.5, "c": 0.9}
+	got := filterConfidencesToIDs(src, []tree.SectionID{"a", "c"})
+	if len(got) != 2 {
+		t.Fatalf("filtered length = %d, want 2", len(got))
+	}
+	if got["a"] != 0.1 || got["c"] != 0.9 {
+		t.Errorf("filtered = %v", got)
+	}
+	if _, present := got["b"]; present {
+		t.Error("b should have been filtered out")
+	}
+}
+
+// TestFilterConfidencesNilStaysNil preserves the "no signal" sentinel
+// across the helper.
+func TestFilterConfidencesNilStaysNil(t *testing.T) {
+	t.Parallel()
+	if got := filterConfidencesToIDs(nil, []tree.SectionID{"a"}); got != nil {
+		t.Errorf("nil input must produce nil output, got %v", got)
+	}
+	// All keys filtered out → nil too.
+	if got := filterConfidencesToIDs(map[tree.SectionID]float64{"x": 0.5}, []tree.SectionID{"a"}); got != nil {
+		t.Errorf("empty filtered result must produce nil, got %v", got)
+	}
+}
+
+// TestStringKeyedConfidences: the helper converts the typed map to
+// JSON-friendly string keys for the wire response; nil stays nil.
+func TestStringKeyedConfidences(t *testing.T) {
+	t.Parallel()
+	got := stringKeyedConfidences(map[tree.SectionID]float64{"sec_a": 0.7})
+	if got["sec_a"] != 0.7 {
+		t.Errorf("converted map should preserve the value, got %v", got)
+	}
+	if stringKeyedConfidences(nil) != nil {
+		t.Error("nil input must produce nil")
+	}
+}
+
+// TestAbstentionEnabledOverride: per-request body field wins over server config.
+func TestAbstentionEnabledOverride(t *testing.T) {
+	t.Parallel()
+	d := Deps{Abstain: config.AbstainBlock{Enabled: false}}
+	if !d.abstentionEnabled(boolPtr(true)) {
+		t.Error("body=true should override server=false")
+	}
+	d2 := Deps{Abstain: config.AbstainBlock{Enabled: true}}
+	if d2.abstentionEnabled(boolPtr(false)) {
+		t.Error("body=false should override server=true")
+	}
+}
+
+// TestAbstentionEnabledFallsBackToConfig: when the body field is nil,
+// the server config decides.
+func TestAbstentionEnabledFallsBackToConfig(t *testing.T) {
+	t.Parallel()
+	d := Deps{Abstain: config.AbstainBlock{Enabled: true}}
+	if !d.abstentionEnabled(nil) {
+		t.Error("nil body should fall back to server=true")
+	}
+	d2 := Deps{Abstain: config.AbstainBlock{Enabled: false}}
+	if d2.abstentionEnabled(nil) {
+		t.Error("nil body should fall back to server=false")
+	}
+}
+
+// --- Integration-style tests against handleQuery / handleAnswer ---
+//
+// These exercise the response-shape contracts: that all-low
+// confidences yield an abstained response; that mixed
+// (some-above-threshold) confidences yield a normal response; and
+// that legacy responses (no confidences) never abstain.
+
+// stubStrategy is a CostStrategy that returns canned IDs +
+// confidences without touching any LLM.
+type stubStrategy struct {
+	ids         []tree.SectionID
+	confidences map[tree.SectionID]float64
+	usage       retrieval.Usage
+	calls       int32
+}
+
+func (s *stubStrategy) Name() string { return "stub" }
+
+func (s *stubStrategy) Select(ctx context.Context, t *tree.Tree, query string, budget retrieval.ContextBudget) ([]tree.SectionID, error) {
+	atomic.AddInt32(&s.calls, 1)
+	return s.ids, nil
+}
+
+func (s *stubStrategy) SelectWithCost(ctx context.Context, t *tree.Tree, query string, budget retrieval.ContextBudget) (*retrieval.Result, error) {
+	atomic.AddInt32(&s.calls, 1)
+	return &retrieval.Result{
+		SelectedIDs: s.ids,
+		Confidences: s.confidences,
+		Usage:       s.usage,
+		HopsTaken:   1,
+	}, nil
+}
+
+// abstentionRouter wires only handleQuery / handleAnswer onto a chi
+// router under /v1. It does no stubbing of its own; callers supply a
+// Deps whose fields already hold whatever test doubles they need.
+// No test in this file calls it yet — it is kept alive by the
+// blank-identifier reference at the bottom of the file.
+func abstentionRouter(d Deps) http.Handler {
+	r := chi.NewRouter()
+	r.Route("/v1", func(r chi.Router) {
+		r.Post("/query", d.handleQuery)
+		r.Post("/answer", d.handleAnswer)
+	})
+	return r
+}
+
+// TestRespondAbstained: every confidence below threshold → the
+// response is the abstention shape with sections=[] and
+// abstained=true.
+//
+// We cannot run handleQuery without a DB-backed tree loader; instead,
+// this test calls respondAbstained directly on a Deps struct, as the
+// handler would, and asserts the response shape.
+func TestRespondAbstained(t *testing.T) {
+	t.Parallel()
+	d := Deps{
+		Strategy: &stubStrategy{ids: []tree.SectionID{"sec_a"}},
+		Abstain:  config.AbstainBlock{Enabled: true, Below: 0.4},
+	}
+	confidences := map[tree.SectionID]float64{"sec_a": 0.12, "sec_b": 0.30}
+
+	rec := httptest.NewRecorder()
+	d.respondAbstained(rec, tree.DocumentID("doc_x"), "what is x?", confidences, nil)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status = %d, want 200", rec.Code)
+	}
+	var body map[string]any
+	if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil {
+		t.Fatal(err)
+	}
+	if v, _ := body["abstained"].(bool); !v {
+		t.Error("response must carry abstained=true")
+	}
+	if v, _ := body["abstention_reason"].(string); !strings.Contains(v, "confidence") {
+		t.Errorf("abstention_reason missing 'confidence': %q", v)
+	}
+	if v, _ := body["min_confidence_threshold"].(float64); v != 0.4 {
+		t.Errorf("min_confidence_threshold = %v, want 0.4", v)
+	}
+	if v, _ := body["sections"].([]any); len(v) != 0 {
+		t.Errorf("sections must be empty, got %v", v)
+	}
+	cc, ok := body["candidate_confidences"].(map[string]any)
+	if !ok {
+		t.Fatal("candidate_confidences missing")
+	}
+	if cc["sec_a"] != 0.12 {
+		t.Errorf("sec_a confidence = %v, want 0.12", cc["sec_a"])
+	}
+}
+
+// TestRespondAbstainedAnswer: same shape on /v1/answer. The synthesis
+// call is skipped — answer is the canonical refusal string, citations
+// is empty.
+func TestRespondAbstainedAnswer(t *testing.T) {
+	t.Parallel()
+	d := Deps{
+		Strategy: &stubStrategy{ids: []tree.SectionID{"sec_a"}},
+		Abstain:  config.AbstainBlock{Enabled: true, Below: 0.4},
+		Logger:   slog.Default(),
+	}
+	confidences := map[tree.SectionID]float64{"sec_a": 0.1}
+	usage := retrieval.Usage{InputTokens: 100, OutputTokens: 20, TotalTokens: 120, LLMCalls: 2}
+
+	rec := httptest.NewRecorder()
+	d.respondAbstainedAnswer(rec, tree.DocumentID("doc_x"), "q", confidences, nil, usage, time.Now())
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status = %d, want 200", rec.Code)
+	}
+	var body map[string]any
+	if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil {
+		t.Fatal(err)
+	}
+	if v, _ := body["abstained"].(bool); !v {
+		t.Error("answer response must carry abstained=true")
+	}
+	if v, _ := body["answer"].(string); !strings.Contains(v, "cannot answer") {
+		t.Errorf("answer must be the canonical refusal, got %q", v)
+	}
+	if v, _ := body["citations"].([]any); len(v) != 0 {
+		t.Errorf("citations must be empty, got %v", v)
+	}
+	// Usage carried through (planning + retrieval — no synthesis).
+	if u, ok := body["usage"].(map[string]any); !ok {
+		t.Error("usage block missing")
+	} else if u["llm_calls"].(float64) != 2 {
+		t.Errorf("usage.llm_calls = %v, want 2", u["llm_calls"])
+	}
+}
+
+// TestRespondAbstainedTraceTokenAbsent: replay isn't meaningful for
+// an abstention (the engine produced no retrieval result); the
+// response must NOT carry a trace_token so callers don't try to
+// replay nothing.
+func TestRespondAbstainedTraceTokenAbsent(t *testing.T) {
+	t.Parallel()
+	d := Deps{
+		Strategy: &stubStrategy{},
+		Abstain:  config.AbstainBlock{Enabled: true, Below: 0.4},
+	}
+	rec := httptest.NewRecorder()
+	d.respondAbstained(rec, tree.DocumentID("doc_x"), "q", map[tree.SectionID]float64{"a": 0.1}, nil)
+
+	var body map[string]any
+	_ = json.Unmarshal(rec.Body.Bytes(), &body)
+	if _, has := body["trace_token"]; has {
+		t.Error("abstention response must NOT carry trace_token")
+	}
+}
+
+// boolPtr is a tiny helper for the body-override tests.
+func boolPtr(b bool) *bool { return &b }
+
+// --- end-to-end through ServeHTTP without DB ---
+//
+// To exercise handleQuery / handleAnswer end-to-end we'd need a
+// db.Pool. Instead we cover the in-handler logic by directly calling
+// the helpers above (which is what the handler itself does on the
+// abstention path) and by running the predicate tests through the
+// handler-facing entrypoint via shouldAbstain + abstentionEnabled.
+// A future test pass with a real test DB will exercise the full
+// stack — for now, the abstention contract is unit-tested at the
+// helper boundary, which is the only place the contract lives.
+
+// mockLLMNeverCalled fails the test loudly if any LLM call lands.
+// Used as a tripwire in the abstention path: synthesis must NOT
+// run when /v1/answer abstains.
+type mockLLMNeverCalled struct{ t *testing.T }
+
+func (m mockLLMNeverCalled) Complete(ctx context.Context, req llmgate.Request) (*llmgate.Response, error) {
+	m.t.Error("LLM should not be called on the abstention path")
+	return &llmgate.Response{Content: ""}, nil
+}
+
+func (m mockLLMNeverCalled) CountTokens(ctx context.Context, s string) (int, error) {
+	return len(s) / 4, nil
+}
+
+// TestRespondAbstainedAnswerSkipsSynthesis: the /v1/answer abstention
+// helper must not invoke the LLM. We pass an LLM that explodes on
+// any call so we'd see the test fail if synthesis leaks through.
+func TestRespondAbstainedAnswerSkipsSynthesis(t *testing.T) {
+	t.Parallel()
+	d := Deps{
+		Strategy: &stubStrategy{},
+		Abstain:  config.AbstainBlock{Enabled: true, Below: 0.4},
+		LLM:      mockLLMNeverCalled{t: t},
+	}
+	rec := httptest.NewRecorder()
+	d.respondAbstainedAnswer(rec, tree.DocumentID("doc_x"), "q", map[tree.SectionID]float64{"a": 0.1}, nil, retrieval.Usage{}, time.Now())
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status = %d, want 200", rec.Code)
+	}
+}
+
+// Blank-identifier uses keep the otherwise-unused bytes / io imports
+// compiling and stop linters flagging the unused abstentionRouter.
+var _ = bytes.NewReader
+var _ = io.EOF
+var _ = abstentionRouter
diff --git a/internal/api/server.go b/internal/api/server.go
index 088db8e..b65ab78 100644
--- a/internal/api/server.go
+++ b/internal/api/server.go
@@ -91,6 +91,13 @@ type Deps struct {
 	// /v1/replay (the endpoint returns 501) and skips the per-
 	// response store write.
 	Replay retrieval.ReplayStore
+
+	// Abstain carries the server-side abstention config. The
+	// body-level `enable_abstain` field on /v1/query and /v1/answer
+	// overrides Abstain.Enabled. When abstention fires, the response
+	// carries abstained=true and an empty sections / citations list
+	// rather than risk hallucinating an answer from weak evidence.
+	Abstain config.AbstainBlock
 }
 
 // Router builds and returns the chi router wired with v1 routes.
@@ -398,14 +405,19 @@ func (d Deps) handleGetSection(w http.ResponseWriter, r *http.Request) {
 
 // handleQuery accepts { document_id, query, model?, max_tokens?,
 // reserved_for_prompt?, max_parallel_calls?, max_sections?,
-// enable_planning? } and runs the configured retrieval.Strategy against
-// the document's tree.
+// enable_planning?, enable_rerank?, enable_abstain? } and runs the
+// configured retrieval.Strategy against the document's tree.
 //
 // When `enable_planning` is true (or `retrieval.planning.enabled` is on
 // at config level) the request first issues a planning LLM call. The
 // resulting Plan is surfaced in the response under "plan". If the plan
 // is multi-hop and decomposition is enabled, retrieval fans out one
 // strategy call per sub-question and unions the results.
+//
+// When the selection LLM returns per-pick confidence scores and every
+// pick falls below `retrieval.abstain.below`, the response is an
+// abstention: sections is empty and abstained=true. Per-request
+// `enable_abstain` overrides the server-side flag for one request.
 func (d Deps) handleQuery(w http.ResponseWriter, r *http.Request) {
 	var body struct {
 		DocumentID        tree.DocumentID `json:"document_id"`
@@ -423,6 +435,10 @@ func (d Deps) handleQuery(w http.ResponseWriter, r *http.Request) {
 		// content-aware re-rank pass. Pointer for the same reason as
 		// EnablePlanning. Overrides retrieval.rerank.enabled.
 		EnableReRank *bool `json:"enable_rerank"`
+		// EnableAbstain opts this request into the Phase 2.4
+		// confidence-driven abstention check. Pointer for the same
+		// reason as EnablePlanning. Overrides retrieval.abstain.enabled.
+		EnableAbstain *bool `json:"enable_abstain"`
 	}
 	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
 		writeErr(w, http.StatusBadRequest, "invalid json: "+err.Error())
@@ -466,12 +482,24 @@ func (d Deps) handleQuery(w http.ResponseWriter, r *http.Request) {
 	started := time.Now()
 
 	plan, _ := d.runPlanner(r.Context(), body.Query, body.EnablePlanning)
-	ids, err := d.runSelection(r.Context(), t, plan, body.Query, budget)
+	ids, confidences, err := d.runSelection(r.Context(), t, plan, body.Query, budget)
 	if err != nil {
 		d.Logger.Error("query: strategy failed", "err", err, "document_id", body.DocumentID)
 		writeErr(w, http.StatusInternalServerError, "retrieval failed: "+err.Error())
 		return
 	}
+
+	// Phase 2.4 abstention: if every scored pick is below the
+	// configured threshold, refuse to surface evidence whose
+	// relevance the model itself rated as weak. The check fires
+	// only when an explicit confidence signal is present — legacy-shape
+	// responses (no confidences) always fall through to the normal
+	// path so older models keep working.
+	if d.abstentionEnabled(body.EnableAbstain) && shouldAbstain(confidences, d.Abstain.Below) {
+		d.respondAbstained(w, body.DocumentID, body.Query, confidences, plan)
+		return
+	}
+
 	if body.MaxSections > 0 && len(ids) > body.MaxSections {
 		ids = ids[:body.MaxSections]
 	}
@@ -539,6 +567,12 @@ func (d Deps) handleQuery(w http.ResponseWriter, r *http.Request) {
 	if plan != nil {
 		resp["plan"] = plan
 	}
+	// Surface the confidence map on the response when present. Only the
+	// finalIDs survive truncation, so trim accordingly. Empty map →
+	// omit so the field stays absent when no signal was available.
+	if filtered := filterConfidencesToIDs(confidences, finalIDs); len(filtered) > 0 {
+		resp["confidences"] = stringKeyedConfidences(filtered)
+	}
 
 	raw, err := marshalJSONForReplay(resp)
 	if err != nil {
@@ -692,6 +726,11 @@ func (d Deps) handleAnswer(w http.ResponseWriter, r *http.Request) {
 		// pass. Synthesis then sees the re-ranked top-k. Overrides
 		// retrieval.rerank.enabled.
 		EnableReRank *bool `json:"enable_rerank"`
+		// EnableAbstain opts this request into the Phase 2.4
+		// confidence-driven abstention check. When all picks fall
+		// below the threshold, /v1/answer skips synthesis entirely
+		// and returns a refusal answer with no citations.
+		EnableAbstain *bool `json:"enable_abstain"`
 	}
 	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
 		writeErr(w, http.StatusBadRequest, "invalid json: "+err.Error())
@@ -734,13 +773,22 @@ func (d Deps) handleAnswer(w http.ResponseWriter, r *http.Request) {
 	plan, planUsage := d.runPlanner(r.Context(), body.Query, body.EnablePlanning)
 	totalUsage.Add(planUsage)
 
-	ids, retrievalUsage, err := d.runSelectionWithUsage(r.Context(), t, plan, body.Query, budget)
+	ids, confidences, retrievalUsage, err := d.runSelectionWithUsage(r.Context(), t, plan, body.Query, budget)
 	if err != nil {
 		writeErr(w, http.StatusInternalServerError, "retrieval failed: "+err.Error())
 		return
 	}
 	totalUsage.Add(retrievalUsage)
 
+	// Phase 2.4 abstention: skip synthesis entirely when every scored
+	// pick falls below the threshold. The response answers with a
+	// regulator-friendly refusal rather than a hallucinated synthesis
+	// of weak evidence.
+	if d.abstentionEnabled(body.EnableAbstain) && shouldAbstain(confidences, d.Abstain.Below) {
+		d.respondAbstainedAnswer(w, body.DocumentID, body.Query, confidences, plan, totalUsage, started)
+		return
+	}
+
 	maxSections := body.MaxSections
 	if maxSections <= 0 {
 		maxSections = d.Answer.MaxSections
@@ -863,6 +911,9 @@ func (d Deps) handleAnswer(w http.ResponseWriter, r *http.Request) {
 	if plan != nil {
 		resp["plan"] = plan
 	}
+	if filtered := filterConfidencesToIDs(confidences, finalIDs); len(filtered) > 0 {
+		resp["confidences"] = stringKeyedConfidences(filtered)
+	}
 
 	raw, err := marshalJSONForReplay(resp)
 	if err != nil {
@@ -1191,38 +1242,45 @@ func (d Deps) runPlanner(ctx context.Context, query string, bodyOverride *bool)
 
 // runSelection picks section IDs for the query, optionally going
 // through the Decomposer when the plan is multi-hop AND planning-level
-// decomposition is enabled. Returns the same []SectionID Strategy.Select
-// would.
-func (d Deps) runSelection(ctx context.Context, t *tree.Tree, plan *retrieval.Plan, query string, budget retrieval.ContextBudget) ([]tree.SectionID, error) {
-	if d.shouldDecompose(plan) {
-		ids, _, err := retrieval.NewDecomposer(d.Strategy).DecomposedSelect(ctx, t, plan, query, budget)
-		return ids, err
-	}
-	return d.Strategy.Select(ctx, t, query, budget)
+// decomposition is enabled. Returns the selected IDs plus the per-pick
+// confidence map (nil when the selection LLM returned the legacy
+// shape with no confidence signal).
+func (d Deps) runSelection(ctx context.Context, t *tree.Tree, plan *retrieval.Plan, query string, budget retrieval.ContextBudget) ([]tree.SectionID, map[tree.SectionID]float64, error) {
+	ids, confidences, _, err := d.runSelectionFull(ctx, t, plan, query, budget)
+	return ids, confidences, err
 }
 
 // runSelectionWithUsage is the cost-tracking variant used by /v1/answer.
-// Returns the selected IDs plus the Usage accumulated during selection
-// (across all sub-questions for multi-hop plans).
-func (d Deps) runSelectionWithUsage(ctx context.Context, t *tree.Tree, plan *retrieval.Plan, query string, budget retrieval.ContextBudget) ([]tree.SectionID, retrieval.Usage, error) {
+// Returns the selected IDs, per-pick confidences (nil when no signal),
+// and the Usage accumulated during selection (across all sub-questions
+// for multi-hop plans).
+func (d Deps) runSelectionWithUsage(ctx context.Context, t *tree.Tree, plan *retrieval.Plan, query string, budget retrieval.ContextBudget) ([]tree.SectionID, map[tree.SectionID]float64, retrieval.Usage, error) {
+	return d.runSelectionFull(ctx, t, plan, query, budget)
+}
+
+// runSelectionFull is the shared workhorse behind runSelection /
+// runSelectionWithUsage. It routes through the Decomposer when the
+// plan is multi-hop AND decomposition is enabled, and surfaces
+// confidences for the Phase 2.4 abstention check.
+func (d Deps) runSelectionFull(ctx context.Context, t *tree.Tree, plan *retrieval.Plan, query string, budget retrieval.ContextBudget) ([]tree.SectionID, map[tree.SectionID]float64, retrieval.Usage, error) {
 	if d.shouldDecompose(plan) {
-		return retrieval.NewDecomposer(d.Strategy).DecomposedSelect(ctx, t, plan, query, budget)
+		return retrieval.NewDecomposer(d.Strategy).DecomposedSelectWithConfidences(ctx, t, plan, query, budget)
 	}
 	if cs, ok := d.Strategy.(retrieval.CostStrategy); ok {
 		res, err := cs.SelectWithCost(ctx, t, query, budget)
 		if err != nil {
-			return nil, retrieval.Usage{}, err
+			return nil, nil, retrieval.Usage{}, err
 		}
 		if res == nil {
-			return nil, retrieval.Usage{}, nil
+			return nil, nil, retrieval.Usage{}, nil
 		}
-		return res.SelectedIDs, res.Usage, nil
+		return res.SelectedIDs, res.Confidences, res.Usage, nil
 	}
 	ids, err := d.Strategy.Select(ctx, t, query, budget)
 	if err != nil {
-		return nil, retrieval.Usage{}, err
+		return nil, nil, retrieval.Usage{}, err
 	}
-	return ids, retrieval.Usage{}, nil
+	return ids, nil, retrieval.Usage{}, nil
 }
 
 // shouldDecompose returns true when the plan is multi-hop AND
@@ -1379,6 +1437,148 @@ func writePlanHints(b *strings.Builder, plan *retrieval.Plan) {
 	}
 }
 
+// --- abstention helpers ---
+
+// abstentionEnabled reports whether this request should run the
+// confidence-driven abstention check. A non-nil bodyOverride (the
+// request's `enable_abstain` field) wins outright, so one request can
+// turn the check on or off regardless of server config; a nil
+// override falls back to d.Abstain.Enabled.
+func (d Deps) abstentionEnabled(bodyOverride *bool) bool {
+	if bodyOverride != nil {
+		return *bodyOverride
+	}
+	return d.Abstain.Enabled
+}
+
+// shouldAbstain returns true when confidences carry an explicit
+// signal AND every entry is strictly below threshold.
+//
+// The "all picks below" semantics (vs "any pick below") is
+// deliberate: if even one section scored at or above threshold, the
+// engine has enough evidence to surface it. Abstention is reserved
+// for the case where every candidate is weak.
+//
+// nil / empty confidences never trigger abstention — abstention
+// requires an explicit confidence signal from the selection LLM. A
+// legacy-shape response carries nil confidences and falls through
+// to the normal path.
+func shouldAbstain(confidences map[tree.SectionID]float64, threshold float64) bool {
+	if len(confidences) == 0 {
+		return false
+	}
+	for _, c := range confidences {
+		if c >= threshold { // a score equal to threshold counts as confident
+			return false
+		}
+	}
+	return true
+}
+
+// stringKeyedConfidences converts the typed confidence map into a
+// JSON-friendly {string: float} so encoding/json emits an object
+// with section_id keys rather than relying on a tree.SectionID
+// MarshalText shim. Returns nil when src is empty; a nil map encodes
+// as JSON null, so callers wanting the key absent must not set it.
+func stringKeyedConfidences(src map[tree.SectionID]float64) map[string]float64 {
+	if len(src) == 0 {
+		return nil
+	}
+	out := make(map[string]float64, len(src))
+	for id, c := range src {
+		out[string(id)] = c
+	}
+	return out
+}
+
+// filterConfidencesToIDs keeps only the entries whose IDs appear in
+// keep, preserving the "no signal" semantics: a nil or empty input
+// returns nil, and an empty filtered result also returns nil, so
+// callers can do a single len()-check before serialising.
+func filterConfidencesToIDs(src map[tree.SectionID]float64, keep []tree.SectionID) map[tree.SectionID]float64 {
+	if len(src) == 0 {
+		return nil
+	}
+	out := make(map[tree.SectionID]float64, len(keep))
+	for _, id := range keep {
+		if v, ok := src[id]; ok {
+			out[id] = v
+		}
+	}
+	if len(out) == 0 {
+		return nil
+	}
+	return out
+}
+
+// abstentionReason is the message attached to every abstention
+// response; a single constant lets analytics group by exact string.
+// NOTE: shouldAbstain passes scores equal to the threshold ("above" = ">=").
+const abstentionReason = "no candidate section scored above the confidence threshold"
+
+// abstentionAnswerText is the canonical refusal used by /v1/answer
+// when abstention fires. The text is regulator-friendly: it admits
+// the engine could not answer rather than guessing, and is worded
+// so clients can surface it verbatim.
+const abstentionAnswerText = "I cannot answer this question from the supplied document."
+
+// respondAbstained writes the abstention shape for /v1/query. The
+// response includes the threshold and the candidate_confidences map
+// the model returned so callers (and downstream evaluators) can see
+// exactly why the engine refused.
+//
+// trace_token is intentionally omitted on abstention: this path
+// writes nothing to the replay log because there's no meaningful
+// retrieval result to reproduce. Callers replaying an abstention
+// will simply re-run /v1/query.
+func (d Deps) respondAbstained(w http.ResponseWriter, docID tree.DocumentID, query string, confidences map[tree.SectionID]float64, plan *retrieval.Plan) {
+	resp := map[string]any{
+		"document_id":              docID,
+		"query":                    query,
+		"strategy":                 d.Strategy.Name(),
+		"sections":                 []any{},
+		"abstained":                true,
+		"abstention_reason":        abstentionReason,
+		"min_confidence_threshold": d.Abstain.Below,
+		"candidate_confidences":    stringKeyedConfidences(confidences),
+	}
+	if plan != nil {
+		resp["plan"] = plan
+	}
+	writeJSON(w, http.StatusOK, resp)
+}
+
+// respondAbstainedAnswer writes the abstention shape for /v1/answer.
+// The answer text is the canonical refusal and citations is empty.
+// usage is whatever the caller accumulated before abstaining (no
+// synthesis call is made here), so the caller's billing stays
+// accurate. The response sets no model or trace_token key.
+func (d Deps) respondAbstainedAnswer(w http.ResponseWriter, docID tree.DocumentID, query string, confidences map[tree.SectionID]float64, plan *retrieval.Plan, usage retrieval.Usage, started time.Time) {
+	resp := map[string]any{
+		"document_id": docID,
+		"query":       query,
+		"answer":      abstentionAnswerText,
+		"citations":   []any{},
+		"strategy":    d.Strategy.Name(),
+		"usage": map[string]any{
+			"input_tokens":  usage.InputTokens,
+			"output_tokens": usage.OutputTokens,
+			"total_tokens":  usage.TotalTokens,
+			"cost_usd":      usage.CostUSD,
+			"llm_calls":     usage.LLMCalls,
+		},
+		"elapsed_ms":               time.Since(started).Milliseconds(),
+		"abstained":                true,
+		"abstention_reason":        abstentionReason,
+		"min_confidence_threshold": d.Abstain.Below,
+		"candidate_confidences":    stringKeyedConfidences(confidences),
+	}
+	if plan != nil {
+		resp["plan"] = plan
+	}
+	writeJSON(w, http.StatusOK, resp)
+}
+
 // --- helpers ---
 
 func writeJSON(w http.ResponseWriter, status int, v any) {
diff --git a/openapi.yaml b/openapi.yaml
index a4c82ef..6307047 100644
--- a/openapi.yaml
+++ b/openapi.yaml
@@ -551,6 +551,19 @@ components:
             original order — re-rank never drops sections. Overrides
             the server's `retrieval.rerank.enabled` setting for this
             request only.
+        enable_abstain:
+          type: boolean
+          description: |
+            Opt this request into the Phase 2.4 confidence-driven
+            abstention check. When the selection LLM returns per-pick
+            confidence scores and every score falls below
+            `retrieval.abstain.below`, the response is an abstention
+            (sections empty, abstained=true) instead of evidence whose
+            relevance the engine itself rated as weak. Abstention
+            requires an explicit confidence signal — legacy-shape LLM
+            responses (no confidence) never trigger abstention.
+            Overrides the server's `retrieval.abstain.enabled`
+            setting for this request only.
 
     QueryResponse:
       type: object
@@ -568,6 +581,9 @@ components:
           type: array
           items:
             $ref: "#/components/schemas/QuerySection"
+          description: |
+            Empty when `abstained=true`: the engine declined to surface
+            sections whose relevance to the query it rated as weak.
         plan:
           $ref: "#/components/schemas/Plan"
         elapsed_ms:
@@ -581,7 +597,48 @@ components:
             Pass this token to /v1/replay along with the original
             `query` and `document_id` to retrieve the byte-identical
             response. Empty when the server has
-            `retrieval.replay.enabled=false`.
+            `retrieval.replay.enabled=false`, and omitted on
+            abstention responses (which have no retrieval result to
+            replay).
+        confidences:
+          type: object
+          additionalProperties:
+            type: number
+          description: |
+            Per-section confidence scores in [0.0, 1.0] returned by the
+            selection LLM, keyed by section_id. Only present when the
+            model returned the new-shape response with explicit
+            confidence values. Useful for downstream evaluators that
+            want to surface the engine's certainty alongside the
+            evidence.
+        abstained:
+          type: boolean
+          description: |
+            True when the engine refused to ground an answer in the
+            retrieved evidence because every candidate section scored
+            below `min_confidence_threshold`. When true, `sections`
+            is empty and `candidate_confidences` carries the actual
+            scores so the caller can decide whether to relax the
+            threshold and retry.
+        abstention_reason:
+          type: string
+          description: |
+            Human-readable explanation when `abstained=true`. Stable
+            string suitable for surfacing verbatim to end users.
+        min_confidence_threshold:
+          type: number
+          description: |
+            The `retrieval.abstain.below` value the engine compared
+            confidences against. Present only on abstention responses.
+        candidate_confidences:
+          type: object
+          additionalProperties:
+            type: number
+          description: |
+            Per-section confidence scores the selection LLM returned
+            for every section it scored, before any `max_sections`
+            truncation. Surfaced only on abstention responses so
+            callers can see exactly why the engine refused.
 
     QuerySection:
       type: object
@@ -675,6 +732,15 @@ components:
             When the pass runs, the synthesis prompt sees the
             re-ranked top-k (capped by `retrieval.rerank.top_k`), and
             each citation in the response carries a `score` field.
+        enable_abstain:
+          type: boolean
+          description: |
+            Opt this request into the Phase 2.4 confidence-driven
+            abstention check. See QueryRequest.enable_abstain for
+            full semantics. When abstention fires on /v1/answer the
+            synthesis call is skipped entirely; the response carries
+            a canonical refusal in `answer`, an empty `citations`
+            array, and `abstained=true`.
 
     AnswerResponse:
       type: object
@@ -685,11 +751,17 @@ components:
           type: string
         answer:
           type: string
-          description: Natural-language answer grounded in the cited sections.
+          description: |
+            Natural-language answer grounded in the cited sections.
+            When `abstained=true` this is the canonical refusal
+            ("I cannot answer this question from the supplied
+            document.") rather than synthesised prose.
         citations:
           type: array
           items:
             $ref: "#/components/schemas/AnswerCitation"
+          description: |
+            Empty array when `abstained=true`.
         strategy:
           type: string
         model:
@@ -702,6 +774,10 @@ components:
             total_tokens: {type: integer}
             cost_usd: {type: number}
             llm_calls: {type: integer}
+          description: |
+            On abstention this carries the planning + retrieval
+            tokens but no synthesis tokens — the engine skipped
+            the synthesis LLM call entirely.
         plan:
           $ref: "#/components/schemas/Plan"
         elapsed_ms:
@@ -715,7 +791,44 @@ components:
             system prompt version). Pass to /v1/replay with the
             original `query` and `document_id` to fetch the
             byte-identical response. Empty when the server has
-            `retrieval.replay.enabled=false`.
+            `retrieval.replay.enabled=false`, and omitted on
+            abstention responses (which have no synthesis result to
+            replay).
+        confidences:
+          type: object
+          additionalProperties:
+            type: number
+          description: |
+            Per-section confidence scores in [0.0, 1.0] returned by
+            the selection LLM, keyed by section_id. Only present
+            when the model returned the new-shape response with
+            explicit confidence values.
+        abstained:
+          type: boolean
+          description: |
+            True when the engine refused to synthesise an answer
+            because every candidate section scored below
+            `min_confidence_threshold`. The synthesis LLM call is
+            skipped entirely; `answer` is the canonical refusal and
+            `citations` is empty.
+        abstention_reason:
+          type: string
+          description: |
+            Human-readable explanation when `abstained=true`.
+        min_confidence_threshold:
+          type: number
+          description: |
+            The `retrieval.abstain.below` value the engine compared
+            confidences against. Present only on abstention responses.
+        candidate_confidences:
+          type: object
+          additionalProperties:
+            type: number
+          description: |
+            Per-section confidence scores the selection LLM returned
+            for every candidate it considered. Surfaced only on
+            abstention responses so callers can see exactly why the
+            engine refused to synthesise.
 
     Plan:
       type: object
diff --git a/pkg/retrieval/decompose.go b/pkg/retrieval/decompose.go
index 8c5e83b..0d5aa6f 100644
--- a/pkg/retrieval/decompose.go
+++ b/pkg/retrieval/decompose.go
@@ -42,14 +42,31 @@ func NewDecomposer(s Strategy) *Decomposer {
 // returns the partial Usage gathered up to that point. This is the same
 // failure contract Strategy.Select has — a multi-hop loop shouldn't
 // silently mask retrieval errors.
+//
+// This method does NOT surface per-pick confidences. Callers that need
+// them should use DecomposedSelectWithConfidences (added in Phase 2.4).
 func (d *Decomposer) DecomposedSelect(ctx context.Context, t *tree.Tree, plan *Plan, query string, budget ContextBudget) ([]tree.SectionID, Usage, error) {
+	ids, _, usage, err := d.DecomposedSelectWithConfidences(ctx, t, plan, query, budget)
+	return ids, usage, err
+}
+
+// DecomposedSelectWithConfidences is the Phase 2.4 variant of
+// DecomposedSelect that also returns the per-pick confidence map.
+// When a sub-question's underlying Strategy is a CostStrategy and
+// surfaces confidences, those are unioned across sub-questions (max
+// wins on duplicate IDs — the most confident sub-question wins).
+//
+// The returned confidences map is nil when no sub-question contributed
+// any confidence signal at all — preserving the "no confidence signal"
+// distinction the API layer's abstention check depends on.
+func (d *Decomposer) DecomposedSelectWithConfidences(ctx context.Context, t *tree.Tree, plan *Plan, query string, budget ContextBudget) ([]tree.SectionID, map[tree.SectionID]float64, Usage, error) {
 	if d == nil || d.Strategy == nil {
-		return nil, Usage{}, fmt.Errorf("decomposer: no strategy configured")
+		return nil, nil, Usage{}, fmt.Errorf("decomposer: no strategy configured")
 	}
 
 	// Fall-through: no plan or not multi-hop. Single retrieval call on
-	// the original query, with usage extracted from CostStrategy when
-	// available.
+	// the original query, with usage + confidences extracted from
+	// CostStrategy when available.
 	if plan == nil || !plan.IsMultiHop || len(plan.SubQuestions) == 0 {
 		return d.runOnce(ctx, t, query, budget)
 	}
@@ -59,15 +76,16 @@ func (d *Decomposer) DecomposedSelect(ctx context.Context, t *tree.Tree, plan *P
 	// sub-question is usually the most important — and gives a
 	// deterministic union ordering callers can rely on.
 	var (
-		totalUsage Usage
-		out        = make([]tree.SectionID, 0)
-		seen       = make(map[tree.SectionID]struct{})
+		totalUsage  Usage
+		out         = make([]tree.SectionID, 0)
+		seen        = make(map[tree.SectionID]struct{})
+		confidences map[tree.SectionID]float64
 	)
 	for _, sub := range plan.SubQuestions {
-		ids, usage, err := d.runOnce(ctx, t, sub, budget)
+		ids, subConfidences, usage, err := d.runOnce(ctx, t, sub, budget)
 		totalUsage.Add(usage)
 		if err != nil {
-			return out, totalUsage, fmt.Errorf("decompose %q: %w", sub, err)
+			return out, confidences, totalUsage, fmt.Errorf("decompose %q: %w", sub, err)
 		}
 		for _, id := range ids {
 			if _, dup := seen[id]; dup {
@@ -76,28 +94,41 @@ func (d *Decomposer) DecomposedSelect(ctx context.Context, t *tree.Tree, plan *P
 			seen[id] = struct{}{}
 			out = append(out, id)
 		}
+		// Union with max-wins on overlap: if two sub-questions both
+		// score the same section, the more confident verdict carries.
+		if len(subConfidences) > 0 {
+			if confidences == nil {
+				confidences = make(map[tree.SectionID]float64, len(subConfidences))
+			}
+			for id, c := range subConfidences {
+				if existing, ok := confidences[id]; !ok || c > existing {
+					confidences[id] = c
+				}
+			}
+		}
 	}
-	return out, totalUsage, nil
+	return out, confidences, totalUsage, nil
 }
 
 // runOnce delegates one retrieval call. Uses CostStrategy when the
-// wrapped strategy implements it so per-sub-question usage flows into
-// the aggregated total; otherwise falls back to plain Select with a
-// zero Usage value.
-func (d *Decomposer) runOnce(ctx context.Context, t *tree.Tree, query string, budget ContextBudget) ([]tree.SectionID, Usage, error) {
+// wrapped strategy implements it so per-sub-question usage and (since
+// Phase 2.4) confidences flow into the aggregated total; otherwise
+// falls back to plain Select with a zero Usage value and nil
+// confidences.
+func (d *Decomposer) runOnce(ctx context.Context, t *tree.Tree, query string, budget ContextBudget) ([]tree.SectionID, map[tree.SectionID]float64, Usage, error) {
 	if cs, ok := d.Strategy.(CostStrategy); ok {
 		res, err := cs.SelectWithCost(ctx, t, query, budget)
 		if err != nil {
-			return nil, Usage{}, err
+			return nil, nil, Usage{}, err
 		}
 		if res == nil {
-			return nil, Usage{}, nil
+			return nil, nil, Usage{}, nil
 		}
-		return res.SelectedIDs, res.Usage, nil
+		return res.SelectedIDs, res.Confidences, res.Usage, nil
 	}
 	ids, err := d.Strategy.Select(ctx, t, query, budget)
 	if err != nil {
-		return nil, Usage{}, err
+		return nil, nil, Usage{}, err
 	}
-	return ids, Usage{}, nil
+	return ids, nil, Usage{}, nil
 }