Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmd/engine/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ func run() error {
ReRanker: reRanker,
ReRank: cfg.Retrieval.ReRank,
Replay: replayStore,
Abstain: cfg.Retrieval.Abstain,
}

srv := &http.Server{
Expand Down
28 changes: 28 additions & 0 deletions config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,34 @@ retrieval:
# re-rank pass to do the final selection.
top_k: 0

# abstain: Phase 2.4 abstention. When the selection LLM returns
# per-pick confidence scores (the new picks shape) and every
# confidence falls below `below`, /v1/query and /v1/answer skip the
# normal path and return an abstention response instead:
# {abstained: true, abstention_reason: "...", sections: [],
# min_confidence_threshold: 0.4, candidate_confidences: {...}}
# For /v1/answer the synthesis call is skipped entirely; the answer
# is the honest "I cannot answer this question from the supplied
# document." This trades a likely hallucination for a clear refusal
# when the engine's own confidence is weak.
#
# OPT-OUT. Default enabled. Per-request `enable_abstain` body field
# overrides this block. When the selection LLM returns the legacy
# shape (no confidence scores) the engine never abstains regardless
# of this setting — abstention requires explicit confidence signal.
#
# The check is "all picks below threshold". If any pick scores at
# or above the threshold, the engine surfaces that section as
# evidence — abstention is reserved for the case where every
# candidate is weak.
abstain:
enabled: true
# Confidence threshold in [0.0, 1.0]. Picks with confidence
# strictly less than this are "not confident"; when ALL picks
# fall below, the response is an abstention. 0.4 is the default
# — high enough to filter weak matches, low enough not to
# suppress legitimate partial answers.
below: 0.4

# replay: Phase 3.1 reproducibility store. Every /v1/query and
# /v1/answer response carries a deterministic `trace_token`; the
# response body is stored in an in-memory LRU under that token so
Expand Down
340 changes: 340 additions & 0 deletions internal/api/abstention_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,340 @@
package api

import (
"bytes"
"context"
"encoding/json"
"io"
"log/slog"
"net/http"
"net/http/httptest"
"strings"
"sync/atomic"
"testing"
"time"

"github.com/go-chi/chi/v5"
"github.com/hallelx2/llmgate"

"github.com/hallelx2/vectorless-engine/pkg/config"
"github.com/hallelx2/vectorless-engine/pkg/retrieval"
"github.com/hallelx2/vectorless-engine/pkg/tree"
)

// TestShouldAbstainAllBelow checks that shouldAbstain reports true when
// every supplied confidence is under the threshold.
func TestShouldAbstainAllBelow(t *testing.T) {
	t.Parallel()

	const threshold = 0.4
	allLow := map[tree.SectionID]float64{
		"sec_a": 0.1,
		"sec_b": 0.2,
		"sec_c": 0.39,
	}

	if abstain := shouldAbstain(allLow, threshold); !abstain {
		t.Error("all confidences below 0.4 must trigger abstention")
	}
}

// TestShouldAbstainOneAbove checks that a single confidence at or above
// the threshold is enough to suppress abstention, even when every other
// pick is low: one section with signal is still worth surfacing as
// evidence.
func TestShouldAbstainOneAbove(t *testing.T) {
	t.Parallel()

	const threshold = 0.4
	mixed := map[tree.SectionID]float64{
		"sec_a": 0.1,
		"sec_b": 0.45,
	}

	if abstain := shouldAbstain(mixed, threshold); abstain {
		t.Error("one pick at 0.45 should suppress abstention even when peers are low")
	}
}

// TestShouldAbstainBoundary pins the boundary case: a confidence exactly
// equal to the threshold is treated as "not below", so it must not
// trigger abstention. The comparison is strict-below.
func TestShouldAbstainBoundary(t *testing.T) {
	t.Parallel()

	const threshold = 0.4
	atThreshold := map[tree.SectionID]float64{"sec_a": 0.4}

	if abstain := shouldAbstain(atThreshold, threshold); abstain {
		t.Error("confidence == threshold must NOT trigger abstention (strict-below)")
	}
}

// TestShouldAbstainNilOrEmpty checks that an absent confidence signal
// (nil map or empty map) never produces an abstention. This is the
// contract that keeps legacy-shape LLM responses working: with no
// confidences to evaluate, the engine cannot abstain.
func TestShouldAbstainNilOrEmpty(t *testing.T) {
	t.Parallel()

	if abstain := shouldAbstain(nil, 0.4); abstain {
		t.Error("nil confidences must NOT trigger abstention")
	}

	empty := make(map[tree.SectionID]float64)
	if abstain := shouldAbstain(empty, 0.4); abstain {
		t.Error("empty confidences must NOT trigger abstention")
	}
}

// TestFilterConfidencesToIDs verifies that filterConfidencesToIDs keeps
// only the entries whose IDs appear in the supplied ID list (the IDs the
// response actually carries after max_sections / re-rank truncation) and
// preserves their values.
func TestFilterConfidencesToIDs(t *testing.T) {
	t.Parallel()

	all := map[tree.SectionID]float64{"a": 0.1, "b": 0.5, "c": 0.9}
	keep := []tree.SectionID{"a", "c"}

	filtered := filterConfidencesToIDs(all, keep)

	if n := len(filtered); n != 2 {
		t.Fatalf("filtered length = %d, want 2", n)
	}
	if !(filtered["a"] == 0.1 && filtered["c"] == 0.9) {
		t.Errorf("filtered = %v", filtered)
	}
	if _, leaked := filtered["b"]; leaked {
		t.Error("b should have been filtered out")
	}
}

// TestFilterConfidencesNilStaysNil preserves the "no signal" sentinel
// across the helper.
func TestFilterConfidencesNilStaysNil(t *testing.T) {
t.Parallel()
if got := filterConfidencesToIDs(nil, []tree.SectionID{"a"}); got != nil {
t.Errorf("nil input must produce nil output, got %v", got)
}
// All keys filtered out → nil too.
if got := filterConfidencesToIDs(map[tree.SectionID]float64{"x": 0.5}, []tree.SectionID{"a"}); got != nil {
t.Errorf("empty filtered result must produce nil, got %v", got)
}
}

// TestStringKeyedConfidencesShape: the helper converts the typed map
// to JSON-friendly string keys for the wire response.
func TestStringKeyedConfidences(t *testing.T) {
t.Parallel()
got := stringKeyedConfidences(map[tree.SectionID]float64{"sec_a": 0.7})
if got["sec_a"] != 0.7 {
t.Errorf("converted map should preserve the value, got %v", got)
}
if stringKeyedConfidences(nil) != nil {
t.Error("nil input must produce nil")
}
}

// TestAbstentionEnabledOverride checks that a non-nil per-request flag
// takes precedence over the server's Abstain.Enabled setting, in both
// directions.
func TestAbstentionEnabledOverride(t *testing.T) {
	t.Parallel()

	serverOff := Deps{Abstain: config.AbstainBlock{Enabled: false}}
	if on := serverOff.abstentionEnabled(boolPtr(true)); !on {
		t.Error("body=true should override server=false")
	}

	serverOn := Deps{Abstain: config.AbstainBlock{Enabled: true}}
	if on := serverOn.abstentionEnabled(boolPtr(false)); on {
		t.Error("body=false should override server=true")
	}
}

// TestAbstentionEnabledFallsBackToConfig checks that a nil per-request
// flag defers to the server's Abstain.Enabled setting, whichever way it
// is set.
func TestAbstentionEnabledFallsBackToConfig(t *testing.T) {
	t.Parallel()

	serverOn := Deps{Abstain: config.AbstainBlock{Enabled: true}}
	if on := serverOn.abstentionEnabled(nil); !on {
		t.Error("nil body should fall back to server=true")
	}

	serverOff := Deps{Abstain: config.AbstainBlock{Enabled: false}}
	if on := serverOff.abstentionEnabled(nil); on {
		t.Error("nil body should fall back to server=false")
	}
}

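// --- Response-shape tests for the /v1/query and /v1/answer abstention path ---
//
// These exercise the response-shape contract: all-low confidences
// yield an abstained response. The complementary cases — mixed
// (some-at-or-above-threshold) confidences yield a normal response,
// and legacy responses (no confidences) never abstain — are covered
// by the shouldAbstain predicate tests above. The tests below call
// respondAbstained / respondAbstainedAnswer directly rather than
// handleQuery / handleAnswer, because the full handlers need a DB.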

// stubStrategy is a CostStrategy that returns canned IDs +
// confidences without touching any LLM. Every field is supplied by
// the test; the methods only echo them back and count invocations.
type stubStrategy struct {
// ids is returned as-is by Select and as Result.SelectedIDs by
// SelectWithCost.
ids []tree.SectionID
// confidences is returned as Result.Confidences by SelectWithCost.
confidences map[tree.SectionID]float64
// usage is returned as Result.Usage by SelectWithCost.
usage retrieval.Usage
// calls counts Select and SelectWithCost invocations; it is bumped
// with atomic.AddInt32.
calls int32
}

// Name returns the fixed identifier "stub".
func (s *stubStrategy) Name() string {
	const name = "stub"
	return name
}

// Select records the call and returns the canned ids. The context, tree,
// query, and budget arguments are ignored; the error is always nil.
func (s *stubStrategy) Select(ctx context.Context, t *tree.Tree, query string, budget retrieval.ContextBudget) ([]tree.SectionID, error) {
	_ = atomic.AddInt32(&s.calls, 1)
	picked := s.ids
	return picked, nil
}

// SelectWithCost records the call and returns a Result assembled from the
// canned ids, confidences, and usage, with HopsTaken fixed at 1. The
// context, tree, query, and budget arguments are ignored; the error is
// always nil.
func (s *stubStrategy) SelectWithCost(ctx context.Context, t *tree.Tree, query string, budget retrieval.ContextBudget) (*retrieval.Result, error) {
	_ = atomic.AddInt32(&s.calls, 1)

	res := new(retrieval.Result)
	res.SelectedIDs = s.ids
	res.Confidences = s.confidences
	res.Usage = s.usage
	res.HopsTaken = 1
	return res, nil
}

// abstentionRouter builds a chi router that exposes only the two
// endpoints relevant to abstention: POST /v1/query is served by
// d.handleQuery and POST /v1/answer by d.handleAnswer. It performs no
// stubbing itself; whatever Strategy, storage, or LLM the handlers use
// must already be set on d by the caller.
//
// No test visible in this file drives the router yet; it is kept
// referenced by a blank assignment at the bottom of the file.
func abstentionRouter(d Deps) http.Handler {
	root := chi.NewRouter()
	root.Route("/v1", func(v1 chi.Router) {
		v1.Post("/query", d.handleQuery)
		v1.Post("/answer", d.handleAnswer)
	})
	return root
}

// TestRespondAbstained pins the abstention response shape for /v1/query:
// status 200, abstained=true, an abstention_reason mentioning
// "confidence", min_confidence_threshold echoing the configured Below
// value, an empty sections list, and candidate_confidences carrying the
// per-section scores.
//
// handleQuery cannot run here without a DB-backed tree loader, so the
// test calls respondAbstained on a Deps value the way the handler would
// and asserts on what gets written to the recorder.
func TestRespondAbstained(t *testing.T) {
	t.Parallel()

	deps := Deps{
		Strategy: &stubStrategy{ids: []tree.SectionID{"sec_a"}},
		Abstain:  config.AbstainBlock{Enabled: true, Below: 0.4},
	}
	lowScores := map[tree.SectionID]float64{"sec_a": 0.12, "sec_b": 0.30}

	rec := httptest.NewRecorder()
	deps.respondAbstained(rec, tree.DocumentID("doc_x"), "what is x?", lowScores, nil)

	if status := rec.Code; status != http.StatusOK {
		t.Fatalf("status = %d, want 200", status)
	}

	var payload map[string]any
	if err := json.Unmarshal(rec.Body.Bytes(), &payload); err != nil {
		t.Fatal(err)
	}

	// A missing or wrongly-typed field degrades to its zero value here,
	// which each check below then rejects.
	abstained, _ := payload["abstained"].(bool)
	if !abstained {
		t.Error("response must carry abstained=true")
	}

	reason, _ := payload["abstention_reason"].(string)
	if !strings.Contains(reason, "confidence") {
		t.Errorf("abstention_reason missing 'confidence': %q", reason)
	}

	threshold, _ := payload["min_confidence_threshold"].(float64)
	if threshold != 0.4 {
		t.Errorf("min_confidence_threshold = %v, want 0.4", threshold)
	}

	sections, _ := payload["sections"].([]any)
	if len(sections) != 0 {
		t.Errorf("sections must be empty, got %v", sections)
	}

	candidates, ok := payload["candidate_confidences"].(map[string]any)
	if !ok {
		t.Fatal("candidate_confidences missing")
	}
	if candidates["sec_a"] != 0.12 {
		t.Errorf("sec_a confidence = %v, want 0.12", candidates["sec_a"])
	}
}

// TestRespondAbstainedAnswer pins the abstention response shape for
// /v1/answer: status 200, abstained=true, an answer containing the
// canonical "cannot answer" refusal, an empty citations list, and the
// usage passed in by the caller echoed back in the usage block.
func TestRespondAbstainedAnswer(t *testing.T) {
	t.Parallel()
	d := Deps{
		Strategy: &stubStrategy{ids: []tree.SectionID{"sec_a"}},
		Abstain:  config.AbstainBlock{Enabled: true, Below: 0.4},
		Logger:   slog.Default(),
	}
	confidences := map[tree.SectionID]float64{"sec_a": 0.1}
	usage := retrieval.Usage{InputTokens: 100, OutputTokens: 20, TotalTokens: 120, LLMCalls: 2}

	rec := httptest.NewRecorder()
	d.respondAbstainedAnswer(rec, tree.DocumentID("doc_x"), "q", confidences, nil, usage, time.Now())

	if rec.Code != http.StatusOK {
		t.Fatalf("status = %d, want 200", rec.Code)
	}
	var body map[string]any
	if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil {
		t.Fatal(err)
	}
	if v, _ := body["abstained"].(bool); !v {
		t.Error("answer response must carry abstained=true")
	}
	if v, _ := body["answer"].(string); !strings.Contains(v, "cannot answer") {
		t.Errorf("answer must be the canonical refusal, got %q", v)
	}
	if v, _ := body["citations"].([]any); len(v) != 0 {
		t.Errorf("citations must be empty, got %v", v)
	}

	// Usage carried through (planning + retrieval — no synthesis).
	u, ok := body["usage"].(map[string]any)
	if !ok {
		t.Error("usage block missing")
		return
	}
	// Comma-ok assertion: a missing or non-numeric llm_calls must fail
	// this test with a readable message rather than panic and take the
	// rest of the test binary down with it.
	if calls, isNum := u["llm_calls"].(float64); !isNum || calls != 2 {
		t.Errorf("usage.llm_calls = %v, want 2", u["llm_calls"])
	}
}

// TestRespondAbstainedTraceTokenAbsent checks that an abstention response
// carries no trace_token. Replay isn't meaningful for an abstention (the
// engine produced no retrieval result), so callers must not be handed a
// token that would replay nothing.
func TestRespondAbstainedTraceTokenAbsent(t *testing.T) {
	t.Parallel()
	d := Deps{
		Strategy: &stubStrategy{},
		Abstain:  config.AbstainBlock{Enabled: true, Below: 0.4},
	}
	rec := httptest.NewRecorder()
	d.respondAbstained(rec, tree.DocumentID("doc_x"), "q", map[tree.SectionID]float64{"a": 0.1}, nil)

	// Guard against a vacuous pass: an error status or an undecodable body
	// would leave the map without a trace_token key for the wrong reason.
	if rec.Code != http.StatusOK {
		t.Fatalf("status = %d, want 200", rec.Code)
	}
	var body map[string]any
	if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil {
		t.Fatalf("abstention response is not valid JSON: %v", err)
	}

	if _, has := body["trace_token"]; has {
		t.Error("abstention response must NOT carry trace_token")
	}
}

// boolPtr returns a pointer to a fresh copy of b. The body-override tests
// use it to build the optional *bool request field inline.
func boolPtr(b bool) *bool {
	v := b
	return &v
}

// --- end-to-end through ServeHTTP without DB ---
//
// To exercise handleQuery / handleAnswer end-to-end we'd need a
// db.Pool. Instead we cover the in-handler logic by directly calling
// the helpers above (which is what the handler itself does on the
// abstention path) and by running the predicate tests through the
// handler-facing entrypoint via shouldAbstain + abstentionEnabled.
// A future test pass with a real test DB will exercise the full
// stack — for now, the abstention contract is unit-tested at the
// helper boundary, which is the only place the contract lives.

// mockLLMNeverCalled is a tripwire LLM for the abstention path: its
// Complete method reports a test error whenever it is invoked, so a
// synthesis call that leaks through on /v1/answer abstention fails the
// test loudly.
type mockLLMNeverCalled struct {
	// t is the test that gets failed if Complete is ever called.
	t *testing.T
}

// Complete marks the owning test as failed, because no completion should
// be requested on the abstention path, then returns an empty response
// with a nil error so the caller can carry on.
func (m mockLLMNeverCalled) Complete(ctx context.Context, req llmgate.Request) (*llmgate.Response, error) {
	m.t.Error("LLM should not be called on the abstention path")

	empty := &llmgate.Response{Content: ""}
	return empty, nil
}

// CountTokens returns a rough estimate of one token per four bytes of s.
// Unlike Complete it is not a tripwire: it never fails the test and its
// error is always nil.
func (m mockLLMNeverCalled) CountTokens(ctx context.Context, s string) (int, error) {
	estimate := len(s) / 4
	return estimate, nil
}

// TestRespondAbstainedAnswerSkipsSynthesis checks that the /v1/answer
// abstention helper does not invoke the LLM. Deps.LLM is set to
// mockLLMNeverCalled, whose Complete fails the test if called, so any
// synthesis that leaks through shows up as a failure here.
func TestRespondAbstainedAnswerSkipsSynthesis(t *testing.T) {
	t.Parallel()

	deps := Deps{
		Strategy: &stubStrategy{},
		Abstain:  config.AbstainBlock{Enabled: true, Below: 0.4},
		LLM:      mockLLMNeverCalled{t: t},
	}
	lowScores := map[tree.SectionID]float64{"a": 0.1}

	rec := httptest.NewRecorder()
	deps.respondAbstainedAnswer(rec, tree.DocumentID("doc_x"), "q", lowScores, nil, retrieval.Usage{}, time.Now())

	if status := rec.Code; status != http.StatusOK {
		t.Fatalf("status = %d, want 200", status)
	}
}

// Blank assignments that keep otherwise-unreferenced names in use: the
// bytes and io imports (Go rejects unused imports at compile time) and
// the abstentionRouter helper, which no test in this file calls yet.
var (
	_ = bytes.NewReader
	_ = io.EOF
	_ = abstentionRouter
)
Loading
Loading