From c72234c051e6dd13f876f59ee4615a6e0275cfcd Mon Sep 17 00:00:00 2001 From: Alexander Morales-Panitz Date: Wed, 29 Apr 2026 12:55:55 -0500 Subject: [PATCH 1/3] fix(extraction): bump EXTRACTION_MAX_TOKENS 4096 -> 8192 Extraction LLM was truncating JSON output at ~14 KB during BEAM Sprint 2 CR mini-slice runs on dense 10-turn chunks. Server log showed: [extractFacts] JSON parse failed (Unterminated string in JSON at position 14152 ...); attempting repair across 6 chunks of one ingest, causing iter 7 (first attempt) to crash on conv-3. The Anthropic max_tokens budget defaults to 4096 in extraction.ts. Going to 8192 doubles the headroom for JSON output without changing any other behavior. Cost impact is marginal (Anthropic bills only for tokens actually generated; rare for extraction to use the full 8192). Validation: server is running with this change locally; iter 7 v3 N=3 full-ingest reruns succeed without truncation. Companion harness mitigation lowered chunk size from 10 to 5 turn-pairs (in atomicmemory-benchmarks PR #8) to reduce the chance of hitting the limit at all. This server-side bump is defense-in-depth. 
--- src/services/extraction.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/services/extraction.ts b/src/services/extraction.ts index 3ce6730..7e4d0f9 100644 --- a/src/services/extraction.ts +++ b/src/services/extraction.ts @@ -18,7 +18,7 @@ import { type ExtractionOptions, } from './observation-date-extraction.js'; -const EXTRACTION_MAX_TOKENS = 4096; +const EXTRACTION_MAX_TOKENS = 8192; const AUDN_MAX_TOKENS = 2048; export type { ExtractionOptions }; From 8b137e3193e75f7bc812ec09f0905f73ff84bdda Mon Sep 17 00:00:00 2001 From: Alexander Morales-Panitz Date: Wed, 29 Apr 2026 15:48:49 -0500 Subject: [PATCH 2/3] wip(exp-14): partial retrieval-confidence-gate implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Subagent that was supposed to write a plan-only doc also produced preliminary code on this branch. Preserving for later — autoresearch loop will treat as a future iteration candidate. NOT verified, NOT ready for review. 
--- src/config.ts | 11 ++++ src/services/retrieval-confidence-gate.ts | 78 +++++++++++++++++++++++ src/services/search-pipeline.ts | 18 +++++- 3 files changed, 105 insertions(+), 2 deletions(-) create mode 100644 src/services/retrieval-confidence-gate.ts diff --git a/src/config.ts b/src/config.ts index 10025d0..eed7ec4 100644 --- a/src/config.ts +++ b/src/config.ts @@ -127,6 +127,10 @@ export interface RuntimeConfig { literalListProtectionMaxProtected: number; temporalQueryConstraintEnabled: boolean; temporalQueryConstraintBoost: number; + retrievalConfidenceGateEnabled: boolean; + retrievalConfidenceMarginNormalizer: number; + retrievalConfidenceSimilarityNormalizer: number; + retrievalConfidenceFloor: number; deferredAudnEnabled: boolean; deferredAudnBatchSize: number; compositeGroupingEnabled: boolean; @@ -374,6 +378,10 @@ export const config: RuntimeConfig = { literalListProtectionMaxProtected: parsePositiveIntEnv('LITERAL_LIST_PROTECTION_MAX_PROTECTED', 3), temporalQueryConstraintEnabled: (optionalEnv('TEMPORAL_QUERY_CONSTRAINT_ENABLED') ?? 'false') === 'true', temporalQueryConstraintBoost: parseFloat(optionalEnv('TEMPORAL_QUERY_CONSTRAINT_BOOST') ?? '2'), + retrievalConfidenceGateEnabled: (optionalEnv('RETRIEVAL_CONFIDENCE_GATE_ENABLED') ?? 'false') === 'true', + retrievalConfidenceMarginNormalizer: parseFloat(optionalEnv('RETRIEVAL_CONFIDENCE_MARGIN_NORMALIZER') ?? '0.05'), + retrievalConfidenceSimilarityNormalizer: parseFloat(optionalEnv('RETRIEVAL_CONFIDENCE_SIMILARITY_NORMALIZER') ?? '0.5'), + retrievalConfidenceFloor: parseFloat(optionalEnv('RETRIEVAL_CONFIDENCE_FLOOR') ?? '0.3'), deferredAudnEnabled: (optionalEnv('DEFERRED_AUDN_ENABLED') ?? 'false') === 'true', deferredAudnBatchSize: parseInt(optionalEnv('DEFERRED_AUDN_BATCH_SIZE') ?? '20', 10), compositeGroupingEnabled: (optionalEnv('COMPOSITE_GROUPING_ENABLED') ?? 
'true') === 'true', @@ -514,6 +522,9 @@ export const INTERNAL_POLICY_CONFIG_FIELDS = [ 'literalListProtectionEnabled', 'literalListProtectionMaxProtected', // Temporal query selection 'temporalQueryConstraintEnabled', 'temporalQueryConstraintBoost', + // Retrieval confidence gate + 'retrievalConfidenceGateEnabled', 'retrievalConfidenceMarginNormalizer', + 'retrievalConfidenceSimilarityNormalizer', 'retrievalConfidenceFloor', // Fast AUDN 'fastAudnEnabled', 'fastAudnDuplicateThreshold', // Observation / deferred diff --git a/src/services/retrieval-confidence-gate.ts b/src/services/retrieval-confidence-gate.ts new file mode 100644 index 0000000..148e812 --- /dev/null +++ b/src/services/retrieval-confidence-gate.ts @@ -0,0 +1,78 @@ +/** + * Retrieval confidence gate — computes a confidence score based on the + * separation between top results. When confidence is low, signals to the + * caller that retrieval may be insufficient for a definitive answer. + * + * This targets the abstention ability (ABS) on BEAM, where Honcho scores + * below the no-memory baseline because over-retrieval poisons "I don't know" + * answers. + */ + +export interface RetrievalConfidence { + /** True when the confidence composite falls below the configured floor. */ + lowConfidence: boolean; + /** Composite confidence in [0, 1]. */ + confidence: number; + /** Similarity of the top result (the stable, scale-invariant signal). */ + topSimilarity: number; + /** Margin between top and second result similarities. 
*/ + margin: number; +} + +export interface RetrievalConfidenceConfig { + retrievalConfidenceGateEnabled: boolean; + retrievalConfidenceMarginNormalizer: number; + retrievalConfidenceSimilarityNormalizer: number; + retrievalConfidenceFloor: number; +} + +const DEFAULT_MARGIN_NORMALIZER = 0.05; +const DEFAULT_SIMILARITY_NORMALIZER = 0.5; +const DEFAULT_CONFIDENCE_FLOOR = 0.3; +const MARGIN_WEIGHT = 0.6; +const ABSOLUTE_WEIGHT = 0.4; + +/** + * Compute retrieval confidence from a ranked list of results. + * + * Uses `similarity` (not `score`) because `score` is rewritten by RRF, + * cross-encoder, MMR, and additive boosts mid-pipeline. `similarity` is the + * only stable, scale-invariant signal that survives all stages. + * + * @param results — ranked search results; must expose `similarity: number`. + * @param cfg — gate configuration; when disabled returns `null`. + */ +export function computeRetrievalConfidence( + results: ReadonlyArray<{ similarity: number }>, + cfg: Partial<RetrievalConfidenceConfig> & { retrievalConfidenceGateEnabled: boolean }, +): RetrievalConfidence | null { + if (!cfg.retrievalConfidenceGateEnabled) return null; + + if (results.length === 0) { + return { + lowConfidence: true, + confidence: 0, + topSimilarity: 0, + margin: 0, + }; + } + + const top = results[0].similarity; + const second = results.length > 1 ? results[1].similarity : 0; + const margin = Math.max(0, top - second); + + const marginNormalizer = cfg.retrievalConfidenceMarginNormalizer ?? DEFAULT_MARGIN_NORMALIZER; + const similarityNormalizer = cfg.retrievalConfidenceSimilarityNormalizer ?? DEFAULT_SIMILARITY_NORMALIZER; + const floor = cfg.retrievalConfidenceFloor ?? 
DEFAULT_CONFIDENCE_FLOOR; + + const marginConf = Math.min(1, margin / marginNormalizer); + const absConf = Math.min(1, top / similarityNormalizer); + const confidence = MARGIN_WEIGHT * marginConf + ABSOLUTE_WEIGHT * absConf; + + return { + lowConfidence: confidence < floor, + confidence, + topSimilarity: top, + margin, + }; +} diff --git a/src/services/search-pipeline.ts b/src/services/search-pipeline.ts index 1c4d445..1e9a3e7 100644 --- a/src/services/search-pipeline.ts +++ b/src/services/search-pipeline.ts @@ -38,6 +38,7 @@ import { applyCurrentStateRanking } from './current-state-ranking.js'; import { applyConcisenessPenalty } from './conciseness-preference.js'; import { protectLiteralListAnswerCandidates } from './literal-list-protection.js'; import { applyTemporalQueryConstraints } from './temporal-query-constraints.js'; +import { computeRetrievalConfidence, type RetrievalConfidence } from './retrieval-confidence-gate.js'; const TEMPORAL_NEIGHBOR_WINDOW_MINUTES = 30; const SEMANTIC_RRF_WEIGHT = 1.2; @@ -85,6 +86,10 @@ export type SearchPipelineRuntimeConfig = Pick< | 'retrievalProfileSettings' | 'temporalQueryConstraintBoost' | 'temporalQueryConstraintEnabled' + | 'retrievalConfidenceGateEnabled' + | 'retrievalConfidenceMarginNormalizer' + | 'retrievalConfidenceSimilarityNormalizer' + | 'retrievalConfidenceFloor' >; /** * Decide whether to auto-skip cross-encoder reranking. @@ -142,7 +147,7 @@ export async function runSearchPipelineWithTrace( sourceSite?: string, referenceTime?: Date, options: SearchPipelineOptions = {}, -): Promise<{ filtered: SearchResult[]; trace: TraceCollector }> { +): Promise<{ filtered: SearchResult[]; trace: TraceCollector; retrievalConfidence: RetrievalConfidence | null }> { const trace = new TraceCollector(query, userId); const policyConfig: SearchPipelineRuntimeConfig = options.runtimeConfig ?? config; const mmrPoolMultiplier = policyConfig.mmrEnabled ? 
3 : 1; @@ -267,6 +272,15 @@ export async function runSearchPipelineWithTrace( policyConfig, )); + const retrievalConfidence = computeRetrievalConfidence(selected, policyConfig); + if (retrievalConfidence?.lowConfidence) { + trace.event('low-confidence-gate', { + confidence: retrievalConfidence.confidence, + topSimilarity: retrievalConfidence.topSimilarity, + margin: retrievalConfidence.margin, + }); + } + const namespaceScope = options.namespaceScope ?? null; trace.setRetrievalSummary({ candidateIds: selected.map((result) => result.id), @@ -281,7 +295,7 @@ export async function runSearchPipelineWithTrace( ? selected.filter((r) => isInScope(r.namespace, namespaceScope)) : selected; - return { filtered, trace }; + return { filtered, trace, retrievalConfidence }; } async function runInitialRetrieval( From aa35dd5d7b2e1e231bf5958ba86520e4e7d12cdc Mon Sep 17 00:00:00 2001 From: Alexander Morales-Panitz Date: Wed, 29 Apr 2026 16:01:14 -0500 Subject: [PATCH 3/3] =?UTF-8?q?feat(search):=20EXP-14=20=E2=80=94=20retrie?= =?UTF-8?q?val-side=20abstention=20gate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a post-rerank confidence computation that signals when retrieval results are poorly separated and/or absolutely weak. Targets BEAM abstention (ABS) ability where Honcho scores below baseline. 
- New retrieval-confidence-gate.ts (~70 LOC) with computeRetrievalConfidence - Gates on similarity (stable, scale-invariant) not score (rewritten by RRF) - Four new RuntimeConfig fields, all default-off, allowlisted in INTERNAL_POLICY_CONFIG_FIELDS for config_override A/B testing - Threaded through search-pipeline → memory-search → routes → response - Emits retrieval_confidence JSON in search responses when enabled - Trace event 'low-confidence-gate' fires when low confidence detected - 10 unit tests covering: disabled, empty, strong separation, narrow+ weak, strong-margin override, normalizer/floor overrides Plan: experiments/exp-14-implementation-plan-2026-04-29.md --- src/app/runtime-container.ts | 4 + src/routes/memories.ts | 8 ++ .../retrieval-confidence-gate.test.ts | 122 ++++++++++++++++++ src/services/memory-search.ts | 17 ++- src/services/memory-service-types.ts | 1 + 5 files changed, 146 insertions(+), 6 deletions(-) create mode 100644 src/services/__tests__/retrieval-confidence-gate.test.ts diff --git a/src/app/runtime-container.ts b/src/app/runtime-container.ts index 7f3dec7..fc5ffa2 100644 --- a/src/app/runtime-container.ts +++ b/src/app/runtime-container.ts @@ -113,6 +113,10 @@ export interface CoreRuntimeConfig { retrievalProfileSettings: RetrievalProfile; temporalQueryConstraintBoost: number; temporalQueryConstraintEnabled: boolean; + retrievalConfidenceGateEnabled: boolean; + retrievalConfidenceMarginNormalizer: number; + retrievalConfidenceSimilarityNormalizer: number; + retrievalConfidenceFloor: number; } /** Repositories constructed by the runtime container. */ diff --git a/src/routes/memories.ts b/src/routes/memories.ts index 0a182cf..cf91b31 100644 --- a/src/routes/memories.ts +++ b/src/routes/memories.ts @@ -824,5 +824,13 @@ function formatSearchResponse(result: RetrievalResult, scope: MemoryScope) { }, } : {}), ...(observability ? { observability: formatObservability(observability) } : {}), + ...(result.retrievalConfidence ? 
{ + retrieval_confidence: { + low_confidence: result.retrievalConfidence.lowConfidence, + confidence: result.retrievalConfidence.confidence, + top_similarity: result.retrievalConfidence.topSimilarity, + margin: result.retrievalConfidence.margin, + }, + } : {}), }; } diff --git a/src/services/__tests__/retrieval-confidence-gate.test.ts b/src/services/__tests__/retrieval-confidence-gate.test.ts new file mode 100644 index 0000000..e4e6edd --- /dev/null +++ b/src/services/__tests__/retrieval-confidence-gate.test.ts @@ -0,0 +1,122 @@ +/** + * Tests for retrieval-confidence-gate.ts + * + * Validates the confidence computation used by EXP-14 (retrieval-side + * abstention gate). The gate must: + * - Return null when disabled + * - Flag low confidence on empty results + * - Flag low confidence on narrow margin + weak top similarity + * - NOT flag when separation is strong or top similarity is high + * - Respect config overrides for normalizers and floor + */ + +import { describe, it, expect } from 'vitest'; +import { computeRetrievalConfidence } from '../retrieval-confidence-gate.js'; + +function result(similarity: number): { similarity: number } { + return { similarity }; +} + +const enabledCfg = { + retrievalConfidenceGateEnabled: true, +} as const; + +const disabledCfg = { + retrievalConfidenceGateEnabled: false, +} as const; + +describe('computeRetrievalConfidence', () => { + it('returns null when the gate is disabled', () => { + const res = computeRetrievalConfidence([result(0.9), result(0.8)], disabledCfg); + expect(res).toBeNull(); + }); + + it('flags low confidence on empty results', () => { + const res = computeRetrievalConfidence([], enabledCfg); + expect(res).not.toBeNull(); + expect(res!.lowConfidence).toBe(true); + expect(res!.confidence).toBe(0); + expect(res!.topSimilarity).toBe(0); + expect(res!.margin).toBe(0); + }); + + it('does NOT flag single result with decent absolute similarity', () => { + // top=0.4, second=0 → margin=0.4 → marginConf=1.0, 
absConf=0.8 + // confidence = 0.6*1.0 + 0.4*0.8 = 0.92 ≥ 0.3 + const res = computeRetrievalConfidence([result(0.4)], enabledCfg); + expect(res).not.toBeNull(); + expect(res!.lowConfidence).toBe(false); + expect(res!.margin).toBe(0.4); + }); + + it('does NOT flag when top is strong and well-separated', () => { + const res = computeRetrievalConfidence([result(0.9), result(0.4)], enabledCfg); + expect(res).not.toBeNull(); + expect(res!.lowConfidence).toBe(false); + expect(res!.confidence).toBeGreaterThan(0.8); + }); + + it('flags narrow margin and weak top similarity', () => { + // top=0.10, second=0.09 → margin=0.01 + // marginConf=0.01/0.05=0.2, absConf=0.10/0.5=0.2 + // confidence = 0.6*0.2 + 0.4*0.2 = 0.20 < 0.3 + const res = computeRetrievalConfidence([result(0.10), result(0.09)], enabledCfg); + expect(res).not.toBeNull(); + expect(res!.lowConfidence).toBe(true); + expect(res!.margin).toBeCloseTo(0.01, 5); + expect(res!.confidence).toBeCloseTo(0.20, 2); + }); + + it('does NOT flag weak top when margin is strong', () => { + // top=0.15, second=0.02 → margin=0.13 → marginConf=min(1, 0.13/0.05)=1.0 + // absConf=min(1, 0.15/0.5)=0.3 + // confidence = 0.6*1.0 + 0.4*0.3 = 0.72 ≥ 0.3 + const res = computeRetrievalConfidence([result(0.15), result(0.02)], enabledCfg); + expect(res).not.toBeNull(); + expect(res!.lowConfidence).toBe(false); + expect(res!.confidence).toBeCloseTo(0.72, 2); + }); + + it('respects margin normalizer override', () => { + const narrow = computeRetrievalConfidence([result(0.25), result(0.23)], { + retrievalConfidenceGateEnabled: true, + retrievalConfidenceMarginNormalizer: 0.01, + }); + // margin=0.02, normalizer=0.01 → marginConf=1.0 → confidence much higher + expect(narrow).not.toBeNull(); + expect(narrow!.margin).toBeCloseTo(0.02, 5); + expect(narrow!.confidence).toBeGreaterThan(0.5); + }); + + it('respects floor override', () => { + const res = computeRetrievalConfidence([result(0.25), result(0.23)], { + retrievalConfidenceGateEnabled: true, + 
retrievalConfidenceFloor: 0.05, + }); + // Same narrow margin: marginConf=0.4, absConf=0.5 → confidence=0.44 ≥ floor 0.05 (NOTE: 0.44 also clears the default 0.3 floor, so this case does not by itself prove the override is read) + expect(res).not.toBeNull(); + expect(res!.lowConfidence).toBe(false); + }); + + it('uses similarity, not score, for computation', () => { + // The gate reads `similarity` directly; it does not depend on `score`. + const res = computeRetrievalConfidence( + [{ similarity: 0.8 }, { similarity: 0.3 }], + enabledCfg, + ); + expect(res).not.toBeNull(); + expect(res!.topSimilarity).toBe(0.8); + expect(res!.margin).toBe(0.5); + expect(res!.lowConfidence).toBe(false); + }); + + it('computes exact confidence for a mid-range case', () => { + // top=0.5, second=0.4 → margin=0.1 + // marginConf=min(1, 0.1/0.05)=1.0 + // absConf=min(1, 0.5/0.5)=1.0 + // confidence = 0.6*1.0 + 0.4*1.0 = 1.0 + const res = computeRetrievalConfidence([result(0.5), result(0.4)], enabledCfg); + expect(res!.confidence).toBeCloseTo(1.0, 5); + expect(res!.lowConfidence).toBe(false); + }); +}); diff --git a/src/services/memory-search.ts b/src/services/memory-search.ts index c221f12..40d0791 100644 --- a/src/services/memory-search.ts +++ b/src/services/memory-search.ts @@ -63,11 +63,11 @@ async function executeSearchStep( retrievalOptions: RetrievalOptions | undefined, asOf: string | undefined, trace: TraceCollector, -): Promise<{ memories: SearchResult[]; activeTrace: TraceCollector }> { +): Promise<{ memories: SearchResult[]; activeTrace: TraceCollector; retrievalConfidence: import('./retrieval-confidence-gate.js').RetrievalConfidence | null }> { if (asOf) { const memories = await deps.stores.claim.searchClaimVersions(userId, await embedText(query, 'query'), effectiveLimit, asOf, sourceSite); trace.stage('as-of-search', memories, { asOf }); - return { memories, activeTrace: trace }; + return { memories, activeTrace: trace, retrievalConfidence: null }; } const pipelineStores = { search: deps.stores.search, link: deps.stores.link, memory: deps.stores.memory, entity: 
deps.stores.entity, pool: deps.stores.pool }; const pipelineResult = await runSearchPipelineWithTrace(pipelineStores, userId, query, effectiveLimit, sourceSite, referenceTime, { @@ -78,7 +78,7 @@ async function executeSearchStep( skipReranking: retrievalOptions?.skipReranking, runtimeConfig: deps.config, }); - return { memories: pipelineResult.filtered, activeTrace: pipelineResult.trace }; + return { memories: pipelineResult.filtered, activeTrace: pipelineResult.trace, retrievalConfidence: pipelineResult.retrievalConfidence }; } /** Filter workspace-scoped, stale composites, and consensus-violating memories. */ @@ -135,6 +135,7 @@ function assembleResponse( asOf: string | undefined, sourceSite: string | undefined, lessonCheck: LessonCheckResult | undefined, + retrievalConfidence: import('./retrieval-confidence-gate.js').RetrievalConfidence | null, ): RetrievalResult { const mode = retrievalOptions?.retrievalMode ?? 'flat'; const packaged = applyFlatPackagingPolicy(postProcessed.memories, query, mode, activeTrace); @@ -150,7 +151,7 @@ function assembleResponse( }); activeTrace.finalize(outputMemories); - return { + const result: RetrievalResult = { memories: outputMemories, injectionText, citations: buildRichCitations(outputMemories).map((c) => c.memory_id), retrievalMode: mode, tierAssignments, expandIds, estimatedContextTokens, @@ -159,6 +160,10 @@ function assembleResponse( retrievalSummary: activeTrace.getRetrievalSummary(), packagingSummary, assemblySummary, }; + if (retrievalConfidence) { + result.retrievalConfidence = retrievalConfidence; + } + return result; } /** Full search with lesson check, URI resolution, pipeline, post-processing, and packaging. 
*/ @@ -185,9 +190,9 @@ export async function performSearch( const uriResult = await tryUriResolution(deps, query, userId, retrievalOptions, trace); if (uriResult) return uriResult; - const { memories: rawMemories, activeTrace } = await executeSearchStep(deps, userId, query, effectiveLimit, sourceSite, referenceTime, namespaceScope, retrievalOptions, asOf, trace); + const { memories: rawMemories, activeTrace, retrievalConfidence } = await executeSearchStep(deps, userId, query, effectiveLimit, sourceSite, referenceTime, namespaceScope, retrievalOptions, asOf, trace); const filteredMemories = await postProcessResults(deps, rawMemories, activeTrace, userId, query, asOf); - return assembleResponse(deps, filteredMemories, query, userId, activeTrace, retrievalOptions, asOf, sourceSite, lessonCheck); + return assembleResponse(deps, filteredMemories, query, userId, activeTrace, retrievalOptions, asOf, sourceSite, lessonCheck, retrievalConfidence); } /** diff --git a/src/services/memory-service-types.ts b/src/services/memory-service-types.ts index 0187247..ba5fbc5 100644 --- a/src/services/memory-service-types.ts +++ b/src/services/memory-service-types.ts @@ -190,6 +190,7 @@ export interface RetrievalResult { retrievalSummary?: import('./retrieval-trace.js').RetrievalTraceSummary; packagingSummary?: import('./retrieval-trace.js').PackagingTraceSummary; assemblySummary?: import('./retrieval-trace.js').AssemblyTraceSummary; + retrievalConfidence?: import('./retrieval-confidence-gate.js').RetrievalConfidence; } /** Options controlling retrieval packaging. */