From e2f0e4d8198c6df05b478713ef9c460132a0c0c8 Mon Sep 17 00:00:00 2001 From: Alexander Morales-Panitz Date: Wed, 29 Apr 2026 15:33:37 -0500 Subject: [PATCH] =?UTF-8?q?feat(extraction):=20EXP-06=20=E2=80=94=20generi?= =?UTF-8?q?c=20event=20anchors=20for=20As-of=20facts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a fact starts with 'As of <date>, ...' and no DESCRIPTOR_RULE matches, emit a generic event.occurred anchor with the date and subject recovered from the prefix. Behind new flag genericEventAnchorEnabled (default false). Targets BEAM TR. Stage 7 dry-run on iter 7 v3 had TR 1/2 and EO 0/2; much of the variance was on facts that had clear temporal phrasing but didn't match LoCoMo-style descriptors. The fall-through anchor restores them at retrieval time. Risks: anchor inflation (new flag is off by default to bound this); subject collapse on User-only facts (subject extractor returns null in ambiguous cases rather than emitting a wrong subject). New config keys (defaults-off): - genericEventAnchorEnabled: false Behind feature flag. Defaults preserve current behavior. 
--- src/config.ts | 3 + ...onsensus-extraction-runtime-config.test.ts | 10 ++- .../__tests__/event-anchor-facts.test.ts | 76 +++++++++++++++++++ src/services/consensus-extraction.ts | 10 ++- src/services/event-anchor-facts.ts | 76 +++++++++++++++---- src/services/extraction-enrichment.ts | 11 ++- src/services/extraction.ts | 4 +- src/services/memory-ingest.ts | 4 +- src/services/memory-service-types.ts | 1 + src/services/observation-date-extraction.ts | 7 ++ src/services/quick-extraction.ts | 9 ++- 11 files changed, 185 insertions(+), 26 deletions(-) diff --git a/src/config.ts b/src/config.ts index 10025d0..35ab6aa 100644 --- a/src/config.ts +++ b/src/config.ts @@ -83,6 +83,7 @@ export interface RuntimeConfig { consensusExtractionRuns: number; observationDateExtractionEnabled: boolean; quotedEntityExtractionEnabled: boolean; + genericEventAnchorEnabled: boolean; entropyGateEnabled: boolean; entropyGateThreshold: number; entropyGateAlpha: number; @@ -330,6 +331,7 @@ export const config: RuntimeConfig = { consensusExtractionRuns: parseInt(optionalEnv('CONSENSUS_EXTRACTION_RUNS') ?? '3', 10), observationDateExtractionEnabled: (optionalEnv('OBSERVATION_DATE_EXTRACTION_ENABLED') ?? 'false') === 'true', quotedEntityExtractionEnabled: (optionalEnv('QUOTED_ENTITY_EXTRACTION_ENABLED') ?? 'false') === 'true', + genericEventAnchorEnabled: (optionalEnv('GENERIC_EVENT_ANCHOR_ENABLED') ?? 'false') === 'true', entropyGateEnabled: (optionalEnv('ENTROPY_GATE_ENABLED') ?? 'false') === 'true', entropyGateThreshold: parseFloat(optionalEnv('ENTROPY_GATE_THRESHOLD') ?? '0.35'), entropyGateAlpha: parseFloat(optionalEnv('ENTROPY_GATE_ALPHA') ?? 
'0.5'), @@ -489,6 +491,7 @@ export const INTERNAL_POLICY_CONFIG_FIELDS = [ 'chunkSizeTurns', 'chunkOverlapTurns', 'consensusExtractionEnabled', 'consensusExtractionRuns', 'observationDateExtractionEnabled', 'quotedEntityExtractionEnabled', + 'genericEventAnchorEnabled', 'entropyGateEnabled', 'entropyGateThreshold', 'entropyGateAlpha', // Affinity clustering 'affinityClusteringThreshold', 'affinityClusteringMinSize', diff --git a/src/services/__tests__/consensus-extraction-runtime-config.test.ts b/src/services/__tests__/consensus-extraction-runtime-config.test.ts index 829a5b6..3ce1857 100644 --- a/src/services/__tests__/consensus-extraction-runtime-config.test.ts +++ b/src/services/__tests__/consensus-extraction-runtime-config.test.ts @@ -53,11 +53,12 @@ describe('consensusExtractFacts runtime config', () => { extractionCacheEnabled: false, observationDateExtractionEnabled: true, quotedEntityExtractionEnabled: false, + genericEventAnchorEnabled: false, }); expect(mockChunkedExtractFacts).toHaveBeenCalledWith( 'User: I commute 45 minutes.', - { observationDateExtractionEnabled: true }, + { observationDateExtractionEnabled: true, genericEventAnchorEnabled: false }, { chunkSizeTurns: 8, chunkOverlapTurns: 2, extractionCacheEnabled: false }, ); expect(mockCachedExtractFacts).not.toHaveBeenCalled(); @@ -82,14 +83,16 @@ describe('consensusExtractFacts runtime config', () => { extractionCacheEnabled: true, observationDateExtractionEnabled: false, quotedEntityExtractionEnabled: false, + genericEventAnchorEnabled: false, }); expect(mockCachedExtractFacts).toHaveBeenCalledWith(longConversation, { observationDateExtractionEnabled: false, + genericEventAnchorEnabled: false, }); expect(mockChunkedExtractFacts).toHaveBeenCalledWith( longConversation, - { observationDateExtractionEnabled: false }, + { observationDateExtractionEnabled: false, genericEventAnchorEnabled: false }, { chunkSizeTurns: 2, chunkOverlapTurns: 1, extractionCacheEnabled: true }, ); }); @@ -108,6 +111,7 @@ 
describe('consensusExtractFacts runtime config', () => { extractionCacheEnabled: true, observationDateExtractionEnabled: false, quotedEntityExtractionEnabled: false, + genericEventAnchorEnabled: false, }); expect(mockChunkedExtractFacts).not.toHaveBeenCalled(); @@ -127,10 +131,12 @@ describe('consensusExtractFacts runtime config', () => { extractionCacheEnabled: false, observationDateExtractionEnabled: false, quotedEntityExtractionEnabled: false, + genericEventAnchorEnabled: false, }); expect(mockExtractFacts).toHaveBeenCalledWith('User: I prefer Rust', { observationDateExtractionEnabled: false, + genericEventAnchorEnabled: false, }); expect(mockCachedExtractFacts).not.toHaveBeenCalled(); }); diff --git a/src/services/__tests__/event-anchor-facts.test.ts b/src/services/__tests__/event-anchor-facts.test.ts index d9621ca..f6f8007 100644 --- a/src/services/__tests__/event-anchor-facts.test.ts +++ b/src/services/__tests__/event-anchor-facts.test.ts @@ -4,6 +4,22 @@ import { describe, expect, it } from 'vitest'; import { quickExtractFacts } from '../quick-extraction.js'; +import { inferEventAnchorFacts } from '../event-anchor-facts.js'; +import type { ExtractedFact } from '../extraction.js'; + +function makeFact(text: string, overrides: Partial = {}): ExtractedFact { + return { + fact: text, + headline: overrides.headline ?? text.slice(0, 40), + importance: overrides.importance ?? 0.6, + type: overrides.type ?? 'knowledge', + keywords: overrides.keywords ?? [], + entities: overrides.entities ?? [], + relations: overrides.relations ?? [], + network: overrides.network, + opinionConfidence: overrides.opinionConfidence ?? 
null, + }; +} describe('event anchor facts', () => { it('emits mentorship.received anchors from relative-time facts', () => { @@ -60,3 +76,63 @@ describe('event anchor facts', () => { expect(facts.some((fact) => fact.fact.includes('event anchor trip.took_short_trip_rome'))).toBe(true); }); }); + +describe('event anchor facts — generic event.occurred fall-through (EXP-06)', () => { + it('emits a generic event.occurred anchor when flag is on and no rule matches', () => { + const fact = makeFact('As of January 2026, user is using PostgreSQL.'); + const anchors = inferEventAnchorFacts(fact, { genericEventAnchorEnabled: true }); + expect(anchors).toHaveLength(1); + expect(anchors[0].fact).toContain('event anchor event.occurred'); + expect(anchors[0].fact).toContain('for User'); + expect(anchors[0].fact).toContain('occurred on January 1, 2026'); + }); + + it('emits a generic event.occurred anchor for full-date prefix when flag is on', () => { + const fact = makeFact('As of March 15 2025, user completed the API migration.'); + const anchors = inferEventAnchorFacts(fact, { genericEventAnchorEnabled: true }); + expect(anchors).toHaveLength(1); + expect(anchors[0].fact).toContain('event anchor event.occurred'); + expect(anchors[0].fact).toContain('for User'); + expect(anchors[0].fact).toContain('occurred on March 15, 2025'); + }); + + it('emits no anchor when the flag is off, even if the prefix matches', () => { + const fact = makeFact('As of January 2026, user is using PostgreSQL.'); + expect(inferEventAnchorFacts(fact)).toHaveLength(0); + expect(inferEventAnchorFacts(fact, { genericEventAnchorEnabled: false })).toHaveLength(0); + }); + + it('emits no anchor for facts without an "As of " prefix', () => { + const fact = makeFact('User prefers Rust over Go.'); + expect(inferEventAnchorFacts(fact, { genericEventAnchorEnabled: true })).toHaveLength(0); + }); + + it('does not emit a generic anchor when a DESCRIPTOR_RULE already matches (regression)', () => { + const facts = 
quickExtractFacts([ + '[Session date: 2023-06-16]', + 'Jon: Gina, you won\'t believe it - I got mentored by this amazing business dude yesterday!', + ].join('\n')); + + // Re-run with the flag on by feeding the enriched facts back through. + // The DESCRIPTOR_RULES path emits mentorship.received and the generic + // fall-through must not also fire on the same source fact. + const sourceFact = facts.find((f) => /As of /i.test(f.fact) && !f.fact.includes('event anchor')); + expect(sourceFact).toBeDefined(); + const anchors = inferEventAnchorFacts(sourceFact as ExtractedFact, { genericEventAnchorEnabled: true }); + const labels = anchors.map((a) => a.headline); + expect(labels).toContain('Event mentorship.received'); + expect(labels).not.toContain('Event event.occurred'); + }); + + it('returns no anchors when subject cannot be inferred (graceful fallback)', () => { + const fact = makeFact('As of January 2026, the situation continues.'); + const anchors = inferEventAnchorFacts(fact, { genericEventAnchorEnabled: true }); + expect(anchors).toHaveLength(0); + }); + + it('returns no anchors on weird non-prefixed input rather than crashing', () => { + const fact = makeFact('Random unstructured text without temporal prefix.'); + expect(() => inferEventAnchorFacts(fact, { genericEventAnchorEnabled: true })).not.toThrow(); + expect(inferEventAnchorFacts(fact, { genericEventAnchorEnabled: true })).toHaveLength(0); + }); +}); diff --git a/src/services/consensus-extraction.ts b/src/services/consensus-extraction.ts index 1bdc4c7..1c5a797 100644 --- a/src/services/consensus-extraction.ts +++ b/src/services/consensus-extraction.ts @@ -35,6 +35,7 @@ export interface ConsensusExtractionConfig { extractionCacheEnabled: boolean; observationDateExtractionEnabled: boolean; quotedEntityExtractionEnabled: boolean; + genericEventAnchorEnabled: boolean; } interface FactWithEmbedding { @@ -86,7 +87,11 @@ function applyOptionalQuotedEntityExtraction( /** Run extractFacts() N times to get 
independent LLM samples. */ async function runMultipleExtractions( conversationText: string, - config: Pick, + config: Pick, ): Promise { const allRunFacts: ExtractedFact[][] = []; const options = buildExtractionOptions(config); @@ -97,10 +102,11 @@ async function runMultipleExtractions( } function buildExtractionOptions( - config: Pick, + config: Pick, ) { return { observationDateExtractionEnabled: config.observationDateExtractionEnabled, + genericEventAnchorEnabled: config.genericEventAnchorEnabled, }; } diff --git a/src/services/event-anchor-facts.ts b/src/services/event-anchor-facts.ts index 0189964..12431b1 100644 --- a/src/services/event-anchor-facts.ts +++ b/src/services/event-anchor-facts.ts @@ -15,7 +15,19 @@ interface EventAnchorDescriptor { eventDateIso: string; } +/** Options controlling event-anchor extraction behavior. */ +export interface EventAnchorOptions { + /** + * EXP-06: when no DESCRIPTOR_RULE matches but the fact has an `As of ,` + * prefix and a recoverable subject, emit a generic `event.occurred` anchor. + * Defaults to off. 
+ */ + genericEventAnchorEnabled?: boolean; +} + +const GENERIC_ANCHOR_LABEL = 'event.occurred'; const RECORDED_DATE_PATTERN = /^As of ([A-Za-z]+ \d{1,2} \d{4}),\s*/i; +const RECORDED_DATE_FLEXIBLE_PATTERN = /^As of ([A-Za-z]+(?:\s+\d{1,2})?\s+\d{4}),\s*(.*)$/i; const EXPLICIT_EVENT_ANCHOR_PATTERN = /\bevent anchor\s+[a-z.]+/i; const EVENT_DATE_PATTERN = /\boccurred on ([A-Za-z]+ \d{1,2} \d{4})\b/i; const NON_SUBJECT_TOKENS = new Set(['Hey', 'Long', 'Yesterday', 'Thats', 'Awesome', 'Oh', 'Paris', 'Rome', 'Barcelona']); @@ -34,7 +46,10 @@ const MONTH_INDEX: Record = { december: 11, }; -export function inferEventAnchorFacts(fact: ExtractedFact): ExtractedFact[] { +export function inferEventAnchorFacts( + fact: ExtractedFact, + options: EventAnchorOptions = {}, +): ExtractedFact[] { if (EXPLICIT_EVENT_ANCHOR_PATTERN.test(fact.fact)) { return []; } @@ -42,10 +57,14 @@ export function inferEventAnchorFacts(fact: ExtractedFact): ExtractedFact[] { if (!recordedDate) { return []; } - return inferDescriptors(fact, recordedDate).map((descriptor) => buildAnchorFact(fact, descriptor)); + return inferDescriptors(fact, recordedDate, options).map((descriptor) => buildAnchorFact(fact, descriptor)); } -function inferDescriptors(fact: ExtractedFact, recordedDate: Date): EventAnchorDescriptor[] { +function inferDescriptors( + fact: ExtractedFact, + recordedDate: Date, + options: EventAnchorOptions, +): EventAnchorDescriptor[] { const lower = fact.fact.toLowerCase(); const subject = inferSubject(fact); if (!subject) { @@ -61,6 +80,10 @@ function inferDescriptors(fact: ExtractedFact, recordedDate: Date): EventAnchorD } } + if (descriptors.length === 0 && options.genericEventAnchorEnabled) { + descriptors.push({ label: GENERIC_ANCHOR_LABEL, subject, eventDateIso }); + } + return dedupeDescriptors(descriptors); } @@ -112,7 +135,7 @@ function inferRomeLabels(lower: string): string[] { } function buildAnchorFact(sourceFact: ExtractedFact, descriptor: EventAnchorDescriptor): ExtractedFact { 
- const recordedPrefix = sourceFact.fact.match(RECORDED_DATE_PATTERN)?.[1]; + const recordedPrefix = extractRecordedPrefix(sourceFact.fact); const eventDateHuman = formatHumanDate(descriptor.eventDateIso); const anchorFact = `As of ${recordedPrefix}, event anchor ${descriptor.label} for ${descriptor.subject} occurred on ${eventDateHuman}.`; return { @@ -193,24 +216,49 @@ function dedupeDescriptors(descriptors: EventAnchorDescriptor[]): EventAnchorDes return [...unique.values()]; } +function extractRecordedPrefix(text: string): string { + const strict = text.match(RECORDED_DATE_PATTERN); + if (strict) { + return strict[1]; + } + const flexible = text.match(RECORDED_DATE_FLEXIBLE_PATTERN); + if (flexible) { + return flexible[1]; + } + return ''; +} + function parseRecordedDate(text: string): Date | null { - const match = text.match(RECORDED_DATE_PATTERN); - if (!match) { + const strict = text.match(RECORDED_DATE_PATTERN); + if (strict) { + return parseHumanDate(strict[1]); + } + const flexible = text.match(RECORDED_DATE_FLEXIBLE_PATTERN); + if (!flexible) { return null; } - return parseHumanDate(match[1]); + return parseHumanDate(flexible[1]); } function parseHumanDate(input: string): Date | null { - const match = input.match(/^([A-Za-z]+) (\d{1,2}) (\d{4})$/); - if (!match) { - return null; + const trimmed = input.trim(); + const fullMatch = trimmed.match(/^([A-Za-z]+)\s+(\d{1,2})\s+(\d{4})$/); + if (fullMatch) { + const month = MONTH_INDEX[fullMatch[1].toLowerCase()]; + if (month === undefined) { + return null; + } + return new Date(Date.UTC(Number(fullMatch[3]), month, Number(fullMatch[2]), 0, 0, 0, 0)); } - const month = MONTH_INDEX[match[1].toLowerCase()]; - if (month === undefined) { - return null; + const monthYearMatch = trimmed.match(/^([A-Za-z]+)\s+(\d{4})$/); + if (monthYearMatch) { + const month = MONTH_INDEX[monthYearMatch[1].toLowerCase()]; + if (month === undefined) { + return null; + } + return new Date(Date.UTC(Number(monthYearMatch[2]), month, 1, 0, 
0, 0, 0)); } - return new Date(Date.UTC(Number(match[3]), month, Number(match[2]), 0, 0, 0, 0)); + return null; } function formatHumanDate(isoDate: string): string { diff --git a/src/services/extraction-enrichment.ts b/src/services/extraction-enrichment.ts index 4086f99..071adb9 100644 --- a/src/services/extraction-enrichment.ts +++ b/src/services/extraction-enrichment.ts @@ -7,7 +7,9 @@ import type { ExtractedEntity, ExtractedFact, ExtractedRelation } from './extraction.js'; import { dedupeEntities } from './entity-dedup.js'; -import { inferEventAnchorFacts } from './event-anchor-facts.js'; +import { inferEventAnchorFacts, type EventAnchorOptions } from './event-anchor-facts.js'; + +export type EnrichmentOptions = EventAnchorOptions; const SELF_ENTITY: ExtractedEntity = { name: 'User', type: 'person' }; const SELF_MARKERS = ['user ', 'user\'s', 'i ', 'i\'m', 'i’ve', 'i have', 'my ']; @@ -49,10 +51,13 @@ const CANONICAL_ENTITY_NAMES: Record = { msr: 'Microsoft Research', }; -export function enrichExtractedFacts(facts: ExtractedFact[]): ExtractedFact[] { +export function enrichExtractedFacts( + facts: ExtractedFact[], + options: EnrichmentOptions = {}, +): ExtractedFact[] { const enriched = facts.flatMap((fact) => { const baseFact = enrichExtractedFact(fact); - return [baseFact, ...inferEventAnchorFacts(baseFact)]; + return [baseFact, ...inferEventAnchorFacts(baseFact, options)]; }); return dedupeFacts(enriched); } diff --git a/src/services/extraction.ts b/src/services/extraction.ts index 3ce6730..3471f24 100644 --- a/src/services/extraction.ts +++ b/src/services/extraction.ts @@ -320,7 +320,9 @@ export async function extractFacts( return timedSync('ingest.extract.post-process', () => { const normalized: ExtractedFact[] = rawFacts.map((m) => normalizeRawFact(m)); const anchoredFacts = applyObservationDateAnchors(normalized, conversationText, options); - const baseFacts = enrichExtractedFacts(normalizeExtractedFacts(anchoredFacts)); + const baseFacts = 
enrichExtractedFacts(normalizeExtractedFacts(anchoredFacts), { + genericEventAnchorEnabled: options.genericEventAnchorEnabled, + }); return mergeSupplementalFacts(baseFacts, conversationText); }); } diff --git a/src/services/memory-ingest.ts b/src/services/memory-ingest.ts index 606f49c..388b590 100644 --- a/src/services/memory-ingest.ts +++ b/src/services/memory-ingest.ts @@ -138,7 +138,9 @@ export async function performQuickIngest( const ingestStart = performance.now(); const logicalSessionTimestamp = resolveSessionDate(sessionTimestamp, conversationText); const episodeId = await deps.stores.episode.storeEpisode({ userId, content: conversationText, sourceSite, sourceUrl }); - const facts = timed('quick-ingest.extract', () => Promise.resolve(quickExtractFacts(conversationText))); + const facts = timed('quick-ingest.extract', () => Promise.resolve(quickExtractFacts(conversationText, { + genericEventAnchorEnabled: deps.config.genericEventAnchorEnabled, + }))); const extractedFacts = await facts; const traceCollector = new IngestTraceCollector(deps.config.ingestTraceEnabled); const acc = createIngestAccumulator(); diff --git a/src/services/memory-service-types.ts b/src/services/memory-service-types.ts index 0187247..f12ebf2 100644 --- a/src/services/memory-service-types.ts +++ b/src/services/memory-service-types.ts @@ -272,6 +272,7 @@ export interface IngestRuntimeConfig { extractionCacheEnabled: boolean; observationDateExtractionEnabled: boolean; quotedEntityExtractionEnabled: boolean; + genericEventAnchorEnabled: boolean; entityGraphEnabled: boolean; entropyGateAlpha: number; entropyGateEnabled: boolean; diff --git a/src/services/observation-date-extraction.ts b/src/services/observation-date-extraction.ts index 7529b2d..1e2a442 100644 --- a/src/services/observation-date-extraction.ts +++ b/src/services/observation-date-extraction.ts @@ -14,6 +14,13 @@ import { extractSessionTimestamp, parseSessionDate } from './session-date.js'; export interface ExtractionOptions { 
observationDateExtractionEnabled?: boolean; + /** + * EXP-06: when no DESCRIPTOR_RULE matches but the fact has an `As of ,` + * prefix and a recoverable subject, emit a generic `event.occurred` anchor. + * Threaded through to `enrichExtractedFacts` and `inferEventAnchorFacts`. + * Defaults to off. + */ + genericEventAnchorEnabled?: boolean; } export function buildExtractionUserMessage( diff --git a/src/services/quick-extraction.ts b/src/services/quick-extraction.ts index 1517c3d..118e325 100644 --- a/src/services/quick-extraction.ts +++ b/src/services/quick-extraction.ts @@ -17,7 +17,7 @@ */ import type { ExtractedFact, ExtractedEntity, ExtractedRelation } from './extraction.js'; -import { enrichExtractedFacts } from './extraction-enrichment.js'; +import { enrichExtractedFacts, type EnrichmentOptions } from './extraction-enrichment.js'; import { annotateRelativeTemporalText } from './relative-temporal.js'; import { isFactBearingAssistantTurn, isAssistantFactStatement } from './assistant-turn-filter.js'; import { @@ -403,7 +403,10 @@ function extractKeywords(sentence: string): string[] { * Processes both user turns (first-person fact detection) and fact-bearing * assistant turns (specific content detection). */ -export function quickExtractFacts(conversationText: string): ExtractedFact[] { +export function quickExtractFacts( + conversationText: string, + options: EnrichmentOptions = {}, +): ExtractedFact[] { const turns = extractFactBearingTurns(conversationText); const sessionDate = parseSessionDate(conversationText); const sessionDateValue = parseSessionDateValue(conversationText); @@ -414,7 +417,7 @@ export function quickExtractFacts(conversationText: string): ExtractedFact[] { extractFactsFromTurn(turn, conversationText, sessionDate, sessionDateValue, seenFacts, facts); } - return enrichExtractedFacts(facts); + return enrichExtractedFacts(facts, options); } /** Extract facts from a single turn's sentences and add to the accumulator. */