From 4de2539aafac417cb473ef3d005f64f48a9e4cea Mon Sep 17 00:00:00 2001
From: Abhijeet Prasad
Date: Tue, 19 May 2026 14:39:43 -0400
Subject: [PATCH] fix(livekit): measure eou detection delay

Use LiveKit EOU delay metrics to anchor eou_detection spans instead of
logging them as zero-duration events. Stop emitting a second
metrics-derived vad_endpointing span so raw VAD endpointing remains
distinct. Covered by the existing VCR-backed LiveKit voice turn
assertions.
---
 .../livekit_agents/test_livekit_agents.py  |  4 ++
 .../integrations/livekit_agents/tracing.py | 47 ++++++-------------
 2 files changed, 18 insertions(+), 33 deletions(-)

diff --git a/py/src/braintrust/integrations/livekit_agents/test_livekit_agents.py b/py/src/braintrust/integrations/livekit_agents/test_livekit_agents.py
index 75e06151..4576bff2 100644
--- a/py/src/braintrust/integrations/livekit_agents/test_livekit_agents.py
+++ b/py/src/braintrust/integrations/livekit_agents/test_livekit_agents.py
@@ -447,6 +447,10 @@ def _assert_eou_spans(logs):
     if eou_logs:
         _assert_any_span(eou_logs, lambda log: log.get("input", {}).get("text"))
         _assert_any_span(eou_logs, lambda log: "is_end_of_turn" in log.get("output", {}))
+        _assert_any_span(
+            eou_logs,
+            lambda log: log.get("metrics", {}).get("end", 0) - log.get("metrics", {}).get("start", 0) > 0,
+        )
 
 
 def _assert_stt_spans(logs, speech_text):
diff --git a/py/src/braintrust/integrations/livekit_agents/tracing.py b/py/src/braintrust/integrations/livekit_agents/tracing.py
index 4140544f..caab9a4f 100644
--- a/py/src/braintrust/integrations/livekit_agents/tracing.py
+++ b/py/src/braintrust/integrations/livekit_agents/tracing.py
@@ -77,10 +77,6 @@ def on_metrics_collected(event: Any) -> None:
                 parent=parent if isinstance(parent, str) else None,
             )
         elif metrics_type == "eou_metrics":
-            _log_vad_endpointing_span(
-                metrics_obj,
-                parent=parent if isinstance(parent, str) else None,
-            )
             _log_metric_span(
                 "eou_detection",
                 metrics_obj,
@@ -781,44 +777,29 @@ def _log_metric_span(
         event.update(_eou_detection_io(metrics_payload))
     if output is not None:
         event["output"] = output
-    start_time, end_time = _metric_span_times(metrics_payload)
+    if name == "eou_detection":
+        start_time, end_time = _eou_detection_span_times(metrics_payload)
+    else:
+        start_time, end_time = _metric_span_times(metrics_payload)
     span = start_span(name=name, type=span_type, start_time=start_time, set_current=False, parent=parent, input=input)
     span.log(**event)
     span.end(end_time=end_time)
 
 
-def _log_vad_endpointing_span(metrics_obj: Any, parent: str | None = None) -> None:
-    metrics_payload = _metrics_from_object(metrics_obj)
-    endpointing_delay = _first_numeric_metric(
+def _eou_detection_span_times(metrics_payload: dict[str, Any]) -> tuple[float | None, float | None]:
+    timestamp = metrics_payload.get("timestamp")
+    if not isinstance(timestamp, (int, float)) or isinstance(timestamp, bool):
+        return _metric_span_times(metrics_payload)
+    eou_delay = _first_numeric_metric(
         metrics_payload,
-        "endpointing_delay",
-        "end_of_turn_delay",
         "end_of_utterance_delay",
+        "end_of_turn_delay",
+        "endpointing_delay",
         "eou_delay",
     )
-    if endpointing_delay is None or endpointing_delay <= 0:
-        return
-    timestamp = metrics_payload.get("timestamp")
-    end_time = timestamp if isinstance(timestamp, (int, float)) and not isinstance(timestamp, bool) else None
-    start_time = end_time - endpointing_delay if end_time is not None else None
-    metadata = _promoted_metadata(metrics_payload)
-    metadata["livekit_metrics"] = {
-        key: value
-        for key, value in metrics_payload.items()
-        if key
-        not in {"metadata", "type", "endpointing_delay", "end_of_turn_delay", "end_of_utterance_delay", "eou_delay"}
-    }
-    if not metadata["livekit_metrics"]:
-        metadata.pop("livekit_metrics")
-    span = start_span(
-        name="vad_endpointing",
-        type=SpanTypeAttribute.TASK,
-        start_time=start_time,
-        set_current=False,
-        parent=parent,
-    )
-    span.log(metadata=metadata)
-    span.end(end_time=end_time)
+    if eou_delay is None or eou_delay <= 0:
+        return _metric_span_times(metrics_payload)
+    return timestamp - eou_delay, timestamp
 
 
 def _first_numeric_metric(metrics_payload: dict[str, Any], *keys: str) -> float | None: