ldai_langchain/langgraph_agent_graph_runner.py
@@ -1,14 +1,12 @@
"""LangGraph agent graph runner for LaunchDarkly AI SDK."""

import asyncio
import time
from contextvars import ContextVar
from typing import Annotated, Any, Dict, List, Set, Tuple

from ldai import log
from ldai.agent_graph import AgentGraphDefinition, AgentGraphNode
from ldai.providers import AgentGraphResult, AgentGraphRunner, ToolRegistry
from ldai.providers.types import LDAIMetrics
from ldai.providers import AgentGraphRunner, ToolRegistry
from ldai.providers.types import AgentGraphRunnerResult, GraphMetrics, LDAIMetrics

from ldai_langchain.langchain_helper import (
build_structured_tools,
@@ -18,9 +16,6 @@
)
from ldai_langchain.langgraph_callback_handler import LDMetricsCallbackHandler

# Per-run eval task accumulator, isolated per concurrent run() call via ContextVar.
_run_eval_tasks: ContextVar[Dict[str, List[asyncio.Task]]] = ContextVar('_run_eval_tasks')


def _make_handoff_tool(child_key: str, description: str) -> Any:
"""
@@ -65,9 +60,10 @@ class LangGraphAgentGraphRunner(AgentGraphRunner):

AgentGraphRunner implementation for LangGraph.

Compiles and runs the agent graph with LangGraph and automatically records
graph- and node-level AI metric data to the LaunchDarkly trackers on the
graph definition and each node.
Compiles and runs the agent graph with LangGraph and collects graph- and
node-level metrics via a LangChain callback handler. Tracking events are
emitted by the managed layer (:class:`~ldai.ManagedAgentGraph`) from the
returned :class:`~ldai.providers.types.AgentGraphRunnerResult`.

Requires ``langgraph`` to be installed.
"""
@@ -181,26 +177,6 @@ async def invoke(state: WorkflowState) -> dict:
if node_instructions:
msgs = [SystemMessage(content=node_instructions)] + msgs
response = await bound_model.ainvoke(msgs)

node_obj = self._graph.get_node(nk)
if node_obj is not None:
input_text = '\r\n'.join(
m.content if isinstance(m.content, str) else str(m.content)
for m in msgs
) if msgs else ''
output_text = (
response.content if hasattr(response, 'content') else str(response)
)
task = node_obj.get_config().evaluator.evaluate(input_text, output_text)
run_tasks = _run_eval_tasks.get(None)
if run_tasks is not None:
run_tasks.setdefault(nk, []).append(task)
else:
log.warning(
f"LangGraphAgentGraphRunner: eval task for node '{nk}' "
"has no run context; judge results will not be tracked"
)

return {'messages': [response]}

invoke.__name__ = nk
@@ -298,20 +274,18 @@ def route(state: WorkflowState) -> str:
compiled = agent_builder.compile()
return compiled, fn_name_to_config_key, node_keys

async def run(self, input: Any) -> AgentGraphResult:
async def run(self, input: Any) -> AgentGraphRunnerResult:
"""
Run the agent graph with the given input.

Builds a LangGraph StateGraph from the AgentGraphDefinition, compiles
it, and invokes it. Uses a LangChain callback handler to collect
per-node metrics, then flushes them to LaunchDarkly trackers.
per-node metrics. Graph-level tracking events are emitted by the
managed layer from the returned GraphMetrics.

:param input: The string prompt to send to the agent graph
:return: AgentGraphResult with the final output and metrics
:return: AgentGraphRunnerResult with the final content and GraphMetrics
"""
pending_eval_tasks: Dict[str, List[asyncio.Task]] = {}
token = _run_eval_tasks.set(pending_eval_tasks)
tracker = self._graph.create_tracker()
start_ns = time.perf_counter_ns()

try:
@@ -325,24 +299,34 @@ async def run(self, input: Any) -> AgentGraphResult:
config={'callbacks': [handler], 'recursion_limit': 25},
)

duration = (time.perf_counter_ns() - start_ns) // 1_000_000
duration_ms = (time.perf_counter_ns() - start_ns) // 1_000_000
messages = result.get('messages', [])
output = extract_last_message_content(messages)
total_usage = sum_token_usage_from_messages(messages)

# Per-node LDAIMetrics come from the callback handler; collect_node_metrics()
# is pure data extraction, so no tracker events fire here (the managed
# layer drives per-node events from GraphMetrics.node_metrics).
node_metrics: Dict[str, LDAIMetrics] = handler.collect_node_metrics()

# Flush per-node metrics to LD trackers; eval results are tracked
# internally and intentionally not exposed on AgentGraphResult here
# — judge dispatch is the managed layer's responsibility.
await handler.flush(self._graph, pending_eval_tasks)

tracker.track_path(handler.path)
tracker.track_duration(duration)
tracker.track_invocation_success()
tracker.track_total_tokens(sum_token_usage_from_messages(messages))

return AgentGraphResult(
output=output,
return AgentGraphRunnerResult(
content=output,
raw=result,
metrics=LDAIMetrics(success=True),
metrics=GraphMetrics(
success=True,
path=handler.path,
duration_ms=duration_ms,
usage=total_usage if (total_usage is not None and total_usage.total > 0) else None,
node_metrics=node_metrics,
),
)

except Exception as exc:
@@ -353,13 +337,12 @@ async def run(self, input: Any) -> AgentGraphResult:
)
else:
log.warning(f'LangGraphAgentGraphRunner run failed: {exc}')
duration = (time.perf_counter_ns() - start_ns) // 1_000_000
tracker.track_duration(duration)
tracker.track_invocation_failure()
return AgentGraphResult(
output='',
duration_ms = (time.perf_counter_ns() - start_ns) // 1_000_000
return AgentGraphRunnerResult(
content='',
raw=None,
metrics=LDAIMetrics(success=False),
metrics=GraphMetrics(
success=False,
duration_ms=duration_ms,
),
)
finally:
_run_eval_tasks.reset(token)
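The tracker calls deleted in this file do not disappear; the managed layer is expected to replay them from the returned GraphMetrics. A hedged sketch of that consumption, built only from tracker methods that appear in the deleted lines (the real ManagedAgentGraph internals may differ):

# Assumed managed-layer behavior, reconstructed from the removed calls:
result = await runner.run('find restaurants')
metrics = result.metrics
tracker = graph.create_tracker()
if metrics.path is not None:
    tracker.track_path(metrics.path)
if metrics.duration_ms is not None:
    tracker.track_duration(metrics.duration_ms)
if metrics.success:
    tracker.track_invocation_success()
else:
    tracker.track_invocation_failure()
if metrics.usage is not None:
    tracker.track_total_tokens(metrics.usage)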
ldai_langchain/langgraph_callback_handler.py
@@ -5,7 +5,7 @@
from langchain_core.callbacks import BaseCallbackHandler
from langchain_core.outputs import ChatGeneration, LLMResult
from ldai.agent_graph import AgentGraphDefinition
from ldai.providers.types import JudgeResult
from ldai.providers.types import JudgeResult, LDAIMetrics
from ldai.tracker import TokenUsage

from ldai_langchain.langchain_helper import get_ai_usage_from_response
@@ -193,14 +193,19 @@ async def flush(
self, graph: AgentGraphDefinition, eval_tasks=None
) -> List[JudgeResult]:
"""
Emit all collected per-node metrics to the LaunchDarkly trackers.
Emit collected per-node metrics to LaunchDarkly trackers.

Call this once after the graph run completes.
.. deprecated::
Per-node tracking is now driven by the managed layer
(:class:`ManagedAgentGraph`) from
:attr:`AgentGraphRunnerResult.metrics.node_metrics`. This method
is retained for tests and any external callers that still rely on
the original handler-driven tracking path; production code should
not call it.

:param graph: The AgentGraphDefinition whose nodes hold the LD config trackers.
:param eval_tasks: Optional dict mapping node key to a list of awaitables that
return judge evaluation results. Multiple tasks arise when a node is visited
more than once (e.g. in a graph with cycles).
return judge evaluation results.
:return: All judge results collected across all nodes.
"""
node_trackers: Dict[str, Any] = {}
@@ -240,3 +245,27 @@ async def flush(
config_tracker.track_judge_result(r)

return all_eval_results

def collect_node_metrics(self) -> Dict[str, LDAIMetrics]:
"""
Build a per-node ``LDAIMetrics`` map from data collected during the run.

Pure data extraction — no LaunchDarkly tracker events are emitted.
:class:`LangGraphAgentGraphRunner` uses this to populate
``GraphMetrics.node_metrics`` so the managed layer can drive per-node
events.

:return: Mapping of node key to its accumulated ``LDAIMetrics``.
"""
node_metrics: Dict[str, LDAIMetrics] = {}
for node_key in self._path:
if node_key in node_metrics:
continue
tool_calls = self._node_tool_calls.get(node_key, [])
node_metrics[node_key] = LDAIMetrics(
success=True,
usage=self._node_tokens.get(node_key),
tool_calls=list(tool_calls) if tool_calls else None,
duration_ms=self._node_duration_ms.get(node_key),
)
return node_metrics
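For reference, the shapes of the new result types as this diff uses them; a hedged reconstruction from the fields read and written here, not the canonical definitions in ldai.providers.types:

# Hedged reconstruction; the real ldai.providers.types classes may carry
# more fields or different defaults.
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

@dataclass
class LDAIMetrics:
    """Per-node metrics collected by the callback handler."""
    success: bool
    usage: Optional[Any] = None             # ldai.tracker.TokenUsage
    duration_ms: Optional[int] = None
    tool_calls: Optional[List[Any]] = None

@dataclass
class GraphMetrics:
    """Whole-run metrics handed back to the managed layer."""
    success: bool
    path: Optional[List[str]] = None
    duration_ms: Optional[int] = None
    usage: Optional[Any] = None             # ldai.tracker.TokenUsage
    node_metrics: Optional[Dict[str, LDAIMetrics]] = None

@dataclass
class AgentGraphRunnerResult:
    """Returned by LangGraphAgentGraphRunner.run()."""
    content: str
    raw: Any
    metrics: GraphMetrics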
(test file: LangGraphAgentGraphRunner unit tests)
@@ -6,7 +6,8 @@
from ldai.agent_graph import AgentGraphDefinition
from ldai.evaluator import Evaluator
from ldai.models import AIAgentGraphConfig, AIAgentConfig, ModelConfig, ProviderConfig
from ldai.providers import AgentGraphResult, ToolRegistry
from ldai.providers import ToolRegistry
from ldai.providers.types import AgentGraphRunnerResult
from ldai_langchain.langgraph_agent_graph_runner import LangGraphAgentGraphRunner
from ldai_langchain.langchain_runner_factory import LangChainRunnerFactory

@@ -75,22 +76,22 @@ async def test_langgraph_runner_run_raises_when_langgraph_not_installed():

with patch.dict('sys.modules', {'langgraph': None, 'langgraph.graph': None}):
result = await runner.run("test")
assert isinstance(result, AgentGraphResult)
assert isinstance(result, AgentGraphRunnerResult)
assert result.metrics.success is False


@pytest.mark.asyncio
async def test_langgraph_runner_run_tracks_failure_on_exception():
async def test_langgraph_runner_run_returns_failure_on_exception():
"""Runner now returns AgentGraphRunnerResult; managed layer drives tracker events."""
graph = _make_graph()
tracker = graph.create_tracker()
runner = LangGraphAgentGraphRunner(graph, {})

with patch.dict('sys.modules', {'langgraph': None, 'langgraph.graph': None}):
result = await runner.run("fail")

assert isinstance(result, AgentGraphRunnerResult)
assert result.metrics.success is False
tracker.track_invocation_failure.assert_called_once()
tracker.track_duration.assert_called_once()
assert result.metrics.duration_ms is not None


@pytest.mark.asyncio
@@ -147,9 +148,10 @@ async def test_langgraph_runner_run_success():
runner = LangGraphAgentGraphRunner(graph, {})
result = await runner.run("find restaurants")

assert isinstance(result, AgentGraphResult)
assert result.output == "langgraph answer"
assert result.metrics.success is True
tracker.track_path.assert_called_once_with([])
tracker.track_invocation_success.assert_called_once()
tracker.track_duration.assert_called_once()
assert isinstance(result, AgentGraphRunnerResult)
assert result.metrics.duration_ms is not None
# Tracker events now fire from the managed layer (ManagedAgentGraph) using
# result.metrics; the runner no longer touches the graph tracker directly.
tracker.track_path.assert_not_called()
tracker.track_invocation_success.assert_not_called()
tracker.track_duration.assert_not_called()
(test file: LangGraph runner integration tests)
@@ -11,11 +11,18 @@
from unittest.mock import AsyncMock, MagicMock, patch

from ldai.agent_graph import AgentGraphDefinition
from ldai.managed_agent_graph import ManagedAgentGraph
from ldai.models import AIAgentGraphConfig, AIAgentConfig, Edge, ModelConfig, ProviderConfig
from ldai.tracker import AIGraphTracker, LDAIConfigTracker
from ldai.evaluator import Evaluator
from ldai_langchain.langgraph_agent_graph_runner import LangGraphAgentGraphRunner


async def _run_through_managed(runner: LangGraphAgentGraphRunner, graph: AgentGraphDefinition, input: str):
"""Run the runner through the managed layer so graph-level tracking events fire."""
managed = ManagedAgentGraph(runner, graph=graph)
return await managed.run(input)

pytestmark = pytest.mark.skipif(
pytest.importorskip('langgraph', reason='langgraph not installed') is None,
reason='langgraph not installed',
@@ -229,7 +236,7 @@ async def test_tracks_node_and_graph_tokens_on_success():
result = await runner.run("What's the weather?")

assert result.metrics.success is True
assert result.output == 'Sunny.'
assert result.content == 'Sunny.'

# Manually simulate what the callback handler would collect and flush
# (mock models don't fire LangChain callbacks, so we test flush directly)
@@ -259,12 +266,9 @@ async def test_tracks_node_and_graph_tokens_on_success():
assert ev2['$ld:ai:generation:success'][0][1] == 1
assert '$ld:ai:duration:total' in ev2

# Graph-level events from the real run
ev = _events(mock_ld_client)
assert ev['$ld:ai:graph:total_tokens'][0][1] == 15
assert ev['$ld:ai:graph:invocation_success'][0][1] == 1
assert '$ld:ai:graph:duration:total' in ev
assert '$ld:ai:graph:path' in ev
# Graph-level events are now driven by ManagedAgentGraph from
# AgentGraphRunnerResult.metrics — see test_managed_agent_graph.py for the
# managed-layer flow. The runner itself no longer fires graph-level events.


@pytest.mark.asyncio
@@ -277,11 +281,11 @@ async def test_tracks_execution_path():
with patch('ldai_langchain.langgraph_agent_graph_runner.create_langchain_model',
return_value=_mock_model(fake_response)):
runner = LangGraphAgentGraphRunner(graph, {})
await runner.run('hello')
result = await runner.run('hello')

ev = _events(mock_ld_client)
path_data = ev['$ld:ai:graph:path'][0][0]
assert 'my-agent' in path_data['path']
# Path now lives on AgentGraphRunnerResult.metrics.path; the runner no
# longer emits the $ld:ai:graph:path event directly (the managed layer does).
assert 'my-agent' in result.metrics.path


@pytest.mark.asyncio
@@ -432,11 +436,9 @@ async def test_tracks_failure_and_latency_on_model_error():
result = await runner.run('fail')

assert result.metrics.success is False

ev = _events(mock_ld_client)
assert '$ld:ai:graph:invocation_failure' in ev
assert '$ld:ai:graph:duration:total' in ev
assert '$ld:ai:graph:invocation_success' not in ev
assert result.metrics.duration_ms is not None
# Graph-level events (invocation_failure, duration) are now driven by
# ManagedAgentGraph from result.metrics, not by the runner directly.


@pytest.mark.asyncio
@@ -461,7 +463,7 @@ def model_factory(node_config, **kwargs):
with patch('ldai_langchain.langgraph_agent_graph_runner.create_langchain_model',
side_effect=model_factory):
runner = LangGraphAgentGraphRunner(graph, {})
result = await runner.run('hello')
result = await _run_through_managed(runner, graph, 'hello')

assert result.metrics.success is True

@@ -624,7 +626,7 @@ def model_factory(node_config, **kwargs):
result = await runner.run('hello')

assert result.metrics.success is True
assert 'Agent A' in result.output
assert 'Agent A' in result.content
# Agent B's model must never have been invoked — no fan-out
agent_b_model.ainvoke.assert_not_called()

@@ -752,7 +754,7 @@ def model_factory(node_config, **kwargs):
result = await runner.run('Find info and route to the right agent.')

assert result.metrics.success is True
assert 'Agent A' in result.output
assert 'Agent A' in result.content
# Orchestrator must have been called twice: once before tool result, once after
assert orchestrator_model.ainvoke.call_count == 2
# Agent B must never have been invoked