From b4ddcba8d8589b4647336735b232204bc2a2e331 Mon Sep 17 00:00:00 2001
From: jsonbailey
Date: Tue, 28 Apr 2026 18:27:33 -0500
Subject: [PATCH 1/3] feat: Add evaluations support to ManagedAgent.run()

Wire judge evaluations into ManagedAgent.run() via an asyncio.Task,
mirroring ManagedModel.run(). Awaiting result.evaluations guarantees both
evaluation and tracker.track_judge_result() complete. run() returns
immediately; the evaluations task resolves asynchronously.

Co-Authored-By: Claude Sonnet 4.6
---
 .../sdk/server-ai/src/ldai/managed_agent.py   |  39 +++-
 .../sdk/server-ai/tests/test_managed_agent.py | 208 ++++++++++++++++--
 2 files changed, 227 insertions(+), 20 deletions(-)

diff --git a/packages/sdk/server-ai/src/ldai/managed_agent.py b/packages/sdk/server-ai/src/ldai/managed_agent.py
index a2abdf98..f661294b 100644
--- a/packages/sdk/server-ai/src/ldai/managed_agent.py
+++ b/packages/sdk/server-ai/src/ldai/managed_agent.py
@@ -1,19 +1,21 @@
 """ManagedAgent — LaunchDarkly managed wrapper for agent invocations."""
 
-from typing import Union
+import asyncio
+from typing import List, Union
 
 from ldai.models import AIAgentConfig
 from ldai.providers import AgentResult, AgentRunner
 from ldai.providers.runner import Runner
-from ldai.providers.types import ManagedResult, RunnerResult
+from ldai.providers.types import JudgeResult, ManagedResult, RunnerResult
+from ldai.tracker import LDAIConfigTracker
 
 
 class ManagedAgent:
     """
     LaunchDarkly managed wrapper for AI agent invocations.
 
-    Holds an AgentRunner or Runner. Handles tracking automatically via
-    ``create_tracker()``.
+    Holds an AgentRunner or Runner. Handles tracking and judge evaluation
+    dispatch automatically via ``create_tracker()``.
 
     Obtain an instance via ``LDAIClient.create_agent()``.
     """
@@ -29,8 +31,13 @@ async def run(self, input: str) -> ManagedResult:
         """
         Run the agent with the given input string.
 
+        Invokes the runner, tracks metrics, and dispatches judge evaluations
+        asynchronously. Returns immediately; awaiting ``result.evaluations``
+        guarantees both evaluation and tracking complete.
+
         :param input: The user prompt or input to the agent
-        :return: ManagedResult containing the agent's output and metric summary
+        :return: ManagedResult containing the agent's output, metric summary,
+            and an optional evaluations task
         """
         tracker = self._ai_config.create_tracker()
         result: Union[RunnerResult, AgentResult] = await tracker.track_metrics_of_async(
@@ -39,12 +46,34 @@ async def run(self, input: str) -> ManagedResult:
         )
         # Support both RunnerResult (content) and legacy AgentResult (output)
         content = result.content if isinstance(result, RunnerResult) else result.output  # type: ignore[union-attr]
+
+        evaluations_task = self._track_judge_results(tracker, input, content)
+
+
         return ManagedResult(
             content=content,
             metrics=tracker.get_summary(),
             raw=result.raw,
+            evaluations=evaluations_task,
         )
 
+    def _track_judge_results(
+        self,
+        tracker: LDAIConfigTracker,
+        input_text: str,
+        output_text: str,
+    ) -> asyncio.Task[List[JudgeResult]]:
+        evaluator_task = self._ai_config.evaluator.evaluate(input_text, output_text)
+
+        async def _run_and_track(eval_task: asyncio.Task) -> List[JudgeResult]:
+            results = await eval_task
+            for r in results:
+                if r.success:
+                    tracker.track_judge_result(r)
+            return results
+
+        return asyncio.create_task(_run_and_track(evaluator_task))
+
     def get_agent_runner(self) -> Union[Runner, AgentRunner]:
         """
         Return the underlying runner for advanced use.
diff --git a/packages/sdk/server-ai/tests/test_managed_agent.py b/packages/sdk/server-ai/tests/test_managed_agent.py
index 0c30637a..2e4f5a63 100644
--- a/packages/sdk/server-ai/tests/test_managed_agent.py
+++ b/packages/sdk/server-ai/tests/test_managed_agent.py
@@ -1,13 +1,16 @@
 """Tests for ManagedAgent."""
 
+import asyncio
 import pytest
+from typing import List
 from unittest.mock import AsyncMock, MagicMock
 
 from ldai import LDAIClient, ManagedAgent
+from ldai.evaluator import Evaluator
 from ldai.managed_agent import ManagedAgent
 from ldai.models import AIAgentConfig, AIAgentConfigDefault, ModelConfig, ProviderConfig
-from ldai.providers.types import LDAIMetrics, ManagedResult, RunnerResult
+from ldai.providers.types import JudgeResult, LDAIMetrics, ManagedResult, RunnerResult
-from ldai.tracker import LDAIMetricSummary
+from ldai.tracker import LDAIConfigTracker, LDAIMetricSummary
 from ldclient import Config, Context, LDClient
 from ldclient.integrations.test_data import TestData
 
@@ -19,6 +22,23 @@ def _make_summary(success: bool = True) -> LDAIMetricSummary:
     return summary
 
 
+def _make_noop_evaluator_config() -> MagicMock:
+    """Build a minimal mock AIAgentConfig with a noop evaluator and a mock tracker."""
+    mock_config = MagicMock(spec=AIAgentConfig)
+    mock_tracker = MagicMock(spec=LDAIConfigTracker)
+    mock_tracker.track_metrics_of_async = AsyncMock(
+        return_value=RunnerResult(
+            content="Test response",
+            raw=None,
+            metrics=LDAIMetrics(success=True, usage=None),
+        )
+    )
+    mock_tracker.get_summary = MagicMock(return_value=_make_summary(True))
+    mock_config.create_tracker = MagicMock(return_value=mock_tracker)
+    mock_config.evaluator = Evaluator.noop()
+    return mock_config
+
+
 @pytest.fixture
 def td() -> TestData:
     td = TestData.data_source()
@@ -60,17 +80,7 @@ class TestManagedAgentRun:
     @pytest.mark.asyncio
     async def test_run_delegates_to_agent_runner(self):
         """Should delegate run() to the underlying AgentRunner and return ManagedResult."""
-        mock_config = MagicMock(spec=AIAgentConfig)
-        mock_tracker = MagicMock()
-        mock_tracker.track_metrics_of_async = AsyncMock(
-            return_value=RunnerResult(
-                content="Test response",
-                metrics=LDAIMetrics(success=True, usage=None),
-                raw=None,
-            )
-        )
-        mock_tracker.get_summary = MagicMock(return_value=_make_summary(True))
-        mock_config.create_tracker = MagicMock(return_value=mock_tracker)
+        mock_config = _make_noop_evaluator_config()
         mock_runner = MagicMock()
         mock_runner.run = AsyncMock(
             return_value=RunnerResult(
@@ -87,13 +97,16 @@ async def test_run_delegates_to_agent_runner(self):
         assert result.content == "Test response"
         assert result.metrics.success is True
         mock_config.create_tracker.assert_called_once()
-        mock_tracker.track_metrics_of_async.assert_called_once()
+        mock_config.create_tracker.return_value.track_metrics_of_async.assert_called_once()
+        # evaluations should be present (from noop evaluator)
+        if result.evaluations is not None:
+            await result.evaluations
 
     @pytest.mark.asyncio
     async def test_run_uses_create_tracker_for_fresh_tracker(self):
         """Should use create_tracker() factory for a fresh tracker per invocation."""
         mock_config = MagicMock(spec=AIAgentConfig)
-        fresh_tracker = MagicMock()
+        fresh_tracker = MagicMock(spec=LDAIConfigTracker)
         fresh_tracker.track_metrics_of_async = AsyncMock(
             return_value=RunnerResult(
                 content="Fresh tracker response",
@@ -103,6 +116,7 @@ async def test_run_uses_create_tracker_for_fresh_tracker(self):
         )
         fresh_tracker.get_summary = MagicMock(return_value=_make_summary(True))
         mock_config.create_tracker = MagicMock(return_value=fresh_tracker)
+        mock_config.evaluator = Evaluator.noop()
 
         mock_runner = MagicMock()
 
@@ -113,6 +127,8 @@ async def test_run_uses_create_tracker_for_fresh_tracker(self):
         assert result.content == "Fresh tracker response"
         mock_config.create_tracker.assert_called_once()
         fresh_tracker.track_metrics_of_async.assert_called_once()
+        if result.evaluations is not None:
+            await result.evaluations
 
     def test_get_agent_runner_returns_runner(self):
         """Should return the underlying AgentRunner."""
@@ -129,6 +145,168 @@ def test_get_config_returns_config(self):
         assert agent.get_config() is mock_config
 
 
+class TestManagedAgentEvaluations:
+    """Tests for ManagedAgent evaluations chain (PR 12)."""
+
+    @pytest.mark.asyncio
+    async def test_run_returns_before_evaluations_resolve(self):
+        """run() should return before evaluations complete."""
+        barrier = asyncio.Event()
+
+        async def _slow_evaluate(input_text: str, output_text: str) -> List[JudgeResult]:
+            await barrier.wait()
+            return []
+
+        mock_evaluator = MagicMock(spec=Evaluator)
+        mock_evaluator.evaluate = MagicMock(
+            side_effect=lambda i, o: asyncio.create_task(_slow_evaluate(i, o))
+        )
+
+        mock_config = MagicMock(spec=AIAgentConfig)
+        mock_tracker = MagicMock(spec=LDAIConfigTracker)
+        mock_tracker.track_metrics_of_async = AsyncMock(
+            return_value=RunnerResult(content="resp", raw=None, metrics=LDAIMetrics(success=True))
+        )
+        mock_tracker.get_summary = MagicMock(return_value=_make_summary(True))
+        mock_config.create_tracker = MagicMock(return_value=mock_tracker)
+        mock_config.evaluator = mock_evaluator
+
+        mock_runner = MagicMock()
+        agent = ManagedAgent(mock_config, mock_runner)
+        result = await agent.run("Hello")
+
+        assert result is not None
+        assert result.evaluations is not None
+        assert not result.evaluations.done(), "evaluations task should still be pending"
+
+        barrier.set()
+        await result.evaluations
+
+    @pytest.mark.asyncio
+    async def test_await_evaluations_collects_results(self):
+        """await result.evaluations should return the list of JudgeResult instances."""
+        judge_result = JudgeResult(
+            judge_config_key='judge-key',
+            success=True,
+            sampled=True,
+            metric_key='$ld:ai:judge:relevance',
+            score=0.9,
+            reasoning='Good agent response',
+        )
+
+        async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult]:
+            return [judge_result]
+
+        mock_evaluator = MagicMock(spec=Evaluator)
+        mock_evaluator.evaluate = MagicMock(
+            side_effect=lambda i, o: asyncio.create_task(_evaluate_coro(i, o))
+        )
+
+        mock_config = MagicMock(spec=AIAgentConfig)
+        mock_tracker = MagicMock(spec=LDAIConfigTracker)
+        mock_tracker.track_metrics_of_async = AsyncMock(
+            return_value=RunnerResult(content="resp", raw=None, metrics=LDAIMetrics(success=True))
+        )
+        mock_tracker.get_summary = MagicMock(return_value=_make_summary(True))
+        mock_tracker.track_judge_result = MagicMock()
+        mock_config.create_tracker = MagicMock(return_value=mock_tracker)
+        mock_config.evaluator = mock_evaluator
+
+        mock_runner = MagicMock()
+        agent = ManagedAgent(mock_config, mock_runner)
+        result = await agent.run("Hello")
+
+        results = await result.evaluations  # type: ignore[misc]
+        assert results == [judge_result]
+
+    @pytest.mark.asyncio
+    async def test_tracking_fires_inside_awaited_chain(self):
+        """tracker.track_judge_result() must be called when evaluations are awaited."""
+        judge_result = JudgeResult(
+            judge_config_key='agent-judge',
+            success=True,
+            sampled=True,
+            metric_key='$ld:ai:judge:relevance',
+            score=0.85,
+        )
+
+        async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult]:
+            return [judge_result]
+
+        mock_evaluator = MagicMock(spec=Evaluator)
+        mock_evaluator.evaluate = MagicMock(
+            side_effect=lambda i, o: asyncio.create_task(_evaluate_coro(i, o))
+        )
+
+        mock_config = MagicMock(spec=AIAgentConfig)
+        mock_tracker = MagicMock(spec=LDAIConfigTracker)
+        mock_tracker.track_metrics_of_async = AsyncMock(
+            return_value=RunnerResult(content="resp", raw=None, metrics=LDAIMetrics(success=True))
+        )
+        mock_tracker.get_summary = MagicMock(return_value=_make_summary(True))
+        mock_tracker.track_judge_result = MagicMock()
+        mock_config.create_tracker = MagicMock(return_value=mock_tracker)
+        mock_config.evaluator = mock_evaluator
+
+        mock_runner = MagicMock()
+        agent = ManagedAgent(mock_config, mock_runner)
+        result = await agent.run("Hello")
+
+        # Tracking should NOT have fired yet (before we await evaluations)
+        mock_tracker.track_judge_result.assert_not_called()
+
+        # Now await the evaluations task — tracking fires inside the chain
+        await result.evaluations  # type: ignore[misc]
+
+        mock_tracker.track_judge_result.assert_called_once_with(judge_result)
+
+    @pytest.mark.asyncio
+    async def test_noop_evaluator_returns_empty_list(self):
+        """With a noop evaluator, awaiting evaluations should return an empty list."""
+        mock_config = _make_noop_evaluator_config()
+        mock_runner = MagicMock()
+        agent = ManagedAgent(mock_config, mock_runner)
+        result = await agent.run("Hello")
+
+        results = await result.evaluations  # type: ignore[misc]
+        assert results == []
+
+    @pytest.mark.asyncio
+    async def test_tracking_not_called_for_failed_judge_result(self):
+        """tracker.track_judge_result() should NOT be called for unsuccessful judge results."""
+        failed_result = JudgeResult(
+            success=False,
+            sampled=True,
+            metric_key='$ld:ai:judge:relevance',
+            error_message='Judge evaluation failed',
+        )
+
+        async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult]:
+            return [failed_result]
+
+        mock_evaluator = MagicMock(spec=Evaluator)
+        mock_evaluator.evaluate = MagicMock(
+            side_effect=lambda i, o: asyncio.create_task(_evaluate_coro(i, o))
+        )
+
+        mock_config = MagicMock(spec=AIAgentConfig)
+        mock_tracker = MagicMock(spec=LDAIConfigTracker)
+        mock_tracker.track_metrics_of_async = AsyncMock(
+            return_value=RunnerResult(content="resp", raw=None, metrics=LDAIMetrics(success=True))
+        )
+        mock_tracker.get_summary = MagicMock(return_value=_make_summary(True))
+        mock_tracker.track_judge_result = MagicMock()
+        mock_config.create_tracker = MagicMock(return_value=mock_tracker)
+        mock_config.evaluator = mock_evaluator
+
+        mock_runner = MagicMock()
+        agent = ManagedAgent(mock_config, mock_runner)
+        result = await agent.run("Hello")
+        await result.evaluations  # type: ignore[misc]
+
+        mock_tracker.track_judge_result.assert_not_called()
+
+
 class TestLDAIClientCreateAgent:
     """Tests for LDAIClient.create_agent."""
 

From 757eb917a1e4428c6afa07773d17ff13ad4e27d1 Mon Sep 17 00:00:00 2001
From: jsonbailey
Date: Wed, 29 Apr 2026 08:23:22 -0500
Subject: [PATCH 2/3] fix: Isolate tracking failures and log failed judge
 evaluations in agent

Mirror the managed_model.py fix in managed_agent.py: wrap
tracker.track_judge_result() in try/except so a tracking failure does not
destroy successfully computed evaluation results, and log a warning when a
judge evaluation fails (r.success is False) so failures are visible rather
than silently skipped.
---
 packages/sdk/server-ai/src/ldai/managed_agent.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/packages/sdk/server-ai/src/ldai/managed_agent.py b/packages/sdk/server-ai/src/ldai/managed_agent.py
index f661294b..68f0c5b5 100644
--- a/packages/sdk/server-ai/src/ldai/managed_agent.py
+++ b/packages/sdk/server-ai/src/ldai/managed_agent.py
@@ -3,6 +3,7 @@
 import asyncio
 from typing import List, Union
 
+from ldai import log
 from ldai.models import AIAgentConfig
 from ldai.providers import AgentResult, AgentRunner
 from ldai.providers.runner import Runner
@@ -69,7 +70,12 @@ async def _run_and_track(eval_task: asyncio.Task) -> List[JudgeResult]:
             results = await eval_task
             for r in results:
                 if r.success:
-                    tracker.track_judge_result(r)
+                    try:
+                        tracker.track_judge_result(r)
+                    except Exception:
+                        pass
+                else:
+                    log.warning("Judge evaluation failed: %s", r.error_message)
             return results
 
         return asyncio.create_task(_run_and_track(evaluator_task))

From ff2de9a7ab38cb393e9587eb4772a4001d6c4a95 Mon Sep 17 00:00:00 2001
From: jsonbailey
Date: Wed, 29 Apr 2026 08:59:53 -0500
Subject: [PATCH 3/3] fix: log warning when judge result tracking fails in
 ManagedAgent

Co-Authored-By: Claude Sonnet 4.6
---
 packages/sdk/server-ai/src/ldai/managed_agent.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/packages/sdk/server-ai/src/ldai/managed_agent.py b/packages/sdk/server-ai/src/ldai/managed_agent.py
index 68f0c5b5..e5b81ec1 100644
--- a/packages/sdk/server-ai/src/ldai/managed_agent.py
+++ b/packages/sdk/server-ai/src/ldai/managed_agent.py
@@ -72,8 +72,8 @@ async def _run_and_track(eval_task: asyncio.Task) -> List[JudgeResult]:
                 if r.success:
                     try:
                         tracker.track_judge_result(r)
-                    except Exception:
-                        pass
+                    except Exception as exc:
+                        log.warning("Failed to track judge result: %s", exc)
                 else:
                     log.warning("Judge evaluation failed: %s", r.error_message)
             return results
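
Usage sketch (illustrative, not part of the patch series above). Only ManagedAgent.run(), the ManagedResult fields (content, metrics, evaluations), and the JudgeResult fields (success, error_message) are taken from these diffs; the handle_prompt() wrapper is an assumption, and the agent construction via LDAIClient.create_agent() is omitted because its exact arguments are not shown in this series.

from ldai import ManagedAgent

async def handle_prompt(agent: ManagedAgent, user_input: str) -> str:
    # `agent` is assumed to come from LDAIClient.create_agent(); construction
    # is omitted here since its arguments are not shown in this series.
    # run() returns once the runner and metric tracking finish; judge
    # evaluations continue in a background asyncio.Task.
    result = await agent.run(user_input)

    # Awaiting the task guarantees both the evaluation and the
    # tracker.track_judge_result() calls have completed.
    if result.evaluations is not None:
        for judge_result in await result.evaluations:
            if not judge_result.success:
                print(f"judge evaluation failed: {judge_result.error_message}")

    return result.content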