From 5c4181c8ee586b504348c879564af896682cd3b0 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Tue, 28 Apr 2026 18:03:34 -0500 Subject: [PATCH 01/13] feat!: Add ManagedResult, RunnerResult, and Runner protocol; rename invoke() to run() Co-Authored-By: Claude Sonnet 4.6 --- packages/sdk/server-ai/src/ldai/__init__.py | 9 +- .../sdk/server-ai/src/ldai/managed_agent.py | 30 ++- .../sdk/server-ai/src/ldai/managed_model.py | 95 +++++++-- .../server-ai/src/ldai/providers/__init__.py | 6 + .../server-ai/src/ldai/providers/runner.py | 37 ++++ .../sdk/server-ai/src/ldai/providers/types.py | 64 +++++- packages/sdk/server-ai/src/ldai/tracker.py | 83 ++++++-- .../sdk/server-ai/tests/test_managed_agent.py | 19 +- .../sdk/server-ai/tests/test_managed_model.py | 184 ++++++++++-------- 9 files changed, 406 insertions(+), 121 deletions(-) create mode 100644 packages/sdk/server-ai/src/ldai/providers/runner.py diff --git a/packages/sdk/server-ai/src/ldai/__init__.py b/packages/sdk/server-ai/src/ldai/__init__.py index 405ec5a8..f02cee30 100644 --- a/packages/sdk/server-ai/src/ldai/__init__.py +++ b/packages/sdk/server-ai/src/ldai/__init__.py @@ -36,10 +36,13 @@ AgentGraphRunner, AgentResult, AgentRunner, + ManagedResult, + Runner, + RunnerResult, ToolRegistry, ) from ldai.providers.types import JudgeResult -from ldai.tracker import AIGraphTracker +from ldai.tracker import AIGraphTracker, LDAIMetricSummary __all__ = [ 'LDAIClient', @@ -48,6 +51,10 @@ 'AgentGraphRunner', 'AgentResult', 'AgentGraphResult', + 'ManagedResult', + 'Runner', + 'RunnerResult', + 'LDAIMetricSummary', 'ToolRegistry', 'AIAgentConfig', 'AIAgentConfigDefault', diff --git a/packages/sdk/server-ai/src/ldai/managed_agent.py b/packages/sdk/server-ai/src/ldai/managed_agent.py index ab3ee5e6..a2abdf98 100644 --- a/packages/sdk/server-ai/src/ldai/managed_agent.py +++ b/packages/sdk/server-ai/src/ldai/managed_agent.py @@ -1,43 +1,55 @@ """ManagedAgent — LaunchDarkly managed wrapper for agent invocations.""" +from typing import Union + from ldai.models import AIAgentConfig from ldai.providers import AgentResult, AgentRunner +from ldai.providers.runner import Runner +from ldai.providers.types import ManagedResult, RunnerResult class ManagedAgent: """ LaunchDarkly managed wrapper for AI agent invocations. - Holds an AgentRunner. Handles tracking automatically via ``create_tracker()``. + Holds an AgentRunner or Runner. Handles tracking automatically via + ``create_tracker()``. Obtain an instance via ``LDAIClient.create_agent()``. """ def __init__( self, ai_config: AIAgentConfig, - agent_runner: AgentRunner, + agent_runner: Union[Runner, AgentRunner], ): self._ai_config = ai_config self._agent_runner = agent_runner - async def run(self, input: str) -> AgentResult: + async def run(self, input: str) -> ManagedResult: """ Run the agent with the given input string. 
:param input: The user prompt or input to the agent - :return: AgentResult containing the agent's output and metrics + :return: ManagedResult containing the agent's output and metric summary """ tracker = self._ai_config.create_tracker() - return await tracker.track_metrics_of_async( - lambda result: result.metrics, + result: Union[RunnerResult, AgentResult] = await tracker.track_metrics_of_async( + lambda r: r.metrics, lambda: self._agent_runner.run(input), ) + # Support both RunnerResult (content) and legacy AgentResult (output) + content = result.content if isinstance(result, RunnerResult) else result.output # type: ignore[union-attr] + return ManagedResult( + content=content, + metrics=tracker.get_summary(), + raw=result.raw, + ) - def get_agent_runner(self) -> AgentRunner: + def get_agent_runner(self) -> Union[Runner, AgentRunner]: """ - Return the underlying AgentRunner for advanced use. + Return the underlying runner for advanced use. - :return: The AgentRunner instance. + :return: The Runner or AgentRunner instance. """ return self._agent_runner diff --git a/packages/sdk/server-ai/src/ldai/managed_model.py b/packages/sdk/server-ai/src/ldai/managed_model.py index 9cfb503a..3d2949c3 100644 --- a/packages/sdk/server-ai/src/ldai/managed_model.py +++ b/packages/sdk/server-ai/src/ldai/managed_model.py @@ -1,10 +1,12 @@ import asyncio -from typing import List, Optional +import warnings +from typing import List, Union from ldai import log from ldai.models import AICompletionConfig, LDMessage from ldai.providers.model_runner import ModelRunner -from ldai.providers.types import JudgeResult, ModelResponse +from ldai.providers.runner import Runner +from ldai.providers.types import JudgeResult, ManagedResult, ModelResponse, RunnerResult from ldai.tracker import LDAIConfigTracker @@ -12,31 +14,100 @@ class ManagedModel: """ LaunchDarkly managed wrapper for AI model invocations. - Holds a ModelRunner. Handles conversation management, judge evaluation - dispatch, and tracking automatically via ``create_tracker()``. + Holds a Runner (or legacy ModelRunner). Handles conversation management, + judge evaluation dispatch, and tracking automatically via ``create_tracker()``. Obtain an instance via ``LDAIClient.create_model()``. """ def __init__( self, ai_config: AICompletionConfig, - model_runner: ModelRunner, + model_runner: Union[Runner, ModelRunner], ): self._ai_config = ai_config self._model_runner = model_runner self._messages: List[LDMessage] = [] - async def invoke(self, prompt: str) -> ModelResponse: + async def run(self, prompt: str) -> ManagedResult: """ - Invoke the model with a prompt string. + Run the model with a prompt string. Appends the prompt to the conversation history, prepends any system messages from the config, delegates to the runner, and appends the response to the history. 
+ :param prompt: The user prompt to send to the model + :return: ManagedResult containing the model's response, metric summary, + and an optional evaluations task + """ + tracker = self._ai_config.create_tracker() + + user_message = LDMessage(role='user', content=prompt) + self._messages.append(user_message) + + config_messages = self._ai_config.messages or [] + all_messages = config_messages + self._messages + + result: Union[RunnerResult, ModelResponse] = await tracker.track_metrics_of_async( + lambda r: r.metrics, + lambda: self._invoke_runner(all_messages), + ) + + # Support both new RunnerResult and legacy ModelResponse + if isinstance(result, RunnerResult): + content = result.content + raw = result.raw + parsed = result.parsed + assistant_message = LDMessage(role='assistant', content=content) + else: + content = result.message.content + raw = getattr(result, 'raw', None) + parsed = getattr(result, 'parsed', None) + assistant_message = result.message + + input_text = '\r\n'.join(m.content for m in self._messages) if self._messages else '' + + evaluations_task = self._track_judge_results(tracker, input_text, content) + + self._messages.append(assistant_message) + + return ManagedResult( + content=content, + metrics=tracker.get_summary(), + raw=raw, + parsed=parsed, + evaluations=evaluations_task, + ) + + async def _invoke_runner( + self, all_messages: List[LDMessage] + ) -> Union[RunnerResult, ModelResponse]: + """ + Delegate to the runner. Supports both the new ``Runner`` protocol + (``run(messages) → RunnerResult``) and the legacy ``ModelRunner`` + (``invoke_model(messages) → ModelResponse``). + """ + if isinstance(self._model_runner, Runner): + return await self._model_runner.run(all_messages) + # Legacy ModelRunner path + return await self._model_runner.invoke_model(all_messages) # type: ignore[union-attr] + + async def invoke(self, prompt: str) -> ModelResponse: + """ + Invoke the model with a prompt string. + + .. deprecated:: + Use :meth:`run` instead. This method will be removed in a future + release once the migration to :class:`ManagedResult` is complete. + :param prompt: The user prompt to send to the model :return: ModelResponse containing the model's response and metrics """ + warnings.warn( + "ManagedModel.invoke() is deprecated. Use run() instead.", + DeprecationWarning, + stacklevel=2, + ) tracker = self._ai_config.create_tracker() user_message = LDMessage(role='user', content=prompt) @@ -45,9 +116,9 @@ async def invoke(self, prompt: str) -> ModelResponse: config_messages = self._ai_config.messages or [] all_messages = config_messages + self._messages - response = await tracker.track_metrics_of_async( + response: ModelResponse = await tracker.track_metrics_of_async( lambda result: result.metrics, - lambda: self._model_runner.invoke_model(all_messages), + lambda: self._model_runner.invoke_model(all_messages), # type: ignore[union-attr] ) input_text = '\r\n'.join(m.content for m in self._messages) if self._messages else '' @@ -98,11 +169,11 @@ def append_messages(self, messages: List[LDMessage]) -> None: """ self._messages.extend(messages) - def get_model_runner(self) -> ModelRunner: + def get_model_runner(self) -> Union[Runner, ModelRunner]: """ - Return the underlying ModelRunner for advanced use. + Return the underlying runner for advanced use. - :return: The ModelRunner instance. + :return: The Runner or legacy ModelRunner instance. 
""" return self._model_runner diff --git a/packages/sdk/server-ai/src/ldai/providers/__init__.py b/packages/sdk/server-ai/src/ldai/providers/__init__.py index b2bfa72e..6f472c69 100644 --- a/packages/sdk/server-ai/src/ldai/providers/__init__.py +++ b/packages/sdk/server-ai/src/ldai/providers/__init__.py @@ -2,13 +2,16 @@ from ldai.providers.agent_runner import AgentRunner from ldai.providers.ai_provider import AIProvider from ldai.providers.model_runner import ModelRunner +from ldai.providers.runner import Runner from ldai.providers.runner_factory import RunnerFactory from ldai.providers.types import ( AgentGraphResult, AgentResult, JudgeResult, LDAIMetrics, + ManagedResult, ModelResponse, + RunnerResult, StructuredResponse, ToolRegistry, ) @@ -21,9 +24,12 @@ 'AgentRunner', 'JudgeResult', 'LDAIMetrics', + 'ManagedResult', 'ModelResponse', 'ModelRunner', + 'Runner', 'RunnerFactory', + 'RunnerResult', 'StructuredResponse', 'ToolRegistry', ] diff --git a/packages/sdk/server-ai/src/ldai/providers/runner.py b/packages/sdk/server-ai/src/ldai/providers/runner.py new file mode 100644 index 00000000..c86a8fe8 --- /dev/null +++ b/packages/sdk/server-ai/src/ldai/providers/runner.py @@ -0,0 +1,37 @@ +"""Unified Runner protocol for AI providers.""" + +from typing import Any, Dict, Optional, Protocol, runtime_checkable + +from ldai.providers.types import RunnerResult + + +@runtime_checkable +class Runner(Protocol): + """ + Unified runtime capability interface for all AI provider runners. + + A :class:`Runner` is a focused, configured object that performs a single + AI invocation. Both model runners and agent runners implement this protocol. + + :param input: The input to the runner (string prompt, list of messages, or + other provider-specific input type). + :param output_type: Optional JSON schema dict that requests structured output. + When provided, the runner populates :attr:`~RunnerResult.parsed` on the + returned :class:`RunnerResult`. + :return: :class:`RunnerResult` containing ``content``, ``metrics``, and + optionally ``raw`` and ``parsed``. + """ + + async def run( + self, + input: Any, + output_type: Optional[Dict[str, Any]] = None, + ) -> RunnerResult: + """ + Execute the runner with the given input. + + :param input: The input to the runner. + :param output_type: Optional JSON schema for structured output. + :return: RunnerResult containing content, metrics, raw, and parsed fields. + """ + ... diff --git a/packages/sdk/server-ai/src/ldai/providers/types.py b/packages/sdk/server-ai/src/ldai/providers/types.py index aa537880..72f7198c 100644 --- a/packages/sdk/server-ai/src/ldai/providers/types.py +++ b/packages/sdk/server-ai/src/ldai/providers/types.py @@ -7,7 +7,7 @@ from typing import Any, Callable, Dict, List, Optional from ldai.models import LDMessage -from ldai.tracker import TokenUsage +from ldai.tracker import LDAIMetricSummary, TokenUsage # Type alias for a registry of tools available to an agent. # Keys are tool names; values are the callable implementations. @@ -17,10 +17,21 @@ @dataclass class LDAIMetrics: """ - Metrics information for AI operations that includes success status and token usage. + Metrics information for AI operations that includes success status, token + usage, and optional enrichment fields populated by runners. + + ``tool_calls`` is a list of tool-call names observed during the invocation + (populated by agent runners that execute tool loops). 
+ + ``duration_ms`` is the wall-clock duration of the runner invocation in + milliseconds, when measured by the runner itself rather than externally. + When set, the tracker uses this value directly instead of measuring elapsed + time. """ success: bool usage: Optional[TokenUsage] = None + tool_calls: Optional[List[str]] = None + duration_ms: Optional[int] = None def to_dict(self) -> Dict[str, Any]: """ @@ -35,13 +46,55 @@ def to_dict(self) -> Dict[str, Any]: 'input': self.usage.input, 'output': self.usage.output, } + if self.tool_calls is not None: + result['toolCalls'] = self.tool_calls + if self.duration_ms is not None: + result['durationMs'] = self.duration_ms return result +@dataclass +class RunnerResult: + """ + Result returned by a :class:`~ldai.providers.runner.Runner` from a single + invocation. + + This is the unified return type for all Runner implementations. + ``evaluations`` is intentionally absent — judge evaluations are dispatched + by the managed layer and live on :class:`ManagedResult`. + """ + content: str + metrics: LDAIMetrics + raw: Optional[Any] = None + parsed: Optional[Dict[str, Any]] = None + + +@dataclass +class ManagedResult: + """ + Result returned by the managed layer (:class:`~ldai.ManagedModel` / + :class:`~ldai.ManagedAgent`) after a single invocation. + + ``metrics`` is an :class:`~ldai.tracker.LDAIMetricSummary` (from + ``tracker.get_summary()``) rather than a raw :class:`LDAIMetrics`. + ``evaluations`` is an optional asyncio Task that resolves to a list of + :class:`JudgeResult` instances when awaited. + """ + content: str + metrics: LDAIMetricSummary + raw: Optional[Any] = None + parsed: Optional[Dict[str, Any]] = None + evaluations: Optional[asyncio.Task[List[JudgeResult]]] = None + + @dataclass class ModelResponse: """ Response from a model invocation. + + .. deprecated:: + Use :class:`RunnerResult` (from a runner) and :class:`ManagedResult` + (from the managed layer) instead. """ message: LDMessage metrics: LDAIMetrics @@ -52,6 +105,9 @@ class ModelResponse: class StructuredResponse: """ Structured response from AI models. + + .. deprecated:: + Structured output is now represented by :attr:`RunnerResult.parsed`. """ data: Dict[str, Any] raw_response: str @@ -96,6 +152,10 @@ def to_dict(self) -> Dict[str, Any]: class AgentResult: """ Result from a single-agent run. + + .. deprecated:: + Use :class:`ManagedResult` (managed layer) or :class:`RunnerResult` + (runner layer) instead. """ output: str raw: Any diff --git a/packages/sdk/server-ai/src/ldai/tracker.py b/packages/sdk/server-ai/src/ldai/tracker.py index 0f5a32c5..608297d3 100644 --- a/packages/sdk/server-ai/src/ldai/tracker.py +++ b/packages/sdk/server-ai/src/ldai/tracker.py @@ -41,15 +41,31 @@ class LDAIMetricSummary: """ def __init__(self): - self._duration = None - self._success = None - self._feedback = None - self._usage = None - self._time_to_first_token = None + self._duration_ms: Optional[int] = None + self._success: Optional[bool] = None + self._feedback: Optional[Dict[str, FeedbackKind]] = None + self._usage: Optional[TokenUsage] = None + self._time_to_first_token: Optional[int] = None + self._tool_calls: Optional[List[str]] = None + self._resumption_token: Optional[str] = None + + @property + def duration_ms(self) -> Optional[int]: + """Duration of the AI operation in milliseconds.""" + return self._duration_ms @property def duration(self) -> Optional[int]: - return self._duration + """ + .. deprecated:: + Use :attr:`duration_ms` instead. 
+ """ + warnings.warn( + "LDAIMetricSummary.duration is deprecated. Use duration_ms instead.", + DeprecationWarning, + stacklevel=2, + ) + return self._duration_ms @property def success(self) -> Optional[bool]: @@ -67,6 +83,20 @@ def usage(self) -> Optional[TokenUsage]: def time_to_first_token(self) -> Optional[int]: return self._time_to_first_token + @property + def tool_calls(self) -> Optional[List[str]]: + """List of tool keys that were invoked during this operation.""" + return self._tool_calls + + @property + def resumption_token(self) -> Optional[str]: + """ + URL-safe Base64-encoded resumption token captured at tracker + instantiation. Useful for deferred feedback flows where a downstream + process needs to associate events with the original execution. + """ + return self._resumption_token + class LDAIConfigTracker: """ @@ -107,8 +137,10 @@ def __init__( self._provider_name = provider_name self._context = context self._graph_key = graph_key - self._summary = LDAIMetricSummary() self._run_id = run_id + self._summary = LDAIMetricSummary() + # Capture resumption_token immediately so it's available on the summary at instantiation. + self._summary._resumption_token = self.resumption_token @property def resumption_token(self) -> str: @@ -200,10 +232,10 @@ def track_duration(self, duration: int) -> None: :param duration: Duration in milliseconds. """ - if self._summary.duration is not None: + if self._summary.duration_ms is not None: log.warning("Duration has already been tracked for this execution. %s", self.__get_track_data()) return - self._summary._duration = duration + self._summary._duration_ms = duration self._ld_client.track( "$ld:ai:duration:total", self._context, self.__get_track_data(), duration ) @@ -259,6 +291,8 @@ def _track_from_metrics_extractor( self.track_error() if metrics.usage: self.track_tokens(metrics.usage) + if getattr(metrics, 'tool_calls', None): + self.track_tool_calls(metrics.tool_calls) return result def track_metrics_of( @@ -278,6 +312,10 @@ def track_metrics_of( For async operations, use :meth:`track_metrics_of_async`. + When the extracted :class:`~ldai.providers.types.LDAIMetrics` object has a + non-``None`` ``duration_ms`` field, that value is used as the measured duration + instead of the wall-clock elapsed time. + :param metrics_extractor: Function that extracts LDAIMetrics from the operation result :param func: Synchronous callable that runs the operation :return: The result of the operation @@ -291,8 +329,10 @@ def track_metrics_of( self.track_error() raise err - duration = (time.perf_counter_ns() - start_ns) // 1_000_000 - self.track_duration(duration) + elapsed_ms = (time.perf_counter_ns() - start_ns) // 1_000_000 + metrics = metrics_extractor(result) + reported_ms = getattr(metrics, 'duration_ms', None) if metrics else None + self.track_duration(reported_ms if reported_ms is not None else elapsed_ms) return self._track_from_metrics_extractor(result, metrics_extractor) async def track_metrics_of_async(self, metrics_extractor, func): @@ -301,6 +341,10 @@ async def track_metrics_of_async(self, metrics_extractor, func): Same event semantics as :meth:`track_metrics_of`. + When the extracted :class:`~ldai.providers.types.LDAIMetrics` object has a + non-``None`` ``duration_ms`` field, that value is used as the measured duration + instead of the wall-clock elapsed time. 
+ :param metrics_extractor: Function that extracts LDAIMetrics from the operation result :param func: Async callable or zero-arg callable that returns an awaitable when called :return: The result of the operation @@ -315,8 +359,10 @@ async def track_metrics_of_async(self, metrics_extractor, func): self.track_error() raise err - duration = (time.perf_counter_ns() - start_ns) // 1_000_000 - self.track_duration(duration) + elapsed_ms = (time.perf_counter_ns() - start_ns) // 1_000_000 + metrics = metrics_extractor(result) + reported_ms = getattr(metrics, 'duration_ms', None) if metrics else None + self.track_duration(reported_ms if reported_ms is not None else elapsed_ms) return self._track_from_metrics_extractor(result, metrics_extractor) def track_judge_result(self, judge_result: Any) -> None: @@ -364,6 +410,17 @@ def track_feedback(self, feedback: Dict[str, FeedbackKind]) -> None: 1, ) + def track_tool_calls(self, tool_calls: List[str]) -> None: + """ + Track the tool calls made during an AI operation. + + :param tool_calls: List of tool call names. + """ + if self._summary.tool_calls is not None: + log.warning("Tool calls have already been tracked for this execution. %s", self.__get_track_data()) + return + self._summary._tool_calls = list(tool_calls) + def track_success(self) -> None: """ Track a successful AI generation. diff --git a/packages/sdk/server-ai/tests/test_managed_agent.py b/packages/sdk/server-ai/tests/test_managed_agent.py index 144641fc..c4b94ea5 100644 --- a/packages/sdk/server-ai/tests/test_managed_agent.py +++ b/packages/sdk/server-ai/tests/test_managed_agent.py @@ -7,12 +7,19 @@ from ldai.managed_agent import ManagedAgent from ldai.models import AIAgentConfig, AIAgentConfigDefault, ModelConfig, ProviderConfig from ldai.providers import AgentResult -from ldai.providers.types import LDAIMetrics +from ldai.providers.types import LDAIMetrics, ManagedResult +from ldai.tracker import LDAIMetricSummary from ldclient import Config, Context, LDClient from ldclient.integrations.test_data import TestData +def _make_summary(success: bool = True) -> LDAIMetricSummary: + summary = LDAIMetricSummary() + summary._success = success + return summary + + @pytest.fixture def td() -> TestData: td = TestData.data_source() @@ -53,7 +60,7 @@ class TestManagedAgentRun: @pytest.mark.asyncio async def test_run_delegates_to_agent_runner(self): - """Should delegate run() to the underlying AgentRunner.""" + """Should delegate run() to the underlying AgentRunner and return ManagedResult.""" mock_config = MagicMock(spec=AIAgentConfig) mock_tracker = MagicMock() mock_tracker.track_metrics_of_async = AsyncMock( @@ -63,6 +70,7 @@ async def test_run_delegates_to_agent_runner(self): metrics=LDAIMetrics(success=True, usage=None), ) ) + mock_tracker.get_summary = MagicMock(return_value=_make_summary(True)) mock_config.create_tracker = MagicMock(return_value=mock_tracker) mock_runner = MagicMock() mock_runner.run = AsyncMock( @@ -76,7 +84,8 @@ async def test_run_delegates_to_agent_runner(self): agent = ManagedAgent(mock_config, mock_runner) result = await agent.run("Hello") - assert result.output == "Test response" + assert isinstance(result, ManagedResult) + assert result.content == "Test response" assert result.metrics.success is True mock_config.create_tracker.assert_called_once() mock_tracker.track_metrics_of_async.assert_called_once() @@ -93,6 +102,7 @@ async def test_run_uses_create_tracker_for_fresh_tracker(self): metrics=LDAIMetrics(success=True, usage=None), ) ) + fresh_tracker.get_summary = 
MagicMock(return_value=_make_summary(True)) mock_config.create_tracker = MagicMock(return_value=fresh_tracker) mock_runner = MagicMock() @@ -100,7 +110,8 @@ async def test_run_uses_create_tracker_for_fresh_tracker(self): agent = ManagedAgent(mock_config, mock_runner) result = await agent.run("Hello") - assert result.output == "Fresh tracker response" + assert isinstance(result, ManagedResult) + assert result.content == "Fresh tracker response" mock_config.create_tracker.assert_called_once() fresh_tracker.track_metrics_of_async.assert_called_once() diff --git a/packages/sdk/server-ai/tests/test_managed_model.py b/packages/sdk/server-ai/tests/test_managed_model.py index 36802a14..f81076c5 100644 --- a/packages/sdk/server-ai/tests/test_managed_model.py +++ b/packages/sdk/server-ai/tests/test_managed_model.py @@ -2,15 +2,15 @@ import asyncio from typing import List -from unittest.mock import AsyncMock, MagicMock, patch +from unittest.mock import AsyncMock, MagicMock import pytest from ldai.evaluator import Evaluator from ldai.managed_model import ManagedModel from ldai.models import AICompletionConfig, LDMessage, ModelConfig, ProviderConfig -from ldai.providers.types import JudgeResult, LDAIMetrics, ModelResponse -from ldai.tracker import LDAIConfigTracker +from ldai.providers.types import JudgeResult, LDAIMetrics, ManagedResult, ModelResponse +from ldai.tracker import LDAIConfigTracker, LDAIMetricSummary @@ -21,29 +21,46 @@ def _make_model_response(content: str = 'response text') -> ModelResponse: ) -class TestManagedModelInvokeReturnsImmediately: - """invoke() must return before the evaluations task resolves.""" +def _make_summary() -> LDAIMetricSummary: + summary = LDAIMetricSummary() + summary._success = True + return summary + + +def _make_config_with_tracker(evaluator: Evaluator) -> tuple[AICompletionConfig, MagicMock]: + """Build an AICompletionConfig with a fully-mocked tracker.""" + mock_tracker = MagicMock(spec=LDAIConfigTracker) + mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) + mock_tracker.get_summary = MagicMock(return_value=_make_summary()) + config = AICompletionConfig( + key='test-config', + enabled=True, + create_tracker=MagicMock(return_value=mock_tracker), + model=ModelConfig('gpt-4'), + provider=ProviderConfig('openai'), + messages=[], + evaluator=evaluator, + ) + return config, mock_tracker - @pytest.mark.asyncio - async def test_invoke_returns_before_evaluations_resolve(self): - """invoke() should return a ModelResponse before evaluations complete.""" - # Set up a barrier so the evaluation coroutine doesn't complete until we release it - barrier = asyncio.Event() - async def _slow_evaluate(input_text: str, output_text: str) -> List[JudgeResult]: - await barrier.wait() - return [] +class TestManagedModelRunReturnsImmediately: + """run() must return before the evaluations task resolves.""" + @pytest.mark.asyncio + async def test_run_returns_managed_result(self): + """run() should return a ManagedResult with content from the runner.""" evaluator = MagicMock(spec=Evaluator) evaluator.evaluate = MagicMock( - side_effect=lambda i, o: asyncio.create_task(_slow_evaluate(i, o)) + side_effect=lambda i, o: asyncio.create_task(_empty_eval()) ) mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) + mock_runner.invoke_model = AsyncMock(return_value=_make_model_response('hi')) mock_tracker = MagicMock(spec=LDAIConfigTracker) - mock_tracker.track_metrics_of_async = 
AsyncMock(return_value=_make_model_response()) + mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response('hi')) + mock_tracker.get_summary = MagicMock(return_value=_make_summary()) config = AICompletionConfig( key='test-config', enabled=True, @@ -55,20 +72,46 @@ async def _slow_evaluate(input_text: str, output_text: str) -> List[JudgeResult] ) model = ManagedModel(config, mock_runner) - response = await model.invoke('Hello') + result = await model.run('Hello') - # invoke() returned — evaluations task should still be pending - assert response is not None - assert response.evaluations is not None - assert not response.evaluations.done(), "evaluations task should still be pending" + assert isinstance(result, ManagedResult) + assert result.content == 'hi' + assert isinstance(result.metrics, LDAIMetricSummary) + # Cleanup the still-pending evaluations task. + if result.evaluations is not None: + await result.evaluations + + @pytest.mark.asyncio + async def test_run_returns_before_evaluations_resolve(self): + """run() should return a ManagedResult before evaluations complete.""" + barrier = asyncio.Event() + + async def _slow_evaluate(input_text: str, output_text: str) -> List[JudgeResult]: + await barrier.wait() + return [] + + evaluator = MagicMock(spec=Evaluator) + evaluator.evaluate = MagicMock( + side_effect=lambda i, o: asyncio.create_task(_slow_evaluate(i, o)) + ) + + mock_runner = MagicMock() + mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) + + config, _tracker = _make_config_with_tracker(evaluator) + model = ManagedModel(config, mock_runner) + result = await model.run('Hello') + + assert result is not None + assert result.evaluations is not None + assert not result.evaluations.done(), "evaluations task should still be pending" - # Release the barrier and let it finish cleanly barrier.set() - await response.evaluations + await result.evaluations @pytest.mark.asyncio async def test_await_evaluations_collects_results(self): - """await response.evaluations should return the list of JudgeResult instances.""" + """await result.evaluations should return the list of JudgeResult instances.""" judge_result = JudgeResult( judge_config_key='judge-key', success=True, @@ -89,22 +132,11 @@ async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult] mock_runner = MagicMock() mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) - mock_tracker = MagicMock(spec=LDAIConfigTracker) - mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) - config = AICompletionConfig( - key='test-config', - enabled=True, - create_tracker=MagicMock(return_value=mock_tracker), - model=ModelConfig('gpt-4'), - provider=ProviderConfig('openai'), - messages=[], - evaluator=evaluator, - ) - + config, _tracker = _make_config_with_tracker(evaluator) model = ManagedModel(config, mock_runner) - response = await model.invoke('Hello') + result = await model.run('Hello') - results = await response.evaluations # type: ignore[misc] + results = await result.evaluations # type: ignore[misc] assert results == [judge_result] @pytest.mark.asyncio @@ -130,28 +162,17 @@ async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult] mock_runner = MagicMock() mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) - mock_tracker = MagicMock(spec=LDAIConfigTracker) - mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) + config, mock_tracker = 
_make_config_with_tracker(evaluator) mock_tracker.track_judge_result = MagicMock() - config = AICompletionConfig( - key='test-config', - enabled=True, - create_tracker=MagicMock(return_value=mock_tracker), - model=ModelConfig('gpt-4'), - provider=ProviderConfig('openai'), - messages=[], - evaluator=evaluator, - ) - model = ManagedModel(config, mock_runner) - response = await model.invoke('Hello') + result = await model.run('Hello') # Tracking should NOT have fired yet (before we await evaluations) mock_tracker.track_judge_result.assert_not_called() # Now await the evaluations task — tracking fires inside the chain - await response.evaluations # type: ignore[misc] + await result.evaluations # type: ignore[misc] mock_tracker.track_judge_result.assert_called_once_with(judge_result) @@ -176,23 +197,12 @@ async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult] mock_runner = MagicMock() mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) - mock_tracker = MagicMock(spec=LDAIConfigTracker) - mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) + config, mock_tracker = _make_config_with_tracker(evaluator) mock_tracker.track_judge_result = MagicMock() - config = AICompletionConfig( - key='test-config', - enabled=True, - create_tracker=MagicMock(return_value=mock_tracker), - model=ModelConfig('gpt-4'), - provider=ProviderConfig('openai'), - messages=[], - evaluator=evaluator, - ) - model = ManagedModel(config, mock_runner) - response = await model.invoke('Hello') - await response.evaluations # type: ignore[misc] + result = await model.run('Hello') + await result.evaluations # type: ignore[misc] mock_tracker.track_judge_result.assert_not_called() @@ -204,21 +214,35 @@ async def test_noop_evaluator_returns_empty_list(self): mock_runner = MagicMock() mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) - mock_tracker = MagicMock(spec=LDAIConfigTracker) - mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) + config, _tracker = _make_config_with_tracker(evaluator) + model = ManagedModel(config, mock_runner) + result = await model.run('Hello') + results = await result.evaluations # type: ignore[misc] - config = AICompletionConfig( - key='test-config', - enabled=True, - create_tracker=MagicMock(return_value=mock_tracker), - model=ModelConfig('gpt-4'), - provider=ProviderConfig('openai'), - messages=[], - evaluator=evaluator, - ) + assert results == [] + + +class TestManagedModelInvokeDeprecated: + """The deprecated invoke() method continues to work and emits a DeprecationWarning.""" + @pytest.mark.asyncio + async def test_invoke_emits_deprecation_warning(self): + """invoke() should emit a DeprecationWarning.""" + evaluator = Evaluator.noop() + mock_runner = MagicMock() + mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) + + config, _tracker = _make_config_with_tracker(evaluator) model = ManagedModel(config, mock_runner) - response = await model.invoke('Hello') - results = await response.evaluations # type: ignore[misc] - assert results == [] + with pytest.warns(DeprecationWarning, match=r"ManagedModel\.invoke\(\) is deprecated"): + response = await model.invoke('Hello') + + assert response is not None + # invoke() still wires the evaluations chain on the response. 
+ if response.evaluations is not None: + await response.evaluations + + +async def _empty_eval() -> List[JudgeResult]: + return [] From 4e28ae691bcdf45c51e6714b53262189cce0066c Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Wed, 29 Apr 2026 11:30:52 -0500 Subject: [PATCH 02/13] refactor: address review feedback on docstrings --- .../server-ai/src/ldai/providers/runner.py | 10 +-- .../sdk/server-ai/src/ldai/providers/types.py | 84 +++++++++++-------- .../sdk/server-ai/tests/test_managed_agent.py | 23 +++-- .../sdk/server-ai/tests/test_managed_model.py | 29 ++++--- .../sdk/server-ai/tests/test_runner_abcs.py | 24 +++--- 5 files changed, 94 insertions(+), 76 deletions(-) diff --git a/packages/sdk/server-ai/src/ldai/providers/runner.py b/packages/sdk/server-ai/src/ldai/providers/runner.py index c86a8fe8..5e1b9abc 100644 --- a/packages/sdk/server-ai/src/ldai/providers/runner.py +++ b/packages/sdk/server-ai/src/ldai/providers/runner.py @@ -11,15 +11,7 @@ class Runner(Protocol): Unified runtime capability interface for all AI provider runners. A :class:`Runner` is a focused, configured object that performs a single - AI invocation. Both model runners and agent runners implement this protocol. - - :param input: The input to the runner (string prompt, list of messages, or - other provider-specific input type). - :param output_type: Optional JSON schema dict that requests structured output. - When provided, the runner populates :attr:`~RunnerResult.parsed` on the - returned :class:`RunnerResult`. - :return: :class:`RunnerResult` containing ``content``, ``metrics``, and - optionally ``raw`` and ``parsed``. + AI invocation. """ async def run( diff --git a/packages/sdk/server-ai/src/ldai/providers/types.py b/packages/sdk/server-ai/src/ldai/providers/types.py index 72f7198c..f5224e0e 100644 --- a/packages/sdk/server-ai/src/ldai/providers/types.py +++ b/packages/sdk/server-ai/src/ldai/providers/types.py @@ -16,22 +16,19 @@ @dataclass class LDAIMetrics: - """ - Metrics information for AI operations that includes success status, token - usage, and optional enrichment fields populated by runners. - - ``tool_calls`` is a list of tool-call names observed during the invocation - (populated by agent runners that execute tool loops). + """Contains metrics for a single AI invocation.""" - ``duration_ms`` is the wall-clock duration of the runner invocation in - milliseconds, when measured by the runner itself rather than externally. - When set, the tracker uses this value directly instead of measuring elapsed - time. - """ success: bool + """Whether the invocation succeeded.""" + usage: Optional[TokenUsage] = None + """Optional token usage information.""" + tool_calls: Optional[List[str]] = None + """Ordered list of tool-call names observed during the invocation.""" + duration_ms: Optional[int] = None + """Wall-clock duration of the runner invocation in milliseconds.""" def to_dict(self) -> Dict[str, Any]: """ @@ -55,36 +52,39 @@ def to_dict(self) -> Dict[str, Any]: @dataclass class RunnerResult: - """ - Result returned by a :class:`~ldai.providers.runner.Runner` from a single - invocation. + """Contains the result of a single AI model invocation.""" - This is the unified return type for all Runner implementations. - ``evaluations`` is intentionally absent — judge evaluations are dispatched - by the managed layer and live on :class:`ManagedResult`. 
- """ content: str + """The text content returned by the model.""" + metrics: LDAIMetrics + """Metrics for this invocation.""" + raw: Optional[Any] = None + """Optional provider-native response object for advanced consumers.""" + parsed: Optional[Dict[str, Any]] = None + """Optional parsed structured output, populated when ``output_type`` was supplied.""" @dataclass class ManagedResult: - """ - Result returned by the managed layer (:class:`~ldai.ManagedModel` / - :class:`~ldai.ManagedAgent`) after a single invocation. + """Contains the result of a managed AI invocation, including metrics and optional judge evaluations.""" - ``metrics`` is an :class:`~ldai.tracker.LDAIMetricSummary` (from - ``tracker.get_summary()``) rather than a raw :class:`LDAIMetrics`. - ``evaluations`` is an optional asyncio Task that resolves to a list of - :class:`JudgeResult` instances when awaited. - """ content: str + """The text content returned by the model.""" + metrics: LDAIMetricSummary + """Aggregated metric summary from the tracker for this invocation.""" + raw: Optional[Any] = None + """Optional provider-native response object for advanced consumers.""" + parsed: Optional[Dict[str, Any]] = None + """Optional parsed structured output, populated when ``output_type`` was supplied.""" + evaluations: Optional[asyncio.Task[List[JudgeResult]]] = None + """Optional asyncio Task that resolves to the list of :class:`JudgeResult` instances when awaited.""" @dataclass @@ -116,16 +116,28 @@ class StructuredResponse: @dataclass class JudgeResult: - """ - Result from a judge evaluation. - """ + """Contains the result of a single judge evaluation.""" + judge_config_key: Optional[str] = None + """The configuration key of the judge that produced this result.""" + success: bool = False + """Whether the judge evaluation completed successfully.""" + error_message: Optional[str] = None - sampled: bool = False # True when the evaluation was sampled and run + """Error message describing why the evaluation failed, if any.""" + + sampled: bool = False + """True when the evaluation was sampled and run.""" + metric_key: Optional[str] = None + """The metric key under which this judge's score is reported.""" + score: Optional[float] = None + """The numeric score (0-1) returned by the judge.""" + reasoning: Optional[str] = None + """The judge's reasoning text accompanying the score.""" def to_dict(self) -> Dict[str, Any]: """ @@ -164,10 +176,16 @@ class AgentResult: @dataclass class AgentGraphResult: - """ - Result from an agent graph run. 
- """ + """Contains the result of an agent graph run.""" + output: str + """The agent graph's final output content.""" + raw: Any + """The provider-native response object from the graph run.""" + metrics: LDAIMetrics + """Metrics recorded during the graph run.""" + evaluations: Optional[List[JudgeResult]] = None + """Optional list of judge evaluation results produced for the graph run.""" diff --git a/packages/sdk/server-ai/tests/test_managed_agent.py b/packages/sdk/server-ai/tests/test_managed_agent.py index c4b94ea5..0c30637a 100644 --- a/packages/sdk/server-ai/tests/test_managed_agent.py +++ b/packages/sdk/server-ai/tests/test_managed_agent.py @@ -6,8 +6,7 @@ from ldai import LDAIClient, ManagedAgent from ldai.managed_agent import ManagedAgent from ldai.models import AIAgentConfig, AIAgentConfigDefault, ModelConfig, ProviderConfig -from ldai.providers import AgentResult -from ldai.providers.types import LDAIMetrics, ManagedResult +from ldai.providers.types import LDAIMetrics, ManagedResult, RunnerResult from ldai.tracker import LDAIMetricSummary from ldclient import Config, Context, LDClient @@ -64,20 +63,20 @@ async def test_run_delegates_to_agent_runner(self): mock_config = MagicMock(spec=AIAgentConfig) mock_tracker = MagicMock() mock_tracker.track_metrics_of_async = AsyncMock( - return_value=AgentResult( - output="Test response", - raw=None, + return_value=RunnerResult( + content="Test response", metrics=LDAIMetrics(success=True, usage=None), + raw=None, ) ) mock_tracker.get_summary = MagicMock(return_value=_make_summary(True)) mock_config.create_tracker = MagicMock(return_value=mock_tracker) mock_runner = MagicMock() mock_runner.run = AsyncMock( - return_value=AgentResult( - output="Test response", - raw=None, + return_value=RunnerResult( + content="Test response", metrics=LDAIMetrics(success=True, usage=None), + raw=None, ) ) @@ -96,10 +95,10 @@ async def test_run_uses_create_tracker_for_fresh_tracker(self): mock_config = MagicMock(spec=AIAgentConfig) fresh_tracker = MagicMock() fresh_tracker.track_metrics_of_async = AsyncMock( - return_value=AgentResult( - output="Fresh tracker response", - raw=None, + return_value=RunnerResult( + content="Fresh tracker response", metrics=LDAIMetrics(success=True, usage=None), + raw=None, ) ) fresh_tracker.get_summary = MagicMock(return_value=_make_summary(True)) @@ -163,7 +162,7 @@ async def test_returns_managed_agent_when_runner_available(self, ldai_client: LD mock_runner = MagicMock() mock_runner.run = AsyncMock( - return_value=AgentResult(output="Hello!", raw=None, metrics=LDAIMetrics(success=True, usage=None)) + return_value=RunnerResult(content="Hello!", metrics=LDAIMetrics(success=True, usage=None), raw=None) ) original = rf.RunnerFactory.create_agent diff --git a/packages/sdk/server-ai/tests/test_managed_model.py b/packages/sdk/server-ai/tests/test_managed_model.py index f81076c5..cc190abf 100644 --- a/packages/sdk/server-ai/tests/test_managed_model.py +++ b/packages/sdk/server-ai/tests/test_managed_model.py @@ -9,11 +9,18 @@ from ldai.evaluator import Evaluator from ldai.managed_model import ManagedModel from ldai.models import AICompletionConfig, LDMessage, ModelConfig, ProviderConfig -from ldai.providers.types import JudgeResult, LDAIMetrics, ManagedResult, ModelResponse +from ldai.providers.types import JudgeResult, LDAIMetrics, ManagedResult, ModelResponse, RunnerResult from ldai.tracker import LDAIConfigTracker, LDAIMetricSummary +def _make_runner_result(content: str = 'response text') -> RunnerResult: + return RunnerResult( + 
content=content, + metrics=LDAIMetrics(success=True, usage=None), + ) + + def _make_model_response(content: str = 'response text') -> ModelResponse: return ModelResponse( message=LDMessage(role='assistant', content=content), @@ -30,7 +37,7 @@ def _make_summary() -> LDAIMetricSummary: def _make_config_with_tracker(evaluator: Evaluator) -> tuple[AICompletionConfig, MagicMock]: """Build an AICompletionConfig with a fully-mocked tracker.""" mock_tracker = MagicMock(spec=LDAIConfigTracker) - mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) + mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_runner_result()) mock_tracker.get_summary = MagicMock(return_value=_make_summary()) config = AICompletionConfig( key='test-config', @@ -56,10 +63,10 @@ async def test_run_returns_managed_result(self): ) mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response('hi')) + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result('hi')) mock_tracker = MagicMock(spec=LDAIConfigTracker) - mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response('hi')) + mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_runner_result('hi')) mock_tracker.get_summary = MagicMock(return_value=_make_summary()) config = AICompletionConfig( key='test-config', @@ -96,7 +103,7 @@ async def _slow_evaluate(input_text: str, output_text: str) -> List[JudgeResult] ) mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result()) config, _tracker = _make_config_with_tracker(evaluator) model = ManagedModel(config, mock_runner) @@ -130,7 +137,7 @@ async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult] ) mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result()) config, _tracker = _make_config_with_tracker(evaluator) model = ManagedModel(config, mock_runner) @@ -160,7 +167,7 @@ async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult] ) mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result()) config, mock_tracker = _make_config_with_tracker(evaluator) mock_tracker.track_judge_result = MagicMock() @@ -195,7 +202,7 @@ async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult] ) mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result()) config, mock_tracker = _make_config_with_tracker(evaluator) mock_tracker.track_judge_result = MagicMock() @@ -212,7 +219,7 @@ async def test_noop_evaluator_returns_empty_list(self): evaluator = Evaluator.noop() mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result()) config, _tracker = _make_config_with_tracker(evaluator) model = ManagedModel(config, mock_runner) @@ -232,7 +239,9 @@ async def test_invoke_emits_deprecation_warning(self): mock_runner = MagicMock() mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) - config, _tracker = _make_config_with_tracker(evaluator) + config, mock_tracker = 
_make_config_with_tracker(evaluator) + # invoke() expects a ModelResponse from the tracker, not a RunnerResult. + mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) model = ManagedModel(config, mock_runner) with pytest.warns(DeprecationWarning, match=r"ManagedModel\.invoke\(\) is deprecated"): diff --git a/packages/sdk/server-ai/tests/test_runner_abcs.py b/packages/sdk/server-ai/tests/test_runner_abcs.py index d5136fd0..7e8087cd 100644 --- a/packages/sdk/server-ai/tests/test_runner_abcs.py +++ b/packages/sdk/server-ai/tests/test_runner_abcs.py @@ -1,17 +1,17 @@ import pytest -from ldai.providers import AgentGraphResult, AgentGraphRunner, AgentResult, AgentRunner, ToolRegistry -from ldai.providers.types import LDAIMetrics +from ldai.providers import AgentGraphResult, AgentGraphRunner, AgentRunner, ToolRegistry +from ldai.providers.types import LDAIMetrics, RunnerResult # --- Concrete test doubles --- class ConcreteAgentRunner: async def run(self, input): - return AgentResult( - output=f"agent response to: {input}", - raw={"raw": input}, + return RunnerResult( + content=f"agent response to: {input}", metrics=LDAIMetrics(success=True), + raw={"raw": input}, ) @@ -39,20 +39,20 @@ def test_agent_runner_structural_check_fails_when_run_missing(): @pytest.mark.asyncio -async def test_agent_runner_run_returns_agent_result(): +async def test_agent_runner_run_returns_runner_result(): runner = ConcreteAgentRunner() result = await runner.run("hello") - assert isinstance(result, AgentResult) - assert result.output == "agent response to: hello" + assert isinstance(result, RunnerResult) + assert result.content == "agent response to: hello" assert result.raw == {"raw": "hello"} assert result.metrics.success is True @pytest.mark.asyncio -async def test_agent_result_fields(): +async def test_runner_result_fields(): metrics = LDAIMetrics(success=True) - result = AgentResult(output="done", raw={"key": "val"}, metrics=metrics) - assert result.output == "done" + result = RunnerResult(content="done", metrics=metrics, raw={"key": "val"}) + assert result.content == "done" assert result.raw == {"key": "val"} assert result.metrics is metrics @@ -103,6 +103,6 @@ def test_top_level_exports(): import ldai assert hasattr(ldai, 'AgentRunner') assert hasattr(ldai, 'AgentGraphRunner') - assert hasattr(ldai, 'AgentResult') assert hasattr(ldai, 'AgentGraphResult') + assert hasattr(ldai, 'RunnerResult') assert hasattr(ldai, 'ToolRegistry') From 2dd9329e5ccb9f5a5cb1cf1d00af0b6a2cef3138 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Wed, 29 Apr 2026 11:49:16 -0500 Subject: [PATCH 03/13] fix: merge duplicate track_tool_calls methods in LDAIConfigTracker The new track_tool_calls method at line 413 (with summary storage and dedup guard) was being shadowed by the older method at line 559 (which only fired per-tool events). Merge them into a single method that both stores to the summary and fires per-tool events. 
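
A rough usage sketch of the merged behavior (the tracker comes from the
config's create_tracker(); the tool names here are made up for illustration):

    tracker = ai_config.create_tracker()
    tracker.track_tool_calls(['search_docs', 'create_ticket'])
    # Stored on the summary, and one $ld:ai:tool_call event fired per tool.
    assert tracker.get_summary().tool_calls == ['search_docs', 'create_ticket']
    # A repeated call is a guarded no-op that only logs a warning.
    tracker.track_tool_calls(['another_tool'])
    assert tracker.get_summary().tool_calls == ['search_docs', 'create_ticket']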
--- packages/sdk/server-ai/src/ldai/tracker.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/packages/sdk/server-ai/src/ldai/tracker.py b/packages/sdk/server-ai/src/ldai/tracker.py index 608297d3..21995dc5 100644 --- a/packages/sdk/server-ai/src/ldai/tracker.py +++ b/packages/sdk/server-ai/src/ldai/tracker.py @@ -410,16 +410,22 @@ def track_feedback(self, feedback: Dict[str, FeedbackKind]) -> None: 1, ) - def track_tool_calls(self, tool_calls: List[str]) -> None: + def track_tool_calls(self, tool_calls: Iterable[str]) -> None: """ Track the tool calls made during an AI operation. - :param tool_calls: List of tool call names. + Stores the tool call names on the summary (guarding against duplicate + tracking) and fires a ``$ld:ai:tool_call`` event for each tool. + + :param tool_calls: Tool identifiers (e.g. from a model response). """ if self._summary.tool_calls is not None: log.warning("Tool calls have already been tracked for this execution. %s", self.__get_track_data()) return - self._summary._tool_calls = list(tool_calls) + tool_calls_list = list(tool_calls) + self._summary._tool_calls = tool_calls_list + for tool_key in tool_calls_list: + self.track_tool_call(tool_key) def track_success(self) -> None: """ @@ -556,15 +562,6 @@ def track_tool_call(self, tool_key: str) -> None: 1, ) - def track_tool_calls(self, tool_keys: Iterable[str]) -> None: - """ - Track multiple tool invocations for this configuration. - - :param tool_keys: Tool identifiers (e.g. from a model response). - """ - for tool_key in tool_keys: - self.track_tool_call(tool_key) - def get_summary(self) -> LDAIMetricSummary: """ Get the current summary of AI metrics. From b4d15df4be4b5cb69bce1c8af6006ece3009c1ff Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Wed, 29 Apr 2026 12:12:49 -0500 Subject: [PATCH 04/13] fix: avoid double metrics extraction in track_metrics_of helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, metrics_extractor(result) was called twice — once in the public track_metrics_of/track_metrics_of_async to read duration_ms, and again inside _track_from_metrics_extractor to track success, tokens, and tool calls. Extract metrics once in the public method and pass the resulting metrics + elapsed_ms into the private helper, which now also handles the duration tracking. 
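
The caller-visible contract is unchanged. A minimal sketch of the duration
override path (tracker obtained from create_tracker(); values illustrative):

    from ldai.providers.types import LDAIMetrics, RunnerResult

    result = tracker.track_metrics_of(
        lambda r: r.metrics,
        lambda: RunnerResult(
            content='hi',
            # A runner-reported duration_ms wins over the wall-clock measurement.
            metrics=LDAIMetrics(success=True, usage=None, duration_ms=42),
        ),
    )
    assert result.content == 'hi'
    assert tracker.get_summary().duration_ms == 42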
--- packages/sdk/server-ai/src/ldai/tracker.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/packages/sdk/server-ai/src/ldai/tracker.py b/packages/sdk/server-ai/src/ldai/tracker.py index 21995dc5..414ac34b 100644 --- a/packages/sdk/server-ai/src/ldai/tracker.py +++ b/packages/sdk/server-ai/src/ldai/tracker.py @@ -279,12 +279,9 @@ def track_duration_of(self, func): return result - def _track_from_metrics_extractor( - self, - result: Any, - metrics_extractor: Callable[[Any], Any], - ) -> Any: - metrics = metrics_extractor(result) + def _track_from_metrics_extractor(self, metrics: Any, elapsed_ms: int) -> None: + reported_ms = getattr(metrics, 'duration_ms', None) if metrics else None + self.track_duration(reported_ms if reported_ms is not None else elapsed_ms) if metrics.success: self.track_success() else: @@ -293,7 +290,6 @@ def _track_from_metrics_extractor( self.track_tokens(metrics.usage) if getattr(metrics, 'tool_calls', None): self.track_tool_calls(metrics.tool_calls) - return result def track_metrics_of( self, @@ -331,9 +327,8 @@ def track_metrics_of( elapsed_ms = (time.perf_counter_ns() - start_ns) // 1_000_000 metrics = metrics_extractor(result) - reported_ms = getattr(metrics, 'duration_ms', None) if metrics else None - self.track_duration(reported_ms if reported_ms is not None else elapsed_ms) - return self._track_from_metrics_extractor(result, metrics_extractor) + self._track_from_metrics_extractor(metrics, elapsed_ms) + return result async def track_metrics_of_async(self, metrics_extractor, func): """ @@ -361,9 +356,8 @@ async def track_metrics_of_async(self, metrics_extractor, func): elapsed_ms = (time.perf_counter_ns() - start_ns) // 1_000_000 metrics = metrics_extractor(result) - reported_ms = getattr(metrics, 'duration_ms', None) if metrics else None - self.track_duration(reported_ms if reported_ms is not None else elapsed_ms) - return self._track_from_metrics_extractor(result, metrics_extractor) + self._track_from_metrics_extractor(metrics, elapsed_ms) + return result def track_judge_result(self, judge_result: Any) -> None: """ From 4fe7eb5bd82656868b42ea3b21cd1759f5557e32 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Wed, 29 Apr 2026 12:37:04 -0500 Subject: [PATCH 05/13] refactor: drop ModelRunner/AgentRunner compat from managed layer ManagedModel and ManagedAgent now require a Runner. The compat shims (_invoke_runner, isinstance(result, RunnerResult) branches, Union type annotations) are removed; result handling is direct on RunnerResult fields. The deprecated ManagedModel.invoke() is preserved for backwards compat but now delegates to run() and adapts the ManagedResult into the legacy ModelResponse shape. ModelRunner and AgentRunner protocol definitions remain in place so downstream provider packages that import them continue to work. 
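
Downstream migration, roughly (EchoRunner is a stand-in for a provider
package's runner and ai_config is an existing AICompletionConfig; this is a
sketch, not shipped code):

    from ldai.managed_model import ManagedModel
    from ldai.providers.types import LDAIMetrics, RunnerResult

    class EchoRunner:
        """Minimal object satisfying the Runner protocol."""

        async def run(self, input, output_type=None):
            return RunnerResult(
                content='echo',
                metrics=LDAIMetrics(success=True, usage=None),
            )

    model = ManagedModel(ai_config, EchoRunner())
    result = await model.run('hello')      # new path: ManagedResult
    print(result.content, result.metrics.success)
    legacy = await model.invoke('hello')   # deprecated shim: ModelResponse + DeprecationWarning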
--- packages/sdk/server-ai/src/ldai/client.py | 4 +- .../sdk/server-ai/src/ldai/managed_agent.py | 20 ++--- .../sdk/server-ai/src/ldai/managed_model.py | 78 ++++++------------- .../sdk/server-ai/tests/test_managed_model.py | 15 +--- 4 files changed, 34 insertions(+), 83 deletions(-) diff --git a/packages/sdk/server-ai/src/ldai/client.py b/packages/sdk/server-ai/src/ldai/client.py index 448d5c55..3193095c 100644 --- a/packages/sdk/server-ai/src/ldai/client.py +++ b/packages/sdk/server-ai/src/ldai/client.py @@ -443,7 +443,7 @@ async def create_model( if not runner: return None - return ManagedModel(config, runner) + return ManagedModel(config, runner) # type: ignore[arg-type] async def create_chat( self, @@ -517,7 +517,7 @@ async def create_agent( if not runner: return None - return ManagedAgent(config, runner) + return ManagedAgent(config, runner) # type: ignore[arg-type] def agent_config( self, diff --git a/packages/sdk/server-ai/src/ldai/managed_agent.py b/packages/sdk/server-ai/src/ldai/managed_agent.py index a2abdf98..9d582ae4 100644 --- a/packages/sdk/server-ai/src/ldai/managed_agent.py +++ b/packages/sdk/server-ai/src/ldai/managed_agent.py @@ -1,26 +1,22 @@ """ManagedAgent — LaunchDarkly managed wrapper for agent invocations.""" -from typing import Union - from ldai.models import AIAgentConfig -from ldai.providers import AgentResult, AgentRunner from ldai.providers.runner import Runner -from ldai.providers.types import ManagedResult, RunnerResult +from ldai.providers.types import ManagedResult class ManagedAgent: """ LaunchDarkly managed wrapper for AI agent invocations. - Holds an AgentRunner or Runner. Handles tracking automatically via - ``create_tracker()``. + Holds a Runner. Handles tracking automatically via ``create_tracker()``. Obtain an instance via ``LDAIClient.create_agent()``. """ def __init__( self, ai_config: AIAgentConfig, - agent_runner: Union[Runner, AgentRunner], + agent_runner: Runner, ): self._ai_config = ai_config self._agent_runner = agent_runner @@ -33,23 +29,21 @@ async def run(self, input: str) -> ManagedResult: :return: ManagedResult containing the agent's output and metric summary """ tracker = self._ai_config.create_tracker() - result: Union[RunnerResult, AgentResult] = await tracker.track_metrics_of_async( + result = await tracker.track_metrics_of_async( lambda r: r.metrics, lambda: self._agent_runner.run(input), ) - # Support both RunnerResult (content) and legacy AgentResult (output) - content = result.content if isinstance(result, RunnerResult) else result.output # type: ignore[union-attr] return ManagedResult( - content=content, + content=result.content, metrics=tracker.get_summary(), raw=result.raw, ) - def get_agent_runner(self) -> Union[Runner, AgentRunner]: + def get_agent_runner(self) -> Runner: """ Return the underlying runner for advanced use. - :return: The Runner or AgentRunner instance. + :return: The Runner instance. 
""" return self._agent_runner diff --git a/packages/sdk/server-ai/src/ldai/managed_model.py b/packages/sdk/server-ai/src/ldai/managed_model.py index 3d2949c3..48f1a587 100644 --- a/packages/sdk/server-ai/src/ldai/managed_model.py +++ b/packages/sdk/server-ai/src/ldai/managed_model.py @@ -1,12 +1,11 @@ import asyncio import warnings -from typing import List, Union +from typing import List from ldai import log from ldai.models import AICompletionConfig, LDMessage -from ldai.providers.model_runner import ModelRunner from ldai.providers.runner import Runner -from ldai.providers.types import JudgeResult, ManagedResult, ModelResponse, RunnerResult +from ldai.providers.types import JudgeResult, LDAIMetrics, ManagedResult, ModelResponse from ldai.tracker import LDAIConfigTracker @@ -14,15 +13,15 @@ class ManagedModel: """ LaunchDarkly managed wrapper for AI model invocations. - Holds a Runner (or legacy ModelRunner). Handles conversation management, - judge evaluation dispatch, and tracking automatically via ``create_tracker()``. + Holds a Runner. Handles conversation management, judge evaluation + dispatch, and tracking automatically via ``create_tracker()``. Obtain an instance via ``LDAIClient.create_model()``. """ def __init__( self, ai_config: AICompletionConfig, - model_runner: Union[Runner, ModelRunner], + model_runner: Runner, ): self._ai_config = ai_config self._model_runner = model_runner @@ -48,50 +47,27 @@ async def run(self, prompt: str) -> ManagedResult: config_messages = self._ai_config.messages or [] all_messages = config_messages + self._messages - result: Union[RunnerResult, ModelResponse] = await tracker.track_metrics_of_async( + result = await tracker.track_metrics_of_async( lambda r: r.metrics, - lambda: self._invoke_runner(all_messages), + lambda: self._model_runner.run(all_messages), ) - # Support both new RunnerResult and legacy ModelResponse - if isinstance(result, RunnerResult): - content = result.content - raw = result.raw - parsed = result.parsed - assistant_message = LDMessage(role='assistant', content=content) - else: - content = result.message.content - raw = getattr(result, 'raw', None) - parsed = getattr(result, 'parsed', None) - assistant_message = result.message + assistant_message = LDMessage(role='assistant', content=result.content) input_text = '\r\n'.join(m.content for m in self._messages) if self._messages else '' - evaluations_task = self._track_judge_results(tracker, input_text, content) + evaluations_task = self._track_judge_results(tracker, input_text, result.content) self._messages.append(assistant_message) return ManagedResult( - content=content, + content=result.content, metrics=tracker.get_summary(), - raw=raw, - parsed=parsed, + raw=result.raw, + parsed=result.parsed, evaluations=evaluations_task, ) - async def _invoke_runner( - self, all_messages: List[LDMessage] - ) -> Union[RunnerResult, ModelResponse]: - """ - Delegate to the runner. Supports both the new ``Runner`` protocol - (``run(messages) → RunnerResult``) and the legacy ``ModelRunner`` - (``invoke_model(messages) → ModelResponse``). - """ - if isinstance(self._model_runner, Runner): - return await self._model_runner.run(all_messages) - # Legacy ModelRunner path - return await self._model_runner.invoke_model(all_messages) # type: ignore[union-attr] - async def invoke(self, prompt: str) -> ModelResponse: """ Invoke the model with a prompt string. 
@@ -108,26 +84,16 @@ async def invoke(self, prompt: str) -> ModelResponse: DeprecationWarning, stacklevel=2, ) - tracker = self._ai_config.create_tracker() - - user_message = LDMessage(role='user', content=prompt) - self._messages.append(user_message) - - config_messages = self._ai_config.messages or [] - all_messages = config_messages + self._messages - - response: ModelResponse = await tracker.track_metrics_of_async( - lambda result: result.metrics, - lambda: self._model_runner.invoke_model(all_messages), # type: ignore[union-attr] + result = await self.run(prompt) + return ModelResponse( + message=LDMessage(role='assistant', content=result.content), + metrics=LDAIMetrics( + success=bool(result.metrics.success), + usage=result.metrics.usage, + ), + evaluations=result.evaluations, ) - input_text = '\r\n'.join(m.content for m in self._messages) if self._messages else '' - output_text = response.message.content - response.evaluations = self._track_judge_results(tracker, input_text, output_text) - - self._messages.append(response.message) - return response - def _track_judge_results( self, tracker: LDAIConfigTracker, @@ -169,11 +135,11 @@ def append_messages(self, messages: List[LDMessage]) -> None: """ self._messages.extend(messages) - def get_model_runner(self) -> Union[Runner, ModelRunner]: + def get_model_runner(self) -> Runner: """ Return the underlying runner for advanced use. - :return: The Runner or legacy ModelRunner instance. + :return: The Runner instance. """ return self._model_runner diff --git a/packages/sdk/server-ai/tests/test_managed_model.py b/packages/sdk/server-ai/tests/test_managed_model.py index cc190abf..73739da2 100644 --- a/packages/sdk/server-ai/tests/test_managed_model.py +++ b/packages/sdk/server-ai/tests/test_managed_model.py @@ -9,7 +9,7 @@ from ldai.evaluator import Evaluator from ldai.managed_model import ManagedModel from ldai.models import AICompletionConfig, LDMessage, ModelConfig, ProviderConfig -from ldai.providers.types import JudgeResult, LDAIMetrics, ManagedResult, ModelResponse, RunnerResult +from ldai.providers.types import JudgeResult, LDAIMetrics, ManagedResult, RunnerResult from ldai.tracker import LDAIConfigTracker, LDAIMetricSummary @@ -21,13 +21,6 @@ def _make_runner_result(content: str = 'response text') -> RunnerResult: ) -def _make_model_response(content: str = 'response text') -> ModelResponse: - return ModelResponse( - message=LDMessage(role='assistant', content=content), - metrics=LDAIMetrics(success=True, usage=None), - ) - - def _make_summary() -> LDAIMetricSummary: summary = LDAIMetricSummary() summary._success = True @@ -237,11 +230,9 @@ async def test_invoke_emits_deprecation_warning(self): """invoke() should emit a DeprecationWarning.""" evaluator = Evaluator.noop() mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) + mock_runner.run = AsyncMock(return_value=_make_runner_result()) - config, mock_tracker = _make_config_with_tracker(evaluator) - # invoke() expects a ModelResponse from the tracker, not a RunnerResult. 
- mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) + config, _mock_tracker = _make_config_with_tracker(evaluator) model = ManagedModel(config, mock_runner) with pytest.warns(DeprecationWarning, match=r"ManagedModel\.invoke\(\) is deprecated"): From ff871bf9293c3b3b7ae9e04fef51666ecb87a7bb Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Wed, 29 Apr 2026 12:58:34 -0500 Subject: [PATCH 06/13] fix: tighten _track_from_metrics_extractor checks - Drop the inconsistent 'if metrics else None' guard on reported_ms; the next line already dereferences metrics.success unconditionally. - Use 'is not None' for tool_calls so an explicit empty list still triggers tracking (preserves the distinction between 'not tracked' and 'tracked with no calls'). --- packages/sdk/server-ai/src/ldai/tracker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/sdk/server-ai/src/ldai/tracker.py b/packages/sdk/server-ai/src/ldai/tracker.py index 414ac34b..9e61a8a4 100644 --- a/packages/sdk/server-ai/src/ldai/tracker.py +++ b/packages/sdk/server-ai/src/ldai/tracker.py @@ -280,7 +280,7 @@ def track_duration_of(self, func): return result def _track_from_metrics_extractor(self, metrics: Any, elapsed_ms: int) -> None: - reported_ms = getattr(metrics, 'duration_ms', None) if metrics else None + reported_ms = getattr(metrics, 'duration_ms', None) self.track_duration(reported_ms if reported_ms is not None else elapsed_ms) if metrics.success: self.track_success() @@ -288,7 +288,7 @@ def _track_from_metrics_extractor(self, metrics: Any, elapsed_ms: int) -> None: self.track_error() if metrics.usage: self.track_tokens(metrics.usage) - if getattr(metrics, 'tool_calls', None): + if getattr(metrics, 'tool_calls', None) is not None: self.track_tool_calls(metrics.tool_calls) def track_metrics_of( From d75467f8a9cec9d4b85a88051c6ae19418f660a9 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Wed, 29 Apr 2026 16:06:57 -0500 Subject: [PATCH 07/13] refactor: remove deprecated ManagedModel.invoke() Drop the deprecated invoke() method from the managed layer along with its dedicated test class and the warnings/LDAIMetrics/ModelResponse imports that were only needed by it. Type definitions in providers/ remain so downstream provider packages keep building. --- .../sdk/server-ai/src/ldai/managed_model.py | 29 +------------------ .../sdk/server-ai/tests/test_managed_model.py | 22 -------------- 2 files changed, 1 insertion(+), 50 deletions(-) diff --git a/packages/sdk/server-ai/src/ldai/managed_model.py b/packages/sdk/server-ai/src/ldai/managed_model.py index 48f1a587..94605eab 100644 --- a/packages/sdk/server-ai/src/ldai/managed_model.py +++ b/packages/sdk/server-ai/src/ldai/managed_model.py @@ -1,11 +1,10 @@ import asyncio -import warnings from typing import List from ldai import log from ldai.models import AICompletionConfig, LDMessage from ldai.providers.runner import Runner -from ldai.providers.types import JudgeResult, LDAIMetrics, ManagedResult, ModelResponse +from ldai.providers.types import JudgeResult, ManagedResult from ldai.tracker import LDAIConfigTracker @@ -68,32 +67,6 @@ async def run(self, prompt: str) -> ManagedResult: evaluations=evaluations_task, ) - async def invoke(self, prompt: str) -> ModelResponse: - """ - Invoke the model with a prompt string. - - .. deprecated:: - Use :meth:`run` instead. This method will be removed in a future - release once the migration to :class:`ManagedResult` is complete. 
- - :param prompt: The user prompt to send to the model - :return: ModelResponse containing the model's response and metrics - """ - warnings.warn( - "ManagedModel.invoke() is deprecated. Use run() instead.", - DeprecationWarning, - stacklevel=2, - ) - result = await self.run(prompt) - return ModelResponse( - message=LDMessage(role='assistant', content=result.content), - metrics=LDAIMetrics( - success=bool(result.metrics.success), - usage=result.metrics.usage, - ), - evaluations=result.evaluations, - ) - def _track_judge_results( self, tracker: LDAIConfigTracker, diff --git a/packages/sdk/server-ai/tests/test_managed_model.py b/packages/sdk/server-ai/tests/test_managed_model.py index 73739da2..6d679552 100644 --- a/packages/sdk/server-ai/tests/test_managed_model.py +++ b/packages/sdk/server-ai/tests/test_managed_model.py @@ -222,27 +222,5 @@ async def test_noop_evaluator_returns_empty_list(self): assert results == [] -class TestManagedModelInvokeDeprecated: - """The deprecated invoke() method continues to work and emits a DeprecationWarning.""" - - @pytest.mark.asyncio - async def test_invoke_emits_deprecation_warning(self): - """invoke() should emit a DeprecationWarning.""" - evaluator = Evaluator.noop() - mock_runner = MagicMock() - mock_runner.run = AsyncMock(return_value=_make_runner_result()) - - config, _mock_tracker = _make_config_with_tracker(evaluator) - model = ManagedModel(config, mock_runner) - - with pytest.warns(DeprecationWarning, match=r"ManagedModel\.invoke\(\) is deprecated"): - response = await model.invoke('Hello') - - assert response is not None - # invoke() still wires the evaluations chain on the response. - if response.evaluations is not None: - await response.evaluations - - async def _empty_eval() -> List[JudgeResult]: return [] From 89d0ad7cd97c61bd4fe191489c154130c68b7e3d Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Wed, 29 Apr 2026 16:13:25 -0500 Subject: [PATCH 08/13] refactor: type RunnerFactory.create_model/agent returns as Optional[Runner] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The factory's downstream consumers (ManagedModel, ManagedAgent) now take Runner; aligning the factory's return types lets us drop the type: ignore comments at the ManagedModel/ManagedAgent call sites. Provider package PRs will update their concrete implementations to match. Judge still takes ModelRunner, so its call site picks up the type: ignore[arg-type] in its place — that's resolved later in the cleanup PR when Judge migrates to Runner. 
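A rough sketch of the call-site pattern this unblocks (the factory call and the provider string are illustrative; the None check and ManagedModel construction mirror the existing client.py code):

    runner = RunnerFactory.create_model(config, default_ai_provider="openai")
    if not runner:
        return None
    return ManagedModel(config, runner)  # no type: ignore needed once the factory returns Optional[Runner]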
--- packages/sdk/server-ai/src/ldai/client.py | 6 +++--- .../server-ai/src/ldai/providers/runner_factory.py | 11 +++++------ 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/packages/sdk/server-ai/src/ldai/client.py b/packages/sdk/server-ai/src/ldai/client.py index 3193095c..ededae36 100644 --- a/packages/sdk/server-ai/src/ldai/client.py +++ b/packages/sdk/server-ai/src/ldai/client.py @@ -329,7 +329,7 @@ def create_judge( if not provider: return None - return Judge(judge_config, provider) + return Judge(judge_config, provider) # type: ignore[arg-type] except Exception as error: return None @@ -443,7 +443,7 @@ async def create_model( if not runner: return None - return ManagedModel(config, runner) # type: ignore[arg-type] + return ManagedModel(config, runner) async def create_chat( self, @@ -517,7 +517,7 @@ async def create_agent( if not runner: return None - return ManagedAgent(config, runner) # type: ignore[arg-type] + return ManagedAgent(config, runner) def agent_config( self, diff --git a/packages/sdk/server-ai/src/ldai/providers/runner_factory.py b/packages/sdk/server-ai/src/ldai/providers/runner_factory.py index 9363f8e0..b7548791 100644 --- a/packages/sdk/server-ai/src/ldai/providers/runner_factory.py +++ b/packages/sdk/server-ai/src/ldai/providers/runner_factory.py @@ -4,9 +4,8 @@ from ldai import log from ldai.models import AIConfigKind from ldai.providers.agent_graph_runner import AgentGraphRunner -from ldai.providers.agent_runner import AgentRunner from ldai.providers.ai_provider import AIProvider -from ldai.providers.model_runner import ModelRunner +from ldai.providers.runner import Runner T = TypeVar('T') @@ -118,13 +117,13 @@ def _get_providers_to_try( def create_model( config: AIConfigKind, default_ai_provider: Optional[str] = None, - ) -> Optional[ModelRunner]: + ) -> Optional[Runner]: """ Create a model executor for the given AI completion config. :param config: LaunchDarkly AI config (completion or judge) :param default_ai_provider: Optional provider override ('openai', 'langchain', …) - :return: Configured ModelRunner ready to invoke the model, or None + :return: Configured Runner ready to invoke the model, or None """ provider_name = config.provider.name.lower() if config.provider else None providers = RunnerFactory._get_providers_to_try(default_ai_provider, provider_name) @@ -135,7 +134,7 @@ def create_agent( config: Any, tools: Any, default_ai_provider: Optional[str] = None, - ) -> Optional[AgentRunner]: + ) -> Optional[Runner]: """ CAUTION: This feature is experimental and should NOT be considered ready for production use. @@ -147,7 +146,7 @@ def create_agent( :param config: LaunchDarkly AI agent config :param tools: Tool registry mapping tool names to callables :param default_ai_provider: Optional provider override - :return: AgentRunner instance, or None + :return: Runner instance, or None """ provider_name = config.provider.name.lower() if config.provider else None providers = RunnerFactory._get_providers_to_try(default_ai_provider, provider_name) From 7a52f24c1b2b3e56ad63f29f6e39ce2b5a4c0bfb Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Wed, 29 Apr 2026 16:56:03 -0500 Subject: [PATCH 09/13] refactor: handle metrics extraction failures gracefully MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the metrics_extractor call inside _track_from_metrics_extractor so extraction errors are caught and logged without bubbling up. 
When extraction fails or returns None, only the wall-clock duration is tracked — success/error is left untouched since the underlying model call itself succeeded. Also tighten the tool_calls check to access metrics.tool_calls directly, mirroring how metrics.usage is accessed. --- packages/sdk/server-ai/src/ldai/tracker.py | 25 ++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/packages/sdk/server-ai/src/ldai/tracker.py b/packages/sdk/server-ai/src/ldai/tracker.py index 9e61a8a4..df122a24 100644 --- a/packages/sdk/server-ai/src/ldai/tracker.py +++ b/packages/sdk/server-ai/src/ldai/tracker.py @@ -279,7 +279,22 @@ def track_duration_of(self, func): return result - def _track_from_metrics_extractor(self, metrics: Any, elapsed_ms: int) -> None: + def _track_from_metrics_extractor( + self, + result: Any, + metrics_extractor: Callable[[Any], Any], + elapsed_ms: int, + ) -> None: + metrics = None + try: + metrics = metrics_extractor(result) + except Exception as exc: + log.warning("Failed to extract metrics: %s", exc) + + if metrics is None: + self.track_duration(elapsed_ms) + return + reported_ms = getattr(metrics, 'duration_ms', None) self.track_duration(reported_ms if reported_ms is not None else elapsed_ms) if metrics.success: @@ -288,7 +303,7 @@ def _track_from_metrics_extractor(self, metrics: Any, elapsed_ms: int) -> None: self.track_error() if metrics.usage: self.track_tokens(metrics.usage) - if getattr(metrics, 'tool_calls', None) is not None: + if metrics.tool_calls is not None: self.track_tool_calls(metrics.tool_calls) def track_metrics_of( @@ -326,8 +341,7 @@ def track_metrics_of( raise err elapsed_ms = (time.perf_counter_ns() - start_ns) // 1_000_000 - metrics = metrics_extractor(result) - self._track_from_metrics_extractor(metrics, elapsed_ms) + self._track_from_metrics_extractor(result, metrics_extractor, elapsed_ms) return result async def track_metrics_of_async(self, metrics_extractor, func): @@ -355,8 +369,7 @@ async def track_metrics_of_async(self, metrics_extractor, func): raise err elapsed_ms = (time.perf_counter_ns() - start_ns) // 1_000_000 - metrics = metrics_extractor(result) - self._track_from_metrics_extractor(metrics, elapsed_ms) + self._track_from_metrics_extractor(result, metrics_extractor, elapsed_ms) return result def track_judge_result(self, judge_result: Any) -> None: From 45845edb000db7415fc1bd30b44b138026c46212 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Thu, 30 Apr 2026 08:58:16 -0500 Subject: [PATCH 10/13] refactor: update Judge to use Runner protocol and RunnerResult - Judge now accepts Runner instead of ModelRunner - evaluate() calls runner.run(output_type=...) 
instead of invoke_structured_model - response.parsed replaces StructuredResponse.data; None guard added - evaluate_messages() accepts RunnerResult instead of ModelResponse - Tests updated to use RunnerResult and mock_runner.run Co-Authored-By: Claude Sonnet 4.6 --- packages/sdk/server-ai/src/ldai/client.py | 2 +- .../sdk/server-ai/src/ldai/judge/__init__.py | 22 +++-- packages/sdk/server-ai/tests/test_judge.py | 84 +++++++++---------- 3 files changed, 55 insertions(+), 53 deletions(-) diff --git a/packages/sdk/server-ai/src/ldai/client.py b/packages/sdk/server-ai/src/ldai/client.py index ededae36..448d5c55 100644 --- a/packages/sdk/server-ai/src/ldai/client.py +++ b/packages/sdk/server-ai/src/ldai/client.py @@ -329,7 +329,7 @@ def create_judge( if not provider: return None - return Judge(judge_config, provider) # type: ignore[arg-type] + return Judge(judge_config, provider) except Exception as error: return None diff --git a/packages/sdk/server-ai/src/ldai/judge/__init__.py b/packages/sdk/server-ai/src/ldai/judge/__init__.py index f2e8c362..6919a7aa 100644 --- a/packages/sdk/server-ai/src/ldai/judge/__init__.py +++ b/packages/sdk/server-ai/src/ldai/judge/__init__.py @@ -8,8 +8,8 @@ from ldai import log from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder from ldai.models import AIJudgeConfig, LDMessage -from ldai.providers.model_runner import ModelRunner -from ldai.providers.types import JudgeResult, ModelResponse +from ldai.providers.runner import Runner +from ldai.providers.types import JudgeResult, RunnerResult class Judge: @@ -23,7 +23,7 @@ class Judge: def __init__( self, ai_config: AIJudgeConfig, - model_runner: ModelRunner, + model_runner: Runner, ): """ Initialize the Judge. @@ -76,10 +76,14 @@ async def evaluate( response = await tracker.track_metrics_of_async( lambda result: result.metrics, - lambda: self._model_runner.invoke_structured_model(messages, self._evaluation_response_structure), + lambda: self._model_runner.run(messages, output_type=self._evaluation_response_structure), ) - parsed = self._parse_evaluation_response(response.data) + if response.parsed is None: + log.warning('Judge evaluation did not return structured output') + return judge_result + + parsed = self._parse_evaluation_response(response.parsed) if parsed is None: log.warning('Judge evaluation did not return the expected evaluation') @@ -99,19 +103,19 @@ async def evaluate( async def evaluate_messages( self, messages: list[LDMessage], - response: ModelResponse, + response: RunnerResult, sampling_ratio: float = 1.0, ) -> JudgeResult: """ Evaluates an AI response from chat messages and response. :param messages: Array of messages representing the conversation history - :param response: The AI response to be evaluated + :param response: The runner result to be evaluated :param sampling_ratio: Sampling ratio (0-1) to determine if evaluation should be processed (defaults to 1) :return: The result of the judge evaluation. """ input_text = '\r\n'.join([msg.content for msg in messages]) if messages else '' - output_text = response.message.content + output_text = response.content return await self.evaluate(input_text, output_text, sampling_ratio) @@ -123,7 +127,7 @@ def get_ai_config(self) -> AIJudgeConfig: """ return self._ai_config - def get_model_runner(self) -> ModelRunner: + def get_model_runner(self) -> Runner: """ Returns the model runner used by this judge. 
diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py index c2690b6a..3ca0750b 100644 --- a/packages/sdk/server-ai/tests/test_judge.py +++ b/packages/sdk/server-ai/tests/test_judge.py @@ -9,7 +9,7 @@ from ldai.judge import Judge from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder from ldai.models import AIJudgeConfig, AIJudgeConfigDefault, LDMessage, ModelConfig, ProviderConfig -from ldai.providers.types import JudgeResult, LDAIMetrics, StructuredResponse +from ldai.providers.types import JudgeResult, LDAIMetrics, RunnerResult from ldai.tracker import LDAIConfigTracker @@ -40,9 +40,9 @@ def client(td: TestData) -> LDClient: @pytest.fixture def mock_runner(): - """Create a mock AI provider.""" + """Create a mock AI runner.""" provider = MagicMock() - provider.invoke_structured_model = AsyncMock() + provider.run = AsyncMock() return provider @@ -137,7 +137,7 @@ async def test_evaluate_returns_failure_when_evaluation_metric_key_missing( assert isinstance(result, JudgeResult) assert result.success is False assert result.sampled is False - mock_runner.invoke_structured_model.assert_not_called() + mock_runner.run.assert_not_called() @pytest.mark.asyncio async def test_evaluate_returns_failure_when_messages_missing( @@ -151,23 +151,23 @@ async def test_evaluate_returns_failure_when_messages_missing( assert isinstance(result, JudgeResult) assert result.success is False assert result.sampled is False - mock_runner.invoke_structured_model.assert_not_called() + mock_runner.run.assert_not_called() @pytest.mark.asyncio async def test_evaluate_success_with_valid_response( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): """Evaluate should return JudgeResponse with valid evaluation.""" - mock_response = StructuredResponse( - data={ + mock_response = RunnerResult( + content='', + metrics=LDAIMetrics(success=True), + parsed={ 'score': 0.85, 'reasoning': 'The response is highly relevant to the input.' 
}, - raw_response='{"score": 0.85, "reasoning": "..."}', - metrics=LDAIMetrics(success=True) ) - mock_runner.invoke_structured_model.return_value = mock_response + mock_runner.run.return_value = mock_response tracker.track_metrics_of_async = AsyncMock(return_value=mock_response) judge = Judge(judge_config_with_key, mock_runner) @@ -187,15 +187,15 @@ async def test_evaluate_success_with_evaluation_response_shape( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): """Evaluate should accept shape { score, reasoning } and key by metric.""" - mock_response = StructuredResponse( - data={ + mock_response = RunnerResult( + content='', + metrics=LDAIMetrics(success=True), + parsed={ 'score': 0.9, 'reasoning': 'The response is accurate and complete.', }, - raw_response='{"score": 0.9, "reasoning": "..."}', - metrics=LDAIMetrics(success=True), ) - mock_runner.invoke_structured_model.return_value = mock_response + mock_runner.run.return_value = mock_response tracker.track_metrics_of_async = AsyncMock(return_value=mock_response) judge = Judge(judge_config_with_key, mock_runner) @@ -214,13 +214,13 @@ async def test_evaluate_handles_missing_evaluation_in_response( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): """Evaluate should handle missing score/reasoning in response.""" - mock_response = StructuredResponse( - data={}, - raw_response='{}', - metrics=LDAIMetrics(success=True) + mock_response = RunnerResult( + content='', + metrics=LDAIMetrics(success=True), + parsed={}, ) - mock_runner.invoke_structured_model.return_value = mock_response + mock_runner.run.return_value = mock_response tracker.track_metrics_of_async = AsyncMock(return_value=mock_response) judge = Judge(judge_config_with_key, mock_runner) @@ -236,16 +236,16 @@ async def test_evaluate_handles_invalid_score( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): """Evaluate should handle invalid score values.""" - mock_response = StructuredResponse( - data={ + mock_response = RunnerResult( + content='', + metrics=LDAIMetrics(success=True), + parsed={ 'score': 1.5, - 'reasoning': 'Some reasoning' + 'reasoning': 'Some reasoning', }, - raw_response='{"score": 1.5, "reasoning": "..."}', - metrics=LDAIMetrics(success=True) ) - mock_runner.invoke_structured_model.return_value = mock_response + mock_runner.run.return_value = mock_response tracker.track_metrics_of_async = AsyncMock(return_value=mock_response) judge = Judge(judge_config_with_key, mock_runner) @@ -261,13 +261,13 @@ async def test_evaluate_handles_missing_reasoning( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): """Evaluate should handle missing reasoning.""" - mock_response = StructuredResponse( - data={'score': 0.8}, - raw_response='{"score": 0.8}', - metrics=LDAIMetrics(success=True) + mock_response = RunnerResult( + content='', + metrics=LDAIMetrics(success=True), + parsed={'score': 0.8}, ) - mock_runner.invoke_structured_model.return_value = mock_response + mock_runner.run.return_value = mock_response tracker.track_metrics_of_async = AsyncMock(return_value=mock_response) judge = Judge(judge_config_with_key, mock_runner) @@ -283,7 +283,7 @@ async def test_evaluate_handles_exception( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): """Evaluate should handle exceptions gracefully.""" - mock_runner.invoke_structured_model.side_effect = Exception("Provider error") + mock_runner.run.side_effect = 
Exception("Provider error") tracker.track_metrics_of_async = AsyncMock(side_effect=Exception("Provider error")) judge = Judge(judge_config_with_key, mock_runner) @@ -306,7 +306,7 @@ async def test_evaluate_respects_sampling_rate( assert isinstance(result, JudgeResult) assert result.sampled is False assert result.success is False - mock_runner.invoke_structured_model.assert_not_called() + mock_runner.run.assert_not_called() class TestJudgeEvaluateMessages: @@ -317,15 +317,13 @@ async def test_evaluate_messages_calls_evaluate( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): """evaluate_messages should call evaluate with constructed input/output.""" - from ldai.providers.types import ModelResponse - - mock_response = StructuredResponse( - data={'score': 0.9, 'reasoning': 'Very relevant'}, - raw_response='{"score": 0.9, "reasoning": "..."}', - metrics=LDAIMetrics(success=True) + mock_response = RunnerResult( + content='', + metrics=LDAIMetrics(success=True), + parsed={'score': 0.9, 'reasoning': 'Very relevant'}, ) - mock_runner.invoke_structured_model.return_value = mock_response + mock_runner.run.return_value = mock_response tracker.track_metrics_of_async = AsyncMock(return_value=mock_response) judge = Judge(judge_config_with_key, mock_runner) @@ -334,9 +332,9 @@ async def test_evaluate_messages_calls_evaluate( LDMessage(role='user', content='Question 1'), LDMessage(role='assistant', content='Answer 1'), ] - chat_response = ModelResponse( - message=LDMessage(role='assistant', content='Answer 2'), - metrics=LDAIMetrics(success=True) + chat_response = RunnerResult( + content='Answer 2', + metrics=LDAIMetrics(success=True), ) result = await judge.evaluate_messages(messages, chat_response) From 56249a1204291123d7d181c275c75276340c7631 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Tue, 28 Apr 2026 18:22:39 -0500 Subject: [PATCH 11/13] feat: Wire LDAIMetrics tool_calls and duration_ms into tracker --- packages/sdk/server-ai/tests/test_tracker.py | 108 +++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/packages/sdk/server-ai/tests/test_tracker.py b/packages/sdk/server-ai/tests/test_tracker.py index c2ae2dde..930ca633 100644 --- a/packages/sdk/server-ai/tests/test_tracker.py +++ b/packages/sdk/server-ai/tests/test_tracker.py @@ -909,3 +909,111 @@ def test_client_create_tracker_fails_on_invalid_json(): result = ai_client.create_tracker(bad_token, context) assert not result.is_success() assert "Invalid resumption token" in result.error + + +# --- PR 10: LDAIMetrics enrichment + tracker integration --- + + +def test_ldai_metrics_to_dict_includes_tool_calls_and_duration_ms(): + metrics = LDAIMetrics( + success=True, + usage=TokenUsage(total=10, input=4, output=6), + tool_calls=["search", "lookup"], + duration_ms=123, + ) + d = metrics.to_dict() + assert d["success"] is True + assert d["usage"] == {"total": 10, "input": 4, "output": 6} + assert d["toolCalls"] == ["search", "lookup"] + assert d["durationMs"] == 123 + + +def test_ldai_metrics_to_dict_omits_optional_fields_when_none(): + metrics = LDAIMetrics(success=False) + d = metrics.to_dict() + assert d == {"success": False} + + +def test_track_metrics_of_uses_metrics_duration_ms_when_set(client: LDClient): + context = Context.create("user-key") + tracker = LDAIConfigTracker( + ld_client=client, run_id="test-run-id", config_key="config-key", + variation_key="variation-key", version=3, model_name="m", + provider_name="p", context=context, + ) + + def fn(): + return "done" + + def extract(_r): + 
return LDAIMetrics(success=True, duration_ms=999) + + tracker.track_metrics_of(extract, fn) + assert tracker.get_summary().duration_ms == 999 + + +@pytest.mark.asyncio +async def test_track_metrics_of_async_uses_metrics_duration_ms_when_set(client: LDClient): + context = Context.create("user-key") + tracker = LDAIConfigTracker( + ld_client=client, run_id="test-run-id", config_key="config-key", + variation_key="variation-key", version=3, model_name="m", + provider_name="p", context=context, + ) + + async def fn(): + return "done" + + def extract(_r): + return LDAIMetrics(success=True, duration_ms=42) + + await tracker.track_metrics_of_async(extract, fn) + assert tracker.get_summary().duration_ms == 42 + + +def test_track_metrics_of_calls_track_tool_calls_when_present(client: LDClient): + context = Context.create("user-key") + tracker = LDAIConfigTracker( + ld_client=client, run_id="test-run-id", config_key="config-key", + variation_key="variation-key", version=3, model_name="m", + provider_name="p", context=context, + ) + + def fn(): + return "done" + + def extract(_r): + return LDAIMetrics(success=True, tool_calls=["foo", "bar"]) + + tracker.track_metrics_of(extract, fn) + summary = tracker.get_summary() + assert summary.tool_calls == ["foo", "bar"] + # One $ld:ai:tool_call event per tool key. + tool_call_events = [ + c for c in client.track.mock_calls # type: ignore + if c.args[0] == "$ld:ai:tool_call" + ] + assert len(tool_call_events) == 2 + + +def test_track_metrics_of_skips_track_tool_calls_when_absent(client: LDClient): + context = Context.create("user-key") + tracker = LDAIConfigTracker( + ld_client=client, run_id="test-run-id", config_key="config-key", + variation_key="variation-key", version=3, model_name="m", + provider_name="p", context=context, + ) + + def fn(): + return "done" + + def extract(_r): + return LDAIMetrics(success=True, usage=None) + + tracker.track_metrics_of(extract, fn) + assert tracker.get_summary().tool_calls is None + tool_call_events = [ + c for c in client.track.mock_calls # type: ignore + if c.args[0] == "$ld:ai:tool_call" + ] + assert tool_call_events == [] From 4d86c9cf71471e93221ee2865755f284e38c2d2e Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Thu, 30 Apr 2026 09:19:11 -0500 Subject: [PATCH 12/13] chore: remove stale PR-10 section comment from test_tracker.py --- packages/sdk/server-ai/tests/test_tracker.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/packages/sdk/server-ai/tests/test_tracker.py b/packages/sdk/server-ai/tests/test_tracker.py index 930ca633..4ed53441 100644 --- a/packages/sdk/server-ai/tests/test_tracker.py +++ b/packages/sdk/server-ai/tests/test_tracker.py @@ -911,9 +911,6 @@ def test_client_create_tracker_fails_on_invalid_json(): assert "Invalid resumption token" in result.error -# --- PR 10: LDAIMetrics enrichment + tracker integration --- - - def test_ldai_metrics_to_dict_includes_tool_calls_and_duration_ms(): metrics = LDAIMetrics( success=True, From cc792ecb1c4acb60b6c52edc3f059427a2ccae51 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Thu, 30 Apr 2026 09:41:23 -0500 Subject: [PATCH 13/13] refactor: type metrics_extractor as Callable[[Any], Optional[LDAIMetrics]], remove defensive getattr --- packages/sdk/server-ai/src/ldai/tracker.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/packages/sdk/server-ai/src/ldai/tracker.py b/packages/sdk/server-ai/src/ldai/tracker.py index df122a24..43c836bf 100644 --- a/packages/sdk/server-ai/src/ldai/tracker.py +++ 
b/packages/sdk/server-ai/src/ldai/tracker.py @@ -1,15 +1,20 @@ +from __future__ import annotations + import base64 import json import time import warnings from dataclasses import dataclass from enum import Enum -from typing import Any, Callable, Dict, Iterable, List, Optional +from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional from ldclient import Context, LDClient, Result from ldai import log +if TYPE_CHECKING: + from ldai.providers.types import LDAIMetrics + class FeedbackKind(Enum): """ @@ -282,7 +287,7 @@ def track_duration_of(self, func): def _track_from_metrics_extractor( self, result: Any, - metrics_extractor: Callable[[Any], Any], + metrics_extractor: Callable[[Any], Optional[LDAIMetrics]], elapsed_ms: int, ) -> None: metrics = None @@ -295,8 +300,7 @@ def _track_from_metrics_extractor( self.track_duration(elapsed_ms) return - reported_ms = getattr(metrics, 'duration_ms', None) - self.track_duration(reported_ms if reported_ms is not None else elapsed_ms) + self.track_duration(metrics.duration_ms if metrics.duration_ms is not None else elapsed_ms) if metrics.success: self.track_success() else: @@ -308,7 +312,7 @@ def _track_from_metrics_extractor( def track_metrics_of( self, - metrics_extractor: Callable[[Any], Any], + metrics_extractor: Callable[[Any], Optional[LDAIMetrics]], func: Callable[[], Any], ) -> Any: """ @@ -344,7 +348,11 @@ def track_metrics_of( self._track_from_metrics_extractor(result, metrics_extractor, elapsed_ms) return result - async def track_metrics_of_async(self, metrics_extractor, func): + async def track_metrics_of_async( + self, + metrics_extractor: Callable[[Any], Optional[LDAIMetrics]], + func: Callable[[], Any], + ) -> Any: """ Track metrics for an async AI operation (``func`` is awaited).