diff --git a/packages/sdk/server-ai/src/ldai/__init__.py b/packages/sdk/server-ai/src/ldai/__init__.py index 405ec5a8..f02cee30 100644 --- a/packages/sdk/server-ai/src/ldai/__init__.py +++ b/packages/sdk/server-ai/src/ldai/__init__.py @@ -36,10 +36,13 @@ AgentGraphRunner, AgentResult, AgentRunner, + ManagedResult, + Runner, + RunnerResult, ToolRegistry, ) from ldai.providers.types import JudgeResult -from ldai.tracker import AIGraphTracker +from ldai.tracker import AIGraphTracker, LDAIMetricSummary __all__ = [ 'LDAIClient', @@ -48,6 +51,10 @@ 'AgentGraphRunner', 'AgentResult', 'AgentGraphResult', + 'ManagedResult', + 'Runner', + 'RunnerResult', + 'LDAIMetricSummary', 'ToolRegistry', 'AIAgentConfig', 'AIAgentConfigDefault', diff --git a/packages/sdk/server-ai/src/ldai/client.py b/packages/sdk/server-ai/src/ldai/client.py index 448d5c55..ededae36 100644 --- a/packages/sdk/server-ai/src/ldai/client.py +++ b/packages/sdk/server-ai/src/ldai/client.py @@ -329,7 +329,7 @@ def create_judge( if not provider: return None - return Judge(judge_config, provider) + return Judge(judge_config, provider) # type: ignore[arg-type] except Exception as error: return None diff --git a/packages/sdk/server-ai/src/ldai/managed_agent.py b/packages/sdk/server-ai/src/ldai/managed_agent.py index ab3ee5e6..9d582ae4 100644 --- a/packages/sdk/server-ai/src/ldai/managed_agent.py +++ b/packages/sdk/server-ai/src/ldai/managed_agent.py @@ -1,43 +1,49 @@ """ManagedAgent — LaunchDarkly managed wrapper for agent invocations.""" from ldai.models import AIAgentConfig -from ldai.providers import AgentResult, AgentRunner +from ldai.providers.runner import Runner +from ldai.providers.types import ManagedResult class ManagedAgent: """ LaunchDarkly managed wrapper for AI agent invocations. - Holds an AgentRunner. Handles tracking automatically via ``create_tracker()``. + Holds a Runner. Handles tracking automatically via ``create_tracker()``. Obtain an instance via ``LDAIClient.create_agent()``. """ def __init__( self, ai_config: AIAgentConfig, - agent_runner: AgentRunner, + agent_runner: Runner, ): self._ai_config = ai_config self._agent_runner = agent_runner - async def run(self, input: str) -> AgentResult: + async def run(self, input: str) -> ManagedResult: """ Run the agent with the given input string. :param input: The user prompt or input to the agent - :return: AgentResult containing the agent's output and metrics + :return: ManagedResult containing the agent's output and metric summary """ tracker = self._ai_config.create_tracker() - return await tracker.track_metrics_of_async( - lambda result: result.metrics, + result = await tracker.track_metrics_of_async( + lambda r: r.metrics, lambda: self._agent_runner.run(input), ) + return ManagedResult( + content=result.content, + metrics=tracker.get_summary(), + raw=result.raw, + ) - def get_agent_runner(self) -> AgentRunner: + def get_agent_runner(self) -> Runner: """ - Return the underlying AgentRunner for advanced use. + Return the underlying runner for advanced use. - :return: The AgentRunner instance. + :return: The Runner instance. 
""" return self._agent_runner diff --git a/packages/sdk/server-ai/src/ldai/managed_model.py b/packages/sdk/server-ai/src/ldai/managed_model.py index 9cfb503a..94605eab 100644 --- a/packages/sdk/server-ai/src/ldai/managed_model.py +++ b/packages/sdk/server-ai/src/ldai/managed_model.py @@ -1,10 +1,10 @@ import asyncio -from typing import List, Optional +from typing import List from ldai import log from ldai.models import AICompletionConfig, LDMessage -from ldai.providers.model_runner import ModelRunner -from ldai.providers.types import JudgeResult, ModelResponse +from ldai.providers.runner import Runner +from ldai.providers.types import JudgeResult, ManagedResult from ldai.tracker import LDAIConfigTracker @@ -12,7 +12,7 @@ class ManagedModel: """ LaunchDarkly managed wrapper for AI model invocations. - Holds a ModelRunner. Handles conversation management, judge evaluation + Holds a Runner. Handles conversation management, judge evaluation dispatch, and tracking automatically via ``create_tracker()``. Obtain an instance via ``LDAIClient.create_model()``. """ @@ -20,22 +20,23 @@ class ManagedModel: def __init__( self, ai_config: AICompletionConfig, - model_runner: ModelRunner, + model_runner: Runner, ): self._ai_config = ai_config self._model_runner = model_runner self._messages: List[LDMessage] = [] - async def invoke(self, prompt: str) -> ModelResponse: + async def run(self, prompt: str) -> ManagedResult: """ - Invoke the model with a prompt string. + Run the model with a prompt string. Appends the prompt to the conversation history, prepends any system messages from the config, delegates to the runner, and appends the response to the history. :param prompt: The user prompt to send to the model - :return: ModelResponse containing the model's response and metrics + :return: ManagedResult containing the model's response, metric summary, + and an optional evaluations task """ tracker = self._ai_config.create_tracker() @@ -45,17 +46,26 @@ async def invoke(self, prompt: str) -> ModelResponse: config_messages = self._ai_config.messages or [] all_messages = config_messages + self._messages - response = await tracker.track_metrics_of_async( - lambda result: result.metrics, - lambda: self._model_runner.invoke_model(all_messages), + result = await tracker.track_metrics_of_async( + lambda r: r.metrics, + lambda: self._model_runner.run(all_messages), ) + assistant_message = LDMessage(role='assistant', content=result.content) + input_text = '\r\n'.join(m.content for m in self._messages) if self._messages else '' - output_text = response.message.content - response.evaluations = self._track_judge_results(tracker, input_text, output_text) - self._messages.append(response.message) - return response + evaluations_task = self._track_judge_results(tracker, input_text, result.content) + + self._messages.append(assistant_message) + + return ManagedResult( + content=result.content, + metrics=tracker.get_summary(), + raw=result.raw, + parsed=result.parsed, + evaluations=evaluations_task, + ) def _track_judge_results( self, @@ -98,11 +108,11 @@ def append_messages(self, messages: List[LDMessage]) -> None: """ self._messages.extend(messages) - def get_model_runner(self) -> ModelRunner: + def get_model_runner(self) -> Runner: """ - Return the underlying ModelRunner for advanced use. + Return the underlying runner for advanced use. - :return: The ModelRunner instance. + :return: The Runner instance. 
""" return self._model_runner diff --git a/packages/sdk/server-ai/src/ldai/providers/__init__.py b/packages/sdk/server-ai/src/ldai/providers/__init__.py index b2bfa72e..6f472c69 100644 --- a/packages/sdk/server-ai/src/ldai/providers/__init__.py +++ b/packages/sdk/server-ai/src/ldai/providers/__init__.py @@ -2,13 +2,16 @@ from ldai.providers.agent_runner import AgentRunner from ldai.providers.ai_provider import AIProvider from ldai.providers.model_runner import ModelRunner +from ldai.providers.runner import Runner from ldai.providers.runner_factory import RunnerFactory from ldai.providers.types import ( AgentGraphResult, AgentResult, JudgeResult, LDAIMetrics, + ManagedResult, ModelResponse, + RunnerResult, StructuredResponse, ToolRegistry, ) @@ -21,9 +24,12 @@ 'AgentRunner', 'JudgeResult', 'LDAIMetrics', + 'ManagedResult', 'ModelResponse', 'ModelRunner', + 'Runner', 'RunnerFactory', + 'RunnerResult', 'StructuredResponse', 'ToolRegistry', ] diff --git a/packages/sdk/server-ai/src/ldai/providers/runner.py b/packages/sdk/server-ai/src/ldai/providers/runner.py new file mode 100644 index 00000000..5e1b9abc --- /dev/null +++ b/packages/sdk/server-ai/src/ldai/providers/runner.py @@ -0,0 +1,29 @@ +"""Unified Runner protocol for AI providers.""" + +from typing import Any, Dict, Optional, Protocol, runtime_checkable + +from ldai.providers.types import RunnerResult + + +@runtime_checkable +class Runner(Protocol): + """ + Unified runtime capability interface for all AI provider runners. + + A :class:`Runner` is a focused, configured object that performs a single + AI invocation. + """ + + async def run( + self, + input: Any, + output_type: Optional[Dict[str, Any]] = None, + ) -> RunnerResult: + """ + Execute the runner with the given input. + + :param input: The input to the runner. + :param output_type: Optional JSON schema for structured output. + :return: RunnerResult containing content, metrics, raw, and parsed fields. + """ + ... diff --git a/packages/sdk/server-ai/src/ldai/providers/runner_factory.py b/packages/sdk/server-ai/src/ldai/providers/runner_factory.py index 9363f8e0..b7548791 100644 --- a/packages/sdk/server-ai/src/ldai/providers/runner_factory.py +++ b/packages/sdk/server-ai/src/ldai/providers/runner_factory.py @@ -4,9 +4,8 @@ from ldai import log from ldai.models import AIConfigKind from ldai.providers.agent_graph_runner import AgentGraphRunner -from ldai.providers.agent_runner import AgentRunner from ldai.providers.ai_provider import AIProvider -from ldai.providers.model_runner import ModelRunner +from ldai.providers.runner import Runner T = TypeVar('T') @@ -118,13 +117,13 @@ def _get_providers_to_try( def create_model( config: AIConfigKind, default_ai_provider: Optional[str] = None, - ) -> Optional[ModelRunner]: + ) -> Optional[Runner]: """ Create a model executor for the given AI completion config. :param config: LaunchDarkly AI config (completion or judge) :param default_ai_provider: Optional provider override ('openai', 'langchain', …) - :return: Configured ModelRunner ready to invoke the model, or None + :return: Configured Runner ready to invoke the model, or None """ provider_name = config.provider.name.lower() if config.provider else None providers = RunnerFactory._get_providers_to_try(default_ai_provider, provider_name) @@ -135,7 +134,7 @@ def create_agent( config: Any, tools: Any, default_ai_provider: Optional[str] = None, - ) -> Optional[AgentRunner]: + ) -> Optional[Runner]: """ CAUTION: This feature is experimental and should NOT be considered ready for production use. 
@@ -147,7 +146,7 @@ def create_agent( :param config: LaunchDarkly AI agent config :param tools: Tool registry mapping tool names to callables :param default_ai_provider: Optional provider override - :return: AgentRunner instance, or None + :return: Runner instance, or None """ provider_name = config.provider.name.lower() if config.provider else None providers = RunnerFactory._get_providers_to_try(default_ai_provider, provider_name) diff --git a/packages/sdk/server-ai/src/ldai/providers/types.py b/packages/sdk/server-ai/src/ldai/providers/types.py index aa537880..f5224e0e 100644 --- a/packages/sdk/server-ai/src/ldai/providers/types.py +++ b/packages/sdk/server-ai/src/ldai/providers/types.py @@ -7,7 +7,7 @@ from typing import Any, Callable, Dict, List, Optional from ldai.models import LDMessage -from ldai.tracker import TokenUsage +from ldai.tracker import LDAIMetricSummary, TokenUsage # Type alias for a registry of tools available to an agent. # Keys are tool names; values are the callable implementations. @@ -16,11 +16,19 @@ @dataclass class LDAIMetrics: - """ - Metrics information for AI operations that includes success status and token usage. - """ + """Contains metrics for a single AI invocation.""" + success: bool + """Whether the invocation succeeded.""" + usage: Optional[TokenUsage] = None + """Optional token usage information.""" + + tool_calls: Optional[List[str]] = None + """Ordered list of tool-call names observed during the invocation.""" + + duration_ms: Optional[int] = None + """Wall-clock duration of the runner invocation in milliseconds.""" def to_dict(self) -> Dict[str, Any]: """ @@ -35,13 +43,58 @@ def to_dict(self) -> Dict[str, Any]: 'input': self.usage.input, 'output': self.usage.output, } + if self.tool_calls is not None: + result['toolCalls'] = self.tool_calls + if self.duration_ms is not None: + result['durationMs'] = self.duration_ms return result +@dataclass +class RunnerResult: + """Contains the result of a single AI model invocation.""" + + content: str + """The text content returned by the model.""" + + metrics: LDAIMetrics + """Metrics for this invocation.""" + + raw: Optional[Any] = None + """Optional provider-native response object for advanced consumers.""" + + parsed: Optional[Dict[str, Any]] = None + """Optional parsed structured output, populated when ``output_type`` was supplied.""" + + +@dataclass +class ManagedResult: + """Contains the result of a managed AI invocation, including metrics and optional judge evaluations.""" + + content: str + """The text content returned by the model.""" + + metrics: LDAIMetricSummary + """Aggregated metric summary from the tracker for this invocation.""" + + raw: Optional[Any] = None + """Optional provider-native response object for advanced consumers.""" + + parsed: Optional[Dict[str, Any]] = None + """Optional parsed structured output, populated when ``output_type`` was supplied.""" + + evaluations: Optional[asyncio.Task[List[JudgeResult]]] = None + """Optional asyncio Task that resolves to the list of :class:`JudgeResult` instances when awaited.""" + + @dataclass class ModelResponse: """ Response from a model invocation. + + .. deprecated:: + Use :class:`RunnerResult` (from a runner) and :class:`ManagedResult` + (from the managed layer) instead. """ message: LDMessage metrics: LDAIMetrics @@ -52,6 +105,9 @@ class ModelResponse: class StructuredResponse: """ Structured response from AI models. + + .. deprecated:: + Structured output is now represented by :attr:`RunnerResult.parsed`. 
""" data: Dict[str, Any] raw_response: str @@ -60,16 +116,28 @@ class StructuredResponse: @dataclass class JudgeResult: - """ - Result from a judge evaluation. - """ + """Contains the result of a single judge evaluation.""" + judge_config_key: Optional[str] = None + """The configuration key of the judge that produced this result.""" + success: bool = False + """Whether the judge evaluation completed successfully.""" + error_message: Optional[str] = None - sampled: bool = False # True when the evaluation was sampled and run + """Error message describing why the evaluation failed, if any.""" + + sampled: bool = False + """True when the evaluation was sampled and run.""" + metric_key: Optional[str] = None + """The metric key under which this judge's score is reported.""" + score: Optional[float] = None + """The numeric score (0-1) returned by the judge.""" + reasoning: Optional[str] = None + """The judge's reasoning text accompanying the score.""" def to_dict(self) -> Dict[str, Any]: """ @@ -96,6 +164,10 @@ def to_dict(self) -> Dict[str, Any]: class AgentResult: """ Result from a single-agent run. + + .. deprecated:: + Use :class:`ManagedResult` (managed layer) or :class:`RunnerResult` + (runner layer) instead. """ output: str raw: Any @@ -104,10 +176,16 @@ class AgentResult: @dataclass class AgentGraphResult: - """ - Result from an agent graph run. - """ + """Contains the result of an agent graph run.""" + output: str + """The agent graph's final output content.""" + raw: Any + """The provider-native response object from the graph run.""" + metrics: LDAIMetrics + """Metrics recorded during the graph run.""" + evaluations: Optional[List[JudgeResult]] = None + """Optional list of judge evaluation results produced for the graph run.""" diff --git a/packages/sdk/server-ai/src/ldai/tracker.py b/packages/sdk/server-ai/src/ldai/tracker.py index 0f5a32c5..df122a24 100644 --- a/packages/sdk/server-ai/src/ldai/tracker.py +++ b/packages/sdk/server-ai/src/ldai/tracker.py @@ -41,15 +41,31 @@ class LDAIMetricSummary: """ def __init__(self): - self._duration = None - self._success = None - self._feedback = None - self._usage = None - self._time_to_first_token = None + self._duration_ms: Optional[int] = None + self._success: Optional[bool] = None + self._feedback: Optional[Dict[str, FeedbackKind]] = None + self._usage: Optional[TokenUsage] = None + self._time_to_first_token: Optional[int] = None + self._tool_calls: Optional[List[str]] = None + self._resumption_token: Optional[str] = None + + @property + def duration_ms(self) -> Optional[int]: + """Duration of the AI operation in milliseconds.""" + return self._duration_ms @property def duration(self) -> Optional[int]: - return self._duration + """ + .. deprecated:: + Use :attr:`duration_ms` instead. + """ + warnings.warn( + "LDAIMetricSummary.duration is deprecated. Use duration_ms instead.", + DeprecationWarning, + stacklevel=2, + ) + return self._duration_ms @property def success(self) -> Optional[bool]: @@ -67,6 +83,20 @@ def usage(self) -> Optional[TokenUsage]: def time_to_first_token(self) -> Optional[int]: return self._time_to_first_token + @property + def tool_calls(self) -> Optional[List[str]]: + """List of tool keys that were invoked during this operation.""" + return self._tool_calls + + @property + def resumption_token(self) -> Optional[str]: + """ + URL-safe Base64-encoded resumption token captured at tracker + instantiation. Useful for deferred feedback flows where a downstream + process needs to associate events with the original execution. 
+ """ + return self._resumption_token + class LDAIConfigTracker: """ @@ -107,8 +137,10 @@ def __init__( self._provider_name = provider_name self._context = context self._graph_key = graph_key - self._summary = LDAIMetricSummary() self._run_id = run_id + self._summary = LDAIMetricSummary() + # Capture resumption_token immediately so it's available on the summary at instantiation. + self._summary._resumption_token = self.resumption_token @property def resumption_token(self) -> str: @@ -200,10 +232,10 @@ def track_duration(self, duration: int) -> None: :param duration: Duration in milliseconds. """ - if self._summary.duration is not None: + if self._summary.duration_ms is not None: log.warning("Duration has already been tracked for this execution. %s", self.__get_track_data()) return - self._summary._duration = duration + self._summary._duration_ms = duration self._ld_client.track( "$ld:ai:duration:total", self._context, self.__get_track_data(), duration ) @@ -251,15 +283,28 @@ def _track_from_metrics_extractor( self, result: Any, metrics_extractor: Callable[[Any], Any], - ) -> Any: - metrics = metrics_extractor(result) + elapsed_ms: int, + ) -> None: + metrics = None + try: + metrics = metrics_extractor(result) + except Exception as exc: + log.warning("Failed to extract metrics: %s", exc) + + if metrics is None: + self.track_duration(elapsed_ms) + return + + reported_ms = getattr(metrics, 'duration_ms', None) + self.track_duration(reported_ms if reported_ms is not None else elapsed_ms) if metrics.success: self.track_success() else: self.track_error() if metrics.usage: self.track_tokens(metrics.usage) - return result + if metrics.tool_calls is not None: + self.track_tool_calls(metrics.tool_calls) def track_metrics_of( self, @@ -278,6 +323,10 @@ def track_metrics_of( For async operations, use :meth:`track_metrics_of_async`. + When the extracted :class:`~ldai.providers.types.LDAIMetrics` object has a + non-``None`` ``duration_ms`` field, that value is used as the measured duration + instead of the wall-clock elapsed time. + :param metrics_extractor: Function that extracts LDAIMetrics from the operation result :param func: Synchronous callable that runs the operation :return: The result of the operation @@ -291,9 +340,9 @@ def track_metrics_of( self.track_error() raise err - duration = (time.perf_counter_ns() - start_ns) // 1_000_000 - self.track_duration(duration) - return self._track_from_metrics_extractor(result, metrics_extractor) + elapsed_ms = (time.perf_counter_ns() - start_ns) // 1_000_000 + self._track_from_metrics_extractor(result, metrics_extractor, elapsed_ms) + return result async def track_metrics_of_async(self, metrics_extractor, func): """ @@ -301,6 +350,10 @@ async def track_metrics_of_async(self, metrics_extractor, func): Same event semantics as :meth:`track_metrics_of`. + When the extracted :class:`~ldai.providers.types.LDAIMetrics` object has a + non-``None`` ``duration_ms`` field, that value is used as the measured duration + instead of the wall-clock elapsed time. 
+ :param metrics_extractor: Function that extracts LDAIMetrics from the operation result :param func: Async callable or zero-arg callable that returns an awaitable when called :return: The result of the operation @@ -315,9 +368,9 @@ async def track_metrics_of_async(self, metrics_extractor, func): self.track_error() raise err - duration = (time.perf_counter_ns() - start_ns) // 1_000_000 - self.track_duration(duration) - return self._track_from_metrics_extractor(result, metrics_extractor) + elapsed_ms = (time.perf_counter_ns() - start_ns) // 1_000_000 + self._track_from_metrics_extractor(result, metrics_extractor, elapsed_ms) + return result def track_judge_result(self, judge_result: Any) -> None: """ @@ -364,6 +417,23 @@ def track_feedback(self, feedback: Dict[str, FeedbackKind]) -> None: 1, ) + def track_tool_calls(self, tool_calls: Iterable[str]) -> None: + """ + Track the tool calls made during an AI operation. + + Stores the tool call names on the summary (guarding against duplicate + tracking) and fires a ``$ld:ai:tool_call`` event for each tool. + + :param tool_calls: Tool identifiers (e.g. from a model response). + """ + if self._summary.tool_calls is not None: + log.warning("Tool calls have already been tracked for this execution. %s", self.__get_track_data()) + return + tool_calls_list = list(tool_calls) + self._summary._tool_calls = tool_calls_list + for tool_key in tool_calls_list: + self.track_tool_call(tool_key) + def track_success(self) -> None: """ Track a successful AI generation. @@ -499,15 +569,6 @@ def track_tool_call(self, tool_key: str) -> None: 1, ) - def track_tool_calls(self, tool_keys: Iterable[str]) -> None: - """ - Track multiple tool invocations for this configuration. - - :param tool_keys: Tool identifiers (e.g. from a model response). - """ - for tool_key in tool_keys: - self.track_tool_call(tool_key) - def get_summary(self) -> LDAIMetricSummary: """ Get the current summary of AI metrics. 
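A sketch of how the reworked ``track_metrics_of_async`` consumes the richer ``LDAIMetrics``. The ``tracker`` argument is assumed to come from an AI config's ``create_tracker()``, and ``fake_provider_call`` is illustrative only; when the extracted metrics carry ``duration_ms`` and ``tool_calls``, those values land on the summary instead of the wall-clock elapsed time::

    from ldai.providers import LDAIMetrics, RunnerResult
    from ldai.tracker import LDAIConfigTracker


    async def run_and_track(tracker: LDAIConfigTracker) -> RunnerResult:
        async def fake_provider_call() -> RunnerResult:
            # Provider-reported metrics: duration_ms overrides the elapsed time
            # measured by the tracker, and tool_calls is recorded once.
            return RunnerResult(
                content="42",
                metrics=LDAIMetrics(
                    success=True,
                    tool_calls=["calculator"],
                    duration_ms=250,
                ),
            )

        result = await tracker.track_metrics_of_async(
            lambda r: r.metrics,   # extractor: pull LDAIMetrics off the result
            fake_provider_call,    # zero-arg callable returning an awaitable
        )

        summary = tracker.get_summary()
        print(summary.success, summary.duration_ms, summary.tool_calls)
        print(summary.resumption_token)  # captured at tracker instantiation
        return result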
diff --git a/packages/sdk/server-ai/tests/test_managed_agent.py b/packages/sdk/server-ai/tests/test_managed_agent.py index 144641fc..0c30637a 100644 --- a/packages/sdk/server-ai/tests/test_managed_agent.py +++ b/packages/sdk/server-ai/tests/test_managed_agent.py @@ -6,13 +6,19 @@ from ldai import LDAIClient, ManagedAgent from ldai.managed_agent import ManagedAgent from ldai.models import AIAgentConfig, AIAgentConfigDefault, ModelConfig, ProviderConfig -from ldai.providers import AgentResult -from ldai.providers.types import LDAIMetrics +from ldai.providers.types import LDAIMetrics, ManagedResult, RunnerResult +from ldai.tracker import LDAIMetricSummary from ldclient import Config, Context, LDClient from ldclient.integrations.test_data import TestData +def _make_summary(success: bool = True) -> LDAIMetricSummary: + summary = LDAIMetricSummary() + summary._success = success + return summary + + @pytest.fixture def td() -> TestData: td = TestData.data_source() @@ -53,30 +59,32 @@ class TestManagedAgentRun: @pytest.mark.asyncio async def test_run_delegates_to_agent_runner(self): - """Should delegate run() to the underlying AgentRunner.""" + """Should delegate run() to the underlying AgentRunner and return ManagedResult.""" mock_config = MagicMock(spec=AIAgentConfig) mock_tracker = MagicMock() mock_tracker.track_metrics_of_async = AsyncMock( - return_value=AgentResult( - output="Test response", - raw=None, + return_value=RunnerResult( + content="Test response", metrics=LDAIMetrics(success=True, usage=None), + raw=None, ) ) + mock_tracker.get_summary = MagicMock(return_value=_make_summary(True)) mock_config.create_tracker = MagicMock(return_value=mock_tracker) mock_runner = MagicMock() mock_runner.run = AsyncMock( - return_value=AgentResult( - output="Test response", - raw=None, + return_value=RunnerResult( + content="Test response", metrics=LDAIMetrics(success=True, usage=None), + raw=None, ) ) agent = ManagedAgent(mock_config, mock_runner) result = await agent.run("Hello") - assert result.output == "Test response" + assert isinstance(result, ManagedResult) + assert result.content == "Test response" assert result.metrics.success is True mock_config.create_tracker.assert_called_once() mock_tracker.track_metrics_of_async.assert_called_once() @@ -87,12 +95,13 @@ async def test_run_uses_create_tracker_for_fresh_tracker(self): mock_config = MagicMock(spec=AIAgentConfig) fresh_tracker = MagicMock() fresh_tracker.track_metrics_of_async = AsyncMock( - return_value=AgentResult( - output="Fresh tracker response", - raw=None, + return_value=RunnerResult( + content="Fresh tracker response", metrics=LDAIMetrics(success=True, usage=None), + raw=None, ) ) + fresh_tracker.get_summary = MagicMock(return_value=_make_summary(True)) mock_config.create_tracker = MagicMock(return_value=fresh_tracker) mock_runner = MagicMock() @@ -100,7 +109,8 @@ async def test_run_uses_create_tracker_for_fresh_tracker(self): agent = ManagedAgent(mock_config, mock_runner) result = await agent.run("Hello") - assert result.output == "Fresh tracker response" + assert isinstance(result, ManagedResult) + assert result.content == "Fresh tracker response" mock_config.create_tracker.assert_called_once() fresh_tracker.track_metrics_of_async.assert_called_once() @@ -152,7 +162,7 @@ async def test_returns_managed_agent_when_runner_available(self, ldai_client: LD mock_runner = MagicMock() mock_runner.run = AsyncMock( - return_value=AgentResult(output="Hello!", raw=None, metrics=LDAIMetrics(success=True, usage=None)) + 
return_value=RunnerResult(content="Hello!", metrics=LDAIMetrics(success=True, usage=None), raw=None) ) original = rf.RunnerFactory.create_agent diff --git a/packages/sdk/server-ai/tests/test_managed_model.py b/packages/sdk/server-ai/tests/test_managed_model.py index 36802a14..6d679552 100644 --- a/packages/sdk/server-ai/tests/test_managed_model.py +++ b/packages/sdk/server-ai/tests/test_managed_model.py @@ -2,48 +2,65 @@ import asyncio from typing import List -from unittest.mock import AsyncMock, MagicMock, patch +from unittest.mock import AsyncMock, MagicMock import pytest from ldai.evaluator import Evaluator from ldai.managed_model import ManagedModel from ldai.models import AICompletionConfig, LDMessage, ModelConfig, ProviderConfig -from ldai.providers.types import JudgeResult, LDAIMetrics, ModelResponse -from ldai.tracker import LDAIConfigTracker +from ldai.providers.types import JudgeResult, LDAIMetrics, ManagedResult, RunnerResult +from ldai.tracker import LDAIConfigTracker, LDAIMetricSummary -def _make_model_response(content: str = 'response text') -> ModelResponse: - return ModelResponse( - message=LDMessage(role='assistant', content=content), +def _make_runner_result(content: str = 'response text') -> RunnerResult: + return RunnerResult( + content=content, metrics=LDAIMetrics(success=True, usage=None), ) -class TestManagedModelInvokeReturnsImmediately: - """invoke() must return before the evaluations task resolves.""" +def _make_summary() -> LDAIMetricSummary: + summary = LDAIMetricSummary() + summary._success = True + return summary + + +def _make_config_with_tracker(evaluator: Evaluator) -> tuple[AICompletionConfig, MagicMock]: + """Build an AICompletionConfig with a fully-mocked tracker.""" + mock_tracker = MagicMock(spec=LDAIConfigTracker) + mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_runner_result()) + mock_tracker.get_summary = MagicMock(return_value=_make_summary()) + config = AICompletionConfig( + key='test-config', + enabled=True, + create_tracker=MagicMock(return_value=mock_tracker), + model=ModelConfig('gpt-4'), + provider=ProviderConfig('openai'), + messages=[], + evaluator=evaluator, + ) + return config, mock_tracker - @pytest.mark.asyncio - async def test_invoke_returns_before_evaluations_resolve(self): - """invoke() should return a ModelResponse before evaluations complete.""" - # Set up a barrier so the evaluation coroutine doesn't complete until we release it - barrier = asyncio.Event() - async def _slow_evaluate(input_text: str, output_text: str) -> List[JudgeResult]: - await barrier.wait() - return [] +class TestManagedModelRunReturnsImmediately: + """run() must return before the evaluations task resolves.""" + @pytest.mark.asyncio + async def test_run_returns_managed_result(self): + """run() should return a ManagedResult with content from the runner.""" evaluator = MagicMock(spec=Evaluator) evaluator.evaluate = MagicMock( - side_effect=lambda i, o: asyncio.create_task(_slow_evaluate(i, o)) + side_effect=lambda i, o: asyncio.create_task(_empty_eval()) ) mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result('hi')) mock_tracker = MagicMock(spec=LDAIConfigTracker) - mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) + mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_runner_result('hi')) + mock_tracker.get_summary = MagicMock(return_value=_make_summary()) config = 
AICompletionConfig( key='test-config', enabled=True, @@ -55,20 +72,46 @@ async def _slow_evaluate(input_text: str, output_text: str) -> List[JudgeResult] ) model = ManagedModel(config, mock_runner) - response = await model.invoke('Hello') + result = await model.run('Hello') + + assert isinstance(result, ManagedResult) + assert result.content == 'hi' + assert isinstance(result.metrics, LDAIMetricSummary) + # Cleanup the still-pending evaluations task. + if result.evaluations is not None: + await result.evaluations + + @pytest.mark.asyncio + async def test_run_returns_before_evaluations_resolve(self): + """run() should return a ManagedResult before evaluations complete.""" + barrier = asyncio.Event() + + async def _slow_evaluate(input_text: str, output_text: str) -> List[JudgeResult]: + await barrier.wait() + return [] + + evaluator = MagicMock(spec=Evaluator) + evaluator.evaluate = MagicMock( + side_effect=lambda i, o: asyncio.create_task(_slow_evaluate(i, o)) + ) + + mock_runner = MagicMock() + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result()) + + config, _tracker = _make_config_with_tracker(evaluator) + model = ManagedModel(config, mock_runner) + result = await model.run('Hello') - # invoke() returned — evaluations task should still be pending - assert response is not None - assert response.evaluations is not None - assert not response.evaluations.done(), "evaluations task should still be pending" + assert result is not None + assert result.evaluations is not None + assert not result.evaluations.done(), "evaluations task should still be pending" - # Release the barrier and let it finish cleanly barrier.set() - await response.evaluations + await result.evaluations @pytest.mark.asyncio async def test_await_evaluations_collects_results(self): - """await response.evaluations should return the list of JudgeResult instances.""" + """await result.evaluations should return the list of JudgeResult instances.""" judge_result = JudgeResult( judge_config_key='judge-key', success=True, @@ -87,24 +130,13 @@ async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult] ) mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) - - mock_tracker = MagicMock(spec=LDAIConfigTracker) - mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) - config = AICompletionConfig( - key='test-config', - enabled=True, - create_tracker=MagicMock(return_value=mock_tracker), - model=ModelConfig('gpt-4'), - provider=ProviderConfig('openai'), - messages=[], - evaluator=evaluator, - ) + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result()) + config, _tracker = _make_config_with_tracker(evaluator) model = ManagedModel(config, mock_runner) - response = await model.invoke('Hello') + result = await model.run('Hello') - results = await response.evaluations # type: ignore[misc] + results = await result.evaluations # type: ignore[misc] assert results == [judge_result] @pytest.mark.asyncio @@ -128,30 +160,19 @@ async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult] ) mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result()) - mock_tracker = MagicMock(spec=LDAIConfigTracker) - mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) + config, mock_tracker = _make_config_with_tracker(evaluator) mock_tracker.track_judge_result = 
MagicMock() - config = AICompletionConfig( - key='test-config', - enabled=True, - create_tracker=MagicMock(return_value=mock_tracker), - model=ModelConfig('gpt-4'), - provider=ProviderConfig('openai'), - messages=[], - evaluator=evaluator, - ) - model = ManagedModel(config, mock_runner) - response = await model.invoke('Hello') + result = await model.run('Hello') # Tracking should NOT have fired yet (before we await evaluations) mock_tracker.track_judge_result.assert_not_called() # Now await the evaluations task — tracking fires inside the chain - await response.evaluations # type: ignore[misc] + await result.evaluations # type: ignore[misc] mock_tracker.track_judge_result.assert_called_once_with(judge_result) @@ -174,25 +195,14 @@ async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult] ) mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result()) - mock_tracker = MagicMock(spec=LDAIConfigTracker) - mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) + config, mock_tracker = _make_config_with_tracker(evaluator) mock_tracker.track_judge_result = MagicMock() - config = AICompletionConfig( - key='test-config', - enabled=True, - create_tracker=MagicMock(return_value=mock_tracker), - model=ModelConfig('gpt-4'), - provider=ProviderConfig('openai'), - messages=[], - evaluator=evaluator, - ) - model = ManagedModel(config, mock_runner) - response = await model.invoke('Hello') - await response.evaluations # type: ignore[misc] + result = await model.run('Hello') + await result.evaluations # type: ignore[misc] mock_tracker.track_judge_result.assert_not_called() @@ -202,23 +212,15 @@ async def test_noop_evaluator_returns_empty_list(self): evaluator = Evaluator.noop() mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) - - mock_tracker = MagicMock(spec=LDAIConfigTracker) - mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) - - config = AICompletionConfig( - key='test-config', - enabled=True, - create_tracker=MagicMock(return_value=mock_tracker), - model=ModelConfig('gpt-4'), - provider=ProviderConfig('openai'), - messages=[], - evaluator=evaluator, - ) + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result()) + config, _tracker = _make_config_with_tracker(evaluator) model = ManagedModel(config, mock_runner) - response = await model.invoke('Hello') - results = await response.evaluations # type: ignore[misc] + result = await model.run('Hello') + results = await result.evaluations # type: ignore[misc] assert results == [] + + +async def _empty_eval() -> List[JudgeResult]: + return [] diff --git a/packages/sdk/server-ai/tests/test_runner_abcs.py b/packages/sdk/server-ai/tests/test_runner_abcs.py index d5136fd0..7e8087cd 100644 --- a/packages/sdk/server-ai/tests/test_runner_abcs.py +++ b/packages/sdk/server-ai/tests/test_runner_abcs.py @@ -1,17 +1,17 @@ import pytest -from ldai.providers import AgentGraphResult, AgentGraphRunner, AgentResult, AgentRunner, ToolRegistry -from ldai.providers.types import LDAIMetrics +from ldai.providers import AgentGraphResult, AgentGraphRunner, AgentRunner, ToolRegistry +from ldai.providers.types import LDAIMetrics, RunnerResult # --- Concrete test doubles --- class ConcreteAgentRunner: async def run(self, input): - return AgentResult( - output=f"agent response to: {input}", - raw={"raw": input}, + return 
RunnerResult( + content=f"agent response to: {input}", metrics=LDAIMetrics(success=True), + raw={"raw": input}, ) @@ -39,20 +39,20 @@ def test_agent_runner_structural_check_fails_when_run_missing(): @pytest.mark.asyncio -async def test_agent_runner_run_returns_agent_result(): +async def test_agent_runner_run_returns_runner_result(): runner = ConcreteAgentRunner() result = await runner.run("hello") - assert isinstance(result, AgentResult) - assert result.output == "agent response to: hello" + assert isinstance(result, RunnerResult) + assert result.content == "agent response to: hello" assert result.raw == {"raw": "hello"} assert result.metrics.success is True @pytest.mark.asyncio -async def test_agent_result_fields(): +async def test_runner_result_fields(): metrics = LDAIMetrics(success=True) - result = AgentResult(output="done", raw={"key": "val"}, metrics=metrics) - assert result.output == "done" + result = RunnerResult(content="done", metrics=metrics, raw={"key": "val"}) + assert result.content == "done" assert result.raw == {"key": "val"} assert result.metrics is metrics @@ -103,6 +103,6 @@ def test_top_level_exports(): import ldai assert hasattr(ldai, 'AgentRunner') assert hasattr(ldai, 'AgentGraphRunner') - assert hasattr(ldai, 'AgentResult') assert hasattr(ldai, 'AgentGraphResult') + assert hasattr(ldai, 'RunnerResult') assert hasattr(ldai, 'ToolRegistry')
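A sketch of consuming the new ``ManagedResult`` returned by ``ManagedModel.run()``. It assumes ``managed_model`` was obtained via ``LDAIClient.create_model()`` (arguments omitted here); the ``evaluations`` task, when present, is awaited explicitly to collect the ``JudgeResult`` list::

    from ldai.managed_model import ManagedModel
    from ldai.providers import ManagedResult


    async def ask(managed_model: ManagedModel, prompt: str) -> str:
        result: ManagedResult = await managed_model.run(prompt)

        # The aggregated LDAIMetricSummary is available immediately.
        print(result.metrics.success, result.metrics.duration_ms)

        # Judge evaluations run as a background asyncio.Task; await it only
        # if the caller needs the JudgeResult instances.
        if result.evaluations is not None:
            for judge_result in await result.evaluations:
                print(judge_result.judge_config_key, judge_result.score)

        return result.content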