diff --git a/AGENTS.md b/AGENTS.md index e04d2bb..f13d2eb 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -22,7 +22,7 @@ A jambonz application controls phone calls by returning **arrays of verbs** — - **Webhook (HTTP)**: Your server receives POST requests and returns JSON verb arrays. Stateless and simple. - **WebSocket**: Persistent bidirectional connection. Required for real-time LLM agents, audio streaming, and TTS token streaming. -**IMPORTANT**: Any application that uses a speech-to-speech verb (`openai_s2s`, `google_s2s`, `deepgram_s2s`, `ultravox_s2s`, `elevenlabs_s2s`, `s2s`, or `pipeline`) MUST use WebSocket transport. +**IMPORTANT**: Any application that uses a speech-to-speech verb (`openai_s2s`, `google_s2s`, `deepgram_s2s`, `ultravox_s2s`, `elevenlabs_s2s`, `s2s`, or `agent`) MUST use WebSocket transport. ## Core Verbs @@ -34,7 +34,7 @@ A jambonz application controls phone calls by returning **arrays of verbs** — ### AI & Real-time - **openai_s2s** / **google_s2s** / **deepgram_s2s** / **ultravox_s2s** / **elevenlabs_s2s** — Vendor-specific LLM voice conversation. - **s2s** — Generic LLM voice conversation (use when vendor is determined at runtime). -- **pipeline** — Higher-level voice AI pipeline with integrated turn detection. +- **agent** — Higher-level voice AI agent with integrated turn detection. - **dialogflow** — Google Dialogflow agent. - **stream** — Stream raw audio to a websocket endpoint. - **transcribe** — Real-time call transcription. @@ -258,7 +258,7 @@ await client.calls.whisper(call_sid, {"verb": "say", "text": "Hello"}) await client.calls.mute(call_sid, "mute") await client.calls.redirect(call_sid, "https://example.com/new") await client.calls.update(call_sid, {"call_status": "completed"}) -await client.calls.update_pipeline(call_sid, {"type": "update_instructions", "instructions": "New prompt"}) +await client.calls.update_agent(call_sid, {"type": "update_instructions", "instructions": "New prompt"}) ``` ## TTS Token Streaming @@ -273,18 +273,18 @@ await session.flush_tts_tokens() await session.clear_tts_tokens() ``` -## Pipeline Updates +## Agent Updates -Update a running pipeline mid-conversation: +Update a running agent mid-conversation: ```python -await session.update_pipeline({"type": "update_instructions", "instructions": "Now help with billing."}) -await session.update_pipeline({"type": "inject_context", "messages": [{"role": "system", "content": "Customer is Gold tier."}]}) -await session.update_pipeline({"type": "update_tools", "tools": [...]}) -await session.update_pipeline({"type": "generate_reply", "user_input": "Override", "interrupt": True}) +await session.update_agent({"type": "update_instructions", "instructions": "Now help with billing."}) +await session.update_agent({"type": "inject_context", "messages": [{"role": "system", "content": "Customer is Gold tier."}]}) +await session.update_agent({"type": "update_tools", "tools": [...]}) +await session.update_agent({"type": "generate_reply", "user_input": "Override", "interrupt": True}) ``` -## Tool Output (Pipeline) +## Tool Output (Agent) When the LLM requests a tool call, return the result: @@ -348,7 +348,7 @@ audio_svc.on("connection", on_audio_connection) | `llm:tool-output` | Tool call result (`tool_output()`) | | `tts:tokens` | Stream TTS text (`send_tts_tokens()`) | | `tts:flush` | End TTS stream (`flush_tts_tokens()`) | -| `pipeline:update` | Pipeline update (`update_pipeline()`) | +| `agent:update` | Agent update (`update_agent()`) | ## Common Patterns @@ -361,9 +361,9 @@ jambonz.say(text="Welcome.").gather( ).say(text="No input. Goodbye.").hangup() ``` -### Voice Agent (Pipeline) +### Voice Agent ```python -session.pipeline( +session.agent( stt={"vendor": "deepgram", "language": "en-US"}, tts={"vendor": "cartesia", "voice": "sonic-english"}, llm={"vendor": "openai", "model": "gpt-4o", "llmOptions": { @@ -371,7 +371,7 @@ session.pipeline( }}, turnDetection="krisp", bargeIn={"enable": True}, - actionHook="/pipeline-done", + actionHook="/agent-done", eventHook="/events", toolHook="/tools", ) @@ -410,9 +410,9 @@ jambonz.say(text="Connecting you now.").dial( ## SDK Architecture -The SDK auto-generates verb methods from `specs.json` (from `@jambonz/verb-specifications`). When the spec changes, the SDK automatically picks up new parameters: +The SDK auto-generates verb methods from JSON Schema files (from `@jambonz/schema`). When the schema changes, the SDK automatically picks up new parameters: -1. `specs.json` — bundled verb/component specifications (synced from upstream) +1. `schema/verbs/*.schema.json` — bundled verb schemas (synced from upstream) 2. `verb_registry.py` — maps spec entries to Python methods + synonyms 3. `verb_builder.py` — generates methods at import time from specs + registry 4. `WebhookResponse` and `Session` both extend `VerbBuilder` diff --git a/CLAUDE.md b/CLAUDE.md index d7c250b..16f9737 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -23,7 +23,7 @@ src/jambonz_sdk/ │ ├── verbs.py # All 26+ verb TypedDicts │ ├── rest.py # REST API request/response types │ └── session.py # Call session & WebSocket message types -├── verb_builder.py # VerbBuilder — methods auto-generated from specs.json +├── verb_builder.py # VerbBuilder — methods auto-generated from JSON Schema ├── verb_registry.py # Verb definitions: maps spec entries → Python methods ├── webhook/ │ ├── __init__.py @@ -48,12 +48,12 @@ src/jambonz_sdk/ - **Transport-agnostic verb building**: Same verb methods on both `WebhookResponse` and `Session` - **Fluent/chainable API**: All verb methods return `self` for method chaining - **TypedDict for verb schemas**: Type-safe verb construction matching JSON schemas exactly -- **Auto-generated verb methods**: VerbBuilder methods are generated at import time from `specs.json` + `verb_registry.py` — when the spec changes, the SDK automatically picks up new parameters +- **Auto-generated verb methods**: VerbBuilder methods are generated at import time from JSON Schema files (`@jambonz/schema`) + `verb_registry.py` — when the schema changes, the SDK automatically picks up new parameters - **aiohttp for both HTTP and WebSocket**: Single dependency for REST client and WS transport ## Verb System -The SDK supports all 26+ jambonz verbs. Verb methods on VerbBuilder are **auto-generated** from the shared `specs.json` (in `/Users/xhoaluu/jambonz/verb-specifications/specs.json`). +The SDK supports all 26+ jambonz verbs. Verb methods on VerbBuilder are **auto-generated** from JSON Schema files bundled from [`@jambonz/schema`](https://github.com/jambonz/schema). ### How verb generation works @@ -65,7 +65,7 @@ The SDK supports all 26+ jambonz verbs. Verb methods on VerbBuilder are **auto-g ### Verb List Audio/Speech: `say`, `play`, `gather` -AI/S2S: `openai_s2s`, `google_s2s`, `deepgram_s2s`, `elevenlabs_s2s`, `ultravox_s2s`, `s2s`, `llm`, `dialogflow`, `pipeline` +AI/S2S: `openai_s2s`, `google_s2s`, `deepgram_s2s`, `elevenlabs_s2s`, `ultravox_s2s`, `s2s`, `llm`, `dialogflow`, `agent` Call Control: `dial`, `conference`, `enqueue`, `dequeue`, `hangup`, `redirect`, `pause` Audio Streaming: `listen`, `stream`, `transcribe` SIP: `sip_decline`, `sip_request`, `sip_refer` @@ -105,7 +105,7 @@ Source: https://github.com/jambonz/schema `AGENTS.md` is the comprehensive developer guide for AI agents working with this SDK. It covers: verb system, webhook/WebSocket patterns, REST API, env vars, mid-call control, -TTS streaming, pipeline updates, audio streaming, and common application patterns. +TTS streaming, agent updates, audio streaming, and common application patterns. AI coding agents should read AGENTS.md before generating jambonz Python application code. ### MCP Server @@ -171,11 +171,11 @@ pytest # All 279 tests ### Unit tests (`tests/unit/`) - `test_verb_builder.py` — Parametrized across all 31 verb defs: method existence, correct verb name, all spec properties pass through - `test_webhook.py` — Webhook contract, HMAC-SHA256 signature protocol, env vars OPTIONS format -- `test_session.py` — WebSocket protocol messages: ack, command, tts:tokens, llm:tool-output, pipeline:update +- `test_session.py` — WebSocket protocol messages: ack, command, tts:tokens, llm:tool-output, agent:update - `test_ws_client.py` — Message routing: session:new, verb:hook dispatch, auto-reply, binary/JSON robustness - `test_rest_client.py` — REST API contract: URL construction, HTTP methods, request bodies - `test_audio_stream.py` — Audio protocol: raw PCM, playAudio JSON, marks, control commands ### Integration tests (`tests/integration/`) - `test_webhook.py` — Real aiohttp server with IVR menu, actionHook routing, env vars discovery -- `test_websocket.py` — Real WebSocket connections: full protocol compliance, multi-step conversations, inject commands, TTS streaming, pipeline updates +- `test_websocket.py` — Real WebSocket connections: full protocol compliance, multi-step conversations, inject commands, TTS streaming, agent updates diff --git a/README.md b/README.md index 9a55995..5752142 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ async with JambonzClient( ### Spec-driven verb generation -The SDK does **not** hardcode verb method signatures. Instead, verb methods (`.say()`, `.gather()`, `.dial()`, `.pipeline()`, etc.) are **auto-generated at import time** from [JSON Schema](https://github.com/jambonz/schema) files — the same schemas used by the Node.js SDK and the jambonz server. +The SDK does **not** hardcode verb method signatures. Instead, verb methods (`.say()`, `.gather()`, `.dial()`, `.agent()`, etc.) are **auto-generated at import time** from [JSON Schema](https://github.com/jambonz/schema) files — the same schemas used by the Node.js SDK and the jambonz server. **What this means:** @@ -98,7 +98,7 @@ VerbDef("new_verb", "new_verb", doc="Description.") ## Features -- **All 31 jambonz verbs**: say, play, gather, dial, conference, enqueue/dequeue, hangup, pause, redirect, config, tag, dtmf, dub, message, alert, answer, leave, listen/stream, transcribe, openai_s2s, google_s2s, deepgram_s2s, elevenlabs_s2s, ultravox_s2s, s2s, llm, dialogflow, pipeline, sip_decline, sip_request, sip_refer +- **All 31 jambonz verbs**: say, play, gather, dial, conference, enqueue/dequeue, hangup, pause, redirect, config, tag, dtmf, dub, message, alert, answer, leave, listen/stream, transcribe, openai_s2s, google_s2s, deepgram_s2s, elevenlabs_s2s, ultravox_s2s, s2s, llm, dialogflow, agent, sip_decline, sip_request, sip_refer - **Fluent chainable API**: `.say(...).gather(...).hangup()` - **Webhook transport**: `WebhookResponse` for HTTP apps (works with aiohttp, FastAPI, Flask, etc.) - **WebSocket transport**: `create_endpoint` with `Session`, event handling, `send()`/`reply()` @@ -106,7 +106,7 @@ VerbDef("new_verb", "new_verb", doc="Description.") - **Audio streaming**: Bidirectional audio via `AudioStream` - **Mid-call control**: inject commands (mute, whisper, record, DTMF, tag) - **TTS token streaming**: `send_tts_tokens()` / `flush_tts_tokens()` -- **Pipeline updates**: `update_pipeline()` for mid-conversation LLM changes +- **Agent updates**: `update_agent()` for mid-conversation LLM changes - **Signature verification**: HMAC-SHA256 webhook signature validation - **Env vars**: Portal discovery via OPTIONS + runtime reading @@ -119,7 +119,7 @@ See the [`examples/`](examples/) directory: | hello-world | [webhook](examples/hello-world/webhook_app.py) | [websocket](examples/hello-world/websocket_app.py) | Minimal greeting | | echo | [webhook](examples/echo/webhook_app.py) | [websocket](examples/echo/websocket_app.py) | Speech echo with gather | | ivr-menu | [webhook](examples/ivr-menu/webhook_app.py) | — | IVR menu with speech + DTMF | -| voice-agent | [webhook](examples/voice-agent/webhook_app.py) | [websocket](examples/voice-agent/websocket_app.py) | LLM pipeline with tool calls | +| voice-agent | [webhook](examples/voice-agent/webhook_app.py) | [websocket](examples/voice-agent/websocket_app.py) | LLM agent with tool calls | | dial | [webhook](examples/dial/webhook_app.py) | — | Outbound dial with fallback | | listen-record | [webhook](examples/listen-record/webhook_app.py) | [websocket](examples/listen-record/websocket_app.py) | Audio recording | diff --git a/examples/voice-agent/websocket_app.py b/examples/voice-agent/websocket_app.py index 459c4b7..d8403d6 100644 --- a/examples/voice-agent/websocket_app.py +++ b/examples/voice-agent/websocket_app.py @@ -1,7 +1,7 @@ """Voice Agent - WebSocket example. -LLM-powered voice agent using the pipeline verb with tool calling. -Demonstrates pipeline configuration, eventHook handling, and toolHook handling. +LLM-powered voice agent using the agent verb with tool calling. +Demonstrates agent configuration, eventHook handling, and toolHook handling. Usage: python websocket_app.py @@ -40,7 +40,7 @@ async def handle_session(session): print(f"New call: {session.call_sid} from {session.from_}") - # Handle pipeline events + # Handle agent events async def on_event(evt): event_type = evt.get("type", "") if event_type == "turn_end": @@ -51,7 +51,7 @@ async def on_event(evt): ) await session.reply() - session.on("/pipeline-event", on_event) + session.on("/agent-event", on_event) # Handle tool calls async def on_tool(evt): @@ -71,16 +71,16 @@ async def on_tool(evt): session.on("/tool-call", on_tool) - # Handle pipeline completion + # Handle agent completion async def on_complete(evt): - print(f"Pipeline complete: {evt.get('completion_reason', 'unknown')}") + print(f"Agent complete: {evt.get('completion_reason', 'unknown')}") session.hangup() await session.reply() - session.on("/pipeline-complete", on_complete) + session.on("/agent-complete", on_complete) - # Start the pipeline - session.pipeline( + # Start the agent + session.agent( stt={ "vendor": "deepgram", "language": "en-US", @@ -133,9 +133,9 @@ async def on_complete(evt): turnDetection="krisp", earlyGeneration=True, bargeIn={"enable": True, "minSpeechDuration": 0.3}, - eventHook="/pipeline-event", + eventHook="/agent-event", toolHook="/tool-call", - actionHook="/pipeline-complete", + actionHook="/agent-complete", ) await session.send() diff --git a/pyproject.toml b/pyproject.toml index 6782488..28c9ea7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "jambonz-python-sdk" -version = "0.2.0" +version = "0.3.0" description = "Python SDK for jambonz CPaaS platform" readme = "README.md" requires-python = ">=3.10" diff --git a/scripts/generate_stubs.py b/scripts/generate_stubs.py index e44e983..32cb859 100644 --- a/scripts/generate_stubs.py +++ b/scripts/generate_stubs.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 -"""Generate verb_builder.pyi stub file from specs.json + verb_registry. +"""Generate verb_builder.pyi stub file from JSON Schema + verb_registry. This creates a .pyi type stub that IDEs (VS Code Pylance, PyCharm, mypy) read for static type checking and autocomplete. Run this after syncing -specs.json or updating verb_registry.py. +the schema or updating verb_registry.py. Usage: python scripts/generate_stubs.py @@ -19,10 +19,10 @@ from jambonz_sdk.verb_registry import VERB_DEFS -SPECS_PATH = SRC_DIR / "jambonz_sdk" / "specs.json" +SCHEMA_DIR = SRC_DIR / "jambonz_sdk" / "schema" / "verbs" STUB_PATH = SRC_DIR / "jambonz_sdk" / "verb_builder.pyi" -# Maps specs.json type strings to Python type annotation strings for .pyi +# Maps JSON Schema type strings to Python type annotation strings for .pyi TYPE_MAP = { "string": "str", "number": "int | float", @@ -33,7 +33,7 @@ def resolve_type(spec_type) -> str: - """Convert a specs.json type descriptor to a .pyi type string.""" + """Convert a JSON Schema type descriptor to a .pyi type string.""" if isinstance(spec_type, str): if spec_type.startswith("#"): return "dict[str, Any]" @@ -61,9 +61,37 @@ def resolve_type(spec_type) -> str: return "Any" +def _load_schemas() -> dict: + """Load verb JSON Schemas from the bundled schema directory.""" + schemas: dict = {} + for schema_file in sorted(SCHEMA_DIR.glob("*.schema.json")): + with schema_file.open() as f: + schema = json.load(f) + schema_id = schema.get("$id", "") + if schema_id: + spec_name = schema_id.rsplit("/", 1)[-1] + else: + spec_name = schema_file.stem.replace(".schema", "") + properties = {} + for prop_name, prop_def in schema.get("properties", {}).items(): + if prop_name == "verb": + continue + properties[prop_name] = prop_def + for entry in schema.get("allOf", []): + if "properties" in entry: + for prop_name, prop_def in entry["properties"].items(): + if prop_name == "verb": + continue + properties[prop_name] = prop_def + schemas[spec_name] = { + "properties": properties, + "required": schema.get("required", []), + } + return schemas + + def generate() -> str: - with SPECS_PATH.open() as f: - specs = json.load(f) + specs = _load_schemas() lines = [ '"""Auto-generated type stubs for VerbBuilder.', @@ -75,7 +103,6 @@ def generate() -> str: "", "from jambonz_sdk.types.verbs import AnyVerb", "", - "", "class VerbBuilder:", " _verbs: list[AnyVerb]", "", diff --git a/scripts/sync_schema.py b/scripts/sync_schema.py index 40967a4..9bc4c7a 100644 --- a/scripts/sync_schema.py +++ b/scripts/sync_schema.py @@ -22,7 +22,7 @@ from pathlib import Path # ── Pin the schema version here ────────────────────────────────────── -SCHEMA_VERSION = "v0.1.1" +SCHEMA_VERSION = "v0.2.1" # ──────────────────────────────────────────────────────────────────── DEST = Path(__file__).resolve().parent.parent / "src" / "jambonz_sdk" / "schema" diff --git a/src/jambonz_sdk/client/api.py b/src/jambonz_sdk/client/api.py index cabd203..dd53809 100644 --- a/src/jambonz_sdk/client/api.py +++ b/src/jambonz_sdk/client/api.py @@ -100,16 +100,16 @@ async def mute(self, call_sid: str, status: str) -> dict[str, Any]: """ return await self.update(call_sid, {"mute_status": status}) - async def update_pipeline( + async def update_agent( self, call_sid: str, data: dict[str, Any] ) -> dict[str, Any]: - """Send a mid-conversation pipeline update. + """Send a mid-conversation agent update. Args: call_sid: The call to update. - data: Pipeline update payload. + data: Agent update payload. """ - return await self.update(call_sid, {"pipeline_update": data}) + return await self.update(call_sid, {"agent_update": data}) async def noise_isolation( self, call_sid: str, status: str, opts: dict[str, Any] | None = None diff --git a/src/jambonz_sdk/schema/callbacks/pipeline-turn.schema.json b/src/jambonz_sdk/schema/callbacks/agent-turn.schema.json similarity index 93% rename from src/jambonz_sdk/schema/callbacks/pipeline-turn.schema.json rename to src/jambonz_sdk/schema/callbacks/agent-turn.schema.json index 10829fe..4bf6faa 100644 --- a/src/jambonz_sdk/schema/callbacks/pipeline-turn.schema.json +++ b/src/jambonz_sdk/schema/callbacks/agent-turn.schema.json @@ -1,8 +1,8 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://jambonz.org/schema/callbacks/pipeline-turn", - "title": "Pipeline EventHook Events", - "description": "Events sent to the pipeline verb's eventHook during a conversation. These are sent as 'pipeline:event' messages over the WebSocket connection.", + "$id": "https://jambonz.org/schema/callbacks/agent-turn", + "title": "Agent EventHook Events", + "description": "Events sent to the agent verb's eventHook during a conversation. These are sent as 'agent:event' messages over the WebSocket connection.", "type": "object", "oneOf": [ { @@ -84,7 +84,7 @@ { "properties": { "type": { - "const": "agent_response", + "const": "llm_response", "description": "Sent when the LLM has finished generating its response for the current turn. Contains the complete response text." }, "response": { diff --git a/src/jambonz_sdk/schema/components/recognizer.schema.json b/src/jambonz_sdk/schema/components/recognizer.schema.json index f7f3084..702b41f 100644 --- a/src/jambonz_sdk/schema/components/recognizer.schema.json +++ b/src/jambonz_sdk/schema/components/recognizer.schema.json @@ -41,9 +41,24 @@ }, "hints": { "type": "array", - "items": { "type": "string" }, - "description": "An array of words or phrases that the recognizer should favor. Use this to improve accuracy for domain-specific terminology, product names, or proper nouns.", - "examples": [["jambonz", "drachtio", "SIP", "WebRTC"]] + "items": { + "oneOf": [ + { "type": "string" }, + { + "type": "object", + "properties": { + "phrase": { "type": "string" }, + "boost": { "type": "number" } + }, + "required": ["phrase"] + } + ] + }, + "description": "An array of words or phrases that the recognizer should favor. Each item can be a plain string or an object with 'phrase' and optional 'boost' properties.", + "examples": [ + ["jambonz", "drachtio", "SIP", "WebRTC"], + [{"phrase": "jambonz", "boost": 20}, {"phrase": "drachtio", "boost": 10}] + ] }, "hintsBoost": { "type": "number", diff --git a/src/jambonz_sdk/schema/jambonz-app.schema.json b/src/jambonz_sdk/schema/jambonz-app.schema.json index 4e0ac4d..5182022 100644 --- a/src/jambonz_sdk/schema/jambonz-app.schema.json +++ b/src/jambonz_sdk/schema/jambonz-app.schema.json @@ -28,7 +28,7 @@ { "$ref": "verbs/deepgram_s2s" }, { "$ref": "verbs/ultravox_s2s" }, { "$ref": "verbs/dialogflow" }, - { "$ref": "verbs/pipeline" }, + { "$ref": "verbs/agent" }, { "$ref": "verbs/conference" }, { "$ref": "verbs/transcribe" }, { "$ref": "verbs/enqueue" }, diff --git a/src/jambonz_sdk/schema/verbs/pipeline.schema.json b/src/jambonz_sdk/schema/verbs/agent.schema.json similarity index 84% rename from src/jambonz_sdk/schema/verbs/pipeline.schema.json rename to src/jambonz_sdk/schema/verbs/agent.schema.json index 9156cd7..078bb36 100644 --- a/src/jambonz_sdk/schema/verbs/pipeline.schema.json +++ b/src/jambonz_sdk/schema/verbs/agent.schema.json @@ -1,13 +1,13 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://jambonz.org/schema/verbs/pipeline", + "$id": "https://jambonz.org/schema/verbs/agent", "minVersion": "10.1.0", - "title": "Pipeline", - "description": "Configures a complete STT → LLM → TTS voice AI pipeline with integrated turn detection. Provides a higher-level abstraction than manually orchestrating the individual components. Optimized for building voice AI agents with proper turn-taking behavior.", + "title": "Agent", + "description": "Configures a complete voice AI agent by wiring together STT → LLM → TTS with integrated turn detection. Provides a higher-level abstraction than manually orchestrating the individual components. Optimized for building voice AI agents with proper turn-taking behavior.", "type": "object", "properties": { "verb": { - "const": "pipeline" + "const": "agent" }, "id": { "type": "string", @@ -15,11 +15,11 @@ }, "stt": { "$ref": "../components/recognizer", - "description": "Speech-to-text configuration for the pipeline." + "description": "Speech-to-text configuration for the agent." }, "tts": { "$ref": "../components/synthesizer", - "description": "Text-to-speech configuration for the pipeline." + "description": "Text-to-speech configuration for the agent." }, "turnDetection": { "oneOf": [ @@ -53,7 +53,7 @@ } ], "default": "stt", - "description": "Turn detection strategy. Controls when the pipeline decides the user has finished speaking. STT vendors with native turn-taking (deepgramflux, assemblyai, speechmatics) always use their built-in detection regardless of this setting." + "description": "Turn detection strategy. Controls when the agent decides the user has finished speaking. STT vendors with native turn-taking (deepgramflux, assemblyai, speechmatics) always use their built-in detection regardless of this setting." }, "bargeIn": { "type": "object", @@ -86,16 +86,16 @@ }, "llm": { "type": "object", - "description": "LLM configuration for the pipeline. See the 'llm' verb schema for details.", + "description": "LLM configuration for the agent. See the 'llm' verb schema for details.", "additionalProperties": true }, "actionHook": { "$ref": "../components/actionHook", - "description": "A webhook invoked when the pipeline ends." + "description": "A webhook invoked when the agent ends." }, "eventHook": { "$ref": "../components/actionHook", - "description": "A webhook invoked for pipeline events. Receives event types: 'user_transcript' (user speech recognized), 'agent_response' (assistant reply), 'user_interruption' (barge-in detected), and 'turn_end' (end-of-turn summary with transcript, response, and latency metrics)." + "description": "A webhook invoked for agent events. Receives event types: 'user_transcript' (user speech recognized), 'llm_response' (assistant reply), 'user_interruption' (barge-in detected), and 'turn_end' (end-of-turn summary with transcript, response, and latency metrics)." }, "toolHook": { "$ref": "../components/actionHook", @@ -171,7 +171,7 @@ }, "required": ["url"] }, - "description": "External MCP servers that provide tools to the LLM. The pipeline connects at startup via SSE, discovers available tools, and makes them callable by the LLM." + "description": "External MCP servers that provide tools to the LLM. The agent connects at startup via SSE, discovers available tools, and makes them callable by the LLM." } }, "required": [ @@ -179,7 +179,7 @@ ], "examples": [ { - "verb": "pipeline", + "verb": "agent", "stt": { "vendor": "deepgram", "language": "en-US" @@ -201,10 +201,10 @@ } }, "turnDetection": "stt", - "actionHook": "/pipeline-complete" + "actionHook": "/agent-complete" }, { - "verb": "pipeline", + "verb": "agent", "stt": { "vendor": "deepgram", "language": "en-US" @@ -234,7 +234,7 @@ "minSpeechDuration": 0.3, "sticky": false }, - "actionHook": "/pipeline-complete" + "actionHook": "/agent-complete" } ] } diff --git a/src/jambonz_sdk/types/__init__.py b/src/jambonz_sdk/types/__init__.py index 424ec8a..2def7f4 100644 --- a/src/jambonz_sdk/types/__init__.py +++ b/src/jambonz_sdk/types/__init__.py @@ -34,6 +34,7 @@ WsMessageType, ) from jambonz_sdk.types.verbs import ( + AgentVerb, AlertVerb, AnswerVerb, AnyVerb, @@ -56,7 +57,6 @@ MessageVerb, OpenaiS2sVerb, PauseVerb, - PipelineVerb, PlayVerb, RedirectVerb, S2sVerb, @@ -92,6 +92,7 @@ "TurnTaking", "Vad", # Verbs + "AgentVerb", "AlertVerb", "AnswerVerb", "AnyVerb", @@ -114,7 +115,6 @@ "MessageVerb", "OpenaiS2sVerb", "PauseVerb", - "PipelineVerb", "PlayVerb", "RedirectVerb", "S2sVerb", diff --git a/src/jambonz_sdk/types/verbs.py b/src/jambonz_sdk/types/verbs.py index c03c5d1..dcbcc38 100644 --- a/src/jambonz_sdk/types/verbs.py +++ b/src/jambonz_sdk/types/verbs.py @@ -514,10 +514,10 @@ class DialogflowVerb(TypedDict, total=False): tts: Synthesizer -class PipelineVerb(TypedDict, total=False): - """Integrated STT -> LLM -> TTS voice AI pipeline.""" +class AgentVerb(TypedDict, total=False): + """Integrated STT -> LLM -> TTS voice AI agent.""" - verb: str # "pipeline" + verb: str # "agent" id: str stt: Recognizer tts: Synthesizer @@ -568,5 +568,5 @@ class PipelineVerb(TypedDict, total=False): ElevenlabsS2sVerb, UltravoxS2sVerb, DialogflowVerb, - PipelineVerb, + AgentVerb, ] diff --git a/src/jambonz_sdk/verb_builder.pyi b/src/jambonz_sdk/verb_builder.pyi index 1e102cd..5d0e5cd 100644 --- a/src/jambonz_sdk/verb_builder.pyi +++ b/src/jambonz_sdk/verb_builder.pyi @@ -16,11 +16,11 @@ class VerbBuilder: def say( self, id: str = ..., - text: str | list[Any] = ..., + text: Any = ..., instructions: str = ..., stream: bool = ..., - loop: int | float | str = ..., - synthesizer: dict[str, Any] = ..., + loop: Any = ..., + synthesizer: Any = ..., earlyMedia: bool = ..., disableTtsCache: bool = ..., closeStreamOnEmpty: bool = ..., @@ -30,11 +30,11 @@ class VerbBuilder: Args: id: str - text: str | list[Any] + text: Any instructions: str stream: bool - loop: int | float | str - synthesizer: dict[str, Any] + loop: Any + synthesizer: Any earlyMedia: bool disableTtsCache: bool closeStreamOnEmpty: bool @@ -47,12 +47,12 @@ class VerbBuilder: def play( self, id: str = ..., - url: str | list[Any] = ..., - loop: int | float | str = ..., + url: Any = ..., + loop: Any = ..., earlyMedia: bool = ..., - seekOffset: int | float | str = ..., - timeoutSecs: int | float | str = ..., - actionHook: dict[str, Any] | str = ..., + seekOffset: Any = ..., + timeoutSecs: Any = ..., + actionHook: Any = ..., **kwargs: Any, ) -> Self: """Play an audio file from a URL. @@ -61,12 +61,12 @@ class VerbBuilder: Args: id: str - url: str | list[Any] (required) - loop: int | float | str + url: Any (required) + loop: Any earlyMedia: bool - seekOffset: int | float | str - timeoutSecs: int | float | str - actionHook: dict[str, Any] | str + seekOffset: Any + timeoutSecs: Any + actionHook: Any Returns: self for chaining. @@ -76,50 +76,50 @@ class VerbBuilder: def gather( self, id: str = ..., - actionHook: dict[str, Any] | str = ..., - finishOnKey: str = ..., + actionHook: Any = ..., input: list[Any] = ..., + finishOnKey: str = ..., numDigits: int | float = ..., minDigits: int | float = ..., maxDigits: int | float = ..., interDigitTimeout: int | float = ..., - partialResultHook: dict[str, Any] | str = ..., speechTimeout: int | float = ..., + timeout: int | float = ..., + partialResultHook: Any = ..., listenDuringPrompt: bool = ..., dtmfBargein: bool = ..., bargein: bool = ..., minBargeinWordCount: int | float = ..., - timeout: int | float = ..., - recognizer: dict[str, Any] = ..., - play: dict[str, Any] = ..., + recognizer: Any = ..., say: dict[str, Any] = ..., - fillerNoise: dict[str, Any] = ..., - actionHookDelayAction: dict[str, Any] = ..., + play: dict[str, Any] = ..., + fillerNoise: Any = ..., + actionHookDelayAction: Any = ..., **kwargs: Any, ) -> Self: """Collect speech (STT) and/or DTMF input. Args: id: str - actionHook: dict[str, Any] | str - finishOnKey: str + actionHook: Any input: list[Any] + finishOnKey: str numDigits: int | float minDigits: int | float maxDigits: int | float interDigitTimeout: int | float - partialResultHook: dict[str, Any] | str speechTimeout: int | float + timeout: int | float + partialResultHook: Any listenDuringPrompt: bool dtmfBargein: bool bargein: bool minBargeinWordCount: int | float - timeout: int | float - recognizer: dict[str, Any] - play: dict[str, Any] + recognizer: Any say: dict[str, Any] - fillerNoise: dict[str, Any] - actionHookDelayAction: dict[str, Any] + play: dict[str, Any] + fillerNoise: Any + actionHookDelayAction: Any Returns: self for chaining. @@ -128,17 +128,6 @@ class VerbBuilder: def openai_s2s( self, - id: str = ..., - vendor: str = ..., - model: str = ..., - auth: dict[str, Any] = ..., - connectOptions: dict[str, Any] = ..., - mcpServers: list[Any] = ..., - actionHook: dict[str, Any] | str = ..., - eventHook: dict[str, Any] | str = ..., - toolHook: dict[str, Any] | str = ..., - events: list[Any] = ..., - llmOptions: dict[str, Any] = ..., **kwargs: Any, ) -> Self: """Connect caller to OpenAI for real-time voice conversation. @@ -146,17 +135,6 @@ class VerbBuilder: Required: llmOptions, vendor Args: - id: str - vendor: str (required) - model: str - auth: dict[str, Any] - connectOptions: dict[str, Any] - mcpServers: list[Any] - actionHook: dict[str, Any] | str - eventHook: dict[str, Any] | str - toolHook: dict[str, Any] | str - events: list[Any] - llmOptions: dict[str, Any] (required) Returns: self for chaining. @@ -165,17 +143,6 @@ class VerbBuilder: def google_s2s( self, - id: str = ..., - vendor: str = ..., - model: str = ..., - auth: dict[str, Any] = ..., - connectOptions: dict[str, Any] = ..., - mcpServers: list[Any] = ..., - actionHook: dict[str, Any] | str = ..., - eventHook: dict[str, Any] | str = ..., - toolHook: dict[str, Any] | str = ..., - events: list[Any] = ..., - llmOptions: dict[str, Any] = ..., **kwargs: Any, ) -> Self: """Connect caller to Google for real-time voice conversation. @@ -183,17 +150,6 @@ class VerbBuilder: Required: llmOptions, vendor Args: - id: str - vendor: str (required) - model: str - auth: dict[str, Any] - connectOptions: dict[str, Any] - mcpServers: list[Any] - actionHook: dict[str, Any] | str - eventHook: dict[str, Any] | str - toolHook: dict[str, Any] | str - events: list[Any] - llmOptions: dict[str, Any] (required) Returns: self for chaining. @@ -202,17 +158,6 @@ class VerbBuilder: def deepgram_s2s( self, - id: str = ..., - vendor: str = ..., - model: str = ..., - auth: dict[str, Any] = ..., - connectOptions: dict[str, Any] = ..., - mcpServers: list[Any] = ..., - actionHook: dict[str, Any] | str = ..., - eventHook: dict[str, Any] | str = ..., - toolHook: dict[str, Any] | str = ..., - events: list[Any] = ..., - llmOptions: dict[str, Any] = ..., **kwargs: Any, ) -> Self: """Connect caller to Deepgram for real-time voice conversation. @@ -220,17 +165,6 @@ class VerbBuilder: Required: llmOptions, vendor Args: - id: str - vendor: str (required) - model: str - auth: dict[str, Any] - connectOptions: dict[str, Any] - mcpServers: list[Any] - actionHook: dict[str, Any] | str - eventHook: dict[str, Any] | str - toolHook: dict[str, Any] | str - events: list[Any] - llmOptions: dict[str, Any] (required) Returns: self for chaining. @@ -239,17 +173,6 @@ class VerbBuilder: def elevenlabs_s2s( self, - id: str = ..., - vendor: str = ..., - model: str = ..., - auth: dict[str, Any] = ..., - connectOptions: dict[str, Any] = ..., - mcpServers: list[Any] = ..., - actionHook: dict[str, Any] | str = ..., - eventHook: dict[str, Any] | str = ..., - toolHook: dict[str, Any] | str = ..., - events: list[Any] = ..., - llmOptions: dict[str, Any] = ..., **kwargs: Any, ) -> Self: """Connect caller to ElevenLabs Conversational AI agent. @@ -257,17 +180,6 @@ class VerbBuilder: Required: llmOptions, vendor Args: - id: str - vendor: str (required) - model: str - auth: dict[str, Any] - connectOptions: dict[str, Any] - mcpServers: list[Any] - actionHook: dict[str, Any] | str - eventHook: dict[str, Any] | str - toolHook: dict[str, Any] | str - events: list[Any] - llmOptions: dict[str, Any] (required) Returns: self for chaining. @@ -276,17 +188,6 @@ class VerbBuilder: def ultravox_s2s( self, - id: str = ..., - vendor: str = ..., - model: str = ..., - auth: dict[str, Any] = ..., - connectOptions: dict[str, Any] = ..., - mcpServers: list[Any] = ..., - actionHook: dict[str, Any] | str = ..., - eventHook: dict[str, Any] | str = ..., - toolHook: dict[str, Any] | str = ..., - events: list[Any] = ..., - llmOptions: dict[str, Any] = ..., **kwargs: Any, ) -> Self: """Connect caller to Ultravox for real-time voice conversation. @@ -294,17 +195,6 @@ class VerbBuilder: Required: llmOptions, vendor Args: - id: str - vendor: str (required) - model: str - auth: dict[str, Any] - connectOptions: dict[str, Any] - mcpServers: list[Any] - actionHook: dict[str, Any] | str - eventHook: dict[str, Any] | str - toolHook: dict[str, Any] | str - events: list[Any] - llmOptions: dict[str, Any] (required) Returns: self for chaining. @@ -313,17 +203,6 @@ class VerbBuilder: def s2s( self, - id: str = ..., - vendor: str = ..., - model: str = ..., - auth: dict[str, Any] = ..., - connectOptions: dict[str, Any] = ..., - mcpServers: list[Any] = ..., - actionHook: dict[str, Any] | str = ..., - eventHook: dict[str, Any] | str = ..., - toolHook: dict[str, Any] | str = ..., - events: list[Any] = ..., - llmOptions: dict[str, Any] = ..., **kwargs: Any, ) -> Self: """Generic S2S verb (use when vendor is determined at runtime). @@ -331,17 +210,6 @@ class VerbBuilder: Required: llmOptions, vendor Args: - id: str - vendor: str (required) - model: str - auth: dict[str, Any] - connectOptions: dict[str, Any] - mcpServers: list[Any] - actionHook: dict[str, Any] | str - eventHook: dict[str, Any] | str - toolHook: dict[str, Any] | str - events: list[Any] - llmOptions: dict[str, Any] (required) Returns: self for chaining. @@ -350,17 +218,6 @@ class VerbBuilder: def llm( self, - id: str = ..., - vendor: str = ..., - model: str = ..., - auth: dict[str, Any] = ..., - connectOptions: dict[str, Any] = ..., - mcpServers: list[Any] = ..., - actionHook: dict[str, Any] | str = ..., - eventHook: dict[str, Any] | str = ..., - toolHook: dict[str, Any] | str = ..., - events: list[Any] = ..., - llmOptions: dict[str, Any] = ..., **kwargs: Any, ) -> Self: """Legacy LLM verb (prefer s2s or vendor-specific shortcuts). @@ -368,17 +225,6 @@ class VerbBuilder: Required: llmOptions, vendor Args: - id: str - vendor: str (required) - model: str - auth: dict[str, Any] - connectOptions: dict[str, Any] - mcpServers: list[Any] - actionHook: dict[str, Any] | str - eventHook: dict[str, Any] | str - toolHook: dict[str, Any] | str - events: list[Any] - llmOptions: dict[str, Any] (required) Returns: self for chaining. @@ -388,15 +234,15 @@ class VerbBuilder: def dialogflow( self, id: str = ..., - credentials: dict[str, Any] | str = ..., + credentials: Any = ..., project: str = ..., agent: str = ..., environment: str = ..., region: str = ..., model: str = ..., lang: str = ..., - actionHook: dict[str, Any] | str = ..., - eventHook: dict[str, Any] | str = ..., + actionHook: Any = ..., + eventHook: Any = ..., events: list[Any] = ..., welcomeEvent: str = ..., welcomeEventParams: dict[str, Any] = ..., @@ -404,7 +250,7 @@ class VerbBuilder: noInputEvent: str = ..., passDtmfAsTextInput: bool = ..., thinkingMusic: str = ..., - tts: dict[str, Any] = ..., + tts: Any = ..., bargein: bool = ..., queryInput: dict[str, Any] = ..., **kwargs: Any, @@ -415,15 +261,15 @@ class VerbBuilder: Args: id: str - credentials: dict[str, Any] | str (required) + credentials: Any (required) project: str (required) agent: str environment: str region: str model: str lang: str (required) - actionHook: dict[str, Any] | str - eventHook: dict[str, Any] | str + actionHook: Any + eventHook: Any events: list[Any] welcomeEvent: str welcomeEventParams: dict[str, Any] @@ -431,7 +277,7 @@ class VerbBuilder: noInputEvent: str passDtmfAsTextInput: bool thinkingMusic: str - tts: dict[str, Any] + tts: Any bargein: bool queryInput: dict[str, Any] @@ -440,43 +286,43 @@ class VerbBuilder: """ ... - def pipeline( + def agent( self, id: str = ..., - stt: dict[str, Any] = ..., - tts: dict[str, Any] = ..., - llm: dict[str, Any] = ..., - turnDetection: str | dict[str, Any] = ..., + stt: Any = ..., + tts: Any = ..., + turnDetection: Any = ..., bargeIn: dict[str, Any] = ..., - actionHook: dict[str, Any] | str = ..., - eventHook: dict[str, Any] | str = ..., - toolHook: dict[str, Any] | str = ..., + noResponseTimeout: int | float = ..., + llm: dict[str, Any] = ..., + actionHook: Any = ..., + eventHook: Any = ..., + toolHook: Any = ..., greeting: bool = ..., earlyGeneration: bool = ..., - noiseIsolation: str | dict[str, Any] = ..., + noiseIsolation: Any = ..., mcpServers: list[Any] = ..., - noResponseTimeout: int | float = ..., **kwargs: Any, ) -> Self: - """Integrated STT → LLM → TTS voice AI pipeline. + """Integrated STT → LLM → TTS voice AI agent. Required: llm Args: id: str - stt: dict[str, Any] - tts: dict[str, Any] - llm: dict[str, Any] (required) - turnDetection: str | dict[str, Any] + stt: Any + tts: Any + turnDetection: Any bargeIn: dict[str, Any] - actionHook: dict[str, Any] | str - eventHook: dict[str, Any] | str - toolHook: dict[str, Any] | str + noResponseTimeout: int | float + llm: dict[str, Any] (required) + actionHook: Any + eventHook: Any + toolHook: Any greeting: bool earlyGeneration: bool - noiseIsolation: str | dict[str, Any] + noiseIsolation: Any mcpServers: list[Any] - noResponseTimeout: int | float Returns: self for chaining. @@ -486,21 +332,20 @@ class VerbBuilder: def listen( self, id: str = ..., - actionHook: dict[str, Any] | str = ..., - auth: dict[str, Any] = ..., + url: str = ..., + actionHook: Any = ..., + wsAuth: Any = ..., + mixType: str = ..., + metadata: dict[str, Any] = ..., + sampleRate: int | float = ..., finishOnKey: str = ..., maxLength: int | float = ..., - metadata: dict[str, Any] = ..., - mixType: str = ..., passDtmf: bool = ..., playBeep: bool = ..., disableBidirectionalAudio: bool = ..., - bidirectionalAudio: dict[str, Any] = ..., - sampleRate: int | float = ..., + bidirectionalAudio: Any = ..., timeout: int | float = ..., - transcribe: dict[str, Any] = ..., - url: str = ..., - wsAuth: dict[str, Any] = ..., + transcribe: Any = ..., earlyMedia: bool = ..., channel: int | float = ..., **kwargs: Any, @@ -511,21 +356,20 @@ class VerbBuilder: Args: id: str - actionHook: dict[str, Any] | str - auth: dict[str, Any] + url: str (required) + actionHook: Any + wsAuth: Any + mixType: str + metadata: dict[str, Any] + sampleRate: int | float finishOnKey: str maxLength: int | float - metadata: dict[str, Any] - mixType: str passDtmf: bool playBeep: bool disableBidirectionalAudio: bool - bidirectionalAudio: dict[str, Any] - sampleRate: int | float + bidirectionalAudio: Any timeout: int | float - transcribe: dict[str, Any] - url: str (required) - wsAuth: dict[str, Any] + transcribe: Any earlyMedia: bool channel: int | float @@ -537,21 +381,20 @@ class VerbBuilder: def stream( self, id: str = ..., - actionHook: dict[str, Any] | str = ..., - auth: dict[str, Any] = ..., + url: str = ..., + actionHook: Any = ..., + wsAuth: Any = ..., + mixType: str = ..., + metadata: dict[str, Any] = ..., + sampleRate: int | float = ..., finishOnKey: str = ..., maxLength: int | float = ..., - metadata: dict[str, Any] = ..., - mixType: str = ..., passDtmf: bool = ..., playBeep: bool = ..., disableBidirectionalAudio: bool = ..., - bidirectionalAudio: dict[str, Any] = ..., - sampleRate: int | float = ..., + bidirectionalAudio: Any = ..., timeout: int | float = ..., - transcribe: dict[str, Any] = ..., - url: str = ..., - wsAuth: dict[str, Any] = ..., + transcribe: Any = ..., earlyMedia: bool = ..., channel: int | float = ..., **kwargs: Any, @@ -562,21 +405,20 @@ class VerbBuilder: Args: id: str - actionHook: dict[str, Any] | str - auth: dict[str, Any] + url: str (required) + actionHook: Any + wsAuth: Any + mixType: str + metadata: dict[str, Any] + sampleRate: int | float finishOnKey: str maxLength: int | float - metadata: dict[str, Any] - mixType: str passDtmf: bool playBeep: bool disableBidirectionalAudio: bool - bidirectionalAudio: dict[str, Any] - sampleRate: int | float + bidirectionalAudio: Any timeout: int | float - transcribe: dict[str, Any] - url: str (required) - wsAuth: dict[str, Any] + transcribe: Any earlyMedia: bool channel: int | float @@ -588,9 +430,10 @@ class VerbBuilder: def transcribe( self, id: str = ..., + enable: bool = ..., transcriptionHook: str = ..., translationHook: str = ..., - recognizer: dict[str, Any] = ..., + recognizer: Any = ..., earlyMedia: bool = ..., channel: int | float = ..., **kwargs: Any, @@ -599,9 +442,10 @@ class VerbBuilder: Args: id: str + enable: bool transcriptionHook: str translationHook: str - recognizer: dict[str, Any] + recognizer: Any earlyMedia: bool channel: int | float @@ -613,28 +457,28 @@ class VerbBuilder: def dial( self, id: str = ..., - actionHook: dict[str, Any] | str = ..., - onHoldHook: dict[str, Any] | str = ..., + target: list[Any] = ..., + actionHook: Any = ..., + onHoldHook: Any = ..., answerOnBridge: bool = ..., callerId: str = ..., callerName: str = ..., - confirmHook: dict[str, Any] | str = ..., - referHook: dict[str, Any] | str = ..., + confirmHook: Any = ..., + referHook: Any = ..., dialMusic: str = ..., - dtmfCapture: dict[str, Any] = ..., - dtmfHook: dict[str, Any] | str = ..., + dtmfCapture: Any = ..., + dtmfHook: Any = ..., headers: dict[str, Any] = ..., anchorMedia: bool = ..., exitMediaPath: bool = ..., - boostAudioSignal: int | float | str = ..., + boostAudioSignal: Any = ..., listen: dict[str, Any] = ..., stream: dict[str, Any] = ..., - target: list[Any] = ..., + transcribe: dict[str, Any] = ..., timeLimit: int | float = ..., timeout: int | float = ..., proxy: str = ..., - transcribe: dict[str, Any] = ..., - amd: dict[str, Any] = ..., + amd: Any = ..., dub: list[Any] = ..., tag: dict[str, Any] = ..., forwardPAI: bool = ..., @@ -646,28 +490,28 @@ class VerbBuilder: Args: id: str - actionHook: dict[str, Any] | str - onHoldHook: dict[str, Any] | str + target: list[Any] (required) + actionHook: Any + onHoldHook: Any answerOnBridge: bool callerId: str callerName: str - confirmHook: dict[str, Any] | str - referHook: dict[str, Any] | str + confirmHook: Any + referHook: Any dialMusic: str - dtmfCapture: dict[str, Any] - dtmfHook: dict[str, Any] | str + dtmfCapture: Any + dtmfHook: Any headers: dict[str, Any] anchorMedia: bool exitMediaPath: bool - boostAudioSignal: int | float | str + boostAudioSignal: Any listen: dict[str, Any] stream: dict[str, Any] - target: list[Any] (required) + transcribe: dict[str, Any] timeLimit: int | float timeout: int | float proxy: str - transcribe: dict[str, Any] - amd: dict[str, Any] + amd: Any dub: list[Any] tag: dict[str, Any] forwardPAI: bool @@ -689,11 +533,11 @@ class VerbBuilder: endConferenceDuration: int | float = ..., maxParticipants: int | float = ..., joinMuted: bool = ..., - actionHook: dict[str, Any] | str = ..., - waitHook: dict[str, Any] | str = ..., + actionHook: Any = ..., + waitHook: Any = ..., statusEvents: list[Any] = ..., - statusHook: dict[str, Any] | str = ..., - enterHook: dict[str, Any] | str = ..., + statusHook: Any = ..., + enterHook: Any = ..., record: dict[str, Any] = ..., listen: dict[str, Any] = ..., distributeDtmf: bool = ..., @@ -714,11 +558,11 @@ class VerbBuilder: endConferenceDuration: int | float maxParticipants: int | float joinMuted: bool - actionHook: dict[str, Any] | str - waitHook: dict[str, Any] | str + actionHook: Any + waitHook: Any statusEvents: list[Any] - statusHook: dict[str, Any] | str - enterHook: dict[str, Any] | str + statusHook: Any + enterHook: Any record: dict[str, Any] listen: dict[str, Any] distributeDtmf: bool @@ -732,10 +576,9 @@ class VerbBuilder: self, id: str = ..., name: str = ..., - actionHook: dict[str, Any] | str = ..., - waitHook: dict[str, Any] | str = ..., + actionHook: Any = ..., + waitHook: Any = ..., priority: int | float = ..., - _: dict[str, Any] = ..., **kwargs: Any, ) -> Self: """Place caller into a named call queue. @@ -745,10 +588,9 @@ class VerbBuilder: Args: id: str name: str (required) - actionHook: dict[str, Any] | str - waitHook: dict[str, Any] | str + actionHook: Any + waitHook: Any priority: int | float - _: dict[str, Any] Returns: self for chaining. @@ -759,7 +601,7 @@ class VerbBuilder: self, id: str = ..., name: str = ..., - actionHook: dict[str, Any] | str = ..., + actionHook: Any = ..., timeout: int | float = ..., beep: bool = ..., callSid: str = ..., @@ -772,7 +614,7 @@ class VerbBuilder: Args: id: str name: str (required) - actionHook: dict[str, Any] | str + actionHook: Any timeout: int | float beep: bool callSid: str @@ -802,8 +644,8 @@ class VerbBuilder: def redirect( self, id: str = ..., - actionHook: dict[str, Any] | str = ..., - statusHook: dict[str, Any] | str = ..., + actionHook: Any = ..., + statusHook: Any = ..., **kwargs: Any, ) -> Self: """Transfer control to a different webhook URL. @@ -812,8 +654,8 @@ class VerbBuilder: Args: id: str - actionHook: dict[str, Any] | str (required) - statusHook: dict[str, Any] | str + actionHook: Any (required) + statusHook: Any Returns: self for chaining. @@ -868,7 +710,7 @@ class VerbBuilder: method: str = ..., body: str = ..., headers: dict[str, Any] = ..., - actionHook: dict[str, Any] | str = ..., + actionHook: Any = ..., **kwargs: Any, ) -> Self: """Send a SIP request within the current dialog. @@ -880,7 +722,7 @@ class VerbBuilder: method: str (required) body: str headers: dict[str, Any] - actionHook: dict[str, Any] | str + actionHook: Any Returns: self for chaining. @@ -894,8 +736,8 @@ class VerbBuilder: referredBy: str = ..., referredByDisplayName: str = ..., headers: dict[str, Any] = ..., - actionHook: dict[str, Any] | str = ..., - eventHook: dict[str, Any] | str = ..., + actionHook: Any = ..., + eventHook: Any = ..., **kwargs: Any, ) -> Self: """Send a SIP REFER for call transfer. @@ -908,8 +750,8 @@ class VerbBuilder: referredBy: str referredByDisplayName: str headers: dict[str, Any] - actionHook: dict[str, Any] | str - eventHook: dict[str, Any] | str + actionHook: Any + eventHook: Any Returns: self for chaining. @@ -919,25 +761,25 @@ class VerbBuilder: def config( self, id: str = ..., - synthesizer: dict[str, Any] = ..., - recognizer: dict[str, Any] = ..., + synthesizer: Any = ..., + recognizer: Any = ..., bargeIn: dict[str, Any] = ..., ttsStream: dict[str, Any] = ..., record: dict[str, Any] = ..., listen: dict[str, Any] = ..., stream: dict[str, Any] = ..., transcribe: dict[str, Any] = ..., - amd: dict[str, Any] = ..., - fillerNoise: dict[str, Any] = ..., + amd: Any = ..., + fillerNoise: Any = ..., + vad: Any = ..., notifyEvents: bool = ..., notifySttLatency: bool = ..., - reset: str | list[Any] = ..., + reset: Any = ..., onHoldMusic: str = ..., - actionHookDelayAction: dict[str, Any] = ..., - sipRequestWithinDialogHook: dict[str, Any] | str = ..., - boostAudioSignal: int | float | str = ..., - vad: dict[str, Any] = ..., - referHook: dict[str, Any] | str = ..., + actionHookDelayAction: Any = ..., + sipRequestWithinDialogHook: Any = ..., + boostAudioSignal: Any = ..., + referHook: Any = ..., earlyMedia: bool = ..., autoStreamTts: bool = ..., disableTtsCache: bool = ..., @@ -950,25 +792,25 @@ class VerbBuilder: Args: id: str - synthesizer: dict[str, Any] - recognizer: dict[str, Any] + synthesizer: Any + recognizer: Any bargeIn: dict[str, Any] ttsStream: dict[str, Any] record: dict[str, Any] listen: dict[str, Any] stream: dict[str, Any] transcribe: dict[str, Any] - amd: dict[str, Any] - fillerNoise: dict[str, Any] + amd: Any + fillerNoise: Any + vad: Any notifyEvents: bool notifySttLatency: bool - reset: str | list[Any] + reset: Any onHoldMusic: str - actionHookDelayAction: dict[str, Any] - sipRequestWithinDialogHook: dict[str, Any] | str - boostAudioSignal: int | float | str - vad: dict[str, Any] - referHook: dict[str, Any] | str + actionHookDelayAction: Any + sipRequestWithinDialogHook: Any + boostAudioSignal: Any + referHook: Any earlyMedia: bool autoStreamTts: bool disableTtsCache: bool @@ -1027,9 +869,9 @@ class VerbBuilder: action: str = ..., track: str = ..., play: str = ..., - say: str | dict[str, Any] = ..., + say: Any = ..., loop: bool = ..., - gain: int | float | str = ..., + gain: Any = ..., **kwargs: Any, ) -> Self: """Manage audio dubbing tracks. @@ -1041,9 +883,9 @@ class VerbBuilder: action: str (required) track: str (required) play: str - say: str | dict[str, Any] + say: Any loop: bool - gain: int | float | str + gain: Any Returns: self for chaining. @@ -1053,14 +895,14 @@ class VerbBuilder: def message( self, id: str = ..., - carrier: str = ..., - account_sid: str = ..., - message_sid: str = ..., to: str = ..., from_: str = ..., text: str = ..., - media: str | list[Any] = ..., - actionHook: dict[str, Any] | str = ..., + media: Any = ..., + carrier: str = ..., + account_sid: str = ..., + message_sid: str = ..., + actionHook: Any = ..., **kwargs: Any, ) -> Self: """Send SMS/MMS message. @@ -1069,14 +911,14 @@ class VerbBuilder: Args: id: str - carrier: str - account_sid: str - message_sid: str to: str (required) from_: str (required) text: str - media: str | list[Any] - actionHook: dict[str, Any] | str + media: Any + carrier: str + account_sid: str + message_sid: str + actionHook: Any Returns: self for chaining. diff --git a/src/jambonz_sdk/verb_registry.py b/src/jambonz_sdk/verb_registry.py index 3dd4e8e..f97f8f1 100644 --- a/src/jambonz_sdk/verb_registry.py +++ b/src/jambonz_sdk/verb_registry.py @@ -1,6 +1,6 @@ """Verb registry — the single source of truth for mapping spec entries to SDK methods. -This module defines which entries in ``specs.json`` are top-level verbs +This module defines which entries in the JSON Schema files are top-level verbs (as opposed to nested component types), their Python method names, docstrings, and any synonym/alias transforms. @@ -19,7 +19,7 @@ class VerbDef: """Definition of a single verb method on VerbBuilder. Attributes: - spec_name: The key in specs.json (e.g., ``"say"``, ``"sip:decline"``). + spec_name: The schema identifier (e.g., ``"say"``, ``"sip:decline"``). method_name: The Python method name (e.g., ``"say"``, ``"sip_decline"``). json_verb: The ``verb`` value in the output JSON. Defaults to ``spec_name``. doc: One-line docstring for the generated method. @@ -68,7 +68,7 @@ def __post_init__(self) -> None: doc="Generic S2S verb (use when vendor is determined at runtime)."), VerbDef("llm", "llm", doc="Legacy LLM verb (prefer s2s or vendor-specific shortcuts)."), VerbDef("dialogflow", "dialogflow", doc="Connect caller to Google Dialogflow agent."), - VerbDef("pipeline", "pipeline", doc="Integrated STT → LLM → TTS voice AI pipeline."), + VerbDef("agent", "agent", doc="Integrated STT → LLM → TTS voice AI agent."), # Audio Streaming VerbDef("listen", "listen", doc="Stream real-time audio to a websocket endpoint."), diff --git a/src/jambonz_sdk/websocket/session.py b/src/jambonz_sdk/websocket/session.py index 1ffa2da..58d7fbe 100644 --- a/src/jambonz_sdk/websocket/session.py +++ b/src/jambonz_sdk/websocket/session.py @@ -227,7 +227,7 @@ async def clear_tts_tokens(self) -> None: # ── LLM Tool Output ──────────────────────────────────────────── async def tool_output(self, tool_call_id: str, result: Any) -> Session: - """Return a tool call result to the pipeline LLM. + """Return a tool call result to the agent LLM. Args: tool_call_id: The tool_call_id from the llm:tool-call event. @@ -246,14 +246,14 @@ async def tool_output(self, tool_call_id: str, result: Any) -> Session: await self._ws.send(json.dumps(msg)) return self - # ── Pipeline Updates ──────────────────────────────────────────── + # ── Agent Updates ──────────────────────────────────────────── - async def update_pipeline(self, data: dict[str, Any]) -> None: - """Send a mid-conversation pipeline update. + async def update_agent(self, data: dict[str, Any]) -> None: + """Send a mid-conversation agent update. Args: data: Update payload with ``type`` key (e.g., 'update_instructions', 'inject_context', 'update_tools', 'generate_reply'). """ - msg = {"type": "pipeline:update", "data": data} + msg = {"type": "agent:update", "data": data} await self._ws.send(json.dumps(msg)) diff --git a/tests/integration/test_websocket.py b/tests/integration/test_websocket.py index 879f27b..50fe584 100644 --- a/tests/integration/test_websocket.py +++ b/tests/integration/test_websocket.py @@ -10,7 +10,7 @@ 5. App responds with ack containing next verbs 6. App can inject commands (mute, whisper, record) at any time 7. App can stream TTS tokens -8. App can send pipeline updates +8. App can send agent updates 9. jambonz sends OPTIONS for env vars discovery (HTTP, not WS) """ @@ -313,9 +313,9 @@ async def handler(session): await runner.cleanup() -# ── Pipeline updates ──────────────────────────────────────────────── +# ── Agent updates ──────────────────────────────────────────────── -class TestPipelineUpdateProtocol: +class TestAgentUpdateProtocol: @pytest.mark.asyncio async def test_update_instructions(self): port = 19109 @@ -323,14 +323,14 @@ async def test_update_instructions(self): svc = make_service(path="/") async def handler(session): - session.pipeline( + session.agent( stt={"vendor": "deepgram"}, tts={"vendor": "cartesia", "voice": "sonic"}, llm={"vendor": "openai", "model": "gpt-4o", "llmOptions": {}}, actionHook="/done", ) await session.send() - await session.update_pipeline({ + await session.update_agent({ "type": "update_instructions", "instructions": "Now help with billing.", }) @@ -340,9 +340,9 @@ async def handler(session): http, ws = await _ws_connect(port) await ws.send_str(_session_new()) ack = await _recv(ws) - assert ack["data"][0]["verb"] == "pipeline" + assert ack["data"][0]["verb"] == "agent" update = await _recv(ws) - assert update["type"] == "pipeline:update" + assert update["type"] == "agent:update" assert update["data"]["instructions"] == "Now help with billing." await ws.close() await http.close() diff --git a/tests/unit/test_rest_client.py b/tests/unit/test_rest_client.py index 24cba1c..68deb0d 100644 --- a/tests/unit/test_rest_client.py +++ b/tests/unit/test_rest_client.py @@ -161,15 +161,15 @@ async def test_unmute(self): assert mock.request.call_args[1]["json"]["mute_status"] == "unmute" -class TestCallsPipelineUpdate: - """PUT /Calls/{callSid} with pipeline_update.""" +class TestCallsAgentUpdate: + """PUT /Calls/{callSid} with agent_update.""" @pytest.mark.asyncio - async def test_sends_pipeline_update(self): + async def test_sends_agent_update(self): client, mock = _client_with_mock(_MockResponse(200, {})) - await client.calls.update_pipeline("c1", {"type": "update_instructions", "instructions": "New prompt"}) + await client.calls.update_agent("c1", {"type": "update_instructions", "instructions": "New prompt"}) body = mock.request.call_args[1]["json"] - assert body["pipeline_update"]["type"] == "update_instructions" + assert body["agent_update"]["type"] == "update_instructions" # ── Conferences resource ──────────────────────────────────────────── diff --git a/tests/unit/test_session.py b/tests/unit/test_session.py index 058cc2b..1b036cf 100644 --- a/tests/unit/test_session.py +++ b/tests/unit/test_session.py @@ -6,7 +6,7 @@ - Inject commands produce "command" messages per the jambonz WS spec - TTS streaming produces tts:tokens/tts:flush/tts:clear messages - Tool output produces llm:tool-output messages -- Pipeline updates produce pipeline:update messages +- Agent updates produce agent:update messages - Session properties are correctly extracted from session:new data """ @@ -306,25 +306,25 @@ async def test_tool_output(self): assert msg["data"]["output"]["temperature"] == 72 -# ── Pipeline updates ──────────────────────────────────────────────── +# ── Agent updates ──────────────────────────────────────────────── -class TestPipelineUpdate: - """Pipeline updates per jambonz protocol: - {"type": "pipeline:update", "data": {"type": ..., ...}}""" +class TestAgentUpdate: + """Agent updates per jambonz protocol: + {"type": "agent:update", "data": {"type": ..., ...}}""" @pytest.mark.asyncio async def test_update_instructions(self): s, ws = _make_session() - await s.update_pipeline({"type": "update_instructions", "instructions": "Be a billing agent."}) + await s.update_agent({"type": "update_instructions", "instructions": "Be a billing agent."}) msg = json.loads(ws.send.call_args[0][0]) - assert msg["type"] == "pipeline:update" + assert msg["type"] == "agent:update" assert msg["data"]["type"] == "update_instructions" assert msg["data"]["instructions"] == "Be a billing agent." @pytest.mark.asyncio async def test_inject_context(self): s, ws = _make_session() - await s.update_pipeline({ + await s.update_agent({ "type": "inject_context", "messages": [{"role": "system", "content": "Customer is Gold tier."}], }) @@ -334,7 +334,7 @@ async def test_inject_context(self): @pytest.mark.asyncio async def test_generate_reply_with_interrupt(self): s, ws = _make_session() - await s.update_pipeline({ + await s.update_agent({ "type": "generate_reply", "user_input": "Urgent override", "interrupt": True, diff --git a/tests/unit/test_verb_builder.py b/tests/unit/test_verb_builder.py index d45851c..4f17da1 100644 --- a/tests/unit/test_verb_builder.py +++ b/tests/unit/test_verb_builder.py @@ -2,13 +2,13 @@ These tests validate that: 1. Every verb in the registry has a corresponding method on VerbBuilder -2. Every method produces JSON output matching the specs.json contract +2. Every method produces JSON output matching the JSON Schema contract 3. Verb synonyms and injected properties work correctly 4. The builder's chaining and reset behavior is correct -5. Property names in output match specs.json exactly (camelCase preserved) +5. Property names in output match JSON Schema exactly (camelCase preserved) 6. The 'from' → 'from_' Python mapping works for the message verb -Tests are driven by specs.json — if a new property is added to a verb spec, +Tests are driven by JSON Schema — if a new property is added to a verb schema, these tests verify the SDK can pass it through correctly. """ @@ -59,10 +59,10 @@ def test_method_produces_correct_verb_name(self, verb_def): assert verbs[0]["verb"] == verb_def.json_verb -# ── Spec-driven: output properties must match specs.json ─────────── +# ── Spec-driven: output properties must match JSON Schema ───────── class TestVerbOutputMatchesSpec: - """For each verb, passing a property defined in specs.json must + """For each verb, passing a property defined in the JSON Schema must appear in the output JSON with the exact same key name.""" @pytest.mark.parametrize( @@ -275,9 +275,9 @@ def test_dial_with_answer_on_bridge(self): assert verbs[0]["target"][0]["type"] == "phone" assert verbs[0]["answerOnBridge"] is True - def test_voice_agent_pipeline(self): + def test_voice_agent(self): builder = VerbBuilder() - builder.pipeline( + builder.agent( stt={"vendor": "deepgram", "language": "en-US"}, tts={"vendor": "cartesia", "voice": "sonic"}, llm={"vendor": "openai", "model": "gpt-4o", "llmOptions": { @@ -290,7 +290,7 @@ def test_voice_agent_pipeline(self): toolHook="/tools", ) v = builder.to_list()[0] - assert v["verb"] == "pipeline" + assert v["verb"] == "agent" assert v["stt"]["vendor"] == "deepgram" assert v["llm"]["vendor"] == "openai" assert v["turnDetection"] == "krisp" @@ -312,7 +312,7 @@ def test_listen_with_bidirectional_audio(self): # ── Helpers ───────────────────────────────────────────────────────── def _dummy_value(spec_type): - """Generate a dummy value matching a specs.json type descriptor.""" + """Generate a dummy value matching a JSON Schema type descriptor.""" if isinstance(spec_type, str): if spec_type.startswith("#"): return {}