Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scripts/sync_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from pathlib import Path

# ── Pin the schema version here ──────────────────────────────────────
# Single source of truth for which jambonz schema release to sync.
# (Diff residue removed: the superseded "v0.2.1" pin was left in by a
# bad merge; only the current pin may exist or the later assignment
# silently wins.)
SCHEMA_VERSION = "v0.3.5"
# ────────────────────────────────────────────────────────────────────

# Destination directory for the synced schema files, resolved relative
# to this script: <repo>/src/jambonz_sdk/schema
DEST = Path(__file__).resolve().parent.parent / "src" / "jambonz_sdk" / "schema"
Expand Down
7 changes: 6 additions & 1 deletion src/jambonz_sdk/schema/callbacks/call-status.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,17 @@
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://jambonz.org/schema/callbacks/call-status",
"title": "Call Status Webhook Payload",
"description": "Payload sent to the call status webhook URL whenever the call state changes (e.g. trying, in-progress, completed). The status webhook is configured at the application level in jambonz. Multiple status events are sent over the life of a call. The final event (completed or failed) includes additional fields like duration and termination cause.",
"description": "Payload sent to the call status webhook URL whenever the call state changes (e.g. trying, in-progress, completed). The status webhook is configured at the application level in jambonz. Multiple status events are sent over the life of a call. The final event (completed or failed) includes additional fields like duration and termination cause.\n\n**Capturing B-leg call_sid:** When using the dial verb to bridge calls, status events are sent for both legs. The A-leg (original inbound call) has `direction: 'inbound'`. The B-leg (outbound dialed call) has `direction: 'outbound'`. To capture the B-leg's call_sid for later use (e.g., injecting commands to the B-leg), listen for status events where `direction === 'outbound'` and extract the `call_sid` field.",
"allOf": [
{ "$ref": "base" }
],
"type": "object",
"properties": {
"direction": {
"type": "string",
"enum": ["inbound", "outbound"],
"description": "Call direction. 'inbound' = A-leg (original incoming call to the application). 'outbound' = B-leg (call placed by the dial verb). Use this field to identify which leg generated the status event, especially when capturing the B-leg's call_sid for mid-call control."
},
"call_termination_by": {
"type": "string",
"enum": ["caller", "jambonz"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@
},
"minEndOfTurnSilenceWhenConfident": {
"type": "number",
"description": "Minimum silence duration (seconds) to trigger end-of-turn when confidence is met."
"description": "Minimum silence duration (milliseconds) to trigger end-of-turn when confidence is met. Default: 400."
},
"maxTurnSilence": {
"type": "number",
"description": "Maximum silence duration (seconds) before forcing end-of-turn."
"description": "Maximum silence duration (milliseconds) before forcing end-of-turn. Default: 1280."
},
"minTurnSilence": {
"type": "number",
"description": "Minimum silence duration (seconds) before allowing end-of-turn."
"description": "Minimum silence duration (milliseconds) before allowing end-of-turn."
},
"keyterms": {
"type": "array",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,11 @@
"eagerEotThreshold": {
"type": "number",
"description": "Eager end-of-turn threshold for faster response."
},
"languageHints": {
"type": "array",
"items": { "type": "string" },
"description": "Language hints for Deepgram Flux Multilingual. BCP-47 codes (e.g. 'en', 'es', 'fr'). Biases transcription toward specified languages."
}
},
"additionalProperties": false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,9 @@
"description": "Custom vocabulary terms."
},
"languageModel": { "type": "string", "description": "Language model to use." },
"audioQueryAbsoluteTimeout": { "type": "number", "description": "Absolute timeout for audio queries." }
"audioQueryAbsoluteTimeout": { "type": "number", "description": "Absolute timeout for audio queries." },
"eoqThreshold": { "type": "number", "minimum": 0, "maximum": 1, "description": "End-of-query likelihood threshold (0.0-1.0) to trigger end of speech when segmentation is disabled. Default 0.8, set to 0 to disable." },
"vadStopThreshold": { "type": "number", "minimum": 0, "maximum": 1, "description": "VAD probability threshold to trigger end of speech when segmentation is disabled. When VAD drops below this value after speech is detected, streaming stops. Default 0.05, set to 0 to disable." }
},
"additionalProperties": false
}
125 changes: 123 additions & 2 deletions src/jambonz_sdk/schema/verbs/agent.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,28 @@
"$ref": "../components/synthesizer",
"description": "Text-to-speech configuration for the agent."
},
"autoLockLanguage": {
"oneOf": [
{ "type": "boolean" },
{ "type": "string", "enum": ["always"] }
],
"description": "When using Deepgram Flux Multilingual, automatically adjust STT language hints and switch TTS voice based on detected language. Values: false (disabled), true (lock on first utterance), 'always' (continuously adapt on every turn). Default: false.",
"default": false
},
"languageConfig": {
"type": "object",
"description": "Per-language overrides for TTS. Keys are BCP-47 language codes. When autoLockLanguage detects a language switch, the agent uses the corresponding config.",
"additionalProperties": {
"type": "object",
"properties": {
"tts": {
"$ref": "../components/synthesizer",
"description": "TTS config override for this language. Merged with default tts."
}
},
"additionalProperties": false
}
},
"turnDetection": {
"oneOf": [
{
Expand Down Expand Up @@ -86,8 +108,92 @@
},
"llm": {
"type": "object",
"description": "LLM configuration for the agent. See the 'llm' verb schema for details.",
"additionalProperties": true
"description": "LLM configuration for the agent.",
"required": ["vendor", "model"],
"properties": {
"vendor": {
"type": "string",
"enum": [
"openai",
"anthropic",
"google",
"vertex-gemini",
"vertex-openai",
"bedrock",
"deepseek",
"azure-openai",
"groq",
"huggingface"
],
"description": "LLM vendor id. Must match a `@jambonz/llm` registered adapter."
},
"model": {
"type": "string",
"description": "Vendor-specific model id (e.g. 'gpt-4o', 'claude-sonnet-4-5-20250929')."
},
"label": {
"type": "string",
"description": "Optional label to disambiguate when the account has multiple credentials for the same vendor."
},
"auth": {
"type": "object",
"description": "Optional inline credentials. When omitted, feature-server looks up credentials by (vendor, label) from the database.",
"properties": {
"apiKey": { "type": "string" }
},
"additionalProperties": true
},
"connectOptions": {
"type": "object",
"description": "SDK-level client options.",
"properties": {
"timeout": { "type": "number", "minimum": 0 },
"maxRetries": { "type": "integer", "minimum": 0 },
"endpoint": { "type": "string" },
"baseURL": { "type": "string" }
},
"additionalProperties": false
},
"llmOptions": {
"type": "object",
"description": "Per-call LLM configuration.",
"properties": {
"systemPrompt": {
"type": "string",
"description": "System prompt for the model. Placed vendor-appropriately (top-level for Anthropic/Bedrock, config.systemInstruction for Gemini, role:'system' for OpenAI-compatibles)."
},
"messages": {
"type": "array",
"description": "Seed conversation history. A role:'system' entry is extracted into systemPrompt internally.",
"items": { "$ref": "#/$defs/llmMessage" }
},
"initialMessages": {
"type": "array",
"description": "Alias of 'messages' (historical).",
"items": { "$ref": "#/$defs/llmMessage" }
},
"maxTokens": {
"type": "integer",
"minimum": 1,
"description": "Maximum tokens the model may generate per turn."
},
"temperature": {
"type": "number",
"minimum": 0,
"description": "Sampling temperature."
},
"tools": {
"type": "array",
"description": "Tool / function definitions available to the model. The MCP-flat shape `{name, description, parameters}` is canonical; the OpenAI-wrapped form `{type:'function', function:{...}}` is also accepted.",
"items": {
"type": "object"
}
}
},
"additionalProperties": false
}
},
"additionalProperties": false
},
"actionHook": {
"$ref": "../components/actionHook",
Expand Down Expand Up @@ -177,6 +283,21 @@
"required": [
"llm"
],
"$defs": {
"llmMessage": {
"type": "object",
"description": "A conversation-history message. The library normalizes content to a string; adapters may carry vendor-native shapes internally.",
"required": ["role", "content"],
"properties": {
"role": {
"type": "string",
"enum": ["system", "user", "assistant", "tool"]
},
"content": {}
},
"additionalProperties": true
}
},
"examples": [
{
"verb": "agent",
Expand Down
2 changes: 1 addition & 1 deletion src/jambonz_sdk/schema/verbs/dub.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"$id": "https://jambonz.org/schema/verbs/dub",
"minVersion": "0.9.6",
"title": "Dub",
"description": "Manages audio dubbing tracks on a call. Allows adding, removing, and controlling auxiliary audio tracks that are mixed into the call audio. Used for background music, coaching whispers, or injecting audio from external sources.",
"description": "Manages audio dubbing tracks on a call. Allows adding, removing, and controlling auxiliary audio tracks that are mixed into the call audio. Used for background music, coaching whispers, or injecting audio from external sources.\n\n**Track Routing:** Tracks are heard by the party on whose call leg they are created. A dub verb in the main verb stack (A-leg) creates tracks heard by the caller. A dub verb nested in the dial verb's `dub` array creates tracks heard by the callee. When using injectCommand to play/say on a track from a different call leg, pass the target call's `call_sid` as the third argument to `session.injectCommand()` to route the command to the correct leg.",
"type": "object",
"properties": {
"verb": {
Expand Down
3 changes: 2 additions & 1 deletion src/jambonz_sdk/schema/verbs/transcribe.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@
},
"channel": {
"type": "number",
"description": "Specific audio channel to transcribe."
"enum": [1, 2],
"description": "Specific audio channel to transcribe. Channel 1 = near-end (local party's audio, i.e. caller on A-leg or callee on B-leg). Channel 2 = far-end (remote party's audio). When transcribe is nested in the dial verb, omitting channel captures both legs mixed; specifying channel: 2 isolates the B-leg's inbound audio."
}
},
"examples": [
Expand Down
44 changes: 38 additions & 6 deletions src/jambonz_sdk/websocket/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,19 +229,30 @@ async def clear_tts_tokens(self) -> None:
async def tool_output(self, tool_call_id: str, result: Any) -> Session:
    """Return a tool call result to the agent LLM.

    Canonical wire shape (validated by ``@jambonz/schema``)::

        {"type": "command", "command": "llm:tool-output",
         "tool_call_id": "...", "data": {"result": ...}}

    The ``result`` argument becomes ``data.result`` when it is not a dict,
    matching the Node SDK's convenience wrapping. Passing a dict sends it
    as-is so callers can include richer structured output (feature-server
    JSON-stringifies the full ``data`` object on the way to the LLM).

    Args:
        tool_call_id: The tool_call_id from the llm:tool-call event.
        result: The tool result. A non-dict value is wrapped as
            ``{"result": result}``; a dict is sent as-is.

    Returns:
        self for chaining with .reply().
    """
    # Wrap non-dict results so data is always a JSON object on the wire.
    payload = result if isinstance(result, dict) else {"result": result}
    # Merged (post-PR) command envelope only — the pre-PR
    # {"type": "llm:tool-output", "data": {...}} shape left behind by the
    # diff residue produced duplicate dict keys and is removed here.
    msg = {
        "type": "command",
        "command": "llm:tool-output",
        "tool_call_id": tool_call_id,
        "data": payload,
    }
    await self._ws.send(json.dumps(msg))
    return self
Expand All @@ -257,3 +268,24 @@ async def update_agent(self, data: dict[str, Any]) -> None:
"""
msg = {"type": "agent:update", "data": data}
await self._ws.send(json.dumps(msg))

async def inject_stt_reconfigure(
    self,
    language_hints: list[str] | None = None,
    opts: dict[str, Any] | None = None
) -> None:
    """Reconfigure STT (speech-to-text) settings mid-call.

    Currently supports updating language hints for Deepgram Flux
    Multilingual.

    Args:
        language_hints: List of BCP-47 language codes (e.g., ['en', 'es']).
            Pass empty list [] to clear hints and enable auto-detection.
        opts: Additional STT reconfiguration options.
    """
    # Seed the payload from the hints (None means "leave hints alone"),
    # then layer caller-supplied options on top so they take precedence.
    payload: dict[str, Any] = (
        {} if language_hints is None else {"languageHints": language_hints}
    )
    if opts:
        payload = {**payload, **opts}
    await self.inject_command("stt:reconfigure", payload)
25 changes: 19 additions & 6 deletions tests/unit/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,18 +292,31 @@ async def test_clear_tts_tokens(self):
# ── Tool output ─────────────────────────────────────────────────────

class TestToolOutput:
    """Tool output per jambonz protocol (canonical command envelope):
    {"type": "command", "command": "llm:tool-output", "tool_call_id": ...,
    "data": {"result": ...}}"""

    @pytest.mark.asyncio
    async def test_tool_output_with_dict(self):
        # A dict result must be sent through unchanged as the data object.
        s, ws = _make_session()
        result = await s.tool_output("call_abc", {"temperature": 72})
        assert result is s  # returns self for chaining
        msg = json.loads(ws.send.call_args[0][0])
        assert msg["type"] == "command"
        assert msg["command"] == "llm:tool-output"
        assert msg["tool_call_id"] == "call_abc"
        # dict passed through as-is for richer payloads
        assert msg["data"]["temperature"] == 72

    @pytest.mark.asyncio
    async def test_tool_output_with_scalar_is_wrapped_as_result(self):
        # A scalar result must be wrapped as {"result": value} on the wire.
        s, ws = _make_session()
        await s.tool_output("call_xyz", "hello world")
        msg = json.loads(ws.send.call_args[0][0])
        assert msg["type"] == "command"
        assert msg["command"] == "llm:tool-output"
        assert msg["tool_call_id"] == "call_xyz"
        assert msg["data"] == {"result": "hello world"}


# ── Agent updates ────────────────────────────────────────────────
Expand Down
Loading