Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scripts/sync_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from pathlib import Path

# ── Pin the schema version here ──────────────────────────────────────
# Single source of truth for which jambonz schema release to sync.
# (Diff residue removed: the superseded "v0.2.1" pin was left in by a
# bad merge; only the current pin may exist or the later assignment
# silently wins.)
SCHEMA_VERSION = "v0.3.5"
# ────────────────────────────────────────────────────────────────────

# Destination directory for the synced schema files, resolved relative
# to this script: <repo>/src/jambonz_sdk/schema
DEST = Path(__file__).resolve().parent.parent / "src" / "jambonz_sdk" / "schema"
Expand Down
7 changes: 6 additions & 1 deletion src/jambonz_sdk/schema/callbacks/call-status.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,17 @@
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://jambonz.org/schema/callbacks/call-status",
"title": "Call Status Webhook Payload",
"description": "Payload sent to the call status webhook URL whenever the call state changes (e.g. trying, in-progress, completed). The status webhook is configured at the application level in jambonz. Multiple status events are sent over the life of a call. The final event (completed or failed) includes additional fields like duration and termination cause.",
"description": "Payload sent to the call status webhook URL whenever the call state changes (e.g. trying, in-progress, completed). The status webhook is configured at the application level in jambonz. Multiple status events are sent over the life of a call. The final event (completed or failed) includes additional fields like duration and termination cause.\n\n**Capturing B-leg call_sid:** When using the dial verb to bridge calls, status events are sent for both legs. The A-leg (original inbound call) has `direction: 'inbound'`. The B-leg (outbound dialed call) has `direction: 'outbound'`. To capture the B-leg's call_sid for later use (e.g., injecting commands to the B-leg), listen for status events where `direction === 'outbound'` and extract the `call_sid` field.",
"allOf": [
{ "$ref": "base" }
],
"type": "object",
"properties": {
"direction": {
"type": "string",
"enum": ["inbound", "outbound"],
"description": "Call direction. 'inbound' = A-leg (original incoming call to the application). 'outbound' = B-leg (call placed by the dial verb). Use this field to identify which leg generated the status event, especially when capturing the B-leg's call_sid for mid-call control."
},
"call_termination_by": {
"type": "string",
"enum": ["caller", "jambonz"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@
},
"minEndOfTurnSilenceWhenConfident": {
"type": "number",
"description": "Minimum silence duration (seconds) to trigger end-of-turn when confidence is met."
"description": "Minimum silence duration (milliseconds) to trigger end-of-turn when confidence is met. Default: 400."
},
"maxTurnSilence": {
"type": "number",
"description": "Maximum silence duration (seconds) before forcing end-of-turn."
"description": "Maximum silence duration (milliseconds) before forcing end-of-turn. Default: 1280."
},
"minTurnSilence": {
"type": "number",
"description": "Minimum silence duration (seconds) before allowing end-of-turn."
"description": "Minimum silence duration (milliseconds) before allowing end-of-turn."
},
"keyterms": {
"type": "array",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,11 @@
"eagerEotThreshold": {
"type": "number",
"description": "Eager end-of-turn threshold for faster response."
},
"languageHints": {
"type": "array",
"items": { "type": "string" },
"description": "Language hints for Deepgram Flux Multilingual. BCP-47 codes (e.g. 'en', 'es', 'fr'). Biases transcription toward specified languages."
}
},
"additionalProperties": false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,9 @@
"description": "Custom vocabulary terms."
},
"languageModel": { "type": "string", "description": "Language model to use." },
"audioQueryAbsoluteTimeout": { "type": "number", "description": "Absolute timeout for audio queries." }
"audioQueryAbsoluteTimeout": { "type": "number", "description": "Absolute timeout for audio queries." },
"eoqThreshold": { "type": "number", "minimum": 0, "maximum": 1, "description": "End-of-query likelihood threshold (0.0-1.0) to trigger end of speech when segmentation is disabled. Default 0.8, set to 0 to disable." },
"vadStopThreshold": { "type": "number", "minimum": 0, "maximum": 1, "description": "VAD probability threshold to trigger end of speech when segmentation is disabled. When VAD drops below this value after speech is detected, streaming stops. Default 0.05, set to 0 to disable." }
},
"additionalProperties": false
}
125 changes: 123 additions & 2 deletions src/jambonz_sdk/schema/verbs/agent.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,28 @@
"$ref": "../components/synthesizer",
"description": "Text-to-speech configuration for the agent."
},
"autoLockLanguage": {
"oneOf": [
{ "type": "boolean" },
{ "type": "string", "enum": ["always"] }
],
"description": "When using Deepgram Flux Multilingual, automatically adjust STT language hints and switch TTS voice based on detected language. Values: false (disabled), true (lock on first utterance), 'always' (continuously adapt on every turn). Default: false.",
"default": false
},
"languageConfig": {
"type": "object",
"description": "Per-language overrides for TTS. Keys are BCP-47 language codes. When autoLockLanguage detects a language switch, the agent uses the corresponding config.",
"additionalProperties": {
"type": "object",
"properties": {
"tts": {
"$ref": "../components/synthesizer",
"description": "TTS config override for this language. Merged with default tts."
}
},
"additionalProperties": false
}
},
"turnDetection": {
"oneOf": [
{
Expand Down Expand Up @@ -86,8 +108,92 @@
},
"llm": {
"type": "object",
"description": "LLM configuration for the agent. See the 'llm' verb schema for details.",
"additionalProperties": true
"description": "LLM configuration for the agent.",
"required": ["vendor", "model"],
"properties": {
"vendor": {
"type": "string",
"enum": [
"openai",
"anthropic",
"google",
"vertex-gemini",
"vertex-openai",
"bedrock",
"deepseek",
"azure-openai",
"groq",
"huggingface"
],
"description": "LLM vendor id. Must match a `@jambonz/llm` registered adapter."
},
"model": {
"type": "string",
"description": "Vendor-specific model id (e.g. 'gpt-4o', 'claude-sonnet-4-5-20250929')."
},
"label": {
"type": "string",
"description": "Optional label to disambiguate when the account has multiple credentials for the same vendor."
},
"auth": {
"type": "object",
"description": "Optional inline credentials. When omitted, feature-server looks up credentials by (vendor, label) from the database.",
"properties": {
"apiKey": { "type": "string" }
},
"additionalProperties": true
},
"connectOptions": {
"type": "object",
"description": "SDK-level client options.",
"properties": {
"timeout": { "type": "number", "minimum": 0 },
"maxRetries": { "type": "integer", "minimum": 0 },
"endpoint": { "type": "string" },
"baseURL": { "type": "string" }
},
"additionalProperties": false
},
"llmOptions": {
"type": "object",
"description": "Per-call LLM configuration.",
"properties": {
"systemPrompt": {
"type": "string",
"description": "System prompt for the model. Placed vendor-appropriately (top-level for Anthropic/Bedrock, config.systemInstruction for Gemini, role:'system' for OpenAI-compatibles)."
},
"messages": {
"type": "array",
"description": "Seed conversation history. A role:'system' entry is extracted into systemPrompt internally.",
"items": { "$ref": "#/$defs/llmMessage" }
},
"initialMessages": {
"type": "array",
"description": "Alias of 'messages' (historical).",
"items": { "$ref": "#/$defs/llmMessage" }
},
"maxTokens": {
"type": "integer",
"minimum": 1,
"description": "Maximum tokens the model may generate per turn."
},
"temperature": {
"type": "number",
"minimum": 0,
"description": "Sampling temperature."
},
"tools": {
"type": "array",
"description": "Tool / function definitions available to the model. The MCP-flat shape `{name, description, parameters}` is canonical; the OpenAI-wrapped form `{type:'function', function:{...}}` is also accepted.",
"items": {
"type": "object"
}
}
},
"additionalProperties": false
}
},
"additionalProperties": false
},
"actionHook": {
"$ref": "../components/actionHook",
Expand Down Expand Up @@ -177,6 +283,21 @@
"required": [
"llm"
],
"$defs": {
"llmMessage": {
"type": "object",
"description": "A conversation-history message. The library normalizes content to a string; adapters may carry vendor-native shapes internally.",
"required": ["role", "content"],
"properties": {
"role": {
"type": "string",
"enum": ["system", "user", "assistant", "tool"]
},
"content": {}
},
"additionalProperties": true
}
},
"examples": [
{
"verb": "agent",
Expand Down
2 changes: 1 addition & 1 deletion src/jambonz_sdk/schema/verbs/dub.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"$id": "https://jambonz.org/schema/verbs/dub",
"minVersion": "0.9.6",
"title": "Dub",
"description": "Manages audio dubbing tracks on a call. Allows adding, removing, and controlling auxiliary audio tracks that are mixed into the call audio. Used for background music, coaching whispers, or injecting audio from external sources.",
"description": "Manages audio dubbing tracks on a call. Allows adding, removing, and controlling auxiliary audio tracks that are mixed into the call audio. Used for background music, coaching whispers, or injecting audio from external sources.\n\n**Track Routing:** Tracks are heard by the party on whose call leg they are created. A dub verb in the main verb stack (A-leg) creates tracks heard by the caller. A dub verb nested in the dial verb's `dub` array creates tracks heard by the callee. When using injectCommand to play/say on a track from a different call leg, pass the target call's `call_sid` as the third argument to `session.injectCommand()` to route the command to the correct leg.",
"type": "object",
"properties": {
"verb": {
Expand Down
3 changes: 2 additions & 1 deletion src/jambonz_sdk/schema/verbs/transcribe.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@
},
"channel": {
"type": "number",
"description": "Specific audio channel to transcribe."
"enum": [1, 2],
"description": "Specific audio channel to transcribe. Channel 1 = near-end (local party's audio, i.e. caller on A-leg or callee on B-leg). Channel 2 = far-end (remote party's audio). When transcribe is nested in the dial verb, omitting channel captures both legs mixed; specifying channel: 2 isolates the B-leg's inbound audio."
}
},
"examples": [
Expand Down
44 changes: 38 additions & 6 deletions src/jambonz_sdk/websocket/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,19 +229,30 @@ async def clear_tts_tokens(self) -> None:
async def tool_output(self, tool_call_id: str, result: Any) -> Session:
    """Return a tool call result to the agent LLM.

    Canonical wire shape (validated by ``@jambonz/schema``)::

        {"type": "command", "command": "llm:tool-output",
         "tool_call_id": "...", "data": {"result": ...}}

    The ``result`` argument becomes ``data.result`` when it is not a dict,
    matching the Node SDK's convenience wrapping. Passing a dict sends it
    as-is so callers can include richer structured output (feature-server
    JSON-stringifies the full ``data`` object on the way to the LLM).

    Args:
        tool_call_id: The tool_call_id from the llm:tool-call event.
        result: The tool result. A non-dict value is wrapped as
            ``{"result": result}``; a dict is sent as-is.

    Returns:
        self for chaining with .reply().
    """
    # Wrap non-dict results so data is always a JSON object on the wire.
    payload = result if isinstance(result, dict) else {"result": result}
    # Merged (post-PR) command envelope only — the pre-PR
    # {"type": "llm:tool-output", "data": {...}} shape left behind by the
    # diff residue produced duplicate dict keys and is removed here.
    msg = {
        "type": "command",
        "command": "llm:tool-output",
        "tool_call_id": tool_call_id,
        "data": payload,
    }
    await self._ws.send(json.dumps(msg))
    return self
Expand All @@ -257,3 +268,24 @@ async def update_agent(self, data: dict[str, Any]) -> None:
"""
msg = {"type": "agent:update", "data": data}
await self._ws.send(json.dumps(msg))

async def inject_stt_reconfigure(
    self,
    language_hints: list[str] | None = None,
    opts: dict[str, Any] | None = None
) -> None:
    """Reconfigure STT (speech-to-text) settings mid-call.

    Currently supports updating language hints for Deepgram Flux
    Multilingual.

    Args:
        language_hints: List of BCP-47 language codes (e.g., ['en', 'es']).
            Pass empty list [] to clear hints and enable auto-detection.
        opts: Additional STT reconfiguration options.
    """
    # Seed the payload from the hints (None means "leave hints alone"),
    # then layer caller-supplied options on top so they take precedence.
    payload: dict[str, Any] = (
        {} if language_hints is None else {"languageHints": language_hints}
    )
    if opts:
        payload = {**payload, **opts}
    await self.inject_command("stt:reconfigure", payload)
25 changes: 19 additions & 6 deletions tests/unit/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,18 +292,31 @@ async def test_clear_tts_tokens(self):
# ── Tool output ─────────────────────────────────────────────────────

class TestToolOutput:
    """Tool output per jambonz protocol (canonical command envelope):
    {"type": "command", "command": "llm:tool-output", "tool_call_id": ...,
    "data": {"result": ...}}"""

    @pytest.mark.asyncio
    async def test_tool_output_with_dict(self):
        # A dict result must be sent through unchanged as the data object.
        s, ws = _make_session()
        result = await s.tool_output("call_abc", {"temperature": 72})
        assert result is s  # returns self for chaining
        msg = json.loads(ws.send.call_args[0][0])
        assert msg["type"] == "command"
        assert msg["command"] == "llm:tool-output"
        assert msg["tool_call_id"] == "call_abc"
        # dict passed through as-is for richer payloads
        assert msg["data"]["temperature"] == 72

    @pytest.mark.asyncio
    async def test_tool_output_with_scalar_is_wrapped_as_result(self):
        # A scalar result must be wrapped as {"result": value} on the wire.
        s, ws = _make_session()
        await s.tool_output("call_xyz", "hello world")
        msg = json.loads(ws.send.call_args[0][0])
        assert msg["type"] == "command"
        assert msg["command"] == "llm:tool-output"
        assert msg["tool_call_id"] == "call_xyz"
        assert msg["data"] == {"result": "hello world"}


# ── Agent updates ────────────────────────────────────────────────
Expand Down
Loading