From 7f22e471984d0dd2043086285e6eef37c9e8d43b Mon Sep 17 00:00:00 2001 From: Thamarai meena K Date: Fri, 5 Jun 2026 14:28:14 +0530 Subject: [PATCH] feat(01-learn): add 21-observability-debugging --- .../21-observability-debugging/.gitignore | 18 + .../01_tracing_setup.ipynb | 239 ++++++++++ .../02_trace_hierarchy.ipynb | 284 ++++++++++++ .../03_debugging_tools.ipynb | 355 +++++++++++++++ .../04_backend_export.ipynb | 304 +++++++++++++ .../05_custom_metrics.ipynb | 409 ++++++++++++++++++ .../21-observability-debugging/README.md | 157 +++++++ .../requirements.txt | 5 + .../21-observability-debugging/trace_utils.py | 187 ++++++++ 9 files changed, 1958 insertions(+) create mode 100644 python/01-learn/21-observability-debugging/.gitignore create mode 100644 python/01-learn/21-observability-debugging/01_tracing_setup.ipynb create mode 100644 python/01-learn/21-observability-debugging/02_trace_hierarchy.ipynb create mode 100644 python/01-learn/21-observability-debugging/03_debugging_tools.ipynb create mode 100644 python/01-learn/21-observability-debugging/04_backend_export.ipynb create mode 100644 python/01-learn/21-observability-debugging/05_custom_metrics.ipynb create mode 100644 python/01-learn/21-observability-debugging/README.md create mode 100644 python/01-learn/21-observability-debugging/requirements.txt create mode 100644 python/01-learn/21-observability-debugging/trace_utils.py diff --git a/python/01-learn/21-observability-debugging/.gitignore b/python/01-learn/21-observability-debugging/.gitignore new file mode 100644 index 00000000..f4dfd19c --- /dev/null +++ b/python/01-learn/21-observability-debugging/.gitignore @@ -0,0 +1,18 @@ +# Virtual environment +.venv/ +venv/ + +# Jupyter +.ipynb_checkpoints/ + +# Python +__pycache__/ +*.pyc +*.pyo + +# macOS +.DS_Store + +# IDE +.vscode/ +.idea/ diff --git a/python/01-learn/21-observability-debugging/01_tracing_setup.ipynb b/python/01-learn/21-observability-debugging/01_tracing_setup.ipynb new file mode 100644 index 
00000000..973fbe2c --- /dev/null +++ b/python/01-learn/21-observability-debugging/01_tracing_setup.ipynb @@ -0,0 +1,239 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "intro", + "metadata": {}, + "source": [ + "# Observability & Debugging — Part 1: Tracing Setup\n", + "\n", + "This notebook teaches you how to configure OpenTelemetry tracing with Strands Agents\n", + "using the built-in `StrandsTelemetry` class and run your first traced agent invocation.\n", + "\n", + "**What you'll learn:**\n", + "- Configure `StrandsTelemetry` with the console exporter\n", + "- Run an agent and observe trace output in stdout\n", + "- Capture spans programmatically for inspection\n", + "\n", + "**Prerequisites:**\n", + "- Python 3.10+\n", + "- AWS credentials configured (for Bedrock model access)\n", + "- `pip install -r requirements.txt` completed" + ] + }, + { + "cell_type": "markdown", + "id": "concept", + "metadata": {}, + "source": [ + "## How It Works\n", + "\n", + "`StrandsTelemetry` manages the global OpenTelemetry `TracerProvider`. Once configured,\n", + "every `Agent` instance automatically picks up the tracer — no explicit wiring needed.\n", + "\n", + "```\n", + "StrandsTelemetry() → creates TracerProvider\n", + " .setup_console_exporter() → attaches ConsoleSpanExporter\n", + " .setup_otlp_exporter() → attaches OTLPSpanExporter\n", + "\n", + "Agent(...) → reads global TracerProvider\n", + " agent(\"prompt\") → emits spans automatically\n", + "```\n", + "\n", + "**Important:** `StrandsTelemetry` must be configured *before* creating any `Agent`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup-telemetry", + "metadata": {}, + "outputs": [], + "source": [ + "from strands.telemetry.config import StrandsTelemetry\n", + "\n", + "# Configure telemetry with console exporter\n", + "# This prints every span to stdout as it completes\n", + "telemetry = StrandsTelemetry()\n", + "telemetry.setup_console_exporter()\n", + "\n", + "print(\"✓ Telemetry configured with console exporter\")\n", + "print(\" Every agent invocation will now produce trace output below.\")" + ] + }, + { + "cell_type": "markdown", + "id": "first-trace", + "metadata": {}, + "source": [ + "## Your First Traced Invocation\n", + "\n", + "Let's define a simple tool and create an agent. When we invoke the agent, the console\n", + "exporter will print each span as it completes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "define-tool", + "metadata": {}, + "outputs": [], + "source": [ + "from strands import Agent, tool\n", + "from strands.models.bedrock import BedrockModel\n", + "\n", + "\n", + "@tool\n", + "def calculator(expression: str) -> str:\n", + " \"\"\"Evaluate a mathematical expression.\n", + "\n", + " Args:\n", + " expression: A mathematical expression to evaluate (e.g., \"42 * 17\")\n", + "\n", + " Returns:\n", + " The result of the expression as a string.\n", + " \"\"\"\n", + " try:\n", + " result = eval(expression, {\"__builtins__\": {}}, {})\n", + " return str(result)\n", + " except Exception as e:\n", + " return f\"Error: {e}\"\n", + "\n", + "\n", + "# Create agent — automatically picks up the global tracer\n", + "agent = Agent(\n", + " model=BedrockModel(model_id=\"us.amazon.nova-lite-v1:0\"),\n", + " tools=[calculator],\n", + ")\n", + "\n", + "print(\"✓ Agent created with calculator tool\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "invoke-agent", + "metadata": {}, + "outputs": [], + "source": [ + "# Invoke the agent — trace spans will print to 
stdout\n", + "result = agent(\"What is 42 multiplied by 17?\")\n", + "print(f\"\\n{'='*60}\")\n", + "print(f\"Agent response: {result}\")\n", + "print(f\"{'='*60}\")\n", + "print(\"\\n↑ The trace output above shows every span that was created.\")\n", + "print(\" Look for: Agent span, Cycle span(s), Model Invoke span(s), Tool span(s)\")" + ] + }, + { + "cell_type": "markdown", + "id": "programmatic", + "metadata": {}, + "source": [ + "## Capturing Spans Programmatically\n", + "\n", + "The console exporter is great for visual inspection. For programmatic analysis,\n", + "we can create a simple span collector that stores spans in a list." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "memory-exporter", + "metadata": {}, + "outputs": [], + "source": [ + "from opentelemetry.sdk.trace.export import SimpleSpanProcessor, SpanExporter, SpanExportResult\n", + "from opentelemetry import trace\n", + "\n", + "\n", + "class SpanCollector(SpanExporter):\n", + " \"\"\"Simple in-memory span collector for tutorial use.\"\"\"\n", + "\n", + " def __init__(self):\n", + " self._spans = []\n", + "\n", + " def export(self, spans):\n", + " self._spans.extend(spans)\n", + " return SpanExportResult.SUCCESS\n", + "\n", + " def get_finished_spans(self):\n", + " return list(self._spans)\n", + "\n", + " def clear(self):\n", + " self._spans = []\n", + "\n", + " def shutdown(self):\n", + " self._spans = []\n", + "\n", + "\n", + "# Add span collector alongside the console exporter\n", + "span_collector = SpanCollector()\n", + "provider = trace.get_tracer_provider()\n", + "if hasattr(provider, \"add_span_processor\"):\n", + " provider.add_span_processor(SimpleSpanProcessor(span_collector))\n", + " print(\"✓ Span collector added — spans will be captured for inspection\")\n", + "else:\n", + " print(\"⚠️ Could not add span processor\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "capture-and-inspect", + "metadata": {}, + "outputs": [], + "source": [ 
+ "# Clear previous spans and run a fresh invocation\n", + "span_collector.clear()\n", + "_ = agent(\"What is 7 + 3?\")\n", + "\n", + "# Inspect captured spans\n", + "spans = span_collector.get_finished_spans()\n", + "print(f\"\\n📊 Captured {len(spans)} spans:\\n\")\n", + "\n", + "for span in spans:\n", + " attrs = span.attributes or {}\n", + " duration_ms = (\n", + " (span.end_time - span.start_time) / 1_000_000\n", + " if span.end_time and span.start_time\n", + " else 0\n", + " )\n", + " print(f\" {span.name:<25} duration={duration_ms:.0f}ms status={span.status.status_code.name}\")" + ] + }, + { + "cell_type": "markdown", + "id": "troubleshooting", + "metadata": {}, + "source": [ + "## Troubleshooting\n", + "\n", + "| Problem | Solution |\n", + "|---------|----------|\n", + "| No trace output visible | Console exporter prints spans when they *end*. Wait for the agent call to complete. |\n", + "| `ModuleNotFoundError: No module named 'opentelemetry'` | Run `pip install -r requirements.txt` |\n", + "| `No TracerProvider configured` | Ensure `StrandsTelemetry` is configured *before* creating the Agent |\n", + "| AWS credentials error | Run `aws configure` or set `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` |\n", + "\n", + "## Next\n", + "\n", + "Continue to [02_trace_hierarchy.ipynb](02_trace_hierarchy.ipynb) to understand the\n", + "Agent → Cycle → Model → Tool span hierarchy in detail." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/01-learn/21-observability-debugging/02_trace_hierarchy.ipynb b/python/01-learn/21-observability-debugging/02_trace_hierarchy.ipynb new file mode 100644 index 00000000..667d484d --- /dev/null +++ b/python/01-learn/21-observability-debugging/02_trace_hierarchy.ipynb @@ -0,0 +1,284 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "intro", + "metadata": {}, + "source": [ + "# Observability & Debugging — Part 2: Trace Hierarchy\n", + "\n", + "This notebook teaches you how to read and interpret the span hierarchy that Strands\n", + "Agents produces for every invocation.\n", + "\n", + "**What you'll learn:**\n", + "- The four span types: Agent, Cycle, Model Invoke, Tool\n", + "- How parent-child relationships form the trace tree\n", + "- Key attributes on each span type\n", + "- How to use `trace_utils.py` for formatted output\n", + "\n", + "**Prerequisites:**\n", + "- Complete [01_tracing_setup.ipynb](01_tracing_setup.ipynb) first" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, \".\")\n", + "\n", + "from strands import Agent, tool\n", + "from strands.models.bedrock import BedrockModel\n", + "from strands.telemetry.config import StrandsTelemetry\n", + "from opentelemetry.sdk.trace.export import SimpleSpanProcessor, SpanExporter, SpanExportResult\n", + "from opentelemetry import trace\n", + "\n", + "from trace_utils import format_trace_tree, print_trace_summary\n", + "\n", + "\n", + "class SpanCollector(SpanExporter):\n", + " \"\"\"Simple in-memory span collector for tutorial use.\"\"\"\n", + " def __init__(self):\n", + " self._spans = []\n", + " def export(self, spans):\n", + 
" self._spans.extend(spans)\n", + " return SpanExportResult.SUCCESS\n", + " def get_finished_spans(self):\n", + " return list(self._spans)\n", + " def clear(self):\n", + " self._spans = []\n", + " def shutdown(self):\n", + " self._spans = []\n", + "\n", + "\n", + "# Configure telemetry\n", + "telemetry = StrandsTelemetry()\n", + "telemetry.setup_console_exporter()\n", + "\n", + "# Add span collector\n", + "span_collector = SpanCollector()\n", + "provider = trace.get_tracer_provider()\n", + "if hasattr(provider, \"add_span_processor\"):\n", + " provider.add_span_processor(SimpleSpanProcessor(span_collector))\n", + "\n", + "print(\"✓ Telemetry configured with console + span collector\")" + ] + }, + { + "cell_type": "markdown", + "id": "hierarchy-explanation", + "metadata": {}, + "source": [ + "## The Four Span Types\n", + "\n", + "Every agent invocation produces this hierarchy:\n", + "\n", + "| Span Type | Key Attributes | What It Tells You |\n", + "|-----------|---------------|-------------------|\n", + "| **Agent** | `gen_ai.agent.name`, `gen_ai.request.model` | Overall invocation identity |\n", + "| **Cycle** | `event_loop.cycle_id` | Which iteration of the loop |\n", + "| **Model Invoke** | `gen_ai.usage.input_tokens`, `gen_ai.usage.output_tokens` | Token consumption |\n", + "| **Tool** | `gen_ai.tool.name`, `tool.status` | Which tool ran, success/failure |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "single-tool", + "metadata": {}, + "outputs": [], + "source": [ + "@tool\n", + "def calculator(expression: str) -> str:\n", + " \"\"\"Evaluate a mathematical expression.\n", + "\n", + " Args:\n", + " expression: A mathematical expression to evaluate.\n", + "\n", + " Returns:\n", + " The result as a string.\n", + " \"\"\"\n", + " try:\n", + " result = eval(expression, {\"__builtins__\": {}}, {})\n", + " return str(result)\n", + " except Exception as e:\n", + " return f\"Error: {e}\"\n", + "\n", + "\n", + "agent = Agent(\n", + " 
model=BedrockModel(model_id=\"us.amazon.nova-lite-v1:0\"),\n", + " tools=[calculator],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "single-tool-trace", + "metadata": {}, + "source": [ + "## Single Tool Call Trace\n", + "\n", + "A simple math question produces a minimal trace: 2 cycles (one to call the tool,\n", + "one to format the response)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "run-single", + "metadata": {}, + "outputs": [], + "source": [ + "span_collector.clear()\n", + "result = agent(\"What is 42 * 17?\")\n", + "\n", + "spans = span_collector.get_finished_spans()\n", + "print(f\"Agent response: {result}\\n\")\n", + "print(\"🌳 Trace Tree:\\n\")\n", + "print(format_trace_tree(spans))\n", + "print()\n", + "print_trace_summary(spans)" + ] + }, + { + "cell_type": "markdown", + "id": "multi-tool-md", + "metadata": {}, + "source": [ + "## Multi-Tool Call Trace\n", + "\n", + "When the agent calls multiple tools in one cycle, you'll see multiple tool spans\n", + "under the same cycle span." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "run-multi", + "metadata": {}, + "outputs": [], + "source": [ + "span_collector.clear()\n", + "result = agent(\"Calculate 15 * 23, then calculate 100 / 4. Give me both results.\")\n", + "\n", + "spans = span_collector.get_finished_spans()\n", + "print(f\"Agent response: {result}\\n\")\n", + "print(\"🌳 Trace Tree:\\n\")\n", + "print(format_trace_tree(spans))\n", + "print()\n", + "print_trace_summary(spans)" + ] + }, + { + "cell_type": "markdown", + "id": "inspect-attrs", + "metadata": {}, + "source": [ + "## Inspecting Span Attributes\n", + "\n", + "Each span carries attributes with detailed metadata. Let's extract the key\n", + "information from each span type." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "show-attrs", + "metadata": {}, + "outputs": [], + "source": [ + "spans = span_collector.get_finished_spans()\n", + "\n", + "print(\"📋 Detailed Span Attributes:\\n\")\n", + "for span in spans:\n", + " attrs = span.attributes or {}\n", + " print(f\"── {span.name} ──\")\n", + " print(f\" trace_id: {format(span.context.trace_id, '032x')[:16]}...\")\n", + " print(f\" status: {span.status.status_code.name}\")\n", + "\n", + " if \"gen_ai.usage.input_tokens\" in attrs:\n", + " print(f\" input_tokens: {attrs['gen_ai.usage.input_tokens']}\")\n", + " print(f\" output_tokens: {attrs.get('gen_ai.usage.output_tokens', 0)}\")\n", + " if \"gen_ai.tool.name\" in attrs:\n", + " print(f\" tool_name: {attrs['gen_ai.tool.name']}\")\n", + " print(f\" tool_status: {attrs.get('tool.status', 'N/A')}\")\n", + " if \"event_loop.cycle_id\" in attrs:\n", + " print(f\" cycle_id: {attrs['event_loop.cycle_id']}\")\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "id": "cycle-count", + "metadata": {}, + "source": [ + "## Understanding Cycle Count\n", + "\n", + "The number of cycles tells you how many round-trips the agent needed:\n", + "\n", + "- **1 cycle** — agent answered directly without tools\n", + "- **2 cycles** — typical: call tool(s), then format response\n", + "- **3+ cycles** — complex reasoning, retries, or multi-step tool use\n", + "- **5+ cycles** — potential issue (infinite loop, poor tool design)\n", + "\n", + "More cycles = more tokens consumed = higher latency = higher cost." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "count-cycles", + "metadata": {}, + "outputs": [], + "source": [ + "spans = span_collector.get_finished_spans()\n", + "\n", + "cycle_count = sum(\n", + " 1 for s in spans if \"event_loop.cycle_id\" in (s.attributes or {})\n", + ")\n", + "tool_count = sum(\n", + " 1 for s in spans if \"gen_ai.tool.name\" in (s.attributes or {})\n", + ")\n", + "\n", + "print(f\"📊 Execution Summary:\")\n", + "print(f\" Cycles: {cycle_count}\")\n", + "print(f\" Tool calls: {tool_count}\")\n", + "print(f\" Total spans: {len(spans)}\")\n", + "\n", + "if cycle_count <= 2:\n", + " print(\" ✓ Efficient execution\")\n", + "elif cycle_count <= 4:\n", + " print(\" ⚠️ Moderate — check if all cycles are necessary\")\n", + "else:\n", + " print(\" 🚨 High cycle count — investigate for potential issues\")" + ] + }, + { + "cell_type": "markdown", + "id": "next", + "metadata": {}, + "source": [ + "## Next\n", + "\n", + "Continue to [03_debugging_tools.ipynb](03_debugging_tools.ipynb) to learn how to\n", + "debug tool failures and context window pressure using trace data." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/01-learn/21-observability-debugging/03_debugging_tools.ipynb b/python/01-learn/21-observability-debugging/03_debugging_tools.ipynb new file mode 100644 index 00000000..da7b69d3 --- /dev/null +++ b/python/01-learn/21-observability-debugging/03_debugging_tools.ipynb @@ -0,0 +1,355 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "intro", + "metadata": {}, + "source": [ + "# Observability & Debugging — Part 3: Debugging Tool Failures & Context Pressure\n", + "\n", + "This notebook walks through two critical debugging scenarios using trace data:\n", + "identifying failed tool invocations and detecting context window pressure.\n", + "\n", + "**What you'll learn:**\n", + "- Find error spans and inspect exception details\n", + "- Understand how errors propagate through the trace hierarchy\n", + "- Detect context window pressure by tracking token growth\n", + "- Apply mitigation strategies for both scenarios\n", + "\n", + "**Prerequisites:**\n", + "- Complete [01_tracing_setup.ipynb](01_tracing_setup.ipynb) and [02_trace_hierarchy.ipynb](02_trace_hierarchy.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, \".\")\n", + "\n", + "from strands import Agent, tool\n", + "from strands.models.bedrock import BedrockModel\n", + "from strands.telemetry.config import StrandsTelemetry\n", + "from opentelemetry.sdk.trace.export import SimpleSpanProcessor, SpanExporter, SpanExportResult\n", + "from opentelemetry.trace import StatusCode\n", + "from opentelemetry import trace\n", + "\n", + "from trace_utils import format_trace_tree, find_error_spans, analyze_token_growth\n", + "\n", + 
"\n", + "class SpanCollector(SpanExporter):\n", + " \"\"\"Simple in-memory span collector for tutorial use.\"\"\"\n", + " def __init__(self):\n", + " self._spans = []\n", + " def export(self, spans):\n", + " self._spans.extend(spans)\n", + " return SpanExportResult.SUCCESS\n", + " def get_finished_spans(self):\n", + " return list(self._spans)\n", + " def clear(self):\n", + " self._spans = []\n", + " def shutdown(self):\n", + " self._spans = []\n", + "\n", + "\n", + "# Configure telemetry\n", + "telemetry = StrandsTelemetry()\n", + "telemetry.setup_console_exporter()\n", + "\n", + "span_collector = SpanCollector()\n", + "provider = trace.get_tracer_provider()\n", + "if hasattr(provider, \"add_span_processor\"):\n", + " provider.add_span_processor(SimpleSpanProcessor(span_collector))\n", + "\n", + "print(\"✓ Telemetry configured\")" + ] + }, + { + "cell_type": "markdown", + "id": "scenario1-intro", + "metadata": {}, + "source": [ + "## Scenario 1: Tool Failure Detection\n", + "\n", + "When a tool raises an exception, the Strands SDK records:\n", + "- **Span status** → `ERROR`\n", + "- **Status description** → the error message\n", + "- **Exception event** → `exception.type`, `exception.message`, `exception.stacktrace`\n", + "\n", + "The agent typically handles the error gracefully (reports it to the user or retries),\n", + "so the root Agent span may still be `OK` even when a tool span is `ERROR`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "define-flaky", + "metadata": {}, + "outputs": [], + "source": [ + "# Track call count to make failure deterministic\n", + "_flaky_api_call_count = 0\n", + "\n", + "\n", + "@tool\n", + "def flaky_api(query: str) -> str:\n", + " \"\"\"Simulate an unreliable external API call.\n", + "\n", + " Fails on the first call with a ConnectionError to demonstrate\n", + " how tool errors appear in traces. 
Succeeds on subsequent calls\n", + " to show agent recovery behavior.\n", + "\n", + " Args:\n", + " query: The search query to send to the simulated API.\n", + "\n", + " Returns:\n", + " A simulated API response string.\n", + "\n", + " Raises:\n", + " ConnectionError: Simulated timeout on first call.\n", + " \"\"\"\n", + " global _flaky_api_call_count\n", + " _flaky_api_call_count += 1\n", + " if _flaky_api_call_count == 1:\n", + " raise ConnectionError(f\"API timeout after 30s for query: {query}\")\n", + " return f\"API result for: {query}\"\n", + "\n", + "\n", + "@tool\n", + "def calculator(expression: str) -> str:\n", + " \"\"\"Evaluate a mathematical expression.\"\"\"\n", + " try:\n", + " return str(eval(expression, {\"__builtins__\": {}}, {}))\n", + " except Exception as e:\n", + " return f\"Error: {e}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "trigger-failure", + "metadata": {}, + "outputs": [], + "source": [ + "# Reset call counter so flaky_api fails on first call\n", + "_flaky_api_call_count = 0\n", + "span_collector.clear()\n", + "\n", + "debug_agent = Agent(\n", + " model=BedrockModel(model_id=\"us.amazon.nova-lite-v1:0\"),\n", + " tools=[calculator, flaky_api],\n", + ")\n", + "\n", + "print(\"Invoking agent with flaky_api (expecting a tool failure)...\\n\")\n", + "result = debug_agent(\"Search for 'distributed tracing tutorial' using the API\")\n", + "print(f\"\\nAgent response: {result}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "inspect-errors", + "metadata": {}, + "outputs": [], + "source": [ + "# Use trace_utils to find error spans\n", + "spans = span_collector.get_finished_spans()\n", + "errors = find_error_spans(spans)\n", + "\n", + "print(f\"🔍 Found {len(errors)} error span(s):\\n\")\n", + "\n", + "for span in errors:\n", + " print(f\" ✗ Span: {span.name}\")\n", + " print(f\" Status code: {span.status.status_code.name}\")\n", + " attrs = span.attributes or {}\n", + " if 
attrs.get(\"tool.status\") == \"error\":\n", + " print(f\" tool.status: error\")\n", + " if \"gen_ai.tool.name\" in attrs:\n", + " print(f\" Tool: {attrs['gen_ai.tool.name']}\")\n", + "\n", + " # Check for exception events\n", + " for event in span.events:\n", + " if event.name == \"exception\":\n", + " print(f\" Exception type: {event.attributes.get('exception.type', 'N/A')}\")\n", + " print(f\" Exception msg: {event.attributes.get('exception.message', 'N/A')}\")\n", + " # Also check tool result for error message\n", + " if event.name == \"gen_ai.choice\":\n", + " msg = str(event.attributes.get('message', ''))\n", + " if 'Error:' in msg:\n", + " print(f\" Error detail: {msg[:200]}\")\n", + " print()\n", + "\n", + "if not errors:\n", + " print(\" No errors detected in this invocation.\")\n", + "\n", + "print(\"\\n📋 Full trace tree:\")\n", + "print(format_trace_tree(spans))" + ] + }, + { + "cell_type": "markdown", + "id": "error-pattern", + "metadata": {}, + "source": [ + "### Debugging Pattern: Error → Recovery\n", + "\n", + "The key insight: filter by `status_code == ERROR` to find the failure, then look at\n", + "the *next* cycle's model invoke span to see how the agent recovered.\n", + "\n", + "```\n", + "Agent Span (status: OK — agent handled it)\n", + "├── Cycle 1\n", + "│ ├── Model Invoke (OK) — decided to call flaky_api\n", + "│ └── Tool: flaky_api (ERROR) ← find this\n", + "│ exception.type: ConnectionError\n", + "├── Cycle 2 ← then look here\n", + "│ └── Model Invoke (OK) — \"I encountered an error...\"\n", + "```\n", + "\n", + "**Production alert:** Set up monitoring on error span count. If a tool's error rate\n", + "exceeds 5% over 5 minutes, trigger an alert." + ] + }, + { + "cell_type": "markdown", + "id": "scenario2-intro", + "metadata": {}, + "source": [ + "## Scenario 2: Context Window Pressure\n", + "\n", + "Each cycle adds to the conversation history. When tools return large outputs,\n", + "token usage grows rapidly. 
Monitor `gen_ai.usage.input_tokens` across model\n", + "invoke spans to detect this.\n", + "\n", + "**Warning signs:**\n", + "- Input tokens growing significantly between cycles\n", + "- Total approaching the model's context limit (e.g., 200K tokens for Claude; check the limit for the model you use)\n", + "- Degraded response quality" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "token-heavy", + "metadata": {}, + "outputs": [], + "source": [ + "@tool\n", + "def token_heavy(topic: str) -> str:\n", + " \"\"\"Generate a verbose response that consumes many tokens.\n", + "\n", + " Intentionally produces ~2000 tokens of output to demonstrate\n", + " context window pressure across multiple agent cycles.\n", + "\n", + " Args:\n", + " topic: The topic to generate content about.\n", + "\n", + " Returns:\n", + " A lengthy string of repeated content.\n", + " \"\"\"\n", + " paragraph = (\n", + " f\"Detailed analysis of {topic}: This is an extensive exploration covering \"\n", + " f\"multiple dimensions and perspectives. The topic of {topic} encompasses \"\n", + " f\"various interconnected aspects requiring thorough examination. \"\n", + " f\"When considering {topic}, one must account for historical context, \"\n", + " f\"current state, and future implications. \"\n", + " )\n", + " return paragraph * 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "trigger-pressure", + "metadata": {}, + "outputs": [], + "source": [ + "span_collector.clear()\n", + "\n", + "token_agent = Agent(\n", + " model=BedrockModel(model_id=\"us.amazon.nova-lite-v1:0\"),\n", + " tools=[token_heavy, calculator],\n", + ")\n", + "\n", + "print(\"Invoking agent with token_heavy tool (~2000 tokens per call)...\\n\")\n", + "result = token_agent(\n", + " \"Give me a detailed analysis of 'machine learning' and then 'distributed systems'. 
\"\n", + " \"Use the token_heavy tool for each topic.\"\n", + ")\n", + "print(f\"\\nAgent response (truncated): {str(result)[:150]}...\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "analyze-growth", + "metadata": {}, + "outputs": [], + "source": [ + "# Use trace_utils to analyze token growth\n", + "spans = span_collector.get_finished_spans()\n", + "analysis = analyze_token_growth(spans)\n", + "\n", + "print(\"📊 Token Usage Across Model Calls\\n\")\n", + "print(f\"{'Span':<30} {'Input':<12} {'Output':<12}\")\n", + "print(\"─\" * 54)\n", + "for ms in analysis[\"model_spans\"]:\n", + " print(f\"{ms['name']:<30} {ms['input_tokens']:<12} {ms['output_tokens']:<12}\")\n", + "\n", + "if \"first_input\" in analysis:\n", + " print(f\"\\n⚠️ Context Growth Analysis:\")\n", + " print(f\" First model call: {analysis['first_input']} input tokens\")\n", + " print(f\" Last model call: {analysis['last_input']} input tokens\")\n", + " print(f\" Growth: +{analysis['growth']} tokens ({analysis['growth_pct']:.0f}% increase)\")\n", + " print(f\" Context usage: {analysis['usage_pct']:.1f}% of {analysis['context_limit']:,} limit\")\n", + "\n", + " if analysis[\"usage_pct\"] > 50:\n", + " print(\" 🚨 HIGH: Context window more than half full!\")\n", + " elif analysis[\"usage_pct\"] > 20:\n", + " print(\" ⚠️ MODERATE: Monitor closely.\")\n", + " else:\n", + " print(\" ✓ LOW: Within safe limits.\")" + ] + }, + { + "cell_type": "markdown", + "id": "mitigation", + "metadata": {}, + "source": [ + "### Mitigation Strategies\n", + "\n", + "| Strategy | When to Use |\n", + "|----------|-------------|\n", + "| Summarize tool outputs | Tools return verbose text |\n", + "| Limit cycle count | Agent might loop indefinitely |\n", + "| Use concise tools | Return structured data, not prose |\n", + "| Set alerts at 80% | Production monitoring |\n", + "| Use `SlidingWindowConversationManager` | Long-running conversations |\n", + "\n", + "See 
[17-conversation-management](../17-conversation-management) for conversation\n", + "management strategies that help control context growth.\n", + "\n", + "## Next\n", + "\n", + "Continue to [04_backend_export.ipynb](04_backend_export.ipynb) to learn how to\n", + "export traces to production backends via OTLP." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/01-learn/21-observability-debugging/04_backend_export.ipynb b/python/01-learn/21-observability-debugging/04_backend_export.ipynb new file mode 100644 index 00000000..44a584cf --- /dev/null +++ b/python/01-learn/21-observability-debugging/04_backend_export.ipynb @@ -0,0 +1,304 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "intro", + "metadata": {}, + "source": [ + "# Observability & Debugging — Part 4: Backend Export\n", + "\n", + "This notebook teaches you how to export traces to production observability backends\n", + "using the OTLP (OpenTelemetry Protocol) exporter.\n", + "\n", + "**What you'll learn:**\n", + "- Export traces to CloudWatch via AWS Distro for OpenTelemetry (ADOT)\n", + "- Export traces to Langfuse for LLM-specific observability\n", + "- Export traces to Jaeger for local development\n", + "- Switch backends with a single environment variable change\n", + "\n", + "**Prerequisites:**\n", + "- Complete notebooks 01–03 first\n", + "- *(Optional)* Docker for running Jaeger locally\n", + "- *(Optional)* Langfuse account for cloud export\n", + "- *(Optional)* AWS credentials with CloudWatch permissions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "from strands import Agent, tool\n", + "from strands.models.bedrock import BedrockModel\n", + "from 
strands.telemetry.config import StrandsTelemetry\n", + "\n", + "print(\"✓ Imports ready\")" + ] + }, + { + "cell_type": "markdown", + "id": "otlp-concept", + "metadata": {}, + "source": [ + "## How OTLP Export Works\n", + "\n", + "The OpenTelemetry Protocol (OTLP) is the standard wire format for sending telemetry\n", + "data. Strands provides `setup_otlp_exporter()` which reads the endpoint from the\n", + "`OTEL_EXPORTER_OTLP_ENDPOINT` environment variable.\n", + "\n", + "The same exporter works with **any** OTLP-compatible backend:\n", + "- CloudWatch (via ADOT collector)\n", + "- Langfuse (direct OTLP ingestion)\n", + "- Jaeger (built-in OTLP receiver)\n", + "- Grafana Tempo, Honeycomb, Datadog, etc." + ] + }, + { + "cell_type": "markdown", + "id": "cloudwatch-md", + "metadata": {}, + "source": [ + "## Option 1: CloudWatch via ADOT\n", + "\n", + "AWS Distro for OpenTelemetry (ADOT) is a collector that receives OTLP traces and\n", + "forwards them to CloudWatch X-Ray.\n", + "\n", + "**Start the ADOT collector:**\n", + "```bash\n", + "docker run --rm -p 4318:4318 \\\n", + " -e AWS_REGION=us-east-1 \\\n", + " public.ecr.aws/aws-observability/aws-otel-collector:latest\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cloudwatch-setup", + "metadata": {}, + "outputs": [], + "source": [ + "def setup_cloudwatch_telemetry():\n", + " \"\"\"Configure telemetry to export to CloudWatch via ADOT.\n", + "\n", + " Requires ADOT collector running at localhost:4318.\n", + " Start with: docker run --rm -p 4318:4318 public.ecr.aws/aws-observability/aws-otel-collector:latest\n", + " \"\"\"\n", + " os.environ.setdefault(\"OTEL_EXPORTER_OTLP_ENDPOINT\", \"http://localhost:4318\")\n", + "\n", + " telemetry = StrandsTelemetry()\n", + " telemetry.setup_console_exporter() # Keep console for local visibility\n", + "\n", + " try:\n", + " telemetry.setup_otlp_exporter()\n", + " print(\"✓ CloudWatch telemetry configured (via ADOT at localhost:4318)\")\n", + " 
def setup_langfuse_telemetry():
    """Configure telemetry to export traces to Langfuse over OTLP.

    Reads ``LANGFUSE_PUBLIC_KEY`` and ``LANGFUSE_SECRET_KEY`` (both required)
    and ``LANGFUSE_HOST`` (optional, defaults to https://cloud.langfuse.com)
    from the environment, then sets ``OTEL_EXPORTER_OTLP_ENDPOINT`` and
    ``OTEL_EXPORTER_OTLP_HEADERS`` so that ``setup_otlp_exporter()`` targets
    Langfuse.
    Get keys from: https://cloud.langfuse.com → Project Settings

    Returns:
        The configured ``StrandsTelemetry`` instance, or ``None`` when either
        key is missing (in which case no environment variable is modified).
    """
    # Local import so this cell works even if the notebook's import cell
    # does not bring in base64.
    import base64

    # Drop a trailing slash so the endpoint never contains "//api/...".
    host = os.environ.get("LANGFUSE_HOST", "https://cloud.langfuse.com").rstrip("/")
    public_key = os.environ.get("LANGFUSE_PUBLIC_KEY", "")
    secret_key = os.environ.get("LANGFUSE_SECRET_KEY", "")

    if not public_key or not secret_key:
        print("⚠️ LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY not set.")
        print(" Set these environment variables to enable Langfuse export:")
        print(' export LANGFUSE_PUBLIC_KEY="pk-lf-..."')
        print(' export LANGFUSE_SECRET_KEY="sk-lf-..."')
        return None

    # HTTP Basic auth carries base64("<user>:<password>"), not the raw pair.
    # Sending "pk:sk" unencoded is rejected by the server as invalid credentials.
    credentials = f"{public_key}:{secret_key}".encode("utf-8")
    auth_token = base64.b64encode(credentials).decode("ascii")

    os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = f"{host}/api/public/otel"
    os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"Authorization=Basic {auth_token}"

    telemetry = StrandsTelemetry()
    telemetry.setup_console_exporter()
    telemetry.setup_otlp_exporter()
    print(f"✓ Langfuse telemetry configured (host: {host})")
    print(" Traces will appear in your Langfuse project dashboard.")
    return telemetry


# Uncomment to activate:
# telemetry = setup_langfuse_telemetry()
print("Langfuse setup defined. Uncomment the line above to activate.")
def setup_jaeger_telemetry():
    """Configure telemetry to export to Jaeger.

    Uses ``OTEL_EXPORTER_OTLP_ENDPOINT`` if it is already set; otherwise
    defaults it to a local Jaeger OTLP receiver at http://localhost:4318.
    Start with: docker run --rm -d -p 4318:4318 -p 16686:16686 jaegertracing/all-in-one:latest
    View traces at: http://localhost:16686

    Returns:
        The ``StrandsTelemetry`` instance. The console exporter is always
        attached; the OTLP exporter is attached only if its setup succeeds.
    """
    # setdefault() keeps a value that is already exported (for example by
    # another backend cell run earlier in the same kernel) and returns the
    # value in effect, so the message below reports the real target.
    # NOTE(review): a previously exported OTEL_EXPORTER_OTLP_HEADERS is left
    # untouched as well - confirm that is acceptable when switching backends.
    endpoint = os.environ.setdefault(
        "OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318"
    )

    telemetry = StrandsTelemetry()
    telemetry.setup_console_exporter()

    try:
        telemetry.setup_otlp_exporter()
    except Exception as e:
        # Best effort: console tracing still works without the OTLP exporter.
        print(f"⚠️ Jaeger not available: {e}")
        print(" Start: docker run --rm -d -p 4318:4318 -p 16686:16686 jaegertracing/all-in-one:latest")
    else:
        print(f"✓ Jaeger telemetry configured ({endpoint})")
        print(" View traces at: http://localhost:16686")

    return telemetry


# Uncomment to activate:
# telemetry = setup_jaeger_telemetry()
print("Jaeger setup defined. Uncomment the line above to activate.")
# Use console-only telemetry for this demo (replace with your backend above)
telemetry = StrandsTelemetry()
telemetry.setup_console_exporter()


@tool
def calculator(expression: str) -> str:
    """Evaluate a mathematical expression.

    Args:
        expression: Python expression text, supplied by the model.

    Returns:
        ``str()`` of the evaluated value, or ``"Error: <message>"`` if
        evaluation raises. Errors are returned rather than raised so the
        agent receives them as the tool result.
    """
    try:
        # SECURITY NOTE(review): eval() runs model-generated text. Emptying
        # __builtins__ removes the built-in functions but is not a sandbox;
        # attribute access on literals is still possible. Acceptable for a
        # local demo only - use a real expression parser before exposing this
        # tool to untrusted prompts.
        return str(eval(expression, {"__builtins__": {}}, {}))
    except Exception as e:
        return f"Error: {e}"


agent = Agent(
    model=BedrockModel(model_id="us.amazon.nova-lite-v1:0"),
    tools=[calculator],
)

print("Sending test trace...\n")
result = agent("What is 99 * 77?")
print(f"\nResult: {result}")
print("\n✓ Check your backend for the trace!")
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/01-learn/21-observability-debugging/05_custom_metrics.ipynb b/python/01-learn/21-observability-debugging/05_custom_metrics.ipynb new file mode 100644 index 00000000..53ee87d7 --- /dev/null +++ b/python/01-learn/21-observability-debugging/05_custom_metrics.ipynb @@ -0,0 +1,409 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "intro", + "metadata": {}, + "source": [ + "# Observability & Debugging — Part 5: Custom Metrics & Production Best Practices\n", + "\n", + "This notebook teaches you how to add custom instrumentation for production monitoring\n", + "and apply best practices for sampling, performance, and security.\n", + "\n", + "**What you'll learn:**\n", + "- Add custom span attributes for business context\n", + "- Use the OpenTelemetry Metrics API for aggregate monitoring\n", + "- Configure `BatchSpanProcessor` for production\n", + "- Apply sampling, alerting, and security best practices\n", + "\n", + "**Prerequisites:**\n", + "- Complete notebooks 01–04 first" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "\n", + "from strands import Agent, tool\n", + "from strands.models.bedrock import BedrockModel\n", + "from strands.telemetry.config import StrandsTelemetry\n", + "from opentelemetry import trace\n", + "\n", + "# Configure telemetry\n", + "telemetry = StrandsTelemetry()\n", + "telemetry.setup_console_exporter()\n", + "\n", + "print(\"✓ Telemetry configured\")" + ] + }, + { + "cell_type": "markdown", + "id": "custom-attrs-md", + "metadata": {}, + "source": [ + "## Custom Span Attributes\n", + "\n", + "Add domain-specific metadata to spans using 
@tool
def calculator(expression: str) -> str:
    """Evaluate a mathematical expression with custom span attributes.

    Records ``app.expression``, ``app.tool_version``, ``app.success`` and
    either ``app.result`` or ``app.error`` on the span that is current when
    the tool runs.

    Args:
        expression: Python expression text, supplied by the model.

    Returns:
        ``str()`` of the evaluated value, or ``"Error: <message>"`` if
        evaluation raises.
    """
    # Add custom attributes to the current tool span
    current_span = trace.get_current_span()
    # NOTE(review): this stores the raw tool input on the span. Fine for a
    # demo, but production code should redact or summarize user-derived
    # text before attaching it to telemetry.
    current_span.set_attribute("app.expression", expression)
    current_span.set_attribute("app.tool_version", "1.0.0")

    try:
        # SECURITY NOTE(review): eval() runs model-generated text. Emptying
        # __builtins__ is not a sandbox; replace with a real expression
        # parser before exposing this tool to untrusted prompts.
        result = eval(expression, {"__builtins__": {}}, {})
        current_span.set_attribute("app.result", str(result))
        current_span.set_attribute("app.success", True)
        return str(result)
    except Exception as e:
        # Failures are reported to the agent as text and marked on the span.
        current_span.set_attribute("app.success", False)
        current_span.set_attribute("app.error", str(e))
        return f"Error: {e}"


agent = Agent(
    model=BedrockModel(model_id="us.amazon.nova-lite-v1:0"),
    tools=[calculator],
)

print("✓ Agent created with instrumented calculator tool")
\"user-123\")\n", + " span.set_attribute(\"app.session_id\", \"session-abc\")\n", + " span.set_attribute(\"app.feature_flag\", \"calculator-v2\")\n", + " span.set_attribute(\"app.request_type\", \"math_query\")\n", + "\n", + " result = agent(\"What is 99 * 77?\")\n", + " print(f\"\\nResult: {result}\")\n", + "\n", + "print(\"\\n✓ Custom attributes added — visible in trace output above\")\n", + "print(\" Look for: app.user_id, app.session_id, app.expression, app.result\")" + ] + }, + { + "cell_type": "markdown", + "id": "metrics-md", + "metadata": {}, + "source": [ + "## Custom Metrics\n", + "\n", + "Beyond traces, OpenTelemetry supports **metrics** for aggregate monitoring.\n", + "Metrics answer questions like:\n", + "- How many invocations per minute?\n", + "- What's the P95 latency?\n", + "- What's the error rate over the last hour?\n", + "- How many tokens are we consuming per day?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "metrics-setup", + "metadata": {}, + "outputs": [], + "source": [ + "from opentelemetry import metrics\n", + "from opentelemetry.sdk.metrics import MeterProvider\n", + "from opentelemetry.sdk.metrics.export import (\n", + " ConsoleMetricExporter,\n", + " PeriodicExportingMetricReader,\n", + ")\n", + "\n", + "# Set up metrics with console exporter\n", + "metric_reader = PeriodicExportingMetricReader(\n", + " ConsoleMetricExporter(),\n", + " export_interval_millis=10000, # Export every 10 seconds\n", + ")\n", + "meter_provider = MeterProvider(metric_readers=[metric_reader])\n", + "metrics.set_meter_provider(meter_provider)\n", + "\n", + "# Create custom instruments\n", + "meter = metrics.get_meter(\"strands-agent-metrics\")\n", + "\n", + "invocation_counter = meter.create_counter(\n", + " name=\"agent.invocations\",\n", + " description=\"Number of agent invocations\",\n", + " unit=\"1\",\n", + ")\n", + "\n", + "latency_histogram = meter.create_histogram(\n", + " name=\"agent.latency\",\n", + " description=\"Agent 
# Instrument an agent call with metrics.
# Bind result up front: if the call raises, the report below must not hit an
# unbound name or silently show a stale result from an earlier cell.
result = None
# perf_counter() is monotonic, so the measured latency cannot go negative or
# jump if the system clock is adjusted mid-call.
start = time.perf_counter()
try:
    result = agent("What is 2 + 2?")
    status = "success"
except Exception as exc:
    status = "error"
    error_counter.add(1, {"agent.name": "calculator-agent"})
    # Surface the failure instead of swallowing it; metrics are still recorded.
    print(f"Agent invocation failed: {exc}")

duration_ms = (time.perf_counter() - start) * 1000

# Record metrics
invocation_counter.add(1, {"agent.name": "calculator-agent", "status": status})
latency_histogram.record(duration_ms, {"agent.name": "calculator-agent"})

print(f"\nResult: {result}")
print(f"Latency: {duration_ms:.0f}ms")
print(f"Status: {status}")
print("\n✓ Metrics recorded (will export to console in ~10 seconds)")
"id": "install-otlp", + "metadata": {}, + "outputs": [], + "source": [ + "# Install OTLP exporter (required for production config below)\n", + "!pip install opentelemetry-exporter-otlp-proto-http -q" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "production-config", + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " from opentelemetry.sdk.trace import TracerProvider\n", + " from opentelemetry.sdk.trace.export import BatchSpanProcessor\n", + " from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter\n", + " from opentelemetry.sdk.resources import Resource\n", + " _OTLP_AVAILABLE = True\n", + "except ImportError:\n", + " _OTLP_AVAILABLE = False\n", + " print(\"⚠️ opentelemetry-exporter-otlp-proto-http not installed.\")\n", + " print(\" Install with: pip install opentelemetry-exporter-otlp-proto-http\")\n", + " print(\" The production config function below will not work without it.\")\n", + "\n", + "\n", + "def setup_production_telemetry(endpoint: str, service_name: str = \"my-agent-service\"):\n", + " \"\"\"Configure telemetry for production use.\n", + "\n", + " Uses BatchSpanProcessor for minimal latency impact and\n", + " OTLP export to the specified endpoint.\n", + "\n", + " Args:\n", + " endpoint: OTLP endpoint URL (e.g., http://collector:4318)\n", + " service_name: Service name for trace identification\n", + " \"\"\"\n", + " if not _OTLP_AVAILABLE:\n", + " print(\"✗ Cannot configure — install opentelemetry-exporter-otlp-proto-http first.\")\n", + " return\n", + "\n", + " resource = Resource.create({\"service.name\": service_name})\n", + " provider = TracerProvider(resource=resource)\n", + "\n", + " # BatchSpanProcessor — queues spans and exports in bulk\n", + " otlp_exporter = OTLPSpanExporter(endpoint=f\"{endpoint}/v1/traces\")\n", + " provider.add_span_processor(\n", + " BatchSpanProcessor(\n", + " otlp_exporter,\n", + " max_queue_size=2048, # Buffer up to 2048 spans\n", + " 
max_export_batch_size=512, # Export 512 at a time\n", + " schedule_delay_millis=5000, # Export every 5 seconds\n", + " )\n", + " )\n", + "\n", + " trace.set_tracer_provider(provider)\n", + " print(f\"✓ Production telemetry configured\")\n", + " print(f\" Endpoint: {endpoint}\")\n", + " print(f\" Service: {service_name}\")\n", + " print(f\" Processor: BatchSpanProcessor (queue=2048, batch=512, delay=5s)\")\n", + "\n", + "\n", + "# Example (don't run unless you have a collector):\n", + "# setup_production_telemetry(\"http://localhost:4318\", \"my-agent\")\n", + "if _OTLP_AVAILABLE:\n", + " print(\"Production setup function defined.\")" + ] + }, + { + "cell_type": "markdown", + "id": "sampling-md", + "metadata": {}, + "source": [ + "## Sampling Strategies\n", + "\n", + "In production, tracing every request adds overhead. Use sampling:\n", + "\n", + "```python\n", + "from opentelemetry.sdk.trace.sampling import TraceIdRatioBased, ParentBased\n", + "\n", + "# Sample 10% of traces, but always trace if parent is sampled\n", + "sampler = ParentBased(root=TraceIdRatioBased(0.1))\n", + "provider = TracerProvider(sampler=sampler, resource=resource)\n", + "```\n", + "\n", + "**Guidelines:**\n", + "- Development: 100% (see everything)\n", + "- Staging: 50–100%\n", + "- Production: 1–10% (adjust based on volume)\n", + "- **Always sample errors at 100%** regardless of rate" + ] + }, + { + "cell_type": "markdown", + "id": "alerts-md", + "metadata": {}, + "source": [ + "## Alert Thresholds\n", + "\n", + "Set alerts on these signals for production agents:\n", + "\n", + "| Signal | Threshold | Meaning |\n", + "|--------|-----------|--------|\n", + "| Tool error rate | > 5% over 5 min | Tool reliability degraded |\n", + "| P95 latency | > 30 seconds | Agent is slow |\n", + "| Context window usage | > 80% of limit | Risk of truncation |\n", + "| Cycle count | > 5 per invocation | Potential infinite loop |\n", + "| Token consumption | > budget threshold | Cost control |" + ] + }, + { + 
"cell_type": "markdown", + "id": "security-md", + "metadata": {}, + "source": [ + "## Telemetry Security\n", + "\n", + "**Do:**\n", + "- Use TLS for OTLP export endpoints\n", + "- Rotate API keys for backend authentication\n", + "- Add generic metadata (user_id, session_id) to spans\n", + "- Redact PII before adding to attributes\n", + "\n", + "**Don't:**\n", + "- Log raw user input in span attributes\n", + "- Include passwords, tokens, or secrets in traces\n", + "- Send traces to unencrypted endpoints in production\n", + "- Store full request/response bodies in span attributes" + ] + }, + { + "cell_type": "markdown", + "id": "summary", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "You've completed the full observability tutorial! Here's what you've learned:\n", + "\n", + "1. **Tracing setup** (01) — `StrandsTelemetry` with console exporter\n", + "2. **Trace hierarchy** (02) — Agent → Cycle → Model → Tool spans\n", + "3. **Debugging** (03) — Error spans, context pressure, token growth\n", + "4. **Backend export** (04) — OTLP to CloudWatch, Langfuse, Jaeger\n", + "5. 
**Production** (05) — Custom attributes, metrics, BatchSpanProcessor, sampling\n", + "\n", + "### Key Takeaways\n", + "\n", + "- `StrandsTelemetry` must be configured *before* creating `Agent` instances\n", + "- Filter spans by `status_code == ERROR` to find failures\n", + "- Monitor `gen_ai.usage.input_tokens` growth for context pressure\n", + "- Use `BatchSpanProcessor` in production\n", + "- Sample traces (1–10%) but always trace errors at 100%\n", + "- Never log sensitive data in span attributes\n", + "\n", + "### Next Steps\n", + "\n", + "- See [16-hooks-lifecycle](../16-hooks-lifecycle) for how hooks compose with tracing\n", + "- Explore multi-agent tracing with `Graph` or `Swarm` orchestration\n", + "- Set up dashboards in your chosen backend for ongoing monitoring\n", + "- Add custom ML-based anomaly detection on trace metrics" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/01-learn/21-observability-debugging/README.md b/python/01-learn/21-observability-debugging/README.md new file mode 100644 index 00000000..e5560b39 --- /dev/null +++ b/python/01-learn/21-observability-debugging/README.md @@ -0,0 +1,157 @@ +# Tutorial 21: Observability & Debugging + +Learn how to gain full visibility into your Strands agent's behavior using the +SDK's built-in telemetry. This tutorial covers tracing, debugging common agent +issues, exporting telemetry to production backends, and adding custom metrics — +using the `StrandsTelemetry` API that ships with the Strands Agents SDK. 
+ +## Tutorial Details + +| | | +|---|---| +| Strands Features | `StrandsTelemetry`, `setup_console_exporter()`, `setup_otlp_exporter()`, span attributes, trace hierarchy | +| Agent Pattern | Single agent with custom tools exercising tracing and debugging scenarios | +| Tools | Small custom tools defined inline (calculator, flaky API simulator, token-heavy generator) | +| Model | Amazon Nova Lite on Amazon Bedrock (any Strands-supported model works) | + +## What You'll Learn + +By completing this tutorial, you will be able to: + +- **Basics (Notebook 01):** Configure `StrandsTelemetry` with a single API call and + run your first traced agent invocation +- **Trace Hierarchy (Notebook 02):** Read the Agent → Cycle → Model → Tool span + hierarchy to understand exactly how your agent executes +- **Debugging (Notebook 03):** Find tool failures via error spans and detect context + window pressure by tracking token growth across cycles +- **Backend Export (Notebook 04):** Export traces to CloudWatch, Langfuse, or Jaeger + by changing a single environment variable +- **Production (Notebook 05):** Add custom span attributes, create metrics for + aggregate monitoring, and configure `BatchSpanProcessor` for minimal latency impact + +## Prerequisites + +Before starting this tutorial, ensure you have: + +1. **Python 3.10+** installed + +2. **AWS credentials configured** for Bedrock model access: + ```bash + aws configure + # Or set AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables + ``` + +3. **Dependencies installed:** + ```bash + pip install -r requirements.txt + ``` + +4. *(Optional)* **Docker** for running local backends (Jaeger): + ```bash + docker run --rm -d --name jaeger \ + -p 4318:4318 -p 16686:16686 \ + jaegertracing/all-in-one:latest + ``` + +5. *(Optional)* **Langfuse account** for cloud LLM observability export + +## How is this different from the existing `08-observability` on main? 
+ +The existing `08-observability` sample on the `main` branch focuses on **Langfuse + RAGAS +evaluation** — it requires deploying AWS infrastructure (OpenSearch, DynamoDB) and +evaluates agent responses using the RAGAS framework. + +This tutorial (21-observability-debugging) takes a different approach: it focuses on +**tracing and debugging** using the Strands SDK's built-in `StrandsTelemetry` class. +This gives you: + +- Zero infrastructure required to start (console exporter works immediately) +- Model-agnostic implementation (works with any Strands-supported provider) +- Debugging-first approach (tool failures, context pressure, unexpected behavior) +- Multiple backend options via a single OTLP configuration change +- Testable in isolation without deploying any AWS resources + +## Tutorial Structure + +``` +python/01-learn/21-observability-debugging/ +├── README.md # This file — overview, prerequisites, structure +├── requirements.txt # Python dependencies for all notebooks +├── trace_utils.py # Shared trace formatting and analysis helpers +├── 01_tracing_setup.ipynb # Configure telemetry and first traced invocation +├── 02_trace_hierarchy.ipynb # Understand the span hierarchy +├── 03_debugging_tools.ipynb # Debug tool failures and context pressure +├── 04_backend_export.ipynb # Export to CloudWatch, Langfuse, Jaeger +└── 05_custom_metrics.ipynb # Custom attributes, metrics, production config +``` + +### Notebook Progression + +Each notebook builds on the prior one, but can be understood independently if you +read the setup cell at the top: + +1. **01_tracing_setup.ipynb** — Start here. Learn how `StrandsTelemetry` manages the + global tracer and how every `Agent` instance automatically picks it up. Run your + first traced invocation and capture spans programmatically. + +2. **02_trace_hierarchy.ipynb** — Understand the four span types (Agent, Cycle, Model + Invoke, Tool) and their parent-child relationships. 
Use `trace_utils.py` to + format and summarize traces. + +3. **03_debugging_tools.ipynb** — Two critical scenarios: finding tool failures via + error spans (status, exception events, recovery behavior) and detecting context + window pressure by monitoring token growth across cycles. + +4. **04_backend_export.ipynb** — Switch from console output to production backends. + The same `setup_otlp_exporter()` call works with CloudWatch (via ADOT), Langfuse, + Jaeger, and any OTLP-compatible backend. + +5. **05_custom_metrics.ipynb** — Add domain-specific span attributes, create counters + and histograms for aggregate monitoring, and configure `BatchSpanProcessor` for + production deployments. + +## Installation + +1. Create and activate a virtual environment: + ```bash + cd python/01-learn/21-observability-debugging + python -m venv .venv + source .venv/bin/activate # On Windows: .venv\Scripts\activate + ``` + +2. Install dependencies: + ```bash + pip install -r requirements.txt + ``` + +3. Configure AWS credentials for Bedrock access: + ```bash + export AWS_DEFAULT_REGION=us-east-1 + # Ensure your AWS credentials are configured (via AWS CLI, environment variables, or IAM role) + ``` + +4. Launch Jupyter: + ```bash + jupyter notebook + ``` + +## A Note on Logging + +The Strands Agents SDK observability framework covers three telemetry primitives: +**Traces**, **Metrics**, and **Logs**. This tutorial focuses on Traces (notebooks +01–04) and Metrics (notebook 05). For logging configuration, refer to the +[SDK Logs documentation](https://strandsagents.com/docs/user-guide/observability-evaluation/logs/). 
def _is_error_span(span: ReadableSpan) -> bool:
    """Return True if *span* has ERROR status or carries tool.status == "error"."""
    if span.status.status_code == StatusCode.ERROR:
        return True
    return (span.attributes or {}).get("tool.status") == "error"


def format_trace_tree(spans: list[ReadableSpan]) -> str:
    """Format spans as indented text for notebook display.

    Spans are rendered in the order given, one header line per span followed
    by indented detail lines (token usage, tool name, error description,
    exception events). Parent/child nesting is not reconstructed.

    Args:
        spans: List of ReadableSpan objects from a completed trace.

    Returns:
        A formatted string with a status icon, name and duration per span,
        plus key attributes. Empty string for an empty list.

    Example::

        spans = memory_exporter.get_finished_spans()
        print(format_trace_tree(list(spans)))
    """
    output = []
    for span in spans:
        attrs = span.attributes or {}
        status_icon = "✗" if _is_error_span(span) else "✓"

        # Span timestamps are divided by 1_000_000 to display milliseconds.
        if span.end_time and span.start_time:
            duration_ms = (span.end_time - span.start_time) / 1_000_000
            duration_str = f"{duration_ms:.0f}ms"
        else:
            duration_str = "N/A"

        output.append(f"  {status_icon} {span.name} ({duration_str})")

        # Token usage on model spans
        if "gen_ai.usage.input_tokens" in attrs:
            input_t = attrs["gen_ai.usage.input_tokens"]
            output_t = attrs.get("gen_ai.usage.output_tokens", 0)
            output.append(f"      └─ tokens: in={input_t} out={output_t}")

        # Tool info on tool spans
        if "gen_ai.tool.name" in attrs:
            output.append(f"      └─ tool: {attrs['gen_ai.tool.name']}")

        # Error details
        if span.status.status_code == StatusCode.ERROR:
            desc = span.status.description or "Unknown error"
            output.append(f"      └─ error: {desc}")
        elif attrs.get("tool.status") == "error":
            # Tool errors recorded only as an attribute: show the first
            # gen_ai.choice event whose message mentions "Error:".
            for event in span.events:
                if event.name == "gen_ai.choice":
                    msg = (event.attributes or {}).get("message", "")
                    if "Error:" in str(msg):
                        output.append(f"      └─ error: {msg}")
                        break

        for event in span.events:
            if event.name == "exception":
                event_attrs = event.attributes or {}
                exc_type = event_attrs.get("exception.type", "")
                exc_msg = event_attrs.get("exception.message", "")
                output.append(f"      └─ exception: {exc_type}: {exc_msg}")

    return "\n".join(output)


def print_trace_summary(spans: list[ReadableSpan]) -> None:
    """Print a summary of trace statistics.

    Prints the span count, the error-span count, the total duration and the
    success rate (share of spans that are not error spans).

    Args:
        spans: List of ReadableSpan objects from a completed trace.

    Example::

        spans = memory_exporter.get_finished_spans()
        print_trace_summary(list(spans))
    """
    total_spans = len(spans)
    error_spans = sum(1 for s in spans if _is_error_span(s))

    # Total duration is earliest start to latest end across all spans. Using
    # only spans[0] under-reports whenever the first span in the list is not
    # the root (for example when spans are listed in completion order).
    starts = [s.start_time for s in spans if s.start_time]
    ends = [s.end_time for s in spans if s.end_time]
    total_ms = (max(ends) - min(starts)) / 1_000_000 if starts and ends else 0

    # max(total_spans, 1) avoids dividing by zero for an empty trace.
    success_rate = ((total_spans - error_spans) / max(total_spans, 1)) * 100

    print("📊 Trace Summary:")
    print(f"  Total spans: {total_spans}")
    print(f"  Error spans: {error_spans}")
    print(f"  Total duration: {total_ms:.0f}ms")
    print(f"  Success rate: {success_rate:.0f}%")


def find_error_spans(spans: list[ReadableSpan]) -> list[ReadableSpan]:
    """Filter spans to those with errors (status ERROR or tool.status == 'error').

    The Strands SDK may record tool failures as a span attribute
    (tool.status = "error") rather than setting the span's status_code to ERROR.
    This function checks both conditions.

    Args:
        spans: List of all spans from a trace.

    Returns:
        List of spans that indicate an error occurred, in input order.
    """
    return [s for s in spans if _is_error_span(s)]


def analyze_token_growth(spans: list[ReadableSpan], context_limit: int = 200_000) -> dict:
    """Analyze token usage growth across model invoke spans.

    Spans carrying ``gen_ai.usage.input_tokens`` are treated as model spans.
    Growth compares the first and last such span in input order.

    Args:
        spans: List of all spans from a trace.
        context_limit: Context window size in tokens (default 200_000);
            pass the limit of the model you actually use.

    Returns:
        Dictionary that always has the keys ``model_spans``, ``first_input``,
        ``last_input``, ``growth``, ``growth_pct``, ``usage_pct`` and
        ``context_limit``. With no model spans every number is 0; with one,
        growth is 0 and ``usage_pct`` reflects that single call.

    Example::

        analysis = analyze_token_growth(list(spans))
        print(f"Growth: {analysis['growth_pct']:.0f}%")
    """
    model_spans = []
    for span in spans:
        attrs = span.attributes or {}
        if "gen_ai.usage.input_tokens" in attrs:
            model_spans.append({
                "name": span.name,
                "input_tokens": attrs["gen_ai.usage.input_tokens"],
                "output_tokens": attrs.get("gen_ai.usage.output_tokens", 0),
            })

    # One code path for 0, 1 or many model spans keeps the returned key set
    # identical, so callers can index any key without a KeyError.
    first_input = model_spans[0]["input_tokens"] if model_spans else 0
    last_input = model_spans[-1]["input_tokens"] if model_spans else 0
    growth = last_input - first_input
    growth_pct = (growth / first_input * 100) if first_input > 0 else 0
    # Guard the division so a zero or negative limit reports 0 instead of raising.
    usage_pct = (last_input / context_limit) * 100 if context_limit > 0 else 0

    return {
        "model_spans": model_spans,
        "first_input": first_input,
        "last_input": last_input,
        "growth": growth,
        "growth_pct": growth_pct,
        "usage_pct": usage_pct,
        "context_limit": context_limit,
    }