From 7f22e471984d0dd2043086285e6eef37c9e8d43b Mon Sep 17 00:00:00 2001
From: Thamarai meena K <taamarai@amazon.com>
Date: Fri, 5 Jun 2026 14:28:14 +0530
Subject: [PATCH] feat(01-learn): add 21-observability-debugging

---
 .../21-observability-debugging/.gitignore     |  18 +
 .../01_tracing_setup.ipynb                    | 239 ++++++++++
 .../02_trace_hierarchy.ipynb                  | 284 ++++++++++++
 .../03_debugging_tools.ipynb                  | 355 +++++++++++++++
 .../04_backend_export.ipynb                   | 304 +++++++++++++
 .../05_custom_metrics.ipynb                   | 409 ++++++++++++++++++
 .../21-observability-debugging/README.md      | 157 +++++++
 .../requirements.txt                          |   5 +
 .../21-observability-debugging/trace_utils.py | 187 ++++++++
 9 files changed, 1958 insertions(+)
 create mode 100644 python/01-learn/21-observability-debugging/.gitignore
 create mode 100644 python/01-learn/21-observability-debugging/01_tracing_setup.ipynb
 create mode 100644 python/01-learn/21-observability-debugging/02_trace_hierarchy.ipynb
 create mode 100644 python/01-learn/21-observability-debugging/03_debugging_tools.ipynb
 create mode 100644 python/01-learn/21-observability-debugging/04_backend_export.ipynb
 create mode 100644 python/01-learn/21-observability-debugging/05_custom_metrics.ipynb
 create mode 100644 python/01-learn/21-observability-debugging/README.md
 create mode 100644 python/01-learn/21-observability-debugging/requirements.txt
 create mode 100644 python/01-learn/21-observability-debugging/trace_utils.py

diff --git a/python/01-learn/21-observability-debugging/.gitignore b/python/01-learn/21-observability-debugging/.gitignore
new file mode 100644
index 00000000..f4dfd19c
--- /dev/null
+++ b/python/01-learn/21-observability-debugging/.gitignore
@@ -0,0 +1,18 @@
+# Virtual environment
+.venv/
+venv/
+
+# Jupyter
+.ipynb_checkpoints/
+
+# Python
+__pycache__/
+*.pyc
+*.pyo
+
+# macOS
+.DS_Store
+
+# IDE
+.vscode/
+.idea/
diff --git a/python/01-learn/21-observability-debugging/01_tracing_setup.ipynb b/python/01-learn/21-observability-debugging/01_tracing_setup.ipynb
new file mode 100644
index 00000000..973fbe2c
--- /dev/null
+++ b/python/01-learn/21-observability-debugging/01_tracing_setup.ipynb
@@ -0,0 +1,239 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "intro",
+   "metadata": {},
+   "source": [
+    "# Observability & Debugging — Part 1: Tracing Setup\n",
+    "\n",
+    "This notebook teaches you how to configure OpenTelemetry tracing with Strands Agents\n",
+    "using the built-in `StrandsTelemetry` class and run your first traced agent invocation.\n",
+    "\n",
+    "**What you'll learn:**\n",
+    "- Configure `StrandsTelemetry` with the console exporter\n",
+    "- Run an agent and observe trace output in stdout\n",
+    "- Capture spans programmatically for inspection\n",
+    "\n",
+    "**Prerequisites:**\n",
+    "- Python 3.10+\n",
+    "- AWS credentials configured (for Bedrock model access)\n",
+    "- `pip install -r requirements.txt` completed"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "concept",
+   "metadata": {},
+   "source": [
+    "## How It Works\n",
+    "\n",
+    "`StrandsTelemetry` manages the global OpenTelemetry `TracerProvider`. Once configured,\n",
+    "every `Agent` instance automatically picks up the tracer — no explicit wiring needed.\n",
+    "\n",
+    "```\n",
+    "StrandsTelemetry()              →  creates TracerProvider\n",
+    "  .setup_console_exporter()     →  attaches ConsoleSpanExporter\n",
+    "  .setup_otlp_exporter()        →  attaches OTLPSpanExporter\n",
+    "\n",
+    "Agent(...)                      →  reads global TracerProvider\n",
+    "  agent(\"prompt\")               →  emits spans automatically\n",
+    "```\n",
+    "\n",
+    "**Important:** `StrandsTelemetry` must be configured *before* creating any `Agent`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "setup-telemetry",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from strands.telemetry.config import StrandsTelemetry\n",
+    "\n",
+    "# Configure telemetry with console exporter\n",
+    "# This prints every span to stdout as it completes\n",
+    "telemetry = StrandsTelemetry()\n",
+    "telemetry.setup_console_exporter()\n",
+    "\n",
+    "print(\"✓ Telemetry configured with console exporter\")\n",
+    "print(\"  Every agent invocation will now produce trace output below.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "first-trace",
+   "metadata": {},
+   "source": [
+    "## Your First Traced Invocation\n",
+    "\n",
+    "Let's define a simple tool and create an agent. When we invoke the agent, the console\n",
+    "exporter will print each span as it completes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "define-tool",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from strands import Agent, tool\n",
+    "from strands.models.bedrock import BedrockModel\n",
+    "\n",
+    "\n",
+    "@tool\n",
+    "def calculator(expression: str) -> str:\n",
+    "    \"\"\"Evaluate a mathematical expression.\n",
+    "\n",
+    "    Args:\n",
+    "        expression: A mathematical expression to evaluate (e.g., \"42 * 17\")\n",
+    "\n",
+    "    Returns:\n",
+    "        The result of the expression as a string.\n",
+    "    \"\"\"\n",
+    "    try:\n",
+    "        result = eval(expression, {\"__builtins__\": {}}, {})\n",
+    "        return str(result)\n",
+    "    except Exception as e:\n",
+    "        return f\"Error: {e}\"\n",
+    "\n",
+    "\n",
+    "# Create agent — automatically picks up the global tracer\n",
+    "agent = Agent(\n",
+    "    model=BedrockModel(model_id=\"us.amazon.nova-lite-v1:0\"),\n",
+    "    tools=[calculator],\n",
+    ")\n",
+    "\n",
+    "print(\"✓ Agent created with calculator tool\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "invoke-agent",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Invoke the agent — trace spans will print to stdout\n",
+    "result = agent(\"What is 42 multiplied by 17?\")\n",
+    "print(f\"\\n{'='*60}\")\n",
+    "print(f\"Agent response: {result}\")\n",
+    "print(f\"{'='*60}\")\n",
+    "print(\"\\n↑ The trace output above shows every span that was created.\")\n",
+    "print(\"  Look for: Agent span, Cycle span(s), Model Invoke span(s), Tool span(s)\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "programmatic",
+   "metadata": {},
+   "source": [
+    "## Capturing Spans Programmatically\n",
+    "\n",
+    "The console exporter is great for visual inspection. For programmatic analysis,\n",
+    "we can create a simple span collector that stores spans in a list."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "memory-exporter",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from opentelemetry.sdk.trace.export import SimpleSpanProcessor, SpanExporter, SpanExportResult\n",
+    "from opentelemetry import trace\n",
+    "\n",
+    "\n",
+    "class SpanCollector(SpanExporter):\n",
+    "    \"\"\"Simple in-memory span collector for tutorial use.\"\"\"\n",
+    "\n",
+    "    def __init__(self):\n",
+    "        self._spans = []\n",
+    "\n",
+    "    def export(self, spans):\n",
+    "        self._spans.extend(spans)\n",
+    "        return SpanExportResult.SUCCESS\n",
+    "\n",
+    "    def get_finished_spans(self):\n",
+    "        return list(self._spans)\n",
+    "\n",
+    "    def clear(self):\n",
+    "        self._spans = []\n",
+    "\n",
+    "    def shutdown(self):\n",
+    "        self._spans = []\n",
+    "\n",
+    "\n",
+    "# Add span collector alongside the console exporter\n",
+    "span_collector = SpanCollector()\n",
+    "provider = trace.get_tracer_provider()\n",
+    "if hasattr(provider, \"add_span_processor\"):\n",
+    "    provider.add_span_processor(SimpleSpanProcessor(span_collector))\n",
+    "    print(\"✓ Span collector added — spans will be captured for inspection\")\n",
+    "else:\n",
+    "    print(\"⚠️  Could not add span processor\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "capture-and-inspect",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Clear previous spans and run a fresh invocation\n",
+    "span_collector.clear()\n",
+    "_ = agent(\"What is 7 + 3?\")\n",
+    "\n",
+    "# Inspect captured spans\n",
+    "spans = span_collector.get_finished_spans()\n",
+    "print(f\"\\n📊 Captured {len(spans)} spans:\\n\")\n",
+    "\n",
+    "for span in spans:\n",
+    "    attrs = span.attributes or {}\n",
+    "    duration_ms = (\n",
+    "        (span.end_time - span.start_time) / 1_000_000\n",
+    "        if span.end_time and span.start_time\n",
+    "        else 0\n",
+    "    )\n",
+    "    print(f\"  {span.name:<25} duration={duration_ms:.0f}ms  status={span.status.status_code.name}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "troubleshooting",
+   "metadata": {},
+   "source": [
+    "## Troubleshooting\n",
+    "\n",
+    "| Problem | Solution |\n",
+    "|---------|----------|\n",
+    "| No trace output visible | Console exporter prints spans when they *end*. Wait for the agent call to complete. |\n",
+    "| `ModuleNotFoundError: No module named 'opentelemetry'` | Run `pip install -r requirements.txt` |\n",
+    "| `No TracerProvider configured` | Ensure `StrandsTelemetry` is configured *before* creating the Agent |\n",
+    "| AWS credentials error | Run `aws configure` or set `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` |\n",
+    "\n",
+    "## Next\n",
+    "\n",
+    "Continue to [02_trace_hierarchy.ipynb](02_trace_hierarchy.ipynb) to understand the\n",
+    "Agent → Cycle → Model → Tool span hierarchy in detail."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/python/01-learn/21-observability-debugging/02_trace_hierarchy.ipynb b/python/01-learn/21-observability-debugging/02_trace_hierarchy.ipynb
new file mode 100644
index 00000000..667d484d
--- /dev/null
+++ b/python/01-learn/21-observability-debugging/02_trace_hierarchy.ipynb
@@ -0,0 +1,284 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "intro",
+   "metadata": {},
+   "source": [
+    "# Observability & Debugging — Part 2: Trace Hierarchy\n",
+    "\n",
+    "This notebook teaches you how to read and interpret the span hierarchy that Strands\n",
+    "Agents produces for every invocation.\n",
+    "\n",
+    "**What you'll learn:**\n",
+    "- The four span types: Agent, Cycle, Model Invoke, Tool\n",
+    "- How parent-child relationships form the trace tree\n",
+    "- Key attributes on each span type\n",
+    "- How to use `trace_utils.py` for formatted output\n",
+    "\n",
+    "**Prerequisites:**\n",
+    "- Complete [01_tracing_setup.ipynb](01_tracing_setup.ipynb) first"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "setup",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "sys.path.insert(0, \".\")\n",
+    "\n",
+    "from strands import Agent, tool\n",
+    "from strands.models.bedrock import BedrockModel\n",
+    "from strands.telemetry.config import StrandsTelemetry\n",
+    "from opentelemetry.sdk.trace.export import SimpleSpanProcessor, SpanExporter, SpanExportResult\n",
+    "from opentelemetry import trace\n",
+    "\n",
+    "from trace_utils import format_trace_tree, print_trace_summary\n",
+    "\n",
+    "\n",
+    "class SpanCollector(SpanExporter):\n",
+    "    \"\"\"Simple in-memory span collector for tutorial use.\"\"\"\n",
+    "    def __init__(self):\n",
+    "        self._spans = []\n",
+    "    def export(self, spans):\n",
+    "        self._spans.extend(spans)\n",
+    "        return SpanExportResult.SUCCESS\n",
+    "    def get_finished_spans(self):\n",
+    "        return list(self._spans)\n",
+    "    def clear(self):\n",
+    "        self._spans = []\n",
+    "    def shutdown(self):\n",
+    "        self._spans = []\n",
+    "\n",
+    "\n",
+    "# Configure telemetry\n",
+    "telemetry = StrandsTelemetry()\n",
+    "telemetry.setup_console_exporter()\n",
+    "\n",
+    "# Add span collector\n",
+    "span_collector = SpanCollector()\n",
+    "provider = trace.get_tracer_provider()\n",
+    "if hasattr(provider, \"add_span_processor\"):\n",
+    "    provider.add_span_processor(SimpleSpanProcessor(span_collector))\n",
+    "\n",
+    "print(\"✓ Telemetry configured with console + span collector\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "hierarchy-explanation",
+   "metadata": {},
+   "source": [
+    "## The Four Span Types\n",
+    "\n",
+    "Every agent invocation produces a span hierarchy (Agent → Cycle → Model Invoke / Tool) built from these four span types:\n",
+    "\n",
+    "| Span Type | Key Attributes | What It Tells You |\n",
+    "|-----------|---------------|-------------------|\n",
+    "| **Agent** | `gen_ai.agent.name`, `gen_ai.request.model` | Overall invocation identity |\n",
+    "| **Cycle** | `event_loop.cycle_id` | Which iteration of the loop |\n",
+    "| **Model Invoke** | `gen_ai.usage.input_tokens`, `gen_ai.usage.output_tokens` | Token consumption |\n",
+    "| **Tool** | `gen_ai.tool.name`, `tool.status` | Which tool ran, success/failure |"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "single-tool",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@tool\n",
+    "def calculator(expression: str) -> str:\n",
+    "    \"\"\"Evaluate a mathematical expression.\n",
+    "\n",
+    "    Args:\n",
+    "        expression: A mathematical expression to evaluate.\n",
+    "\n",
+    "    Returns:\n",
+    "        The result as a string.\n",
+    "    \"\"\"\n",
+    "    try:\n",
+    "        result = eval(expression, {\"__builtins__\": {}}, {})\n",
+    "        return str(result)\n",
+    "    except Exception as e:\n",
+    "        return f\"Error: {e}\"\n",
+    "\n",
+    "\n",
+    "agent = Agent(\n",
+    "    model=BedrockModel(model_id=\"us.amazon.nova-lite-v1:0\"),\n",
+    "    tools=[calculator],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "single-tool-trace",
+   "metadata": {},
+   "source": [
+    "## Single Tool Call Trace\n",
+    "\n",
+    "A simple math question typically produces a minimal trace: 2 cycles (one to call\n",
+    "the tool, one to format the response)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "run-single",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "span_collector.clear()\n",
+    "result = agent(\"What is 42 * 17?\")\n",
+    "\n",
+    "spans = span_collector.get_finished_spans()\n",
+    "print(f\"Agent response: {result}\\n\")\n",
+    "print(\"🌳 Trace Tree:\\n\")\n",
+    "print(format_trace_tree(spans))\n",
+    "print()\n",
+    "print_trace_summary(spans)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "multi-tool-md",
+   "metadata": {},
+   "source": [
+    "## Multi-Tool Call Trace\n",
+    "\n",
+    "When the agent calls multiple tools in one cycle, you'll see multiple tool spans\n",
+    "under the same cycle span."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "run-multi",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "span_collector.clear()\n",
+    "result = agent(\"Calculate 15 * 23, then calculate 100 / 4. Give me both results.\")\n",
+    "\n",
+    "spans = span_collector.get_finished_spans()\n",
+    "print(f\"Agent response: {result}\\n\")\n",
+    "print(\"🌳 Trace Tree:\\n\")\n",
+    "print(format_trace_tree(spans))\n",
+    "print()\n",
+    "print_trace_summary(spans)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "inspect-attrs",
+   "metadata": {},
+   "source": [
+    "## Inspecting Span Attributes\n",
+    "\n",
+    "Each span carries attributes with detailed metadata. Let's extract the key\n",
+    "information from each span type."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "show-attrs",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spans = span_collector.get_finished_spans()\n",
+    "\n",
+    "print(\"📋 Detailed Span Attributes:\\n\")\n",
+    "for span in spans:\n",
+    "    attrs = span.attributes or {}\n",
+    "    print(f\"── {span.name} ──\")\n",
+    "    print(f\"   trace_id: {format(span.context.trace_id, '032x')[:16]}...\")\n",
+    "    print(f\"   status: {span.status.status_code.name}\")\n",
+    "\n",
+    "    if \"gen_ai.usage.input_tokens\" in attrs:\n",
+    "        print(f\"   input_tokens: {attrs['gen_ai.usage.input_tokens']}\")\n",
+    "        print(f\"   output_tokens: {attrs.get('gen_ai.usage.output_tokens', 0)}\")\n",
+    "    if \"gen_ai.tool.name\" in attrs:\n",
+    "        print(f\"   tool_name: {attrs['gen_ai.tool.name']}\")\n",
+    "        print(f\"   tool_status: {attrs.get('tool.status', 'N/A')}\")\n",
+    "    if \"event_loop.cycle_id\" in attrs:\n",
+    "        print(f\"   cycle_id: {attrs['event_loop.cycle_id']}\")\n",
+    "    print()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cycle-count",
+   "metadata": {},
+   "source": [
+    "## Understanding Cycle Count\n",
+    "\n",
+    "The number of cycles tells you how many round-trips the agent needed:\n",
+    "\n",
+    "- **1 cycle** — agent answered directly without tools\n",
+    "- **2 cycles** — typical: call tool(s), then format response\n",
+    "- **3–4 cycles** — complex reasoning, retries, or multi-step tool use\n",
+    "- **5+ cycles** — potential issue (infinite loop, poor tool design)\n",
+    "\n",
+    "More cycles = more tokens consumed = higher latency = higher cost."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "count-cycles",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spans = span_collector.get_finished_spans()\n",
+    "\n",
+    "cycle_count = sum(\n",
+    "    1 for s in spans if \"event_loop.cycle_id\" in (s.attributes or {})\n",
+    ")\n",
+    "tool_count = sum(\n",
+    "    1 for s in spans if \"gen_ai.tool.name\" in (s.attributes or {})\n",
+    ")\n",
+    "\n",
+    "print(f\"📊 Execution Summary:\")\n",
+    "print(f\"   Cycles: {cycle_count}\")\n",
+    "print(f\"   Tool calls: {tool_count}\")\n",
+    "print(f\"   Total spans: {len(spans)}\")\n",
+    "\n",
+    "if cycle_count <= 2:\n",
+    "    print(\"   ✓ Efficient execution\")\n",
+    "elif cycle_count <= 4:\n",
+    "    print(\"   ⚠️  Moderate — check if all cycles are necessary\")\n",
+    "else:\n",
+    "    print(\"   🚨 High cycle count — investigate for potential issues\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "next",
+   "metadata": {},
+   "source": [
+    "## Next\n",
+    "\n",
+    "Continue to [03_debugging_tools.ipynb](03_debugging_tools.ipynb) to learn how to\n",
+    "debug tool failures and context window pressure using trace data."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/python/01-learn/21-observability-debugging/03_debugging_tools.ipynb b/python/01-learn/21-observability-debugging/03_debugging_tools.ipynb
new file mode 100644
index 00000000..da7b69d3
--- /dev/null
+++ b/python/01-learn/21-observability-debugging/03_debugging_tools.ipynb
@@ -0,0 +1,355 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "intro",
+   "metadata": {},
+   "source": [
+    "# Observability & Debugging — Part 3: Debugging Tool Failures & Context Pressure\n",
+    "\n",
+    "This notebook walks through two critical debugging scenarios using trace data:\n",
+    "identifying failed tool invocations and detecting context window pressure.\n",
+    "\n",
+    "**What you'll learn:**\n",
+    "- Find error spans and inspect exception details\n",
+    "- Understand how errors propagate through the trace hierarchy\n",
+    "- Detect context window pressure by tracking token growth\n",
+    "- Apply mitigation strategies for both scenarios\n",
+    "\n",
+    "**Prerequisites:**\n",
+    "- Complete [01_tracing_setup.ipynb](01_tracing_setup.ipynb) and [02_trace_hierarchy.ipynb](02_trace_hierarchy.ipynb)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "setup",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "sys.path.insert(0, \".\")\n",
+    "\n",
+    "from strands import Agent, tool\n",
+    "from strands.models.bedrock import BedrockModel\n",
+    "from strands.telemetry.config import StrandsTelemetry\n",
+    "from opentelemetry.sdk.trace.export import SimpleSpanProcessor, SpanExporter, SpanExportResult\n",
+    "from opentelemetry.trace import StatusCode\n",
+    "from opentelemetry import trace\n",
+    "\n",
+    "from trace_utils import format_trace_tree, find_error_spans, analyze_token_growth\n",
+    "\n",
+    "\n",
+    "class SpanCollector(SpanExporter):\n",
+    "    \"\"\"Simple in-memory span collector for tutorial use.\"\"\"\n",
+    "    def __init__(self):\n",
+    "        self._spans = []\n",
+    "    def export(self, spans):\n",
+    "        self._spans.extend(spans)\n",
+    "        return SpanExportResult.SUCCESS\n",
+    "    def get_finished_spans(self):\n",
+    "        return list(self._spans)\n",
+    "    def clear(self):\n",
+    "        self._spans = []\n",
+    "    def shutdown(self):\n",
+    "        self._spans = []\n",
+    "\n",
+    "\n",
+    "# Configure telemetry\n",
+    "telemetry = StrandsTelemetry()\n",
+    "telemetry.setup_console_exporter()\n",
+    "\n",
+    "span_collector = SpanCollector()\n",
+    "provider = trace.get_tracer_provider()\n",
+    "if hasattr(provider, \"add_span_processor\"):\n",
+    "    provider.add_span_processor(SimpleSpanProcessor(span_collector))\n",
+    "\n",
+    "print(\"✓ Telemetry configured\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "scenario1-intro",
+   "metadata": {},
+   "source": [
+    "## Scenario 1: Tool Failure Detection\n",
+    "\n",
+    "When a tool raises an exception, the Strands SDK records:\n",
+    "- **Span status** → `ERROR`\n",
+    "- **Status description** → the error message\n",
+    "- **Exception event** → `exception.type`, `exception.message`, `exception.stacktrace`\n",
+    "\n",
+    "The agent typically handles the error gracefully (reports it to the user or retries),\n",
+    "so the root Agent span may still be `OK` even when a tool span is `ERROR`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "define-flaky",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Track call count to make failure deterministic\n",
+    "_flaky_api_call_count = 0\n",
+    "\n",
+    "\n",
+    "@tool\n",
+    "def flaky_api(query: str) -> str:\n",
+    "    \"\"\"Simulate an unreliable external API call.\n",
+    "\n",
+    "    Fails on the first call with a ConnectionError to demonstrate\n",
+    "    how tool errors appear in traces. Succeeds on subsequent calls\n",
+    "    to show agent recovery behavior.\n",
+    "\n",
+    "    Args:\n",
+    "        query: The search query to send to the simulated API.\n",
+    "\n",
+    "    Returns:\n",
+    "        A simulated API response string.\n",
+    "\n",
+    "    Raises:\n",
+    "        ConnectionError: Simulated timeout on first call.\n",
+    "    \"\"\"\n",
+    "    global _flaky_api_call_count\n",
+    "    _flaky_api_call_count += 1\n",
+    "    if _flaky_api_call_count == 1:\n",
+    "        raise ConnectionError(f\"API timeout after 30s for query: {query}\")\n",
+    "    return f\"API result for: {query}\"\n",
+    "\n",
+    "\n",
+    "@tool\n",
+    "def calculator(expression: str) -> str:\n",
+    "    \"\"\"Evaluate a mathematical expression.\"\"\"\n",
+    "    try:\n",
+    "        return str(eval(expression, {\"__builtins__\": {}}, {}))\n",
+    "    except Exception as e:\n",
+    "        return f\"Error: {e}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "trigger-failure",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reset call counter so flaky_api fails on first call\n",
+    "_flaky_api_call_count = 0\n",
+    "span_collector.clear()\n",
+    "\n",
+    "debug_agent = Agent(\n",
+    "    model=BedrockModel(model_id=\"us.amazon.nova-lite-v1:0\"),\n",
+    "    tools=[calculator, flaky_api],\n",
+    ")\n",
+    "\n",
+    "print(\"Invoking agent with flaky_api (expecting a tool failure)...\\n\")\n",
+    "result = debug_agent(\"Search for 'distributed tracing tutorial' using the API\")\n",
+    "print(f\"\\nAgent response: {result}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "inspect-errors",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Use trace_utils to find error spans\n",
+    "spans = span_collector.get_finished_spans()\n",
+    "errors = find_error_spans(spans)\n",
+    "\n",
+    "print(f\"🔍 Found {len(errors)} error span(s):\\n\")\n",
+    "\n",
+    "for span in errors:\n",
+    "    print(f\"  ✗ Span: {span.name}\")\n",
+    "    print(f\"    Status code: {span.status.status_code.name}\")\n",
+    "    attrs = span.attributes or {}\n",
+    "    if attrs.get(\"tool.status\") == \"error\":\n",
+    "        print(f\"    tool.status: error\")\n",
+    "    if \"gen_ai.tool.name\" in attrs:\n",
+    "        print(f\"    Tool: {attrs['gen_ai.tool.name']}\")\n",
+    "\n",
+    "    # Check for exception events\n",
+    "    for event in span.events:\n",
+    "        if event.name == \"exception\":\n",
+    "            print(f\"    Exception type: {event.attributes.get('exception.type', 'N/A')}\")\n",
+    "            print(f\"    Exception msg:  {event.attributes.get('exception.message', 'N/A')}\")\n",
+    "        # Also check tool result for error message\n",
+    "        if event.name == \"gen_ai.choice\":\n",
+    "            msg = str(event.attributes.get('message', ''))\n",
+    "            if 'Error:' in msg:\n",
+    "                print(f\"    Error detail: {msg[:200]}\")\n",
+    "    print()\n",
+    "\n",
+    "if not errors:\n",
+    "    print(\"  No errors detected in this invocation.\")\n",
+    "\n",
+    "print(\"\\n📋 Full trace tree:\")\n",
+    "print(format_trace_tree(spans))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "error-pattern",
+   "metadata": {},
+   "source": [
+    "### Debugging Pattern: Error → Recovery\n",
+    "\n",
+    "The key insight: filter by `status_code == ERROR` to find the failure, then look at\n",
+    "the *next* cycle's model invoke span to see how the agent recovered.\n",
+    "\n",
+    "```\n",
+    "Agent Span (status: OK — agent handled it)\n",
+    "├── Cycle 1\n",
+    "│   ├── Model Invoke (OK) — decided to call flaky_api\n",
+    "│   └── Tool: flaky_api (ERROR) ← find this\n",
+    "│       exception.type: ConnectionError\n",
+    "├── Cycle 2 ← then look here\n",
+    "│   └── Model Invoke (OK) — \"I encountered an error...\"\n",
+    "```\n",
+    "\n",
+    "**Production alert:** Set up monitoring on error span count. If a tool's error rate\n",
+    "exceeds 5% over 5 minutes, trigger an alert."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "scenario2-intro",
+   "metadata": {},
+   "source": [
+    "## Scenario 2: Context Window Pressure\n",
+    "\n",
+    "Each cycle adds to the conversation history. When tools return large outputs,\n",
+    "token usage grows rapidly. Monitor `gen_ai.usage.input_tokens` across model\n",
+    "invoke spans to detect this.\n",
+    "\n",
+    "**Warning signs:**\n",
+    "- Input tokens growing significantly between cycles\n",
+    "- Total input tokens approaching the model's context limit (e.g., 200K tokens for Claude models; limits vary by model, so check the one you use)\n",
+    "- Degraded response quality"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "token-heavy",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@tool\n",
+    "def token_heavy(topic: str) -> str:\n",
+    "    \"\"\"Generate a verbose response that consumes many tokens.\n",
+    "\n",
+    "    Intentionally produces ~2000 tokens of output to demonstrate\n",
+    "    context window pressure across multiple agent cycles.\n",
+    "\n",
+    "    Args:\n",
+    "        topic: The topic to generate content about.\n",
+    "\n",
+    "    Returns:\n",
+    "        A lengthy string of repeated content.\n",
+    "    \"\"\"\n",
+    "    paragraph = (\n",
+    "        f\"Detailed analysis of {topic}: This is an extensive exploration covering \"\n",
+    "        f\"multiple dimensions and perspectives. The topic of {topic} encompasses \"\n",
+    "        f\"various interconnected aspects requiring thorough examination. \"\n",
+    "        f\"When considering {topic}, one must account for historical context, \"\n",
+    "        f\"current state, and future implications. \"\n",
+    "    )\n",
+    "    return paragraph * 10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "trigger-pressure",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "span_collector.clear()\n",
+    "\n",
+    "token_agent = Agent(\n",
+    "    model=BedrockModel(model_id=\"us.amazon.nova-lite-v1:0\"),\n",
+    "    tools=[token_heavy, calculator],\n",
+    ")\n",
+    "\n",
+    "print(\"Invoking agent with token_heavy tool (~2000 tokens per call)...\\n\")\n",
+    "result = token_agent(\n",
+    "    \"Give me a detailed analysis of 'machine learning' and then 'distributed systems'. \"\n",
+    "    \"Use the token_heavy tool for each topic.\"\n",
+    ")\n",
+    "print(f\"\\nAgent response (truncated): {str(result)[:150]}...\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "analyze-growth",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Use trace_utils to analyze token growth\n",
+    "spans = span_collector.get_finished_spans()\n",
+    "analysis = analyze_token_growth(spans)\n",
+    "\n",
+    "print(\"📊 Token Usage Across Model Calls\\n\")\n",
+    "print(f\"{'Span':<30} {'Input':<12} {'Output':<12}\")\n",
+    "print(\"─\" * 54)\n",
+    "for ms in analysis[\"model_spans\"]:\n",
+    "    print(f\"{ms['name']:<30} {ms['input_tokens']:<12} {ms['output_tokens']:<12}\")\n",
+    "\n",
+    "if \"first_input\" in analysis:\n",
+    "    print(f\"\\n⚠️  Context Growth Analysis:\")\n",
+    "    print(f\"   First model call: {analysis['first_input']} input tokens\")\n",
+    "    print(f\"   Last model call:  {analysis['last_input']} input tokens\")\n",
+    "    print(f\"   Growth: +{analysis['growth']} tokens ({analysis['growth_pct']:.0f}% increase)\")\n",
+    "    print(f\"   Context usage: {analysis['usage_pct']:.1f}% of {analysis['context_limit']:,} limit\")\n",
+    "\n",
+    "    if analysis[\"usage_pct\"] > 50:\n",
+    "        print(\"   🚨 HIGH: Context window more than half full!\")\n",
+    "    elif analysis[\"usage_pct\"] > 20:\n",
+    "        print(\"   ⚠️  MODERATE: Monitor closely.\")\n",
+    "    else:\n",
+    "        print(\"   ✓ LOW: Within safe limits.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "mitigation",
+   "metadata": {},
+   "source": [
+    "### Mitigation Strategies\n",
+    "\n",
+    "| Strategy | When to Use |\n",
+    "|----------|-------------|\n",
+    "| Summarize tool outputs | Tools return verbose text |\n",
+    "| Limit cycle count | Agent might loop indefinitely |\n",
+    "| Use concise tools | Return structured data, not prose |\n",
+    "| Set alerts at 80% of the context limit | Production monitoring |\n",
+    "| Use `SlidingWindowConversationManager` | Long-running conversations |\n",
+    "\n",
+    "See [17-conversation-management](../17-conversation-management) for conversation\n",
+    "management strategies that help control context growth.\n",
+    "\n",
+    "## Next\n",
+    "\n",
+    "Continue to [04_backend_export.ipynb](04_backend_export.ipynb) to learn how to\n",
+    "export traces to production backends via OTLP."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/python/01-learn/21-observability-debugging/04_backend_export.ipynb b/python/01-learn/21-observability-debugging/04_backend_export.ipynb
new file mode 100644
index 00000000..44a584cf
--- /dev/null
+++ b/python/01-learn/21-observability-debugging/04_backend_export.ipynb
@@ -0,0 +1,304 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "intro",
+   "metadata": {},
+   "source": [
+    "# Observability & Debugging — Part 4: Backend Export\n",
+    "\n",
+    "This notebook teaches you how to export traces to production observability backends\n",
+    "using the OTLP (OpenTelemetry Protocol) exporter.\n",
+    "\n",
+    "**What you'll learn:**\n",
+    "- Export traces to CloudWatch via AWS Distro for OpenTelemetry (ADOT)\n",
+    "- Export traces to Langfuse for LLM-specific observability\n",
+    "- Export traces to Jaeger for local development\n",
+    "- Switch backends with a single environment variable change\n",
+    "\n",
+    "**Prerequisites:**\n",
+    "- Complete notebooks 01–03 first\n",
+    "- *(Optional)* Docker for running Jaeger locally\n",
+    "- *(Optional)* Langfuse account for cloud export\n",
+    "- *(Optional)* AWS credentials with CloudWatch permissions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "setup",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "from strands import Agent, tool\n",
+    "from strands.models.bedrock import BedrockModel\n",
+    "from strands.telemetry.config import StrandsTelemetry\n",
+    "\n",
+    "print(\"✓ Imports ready\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "otlp-concept",
+   "metadata": {},
+   "source": [
+    "## How OTLP Export Works\n",
+    "\n",
+    "The OpenTelemetry Protocol (OTLP) is the standard wire format for sending telemetry\n",
+    "data. Strands provides `setup_otlp_exporter()` which reads the endpoint from the\n",
+    "`OTEL_EXPORTER_OTLP_ENDPOINT` environment variable.\n",
+    "\n",
+    "The same exporter works with **any** OTLP-compatible backend:\n",
+    "- CloudWatch (via ADOT collector)\n",
+    "- Langfuse (direct OTLP ingestion)\n",
+    "- Jaeger (built-in OTLP receiver)\n",
+    "- Grafana Tempo, Honeycomb, Datadog, etc."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cloudwatch-md",
+   "metadata": {},
+   "source": [
+    "## Option 1: CloudWatch via ADOT\n",
+    "\n",
+    "AWS Distro for OpenTelemetry (ADOT) is a collector that receives OTLP traces and\n",
+    "forwards them to AWS X-Ray, where you can view them from the CloudWatch console.\n",
+    "\n",
+    "**Start the ADOT collector:**\n",
+    "```bash\n",
+    "docker run --rm -p 4318:4318 \\\n",
+    "  -e AWS_REGION=us-east-1 \\\n",
+    "  public.ecr.aws/aws-observability/aws-otel-collector:latest\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cloudwatch-setup",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def setup_cloudwatch_telemetry():\n",
+    "    \"\"\"Configure telemetry to export to CloudWatch via ADOT.\n",
+    "\n",
+    "    Requires ADOT collector running at localhost:4318.\n",
+    "    Start with: docker run --rm -p 4318:4318 public.ecr.aws/aws-observability/aws-otel-collector:latest\n",
+    "    \"\"\"\n",
+    "    os.environ.setdefault(\"OTEL_EXPORTER_OTLP_ENDPOINT\", \"http://localhost:4318\")\n",
+    "\n",
+    "    telemetry = StrandsTelemetry()\n",
+    "    telemetry.setup_console_exporter()  # Keep console for local visibility\n",
+    "\n",
+    "    try:\n",
+    "        telemetry.setup_otlp_exporter()\n",
+    "        print(\"✓ CloudWatch telemetry configured (via ADOT at localhost:4318)\")\n",
+    "        print(\"  Traces will appear in CloudWatch X-Ray console.\")\n",
+    "    except Exception as e:\n",
+    "        print(f\"⚠️  ADOT collector not available: {e}\")\n",
+    "        print(\"   Traces will only appear in console output.\")\n",
+    "        print(\"   Start ADOT: docker run --rm -p 4318:4318 public.ecr.aws/aws-observability/aws-otel-collector:latest\")\n",
+    "\n",
+    "    return telemetry\n",
+    "\n",
+    "\n",
+    "# Uncomment to activate:\n",
+    "# telemetry = setup_cloudwatch_telemetry()\n",
+    "print(\"CloudWatch setup defined. Uncomment the line above to activate.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "langfuse-md",
+   "metadata": {},
+   "source": [
+    "## Option 2: Langfuse\n",
+    "\n",
+    "Langfuse provides a purpose-built UI for LLM observability with native OTLP ingestion.\n",
+    "\n",
+    "**Setup:**\n",
+    "1. Create account at [langfuse.com](https://langfuse.com)\n",
+    "2. Get public/secret keys from Project Settings\n",
+    "3. Set environment variables below"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "langfuse-setup",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def setup_langfuse_telemetry():\n",
+    "    \"\"\"Configure telemetry to export to Langfuse.\n",
+    "\n",
+    "    Requires LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY environment variables.\n",
+    "    Get keys from: https://cloud.langfuse.com → Project Settings\n",
+    "    \"\"\"\n",
+    "    host = os.environ.get(\"LANGFUSE_HOST\", \"https://cloud.langfuse.com\")\n",
+    "    public_key = os.environ.get(\"LANGFUSE_PUBLIC_KEY\", \"\")\n",
+    "    secret_key = os.environ.get(\"LANGFUSE_SECRET_KEY\", \"\")\n",
+    "\n",
+    "    if not public_key or not secret_key:\n",
+    "        print(\"⚠️  LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY not set.\")\n",
+    "        print(\"   Set these environment variables to enable Langfuse export:\")\n",
+    "        print('   export LANGFUSE_PUBLIC_KEY=\"pk-lf-...\"')\n",
+    "        print('   export LANGFUSE_SECRET_KEY=\"sk-lf-...\"')\n",
+    "        return None\n",
+    "\n",
+    "    os.environ[\"OTEL_EXPORTER_OTLP_ENDPOINT\"] = f\"{host}/api/public/otel\"\n",
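+    "    import base64  # HTTP Basic auth requires base64(\"public_key:secret_key\")\n",
+    "    auth = base64.b64encode(f\"{public_key}:{secret_key}\".encode()).decode()\n",
+    "    os.environ[\"OTEL_EXPORTER_OTLP_HEADERS\"] = f\"Authorization=Basic {auth}\"\n",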
+    "\n",
+    "    telemetry = StrandsTelemetry()\n",
+    "    telemetry.setup_console_exporter()\n",
+    "    telemetry.setup_otlp_exporter()\n",
+    "    print(f\"✓ Langfuse telemetry configured (host: {host})\")\n",
+    "    print(\"  Traces will appear in your Langfuse project dashboard.\")\n",
+    "    return telemetry\n",
+    "\n",
+    "\n",
+    "# Uncomment to activate:\n",
+    "# telemetry = setup_langfuse_telemetry()\n",
+    "print(\"Langfuse setup defined. Uncomment the line above to activate.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "jaeger-md",
+   "metadata": {},
+   "source": [
+    "## Option 3: Jaeger (Local Development)\n",
+    "\n",
+    "Jaeger is ideal for local development — single Docker container with a built-in UI.\n",
+    "\n",
+    "**Start Jaeger:**\n",
+    "```bash\n",
+    "docker run --rm -d --name jaeger \\\n",
+    "  -p 4318:4318 \\\n",
+    "  -p 16686:16686 \\\n",
+    "  jaegertracing/all-in-one:latest\n",
+    "```\n",
+    "\n",
+    "Then open http://localhost:16686 to view traces."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "jaeger-setup",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def setup_jaeger_telemetry():\n",
+    "    \"\"\"Configure telemetry to export to Jaeger.\n",
+    "\n",
+    "    Requires Jaeger running at localhost:4318.\n",
+    "    Start with: docker run --rm -d -p 4318:4318 -p 16686:16686 jaegertracing/all-in-one:latest\n",
+    "    View traces at: http://localhost:16686\n",
+    "    \"\"\"\n",
+    "    os.environ.setdefault(\"OTEL_EXPORTER_OTLP_ENDPOINT\", \"http://localhost:4318\")\n",
+    "\n",
+    "    telemetry = StrandsTelemetry()\n",
+    "    telemetry.setup_console_exporter()\n",
+    "\n",
+    "    try:\n",
+    "        telemetry.setup_otlp_exporter()\n",
+    "        print(\"✓ Jaeger telemetry configured (localhost:4318)\")\n",
+    "        print(\"  View traces at: http://localhost:16686\")\n",
+    "    except Exception as e:\n",
+    "        print(f\"⚠️  Jaeger not available: {e}\")\n",
+    "        print(\"   Start: docker run --rm -d -p 4318:4318 -p 16686:16686 jaegertracing/all-in-one:latest\")\n",
+    "\n",
+    "    return telemetry\n",
+    "\n",
+    "\n",
+    "# Uncomment to activate:\n",
+    "# telemetry = setup_jaeger_telemetry()\n",
+    "print(\"Jaeger setup defined. Uncomment the line above to activate.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "test-export",
+   "metadata": {},
+   "source": [
+    "## Testing Your Backend\n",
+    "\n",
+    "The cell below sends a test trace. As written it configures console-only telemetry;\n",
+    "to target a backend, replace its `StrandsTelemetry` setup lines with one of the setup calls above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "test-trace",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Use console-only telemetry for this demo (replace with your backend above)\n",
+    "telemetry = StrandsTelemetry()\n",
+    "telemetry.setup_console_exporter()\n",
+    "\n",
+    "\n",
+    "@tool\n",
+    "def calculator(expression: str) -> str:\n",
+    "    \"\"\"Evaluate a mathematical expression.\"\"\"\n",
+    "    try:\n",
+    "        return str(eval(expression, {\"__builtins__\": {}}, {}))\n",
+    "    except Exception as e:\n",
+    "        return f\"Error: {e}\"\n",
+    "\n",
+    "\n",
+    "agent = Agent(\n",
+    "    model=BedrockModel(model_id=\"us.amazon.nova-lite-v1:0\"),\n",
+    "    tools=[calculator],\n",
+    ")\n",
+    "\n",
+    "print(\"Sending test trace...\\n\")\n",
+    "result = agent(\"What is 99 * 77?\")\n",
+    "print(f\"\\nResult: {result}\")\n",
+    "print(\"\\n✓ Check your backend for the trace!\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "summary",
+   "metadata": {},
+   "source": [
+    "## Summary\n",
+    "\n",
+    "| Backend | Endpoint | Use Case |\n",
+    "|---------|----------|----------|\n",
+    "| Console | stdout | Development, quick debugging |\n",
+    "| CloudWatch (ADOT) | `localhost:4318` | AWS production monitoring |\n",
+    "| Langfuse | `cloud.langfuse.com/api/public/otel` | LLM-specific observability |\n",
+    "| Jaeger | `localhost:4318` | Local development with UI |\n",
+    "\n",
+    "All backends use the same `setup_otlp_exporter()` call — only the endpoint (plus an auth header for Langfuse) changes.\n",
+    "\n",
+    "## Next\n",
+    "\n",
+    "Continue to [05_custom_metrics.ipynb](05_custom_metrics.ipynb) to learn how to add\n",
+    "custom span attributes, metrics, and production best practices."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/python/01-learn/21-observability-debugging/05_custom_metrics.ipynb b/python/01-learn/21-observability-debugging/05_custom_metrics.ipynb
new file mode 100644
index 00000000..53ee87d7
--- /dev/null
+++ b/python/01-learn/21-observability-debugging/05_custom_metrics.ipynb
@@ -0,0 +1,409 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "intro",
+   "metadata": {},
+   "source": [
+    "# Observability & Debugging — Part 5: Custom Metrics & Production Best Practices\n",
+    "\n",
+    "This notebook teaches you how to add custom instrumentation for production monitoring\n",
+    "and apply best practices for sampling, performance, and security.\n",
+    "\n",
+    "**What you'll learn:**\n",
+    "- Add custom span attributes for business context\n",
+    "- Use the OpenTelemetry Metrics API for aggregate monitoring\n",
+    "- Configure `BatchSpanProcessor` for production\n",
+    "- Apply sampling, alerting, and security best practices\n",
+    "\n",
+    "**Prerequisites:**\n",
+    "- Complete notebooks 01–04 first"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "setup",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "\n",
+    "from strands import Agent, tool\n",
+    "from strands.models.bedrock import BedrockModel\n",
+    "from strands.telemetry.config import StrandsTelemetry\n",
+    "from opentelemetry import trace\n",
+    "\n",
+    "# Configure telemetry\n",
+    "telemetry = StrandsTelemetry()\n",
+    "telemetry.setup_console_exporter()\n",
+    "\n",
+    "print(\"✓ Telemetry configured\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "custom-attrs-md",
+   "metadata": {},
+   "source": [
+    "## Custom Span Attributes\n",
+    "\n",
+    "Add domain-specific metadata to spans using `trace.get_current_span().set_attribute()`.\n",
+    "This lets you correlate traces with business context: user IDs, session IDs,\n",
+    "feature flags, request types, etc.\n",
+    "\n",
+    "**Where to add attributes:**\n",
+    "- Inside `@tool` functions → attributes appear on the tool span\n",
+    "- In a wrapper span around `agent()` → attributes appear on a parent span\n",
+    "- Both approaches are useful for different query patterns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "custom-attrs",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@tool\n",
+    "def calculator(expression: str) -> str:\n",
+    "    \"\"\"Evaluate a mathematical expression with custom span attributes.\"\"\"\n",
+    "    # Add custom attributes to the current tool span\n",
+    "    current_span = trace.get_current_span()\n",
+    "    current_span.set_attribute(\"app.expression\", expression)\n",
+    "    current_span.set_attribute(\"app.tool_version\", \"1.0.0\")\n",
+    "\n",
+    "    try:\n",
+    "        result = eval(expression, {\"__builtins__\": {}}, {})\n",
+    "        current_span.set_attribute(\"app.result\", str(result))\n",
+    "        current_span.set_attribute(\"app.success\", True)\n",
+    "        return str(result)\n",
+    "    except Exception as e:\n",
+    "        current_span.set_attribute(\"app.success\", False)\n",
+    "        current_span.set_attribute(\"app.error\", str(e))\n",
+    "        return f\"Error: {e}\"\n",
+    "\n",
+    "\n",
+    "agent = Agent(\n",
+    "    model=BedrockModel(model_id=\"us.amazon.nova-lite-v1:0\"),\n",
+    "    tools=[calculator],\n",
+    ")\n",
+    "\n",
+    "print(\"✓ Agent created with instrumented calculator tool\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "request-context",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add request-level context using a wrapper span\n",
+    "tracer = trace.get_tracer(\"observability-tutorial\")\n",
+    "\n",
+    "with tracer.start_as_current_span(\"user-request\") as span:\n",
+    "    # Business context attributes\n",
+    "    span.set_attribute(\"app.user_id\", \"user-123\")\n",
+    "    span.set_attribute(\"app.session_id\", \"session-abc\")\n",
+    "    span.set_attribute(\"app.feature_flag\", \"calculator-v2\")\n",
+    "    span.set_attribute(\"app.request_type\", \"math_query\")\n",
+    "\n",
+    "    result = agent(\"What is 99 * 77?\")\n",
+    "    print(f\"\\nResult: {result}\")\n",
+    "\n",
+    "print(\"\\n✓ Custom attributes added — visible in trace output above\")\n",
+    "print(\"  Look for: app.user_id, app.session_id, app.expression, app.result\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "metrics-md",
+   "metadata": {},
+   "source": [
+    "## Custom Metrics\n",
+    "\n",
+    "Beyond traces, OpenTelemetry supports **metrics** for aggregate monitoring.\n",
+    "Metrics answer questions like:\n",
+    "- How many invocations per minute?\n",
+    "- What's the P95 latency?\n",
+    "- What's the error rate over the last hour?\n",
+    "- How many tokens are we consuming per day?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "metrics-setup",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from opentelemetry import metrics\n",
+    "from opentelemetry.sdk.metrics import MeterProvider\n",
+    "from opentelemetry.sdk.metrics.export import (\n",
+    "    ConsoleMetricExporter,\n",
+    "    PeriodicExportingMetricReader,\n",
+    ")\n",
+    "\n",
+    "# Set up metrics with console exporter\n",
+    "metric_reader = PeriodicExportingMetricReader(\n",
+    "    ConsoleMetricExporter(),\n",
+    "    export_interval_millis=10000,  # Export every 10 seconds\n",
+    ")\n",
+    "meter_provider = MeterProvider(metric_readers=[metric_reader])\n",
+    "metrics.set_meter_provider(meter_provider)\n",
+    "\n",
+    "# Create custom instruments\n",
+    "meter = metrics.get_meter(\"strands-agent-metrics\")\n",
+    "\n",
+    "invocation_counter = meter.create_counter(\n",
+    "    name=\"agent.invocations\",\n",
+    "    description=\"Number of agent invocations\",\n",
+    "    unit=\"1\",\n",
+    ")\n",
+    "\n",
+    "latency_histogram = meter.create_histogram(\n",
+    "    name=\"agent.latency\",\n",
+    "    description=\"Agent invocation latency\",\n",
+    "    unit=\"ms\",\n",
+    ")\n",
+    "\n",
+    "error_counter = meter.create_counter(\n",
+    "    name=\"agent.errors\",\n",
+    "    description=\"Number of failed agent invocations\",\n",
+    "    unit=\"1\",\n",
+    ")\n",
+    "\n",
+    "print(\"✓ Metrics instruments created\")\n",
+    "print(\"  - agent.invocations (counter)\")\n",
+    "print(\"  - agent.latency (histogram)\")\n",
+    "print(\"  - agent.errors (counter)\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "record-metrics",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Instrument an agent call with metrics\n",
+    "start = time.time()\n",
+    "try:\n",
+    "    result = agent(\"What is 2 + 2?\")\n",
+    "    status = \"success\"\n",
+    "except Exception:\n",
+    "    status = \"error\"\n",
+    "    error_counter.add(1, {\"agent.name\": \"calculator-agent\"})\n",
+    "\n",
+    "duration_ms = (time.time() - start) * 1000\n",
+    "\n",
+    "# Record metrics\n",
+    "invocation_counter.add(1, {\"agent.name\": \"calculator-agent\", \"status\": status})\n",
+    "latency_histogram.record(duration_ms, {\"agent.name\": \"calculator-agent\"})\n",
+    "\n",
+    "print(f\"\\nResult: {result}\")\n",
+    "print(f\"Latency: {duration_ms:.0f}ms\")\n",
+    "print(f\"Status: {status}\")\n",
+    "print(\"\\n✓ Metrics recorded (will export to console in ~10 seconds)\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "production-md",
+   "metadata": {},
+   "source": [
+    "## Production Configuration\n",
+    "\n",
+    "For production, use `BatchSpanProcessor` instead of `SimpleSpanProcessor`.\n",
+    "It queues spans and exports them in bulk, adding minimal latency to agent calls.\n",
+    "\n",
+    "| Setting | Development | Production |\n",
+    "|---------|-------------|------------|\n",
+    "| Span Processor | `SimpleSpanProcessor` | `BatchSpanProcessor` |\n",
+    "| Console Exporter | ✓ Enabled | ✗ Disabled |\n",
+    "| Sampling | 100% | 1–10% |\n",
+    "| Error Sampling | 100% | 100% (always) |"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "install-otlp",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install OTLP exporter (required for production config below)\n",
+    "!pip install opentelemetry-exporter-otlp-proto-http -q"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "production-config",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "try:\n",
+    "    from opentelemetry.sdk.trace import TracerProvider\n",
+    "    from opentelemetry.sdk.trace.export import BatchSpanProcessor\n",
+    "    from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter\n",
+    "    from opentelemetry.sdk.resources import Resource\n",
+    "    _OTLP_AVAILABLE = True\n",
+    "except ImportError:\n",
+    "    _OTLP_AVAILABLE = False\n",
+    "    print(\"⚠️  opentelemetry-exporter-otlp-proto-http not installed.\")\n",
+    "    print(\"   Install with: pip install opentelemetry-exporter-otlp-proto-http\")\n",
+    "    print(\"   The production config function below will not work without it.\")\n",
+    "\n",
+    "\n",
+    "def setup_production_telemetry(endpoint: str, service_name: str = \"my-agent-service\"):\n",
+    "    \"\"\"Configure telemetry for production use.\n",
+    "\n",
+    "    Uses BatchSpanProcessor for minimal latency impact and\n",
+    "    OTLP export to the specified endpoint.\n",
+    "\n",
+    "    Args:\n",
+    "        endpoint: OTLP endpoint URL (e.g., http://collector:4318)\n",
+    "        service_name: Service name for trace identification\n",
+    "    \"\"\"\n",
+    "    if not _OTLP_AVAILABLE:\n",
+    "        print(\"✗ Cannot configure — install opentelemetry-exporter-otlp-proto-http first.\")\n",
+    "        return\n",
+    "\n",
+    "    resource = Resource.create({\"service.name\": service_name})\n",
+    "    provider = TracerProvider(resource=resource)\n",
+    "\n",
+    "    # BatchSpanProcessor — queues spans and exports in bulk\n",
+    "    otlp_exporter = OTLPSpanExporter(endpoint=f\"{endpoint}/v1/traces\")\n",
+    "    provider.add_span_processor(\n",
+    "        BatchSpanProcessor(\n",
+    "            otlp_exporter,\n",
+    "            max_queue_size=2048,        # Buffer up to 2048 spans\n",
+    "            max_export_batch_size=512,  # Export 512 at a time\n",
+    "            schedule_delay_millis=5000, # Export every 5 seconds\n",
+    "        )\n",
+    "    )\n",
+    "\n",
+    "    trace.set_tracer_provider(provider)\n",
+    "    print(f\"✓ Production telemetry configured\")\n",
+    "    print(f\"  Endpoint: {endpoint}\")\n",
+    "    print(f\"  Service: {service_name}\")\n",
+    "    print(f\"  Processor: BatchSpanProcessor (queue=2048, batch=512, delay=5s)\")\n",
+    "\n",
+    "\n",
+    "# Example (don't run unless you have a collector):\n",
+    "# setup_production_telemetry(\"http://localhost:4318\", \"my-agent\")\n",
+    "if _OTLP_AVAILABLE:\n",
+    "    print(\"Production setup function defined.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "sampling-md",
+   "metadata": {},
+   "source": [
+    "## Sampling Strategies\n",
+    "\n",
+    "In production, tracing every request adds overhead. Use sampling:\n",
+    "\n",
+    "```python\n",
+    "from opentelemetry.sdk.trace.sampling import TraceIdRatioBased, ParentBased\n",
+    "\n",
+    "# Sample 10% of traces, but always trace if parent is sampled\n",
+    "sampler = ParentBased(root=TraceIdRatioBased(0.1))\n",
+    "provider = TracerProvider(sampler=sampler, resource=resource)\n",
+    "```\n",
+    "\n",
+    "**Guidelines:**\n",
+    "- Development: 100% (see everything)\n",
+    "- Staging: 50–100%\n",
+    "- Production: 1–10% (adjust based on volume)\n",
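+    "- **Always sample errors at 100%** regardless of rate — this needs tail-based sampling (e.g., in an OpenTelemetry Collector), because a head sampler such as `TraceIdRatioBased` decides before the outcome is known"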
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "alerts-md",
+   "metadata": {},
+   "source": [
+    "## Alert Thresholds\n",
+    "\n",
+    "Set alerts on these signals for production agents:\n",
+    "\n",
+    "| Signal | Threshold | Meaning |\n",
+    "|--------|-----------|--------|\n",
+    "| Tool error rate | > 5% over 5 min | Tool reliability degraded |\n",
+    "| P95 latency | > 30 seconds | Agent is slow |\n",
+    "| Context window usage | > 80% of limit | Risk of truncation |\n",
+    "| Cycle count | > 5 per invocation | Potential infinite loop |\n",
+    "| Token consumption | > budget threshold | Cost control |"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "security-md",
+   "metadata": {},
+   "source": [
+    "## Telemetry Security\n",
+    "\n",
+    "**Do:**\n",
+    "- Use TLS for OTLP export endpoints\n",
+    "- Rotate API keys for backend authentication\n",
+    "- Add generic metadata (user_id, session_id) to spans\n",
+    "- Redact PII before adding to attributes\n",
+    "\n",
+    "**Don't:**\n",
+    "- Log raw user input in span attributes\n",
+    "- Include passwords, tokens, or secrets in traces\n",
+    "- Send traces to unencrypted endpoints in production\n",
+    "- Store full request/response bodies in span attributes"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "summary",
+   "metadata": {},
+   "source": [
+    "## Summary\n",
+    "\n",
+    "You've completed the full observability tutorial! Here's what you've learned:\n",
+    "\n",
+    "1. **Tracing setup** (01) — `StrandsTelemetry` with console exporter\n",
+    "2. **Trace hierarchy** (02) — Agent → Cycle → Model → Tool spans\n",
+    "3. **Debugging** (03) — Error spans, context pressure, token growth\n",
+    "4. **Backend export** (04) — OTLP to CloudWatch, Langfuse, Jaeger\n",
+    "5. **Production** (05) — Custom attributes, metrics, BatchSpanProcessor, sampling\n",
+    "\n",
+    "### Key Takeaways\n",
+    "\n",
+    "- `StrandsTelemetry` must be configured *before* creating `Agent` instances\n",
+    "- Filter spans by `status_code == ERROR` to find failures\n",
+    "- Monitor `gen_ai.usage.input_tokens` growth for context pressure\n",
+    "- Use `BatchSpanProcessor` in production\n",
+    "- Sample traces (1–10%) but always trace errors at 100%\n",
+    "- Never log sensitive data in span attributes\n",
+    "\n",
+    "### Next Steps\n",
+    "\n",
+    "- See [16-hooks-lifecycle](../16-hooks-lifecycle) for how hooks compose with tracing\n",
+    "- Explore multi-agent tracing with `Graph` or `Swarm` orchestration\n",
+    "- Set up dashboards in your chosen backend for ongoing monitoring\n",
+    "- Add custom ML-based anomaly detection on trace metrics"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/python/01-learn/21-observability-debugging/README.md b/python/01-learn/21-observability-debugging/README.md
new file mode 100644
index 00000000..e5560b39
--- /dev/null
+++ b/python/01-learn/21-observability-debugging/README.md
@@ -0,0 +1,157 @@
+# Tutorial 21: Observability & Debugging
+
+Learn how to gain full visibility into your Strands agent's behavior using the
+SDK's built-in telemetry. This tutorial covers tracing, debugging common agent
+issues, exporting telemetry to production backends, and adding custom metrics —
+using the `StrandsTelemetry` API that ships with the Strands Agents SDK.
+
+## Tutorial Details
+
+| | |
+|---|---|
+| Strands Features | `StrandsTelemetry`, `setup_console_exporter()`, `setup_otlp_exporter()`, span attributes, trace hierarchy |
+| Agent Pattern | Single agent with custom tools exercising tracing and debugging scenarios |
+| Tools | Small custom tools defined inline (calculator, flaky API simulator, token-heavy generator) |
+| Model | Amazon Nova Lite on Amazon Bedrock (any Strands-supported model works) |
+
+## What You'll Learn
+
+By completing this tutorial, you will be able to:
+
+- **Basics (Notebook 01):** Configure `StrandsTelemetry` with a single API call and
+  run your first traced agent invocation
+- **Trace Hierarchy (Notebook 02):** Read the Agent → Cycle → Model → Tool span
+  hierarchy to understand exactly how your agent executes
+- **Debugging (Notebook 03):** Find tool failures via error spans and detect context
+  window pressure by tracking token growth across cycles
+- **Backend Export (Notebook 04):** Export traces to CloudWatch, Langfuse, or Jaeger
+  by changing a single environment variable
+- **Production (Notebook 05):** Add custom span attributes, create metrics for
+  aggregate monitoring, and configure `BatchSpanProcessor` for minimal latency impact
+
+## Prerequisites
+
+Before starting this tutorial, ensure you have:
+
+1. **Python 3.10+** installed
+
+2. **AWS credentials configured** for Bedrock model access:
+   ```bash
+   aws configure
+   # Or set AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables
+   ```
+
+3. **Dependencies installed:**
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+4. *(Optional)* **Docker** for running local backends (Jaeger):
+   ```bash
+   docker run --rm -d --name jaeger \
+     -p 4318:4318 -p 16686:16686 \
+     jaegertracing/all-in-one:latest
+   ```
+
+5. *(Optional)* **Langfuse account** for cloud LLM observability export
+
+## How is this different from the existing `08-observability` on main?
+
+The existing `08-observability` sample on the `main` branch focuses on **Langfuse + RAGAS
+evaluation** — it requires deploying AWS infrastructure (OpenSearch, DynamoDB) and
+evaluates agent responses using the RAGAS framework.
+
+This tutorial (21-observability-debugging) takes a different approach: it focuses on
+**tracing and debugging** using the Strands SDK's built-in `StrandsTelemetry` class.
+This gives you:
+
+- Zero infrastructure required to start (console exporter works immediately)
+- Model-agnostic implementation (works with any Strands-supported provider)
+- Debugging-first approach (tool failures, context pressure, unexpected behavior)
+- Multiple backend options via a single OTLP configuration change
+- Testable in isolation without deploying any AWS resources
+
+## Tutorial Structure
+
+```
+python/01-learn/21-observability-debugging/
+├── README.md                    # This file — overview, prerequisites, structure
+├── requirements.txt             # Python dependencies for all notebooks
+├── trace_utils.py               # Shared trace formatting and analysis helpers
+├── 01_tracing_setup.ipynb       # Configure telemetry and first traced invocation
+├── 02_trace_hierarchy.ipynb     # Understand the span hierarchy
+├── 03_debugging_tools.ipynb     # Debug tool failures and context pressure
+├── 04_backend_export.ipynb      # Export to CloudWatch, Langfuse, Jaeger
+└── 05_custom_metrics.ipynb      # Custom attributes, metrics, production config
+```
+
+### Notebook Progression
+
+Each notebook builds on the prior one, but can be understood independently if you
+read the setup cell at the top:
+
+1. **01_tracing_setup.ipynb** — Start here. Learn how `StrandsTelemetry` manages the
+   global tracer and how every `Agent` instance automatically picks it up. Run your
+   first traced invocation and capture spans programmatically.
+
+2. **02_trace_hierarchy.ipynb** — Understand the four span types (Agent, Cycle, Model
+   Invoke, Tool) and their parent-child relationships. Use `trace_utils.py` to
+   format and summarize traces.
+
+3. **03_debugging_tools.ipynb** — Two critical scenarios: finding tool failures via
+   error spans (status, exception events, recovery behavior) and detecting context
+   window pressure by monitoring token growth across cycles.
+
+4. **04_backend_export.ipynb** — Switch from console output to production backends.
+   The same `setup_otlp_exporter()` call works with CloudWatch (via ADOT), Langfuse,
+   Jaeger, and any OTLP-compatible backend.
+
+5. **05_custom_metrics.ipynb** — Add domain-specific span attributes, create counters
+   and histograms for aggregate monitoring, and configure `BatchSpanProcessor` for
+   production deployments.
+
+## Installation
+
+1. Create and activate a virtual environment:
+   ```bash
+   cd python/01-learn/21-observability-debugging
+   python -m venv .venv
+   source .venv/bin/activate  # On Windows: .venv\Scripts\activate
+   ```
+
+2. Install dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. Configure AWS credentials for Bedrock access:
+   ```bash
+   export AWS_DEFAULT_REGION=us-east-1
+   # Ensure your AWS credentials are configured (via AWS CLI, environment variables, or IAM role)
+   ```
+
+4. Launch Jupyter:
+   ```bash
+   jupyter notebook
+   ```
+
+## A Note on Logging
+
+The Strands Agents SDK observability framework covers three telemetry primitives:
+**Traces**, **Metrics**, and **Logs**. This tutorial focuses on Traces (notebooks
+01–04) and Metrics (notebook 05). For logging configuration, refer to the
+[SDK Logs documentation](https://strandsagents.com/docs/user-guide/observability-evaluation/logs/).
+
+## Related Tutorials
+
+### Prior Learning
+
+- [Tutorial 01 - First Agent](../01-first-agent) — Basic agent creation
+- [Tutorial 02 - Tools and MCP](../02-tools-and-mcp) — Tool definition and usage
+- [Tutorial 03 - Model Providers](../03-model-providers) — Model configuration
+
+### Next Steps
+
+- [Tutorial 05 - Guardrails](../05-guardrails) — Add content filtering alongside observability
+- [SDK Observability Docs](https://strandsagents.com/docs/user-guide/observability-evaluation/observability/) — Full observability framework reference
+- [SDK Traces Docs](https://strandsagents.com/docs/user-guide/observability-evaluation/traces/) — Deep dive into trace configuration
diff --git a/python/01-learn/21-observability-debugging/requirements.txt b/python/01-learn/21-observability-debugging/requirements.txt
new file mode 100644
index 00000000..c1fe39f6
--- /dev/null
+++ b/python/01-learn/21-observability-debugging/requirements.txt
@@ -0,0 +1,5 @@
+strands-agents
+strands-agents-tools
+opentelemetry-api
+opentelemetry-sdk
+opentelemetry-exporter-otlp-proto-http
diff --git a/python/01-learn/21-observability-debugging/trace_utils.py b/python/01-learn/21-observability-debugging/trace_utils.py
new file mode 100644
index 00000000..37de6f7d
--- /dev/null
+++ b/python/01-learn/21-observability-debugging/trace_utils.py
@@ -0,0 +1,187 @@
+"""Shared trace utilities for the Observability & Debugging tutorial.
+
+This module provides helper functions for formatting and analyzing
+OpenTelemetry trace spans. It is imported by all notebooks in this tutorial.
+"""
+
+from opentelemetry.sdk.trace import ReadableSpan
+from opentelemetry.trace import StatusCode
+
+
+def format_trace_tree(spans: list[ReadableSpan]) -> str:
+    """Format spans as an indented tree for notebook display.
+
+    Args:
+        spans: List of ReadableSpan objects from a completed trace.
+
+    Returns:
+        A formatted string showing the span tree with status icons,
+        durations, and key attributes.
+
+    Example:
+        >>> spans = memory_exporter.get_finished_spans()
+        >>> print(format_trace_tree(list(spans)))
+        ✓ Agent (1523ms)
+          ✓ Cycle (1520ms)
+            ✓ Model Invoke (1200ms)
+              └─ tokens: in=523 out=89
+            ✓ calculator (3ms)
+              └─ tool: calculator
+    """
+    output = []
+    for span in spans:
+        attrs = span.attributes or {}
+        is_error = (
+            span.status.status_code == StatusCode.ERROR
+            or attrs.get("tool.status") == "error"
+        )
+        status_icon = "✗" if is_error else "✓"
+
+        # Calculate duration
+        if span.end_time and span.start_time:
+            duration_ms = (span.end_time - span.start_time) / 1_000_000
+            duration_str = f"{duration_ms:.0f}ms"
+        else:
+            duration_str = "N/A"
+
+        output.append(f"  {status_icon} {span.name} ({duration_str})")
+
+        # Token usage on model spans
+        if "gen_ai.usage.input_tokens" in attrs:
+            input_t = attrs["gen_ai.usage.input_tokens"]
+            output_t = attrs.get("gen_ai.usage.output_tokens", 0)
+            output.append(f"      └─ tokens: in={input_t} out={output_t}")
+
+        # Tool info on tool spans
+        if "gen_ai.tool.name" in attrs:
+            tool_name = attrs["gen_ai.tool.name"]
+            output.append(f"      └─ tool: {tool_name}")
+
+        # Error details
+        if span.status.status_code == StatusCode.ERROR:
+            desc = span.status.description or "Unknown error"
+            output.append(f"      └─ error: {desc}")
+        elif attrs.get("tool.status") == "error":
+            # Strands records tool errors as attributes
+            for event in span.events:
+                if event.name == "gen_ai.choice":
+                    msg = event.attributes.get("message", "")
+                    if "Error:" in str(msg):
+                        output.append(f"      └─ error: {msg}")
+                        break
+
+        for event in span.events:
+            if event.name == "exception":
+                exc_type = event.attributes.get("exception.type", "")
+                exc_msg = event.attributes.get("exception.message", "")
+                output.append(f"      └─ exception: {exc_type}: {exc_msg}")
+
+    return "\n".join(output)
+
+
+def print_trace_summary(spans: list[ReadableSpan]) -> None:
+    """Print a summary of trace statistics.
+
+    Args:
+        spans: List of ReadableSpan objects from a completed trace.
+
+    Example:
+        >>> spans = memory_exporter.get_finished_spans()
+        >>> print_trace_summary(list(spans))
+        📊 Trace Summary:
+           Total spans: 5
+           Error spans: 1
+           Total duration: 1523ms
+           Success rate: 80%
+    """
+    total_spans = len(spans)
+    error_spans = sum(
+        1 for s in spans
+        if s.status.status_code == StatusCode.ERROR
+        or (s.attributes or {}).get("tool.status") == "error"
+    )
+
+    # Duration from first to last span
+    if spans:
+        root = spans[0]
+        if root.end_time and root.start_time:
+            total_ms = (root.end_time - root.start_time) / 1_000_000
+        else:
+            total_ms = 0
+    else:
+        total_ms = 0
+
+    print("📊 Trace Summary:")
+    print(f"   Total spans: {total_spans}")
+    print(f"   Error spans: {error_spans}")
+    print(f"   Total duration: {total_ms:.0f}ms")
+    print(f"   Success rate: {((total_spans - error_spans) / max(total_spans, 1)) * 100:.0f}%")
+
+
+def find_error_spans(spans: list[ReadableSpan]) -> list[ReadableSpan]:
+    """Filter spans to those with errors (status ERROR or tool.status == 'error').
+
+    The Strands SDK may record tool failures as a span attribute
+    (tool.status = "error") rather than setting the span's status_code to ERROR.
+    This function checks both conditions.
+
+    Args:
+        spans: List of all spans from a trace.
+
+    Returns:
+        List of spans that indicate an error occurred.
+    """
+    error_spans = []
+    for s in spans:
+        if s.status.status_code == StatusCode.ERROR:
+            error_spans.append(s)
+        elif (s.attributes or {}).get("tool.status") == "error":
+            error_spans.append(s)
+    return error_spans
+
+
+def analyze_token_growth(spans: list[ReadableSpan], context_limit: int = 200_000) -> dict:
+    """Analyze token usage growth across model invoke spans.
+
+    Extracts input/output token counts from model spans and calculates
+    growth metrics useful for detecting context window pressure.
+
+    Args:
+        spans: List of all spans from a trace.
+        context_limit: Model's context window size (default: 200K for Claude).
+
+    Returns:
+        Dictionary with growth analysis metrics.
+
+    Example:
+        >>> analysis = analyze_token_growth(list(spans))
+        >>> print(f"Growth: {analysis['growth_pct']:.0f}%")
+    """
+    model_spans = []
+    for span in spans:
+        attrs = span.attributes or {}
+        if "gen_ai.usage.input_tokens" in attrs:
+            model_spans.append({
+                "name": span.name,
+                "input_tokens": attrs["gen_ai.usage.input_tokens"],
+                "output_tokens": attrs.get("gen_ai.usage.output_tokens", 0),
+            })
+
+    if len(model_spans) < 2:
+        return {"model_spans": model_spans, "growth": 0, "growth_pct": 0, "usage_pct": 0}
+
+    first_input = model_spans[0]["input_tokens"]
+    last_input = model_spans[-1]["input_tokens"]
+    growth = last_input - first_input
+    growth_pct = (growth / first_input * 100) if first_input > 0 else 0
+    usage_pct = (last_input / context_limit) * 100
+
+    return {
+        "model_spans": model_spans,
+        "first_input": first_input,
+        "last_input": last_input,
+        "growth": growth,
+        "growth_pct": growth_pct,
+        "usage_pct": usage_pct,
+        "context_limit": context_limit,
+    }