diff --git a/.env.example b/.env.example
index 3fb9b33..7600574 100644
--- a/.env.example
+++ b/.env.example
@@ -1,4 +1,5 @@
 ANTHROPIC_API_KEY=YOUR_ANTHROPIC_API_KEY
 OPENAI_API_KEY=YOUR_OPENAI_API_KEY
-GEMINI_API_KEY=YOUR_GEMINI_API_KEY
-OPENROUTER_API_KEY=YOUR_OPENROUTER_API_KEY
\ No newline at end of file
+GOOGLE_API_KEY=YOUR_GOOGLE_API_KEY
+OPENROUTER_API_KEY=YOUR_OPENROUTER_API_KEY
+OLLAMA_BASE_URL=http://localhost:11434
\ No newline at end of file
diff --git a/.github/workflows/test-and-eval.yml b/.github/workflows/test-and-eval.yml
new file mode 100644
index 0000000..ee80e03
--- /dev/null
+++ b/.github/workflows/test-and-eval.yml
@@ -0,0 +1,31 @@
+name: Tests and Evaluations
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  test-and-eval:
+    runs-on: ubuntu-latest
+    env:
+      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+      GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Install uv  # newer installers use ~/.local/bin, older ones ~/.cargo/bin
+        run: |
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          printf '%s\n' "$HOME/.local/bin" "$HOME/.cargo/bin" >> "$GITHUB_PATH"
+      - name: Install dependencies
+        run: |
+          uv sync --dev
+      - name: Tests
+        run: uv run pytest -q
+      - name: Evaluations (Mock Mode)
+        run: uv run python -m intent_kit.evals.run_all_evals --quiet --mock
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index c06fa2d..ac509ed 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,6 +36,11 @@ ENV/
 htmlcov/
 .pytest_cache/
 .tox/
+reports/
+
+# Evaluation Results
+intent_kit/evals/results/
+intent_kit/evals/reports/
 
 # Visualization
 intentkit_graphs/
diff --git a/README.md b/README.md
index 0990839..a8841c9 100644
--- a/README.md
+++ b/README.md
@@ -801,6 +801,67 @@ pytest tests/
 
 ---
 
+## Evaluation & Benchmarking
+
+intent-kit provides a built-in evaluation framework for benchmarking intent graphs and nodes against real datasets. This is separate from unit/integration tests and is designed for large-scale, reproducible evaluation.
+
+The evaluation framework is now part of the main `intent_kit` package and can be imported as:
+
+```python
+from intent_kit.evals import load_dataset, run_eval, run_eval_from_path
+```
+
+**Organized Structure:**
+- **Latest results**: Always available in `intent_kit/evals/results/latest/` and `intent_kit/evals/reports/latest/`
+- **Date-based archives**: Historical runs are automatically archived in date-based directories
+- **Clean separation**: Reports and raw results are organized separately for easy access
+
+### Running All Evals
+
+To run all evaluations and generate comprehensive markdown reports:
+
+```bash
+# Run with real API calls (requires API keys)
+uv run run-evals
+
+# Run in mock mode (no API keys required)
+uv run run-evals --mock
+```
+
+- Generates a comprehensive report at `reports/comprehensive_report.md`
+- Generates individual reports for each dataset in `reports/`
+- Mock mode uses simulated responses for testing without API costs
+
+### Running a Specific Eval
+
+To run a specific node evaluation (with markdown output):
+
+```bash
+uv run eval-node --dataset handler_node_llm --output reports/my_eval_report.md
+```
+
+- Replace `handler_node_llm` with any dataset name (without .yaml extension)
+- Add `--output <file.md>` to save the report to a specific file
+- Reports are automatically saved to `reports/` directory
+
+### Adding New Evals
+- Add new YAML datasets to `intent_kit/evals/datasets/`
+- Add corresponding node implementations to `intent_kit/evals/sample_nodes/`
+- The framework will automatically discover and evaluate them
+
+### Where are the results?
+- **Latest reports**: `intent_kit/evals/reports/latest/`
+- **Latest results**: `intent_kit/evals/results/latest/`
+- **Date-based archives**: `intent_kit/evals/reports/YYYY-MM-DD/` and `intent_kit/evals/results/YYYY-MM-DD/`
+- Reports are in markdown format for easy sharing and review
+- Raw results are in CSV format for detailed analysis
+
+### When to use evals vs. tests?
+- **Unit/Integration tests** (in `tests/`): For correctness, fast feedback, and CI
+- **Evals** (in `intent_kit/evals/`): For benchmarking, regression, and real-world performance
+
+---
+
 ## Project Structure
 
 ```
@@ -837,6 +898,13 @@ intent-kit/
 │   │   ├── google_client.py
 │   │   ├── ollama_client.py
 │   │   └── __init__.py
+│   ├── evals/               # Evaluation framework
+│   │   ├── __init__.py      # Evaluation exports
+│   │   ├── run_all_evals.py # Run all evaluations
+│   │   ├── run_node_eval.py # Individual node evaluation
+│   │   ├── datasets/        # Evaluation datasets
+│   │   ├── sample_nodes/    # Sample nodes for evaluation
+│   │   └── reports/         # Generated evaluation reports
 │   ├── types.py             # Type definitions
 │   ├── exceptions/          # Custom exceptions
 │   └── utils/               # Utilities
@@ -855,4 +923,135 @@ intent-kit/
 
 ## License
 
-MIT License
\ No newline at end of file
+MIT License
+
+## Evaluation API
+
+The evaluation API provides a clean Python interface for testing your nodes against YAML datasets.
+
+### Basic Usage
+
+```python
+from intent_kit.evals import load_dataset, run_eval
+from intent_kit.evals.sample_nodes.classifier_node_llm import classifier_node_llm
+
+# Load a dataset
+dataset = load_dataset("intent_kit/evals/datasets/classifier_node_llm.yaml")
+
+# Run evaluation
+result = run_eval(dataset, classifier_node_llm)
+
+# Check results
+print(f"Accuracy: {result.accuracy():.1%}")
+print(f"Passed: {result.passed_count()}/{result.total_count()}")
+
+# Save results (using default locations)
+csv_path = result.save_csv()
+json_path = result.save_json()
+md_path = result.save_markdown()
+
+# Or specify custom paths
+result.save_csv("my_results.csv")
+result.save_json("my_results.json")
+result.save_markdown("my_report.md")
+```
+
+### Convenience Functions
+
+```python
+from intent_kit.evals import run_eval_from_path, run_eval_from_module
+
+# Evaluate from file path
+result = run_eval_from_path(
+    "intent_kit/evals/datasets/classifier_node_llm.yaml",
+    classifier_node_llm
+)
+
+# Evaluate with module loading
+result = run_eval_from_module(
+    "intent_kit/evals/datasets/classifier_node_llm.yaml",
+    "intent_kit.evals.sample_nodes.classifier_node_llm",
+    "classifier_node_llm"
+)
+```
+
+### Custom Comparison
+
+```python
+# Case-insensitive comparison
+def case_insensitive_comparator(expected, actual):
+    return str(expected).lower().strip() == str(actual).lower().strip()
+
+result = run_eval(dataset, node, comparator=case_insensitive_comparator)
+```
+
+### Programmatic Datasets
+
+```python
+from intent_kit.evals import EvalTestCase, Dataset
+
+# Create test cases programmatically
+test_cases = [
+    EvalTestCase(
+        input="What's the weather like?",
+        expected="Weather response",
+        context={"user_id": "test"}
+    )
+]
+
+dataset = Dataset(
+    name="my_dataset",
+    description="Custom test dataset",
+    node_type="classifier",
+    node_name="my_node",
+    test_cases=test_cases
+)
+
+result = run_eval(dataset, my_node)
+```
+
+### Dataset Format
+
+YAML datasets should follow this format:
+
+```yaml
+dataset:
+  name: "my_dataset"
+  description: "Test dataset for my node"
+  node_type: "classifier"
+  node_name: "my_node"
+
+test_cases:
+  - input: "What's the weather like in New York?"
+    expected: "Weather in New York: Sunny with a chance of rain"
+    context:
+      user_id: "user123"
+  
+  - input: "Cancel my flight"
+    expected: "Successfully cancelled flight"
+    context:
+      user_id: "user123"
+```
+
+### Error Handling
+
+The API handles errors gracefully:
+
+- **Node exceptions**: Caught and recorded in results
+- **Missing files**: Clear error messages
+- **Malformed datasets**: Validation with helpful error messages
+- **Fail-fast option**: Stop evaluation on first failure
+
+```python
+# Fail-fast evaluation
+result = run_eval(dataset, node, fail_fast=True)
+```
+
+### Output Locations
+
+By default, results are saved to the existing intent-kit directory structure:
+
+- **CSV/JSON results**: `intent_kit/evals/results/latest/`
+- **Markdown reports**: `intent_kit/evals/reports/latest/`
+
+Files are automatically timestamped to avoid conflicts. You can also specify custom paths if needed.
\ No newline at end of file
diff --git a/env.example b/env.example
new file mode 100644
index 0000000..9e20daf
--- /dev/null
+++ b/env.example
@@ -0,0 +1,14 @@
+# Example .env file for Intent Kit LLM evaluations
+# Copy this to .env and add your actual API keys
+
+# OpenAI API Key (for GPT models)
+OPENAI_API_KEY=your-openai-api-key-here
+
+# Anthropic API Key (for Claude models)
+ANTHROPIC_API_KEY=your-anthropic-api-key-here
+
+# Google API Key (for Gemini models)
+GOOGLE_API_KEY=your-google-api-key-here
+
+# Ollama (local models - no API key needed)
+# OLLAMA_BASE_URL=http://localhost:11434 
\ No newline at end of file
diff --git a/examples/advanced_remediation_demo.py b/examples/advanced_remediation_demo.py
index 3db350c..408fd32 100644
--- a/examples/advanced_remediation_demo.py
+++ b/examples/advanced_remediation_demo.py
@@ -28,12 +28,12 @@
 # --- Setup LLM configs ---
 load_dotenv()
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or "sk-mock-openai"
-GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") or "sk-mock-gemini"
+GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") or "sk-mock-gemini"
 
 LLM_CONFIG_1 = {"provider": "openai",
                 "model": "gpt-4.1-mini", "api_key": OPENAI_API_KEY}
 LLM_CONFIG_2 = {"provider": "google",
-                "model": "gemini-2.5-flash", "api_key": GEMINI_API_KEY}
+                "model": "gemini-2.5-flash", "api_key": GOOGLE_API_KEY}
 
 # --- Core Handler: Simulates model confusion and ambiguity ---
 
@@ -134,7 +134,7 @@ def main():
     print("• Consensus voting: Multiple models must agree before output is accepted.")
     print("• Alternate prompt: Handler retries with a new prompt if it can't answer.")
 
-    if "mock" in OPENAI_API_KEY or "mock" in GEMINI_API_KEY:
+    if "mock" in OPENAI_API_KEY or "mock" in GOOGLE_API_KEY:
         print("\n💡 Pro Tip: For real LLM behavior, add your OpenAI and Gemini API keys to a .env file.")
 
 
diff --git a/examples/eval_api_demo.py b/examples/eval_api_demo.py
new file mode 100644
index 0000000..06362c5
--- /dev/null
+++ b/examples/eval_api_demo.py
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+"""
+eval_api_demo.py
+
+Demonstration of the new intent-kit evaluation API.
+"""
+
+from intent_kit.evals import (
+    load_dataset,
+    run_eval,
+    run_eval_from_path,
+    run_eval_from_module,
+    EvalTestCase,
+    Dataset
+)
+from intent_kit.evals.sample_nodes.classifier_node_llm import classifier_node_llm
+
+
+def demo_basic_usage():
+    """Demonstrate basic usage with direct node instance."""
+    print("=== Basic Usage Demo ===")
+
+    # Load dataset
+    dataset = load_dataset(
+        "intent_kit/evals/datasets/classifier_node_llm.yaml")
+    print(f"Loaded dataset: {dataset.name}")
+    print(f"Test cases: {len(dataset.test_cases)}")
+
+    # Run evaluation
+    result = run_eval(dataset, classifier_node_llm)
+
+    # Print results
+    result.print_summary()
+
+    # Save results (using default locations)
+    csv_path = result.save_csv()
+    json_path = result.save_json()
+    md_path = result.save_markdown()
+
+    print(f"Results saved to:")
+    print(f"  CSV: {csv_path}")
+    print(f"  JSON: {json_path}")
+    print(f"  Markdown: {md_path}")
+    return result
+
+
+def demo_from_path():
+    """Demonstrate usage with dataset path."""
+    print("\n=== From Path Demo ===")
+
+    result = run_eval_from_path(
+        "intent_kit/evals/datasets/classifier_node_llm.yaml",
+        classifier_node_llm
+    )
+
+    result.print_summary()
+    return result
+
+
+def demo_from_module():
+    """Demonstrate usage with module loading."""
+    print("\n=== From Module Demo ===")
+
+    result = run_eval_from_module(
+        "intent_kit/evals/datasets/classifier_node_llm.yaml",
+        "intent_kit.evals.sample_nodes.classifier_node_llm",
+        "classifier_node_llm"
+    )
+
+    result.print_summary()
+    return result
+
+
+def demo_custom_comparator():
+    """Demonstrate usage with custom comparison logic."""
+    print("\n=== Custom Comparator Demo ===")
+
+    # Custom comparator for case-insensitive comparison
+    def case_insensitive_comparator(expected, actual):
+        if expected is None or actual is None:
+            return expected == actual
+        return str(expected).lower().strip() == str(actual).lower().strip()
+
+    result = run_eval_from_path(
+        "intent_kit/evals/datasets/classifier_node_llm.yaml",
+        classifier_node_llm,
+        comparator=case_insensitive_comparator
+    )
+
+    result.print_summary()
+    return result
+
+
+def demo_fail_fast():
+    """Demonstrate fail-fast behavior."""
+    print("\n=== Fail Fast Demo ===")
+
+    result = run_eval_from_path(
+        "intent_kit/evals/datasets/classifier_node_llm.yaml",
+        classifier_node_llm,
+        fail_fast=True
+    )
+
+    print(f"Fail-fast evaluation completed with {result.total_count()} tests")
+    return result
+
+
+def demo_programmatic_dataset():
+    """Demonstrate creating a dataset programmatically."""
+    print("\n=== Programmatic Dataset Demo ===")
+
+    # Create test cases programmatically
+    test_cases = [
+        EvalTestCase(
+            input="What's the weather like in Paris?",
+            expected="Weather in Paris: Sunny with a chance of rain",
+            context={"user_id": "demo_user"}
+        ),
+        EvalTestCase(
+            input="Cancel my flight",
+            expected="Successfully cancelled flight",
+            context={"user_id": "demo_user"}
+        )
+    ]
+
+    # Create dataset
+    dataset = Dataset(
+        name="demo_dataset",
+        description="Programmatically created test dataset",
+        node_type="classifier",
+        node_name="classifier_node_llm",
+        test_cases=test_cases
+    )
+
+    # Run evaluation
+    result = run_eval(dataset, classifier_node_llm)
+    result.print_summary()
+
+    return result
+
+
+def demo_error_handling():
+    """Demonstrate error handling with a broken node."""
+    print("\n=== Error Handling Demo ===")
+
+    # Create a broken node that raises exceptions
+    def broken_node(input_text, context=None):
+        if "weather" in input_text.lower():
+            raise ValueError("Weather service is down!")
+        return "Default response"
+
+    # Create a simple test case
+    test_cases = [
+        EvalTestCase(
+            input="What's the weather like?",
+            expected="Weather response",
+            context={}
+        ),
+        EvalTestCase(
+            input="Hello there",
+            expected="Default response",
+            context={}
+        )
+    ]
+
+    dataset = Dataset(
+        name="error_demo",
+        description="Testing error handling",
+        node_type="test",
+        node_name="broken_node",
+        test_cases=test_cases
+    )
+
+    result = run_eval(dataset, broken_node)
+    result.print_summary()
+
+    return result
+
+
+def main():
+    """Run every demo in order and print a per-demo accuracy summary.
+
+    A demo that raises is reported and skipped so the rest still run.
+    """
+
+    # Run demos
+    demos = [
+        demo_basic_usage,
+        demo_from_path,
+        demo_from_module,
+        demo_custom_comparator,
+        demo_fail_fast,
+        demo_programmatic_dataset,
+        demo_error_handling
+    ]
+
+    # Keep each result paired with its demo name so a failed demo cannot
+    # shift the labels printed in the summary.
+    results = []
+    for demo in demos:
+        try:
+            results.append((demo.__name__, demo()))
+        except Exception as e:
+            print(f"Demo {demo.__name__} failed: {e}")
+
+    # Summary
+    print("\n=== Summary ===")
+    for name, result in results:
+        print(f"{name}: {result.accuracy():.1%} accuracy")
+
+    print("\nAll demos completed! Saved files are under intent_kit/evals/results/ and intent_kit/evals/reports/.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/intent_kit/classifiers/llm_classifier.py b/intent_kit/classifiers/llm_classifier.py
index 3689809..28527f4 100644
--- a/intent_kit/classifiers/llm_classifier.py
+++ b/intent_kit/classifiers/llm_classifier.py
@@ -181,7 +181,11 @@ def llm_arg_extractor(user_input: str, context: Optional[Dict[str, Any]] = None)
             )
 
             # Get LLM response
-            logger.debug(f"LLM arg extractor config: {llm_config}")
+            # Obfuscate API key in debug log
+            safe_config = llm_config.copy()
+            if "api_key" in safe_config:
+                safe_config["api_key"] = "***OBFUSCATED***"
+            logger.debug(f"LLM arg extractor config: {safe_config}")
             logger.debug(f"LLM arg extractor prompt: {prompt}")
             response = LLMFactory.generate_with_config(llm_config, prompt)
 
diff --git a/intent_kit/evals/__init__.py b/intent_kit/evals/__init__.py
new file mode 100644
index 0000000..b719307
--- /dev/null
+++ b/intent_kit/evals/__init__.py
@@ -0,0 +1,330 @@
+#!/usr/bin/env python3
+"""
+intent_kit.evals
+
+A clean Python API for evaluating intent-kit nodes against YAML datasets.
+"""
+
+import csv
+import importlib
+from typing import Any, Dict, List, Optional, Callable, Union
+from pathlib import Path
+from dataclasses import dataclass
+from datetime import datetime
+import yaml
+
+
+@dataclass
+class EvalTestCase:
+    """A single test case with input, expected output, and optional context."""
+    input: str
+    expected: Any
+    context: Optional[Dict[str, Any]] = None  # normalized to {} in __post_init__
+
+    def __post_init__(self):
+        if self.context is None:
+            self.context = {}
+
+
+@dataclass
+class Dataset:
+    """A dataset containing test cases for evaluating a node."""
+    name: str
+    description: str
+    node_type: str
+    node_name: str
+    test_cases: List[EvalTestCase]
+
+    def __post_init__(self):
+        if self.description is None:
+            self.description = ""
+
+
+@dataclass
+class EvalTestResult:
+    """Result of a single test case evaluation."""
+    input: str
+    expected: Any
+    actual: Any
+    passed: bool
+    context: Dict[str, Any]
+    error: Optional[str] = None
+
+    def __post_init__(self):
+        if self.context is None:
+            self.context = {}
+
+
+class EvalResult:
+    """Results from evaluating a node against a dataset."""
+
+    def __init__(self, results: List[EvalTestResult], dataset_name: str = ""):
+        self.results = results
+        self.dataset_name = dataset_name
+
+    def all_passed(self) -> bool:
+        return all(r.passed for r in self.results)
+
+    def accuracy(self) -> float:
+        if not self.results:
+            return 0.0
+        return sum(1 for r in self.results if r.passed) / len(self.results)
+
+    def passed_count(self) -> int:
+        return sum(1 for r in self.results if r.passed)
+
+    def failed_count(self) -> int:
+        return sum(1 for r in self.results if not r.passed)
+
+    def total_count(self) -> int:
+        return len(self.results)
+
+    def errors(self) -> List[EvalTestResult]:
+        return [r for r in self.results if not r.passed]
+
+    def print_summary(self) -> None:
+        print(f"\nEvaluation Results for {self.dataset_name or 'Dataset'}:")
+        print(
+            f"  Accuracy: {self.accuracy():.1%} ({self.passed_count()}/{self.total_count()})")
+        print(f"  Passed: {self.passed_count()}")
+        print(f"  Failed: {self.failed_count()}")
+        if self.errors():
+            print(f"\nFailed Tests:")
+            for i, error in enumerate(self.errors()[:5]):
+                print(f"  {i+1}. Input: '{error.input}'")
+                print(f"     Expected: '{error.expected}'")
+                print(f"     Actual: '{error.actual}'")
+                if error.error:
+                    print(f"     Error: {error.error}")
+                print()
+            if len(self.errors()) > 5:
+                print(f"  ... and {len(self.errors()) - 5} more failed tests")
+
+    def save_csv(self, path: Optional[str] = None) -> str:
+        if path is None:
+            results_dir = Path(__file__).parent / "results" / "latest"
+            results_dir.mkdir(parents=True, exist_ok=True)
+            timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+            path = str(results_dir /
+                       f"{self.dataset_name}_eval_results_{timestamp}.csv")
+        with open(path, 'w', newline='', encoding='utf-8') as f:
+            writer = csv.writer(f)
+            writer.writerow(['input', 'expected', 'actual',
+                            'passed', 'error', 'context'])
+            for result in self.results:
+                writer.writerow([
+                    result.input,
+                    result.expected,
+                    result.actual,
+                    result.passed,
+                    result.error or '',
+                    str(result.context)
+                ])
+        return str(path)
+
+    def save_json(self, path: Optional[str] = None) -> str:
+        """Write the summary and per-test results as UTF-8 JSON; return the path.
+
+        With no ``path``, a timestamped file is created under ``results/latest``
+        next to this module. Values JSON cannot encode are stored via ``str()``.
+        """
+        import json
+        if path is None:
+            results_dir = Path(__file__).parent / "results" / "latest"
+            results_dir.mkdir(parents=True, exist_ok=True)
+            timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+            path = str(results_dir /
+                       f"{self.dataset_name}_eval_results_{timestamp}.json")
+        data = {
+            'dataset_name': self.dataset_name,
+            'summary': {
+                'accuracy': self.accuracy(),
+                'passed_count': self.passed_count(),
+                'failed_count': self.failed_count(),
+                'total_count': self.total_count()
+            },
+            'results': [
+                {'input': r.input, 'expected': r.expected, 'actual': r.actual,
+                 'passed': r.passed, 'error': r.error, 'context': r.context}
+                for r in self.results
+            ]
+        }
+        # Explicit UTF-8 keeps the output identical across platforms/locales.
+        with open(path, 'w', encoding='utf-8') as f:
+            json.dump(data, f, indent=2, default=str)
+        return str(path)
+
+    def save_markdown(self, path: Optional[str] = None) -> str:
+        """Write a Markdown report (summary, table, failures); return its path."""
+        if path is None:
+            reports_dir = Path(__file__).parent / "reports" / "latest"
+            reports_dir.mkdir(parents=True, exist_ok=True)
+            timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+            path = str(reports_dir / f"{self.dataset_name}_eval_report_{timestamp}.md")
+        report = f"""# Evaluation Report: {self.dataset_name}
+
+**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+
+## Summary
+
+- **Accuracy:** {self.accuracy():.1%} ({self.passed_count()}/{self.total_count()})
+- **Passed:** {self.passed_count()}
+- **Failed:** {self.failed_count()}
+
+## Results
+
+| # | Input | Expected | Actual | Status |
+|---|-------|----------|--------|--------|
+"""
+        for i, result in enumerate(self.results, 1):
+            status = "✅ PASS" if result.passed else "❌ FAIL"
+            report += f"| {i} | `{result.input}` | `{result.expected}` | `{result.actual}` | {status} |\n"
+        if self.errors():
+            report += "\n## Failed Tests\n\n"
+            for i, error in enumerate(self.errors(), 1):
+                report += f"### Failed Test {i}\n\n"
+                report += f"- **Input:** `{error.input}`\n"
+                report += f"- **Expected:** `{error.expected}`\n"
+                report += f"- **Actual:** `{error.actual}`\n"
+                if error.error:
+                    report += f"- **Error:** {error.error}\n"
+                report += "\n"
+        with open(path, 'w', encoding='utf-8') as f:  # status marks are non-ASCII
+            f.write(report)
+        return str(path)
+
+
+def load_dataset(path: Union[str, Path]) -> Dataset:
+    """Load and validate a YAML evaluation dataset.
+
+    Raises FileNotFoundError if ``path`` does not exist and ValueError if the
+    file is empty or lacks a required section or field.
+    """
+    path = Path(path)
+    if not path.exists():
+        raise FileNotFoundError(f"Dataset file not found: {path}")
+    with open(path, 'r', encoding='utf-8') as f:
+        data = yaml.safe_load(f) or {}  # an empty file parses to None
+    if 'dataset' not in data:
+        raise ValueError(f"Dataset file missing 'dataset' section: {path}")
+    dataset_info = data['dataset'] or {}  # a bare "dataset:" key is None
+    for field in ['name', 'node_type', 'node_name']:
+        if field not in dataset_info:
+            raise ValueError(f"Dataset missing required field '{field}': {path}")
+    if 'test_cases' not in data:
+        raise ValueError(f"Dataset file missing 'test_cases' section: {path}")
+    test_cases = []
+    for i, tc_data in enumerate(data['test_cases'] or []):
+        for key in ('input', 'expected'):
+            if key not in tc_data:
+                raise ValueError(f"Test case {i+1} missing '{key}' field: {path}")
+        test_cases.append(EvalTestCase(
+            input=tc_data['input'],
+            expected=tc_data['expected'],
+            context=tc_data.get('context', {})
+        ))
+    return Dataset(
+        name=dataset_info['name'],
+        description=dataset_info.get('description', ''),
+        node_type=dataset_info['node_type'],
+        node_name=dataset_info['node_name'],
+        test_cases=test_cases
+    )
+
+
+def get_node_from_module(module_name: str, node_name: str):
+    try:
+        module = importlib.import_module(module_name)
+        return getattr(module, node_name)
+    except (ImportError, AttributeError) as e:
+        print(f"Error loading node {node_name} from {module_name}: {e}")
+        return None
+
+
+def run_eval(
+    dataset: Dataset,
+    node: Any,
+    comparator: Optional[Callable[[Any, Any], bool]] = None,
+    fail_fast: bool = False
+) -> EvalResult:
+    """Run ``node`` on every test case in ``dataset`` and collect the results.
+
+    ``comparator(expected, actual)`` defaults to ``==``. With ``fail_fast`` the
+    run stops after the first failing case, whether it raised or mismatched.
+    """
+    if comparator is None:
+        def default_comparator(expected, actual):
+            return expected == actual
+        comparator = default_comparator
+    results = []
+    for test_case in dataset.test_cases:
+        try:
+            if callable(node):
+                actual = node(test_case.input, context=test_case.context)
+            elif hasattr(node, 'execute'):
+                from intent_kit.context import IntentContext
+                context = IntentContext()
+                for key, value in test_case.context.items():
+                    context.set(key, value, modified_by="eval")
+                outcome = node.execute(test_case.input, context)
+                actual = outcome.output if outcome.success else None
+                if not outcome.success and outcome.error:
+                    raise Exception(outcome.error.message)
+            else:
+                raise ValueError("Node must be callable or have an .execute() method")
+            passed = comparator(test_case.expected, actual)
+            result = EvalTestResult(
+                input=test_case.input,
+                expected=test_case.expected,
+                actual=actual,
+                passed=passed,
+                context=test_case.context
+            )
+        except Exception as e:
+            # Node/comparator errors are recorded as failures, not raised.
+            result = EvalTestResult(
+                input=test_case.input, expected=test_case.expected, actual=None,
+                passed=False, context=test_case.context, error=str(e)
+            )
+        results.append(result)
+        if fail_fast and not result.passed:
+            break
+    return EvalResult(results, dataset.name)
+
+
+def run_eval_from_path(
+    dataset_path: Union[str, Path],
+    node: Any,
+    comparator: Optional[Callable[[Any, Any], bool]] = None,
+    fail_fast: bool = False
+) -> EvalResult:
+    dataset = load_dataset(dataset_path)
+    return run_eval(dataset, node, comparator, fail_fast)
+
+
+def run_eval_from_module(
+    dataset_path: Union[str, Path],
+    module_name: str,
+    node_name: str,
+    comparator: Optional[Callable[[Any, Any], bool]] = None,
+    fail_fast: bool = False
+) -> EvalResult:
+    dataset = load_dataset(dataset_path)
+    node = get_node_from_module(module_name, node_name)
+    if node is None:
+        raise ValueError(f"Failed to load node {node_name} from {module_name}")
+    return run_eval(dataset, node, comparator, fail_fast)
+
+
+# Control what gets imported when using "from intent_kit.evals import *"
+__all__ = [
+    "EvalTestCase",
+    "Dataset",
+    "EvalTestResult",
+    "EvalResult",
+    "load_dataset",
+    "get_node_from_module",
+    "run_eval",
+    "run_eval_from_path",
+    "run_eval_from_module"
+]
diff --git a/intent_kit/evals/datasets/classifier_node_llm.yaml b/intent_kit/evals/datasets/classifier_node_llm.yaml
new file mode 100644
index 0000000..4801a4a
--- /dev/null
+++ b/intent_kit/evals/datasets/classifier_node_llm.yaml
@@ -0,0 +1,56 @@
+dataset:
+  name: "classifier_node_llm"
+  description: "Test LLM-powered intent classification for weather and cancellation handlers"
+  node_type: "classifier"
+  node_name: "classifier_node_llm"
+
+test_cases:
+  - input: "What's the weather like in New York?"
+    expected: "Weather in New York: Sunny with a chance of rain"
+    context:
+      user_id: "user123"
+    
+  - input: "How's the temperature in London?"
+    expected: "Weather in London: Sunny with a chance of rain"
+    context:
+      user_id: "user123"
+    
+  - input: "Can you tell me the weather forecast for Tokyo?"
+    expected: "Weather in Tokyo: Sunny with a chance of rain"
+    context:
+      user_id: "user123"
+    
+  - input: "What's the weather like today?"
+    expected: "Weather in Unknown: Sunny with a chance of rain"
+    context:
+      user_id: "user123"
+    
+  - input: "I need to cancel my flight reservation"
+    expected: "Successfully cancelled flight reservation"
+    context:
+      user_id: "user123"
+    
+  - input: "Cancel my hotel booking"
+    expected: "Successfully cancelled hotel booking"
+    context:
+      user_id: "user123"
+    
+  - input: "I want to cancel my restaurant reservation"
+    expected: "Successfully cancelled restaurant reservation"
+    context:
+      user_id: "user123"
+    
+  - input: "Please cancel my appointment"
+    expected: "Successfully cancelled appointment"
+    context:
+      user_id: "user123"
+    
+  - input: "Cancel my subscription"
+    expected: "Successfully cancelled subscription"
+    context:
+      user_id: "user123"
+    
+  - input: "I need to cancel my order"
+    expected: "Successfully cancelled order"
+    context:
+      user_id: "user123"
\ No newline at end of file
diff --git a/intent_kit/evals/datasets/handler_node_llm.yaml b/intent_kit/evals/datasets/handler_node_llm.yaml
new file mode 100644
index 0000000..cf72261
--- /dev/null
+++ b/intent_kit/evals/datasets/handler_node_llm.yaml
@@ -0,0 +1,56 @@
+dataset:
+  name: "handler_node_llm"
+  description: "Test LLM-powered argument extraction for booking handler"
+  node_type: "handler"
+  node_name: "handler_node_llm"
+
+test_cases:
+  - input: "I need to book a flight to Paris"
+    expected: "Flight booked to Paris for ASAP (Booking #1)"
+    context:
+      user_id: "user123"
+    
+  - input: "Book me a ticket to Tokyo for next Friday"
+    expected: "Flight booked to Tokyo for next Friday (Booking #2)"
+    context:
+      user_id: "user123"
+    
+  - input: "Can you arrange travel to London tomorrow?"
+    expected: "Flight booked to London for tomorrow (Booking #3)"
+    context:
+      user_id: "user123"
+    
+  - input: "I want to fly to New York"
+    expected: "Flight booked to New York for ASAP (Booking #4)"
+    context:
+      user_id: "user123"
+    
+  - input: "Book a flight to Sydney for December 15th"
+    expected: "Flight booked to Sydney for December 15th (Booking #5)"
+    context:
+      user_id: "user123"
+    
+  - input: "I need to travel to Berlin next week"
+    expected: "Flight booked to Berlin for next week (Booking #6)"
+    context:
+      user_id: "user123"
+    
+  - input: "Can you book me a flight to Rome for the weekend?"
+    expected: "Flight booked to Rome for the weekend (Booking #7)"
+    context:
+      user_id: "user123"
+    
+  - input: "I want to go to Barcelona"
+    expected: "Flight booked to Barcelona for ASAP (Booking #8)"
+    context:
+      user_id: "user123"
+    
+  - input: "Book a trip to Amsterdam for next month"
+    expected: "Flight booked to Amsterdam for next month (Booking #9)"
+    context:
+      user_id: "user123"
+    
+  - input: "I need a flight to Prague as soon as possible"
+    expected: "Flight booked to Prague for ASAP (Booking #10)"
+    context:
+      user_id: "user123" 
\ No newline at end of file
diff --git a/intent_kit/evals/datasets/splitter_node_llm.yaml b/intent_kit/evals/datasets/splitter_node_llm.yaml
new file mode 100644
index 0000000..90a7635
--- /dev/null
+++ b/intent_kit/evals/datasets/splitter_node_llm.yaml
@@ -0,0 +1,56 @@
+dataset:
+  name: "splitter_node_llm"
+  description: "Test LLM-powered text splitting for complex multi-intent scenarios"
+  node_type: "splitter"
+  node_name: "splitter_node_llm"
+
+test_cases:
+  - input: "Book a flight to Paris and check the weather in London"
+    expected: ["Book a flight to Paris", "Check the weather in London"]
+    context:
+      user_id: "user123"
+    
+  - input: "Cancel my reservation and book a new one"
+    expected: ["Cancel my reservation", "Book a new reservation"]
+    context:
+      user_id: "user123"
+    
+  - input: "What's the weather like in Tokyo and can you book me a hotel there?"
+    expected: ["What's the weather like in Tokyo", "Book me a hotel there"]
+    context:
+      user_id: "user123"
+    
+  - input: "I need to cancel my flight and get a refund"
+    expected: ["Cancel my flight", "Get a refund"]
+    context:
+      user_id: "user123"
+    
+  - input: "Check the weather in Berlin and book a restaurant for dinner"
+    expected: ["Check the weather in Berlin", "Book a restaurant for dinner"]
+    context:
+      user_id: "user123"
+    
+  - input: "What's the weather like?"
+    expected: ["What's the weather like?"]
+    context:
+      user_id: "user123"
+    
+  - input: "Book a flight to Rome, check the weather there, and reserve a hotel"
+    expected: ["Book a flight to Rome", "Check the weather there", "Reserve a hotel"]
+    context:
+      user_id: "user123"
+    
+  - input: "Cancel my subscription and order a replacement"
+    expected: ["Cancel my subscription", "Order a replacement"]
+    context:
+      user_id: "user123"
+    
+  - input: "I want to book a flight to Amsterdam and check the weather forecast"
+    expected: ["Book a flight to Amsterdam", "Check the weather forecast"]
+    context:
+      user_id: "user123"
+    
+  - input: "Cancel my appointment and reschedule for next week"
+    expected: ["Cancel my appointment", "Reschedule for next week"]
+    context:
+      user_id: "user123" 
\ No newline at end of file
diff --git a/intent_kit/evals/llm_config.yaml b/intent_kit/evals/llm_config.yaml
new file mode 100644
index 0000000..e4b3321
--- /dev/null
+++ b/intent_kit/evals/llm_config.yaml
@@ -0,0 +1,28 @@
+# LLM Configuration for Intent Kit Evaluations
+# Replace the placeholder API keys below with your own keys (never commit real keys to version control)
+
+openai:
+  api_key: "your-openai-api-key-here"
+  model: "gpt-3.5-turbo"
+  max_tokens: 100
+
+anthropic:
+  api_key: "your-anthropic-api-key-here"
+  model: "claude-3-sonnet-20240229"
+  max_tokens: 100
+
+google:
+  api_key: "your-google-api-key-here"
+  model: "gemini-pro"
+  max_tokens: 100
+
+ollama:
+  api_key: ""  # Ollama does not require an API key
+  model: "llama2"
+  base_url: "http://localhost:11434"
+  max_tokens: 100
+
+# You can also set API keys via environment variables:
+# OPENAI_API_KEY=your-key
+# ANTHROPIC_API_KEY=your-key
+# GOOGLE_API_KEY=your-key 
\ No newline at end of file
diff --git a/intent_kit/evals/run_all_evals.py b/intent_kit/evals/run_all_evals.py
new file mode 100644
index 0000000..4aa20e4
--- /dev/null
+++ b/intent_kit/evals/run_all_evals.py
@@ -0,0 +1,235 @@
+#!/usr/bin/env python3
+"""
+run_all_evals.py
+
+Run evaluations on all datasets and generate comprehensive markdown reports.
+"""
+
+from intent_kit.evals.run_node_eval import load_dataset, get_node_from_module, evaluate_node, generate_markdown_report
+import yaml
+from typing import Dict, List, Any, Optional
+from datetime import datetime
+import sys
+import pathlib
+from dotenv import load_dotenv
+load_dotenv()
+
+
+def run_all_evaluations():
+    """CLI entry point: run every dataset evaluation, write the comprehensive report (plus optional per-dataset reports), archive timestamped copies, and return True."""
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Run all evaluations and generate comprehensive report")
+    parser.add_argument("--output", type=str, default="intent_kit/evals/reports/latest/comprehensive_report.md",
+                        help="Output file for comprehensive report")
+    parser.add_argument("--individual", action="store_true",
+                        help="Also generate individual reports for each dataset")
+    parser.add_argument("--quiet", action="store_true", help="Suppress output messages")
+    parser.add_argument("--llm-config", help="Path to LLM configuration file")
+    parser.add_argument("--mock", action="store_true", help="Run in mock mode without real API calls")
+
+    # Parse args if called as script, otherwise use defaults
+    try:
+        args = parser.parse_args()
+    except SystemExit as exc:
+        if exc.code in (0, None):  # argparse already printed --help: exit instead of running with defaults
+            raise
+        # Unparseable argv (e.g. called as a function inside another CLI): use defaults
+        args = parser.parse_args([])
+
+    # Create organized reports directory structure
+    run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    today = run_timestamp.split("_")[0]  # same instant as run_timestamp, so both names agree across midnight
+    reports_dir = pathlib.Path(__file__).parent / "reports" / "latest"
+    reports_dir.mkdir(parents=True, exist_ok=True)
+    date_reports_dir = pathlib.Path(__file__).parent / "reports" / today
+    date_reports_dir.mkdir(parents=True, exist_ok=True)
+
+    # Set output path (the default value is resolved next to this module rather than the CWD)
+    output_path = pathlib.Path(args.output)
+    if args.output == "intent_kit/evals/reports/latest/comprehensive_report.md":
+        output_path = reports_dir / "comprehensive_report.md"
+    output_path.parent.mkdir(parents=True, exist_ok=True)  # a custom --output may point into a new directory
+
+    if not args.quiet:
+        mode = "MOCK" if args.mock else "LIVE"
+        print(f"Running all evaluations in {mode} mode...")
+    results = run_all_evaluations_internal(
+        args.llm_config, mock_mode=args.mock)
+
+    if not args.quiet:
+        print("Generating comprehensive report...")
+    generate_comprehensive_report(
+        results, str(output_path), run_timestamp=run_timestamp, mock_mode=args.mock)
+
+    # Also write timestamped copy to date-based archive directory
+    date_comprehensive_report_path = date_reports_dir / \
+        f"comprehensive_report_{run_timestamp}.md"
+    # Copy raw bytes: the report contains emoji, so a text-mode copy would depend on the locale encoding
+    date_comprehensive_report_path.write_bytes(output_path.read_bytes())
+    if not args.quiet:
+        print(
+            f"Comprehensive report archived as: {date_comprehensive_report_path}")
+
+    if args.individual:
+        if not args.quiet:
+            print("Generating individual reports...")
+        for result in results:
+            dataset_name = result['dataset']
+            individual_report_path = reports_dir / f"{dataset_name}_report.md"
+            # Write to latest (forward the mode so the report header does not claim "Live" during mock runs)
+            generate_markdown_report(
+                [result], individual_report_path, run_timestamp=run_timestamp, mock_mode=args.mock)
+            # Also write to date-based archive with timestamp in filename
+            date_individual_report_path = date_reports_dir / \
+                f"{dataset_name}_report_{run_timestamp}.md"
+            # Byte-for-byte copy keeps the archive identical to the latest report on every platform
+            date_individual_report_path.write_bytes(individual_report_path.read_bytes())
+            if not args.quiet:
+                print(
+                    f"Individual report written to: {individual_report_path} and archived as {date_individual_report_path}")
+
+    if not args.quiet:
+        print("Evaluation complete!")
+
+    return True
+
+
+def run_all_evaluations_internal(llm_config_path: Optional[str] = None, mock_mode: bool = False) -> List[Dict[str, Any]]:
+    """Evaluate every ``*.yaml`` dataset beside this module and return one result dict per dataset whose node could be loaded."""
+    dataset_dir = pathlib.Path(__file__).parent / "datasets"
+    results = []
+
+    # Load LLM configuration if provided
+    if llm_config_path:
+        import os
+        with open(llm_config_path, 'r') as f:
+            llm_config = yaml.safe_load(f) or {}  # an empty config file parses to None
+
+        # Set environment variables for API keys
+        for provider, config in llm_config.items():
+            if isinstance(config, dict) and config.get("api_key"):  # blank keys must not clobber the environment
+                env_var = f"{provider.upper()}_API_KEY"
+                os.environ[env_var] = str(config["api_key"])
+                print(f"Set {env_var} environment variable (key obfuscated)")
+
+    # Set mock mode environment variable
+    if mock_mode:
+        import os
+        os.environ["INTENT_KIT_MOCK_MODE"] = "1"
+        print("Running in MOCK mode - using simulated responses")
+
+    for dataset_file in sorted(dataset_dir.glob("*.yaml")):  # sorted: glob order is filesystem-dependent
+        print(f"Evaluating {dataset_file.name}...")
+
+        # Load dataset
+        dataset = load_dataset(dataset_file)
+        dataset_name = dataset["dataset"]["name"]
+        node_name = dataset["dataset"]["node_name"]
+
+        # Determine module name based on node name (first "_"-separated token, plus an "_llm" suffix for LLM nodes)
+        if "llm" in node_name:
+            module_name = f"intent_kit.evals.sample_nodes.{node_name.split('_')[0]}_node_llm"
+        else:
+            module_name = f"intent_kit.evals.sample_nodes.{node_name.split('_')[0]}_node"
+
+        # Load node
+        node = get_node_from_module(module_name, node_name)
+        if node is None:
+            print(f"Failed to load node {node_name} from {module_name}")
+            continue
+
+        # Run evaluation
+        test_cases = dataset["test_cases"]
+        result = evaluate_node(node, test_cases, dataset_name)
+        results.append(result)
+
+        # Print results
+        accuracy = result["accuracy"]
+        mode_indicator = "[MOCK]" if mock_mode else ""
+        print(
+            f"  Accuracy: {accuracy:.1%} ({result['correct']}/{result['total_cases']}) {mode_indicator}")
+
+    return results
+
+
+def generate_comprehensive_report(results: List[Dict[str, Any]], output_file: Optional[str] = None, run_timestamp: str = "", mock_mode: bool = False) -> str:
+    """Build the all-datasets markdown report, write it to ``output_file`` as UTF-8 when given, and return the text (``run_timestamp`` is accepted but currently unused)."""
+
+    total_datasets = len(results)
+    total_tests = sum(r["total_cases"] for r in results)
+    total_passed = sum(r["correct"] for r in results)
+    overall_accuracy = total_passed / total_tests if total_tests > 0 else 0.0
+
+    # Count statuses
+    passed_datasets = sum(
+        1 for r in results if r["accuracy"] >= 0.8)  # 80% threshold
+    failed_datasets = total_datasets - passed_datasets
+
+    # Add mock mode indicator
+    mock_indicator = " (MOCK MODE)" if mock_mode else ""
+
+    report = f"""# Comprehensive Evaluation Report{mock_indicator}
+
+**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}  
+**Mode:** {'Mock (simulated responses)' if mock_mode else 'Live (real API calls)'}  
+**Total Datasets:** {total_datasets}  
+**Total Tests:** {total_tests}  
+**Overall Accuracy:** {overall_accuracy:.1%}
+
+## Executive Summary
+
+| Metric | Value |
+|--------|-------|
+| **Datasets Evaluated** | {total_datasets} |
+| **Datasets Passed** | {passed_datasets} |
+| **Datasets Failed** | {failed_datasets} |
+| **Total Tests** | {total_tests} |
+| **Tests Passed** | {total_passed} |
+| **Tests Failed** | {total_tests - total_passed} |
+| **Overall Accuracy** | {overall_accuracy:.1%} |
+
+## Dataset Results
+
+| Dataset | Accuracy | Status | Tests |
+|---------|----------|--------|-------|
+"""
+
+    for result in results:
+        status = "PASSED" if result["accuracy"] >= 0.8 else "FAILED"
+        status_icon = "✅" if status == "PASSED" else "❌"
+
+        report += f"| `{result['dataset']}` | {result['accuracy']:.1%} | {status_icon} {status} | {result['correct']}/{result['total_cases']} |\n"
+
+    # Detailed results for each dataset
+    report += "\n## Detailed Results\n\n"
+
+    for result in results:
+        report += f"### {result['dataset']}\n\n"
+        report += f"**Accuracy:** {result['accuracy']:.1%} ({result['correct']}/{result['total_cases']})  \n"
+        report += f"**Status:** {'PASSED' if result['accuracy'] >= 0.8 else 'FAILED'}\n\n"
+
+        # Show errors if any
+        if result["errors"]:
+            report += "#### Errors\n"
+            for error in result["errors"][:5]:  # Show first 5 errors
+                report += f"- **Case {error['case']}**: {error['input']}\n"
+                report += f"  - Expected: `{error['expected']}`\n"
+                report += f"  - Actual: `{error['actual']}`\n"
+                if error.get('error'):
+                    report += f"  - Error: {error['error']}\n"
+                report += "\n"
+            if len(result["errors"]) > 5:
+                report += f"- ... and {len(result['errors']) - 5} more errors\n\n"
+
+    # Write to file if specified
+    if output_file:
+        with open(output_file, 'w', encoding='utf-8') as f:  # explicit UTF-8: the status icons are not encodable in every locale
+            f.write(report)
+        print(f"Comprehensive report written to: {output_file}")
+
+    return report
+
+
+if __name__ == "__main__":  # script entry point: run the full evaluation suite with CLI arguments
+    run_all_evaluations()
diff --git a/intent_kit/evals/run_node_eval.py b/intent_kit/evals/run_node_eval.py
new file mode 100644
index 0000000..f6dd9ea
--- /dev/null
+++ b/intent_kit/evals/run_node_eval.py
@@ -0,0 +1,438 @@
+#!/usr/bin/env python3
+"""
+run_node_eval.py
+
+Run evaluations on sample nodes using datasets.
+"""
+
+from intent_kit.node.types import ExecutionResult
+from intent_kit.context import IntentContext
+from typing import Dict, Any, List, Optional
+from pathlib import Path
+import yaml
+import sys
+import os
+import importlib
+import argparse
+from dotenv import load_dotenv
+import json
+import csv
+from datetime import datetime
+import uuid
+
+# Add text similarity imports
+from difflib import SequenceMatcher
+import re
+
+load_dotenv()
+
+
+def load_dataset(dataset_path: Path) -> Dict[str, Any]:
+    """Parse the YAML dataset at ``dataset_path`` (read as UTF-8, independent of the locale) and return what ``yaml.safe_load`` produces."""
+    with open(dataset_path, 'r', encoding='utf-8') as f:
+        return yaml.safe_load(f)
+
+
+def get_node_from_module(module_name: str, node_name: str):
+    """Import ``module_name`` and return its ``node_name`` attribute, or None (after printing the error) when either lookup fails."""
+    try:
+        node = getattr(importlib.import_module(module_name), node_name)
+    except (ImportError, AttributeError) as exc:
+        print(f"Error loading node {node_name} from {module_name}: {exc}")
+        node = None
+    return node
+
+
+def save_raw_results_to_csv(dataset_name: str, test_case: Dict[str, Any], actual_output: Any, success: bool, error: Optional[str] = None, similarity_score: Optional[float] = None, run_timestamp: Optional[str] = None):
+    """Append one result row to ``results/latest/<dataset>_results.csv`` and to a timestamped archive CSV; returns ``(latest_path, archive_path)``."""
+    # Create organized results directory structure
+    today = datetime.now().strftime("%Y-%m-%d")
+    run_key = run_timestamp or True  # caller-supplied run id; True stands for "no run id given"
+    if run_timestamp is None:
+        run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+
+    # Create results directory structure
+    results_dir = Path(__file__).parent / "results" / "latest"
+    results_dir.mkdir(parents=True, exist_ok=True)
+
+    # Also create date-based directory for archiving
+    date_dir = Path(__file__).parent / "results" / today
+    date_dir.mkdir(parents=True, exist_ok=True)
+
+    # Create CSV files for this dataset
+    csv_file = results_dir / f"{dataset_name}_results.csv"
+    date_csv_file = date_dir / f"{dataset_name}_results_{run_timestamp}.csv"
+
+    # Prepare row data
+    row_data = {
+        "timestamp": datetime.now().isoformat(),
+        "input": test_case["input"],
+        "expected": test_case["expected"],
+        "actual": actual_output,
+        "success": success,
+        "similarity_score": "" if similarity_score is None else similarity_score,  # keep a real 0.0 score
+        "error": error or "",
+        "context": str(test_case.get("context", {}))
+    }
+
+    # Check if this is the first test case of a run (to reset the files and write the header)
+    if not hasattr(save_raw_results_to_csv, '_first_test_case'):
+        save_raw_results_to_csv._first_test_case = {}
+
+    is_first = save_raw_results_to_csv._first_test_case.get(dataset_name) != run_key  # a new run id starts a fresh file
+    if is_first:
+        save_raw_results_to_csv._first_test_case[dataset_name] = run_key
+        # Clear both files for new evaluation run
+        if csv_file.exists():
+            csv_file.unlink()
+        if date_csv_file.exists():
+            date_csv_file.unlink()
+
+    # Write to latest directory
+    with open(csv_file, 'a', newline='', encoding='utf-8') as f:
+        writer = csv.DictWriter(f, fieldnames=row_data.keys())
+        if is_first:
+            writer.writeheader()
+        writer.writerow(row_data)
+
+    # Write to date-based directory for archiving (always write header for new file)
+    write_header = not date_csv_file.exists()
+    with open(date_csv_file, 'a', newline='', encoding='utf-8') as f:
+        writer = csv.DictWriter(f, fieldnames=row_data.keys())
+        if write_header:
+            writer.writeheader()
+        writer.writerow(row_data)
+
+    return csv_file, date_csv_file
+
+
+def similarity_score(text1: str, text2: str) -> float:
+    """Return the ``SequenceMatcher`` ratio of the two texts after lowercasing, trimming and collapsing whitespace runs."""
+    whitespace_run = re.compile(r'\s+')
+
+    # Case and spacing differences should not count against a match
+    left, right = (
+        whitespace_run.sub(' ', raw.lower().strip()) for raw in (text1, text2)
+    )
+
+    matcher = SequenceMatcher(None, left, right)
+    return matcher.ratio()
+
+
+def chunks_similarity_score(expected_chunks: List[str], actual_chunks: List[str], threshold: float = 0.8) -> tuple[bool, float]:
+    """Compare chunk lists pairwise; return ``(passed, average_similarity)``, or ``(False, 0.0)`` when the lengths differ."""
+    if len(expected_chunks) != len(actual_chunks):
+        return False, 0.0
+    if not expected_chunks:  # two empty lists match perfectly; also avoids dividing by zero below
+        return 1.0 >= threshold, 1.0
+
+    # Chunks are compared position by position, so ordering matters
+    total_score = sum(similarity_score(expected, actual) for expected, actual in zip(expected_chunks, actual_chunks))
+
+    avg_score = total_score / len(expected_chunks)
+    return avg_score >= threshold, avg_score
+
+
+def evaluate_node(node, test_cases: List[Dict[str, Any]], dataset_name: str) -> Dict[str, Any]:
+    """Run ``node.execute`` on every test case, log each outcome to CSV, and return counts, errors, per-case details and accuracy."""
+    results = {
+        "dataset": dataset_name,
+        "total_cases": len(test_cases),
+        "correct": 0,
+        "incorrect": 0,
+        "errors": [],
+        "details": [],
+        "raw_results_file": f"intent_kit/evals/results/latest/{dataset_name}_results.csv"
+    }
+
+    # Generate a unique run timestamp for this evaluation
+    run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+
+    # Check if this node needs persistent context (like handler_node_llm)
+    needs_persistent_context = hasattr(node, 'name') and 'handler_node_llm' in node.name
+
+    # Create persistent context if needed
+    persistent_context = None
+    if needs_persistent_context:
+        persistent_context = IntentContext()
+        # Initialize booking count for handler_node_llm
+        persistent_context.set(
+            "booking_count", 0, modified_by="evaluation_init")
+
+    for i, test_case in enumerate(test_cases):
+        user_input = test_case["input"]
+        expected = test_case["expected"]
+        context_data = test_case.get("context", {})
+
+        # Use persistent context if available, otherwise create new one
+        if persistent_context is not None:
+            context = persistent_context
+            # Update context with test case data
+            for key, value in context_data.items():
+                context.set(key, value, modified_by="test_case")
+        else:
+            # Create new context for each test case
+            context = IntentContext()
+            for key, value in context_data.items():
+                context.set(key, value, modified_by="test_case")
+
+        result = None  # reset per case so a raising case never reports the previous case's result
+        try:
+            # Execute the node
+            result = node.execute(user_input, context)
+
+            if result.success:
+                actual_output = result.output
+                similarity_score_val = None
+
+                if isinstance(actual_output, list):
+                    # For splitters, compare lists using similarity
+                    if isinstance(expected, list):
+                        correct, similarity_score_val = chunks_similarity_score(
+                            expected, actual_output)
+                    else:
+                        correct = False
+                else:
+                    # For handlers and classifiers, compare strings (case- and edge-whitespace-insensitive)
+                    correct = str(actual_output).strip().lower() == str(
+                        expected).strip().lower()
+
+                if correct:
+                    results["correct"] += 1
+                else:
+                    results["incorrect"] += 1
+                    results["errors"].append({
+                        "case": i + 1,
+                        "input": user_input,
+                        "expected": expected,
+                        "actual": actual_output,
+                        "similarity_score": similarity_score_val,
+                        "type": "incorrect_output"
+                    })
+
+                # Save raw result to CSV
+                save_raw_results_to_csv(
+                    dataset_name, test_case, actual_output, correct, similarity_score=similarity_score_val, run_timestamp=run_timestamp)
+            else:
+                results["incorrect"] += 1
+                error_msg = result.error.message if result.error else "Unknown error"
+                results["errors"].append({
+                    "case": i + 1,
+                    "input": user_input,
+                    "expected": expected,
+                    "actual": None,
+                    "type": "execution_failed",
+                    "error": error_msg
+                })
+
+                # Save raw result to CSV
+                save_raw_results_to_csv(
+                    dataset_name, test_case, None, False, error_msg, run_timestamp=run_timestamp)
+
+        except Exception as e:
+            results["incorrect"] += 1
+            error_msg = str(e)
+            results["errors"].append({
+                "case": i + 1,
+                "input": user_input,
+                "expected": expected,
+                "actual": None,
+                "type": "exception",
+                "error": error_msg
+            })
+
+            # Save raw result to CSV
+            save_raw_results_to_csv(
+                dataset_name, test_case, None, False, error_msg, run_timestamp=run_timestamp)
+
+        # Store detailed results ("success" is the node's execution status, not whether the output matched)
+        results["details"].append({
+            "case": i + 1,
+            "input": user_input,
+            "expected": expected,
+            "actual": result.output if result is not None else None,
+            "success": result.success if result is not None else False,
+            "error": result.error.message if result is not None and result.error else None
+        })
+
+    results["accuracy"] = results["correct"] / \
+        results["total_cases"] if results["total_cases"] > 0 else 0
+    return results
+
+
+def generate_markdown_report(results: List[Dict[str, Any]], output_path: Path, run_timestamp: Optional[str] = None, mock_mode: bool = False):
+    """Render ``results`` as markdown and write it (UTF-8) to ``output_path`` and to a timestamped copy under ``reports/<today>/`` beside this module."""
+    # Generate the report content
+    mock_indicator = " (MOCK MODE)" if mock_mode else ""
+    report_content = f"# Node Evaluation Report{mock_indicator}\n\n"
+    report_content += f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
+    report_content += f"Mode: {'Mock (simulated responses)' if mock_mode else 'Live (real API calls)'}\n\n"
+
+    # Summary
+    report_content += "## Summary\n\n"
+    total_cases = sum(r["total_cases"] for r in results)
+    total_correct = sum(r["correct"] for r in results)
+    overall_accuracy = total_correct / total_cases if total_cases > 0 else 0
+
+    report_content += f"- **Total Test Cases**: {total_cases}\n"
+    report_content += f"- **Total Correct**: {total_correct}\n"
+    report_content += f"- **Overall Accuracy**: {overall_accuracy:.1%}\n\n"
+
+    # Individual dataset results
+    report_content += "## Dataset Results\n\n"
+    for result in results:
+        report_content += f"### {result['dataset']}\n"
+        report_content += f"- **Accuracy**: {result['accuracy']:.1%} ({result['correct']}/{result['total_cases']})\n"
+        report_content += f"- **Correct**: {result['correct']}\n"
+        report_content += f"- **Incorrect**: {result['incorrect']}\n"
+        report_content += f"- **Raw Results**: `{result['raw_results_file']}`\n\n"
+
+        # Show errors if any
+        if result["errors"]:
+            report_content += "#### Errors\n"
+            for error in result["errors"][:5]:  # Show first 5 errors
+                report_content += f"- **Case {error['case']}**: {error['input']}\n"
+                report_content += f"  - Expected: `{error['expected']}`\n"
+                report_content += f"  - Actual: `{error['actual']}`\n"
+                if error.get('error'):
+                    report_content += f"  - Error: {error['error']}\n"
+                report_content += "\n"
+            if len(result["errors"]) > 5:
+                report_content += f"- ... and {len(result['errors']) - 5} more errors\n\n"
+
+    # Detailed results table
+    report_content += "## Detailed Results\n\n"
+    report_content += "| Dataset | Accuracy | Correct | Total | Raw Results |\n"
+    report_content += "|---------|----------|---------|-------|-------------|\n"
+    for result in results:
+        report_content += f"| {result['dataset']} | {result['accuracy']:.1%} | {result['correct']} | {result['total_cases']} | `{result['raw_results_file']}` |\n"
+
+    # Write to the specified output path (explicit UTF-8: inputs and outputs may contain non-ASCII text)
+    with open(output_path, 'w', encoding='utf-8') as f:
+        f.write(report_content)
+
+    today = datetime.now().strftime("%Y-%m-%d")
+    if run_timestamp is None:
+        run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    date_reports_dir = Path(__file__).parent / "reports" / today
+    date_reports_dir.mkdir(parents=True, exist_ok=True)
+
+    # Create date-based filename: <output stem>_<run timestamp><output suffix>
+    date_output_path = date_reports_dir / \
+        f"{output_path.stem}_{run_timestamp}{output_path.suffix}"
+    with open(date_output_path, 'w', encoding='utf-8') as f:
+        f.write(report_content)
+
+
+def main():
+    """CLI entry point: evaluate all (or the ``--dataset``-matching) datasets, print per-dataset results, and write a markdown report."""
+    parser = argparse.ArgumentParser(description="Run node evaluations")
+    parser.add_argument("--dataset", help="Specific dataset to run")
+    parser.add_argument("--output", help="Output file for markdown report")
+    parser.add_argument("--llm-config", help="Path to LLM configuration file")
+
+    args = parser.parse_args()
+
+    # Load LLM configuration if provided
+    llm_config = {}
+    if args.llm_config:
+        with open(args.llm_config, 'r') as f:
+            llm_config = yaml.safe_load(f) or {}  # an empty config file parses to None
+
+        # Set environment variables for API keys
+        for provider, config in llm_config.items():
+            if isinstance(config, dict) and config.get("api_key"):  # blank keys must not clobber the environment
+                env_var = f"{provider.upper()}_API_KEY"
+                os.environ[env_var] = str(config["api_key"])
+                print(f"Set {env_var} environment variable")
+
+    # Find datasets
+    datasets_dir = Path(__file__).parent / "datasets"
+    if not datasets_dir.exists():
+        print(f"Datasets directory not found: {datasets_dir}")
+        sys.exit(1)
+
+    dataset_files = sorted(datasets_dir.glob("*.yaml"))  # sorted: glob order is filesystem-dependent
+    if not dataset_files:
+        print(f"No dataset files found in {datasets_dir}")
+        sys.exit(1)
+
+    # Filter to specific dataset if requested (substring match on the file name)
+    if args.dataset:
+        dataset_files = [f for f in dataset_files if args.dataset in f.name]
+        if not dataset_files:
+            print(f"No dataset files found matching '{args.dataset}'")
+            sys.exit(1)
+
+    results = []
+    run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+
+    for dataset_file in dataset_files:
+        print(f"\nEvaluating dataset: {dataset_file.name}")
+
+        # Load dataset
+        dataset = load_dataset(dataset_file)
+        dataset_name = dataset["dataset"]["name"]
+        node_name = dataset["dataset"]["node_name"]
+
+        # Determine module name based on node name (first "_"-separated token, plus an "_llm" suffix for LLM nodes)
+        if "llm" in node_name:
+            module_name = f"intent_kit.evals.sample_nodes.{node_name.split('_')[0]}_node_llm"
+        else:
+            module_name = f"intent_kit.evals.sample_nodes.{node_name.split('_')[0]}_node"
+
+        # Load node
+        node = get_node_from_module(module_name, node_name)
+        if node is None:
+            print(f"Failed to load node {node_name} from {module_name}")
+            continue
+
+        # Run evaluation
+        test_cases = dataset["test_cases"]
+        result = evaluate_node(node, test_cases, dataset_name)
+        results.append(result)
+
+        # Print results
+        accuracy = result["accuracy"]
+        print(f"  Accuracy: {accuracy:.1%} ({result['correct']}/{result['total_cases']})")
+        print(f"  Raw results saved to: {result['raw_results_file']}")
+
+        if result["errors"]:
+            print(f"  Errors: {len(result['errors'])}")
+            for error in result["errors"][:3]:  # Show first 3 errors
+                print(f"    - Case {error['case']}: {error['input']}")
+                print(f"      Expected: {error['expected']}")
+                print(f"      Actual: {error['actual']}")
+
+    # Generate report
+    if results:
+        if args.output:
+            output_path = Path(args.output)
+        else:
+            # Create organized reports directory structure
+            today = datetime.now().strftime("%Y-%m-%d")
+
+            # Create reports directory structure
+            reports_dir = Path(__file__).parent / "reports" / "latest"
+            reports_dir.mkdir(parents=True, exist_ok=True)
+
+            # Also create date-based directory for archiving
+            date_reports_dir = Path(__file__).parent / "reports" / today
+            date_reports_dir.mkdir(parents=True, exist_ok=True)
+
+            output_path = reports_dir / "evaluation_report.md"
+
+        generate_markdown_report(results, output_path,
+                                 run_timestamp=run_timestamp)
+        print(f"\nReport generated: {output_path}")
+
+        # Print summary
+        total_cases = sum(r["total_cases"] for r in results)
+        total_correct = sum(r["correct"] for r in results)
+        overall_accuracy = total_correct / total_cases if total_cases > 0 else 0
+        print(
+            f"\nOverall Accuracy: {overall_accuracy:.1%} ({total_correct}/{total_cases})")
+
+
+if __name__ == "__main__":  # script entry point: run node evaluations with CLI arguments
+    main()
diff --git a/intent_kit/evals/sample_nodes/__init__.py b/intent_kit/evals/sample_nodes/__init__.py
new file mode 100644
index 0000000..f355633
--- /dev/null
+++ b/intent_kit/evals/sample_nodes/__init__.py
@@ -0,0 +1 @@
+"""Sample node implementations for node-level evaluation."""
\ No newline at end of file
diff --git a/intent_kit/evals/sample_nodes/classifier_node_llm.py b/intent_kit/evals/sample_nodes/classifier_node_llm.py
new file mode 100644
index 0000000..799ac2b
--- /dev/null
+++ b/intent_kit/evals/sample_nodes/classifier_node_llm.py
@@ -0,0 +1,329 @@
+from typing import Optional, List, Dict, Any
+from intent_kit.classifiers.node import ClassifierNode
+from intent_kit.handlers.node import HandlerNode
+from intent_kit.context import IntentContext
+from intent_kit.node.base import TreeNode
+
+
+def _regex_weather_location(user_input: str) -> str:
+    """Regex-only location lookup shared by mock mode and the LLM fallback.
+
+    A run of capitalised words after in/for/at is tried first so a name like
+    "New York" is returned whole; the older patterns, which stop at the first
+    word, remain as the fallback. Returns "Unknown" when nothing matches.
+    """
+    import re
+
+    proper_noun = re.search(
+        r'\b(?i:in|for|at)\s+([A-Z][A-Za-z]*(?:\s+[A-Z][A-Za-z]*)*)', user_input)
+    if proper_noun:
+        return proper_noun.group(1)
+
+    location_patterns = [
+        r'(?:in|for|at)\s+([A-Za-z\s]+?)(?:\s|$)',
+        r'(?:weather|temperature|forecast)\s+(?:in|for|at)\s+([A-Za-z\s]+?)(?:\s|$)',
+        r'(?:What\'s|How\'s)\s+(?:the\s+)?(?:weather|temperature)\s+(?:like\s+)?(?:in|for|at)\s+([A-Za-z\s]+?)(?:\?|$)',
+        r'(?:weather|temperature)\s+(?:in|for|at)\s+([A-Za-z\s]+?)(?:\?|$)',
+        r'(?:weather|temperature|forecast)\s+for\s+([A-Za-z\s]+?)(?:\?|$)',
+        r'(?:weather|temperature)\s+in\s+([A-Za-z\s]+?)(?:\?|$)'
+    ]
+    for pattern in location_patterns:
+        location_match = re.search(pattern, user_input, re.IGNORECASE)
+        if location_match:
+            return location_match.group(1).strip()
+    return "Unknown"
+
+
+def extract_weather_args_llm(user_input: str, context: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+    """Extract the ``location`` argument for the weather handler.
+
+    With INTENT_KIT_MOCK_MODE=1 only regex extraction runs. Otherwise the LLM
+    is asked for a JSON object, with regex extraction as the fallback if the
+    call or the parsing fails. ``context`` is accepted but not used.
+
+    Returns ``{"location": name}`` ("Unknown" when none is found). Raises
+    ValueError when the provider API key is unset outside mock mode.
+    """
+    import json
+    import os
+    import re
+
+    if os.getenv("INTENT_KIT_MOCK_MODE") == "1":
+        # Mock responses for testing without API calls
+        return {"location": _regex_weather_location(user_input)}
+
+    # Configure LLM
+    provider = "openai"  # or "anthropic", "google", "ollama"
+    api_key = os.getenv(f"{provider.upper()}_API_KEY")
+
+    if not api_key:
+        raise ValueError(
+            f"Environment variable {provider.upper()}_API_KEY not set")
+
+    llm_config = {
+        "provider": provider,
+        "model": "gpt-4.1-mini",
+        "api_key": api_key
+    }
+
+    # Imported after the mock-mode return so mock runs never load the LLM client.
+    from intent_kit.services.llm_factory import LLMFactory
+
+    try:
+        llm_client = LLMFactory.create_client(llm_config)
+
+        prompt = f"""
+Extract the location from this weather-related user input.
+
+User input: "{user_input}"
+
+Return a JSON object with this field:
+- location: The specific location/city mentioned
+
+Rules:
+- Extract the exact location name (e.g., "New York", "London", "Tokyo")
+- If no location is mentioned, use "Unknown"
+- Be precise and extract the full location name
+
+Examples:
+- "What's the weather like in New York?" → {{"location": "New York"}}
+- "How's the temperature in London?" → {{"location": "London"}}
+- "Can you tell me the weather forecast for Tokyo?" → {{"location": "Tokyo"}}
+- "What's the weather like today?" → {{"location": "Unknown"}}
+
+User input: {user_input}
+JSON:"""
+
+        response = llm_client.generate(prompt, model=llm_config["model"])
+
+        # Extract JSON from response
+        json_match = re.search(r'\{.*\}', response, re.DOTALL)
+        if json_match:
+            result = json.loads(json_match.group())
+            return {"location": result.get("location", "Unknown")}
+    except Exception as e:
+        print(f"LLM weather extraction failed: {e}")
+
+    # Fallback to regex extraction
+    return {"location": _regex_weather_location(user_input)}
+
+
+def weather_handler(location: str, context: IntentContext) -> str:
+    """Return a fixed sample forecast sentence for ``location``; ``context`` is not used."""
+    return f"Weather in {location}: Sunny with a chance of rain"
+
+
+def _regex_cancel_item(user_input: str) -> str:
+    """Regex-only lookup of the thing to cancel, shared by mock mode and the LLM fallback.
+
+    Captures the whole phrase after "cancel [my]" or "cancellation [of] [my]"
+    up to punctuation, a trailing "please" or the end of the input, so that
+    "flight reservation" is not cut down to "flight". When nothing matches
+    the default item "reservation" is returned.
+    """
+    import re
+
+    cancel_match = re.search(
+        r'\bcancel(?:lation)?\s+(?:of\s+)?(?:my\s+)?([^,.?!]+?)(?:\s+please)?\s*(?:[,.?!]|$)',
+        user_input, re.IGNORECASE)
+    return cancel_match.group(1).strip() if cancel_match else "reservation"
+
+
+def extract_cancel_args_llm(user_input: str, context: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+    """Extract the ``item`` argument for the cancellation handler.
+
+    With INTENT_KIT_MOCK_MODE=1 only regex extraction runs. Otherwise the LLM
+    is asked for a JSON object, with regex extraction as the fallback if the
+    call or the parsing fails.
+
+    Args:
+        user_input: Raw user utterance.
+        context: Accepted for extractor-signature compatibility; not used here.
+
+    Returns:
+        ``{"item": description}``; the item is "reservation" when none is found.
+
+    Raises:
+        ValueError: If the provider API-key environment variable is unset
+            outside mock mode.
+    """
+    import json
+    import os
+    import re
+
+    if os.getenv("INTENT_KIT_MOCK_MODE") == "1":
+        # Mock responses for testing without API calls
+        return {"item": _regex_cancel_item(user_input)}
+
+    # Configure LLM
+    provider = "openai"  # or "anthropic", "google", "ollama"
+    api_key = os.getenv(f"{provider.upper()}_API_KEY")
+
+    if not api_key:
+        raise ValueError(
+            f"Environment variable {provider.upper()}_API_KEY not set")
+
+    llm_config = {
+        "provider": provider,
+        "model": "gpt-3.5-turbo",
+        "api_key": api_key
+    }
+
+    # Imported after the mock-mode return so mock runs never load the LLM client.
+    from intent_kit.services.llm_factory import LLMFactory
+
+    try:
+        llm_client = LLMFactory.create_client(llm_config)
+
+        prompt = f"""
+Extract what the user wants to cancel from this user input.
+
+User input: "{user_input}"
+
+Return a JSON object with this field:
+- item: The specific item/reservation to cancel
+
+Rules:
+- Extract the exact item name (e.g., "flight reservation", "hotel booking", "restaurant reservation")
+- Be precise and extract the full item description
+- If no specific item is mentioned, use "reservation"
+
+Examples:
+- "I need to cancel my flight reservation" → {{"item": "flight reservation"}}
+- "Cancel my hotel booking" → {{"item": "hotel booking"}}
+- "I want to cancel my restaurant reservation" → {{"item": "restaurant reservation"}}
+- "Please cancel my appointment" → {{"item": "appointment"}}
+
+User input: {user_input}
+JSON:"""
+
+        response = llm_client.generate(prompt, model=llm_config["model"])
+
+        # Parse the JSON response
+        # Extract JSON from response
+        json_match = re.search(r'\{.*\}', response, re.DOTALL)
+        if json_match:
+            result = json.loads(json_match.group())
+            return {"item": result.get("item", "reservation")}
+    except Exception as e:
+        print(f"LLM cancel extraction failed: {e}")
+
+    # Fallback to regex extraction
+    return {"item": _regex_cancel_item(user_input)}
+
+
+def cancel_handler(item: str, context: IntentContext) -> str:
+    """Return a fixed confirmation sentence naming ``item``; ``context`` is not used."""
+    return f"Successfully cancelled {item}"
+
+
+# Handler nodes: each pairs a handler function with its LLM-backed argument extractor
+weather_handler_node = HandlerNode(
+    name="weather_handler",
+    param_schema={"location": str},
+    handler=weather_handler,
+    arg_extractor=extract_weather_args_llm,
+    description="Get weather information for a location"
+)
+
+cancel_handler_node = HandlerNode(
+    name="cancel_handler",
+    param_schema={"item": str},
+    handler=cancel_handler,
+    arg_extractor=extract_cancel_args_llm,
+    description="Cancel reservations or bookings"
+)
+
+
+def intent_classifier_llm(user_input: str, children: List[TreeNode], context: Optional[Dict[str, Any]] = None) -> Optional[TreeNode]:
+    """Choose the child node that should handle ``user_input``.
+
+    Mock mode (INTENT_KIT_MOCK_MODE=1) routes by keyword and list position.
+    Otherwise the LLM is asked for a handler name; if it names no child, or
+    the call fails, a keyword match against the children is used instead.
+    Returns the selected child or None; raises ValueError when the provider
+    API key is unset outside mock mode. ``context`` is accepted but not used.
+    """
+    import os
+
+    def keyword_match() -> Optional[TreeNode]:
+        """Keyword fallback; only ever returns a node taken from ``children``."""
+        user_input_lower = user_input.lower()
+        if any(word in user_input_lower for word in ["weather", "temperature", "forecast"]):
+            wanted = "weather_handler"
+        elif any(word in user_input_lower for word in ["cancel", "cancellation", "refund"]):
+            wanted = "cancel_handler"
+        else:
+            return None
+        return next((child for child in children if child.name == wanted), None)
+
+    if os.getenv("INTENT_KIT_MOCK_MODE") == "1":
+        # Mock responses for testing without API calls
+        if "weather" in user_input.lower():
+            return children[0] if children else None  # first child (weather handler)
+        elif "cancel" in user_input.lower():
+            return children[1] if len(children) > 1 else None  # second child (cancel handler)
+        return children[0] if children else None  # Default to first child
+
+    # Configure LLM
+    provider = "openai"  # or "anthropic", "google", "ollama"
+    api_key = os.getenv(f"{provider.upper()}_API_KEY")
+
+    if not api_key:
+        raise ValueError(
+            f"Environment variable {provider.upper()}_API_KEY not set")
+
+    llm_config = {
+        "provider": provider,
+        "model": "gpt-3.5-turbo",
+        "api_key": api_key
+    }
+
+    # Imported after the mock-mode return so mock runs never load the LLM client.
+    from intent_kit.services.llm_factory import LLMFactory
+
+    try:
+        llm_client = LLMFactory.create_client(llm_config)
+
+        # Create descriptions of available handlers
+        handler_descriptions = [f"- {child.name}: {child.description}" for child in children]
+
+        prompt = f"""
+Classify the user's intent and return the name of the appropriate handler.
+
+Available handlers:
+{chr(10).join(handler_descriptions)}
+
+User input: "{user_input}"
+
+Rules:
+- If the user asks about weather, temperature, or forecast, return "weather_handler"
+- If the user wants to cancel something, return "cancel_handler"
+- Be precise and match the exact handler name
+
+Return only the handler name (e.g., "weather_handler" or "cancel_handler") or "none" if no handler matches.
+
+Handler:"""
+
+        response = llm_client.generate(prompt, model=llm_config["model"])
+        # The prompt shows handler names in quotes, so strip any quoting the reply echoes back.
+        handler_name = response.strip().strip("\"'`. ").lower()
+
+        # Find the matching handler
+        for child in children:
+            if child.name == handler_name:
+                return child
+    except Exception as e:
+        print(f"LLM classification failed: {e}")
+
+    # Fallback to keyword matching (the LLM named no child, or the call failed)
+    return keyword_match()
+
+
+# Classifier node that routes between the weather and cancel handler nodes via intent_classifier_llm
+classifier_node_llm = ClassifierNode(
+    name="classifier_node_llm",
+    classifier=intent_classifier_llm,
+    children=[weather_handler_node, cancel_handler_node],
+    description="Route user intents to appropriate handlers using LLM classification"
+)
diff --git a/intent_kit/evals/sample_nodes/handler_node_llm.py b/intent_kit/evals/sample_nodes/handler_node_llm.py
new file mode 100644
index 0000000..efa2afc
--- /dev/null
+++ b/intent_kit/evals/sample_nodes/handler_node_llm.py
@@ -0,0 +1,138 @@
+from typing import Optional, Dict, Any
+from intent_kit.handlers.node import HandlerNode
+from intent_kit.context import IntentContext
+from intent_kit.node.types import ExecutionResult
+
+
+def _regex_booking_args(user_input: str, context: Optional[Dict[str, Any]]) -> Dict[str, Any]:
+    """Regex-only booking extraction shared by mock mode and the LLM fallback.
+
+    Returns destination ("Unknown" if absent), date ("ASAP" if absent) and the
+    user_id read from ``context`` ("anonymous" when missing).
+    """
+    import re
+
+    # Anchor on a travel word first so the "to" in "want to fly to Tokyo" is
+    # not taken as the destination marker; then fall back to the generic form.
+    dest_match = re.search(
+        r'\b(?:flights?|fly|travel|go)\s+to\s+([A-Za-z\s]+?)(?:\s|$)', user_input, re.IGNORECASE
+    ) or re.search(
+        r'(?:to|for|in)\s+([A-Za-z\s]+?)(?:\s|$)', user_input, re.IGNORECASE)
+    destination = dest_match.group(1).strip() if dest_match else "Unknown"
+
+    date_match = re.search(r'(?:for|on)\s+(\w+\s+\w+)',
+                           user_input, re.IGNORECASE)
+    date = date_match.group(1) if date_match else "ASAP"
+
+    return {
+        "destination": destination,
+        "date": date,
+        "user_id": context.get("user_id", "anonymous") if context else "anonymous"
+    }
+
+
+def extract_booking_args_llm(user_input: str, context: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+    """Extract ``destination``, ``date`` and ``user_id`` for the booking handler.
+
+    With INTENT_KIT_MOCK_MODE=1 only regex extraction runs. Otherwise the LLM
+    is asked for a JSON object, with regex extraction as the fallback if the
+    call or the parsing fails. ``user_id`` always comes from ``context``
+    ("anonymous" when absent). Raises ValueError when the provider API key is
+    unset outside mock mode.
+    """
+    import json
+    import os
+    import re
+
+    if os.getenv("INTENT_KIT_MOCK_MODE") == "1":
+        # Mock responses for testing without API calls
+        return _regex_booking_args(user_input, context)
+
+    # Configure LLM (you can change this to any supported provider)
+    provider = "openai"  # or "anthropic", "google", "ollama"
+    api_key = os.getenv(f"{provider.upper()}_API_KEY")
+
+    if not api_key:
+        raise ValueError(
+            f"Environment variable {provider.upper()}_API_KEY not set")
+
+    llm_config = {"provider": provider, "model": "gpt-3.5-turbo", "api_key": api_key}
+
+    # Imported after the mock-mode return so mock runs never load the LLM client.
+    from intent_kit.services.llm_factory import LLMFactory
+
+    try:
+        llm_client = LLMFactory.create_client(llm_config)
+
+        prompt = f"""
+Extract booking parameters from this user input. Be precise and extract exactly what the user is asking for.
+
+User input: "{user_input}"
+
+Return a JSON object with these exact fields:
+- destination: The destination city/location (extract the actual place name)
+- date: The specific date mentioned, or "ASAP" if no date is specified
+
+Rules:
+- If the user says "book a flight to X", extract X as destination
+- If the user says "travel to X", extract X as destination  
+- If the user says "fly to X", extract X as destination
+- If the user says "go to X", extract X as destination
+- For dates, extract the exact date mentioned (e.g., "next Friday", "December 15th", "tomorrow")
+- If no date is mentioned, use "ASAP"
+- Clean up any extra words like "for" or "to" from the date field
+
+Examples:
+- "Book a flight to Paris" → {{"destination": "Paris", "date": "ASAP"}}
+- "I want to fly to Tokyo next Friday" → {{"destination": "Tokyo", "date": "next Friday"}}
+- "Travel to London tomorrow" → {{"destination": "London", "date": "tomorrow"}}
+- "Book a flight to Rome for the weekend" → {{"destination": "Rome", "date": "the weekend"}}
+
+User input: {user_input}
+JSON:"""
+
+        response = llm_client.generate(prompt, model=llm_config["model"])
+
+        # Extract JSON from response
+        json_match = re.search(r'\{.*\}', response, re.DOTALL)
+        if json_match:
+            result = json.loads(json_match.group())
+            # A null or empty date counts as unspecified instead of breaking re.sub below
+            date = result.get("date") or "ASAP"
+            if date != "ASAP":
+                # Remove common prefixes that might be extracted
+                date = re.sub(r'^(for|to)\s+', '', str(date), flags=re.IGNORECASE)
+
+            return {
+                "destination": result.get("destination") or "Unknown",
+                "date": date,
+                "user_id": context.get("user_id", "anonymous") if context else "anonymous"
+            }
+    except Exception as e:
+        print(f"LLM extraction failed: {e}")
+
+    # Fallback to simple extraction
+    return _regex_booking_args(user_input, context)
+
+
+def booking_handler(destination: str, date: str, context: IntentContext) -> str:
+    """Record the booking in ``context`` and return a confirmation sentence."""
+    # Bump the running booking counter (0 is passed as the default) and remember the destination
+    booking_count = context.get("booking_count", 0) + 1
+    context.set("booking_count", booking_count, modified_by="booking_handler")
+    context.set("last_destination", destination, modified_by="booking_handler")
+
+    # Use the incremented count for the response
+    return f"Flight booked to {destination} for {date} (Booking #{booking_count})"
+
+
+# Booking handler node: declares user_id as context input and the two keys booking_handler sets as outputs
+handler_node_llm = HandlerNode(
+    name="handler_node_llm",
+    param_schema={"destination": str, "date": str},
+    handler=booking_handler,
+    arg_extractor=extract_booking_args_llm,
+    context_inputs={"user_id"},
+    context_outputs={"booking_count", "last_destination"},
+    description="Handle flight booking requests with LLM-powered argument extraction"
+)
diff --git a/intent_kit/evals/sample_nodes/splitter_node_llm.py b/intent_kit/evals/sample_nodes/splitter_node_llm.py
new file mode 100644
index 0000000..45b4640
--- /dev/null
+++ b/intent_kit/evals/sample_nodes/splitter_node_llm.py
@@ -0,0 +1,105 @@
+from typing import Optional, List, Dict, Any
+from intent_kit.splitters.node import SplitterNode
+from intent_kit.context import IntentContext
+
+
+def split_text_llm(user_input: str, debug: bool = False, context: Optional[Dict[str, Any]] = None) -> List[str]:
+    """Split ``user_input`` into standalone requests.
+
+    Mock mode (INTENT_KIT_MOCK_MODE=1) splits on the first listed conjunction
+    found, ignoring case. Otherwise the LLM is asked for a JSON array; on any
+    failure or an empty result the whole input is returned as one chunk.
+    """
+    import json
+    import os
+    import re
+
+    if os.getenv("INTENT_KIT_MOCK_MODE") == "1":
+        # Mock responses for testing without API calls
+        # Simple splitting based on common conjunctions
+        conjunctions = [" and ", " also ", " plus ",
+                        " as well as ", " furthermore "]
+        for conj in conjunctions:
+            if conj in user_input.lower():
+                # Split ignoring case as well, so " AND " is cut where it was detected
+                parts = re.split(re.escape(conj), user_input, flags=re.IGNORECASE)
+                return [part.strip() for part in parts if part.strip()]
+        # If no conjunctions found, return as single intent
+        return [user_input]
+
+    # Configure LLM
+    provider = "openai"
+    api_key = os.getenv(f"{provider.upper()}_API_KEY")
+
+    if not api_key:
+        raise ValueError(
+            f"Environment variable {provider.upper()}_API_KEY not set")
+
+    llm_config = {"provider": provider, "model": "gpt-4.1-mini", "api_key": api_key}
+
+    # Imported after the mock-mode return so mock runs never load the LLM client.
+    from intent_kit.services.llm_factory import LLMFactory
+
+    try:
+        llm_client = LLMFactory.create_client(llm_config)
+
+        prompt = f"""
+Split this text into separate requests:
+
+"{user_input}"
+
+Return a JSON array of strings. Each string should be a complete, standalone request.
+
+IMPORTANT: Be verbatim. Do not add extra words, change pronouns, or modify the original text. Split exactly as written.
+
+JSON array:"""
+
+        response = llm_client.generate(prompt, model=llm_config["model"])
+
+        # Extract JSON array from response
+        json_match = re.search(r'\[.*\]', response, re.DOTALL)
+        if json_match:
+            result = json.loads(json_match.group())
+            if isinstance(result, list):
+                chunks = [str(item).strip() for item in result if str(item).strip()]
+                if chunks:
+                    return chunks
+
+    except Exception as e:
+        if debug:
+            print(f"LLM splitting failed: {e}")
+
+    # If LLM fails, return the original input as a single item
+    return [user_input]
+
+
+def create_splitter_node_llm():
+    """Build a SplitterNode with no children whose splitter function is split_text_llm."""
+    return SplitterNode(
+        name="splitter_node_llm",
+        splitter_function=split_text_llm,
+        children=[],
+        description="Split complex user inputs into multiple intents using LLM"
+    )
+
+
+# Evaluation adapter: exposes execute() and reports the splitter's chunks as the output
+class SplitterWrapper:
+    """Adapter that calls a splitter node's splitter_function and returns the chunks as ``output``."""
+
+    def __init__(self, splitter_node):
+        self.name = splitter_node.name
+        self.splitter_function = splitter_node.splitter_function
+
+    def execute(self, user_input: str, context=None):
+        chunks = self.splitter_function(
+            user_input, debug=False, context=context)
+        return type('Result', (), {
+            'success': True,
+            'output': chunks,
+            'error': None
+        })()
+
+
+# Module-level instance: a freshly created splitter node wrapped in SplitterWrapper
+splitter_node_llm = SplitterWrapper(create_splitter_node_llm())
diff --git a/pyproject.toml b/pyproject.toml
index 17df942..bb863ed 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,6 +31,11 @@ openai = [
     "ollama>=0.1.0",
 ]
 
+[project.scripts]
+run-evals = "intent_kit.evals.run_all_evals:run_all_evaluations"
+eval-node = "intent_kit.evals.run_node_eval:main"
+eval-api-demo = "examples.eval_api_demo:main"
+
 [tool.setuptools]
 packages = ["intent_kit"] 
 
@@ -44,6 +49,8 @@ dev = [
     "openai>=1.0.0",
     "ollama>=0.1.0",
     "python-dotenv>=1.0.0",
+    "tqdm",
+    "pyyaml",
 ]
 viz = [
     "networkx>=3.5",
diff --git a/tests/test_eval_api.py b/tests/test_eval_api.py
new file mode 100644
index 0000000..0aad71c
--- /dev/null
+++ b/tests/test_eval_api.py
@@ -0,0 +1,312 @@
+#!/usr/bin/env python3
+"""
+test_eval_api.py
+
+Tests for the new evaluation API.
+"""
+
+import pytest
+from pathlib import Path
+from intent_kit.evals import (
+    load_dataset,
+    run_eval,
+    run_eval_from_path,
+    run_eval_from_module,
+    EvalTestCase,
+    Dataset,
+    EvalTestResult,
+    EvalResult
+)
+
+
+def test_load_dataset():
+    """Test loading a dataset from YAML, located relative to this file rather than the cwd."""
+    dataset_path = Path(__file__).resolve().parent.parent / "intent_kit/evals/datasets/classifier_node_llm.yaml"
+    dataset = load_dataset(str(dataset_path))
+
+    assert dataset.name == "classifier_node_llm"
+    assert dataset.node_type == "classifier"
+    assert dataset.node_name == "classifier_node_llm"
+    assert len(dataset.test_cases) > 0
+
+    # Check first test case
+    first_case = dataset.test_cases[0]
+    assert first_case.input == "What's the weather like in New York?"
+    assert first_case.expected == "Weather in New York: Sunny with a chance of rain"
+    assert first_case.context == {"user_id": "user123"}
+
+
+def test_load_dataset_missing_file():
+    """A path that does not exist must raise FileNotFoundError."""
+    # Call form of pytest.raises: it invokes the loader with the bogus path itself
+    pytest.raises(FileNotFoundError, load_dataset, "non_existent_file.yaml")
+
+
+def test_load_dataset_malformed():
+    """A YAML file holding only {"invalid": "data"} must be rejected with ValueError."""
+    import tempfile
+    import yaml
+
+    # delete=False keeps the file on disk once the with-block has closed it
+    handle = tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False)
+    with handle as f:
+        yaml.dump({"invalid": "data"}, f)
+    malformed = Path(handle.name)
+
+    try:
+        pytest.raises(ValueError, load_dataset, handle.name)
+    finally:
+        malformed.unlink()
+
+
+def test_test_case_defaults():
+    """Test that EvalTestCase stores its fields (context is passed explicitly as an empty dict)."""
+    test_case = EvalTestCase(
+        input="test input",
+        expected="test expected",
+        context={}
+    )
+
+    assert test_case.input == "test input"
+    assert test_case.expected == "test expected"
+    assert test_case.context == {}
+
+
+def test_dataset_defaults():
+    """Test that a Dataset built with an explicit empty description keeps it unchanged."""
+    test_cases = [EvalTestCase("input", "expected", {})]
+    dataset = Dataset(
+        name="test",
+        description="",
+        node_type="test",
+        node_name="test_node",
+        test_cases=test_cases
+    )
+
+    assert dataset.description == ""
+
+
+def test_eval_result_methods():
+    """Two passes and one failure give the matching counts, accuracy and error list."""
+    outcomes = [
+        EvalTestResult(f"input{i}", f"expected{i}", f"actual{i}", passed, {})
+        for i, passed in enumerate((True, False, True), start=1)
+    ]
+
+    summary = EvalResult(outcomes, "test_dataset")
+
+    # Raw counts first, then the values derived from them
+    assert summary.total_count() == 3
+    assert summary.passed_count() == 2
+    assert summary.failed_count() == 1
+    assert len(summary.errors()) == 1
+    assert summary.accuracy() == 2/3
+    assert not summary.all_passed()
+
+
+def test_eval_result_empty():
+    """An EvalResult built from no results reports zero for every count."""
+    empty = EvalResult([], "test_dataset")
+
+    for count in (empty.passed_count(), empty.failed_count(), empty.total_count()):
+        assert count == 0
+    assert empty.accuracy() == 0.0
+    assert len(empty.errors()) == 0
+    # Empty results are considered "all passed"
+    assert empty.all_passed()
+
+
+def test_run_eval_with_callable():
+    """A plain callable can be passed to run_eval as the node under evaluation."""
+    def simple_node(input_text, context=None):
+        return f"Processed: {input_text}"
+
+    words = ("hello", "world")
+    cases = [
+        EvalTestCase(word, f"Processed: {word}", {}) for word in words
+    ]
+
+    dataset = Dataset(
+        name="test",
+        description="Test dataset",
+        node_type="test",
+        node_name="simple_node",
+        test_cases=cases
+    )
+
+    outcome = run_eval(dataset, simple_node)
+
+    assert outcome.total_count() == 2
+    assert outcome.all_passed()
+    assert outcome.accuracy() == 1.0
+
+
+def test_run_eval_with_error():
+    """An exception raised by the node is reported as one failed case with its message."""
+    def error_node(input_text, context=None):
+        if "error" not in input_text.lower():
+            return "success"
+        raise ValueError("Intentional error")
+
+    # Only the middle input makes the node raise
+    inputs = ["hello", "error", "world"]
+    cases = [
+        EvalTestCase(text, "success", {}) for text in inputs
+    ]
+
+    dataset = Dataset(
+        name="test",
+        description="Test dataset",
+        node_type="test",
+        node_name="error_node",
+        test_cases=cases
+    )
+
+    outcome = run_eval(dataset, error_node)
+    failures = outcome.errors()
+
+    assert not outcome.all_passed()
+    assert outcome.failed_count() == 1
+    assert outcome.accuracy() == 2/3
+    assert failures[0].error == "Intentional error"
+
+
+def test_run_eval_fail_fast():
+    """With fail_fast=True the run ends at the first failing case."""
+    def error_node(input_text, context=None):
+        if "error" not in input_text.lower():
+            return "success"
+        raise ValueError("Intentional error")
+
+    # "error" fails and stops execution,
+    # so "world" won't run due to fail_fast
+    inputs = ["hello", "error", "world"]
+    cases = [
+        EvalTestCase(text, "success", {}) for text in inputs
+    ]
+
+    dataset = Dataset(
+        name="test",
+        description="Test dataset",
+        node_type="test",
+        node_name="error_node",
+        test_cases=cases
+    )
+
+    outcome = run_eval(dataset, error_node, fail_fast=True)
+    failures = outcome.errors()
+
+    assert outcome.total_count() == 2  # Only first two tests ran
+    assert outcome.failed_count() == 1
+    assert failures[0].error == "Intentional error"
+
+
+def test_run_eval_custom_comparator():
+    """Test that run_eval uses the supplied comparator to decide pass/fail."""
+    def simple_node(input_text, context=None):
+        return input_text.upper()
+
+    def case_insensitive_comparator(expected, actual):
+        return str(expected).lower() == str(actual).lower()
+
+    test_cases = [
+        EvalTestCase("hello", "Hello", {}),
+        EvalTestCase("world", "World", {})
+    ]
+
+    dataset = Dataset(
+        name="test",
+        description="Test dataset",
+        node_type="test",
+        node_name="simple_node",
+        test_cases=test_cases
+    )
+
+    # Expected values differ from the node output in case only, so these pass only via the comparator
+    result = run_eval(dataset, simple_node, comparator=case_insensitive_comparator)
+
+    assert result.accuracy() == 1.0
+    assert result.all_passed()
+
+
+def test_run_eval_from_path():
+    """run_eval_from_path takes a dataset file path in place of a Dataset object."""
+    import tempfile
+    import yaml
+
+    def simple_node(input_text, context=None):
+        return f"Processed: {input_text}"
+
+    # One-case dataset written to disk: a "dataset" header plus "test_cases"
+    header = {
+        "name": "test_dataset",
+        "description": "Test dataset",
+        "node_type": "test",
+        "node_name": "simple_node"
+    }
+    case = {
+        "input": "hello",
+        "expected": "Processed: hello",
+        "context": {"user_id": "test"}
+    }
+    test_data = {"dataset": header, "test_cases": [case]}
+
+    # delete=False keeps the file on disk once the with-block has closed it
+    handle = tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False)
+    with handle as f:
+        yaml.dump(test_data, f)
+    temp_path = handle.name
+
+    try:
+        outcome = run_eval_from_path(temp_path, simple_node)
+        assert outcome.all_passed()
+        assert outcome.accuracy() == 1.0
+    finally:
+        # Remove the temporary dataset whether or not the assertions passed
+        Path(temp_path).unlink()
+
+
+def test_save_results():
+    """Test that each save_* method writes non-empty output to the given path."""
+    results = [
+        EvalTestResult("input1", "expected1", "actual1", True, {}),
+        EvalTestResult("input2", "expected2", "actual2",
+                       False, {}, "test error")
+    ]
+
+    eval_result = EvalResult(results, "test_dataset")
+
+    # CSV: the temp file already exists (empty), so check the size rather than existence
+    import tempfile
+    with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
+        csv_path = f.name
+
+    try:
+        eval_result.save_csv(csv_path)
+        assert Path(csv_path).stat().st_size > 0
+    finally:
+        Path(csv_path).unlink()
+
+    # JSON: same size check as above
+    with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
+        json_path = f.name
+
+    try:
+        eval_result.save_json(json_path)
+        assert Path(json_path).stat().st_size > 0
+    finally:
+        Path(json_path).unlink()
+
+    # Markdown: same size check as above
+    with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f:
+        md_path = f.name
+
+    try:
+        eval_result.save_markdown(md_path)
+        assert Path(md_path).stat().st_size > 0
+    finally:
+        Path(md_path).unlink()
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/uv.lock b/uv.lock
index e22ece0..c9e53c6 100644
--- a/uv.lock
+++ b/uv.lock
@@ -328,6 +328,8 @@ dev = [
     { name = "pytest" },
     { name = "pytest-cov" },
     { name = "python-dotenv" },
+    { name = "pyyaml" },
+    { name = "tqdm" },
 ]
 viz = [
     { name = "networkx" },
@@ -353,6 +355,8 @@ dev = [
     { name = "pytest", specifier = ">=8.4.1" },
     { name = "pytest-cov", specifier = ">=5.0" },
     { name = "python-dotenv", specifier = ">=1.0.0" },
+    { name = "pyyaml" },
+    { name = "tqdm" },
 ]
 viz = [
     { name = "networkx", specifier = ">=3.5" },
@@ -819,6 +823,41 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ab/4b/e37e4e5d5ee1179694917b445768bdbfb084f5a59ecd38089d3413d4c70f/pyvis-0.3.2-py3-none-any.whl", hash = "sha256:5720c4ca8161dc5d9ab352015723abb7a8bb8fb443edeb07f7a322db34a97555", size = 756038, upload-time = "2023-02-24T20:29:46.758Z" },
 ]
 
+[[package]]
+name = "pyyaml"
+version = "6.0.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/54/ed/79a089b6be93607fa5cdaedf301d7dfb23af5f25c398d5ead2525b063e17/pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e", size = 130631, upload-time = "2024-08-06T20:33:50.674Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f8/aa/7af4e81f7acba21a4c6be026da38fd2b872ca46226673c89a758ebdc4fd2/PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774", size = 184612, upload-time = "2024-08-06T20:32:03.408Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/62/b9faa998fd185f65c1371643678e4d58254add437edb764a08c5a98fb986/PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee", size = 172040, upload-time = "2024-08-06T20:32:04.926Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/0c/c804f5f922a9a6563bab712d8dcc70251e8af811fce4524d57c2c0fd49a4/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c", size = 736829, upload-time = "2024-08-06T20:32:06.459Z" },
+    { url = "https://files.pythonhosted.org/packages/51/16/6af8d6a6b210c8e54f1406a6b9481febf9c64a3109c541567e35a49aa2e7/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317", size = 764167, upload-time = "2024-08-06T20:32:08.338Z" },
+    { url = "https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85", size = 762952, upload-time = "2024-08-06T20:32:14.124Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/97/ecc1abf4a823f5ac61941a9c00fe501b02ac3ab0e373c3857f7d4b83e2b6/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4", size = 735301, upload-time = "2024-08-06T20:32:16.17Z" },
+    { url = "https://files.pythonhosted.org/packages/45/73/0f49dacd6e82c9430e46f4a027baa4ca205e8b0a9dce1397f44edc23559d/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e", size = 756638, upload-time = "2024-08-06T20:32:18.555Z" },
+    { url = "https://files.pythonhosted.org/packages/22/5f/956f0f9fc65223a58fbc14459bf34b4cc48dec52e00535c79b8db361aabd/PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5", size = 143850, upload-time = "2024-08-06T20:32:19.889Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/23/8da0bbe2ab9dcdd11f4f4557ccaf95c10b9811b13ecced089d43ce59c3c8/PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44", size = 161980, upload-time = "2024-08-06T20:32:21.273Z" },
+    { url = "https://files.pythonhosted.org/packages/86/0c/c581167fc46d6d6d7ddcfb8c843a4de25bdd27e4466938109ca68492292c/PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab", size = 183873, upload-time = "2024-08-06T20:32:25.131Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/0c/38374f5bb272c051e2a69281d71cba6fdb983413e6758b84482905e29a5d/PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725", size = 173302, upload-time = "2024-08-06T20:32:26.511Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/93/9916574aa8c00aa06bbac729972eb1071d002b8e158bd0e83a3b9a20a1f7/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5", size = 739154, upload-time = "2024-08-06T20:32:28.363Z" },
+    { url = "https://files.pythonhosted.org/packages/95/0f/b8938f1cbd09739c6da569d172531567dbcc9789e0029aa070856f123984/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425", size = 766223, upload-time = "2024-08-06T20:32:30.058Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/2b/614b4752f2e127db5cc206abc23a8c19678e92b23c3db30fc86ab731d3bd/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476", size = 767542, upload-time = "2024-08-06T20:32:31.881Z" },
+    { url = "https://files.pythonhosted.org/packages/d4/00/dd137d5bcc7efea1836d6264f049359861cf548469d18da90cd8216cf05f/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48", size = 731164, upload-time = "2024-08-06T20:32:37.083Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/1f/4f998c900485e5c0ef43838363ba4a9723ac0ad73a9dc42068b12aaba4e4/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b", size = 756611, upload-time = "2024-08-06T20:32:38.898Z" },
+    { url = "https://files.pythonhosted.org/packages/df/d1/f5a275fdb252768b7a11ec63585bc38d0e87c9e05668a139fea92b80634c/PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4", size = 140591, upload-time = "2024-08-06T20:32:40.241Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/e8/4f648c598b17c3d06e8753d7d13d57542b30d56e6c2dedf9c331ae56312e/PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8", size = 156338, upload-time = "2024-08-06T20:32:41.93Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/e3/3af305b830494fa85d95f6d95ef7fa73f2ee1cc8ef5b495c7c3269fb835f/PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba", size = 181309, upload-time = "2024-08-06T20:32:43.4Z" },
+    { url = "https://files.pythonhosted.org/packages/45/9f/3b1c20a0b7a3200524eb0076cc027a970d320bd3a6592873c85c92a08731/PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1", size = 171679, upload-time = "2024-08-06T20:32:44.801Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/9a/337322f27005c33bcb656c655fa78325b730324c78620e8328ae28b64d0c/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133", size = 733428, upload-time = "2024-08-06T20:32:46.432Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/69/864fbe19e6c18ea3cc196cbe5d392175b4cf3d5d0ac1403ec3f2d237ebb5/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484", size = 763361, upload-time = "2024-08-06T20:32:51.188Z" },
+    { url = "https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5", size = 759523, upload-time = "2024-08-06T20:32:53.019Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/b2/e3234f59ba06559c6ff63c4e10baea10e5e7df868092bf9ab40e5b9c56b6/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc", size = 726660, upload-time = "2024-08-06T20:32:54.708Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/0f/25911a9f080464c59fab9027482f822b86bf0608957a5fcc6eaac85aa515/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652", size = 751597, upload-time = "2024-08-06T20:32:56.985Z" },
+    { url = "https://files.pythonhosted.org/packages/14/0d/e2c3b43bbce3cf6bd97c840b46088a3031085179e596d4929729d8d68270/PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183", size = 140527, upload-time = "2024-08-06T20:33:03.001Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446, upload-time = "2024-08-06T20:33:04.33Z" },
+]
+
 [[package]]
 name = "requests"
 version = "2.32.4"