diff --git a/.env.example b/.env.example index 3fb9b33..7600574 100644 --- a/.env.example +++ b/.env.example @@ -1,4 +1,5 @@ ANTHROPIC_API_KEY=YOUR_ANTHROPIC_API_KEY OPENAI_API_KEY=YOUR_OPENAI_API_KEY -GEMINI_API_KEY=YOUR_GEMINI_API_KEY -OPENROUTER_API_KEY=YOUR_OPENROUTER_API_KEY \ No newline at end of file +GOOGLE_API_KEY=YOUR_GOOGLE_API_KEY +OPENROUTER_API_KEY=YOUR_OPENROUTER_API_KEY +OLLAMA_BASE_URL=http://localhost:11434 \ No newline at end of file diff --git a/.github/workflows/test-and-eval.yml b/.github/workflows/test-and-eval.yml new file mode 100644 index 0000000..ee80e03 --- /dev/null +++ b/.github/workflows/test-and-eval.yml @@ -0,0 +1,31 @@ +name: Tests and Evaluations + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + test-and-eval: + runs-on: ubuntu-latest + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + - name: Install uv + run: | + curl -LsSf https://astral.sh/uv/install.sh | sh + echo "$HOME/.cargo/bin" >> $GITHUB_PATH + - name: Install dependencies + run: | + uv sync --dev + - name: Tests + run: uv run pytest -q + - name: Evaluations (Mock Mode) + run: uv run python -m intent_kit.evals.run_all_evals --quiet --mock \ No newline at end of file diff --git a/.gitignore b/.gitignore index c06fa2d..ac509ed 100644 --- a/.gitignore +++ b/.gitignore @@ -36,6 +36,11 @@ ENV/ htmlcov/ .pytest_cache/ .tox/ +reports/ + +# Evaluation Results +intent_kit/evals/results/ +intent_kit/evals/reports/ # Visualization intentkit_graphs/ diff --git a/README.md b/README.md index 0990839..a8841c9 100644 --- a/README.md +++ b/README.md @@ -801,6 +801,67 @@ pytest tests/ --- +## Evaluation & Benchmarking + +intent-kit provides a built-in evaluation framework for benchmarking intent graphs and nodes against real datasets. 
This is separate from unit/integration tests and is designed for large-scale, reproducible evaluation. + +The evaluation framework is now part of the main `intent_kit` package and can be imported as: + +```python +from intent_kit.evals import load_dataset, run_eval, run_eval_from_path +``` + +**Organized Structure:** +- **Latest results**: Always available in `intent_kit/evals/results/latest/` and `intent_kit/evals/reports/latest/` +- **Date-based archives**: Historical runs are automatically archived in date-based directories +- **Clean separation**: Reports and raw results are organized separately for easy access + +### Running All Evals + +To run all evaluations and generate comprehensive markdown reports: + +```bash +# Run with real API calls (requires API keys) +uv run run-evals + +# Run in mock mode (no API keys required) +uv run run-evals --mock +``` + +- Generates a comprehensive report at `intent_kit/evals/reports/latest/comprehensive_report.md` (override with `--output`) +- With `--individual`, also generates a report for each dataset in `intent_kit/evals/reports/latest/` +- Mock mode uses simulated responses for testing without API costs + +### Running a Specific Eval + +To run a specific node evaluation (with markdown output): + +```bash +uv run eval-node --dataset handler_node_llm --output reports/my_eval_report.md +``` + +- Replace `handler_node_llm` with any dataset name (without the `.yaml` extension) +- Add `--output <path>` to save the report to a specific file +- Reports are automatically saved to the `reports/` directory + +### Adding New Evals +- Add new YAML datasets to `intent_kit/evals/datasets/` +- Add corresponding node implementations to `intent_kit/evals/sample_nodes/` +- The framework will automatically discover and evaluate them + +### Where are the results? 
+- **Latest reports**: `intent_kit/evals/reports/latest/` +- **Latest results**: `intent_kit/evals/results/latest/` +- **Date-based archives**: `intent_kit/evals/reports/YYYY-MM-DD/` and `intent_kit/evals/results/YYYY-MM-DD/` +- Reports are in markdown format for easy sharing and review +- Raw results are in CSV format for detailed analysis + +### When to use evals vs. tests? +- **Unit/Integration tests** (in `tests/`): For correctness, fast feedback, and CI +- **Evals** (in `intent_kit/evals/`): For benchmarking, regression, and real-world performance + +--- + ## Project Structure ``` @@ -837,6 +898,13 @@ intent-kit/ │ │ ├── google_client.py │ │ ├── ollama_client.py │ │ └── __init__.py +│ ├── evals/ # Evaluation framework +│ │ ├── __init__.py # Evaluation exports +│ │ ├── run_all_evals.py # Run all evaluations +│ │ ├── run_node_eval.py # Individual node evaluation +│ │ ├── datasets/ # Evaluation datasets +│ │ ├── sample_nodes/ # Sample nodes for evaluation +│ │ └── reports/ # Generated evaluation reports │ ├── types.py # Type definitions │ ├── exceptions/ # Custom exceptions │ └── utils/ # Utilities @@ -855,4 +923,135 @@ intent-kit/ ## License -MIT License \ No newline at end of file +MIT License + +## Evaluation API + +The evaluation API provides a clean Python interface for testing your nodes against YAML datasets. 
+ +### Basic Usage + +```python +from intent_kit.evals import load_dataset, run_eval +from intent_kit.evals.sample_nodes.classifier_node_llm import classifier_node_llm + +# Load a dataset +dataset = load_dataset("intent_kit/evals/datasets/classifier_node_llm.yaml") + +# Run evaluation +result = run_eval(dataset, classifier_node_llm) + +# Check results +print(f"Accuracy: {result.accuracy():.1%}") +print(f"Passed: {result.passed_count()}/{result.total_count()}") + +# Save results (using default locations) +csv_path = result.save_csv() +json_path = result.save_json() +md_path = result.save_markdown() + +# Or specify custom paths +result.save_csv("my_results.csv") +result.save_json("my_results.json") +result.save_markdown("my_report.md") +``` + +### Convenience Functions + +```python +from intent_kit.evals import run_eval_from_path, run_eval_from_module + +# Evaluate from file path +result = run_eval_from_path( + "intent_kit/evals/datasets/classifier_node_llm.yaml", + classifier_node_llm +) + +# Evaluate with module loading +result = run_eval_from_module( + "intent_kit/evals/datasets/classifier_node_llm.yaml", + "intent_kit.evals.sample_nodes.classifier_node_llm", + "classifier_node_llm" +) +``` + +### Custom Comparison + +```python +# Case-insensitive comparison +def case_insensitive_comparator(expected, actual): + return str(expected).lower().strip() == str(actual).lower().strip() + +result = run_eval(dataset, node, comparator=case_insensitive_comparator) +``` + +### Programmatic Datasets + +```python +from intent_kit.evals import EvalTestCase, Dataset + +# Create test cases programmatically +test_cases = [ + EvalTestCase( + input="What's the weather like?", + expected="Weather response", + context={"user_id": "test"} + ) +] + +dataset = Dataset( + name="my_dataset", + description="Custom test dataset", + node_type="classifier", + node_name="my_node", + test_cases=test_cases +) + +result = run_eval(dataset, my_node) +``` + +### Dataset Format + +YAML datasets should 
follow this format: + +```yaml +dataset: + name: "my_dataset" + description: "Test dataset for my node" + node_type: "classifier" + node_name: "my_node" + +test_cases: + - input: "What's the weather like in New York?" + expected: "Weather in New York: Sunny with a chance of rain" + context: + user_id: "user123" + + - input: "Cancel my flight" + expected: "Successfully cancelled flight" + context: + user_id: "user123" +``` + +### Error Handling + +The API handles errors gracefully: + +- **Node exceptions**: Caught and recorded in results +- **Missing files**: Clear error messages +- **Malformed datasets**: Validation with helpful error messages +- **Fail-fast option**: Stop evaluation on first failure + +```python +# Fail-fast evaluation +result = run_eval(dataset, node, fail_fast=True) +``` + +### Output Locations + +By default, results are saved to the existing intent-kit directory structure: + +- **CSV/JSON results**: `intent_kit/evals/results/latest/` +- **Markdown reports**: `intent_kit/evals/reports/latest/` + +Files are automatically timestamped to avoid conflicts. You can also specify custom paths if needed. 
\ No newline at end of file diff --git a/env.example b/env.example new file mode 100644 index 0000000..9e20daf --- /dev/null +++ b/env.example @@ -0,0 +1,14 @@ +# Example .env file for Intent Kit LLM evaluations +# Copy this to .env and add your actual API keys + +# OpenAI API Key (for GPT models) +OPENAI_API_KEY=your-openai-api-key-here + +# Anthropic API Key (for Claude models) +ANTHROPIC_API_KEY=your-anthropic-api-key-here + +# Google API Key (for Gemini models) +GOOGLE_API_KEY=your-google-api-key-here + +# Ollama (local models - no API key needed) +# OLLAMA_BASE_URL=http://localhost:11434 \ No newline at end of file diff --git a/examples/advanced_remediation_demo.py b/examples/advanced_remediation_demo.py index 3db350c..408fd32 100644 --- a/examples/advanced_remediation_demo.py +++ b/examples/advanced_remediation_demo.py @@ -28,12 +28,12 @@ # --- Setup LLM configs --- load_dotenv() OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or "sk-mock-openai" -GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") or "sk-mock-gemini" +GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") or "sk-mock-gemini" LLM_CONFIG_1 = {"provider": "openai", "model": "gpt-4.1-mini", "api_key": OPENAI_API_KEY} LLM_CONFIG_2 = {"provider": "google", - "model": "gemini-2.5-flash", "api_key": GEMINI_API_KEY} + "model": "gemini-2.5-flash", "api_key": GOOGLE_API_KEY} # --- Core Handler: Simulates model confusion and ambiguity --- @@ -134,7 +134,7 @@ def main(): print("• Consensus voting: Multiple models must agree before output is accepted.") print("• Alternate prompt: Handler retries with a new prompt if it can't answer.") - if "mock" in OPENAI_API_KEY or "mock" in GEMINI_API_KEY: + if "mock" in OPENAI_API_KEY or "mock" in GOOGLE_API_KEY: print("\n💡 Pro Tip: For real LLM behavior, add your OpenAI and Gemini API keys to a .env file.") diff --git a/examples/eval_api_demo.py b/examples/eval_api_demo.py new file mode 100644 index 0000000..06362c5 --- /dev/null +++ b/examples/eval_api_demo.py @@ -0,0 +1,215 @@ 
#!/usr/bin/env python3
"""
eval_api_demo.py

Demonstration of the new intent-kit evaluation API.

Each ``demo_*`` function exercises one entry point of ``intent_kit.evals``
and returns the resulting ``EvalResult`` so ``main()`` can print a summary.
"""

from intent_kit.evals import (
    load_dataset,
    run_eval,
    run_eval_from_path,
    run_eval_from_module,
    EvalTestCase,
    Dataset
)
from intent_kit.evals.sample_nodes.classifier_node_llm import classifier_node_llm

# Dataset shared by most demos (path is relative to the repository root).
DATASET_PATH = "intent_kit/evals/datasets/classifier_node_llm.yaml"


def demo_basic_usage():
    """Demonstrate basic usage with direct node instance."""
    print("=== Basic Usage Demo ===")

    # Load dataset
    dataset = load_dataset(DATASET_PATH)
    print(f"Loaded dataset: {dataset.name}")
    print(f"Test cases: {len(dataset.test_cases)}")

    # Run evaluation
    result = run_eval(dataset, classifier_node_llm)

    # Print results
    result.print_summary()

    # Save results (using default locations); each save_* returns the path
    # it actually wrote, so report those rather than guessing.
    csv_path = result.save_csv()
    json_path = result.save_json()
    md_path = result.save_markdown()

    print("Results saved to:")
    print(f"  CSV: {csv_path}")
    print(f"  JSON: {json_path}")
    print(f"  Markdown: {md_path}")
    return result


def demo_from_path():
    """Demonstrate usage with dataset path."""
    print("\n=== From Path Demo ===")

    result = run_eval_from_path(DATASET_PATH, classifier_node_llm)

    result.print_summary()
    return result


def demo_from_module():
    """Demonstrate usage with module loading."""
    print("\n=== From Module Demo ===")

    result = run_eval_from_module(
        DATASET_PATH,
        "intent_kit.evals.sample_nodes.classifier_node_llm",
        "classifier_node_llm"
    )

    result.print_summary()
    return result


def demo_custom_comparator():
    """Demonstrate usage with custom comparison logic."""
    print("\n=== Custom Comparator Demo ===")

    # Custom comparator for case-insensitive comparison
    def case_insensitive_comparator(expected, actual):
        # None only matches None; str(None) must not be compared as text.
        if expected is None or actual is None:
            return expected == actual
        return str(expected).lower().strip() == str(actual).lower().strip()

    result = run_eval_from_path(
        DATASET_PATH,
        classifier_node_llm,
        comparator=case_insensitive_comparator
    )

    result.print_summary()
    return result


def demo_fail_fast():
    """Demonstrate fail-fast behavior."""
    print("\n=== Fail Fast Demo ===")

    result = run_eval_from_path(
        DATASET_PATH,
        classifier_node_llm,
        fail_fast=True
    )

    print(f"Fail-fast evaluation completed with {result.total_count()} tests")
    return result


def demo_programmatic_dataset():
    """Demonstrate creating a dataset programmatically."""
    print("\n=== Programmatic Dataset Demo ===")

    # Create test cases programmatically
    test_cases = [
        EvalTestCase(
            input="What's the weather like in Paris?",
            expected="Weather in Paris: Sunny with a chance of rain",
            context={"user_id": "demo_user"}
        ),
        EvalTestCase(
            input="Cancel my flight",
            expected="Successfully cancelled flight",
            context={"user_id": "demo_user"}
        )
    ]

    # Create dataset
    dataset = Dataset(
        name="demo_dataset",
        description="Programmatically created test dataset",
        node_type="classifier",
        node_name="classifier_node_llm",
        test_cases=test_cases
    )

    # Run evaluation
    result = run_eval(dataset, classifier_node_llm)
    result.print_summary()

    return result


def demo_error_handling():
    """Demonstrate error handling with a broken node."""
    print("\n=== Error Handling Demo ===")

    # Create a broken node that raises exceptions
    def broken_node(input_text, context=None):
        if "weather" in input_text.lower():
            raise ValueError("Weather service is down!")
        return "Default response"

    # One case that triggers the exception and one that succeeds
    test_cases = [
        EvalTestCase(
            input="What's the weather like?",
            expected="Weather response",
            context={}
        ),
        EvalTestCase(
            input="Hello there",
            expected="Default response",
            context={}
        )
    ]

    dataset = Dataset(
        name="error_demo",
        description="Testing error handling",
        node_type="test",
        node_name="broken_node",
        test_cases=test_cases
    )

    result = run_eval(dataset, broken_node)
    result.print_summary()

    return result


def main():
    """Run all demos and print a per-demo accuracy summary."""
    demos = [
        demo_basic_usage,
        demo_from_path,
        demo_from_module,
        demo_custom_comparator,
        demo_fail_fast,
        demo_programmatic_dataset,
        demo_error_handling
    ]

    # Keep the demo name next to its result so the summary stays correctly
    # labelled even when an earlier demo failed and produced no result.
    completed = []
    for demo in demos:
        try:
            completed.append((demo.__name__, demo()))
        except Exception as e:
            # Top-level boundary of a demo script: report and carry on.
            print(f"Demo {demo.__name__} failed: {e}")

    # Summary
    print("\n=== Summary ===")
    for name, result in completed:
        print(f"{name}: {result.accuracy():.1%} accuracy")

    # Only demo_basic_usage writes files, and it prints the exact paths
    # returned by the save_* calls.
    print("\nAll demos completed! Saved files are listed under "
          "'Results saved to:' in the Basic Usage demo output.")


if __name__ == "__main__":
    main()
import csv
import importlib
import json
from typing import Any, Dict, List, Optional, Callable, Union
from pathlib import Path
from dataclasses import dataclass, field
from datetime import datetime


@dataclass
class EvalTestCase:
    """A single test case with input, expected output, and optional context.

    ``context`` may be omitted or passed as ``None``; both become ``{}``.
    """
    input: str
    expected: Any
    context: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        # Callers (and YAML with ``context: null``) may pass None explicitly.
        if self.context is None:
            self.context = {}


@dataclass
class Dataset:
    """A dataset containing test cases for evaluating a node.

    A ``description`` of ``None`` is normalised to the empty string.
    """
    name: str
    description: str
    node_type: str
    node_name: str
    test_cases: List[EvalTestCase]

    def __post_init__(self):
        if self.description is None:
            self.description = ""


@dataclass
class EvalTestResult:
    """Result of a single test case evaluation.

    ``error`` holds ``str(exception)`` when the node or comparator raised,
    otherwise ``None``.
    """
    input: str
    expected: Any
    actual: Any
    passed: bool
    context: Dict[str, Any] = field(default_factory=dict)
    error: Optional[str] = None

    def __post_init__(self):
        if self.context is None:
            self.context = {}


def _md_cell(value: Any) -> str:
    """Render *value* for a markdown table cell.

    Pipes must be escaped even inside code spans and a cell cannot contain a
    line break, otherwise the row is split into the wrong columns.
    """
    return str(value).replace("|", "\\|").replace("\r", " ").replace("\n", " ")


class EvalResult:
    """Results from evaluating a node against a dataset."""

    def __init__(self, results: List[EvalTestResult], dataset_name: str = ""):
        self.results = results
        self.dataset_name = dataset_name

    def all_passed(self) -> bool:
        """Return True when every result passed (also True for no results)."""
        return all(r.passed for r in self.results)

    def accuracy(self) -> float:
        """Return the fraction of passed results, or 0.0 when there are none."""
        if not self.results:
            return 0.0
        return self.passed_count() / len(self.results)

    def passed_count(self) -> int:
        """Return the number of results that passed."""
        return sum(1 for r in self.results if r.passed)

    def failed_count(self) -> int:
        """Return the number of results that did not pass."""
        return sum(1 for r in self.results if not r.passed)

    def total_count(self) -> int:
        """Return the number of recorded results."""
        return len(self.results)

    def errors(self) -> List[EvalTestResult]:
        """Return every non-passing result (mismatches and exceptions)."""
        return [r for r in self.results if not r.passed]

    def print_summary(self) -> None:
        """Print accuracy, counts and up to five failed tests to stdout."""
        failures = self.errors()
        print(f"\nEvaluation Results for {self.dataset_name or 'Dataset'}:")
        print(
            f"  Accuracy: {self.accuracy():.1%} ({self.passed_count()}/{self.total_count()})")
        print(f"  Passed: {self.passed_count()}")
        print(f"  Failed: {self.failed_count()}")
        if failures:
            print("\nFailed Tests:")
            for i, failure in enumerate(failures[:5]):
                print(f"  {i+1}. Input: '{failure.input}'")
                print(f"     Expected: '{failure.expected}'")
                print(f"     Actual: '{failure.actual}'")
                if failure.error:
                    print(f"     Error: {failure.error}")
                print()
            if len(failures) > 5:
                print(f"  ... and {len(failures) - 5} more failed tests")

    def _default_path(self, subdir: str, stem: str, extension: str) -> str:
        """Build (and create the directory for) a timestamped default path.

        The file lands in ``<this package>/<subdir>/latest/`` and is named
        ``<dataset_name>_<stem>_<timestamp>.<extension>``.
        """
        out_dir = Path(__file__).parent / subdir / "latest"
        out_dir.mkdir(parents=True, exist_ok=True)
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        return str(out_dir / f"{self.dataset_name}_{stem}_{timestamp}.{extension}")

    def save_csv(self, path: Optional[str] = None) -> str:
        """Write one CSV row per result and return the path written.

        Args:
            path: Destination file; defaults to a timestamped file under
                ``results/latest`` next to this module.
        """
        if path is None:
            path = self._default_path("results", "eval_results", "csv")
        with open(path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['input', 'expected', 'actual',
                             'passed', 'error', 'context'])
            writer.writerows(
                [r.input, r.expected, r.actual, r.passed,
                 r.error or '', str(r.context)]
                for r in self.results
            )
        return str(path)

    def save_json(self, path: Optional[str] = None) -> str:
        """Write a summary plus all results as JSON and return the path.

        Args:
            path: Destination file; defaults to a timestamped file under
                ``results/latest`` next to this module.

        Raises:
            TypeError: If an expected/actual/context value is not JSON
                serializable (raised by ``json.dump``).
        """
        if path is None:
            path = self._default_path("results", "eval_results", "json")
        data = {
            'dataset_name': self.dataset_name,
            'summary': {
                'accuracy': self.accuracy(),
                'passed_count': self.passed_count(),
                'failed_count': self.failed_count(),
                'total_count': self.total_count()
            },
            'results': [
                {
                    'input': r.input,
                    'expected': r.expected,
                    'actual': r.actual,
                    'passed': r.passed,
                    'error': r.error,
                    'context': r.context
                }
                for r in self.results
            ]
        }
        # Explicit encoding, consistent with save_csv, so the output does not
        # depend on the platform's default locale encoding.
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
        return str(path)

    def save_markdown(self, path: Optional[str] = None) -> str:
        """Write a markdown report and return the path written.

        Args:
            path: Destination file; defaults to a timestamped file under
                ``reports/latest`` next to this module.
        """
        if path is None:
            path = self._default_path("reports", "eval_report", "md")
        lines = [
            f"# Evaluation Report: {self.dataset_name}",
            "",
            f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            "",
            "## Summary",
            "",
            f"- **Accuracy:** {self.accuracy():.1%} ({self.passed_count()}/{self.total_count()})",
            f"- **Passed:** {self.passed_count()}",
            f"- **Failed:** {self.failed_count()}",
            "",
            "## Results",
            "",
            "| # | Input | Expected | Actual | Status |",
            "|---|-------|----------|--------|--------|",
        ]
        for i, result in enumerate(self.results, 1):
            status = "✅ PASS" if result.passed else "❌ FAIL"
            lines.append(
                f"| {i} | `{_md_cell(result.input)}` | `{_md_cell(result.expected)}` "
                f"| `{_md_cell(result.actual)}` | {status} |")
        failures = self.errors()
        if failures:
            lines += ["", "## Failed Tests", ""]
            for i, failure in enumerate(failures, 1):
                lines += [
                    f"### Failed Test {i}",
                    "",
                    f"- **Input:** `{failure.input}`",
                    f"- **Expected:** `{failure.expected}`",
                    f"- **Actual:** `{failure.actual}`",
                ]
                if failure.error:
                    lines.append(f"- **Error:** {failure.error}")
                lines.append("")
        # The report contains non-ASCII status markers; without an explicit
        # encoding the write fails on platforms whose default is not UTF-8.
        with open(path, 'w', encoding='utf-8') as f:
            f.write("\n".join(lines) + "\n")
        return str(path)


def load_dataset(path: Union[str, Path]) -> Dataset:
    """Load and validate a YAML dataset file.

    Args:
        path: Location of the YAML file.

    Returns:
        The parsed ``Dataset``.

    Raises:
        FileNotFoundError: If *path* does not exist.
        ValueError: If the file is empty/not a mapping, lacks the ``dataset``
            or ``test_cases`` section, lacks a required dataset field, or a
            test case lacks ``input``/``expected``.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Dataset file not found: {path}")
    # Imported here so the result classes and run_eval remain importable for
    # programmatically built datasets even where PyYAML is not installed.
    import yaml
    with open(path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    # An empty file parses to None and a top-level list/scalar is not a
    # mapping; report both as a missing section rather than a TypeError.
    if not isinstance(data, dict) or 'dataset' not in data:
        raise ValueError(f"Dataset file missing 'dataset' section: {path}")
    dataset_info = data['dataset']
    if not isinstance(dataset_info, dict):
        dataset_info = {}
    for required in ('name', 'node_type', 'node_name'):
        if required not in dataset_info:
            raise ValueError(
                f"Dataset missing required field '{required}': {path}")
    if not isinstance(data.get('test_cases'), list):
        raise ValueError(f"Dataset file missing 'test_cases' section: {path}")
    test_cases = []
    for number, tc_data in enumerate(data['test_cases'], 1):
        if not isinstance(tc_data, dict) or 'input' not in tc_data:
            raise ValueError(
                f"Test case {number} missing 'input' field: {path}")
        if 'expected' not in tc_data:
            raise ValueError(
                f"Test case {number} missing 'expected' field: {path}")
        test_cases.append(EvalTestCase(
            input=tc_data['input'],
            expected=tc_data['expected'],
            context=tc_data.get('context') or {}
        ))
    return Dataset(
        name=dataset_info['name'],
        description=dataset_info.get('description', ''),
        node_type=dataset_info['node_type'],
        node_name=dataset_info['node_name'],
        test_cases=test_cases
    )


def get_node_from_module(module_name: str, node_name: str):
    """Import *module_name* and return its attribute *node_name*.

    Returns ``None`` (after printing the reason) when the module cannot be
    imported or does not define the attribute.
    """
    try:
        module = importlib.import_module(module_name)
        return getattr(module, node_name)
    except (ImportError, AttributeError) as e:
        print(f"Error loading node {node_name} from {module_name}: {e}")
        return None


def _default_comparator(expected: Any, actual: Any) -> bool:
    """Compare expected and actual output with plain equality."""
    return expected == actual


def _invoke_node(node: Any, user_input: str, context: Dict[str, Any]) -> Any:
    """Call *node* with one test input and return its output.

    Callables are invoked as ``node(user_input, context=context)``. Other
    objects must provide ``execute``; they receive an ``IntentContext``
    populated from *context*, and a failed execution with an error attached
    is raised as an exception carrying that error's message.
    """
    if callable(node):
        return node(user_input, context=context)
    if hasattr(node, 'execute'):
        from intent_kit.context import IntentContext
        intent_context = IntentContext()
        for key, value in context.items():
            intent_context.set(key, value, modified_by="eval")
        outcome = node.execute(user_input, intent_context)
        if outcome.success:
            return outcome.output
        if outcome.error:
            raise RuntimeError(outcome.error.message)
        return None
    raise ValueError("Node must be callable or have an .execute() method")


def run_eval(
    dataset: Dataset,
    node: Any,
    comparator: Optional[Callable[[Any, Any], bool]] = None,
    fail_fast: bool = False
) -> EvalResult:
    """Evaluate *node* against every test case in *dataset*.

    Args:
        dataset: Test cases to run.
        node: A callable taking ``(input, context=...)`` or an object with
            an ``execute(input, context)`` method.
        comparator: ``(expected, actual) -> bool``; defaults to equality.
        fail_fast: Stop after the first test that does not pass, whether it
            raised or merely produced a mismatching output.

    Returns:
        An ``EvalResult`` holding one ``EvalTestResult`` per executed case.
        Exceptions from the node or comparator are recorded on the result,
        never propagated.
    """
    if comparator is None:
        comparator = _default_comparator
    results = []
    for test_case in dataset.test_cases:
        actual = None
        error = None
        try:
            actual = _invoke_node(node, test_case.input, test_case.context)
            passed = bool(comparator(test_case.expected, actual))
        except Exception as e:
            # Evaluation boundary: any failure is recorded as a failed test
            # so one broken case cannot abort the whole run.
            passed = False
            error = str(e)
        results.append(EvalTestResult(
            input=test_case.input,
            expected=test_case.expected,
            actual=actual,
            passed=passed,
            context=test_case.context,
            error=error
        ))
        if fail_fast and not passed:
            break
    return EvalResult(results, dataset.name)


def run_eval_from_path(
    dataset_path: Union[str, Path],
    node: Any,
    comparator: Optional[Callable[[Any, Any], bool]] = None,
    fail_fast: bool = False
) -> EvalResult:
    """Load the dataset at *dataset_path* and evaluate *node* against it."""
    dataset = load_dataset(dataset_path)
    return run_eval(dataset, node, comparator, fail_fast)


def run_eval_from_module(
    dataset_path: Union[str, Path],
    module_name: str,
    node_name: str,
    comparator: Optional[Callable[[Any, Any], bool]] = None,
    fail_fast: bool = False
) -> EvalResult:
    """Load a dataset and a node by module/attribute name, then evaluate.

    Raises:
        ValueError: If the node cannot be loaded from *module_name*.
    """
    dataset = load_dataset(dataset_path)
    node = get_node_from_module(module_name, node_name)
    if node is None:
        raise ValueError(f"Failed to load node {node_name} from {module_name}")
    return run_eval(dataset, node, comparator, fail_fast)


# Control what gets imported when using "from intent_kit.evals import *"
__all__ = [
    "EvalTestCase",
    "Dataset",
    "EvalTestResult",
    "EvalResult",
    "load_dataset",
    "get_node_from_module",
    "run_eval",
    "run_eval_from_path",
    "run_eval_from_module"
]
+ expected: "Weather in London: Sunny with a chance of rain" + context: + user_id: "user123" + + - input: "Can you tell me the weather forecast for Tokyo?" + expected: "Weather in Tokyo: Sunny with a chance of rain" + context: + user_id: "user123" + + - input: "What's the weather like today?" + expected: "Weather in Unknown: Sunny with a chance of rain" + context: + user_id: "user123" + + - input: "I need to cancel my flight reservation" + expected: "Successfully cancelled flight reservation" + context: + user_id: "user123" + + - input: "Cancel my hotel booking" + expected: "Successfully cancelled hotel booking" + context: + user_id: "user123" + + - input: "I want to cancel my restaurant reservation" + expected: "Successfully cancelled restaurant reservation" + context: + user_id: "user123" + + - input: "Please cancel my appointment" + expected: "Successfully cancelled appointment" + context: + user_id: "user123" + + - input: "Cancel my subscription" + expected: "Successfully cancelled subscription" + context: + user_id: "user123" + + - input: "I need to cancel my order" + expected: "Successfully cancelled order" + context: + user_id: "user123" \ No newline at end of file diff --git a/intent_kit/evals/datasets/handler_node_llm.yaml b/intent_kit/evals/datasets/handler_node_llm.yaml new file mode 100644 index 0000000..cf72261 --- /dev/null +++ b/intent_kit/evals/datasets/handler_node_llm.yaml @@ -0,0 +1,56 @@ +dataset: + name: "handler_node_llm" + description: "Test LLM-powered argument extraction for booking handler" + node_type: "handler" + node_name: "handler_node_llm" + +test_cases: + - input: "I need to book a flight to Paris" + expected: "Flight booked to Paris for ASAP (Booking #1)" + context: + user_id: "user123" + + - input: "Book me a ticket to Tokyo for next Friday" + expected: "Flight booked to Tokyo for next Friday (Booking #2)" + context: + user_id: "user123" + + - input: "Can you arrange travel to London tomorrow?" 
+ expected: "Flight booked to London for tomorrow (Booking #3)" + context: + user_id: "user123" + + - input: "I want to fly to New York" + expected: "Flight booked to New York for ASAP (Booking #4)" + context: + user_id: "user123" + + - input: "Book a flight to Sydney for December 15th" + expected: "Flight booked to Sydney for December 15th (Booking #5)" + context: + user_id: "user123" + + - input: "I need to travel to Berlin next week" + expected: "Flight booked to Berlin for next week (Booking #6)" + context: + user_id: "user123" + + - input: "Can you book me a flight to Rome for the weekend?" + expected: "Flight booked to Rome for the weekend (Booking #7)" + context: + user_id: "user123" + + - input: "I want to go to Barcelona" + expected: "Flight booked to Barcelona for ASAP (Booking #8)" + context: + user_id: "user123" + + - input: "Book a trip to Amsterdam for next month" + expected: "Flight booked to Amsterdam for next month (Booking #9)" + context: + user_id: "user123" + + - input: "I need a flight to Prague as soon as possible" + expected: "Flight booked to Prague for ASAP (Booking #10)" + context: + user_id: "user123" \ No newline at end of file diff --git a/intent_kit/evals/datasets/splitter_node_llm.yaml b/intent_kit/evals/datasets/splitter_node_llm.yaml new file mode 100644 index 0000000..90a7635 --- /dev/null +++ b/intent_kit/evals/datasets/splitter_node_llm.yaml @@ -0,0 +1,56 @@ +dataset: + name: "splitter_node_llm" + description: "Test LLM-powered text splitting for complex multi-intent scenarios" + node_type: "splitter" + node_name: "splitter_node_llm" + +test_cases: + - input: "Book a flight to Paris and check the weather in London" + expected: ["Book a flight to Paris", "Check the weather in London"] + context: + user_id: "user123" + + - input: "Cancel my reservation and book a new one" + expected: ["Cancel my reservation", "Book a new reservation"] + context: + user_id: "user123" + + - input: "What's the weather like in Tokyo and can you book me 
a hotel there?" + expected: ["What's the weather like in Tokyo", "Book me a hotel there"] + context: + user_id: "user123" + + - input: "I need to cancel my flight and get a refund" + expected: ["Cancel my flight", "Get a refund"] + context: + user_id: "user123" + + - input: "Check the weather in Berlin and book a restaurant for dinner" + expected: ["Check the weather in Berlin", "Book a restaurant for dinner"] + context: + user_id: "user123" + + - input: "What's the weather like?" + expected: ["What's the weather like?"] + context: + user_id: "user123" + + - input: "Book a flight to Rome, check the weather there, and reserve a hotel" + expected: ["Book a flight to Rome", "Check the weather there", "Reserve a hotel"] + context: + user_id: "user123" + + - input: "Cancel my subscription and order a replacement" + expected: ["Cancel my subscription", "Order a replacement"] + context: + user_id: "user123" + + - input: "I want to book a flight to Amsterdam and check the weather forecast" + expected: ["Book a flight to Amsterdam", "Check the weather forecast"] + context: + user_id: "user123" + + - input: "Cancel my appointment and reschedule for next week" + expected: ["Cancel my appointment", "Reschedule for next week"] + context: + user_id: "user123" \ No newline at end of file diff --git a/intent_kit/evals/llm_config.yaml b/intent_kit/evals/llm_config.yaml new file mode 100644 index 0000000..e4b3321 --- /dev/null +++ b/intent_kit/evals/llm_config.yaml @@ -0,0 +1,28 @@ +# LLM Configuration for Intent Kit Evaluations +# Replace the API keys with your actual keys + +openai: + api_key: "your-openai-api-key-here" + model: "gpt-3.5-turbo" + max_tokens: 100 + +anthropic: + api_key: "your-anthropic-api-key-here" + model: "claude-3-sonnet-20240229" + max_tokens: 100 + +google: + api_key: "your-google-api-key-here" + model: "gemini-pro" + max_tokens: 100 + +ollama: + api_key: "" # Ollama doesn't require API key + model: "llama2" + base_url: "http://localhost:11434" + max_tokens: 
def run_all_evaluations():
    """Run every dataset evaluation from the command line and write reports.

    Parses ``--output``, ``--individual``, ``--quiet``, ``--llm-config`` and
    ``--mock`` from ``sys.argv``, evaluates all datasets, writes the
    comprehensive report (to ``reports/latest`` unless ``--output`` names
    another file) and stores a timestamped copy in ``reports/<YYYY-MM-DD>``.
    With ``--individual`` a per-dataset report is written and archived too.

    Returns:
        True once all requested reports have been written.

    Raises:
        SystemExit: when argparse exits successfully (``--help``).
    """
    import argparse

    default_output = "intent_kit/evals/reports/latest/comprehensive_report.md"

    parser = argparse.ArgumentParser(
        description="Run all evaluations and generate comprehensive report")
    parser.add_argument("--output", type=str, default=default_output,
                        help="Output file for comprehensive report")
    parser.add_argument("--individual", action="store_true",
                        help="Also generate individual reports for each dataset")
    parser.add_argument("--quiet", action="store_true",
                        help="Suppress output messages")
    parser.add_argument("--llm-config", help="Path to LLM configuration file")
    parser.add_argument("--mock", action="store_true",
                        help="Run in mock mode without real API calls")

    try:
        args = parser.parse_args()
    except SystemExit as exit_request:
        # argparse exits with status 0 after printing --help; honour that
        # instead of silently running every evaluation afterwards.
        if not exit_request.code:
            raise
        # Non-zero status means sys.argv belongs to some other program
        # (called as a function), so fall back to the defaults.
        args = parser.parse_args([])

    # Create organized reports directory structure
    run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    today = datetime.now().strftime("%Y-%m-%d")
    reports_root = pathlib.Path(__file__).parent / "reports"
    reports_dir = reports_root / "latest"
    reports_dir.mkdir(parents=True, exist_ok=True)
    date_reports_dir = reports_root / today
    date_reports_dir.mkdir(parents=True, exist_ok=True)

    # The default --output is relative to the repository root; anchor it to
    # this package instead so it works from any working directory.
    if args.output == default_output:
        output_path = reports_dir / "comprehensive_report.md"
    else:
        output_path = pathlib.Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)

    if not args.quiet:
        mode = "MOCK" if args.mock else "LIVE"
        print(f"Running all evaluations in {mode} mode...")
    results = run_all_evaluations_internal(
        args.llm_config, mock_mode=args.mock)

    if not args.quiet:
        print("Generating comprehensive report...")
    generate_comprehensive_report(
        results, str(output_path), run_timestamp=run_timestamp, mock_mode=args.mock)

    # Archive a timestamped copy. Copy bytes so the archive is identical to
    # the report regardless of the encoding it was written with.
    date_comprehensive_report_path = date_reports_dir / \
        f"comprehensive_report_{run_timestamp}.md"
    date_comprehensive_report_path.write_bytes(output_path.read_bytes())
    if not args.quiet:
        print(
            f"Comprehensive report archived as: {date_comprehensive_report_path}")

    if args.individual:
        if not args.quiet:
            print("Generating individual reports...")
        for result in results:
            dataset_name = result['dataset']
            individual_report_path = reports_dir / f"{dataset_name}_report.md"
            # Write to latest; forward mock_mode so the report header does
            # not claim "Live" for a mock run.
            generate_markdown_report(
                [result], individual_report_path,
                run_timestamp=run_timestamp, mock_mode=args.mock)
            # Also write to date-based archive with timestamp in filename
            date_individual_report_path = date_reports_dir / \
                f"{dataset_name}_report_{run_timestamp}.md"
            date_individual_report_path.write_bytes(
                individual_report_path.read_bytes())
            if not args.quiet:
                print(
                    f"Individual report written to: {individual_report_path} and archived as {date_individual_report_path}")

    if not args.quiet:
        print("Evaluation complete!")

    return True
def run_all_evaluations_internal(llm_config_path: Optional[str] = None, mock_mode: bool = False) -> List[Dict[str, Any]]:
    """Run evaluations on all datasets and return results.

    Args:
        llm_config_path: Optional YAML file mapping provider names to
            settings. Every non-empty ``api_key`` is exported as
            ``<PROVIDER>_API_KEY``.
        mock_mode: When true, sets ``INTENT_KIT_MOCK_MODE=1`` so the sample
            nodes use their simulated responses.

    Returns:
        One result dict per dataset (as produced by ``evaluate_node``), in
        dataset file-name order. Datasets whose node cannot be loaded are
        skipped.

    Raises:
        ValueError: if the LLM config file is not a YAML mapping.
    """
    import os

    dataset_dir = pathlib.Path(__file__).parent / "datasets"
    results = []

    # Load LLM configuration if provided
    if llm_config_path:
        with open(llm_config_path, 'r', encoding='utf-8') as f:
            # safe_load returns None for an empty file
            llm_config = yaml.safe_load(f) or {}
        if not isinstance(llm_config, dict):
            raise ValueError(
                f"LLM config {llm_config_path} must be a mapping of providers")

        # Set environment variables for API keys
        for provider, config in llm_config.items():
            if not isinstance(config, dict):
                continue
            api_key = config.get("api_key")
            if not api_key:
                # e.g. the ollama entry: do not clobber a real key with ""
                continue
            env_var = f"{str(provider).upper()}_API_KEY"
            os.environ[env_var] = str(api_key)
            print(f"Set {env_var} environment variable (key obfuscated)")

    # Set mock mode environment variable
    if mock_mode:
        os.environ["INTENT_KIT_MOCK_MODE"] = "1"
        print("Running in MOCK mode - using simulated responses")

    # glob() yields files in arbitrary order; sort for reproducible reports.
    for dataset_file in sorted(dataset_dir.glob("*.yaml")):
        print(f"Evaluating {dataset_file.name}...")

        # Load dataset
        dataset = load_dataset(dataset_file)
        dataset_name = dataset["dataset"]["name"]
        node_name = dataset["dataset"]["node_name"]

        # Sample node modules are named "<kind>_node" or "<kind>_node_llm",
        # where <kind> is the first underscore-separated part of node_name.
        node_kind = node_name.split('_')[0]
        suffix = "_node_llm" if "llm" in node_name else "_node"
        module_name = f"intent_kit.evals.sample_nodes.{node_kind}{suffix}"

        # Load node
        node = get_node_from_module(module_name, node_name)
        if node is None:
            print(f"Failed to load node {node_name} from {module_name}")
            continue

        # Run evaluation
        result = evaluate_node(node, dataset["test_cases"], dataset_name)
        results.append(result)

        # Print results
        accuracy = result["accuracy"]
        mode_indicator = "[MOCK]" if mock_mode else ""
        print(
            f" Accuracy: {accuracy:.1%} ({result['correct']}/{result['total_cases']}) {mode_indicator}")

    return results
def generate_comprehensive_report(results: List[Dict[str, Any]], output_file: Optional[str] = None, run_timestamp: str = "", mock_mode: bool = False) -> str:
    """Generate a comprehensive markdown report for all evaluations.

    Args:
        results: Per-dataset result dicts with the keys ``dataset``,
            ``accuracy``, ``correct``, ``total_cases`` and ``errors``.
        output_file: When given, the report is also written to this path
            as UTF-8; missing parent directories are created.
        run_timestamp: Accepted for interface compatibility; not used in
            the report body.
        mock_mode: Marks the report as produced from simulated responses.

    Returns:
        The complete report as a markdown string.
    """
    # A dataset counts as passed at or above this accuracy.
    pass_threshold = 0.8

    total_datasets = len(results)
    total_tests = sum(r["total_cases"] for r in results)
    total_passed = sum(r["correct"] for r in results)
    overall_accuracy = total_passed / total_tests if total_tests > 0 else 0.0

    # Count statuses
    passed_datasets = sum(
        1 for r in results if r["accuracy"] >= pass_threshold)
    failed_datasets = total_datasets - passed_datasets

    # Add mock mode indicator
    mock_indicator = " (MOCK MODE)" if mock_mode else ""

    header = f"""# Comprehensive Evaluation Report{mock_indicator}

**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
**Mode:** {'Mock (simulated responses)' if mock_mode else 'Live (real API calls)'}
**Total Datasets:** {total_datasets}
**Total Tests:** {total_tests}
**Overall Accuracy:** {overall_accuracy:.1%}

## Executive Summary

| Metric | Value |
|--------|-------|
| **Datasets Evaluated** | {total_datasets} |
| **Datasets Passed** | {passed_datasets} |
| **Datasets Failed** | {failed_datasets} |
| **Total Tests** | {total_tests} |
| **Tests Passed** | {total_passed} |
| **Tests Failed** | {total_tests - total_passed} |
| **Overall Accuracy** | {overall_accuracy:.1%} |

## Dataset Results

| Dataset | Accuracy | Status | Tests |
|---------|----------|--------|-------|
"""

    # Collect fragments and join once instead of repeated string +=.
    parts = [header]

    for result in results:
        status = "PASSED" if result["accuracy"] >= pass_threshold else "FAILED"
        status_icon = "✅" if status == "PASSED" else "❌"
        parts.append(
            f"| `{result['dataset']}` | {result['accuracy']:.1%} | {status_icon} {status} | {result['correct']}/{result['total_cases']} |\n")

    # Detailed results for each dataset
    parts.append("\n## Detailed Results\n\n")

    for result in results:
        parts.append(f"### {result['dataset']}\n\n")
        # Two trailing spaces force a Markdown line break before "Status".
        parts.append(
            f"**Accuracy:** {result['accuracy']:.1%} ({result['correct']}/{result['total_cases']})  \n")
        parts.append(
            f"**Status:** {'PASSED' if result['accuracy'] >= pass_threshold else 'FAILED'}\n\n")

        # Show errors if any
        errors = result["errors"]
        if errors:
            parts.append("#### Errors\n")
            for error in errors[:5]:  # Show first 5 errors
                parts.append(f"- **Case {error['case']}**: {error['input']}\n")
                parts.append(f"  - Expected: `{error['expected']}`\n")
                parts.append(f"  - Actual: `{error['actual']}`\n")
                if error.get('error'):
                    parts.append(f"  - Error: {error['error']}\n")
            parts.append("\n")
            if len(errors) > 5:
                parts.append(f"- ... and {len(errors) - 5} more errors\n\n")

    report = "".join(parts)

    # Write to file if specified
    if output_file:
        target = pathlib.Path(output_file)
        target.parent.mkdir(parents=True, exist_ok=True)
        # The status icons are non-ASCII: write UTF-8 explicitly rather than
        # relying on the platform's locale encoding.
        with open(target, 'w', encoding='utf-8') as f:
            f.write(report)
        print(f"Comprehensive report written to: {output_file}")

    return report
def load_dataset(dataset_path: Path) -> Dict[str, Any]:
    """Load a dataset from YAML file.

    Args:
        dataset_path: Path of the dataset YAML file.

    Returns:
        The parsed top-level mapping.

    Raises:
        ValueError: if the file is empty or its top level is not a mapping.
    """
    # Read as UTF-8 explicitly rather than with the locale's encoding.
    with open(dataset_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    if not isinstance(data, dict):
        # Fail here with a clear message instead of a TypeError later when
        # callers index the result.
        raise ValueError(
            f"Dataset {dataset_path} must contain a YAML mapping at the top level")
    return data


def get_node_from_module(module_name: str, node_name: str):
    """Get a node instance from a module.

    Args:
        module_name: Dotted import path of the module.
        node_name: Name of the module attribute to fetch.

    Returns:
        The attribute, or None (after printing the reason) when the module
        cannot be imported or has no such attribute.
    """
    try:
        module = importlib.import_module(module_name)
        return getattr(module, node_name)
    except (ImportError, AttributeError) as e:
        print(f"Error loading node {node_name} from {module_name}: {e}")
        return None
def save_raw_results_to_csv(dataset_name: str, test_case: Dict[str, Any], actual_output: Any, success: bool, error: Optional[str] = None, similarity_score: Optional[float] = None, run_timestamp: Optional[str] = None):
    """Save raw evaluation results to CSV files.

    Appends one row to ``results/latest/<dataset>_results.csv`` and to the
    archive ``results/<YYYY-MM-DD>/<dataset>_results_<run_timestamp>.csv``.
    On the first row of a run both files are deleted first, so "latest"
    only ever holds one run.

    Args:
        dataset_name: Dataset identifier used in the file names.
        test_case: Test case mapping with ``input`` and ``expected`` and
            optionally ``context``.
        actual_output: Output produced by the node (None on failure).
        success: Whether the case was judged correct.
        error: Error message, if any.
        similarity_score: Similarity value for list outputs; 0.0 is kept.
        run_timestamp: Identifier of the evaluation run. Passing a new
            value starts a new run; when omitted the current second is used
            for the archive file name only.

    Returns:
        Tuple ``(latest_csv_path, archive_csv_path)``.
    """
    now = datetime.now()
    today = now.strftime("%Y-%m-%d")
    explicit_run = run_timestamp is not None
    if not explicit_run:
        run_timestamp = now.strftime("%Y-%m-%d_%H-%M-%S")

    # Create results directory structure (latest + date-based archive)
    results_root = Path(__file__).parent / "results"
    results_dir = results_root / "latest"
    results_dir.mkdir(parents=True, exist_ok=True)
    date_dir = results_root / today
    date_dir.mkdir(parents=True, exist_ok=True)

    # Create CSV files for this dataset
    csv_file = results_dir / f"{dataset_name}_results.csv"
    date_csv_file = date_dir / f"{dataset_name}_results_{run_timestamp}.csv"

    # Prepare row data
    row_data = {
        "timestamp": now.isoformat(),
        "input": test_case["input"],
        "expected": test_case["expected"],
        "actual": actual_output,
        "success": success,
        # "is None" rather than truthiness: 0.0 is a real score.
        "similarity_score": "" if similarity_score is None else similarity_score,
        "error": error or "",
        "context": str(test_case.get("context", {}))
    }

    # Per-process memory of which run each dataset last wrote, kept on the
    # function object. A dataset is "first" when never seen, or when the
    # caller passes a run_timestamp different from the recorded one.
    seen_runs = save_raw_results_to_csv.__dict__.setdefault(
        "_first_test_case", {})
    is_first = dataset_name not in seen_runs or (
        explicit_run and seen_runs[dataset_name] != run_timestamp)
    if is_first:
        seen_runs[dataset_name] = run_timestamp if explicit_run else True
        # Clear both files for new evaluation run
        for stale_file in (csv_file, date_csv_file):
            if stale_file.exists():
                stale_file.unlink()

    fieldnames = list(row_data)

    # Write to latest directory
    with open(csv_file, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if is_first:
            writer.writeheader()
        writer.writerow(row_data)

    # Write to date-based directory for archiving (header for a new file)
    write_header = not date_csv_file.exists()
    with open(date_csv_file, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if write_header:
            writer.writeheader()
        writer.writerow(row_data)

    return csv_file, date_csv_file
def similarity_score(text1: str, text2: str) -> float:
    """Calculate similarity score between two texts.

    Both texts are lower-cased, stripped and have whitespace runs collapsed
    to a single space before comparison.

    Returns:
        ``difflib.SequenceMatcher`` ratio in ``[0.0, 1.0]``.
    """
    def normalize(text):
        return re.sub(r'\s+', ' ', text.lower().strip())

    return SequenceMatcher(None, normalize(text1), normalize(text2)).ratio()


def chunks_similarity_score(expected_chunks: List[str], actual_chunks: List[str], threshold: float = 0.8) -> tuple[bool, float]:
    """Calculate similarity score between expected and actual chunks.

    Chunks are compared pairwise in order and the per-pair scores averaged.

    Args:
        expected_chunks: Reference chunks.
        actual_chunks: Chunks produced by the node.
        threshold: Minimum average similarity that counts as a match.

    Returns:
        ``(matched, average_score)``. Lists of different length give
        ``(False, 0.0)``; two empty lists are identical and score 1.0.
    """
    if len(expected_chunks) != len(actual_chunks):
        return False, 0.0

    if not expected_chunks:
        # Nothing expected and nothing produced: avoid dividing by zero.
        return 1.0 >= threshold, 1.0

    # str() keeps a non-string chunk from raising in normalize(); it simply
    # scores on its string form.
    total_score = sum(
        similarity_score(str(expected), str(actual))
        for expected, actual in zip(expected_chunks, actual_chunks)
    )

    avg_score = total_score / len(expected_chunks)
    return avg_score >= threshold, avg_score
def evaluate_node(node, test_cases: List[Dict[str, Any]], dataset_name: str) -> Dict[str, Any]:
    """Evaluate a node against test cases.

    Each case is executed with ``node.execute(input, context)``. List
    outputs are compared to a list expectation by chunk similarity; any
    other output is compared as a stripped, lower-cased string. Every case
    is also appended to the raw-results CSV.

    Args:
        node: Object exposing ``execute(user_input, context)``.
        test_cases: Mappings with ``input``, ``expected`` and optionally
            ``context``.
        dataset_name: Name used in the result and the CSV file names.

    Returns:
        Dict with ``dataset``, ``total_cases``, ``correct``, ``incorrect``,
        ``errors``, ``details``, ``raw_results_file`` and ``accuracy``.
    """
    results = {
        "dataset": dataset_name,
        "total_cases": len(test_cases),
        "correct": 0,
        "incorrect": 0,
        "errors": [],
        "details": [],
        "raw_results_file": f"intent_kit/evals/results/latest/{dataset_name}_results.csv"
    }

    # Generate a unique run timestamp for this evaluation
    run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # handler_node_llm keeps state (booking_count) across cases, so it gets
    # one shared context; every other node gets a fresh context per case.
    persistent_context = None
    if hasattr(node, 'name') and 'handler_node_llm' in node.name:
        persistent_context = IntentContext()
        persistent_context.set(
            "booking_count", 0, modified_by="evaluation_init")

    for case_number, test_case in enumerate(test_cases, start=1):
        user_input = test_case["input"]
        expected = test_case["expected"]
        # "context:" with no value parses as None
        context_data = test_case.get("context") or {}

        context = persistent_context if persistent_context is not None else IntentContext()
        for key, value in context_data.items():
            context.set(key, value, modified_by="test_case")

        # Reset per-case state so a failure never reports the previous
        # case's result.
        result = None
        actual_output = None
        similarity_score_val = None
        correct = False
        error_entry = None

        # Only execution and comparison are guarded; bookkeeping and CSV
        # writes happen afterwards so a case is counted exactly once.
        try:
            result = node.execute(user_input, context)

            if result.success:
                actual_output = result.output

                if isinstance(actual_output, list):
                    # For splitters, compare lists using similarity
                    if isinstance(expected, list):
                        correct, similarity_score_val = chunks_similarity_score(
                            expected, actual_output)
                else:
                    # For handlers and classifiers, compare strings
                    correct = str(actual_output).strip().lower() == str(
                        expected).strip().lower()

                if not correct:
                    error_entry = {
                        "case": case_number,
                        "input": user_input,
                        "expected": expected,
                        "actual": actual_output,
                        "similarity_score": similarity_score_val,
                        "type": "incorrect_output"
                    }
            else:
                error_entry = {
                    "case": case_number,
                    "input": user_input,
                    "expected": expected,
                    "actual": None,
                    "type": "execution_failed",
                    "error": result.error.message if result.error else "Unknown error"
                }
        except Exception as e:
            # Broad on purpose: one failing case must not abort the run.
            correct = False
            error_entry = {
                "case": case_number,
                "input": user_input,
                "expected": expected,
                "actual": None,
                "type": "exception",
                "error": str(e)
            }

        if error_entry is None:
            results["correct"] += 1
        else:
            results["incorrect"] += 1
            results["errors"].append(error_entry)

        # Save raw result to CSV
        if error_entry is None or error_entry["type"] == "incorrect_output":
            save_raw_results_to_csv(
                dataset_name, test_case, actual_output, correct,
                similarity_score=similarity_score_val, run_timestamp=run_timestamp)
        else:
            save_raw_results_to_csv(
                dataset_name, test_case, None, False, error_entry["error"],
                run_timestamp=run_timestamp)

        # Store detailed results
        results["details"].append({
            "case": case_number,
            "input": user_input,
            "expected": expected,
            "actual": result.output if result is not None else None,
            "success": result.success if result is not None else False,
            "error": result.error.message if result is not None and result.error else None
        })

    results["accuracy"] = results["correct"] / \
        results["total_cases"] if results["total_cases"] > 0 else 0
    return results
def generate_markdown_report(results: List[Dict[str, Any]], output_path: Path, run_timestamp: Optional[str] = None, mock_mode: bool = False):
    """Generate a markdown report from evaluation results.

    The report is written to ``output_path`` and a copy named
    ``<stem>_<run_timestamp><suffix>`` goes to ``reports/<YYYY-MM-DD>``
    next to this module. Both files are UTF-8.

    Args:
        results: Result dicts with ``dataset``, ``accuracy``, ``correct``,
            ``incorrect``, ``total_cases``, ``raw_results_file`` and
            ``errors``.
        output_path: Destination file (``Path`` or string); missing parent
            directories are created.
        run_timestamp: Suffix for the archived copy; defaults to now.
        mock_mode: Marks the report as produced from simulated responses.
    """
    output_path = Path(output_path)
    now = datetime.now()

    mock_indicator = " (MOCK MODE)" if mock_mode else ""
    total_cases = sum(r["total_cases"] for r in results)
    total_correct = sum(r["correct"] for r in results)
    overall_accuracy = total_correct / total_cases if total_cases > 0 else 0

    # Collect fragments and join once instead of repeated string +=.
    parts = [
        f"# Node Evaluation Report{mock_indicator}\n\n",
        f"Generated on: {now.strftime('%Y-%m-%d %H:%M:%S')}\n",
        f"Mode: {'Mock (simulated responses)' if mock_mode else 'Live (real API calls)'}\n\n",
        # Summary
        "## Summary\n\n",
        f"- **Total Test Cases**: {total_cases}\n",
        f"- **Total Correct**: {total_correct}\n",
        f"- **Overall Accuracy**: {overall_accuracy:.1%}\n\n",
        # Individual dataset results
        "## Dataset Results\n\n",
    ]

    for result in results:
        parts.append(f"### {result['dataset']}\n")
        parts.append(
            f"- **Accuracy**: {result['accuracy']:.1%} ({result['correct']}/{result['total_cases']})\n")
        parts.append(f"- **Correct**: {result['correct']}\n")
        parts.append(f"- **Incorrect**: {result['incorrect']}\n")
        parts.append(f"- **Raw Results**: `{result['raw_results_file']}`\n\n")

        # Show errors if any
        errors = result["errors"]
        if errors:
            parts.append("#### Errors\n")
            for error in errors[:5]:  # Show first 5 errors
                parts.append(f"- **Case {error['case']}**: {error['input']}\n")
                parts.append(f"  - Expected: `{error['expected']}`\n")
                parts.append(f"  - Actual: `{error['actual']}`\n")
                if error.get('error'):
                    parts.append(f"  - Error: {error['error']}\n")
            parts.append("\n")
            if len(errors) > 5:
                parts.append(f"- ... and {len(errors) - 5} more errors\n\n")

    # Detailed results table
    parts.append("## Detailed Results\n\n")
    parts.append("| Dataset | Accuracy | Correct | Total | Raw Results |\n")
    parts.append("|---------|----------|---------|-------|-------------|\n")
    for result in results:
        parts.append(
            f"| {result['dataset']} | {result['accuracy']:.1%} | {result['correct']} | {result['total_cases']} | `{result['raw_results_file']}` |\n")

    report_content = "".join(parts)

    # Test inputs may contain any Unicode text, so write UTF-8 explicitly
    # rather than the platform's locale encoding.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(report_content, encoding="utf-8")

    today = now.strftime("%Y-%m-%d")
    if run_timestamp is None:
        run_timestamp = now.strftime("%Y-%m-%d_%H-%M-%S")
    date_reports_dir = Path(__file__).parent / "reports" / today
    date_reports_dir.mkdir(parents=True, exist_ok=True)

    # Create date-based filename
    date_output_path = date_reports_dir / \
        f"{output_path.stem}_{run_timestamp}{output_path.suffix}"
    date_output_path.write_text(report_content, encoding="utf-8")
def main():
    """Evaluate one or all datasets from the command line and write a report.

    Options: ``--dataset`` (substring filter on dataset file names),
    ``--output`` (report path, default ``reports/latest/evaluation_report.md``)
    and ``--llm-config`` (YAML file whose non-empty ``api_key`` values are
    exported as ``<PROVIDER>_API_KEY``). Exits with status 1 when no
    dataset is found.
    """
    parser = argparse.ArgumentParser(description="Run node evaluations")
    parser.add_argument("--dataset", help="Specific dataset to run")
    parser.add_argument("--output", help="Output file for markdown report")
    parser.add_argument("--llm-config", help="Path to LLM configuration file")

    args = parser.parse_args()

    # Load LLM configuration if provided
    if args.llm_config:
        with open(args.llm_config, 'r', encoding='utf-8') as f:
            # safe_load returns None for an empty file
            llm_config = yaml.safe_load(f) or {}
        if not isinstance(llm_config, dict):
            print(f"LLM config must be a mapping of providers: {args.llm_config}")
            sys.exit(1)

        # Set environment variables for API keys
        for provider, config in llm_config.items():
            if not isinstance(config, dict):
                continue
            api_key = config.get("api_key")
            if not api_key:
                # e.g. the ollama entry: do not clobber a real key with ""
                continue
            env_var = f"{str(provider).upper()}_API_KEY"
            os.environ[env_var] = str(api_key)
            print(f"Set {env_var} environment variable")

    # Find datasets
    datasets_dir = Path(__file__).parent / "datasets"
    if not datasets_dir.exists():
        print(f"Datasets directory not found: {datasets_dir}")
        sys.exit(1)

    # glob() order is arbitrary; sort so runs and reports are reproducible.
    dataset_files = sorted(datasets_dir.glob("*.yaml"))
    if not dataset_files:
        print(f"No dataset files found in {datasets_dir}")
        sys.exit(1)

    # Filter to specific dataset if requested
    if args.dataset:
        dataset_files = [f for f in dataset_files if args.dataset in f.name]
        if not dataset_files:
            print(f"No dataset files found matching '{args.dataset}'")
            sys.exit(1)

    results = []
    run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    for dataset_file in dataset_files:
        print(f"\nEvaluating dataset: {dataset_file.name}")

        # Load dataset
        dataset = load_dataset(dataset_file)
        dataset_name = dataset["dataset"]["name"]
        node_name = dataset["dataset"]["node_name"]

        # Sample node modules are named "<kind>_node" or "<kind>_node_llm",
        # where <kind> is the first underscore-separated part of node_name.
        node_kind = node_name.split('_')[0]
        suffix = "_node_llm" if "llm" in node_name else "_node"
        module_name = f"intent_kit.evals.sample_nodes.{node_kind}{suffix}"

        # Load node
        node = get_node_from_module(module_name, node_name)
        if node is None:
            print(f"Failed to load node {node_name} from {module_name}")
            continue

        # Run evaluation
        result = evaluate_node(node, dataset["test_cases"], dataset_name)
        results.append(result)

        # Print results
        accuracy = result["accuracy"]
        print(
            f" Accuracy: {accuracy:.1%} ({result['correct']}/{result['total_cases']})")
        print(f" Raw results saved to: {result['raw_results_file']}")

        if result["errors"]:
            print(f" Errors: {len(result['errors'])}")
            for error in result["errors"][:3]:  # Show first 3 errors
                print(f" - Case {error['case']}: {error['input']}")
                print(f" Expected: {error['expected']}")
                print(f" Actual: {error['actual']}")

    # Generate report
    if results:
        if args.output:
            output_path = Path(args.output)
        else:
            # Default location; the dated archive directory is created by
            # generate_markdown_report itself.
            reports_dir = Path(__file__).parent / "reports" / "latest"
            reports_dir.mkdir(parents=True, exist_ok=True)
            output_path = reports_dir / "evaluation_report.md"

        generate_markdown_report(results, output_path,
                                 run_timestamp=run_timestamp)
        print(f"\nReport generated: {output_path}")

        # Print summary
        total_cases = sum(r["total_cases"] for r in results)
        total_correct = sum(r["correct"] for r in results)
        overall_accuracy = total_correct / total_cases if total_cases > 0 else 0
        print(
            f"\nOverall Accuracy: {overall_accuracy:.1%} ({total_correct}/{total_cases})")
def extract_weather_args_llm(user_input: str, context: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Extract weather parameters using LLM.

    With ``INTENT_KIT_MOCK_MODE=1`` the location comes from regular
    expressions only. Otherwise the LLM is asked for a JSON object, and the
    regexes are the fallback when the call fails or the answer holds no
    usable location string.

    Args:
        user_input: The user's utterance.
        context: Unused; part of the extractor signature.

    Returns:
        ``{"location": <name>}``; the name is ``"Unknown"`` when nothing
        could be extracted.

    Raises:
        ValueError: in live mode when the provider API key is not set.
    """
    import json
    import os
    import re

    def regex_location() -> str:
        """Return the first regex-captured location, or "Unknown"."""
        # NOTE(review): the lazy group in the first two patterns stops at
        # the first whitespace, so "in New York ..." yields "New". Left
        # unchanged because mock-mode eval expectations may rely on it.
        location_patterns = [
            r'(?:in|for|at)\s+([A-Za-z\s]+?)(?:\s|$)',
            r'(?:weather|temperature|forecast)\s+(?:in|for|at)\s+([A-Za-z\s]+?)(?:\s|$)',
            r'(?:What\'s|How\'s)\s+(?:the\s+)?(?:weather|temperature)\s+(?:like\s+)?(?:in|for|at)\s+([A-Za-z\s]+?)(?:\?|$)',
            r'(?:weather|temperature)\s+(?:in|for|at)\s+([A-Za-z\s]+?)(?:\?|$)',
            r'(?:weather|temperature|forecast)\s+for\s+([A-Za-z\s]+?)(?:\?|$)',
            r'(?:weather|temperature)\s+in\s+([A-Za-z\s]+?)(?:\?|$)'
        ]
        for pattern in location_patterns:
            location_match = re.search(pattern, user_input, re.IGNORECASE)
            if location_match:
                return location_match.group(1).strip()
        return "Unknown"

    # Mock responses for testing without API calls
    if os.getenv("INTENT_KIT_MOCK_MODE") == "1":
        return {"location": regex_location()}

    # Imported only on the live path so mock mode needs no LLM client.
    from intent_kit.services.llm_factory import LLMFactory

    # Configure LLM
    provider = "openai"  # or "anthropic", "google", "ollama"
    api_key = os.getenv(f"{provider.upper()}_API_KEY")

    if not api_key:
        raise ValueError(
            f"Environment variable {provider.upper()}_API_KEY not set")

    llm_config = {
        "provider": provider,
        "model": "gpt-4.1-mini",
        "api_key": api_key
    }

    try:
        llm_client = LLMFactory.create_client(llm_config)

        prompt = f"""
Extract the location from this weather-related user input.

User input: "{user_input}"

Return a JSON object with this field:
- location: The specific location/city mentioned

Rules:
- Extract the exact location name (e.g., "New York", "London", "Tokyo")
- If no location is mentioned, use "Unknown"
- Be precise and extract the full location name

Examples:
- "What's the weather like in New York?" → {{"location": "New York"}}
- "How's the temperature in London?" → {{"location": "London"}}
- "Can you tell me the weather forecast for Tokyo?" → {{"location": "Tokyo"}}
- "What's the weather like today?" → {{"location": "Unknown"}}

User input: {user_input}
JSON:"""

        response = llm_client.generate(prompt, model=llm_config["model"])

        # Extract JSON from response
        json_match = re.search(r'\{.*\}', response, re.DOTALL)
        if json_match:
            parsed = json.loads(json_match.group())
            location = parsed.get("location") if isinstance(parsed, dict) else None
            # Accept only a non-empty string; null, numbers or a missing
            # key fall through to the regex fallback.
            if isinstance(location, str) and location.strip():
                return {"location": location.strip()}
    except Exception as e:
        # Best effort: any LLM or parsing failure degrades to regex.
        print(f"LLM weather extraction failed: {e}")

    # Fallback to regex extraction
    return {"location": regex_location()}
def extract_cancel_args_llm(user_input: str, context: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Extract cancellation parameters using LLM.

    With ``INTENT_KIT_MOCK_MODE=1`` the item comes from regular expressions
    only. Otherwise the LLM is asked for a JSON object, and the regexes are
    the fallback when the call fails or the answer holds no usable item
    string.

    Args:
        user_input: The user's utterance.
        context: Unused; part of the extractor signature.

    Returns:
        ``{"item": <description>}``; the description is ``"reservation"``
        when nothing could be extracted.

    Raises:
        ValueError: in live mode when the provider API key is not set.
    """
    import json
    import os
    import re

    def regex_item() -> str:
        """Return the first regex-captured item, or "reservation"."""
        # NOTE(review): the lazy repetition stops after the first word, so
        # "cancel my flight reservation" yields "flight". Left unchanged
        # because mock-mode eval expectations may rely on it.
        cancel_patterns = [
            r'cancel\s+(?:my\s+)?([^,\s]+(?:\s+[^,\s]+)*?)(?:\s|$)',
            r'cancel\s+(?:my\s+)?([^,\s]+(?:\s+[^,\s]+)*?)(?:\?|$)',
            r'(?:I\s+need\s+to\s+)?cancel\s+(?:my\s+)?([^,\s]+(?:\s+[^,\s]+)*?)(?:\s|$)',
            r'(?:cancel|cancellation)\s+(?:of\s+)?(?:my\s+)?([^,\s]+(?:\s+[^,\s]+)*?)(?:\s|$)'
        ]
        for pattern in cancel_patterns:
            cancel_match = re.search(pattern, user_input, re.IGNORECASE)
            if cancel_match:
                return cancel_match.group(1).strip()
        return "reservation"

    # Mock responses for testing without API calls
    if os.getenv("INTENT_KIT_MOCK_MODE") == "1":
        return {"item": regex_item()}

    # Imported only on the live path so mock mode needs no LLM client.
    from intent_kit.services.llm_factory import LLMFactory

    # Configure LLM
    provider = "openai"  # or "anthropic", "google", "ollama"
    api_key = os.getenv(f"{provider.upper()}_API_KEY")

    if not api_key:
        raise ValueError(
            f"Environment variable {provider.upper()}_API_KEY not set")

    llm_config = {
        "provider": provider,
        "model": "gpt-3.5-turbo",
        "api_key": api_key
    }

    try:
        llm_client = LLMFactory.create_client(llm_config)

        prompt = f"""
Extract what the user wants to cancel from this user input.

User input: "{user_input}"

Return a JSON object with this field:
- item: The specific item/reservation to cancel

Rules:
- Extract the exact item name (e.g., "flight reservation", "hotel booking", "restaurant reservation")
- Be precise and extract the full item description
- If no specific item is mentioned, use "reservation"

Examples:
- "I need to cancel my flight reservation" → {{"item": "flight reservation"}}
- "Cancel my hotel booking" → {{"item": "hotel booking"}}
- "I want to cancel my restaurant reservation" → {{"item": "restaurant reservation"}}
- "Please cancel my appointment" → {{"item": "appointment"}}

User input: {user_input}
JSON:"""

        response = llm_client.generate(prompt, model=llm_config["model"])

        # Extract JSON from response
        json_match = re.search(r'\{.*\}', response, re.DOTALL)
        if json_match:
            parsed = json.loads(json_match.group())
            item = parsed.get("item") if isinstance(parsed, dict) else None
            # Accept only a non-empty string; null, numbers or a missing
            # key fall through to the regex fallback.
            if isinstance(item, str) and item.strip():
                return {"item": item.strip()}
    except Exception as e:
        # Best effort: any LLM or parsing failure degrades to regex.
        print(f"LLM cancel extraction failed: {e}")

    # Fallback to regex extraction
    return {"item": regex_item()}
def intent_classifier_llm(user_input: str, children: List[TreeNode], context: Optional[Dict[str, Any]] = None) -> Optional[TreeNode]:
    """Classify user intent using LLM.

    With ``INTENT_KIT_MOCK_MODE=1`` routing is positional: "weather" picks
    the first child, "cancel" the second, anything else the first. In live
    mode the LLM is asked for a handler name; if that name matches no
    child, or the call fails, keyword matching decides.

    Args:
        user_input: The user's utterance.
        children: Candidate handler nodes (each has ``name`` and
            ``description``).
        context: Unused; part of the classifier signature.

    Returns:
        The selected node, or None when nothing matches.

    Raises:
        ValueError: in live mode when the provider API key is not set.
    """
    import os

    def child_named(name, default):
        """Prefer the child with this name; else the module-level node."""
        return next((child for child in children if child.name == name), default)

    def keyword_fallback():
        """Route by keywords in the input; None when no keyword matches."""
        user_input_lower = user_input.lower()
        if any(word in user_input_lower for word in ["weather", "temperature", "forecast"]):
            return child_named("weather_handler", weather_handler_node)
        if any(word in user_input_lower for word in ["cancel", "cancellation", "refund"]):
            return child_named("cancel_handler", cancel_handler_node)
        return None

    # Mock responses for testing without API calls
    if os.getenv("INTENT_KIT_MOCK_MODE") == "1":
        if "weather" in user_input.lower():
            # Return first child (weather handler)
            return children[0] if children else None
        elif "cancel" in user_input.lower():
            # Return second child (cancel handler)
            return children[1] if len(children) > 1 else None
        else:
            return children[0] if children else None  # Default to first child

    # Imported only on the live path so mock mode needs no LLM client.
    from intent_kit.services.llm_factory import LLMFactory

    # Configure LLM
    provider = "openai"  # or "anthropic", "google", "ollama"
    api_key = os.getenv(f"{provider.upper()}_API_KEY")

    if not api_key:
        raise ValueError(
            f"Environment variable {provider.upper()}_API_KEY not set")

    llm_config = {
        "provider": provider,
        "model": "gpt-3.5-turbo",
        "api_key": api_key
    }

    try:
        llm_client = LLMFactory.create_client(llm_config)

        # Create descriptions of available handlers
        handler_descriptions = [
            f"- {child.name}: {child.description}" for child in children
        ]

        prompt = f"""
Classify the user's intent and return the name of the appropriate handler.

Available handlers:
{chr(10).join(handler_descriptions)}

User input: "{user_input}"

Rules:
- If the user asks about weather, temperature, or forecast, return "weather_handler"
- If the user wants to cancel something, return "cancel_handler"
- Be precise and match the exact handler name

Return only the handler name (e.g., "weather_handler" or "cancel_handler") or "none" if no handler matches.

Handler:"""

        response = llm_client.generate(prompt, model=llm_config["model"])
        # The prompt shows names in double quotes, so strip quotes and
        # backticks from the answer before comparing.
        handler_name = str(response).strip().strip('"\'`').strip().lower()

        # Find the matching handler
        for child in children:
            if child.name.lower() == handler_name:
                return child

        # Fallback to keyword matching
        return keyword_fallback()

    except Exception as e:
        # Best effort: any LLM failure degrades to keyword matching.
        print(f"LLM classification failed: {e}")
        return keyword_fallback()
intent_kit.context import IntentContext +from intent_kit.node.types import ExecutionResult + + +def extract_booking_args_llm(user_input: str, context: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """Extract booking parameters using LLM.""" + from intent_kit.services.llm_factory import LLMFactory + + # Check for mock mode + import os + mock_mode = os.getenv("INTENT_KIT_MOCK_MODE") == "1" + + if mock_mode: + # Mock responses for testing without API calls + import re + # Simple regex extraction for mock mode + dest_match = re.search( + r'(?:to|for|in)\s+([A-Za-z\s]+?)(?:\s|$)', user_input, re.IGNORECASE) + destination = dest_match.group(1).strip() if dest_match else "Unknown" + + date_match = re.search(r'(?:for|on)\s+(\w+\s+\w+)', + user_input, re.IGNORECASE) + date = date_match.group(1) if date_match else "ASAP" + + return { + "destination": destination, + "date": date, + "user_id": context.get("user_id", "anonymous") if context else "anonymous" + } + + # Configure LLM (you can change this to any supported provider) + provider = "openai" # or "anthropic", "google", "ollama" + api_key = os.getenv(f"{provider.upper()}_API_KEY") + + if not api_key: + raise ValueError( + f"Environment variable {provider.upper()}_API_KEY not set") + + llm_config = { + "provider": provider, + "model": "gpt-3.5-turbo", + "api_key": api_key + } + + try: + llm_client = LLMFactory.create_client(llm_config) + + prompt = f""" +Extract booking parameters from this user input. Be precise and extract exactly what the user is asking for. 
+ +User input: "{user_input}" + +Return a JSON object with these exact fields: +- destination: The destination city/location (extract the actual place name) +- date: The specific date mentioned, or "ASAP" if no date is specified + +Rules: +- If the user says "book a flight to X", extract X as destination +- If the user says "travel to X", extract X as destination +- If the user says "fly to X", extract X as destination +- If the user says "go to X", extract X as destination +- For dates, extract the exact date mentioned (e.g., "next Friday", "December 15th", "tomorrow") +- If no date is mentioned, use "ASAP" +- Clean up any extra words like "for" or "to" from the date field + +Examples: +- "Book a flight to Paris" → {{"destination": "Paris", "date": "ASAP"}} +- "I want to fly to Tokyo next Friday" → {{"destination": "Tokyo", "date": "next Friday"}} +- "Travel to London tomorrow" → {{"destination": "London", "date": "tomorrow"}} +- "Book a flight to Rome for the weekend" → {{"destination": "Rome", "date": "the weekend"}} + +User input: {user_input} +JSON:""" + + response = llm_client.generate(prompt, model=llm_config["model"]) + + # Parse the JSON response + import json + import re + + # Extract JSON from response + json_match = re.search(r'\{.*\}', response, re.DOTALL) + if json_match: + result = json.loads(json_match.group()) + # Clean up the date field to remove extra words + date = result.get("date", "ASAP") + if date != "ASAP": + # Remove common prefixes that might be extracted + date = re.sub(r'^(for|to)\s+', '', date, flags=re.IGNORECASE) + + return { + "destination": result.get("destination", "Unknown"), + "date": date, + "user_id": context.get("user_id", "anonymous") if context else "anonymous" + } + except Exception as e: + print(f"LLM extraction failed: {e}") + + # Fallback to simple extraction + import re + dest_match = re.search( + r'(?:to|for|in)\s+([A-Za-z\s]+?)(?:\s|$)', user_input, re.IGNORECASE) + destination = dest_match.group(1).strip() if 
def split_text_llm(user_input: str, debug: bool = False, context: Optional[Dict[str, Any]] = None) -> List[str]:
    """Split user input into multiple intents using an LLM.

    Args:
        user_input: Raw text that may contain several requests.
        debug: When true, print the reason an LLM split failed.
        context: Optional context mapping (accepted but not read here).

    Returns:
        The list of request chunks. When the LLM call fails or yields no
        JSON array, the original input is returned as a single chunk.

    Raises:
        ValueError: If mock mode is off and the provider's API key
            environment variable is not set.
    """
    import json
    import os
    import re

    # Mock mode (INTENT_KIT_MOCK_MODE=1) must work without any LLM client,
    # so it is handled before the LLM factory is imported.
    if os.getenv("INTENT_KIT_MOCK_MODE") == "1":
        # Simple splitting based on common conjunctions
        conjunctions = [" and ", " also ", " plus ",
                        " as well as ", " furthermore "]
        lowered = user_input.lower()
        for conj in conjunctions:
            if conj in lowered:
                # Detection is case-insensitive, so the split must be too;
                # otherwise "A AND B" is detected but returned unsplit.
                parts = re.split(re.escape(conj), user_input,
                                 flags=re.IGNORECASE)
                return [part.strip() for part in parts if part.strip()]
        # If no conjunctions found, return as single intent
        return [user_input]

    from intent_kit.services.llm_factory import LLMFactory

    # Configure LLM
    provider = "openai"
    api_key = os.getenv(f"{provider.upper()}_API_KEY")

    if not api_key:
        raise ValueError(
            f"Environment variable {provider.upper()}_API_KEY not set")

    llm_config = {
        "provider": provider,
        "model": "gpt-4.1-mini",
        "api_key": api_key
    }

    try:
        llm_client = LLMFactory.create_client(llm_config)

        prompt = f"""
Split this text into separate requests:

"{user_input}"

Return a JSON array of strings. Each string should be a complete, standalone request.

IMPORTANT: Be verbatim. Do not add extra words, change pronouns, or modify the original text. Split exactly as written.

JSON array:"""

        response = llm_client.generate(prompt, model=llm_config["model"])

        # Extract the JSON array from the response (models may wrap it in prose)
        json_match = re.search(r'\[.*\]', response, re.DOTALL)
        if json_match:
            result = json.loads(json_match.group())
            if isinstance(result, list):
                # Stringify before stripping so a non-string element cannot
                # raise and throw away the whole split.
                chunks = [str(item).strip()
                          for item in result if item is not None]
                return [chunk for chunk in chunks if chunk]

    except Exception as e:
        # Best effort: failures fall through to the single-chunk result.
        if debug:
            print(f"LLM splitting failed: {e}")

    # If LLM fails, return the original input as a single item
    return [user_input]
def test_load_dataset_malformed():
    """Verify that load_dataset raises ValueError for a YAML file lacking the expected structure."""
    import tempfile
    import yaml

    # Write a syntactically valid YAML document that is not a dataset.
    handle = tempfile.NamedTemporaryFile(
        mode='w', suffix='.yaml', delete=False)
    with handle:
        yaml.dump({"invalid": "data"}, handle)
    malformed_path = Path(handle.name)

    try:
        with pytest.raises(ValueError):
            load_dataset(handle.name)
    finally:
        # delete=False means the file must be removed by hand.
        malformed_path.unlink()
def test_eval_result_methods():
    """Check the EvalResult summary methods on two passing and one failing result."""
    outcomes = (True, False, True)
    results = [
        EvalTestResult(f"input{idx}", f"expected{idx}", f"actual{idx}", passed, {})
        for idx, passed in enumerate(outcomes, start=1)
    ]

    summary = EvalResult(results, "test_dataset")

    assert summary.accuracy() == 2/3
    assert summary.passed_count() == 2
    assert summary.failed_count() == 1
    assert summary.total_count() == 3
    assert not summary.all_passed()
    # errors() is expected to hold exactly the single failing result
    assert len(summary.errors()) == 1
def test_run_eval_fail_fast():
    """Check that run_eval with fail_fast=True stops after the first failing case."""
    def error_node(input_text, context=None):
        if "error" not in input_text.lower():
            return "success"
        raise ValueError("Intentional error")

    # "hello" passes, "error" fails and halts the run, "world" is never reached.
    inputs = ("hello", "error", "world")
    cases = [EvalTestCase(text, "success", {}) for text in inputs]

    dataset = Dataset(
        name="test",
        description="Test dataset",
        node_type="test",
        node_name="error_node",
        test_cases=cases
    )

    outcome = run_eval(dataset, error_node, fail_fast=True)

    assert outcome.total_count() == 2  # Only first two tests ran
    assert outcome.failed_count() == 1
    first_error = outcome.errors()[0]
    assert first_error.error == "Intentional error"
def test_save_results():
    """Test that EvalResult writes CSV, JSON and Markdown result files.

    Each target path lives in a fresh temporary directory and does not exist
    before the save call, so the existence check proves the save method
    created the file. (A pre-created NamedTemporaryFile would make that
    check pass even if nothing was written.)
    """
    import tempfile

    results = [
        EvalTestResult("input1", "expected1", "actual1", True, {}),
        EvalTestResult("input2", "expected2", "actual2",
                       False, {}, "test error")
    ]

    eval_result = EvalResult(results, "test_dataset")

    savers = {
        "results.csv": eval_result.save_csv,
        "results.json": eval_result.save_json,
        "results.md": eval_result.save_markdown,
    }

    # The directory and everything inside it is removed on exit, even on failure.
    with tempfile.TemporaryDirectory() as tmp_dir:
        for filename, save in savers.items():
            target = Path(tmp_dir) / filename
            assert not target.exists()

            save(str(target))

            assert target.is_file(), f"{filename} was not created"
            assert target.stat().st_size > 0, f"{filename} is empty"
b/uv.lock @@ -328,6 +328,8 @@ dev = [ { name = "pytest" }, { name = "pytest-cov" }, { name = "python-dotenv" }, + { name = "pyyaml" }, + { name = "tqdm" }, ] viz = [ { name = "networkx" }, @@ -353,6 +355,8 @@ dev = [ { name = "pytest", specifier = ">=8.4.1" }, { name = "pytest-cov", specifier = ">=5.0" }, { name = "python-dotenv", specifier = ">=1.0.0" }, + { name = "pyyaml" }, + { name = "tqdm" }, ] viz = [ { name = "networkx", specifier = ">=3.5" }, @@ -819,6 +823,41 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ab/4b/e37e4e5d5ee1179694917b445768bdbfb084f5a59ecd38089d3413d4c70f/pyvis-0.3.2-py3-none-any.whl", hash = "sha256:5720c4ca8161dc5d9ab352015723abb7a8bb8fb443edeb07f7a322db34a97555", size = 756038, upload-time = "2023-02-24T20:29:46.758Z" }, ] +[[package]] +name = "pyyaml" +version = "6.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/54/ed/79a089b6be93607fa5cdaedf301d7dfb23af5f25c398d5ead2525b063e17/pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e", size = 130631, upload-time = "2024-08-06T20:33:50.674Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f8/aa/7af4e81f7acba21a4c6be026da38fd2b872ca46226673c89a758ebdc4fd2/PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774", size = 184612, upload-time = "2024-08-06T20:32:03.408Z" }, + { url = "https://files.pythonhosted.org/packages/8b/62/b9faa998fd185f65c1371643678e4d58254add437edb764a08c5a98fb986/PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee", size = 172040, upload-time = "2024-08-06T20:32:04.926Z" }, + { url = "https://files.pythonhosted.org/packages/ad/0c/c804f5f922a9a6563bab712d8dcc70251e8af811fce4524d57c2c0fd49a4/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c", size = 736829, upload-time = "2024-08-06T20:32:06.459Z" }, + { url = "https://files.pythonhosted.org/packages/51/16/6af8d6a6b210c8e54f1406a6b9481febf9c64a3109c541567e35a49aa2e7/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317", size = 764167, upload-time = "2024-08-06T20:32:08.338Z" }, + { url = "https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85", size = 762952, upload-time = "2024-08-06T20:32:14.124Z" }, + { url = "https://files.pythonhosted.org/packages/9b/97/ecc1abf4a823f5ac61941a9c00fe501b02ac3ab0e373c3857f7d4b83e2b6/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4", size = 735301, upload-time = "2024-08-06T20:32:16.17Z" }, + { url = "https://files.pythonhosted.org/packages/45/73/0f49dacd6e82c9430e46f4a027baa4ca205e8b0a9dce1397f44edc23559d/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e", size = 756638, upload-time = "2024-08-06T20:32:18.555Z" }, + { url = "https://files.pythonhosted.org/packages/22/5f/956f0f9fc65223a58fbc14459bf34b4cc48dec52e00535c79b8db361aabd/PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5", size = 143850, upload-time = "2024-08-06T20:32:19.889Z" }, + { url = "https://files.pythonhosted.org/packages/ed/23/8da0bbe2ab9dcdd11f4f4557ccaf95c10b9811b13ecced089d43ce59c3c8/PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44", size = 161980, upload-time = 
"2024-08-06T20:32:21.273Z" }, + { url = "https://files.pythonhosted.org/packages/86/0c/c581167fc46d6d6d7ddcfb8c843a4de25bdd27e4466938109ca68492292c/PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab", size = 183873, upload-time = "2024-08-06T20:32:25.131Z" }, + { url = "https://files.pythonhosted.org/packages/a8/0c/38374f5bb272c051e2a69281d71cba6fdb983413e6758b84482905e29a5d/PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725", size = 173302, upload-time = "2024-08-06T20:32:26.511Z" }, + { url = "https://files.pythonhosted.org/packages/c3/93/9916574aa8c00aa06bbac729972eb1071d002b8e158bd0e83a3b9a20a1f7/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5", size = 739154, upload-time = "2024-08-06T20:32:28.363Z" }, + { url = "https://files.pythonhosted.org/packages/95/0f/b8938f1cbd09739c6da569d172531567dbcc9789e0029aa070856f123984/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425", size = 766223, upload-time = "2024-08-06T20:32:30.058Z" }, + { url = "https://files.pythonhosted.org/packages/b9/2b/614b4752f2e127db5cc206abc23a8c19678e92b23c3db30fc86ab731d3bd/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476", size = 767542, upload-time = "2024-08-06T20:32:31.881Z" }, + { url = "https://files.pythonhosted.org/packages/d4/00/dd137d5bcc7efea1836d6264f049359861cf548469d18da90cd8216cf05f/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48", size = 731164, upload-time = "2024-08-06T20:32:37.083Z" }, + { url = 
"https://files.pythonhosted.org/packages/c9/1f/4f998c900485e5c0ef43838363ba4a9723ac0ad73a9dc42068b12aaba4e4/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b", size = 756611, upload-time = "2024-08-06T20:32:38.898Z" }, + { url = "https://files.pythonhosted.org/packages/df/d1/f5a275fdb252768b7a11ec63585bc38d0e87c9e05668a139fea92b80634c/PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4", size = 140591, upload-time = "2024-08-06T20:32:40.241Z" }, + { url = "https://files.pythonhosted.org/packages/0c/e8/4f648c598b17c3d06e8753d7d13d57542b30d56e6c2dedf9c331ae56312e/PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8", size = 156338, upload-time = "2024-08-06T20:32:41.93Z" }, + { url = "https://files.pythonhosted.org/packages/ef/e3/3af305b830494fa85d95f6d95ef7fa73f2ee1cc8ef5b495c7c3269fb835f/PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba", size = 181309, upload-time = "2024-08-06T20:32:43.4Z" }, + { url = "https://files.pythonhosted.org/packages/45/9f/3b1c20a0b7a3200524eb0076cc027a970d320bd3a6592873c85c92a08731/PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1", size = 171679, upload-time = "2024-08-06T20:32:44.801Z" }, + { url = "https://files.pythonhosted.org/packages/7c/9a/337322f27005c33bcb656c655fa78325b730324c78620e8328ae28b64d0c/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133", size = 733428, upload-time = "2024-08-06T20:32:46.432Z" }, + { url = 
"https://files.pythonhosted.org/packages/a3/69/864fbe19e6c18ea3cc196cbe5d392175b4cf3d5d0ac1403ec3f2d237ebb5/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484", size = 763361, upload-time = "2024-08-06T20:32:51.188Z" }, + { url = "https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5", size = 759523, upload-time = "2024-08-06T20:32:53.019Z" }, + { url = "https://files.pythonhosted.org/packages/2b/b2/e3234f59ba06559c6ff63c4e10baea10e5e7df868092bf9ab40e5b9c56b6/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc", size = 726660, upload-time = "2024-08-06T20:32:54.708Z" }, + { url = "https://files.pythonhosted.org/packages/fe/0f/25911a9f080464c59fab9027482f822b86bf0608957a5fcc6eaac85aa515/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652", size = 751597, upload-time = "2024-08-06T20:32:56.985Z" }, + { url = "https://files.pythonhosted.org/packages/14/0d/e2c3b43bbce3cf6bd97c840b46088a3031085179e596d4929729d8d68270/PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183", size = 140527, upload-time = "2024-08-06T20:33:03.001Z" }, + { url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446, upload-time = "2024-08-06T20:33:04.33Z" }, +] + [[package]] name = "requests" version = "2.32.4"