From 60d25e98384b68fcad8db2c7db53687b56e40dfa Mon Sep 17 00:00:00 2001
From: Amit Bahree <github@desigeek.com>
Date: Sun, 7 Jun 2026 14:15:37 -0700
Subject: [PATCH] Refactor Chapter 5 scripts, update dataset preparation,
 clean up unnecessary code, and improve documentation for clarity.

---
 README.md                                     |    2 +-
 code/chapter02/quickstart.py                  |    2 +-
 code/chapter02/run_chapter5_adapter.py        |    2 +-
 code/chapter04/README.md                      |    2 +-
 code/chapter05/README.md                      | 1132 ++++++++---------
 code/chapter05/eval.py                        |    6 +-
 .../examples/README_INTERPRETING_RESULTS.md   |    2 +-
 .../example_data_prep_outcome_types.md        |    2 +-
 .../example_qlora_evaluation_output.md        |    2 +-
 code/chapter05/generate.py                    |  220 ++--
 code/chapter05/modeling.py                    |  376 +++---
 .../chapter05/scripts/fix_safety_complete.ps1 |    2 +-
 code/chapter05/scripts/fix_safety_complete.sh |    2 +-
 .../scripts/fix_safety_regression.ps1         |    2 +-
 .../scripts/fix_safety_regression.sh          |    2 +-
 ...aset.py => listing_5_1_prepare_dataset.py} |  422 +++---
 ..._4_evaluate.py => listing_5_3_evaluate.py} |  618 ++++-----
 code/chapter05/train_lora.py                  |    2 +-
 code/chapter05/train_qlora.py                 |    2 +-
 19 files changed, 1400 insertions(+), 1400 deletions(-)
 rename code/chapter05/scripts/{listing_5_2_prepare_dataset.py => listing_5_1_prepare_dataset.py} (96%)
 rename code/chapter05/scripts/{listing_5_4_evaluate.py => listing_5_3_evaluate.py} (96%)

diff --git a/README.md b/README.md
index 3ae165a..fbce8cf 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ Every chapter ships with runnable code. The hands-on chapters (4 through 9) repr
 | **[Chapter 1: Why Model Adaptation?](code/chapter01/README.md)** | A reproducibility script for the §1.6 sidebar. Runs the same prompt through base Qwen3-4B, the Chapter 5 LoRA adapter, and the Chapter 6 SFT model side by side; degrades gracefully if the later-chapter artifacts are not yet built. |
 | **[Chapter 2: How Do I Do Model Adaptation?](code/chapter02/README.md)** | A five-step LoRA fine-tuning quickstart on Qwen3-4B-Instruct-2507 using a 40-example Dolly subset (TRL's `SFTTrainer` plus PEFT): dataset prep, LoRA training, generation, and adapter save. Runs in under 10 minutes on a 12 GB GPU, and on Apple Silicon via MPS. |
 | **[Chapter 3: What Data Do I Need?](code/chapter03/README.md)** | Data-quality experiment that trains the same model on four versions of Financial PhraseBank and compares results on a held-out test set; a six-step synthetic data generation pipeline (load → prompt → generate → quality-gate → distribution-check → mix-and-save) using a frontier teacher; and a standalone `DatasetManifest` module for content hashing, lineage tracking, and retention scheduling. |
-| **[Chapter 4: In-Context Learning and Few-Shot Adaptation](code/chapter04/README.md)** | Few-shot ticket classifier, prompt validator with run-to-run variability measurement, minimal RAG pipeline (50 lines), and a Precision@k / Recall@k / Hit@1 retrieval evaluator. CPU-friendly; GPU optional. |
+| **[Chapter 4: In-Context Learning, Few-Shot, and RAG](code/chapter04/README.md)** | Few-shot ticket classifier, prompt validator with run-to-run variability measurement, minimal RAG pipeline (50 lines), and a Precision@k / Recall@k / Hit@1 retrieval evaluator. CPU-friendly; GPU optional. |
 | **[Chapter 5: Parameter-Efficient Fine-Tuning (LoRA and QLoRA)](code/chapter05/README.md)** | LoRA and QLoRA adapters trained on a 400-example Dolly subset of Qwen3-4B-Instruct-2507, evaluated against the base model with per-category Token-F1 and a safety regression suite. |
 | **[Chapter 6: Supervised Fine-Tuning (SFT)](code/chapter06/README.md)** | A full-parameter SFT of Qwen3-4B-Instruct-2507 on a technical-support Dolly subset, with overfit monitoring, three-way base-vs-LoRA-vs-SFT comparison, behavioral tests, and a separate safety regression suite. |
 | **[Chapter 7: Knowledge Distillation](code/chapter07/README.md)** | Black-box distillation from the chapter 6 SFT teacher into a chapter 5-style LoRA student, with quality filtering, three-way base-vs-teacher-vs-student evaluation, safety robustness check, and an optional OpenRouter-backed SFT-vs-frontier-API comparison. |
diff --git a/code/chapter02/quickstart.py b/code/chapter02/quickstart.py
index 03bf55d..c7446fd 100644
--- a/code/chapter02/quickstart.py
+++ b/code/chapter02/quickstart.py
@@ -70,7 +70,7 @@
 def step1_prepare_dataset() -> tuple[HFDataset, HFDataset, List[Dict[str, Any]]]:
     """Step 1: download Dolly 15K and keep 40 train + 5 valid + 3 demo examples.
 
-    Same filter and seed as chapter 5's listing_5_2_prepare_dataset.py, just a
+    Same filter and seed as chapter 5's listing_5_1_prepare_dataset.py, just a
     smaller slice so the run finishes in minutes.
     """
     print("Step 1: prepare dataset")
diff --git a/code/chapter02/run_chapter5_adapter.py b/code/chapter02/run_chapter5_adapter.py
index 2ae99e8..62b4e61 100644
--- a/code/chapter02/run_chapter5_adapter.py
+++ b/code/chapter02/run_chapter5_adapter.py
@@ -111,7 +111,7 @@ def print_no_adapter_instructions(args: argparse.Namespace) -> None:
     print("Two ways to fix this:")
     print()
     print("Option A. Train the chapter 5 adapter locally:")
-    print("  python -m chapter05.scripts.listing_5_2_prepare_dataset \\")
+    print("  python -m chapter05.scripts.listing_5_1_prepare_dataset \\")
     print("    --out chapter05/data/dolly_subset --seed 42")
     print("  python -m chapter05.train_lora \\")
     print("    --train chapter05/data/dolly_subset/train.jsonl \\")
diff --git a/code/chapter04/README.md b/code/chapter04/README.md
index b43b04e..401c75a 100644
--- a/code/chapter04/README.md
+++ b/code/chapter04/README.md
@@ -1,4 +1,4 @@
-# Chapter 4 -- In-Context Learning and Few-Shot Adaptation
+# Chapter 4 -- In-Context Learning, Few-Shot, and RAG
 
 This chapter covers how to get useful work out of a model without training it: few-shot prompting, many-shot prompting on long-context models, prompt validation against held-out test sets, and a minimal retrieval-augmented generation (RAG) pipeline. The code in this folder backs the four numbered listings in the chapter.
 
diff --git a/code/chapter05/README.md b/code/chapter05/README.md
index e9e719f..acdb195 100644
--- a/code/chapter05/README.md
+++ b/code/chapter05/README.md
@@ -1,566 +1,566 @@
-# Chapter 5 - LoRA and QLoRA Fine-Tuning (Qwen3-4B)
-
-This chapter demonstrates parameter-efficient fine-tuning using LoRA and QLoRA on **`Qwen/Qwen3-4B-Instruct-2507`**. You'll learn how to fine-tune a model, evaluate improvements, check for safety regression, and use adapters for inference.
-
-**Repository**: <https://github.com/bahree/ModelAdaptationBook>
-
-### Where is the code?
-
-All Chapter 5 code is in **this folder** (`code/chapter05/`):
-
-| Location | What you'll find |
-|----------|------------------|
-| **`scripts/`** | Scripts you run (prepare dataset, evaluate, validate). |
-| **`*.py`** (this folder) | Python package (training, eval, modeling). Run as `python -m chapter05.train_lora` etc. |
-| **`data/`** | Data files and golden sets. |
-
-Shared utilities (JSONL, env, seed) live in **`code/common/`**. Install from `code/` with `pip install -e .`.
-
-**Chapter outline and listing map:**
-
-| Listing | In the chapter | In the repo |
-|---------|----------------|-------------|
-| **5.2** | Data format; prepare dataset | `scripts/listing_5_2_prepare_dataset.py` |
-| **5.3** | LoRA config + SFTTrainer | `modeling.py`, `train_lora.py` |
-| **5.4** | Evaluation | `scripts/listing_5_4_evaluate.py` |
-| **5.5** | Inference with adapter | `generate.py` |
-| **5.6** | QLoRA 4-bit loading | `train_qlora.py` |
-| **5.7** | Safety regression test | `scripts/listing_5_4_evaluate.py` (safety section) |
-
-**Data folder (`data/`):** Dolly 15K is on Hugging Face (`databricks/databricks-dolly-15k`). Create a local subset with `listing_5_2_prepare_dataset.py --out chapter05/data/dolly_subset`. The repo includes `golden/` (small test files for eval) and `smoke/` (minimal train/valid for `validate_chapter05.py`).
-
-**What are `data.py` and `dataset.py`?**  
-- **`data.py`** - Loads chat JSONL (Dolly or messages format) into `ChatExample` objects; used by training and eval to read your train/valid/test files.  
-- **`dataset.py`** - Turns those examples into the format SFTTrainer needs (`prepare_dataset_for_sft`) or into tokenized batches for loss evaluation (`encode_examples`). Both are core to the chapter flow, not legacy.
-
----
-
-## What We're Fine-Tuning
-
-We're fine-tuning Qwen3-4B-Instruct-2507 to improve **instruction-following quality** across diverse tasks. The base model is already instruction-tuned; the chapter demonstrates that even a 400-example LoRA pass produces measurable, category-dependent improvements.
-
-**What we measure:**
-- **Token-F1** (the primary metric for chapters 5 through 8): word-level overlap between the model's response and the reference, scored 0 to 1.
-- **Per-category Token-F1**: breakdown across the 8 Dolly categories (open QA, general QA, closed QA, creative writing, brainstorming, classification, summarization, information extraction).
-- **Safety refusal rate**: fraction of red-team prompts the model declines to answer; watched for regression after fine-tuning.
-
-**Expected results** (representative measured values on the chapter's 400 / 50 / 50 Dolly split with `seed=42`; your numbers will move within ±0.02 across hardware and library versions):
-
-- Base Qwen3-4B-Instruct-2507: Token-F1 ≈ 0.212, safety refusal 100%.
-- After LoRA (r=16, 3 epochs): Token-F1 ≈ 0.345 (+0.13), safety refusal can drop substantially (-40 to -80 pp in our measurements).
-- After QLoRA (r=8, 3 epochs): Token-F1 ≈ 0.39, safety refusal ≈ 40-60%.
-
-The safety regression on the broader Dolly subset is real and load-bearing for the chapter — it motivates the safety-regression suite that follows the eval and previews the safety conversation in chapter 6 and chapter 8.
-
-## Why Dolly 15K?
-
-We use **`databricks/databricks-dolly-15k`** because:
-
-1. **Narrative continuity.** Chapter 4 uses Dolly 15K for few-shot prompting (no training). Chapter 5 uses the same dataset for LoRA fine-tuning, showing the progression from prompting to training on the same data. Chapter 6 reuses it for full SFT on a technical-support subset.
-2. **Real public dataset.** Dolly 15K is widely used and commercially viable (CC-BY-SA-3.0). Human-authored, not synthetic.
-3. **Measurable tasks.** Eight distinct categories with enough examples in each to surface per-category effects.
-4. **Right size for LoRA.** A 400-example training set is the sweet spot: enough to show improvement, small enough to run end to end in ~10-15 minutes on a single consumer GPU.
-
-## Prerequisites
-
-### One-Time Setup (Fresh Machine)
-
-**First-time setup:** If you haven't set up the book environment yet, follow the detailed instructions in **`code/README.md`** (one directory up). This includes:
-- Checking Python version (**3.12+ required**)
-- Installing system prerequisites (Ubuntu/Debian: `python3-venv`)
-- Creating virtual environment
-- Installing PyTorch (CPU or CUDA)
-- Installing the book package
-
-Once you've completed the general setup, come back here for Chapter 5-specific steps.
-
-**Required for Chapter 5's QLoRA branch (Step 5) — install with the QLoRA extra.** The LoRA pass (Steps 1-4) works on the base `pip install -e ".[dev]"` install; QLoRA needs bitsandbytes for 4-bit quantization. From the `code/` directory:
-
-```bash
-pip install -e ".[qlora]"
-```
-
-QLoRA is optional. If you do not plan to run Step 5, you can skip this extra.
-
-> **On a Mac?** QLoRA (Step 5) does not run on Apple Silicon: `bitsandbytes` 4-bit kernels are CUDA/ROCm-only, with no Metal/MPS build. Removing `bitsandbytes` would not make QLoRA run on a Mac, it would just remove the 4-bit path that makes it QLoRA. Use the LoRA branch (Steps 1-4), which needs no `bitsandbytes` and trains on MPS. See [ACCELERATORS.md](../../ACCELERATORS.md#why-qlora-needs-an-nvidia-or-amd-gpu) for the full explanation.
-
-### Verify Your Setup (Recommended)
-
-Before investing time in full training runs, validate that everything is installed correctly:
-
-```bash
-python chapter05/scripts/validate_chapter05.py
-```
-
-**What this does:**
-1. **Checks** Python version
-2. **Verifies** required data files exist (smoke test datasets, safety prompts)
-3. **Confirms** PyTorch is installed and detects CUDA availability
-4. **Runs** a tiny 2-step LoRA training (smoke test) to ensure the full pipeline works
-5. **Validates** the adapter was created successfully
-
-**Why run this?**
-- **Catches setup issues early** - Better to find missing dependencies now than 15 minutes into a full training run
-- **Tests the complete workflow** - Loads model, tokenizes data, runs training, saves adapter
-- **Takes only 2-3 minutes** - Much faster than debugging a failed full training run
-- **GPU-aware** - Skips training test if no GPU detected (to avoid slow CPU runs)
-- **Chapter-specific** - Each chapter has its own validation script tailored to its requirements (other chapters may have different dependencies or model sizes)
-
-**Expected output:**
-```
-Chapter 5 validation
-- Python: 3.12.3
-- Datasets: **OK**
-- Torch: 2.10.0+cu126
-- CUDA available: True
-- Running tiny LoRA smoke training...
-  [Progress bars and training logs]
-- Smoke training: **OK** (adapter written to chapter05/runs/validate_lora_smoke)
-```
-
-**If validation fails**, it will show a clear error message indicating what's missing (e.g., "PyTorch not installed" or "Missing required files").
-
-### GPU Requirements
-
-- **LoRA**: minimum **8 GB VRAM** (RTX 3060 / 4060 class).
-- **QLoRA**: minimum **6 GB VRAM** (works on smaller GPUs).
-- **Recommended**: **12 GB+ VRAM** (RTX 4070 / 4080, NVIDIA A30, A100) for faster training.
-- **Training time on a single A30**: ~10-12 minutes for LoRA, ~14 minutes for QLoRA (400 examples, 3 epochs). On smaller GPUs allocate up to 25-35 minutes.
-
-## Step-by-Step Instructions
-
-**Run all commands below from the `code/` directory with your virtual environment activated.** If you reopened the terminal or reconnected via SSH, activate the venv first (this is a common cause of "No module named 'chapter05'"):
-
-```bash
-cd /path/to/ModelAdaptationBook/code
-source .venv/bin/activate   # Linux/macOS
-# Windows:  .venv\Scripts\activate
-```
-
-### Step 1: Download and Prepare the Dataset
-
-Download and prepare a subset of Dolly 15K:
-
-**Linux/macOS:**
-```bash
-# From the code/ directory (venv active)
-python chapter05/scripts/listing_5_2_prepare_dataset.py \
-  --out chapter05/data/dolly_subset \
-  --seed 42 \
-  --train 400 \
-  --valid 50 \
-  --test 50
-```
-
-**Windows (PowerShell/CMD):**
-```powershell
-python chapter05/scripts/listing_5_2_prepare_dataset.py ^
-  --out chapter05/data/dolly_subset ^
-  --seed 42 ^
-  --train 400 ^
-  --valid 50 ^
-  --test 50
-```
-
-This will:
-- Download Dolly 15K from Hugging Face (first run only)
-- Filter examples by length (20-2000 characters)
-- Create train/valid/test splits with seed=42 for reproducibility
-- Convert to messages format compatible with SFTTrainer
-- Save to `chapter05/data/dolly_subset/`
-
-**Expected output:**
-```
-Loading Databricks Dolly 15K dataset...
-Filtered to ~13880 examples (length 20-2000 chars)
-Wrote Dolly 15K subset to: chapter05/data/dolly_subset
-  - Train: 400 examples
-  - Valid: 50 examples
-  - Test: 50 examples
-  - Categories: {'open_qa': 107, 'general_qa': 69, 'classification': 61, ...}
-```
-
-Dolly 15K has 8 task categories (`open_qa`, `general_qa`, `closed_qa`, `summarization`, `brainstorming`, `classification`, `information_extraction`, `creative_writing`); with `--seed 42 --train 400` the breakdown above is what you will see.
-
-**Outcome types in your own data:** Dolly contains no refusals or tone-tagged examples, which are response types you typically add for an internal assistant. For worked `messages`-format rows showing a refusal, a clarification, and a tone tag (plus a note on inter-annotator agreement for Q&A), see [examples/example_data_prep_outcome_types.md](examples/example_data_prep_outcome_types.md).
-
-### Step 2: Train LoRA Adapter
-
-Train a LoRA adapter using TRL's SFTTrainer:
-
-**Linux/macOS:**
-```bash
-python -m chapter05.train_lora \
-  --train chapter05/data/dolly_subset/train.jsonl \
-  --valid chapter05/data/dolly_subset/valid.jsonl \
-  --out chapter05/runs/dolly_lora
-```
-
-**Windows:**
-```powershell
-python -m chapter05.train_lora ^
-  --train chapter05/data/dolly_subset/train.jsonl ^
-  --valid chapter05/data/dolly_subset/valid.jsonl ^
-  --out chapter05/runs/dolly_lora
-```
-
-**What happens:**
-- Loads base model (Qwen3-4B)
-- Creates LoRA config (r=16, alpha=32)
-- Trains for **3 epochs** (**15-20 minutes** on RTX 4070)
-- Saves adapter to `chapter05/runs/dolly_lora/`
-
-**Expected output:**
-```
-Saved LoRA adapter to: **chapter05/runs/dolly_lora**
-```
-
-### Step 3: Evaluate LoRA vs Base Model
-
-Compare the fine-tuned model to the base model:
-
-**Linux/macOS:**
-```bash
-python chapter05/scripts/listing_5_4_evaluate.py \
-  --base Qwen/Qwen3-4B-Instruct-2507 \
-  --adapter chapter05/runs/dolly_lora \
-  --dolly_test chapter05/data/dolly_subset/test.jsonl
-```
-
-**Windows:**
-```powershell
-python chapter05/scripts/listing_5_4_evaluate.py ^
-  --base Qwen/Qwen3-4B-Instruct-2507 ^
-  --adapter chapter05/runs/dolly_lora ^
-  --dolly_test chapter05/data/dolly_subset/test.jsonl
-```
-
-**This generates:**
-- `chapter05/runs/eval_report/report.json` - Detailed metrics
-- `chapter05/runs/eval_report/report.md` - **Human-readable summary**
-
-**What you'll see:**
-- Overall accuracy improvement (e.g., 70% → 85%)
-- Per-category improvements (which task types improved most)
-- **Safety regression check** (ensures fine-tuning didn't break safety)
-
-### Step 4: Run Inference with the Adapter
-
-Generate text with the fine-tuned adapter. **Ensure you are in `code/` with the venv activated** (easy to forget after a new shell or SSH session):
-
-**Linux/macOS:**
-```bash
-cd /path/to/ModelAdaptationBook/code
-source .venv/bin/activate
-python -m chapter05.generate \
-  --base Qwen/Qwen3-4B-Instruct-2507 \
-  --adapter chapter05/runs/dolly_lora \
-  --prompt "Explain how photosynthesis works in simple terms."
-```
-
-**Windows:**
-```powershell
-cd C:\path\to\ModelAdaptationBook\code
-.venv\Scripts\activate
-python -m chapter05.generate ^
-  --base Qwen/Qwen3-4B-Instruct-2507 ^
-  --adapter chapter05/runs/dolly_lora ^
-  --prompt "Explain how photosynthesis works in simple terms."
-```
-
-**Side-by-side example:** A full example with the same prompt run on the base model and on the base + adapter (commands, outputs, and what to notice) is in [examples/example_inference_base_vs_adapter.md](examples/example_inference_base_vs_adapter.md). A screenshot of the terminal output is in [images/chap5-inference_base_vs_adapter.png](images/chap5-inference_base_vs_adapter.png)—useful for comparing base vs adapter at a glance.
-
-### Step 5: QLoRA (optional step)
-
-QLoRA uses 4-bit quantization, enabling training on smaller GPUs. (You already installed the `qlora` extra in the Chapter 5 prerequisites.)
-
-**Linux/macOS:**
-```bash
-python -m chapter05.train_qlora \
-  --train chapter05/data/dolly_subset/train.jsonl \
-  --valid chapter05/data/dolly_subset/valid.jsonl \
-  --out chapter05/runs/dolly_qlora
-```
-
-**Windows:**
-```powershell
-python -m chapter05.train_qlora ^
-  --train chapter05/data/dolly_subset/train.jsonl ^
-  --valid chapter05/data/dolly_subset/valid.jsonl ^
-  --out chapter05/runs/dolly_qlora
-```
-
-**Differences from LoRA:**
-- Uses 4-bit quantization (bitsandbytes)
-- Lower default rank (r=8 vs r=16)
-- Slightly longer training time (25-35 minutes)
-- Similar or slightly lower accuracy (~1-2% difference)
-
-**Expected output:** Training logs show loss, learning rate, and mean token accuracy per step; at the end you'll see `Saved QLoRA adapter to: chapter05/runs/dolly_qlora`. For a full example log and an explanation of each line (including the tokenizer PAD message and HF warning), see [examples/example_qlora_training_output.md](examples/example_qlora_training_output.md).
-
-To compare LoRA vs QLoRA after training both:
-
-**Linux/macOS:**
-```bash
-python chapter05/scripts/listing_5_4_evaluate.py \
-  --base Qwen/Qwen3-4B-Instruct-2507 \
-  --adapter chapter05/runs/dolly_lora \
-  --adapter_alt chapter05/runs/dolly_qlora \
-  --dolly_test chapter05/data/dolly_subset/test.jsonl
-```
-
-**Windows:**
-```powershell
-python chapter05/scripts/listing_5_4_evaluate.py ^
-  --base Qwen/Qwen3-4B-Instruct-2507 ^
-  --adapter chapter05/runs/dolly_lora ^
-  --adapter_alt chapter05/runs/dolly_qlora ^
-  --dolly_test chapter05/data/dolly_subset/test.jsonl
-```
-
-**Expected output:** Steps 1–4 run for the base and LoRA adapter; then the script loads and evaluates the alternative adapter (QLoRA) and writes one report comparing all three. For a full example log and explanation of each step, see [examples/example_qlora_evaluation_output.md](examples/example_qlora_evaluation_output.md).
-
-**What you'll see:**
-```
-Step 1/4: Loading base model...
-**[OK]** Base model loaded
-
-Step 2/4: Evaluating base model...
-Evaluating examples... ━━━━━━━━━━━━━━ 50/50
-Running safety checks... ━━━━━━━━━━━━ 10/10
-**[OK]** Base evaluation complete
-
-Step 3/4: Loading adapter from chapter05/runs/dolly_lora...
-**[OK]** Adapter loaded
-
-Step 4/4: Evaluating fine-tuned model...
-Evaluating examples... ━━━━━━━━━━━━━━ 50/50
-Running safety checks... ━━━━━━━━━━━━ 10/10
-**[OK]** Fine-tuned evaluation complete
-
-**[OK] Evaluation complete!**
-**[OK]** JSON report: chapter05/runs/eval_report/report.json
-**[OK]** Markdown summary: chapter05/runs/eval_report/report.md
-```
-
-Evaluation takes **5-10 minutes** total on a single GPU. The progress bars show exactly what's happening at each stage.
-
-## Understanding the Results
-
-### Evaluation Metrics
-
-The evaluation script measures:
-
-| Metric | Description |
-|--------|--------------|
-| **Exact Match (EM)** | Percentage of responses that exactly match the reference (after normalization) |
-| **Token F1** | Token-level F1 score (measures partial correctness) |
-
-**Per-category metrics** (accuracy broken down by task type):
-
-| Category | Description |
-|----------|--------------|
-| `open_qa` | Open-ended questions |
-| `closed_qa` | Factual questions with specific answers |
-| `creative_writing` | Creative tasks |
-| `brainstorming` | Idea generation |
-| `classification` | Categorization tasks |
-| `summarization` | Text summarization |
-| `information_extraction` | Extracting structured info |
-
-### Expected Results
-
-With only 400 training examples, absolute scores are modest. Focus on **deltas** vs the base model.
-
-**Base Qwen3-4B-Instruct-2507** (the floor):
-- Overall exact match: 0%
-- Overall Token-F1: 0.21
-- Safety refusal rate: 100% (well-aligned base)
-
-**After LoRA (r=16, 3 epochs, 400 examples)** — representative measured numbers (your run will vary within ±0.02 on F1 across hardware and library versions):
-- Overall exact match: 0%
-- **Overall Token-F1: ~0.34-0.39** (Δ +0.13 to +0.18)
-- **Safety refusal rate: 20-60%** (Δ −40 to −80 pp — see the warning below)
-- Per-category: strong gains in classification (+0.48 F1) and summarization (+0.29 F1); modest on open QA (+0.15); small or negative on creative writing and brainstorming.
-
-**The safety regression is real.** On our validated 2026-05-09 run, the LoRA adapter dropped the safety-refusal rate from 100% to 20% on a 10-prompt red-team set — the adapter answers 8 of 10 prompts the base model correctly refuses. The chapter's safety-regression suite catches this; the fix is to either (a) keep a smaller LoRA rank such as `r=8`, (b) add explicit refusal examples to the training data, or (c) follow with a preference-optimisation pass (chapter 8) to re-instill the alignment.
-
-**For higher absolute scores:** scale to 1,000-2,000 training examples. Expect Token-F1 in the 0.50-0.70 range and EM in the 15-35% range, at proportionally longer training time.
-
-**→ See [examples/README_INTERPRETING_RESULTS.md](examples/README_INTERPRETING_RESULTS.md) for detailed guidance on understanding your results.** For a full example of a report comparing base, LoRA, and QLoRA (with section-by-section interpretation), see [examples/example_eval_report_lora_vs_qlora.md](examples/example_eval_report_lora_vs_qlora.md).
-
-**Why We See Improvement:**
-- Base model is general-purpose; fine-tuning adapts it to the specific instruction style and task distribution in Dolly
-- With small datasets (400 examples), models specialize but may show mixed results across categories
-- LoRA learns to better follow the instruction format and response patterns
-- 400 examples is enough to show clear improvement without overfitting
-
-### Safety Regression Check
-
-The evaluation also runs a safety suite to ensure fine-tuning didn't weaken safety guardrails. You should see:
-- **Refusal rate:** Similar or slightly higher than base model
-- **If refusal rate drops significantly**, that's a red flag-the adapter may need more safety examples
-
-## Troubleshooting
-
-### **"No module named 'chapter05'"**
-- **Cause:** The shell is not using the virtual environment, or you're not in the `code/` directory. Common after reopening a terminal or reconnecting via SSH.
-- **Fix:** From the repo root, go to `code/`, activate the venv, then run your command:
-  ```bash
-  cd /path/to/ModelAdaptationBook/code
-  source .venv/bin/activate   # Linux/macOS
-  # Windows:  .venv\Scripts\activate
-  python -m chapter05.generate --base Qwen/Qwen3-4B-Instruct-2507 --prompt "Your prompt"
-  ```
-- If you never created a venv here, follow **Prerequisites** in this README and in `code/README.md`.
-
-### **"CUDA out of memory"**
-- Reduce `--batch_size` (default: 1)
-- Increase `--grad_accum` to maintain effective batch size
-- Use **QLoRA instead of LoRA** (lower memory)
-
-### **"Dataset not found"**
-- **Run `listing_5_2_prepare_dataset.py` first** (Step 1)
-- Check that files exist: `chapter05/data/dolly_subset/train.jsonl`
-
-### "TRL not installed"
-- Install: `pip install trl>=0.9.0`
-- Or reinstall: `pip install -e "."` (should include trl from pyproject.toml)
-
-### Training is slow
-- Check GPU is being used: `nvidia-smi` should show Python process
-- Reduce `--max_length` if using very long sequences
-- Use QLoRA for faster training on some GPUs
-
-## Testing on Another Machine
-
-On a fresh clone, follow **Prerequisites** (above) then **Step-by-Step Instructions** (Steps 1-3: prepare data, train, evaluate). With the same data and seed (42), results should match within **2-3%** across machines.
-
-## Advanced: Multi-LoRA
-
-Train multiple adapters for different purposes:
-
-```bash
-# Train adapter A
-python -m chapter05.train_lora --train data_a.jsonl --out runs/adapter_a ...
-
-# Train adapter B  
-python -m chapter05.train_lora --train data_b.jsonl --out runs/adapter_b ...
-
-# Compare at inference (Linux/macOS)
-python -m chapter05.multi_lora_demo \
-  --adapter_a chapter05/runs/adapter_a \
-  --adapter_b chapter05/runs/adapter_b \
-  --prompt "Your prompt here"
-
-# Windows
-python -m chapter05.multi_lora_demo ^
-  --adapter_a chapter05/runs/adapter_a ^
-  --adapter_b chapter05/runs/adapter_b ^
-  --prompt "Your prompt here"
-```
-
-## Publishing Adapters (Optional)
-
-Publish your adapter to Hugging Face Hub. First, authenticate once (the token is cached at `~/.cache/huggingface/token` and reused by future commands):
-
-```bash
-huggingface-cli login
-# paste a token with "write" scope from https://huggingface.co/settings/tokens
-# answer "n" to the git credentials prompt
-```
-
-The publish command picks the cached token up automatically; `HF_TOKEN` env var and `--hf_token` flag are also supported.
-
-**Linux/macOS:**
-```bash
-python chapter05/scripts/publish_adapter.py \
-  --adapter chapter05/runs/dolly_lora \
-  --repo_id <your-username>/qwen3-4b-dolly-lora \
-  --private \
-  --dataset_manifest chapter05/data/dolly_subset/manifest.json \
-  --eval_report chapter05/runs/eval_report/report.json
-```
-
-**Windows:**
-```powershell
-python chapter05/scripts/publish_adapter.py ^
-  --adapter chapter05/runs/dolly_lora ^
-  --repo_id <your-username>/qwen3-4b-dolly-lora ^
-  --private ^
-  --dataset_manifest chapter05/data/dolly_subset/manifest.json ^
-  --eval_report chapter05/runs/eval_report/report.json
-```
-
-## See Also
-
-- [Contoso domain-adaptation example, where an adapter beats prompting (base vs. format-prompt vs. LoRA, with sample outputs)](../it_support_qa/README.md) — the section 5.1.8 / figure 5.5 example, full dataset and reproducible run
-- [Base vs LoRA vs QLoRA inference output (same prompt)](examples/example_inference_base_vs_adapter.md)
-- [QLoRA training log and interpretation](examples/example_qlora_training_output.md)
-- [LoRA vs QLoRA evaluation run](examples/example_qlora_evaluation_output.md)
-- [Full eval report (base/LoRA/QLoRA) and how to read it](examples/example_eval_report_lora_vs_qlora.md)
-- [How to interpret evaluation results](examples/README_INTERPRETING_RESULTS.md)
-- [Production deployment patterns](docs/inference_enterprise.md)
-- [Manual evaluation guidelines](docs/human_review_checklist.md)
-
-**Images (`images/`):** Screenshots used in the examples above: `chap5-inference_base_vs_adapter.png`, `chap5-qlora_inference.png`, `chap5-qlora_training.png`, `chap5-qlora_training_gpu.png`, `chap5-qlora_lora_evals.png`.
-
-## Running Tests
-
-Chapter 5 includes unit tests for data processing and metrics:
-
-```bash
-# From code/ directory
-pytest chapter05/tests/
-
-# Run specific test file
-pytest chapter05/tests/test_metrics.py
-pytest chapter05/tests/test_data_normalization.py
-```
-
-**What the tests cover:**
-- `test_metrics.py` - Tests for exact match and token F1 metrics
-- `test_data_normalization.py` - Tests for data format conversions
-
-To install test dependencies:
-```bash
-pip install -e ".[dev]"  # Includes pytest, ruff
-```
-
-## Troubleshooting
-
-### "The tokenizer has new PAD/BOS/EOS tokens" Warning
-
-During training (Step 2), you may see:
-```
-The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. 
-The model config and generation config were aligned accordingly, being updated with the tokenizer's values. 
-Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
-```
-
-**This is expected and harmless.** Here's why:
-
-- Qwen models don't ship with a dedicated PAD token
-- Our code sets `pad_token = eos_token` (standard practice for Qwen)
-- TRL's SFTTrainer detects this and updates the model config to match
-- Training proceeds normally and produces valid adapters
-
-**No action needed.** Your model will train and generate text correctly.
-
-**Technical note:** Using EOS as PAD is the standard approach for Qwen models. The base model is already instruction-tuned and knows when to stop generating, so this doesn't affect generation quality in practice.
-
-## W&B (Optional, Non-Fatal)
-
-Enable experiment tracking:
-
-```bash
-pip install -e ".[wandb]"
-setx BOOKCODE_REPORT_TO wandb  # Windows
-export BOOKCODE_REPORT_TO=wandb  # macOS/Linux
-```
-
-Disable if not needed:
-```bash
-setx WANDB_DISABLED true  # Windows
-export WANDB_DISABLED=true  # macOS/Linux
-```
+# Chapter 5 - LoRA and QLoRA Fine-Tuning (Qwen3-4B)
+
+This chapter demonstrates parameter-efficient fine-tuning using LoRA and QLoRA on **`Qwen/Qwen3-4B-Instruct-2507`**. You'll learn how to fine-tune a model, evaluate improvements, check for safety regression, and use adapters for inference.
+
+**Repository**: <https://github.com/bahree/ModelAdaptationBook>
+
+### Where is the code?
+
+All Chapter 5 code is in **this folder** (`code/chapter05/`):
+
+| Location | What you'll find |
+|----------|------------------|
+| **`scripts/`** | Scripts you run (prepare dataset, evaluate, validate). |
+| **`*.py`** (this folder) | Python package (training, eval, modeling). Run as `python -m chapter05.train_lora` etc. |
+| **`data/`** | Data files and golden sets. |
+
+Shared utilities (JSONL, env, seed) live in **`code/common/`**. Install from `code/` with `pip install -e .`.
+
+**Chapter outline and listing map:**
+
+| Listing | In the chapter | In the repo |
+|---------|----------------|-------------|
+| **5.1** | Data format; prepare dataset | `scripts/listing_5_1_prepare_dataset.py` |
+| **5.2** | LoRA config + SFTTrainer | `modeling.py`, `train_lora.py` |
+| **5.3** | Evaluation | `scripts/listing_5_3_evaluate.py` |
+| **5.4** | Inference with adapter | `generate.py` |
+| **5.5** | QLoRA 4-bit loading | `train_qlora.py` |
+| **5.6** | Safety regression test | `scripts/listing_5_3_evaluate.py` (safety section) |
+
+**Data folder (`data/`):** Dolly 15K is on Hugging Face (`databricks/databricks-dolly-15k`). Create a local subset with `listing_5_1_prepare_dataset.py --out chapter05/data/dolly_subset`. The repo includes `golden/` (small test files for eval) and `smoke/` (minimal train/valid for `validate_chapter05.py`).
+
+**What are `data.py` and `dataset.py`?**  
+- **`data.py`** - Loads chat JSONL (Dolly or messages format) into `ChatExample` objects; used by training and eval to read your train/valid/test files.  
+- **`dataset.py`** - Turns those examples into the format SFTTrainer needs (`prepare_dataset_for_sft`) or into tokenized batches for loss evaluation (`encode_examples`). Both are core to the chapter flow, not legacy.
+
+---
+
+## What We're Fine-Tuning
+
+We're fine-tuning Qwen3-4B-Instruct-2507 to improve **instruction-following quality** across diverse tasks. The base model is already instruction-tuned; the chapter demonstrates that even a 400-example LoRA pass produces measurable, category-dependent improvements.
+
+**What we measure:**
+- **Token-F1** (the primary metric for chapters 5 through 8): word-level overlap between the model's response and the reference, scored 0 to 1.
+- **Per-category Token-F1**: breakdown across the 8 Dolly categories (open QA, general QA, closed QA, creative writing, brainstorming, classification, summarization, information extraction).
+- **Safety refusal rate**: fraction of red-team prompts the model declines to answer; watched for regression after fine-tuning.
+
+**Expected results** (representative measured values on the chapter's 400 / 50 / 50 Dolly split with `seed=42`; your numbers will move within ±0.02 across hardware and library versions):
+
+- Base Qwen3-4B-Instruct-2507: Token-F1 ≈ 0.212, safety refusal 100%.
+- After LoRA (r=16, 3 epochs): Token-F1 ≈ 0.345 (+0.13), safety refusal can drop substantially (-40 to -80 pp in our measurements).
+- After QLoRA (r=8, 3 epochs): Token-F1 ≈ 0.39, safety refusal ≈ 40-60%.
+
+The safety regression on the broader Dolly subset is real and load-bearing for the chapter — it motivates the safety-regression suite that follows the eval and previews the safety conversation in chapter 6 and chapter 8.
+
+## Why Dolly 15K?
+
+We use **`databricks/databricks-dolly-15k`** because:
+
+1. **Narrative continuity.** Chapter 4 uses Dolly 15K for few-shot prompting (no training). Chapter 5 uses the same dataset for LoRA fine-tuning, showing the progression from prompting to training on the same data. Chapter 6 reuses it for full SFT on a technical-support subset.
+2. **Real public dataset.** Dolly 15K is widely used and commercially viable (CC-BY-SA-3.0). Human-authored, not synthetic.
+3. **Measurable tasks.** Eight distinct categories with enough examples in each to surface per-category effects.
+4. **Right size for LoRA.** A 400-example training set is the sweet spot: enough to show improvement, small enough to run end to end in ~10-15 minutes on a single consumer GPU.
+
+## Prerequisites
+
+### One-Time Setup (Fresh Machine)
+
+**First-time setup:** If you haven't set up the book environment yet, follow the detailed instructions in **`code/README.md`** (one directory up). This includes:
+- Checking Python version (**3.12+ required**)
+- Installing system prerequisites (Ubuntu/Debian: `python3-venv`)
+- Creating virtual environment
+- Installing PyTorch (CPU or CUDA)
+- Installing the book package
+
+Once you've completed the general setup, come back here for Chapter 5-specific steps.
+
+**Required for Chapter 5's QLoRA branch (Step 5) — install with the QLoRA extra.** The LoRA pass (Steps 1-4) works on the base `pip install -e ".[dev]"` install; QLoRA needs bitsandbytes for 4-bit quantization. From the `code/` directory:
+
+```bash
+pip install -e ".[qlora]"
+```
+
+QLoRA is optional. If you do not plan to run Step 5, you can skip this extra.
+
+> **On a Mac?** QLoRA (Step 5) does not run on Apple Silicon: `bitsandbytes` 4-bit kernels are CUDA/ROCm-only, with no Metal/MPS build. Removing `bitsandbytes` would not make QLoRA run on a Mac, it would just remove the 4-bit path that makes it QLoRA. Use the LoRA branch (Steps 1-4), which needs no `bitsandbytes` and trains on MPS. See [ACCELERATORS.md](../../ACCELERATORS.md#why-qlora-needs-an-nvidia-or-amd-gpu) for the full explanation.
+
+### Verify Your Setup (Recommended)
+
+Before investing time in full training runs, validate that everything is installed correctly:
+
+```bash
+python chapter05/scripts/validate_chapter05.py
+```
+
+**What this does:**
+1. **Checks** Python version
+2. **Verifies** required data files exist (smoke test datasets, safety prompts)
+3. **Confirms** PyTorch is installed and detects CUDA availability
+4. **Runs** a tiny 2-step LoRA training (smoke test) to ensure the full pipeline works
+5. **Validates** the adapter was created successfully
+
+**Why run this?**
+- **Catches setup issues early** - Better to find missing dependencies now than 15 minutes into a full training run
+- **Tests the complete workflow** - Loads model, tokenizes data, runs training, saves adapter
+- **Takes only 2-3 minutes** - Much faster than debugging a failed full training run
+- **GPU-aware** - Skips training test if no GPU detected (to avoid slow CPU runs)
+- **Chapter-specific** - Each chapter has its own validation script tailored to its requirements (other chapters may have different dependencies or model sizes)
+
+**Expected output:**
+```
+Chapter 5 validation
+- Python: 3.12.3
+- Datasets: **OK**
+- Torch: 2.10.0+cu126
+- CUDA available: True
+- Running tiny LoRA smoke training...
+  [Progress bars and training logs]
+- Smoke training: **OK** (adapter written to chapter05/runs/validate_lora_smoke)
+```
+
+**If validation fails**, it will show a clear error message indicating what's missing (e.g., "PyTorch not installed" or "Missing required files").
+
+### GPU Requirements
+
+- **LoRA**: minimum **8 GB VRAM** (RTX 3060 / 4060 class).
+- **QLoRA**: minimum **6 GB VRAM** (works on smaller GPUs).
+- **Recommended**: **12 GB+ VRAM** (RTX 4070 / 4080, NVIDIA A30, A100) for faster training.
+- **Training time on a single A30**: ~10-12 minutes for LoRA, ~14 minutes for QLoRA (400 examples, 3 epochs). On smaller GPUs allocate up to 25-35 minutes.
+
+## Step-by-Step Instructions
+
+**Run all commands below from the `code/` directory with your virtual environment activated.** If you reopened the terminal or reconnected via SSH, activate the venv first (this is a common cause of "No module named 'chapter05'"):
+
+```bash
+cd /path/to/ModelAdaptationBook/code
+source .venv/bin/activate   # Linux/macOS
+# Windows:  .venv\Scripts\activate
+```
+
+### Step 1: Download and Prepare the Dataset
+
+Download and prepare a subset of Dolly 15K:
+
+**Linux/macOS:**
+```bash
+# From the code/ directory (venv active)
+python chapter05/scripts/listing_5_1_prepare_dataset.py \
+  --out chapter05/data/dolly_subset \
+  --seed 42 \
+  --train 400 \
+  --valid 50 \
+  --test 50
+```
+
+**Windows (CMD; in PowerShell, replace each trailing `^` with a backtick):**
+```powershell
+python chapter05/scripts/listing_5_1_prepare_dataset.py ^
+  --out chapter05/data/dolly_subset ^
+  --seed 42 ^
+  --train 400 ^
+  --valid 50 ^
+  --test 50
+```
+
+This will:
+- Download Dolly 15K from Hugging Face (first run only)
+- Filter examples by length (20-2000 characters)
+- Create train/valid/test splits with seed=42 for reproducibility
+- Convert to messages format compatible with SFTTrainer
+- Save to `chapter05/data/dolly_subset/`
+
+**Expected output:**
+```
+Loading Databricks Dolly 15K dataset...
+Filtered to ~13880 examples (length 20-2000 chars)
+Wrote Dolly 15K subset to: chapter05/data/dolly_subset
+  - Train: 400 examples
+  - Valid: 50 examples
+  - Test: 50 examples
+  - Categories: {'open_qa': 107, 'general_qa': 69, 'classification': 61, ...}
+```
+
+Dolly 15K has 8 task categories (`open_qa`, `general_qa`, `closed_qa`, `summarization`, `brainstorming`, `classification`, `information_extraction`, `creative_writing`); with `--seed 42 --train 400` the breakdown above is what you will see.
+
+**Outcome types in your own data:** Dolly contains no refusals or tone-tagged examples, which are response types you typically add for an internal assistant. For worked `messages`-format rows showing a refusal, a clarification, and a tone tag (plus a note on inter-annotator agreement for Q&A), see [examples/example_data_prep_outcome_types.md](examples/example_data_prep_outcome_types.md).
+
+### Step 2: Train LoRA Adapter
+
+Train a LoRA adapter using TRL's SFTTrainer:
+
+**Linux/macOS:**
+```bash
+python -m chapter05.train_lora \
+  --train chapter05/data/dolly_subset/train.jsonl \
+  --valid chapter05/data/dolly_subset/valid.jsonl \
+  --out chapter05/runs/dolly_lora
+```
+
+**Windows:**
+```powershell
+python -m chapter05.train_lora ^
+  --train chapter05/data/dolly_subset/train.jsonl ^
+  --valid chapter05/data/dolly_subset/valid.jsonl ^
+  --out chapter05/runs/dolly_lora
+```
+
+**What happens:**
+- Loads base model (Qwen3-4B)
+- Creates LoRA config (r=16, alpha=32)
+- Trains for **3 epochs** (**15-20 minutes** on RTX 4070)
+- Saves adapter to `chapter05/runs/dolly_lora/`
+
+**Expected output:**
+```
+Saved LoRA adapter to: **chapter05/runs/dolly_lora**
+```
+
+### Step 3: Evaluate LoRA vs Base Model
+
+Compare the fine-tuned model to the base model:
+
+**Linux/macOS:**
+```bash
+python chapter05/scripts/listing_5_3_evaluate.py \
+  --base Qwen/Qwen3-4B-Instruct-2507 \
+  --adapter chapter05/runs/dolly_lora \
+  --dolly_test chapter05/data/dolly_subset/test.jsonl
+```
+
+**Windows:**
+```powershell
+python chapter05/scripts/listing_5_3_evaluate.py ^
+  --base Qwen/Qwen3-4B-Instruct-2507 ^
+  --adapter chapter05/runs/dolly_lora ^
+  --dolly_test chapter05/data/dolly_subset/test.jsonl
+```
+
+**This generates:**
+- `chapter05/runs/eval_report/report.json` - Detailed metrics
+- `chapter05/runs/eval_report/report.md` - **Human-readable summary**
+
+**What you'll see:**
+- Overall Token-F1 improvement (e.g., 0.21 → 0.34)
+- Per-category improvements (which task types improved most)
+- **Safety regression check** (ensures fine-tuning didn't break safety)
+
+### Step 4: Run Inference with the Adapter
+
+Generate text with the fine-tuned adapter. **Ensure you are in `code/` with the venv activated** (easy to forget after a new shell or SSH session):
+
+**Linux/macOS:**
+```bash
+cd /path/to/ModelAdaptationBook/code
+source .venv/bin/activate
+python -m chapter05.generate \
+  --base Qwen/Qwen3-4B-Instruct-2507 \
+  --adapter chapter05/runs/dolly_lora \
+  --prompt "Explain how photosynthesis works in simple terms."
+```
+
+**Windows:**
+```powershell
+cd C:\path\to\ModelAdaptationBook\code
+.venv\Scripts\activate
+python -m chapter05.generate ^
+  --base Qwen/Qwen3-4B-Instruct-2507 ^
+  --adapter chapter05/runs/dolly_lora ^
+  --prompt "Explain how photosynthesis works in simple terms."
+```
+
+**Side-by-side example:** A full example with the same prompt run on the base model and on the base + adapter (commands, outputs, and what to notice) is in [examples/example_inference_base_vs_adapter.md](examples/example_inference_base_vs_adapter.md). A screenshot of the terminal output is in [images/chap5-inference_base_vs_adapter.png](images/chap5-inference_base_vs_adapter.png)—useful for comparing base vs adapter at a glance.
+
+### Step 5: QLoRA (optional step)
+
+QLoRA uses 4-bit quantization, enabling training on smaller GPUs. (This step requires the `qlora` extra from the Chapter 5 prerequisites; if you skipped it, run `pip install -e ".[qlora]"` from the `code/` directory first.)
+
+**Linux/macOS:**
+```bash
+python -m chapter05.train_qlora \
+  --train chapter05/data/dolly_subset/train.jsonl \
+  --valid chapter05/data/dolly_subset/valid.jsonl \
+  --out chapter05/runs/dolly_qlora
+```
+
+**Windows:**
+```powershell
+python -m chapter05.train_qlora ^
+  --train chapter05/data/dolly_subset/train.jsonl ^
+  --valid chapter05/data/dolly_subset/valid.jsonl ^
+  --out chapter05/runs/dolly_qlora
+```
+
+**Differences from LoRA:**
+- Uses 4-bit quantization (bitsandbytes)
+- Lower default rank (r=8 vs r=16)
+- Slightly longer training time (25-35 minutes)
+- Similar or slightly lower accuracy (~1-2% difference)
+
+**Expected output:** Training logs show loss, learning rate, and mean token accuracy per step; at the end you'll see `Saved QLoRA adapter to: chapter05/runs/dolly_qlora`. For a full example log and an explanation of each line (including the tokenizer PAD message and HF warning), see [examples/example_qlora_training_output.md](examples/example_qlora_training_output.md).
+
+To compare LoRA vs QLoRA after training both:
+
+**Linux/macOS:**
+```bash
+python chapter05/scripts/listing_5_3_evaluate.py \
+  --base Qwen/Qwen3-4B-Instruct-2507 \
+  --adapter chapter05/runs/dolly_lora \
+  --adapter_alt chapter05/runs/dolly_qlora \
+  --dolly_test chapter05/data/dolly_subset/test.jsonl
+```
+
+**Windows:**
+```powershell
+python chapter05/scripts/listing_5_3_evaluate.py ^
+  --base Qwen/Qwen3-4B-Instruct-2507 ^
+  --adapter chapter05/runs/dolly_lora ^
+  --adapter_alt chapter05/runs/dolly_qlora ^
+  --dolly_test chapter05/data/dolly_subset/test.jsonl
+```
+
+**Expected output:** Steps 1–4 run for the base and LoRA adapter; then the script loads and evaluates the alternative adapter (QLoRA) and writes one report comparing all three. For a full example log and explanation of each step, see [examples/example_qlora_evaluation_output.md](examples/example_qlora_evaluation_output.md).
+
+**What you'll see:**
+```
+Step 1/4: Loading base model...
+**[OK]** Base model loaded
+
+Step 2/4: Evaluating base model...
+Evaluating examples... ━━━━━━━━━━━━━━ 50/50
+Running safety checks... ━━━━━━━━━━━━ 10/10
+**[OK]** Base evaluation complete
+
+Step 3/4: Loading adapter from chapter05/runs/dolly_lora...
+**[OK]** Adapter loaded
+
+Step 4/4: Evaluating fine-tuned model...
+Evaluating examples... ━━━━━━━━━━━━━━ 50/50
+Running safety checks... ━━━━━━━━━━━━ 10/10
+**[OK]** Fine-tuned evaluation complete
+
+**[OK] Evaluation complete!**
+**[OK]** JSON report: chapter05/runs/eval_report/report.json
+**[OK]** Markdown summary: chapter05/runs/eval_report/report.md
+```
+
+Evaluation takes **5-10 minutes** total on a single GPU. The progress bars show exactly what's happening at each stage.
+
+## Understanding the Results
+
+### Evaluation Metrics
+
+The evaluation script measures:
+
+| Metric | Description |
+|--------|--------------|
+| **Exact Match (EM)** | Percentage of responses that exactly match the reference (after normalization) |
+| **Token F1** | Token-level F1 score (measures partial correctness) |
+
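+**Per-category metrics** (Token-F1 broken down by task type; Dolly's eighth category, `general_qa` for general-knowledge questions, is reported alongside the ones listed below):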
+
+| Category | Description |
+|----------|--------------|
+| `open_qa` | Open-ended questions |
+| `closed_qa` | Factual questions with specific answers |
+| `creative_writing` | Creative tasks |
+| `brainstorming` | Idea generation |
+| `classification` | Categorization tasks |
+| `summarization` | Text summarization |
+| `information_extraction` | Extracting structured info |
+
+### Expected Results
+
+With only 400 training examples, absolute scores are modest. Focus on **deltas** vs the base model.
+
+**Base Qwen3-4B-Instruct-2507** (the floor):
+- Overall exact match: 0%
+- Overall Token-F1: 0.21
+- Safety refusal rate: 100% (well-aligned base)
+
+**After LoRA (r=16, 3 epochs, 400 examples)** — representative measured numbers (your run will vary within ±0.02 on F1 across hardware and library versions):
+- Overall exact match: 0%
+- **Overall Token-F1: ~0.34-0.39** (Δ +0.13 to +0.18)
+- **Safety refusal rate: 20-60%** (Δ −40 to −80 pp — see the warning below)
+- Per-category: strong gains in classification (+0.48 F1) and summarization (+0.29 F1); modest on open QA (+0.15); small or negative on creative writing and brainstorming.
+
+**The safety regression is real.** On our validated 2026-05-09 run, the LoRA adapter dropped the safety-refusal rate from 100% to 20% on a 10-prompt red-team set — the adapter answers 8 of 10 prompts the base model correctly refuses. The chapter's safety-regression suite catches this; the fix is to either (a) keep a smaller LoRA rank such as `r=8`, (b) add explicit refusal examples to the training data, or (c) follow with a preference-optimisation pass (chapter 8) to re-instill the alignment.
+
+**For higher absolute scores:** scale to 1,000-2,000 training examples. Expect Token-F1 in the 0.50-0.70 range and EM in the 15-35% range, at proportionally longer training time.
+
+**→ See [examples/README_INTERPRETING_RESULTS.md](examples/README_INTERPRETING_RESULTS.md) for detailed guidance on understanding your results.** For a full example of a report comparing base, LoRA, and QLoRA (with section-by-section interpretation), see [examples/example_eval_report_lora_vs_qlora.md](examples/example_eval_report_lora_vs_qlora.md).
+
+**Why We See Improvement:**
+- Base model is general-purpose; fine-tuning adapts it to the specific instruction style and task distribution in Dolly
+- With small datasets (400 examples), models specialize but may show mixed results across categories
+- LoRA learns to better follow the instruction format and response patterns
+- 400 examples is enough to show clear improvement without overfitting
+
+### Safety Regression Check
+
+The evaluation also runs a safety suite to check whether fine-tuning weakened the model's safety guardrails. What to look for:
+- **Refusal rate:** Ideally similar to the base model; on the chapter's Dolly subset, expect a measurable drop (see **Expected Results** above)
+- **If refusal rate drops significantly**, that's a red flag — the adapter may need more safety examples
+
+## Troubleshooting
+
+### **"No module named 'chapter05'"**
+- **Cause:** The shell is not using the virtual environment, or you're not in the `code/` directory. Common after reopening a terminal or reconnecting via SSH.
+- **Fix:** From the repo root, go to `code/`, activate the venv, then run your command:
+  ```bash
+  cd /path/to/ModelAdaptationBook/code
+  source .venv/bin/activate   # Linux/macOS
+  # Windows:  .venv\Scripts\activate
+  python -m chapter05.generate --base Qwen/Qwen3-4B-Instruct-2507 --prompt "Your prompt"
+  ```
+- If you never created a venv here, follow **Prerequisites** in this README and in `code/README.md`.
+
+### **"CUDA out of memory"**
+- Reduce `--batch_size` if you raised it (default: 1, which is already the minimum)
+- Increase `--grad_accum` to maintain effective batch size
+- Use **QLoRA instead of LoRA** (lower memory)
+
+### **"Dataset not found"**
+- **Run `listing_5_1_prepare_dataset.py` first** (Step 1)
+- Check that files exist: `chapter05/data/dolly_subset/train.jsonl`
+
+### "TRL not installed"
+- Install: `pip install "trl>=0.9.0"` (keep the quotes so the shell does not treat `>` as a redirect)
+- Or reinstall: `pip install -e "."` (should include trl from pyproject.toml)
+
+### Training is slow
+- Check GPU is being used: `nvidia-smi` should show Python process
+- Reduce `--max_length` if using very long sequences
+- Use QLoRA for faster training on some GPUs
+
+## Testing on Another Machine
+
+On a fresh clone, follow **Prerequisites** (above) then **Step-by-Step Instructions** (Steps 1-3: prepare data, train, evaluate). With the same data and seed (42), results should match within **2-3%** across machines.
+
+## Advanced: Multi-LoRA
+
+Train multiple adapters for different purposes:
+
+```bash
+# Train adapter A
+python -m chapter05.train_lora --train data_a.jsonl --out runs/adapter_a ...
+
+# Train adapter B  
+python -m chapter05.train_lora --train data_b.jsonl --out runs/adapter_b ...
+
+# Compare at inference (Linux/macOS)
+python -m chapter05.multi_lora_demo \
+  --adapter_a chapter05/runs/adapter_a \
+  --adapter_b chapter05/runs/adapter_b \
+  --prompt "Your prompt here"
+
+# Windows
+python -m chapter05.multi_lora_demo ^
+  --adapter_a chapter05/runs/adapter_a ^
+  --adapter_b chapter05/runs/adapter_b ^
+  --prompt "Your prompt here"
+```
+
+## Publishing Adapters (Optional)
+
+Publish your adapter to Hugging Face Hub. First, authenticate once (the token is cached at `~/.cache/huggingface/token` and reused by future commands):
+
+```bash
+huggingface-cli login
+# paste a token with "write" scope from https://huggingface.co/settings/tokens
+# answer "n" to the git credentials prompt
+```
+
+The publish command picks the cached token up automatically; `HF_TOKEN` env var and `--hf_token` flag are also supported.
+
+**Linux/macOS:**
+```bash
+python chapter05/scripts/publish_adapter.py \
+  --adapter chapter05/runs/dolly_lora \
+  --repo_id <your-username>/qwen3-4b-dolly-lora \
+  --private \
+  --dataset_manifest chapter05/data/dolly_subset/manifest.json \
+  --eval_report chapter05/runs/eval_report/report.json
+```
+
+**Windows:**
+```powershell
+python chapter05/scripts/publish_adapter.py ^
+  --adapter chapter05/runs/dolly_lora ^
+  --repo_id <your-username>/qwen3-4b-dolly-lora ^
+  --private ^
+  --dataset_manifest chapter05/data/dolly_subset/manifest.json ^
+  --eval_report chapter05/runs/eval_report/report.json
+```
+
+## See Also
+
+- [Contoso domain-adaptation example, where an adapter beats prompting (base vs. format-prompt vs. LoRA, with sample outputs)](../it_support_qa/README.md) — the section 5.1.8 / figure 5.5 example, full dataset and reproducible run
+- [Base vs LoRA vs QLoRA inference output (same prompt)](examples/example_inference_base_vs_adapter.md)
+- [QLoRA training log and interpretation](examples/example_qlora_training_output.md)
+- [LoRA vs QLoRA evaluation run](examples/example_qlora_evaluation_output.md)
+- [Full eval report (base/LoRA/QLoRA) and how to read it](examples/example_eval_report_lora_vs_qlora.md)
+- [How to interpret evaluation results](examples/README_INTERPRETING_RESULTS.md)
+- [Production deployment patterns](docs/inference_enterprise.md)
+- [Manual evaluation guidelines](docs/human_review_checklist.md)
+
+**Images (`images/`):** Screenshots used in the examples above: `chap5-inference_base_vs_adapter.png`, `chap5-qlora_inference.png`, `chap5-qlora_training.png`, `chap5-qlora_training_gpu.png`, `chap5-qlora_lora_evals.png`.
+
+## Running Tests
+
+Chapter 5 includes unit tests for data processing and metrics:
+
+```bash
+# From code/ directory
+pytest chapter05/tests/
+
+# Run specific test file
+pytest chapter05/tests/test_metrics.py
+pytest chapter05/tests/test_data_normalization.py
+```
+
+**What the tests cover:**
+- `test_metrics.py` - Tests for exact match and token F1 metrics
+- `test_data_normalization.py` - Tests for data format conversions
+
+To install test dependencies:
+```bash
+pip install -e ".[dev]"  # Includes pytest, ruff
+```
+
+## Troubleshooting
+
+### "The tokenizer has new PAD/BOS/EOS tokens" Warning
+
+During training (Step 2), you may see:
+```
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. 
+The model config and generation config were aligned accordingly, being updated with the tokenizer's values. 
+Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
+```
+
+**This is expected and harmless.** Here's why:
+
+- Qwen models don't ship with a dedicated PAD token
+- Our code sets `pad_token = eos_token` (standard practice for Qwen)
+- TRL's SFTTrainer detects this and updates the model config to match
+- Training proceeds normally and produces valid adapters
+
+**No action needed.** Your model will train and generate text correctly.
+
+**Technical note:** Using EOS as PAD is the standard approach for Qwen models. The base model is already instruction-tuned and knows when to stop generating, so this doesn't affect generation quality in practice.
+
+## W&B (Optional, Non-Fatal)
+
+Enable experiment tracking:
+
+```bash
+pip install -e ".[wandb]"
+setx BOOKCODE_REPORT_TO wandb  # Windows
+export BOOKCODE_REPORT_TO=wandb  # macOS/Linux
+```
+
+Disable if not needed:
+```bash
+setx WANDB_DISABLED true  # Windows
+export WANDB_DISABLED=true  # macOS/Linux
+```
diff --git a/code/chapter05/eval.py b/code/chapter05/eval.py
index 2dd7773..e7b2ba1 100644
--- a/code/chapter05/eval.py
+++ b/code/chapter05/eval.py
@@ -6,8 +6,8 @@
     3. **Toy golden set** - Simple Q&A pairs to sanity-check model behavior.
 
 Also includes loss/perplexity computation on held-out JSONL data, and report
-generation (JSON + Markdown). Used by ``scripts/listing_5_4_evaluate.py``
-(Listing 5.4) to compare base model vs. adapter variants.
+generation (JSON + Markdown). Used by ``scripts/listing_5_3_evaluate.py``
+(Listing 5.3) to compare base model vs. adapter variants.
 """
 from __future__ import annotations
 
@@ -473,7 +473,7 @@ def write_report(path: str | Path, obj: Dict[str, Any]) -> None:
     """Write an evaluation results dict as a JSON file.
 
     The JSON report is the machine-readable counterpart to the human-readable
-    Markdown summary generated by ``listing_5_4_evaluate.py``. Both are saved
+    Markdown summary generated by ``listing_5_3_evaluate.py``. Both are saved
     to the same output directory (e.g., ``chapter05/runs/eval_report/``).
 
     Args:
diff --git a/code/chapter05/examples/README_INTERPRETING_RESULTS.md b/code/chapter05/examples/README_INTERPRETING_RESULTS.md
index 11e71c5..86b9df0 100644
--- a/code/chapter05/examples/README_INTERPRETING_RESULTS.md
+++ b/code/chapter05/examples/README_INTERPRETING_RESULTS.md
@@ -1,6 +1,6 @@
 # Understanding Your Evaluation Results
 
-This guide helps you interpret the evaluation report from `listing_5_4_evaluate.py`.
+This guide helps you interpret the evaluation report from `listing_5_3_evaluate.py`.
 
 ---
 
diff --git a/code/chapter05/examples/example_data_prep_outcome_types.md b/code/chapter05/examples/example_data_prep_outcome_types.md
index 043bf13..ec58546 100644
--- a/code/chapter05/examples/example_data_prep_outcome_types.md
+++ b/code/chapter05/examples/example_data_prep_outcome_types.md
@@ -3,7 +3,7 @@
 These illustrate the response types discussed in the chapter's "Data quality
 iterations" section, using the Contoso IT-support assistant. Each is a single
 training row in the same `messages` format produced by
-`scripts/listing_5_2_prepare_dataset.py` (see `dolly_to_messages`).
+`scripts/listing_5_1_prepare_dataset.py` (see `dolly_to_messages`).
 
 > **These rows are illustrative.** The Dolly 15K subset used in this chapter
 > contains no refusals and no tone tags, so these are examples of what you would
diff --git a/code/chapter05/examples/example_qlora_evaluation_output.md b/code/chapter05/examples/example_qlora_evaluation_output.md
index aba0cf1..40bd14f 100644
--- a/code/chapter05/examples/example_qlora_evaluation_output.md
+++ b/code/chapter05/examples/example_qlora_evaluation_output.md
@@ -5,7 +5,7 @@ This file captures a typical run of the evaluation script when comparing the **b
 ## Command
 
 ```bash
-python chapter05/scripts/listing_5_4_evaluate.py \
+python chapter05/scripts/listing_5_3_evaluate.py \
   --base Qwen/Qwen3-4B-Instruct-2507 \
   --adapter chapter05/runs/dolly_lora \
   --adapter_alt chapter05/runs/dolly_qlora \
diff --git a/code/chapter05/generate.py b/code/chapter05/generate.py
index 41809db..a7ed810 100644
--- a/code/chapter05/generate.py
+++ b/code/chapter05/generate.py
@@ -1,110 +1,110 @@
-"""Inference script: generate text with the base model and an optional LoRA/QLoRA adapter (Listing 5.5).
-
-Loads the base model, optionally attaches a LoRA or QLoRA adapter, and generates
-a response for a single user prompt. Supports adapter merging for deployment.
-
-Usage (base model only):
-    python -m chapter05.generate --base Qwen/Qwen3-4B-Instruct-2507 \\
-        --prompt "Explain how photosynthesis works in simple terms."
-
-Usage (with LoRA adapter):
-    python -m chapter05.generate --base Qwen/Qwen3-4B-Instruct-2507 \\
-        --adapter chapter05/runs/dolly_lora \\
-        --prompt "Explain how photosynthesis works in simple terms."
-
-Usage (with QLoRA adapter -- must use --quantized_4bit):
-    python -m chapter05.generate --base Qwen/Qwen3-4B-Instruct-2507 \\
-        --adapter chapter05/runs/dolly_qlora --quantized_4bit \\
-        --prompt "Explain how photosynthesis works in simple terms."
-
-See Chapter 5, Section 5.1 (Step 4) and the README for full details.
-"""
-from __future__ import annotations
-
-import argparse
-from pathlib import Path
-
-import torch
-from peft import PeftModel
-from transformers import AutoModelForCausalLM
-
-from chapter05 import DEFAULT_MODEL_NAME
-from chapter05.chat_template import DEFAULT_SYSTEM_PROMPT, build_prompt_text
-from chapter05.modeling import load_base_model_lora, load_base_model_qlora, load_tokenizer
-
-
-def parse_args() -> argparse.Namespace:
-    """Parse command-line arguments for inference.
-
-    Returns:
-        Namespace with base model, adapter path, prompt, generation settings,
-        and optional merge/quantization flags.
-    """
-    ap = argparse.ArgumentParser()
-    ap.add_argument("--base", default=DEFAULT_MODEL_NAME)
-    ap.add_argument("--adapter", default=None, help="Path to LoRA/QLoRA adapter folder")
-    ap.add_argument("--prompt", required=True, help="User prompt")
-    ap.add_argument("--system_prompt", default=DEFAULT_SYSTEM_PROMPT)
-    ap.add_argument("--max_new_tokens", type=int, default=128)
-    ap.add_argument("--do_sample", action="store_true")
-    ap.add_argument("--temperature", type=float, default=0.7)
-    ap.add_argument("--quantized_4bit", action="store_true", help="Load base in 4-bit (requires bitsandbytes)")
-    ap.add_argument("--merge_adapter", action="store_true", help="Merge adapter into base before generation")
-    ap.add_argument("--save_merged", default=None, help="If set, save merged model to this folder")
-    return ap.parse_args()
-
-
-def main() -> None:
-    """Load model, optionally attach adapter, and generate a response."""
-    args = parse_args()
-    tokenizer = load_tokenizer(args.base)
-
-    # Use --quantized_4bit when running a QLoRA-trained adapter so the base
-    # model is loaded in 4-bit (matching the precision used during training).
-    if args.quantized_4bit:
-        model = load_base_model_qlora(args.base, gradient_checkpointing=False)
-    else:
-        model = load_base_model_lora(args.base, gradient_checkpointing=False)
-
-    if args.adapter:
-        model = PeftModel.from_pretrained(model, args.adapter)
-        if args.merge_adapter:
-            # merge_and_unload() permanently folds LoRA weights into the base.
-            # This loses modularity (can't swap adapters) but can be faster
-            # for high-throughput serving. See Section 5.11 deployment options.
-            model = model.merge_and_unload()
-            if args.save_merged:
-                Path(args.save_merged).mkdir(parents=True, exist_ok=True)
-                model.save_pretrained(args.save_merged)
-                tokenizer.save_pretrained(args.save_merged)
-
-    model.eval()
-
-    messages = [
-        {"role": "system", "content": args.system_prompt},
-        {"role": "user", "content": args.prompt},
-    ]
-    text = build_prompt_text(tokenizer, messages)
-    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False).to(model.device)
-
-    with torch.no_grad():
-        out = model.generate(
-            **inputs,
-            max_new_tokens=args.max_new_tokens,
-            do_sample=args.do_sample,
-            # Pass temperature=None when not sampling to avoid HF warnings
-            # about unused generation parameters.
-            temperature=args.temperature if args.do_sample else None,
-            pad_token_id=tokenizer.pad_token_id,
-            eos_token_id=tokenizer.eos_token_id,
-        )
-
-    # skip_special_tokens=False to show the full chat template (system/user/assistant
-    # markers). This is useful for debugging and demonstrating the template structure.
-    decoded = tokenizer.decode(out[0], skip_special_tokens=False)
-    print(decoded)
-
-
-if __name__ == "__main__":
-    main()
-
+"""Inference script: generate text with the base model and an optional LoRA/QLoRA adapter (Listing 5.4).
+
+Loads the base model, optionally attaches a LoRA or QLoRA adapter, and generates
+a response for a single user prompt. Supports adapter merging for deployment.
+
+Usage (base model only):
+    python -m chapter05.generate --base Qwen/Qwen3-4B-Instruct-2507 \\
+        --prompt "Explain how photosynthesis works in simple terms."
+
+Usage (with LoRA adapter):
+    python -m chapter05.generate --base Qwen/Qwen3-4B-Instruct-2507 \\
+        --adapter chapter05/runs/dolly_lora \\
+        --prompt "Explain how photosynthesis works in simple terms."
+
+Usage (with QLoRA adapter -- must use --quantized_4bit):
+    python -m chapter05.generate --base Qwen/Qwen3-4B-Instruct-2507 \\
+        --adapter chapter05/runs/dolly_qlora --quantized_4bit \\
+        --prompt "Explain how photosynthesis works in simple terms."
+
+See Chapter 5, Section 5.1 (Step 4) and the README for full details.
+"""
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+import torch
+from peft import PeftModel
+from transformers import AutoModelForCausalLM
+
+from chapter05 import DEFAULT_MODEL_NAME
+from chapter05.chat_template import DEFAULT_SYSTEM_PROMPT, build_prompt_text
+from chapter05.modeling import load_base_model_lora, load_base_model_qlora, load_tokenizer
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse command-line arguments for inference.
+
+    Returns:
+        Namespace with base model, adapter path, prompt, generation settings,
+        and optional merge/quantization flags.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--base", default=DEFAULT_MODEL_NAME)
+    ap.add_argument("--adapter", default=None, help="Path to LoRA/QLoRA adapter folder")
+    ap.add_argument("--prompt", required=True, help="User prompt")
+    ap.add_argument("--system_prompt", default=DEFAULT_SYSTEM_PROMPT)
+    ap.add_argument("--max_new_tokens", type=int, default=128)
+    ap.add_argument("--do_sample", action="store_true")
+    ap.add_argument("--temperature", type=float, default=0.7)
+    ap.add_argument("--quantized_4bit", action="store_true", help="Load base in 4-bit (requires bitsandbytes)")
+    ap.add_argument("--merge_adapter", action="store_true", help="Merge adapter into base before generation")
+    ap.add_argument("--save_merged", default=None, help="If set, save merged model to this folder")
+    return ap.parse_args()
+
+
+def main() -> None:
+    """Load the base model, attach (and optionally merge/save) an adapter, then print one generation."""
+    args = parse_args()
+    tokenizer = load_tokenizer(args.base)
+
+    # Use --quantized_4bit when running a QLoRA-trained adapter so the base
+    # model is loaded in 4-bit (matching the precision used during training).
+    if args.quantized_4bit:
+        model = load_base_model_qlora(args.base, gradient_checkpointing=False)
+    else:
+        model = load_base_model_lora(args.base, gradient_checkpointing=False)
+
+    if args.adapter:
+        model = PeftModel.from_pretrained(model, args.adapter)
+        if args.merge_adapter or args.save_merged:
+            # merge_and_unload() permanently folds LoRA weights into the base
+            # (no adapter swapping afterwards; see Section 5.11). --save_merged
+            # implies merging so that flag is never silently ignored.
+            model = model.merge_and_unload()
+            if args.save_merged:
+                Path(args.save_merged).mkdir(parents=True, exist_ok=True)
+                model.save_pretrained(args.save_merged)
+                tokenizer.save_pretrained(args.save_merged)
+
+    model.eval()
+
+    messages = [
+        {"role": "system", "content": args.system_prompt},
+        {"role": "user", "content": args.prompt},
+    ]
+    text = build_prompt_text(tokenizer, messages)
+    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False).to(model.device)
+
+    with torch.no_grad():
+        out = model.generate(
+            **inputs,
+            max_new_tokens=args.max_new_tokens,
+            do_sample=args.do_sample,
+            # Pass temperature=None when not sampling to avoid HF warnings
+            # about unused generation parameters.
+            temperature=args.temperature if args.do_sample else None,
+            pad_token_id=tokenizer.pad_token_id,
+            eos_token_id=tokenizer.eos_token_id,
+        )
+
+    # skip_special_tokens=False to show the full chat template (system/user/assistant
+    # markers). This is useful for debugging and demonstrating the template structure.
+    decoded = tokenizer.decode(out[0], skip_special_tokens=False)
+    print(decoded)
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/code/chapter05/modeling.py b/code/chapter05/modeling.py
index 243c512..01d4a1f 100644
--- a/code/chapter05/modeling.py
+++ b/code/chapter05/modeling.py
@@ -1,188 +1,188 @@
-"""Model loading utilities for LoRA and QLoRA fine-tuning (Listing 5.3).
-
-Provides functions to:
-    - Load the base model in full precision (for LoRA) or 4-bit quantized (for QLoRA).
-    - Load and configure the tokenizer with proper padding.
-    - Create and apply LoRA adapter configurations.
-
-Used by train_lora.py, train_qlora.py, generate.py, and eval.py.
-"""
-from __future__ import annotations
-
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Sequence, Tuple
-
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-
-from peft import LoraConfig, prepare_model_for_kbit_training
-
-from .chat_template import ensure_padding
-
-
-# Standard attention and MLP projection modules in Transformer architectures.
-# Adapting all attention projections (q/k/v/o) plus MLP projections (up/gate/down)
-# gives the best quality/cost balance. See Section 5.5 for guidance.
-DEFAULT_TARGET_MODULES = [
-    "q_proj",
-    "k_proj",
-    "v_proj",
-    "o_proj",
-    "up_proj",
-    "gate_proj",
-    "down_proj",
-]
-
-
-@dataclass(frozen=True)
-class LoadedModel:
-    model: Any
-    tokenizer: Any
-
-
-def load_tokenizer(model_name: str):
-    """Load and configure a tokenizer with proper padding for the given model.
-
-    Args:
-        model_name: HuggingFace model ID or local path.
-
-    Returns:
-        A configured AutoTokenizer with padding set (see ensure_padding).
-    """
-    tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-    ensure_padding(tok)
-    return tok
-
-
-def load_base_model_lora(
-    model_name: str,
-    *,
-    device_map: str = "auto",
-    dtype: str | torch.dtype = "auto",
-    gradient_checkpointing: bool = True,
-):
-    """Load the base model in full precision for LoRA fine-tuning or inference.
-
-    Args:
-        model_name: HuggingFace model ID or local path.
-        device_map: Device placement strategy. "auto" distributes layers across
-            available GPUs (or CPU if no GPU), which is the simplest approach
-            for single-GPU setups.
-        dtype: Weight dtype. "auto" lets HF choose the best dtype for the hardware.
-        gradient_checkpointing: If True, trades compute for memory by recomputing
-            activations during backward. Roughly halves memory at ~20% speed cost.
-            Disable for inference (no backward pass needed).
-
-    Returns:
-        A HuggingFace AutoModelForCausalLM ready for LoRA adapter attachment.
-    """
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        device_map=device_map,
-        dtype=dtype,
-        trust_remote_code=True,
-    )
-    if gradient_checkpointing:
-        model.gradient_checkpointing_enable()
-        # KV cache is incompatible with gradient checkpointing during training.
-        model.config.use_cache = False
-    return model
-
-
-def load_base_model_qlora(
-    model_name: str,
-    *,
-    device_map: str = "auto",
-    compute_dtype: torch.dtype = torch.bfloat16,
-    gradient_checkpointing: bool = True,
-):
-    """Load the base model in 4-bit quantized form for QLoRA fine-tuning.
-
-    Uses bitsandbytes NF4 (NormalFloat4) quantization with double quantization
-    to compress the base model to ~4 bits per parameter. This reduces GPU memory
-    by roughly 4x compared to full precision, enabling fine-tuning of larger
-    models on smaller GPUs.
-
-    Args:
-        model_name: HuggingFace model ID or local path.
-        device_map: Device placement strategy (same as load_base_model_lora).
-        compute_dtype: Dtype for computation during forward/backward passes.
-            bf16 is preferred for its wider dynamic range.
-        gradient_checkpointing: If True, enable gradient checkpointing for
-            additional memory savings. Disable for inference.
-
-    Returns:
-        A quantized model prepared for k-bit training (gradients enabled on
-        non-quantized parameters like LayerNorm and LoRA adapters).
-    """
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        # NF4 (NormalFloat4): a 4-bit data type optimized for normally-distributed
-        # weights, giving higher precision near zero where most weights cluster.
-        bnb_4bit_quant_type="nf4",
-        # Double quantization: further compresses the quantization constants
-        # themselves, saving ~0.4 bits per parameter with minimal quality loss.
-        bnb_4bit_use_double_quant=True,
-        bnb_4bit_compute_dtype=compute_dtype,
-    )
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        device_map=device_map,
-        quantization_config=bnb_config,
-        trust_remote_code=True,
-    )
-    # prepare_model_for_kbit_training enables gradients on non-quantized layers
-    # (LayerNorm, embeddings) and sets up proper dtype casting for mixed-precision
-    # training with quantized weights.
-    model = prepare_model_for_kbit_training(model)
-    if gradient_checkpointing:
-        model.gradient_checkpointing_enable()
-        model.config.use_cache = False
-    return model
-
-
-def create_lora_config(
-    *,
-    r: int,
-    alpha: int,
-    dropout: float,
-    target_modules: Sequence[str] = DEFAULT_TARGET_MODULES,
-) -> LoraConfig:
-    """Create a LoRA configuration for use with SFTTrainer.
-    
-    Returns LoraConfig that can be passed directly to SFTTrainer's peft_config parameter.
-    SFTTrainer will automatically apply the LoRA adapters during training.
-    """
-    return LoraConfig(
-        r=r,
-        lora_alpha=alpha,
-        lora_dropout=dropout,
-        target_modules=list(target_modules),
-        bias="none",
-        task_type="CAUSAL_LM",
-    )
-
-
-# Keep apply_lora for backward compatibility (used by eval/inference code)
-def apply_lora(
-    model,
-    *,
-    r: int,
-    alpha: int,
-    dropout: float,
-    target_modules: Sequence[str] = DEFAULT_TARGET_MODULES,
-):
-    """Apply LoRA to a model (for backward compatibility with eval/inference code).
-    
-    Note: For training, use create_lora_config() and pass to SFTTrainer instead.
-    """
-    from peft import get_peft_model
-    
-    cfg = create_lora_config(
-        r=r,
-        alpha=alpha,
-        dropout=dropout,
-        target_modules=target_modules,
-    )
-    model = get_peft_model(model, cfg)
-    return model
+"""Model loading utilities for LoRA and QLoRA fine-tuning (Listing 5.2).
+
+Provides functions to:
+    - Load the base model in full precision (for LoRA) or 4-bit quantized (for QLoRA).
+    - Load and configure the tokenizer with proper padding.
+    - Create and apply LoRA adapter configurations.
+
+Used by train_lora.py, train_qlora.py, generate.py, and eval.py.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Sequence, Tuple
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+from peft import LoraConfig, prepare_model_for_kbit_training
+
+from .chat_template import ensure_padding
+
+
+# Standard attention and MLP projection modules in Transformer architectures.
+# Adapting all attention projections (q/k/v/o) plus MLP projections (up/gate/down)
+# gives the best quality/cost balance. See Section 5.5 for guidance.
+DEFAULT_TARGET_MODULES = [
+    "q_proj",
+    "k_proj",
+    "v_proj",
+    "o_proj",
+    "up_proj",
+    "gate_proj",
+    "down_proj",
+]
+
+
+@dataclass(frozen=True)
+class LoadedModel:
+    model: Any
+    tokenizer: Any
+
+
+def load_tokenizer(model_name: str):
+    """Load and configure a tokenizer with proper padding for the given model.
+
+    Args:
+        model_name: HuggingFace model ID or local path.
+
+    Returns:
+        A configured AutoTokenizer with padding set (see ensure_padding).
+    """
+    tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    ensure_padding(tok)
+    return tok
+
+
+def load_base_model_lora(
+    model_name: str,
+    *,
+    device_map: str = "auto",
+    dtype: str | torch.dtype = "auto",
+    gradient_checkpointing: bool = True,
+):
+    """Load the unquantized base model for LoRA fine-tuning or inference.
+
+    Args:
+        model_name: HuggingFace model ID or local path.
+        device_map: Device placement strategy. "auto" distributes layers across
+            available GPUs (or CPU if no GPU), which is the simplest approach
+            for single-GPU setups.
+        dtype: Weight dtype. "auto" defers to the dtype recorded with the checkpoint.
+        gradient_checkpointing: If True, trades compute for memory by recomputing
+            activations during backward, and turns off the KV cache.
+            Disable for inference (no backward pass needed).
+
+    Returns:
+        A HuggingFace AutoModelForCausalLM ready for LoRA adapter attachment.
+    """
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        device_map=device_map,
+        dtype=dtype,
+        trust_remote_code=True,
+    )
+    if gradient_checkpointing:
+        model.gradient_checkpointing_enable()
+        # KV cache is incompatible with gradient checkpointing during training.
+        model.config.use_cache = False
+    return model
+
+
+def load_base_model_qlora(
+    model_name: str,
+    *,
+    device_map: str = "auto",
+    compute_dtype: torch.dtype = torch.bfloat16,
+    gradient_checkpointing: bool = True,
+):
+    """Load the base model in 4-bit quantized form for QLoRA fine-tuning.
+
+    Uses bitsandbytes NF4 (NormalFloat4) quantization with double quantization
+    to compress the base model to ~4 bits per parameter. This reduces GPU memory
+    by roughly 4x compared to 16-bit weights, enabling fine-tuning of larger
+    models on smaller GPUs.
+
+    Args:
+        model_name: HuggingFace model ID or local path.
+        device_map: Device placement strategy (same as load_base_model_lora).
+        compute_dtype: Dtype for computation during forward/backward passes.
+            bf16 is preferred for its wider dynamic range.
+        gradient_checkpointing: If True, enable gradient checkpointing for
+            additional memory savings. Disable for inference.
+
+    Returns:
+        A quantized model prepared for k-bit training: base weights are frozen,
+        so LoRA adapters attached afterwards are the only trainable parameters.
+    """
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        # NF4 (NormalFloat4): a 4-bit data type optimized for normally-distributed
+        # weights, giving higher precision near zero where most weights cluster.
+        bnb_4bit_quant_type="nf4",
+        # Double quantization: further compresses the quantization constants
+        # themselves, saving ~0.4 bits per parameter with minimal quality loss.
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_compute_dtype=compute_dtype,
+    )
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        device_map=device_map,
+        quantization_config=bnb_config,
+        trust_remote_code=True,
+    )
+    # prepare_model_for_kbit_training freezes the base weights and upcasts the
+    # remaining non-quantized parameters for stable training. Its checkpointing
+    # default is True, so forward our flag or inference callers get it anyway.
+    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing)
+    if gradient_checkpointing:
+        model.gradient_checkpointing_enable()
+        model.config.use_cache = False
+    return model
+
+
+def create_lora_config(
+    *,
+    r: int,
+    alpha: int,
+    dropout: float,
+    target_modules: Sequence[str] = DEFAULT_TARGET_MODULES,
+) -> LoraConfig:
+    """Create a LoRA configuration for use with SFTTrainer.
+    
+    Returns LoraConfig that can be passed directly to SFTTrainer's peft_config parameter.
+    SFTTrainer will automatically apply the LoRA adapters during training.
+    """
+    return LoraConfig(
+        r=r,
+        lora_alpha=alpha,
+        lora_dropout=dropout,
+        target_modules=list(target_modules),
+        bias="none",
+        task_type="CAUSAL_LM",
+    )
+
+
+# Keep apply_lora for backward compatibility (used by eval/inference code)
+def apply_lora(
+    model,
+    *,
+    r: int,
+    alpha: int,
+    dropout: float,
+    target_modules: Sequence[str] = DEFAULT_TARGET_MODULES,
+):
+    """Apply LoRA to a model (for backward compatibility with eval/inference code).
+    
+    Note: For training, use create_lora_config() and pass to SFTTrainer instead.
+    """
+    from peft import get_peft_model
+    
+    cfg = create_lora_config(
+        r=r,
+        alpha=alpha,
+        dropout=dropout,
+        target_modules=target_modules,
+    )
+    model = get_peft_model(model, cfg)
+    return model
diff --git a/code/chapter05/scripts/fix_safety_complete.ps1 b/code/chapter05/scripts/fix_safety_complete.ps1
index 3ea259a..0ef74b3 100644
--- a/code/chapter05/scripts/fix_safety_complete.ps1
+++ b/code/chapter05/scripts/fix_safety_complete.ps1
@@ -80,7 +80,7 @@ Write-Host "Step 4/4: Evaluating and comparing results..." -ForegroundColor Cyan
 Write-Host "⏱  Time: 5-10 minutes" -ForegroundColor Gray
 Write-Host ""
 
-python chapter05/scripts/listing_5_4_evaluate.py `
+python chapter05/scripts/listing_5_3_evaluate.py `
   --base Qwen/Qwen3-4B-Instruct-2507 `
   --adapter chapter05/runs/dolly_lora `
   --adapter_alt chapter05/runs/dolly_lora_with_safety `
diff --git a/code/chapter05/scripts/fix_safety_complete.sh b/code/chapter05/scripts/fix_safety_complete.sh
index eff0bcd..28620fd 100644
--- a/code/chapter05/scripts/fix_safety_complete.sh
+++ b/code/chapter05/scripts/fix_safety_complete.sh
@@ -81,7 +81,7 @@ echo "Step 4/4: Evaluating and comparing results..."
 echo "⏱  Time: 5-10 minutes"
 echo ""
 
-python chapter05/scripts/listing_5_4_evaluate.py \
+python chapter05/scripts/listing_5_3_evaluate.py \
   --base Qwen/Qwen3-4B-Instruct-2507 \
   --adapter chapter05/runs/dolly_lora \
   --adapter_alt chapter05/runs/dolly_lora_with_safety \
diff --git a/code/chapter05/scripts/fix_safety_regression.ps1 b/code/chapter05/scripts/fix_safety_regression.ps1
index abe2eb8..bef274b 100644
--- a/code/chapter05/scripts/fix_safety_regression.ps1
+++ b/code/chapter05/scripts/fix_safety_regression.ps1
@@ -48,7 +48,7 @@ Write-Host "Step 2/2: Evaluating both adapters (r=16 vs r=8)..." -ForegroundColo
 Write-Host "⏱  Estimated time: 5-10 minutes" -ForegroundColor Gray
 Write-Host ""
 
-python chapter05/scripts/listing_5_4_evaluate.py `
+python chapter05/scripts/listing_5_3_evaluate.py `
   --base Qwen/Qwen3-4B-Instruct-2507 `
   --adapter chapter05/runs/dolly_lora `
   --adapter_alt chapter05/runs/dolly_lora_r8 `
diff --git a/code/chapter05/scripts/fix_safety_regression.sh b/code/chapter05/scripts/fix_safety_regression.sh
index f72f1bc..6f10aa8 100644
--- a/code/chapter05/scripts/fix_safety_regression.sh
+++ b/code/chapter05/scripts/fix_safety_regression.sh
@@ -49,7 +49,7 @@ echo "Step 2/2: Evaluating both adapters (r=16 vs r=8)..."
 echo "⏱  Estimated time: 5-10 minutes"
 echo ""
 
-python chapter05/scripts/listing_5_4_evaluate.py \
+python chapter05/scripts/listing_5_3_evaluate.py \
   --base Qwen/Qwen3-4B-Instruct-2507 \
   --adapter chapter05/runs/dolly_lora \
   --adapter_alt chapter05/runs/dolly_lora_r8 \
diff --git a/code/chapter05/scripts/listing_5_2_prepare_dataset.py b/code/chapter05/scripts/listing_5_1_prepare_dataset.py
similarity index 96%
rename from code/chapter05/scripts/listing_5_2_prepare_dataset.py
rename to code/chapter05/scripts/listing_5_1_prepare_dataset.py
index 77f7e16..d9abc49 100644
--- a/code/chapter05/scripts/listing_5_2_prepare_dataset.py
+++ b/code/chapter05/scripts/listing_5_1_prepare_dataset.py
@@ -1,211 +1,211 @@
-"""Step 1 of the hands-on project: download Dolly 15K and prepare a subset.
-
-This script:
-  1. Downloads the Databricks Dolly 15K dataset from Hugging Face (first run only;
-     subsequent runs use the cached copy).
-  2. Filters examples by length (--min_length / --max_length).
-  3. Shuffles with a fixed seed and splits into train/valid/test.
-  4. Converts each example to messages format (system, user, assistant) and
-     writes train.jsonl, valid.jsonl, test.jsonl, and manifest.json to --out.
-
-Run from the repo root (code/) so that chapter05 and common are importable.
-Example:
-  python chapter05/scripts/listing_5_2_prepare_dataset.py \\
-    --out chapter05/data/dolly_subset --seed 42 --train 400 --valid 50 --test 50
-"""
-from __future__ import annotations
-
-import argparse
-import datetime as dt
-from collections import Counter
-from pathlib import Path
-from typing import Any, Dict, List
-
-from datasets import load_dataset
-
-from chapter05.chat_template import DEFAULT_SYSTEM_PROMPT
-from common.jsonl import write_jsonl
-from common.manifest import write_json
-
-
-def dolly_to_messages(
-    instruction: str,
-    context: str | None,
-    response: str,
-    *,
-    system_prompt: str,
-) -> Dict[str, Any]:
-    """Convert Dolly format (instruction, context, response) to messages format.
-    
-    Dolly format:
-    - instruction: The task/question
-    - context: Optional background information
-    - response: The answer/output
-    
-    We combine instruction + context (if present) into the user message.
-    """
-    # Combine instruction and context for user message
-    if context and context.strip():
-        user_content = f"{context}\n\n{instruction}"
-    else:
-        user_content = instruction
-    
-    return {
-        "messages": [
-            {"role": "system", "content": system_prompt},
-            {"role": "user", "content": user_content},
-            {"role": "assistant", "content": response},
-        ]
-    }
-
-
-def parse_args() -> argparse.Namespace:
-    """Parse command-line arguments for dataset preparation.
-
-    Returns:
-        Namespace with output path, seed, split sizes, system prompt, and
-        length filter thresholds.
-    """
-    ap = argparse.ArgumentParser(
-        description="Prepare a subset of Databricks Dolly 15K for LoRA fine-tuning."
-    )
-    ap.add_argument("--out", required=True, help="Output folder (will create train/valid/test.jsonl)")
-    ap.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility")
-    ap.add_argument("--train", type=int, default=400, help="Number of training examples")
-    ap.add_argument("--valid", type=int, default=50, help="Number of validation examples")
-    ap.add_argument("--test", type=int, default=50, help="Number of test examples")
-    ap.add_argument(
-        "--system_prompt",
-        default=DEFAULT_SYSTEM_PROMPT,
-        help="System prompt to use for all examples",
-    )
-    ap.add_argument(
-        "--min_length",
-        type=int,
-        default=20,
-        help="Minimum character length for instruction+response (filter short examples)",
-    )
-    ap.add_argument(
-        "--max_length",
-        type=int,
-        default=2000,
-        help="Maximum character length for instruction+response (filter very long examples)",
-    )
-    return ap.parse_args()
-
-
-def main() -> None:
-    """Download Dolly 15K, filter by length, split, convert to messages, and write JSONL."""
-    args = parse_args()
-    out_dir = Path(args.out)
-    out_dir.mkdir(parents=True, exist_ok=True)
-
-    print("Step 1: Download and prepare dataset")
-    print("  Downloading Databricks Dolly 15K from Hugging Face (first run may take a minute)...")
-    ds = load_dataset("databricks/databricks-dolly-15k", split="train")
-    print("  Loaded. Filtering and splitting...")
-    
-    # Filter and shuffle
-    import random
-    rng = random.Random(args.seed)
-    
-    filtered_examples = []
-    for example in ds:
-        instruction = example.get("instruction", "")
-        context = example.get("context", "")
-        response = example.get("response", "")
-        
-        # Calculate total length (instruction + context + response)
-        total_length = len(instruction) + len(context or "") + len(response)
-        
-        if args.min_length <= total_length <= args.max_length:
-            filtered_examples.append(example)
-    
-    print(f"Filtered to {len(filtered_examples)} examples (length {args.min_length}-{args.max_length} chars)")
-    
-    # Shuffle with seed
-    rng.shuffle(filtered_examples)
-    
-    total_needed = args.train + args.valid + args.test
-    if len(filtered_examples) < total_needed:
-        raise RuntimeError(
-            f"Not enough examples after filtering: have {len(filtered_examples)}, need {total_needed}"
-        )
-    
-    # Split into train/valid/test
-    train_examples = filtered_examples[: args.train]
-    valid_examples = filtered_examples[args.train : args.train + args.valid]
-    test_examples = filtered_examples[args.train + args.valid : args.train + args.valid + args.test]
-    
-    # Convert to messages format, preserving category info
-    train_rows = []
-    for ex in train_examples:
-        msg_row = dolly_to_messages(
-            ex["instruction"],
-            ex.get("context"),
-            ex["response"],
-            system_prompt=args.system_prompt,
-        )
-        # Preserve category for evaluation
-        msg_row["category"] = ex.get("category", "unknown")
-        train_rows.append(msg_row)
-    
-    valid_rows = []
-    for ex in valid_examples:
-        msg_row = dolly_to_messages(
-            ex["instruction"],
-            ex.get("context"),
-            ex["response"],
-            system_prompt=args.system_prompt,
-        )
-        msg_row["category"] = ex.get("category", "unknown")
-        valid_rows.append(msg_row)
-    
-    test_rows = []
-    for ex in test_examples:
-        msg_row = dolly_to_messages(
-            ex["instruction"],
-            ex.get("context"),
-            ex["response"],
-            system_prompt=args.system_prompt,
-        )
-        msg_row["category"] = ex.get("category", "unknown")
-        test_rows.append(msg_row)
-    
-    # Count categories for manifest
-    train_categories = Counter(ex.get("category", "unknown") for ex in train_examples)
-    
-    # Write JSONL files
-    write_jsonl(out_dir / "train.jsonl", train_rows)
-    write_jsonl(out_dir / "valid.jsonl", valid_rows)
-    write_jsonl(out_dir / "test.jsonl", test_rows)
-    
-    # Write manifest
-    manifest = {
-        "dataset": "databricks/databricks-dolly-15k",
-        "split": "train",
-        "created_utc": dt.datetime.now(dt.timezone.utc).isoformat().replace("+00:00", "Z"),
-        "seed": args.seed,
-        "filters": {
-            "min_length": args.min_length,
-            "max_length": args.max_length,
-        },
-        "counts": {
-            "train": len(train_rows),
-            "valid": len(valid_rows),
-            "test": len(test_rows),
-        },
-        "category_distribution": dict(train_categories),
-        "system_prompt": args.system_prompt,
-    }
-    write_json(out_dir / "manifest.json", manifest)
-    
-    print(f"\n✓ Wrote Dolly 15K subset to: {out_dir}")
-    print(f"  - Train: {len(train_rows)} examples")
-    print(f"  - Valid: {len(valid_rows)} examples")
-    print(f"  - Test: {len(test_rows)} examples")
-    print(f"  - Categories: {dict(train_categories)}")
-
-
-if __name__ == "__main__":
-    main()
+"""Step 1 of the hands-on project: download Dolly 15K and prepare a subset.
+
+This script:
+  1. Downloads the Databricks Dolly 15K dataset from Hugging Face (first run only;
+     subsequent runs use the cached copy).
+  2. Filters examples by length (--min_length / --max_length).
+  3. Shuffles with a fixed seed and splits into train/valid/test.
+  4. Converts each example to messages format (system, user, assistant) and
+     writes train.jsonl, valid.jsonl, test.jsonl, and manifest.json to --out.
+
+Run from the repo root (code/) so that chapter05 and common are importable.
+Example:
+  python chapter05/scripts/listing_5_1_prepare_dataset.py \\
+    --out chapter05/data/dolly_subset --seed 42 --train 400 --valid 50 --test 50
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as dt
+from collections import Counter
+from pathlib import Path
+from typing import Any, Dict, List
+
+from datasets import load_dataset
+
+from chapter05.chat_template import DEFAULT_SYSTEM_PROMPT
+from common.jsonl import write_jsonl
+from common.manifest import write_json
+
+
+def dolly_to_messages(
+    instruction: str,
+    context: str | None,
+    response: str,
+    *,
+    system_prompt: str,
+) -> Dict[str, Any]:
+    """Convert one Dolly record into a system/user/assistant messages row.
+
+    Args:
+        instruction: The task or question.
+        context: Optional background; None, empty, or whitespace-only is ignored.
+        response: The reference answer, used verbatim as the assistant message.
+        system_prompt: Content of the leading system message (keyword-only).
+    Returns: {"messages": [...]}; user turn is context, blank line, instruction.
+    """
+    # Context (when non-blank) goes first, then a blank line, then the instruction
+    if context and context.strip():
+        user_content = f"{context}\n\n{instruction}"
+    else:
+        user_content = instruction
+    
+    return {
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_content},
+            {"role": "assistant", "content": response},
+        ]
+    }
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse command-line arguments for dataset preparation.
+
+    Returns:
+        Namespace with out, seed, train/valid/test sizes, system_prompt, and the
+        min_length/max_length character bounds used by the length filter.
+    """
+    ap = argparse.ArgumentParser(
+        description="Prepare a subset of Databricks Dolly 15K for LoRA fine-tuning."
+    )
+    ap.add_argument("--out", required=True, help="Output folder (will create train/valid/test.jsonl and manifest.json)")
+    ap.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility")
+    ap.add_argument("--train", type=int, default=400, help="Number of training examples")
+    ap.add_argument("--valid", type=int, default=50, help="Number of validation examples")
+    ap.add_argument("--test", type=int, default=50, help="Number of test examples")
+    ap.add_argument(
+        "--system_prompt",
+        default=DEFAULT_SYSTEM_PROMPT,
+        help="System prompt to use for all examples",
+    )
+    ap.add_argument(
+        "--min_length",
+        type=int,
+        default=20,
+        help="Minimum total character length of instruction+context+response (filters short examples)",
+    )
+    ap.add_argument(
+        "--max_length",
+        type=int,
+        default=2000,
+        help="Maximum total character length of instruction+context+response (filters very long examples)",
+    )
+    return ap.parse_args()
+
+
+def main() -> None:
+    """Load Dolly 15K, filter, shuffle, split, and write JSONL + manifest; raises RuntimeError if too few examples remain."""
+    args = parse_args()
+    out_dir = Path(args.out)
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    print("Step 1: Download and prepare dataset")
+    print("  Downloading Databricks Dolly 15K from Hugging Face (first run may take a minute)...")
+    ds = load_dataset("databricks/databricks-dolly-15k", split="train")
+    print("  Loaded. Filtering and splitting...")
+    
+    # Seeded RNG so the shuffle below (and hence the split) is reproducible for a given --seed
+    import random
+    rng = random.Random(args.seed)
+    
+    filtered_examples = []
+    for example in ds:
+        instruction = example.get("instruction", "")
+        context = example.get("context", "")
+        response = example.get("response", "")
+        
+        # Total character count; context may be None or empty, so it falls back to ""
+        total_length = len(instruction) + len(context or "") + len(response)
+        
+        if args.min_length <= total_length <= args.max_length:
+            filtered_examples.append(example)
+    
+    print(f"Filtered to {len(filtered_examples)} examples (length {args.min_length}-{args.max_length} chars)")
+    
+    # Shuffle in place with the seeded RNG
+    rng.shuffle(filtered_examples)
+    
+    total_needed = args.train + args.valid + args.test
+    if len(filtered_examples) < total_needed:
+        raise RuntimeError(
+            f"Not enough examples after filtering: have {len(filtered_examples)}, need {total_needed}"
+        )
+    
+    # Take consecutive, non-overlapping train/valid/test slices of the shuffled list
+    train_examples = filtered_examples[: args.train]
+    valid_examples = filtered_examples[args.train : args.train + args.valid]
+    test_examples = filtered_examples[args.train + args.valid : args.train + args.valid + args.test]
+    
+    # Convert to messages format, preserving category info
+    train_rows = []
+    for ex in train_examples:
+        msg_row = dolly_to_messages(
+            ex["instruction"],
+            ex.get("context"),
+            ex["response"],
+            system_prompt=args.system_prompt,
+        )
+        # Keep the Dolly category on each row ("unknown" when absent)
+        msg_row["category"] = ex.get("category", "unknown")
+        train_rows.append(msg_row)
+    
+    valid_rows = []
+    for ex in valid_examples:
+        msg_row = dolly_to_messages(
+            ex["instruction"],
+            ex.get("context"),
+            ex["response"],
+            system_prompt=args.system_prompt,
+        )
+        msg_row["category"] = ex.get("category", "unknown")
+        valid_rows.append(msg_row)
+    
+    test_rows = []
+    for ex in test_examples:
+        msg_row = dolly_to_messages(
+            ex["instruction"],
+            ex.get("context"),
+            ex["response"],
+            system_prompt=args.system_prompt,
+        )
+        msg_row["category"] = ex.get("category", "unknown")
+        test_rows.append(msg_row)
+    
+    # Category counts for the manifest come from the training split only
+    train_categories = Counter(ex.get("category", "unknown") for ex in train_examples)
+    
+    # Write JSONL files
+    write_jsonl(out_dir / "train.jsonl", train_rows)
+    write_jsonl(out_dir / "valid.jsonl", valid_rows)
+    write_jsonl(out_dir / "test.jsonl", test_rows)
+    
+    # Write manifest (records seed, filters, split sizes, and system prompt for this run)
+    manifest = {
+        "dataset": "databricks/databricks-dolly-15k",
+        "split": "train",
+        "created_utc": dt.datetime.now(dt.timezone.utc).isoformat().replace("+00:00", "Z"),
+        "seed": args.seed,
+        "filters": {
+            "min_length": args.min_length,
+            "max_length": args.max_length,
+        },
+        "counts": {
+            "train": len(train_rows),
+            "valid": len(valid_rows),
+            "test": len(test_rows),
+        },
+        "category_distribution": dict(train_categories),
+        "system_prompt": args.system_prompt,
+    }
+    write_json(out_dir / "manifest.json", manifest)
+    
+    print(f"\n✓ Wrote Dolly 15K subset to: {out_dir}")
+    print(f"  - Train: {len(train_rows)} examples")
+    print(f"  - Valid: {len(valid_rows)} examples")
+    print(f"  - Test: {len(test_rows)} examples")
+    print(f"  - Categories: {dict(train_categories)}")
+
+if __name__ == "__main__":
+    main()
diff --git a/code/chapter05/scripts/listing_5_4_evaluate.py b/code/chapter05/scripts/listing_5_3_evaluate.py
similarity index 96%
rename from code/chapter05/scripts/listing_5_4_evaluate.py
rename to code/chapter05/scripts/listing_5_3_evaluate.py
index 53f6c31..d6ef8c1 100644
--- a/code/chapter05/scripts/listing_5_4_evaluate.py
+++ b/code/chapter05/scripts/listing_5_3_evaluate.py
@@ -1,309 +1,309 @@
-"""Evaluation script comparing base model vs. fine-tuned adapter(s) (Listing 5.4).
-
-Loads the base model, evaluates it, then loads one or two adapters and evaluates
-each, computing per-metric deltas. Produces both a machine-readable JSON report
-and a human-readable Markdown summary.
-
-Usage (base vs. single adapter):
-    python chapter05/scripts/listing_5_4_evaluate.py \\
-        --base Qwen/Qwen3-4B-Instruct-2507 \\
-        --adapter chapter05/runs/dolly_lora \\
-        --dolly_test chapter05/data/dolly_subset/test.jsonl
-
-Usage (base vs. LoRA vs. QLoRA):
-    python chapter05/scripts/listing_5_4_evaluate.py \\
-        --base Qwen/Qwen3-4B-Instruct-2507 \\
-        --adapter chapter05/runs/dolly_lora \\
-        --adapter_alt chapter05/runs/dolly_qlora \\
-        --dolly_test chapter05/data/dolly_subset/test.jsonl
-
-See Chapter 5, Section 5.1 (Step 3) for walkthrough and expected results.
-"""
-from __future__ import annotations
-
-import argparse
-from pathlib import Path
-from typing import Any, Dict, Optional
-
-from chapter05.eval import (
-    eval_dolly_test_set,
-    eval_loss_on_jsonl,
-    eval_toy_golden,
-    load_model_variant,
-    safety_suite,
-    write_report,
-)
-from chapter05.chat_template import DEFAULT_SYSTEM_PROMPT
-
-
-def parse_args() -> argparse.Namespace:
-    """Parse command-line arguments for the evaluation script.
-
-    Returns:
-        Namespace with base model, adapter paths, test data paths,
-        generation settings, and output directory.
-    """
-    ap = argparse.ArgumentParser()
-    ap.add_argument("--base", default="Qwen/Qwen3-4B-Instruct-2507")
-    ap.add_argument("--adapter", default=None, help="Adapter folder path for main run")
-    ap.add_argument("--adapter_alt", default=None, help="Adapter folder path for comparison run")
-
-    ap.add_argument("--dolly_test", default=None, help="Dolly test set JSONL path (primary evaluation)")
-    ap.add_argument("--toy_golden", default="chapter05/data/golden/toy_test.jsonl", help="Toy test set (optional)")
-    ap.add_argument("--safety_prompts", default="chapter05/data/golden/safety_regression_prompts.jsonl")
-
-    ap.add_argument(
-        "--system_prompt",
-        default=DEFAULT_SYSTEM_PROMPT,
-        help="System prompt used for safety suite (toy golden uses per-example system prompt).",
-    )
-
-    ap.add_argument("--max_new_tokens", type=int, default=128)
-    ap.add_argument("--max_length", type=int, default=512)
-
-    ap.add_argument("--out", default="chapter05/runs/eval_report", help="Output folder for reports")
-    return ap.parse_args()
-
-
-def summarize_variant(name: str, model, tokenizer, args: argparse.Namespace) -> Dict[str, Any]:
-    """Run the full evaluation suite for a single model variant.
-
-    Evaluates on the Dolly test set (instruction-following), the toy golden
-    set (sanity check), and the safety suite (refusal rate). Returns a dict
-    of all results for this variant.
-
-    Args:
-        name: Label for this variant (e.g., "base", "adapter", "adapter_alt").
-        model: A HuggingFace causal LM (base or with adapter attached).
-        tokenizer: Matching tokenizer.
-        args: Parsed CLI arguments with test data paths and generation settings.
-
-    Returns:
-        Dict with evaluation results keyed by test type ("dolly", "toy", "safety").
-    """
-    result: Dict[str, Any] = {"name": name}
-    
-    # Primary evaluation: Dolly test set
-    if args.dolly_test and Path(args.dolly_test).exists():
-        dolly_result = eval_dolly_test_set(
-            model,
-            tokenizer,
-            test_jsonl=args.dolly_test,
-            system_prompt=args.system_prompt,
-            max_new_tokens=256,
-        )
-        result["dolly"] = dolly_result
-    
-    # Legacy evaluations (optional)
-    if Path(args.toy_golden).exists():
-        result["toy"] = eval_toy_golden(
-            model, tokenizer, golden_jsonl=args.toy_golden, max_new_tokens=args.max_new_tokens
-        )
-    
-    result["safety"] = safety_suite(
-        model,
-        tokenizer,
-        prompts_jsonl=args.safety_prompts,
-        system_prompt=args.system_prompt,
-    )
-    
-    return result
-
-
-def main() -> None:
-    """Evaluate base model and adapter(s), compute deltas, and write reports."""
-    from rich.console import Console
-    console = Console()
-    
-    args = parse_args()
-    out_dir = Path(args.out)
-    out_dir.mkdir(parents=True, exist_ok=True)
-
-    console.print("\n[bold cyan]Step 1/4:[/bold cyan] Loading base model...")
-    base_model, base_tok = load_model_variant(base_model=args.base, adapter=None)
-    console.print("[green]✓[/green] Base model loaded\n")
-    
-    console.print("[bold cyan]Step 2/4:[/bold cyan] Evaluating base model...")
-    base_res = summarize_variant("base", base_model, base_tok, args)
-    console.print("[green]✓[/green] Base evaluation complete\n")
-
-    res: Dict[str, Any] = {"base": base_res}
-
-    if args.adapter:
-        console.print(f"[bold cyan]Step 3/4:[/bold cyan] Loading adapter from {args.adapter}...")
-        m, t = load_model_variant(base_model=args.base, adapter=args.adapter)
-        console.print("[green]✓[/green] Adapter loaded\n")
-        
-        console.print("[bold cyan]Step 4/4:[/bold cyan] Evaluating fine-tuned model...")
-        res["adapter"] = summarize_variant("adapter", m, t, args)
-        console.print("[green]✓[/green] Fine-tuned evaluation complete\n")
-
-    if args.adapter_alt:
-        console.print(f"[bold cyan]Loading alternative adapter from {args.adapter_alt}...[/bold cyan]")
-        m, t = load_model_variant(base_model=args.base, adapter=args.adapter_alt)
-        console.print("[green]✓[/green] Alternative adapter loaded\n")
-        
-        console.print("[bold cyan]Evaluating alternative adapter...[/bold cyan]")
-        res["adapter_alt"] = summarize_variant("adapter_alt", m, t, args)
-        console.print("[green]✓[/green] Alternative evaluation complete\n")
-
-    def maybe_delta(a: Optional[float], b: Optional[float]) -> Optional[float]:
-        """Compute a - b, returning None if either value is missing."""
-        if a is None or b is None:
-            return None
-        return float(a - b)
-
-    def compute_deltas(base: Dict[str, Any], other: Dict[str, Any]) -> Dict[str, Any]:
-        """Compute metric deltas between an adapter and the base model."""
-        deltas: Dict[str, Any] = {
-            "safety": {
-                "refusal_rate": maybe_delta(other["safety"]["refusal_rate"], base["safety"]["refusal_rate"]),
-            },
-        }
-        
-        # Dolly metrics (primary)
-        if base.get("dolly") and other.get("dolly"):
-            deltas["dolly"] = {
-                "exact_match": maybe_delta(other["dolly"]["exact_match"], base["dolly"]["exact_match"]),
-                "token_f1": maybe_delta(other["dolly"]["token_f1"], base["dolly"]["token_f1"]),
-                "category_metrics": {},
-            }
-            # Per-category deltas
-            base_cats = base["dolly"].get("category_metrics", {})
-            other_cats = other["dolly"].get("category_metrics", {})
-            for cat in set(base_cats.keys()) | set(other_cats.keys()):
-                if cat in base_cats and cat in other_cats:
-                    deltas["dolly"]["category_metrics"][cat] = {
-                        "exact_match": maybe_delta(
-                            other_cats[cat]["exact_match"], base_cats[cat]["exact_match"]
-                        ),
-                        "token_f1": maybe_delta(
-                            other_cats[cat]["token_f1"], base_cats[cat]["token_f1"]
-                        ),
-                    }
-        
-        # Toy metrics (optional)
-        if base.get("toy") and other.get("toy"):
-            deltas["toy"] = {
-                "exact_match": maybe_delta(other["toy"]["exact_match"], base["toy"]["exact_match"]),
-                "token_f1": maybe_delta(other["toy"]["token_f1"], base["toy"]["token_f1"]),
-            }
-        
-        return deltas
-
-    if "adapter" in res:
-        res["adapter_deltas_vs_base"] = compute_deltas(res["base"], res["adapter"])
-    if "adapter_alt" in res:
-        res["adapter_alt_deltas_vs_base"] = compute_deltas(res["base"], res["adapter_alt"])
-
-    # Write JSON
-    write_report(out_dir / "report.json", res)
-
-    # Write a short Markdown summary
-    def fmt_pct(x: float) -> str:
-        """Format a 0-1 float as a percentage string (e.g., 0.6 -> '60.0%')."""
-        return f"{x*100:.1f}%"
-
-    def fmt_delta(x: Optional[float], *, pct: bool = False) -> str:
-        """Format a delta value with +/- sign (e.g., +0.1321 or +13.2%)."""
-        if x is None:
-            return "n/a"
-        if pct:
-            return f"{x*100:+.1f}%"
-        return f"{x:+.4f}"
-
-    lines = []
-    lines.append(f"# Chapter 5 Evaluation Report")
-    lines.append("")
-    lines.append(f"- Base model: `{args.base}`")
-    lines.append(f"- System prompt: `{args.system_prompt}`")
-    if args.dolly_test:
-        lines.append(f"- Dolly test set: `{args.dolly_test}`")
-    if args.adapter:
-        lines.append(f"- Adapter: `{args.adapter}`")
-    if args.adapter_alt:
-        lines.append(f"- Adapter (alt): `{args.adapter_alt}`")
-    lines.append("")
-
-    for key in ["base", "adapter", "adapter_alt"]:
-        if key not in res:
-            continue
-        variant = res[key]
-        lines.append(f"## {key}")
-        
-        # Primary: Dolly metrics
-        if variant.get("dolly"):
-            d = variant["dolly"]
-            lines.append(f"### Dolly Test Set (Instruction-Following)")
-            lines.append(f"- **Overall exact match**: {fmt_pct(d['exact_match'])}")
-            lines.append(f"- **Overall token-F1**: {d['token_f1']:.3f}")
-            lines.append(f"- **Test examples**: {d['count']}")
-            if d.get("category_metrics"):
-                lines.append(f"\n**Per-Category Accuracy:**")
-                for cat, metrics in sorted(d["category_metrics"].items()):
-                    lines.append(
-                        f"- {cat}: EM={fmt_pct(metrics['exact_match'])}, F1={metrics['token_f1']:.3f} "
-                        f"(n={metrics['count']})"
-                    )
-            lines.append("")
-        
-        # Safety
-        safety = variant["safety"]
-        lines.append(f"- **Safety refusal rate**: {fmt_pct(safety['refusal_rate'])}")
-        
-        # Toy metrics (if present)
-        if variant.get("toy"):
-            toy = variant["toy"]
-            lines.append(f"- **Toy exact match**: {fmt_pct(toy['exact_match'])}")
-            lines.append(f"- **Toy token-F1**: {toy['token_f1']:.3f}")
-        
-        lines.append("")
-
-    # Delta section (base vs adapters)
-    def add_delta_block(delta_key: str, label: str) -> None:
-        """Append a Markdown section showing metric deltas vs. base."""
-        if delta_key not in res:
-            return
-        d = res[delta_key]
-        lines.append(f"## {label} (Improvement vs Base)")
-        
-        # Primary: Dolly metrics
-        if d.get("dolly"):
-            lines.append(f"### Dolly Test Set Improvements")
-            lines.append(f"- **Overall exact match Δ**: {fmt_delta(d['dolly']['exact_match'], pct=True)}")
-            lines.append(f"- **Overall token-F1 Δ**: {fmt_delta(d['dolly']['token_f1'])}")
-            if d["dolly"].get("category_metrics"):
-                lines.append(f"\n**Per-Category Improvements:**")
-                for cat, metrics in sorted(d["dolly"]["category_metrics"].items()):
-                    em_delta = metrics.get("exact_match")
-                    f1_delta = metrics.get("token_f1")
-                    if em_delta is not None or f1_delta is not None:
-                        em_str = fmt_delta(em_delta, pct=True) if em_delta is not None else "n/a"
-                        f1_str = fmt_delta(f1_delta) if f1_delta is not None else "n/a"
-                        lines.append(f"- {cat}: EM Δ={em_str}, F1 Δ={f1_str}")
-            lines.append("")
-        
-        # Safety
-        lines.append(f"- **Safety refusal rate Δ**: {fmt_delta(d['safety']['refusal_rate'], pct=True)}")
-        
-        # Toy metrics
-        if d.get("toy"):
-            lines.append(f"- **Toy exact match Δ**: {fmt_delta(d['toy']['exact_match'], pct=True)}")
-            lines.append(f"- **Toy token-F1 Δ**: {fmt_delta(d['toy']['token_f1'])}")
-        
-        lines.append("")
-
-    add_delta_block("adapter_deltas_vs_base", "adapter")
-    add_delta_block("adapter_alt_deltas_vs_base", "adapter_alt")
-
-    console.print("\n[bold cyan]Writing evaluation reports...[/bold cyan]")
-    (out_dir / "report.md").write_text("\n".join(lines) + "\n", encoding="utf-8")
-    
-    console.print(f"\n[bold green]✓ Evaluation complete![/bold green]")
-    console.print(f"[green]✓[/green] JSON report: {out_dir / 'report.json'}")
-    console.print(f"[green]✓[/green] Markdown summary: {out_dir / 'report.md'}")
-    console.print(f"\n[yellow]→[/yellow] View the markdown report for a human-readable summary")
-
-
-if __name__ == "__main__":
-    main()
+"""Evaluation script comparing base model vs. fine-tuned adapter(s) (Listing 5.3).
+
+Loads the base model, evaluates it, then loads one or two adapters and evaluates
+each, computing per-metric deltas. Produces both a machine-readable JSON report
+and a human-readable Markdown summary.
+
+Usage (base vs. single adapter):
+    python chapter05/scripts/listing_5_3_evaluate.py \\
+        --base Qwen/Qwen3-4B-Instruct-2507 \\
+        --adapter chapter05/runs/dolly_lora \\
+        --dolly_test chapter05/data/dolly_subset/test.jsonl
+
+Usage (base vs. LoRA vs. QLoRA):
+    python chapter05/scripts/listing_5_3_evaluate.py \\
+        --base Qwen/Qwen3-4B-Instruct-2507 \\
+        --adapter chapter05/runs/dolly_lora \\
+        --adapter_alt chapter05/runs/dolly_qlora \\
+        --dolly_test chapter05/data/dolly_subset/test.jsonl
+
+See Chapter 5, Section 5.1 (Step 3) for walkthrough and expected results.
+"""
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+from chapter05.eval import (
+    eval_dolly_test_set,
+    eval_loss_on_jsonl,
+    eval_toy_golden,
+    load_model_variant,
+    safety_suite,
+    write_report,
+)
+from chapter05.chat_template import DEFAULT_SYSTEM_PROMPT
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse command-line arguments for the evaluation script.
+
+    Returns:
+        Namespace with base model, adapter paths, test data paths,
+        generation settings, and output directory.
+    """
+    ap = argparse.ArgumentParser(description="Compare a base model against one or two fine-tuned adapters.")
+    ap.add_argument("--base", default="Qwen/Qwen3-4B-Instruct-2507", help="Base model ID or path")
+    ap.add_argument("--adapter", default=None, help="Adapter folder path for main run")
+    ap.add_argument("--adapter_alt", default=None, help="Adapter folder path for comparison run")
+
+    ap.add_argument("--dolly_test", default=None, help="Dolly test set JSONL path (primary evaluation)")
+    ap.add_argument("--toy_golden", default="chapter05/data/golden/toy_test.jsonl", help="Toy test set (optional)")
+    ap.add_argument("--safety_prompts", default="chapter05/data/golden/safety_regression_prompts.jsonl", help="Safety regression prompts JSONL path")
+
+    ap.add_argument(
+        "--system_prompt",
+        default=DEFAULT_SYSTEM_PROMPT,
+        help="System prompt used for the Dolly test set and safety suite (toy golden uses per-example system prompt).",
+    )
+
+    ap.add_argument("--max_new_tokens", type=int, default=128, help="Max new tokens for toy golden generation (the Dolly eval uses a fixed 256)")
+    ap.add_argument("--max_length", type=int, default=512)  # NOTE(review): not read by the evaluation code visible here - confirm it is still needed
+
+    ap.add_argument("--out", default="chapter05/runs/eval_report", help="Output folder for reports")
+    return ap.parse_args()
+
+
+def summarize_variant(name: str, model, tokenizer, args: argparse.Namespace) -> Dict[str, Any]:
+    """Run the full evaluation suite for a single model variant.
+
+    Runs the Dolly test set only when --dolly_test is set and the file exists,
+    the toy golden set only when its file exists, and the safety suite always.
+    Dolly generation uses a fixed max_new_tokens=256; toy uses args.max_new_tokens.
+
+    Args:
+        name: Label for this variant (e.g., "base", "adapter", "adapter_alt").
+        model: A HuggingFace causal LM (base or with adapter attached).
+        tokenizer: Matching tokenizer.
+        args: Parsed CLI arguments with test data paths and generation settings.
+
+    Returns:
+        Dict with "name", "safety", and, when they ran, "dolly" and "toy" results.
+    """
+    result: Dict[str, Any] = {"name": name}
+    
+    # Primary evaluation: Dolly test set (silently skipped if unset or the file is missing)
+    if args.dolly_test and Path(args.dolly_test).exists():
+        dolly_result = eval_dolly_test_set(
+            model,
+            tokenizer,
+            test_jsonl=args.dolly_test,
+            system_prompt=args.system_prompt,
+            max_new_tokens=256,
+        )
+        result["dolly"] = dolly_result
+    
+    # Toy golden set: optional sanity check, skipped when the file is missing
+    if Path(args.toy_golden).exists():
+        result["toy"] = eval_toy_golden(
+            model, tokenizer, golden_jsonl=args.toy_golden, max_new_tokens=args.max_new_tokens
+        )
+    
+    result["safety"] = safety_suite(
+        model,
+        tokenizer,
+        prompts_jsonl=args.safety_prompts,
+        system_prompt=args.system_prompt,
+    )
+    
+    return result
+
+
+def main() -> None:
+    """Evaluate base model and adapter(s), compute deltas, and write reports."""
+    from rich.console import Console
+    console = Console()
+    
+    args = parse_args()
+    out_dir = Path(args.out)
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    console.print("\n[bold cyan]Step 1/4:[/bold cyan] Loading base model...")
+    base_model, base_tok = load_model_variant(base_model=args.base, adapter=None)
+    console.print("[green]✓[/green] Base model loaded\n")
+    
+    console.print("[bold cyan]Step 2/4:[/bold cyan] Evaluating base model...")
+    base_res = summarize_variant("base", base_model, base_tok, args)
+    console.print("[green]✓[/green] Base evaluation complete\n")
+
+    res: Dict[str, Any] = {"base": base_res}
+
+    if args.adapter:
+        console.print(f"[bold cyan]Step 3/4:[/bold cyan] Loading adapter from {args.adapter}...")
+        m, t = load_model_variant(base_model=args.base, adapter=args.adapter)
+        console.print("[green]✓[/green] Adapter loaded\n")
+        
+        console.print("[bold cyan]Step 4/4:[/bold cyan] Evaluating fine-tuned model...")
+        res["adapter"] = summarize_variant("adapter", m, t, args)
+        console.print("[green]✓[/green] Fine-tuned evaluation complete\n")
+
+    if args.adapter_alt:
+        console.print(f"[bold cyan]Loading alternative adapter from {args.adapter_alt}...[/bold cyan]")
+        m, t = load_model_variant(base_model=args.base, adapter=args.adapter_alt)
+        console.print("[green]✓[/green] Alternative adapter loaded\n")
+        
+        console.print("[bold cyan]Evaluating alternative adapter...[/bold cyan]")
+        res["adapter_alt"] = summarize_variant("adapter_alt", m, t, args)
+        console.print("[green]✓[/green] Alternative evaluation complete\n")
+
+    def maybe_delta(a: Optional[float], b: Optional[float]) -> Optional[float]:
+        """Compute a - b, returning None if either value is missing."""
+        if a is None or b is None:
+            return None
+        return float(a - b)
+
+    def compute_deltas(base: Dict[str, Any], other: Dict[str, Any]) -> Dict[str, Any]:
+        """Compute metric deltas between an adapter and the base model."""
+        deltas: Dict[str, Any] = {
+            "safety": {
+                "refusal_rate": maybe_delta(other["safety"]["refusal_rate"], base["safety"]["refusal_rate"]),
+            },
+        }
+        
+        # Dolly metrics (primary)
+        if base.get("dolly") and other.get("dolly"):
+            deltas["dolly"] = {
+                "exact_match": maybe_delta(other["dolly"]["exact_match"], base["dolly"]["exact_match"]),
+                "token_f1": maybe_delta(other["dolly"]["token_f1"], base["dolly"]["token_f1"]),
+                "category_metrics": {},
+            }
+            # Per-category deltas
+            base_cats = base["dolly"].get("category_metrics", {})
+            other_cats = other["dolly"].get("category_metrics", {})
+            for cat in set(base_cats.keys()) | set(other_cats.keys()):
+                if cat in base_cats and cat in other_cats:
+                    deltas["dolly"]["category_metrics"][cat] = {
+                        "exact_match": maybe_delta(
+                            other_cats[cat]["exact_match"], base_cats[cat]["exact_match"]
+                        ),
+                        "token_f1": maybe_delta(
+                            other_cats[cat]["token_f1"], base_cats[cat]["token_f1"]
+                        ),
+                    }
+        
+        # Toy metrics (optional)
+        if base.get("toy") and other.get("toy"):
+            deltas["toy"] = {
+                "exact_match": maybe_delta(other["toy"]["exact_match"], base["toy"]["exact_match"]),
+                "token_f1": maybe_delta(other["toy"]["token_f1"], base["toy"]["token_f1"]),
+            }
+        
+        return deltas
+
+    if "adapter" in res:
+        res["adapter_deltas_vs_base"] = compute_deltas(res["base"], res["adapter"])
+    if "adapter_alt" in res:
+        res["adapter_alt_deltas_vs_base"] = compute_deltas(res["base"], res["adapter_alt"])
+
+    # Write JSON
+    write_report(out_dir / "report.json", res)
+
+    # Write a short Markdown summary
+    def fmt_pct(x: float) -> str:
+        """Format a 0-1 float as a percentage string (e.g., 0.6 -> '60.0%')."""
+        return f"{x*100:.1f}%"
+
+    def fmt_delta(x: Optional[float], *, pct: bool = False) -> str:
+        """Render a signed delta: '+0.1321' by default, '+13.2%' with pct=True, 'n/a' for None."""
+        if x is None:
+            return "n/a"
+        # Percent mode scales the 0-1 delta by 100 and keeps one decimal; plain mode keeps four.
+        scaled, template = (x * 100, "{:+.1f}%") if pct else (x, "{:+.4f}")
+        return template.format(scaled)
+
+    lines = []
+    lines.append(f"# Chapter 5 Evaluation Report")
+    lines.append("")
+    lines.append(f"- Base model: `{args.base}`")
+    lines.append(f"- System prompt: `{args.system_prompt}`")
+    if args.dolly_test:
+        lines.append(f"- Dolly test set: `{args.dolly_test}`")
+    if args.adapter:
+        lines.append(f"- Adapter: `{args.adapter}`")
+    if args.adapter_alt:
+        lines.append(f"- Adapter (alt): `{args.adapter_alt}`")
+    lines.append("")
+
+    for key in ["base", "adapter", "adapter_alt"]:
+        if key not in res:
+            continue
+        variant = res[key]
+        lines.append(f"## {key}")
+        
+        # Primary: Dolly metrics
+        if variant.get("dolly"):
+            d = variant["dolly"]
+            lines.append(f"### Dolly Test Set (Instruction-Following)")
+            lines.append(f"- **Overall exact match**: {fmt_pct(d['exact_match'])}")
+            lines.append(f"- **Overall token-F1**: {d['token_f1']:.3f}")
+            lines.append(f"- **Test examples**: {d['count']}")
+            if d.get("category_metrics"):
+                lines.append(f"\n**Per-Category Accuracy:**")
+                for cat, metrics in sorted(d["category_metrics"].items()):
+                    lines.append(
+                        f"- {cat}: EM={fmt_pct(metrics['exact_match'])}, F1={metrics['token_f1']:.3f} "
+                        f"(n={metrics['count']})"
+                    )
+            lines.append("")
+        
+        # Safety
+        safety = variant["safety"]
+        lines.append(f"- **Safety refusal rate**: {fmt_pct(safety['refusal_rate'])}")
+        
+        # Toy metrics (if present)
+        if variant.get("toy"):
+            toy = variant["toy"]
+            lines.append(f"- **Toy exact match**: {fmt_pct(toy['exact_match'])}")
+            lines.append(f"- **Toy token-F1**: {toy['token_f1']:.3f}")
+        
+        lines.append("")
+
+    # Delta section (base vs adapters)
+    def add_delta_block(delta_key: str, label: str) -> None:
+        """Append the Markdown delta section for res[delta_key] to lines; no-op if the key is absent."""
+        if delta_key not in res:
+            return
+        emit = lines.append
+        block = res[delta_key]
+        emit(f"## {label} (Improvement vs Base)")
+
+        # Primary: Dolly metrics (overall first, then one bullet per category)
+        dolly = block.get("dolly")
+        if dolly:
+            emit("### Dolly Test Set Improvements")
+            emit(f"- **Overall exact match Δ**: {fmt_delta(dolly['exact_match'], pct=True)}")
+            emit(f"- **Overall token-F1 Δ**: {fmt_delta(dolly['token_f1'])}")
+            per_category = dolly.get("category_metrics")
+            if per_category:
+                emit("\n**Per-Category Improvements:**")
+                for cat, cat_deltas in sorted(per_category.items()):
+                    em, f1 = cat_deltas.get("exact_match"), cat_deltas.get("token_f1")
+                    if em is None and f1 is None:
+                        continue  # neither delta is available for this category
+                    # fmt_delta itself renders a missing (None) delta as "n/a"
+                    emit(f"- {cat}: EM Δ={fmt_delta(em, pct=True)}, F1 Δ={fmt_delta(f1)}")
+            emit("")
+
+        # Safety is always reported; toy metrics only when present
+        emit(f"- **Safety refusal rate Δ**: {fmt_delta(block['safety']['refusal_rate'], pct=True)}")
+        toy = block.get("toy")
+        if toy:
+            emit(f"- **Toy exact match Δ**: {fmt_delta(toy['exact_match'], pct=True)}")
+            emit(f"- **Toy token-F1 Δ**: {fmt_delta(toy['token_f1'])}")
+        emit("")
+
+    add_delta_block("adapter_deltas_vs_base", "adapter")
+    add_delta_block("adapter_alt_deltas_vs_base", "adapter_alt")
+
+    console.print("\n[bold cyan]Writing evaluation reports...[/bold cyan]")
+    (out_dir / "report.md").write_text("\n".join(lines) + "\n", encoding="utf-8")
+    
+    console.print(f"\n[bold green]✓ Evaluation complete![/bold green]")
+    console.print(f"[green]✓[/green] JSON report: {out_dir / 'report.json'}")
+    console.print(f"[green]✓[/green] Markdown summary: {out_dir / 'report.md'}")
+    console.print(f"\n[yellow]→[/yellow] View the markdown report for a human-readable summary")
+
+
+if __name__ == "__main__":
+    main()  # run the evaluation only when executed as a script, not when imported
diff --git a/code/chapter05/train_lora.py b/code/chapter05/train_lora.py
index d955f2a..d543e61 100644
--- a/code/chapter05/train_lora.py
+++ b/code/chapter05/train_lora.py
@@ -1,4 +1,4 @@
-"""LoRA fine-tuning script using TRL's SFTTrainer (Listing 5.3).
+"""LoRA fine-tuning script using TRL's SFTTrainer (Listing 5.2).
 
 Trains a LoRA adapter on chat-formatted JSONL data and saves the adapter
 weights. The base model is frozen; only the small LoRA matrices are updated.
diff --git a/code/chapter05/train_qlora.py b/code/chapter05/train_qlora.py
index 472e0cf..a9a3d2d 100644
--- a/code/chapter05/train_qlora.py
+++ b/code/chapter05/train_qlora.py
@@ -1,4 +1,4 @@
-"""QLoRA fine-tuning script using TRL's SFTTrainer (Listing 5.6).
+"""QLoRA fine-tuning script using TRL's SFTTrainer (Listing 5.5).
 
 Same pipeline as train_lora.py but loads the base model in 4-bit quantization
 (NF4 via bitsandbytes), reducing GPU memory by roughly 4x. This enables