From 60d25e98384b68fcad8db2c7db53687b56e40dfa Mon Sep 17 00:00:00 2001 From: Amit Bahree Date: Sun, 7 Jun 2026 14:15:37 -0700 Subject: [PATCH] Refactor Chapter 5 Scripts and Update Dataset Preparation and cleaned up unnecessary code and improved documentation for clarity. --- README.md | 2 +- code/chapter02/quickstart.py | 2 +- code/chapter02/run_chapter5_adapter.py | 2 +- code/chapter04/README.md | 2 +- code/chapter05/README.md | 1132 ++++++++--------- code/chapter05/eval.py | 6 +- .../examples/README_INTERPRETING_RESULTS.md | 2 +- .../example_data_prep_outcome_types.md | 2 +- .../example_qlora_evaluation_output.md | 2 +- code/chapter05/generate.py | 220 ++-- code/chapter05/modeling.py | 376 +++--- .../chapter05/scripts/fix_safety_complete.ps1 | 2 +- code/chapter05/scripts/fix_safety_complete.sh | 2 +- .../scripts/fix_safety_regression.ps1 | 2 +- .../scripts/fix_safety_regression.sh | 2 +- ...aset.py => listing_5_1_prepare_dataset.py} | 422 +++--- ..._4_evaluate.py => listing_5_3_evaluate.py} | 618 ++++----- code/chapter05/train_lora.py | 2 +- code/chapter05/train_qlora.py | 2 +- 19 files changed, 1400 insertions(+), 1400 deletions(-) rename code/chapter05/scripts/{listing_5_2_prepare_dataset.py => listing_5_1_prepare_dataset.py} (96%) rename code/chapter05/scripts/{listing_5_4_evaluate.py => listing_5_3_evaluate.py} (96%) diff --git a/README.md b/README.md index 3ae165a..fbce8cf 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Every chapter ships with runnable code. The hands-on chapters (4 through 9) repr | **[Chapter 1: Why Model Adaptation?](code/chapter01/README.md)** | A reproducibility script for the §1.6 sidebar. Runs the same prompt through base Qwen3-4B, the Chapter 5 LoRA adapter, and the Chapter 6 SFT model side by side; degrades gracefully if the later-chapter artifacts are not yet built. 
| | **[Chapter 2: How Do I Do Model Adaptation?](code/chapter02/README.md)** | A five-step LoRA fine-tuning quickstart on Qwen3-4B-Instruct-2507 using a 40-example Dolly subset (TRL's `SFTTrainer` plus PEFT): dataset prep, LoRA training, generation, and adapter save. Runs in under 10 minutes on a 12 GB GPU, and on Apple Silicon via MPS. | | **[Chapter 3: What Data Do I Need?](code/chapter03/README.md)** | Data-quality experiment that trains the same model on four versions of Financial PhraseBank and compares results on a held-out test set; a six-step synthetic data generation pipeline (load → prompt → generate → quality-gate → distribution-check → mix-and-save) using a frontier teacher; and a standalone `DatasetManifest` module for content hashing, lineage tracking, and retention scheduling. | -| **[Chapter 4: In-Context Learning and Few-Shot Adaptation](code/chapter04/README.md)** | Few-shot ticket classifier, prompt validator with run-to-run variability measurement, minimal RAG pipeline (50 lines), and a Precision@k / Recall@k / Hit@1 retrieval evaluator. CPU-friendly; GPU optional. | +| **[Chapter 4: In-Context Learning, Few-Shot, and RAG](code/chapter04/README.md)** | Few-shot ticket classifier, prompt validator with run-to-run variability measurement, minimal RAG pipeline (50 lines), and a Precision@k / Recall@k / Hit@1 retrieval evaluator. CPU-friendly; GPU optional. | | **[Chapter 5: Parameter-Efficient Fine-Tuning (LoRA and QLoRA)](code/chapter05/README.md)** | LoRA and QLoRA adapters trained on a 400-example Dolly subset of Qwen3-4B-Instruct-2507, evaluated against the base model with per-category Token-F1 and a safety regression suite. | | **[Chapter 6: Supervised Fine-Tuning (SFT)](code/chapter06/README.md)** | A full-parameter SFT of Qwen3-4B-Instruct-2507 on a technical-support Dolly subset, with overfit monitoring, three-way base-vs-LoRA-vs-SFT comparison, behavioral tests, and a separate safety regression suite. 
| | **[Chapter 7: Knowledge Distillation](code/chapter07/README.md)** | Black-box distillation from the chapter 6 SFT teacher into a chapter 5-style LoRA student, with quality filtering, three-way base-vs-teacher-vs-student evaluation, safety robustness check, and an optional OpenRouter-backed SFT-vs-frontier-API comparison. | diff --git a/code/chapter02/quickstart.py b/code/chapter02/quickstart.py index 03bf55d..c7446fd 100644 --- a/code/chapter02/quickstart.py +++ b/code/chapter02/quickstart.py @@ -70,7 +70,7 @@ def step1_prepare_dataset() -> tuple[HFDataset, HFDataset, List[Dict[str, Any]]]: """Step 1: download Dolly 15K and keep 40 train + 5 valid + 3 demo examples. - Same filter and seed as chapter 5's listing_5_2_prepare_dataset.py, just a + Same filter and seed as chapter 5's listing_5_1_prepare_dataset.py, just a smaller slice so the run finishes in minutes. """ print("Step 1: prepare dataset") diff --git a/code/chapter02/run_chapter5_adapter.py b/code/chapter02/run_chapter5_adapter.py index 2ae99e8..62b4e61 100644 --- a/code/chapter02/run_chapter5_adapter.py +++ b/code/chapter02/run_chapter5_adapter.py @@ -111,7 +111,7 @@ def print_no_adapter_instructions(args: argparse.Namespace) -> None: print("Two ways to fix this:") print() print("Option A. 
Train the chapter 5 adapter locally:") - print(" python -m chapter05.scripts.listing_5_2_prepare_dataset \\") + print(" python -m chapter05.scripts.listing_5_1_prepare_dataset \\") print(" --out chapter05/data/dolly_subset --seed 42") print(" python -m chapter05.train_lora \\") print(" --train chapter05/data/dolly_subset/train.jsonl \\") diff --git a/code/chapter04/README.md b/code/chapter04/README.md index b43b04e..401c75a 100644 --- a/code/chapter04/README.md +++ b/code/chapter04/README.md @@ -1,4 +1,4 @@ -# Chapter 4 -- In-Context Learning and Few-Shot Adaptation +# Chapter 4 -- In-Context Learning, Few-Shot, and RAG This chapter covers how to get useful work out of a model without training it: few-shot prompting, many-shot prompting on long-context models, prompt validation against held-out test sets, and a minimal retrieval-augmented generation (RAG) pipeline. The code in this folder backs the four numbered listings in the chapter. diff --git a/code/chapter05/README.md b/code/chapter05/README.md index e9e719f..acdb195 100644 --- a/code/chapter05/README.md +++ b/code/chapter05/README.md @@ -1,566 +1,566 @@ -# Chapter 5 - LoRA and QLoRA Fine-Tuning (Qwen3-4B) - -This chapter demonstrates parameter-efficient fine-tuning using LoRA and QLoRA on **`Qwen/Qwen3-4B-Instruct-2507`**. You'll learn how to fine-tune a model, evaluate improvements, check for safety regression, and use adapters for inference. - -**Repository**: - -### Where is the code? - -All Chapter 5 code is in **this folder** (`code/chapter05/`): - -| Location | What you'll find | -|----------|------------------| -| **`scripts/`** | Scripts you run (prepare dataset, evaluate, validate). | -| **`*.py`** (this folder) | Python package (training, eval, modeling). Run as `python -m chapter05.train_lora` etc. | -| **`data/`** | Data files and golden sets. | - -Shared utilities (JSONL, env, seed) live in **`code/common/`**. Install from `code/` with `pip install -e .`. 
- -**Chapter outline and listing map:** - -| Listing | In the chapter | In the repo | -|---------|----------------|-------------| -| **5.2** | Data format; prepare dataset | `scripts/listing_5_2_prepare_dataset.py` | -| **5.3** | LoRA config + SFTTrainer | `modeling.py`, `train_lora.py` | -| **5.4** | Evaluation | `scripts/listing_5_4_evaluate.py` | -| **5.5** | Inference with adapter | `generate.py` | -| **5.6** | QLoRA 4-bit loading | `train_qlora.py` | -| **5.7** | Safety regression test | `scripts/listing_5_4_evaluate.py` (safety section) | - -**Data folder (`data/`):** Dolly 15K is on Hugging Face (`databricks/databricks-dolly-15k`). Create a local subset with `listing_5_2_prepare_dataset.py --out chapter05/data/dolly_subset`. The repo includes `golden/` (small test files for eval) and `smoke/` (minimal train/valid for `validate_chapter05.py`). - -**What are `data.py` and `dataset.py`?** -- **`data.py`** - Loads chat JSONL (Dolly or messages format) into `ChatExample` objects; used by training and eval to read your train/valid/test files. -- **`dataset.py`** - Turns those examples into the format SFTTrainer needs (`prepare_dataset_for_sft`) or into tokenized batches for loss evaluation (`encode_examples`). Both are core to the chapter flow, not legacy. - ---- - -## What We're Fine-Tuning - -We're fine-tuning Qwen3-4B-Instruct-2507 to improve **instruction-following quality** across diverse tasks. The base model is already instruction-tuned; the chapter demonstrates that even a 400-example LoRA pass produces measurable, category-dependent improvements. - -**What we measure:** -- **Token-F1** (the primary metric for chapters 5 through 8): word-level overlap between the model's response and the reference, scored 0 to 1. -- **Per-category Token-F1**: breakdown across the 8 Dolly categories (open QA, general QA, closed QA, creative writing, brainstorming, classification, summarization, information extraction). 
-- **Safety refusal rate**: fraction of red-team prompts the model declines to answer; watched for regression after fine-tuning. - -**Expected results** (representative measured values on the chapter's 400 / 50 / 50 Dolly split with `seed=42`; your numbers will move within ±0.02 across hardware and library versions): - -- Base Qwen3-4B-Instruct-2507: Token-F1 ≈ 0.212, safety refusal 100%. -- After LoRA (r=16, 3 epochs): Token-F1 ≈ 0.345 (+0.13), safety refusal can drop substantially (-40 to -80 pp in our measurements). -- After QLoRA (r=8, 3 epochs): Token-F1 ≈ 0.39, safety refusal ≈ 40-60%. - -The safety regression on the broader Dolly subset is real and load-bearing for the chapter — it motivates the safety-regression suite that follows the eval and previews the safety conversation in chapter 6 and chapter 8. - -## Why Dolly 15K? - -We use **`databricks/databricks-dolly-15k`** because: - -1. **Narrative continuity.** Chapter 4 uses Dolly 15K for few-shot prompting (no training). Chapter 5 uses the same dataset for LoRA fine-tuning, showing the progression from prompting to training on the same data. Chapter 6 reuses it for full SFT on a technical-support subset. -2. **Real public dataset.** Dolly 15K is widely used and commercially viable (CC-BY-SA-3.0). Human-authored, not synthetic. -3. **Measurable tasks.** Eight distinct categories with enough examples in each to surface per-category effects. -4. **Right size for LoRA.** A 400-example training set is the sweet spot: enough to show improvement, small enough to run end to end in ~10-15 minutes on a single consumer GPU. - -## Prerequisites - -### One-Time Setup (Fresh Machine) - -**First-time setup:** If you haven't set up the book environment yet, follow the detailed instructions in **`code/README.md`** (one directory up). 
This includes: -- Checking Python version (**3.12+ required**) -- Installing system prerequisites (Ubuntu/Debian: `python3-venv`) -- Creating virtual environment -- Installing PyTorch (CPU or CUDA) -- Installing the book package - -Once you've completed the general setup, come back here for Chapter 5-specific steps. - -**Required for Chapter 5's QLoRA branch (Step 5) — install with the QLoRA extra.** The LoRA pass (Steps 1-4) works on the base `pip install -e ".[dev]"` install; QLoRA needs bitsandbytes for 4-bit quantization. From the `code/` directory: - -```bash -pip install -e ".[qlora]" -``` - -QLoRA is optional. If you do not plan to run Step 5, you can skip this extra. - -> **On a Mac?** QLoRA (Step 5) does not run on Apple Silicon: `bitsandbytes` 4-bit kernels are CUDA/ROCm-only, with no Metal/MPS build. Removing `bitsandbytes` would not make QLoRA run on a Mac, it would just remove the 4-bit path that makes it QLoRA. Use the LoRA branch (Steps 1-4), which needs no `bitsandbytes` and trains on MPS. See [ACCELERATORS.md](../../ACCELERATORS.md#why-qlora-needs-an-nvidia-or-amd-gpu) for the full explanation. - -### Verify Your Setup (Recommended) - -Before investing time in full training runs, validate that everything is installed correctly: - -```bash -python chapter05/scripts/validate_chapter05.py -``` - -**What this does:** -1. **Checks** Python version -2. **Verifies** required data files exist (smoke test datasets, safety prompts) -3. **Confirms** PyTorch is installed and detects CUDA availability -4. **Runs** a tiny 2-step LoRA training (smoke test) to ensure the full pipeline works -5. 
**Validates** the adapter was created successfully - -**Why run this?** -- **Catches setup issues early** - Better to find missing dependencies now than 15 minutes into a full training run -- **Tests the complete workflow** - Loads model, tokenizes data, runs training, saves adapter -- **Takes only 2-3 minutes** - Much faster than debugging a failed full training run -- **GPU-aware** - Skips training test if no GPU detected (to avoid slow CPU runs) -- **Chapter-specific** - Each chapter has its own validation script tailored to its requirements (other chapters may have different dependencies or model sizes) - -**Expected output:** -``` -Chapter 5 validation -- Python: 3.12.3 -- Datasets: **OK** -- Torch: 2.10.0+cu126 -- CUDA available: True -- Running tiny LoRA smoke training... - [Progress bars and training logs] -- Smoke training: **OK** (adapter written to chapter05/runs/validate_lora_smoke) -``` - -**If validation fails**, it will show a clear error message indicating what's missing (e.g., "PyTorch not installed" or "Missing required files"). - -### GPU Requirements - -- **LoRA**: minimum **8 GB VRAM** (RTX 3060 / 4060 class). -- **QLoRA**: minimum **6 GB VRAM** (works on smaller GPUs). -- **Recommended**: **12 GB+ VRAM** (RTX 4070 / 4080, NVIDIA A30, A100) for faster training. -- **Training time on a single A30**: ~10-12 minutes for LoRA, ~14 minutes for QLoRA (400 examples, 3 epochs). On smaller GPUs allocate up to 25-35 minutes. 
- -## Step-by-Step Instructions - -**Run all commands below from the `code/` directory with your virtual environment activated.** If you reopened the terminal or reconnected via SSH, activate the venv first (this is a common cause of "No module named 'chapter05'"): - -```bash -cd /path/to/ModelAdaptationBook/code -source .venv/bin/activate # Linux/macOS -# Windows: .venv\Scripts\activate -``` - -### Step 1: Download and Prepare the Dataset - -Download and prepare a subset of Dolly 15K: - -**Linux/macOS:** -```bash -# From the code/ directory (venv active) -python chapter05/scripts/listing_5_2_prepare_dataset.py \ - --out chapter05/data/dolly_subset \ - --seed 42 \ - --train 400 \ - --valid 50 \ - --test 50 -``` - -**Windows (PowerShell/CMD):** -```powershell -python chapter05/scripts/listing_5_2_prepare_dataset.py ^ - --out chapter05/data/dolly_subset ^ - --seed 42 ^ - --train 400 ^ - --valid 50 ^ - --test 50 -``` - -This will: -- Download Dolly 15K from Hugging Face (first run only) -- Filter examples by length (20-2000 characters) -- Create train/valid/test splits with seed=42 for reproducibility -- Convert to messages format compatible with SFTTrainer -- Save to `chapter05/data/dolly_subset/` - -**Expected output:** -``` -Loading Databricks Dolly 15K dataset... -Filtered to ~13880 examples (length 20-2000 chars) -Wrote Dolly 15K subset to: chapter05/data/dolly_subset - - Train: 400 examples - - Valid: 50 examples - - Test: 50 examples - - Categories: {'open_qa': 107, 'general_qa': 69, 'classification': 61, ...} -``` - -Dolly 15K has 8 task categories (`open_qa`, `general_qa`, `closed_qa`, `summarization`, `brainstorming`, `classification`, `information_extraction`, `creative_writing`); with `--seed 42 --train 400` the breakdown above is what you will see. - -**Outcome types in your own data:** Dolly contains no refusals or tone-tagged examples, which are response types you typically add for an internal assistant. 
For worked `messages`-format rows showing a refusal, a clarification, and a tone tag (plus a note on inter-annotator agreement for Q&A), see [examples/example_data_prep_outcome_types.md](examples/example_data_prep_outcome_types.md). - -### Step 2: Train LoRA Adapter - -Train a LoRA adapter using TRL's SFTTrainer: - -**Linux/macOS:** -```bash -python -m chapter05.train_lora \ - --train chapter05/data/dolly_subset/train.jsonl \ - --valid chapter05/data/dolly_subset/valid.jsonl \ - --out chapter05/runs/dolly_lora -``` - -**Windows:** -```powershell -python -m chapter05.train_lora ^ - --train chapter05/data/dolly_subset/train.jsonl ^ - --valid chapter05/data/dolly_subset/valid.jsonl ^ - --out chapter05/runs/dolly_lora -``` - -**What happens:** -- Loads base model (Qwen3-4B) -- Creates LoRA config (r=16, alpha=32) -- Trains for **3 epochs** (**15-20 minutes** on RTX 4070) -- Saves adapter to `chapter05/runs/dolly_lora/` - -**Expected output:** -``` -Saved LoRA adapter to: **chapter05/runs/dolly_lora** -``` - -### Step 3: Evaluate LoRA vs Base Model - -Compare the fine-tuned model to the base model: - -**Linux/macOS:** -```bash -python chapter05/scripts/listing_5_4_evaluate.py \ - --base Qwen/Qwen3-4B-Instruct-2507 \ - --adapter chapter05/runs/dolly_lora \ - --dolly_test chapter05/data/dolly_subset/test.jsonl -``` - -**Windows:** -```powershell -python chapter05/scripts/listing_5_4_evaluate.py ^ - --base Qwen/Qwen3-4B-Instruct-2507 ^ - --adapter chapter05/runs/dolly_lora ^ - --dolly_test chapter05/data/dolly_subset/test.jsonl -``` - -**This generates:** -- `chapter05/runs/eval_report/report.json` - Detailed metrics -- `chapter05/runs/eval_report/report.md` - **Human-readable summary** - -**What you'll see:** -- Overall accuracy improvement (e.g., 70% → 85%) -- Per-category improvements (which task types improved most) -- **Safety regression check** (ensures fine-tuning didn't break safety) - -### Step 4: Run Inference with the Adapter - -Generate text with the fine-tuned 
adapter. **Ensure you are in `code/` with the venv activated** (easy to forget after a new shell or SSH session): - -**Linux/macOS:** -```bash -cd /path/to/ModelAdaptationBook/code -source .venv/bin/activate -python -m chapter05.generate \ - --base Qwen/Qwen3-4B-Instruct-2507 \ - --adapter chapter05/runs/dolly_lora \ - --prompt "Explain how photosynthesis works in simple terms." -``` - -**Windows:** -```powershell -cd C:\path\to\ModelAdaptationBook\code -.venv\Scripts\activate -python -m chapter05.generate ^ - --base Qwen/Qwen3-4B-Instruct-2507 ^ - --adapter chapter05/runs/dolly_lora ^ - --prompt "Explain how photosynthesis works in simple terms." -``` - -**Side-by-side example:** A full example with the same prompt run on the base model and on the base + adapter (commands, outputs, and what to notice) is in [examples/example_inference_base_vs_adapter.md](examples/example_inference_base_vs_adapter.md). A screenshot of the terminal output is in [images/chap5-inference_base_vs_adapter.png](images/chap5-inference_base_vs_adapter.png)—useful for comparing base vs adapter at a glance. - -### Step 5: QLoRA (optional step) - -QLoRA uses 4-bit quantization, enabling training on smaller GPUs. (You already installed the `qlora` extra in the Chapter 5 prerequisites.) 
- -**Linux/macOS:** -```bash -python -m chapter05.train_qlora \ - --train chapter05/data/dolly_subset/train.jsonl \ - --valid chapter05/data/dolly_subset/valid.jsonl \ - --out chapter05/runs/dolly_qlora -``` - -**Windows:** -```powershell -python -m chapter05.train_qlora ^ - --train chapter05/data/dolly_subset/train.jsonl ^ - --valid chapter05/data/dolly_subset/valid.jsonl ^ - --out chapter05/runs/dolly_qlora -``` - -**Differences from LoRA:** -- Uses 4-bit quantization (bitsandbytes) -- Lower default rank (r=8 vs r=16) -- Slightly longer training time (25-35 minutes) -- Similar or slightly lower accuracy (~1-2% difference) - -**Expected output:** Training logs show loss, learning rate, and mean token accuracy per step; at the end you'll see `Saved QLoRA adapter to: chapter05/runs/dolly_qlora`. For a full example log and an explanation of each line (including the tokenizer PAD message and HF warning), see [examples/example_qlora_training_output.md](examples/example_qlora_training_output.md). - -To compare LoRA vs QLoRA after training both: - -**Linux/macOS:** -```bash -python chapter05/scripts/listing_5_4_evaluate.py \ - --base Qwen/Qwen3-4B-Instruct-2507 \ - --adapter chapter05/runs/dolly_lora \ - --adapter_alt chapter05/runs/dolly_qlora \ - --dolly_test chapter05/data/dolly_subset/test.jsonl -``` - -**Windows:** -```powershell -python chapter05/scripts/listing_5_4_evaluate.py ^ - --base Qwen/Qwen3-4B-Instruct-2507 ^ - --adapter chapter05/runs/dolly_lora ^ - --adapter_alt chapter05/runs/dolly_qlora ^ - --dolly_test chapter05/data/dolly_subset/test.jsonl -``` - -**Expected output:** Steps 1–4 run for the base and LoRA adapter; then the script loads and evaluates the alternative adapter (QLoRA) and writes one report comparing all three. For a full example log and explanation of each step, see [examples/example_qlora_evaluation_output.md](examples/example_qlora_evaluation_output.md). - -**What you'll see:** -``` -Step 1/4: Loading base model... 
-**[OK]** Base model loaded - -Step 2/4: Evaluating base model... -Evaluating examples... ━━━━━━━━━━━━━━ 50/50 -Running safety checks... ━━━━━━━━━━━━ 10/10 -**[OK]** Base evaluation complete - -Step 3/4: Loading adapter from chapter05/runs/dolly_lora... -**[OK]** Adapter loaded - -Step 4/4: Evaluating fine-tuned model... -Evaluating examples... ━━━━━━━━━━━━━━ 50/50 -Running safety checks... ━━━━━━━━━━━━ 10/10 -**[OK]** Fine-tuned evaluation complete - -**[OK] Evaluation complete!** -**[OK]** JSON report: chapter05/runs/eval_report/report.json -**[OK]** Markdown summary: chapter05/runs/eval_report/report.md -``` - -Evaluation takes **5-10 minutes** total on a single GPU. The progress bars show exactly what's happening at each stage. - -## Understanding the Results - -### Evaluation Metrics - -The evaluation script measures: - -| Metric | Description | -|--------|--------------| -| **Exact Match (EM)** | Percentage of responses that exactly match the reference (after normalization) | -| **Token F1** | Token-level F1 score (measures partial correctness) | - -**Per-category metrics** (accuracy broken down by task type): - -| Category | Description | -|----------|--------------| -| `open_qa` | Open-ended questions | -| `closed_qa` | Factual questions with specific answers | -| `creative_writing` | Creative tasks | -| `brainstorming` | Idea generation | -| `classification` | Categorization tasks | -| `summarization` | Text summarization | -| `information_extraction` | Extracting structured info | - -### Expected Results - -With only 400 training examples, absolute scores are modest. Focus on **deltas** vs the base model. 
- -**Base Qwen3-4B-Instruct-2507** (the floor): -- Overall exact match: 0% -- Overall Token-F1: 0.21 -- Safety refusal rate: 100% (well-aligned base) - -**After LoRA (r=16, 3 epochs, 400 examples)** — representative measured numbers (your run will vary within ±0.02 on F1 across hardware and library versions): -- Overall exact match: 0% -- **Overall Token-F1: ~0.34-0.39** (Δ +0.13 to +0.18) -- **Safety refusal rate: 20-60%** (Δ −40 to −80 pp — see the warning below) -- Per-category: strong gains in classification (+0.48 F1) and summarization (+0.29 F1); modest on open QA (+0.15); small or negative on creative writing and brainstorming. - -**The safety regression is real.** On our validated 2026-05-09 run, the LoRA adapter dropped the safety-refusal rate from 100% to 20% on a 10-prompt red-team set — the adapter answers 8 of 10 prompts the base model correctly refuses. The chapter's safety-regression suite catches this; the fix is to either (a) keep a smaller LoRA rank such as `r=8`, (b) add explicit refusal examples to the training data, or (c) follow with a preference-optimisation pass (chapter 8) to re-instill the alignment. - -**For higher absolute scores:** scale to 1,000-2,000 training examples. Expect Token-F1 in the 0.50-0.70 range and EM in the 15-35% range, at proportionally longer training time. - -**→ See [examples/README_INTERPRETING_RESULTS.md](examples/README_INTERPRETING_RESULTS.md) for detailed guidance on understanding your results.** For a full example of a report comparing base, LoRA, and QLoRA (with section-by-section interpretation), see [examples/example_eval_report_lora_vs_qlora.md](examples/example_eval_report_lora_vs_qlora.md). 
- -**Why We See Improvement:** -- Base model is general-purpose; fine-tuning adapts it to the specific instruction style and task distribution in Dolly -- With small datasets (400 examples), models specialize but may show mixed results across categories -- LoRA learns to better follow the instruction format and response patterns -- 400 examples is enough to show clear improvement without overfitting - -### Safety Regression Check - -The evaluation also runs a safety suite to ensure fine-tuning didn't weaken safety guardrails. You should see: -- **Refusal rate:** Similar or slightly higher than base model -- **If refusal rate drops significantly**, that's a red flag-the adapter may need more safety examples - -## Troubleshooting - -### **"No module named 'chapter05'"** -- **Cause:** The shell is not using the virtual environment, or you're not in the `code/` directory. Common after reopening a terminal or reconnecting via SSH. -- **Fix:** From the repo root, go to `code/`, activate the venv, then run your command: - ```bash - cd /path/to/ModelAdaptationBook/code - source .venv/bin/activate # Linux/macOS - # Windows: .venv\Scripts\activate - python -m chapter05.generate --base Qwen/Qwen3-4B-Instruct-2507 --prompt "Your prompt" - ``` -- If you never created a venv here, follow **Prerequisites** in this README and in `code/README.md`. 
- -### **"CUDA out of memory"** -- Reduce `--batch_size` (default: 1) -- Increase `--grad_accum` to maintain effective batch size -- Use **QLoRA instead of LoRA** (lower memory) - -### **"Dataset not found"** -- **Run `listing_5_2_prepare_dataset.py` first** (Step 1) -- Check that files exist: `chapter05/data/dolly_subset/train.jsonl` - -### "TRL not installed" -- Install: `pip install trl>=0.9.0` -- Or reinstall: `pip install -e "."` (should include trl from pyproject.toml) - -### Training is slow -- Check GPU is being used: `nvidia-smi` should show Python process -- Reduce `--max_length` if using very long sequences -- Use QLoRA for faster training on some GPUs - -## Testing on Another Machine - -On a fresh clone, follow **Prerequisites** (above) then **Step-by-Step Instructions** (Steps 1-3: prepare data, train, evaluate). With the same data and seed (42), results should match within **2-3%** across machines. - -## Advanced: Multi-LoRA - -Train multiple adapters for different purposes: - -```bash -# Train adapter A -python -m chapter05.train_lora --train data_a.jsonl --out runs/adapter_a ... - -# Train adapter B -python -m chapter05.train_lora --train data_b.jsonl --out runs/adapter_b ... - -# Compare at inference (Linux/macOS) -python -m chapter05.multi_lora_demo \ - --adapter_a chapter05/runs/adapter_a \ - --adapter_b chapter05/runs/adapter_b \ - --prompt "Your prompt here" - -# Windows -python -m chapter05.multi_lora_demo ^ - --adapter_a chapter05/runs/adapter_a ^ - --adapter_b chapter05/runs/adapter_b ^ - --prompt "Your prompt here" -``` - -## Publishing Adapters (Optional) - -Publish your adapter to Hugging Face Hub. 
First, authenticate once (the token is cached at `~/.cache/huggingface/token` and reused by future commands): - -```bash -huggingface-cli login -# paste a token with "write" scope from https://huggingface.co/settings/tokens -# answer "n" to the git credentials prompt -``` - -The publish command picks the cached token up automatically; `HF_TOKEN` env var and `--hf_token` flag are also supported. - -**Linux/macOS:** -```bash -python chapter05/scripts/publish_adapter.py \ - --adapter chapter05/runs/dolly_lora \ - --repo_id /qwen3-4b-dolly-lora \ - --private \ - --dataset_manifest chapter05/data/dolly_subset/manifest.json \ - --eval_report chapter05/runs/eval_report/report.json -``` - -**Windows:** -```powershell -python chapter05/scripts/publish_adapter.py ^ - --adapter chapter05/runs/dolly_lora ^ - --repo_id /qwen3-4b-dolly-lora ^ - --private ^ - --dataset_manifest chapter05/data/dolly_subset/manifest.json ^ - --eval_report chapter05/runs/eval_report/report.json -``` - -## See Also - -- [Contoso domain-adaptation example, where an adapter beats prompting (base vs. format-prompt vs. 
LoRA, with sample outputs)](../it_support_qa/README.md) — the section 5.1.8 / figure 5.5 example, full dataset and reproducible run -- [Base vs LoRA vs QLoRA inference output (same prompt)](examples/example_inference_base_vs_adapter.md) -- [QLoRA training log and interpretation](examples/example_qlora_training_output.md) -- [LoRA vs QLoRA evaluation run](examples/example_qlora_evaluation_output.md) -- [Full eval report (base/LoRA/QLoRA) and how to read it](examples/example_eval_report_lora_vs_qlora.md) -- [How to interpret evaluation results](examples/README_INTERPRETING_RESULTS.md) -- [Production deployment patterns](docs/inference_enterprise.md) -- [Manual evaluation guidelines](docs/human_review_checklist.md) - -**Images (`images/`):** Screenshots used in the examples above: `chap5-inference_base_vs_adapter.png`, `chap5-qlora_inference.png`, `chap5-qlora_training.png`, `chap5-qlora_training_gpu.png`, `chap5-qlora_lora_evals.png`. - -## Running Tests - -Chapter 5 includes unit tests for data processing and metrics: - -```bash -# From code/ directory -pytest chapter05/tests/ - -# Run specific test file -pytest chapter05/tests/test_metrics.py -pytest chapter05/tests/test_data_normalization.py -``` - -**What the tests cover:** -- `test_metrics.py` - Tests for exact match and token F1 metrics -- `test_data_normalization.py` - Tests for data format conversions - -To install test dependencies: -```bash -pip install -e ".[dev]" # Includes pytest, ruff -``` - -## Troubleshooting - -### "The tokenizer has new PAD/BOS/EOS tokens" Warning - -During training (Step 2), you may see: -``` -The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. -The model config and generation config were aligned accordingly, being updated with the tokenizer's values. -Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}. 
-``` - -**This is expected and harmless.** Here's why: - -- Qwen models don't ship with a dedicated PAD token -- Our code sets `pad_token = eos_token` (standard practice for Qwen) -- TRL's SFTTrainer detects this and updates the model config to match -- Training proceeds normally and produces valid adapters - -**No action needed.** Your model will train and generate text correctly. - -**Technical note:** Using EOS as PAD is the standard approach for Qwen models. The base model is already instruction-tuned and knows when to stop generating, so this doesn't affect generation quality in practice. - -## W&B (Optional, Non-Fatal) - -Enable experiment tracking: - -```bash -pip install -e ".[wandb]" -setx BOOKCODE_REPORT_TO wandb # Windows -export BOOKCODE_REPORT_TO=wandb # macOS/Linux -``` - -Disable if not needed: -```bash -setx WANDB_DISABLED true # Windows -export WANDB_DISABLED=true # macOS/Linux -``` +# Chapter 5 - LoRA and QLoRA Fine-Tuning (Qwen3-4B) + +This chapter demonstrates parameter-efficient fine-tuning using LoRA and QLoRA on **`Qwen/Qwen3-4B-Instruct-2507`**. You'll learn how to fine-tune a model, evaluate improvements, check for safety regression, and use adapters for inference. + +**Repository**: + +### Where is the code? + +All Chapter 5 code is in **this folder** (`code/chapter05/`): + +| Location | What you'll find | +|----------|------------------| +| **`scripts/`** | Scripts you run (prepare dataset, evaluate, validate). | +| **`*.py`** (this folder) | Python package (training, eval, modeling). Run as `python -m chapter05.train_lora` etc. | +| **`data/`** | Data files and golden sets. | + +Shared utilities (JSONL, env, seed) live in **`code/common/`**. Install from `code/` with `pip install -e .`. 
+ +**Chapter outline and listing map:** + +| Listing | In the chapter | In the repo | +|---------|----------------|-------------| +| **5.1** | Data format; prepare dataset | `scripts/listing_5_1_prepare_dataset.py` | +| **5.2** | LoRA config + SFTTrainer | `modeling.py`, `train_lora.py` | +| **5.3** | Evaluation | `scripts/listing_5_3_evaluate.py` | +| **5.4** | Inference with adapter | `generate.py` | +| **5.5** | QLoRA 4-bit loading | `train_qlora.py` | +| **5.6** | Safety regression test | `scripts/listing_5_3_evaluate.py` (safety section) | + +**Data folder (`data/`):** Dolly 15K is on Hugging Face (`databricks/databricks-dolly-15k`). Create a local subset with `listing_5_1_prepare_dataset.py --out chapter05/data/dolly_subset`. The repo includes `golden/` (small test files for eval) and `smoke/` (minimal train/valid for `validate_chapter05.py`). + +**What are `data.py` and `dataset.py`?** +- **`data.py`** - Loads chat JSONL (Dolly or messages format) into `ChatExample` objects; used by training and eval to read your train/valid/test files. +- **`dataset.py`** - Turns those examples into the format SFTTrainer needs (`prepare_dataset_for_sft`) or into tokenized batches for loss evaluation (`encode_examples`). Both are core to the chapter flow, not legacy. + +--- + +## What We're Fine-Tuning + +We're fine-tuning Qwen3-4B-Instruct-2507 to improve **instruction-following quality** across diverse tasks. The base model is already instruction-tuned; the chapter demonstrates that even a 400-example LoRA pass produces measurable, category-dependent improvements. + +**What we measure:** +- **Token-F1** (the primary metric for chapters 5 through 8): word-level overlap between the model's response and the reference, scored 0 to 1. +- **Per-category Token-F1**: breakdown across the 8 Dolly categories (open QA, general QA, closed QA, creative writing, brainstorming, classification, summarization, information extraction). 
+- **Safety refusal rate**: fraction of red-team prompts the model declines to answer; watched for regression after fine-tuning. + +**Expected results** (representative measured values on the chapter's 400 / 50 / 50 Dolly split with `seed=42`; your numbers will move within ±0.02 across hardware and library versions): + +- Base Qwen3-4B-Instruct-2507: Token-F1 ≈ 0.212, safety refusal 100%. +- After LoRA (r=16, 3 epochs): Token-F1 ≈ 0.345 (+0.13), safety refusal can drop substantially (-40 to -80 pp in our measurements). +- After QLoRA (r=8, 3 epochs): Token-F1 ≈ 0.39, safety refusal ≈ 40-60%. + +The safety regression on the broader Dolly subset is real and load-bearing for the chapter — it motivates the safety-regression suite that follows the eval and previews the safety conversation in chapter 6 and chapter 8. + +## Why Dolly 15K? + +We use **`databricks/databricks-dolly-15k`** because: + +1. **Narrative continuity.** Chapter 4 uses Dolly 15K for few-shot prompting (no training). Chapter 5 uses the same dataset for LoRA fine-tuning, showing the progression from prompting to training on the same data. Chapter 6 reuses it for full SFT on a technical-support subset. +2. **Real public dataset.** Dolly 15K is widely used and commercially viable (CC-BY-SA-3.0). Human-authored, not synthetic. +3. **Measurable tasks.** Eight distinct categories with enough examples in each to surface per-category effects. +4. **Right size for LoRA.** A 400-example training set is the sweet spot: enough to show improvement, small enough to run end to end in ~10-15 minutes on a single consumer GPU. + +## Prerequisites + +### One-Time Setup (Fresh Machine) + +**First-time setup:** If you haven't set up the book environment yet, follow the detailed instructions in **`code/README.md`** (one directory up). 
This includes: +- Checking Python version (**3.12+ required**) +- Installing system prerequisites (Ubuntu/Debian: `python3-venv`) +- Creating virtual environment +- Installing PyTorch (CPU or CUDA) +- Installing the book package + +Once you've completed the general setup, come back here for Chapter 5-specific steps. + +**Required for Chapter 5's QLoRA branch (Step 5) — install with the QLoRA extra.** The LoRA pass (Steps 1-4) works on the base `pip install -e ".[dev]"` install; QLoRA needs bitsandbytes for 4-bit quantization. From the `code/` directory: + +```bash +pip install -e ".[qlora]" +``` + +QLoRA is optional. If you do not plan to run Step 5, you can skip this extra. + +> **On a Mac?** QLoRA (Step 5) does not run on Apple Silicon: `bitsandbytes` 4-bit kernels are CUDA/ROCm-only, with no Metal/MPS build. Removing `bitsandbytes` would not make QLoRA run on a Mac, it would just remove the 4-bit path that makes it QLoRA. Use the LoRA branch (Steps 1-4), which needs no `bitsandbytes` and trains on MPS. See [ACCELERATORS.md](../../ACCELERATORS.md#why-qlora-needs-an-nvidia-or-amd-gpu) for the full explanation. + +### Verify Your Setup (Recommended) + +Before investing time in full training runs, validate that everything is installed correctly: + +```bash +python chapter05/scripts/validate_chapter05.py +``` + +**What this does:** +1. **Checks** Python version +2. **Verifies** required data files exist (smoke test datasets, safety prompts) +3. **Confirms** PyTorch is installed and detects CUDA availability +4. **Runs** a tiny 2-step LoRA training (smoke test) to ensure the full pipeline works +5. 
**Validates** the adapter was created successfully + +**Why run this?** +- **Catches setup issues early** - Better to find missing dependencies now than 15 minutes into a full training run +- **Tests the complete workflow** - Loads model, tokenizes data, runs training, saves adapter +- **Takes only 2-3 minutes** - Much faster than debugging a failed full training run +- **GPU-aware** - Skips training test if no GPU detected (to avoid slow CPU runs) +- **Chapter-specific** - Each chapter has its own validation script tailored to its requirements (other chapters may have different dependencies or model sizes) + +**Expected output:** +``` +Chapter 5 validation +- Python: 3.12.3 +- Datasets: **OK** +- Torch: 2.10.0+cu126 +- CUDA available: True +- Running tiny LoRA smoke training... + [Progress bars and training logs] +- Smoke training: **OK** (adapter written to chapter05/runs/validate_lora_smoke) +``` + +**If validation fails**, it will show a clear error message indicating what's missing (e.g., "PyTorch not installed" or "Missing required files"). + +### GPU Requirements + +- **LoRA**: minimum **8 GB VRAM** (RTX 3060 / 4060 class). +- **QLoRA**: minimum **6 GB VRAM** (works on smaller GPUs). +- **Recommended**: **12 GB+ VRAM** (RTX 4070 / 4080, NVIDIA A30, A100) for faster training. +- **Training time on a single A30**: ~10-12 minutes for LoRA, ~14 minutes for QLoRA (400 examples, 3 epochs). On smaller GPUs allocate up to 25-35 minutes. 
+ +## Step-by-Step Instructions + +**Run all commands below from the `code/` directory with your virtual environment activated.** If you reopened the terminal or reconnected via SSH, activate the venv first (this is a common cause of "No module named 'chapter05'"): + +```bash +cd /path/to/ModelAdaptationBook/code +source .venv/bin/activate # Linux/macOS +# Windows: .venv\Scripts\activate +``` + +### Step 1: Download and Prepare the Dataset + +Download and prepare a subset of Dolly 15K: + +**Linux/macOS:** +```bash +# From the code/ directory (venv active) +python chapter05/scripts/listing_5_1_prepare_dataset.py \ + --out chapter05/data/dolly_subset \ + --seed 42 \ + --train 400 \ + --valid 50 \ + --test 50 +``` + +**Windows (PowerShell/CMD):** +```powershell +python chapter05/scripts/listing_5_1_prepare_dataset.py ^ + --out chapter05/data/dolly_subset ^ + --seed 42 ^ + --train 400 ^ + --valid 50 ^ + --test 50 +``` + +This will: +- Download Dolly 15K from Hugging Face (first run only) +- Filter examples by length (20-2000 characters) +- Create train/valid/test splits with seed=42 for reproducibility +- Convert to messages format compatible with SFTTrainer +- Save to `chapter05/data/dolly_subset/` + +**Expected output:** +``` +Loading Databricks Dolly 15K dataset... +Filtered to ~13880 examples (length 20-2000 chars) +Wrote Dolly 15K subset to: chapter05/data/dolly_subset + - Train: 400 examples + - Valid: 50 examples + - Test: 50 examples + - Categories: {'open_qa': 107, 'general_qa': 69, 'classification': 61, ...} +``` + +Dolly 15K has 8 task categories (`open_qa`, `general_qa`, `closed_qa`, `summarization`, `brainstorming`, `classification`, `information_extraction`, `creative_writing`); with `--seed 42 --train 400` the breakdown above is what you will see. + +**Outcome types in your own data:** Dolly contains no refusals or tone-tagged examples, which are response types you typically add for an internal assistant. 
For worked `messages`-format rows showing a refusal, a clarification, and a tone tag (plus a note on inter-annotator agreement for Q&A), see [examples/example_data_prep_outcome_types.md](examples/example_data_prep_outcome_types.md). + +### Step 2: Train LoRA Adapter + +Train a LoRA adapter using TRL's SFTTrainer: + +**Linux/macOS:** +```bash +python -m chapter05.train_lora \ + --train chapter05/data/dolly_subset/train.jsonl \ + --valid chapter05/data/dolly_subset/valid.jsonl \ + --out chapter05/runs/dolly_lora +``` + +**Windows:** +```powershell +python -m chapter05.train_lora ^ + --train chapter05/data/dolly_subset/train.jsonl ^ + --valid chapter05/data/dolly_subset/valid.jsonl ^ + --out chapter05/runs/dolly_lora +``` + +**What happens:** +- Loads base model (Qwen3-4B) +- Creates LoRA config (r=16, alpha=32) +- Trains for **3 epochs** (**15-20 minutes** on RTX 4070) +- Saves adapter to `chapter05/runs/dolly_lora/` + +**Expected output:** +``` +Saved LoRA adapter to: **chapter05/runs/dolly_lora** +``` + +### Step 3: Evaluate LoRA vs Base Model + +Compare the fine-tuned model to the base model: + +**Linux/macOS:** +```bash +python chapter05/scripts/listing_5_3_evaluate.py \ + --base Qwen/Qwen3-4B-Instruct-2507 \ + --adapter chapter05/runs/dolly_lora \ + --dolly_test chapter05/data/dolly_subset/test.jsonl +``` + +**Windows:** +```powershell +python chapter05/scripts/listing_5_3_evaluate.py ^ + --base Qwen/Qwen3-4B-Instruct-2507 ^ + --adapter chapter05/runs/dolly_lora ^ + --dolly_test chapter05/data/dolly_subset/test.jsonl +``` + +**This generates:** +- `chapter05/runs/eval_report/report.json` - Detailed metrics +- `chapter05/runs/eval_report/report.md` - **Human-readable summary** + +**What you'll see:** +- Overall Token-F1 improvement over the base model (e.g., ≈0.21 → ≈0.34; exact match stays near 0%) +- Per-category improvements (which task types improved most) +- **Safety regression check** (reports whether fine-tuning weakened safety refusals) + +### Step 4: Run Inference with the Adapter + +Generate text with the fine-tuned 
adapter. **Ensure you are in `code/` with the venv activated** (easy to forget after a new shell or SSH session): + +**Linux/macOS:** +```bash +cd /path/to/ModelAdaptationBook/code +source .venv/bin/activate +python -m chapter05.generate \ + --base Qwen/Qwen3-4B-Instruct-2507 \ + --adapter chapter05/runs/dolly_lora \ + --prompt "Explain how photosynthesis works in simple terms." +``` + +**Windows:** +```powershell +cd C:\path\to\ModelAdaptationBook\code +.venv\Scripts\activate +python -m chapter05.generate ^ + --base Qwen/Qwen3-4B-Instruct-2507 ^ + --adapter chapter05/runs/dolly_lora ^ + --prompt "Explain how photosynthesis works in simple terms." +``` + +**Side-by-side example:** A full example with the same prompt run on the base model and on the base + adapter (commands, outputs, and what to notice) is in [examples/example_inference_base_vs_adapter.md](examples/example_inference_base_vs_adapter.md). A screenshot of the terminal output is in [images/chap5-inference_base_vs_adapter.png](images/chap5-inference_base_vs_adapter.png)—useful for comparing base vs adapter at a glance. + +### Step 5: QLoRA (optional step) + +QLoRA uses 4-bit quantization, enabling training on smaller GPUs. (You already installed the `qlora` extra in the Chapter 5 prerequisites.) 
+ +**Linux/macOS:** +```bash +python -m chapter05.train_qlora \ + --train chapter05/data/dolly_subset/train.jsonl \ + --valid chapter05/data/dolly_subset/valid.jsonl \ + --out chapter05/runs/dolly_qlora +``` + +**Windows:** +```powershell +python -m chapter05.train_qlora ^ + --train chapter05/data/dolly_subset/train.jsonl ^ + --valid chapter05/data/dolly_subset/valid.jsonl ^ + --out chapter05/runs/dolly_qlora +``` + +**Differences from LoRA:** +- Uses 4-bit quantization (bitsandbytes) +- Lower default rank (r=8 vs r=16) +- Slightly longer training time (25-35 minutes) +- Similar or slightly lower accuracy (~1-2% difference) + +**Expected output:** Training logs show loss, learning rate, and mean token accuracy per step; at the end you'll see `Saved QLoRA adapter to: chapter05/runs/dolly_qlora`. For a full example log and an explanation of each line (including the tokenizer PAD message and HF warning), see [examples/example_qlora_training_output.md](examples/example_qlora_training_output.md). + +To compare LoRA vs QLoRA after training both: + +**Linux/macOS:** +```bash +python chapter05/scripts/listing_5_3_evaluate.py \ + --base Qwen/Qwen3-4B-Instruct-2507 \ + --adapter chapter05/runs/dolly_lora \ + --adapter_alt chapter05/runs/dolly_qlora \ + --dolly_test chapter05/data/dolly_subset/test.jsonl +``` + +**Windows:** +```powershell +python chapter05/scripts/listing_5_3_evaluate.py ^ + --base Qwen/Qwen3-4B-Instruct-2507 ^ + --adapter chapter05/runs/dolly_lora ^ + --adapter_alt chapter05/runs/dolly_qlora ^ + --dolly_test chapter05/data/dolly_subset/test.jsonl +``` + +**Expected output:** Steps 1–4 run for the base and LoRA adapter; then the script loads and evaluates the alternative adapter (QLoRA) and writes one report comparing all three. For a full example log and explanation of each step, see [examples/example_qlora_evaluation_output.md](examples/example_qlora_evaluation_output.md). + +**What you'll see:** +``` +Step 1/4: Loading base model... 
+**[OK]** Base model loaded + +Step 2/4: Evaluating base model... +Evaluating examples... ━━━━━━━━━━━━━━ 50/50 +Running safety checks... ━━━━━━━━━━━━ 10/10 +**[OK]** Base evaluation complete + +Step 3/4: Loading adapter from chapter05/runs/dolly_lora... +**[OK]** Adapter loaded + +Step 4/4: Evaluating fine-tuned model... +Evaluating examples... ━━━━━━━━━━━━━━ 50/50 +Running safety checks... ━━━━━━━━━━━━ 10/10 +**[OK]** Fine-tuned evaluation complete + +**[OK] Evaluation complete!** +**[OK]** JSON report: chapter05/runs/eval_report/report.json +**[OK]** Markdown summary: chapter05/runs/eval_report/report.md +``` + +Evaluation takes **5-10 minutes** total on a single GPU. The progress bars show exactly what's happening at each stage. + +## Understanding the Results + +### Evaluation Metrics + +The evaluation script measures: + +| Metric | Description | +|--------|--------------| +| **Exact Match (EM)** | Percentage of responses that exactly match the reference (after normalization) | +| **Token F1** | Token-level F1 score (measures partial correctness) | + +**Per-category metrics** (accuracy broken down by task type): + +| Category | Description | +|----------|--------------| +| `open_qa` | Open-ended questions | +| `closed_qa` | Factual questions with specific answers | +| `creative_writing` | Creative tasks | +| `brainstorming` | Idea generation | +| `classification` | Categorization tasks | +| `summarization` | Text summarization | +| `information_extraction` | Extracting structured info | + +### Expected Results + +With only 400 training examples, absolute scores are modest. Focus on **deltas** vs the base model. 
+ +**Base Qwen3-4B-Instruct-2507** (the floor): +- Overall exact match: 0% +- Overall Token-F1: 0.21 +- Safety refusal rate: 100% (well-aligned base) + +**After LoRA (r=16, 3 epochs, 400 examples)** — representative measured numbers (your run will vary within ±0.02 on F1 across hardware and library versions): +- Overall exact match: 0% +- **Overall Token-F1: ~0.34-0.39** (Δ +0.13 to +0.18) +- **Safety refusal rate: 20-60%** (Δ −40 to −80 pp — see the warning below) +- Per-category: strong gains in classification (+0.48 F1) and summarization (+0.29 F1); modest on open QA (+0.15); small or negative on creative writing and brainstorming. + +**The safety regression is real.** On our validated 2026-05-09 run, the LoRA adapter dropped the safety-refusal rate from 100% to 20% on a 10-prompt red-team set — the adapter answers 8 of 10 prompts the base model correctly refuses. The chapter's safety-regression suite catches this; the fix is to either (a) keep a smaller LoRA rank such as `r=8`, (b) add explicit refusal examples to the training data, or (c) follow with a preference-optimisation pass (chapter 8) to re-instill the alignment. + +**For higher absolute scores:** scale to 1,000-2,000 training examples. Expect Token-F1 in the 0.50-0.70 range and EM in the 15-35% range, at proportionally longer training time. + +**→ See [examples/README_INTERPRETING_RESULTS.md](examples/README_INTERPRETING_RESULTS.md) for detailed guidance on understanding your results.** For a full example of a report comparing base, LoRA, and QLoRA (with section-by-section interpretation), see [examples/example_eval_report_lora_vs_qlora.md](examples/example_eval_report_lora_vs_qlora.md). 
+ +**Why We See Improvement:** +- Base model is general-purpose; fine-tuning adapts it to the specific instruction style and task distribution in Dolly +- With small datasets (400 examples), models specialize but may show mixed results across categories +- LoRA learns to better follow the instruction format and response patterns +- 400 examples is enough to show clear improvement without overfitting + +### Safety Regression Check + +The evaluation also runs a safety suite to check whether fine-tuning weakened safety guardrails. What to look for: +- **Refusal rate:** Ideally close to the base model's rate (100% in our runs) +- **If refusal rate drops significantly**, that's a red flag — the adapter may need more safety examples (the Dolly LoRA run in this chapter does regress; see Expected Results above) + +## Troubleshooting + +### **"No module named 'chapter05'"** +- **Cause:** The shell is not using the virtual environment, or you're not in the `code/` directory. Common after reopening a terminal or reconnecting via SSH. +- **Fix:** From the repo root, go to `code/`, activate the venv, then run your command: + ```bash + cd /path/to/ModelAdaptationBook/code + source .venv/bin/activate # Linux/macOS + # Windows: .venv\Scripts\activate + python -m chapter05.generate --base Qwen/Qwen3-4B-Instruct-2507 --prompt "Your prompt" + ``` +- If you never created a venv here, follow **Prerequisites** in this README and in `code/README.md`. 
+ +### **"CUDA out of memory"** +- Reduce `--batch_size` (default: 1) +- Increase `--grad_accum` to maintain effective batch size +- Use **QLoRA instead of LoRA** (lower memory) + +### **"Dataset not found"** +- **Run `listing_5_1_prepare_dataset.py` first** (Step 1) +- Check that files exist: `chapter05/data/dolly_subset/train.jsonl` + +### "TRL not installed" +- Install: `pip install "trl>=0.9.0"` (keep the quotes; an unquoted `>` is a shell redirect) +- Or reinstall: `pip install -e "."` (should include trl from pyproject.toml) + +### Training is slow +- Check GPU is being used: `nvidia-smi` should show Python process +- Reduce `--max_length` if using very long sequences +- Use QLoRA for faster training on some GPUs + +## Testing on Another Machine + +On a fresh clone, follow **Prerequisites** (above) then **Step-by-Step Instructions** (Steps 1-3: prepare data, train, evaluate). With the same data and seed (42), results should match within **2-3%** across machines. + +## Advanced: Multi-LoRA + +Train multiple adapters for different purposes: + +```bash +# Train adapter A +python -m chapter05.train_lora --train data_a.jsonl --out chapter05/runs/adapter_a ... + +# Train adapter B +python -m chapter05.train_lora --train data_b.jsonl --out chapter05/runs/adapter_b ... + +# Compare at inference (Linux/macOS) +python -m chapter05.multi_lora_demo \ + --adapter_a chapter05/runs/adapter_a \ + --adapter_b chapter05/runs/adapter_b \ + --prompt "Your prompt here" + +# Windows +python -m chapter05.multi_lora_demo ^ + --adapter_a chapter05/runs/adapter_a ^ + --adapter_b chapter05/runs/adapter_b ^ + --prompt "Your prompt here" +``` + +## Publishing Adapters (Optional) + +Publish your adapter to Hugging Face Hub. 
First, authenticate once (the token is cached at `~/.cache/huggingface/token` and reused by future commands): + +```bash +huggingface-cli login +# paste a token with "write" scope from https://huggingface.co/settings/tokens +# answer "n" to the git credentials prompt +``` + +The publish command picks the cached token up automatically; `HF_TOKEN` env var and `--hf_token` flag are also supported. Replace `your-username` below with your Hugging Face user or organization name. + +**Linux/macOS:** +```bash +python chapter05/scripts/publish_adapter.py \ + --adapter chapter05/runs/dolly_lora \ + --repo_id your-username/qwen3-4b-dolly-lora \ + --private \ + --dataset_manifest chapter05/data/dolly_subset/manifest.json \ + --eval_report chapter05/runs/eval_report/report.json +``` + +**Windows:** +```powershell +python chapter05/scripts/publish_adapter.py ^ + --adapter chapter05/runs/dolly_lora ^ + --repo_id your-username/qwen3-4b-dolly-lora ^ + --private ^ + --dataset_manifest chapter05/data/dolly_subset/manifest.json ^ + --eval_report chapter05/runs/eval_report/report.json +``` + +## See Also + +- [Contoso domain-adaptation example, where an adapter beats prompting (base vs. format-prompt vs. 
LoRA, with sample outputs)](../it_support_qa/README.md) — the section 5.1.8 / figure 5.5 example, full dataset and reproducible run +- [Base vs LoRA vs QLoRA inference output (same prompt)](examples/example_inference_base_vs_adapter.md) +- [QLoRA training log and interpretation](examples/example_qlora_training_output.md) +- [LoRA vs QLoRA evaluation run](examples/example_qlora_evaluation_output.md) +- [Full eval report (base/LoRA/QLoRA) and how to read it](examples/example_eval_report_lora_vs_qlora.md) +- [How to interpret evaluation results](examples/README_INTERPRETING_RESULTS.md) +- [Production deployment patterns](docs/inference_enterprise.md) +- [Manual evaluation guidelines](docs/human_review_checklist.md) + +**Images (`images/`):** Screenshots used in the examples above: `chap5-inference_base_vs_adapter.png`, `chap5-qlora_inference.png`, `chap5-qlora_training.png`, `chap5-qlora_training_gpu.png`, `chap5-qlora_lora_evals.png`. + +## Running Tests + +Chapter 5 includes unit tests for data processing and metrics: + +```bash +# From code/ directory +pytest chapter05/tests/ + +# Run specific test file +pytest chapter05/tests/test_metrics.py +pytest chapter05/tests/test_data_normalization.py +``` + +**What the tests cover:** +- `test_metrics.py` - Tests for exact match and token F1 metrics +- `test_data_normalization.py` - Tests for data format conversions + +To install test dependencies: +```bash +pip install -e ".[dev]" # Includes pytest, ruff +``` + +## Troubleshooting + +### "The tokenizer has new PAD/BOS/EOS tokens" Warning + +During training (Step 2), you may see: +``` +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. +The model config and generation config were aligned accordingly, being updated with the tokenizer's values. +Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}. 
+``` + +**This is expected and harmless.** Here's why: + +- Qwen models don't ship with a dedicated PAD token +- Our code sets `pad_token = eos_token` (standard practice for Qwen) +- TRL's SFTTrainer detects this and updates the model config to match +- Training proceeds normally and produces valid adapters + +**No action needed.** Your model will train and generate text correctly. + +**Technical note:** Using EOS as PAD is the standard approach for Qwen models. The base model is already instruction-tuned and knows when to stop generating, so this doesn't affect generation quality in practice. + +## W&B (Optional, Non-Fatal) + +Enable experiment tracking: + +```bash +pip install -e ".[wandb]" +setx BOOKCODE_REPORT_TO wandb # Windows +export BOOKCODE_REPORT_TO=wandb # macOS/Linux +``` + +Disable if not needed: +```bash +setx WANDB_DISABLED true # Windows +export WANDB_DISABLED=true # macOS/Linux +``` diff --git a/code/chapter05/eval.py b/code/chapter05/eval.py index 2dd7773..e7b2ba1 100644 --- a/code/chapter05/eval.py +++ b/code/chapter05/eval.py @@ -6,8 +6,8 @@ 3. **Toy golden set** - Simple Q&A pairs to sanity-check model behavior. Also includes loss/perplexity computation on held-out JSONL data, and report -generation (JSON + Markdown). Used by ``scripts/listing_5_4_evaluate.py`` -(Listing 5.4) to compare base model vs. adapter variants. +generation (JSON + Markdown). Used by ``scripts/listing_5_3_evaluate.py`` +(Listing 5.3) to compare base model vs. adapter variants. """ from __future__ import annotations @@ -473,7 +473,7 @@ def write_report(path: str | Path, obj: Dict[str, Any]) -> None: """Write an evaluation results dict as a JSON file. The JSON report is the machine-readable counterpart to the human-readable - Markdown summary generated by ``listing_5_4_evaluate.py``. Both are saved + Markdown summary generated by ``listing_5_3_evaluate.py``. Both are saved to the same output directory (e.g., ``chapter05/runs/eval_report/``). 
Args: diff --git a/code/chapter05/examples/README_INTERPRETING_RESULTS.md b/code/chapter05/examples/README_INTERPRETING_RESULTS.md index 11e71c5..86b9df0 100644 --- a/code/chapter05/examples/README_INTERPRETING_RESULTS.md +++ b/code/chapter05/examples/README_INTERPRETING_RESULTS.md @@ -1,6 +1,6 @@ # Understanding Your Evaluation Results -This guide helps you interpret the evaluation report from `listing_5_4_evaluate.py`. +This guide helps you interpret the evaluation report from `listing_5_3_evaluate.py`. --- diff --git a/code/chapter05/examples/example_data_prep_outcome_types.md b/code/chapter05/examples/example_data_prep_outcome_types.md index 043bf13..ec58546 100644 --- a/code/chapter05/examples/example_data_prep_outcome_types.md +++ b/code/chapter05/examples/example_data_prep_outcome_types.md @@ -3,7 +3,7 @@ These illustrate the response types discussed in the chapter's "Data quality iterations" section, using the Contoso IT-support assistant. Each is a single training row in the same `messages` format produced by -`scripts/listing_5_2_prepare_dataset.py` (see `dolly_to_messages`). +`scripts/listing_5_1_prepare_dataset.py` (see `dolly_to_messages`). 
> **These rows are illustrative.** The Dolly 15K subset used in this chapter > contains no refusals and no tone tags, so these are examples of what you would diff --git a/code/chapter05/examples/example_qlora_evaluation_output.md b/code/chapter05/examples/example_qlora_evaluation_output.md index aba0cf1..40bd14f 100644 --- a/code/chapter05/examples/example_qlora_evaluation_output.md +++ b/code/chapter05/examples/example_qlora_evaluation_output.md @@ -5,7 +5,7 @@ This file captures a typical run of the evaluation script when comparing the **b ## Command ```bash -python chapter05/scripts/listing_5_4_evaluate.py \ +python chapter05/scripts/listing_5_3_evaluate.py \ --base Qwen/Qwen3-4B-Instruct-2507 \ --adapter chapter05/runs/dolly_lora \ --adapter_alt chapter05/runs/dolly_qlora \ diff --git a/code/chapter05/generate.py b/code/chapter05/generate.py index 41809db..a7ed810 100644 --- a/code/chapter05/generate.py +++ b/code/chapter05/generate.py @@ -1,110 +1,110 @@ -"""Inference script: generate text with the base model and an optional LoRA/QLoRA adapter (Listing 5.5). - -Loads the base model, optionally attaches a LoRA or QLoRA adapter, and generates -a response for a single user prompt. Supports adapter merging for deployment. - -Usage (base model only): - python -m chapter05.generate --base Qwen/Qwen3-4B-Instruct-2507 \\ - --prompt "Explain how photosynthesis works in simple terms." - -Usage (with LoRA adapter): - python -m chapter05.generate --base Qwen/Qwen3-4B-Instruct-2507 \\ - --adapter chapter05/runs/dolly_lora \\ - --prompt "Explain how photosynthesis works in simple terms." - -Usage (with QLoRA adapter -- must use --quantized_4bit): - python -m chapter05.generate --base Qwen/Qwen3-4B-Instruct-2507 \\ - --adapter chapter05/runs/dolly_qlora --quantized_4bit \\ - --prompt "Explain how photosynthesis works in simple terms." - -See Chapter 5, Section 5.1 (Step 4) and the README for full details. 
-""" -from __future__ import annotations - -import argparse -from pathlib import Path - -import torch -from peft import PeftModel -from transformers import AutoModelForCausalLM - -from chapter05 import DEFAULT_MODEL_NAME -from chapter05.chat_template import DEFAULT_SYSTEM_PROMPT, build_prompt_text -from chapter05.modeling import load_base_model_lora, load_base_model_qlora, load_tokenizer - - -def parse_args() -> argparse.Namespace: - """Parse command-line arguments for inference. - - Returns: - Namespace with base model, adapter path, prompt, generation settings, - and optional merge/quantization flags. - """ - ap = argparse.ArgumentParser() - ap.add_argument("--base", default=DEFAULT_MODEL_NAME) - ap.add_argument("--adapter", default=None, help="Path to LoRA/QLoRA adapter folder") - ap.add_argument("--prompt", required=True, help="User prompt") - ap.add_argument("--system_prompt", default=DEFAULT_SYSTEM_PROMPT) - ap.add_argument("--max_new_tokens", type=int, default=128) - ap.add_argument("--do_sample", action="store_true") - ap.add_argument("--temperature", type=float, default=0.7) - ap.add_argument("--quantized_4bit", action="store_true", help="Load base in 4-bit (requires bitsandbytes)") - ap.add_argument("--merge_adapter", action="store_true", help="Merge adapter into base before generation") - ap.add_argument("--save_merged", default=None, help="If set, save merged model to this folder") - return ap.parse_args() - - -def main() -> None: - """Load model, optionally attach adapter, and generate a response.""" - args = parse_args() - tokenizer = load_tokenizer(args.base) - - # Use --quantized_4bit when running a QLoRA-trained adapter so the base - # model is loaded in 4-bit (matching the precision used during training). 
- if args.quantized_4bit: - model = load_base_model_qlora(args.base, gradient_checkpointing=False) - else: - model = load_base_model_lora(args.base, gradient_checkpointing=False) - - if args.adapter: - model = PeftModel.from_pretrained(model, args.adapter) - if args.merge_adapter: - # merge_and_unload() permanently folds LoRA weights into the base. - # This loses modularity (can't swap adapters) but can be faster - # for high-throughput serving. See Section 5.11 deployment options. - model = model.merge_and_unload() - if args.save_merged: - Path(args.save_merged).mkdir(parents=True, exist_ok=True) - model.save_pretrained(args.save_merged) - tokenizer.save_pretrained(args.save_merged) - - model.eval() - - messages = [ - {"role": "system", "content": args.system_prompt}, - {"role": "user", "content": args.prompt}, - ] - text = build_prompt_text(tokenizer, messages) - inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False).to(model.device) - - with torch.no_grad(): - out = model.generate( - **inputs, - max_new_tokens=args.max_new_tokens, - do_sample=args.do_sample, - # Pass temperature=None when not sampling to avoid HF warnings - # about unused generation parameters. - temperature=args.temperature if args.do_sample else None, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - ) - - # skip_special_tokens=False to show the full chat template (system/user/assistant - # markers). This is useful for debugging and demonstrating the template structure. - decoded = tokenizer.decode(out[0], skip_special_tokens=False) - print(decoded) - - -if __name__ == "__main__": - main() - +"""Inference script: generate text with the base model and an optional LoRA/QLoRA adapter (Listing 5.4). + +Loads the base model, optionally attaches a LoRA or QLoRA adapter, and generates +a response for a single user prompt. Supports adapter merging for deployment. 
+ +Usage (base model only): + python -m chapter05.generate --base Qwen/Qwen3-4B-Instruct-2507 \\ + --prompt "Explain how photosynthesis works in simple terms." + +Usage (with LoRA adapter): + python -m chapter05.generate --base Qwen/Qwen3-4B-Instruct-2507 \\ + --adapter chapter05/runs/dolly_lora \\ + --prompt "Explain how photosynthesis works in simple terms." + +Usage (with QLoRA adapter -- must use --quantized_4bit): + python -m chapter05.generate --base Qwen/Qwen3-4B-Instruct-2507 \\ + --adapter chapter05/runs/dolly_qlora --quantized_4bit \\ + --prompt "Explain how photosynthesis works in simple terms." + +See Chapter 5, Section 5.1 (Step 4) and the README for full details. +""" +from __future__ import annotations + +import argparse +from pathlib import Path + +import torch +from peft import PeftModel +from transformers import AutoModelForCausalLM + +from chapter05 import DEFAULT_MODEL_NAME +from chapter05.chat_template import DEFAULT_SYSTEM_PROMPT, build_prompt_text +from chapter05.modeling import load_base_model_lora, load_base_model_qlora, load_tokenizer + + +def parse_args() -> argparse.Namespace: + """Parse command-line arguments for inference. + + Returns: + Namespace with base model, adapter path, prompt, generation settings, + and optional merge/quantization flags. 
+ """ + ap = argparse.ArgumentParser() + ap.add_argument("--base", default=DEFAULT_MODEL_NAME) + ap.add_argument("--adapter", default=None, help="Path to LoRA/QLoRA adapter folder") + ap.add_argument("--prompt", required=True, help="User prompt") + ap.add_argument("--system_prompt", default=DEFAULT_SYSTEM_PROMPT) + ap.add_argument("--max_new_tokens", type=int, default=128) + ap.add_argument("--do_sample", action="store_true") + ap.add_argument("--temperature", type=float, default=0.7) + ap.add_argument("--quantized_4bit", action="store_true", help="Load base in 4-bit (requires bitsandbytes)") + ap.add_argument("--merge_adapter", action="store_true", help="Merge adapter into base before generation") + ap.add_argument("--save_merged", default=None, help="If set, save merged model to this folder") + return ap.parse_args() + + +def main() -> None: + """Load model, optionally attach adapter, and generate a response.""" + args = parse_args() + tokenizer = load_tokenizer(args.base) + + # Use --quantized_4bit when running a QLoRA-trained adapter so the base + # model is loaded in 4-bit (matching the precision used during training). + if args.quantized_4bit: + model = load_base_model_qlora(args.base, gradient_checkpointing=False) + else: + model = load_base_model_lora(args.base, gradient_checkpointing=False) + + if args.adapter: + model = PeftModel.from_pretrained(model, args.adapter) + if args.merge_adapter: + # merge_and_unload() permanently folds LoRA weights into the base. + # This loses modularity (can't swap adapters) but can be faster + # for high-throughput serving. See Section 5.11 deployment options. 
+ model = model.merge_and_unload() + if args.save_merged: + Path(args.save_merged).mkdir(parents=True, exist_ok=True) + model.save_pretrained(args.save_merged) + tokenizer.save_pretrained(args.save_merged) + + model.eval() + + messages = [ + {"role": "system", "content": args.system_prompt}, + {"role": "user", "content": args.prompt}, + ] + text = build_prompt_text(tokenizer, messages) + inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False).to(model.device) + + with torch.no_grad(): + out = model.generate( + **inputs, + max_new_tokens=args.max_new_tokens, + do_sample=args.do_sample, + # Pass temperature=None when not sampling to avoid HF warnings + # about unused generation parameters. + temperature=args.temperature if args.do_sample else None, + pad_token_id=tokenizer.pad_token_id, + eos_token_id=tokenizer.eos_token_id, + ) + + # skip_special_tokens=False to show the full chat template (system/user/assistant + # markers). This is useful for debugging and demonstrating the template structure. + decoded = tokenizer.decode(out[0], skip_special_tokens=False) + print(decoded) + + +if __name__ == "__main__": + main() + diff --git a/code/chapter05/modeling.py b/code/chapter05/modeling.py index 243c512..01d4a1f 100644 --- a/code/chapter05/modeling.py +++ b/code/chapter05/modeling.py @@ -1,188 +1,188 @@ -"""Model loading utilities for LoRA and QLoRA fine-tuning (Listing 5.3). - -Provides functions to: - - Load the base model in full precision (for LoRA) or 4-bit quantized (for QLoRA). - - Load and configure the tokenizer with proper padding. - - Create and apply LoRA adapter configurations. - -Used by train_lora.py, train_qlora.py, generate.py, and eval.py. 
-""" -from __future__ import annotations - -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Sequence, Tuple - -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig - -from peft import LoraConfig, prepare_model_for_kbit_training - -from .chat_template import ensure_padding - - -# Standard attention and MLP projection modules in Transformer architectures. -# Adapting all attention projections (q/k/v/o) plus MLP projections (up/gate/down) -# gives the best quality/cost balance. See Section 5.5 for guidance. -DEFAULT_TARGET_MODULES = [ - "q_proj", - "k_proj", - "v_proj", - "o_proj", - "up_proj", - "gate_proj", - "down_proj", -] - - -@dataclass(frozen=True) -class LoadedModel: - model: Any - tokenizer: Any - - -def load_tokenizer(model_name: str): - """Load and configure a tokenizer with proper padding for the given model. - - Args: - model_name: HuggingFace model ID or local path. - - Returns: - A configured AutoTokenizer with padding set (see ensure_padding). - """ - tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - ensure_padding(tok) - return tok - - -def load_base_model_lora( - model_name: str, - *, - device_map: str = "auto", - dtype: str | torch.dtype = "auto", - gradient_checkpointing: bool = True, -): - """Load the base model in full precision for LoRA fine-tuning or inference. - - Args: - model_name: HuggingFace model ID or local path. - device_map: Device placement strategy. "auto" distributes layers across - available GPUs (or CPU if no GPU), which is the simplest approach - for single-GPU setups. - dtype: Weight dtype. "auto" lets HF choose the best dtype for the hardware. - gradient_checkpointing: If True, trades compute for memory by recomputing - activations during backward. Roughly halves memory at ~20% speed cost. - Disable for inference (no backward pass needed). - - Returns: - A HuggingFace AutoModelForCausalLM ready for LoRA adapter attachment. 
- """ - model = AutoModelForCausalLM.from_pretrained( - model_name, - device_map=device_map, - dtype=dtype, - trust_remote_code=True, - ) - if gradient_checkpointing: - model.gradient_checkpointing_enable() - # KV cache is incompatible with gradient checkpointing during training. - model.config.use_cache = False - return model - - -def load_base_model_qlora( - model_name: str, - *, - device_map: str = "auto", - compute_dtype: torch.dtype = torch.bfloat16, - gradient_checkpointing: bool = True, -): - """Load the base model in 4-bit quantized form for QLoRA fine-tuning. - - Uses bitsandbytes NF4 (NormalFloat4) quantization with double quantization - to compress the base model to ~4 bits per parameter. This reduces GPU memory - by roughly 4x compared to full precision, enabling fine-tuning of larger - models on smaller GPUs. - - Args: - model_name: HuggingFace model ID or local path. - device_map: Device placement strategy (same as load_base_model_lora). - compute_dtype: Dtype for computation during forward/backward passes. - bf16 is preferred for its wider dynamic range. - gradient_checkpointing: If True, enable gradient checkpointing for - additional memory savings. Disable for inference. - - Returns: - A quantized model prepared for k-bit training (gradients enabled on - non-quantized parameters like LayerNorm and LoRA adapters). - """ - bnb_config = BitsAndBytesConfig( - load_in_4bit=True, - # NF4 (NormalFloat4): a 4-bit data type optimized for normally-distributed - # weights, giving higher precision near zero where most weights cluster. - bnb_4bit_quant_type="nf4", - # Double quantization: further compresses the quantization constants - # themselves, saving ~0.4 bits per parameter with minimal quality loss. 
- bnb_4bit_use_double_quant=True, - bnb_4bit_compute_dtype=compute_dtype, - ) - model = AutoModelForCausalLM.from_pretrained( - model_name, - device_map=device_map, - quantization_config=bnb_config, - trust_remote_code=True, - ) - # prepare_model_for_kbit_training enables gradients on non-quantized layers - # (LayerNorm, embeddings) and sets up proper dtype casting for mixed-precision - # training with quantized weights. - model = prepare_model_for_kbit_training(model) - if gradient_checkpointing: - model.gradient_checkpointing_enable() - model.config.use_cache = False - return model - - -def create_lora_config( - *, - r: int, - alpha: int, - dropout: float, - target_modules: Sequence[str] = DEFAULT_TARGET_MODULES, -) -> LoraConfig: - """Create a LoRA configuration for use with SFTTrainer. - - Returns LoraConfig that can be passed directly to SFTTrainer's peft_config parameter. - SFTTrainer will automatically apply the LoRA adapters during training. - """ - return LoraConfig( - r=r, - lora_alpha=alpha, - lora_dropout=dropout, - target_modules=list(target_modules), - bias="none", - task_type="CAUSAL_LM", - ) - - -# Keep apply_lora for backward compatibility (used by eval/inference code) -def apply_lora( - model, - *, - r: int, - alpha: int, - dropout: float, - target_modules: Sequence[str] = DEFAULT_TARGET_MODULES, -): - """Apply LoRA to a model (for backward compatibility with eval/inference code). - - Note: For training, use create_lora_config() and pass to SFTTrainer instead. - """ - from peft import get_peft_model - - cfg = create_lora_config( - r=r, - alpha=alpha, - dropout=dropout, - target_modules=target_modules, - ) - model = get_peft_model(model, cfg) - return model +"""Model loading utilities for LoRA and QLoRA fine-tuning (Listing 5.2). + +Provides functions to: + - Load the base model in full precision (for LoRA) or 4-bit quantized (for QLoRA). + - Load and configure the tokenizer with proper padding. + - Create and apply LoRA adapter configurations. 
+ +Used by train_lora.py, train_qlora.py, generate.py, and eval.py. +""" +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Sequence, Tuple + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + +from peft import LoraConfig, prepare_model_for_kbit_training + +from .chat_template import ensure_padding + + +# Standard attention and MLP projection modules in Transformer architectures. +# Adapting all attention projections (q/k/v/o) plus MLP projections (up/gate/down) +# gives the best quality/cost balance. See Section 5.5 for guidance. +DEFAULT_TARGET_MODULES = [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "up_proj", + "gate_proj", + "down_proj", +] + + +@dataclass(frozen=True) +class LoadedModel: + model: Any + tokenizer: Any + + +def load_tokenizer(model_name: str): + """Load and configure a tokenizer with proper padding for the given model. + + Args: + model_name: HuggingFace model ID or local path. + + Returns: + A configured AutoTokenizer with padding set (see ensure_padding). + """ + tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + ensure_padding(tok) + return tok + + +def load_base_model_lora( + model_name: str, + *, + device_map: str = "auto", + dtype: str | torch.dtype = "auto", + gradient_checkpointing: bool = True, +): + """Load the base model in full precision for LoRA fine-tuning or inference. + + Args: + model_name: HuggingFace model ID or local path. + device_map: Device placement strategy. "auto" distributes layers across + available GPUs (or CPU if no GPU), which is the simplest approach + for single-GPU setups. + dtype: Weight dtype. "auto" lets HF choose the best dtype for the hardware. + gradient_checkpointing: If True, trades compute for memory by recomputing + activations during backward. Roughly halves memory at ~20% speed cost. + Disable for inference (no backward pass needed). 
+ + Returns: + A HuggingFace AutoModelForCausalLM ready for LoRA adapter attachment. + """ + model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map=device_map, + dtype=dtype, + trust_remote_code=True, + ) + if gradient_checkpointing: + model.gradient_checkpointing_enable() + # KV cache is incompatible with gradient checkpointing during training. + model.config.use_cache = False + return model + + +def load_base_model_qlora( + model_name: str, + *, + device_map: str = "auto", + compute_dtype: torch.dtype = torch.bfloat16, + gradient_checkpointing: bool = True, +): + """Load the base model in 4-bit quantized form for QLoRA fine-tuning. + + Uses bitsandbytes NF4 (NormalFloat4) quantization with double quantization + to compress the base model to ~4 bits per parameter. This reduces GPU memory + by roughly 4x compared to full precision, enabling fine-tuning of larger + models on smaller GPUs. + + Args: + model_name: HuggingFace model ID or local path. + device_map: Device placement strategy (same as load_base_model_lora). + compute_dtype: Dtype for computation during forward/backward passes. + bf16 is preferred for its wider dynamic range. + gradient_checkpointing: If True, enable gradient checkpointing for + additional memory savings. Disable for inference. + + Returns: + A quantized model prepared for k-bit training (gradients enabled on + non-quantized parameters like LayerNorm and LoRA adapters). + """ + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + # NF4 (NormalFloat4): a 4-bit data type optimized for normally-distributed + # weights, giving higher precision near zero where most weights cluster. + bnb_4bit_quant_type="nf4", + # Double quantization: further compresses the quantization constants + # themselves, saving ~0.4 bits per parameter with minimal quality loss. 
+ bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=compute_dtype, + ) + model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map=device_map, + quantization_config=bnb_config, + trust_remote_code=True, + ) + # prepare_model_for_kbit_training enables gradients on non-quantized layers + # (LayerNorm, embeddings) and sets up proper dtype casting for mixed-precision + # training with quantized weights. + model = prepare_model_for_kbit_training(model) + if gradient_checkpointing: + model.gradient_checkpointing_enable() + model.config.use_cache = False + return model + + +def create_lora_config( + *, + r: int, + alpha: int, + dropout: float, + target_modules: Sequence[str] = DEFAULT_TARGET_MODULES, +) -> LoraConfig: + """Create a LoRA configuration for use with SFTTrainer. + + Returns LoraConfig that can be passed directly to SFTTrainer's peft_config parameter. + SFTTrainer will automatically apply the LoRA adapters during training. + """ + return LoraConfig( + r=r, + lora_alpha=alpha, + lora_dropout=dropout, + target_modules=list(target_modules), + bias="none", + task_type="CAUSAL_LM", + ) + + +# Keep apply_lora for backward compatibility (used by eval/inference code) +def apply_lora( + model, + *, + r: int, + alpha: int, + dropout: float, + target_modules: Sequence[str] = DEFAULT_TARGET_MODULES, +): + """Apply LoRA to a model (for backward compatibility with eval/inference code). + + Note: For training, use create_lora_config() and pass to SFTTrainer instead. 
+ """ + from peft import get_peft_model + + cfg = create_lora_config( + r=r, + alpha=alpha, + dropout=dropout, + target_modules=target_modules, + ) + model = get_peft_model(model, cfg) + return model diff --git a/code/chapter05/scripts/fix_safety_complete.ps1 b/code/chapter05/scripts/fix_safety_complete.ps1 index 3ea259a..0ef74b3 100644 --- a/code/chapter05/scripts/fix_safety_complete.ps1 +++ b/code/chapter05/scripts/fix_safety_complete.ps1 @@ -80,7 +80,7 @@ Write-Host "Step 4/4: Evaluating and comparing results..." -ForegroundColor Cyan Write-Host "⏱ Time: 5-10 minutes" -ForegroundColor Gray Write-Host "" -python chapter05/scripts/listing_5_4_evaluate.py ` +python chapter05/scripts/listing_5_3_evaluate.py ` --base Qwen/Qwen3-4B-Instruct-2507 ` --adapter chapter05/runs/dolly_lora ` --adapter_alt chapter05/runs/dolly_lora_with_safety ` diff --git a/code/chapter05/scripts/fix_safety_complete.sh b/code/chapter05/scripts/fix_safety_complete.sh index eff0bcd..28620fd 100644 --- a/code/chapter05/scripts/fix_safety_complete.sh +++ b/code/chapter05/scripts/fix_safety_complete.sh @@ -81,7 +81,7 @@ echo "Step 4/4: Evaluating and comparing results..." echo "⏱ Time: 5-10 minutes" echo "" -python chapter05/scripts/listing_5_4_evaluate.py \ +python chapter05/scripts/listing_5_3_evaluate.py \ --base Qwen/Qwen3-4B-Instruct-2507 \ --adapter chapter05/runs/dolly_lora \ --adapter_alt chapter05/runs/dolly_lora_with_safety \ diff --git a/code/chapter05/scripts/fix_safety_regression.ps1 b/code/chapter05/scripts/fix_safety_regression.ps1 index abe2eb8..bef274b 100644 --- a/code/chapter05/scripts/fix_safety_regression.ps1 +++ b/code/chapter05/scripts/fix_safety_regression.ps1 @@ -48,7 +48,7 @@ Write-Host "Step 2/2: Evaluating both adapters (r=16 vs r=8)..." 
-ForegroundColo Write-Host "⏱ Estimated time: 5-10 minutes" -ForegroundColor Gray Write-Host "" -python chapter05/scripts/listing_5_4_evaluate.py ` +python chapter05/scripts/listing_5_3_evaluate.py ` --base Qwen/Qwen3-4B-Instruct-2507 ` --adapter chapter05/runs/dolly_lora ` --adapter_alt chapter05/runs/dolly_lora_r8 ` diff --git a/code/chapter05/scripts/fix_safety_regression.sh b/code/chapter05/scripts/fix_safety_regression.sh index f72f1bc..6f10aa8 100644 --- a/code/chapter05/scripts/fix_safety_regression.sh +++ b/code/chapter05/scripts/fix_safety_regression.sh @@ -49,7 +49,7 @@ echo "Step 2/2: Evaluating both adapters (r=16 vs r=8)..." echo "⏱ Estimated time: 5-10 minutes" echo "" -python chapter05/scripts/listing_5_4_evaluate.py \ +python chapter05/scripts/listing_5_3_evaluate.py \ --base Qwen/Qwen3-4B-Instruct-2507 \ --adapter chapter05/runs/dolly_lora \ --adapter_alt chapter05/runs/dolly_lora_r8 \ diff --git a/code/chapter05/scripts/listing_5_2_prepare_dataset.py b/code/chapter05/scripts/listing_5_1_prepare_dataset.py similarity index 96% rename from code/chapter05/scripts/listing_5_2_prepare_dataset.py rename to code/chapter05/scripts/listing_5_1_prepare_dataset.py index 77f7e16..d9abc49 100644 --- a/code/chapter05/scripts/listing_5_2_prepare_dataset.py +++ b/code/chapter05/scripts/listing_5_1_prepare_dataset.py @@ -1,211 +1,211 @@ -"""Step 1 of the hands-on project: download Dolly 15K and prepare a subset. - -This script: - 1. Downloads the Databricks Dolly 15K dataset from Hugging Face (first run only; - subsequent runs use the cached copy). - 2. Filters examples by length (--min_length / --max_length). - 3. Shuffles with a fixed seed and splits into train/valid/test. - 4. Converts each example to messages format (system, user, assistant) and - writes train.jsonl, valid.jsonl, test.jsonl, and manifest.json to --out. - -Run from the repo root (code/) so that chapter05 and common are importable. 
-Example: - python chapter05/scripts/listing_5_2_prepare_dataset.py \\ - --out chapter05/data/dolly_subset --seed 42 --train 400 --valid 50 --test 50 -""" -from __future__ import annotations - -import argparse -import datetime as dt -from collections import Counter -from pathlib import Path -from typing import Any, Dict, List - -from datasets import load_dataset - -from chapter05.chat_template import DEFAULT_SYSTEM_PROMPT -from common.jsonl import write_jsonl -from common.manifest import write_json - - -def dolly_to_messages( - instruction: str, - context: str | None, - response: str, - *, - system_prompt: str, -) -> Dict[str, Any]: - """Convert Dolly format (instruction, context, response) to messages format. - - Dolly format: - - instruction: The task/question - - context: Optional background information - - response: The answer/output - - We combine instruction + context (if present) into the user message. - """ - # Combine instruction and context for user message - if context and context.strip(): - user_content = f"{context}\n\n{instruction}" - else: - user_content = instruction - - return { - "messages": [ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": user_content}, - {"role": "assistant", "content": response}, - ] - } - - -def parse_args() -> argparse.Namespace: - """Parse command-line arguments for dataset preparation. - - Returns: - Namespace with output path, seed, split sizes, system prompt, and - length filter thresholds. - """ - ap = argparse.ArgumentParser( - description="Prepare a subset of Databricks Dolly 15K for LoRA fine-tuning." 
- ) - ap.add_argument("--out", required=True, help="Output folder (will create train/valid/test.jsonl)") - ap.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility") - ap.add_argument("--train", type=int, default=400, help="Number of training examples") - ap.add_argument("--valid", type=int, default=50, help="Number of validation examples") - ap.add_argument("--test", type=int, default=50, help="Number of test examples") - ap.add_argument( - "--system_prompt", - default=DEFAULT_SYSTEM_PROMPT, - help="System prompt to use for all examples", - ) - ap.add_argument( - "--min_length", - type=int, - default=20, - help="Minimum character length for instruction+response (filter short examples)", - ) - ap.add_argument( - "--max_length", - type=int, - default=2000, - help="Maximum character length for instruction+response (filter very long examples)", - ) - return ap.parse_args() - - -def main() -> None: - """Download Dolly 15K, filter by length, split, convert to messages, and write JSONL.""" - args = parse_args() - out_dir = Path(args.out) - out_dir.mkdir(parents=True, exist_ok=True) - - print("Step 1: Download and prepare dataset") - print(" Downloading Databricks Dolly 15K from Hugging Face (first run may take a minute)...") - ds = load_dataset("databricks/databricks-dolly-15k", split="train") - print(" Loaded. 
Filtering and splitting...") - - # Filter and shuffle - import random - rng = random.Random(args.seed) - - filtered_examples = [] - for example in ds: - instruction = example.get("instruction", "") - context = example.get("context", "") - response = example.get("response", "") - - # Calculate total length (instruction + context + response) - total_length = len(instruction) + len(context or "") + len(response) - - if args.min_length <= total_length <= args.max_length: - filtered_examples.append(example) - - print(f"Filtered to {len(filtered_examples)} examples (length {args.min_length}-{args.max_length} chars)") - - # Shuffle with seed - rng.shuffle(filtered_examples) - - total_needed = args.train + args.valid + args.test - if len(filtered_examples) < total_needed: - raise RuntimeError( - f"Not enough examples after filtering: have {len(filtered_examples)}, need {total_needed}" - ) - - # Split into train/valid/test - train_examples = filtered_examples[: args.train] - valid_examples = filtered_examples[args.train : args.train + args.valid] - test_examples = filtered_examples[args.train + args.valid : args.train + args.valid + args.test] - - # Convert to messages format, preserving category info - train_rows = [] - for ex in train_examples: - msg_row = dolly_to_messages( - ex["instruction"], - ex.get("context"), - ex["response"], - system_prompt=args.system_prompt, - ) - # Preserve category for evaluation - msg_row["category"] = ex.get("category", "unknown") - train_rows.append(msg_row) - - valid_rows = [] - for ex in valid_examples: - msg_row = dolly_to_messages( - ex["instruction"], - ex.get("context"), - ex["response"], - system_prompt=args.system_prompt, - ) - msg_row["category"] = ex.get("category", "unknown") - valid_rows.append(msg_row) - - test_rows = [] - for ex in test_examples: - msg_row = dolly_to_messages( - ex["instruction"], - ex.get("context"), - ex["response"], - system_prompt=args.system_prompt, - ) - msg_row["category"] = ex.get("category", 
"unknown") - test_rows.append(msg_row) - - # Count categories for manifest - train_categories = Counter(ex.get("category", "unknown") for ex in train_examples) - - # Write JSONL files - write_jsonl(out_dir / "train.jsonl", train_rows) - write_jsonl(out_dir / "valid.jsonl", valid_rows) - write_jsonl(out_dir / "test.jsonl", test_rows) - - # Write manifest - manifest = { - "dataset": "databricks/databricks-dolly-15k", - "split": "train", - "created_utc": dt.datetime.now(dt.timezone.utc).isoformat().replace("+00:00", "Z"), - "seed": args.seed, - "filters": { - "min_length": args.min_length, - "max_length": args.max_length, - }, - "counts": { - "train": len(train_rows), - "valid": len(valid_rows), - "test": len(test_rows), - }, - "category_distribution": dict(train_categories), - "system_prompt": args.system_prompt, - } - write_json(out_dir / "manifest.json", manifest) - - print(f"\n✓ Wrote Dolly 15K subset to: {out_dir}") - print(f" - Train: {len(train_rows)} examples") - print(f" - Valid: {len(valid_rows)} examples") - print(f" - Test: {len(test_rows)} examples") - print(f" - Categories: {dict(train_categories)}") - - -if __name__ == "__main__": - main() +"""Step 1 of the hands-on project: download Dolly 15K and prepare a subset. + +This script: + 1. Downloads the Databricks Dolly 15K dataset from Hugging Face (first run only; + subsequent runs use the cached copy). + 2. Filters examples by length (--min_length / --max_length). + 3. Shuffles with a fixed seed and splits into train/valid/test. + 4. Converts each example to messages format (system, user, assistant) and + writes train.jsonl, valid.jsonl, test.jsonl, and manifest.json to --out. + +Run from the repo root (code/) so that chapter05 and common are importable. 
+Example: + python chapter05/scripts/listing_5_1_prepare_dataset.py \\ + --out chapter05/data/dolly_subset --seed 42 --train 400 --valid 50 --test 50 +""" +from __future__ import annotations + +import argparse +import datetime as dt +from collections import Counter +from pathlib import Path +from typing import Any, Dict, List + +from datasets import load_dataset + +from chapter05.chat_template import DEFAULT_SYSTEM_PROMPT +from common.jsonl import write_jsonl +from common.manifest import write_json + + +def dolly_to_messages( + instruction: str, + context: str | None, + response: str, + *, + system_prompt: str, +) -> Dict[str, Any]: + """Convert Dolly format (instruction, context, response) to messages format. + + Dolly format: + - instruction: The task/question + - context: Optional background information + - response: The answer/output + + We combine instruction + context (if present) into the user message. + """ + # Combine instruction and context for user message + if context and context.strip(): + user_content = f"{context}\n\n{instruction}" + else: + user_content = instruction + + return { + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_content}, + {"role": "assistant", "content": response}, + ] + } + + +def parse_args() -> argparse.Namespace: + """Parse command-line arguments for dataset preparation. + + Returns: + Namespace with output path, seed, split sizes, system prompt, and + length filter thresholds. + """ + ap = argparse.ArgumentParser( + description="Prepare a subset of Databricks Dolly 15K for LoRA fine-tuning." 
+ ) + ap.add_argument("--out", required=True, help="Output folder (will create train/valid/test.jsonl)") + ap.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility") + ap.add_argument("--train", type=int, default=400, help="Number of training examples") + ap.add_argument("--valid", type=int, default=50, help="Number of validation examples") + ap.add_argument("--test", type=int, default=50, help="Number of test examples") + ap.add_argument( + "--system_prompt", + default=DEFAULT_SYSTEM_PROMPT, + help="System prompt to use for all examples", + ) + ap.add_argument( + "--min_length", + type=int, + default=20, + help="Minimum character length for instruction+response (filter short examples)", + ) + ap.add_argument( + "--max_length", + type=int, + default=2000, + help="Maximum character length for instruction+response (filter very long examples)", + ) + return ap.parse_args() + + +def main() -> None: + """Download Dolly 15K, filter by length, split, convert to messages, and write JSONL.""" + args = parse_args() + out_dir = Path(args.out) + out_dir.mkdir(parents=True, exist_ok=True) + + print("Step 1: Download and prepare dataset") + print(" Downloading Databricks Dolly 15K from Hugging Face (first run may take a minute)...") + ds = load_dataset("databricks/databricks-dolly-15k", split="train") + print(" Loaded. 
Filtering and splitting...") + + # Filter and shuffle + import random + rng = random.Random(args.seed) + + filtered_examples = [] + for example in ds: + instruction = example.get("instruction", "") + context = example.get("context", "") + response = example.get("response", "") + + # Calculate total length (instruction + context + response) + total_length = len(instruction) + len(context or "") + len(response) + + if args.min_length <= total_length <= args.max_length: + filtered_examples.append(example) + + print(f"Filtered to {len(filtered_examples)} examples (length {args.min_length}-{args.max_length} chars)") + + # Shuffle with seed + rng.shuffle(filtered_examples) + + total_needed = args.train + args.valid + args.test + if len(filtered_examples) < total_needed: + raise RuntimeError( + f"Not enough examples after filtering: have {len(filtered_examples)}, need {total_needed}" + ) + + # Split into train/valid/test + train_examples = filtered_examples[: args.train] + valid_examples = filtered_examples[args.train : args.train + args.valid] + test_examples = filtered_examples[args.train + args.valid : args.train + args.valid + args.test] + + # Convert to messages format, preserving category info + train_rows = [] + for ex in train_examples: + msg_row = dolly_to_messages( + ex["instruction"], + ex.get("context"), + ex["response"], + system_prompt=args.system_prompt, + ) + # Preserve category for evaluation + msg_row["category"] = ex.get("category", "unknown") + train_rows.append(msg_row) + + valid_rows = [] + for ex in valid_examples: + msg_row = dolly_to_messages( + ex["instruction"], + ex.get("context"), + ex["response"], + system_prompt=args.system_prompt, + ) + msg_row["category"] = ex.get("category", "unknown") + valid_rows.append(msg_row) + + test_rows = [] + for ex in test_examples: + msg_row = dolly_to_messages( + ex["instruction"], + ex.get("context"), + ex["response"], + system_prompt=args.system_prompt, + ) + msg_row["category"] = ex.get("category", 
"unknown") + test_rows.append(msg_row) + + # Count categories for manifest + train_categories = Counter(ex.get("category", "unknown") for ex in train_examples) + + # Write JSONL files + write_jsonl(out_dir / "train.jsonl", train_rows) + write_jsonl(out_dir / "valid.jsonl", valid_rows) + write_jsonl(out_dir / "test.jsonl", test_rows) + + # Write manifest + manifest = { + "dataset": "databricks/databricks-dolly-15k", + "split": "train", + "created_utc": dt.datetime.now(dt.timezone.utc).isoformat().replace("+00:00", "Z"), + "seed": args.seed, + "filters": { + "min_length": args.min_length, + "max_length": args.max_length, + }, + "counts": { + "train": len(train_rows), + "valid": len(valid_rows), + "test": len(test_rows), + }, + "category_distribution": dict(train_categories), + "system_prompt": args.system_prompt, + } + write_json(out_dir / "manifest.json", manifest) + + print(f"\n✓ Wrote Dolly 15K subset to: {out_dir}") + print(f" - Train: {len(train_rows)} examples") + print(f" - Valid: {len(valid_rows)} examples") + print(f" - Test: {len(test_rows)} examples") + print(f" - Categories: {dict(train_categories)}") + + +if __name__ == "__main__": + main() diff --git a/code/chapter05/scripts/listing_5_4_evaluate.py b/code/chapter05/scripts/listing_5_3_evaluate.py similarity index 96% rename from code/chapter05/scripts/listing_5_4_evaluate.py rename to code/chapter05/scripts/listing_5_3_evaluate.py index 53f6c31..d6ef8c1 100644 --- a/code/chapter05/scripts/listing_5_4_evaluate.py +++ b/code/chapter05/scripts/listing_5_3_evaluate.py @@ -1,309 +1,309 @@ -"""Evaluation script comparing base model vs. fine-tuned adapter(s) (Listing 5.4). - -Loads the base model, evaluates it, then loads one or two adapters and evaluates -each, computing per-metric deltas. Produces both a machine-readable JSON report -and a human-readable Markdown summary. - -Usage (base vs. 
single adapter): - python chapter05/scripts/listing_5_4_evaluate.py \\ - --base Qwen/Qwen3-4B-Instruct-2507 \\ - --adapter chapter05/runs/dolly_lora \\ - --dolly_test chapter05/data/dolly_subset/test.jsonl - -Usage (base vs. LoRA vs. QLoRA): - python chapter05/scripts/listing_5_4_evaluate.py \\ - --base Qwen/Qwen3-4B-Instruct-2507 \\ - --adapter chapter05/runs/dolly_lora \\ - --adapter_alt chapter05/runs/dolly_qlora \\ - --dolly_test chapter05/data/dolly_subset/test.jsonl - -See Chapter 5, Section 5.1 (Step 3) for walkthrough and expected results. -""" -from __future__ import annotations - -import argparse -from pathlib import Path -from typing import Any, Dict, Optional - -from chapter05.eval import ( - eval_dolly_test_set, - eval_loss_on_jsonl, - eval_toy_golden, - load_model_variant, - safety_suite, - write_report, -) -from chapter05.chat_template import DEFAULT_SYSTEM_PROMPT - - -def parse_args() -> argparse.Namespace: - """Parse command-line arguments for the evaluation script. - - Returns: - Namespace with base model, adapter paths, test data paths, - generation settings, and output directory. 
- """ - ap = argparse.ArgumentParser() - ap.add_argument("--base", default="Qwen/Qwen3-4B-Instruct-2507") - ap.add_argument("--adapter", default=None, help="Adapter folder path for main run") - ap.add_argument("--adapter_alt", default=None, help="Adapter folder path for comparison run") - - ap.add_argument("--dolly_test", default=None, help="Dolly test set JSONL path (primary evaluation)") - ap.add_argument("--toy_golden", default="chapter05/data/golden/toy_test.jsonl", help="Toy test set (optional)") - ap.add_argument("--safety_prompts", default="chapter05/data/golden/safety_regression_prompts.jsonl") - - ap.add_argument( - "--system_prompt", - default=DEFAULT_SYSTEM_PROMPT, - help="System prompt used for safety suite (toy golden uses per-example system prompt).", - ) - - ap.add_argument("--max_new_tokens", type=int, default=128) - ap.add_argument("--max_length", type=int, default=512) - - ap.add_argument("--out", default="chapter05/runs/eval_report", help="Output folder for reports") - return ap.parse_args() - - -def summarize_variant(name: str, model, tokenizer, args: argparse.Namespace) -> Dict[str, Any]: - """Run the full evaluation suite for a single model variant. - - Evaluates on the Dolly test set (instruction-following), the toy golden - set (sanity check), and the safety suite (refusal rate). Returns a dict - of all results for this variant. - - Args: - name: Label for this variant (e.g., "base", "adapter", "adapter_alt"). - model: A HuggingFace causal LM (base or with adapter attached). - tokenizer: Matching tokenizer. - args: Parsed CLI arguments with test data paths and generation settings. - - Returns: - Dict with evaluation results keyed by test type ("dolly", "toy", "safety"). 
- """ - result: Dict[str, Any] = {"name": name} - - # Primary evaluation: Dolly test set - if args.dolly_test and Path(args.dolly_test).exists(): - dolly_result = eval_dolly_test_set( - model, - tokenizer, - test_jsonl=args.dolly_test, - system_prompt=args.system_prompt, - max_new_tokens=256, - ) - result["dolly"] = dolly_result - - # Legacy evaluations (optional) - if Path(args.toy_golden).exists(): - result["toy"] = eval_toy_golden( - model, tokenizer, golden_jsonl=args.toy_golden, max_new_tokens=args.max_new_tokens - ) - - result["safety"] = safety_suite( - model, - tokenizer, - prompts_jsonl=args.safety_prompts, - system_prompt=args.system_prompt, - ) - - return result - - -def main() -> None: - """Evaluate base model and adapter(s), compute deltas, and write reports.""" - from rich.console import Console - console = Console() - - args = parse_args() - out_dir = Path(args.out) - out_dir.mkdir(parents=True, exist_ok=True) - - console.print("\n[bold cyan]Step 1/4:[/bold cyan] Loading base model...") - base_model, base_tok = load_model_variant(base_model=args.base, adapter=None) - console.print("[green]✓[/green] Base model loaded\n") - - console.print("[bold cyan]Step 2/4:[/bold cyan] Evaluating base model...") - base_res = summarize_variant("base", base_model, base_tok, args) - console.print("[green]✓[/green] Base evaluation complete\n") - - res: Dict[str, Any] = {"base": base_res} - - if args.adapter: - console.print(f"[bold cyan]Step 3/4:[/bold cyan] Loading adapter from {args.adapter}...") - m, t = load_model_variant(base_model=args.base, adapter=args.adapter) - console.print("[green]✓[/green] Adapter loaded\n") - - console.print("[bold cyan]Step 4/4:[/bold cyan] Evaluating fine-tuned model...") - res["adapter"] = summarize_variant("adapter", m, t, args) - console.print("[green]✓[/green] Fine-tuned evaluation complete\n") - - if args.adapter_alt: - console.print(f"[bold cyan]Loading alternative adapter from {args.adapter_alt}...[/bold cyan]") - m, t = 
load_model_variant(base_model=args.base, adapter=args.adapter_alt) - console.print("[green]✓[/green] Alternative adapter loaded\n") - - console.print("[bold cyan]Evaluating alternative adapter...[/bold cyan]") - res["adapter_alt"] = summarize_variant("adapter_alt", m, t, args) - console.print("[green]✓[/green] Alternative evaluation complete\n") - - def maybe_delta(a: Optional[float], b: Optional[float]) -> Optional[float]: - """Compute a - b, returning None if either value is missing.""" - if a is None or b is None: - return None - return float(a - b) - - def compute_deltas(base: Dict[str, Any], other: Dict[str, Any]) -> Dict[str, Any]: - """Compute metric deltas between an adapter and the base model.""" - deltas: Dict[str, Any] = { - "safety": { - "refusal_rate": maybe_delta(other["safety"]["refusal_rate"], base["safety"]["refusal_rate"]), - }, - } - - # Dolly metrics (primary) - if base.get("dolly") and other.get("dolly"): - deltas["dolly"] = { - "exact_match": maybe_delta(other["dolly"]["exact_match"], base["dolly"]["exact_match"]), - "token_f1": maybe_delta(other["dolly"]["token_f1"], base["dolly"]["token_f1"]), - "category_metrics": {}, - } - # Per-category deltas - base_cats = base["dolly"].get("category_metrics", {}) - other_cats = other["dolly"].get("category_metrics", {}) - for cat in set(base_cats.keys()) | set(other_cats.keys()): - if cat in base_cats and cat in other_cats: - deltas["dolly"]["category_metrics"][cat] = { - "exact_match": maybe_delta( - other_cats[cat]["exact_match"], base_cats[cat]["exact_match"] - ), - "token_f1": maybe_delta( - other_cats[cat]["token_f1"], base_cats[cat]["token_f1"] - ), - } - - # Toy metrics (optional) - if base.get("toy") and other.get("toy"): - deltas["toy"] = { - "exact_match": maybe_delta(other["toy"]["exact_match"], base["toy"]["exact_match"]), - "token_f1": maybe_delta(other["toy"]["token_f1"], base["toy"]["token_f1"]), - } - - return deltas - - if "adapter" in res: - res["adapter_deltas_vs_base"] = 
compute_deltas(res["base"], res["adapter"]) - if "adapter_alt" in res: - res["adapter_alt_deltas_vs_base"] = compute_deltas(res["base"], res["adapter_alt"]) - - # Write JSON - write_report(out_dir / "report.json", res) - - # Write a short Markdown summary - def fmt_pct(x: float) -> str: - """Format a 0-1 float as a percentage string (e.g., 0.6 -> '60.0%').""" - return f"{x*100:.1f}%" - - def fmt_delta(x: Optional[float], *, pct: bool = False) -> str: - """Format a delta value with +/- sign (e.g., +0.1321 or +13.2%).""" - if x is None: - return "n/a" - if pct: - return f"{x*100:+.1f}%" - return f"{x:+.4f}" - - lines = [] - lines.append(f"# Chapter 5 Evaluation Report") - lines.append("") - lines.append(f"- Base model: `{args.base}`") - lines.append(f"- System prompt: `{args.system_prompt}`") - if args.dolly_test: - lines.append(f"- Dolly test set: `{args.dolly_test}`") - if args.adapter: - lines.append(f"- Adapter: `{args.adapter}`") - if args.adapter_alt: - lines.append(f"- Adapter (alt): `{args.adapter_alt}`") - lines.append("") - - for key in ["base", "adapter", "adapter_alt"]: - if key not in res: - continue - variant = res[key] - lines.append(f"## {key}") - - # Primary: Dolly metrics - if variant.get("dolly"): - d = variant["dolly"] - lines.append(f"### Dolly Test Set (Instruction-Following)") - lines.append(f"- **Overall exact match**: {fmt_pct(d['exact_match'])}") - lines.append(f"- **Overall token-F1**: {d['token_f1']:.3f}") - lines.append(f"- **Test examples**: {d['count']}") - if d.get("category_metrics"): - lines.append(f"\n**Per-Category Accuracy:**") - for cat, metrics in sorted(d["category_metrics"].items()): - lines.append( - f"- {cat}: EM={fmt_pct(metrics['exact_match'])}, F1={metrics['token_f1']:.3f} " - f"(n={metrics['count']})" - ) - lines.append("") - - # Safety - safety = variant["safety"] - lines.append(f"- **Safety refusal rate**: {fmt_pct(safety['refusal_rate'])}") - - # Toy metrics (if present) - if variant.get("toy"): - toy = variant["toy"] 
- lines.append(f"- **Toy exact match**: {fmt_pct(toy['exact_match'])}") - lines.append(f"- **Toy token-F1**: {toy['token_f1']:.3f}") - - lines.append("") - - # Delta section (base vs adapters) - def add_delta_block(delta_key: str, label: str) -> None: - """Append a Markdown section showing metric deltas vs. base.""" - if delta_key not in res: - return - d = res[delta_key] - lines.append(f"## {label} (Improvement vs Base)") - - # Primary: Dolly metrics - if d.get("dolly"): - lines.append(f"### Dolly Test Set Improvements") - lines.append(f"- **Overall exact match Δ**: {fmt_delta(d['dolly']['exact_match'], pct=True)}") - lines.append(f"- **Overall token-F1 Δ**: {fmt_delta(d['dolly']['token_f1'])}") - if d["dolly"].get("category_metrics"): - lines.append(f"\n**Per-Category Improvements:**") - for cat, metrics in sorted(d["dolly"]["category_metrics"].items()): - em_delta = metrics.get("exact_match") - f1_delta = metrics.get("token_f1") - if em_delta is not None or f1_delta is not None: - em_str = fmt_delta(em_delta, pct=True) if em_delta is not None else "n/a" - f1_str = fmt_delta(f1_delta) if f1_delta is not None else "n/a" - lines.append(f"- {cat}: EM Δ={em_str}, F1 Δ={f1_str}") - lines.append("") - - # Safety - lines.append(f"- **Safety refusal rate Δ**: {fmt_delta(d['safety']['refusal_rate'], pct=True)}") - - # Toy metrics - if d.get("toy"): - lines.append(f"- **Toy exact match Δ**: {fmt_delta(d['toy']['exact_match'], pct=True)}") - lines.append(f"- **Toy token-F1 Δ**: {fmt_delta(d['toy']['token_f1'])}") - - lines.append("") - - add_delta_block("adapter_deltas_vs_base", "adapter") - add_delta_block("adapter_alt_deltas_vs_base", "adapter_alt") - - console.print("\n[bold cyan]Writing evaluation reports...[/bold cyan]") - (out_dir / "report.md").write_text("\n".join(lines) + "\n", encoding="utf-8") - - console.print(f"\n[bold green]✓ Evaluation complete![/bold green]") - console.print(f"[green]✓[/green] JSON report: {out_dir / 'report.json'}") - 
console.print(f"[green]✓[/green] Markdown summary: {out_dir / 'report.md'}") - console.print(f"\n[yellow]→[/yellow] View the markdown report for a human-readable summary") - - -if __name__ == "__main__": - main() +"""Evaluation script comparing base model vs. fine-tuned adapter(s) (Listing 5.3). + +Loads the base model, evaluates it, then loads one or two adapters and evaluates +each, computing per-metric deltas. Produces both a machine-readable JSON report +and a human-readable Markdown summary. + +Usage (base vs. single adapter): + python chapter05/scripts/listing_5_3_evaluate.py \\ + --base Qwen/Qwen3-4B-Instruct-2507 \\ + --adapter chapter05/runs/dolly_lora \\ + --dolly_test chapter05/data/dolly_subset/test.jsonl + +Usage (base vs. LoRA vs. QLoRA): + python chapter05/scripts/listing_5_3_evaluate.py \\ + --base Qwen/Qwen3-4B-Instruct-2507 \\ + --adapter chapter05/runs/dolly_lora \\ + --adapter_alt chapter05/runs/dolly_qlora \\ + --dolly_test chapter05/data/dolly_subset/test.jsonl + +See Chapter 5, Section 5.1 (Step 3) for walkthrough and expected results. +""" +from __future__ import annotations + +import argparse +from pathlib import Path +from typing import Any, Dict, Optional + +from chapter05.eval import ( + eval_dolly_test_set, + eval_loss_on_jsonl, + eval_toy_golden, + load_model_variant, + safety_suite, + write_report, +) +from chapter05.chat_template import DEFAULT_SYSTEM_PROMPT + + +def parse_args() -> argparse.Namespace: + """Parse command-line arguments for the evaluation script. + + Returns: + Namespace with base model, adapter paths, test data paths, + generation settings, and output directory. 
+ """ + ap = argparse.ArgumentParser() + ap.add_argument("--base", default="Qwen/Qwen3-4B-Instruct-2507") + ap.add_argument("--adapter", default=None, help="Adapter folder path for main run") + ap.add_argument("--adapter_alt", default=None, help="Adapter folder path for comparison run") + + ap.add_argument("--dolly_test", default=None, help="Dolly test set JSONL path (primary evaluation)") + ap.add_argument("--toy_golden", default="chapter05/data/golden/toy_test.jsonl", help="Toy test set (optional)") + ap.add_argument("--safety_prompts", default="chapter05/data/golden/safety_regression_prompts.jsonl") + + ap.add_argument( + "--system_prompt", + default=DEFAULT_SYSTEM_PROMPT, + help="System prompt used for safety suite (toy golden uses per-example system prompt).", + ) + + ap.add_argument("--max_new_tokens", type=int, default=128) + ap.add_argument("--max_length", type=int, default=512) + + ap.add_argument("--out", default="chapter05/runs/eval_report", help="Output folder for reports") + return ap.parse_args() + + +def summarize_variant(name: str, model, tokenizer, args: argparse.Namespace) -> Dict[str, Any]: + """Run the full evaluation suite for a single model variant. + + Evaluates on the Dolly test set (instruction-following), the toy golden + set (sanity check), and the safety suite (refusal rate). Returns a dict + of all results for this variant. + + Args: + name: Label for this variant (e.g., "base", "adapter", "adapter_alt"). + model: A HuggingFace causal LM (base or with adapter attached). + tokenizer: Matching tokenizer. + args: Parsed CLI arguments with test data paths and generation settings. + + Returns: + Dict with evaluation results keyed by test type ("dolly", "toy", "safety"). 
+ """ + result: Dict[str, Any] = {"name": name} + + # Primary evaluation: Dolly test set + if args.dolly_test and Path(args.dolly_test).exists(): + dolly_result = eval_dolly_test_set( + model, + tokenizer, + test_jsonl=args.dolly_test, + system_prompt=args.system_prompt, + max_new_tokens=256, + ) + result["dolly"] = dolly_result + + # Legacy evaluations (optional) + if Path(args.toy_golden).exists(): + result["toy"] = eval_toy_golden( + model, tokenizer, golden_jsonl=args.toy_golden, max_new_tokens=args.max_new_tokens + ) + + result["safety"] = safety_suite( + model, + tokenizer, + prompts_jsonl=args.safety_prompts, + system_prompt=args.system_prompt, + ) + + return result + + +def main() -> None: + """Evaluate base model and adapter(s), compute deltas, and write reports.""" + from rich.console import Console + console = Console() + + args = parse_args() + out_dir = Path(args.out) + out_dir.mkdir(parents=True, exist_ok=True) + + console.print("\n[bold cyan]Step 1/4:[/bold cyan] Loading base model...") + base_model, base_tok = load_model_variant(base_model=args.base, adapter=None) + console.print("[green]✓[/green] Base model loaded\n") + + console.print("[bold cyan]Step 2/4:[/bold cyan] Evaluating base model...") + base_res = summarize_variant("base", base_model, base_tok, args) + console.print("[green]✓[/green] Base evaluation complete\n") + + res: Dict[str, Any] = {"base": base_res} + + if args.adapter: + console.print(f"[bold cyan]Step 3/4:[/bold cyan] Loading adapter from {args.adapter}...") + m, t = load_model_variant(base_model=args.base, adapter=args.adapter) + console.print("[green]✓[/green] Adapter loaded\n") + + console.print("[bold cyan]Step 4/4:[/bold cyan] Evaluating fine-tuned model...") + res["adapter"] = summarize_variant("adapter", m, t, args) + console.print("[green]✓[/green] Fine-tuned evaluation complete\n") + + if args.adapter_alt: + console.print(f"[bold cyan]Loading alternative adapter from {args.adapter_alt}...[/bold cyan]") + m, t = 
load_model_variant(base_model=args.base, adapter=args.adapter_alt) + console.print("[green]✓[/green] Alternative adapter loaded\n") + + console.print("[bold cyan]Evaluating alternative adapter...[/bold cyan]") + res["adapter_alt"] = summarize_variant("adapter_alt", m, t, args) + console.print("[green]✓[/green] Alternative evaluation complete\n") + + def maybe_delta(a: Optional[float], b: Optional[float]) -> Optional[float]: + """Compute a - b, returning None if either value is missing.""" + if a is None or b is None: + return None + return float(a - b) + + def compute_deltas(base: Dict[str, Any], other: Dict[str, Any]) -> Dict[str, Any]: + """Compute metric deltas between an adapter and the base model.""" + deltas: Dict[str, Any] = { + "safety": { + "refusal_rate": maybe_delta(other["safety"]["refusal_rate"], base["safety"]["refusal_rate"]), + }, + } + + # Dolly metrics (primary) + if base.get("dolly") and other.get("dolly"): + deltas["dolly"] = { + "exact_match": maybe_delta(other["dolly"]["exact_match"], base["dolly"]["exact_match"]), + "token_f1": maybe_delta(other["dolly"]["token_f1"], base["dolly"]["token_f1"]), + "category_metrics": {}, + } + # Per-category deltas + base_cats = base["dolly"].get("category_metrics", {}) + other_cats = other["dolly"].get("category_metrics", {}) + for cat in set(base_cats.keys()) | set(other_cats.keys()): + if cat in base_cats and cat in other_cats: + deltas["dolly"]["category_metrics"][cat] = { + "exact_match": maybe_delta( + other_cats[cat]["exact_match"], base_cats[cat]["exact_match"] + ), + "token_f1": maybe_delta( + other_cats[cat]["token_f1"], base_cats[cat]["token_f1"] + ), + } + + # Toy metrics (optional) + if base.get("toy") and other.get("toy"): + deltas["toy"] = { + "exact_match": maybe_delta(other["toy"]["exact_match"], base["toy"]["exact_match"]), + "token_f1": maybe_delta(other["toy"]["token_f1"], base["toy"]["token_f1"]), + } + + return deltas + + if "adapter" in res: + res["adapter_deltas_vs_base"] = 
compute_deltas(res["base"], res["adapter"]) + if "adapter_alt" in res: + res["adapter_alt_deltas_vs_base"] = compute_deltas(res["base"], res["adapter_alt"]) + + # Write JSON + write_report(out_dir / "report.json", res) + + # Write a short Markdown summary + def fmt_pct(x: float) -> str: + """Format a 0-1 float as a percentage string (e.g., 0.6 -> '60.0%').""" + return f"{x*100:.1f}%" + + def fmt_delta(x: Optional[float], *, pct: bool = False) -> str: + """Format a delta value with +/- sign (e.g., +0.1321 or +13.2%).""" + if x is None: + return "n/a" + if pct: + return f"{x*100:+.1f}%" + return f"{x:+.4f}" + + lines = [] + lines.append(f"# Chapter 5 Evaluation Report") + lines.append("") + lines.append(f"- Base model: `{args.base}`") + lines.append(f"- System prompt: `{args.system_prompt}`") + if args.dolly_test: + lines.append(f"- Dolly test set: `{args.dolly_test}`") + if args.adapter: + lines.append(f"- Adapter: `{args.adapter}`") + if args.adapter_alt: + lines.append(f"- Adapter (alt): `{args.adapter_alt}`") + lines.append("") + + for key in ["base", "adapter", "adapter_alt"]: + if key not in res: + continue + variant = res[key] + lines.append(f"## {key}") + + # Primary: Dolly metrics + if variant.get("dolly"): + d = variant["dolly"] + lines.append(f"### Dolly Test Set (Instruction-Following)") + lines.append(f"- **Overall exact match**: {fmt_pct(d['exact_match'])}") + lines.append(f"- **Overall token-F1**: {d['token_f1']:.3f}") + lines.append(f"- **Test examples**: {d['count']}") + if d.get("category_metrics"): + lines.append(f"\n**Per-Category Accuracy:**") + for cat, metrics in sorted(d["category_metrics"].items()): + lines.append( + f"- {cat}: EM={fmt_pct(metrics['exact_match'])}, F1={metrics['token_f1']:.3f} " + f"(n={metrics['count']})" + ) + lines.append("") + + # Safety + safety = variant["safety"] + lines.append(f"- **Safety refusal rate**: {fmt_pct(safety['refusal_rate'])}") + + # Toy metrics (if present) + if variant.get("toy"): + toy = variant["toy"] 
+ lines.append(f"- **Toy exact match**: {fmt_pct(toy['exact_match'])}") + lines.append(f"- **Toy token-F1**: {toy['token_f1']:.3f}") + + lines.append("") + + # Delta section (base vs adapters) + def add_delta_block(delta_key: str, label: str) -> None: + """Append a Markdown section showing metric deltas vs. base.""" + if delta_key not in res: + return + d = res[delta_key] + lines.append(f"## {label} (Improvement vs Base)") + + # Primary: Dolly metrics + if d.get("dolly"): + lines.append(f"### Dolly Test Set Improvements") + lines.append(f"- **Overall exact match Δ**: {fmt_delta(d['dolly']['exact_match'], pct=True)}") + lines.append(f"- **Overall token-F1 Δ**: {fmt_delta(d['dolly']['token_f1'])}") + if d["dolly"].get("category_metrics"): + lines.append(f"\n**Per-Category Improvements:**") + for cat, metrics in sorted(d["dolly"]["category_metrics"].items()): + em_delta = metrics.get("exact_match") + f1_delta = metrics.get("token_f1") + if em_delta is not None or f1_delta is not None: + em_str = fmt_delta(em_delta, pct=True) if em_delta is not None else "n/a" + f1_str = fmt_delta(f1_delta) if f1_delta is not None else "n/a" + lines.append(f"- {cat}: EM Δ={em_str}, F1 Δ={f1_str}") + lines.append("") + + # Safety + lines.append(f"- **Safety refusal rate Δ**: {fmt_delta(d['safety']['refusal_rate'], pct=True)}") + + # Toy metrics + if d.get("toy"): + lines.append(f"- **Toy exact match Δ**: {fmt_delta(d['toy']['exact_match'], pct=True)}") + lines.append(f"- **Toy token-F1 Δ**: {fmt_delta(d['toy']['token_f1'])}") + + lines.append("") + + add_delta_block("adapter_deltas_vs_base", "adapter") + add_delta_block("adapter_alt_deltas_vs_base", "adapter_alt") + + console.print("\n[bold cyan]Writing evaluation reports...[/bold cyan]") + (out_dir / "report.md").write_text("\n".join(lines) + "\n", encoding="utf-8") + + console.print(f"\n[bold green]✓ Evaluation complete![/bold green]") + console.print(f"[green]✓[/green] JSON report: {out_dir / 'report.json'}") + 
console.print(f"[green]✓[/green] Markdown summary: {out_dir / 'report.md'}") + console.print(f"\n[yellow]→[/yellow] View the markdown report for a human-readable summary") + + +if __name__ == "__main__": + main() diff --git a/code/chapter05/train_lora.py b/code/chapter05/train_lora.py index d955f2a..d543e61 100644 --- a/code/chapter05/train_lora.py +++ b/code/chapter05/train_lora.py @@ -1,4 +1,4 @@ -"""LoRA fine-tuning script using TRL's SFTTrainer (Listing 5.3). +"""LoRA fine-tuning script using TRL's SFTTrainer (Listing 5.2). Trains a LoRA adapter on chat-formatted JSONL data and saves the adapter weights. The base model is frozen; only the small LoRA matrices are updated. diff --git a/code/chapter05/train_qlora.py b/code/chapter05/train_qlora.py index 472e0cf..a9a3d2d 100644 --- a/code/chapter05/train_qlora.py +++ b/code/chapter05/train_qlora.py @@ -1,4 +1,4 @@ -"""QLoRA fine-tuning script using TRL's SFTTrainer (Listing 5.6). +"""QLoRA fine-tuning script using TRL's SFTTrainer (Listing 5.5). Same pipeline as train_lora.py but loads the base model in 4-bit quantization (NF4 via bitsandbytes), reducing GPU memory by roughly 4x. This enables