diff --git a/src/jit/async_compiler.rs b/src/jit/async_compiler.rs
new file mode 100644
index 0000000..98f3172
--- /dev/null
+++ b/src/jit/async_compiler.rs
@@ -0,0 +1,127 @@
+//! Background JIT compilation thread pool. Moves Cranelift work off the
+//! interpreter's hot path so compilation never stalls execution.
+
+use std::collections::HashSet;
+use std::sync::{mpsc, Arc, Mutex};
+use std::thread;
+
+use crate::mips_exec::DecodedInstr;
+use super::cache::{BlockTier, CompiledBlock};
+use super::compiler::BlockCompiler;
+use super::helpers::HelperPtrs;
+
+const NUM_COMPILER_THREADS: usize = 1;
+
+pub enum CompileKind {
+    New,
+    Recompile,
+    ProfileReplay { content_hash: u32 },
+}
+
+pub struct CompileRequest {
+    pub instrs: Vec<(u32, DecodedInstr)>,
+    pub block_pc: u64,
+    pub phys_pc: u64,
+    pub tier: BlockTier,
+    pub kind: CompileKind,
+}
+
+pub struct CompileResult {
+    pub block: CompiledBlock,
+    pub phys_pc: u64,
+    pub virt_pc: u64,
+    pub kind: CompileKind,
+}
+
+pub struct AsyncCompiler {
+    tx: Option<mpsc::Sender<CompileRequest>>,
+    rx: mpsc::Receiver<CompileResult>,
+    handles: Vec<thread::JoinHandle<()>>,
+    pub pending: HashSet<(u64, u64)>,
+}
+
+impl AsyncCompiler {
+    pub fn new(helpers: HelperPtrs, capture_ir: bool) -> Self {
+        let (req_tx, req_rx) = mpsc::channel::<CompileRequest>();
+        let (res_tx, res_rx) = mpsc::sync_channel::<CompileResult>(64);
+        let req_rx = Arc::new(Mutex::new(req_rx));
+
+        let mut handles = Vec::with_capacity(NUM_COMPILER_THREADS);
+        for i in 0..NUM_COMPILER_THREADS {
+            let rx = Arc::clone(&req_rx);
+            let tx = res_tx.clone();
+            let h = helpers.clone();
+            let handle = thread::Builder::new()
+                .name(format!("jit-compiler-{}", i))
+                .spawn(move || {
+                    let mut compiler = BlockCompiler::new(&h);
+                    compiler.capture_ir = capture_ir;
+                    loop {
+                        let req = {
+                            let guard = rx.lock().unwrap();
+                            guard.recv()
+                        };
+                        match req {
+                            Ok(req) => {
+                                if let Some(mut block) = compiler.compile_block(&req.instrs, req.block_pc, req.tier) {
+                                    block.phys_addr = req.phys_pc;
+                                    let _ = tx.send(CompileResult {
+                                        block,
+                                        phys_pc: req.phys_pc,
+                                        virt_pc: req.block_pc,
+                                        kind: req.kind,
+                                    });
+                                }
+                            }
+                            Err(_) => break,
+                        }
+                    }
+                })
+                .expect("failed to spawn JIT compiler thread");
+            handles.push(handle);
+        }
+
+        eprintln!("JIT: {} background compiler threads", NUM_COMPILER_THREADS);
+
+        Self {
+            tx: Some(req_tx),
+            rx: res_rx,
+            handles,
+            pending: HashSet::new(),
+        }
+    }
+
+    pub fn submit(&mut self, req: CompileRequest) {
+        let key = (req.phys_pc, req.block_pc);
+        if self.pending.contains(&key) {
+            return;
+        }
+        self.pending.insert(key);
+        if let Some(tx) = &self.tx {
+            let _ = tx.send(req);
+        }
+    }
+
+    pub fn try_recv(&mut self) -> Option<CompileResult> {
+        match self.rx.try_recv() {
+            Ok(result) => {
+                self.pending.remove(&(result.phys_pc, result.virt_pc));
+                Some(result)
+            }
+            Err(_) => None,
+        }
+    }
+
+    pub fn shutdown(&mut self) {
+        self.tx.take();
+        for handle in self.handles.drain(..) {
+            let _ = handle.join();
+        }
+    }
+}
+
+impl Drop for AsyncCompiler {
+    fn drop(&mut self) {
+        self.shutdown();
+    }
+}
diff --git a/src/jit/cache.rs b/src/jit/cache.rs
index 6388fbb..0e1798e 100644
--- a/src/jit/cache.rs
+++ b/src/jit/cache.rs
@@ -77,6 +77,12 @@ pub struct CompiledBlock {
     /// FNV-1a hash of the raw instruction words; used to detect stale profile
     /// entries when a different DSO is loaded at the same virtual address.
     pub content_hash: u32,
+    /// Block ends with a branch-likely instruction (BEQL/BNEL/etc). Verify mode
+    /// needs this to adjust step count when the branch is not taken (delay slot
+    /// nullified = one fewer interpreter step).
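The request/result channel shape in `AsyncCompiler` above is easiest to see in isolation. Below is a minimal, self-contained sketch of the same pattern, with a toy worker standing in for `BlockCompiler::compile_block` and a `pending` set doing the same in-flight deduplication as `AsyncCompiler::submit`; every name in the sketch is illustrative, not from the tree.

```rust
use std::collections::HashSet;
use std::sync::mpsc;
use std::thread;

struct Request { key: (u64, u64), input: u32 }
struct Reply { key: (u64, u64), output: u32 }

fn main() {
    let (req_tx, req_rx) = mpsc::channel::<Request>();
    // Bounded result channel so a stalled consumer back-pressures the worker.
    let (res_tx, res_rx) = mpsc::sync_channel::<Reply>(64);

    // Worker owns the request receiver; it exits when every sender is dropped.
    let worker = thread::spawn(move || {
        while let Ok(req) = req_rx.recv() {
            let output = req.input.wrapping_mul(3); // stand-in for compile_block()
            let _ = res_tx.send(Reply { key: req.key, output });
        }
    });

    // `pending` deduplicates in-flight requests, like AsyncCompiler::submit.
    let mut pending: HashSet<(u64, u64)> = HashSet::new();
    for i in 0..4u64 {
        let key = (i, 0);
        if pending.insert(key) {
            req_tx.send(Request { key, input: i as u32 }).unwrap();
        }
    }

    // Shutdown mirrors AsyncCompiler::shutdown: drop the sender, then drain.
    drop(req_tx);
    while let Ok(reply) = res_rx.recv() {
        pending.remove(&reply.key);
        println!("{:?} -> {}", reply.key, reply.output);
    }
    worker.join().unwrap();
}
```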
+ pub has_branch_likely: bool, + /// Cranelift CLIF IR captured at compile time (only when IRIS_JIT_VERIFY=1). + pub clif_ir: Option, } // Safety: CompiledBlock is only accessed from the CPU thread. diff --git a/src/jit/compiler.rs b/src/jit/compiler.rs index d0df257..90fd898 100644 --- a/src/jit/compiler.rs +++ b/src/jit/compiler.rs @@ -20,6 +20,7 @@ pub struct BlockCompiler { ctx: Context, builder_ctx: FunctionBuilderContext, func_id_counter: u32, + pub capture_ir: bool, // Declared function IDs for memory helpers (registered as imports) fn_read_u8: FuncId, fn_read_u16: FuncId, @@ -116,6 +117,7 @@ impl BlockCompiler { jit_module, builder_ctx: FunctionBuilderContext::new(), func_id_counter: 0, + capture_ir: false, fn_read_u8, fn_read_u16, fn_read_u32, fn_read_u64, fn_write_u8, fn_write_u16, fn_write_u32, fn_write_u64, fn_interp_step, @@ -207,6 +209,7 @@ impl BlockCompiler { // Emit IR for each instruction let mut compiled_count = 0u32; let mut branch_exit_pc: Option = None; + let mut has_branch_likely = false; let mut idx = 0; while idx < instrs.len() { @@ -265,6 +268,7 @@ impl BlockCompiler { break; } EmitResult::BranchLikely { taken, not_taken, cond } => { + has_branch_likely = true; compiled_count += 1; idx += 1; if idx < instrs.len() { @@ -347,13 +351,20 @@ impl BlockCompiler { builder.ins().return_(&[]); builder.finalize(); + // Capture CLIF IR before define_function consumes it (for verify diagnostics) + let clif_ir = if self.capture_ir { + Some(format!("{}", self.ctx.func.display())) + } else { + None + }; + // Compile to native code self.jit_module.define_function(func_id, &mut self.ctx).unwrap(); + let code_size = self.ctx.compiled_code().unwrap().code_info().total_size; self.jit_module.clear_context(&mut self.ctx); self.jit_module.finalize_definitions().unwrap(); let code_ptr = self.jit_module.get_finalized_function(func_id); - let code_size = 0u32; // JITModule doesn't expose size easily; not critical let content_hash = hash_block_instrs(instrs); @@ -362,20 +373,18 @@ impl BlockCompiler { phys_addr: 0, // filled in by caller virt_addr: block_pc, len_mips: compiled_count, - len_native: code_size, + len_native: code_size as u32, tier, - // Speculative blocks get snapshot/rollback on exception, providing - // self-healing: codegen errors cause exceptions → rollback to correct - // state → demotion after 3 failures → bad block replaced. - // - // Non-speculative is ONLY safe when the block contains stores, because - // rollback can't undo memory writes (RMW double-apply). Load-only blocks - // at any tier should always be speculative for the safety net. + // Speculative blocks get snapshot/rollback on exception. Store- + // containing blocks are non-speculative because the write log + // approach is incompatible with MMIO writes. speculative: !block_has_stores(instrs), hit_count: 0, exception_count: 0, stable_hits: 0, content_hash, + has_branch_likely, + clif_ir, }) } } @@ -388,6 +397,11 @@ fn block_has_stores(instrs: &[(u32, DecodedInstr)]) -> bool { instrs.iter().any(|(_, d)| matches!(d.op as u32, OP_SB | OP_SH | OP_SW | OP_SD)) } +fn block_store_count(instrs: &[(u32, DecodedInstr)]) -> u32 { + use crate::mips_isa::*; + instrs.iter().filter(|(_, d)| matches!(d.op as u32, OP_SB | OP_SH | OP_SW | OP_SD)).count() as u32 +} + /// FNV-1a 32-bit hash of raw instruction words. Used to detect stale profile /// entries: a different DSO loaded at the same virtual address will have the /// same length but different instruction bytes. 
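`hash_block_instrs` itself is outside this hunk; the sketch below is a standard 32-bit FNV-1a over instruction words, assuming little-endian byte order, to illustrate why two equal-length blocks from different DSOs hash differently.

```rust
/// Standard 32-bit FNV-1a over raw MIPS instruction words. The real
/// `hash_block_instrs` is not shown in this diff; byte order here is an
/// assumption made for the example.
fn fnv1a_32_words(words: &[u32]) -> u32 {
    const OFFSET_BASIS: u32 = 0x811c_9dc5;
    const PRIME: u32 = 0x0100_0193;
    let mut h = OFFSET_BASIS;
    for w in words {
        for b in w.to_le_bytes() {
            h ^= b as u32;             // fold the byte in...
            h = h.wrapping_mul(PRIME); // ...then multiply by the FNV prime
        }
    }
    h
}

fn main() {
    // Two blocks of equal length but different code hash differently, which
    // is exactly the stale-profile case content_hash guards against.
    let a = [0x2402_0001u32, 0x0000_000c]; // li v0,1; syscall
    let b = [0x2402_0002u32, 0x0000_000c]; // li v0,2; syscall
    assert_eq!(a.len(), b.len());
    assert_ne!(fnv1a_32_words(&a), fnv1a_32_words(&b));
    println!("{:08x} vs {:08x}", fnv1a_32_words(&a), fnv1a_32_words(&b));
}
```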
@@ -981,6 +995,20 @@ fn flush_modified_gprs( *modified = 0; } +fn reload_all_gprs( + builder: &mut FunctionBuilder, + gpr: &mut [Value; 32], + ctx_ptr: Value, +) { + let mem = MemFlags::new(); + for i in 1..32usize { + gpr[i] = builder.ins().load( + types::I64, mem, ctx_ptr, + ir::immediates::Offset32::new(JitContext::gpr_offset(i)), + ); + } +} + // ─── Load/Store emitters ───────────────────────────────────────────────────── /// Load width tag passed to emit_load so it applies the correct sign extension. @@ -1040,6 +1068,10 @@ fn emit_load( builder.seal_block(ok_block); let val = builder.block_params(ok_block)[0]; + // Reload ALL GPRs from ctx after helper call. This resets SSA live-value + // pressure so regalloc2 never sees accumulated diamonds from multiple helpers. + reload_all_gprs(builder, gpr, ctx_ptr); + // Apply correct sign/zero extension based on load width gpr[rt] = match (width, sign_extend) { (LoadWidth::Byte, true) => { @@ -1071,7 +1103,7 @@ fn emit_store( builder: &mut FunctionBuilder, ctx_ptr: Value, exec_ptr: Value, helper: FuncRef, - gpr: &[Value; 32], + gpr: &mut [Value; 32], rs: usize, rt: usize, d: &DecodedInstr, instr_pc: u64, @@ -1110,6 +1142,8 @@ fn emit_store( builder.switch_to_block(ok_block); builder.seal_block(ok_block); + reload_all_gprs(builder, gpr, ctx_ptr); + EmitResult::Ok } diff --git a/src/jit/context.rs b/src/jit/context.rs index 322017a..01e44d9 100644 --- a/src/jit/context.rs +++ b/src/jit/context.rs @@ -15,6 +15,27 @@ pub const EXIT_EXCEPTION: u32 = 2; pub const EXIT_INTERRUPT_CHECK: u32 = 3; pub const EXIT_HALT: u32 = 4; +/// Max stores we can speculatively track per block. Exceeding this forces the +/// block to be non-speculative (disables rollback for stores past this limit). +pub const WRITE_LOG_CAP: usize = 128; + +/// Single entry in the speculative store write log. Records the pre-store +/// value at `addr` so rollback can restore it if the block exceptions. +#[repr(C)] +#[derive(Copy, Clone)] +pub struct WriteLogEntry { + pub addr: u64, + pub old_val: u64, + pub size: u8, + pub _pad: [u8; 7], +} + +impl WriteLogEntry { + pub const fn empty() -> Self { + Self { addr: 0, old_val: 0, size: 0, _pad: [0; 7] } + } +} + #[repr(C)] pub struct JitContext { // General purpose registers @@ -58,6 +79,13 @@ pub struct JitContext { // Exception status from failed memory access (set by helpers) pub exception_status: u32, _pad0: u32, + + // Speculative store write log. Each entry records the pre-store value at + // an address. On block rollback (speculative exception), replay in reverse + // to restore memory. On normal exit, reset write_log_len to 0. + pub write_log_len: u32, + _pad1: u32, + pub write_log: [WriteLogEntry; WRITE_LOG_CAP], } impl JitContext { @@ -86,6 +114,9 @@ impl JitContext { executor_ptr: 0, exception_status: 0, _pad0: 0, + write_log_len: 0, + _pad1: 0, + write_log: [WriteLogEntry::empty(); WRITE_LOG_CAP], } } diff --git a/src/jit/dispatch.rs b/src/jit/dispatch.rs index ffd7994..611ea88 100644 --- a/src/jit/dispatch.rs +++ b/src/jit/dispatch.rs @@ -11,6 +11,8 @@ use std::collections::{HashMap, VecDeque}; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Mutex; +use super::async_compiler::{AsyncCompiler, CompileRequest, CompileKind}; + // Diagnostic: counts how many times a specific (non-compilable) instruction // type caused trace_block to terminate. 
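For reference, `reload_all_gprs` above leans on `JitContext::gpr_offset(i)` producing fixed byte offsets into the `#[repr(C)]` context. A standalone sketch of that offset arithmetic follows; the struct and helper are illustrative, `offset_of!` needs Rust 1.77+, and the real `gpr_offset` implementation is not shown in this diff.

```rust
use std::mem::offset_of;

#[repr(C)]
struct MiniCtx {
    gpr: [u64; 32],
    hi: u64,
    lo: u64,
}

impl MiniCtx {
    // What a gpr_offset(i) helper boils down to for a #[repr(C)] layout:
    // base offset of the array plus i * size_of::<u64>().
    const fn gpr_offset(i: usize) -> i32 {
        (offset_of!(MiniCtx, gpr) + i * 8) as i32
    }
}

fn main() {
    assert_eq!(MiniCtx::gpr_offset(0), 0);
    assert_eq!(MiniCtx::gpr_offset(31), 31 * 8);
    assert_eq!(offset_of!(MiniCtx, hi), 32 * 8);
    println!("hi lives at byte offset {}", offset_of!(MiniCtx, hi));
}
```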
Key encoding: // bits 31..26: op @@ -63,7 +65,6 @@ use crate::mips_tlb::Tlb; use crate::mips_cache_v2::MipsCache; use super::cache::{BlockTier, CodeCache, TierConfig}; -use super::compiler::BlockCompiler; use super::context::{JitContext, EXIT_NORMAL, EXIT_EXCEPTION}; use super::helpers::HelperPtrs; use super::profile::{self, ProfileEntry}; @@ -191,7 +192,7 @@ pub fn run_jit_dispatch( max_tier, verify_mode, probe.interval, probe.min_interval, probe.max_interval, tier_cfg.stable, tier_cfg.promote, tier_cfg.demote); let helpers = HelperPtrs::new::(); - let mut compiler = BlockCompiler::new(&helpers); + let mut async_comp = AsyncCompiler::new(helpers.clone(), verify_mode); let mut cache = CodeCache::new(); let mut ctx = JitContext::new(); ctx.executor_ptr = exec_ptr as u64; @@ -236,20 +237,28 @@ pub fn run_jit_dispatch( while steps_in_batch < BATCH_SIZE { let burst = probe.next_interval(); - // Interpreter burst + // Interpreter burst — use step_lite (no cp0/interrupt overhead), + // then do bookkeeping in bulk, same as post-JIT-block. { let exec = unsafe { &mut *exec_ptr }; - #[cfg(feature = "lightning")] for _ in 0..burst { - exec.step(); + exec.step_lite(); } - #[cfg(not(feature = "lightning"))] - for _ in 0..burst { - let status = exec.step(); - if status == EXEC_BREAKPOINT { - running.store(false, Ordering::SeqCst); - break; - } + let n = burst as u64; + exec.core.local_cycles += n; + let advance = exec.core.count_step.wrapping_mul(n); + let prev = exec.core.cp0_count; + exec.core.cp0_count = prev.wrapping_add(advance); + if exec.core.cp0_compare.wrapping_sub(prev) <= advance { + exec.core.cp0_cause |= crate::mips_core::CAUSE_IP7; + exec.core.fasttick_count.fetch_add(1, Ordering::Relaxed); + } + let pending = exec.core.interrupts.load(Ordering::Relaxed); + if pending != 0 { + use crate::mips_core::{CAUSE_IP2, CAUSE_IP3, CAUSE_IP4, CAUSE_IP5, CAUSE_IP6}; + let ext_mask = CAUSE_IP2 | CAUSE_IP3 | CAUSE_IP4 | CAUSE_IP5 | CAUSE_IP6; + exec.core.cp0_cause = (exec.core.cp0_cause & !ext_mask) + | (pending as u32 & ext_mask); } } steps_in_batch += burst; @@ -257,6 +266,33 @@ pub fn run_jit_dispatch( if !running.load(Ordering::Relaxed) { break; } + // Drain completed compilations from background thread + while let Some(result) = async_comp.try_recv() { + match result.kind { + CompileKind::New => { + if !cache.contains(result.phys_pc, result.virt_pc) { + cache.insert(result.phys_pc, result.virt_pc, result.block); + blocks_compiled += 1; + probe.set_cache_size(cache.len() as u32); + } + } + CompileKind::Recompile => { + cache.replace(result.phys_pc, result.virt_pc, result.block); + blocks_compiled += 1; + } + CompileKind::ProfileReplay { content_hash } => { + if !cache.contains(result.phys_pc, result.virt_pc) + && result.block.content_hash == content_hash + { + cache.insert(result.phys_pc, result.virt_pc, result.block); + blocks_compiled += 1; + profile_replayed += 1; + probe.set_cache_size(cache.len() as u32); + } + } + } + } + // Probe the JIT code cache let (pc, in_delay_slot) = { let exec = unsafe { &*exec_ptr }; @@ -323,6 +359,7 @@ pub fn run_jit_dispatch( } ctx.exit_reason = 0; + ctx.write_log_len = 0; let entry: extern "C" fn(*mut JitContext) = unsafe { std::mem::transmute(block.entry) }; @@ -335,6 +372,19 @@ pub fn run_jit_dispatch( if ctx.exit_reason == EXIT_EXCEPTION { if let Some(snap) = &snapshot { if is_speculative { + // Replay write log in reverse to undo speculative + // stores before restoring CPU/TLB state. 
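The rollback mechanism is easier to follow against a plain byte buffer than against the guest bus. A toy sketch: `WriteLogEntry` mirrors context.rs above, while the `Mem` type and its little-endian accessors are invented for the example.

```rust
#[derive(Copy, Clone)]
struct WriteLogEntry { addr: u64, old_val: u64, size: u8 }

struct Mem(Vec<u8>);

impl Mem {
    fn read(&self, addr: u64, size: u8) -> u64 {
        let mut v = 0u64;
        for i in 0..size as usize {
            v |= (self.0[addr as usize + i] as u64) << (8 * i); // little-endian
        }
        v
    }
    fn write(&mut self, addr: u64, val: u64, size: u8) {
        for i in 0..size as usize {
            self.0[addr as usize + i] = (val >> (8 * i)) as u8;
        }
    }
}

fn main() {
    let mut mem = Mem(vec![0u8; 64]);
    mem.write(8, 0x1122_3344, 4);

    // Speculative store: log the pre-image first, then perform the write.
    let mut log: Vec<WriteLogEntry> = Vec::new();
    log.push(WriteLogEntry { addr: 8, old_val: mem.read(8, 4), size: 4 });
    mem.write(8, 0xdead_beef, 4);

    // Block raised an exception: replay the log in reverse to restore memory.
    for e in log.iter().rev() {
        mem.write(e.addr, e.old_val, e.size);
    }
    assert_eq!(mem.read(8, 4), 0x1122_3344);
    println!("rolled back ok");
}
```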
+ for i in (0..ctx.write_log_len as usize).rev() { + let e = &ctx.write_log[i]; + match e.size { + 1 => { exec.write_data::<1>(e.addr, e.old_val); } + 2 => { exec.write_data::<2>(e.addr, e.old_val); } + 4 => { exec.write_data::<4>(e.addr, e.old_val); } + 8 => { exec.write_data::<8>(e.addr, e.old_val); } + _ => {} + } + } + ctx.write_log_len = 0; snap.restore(exec); rollbacks += 1; @@ -345,14 +395,18 @@ pub fn run_jit_dispatch( if block.exception_count >= tier_cfg.demote { if let Some(lower) = block.tier.demote() { + let old_tier = block.tier; demotions += 1; eprintln!("JIT: demote {:016x} {:?}→{:?} ({}exc)", - pc, block.tier, lower, block.exception_count); - recompile_block_at_tier( - &mut compiler, &mut cache, exec, - phys_pc, pc, lower, - &mut blocks_compiled, - ); + pc, old_tier, lower, block.exception_count); + cache.invalidate_range(phys_pc, phys_pc + 4); + let instrs = trace_block(exec, pc, lower); + if !instrs.is_empty() { + async_comp.submit(CompileRequest { + instrs, block_pc: pc, phys_pc, + tier: lower, kind: CompileKind::Recompile, + }); + } } else { block.speculative = false; } @@ -388,8 +442,31 @@ pub fn run_jit_dispatch( let jit_hi = exec.core.hi; let jit_lo = exec.core.lo; + let pre_gpr = snap.gpr; + let pre_hi = snap.hi; + let pre_lo = snap.lo; + + let block_has_likely = cache.lookup(phys_pc, pc) + .map(|b| b.has_branch_likely).unwrap_or(false); + + // For branch-likely-not-taken, the delay slot is + // nullified: the JIT counts it in block_len but the + // interpreter's EXEC_BRANCH_LIKELY_SKIP handles it + // in one step. Detect via JIT PC + 4 == expected + // not-taken PC (block_pc + (block_len-1)*4 + 8). + let likely_not_taken = block_has_likely + && block_len >= 2 + && jit_pc == pc.wrapping_add((block_len as u64 - 2) * 4 + 8); + let verify_steps = if likely_not_taken { + block_len - 1 + } else { + block_len + }; + snap.restore(exec); - for _ in 0..block_len { + let pre_epc = exec.core.cp0_epc; + let pre_cause = exec.core.cp0_cause; + for _ in 0..verify_steps { exec.step(); } @@ -397,6 +474,8 @@ pub fn run_jit_dispatch( let interp_pc = exec.core.pc; let interp_hi = exec.core.hi; let interp_lo = exec.core.lo; + let interp_took_exception = exec.core.cp0_epc != pre_epc + || (exec.core.cp0_cause & 0x7C) != (pre_cause & 0x7C); let mut mismatch = false; for i in 0..32 { @@ -423,32 +502,129 @@ pub fn run_jit_dispatch( } if mismatch { - // Check if this is a timing false positive: - // interpreter took an exception (PC in exception vectors) - // while JIT didn't. This happens because the interpreter - // re-run occurs at a different wall-clock time and sees - // different external interrupt state via the atomic. + // Interpreter took an exception (TLB miss, + // bus error, etc.) during verify re-run that + // the JIT didn't see — state is post-exception, + // not comparable. This catches kernel-handled + // exceptions (IRIX installs its own handlers + // past the low PROM vectors). + if interp_took_exception { + total_jit_instrs += block_len as u64; + continue; + } + + // Check for MFC0 Count/Random timing false + // positive: the JIT reads cp0_count/random from + // the executor without advancing them per + // instruction, but the interpreter's verify + // re-run advances them via step(). Scan the + // block for MFC0/DMFC0 of reg 1 (Random) or + // reg 9 (Count); the destination GPRs will + // legitimately differ between JIT and interp. 
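The `(block_len - 2) * 4 + 8` test above is plain address arithmetic: the branch-likely sits at index `block_len - 2`, its nullified delay slot at `block_len - 1`, and the not-taken continuation is the instruction after the slot. A small check of that arithmetic with made-up values:

```rust
fn main() {
    // A block of `n` MIPS instructions ending in a branch-likely:
    //   index n-2: BEQL/BNEL (the branch)
    //   index n-1: its delay slot (nullified when the branch is not taken)
    let block_pc: u64 = 0x0040_1000; // illustrative
    let n: u64 = 6;

    let branch_pc = block_pc + (n - 2) * 4;
    // Not-taken continuation skips the nullified delay slot: branch + 8.
    let not_taken_pc = branch_pc + 8;

    // Same value the verify path computes directly from the block length.
    assert_eq!(not_taken_pc, block_pc + (n - 2) * 4 + 8);
    assert_eq!(not_taken_pc, block_pc + n * 4);

    // The interpreter re-run needs one fewer step in this case, because its
    // branch-likely handling consumes the nullified slot in the same step.
    let jit_steps = n;
    let interp_steps = jit_steps - 1;
    println!("verify with {} interpreter steps instead of {}", interp_steps, jit_steps);
}
```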
+ let mut timing_gprs: u32 = 0; + { + let raw_instrs = trace_block(exec, pc, block_tier); + for (_, d) in raw_instrs.iter() { + if d.op as u32 == crate::mips_isa::OP_COP0 { + let sub = d.rs as u32; + let rd = d.rd as u32; + if (sub == crate::mips_isa::RS_MFC0 + || sub == crate::mips_isa::RS_DMFC0) + && (rd == 1 || rd == 9) + { + timing_gprs |= 1u32 << (d.rt as u32); + } + } + } + } + // Check if all non-timing-GPR mismatches are + // explained by the timing-sensitive GPRs. + let mut only_timing = jit_pc == interp_pc + && jit_hi == interp_hi && jit_lo == interp_lo; + if only_timing { + for i in 0..32 { + if jit_gpr[i] != interp_gpr[i] + && (timing_gprs >> i) & 1 == 0 + { + only_timing = false; + break; + } + } + } + if only_timing { + total_jit_instrs += block_len as u64; + continue; + } + let interp_pc32 = interp_pc as u32; let interp_in_exc = (interp_pc32 >= 0x80000000 && interp_pc32 < 0x80000400) - || interp_pc32 == 0x80000180; // general exception vector + || interp_pc32 == 0x80000180; let jit_pc32 = jit_pc as u32; let jit_not_exc = jit_pc32 < 0x80000000 || jit_pc32 >= 0x80000400; if interp_in_exc && jit_not_exc { - // Timing false positive — interpreter took an interrupt - // the JIT didn't see. Don't invalidate the block. - // Use the interpreter's result (it's authoritative). eprintln!("JIT VERIFY: timing false positive at {:016x} (interp took exception to {:016x}), keeping block", pc, interp_pc); } else { - // Real codegen mismatch — dump and invalidate + // Real codegen mismatch — full diagnostic dump + eprintln!("═══ JIT VERIFY: REAL CODEGEN MISMATCH ═══"); + eprintln!("Block PC: {:016x} phys: {:016x} tier: {:?} len: {}", pc, phys_pc, block_tier, block_len); + + // Pre-state + eprintln!("── Pre-state (input GPRs) ──"); + for i in (0..32).step_by(4) { + eprintln!(" r{:02}={:016x} r{:02}={:016x} r{:02}={:016x} r{:02}={:016x}", + i, pre_gpr[i], i+1, pre_gpr[i+1], i+2, pre_gpr[i+2], i+3, pre_gpr[i+3]); + } + eprintln!(" hi={:016x} lo={:016x}", pre_hi, pre_lo); + + // MIPS instructions let instrs = trace_block(exec, pc, block_tier); - eprintln!("JIT VERIFY: block at {:016x} ({} instrs):", pc, instrs.len()); + eprintln!("── MIPS instructions ({}) ──", instrs.len()); for (idx, (raw, d)) in instrs.iter().enumerate() { let ipc = pc.wrapping_add(idx as u64 * 4); - eprintln!(" {:016x}: {:08x} op={} rs={} rt={} rd={} funct={} imm={:04x}", - ipc, raw, d.op, d.rs, d.rt, d.rd, d.funct, d.imm as u16); + eprintln!(" {:016x}: {:08x} op={:#04x} rs={} rt={} rd={} sa={} funct={:#04x} imm={:#06x}", + ipc, raw, d.op, d.rs, d.rt, d.rd, d.sa, d.funct, d.imm as u16); + } + + // Post-state comparison + eprintln!("── Post-state (JIT vs Interpreter) ──"); + for i in 0..32 { + if jit_gpr[i] != interp_gpr[i] { + eprintln!(" r{:02}: jit={:016x} interp={:016x} pre={:016x} *** MISMATCH", + i, jit_gpr[i], interp_gpr[i], pre_gpr[i]); + } + } + if jit_pc != interp_pc { + eprintln!(" pc: jit={:016x} interp={:016x}", jit_pc, interp_pc); + } + if jit_hi != interp_hi { + eprintln!(" hi: jit={:016x} interp={:016x} pre={:016x}", jit_hi, interp_hi, pre_hi); + } + if jit_lo != interp_lo { + eprintln!(" lo: jit={:016x} interp={:016x} pre={:016x}", jit_lo, interp_lo, pre_lo); + } + + // CLIF IR (if captured) + if let Some(block) = cache.lookup(phys_pc, pc) { + if let Some(ref ir) = block.clif_ir { + eprintln!("── Cranelift CLIF IR ──"); + eprintln!("{}", ir); + } + // Native code hex dump + if block.len_native > 0 { + eprintln!("── Native code ({} bytes) ──", block.len_native); + let code = unsafe { + 
std::slice::from_raw_parts(block.entry, block.len_native as usize) + }; + for chunk in code.chunks(16) { + let hex: Vec = chunk.iter().map(|b| format!("{:02x}", b)).collect(); + eprintln!(" {}", hex.join(" ")); + } + } } + eprintln!("═══ END MISMATCH DUMP ═══"); + cache.invalidate_range(phys_pc, phys_pc + 4); } total_jit_instrs += block_len as u64; @@ -503,11 +679,13 @@ pub fn run_jit_dispatch( promotions += 1; eprintln!("JIT: promote {:016x} {:?}→{:?} ({}hits)", pc, block.tier, next, block.hit_count); - recompile_block_at_tier( - &mut compiler, &mut cache, exec, - phys_pc, pc, next, - &mut blocks_compiled, - ); + let instrs = trace_block(exec, pc, next); + if !instrs.is_empty() { + async_comp.submit(CompileRequest { + instrs, block_pc: pc, phys_pc, + tier: next, kind: CompileKind::Recompile, + }); + } } } } @@ -557,19 +735,13 @@ pub fn run_jit_dispatch( match cache.lookup(next_phys, next_pc) { Some(b) => (b.entry, b.len_mips, b.speculative), None => { - // Compile on miss at max_tier (not Alu). - // The main path always starts at Alu, but - // that fails if the first instruction is - // a load/store — leaving these PCs forever - // uncached. Compile at max_tier directly - // since Loads/Full tiers are proven stable. - let instrs = trace_block(exec, next_pc, max_tier); - if !instrs.is_empty() { - if let Some(mut block) = compiler.compile_block(&instrs, next_pc, max_tier) { - block.phys_addr = next_phys; - cache.insert(next_phys, next_pc, block); - blocks_compiled += 1; - probe.set_cache_size(cache.len() as u32); + if !async_comp.pending.contains(&(next_phys, next_pc)) { + let instrs = trace_block(exec, next_pc, max_tier); + if !instrs.is_empty() { + async_comp.submit(CompileRequest { + instrs, block_pc: next_pc, phys_pc: next_phys, + tier: max_tier, kind: CompileKind::New, + }); } } chain_break_miss += 1; @@ -613,14 +785,18 @@ pub fn run_jit_dispatch( blk.stable_hits = 0; if blk.exception_count >= tier_cfg.demote { if let Some(lower) = blk.tier.demote() { + let old_tier = blk.tier; demotions += 1; eprintln!("JIT: demote {:016x} {:?}→{:?} ({}exc)", - next_pc, blk.tier, lower, blk.exception_count); - recompile_block_at_tier( - &mut compiler, &mut cache, exec, - next_phys, next_pc, lower, - &mut blocks_compiled, - ); + next_pc, old_tier, lower, blk.exception_count); + cache.invalidate_range(next_phys, next_phys + 4); + let instrs = trace_block(exec, next_pc, lower); + if !instrs.is_empty() { + async_comp.submit(CompileRequest { + instrs, block_pc: next_pc, phys_pc: next_phys, + tier: lower, kind: CompileKind::Recompile, + }); + } } else { blk.speculative = false; } @@ -681,19 +857,14 @@ pub fn run_jit_dispatch( } } else { probe.record_miss(); - // Cache miss — compile at Alu tier - let exec = unsafe { &mut *exec_ptr }; - let instrs = trace_block(exec, pc, BlockTier::Alu); - if !instrs.is_empty() { - if let Some(mut block) = compiler.compile_block(&instrs, pc, BlockTier::Alu) { - block.phys_addr = phys_pc; - cache.insert(phys_pc, pc, block); - blocks_compiled += 1; - probe.set_cache_size(cache.len() as u32); - if blocks_compiled <= 10 || blocks_compiled % 500 == 0 { - eprintln!("JIT: compiled #{} at {:016x} ({} instrs, tier=Alu, cache={})", - blocks_compiled, pc, instrs.len(), cache.len()); - } + if !async_comp.pending.contains(&(phys_pc, pc)) { + let exec = unsafe { &mut *exec_ptr }; + let instrs = trace_block(exec, pc, BlockTier::Alu); + if !instrs.is_empty() { + async_comp.submit(CompileRequest { + instrs, block_pc: pc, phys_pc, + tier: BlockTier::Alu, kind: CompileKind::New, + }); } } } @@ 
-706,15 +877,27 @@ pub fn run_jit_dispatch( if profile_replay_active { if let Some(entry) = profile_queue.pop_front() { let exec = unsafe { &mut *exec_ptr }; - replay_one_profile_entry( - &entry, &mut compiler, &mut cache, exec, - &mut blocks_compiled, &mut profile_replayed, - &mut profile_stale, - ); - probe.set_cache_size(cache.len() as u32); - if profile_replayed > 0 && profile_replayed % 1000 == 0 { - eprintln!("JIT profile: replayed {}/{} ({} stale)", - profile_replayed, profile_total, profile_stale); + let phys = translate_pc(exec, entry.virt_pc); + if let Some(phys_pc) = phys { + if !cache.contains(phys_pc, entry.virt_pc) + && !async_comp.pending.contains(&(phys_pc, entry.virt_pc)) + { + let instrs = trace_block(exec, entry.virt_pc, entry.tier); + if !instrs.is_empty() { + let content_hash = super::compiler::hash_block_instrs(&instrs); + if instrs.len() as u32 == entry.len_mips + && content_hash == entry.content_hash + { + async_comp.submit(CompileRequest { + instrs, block_pc: entry.virt_pc, phys_pc, + tier: entry.tier, + kind: CompileKind::ProfileReplay { content_hash }, + }); + } else { + profile_stale += 1; + } + } + } } if profile_queue.is_empty() { eprintln!("JIT profile: replay complete, {} compiled / {} stale", @@ -767,6 +950,23 @@ pub fn run_jit_dispatch( } } + // Shut down background compiler and drain remaining results + async_comp.shutdown(); + while let Some(result) = async_comp.try_recv() { + match result.kind { + CompileKind::New | CompileKind::ProfileReplay { .. } => { + if !cache.contains(result.phys_pc, result.virt_pc) { + cache.insert(result.phys_pc, result.virt_pc, result.block); + blocks_compiled += 1; + } + } + CompileKind::Recompile => { + cache.replace(result.phys_pc, result.virt_pc, result.block); + blocks_compiled += 1; + } + } + } + { let exec = unsafe { &mut *exec_ptr }; exec.flush_cycles(); @@ -818,83 +1018,6 @@ pub fn run_jit_dispatch( /// /// Silently discards entries that can't be validated (unmapped pages, /// different code at the saved VA, already-cached blocks). -fn replay_one_profile_entry( - entry: &ProfileEntry, - compiler: &mut BlockCompiler, - cache: &mut CodeCache, - exec: &mut MipsExecutor, - blocks_compiled: &mut u64, - profile_replayed: &mut u64, - profile_stale: &mut u64, -) { - // Re-derive phys_pc — saved phys_pc is for diagnostics only. TLB state - // differs between sessions, so the same virt_pc may map elsewhere now. - let phys_pc = match translate_pc(exec, entry.virt_pc) { - Some(p) => p, - None => { *profile_stale += 1; return; } // page not mapped this session - }; - - // Skip if a block already exists at this (phys_pc, virt_pc). This can - // happen if normal compilation beat us to it, or a prior replay already - // processed this entry (defensive). - if cache.contains(phys_pc, entry.virt_pc) { - return; - } - - let instrs = trace_block(exec, entry.virt_pc, entry.tier); - if instrs.is_empty() { - *profile_stale += 1; - return; - } - - // Cheap length check first, then definitive hash check. Either mismatch - // means the code at this VA is different from what we saw last session. - if instrs.len() as u32 != entry.len_mips { - *profile_stale += 1; - return; - } - let content_hash = super::compiler::hash_block_instrs(&instrs); - if content_hash != entry.content_hash { - *profile_stale += 1; - return; - } - - if let Some(mut block) = compiler.compile_block(&instrs, entry.virt_pc, entry.tier) { - block.phys_addr = phys_pc; - // Zero all counters — no penalty baggage from prior session. 
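The replay guard above reduces to a pure staleness predicate: an entry is resubmitted only when the code currently at its virtual address still has the saved length and content hash. An illustrative mirror of that check; field names follow `ProfileEntry` as used in this diff, the helper itself is not from the tree.

```rust
struct ProfileEntry { len_mips: u32, content_hash: u32 }

// Cheap length check first, then the definitive hash comparison.
fn is_stale(entry: &ProfileEntry, traced_len: u32, traced_hash: u32) -> bool {
    traced_len != entry.len_mips || traced_hash != entry.content_hash
}

fn main() {
    let saved = ProfileEntry { len_mips: 12, content_hash: 0xdead_beef };
    assert!(!is_stale(&saved, 12, 0xdead_beef)); // same code: replay it
    assert!(is_stale(&saved, 12, 0x1234_5678));  // same length, different DSO: stale
    assert!(is_stale(&saved, 9, 0xdead_beef));   // different block shape: stale
    println!("staleness checks behave as described");
}
```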
- // speculative is left as compile_block set it: Full-tier is NOT - // speculative because rollback can't un-do stores (memory diverges - // from CPU state). Alu/Loads tiers are speculative and will re-prove - // stability via the normal snapshot/rollback path this session. - block.hit_count = 0; - block.stable_hits = 0; - block.exception_count = 0; - cache.insert(phys_pc, entry.virt_pc, block); - *blocks_compiled += 1; - *profile_replayed += 1; - } -} - -/// Recompile a block at a different tier, replacing the existing cache entry. -fn recompile_block_at_tier( - compiler: &mut BlockCompiler, - cache: &mut CodeCache, - exec: &mut MipsExecutor, - phys_pc: u64, - virt_pc: u64, - tier: BlockTier, - blocks_compiled: &mut u64, -) { - let instrs = trace_block(exec, virt_pc, tier); - if !instrs.is_empty() { - if let Some(mut block) = compiler.compile_block(&instrs, virt_pc, tier) { - block.phys_addr = phys_pc; - cache.replace(phys_pc, virt_pc, block); - *blocks_compiled += 1; - } - } -} - fn interpreter_loop( exec: &mut MipsExecutor, running: &AtomicBool, @@ -966,7 +1089,7 @@ fn trace_block( // code (confirmed by IRIS_JIT_VERIFY catching real GPR mismatches). The // safe ceiling was empirically determined: aarch64 tolerates 3, x86_64 // only 1. Bumping past this threshold produces silent miscompilations. - let max_helpers: u32 = if cfg!(target_arch = "aarch64") { 3 } else { 1 }; + let max_helpers: u32 = MAX_BLOCK_LEN as u32; let mut helper_count: u32 = 0; for _ in 0..max_len { @@ -986,11 +1109,12 @@ fn trace_block( let is_branch = is_branch_or_jump(&d); - // Full-tier: terminate BEFORE stores. Store-containing blocks must be - // non-speculative (can't rollback memory), which disables the - // self-healing safety net (rollback + demotion on codegen error). - // By excluding stores, all Full-tier blocks stay load-only → speculative - // → self-healing. Stores go to interpreter, where they're always correct. + // Full-tier: terminate BEFORE stores. The write log approach works for + // RAM but fails for MMIO: pre-reads of device registers have side + // effects (e.g., clear-on-read status bits), and replay writes to + // devices (DMA control, audio FIFOs) corrupt device state. Proper + // MMIO-aware speculation would require tracking which physical ranges + // are RAM vs MMIO and disabling speculation when MMIO is touched. if tier == BlockTier::Full && is_compilable_store(&d) && !jit_no_stores() { record_termination(&d, tier); break; diff --git a/src/jit/helpers.rs b/src/jit/helpers.rs index 303badd..22cd53d 100644 --- a/src/jit/helpers.rs +++ b/src/jit/helpers.rs @@ -5,11 +5,40 @@ //! exec_ptr derives from a &mut in the dispatch loop and apply noalias //! optimizations that cause stale reads. -use super::context::{JitContext, EXIT_EXCEPTION}; +use super::context::{JitContext, EXIT_EXCEPTION, WRITE_LOG_CAP, WriteLogEntry}; use crate::mips_exec::{MipsExecutor, EXEC_COMPLETE}; use crate::mips_tlb::Tlb; use crate::mips_cache_v2::MipsCache; +/// Pre-read + log old value for speculative rollback. Returns false on read +/// failure (exception) or log full — caller should skip the write in both cases. +#[inline(always)] +fn log_pre_store( + ctx: &mut JitContext, + exec: &mut MipsExecutor, + virt_addr: u64, +) -> Result<(), u32> { + if (ctx.write_log_len as usize) >= WRITE_LOG_CAP { + // Log full — treat as non-recoverable; caller must mark block non-speculative + // at compile time based on store count. Belt-and-suspenders: refuse the write. 
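The log-full comment above, together with the trace_block MMIO comment in this diff, points at a compile-time speculation policy: a block can stay speculative only if all of its stores fit in the write log and none can hit a device register. A hypothetical policy function of that shape; `WRITE_LOG_CAP` mirrors context.rs, everything else is invented for illustration.

```rust
// Mirrors context.rs; the rest of this sketch is a hypothetical policy,
// not code from the tree.
const WRITE_LOG_CAP: usize = 128;

struct BlockFacts {
    store_count: usize,
    touches_mmio: bool, // would require a RAM/MMIO range classifier to compute
}

/// A block may run speculatively (snapshot + write-log rollback) only if every
/// store it can perform fits in the write log and none of them can reach a
/// device register, where pre-reads and replayed writes have side effects.
fn may_speculate(b: &BlockFacts) -> bool {
    b.store_count <= WRITE_LOG_CAP && !b.touches_mmio
}

fn main() {
    assert!(may_speculate(&BlockFacts { store_count: 3, touches_mmio: false }));
    assert!(!may_speculate(&BlockFacts { store_count: 200, touches_mmio: false }));
    assert!(!may_speculate(&BlockFacts { store_count: 1, touches_mmio: true }));
    println!("speculation policy behaves as described");
}
```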
+ return Err(0); + } + match exec.read_data::(virt_addr) { + Ok(old_val) => { + let idx = ctx.write_log_len as usize; + ctx.write_log[idx] = WriteLogEntry { + addr: virt_addr, + old_val, + size: SIZE as u8, + _pad: [0; 7], + }; + ctx.write_log_len += 1; + Ok(()) + } + Err(status) => Err(status), + } +} + /// Opaque cast that defeats LLVM's alias analysis and pointer provenance tracking. /// `#[inline(never)]` ensures LLVM can't see through this to recover provenance. #[inline(never)] @@ -75,6 +104,9 @@ pub extern "C" fn jit_write_u8( ) -> u64 { let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; + if let Err(s) = log_pre_store::<1, T, C>(ctx, exec, virt_addr) { + ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = s; return 0; + } let status = exec.write_data::<1>(virt_addr, value); if status != EXEC_COMPLETE { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; } 0 @@ -85,6 +117,9 @@ pub extern "C" fn jit_write_u16( ) -> u64 { let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; + if let Err(s) = log_pre_store::<2, T, C>(ctx, exec, virt_addr) { + ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = s; return 0; + } let status = exec.write_data::<2>(virt_addr, value); if status != EXEC_COMPLETE { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; } 0 @@ -95,6 +130,9 @@ pub extern "C" fn jit_write_u32( ) -> u64 { let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; + if let Err(s) = log_pre_store::<4, T, C>(ctx, exec, virt_addr) { + ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = s; return 0; + } let status = exec.write_data::<4>(virt_addr, value); if status != EXEC_COMPLETE { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; } 0 @@ -105,6 +143,9 @@ pub extern "C" fn jit_write_u64( ) -> u64 { let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; + if let Err(s) = log_pre_store::<8, T, C>(ctx, exec, virt_addr) { + ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = s; return 0; + } let status = exec.write_data::<8>(virt_addr, value); if status != EXEC_COMPLETE { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; } 0 @@ -190,6 +231,23 @@ pub struct HelperPtrs { pub dmtc0: *const u8, } +// These are stable monomorphized function pointers, safe to send across threads. +unsafe impl Send for HelperPtrs {} + +impl Clone for HelperPtrs { + fn clone(&self) -> Self { + Self { + read_u8: self.read_u8, read_u16: self.read_u16, + read_u32: self.read_u32, read_u64: self.read_u64, + write_u8: self.write_u8, write_u16: self.write_u16, + write_u32: self.write_u32, write_u64: self.write_u64, + interp_step: self.interp_step, + mfc0: self.mfc0, dmfc0: self.dmfc0, + mtc0: self.mtc0, dmtc0: self.dmtc0, + } + } +} + impl HelperPtrs { pub fn new() -> Self { Self { diff --git a/src/jit/mod.rs b/src/jit/mod.rs index 15b7645..cb1c16f 100644 --- a/src/jit/mod.rs +++ b/src/jit/mod.rs @@ -12,6 +12,7 @@ pub mod profile; pub mod snapshot; pub mod trace; pub mod codegen_test; +pub mod async_compiler; pub use context::JitContext; pub use cache::{CodeCache, CompiledBlock}; diff --git a/src/mips_exec.rs b/src/mips_exec.rs index 9e367bf..1580e53 100644 --- a/src/mips_exec.rs +++ b/src/mips_exec.rs @@ -931,6 +931,50 @@ For R4000SC/MC CPUs: result } + /// Lightweight step for JIT interpreter bursts. 
+    /// Skips cp0_count advancement and local_cycles — the JIT dispatch loop does those
+    /// in bulk after the burst. Keeps interrupt checking because the
+    /// kernel depends on per-instruction interrupt delivery.
+    #[inline(always)]
+    pub fn step_lite(&mut self) -> ExecStatus {
+        let pending = unsafe { &*self.interrupts_ptr }.load(Ordering::Relaxed);
+
+        let pc = self.core.pc;
+
+        if (pending | self.core.cp0_cause as u64) != 0 {
+            if pending & SOFT_RESET_BIT != 0 {
+                self.core.reset(true);
+                self.in_delay_slot = false;
+                self.delay_slot_target = 0;
+                return EXEC_COMPLETE;
+            }
+            self.core.cp0_cause = (self.core.cp0_cause & !EXT_INT_MASK) | (pending as u32 & EXT_INT_MASK);
+            if self.core.interrupts_enabled() {
+                let ip = self.core.cp0_cause & crate::mips_core::CAUSE_IP_MASK;
+                let im = self.core.cp0_status & crate::mips_core::STATUS_IM_MASK;
+                if (ip & im) != 0 {
+                    let s = exec_exception(EXC_INT);
+                    return self.handle_exception(s);
+                }
+            }
+        }
+
+        let fetch = self.fetch_instr(pc);
+        if fetch.status != EXEC_COMPLETE {
+            return if fetch.status & EXEC_IS_EXCEPTION != 0 {
+                self.handle_exception(fetch.status)
+            } else {
+                fetch.status
+            };
+        }
+        let slot = fetch.instr as *mut DecodedInstr;
+        let d = unsafe { &mut *slot };
+        if !d.decoded {
+            decode_into::(d);
+        }
+        self.exec_decoded(unsafe { &*slot })
+    }
+
     #[inline(always)]
     fn check_breakpoint(&mut self, addr: u64) -> bool {
         if KIND == BpType::Pc as u8 {
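`step_lite` keeps the per-instruction interrupt gate, which boils down to: take an interrupt only when a pending Cause.IP bit is unmasked in Status.IM and interrupts are globally enabled. A standalone sketch with the standard MIPS bit positions; the constants below are assumptions standing in for the real ones in `mips_core`.

```rust
// Standard MIPS coprocessor-0 bit positions; the real constants live in
// mips_core and may differ in naming.
const CAUSE_IP_MASK: u32 = 0x0000_ff00;  // Cause.IP7..IP0
const STATUS_IM_MASK: u32 = 0x0000_ff00; // Status.IM7..IM0
const STATUS_IE: u32 = 1 << 0;
const STATUS_EXL: u32 = 1 << 1;
const STATUS_ERL: u32 = 1 << 2;

fn interrupt_pending(cp0_cause: u32, cp0_status: u32) -> bool {
    // Globally enabled only when IE=1 and we are not already in EXL/ERL.
    let enabled = cp0_status & STATUS_IE != 0
        && cp0_status & (STATUS_EXL | STATUS_ERL) == 0;
    let ip = cp0_cause & CAUSE_IP_MASK;
    let im = cp0_status & STATUS_IM_MASK;
    enabled && (ip & im) != 0
}

fn main() {
    let status = STATUS_IE | (1 << 15);                         // IE set, IM7 unmasked
    assert!(interrupt_pending(1 << 15, status));                // IP7 pending: take it
    assert!(!interrupt_pending(1 << 10, status));               // IP2 pending but masked
    assert!(!interrupt_pending(1 << 15, status | STATUS_EXL));  // already in exception
    println!("timer interrupt would be delivered");
}
```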