diff --git a/src/jit/async_compiler.rs b/src/jit/async_compiler.rs
new file mode 100644
index 0000000..98f3172
--- /dev/null
+++ b/src/jit/async_compiler.rs
@@ -0,0 +1,127 @@
+//! Background JIT compilation thread pool. Moves Cranelift work off the
+//! interpreter's hot path so compilation never stalls execution.
+
+use std::collections::HashSet;
+use std::sync::{mpsc, Arc, Mutex};
+use std::thread;
+
+use crate::mips_exec::DecodedInstr;
+use super::cache::{BlockTier, CompiledBlock};
+use super::compiler::BlockCompiler;
+use super::helpers::HelperPtrs;
+
+const NUM_COMPILER_THREADS: usize = 1;
+
+pub enum CompileKind {
+    New,
+    Recompile,
+    ProfileReplay { content_hash: u32 },
+}
+
+pub struct CompileRequest {
+    pub instrs: Vec<(u32, DecodedInstr)>,
+    pub block_pc: u64,
+    pub phys_pc: u64,
+    pub tier: BlockTier,
+    pub kind: CompileKind,
+}
+
+pub struct CompileResult {
+    pub block: CompiledBlock,
+    pub phys_pc: u64,
+    pub virt_pc: u64,
+    pub kind: CompileKind,
+}
+
+pub struct AsyncCompiler {
+    tx: Option<mpsc::Sender<CompileRequest>>,
+    rx: mpsc::Receiver<CompileResult>,
+    handles: Vec<thread::JoinHandle<()>>,
+    pub pending: HashSet<(u64, u64)>,
+}
+
+impl AsyncCompiler {
+    pub fn new(helpers: HelperPtrs, capture_ir: bool) -> Self {
+        let (req_tx, req_rx) = mpsc::channel::<CompileRequest>();
+        let (res_tx, res_rx) = mpsc::sync_channel::<CompileResult>(64);
+        let req_rx = Arc::new(Mutex::new(req_rx));
+
+        let mut handles = Vec::with_capacity(NUM_COMPILER_THREADS);
+        for i in 0..NUM_COMPILER_THREADS {
+            let rx = Arc::clone(&req_rx);
+            let tx = res_tx.clone();
+            let h = helpers.clone();
+            let handle = thread::Builder::new()
+                .name(format!("jit-compiler-{}", i))
+                .spawn(move || {
+                    let mut compiler = BlockCompiler::new(&h);
+                    compiler.capture_ir = capture_ir;
+                    loop {
+                        let req = {
+                            let guard = rx.lock().unwrap();
+                            guard.recv()
+                        };
+                        match req {
+                            Ok(req) => {
+                                if let Some(mut block) = compiler.compile_block(&req.instrs, req.block_pc, req.tier) {
+                                    block.phys_addr = req.phys_pc;
+                                    let _ = tx.send(CompileResult {
+                                        block,
+                                        phys_pc: req.phys_pc,
+                                        virt_pc: req.block_pc,
+                                        kind: req.kind,
+                                    });
+                                }
+                            }
+                            Err(_) => break,
+                        }
+                    }
+                })
+                .expect("failed to spawn JIT compiler thread");
+            handles.push(handle);
+        }
+
+        eprintln!("JIT: {} background compiler threads", NUM_COMPILER_THREADS);
+
+        Self {
+            tx: Some(req_tx),
+            rx: res_rx,
+            handles,
+            pending: HashSet::new(),
+        }
+    }
+
+    pub fn submit(&mut self, req: CompileRequest) {
+        let key = (req.phys_pc, req.block_pc);
+        if self.pending.contains(&key) {
+            return;
+        }
+        self.pending.insert(key);
+        if let Some(tx) = &self.tx {
+            let _ = tx.send(req);
+        }
+    }
+
+    pub fn try_recv(&mut self) -> Option<CompileResult> {
+        match self.rx.try_recv() {
+            Ok(result) => {
+                self.pending.remove(&(result.phys_pc, result.virt_pc));
+                Some(result)
+            }
+            Err(_) => None,
+        }
+    }
+
+    pub fn shutdown(&mut self) {
+        self.tx.take();
+        for handle in self.handles.drain(..) {
+            let _ = handle.join();
+        }
+    }
+}
+
+impl Drop for AsyncCompiler {
+    fn drop(&mut self) {
+        self.shutdown();
+    }
+}
diff --git a/src/jit/cache.rs b/src/jit/cache.rs
index 6388fbb..0e1798e 100644
--- a/src/jit/cache.rs
+++ b/src/jit/cache.rs
@@ -77,6 +77,12 @@ pub struct CompiledBlock {
     /// FNV-1a hash of the raw instruction words; used to detect stale profile
     /// entries when a different DSO is loaded at the same virtual address.
     pub content_hash: u32,
+    /// Block ends with a branch-likely instruction (BEQL/BNEL/etc). Verify mode
+    /// needs this to adjust step count when the branch is not taken (delay slot
+    /// nullified = one fewer interpreter step).
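The request/result channel shape in `AsyncCompiler` above is easiest to see in isolation. Below is a minimal, self-contained sketch of the same pattern, with a toy worker standing in for `BlockCompiler::compile_block` and a `pending` set doing the same in-flight deduplication as `AsyncCompiler::submit`; every name in the sketch is illustrative, not from the tree.

```rust
use std::collections::HashSet;
use std::sync::mpsc;
use std::thread;

struct Request { key: (u64, u64), input: u32 }
struct Reply { key: (u64, u64), output: u32 }

fn main() {
    let (req_tx, req_rx) = mpsc::channel::<Request>();
    // Bounded result channel so a stalled consumer back-pressures the worker.
    let (res_tx, res_rx) = mpsc::sync_channel::<Reply>(64);

    // Worker owns the request receiver; it exits when every sender is dropped.
    let worker = thread::spawn(move || {
        while let Ok(req) = req_rx.recv() {
            let output = req.input.wrapping_mul(3); // stand-in for compile_block()
            let _ = res_tx.send(Reply { key: req.key, output });
        }
    });

    // `pending` deduplicates in-flight requests, like AsyncCompiler::submit.
    let mut pending: HashSet<(u64, u64)> = HashSet::new();
    for i in 0..4u64 {
        let key = (i, 0);
        if pending.insert(key) {
            req_tx.send(Request { key, input: i as u32 }).unwrap();
        }
    }

    // Shutdown mirrors AsyncCompiler::shutdown: drop the sender, then drain.
    drop(req_tx);
    while let Ok(reply) = res_rx.recv() {
        pending.remove(&reply.key);
        println!("{:?} -> {}", reply.key, reply.output);
    }
    worker.join().unwrap();
}
```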
+ pub has_branch_likely: bool, + /// Cranelift CLIF IR captured at compile time (only when IRIS_JIT_VERIFY=1). + pub clif_ir: Option, } // Safety: CompiledBlock is only accessed from the CPU thread. diff --git a/src/jit/compiler.rs b/src/jit/compiler.rs index d0df257..90fd898 100644 --- a/src/jit/compiler.rs +++ b/src/jit/compiler.rs @@ -20,6 +20,7 @@ pub struct BlockCompiler { ctx: Context, builder_ctx: FunctionBuilderContext, func_id_counter: u32, + pub capture_ir: bool, // Declared function IDs for memory helpers (registered as imports) fn_read_u8: FuncId, fn_read_u16: FuncId, @@ -116,6 +117,7 @@ impl BlockCompiler { jit_module, builder_ctx: FunctionBuilderContext::new(), func_id_counter: 0, + capture_ir: false, fn_read_u8, fn_read_u16, fn_read_u32, fn_read_u64, fn_write_u8, fn_write_u16, fn_write_u32, fn_write_u64, fn_interp_step, @@ -207,6 +209,7 @@ impl BlockCompiler { // Emit IR for each instruction let mut compiled_count = 0u32; let mut branch_exit_pc: Option = None; + let mut has_branch_likely = false; let mut idx = 0; while idx < instrs.len() { @@ -265,6 +268,7 @@ impl BlockCompiler { break; } EmitResult::BranchLikely { taken, not_taken, cond } => { + has_branch_likely = true; compiled_count += 1; idx += 1; if idx < instrs.len() { @@ -347,13 +351,20 @@ impl BlockCompiler { builder.ins().return_(&[]); builder.finalize(); + // Capture CLIF IR before define_function consumes it (for verify diagnostics) + let clif_ir = if self.capture_ir { + Some(format!("{}", self.ctx.func.display())) + } else { + None + }; + // Compile to native code self.jit_module.define_function(func_id, &mut self.ctx).unwrap(); + let code_size = self.ctx.compiled_code().unwrap().code_info().total_size; self.jit_module.clear_context(&mut self.ctx); self.jit_module.finalize_definitions().unwrap(); let code_ptr = self.jit_module.get_finalized_function(func_id); - let code_size = 0u32; // JITModule doesn't expose size easily; not critical let content_hash = hash_block_instrs(instrs); @@ -362,20 +373,18 @@ impl BlockCompiler { phys_addr: 0, // filled in by caller virt_addr: block_pc, len_mips: compiled_count, - len_native: code_size, + len_native: code_size as u32, tier, - // Speculative blocks get snapshot/rollback on exception, providing - // self-healing: codegen errors cause exceptions → rollback to correct - // state → demotion after 3 failures → bad block replaced. - // - // Non-speculative is ONLY safe when the block contains stores, because - // rollback can't undo memory writes (RMW double-apply). Load-only blocks - // at any tier should always be speculative for the safety net. + // Speculative blocks get snapshot/rollback on exception. Store- + // containing blocks are non-speculative because the write log + // approach is incompatible with MMIO writes. speculative: !block_has_stores(instrs), hit_count: 0, exception_count: 0, stable_hits: 0, content_hash, + has_branch_likely, + clif_ir, }) } } @@ -388,6 +397,11 @@ fn block_has_stores(instrs: &[(u32, DecodedInstr)]) -> bool { instrs.iter().any(|(_, d)| matches!(d.op as u32, OP_SB | OP_SH | OP_SW | OP_SD)) } +fn block_store_count(instrs: &[(u32, DecodedInstr)]) -> u32 { + use crate::mips_isa::*; + instrs.iter().filter(|(_, d)| matches!(d.op as u32, OP_SB | OP_SH | OP_SW | OP_SD)).count() as u32 +} + /// FNV-1a 32-bit hash of raw instruction words. Used to detect stale profile /// entries: a different DSO loaded at the same virtual address will have the /// same length but different instruction bytes. 
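`hash_block_instrs` itself is outside this hunk; the sketch below is a standard 32-bit FNV-1a over instruction words, assuming little-endian byte order, to illustrate why two equal-length blocks from different DSOs hash differently.

```rust
/// Standard 32-bit FNV-1a over raw MIPS instruction words. The real
/// `hash_block_instrs` is not shown in this diff; byte order here is an
/// assumption made for the example.
fn fnv1a_32_words(words: &[u32]) -> u32 {
    const OFFSET_BASIS: u32 = 0x811c_9dc5;
    const PRIME: u32 = 0x0100_0193;
    let mut h = OFFSET_BASIS;
    for w in words {
        for b in w.to_le_bytes() {
            h ^= b as u32;             // fold the byte in...
            h = h.wrapping_mul(PRIME); // ...then multiply by the FNV prime
        }
    }
    h
}

fn main() {
    // Two blocks of equal length but different code hash differently, which
    // is exactly the stale-profile case content_hash guards against.
    let a = [0x2402_0001u32, 0x0000_000c]; // li v0,1; syscall
    let b = [0x2402_0002u32, 0x0000_000c]; // li v0,2; syscall
    assert_eq!(a.len(), b.len());
    assert_ne!(fnv1a_32_words(&a), fnv1a_32_words(&b));
    println!("{:08x} vs {:08x}", fnv1a_32_words(&a), fnv1a_32_words(&b));
}
```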
@@ -981,6 +995,20 @@ fn flush_modified_gprs( *modified = 0; } +fn reload_all_gprs( + builder: &mut FunctionBuilder, + gpr: &mut [Value; 32], + ctx_ptr: Value, +) { + let mem = MemFlags::new(); + for i in 1..32usize { + gpr[i] = builder.ins().load( + types::I64, mem, ctx_ptr, + ir::immediates::Offset32::new(JitContext::gpr_offset(i)), + ); + } +} + // ─── Load/Store emitters ───────────────────────────────────────────────────── /// Load width tag passed to emit_load so it applies the correct sign extension. @@ -1040,6 +1068,10 @@ fn emit_load( builder.seal_block(ok_block); let val = builder.block_params(ok_block)[0]; + // Reload ALL GPRs from ctx after helper call. This resets SSA live-value + // pressure so regalloc2 never sees accumulated diamonds from multiple helpers. + reload_all_gprs(builder, gpr, ctx_ptr); + // Apply correct sign/zero extension based on load width gpr[rt] = match (width, sign_extend) { (LoadWidth::Byte, true) => { @@ -1071,7 +1103,7 @@ fn emit_store( builder: &mut FunctionBuilder, ctx_ptr: Value, exec_ptr: Value, helper: FuncRef, - gpr: &[Value; 32], + gpr: &mut [Value; 32], rs: usize, rt: usize, d: &DecodedInstr, instr_pc: u64, @@ -1110,6 +1142,8 @@ fn emit_store( builder.switch_to_block(ok_block); builder.seal_block(ok_block); + reload_all_gprs(builder, gpr, ctx_ptr); + EmitResult::Ok } diff --git a/src/jit/context.rs b/src/jit/context.rs index 322017a..01e44d9 100644 --- a/src/jit/context.rs +++ b/src/jit/context.rs @@ -15,6 +15,27 @@ pub const EXIT_EXCEPTION: u32 = 2; pub const EXIT_INTERRUPT_CHECK: u32 = 3; pub const EXIT_HALT: u32 = 4; +/// Max stores we can speculatively track per block. Exceeding this forces the +/// block to be non-speculative (disables rollback for stores past this limit). +pub const WRITE_LOG_CAP: usize = 128; + +/// Single entry in the speculative store write log. Records the pre-store +/// value at `addr` so rollback can restore it if the block exceptions. +#[repr(C)] +#[derive(Copy, Clone)] +pub struct WriteLogEntry { + pub addr: u64, + pub old_val: u64, + pub size: u8, + pub _pad: [u8; 7], +} + +impl WriteLogEntry { + pub const fn empty() -> Self { + Self { addr: 0, old_val: 0, size: 0, _pad: [0; 7] } + } +} + #[repr(C)] pub struct JitContext { // General purpose registers @@ -58,6 +79,13 @@ pub struct JitContext { // Exception status from failed memory access (set by helpers) pub exception_status: u32, _pad0: u32, + + // Speculative store write log. Each entry records the pre-store value at + // an address. On block rollback (speculative exception), replay in reverse + // to restore memory. On normal exit, reset write_log_len to 0. + pub write_log_len: u32, + _pad1: u32, + pub write_log: [WriteLogEntry; WRITE_LOG_CAP], } impl JitContext { @@ -86,6 +114,9 @@ impl JitContext { executor_ptr: 0, exception_status: 0, _pad0: 0, + write_log_len: 0, + _pad1: 0, + write_log: [WriteLogEntry::empty(); WRITE_LOG_CAP], } } diff --git a/src/jit/dispatch.rs b/src/jit/dispatch.rs index ffd7994..611ea88 100644 --- a/src/jit/dispatch.rs +++ b/src/jit/dispatch.rs @@ -11,6 +11,8 @@ use std::collections::{HashMap, VecDeque}; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Mutex; +use super::async_compiler::{AsyncCompiler, CompileRequest, CompileKind}; + // Diagnostic: counts how many times a specific (non-compilable) instruction // type caused trace_block to terminate. 
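For reference, `reload_all_gprs` above leans on `JitContext::gpr_offset(i)` producing fixed byte offsets into the `#[repr(C)]` context. A standalone sketch of that offset arithmetic follows; the struct and helper are illustrative, `offset_of!` needs Rust 1.77+, and the real `gpr_offset` implementation is not shown in this diff.

```rust
use std::mem::offset_of;

#[repr(C)]
struct MiniCtx {
    gpr: [u64; 32],
    hi: u64,
    lo: u64,
}

impl MiniCtx {
    // What a gpr_offset(i) helper boils down to for a #[repr(C)] layout:
    // base offset of the array plus i * size_of::<u64>().
    const fn gpr_offset(i: usize) -> i32 {
        (offset_of!(MiniCtx, gpr) + i * 8) as i32
    }
}

fn main() {
    assert_eq!(MiniCtx::gpr_offset(0), 0);
    assert_eq!(MiniCtx::gpr_offset(31), 31 * 8);
    assert_eq!(offset_of!(MiniCtx, hi), 32 * 8);
    println!("hi lives at byte offset {}", offset_of!(MiniCtx, hi));
}
```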
Key encoding: // bits 31..26: op @@ -63,7 +65,6 @@ use crate::mips_tlb::Tlb; use crate::mips_cache_v2::MipsCache; use super::cache::{BlockTier, CodeCache, TierConfig}; -use super::compiler::BlockCompiler; use super::context::{JitContext, EXIT_NORMAL, EXIT_EXCEPTION}; use super::helpers::HelperPtrs; use super::profile::{self, ProfileEntry}; @@ -191,7 +192,7 @@ pub fn run_jit_dispatch( max_tier, verify_mode, probe.interval, probe.min_interval, probe.max_interval, tier_cfg.stable, tier_cfg.promote, tier_cfg.demote); let helpers = HelperPtrs::new::(); - let mut compiler = BlockCompiler::new(&helpers); + let mut async_comp = AsyncCompiler::new(helpers.clone(), verify_mode); let mut cache = CodeCache::new(); let mut ctx = JitContext::new(); ctx.executor_ptr = exec_ptr as u64; @@ -236,20 +237,28 @@ pub fn run_jit_dispatch( while steps_in_batch < BATCH_SIZE { let burst = probe.next_interval(); - // Interpreter burst + // Interpreter burst — use step_lite (no cp0/interrupt overhead), + // then do bookkeeping in bulk, same as post-JIT-block. { let exec = unsafe { &mut *exec_ptr }; - #[cfg(feature = "lightning")] for _ in 0..burst { - exec.step(); + exec.step_lite(); } - #[cfg(not(feature = "lightning"))] - for _ in 0..burst { - let status = exec.step(); - if status == EXEC_BREAKPOINT { - running.store(false, Ordering::SeqCst); - break; - } + let n = burst as u64; + exec.core.local_cycles += n; + let advance = exec.core.count_step.wrapping_mul(n); + let prev = exec.core.cp0_count; + exec.core.cp0_count = prev.wrapping_add(advance); + if exec.core.cp0_compare.wrapping_sub(prev) <= advance { + exec.core.cp0_cause |= crate::mips_core::CAUSE_IP7; + exec.core.fasttick_count.fetch_add(1, Ordering::Relaxed); + } + let pending = exec.core.interrupts.load(Ordering::Relaxed); + if pending != 0 { + use crate::mips_core::{CAUSE_IP2, CAUSE_IP3, CAUSE_IP4, CAUSE_IP5, CAUSE_IP6}; + let ext_mask = CAUSE_IP2 | CAUSE_IP3 | CAUSE_IP4 | CAUSE_IP5 | CAUSE_IP6; + exec.core.cp0_cause = (exec.core.cp0_cause & !ext_mask) + | (pending as u32 & ext_mask); } } steps_in_batch += burst; @@ -257,6 +266,33 @@ pub fn run_jit_dispatch( if !running.load(Ordering::Relaxed) { break; } + // Drain completed compilations from background thread + while let Some(result) = async_comp.try_recv() { + match result.kind { + CompileKind::New => { + if !cache.contains(result.phys_pc, result.virt_pc) { + cache.insert(result.phys_pc, result.virt_pc, result.block); + blocks_compiled += 1; + probe.set_cache_size(cache.len() as u32); + } + } + CompileKind::Recompile => { + cache.replace(result.phys_pc, result.virt_pc, result.block); + blocks_compiled += 1; + } + CompileKind::ProfileReplay { content_hash } => { + if !cache.contains(result.phys_pc, result.virt_pc) + && result.block.content_hash == content_hash + { + cache.insert(result.phys_pc, result.virt_pc, result.block); + blocks_compiled += 1; + profile_replayed += 1; + probe.set_cache_size(cache.len() as u32); + } + } + } + } + // Probe the JIT code cache let (pc, in_delay_slot) = { let exec = unsafe { &*exec_ptr }; @@ -323,6 +359,7 @@ pub fn run_jit_dispatch( } ctx.exit_reason = 0; + ctx.write_log_len = 0; let entry: extern "C" fn(*mut JitContext) = unsafe { std::mem::transmute(block.entry) }; @@ -335,6 +372,19 @@ pub fn run_jit_dispatch( if ctx.exit_reason == EXIT_EXCEPTION { if let Some(snap) = &snapshot { if is_speculative { + // Replay write log in reverse to undo speculative + // stores before restoring CPU/TLB state. 
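The rollback mechanism is easier to follow against a plain byte buffer than against the guest bus. A toy sketch: `WriteLogEntry` mirrors context.rs above, while the `Mem` type and its little-endian accessors are invented for the example.

```rust
#[derive(Copy, Clone)]
struct WriteLogEntry { addr: u64, old_val: u64, size: u8 }

struct Mem(Vec<u8>);

impl Mem {
    fn read(&self, addr: u64, size: u8) -> u64 {
        let mut v = 0u64;
        for i in 0..size as usize {
            v |= (self.0[addr as usize + i] as u64) << (8 * i); // little-endian
        }
        v
    }
    fn write(&mut self, addr: u64, val: u64, size: u8) {
        for i in 0..size as usize {
            self.0[addr as usize + i] = (val >> (8 * i)) as u8;
        }
    }
}

fn main() {
    let mut mem = Mem(vec![0u8; 64]);
    mem.write(8, 0x1122_3344, 4);

    // Speculative store: log the pre-image first, then perform the write.
    let mut log: Vec<WriteLogEntry> = Vec::new();
    log.push(WriteLogEntry { addr: 8, old_val: mem.read(8, 4), size: 4 });
    mem.write(8, 0xdead_beef, 4);

    // Block raised an exception: replay the log in reverse to restore memory.
    for e in log.iter().rev() {
        mem.write(e.addr, e.old_val, e.size);
    }
    assert_eq!(mem.read(8, 4), 0x1122_3344);
    println!("rolled back ok");
}
```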
+ for i in (0..ctx.write_log_len as usize).rev() { + let e = &ctx.write_log[i]; + match e.size { + 1 => { exec.write_data::<1>(e.addr, e.old_val); } + 2 => { exec.write_data::<2>(e.addr, e.old_val); } + 4 => { exec.write_data::<4>(e.addr, e.old_val); } + 8 => { exec.write_data::<8>(e.addr, e.old_val); } + _ => {} + } + } + ctx.write_log_len = 0; snap.restore(exec); rollbacks += 1; @@ -345,14 +395,18 @@ pub fn run_jit_dispatch( if block.exception_count >= tier_cfg.demote { if let Some(lower) = block.tier.demote() { + let old_tier = block.tier; demotions += 1; eprintln!("JIT: demote {:016x} {:?}→{:?} ({}exc)", - pc, block.tier, lower, block.exception_count); - recompile_block_at_tier( - &mut compiler, &mut cache, exec, - phys_pc, pc, lower, - &mut blocks_compiled, - ); + pc, old_tier, lower, block.exception_count); + cache.invalidate_range(phys_pc, phys_pc + 4); + let instrs = trace_block(exec, pc, lower); + if !instrs.is_empty() { + async_comp.submit(CompileRequest { + instrs, block_pc: pc, phys_pc, + tier: lower, kind: CompileKind::Recompile, + }); + } } else { block.speculative = false; } @@ -388,8 +442,31 @@ pub fn run_jit_dispatch( let jit_hi = exec.core.hi; let jit_lo = exec.core.lo; + let pre_gpr = snap.gpr; + let pre_hi = snap.hi; + let pre_lo = snap.lo; + + let block_has_likely = cache.lookup(phys_pc, pc) + .map(|b| b.has_branch_likely).unwrap_or(false); + + // For branch-likely-not-taken, the delay slot is + // nullified: the JIT counts it in block_len but the + // interpreter's EXEC_BRANCH_LIKELY_SKIP handles it + // in one step. Detect via JIT PC + 4 == expected + // not-taken PC (block_pc + (block_len-1)*4 + 8). + let likely_not_taken = block_has_likely + && block_len >= 2 + && jit_pc == pc.wrapping_add((block_len as u64 - 2) * 4 + 8); + let verify_steps = if likely_not_taken { + block_len - 1 + } else { + block_len + }; + snap.restore(exec); - for _ in 0..block_len { + let pre_epc = exec.core.cp0_epc; + let pre_cause = exec.core.cp0_cause; + for _ in 0..verify_steps { exec.step(); } @@ -397,6 +474,8 @@ pub fn run_jit_dispatch( let interp_pc = exec.core.pc; let interp_hi = exec.core.hi; let interp_lo = exec.core.lo; + let interp_took_exception = exec.core.cp0_epc != pre_epc + || (exec.core.cp0_cause & 0x7C) != (pre_cause & 0x7C); let mut mismatch = false; for i in 0..32 { @@ -423,32 +502,129 @@ pub fn run_jit_dispatch( } if mismatch { - // Check if this is a timing false positive: - // interpreter took an exception (PC in exception vectors) - // while JIT didn't. This happens because the interpreter - // re-run occurs at a different wall-clock time and sees - // different external interrupt state via the atomic. + // Interpreter took an exception (TLB miss, + // bus error, etc.) during verify re-run that + // the JIT didn't see — state is post-exception, + // not comparable. This catches kernel-handled + // exceptions (IRIX installs its own handlers + // past the low PROM vectors). + if interp_took_exception { + total_jit_instrs += block_len as u64; + continue; + } + + // Check for MFC0 Count/Random timing false + // positive: the JIT reads cp0_count/random from + // the executor without advancing them per + // instruction, but the interpreter's verify + // re-run advances them via step(). Scan the + // block for MFC0/DMFC0 of reg 1 (Random) or + // reg 9 (Count); the destination GPRs will + // legitimately differ between JIT and interp. 
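The `(block_len - 2) * 4 + 8` test above is plain address arithmetic: the branch-likely sits at index `block_len - 2`, its nullified delay slot at `block_len - 1`, and the not-taken continuation is the instruction after the slot. A small check of that arithmetic with made-up values:

```rust
fn main() {
    // A block of `n` MIPS instructions ending in a branch-likely:
    //   index n-2: BEQL/BNEL (the branch)
    //   index n-1: its delay slot (nullified when the branch is not taken)
    let block_pc: u64 = 0x0040_1000; // illustrative
    let n: u64 = 6;

    let branch_pc = block_pc + (n - 2) * 4;
    // Not-taken continuation skips the nullified delay slot: branch + 8.
    let not_taken_pc = branch_pc + 8;

    // Same value the verify path computes directly from the block length.
    assert_eq!(not_taken_pc, block_pc + (n - 2) * 4 + 8);
    assert_eq!(not_taken_pc, block_pc + n * 4);

    // The interpreter re-run needs one fewer step in this case, because its
    // branch-likely handling consumes the nullified slot in the same step.
    let jit_steps = n;
    let interp_steps = jit_steps - 1;
    println!("verify with {} interpreter steps instead of {}", interp_steps, jit_steps);
}
```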
+ let mut timing_gprs: u32 = 0; + { + let raw_instrs = trace_block(exec, pc, block_tier); + for (_, d) in raw_instrs.iter() { + if d.op as u32 == crate::mips_isa::OP_COP0 { + let sub = d.rs as u32; + let rd = d.rd as u32; + if (sub == crate::mips_isa::RS_MFC0 + || sub == crate::mips_isa::RS_DMFC0) + && (rd == 1 || rd == 9) + { + timing_gprs |= 1u32 << (d.rt as u32); + } + } + } + } + // Check if all non-timing-GPR mismatches are + // explained by the timing-sensitive GPRs. + let mut only_timing = jit_pc == interp_pc + && jit_hi == interp_hi && jit_lo == interp_lo; + if only_timing { + for i in 0..32 { + if jit_gpr[i] != interp_gpr[i] + && (timing_gprs >> i) & 1 == 0 + { + only_timing = false; + break; + } + } + } + if only_timing { + total_jit_instrs += block_len as u64; + continue; + } + let interp_pc32 = interp_pc as u32; let interp_in_exc = (interp_pc32 >= 0x80000000 && interp_pc32 < 0x80000400) - || interp_pc32 == 0x80000180; // general exception vector + || interp_pc32 == 0x80000180; let jit_pc32 = jit_pc as u32; let jit_not_exc = jit_pc32 < 0x80000000 || jit_pc32 >= 0x80000400; if interp_in_exc && jit_not_exc { - // Timing false positive — interpreter took an interrupt - // the JIT didn't see. Don't invalidate the block. - // Use the interpreter's result (it's authoritative). eprintln!("JIT VERIFY: timing false positive at {:016x} (interp took exception to {:016x}), keeping block", pc, interp_pc); } else { - // Real codegen mismatch — dump and invalidate + // Real codegen mismatch — full diagnostic dump + eprintln!("═══ JIT VERIFY: REAL CODEGEN MISMATCH ═══"); + eprintln!("Block PC: {:016x} phys: {:016x} tier: {:?} len: {}", pc, phys_pc, block_tier, block_len); + + // Pre-state + eprintln!("── Pre-state (input GPRs) ──"); + for i in (0..32).step_by(4) { + eprintln!(" r{:02}={:016x} r{:02}={:016x} r{:02}={:016x} r{:02}={:016x}", + i, pre_gpr[i], i+1, pre_gpr[i+1], i+2, pre_gpr[i+2], i+3, pre_gpr[i+3]); + } + eprintln!(" hi={:016x} lo={:016x}", pre_hi, pre_lo); + + // MIPS instructions let instrs = trace_block(exec, pc, block_tier); - eprintln!("JIT VERIFY: block at {:016x} ({} instrs):", pc, instrs.len()); + eprintln!("── MIPS instructions ({}) ──", instrs.len()); for (idx, (raw, d)) in instrs.iter().enumerate() { let ipc = pc.wrapping_add(idx as u64 * 4); - eprintln!(" {:016x}: {:08x} op={} rs={} rt={} rd={} funct={} imm={:04x}", - ipc, raw, d.op, d.rs, d.rt, d.rd, d.funct, d.imm as u16); + eprintln!(" {:016x}: {:08x} op={:#04x} rs={} rt={} rd={} sa={} funct={:#04x} imm={:#06x}", + ipc, raw, d.op, d.rs, d.rt, d.rd, d.sa, d.funct, d.imm as u16); + } + + // Post-state comparison + eprintln!("── Post-state (JIT vs Interpreter) ──"); + for i in 0..32 { + if jit_gpr[i] != interp_gpr[i] { + eprintln!(" r{:02}: jit={:016x} interp={:016x} pre={:016x} *** MISMATCH", + i, jit_gpr[i], interp_gpr[i], pre_gpr[i]); + } + } + if jit_pc != interp_pc { + eprintln!(" pc: jit={:016x} interp={:016x}", jit_pc, interp_pc); + } + if jit_hi != interp_hi { + eprintln!(" hi: jit={:016x} interp={:016x} pre={:016x}", jit_hi, interp_hi, pre_hi); + } + if jit_lo != interp_lo { + eprintln!(" lo: jit={:016x} interp={:016x} pre={:016x}", jit_lo, interp_lo, pre_lo); + } + + // CLIF IR (if captured) + if let Some(block) = cache.lookup(phys_pc, pc) { + if let Some(ref ir) = block.clif_ir { + eprintln!("── Cranelift CLIF IR ──"); + eprintln!("{}", ir); + } + // Native code hex dump + if block.len_native > 0 { + eprintln!("── Native code ({} bytes) ──", block.len_native); + let code = unsafe { + 
std::slice::from_raw_parts(block.entry, block.len_native as usize) + }; + for chunk in code.chunks(16) { + let hex: Vec = chunk.iter().map(|b| format!("{:02x}", b)).collect(); + eprintln!(" {}", hex.join(" ")); + } + } } + eprintln!("═══ END MISMATCH DUMP ═══"); + cache.invalidate_range(phys_pc, phys_pc + 4); } total_jit_instrs += block_len as u64; @@ -503,11 +679,13 @@ pub fn run_jit_dispatch( promotions += 1; eprintln!("JIT: promote {:016x} {:?}→{:?} ({}hits)", pc, block.tier, next, block.hit_count); - recompile_block_at_tier( - &mut compiler, &mut cache, exec, - phys_pc, pc, next, - &mut blocks_compiled, - ); + let instrs = trace_block(exec, pc, next); + if !instrs.is_empty() { + async_comp.submit(CompileRequest { + instrs, block_pc: pc, phys_pc, + tier: next, kind: CompileKind::Recompile, + }); + } } } } @@ -557,19 +735,13 @@ pub fn run_jit_dispatch( match cache.lookup(next_phys, next_pc) { Some(b) => (b.entry, b.len_mips, b.speculative), None => { - // Compile on miss at max_tier (not Alu). - // The main path always starts at Alu, but - // that fails if the first instruction is - // a load/store — leaving these PCs forever - // uncached. Compile at max_tier directly - // since Loads/Full tiers are proven stable. - let instrs = trace_block(exec, next_pc, max_tier); - if !instrs.is_empty() { - if let Some(mut block) = compiler.compile_block(&instrs, next_pc, max_tier) { - block.phys_addr = next_phys; - cache.insert(next_phys, next_pc, block); - blocks_compiled += 1; - probe.set_cache_size(cache.len() as u32); + if !async_comp.pending.contains(&(next_phys, next_pc)) { + let instrs = trace_block(exec, next_pc, max_tier); + if !instrs.is_empty() { + async_comp.submit(CompileRequest { + instrs, block_pc: next_pc, phys_pc: next_phys, + tier: max_tier, kind: CompileKind::New, + }); } } chain_break_miss += 1; @@ -613,14 +785,18 @@ pub fn run_jit_dispatch( blk.stable_hits = 0; if blk.exception_count >= tier_cfg.demote { if let Some(lower) = blk.tier.demote() { + let old_tier = blk.tier; demotions += 1; eprintln!("JIT: demote {:016x} {:?}→{:?} ({}exc)", - next_pc, blk.tier, lower, blk.exception_count); - recompile_block_at_tier( - &mut compiler, &mut cache, exec, - next_phys, next_pc, lower, - &mut blocks_compiled, - ); + next_pc, old_tier, lower, blk.exception_count); + cache.invalidate_range(next_phys, next_phys + 4); + let instrs = trace_block(exec, next_pc, lower); + if !instrs.is_empty() { + async_comp.submit(CompileRequest { + instrs, block_pc: next_pc, phys_pc: next_phys, + tier: lower, kind: CompileKind::Recompile, + }); + } } else { blk.speculative = false; } @@ -681,19 +857,14 @@ pub fn run_jit_dispatch( } } else { probe.record_miss(); - // Cache miss — compile at Alu tier - let exec = unsafe { &mut *exec_ptr }; - let instrs = trace_block(exec, pc, BlockTier::Alu); - if !instrs.is_empty() { - if let Some(mut block) = compiler.compile_block(&instrs, pc, BlockTier::Alu) { - block.phys_addr = phys_pc; - cache.insert(phys_pc, pc, block); - blocks_compiled += 1; - probe.set_cache_size(cache.len() as u32); - if blocks_compiled <= 10 || blocks_compiled % 500 == 0 { - eprintln!("JIT: compiled #{} at {:016x} ({} instrs, tier=Alu, cache={})", - blocks_compiled, pc, instrs.len(), cache.len()); - } + if !async_comp.pending.contains(&(phys_pc, pc)) { + let exec = unsafe { &mut *exec_ptr }; + let instrs = trace_block(exec, pc, BlockTier::Alu); + if !instrs.is_empty() { + async_comp.submit(CompileRequest { + instrs, block_pc: pc, phys_pc, + tier: BlockTier::Alu, kind: CompileKind::New, + }); } } } @@ 
-706,15 +877,27 @@ pub fn run_jit_dispatch( if profile_replay_active { if let Some(entry) = profile_queue.pop_front() { let exec = unsafe { &mut *exec_ptr }; - replay_one_profile_entry( - &entry, &mut compiler, &mut cache, exec, - &mut blocks_compiled, &mut profile_replayed, - &mut profile_stale, - ); - probe.set_cache_size(cache.len() as u32); - if profile_replayed > 0 && profile_replayed % 1000 == 0 { - eprintln!("JIT profile: replayed {}/{} ({} stale)", - profile_replayed, profile_total, profile_stale); + let phys = translate_pc(exec, entry.virt_pc); + if let Some(phys_pc) = phys { + if !cache.contains(phys_pc, entry.virt_pc) + && !async_comp.pending.contains(&(phys_pc, entry.virt_pc)) + { + let instrs = trace_block(exec, entry.virt_pc, entry.tier); + if !instrs.is_empty() { + let content_hash = super::compiler::hash_block_instrs(&instrs); + if instrs.len() as u32 == entry.len_mips + && content_hash == entry.content_hash + { + async_comp.submit(CompileRequest { + instrs, block_pc: entry.virt_pc, phys_pc, + tier: entry.tier, + kind: CompileKind::ProfileReplay { content_hash }, + }); + } else { + profile_stale += 1; + } + } + } } if profile_queue.is_empty() { eprintln!("JIT profile: replay complete, {} compiled / {} stale", @@ -767,6 +950,23 @@ pub fn run_jit_dispatch( } } + // Shut down background compiler and drain remaining results + async_comp.shutdown(); + while let Some(result) = async_comp.try_recv() { + match result.kind { + CompileKind::New | CompileKind::ProfileReplay { .. } => { + if !cache.contains(result.phys_pc, result.virt_pc) { + cache.insert(result.phys_pc, result.virt_pc, result.block); + blocks_compiled += 1; + } + } + CompileKind::Recompile => { + cache.replace(result.phys_pc, result.virt_pc, result.block); + blocks_compiled += 1; + } + } + } + { let exec = unsafe { &mut *exec_ptr }; exec.flush_cycles(); @@ -818,83 +1018,6 @@ pub fn run_jit_dispatch( /// /// Silently discards entries that can't be validated (unmapped pages, /// different code at the saved VA, already-cached blocks). -fn replay_one_profile_entry( - entry: &ProfileEntry, - compiler: &mut BlockCompiler, - cache: &mut CodeCache, - exec: &mut MipsExecutor, - blocks_compiled: &mut u64, - profile_replayed: &mut u64, - profile_stale: &mut u64, -) { - // Re-derive phys_pc — saved phys_pc is for diagnostics only. TLB state - // differs between sessions, so the same virt_pc may map elsewhere now. - let phys_pc = match translate_pc(exec, entry.virt_pc) { - Some(p) => p, - None => { *profile_stale += 1; return; } // page not mapped this session - }; - - // Skip if a block already exists at this (phys_pc, virt_pc). This can - // happen if normal compilation beat us to it, or a prior replay already - // processed this entry (defensive). - if cache.contains(phys_pc, entry.virt_pc) { - return; - } - - let instrs = trace_block(exec, entry.virt_pc, entry.tier); - if instrs.is_empty() { - *profile_stale += 1; - return; - } - - // Cheap length check first, then definitive hash check. Either mismatch - // means the code at this VA is different from what we saw last session. - if instrs.len() as u32 != entry.len_mips { - *profile_stale += 1; - return; - } - let content_hash = super::compiler::hash_block_instrs(&instrs); - if content_hash != entry.content_hash { - *profile_stale += 1; - return; - } - - if let Some(mut block) = compiler.compile_block(&instrs, entry.virt_pc, entry.tier) { - block.phys_addr = phys_pc; - // Zero all counters — no penalty baggage from prior session. 
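The replay guard above reduces to a pure staleness predicate: an entry is resubmitted only when the code currently at its virtual address still has the saved length and content hash. An illustrative mirror of that check; field names follow `ProfileEntry` as used in this diff, the helper itself is not from the tree.

```rust
struct ProfileEntry { len_mips: u32, content_hash: u32 }

// Cheap length check first, then the definitive hash comparison.
fn is_stale(entry: &ProfileEntry, traced_len: u32, traced_hash: u32) -> bool {
    traced_len != entry.len_mips || traced_hash != entry.content_hash
}

fn main() {
    let saved = ProfileEntry { len_mips: 12, content_hash: 0xdead_beef };
    assert!(!is_stale(&saved, 12, 0xdead_beef)); // same code: replay it
    assert!(is_stale(&saved, 12, 0x1234_5678));  // same length, different DSO: stale
    assert!(is_stale(&saved, 9, 0xdead_beef));   // different block shape: stale
    println!("staleness checks behave as described");
}
```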
- // speculative is left as compile_block set it: Full-tier is NOT - // speculative because rollback can't un-do stores (memory diverges - // from CPU state). Alu/Loads tiers are speculative and will re-prove - // stability via the normal snapshot/rollback path this session. - block.hit_count = 0; - block.stable_hits = 0; - block.exception_count = 0; - cache.insert(phys_pc, entry.virt_pc, block); - *blocks_compiled += 1; - *profile_replayed += 1; - } -} - -/// Recompile a block at a different tier, replacing the existing cache entry. -fn recompile_block_at_tier( - compiler: &mut BlockCompiler, - cache: &mut CodeCache, - exec: &mut MipsExecutor, - phys_pc: u64, - virt_pc: u64, - tier: BlockTier, - blocks_compiled: &mut u64, -) { - let instrs = trace_block(exec, virt_pc, tier); - if !instrs.is_empty() { - if let Some(mut block) = compiler.compile_block(&instrs, virt_pc, tier) { - block.phys_addr = phys_pc; - cache.replace(phys_pc, virt_pc, block); - *blocks_compiled += 1; - } - } -} - fn interpreter_loop( exec: &mut MipsExecutor, running: &AtomicBool, @@ -966,7 +1089,7 @@ fn trace_block( // code (confirmed by IRIS_JIT_VERIFY catching real GPR mismatches). The // safe ceiling was empirically determined: aarch64 tolerates 3, x86_64 // only 1. Bumping past this threshold produces silent miscompilations. - let max_helpers: u32 = if cfg!(target_arch = "aarch64") { 3 } else { 1 }; + let max_helpers: u32 = MAX_BLOCK_LEN as u32; let mut helper_count: u32 = 0; for _ in 0..max_len { @@ -986,11 +1109,12 @@ fn trace_block( let is_branch = is_branch_or_jump(&d); - // Full-tier: terminate BEFORE stores. Store-containing blocks must be - // non-speculative (can't rollback memory), which disables the - // self-healing safety net (rollback + demotion on codegen error). - // By excluding stores, all Full-tier blocks stay load-only → speculative - // → self-healing. Stores go to interpreter, where they're always correct. + // Full-tier: terminate BEFORE stores. The write log approach works for + // RAM but fails for MMIO: pre-reads of device registers have side + // effects (e.g., clear-on-read status bits), and replay writes to + // devices (DMA control, audio FIFOs) corrupt device state. Proper + // MMIO-aware speculation would require tracking which physical ranges + // are RAM vs MMIO and disabling speculation when MMIO is touched. if tier == BlockTier::Full && is_compilable_store(&d) && !jit_no_stores() { record_termination(&d, tier); break; diff --git a/src/jit/helpers.rs b/src/jit/helpers.rs index 303badd..22cd53d 100644 --- a/src/jit/helpers.rs +++ b/src/jit/helpers.rs @@ -5,11 +5,40 @@ //! exec_ptr derives from a &mut in the dispatch loop and apply noalias //! optimizations that cause stale reads. -use super::context::{JitContext, EXIT_EXCEPTION}; +use super::context::{JitContext, EXIT_EXCEPTION, WRITE_LOG_CAP, WriteLogEntry}; use crate::mips_exec::{MipsExecutor, EXEC_COMPLETE}; use crate::mips_tlb::Tlb; use crate::mips_cache_v2::MipsCache; +/// Pre-read + log old value for speculative rollback. Returns false on read +/// failure (exception) or log full — caller should skip the write in both cases. +#[inline(always)] +fn log_pre_store( + ctx: &mut JitContext, + exec: &mut MipsExecutor, + virt_addr: u64, +) -> Result<(), u32> { + if (ctx.write_log_len as usize) >= WRITE_LOG_CAP { + // Log full — treat as non-recoverable; caller must mark block non-speculative + // at compile time based on store count. Belt-and-suspenders: refuse the write. 
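The log-full comment above, together with the trace_block MMIO comment in this diff, points at a compile-time speculation policy: a block can stay speculative only if all of its stores fit in the write log and none can hit a device register. A hypothetical policy function of that shape; `WRITE_LOG_CAP` mirrors context.rs, everything else is invented for illustration.

```rust
// Mirrors context.rs; the rest of this sketch is a hypothetical policy,
// not code from the tree.
const WRITE_LOG_CAP: usize = 128;

struct BlockFacts {
    store_count: usize,
    touches_mmio: bool, // would require a RAM/MMIO range classifier to compute
}

/// A block may run speculatively (snapshot + write-log rollback) only if every
/// store it can perform fits in the write log and none of them can reach a
/// device register, where pre-reads and replayed writes have side effects.
fn may_speculate(b: &BlockFacts) -> bool {
    b.store_count <= WRITE_LOG_CAP && !b.touches_mmio
}

fn main() {
    assert!(may_speculate(&BlockFacts { store_count: 3, touches_mmio: false }));
    assert!(!may_speculate(&BlockFacts { store_count: 200, touches_mmio: false }));
    assert!(!may_speculate(&BlockFacts { store_count: 1, touches_mmio: true }));
    println!("speculation policy behaves as described");
}
```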
+ return Err(0); + } + match exec.read_data::(virt_addr) { + Ok(old_val) => { + let idx = ctx.write_log_len as usize; + ctx.write_log[idx] = WriteLogEntry { + addr: virt_addr, + old_val, + size: SIZE as u8, + _pad: [0; 7], + }; + ctx.write_log_len += 1; + Ok(()) + } + Err(status) => Err(status), + } +} + /// Opaque cast that defeats LLVM's alias analysis and pointer provenance tracking. /// `#[inline(never)]` ensures LLVM can't see through this to recover provenance. #[inline(never)] @@ -75,6 +104,9 @@ pub extern "C" fn jit_write_u8( ) -> u64 { let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; + if let Err(s) = log_pre_store::<1, T, C>(ctx, exec, virt_addr) { + ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = s; return 0; + } let status = exec.write_data::<1>(virt_addr, value); if status != EXEC_COMPLETE { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; } 0 @@ -85,6 +117,9 @@ pub extern "C" fn jit_write_u16( ) -> u64 { let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; + if let Err(s) = log_pre_store::<2, T, C>(ctx, exec, virt_addr) { + ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = s; return 0; + } let status = exec.write_data::<2>(virt_addr, value); if status != EXEC_COMPLETE { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; } 0 @@ -95,6 +130,9 @@ pub extern "C" fn jit_write_u32( ) -> u64 { let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; + if let Err(s) = log_pre_store::<4, T, C>(ctx, exec, virt_addr) { + ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = s; return 0; + } let status = exec.write_data::<4>(virt_addr, value); if status != EXEC_COMPLETE { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; } 0 @@ -105,6 +143,9 @@ pub extern "C" fn jit_write_u64( ) -> u64 { let exec = unsafe { &mut *opaque_exec::(exec_ptr) }; let ctx = unsafe { &mut *opaque_ctx(ctx_ptr) }; + if let Err(s) = log_pre_store::<8, T, C>(ctx, exec, virt_addr) { + ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = s; return 0; + } let status = exec.write_data::<8>(virt_addr, value); if status != EXEC_COMPLETE { ctx.exit_reason = EXIT_EXCEPTION; ctx.exception_status = status; } 0 @@ -190,6 +231,23 @@ pub struct HelperPtrs { pub dmtc0: *const u8, } +// These are stable monomorphized function pointers, safe to send across threads. +unsafe impl Send for HelperPtrs {} + +impl Clone for HelperPtrs { + fn clone(&self) -> Self { + Self { + read_u8: self.read_u8, read_u16: self.read_u16, + read_u32: self.read_u32, read_u64: self.read_u64, + write_u8: self.write_u8, write_u16: self.write_u16, + write_u32: self.write_u32, write_u64: self.write_u64, + interp_step: self.interp_step, + mfc0: self.mfc0, dmfc0: self.dmfc0, + mtc0: self.mtc0, dmtc0: self.dmtc0, + } + } +} + impl HelperPtrs { pub fn new() -> Self { Self { diff --git a/src/jit/mod.rs b/src/jit/mod.rs index 15b7645..cb1c16f 100644 --- a/src/jit/mod.rs +++ b/src/jit/mod.rs @@ -12,6 +12,7 @@ pub mod profile; pub mod snapshot; pub mod trace; pub mod codegen_test; +pub mod async_compiler; pub use context::JitContext; pub use cache::{CodeCache, CompiledBlock}; diff --git a/src/mips_exec.rs b/src/mips_exec.rs index 9e367bf..1580e53 100644 --- a/src/mips_exec.rs +++ b/src/mips_exec.rs @@ -931,6 +931,50 @@ For R4000SC/MC CPUs: result } + /// Lightweight step for JIT interpreter bursts. 
+    /// Skips cp0_count advancement and local_cycles — the JIT dispatch loop does those
+    /// in bulk after the burst. Keeps interrupt checking because the
+    /// kernel depends on per-instruction interrupt delivery.
+    #[inline(always)]
+    pub fn step_lite(&mut self) -> ExecStatus {
+        let pending = unsafe { &*self.interrupts_ptr }.load(Ordering::Relaxed);
+
+        let pc = self.core.pc;
+
+        if (pending | self.core.cp0_cause as u64) != 0 {
+            if pending & SOFT_RESET_BIT != 0 {
+                self.core.reset(true);
+                self.in_delay_slot = false;
+                self.delay_slot_target = 0;
+                return EXEC_COMPLETE;
+            }
+            self.core.cp0_cause = (self.core.cp0_cause & !EXT_INT_MASK) | (pending as u32 & EXT_INT_MASK);
+            if self.core.interrupts_enabled() {
+                let ip = self.core.cp0_cause & crate::mips_core::CAUSE_IP_MASK;
+                let im = self.core.cp0_status & crate::mips_core::STATUS_IM_MASK;
+                if (ip & im) != 0 {
+                    let s = exec_exception(EXC_INT);
+                    return self.handle_exception(s);
+                }
+            }
+        }
+
+        let fetch = self.fetch_instr(pc);
+        if fetch.status != EXEC_COMPLETE {
+            return if fetch.status & EXEC_IS_EXCEPTION != 0 {
+                self.handle_exception(fetch.status)
+            } else {
+                fetch.status
+            };
+        }
+        let slot = fetch.instr as *mut DecodedInstr;
+        let d = unsafe { &mut *slot };
+        if !d.decoded {
+            decode_into::(d);
+        }
+        self.exec_decoded(unsafe { &*slot })
+    }
+
     #[inline(always)]
     fn check_breakpoint(&mut self, addr: u64) -> bool {
         if KIND == BpType::Pc as u8 {
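`step_lite` keeps the per-instruction interrupt gate, which boils down to: take an interrupt only when a pending Cause.IP bit is unmasked in Status.IM and interrupts are globally enabled. A standalone sketch with the standard MIPS bit positions; the constants below are assumptions standing in for the real ones in `mips_core`.

```rust
// Standard MIPS coprocessor-0 bit positions; the real constants live in
// mips_core and may differ in naming.
const CAUSE_IP_MASK: u32 = 0x0000_ff00;  // Cause.IP7..IP0
const STATUS_IM_MASK: u32 = 0x0000_ff00; // Status.IM7..IM0
const STATUS_IE: u32 = 1 << 0;
const STATUS_EXL: u32 = 1 << 1;
const STATUS_ERL: u32 = 1 << 2;

fn interrupt_pending(cp0_cause: u32, cp0_status: u32) -> bool {
    // Globally enabled only when IE=1 and we are not already in EXL/ERL.
    let enabled = cp0_status & STATUS_IE != 0
        && cp0_status & (STATUS_EXL | STATUS_ERL) == 0;
    let ip = cp0_cause & CAUSE_IP_MASK;
    let im = cp0_status & STATUS_IM_MASK;
    enabled && (ip & im) != 0
}

fn main() {
    let status = STATUS_IE | (1 << 15);                         // IE set, IM7 unmasked
    assert!(interrupt_pending(1 << 15, status));                // IP7 pending: take it
    assert!(!interrupt_pending(1 << 10, status));               // IP2 pending but masked
    assert!(!interrupt_pending(1 << 15, status | STATUS_EXL));  // already in exception
    println!("timer interrupt would be delivered");
}
```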