From 2c10ea5e28df6a790cd276ed4673e66a16a2b128 Mon Sep 17 00:00:00 2001
From: NthTensor <nth.tensor@gmail.com>
Date: Sun, 25 Jan 2026 13:15:00 -0500
Subject: [PATCH 1/3] feat: instruction counters and lazy scheduling

---
 CHANGELOG.md          |   18 +
 Cargo.lock            |  165 ++++++-
 Cargo.toml            |    4 +-
 benches/bevy_tasks.rs |   18 +-
 benches/flat_scope.rs |    6 +-
 benches/flood_fill.rs |   15 +-
 benches/fork_join.rs  |   57 ++-
 src/compile_fail.rs   |  313 +++++++------
 src/job.rs            |  507 +++++++++++++++++----
 src/latch.rs          |  181 ++++----
 src/lib.rs            |   57 +--
 src/scope.rs          |  272 +++++++----
 src/thread_pool.rs    | 1000 ++++++++++++++++++++++-------------------
 src/util.rs           |    9 +-
 tests/shuttle.rs      |  145 +-----
 15 files changed, 1714 insertions(+), 1053 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 03e89e3..3f7ea85 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,4 @@
+
 # Changelog
 
 All notable changes to this project will be documented in this file.
@@ -13,6 +14,23 @@ This project is currently in early [pre-release], and there may be arbitrary bre
 
 ## [Unreleased]
 
+### Added
+
+- `ThreadPool::num_workers` method which returns the current number of workers.
+- `ThreadPool::on_worker` variant of `with_worker` for `Send` closures.
+- `ThreadPool::expect_worker` variant of `with_worker` that panics.
+
+### Changed
+- Work sharing has been rewritten to improve performance.
+- Thread pools can now have a max of 32 workers at a time.
+- `spawn`, `Scope::spawn`, and `Worker::spawn` now accept closures and futures.
+- `ThreadPool::with_worker` now provides `Option<&Worker>` instead of `&Worker`.
+- `claim_lease` now returns `Option<Lease>` instead of `Lease`.
+- `Scope` now has two lifetimes instead of one, and is more flexible.
+
+### Removed
+- All versions of `spawn_future` and `spawn_async`; just use `spawn` instead.
+
 ## [1.0.0-alpha.4]
 
 ### Added
diff --git a/Cargo.lock b/Cargo.lock
index 4274d5d..75b2950 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -55,12 +55,6 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "arraydeque"
-version = "0.5.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7d902e3d592a523def97af8f317b08ce16b7ab854c1985a0c671e6f15cebc236"
-
 [[package]]
 name = "assoc"
 version = "0.1.3"
@@ -496,7 +490,6 @@ checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
 name = "forte"
 version = "1.0.0-dev"
 dependencies = [
- "arraydeque",
  "async-task",
  "atomic-wait",
  "bevy_tasks",
@@ -508,6 +501,8 @@ dependencies = [
  "divan",
  "rayon",
  "shuttle",
+ "st3",
+ "tick_counter",
  "tracing",
  "tracing-subscriber",
 ]
@@ -543,6 +538,19 @@ dependencies = [
  "pin-project-lite",
 ]
 
+[[package]]
+name = "generator"
+version = "0.7.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5cc16584ff22b460a382b7feec54b23d2908d858152e5739a120b949293bd74e"
+dependencies = [
+ "cc",
+ "libc",
+ "log",
+ "rustversion",
+ "windows 0.48.0",
+]
+
 [[package]]
 name = "generator"
 version = "0.8.5"
@@ -554,7 +562,7 @@ dependencies = [
  "libc",
  "log",
  "rustversion",
- "windows",
+ "windows 0.61.3",
 ]
 
 [[package]]
@@ -694,6 +702,28 @@ version = "0.4.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"
 
+[[package]]
+name = "loom"
+version = "0.5.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff50ecb28bb86013e935fb6683ab1f6d3a20016f123c76fd4c27470076ac30f5"
+dependencies = [
+ "cfg-if",
+ "generator 0.7.5",
+ "scoped-tls",
+ "tracing",
+ "tracing-subscriber",
+]
+
+[[package]]
+name = "matchers"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558"
+dependencies = [
+ "regex-automata 0.1.10",
+]
+
 [[package]]
 name = "memchr"
 version = "2.7.5"
@@ -920,8 +950,17 @@ checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
 dependencies = [
  "aho-corasick",
  "memchr",
- "regex-automata",
- "regex-syntax",
+ "regex-automata 0.4.9",
+ "regex-syntax 0.8.5",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
+dependencies = [
+ "regex-syntax 0.6.29",
 ]
 
 [[package]]
@@ -932,7 +971,7 @@ checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
 dependencies = [
  "aho-corasick",
  "memchr",
- "regex-syntax",
+ "regex-syntax 0.8.5",
 ]
 
 [[package]]
@@ -941,6 +980,12 @@ version = "0.1.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a"
 
+[[package]]
+name = "regex-syntax"
+version = "0.6.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
+
 [[package]]
 name = "regex-syntax"
 version = "0.8.5"
@@ -1055,7 +1100,7 @@ dependencies = [
  "assoc",
  "bitvec",
  "cfg-if",
- "generator",
+ "generator 0.8.5",
  "hex",
  "owo-colors",
  "rand",
@@ -1087,6 +1132,16 @@ dependencies = [
  "portable-atomic",
 ]
 
+[[package]]
+name = "st3"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c5a1d2cec4c9904d238075cb3a212615e67aee6acce849e4e565acf2320a7bf1"
+dependencies = [
+ "crossbeam-utils",
+ "loom",
+]
+
 [[package]]
 name = "stable_deref_trait"
 version = "1.2.0"
@@ -1129,6 +1184,12 @@ dependencies = [
  "cfg-if",
 ]
 
+[[package]]
+name = "tick_counter"
+version = "0.4.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37f1310986d0aa940019cbb2b480161c60a614dba076cbb20e82bfbc236bbabd"
+
 [[package]]
 name = "tinytemplate"
 version = "1.2.1"
@@ -1188,10 +1249,14 @@ version = "0.3.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008"
 dependencies = [
+ "matchers",
  "nu-ansi-term",
+ "once_cell",
+ "regex",
  "sharded-slab",
  "smallvec",
  "thread_local",
+ "tracing",
  "tracing-core",
  "tracing-log",
 ]
@@ -1323,6 +1388,15 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
 
+[[package]]
+name = "windows"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f"
+dependencies = [
+ "windows-targets 0.48.5",
+]
+
 [[package]]
 name = "windows"
 version = "0.61.3"
@@ -1371,9 +1445,9 @@ dependencies = [
 
 [[package]]
 name = "windows-implement"
-version = "0.60.0"
+version = "0.60.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836"
+checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -1382,9 +1456,9 @@ dependencies = [
 
 [[package]]
 name = "windows-interface"
-version = "0.59.1"
+version = "0.59.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8"
+checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -1452,7 +1526,22 @@ version = "0.59.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
 dependencies = [
- "windows-targets",
+ "windows-targets 0.52.6",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
+dependencies = [
+ "windows_aarch64_gnullvm 0.48.5",
+ "windows_aarch64_msvc 0.48.5",
+ "windows_i686_gnu 0.48.5",
+ "windows_i686_msvc 0.48.5",
+ "windows_x86_64_gnu 0.48.5",
+ "windows_x86_64_gnullvm 0.48.5",
+ "windows_x86_64_msvc 0.48.5",
 ]
 
 [[package]]
@@ -1486,6 +1575,12 @@ version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
 
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
+
 [[package]]
 name = "windows_aarch64_gnullvm"
 version = "0.52.6"
@@ -1498,6 +1593,12 @@ version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
 
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
+
 [[package]]
 name = "windows_aarch64_msvc"
 version = "0.52.6"
@@ -1510,6 +1611,12 @@ version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
 
+[[package]]
+name = "windows_i686_gnu"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
+
 [[package]]
 name = "windows_i686_gnu"
 version = "0.52.6"
@@ -1528,6 +1635,12 @@ version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
 
+[[package]]
+name = "windows_i686_msvc"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
+
 [[package]]
 name = "windows_i686_msvc"
 version = "0.52.6"
@@ -1540,6 +1653,12 @@ version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
 
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
+
 [[package]]
 name = "windows_x86_64_gnu"
 version = "0.52.6"
@@ -1552,6 +1671,12 @@ version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
 
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
+
 [[package]]
 name = "windows_x86_64_gnullvm"
 version = "0.52.6"
@@ -1564,6 +1689,12 @@ version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
 
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
+
 [[package]]
 name = "windows_x86_64_msvc"
 version = "0.52.6"
diff --git a/Cargo.toml b/Cargo.toml
index 3a5057c..d77e28f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,11 +11,13 @@ resolver = "2"
 members = ["ci"]
 
 [dependencies]
-arraydeque = "0.5.1"
 async-task = "4.7.1"
 atomic-wait = "1.1.0"
 crossbeam-queue = "0.3.12"
 crossbeam-utils = "0.8.21"
+st3 = "0.4"
+tick_counter = "0.4.5"
+
 shuttle = { version = "0.8.0", optional = true }
 tracing = { version = "0.1.41", features = ["release_max_level_off"] }
 tracing-subscriber = "0.3.19"
diff --git a/benches/bevy_tasks.rs b/benches/bevy_tasks.rs
index d3cbe33..b730f1f 100644
--- a/benches/bevy_tasks.rs
+++ b/benches/bevy_tasks.rs
@@ -40,21 +40,21 @@ mod overhead {
         for i in 0..80 {
             black_box(i);
         }
-        // std::thread::sleep(Duration::from_nanos(100));
         black_box(value);
     }
 
     #[divan::bench(args = LEN)]
-    fn serial(bencher: Bencher, len: usize) {
+    fn baseline(bencher: Bencher, len: usize) {
         let mut vec: Vec<_> = (0..len).collect();
         bencher.bench_local(|| vec.iter_mut().for_each(work));
     }
 
     #[divan::bench(args = LEN)]
     fn bevy_tasks(bencher: Bencher, len: usize) {
-        use crate::BevyParChunksMut;
         use bevy_tasks::ParallelIterator;
 
+        use crate::BevyParChunksMut;
+
         let mut vec: Vec<_> = (0..len).collect();
         let pool = bevy_tasks::TaskPoolBuilder::new()
             .thread_name("bevy_tasks".to_string())
@@ -83,18 +83,18 @@ mod overhead {
 
         let mut vec: Vec<_> = (0..len).collect();
 
-        THREAD_POOL.resize_to_available();
-
-        bencher.bench_local(|| {
-            THREAD_POOL.with_worker(|worker| {
-                forte_chunks::<8, _, _>(worker, &mut vec, &|c| {
+        THREAD_POOL.expect_worker(|worker| {
+            bencher.bench_local(|| {
+                forte_chunks::<64, _, _>(worker, &mut vec, &|c| {
                     c.iter_mut().for_each(work);
                 });
-            })
+            });
         });
     }
 }
 
 fn main() {
+    THREAD_POOL.resize_to_available();
+
     divan::main();
 }
diff --git a/benches/flat_scope.rs b/benches/flat_scope.rs
index 0aca71e..8d48ef0 100644
--- a/benches/flat_scope.rs
+++ b/benches/flat_scope.rs
@@ -1,6 +1,8 @@
 //! A benchmark for fork-join workloads adapted from `chili`.
 
-use std::hash::{DefaultHasher, Hash, Hasher};
+use std::hash::DefaultHasher;
+use std::hash::Hash;
+use std::hash::Hasher;
 
 use criterion::black_box;
 use divan::Bencher;
@@ -37,7 +39,7 @@ static COMPUTE: forte::ThreadPool = forte::ThreadPool::new();
 fn forte(bencher: Bencher, size: usize) {
     use forte::Worker;
 
-    COMPUTE.with_worker(|worker| {
+    COMPUTE.expect_worker(|worker| {
         bencher.bench_local(|| {
             worker.scope(|scope| {
                 for i in 0..size {
diff --git a/benches/flood_fill.rs b/benches/flood_fill.rs
index 5ccb631..471953d 100644
--- a/benches/flood_fill.rs
+++ b/benches/flood_fill.rs
@@ -1,7 +1,10 @@
 //! A benchmark for fork-join workloads adapted from `chili`.
 
-use std::collections::{HashSet, VecDeque};
-use std::hash::{DefaultHasher, Hash, Hasher};
+use std::collections::HashSet;
+use std::collections::VecDeque;
+use std::hash::DefaultHasher;
+use std::hash::Hash;
+use std::hash::Hasher;
 
 use criterion::black_box;
 use dashmap::DashSet;
@@ -69,7 +72,8 @@ static COMPUTE: forte::ThreadPool = forte::ThreadPool::new();
 
 #[divan::bench(args = sizes(), threads = false)]
 fn forte(bencher: Bencher, size: usize) {
-    use forte::{Scope, Worker};
+    use forte::Scope;
+    use forte::Worker;
 
     fn visit<'scope, 'env>(
         size: usize,
@@ -122,7 +126,7 @@ fn forte(bencher: Bencher, size: usize) {
         }
     }
 
-    COMPUTE.with_worker(|worker| {
+    COMPUTE.expect_worker(|worker| {
         bencher.bench_local(|| {
             let visited = DashSet::new();
 
@@ -135,7 +139,8 @@ fn forte(bencher: Bencher, size: usize) {
 
 #[divan::bench(args = sizes(), threads = false)]
 fn rayon(bencher: Bencher, size: usize) {
-    use rayon::{Scope, scope};
+    use rayon::Scope;
+    use rayon::scope;
 
     fn visit<'scope>(
         size: usize,
diff --git a/benches/fork_join.rs b/benches/fork_join.rs
index 6772b8b..2e55b24 100644
--- a/benches/fork_join.rs
+++ b/benches/fork_join.rs
@@ -86,7 +86,7 @@ fn forte(bencher: Bencher, nodes: (usize, usize)) {
 
     let tree = Node::tree(nodes.0);
 
-    COMPUTE.with_worker(|worker| {
+    COMPUTE.expect_worker(|worker| {
         info!("Staring Benchmark");
         bencher.bench_local(move || {
             assert_eq!(sum(&tree, worker), nodes.1 as u64);
@@ -94,6 +94,26 @@ fn forte(bencher: Bencher, nodes: (usize, usize)) {
     });
 }
 
+#[divan::bench(args = nodes())]
+fn throughput_forte(bencher: Bencher, nodes: (usize, usize)) {
+    fn sum(node: &Node, worker: &Worker) -> u64 {
+        let (left, right) = worker.join(
+            |w| node.left.as_deref().map(|n| sum(n, w)).unwrap_or_default(),
+            |w| node.right.as_deref().map(|n| sum(n, w)).unwrap_or_default(),
+        );
+
+        node.val + left + right
+    }
+
+    info!("Staring Benchmark");
+    bencher.bench(|| {
+        COMPUTE.expect_worker(|worker| {
+            let tree = Node::tree(nodes.0);
+            assert_eq!(sum(&tree, worker), nodes.1 as u64);
+        });
+    });
+}
+
 #[divan::bench(args = nodes())]
 fn chili(bencher: Bencher, nodes: (usize, usize)) {
     fn sum(node: &Node, scope: &mut Scope<'_>) -> u64 {
@@ -113,6 +133,24 @@ fn chili(bencher: Bencher, nodes: (usize, usize)) {
     });
 }
 
+#[divan::bench(args = nodes())]
+fn thrughput_chili(bencher: Bencher, nodes: (usize, usize)) {
+    fn sum(node: &Node, scope: &mut Scope<'_>) -> u64 {
+        let (left, right) = scope.join(
+            |s| node.left.as_deref().map(|n| sum(n, s)).unwrap_or_default(),
+            |s| node.right.as_deref().map(|n| sum(n, s)).unwrap_or_default(),
+        );
+
+        node.val + left + right
+    }
+
+    bencher.bench(move || {
+        let tree = Node::tree(nodes.0);
+        let mut scope = Scope::global();
+        assert_eq!(sum(&tree, &mut scope), nodes.1 as u64);
+    });
+}
+
 #[divan::bench(args = nodes())]
 fn rayon(bencher: Bencher, nodes: (usize, usize)) {
     fn sum(node: &Node) -> u64 {
@@ -131,6 +169,23 @@ fn rayon(bencher: Bencher, nodes: (usize, usize)) {
     });
 }
 
+#[divan::bench(args = nodes())]
+fn throughput_rayon(bencher: Bencher, nodes: (usize, usize)) {
+    fn sum(node: &Node) -> u64 {
+        let (left, right) = rayon::join(
+            || node.left.as_deref().map(sum).unwrap_or_default(),
+            || node.right.as_deref().map(sum).unwrap_or_default(),
+        );
+
+        node.val + left + right
+    }
+
+    bencher.bench(move || {
+        let tree = Node::tree(nodes.0);
+        assert_eq!(sum(&tree), nodes.1 as u64);
+    });
+}
+
 fn main() {
     let fmt_layer = fmt::layer()
         .without_time()
diff --git a/src/compile_fail.rs b/src/compile_fail.rs
index 832b171..e60ed01 100644
--- a/src/compile_fail.rs
+++ b/src/compile_fail.rs
@@ -3,188 +3,175 @@
 // -----------------------------------------------------------------------------
 // Ensures non-send data cannot be moved into a join.
 
-/** ```compile_fail,E0277
-
-use std::rc::Rc;
-use forte::ThreadPool;
-
-static THREAD_POOL: ThreadPool = ThreadPool::new();
-
-let r = Rc::new(22);
-THREAD_POOL.join(|_| r.clone(), |_| r.clone());
-//~^ ERROR
-
-``` */
+/// ```compile_fail,E0277
+/// use std::rc::Rc;
+/// use forte::ThreadPool;
+///
+/// static THREAD_POOL: ThreadPool = ThreadPool::new();
+///
+/// let r = Rc::new(22);
+/// THREAD_POOL.join(|_| r.clone(), |_| r.clone());
+/// //~^ ERROR
+/// ```
 mod nonsend_input {}
 
 // -----------------------------------------------------------------------------
 // Ensures non-send data cannot be returned by join.
 
-/** ```compile_fail,E0277
-
-use std::rc::Rc;
-use forte::ThreadPool;
-
-static THREAD_POOL: ThreadPool = ThreadPool::new();
-
-THREAD_POOL.join(|_| Rc::new(22), |_| ()); //~ ERROR
-
-THREAD_POOL.depopulate();
-
-``` */
+/// ```compile_fail,E0277
+/// use std::rc::Rc;
+/// use forte::ThreadPool;
+///
+/// static THREAD_POOL: ThreadPool = ThreadPool::new();
+///
+/// THREAD_POOL.join(|_| Rc::new(22), |_| ()); //~ ERROR
+///
+/// THREAD_POOL.depopulate();
+/// ```
 mod nonsend_left_join {}
 
-/** ```compile_fail,E0277
-
-use std::rc::Rc;
-use forte::ThreadPool;
-
-static THREAD_POOL: ThreadPool = ThreadPool::new();
-
-THREAD_POOL.join(|_| (), |_| Rc::new(23)); //~ ERROR
-
-THREAD_POOL.depopulate();
-
-``` */
+/// ```compile_fail,E0277
+/// use std::rc::Rc;
+/// use forte::ThreadPool;
+///
+/// static THREAD_POOL: ThreadPool = ThreadPool::new();
+///
+/// THREAD_POOL.join(|_| (), |_| Rc::new(23)); //~ ERROR
+///
+/// THREAD_POOL.depopulate();
+/// ```
 mod nonsend_right_join {}
 
 // -----------------------------------------------------------------------------
 // Ensures scopes can not borrow data spawned within the closure.
 
-/** ```compile_fail,E0373
-
-use forte::ThreadPool;
-use forte::Worker;
-
-static THREAD_POOL: ThreadPool = ThreadPool::new();
-
-fn bad_scope<F>(f: F)
-    where F: FnOnce(&i32) + Send,
-{
-    THREAD_POOL.scope(|scope| {
-        let x = 22;
-        scope.spawn(|_: &Worker| f(&x)); //~ ERROR `x` does not live long enough
-    });
-}
-
-fn good_scope<F>(f: F)
-    where F: FnOnce(&i32) + Send,
-{
-    let x = 22;
-    THREAD_POOL.scope(|scope| {
-        scope.spawn(|_: &Worker| f(&x));
-    });
-}
-
-fn main() { }
-
-``` */
+/// ```compile_fail,E0373
+/// use forte::ThreadPool;
+/// use forte::Worker;
+///
+/// static THREAD_POOL: ThreadPool = ThreadPool::new();
+///
+/// fn bad_scope<F>(f: F)
+/// where
+///     F: FnOnce(&i32) + Send,
+/// {
+///     THREAD_POOL.scope(|scope| {
+///         let x = 22;
+///         scope.spawn(|_: &Worker| f(&x)); //~ ERROR `x` does not live long enough
+///     });
+/// }
+///
+/// fn good_scope<F>(f: F)
+/// where
+///     F: FnOnce(&i32) + Send,
+/// {
+///     let x = 22;
+///     THREAD_POOL.scope(|scope| {
+///         scope.spawn(|_: &Worker| f(&x));
+///     });
+/// }
+///
+/// fn main() {}
+/// ```
 mod scope_join_bad {}
 
 // -----------------------------------------------------------------------------
 // Ensures the two branches of a join mutably borrow the same data.
 
-/** ```compile_fail,E0524
-
-use forte::ThreadPool;
-use forte::Worker;
-
-static THREAD_POOL: ThreadPool = ThreadPool::new();
-
-fn quick_sort<T:PartialOrd+Send>(v: &mut [T]) {
-    if v.len() <= 1 {
-        return;
-    }
-
-    let mid = partition(v);
-    let (lo, _hi) = v.split_at_mut(mid);
-    THREAD_POOL.join(|_| quick_sort(lo), |_| quick_sort(lo)); //~ ERROR
-}
-
-fn partition<T:PartialOrd+Send>(v: &mut [T]) -> usize {
-    let pivot = v.len() - 1;
-    let mut i = 0;
-    for j in 0..pivot {
-        if v[j] <= v[pivot] {
-            v.swap(i, j);
-            i += 1;
-        }
-    }
-    v.swap(i, pivot);
-    i
-}
-
-fn main() { }
-
-``` */
+/// ```compile_fail,E0524
+/// use forte::ThreadPool;
+/// use forte::Worker;
+///
+/// static THREAD_POOL: ThreadPool = ThreadPool::new();
+///
+/// fn quick_sort<T: PartialOrd + Send>(v: &mut [T]) {
+///     if v.len() <= 1 {
+///         return;
+///     }
+///
+///     let mid = partition(v);
+///     let (lo, _hi) = v.split_at_mut(mid);
+///     THREAD_POOL.join(|_| quick_sort(lo), |_| quick_sort(lo)); //~ ERROR
+/// }
+///
+/// fn partition<T: PartialOrd + Send>(v: &mut [T]) -> usize {
+///     let pivot = v.len() - 1;
+///     let mut i = 0;
+///     for j in 0..pivot {
+///         if v[j] <= v[pivot] {
+///             v.swap(i, j);
+///             i += 1;
+///         }
+///     }
+///     v.swap(i, pivot);
+///     i
+/// }
+/// fn main() {}
+/// ```
 mod quicksort_race_1 {}
 
-/** ```compile_fail,E0500
-
-use forte::ThreadPool;
-use forte::Worker;
-
-static THREAD_POOL: ThreadPool = ThreadPool::new();
-
-fn quick_sort<T:PartialOrd+Send>(v: &mut [T]) {
-    if v.len() <= 1 {
-        return;
-    }
-
-    let mid = partition(v);
-    let (lo, _hi) = v.split_at_mut(mid);
-    THREAD_POOL.join(|_| quick_sort(lo), |_| quick_sort(v)); //~ ERROR
-}
-
-fn partition<T:PartialOrd+Send>(v: &mut [T]) -> usize {
-    let pivot = v.len() - 1;
-    let mut i = 0;
-    for j in 0..pivot {
-        if v[j] <= v[pivot] {
-            v.swap(i, j);
-            i += 1;
-        }
-    }
-    v.swap(i, pivot);
-    i
-}
-
-fn main() { }
-
-``` */
+/// ```compile_fail,E0500
+/// use forte::ThreadPool;
+/// use forte::Worker;
+///
+/// static THREAD_POOL: ThreadPool = ThreadPool::new();
+///
+/// fn quick_sort<T: PartialOrd + Send>(v: &mut [T]) {
+///     if v.len() <= 1 {
+///         return;
+///     }
+///
+///     let mid = partition(v);
+///     let (lo, _hi) = v.split_at_mut(mid);
+///     THREAD_POOL.join(|_| quick_sort(lo), |_| quick_sort(v)); //~ ERROR
+/// }
+///
+/// fn partition<T: PartialOrd + Send>(v: &mut [T]) -> usize {
+///     let pivot = v.len() - 1;
+///     let mut i = 0;
+///     for j in 0..pivot {
+///         if v[j] <= v[pivot] {
+///             v.swap(i, j);
+///             i += 1;
+///         }
+///     }
+///     v.swap(i, pivot);
+///     i
+/// }
+///
+/// fn main() { }
+/// ```
 mod quicksort_race_2 {}
 
-/** ```compile_fail,E0524
-
-use forte::ThreadPool;
-use forte::Worker;
-
-static THREAD_POOL: ThreadPool = ThreadPool::new();
-
-fn quick_sort<T:PartialOrd+Send>(v: &mut [T]) {
-    if v.len() <= 1 {
-        return;
-    }
-
-    let mid = partition(v);
-    let (_lo, hi) = v.split_at_mut(mid);
-    THREAD_POOL.join(|_| quick_sort(hi), |_| quick_sort(hi)); //~ ERROR
-}
-
-fn partition<T:PartialOrd+Send>(v: &mut [T]) -> usize {
-    let pivot = v.len() - 1;
-    let mut i = 0;
-    for j in 0..pivot {
-        if v[j] <= v[pivot] {
-            v.swap(i, j);
-            i += 1;
-        }
-    }
-    v.swap(i, pivot);
-    i
-}
-
-fn main() { }
-
-``` */
+/// ```compile_fail,E0524
+/// use forte::ThreadPool;
+/// use forte::Worker;
+///
+/// static THREAD_POOL: ThreadPool = ThreadPool::new();
+///
+/// fn quick_sort<T: PartialOrd + Send>(v: &mut [T]) {
+///     if v.len() <= 1 {
+///         return;
+///     }
+///
+///     let mid = partition(v);
+///     let (_lo, hi) = v.split_at_mut(mid);
+///     THREAD_POOL.join(|_| quick_sort(hi), |_| quick_sort(hi)); //~ ERROR
+/// }
+///
+/// fn partition<T: PartialOrd + Send>(v: &mut [T]) -> usize {
+///     let pivot = v.len() - 1;
+///     let mut i = 0;
+///     for j in 0..pivot {
+///         if v[j] <= v[pivot] {
+///             v.swap(i, j);
+///             i += 1;
+///         }
+///     }
+///     v.swap(i, pivot);
+///     i
+/// }
+///
+/// fn main() {}
+/// ```
 mod quicksort_race_3 {}
diff --git a/src/job.rs b/src/job.rs
index 75e6347..38d0c30 100644
--- a/src/job.rs
+++ b/src/job.rs
@@ -12,14 +12,18 @@
 //! (c) Each job reference is executed exactly once.
 
 use alloc::boxed::Box;
-use arraydeque::ArrayDeque;
+use alloc::collections::VecDeque;
+use alloc::vec::Vec;
 use core::cell::UnsafeCell;
-use core::mem::{ManuallyDrop, MaybeUninit};
+use core::mem::ManuallyDrop;
+use core::mem::MaybeUninit;
 use core::ptr::NonNull;
-use core::sync::atomic::{Ordering, fence};
+use core::sync::atomic::Ordering;
+use core::sync::atomic::fence;
 use std::thread::Result as ThreadResult;
 
 use crate::latch::Latch;
+use crate::platform::AtomicU32;
 use crate::thread_pool::Worker;
 use crate::unwind;
 
@@ -34,7 +38,7 @@ trait Job {
     ///
     /// # Safety
     ///
-    /// Implements must specify the invariant of the pointer `this` that the
+    /// Implementors must specify the invariant of the pointer `this` that the
     /// caller is expected to uphold.
     ///
     /// This may be called from a different thread than the one which scheduled
@@ -59,7 +63,7 @@ pub struct JobRef {
     /// of `StackJob` or `HeapJob`. But it can contain other things as well.
     job_pointer: NonNull<()>,
     /// A function pointer that can execute the job stored at `job_pointer`.
-    /// This is usually point to an implementation of `Job::execute` (either
+    /// This usually points to an implementation of `Job::execute` (either
     /// `HeapJob::execute` or `StackJob::execute`). But it can contain other
     /// things as well.
     execute_fn: unsafe fn(NonNull<()>, &Worker),
@@ -70,9 +74,16 @@ impl JobRef {
     ///
     /// # Safety
     ///
-    /// The caller must ensure that `job_pointer` remains valid to pass to
-    /// `execute_fn` until the job is executed. What exactly this means is
-    /// dependent on the implementation of the execute function.
+    /// The caller must ensure that:
+    ///
+    /// * `job_pointer` and `execute_fn` are *matched*; the `execute_fn` must be
+    ///   a function that can safely receive `job_pointer` as its first argument.
+    ///
+    /// * `job_pointer` points to an initialized and properly aligned value which
+    ///   is neither moved nor dropped until `execute_fn` is called.
+    ///
+    /// * `job_pointer` is "valid" now and until `execute_fn` is called,
+    ///   according to the contract of the specific `execute_fn` being stored.
     #[inline(always)]
     pub unsafe fn new_raw(
         job_pointer: NonNull<()>,
@@ -94,56 +105,94 @@ impl JobRef {
     /// Executes the `JobRef` by passing the execute function on the job pointer.
     #[inline(always)]
     pub fn execute(self, worker: &Worker) {
-        // SAFETY: The constructor of `JobRef` is required to ensure this is valid.
+        // SAFETY: Calling this function on this pointer is valid due to the
+        // contract of `JobRef::new_raw`:
+        //
+        // * `self.execute_fn` and `self.job_pointer` are "matched": every
+        //   `JobRef` is constructed via `new_raw`, which requires the caller
+        //   to supply a compatible pair.
+        //
+        // * `self.job_pointer` is valid at this point: `new_raw` requires the
+        //   pointer to remain valid until `execute_fn` is called, and we are
+        //   calling it now.
+        //
+        // * This is called at most once: `execute` consumes `self`, so the
+        //   pointer cannot be used again via this `JobRef`.
         unsafe { (self.execute_fn)(self.job_pointer, worker) }
     }
 }
 
-// SAFETY: !Send for raw pointers is not for safety, just as a lint.
+// SAFETY: `JobRef` is a type-erased data pointer + function pointer tuple. The
+// data pointer always points to a `Send` value due to the safety requirements
+// of `JobRef::new_raw`. Function pointers are always `Send`. Therefore it is
+// sound to move a `JobRef` across thread boundaries.
 unsafe impl Send for JobRef {}
 
 // -----------------------------------------------------------------------------
 // Job queue
 
+/// A queue of jobs. This is a simple wrapper around a `VecDeque` that uses
+/// interior mutability, and has some more intuitively named methods to enforce
+/// conventions.
 pub struct JobQueue {
-    job_refs: UnsafeCell<ArrayDeque<JobRef, 64>>,
+    job_refs: UnsafeCell<VecDeque<JobRef>>,
 }
 
 impl JobQueue {
+    /// Creates a new job queue.
     pub fn new() -> JobQueue {
         JobQueue {
-            job_refs: UnsafeCell::new(ArrayDeque::new()),
+            job_refs: UnsafeCell::new(VecDeque::new()),
         }
     }
 
-    #[inline(always)]
-    pub fn push(&self, job_ref: JobRef) -> Option<JobRef> {
-        // SAFETY: The queue itself is only access mutably within `push_back`,
-        // `pop_back` and `pop_front`. Since these functions never call each
-        // other, we must have exclusive access to the queue.
+    /// Insert a job at the back of the queue (the side with the newest jobs).
+    pub fn push_new(&self, job_ref: JobRef) {
+        // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one
+        // thread. We ensure no other references to the inner value exist by not
+        // returning any references from this API, making this exclusive access
+        // safe.
         let job_refs = unsafe { &mut *self.job_refs.get() };
-        if let Err(full) = job_refs.push_back(job_ref) {
-            Some(full.element)
-        } else {
-            None
-        }
+        job_refs.push_back(job_ref);
     }
 
-    #[inline(always)]
+    /// Insert a job at the front of the queue (the side with the oldest jobs).
+    pub fn push_old(&self, job_ref: JobRef) {
+        // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one
+        // thread. We ensure no other references to the inner value exist by not
+        // returning any references from this API, making this exclusive access
+        // safe.
+        let job_refs = unsafe { &mut *self.job_refs.get() };
+        job_refs.push_front(job_ref);
+    }
+
+    /// Removes the newest job in the queue.
     pub fn pop_newest(&self) -> Option<JobRef> {
-        // SAFETY: The queue itself is only access mutably within `push_back`,
-        // `pop_back` and `pop_front`. Since these functions never call each
-        // other, we must have exclusive access to the queue.
+        // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one
+        // thread. We ensure no other references to the inner value exist by not
+        // returning any references from this API, making this exclusive access
+        // safe.
         let job_refs = unsafe { &mut *self.job_refs.get() };
         job_refs.pop_back()
     }
 
-    // Attempt to remove the given job-ref from the back of the queue.
+    /// Removes the oldest job in the queue.
+    pub fn pop_oldest(&self) -> Option<JobRef> {
+        // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one
+        // thread. We ensure no other references to the inner value exist by not
+        // returning any references from this API, making this exclusive access
+        // safe.
+        let job_refs = unsafe { &mut *self.job_refs.get() };
+        job_refs.pop_front()
+    }
+
+    /// Attempt to remove the given job-ref from the back of the queue.
     #[inline(always)]
-    pub fn recover_just_pushed(&self, id: (usize, usize)) -> bool {
-        // SAFETY: The queue itself is only access mutably within `push_back`,
-        // `pop_back` and `pop_front`. Since these functions never call each
-        // other, we must have exclusive access to the queue.
+    pub fn recover_newest(&self, id: (usize, usize)) -> bool {
+        // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one
+        // thread. We ensure no other references to the inner value exist by not
+        // returning any references from this API, making this exclusive access
+        // safe.
         let job_refs = unsafe { &mut *self.job_refs.get() };
         if job_refs.back().map(JobRef::id) == Some(id) {
             let _ = job_refs.pop_back();
@@ -153,13 +202,38 @@ impl JobQueue {
         }
     }
 
-    #[cold]
-    pub fn pop_oldest(&self) -> Option<JobRef> {
-        // SAFETY: The queue itself is only access mutably within `push_back`,
-        // `pop_back` and `pop_front`. Since these functions never call each
-        // other, we must have exclusive access to the queue.
+    /// The size of a chunk of jobs.
+    const CHUNK_SIZE: usize = 16;
+
+    /// Splits off a series of chunks from the end of the queue (the side with
+    /// the newest jobs). Each chunk is of size `CHUNK_SIZE`. Afterwards, fewer
+    /// than `CHUNK_SIZE` jobs will be left in the queue.
+    pub fn split(&self) -> Vec<VecDeque<JobRef>> {
+        // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one
+        // thread. We ensure no other references to the inner value exist by not
+        // returning any references from this API, making this exclusive access
+        // safe.
         let job_refs = unsafe { &mut *self.job_refs.get() };
-        job_refs.pop_front()
+        let mut len = job_refs.len();
+        let num_chunks = len / Self::CHUNK_SIZE;
+        (0..num_chunks)
+            .map(|_| {
+                let chunk = job_refs.split_off(len - Self::CHUNK_SIZE);
+                len -= Self::CHUNK_SIZE;
+                chunk
+            })
+            .collect()
+    }
+
+    /// Appends a chunk of jobs (expected to be provided by `split`) to the
+    /// queue. Jobs are added to the end (the side with the newest jobs).
+    pub fn append(&self, mut split_refs: VecDeque<JobRef>) {
+        // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one
+        // thread. We ensure no other references to the inner value exist by not
+        // returning any references from this API, making this exclusive access
+        // safe.
+        let job_refs = unsafe { &mut *self.job_refs.get() };
+        job_refs.append(&mut split_refs);
     }
 }
 
@@ -195,23 +269,35 @@ where
     ///
     /// # Safety
     ///
-    /// The caller must ensure that the `StackJob` that the returned `JobRef` refers
-    /// to will live as long as the `JobRef`. The caller must also ensure that
-    /// the `JobRef` does not outlive the data the `StackJob` closes over; which
-    /// is to say, if the closure references something, that thing must exist at
-    /// least until the `JobRef` is executed or dropped. Additionally, the
-    /// caller must ensure that they never create two different `JobRef`s that
-    /// point to the same `StackJob`.
+    /// The caller must ensure that:
+    ///
+    /// * The `StackJob` will outlive the `JobRef`.
+    ///
+    /// * The `StackJob` will not move for the lifetime of the `JobRef`.
+    ///
+    /// * The `StackJob` does not outlive any data it closes over.
+    ///
+    /// * This function is not called again so long as the `JobRef` lives.
     #[inline(always)]
     pub unsafe fn as_job_ref(&self) -> JobRef {
         let job_pointer = NonNull::from(self).cast();
-        // SAFETY: The caller ensures the `StackJob` will outlive the `JobRef`,
-        // so it will remain valid to convert this pointer into a reference, and
-        // hence it is possible to pass this pointer to `Self::execute`.
+        // SAFETY: `JobRef::new_raw` requires:
+        //
+        // * `job_pointer` and `Self::execute` are matched.
+        //
+        //   Here, `execute` expects a pointer to `Self`, which is what
+        //   `job_pointer` is.
+        //
+        // * The pointee is live, not moved, and not dropped until `execute_fn`
+        //   is called.
         //
-        // `Self::execute` cannot be called multiple times because
-        // `JobRef::execute` takes ownership of the `JobRef`, and we only create
-        // a single `JobRef` for each stack job.
+        //   Here, the caller guarantees the `StackJob` outlives and does not
+        //   move for the lifetime of the `JobRef`.
+        //
+        // * `execute_fn` to be called at most once.
+        //
+        //   Here, `JobRef::execute` consumes the `JobRef`, and only one
+        //   `JobRef` is created per `StackJob`, so it is called at most once.
         unsafe { JobRef::new_raw(job_pointer, Self::execute) }
     }
 
@@ -228,29 +314,49 @@ where
     ///
     /// # Safety
     ///
-    /// This may only be called before the job is executed.
+    /// The caller must ensure that either this function or `execute` are called
+    /// for a given `StackJob` (not both), and that this function must not be
+    /// called multiple times.
     #[inline(always)]
     pub unsafe fn unwrap(&mut self) -> F {
-        // SAFETY: This will not be used again. Given that `execute` has not
-        // already been, it will never be used twice.
-        unsafe { ManuallyDrop::take(self.f.get_mut()) }
+        let f_mut = self.f.get_mut();
+        // SAFETY: `ManuallyDrop` requires us to ensure that it is not used
+        // again after we `take()` its contents.
+        //
+        // `take()` is called in two places: once here, and once in `execute`.
+        // Since this function is mutually exclusive with `execute`, and is
+        // called at most once, the `ManuallyDrop<F>` is not used again.
+        unsafe { ManuallyDrop::take(f_mut) }
     }
 
     /// Unwraps the job into it's return value.
     ///
     /// # Safety
     ///
-    /// This may only be called after the job has finished executing, and it's
-    /// latch has been set.
+    /// The caller must ensure that:
+    ///
+    /// * This is called only after the job's latch is set.
+    ///
+    /// * That this is called at most once for a given `StackJob`.
     #[inline(always)]
     pub unsafe fn return_value(&mut self) -> ThreadResult<T> {
-        // Synchronize with the fence in `StackJob::execute`.
+        // Synchronize with the fence in `StackJob::execute`, establishing a
+        // happens-after relationship with the following read.
         fence(Ordering::Acquire);
         // Get a ref to the result.
         let result_ref = self.return_value.get_mut();
-        // SAFETY: The job has completed, which means the return value must have
-        // been initialized. This consumes the job, so there's no chance of this
-        // accidentally duplicating data.
+        // SAFETY: `assume_init_read` requires:
+        //
+        // * The `MaybeUninit` is fully initialized.
+        //
+        //   As this function can only be called if the latch has been set, and
+        //   the latch is only set at the end of `StackJob::execute` (after
+        //   `return_value` is written and memory is synchronized via the above
+        //   fence) the memory must be initialized.
+        //
+        // * That data not be incorrectly duplicated by repeated calls.
+        //
+        //   Data is not duplicated because this function is called at most once.
         unsafe { result_ref.assume_init_read() }
     }
 }
@@ -264,36 +370,58 @@ where
     ///
     /// # Safety
     ///
-    /// The caller must ensure that `this` is valid to access a `StackJob`
-    /// immutably at least until the `Latch` within the `StackJob` has been set.
-    /// As a consequence, this may not be run after a latch has been set. Since
-    /// this function sets the latch, the caller must ensure to only call this
-    /// function once.
+    /// The caller must ensure that:
+    ///
+    /// * `this` is a non-null, properly aligned pointer to a live instance of
+    ///   `StackJob<F, T>`.
+    ///
+    /// * The `StackJob` will not move or be deallocated until the latch it
+    ///   contains is set.
+    ///
+    /// * Either this function or `unwrap` are called at most once for a given
+    ///   `StackJob`.
     #[inline(always)]
     unsafe fn execute(this: NonNull<()>, worker: &Worker) {
         // SAFETY: The caller ensures `this` can be converted into an immutable
         // reference until we set the latch, and the latch has not yet been set.
         let this = unsafe { this.cast::<Self>().as_ref() };
         // Create an abort guard. If the closure panics, this will convert the
-        // panic into an abort. Doing so prevents use-after-free for other elements of the stack.
+        // panic into an abort. Doing so prevents use-after-free for other
+        // elements of the stack.
         let abort_guard = unwind::AbortOnDrop;
-        // SAFETY: This memory location is accessed only in this function and in
-        // `unwrap`. The latter cannot have been called because it consumes the
-        // stack job. And since this function is called only once, we can
-        // guarantee that we have exclusive access.
+        // SAFETY: `f` is a `UnsafeCell<ManuallyDrop<F>>`. Creating a
+        // `&mut ManuallyDrop<F>` is only sound so long as no other live
+        // references exist.
+        //
+        // `f` is accessed mutably in two places: once here, and once in
+        // `unwrap`. Since this function is mutually exclusive with `unwrap`,
+        // and is called at most once, exclusive access is guaranteed.
         let f_ref = unsafe { &mut *this.f.get() };
-        // SAFETY: The caller ensures this function is called only once.
+        // SAFETY: `ManuallyDrop` requires us to ensure that it is not used
+        // again after we `take()` its contents.
+        //
+        // `take()` is called in two places: once here, and once in `unwrap`.
+        // Since this function is mutually exclusive with `unwrap`, and is
+        // called at most once, the `ManuallyDrop<F>` is not used again.
         let f = unsafe { ManuallyDrop::take(f_ref) };
-        // Run the job. If the job panics, we propagate the panic back to the main thread.
+        // Run the job. If the job panics, we propagate the panic back to the
+        // main thread.
         let result = unwind::halt_unwinding(|| f(worker));
         // Get the uninitialized memory where we should put the return value.
         let return_value = this.return_value.get();
-        // SAFETY: The return value is only accessed here and in
-        // `StackJob::return_value`. Since the other method consumes the stack
-        // job, it's not possible for it to run concurrently. Therefore, we must
-        // have exclusive access to the return value.
+        // SAFETY: Writing to this unsafe cell requires that no other thread
+        // holds a reference to its contents.
+        //
+        // The `return_value` is only written here and only read within
+        // `StackJob::return_value`, and then only after the latch has been set.
+        // The latch has not been set, and this function is called at most once,
+        // so no concurrent access can occur.
         unsafe { (*return_value).write(result) };
-        // Latches do not participate in memory ordering, so we need to do this manually.
+        // This synchronizes with the `Acquire` fence within `return_value()`,
+        // establishing a happens-before relationship that makes the preceding
+        // `return_value` write visible to the reader.
+        //
+        // This is required because latches do not synchronize memory.
         fence(Ordering::Release);
         // SAFETY: The caller ensures the job is valid until the latch is set.
         // Since the latch is a field of the job, the latch must be valid until
@@ -304,6 +432,229 @@ where
     }
 }
 
+// -----------------------------------------------------------------------------
+// Stack allocated work function on a non-worker thread
+
+/// Like [`StackJob`] but allocated on the stack of a non-worker thread. While
+/// this job is pending, the owning thread is fully blocked.
+#[cfg(not(feature = "shuttle"))]
+pub struct ExternalJob<F, T> {
+    f: UnsafeCell<ManuallyDrop<F>>,
+    completed: AtomicU32,
+    return_value: UnsafeCell<MaybeUninit<ThreadResult<T>>>,
+}
+
+#[cfg(not(feature = "shuttle"))]
+impl<F, T> ExternalJob<F, T>
+where
+    F: FnOnce(&Worker) -> T + Send,
+    T: Send,
+{
+    /// Creates a new `ExternalJob`.
+    #[inline(always)]
+    pub fn new(f: F) -> ExternalJob<F, T> {
+        ExternalJob {
+            f: UnsafeCell::new(ManuallyDrop::new(f)),
+            completed: AtomicU32::new(0),
+            return_value: UnsafeCell::new(MaybeUninit::uninit()),
+        }
+    }
+
+    /// Creates a `JobRef` pointing to this job. The underlying `ExternalJob` is
+    /// not dropped after the `JobRef` is executed.
+    ///
+    /// # Safety
+    ///
+    /// The caller must ensure that:
+    ///
+    /// * The `ExternalJob` will not move or be deallocated until the `JobRef`
+    ///   is executed.
+    ///
+    /// * The `JobRef` does not outlive any data the `ExternalJob` closes over.
+    ///
+    /// * This function is not called again so long as the `JobRef` lives.
+    #[inline(always)]
+    pub unsafe fn as_job_ref(&self) -> JobRef {
+        let job_pointer = NonNull::from(self).cast();
+        // SAFETY: The `job_pointer` is trivially aligned and non-null,
+        // because it is derived from a reference.
+        //
+        // The caller must not allow the `ExternalJob` to move or be deallocated
+        // until the `JobRef` is executed. This guarantees that `job_pointer`
+        // remains valid for the lifetime of `JobRef`, satisfying the
+        // requirements of `JobRef::new_raw`.
+        //
+        // The caller guarantees that this function is not called again while
+        // `JobRef` lives, so `Self::execute` can be called at most once for
+        // this particular `ExternalJob`. This satisfies the at-most-once
+        // execution invariant documented on `Job::execute`.
+        unsafe { JobRef::new_raw(job_pointer, Self::execute) }
+    }
+
+    /// Waits for the `ExternalJob` to be executed and returns the result.
+    ///
+    /// # Safety
+    ///
+    /// This must be called at most once.
+    #[inline(always)]
+    pub unsafe fn wait_for_value(&mut self) -> ThreadResult<T> {
+        // Wait for the complete flag to be set.
+        loop {
+            atomic_wait::wait(&self.completed, 0);
+            if self.completed.load(Ordering::Relaxed) == 1 {
+                break;
+            }
+        }
+        // Synchronize memory; we do this with a fence, so that we only do a
+        // relaxed load in the case of a spurious wakeup.
+        fence(Ordering::Acquire);
+        // Get a ref to the result.
+        let result_ref = self.return_value.get_mut();
+        // SAFETY: `assume_init_read` requires:
+        //
+        // * The `MaybeUninit` is fully initialized.
+        //
+        //   As this can only be called if we have observed that `completed` has
+        //   been set to 1, and that only happens at the end of
+        //   `ExternalJob::execute` (after `return_value` is written and memory
+        //   is synchronized via the above fence) the memory must be initialized.
+        //
+        // * That data not be incorrectly duplicated by repeated calls.
+        //
+        //   Data is not duplicated because this function is called at most
+        //   once.
+        unsafe { result_ref.assume_init_read() }
+    }
+}
+
+#[cfg(not(feature = "shuttle"))]
+impl<F, T> Job for ExternalJob<F, T>
+where
+    F: FnOnce(&Worker) -> T + Send,
+    T: Send,
+{
+    /// Executes an `ExternalJob` from a const pointer.
+    ///
+    /// # Safety
+    ///
+    /// The caller must ensure that:
+    ///
+    /// * `this` is a non-null, properly aligned pointer to a live instance
+    ///   of `ExternalJob<F, T>`.
+    ///
+    /// * The `ExternalJob` will not move or be deallocated for as long as
+    ///   `completed` remains set to 0.
+    ///
+    /// * This function is called at most once for a given `ExternalJob`.
+    #[inline(always)]
+    unsafe fn execute(this: NonNull<()>, worker: &Worker) {
+        // SAFETY: The caller ensures `this` can be converted into an immutable
+        // reference until we set the `completed` atomic.
+        let this = unsafe { this.cast::<Self>().as_ref() };
+        // Create an abort guard. If the closure panics, this will convert the
+        // panic into an abort. Doing so prevents use-after-free for other
+        // elements of the stack.
+        let abort_guard = unwind::AbortOnDrop;
+        // SAFETY: `f` is a `UnsafeCell<ManuallyDrop<F>>`. Creating a
+        // `&mut ManuallyDrop<F>` is only sound so long as no other live
+        // references exist.
+        //
+        // Since this field is never accessed mutably except for here and this
+        // function is called at most once, exclusive access is guaranteed.
+        let f_ref = unsafe { &mut *this.f.get() };
+        // SAFETY: `ManuallyDrop` requires us to ensure that it is not used
+        // again after we `take()` its contents.
+        //
+        // Since it is not used in the remainder of this function, and this
+        // function is called at most once, it is indeed not used again.
+        let f = unsafe { ManuallyDrop::take(f_ref) };
+        // Run the job. If the job panics, we propagate the panic back to the
+        // main thread.
+        let result = unwind::halt_unwinding(|| f(worker));
+        // Get the uninitialized memory where we should put the return value.
+        let return_value = this.return_value.get();
+        // SAFETY: Writing to this unsafe cell requires that no other thread
+        // holds a reference to its contents.
+        //
+        // The `return_value` is only read within `ExternalJob::wait_for_value`,
+        // and then only after `completed` is set to 1. Since this function is
+        // called at most once, `completed` must still be set to 0. Therefore no
+        // concurrent access can occur.
+        unsafe { (*return_value).write(result) };
+        // Set `completed` to 1, allowing reads of the return value. This
+        // `Release` store synchronizes with the `Acquire` fence in
+        // `ExternalJob::wait_for_value`, establishing a happens-before
+        // relationship that makes the preceding `return_value` write visible
+        // to the waiting reader.
+        this.completed.store(1, Ordering::Release);
+        // Notify the waiting thread that the job is complete.
+        atomic_wait::wake_one(&this.completed);
+        // Forget the abort guard, re-enabling panics.
+        core::mem::forget(abort_guard);
+    }
+}
+
+#[cfg(feature = "shuttle")]
+pub struct ExternalJob<F, T> {
+    f: UnsafeCell<ManuallyDrop<F>>,
+    mutex: shuttle::sync::Mutex<Option<ThreadResult<T>>>,
+    condvar: shuttle::sync::Condvar,
+}
+
+#[cfg(feature = "shuttle")]
+impl<F, T> ExternalJob<F, T>
+where
+    F: FnOnce(&Worker) -> T + Send,
+    T: Send,
+{
+    /// Creates a new `ExternalJob`.
+    #[inline(always)]
+    pub fn new(f: F) -> ExternalJob<F, T> {
+        ExternalJob {
+            f: UnsafeCell::new(ManuallyDrop::new(f)),
+            mutex: shuttle::sync::Mutex::new(None),
+            condvar: shuttle::sync::Condvar::new(),
+        }
+    }
+
+    #[inline(always)]
+    #[allow(clippy::undocumented_unsafe_blocks)]
+    pub unsafe fn as_job_ref(&self) -> JobRef {
+        let job_pointer = NonNull::from(self).cast();
+        unsafe { JobRef::new_raw(job_pointer, Self::execute) }
+    }
+
+    #[inline(always)]
+    pub unsafe fn wait_for_value(&mut self) -> ThreadResult<T> {
+        let mut value = self.mutex.lock().unwrap();
+        while value.is_none() {
+            value = self.condvar.wait(value).unwrap();
+        }
+        Option::take(&mut value).unwrap()
+    }
+}
+
+#[cfg(feature = "shuttle")]
+impl<F, T> Job for ExternalJob<F, T>
+where
+    F: FnOnce(&Worker) -> T + Send,
+    T: Send,
+{
+    #[inline(always)]
+    #[allow(clippy::undocumented_unsafe_blocks)]
+    unsafe fn execute(this: NonNull<()>, worker: &Worker) {
+        let this = unsafe { this.cast::<Self>().as_ref() };
+        let abort_guard = unwind::AbortOnDrop;
+        let f_ref = unsafe { &mut *this.f.get() };
+        let f = unsafe { ManuallyDrop::take(f_ref) };
+        let result = unwind::halt_unwinding(|| f(worker));
+        let mut value = this.mutex.lock().unwrap();
+        *value = Some(result);
+        this.condvar.notify_one();
+        core::mem::forget(abort_guard);
+    }
+}
+
 // -----------------------------------------------------------------------------
 // Heap allocated work function
 
@@ -362,7 +713,7 @@ where
     /// # Safety
     ///
     /// The caller must ensure that `this` is a pointer, created by calling
-    /// `Box::into_raw` on a `Box<HeapJob>`. After the call `this` must be
+    /// `Box::into_raw` on a `Box<HeapJob<F>>`. After the call `this` must be
     /// treated as dangling.
     #[inline(always)]
     unsafe fn execute(this: NonNull<()>, worker: &Worker) {
diff --git a/src/latch.rs b/src/latch.rs
index 8c7564c..9a1b3a1 100644
--- a/src/latch.rs
+++ b/src/latch.rs
@@ -1,15 +1,13 @@
 //! A core concept in Rayon is the *latch*. Forte has borrowed this, in a
 //! somewhat simplified form.
 //!
-//! Every forte worker thread is has a single "sleep controller" that it uses to
+//! Every forte worker thread has a single "sleep controller" that it uses to
 //! park and unpark itself. Latches build on this to create a simple boolean
 //! switch, which allows the owning thread to sleep until the latch becomes set
 //! by another thread.
 
-use core::{
-    pin::Pin,
-    task::{RawWaker, RawWakerVTable, Waker},
-};
+use alloc::task::Wake;
+use core::borrow::Borrow;
 
 use crate::platform::*;
 
@@ -40,20 +38,36 @@ const ASLEEP: u32 = 0b10;
 /// The general idea and spirit for latches (as well as some of the
 /// documentation) is due to rayon. However the implementation is specific to
 /// forte.
+///
+/// ## Memory Ordering
+///
+/// Latches _do not synchronize memory_. They are only used for signaling. If
+/// the thread that sets a latch wishes to transmit a value to the thread
+/// waiting for that latch, explicit fences must be used.
 pub struct Latch {
     /// Holds the internal state of the latch. This tracks if the latch has been
     /// set or not.
     state: AtomicU32,
+    /// Tracks the number of sleeping threads in the pool.
+    sleeping: &'static AtomicU32,
     /// The sleep controller for the owning thread.
     sleep_controller: &'static SleepController,
+    /// The seat number that owns this latch
+    seat_number: usize,
 }
 
 impl Latch {
     /// Creates a new latch, owned by a specific thread.
-    pub fn new(sleep_controller: &'static SleepController) -> Latch {
+    pub fn new(
+        seat_number: usize,
+        sleeping: &'static AtomicU32,
+        sleep_controller: &'static SleepController,
+    ) -> Latch {
         Latch {
             state: AtomicU32::new(LOCKED),
+            sleeping,
             sleep_controller,
+            seat_number,
         }
     }
 
@@ -66,13 +80,22 @@ impl Latch {
     /// Waits for the latch to be set. In actuality, this may be woken.
     ///
     /// Returns true if the latch signal was received, and false otherwise.
+    ///
+    /// # Memory Ordering
+    ///
+    /// This does not synchronize memory. To synchronize memory with the thread
+    /// setting the latch, call `fence(Ordering::Acquire)` after this function.
+    /// The other thread must issue a corresponding `fence(Ordering::Release)`
+    /// call.
     #[cold]
     pub fn wait(&self) {
         // First, check if the latch has been set.
         //
         // In the event of a race with `set`:
-        // + If this happens before the store, then we will go to sleep.
-        // + If this happens after the store, then we notice and return.
+        //
+        // * If this happens before the store, then we will go to sleep.
+        //
+        // * If this happens after the store, then we notice and return.
         if self.state.load(Ordering::Relaxed) == SIGNAL {
             return;
         }
@@ -80,7 +103,7 @@ impl Latch {
         //
         // In the event of a race with `set`, the `wake` will always cause this
         // to return regardless of memory ordering.
-        self.sleep_controller.sleep();
+        self.sleep_controller.sleep(self.seat_number, self.sleeping);
     }
 
     /// Activates the latch, potentially unblocking the owning thread.
@@ -88,35 +111,46 @@ impl Latch {
     /// This takes a raw pointer because the latch may be de-allocated by a
     /// different thread while this function is executing.
     ///
+    /// # Memory Ordering
+    ///
+    /// This does not synchronize memory. To synchronize memory with the waiting
+    /// thread, call `fence(Ordering::Release)` before this function. The other
+    /// thread must issue a corresponding `fence(Ordering::Acquire)` call.
+    ///
     /// # Safety
     ///
-    /// The latch pointer must be valid when passed to this function, and must
-    /// not be allowed to become dangling until after the latch is set.
+    /// The latch pointer must be valid when passed to this function. After this
+    /// call, the latch pointer may become dangling and must not be dereferenced
+    /// unless it is known to still be valid.
     #[inline(always)]
     pub unsafe fn set(latch: *const Latch) {
-        // SAFETY: At this point, the latch must still be valid to dereference.
-        let sleep_controller = unsafe { (*latch).sleep_controller };
+        // SAFETY: The caller guarantees the latch remains alive until `set`
+        // returns.
+        let latch = unsafe { &*latch };
+        let sleep_controller = latch.sleep_controller;
         // First we set the state to true.
         //
         // In the event of a race with `wait`, this may cause `wait` to return.
         // Otherwise the other thread will sleep within `wait.
-        //
-        // SAFETY: At this point, the latch must still be valid to dereference.
-        unsafe { (*latch).state.store(SIGNAL, Ordering::Relaxed) };
+        latch.state.store(SIGNAL, Ordering::Relaxed);
         // We must try to wake the other thread, just in case it missed the
-        // notification and went to sleep. This garentees that the other thread
+        // notification and went to sleep. This guarantees that the other thread
         // will make progress.
         sleep_controller.wake();
     }
 
     /// Restores the latch to the default state.
     ///
-    /// # Safety
+    /// # Deadlocks
+    ///
+    /// This may only be called by the thread that "owns" the latch, and only
+    /// after it has *observed* the latch entering the `SIGNAL` state, e.g.
+    /// after either `wait` or `check` has returned `true`.
     ///
-    /// This may only be called when in the `SIGNAL` state, eg. after either `wait` or
-    /// `check` has returned `true`.
+    /// Calling `reset` from a different thread or before observing the signal
+    /// is likely to result in deadlocks.
     #[inline(always)]
-    pub unsafe fn reset(&self) {
+    pub fn reset(&self) {
         self.state.store(LOCKED, Ordering::Relaxed);
     }
 }
@@ -128,58 +162,52 @@ impl Latch {
 #[cfg(not(feature = "shuttle"))]
 pub struct SleepController {
     state: AtomicU32,
-    num_sleeping: &'static AtomicU32,
 }
 
 #[cfg(not(feature = "shuttle"))]
 impl SleepController {
-    /// Creates a new latch. Expects to be passed an atomic used for tracking
-    /// the number of sleeping workers.
-    pub fn new(num_sleeping: &'static AtomicU32) -> Self {
+    /// Creates a new sleep controller.
+    pub const fn new() -> Self {
         SleepController {
             state: AtomicU32::new(LOCKED),
-            num_sleeping,
         }
     }
 
-    // Attempt to wake the thread to which this belongs.
-    //
-    // Returns true if this allows the thread to make progress (by waking it up
-    // or catching it before it goes to sleep) and false if the thread was
-    // running.
+    /// Attempt to wake the thread to which this belongs.
+    ///
+    /// Returns true if this allows the thread to make progress (by waking it up
+    /// or catching it before it goes to sleep) and false if the thread was
+    /// running.
     #[inline(always)]
     pub fn wake(&self) -> bool {
-        // Set set the state to SIGNAL and read the current state, which must be
+        // Set the state to SIGNAL and read the current state, which must be
         // either LOCKED, ASLEEP or SIGNAL.
         let sleep_state = self.state.swap(SIGNAL, Ordering::Relaxed);
-        let asleep = sleep_state == ASLEEP;
-        if asleep {
-            // Decrement the sleeping counter by one.
-            self.num_sleeping.fetch_sub(1, Ordering::Relaxed);
+        if sleep_state == ASLEEP {
             // If the state was ASLEEP, the thread is either asleep or about to
             // go to sleep.
             //
-            // + If it is about to go to sleep (but has not yet called
+            // * If it is about to go to sleep (but has not yet called
             //   `atomic_wait::wait`) then setting the state to SIGNAL above
             //   should prevent it from going to sleep.
             //
-            // + If it is already waiting, the following notification will wake
+            // * If it is already waiting, the following notification will wake
             //   it up.
             //
             // Either way, after this call the other thread must make progress.
             atomic_wait::wake_one(&self.state);
         }
-        // Return true if the other thread was asleep
-        asleep
+        // Return true if the other thread was asleep and not already notified.
+        sleep_state == ASLEEP
     }
 
-    // Attempt to send the thread to sleep. This should only be called on a
-    // single thread, and we say that this controller "belongs" to that thread.
-    //
-    // Returns true if this thread makes a syscall to suspend the thread, and
-    // false if the thread was already woken (letting us skip the syscall).
+    /// Attempt to send the thread to sleep. This should only be called on a
+    /// single thread, and we say that this controller "belongs" to that thread.
+    ///
+    /// Returns true if this thread makes a syscall to suspend the thread, and
+    /// false if the thread was already woken (letting us skip the syscall).
     #[cold]
-    pub fn sleep(&self) {
+    pub fn sleep(&self, seat_number: usize, sleeping: &'static AtomicU32) {
         // Set the state to ASLEEP and read the current state, which must be
         // either LOCKED or SIGNAL.
         let state = self.state.swap(ASLEEP, Ordering::Relaxed);
@@ -187,10 +215,10 @@ impl SleepController {
         // we should try to put the thread to sleep. Otherwise we should return
         // early.
         if state == LOCKED {
-            // Increase the sleeping count by one.
-            self.num_sleeping.fetch_add(1, Ordering::Relaxed);
+            // Set the sleeping bit for this worker.
+            sleeping.fetch_or(1 << seat_number, Ordering::Relaxed);
             // If we have received a signal since entering the sleep state
-            // (meaning the state is not longer set to ASLEEP) then this will
+            // (meaning the state is no longer set to ASLEEP) then this will
             // return immediately.
             //
             // If the state is still ASLEEP, then the next call to `wake` will
@@ -198,6 +226,8 @@ impl SleepController {
             //
             // Either way, there is no way we can fail to receive a `wake`.
             atomic_wait::wait(&self.state, ASLEEP);
+            // Clear the sleeping bit for this worker.
+            sleeping.fetch_and(!(1 << seat_number), Ordering::Relaxed);
         }
         // Set the state back to LOCKED so that we are ready to receive new
         // signals.
@@ -217,17 +247,14 @@ pub struct SleepController {
 }
 
 #[cfg(feature = "shuttle")]
-impl Default for SleepController {
-    fn default() -> SleepController {
+impl SleepController {
+    pub fn new() -> Self {
         SleepController {
             state: Mutex::new(LOCKED),
             condvar: Condvar::new(),
         }
     }
-}
 
-#[cfg(feature = "shuttle")]
-impl SleepController {
     pub fn wake(&self) -> bool {
         let state = core::mem::replace(&mut *self.state.lock().unwrap(), SIGNAL);
         let asleep = state == ASLEEP;
@@ -237,43 +264,33 @@ impl SleepController {
         asleep
     }
 
-    pub fn sleep(&self) {
+    pub fn sleep(&self, seat_number: usize, sleeping: &'static AtomicU32) {
         let mut state = self.state.lock().unwrap();
         if *state == LOCKED {
             *state = ASLEEP;
-            self.condvar.wait(state).unwrap();
+            sleeping.fetch_or(1 << seat_number, Ordering::Relaxed);
+            while *state == ASLEEP {
+                state = self.condvar.wait(state).unwrap();
+            }
+            sleeping.fetch_and(!(1 << seat_number), Ordering::Relaxed);
         }
+        *state = LOCKED;
     }
 }
 
 // -----------------------------------------------------------------------------
-// Async waker
+// Async wakers
 
-impl Latch {
-    /// Creates an async waker from a reference to a latch.
-    ///
-    /// # Safety
-    ///
-    /// The latch must outlive the waker.
-    pub unsafe fn as_waker(self: Pin<&Self>) -> Waker {
-        let this: *const Self = Pin::get_ref(self);
-        let raw_waker = RawWaker::new(this.cast::<()>(), &RAW_WAKER_VTABLE);
-        // SAFETY: The RawWakerVTable api contract is upheald and these
-        // functions are all thread-safe.
-        unsafe { Waker::from_raw(raw_waker) }
+impl Wake for Latch {
+    fn wake(self: Arc<Self>) {
+        // SAFETY: The borrowed `Arc` is held for the duration of this call,
+        // keeping the `Latch` alive.
+        unsafe { Latch::set(self.borrow()) };
     }
-}
 
-const RAW_WAKER_VTABLE: RawWakerVTable = RawWakerVTable::new(
-    #[inline(always)]
-    |ptr| RawWaker::new(ptr, &RAW_WAKER_VTABLE),
-    wake,
-    wake,
-    |_| {},
-);
-
-fn wake(this: *const ()) {
-    let latch = this.cast::<Latch>();
-    // SAFETY: The latch must be valid for the duration
-    unsafe { Latch::set(latch) };
+    fn wake_by_ref(self: &Arc<Self>) {
+        // SAFETY: The borrowed `Arc` is held for the duration of this call,
+        // keeping the `Latch` alive.
+        unsafe { Latch::set(self.borrow()) };
+    }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 368c2d0..dd3071d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -4,12 +4,18 @@
 //! `ForkJoinPool`.
 //!
 //! It features:
-//! + Statically defined and dynamically sized thread pools.
-//! + Fully stack-allocated and inlined fork/join parrellism.
-//! + The ability to execute both closures and futures on the same pool.
-//! + Hybrid scopes that can contain work distributed across multiple thread pools.
-//! + A primitive for awaiting async work in non-async contexts without spinning.
-//! + An exposed unsafe api, built for for low-level integration and customization.
+//!
+//! * Statically defined and dynamically sized thread pools.
+//!
+//! * Fully stack-allocated and inlined fork/join parallelism.
+//!
+//! * The ability to execute both closures and futures on the same pool.
+//!
+//! * Hybrid scopes that can contain work distributed across multiple thread pools.
+//!
+//! * A primitive for awaiting async work in non-async contexts without spinning.
+//!
+//! * An exposed unsafe api, built for low-level integration and customization.
 //!
 //! Here's an example of what it looks like:
 //!
@@ -24,7 +30,7 @@
 //!     THREAD_POOL.resize_to_available();
 //!
 //!     // Register this thread as a worker on the pool.
-//!     THREAD_POOL.with_worker(|worker| {
+//!     THREAD_POOL.expect_worker(|worker| {
 //!         // Spawn a job onto the pool. The closure also accepts a worker, because the
 //!         // job may be executed on a different thread. This will be the worker for whatever
 //!         // thread it executes on.
@@ -96,7 +102,7 @@
 //! external thread tries to use a pool of size zero (with no workers), it will
 //! still be able to do work, it just won't be done in parallel. And if multiple
 //! external threads use an empty pool at the same time, they will sometimes try
-//! to collaborate and help each-other out with work.
+//! to collaborate and help each other out with work.
 //!
 //! ```
 //! # use forte::ThreadPool;
@@ -113,7 +119,7 @@
 //! THREAD_POOL.depopulate();
 //!
 //! // Do the same work, but this time we know it will execute serially (because
-//! // there are no workers to parallelized it).
+//! // there are no workers to parallelize it).
 //! THREAD_POOL.join(|_| println!("world"), |_| println!("hello "));
 //!
 //! // This will always print "hello world" (because join happens execute things
@@ -125,22 +131,23 @@
 //! Thread pools are comprised of (and run on) workers, represented as instances
 //! of the [`Worker`] type. All work done on the pool is done in a "worker
 //! context" created by [`Worker::occupy`]. The recommended way to access a
-//! worker context for a specific pool is via [`ThreadPool::with_worker`].
+//! worker context for a specific pool is via [`ThreadPool::with_worker`],
+//! [`ThreadPool::on_worker`], or [`ThreadPool::expect_worker`].
 //!
 //! ```
 //! # use forte::ThreadPool;
 //! # static THREAD_POOL: ThreadPool = ThreadPool::new();
-//! THREAD_POOL.with_worker(|worker_1| {     // <-- Creates a worker on the pool.
-//!     THREAD_POOL.with_worker(|worker_2| { // <-- Returns a reference to the existing worker.
+//! THREAD_POOL.expect_worker(|worker_1| {     // <-- Sets up this thread as a worker.
+//!     THREAD_POOL.expect_worker(|worker_2| { // <-- Returns a reference to the existing worker.
 //!         // These pointers are identical.
 //!         assert!(std::ptr::eq(worker_1, worker_2));
-//!     });                                  // <-- Leaving this scope does nothing.
-//! });                                      // <-- Leaving this scope frees the worker.
+//!     });                                    // <-- Leaving this scope does nothing.
+//! });                                        // <-- Leaving this scope frees the worker.
 //! ```
 //!
 //! Every worker holds a local queue of tasks, as well as metadata that allows
 //! other workers on the pool to communicate with it and wake it from sleep.
-//! When existing outermost scope (where the worker was actually allocated), all
+//! When exiting the outermost scope (where the worker was actually allocated), all
 //! tasks left in the local queue are executed.
 //!
 //! You will only ever receive `&Worker` references, because the worker is not
@@ -149,7 +156,7 @@
 //!
 //! To access the current worker context, you can use [`Worker::map_current`] or
 //! [`Worker::with_current`]. These allow executing work on arbitrary pools, and
-//! can be used to write library code that works normally dispute not knowing
+//! can be used to write library code that works normally despite not knowing
 //! about the thread pool static defined by the application.
 //!
 //! ```rust
@@ -163,7 +170,6 @@
 //!         None => foo()
 //!     })
 //! }
-//!
 //! ```
 //!
 //! # Core Operations
@@ -175,7 +181,7 @@
 //! * *Block on.* Waits for a future to complete (outside of an async context).
 //!
 //! All of these with the exception of *Spawn* are blocking; they have a
-//! specific join-point where a thread must wait for the all the forks of the
+//! specific join-point where a thread must wait for all the forks of the
 //! parallel operation to complete before proceeding. While it is waiting,
 //! threads will attempt to do background work, or help each-other out with
 //! their assigned workload.
@@ -191,8 +197,8 @@
 //! | *Block on* | [`block_on()`] | [`ThreadPool::block_on()`] | [`Worker::block_on()`]
 //!
 //! * *Worker.* Uses the provided worker context.
-//! * *Thread pool.* Looks for an existing worker context, creates one if it dosn't find one.
-//! * *Headless.* Looks for an existing worker context, and panics if it dosn't find one.
+//! * *Thread pool.* Looks for an existing worker context, creates one if it doesn't find one.
+//! * *Headless.* Looks for an existing worker context, and panics if it doesn't find one.
 //!
 //! The headless and thread pool flavors are more or less just aliases for the
 //! worker flavor. Where possible, the worker flavor should be preferred to the
@@ -273,9 +279,8 @@ mod platform {
     pub use core::sync::atomic::AtomicPtr;
     pub use core::sync::atomic::AtomicU32;
     pub use core::sync::atomic::Ordering;
-    pub use std::sync::Barrier;
-    pub use std::sync::Condvar;
     pub use std::sync::Mutex;
+    pub use std::sync::OnceLock;
     pub use std::thread::Builder as ThreadBuilder;
     pub use std::thread::JoinHandle;
     pub use std::thread::available_parallelism;
@@ -287,8 +292,11 @@ mod platform {
 
     // Core exports
 
+    pub use std::sync::OnceLock; // shuttle has no OnceLock; std's version is fine here
+
+    pub use shuttle::rand::Rng;
+    pub use shuttle::rand::thread_rng;
     pub use shuttle::sync::Arc;
-    pub use shuttle::sync::Barrier;
     pub use shuttle::sync::Condvar;
     pub use shuttle::sync::Mutex;
     pub use shuttle::sync::Weak;
@@ -300,9 +308,6 @@ mod platform {
     pub use shuttle::thread::JoinHandle;
     pub use shuttle::thread_local;
 
-    pub use shuttle::rand::Rng;
-    pub use shuttle::rand::thread_rng;
-
     // Available parallelism
 
     pub fn available_parallelism() -> std::io::Result<core::num::NonZero<usize>> {
diff --git a/src/scope.rs b/src/scope.rs
index 641ae97..71a6c19 100644
--- a/src/scope.rs
+++ b/src/scope.rs
@@ -5,6 +5,7 @@ use alloc::boxed::Box;
 use core::any::Any;
 use core::cell::UnsafeCell;
 use core::future::Future;
+use core::hint::cold_path;
 use core::marker::PhantomData;
 use core::mem::ManuallyDrop;
 use core::pin::Pin;
@@ -61,15 +62,16 @@ use crate::unwind::AbortOnDrop;
 pub struct Scope<'scope, 'env: 'scope> {
     /// Number of active references to the scope (including the owning
     /// allocation). This is incremented each time a new `ScopePtr` is created,
-    /// and decremented when a `ScopePtr` is dropped or the owning thead is done
-    /// using it.
+    /// and decremented when a `ScopePtr` is dropped or the owning thread is
+    /// done using it.
     count: AtomicU32,
     /// A latch used to communicate when the scope has been completed.
     completed: Latch,
     /// If any job panics, we store the result here to propagate it.
     panic: AtomicPtr<Box<dyn Any + Send + 'static>>,
-    /// This adds invariance over 'scope, to make sure 'scope cannot shrink,
-    /// which is necessary for soundness.
+    /// This adds invariance over 'scope. In other words, it ensures 'scope
+    /// cannot shrink or grow. This keeps the lifetime properly bound to the
+    /// closure.
     ///
     /// Without invariance, this would compile fine but be unsound:
     ///
@@ -87,13 +89,18 @@ pub struct Scope<'scope, 'env: 'scope> {
     /// # });
     /// ```
     _scope: PhantomData<&'scope mut &'scope ()>,
-    /// This adds covariance over 'env.
+    /// This adds invariance over 'env. In other words, it ensures 'env cannot
+    /// shrink or grow.
+    ///
+    /// This is not strictly necessary for correctness, and could probably be
+    /// covariant instead. Invariance was chosen to follow the precedent set by
+    /// `std::thread::scope`.
     _env: PhantomData<&'env mut &'env ()>,
 }
 
 /// Executes a new scope on a worker. [`Worker::scope`],
-/// [`ThreadPool::scope`][crate::ThreadPool::scope] and [`scope`][crate::scope()] are all just
-/// an aliases for this function.
+/// [`ThreadPool::scope`][crate::ThreadPool::scope] and
+/// [`scope`][crate::scope()] are all just aliases for this function.
 ///
 /// For details about the `'scope` and `'env` lifetimes see [`Scope`].
 #[inline]
@@ -102,10 +109,21 @@ where
     F: for<'scope> FnOnce(&'scope Scope<'scope, 'env>) -> T,
 {
     let abort_guard = AbortOnDrop;
-    // SAFETY: The scope is never moved or mutably referenced. The scope is only
-    // dropped at the end of this function, after the call to `complete`. The
-    // abort guard above prevents the stack from being dropped early during a
-    // panic unwind.
+    // SAFETY: `Scope::new` requires:
+    //
+    // 1. The `Scope` is never moved after initialization.
+    //
+    // 2. `complete` is called exactly once before the `Scope` is dropped.
+    //
+    // The scope is not moved in this function, and since no `&mut Scope`
+    // reference is allowed to escape, the caller cannot safely cause the scope
+    // to move either.
+    //
+    // `Scope::complete` is called unconditionally on the line below, before
+    // the implicit drop of `scope`. If the closure `f` panics, it is caught and
+    // re-emitted after `complete` finishes. In the event of an uncaught panic,
+    // we cannot ensure `complete` runs properly before the scope is dropped, so
+    // we force an abort via an `AbortOnDrop` guard.
     let scope = unsafe { Scope::new(worker) };
     // Panics that occur within the closure should be caught and propagated once
     // all spawned work is complete. This is not a safety requirement, it's just
@@ -138,9 +156,18 @@ impl<'scope, 'env> Scope<'scope, 'env> {
     ///
     /// # Safety
     ///
-    /// The caller must promise not to move or mutably reference this scope
-    /// until it is dropped, and must not allow the scope to be dropped until
-    /// after `Scope::complete` is run and returns.
+    /// The caller must ensure:
+    ///
+    /// * The `Scope` is never moved after creation. `ScopePtr::new` captures a
+    ///   raw `*const Scope` pointer, and spawned jobs hold onto these pointers
+    ///   until they complete. Moving the scope would invalidate these pointers
+    ///   and cause UB when any `ScopePtr` is dropped or used for scope access.
+    ///
+    /// * `complete` is called exactly once before the `Scope` is dropped, after
+    ///   which no `ScopePtr` may be created for this scope. `complete` blocks
+    ///   until the reference count ticks down to zero, ensuring that the scope
+    ///   outlives all `ScopePtr` references. Failing to call `complete` may
+    ///   result in dangling `ScopePtr` and produce use-after-free.
     unsafe fn new(worker: &Worker) -> Scope<'scope, 'env> {
         Scope {
             count: AtomicU32::new(1),
@@ -154,9 +181,9 @@ impl<'scope, 'env> Scope<'scope, 'env> {
     /// Runs a closure or future sometime before the scope completes. Valid
     /// inputs to this method are:
     ///
-    /// + A `for<'worker> FnOnce(&'worker Worker)` closure, with no return type.
+    /// * A `for<'worker> FnOnce(&'worker Worker)` closure, with no return type.
     ///
-    /// + A `Future<Output = ()>` future, with no return type.
+    /// * A `Future<Output = ()>` future, with no return type.
     ///
     /// # Panics
     ///
@@ -168,9 +195,9 @@ impl<'scope, 'env> Scope<'scope, 'env> {
     /// Runs a closure or future sometime before the scope completes. Valid
     /// inputs to this method are:
     ///
-    /// + A `for<'worker> FnOnce(&'worker Worker)` closure, with no return type.
+    /// * A `for<'worker> FnOnce(&'worker Worker)` closure, with no return type.
     ///
-    /// + A `Future<Output = ()>` future, with no return type.
+    /// * A `Future<Output = ()>` future, with no return type.
     ///
     /// Unlike [`Scope::spawn`], this accepts the current worker as a parameter.
     pub fn spawn_on<M, S: ScopedSpawn<'scope, M>>(&'scope self, worker: &Worker, scoped_work: S) {
@@ -183,7 +210,7 @@ impl<'scope, 'env> Scope<'scope, 'env> {
     /// `Scope::remove_reference`, or the scope will block forever on
     /// completion.
     fn add_reference(&self) {
-        let counter = self.count.fetch_add(1, Ordering::Release);
+        let counter = self.count.fetch_add(1, Ordering::Relaxed);
         tracing::trace!("scope reference counter increased to {}", counter + 1);
     }
 
@@ -191,11 +218,16 @@ impl<'scope, 'env> Scope<'scope, 'env> {
     ///
     /// # Safety
     ///
-    /// The caller must ensure that there is exactly one a matching call to
-    /// `add_reference` for every call to this function, unless used within
-    /// `Scope::complete`.
+    /// The caller must ensure that each call to `remove_reference` corresponds
+    /// to exactly one prior call to `add_reference` (or the implicit initial
+    /// count of 1 provided by `Scope::new`, in the case of `Scope::complete`).
+    ///
+    /// If `remove_reference` is called without a matching `add_reference`, the
+    /// scope latch will be set prematurely, potentially allowing the scope to
+    /// be freed while a `ScopePtr` still holds a pointer to it. Uses of the
+    /// `ScopePtr` thereafter may produce use-after-free.
     unsafe fn remove_reference(&self) {
-        let counter = self.count.fetch_sub(1, Ordering::Acquire);
+        let counter = self.count.fetch_sub(1, Ordering::Relaxed);
         tracing::trace!("scope reference counter decreased to {}", counter - 1);
         if counter == 1 {
             // Alerts the owning thread that the scope has completed.
@@ -204,8 +236,11 @@ impl<'scope, 'env> Scope<'scope, 'env> {
             // once, when the scope has been dropped and all work has been
             // completed.
             //
-            // SAFETY: The latch is passed as a reference, and is live for the
-            // duration of the function.
+            // SAFETY: The owning thread must call `Scope::complete` before
+            // dropping any `Scope`, and `Scope::complete` does not return until
+            // the latch is set, which happens only here, after the count
+            // reaches zero. Therefore, the `completed` field of this `Scope`
+            // must still be a live latch.
             unsafe { Latch::set(&self.completed) };
         }
     }
@@ -215,9 +250,24 @@ impl<'scope, 'env> Scope<'scope, 'env> {
     /// remainder are dropped.
     #[cold]
     fn store_panic(&self, err: Box<dyn Any + Send + 'static>) {
+        // Check if the panic pointer has already been set. This lets us avoid
+        // allocating a second time, and means we can immediately drop the panic
+        // we have just been passed.
+        //
+        // Dropping this panic may itself trigger a panic, but this will simply
+        // trigger the scope's abort guard, causing an abort rather than UB.
         if self.panic.load(Ordering::Relaxed).is_null() {
             let nil = ptr::null_mut();
             let err_ptr = Box::into_raw(Box::new(err));
+            // Try to atomically swap the panic pointer from null to the newly
+            // allocated error slot. If this succeeds, the write occurs with
+            // `Release` ordering, which establishes a happens-before
+            // relationship with the fence in `maybe_propagate_panic`, so that
+            // the heap-allocated error will be visible to the reader.
+            //
+            // If the write fails, another panic must have already occurred, and
+            // we don't need to synchronize memory (the previous call to
+            // `store_panic` handles the synchronization for its panic data).
             if self
                 .panic
                 .compare_exchange(nil, err_ptr, Ordering::Release, Ordering::Relaxed)
@@ -238,8 +288,18 @@ impl<'scope, 'env> Scope<'scope, 'env> {
 
     /// Propagates any panic captured while the scope was executing.
     fn maybe_propagate_panic(&self) {
+        // Swap out the panic pointer. This gives us exclusive read access to
+        // whatever it points to.
         let panic = self.panic.swap(ptr::null_mut(), Ordering::Relaxed);
         if !panic.is_null() {
+            // We generally don't expect panics to happen.
+            cold_path();
+            // If the panic pointer is not null, emit an `Acquire` fence to
+            // establish a happens-after relationship with the `Release` branch
+            // of the `compare_exchange` call in `store_panic`, so that the
+            // error stored at the memory location pointed to by the atomic
+            // pointer will be visible on the following line.
+            fence(Ordering::Acquire);
             // SAFETY: This was created by `Box::into_raw` in `store_panic` and,
             // because of the atomic swap just above, is only called once for
             // each box.
@@ -264,6 +324,7 @@ impl<'scope, 'env> Scope<'scope, 'env> {
         // causing the latch to become set and allowing this function to
         // return.
         unsafe { self.remove_reference() };
+
         // Wait for the remaining work to complete.
         worker.wait_for(&self.completed);
     }
@@ -272,11 +333,13 @@ impl<'scope, 'env> Scope<'scope, 'env> {
 // -----------------------------------------------------------------------------
 // Generalized scoped spawn trait
 
-/// A trait for types that can be spawned onto a [`Scope`]. It is implemented for:
+/// A trait for types that can be spawned onto a [`Scope`].
+///
+/// It is implemented for:
 ///
-/// + Closures that satisfy `for<'worker> FnOnce(&'worker Worker) + Send + 'scope`.
+/// * Closures that satisfy `for<'worker> FnOnce(&'worker Worker) + Send + 'scope`.
 ///
-/// + Futures that satisfy `Future<Output = ()> + Send + 'scope`.
+/// * Futures that satisfy `Future<Output = ()> + Send + 'scope`.
 ///
 /// Due to a bug in rustc, you may be given errors when using closures
 /// with inferred types. If you encounter the following:
@@ -332,7 +395,7 @@ where
         let job_ref = unsafe { job.into_job_ref() };
 
         // Send the job to a queue to be executed.
-        worker.enqueue(job_ref);
+        worker.fifo_queue.push_new(job_ref);
     }
 }
 
@@ -344,7 +407,7 @@ where
     fn spawn_on<'env, 'worker>(self, worker: &'worker Worker, scope: &'scope Scope<'scope, 'env>) {
         let poll_job = ScopeFutureJob::new(worker.thread_pool(), scope, self);
         let job_ref = poll_job.into_job_ref();
-        worker.enqueue(job_ref);
+        worker.fifo_queue.push_new(job_ref);
     }
 }
 
@@ -360,14 +423,14 @@ const READY: u32 = 0;
 /// This value is used for the state of future-jobs that have already been
 /// woken. Jobs in this state may be in one of the three following categories:
 ///
-/// + A pending job that has been (or is about to be) pushed to the queue
+/// * A pending job that has been (or is about to be) pushed to the queue
 ///   so that it can be polled.
 ///
-/// + A pending job that is currently being polled (or has just finished) and
+/// * A pending job that is currently being polled (or has just finished) and
 ///   which was *not* queued after it was woken, because it was woken while
 ///   running.
 ///
-/// + A job that was woken after it completed or panicked. These jobs will stay
+/// * A job that was woken after it completed or panicked. These jobs will stay
 ///   in the WOKEN state forever, and will never be queued or polled again.
 ///
 /// When a WOKEN future-job is executed by a worker, it switches into the LOCKED
@@ -382,7 +445,7 @@ const WOKEN: u32 = 1;
 /// are either executing, completed, or have been canceled due to a panic. They
 /// may switch to the WOKEN state at any time, but are not queued to the pool
 /// when this happens (they are instead queued when the future is done being
-/// polled, assuming it has not pancaked or been completed).
+/// polled, assuming it has not panicked or been completed).
 ///
 /// When a job finished executing and has not been WOKEN, it switches back to
 /// the READY state.
@@ -426,7 +489,7 @@ impl<'scope, 'env, Fut> ScopeFutureJob<'scope, 'env, Fut>
 where
     Fut: Future<Output = ()> + Send + 'scope,
 {
-    /// This vtable is part of what allows a `ScopedFutureJob` to act as an
+    /// This vtable is part of what allows a `ScopeFutureJob` to act as an
     /// async task waker.
     const VTABLE: RawWakerVTable = RawWakerVTable::new(
         Self::clone_as_waker,
@@ -453,7 +516,7 @@ where
         })
     }
 
-    /// Converts an `Arc<ScpedFutureJob>` into a job ref that can be queued on a
+    /// Converts an `Arc<ScopeFutureJob>` into a job ref that can be queued on a
     /// thread pool. The ref-count is not decremented, ensuring that the job
     /// remains alive while this job ref exists.
     ///
@@ -462,8 +525,26 @@ where
         // SAFETY: Pointers created by `Arc::into_raw` are never null.
         let job_pointer = unsafe { NonNull::new_unchecked(Arc::into_raw(self).cast_mut().cast()) };
 
-        // SAFETY: This pointer is an erased `Arc<Self>` which is what
-        // `Self::poll` expects to receive.
+        // SAFETY: `JobRef::new_raw` requires that:
+        //
+        // * `job_pointer` and `Self::poll` be "matched".
+        //
+        //   `Self::poll` expects a pointer created by calling `Arc::into_raw`
+        //   on an `Arc<Self>`, which is exactly what `job_pointer` is.
+        //
+        // * `job_pointer` points to an initialized and aligned value which is
+        //   neither moved nor dropped until it is executed.
+        //
+        //   The Arc reference count must be at least 1. `Arc::into_raw` transfers
+        //   ownership of the strong count from `self` into the `JobRef`, and
+        //   that count is only released in `poll`, after the arc produced by
+        //   `Arc::from_raw` is dropped. The data is therefore guaranteed to
+        //   remain live until `poll` is called.
+        //
+        // * If `poll` has additional safety requirements, `job_pointer` upholds
+        //   them.
+        //
+        //   In this case, `poll` does not have any additional requirements.
         unsafe { JobRef::new_raw(job_pointer, Self::poll) }
     }
 
@@ -521,12 +602,32 @@ where
             abort();
         }
 
-        // At this point, we have acquired exclusive ownership of the future.
-
-        // SAFETY: The arc never moves, and the future cannot be aliased mutably
-        // elsewhere because this is the only place we access it, and no other
-        // threads can have gotten past the memory swap above without causing an
-        // abort.
+        // SAFETY: The following line requires that:
+        //
+        // 1. No other mutable references to the future exist.
+        //
+        // 2. The future will not move.
+        //
+        // Access to the future is protected by the `state` field, which acts
+        // as a mutex. Just above, we executed
+        //
+        //     state.swap(LOCKED, Ordering::Acquire)
+        //
+        // which transitions us from the `WOKEN` into the `LOCKED` state. Any
+        // concurrent caller that also tries to execute `poll` will fail this
+        // swap, and cause an abort. Exclusive access is therefore guaranteed.
+        //
+        // In the event that `poll` has been called previously, the `Acquire`
+        // ordering pairs with the `Release` fence that accompanies the call to
+        //
+        //     state.compare_exchange(LOCKED, READY, Ordering::Relaxed, Ordering::Relaxed)
+        //
+        // later in this function. This ensures that all writes to the future
+        // performed by previous invocations are visible to us before we form
+        // the mutable reference.
+        //
+        // The future does not move, because it is stored in a field within an
+        // `Arc`, which has a stable heap-allocated address.
         let future = unsafe { Pin::new_unchecked(&mut *this.future.get()) };
 
         // Create a new context from the waker, and poll the future.
@@ -542,10 +643,6 @@ where
             }
             // The job is still pending, and has not yet panicked.
             Ok(Poll::Pending) => {
-                // The fence here ensures that our changes to the future become
-                // visible to the next thread to execute the job and poll the
-                // future.
-                fence(Ordering::Release);
                 // Try to set the state back back idle so other threads can
                 // schedule it again. This will only fail if the job was woken
                 // while running, and is already in the WOKEN state.
@@ -556,15 +653,28 @@ where
                     .state
                     .compare_exchange(LOCKED, READY, Ordering::Relaxed, Ordering::Relaxed)
                     .is_err();
+                // Emit a fence here, which synchronizes with the `Acquire` swap
+                // at the start of this function to ensure that the next thread
+                // to poll this future will observe the most recent version of
+                // it.
+                //
+                // A fence is required here because the write to `state` that
+                // establishes the happens-before relationship may be caused by
+                // either (a) the `compare_exchange` call above, or (b) the
+                // `swap` call in `wake`.
+                //
+                // This fence lets `wake` use `Relaxed` ordering, and upgrades
+                // it to `Release` only when necessary.
+                fence(Ordering::Release);
                 // If the job was woken while running, it should be queued
                 // immediately. Conveniently, we know the state will already be
-                // QUEUED, so we can leave it as it is.
+                // WOKEN, so we can leave it as it is.
                 if rescheduled {
                     // This converts the local `Arc<Self>` into a job ref,
                     // preventing it from being dropped and potentially
                     // extending the job's lifetime.
                     let job_ref = this.into_job_ref();
-                    worker.enqueue(job_ref);
+                    worker.fifo_queue.push_new(job_ref);
                 }
             }
             // The job panicked. Store the panic in the scope so it can be
@@ -589,7 +699,7 @@ where
     /// instance of `Arc<Self>` that is still alive.
     unsafe fn clone_as_waker(this: *const ()) -> RawWaker {
         // SAFETY: This is called on a pointer created by `Arc::into_raw` on an
-        // instance on of `Arc<Self`.
+        // instance of `Arc<Self>`.
         unsafe { Arc::increment_strong_count(this.cast::<Self>()) };
         RawWaker::new(this, &Self::VTABLE)
     }
@@ -602,14 +712,16 @@ where
     /// instance of `Arc<Self>` that is still alive.
     unsafe fn wake(this: *const ()) {
         // SAFETY: This is called on a pointer created by `Arc::into_raw` on an
-        // instance on of `Arc<Self`.
+        // instance of `Arc<Self>`.
         let this = unsafe { Arc::from_raw(this.cast::<Self>()) };
 
         if this.state.swap(WOKEN, Ordering::Relaxed) == READY {
-            this.thread_pool.with_worker(|worker| {
-                // Convert the waker into a job ref and queue it.
-                let job_ref = this.into_job_ref();
-                worker.enqueue(job_ref);
+            // Convert the waker into a job ref and queue it.
+            let thread_pool = this.thread_pool;
+            let job_ref = this.into_job_ref();
+            thread_pool.with_worker(|worker| match worker {
+                Some(worker) => worker.fifo_queue.push_new(job_ref),
+                None => thread_pool.queue_shared_job(job_ref),
             });
         }
     }
@@ -620,20 +732,23 @@ where
     ///
     /// Must be called with a pointer created by calling `Arc::into_raw` on an
     /// instance of `Arc<Self>` that is still alive.
-    fn wake_by_ref(this: *const ()) {
+    unsafe fn wake_by_ref(this: *const ()) {
         // We use manually drop here to prevent us from consuming the arc on
         // drop. This functions like an `&Arc<Self>` rather than an `Arc<Self>`.
         //
         // SAFETY: This is called on a pointer created by `Arc::into_raw` on an
-        // instance on of `Arc<Self`.
+        // instance of `Arc<Self>`.
         let this = unsafe { ManuallyDrop::new(Arc::from_raw(this.cast::<Self>())) };
 
         if this.state.swap(WOKEN, Ordering::Relaxed) == READY {
-            this.thread_pool.with_worker(|worker| {
-                // Clone the waker, convert it into a job-ref and queue it.
-                let this = ManuallyDrop::into_inner(this.clone());
-                let job_ref = this.into_job_ref();
-                worker.enqueue(job_ref);
+            // Clone the waker, convert it into a job-ref and queue it.
+            let this = ManuallyDrop::into_inner(this.clone());
+            let thread_pool = this.thread_pool;
+            let job_ref = this.into_job_ref();
+
+            thread_pool.with_worker(|worker| match worker {
+                Some(worker) => worker.fifo_queue.push_new(job_ref),
+                None => thread_pool.queue_shared_job(job_ref),
             });
         }
     }
@@ -644,12 +759,12 @@ where
     ///
     /// Must be called with a pointer created by calling `Arc::into_raw` on an
     /// instance of `Arc<Self>` that is still alive.
-    fn drop_as_waker(this: *const ()) {
+    unsafe fn drop_as_waker(this: *const ()) {
         // Rather than converting back into an arc, we can just decrement the
         // counter here.
         //
         // SAFETY: This is called on a pointer created by `Arc::into_raw` on an
-        // instance on of `Arc<Self`.
+        // instance of `Arc<Self>`.
         unsafe { Arc::decrement_strong_count(this.cast::<Self>()) };
     }
 }
@@ -670,10 +785,16 @@ mod scope_ptr {
     /// reference scope from being deallocated.
     pub struct ScopePtr<'scope, 'env>(*const Scope<'scope, 'env>);
 
-    // SAFETY: !Send for raw pointers is not for safety, just as a lint.
+    // SAFETY: This is safe because (a) scope-pointer is only used to call
+    // `add_reference`, `remove_reference`, and `store_panic`, all of which are
+    // designed to be thread-safe; and (b) the `Scope` cannot be deallocated
+    // while any `ScopePtr` still points to it (due to reference counting).
     unsafe impl Send for ScopePtr<'_, '_> {}
 
-    // SAFETY: !Sync for raw pointers is not for safety, just as a lint.
+    // SAFETY: This is safe because (a) scope-pointer is only used to call
+    // `add_reference`, `remove_reference`, and `store_panic`, all of which are
+    // designed to be thread-safe; and (b) the `Scope` cannot be deallocated
+    // while any `ScopePtr` still points to it (due to reference counting).
     unsafe impl Sync for ScopePtr<'_, '_> {}
 
     impl<'scope, 'env> ScopePtr<'scope, 'env> {
@@ -730,14 +851,13 @@ mod tests {
     use std::vec;
     use std::vec::Vec;
 
+    use super::Scope;
     use crate::ThreadPool;
     use crate::Worker;
     use crate::scope;
     use crate::unwind;
     use crate::util::XorShift64Star;
 
-    use super::Scope;
-
     /// Tests that empty scopes return properly.
     #[test]
     fn scope_empty() {
@@ -833,7 +953,7 @@ mod tests {
         THREAD_POOL.depopulate();
     }
 
-    /// Tests that we can spawn futures onto the thraed pool and that they can
+    /// Tests that we can spawn futures onto the thread pool and that they can
     /// borrow data as expected.
     #[test]
     fn scope_future() {
@@ -924,7 +1044,7 @@ mod tests {
         let a = AtomicU8::new(0);
         let b = AtomicU8::new(0);
 
-        THREAD_POOL.with_worker(|worker| {
+        THREAD_POOL.on_worker(|worker| {
             scope(|scope| {
                 for _ in 0..NUM_JOBS {
                     scope.spawn_on(worker, |_: &Worker| {
@@ -973,12 +1093,12 @@ mod tests {
 
         let mut completed = false;
 
-        THREAD_POOL.with_worker(|worker| {
+        THREAD_POOL.on_worker(|worker| {
             worker.scope(|scope| {
                 scope.spawn_on(worker, |_: &Worker| {
                     // Creating a new worker instead of reusing the old one is
                     // bad form, but we may as well test it.
-                    THREAD_POOL.with_worker(|worker| {
+                    THREAD_POOL.on_worker(|worker| {
                         worker.scope(|scope| {
                             scope.spawn_on(worker, |_: &Worker| {
                                 completed = true;
@@ -1002,7 +1122,7 @@ mod tests {
         THREAD_POOL.resize_to_available();
 
         let counter_p = &AtomicUsize::new(0);
-        THREAD_POOL.with_worker(|worker| {
+        THREAD_POOL.on_worker(|worker| {
             worker.scope(|scope| {
                 scope.spawn(move |worker: &Worker| {
                     divide_and_conquer(worker, scope, counter_p, 1024)
@@ -1055,7 +1175,7 @@ mod tests {
         static THREAD_POOL: ThreadPool = ThreadPool::new();
         THREAD_POOL.resize_to_available();
 
-        THREAD_POOL.with_worker(|_| {
+        THREAD_POOL.on_worker(|_| {
             let mut tree = random_tree(10, 1337);
             let values: Vec<_> = tree.iter().cloned().collect();
             tree.update(|v| *v += 1);
@@ -1143,7 +1263,7 @@ mod tests {
         static THREAD_POOL: ThreadPool = ThreadPool::new();
         THREAD_POOL.resize_to_available();
 
-        THREAD_POOL.with_worker(|_| {
+        THREAD_POOL.on_worker(|_| {
             let mut max_diff = Mutex::new(0);
             let bottom_of_stack = 0;
             scope(|s| the_final_countdown(s, &bottom_of_stack, &max_diff, 5));
diff --git a/src/thread_pool.rs b/src/thread_pool.rs
index 24017fa..3006db4 100644
--- a/src/thread_pool.rs
+++ b/src/thread_pool.rs
@@ -2,11 +2,13 @@
 
 use alloc::boxed::Box;
 use alloc::format;
-use alloc::string::ToString;
 use alloc::vec::Vec;
+use core::array;
+use core::borrow::Borrow;
 use core::cell::Cell;
 use core::cmp;
 use core::future::Future;
+use core::hint::cold_path;
 use core::marker::PhantomData;
 use core::num::NonZero;
 use core::pin::pin;
@@ -14,16 +16,20 @@ use core::ptr;
 use core::ptr::NonNull;
 use core::task::Context;
 use core::task::Poll;
-use crossbeam_queue::SegQueue;
-use crossbeam_utils::CachePadded;
 
 use async_task::Runnable;
+use crossbeam_queue::SegQueue;
+use crossbeam_utils::CachePadded;
+use st3::StealError;
+use st3::lifo::Stealer;
+use st3::lifo::Worker as Sharer;
 use tracing::debug;
 use tracing::trace;
 use tracing::trace_span;
 
 use crate::FnOnceMarker;
 use crate::FutureMarker;
+use crate::job::ExternalJob;
 use crate::job::HeapJob;
 use crate::job::JobQueue;
 use crate::job::JobRef;
@@ -49,7 +55,8 @@ use crate::util::XorShift64Star;
 /// # use forte::Worker;
 /// static THREAD_POOL: ThreadPool = ThreadPool::new();
 /// ```
-/// Thread pools are empty when created, and must be explicitly resized at runtime.
+/// Thread pools are empty when created, and must be explicitly resized at
+/// runtime.
 /// ```
 /// # use forte::ThreadPool;
 /// # use forte::Worker;
@@ -60,144 +67,58 @@ use crate::util::XorShift64Star;
 /// [`spawn`][ThreadPool::spawn], [`block_on`][ThreadPool::block_on],
 /// [`join`][ThreadPool::join], or [`scope`][ThreadPool::scope].
 pub struct ThreadPool {
-    /// The internal state of the thread pool
-    ///
-    /// This should only be locked infrequently for short periods of time in
-    /// cold functions.
-    state: Mutex<ThreadPoolState>,
-    /// A queue used for cooperatively sharing jobs between workers.
+    /// A bit-set that tracks which seats are occupied.
+    occupied: CachePadded<AtomicU32>,
+    /// A bit-set that tracks which seats are sleeping.
+    sleeping: CachePadded<AtomicU32>,
+    /// Holds shared data for each thread participating in the pool.
+    seats: OnceLock<Box<Seats>>,
+    /// Holds controls for threads spawned and managed by the pool. Initialized
+    /// on first call to `occupy`, to allow for some non-static constructors.
+    managed_threads: Mutex<ManagedThreads>,
+    /// Used to inject external work into the thread pool. This is generally
+    /// treated as a fallback, for when the thread-pool is at capacity and
+    /// threads can't register themselves as workers.
     shared_jobs: SegQueue<JobRef>,
-    /// A condvar that is used to signal a new worker taking a lease on a seat.
-    start_heartbeat: Condvar,
-    /// Tracks the number of currently sleeping workers. Incremented when a
-    /// worker goes to sleep, decremented when a worker is woken.
-    num_sleeping: AtomicU32,
-}
-
-/// The internal state of a thread pool.
-struct ThreadPoolState {
-    /// The registry of seats. These seats may be "leased out" to different
-    /// threads temporarily, and will be re-used. The seats themselves are
-    /// leaked, and will never move or be deallocated.
-    seats: Vec<Seat>,
-    /// Threads managed directly by this thread pool.
-    managed_threads: ManagedThreads,
 }
 
-impl ThreadPoolState {
-    /// Claims a lease on the thread pool. A lease can be passed to
-    /// [`Worker::occupy`] to enter a worker context for the thread pool.
-    ///
-    /// There are a finite number of leases available on each pool. If they are
-    /// already claimed, this returns `None`.
-    fn claim_lease(&mut self, thread_pool: &'static ThreadPool) -> Lease {
-        // First try to claim an unoccupied seat.
-        for (index, seat) in self.seats.iter_mut().enumerate() {
-            if !seat.occupied {
-                seat.occupied = true;
-                return Lease {
-                    thread_pool,
-                    index,
-                    seat_data: seat.data,
-                };
-            }
-        }
-
-        // If none are available, add a new seat.
-        let index = self.seats.len();
-        let seat_data = Box::leak(Box::new(SeatData {
-            #[cfg(not(feature = "shuttle"))]
-            heartbeat: AtomicBool::new(true).into(),
-            sleep_controller: SleepController::new(&thread_pool.num_sleeping),
-        }));
-        let seat = Seat {
-            occupied: true,
-            data: seat_data,
-        };
-        self.seats.push(seat);
-        Lease {
-            thread_pool,
-            index,
-            seat_data,
-        }
-    }
-
-    /// Attempts to claim several leases at once. See
-    /// [`ThreadPool::claim_lease`] for more information. If no leases are
-    /// available, this returns an empty vector.
-    fn claim_leases(&mut self, thread_pool: &'static ThreadPool, num: usize) -> Vec<Lease> {
-        let mut leases = Vec::with_capacity(num);
-
-        // First try to claim unoccupied seats.
-        for (index, seat) in self.seats.iter_mut().enumerate() {
-            if leases.len() == num {
-                return leases;
-            }
-
-            if !seat.occupied {
-                seat.occupied = true;
-                leases.push(Lease {
-                    thread_pool,
-                    index,
-                    seat_data: seat.data,
-                });
-            }
-        }
-
-        // Then create new seats as needed.
-        while leases.len() != num {
-            let index = self.seats.len();
-            let seat_data = Box::leak(Box::new(SeatData {
-                #[cfg(not(feature = "shuttle"))]
-                heartbeat: AtomicBool::new(true).into(),
-                sleep_controller: SleepController::new(&thread_pool.num_sleeping),
-            }));
-            let seat = Seat {
-                occupied: true,
-                data: seat_data,
-            };
-            self.seats.push(seat);
-            leases.push(Lease {
-                thread_pool,
-                index,
-                seat_data,
-            });
-        }
-
-        leases
-    }
+/// A public interface that can be temporarily claimed and used by a thread.
+/// Claiming a seat allows a thread to participate in the thread pool as a
+/// worker.
+pub(crate) struct Seats {
+    /// The sharing side of each seat's work-stealing queue. These should only
+    /// ever be accessed by the thread that currently owns the lease for this
+    /// seat (to ensure the `!Sync` bound is respected).
+    sharers: [Sharer<JobRef>; 32],
+    /// The stealing side of each seat's work-stealing queue.
+    stealers: [Stealer<JobRef>; 32],
+    /// The sleep/wake controller for each seat.
+    sleep_controllers: [SleepController; 32],
 }
 
-#[derive(Clone)]
-struct Seat {
-    occupied: bool,
-    data: &'static SeatData,
-}
-
-/// A public interface that can be claimed and used by a worker.
-struct SeatData {
-    /// The heartbeat signal sent to the worker.
-    #[cfg(not(feature = "shuttle"))]
-    heartbeat: CachePadded<AtomicBool>,
-    /// Allows other threads to wake the worker.
-    sleep_controller: SleepController,
-}
+// SAFETY: `stealers` are `Send + Sync` by their own bounds. `sharers[i]` is
+// only ever accessed by the single thread holding seat `i`'s occupancy lease;
+// the `occupied` bitmask in `ThreadPool` enforces that exclusivity.
+unsafe impl Sync for Seats {}
 
 /// A lease represents ownership of one of a "seats" in a thread pool, and
 /// allows the owning thread to participate in that pool as a worker.
 pub struct Lease {
     /// The thread pool against which this lease is held.
     thread_pool: &'static ThreadPool,
-    /// The index of the claimed seat.
-    index: usize,
-    /// The seat being claimed by this lease.
-    seat_data: &'static SeatData,
+    /// The index of the seat in the data list
+    seat_number: usize,
+    /// A reference to the pre-initialized seat data (to avoid repeated hits of
+    /// the `OnceLock`).
+    seats: &'static Seats,
 }
 
 impl Drop for Lease {
     fn drop(&mut self) {
-        let mut state = self.thread_pool.state.lock().unwrap();
-        state.seats[self.index].occupied = false;
+        // Unset the occupied bit for this seat
+        self.thread_pool
+            .occupied
+            .fetch_and(!(1 << self.seat_number), Ordering::Relaxed);
     }
 }
 
@@ -205,16 +126,13 @@ impl Drop for Lease {
 struct ManagedThreads {
     /// Stores thread controls for workers spawned by the pool.
     workers: Vec<ManagedWorker>,
-    /// Stores thread controls for the heartbeat thread.
-    #[cfg(not(feature = "shuttle"))]
-    heartbeat: Option<ThreadControl>,
 }
 
 /// Represents a worker thread that is managed by the pool, as opposed to
 /// external threads which temporarily participate in the pool.
 struct ManagedWorker {
     /// The index of this worker in the public worker info list.
-    index: usize,
+    seat_number: usize,
     /// Controls used to manage the lifecycle of the worker.
     control: ThreadControl,
 }
@@ -234,28 +152,114 @@ struct ThreadControl {
 impl ThreadPool {
     /// Creates a new thread pool.
     pub const fn new() -> ThreadPool {
+        // Create the pool itself.
         ThreadPool {
-            state: Mutex::new(ThreadPoolState {
-                seats: Vec::new(),
-                managed_threads: ManagedThreads {
-                    workers: Vec::new(),
-                    #[cfg(not(feature = "shuttle"))]
-                    heartbeat: None,
-                },
+            seats: OnceLock::new(),
+            occupied: CachePadded::new(AtomicU32::new(0)),
+            sleeping: CachePadded::new(AtomicU32::new(0)),
+            managed_threads: Mutex::new(ManagedThreads {
+                workers: Vec::new(),
             }),
             shared_jobs: SegQueue::new(),
-            start_heartbeat: Condvar::new(),
-            num_sleeping: AtomicU32::new(0),
         }
     }
 
+    /// Returns the pre-allocated steal queues, initializing them on the first call.
+    fn get_seats(&'static self) -> &'static Seats {
+        self.seats.get_or_init(|| {
+            let sharers: [Sharer<JobRef>; 32] =
+                array::from_fn(|_| Sharer::new(Worker::STEAL_QUEUE_CAPACITY));
+            let stealers: [Stealer<JobRef>; 32] = array::from_fn(|i| sharers[i].stealer());
+            let sleep_controllers = array::from_fn(|_| SleepController::new());
+            Box::new(Seats {
+                sharers,
+                stealers,
+                sleep_controllers,
+            })
+        })
+    }
+
+    /// Adds a job ref to the shared queue.
+    pub fn queue_shared_job(&'static self, job_ref: JobRef) {
+        self.shared_jobs.push(job_ref);
+    }
+
     /// Claims a lease on the thread pool which can be occupied by a worker
     /// (using [`Worker::occupy`]), allowing a thread to participate in the pool.
+    ///
+    /// Returns `None` if all seats are occupied.
     #[cold]
-    pub fn claim_lease(&'static self) -> Lease {
-        self.start_heartbeat.notify_one();
-        let mut state = self.state.lock().unwrap();
-        state.claim_lease(self)
+    pub fn claim_lease(&'static self) -> Option<Lease> {
+        loop {
+            let occupied = self.occupied.load(Ordering::Relaxed);
+            if occupied == u32::MAX {
+                return None;
+            }
+            let seat_number = occupied.trailing_ones() as usize;
+            let mask = 1 << seat_number;
+            if self.occupied.fetch_or(mask, Ordering::Relaxed) & mask == 0 {
+                // At this point we have acquired the lease on the seat
+                return Some(Lease {
+                    thread_pool: self,
+                    seat_number,
+                    seats: self.get_seats(),
+                });
+            }
+        }
+    }
+
+    /// Claims up to `n` leases at once in a single atomic transaction.
+    ///
+    /// Finds up to `n` free seats, then atomically claims all of them with a
+    /// single `compare_exchange`. Either every selected seat is claimed together
+    /// or none are (and the loop retries). Returns between 0 and `n` leases;
+    /// returns an empty `Vec` when `n` is 0 or the pool is full.
+    #[cold]
+    pub fn claim_leases(&'static self, n: usize) -> Vec<Lease> {
+        if n == 0 {
+            return Vec::new();
+        }
+        let seats = self.get_seats();
+        loop {
+            let occupied = self.occupied.load(Ordering::Relaxed);
+            if occupied == u32::MAX {
+                return Vec::new();
+            }
+
+            // Build a mask of up to `n` free seats by walking the complement.
+            let mut claimed_seats = 0;
+            let mut free_seats = !occupied;
+            for _ in 0..n {
+                if free_seats == 0 {
+                    break;
+                }
+                let seat_bit = free_seats & free_seats.wrapping_neg(); // isolate lowest set bit
+                claimed_seats |= seat_bit;
+                free_seats &= !seat_bit;
+            }
+
+            // Attempt to claim all selected seats in one atomic step.
+            match self.occupied.compare_exchange(
+                occupied,
+                occupied | claimed_seats,
+                Ordering::Relaxed,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => {
+                    return (0..32)
+                        .filter(|&i| claimed_seats & (1 << i) != 0)
+                        .map(|seat_number| Lease {
+                            thread_pool: self,
+                            seat_number: seat_number as usize,
+                            seats,
+                        })
+                        .collect();
+                }
+                Err(_) => {
+                    // Another thread modified `occupied`; retry.
+                }
+            }
+        }
     }
 
     /// Returns an opaque identifier for this thread pool.
@@ -268,40 +272,7 @@ impl ThreadPool {
     /// Returns the number of workers participating in this thread pool.
     #[inline(always)]
     pub fn num_workers(&self) -> usize {
-        todo!()
-    }
-
-    /// Tries to ensure the calling thread is a member of the thread pool, and
-    /// then executes the provided closure. If the thread is already a member of
-    /// the pool, the closure is called directly. Otherwise, the thread will
-    /// attempt to temporarily register itself with the pool (which can be
-    /// slightly slower). If registration fails (because the pool is full to
-    /// capacity) the closure is passed `None` instead of a worker instance.
-    ///
-    /// The provided closure is never sent to another thread.
-    #[inline(always)]
-    pub fn with_worker<F, R>(&'static self, f: F) -> R
-    where
-        F: FnOnce(&Worker) -> R,
-    {
-        Worker::with_current(|worker| match worker {
-            Some(worker) if worker.lease.thread_pool.id() == self.id() => f(worker),
-            _ => self.with_worker_cold(f),
-        })
-    }
-
-    /// Tries to register the calling thread on the thread pool, and pass a
-    /// worker instance to the provided closure.
-    ///
-    /// This is the slow fallback for `with_worker` covering "external calls"
-    /// from outside the pool. Never call this directly.
-    #[cold]
-    fn with_worker_cold<F, R>(&'static self, f: F) -> R
-    where
-        F: FnOnce(&Worker) -> R,
-    {
-        let lease = self.state.lock().unwrap().claim_lease(self);
-        Worker::occupy(lease, f)
+        self.occupied.load(Ordering::Relaxed).count_ones() as usize
     }
 }
 
@@ -315,8 +286,8 @@ impl ThreadPool {
     ///
     /// See [`ThreadPool::resize`] for more information about resizing.
     pub fn resize_to_available(&'static self) -> usize {
-        let available = available_parallelism().map(NonZero::get).unwrap_or(1);
-        let available = available.saturating_sub(2);
+        let mut available = available_parallelism().map(NonZero::get).unwrap_or(1);
+        available = available.saturating_sub(1);
         self.resize_to(available)
     }
 
@@ -369,7 +340,7 @@ impl ThreadPool {
 
     /// Resizes the pool, and returns the new size.
     ///
-    /// Not that the new size may be different from the size requested.
+    /// Note that the new size may be different from the size requested.
     #[cold]
     pub fn resize<F>(&'static self, get_size: F) -> usize
     where
@@ -380,14 +351,13 @@ impl ThreadPool {
         // Resizing a pool is a critical section; only one thread can resize the
         // pool at a time. This is implemented using a mutex on the thread manager.
         trace!("locking state");
-        let mut state = self.state.lock().unwrap();
+        let mut managed_threads = self.managed_threads.lock().unwrap();
 
         // Compute the new size of the pool, given the current size.
-        let current_size = state.managed_threads.workers.len();
+        let current_size = managed_threads.workers.len();
 
-        // You are only allowed to spawn managed threads for up to half the total number of workers,
-        // to leave room for non-managed threads. By default, this means at most 16 workers can be managed.
-        let mut new_size = get_size(current_size);
+        // Calculate the new size of the pool (counting only managed workers).
+        let new_size = get_size(current_size);
 
         trace!(
             "attempting to resize thread pool from {} to {} thread(s)",
@@ -401,98 +371,56 @@ impl ThreadPool {
             }
             // The size increased
             cmp::Ordering::Greater => {
-                // Acquire leases for the new threads.
-                trace!("locking worker leases");
-                let new_leases = state.claim_leases(self, new_size - current_size);
-                new_size = current_size + new_leases.len(); // Scale back the new size to what we can actually spawn.
-                trace!("acquired leases for {} new threads", new_size);
-
-                // When not in shuttle, start the heartbeat thread if scaling up from zero.
-                #[cfg(not(feature = "shuttle"))]
-                if new_size > 0 && current_size == 0 {
-                    debug!("spawning heartbeat runner");
-                    let halt = Arc::new(AtomicBool::new(false));
-                    let heartbeat_halt = halt.clone();
-                    let handle = ThreadBuilder::new()
-                        .name("heartbeat".to_string())
-                        .spawn(move || {
-                            heartbeat_loop(self, heartbeat_halt);
-                        })
-                        .unwrap();
-                    let control = ThreadControl { halt, handle };
-                    state.managed_threads.heartbeat = Some(control);
-                }
-
-                let barrier = Arc::new(Barrier::new(new_leases.len() + 1));
-
                 // Spawn the new workers.
-                for lease in new_leases {
-                    let index = lease.index;
-                    debug!("spawning managed worker with index {}", index);
+                let leases = self.claim_leases(new_size - current_size);
+                for lease in leases {
+                    let seat_number = lease.seat_number;
+                    debug!("spawning managed worker for seat number {}", seat_number);
                     let halt = Arc::new(AtomicBool::new(false));
                     let worker_halt = halt.clone();
-                    let worker_barrier = barrier.clone();
                     let handle = ThreadBuilder::new()
-                        .name(format!("worker {index}"))
+                        .name(format!("worker {seat_number}"))
                         .spawn(move || {
-                            managed_worker(lease, worker_halt, worker_barrier);
+                            managed_worker(lease, worker_halt);
                         })
                         .unwrap();
                     let control = ThreadControl { halt, handle };
-                    state
-                        .managed_threads
-                        .workers
-                        .push(ManagedWorker { index, control });
+                    managed_threads.workers.push(ManagedWorker {
+                        seat_number,
+                        control,
+                    });
                 }
 
-                drop(state);
-
-                // Wait for the threads to start.
-                barrier.wait();
+                drop(managed_threads);
             }
             // The size decreased
             cmp::Ordering::Less => {
                 // Pull the workers we intend to halt out of the thread manager.
-                let terminating_workers = state.managed_threads.workers.split_off(new_size);
-
-                // Halt the heartbeat thread when scaling to zero.
-                #[cfg(not(feature = "shuttle"))]
-                let heartbeat_control = if new_size == 0 {
-                    state.managed_threads.heartbeat.take()
-                } else {
-                    None
-                };
+                let terminating_workers = managed_threads.workers.split_off(new_size);
 
                 // Terminate and wake the workers.
+                let seats = self.get_seats();
                 for worker in &terminating_workers {
                     // Tell the worker to halt.
                     worker.control.halt.store(true, Ordering::Relaxed);
                     // Wake the worker up.
-                    state.seats[worker.index].data.sleep_controller.wake();
+                    seats.sleep_controllers[worker.seat_number].wake();
                 }
 
                 // Drop the lock on the state so as not to block the workers or heartbeat.
-                drop(state);
+                drop(managed_threads);
 
                 // Determine our seat index.
-                let own_seat = Worker::map_current(|worker| worker.lease.index);
+                let own_seat_number = Worker::map_current(|worker| worker.lease.seat_number);
 
                 // Wait for the other workers to fully halt.
                 for worker in terminating_workers {
                     // It's possible we may be trying to terminate ourselves, in
                     // which case we can skip the thread-join.
-                    if Some(worker.index) != own_seat {
+                    if Some(worker.seat_number) != own_seat_number {
                         let _ = worker.control.handle.join();
                     }
                 }
-
-                // If we took control of the heartbeat, halt it after the workers.
-                #[cfg(not(feature = "shuttle"))]
-                if let Some(control) = heartbeat_control {
-                    control.halt.store(true, Ordering::Relaxed);
-                    self.start_heartbeat.notify_one();
-                    let _ = control.handle.join();
-                }
             }
         }
 
@@ -501,14 +429,126 @@ impl ThreadPool {
     }
 }
 
+// -----------------------------------------------------------------------------
+// Thread pool worker access
+
+impl ThreadPool {
+    /// Runs the closure on a thread-pool worker.
+    ///
+    /// If this thread is not a worker, it will try to register itself as one.
+    /// If the thread pool is full, the closure is sent to another worker as a
+    /// job, and this thread is parked.
+    ///
+    /// If your closure is `!Send`, use [`with_worker`][ThreadPool::with_worker]
+    /// instead.
+    #[inline(always)]
+    pub fn on_worker<F, R>(&'static self, f: F) -> R
+    where
+        F: FnOnce(&Worker) -> R + Send,
+        R: Send,
+    {
+        self.with_worker(|worker| match worker {
+            Some(worker) => f(worker),
+            None => {
+                let mut job = ExternalJob::new(f);
+                // SAFETY: `ExternalJob::as_job_ref` requires:
+                //
+                // * The `ExternalJob` must not move or be deallocated until the
+                //   `JobRef` is executed.
+                //
+                // * The `JobRef` does not outlive any data the `ExternalJob` closes over.
+                //
+                // * `as_job_ref` is not called again while `JobRef` lives.
+                //
+                // The `ExternalJob` is a stack-allocated variable. After
+                // calling `as_job_ref`, we never move `job`, and we wait for
+                // the job to execute by calling `job.wait_for_value`. Only
+                // after that returns do we allow the `job` to be dropped. This
+                // also means that any data closed over by the `ExternalJob`
+                // must outlive the `JobRef`.
+                //
+                // Also, `as_job_ref` is plainly called only once.
+                let job_ref = unsafe { job.as_job_ref() };
+                self.queue_shared_job(job_ref);
+                // SAFETY: `wait_for_value` must be called at most once. This is
+                // the only call site for this particular `job`, which is a
+                // stack-local variable.
+                let result = unsafe { job.wait_for_value() };
+                match result {
+                    Ok(value) => value,
+                    Err(error) => unwind::resume_unwinding(error),
+                }
+            }
+        })
+    }
+
+    /// Runs the closure on a thread-pool worker.
+    ///
+    /// If this thread is not a worker, it will try to register itself as one.
+    /// If the thread pool is full, this panics.
+    ///
+    /// If you don't want to panic, use [`on_worker`][ThreadPool::on_worker] or
+    /// [`with_worker`][ThreadPool::with_worker] instead.
+    #[inline(always)]
+    #[track_caller]
+    pub fn expect_worker<F, R>(&'static self, f: F) -> R
+    where
+        F: FnOnce(&Worker) -> R,
+    {
+        self.with_worker(|worker| match worker {
+            Some(worker) => f(worker),
+            None => panic!("thread pool full; not able to access worker"),
+        })
+    }
+
+    /// Runs the closure on a thread-pool worker.
+    ///
+    /// If this thread is currently acting as a worker for the thread-pool, this
+    /// just looks that worker up. If it is not registered as a worker, or the
+    /// thread's worker is registered with a different thread pool, the thread
+    /// will try to register itself with the correct pool. If the thread pool is
+    /// full, it passes the closure `None`.
+    ///
+    /// The provided closure is never sent to another thread. If your closure is
+    /// `Send`, consider using [`on_worker`][ThreadPool::on_worker] instead.
+    #[inline(always)]
+    pub fn with_worker<F, R>(&'static self, f: F) -> R
+    where
+        F: FnOnce(Option<&Worker>) -> R,
+    {
+        Worker::with_current(|worker| match worker {
+            Some(worker) if worker.lease.thread_pool.id() == self.id() => f(Some(worker)),
+            _ => self.with_worker_cold(f),
+        })
+    }
+
+    /// Tries to register the calling thread on the thread pool, and pass a
+    /// worker instance to the provided closure.
+    ///
+    /// This is the slow fallback for `with_worker` covering "external calls"
+    /// from outside the pool. Never call this directly.
+    #[cold]
+    fn with_worker_cold<F, R>(&'static self, f: F) -> R
+    where
+        F: FnOnce(Option<&Worker>) -> R,
+    {
+        match self.claim_lease() {
+            Some(lease) => Worker::occupy(lease, |worker| f(Some(worker))),
+            None => f(None),
+        }
+    }
+}
+
 // -----------------------------------------------------------------------------
 // Generalized spawn trait
 
-/// A trait for types that can be spawned onto a [`ThreadPool`]. It is implemented for:
+/// A trait for types that can be spawned onto a [`ThreadPool`].
 ///
-/// + Closures that satisfy `for<'worker> FnOnce(&'worker Worker) + Send + 'static`.
+/// It is implemented for:
 ///
-/// + Futures that satisfy `Future<Output = T> + Send + 'static` where `T: Send + 'static`.
+/// * Closures that satisfy `for<'worker> FnOnce(&'worker Worker) + Send + 'static`.
+///
+/// * Futures that satisfy `Future<Output = T> + Send + 'static` where `T: Send + 'static`.
 ///
 /// Due to a bug in rustc, you may be given errors when using closures
 /// with inferred types. If you encounter the following:
@@ -518,7 +558,7 @@ impl ThreadPool {
 /// # use forte::Worker;
 /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
 /// THREAD_POOL.spawn(|_| { });
-/// //                ^^^^^^^ the trait `Spawn<'_, _>` is not implemented for closure ...
+/// //                ^^^^^^^ ERROR: the trait `Spawn<'_, _>` is not implemented for closure ...
 /// ```
 /// Try adding a type hint to the closure's parameters, like so:
 /// ```
@@ -558,7 +598,7 @@ where
 
         // Queue the job for evaluation
         if let Some(worker) = worker {
-            worker.enqueue(job_ref);
+            worker.fifo_queue.push_new(job_ref);
         } else {
             // Push the work into the share queue and wake a worker
             thread_pool.shared_jobs.push(job_ref);
@@ -568,11 +608,11 @@ where
 
 pub type Task<T> = async_task::Task<T, &'static ThreadPool>;
 
-// Schedules a runnable future as a job.
-//
-// Async-task prefers that this is a static function, rather than a closure,
-// which is why this is a separate function that pulls the thread pool from the
-// runnable metadata.
+/// Schedules a runnable future as a job.
+///
+/// Async-task prefers that this is a static function, rather than a closure,
+/// which is why this is a separate function that pulls the thread pool from the
+/// runnable metadata.
 fn schedule_runnable(runnable: Runnable<&'static ThreadPool>) {
     // Get a ref to the thread pool from the runnable.
     let thread_pool = *runnable.metadata();
@@ -588,16 +628,19 @@ fn schedule_runnable(runnable: Runnable<&'static ThreadPool>) {
     let job_ref = unsafe { JobRef::new_raw(job_pointer, execute_runnable) };
 
     // Send this job off to be executed.
-    thread_pool.with_worker(|worker| {
-        worker.enqueue(job_ref);
+    thread_pool.with_worker(|worker| match worker {
+        Some(worker) => worker.fifo_queue.push_new(job_ref),
+        None => thread_pool.shared_jobs.push(job_ref),
     });
 }
 
-// Executes a raw pointer to a runnable future.
+/// Executes a raw pointer to a runnable future.
 #[inline(always)]
 fn execute_runnable(this: NonNull<()>, _worker: &Worker) {
-    // SAFETY: This pointer was created by the call to `Runnable::into_raw` just above.
-    let runnable = unsafe { Runnable::<()>::from_raw(this) };
+    // SAFETY: This pointer was created by `Runnable::into_raw` in
+    // `schedule_runnable` with type parameter `&'static ThreadPool`, and
+    // `from_raw` is called at most once.
+    let runnable = unsafe { Runnable::<&'static ThreadPool>::from_raw(this) };
     // Poll the task. This will drop the future if the task is
     // canceled or the future completes.
     runnable.run();
@@ -656,11 +699,10 @@ impl ThreadPool {
         F: Future<Output = T> + Send,
         T: Send,
     {
-        self.with_worker(|worker| worker.block_on(future))
+        self.on_worker(|worker| worker.block_on(future))
     }
 
-    /// Executes the two closures, possibly in parallel, and returns the
-    /// results.
+    /// Executes the two closures, possibly in parallel.
     ///
     /// See also: [`Worker::join`] and [`join`].
     #[inline(always)]
@@ -671,7 +713,7 @@ impl ThreadPool {
         RA: Send,
         RB: Send,
     {
-        self.with_worker(|worker| worker.join(a, b))
+        self.on_worker(|worker| worker.join(a, b))
     }
 
     /// Creates a scope onto which non-static work can be spawned.
@@ -681,9 +723,10 @@ impl ThreadPool {
     #[inline(always)]
     pub fn scope<'env, F, T>(&'static self, f: F) -> T
     where
-        F: for<'scope> FnOnce(&'scope Scope<'scope, 'env>) -> T,
+        F: for<'scope> FnOnce(&'scope Scope<'scope, 'env>) -> T + Send,
+        T: Send,
     {
-        self.with_worker(|worker| worker.scope(f))
+        self.on_worker(|worker| worker.scope(f))
     }
 }
 
@@ -694,12 +737,10 @@ thread_local! {
     static WORKER_PTR: Cell<*const Worker> = const { Cell::new(ptr::null()) };
 }
 
-/// Holds the local context for a thread pool member, which allows queuing,
-/// executing, and sharing jobs on the pool.
+/// Represents membership in a thread pool.
 ///
-/// Workers are the recommended way to interface with a thread pool. To get
-/// access to worker for a given thread pool, users should call
-/// [`ThreadPool::with_worker`].
+/// To get access to a worker for a given thread pool, users should call
+/// [`ThreadPool::with_worker`], [`ThreadPool::on_worker`], or [`ThreadPool::expect_worker`].
 ///
 /// Every thread has at most one worker at a time. If a worker has already been
 /// set up, it may be accessed at any time by calling [`Worker::with_current`].
@@ -716,9 +757,22 @@ thread_local! {
 pub struct Worker {
     migrated: Cell<bool>,
     lease: Lease,
-    queue: JobQueue,
+    /// A sequence of jobs waiting to be executed. Newer jobs are executed
+    /// before older ones, allowing efficient depth-first execution. During
+    /// promotion, the oldest job is shared. Populated by `join()`.
+    ///
+    /// Jobs in this queue take precedence over those in the fifo queue.
+    lifo_queue: JobQueue,
+    /// A sequence of jobs waiting to be executed. Older jobs are executed
+    /// before newer ones, providing reliably low latency. During promotion,
+    /// this queue is partitioned into chunks and the chunks are shared.
+    /// Populated by `spawn()`.
+    ///
+    /// Jobs in this queue are executed only when the lifo queue is empty.
+    pub(crate) fifo_queue: JobQueue,
     rng: XorShift64Star,
-    // Make non-send
+    last_promote_tick: Cell<u64>,
+    // Make non-send.
     _phantom: PhantomData<*const ()>,
 }
 
@@ -746,17 +800,19 @@ impl Worker {
     {
         trace!("occupying lease");
 
-        let span = trace_span!("occupy", lease = lease.index);
+        let span = trace_span!("occupy", seat_number = lease.seat_number);
         let _enter = span.enter();
 
         // Create a new worker to occupy the lease. Note: It's potentially a
         // problem that the same thread can occupy multiple workers on the same
-        // thread. We many eventually need to design something to prevent this.
+        // thread. We may eventually need to design something to prevent this.
         let worker = Worker {
             migrated: Cell::new(false),
             lease,
-            queue: JobQueue::new(),
+            fifo_queue: JobQueue::new(),
+            lifo_queue: JobQueue::new(),
             rng: XorShift64Star::new(),
+            last_promote_tick: Cell::new(0),
             _phantom: PhantomData,
         };
 
@@ -767,9 +823,8 @@ impl Worker {
         // and pass in a worker reference directly.
         let result = f(&worker);
 
-        // Execute the work queue until it's empty. This happens to be pulled in
-        // LIFO order, but it's fairly arbitrary.
-        while let Some(job_ref) = worker.queue.pop_newest() {
+        // Finish executing local work before shutting down.
+        while let Some(job_ref) = worker.find_local_work() {
             worker.execute(job_ref, false);
         }
 
@@ -783,6 +838,13 @@ impl Worker {
         result
     }
 
+    /// Returns a reference to the push-side `Sharer` queue for this
+    /// worker's seat.
+    #[inline(always)]
+    fn sharer(&self) -> &Sharer<JobRef> {
+        &self.lease.seats.sharers[self.lease.seat_number]
+    }
+
     /// Calls the provided closure on the thread's worker instance, if it has one.
     ///
     /// Rust's thread locals are fairly costly, so this function is expensive.
@@ -794,16 +856,20 @@ impl Worker {
     {
         let worker_ptr = WORKER_PTR.with(Cell::get);
         if !worker_ptr.is_null() {
-            // SAFETY: The `WORKER` static is only set by `occupy`, and it's
-            // always set to a stack-allocated `Worker` which is never moved and
-            // is only accessed through shared references. Therefore, if the
-            // pointer is non-null, it must be safe to dereference.
+            // SAFETY: `WORKER_PTR` is a thread-local `Cell` holding a raw
+            // pointer to a `Worker`. It is only written to by `Worker::occupy`,
+            // which stores the address of a `Worker` allocated within its own
+            // stack frame. Before it returns, `occupy` restores the previous
+            // value of `WORKER_PTR`, so that it is always either null or points
+            // to a live, immovable `Worker` on the current thread's call stack
+            // (but is never left dangling).
             //
-            // This creates a reference with an unbounded lifetime. To avoid
-            // turning it into a `'static`, we pass it in to a closure. This
-            // restricts its lifetime to the closure body, and prevents callers
-            // from keeping around references to Workers that will be
-            // deallocated when `occupy` returns.
+            // If the pointer is non-null, it is therefore valid to dereference
+            // as a shared reference. Forming a `'static` reference is avoided
+            // by passing the value into a closure, which bounds the reference's
+            // lifetime to the closure body and prevents callers from retaining
+            // it past the point where `occupy` returns and the `Worker` is
+            // freed.
             Some(f(unsafe { &*worker_ptr }))
         } else {
             None
@@ -828,7 +894,7 @@ impl Worker {
             //
             // This creates a reference with an unbounded lifetime. To avoid
             // turning it into a `'static`, we pass it in to a closure. This
-            // restricts it's lifetime to the closure body, and prevents callers
+            // restricts its lifetime to the closure body, and prevents callers
             // from keeping around references to Workers that will be
             // deallocated when `occupy` returns.
             f(Some(unsafe { &*worker_ptr }))
@@ -839,8 +905,8 @@ impl Worker {
 
     /// Returns the index of the worker in the leases list.
     #[inline(always)]
-    pub fn index(&self) -> usize {
-        self.lease.index
+    pub fn seat_number(&self) -> usize {
+        self.lease.seat_number
     }
 
     /// Returns the index of the thread pool of the worker.
@@ -849,66 +915,105 @@ impl Worker {
         self.lease.thread_pool
     }
 
-    /// Pushes a job onto the local queue, overflowing to the shared queue when
-    /// full.
-    #[inline(always)]
-    pub fn enqueue(&self, job_ref: JobRef) {
-        if let Some(job_ref) = self.queue.push(job_ref) {
-            // push the work to the shared queue
-            self.lease.thread_pool.shared_jobs.push(job_ref);
-        }
-    }
+    /// Capacity of the per-worker work-stealing queue. This is the maximum
+    /// amount a worker can make available for stealing at once.
+    const STEAL_QUEUE_CAPACITY: usize = 32;
+
+    /// The minimum number of CPU ticks between calls to [`Worker::promote_cold`].
+    /// Approximately 5μs at 3 GHz.
+    const PROMOTE_TICK_INTERVAL: u64 = 15_000;
 
     /// Try to promote the oldest task in the queue.
     #[inline(always)]
     fn promote(&self) {
-        // Check for a heartbeat, potentially promoting the job we just pushed
-        // to a shared job.
-        #[cfg(not(feature = "shuttle"))]
-        let heartbeat = self.lease.seat_data.heartbeat.load(Ordering::Relaxed);
-
-        #[cfg(feature = "shuttle")]
-        let heartbeat = true; // thread_rng().gen_bool(0.5);
-
-        if heartbeat && let Some(job_ref) = self.queue.pop_oldest() {
-            self.promote_cold(job_ref);
-            #[cfg(not(feature = "shuttle"))]
-            self.lease
-                .seat_data
-                .heartbeat
-                .store(false, Ordering::Relaxed);
+        // Promotions are fairly costly, so we limit their frequency using the
+        // CPU's tick counter. Promote is called at a high frequency, and
+        // actually doing the promotion is probably a cold path.
+        let current_tick = tick_counter::start();
+        if current_tick.wrapping_sub(self.last_promote_tick.get()) >= Self::PROMOTE_TICK_INTERVAL {
+            // This should ideally become a conditional jump.
+            self.promote_cold(current_tick);
         }
     }
 
-    /// Pushes work onto the shared queue and wakes another worker.
+    /// The actual work-promotion implementation. Must be called infrequently.
     #[cold]
-    fn promote_cold(&self, job_ref: JobRef) {
-        // Push the job onto the shared queue.
-        self.lease.thread_pool.shared_jobs.push(job_ref);
-
-        // Fetch the number of sleeping workers and pending shared tasks
-        let num_sleeping = self.lease.thread_pool.num_sleeping.load(Ordering::Relaxed);
+    fn promote_cold(&self, current_tick: u64) {
+        // Update the promote tick so that `promote` won't call this again soon.
+        self.last_promote_tick.set(current_tick);
 
-        if num_sleeping == 0 {
+        // Early out if it seems like all workers are already awake.
+        let sleeping = self.lease.thread_pool.sleeping.load(Ordering::Relaxed);
+        if sleeping == 0 {
             return;
         }
+        cold_path();
+
+        // Track if we actually managed to share work.
+        let mut shared_job = false;
+
+        // Share work from the lifo queue. This is shared bit-by-bit, with old
+        // (and therefore theoretically "large") tasks shared first.
+        if let Some(job_ref) = self.lifo_queue.pop_oldest() {
+            // Push into our own steal queue so siblings can steal it.
+            if let Err(job_ref) = self.sharer().push(job_ref) {
+                // If the queue is full, that indicates that the pool is
+                // probably under high load and we should continue local-first
+                // operation.
+                self.lifo_queue.push_old(job_ref);
+            } else {
+                shared_job = true;
+            }
+        }
 
-        // Try to wake a worker to work on it.
-        //
-        // Note: This operation is extremely expensive, and should be avoided if possible.
-        let seats = self.lease.thread_pool.state.lock().unwrap().seats.clone();
-        let num_seats = seats.len();
-        let offset = self.rng.next_usize(num_seats);
-        for i in 0..num_seats {
-            let i = (i + offset) % num_seats;
-            if i == self.lease.index {
-                continue;
+        // Share work from the fifo queue. Offload the newest jobs in a series of
+        // small chunks.
+        for job_refs in self.fifo_queue.split() {
+            // Create a new job that will insert a chunk of jobs into the
+            // runner's fifo queue when executed.
+            //
+            // This reduces the cost of sharing a large number of small jobs.
+            let batch_job = HeapJob::new(move |worker| {
+                worker.fifo_queue.append(job_refs);
+            });
+            // SAFETY: `into_job_ref` requires that the data closed over by the
+            // `HeapJob` outlive the `JobRef`.
+            //
+            // Here, the closure captures `job_refs` (a `VecDeque<JobRef>`) by
+            // value, and so trivially outlives the newly created `JobRef`.
+            let batch_job_ref = unsafe { batch_job.into_job_ref() };
+            // Push the batch job into the steal queue so siblings can steal it.
+            if let Err(job_ref) = self.sharer().push(batch_job_ref) {
+                // If the queue is full, that indicates that the pool is
+                // probably under high load and we should continue local-first
+                // operation.
+                //
+                // This just adds the jobs back to the local queue.
+                self.execute(job_ref, false);
+            } else {
+                shared_job = true;
             }
-            if seats[i].occupied {
-                let ready = seats[i].data.sleep_controller.wake();
-                if ready {
-                    return;
-                }
+        }
+
+        // If we added work to the steal queue, wake a random sibling to steal
+        // it from us, while we do other work.
+        if shared_job {
+            self.wake_random(sleeping);
+        }
+    }
+
+    /// Tries to wake a random sleeping worker. Expects to be given a bitset of
+    /// sleeping workers.
+    #[inline(always)]
+    fn wake_random(&self, sleeping: u32) {
+        let offset = self.rng.next_usize(32) as u32;
+        let mut randomized_sleeping = sleeping.rotate_right(offset);
+        while randomized_sleeping != 0 {
+            let index = (randomized_sleeping.trailing_zeros() + offset) % 32;
+            randomized_sleeping &= randomized_sleeping - 1; // Clear the lowest bit
+            let woken = self.lease.seats.sleep_controllers[index as usize].wake();
+            if woken {
+                return;
             }
         }
     }
@@ -916,7 +1021,11 @@ impl Worker {
     /// Create a new latch owned by the worker.
     #[inline(always)]
     pub fn new_latch(&self) -> Latch {
-        Latch::new(&self.lease.seat_data.sleep_controller)
+        Latch::new(
+            self.lease.seat_number,
+            &self.lease.thread_pool.sleeping,
+            &self.lease.seats.sleep_controllers[self.lease.seat_number],
+        )
     }
 
     /// Runs jobs until the provided latch is set.
@@ -926,40 +1035,67 @@ impl Worker {
     #[inline(always)]
     pub fn wait_for(&self, latch: &Latch) {
         while !latch.check() {
-            #[cfg(feature = "shuttle")]
-            shuttle::hint::spin_loop();
-
             if self.yield_now() == Yield::Idle {
                 latch.wait();
             }
         }
     }
 
-    /// Tries to find a job to execute, either in the local queue or shared on
-    /// the thread pool.
-    ///
-    /// The second value is true if the job was shared, or false if it was spawned locally.
+    /// Finds a job to work on. This function is entirely local, and does no
+    /// synchronization with the queue.
     #[inline(always)]
-    fn find_work(&self) -> Option<(JobRef, bool)> {
-        // We give preference first to things in our local deque, then in other
-        // workers deques, and finally to injected jobs from the outside. The
-        // idea is to finish what we started before we take on something new.
-        //
-        // We pull from the local queue in LIFO order, which means are popping
-        // from the *back* of the queue (the most recently added jobs). This is
-        // because `yield_now` (and by extension `wait_for` which uses it) is
-        // often called directly after pushing work onto the queue (as in `join`
-        // and `scope`). Pulling from the back of the queue potentially can
-        // allow these blocking operations to complete faster. In the cast when
-        // scopes/joins are deeply nested, this also causes work to be executed
-        // *depth-first*, which is often desirable.
-        self.queue
+    fn find_local_work(&self) -> Option<JobRef> {
+        self.lifo_queue
             .pop_newest()
+            .or_else(|| self.fifo_queue.pop_oldest())
+            .or_else(|| self.sharer().pop())
+    }
+
+    /// Finds a job to work on. This tries
+    /// [`find_local_work`][Worker::find_local_work] first, then falls back to
+    /// pulling shared work from the thread pool.
+    #[inline(always)]
+    fn find_work(&self) -> Option<(JobRef, bool)> {
+        self.find_local_work()
             .map(|job| (job, false))
+            .or_else(|| self.steal_from_siblings().map(|job| (job, true)))
             .or_else(|| self.claim_shared_job().map(|job| (job, true)))
     }
 
-    /// Claims a shared job from the thread pool.
+    /// Attempts to steal a job from another worker's work-stealing queue.
+    ///
+    /// Iterates over occupied seats in a random order to avoid always hitting
+    /// the same victim. Because stealers are pre-allocated and permanent, no
+    /// lock or atomic load is needed to access them.
+    fn steal_from_siblings(&self) -> Option<JobRef> {
+        let stealers = &self.lease.seats.stealers;
+        let occupied = self.lease.thread_pool.occupied.load(Ordering::Relaxed);
+        let my_seat = self.lease.seat_number as u32;
+
+        // Randomise the starting position so all workers get a fair shot as victims.
+        let offset = self.rng.next_usize(32) as u32;
+        let mut bits = (occupied & !(1u32 << my_seat)).rotate_right(offset);
+
+        while bits != 0 {
+            let shifted_idx = bits.trailing_zeros();
+            let idx = (shifted_idx + offset) % 32;
+            bits &= bits - 1;
+            // The stealer is a permanent reference — no lock or atomic load needed.
+            let stealer = &stealers[idx as usize];
+            // `steal_and_pop` returns one job directly and moves up to half the
+            // remaining items into our steal queue for later use.
+            loop {
+                match stealer.steal_and_pop(self.sharer(), |n| n / 2) {
+                    Ok((job, _)) => return Some(job),
+                    Err(StealError::Busy) => {} // transient; retry
+                    Err(StealError::Empty) => break,
+                }
+            }
+        }
+        None
+    }
+
+    /// Claims a job from the global injector queue.
     #[inline(always)]
     fn claim_shared_job(&self) -> Option<JobRef> {
         self.lease.thread_pool.shared_jobs.pop()
@@ -974,7 +1110,7 @@ impl Worker {
     pub fn yield_local(&self) -> Yield {
         // We use LIFO order here, pulling the newest work from the queue. This
         // is just for consistency with yield_now/find_work.
-        match self.queue.pop_newest() {
+        match self.find_local_work() {
             Some(job_ref) => {
                 self.execute(job_ref, false);
                 Yield::Executed
@@ -993,9 +1129,7 @@ impl Worker {
     /// [`Worker::yield_local`] instead.
     #[inline(always)]
     pub fn yield_now(&self) -> Yield {
-        // Try to promote an item from the queue
         self.promote();
-
         match self.find_work() {
             Some((job_ref, migrated)) => {
                 self.execute(job_ref, migrated);
@@ -1062,14 +1196,13 @@ impl Worker {
         T: Send,
     {
         // Create a new latch to block the thread until the future completes.
-        let latch = pin!(self.new_latch());
-        let latch = latch.into_ref();
-        // Convert the blocker into an async waker.
         //
-        // SAFETY: The blocker lasts for the duration of this function, and
-        // since the waker is only used within this function, it must outlive
-        // the waker.
-        let waker = unsafe { latch.as_waker() };
+        // This is allocated on the heap, even though the worker is allocated on
+        // the stack, because we can't prevent futures from keeping wakers
+        // around for arbitrary amounts of time or issuing wakeups for futures
+        // that have completed.
+        let latch = Arc::new(self.new_latch());
+        let waker = latch.clone().into();
         // Put the waker into an async context that can be used to poll futures.
         let mut ctx = Context::from_waker(&waker);
         // Pin the future, promising not to move it while it's being polled.
@@ -1080,14 +1213,14 @@ impl Worker {
                 // While the future is incomplete, run other tasks or sleep.
                 Poll::Pending => {
                     // This will not return until the latch is set.
-                    self.wait_for(latch.get_ref());
+                    self.wait_for(latch.borrow());
                     // We want to keep using the same latch every time we wait
                     // for the future to become ready, so we have to reset it
                     // here.
                     //
-                    // SAFETY: The latch must be in the set state because we
-                    // just waited for it.
-                    unsafe { latch.reset() };
+                    // The latch must be in the set state because we just waited
+                    // for it.
+                    latch.reset();
                 }
                 // When it is complete, pull out the result and return it.
                 Poll::Ready(res) => return res,
@@ -1095,8 +1228,7 @@ impl Worker {
         }
     }
 
-    /// Takes two closures and *potentially* runs them in parallel, then returns
-    /// the results.
+    /// Executes the two closures, possibly in parallel.
     ///
     /// If you do not have access to a [`Worker`], you may call
     /// [`ThreadPool::join`] or simply [`join`].
@@ -1127,7 +1259,7 @@ impl Worker {
         let job_ref_id = job_ref.id();
 
         // Push the job onto the queue.
-        self.enqueue(job_ref);
+        self.lifo_queue.push_new(job_ref);
 
         // If we have received a heartbeat, we remove the oldest item in the
         // local queue and push it into the shared queue. This causes work to be
@@ -1141,7 +1273,7 @@ impl Worker {
 
         // Attempt to recover the job from the queue. It should still be there
         // if we didn't share it.
-        if self.queue.recover_just_pushed(job_ref_id) {
+        if self.lifo_queue.recover_newest(job_ref_id) {
             // SAFETY: Because the ids match, the JobRef we just popped from
             // the queue must point to `stack_job`, implying that
             // `stack_job` cannot have been executed yet.
@@ -1178,9 +1310,12 @@ impl Worker {
 // -----------------------------------------------------------------------------
 // Thread local scheduling api
 
-/// Spawns a thread onto the current thread pool.
+/// Runs the provided closure in the background.
 ///
-/// If there is no current thread pool, this panics.
+/// <div class="warning">
+/// <strong>Note:</strong>
+/// This function panics if the current thread is not registered as a worker.
+/// </div>
 ///
 /// See also: [`Worker::spawn`] and [`ThreadPool::spawn`].
 pub fn spawn<M, S: Spawn<M>>(work: S) -> S::Output {
@@ -1191,9 +1326,12 @@ pub fn spawn<M, S: Spawn<M>>(work: S) -> S::Output {
     })
 }
 
-/// Blocks the thread waiting for a future to complete.
+/// Waits for a future to complete.
 ///
-/// If there is no current thread pool, this panics.
+/// <div class="warning">
+/// <strong>Note:</strong>
+/// This function panics if the current thread is not registered as a worker.
+/// </div>
 ///
 /// See also: [`Worker::block_on`] and [`ThreadPool::block_on`].
 pub fn block_on<F, T>(future: F) -> T
@@ -1208,10 +1346,12 @@ where
     })
 }
 
-/// Takes two closures and *potentially* runs them in parallel. It
-/// returns a pair of the results from those closures.
+/// Executes the two closures, possibly in parallel.
 ///
-/// If there is no current thread pool, this panics.
+/// <div class="warning">
+/// <strong>Note:</strong>
+/// This function panics if the current thread is not registered as a worker.
+/// </div>
 ///
 /// See also: [`Worker::join`] and [`ThreadPool::join`].
 pub fn join<A, B, RA, RB>(a: A, b: B) -> (RA, RB)
@@ -1228,15 +1368,17 @@ where
     })
 }
 
-/// Creates a "fork-join" scope and invokes the closure with a reference to
-/// it. Work spawned onto this scope does not have to have a `'static`
+/// Creates a new scope for spawning non-static work.
+///
+/// Work spawned onto the new scope does not have to have a `'static`
 /// lifetime, and can borrow local variables. Local borrowing is possible
 /// because this function will not return until all work spawned on the
 /// scope has completed, this ensuring the stack frame is kept alive for the
 /// duration.
 ///
 /// <div class="warning">
-/// <strong>Note:</strong> This function panics if the current thread is not registered as a worker.
+/// <strong>Note:</strong>
+/// This function panics if the current thread is not registered as a worker.
 /// </div>
 ///
 /// # Alternatives
@@ -1263,7 +1405,7 @@ where
 /// # use forte::Worker;
 /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
 /// # THREAD_POOL.populate();
-/// # THREAD_POOL.with_worker(|worker| {
+/// # THREAD_POOL.expect_worker(|worker| {
 /// let ok: Vec<i32> = vec![1, 2, 3];
 /// forte::scope(|scope| {
 ///     let bad: Vec<i32> = vec![4, 5, 6];
@@ -1290,7 +1432,7 @@ where
 /// # use forte::Worker;
 /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
 /// # THREAD_POOL.populate();
-/// # THREAD_POOL.with_worker(|worker| {
+/// # THREAD_POOL.expect_worker(|worker| {
 /// let ok: Vec<i32> = vec![1, 2, 3];
 /// forte::scope(|scope| {
 ///     let bad: Vec<i32> = vec![4, 5, 6];
@@ -1316,7 +1458,7 @@ where
 /// # use forte::Worker;
 /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
 /// # THREAD_POOL.populate();
-/// # THREAD_POOL.with_worker(|worker| {
+/// # THREAD_POOL.expect_worker(|worker| {
 /// let ok: Vec<i32> = vec![1, 2, 3];
 /// forte::scope(|scope| {
 ///     let bad: Vec<i32> = vec![4, 5, 6];
@@ -1343,7 +1485,7 @@ where
 /// # use forte::Worker;
 /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
 /// # THREAD_POOL.populate();
-/// # THREAD_POOL.with_worker(|worker| {
+/// # THREAD_POOL.expect_worker(|worker| {
 /// let ok: Vec<i32> = vec![1, 2, 3];
 /// forte::scope(|scope| {
 ///     let bad: Vec<i32> = vec![4, 5, 6];
@@ -1370,10 +1512,10 @@ where
 /// # use forte::Worker;
 /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
 /// # THREAD_POOL.populate();
-/// # THREAD_POOL.with_worker(|worker| {
+/// # THREAD_POOL.expect_worker(|worker| {
 /// let mut leak = None;
 /// forte::scope(|scope| {
-///     leak = Some(scope); // <-- scope would be leaked here
+///     leak = Some(scope); // <-- ERROR: scope would be leaked here
 /// });
 /// drop(leak);
 /// # });
@@ -1387,7 +1529,7 @@ where
 /// # use forte::Worker;
 /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
 /// # THREAD_POOL.populate();
-/// # THREAD_POOL.with_worker(|worker| {
+/// # THREAD_POOL.expect_worker(|worker| {
 /// let mut counter = 0;
 /// let counter_ref = &mut counter;
 /// forte::scope(|scope| {
@@ -1415,12 +1557,13 @@ where
 /// THREAD_POOL.with_worker(|worker| {
 ///     worker.scope(|scope| {
 ///         worker.spawn(|worker: &Worker| {
-///             // ^^^^^ This creates a *non-static* job on the worker,
+///             // ^^^^^ ERROR: This creates a *static* job on the worker,
 ///             //       which may outlive the scope.
 ///             
 ///             scope.spawn_on(worker, |_: &Worker| { });
-///             // ^^^^^ This requires borrowing the scope within the
-///             //       unscoped job, which isn't allowed by the compiler.
+///             // ^^^^^ ERROR: This requires borrowing the scope within the
+///             //       unscoped job, which isn't allowed by the compiler
+///             //       because 'scope would have to outlive 'static.
 ///         });
 ///     });
 /// });
@@ -1439,9 +1582,6 @@ where
 /// once a task is spawned using `scope.spawn(),` it will execute, even if the
 /// spawning task should later panic. The scope returns once all work is
 /// complete, and panics are propagated at that point.
-///
-/// Note: Panics in futures are instead propagated to their
-/// [`Task`][async_task::Task], and will not cause the scope to panic.
 pub fn scope<'env, F, T>(f: F) -> T
 where
     F: for<'scope> FnOnce(&'scope Scope<'scope, 'env>) -> T,
@@ -1460,11 +1600,9 @@ where
 /// Operating on the principle that you should finish what you start before
 /// starting something new, workers will first execute their queue, then execute
 /// shared jobs, then pull new jobs from the injector.
-fn managed_worker(lease: Lease, halt: Arc<AtomicBool>, barrier: Arc<Barrier>) {
+fn managed_worker(lease: Lease, halt: Arc<AtomicBool>) {
     trace!("starting managed worker");
 
-    barrier.wait();
-
     // Register as the indicated worker, and work until we are told to halt.
     Worker::occupy(lease, |worker| {
         while !halt.load(Ordering::Relaxed) {
@@ -1474,7 +1612,8 @@ fn managed_worker(lease: Lease, halt: Arc<AtomicBool>, barrier: Arc<Barrier>) {
             if let Some((job, migrated)) = worker.find_work() {
                 worker.execute(job, migrated);
             } else {
-                worker.lease.seat_data.sleep_controller.sleep();
+                worker.lease.seats.sleep_controllers[worker.lease.seat_number]
+                    .sleep(worker.lease.seat_number, &worker.lease.thread_pool.sleeping);
             }
         }
     });
@@ -1482,57 +1621,14 @@ fn managed_worker(lease: Lease, halt: Arc<AtomicBool>, barrier: Arc<Barrier>) {
     trace!("exiting managed worker");
 }
 
-// -----------------------------------------------------------------------------
-// Heartbeat sender loop
-
-/// This is the main loop for the heartbeat thread. It's in charge of
-/// periodically sending a "heartbeat" signal to each worker. By default, each
-/// worker receives a heartbeat about once every 100 μs.
-///
-/// Workers use the heartbeat signal to amortize the cost of promoting local
-/// jobs to shared jobs (which allows other works to claim them) and to reduce
-/// lock contention.
-///
-/// This is never runs when testing in shuttle.
-#[cfg(not(feature = "shuttle"))]
-fn heartbeat_loop(thread_pool: &'static ThreadPool, halt: Arc<AtomicBool>) {
-    trace!("starting managed heartbeat thread");
-
-    let mut seats = thread_pool.state.lock().unwrap().seats.clone();
-    let mut index = 0;
-
-    while !halt.load(Ordering::Relaxed) {
-        let num_seats = seats.len();
-        let (back, front) = seats.split_at(index);
-        if let Some((offset, seat)) = Iterator::chain(front.iter(), back.iter())
-            .enumerate()
-            .find(|(_, seat)| seat.occupied)
-        {
-            index = (index + offset + 1) % num_seats;
-            seat.data.heartbeat.store(true, Ordering::Relaxed);
-            std::thread::yield_now();
-            seats = thread_pool.state.lock().unwrap().seats.clone();
-        } else {
-            let state = thread_pool.state.lock().unwrap();
-            seats = thread_pool
-                .start_heartbeat
-                .wait(state)
-                .unwrap()
-                .seats
-                .clone();
-        }
-    }
-}
-
 // -----------------------------------------------------------------------------
 // Tests
 
 #[cfg(all(test, not(feature = "shuttle")))]
 mod tests {
 
-    use std::sync::mpsc::channel;
-
     use alloc::vec;
+    use std::sync::mpsc::channel;
 
     use super::*;
 
@@ -1596,7 +1692,7 @@ mod tests {
         THREAD_POOL.resize_to_available();
 
         let mut vals = [0; 1_024];
-        THREAD_POOL.with_worker(|worker| increment(worker, &mut vals));
+        THREAD_POOL.on_worker(|worker| increment(worker, &mut vals));
         assert_eq!(vals, [1; 1_024]);
 
         THREAD_POOL.depopulate();
@@ -1625,7 +1721,7 @@ mod tests {
         THREAD_POOL.resize_to_available();
 
         let mut vals = vec![0; 512 * 512];
-        THREAD_POOL.with_worker(|worker| increment(worker, &mut vals));
+        THREAD_POOL.on_worker(|worker| increment(worker, &mut vals));
         assert_eq!(vals, vec![1; 512 * 512]);
 
         THREAD_POOL.depopulate();
diff --git a/src/util.rs b/src/util.rs
index 33ba5da..661a430 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -1,8 +1,7 @@
-use core::{
-    cell::Cell,
-    hash::Hasher,
-    sync::atomic::{AtomicUsize, Ordering},
-};
+use core::cell::Cell;
+use core::hash::Hasher;
+use core::sync::atomic::AtomicUsize;
+use core::sync::atomic::Ordering;
 use std::hash::DefaultHasher;
 
 /// [xorshift*] is a fast pseudorandom number generator which will
diff --git a/tests/shuttle.rs b/tests/shuttle.rs
index 9d7e14a..776f3be 100644
--- a/tests/shuttle.rs
+++ b/tests/shuttle.rs
@@ -9,7 +9,6 @@ use core::task::Poll;
 
 use forte::ThreadPool;
 use forte::Worker;
-
 use shuttle::hint::black_box;
 use shuttle::sync::atomic::AtomicBool;
 use shuttle::sync::atomic::AtomicUsize;
@@ -20,24 +19,6 @@ use tracing_subscriber::fmt::Subscriber;
 // -----------------------------------------------------------------------------
 // Infrastructure
 
-/*
-
-fn trace<F>(f: F)
-where
-    F: Fn() + Send + Sync + 'static,
-{
-    let subscriber = Subscriber::builder()
-        .compact()
-        .with_max_level(Level::TRACE)
-        .without_time()
-        .with_thread_names(false)
-        .finish();
-
-    tracing::subscriber::with_default(subscriber, f);
-}
-
-*/
-
 /// Provides access to a thread pool which can be treated as static for the
 /// purposes of testing.
 fn with_thread_pool<F>(f: F) -> impl Fn() + 'static
@@ -48,13 +29,9 @@ where
         let thread_pool = Box::new(ThreadPool::new());
         let thread_pool_ptr = Box::into_raw(thread_pool);
 
-        // SAFETY: TODO
+        // SAFETY: This thread pool is never dropped.
         let thread_pool_ref = unsafe { &*thread_pool_ptr };
         f(thread_pool_ref);
-
-        // SAFETY: TODO
-        let thread_pool = unsafe { Box::from_raw(&mut *thread_pool_ptr) };
-        drop(thread_pool);
     }
 }
 
@@ -70,7 +47,7 @@ pub fn shuttle_populate_depopulate() {
         pool.depopulate();
     });
 
-    shuttle::check_dfs(test, None);
+    shuttle::check_pct(test, 100_000, 100_000);
 }
 
 // -----------------------------------------------------------------------------
@@ -85,7 +62,7 @@ pub fn shuttle_spawn_closure() {
         pool.depopulate();
     });
 
-    shuttle::check_dfs(test, None);
+    shuttle::check_pct(test, 100_000, 100_000);
 }
 
 #[derive(Default)]
@@ -117,7 +94,7 @@ pub fn shuttle_spawn_future() {
         pool.depopulate();
     });
 
-    shuttle::check_dfs(test, None);
+    shuttle::check_pct(test, 100_000, 100_000);
 }
 
 /// Tests a two-level join operation on a pool of size one.
@@ -146,7 +123,7 @@ pub fn join_4_on_1() {
         pool.depopulate();
     });
 
-    shuttle::check_pct(test, 100_000, 10_000);
+    shuttle::check_pct(test, 100_000, 100_000);
 }
 
 /// Tests a two-level join operation on a pool of size two.
@@ -175,7 +152,7 @@ pub fn join_4_on_2() {
         pool.depopulate();
     });
 
-    shuttle::check_pct(test, 100_000, 10_000);
+    shuttle::check_pct(test, 100_000, 100_000);
 }
 
 /// Tests a two-level join operation on a pool of size three.
@@ -204,7 +181,7 @@ pub fn join_4_on_3() {
         pool.depopulate();
     });
 
-    shuttle::check_pct(test, 100_000, 10_000);
+    shuttle::check_pct(test, 100_000, 100_000);
 }
 
 /// Tests a moderately deep join operation on a large pool.
@@ -226,115 +203,11 @@ pub fn join_long() {
         }
 
         let mut vals = [0; 10];
-        pool.with_worker(|worker| increment(worker, &mut vals));
+        pool.expect_worker(|worker| increment(worker, &mut vals));
         assert_eq!(vals, [1; 10]);
 
         pool.depopulate();
     });
 
-    shuttle::check_pct(test, 100_000, 10_000);
-}
-
-/*
-
-/// Tests for concurrency issues when blocking on a future.
-#[test]
-pub fn block_on() {
-    model(|| {
-        with_thread_pool(|_, worker| {
-            worker.block_on(async {
-                black_box(());
-            });
-        });
-    });
-}
-
-/// Tests for concurrency issues when spawning a future and then blocking on the
-/// resulting task.
-#[test]
-pub fn spawn_and_block() {
-    model(|| {
-        with_thread_pool(|_, worker| {
-            let task = worker.spawn_future(async {
-                black_box(());
-            });
-            worker.block_on(task);
-        });
-    });
-}
-
-// -----------------------------------------------------------------------------
-// Scoped API
-
-/// Test for concurrency issues when creating a scope.
-#[test]
-pub fn scope_empty() {
-    model(|| {
-        with_thread_pool(|_, worker| {
-            worker.scope(|_| {});
-        });
-    });
-}
-
-/// Tests for concurrency issues when returning a value from a scope.
-#[test]
-fn scope_result() {
-    model(|| {
-        with_thread_pool(|_, worker| {
-            let result = worker.scope(|_| 22);
-            assert_eq!(result, 22);
-        });
-    });
-}
-
-/// Tests for concurrency issues when spawning a scoped closure.
-#[test]
-pub fn scope_spawn() {
-    model(|| {
-        with_thread_pool(|_, worker| {
-            let complete = AtomicBool::new(false);
-            worker.scope(|scope| {
-                scope.spawn(|_| {
-                    complete.store(true, Ordering::Release);
-                });
-            });
-            worker.run_until(&complete);
-        });
-    });
-}
-
-/// Tests for concurrency issues when spawning multiple scoped closures.
-#[test]
-pub fn scope_two() {
-    model(|| {
-        with_thread_pool(|_, worker| {
-            let counter = &AtomicUsize::new(0);
-            worker.scope(|scope| {
-                scope.spawn(|_| {
-                    counter.fetch_add(1, Ordering::SeqCst);
-                });
-                scope.spawn(|_| {
-                    counter.fetch_add(10, Ordering::SeqCst);
-                });
-            });
-            let v = counter.load(Ordering::SeqCst);
-            assert_eq!(v, 11);
-        });
-    });
-}
-
-/// Tests for concurrency issues when spawning a scoped future, and blocking on
-/// it.
-#[test]
-pub fn scope_future() {
-    model(|| {
-        with_thread_pool(|_, worker| {
-            let vec = vec![1, 2, 3];
-            let task = worker.scope(|scope| scope.spawn_future(async { black_box(vec.len()) }));
-            let len = worker.block_on(task);
-            assert_eq!(len, vec.len());
-        });
-    });
+    shuttle::check_pct(test, 100_000, 100_000);
 }
-
-*/

From d0597d57e45a88d20c942c6dde5c66c4b90a40c8 Mon Sep 17 00:00:00 2001
From: NthTensor <nth.tensor@gmail.com>
Date: Fri, 24 Apr 2026 15:46:37 -0400
Subject: [PATCH 2/3] feat: improve docs

---
 CHANGELOG.md       |   2 +-
 Cargo.toml         |   2 +-
 README.md          | 159 +++++++++-
 src/job.rs         |  12 +-
 src/latch.rs       |  11 +-
 src/lib.rs         |  24 +-
 src/scope.rs       |  13 +-
 src/thread_pool.rs | 774 ++++++++++++++++++++++++++++-----------------
 8 files changed, 657 insertions(+), 340 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3f7ea85..26911ba 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,7 +16,7 @@ This project is currently in early [pre-release], and there may be arbitrary bre
 
 ### Added 
 
-- `ThreadPool::num_workers` method which return the current number of workers
+- `ThreadPool::num_workers` method which returns the current number of workers
 - `ThreadPool::on_worker` variant of `with_worker` for `Send` closures.
 - `ThreadPool::expect_worker` variant of `with_worker` that panics.
 
diff --git a/Cargo.toml b/Cargo.toml
index d77e28f..2d68140 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ name = "forte"
 version = "1.0.0-dev"
 edition = "2024"
 license = "MIT OR Apache-2.0"
-description = "A low-overhead thread-pool with support for non-static async closures"
+description = "Low-overhead parallel and async work scheduler"
 repository = "https://github.com/NthTensor/Forte"
 
 [workspace]
diff --git a/README.md b/README.md
index 02d8549..7548f47 100644
--- a/README.md
+++ b/README.md
@@ -3,23 +3,158 @@
 [![Crates.io](https://img.shields.io/crates/v/forte.svg)](https://crates.io/crates/forte)
 [![Docs](https://docs.rs/forte/badge.svg)](https://docs.rs/forte/latest/forte/)
 
-An async-compatible thread-pool aiming for "speed through simplicity".
+Forte is a low-overhead parallel & async work scheduler. It can be used as a
+lower-overhead, lower-latency alternative to `rayon_core`, or as an async
+executor (like `tokio`).
 
-Forte is a parallel & async work scheduler designed to accommodate very large workloads with many short-lived tasks. It replicates the `rayon_core` api but with native support for futures and async tasks. 
-Its design was prompted by the needs of the bevy game engine, but should be applicable to any problem that involves running both synchronous and asynchronous work concurrently.
+## Static + Resizable Thread-Pools
 
-The thread-pool provided by this crate does not employ work-stealing. 
-Forte instead uses "Heartbeat Scheduling", an alternative load-balancing technique that (theoretically) provides provably small overheads and good utilization.
-The end effect is that work is only parallelized every so often, allowing more work to be done sequentially on each thread and amortizing the synchronization overhead.
+Thread pools are `const`-constructed, and intended to be defined as `static`
+variables within a binary crate. Adding a new thread-pool to your project is as
+simple as:
 
-# Acknowledgments
+```rust
+static THREAD_POOL: ThreadPool = ThreadPool::new();
+```
 
-Large portions of the code are direct ports from various versions of `rayon_core`, with minor simplifications and improvements. 
-We also relied upon `chili` and `spice` for reference while writing the heartbeat scheduling.
-Support for futures is based on an approach sketched out by members of the `rayon` community to whom we are deeply indebted.
+Thread pools are empty when created, and can be resized on demand. Up to 32
+threads can participate in a pool at a time (including worker threads and
+non-worker threads making blocking calls to the pool).
+
+```rust
+// Add as many workers to the thread pool as you have cores in your computer.
+THREAD_POOL.resize_to_available();
+
+// Resize the thread-pool to have exactly five workers
+THREAD_POOL.resize_to(5);
+
+// Remove all workers from the pool and shut it down.
+THREAD_POOL.depopulate();
+```
+
+## Fork-Join Parallelism
+
+Forte provides an extremely low-overhead parallelization primitive for blocking
+compute, similar to [`rayon::join`] or [`chili::Scope::join`]. At any point, it _may_
+run the two closures in parallel.
+
+```rust
+fn sum(node: &Node, worker: &Worker) -> u64 {
+    let (left, right) = worker.join(
+        |w| node.left.as_deref().map(|n| sum(n, w)).unwrap_or_default(),
+        |w| node.right.as_deref().map(|n| sum(n, w)).unwrap_or_default(),
+    );
+    
+    node.val + left + right
+}
+```
+
+This is optimized for depth-first traversal and hierarchical work-splitting,
+where each of the closures passed to `join` potentially contains another call to
+`join`.
+
+## Spawn Closures & Futures
+
+Forte also provides tools for load-balancing ultra-low-latency non-blocking
+compute (like polling `Futures`), similar to [`rayon::spawn`] or
+[`tokio::task::spawn`].
+
+```rust
+async fn serve() {
+    let listener = TcpListener::bind("127.0.0.1:8080").await?;
+    let mut incoming = listener.incoming();
+
+    while let Some(stream) = incoming.next().await {
+        // A new task is spawned for each inbound tcp stream. The stream is
+        // moved to the new task and processed there.
+        let task = THREAD_POOL.spawn(async move {
+            process(stream).await;
+        });
+        // Spawning a future gives us back a task handle we can use to await
+        // its completion, but we don't care about that here. `detach` lets us
+        // drop the handle without canceling the stream-processing task.
+        task.detach();
+    }
+}
+```
+
+## Scoped Spawns
+
+For scheduling non-static work, forte provides tools akin to
+[`std::thread::scope`], [`tokio_scoped::scope`] or [`rayon::scope`].
+
+```rust
+let mut v = String::from("Hello");
+forte::scope(|scope| {
+    scope.spawn(|_: &Worker| {
+        v.push('!');
+    });
+});
+// The scope doesn't exit until all spawned work is complete.
+assert_eq!(v.as_str(), "Hello!");
+```
+
+## Lazy Heartbeat Scheduling
+
+Forte uses a combination of [_heartbeat scheduling_][hb] and [_lazy
+scheduling_][lz] to achieve ultra-low overhead and minimize cpu-utilization.
+
+The vast majority of operations are local and serial. Most jobs are stored in
+simple double-ended queues, and adding new jobs to a worker has a zero-overhead
+path without any shared data-structures.
+
+Every worker also has a small fixed-capacity work-stealing queue (currently each
+has space for 32 jobs). Approximately every 5 μs (gated by the CPU's instruction
+counter), if there's space available, each worker pushes a small number of jobs
+into this queue. When a worker runs out of jobs to execute, it briefly tries to
+steal from its coworkers, then goes to sleep. 
+
+This approach has several benefits over more brute-force applications of
+work-stealing:
+
+* For any particular time-slice, there is an upper-bound on the overhead due to
+  synchronization. Since workers only touch shared data-structures every so
+  often, it can only slow them down so much. This reduces runtime variance and
+  lowers overhead.
+
+* There is a cap on the frequency at which local work is made available for sharing.
+  This reduces the probability that new work will become available at any given
+  instant, which means (unlike many work-stealing implementations) it doesn't
+  make sense to spin while trying to steal work. This can also reduce
+  over-sharing at the tail-end of a parallel operation.
+  
+* The occupancy of the work-stealing queue represents an estimate of system
+  load. When a worker's shared-queue is empty, that's a sign that some workers
+  may be starved, and more tasks should be shared. By contrast, when a worker's
+  shared-queue is full to capacity, that's a sign that the thread-pool may have
+  reached full resource-utilization, and should avoid the costs of
+  synchronization for a bit.
+  
+Jobs created by `join` are executed in LIFO order. When it comes time to share
+work, the oldest `join` job is promoted into the shared work-stealing queue. In
+the case of a binary tree, this means that execution progresses depth-first, but
+sharing progresses breadth-first.
+
+Jobs created by `spawn` are executed in FIFO order, to minimize latency. When it
+comes time to share work, the newest `spawn` jobs are grouped into small batches
+(16 jobs each) and those batches are promoted into the shared work-stealing
+queue. This means that spawns generally stay on the thread that spawned them,
+unless the thread is overwhelmed by an influx of new tasks.
 
 # License
 
-Forte is distributed under the terms of both the MIT license and the Apache License (Version 2.0).
-See LICENSE-APACHE and LICENSE-MIT for details.
+Forte is distributed under the terms of both the MIT license and the Apache
+License (Version 2.0). See LICENSE-APACHE and LICENSE-MIT for details. 
+
 Opening a pull request is assumed to signal agreement with these licensing terms.
+
+[`rayon::join`]: https://docs.rs/rayon/latest/rayon/fn.join.html
+[`chili::Scope::join`]: https://docs.rs/chili/latest/chili/struct.Scope.html#method.join
+[`rayon::spawn`]: https://docs.rs/rayon/latest/rayon/fn.spawn.html
+[`tokio::task::spawn`]: https://docs.rs/tokio/latest/tokio/task/fn.spawn.html
+[`std::thread::scope`]: https://doc.rust-lang.org/std/thread/fn.scope.html
+[`tokio_scoped::scope`]: https://docs.rs/tokio-scoped/latest/tokio_scoped/fn.scope.html
+[`rayon::scope`]: https://docs.rs/rayon/latest/rayon/fn.scope.html
+
+[hb]: https://www.andrew.cmu.edu/user/mrainey/heartbeat/heartbeat.html
+[lz]: https://dl.acm.org/doi/10.1145/2629643
diff --git a/src/job.rs b/src/job.rs
index 38d0c30..edbed59 100644
--- a/src/job.rs
+++ b/src/job.rs
@@ -132,7 +132,7 @@ unsafe impl Send for JobRef {}
 // Job queue
 
 /// A queue of jobs. This is a simple wrapper around a vec dequeue that uses
-/// inner mutation, and has some more intiuitively named methods to enforce
+/// inner mutation, and has some more intuitively named methods to enforce
 /// conventions.
 pub struct JobQueue {
     job_refs: UnsafeCell<VecDeque<JobRef>>,
@@ -206,8 +206,8 @@ impl JobQueue {
     const CHUNK_SIZE: usize = 16;
 
     /// Splits off a series of chunks from the end of the queue (the side with
-    /// the newest jobs). Each chunk is of size `CHUNK_SIZE`. After, At most
-    /// `CHUNK_SIZE` jobs will be left in the queue.
+    /// the newest jobs). Each chunk is of size `CHUNK_SIZE`. Afterwards, at most
+    /// `CHUNK_SIZE` jobs will remain in the queue.
     pub fn split(&self) -> Vec<VecDeque<JobRef>> {
         // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one
         // thread. We ensure no other references to the inner value exist by not
@@ -226,7 +226,7 @@ impl JobQueue {
     }
 
     /// Appends a chunk of jobs (expected to be provided by `split`) to the
-    /// queue. Jobs are added to the end (the side with the newst jobs).
+    /// queue. Jobs are added to the end (the side with the newest jobs).
     pub fn append(&self, mut split_refs: VecDeque<JobRef>) {
         // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one
         // thread. We ensure no other references to the inner value exist by not
@@ -417,9 +417,9 @@ where
         // The latch has not been set, and this function is called at most once,
         // so no concurrent access can occur.
         unsafe { (*return_value).write(result) };
-        // This syncrhonizies with the `Acquire` fence within `return_value()`,
+        // This synchronizes with the `Acquire` fence within `return_value()`,
         // establishing a happens-before relationship that makes the preceding
-        // `return_value` write vsibile to the reader.
+        // `return_value` write visible to the reader.
         //
         // This is required because latches do not synchronize memory.
         fence(Ordering::Release);
diff --git a/src/latch.rs b/src/latch.rs
index 9a1b3a1..425a9b8 100644
--- a/src/latch.rs
+++ b/src/latch.rs
@@ -1,10 +1,11 @@
-//! A core concept in Rayon is the *latch*. Forte has borrowed this, in a
-//! somewhat simplified form.
+//! Forte borrows the *latch* concept from Rayon.
 //!
 //! Every forte worker thread has a single "sleep controller" that it uses to
 //! park and unpark itself. Latches build on this to create a simple boolean
 //! switch, which allows the owning thread to sleep until the latch becomes set
 //! by another thread.
+//!
+//! Every latch points at one "sleep controller".
 
 use alloc::task::Wake;
 use core::borrow::Borrow;
@@ -77,9 +78,9 @@ impl Latch {
         self.state.load(Ordering::Relaxed) == SIGNAL
     }
 
-    /// Waits for the latch to be set. In actuality, this may be woken.
-    ///
-    /// Returns true if the latch signal was received, and false otherwise.
+    /// Puts the thread to sleep if the latch has not been set. The thread will
+    /// be woken when the latch becomes set, but may also wake before then. The
+    /// caller should always re-check the latch condition after this returns.
     ///
     /// # Memory Ordering
     ///
diff --git a/src/lib.rs b/src/lib.rs
index dd3071d..7fb6758 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -90,7 +90,7 @@
 //!
 //! Thread pools are dynamically sized; When your program starts they have size
 //! zero (meaning no worker threads are running). You can change the number of
-//! works assigned to a pool using [`ThreadPool::grow`], [`ThreadPool::shrink`]
+//! workers assigned to a pool using [`ThreadPool::grow`], [`ThreadPool::shrink`]
 //! and [`ThreadPool::resize_to`]. But most of the time you will want to call
 //! [`ThreadPool::resize_to_available`], which will resize the pool to exploit
 //! all the available parallelism on your system by spawning a worker thread for
@@ -122,8 +122,8 @@
 //! // there are no workers to parallelize it).
 //! THREAD_POOL.join(|_| println!("world"), |_| println!("hello "));
 //!
-//! // This will always print "hello world" (because join happens execute things
-//! // backwards in this case).
+//! // This will always print "hello world" (because join executes the second
+//! // closure first when running in serial).
 //! ```
 //!
 //! # Workers
@@ -151,7 +151,7 @@
 //! tasks left in the local queue are executed.
 //!
 //! You will only ever receive `&Worker` references, because the worker is not
-//! allowed to move or be mutably referenced. Worker are `!Send` and `!Sync`,
+//! allowed to move or be mutably referenced. Workers are `!Send` and `!Sync`,
 //! and are meant to represent local-only data.
 //!
 //! To access the current worker context, you can use [`Worker::map_current`] or
@@ -196,25 +196,14 @@
 //! | *Scope*    | [`scope()`] | [`ThreadPool::scope()`] | [`Worker::scope()`]
 //! | *Block on* | [`block_on()`] | [`ThreadPool::block_on()`] | [`Worker::block_on()`]
 //!
-//! * *Worker.* Uses the provided worker context.
-//! * *Thread pool.* Looks for an existing worker context, creates one if it doesn't find one.
 //! * *Headless.* Looks for an existing worker context, and panics if it doesn't find one.
+//! * *Thread pool.* Looks for an existing worker context, creates one if it doesn't find one.
+//! * *Worker.* Uses the provided worker context.
 //!
 //! The headless and thread pool flavors are more or less just aliases for the
 //! worker flavor. Where possible, the worker flavor should be preferred to the
 //! thread pool flavor, and the thread pool flavor should be preferred to the
 //! headless flavor.
-//!
-//! # Theory & Background
-//!
-//! Forte is based on `rayon_core`, to the extent that during development it was
-//! often possible to port code from `rayon_core` more or less verbatim.
-//! However, forte and rayon differ significantly in their goals and approach.
-//!
-//! Rayon uses an approach to work-stealing adapted from Cilk and Intel TBB.
-//! These techniques are largely the industry standard.
-//!
-//! [^TZANNES]: Tzannes et al. 2024, <https://dl.acm.org/doi/pdf/10.1145/2629643>
 
 #![no_std]
 #![cfg_attr(feature = "shuttle", allow(dead_code))]
@@ -252,6 +241,7 @@ pub struct FutureMarker();
 pub use scope::Scope;
 pub use scope::ScopedSpawn;
 pub use thread_pool::Spawn;
+pub use thread_pool::Task;
 pub use thread_pool::ThreadPool;
 pub use thread_pool::Worker;
 pub use thread_pool::Yield;
diff --git a/src/scope.rs b/src/scope.rs
index 71a6c19..03c3bb7 100644
--- a/src/scope.rs
+++ b/src/scope.rs
@@ -35,8 +35,9 @@ use crate::unwind::AbortOnDrop;
 // -----------------------------------------------------------------------------
 // Scope
 
-/// A scope which can spawn a number of non-static jobs and async tasks. Refer
-/// to [`scope`](crate::scope()) for more extensive documentation.
+/// A scope which can spawn a number of non-static jobs and async tasks.
+///
+/// Refer to [`scope`](crate::scope()) for more extensive documentation.
 ///
 /// # Lifetimes
 ///
@@ -119,7 +120,7 @@ where
     // reference is allowed to escape, the caller cannot safely cause the scope
     // to move either.
     //
-    // `Scope::complete` is called unconditionally on the line bellow, before
+    // `Scope::complete` is called unconditionally on the line below, before
     // the implicit drop of `scope`. If the closure `f` panics, it is caught and
     // re-emitted after `complete` finishes. In the event of an uncaught panic,
     // we cannot ensure `complete` runs properly before the scope is dropped, so
@@ -254,7 +255,7 @@ impl<'scope, 'env> Scope<'scope, 'env> {
         // allocating a second time, and means we can immediately drop the panic
         // we have just been passed.
         //
-        // Dropping this panic may itself trigger a pnaic, but this will simply
+        // Dropping this panic may itself trigger a panic, but this will simply
         // trigger the scope's abort guard, causing an abort rather than UB.
         if self.panic.load(Ordering::Relaxed).is_null() {
             let nil = ptr::null_mut();
@@ -267,7 +268,7 @@ impl<'scope, 'env> Scope<'scope, 'env> {
             //
             // If the write fails, another panic must have already occurred, and
             // we don't need to synchronize memory (the previous call to
-            // `store_panic` handles the syncrhonization for it's panic data).
+            // `store_panic` handles the synchronization for its panic data).
             if self
                 .panic
                 .compare_exchange(nil, err_ptr, Ordering::Release, Ordering::Relaxed)
@@ -292,7 +293,7 @@ impl<'scope, 'env> Scope<'scope, 'env> {
         // whatever it points to.
         let panic = self.panic.swap(ptr::null_mut(), Ordering::Relaxed);
         if !panic.is_null() {
-            // We generally don't expect pancis to happen.
+            // We generally don't expect panics to happen.
             cold_path();
             // If the panic pointer is not null, emit an `Acquire` fence to
             // establish a happens-after relationship with the `Release` branch
diff --git a/src/thread_pool.rs b/src/thread_pool.rs
index 3006db4..4c235cc 100644
--- a/src/thread_pool.rs
+++ b/src/thread_pool.rs
@@ -45,27 +45,29 @@ use crate::util::XorShift64Star;
 // -----------------------------------------------------------------------------
 // Thread pool
 
-/// A thread pool is a set of threads.
+/// A statically-allocated handle to a dynamically-sized collection of threads.
 ///
-/// You can dispatch work to a thread pool, and it will be distributed amongst
-/// the threads and run as quickly as possible. To create a new thread pool,
-/// assign it to a constant.
-/// ```
-/// # use forte::ThreadPool;
-/// # use forte::Worker;
-/// static THREAD_POOL: ThreadPool = ThreadPool::new();
-/// ```
-/// Thread pools are empty when created, and must be explicitly resized at
-/// runtime.
-/// ```
+/// Each `ThreadPool` must be stored in a `static`, ideally defined within your
+/// root binary crate rather than a library crate. You can create a new pool
+/// with [`ThreadPool::new`], and will probably want to resize it sometime
+/// between program init and when you want to start scheduling work.
+///
+/// ```rust
 /// # use forte::ThreadPool;
-/// # use forte::Worker;
-/// # static THREAD_POOL: ThreadPool = ThreadPool::new();
-/// THREAD_POOL.resize_to_available();
+/// static POOL: ThreadPool = ThreadPool::new();
+///
+/// fn main() {
+///     POOL.resize_to_available();
+///     // … schedule work …
+///     POOL.depopulate();
+/// }
 /// ```
-/// After this, you can start sending work to the pool with
-/// [`spawn`][ThreadPool::spawn], [`block_on`][ThreadPool::block_on],
-/// [`join`][ThreadPool::join], or [`scope`][ThreadPool::scope].
+///
+/// A pool can accommodate at most 32 participating threads (this includes
+/// managed worker threads created by the `resize` functions, but also external
+/// threads that become "temporary members" when they make blocking calls to the
+/// pool). All blocking methods (e.g. [`join`] and [`scope`]) work even with
+/// zero managed workers, but they won't run in parallel.
 pub struct ThreadPool {
     /// A bit-set that tracks which seats are occupied.
     occupied: CachePadded<AtomicU32>,
@@ -73,7 +75,7 @@ pub struct ThreadPool {
     sleeping: CachePadded<AtomicU32>,
     /// Holds shared data for each thread participating in the pool.
     seats: OnceLock<Box<Seats>>,
-    /// Holds controls for threads spawned and managed by the pool. Initalized
+    /// Holds controls for threads spawned and managed by the pool. Initialized
     /// on first call to `occupy`, to allow for some non-static constructors.
     managed_threads: Mutex<ManagedThreads>,
     /// Used to inject external work into the thread pool. This is generally
@@ -108,7 +110,7 @@ pub struct Lease {
     thread_pool: &'static ThreadPool,
     /// The index of the seat in the data list
     seat_number: usize,
-    /// A reference to the pre-initalized seat data (to avoid repeated hits of
+    /// A reference to the pre-initialized seat data (to avoid repeated hits of
     /// the `OnceLock`).
     seats: &'static Seats,
 }
@@ -187,7 +189,7 @@ impl ThreadPool {
     /// Claims a lease on the thread pool which can be occupied by a worker
     /// (using [`Worker::occupy`]), allowing a thread to participate in the pool.
     ///
-    /// Returns none if all seats are occupied.
+    /// Returns `None` if all seats are occupied.
     #[cold]
     pub fn claim_lease(&'static self) -> Option<Lease> {
         loop {
@@ -280,19 +282,25 @@ impl ThreadPool {
 // Thread pool resizing
 
 impl ThreadPool {
-    /// Resizes the thread pool to fill all available space. After this returns,
-    /// the pool will have at least one worker thread and at most `MAX_THREADS`.
-    /// Returns the new size of the pool.
+    /// Resizes the thread pool to fill (almost) all available cores. After this
+    /// returns, the pool will have between 1 and 32 workers. Returns the new
+    /// size of the pool.
+    ///
+    /// This always leaves one core free, so that the main program loop can
+    /// continue executing on it. If you have 8 cores, calling this function
+    /// will add 7 workers to the pool (and then the main thread will become the
+    /// 8th worker if it makes a blocking call like `join`).
     ///
     /// See [`ThreadPool::resize`] for more information about resizing.
     pub fn resize_to_available(&'static self) -> usize {
         let mut available = available_parallelism().map(NonZero::get).unwrap_or(1);
-        available = available.saturating_sub(1);
+        available = available.saturating_sub(1).max(1);
         self.resize_to(available)
     }
 
     /// Resizes the pool to the specified number of threads. Returns the new
-    /// size of the thread pool, which may be smaller than requested.
+    /// size of the thread pool. The new size may be smaller than requested if
+    /// all the seats in the thread pool are occupied.
     ///
     /// See [`ThreadPool::resize`] for more information about resizing.
     pub fn resize_to(&'static self, new_size: usize) -> usize {
@@ -300,19 +308,20 @@ impl ThreadPool {
     }
 
     /// Adds the given number of threads to the thread pool. Returns the new
-    /// size of the pool, which may be smaller than requested.
+    /// size of the pool. The new size may be smaller than requested if all the
+    /// seats in the thread pool are occupied.
     ///
-    /// See [`ThreadPool::resize_to`] for more information about resizing.
+    /// See [`ThreadPool::resize`] for more information about resizing.
     pub fn grow(&'static self, added_threads: usize) -> usize {
-        self.resize(|current_size| current_size + added_threads)
+        self.resize(|current_size| current_size.saturating_add(added_threads))
     }
 
     /// Removes the given number of threads from the thread pool. Returns the new
     /// size of the pool.
     ///
-    /// See [`ThreadPool::resize_to`] for more information about resizing.
+    /// See [`ThreadPool::resize`] for more information about resizing.
     pub fn shrink(&'static self, terminated_threads: usize) -> usize {
-        self.resize(|current_size| current_size - terminated_threads)
+        self.resize(|current_size| current_size.saturating_sub(terminated_threads))
     }
 
     /// Ensures that there is at least one worker thread attached to the thread
@@ -338,9 +347,8 @@ impl ThreadPool {
         self.resize_to(0)
     }
 
-    /// Resizes the pool, and returns the new size.
-    ///
-    /// Note that the new size may be different from the size requested.
+    /// Resizes the pool, and returns the new size. The new size may be smaller
+    /// than requested if all the seats in the thread pool are occupied.
     #[cold]
     pub fn resize<F>(&'static self, get_size: F) -> usize
     where
@@ -504,10 +512,10 @@ impl ThreadPool {
     /// Runs the closure on a thread-pool worker.
     ///
     /// If this thread is currently acting as a worker for the thread-pool, this
-    /// just looks that worker up. If this is not registered as a worker, or the
-    /// thread's worker is registered with different thread pool, the thread
-    /// will try to register itself with the correct pool. If the thread pool is
-    /// full, it passes the closure `None`.
+    /// just looks that worker up. If this thread is not registered as a worker,
+    /// or if the thread's worker is registered with a different thread pool, the
+    /// thread will try to register itself with the correct pool. If the thread
+    /// pool is full, it passes the closure `None`.
     ///
     /// The provided closure is never sent to another thread. If your closure is
     /// `Send`, consider using [`on_worker`][ThreadPool::on_worker] instead.
@@ -550,6 +558,10 @@ impl ThreadPool {
 ///
 /// * Futures that satisfy `Future<Output = T> + Send + 'static` where `T: Send + 'static`.
 ///
+/// Closures return `()` when spawned, but futures return a [`Task`].
+///
+/// # Compile Errors
+///
 /// Due to a bug in rustc, you may be given errors when using closures
 /// with inferred types. If you encounter the following:
 ///
@@ -606,6 +618,8 @@ where
     }
 }
 
+/// An alias for [`async_task::Task`] that includes a reference to the pool on
+/// which the future is executing.
 pub type Task<T> = async_task::Task<T, &'static ThreadPool>;
 
 /// Schedules a runnable future as a job.
@@ -744,7 +758,7 @@ thread_local! {
 ///
 /// Every thread has at most one worker at a time. If a worker has already been
 /// set up, it may be accessed at any time by calling [`Worker::with_current`].
-/// A thread's worker can also manually overridden by claiming a lease
+/// A thread's worker can also be manually overridden by claiming a lease
 /// ([`ThreadPool::claim_lease`]) and passing it to [`Worker::occupy`]. The
 /// worker returned by `with_current` always represents the lease most recently
 /// occupied in the call stack.
@@ -781,15 +795,16 @@ pub struct Worker {
 pub enum Yield {
     /// Indicates that a job was executed.
     Executed,
-    /// Indicates that no job was executed, and the worker should perhaps be put
-    /// to sleep.
+    /// Indicates that no job was executed. After receiving this, do not `yield`
+    /// again until you have a reasonable expectation that new work will have
+    /// been shared.
     Idle,
 }
 
 impl Worker {
     /// Temporarily sets the thread's worker. [`Worker::with_current`] always
-    /// returns a reference to the worker set up by the most recent call to this
-    /// worker.
+    /// returns a reference to the worker set up by the most recent call to
+    /// `occupy`.
     ///
     /// Rust's thread locals are fairly costly, so this function is expensive.
     /// If you can avoid calling it, do so.
@@ -845,10 +860,9 @@ impl Worker {
         &self.lease.seats.sharers[self.lease.seat_number]
     }
 
-    /// Calls the provided closure on the thread's worker instance, if it has one.
-    ///
-    /// Rust's thread locals are fairly costly, so this function is expensive.
-    /// If you can avoid calling it, do so.
+    /// Calls the provided closure on the thread's worker instance, if it has
+    /// one. If this thread is not registered as a worker, the closure is not
+    /// called.
     #[inline(always)]
     pub fn map_current<F, R>(f: F) -> Option<R>
     where
@@ -876,10 +890,9 @@ impl Worker {
         }
     }
 
-    /// Looks up the current `Worker` instance from the thread local.
-    ///
-    /// Rust's thread locals are fairly costly, so this function is expensive.
-    /// If you can avoid calling it, do so.
+    /// Calls the provided closure on the thread's worker instance, if it has
+    /// one. If this thread is not registered as a worker, the closure is passed
+    /// `None`.
     #[inline(always)]
     pub fn with_current<F, R>(f: F) -> R
     where
@@ -903,13 +916,16 @@ impl Worker {
         }
     }
 
-    /// Returns the index of the worker in the leases list.
+    /// Returns this worker's seat index within the pool (0–31).
+    ///
+    /// Seat numbers may be re-used by different workers at different times, and
+    /// may not be contiguous or ordered.
     #[inline(always)]
     pub fn seat_number(&self) -> usize {
         self.lease.seat_number
     }
 
-    /// Returns the index of the thread pool of the worker.
+    /// Returns the thread pool this worker belongs to.
     #[inline(always)]
     pub fn thread_pool(&self) -> &'static ThreadPool {
         self.lease.thread_pool
@@ -1048,7 +1064,6 @@ impl Worker {
         self.lifo_queue
             .pop_newest()
             .or_else(|| self.fifo_queue.pop_oldest())
-            .or_else(|| self.sharer().pop())
     }
 
     /// Finds a job to work on. This tries
@@ -1058,6 +1073,7 @@ impl Worker {
     fn find_work(&self) -> Option<(JobRef, bool)> {
         self.find_local_work()
             .map(|job| (job, false))
+            .or_else(|| self.sharer().pop().map(|job| (job, false)))
             .or_else(|| self.steal_from_siblings().map(|job| (job, true)))
             .or_else(|| self.claim_shared_job().map(|job| (job, true)))
     }
@@ -1104,8 +1120,14 @@ impl Worker {
     /// Cooperatively yields execution to the thread pool, allowing it to execute
     /// some work.
     ///
-    /// This function only executes local work: work already queued on the
-    /// worker. It will never claim shared work.
+    /// This function will only execute work already held locally by the worker,
+    /// and does no synchronization. To claim and run shared work, use
+    /// [`yield_now`][Worker::yield_now].
+    ///
+    /// If no work is found, this returns `Yield::Idle`. This function should
+    /// not be called again (for at least a few microseconds) after an idle.
+    /// Calling this repeatedly in a spin-loop should be avoided, as it's likely
+    /// to significantly spike CPU usage and waste resources.
     #[inline(always)]
     pub fn yield_local(&self) -> Yield {
         // We use LIFO order here, pulling the newest work from the queue. This
@@ -1122,11 +1144,17 @@ impl Worker {
     /// Cooperatively yields execution to the thread pool, allowing it to execute
     /// some work.
     ///
-    /// This function may execute either local or shared work: work already
-    /// queued on the worker, or work off-loaded by a different worker. If there
-    /// is no work on the pool, this will lock the thread pool mutex, so it
-    /// should not be called within a hot loop. Consider using
-    /// [`Worker::yield_local`] instead.
+    /// If the worker has no local work to do, it will try to steal work from
+    /// coworkers or claim work from the shared injection queue. If instead the
+    /// worker has a backlog of local work, the worker may make some of it
+    /// accessible to other workers for stealing. This involves synchronization
+    /// with the pool, and so should be called infrequently. To yield without
+    /// synchronizing with the pool, use [`yield_local`][Worker::yield_local].
+    ///
+    /// If no work is found, this returns `Yield::Idle`. This function should
+    /// not be called again (for at least a few microseconds) after an idle.
+    /// Calling this repeatedly in a spin-loop should be avoided, as it's likely
+    /// to significantly spike CPU usage and waste resources.
     #[inline(always)]
     pub fn yield_now(&self) -> Yield {
         self.promote();
@@ -1160,7 +1188,6 @@ impl Worker {
 // -----------------------------------------------------------------------------
 // Worker operations
 
-/// # Operations
 impl Worker {
     /// Spawns work (a closure or future) onto the thread pool. Just like a
     /// standard thread, this work executes concurrently (and potentially in
@@ -1185,10 +1212,37 @@ impl Worker {
     /// Polls a future to completion, then returns the outcome. This function
     /// will prioritize polling the future as soon as it becomes available, and
     /// while the future is not available it will try to do other meaningful
-    /// work.
+    /// work from the thread-pool. If the thread pool runs out of work, the
+    /// thread is suspended until the future completes or more background work
+    /// becomes available.
+    ///
+    /// # Async & Concurrency
+    ///
+    /// This is a convenient way to introduce concurrency into otherwise blocking
+    /// operations. For example, it is _totally acceptable_ to use `block_on`
+    /// within one of the branches of a `join` operation (to perform I/O, for
+    /// example).
+    ///
+    /// This should **not** be called within `async` contexts. While it will not
+    /// block the execution of work on the pool, it will prevent the enclosing
+    /// future's `poll` method from returning. This can potentially lead to
+    /// deadlocks.
+    ///
+    /// Other implementations of `block_on` (like those defined by the `futures`
+    /// crate) should not be called within parallel forte operations. They will
+    /// block execution of work on the pool.
+    ///
+    /// # Alternatives
     ///
     /// If you do not have access to a [`Worker`], you may call
-    /// [`ThreadPool::block_on`] or simply [`block_on`].
+    /// [`ThreadPool::block_on`] instead. If you don't have a static reference
+    /// to a specific thread pool (as is often the case in library code) you can
+    /// use [`block_on`] instead, as long as you are sure that your code will
+    /// run within a worker.
+    ///
+    /// # Panics
+    ///
+    /// If the future panics, this immediately panics.
     #[inline(always)]
     pub fn block_on<F, T>(&self, future: F) -> T
     where
@@ -1230,8 +1284,118 @@ impl Worker {
 
     /// Executes the two closures, possibly in parallel.
     ///
-    /// If you do not have access to a [`Worker`], you may call
-    /// [`ThreadPool::join`] or simply [`join`].
+    /// This is conceptually similar to spawning two threads to execute each
+    /// closure, and then joining both (although the implementation is quite
+    /// different). It is intended for implementing recursive,
+    /// divide-and-conquer algorithms where each branch may itself call `join`.
+    ///
+    /// # Examples
+    ///
+    /// This example (taken wholesale from `rayon`) uses `join` to perform a
+    /// quick-sort.
+    ///
+    /// ```rust
+    /// # use forte::*;
+    /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
+    /// # THREAD_POOL.resize_to_available();
+    ///
+    /// let mut v = vec![5, 1, 8, 22, 0, 44];
+    /// THREAD_POOL.on_worker(|worker| quick_sort(worker, &mut v));
+    /// assert_eq!(v, vec![0, 1, 5, 8, 22, 44]);
+    ///
+    /// fn quick_sort<T: PartialOrd + Send>(worker: &Worker, v: &mut [T]) {
+    ///     if v.len() > 1 {
+    ///         let mid = partition(v);
+    ///         let (lo, hi) = v.split_at_mut(mid);
+    ///         worker.join(|w| quick_sort(w, lo),
+    ///                     |w| quick_sort(w, hi));
+    ///     }
+    /// }
+    ///
+    /// // Partition rearranges all items `<=` to the pivot
+    /// // item (arbitrarily selected to be the last item in the slice)
+    /// // to the first half of the slice. It then returns the
+    /// // "dividing point" where the pivot is placed.
+    /// fn partition<T: PartialOrd + Send>(v: &mut [T]) -> usize {
+    ///     let pivot = v.len() - 1;
+    ///     let mut i = 0;
+    ///     for j in 0..pivot {
+    ///         if v[j] <= v[pivot] {
+    ///             v.swap(i, j);
+    ///             i += 1;
+    ///         }
+    ///     }
+    ///     v.swap(i, pivot);
+    ///     i
+    /// }
+    /// ```
+    ///
+    /// This example (taken from `chili`) shows how to use `join` to sum the
+    /// nodes of a binary tree.
+    ///
+    /// ```rust
+    /// # use forte::*;
+    /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
+    /// # THREAD_POOL.resize_to_available();
+    ///
+    /// let tree = gen_tree(8);
+    /// let result = THREAD_POOL.on_worker(|worker| sum(worker, &tree));
+    /// assert_eq!(result, 255);
+    ///
+    /// struct Node {
+    ///     val: u64,
+    ///     left: Option<Box<Node>>,
+    ///     right: Option<Box<Node>>,
+    /// }
+    ///
+    /// fn gen_tree(layers: usize) -> Box<Node> {
+    ///     Box::new(Node {
+    ///         val: 1,
+    ///         left: (layers != 1).then(|| gen_tree(layers - 1)),
+    ///         right: (layers != 1).then(|| gen_tree(layers - 1)),
+    ///     })
+    /// }
+    ///
+    /// fn sum(worker: &Worker, node: &Node) -> u64 {
+    ///     let (left, right) = worker.join(
+    ///         |w| node.left.as_deref().map(|n| sum(w, n)).unwrap_or_default(),
+    ///         |w| node.right.as_deref().map(|n| sum(w, n)).unwrap_or_default(),
+    ///     );
+    ///     node.val + left + right
+    /// }
+    /// ```
+    ///
+    /// # Alternatives
+    ///
+    /// If you do not have a `Worker`, you can use [`ThreadPool::join`]
+    /// instead. If you don't have a static reference to a specific thread pool
+    /// (as is often the case in library code) you can use [`join`] instead, as
+    /// long as you are sure that your code will run within a worker.
+    ///
+    /// If your workload isn't amenable to the divide-and-conquer approach or is
+    /// async, but you still want to borrow local data in your computations, you
+    /// may want to use a [`scope`][`Worker::scope`] instead.
+    ///
+    /// # Warning about blocking I/O
+    ///
+    /// The assumption is that the closures given to `join()` are CPU-bound
+    /// tasks that do not perform blocking operations. If you do perform I/O,
+    /// and that I/O should block (e.g., waiting for a network request), the
+    /// overall performance may be poor. Moreover, if you cause one closure to
+    /// be blocked waiting on another (for example, using a channel), that could
+    /// lead to a deadlock.
+    ///
+    /// You can use [`block_on`][Worker::block_on] to do async I/O within a
+    /// `join` branch, as long as different branches are not made to depend on
+    /// each other.
+    ///
+    /// # Panics
+    ///
+    /// Both closures are always executed to completion. If either panics,
+    /// `join` will propagate that panic after both complete. When both panic,
+    /// only the panic from the first argument is propagated and the panic from
+    /// the other argument is dropped (this may cause program aborts in some
+    /// situations).
     #[inline(always)]
     pub fn join<A, B, RA, RB>(&self, a: A, b: B) -> (RA, RB)
     where
@@ -1295,9 +1459,209 @@ impl Worker {
         }
     }
 
-    /// Creates a scope onto which non-static work can be spawned. For more complete docs, see [`scope`].
+    /// Creates a new scope for spawning non-static work.
+    ///
+    /// Work spawned onto the new scope does not have to have a `'static`
+    /// lifetime, and can borrow local variables. Local borrowing is possible
+    /// because this function will not return until all work spawned on the
+    /// scope has completed, thus ensuring the stack frame is kept alive for the
+    /// duration.
+    ///
+    /// # Accessing stack data
+    ///
+    /// In general, spawned tasks may borrow any stack data that lives outside
+    /// the scope closure.
+    ///
+    /// ```
+    /// # use forte::ThreadPool;
+    /// # use forte::Worker;
+    /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
+    /// # THREAD_POOL.populate();
+    /// # THREAD_POOL.expect_worker(|worker| {
+    /// let ok: Vec<i32> = vec![1, 2, 3];
+    /// forte::scope(|scope| {
+    ///     let bad: Vec<i32> = vec![4, 5, 6];
+    ///     scope.spawn_on(worker, |_: &Worker| {
+    ///         // Transfer ownership of `bad` into a local variable (also named `bad`).
+    ///         // This will force the closure to take ownership of `bad` from the environment.
+    ///         let bad = bad;
+    ///         println!("ok: {:?}", ok); // `ok` is only borrowed.
+    ///         println!("bad: {:?}", bad); // refers to our local variable, above.
+    ///     });
+    ///
+    ///     scope.spawn_on(worker, |_: &Worker| println!("ok: {:?}", ok)); // we too can borrow `ok`
+    /// });
+    /// # });
+    /// ```
+    /// As the comments in the example above suggest, to reference `bad` we must
+    /// take ownership of it. One way to do this is to detach the closure
+    /// from the surrounding stack frame, using the `move` keyword. This
+    /// will cause it to take ownership of *all* the variables it touches,
+    /// in this case including both `ok` *and* `bad`:
+    ///
+    /// ```rust
+    /// # use forte::ThreadPool;
+    /// # use forte::Worker;
+    /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
+    /// # THREAD_POOL.populate();
+    /// # THREAD_POOL.expect_worker(|worker| {
+    /// let ok: Vec<i32> = vec![1, 2, 3];
+    /// forte::scope(|scope| {
+    ///     let bad: Vec<i32> = vec![4, 5, 6];
+    ///     scope.spawn_on(worker, move |_: &Worker| {
+    ///         println!("ok: {:?}", ok);
+    ///         println!("bad: {:?}", bad);
+    ///     });
+    ///
+    ///     // That closure is fine, but now we can't use `ok` anywhere else,
+    ///     // since it is owned by the previous task:
+    ///     // scope.spawn_on(worker, |_: &Worker| println!("ok: {:?}", ok));
+    /// });
+    /// # });
+    /// ```
+    ///
+    /// While this works, it could be a problem if we want to use `ok` elsewhere.
+    /// There are two choices. We can keep the closure as a `move` closure, but
+    /// instead of referencing the variable `ok`, we create a shadowed variable that
+    /// is a borrow of `ok` and capture *that*:
+    ///
+    /// ```rust
+    /// # use forte::ThreadPool;
+    /// # use forte::Worker;
+    /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
+    /// # THREAD_POOL.populate();
+    /// # THREAD_POOL.expect_worker(|worker| {
+    /// let ok: Vec<i32> = vec![1, 2, 3];
+    /// forte::scope(|scope| {
+    ///     let bad: Vec<i32> = vec![4, 5, 6];
+    ///     let ok: &Vec<i32> = &ok; // shadow the original `ok`
+    ///     scope.spawn_on(worker, move |_: &Worker| {
+    ///         println!("ok: {:?}", ok); // captures the shadowed version
+    ///         println!("bad: {:?}", bad);
+    ///     });
+    ///
+    ///     // Now we too can use the shadowed `ok`, since `&Vec<i32>` references
+    ///     // can be shared freely. Note that we need a `move` closure here though,
+    ///     // because otherwise we'd be trying to borrow the shadowed `ok`,
+    ///     // and that doesn't outlive `scope`.
+    ///     scope.spawn_on(worker, move |_: &Worker| println!("ok: {:?}", ok));
+    /// });
+    /// # });
+    /// ```
+    ///
+    /// Another option is not to use the `move` keyword but instead to take ownership
+    /// of individual variables:
+    ///
+    /// ```rust
+    /// # use forte::ThreadPool;
+    /// # use forte::Worker;
+    /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
+    /// # THREAD_POOL.populate();
+    /// # THREAD_POOL.expect_worker(|worker| {
+    /// let ok: Vec<i32> = vec![1, 2, 3];
+    /// forte::scope(|scope| {
+    ///     let bad: Vec<i32> = vec![4, 5, 6];
+    ///     scope.spawn_on(worker, |_: &Worker| {
+    ///         // Transfer ownership of `bad` into a local variable (also named `bad`).
+    ///         // This will force the closure to take ownership of `bad` from the environment.
+    ///         let bad = bad;
+    ///         println!("ok: {:?}", ok); // `ok` is only borrowed.
+    ///         println!("bad: {:?}", bad); // refers to our local variable, above.
+    ///     });
+    ///
+    ///     scope.spawn_on(worker, |_: &Worker| println!("ok: {:?}", ok)); // we too can borrow `ok`
+    /// });
+    /// # });
+    /// ```
+    ///
+    /// # Referencing the scope
+    ///
+    /// The scope passed into the closure is not allowed to leak out of this call.
+    /// In other words, this will fail to compile:
+    ///
+    /// ```compile_fail
+    /// # use forte::ThreadPool;
+    /// # use forte::Worker;
+    /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
+    /// # THREAD_POOL.populate();
+    /// # THREAD_POOL.expect_worker(|worker| {
+    /// let mut leak = None;
+    /// forte::scope(|scope| {
+    ///     leak = Some(scope); // <-- ERROR: scope would be leaked here
+    /// });
+    /// drop(leak);
+    /// # });
+    /// ```
+    ///
+    /// Anything spawned onto the scope can capture a reference to it.
+    /// This allows scoped work to spawn other scoped work.
     ///
-    /// If you do not have access to a worker, you can use [`ThreadPool::scope`] or simply [`scope`].
+    /// ```
+    /// # use forte::ThreadPool;
+    /// # use forte::Worker;
+    /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
+    /// # THREAD_POOL.populate();
+    /// # THREAD_POOL.expect_worker(|worker| {
+    /// let mut counter = 0;
+    /// let counter_ref = &mut counter;
+    /// forte::scope(|scope| {
+    ///     scope.spawn_on(worker, |worker: &Worker| {
+    ///         *counter_ref += 1;
+    ///         // Note: we borrow the scope again here.
+    ///         scope.spawn_on(worker, move |_: &Worker| {
+    ///             *counter_ref += 1;
+    ///         });
+    ///     });
+    /// });
+    /// assert_eq!(counter, 2);
+    /// # });
+    /// ```
+    ///
+    /// It's possible to spawn non-scoped work within the closure, but these
+    /// generally can't hold references to the scope. So for example, the
+    /// following also fails to compile:
+    ///
+    /// ```compile_fail,E0521
+    /// # use forte::ThreadPool;
+    /// # use forte::Worker;
+    /// # static THREAD_POOL: ThreadPool = ThreadPool::new();
+    /// # THREAD_POOL.populate();
+    /// THREAD_POOL.with_worker(|worker| {
+    ///     worker.scope(|scope| {
+    ///         worker.spawn(|worker: &Worker| {
+    ///             // ^^^^^ ERROR: This creates a *static* job on the worker,
+    ///             //       which may outlive the scope.
+    ///             
+    ///             scope.spawn_on(worker, |_: &Worker| { });
+    ///             // ^^^^^ ERROR: This requires borrowing the scope within the
+    ///             //       unscoped job, which isn't allowed by the compiler
+    ///             //       because 'scope would have to outlive 'static.
+    ///         });
+    ///     });
+    /// });
+    /// ```
+    ///
+    /// # Alternatives
+    ///
+    /// If you do not have a `Worker`, you can use [`ThreadPool::scope`]
+    /// instead. If you don't have a static reference to a specific thread pool
+    /// (as is often the case in library code) you can use [`scope`] instead, as
+    /// long as you are sure that your code will run within a worker.
+    ///
+    /// Scopes are a more flexible building block compared to
+    /// [`join`][Worker::join], since a loop can be used to spawn any number of
+    /// tasks without recursing. However, that flexibility comes at a
+    /// performance price: tasks spawned using `scope` must be allocated onto
+    /// the heap, whereas [`join`][Worker::join] can make exclusive use of the
+    /// stack. Prefer [`join`][Worker::join] where possible.
+    ///
+    /// # Panics
+    ///
+    /// If a panic occurs, either in the closure given to `scope` or in a job
+    /// spawned on the scope, that panic is caught and stored. When all the work
+    /// on the scope is complete, `scope` will then re-emit that panic. If
+    /// multiple panics occur, the first will propagate and the others will be
+    /// caught and dropped (which may result in program aborts).
     #[inline(always)]
     pub fn scope<'env, F, T>(&self, f: F) -> T
     where
@@ -1308,16 +1672,24 @@ impl Worker {
 }
 
 // -----------------------------------------------------------------------------
-// Thread local scheduling api
+// Implicit worker registration api
 
 /// Runs the provided closure in the background.
 ///
-/// <div class="warning">
-/// <strong>Note:</strong>
-/// This function panics if the current thread is not registered as a worker.
-/// </div>
+/// When executed on a thread that is currently registered as a worker (i.e. the
+/// closure inside [`Worker::occupy`], [`ThreadPool::with_worker`], or similar)
+/// this is able to look up that registration and find the worker and
+/// thread-pool implicitly.
+///
+/// If you have a reference to a [`Worker`], it's better to use [`Worker::spawn`]
+/// instead. If you don't have a worker, but know which thread pool you want to
+/// use, [`ThreadPool::spawn`] is more appropriate.
+///
+/// <div class="example-wrap" style="display:inline-block"><pre class="compile_fail" style="white-space:normal;font:inherit;">
+///
+/// **Warning:** This function panics if the current thread is not registered as a worker.
 ///
-/// See also: [`Worker::spawn`] and [`ThreadPool::spawn`].
+/// </pre></div>
 pub fn spawn<M, S: Spawn<M>>(work: S) -> S::Output {
     Worker::with_current(|worker| {
         worker
@@ -1328,12 +1700,20 @@ pub fn spawn<M, S: Spawn<M>>(work: S) -> S::Output {
 
 /// Waits for a future to complete.
 ///
-/// <div class="warning">
-/// <strong>Note:</strong>
-/// This function panics if the current thread is not registered as a worker.
-/// </div>
+/// When executed on a thread that is currently registered as a worker (i.e. the
+/// closure inside [`Worker::occupy`], [`ThreadPool::with_worker`], or similar)
+/// this is able to look up that registration and find the worker and
+/// thread-pool implicitly.
 ///
-/// See also: [`Worker::block_on`] and [`ThreadPool::block_on`].
+/// If you have a reference to a [`Worker`], it's better to use
+/// [`Worker::block_on`] instead. If you don't have a worker, but know which
+/// thread pool you want to use, [`ThreadPool::block_on`] is more appropriate.
+///
+/// <div class="example-wrap" style="display:inline-block"><pre class="compile_fail" style="white-space:normal;font:inherit;">
+///
+/// **Warning:** This function panics if the current thread is not registered as a worker.
+///
+/// </pre></div>
 pub fn block_on<F, T>(future: F) -> T
 where
     F: Future<Output = T> + Send,
@@ -1348,12 +1728,20 @@ where
 
 /// Executes the two closures, possibly in parallel.
 ///
-/// <div class="warning">
-/// <strong>Note:</strong>
-/// This function panics if the current thread is not registered as a worker.
-/// </div>
+/// When executed on a thread that is currently registered as a worker (i.e. the
+/// closure inside [`Worker::occupy`], [`ThreadPool::with_worker`], or similar)
+/// this is able to look up that registration and find the worker and
+/// thread-pool implicitly.
+///
+/// If you have a reference to a [`Worker`], it's better to use [`Worker::join`]
+/// instead. If you don't have a worker, but know which thread pool you want to
+/// use, [`ThreadPool::join`] is more appropriate.
+///
+/// <div class="example-wrap" style="display:inline-block"><pre class="compile_fail" style="white-space:normal;font:inherit;">
 ///
-/// See also: [`Worker::join`] and [`ThreadPool::join`].
+/// **Warning:** This function panics if the current thread is not registered as a worker.
+///
+/// </pre></div>
 pub fn join<A, B, RA, RB>(a: A, b: B) -> (RA, RB)
 where
     A: FnOnce(&Worker) -> RA + Send,
@@ -1370,218 +1758,20 @@ where
 
 /// Creates a new scope for spawning non-static work.
 ///
-/// Work spawned onto the new scope does not have to have a `'static`
-/// lifetime, and can borrow local variables. Local borrowing is possible
-/// because this function will not return until all work spawned on the
-/// scope has completed, this ensuring the stack frame is kept alive for the
-/// duration.
-///
-/// <div class="warning">
-/// <strong>Note:</strong>
-/// This function panics if the current thread is not registered as a worker.
-/// </div>
-///
-/// # Alternatives
-///
-/// Where possible, [`ThreadPool::scope`] or [`Worker::scope`] should be used
-/// instead. These functions are more efficient, and do not panic when not
-/// within a worker.
-///
-/// Scopes are a more flexible building block compared to [`join()`], since a
-/// loop can be used to spawn any number of tasks without recursing.
-/// However, that flexibility comes at a performance price: tasks spawned
-/// using `scope` must be allocated onto the heap, whereas [`join()`] can make
-/// exclusive use of the stack. Prefer [`join()`] (or ideally [`Worker::join`]) where possible.
-///
-/// [`join()`]: Worker::join
-///
-/// # Accessing stack data
-///
-/// In general, spawned tasks may borrow any stack data that lives outside
-/// the scope closure.
-///
-/// ```
-/// # use forte::ThreadPool;
-/// # use forte::Worker;
-/// # static THREAD_POOL: ThreadPool = ThreadPool::new();
-/// # THREAD_POOL.populate();
-/// # THREAD_POOL.expect_worker(|worker| {
-/// let ok: Vec<i32> = vec![1, 2, 3];
-/// forte::scope(|scope| {
-///     let bad: Vec<i32> = vec![4, 5, 6];
-///     scope.spawn_on(worker, |_: &Worker| {
-///         // Transfer ownership of `bad` into a local variable (also named `bad`).
-///         // This will force the closure to take ownership of `bad` from the environment.
-///         let bad = bad;
-///         println!("ok: {:?}", ok); // `ok` is only borrowed.
-///         println!("bad: {:?}", bad); // refers to our local variable, above.
-///     });
-///
-///     scope.spawn_on(worker, |_: &Worker| println!("ok: {:?}", ok)); // we too can borrow `ok`
-/// });
-/// # });
-/// ```
-/// As the comments example above suggest, to reference `bad` we must
-/// take ownership of it. One way to do this is to detach the closure
-/// from the surrounding stack frame, using the `move` keyword. This
-/// will cause it to take ownership of *all* the variables it touches,
-/// in this case including both `ok` *and* `bad`:
-///
-/// ```rust
-/// # use forte::ThreadPool;
-/// # use forte::Worker;
-/// # static THREAD_POOL: ThreadPool = ThreadPool::new();
-/// # THREAD_POOL.populate();
-/// # THREAD_POOL.expect_worker(|worker| {
-/// let ok: Vec<i32> = vec![1, 2, 3];
-/// forte::scope(|scope| {
-///     let bad: Vec<i32> = vec![4, 5, 6];
-///     scope.spawn_on(worker, move |_: &Worker| {
-///         println!("ok: {:?}", ok);
-///         println!("bad: {:?}", bad);
-///     });
-///
-///     // That closure is fine, but now we can't use `ok` anywhere else,
-///     // since it is owned by the previous task:
-///     // scope.spawn_on(worker, |_: &Worker| println!("ok: {:?}", ok));
-/// });
-/// # });
-/// ```
+/// When executed on a thread that is currently registered as a worker (e.g.
+/// within the closure passed to [`Worker::occupy`],
+/// [`ThreadPool::with_worker`], or similar), this function looks up that
+/// registration to find the worker and thread pool implicitly.
 ///
-/// While this works, it could be a problem if we want to use `ok` elsewhere.
-/// There are two choices. We can keep the closure as a `move` closure, but
-/// instead of referencing the variable `ok`, we create a shadowed variable that
-/// is a borrow of `ok` and capture *that*:
-///
-/// ```rust
-/// # use forte::ThreadPool;
-/// # use forte::Worker;
-/// # static THREAD_POOL: ThreadPool = ThreadPool::new();
-/// # THREAD_POOL.populate();
-/// # THREAD_POOL.expect_worker(|worker| {
-/// let ok: Vec<i32> = vec![1, 2, 3];
-/// forte::scope(|scope| {
-///     let bad: Vec<i32> = vec![4, 5, 6];
-///     let ok: &Vec<i32> = &ok; // shadow the original `ok`
-///     scope.spawn_on(worker, move |_: &Worker| {
-///         println!("ok: {:?}", ok); // captures the shadowed version
-///         println!("bad: {:?}", bad);
-///     });
-///
-///     // Now we too can use the shadowed `ok`, since `&Vec<i32>` references
-///     // can be shared freely. Note that we need a `move` closure here though,
-///     // because otherwise we'd be trying to borrow the shadowed `ok`,
-///     // and that doesn't outlive `scope`.
-///     scope.spawn_on(worker, move |_: &Worker| println!("ok: {:?}", ok));
-/// });
-/// # });
-/// ```
-///
-/// Another option is not to use the `move` keyword but instead to take ownership
-/// of individual variables:
-///
-/// ```rust
-/// # use forte::ThreadPool;
-/// # use forte::Worker;
-/// # static THREAD_POOL: ThreadPool = ThreadPool::new();
-/// # THREAD_POOL.populate();
-/// # THREAD_POOL.expect_worker(|worker| {
-/// let ok: Vec<i32> = vec![1, 2, 3];
-/// forte::scope(|scope| {
-///     let bad: Vec<i32> = vec![4, 5, 6];
-///     scope.spawn_on(worker, |_: &Worker| {
-///         // Transfer ownership of `bad` into a local variable (also named `bad`).
-///         // This will force the closure to take ownership of `bad` from the environment.
-///         let bad = bad;
-///         println!("ok: {:?}", ok); // `ok` is only borrowed.
-///         println!("bad: {:?}", bad); // refers to our local variable, above.
-///     });
-///
-///     scope.spawn_on(worker, |_: &Worker| println!("ok: {:?}", ok)); // we too can borrow `ok`
-/// });
-/// # });
-/// ```
-///
-/// # Referencing the scope
-///
-/// The scope passed into the closure is not allowed to leak out of this call.
-/// In other words, this will fail to compile:
-///
-/// ```compile_fail
-/// # use forte::ThreadPool;
-/// # use forte::Worker;
-/// # static THREAD_POOL: ThreadPool = ThreadPool::new();
-/// # THREAD_POOL.populate();
-/// # THREAD_POOL.expect_worker(|worker| {
-/// let mut leak = None;
-/// forte::scope(|scope| {
-///     leak = Some(scope); // <-- ERROR: scope would be leaked here
-/// });
-/// drop(leak);
-/// # });
-/// ```
-///
-/// Anything spawned onto the scope can capture a reference to it.
-/// This allows scoped work to spawn other scoped work.
-///
-/// ```
-/// # use forte::ThreadPool;
-/// # use forte::Worker;
-/// # static THREAD_POOL: ThreadPool = ThreadPool::new();
-/// # THREAD_POOL.populate();
-/// # THREAD_POOL.expect_worker(|worker| {
-/// let mut counter = 0;
-/// let counter_ref = &mut counter;
-/// forte::scope(|scope| {
-///     scope.spawn_on(worker, |worker: &Worker| {
-///         *counter_ref += 1;
-///         // Note: we borrow the scope again here.
-///         scope.spawn_on(worker, move |_: &Worker| {
-///             *counter_ref += 1;
-///         });
-///     });
-/// });
-/// assert_eq!(counter, 2);
-/// # });
-/// ```
-///
-/// It's possible to spawn non-scoped work within the closure, but these
-/// generally can't hold references to the scope. So for example, the
-/// following also fails to compile:
-///
-/// ```compile_fail,E0521
-/// # use forte::ThreadPool;
-/// # use forte::Worker;
-/// # static THREAD_POOL: ThreadPool = ThreadPool::new();
-/// # THREAD_POOL.populate();
-/// THREAD_POOL.with_worker(|worker| {
-///     worker.scope(|scope| {
-///         worker.spawn(|worker: &Worker| {
-///             // ^^^^^ ERROR: This creates a *static* job on the worker,
-///             //       which may outlive the scope.
-///             
-///             scope.spawn_on(worker, |_: &Worker| { });
-///             // ^^^^^ ERROR: This requires borrowing the scope within the
-///             //       unscoped job, which isn't allowed by the compiler
-///             //       because 'scope would have to to outlive 'static.
-///         });
-///     });
-/// });
-/// ```
+/// If you have a reference to a [`Worker`], it's better to use
+/// [`Worker::scope`] instead. If you don't have a worker, but know which thread
+/// pool you want to use, [`ThreadPool::scope`] is more appropriate.
 ///
-/// # Panics
+/// <div class="example-wrap" style="display:inline-block"><pre class="compile_fail" style="white-space:normal;font:inherit;">
 ///
-/// This function panics when not called within a worker. The
-/// [`ThreadPool::scope`] and [`Worker::scope`] functions do not, and should be
-/// preferred when possible.
+/// **Warning:** This function panics if the current thread is not registered as a worker.
 ///
-/// If a panic occurs, either in the closure given to `scope()` or in a blocking
-/// (non-async) job spawned on the scope, that panic will be propagated and the
-/// call to `scope()` will panic. If multiple panics occurs, it is
-/// non-deterministic which of their panic values will propagate. Regardless,
-/// once a task is spawned using `scope.spawn(),` it will execute, even if the
-/// spawning task should later panic. The scope returns once all work is
-/// complete, and panics are propagated at that point.
+/// </pre></div>
 pub fn scope<'env, F, T>(f: F) -> T
 where
     F: for<'scope> FnOnce(&'scope Scope<'scope, 'env>) -> T,

From be90449ac5cbdcaef59cf72eba41463fd7f5f70c Mon Sep 17 00:00:00 2001
From: NthTensor <nth.tensor@gmail.com>
Date: Mon, 4 May 2026 07:51:17 -0400
Subject: [PATCH 3/3] fix: switch to hotclock for cpu ticks

---
 Cargo.lock         | 13 ++++++-------
 Cargo.toml         |  2 +-
 src/thread_pool.rs |  2 +-
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 75b2950..0c5fddb 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -499,10 +499,10 @@ dependencies = [
  "crossbeam-utils",
  "dashmap",
  "divan",
+ "hotclock",
  "rayon",
  "shuttle",
  "st3",
- "tick_counter",
  "tracing",
  "tracing-subscriber",
 ]
@@ -633,6 +633,11 @@ version = "0.4.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
 
+[[package]]
+name = "hotclock"
+version = "0.2.0"
+source = "git+https://github.com/spence/hotclock#8cf14ae9d62dba7f7780a3c920ab6208b6568777"
+
 [[package]]
 name = "is-terminal"
 version = "0.4.16"
@@ -1184,12 +1189,6 @@ dependencies = [
  "cfg-if",
 ]
 
-[[package]]
-name = "tick_counter"
-version = "0.4.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "37f1310986d0aa940019cbb2b480161c60a614dba076cbb20e82bfbc236bbabd"
-
 [[package]]
 name = "tinytemplate"
 version = "1.2.1"
diff --git a/Cargo.toml b/Cargo.toml
index 2d68140..89cb4d4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,7 +16,7 @@ atomic-wait = "1.1.0"
 crossbeam-queue = "0.3.12"
 crossbeam-utils = "0.8.21"
 st3 = "0.4"
-tick_counter = "0.4.5"
+hotclock = { git = "https://github.com/spence/hotclock" }
 
 shuttle = { version = "0.8.0", optional = true }
 tracing = { version = "0.1.41", features = ["release_max_level_off"] }
diff --git a/src/thread_pool.rs b/src/thread_pool.rs
index 4c235cc..69ca80b 100644
--- a/src/thread_pool.rs
+++ b/src/thread_pool.rs
@@ -945,7 +945,7 @@ impl Worker {
         // Promotions are fairly costly, so we limit their frequency using the
         // cpu's instruction counter. Promote is called at a high frequency, and
         // actually doing the promotion is probably a cold path.
-        let current_tick = tick_counter::start();
+        let current_tick = hotclock::Instant::now().as_raw();
         if current_tick.wrapping_sub(self.last_promote_tick.get()) >= Self::PROMOTE_TICK_INTERVAL {
             // This should ideally become a conditional jump.
             self.promote_cold(current_tick);