From 2c10ea5e28df6a790cd276ed4673e66a16a2b128 Mon Sep 17 00:00:00 2001 From: NthTensor Date: Sun, 25 Jan 2026 13:15:00 -0500 Subject: [PATCH 1/3] feat: instruction counters and lazy scheduling --- CHANGELOG.md | 18 + Cargo.lock | 165 ++++++- Cargo.toml | 4 +- benches/bevy_tasks.rs | 18 +- benches/flat_scope.rs | 6 +- benches/flood_fill.rs | 15 +- benches/fork_join.rs | 57 ++- src/compile_fail.rs | 313 +++++++------ src/job.rs | 507 +++++++++++++++++---- src/latch.rs | 181 ++++---- src/lib.rs | 57 +-- src/scope.rs | 272 +++++++---- src/thread_pool.rs | 1000 ++++++++++++++++++++++------------------- src/util.rs | 9 +- tests/shuttle.rs | 145 +----- 15 files changed, 1714 insertions(+), 1053 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 03e89e3..3f7ea85 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,4 @@ + # Changelog All notable changes to this project will be documented in this file. @@ -13,6 +14,23 @@ This project is currently in early [pre-release], and there may be arbitrary bre ## [Unreleased] +### Added + +- `ThreadPool::num_workers` method that returns the current number of workers. +- `ThreadPool::on_worker` variant of `with_worker` for `Send` closures. +- `ThreadPool::expect_worker` variant of `with_worker` that panics. + +### Changed +- Work sharing has been rewritten to improve performance. +- Thread pools can now have a max of 32 workers at a time. +- `spawn`, `Scope::spawn`, and `Worker::spawn` now accept closures and futures. +- `ThreadPool::with_worker` now provides `Option<&Worker>` instead of `&Worker`. +- `claim_lease` now returns `Option` instead of `Lease`. +- `Scope` now has two lifetimes instead of one, and is more flexible. + +### Removed +- All versions of `spawn_future` and `spawn_async`; just use `spawn` instead. 
+ ## [1.0.0-alpha.4] ### Added diff --git a/Cargo.lock b/Cargo.lock index 4274d5d..75b2950 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -55,12 +55,6 @@ dependencies = [ "serde", ] -[[package]] -name = "arraydeque" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d902e3d592a523def97af8f317b08ce16b7ab854c1985a0c671e6f15cebc236" - [[package]] name = "assoc" version = "0.1.3" @@ -496,7 +490,6 @@ checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" name = "forte" version = "1.0.0-dev" dependencies = [ - "arraydeque", "async-task", "atomic-wait", "bevy_tasks", @@ -508,6 +501,8 @@ dependencies = [ "divan", "rayon", "shuttle", + "st3", + "tick_counter", "tracing", "tracing-subscriber", ] @@ -543,6 +538,19 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "generator" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cc16584ff22b460a382b7feec54b23d2908d858152e5739a120b949293bd74e" +dependencies = [ + "cc", + "libc", + "log", + "rustversion", + "windows 0.48.0", +] + [[package]] name = "generator" version = "0.8.5" @@ -554,7 +562,7 @@ dependencies = [ "libc", "log", "rustversion", - "windows", + "windows 0.61.3", ] [[package]] @@ -694,6 +702,28 @@ version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" +[[package]] +name = "loom" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff50ecb28bb86013e935fb6683ab1f6d3a20016f123c76fd4c27470076ac30f5" +dependencies = [ + "cfg-if", + "generator 0.7.5", + "scoped-tls", + "tracing", + "tracing-subscriber", +] + +[[package]] +name = "matchers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558" +dependencies = [ + "regex-automata 0.1.10", +] + 
[[package]] name = "memchr" version = "2.7.5" @@ -920,8 +950,17 @@ checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", - "regex-automata", - "regex-syntax", + "regex-automata 0.4.9", + "regex-syntax 0.8.5", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +dependencies = [ + "regex-syntax 0.6.29", ] [[package]] @@ -932,7 +971,7 @@ checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.8.5", ] [[package]] @@ -941,6 +980,12 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" +[[package]] +name = "regex-syntax" +version = "0.6.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + [[package]] name = "regex-syntax" version = "0.8.5" @@ -1055,7 +1100,7 @@ dependencies = [ "assoc", "bitvec", "cfg-if", - "generator", + "generator 0.8.5", "hex", "owo-colors", "rand", @@ -1087,6 +1132,16 @@ dependencies = [ "portable-atomic", ] +[[package]] +name = "st3" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5a1d2cec4c9904d238075cb3a212615e67aee6acce849e4e565acf2320a7bf1" +dependencies = [ + "crossbeam-utils", + "loom", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -1129,6 +1184,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "tick_counter" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37f1310986d0aa940019cbb2b480161c60a614dba076cbb20e82bfbc236bbabd" + [[package]] name = "tinytemplate" version = "1.2.1" @@ -1188,10 +1249,14 
@@ version = "0.3.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008" dependencies = [ + "matchers", "nu-ansi-term", + "once_cell", + "regex", "sharded-slab", "smallvec", "thread_local", + "tracing", "tracing-core", "tracing-log", ] @@ -1323,6 +1388,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +dependencies = [ + "windows-targets 0.48.5", +] + [[package]] name = "windows" version = "0.61.3" @@ -1371,9 +1445,9 @@ dependencies = [ [[package]] name = "windows-implement" -version = "0.60.0" +version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", @@ -1382,9 +1456,9 @@ dependencies = [ [[package]] name = "windows-interface" -version = "0.59.1" +version = "0.59.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", @@ -1452,7 +1526,22 @@ version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", ] [[package]] @@ -1486,6 +1575,12 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" @@ -1498,6 +1593,12 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" @@ -1510,6 +1611,12 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -1528,6 +1635,12 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + [[package]] name = "windows_i686_msvc" version = "0.52.6" @@ -1540,6 +1653,12 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" @@ -1552,6 +1671,12 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" @@ -1564,6 +1689,12 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" diff --git a/Cargo.toml b/Cargo.toml index 3a5057c..d77e28f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,11 +11,13 @@ resolver = "2" members = ["ci"] [dependencies] -arraydeque = "0.5.1" async-task = "4.7.1" atomic-wait = "1.1.0" crossbeam-queue = "0.3.12" crossbeam-utils = "0.8.21" +st3 = "0.4" +tick_counter = "0.4.5" + shuttle = { version = "0.8.0", optional = true } tracing = { version = "0.1.41", features = 
["release_max_level_off"] } tracing-subscriber = "0.3.19" diff --git a/benches/bevy_tasks.rs b/benches/bevy_tasks.rs index d3cbe33..b730f1f 100644 --- a/benches/bevy_tasks.rs +++ b/benches/bevy_tasks.rs @@ -40,21 +40,21 @@ mod overhead { for i in 0..80 { black_box(i); } - // std::thread::sleep(Duration::from_nanos(100)); black_box(value); } #[divan::bench(args = LEN)] - fn serial(bencher: Bencher, len: usize) { + fn baseline(bencher: Bencher, len: usize) { let mut vec: Vec<_> = (0..len).collect(); bencher.bench_local(|| vec.iter_mut().for_each(work)); } #[divan::bench(args = LEN)] fn bevy_tasks(bencher: Bencher, len: usize) { - use crate::BevyParChunksMut; use bevy_tasks::ParallelIterator; + use crate::BevyParChunksMut; + let mut vec: Vec<_> = (0..len).collect(); let pool = bevy_tasks::TaskPoolBuilder::new() .thread_name("bevy_tasks".to_string()) @@ -83,18 +83,18 @@ mod overhead { let mut vec: Vec<_> = (0..len).collect(); - THREAD_POOL.resize_to_available(); - - bencher.bench_local(|| { - THREAD_POOL.with_worker(|worker| { - forte_chunks::<8, _, _>(worker, &mut vec, &|c| { + THREAD_POOL.expect_worker(|worker| { + bencher.bench_local(|| { + forte_chunks::<64, _, _>(worker, &mut vec, &|c| { c.iter_mut().for_each(work); }); - }) + }); }); } } fn main() { + THREAD_POOL.resize_to_available(); + divan::main(); } diff --git a/benches/flat_scope.rs b/benches/flat_scope.rs index 0aca71e..8d48ef0 100644 --- a/benches/flat_scope.rs +++ b/benches/flat_scope.rs @@ -1,6 +1,8 @@ //! A benchmark for fork-join workloads adapted from `chili`. 
-use std::hash::{DefaultHasher, Hash, Hasher}; +use std::hash::DefaultHasher; +use std::hash::Hash; +use std::hash::Hasher; use criterion::black_box; use divan::Bencher; @@ -37,7 +39,7 @@ static COMPUTE: forte::ThreadPool = forte::ThreadPool::new(); fn forte(bencher: Bencher, size: usize) { use forte::Worker; - COMPUTE.with_worker(|worker| { + COMPUTE.expect_worker(|worker| { bencher.bench_local(|| { worker.scope(|scope| { for i in 0..size { diff --git a/benches/flood_fill.rs b/benches/flood_fill.rs index 5ccb631..471953d 100644 --- a/benches/flood_fill.rs +++ b/benches/flood_fill.rs @@ -1,7 +1,10 @@ //! A benchmark for fork-join workloads adapted from `chili`. -use std::collections::{HashSet, VecDeque}; -use std::hash::{DefaultHasher, Hash, Hasher}; +use std::collections::HashSet; +use std::collections::VecDeque; +use std::hash::DefaultHasher; +use std::hash::Hash; +use std::hash::Hasher; use criterion::black_box; use dashmap::DashSet; @@ -69,7 +72,8 @@ static COMPUTE: forte::ThreadPool = forte::ThreadPool::new(); #[divan::bench(args = sizes(), threads = false)] fn forte(bencher: Bencher, size: usize) { - use forte::{Scope, Worker}; + use forte::Scope; + use forte::Worker; fn visit<'scope, 'env>( size: usize, @@ -122,7 +126,7 @@ fn forte(bencher: Bencher, size: usize) { } } - COMPUTE.with_worker(|worker| { + COMPUTE.expect_worker(|worker| { bencher.bench_local(|| { let visited = DashSet::new(); @@ -135,7 +139,8 @@ fn forte(bencher: Bencher, size: usize) { #[divan::bench(args = sizes(), threads = false)] fn rayon(bencher: Bencher, size: usize) { - use rayon::{Scope, scope}; + use rayon::Scope; + use rayon::scope; fn visit<'scope>( size: usize, diff --git a/benches/fork_join.rs b/benches/fork_join.rs index 6772b8b..2e55b24 100644 --- a/benches/fork_join.rs +++ b/benches/fork_join.rs @@ -86,7 +86,7 @@ fn forte(bencher: Bencher, nodes: (usize, usize)) { let tree = Node::tree(nodes.0); - COMPUTE.with_worker(|worker| { + COMPUTE.expect_worker(|worker| { info!("Staring 
Benchmark"); bencher.bench_local(move || { assert_eq!(sum(&tree, worker), nodes.1 as u64); @@ -94,6 +94,26 @@ fn forte(bencher: Bencher, nodes: (usize, usize)) { }); } +#[divan::bench(args = nodes())] +fn throughput_forte(bencher: Bencher, nodes: (usize, usize)) { + fn sum(node: &Node, worker: &Worker) -> u64 { + let (left, right) = worker.join( + |w| node.left.as_deref().map(|n| sum(n, w)).unwrap_or_default(), + |w| node.right.as_deref().map(|n| sum(n, w)).unwrap_or_default(), + ); + + node.val + left + right + } + + info!("Staring Benchmark"); + bencher.bench(|| { + COMPUTE.expect_worker(|worker| { + let tree = Node::tree(nodes.0); + assert_eq!(sum(&tree, worker), nodes.1 as u64); + }); + }); +} + #[divan::bench(args = nodes())] fn chili(bencher: Bencher, nodes: (usize, usize)) { fn sum(node: &Node, scope: &mut Scope<'_>) -> u64 { @@ -113,6 +133,24 @@ fn chili(bencher: Bencher, nodes: (usize, usize)) { }); } +#[divan::bench(args = nodes())] +fn thrughput_chili(bencher: Bencher, nodes: (usize, usize)) { + fn sum(node: &Node, scope: &mut Scope<'_>) -> u64 { + let (left, right) = scope.join( + |s| node.left.as_deref().map(|n| sum(n, s)).unwrap_or_default(), + |s| node.right.as_deref().map(|n| sum(n, s)).unwrap_or_default(), + ); + + node.val + left + right + } + + bencher.bench(move || { + let tree = Node::tree(nodes.0); + let mut scope = Scope::global(); + assert_eq!(sum(&tree, &mut scope), nodes.1 as u64); + }); +} + #[divan::bench(args = nodes())] fn rayon(bencher: Bencher, nodes: (usize, usize)) { fn sum(node: &Node) -> u64 { @@ -131,6 +169,23 @@ fn rayon(bencher: Bencher, nodes: (usize, usize)) { }); } +#[divan::bench(args = nodes())] +fn throughput_rayon(bencher: Bencher, nodes: (usize, usize)) { + fn sum(node: &Node) -> u64 { + let (left, right) = rayon::join( + || node.left.as_deref().map(sum).unwrap_or_default(), + || node.right.as_deref().map(sum).unwrap_or_default(), + ); + + node.val + left + right + } + + bencher.bench(move || { + let tree = 
Node::tree(nodes.0); + assert_eq!(sum(&tree), nodes.1 as u64); + }); +} + fn main() { let fmt_layer = fmt::layer() .without_time() diff --git a/src/compile_fail.rs b/src/compile_fail.rs index 832b171..e60ed01 100644 --- a/src/compile_fail.rs +++ b/src/compile_fail.rs @@ -3,188 +3,175 @@ // ----------------------------------------------------------------------------- // Ensures non-send data cannot be moved into a join. -/** ```compile_fail,E0277 - -use std::rc::Rc; -use forte::ThreadPool; - -static THREAD_POOL: ThreadPool = ThreadPool::new(); - -let r = Rc::new(22); -THREAD_POOL.join(|_| r.clone(), |_| r.clone()); -//~^ ERROR - -``` */ +/// ```compile_fail,E0277 +/// use std::rc::Rc; +/// use forte::ThreadPool; +/// +/// static THREAD_POOL: ThreadPool = ThreadPool::new(); +/// +/// let r = Rc::new(22); +/// THREAD_POOL.join(|_| r.clone(), |_| r.clone()); +/// //~^ ERROR +/// ``` mod nonsend_input {} // ----------------------------------------------------------------------------- // Ensures non-send data cannot be returned by join. 
-/** ```compile_fail,E0277 - -use std::rc::Rc; -use forte::ThreadPool; - -static THREAD_POOL: ThreadPool = ThreadPool::new(); - -THREAD_POOL.join(|_| Rc::new(22), |_| ()); //~ ERROR - -THREAD_POOL.depopulate(); - -``` */ +/// ```compile_fail,E0277 +/// use std::rc::Rc; +/// use forte::ThreadPool; +/// +/// static THREAD_POOL: ThreadPool = ThreadPool::new(); +/// +/// THREAD_POOL.join(|_| Rc::new(22), |_| ()); //~ ERROR +/// +/// THREAD_POOL.depopulate(); +/// ``` mod nonsend_left_join {} -/** ```compile_fail,E0277 - -use std::rc::Rc; -use forte::ThreadPool; - -static THREAD_POOL: ThreadPool = ThreadPool::new(); - -THREAD_POOL.join(|_| (), |_| Rc::new(23)); //~ ERROR - -THREAD_POOL.depopulate(); - -``` */ +/// ```compile_fail,E0277 +/// use std::rc::Rc; +/// use forte::ThreadPool; +/// +/// static THREAD_POOL: ThreadPool = ThreadPool::new(); +/// +/// THREAD_POOL.join(|_| (), |_| Rc::new(23)); //~ ERROR +/// +/// THREAD_POOL.depopulate(); +/// ``` mod nonsend_right_join {} // ----------------------------------------------------------------------------- // Ensures scopes can not borrow data spawned within the closure. 
-/** ```compile_fail,E0373 - -use forte::ThreadPool; -use forte::Worker; - -static THREAD_POOL: ThreadPool = ThreadPool::new(); - -fn bad_scope(f: F) - where F: FnOnce(&i32) + Send, -{ - THREAD_POOL.scope(|scope| { - let x = 22; - scope.spawn(|_: &Worker| f(&x)); //~ ERROR `x` does not live long enough - }); -} - -fn good_scope(f: F) - where F: FnOnce(&i32) + Send, -{ - let x = 22; - THREAD_POOL.scope(|scope| { - scope.spawn(|_: &Worker| f(&x)); - }); -} - -fn main() { } - -``` */ +/// ```compile_fail,E0373 +/// use forte::ThreadPool; +/// use forte::Worker; +/// +/// static THREAD_POOL: ThreadPool = ThreadPool::new(); +/// +/// fn bad_scope(f: F) +/// where +/// F: FnOnce(&i32) + Send, +/// { +/// THREAD_POOL.scope(|scope| { +/// let x = 22; +/// scope.spawn(|_: &Worker| f(&x)); //~ ERROR `x` does not live long enough +/// }); +/// } +/// +/// fn good_scope(f: F) +/// where +/// F: FnOnce(&i32) + Send, +/// { +/// let x = 22; +/// THREAD_POOL.scope(|scope| { +/// scope.spawn(|_: &Worker| f(&x)); +/// }); +/// } +/// +/// fn main() {} +/// ``` mod scope_join_bad {} // ----------------------------------------------------------------------------- // Ensures the two branches of a join mutably borrow the same data. 
-/** ```compile_fail,E0524 - -use forte::ThreadPool; -use forte::Worker; - -static THREAD_POOL: ThreadPool = ThreadPool::new(); - -fn quick_sort(v: &mut [T]) { - if v.len() <= 1 { - return; - } - - let mid = partition(v); - let (lo, _hi) = v.split_at_mut(mid); - THREAD_POOL.join(|_| quick_sort(lo), |_| quick_sort(lo)); //~ ERROR -} - -fn partition(v: &mut [T]) -> usize { - let pivot = v.len() - 1; - let mut i = 0; - for j in 0..pivot { - if v[j] <= v[pivot] { - v.swap(i, j); - i += 1; - } - } - v.swap(i, pivot); - i -} - -fn main() { } - -``` */ +/// ```compile_fail,E0524 +/// use forte::ThreadPool; +/// use forte::Worker; +/// +/// static THREAD_POOL: ThreadPool = ThreadPool::new(); +/// +/// fn quick_sort(v: &mut [T]) { +/// if v.len() <= 1 { +/// return; +/// } +/// +/// let mid = partition(v); +/// let (lo, _hi) = v.split_at_mut(mid); +/// THREAD_POOL.join(|_| quick_sort(lo), |_| quick_sort(lo)); //~ ERROR +/// } +/// +/// fn partition(v: &mut [T]) -> usize { +/// let pivot = v.len() - 1; +/// let mut i = 0; +/// for j in 0..pivot { +/// if v[j] <= v[pivot] { +/// v.swap(i, j); +/// i += 1; +/// } +/// } +/// v.swap(i, pivot); +/// i +/// } +/// fn main() {} +/// ``` mod quicksort_race_1 {} -/** ```compile_fail,E0500 - -use forte::ThreadPool; -use forte::Worker; - -static THREAD_POOL: ThreadPool = ThreadPool::new(); - -fn quick_sort(v: &mut [T]) { - if v.len() <= 1 { - return; - } - - let mid = partition(v); - let (lo, _hi) = v.split_at_mut(mid); - THREAD_POOL.join(|_| quick_sort(lo), |_| quick_sort(v)); //~ ERROR -} - -fn partition(v: &mut [T]) -> usize { - let pivot = v.len() - 1; - let mut i = 0; - for j in 0..pivot { - if v[j] <= v[pivot] { - v.swap(i, j); - i += 1; - } - } - v.swap(i, pivot); - i -} - -fn main() { } - -``` */ +/// ```compile_fail,E0500 +/// use forte::ThreadPool; +/// use forte::Worker; +/// +/// static THREAD_POOL: ThreadPool = ThreadPool::new(); +/// +/// fn quick_sort(v: &mut [T]) { +/// if v.len() <= 1 { +/// return; +/// } +/// +/// 
let mid = partition(v); +/// let (lo, _hi) = v.split_at_mut(mid); +/// THREAD_POOL.join(|_| quick_sort(lo), |_| quick_sort(v)); //~ ERROR +/// } +/// +/// fn partition(v: &mut [T]) -> usize { +/// let pivot = v.len() - 1; +/// let mut i = 0; +/// for j in 0..pivot { +/// if v[j] <= v[pivot] { +/// v.swap(i, j); +/// i += 1; +/// } +/// } +/// v.swap(i, pivot); +/// i +/// } +/// +/// fn main() { } +/// ``` mod quicksort_race_2 {} -/** ```compile_fail,E0524 - -use forte::ThreadPool; -use forte::Worker; - -static THREAD_POOL: ThreadPool = ThreadPool::new(); - -fn quick_sort(v: &mut [T]) { - if v.len() <= 1 { - return; - } - - let mid = partition(v); - let (_lo, hi) = v.split_at_mut(mid); - THREAD_POOL.join(|_| quick_sort(hi), |_| quick_sort(hi)); //~ ERROR -} - -fn partition(v: &mut [T]) -> usize { - let pivot = v.len() - 1; - let mut i = 0; - for j in 0..pivot { - if v[j] <= v[pivot] { - v.swap(i, j); - i += 1; - } - } - v.swap(i, pivot); - i -} - -fn main() { } - -``` */ +/// ```compile_fail,E0524 +/// use forte::ThreadPool; +/// use forte::Worker; +/// +/// static THREAD_POOL: ThreadPool = ThreadPool::new(); +/// +/// fn quick_sort(v: &mut [T]) { +/// if v.len() <= 1 { +/// return; +/// } +/// +/// let mid = partition(v); +/// let (_lo, hi) = v.split_at_mut(mid); +/// THREAD_POOL.join(|_| quick_sort(hi), |_| quick_sort(hi)); //~ ERROR +/// } +/// +/// fn partition(v: &mut [T]) -> usize { +/// let pivot = v.len() - 1; +/// let mut i = 0; +/// for j in 0..pivot { +/// if v[j] <= v[pivot] { +/// v.swap(i, j); +/// i += 1; +/// } +/// } +/// v.swap(i, pivot); +/// i +/// } +/// +/// fn main() {} +/// ``` mod quicksort_race_3 {} diff --git a/src/job.rs b/src/job.rs index 75e6347..38d0c30 100644 --- a/src/job.rs +++ b/src/job.rs @@ -12,14 +12,18 @@ //! (c) Each job reference is executed exactly once. 
use alloc::boxed::Box; -use arraydeque::ArrayDeque; +use alloc::collections::VecDeque; +use alloc::vec::Vec; use core::cell::UnsafeCell; -use core::mem::{ManuallyDrop, MaybeUninit}; +use core::mem::ManuallyDrop; +use core::mem::MaybeUninit; use core::ptr::NonNull; -use core::sync::atomic::{Ordering, fence}; +use core::sync::atomic::Ordering; +use core::sync::atomic::fence; use std::thread::Result as ThreadResult; use crate::latch::Latch; +use crate::platform::AtomicU32; use crate::thread_pool::Worker; use crate::unwind; @@ -34,7 +38,7 @@ trait Job { /// /// # Safety /// - /// Implements must specify the invariant of the pointer `this` that the + /// Implementors must specify the invariant of the pointer `this` that the /// caller is expected to uphold. /// /// This may be called from a different thread than the one which scheduled @@ -59,7 +63,7 @@ pub struct JobRef { /// of `StackJob` or `HeapJob`. But it can contain other things as well. job_pointer: NonNull<()>, /// A function pointer that can execute the job stored at `job_pointer`. - /// This is usually point to an implementation of `Job::execute` (either + /// This usually points to an implementation of `Job::execute` (either /// `HeapJob::execute` or `StackJob::execute`). But it can contain other /// things as well. execute_fn: unsafe fn(NonNull<()>, &Worker), @@ -70,9 +74,16 @@ impl JobRef { /// /// # Safety /// - /// The caller must ensure that `job_pointer` remains valid to pass to - /// `execute_fn` until the job is executed. What exactly this means is - /// dependent on the implementation of the execute function. + /// The caller must ensure that: + /// + /// * `job_pointer` and `execute_fn` are *matched*; the `execute_fn` must be + /// a function that can safely receive `job_pointer` as its first argument. + /// + /// * `job_pointer` points to an initialized and properly aligned value which + /// is neither moved nor dropped until `execute_fn` is called. 
+ /// * `job_pointer` is "valid" now and until `execute_fn` is called, + /// according to the contract of the specific `execute_fn` being stored. #[inline(always)] pub unsafe fn new_raw( job_pointer: NonNull<()>, @@ -94,56 +105,94 @@ impl JobRef { /// Executes the `JobRef` by passing the execute function on the job pointer. #[inline(always)] pub fn execute(self, worker: &Worker) { - // SAFETY: The constructor of `JobRef` is required to ensure this is valid. + // SAFETY: Calling this function on this pointer is valid due to the + // contract of `JobRef::new_raw`: + // + // * `self.execute_fn` and `self.job_pointer` are "matched": every + // `JobRef` is constructed via `new_raw`, which requires the caller + // to supply a compatible pair. + // + // * `self.job_pointer` is valid at this point: `new_raw` requires the + // pointer to remain valid until `execute_fn` is called, and we are + // calling it now. + // + // * This is called at most once: `execute` consumes `self`, so the + // pointer cannot be used again via this `JobRef`. unsafe { (self.execute_fn)(self.job_pointer, worker) } } } -// SAFETY: !Send for raw pointers is not for safety, just as a lint. +// SAFETY: `JobRef` is a type-erased data pointer + function pointer tuple. The +// data pointer always points to a `Send` value due to the safety requirements +// of `JobRef::new_raw`. Function pointers are always `Send`. Therefore it is +// sound to move a `JobRef` across thread boundaries. unsafe impl Send for JobRef {} // ----------------------------------------------------------------------------- // Job queue +/// A queue of jobs. This is a simple wrapper around a `VecDeque` that uses +/// interior mutability, and has some more intuitively named methods to enforce +/// conventions. pub struct JobQueue { - job_refs: UnsafeCell>, + job_refs: UnsafeCell>, } impl JobQueue { + /// Creates a new job queue. 
pub fn new() -> JobQueue { JobQueue { - job_refs: UnsafeCell::new(ArrayDeque::new()), + job_refs: UnsafeCell::new(VecDeque::new()), } } - #[inline(always)] - pub fn push(&self, job_ref: JobRef) -> Option { - // SAFETY: The queue itself is only access mutably within `push_back`, - // `pop_back` and `pop_front`. Since these functions never call each - // other, we must have exclusive access to the queue. + /// Insert a job at the back of the queue (the side with the newest jobs). + pub fn push_new(&self, job_ref: JobRef) { + // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one + // thread. We ensure no other references to the inner value exist by not + // returning any references from this API, making this exclusive access + // safe. let job_refs = unsafe { &mut *self.job_refs.get() }; - if let Err(full) = job_refs.push_back(job_ref) { - Some(full.element) - } else { - None - } + job_refs.push_back(job_ref); } - #[inline(always)] + /// Insert a job at the front of the queue (the side with the oldest jobs). + pub fn push_old(&self, job_ref: JobRef) { + // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one + // thread. We ensure no other references to the inner value exist by not + // returning any references from this API, making this exclusive access + // safe. + let job_refs = unsafe { &mut *self.job_refs.get() }; + job_refs.push_front(job_ref); + } + + /// Removes the newest job in the queue. pub fn pop_newest(&self) -> Option { - // SAFETY: The queue itself is only access mutably within `push_back`, - // `pop_back` and `pop_front`. Since these functions never call each - // other, we must have exclusive access to the queue. + // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one + // thread. We ensure no other references to the inner value exist by not + // returning any references from this API, making this exclusive access + // safe. 
let job_refs = unsafe { &mut *self.job_refs.get() }; job_refs.pop_back() } - // Attempt to remove the given job-ref from the back of the queue. + /// Removes the oldest job in the queue. + pub fn pop_oldest(&self) -> Option { + // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one + // thread. We ensure no other references to the inner value exist by not + // returning any references from this API, making this exclusive access + // safe. + let job_refs = unsafe { &mut *self.job_refs.get() }; + job_refs.pop_front() + } + + /// Attempt to remove the given job-ref from the back of the queue. #[inline(always)] - pub fn recover_just_pushed(&self, id: (usize, usize)) -> bool { - // SAFETY: The queue itself is only access mutably within `push_back`, - // `pop_back` and `pop_front`. Since these functions never call each - // other, we must have exclusive access to the queue. + pub fn recover_newest(&self, id: (usize, usize)) -> bool { + // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one + // thread. We ensure no other references to the inner value exist by not + // returning any references from this API, making this exclusive access + // safe. let job_refs = unsafe { &mut *self.job_refs.get() }; if job_refs.back().map(JobRef::id) == Some(id) { let _ = job_refs.pop_back(); @@ -153,13 +202,38 @@ impl JobQueue { } } - #[cold] - pub fn pop_oldest(&self) -> Option { - // SAFETY: The queue itself is only access mutably within `push_back`, - // `pop_back` and `pop_front`. Since these functions never call each - // other, we must have exclusive access to the queue. + /// The size of a chunk of jobs. + const CHUNK_SIZE: usize = 16; + + /// Splits off a series of chunks from the end of the queue (the side with + /// the newest jobs). Each chunk is of size `CHUNK_SIZE`. Afterwards, fewer + /// than `CHUNK_SIZE` jobs will be left in the queue.
+ pub fn split(&self) -> Vec> { + // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one + // thread. We ensure no other references to the inner value exist by not + // returning any references from this API, making this exclusive access + // safe. let job_refs = unsafe { &mut *self.job_refs.get() }; - job_refs.pop_front() + let mut len = job_refs.len(); + let num_chunks = len / Self::CHUNK_SIZE; + (0..num_chunks) + .map(|_| { + let chunk = job_refs.split_off(len - Self::CHUNK_SIZE); + len -= Self::CHUNK_SIZE; + chunk + }) + .collect() + } + + /// Appends a chunk of jobs (expected to be provided by `split`) to the + /// queue. Jobs are added to the end (the side with the newest jobs). + pub fn append(&self, mut split_refs: VecDeque) { + // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one + // thread. We ensure no other references to the inner value exist by not + // returning any references from this API, making this exclusive access + // safe. + let job_refs = unsafe { &mut *self.job_refs.get() }; + job_refs.append(&mut split_refs); } } @@ -195,23 +269,35 @@ where /// /// # Safety /// - /// The caller must ensure that the `StackJob` that the returned `JobRef` refers - /// to will live as long as the `JobRef`. The caller must also ensure that - /// the `JobRef` does not outlive the data the `StackJob` closes over; which - /// is to say, if the closure references something, that thing must exist at - /// least until the `JobRef` is executed or dropped. Additionally, the - /// caller must ensure that they never create two different `JobRef`s that - /// point to the same `StackJob`. + /// The caller must ensure that: + /// + /// * The `StackJob` will outlive the `JobRef`. + /// + /// * The `StackJob` will not move for the lifetime of the `JobRef`. + /// + /// * The `StackJob` does not outlive any data it closes over. + /// + /// * This function is not called again so long as the `JobRef` lives.
#[inline(always)] pub unsafe fn as_job_ref(&self) -> JobRef { let job_pointer = NonNull::from(self).cast(); - // SAFETY: The caller ensures the `StackJob` will outlive the `JobRef`, - // so it will remain valid to convert this pointer into a reference, and - // hence it is possible to pass this pointer to `Self::execute`. + // SAFETY: `JobRef::new_raw` requires: + // + // * `job_pointer` and `Self::execute` are matched. + // + // Here, `execute` expects a pointer to `Self`, which is what + // `job_pointer` is. + // + // * The pointee is live, not moved, and not dropped until `execute_fn` + // is called. // - // `Self::execute` cannot be called multiple times because - // `JobRef::execute` takes ownership of the `JobRef`, and we only create - // a single `JobRef` for each stack job. + // Here, the caller guarantees the `StackJob` outlives and does not + // move for the lifetime of the `JobRef`. + // + // * `execute_fn` is called at most once. + // + // Here, `JobRef::execute` consumes the `JobRef`, and only one + // `JobRef` is created per `StackJob`, so it is called at most once. unsafe { JobRef::new_raw(job_pointer, Self::execute) } } @@ -228,29 +314,49 @@ where /// /// # Safety /// - /// This may only be called before the job is executed. + /// The caller must ensure that either this function or `execute` is called + /// for a given `StackJob` (not both), and that this function is not + /// called multiple times. #[inline(always)] pub unsafe fn unwrap(&mut self) -> F { - // SAFETY: This will not be used again. Given that `execute` has not - // already been, it will never be used twice. - unsafe { ManuallyDrop::take(self.f.get_mut()) } + let f_mut = self.f.get_mut(); + // SAFETY: `ManuallyDrop` requires us to ensure that it is not used + // again after we `take()` its contents. + // + // `take()` is called in two places: once here, and once in `execute`.
+ // Since this function is mutually exclusive with `execute`, and is + // called at most once, the `ManuallyDrop` is not used again. + unsafe { ManuallyDrop::take(f_mut) } } /// Unwraps the job into it's return value. /// /// # Safety /// - /// This may only be called after the job has finished executing, and it's - /// latch has been set. + /// The caller must ensure that: + /// + /// * This is called only after the job's latch is set. + /// + /// * This is called at most once for a given `StackJob`. #[inline(always)] pub unsafe fn return_value(&mut self) -> ThreadResult { - // Synchronize with the fence in `StackJob::execute`. + // Synchronize with the fence in `StackJob::execute`, establishing a + // happens-after relationship with the following read. fence(Ordering::Acquire); // Get a ref to the result. let result_ref = self.return_value.get_mut(); - // SAFETY: The job has completed, which means the return value must have - // been initialized. This consumes the job, so there's no chance of this - // accidentally duplicating data. + // SAFETY: `assume_init_read` requires: + // + // * The `MaybeUninit` is fully initialized. + // + // As this function can only be called if the latch has been set, and + // the latch is only set at the end of `StackJob::execute` (after + // `return_value` is written and memory is synchronized via the above + // fence) the memory must be initialized. + // + // * That data not be incorrectly duplicated by repeated calls. + // + // Data is not duplicated because this function is called at most once. unsafe { result_ref.assume_init_read() } } } @@ -264,36 +370,58 @@ where /// /// # Safety /// - /// The caller must ensure that `this` is valid to access a `StackJob` - /// immutably at least until the `Latch` within the `StackJob` has been set. - /// As a consequence, this may not be run after a latch has been set. Since - /// this function sets the latch, the caller must ensure to only call this - /// function once.
+ /// The caller must ensure that: + /// + /// * `this` is a non-null, properly aligned pointer to a live instance of + /// `StackJob`. + /// + /// * The `StackJob` will not move or be deallocated until the latch it + /// contains is set. + /// + /// * Either this function or `unwrap` is called at most once for a given + /// `StackJob`. #[inline(always)] unsafe fn execute(this: NonNull<()>, worker: &Worker) { // SAFETY: The caller ensures `this` can be converted into an immutable // reference until we set the latch, and the latch has not yet been set. let this = unsafe { this.cast::().as_ref() }; // Create an abort guard. If the closure panics, this will convert the - // panic into an abort. Doing so prevents use-after-free for other elements of the stack. + // panic into an abort. Doing so prevents use-after-free for other + // elements of the stack. let abort_guard = unwind::AbortOnDrop; - // SAFETY: This memory location is accessed only in this function and in - // `unwrap`. The latter cannot have been called because it consumes the - // stack job. And since this function is called only once, we can - // guarantee that we have exclusive access. + // SAFETY: `f` is an `UnsafeCell>`. Creating a + // `&mut ManuallyDrop` is only sound so long as no other live + // references exist. + // + // `f` is accessed mutably in two places: once here, and once in + // `unwrap`. Since this function is mutually exclusive with `unwrap`, + // and is called at most once, exclusive access is guaranteed. let f_ref = unsafe { &mut *this.f.get() }; - // SAFETY: The caller ensures this function is called only once. + // SAFETY: `ManuallyDrop` requires us to ensure that it is not used + // again after we `take()` its contents. + // + // `take()` is called in two places: once here, and once in `unwrap`. + // Since this function is mutually exclusive with `unwrap`, and is + // called at most once, the `ManuallyDrop` is not used again.
let f = unsafe { ManuallyDrop::take(f_ref) }; - // Run the job. If the job panics, we propagate the panic back to the main thread. + // Run the job. If the job panics, we propagate the panic back to the + // main thread. let result = unwind::halt_unwinding(|| f(worker)); // Get the uninitialized memory where we should put the return value. let return_value = this.return_value.get(); - // SAFETY: The return value is only accessed here and in - // `StackJob::return_value`. Since the other method consumes the stack - // job, it's not possible for it to run concurrently. Therefore, we must - // have exclusive access to the return value. + // SAFETY: Writing to this unsafe cell requires that no other thread + // holds a reference to its contents. + // + // The `return_value` is only written here and only read within + // `StackJob::return_value`, and then only after the latch has been set. + // The latch has not been set, and this function is called at most once, + // so no concurrent access can occur. unsafe { (*return_value).write(result) }; - // Latches do not participate in memory ordering, so we need to do this manually. + // This synchronizes with the `Acquire` fence within `return_value()`, + // establishing a happens-before relationship that makes the preceding + // `return_value` write visible to the reader. + // + // This is required because latches do not synchronize memory. fence(Ordering::Release); // SAFETY: The caller ensures the job is valid until the latch is set. // Since the latch is a field of the job, the latch must be valid until @@ -304,6 +432,229 @@ where } } +// ----------------------------------------------------------------------------- +// Stack allocated work function on a non-worker thread + +/// Like [`StackJob`] but allocated on the stack of a non-worker thread. While +/// this job is pending, the owning thread is fully blocked.
+#[cfg(not(feature = "shuttle"))] +pub struct ExternalJob { + f: UnsafeCell>, + completed: AtomicU32, + return_value: UnsafeCell>>, +} + +#[cfg(not(feature = "shuttle"))] +impl ExternalJob +where + F: FnOnce(&Worker) -> T + Send, + T: Send, +{ + /// Creates a new `ExternalJob`. + #[inline(always)] + pub fn new(f: F) -> ExternalJob { + ExternalJob { + f: UnsafeCell::new(ManuallyDrop::new(f)), + completed: AtomicU32::new(0), + return_value: UnsafeCell::new(MaybeUninit::uninit()), + } + } + + /// Creates a `JobRef` pointing to this job. The underlying `ExternalJob` is + /// not dropped after the `JobRef` is executed. + /// + /// # Safety + /// + /// The caller must ensure that: + /// + /// * The `ExternalJob` will not move or be deallocated until the `JobRef` + /// is executed. + /// + /// * The `JobRef` does not outlive any data the `ExternalJob` closes over. + /// + /// * This function is not called again so long as the `JobRef` lives. + #[inline(always)] + pub unsafe fn as_job_ref(&self) -> JobRef { + let job_pointer = NonNull::from(self).cast(); + // SAFETY: The `job_pointer` is trivially aligned and non-null, + // because it is derived from a reference. + // + // The caller must not allow the `ExternalJob` to move or be deallocated + // until the `JobRef` is executed. This guarantees that `job_pointer` + // remains valid for the lifetime of `JobRef`, satisfying the + // requirements of `JobRef::new_raw`. + // + // The caller guarantees that this function is not called again while + // `JobRef` lives, so `Self::execute` can be called at most once for + // this particular `ExternalJob`. This satisfies the at-most-once + // execution invariant documented on `Job::execute`. + unsafe { JobRef::new_raw(job_pointer, Self::execute) } + } + + /// Waits for the `ExternalJob` to be executed and returns the result. + /// + /// # Safety + /// + /// This must be called at most once. 
+ #[inline(always)] + pub unsafe fn wait_for_value(&mut self) -> ThreadResult { + // Wait for the complete flag to be set. + loop { + atomic_wait::wait(&self.completed, 0); + if self.completed.load(Ordering::Relaxed) == 1 { + break; + } + } + // Synchronize memory; we do this with a fence, so that we only do a + // relaxed load in the case of a spurious wakeup. + fence(Ordering::Acquire); + // Get a ref to the result. + let result_ref = self.return_value.get_mut(); + // SAFETY: `assume_init_read` requires: + // + // * The `MaybeUninit` is fully initialized. + // + // As this can only be called if we have observed that `completed` has + // been set to 1, and that only happens at the end of + // `ExternalJob::execute` (after `return_value` is written and memory + // is synchronized via the above fence) the memory must be initialized. + // + // * That data not be incorrectly duplicated by repeated calls. + // + // Data is not duplicated because this function is called at most + // once. + unsafe { result_ref.assume_init_read() } + } +} + +#[cfg(not(feature = "shuttle"))] +impl Job for ExternalJob +where + F: FnOnce(&Worker) -> T + Send, + T: Send, +{ + /// Executes an `ExternalJob` from a const pointer. + /// + /// # Safety + /// + /// The caller must ensure that: + /// + /// * `this` is a non-null, properly aligned pointer to a live instance + /// of `ExternalJob`. + /// + /// * The `ExternalJob` will not move or be deallocated for as long as + /// `completed` remains set to 0. + /// + /// * This function is called at most once for a given `ExternalJob`. + #[inline(always)] + unsafe fn execute(this: NonNull<()>, worker: &Worker) { + // SAFETY: The caller ensures `this` can be converted into an immutable + // reference until we set the `complete` atomic. + let this = unsafe { this.cast::().as_ref() }; + // Create an abort guard. If the closure panics, this will convert the + // panic into an abort. Doing so prevents use-after-free for other + // elements of the stack. 
+ let abort_guard = unwind::AbortOnDrop; + // SAFETY: `f` is an `UnsafeCell>`. Creating a + // `&mut ManuallyDrop` is only sound so long as no other live + // references exist. + // + // Since this field is never accessed mutably except for here and this + // function is called at most once, exclusive access is guaranteed. + let f_ref = unsafe { &mut *this.f.get() }; + // SAFETY: `ManuallyDrop` requires us to ensure that it is not used + // again after we `take()` its contents. + // + // Since it is not used in the remainder of this function, and this + // function is called at most once, it is indeed not used again. + let f = unsafe { ManuallyDrop::take(f_ref) }; + // Run the job. If the job panics, we propagate the panic back to the + // main thread. + let result = unwind::halt_unwinding(|| f(worker)); + // Get the uninitialized memory where we should put the return value. + let return_value = this.return_value.get(); + // SAFETY: Writing to this unsafe cell requires that no other thread + // holds a reference to its contents. + // + // The `return_value` is only read within `ExternalJob::wait_for_value`, + // and then only after `completed` is set to 1. Since this function is + // called at most once, `completed` must still be set to 0. Therefore no + // concurrent access can occur. + unsafe { (*return_value).write(result) }; + // Set `completed` to 1, allowing reads of the return value. This + // `Release` store synchronizes with the `Acquire` fence in + // `ExternalJob::wait_for_value`, establishing a happens-before + // relationship that makes the preceding `return_value` write visible + // to the waiting reader. + this.completed.store(1, Ordering::Release); + // Notify the waiting thread that the job is complete. + atomic_wait::wake_one(&this.completed); + // Forget the abort guard, re-enabling panics.
+ core::mem::forget(abort_guard); + } +} + +#[cfg(feature = "shuttle")] +pub struct ExternalJob { + f: UnsafeCell>, + mutex: shuttle::sync::Mutex>>, + condvar: shuttle::sync::Condvar, +} + +#[cfg(feature = "shuttle")] +impl ExternalJob +where + F: FnOnce(&Worker) -> T + Send, + T: Send, +{ + /// Creates a new `ExternalJob`. + #[inline(always)] + pub fn new(f: F) -> ExternalJob { + ExternalJob { + f: UnsafeCell::new(ManuallyDrop::new(f)), + mutex: shuttle::sync::Mutex::new(None), + condvar: shuttle::sync::Condvar::new(), + } + } + + #[inline(always)] + #[allow(clippy::undocumented_unsafe_blocks)] + pub unsafe fn as_job_ref(&self) -> JobRef { + let job_pointer = NonNull::from(self).cast(); + unsafe { JobRef::new_raw(job_pointer, Self::execute) } + } + + #[inline(always)] + pub unsafe fn wait_for_value(&mut self) -> ThreadResult { + let mut value = self.mutex.lock().unwrap(); + while value.is_none() { + value = self.condvar.wait(value).unwrap(); + } + Option::take(&mut value).unwrap() + } +} + +#[cfg(feature = "shuttle")] +impl Job for ExternalJob +where + F: FnOnce(&Worker) -> T + Send, + T: Send, +{ + #[inline(always)] + #[allow(clippy::undocumented_unsafe_blocks)] + unsafe fn execute(this: NonNull<()>, worker: &Worker) { + let this = unsafe { this.cast::().as_ref() }; + let abort_guard = unwind::AbortOnDrop; + let f_ref = unsafe { &mut *this.f.get() }; + let f = unsafe { ManuallyDrop::take(f_ref) }; + let result = unwind::halt_unwinding(|| f(worker)); + let mut value = this.mutex.lock().unwrap(); + *value = Some(result); + this.condvar.notify_one(); + core::mem::forget(abort_guard); + } +} + // ----------------------------------------------------------------------------- // Heap allocated work function @@ -362,7 +713,7 @@ where /// # Safety /// /// The caller must ensure that `this` is a pointer, created by calling - /// `Box::into_raw` on a `Box`. After the call `this` must be + /// `Box::into_raw` on a `Box>`. 
After the call `this` must be /// treated as dangling. #[inline(always)] unsafe fn execute(this: NonNull<()>, worker: &Worker) { diff --git a/src/latch.rs b/src/latch.rs index 8c7564c..9a1b3a1 100644 --- a/src/latch.rs +++ b/src/latch.rs @@ -1,15 +1,13 @@ //! A core concept in Rayon is the *latch*. Forte has borrowed this, in a //! somewhat simplified form. //! -//! Every forte worker thread is has a single "sleep controller" that it uses to +//! Every forte worker thread has a single "sleep controller" that it uses to //! park and unpark itself. Latches build on this to create a simple boolean //! switch, which allows the owning thread to sleep until the latch becomes set //! by another thread. -use core::{ - pin::Pin, - task::{RawWaker, RawWakerVTable, Waker}, -}; +use alloc::task::Wake; +use core::borrow::Borrow; use crate::platform::*; @@ -40,20 +38,36 @@ const ASLEEP: u32 = 0b10; /// The general idea and spirit for latches (as well as some of the /// documentation) is due to rayon. However the implementation is specific to /// forte. +/// +/// ## Memory Ordering +/// +/// Latches _do not synchronize memory_. They are only used for signaling. If +/// the thread that sets a latch wishes to transmit a value to the thread +/// waiting for that latch, explicit fences must be used. pub struct Latch { /// Holds the internal state of the latch. This tracks if the latch has been /// set or not. state: AtomicU32, + /// Tracks the number of sleeping threads in the pool. + sleeping: &'static AtomicU32, /// The sleep controller for the owning thread. sleep_controller: &'static SleepController, + /// The seat number that owns this latch + seat_number: usize, } impl Latch { /// Creates a new latch, owned by a specific thread. 
- pub fn new(sleep_controller: &'static SleepController) -> Latch { + pub fn new( + seat_number: usize, + sleeping: &'static AtomicU32, + sleep_controller: &'static SleepController, + ) -> Latch { Latch { state: AtomicU32::new(LOCKED), + sleeping, sleep_controller, + seat_number, } } @@ -66,13 +80,22 @@ impl Latch { /// Waits for the latch to be set. In actuality, this may be woken. /// /// Returns true if the latch signal was received, and false otherwise. + /// + /// # Memory Ordering + /// + /// This does not synchronize memory. To synchronize memory with the thread + /// setting the latch, call `fence(Ordering::Acquire)` after this function. + /// The other thread must issue a corresponding `fence(Ordering::Release)` + /// call. #[cold] pub fn wait(&self) { // First, check if the latch has been set. // // In the event of a race with `set`: - // + If this happens before the store, then we will go to sleep. - // + If this happens after the store, then we notice and return. + // + // * If this happens before the store, then we will go to sleep. + // + // * If this happens after the store, then we notice and return. if self.state.load(Ordering::Relaxed) == SIGNAL { return; } @@ -80,7 +103,7 @@ impl Latch { // // In the event of a race with `set`, the `wake` will always cause this // to return regardless of memory ordering. - self.sleep_controller.sleep(); + self.sleep_controller.sleep(self.seat_number, self.sleeping); } /// Activates the latch, potentially unblocking the owning thread. @@ -88,35 +111,46 @@ impl Latch { /// This takes a raw pointer because the latch may be de-allocated by a /// different thread while this function is executing. /// + /// # Memory Ordering + /// + /// This does not synchronize memory. To synchronize memory with the waiting + /// thread, call `fence(Ordering::Release)` before this function. The other + /// thread must issue a corresponding `fence(Ordering::Acquire)` call. 
+ /// /// # Safety /// - /// The latch pointer must be valid when passed to this function, and must - /// not be allowed to become dangling until after the latch is set. + /// The latch pointer must be valid when passed to this function. After this + /// call, the latch pointer may become dangling and must not be dereferenced + /// unless it is known to still be valid. #[inline(always)] pub unsafe fn set(latch: *const Latch) { - // SAFETY: At this point, the latch must still be valid to dereference. - let sleep_controller = unsafe { (*latch).sleep_controller }; + // SAFETY: The caller guarantees the latch is valid when passed to `set`, + // and it is not dereferenced after the `SIGNAL` store below. + let latch = unsafe { &*latch }; + let sleep_controller = latch.sleep_controller; // First we set the state to true. // // In the event of a race with `wait`, this may cause `wait` to return. // Otherwise the other thread will sleep within `wait. - // - // SAFETY: At this point, the latch must still be valid to dereference. - unsafe { (*latch).state.store(SIGNAL, Ordering::Relaxed) }; + latch.state.store(SIGNAL, Ordering::Relaxed); // We must try to wake the other thread, just in case it missed the - // notification and went to sleep. This garentees that the other thread + // notification and went to sleep. This guarantees that the other thread // will make progress. sleep_controller.wake(); } /// Restores the latch to the default state. /// - /// # Safety + /// # Deadlocks + /// + /// This may only be called by the thread that "owns" the latch, and only + /// after it has *observed* the latch entering the `SIGNAL` state, e.g. + /// after either `wait` or `check` has returned `true`. /// - /// This may only be called when in the `SIGNAL` state, eg. after either `wait` or - /// `check` has returned `true`. + /// Calling `reset` from a different thread or before observing the signal + /// is likely to result in deadlocks.
#[inline(always)] - pub unsafe fn reset(&self) { + pub fn reset(&self) { self.state.store(LOCKED, Ordering::Relaxed); } } @@ -128,58 +162,52 @@ impl Latch { #[cfg(not(feature = "shuttle"))] pub struct SleepController { state: AtomicU32, - num_sleeping: &'static AtomicU32, } #[cfg(not(feature = "shuttle"))] impl SleepController { - /// Creates a new latch. Expects to be passed an atomic used for tracking - /// the number of sleeping workers. - pub fn new(num_sleeping: &'static AtomicU32) -> Self { + /// Creates a new sleep controller. + pub const fn new() -> Self { SleepController { state: AtomicU32::new(LOCKED), - num_sleeping, } } - // Attempt to wake the thread to which this belongs. - // - // Returns true if this allows the thread to make progress (by waking it up - // or catching it before it goes to sleep) and false if the thread was - // running. + /// Attempt to wake the thread to which this belongs. + /// + /// Returns true if this allows the thread to make progress (by waking it up + /// or catching it before it goes to sleep) and false if the thread was + /// running. #[inline(always)] pub fn wake(&self) -> bool { - // Set set the state to SIGNAL and read the current state, which must be + // Set the state to SIGNAL and read the current state, which must be // either LOCKED, ASLEEP or SIGNAL. let sleep_state = self.state.swap(SIGNAL, Ordering::Relaxed); - let asleep = sleep_state == ASLEEP; - if asleep { - // Decrement the sleeping counter by one. - self.num_sleeping.fetch_sub(1, Ordering::Relaxed); + if sleep_state == ASLEEP { // If the state was ASLEEP, the thread is either asleep or about to // go to sleep. // - // + If it is about to go to sleep (but has not yet called + // * If it is about to go to sleep (but has not yet called // `atomic_wait::wait`) then setting the state to SIGNAL above // should prevent it from going to sleep. 
// - // + If it is already waiting, the following notification will wake + // * If it is already waiting, the following notification will wake // it up. // // Either way, after this call the other thread must make progress. atomic_wait::wake_one(&self.state); } - // Return true if the other thread was asleep - asleep + // Return true if the other thread was asleep and not already notified. + sleep_state == ASLEEP } - // Attempt to send the thread to sleep. This should only be called on a - // single thread, and we say that this controller "belongs" to that thread. - // - // Returns true if this thread makes a syscall to suspend the thread, and - // false if the thread was already woken (letting us skip the syscall). + /// Attempt to send the thread to sleep. This should only be called on a + /// single thread, and we say that this controller "belongs" to that thread. + /// + /// Returns true if this thread makes a syscall to suspend the thread, and + /// false if the thread was already woken (letting us skip the syscall). #[cold] - pub fn sleep(&self) { + pub fn sleep(&self, seat_number: usize, sleeping: &'static AtomicU32) { // Set the state to ASLEEP and read the current state, which must be // either LOCKED or SIGNAL. let state = self.state.swap(ASLEEP, Ordering::Relaxed); @@ -187,10 +215,10 @@ impl SleepController { // we should try to put the thread to sleep. Otherwise we should return // early. if state == LOCKED { - // Increase the sleeping count by one. - self.num_sleeping.fetch_add(1, Ordering::Relaxed); + // Set the sleeping bit for this worker. + sleeping.fetch_or(1 << seat_number, Ordering::Relaxed); // If we have received a signal since entering the sleep state - // (meaning the state is not longer set to ASLEEP) then this will + // (meaning the state is no longer set to ASLEEP) then this will // return immediately. 
// // If the state is still ASLEEP, then the next call to `wake` will @@ -198,6 +226,8 @@ impl SleepController { // // Either way, there is no way we can fail to receive a `wake`. atomic_wait::wait(&self.state, ASLEEP); + // Clear the sleeping bit for this worker. + sleeping.fetch_and(!(1 << seat_number), Ordering::Relaxed); } // Set the state back to LOCKED so that we are ready to receive new // signals. @@ -217,17 +247,14 @@ pub struct SleepController { } #[cfg(feature = "shuttle")] -impl Default for SleepController { - fn default() -> SleepController { +impl SleepController { + pub fn new() -> Self { SleepController { state: Mutex::new(LOCKED), condvar: Condvar::new(), } } -} -#[cfg(feature = "shuttle")] -impl SleepController { pub fn wake(&self) -> bool { let state = core::mem::replace(&mut *self.state.lock().unwrap(), SIGNAL); let asleep = state == ASLEEP; @@ -237,43 +264,33 @@ impl SleepController { asleep } - pub fn sleep(&self) { + pub fn sleep(&self, seat_number: usize, sleeping: &'static AtomicU32) { let mut state = self.state.lock().unwrap(); if *state == LOCKED { *state = ASLEEP; - self.condvar.wait(state).unwrap(); + sleeping.fetch_or(1 << seat_number, Ordering::Relaxed); + while *state == ASLEEP { + state = self.condvar.wait(state).unwrap(); + } + sleeping.fetch_and(!(1 << seat_number), Ordering::Relaxed); } + *state = LOCKED; } } // ----------------------------------------------------------------------------- -// Async waker +// Async wakers -impl Latch { - /// Creates an async waker from a reference to a latch. - /// - /// # Safety - /// - /// The latch must outlive the waker. - pub unsafe fn as_waker(self: Pin<&Self>) -> Waker { - let this: *const Self = Pin::get_ref(self); - let raw_waker = RawWaker::new(this.cast::<()>(), &RAW_WAKER_VTABLE); - // SAFETY: The RawWakerVTable api contract is upheald and these - // functions are all thread-safe. 
- unsafe { Waker::from_raw(raw_waker) } +impl Wake for Latch { + fn wake(self: Arc) { + // SAFETY: The borrowed `Arc` is held for the duration of this call, + // keeping the `Latch` alive. + unsafe { Latch::set(self.borrow()) }; } -} -const RAW_WAKER_VTABLE: RawWakerVTable = RawWakerVTable::new( - #[inline(always)] - |ptr| RawWaker::new(ptr, &RAW_WAKER_VTABLE), - wake, - wake, - |_| {}, -); - -fn wake(this: *const ()) { - let latch = this.cast::(); - // SAFETY: The latch must be valid for the duration - unsafe { Latch::set(latch) }; + fn wake_by_ref(self: &Arc) { + // SAFETY: The borrowed `Arc` is held for the duration of this call, + // keeping the `Latch` alive. + unsafe { Latch::set(self.borrow()) }; + } } diff --git a/src/lib.rs b/src/lib.rs index 368c2d0..dd3071d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,12 +4,18 @@ //! `ForkJoinPool`. //! //! It features: -//! + Statically defined and dynamically sized thread pools. -//! + Fully stack-allocated and inlined fork/join parrellism. -//! + The ability to execute both closures and futures on the same pool. -//! + Hybrid scopes that can contain work distributed across multiple thread pools. -//! + A primitive for awaiting async work in non-async contexts without spinning. -//! + An exposed unsafe api, built for for low-level integration and customization. +//! +//! * Statically defined and dynamically sized thread pools. +//! +//! * Fully stack-allocated and inlined fork/join parallelism. +//! +//! * The ability to execute both closures and futures on the same pool. +//! +//! * Hybrid scopes that can contain work distributed across multiple thread pools. +//! +//! * A primitive for awaiting async work in non-async contexts without spinning. +//! +//! * An exposed unsafe api, built for low-level integration and customization. //! //! Here's an example of what it looks like: //! @@ -24,7 +30,7 @@ //! THREAD_POOL.resize_to_available(); //! //! // Register this thread as a worker on the pool. -//! 
THREAD_POOL.with_worker(|worker| { +//! THREAD_POOL.expect_worker(|worker| { //! // Spawn a job onto the pool. The closure also accepts a worker, because the //! // job may be executed on a different thread. This will be the worker for whatever //! // thread it executes on. @@ -96,7 +102,7 @@ //! external thread tries to use a pool of size zero (with no workers), it will //! still be able to do work, it just won't be done in parallel. And if multiple //! external threads use an empty pool at the same time, they will sometimes try -//! to collaborate and help each-other out with work. +//! to collaborate and help each other out with work. //! //! ``` //! # use forte::ThreadPool; @@ -113,7 +119,7 @@ //! THREAD_POOL.depopulate(); //! //! // Do the same work, but this time we know it will execute serially (because -//! // there are no workers to parallelized it). +//! // there are no workers to parallelize it). //! THREAD_POOL.join(|_| println!("world"), |_| println!("hello ")); //! //! // This will always print "hello world" (because join happens execute things @@ -125,22 +131,23 @@ //! Thread pools are comprised of (and run on) workers, represented as instances //! of the [`Worker`] type. All work done on the pool is done in a "worker //! context" created by [`Worker::occupy`]. The recommended way to access a -//! worker context for a specific pool is via [`ThreadPool::with_worker`]. +//! worker context for a specific pool is via [`ThreadPool::with_worker`], +//! [`ThreadPool::on_worker`], or [`ThreadPool::expect_worker`]. //! //! ``` //! # use forte::ThreadPool; //! # static THREAD_POOL: ThreadPool = ThreadPool::new(); -//! THREAD_POOL.with_worker(|worker_1| { // <-- Creates a worker on the pool. -//! THREAD_POOL.with_worker(|worker_2| { // <-- Returns a reference to the existing worker. +//! THREAD_POOL.expect_worker(|worker_1| { // <-- Sets up this thread as a worker. +//! THREAD_POOL.expect_worker(|worker_2| { // <-- Returns a reference to the existing worker. 
//! // These pointers are identical. //! assert!(std::ptr::eq(worker_1, worker_2)); -//! }); // <-- Leaving this scope does nothing. -//! }); // <-- Leaving this scope frees the worker. +//! }); // <-- Leaving this scope does nothing. +//! }); // <-- Leaving this scope frees the worker. //! ``` //! //! Every worker holds a local queue of tasks, as well as metadata that allows //! other workers on the pool to communicate with it and wake it from sleep. -//! When existing outermost scope (where the worker was actually allocated), all +//! When exiting the outermost scope (where the worker was actually allocated), all //! tasks left in the local queue are executed. //! //! You will only ever receive `&Worker` references, because the worker is not @@ -149,7 +156,7 @@ //! //! To access the current worker context, you can use [`Worker::map_current`] or //! [`Worker::with_current`]. These allow executing work on arbitrary pools, and -//! can be used to write library code that works normally dispute not knowing +//! can be used to write library code that works normally despite not knowing //! about the thread pool static defined by the application. //! //! ```rust @@ -163,7 +170,6 @@ //! None => foo() //! }) //! } -//! //! ``` //! //! # Core Operations @@ -175,7 +181,7 @@ //! * *Block on.* Waits for a future to complete (outside of an async context). //! //! All of these with the exception of *Spawn* are blocking; they have a -//! specific join-point where a thread must wait for the all the forks of the +//! specific join-point where a thread must wait for all the forks of the //! parallel operation to complete before proceeding. While it is waiting, //! threads will attempt to do background work, or help each-other out with //! their assigned workload. @@ -191,8 +197,8 @@ //! | *Block on* | [`block_on()`] | [`ThreadPool::block_on()`] | [`Worker::block_on()`] //! //! * *Worker.* Uses the provided worker context. -//! 
* *Thread pool.* Looks for an existing worker context, creates one if it dosn't find one. -//! * *Headless.* Looks for an existing worker context, and panics if it dosn't find one. +//! * *Thread pool.* Looks for an existing worker context, creates one if it doesn't find one. +//! * *Headless.* Looks for an existing worker context, and panics if it doesn't find one. //! //! The headless and thread pool flavors are more or less just aliases for the //! worker flavor. Where possible, the worker flavor should be preferred to the @@ -273,9 +279,8 @@ mod platform { pub use core::sync::atomic::AtomicPtr; pub use core::sync::atomic::AtomicU32; pub use core::sync::atomic::Ordering; - pub use std::sync::Barrier; - pub use std::sync::Condvar; pub use std::sync::Mutex; + pub use std::sync::OnceLock; pub use std::thread::Builder as ThreadBuilder; pub use std::thread::JoinHandle; pub use std::thread::available_parallelism; @@ -287,8 +292,11 @@ mod platform { // Core exports + pub use std::sync::OnceLock; // shuttle has no OnceLock; std's version is fine here + + pub use shuttle::rand::Rng; + pub use shuttle::rand::thread_rng; pub use shuttle::sync::Arc; - pub use shuttle::sync::Barrier; pub use shuttle::sync::Condvar; pub use shuttle::sync::Mutex; pub use shuttle::sync::Weak; @@ -300,9 +308,6 @@ mod platform { pub use shuttle::thread::JoinHandle; pub use shuttle::thread_local; - pub use shuttle::rand::Rng; - pub use shuttle::rand::thread_rng; - // Available parallelism pub fn available_parallelism() -> std::io::Result> { diff --git a/src/scope.rs b/src/scope.rs index 641ae97..71a6c19 100644 --- a/src/scope.rs +++ b/src/scope.rs @@ -5,6 +5,7 @@ use alloc::boxed::Box; use core::any::Any; use core::cell::UnsafeCell; use core::future::Future; +use core::hint::cold_path; use core::marker::PhantomData; use core::mem::ManuallyDrop; use core::pin::Pin; @@ -61,15 +62,16 @@ use crate::unwind::AbortOnDrop; pub struct Scope<'scope, 'env: 'scope> { /// Number of active references to the 
scope (including the owning /// allocation). This is incremented each time a new `ScopePtr` is created, - /// and decremented when a `ScopePtr` is dropped or the owning thead is done - /// using it. + /// and decremented when a `ScopePtr` is dropped or the owning thread is + /// done using it. count: AtomicU32, /// A latch used to communicate when the scope has been completed. completed: Latch, /// If any job panics, we store the result here to propagate it. panic: AtomicPtr>, - /// This adds invariance over 'scope, to make sure 'scope cannot shrink, - /// which is necessary for soundness. + /// This adds invariance over 'scope. In other words, it ensures 'scope + /// cannot shrink or grow. This keeps the lifetime properly bound to the + /// closure. /// /// Without invariance, this would compile fine but be unsound: /// @@ -87,13 +89,18 @@ pub struct Scope<'scope, 'env: 'scope> { /// # }); /// ``` _scope: PhantomData<&'scope mut &'scope ()>, - /// This adds covariance over 'env. + /// This adds invariance over 'env. In other words, it ensures 'env cannot + /// shrink or grow. + /// + /// This is not strictly necessary for correctness, and could probably be + /// covariant instead. Invariance was chosen to follow the precedent set by + /// `std::thread::scope`. _env: PhantomData<&'env mut &'env ()>, } /// Executes a new scope on a worker. [`Worker::scope`], -/// [`ThreadPool::scope`][crate::ThreadPool::scope] and [`scope`][crate::scope()] are all just -/// an aliases for this function. +/// [`ThreadPool::scope`][crate::ThreadPool::scope] and +/// [`scope`][crate::scope()] are all just aliases for this function. /// /// For details about the `'scope` and `'env` lifetimes see [`Scope`]. #[inline] @@ -102,10 +109,21 @@ where F: for<'scope> FnOnce(&'scope Scope<'scope, 'env>) -> T, { let abort_guard = AbortOnDrop; - // SAFETY: The scope is never moved or mutably referenced. The scope is only - // dropped at the end of this function, after the call to `complete`. 
The - // abort guard above prevents the stack from being dropped early during a - // panic unwind. + // SAFETY: `Scope::new` requires: + // + // 1. The `Scope` is never moved after initialization. + // + // 2. `complete` is called exactly once before the `Scope` is dropped. + // + // The scope is not moved in this function, and since no `&mut Scope` + // reference is allowed to escape, the caller cannot safely cause the scope + // to move either. + // + // `Scope::complete` is called unconditionally on the line below, before + // the implicit drop of `scope`. If the closure `f` panics, it is caught and + // re-emitted after `complete` finishes. In the event of an uncaught panic, + // we cannot ensure `complete` runs properly before the scope is dropped, so + // we force an abort via an `AbortOnDrop` guard. let scope = unsafe { Scope::new(worker) }; // Panics that occur within the closure should be caught and propagated once // all spawned work is complete. This is not a safety requirement, it's just @@ -138,9 +156,18 @@ impl<'scope, 'env> Scope<'scope, 'env> { /// /// # Safety /// - /// The caller must promise not to move or mutably reference this scope - /// until it is dropped, and must not allow the scope to be dropped until - /// after `Scope::complete` is run and returns. + /// The caller must ensure: + /// + /// * The `Scope` is never moved after creation. `ScopePtr::new` captures a + /// raw `*const Scope` pointer, and spawned jobs hold onto these pointers + /// until they complete. Moving the scope would invalidate these pointers + /// and cause UB when any `ScopePtr` is dropped or used for scope access. + /// + /// * `complete` is called exactly once before the `Scope` is dropped, after + /// which no `ScopePtr` may be created for this scope. `complete` blocks + /// until the reference count ticks down to zero, ensuring that the scope + /// outlives all `ScopePtr` references. Failing to call `complete` may
Failing to call `complete` may + /// result in dangling `ScopePtr` and produce use-after-free. unsafe fn new(worker: &Worker) -> Scope<'scope, 'env> { Scope { count: AtomicU32::new(1), @@ -154,9 +181,9 @@ impl<'scope, 'env> Scope<'scope, 'env> { /// Runs a closure or future sometime before the scope completes. Valid /// inputs to this method are: /// - /// + A `for<'worker> FnOnce(&'worker Worker)` closure, with no return type. + /// * A `for<'worker> FnOnce(&'worker Worker)` closure, with no return type. /// - /// + A `Future` future, with no return type. + /// * A `Future` future, with no return type. /// /// # Panics /// @@ -168,9 +195,9 @@ impl<'scope, 'env> Scope<'scope, 'env> { /// Runs a closure or future sometime before the scope completes. Valid /// inputs to this method are: /// - /// + A `for<'worker> FnOnce(&'worker Worker)` closure, with no return type. + /// * A `for<'worker> FnOnce(&'worker Worker)` closure, with no return type. /// - /// + A `Future` future, with no return type. + /// * A `Future` future, with no return type. /// /// Unlike [`Scope::spawn`], this accepts the current worker as a parameter. pub fn spawn_on>(&'scope self, worker: &Worker, scoped_work: S) { @@ -183,7 +210,7 @@ impl<'scope, 'env> Scope<'scope, 'env> { /// `Scope::remove_reference`, or the scope will block forever on /// completion. fn add_reference(&self) { - let counter = self.count.fetch_add(1, Ordering::Release); + let counter = self.count.fetch_add(1, Ordering::Relaxed); tracing::trace!("scope reference counter increased to {}", counter + 1); } @@ -191,11 +218,16 @@ impl<'scope, 'env> Scope<'scope, 'env> { /// /// # Safety /// - /// The caller must ensure that there is exactly one a matching call to - /// `add_reference` for every call to this function, unless used within - /// `Scope::complete`. 
+ /// The caller must ensure that each call to `remove_reference` corresponds + /// to exactly one prior call to `add_reference` (or the implicit initial + /// count of 1 provided by `Scope::new`, in the case of `Scope::complete`). + /// + /// If `remove_reference` is called without a matching `add_reference`, the + /// scope latch will be set prematurely, potentially allowing the scope to + /// be freed while a `ScopePtr` still holds a pointer to it. Uses of the + /// `ScopePtr` thereafter may produce use-after-free. unsafe fn remove_reference(&self) { - let counter = self.count.fetch_sub(1, Ordering::Acquire); + let counter = self.count.fetch_sub(1, Ordering::Relaxed); tracing::trace!("scope reference counter decreased to {}", counter - 1); if counter == 1 { // Alerts the owning thread that the scope has completed. @@ -204,8 +236,11 @@ impl<'scope, 'env> Scope<'scope, 'env> { // once, when the scope has been dropped and all work has been // completed. // - // SAFETY: The latch is passed as a reference, and is live for the - // duration of the function. + // SAFETY: The owning thread must call `Scope::complete` before + // dropping any `Scope`, and `Scope::complete` does not return until + // the latch is set, which happens only here, after the count + // reaches zero. Therefore, the `completed` field of this `Scope` + // must still be a live latch. unsafe { Latch::set(&self.completed) }; } } @@ -215,9 +250,24 @@ impl<'scope, 'env> Scope<'scope, 'env> { /// remainder are dropped. #[cold] fn store_panic(&self, err: Box) { + // Check if the panic pointer has already been set. This lets us avoid + // allocating a second time, and means we can immediately drop the panic + // we have just been passed. + // + // Dropping this panic may itself trigger a panic, but this will simply + // trigger the scope's abort guard, causing an abort rather than UB.
if self.panic.load(Ordering::Relaxed).is_null() { let nil = ptr::null_mut(); let err_ptr = Box::into_raw(Box::new(err)); + // Try to atomically swap the panic pointer from null to the newly + // allocated error slot. If this succeeds, the write occurs with + // `Release` ordering, which establishes a happens-before + // relationship with the fence in `maybe_propagate_panic`, so that + // the heap-allocated error will be visible to the reader. + // + // If the write fails, another panic must have already occurred, and + // we don't need to synchronize memory (the previous call to + // `store_panic` handles the synchronization for its panic data). if self .panic .compare_exchange(nil, err_ptr, Ordering::Release, Ordering::Relaxed) @@ -238,8 +288,18 @@ impl<'scope, 'env> Scope<'scope, 'env> { /// Propagates any panic captured while the scope was executing. fn maybe_propagate_panic(&self) { + // Swap out the panic pointer. This gives us exclusive read access to + // whatever it points to. let panic = self.panic.swap(ptr::null_mut(), Ordering::Relaxed); if !panic.is_null() { + // We generally don't expect panics to happen. + cold_path(); + // If the panic pointer is not null, emit an `Acquire` fence to + // establish a happens-after relationship with the `Release` branch + // of the `compare_exchange` call in `store_panic`, so that the + // error stored at the memory location pointed to by the atomic + // pointer will be visible on the following line. + fence(Ordering::Acquire); // SAFETY: This was created by `Box::into_raw` in `store_panic` and, // because of the atomic swap just above, is only called once for // each box. @@ -264,6 +324,7 @@ impl<'scope, 'env> Scope<'scope, 'env> { // causing the latch to become set and allowing this function to // return. unsafe { self.remove_reference() }; + // Wait for the remaining work to complete.
worker.wait_for(&self.completed); } @@ -272,11 +333,13 @@ impl<'scope, 'env> Scope<'scope, 'env> { // ----------------------------------------------------------------------------- // Generalized scoped spawn trait -/// A trait for types that can be spawned onto a [`Scope`]. It is implemented for: +/// A trait for types that can be spawned onto a [`Scope`]. +/// +/// It is implemented for: /// -/// + Closures that satisfy `for<'worker> FnOnce(&'worker Worker) + Send + 'scope`. +/// * Closures that satisfy `for<'worker> FnOnce(&'worker Worker) + Send + 'scope`. /// -/// + Futures that satisfy `Future + Send + 'scope`. +/// * Futures that satisfy `Future + Send + 'scope`. /// /// Due to a bug in rustc, you may be given errors when using closures /// with inferred types. If you encounter the following: @@ -332,7 +395,7 @@ where let job_ref = unsafe { job.into_job_ref() }; // Send the job to a queue to be executed. - worker.enqueue(job_ref); + worker.fifo_queue.push_new(job_ref); } } @@ -344,7 +407,7 @@ where fn spawn_on<'env, 'worker>(self, worker: &'worker Worker, scope: &'scope Scope<'scope, 'env>) { let poll_job = ScopeFutureJob::new(worker.thread_pool(), scope, self); let job_ref = poll_job.into_job_ref(); - worker.enqueue(job_ref); + worker.fifo_queue.push_new(job_ref); } } @@ -360,14 +423,14 @@ const READY: u32 = 0; /// This value is used for the state of future-jobs that have already been /// woken. Jobs in this state may be in one of the three following categories: /// -/// + A pending job that has been (or is about to be) pushed to the queue +/// * A pending job that has been (or is about to be) pushed to the queue /// so that it can be polled. /// -/// + A pending job that is currently being polled (or has just finished) and +/// * A pending job that is currently being polled (or has just finished) and /// which was *not* queued after it was woken, because it was woken while /// running. /// -/// + A job that was woken after it completed or panicked. 
These jobs will stay +/// * A job that was woken after it completed or panicked. These jobs will stay /// in the WOKEN state forever, and will never be queued or polled again. /// /// When a WOKEN future-job is executed by a worker, it switches into the LOCKED @@ -382,7 +445,7 @@ const WOKEN: u32 = 1; /// are either executing, completed, or have been canceled due to a panic. They /// may switch to the WOKEN state at any time, but are not queued to the pool /// when this happens (they are instead queued when the future is done being -/// polled, assuming it has not pancaked or been completed). +/// polled, assuming it has not panicked or been completed). /// /// When a job finished executing and has not been WOKEN, it switches back to /// the READY state. @@ -426,7 +489,7 @@ impl<'scope, 'env, Fut> ScopeFutureJob<'scope, 'env, Fut> where Fut: Future + Send + 'scope, { - /// This vtable is part of what allows a `ScopedFutureJob` to act as an + /// This vtable is part of what allows a `ScopeFutureJob` to act as an /// async task waker. const VTABLE: RawWakerVTable = RawWakerVTable::new( Self::clone_as_waker, @@ -453,7 +516,7 @@ where }) } - /// Converts an `Arc` into a job ref that can be queued on a + /// Converts an `Arc` into a job ref that can be queued on a /// thread pool. The ref-count is not decremented, ensuring that the job /// remains alive while this job ref exists. /// @@ -462,8 +525,26 @@ where // SAFETY: Pointers created by `Arc::into_raw` are never null. let job_pointer = unsafe { NonNull::new_unchecked(Arc::into_raw(self).cast_mut().cast()) }; - // SAFETY: This pointer is an erased `Arc` which is what - // `Self::poll` expects to receive. + // SAFETY: `JobRef::new_raw` requires that: + // + // * `job_pointer` and `Self::poll` be "matched". + // + // `Self::poll` expects a pointer created by calling `Arc::into_raw` + // on an `Arc`, which is exactly what `job_pointer` is. 
+ // + // * `job_pointer` points to an initialized and aligned value which is + // neither moved nor dropped until it is executed. + // + // The Arc reference count must be at least 1. `Arc::into_raw` transfers + // ownership of the strong count from `self` into the `JobRef`, and + // that count is only released in `poll`, after the arc produced by + // `Arc::from_raw` is dropped. The data is therefore guaranteed to + // remain live until `poll` is called. + // + // * If `poll` has additional safety requirements, `job_pointer` upholds + // them. + // + // In this case, `poll` does not have any additional requirements. unsafe { JobRef::new_raw(job_pointer, Self::poll) } } @@ -521,12 +602,32 @@ where abort(); } - // At this point, we have acquired exclusive ownership of the future. - - // SAFETY: The arc never moves, and the future cannot be aliased mutably - // elsewhere because this is the only place we access it, and no other - // threads can have gotten past the memory swap above without causing an - // abort. + // SAFETY: The following line requires that: + // + // 1. No other mutable references to the future exist. + // + // 2. The future will not move. + // + // Access to the future is protected by the `state` field, which acts + // as a mutex. Just above, we executed + // + // state.swap(LOCKED, Ordering::Acquire) + // + // which transitions us from the `WOKEN` into the `LOCKED` state. Any + // concurrent caller that also tries to execute `poll` will fail this + // swap, and cause an abort. Exclusive access is therefore guaranteed. + // + // In the event that `poll` has been called previously, the `Acquire` + // ordering pairs with the `Release` fence emitted alongside the call to + // + // state.compare_exchange(LOCKED, READY, Ordering::Relaxed, Ordering::Relaxed) + // + // later in this function. This ensures that all writes to the future + // performed by previous invocations are visible to us before we form + // the mutable reference.
+ // + // The future does not move, because it is stored in a field within an + // `Arc`, which has a stable heap-allocated address. let future = unsafe { Pin::new_unchecked(&mut *this.future.get()) }; // Create a new context from the waker, and poll the future. @@ -542,10 +643,6 @@ where } // The job is still pending, and has not yet panicked. Ok(Poll::Pending) => { - // The fence here ensures that our changes to the future become - // visible to the next thread to execute the job and poll the - // future. - fence(Ordering::Release); // Try to set the state back back idle so other threads can // schedule it again. This will only fail if the job was woken // while running, and is already in the WOKEN state. @@ -556,15 +653,28 @@ where .state .compare_exchange(LOCKED, READY, Ordering::Relaxed, Ordering::Relaxed) .is_err(); + // Emit a fence here, which synchronizes with the `Acquire` swap + // at the start of this function to ensure that the next thread + // to poll this future will observe the most recent version of + // it. + // + // A fence is required here because the write to `state` that + // establishes the happens-before relationship may be caused by + // either (a) the `compare_exchange` call above, or (b) the + // `swap` call in `wake`. + // + // This fence lets `wake` use `Relaxed` ordering, and upgrades + // it to `Release` only when necessary. + fence(Ordering::Release); // If the job was woken while running, it should be queued // immediately. Conveniently, we know the state will already be - // QUEUED, so we can leave it as it is. + // WOKEN, so we can leave it as it is. if rescheduled { // This converts the local `Arc` into a job ref, // preventing it from being dropped and potentially // extending the job's lifetime. let job_ref = this.into_job_ref(); - worker.enqueue(job_ref); + worker.fifo_queue.push_new(job_ref); } } // The job panicked. Store the panic in the scope so it can be @@ -589,7 +699,7 @@ where /// instance of `Arc` that is still alive. 
unsafe fn clone_as_waker(this: *const ()) -> RawWaker { // SAFETY: This is called on a pointer created by `Arc::into_raw` on an - // instance on of `Arc`. unsafe { Arc::increment_strong_count(this.cast::()) }; RawWaker::new(this, &Self::VTABLE) } @@ -602,14 +712,16 @@ where /// instance of `Arc` that is still alive. unsafe fn wake(this: *const ()) { // SAFETY: This is called on a pointer created by `Arc::into_raw` on an - // instance on of `Arc`. let this = unsafe { Arc::from_raw(this.cast::()) }; if this.state.swap(WOKEN, Ordering::Relaxed) == READY { - this.thread_pool.with_worker(|worker| { - // Convert the waker into a job ref and queue it. - let job_ref = this.into_job_ref(); - worker.enqueue(job_ref); + // Convert the waker into a job ref and queue it. + let thread_pool = this.thread_pool; + let job_ref = this.into_job_ref(); + thread_pool.with_worker(|worker| match worker { + Some(worker) => worker.fifo_queue.push_new(job_ref), + None => thread_pool.queue_shared_job(job_ref), }); } } @@ -620,20 +732,23 @@ where /// /// Must be called with a pointer created by calling `Arc::into_raw` on an /// instance of `Arc` that is still alive. - fn wake_by_ref(this: *const ()) { + unsafe fn wake_by_ref(this: *const ()) { // We use manually drop here to prevent us from consuming the arc on // drop. This functions like an `&Arc` rather than an `Arc`. // // SAFETY: This is called on a pointer created by `Arc::into_raw` on an - // instance on of `Arc`. let this = unsafe { ManuallyDrop::new(Arc::from_raw(this.cast::())) }; if this.state.swap(WOKEN, Ordering::Relaxed) == READY { - this.thread_pool.with_worker(|worker| { - // Clone the waker, convert it into a job-ref and queue it. - let this = ManuallyDrop::into_inner(this.clone()); - let job_ref = this.into_job_ref(); - worker.enqueue(job_ref); + // Clone the waker, convert it into a job-ref and queue it. 
+ let this = ManuallyDrop::into_inner(this.clone()); + let thread_pool = this.thread_pool; + let job_ref = this.into_job_ref(); + + thread_pool.with_worker(|worker| match worker { + Some(worker) => worker.fifo_queue.push_new(job_ref), + None => thread_pool.queue_shared_job(job_ref), }); } } @@ -644,12 +759,12 @@ where /// /// Must be called with a pointer created by calling `Arc::into_raw` on an /// instance of `Arc` that is still alive. - fn drop_as_waker(this: *const ()) { + unsafe fn drop_as_waker(this: *const ()) { // Rather than converting back into an arc, we can just decrement the // counter here. // // SAFETY: This is called on a pointer created by `Arc::into_raw` on an - // instance on of `Arc`. unsafe { Arc::decrement_strong_count(this.cast::()) }; } } @@ -670,10 +785,16 @@ mod scope_ptr { /// reference scope from being deallocated. pub struct ScopePtr<'scope, 'env>(*const Scope<'scope, 'env>); - // SAFETY: !Send for raw pointers is not for safety, just as a lint. + // SAFETY: This is safe because (a) scope-pointer is only used to call + // `add_reference`, `remove_reference`, and `store_panic`, all of which are + // designed to be thread-safe; and (b) the `Scope` cannot be deallocated + // while any `ScopePtr` still points to it (due to reference counting). unsafe impl Send for ScopePtr<'_, '_> {} - // SAFETY: !Sync for raw pointers is not for safety, just as a lint. + // SAFETY: This is safe because (a) scope-pointer is only used to call + // `add_reference`, `remove_reference`, and `store_panic`, all of which are + // designed to be thread-safe; and (b) the `Scope` cannot be deallocated + // while any `ScopePtr` still points to it (due to reference counting). 
unsafe impl Sync for ScopePtr<'_, '_> {} impl<'scope, 'env> ScopePtr<'scope, 'env> { @@ -730,14 +851,13 @@ mod tests { use std::vec; use std::vec::Vec; + use super::Scope; use crate::ThreadPool; use crate::Worker; use crate::scope; use crate::unwind; use crate::util::XorShift64Star; - use super::Scope; - /// Tests that empty scopes return properly. #[test] fn scope_empty() { @@ -833,7 +953,7 @@ mod tests { THREAD_POOL.depopulate(); } - /// Tests that we can spawn futures onto the thraed pool and that they can + /// Tests that we can spawn futures onto the thread pool and that they can /// borrow data as expected. #[test] fn scope_future() { @@ -924,7 +1044,7 @@ mod tests { let a = AtomicU8::new(0); let b = AtomicU8::new(0); - THREAD_POOL.with_worker(|worker| { + THREAD_POOL.on_worker(|worker| { scope(|scope| { for _ in 0..NUM_JOBS { scope.spawn_on(worker, |_: &Worker| { @@ -973,12 +1093,12 @@ mod tests { let mut completed = false; - THREAD_POOL.with_worker(|worker| { + THREAD_POOL.on_worker(|worker| { worker.scope(|scope| { scope.spawn_on(worker, |_: &Worker| { // Creating a new worker instead of reusing the old one is // bad form, but we may as well test it. 
- THREAD_POOL.with_worker(|worker| { + THREAD_POOL.on_worker(|worker| { worker.scope(|scope| { scope.spawn_on(worker, |_: &Worker| { completed = true; @@ -1002,7 +1122,7 @@ mod tests { THREAD_POOL.resize_to_available(); let counter_p = &AtomicUsize::new(0); - THREAD_POOL.with_worker(|worker| { + THREAD_POOL.on_worker(|worker| { worker.scope(|scope| { scope.spawn(move |worker: &Worker| { divide_and_conquer(worker, scope, counter_p, 1024) @@ -1055,7 +1175,7 @@ mod tests { static THREAD_POOL: ThreadPool = ThreadPool::new(); THREAD_POOL.resize_to_available(); - THREAD_POOL.with_worker(|_| { + THREAD_POOL.on_worker(|_| { let mut tree = random_tree(10, 1337); let values: Vec<_> = tree.iter().cloned().collect(); tree.update(|v| *v += 1); @@ -1143,7 +1263,7 @@ mod tests { static THREAD_POOL: ThreadPool = ThreadPool::new(); THREAD_POOL.resize_to_available(); - THREAD_POOL.with_worker(|_| { + THREAD_POOL.on_worker(|_| { let mut max_diff = Mutex::new(0); let bottom_of_stack = 0; scope(|s| the_final_countdown(s, &bottom_of_stack, &max_diff, 5)); diff --git a/src/thread_pool.rs b/src/thread_pool.rs index 24017fa..3006db4 100644 --- a/src/thread_pool.rs +++ b/src/thread_pool.rs @@ -2,11 +2,13 @@ use alloc::boxed::Box; use alloc::format; -use alloc::string::ToString; use alloc::vec::Vec; +use core::array; +use core::borrow::Borrow; use core::cell::Cell; use core::cmp; use core::future::Future; +use core::hint::cold_path; use core::marker::PhantomData; use core::num::NonZero; use core::pin::pin; @@ -14,16 +16,20 @@ use core::ptr; use core::ptr::NonNull; use core::task::Context; use core::task::Poll; -use crossbeam_queue::SegQueue; -use crossbeam_utils::CachePadded; use async_task::Runnable; +use crossbeam_queue::SegQueue; +use crossbeam_utils::CachePadded; +use st3::StealError; +use st3::lifo::Stealer; +use st3::lifo::Worker as Sharer; use tracing::debug; use tracing::trace; use tracing::trace_span; use crate::FnOnceMarker; use crate::FutureMarker; +use crate::job::ExternalJob; 
use crate::job::HeapJob; use crate::job::JobQueue; use crate::job::JobRef; @@ -49,7 +55,8 @@ use crate::util::XorShift64Star; /// # use forte::Worker; /// static THREAD_POOL: ThreadPool = ThreadPool::new(); /// ``` -/// Thread pools are empty when created, and must be explicitly resized at runtime. +/// Thread pools are empty when created, and must be explicitly resized at +/// runtime. /// ``` /// # use forte::ThreadPool; /// # use forte::Worker; @@ -60,144 +67,58 @@ use crate::util::XorShift64Star; /// [`spawn`][ThreadPool::spawn], [`block_on`][ThreadPool::block_on], /// [`join`][ThreadPool::join], or [`scope`][ThreadPool::scope]. pub struct ThreadPool { - /// The internal state of the thread pool - /// - /// This should only be locked infrequently for short periods of time in - /// cold functions. - state: Mutex, - /// A queue used for cooperatively sharing jobs between workers. + /// A bit-set that tracks which seats are occupied. + occupied: CachePadded, + /// A bit-set that tracks which seats are sleeping. + sleeping: CachePadded, + /// Holds shared data for each thread participating in the pool. + seats: OnceLock>, + /// Holds controls for threads spawned and managed by the pool. Initialized + /// on first call to `occupy`, to allow for some non-static constructors. + managed_threads: Mutex, + /// Used to inject external work into the thread pool. This is generally + /// treated as a fallback, for when the thread-pool is at capacity and + /// threads can't register themselves as workers. shared_jobs: SegQueue, - /// A condvar that is used to signal a new worker taking a lease on a seat. - start_heartbeat: Condvar, - /// Tracks the number of currently sleeping workers. Incremented when a - /// worker goes to sleep, decremented when a worker is woken. - num_sleeping: AtomicU32, -} - -/// The internal state of a thread pool. -struct ThreadPoolState { - /// The registry of seats.
These seats may be "leased out" to different - /// threads temporarily, and will be re-used. The seats themselves are - /// leaked, and will never move or be deallocated. - seats: Vec, - /// Threads managed directly by this thread pool. - managed_threads: ManagedThreads, } -impl ThreadPoolState { - /// Claims a lease on the thread pool. A lease can be passed to - /// [`Worker::occupy`] to enter a worker context for the thread pool. - /// - /// There are a finite number of leases available on each pool. If they are - /// already claimed, this returns `None`. - fn claim_lease(&mut self, thread_pool: &'static ThreadPool) -> Lease { - // First try to claim an unoccupied seat. - for (index, seat) in self.seats.iter_mut().enumerate() { - if !seat.occupied { - seat.occupied = true; - return Lease { - thread_pool, - index, - seat_data: seat.data, - }; - } - } - - // If none are available, add a new seat. - let index = self.seats.len(); - let seat_data = Box::leak(Box::new(SeatData { - #[cfg(not(feature = "shuttle"))] - heartbeat: AtomicBool::new(true).into(), - sleep_controller: SleepController::new(&thread_pool.num_sleeping), - })); - let seat = Seat { - occupied: true, - data: seat_data, - }; - self.seats.push(seat); - Lease { - thread_pool, - index, - seat_data, - } - } - - /// Attempts to claim several leases at once. See - /// [`ThreadPool::claim_lease`] for more information. If no leases are - /// available, this returns an empty vector. - fn claim_leases(&mut self, thread_pool: &'static ThreadPool, num: usize) -> Vec { - let mut leases = Vec::with_capacity(num); - - // First try to claim unoccupied seats. - for (index, seat) in self.seats.iter_mut().enumerate() { - if leases.len() == num { - return leases; - } - - if !seat.occupied { - seat.occupied = true; - leases.push(Lease { - thread_pool, - index, - seat_data: seat.data, - }); - } - } - - // Then create new seats as needed. 
- while leases.len() != num { - let index = self.seats.len(); - let seat_data = Box::leak(Box::new(SeatData { - #[cfg(not(feature = "shuttle"))] - heartbeat: AtomicBool::new(true).into(), - sleep_controller: SleepController::new(&thread_pool.num_sleeping), - })); - let seat = Seat { - occupied: true, - data: seat_data, - }; - self.seats.push(seat); - leases.push(Lease { - thread_pool, - index, - seat_data, - }); - } - - leases - } +/// A public interface that can be temporarily claimed and used by a thread. +/// Claiming a seat allows a thread to participate in the thread pool as a +/// worker. +pub(crate) struct Seats { + /// The sharing side of each seat's work-stealing queue. These should only + /// ever be accessed by the thread that currently owns the lease for this + /// seat (to ensure the `!Sync` bound is respected). + sharers: [Sharer; 32], + /// The stealing side of each seat's work-stealing queue. + stealers: [Stealer; 32], + /// The sleep/wake controller for each seat. + sleep_controllers: [SleepController; 32], } -#[derive(Clone)] -struct Seat { - occupied: bool, - data: &'static SeatData, -} - -/// A public interface that can be claimed and used by a worker. -struct SeatData { - /// The heartbeat signal sent to the worker. - #[cfg(not(feature = "shuttle"))] - heartbeat: CachePadded, - /// Allows other threads to wake the worker. - sleep_controller: SleepController, -} +// SAFETY: `stealers` are `Send + Sync` by their own bounds. `sharers[i]` is +// only ever accessed by the single thread holding seat `i`'s occupancy lease; +// the `occupied` bitmask in `ThreadPool` enforces that exclusivity. +unsafe impl Sync for Seats {} /// A lease represents ownership of one of a "seats" in a thread pool, and /// allows the owning thread to participate in that pool as a worker. pub struct Lease { /// The thread pool against which this lease is held. thread_pool: &'static ThreadPool, - /// The index of the claimed seat.
- index: usize, - /// The seat being claimed by this lease. - seat_data: &'static SeatData, + /// The index of the seat in the data list + seat_number: usize, + /// A reference to the pre-initialized seat data (to avoid repeated hits of + /// the `OnceLock`). + seats: &'static Seats, } impl Drop for Lease { fn drop(&mut self) { - let mut state = self.thread_pool.state.lock().unwrap(); - state.seats[self.index].occupied = false; + // Unset the occupied bit for this seat + self.thread_pool + .occupied + .fetch_and(!(1 << self.seat_number), Ordering::Relaxed); } } @@ -205,16 +126,13 @@ impl Drop for Lease { struct ManagedThreads { /// Stores thread controls for workers spawned by the pool. workers: Vec, - /// Stores thread controls for the heartbeat thread. - #[cfg(not(feature = "shuttle"))] - heartbeat: Option, } /// Represents a worker thread that is managed by the pool, as opposed to /// external threads which temporarily participate in the pool. struct ManagedWorker { /// The index of this worker in the public worker info list. - index: usize, + seat_number: usize, /// Controls used to manage the lifecycle of the worker. control: ThreadControl, } @@ -234,28 +152,114 @@ struct ThreadControl { impl ThreadPool { /// Creates a new thread pool. pub const fn new() -> ThreadPool { + // Create the pool itself. ThreadPool { - state: Mutex::new(ThreadPoolState { - seats: Vec::new(), - managed_threads: ManagedThreads { - workers: Vec::new(), - #[cfg(not(feature = "shuttle"))] - heartbeat: None, - }, + seats: OnceLock::new(), + occupied: CachePadded::new(AtomicU32::new(0)), + sleeping: CachePadded::new(AtomicU32::new(0)), + managed_threads: Mutex::new(ManagedThreads { + workers: Vec::new(), }), shared_jobs: SegQueue::new(), - start_heartbeat: Condvar::new(), - num_sleeping: AtomicU32::new(0), } } + /// Returns the pre-allocated steal queues, initializing them on the first call.
+ fn get_seats(&'static self) -> &'static Seats { + self.seats.get_or_init(|| { + let sharers: [Sharer; 32] = + array::from_fn(|_| Sharer::new(Worker::STEAL_QUEUE_CAPACITY)); + let stealers: [Stealer; 32] = array::from_fn(|i| sharers[i].stealer()); + let sleep_controllers = array::from_fn(|_| SleepController::new()); + Box::new(Seats { + sharers, + stealers, + sleep_controllers, + }) + }) + } + + /// Adds a job ref to the shared queue. + pub fn queue_shared_job(&'static self, job_ref: JobRef) { + self.shared_jobs.push(job_ref); + } + /// Claims a lease on the thread pool which can be occupied by a worker /// (using [`Worker::occupy`]), allowing a thread to participate in the pool. + /// + /// Returns none if all seats are occupied. #[cold] - pub fn claim_lease(&'static self) -> Lease { - self.start_heartbeat.notify_one(); - let mut state = self.state.lock().unwrap(); - state.claim_lease(self) + pub fn claim_lease(&'static self) -> Option { + loop { + let occupied = self.occupied.load(Ordering::Relaxed); + if occupied == u32::MAX { + return None; + } + let seat_number = occupied.trailing_ones() as usize; + let mask = 1 << seat_number; + if self.occupied.fetch_or(mask, Ordering::Relaxed) & mask == 0 { + // At this point we have acquired the lease on the seat + return Some(Lease { + thread_pool: self, + seat_number, + seats: self.get_seats(), + }); + } + } + } + + /// Claims up to `n` leases at once in a single atomic transaction. + /// + /// Finds up to `n` free seats, then atomically claims all of them with a + /// single `compare_exchange`. Either every selected seat is claimed together + /// or none are (and the loop retries). Returns between 0 and `n` leases; + /// returns an empty `Vec` when `n` is 0 or the pool is full. 
+ #[cold] + pub fn claim_leases(&'static self, n: usize) -> Vec { + if n == 0 { + return Vec::new(); + } + let seats = self.get_seats(); + loop { + let occupied = self.occupied.load(Ordering::Relaxed); + if occupied == u32::MAX { + return Vec::new(); + } + + // Build a mask of up to `n` free seats by walking the complement. + let mut claimed_seats = 0; + let mut free_seats = !occupied; + for _ in 0..n { + if free_seats == 0 { + break; + } + let seat_bit = free_seats & free_seats.wrapping_neg(); // isolate lowest set bit + claimed_seats |= seat_bit; + free_seats &= !seat_bit; + } + + // Attempt to claim all selected seats in one atomic step. + match self.occupied.compare_exchange( + occupied, + occupied | claimed_seats, + Ordering::Relaxed, + Ordering::Relaxed, + ) { + Ok(_) => { + return (0..32) + .filter(|&i| claimed_seats & (1 << i) != 0) + .map(|seat_number| Lease { + thread_pool: self, + seat_number: seat_number as usize, + seats, + }) + .collect(); + } + Err(_) => { + // Another thread modified `occupied`; retry. + } + } + } } /// Returns an opaque identifier for this thread pool. @@ -268,40 +272,7 @@ impl ThreadPool { /// Returns the number of workers participating in this thread pool. #[inline(always)] pub fn num_workers(&self) -> usize { - todo!() - } - - /// Tries to ensure the calling thread is a member of the thread pool, and - /// then executes the provided closure. If the thread is already a member of - /// the pool, the closure is called directly. Otherwise, the thread will - /// attempt to temporarily register itself with the pool (which can be - /// slightly slower). If registration fails (because the pool is full to - /// capacity) the closure is passed `None` instead of a worker instance. - /// - /// The provided closure is never sent to another thread. 
- #[inline(always)] - pub fn with_worker(&'static self, f: F) -> R - where - F: FnOnce(&Worker) -> R, - { - Worker::with_current(|worker| match worker { - Some(worker) if worker.lease.thread_pool.id() == self.id() => f(worker), - _ => self.with_worker_cold(f), - }) - } - - /// Tries to register the calling thread on the thread pool, and pass a - /// worker instance to the provided closure. - /// - /// This is the slow fallback for `with_worker` covering "external calls" - /// from outside the pool. Never call this directly. - #[cold] - fn with_worker_cold(&'static self, f: F) -> R - where - F: FnOnce(&Worker) -> R, - { - let lease = self.state.lock().unwrap().claim_lease(self); - Worker::occupy(lease, f) + self.occupied.load(Ordering::Relaxed).count_ones() as usize } } @@ -315,8 +286,8 @@ impl ThreadPool { /// /// See [`ThreadPool::resize`] for more information about resizing. pub fn resize_to_available(&'static self) -> usize { - let available = available_parallelism().map(NonZero::get).unwrap_or(1); - let available = available.saturating_sub(2); + let mut available = available_parallelism().map(NonZero::get).unwrap_or(1); + available = available.saturating_sub(1); self.resize_to(available) } @@ -369,7 +340,7 @@ impl ThreadPool { /// Resizes the pool, and returns the new size. /// - /// Not that the new size may be different from the size requested. + /// Note that the new size may be different from the size requested. #[cold] pub fn resize(&'static self, get_size: F) -> usize where @@ -380,14 +351,13 @@ impl ThreadPool { // Resizing a pool is a critical section; only one thread can resize the // pool at a time. This is implemented using a mutex on the thread manager. trace!("locking state"); - let mut state = self.state.lock().unwrap(); + let mut managed_threads = self.managed_threads.lock().unwrap(); // Compute the new size of the pool, given the current size. 
- let current_size = state.managed_threads.workers.len(); + let current_size = managed_threads.workers.len(); - // You are only allowed to spawn managed threads for up to half the total number of workers, - // to leave room for non-managed threads. By default, this means at most 16 workers can be managed. - let mut new_size = get_size(current_size); + // Calculate the new size of the pool (counting only managed workers). + let new_size = get_size(current_size); trace!( "attempting to resize thread pool from {} to {} thread(s)", @@ -401,98 +371,56 @@ impl ThreadPool { } // The size increased cmp::Ordering::Greater => { - // Acquire leases for the new threads. - trace!("locking worker leases"); - let new_leases = state.claim_leases(self, new_size - current_size); - new_size = current_size + new_leases.len(); // Scale back the new size to what we can actually spawn. - trace!("acquired leases for {} new threads", new_size); - - // When not in shuttle, start the heartbeat thread if scaling up from zero. - #[cfg(not(feature = "shuttle"))] - if new_size > 0 && current_size == 0 { - debug!("spawning heartbeat runner"); - let halt = Arc::new(AtomicBool::new(false)); - let heartbeat_halt = halt.clone(); - let handle = ThreadBuilder::new() - .name("heartbeat".to_string()) - .spawn(move || { - heartbeat_loop(self, heartbeat_halt); - }) - .unwrap(); - let control = ThreadControl { halt, handle }; - state.managed_threads.heartbeat = Some(control); - } - - let barrier = Arc::new(Barrier::new(new_leases.len() + 1)); - // Spawn the new workers. 
- for lease in new_leases { - let index = lease.index; - debug!("spawning managed worker with index {}", index); + let leases = self.claim_leases(new_size - current_size); + for lease in leases { + let seat_number = lease.seat_number; + debug!("spawning managed worker for seat number {}", seat_number); let halt = Arc::new(AtomicBool::new(false)); let worker_halt = halt.clone(); - let worker_barrier = barrier.clone(); let handle = ThreadBuilder::new() - .name(format!("worker {index}")) + .name(format!("worker {seat_number}")) .spawn(move || { - managed_worker(lease, worker_halt, worker_barrier); + managed_worker(lease, worker_halt); }) .unwrap(); let control = ThreadControl { halt, handle }; - state - .managed_threads - .workers - .push(ManagedWorker { index, control }); + managed_threads.workers.push(ManagedWorker { + seat_number, + control, + }); } - drop(state); - - // Wait for the threads to start. - barrier.wait(); + drop(managed_threads); } // The size decreased cmp::Ordering::Less => { // Pull the workers we intend to halt out of the thread manager. - let terminating_workers = state.managed_threads.workers.split_off(new_size); - - // Halt the heartbeat thread when scaling to zero. - #[cfg(not(feature = "shuttle"))] - let heartbeat_control = if new_size == 0 { - state.managed_threads.heartbeat.take() - } else { - None - }; + let terminating_workers = managed_threads.workers.split_off(new_size); // Terminate and wake the workers. + let seats = self.get_seats(); for worker in &terminating_workers { // Tell the worker to halt. worker.control.halt.store(true, Ordering::Relaxed); // Wake the worker up. - state.seats[worker.index].data.sleep_controller.wake(); + seats.sleep_controllers[worker.seat_number].wake(); } // Drop the lock on the state so as not to block the workers or heartbeat. - drop(state); + drop(managed_threads); // Determine our seat index. 
- let own_seat = Worker::map_current(|worker| worker.lease.index); + let own_seat_number = Worker::map_current(|worker| worker.lease.seat_number); // Wait for the other workers to fully halt. for worker in terminating_workers { // It's possible we may be trying to terminate ourselves, in // which case we can skip the thread-join. - if Some(worker.index) != own_seat { + if Some(worker.seat_number) != own_seat_number { let _ = worker.control.handle.join(); } } - - // If we took control of the heartbeat, halt it after the workers. - #[cfg(not(feature = "shuttle"))] - if let Some(control) = heartbeat_control { - control.halt.store(true, Ordering::Relaxed); - self.start_heartbeat.notify_one(); - let _ = control.handle.join(); - } } } @@ -501,14 +429,126 @@ impl ThreadPool { } } +// ----------------------------------------------------------------------------- +// Thread pool worker access + +impl ThreadPool { + /// Runs the closure on a thread-pool worker. + /// + /// If this thread is not a worker, it will try to register itself as one. + /// If the thread pool is full, the closure is sent to another worker as a + /// job, and this thread is parked. + /// + /// If your closure is `!Send`, use [`with_worker`][ThreadPool::with_worker] + /// instead. + #[inline(always)] + pub fn on_worker(&'static self, f: F) -> R + where + F: FnOnce(&Worker) -> R + Send, + R: Send, + { + self.with_worker(|worker| match worker { + Some(worker) => f(worker), + None => { + let mut job = ExternalJob::new(f); + // SAFETY: `ExternalJob::as_job_ref` requires: + // + // * The `ExternalJob` must not move or be deallocated until the + // `JobRef` is executed. + // + // * The `JobRef` does not outlive any data the `ExternalJob` closes over. + // + // * `as_job_ref` is not called again while `JobRef` lives. + // + // The `ExternalJob` is a stack-allocated variable. After + // calling `as_job_ref`, we never move `job`, and we wait for + // the job to execute by calling `job.wait_for_value`. 
Only + // after that returns do we allow the `job` to be dropped. This + // also means that any data closed over by the `ExternalJob` + // must outlive the `JobRef`. + // + // Also, `as_job_ref` is plainly called only once. + let job_ref = unsafe { job.as_job_ref() }; + self.queue_shared_job(job_ref); + // SAFETY: `wait_for_value` must be called at most once. This is + // the only call site for this particular `job`, which is a + // stack-local variable. + let result = unsafe { job.wait_for_value() }; + match result { + Ok(value) => value, + Err(error) => unwind::resume_unwinding(error), + } + } + }) + } + + /// Runs the closure on a thread-pool worker. + /// + /// If this thread is not a worker, it will try to register itself as one. + /// If the thread pool is full, this panics. + /// + /// If you don't want to panic, use [`on_worker`][ThreadPool::on_worker] or + /// [`with_worker`][ThreadPool::with_worker] instead. + #[inline(always)] + #[track_caller] + pub fn expect_worker(&'static self, f: F) -> R + where + F: FnOnce(&Worker) -> R, + { + self.with_worker(|worker| match worker { + Some(worker) => f(worker), + None => panic!("thread pool full; not able to access worker"), + }) + } + + /// Runs the closure on a thread-pool worker. + /// + /// If this thread is currently acting as a worker for the thread-pool, this + /// just looks that worker up. If this is not registered as a worker, or the + /// thread's worker is registered with different thread pool, the thread + /// will try to register itself with the correct pool. If the thread pool is + /// full, it passes the closure `None`. + /// + /// The provided closure is never sent to another thread. If your closure is + /// `Send`, consider using [`on_worker`][ThreadPool::on_worker] instead. 
+ #[inline(always)] + pub fn with_worker(&'static self, f: F) -> R + where + F: FnOnce(Option<&Worker>) -> R, + { + Worker::with_current(|worker| match worker { + Some(worker) if worker.lease.thread_pool.id() == self.id() => f(Some(worker)), + _ => self.with_worker_cold(f), + }) + } + + /// Tries to register the calling thread on the thread pool, and pass a + /// worker instance to the provided closure. + /// + /// This is the slow fallback for `with_worker` covering "external calls" + /// from outside the pool. Never call this directly. + #[cold] + fn with_worker_cold(&'static self, f: F) -> R + where + F: FnOnce(Option<&Worker>) -> R, + { + match self.claim_lease() { + Some(lease) => Worker::occupy(lease, |worker| f(Some(worker))), + None => f(None), + } + } +} + // ----------------------------------------------------------------------------- // Generalized spawn trait -/// A trait for types that can be spawned onto a [`ThreadPool`]. It is implemented for: +/// A trait for types that can be spawned onto a [`ThreadPool`]. /// -/// + Closures that satisfy `for<'worker> FnOnce(&'worker Worker) + Send + 'static`. +/// It is implemented for: /// -/// + Futures that satisfy `Future + Send + 'static` where `T: Send + 'static`. +/// * Closures that satisfy `for<'worker> FnOnce(&'worker Worker) + Send + 'static`. +/// +/// * Futures that satisfy `Future + Send + 'static` where `T: Send + 'static`. /// /// Due to a bug in rustc, you may be given errors when using closures /// with inferred types. If you encounter the following: @@ -518,7 +558,7 @@ impl ThreadPool { /// # use forte::Worker; /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); /// THREAD_POOL.spawn(|_| { }); -/// // ^^^^^^^ the trait `Spawn<'_, _>` is not implemented for closure ... +/// // ^^^^^^^ ERROR: the trait `Spawn<'_, _>` is not implemented for closure ... 
/// ``` /// Try adding a type hint to the closure's parameters, like so: /// ``` @@ -558,7 +598,7 @@ where // Queue the job for evaluation if let Some(worker) = worker { - worker.enqueue(job_ref); + worker.fifo_queue.push_new(job_ref); } else { // Push the work into the share queue and wake a worker thread_pool.shared_jobs.push(job_ref); @@ -568,11 +608,11 @@ where pub type Task = async_task::Task; -// Schedules a runnable future as a job. -// -// Async-task prefers that this is a static function, rather than a closure, -// which is why this is a separate function that pulls the thread pool from the -// runnable metadata. +/// Schedules a runnable future as a job. +/// +/// Async-task prefers that this is a static function, rather than a closure, +/// which is why this is a separate function that pulls the thread pool from the +/// runnable metadata. fn schedule_runnable(runnable: Runnable<&'static ThreadPool>) { // Get a ref to the thread pool from the runnable. let thread_pool = *runnable.metadata(); @@ -588,16 +628,19 @@ fn schedule_runnable(runnable: Runnable<&'static ThreadPool>) { let job_ref = unsafe { JobRef::new_raw(job_pointer, execute_runnable) }; // Send this job off to be executed. - thread_pool.with_worker(|worker| { - worker.enqueue(job_ref); + thread_pool.with_worker(|worker| match worker { + Some(worker) => worker.fifo_queue.push_new(job_ref), + None => thread_pool.shared_jobs.push(job_ref), }); } -// Executes a raw pointer to a runnable future. +/// Executes a raw pointer to a runnable future. #[inline(always)] fn execute_runnable(this: NonNull<()>, _worker: &Worker) { - // SAFETY: This pointer was created by the call to `Runnable::into_raw` just above. - let runnable = unsafe { Runnable::<()>::from_raw(this) }; + // SAFETY: This pointer was created by `Runnable::into_raw` in + // `schedule_runnable` with type parameter `&'static ThreadPool`, and + // `from_raw` is called at most once. 
+ let runnable = unsafe { Runnable::<&'static ThreadPool>::from_raw(this) }; // Poll the task. This will drop the future if the task is // canceled or the future completes. runnable.run(); @@ -656,11 +699,10 @@ impl ThreadPool { F: Future + Send, T: Send, { - self.with_worker(|worker| worker.block_on(future)) + self.on_worker(|worker| worker.block_on(future)) } - /// Executes the two closures, possibly in parallel, and returns the - /// results. + /// Executes the two closures, possibly in parallel. /// /// See also: [`Worker::join`] and [`join`]. #[inline(always)] @@ -671,7 +713,7 @@ impl ThreadPool { RA: Send, RB: Send, { - self.with_worker(|worker| worker.join(a, b)) + self.on_worker(|worker| worker.join(a, b)) } /// Creates a scope onto which non-static work can be spawned. @@ -681,9 +723,10 @@ impl ThreadPool { #[inline(always)] pub fn scope<'env, F, T>(&'static self, f: F) -> T where - F: for<'scope> FnOnce(&'scope Scope<'scope, 'env>) -> T, + F: for<'scope> FnOnce(&'scope Scope<'scope, 'env>) -> T + Send, + T: Send, { - self.with_worker(|worker| worker.scope(f)) + self.on_worker(|worker| worker.scope(f)) } } @@ -694,12 +737,10 @@ thread_local! { static WORKER_PTR: Cell<*const Worker> = const { Cell::new(ptr::null()) }; } -/// Holds the local context for a thread pool member, which allows queuing, -/// executing, and sharing jobs on the pool. +/// Represents membership in a thread pool. /// -/// Workers are the recommended way to interface with a thread pool. To get -/// access to worker for a given thread pool, users should call -/// [`ThreadPool::with_worker`]. +/// To get access to a worker for a given thread pool, users should call +/// [`ThreadPool::with_worker`], [`ThreadPool::on_worker`], or [`ThreadPool::expect_worker`]. /// /// Every thread has at most one worker at a time. If a worker has already been /// set up, it may be accessed at any time by calling [`Worker::with_current`]. @@ -716,9 +757,22 @@ thread_local!
{ pub struct Worker { migrated: Cell, lease: Lease, - queue: JobQueue, + /// A sequence of jobs waiting to be executed. Newer jobs are executed + /// before older ones, allowing efficient depth-first execution. During + /// promotion, the oldest job is shared. Populated by `join()`. + /// + /// Jobs in this queue take precedence over those in the fifo queue. + lifo_queue: JobQueue, + /// A sequence of jobs waiting to be executed. Older jobs are executed + /// before newer ones, providing reliably low latency. During promotion, + /// this queue is partitioned into chunks and the chunks are shared. + /// Populated by `spawn()`. + /// + /// Jobs in this queue are executed only when the lifo queue is empty. + pub(crate) fifo_queue: JobQueue, rng: XorShift64Star, - // Make non-send + last_promote_tick: Cell, + // Make non-send. _phantom: PhantomData<*const ()>, } @@ -746,17 +800,19 @@ impl Worker { { trace!("occupying lease"); - let span = trace_span!("occupy", lease = lease.index); + let span = trace_span!("occupy", seat_number = lease.seat_number); let _enter = span.enter(); // Create a new worker to occupy the lease. Note: It's potentially a // problem that the same thread can occupy multiple workers on the same - // thread. We many eventually need to design something to prevent this. + // thread. We may eventually need to design something to prevent this. let worker = Worker { migrated: Cell::new(false), lease, - queue: JobQueue::new(), + fifo_queue: JobQueue::new(), + lifo_queue: JobQueue::new(), rng: XorShift64Star::new(), + last_promote_tick: Cell::new(0), _phantom: PhantomData, }; @@ -767,9 +823,8 @@ impl Worker { // and pass in a worker reference directly. let result = f(&worker); - // Execute the work queue until it's empty. This happens to be pulled in - // LIFO order, but it's fairly arbitrary. - while let Some(job_ref) = worker.queue.pop_newest() { + // Finish executing local work before shutting down. 
+ while let Some(job_ref) = worker.find_local_work() { worker.execute(job_ref, false); } @@ -783,6 +838,13 @@ impl Worker { result } + /// Returns a reference to the push-side `Sharer` queue for this + /// worker's seat. + #[inline(always)] + fn sharer(&self) -> &Sharer { + &self.lease.seats.sharers[self.lease.seat_number] + } + /// Calls the provided closure on the thread's worker instance, if it has one. /// /// Rust's thread locals are fairly costly, so this function is expensive. @@ -794,16 +856,20 @@ impl Worker { { let worker_ptr = WORKER_PTR.with(Cell::get); if !worker_ptr.is_null() { - // SAFETY: The `WORKER` static is only set by `occupy`, and it's - // always set to a stack-allocated `Worker` which is never moved and - // is only accessed through shared references. Therefore, if the - // pointer is non-null, it must be safe to dereference. + // SAFETY: `WORKER_PTR` is a thread-local `Cell` holding a raw + // pointer to a `Worker`. It is only written to by `Worker::occupy`, + // which stores the address of a `Worker` allocated within its own + // stack frame. Before it returns, `occupy` restores the previous + // value of `WORKER_PTR`, so that it is always either null or points + // to a live, immovable `Worker` on the current thread's call stack + // (but is never left dangling). // - // This creates a reference with an unbounded lifetime. To avoid - // turning it into a `'static`, we pass it in to a closure. This - // restricts its lifetime to the closure body, and prevents callers - // from keeping around references to Workers that will be - // deallocated when `occupy` returns. + // If the pointer is non-null, it is therefore valid to dereference + // as a shared reference. Forming a `'static` reference is avoided + // by passing the value into a closure, which bounds the reference's + // lifetime to the closure body and prevents callers from retaining + // it past the point where `occupy` returns and the `Worker` is + // freed.
Some(f(unsafe { &*worker_ptr })) } else { None @@ -828,7 +894,7 @@ impl Worker { // // This creates a reference with an unbounded lifetime. To avoid // turning it into a `'static`, we pass it in to a closure. This - // restricts it's lifetime to the closure body, and prevents callers + // restricts its lifetime to the closure body, and prevents callers // from keeping around references to Workers that will be // deallocated when `occupy` returns. f(Some(unsafe { &*worker_ptr })) @@ -839,8 +905,8 @@ impl Worker { /// Returns the index of the worker in the leases list. #[inline(always)] - pub fn index(&self) -> usize { - self.lease.index + pub fn seat_number(&self) -> usize { + self.lease.seat_number } /// Returns the index of the thread pool of the worker. @@ -849,66 +915,105 @@ impl Worker { self.lease.thread_pool } - /// Pushes a job onto the local queue, overflowing to the shared queue when - /// full. - #[inline(always)] - pub fn enqueue(&self, job_ref: JobRef) { - if let Some(job_ref) = self.queue.push(job_ref) { - // push the work to the shared queue - self.lease.thread_pool.shared_jobs.push(job_ref); - } - } + /// Capacity of the per-worker work-stealing queue. This is the maximum + /// amount a worker can make available for stealing at once. + const STEAL_QUEUE_CAPACITY: usize = 32; + + /// The minimum number of CPU ticks between calls to [`Worker::promote_cold`]. + /// Approximately 5μs at 3 GHz. + const PROMOTE_TICK_INTERVAL: u64 = 15_000; /// Try to promote the oldest task in the queue. #[inline(always)] fn promote(&self) { - // Check for a heartbeat, potentially promoting the job we just pushed - // to a shared job. 
- #[cfg(not(feature = "shuttle"))] - let heartbeat = self.lease.seat_data.heartbeat.load(Ordering::Relaxed); - - #[cfg(feature = "shuttle")] - let heartbeat = true; // thread_rng().gen_bool(0.5); - - if heartbeat && let Some(job_ref) = self.queue.pop_oldest() { - self.promote_cold(job_ref); - #[cfg(not(feature = "shuttle"))] - self.lease - .seat_data - .heartbeat - .store(false, Ordering::Relaxed); + // Promotions are fairly costly, so we limit their frequency using the + // cpu's instruction counter. Promote is called at a high frequency, and + // actually doing the promotion is probably a cold path. + let current_tick = tick_counter::start(); + if current_tick.wrapping_sub(self.last_promote_tick.get()) >= Self::PROMOTE_TICK_INTERVAL { + // This should ideally become a conditional jump. + self.promote_cold(current_tick); } } - /// Pushes work onto the shared queue and wakes another worker. + /// The actual work-promotion implementation. Must be called infrequently. #[cold] - fn promote_cold(&self, job_ref: JobRef) { - // Push the job onto the shared queue. - self.lease.thread_pool.shared_jobs.push(job_ref); - - // Fetch the number of sleeping workers and pending shared tasks - let num_sleeping = self.lease.thread_pool.num_sleeping.load(Ordering::Relaxed); + fn promote_cold(&self, current_tick: u64) { + // Update the promote tick so that `promote` won't call this again soon. + self.last_promote_tick.set(current_tick); - if num_sleeping == 0 { + // Early out if it seems like all workers are already awake. + let sleeping = self.lease.thread_pool.sleeping.load(Ordering::Relaxed); + if sleeping == 0 { return; } + cold_path(); + + // Track if we actually managed to share work. + let mut shared_job = false; + + // Share work from the lifo queue. This is shared bit-by-bit, with old + // (and therefore theoretically "large") tasks shared first. + if let Some(job_ref) = self.lifo_queue.pop_oldest() { + // Push into our own steal queue so siblings can steal it. 
+ if let Err(job_ref) = self.sharer().push(job_ref) { + // If the queue is full, that indicates that the pool is + // probably under high-load and we should continue local-first + // operation. + self.lifo_queue.push_old(job_ref); + } else { + shared_job = true; + } + } - // Try to wake a worker to work on it. - // - // Note: This operation is extremely expensive, and should be avoided if possible. - let seats = self.lease.thread_pool.state.lock().unwrap().seats.clone(); - let num_seats = seats.len(); - let offset = self.rng.next_usize(num_seats); - for i in 0..num_seats { - let i = (i + offset) % num_seats; - if i == self.lease.index { - continue; + // Share work from the fifo queue. Offload the newest jobs in a series of + // small chunks. + for job_refs in self.fifo_queue.split() { + // Create a new job that will insert a chunk of jobs into the + // runner's fifo queue when executed. + // + // This reduces the cost of sharing a large number of small jobs. + let batch_job = HeapJob::new(move |worker| { + worker.fifo_queue.append(job_refs); + }); + // SAFETY: `into_job_ref` requires that the data closed over by the + // `HeapJob` outlive the `JobRef`. + // + // Here, the closure captures `job_refs` (a `VecDeque`) by + // value, and so trivially outlives the newly created `JobRef`. + let batch_job_ref = unsafe { batch_job.into_job_ref() }; + // Push the batch job into the steal queue so siblings can steal it. + if let Err(job_ref) = self.sharer().push(batch_job_ref) { + // If the queue is full, that indicates that the pool is + // probably under high-load and we should continue local-first + // operation. + // + // This just adds the jobs back to the local queue. + self.execute(job_ref, false); + } else { + shared_job = true; + } } - if seats[i].occupied { - let ready = seats[i].data.sleep_controller.wake(); - if ready { - return; - } + } + + // If we added work to the steal queue, wake a random sibling to steal + // it from us, while we do other work.
+ if shared_job { + self.wake_random(sleeping); + } + } + + /// Tries to wake a random sleeping worker. Expects to be given a bitset of + /// sleeping workers. + #[inline(always)] + fn wake_random(&self, sleeping: u32) { + let offset = self.rng.next_usize(32) as u32; + let mut randomized_sleeping = sleeping.rotate_right(offset); + while randomized_sleeping != 0 { + let index = (randomized_sleeping.trailing_zeros() + offset) % 32; + randomized_sleeping &= randomized_sleeping - 1; // Clear the lowest bit + let woken = self.lease.seats.sleep_controllers[index as usize].wake(); + if woken { + return; } } } @@ -916,7 +1021,11 @@ impl Worker { /// Create a new latch owned by the worker. #[inline(always)] pub fn new_latch(&self) -> Latch { - Latch::new(&self.lease.seat_data.sleep_controller) + Latch::new( + self.lease.seat_number, + &self.lease.thread_pool.sleeping, + &self.lease.seats.sleep_controllers[self.lease.seat_number], + ) } /// Runs jobs until the provided latch is set. @@ -926,40 +1035,67 @@ impl Worker { #[inline(always)] pub fn wait_for(&self, latch: &Latch) { while !latch.check() { - #[cfg(feature = "shuttle")] - shuttle::hint::spin_loop(); - if self.yield_now() == Yield::Idle { latch.wait(); } } } - /// Tries to find a job to execute, either in the local queue or shared on - /// the thread pool. - /// - /// The second value is true if the job was shared, or false if it was spawned locally. + /// Finds a job to work on. This function is entirely local, and does no + /// synchronization with the queue. #[inline(always)] - fn find_work(&self) -> Option<(JobRef, bool)> { - // We give preference first to things in our local deque, then in other - // workers deques, and finally to injected jobs from the outside. The - // idea is to finish what we started before we take on something new. - // - // We pull from the local queue in LIFO order, which means are popping - // from the *back* of the queue (the most recently added jobs). 
This is - // because `yield_now` (and by extension `wait_for` which uses it) is - // often called directly after pushing work onto the queue (as in `join` - // and `scope`). Pulling from the back of the queue potentially can - // allow these blocking operations to complete faster. In the cast when - // scopes/joins are deeply nested, this also causes work to be executed - // *depth-first*, which is often desirable. - self.queue + fn find_local_work(&self) -> Option { + self.lifo_queue .pop_newest() + .or_else(|| self.fifo_queue.pop_oldest()) + .or_else(|| self.sharer().pop()) + } + + /// Finds a job to work on. This tries + /// [`find_local_work`][Worker::find_local_work] first, then falls back to + /// pulling shared work from the thread pool. + #[inline(always)] + fn find_work(&self) -> Option<(JobRef, bool)> { + self.find_local_work() .map(|job| (job, false)) + .or_else(|| self.steal_from_siblings().map(|job| (job, true))) .or_else(|| self.claim_shared_job().map(|job| (job, true))) } - /// Claims a shared job from the thread pool. + /// Attempts to steal a job from another worker's work-stealing queue. + /// + /// Iterates over occupied seats in a random order to avoid always hitting + /// the same victim. Because stealers are pre-allocated and permanent, no + /// lock or atomic load is needed to access them. + fn steal_from_siblings(&self) -> Option { + let stealers = &self.lease.seats.stealers; + let occupied = self.lease.thread_pool.occupied.load(Ordering::Relaxed); + let my_seat = self.lease.seat_number as u32; + + // Randomise the starting position so all workers get a fair shot as victims. + let offset = self.rng.next_usize(32) as u32; + let mut bits = (occupied & !(1u32 << my_seat)).rotate_right(offset); + + while bits != 0 { + let shifted_idx = bits.trailing_zeros(); + let idx = (shifted_idx + offset) % 32; + bits &= bits - 1; + // The stealer is a permanent reference — no lock or atomic load needed. 
+ let stealer = &stealers[idx as usize]; + // `steal_and_pop` returns one job directly and moves up to half the + // remaining items into our steal queue for later use. + loop { + match stealer.steal_and_pop(self.sharer(), |n| n / 2) { + Ok((job, _)) => return Some(job), + Err(StealError::Busy) => {} // transient; retry + Err(StealError::Empty) => break, + } + } + } + None + } + + /// Claims a job from the global injector queue. #[inline(always)] fn claim_shared_job(&self) -> Option { self.lease.thread_pool.shared_jobs.pop() @@ -974,7 +1110,7 @@ impl Worker { pub fn yield_local(&self) -> Yield { // We use LIFO order here, pulling the newest work from the queue. This // is just for consistency with yield_now/find_work. - match self.queue.pop_newest() { + match self.find_local_work() { Some(job_ref) => { self.execute(job_ref, false); Yield::Executed @@ -993,9 +1129,7 @@ impl Worker { /// [`Worker::yield_local`] instead. #[inline(always)] pub fn yield_now(&self) -> Yield { - // Try to promote an item from the queue self.promote(); - match self.find_work() { Some((job_ref, migrated)) => { self.execute(job_ref, migrated); @@ -1062,14 +1196,13 @@ impl Worker { T: Send, { // Create a new latch to block the thread until the future completes. - let latch = pin!(self.new_latch()); - let latch = latch.into_ref(); - // Convert the blocker into an async waker. // - // SAFETY: The blocker lasts for the duration of this function, and - // since the waker is only used within this function, it must outlive - // the waker. - let waker = unsafe { latch.as_waker() }; + // This is allocated on the heap, even though the worker is allocated on + // the stack, because we can't prevent futures from keeping wakers + // around for arbitrary amounts of time or issuing wakeups for futures + // that have completed. + let latch = Arc::new(self.new_latch()); + let waker = latch.clone().into(); // Put the waker into an async context that can be used to poll futures. 
let mut ctx = Context::from_waker(&waker); // Pin the future, promising not to move it while it's being polled. @@ -1080,14 +1213,14 @@ impl Worker { // While the future is incomplete, run other tasks or sleep. Poll::Pending => { // This will not return until the latch is set. - self.wait_for(latch.get_ref()); + self.wait_for(latch.borrow()); // We want to keep using the same latch every time we wait // for the future to become ready, so we have to reset it // here. // - // SAFETY: The latch must be in the set state because we - // just waited for it. - unsafe { latch.reset() }; + // The latch must be in the set state because we just waited + // for it. + latch.reset(); } // When it is complete, pull out the result and return it. Poll::Ready(res) => return res, @@ -1095,8 +1228,7 @@ impl Worker { } } - /// Takes two closures and *potentially* runs them in parallel, then returns - /// the results. + /// Executes the two closures, possibly in parallel. /// /// If you do not have access to a [`Worker`], you may call /// [`ThreadPool::join`] or simply [`join`]. @@ -1127,7 +1259,7 @@ impl Worker { let job_ref_id = job_ref.id(); // Push the job onto the queue. - self.enqueue(job_ref); + self.lifo_queue.push_new(job_ref); // If we have received a heartbeat, we remove the oldest item in the // local queue and push it into the shared queue. This causes work to be @@ -1141,7 +1273,7 @@ impl Worker { // Attempt to recover the job from the queue. It should still be there // if we didn't share it. - if self.queue.recover_just_pushed(job_ref_id) { + if self.lifo_queue.recover_newest(job_ref_id) { // SAFETY: Because the ids match, the JobRef we just popped from // the queue must point to `stack_job`, implying that // `stack_job` cannot have been executed yet. @@ -1178,9 +1310,12 @@ impl Worker { // ----------------------------------------------------------------------------- // Thread local scheduling api -/// Spawns a thread onto the current thread pool. 
+/// Runs the provided closure in the background. /// -/// If there is no current thread pool, this panics. +///
+/// Note: +/// This function panics if the current thread is not registered as a worker. +///
/// /// See also: [`Worker::spawn`] and [`ThreadPool::spawn`]. pub fn spawn>(work: S) -> S::Output { @@ -1191,9 +1326,12 @@ pub fn spawn>(work: S) -> S::Output { }) } -/// Blocks the thread waiting for a future to complete. +/// Waits for a future to complete. /// -/// If there is no current thread pool, this panics. +///
+/// Note: +/// This function panics if the current thread is not registered as a worker. +///
/// /// See also: [`Worker::block_on`] and [`ThreadPool::block_on`]. pub fn block_on(future: F) -> T @@ -1208,10 +1346,12 @@ where }) } -/// Takes two closures and *potentially* runs them in parallel. It -/// returns a pair of the results from those closures. +/// Executes the two closures, possibly in parallel. /// -/// If there is no current thread pool, this panics. +///
+/// Note: +/// This function panics if the current thread is not registered as a worker. +///
/// /// See also: [`Worker::join`] and [`ThreadPool::join`]. pub fn join(a: A, b: B) -> (RA, RB) @@ -1228,15 +1368,17 @@ where }) } -/// Creates a "fork-join" scope and invokes the closure with a reference to -/// it. Work spawned onto this scope does not have to have a `'static` +/// Creates a new scope for spawning non-static work. +/// +/// Work spawned onto the new scope does not have to have a `'static` /// lifetime, and can borrow local variables. Local borrowing is possible /// because this function will not return until all work spawned on the /// scope has completed, this ensuring the stack frame is kept alive for the /// duration. /// ///
-/// Note: This function panics if the current thread is not registered as a worker. +/// Note: +/// This function panics if the current thread is not registered as a worker. ///
/// /// # Alternatives @@ -1263,7 +1405,7 @@ where /// # use forte::Worker; /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); /// # THREAD_POOL.populate(); -/// # THREAD_POOL.with_worker(|worker| { +/// # THREAD_POOL.expect_worker(|worker| { /// let ok: Vec = vec![1, 2, 3]; /// forte::scope(|scope| { /// let bad: Vec = vec![4, 5, 6]; @@ -1290,7 +1432,7 @@ where /// # use forte::Worker; /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); /// # THREAD_POOL.populate(); -/// # THREAD_POOL.with_worker(|worker| { +/// # THREAD_POOL.expect_worker(|worker| { /// let ok: Vec = vec![1, 2, 3]; /// forte::scope(|scope| { /// let bad: Vec = vec![4, 5, 6]; @@ -1316,7 +1458,7 @@ where /// # use forte::Worker; /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); /// # THREAD_POOL.populate(); -/// # THREAD_POOL.with_worker(|worker| { +/// # THREAD_POOL.expect_worker(|worker| { /// let ok: Vec = vec![1, 2, 3]; /// forte::scope(|scope| { /// let bad: Vec = vec![4, 5, 6]; @@ -1343,7 +1485,7 @@ where /// # use forte::Worker; /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); /// # THREAD_POOL.populate(); -/// # THREAD_POOL.with_worker(|worker| { +/// # THREAD_POOL.expect_worker(|worker| { /// let ok: Vec = vec![1, 2, 3]; /// forte::scope(|scope| { /// let bad: Vec = vec![4, 5, 6]; @@ -1370,10 +1512,10 @@ where /// # use forte::Worker; /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); /// # THREAD_POOL.populate(); -/// # THREAD_POOL.with_worker(|worker| { +/// # THREAD_POOL.expect_worker(|worker| { /// let mut leak = None; /// forte::scope(|scope| { -/// leak = Some(scope); // <-- scope would be leaked here +/// leak = Some(scope); // <-- ERROR: scope would be leaked here /// }); /// drop(leak); /// # }); @@ -1387,7 +1529,7 @@ where /// # use forte::Worker; /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); /// # THREAD_POOL.populate(); -/// # THREAD_POOL.with_worker(|worker| { +/// # THREAD_POOL.expect_worker(|worker| { /// let mut counter = 0; 
/// let counter_ref = &mut counter; /// forte::scope(|scope| { @@ -1415,12 +1557,13 @@ where /// THREAD_POOL.with_worker(|worker| { /// worker.scope(|scope| { /// worker.spawn(|worker: &Worker| { -/// // ^^^^^ This creates a *non-static* job on the worker, +/// // ^^^^^ ERROR: This creates a *static* job on the worker, /// // which may outlive the scope. /// /// scope.spawn_on(worker, |_: &Worker| { }); -/// // ^^^^^ This requires borrowing the scope within the -/// // unscoped job, which isn't allowed by the compiler. +/// // ^^^^^ ERROR: This requires borrowing the scope within the +/// // unscoped job, which isn't allowed by the compiler +/// // because 'scope would have to outlive 'static. /// }); /// }); /// }); @@ -1439,9 +1582,6 @@ where /// once a task is spawned using `scope.spawn(),` it will execute, even if the /// spawning task should later panic. The scope returns once all work is /// complete, and panics are propagated at that point. -/// -/// Note: Panics in futures are instead propagated to their -/// [`Task`][async_task::Task], and will not cause the scope to panic. pub fn scope<'env, F, T>(f: F) -> T where F: for<'scope> FnOnce(&'scope Scope<'scope, 'env>) -> T, @@ -1460,11 +1600,9 @@ where /// Operating on the principle that you should finish what you start before /// starting something new, workers will first execute their queue, then execute /// shared jobs, then pull new jobs from the injector. -fn managed_worker(lease: Lease, halt: Arc, barrier: Arc) { +fn managed_worker(lease: Lease, halt: Arc) { trace!("starting managed worker"); - barrier.wait(); - // Register as the indicated worker, and work until we are told to halt.
Worker::occupy(lease, |worker| { while !halt.load(Ordering::Relaxed) { @@ -1474,7 +1612,8 @@ fn managed_worker(lease: Lease, halt: Arc, barrier: Arc) { if let Some((job, migrated)) = worker.find_work() { worker.execute(job, migrated); } else { - worker.lease.seat_data.sleep_controller.sleep(); + worker.lease.seats.sleep_controllers[worker.lease.seat_number] + .sleep(worker.lease.seat_number, &worker.lease.thread_pool.sleeping); } } }); @@ -1482,57 +1621,14 @@ fn managed_worker(lease: Lease, halt: Arc, barrier: Arc) { trace!("exiting managed worker"); } -// ----------------------------------------------------------------------------- -// Heartbeat sender loop - -/// This is the main loop for the heartbeat thread. It's in charge of -/// periodically sending a "heartbeat" signal to each worker. By default, each -/// worker receives a heartbeat about once every 100 μs. -/// -/// Workers use the heartbeat signal to amortize the cost of promoting local -/// jobs to shared jobs (which allows other works to claim them) and to reduce -/// lock contention. -/// -/// This is never runs when testing in shuttle. 
-#[cfg(not(feature = "shuttle"))] -fn heartbeat_loop(thread_pool: &'static ThreadPool, halt: Arc) { - trace!("starting managed heartbeat thread"); - - let mut seats = thread_pool.state.lock().unwrap().seats.clone(); - let mut index = 0; - - while !halt.load(Ordering::Relaxed) { - let num_seats = seats.len(); - let (back, front) = seats.split_at(index); - if let Some((offset, seat)) = Iterator::chain(front.iter(), back.iter()) - .enumerate() - .find(|(_, seat)| seat.occupied) - { - index = (index + offset + 1) % num_seats; - seat.data.heartbeat.store(true, Ordering::Relaxed); - std::thread::yield_now(); - seats = thread_pool.state.lock().unwrap().seats.clone(); - } else { - let state = thread_pool.state.lock().unwrap(); - seats = thread_pool - .start_heartbeat - .wait(state) - .unwrap() - .seats - .clone(); - } - } -} - // ----------------------------------------------------------------------------- // Tests #[cfg(all(test, not(feature = "shuttle")))] mod tests { - use std::sync::mpsc::channel; - use alloc::vec; + use std::sync::mpsc::channel; use super::*; @@ -1596,7 +1692,7 @@ mod tests { THREAD_POOL.resize_to_available(); let mut vals = [0; 1_024]; - THREAD_POOL.with_worker(|worker| increment(worker, &mut vals)); + THREAD_POOL.on_worker(|worker| increment(worker, &mut vals)); assert_eq!(vals, [1; 1_024]); THREAD_POOL.depopulate(); @@ -1625,7 +1721,7 @@ mod tests { THREAD_POOL.resize_to_available(); let mut vals = vec![0; 512 * 512]; - THREAD_POOL.with_worker(|worker| increment(worker, &mut vals)); + THREAD_POOL.on_worker(|worker| increment(worker, &mut vals)); assert_eq!(vals, vec![1; 512 * 512]); THREAD_POOL.depopulate(); diff --git a/src/util.rs b/src/util.rs index 33ba5da..661a430 100644 --- a/src/util.rs +++ b/src/util.rs @@ -1,8 +1,7 @@ -use core::{ - cell::Cell, - hash::Hasher, - sync::atomic::{AtomicUsize, Ordering}, -}; +use core::cell::Cell; +use core::hash::Hasher; +use core::sync::atomic::AtomicUsize; +use core::sync::atomic::Ordering; use 
std::hash::DefaultHasher; /// [xorshift*] is a fast pseudorandom number generator which will diff --git a/tests/shuttle.rs b/tests/shuttle.rs index 9d7e14a..776f3be 100644 --- a/tests/shuttle.rs +++ b/tests/shuttle.rs @@ -9,7 +9,6 @@ use core::task::Poll; use forte::ThreadPool; use forte::Worker; - use shuttle::hint::black_box; use shuttle::sync::atomic::AtomicBool; use shuttle::sync::atomic::AtomicUsize; @@ -20,24 +19,6 @@ use tracing_subscriber::fmt::Subscriber; // ----------------------------------------------------------------------------- // Infrastructure -/* - -fn trace(f: F) -where - F: Fn() + Send + Sync + 'static, -{ - let subscriber = Subscriber::builder() - .compact() - .with_max_level(Level::TRACE) - .without_time() - .with_thread_names(false) - .finish(); - - tracing::subscriber::with_default(subscriber, f); -} - -*/ - /// Provides access to a thread pool which can be treated as static for the /// purposes of testing. fn with_thread_pool(f: F) -> impl Fn() + 'static @@ -48,13 +29,9 @@ where let thread_pool = Box::new(ThreadPool::new()); let thread_pool_ptr = Box::into_raw(thread_pool); - // SAFETY: TODO + // SAFETY: This thread pool is never dropped. 
let thread_pool_ref = unsafe { &*thread_pool_ptr }; f(thread_pool_ref); - - // SAFETY: TODO - let thread_pool = unsafe { Box::from_raw(&mut *thread_pool_ptr) }; - drop(thread_pool); } } @@ -70,7 +47,7 @@ pub fn shuttle_populate_depopulate() { pool.depopulate(); }); - shuttle::check_dfs(test, None); + shuttle::check_pct(test, 100_000, 100_000); } // ----------------------------------------------------------------------------- @@ -85,7 +62,7 @@ pub fn shuttle_spawn_closure() { pool.depopulate(); }); - shuttle::check_dfs(test, None); + shuttle::check_pct(test, 100_000, 100_000); } #[derive(Default)] @@ -117,7 +94,7 @@ pub fn shuttle_spawn_future() { pool.depopulate(); }); - shuttle::check_dfs(test, None); + shuttle::check_pct(test, 100_000, 100_000); } /// Tests a two-level join operation on a pool of size one. @@ -146,7 +123,7 @@ pub fn join_4_on_1() { pool.depopulate(); }); - shuttle::check_pct(test, 100_000, 10_000); + shuttle::check_pct(test, 100_000, 100_000); } /// Tests a two-level join operation on a pool of size two. @@ -175,7 +152,7 @@ pub fn join_4_on_2() { pool.depopulate(); }); - shuttle::check_pct(test, 100_000, 10_000); + shuttle::check_pct(test, 100_000, 100_000); } /// Tests a two-level join operation on a pool of size three. @@ -204,7 +181,7 @@ pub fn join_4_on_3() { pool.depopulate(); }); - shuttle::check_pct(test, 100_000, 10_000); + shuttle::check_pct(test, 100_000, 100_000); } /// Tests a moderately deep join operation on a large pool. @@ -226,115 +203,11 @@ pub fn join_long() { } let mut vals = [0; 10]; - pool.with_worker(|worker| increment(worker, &mut vals)); + pool.expect_worker(|worker| increment(worker, &mut vals)); assert_eq!(vals, [1; 10]); pool.depopulate(); }); - shuttle::check_pct(test, 100_000, 10_000); -} - -/* - -/// Tests for concurrency issues when blocking on a future. 
-#[test] -pub fn block_on() { - model(|| { - with_thread_pool(|_, worker| { - worker.block_on(async { - black_box(()); - }); - }); - }); -} - -/// Tests for concurrency issues when spawning a future and then blocking on the -/// resulting task. -#[test] -pub fn spawn_and_block() { - model(|| { - with_thread_pool(|_, worker| { - let task = worker.spawn_future(async { - black_box(()); - }); - worker.block_on(task); - }); - }); -} - -// ----------------------------------------------------------------------------- -// Scoped API - -/// Test for concurrency issues when creating a scope. -#[test] -pub fn scope_empty() { - model(|| { - with_thread_pool(|_, worker| { - worker.scope(|_| {}); - }); - }); -} - -/// Tests for concurrency issues when returning a value from a scope. -#[test] -fn scope_result() { - model(|| { - with_thread_pool(|_, worker| { - let result = worker.scope(|_| 22); - assert_eq!(result, 22); - }); - }); -} - -/// Tests for concurrency issues when spawning a scoped closure. -#[test] -pub fn scope_spawn() { - model(|| { - with_thread_pool(|_, worker| { - let complete = AtomicBool::new(false); - worker.scope(|scope| { - scope.spawn(|_| { - complete.store(true, Ordering::Release); - }); - }); - worker.run_until(&complete); - }); - }); -} - -/// Tests for concurrency issues when spawning multiple scoped closures. -#[test] -pub fn scope_two() { - model(|| { - with_thread_pool(|_, worker| { - let counter = &AtomicUsize::new(0); - worker.scope(|scope| { - scope.spawn(|_| { - counter.fetch_add(1, Ordering::SeqCst); - }); - scope.spawn(|_| { - counter.fetch_add(10, Ordering::SeqCst); - }); - }); - let v = counter.load(Ordering::SeqCst); - assert_eq!(v, 11); - }); - }); -} - -/// Tests for concurrency issues when spawning a scoped future, and blocking on -/// it. 
-#[test] -pub fn scope_future() { - model(|| { - with_thread_pool(|_, worker| { - let vec = vec![1, 2, 3]; - let task = worker.scope(|scope| scope.spawn_future(async { black_box(vec.len()) })); - let len = worker.block_on(task); - assert_eq!(len, vec.len()); - }); - }); + shuttle::check_pct(test, 100_000, 100_000); } - -*/ From d0597d57e45a88d20c942c6dde5c66c4b90a40c8 Mon Sep 17 00:00:00 2001 From: NthTensor Date: Fri, 24 Apr 2026 15:46:37 -0400 Subject: [PATCH 2/3] feat: improve docs --- CHANGELOG.md | 2 +- Cargo.toml | 2 +- README.md | 159 +++++++++- src/job.rs | 12 +- src/latch.rs | 11 +- src/lib.rs | 24 +- src/scope.rs | 13 +- src/thread_pool.rs | 774 ++++++++++++++++++++++++++++----------------- 8 files changed, 657 insertions(+), 340 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3f7ea85..26911ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,7 +16,7 @@ This project is currently in early [pre-release], and there may be arbitrary bre ### Added -- `ThreadPool::num_workers` method which return the current number of workers +- `ThreadPool::num_workers` method which returns the current number of workers - `ThreadPool::on_worker` variant of `with_worker` for `Send` closures. - `ThreadPool::expect_worker` variant of `with_worker` that panics. 
diff --git a/Cargo.toml b/Cargo.toml index d77e28f..2d68140 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ name = "forte" version = "1.0.0-dev" edition = "2024" license = "MIT OR Apache-2.0" -description = "A low-overhead thread-pool with support for non-static async closures" +description = "Low-overhead parallel and async work scheduler" repository = "https://github.com/NthTensor/Forte" [workspace] diff --git a/README.md b/README.md index 02d8549..7548f47 100644 --- a/README.md +++ b/README.md @@ -3,23 +3,158 @@ [![Crates.io](https://img.shields.io/crates/v/forte.svg)](https://crates.io/crates/forte) [![Docs](https://docs.rs/forte/badge.svg)](https://docs.rs/forte/latest/forte/) -An async-compatible thread-pool aiming for "speed through simplicity". +Forte is a low-overhead parallel & async work scheduler. It can be used as a +lower-overhead, lower-latency alternative to `rayon_core`, or as an async +executor (like `tokio`). -Forte is a parallel & async work scheduler designed to accommodate very large workloads with many short-lived tasks. It replicates the `rayon_core` api but with native support for futures and async tasks. -Its design was prompted by the needs of the bevy game engine, but should be applicable to any problem that involves running both synchronous and asynchronous work concurrently. +## Static + Resizable Thread-Pools -The thread-pool provided by this crate does not employ work-stealing. -Forte instead uses "Heartbeat Scheduling", an alternative load-balancing technique that (theoretically) provides provably small overheads and good utilization. -The end effect is that work is only parallelized every so often, allowing more work to be done sequentially on each thread and amortizing the synchronization overhead. +Thread pools are `const`-constructed, and intended to be defined as `static` +variables within a binary crate. 
Adding a new thread-pool to your project is as +simple as: -# Acknowledgments +```rust +static THREAD_POOL: ThreadPool = ThreadPool::new(); +``` -Large portions of the code are direct ports from various versions of `rayon_core`, with minor simplifications and improvements. -We also relied upon `chili` and `spice` for reference while writing the heartbeat scheduling. -Support for futures is based on an approach sketched out by members of the `rayon` community to whom we are deeply indebted. +Thread pools are empty when created, and can be resized on demand. Up to 32 +threads can participate in a pool at a time (including worker threads and +non-worker threads making blocking calls to the pool). + +```rust +// Add as many workers to the thread pool as you have cores in your computer. +THREAD_POOL.resize_to_available(); + +// Resize the thread-pool to have exactly five workers +THREAD_POOL.resize_to(5); + +// Remove all workers from the pool and shut it down. +THREAD_POOL.depopulate(); +``` + +## Fork-Join Parallelism + +Forte provides an extremely low-overhead parallelization primitive for blocking +compute, similar to [`rayon::join`] or [`chili::Scope::join`]. At any point, it _may_ +run the two closures in parallel. + +```rust +fn sum(node: &Node, worker: &Worker) -> u64 { + let (left, right) = worker.join( + |w| node.left.as_deref().map(|n| sum(n, w)).unwrap_or_default(), + |w| node.right.as_deref().map(|n| sum(n, w)).unwrap_or_default(), + ); + + node.val + left + right +} +``` + +This is optimized for depth-first traversal and hierarchical work-splitting, +where each of the closures passed to `join` potentially contains another call to +`join`. + +## Spawn Closures & Futures + +Forte also provides tools for load-balancing ultra-low-latency non-blocking +compute (like polling `Futures`), similar to [`rayon::spawn`] or +[`tokio::task::spawn`]. 
+ +```rust +async fn serve() { + let listener = TcpListener::bind("127.0.0.1:8080").await?; + let mut incoming = listener.incoming(); + + while let Some(stream) = incoming.next().await { + // A new task is spawned for each inbound tcp stream. The stream is + // moved to the new task and processed there. + let task = THREAD_POOL.spawn(async move { + process(stream).await; + }); + // Spawning a future gives us back a task handle we can use to await + // its completion, but we don't care about that here. `detach` lets + // us drop the handle without canceling the stream-processing task. + task.detach(); + } +} +``` + +## Scoped Spawns + +For scheduling with non-static work, forte provides tools akin to +[`std::thread::scope`], [`tokio_scoped::scope`] or [`rayon::scope`]. + +```rust +let mut v = String::from("Hello"); +forte::scope(|scope| { + scope.spawn(|_: &Worker| { + v.push('!'); + }); +}); +// The scope doesn't exit until all spawned work is complete. +assert_eq!(v.as_str(), "Hello!"); +``` + +## Lazy Heartbeat Scheduling + +Forte uses a combination of [_heartbeat scheduling_][hb] and [_lazy +scheduling_][lz] to achieve ultra-low overhead and minimize cpu-utilization. + +The vast majority of operations are local and serial. Most jobs are stored in +simple double-ended queues, and adding new jobs to a worker has a zero-overhead +path without any shared data-structures. + +Every worker also has a small fixed-capacity work-stealing queue (currently each +has space for 32 jobs). Approximately every 5us (gated by the CPU's instruction +counter) if there's space available, each worker pushes a small number of jobs +into this queue. When a worker runs out of jobs to execute, it briefly tries to +steal from its coworkers, then goes to sleep. + +This approach has several benefits over more brute-force applications of +work-stealing: + +* For any particular time-slice, there is an upper-bound on the overhead due to + synchronization.
Since workers only touch shared data-structures every so + often, it can only slow them down so much. This reduces runtime variance and + lowers overhead. + +* There is a cap on frequency at which local-work is made available for sharing. + This reduces the probability that new work will become available at any given + instant, which means (unlike many work-stealing implementations) it doesn't + make sense to spin while trying to steal work. This can also reduce + over-sharing at the tail-end of a parallel operation. + +* The occupancy of the work-stealing queue represents an estimate of system + load. When a worker's shared-queue is empty, that's a sign that some workers + may be starved, and more tasks should be shared. By contrast, when a worker's + shared-queue is full to capacity, that's a sign that the thread-pool may have + reached full resource-utilization, and should avoid the costs of + synchronization for a bit. + +Jobs created by `join` are executed in LIFO order. When it comes time to share +work, the oldest `join` job is promoted into the shared work-stealing queue. In +the case of a binary tree, this means that execution progresses depth-first, but +sharing progresses breadth-first. + +Jobs created by `spawn` are executed in FIFO order, to minimize latency. When it +comes time to share work, the newest `spawn` jobs are grouped into small batches +(16 jobs each) and those batches are promoted into the shared work-stealing +queue. This means that spawns generally stay on the thread that spawned them, +unless the thread is overwhelmed by an influx of new tasks. # License -Forte is distributed under the terms of both the MIT license and the Apache License (Version 2.0). -See LICENSE-APACHE and LICENSE-MIT for details. +Forte is distributed under the terms of both the MIT license and the Apache +License (Version 2.0). See LICENSE-APACHE and LICENSE-MIT for details. + Opening a pull request is assumed to signal agreement with these licensing terms. 
+ +[`rayon::join`]: https://docs.rs/rayon/latest/rayon/fn.join.html +[`chili::Scope::join`]: https://docs.rs/chili/latest/chili/struct.Scope.html#method.join +[`rayon::spawn`]: https://docs.rs/rayon/latest/rayon/fn.spawn.html +[`tokio::task::spawn`]: https://docs.rs/tokio/latest/tokio/task/fn.spawn.html +[`std::thread::scope`]: https://doc.rust-lang.org/std/thread/fn.scope.html +[`tokio_scoped::scope`]: https://docs.rs/tokio-scoped/latest/tokio_scoped/fn.scope.html +[`rayon::scope`]: https://docs.rs/rayon/latest/rayon/fn.scope.html + +[hb]: https://www.andrew.cmu.edu/user/mrainey/heartbeat/heartbeat.html +[lz]: https://dl.acm.org/doi/10.1145/2629643 diff --git a/src/job.rs b/src/job.rs index 38d0c30..edbed59 100644 --- a/src/job.rs +++ b/src/job.rs @@ -132,7 +132,7 @@ unsafe impl Send for JobRef {} // Job queue /// A queue of jobs. This is a simple wrapper around a vec dequeue that uses -/// inner mutation, and has some more intiuitively named methods to enforce +/// inner mutation, and has some more intuitively named methods to enforce /// conventions. pub struct JobQueue { job_refs: UnsafeCell>, @@ -206,8 +206,8 @@ impl JobQueue { const CHUNK_SIZE: usize = 16; /// Splits off a series of chunks from the end of the queue (the side with - /// the newest jobs). Each chunk is of size `CHUNK_SIZE`. After, At most - /// `CHUNK_SIZE` jobs will be left in the queue. + /// the newest jobs). Each chunk is of size `CHUNK_SIZE`. Afterwards, at most + /// `CHUNK_SIZE` jobs will remain in the queue. pub fn split(&self) -> Vec> { // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one // thread. We ensure no other references to the inner value exist by not @@ -226,7 +226,7 @@ impl JobQueue { } /// Appends a chunk of jobs (expected to be provided by `split`) to the - /// queue. Jobs are added to the end (the side with the newst jobs). + /// queue. Jobs are added to the end (the side with the newest jobs). 
pub fn append(&self, mut split_refs: VecDeque) { // SAFETY: `JobQueue` is not `Sync`, so this can only be called from one // thread. We ensure no other references to the inner value exist by not @@ -417,9 +417,9 @@ where // The latch has not been set, and this function is called at most once, // so no concurrent access can occur. unsafe { (*return_value).write(result) }; - // This syncrhonizies with the `Acquire` fence within `return_value()`, + // This synchronizes with the `Acquire` fence within `return_value()`, // establishing a happens-before relationship that makes the preceding - // `return_value` write vsibile to the reader. + // `return_value` write visible to the reader. // // This is required because latches do not synchronize memory. fence(Ordering::Release); diff --git a/src/latch.rs b/src/latch.rs index 9a1b3a1..425a9b8 100644 --- a/src/latch.rs +++ b/src/latch.rs @@ -1,10 +1,11 @@ -//! A core concept in Rayon is the *latch*. Forte has borrowed this, in a -//! somewhat simplified form. +//! Forte borrows the *latch* concept from Rayon. //! //! Every forte worker thread has a single "sleep controller" that it uses to //! park and unpark itself. Latches build on this to create a simple boolean //! switch, which allows the owning thread to sleep until the latch becomes set //! by another thread. +//! +//! Every latch points at one "sleep controller". use alloc::task::Wake; use core::borrow::Borrow; @@ -77,9 +78,9 @@ impl Latch { self.state.load(Ordering::Relaxed) == SIGNAL } - /// Waits for the latch to be set. In actuality, this may be woken. - /// - /// Returns true if the latch signal was received, and false otherwise. + /// Puts the thread to sleep if the latch has not been set. The thread will + /// be woken when the latch becomes set, but may also wake before then. The + /// caller should always re-check the latch condition after this returns. 
/// /// # Memory Ordering /// diff --git a/src/lib.rs b/src/lib.rs index dd3071d..7fb6758 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -90,7 +90,7 @@ //! //! Thread pools are dynamically sized; When your program starts they have size //! zero (meaning no worker threads are running). You can change the number of -//! works assigned to a pool using [`ThreadPool::grow`], [`ThreadPool::shrink`] +//! workers assigned to a pool using [`ThreadPool::grow`], [`ThreadPool::shrink`] //! and [`ThreadPool::resize_to`]. But most of the time you will want to call //! [`ThreadPool::resize_to_available`], which will resize the pool to exploit //! all the available parallelism on your system by spawning a worker thread for @@ -122,8 +122,8 @@ //! // there are no workers to parallelize it). //! THREAD_POOL.join(|_| println!("world"), |_| println!("hello ")); //! -//! // This will always print "hello world" (because join happens execute things -//! // backwards in this case). +//! // This will always print "hello world" (because join executes the second +//! // closure first when running in serial). //! ``` //! //! # Workers @@ -151,7 +151,7 @@ //! tasks left in the local queue are executed. //! //! You will only ever receive `&Worker` references, because the worker is not -//! allowed to move or be mutably referenced. Worker are `!Send` and `!Sync`, +//! allowed to move or be mutably referenced. Workers are `!Send` and `!Sync`, //! and are meant to represent local-only data. //! //! To access the current worker context, you can use [`Worker::map_current`] or @@ -196,25 +196,14 @@ //! | *Scope* | [`scope()`] | [`ThreadPool::scope()`] | [`Worker::scope()`] //! | *Block on* | [`block_on()`] | [`ThreadPool::block_on()`] | [`Worker::block_on()`] //! -//! * *Worker.* Uses the provided worker context. -//! * *Thread pool.* Looks for an existing worker context, creates one if it doesn't find one. //! * *Headless.* Looks for an existing worker context, and panics if it doesn't find one. +//! 
* *Thread pool.* Looks for an existing worker context, creates one if it doesn't find one. +//! * *Worker.* Uses the provided worker context. //! //! The headless and thread pool flavors are more or less just aliases for the //! worker flavor. Where possible, the worker flavor should be preferred to the //! thread pool flavor, and the thread pool flavor should be preferred to the //! headless flavor. -//! -//! # Theory & Background -//! -//! Forte is based on `rayon_core`, to the extent that during development it was -//! often possible to port code from `rayon_core` more or less verbatim. -//! However, forte and rayon differ significantly in their goals and approach. -//! -//! Rayon uses an approach to work-stealing adapted from Cilk and Intel TBB. -//! These techniques are largely the industry standard. -//! -//! [^TZANNES]: Tzannes et al. 2024, #![no_std] #![cfg_attr(feature = "shuttle", allow(dead_code))] @@ -252,6 +241,7 @@ pub struct FutureMarker(); pub use scope::Scope; pub use scope::ScopedSpawn; pub use thread_pool::Spawn; +pub use thread_pool::Task; pub use thread_pool::ThreadPool; pub use thread_pool::Worker; pub use thread_pool::Yield; diff --git a/src/scope.rs b/src/scope.rs index 71a6c19..03c3bb7 100644 --- a/src/scope.rs +++ b/src/scope.rs @@ -35,8 +35,9 @@ use crate::unwind::AbortOnDrop; // ----------------------------------------------------------------------------- // Scope -/// A scope which can spawn a number of non-static jobs and async tasks. Refer -/// to [`scope`](crate::scope()) for more extensive documentation. +/// A scope which can spawn a number of non-static jobs and async tasks. +/// +/// Refer to [`scope`](crate::scope()) for more extensive documentation. /// /// # Lifetimes /// @@ -119,7 +120,7 @@ where // reference is allowed to escape, the caller cannot safely cause the scope // to move either. 
// - // `Scope::complete` is called unconditionally on the line bellow, before + // `Scope::complete` is called unconditionally on the line below, before // the implicit drop of `scope`. If the closure `f` panics, it is caught and // re-emitted after `complete` finishes. In the event of an uncaught panic, // we cannot ensure `complete` runs properly before the scope is dropped, so @@ -254,7 +255,7 @@ impl<'scope, 'env> Scope<'scope, 'env> { // allocating a second time, and means we can immediately drop the panic // we have just been passed. // - // Dropping this panic may itself trigger a pnaic, but this will simply + // Dropping this panic may itself trigger a panic, but this will simply // trigger the scope's abort guard, causing an abort rather than UB. if self.panic.load(Ordering::Relaxed).is_null() { let nil = ptr::null_mut(); @@ -267,7 +268,7 @@ impl<'scope, 'env> Scope<'scope, 'env> { // // If the write fails, another panic must have already occurred, and // we don't need to synchronize memory (the previous call to - // `store_panic` handles the syncrhonization for it's panic data). + // `store_panic` handles the synchronization for its panic data). if self .panic .compare_exchange(nil, err_ptr, Ordering::Release, Ordering::Relaxed) @@ -292,7 +293,7 @@ impl<'scope, 'env> Scope<'scope, 'env> { // whatever it points to. let panic = self.panic.swap(ptr::null_mut(), Ordering::Relaxed); if !panic.is_null() { - // We generally don't expect pancis to happen. + // We generally don't expect panics to happen. cold_path(); // If the panic pointer is not null, emit an `Acquire` fence to // establish a happens-after relationship with the `Release` branch diff --git a/src/thread_pool.rs b/src/thread_pool.rs index 3006db4..4c235cc 100644 --- a/src/thread_pool.rs +++ b/src/thread_pool.rs @@ -45,27 +45,29 @@ use crate::util::XorShift64Star; // ----------------------------------------------------------------------------- // Thread pool -/// A thread pool is a set of threads.
+/// A statically-allocated handle to a dynamically-sized collection of threads. /// -/// You can dispatch work to a thread pool, and it will be distributed amongst -/// the threads and run as quickly as possible. To create a new thread pool, -/// assign it to a constant. -/// ``` -/// # use forte::ThreadPool; -/// # use forte::Worker; -/// static THREAD_POOL: ThreadPool = ThreadPool::new(); -/// ``` -/// Thread pools are empty when created, and must be explicitly resized at -/// runtime. -/// ``` +/// Each `ThreadPool` must be stored in a `static`, ideally defined within your +/// root binary crate rather than a library crate. You can create a new pool +/// with [`ThreadPool::new`], and will probably want to resize sometime between +/// program init and when you want to start scheduling work. +/// +/// ```rust /// # use forte::ThreadPool; -/// # use forte::Worker; -/// # static THREAD_POOL: ThreadPool = ThreadPool::new(); -/// THREAD_POOL.resize_to_available(); +/// static POOL: ThreadPool = ThreadPool::new(); +/// +/// fn main() { +/// POOL.resize_to_available(); +/// // … schedule work … +/// POOL.depopulate(); +/// } /// ``` -/// After this, you can start sending work to the pool with -/// [`spawn`][ThreadPool::spawn], [`block_on`][ThreadPool::block_on], -/// [`join`][ThreadPool::join], or [`scope`][ThreadPool::scope]. +/// +/// A pool can accommodate at most 32 participating threads (this includes +/// managed worker threads created by the `resize` functions, but also external +/// threads that become "temporary members" when they make blocking calls to the +/// pool). All blocking methods (e.g. [`join`] and [`scope`]) work even with +/// zero managed workers, but they won't run in parallel. pub struct ThreadPool { /// A bit-set that tracks which seats are occupied. occupied: CachePadded, @@ -73,7 +75,7 @@ pub struct ThreadPool { sleeping: CachePadded, /// Holds shared data for each thread participating in the pool. 
seats: OnceLock>, - /// Holds controls for threads spawned and managed by the pool. Initalized + /// Holds controls for threads spawned and managed by the pool. Initialized /// on first call to `occupy`, to allow for some non-static constructors. managed_threads: Mutex, /// Used to inject external work into the thread pool. This is generally @@ -108,7 +110,7 @@ pub struct Lease { thread_pool: &'static ThreadPool, /// The index of the seat in the data list seat_number: usize, - /// A reference to the pre-initalized seat data (to avoid repeated hits of + /// A reference to the pre-initialized seat data (to avoid repeated hits of /// the `OnceLock`). seats: &'static Seats, } @@ -187,7 +189,7 @@ impl ThreadPool { /// Claims a lease on the thread pool which can be occupied by a worker /// (using [`Worker::occupy`]), allowing a thread to participate in the pool. /// - /// Returns none if all seats are occupied. + /// Returns `None` if all seats are occupied. #[cold] pub fn claim_lease(&'static self) -> Option { loop { @@ -280,19 +282,25 @@ impl ThreadPool { // Thread pool resizing impl ThreadPool { - /// Resizes the thread pool to fill all available space. After this returns, - /// the pool will have at least one worker thread and at most `MAX_THREADS`. - /// Returns the new size of the pool. + /// Resizes the thread pool to fill (almost) all available cores. After this + /// returns, the pool will have between 1 and 32 workers. Returns the new + /// size of the pool. + /// + /// This always leaves one core free, so that the main program loop can + /// continue executing on it. If you have 8 cores, calling this function + /// will add 7 workers to the pool (and then the main thread will become the + /// 8th worker if it makes a blocking call like `join`). /// /// See [`ThreadPool::resize`] for more information about resizing. 
pub fn resize_to_available(&'static self) -> usize { let mut available = available_parallelism().map(NonZero::get).unwrap_or(1); - available = available.saturating_sub(1); + available = available.saturating_sub(1).max(1); self.resize_to(available) } /// Resizes the pool to the specified number of threads. Returns the new - /// size of the thread pool, which may be smaller than requested. + /// size of the thread pool. The new size may be smaller than requested if + /// all the seats in the thread pool are occupied. /// /// See [`ThreadPool::resize`] for more information about resizing. pub fn resize_to(&'static self, new_size: usize) -> usize { @@ -300,19 +308,20 @@ impl ThreadPool { } /// Adds the given number of threads to the thread pool. Returns the new - /// size of the pool, which may be smaller than requested. + /// size of the pool. The new size may be smaller than requested if all the + /// seats in the thread pool are occupied. /// - /// See [`ThreadPool::resize_to`] for more information about resizing. + /// See [`ThreadPool::resize`] for more information about resizing. pub fn grow(&'static self, added_threads: usize) -> usize { - self.resize(|current_size| current_size + added_threads) + self.resize(|current_size| current_size.saturating_add(added_threads)) } /// Removes the given number of threads from the thread pool. Returns the new /// size of the pool. /// - /// See [`ThreadPool::resize_to`] for more information about resizing. + /// See [`ThreadPool::resize`] for more information about resizing. pub fn shrink(&'static self, terminated_threads: usize) -> usize { - self.resize(|current_size| current_size - terminated_threads) + self.resize(|current_size| current_size.saturating_sub(terminated_threads)) } /// Ensures that there is at least one worker thread attached to the thread @@ -338,9 +347,8 @@ impl ThreadPool { self.resize_to(0) } - /// Resizes the pool, and returns the new size. 
- /// - /// Note that the new size may be different from the size requested. + /// Resizes the pool, and returns the new size. The new size may be smaller + /// than requested if all the seats in the thread pool are occupied. #[cold] pub fn resize(&'static self, get_size: F) -> usize where @@ -504,10 +512,10 @@ impl ThreadPool { /// Runs the closure on a thread-pool worker. /// /// If this thread is currently acting as a worker for the thread-pool, this - /// just looks that worker up. If this is not registered as a worker, or the - /// thread's worker is registered with different thread pool, the thread - /// will try to register itself with the correct pool. If the thread pool is - /// full, it passes the closure `None`. + /// just looks that worker up. If this thread is not registered as a worker, + /// or if the thread's worker is registered with a different thread pool, the + /// thread will try to register itself with the correct pool. If the thread + /// pool is full, it passes the closure `None`. /// /// The provided closure is never sent to another thread. If your closure is /// `Send`, consider using [`on_worker`][ThreadPool::on_worker] instead. @@ -550,6 +558,10 @@ impl ThreadPool { /// /// * Futures that satisfy `Future + Send + 'static` where `T: Send + 'static`. /// +/// Closures return `()` when spawned, but futures return a [`Task`]. +/// +/// # Compile Errors +/// /// Due to a bug in rustc, you may be given errors when using closures /// with inferred types. If you encounter the following: /// @@ -606,6 +618,8 @@ where } } +/// An alias for [`async_task::Task`] that includes a reference to the pool on +/// which the future is executing. pub type Task = async_task::Task; /// Schedules a runnable future as a job. @@ -744,7 +758,7 @@ thread_local! { /// /// Every thread has at most one worker at a time. If a worker has already been /// set up, it may be accessed at any time by calling [`Worker::with_current`].
-/// A thread's worker can also manually overridden by claiming a lease +/// A thread's worker can also be manually overridden by claiming a lease /// ([`ThreadPool::claim_lease`]) and passing it to [`Worker::occupy`]. The /// worker returned by `with_current` always represents the lease most recently /// occupied in the call stack. @@ -781,15 +795,16 @@ pub struct Worker { pub enum Yield { /// Indicates that a job was executed. Executed, - /// Indicates that no job was executed, and the worker should perhaps be put - /// to sleep. + /// Indicates that no job was executed. After receiving this, do not `yield` + /// again until you have a reasonable expectation that new work will have + /// been shared. Idle, } impl Worker { /// Temporarily sets the thread's worker. [`Worker::with_current`] always - /// returns a reference to the worker set up by the most recent call to this - /// worker. + /// returns a reference to the worker set up by the most recent call to + /// `occupy`. /// /// Rust's thread locals are fairly costly, so this function is expensive. /// If you can avoid calling it, do so. @@ -845,10 +860,9 @@ impl Worker { &self.lease.seats.sharers[self.lease.seat_number] } - /// Calls the provided closure on the thread's worker instance, if it has one. - /// - /// Rust's thread locals are fairly costly, so this function is expensive. - /// If you can avoid calling it, do so. + /// Calls the provided closure on the thread's worker instance, if it has + /// one. If this thread is not registered as a worker, the closure is not + /// called. #[inline(always)] pub fn map_current(f: F) -> Option where @@ -876,10 +890,9 @@ impl Worker { } } - /// Looks up the current `Worker` instance from the thread local. - /// - /// Rust's thread locals are fairly costly, so this function is expensive. - /// If you can avoid calling it, do so. + /// Calls the provided closure on the thread's worker instance, if it has + /// one. 
If this thread is not registered as a worker, the closure is passed + /// `None`. #[inline(always)] pub fn with_current(f: F) -> R where @@ -903,13 +916,16 @@ impl Worker { } } - /// Returns the index of the worker in the leases list. + /// Returns this worker's seat index within the pool (0–31). + /// + /// Seat numbers may be re-used by different workers at different times, and + /// may not be contiguous or ordered. #[inline(always)] pub fn seat_number(&self) -> usize { self.lease.seat_number } - /// Returns the index of the thread pool of the worker. + /// Returns the thread pool this worker belongs to. #[inline(always)] pub fn thread_pool(&self) -> &'static ThreadPool { self.lease.thread_pool @@ -1048,7 +1064,6 @@ impl Worker { self.lifo_queue .pop_newest() .or_else(|| self.fifo_queue.pop_oldest()) - .or_else(|| self.sharer().pop()) } /// Finds a job to work on. This tries @@ -1058,6 +1073,7 @@ impl Worker { fn find_work(&self) -> Option<(JobRef, bool)> { self.find_local_work() .map(|job| (job, false)) + .or_else(|| self.sharer().pop().map(|job| (job, false))) .or_else(|| self.steal_from_siblings().map(|job| (job, true))) .or_else(|| self.claim_shared_job().map(|job| (job, true))) } @@ -1104,8 +1120,14 @@ impl Worker { /// Cooperatively yields execution to the thread pool, allowing it to execute /// some work. /// - /// This function only executes local work: work already queued on the - /// worker. It will never claim shared work. + /// This function will only execute work already held locally by the worker, + /// and does no synchronization. To claim and run shared work, use + /// [`yield_now`][Worker::yield_now]. + /// + /// If no work is found, this returns `Yield::Idle`. This function should + /// not be called again (for at least a few microseconds) after an idle. + /// Calling this repeatedly in a spin-loop should be avoided, as it's likely + /// to significantly spike CPU usage and waste resources. 
#[inline(always)] pub fn yield_local(&self) -> Yield { // We use LIFO order here, pulling the newest work from the queue. This @@ -1122,11 +1144,17 @@ impl Worker { /// Cooperatively yields execution to the thread pool, allowing it to execute /// some work. /// - /// This function may execute either local or shared work: work already - /// queued on the worker, or work off-loaded by a different worker. If there - /// is no work on the pool, this will lock the thread pool mutex, so it - /// should not be called within a hot loop. Consider using - /// [`Worker::yield_local`] instead. + /// If the worker has no local work to do, it will try to steal work from + /// coworkers or claim work from the shared injection queue. If instead the + /// worker has a backlog of local work, the worker may make some of it + /// accessible to other workers for stealing. This involves synchronization + /// with the pool, and so should be called infrequently. To yield without + /// synchronizing with the pool, use [`yield_local`][Worker::yield_local]. + /// + /// If no work is found, this returns `Yield::Idle`. This function should + /// not be called again (for at least a few microseconds) after an idle. + /// Calling this repeatedly in a spin-loop should be avoided, as it's likely + /// to significantly spike CPU usage and waste resources. #[inline(always)] pub fn yield_now(&self) -> Yield { self.promote(); @@ -1160,7 +1188,6 @@ impl Worker { // ----------------------------------------------------------------------------- // Worker operations -/// # Operations impl Worker { /// Spawns work (a closure or future) onto the thread pool. Just like a /// standard thread, this work executes concurrently (and potentially in @@ -1185,10 +1212,37 @@ impl Worker { /// Polls a future to completion, then returns the outcome. This function /// will prioritize polling the future as soon as it becomes available, and /// while the future is not available it will try to do other meaningful - /// work. 
+ /// work from the thread-pool. If the thread pool runs out of work, the + /// thread is suspended until the future completes or more background work + /// becomes available. + /// + /// # Async & Concurrency + /// + /// This is a convenient way to introduce concurrency into otherwise blocking + /// operations. For example, it is _totally acceptable_ to use `block_on` + /// within one of the branches of a `join` operation (to perform I/O, for + /// example). + /// + /// This should **not** be called within `async` contexts. While it will not + /// block the execution of work on the pool, it will prevent the enclosing + /// future's `poll` method from returning. This can potentially lead to + /// deadlocks. + /// + /// Other implementations of `block_on` (like those defined by the `futures` + /// crate) should not be called within parallel forte operations. They will + /// block execution of work on the pool. + /// + /// # Alternatives /// /// If you do not have access to a [`Worker`], you may call - /// [`ThreadPool::block_on`] or simply [`block_on`]. + /// [`ThreadPool::block_on`] instead. If you don't have a static reference + /// to a specific thread pool (as is often the case in library code) you can + /// use [`block_on`] instead, as long as you are sure that your code will + /// run within a worker. + /// + /// # Panics + /// + /// If the future panics, this immediately panics. #[inline(always)] pub fn block_on(&self, future: F) -> T where @@ -1230,8 +1284,118 @@ impl Worker { /// Executes the two closures, possibly in parallel. /// - /// If you do not have access to a [`Worker`], you may call - /// [`ThreadPool::join`] or simply [`join`]. + /// This is conceptually similar to spawning two threads to execute each + /// closure, and then joining both (although the implementation is quite + /// different). It is intended for implementing recursive, + /// divide-and-conquer algorithms where each branch may itself call `join`.
+ /// + /// # Examples + /// + /// This example (taken wholesale from `rayon`) uses `join` to perform a + /// quick-sort. + /// + /// ```rust + /// # use forte::*; + /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); + /// # THREAD_POOL.resize_to_available(); + /// + /// let mut v = vec![5, 1, 8, 22, 0, 44]; + /// THREAD_POOL.on_worker(|worker| quick_sort(worker, &mut v)); + /// assert_eq!(v, vec![0, 1, 5, 8, 22, 44]); + /// + /// fn quick_sort(worker: &Worker, v: &mut [T]) { + /// if v.len() > 1 { + /// let mid = partition(v); + /// let (lo, hi) = v.split_at_mut(mid); + /// worker.join(|w| quick_sort(w, lo), + /// |w| quick_sort(w, hi)); + /// } + /// } + /// + /// // Partition rearranges all items `<=` to the pivot + /// // item (arbitrary selected to be the last item in the slice) + /// // to the first half of the slice. It then returns the + /// // "dividing point" where the pivot is placed. + /// fn partition(v: &mut [T]) -> usize { + /// let pivot = v.len() - 1; + /// let mut i = 0; + /// for j in 0..pivot { + /// if v[j] <= v[pivot] { + /// v.swap(i, j); + /// i += 1; + /// } + /// } + /// v.swap(i, pivot); + /// i + /// } + /// ``` + /// + /// This example (taken from `chili`) shows how to use `join` to sum the + /// nodes of a binary tree. 
+ /// + /// ```rust + /// # use forte::*; + /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); + /// # THREAD_POOL.resize_to_available(); + /// + /// let tree = gen_tree(8); + /// let result = THREAD_POOL.on_worker(|worker| sum(worker, &tree)); + /// assert_eq!(result, 255); + /// + /// struct Node { + /// val: u64, + /// left: Option>, + /// right: Option>, + /// } + /// + /// fn gen_tree(layers: usize) -> Box { + /// Box::new(Node { + /// val: 1, + /// left: (layers != 1).then(|| gen_tree(layers - 1)), + /// right: (layers != 1).then(|| gen_tree(layers - 1)), + /// }) + /// } + /// + /// fn sum(worker: &Worker, node: &Node) -> u64 { + /// let (left, right) = worker.join( + /// |w| node.left.as_deref().map(|n| sum(w, n)).unwrap_or_default(), + /// |w| node.right.as_deref().map(|n| sum(w, n)).unwrap_or_default(), + /// ); + /// node.val + left + right + /// } + /// ``` + /// + /// # Alternatives + /// + /// If you do not have a `Worker`, you can use [`ThreadPool::join`] + /// instead. If you don't have a static reference to a specific thread pool + /// (as is often the case in library code) you can use [`join`] instead, as + /// long as you are sure that your code will run within a worker. + /// + /// If your workload isn't amenable to the divide-and-conquer approach or is + /// async, but you still want to borrow local data in your computations, you + /// may want to use a [`scope`][`Worker::scope`] instead. + /// + /// # Warning about blocking I/O + /// + /// The assumption is that the closures given to `join()` are CPU-bound + /// tasks that do not perform blocking operations. If you do perform I/O, + /// and that I/O should block (e.g., waiting for a network request), the + /// overall performance may be poor. Moreover, if you cause one closure to + /// be blocked waiting on another (for example, using a channel), that could + /// lead to a deadlock. 
+ /// + /// You can use [`block_on`][Worker::block_on] to do async I/O within a + /// `join` branch, as long as different branches are not made to depend on + /// each other. + /// + /// # Panics + /// + /// Both closures are always executed to completion. If either panics, + /// `join` will propagate that panic after both complete. When both panic, + /// only the panic from the first argument is propagated and the panic from + /// the other argument is dropped (this may cause program aborts in some + /// situations). #[inline(always)] pub fn join(&self, a: A, b: B) -> (RA, RB) where @@ -1295,9 +1459,209 @@ impl Worker { } } - /// Creates a scope onto which non-static work can be spawned. For more complete docs, see [`scope`]. + /// Creates a new scope for spawning non-static work. + /// + /// Work spawned onto the new scope does not have to have a `'static` + /// lifetime, and can borrow local variables. Local borrowing is possible + /// because this function will not return until all work spawned on the + /// scope has completed, thus ensuring the stack frame is kept alive for the + /// duration. + /// + /// # Accessing stack data + /// + /// In general, spawned tasks may borrow any stack data that lives outside + /// the scope closure. + /// + /// ``` + /// # use forte::ThreadPool; + /// # use forte::Worker; + /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); + /// # THREAD_POOL.populate(); + /// # THREAD_POOL.expect_worker(|worker| { + /// let ok: Vec = vec![1, 2, 3]; + /// forte::scope(|scope| { + /// let bad: Vec = vec![4, 5, 6]; + /// scope.spawn_on(worker, |_: &Worker| { + /// // Transfer ownership of `bad` into a local variable (also named `bad`). + /// // This will force the closure to take ownership of `bad` from the environment. + /// let bad = bad; + /// println!("ok: {:?}", ok); // `ok` is only borrowed. + /// println!("bad: {:?}", bad); // refers to our local variable, above.
+ /// }); + /// + /// scope.spawn_on(worker, |_: &Worker| println!("ok: {:?}", ok)); // we too can borrow `ok` + /// }); + /// # }); + /// ``` + /// As the comments in the example above suggest, to reference `bad` we must + /// take ownership of it. One way to do this is to detach the closure + /// from the surrounding stack frame, using the `move` keyword. This + /// will cause it to take ownership of *all* the variables it touches, + /// in this case including both `ok` *and* `bad`: + /// + /// ```rust + /// # use forte::ThreadPool; + /// # use forte::Worker; + /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); + /// # THREAD_POOL.populate(); + /// # THREAD_POOL.expect_worker(|worker| { + /// let ok: Vec = vec![1, 2, 3]; + /// forte::scope(|scope| { + /// let bad: Vec = vec![4, 5, 6]; + /// scope.spawn_on(worker, move |_: &Worker| { + /// println!("ok: {:?}", ok); + /// println!("bad: {:?}", bad); + /// }); + /// + /// // That closure is fine, but now we can't use `ok` anywhere else, + /// // since it is owned by the previous task: + /// // scope.spawn_on(worker, |_: &Worker| println!("ok: {:?}", ok)); + /// }); + /// # }); + /// ``` + /// + /// While this works, it could be a problem if we want to use `ok` elsewhere. + /// There are two choices.
We can keep the closure as a `move` closure, but + /// instead of referencing the variable `ok`, we create a shadowed variable that + /// is a borrow of `ok` and capture *that*: + /// + /// ```rust + /// # use forte::ThreadPool; + /// # use forte::Worker; + /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); + /// # THREAD_POOL.populate(); + /// # THREAD_POOL.expect_worker(|worker| { + /// let ok: Vec = vec![1, 2, 3]; + /// forte::scope(|scope| { + /// let bad: Vec = vec![4, 5, 6]; + /// let ok: &Vec = &ok; // shadow the original `ok` + /// scope.spawn_on(worker, move |_: &Worker| { + /// println!("ok: {:?}", ok); // captures the shadowed version + /// println!("bad: {:?}", bad); + /// }); + /// + /// // Now we too can use the shadowed `ok`, since `&Vec` references + /// // can be shared freely. Note that we need a `move` closure here though, + /// // because otherwise we'd be trying to borrow the shadowed `ok`, + /// // and that doesn't outlive `scope`. + /// scope.spawn_on(worker, move |_: &Worker| println!("ok: {:?}", ok)); + /// }); + /// # }); + /// ``` + /// + /// Another option is not to use the `move` keyword but instead to take ownership + /// of individual variables: + /// + /// ```rust + /// # use forte::ThreadPool; + /// # use forte::Worker; + /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); + /// # THREAD_POOL.populate(); + /// # THREAD_POOL.expect_worker(|worker| { + /// let ok: Vec = vec![1, 2, 3]; + /// forte::scope(|scope| { + /// let bad: Vec = vec![4, 5, 6]; + /// scope.spawn_on(worker, |_: &Worker| { + /// // Transfer ownership of `bad` into a local variable (also named `bad`). + /// // This will force the closure to take ownership of `bad` from the environment. + /// let bad = bad; + /// println!("ok: {:?}", ok); // `ok` is only borrowed. + /// println!("bad: {:?}", bad); // refers to our local variable, above. 
+ /// }); + /// + /// scope.spawn_on(worker, |_: &Worker| println!("ok: {:?}", ok)); // we too can borrow `ok` + /// }); + /// # }); + /// ``` + /// + /// # Referencing the scope + /// + /// The scope passed into the closure is not allowed to leak out of this call. + /// In other words, this will fail to compile: + /// + /// ```compile_fail + /// # use forte::ThreadPool; + /// # use forte::Worker; + /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); + /// # THREAD_POOL.populate(); + /// # THREAD_POOL.expect_worker(|worker| { + /// let mut leak = None; + /// forte::scope(|scope| { + /// leak = Some(scope); // <-- ERROR: scope would be leaked here + /// }); + /// drop(leak); + /// # }); + /// ``` + /// + /// Anything spawned onto the scope can capture a reference to it. + /// This allows scoped work to spawn other scoped work. /// - /// If you do not have access to a worker, you can use [`ThreadPool::scope`] or simply [`scope`]. + /// ``` + /// # use forte::ThreadPool; + /// # use forte::Worker; + /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); + /// # THREAD_POOL.populate(); + /// # THREAD_POOL.expect_worker(|worker| { + /// let mut counter = 0; + /// let counter_ref = &mut counter; + /// forte::scope(|scope| { + /// scope.spawn_on(worker, |worker: &Worker| { + /// *counter_ref += 1; + /// // Note: we borrow the scope again here. + /// scope.spawn_on(worker, move |_: &Worker| { + /// *counter_ref += 1; + /// }); + /// }); + /// }); + /// assert_eq!(counter, 2); + /// # }); + /// ``` + /// + /// It's possible to spawn non-scoped work within the closure, but these + /// generally can't hold references to the scope. 
So for example, the + /// following also fails to compile: + /// + /// ```compile_fail,E0521 + /// # use forte::ThreadPool; + /// # use forte::Worker; + /// # static THREAD_POOL: ThreadPool = ThreadPool::new(); + /// # THREAD_POOL.populate(); + /// THREAD_POOL.with_worker(|worker| { + /// worker.scope(|scope| { + /// worker.spawn(|worker: &Worker| { + /// // ^^^^^ ERROR: This creates a *static* job on the worker, + /// // which may outlive the scope. + /// + /// scope.spawn_on(worker, |_: &Worker| { }); + /// // ^^^^^ ERROR: This requires borrowing the scope within the + /// // unscoped job, which isn't allowed by the compiler + /// // because 'scope would have to outlive 'static. + /// }); + /// }); + /// }); + /// ``` + /// + /// # Alternatives + /// + /// If you do not have a `Worker`, you can use [`ThreadPool::scope`] + /// instead. If you don't have a static reference to a specific thread pool + /// (as is often the case in library code) you can use [`scope`] instead, as + /// long as you are sure that your code will run within a worker. + /// + /// Scopes are a more flexible building block compared to + /// [`join`][Worker::join], since a loop can be used to spawn any number of + /// tasks without recursing. However, that flexibility comes at a + /// performance price: tasks spawned using `scope` must be allocated onto + /// the heap, whereas [`join`][Worker::join] can make exclusive use of the + /// stack. Prefer [`join`][Worker::join] where possible. + /// + /// # Panics + /// + /// If a panic occurs, either in the closure given to `scope` or in a job + /// spawned on the scope, that panic is caught and stored. When all the work + /// on the scope is complete, `scope` will then re-emit that panic. If + /// multiple panics occur, the first will propagate and the others will be + /// caught and dropped (which may result in program aborts). 
#[inline(always)] pub fn scope<'env, F, T>(&self, f: F) -> T where @@ -1308,16 +1672,24 @@ impl Worker { } // ----------------------------------------------------------------------------- -// Thread local scheduling api +// Implicit worker registration api /// Runs the provided closure in the background. /// -///
-/// Note: -/// This function panics if the current thread is not registered as a worker. -///
+/// When executed on a thread that is currently registered as a worker (i.e. the +/// closure inside [`Worker::occupy`], [`ThreadPool::with_worker`], or similar) +/// this is able to look up that registration and find the worker and +/// thread-pool implicitly. +/// +/// If you have a reference to a [`Worker`], it's better to use [`Worker::spawn`] +/// instead. If you don't have a worker, but know which thread pool you want to +/// use, [`ThreadPool::spawn`] is more appropriate. +/// +///
+///
+/// **Warning:** This function panics if the current thread is not registered as a worker.
 ///
-/// See also: [`Worker::spawn`] and [`ThreadPool::spawn`].
+/// 
pub fn spawn>(work: S) -> S::Output { Worker::with_current(|worker| { worker @@ -1328,12 +1700,20 @@ pub fn spawn>(work: S) -> S::Output { /// Waits for a future to complete. /// -///
-/// Note: -/// This function panics if the current thread is not registered as a worker. -///
+/// When executed on a thread that is currently registered as a worker (i.e. the +/// closure inside [`Worker::occupy`], [`ThreadPool::with_worker`], or similar) +/// this is able to look up that registration and find the worker and +/// thread-pool implicitly. /// -/// See also: [`Worker::block_on`] and [`ThreadPool::block_on`]. +/// If you have a reference to a [`Worker`], it's better to use +/// [`Worker::block_on`] instead. If you don't have a worker, but know which +/// thread pool you want to use, [`ThreadPool::block_on`] is more appropriate. +/// +///
+///
+/// **Warning:** This function panics if the current thread is not registered as a worker.
+///
+/// 
pub fn block_on(future: F) -> T where F: Future + Send, @@ -1348,12 +1728,20 @@ where /// Executes the two closures, possibly in parallel. /// -///
-/// Note: -/// This function panics if the current thread is not registered as a worker. -///
+/// When executed on a thread that is currently registered as a worker (i.e. the +/// closure inside [`Worker::occupy`], [`ThreadPool::with_worker`], or similar) +/// this is able to look up that registration and find the worker and +/// thread-pool implicitly. +/// +/// If you have a reference to a [`Worker`], it's better to use [`Worker::join`] +/// instead. If you don't have a worker, but know which thread pool you want to +/// use, [`ThreadPool::join`] is more appropriate. +/// +///
 ///
-/// See also: [`Worker::join`] and [`ThreadPool::join`].
+/// **Warning:** This function panics if the current thread is not registered as a worker.
+///
+/// 
pub fn join(a: A, b: B) -> (RA, RB) where A: FnOnce(&Worker) -> RA + Send, @@ -1370,218 +1758,20 @@ where /// Creates a new scope for spawning non-static work. /// -/// Work spawned onto the new scope does not have to have a `'static` -/// lifetime, and can borrow local variables. Local borrowing is possible -/// because this function will not return until all work spawned on the -/// scope has completed, this ensuring the stack frame is kept alive for the -/// duration. -/// -///
-/// Note: -/// This function panics if the current thread is not registered as a worker. -///
-/// -/// # Alternatives -/// -/// Where possible, [`ThreadPool::scope`] or [`Worker::scope`] should be used -/// instead. These functions are more efficient, and do not panic when not -/// within a worker. -/// -/// Scopes are a more flexible building block compared to [`join()`], since a -/// loop can be used to spawn any number of tasks without recursing. -/// However, that flexibility comes at a performance price: tasks spawned -/// using `scope` must be allocated onto the heap, whereas [`join()`] can make -/// exclusive use of the stack. Prefer [`join()`] (or ideally [`Worker::join`]) where possible. -/// -/// [`join()`]: Worker::join -/// -/// # Accessing stack data -/// -/// In general, spawned tasks may borrow any stack data that lives outside -/// the scope closure. -/// -/// ``` -/// # use forte::ThreadPool; -/// # use forte::Worker; -/// # static THREAD_POOL: ThreadPool = ThreadPool::new(); -/// # THREAD_POOL.populate(); -/// # THREAD_POOL.expect_worker(|worker| { -/// let ok: Vec = vec![1, 2, 3]; -/// forte::scope(|scope| { -/// let bad: Vec = vec![4, 5, 6]; -/// scope.spawn_on(worker, |_: &Worker| { -/// // Transfer ownership of `bad` into a local variable (also named `bad`). -/// // This will force the closure to take ownership of `bad` from the environment. -/// let bad = bad; -/// println!("ok: {:?}", ok); // `ok` is only borrowed. -/// println!("bad: {:?}", bad); // refers to our local variable, above. -/// }); -/// -/// scope.spawn_on(worker, |_: &Worker| println!("ok: {:?}", ok)); // we too can borrow `ok` -/// }); -/// # }); -/// ``` -/// As the comments example above suggest, to reference `bad` we must -/// take ownership of it. One way to do this is to detach the closure -/// from the surrounding stack frame, using the `move` keyword. 
This -/// will cause it to take ownership of *all* the variables it touches, -/// in this case including both `ok` *and* `bad`: -/// -/// ```rust -/// # use forte::ThreadPool; -/// # use forte::Worker; -/// # static THREAD_POOL: ThreadPool = ThreadPool::new(); -/// # THREAD_POOL.populate(); -/// # THREAD_POOL.expect_worker(|worker| { -/// let ok: Vec = vec![1, 2, 3]; -/// forte::scope(|scope| { -/// let bad: Vec = vec![4, 5, 6]; -/// scope.spawn_on(worker, move |_: &Worker| { -/// println!("ok: {:?}", ok); -/// println!("bad: {:?}", bad); -/// }); -/// -/// // That closure is fine, but now we can't use `ok` anywhere else, -/// // since it is owned by the previous task: -/// // scope.spawn_on(worker, |_: &Worker| println!("ok: {:?}", ok)); -/// }); -/// # }); -/// ``` +/// When executed on a thread that is currently registered as a worker (i.e. the +/// closure inside [`Worker::occupy`], [`ThreadPool::with_worker`], or similar) +/// this is able to look up that registration and find the worker and +/// thread-pool implicitly. /// -/// While this works, it could be a problem if we want to use `ok` elsewhere. -/// There are two choices. We can keep the closure as a `move` closure, but -/// instead of referencing the variable `ok`, we create a shadowed variable that -/// is a borrow of `ok` and capture *that*: -/// -/// ```rust -/// # use forte::ThreadPool; -/// # use forte::Worker; -/// # static THREAD_POOL: ThreadPool = ThreadPool::new(); -/// # THREAD_POOL.populate(); -/// # THREAD_POOL.expect_worker(|worker| { -/// let ok: Vec = vec![1, 2, 3]; -/// forte::scope(|scope| { -/// let bad: Vec = vec![4, 5, 6]; -/// let ok: &Vec = &ok; // shadow the original `ok` -/// scope.spawn_on(worker, move |_: &Worker| { -/// println!("ok: {:?}", ok); // captures the shadowed version -/// println!("bad: {:?}", bad); -/// }); -/// -/// // Now we too can use the shadowed `ok`, since `&Vec` references -/// // can be shared freely. 
Note that we need a `move` closure here though, -/// // because otherwise we'd be trying to borrow the shadowed `ok`, -/// // and that doesn't outlive `scope`. -/// scope.spawn_on(worker, move |_: &Worker| println!("ok: {:?}", ok)); -/// }); -/// # }); -/// ``` -/// -/// Another option is not to use the `move` keyword but instead to take ownership -/// of individual variables: -/// -/// ```rust -/// # use forte::ThreadPool; -/// # use forte::Worker; -/// # static THREAD_POOL: ThreadPool = ThreadPool::new(); -/// # THREAD_POOL.populate(); -/// # THREAD_POOL.expect_worker(|worker| { -/// let ok: Vec = vec![1, 2, 3]; -/// forte::scope(|scope| { -/// let bad: Vec = vec![4, 5, 6]; -/// scope.spawn_on(worker, |_: &Worker| { -/// // Transfer ownership of `bad` into a local variable (also named `bad`). -/// // This will force the closure to take ownership of `bad` from the environment. -/// let bad = bad; -/// println!("ok: {:?}", ok); // `ok` is only borrowed. -/// println!("bad: {:?}", bad); // refers to our local variable, above. -/// }); -/// -/// scope.spawn_on(worker, |_: &Worker| println!("ok: {:?}", ok)); // we too can borrow `ok` -/// }); -/// # }); -/// ``` -/// -/// # Referencing the scope -/// -/// The scope passed into the closure is not allowed to leak out of this call. -/// In other words, this will fail to compile: -/// -/// ```compile_fail -/// # use forte::ThreadPool; -/// # use forte::Worker; -/// # static THREAD_POOL: ThreadPool = ThreadPool::new(); -/// # THREAD_POOL.populate(); -/// # THREAD_POOL.expect_worker(|worker| { -/// let mut leak = None; -/// forte::scope(|scope| { -/// leak = Some(scope); // <-- ERROR: scope would be leaked here -/// }); -/// drop(leak); -/// # }); -/// ``` -/// -/// Anything spawned onto the scope can capture a reference to it. -/// This allows scoped work to spawn other scoped work. 
-/// -/// ``` -/// # use forte::ThreadPool; -/// # use forte::Worker; -/// # static THREAD_POOL: ThreadPool = ThreadPool::new(); -/// # THREAD_POOL.populate(); -/// # THREAD_POOL.expect_worker(|worker| { -/// let mut counter = 0; -/// let counter_ref = &mut counter; -/// forte::scope(|scope| { -/// scope.spawn_on(worker, |worker: &Worker| { -/// *counter_ref += 1; -/// // Note: we borrow the scope again here. -/// scope.spawn_on(worker, move |_: &Worker| { -/// *counter_ref += 1; -/// }); -/// }); -/// }); -/// assert_eq!(counter, 2); -/// # }); -/// ``` -/// -/// It's possible to spawn non-scoped work within the closure, but these -/// generally can't hold references to the scope. So for example, the -/// following also fails to compile: -/// -/// ```compile_fail,E0521 -/// # use forte::ThreadPool; -/// # use forte::Worker; -/// # static THREAD_POOL: ThreadPool = ThreadPool::new(); -/// # THREAD_POOL.populate(); -/// THREAD_POOL.with_worker(|worker| { -/// worker.scope(|scope| { -/// worker.spawn(|worker: &Worker| { -/// // ^^^^^ ERROR: This creates a *static* job on the worker, -/// // which may outlive the scope. -/// -/// scope.spawn_on(worker, |_: &Worker| { }); -/// // ^^^^^ ERROR: This requires borrowing the scope within the -/// // unscoped job, which isn't allowed by the compiler -/// // because 'scope would have to to outlive 'static. -/// }); -/// }); -/// }); -/// ``` +/// If you have a reference to a [`Worker`], it's better to use +/// [`Worker::scope`] instead. If you don't have a worker, but know which thread +/// pool you want to use, [`ThreadPool::scope`] is more appropriate. /// -/// # Panics +///
 ///
-/// This function panics when not called within a worker. The
-/// [`ThreadPool::scope`] and [`Worker::scope`] functions do not, and should be
-/// preferred when possible.
+/// **Warning:** This function panics if the current thread is not registered as a worker.
 ///
-/// If a panic occurs, either in the closure given to `scope()` or in a blocking
-/// (non-async) job spawned on the scope, that panic will be propagated and the
-/// call to `scope()` will panic. If multiple panics occurs, it is
-/// non-deterministic which of their panic values will propagate. Regardless,
-/// once a task is spawned using `scope.spawn(),` it will execute, even if the
-/// spawning task should later panic. The scope returns once all work is
-/// complete, and panics are propagated at that point.
+/// 
pub fn scope<'env, F, T>(f: F) -> T where F: for<'scope> FnOnce(&'scope Scope<'scope, 'env>) -> T, From be90449ac5cbdcaef59cf72eba41463fd7f5f70c Mon Sep 17 00:00:00 2001 From: NthTensor Date: Mon, 4 May 2026 07:51:17 -0400 Subject: [PATCH 3/3] fix: switch to hotclock for cpu ticks --- Cargo.lock | 13 ++++++------- Cargo.toml | 2 +- src/thread_pool.rs | 2 +- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 75b2950..0c5fddb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -499,10 +499,10 @@ dependencies = [ "crossbeam-utils", "dashmap", "divan", + "hotclock", "rayon", "shuttle", "st3", - "tick_counter", "tracing", "tracing-subscriber", ] @@ -633,6 +633,11 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "hotclock" +version = "0.2.0" +source = "git+https://github.com/spence/hotclock#8cf14ae9d62dba7f7780a3c920ab6208b6568777" + [[package]] name = "is-terminal" version = "0.4.16" @@ -1184,12 +1189,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "tick_counter" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37f1310986d0aa940019cbb2b480161c60a614dba076cbb20e82bfbc236bbabd" - [[package]] name = "tinytemplate" version = "1.2.1" diff --git a/Cargo.toml b/Cargo.toml index 2d68140..89cb4d4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,7 @@ atomic-wait = "1.1.0" crossbeam-queue = "0.3.12" crossbeam-utils = "0.8.21" st3 = "0.4" -tick_counter = "0.4.5" +hotclock = { git = "https://github.com/spence/hotclock" } shuttle = { version = "0.8.0", optional = true } tracing = { version = "0.1.41", features = ["release_max_level_off"] } diff --git a/src/thread_pool.rs b/src/thread_pool.rs index 4c235cc..69ca80b 100644 --- a/src/thread_pool.rs +++ b/src/thread_pool.rs @@ -945,7 +945,7 @@ impl Worker { // Promotions are fairly costly, so we limit their 
frequency using the // cpu's instruction counter. Promote is called at a high frequency, and // actually doing the promotion is probably a cold path. - let current_tick = tick_counter::start(); + let current_tick = hotclock::Instant::now().as_raw(); if current_tick.wrapping_sub(self.last_promote_tick.get()) >= Self::PROMOTE_TICK_INTERVAL { // This should ideally become a conditional jump. self.promote_cold(current_tick);