diff --git a/source/common/stats/allocator.cc b/source/common/stats/allocator.cc index 59cae0ebfff9b..4673fe3d5c9a0 100644 --- a/source/common/stats/allocator.cc +++ b/source/common/stats/allocator.cc @@ -92,19 +92,39 @@ template class StatsSharedImpl : public MetricImpl // RefcountInterface void incRefCount() override { ++ref_count_; } bool decRefCount() override { - // We must, unfortunately, hold the allocator's lock when decrementing the - // refcount. Otherwise another thread may simultaneously try to allocate the - // same name'd stat after we decrement it, and we'll wind up with a - // dtor/update race. To avoid this we must hold the lock until the stat is - // removed from the map. + ASSERT(ref_count_ >= 1); + // Takes a snapshot of the current ref_count_ + uint32_t ref_count_snapshot = ref_count_.load(std::memory_order_relaxed); + + // Checks if ref_count_snapshot is 1. If it is 1, then a decrement would free memory, and we + // would need to acquire the allocator mutex. + // + // If ref_count_snapshot is > 1, then there are some important cases where thread + // interruptions might change what happens: + // + // First case: ref_count_ is unchanged (ref_count_ == ref_count_snapshot) + // Zero or more threads touch ref_count_ before compare_exchange_weak is called. + // compare_exchange_weak returns true, and decRefCount returns false. // - // It might be worth thinking about a race-free way to decrement ref-counts - // without a lock, for the case where ref_count > 2, and we don't need to - // destruct anything. But it seems preferable at to be conservative here, - // as stats will only go out of scope when a scope is destructed (during - // xDS) or during admin stats operations. + // Second case: ref_count_ is changed (ref_count_ != ref_count_snapshot && ref_count_ > 1) + // One or more threads touch ref_count_ before compare_exchange_weak is called. + // compare_exchange_weak returns false, ref_count_snapshot is set to be ref_count_. Loop + // restarts. + // + // Third case: ref_count_ is changed (ref_count_ != ref_count_snapshot && ref_count <= 1) + // One or more threads touch ref_count_ before compare_exchange_weak is called. + // compare_exchange_weak returns false. ref_count_snapshot is set to be ref_count_. Loop + // restarts, but exits out of the while loop because conditional is false (a decrement would + // free memory; check start of comment). + while (ref_count_snapshot > 1) { + if (ref_count_.compare_exchange_weak(ref_count_snapshot, ref_count_snapshot - 1, + std::memory_order_acq_rel)) { + return false; + } + } + // Another thread may call incRefCount at this point. The lock path still does the right thing + // because the stat is not freed if ref_count_ is not 0. Thread::LockGuard lock(alloc_.mutex_); - ASSERT(ref_count_ >= 1); if (--ref_count_ == 0) { alloc_.sync().syncPoint(Allocator::DecrementToZeroSyncPoint); removeFromSetLockHeld(); diff --git a/test/common/stats/BUILD b/test/common/stats/BUILD index 6991db3f2284b..5a13cf9d57efd 100644 --- a/test/common/stats/BUILD +++ b/test/common/stats/BUILD @@ -416,6 +416,24 @@ envoy_benchmark_test( benchmark_binary = "thread_local_store_speed_test", ) +envoy_cc_benchmark_binary( + name = "dec_ref_count_speed_test", + srcs = ["dec_ref_count_speed_test.cc"], + rbe_pool = "6gig", + deps = [ + "//source/common/common:assert_lib", + "//source/common/stats:allocator_lib", + "//source/common/stats:symbol_table_lib", + "@abseil-cpp//absl/strings", + "@benchmark", + ], +) + +envoy_benchmark_test( + name = "dec_ref_count_speed_test_benchmark_test", + benchmark_binary = "dec_ref_count_speed_test", +) + envoy_cc_test( name = "utility_test", srcs = ["utility_test.cc"], diff --git a/test/common/stats/dec_ref_count_speed_test.cc b/test/common/stats/dec_ref_count_speed_test.cc new file mode 100644 index 0000000000000..8fd2d1b5f20f6 --- /dev/null +++ b/test/common/stats/dec_ref_count_speed_test.cc @@ -0,0 +1,123 @@ +// Note: this should be run with --compilation_mode=opt, and would benefit from a +// quiescent system with disabled cstate power management. +// +// Exercises both paths of StatsSharedImpl::decRefCount(): +// * Fast path (ref_count_ > 1): single-thread, multi-thread same stat, +// multi-thread distinct stats. +// * Slow path (ref_count_ == 1): single-thread. +// +// Fast-path benchmarks pre-seed ref_count_ so every dec in the timed loop +// stays on the CAS path. Slow-path benchmark pre-allocates one counter per +// iteration with ref_count_=1, so each timed dec frees its counter. + +#include +#include + +#include "source/common/common/assert.h" +#include "source/common/stats/allocator.h" +#include "source/common/stats/symbol_table.h" + +#include "absl/strings/str_cat.h" +#include "absl/types/optional.h" +#include "benchmark/benchmark.h" + +namespace Envoy { +namespace Stats { +namespace { + +class DecRefCountFixture { +public: + explicit DecRefCountFixture(size_t num_counters) : alloc_(symbol_table_), pool_(symbol_table_) { + counters_.reserve(num_counters); + for (size_t i = 0; i < num_counters; ++i) { + counters_.push_back( + alloc_.makeCounter(pool_.add(absl::StrCat("benchmark.counter.", i)), StatName(), {})); + } + } + + Counter& counter(size_t i) { return *counters_[i]; } + CounterSharedPtr& counterPtr(size_t i) { return counters_[i]; } + size_t size() const { return counters_.size(); } + +private: + SymbolTable symbol_table_; + Allocator alloc_; + StatNamePool pool_; + std::vector counters_; +}; + +// Single thread, single stat, ref_count_ > 1 +void bmDecRefCountFastPathSingleThread(benchmark::State& state) { + DecRefCountFixture fixture(1); + Counter& counter = fixture.counter(0); + RELEASE_ASSERT(state.max_iterations < std::numeric_limits::max() - 1, + "ref_count_ will overflow"); + for (uint32_t i = 0; i < state.max_iterations; i++) { + counter.incRefCount(); + } + for (auto _ : state) { // NOLINT + const bool freed = counter.decRefCount(); + benchmark::DoNotOptimize(freed); + } +} +BENCHMARK(bmDecRefCountFastPathSingleThread); + +absl::optional multi_thread_fixture; +void multiThreadInitSameStat(const benchmark::State&) { multi_thread_fixture.emplace(1); } +void multiThreadInitDistinctStat(const benchmark::State& state) { + multi_thread_fixture.emplace(state.threads()); +} +void multiThreadDestroy(const benchmark::State&) { multi_thread_fixture.reset(); } + +// Multiple threads, same stat, ref_count_ > 1 +void bmDecRefCountFastPathMultiThreadSameStat(benchmark::State& state) { + Counter& counter = multi_thread_fixture->counter(0); + RELEASE_ASSERT(static_cast(state.max_iterations) * state.threads() < + std::numeric_limits::max() - 1, + "ref_count_ will overflow across all threads"); + for (uint32_t i = 0; i < state.max_iterations; i++) { + counter.incRefCount(); + } + for (auto _ : state) { // NOLINT + const bool freed = counter.decRefCount(); + benchmark::DoNotOptimize(freed); + } +} +BENCHMARK(bmDecRefCountFastPathMultiThreadSameStat) + ->ThreadRange(1, 16) + ->UseRealTime() + ->Setup(multiThreadInitSameStat) + ->Teardown(multiThreadDestroy); + +// Multiple threads, different stats, ref_count_ > 1 +void bmDecRefCountFastPathMultiThreadDistinctStat(benchmark::State& state) { + Counter& counter = multi_thread_fixture->counter(state.thread_index()); + RELEASE_ASSERT(state.max_iterations < std::numeric_limits::max() - 1, + "ref_count_ will overflow on a per-thread counter"); + for (uint32_t i = 0; i < state.max_iterations; i++) { + counter.incRefCount(); + } + for (auto _ : state) { // NOLINT + const bool freed = counter.decRefCount(); + benchmark::DoNotOptimize(freed); + } +} +BENCHMARK(bmDecRefCountFastPathMultiThreadDistinctStat) + ->ThreadRange(1, 16) + ->UseRealTime() + ->Setup(multiThreadInitDistinctStat) + ->Teardown(multiThreadDestroy); + +// Single thread, multiple stats, ref_count_ == 1 +void bmDecRefCountSlowPathSingleThread(benchmark::State& state) { + DecRefCountFixture fixture(state.max_iterations); + size_t idx = 0; + for (auto _ : state) { // NOLINT + fixture.counterPtr(idx++).reset(); + } +} +BENCHMARK(bmDecRefCountSlowPathSingleThread)->Iterations(1 << 18); + +} // namespace +} // namespace Stats +} // namespace Envoy