Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 31 additions & 11 deletions source/common/stats/allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -92,19 +92,39 @@ template <class BaseClass> class StatsSharedImpl : public MetricImpl<BaseClass>
// RefcountInterface
void incRefCount() override { ++ref_count_; }
bool decRefCount() override {
// We must, unfortunately, hold the allocator's lock when decrementing the
// refcount. Otherwise another thread may simultaneously try to allocate the
// same name'd stat after we decrement it, and we'll wind up with a
// dtor/update race. To avoid this we must hold the lock until the stat is
// removed from the map.
ASSERT(ref_count_ >= 1);
// Takes a snapshot of the current ref_count_
uint32_t ref_count_snapshot = ref_count_.load(std::memory_order_relaxed);

// Checks if ref_count_snapshot is 1. If it is 1, then a decrement would free memory, and we
// would need to acquire the allocator mutex.
//
// If ref_count_snapshot is > 1, then there are some important cases where thread
// interruptions might change what happens:
//
// First case: ref_count_ is unchanged (ref_count_ == ref_count_snapshot)
// Zero or more threads touch ref_count_ before compare_exchange_weak is called.
// compare_exchange_weak returns true, and decRefCount returns false.
//
// It might be worth thinking about a race-free way to decrement ref-counts
// without a lock, for the case where ref_count > 2, and we don't need to
// destruct anything. But it seems preferable at to be conservative here,
// as stats will only go out of scope when a scope is destructed (during
// xDS) or during admin stats operations.
// Second case: ref_count_ is changed (ref_count_ != ref_count_snapshot && ref_count_ > 1)
// One or more threads touch ref_count_ before compare_exchange_weak is called.
// compare_exchange_weak returns false, ref_count_snapshot is set to be ref_count_. Loop
// restarts.
//
// Third case: ref_count_ is changed (ref_count_ != ref_count_snapshot && ref_count <= 1)
// One or more threads touch ref_count_ before compare_exchange_weak is called.
// compare_exchange_weak returns false. ref_count_snapshot is set to be ref_count_. Loop
// restarts, but exits out of the while loop because conditional is false (a decrement would
// free memory; check start of comment).
while (ref_count_snapshot > 1) {
if (ref_count_.compare_exchange_weak(ref_count_snapshot, ref_count_snapshot - 1,
std::memory_order_acq_rel)) {
return false;
}
}
// Another thread may call incRefCount at this point. The lock path still does the right thing
// because the stat is not freed if ref_count_ is not 0.
Thread::LockGuard lock(alloc_.mutex_);
ASSERT(ref_count_ >= 1);
if (--ref_count_ == 0) {
alloc_.sync().syncPoint(Allocator::DecrementToZeroSyncPoint);
removeFromSetLockHeld();
Expand Down
18 changes: 18 additions & 0 deletions test/common/stats/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,24 @@ envoy_benchmark_test(
benchmark_binary = "thread_local_store_speed_test",
)

envoy_cc_benchmark_binary(
name = "dec_ref_count_speed_test",
srcs = ["dec_ref_count_speed_test.cc"],
rbe_pool = "6gig",
deps = [
"//source/common/common:assert_lib",
"//source/common/stats:allocator_lib",
"//source/common/stats:symbol_table_lib",
"@abseil-cpp//absl/strings",
"@benchmark",
],
)

envoy_benchmark_test(
name = "dec_ref_count_speed_test_benchmark_test",
benchmark_binary = "dec_ref_count_speed_test",
)

envoy_cc_test(
name = "utility_test",
srcs = ["utility_test.cc"],
Expand Down
123 changes: 123 additions & 0 deletions test/common/stats/dec_ref_count_speed_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
// Note: this should be run with --compilation_mode=opt, and would benefit from a
// quiescent system with disabled cstate power management.
//
// Exercises both paths of StatsSharedImpl::decRefCount():
// * Fast path (ref_count_ > 1): single-thread, multi-thread same stat,
// multi-thread distinct stats.
// * Slow path (ref_count_ == 1): single-thread.
//
// Fast-path benchmarks pre-seed ref_count_ so every dec in the timed loop
// stays on the CAS path. Slow-path benchmark pre-allocates one counter per
// iteration with ref_count_=1, so each timed dec frees its counter.

#include <limits>
#include <vector>

#include "source/common/common/assert.h"
#include "source/common/stats/allocator.h"
#include "source/common/stats/symbol_table.h"

#include "absl/strings/str_cat.h"
#include "absl/types/optional.h"
#include "benchmark/benchmark.h"

namespace Envoy {
namespace Stats {
namespace {

class DecRefCountFixture {
public:
explicit DecRefCountFixture(size_t num_counters) : alloc_(symbol_table_), pool_(symbol_table_) {
counters_.reserve(num_counters);
for (size_t i = 0; i < num_counters; ++i) {
counters_.push_back(
alloc_.makeCounter(pool_.add(absl::StrCat("benchmark.counter.", i)), StatName(), {}));
}
}

Counter& counter(size_t i) { return *counters_[i]; }
CounterSharedPtr& counterPtr(size_t i) { return counters_[i]; }
size_t size() const { return counters_.size(); }

private:
SymbolTable symbol_table_;
Allocator alloc_;
StatNamePool pool_;
std::vector<CounterSharedPtr> counters_;
};

// Single thread, single stat, ref_count_ > 1
void bmDecRefCountFastPathSingleThread(benchmark::State& state) {
DecRefCountFixture fixture(1);
Counter& counter = fixture.counter(0);
RELEASE_ASSERT(state.max_iterations < std::numeric_limits<uint32_t>::max() - 1,
"ref_count_ will overflow");
for (uint32_t i = 0; i < state.max_iterations; i++) {
counter.incRefCount();
}
for (auto _ : state) { // NOLINT
const bool freed = counter.decRefCount();
benchmark::DoNotOptimize(freed);
}
}
BENCHMARK(bmDecRefCountFastPathSingleThread);

absl::optional<DecRefCountFixture> multi_thread_fixture;
void multiThreadInitSameStat(const benchmark::State&) { multi_thread_fixture.emplace(1); }
void multiThreadInitDistinctStat(const benchmark::State& state) {
multi_thread_fixture.emplace(state.threads());
}
void multiThreadDestroy(const benchmark::State&) { multi_thread_fixture.reset(); }

// Multiple threads, same stat, ref_count_ > 1
void bmDecRefCountFastPathMultiThreadSameStat(benchmark::State& state) {
Counter& counter = multi_thread_fixture->counter(0);
RELEASE_ASSERT(static_cast<uint64_t>(state.max_iterations) * state.threads() <
std::numeric_limits<uint32_t>::max() - 1,
"ref_count_ will overflow across all threads");
for (uint32_t i = 0; i < state.max_iterations; i++) {
counter.incRefCount();
}
for (auto _ : state) { // NOLINT
const bool freed = counter.decRefCount();
benchmark::DoNotOptimize(freed);
}
}
BENCHMARK(bmDecRefCountFastPathMultiThreadSameStat)
->ThreadRange(1, 16)
->UseRealTime()
->Setup(multiThreadInitSameStat)
->Teardown(multiThreadDestroy);

// Multiple threads, different stats, ref_count_ > 1
void bmDecRefCountFastPathMultiThreadDistinctStat(benchmark::State& state) {
Counter& counter = multi_thread_fixture->counter(state.thread_index());
RELEASE_ASSERT(state.max_iterations < std::numeric_limits<uint32_t>::max() - 1,
"ref_count_ will overflow on a per-thread counter");
for (uint32_t i = 0; i < state.max_iterations; i++) {
counter.incRefCount();
}
for (auto _ : state) { // NOLINT
const bool freed = counter.decRefCount();
benchmark::DoNotOptimize(freed);
}
}
BENCHMARK(bmDecRefCountFastPathMultiThreadDistinctStat)
->ThreadRange(1, 16)
->UseRealTime()
->Setup(multiThreadInitDistinctStat)
->Teardown(multiThreadDestroy);

// Single thread, multiple stats, ref_count_ == 1
void bmDecRefCountSlowPathSingleThread(benchmark::State& state) {
DecRefCountFixture fixture(state.max_iterations);
size_t idx = 0;
for (auto _ : state) { // NOLINT
fixture.counterPtr(idx++).reset();
}
}
BENCHMARK(bmDecRefCountSlowPathSingleThread)->Iterations(1 << 18);

} // namespace
} // namespace Stats
} // namespace Envoy
Loading