Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions include/common_symbol_errors.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,14 @@ using namespace std::string_view_literals;

namespace ddprof {

inline constexpr std::array<std::string_view, 6> k_common_frame_names = {
"[truncated]"sv, "[unknown mapping]"sv,
"[unwind failure]"sv, "[incomplete]"sv,
"[lost]"sv, "[maximum pids]"sv};
// Names of the synthetic frames used to report symbolization / unwinding
// problems. The entries are listed in the same order as the SymbolErrors
// enumerators declared below, so the two lists have to be extended together.
// NOTE(review): presumably this array is indexed with a SymbolErrors value;
// confirm at the call sites.
inline constexpr std::array<std::string_view, 7> k_common_frame_names = {
"[truncated]"sv,
"[unknown mapping]"sv,
"[unwind failure]"sv,
"[incomplete]"sv,
"[lost]"sv,
"[maximum pids]"sv,
"[live-alloc cleared]"sv};

enum SymbolErrors : std::uint8_t {
truncated_stack = 0,
Expand All @@ -26,6 +30,7 @@ enum SymbolErrors : std::uint8_t {
incomplete_stack,
lost_event,
max_pids,
live_alloc_cleared,
};

} // namespace ddprof
6 changes: 5 additions & 1 deletion include/ddprof_stats.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,11 @@ namespace ddprof {
X(PPROF_SIZE, "pprof.size", STAT_GAUGE) \
X(PROFILE_DURATION, "profile.duration_ms", STAT_GAUGE) \
X(AGGREGATION_AVG_TIME, "aggregation.avg_time_ns", STAT_GAUGE) \
X(BACKPOPULATE_COUNT, "backpopulate.count", STAT_GAUGE)
X(BACKPOPULATE_COUNT, "backpopulate.count", STAT_GAUGE) \
X(LIVE_ALLOC_SNAPSHOT_BYTES, "live_alloc.snapshot.bytes", STAT_GAUGE) \
X(LIVE_ALLOC_CLEARED_STACKS, "live_alloc.snapshot.cleared_stacks", \
STAT_GAUGE) \
X(LIVE_ALLOC_DROPPED_PIDS, "live_alloc.snapshot.dropped_pids", STAT_GAUGE)

// Expand the enum/index for the individual stats.
// STATS_LEN is placed after the expansion of the whole table, so it comes
// after every stat enumerator (NOTE(review): presumably it equals the number
// of stats; X_ENUM is defined outside this excerpt). The underlying type is
// uint8_t, so the enumerators, STATS_LEN included, must fit in 0..255.
enum DDPROF_STATS : uint8_t { STATS_TABLE(X_ENUM) STATS_LEN };
Expand Down
19 changes: 19 additions & 0 deletions include/live_allocation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#include "unwind_output_hash.hpp"

#include <cstddef>
#include <deque>
#include <string>
#include <sys/types.h>
#include <unordered_map>

Expand Down Expand Up @@ -55,6 +57,23 @@ class LiveAllocation {
// NOLINTNEXTLINE(misc-non-private-member-variables-in-classes)
WatcherVector _watcher_vector;

// Backing storage for the string_views carried by UnwindOutputs that were
// rebuilt from a snapshot. Entries created from incoming allocation events
// keep viewing the Process / base-frame tables; only the snapshot-restore
// path appends here. std::deque::emplace_back leaves references to the
// elements already stored valid, so views handed out earlier remain usable
// while the store grows.
// NOLINTNEXTLINE(misc-non-private-member-variables-in-classes)
std::deque<std::string> _restored_strings;

// Copies `sv` into _restored_strings and returns a view of the stored copy.
// An empty `sv` produces an empty view and stores nothing. No de-duplication
// is performed: every non-empty call appends one new string.
std::string_view intern_restored_string(std::string_view sv) {
  if (!sv.empty()) {
    const std::string &stored = _restored_strings.emplace_back(sv);
    return stored;
  }
  return {};
}

void register_library_state(int watcher_pos, pid_t pid,
uint32_t address_conflict_count,
uint32_t tracked_address_count,
Expand Down
129 changes: 129 additions & 0 deletions include/live_allocation_snapshot.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0. This product includes software
// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present
// Datadog, Inc.

// Live-allocation snapshot: portable serialisation of LiveAllocation across
// worker resets.
//
// The aggregator state in LiveAllocation refers to per-worker handles
// (libdatadog dictionary pointers, indices into SymbolHdr tables, string_views
// into Process/base-frame caches). None of these survive a worker fork, so
// before a worker restart we capture a fully self-owned snapshot via
// `capture_snapshot`, serialise it through a memfd held by the parent, and
// re-intern everything into the freshly-built tables of the new worker via
// `restore_snapshot`.

#pragma once

#include "ddprof_defs.hpp"
#include "live_allocation.hpp"

#include <cstddef>
#include <cstdint>
#include <string>
#include <sys/types.h>
#include <vector>

namespace ddprof {

struct SymbolHdr;

namespace live_alloc_snapshot {

// Fully self-owned mirror of one FunLoc. Every textual attribute is stored
// as a std::string owned by this struct, so an instance stays usable without
// the per-worker symbol / mapping tables it was resolved from.
// NOTE(review): the per-field notes below are inferred from the field names;
// confirm them against FunLoc and the capture / restore implementation.
struct FunLocPortable {
// Address information for the frame.
ProcessAddress_t ip{};
ElfAddress_t elf_addr{};
// Symbol information (fn_*): presumably the line number, the function names
// and the source file.
uint32_t lineno{};
std::string fn_name;
std::string fn_system_name;
std::string fn_file;
// Mapping information (map_*): presumably the address range, file offset,
// backing file name and build id of the mapping that contains the frame.
ElfAddress_t map_low{};
ElfAddress_t map_high{};
Offset_t map_offset{};
std::string map_filename;
std::string map_build_id;
};

// Fully self-owned mirror of one UnwindOutput: the frames plus the
// process / thread attributes, all held by value.
struct UnwindOutputPortable {
// Frames of the stack. NOTE(review): the frame ordering (leaf-first or
// root-first) is not visible here; confirm against UnwindOutput.
std::vector<FunLocPortable> locs;
// Textual attributes, copied into strings owned by this struct.
std::string container_id;
std::string exe_name;
std::string thread_name;
// Process and thread ids; value-initialised to 0.
int pid{};
int tid{};
};

// One tracked allocation address, stored inside PidEntry::addresses.
struct AddressEntry {
// Allocation address.
uintptr_t addr{};
// NOTE(review): presumably the sampled size / weight accounted to this
// address; confirm against LiveAllocation.
int64_t value{};
// Index into Snapshot::stacks. NOTE(review): presumably this may also hold
// the reserved k_cleared_stack_idx value when the stack was dropped during
// budget enforcement; confirm in capture_snapshot.
uint32_t stack_idx{}; // index into Snapshot::stacks
};

// Snapshot slice for one (watcher_pos, pid) pair. The scalar fields carry the
// same names and types as the leading parameters of
// LiveAllocation::register_library_state.
struct PidEntry {
int watcher_pos{};
pid_t pid{};
uint32_t address_conflict_count{};
uint32_t tracked_address_count{};
// Tracked allocation addresses recorded for this pair.
std::vector<AddressEntry> addresses;
};

// Special stack index meaning "stack was dropped during budget enforcement".
// On restore this is materialised as a synthetic single-frame stack pointing
// at the [live-alloc cleared] common frame, ensuring per-PID heap totals are
// preserved even when detail is shed.
//
// The value is the largest uint32_t. It is spelled with UINT32_MAX from
// <cstdint>, which this header includes, so the definition does not depend on
// <limits> being pulled in transitively by another header.
inline constexpr uint32_t k_cleared_stack_idx = UINT32_MAX;

// Self-owned image of the live-allocation state exchanged between workers.
struct Snapshot {
// Stack table; AddressEntry::stack_idx refers to positions in this vector.
std::vector<UnwindOutputPortable> stacks;
// One entry per (watcher_pos, pid) pair that was kept.
std::vector<PidEntry> pids;
// How many distinct allocation addresses had their stack remapped to the
// synthetic cleared stack because of the size budget.
uint32_t cleared_addresses{};
// How many (watcher_pos, pid) entries we dropped entirely because even
// minimal accounting did not fit in the budget.
uint32_t dropped_pids{};
};

// Size limits for the serialised snapshot, in bytes: a 4 MiB default budget
// and a 20 MiB hard ceiling (1 MiB == 1 << 20 bytes).
inline constexpr std::size_t k_default_max_snapshot_bytes =
    std::size_t{4} << 20;
inline constexpr std::size_t k_hard_max_snapshot_bytes =
    std::size_t{20} << 20;

// Read a Snapshot out of an in-memory LiveAllocation, by resolving symbol /
// mapping IDs through `symbol_hdr`. The result has no references to the
// originating worker.
//
// If `max_bytes` is non-zero and the projected serialised size exceeds it,
// the snapshot is degraded in a value-preserving way: low-value stacks are
// remapped to the synthetic cleared stack first, and finally whole pids are
// dropped from the lowest aggregate value upwards. The `cleared_addresses` /
// `dropped_pids` counters in the result report how aggressive the degradation
// had to be.
//
// Both inputs are taken by const reference. `max_bytes` defaults to
// k_default_max_snapshot_bytes.
// NOTE(review): the wording above implies that `max_bytes == 0` disables the
// budget, and k_hard_max_snapshot_bytes is not referenced by this signature;
// confirm in the implementation whether and where the ceiling is enforced.
Snapshot capture_snapshot(const LiveAllocation &live_alloc,
const SymbolHdr &symbol_hdr,
std::size_t max_bytes = k_default_max_snapshot_bytes);

// Serialise / deserialise a Snapshot to/from a binary blob.
//
// serialize: encodes `snapshot` into `out`.
// NOTE(review): whether `out` is cleared first or appended to is not visible
// from this declaration; confirm in the implementation.
//
// deserialize: decodes the `size` bytes starting at `data` into `out` and
// reports the outcome through the returned bool.
// NOTE(review): presumably false means a malformed or truncated blob, as
// read_from_fd documents; the state of `out` after a failure is not
// specified here.
void serialize(const Snapshot &snapshot, std::vector<uint8_t> &out);
bool deserialize(const uint8_t *data, std::size_t size, Snapshot &out);

// Write the binary blob into `fd` (memfd). Truncates fd to the blob size on
// success. On error, returns false and leaves fd in an unspecified state.
// Returns true on success. `snapshot` is taken by const reference.
bool write_to_fd(int fd, const Snapshot &snapshot);

// Read a snapshot blob out of `fd`. Returns false if fd is empty / unreadable
// / malformed.
// On success the decoded state is stored in `out`.
// NOTE(review): the contents of `out` after a false return are not specified
// here; callers should only use it when true is returned.
bool read_from_fd(int fd, Snapshot &out);

// Re-intern a snapshot into `symbol_hdr` and populate the empty `live_alloc`
// state. Restored UnwindOutputs hold string_views into
// LiveAllocation::_restored_strings.
//
// `snapshot` is taken by const reference; `live_alloc` and `symbol_hdr` are
// the objects being filled. Because the restored views point into
// `live_alloc`, they are only usable while that object is alive.
// NOTE(review): the behaviour when `live_alloc` is not empty is not stated;
// confirm in the implementation.
void restore_snapshot(const Snapshot &snapshot, LiveAllocation &live_alloc,
SymbolHdr &symbol_hdr);

} // namespace live_alloc_snapshot

} // namespace ddprof
7 changes: 7 additions & 0 deletions include/persistent_worker_state.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@ struct PersistentWorkerState {
// Why not volatile? Although several threads can update the number of
// cycles, by design only a single thread reads and writes to this variable.
uint32_t profile_seq;
// memfd holding the most recent serialized live-allocation snapshot.
// The fd is opened by the parent in main_loop and inherited by every
// worker child. A child that is restarting writes its serialized
// LiveAllocation state to this fd just before exiting; the next child
// reads it during worker_library_init so live-heap tracking survives
// worker resets. -1 if snapshotting is disabled.
int live_alloc_snapshot_fd;
};

} // namespace ddprof
24 changes: 24 additions & 0 deletions src/ddprof_worker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,15 @@
#include "ddprof_worker.hpp"

#include "ddprof_context.hpp"
#include "ddprof_context_lib.hpp"
#include "ddprof_perf_event.hpp"
#include "ddprof_stats.hpp"
#include "dso_hdr.hpp"
#include "exporter/ddprof_exporter.hpp"
#include "live_allocation_snapshot.hpp"
#include "logger.hpp"
#include "perf.hpp"
#include "persistent_worker_state.hpp"
#include "pevent_lib.hpp"
#include "pprof/ddprof_pprof.hpp"
#include "procutils.hpp"
Expand Down Expand Up @@ -561,6 +564,27 @@ DDRes worker_library_init(DDProfContext &ctx,
ctx.worker_ctx.exp[1] = nullptr;
ctx.worker_ctx.pprof[0] = nullptr;
ctx.worker_ctx.pprof[1] = nullptr;

// If the previous worker handed us a live-allocation snapshot, replay
// it before the poll loop starts draining new events. Restored entries
// use string storage owned by LiveAllocation itself.
if (persistent_worker_state &&
persistent_worker_state->live_alloc_snapshot_fd >= 0 &&
context_allocation_profiling_watcher_idx(ctx) != -1) {
live_alloc_snapshot::Snapshot snap;
if (live_alloc_snapshot::read_from_fd(
persistent_worker_state->live_alloc_snapshot_fd, snap)) {
if (!snap.stacks.empty() || !snap.pids.empty()) {
live_alloc_snapshot::restore_snapshot(snap,
ctx.worker_ctx.live_allocation,
ctx.worker_ctx.us->symbol_hdr);
LG_NTC("[live-alloc] Snapshot restored: stacks=%zu pids=%zu "
"cleared=%u dropped_pids=%u",
snap.stacks.size(), snap.pids.size(), snap.cleared_addresses,
snap.dropped_pids);
}
}
}
}
CatchExcept2DDRes();
return {};
Expand Down
Loading