Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,9 @@ jobs:
cargo llvm-cov clean --workspace
# Do not source show-env here — llvm-cov nextest sets instrumentation itself.
# extension-module is for the Python wheel only; omit it in lib tests (PyO3 linking).
cargo llvm-cov nextest --workspace --no-default-features
# Cdylib-only plugin wrappers share sources with rlib crates; exclude from coverage.
cargo llvm-cov nextest --workspace --no-default-features \
--exclude probing-hccl-profapi --exclude probing-nccl-profiler-cdylib
cargo llvm-cov nextest -p probing-server --no-default-features --features kmsg,gpu,gpu-cuda
cargo llvm-cov report --lcov --output-path coverage.lcov

Expand Down
35 changes: 35 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ members = [
"probing/extensions/python",
"probing/extensions/gpu",
"probing/extensions/nccl-profiler",
"probing/extensions/nccl-profiler-cdylib",
"probing/extensions/hccl-shim",
"probing/extensions/hccl-profapi",
"probing/server",
"probing/crates/store",
]
Expand Down Expand Up @@ -113,7 +116,7 @@ pyo3-build-config = "0.29.0"
[profile.dev]
debug = 1
split-debuginfo = "unpacked"
codegen-units = 256
codegen-units = 16

[profile.release]
opt-level = "z" # Optimize for size.
Expand Down
53 changes: 40 additions & 13 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Probing Makefile
#
# develop → maturin develop (Rust/Python daily loop)
# frontend → web/dist/ (manual, needs dx)
# frontend → web/dist/ + python/probing/bundled_web/ (manual, needs dx)
# wheel → bundle skills + UI, then maturin build
# frontend wheel → full release path
#
Expand Down Expand Up @@ -32,6 +32,7 @@ endif
endif

PYTHON ?= $(shell test -x .venv/bin/python && echo .venv/bin/python || echo python3)
VENV_PYTHON := $(abspath .venv/bin/python)
BUILD_PY_DEPS := build wheel toml maturin
DEV_PTH := python/probing/dev_pth.py
DEV_PY_DEPS := pyyaml pytest pytest-cov coverage ipython ipykernel
Expand All @@ -53,7 +54,7 @@ help:
@echo ""
@echo " develop / dev Bootstrap: _core, CLI, pytest, site hook"
@echo " core Rebuild probing._core after Rust edits"
@echo " frontend Build web/dist/ (dx; manual)"
@echo " frontend Build web/dist/ + sync bundled_web (dx; manual)"
@echo " wheel Build dist/*.whl (needs web/dist/; bundles skills + UI)"
@echo " wheel-ci alias for wheel (native build; PyPI uses maturin-action + zig)"
@echo " install-wheel pip install dist/probing-*.whl"
Expand Down Expand Up @@ -83,7 +84,7 @@ install-dev-python-deps:
fi

# ==============================================================================
.PHONY: core develop dev check-dev frontend wheel wheel-ci install-wheel wheel-bundle nccl-profiler-lib venv venv-wheel install-build-deps install-wheel-test-deps
.PHONY: core develop dev check-dev frontend sync-bundled-web wheel wheel-ci install-wheel wheel-bundle nccl-profiler-lib hccl-shim-lib venv venv-wheel install-build-deps install-wheel-test-deps

venv:
@test -x .venv/bin/python || $(shell command -v python3 || echo python3) -m venv .venv
Expand All @@ -98,7 +99,7 @@ install-build-deps: venv
install-wheel-test-deps: venv
$(PYTHON) -m pip install -q -U pip $(PYTEST_WHEEL_DEPS)

core: nccl-profiler-lib
core: nccl-profiler-lib hccl-shim-lib
$(PYTHON) -m maturin develop $(MATURIN_FLAGS)

develop: install-build-deps core install-dev-python-deps
Expand All @@ -125,16 +126,24 @@ frontend:
cp -R $(DX_PUBLIC)/. web/dist/
@mkdir -p web/dist/assets
@cp -f web/assets/logo.svg web/dist/logo.svg 2>/dev/null || true
@cp -f web/assets/logo.svg web/dist/assets/logo.svg 2>/dev/null || true
@cp -f web/assets/tailwind.css web/dist/assets/tailwind.css
@echo "web/dist ($$(du -sh web/dist | cut -f1))"
$(MAKE) sync-bundled-web

# sync-bundled-web: replace python/probing/bundled_web with a fresh copy of
# web/dist. Aborts with an error when web/dist/index.html does not exist, so a
# stale or empty UI bundle is never copied into the Python package tree.
sync-bundled-web:
	@if [ ! -f web/dist/index.html ]; then echo "error: web/dist missing — run make frontend first"; exit 1; fi
	rm -rf python/probing/bundled_web
	cp -R web/dist python/probing/bundled_web
	@echo "python/probing/bundled_web ($$(du -sh python/probing/bundled_web | cut -f1))"

wheel-bundle:
@test -f web/dist/index.html || { echo "error: run 'make frontend' first"; exit 1; }
rm -rf python/probing/bundled_skills python/probing/bundled_web
rm -rf python/probing/bundled_skills
cp -R skills python/probing/bundled_skills
cp -R web/dist python/probing/bundled_web
$(MAKE) sync-bundled-web

wheel: install-build-deps wheel-bundle nccl-profiler-lib
wheel: install-build-deps wheel-bundle nccl-profiler-lib hccl-shim-lib
$(PYTHON) -m maturin build $(MATURIN_FLAGS) --out dist

wheel-ci:
Expand All @@ -160,14 +169,30 @@ else
NCCL_OUT := target/release/libprobing_nccl_profiler.so
endif
nccl-profiler-lib:
cargo build -p probing-nccl-profiler $(CARGO_RELEASE)
cargo build -p probing-nccl-profiler-cdylib $(CARGO_RELEASE)
mkdir -p python/probing/libs
cp $(NCCL_OUT) python/probing/libs/
else
nccl-profiler-lib:
@:
endif

# hccl-shim-lib: build the probing-hccl-profapi crate and stage the resulting
# libprofapi.so under python/probing/shim/hccl/ (Linux only).
# On any other platform the target is a no-op, so targets that list it as a
# prerequisite still resolve.
# NOTE(review): CARGO_RELEASE is defined elsewhere in this Makefile; it is
# presumably empty when DEBUG is set so the cargo profile matches the
# target/debug vs target/release path chosen below — confirm.
ifneq ($(UNAME_S),Linux)
hccl-shim-lib:
	@:
else
ifdef DEBUG
HCCL_SHIM_OUT := target/debug/libprofapi.so
else
HCCL_SHIM_OUT := target/release/libprofapi.so
endif
hccl-shim-lib:
	cargo build -p probing-hccl-profapi $(CARGO_RELEASE)
	mkdir -p python/probing/shim/hccl
	cp $(HCCL_SHIM_OUT) python/probing/shim/hccl/
endif

# ==============================================================================
PYTEST_WHEEL_DEPS := pytest pytest-cov coverage pyyaml websockets pandas torch ipykernel
# Installed wheel only — do not pass python/probing (conflicts with site-packages).
Expand All @@ -184,17 +209,17 @@ test: test-rust test-python
test-rust: test-rust-unit test-rust-regression

test-rust-unit:
@if test -x .venv/bin/python; then \
export PYTHON_SYS_EXECUTABLE=.venv/bin/python PYO3_PYTHON=.venv/bin/python; \
@if test -x $(VENV_PYTHON); then \
export PYTHON_SYS_EXECUTABLE=$(VENV_PYTHON) PYO3_PYTHON=$(VENV_PYTHON); \
elif command -v pyenv >/dev/null 2>&1; then \
P=$$(pyenv which python3 2>/dev/null); \
test -n "$$P" && export PYTHON_SYS_EXECUTABLE=$$P PYO3_PYTHON=$$P; \
fi; \
cargo nextest run --lib --workspace --no-default-features --nff

test-rust-regression:
@if test -x .venv/bin/python; then \
export PYTHON_SYS_EXECUTABLE=.venv/bin/python PYO3_PYTHON=.venv/bin/python; \
@if test -x $(VENV_PYTHON); then \
export PYTHON_SYS_EXECUTABLE=$(VENV_PYTHON) PYO3_PYTHON=$(VENV_PYTHON); \
elif command -v pyenv >/dev/null 2>&1; then \
P=$$(pyenv which python3 2>/dev/null); \
test -n "$$P" && export PYTHON_SYS_EXECUTABLE=$$P PYO3_PYTHON=$$P; \
Expand Down Expand Up @@ -233,7 +258,9 @@ clippy-fix:

coverage-rust:
cargo llvm-cov clean --workspace
cargo llvm-cov nextest --workspace --no-default-features --nff --lcov --output-path coverage.lcov --ignore-filename-regex '(.*/tests?/|.*/benches?/|.*/examples?/)' || true
cargo llvm-cov nextest --workspace --no-default-features --nff \
--exclude probing-hccl-profapi --exclude probing-nccl-profiler-cdylib \
--lcov --output-path coverage.lcov --ignore-filename-regex '(.*/tests?/|.*/benches?/|.*/examples?/)' || true
coverage-python:
${PYTEST_RUN} --cov=python/probing --cov=tests --cov-report=xml:coverage.xml --cov-report=term $(PYTEST_ARGS) || true
coverage: coverage-rust coverage-python
Expand Down
72 changes: 39 additions & 33 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,24 +113,20 @@ probing list

### SQL Analytics Interface
```bash
# Memory usage analysis
probing -t <pid> query "SELECT * FROM memory_usage WHERE timestamp > now() - interval '5 min'"

# Performance hotspot analysis
# GPU memory trend across training steps
probing -t <pid> query "
SELECT operation_name, avg(duration_ms), count(*)
FROM profiling_data
WHERE timestamp > now() - interval '5 minutes'
GROUP BY operation_name
ORDER BY avg(duration_ms) DESC
SELECT local_step, AVG(allocated) as avg_mb
FROM python.torch_trace
GROUP BY local_step ORDER BY local_step
"

# Training progress tracking
# Find the slowest collectives
probing -t <pid> query "
SELECT epoch, avg(loss), min(loss), count(*) as steps
FROM training_logs
GROUP BY epoch
ORDER BY epoch
SELECT op, AVG(duration_ms) as avg_ms, COUNT(*) as calls
FROM python.comm_collective
GROUP BY op
ORDER BY avg_ms DESC
LIMIT 5
"
```

Expand Down Expand Up @@ -160,34 +156,44 @@ The REPL provides:

### Distributed Training Analysis
```bash
# Monitor all cluster nodes
probing cluster attach

# Inter-node communication latency
probing -t <pid> query "SELECT src_rank, dst_rank, avg(latency_ms) FROM comm_metrics"

# Cross-node stack trace comparison
probing -t <pid> query "SELECT * FROM python.backtrace"
# See all registered cluster nodes
probing -t <master> cluster nodes

# Cross-rank communication analysis via federation
probing -t <master> query "
SELECT _role, _rank, op, AVG(duration_ms) as avg_ms
FROM global.python.comm_collective
GROUP BY _role, _rank, op
ORDER BY avg_ms DESC
LIMIT 10
"

# GPU utilization analysis
probing -t <pid> query "SELECT avg(gpu_util) FROM gpu_metrics WHERE timestamp > now() - 60"
# GPU utilization across devices
probing -t <pid> query "
SELECT ts, mem_used_pct, gpu_util_pct
FROM gpu.utilization ORDER BY ts DESC LIMIT 20
"
```

### Memory Analysis
```bash
# Quick memory usage overview
probing -t <pid> memory

# Memory growth trend analysis
probing -t <pid> query "SELECT hour(timestamp), avg(memory_mb) FROM memory_usage GROUP BY hour(timestamp)"

# Memory leak detection
# Memory growth trend across steps
probing -t <pid> query "
SELECT function_name, sum(allocated_bytes) as total_alloc
FROM memory_allocations
WHERE timestamp > now() - interval '1 hour'
GROUP BY function_name
ORDER BY total_alloc DESC
SELECT local_step, AVG(allocated_delta) as delta_mb
FROM python.torch_trace
GROUP BY local_step
ORDER BY local_step
"

# Check current GPU memory (allocated vs. reserved) via eval
probing -t <pid> eval "
import torch, gc; gc.collect()
alloc = torch.cuda.memory_allocated()/1024**2
reserved = torch.cuda.memory_reserved()/1024**2
print(f'GPU alloc: {alloc:.0f}MB, reserved: {reserved:.0f}MB')
"
```

Expand Down
7 changes: 7 additions & 0 deletions docs/mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ plugins:
Data Layer: 数据层
Profiling: 性能分析
Distributed: 分布式
Federated Query Engine: 联邦查询引擎
Cluster with Pulsing: 基于 Pulsing 的集群
Extensibility: 扩展机制
Modularity: 模块化与边界
Expand All @@ -109,6 +110,9 @@ plugins:
API Reference: API 参考
Reference: 参考手册
SQL Tables: SQL 表目录
API Reference: API 参考
Environment Variables: 环境变量
Skill Format: Skill 格式规范
Versions: 版本兼容性
Contributing: 贡献指南
- mkdocstrings:
Expand Down Expand Up @@ -142,6 +146,7 @@ nav:
- Profiling: design/profiling.md
- Debugging: design/debugging.md
- Distributed: design/distributed.md
- Federated Query Engine: design/federation.md
- NCCL Profiler: design/nccl-profiler.md
- Cluster with Pulsing: design/cluster-pulsing.md
- Extensibility: design/extensibility.md
Expand All @@ -154,6 +159,8 @@ nav:
- reference/index.md
- SQL Tables: reference/sql-tables.md
- API Reference: api-reference.md
- Environment Variables: reference/env-vars.md
- Skill Format: reference/skill-format.md
- Versions: versions.md
- Contributing: contributing.md

Expand Down
Loading
Loading