-
Notifications
You must be signed in to change notification settings - Fork 15
Expand file tree
/
Copy pathrocprofv3_multi_node.json
More file actions
56 lines (50 loc) · 1.34 KB
/
Copy pathrocprofv3_multi_node.json
File metadata and controls
56 lines (50 loc) · 1.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
{
"_comment": "Multi-Node, Multi-GPU (2 Nodes x 4 GPUs) - SLURM Configuration with ROCm Profiling",
"_description": "Configuration for distributed training across multiple nodes with ROCm profiling tools on SLURM",
"_use_case": "Large-scale multi-node training with communication profiling, power monitoring, and VRAM tracking",
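"_note": "Uses the 'amd-rccl' SLURM partition; change 'slurm.partition' to your cluster's partition name if different (e.g., 'gpu', 'compute'). 'NCCL_SOCKET_IFNAME' is set to 'eth0' and may also need to be changed to match your cluster's network interface. When changing the node or GPU counts, keep 'slurm.nodes'/'slurm.gpus_per_node' consistent with 'distributed.nnodes'/'distributed.nproc_per_node'.",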
"gpu_vendor": "AMD",
"guest_os": "UBUNTU",
"slurm": {
"partition": "amd-rccl",
"nodes": 2,
"gpus_per_node": 4,
"time": "04:00:00",
"output_dir": "./slurm_results",
"exclusive": true
},
"distributed": {
"launcher": "torchrun",
"nnodes": 2,
"nproc_per_node": 4
},
"tools": [
{
"name": "rocprofv3_communication",
"env_vars": {
"RCCL_DEBUG": "INFO",
"NCCL_DEBUG": "INFO"
}
},
{
"name": "gpu_info_power_profiler",
"env_vars": {
"POWER_DEVICE": "all",
"POWER_SAMPLING_RATE": "0.1"
}
},
{
"name": "gpu_info_vram_profiler",
"env_vars": {
"VRAM_DEVICE": "all",
"VRAM_SAMPLING_RATE": "0.1"
}
}
],
"env_vars": {
"OMP_NUM_THREADS": "8",
"NCCL_IB_DISABLE": "0",
"NCCL_SOCKET_IFNAME": "eth0"
},
"debug": false
}