Skip to content
48 changes: 44 additions & 4 deletions superbench/common/utils/device_manager.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Device Managerment Library Utility."""
"""Device Management Library Utility."""

import numbers
from typing import Optional

from superbench.common.utils import logger
Expand All @@ -15,6 +16,26 @@
elif gpu.vendor == 'amd' or gpu.vendor == 'amd-graphics':
import amdsmi as rocml

# amdsmi reports power in microwatts on some ROCm versions and in watts on
# others. Any plausible per-GPU watt value is well below 100,000, while µW
# values for real cards are tens of millions, so we use a magnitude-based
# heuristic to detect µW and convert.
_AMDSMI_MICROWATTS_PER_WATT = 1_000_000
_AMDSMI_MICROWATTS_THRESHOLD = 100_000


def _amdsmi_power_to_watts(value):
    """Convert an amdsmi power value to integer watts.

    Values above ``_AMDSMI_MICROWATTS_THRESHOLD`` are treated as microwatts
    and floor-divided by ``_AMDSMI_MICROWATTS_PER_WATT``; smaller values are
    treated as watts already. The result is truncated to ``int``.

    Args:
        value: raw reading taken from an amdsmi power_info field.

    Return:
        int or None: the reading in watts, or None when value is not a usable
        number (a non-numeric placeholder such as 'N/A', a bool, NaN or an
        infinity).
    """
    # bool is a subclass of int, so it must be rejected explicitly.
    if isinstance(value, bool) or not isinstance(value, numbers.Real):
        return None
    if value > _AMDSMI_MICROWATTS_THRESHOLD:
        value = value // _AMDSMI_MICROWATTS_PER_WATT
    try:
        return int(value)
    except (ValueError, OverflowError):
        # int() rejects NaN (ValueError) and +/-inf (OverflowError). Report
        # them as "no reading" so callers can fall back to another field
        # instead of aborting on an exception.
        return None


class DeviceManager:
"""Device management base module."""
Expand Down Expand Up @@ -332,7 +353,14 @@ def __init__(self):

def __del__(self):
    """Destructor.

    Shuts amdsmi down on a best-effort basis and never raises.
    """
    # Be defensive at interpreter shutdown / partial-import time: the
    # module-level ``rocml`` global may have been torn down, or may never
    # have been imported (e.g., when this class is constructed via
    # __new__ in tests). The shutdown call must only ever run inside the
    # guard below; an unguarded call would both shut down twice and let
    # the very errors we want to suppress escape from GC.
    try:
        rocml.amdsmi_shut_down()
    except Exception:
        # Deliberate best-effort: there is nothing useful to do with a
        # failure while the object is being collected.
        pass

def get_device_count(self):
"""Get the number of device.
Expand Down Expand Up @@ -389,10 +417,19 @@ def get_device_power(self, idx):
"""
try:
power_measure = rocml.amdsmi_get_power_info(self._device_handlers[idx])
# amdsmi sets fields to 'N/A' when the hardware reports 0xFFFF (unsupported).
# On MI300X, average_socket_power is unsupported, so fall back to current_socket_power.
for key in ('average_socket_power', 'current_socket_power'):
if key not in power_measure:
logger.warning('amdsmi power_info missing expected key: {}'.format(key))
continue
watts = _amdsmi_power_to_watts(power_measure[key])
if watts is not None:
return watts
return None
except Exception as err:
logger.warning('Get device power failed: {}'.format(str(err)))
return None
return int(power_measure['average_socket_power'])

def get_device_power_limit(self, idx):
"""Get the power management limit of device, unit: watt.
Expand All @@ -405,10 +442,13 @@ def get_device_power_limit(self, idx):
"""
try:
power_measure = rocml.amdsmi_get_power_info(self._device_handlers[idx])
if 'power_limit' not in power_measure:
logger.warning('amdsmi power_info missing expected key: power_limit')
return None
return _amdsmi_power_to_watts(power_measure['power_limit'])
except Exception as err:
logger.warning('Get device power limit failed: {}'.format(str(err)))
return None
return int(power_measure['power_limit'])

def get_device_memory(self, idx):
"""Get the memory information of device, unit: byte.
Expand Down
138 changes: 137 additions & 1 deletion tests/common/test_device_manager.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Tests for nvidia_helper module."""
"""Tests for the device_manager module across NVIDIA and AMD backends."""

import numbers
from unittest import mock

from tests.helper import decorator
from superbench.common.utils import device_manager as dm

_DM_MODULE = 'superbench.common.utils.device_manager'

Comment thread
polarG marked this conversation as resolved.

@decorator.cuda_test
@mock.patch('superbench.common.utils.process.run_command')
Expand Down Expand Up @@ -52,3 +54,137 @@ def test_nvidia_helper_utils(mock_run_command):
'gpu_remap_none': 0
}
assert (gpu_remapped_info == expected)


def _make_amd_manager():
    """Create an AmdDeviceManager without running __init__, so no ROCm stack is needed."""
    manager_cls = dm.AmdDeviceManager
    amd_manager = manager_cls.__new__(manager_cls)
    # A single mocked handle: the tests only ever query device index 0.
    amd_manager._device_handlers = [mock.Mock()]
    return amd_manager


def test_amd_get_device_power_average_supported():
    """A numeric average_socket_power wins and comes back truncated to int."""
    power_info = {
        'average_socket_power': 123.7,
        'current_socket_power': 456,
        'power_limit': 750,
    }
    fake_rocml = mock.Mock()
    fake_rocml.amdsmi_get_power_info.return_value = power_info
    amd_manager = _make_amd_manager()
    with mock.patch(f'{_DM_MODULE}.rocml', fake_rocml, create=True):
        reading = amd_manager.get_device_power(0)
    assert reading == 123


def test_amd_get_device_power_falls_back_to_current():
    """When average_socket_power is 'N/A', current_socket_power is used instead."""
    power_info = {
        'average_socket_power': 'N/A',
        'current_socket_power': 321,
        'power_limit': 750,
    }
    fake_rocml = mock.Mock()
    fake_rocml.amdsmi_get_power_info.return_value = power_info
    amd_manager = _make_amd_manager()
    with mock.patch(f'{_DM_MODULE}.rocml', fake_rocml, create=True):
        reading = amd_manager.get_device_power(0)
    assert reading == 321


def test_amd_get_device_power_both_unsupported_returns_none():
    """If neither socket power field is numeric, the result is None."""
    power_info = {
        'average_socket_power': 'N/A',
        'current_socket_power': 'N/A',
        'power_limit': 750,
    }
    fake_rocml = mock.Mock()
    fake_rocml.amdsmi_get_power_info.return_value = power_info
    amd_manager = _make_amd_manager()
    with mock.patch(f'{_DM_MODULE}.rocml', fake_rocml, create=True):
        reading = amd_manager.get_device_power(0)
    assert reading is None


def test_amd_get_device_power_missing_keys_returns_none():
    """Missing keys -> None and warning logged (no exception).

    The module logger is patched so the "warning logged" half of the contract
    is actually verified rather than only asserted by this docstring.
    """
    manager = _make_amd_manager()
    rocml_mock = mock.Mock()
    rocml_mock.amdsmi_get_power_info.return_value = {}
    with mock.patch(f'{_DM_MODULE}.rocml', rocml_mock, create=True), \
            mock.patch(f'{_DM_MODULE}.logger') as logger_mock:
        assert manager.get_device_power(0) is None
    # One warning per absent field: average_socket_power and current_socket_power.
    assert logger_mock.warning.call_count == 2


def test_amd_get_device_power_microwatts_converted():
    """Socket power given in µW is reported in watts.

    Power and power limit are checked together so that both go through the
    same unit handling; otherwise the monitor record's gpu_power and
    gpu_power_limit could differ by a factor of 1e6.
    """
    power_info = {
        'average_socket_power': 350_000_000,    # 350 W in µW
        'current_socket_power': 360_000_000,
        'power_limit': 750_000_000,
    }
    fake_rocml = mock.Mock()
    fake_rocml.amdsmi_get_power_info.return_value = power_info
    amd_manager = _make_amd_manager()
    with mock.patch(f'{_DM_MODULE}.rocml', fake_rocml, create=True):
        power = amd_manager.get_device_power(0)
        assert power == 350
        limit = amd_manager.get_device_power_limit(0)
        assert limit == 750

Comment thread
polarG marked this conversation as resolved.
Comment thread
polarG marked this conversation as resolved.

def test_amd_get_device_power_limit_microwatts_converted():
    """A µW power_limit such as 750000000 is reported as 750 W."""
    fake_rocml = mock.Mock()
    fake_rocml.amdsmi_get_power_info.return_value = {'power_limit': 750_000_000}
    amd_manager = _make_amd_manager()
    with mock.patch(f'{_DM_MODULE}.rocml', fake_rocml, create=True):
        limit = amd_manager.get_device_power_limit(0)
    assert limit == 750


def test_amd_get_device_power_limit_watts_passthrough():
    """A small power_limit is assumed to be watts already and is left unchanged."""
    fake_rocml = mock.Mock()
    fake_rocml.amdsmi_get_power_info.return_value = {'power_limit': 300}
    amd_manager = _make_amd_manager()
    with mock.patch(f'{_DM_MODULE}.rocml', fake_rocml, create=True):
        limit = amd_manager.get_device_power_limit(0)
    assert limit == 300


def test_amd_get_device_power_limit_non_numeric_returns_none():
    """A power_limit of 'N/A' yields None."""
    fake_rocml = mock.Mock()
    fake_rocml.amdsmi_get_power_info.return_value = {'power_limit': 'N/A'}
    amd_manager = _make_amd_manager()
    with mock.patch(f'{_DM_MODULE}.rocml', fake_rocml, create=True):
        limit = amd_manager.get_device_power_limit(0)
    assert limit is None


def test_amd_get_device_power_limit_missing_key_returns_none():
    """An absent power_limit key yields None rather than an exception."""
    fake_rocml = mock.Mock()
    fake_rocml.amdsmi_get_power_info.return_value = {}
    amd_manager = _make_amd_manager()
    with mock.patch(f'{_DM_MODULE}.rocml', fake_rocml, create=True):
        limit = amd_manager.get_device_power_limit(0)
    assert limit is None


def test_amd_device_manager_lifecycle():
    """Cover construction and destruction of AmdDeviceManager.

    __init__ is expected to call amdsmi_init and amdsmi_get_processor_handles,
    and __del__ is expected to shut amdsmi down while tolerating failures.
    A regression in __del__ would show up as noisy NameError / AttributeError
    messages in benchmark logs at interpreter shutdown.
    """
    healthy_rocml = mock.Mock(**{'amdsmi_get_processor_handles.return_value': ['h0', 'h1']})
    with mock.patch(f'{_DM_MODULE}.rocml', healthy_rocml, create=True):
        live_manager = dm.AmdDeviceManager()
        healthy_rocml.amdsmi_init.assert_called_once()
        assert live_manager.get_device_count() == 2
        live_manager.__del__()
        healthy_rocml.amdsmi_shut_down.assert_called_once()

    # Second scenario: amdsmi is already gone when the destructor runs (as at
    # interpreter shutdown). The failure has to be swallowed, not propagated.
    broken_rocml = mock.Mock(**{'amdsmi_shut_down.side_effect': RuntimeError('rocm gone')})
    manager_cls = dm.AmdDeviceManager
    orphan_manager = manager_cls.__new__(manager_cls)
    orphan_manager._device_handlers = []
    with mock.patch(f'{_DM_MODULE}.rocml', broken_rocml, create=True):
        orphan_manager.__del__()    # must not raise
Loading