Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-21 13:44:15 +08:00
Track monitor (#156907)
Track GPU memory allocation. Previously we were tracking GPU memory bandwidth utilization; memory allocation is the metric that reflects whether the GPU is running out of memory (OOM). A UI fix follows in test-infra: https://github.com/pytorch/test-infra/pull/6878/files
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156907
Approved by: https://github.com/huydhn
Committed by: PyTorch MergeBot
Parent: be483a5481
Commit: f76f4abf3f
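The substance of the change is in tools.stats.monitor: the per-sample GPU memory figure switches from NVML's bandwidth-style memory utilization to the actual allocated memory. Below is a minimal standalone sketch (not the monitor code itself) of the two NVML readings the change distinguishes, assuming an NVIDIA GPU and the nvidia-ml-py package pinned in the workflows below.

# Minimal sketch; assumes an NVIDIA GPU and nvidia-ml-py (module name "pynvml").
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)

# What was tracked before: memory *bandwidth* utilization, i.e. the percent of
# time the memory controller was busy over the last sample period.
rates = pynvml.nvmlDeviceGetUtilizationRates(handle)
print("mem bandwidth util %:", rates.memory)

# What this commit adds: actual memory *allocation*, which is what indicates
# whether the GPU is close to OOM.
mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
allocated_mb = mem.used / 1024**2
total_mb = mem.total / 1024**2
print("allocated MB:", allocated_mb, "of", total_mb,
      "=", allocated_mb / total_mb * 100, "%")

pynvml.nvmlShutdown()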
.github/actions/linux-test/action.yml
@@ -126,7 +126,7 @@ runs:
       shell: bash
       continue-on-error: true
       run: |
-        python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
+        python3 -m pip install psutil==5.9.8 nvidia-ml-py==11.525.84
         python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
         echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
@@ -16,7 +16,7 @@ packaging==25.0
 parameterized==0.8.1
 pillow==10.3.0
 protobuf==5.29.4
-psutil==5.9.1
+psutil==5.9.8
 pygments==2.15.0
 pytest-cpp==2.3.0
 pytest-flakefinder==1.1.0
.github/workflows/_linux-build.yml
@@ -225,7 +225,7 @@ jobs:
           MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
         run: |
           mkdir -p ../../usage_logs
-          python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7
+          python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7
           python3 -m tools.stats.monitor \
             --log-interval "$MONITOR_LOG_INTERVAL" \
             --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" \
.github/workflows/_linux-test.yml
@@ -205,7 +205,7 @@ jobs:
           MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
           MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
         run: |
-          python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
+          python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
           python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
           echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
.github/workflows/_mac-test.yml
@@ -136,7 +136,7 @@ jobs:
           MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
           MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
         run: |
-          "$VENV_PATH/bin/python3" -m pip install psutil==5.9.1 dataclasses_json==0.6.7
+          "$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_json==0.6.7
           "$VENV_PATH/bin/python3" -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
           echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
.github/workflows/_rocm-test.yml
@@ -132,7 +132,7 @@ jobs:
         shell: bash
         continue-on-error: true
         run: |
-          python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7
+          python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7
           python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
           echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
.github/workflows/_win-test.yml
@@ -138,7 +138,7 @@ jobs:
         continue-on-error: true
         run: |
           # Windows conda doesn't have python3 binary, only python, but it's python3
-          ${CONDA_RUN} python -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
+          ${CONDA_RUN} python -m pip install psutil==5.9.8 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
           ${CONDA_RUN} python -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
           echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
.github/workflows/_xpu-test.yml
@@ -133,7 +133,7 @@ jobs:
           MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
           MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
         run: |
-          python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
+          python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
           python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
           echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
@@ -78,6 +78,9 @@ class GpuData:
     uuid: str
     utilization: float
     mem_utilization: float
+    allocated_mem: float
+    allocated_mem_value: float
+    total_mem_value: float


 try:
@@ -259,6 +262,7 @@ class UsageLogger:
         return UtilizationStats(
             avg=round(avg, 2),
             max=round(maxi, 2),
+            raw=data_list,
         )

     def _output_data(self) -> None:
@@ -338,20 +342,33 @@ class UsageLogger:
         calculate_gpu = []
         gpu_mem_utilization = defaultdict(list)
         gpu_utilization = defaultdict(list)
+        gpu_allocated_mem = defaultdict(list)
+        gpu_allocated_mem_values = defaultdict(list)
+        gpu_total_mem_values = defaultdict(float)

         for data in data_list:
             for gpu in data.gpu_list:
                 gpu_mem_utilization[gpu.uuid].append(gpu.mem_utilization)
                 gpu_utilization[gpu.uuid].append(gpu.utilization)
+                gpu_allocated_mem[gpu.uuid].append(gpu.allocated_mem)
+                gpu_allocated_mem_values[gpu.uuid].append(gpu.allocated_mem_value)
+                gpu_total_mem_values[gpu.uuid] = gpu.total_mem_value

         for gpu_uuid in gpu_utilization.keys():
             gpu_util_stats = self._generate_stats(gpu_utilization[gpu_uuid])
             gpu_mem_util_stats = self._generate_stats(gpu_mem_utilization[gpu_uuid])
+            gpu_allocated_mem_stats = self._generate_stats(gpu_allocated_mem[gpu_uuid])
+            gpu_allocated_mem_value_stats = self._generate_stats(
+                gpu_allocated_mem_values[gpu_uuid]
+            )
             calculate_gpu.append(
                 GpuUsage(
                     uuid=gpu_uuid,
                     util_percent=gpu_util_stats,
                     mem_util_percent=gpu_mem_util_stats,
+                    allocated_mem_percent=gpu_allocated_mem_stats,
+                    allocated_mem_value=gpu_allocated_mem_value_stats,
+                    total_mem_value=gpu_total_mem_values[gpu_uuid],
                 )
             )
         return calculate_gpu
@@ -382,11 +399,21 @@ class UsageLogger:
                 # see https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html
                 gpu_utilization = pynvml.nvmlDeviceGetUtilizationRates(gpu_handle)
                 gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
+                gpu_memory_info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
+                mem_utilization = gpu_utilization.memory
+
+                allocate_mem_MB = gpu_memory_info.used / 1024**2
+                total_mem_MB = gpu_memory_info.total / 1024**2
+                allocate_mem_percent = allocate_mem_MB / total_mem_MB * 100
+
                 gpu_data_list.append(
                     GpuData(
                         uuid=gpu_uuid,
                         utilization=gpu_utilization.gpu,
-                        mem_utilization=gpu_utilization.memory,
+                        mem_utilization=mem_utilization,
+                        allocated_mem=allocate_mem_percent,
+                        allocated_mem_value=allocate_mem_MB,
+                        total_mem_value=total_mem_MB,
                     )
                 )
         elif self._has_amdsmi:
@@ -397,11 +424,20 @@ class UsageLogger:
                 gpu_uuid = amdsmi.amdsmi_get_gpu_device_uuid(handle)
                 gpu_utilization = engine_usage["gfx_activity"]
                 gpu_mem_utilization = gpu_utilization["umc_activity"]
+                mem_info = amdsmi.amdsmi_get_gpu_memory_usage(handle)
+
+                allocate_mem_MB = mem_info["vram_usage"] / 1024**2
+                total_mem_MB = mem_info["vram_total"] / 1024**2
+                allocate_mem_percent = allocate_mem_MB / total_mem_MB * 100
+
                 gpu_data_list.append(
                     GpuData(
                         uuid=gpu_uuid,
                         utilization=gpu_utilization,
                         mem_utilization=gpu_mem_utilization,
+                        allocated_mem=allocate_mem_percent,
+                        allocated_mem_value=allocate_mem_MB,
+                        total_mem_value=total_mem_MB,
                     )
                 )
         return gpu_data_list
@@ -499,7 +535,9 @@ class UsageLogger:
                 cmd = " ".join(process.cmdline())
                 processName = process.name()
                 pid = process.pid
-                if "python" in processName and cmd.startswith("python"):
+                is_python = "python" in processName and "python" in cmd
+                is_pytest = "pytest" in cmd
+                if is_python or is_pytest:
                     python_test_processes.append({"pid": pid, "cmd": cmd})
             except Exception:
                 pass
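The hunk above also broadens which processes the monitor counts as test processes: instead of requiring the command line to start with "python", it now matches any command line containing "python" or "pytest". A minimal standalone sketch of that predicate over a psutil process scan (assuming only psutil; this is not the monitor's own scanning code):

# Minimal sketch of the broadened filter; assumes only psutil.
import psutil

python_test_processes = []
for process in psutil.process_iter():
    try:
        cmd = " ".join(process.cmdline())
        name = process.name()
        # New predicate: any python process, or anything invoking pytest,
        # not just command lines that literally start with "python".
        is_python = "python" in name and "python" in cmd
        is_pytest = "pytest" in cmd
        if is_python or is_pytest:
            python_test_processes.append({"pid": process.pid, "cmd": cmd})
    except Exception:
        # Processes can exit or deny access mid-scan; skip them.
        pass

print(python_test_processes)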
@@ -5,7 +5,7 @@ from typing import Optional
 from dataclasses_json import DataClassJsonMixin


-_DATA_MODEL_VERSION = 1.0
+_DATA_MODEL_VERSION = 1.5


 # data model for test log usage
@@ -13,6 +13,7 @@ _DATA_MODEL_VERSION = 1.0
 class UtilizationStats:
     avg: Optional[float] = None
     max: Optional[float] = None
+    raw: Optional[list[float]] = None


 @dataclass
@@ -36,6 +37,9 @@ class GpuUsage(DataClassJsonMixin):
     uuid: Optional[str] = None
     util_percent: Optional[UtilizationStats] = None
     mem_util_percent: Optional[UtilizationStats] = None
+    allocated_mem_percent: Optional[UtilizationStats] = None
+    allocated_mem_value: Optional[UtilizationStats] = None
+    total_mem_value: Optional[float] = None


 @dataclass
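These new GpuUsage fields end up in the JSON usage log that the test-infra UI change linked in the commit message consumes. A minimal sketch of how DataClassJsonMixin serializes them, using simplified stand-in dataclasses and made-up values rather than the real module (whose path and remaining fields are not shown in this diff):

# Minimal sketch; stand-in dataclasses mirror only the fields added above,
# and the values are illustrative, not real measurements.
from dataclasses import dataclass
from typing import Optional

from dataclasses_json import DataClassJsonMixin


@dataclass
class UtilizationStats(DataClassJsonMixin):
    avg: Optional[float] = None
    max: Optional[float] = None
    raw: Optional[list[float]] = None


@dataclass
class GpuUsage(DataClassJsonMixin):
    uuid: Optional[str] = None
    allocated_mem_percent: Optional[UtilizationStats] = None
    allocated_mem_value: Optional[UtilizationStats] = None
    total_mem_value: Optional[float] = None


usage = GpuUsage(
    uuid="GPU-hypothetical-uuid",
    allocated_mem_percent=UtilizationStats(avg=41.2, max=87.5),
    allocated_mem_value=UtilizationStats(avg=33712.0, max=71680.0),  # MB
    total_mem_value=81920.0,  # MB
)
print(usage.to_json())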