添加 gpu_memory_monitor.py

This commit is contained in:
2025-08-05 18:42:20 +08:00
parent edc6be62b8
commit a0de3ba1a3

109
gpu_memory_monitor.py Normal file
View File

@ -0,0 +1,109 @@
# -*- coding: utf-8 -*-
import subprocess
import time
import re
import matplotlib.pyplot as plt
import numpy as np
import datetime
# Shared monitoring state; monitor_gpu_memory() fills both values in.
# Number of GPUs found at start-up (0 until detection has run and succeeded).
gpu_count = 0
# One list of memory readings per GPU: gpu_memory_data[i] belongs to GPU i.
gpu_memory_data = []
def get_gpu_count():
    """Return the number of GPUs listed by ``nvidia-smi -L``.

    Only lines that start with ``GPU `` are counted. An empty listing
    therefore yields 0 (splitting an empty string on newlines would report
    one phantom GPU), and indented sub-device lines, such as MIG instances
    printed beneath their parent GPU, are not counted as extra GPUs.

    Returns:
        int: Number of GPUs found, or 0 when ``nvidia-smi`` cannot be run or
        its output cannot be decoded (the error is printed).
    """
    try:
        output = subprocess.check_output(['nvidia-smi', '-L'])
        listing = output.decode('utf-8')
    except Exception as e:
        # Best effort on purpose: a missing driver/tool just means "no GPUs".
        print(f"Error detecting GPUs: {e}")
        return 0
    return sum(1 for line in listing.splitlines() if line.startswith('GPU '))
def get_gpu_memory_usage(gpu_id):
    """Query ``nvidia-smi`` for the memory currently used on one GPU.

    Args:
        gpu_id: Index of the GPU; it is stringified and passed to
            ``nvidia-smi`` through the ``-i`` option.

    Returns:
        int: The ``memory.used`` value printed by ``nvidia-smi`` parsed as an
        integer, or 0 when the command fails or its output cannot be parsed
        (the error is printed).
    """
    query = ['nvidia-smi', '--query-gpu=memory.used', '--format=csv,noheader,nounits']
    query += ['-i', str(gpu_id)]
    try:
        raw = subprocess.check_output(query)
        text = raw.decode('utf-8')
        return int(text.strip())
    except Exception as e:
        # Any failure (tool missing, bad index, unparsable value) reads as 0.
        print(f"Error getting GPU {gpu_id} memory: {str(e)}")
        return 0
def monitor_gpu_memory(interval=1):
    """Sample every GPU's memory usage until interrupted, printing it live.

    Detects the GPUs with ``get_gpu_count``, resets the module-level
    ``gpu_count`` and ``gpu_memory_data``, then loops: read each GPU through
    ``get_gpu_memory_usage``, append the readings to the per-GPU series and
    overwrite a single status line on the terminal. Ctrl+C ends the loop.

    Args:
        interval: Target time in seconds between the starts of two
            consecutive sampling sweeps.

    Returns:
        None. The samples are left in the module-level ``gpu_memory_data``.

    Raises:
        ValueError: If ``interval`` is negative.
    """
    global gpu_count, gpu_memory_data
    if interval < 0:
        # time.sleep() would reject this anyway; fail before sampling starts.
        raise ValueError("sleep length must be non-negative")
    gpu_count = get_gpu_count()
    if gpu_count == 0:
        print("No GPUs detected!")
        return
    print(f"Detected {gpu_count} GPU(s)")
    gpu_memory_data = [[] for _ in range(gpu_count)]

    # Table header
    header = "Timestamp " + "".join(f"GPU {i} (MB) " for i in range(gpu_count))
    print(header)
    print("-" * len(header))
    try:
        while True:
            sweep_start = time.monotonic()
            timestamp = datetime.datetime.now().strftime("%H:%M:%S")
            # Finish the whole sweep before storing anything, so an interrupt
            # during a query cannot leave the per-GPU series with different
            # lengths.
            memory_values = [get_gpu_memory_usage(i) for i in range(gpu_count)]
            for series, value in zip(gpu_memory_data, memory_values):
                series.append(value)
            # '\r' keeps rewriting the same terminal line with the latest data.
            data_line = f"{timestamp} " + "".join(f"{mem:8} " for mem in memory_values)
            print(data_line, end='\r')
            # Subtract the time spent querying so the sampling period stays
            # close to `interval` instead of interval + query time.
            elapsed = time.monotonic() - sweep_start
            time.sleep(max(0.0, interval - elapsed))
    except KeyboardInterrupt:
        print("\n" + "-" * len(header))
def plot_gpu_memory_data():
    """Plot the recorded per-GPU memory series and save the figure.

    Reads the module-level ``gpu_memory_data``. Every series is drawn against
    its sample index; series shorter than the longest one are padded with NaN
    so all curves share one x axis. The figure is written to
    ``gpu_memory_usage.png`` and then shown.

    Returns:
        None. Nothing is plotted when there are no series at all.
    """
    if not gpu_memory_data:
        return
    max_len = max(len(data) for data in gpu_memory_data)
    # The x axis must span the longest series. Sizing it from the first series
    # makes plt.plot() fail with mismatched lengths whenever series 0 is
    # shorter than the padded y data.
    times = np.arange(max_len)
    plt.figure(figsize=(12, 6))
    for i, samples in enumerate(gpu_memory_data):
        padding = [np.nan] * (max_len - len(samples))
        plt.plot(times, np.array(samples + padding), label=f'GPU {i}')
    # NOTE: x values are sample indices; they equal seconds only when the
    # monitor runs with a one-second interval.
    plt.xlabel('Time (s)')
    plt.ylabel('Memory Usage (MB)')
    plt.title('GPU Memory Usage Over Time')
    plt.legend()
    plt.grid(True)
    # Save before show(): the file is written regardless of how the window
    # is handled afterwards.
    plt.savefig('gpu_memory_usage.png')
    plt.show()
if __name__ == '__main__':
    # Script entry point: monitor until Ctrl+C, then persist whatever was
    # collected (raw text dump plus a plot), even if monitoring failed.
    try:
        print("Monitoring GPU memory usage. Press Ctrl+C to stop.")
        monitor_gpu_memory(interval=1)
    except Exception as exc:
        print(f"Error: {str(exc)}")
    finally:
        if gpu_count > 0:
            # Dump the samples to a text file, one line per GPU.
            with open('gpu_memory_data.txt', 'w') as out_file:
                for gpu_index in range(gpu_count):
                    out_file.write(f'GPU {gpu_index} Memory Usage: ')
                    out_file.write("".join(f"{sample} " for sample in gpu_memory_data[gpu_index]))
                    out_file.write('\n')
            plot_gpu_memory_data()
            print("Memory usage data has been plotted and saved to gpu_memory_usage.png and gpu_memory_data.txt")