添加 gpu_memory_monitor.py
This commit is contained in:
109
gpu_memory_monitor.py
Normal file
109
gpu_memory_monitor.py
Normal file
@ -0,0 +1,109 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import subprocess
|
||||
import time
|
||||
import re
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import datetime
|
||||
|
||||
# Memory-usage samples for all GPUs. monitor_gpu_memory() rebuilds this as a
# list holding one list of readings per detected GPU.
|
||||
gpu_memory_data = []
|
||||
# Number of GPUs detected; assigned by monitor_gpu_memory() from get_gpu_count().
gpu_count = 0
|
||||
|
||||
def get_gpu_count():
    """Return the number of GPUs listed by ``nvidia-smi -L``.

    The count is the number of non-blank lines in the command's output
    (assumes nvidia-smi prints one line per GPU -- TODO confirm for
    MIG-enabled systems, where extra per-instance lines may appear).

    Returns:
        int: Number of listed GPUs, or 0 when ``nvidia-smi`` cannot be
        started, exits with an error, or its output cannot be decoded.
        In those cases the error is printed to stdout.
    """
    try:
        output = subprocess.check_output(['nvidia-smi', '-L'])
        listing = output.decode('utf-8')
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError) as e:
        print(f"Error detecting GPUs: {str(e)}")
        return 0
    # Count only non-blank lines: splitting empty output on '\n' yields
    # [''], which would otherwise be reported as one GPU.
    return sum(1 for line in listing.splitlines() if line.strip())
|
||||
|
||||
def get_gpu_memory_usage(gpu_id):
    """Return the used-memory reading for one GPU, as reported by nvidia-smi.

    Runs ``nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits``
    restricted to the given GPU and parses the output as an integer.

    Args:
        gpu_id: Index of the GPU to query; passed to ``nvidia-smi -i``.

    Returns:
        int: The parsed reading (the callers in this file label it as MB),
        or 0 when the command cannot be run, exits with an error, or prints
        something that is not an integer. In those cases the error is
        printed to stdout.
    """
    command = [
        'nvidia-smi',
        '--query-gpu=memory.used',
        '--format=csv,noheader,nounits',
        '-i', str(gpu_id),
    ]
    try:
        output = subprocess.check_output(command)
        return int(output.decode('utf-8').strip())
    # ValueError covers both undecodable bytes and non-numeric output.
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        print(f"Error getting GPU {gpu_id} memory: {str(e)}")
        return 0
|
||||
|
||||
def monitor_gpu_memory(interval=1):
    """Poll every GPU's memory usage and display it live until Ctrl+C.

    Sets the module-level ``gpu_count`` from ``get_gpu_count()`` and rebuilds
    ``gpu_memory_data`` as one list of readings per GPU. Each polling round
    appends one reading per GPU and redraws a single status line in place.

    Args:
        interval: Seconds to sleep between polling rounds.

    Returns:
        None. Returns immediately (after printing a message) when no GPU is
        detected; otherwise returns once a KeyboardInterrupt is received.
    """
    global gpu_count, gpu_memory_data
    gpu_count = get_gpu_count()

    if gpu_count == 0:
        print("No GPUs detected!")
        return

    print(f"Detected {gpu_count} GPU(s)")
    gpu_memory_data = [[] for _ in range(gpu_count)]

    # Print the table header.
    header = "Timestamp " + "".join([f"GPU {i} (MB) " for i in range(gpu_count)])
    print(header)
    print("-" * len(header))

    try:
        while True:
            timestamp = datetime.datetime.now().strftime("%H:%M:%S")
            memory_values = []

            # Record each reading as soon as it is taken so that an
            # interrupt in the middle of a round keeps what was collected.
            for gpu_index, samples in enumerate(gpu_memory_data):
                memory_usage = get_gpu_memory_usage(gpu_index)
                samples.append(memory_usage)
                memory_values.append(memory_usage)

            # Redraw the same terminal line. The line ends in '\r' rather
            # than '\n', so it must be flushed explicitly or buffered
            # stdout would not show the update.
            data_line = f"{timestamp} " + "".join([f"{mem:8} " for mem in memory_values])
            print(data_line, end='\r', flush=True)

            time.sleep(interval)
    except KeyboardInterrupt:
        # Move off the live line and close the table.
        print("\n" + "-" * len(header))
|
||||
|
||||
def plot_gpu_memory_data():
    """Plot the collected per-GPU memory readings and save the figure.

    Reads the module-level ``gpu_memory_data`` (one list of readings per
    GPU), draws one curve per GPU, writes the figure to
    ``gpu_memory_usage.png`` and then shows it. Does nothing when no data
    has been collected.
    """
    if not gpu_memory_data:
        return

    max_len = max(len(data) for data in gpu_memory_data)
    # Size the x-axis from the longest series: every series below is padded
    # to max_len, so using the length of GPU 0 alone could mismatch.
    # NOTE(review): x values are sample indices; they match the 'Time (s)'
    # label only when sampling at roughly one reading per second.
    times = np.arange(max_len)

    plt.figure(figsize=(12, 6))
    for gpu_index, samples in enumerate(gpu_memory_data):
        # Pad shorter series with NaN so all curves share the same x-axis.
        padded = np.array(samples + [np.nan] * (max_len - len(samples)))
        plt.plot(times, padded, label=f'GPU {gpu_index}')

    plt.xlabel('Time (s)')
    plt.ylabel('Memory Usage (MB)')
    plt.title('GPU Memory Usage Over Time')
    plt.legend()
    plt.grid(True)
    # Save before show(), as in the original ordering.
    plt.savefig('gpu_memory_usage.png')
    plt.show()
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: monitor until interrupted, then persist and plot
    # whatever was collected.
    try:
        print("Monitoring GPU memory usage. Press Ctrl+C to stop.")
        monitor_gpu_memory(interval=1)
    except Exception as e:
        print(f"Error: {str(e)}")
    finally:
        if gpu_count > 0:
            # Save the raw readings to a text file, one line per GPU.
            with open('gpu_memory_data.txt', 'w') as f:
                for gpu_index in range(gpu_count):
                    readings = "".join(
                        f"{value} " for value in gpu_memory_data[gpu_index]
                    )
                    f.write(f'GPU {gpu_index} Memory Usage: ' + readings + '\n')
            plot_gpu_memory_data()
            print("Memory usage data has been plotted and saved to gpu_memory_usage.png and gpu_memory_data.txt")
|
Reference in New Issue
Block a user