109 lines
3.4 KiB
Python
109 lines
3.4 KiB
Python
# -*- coding: utf-8 -*-
|
|
import subprocess
|
|
import time
|
|
import re
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
import datetime
|
|
|
|
# Module-level state shared by the monitoring loop, the plotting function and
# the script entry point.

# Number of GPUs; stays 0 until monitor_gpu_memory() stores a detected count.
gpu_count: int = 0

# Memory-usage samples for all GPUs: monitor_gpu_memory() rebinds this to one
# inner list per GPU and appends one reading per polling cycle.
gpu_memory_data: list = []
|
|
|
|
def get_gpu_count():
    """Return the number of GPUs listed by ``nvidia-smi -L``.

    Only lines that begin with ``GPU `` are counted. An empty listing
    therefore yields 0 (splitting an empty string on newlines would
    otherwise produce one bogus entry), and indented sub-device lines
    printed under a GPU (for example MIG instances) are not mistaken for
    additional GPUs.

    Returns:
        int: Number of GPU lines found, or 0 when ``nvidia-smi`` cannot be
        executed or exits with an error (the error is printed).
    """
    try:
        output = subprocess.check_output(['nvidia-smi', '-L'])
    except Exception as e:
        # Best-effort detection: a missing or failing nvidia-smi means "no GPUs".
        print(f"Error detecting GPUs: {str(e)}")
        return 0

    # Decode tolerantly so an odd byte in a device name cannot abort detection.
    listing = output.decode('utf-8', errors='replace')
    return sum(1 for line in listing.splitlines() if line.startswith('GPU '))
|
|
|
|
def get_gpu_memory_usage(gpu_id):
    """Read the ``memory.used`` value of a single GPU from ``nvidia-smi``.

    Args:
        gpu_id: GPU selector handed to ``nvidia-smi -i`` after ``str()``
            conversion.

    Returns:
        int: The number printed by ``nvidia-smi`` (header and units are
        suppressed by the format flags), or 0 if the command fails or its
        output is not an integer. The failure is printed, not raised.
    """
    query_flag = '--query-gpu=memory.used'
    format_flag = '--format=csv,noheader,nounits'
    command = ['nvidia-smi', query_flag, format_flag, '-i', str(gpu_id)]

    try:
        raw = subprocess.check_output(command)
        text = raw.decode('utf-8')
        return int(text.strip())
    except Exception as e:
        # Any failure (missing binary, bad id, unparsable output) maps to 0.
        print(f"Error getting GPU {gpu_id} memory: {str(e)}")
        return 0
|
|
|
|
def monitor_gpu_memory(interval=1):
    """Poll the memory usage of every GPU and show it as a live readout.

    Detects the GPU count, resets the module-level ``gpu_memory_data`` to one
    empty list per GPU, prints a table header, then loops until interrupted
    with Ctrl+C. Each cycle queries every GPU, appends the reading to that
    GPU's list and rewrites the current console line with the new values.

    Args:
        interval: Seconds to sleep between polling cycles. The time spent
            querying the GPUs is not subtracted, so the real period is
            slightly longer than ``interval``.

    Returns:
        None. Results are left in the module globals ``gpu_count`` and
        ``gpu_memory_data``. Returns immediately if no GPU is detected.
    """
    global gpu_count, gpu_memory_data
    gpu_count = get_gpu_count()

    if gpu_count == 0:
        print("No GPUs detected!")
        return

    print(f"Detected {gpu_count} GPU(s)")
    gpu_memory_data = [[] for _ in range(gpu_count)]

    # Table header, followed by a rule of the same width.
    header = "Timestamp " + "".join([f"GPU {i} (MB) " for i in range(gpu_count)])
    print(header)
    print("-" * len(header))

    try:
        while True:
            timestamp = datetime.datetime.now().strftime("%H:%M:%S")
            memory_values = []

            # Record each reading as soon as it is taken so an interrupt in
            # the middle of a cycle keeps the samples gathered so far.
            for i in range(gpu_count):
                memory_usage = get_gpu_memory_usage(i)
                gpu_memory_data[i].append(memory_usage)
                memory_values.append(memory_usage)

            # '\r' returns the cursor to the line start so the next cycle
            # overwrites this one. Flush explicitly: the line has no newline,
            # and stdout is block-buffered when it is not a terminal, so the
            # readout would otherwise lag behind when piped or redirected.
            data_line = f"{timestamp} " + "".join([f"{mem:8} " for mem in memory_values])
            print(data_line, end='\r', flush=True)

            time.sleep(interval)
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to stop: move past the live line and
        # close the table.
        print("\n" + "-" * len(header))
|
|
|
|
def plot_gpu_memory_data():
    """Plot the recorded memory-usage series of every GPU and save the figure.

    Reads the module-level ``gpu_memory_data`` (one list of samples per GPU),
    draws one line per GPU, writes the figure to ``gpu_memory_usage.png`` and
    then shows it. Does nothing when no data has been recorded.

    The x values are sample indices. They correspond to the 'Time (s)' axis
    label only when samples were taken about one second apart.
    """
    if not gpu_memory_data:
        return

    # Series can differ in length (e.g. monitoring was interrupted part-way
    # through a polling cycle), so size the x-axis by the longest series and
    # pad the shorter ones. Sizing it by GPU 0's series would make plt.plot
    # fail with a shape mismatch whenever GPU 0 is not the longest.
    max_len = max(len(data) for data in gpu_memory_data)
    times = np.arange(max_len)

    plt.figure(figsize=(12, 6))
    for i, samples in enumerate(gpu_memory_data):
        # NaN padding leaves a gap in the line instead of drawing fake values.
        padding = [np.nan] * (max_len - len(samples))
        gpu_data = np.array(samples + padding)
        plt.plot(times, gpu_data, label=f'GPU {i}')

    plt.xlabel('Time (s)')
    plt.ylabel('Memory Usage (MB)')
    plt.title('GPU Memory Usage Over Time')
    plt.legend()
    plt.grid(True)
    # Save before show().
    plt.savefig('gpu_memory_usage.png')
    plt.show()
|
|
|
|
if __name__ == '__main__':
    # Script entry point: monitor until Ctrl+C, then persist whatever was
    # collected, even if monitoring ended with an unexpected error.
    try:
        print("Monitoring GPU memory usage. Press Ctrl+C to stop.")
        monitor_gpu_memory(interval=1)
    except Exception as e:
        print(f"Error: {str(e)}")
    finally:
        if gpu_count > 0:
            # Write the raw samples to a text file, one line per GPU with
            # space-separated readings.
            with open('gpu_memory_data.txt', 'w') as f:
                for gpu_index in range(gpu_count):
                    readings = "".join(
                        f"{data} " for data in gpu_memory_data[gpu_index]
                    )
                    f.write(f'GPU {gpu_index} Memory Usage: ' + readings + '\n')

            plot_gpu_memory_data()
            print("Memory usage data has been plotted and saved to gpu_memory_usage.png and gpu_memory_data.txt")