添加 gpu_memory_monitor.py

2025-08-05 18:42:20 +08:00
parent edc6be62b8
commit a0de3ba1a3
1 changed files with 109 additions and 0 deletions
--- a/gpu_memory_monitor.py
+++ b/gpu_memory_monitor.py
@ -0,0 +1,109 @@
+# -*- coding: utf-8 -*-
+import subprocess
+import time
+import re
+import matplotlib.pyplot as plt
+import numpy as np
+import datetime
+
+# Used-memory samples for all GPUs. monitor_gpu_memory() rebinds this to a list
+# holding one inner list per detected GPU and appends one sample per polling cycle.
+gpu_memory_data = []
+# Number of GPUs; stays 0 until monitor_gpu_memory() sets it from get_gpu_count().
+gpu_count = 0
+
+def get_gpu_count():
+    """Return the number of GPUs listed by ``nvidia-smi -L``.
+
+    Only lines that start with ``GPU `` are counted, so empty output and
+    any other line (blank lines, indented sub-entries, stray messages) do
+    not inflate the result.
+
+    Returns:
+        The number of GPU lines found, or 0 if ``nvidia-smi`` cannot be
+        run, exits with an error, or prints undecodable output.
+    """
+    try:
+        output = subprocess.check_output(['nvidia-smi', '-L'])
+        listing = output.decode('utf-8')
+    except (OSError, subprocess.SubprocessError, UnicodeDecodeError) as e:
+        print(f"Error detecting GPUs: {str(e)}")
+        return 0
+    # ''.split('\n') yields [''], which would be miscounted as one GPU;
+    # matching the "GPU " prefix avoids that and skips non-GPU lines.
+    return sum(1 for line in listing.splitlines() if line.startswith('GPU '))
+
+def get_gpu_memory_usage(gpu_id):
+    """Query ``nvidia-smi`` for the ``memory.used`` value of one GPU.
+
+    Args:
+        gpu_id: Index of the GPU, passed to ``nvidia-smi -i``.
+
+    Returns:
+        The reported value parsed as an int (the script labels it MB), or
+        0 after printing an error message if the query or parsing fails.
+    """
+    query = ['nvidia-smi', '--query-gpu=memory.used']
+    query += ['--format=csv,noheader,nounits']
+    query += ['-i', str(gpu_id)]
+    try:
+        raw = subprocess.check_output(query)
+        used = int(raw.decode('utf-8').strip())
+    except Exception as err:
+        print(f"Error getting GPU {gpu_id} memory: {str(err)}")
+        return 0
+    else:
+        return used
+
+def monitor_gpu_memory(interval=1):
+    """Poll the used memory of every detected GPU and print a live readout.
+
+    Resets the module-level ``gpu_count`` and ``gpu_memory_data`` globals,
+    prints a table header, then samples each GPU once per cycle with
+    ``get_gpu_memory_usage`` until the user presses Ctrl+C. Every sample is
+    appended to ``gpu_memory_data[gpu_index]``; the current values are
+    printed on a single line that is overwritten each cycle.
+
+    Args:
+        interval: Seconds to sleep between polling cycles. The time spent
+            querying the GPUs is not subtracted from it.
+    """
+    global gpu_count, gpu_memory_data
+    gpu_count = get_gpu_count()
+
+    if gpu_count == 0:
+        print("No GPUs detected!")
+        return
+
+    print(f"Detected {gpu_count} GPU(s)")
+    gpu_memory_data = [[] for _ in range(gpu_count)]
+
+    # Header and data rows share these widths so every value sits under its
+    # column title and each overwritten row is exactly as long as the header.
+    time_width = 12
+    cell_width = 14
+
+    # Print the table header
+    labels = [f"GPU {i} (MB)" for i in range(gpu_count)]
+    header = f"{'Timestamp':<{time_width}}" + "".join(
+        f"{label:>{cell_width}}" for label in labels
+    )
+    print(header)
+    print("-" * len(header))
+
+    try:
+        while True:
+            timestamp = datetime.datetime.now().strftime("%H:%M:%S")
+            memory_values = []
+
+            for i in range(gpu_count):
+                memory_usage = get_gpu_memory_usage(i)
+                gpu_memory_data[i].append(memory_usage)
+                memory_values.append(memory_usage)
+
+            # '\r' moves the cursor back to the line start so the next
+            # sample overwrites this row instead of scrolling the terminal.
+            data_line = f"{timestamp:<{time_width}}" + "".join(
+                f"{mem:>{cell_width}}" for mem in memory_values
+            )
+            print(data_line, end='\r')
+
+            time.sleep(interval)
+    except KeyboardInterrupt:
+        # Ctrl+C is the normal way to stop; close the table with a rule.
+        print("\n" + "-" * len(header))
+
+def plot_gpu_memory_data():
+    """Plot the samples recorded in ``gpu_memory_data``, one line per GPU.
+
+    Does nothing when ``gpu_memory_data`` is empty. Otherwise the figure is
+    saved to ``gpu_memory_usage.png`` and then shown. Series shorter than
+    the longest one are padded with NaN so all lines share one x axis.
+    """
+    if not gpu_memory_data:
+        return
+
+    max_len = max(len(data) for data in gpu_memory_data)
+    # The x axis must match the padded length used below; sizing it from
+    # series 0 alone makes plt.plot raise when series 0 is not the longest.
+    # NOTE: x values are sample indices, which equal seconds only when the
+    # monitor polled at a 1-second interval.
+    times = np.arange(max_len)
+
+    plt.figure(figsize=(12, 6))
+    for i, samples in enumerate(gpu_memory_data):
+        padding = [np.nan] * (max_len - len(samples))
+        gpu_data = np.array(samples + padding)
+        plt.plot(times, gpu_data, label=f'GPU {i}')
+
+    plt.xlabel('Time (s)')
+    plt.ylabel('Memory Usage (MB)')
+    plt.title('GPU Memory Usage Over Time')
+    plt.legend()
+    plt.grid(True)
+    plt.savefig('gpu_memory_usage.png')
+    plt.show()
+
+if __name__ == '__main__':
+    # Script entry point: monitor until interrupted, then persist and plot
+    # whatever samples were collected.
+    try:
+        print("Monitoring GPU memory usage. Press Ctrl+C to stop.")
+        monitor_gpu_memory(interval=1)
+    except Exception as err:
+        print(f"Error: {str(err)}")
+    finally:
+        if gpu_count > 0:
+            # Dump the raw samples to a text file, one line per GPU
+            with open('gpu_memory_data.txt', 'w') as out_file:
+                for gpu_index in range(gpu_count):
+                    out_file.write(f'GPU {gpu_index} Memory Usage: ')
+                    out_file.writelines(
+                        f"{sample} " for sample in gpu_memory_data[gpu_index]
+                    )
+                    out_file.write('\n')
+            plot_gpu_memory_data()
+            print("Memory usage data has been plotted and saved to gpu_memory_usage.png and gpu_memory_data.txt")