# -*- coding: utf-8 -*-
"""Monitor per-GPU memory usage with ``nvidia-smi`` and plot it afterwards.

Run as a script: samples are taken until Ctrl+C is pressed, then the
collected values are written to ``gpu_memory_data.txt`` and, when matplotlib
is available, plotted to ``gpu_memory_usage.png``.
"""
import datetime
import re  # noqa: F401
import subprocess
import time

import numpy as np

try:
    import matplotlib.pyplot as plt
except ImportError:
    # Plotting is optional: monitoring and the text dump still work without it.
    plt = None

# Memory samples (MB) for every GPU: gpu_memory_data[i] is the series of GPU i.
gpu_memory_data = []
gpu_count = 0

# Fixed cell widths so the header and the live data row line up.
_FIRST_COLUMN_WIDTH = 10
_GPU_COLUMN_WIDTH = 12

# Failures expected from running nvidia-smi and parsing its output:
# missing binary (OSError), non-zero exit (SubprocessError), bad text (ValueError).
_NVIDIA_SMI_ERRORS = (OSError, subprocess.SubprocessError, ValueError)


def count_gpus_in_listing(listing):
    """Return the number of GPUs described by ``nvidia-smi -L`` output.

    Only lines that start with ``GPU `` are counted, so empty output yields 0
    and indented sub-device lines are not mistaken for extra GPUs.
    """
    return sum(1 for line in listing.splitlines() if line.startswith('GPU '))


def get_gpu_count():
    """Return the number of GPUs reported by ``nvidia-smi -L``.

    Returns 0 (after printing the error) when ``nvidia-smi`` cannot be run or
    its output cannot be decoded.
    """
    try:
        output = subprocess.check_output(['nvidia-smi', '-L'])
        return count_gpus_in_listing(output.decode('utf-8'))
    except _NVIDIA_SMI_ERRORS as e:
        print(f"Error detecting GPUs: {str(e)}")
        return 0


def get_gpu_memory_usage(gpu_id):
    """Return the used memory of GPU ``gpu_id`` in MB.

    Returns 0 (after printing the error) when the query fails or the output
    is not an integer, so a single failed sample does not stop monitoring.
    """
    command = [
        'nvidia-smi',
        '--query-gpu=memory.used',
        '--format=csv,noheader,nounits',
        '-i', str(gpu_id),
    ]
    try:
        output = subprocess.check_output(command)
        return int(output.decode('utf-8').strip())
    except _NVIDIA_SMI_ERRORS as e:
        print(f"Error getting GPU {gpu_id} memory: {str(e)}")
        return 0


def format_table_row(first, cells):
    """Format one table row: ``first`` left-aligned, every cell right-aligned.

    Used for both the header and the data rows so their columns match.
    """
    row = f"{first:<{_FIRST_COLUMN_WIDTH}}"
    return row + "".join(f"{cell:>{_GPU_COLUMN_WIDTH}}" for cell in cells)


def monitor_gpu_memory(interval=1):
    """Sample every GPU's memory usage until Ctrl+C and show it live.

    Args:
        interval: Seconds to sleep between two sampling rounds.

    Side effects:
        Sets the module globals ``gpu_count`` and ``gpu_memory_data`` and
        prints a header plus a continuously overwritten data row.
    """
    global gpu_count, gpu_memory_data

    gpu_count = get_gpu_count()
    if gpu_count == 0:
        print("No GPUs detected!")
        return

    print(f"Detected {gpu_count} GPU(s)")
    gpu_memory_data = [[] for _ in range(gpu_count)]

    # Print the table header.
    header = format_table_row(
        "Timestamp", [f"GPU {i} (MB)" for i in range(gpu_count)]
    )
    print(header)
    print("-" * len(header))

    try:
        while True:
            timestamp = datetime.datetime.now().strftime("%H:%M:%S")
            memory_values = []
            for gpu_id, samples in enumerate(gpu_memory_data):
                memory_usage = get_gpu_memory_usage(gpu_id)
                samples.append(memory_usage)
                memory_values.append(memory_usage)

            # '\r' keeps the cursor on the same line so the next round
            # overwrites this row instead of scrolling.
            print(format_table_row(timestamp, memory_values), end='\r')
            time.sleep(interval)
    except KeyboardInterrupt:
        print("\n" + "-" * len(header))


def plot_gpu_memory_data(interval=1):
    """Plot the collected memory series and save it to gpu_memory_usage.png.

    Args:
        interval: Sampling interval in seconds used while monitoring; it
            scales the x axis. The axis is approximate because the time
            spent inside ``nvidia-smi`` is not accounted for.

    Returns:
        True when a figure was saved, False when there was nothing to plot
        or matplotlib is not installed.
    """
    max_len = max((len(samples) for samples in gpu_memory_data), default=0)
    if max_len == 0:
        return False
    if plt is None:
        print("matplotlib is not installed; skipping the plot.")
        return False

    times = np.arange(max_len) * interval

    plt.figure(figsize=(12, 6))
    for gpu_id, samples in enumerate(gpu_memory_data):
        # An interrupt in the middle of a round leaves later GPUs one sample
        # short; pad with NaN so every series has the same length.
        padded = np.full(max_len, np.nan)
        padded[:len(samples)] = samples
        plt.plot(times, padded, label=f'GPU {gpu_id}')

    plt.xlabel('Time (s)')
    plt.ylabel('Memory Usage (MB)')
    plt.title('GPU Memory Usage Over Time')
    plt.legend()
    plt.grid(True)
    # Save before show(): some backends clear the figure once it is closed.
    plt.savefig('gpu_memory_usage.png')
    plt.show()
    return True


def format_memory_record(gpu_id, samples):
    """Return the data-file line for one GPU, including the newline."""
    values = "".join(f"{value} " for value in samples)
    return f'GPU {gpu_id} Memory Usage: {values}\n'


def save_gpu_memory_data(path='gpu_memory_data.txt'):
    """Write every GPU's collected samples to ``path``, one line per GPU."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(
            format_memory_record(gpu_id, samples)
            for gpu_id, samples in enumerate(gpu_memory_data)
        )


def main():
    """Monitor until interrupted, then save and plot the collected data."""
    interval = 1
    try:
        print("Monitoring GPU memory usage. Press Ctrl+C to stop.")
        monitor_gpu_memory(interval=interval)
    except Exception as e:
        print(f"Error: {str(e)}")
    finally:
        # Persist whatever was collected, even after an unexpected error.
        if gpu_count > 0:
            save_gpu_memory_data()
            if plot_gpu_memory_data(interval=interval):
                print("Memory usage data has been plotted and saved to gpu_memory_usage.png and gpu_memory_data.txt")
            else:
                print("Memory usage data has been saved to gpu_memory_data.txt")


if __name__ == '__main__':
    main()