Coverage for orchestr_ant_ion/monitoring/system.py: 75%

127 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-19 08:36 +0000

1"""System monitoring module for tracking CPU, GPU, and memory metrics.""" 

2 

3from __future__ import annotations 

4 

5import time 

6from dataclasses import dataclass 

7 

8import psutil 

9from loguru import logger 

10 

11from orchestr_ant_ion.monitoring.gpu import GPUProbe 

12 

13 

@dataclass
class SystemMetrics:
    """Container for system metrics at a specific timestamp.

    CPU/RAM fields are always populated; the GPU fields default to None
    and are only filled in when a GPU probe snapshot is available.
    """

    timestamp: float  # wall-clock time of the sample (time.time(), seconds since epoch)
    cpu_percent: float  # system-wide CPU utilization, 0-100
    memory_percent: float  # RAM utilization, 0-100
    memory_used_mb: float  # RAM in use, MiB
    memory_available_mb: float  # RAM available, MiB
    gpu_utilization: float | None = None  # GPU utilization, 0-100; None if no probe data
    gpu_memory_used_mb: float | None = None  # VRAM in use, MiB; None if no probe data
    gpu_memory_total_mb: float | None = None  # total VRAM, MiB; None if no probe data
    gpu_temperature: float | None = None  # GPU temperature, degrees C; None if no probe data

27 

28 

class SystemMonitor:
    """Monitor system resources (CPU, RAM, GPU) and log metrics over time.

    Example:
        >>> monitor = SystemMonitor(interval=1.0)
        >>> monitor.start()
        >>> # ... do some work ...
        >>> monitor.stop()
        >>> metrics = monitor.get_metrics()
        >>> monitor.print_summary()
    """

    def __init__(self, interval: float = 1.0, gpu_index: int = 0) -> None:
        """Initialize the system monitor.

        Args:
            interval: Time interval between measurements in seconds
            gpu_index: GPU device index to monitor (default: 0)
        """
        self.interval = interval
        self.gpu_index = gpu_index
        self.metrics: list[SystemMetrics] = []
        self._monitoring = False
        self._gpu = GPUProbe(gpu_index)

        # Prime psutil's cpu_percent so later interval=None reads are
        # non-blocking and return a meaningful delta since this call.
        psutil.cpu_percent(interval=None)

    def _collect_metrics(self) -> SystemMetrics:
        """Collect one snapshot of CPU, RAM, and (if available) GPU metrics."""
        cpu_percent = psutil.cpu_percent(interval=None)
        memory = psutil.virtual_memory()

        gpu_util = None
        gpu_mem_used = None
        gpu_mem_total = None
        gpu_temp = None

        snapshot = self._gpu.read()
        if snapshot is not None:
            gpu_util = snapshot.utilization
            # Probe reports bytes; convert to MiB to match the RAM fields.
            gpu_mem_used = snapshot.memory_used_bytes / (1024**2)
            gpu_mem_total = snapshot.memory_total_bytes / (1024**2)
            gpu_temp = snapshot.temperature_celsius

        return SystemMetrics(
            timestamp=time.time(),
            cpu_percent=cpu_percent,
            memory_percent=memory.percent,
            memory_used_mb=memory.used / (1024**2),
            memory_available_mb=memory.available / (1024**2),
            gpu_utilization=gpu_util,
            gpu_memory_used_mb=gpu_mem_used,
            gpu_memory_total_mb=gpu_mem_total,
            gpu_temperature=gpu_temp,
        )

    def start(self) -> None:
        """Start monitoring, discarding any previously collected metrics."""
        self._monitoring = True
        logger.info("System monitoring started (interval: {}s)", self.interval)
        self.metrics = []

    def record(self) -> None:
        """Record a single metric snapshot.

        Logs a warning and does nothing if start() has not been called.
        """
        if not self._monitoring:
            logger.warning("Monitor not started. Call start() first.")
            return

        metrics = self._collect_metrics()
        self.metrics.append(metrics)

        log_msg = f"CPU: {metrics.cpu_percent:.1f}% | RAM: {metrics.memory_percent:.1f}% ({metrics.memory_used_mb:.0f}MB)"

        # GPU fields are set together in _collect_metrics, so a non-None
        # utilization implies the memory/temperature fields are populated too.
        if metrics.gpu_utilization is not None:
            log_msg += f" | GPU: {metrics.gpu_utilization:.1f}% | VRAM: {metrics.gpu_memory_used_mb:.0f}/{metrics.gpu_memory_total_mb:.0f}MB | Temp: {metrics.gpu_temperature:.0f}C"

        logger.debug(log_msg)

    def stop(self) -> None:
        """Stop monitoring. Collected metrics remain available."""
        self._monitoring = False
        logger.info(
            "System monitoring stopped. Collected {} samples.", len(self.metrics)
        )

    def get_metrics(self) -> list[SystemMetrics]:
        """Get all collected metrics."""
        return self.metrics

    @property
    def is_monitoring(self) -> bool:
        """Return True if monitoring is active."""
        return self._monitoring

    @property
    def gpu_handle(self) -> object | None:
        """Return the GPU handle if initialized."""
        return self._gpu._handle  # noqa: SLF001

    def print_summary(self) -> None:
        """Log a formatted summary (avg/min/max) of all collected metrics.

        Logs a warning and returns early when no samples were recorded.
        """
        if not self.metrics:
            logger.warning("No metrics collected.")
            return

        cpu_values = [m.cpu_percent for m in self.metrics]
        mem_values = [m.memory_percent for m in self.metrics]

        cpu_stats = self._compute_stats(cpu_values)
        mem_stats = self._compute_stats(mem_values)

        logger.info("=" * 60)
        logger.info("SYSTEM MONITORING SUMMARY")
        logger.info("=" * 60)
        logger.info("Samples collected: {}", len(self.metrics))
        logger.info(
            "Duration: {:.2f}s",
            self.metrics[-1].timestamp - self.metrics[0].timestamp,
        )
        logger.info("")
        logger.info("CPU Usage:")
        logger.info("  Average: {:.1f}%", cpu_stats["avg"])
        logger.info("  Min: {:.1f}%", cpu_stats["min"])
        logger.info("  Max: {:.1f}%", cpu_stats["max"])
        logger.info("")
        logger.info("Memory Usage:")
        logger.info("  Average: {:.1f}%", mem_stats["avg"])
        logger.info("  Min: {:.1f}%", mem_stats["min"])
        logger.info("  Max: {:.1f}%", mem_stats["max"])

        # Gather GPU readings from every sample rather than gating on the
        # first one: the probe may only start returning data after a few
        # samples (previously a None in metrics[0] suppressed all GPU stats).
        gpu_util_values = [
            m.gpu_utilization for m in self.metrics if m.gpu_utilization is not None
        ]
        gpu_mem_values = [
            m.gpu_memory_used_mb
            for m in self.metrics
            if m.gpu_memory_used_mb is not None
        ]
        gpu_temp_values = [
            m.gpu_temperature for m in self.metrics if m.gpu_temperature is not None
        ]

        if gpu_util_values:
            gpu_util_stats = self._compute_stats(gpu_util_values)
            logger.info("")
            logger.info("GPU Usage:")
            logger.info("  Average: {:.1f}%", gpu_util_stats["avg"])
            logger.info("  Min: {:.1f}%", gpu_util_stats["min"])
            logger.info("  Max: {:.1f}%", gpu_util_stats["max"])

        if gpu_mem_values:
            gpu_mem_stats = self._compute_stats(gpu_mem_values)
            logger.info("")
            logger.info("GPU Memory:")
            logger.info("  Average: {:.0f}MB", gpu_mem_stats["avg"])
            logger.info("  Min: {:.0f}MB", gpu_mem_stats["min"])
            logger.info("  Max: {:.0f}MB", gpu_mem_stats["max"])

        if gpu_temp_values:
            gpu_temp_stats = self._compute_stats(gpu_temp_values)
            logger.info("")
            logger.info("GPU Temperature:")
            logger.info("  Average: {:.1f}C", gpu_temp_stats["avg"])
            logger.info("  Min: {:.1f}C", gpu_temp_stats["min"])
            logger.info("  Max: {:.1f}C", gpu_temp_stats["max"])

        logger.info("=" * 60)

    @staticmethod
    def _compute_stats(values: list[float]) -> dict[str, float]:
        """Compute min, max, and average in a single pass.

        Returns all-zero stats for an empty list (avoids ZeroDivisionError).
        """
        if not values:
            return {"avg": 0.0, "min": 0.0, "max": 0.0}
        total = 0.0
        min_val = float("inf")
        max_val = float("-inf")
        for v in values:
            total += v
            min_val = min(min_val, v)
            max_val = max(max_val, v)
        return {"avg": total / len(values), "min": min_val, "max": max_val}

    def __del__(self) -> None:
        """Best-effort GPU cleanup on garbage collection."""
        # Guard with getattr: if __init__ raised before _gpu was assigned
        # (e.g. GPUProbe construction failed), a bare self._gpu access here
        # would raise AttributeError during interpreter teardown.
        gpu = getattr(self, "_gpu", None)
        if gpu is not None:
            gpu.shutdown()