Coverage for orchestr_ant_ion / monitoring / system.py: 75%
127 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-19 08:36 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-19 08:36 +0000
1"""System monitoring module for tracking CPU, GPU, and memory metrics."""
3from __future__ import annotations
5import time
6from dataclasses import dataclass
8import psutil
9from loguru import logger
11from orchestr_ant_ion.monitoring.gpu import GPUProbe
@dataclass
class SystemMetrics:
    """One sample of host resource usage, stamped with its capture time.

    The first five fields are required. The GPU fields default to ``None``
    so a sample can be built when no GPU reading is available.

    Attributes:
        timestamp: Capture time in seconds.
        cpu_percent: CPU utilization as a percentage.
        memory_percent: RAM utilization as a percentage.
        memory_used_mb: RAM in use, in megabytes.
        memory_available_mb: RAM still available, in megabytes.
        gpu_utilization: GPU utilization as a percentage, if known.
        gpu_memory_used_mb: GPU memory in use, in megabytes, if known.
        gpu_memory_total_mb: Total GPU memory, in megabytes, if known.
        gpu_temperature: GPU temperature, if known.
    """

    # Host-side readings (always present).
    timestamp: float
    cpu_percent: float
    memory_percent: float
    memory_used_mb: float
    memory_available_mb: float

    # GPU readings (optional; None when no reading is available).
    gpu_utilization: float | None = None
    gpu_memory_used_mb: float | None = None
    gpu_memory_total_mb: float | None = None
    gpu_temperature: float | None = None
class SystemMonitor:
    """Monitor system resources (CPU, RAM, GPU) and log metrics over time.

    Sampling is driven by the caller: after ``start()``, every call to
    ``record()`` appends one ``SystemMetrics`` snapshot. ``interval`` is
    stored and reported in the start log message; this class does not
    schedule samples itself.

    Example:
        >>> monitor = SystemMonitor(interval=1.0)
        >>> monitor.start()
        >>> # ... do some work ...
        >>> monitor.stop()
        >>> metrics = monitor.get_metrics()
        >>> monitor.print_summary()
    """

    # Divisor used to turn byte counts into the "*_mb" fields (binary MB).
    _BYTES_PER_MB = 1024**2

    def __init__(self, interval: float = 1.0, gpu_index: int = 0) -> None:
        """Initialize the system monitor.

        Args:
            interval: Time interval between measurements in seconds
            gpu_index: GPU device index to monitor (default: 0)
        """
        self.interval = interval
        self.gpu_index = gpu_index
        self.metrics: list[SystemMetrics] = []
        self._monitoring = False
        self._gpu = GPUProbe(gpu_index)

        # Prime psutil's cpu_percent for non-blocking reads
        psutil.cpu_percent(interval=None)

    def _collect_metrics(self) -> SystemMetrics:
        """Collect current system metrics.

        Returns:
            A snapshot of CPU and RAM usage. GPU fields are filled in only
            when the GPU probe returns a reading; otherwise they stay None.
        """
        cpu_percent = psutil.cpu_percent(interval=None)
        memory = psutil.virtual_memory()

        gpu_util = None
        gpu_mem_used = None
        gpu_mem_total = None
        gpu_temp = None

        snapshot = self._gpu.read()
        if snapshot is not None:
            gpu_util = snapshot.utilization
            gpu_mem_used = snapshot.memory_used_bytes / self._BYTES_PER_MB
            gpu_mem_total = snapshot.memory_total_bytes / self._BYTES_PER_MB
            gpu_temp = snapshot.temperature_celsius

        return SystemMetrics(
            timestamp=time.time(),
            cpu_percent=cpu_percent,
            memory_percent=memory.percent,
            memory_used_mb=memory.used / self._BYTES_PER_MB,
            memory_available_mb=memory.available / self._BYTES_PER_MB,
            gpu_utilization=gpu_util,
            gpu_memory_used_mb=gpu_mem_used,
            gpu_memory_total_mb=gpu_mem_total,
            gpu_temperature=gpu_temp,
        )

    def start(self) -> None:
        """Start monitoring and discard any previously collected metrics."""
        self._monitoring = True
        logger.info("System monitoring started (interval: {}s)", self.interval)
        self.metrics = []

    def record(self) -> None:
        """Record a single metric snapshot and log it at debug level.

        Logs a warning and records nothing when the monitor has not been
        started.
        """
        if not self._monitoring:
            logger.warning("Monitor not started. Call start() first.")
            return

        metrics = self._collect_metrics()
        self.metrics.append(metrics)

        parts = [
            f"CPU: {metrics.cpu_percent:.1f}%",
            f"RAM: {metrics.memory_percent:.1f}% ({metrics.memory_used_mb:.0f}MB)",
        ]

        # Each GPU field is optional on its own, so format a segment only
        # when its value exists; formatting None with ":.0f" would raise.
        if metrics.gpu_utilization is not None:
            parts.append(f"GPU: {metrics.gpu_utilization:.1f}%")
        if (
            metrics.gpu_memory_used_mb is not None
            and metrics.gpu_memory_total_mb is not None
        ):
            parts.append(
                f"VRAM: {metrics.gpu_memory_used_mb:.0f}"
                f"/{metrics.gpu_memory_total_mb:.0f}MB"
            )
        if metrics.gpu_temperature is not None:
            parts.append(f"Temp: {metrics.gpu_temperature:.0f}C")

        logger.debug(" | ".join(parts))

    def stop(self) -> None:
        """Stop monitoring. Collected metrics are kept."""
        self._monitoring = False
        logger.info(
            "System monitoring stopped. Collected {} samples.", len(self.metrics)
        )

    def get_metrics(self) -> list[SystemMetrics]:
        """Get all collected metrics.

        Returns:
            The monitor's own list of samples (not a copy), oldest first.
        """
        return self.metrics

    @property
    def is_monitoring(self) -> bool:
        """Return True if monitoring is active."""
        return self._monitoring

    @property
    def gpu_handle(self) -> object | None:
        """Return the GPU handle if initialized."""
        return self._gpu._handle  # noqa: SLF001

    def print_summary(self) -> None:
        """Log a summary (average, min, max) of the collected metrics.

        CPU and memory sections are always emitted. Each GPU section is
        emitted when at least one sample carries a value for it, regardless
        of which sample that is. Logs a warning and returns when no samples
        have been collected.
        """
        if not self.metrics:
            logger.warning("No metrics collected.")
            return

        samples = self.metrics
        banner = "=" * 60

        # (heading, values, value format). GPU lists drop samples without a
        # reading, so an empty list means "nothing to report" for that section.
        sections = (
            ("CPU Usage:", [m.cpu_percent for m in samples], "{:.1f}%"),
            ("Memory Usage:", [m.memory_percent for m in samples], "{:.1f}%"),
            (
                "GPU Usage:",
                [
                    m.gpu_utilization
                    for m in samples
                    if m.gpu_utilization is not None
                ],
                "{:.1f}%",
            ),
            (
                "GPU Memory:",
                [
                    m.gpu_memory_used_mb
                    for m in samples
                    if m.gpu_memory_used_mb is not None
                ],
                "{:.0f}MB",
            ),
            (
                "GPU Temperature:",
                [
                    m.gpu_temperature
                    for m in samples
                    if m.gpu_temperature is not None
                ],
                "{:.1f}C",
            ),
        )

        logger.info(banner)
        logger.info("SYSTEM MONITORING SUMMARY")
        logger.info(banner)
        logger.info("Samples collected: {}", len(samples))
        logger.info(
            "Duration: {:.2f}s",
            samples[-1].timestamp - samples[0].timestamp,
        )

        for heading, values, value_format in sections:
            if not values:
                continue
            stats = self._compute_stats(values)
            logger.info("")
            logger.info(heading)
            logger.info(" Average: " + value_format, stats["avg"])
            logger.info(" Min: " + value_format, stats["min"])
            logger.info(" Max: " + value_format, stats["max"])

        logger.info(banner)

    @staticmethod
    def _compute_stats(values: list[float]) -> dict[str, float]:
        """Compute the average, minimum, and maximum of ``values``.

        Args:
            values: Numbers to summarize.

        Returns:
            A dict with keys ``"avg"``, ``"min"`` and ``"max"``. All three
            are ``0.0`` when ``values`` is empty.
        """
        if not values:
            return {"avg": 0.0, "min": 0.0, "max": 0.0}
        return {
            "avg": sum(values) / len(values),
            "min": min(values),
            "max": max(values),
        }

    def __del__(self) -> None:
        """Cleanup GPU monitoring on deletion."""
        # __del__ also runs when __init__ raised before _gpu was assigned,
        # so the attribute may not exist.
        gpu = getattr(self, "_gpu", None)
        if gpu is not None:
            gpu.shutdown()