Coverage for orchestr_ant_ion / monitoring / gpu.py: 55%
64 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-19 08:36 +0000
1"""Shared GPU probing utilities for system monitors."""
3from __future__ import annotations
5from contextlib import suppress
6from dataclasses import dataclass
7from typing import TYPE_CHECKING
9from loguru import logger
12if TYPE_CHECKING:
13 from types import TracebackType
16try:
17 import pynvml
19 PYNVML_AVAILABLE = True
20except ImportError:
21 PYNVML_AVAILABLE = False
22 pynvml = None # type: ignore[assignment]
23 logger.warning("nvidia-ml-py not available. GPU monitoring disabled.")
@dataclass
class GPUSnapshot:
    """Container for a single GPU metrics reading.

    Every field defaults to zero so a snapshot is always constructible,
    even when individual sensors could not be queried.
    """

    utilization: float = 0.0  # GPU utilization as reported by NVML (percent)
    memory_used_bytes: int = 0  # device memory currently in use, in bytes
    memory_total_bytes: int = 0  # total device memory, in bytes
    temperature_celsius: float = 0.0  # core temperature in degrees Celsius
    power_watts: float = 0.0  # current draw in watts; stays 0.0 if the power query is unsupported
class GPUProbe:
    """Shared GPU probe wrapping pynvml init/read/shutdown.

    This class supports both explicit lifecycle management via shutdown()
    and context manager protocol for guaranteed resource cleanup.

    Example:
        >>> with GPUProbe() as gpu:
        ...     snapshot = gpu.read()
        ...     print(f"GPU: {gpu.gpu_name}, Temp: {snapshot.temperature_celsius}C")
    """

    def __init__(self, gpu_index: int = 0) -> None:
        """Initialize pynvml and grab a device handle.

        Never raises on NVML failure: the probe is simply left disabled
        (``available`` stays False) and read() will return None.

        Args:
            gpu_index: GPU device index to monitor (default: 0).
        """
        self.gpu_index = gpu_index
        self._handle: object | None = None
        self.gpu_name = "N/A"
        self.available = False

        if not PYNVML_AVAILABLE:
            return

        # Track whether nvmlInit() succeeded so the failure path below can
        # pair it with nvmlShutdown().
        nvml_initialized = False
        try:
            pynvml.nvmlInit()
            nvml_initialized = True
            self._handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
            name = pynvml.nvmlDeviceGetName(self._handle)
            # Older pynvml releases return bytes; newer ones return str.
            if isinstance(name, bytes):
                name = name.decode("utf-8")
            self.gpu_name = name
            self.available = True
            logger.success("GPU monitoring initialized: {}", self.gpu_name)
        except (pynvml.NVMLError, RuntimeError) as exc:
            logger.warning("Failed to initialize GPU monitoring: {}", exc)
            self._handle = None
            # Bug fix: previously, if nvmlInit() succeeded but a later call
            # failed, NVML was left initialized with no matching shutdown —
            # shutdown() skips nvmlShutdown() when _handle is None. Release
            # the library here so a partially-failed probe does not leak it.
            if nvml_initialized:
                with suppress(Exception):
                    pynvml.nvmlShutdown()

    def read(self) -> GPUSnapshot | None:
        """Read current GPU metrics.

        Returns:
            GPUSnapshot with current metrics, or None if unavailable
            (probe disabled, shut down, or an NVML query failed).
        """
        if not self.available or self._handle is None:
            return None

        try:
            util = pynvml.nvmlDeviceGetUtilizationRates(self._handle)
            mem = pynvml.nvmlDeviceGetMemoryInfo(self._handle)
            temp = float(
                pynvml.nvmlDeviceGetTemperature(
                    self._handle, pynvml.NVML_TEMPERATURE_GPU
                )
            )

            # Power readings are unsupported on some boards: best-effort,
            # falling back to 0.0 W rather than failing the whole read.
            power = 0.0
            with suppress(Exception):
                # NVML reports milliwatts; convert to watts.
                power = pynvml.nvmlDeviceGetPowerUsage(self._handle) / 1000.0

            return GPUSnapshot(
                utilization=float(util.gpu),
                memory_used_bytes=int(mem.used),
                memory_total_bytes=int(mem.total),
                temperature_celsius=temp,
                power_watts=power,
            )
        except (pynvml.NVMLError, RuntimeError) as exc:
            # Read errors can be transient (e.g. device reset/lost); log at
            # debug level and let the caller handle the missing sample.
            logger.debug("Error reading GPU metrics: {}", exc)
            return None

    def shutdown(self) -> None:
        """Release pynvml resources.

        Safe to call multiple times. After shutdown, read() will return None.
        """
        if self._handle is not None and PYNVML_AVAILABLE and self.available:
            with suppress(Exception):
                pynvml.nvmlShutdown()
                logger.debug("GPU monitoring shutdown complete")
        # Always clear state so repeated calls are no-ops and read() bails out.
        self._handle = None
        self.available = False

    def __enter__(self) -> GPUProbe:
        """Enter context manager."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Exit context manager and ensure cleanup."""
        self.shutdown()