Coverage for orchestr_ant_ion / monitoring / gpu.py: 55%

64 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-19 08:36 +0000

1"""Shared GPU probing utilities for system monitors.""" 

2 

3from __future__ import annotations 

4 

5from contextlib import suppress 

6from dataclasses import dataclass 

7from typing import TYPE_CHECKING 

8 

9from loguru import logger 

10 

11 

12if TYPE_CHECKING: 

13 from types import TracebackType 

14 

15 

# pynvml (distributed as nvidia-ml-py) is an optional dependency: when it is
# missing, the probe degrades gracefully instead of crashing at import time.
# GPUProbe checks PYNVML_AVAILABLE before touching any NVML API.
try:
    import pynvml

    PYNVML_AVAILABLE = True
except ImportError:
    PYNVML_AVAILABLE = False
    # Rebind to None so accidental use fails loudly with AttributeError
    # rather than NameError; the ignore silences the type checker on the
    # module-to-None reassignment.
    pynvml = None  # type: ignore[assignment]
    logger.warning("nvidia-ml-py not available. GPU monitoring disabled.")

24 

25 

@dataclass
class GPUSnapshot:
    """Container for a single GPU metrics reading.

    All fields default to zero so a snapshot can be constructed partially;
    values are populated by GPUProbe.read() from NVML queries.
    """

    # Core utilization as reported by NVML utilization rates (percent).
    utilization: float = 0.0
    # Device memory currently allocated, in bytes.
    memory_used_bytes: int = 0
    # Total device memory capacity, in bytes.
    memory_total_bytes: int = 0
    # GPU die temperature in degrees Celsius.
    temperature_celsius: float = 0.0
    # Instantaneous board power draw in watts (0.0 if unsupported).
    power_watts: float = 0.0

35 

36 

class GPUProbe:
    """Shared GPU probe wrapping pynvml init/read/shutdown.

    This class supports both explicit lifecycle management via shutdown()
    and context manager protocol for guaranteed resource cleanup.

    Example:
        >>> with GPUProbe() as gpu:
        ...     snapshot = gpu.read()
        ...     print(f"GPU: {gpu.gpu_name}, Temp: {snapshot.temperature_celsius}C")
    """

    def __init__(self, gpu_index: int = 0) -> None:
        """Initialize pynvml and grab a device handle.

        Never raises: on any NVML failure the probe is simply marked
        unavailable and read() will return None.

        Args:
            gpu_index: GPU device index to monitor (default: 0).
        """
        self.gpu_index = gpu_index
        self._handle: object | None = None
        self.gpu_name = "N/A"
        self.available = False

        if not PYNVML_AVAILABLE:
            return

        nvml_initialized = False
        try:
            pynvml.nvmlInit()
            nvml_initialized = True
            self._handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
            name = pynvml.nvmlDeviceGetName(self._handle)
            # Older pynvml versions return bytes; newer ones return str.
            if isinstance(name, bytes):
                name = name.decode("utf-8")
            self.gpu_name = name
            self.available = True
            logger.success("GPU monitoring initialized: {}", self.gpu_name)
        except (pynvml.NVMLError, RuntimeError) as exc:
            logger.warning("Failed to initialize GPU monitoring: {}", exc)
            self._handle = None
            # BUG FIX: previously, if nvmlInit() succeeded but a later call
            # (e.g. nvmlDeviceGetHandleByIndex for a bad index) raised, NVML
            # stayed initialized forever because shutdown() only releases it
            # when self.available is True. Release the init count here.
            if nvml_initialized:
                with suppress(Exception):
                    pynvml.nvmlShutdown()

    def read(self) -> GPUSnapshot | None:
        """Read current GPU metrics.

        Returns:
            GPUSnapshot with current metrics, or None if unavailable or
            the NVML queries fail.
        """
        if not self.available or self._handle is None:
            return None

        try:
            util = pynvml.nvmlDeviceGetUtilizationRates(self._handle)
            mem = pynvml.nvmlDeviceGetMemoryInfo(self._handle)
            temp = float(
                pynvml.nvmlDeviceGetTemperature(
                    self._handle, pynvml.NVML_TEMPERATURE_GPU
                )
            )

            # Power readings are unsupported on some boards; treat any
            # failure as "no reading" rather than failing the whole snapshot.
            power = 0.0
            with suppress(Exception):
                # NVML reports milliwatts; convert to watts.
                power = pynvml.nvmlDeviceGetPowerUsage(self._handle) / 1000.0

            return GPUSnapshot(
                utilization=float(util.gpu),
                memory_used_bytes=int(mem.used),
                memory_total_bytes=int(mem.total),
                temperature_celsius=temp,
                power_watts=power,
            )
        except (pynvml.NVMLError, RuntimeError) as exc:
            # Debug level: transient read errors are expected (e.g. during
            # driver resets) and should not spam logs.
            logger.debug("Error reading GPU metrics: {}", exc)
            return None

    def shutdown(self) -> None:
        """Release pynvml resources.

        Safe to call multiple times. After shutdown, read() will return None.
        """
        if self._handle is not None and PYNVML_AVAILABLE and self.available:
            # Best-effort: nvmlShutdown may fail if the driver is already
            # gone; we still clear our state below either way.
            with suppress(Exception):
                pynvml.nvmlShutdown()
                logger.debug("GPU monitoring shutdown complete")
        self._handle = None
        self.available = False

    def __enter__(self) -> GPUProbe:
        """Enter context manager."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Exit context manager and ensure cleanup."""
        self.shutdown()