Coverage for orchestr_ant_ion / monitoring / gpu.py: 55%

64 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-19 08:36 +0000

1"""Shared GPU probing utilities for system monitors.""" 

2 

3from __future__ import annotations 

4 

5from contextlib import suppress 

6from dataclasses import dataclass 

7from typing import TYPE_CHECKING 

8 

9from loguru import logger 

10 

11 

12if TYPE_CHECKING: 

13 from types import TracebackType 

14 

15 

# pynvml (distributed as nvidia-ml-py) is an optional dependency: when it is
# missing, the probe degrades gracefully instead of crashing at import time.
# GPUProbe checks PYNVML_AVAILABLE before touching any NVML API.
try:
    import pynvml

    PYNVML_AVAILABLE = True
except ImportError:
    PYNVML_AVAILABLE = False
    # Rebind to None so accidental use fails loudly with AttributeError
    # rather than NameError; the ignore silences the type checker on the
    # module-to-None reassignment.
    pynvml = None  # type: ignore[assignment]
    logger.warning("nvidia-ml-py not available. GPU monitoring disabled.")

24 

25 

@dataclass
class GPUSnapshot:
    """Container for a single GPU metrics reading.

    All fields default to zero so a snapshot can be constructed partially;
    values are populated by GPUProbe.read() from NVML queries.
    """

    # Core utilization as reported by NVML utilization rates (percent).
    utilization: float = 0.0
    # Device memory currently allocated, in bytes.
    memory_used_bytes: int = 0
    # Total device memory capacity, in bytes.
    memory_total_bytes: int = 0
    # GPU die temperature in degrees Celsius.
    temperature_celsius: float = 0.0
    # Instantaneous board power draw in watts (0.0 if unsupported).
    power_watts: float = 0.0

35 

36 

class GPUProbe:
    """Shared GPU probe wrapping pynvml init/read/shutdown.

    This class supports both explicit lifecycle management via shutdown()
    and context manager protocol for guaranteed resource cleanup.

    Example:
        >>> with GPUProbe() as gpu:
        ...     snapshot = gpu.read()
        ...     print(f"GPU: {gpu.gpu_name}, Temp: {snapshot.temperature_celsius}C")
    """

    def __init__(self, gpu_index: int = 0) -> None:
        """Initialize pynvml and grab a device handle.

        Never raises: on any NVML failure the probe is simply marked
        unavailable and read() will return None.

        Args:
            gpu_index: GPU device index to monitor (default: 0).
        """
        self.gpu_index = gpu_index
        self._handle: object | None = None
        self.gpu_name = "N/A"
        self.available = False

        if not PYNVML_AVAILABLE:
            return

        nvml_initialized = False
        try:
            pynvml.nvmlInit()
            nvml_initialized = True
            self._handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
            name = pynvml.nvmlDeviceGetName(self._handle)
            # Older pynvml versions return bytes; newer ones return str.
            if isinstance(name, bytes):
                name = name.decode("utf-8")
            self.gpu_name = name
            self.available = True
            logger.success("GPU monitoring initialized: {}", self.gpu_name)
        except (pynvml.NVMLError, RuntimeError) as exc:
            logger.warning("Failed to initialize GPU monitoring: {}", exc)
            self._handle = None
            # BUG FIX: previously, if nvmlInit() succeeded but a later call
            # (e.g. nvmlDeviceGetHandleByIndex for a bad index) raised, NVML
            # stayed initialized forever because shutdown() only releases it
            # when self.available is True. Release the init count here.
            if nvml_initialized:
                with suppress(Exception):
                    pynvml.nvmlShutdown()

    def read(self) -> GPUSnapshot | None:
        """Read current GPU metrics.

        Returns:
            GPUSnapshot with current metrics, or None if unavailable or
            the NVML queries fail.
        """
        if not self.available or self._handle is None:
            return None

        try:
            util = pynvml.nvmlDeviceGetUtilizationRates(self._handle)
            mem = pynvml.nvmlDeviceGetMemoryInfo(self._handle)
            temp = float(
                pynvml.nvmlDeviceGetTemperature(
                    self._handle, pynvml.NVML_TEMPERATURE_GPU
                )
            )

            # Power readings are unsupported on some boards; treat any
            # failure as "no reading" rather than failing the whole snapshot.
            power = 0.0
            with suppress(Exception):
                # NVML reports milliwatts; convert to watts.
                power = pynvml.nvmlDeviceGetPowerUsage(self._handle) / 1000.0

            return GPUSnapshot(
                utilization=float(util.gpu),
                memory_used_bytes=int(mem.used),
                memory_total_bytes=int(mem.total),
                temperature_celsius=temp,
                power_watts=power,
            )
        except (pynvml.NVMLError, RuntimeError) as exc:
            # Debug level: transient read errors are expected (e.g. during
            # driver resets) and should not spam logs.
            logger.debug("Error reading GPU metrics: {}", exc)
            return None

    def shutdown(self) -> None:
        """Release pynvml resources.

        Safe to call multiple times. After shutdown, read() will return None.
        """
        if self._handle is not None and PYNVML_AVAILABLE and self.available:
            # Best-effort: nvmlShutdown may fail if the driver is already
            # gone; we still clear our state below either way.
            with suppress(Exception):
                pynvml.nvmlShutdown()
                logger.debug("GPU monitoring shutdown complete")
        self._handle = None
        self.available = False

    def __enter__(self) -> GPUProbe:
        """Enter context manager."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Exit context manager and ensure cleanup."""
        self.shutdown()