diff --git a/check_mk-amd-gpu/amd-gpu-0.1.0.mkp b/check_mk-amd-gpu/amd-gpu-0.1.0.mkp
new file mode 100755
index 0000000..28c40ed
Binary files /dev/null and b/check_mk-amd-gpu/amd-gpu-0.1.0.mkp differ
diff --git a/check_mk-amd-gpu/local/lib/check_mk/base/plugins/agent_based/amd_gpu.py b/check_mk-amd-gpu/local/lib/check_mk/base/plugins/agent_based/amd_gpu.py
new file mode 100644
index 0000000..daf3add
--- /dev/null
+++ b/check_mk-amd-gpu/local/lib/check_mk/base/plugins/agent_based/amd_gpu.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+#
+# Copyright 2024 Spearhead Systems SRL
+
+from cmk.base.plugins.agent_based.agent_based_api.v1 import (
+    register,
+    Service,
+    Result,
+    Metric,
+    State,
+)
+
+# Divisor used to convert the agent's byte counts to MiB for display.
+BYTES_PER_MIB = 1048576
+
+
+def discovery_amd_gpu(section):
+    """Yield one service whose item is the first field of the first row.
+
+    An empty section, or an empty first row, yields no service instead of
+    raising IndexError.
+    """
+    # NOTE(review): only the first field is used as the item; if rows are
+    # whitespace-split, a multi-word GPU name is cut to its first word.
+    # Changing that would rename already-discovered services, so it is kept.
+    if not section or not section[0]:
+        return
+    yield Service(item=section[0][0])
+
+
+def check_state(alert_percentages, measured_percent):
+    """Map a measured percentage onto a state using (warn, crit) levels.
+
+    Returns State.OK when no levels are configured. A level counts as
+    breached when the measurement is greater than or equal to it.
+    """
+    if not alert_percentages:
+        return State.OK
+    if alert_percentages[1] <= measured_percent:
+        return State.CRIT
+    if alert_percentages[0] <= measured_percent:
+        return State.WARN
+    return State.OK
+
+
+def get_levels(alert_levels, total=None):
+    """Return metric levels, optionally scaled from percent to absolute.
+
+    Without configured levels None is returned. When total is given, each
+    percentage level is converted into that percentage of total.
+    """
+    if alert_levels is None:
+        return None
+
+    if total is None:
+        return alert_levels
+
+    return (alert_levels[0] / 100 * total, alert_levels[1] / 100 * total)
+
+
+def check_amd_gpu(item, params, section):
+    """Report GPU utilization and VRAM usage for the discovered GPU.
+
+    Reads the first field of four rows: GPU name, GPU utilization in
+    percent, VRAM bytes used and VRAM bytes total. Yields nothing when the
+    section is incomplete or describes a different item.
+    """
+    # A truncated section (e.g. the agent could not read one value) would
+    # otherwise raise IndexError below.
+    if len(section) < 4 or not all(section[:4]):
+        return
+
+    if item != section[0][0]:
+        return
+
+    # Parse via float so values printed with a fraction or an exponent by the
+    # agent are accepted as well as plain integers.
+    gpu_percent = int(float(section[1][0]))
+    vram_bytes_used = int(float(section[2][0]))
+    vram_bytes_total = int(float(section[3][0]))
+    vram_bytes_free = max(0, vram_bytes_total - vram_bytes_used)
+
+    vram_mb_used = vram_bytes_used // BYTES_PER_MIB
+    vram_mb_total = vram_bytes_total // BYTES_PER_MIB
+    vram_mb_free = vram_bytes_free // BYTES_PER_MIB
+
+    alert_gpu_percent = params.get("gpu_percent")
+    alert_vram_used_percent = params.get("vram_used_percent")
+    alert_vram_free_percent = params.get("vram_free_percent")
+
+    if vram_bytes_total > 0:
+        vram_used_percent = vram_bytes_used / vram_bytes_total * 100
+        vram_free_percent = 100 - vram_used_percent
+        vram_used_state = check_state(alert_vram_used_percent, vram_used_percent)
+        vram_free_state = check_state(alert_vram_free_percent, vram_free_percent)
+    else:
+        # Percentages are undefined without a total; flag it instead of
+        # crashing with ZeroDivisionError.
+        vram_used_state = State.UNKNOWN
+        vram_free_state = State.UNKNOWN
+
+    yield Result(
+        state=check_state(alert_gpu_percent, gpu_percent),
+        summary=f"GPU: {gpu_percent}%"
+    )
+
+    # NOTE(review): free-VRAM levels are applied as upper levels (the state
+    # rises when free memory is at or above them), as the ruleset titles say.
+    # Confirm this is intended; low free memory is usually the concern.
+    yield Result(
+        state=vram_free_state,
+        summary=f"VRAM free: {vram_mb_free} MiB"
+    )
+
+    yield Result(
+        state=vram_used_state,
+        summary=f"VRAM used: {vram_mb_used} MiB"
+    )
+
+    yield Result(
+        state=State.OK,
+        summary=f"VRAM total: {vram_mb_total} MiB"
+    )
+
+    yield Metric(
+        name="gpu_percent",
+        value=gpu_percent,
+        levels=get_levels(alert_gpu_percent),
+        boundaries=(0, 100)
+    )
+
+    yield Metric(
+        name="vram_used",
+        value=vram_mb_used,
+        levels=get_levels(alert_vram_used_percent, vram_mb_total),
+        boundaries=(0, vram_mb_total)
+    )
+
+    yield Metric(
+        name="vram_free",
+        value=vram_mb_free,
+        levels=get_levels(alert_vram_free_percent, vram_mb_total),
+        boundaries=(0, vram_mb_total)
+    )
+
+
+register.check_plugin(
+    name='amd_gpu',
+    service_name='AMD GPU - %s',
+    discovery_function=discovery_amd_gpu,
+    check_function=check_amd_gpu,
+    check_default_parameters={},
+    check_ruleset_name='amd_gpu',
+)
diff --git a/check_mk-amd-gpu/local/lib/python3/cmk/gui/plugins/wato/check_parameters/amd_gpu.py b/check_mk-amd-gpu/local/lib/python3/cmk/gui/plugins/wato/check_parameters/amd_gpu.py
new file mode 100644
index 0000000..b660603
--- /dev/null
+++ b/check_mk-amd-gpu/local/lib/python3/cmk/gui/plugins/wato/check_parameters/amd_gpu.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+#
+# Copyright 2024 Spearhead Systems SRL
+
+from cmk.gui.i18n import _
+from cmk.gui.plugins.wato.utils import (
+    CheckParameterRulespecWithItem,
+    rulespec_registry,
+    RulespecGroupCheckParametersHardware,
+)
+from cmk.gui.valuespec import Dictionary, Percentage, Tuple
+
+
+def _upper_percent_levels(title, help_text, warn_default, crit_default):
+    """Return a (warn, crit) Tuple of Percentage fields with localized titles."""
+    return Tuple(
+        title=title,
+        help=help_text,
+        elements=[
+            Percentage(title=_("Warn if above"), default_value=warn_default),
+            Percentage(title=_("Crit if above"), default_value=crit_default),
+        ],
+    )
+
+
+def _parameter_valuespec_amd_gpu():
+    """Build the rule dictionary of optional (warn, crit) percentage pairs.
+
+    The keys are the ones the check plugin looks up in its parameters:
+    "gpu_percent", "vram_free_percent" and "vram_used_percent".
+    """
+    return Dictionary(
+        title=_("GPU utilization"),
+        help=_(
+            "These metrics are queried directly from the AMD GPU. "
+            "Upper and lower levels can be specified for individual metrics."
+        ),
+        elements=[
+            (
+                "gpu_percent",
+                _upper_percent_levels(
+                    _("GPU Used"),
+                    _("If usage of total GPU compute goes above these percentages, issue alerts."),
+                    90,
+                    100,
+                ),
+            ),
+            (
+                "vram_free_percent",
+                _upper_percent_levels(
+                    _("VRAM Free"),
+                    _("If free VRAM goes above these percentages, issue alerts."),
+                    70,
+                    90,
+                ),
+            ),
+            (
+                "vram_used_percent",
+                _upper_percent_levels(
+                    _("VRAM Used"),
+                    _("If used VRAM goes above these percentages, issue alerts."),
+                    70,
+                    90,
+                ),
+            ),
+        ],
+    )
+
+
+rulespec_registry.register(
+    CheckParameterRulespecWithItem(
+        check_group_name="amd_gpu",
+        group=RulespecGroupCheckParametersHardware,
+        match_type="dict",
+        parameter_valuespec=_parameter_valuespec_amd_gpu,
+        title=lambda: _("AMD GPU Metrics"),
+    )
+)
diff --git a/check_mk-amd-gpu/local/share/check_mk/agents/custom/amd_gpu/lib/plugins/amd_gpu.ps1 b/check_mk-amd-gpu/local/share/check_mk/agents/custom/amd_gpu/lib/plugins/amd_gpu.ps1
new file mode 100644
index 0000000..4cb5b74
--- /dev/null
+++ b/check_mk-amd-gpu/local/share/check_mk/agents/custom/amd_gpu/lib/plugins/amd_gpu.ps1
@@ -0,0 +1,42 @@
+# Copyright 2024 Spearhead Systems SRL
+#
+# This goes in C:\ProgramData\checkmk\agent\plugins. It should be added automatically by
+# baking a new MSI after setting "Agent Rules" > "Deploy Custom Files With Agent" with
+# "Deploy Custom Files With Agent" including "amd_gpu".
+
+# Registry class key of display adapters; each adapter is a numbered subkey.
+$DisplayClassKey = "HKLM:\SYSTEM\CurrentControlSet\Control\Class\{4d36e968-e325-11ce-bfc1-08002be10318}"
+
+$GpuRawName = $null
+$GpuBytesTotal = $null
+
+# Use the first adapter whose driver description mentions "Radeon".
+foreach ($Item in Get-ChildItem $DisplayClassKey -Name -Include 000*) {
+    $AdapterKey = "$DisplayClassKey\$Item"
+    $Name = Get-ItemPropertyValue $AdapterKey "DriverDesc"
+    if ($Name -match 'Radeon') {
+        $GpuBytesTotal = Get-ItemPropertyValue $AdapterKey "HardwareInformation.qwMemorySize"
+        $GpuRawName = Get-ItemPropertyValue $AdapterKey "HardwareInformation.AdapterString"
+        break
+    }
+}
+
+# Without a matching adapter there is nothing to report; a partial section
+# could not be evaluated by the check.
+if ($null -eq $GpuRawName) {
+    exit 0
+}
+
+# Decode the adapter string from UTF-16 when it is stored as binary data;
+# a value that is already a string is used as-is.
+if ($GpuRawName -is [byte[]]) {
+    $GpuName = [System.Text.Encoding]::Unicode.GetString($GpuRawName)
+} else {
+    $GpuName = [string]$GpuRawName
+}
+
+$GpuPercent = (((Get-Counter "\GPU Engine(*)\Utilization Percentage").CounterSamples).CookedValue | Measure-Object -Sum).Sum
+$GpuBytesUsed = (((Get-Counter "\GPU Process Memory(*)\Dedicated Usage").CounterSamples).CookedValue | Measure-Object -Sum).Sum
+
+# The section header routes the lines below to the "amd_gpu" check plugin.
+Write-Output "<<<amd_gpu>>>", $GpuName, $GpuPercent, $GpuBytesUsed, $GpuBytesTotal