diff --git a/amd-gpu/amd-gpu-0.1.1.mkp b/amd-gpu/2.3/amd-gpu-0.1.1.mkp
similarity index 100%
rename from amd-gpu/amd-gpu-0.1.1.mkp
rename to amd-gpu/2.3/amd-gpu-0.1.1.mkp
diff --git a/amd-gpu/local/lib/check_mk/base/plugins/agent_based/amd_gpu.py b/amd-gpu/2.3/local/lib/check_mk/base/plugins/agent_based/amd_gpu.py
similarity index 100%
rename from amd-gpu/local/lib/check_mk/base/plugins/agent_based/amd_gpu.py
rename to amd-gpu/2.3/local/lib/check_mk/base/plugins/agent_based/amd_gpu.py
diff --git a/amd-gpu/local/lib/python3/cmk/gui/plugins/wato/check_parameters/amd_gpu.py b/amd-gpu/2.3/local/lib/python3/cmk/gui/plugins/wato/check_parameters/amd_gpu.py
similarity index 100%
rename from amd-gpu/local/lib/python3/cmk/gui/plugins/wato/check_parameters/amd_gpu.py
rename to amd-gpu/2.3/local/lib/python3/cmk/gui/plugins/wato/check_parameters/amd_gpu.py
diff --git a/amd-gpu/local/share/check_mk/agents/custom/amd_gpu/lib/plugins/amd_gpu.ps1 b/amd-gpu/2.3/local/share/check_mk/agents/custom/amd_gpu/lib/plugins/amd_gpu.ps1
similarity index 100%
rename from amd-gpu/local/share/check_mk/agents/custom/amd_gpu/lib/plugins/amd_gpu.ps1
rename to amd-gpu/2.3/local/share/check_mk/agents/custom/amd_gpu/lib/plugins/amd_gpu.ps1
diff --git a/amd-gpu/2.4/amd_gpu-0.2.0.mkp b/amd-gpu/2.4/amd_gpu-0.2.0.mkp
new file mode 100755
index 0000000..dcbcf36
Binary files /dev/null and b/amd-gpu/2.4/amd_gpu-0.2.0.mkp differ
diff --git a/amd-gpu/2.4/local/lib/python3/cmk_addons/plugins/amd_gpu/agent_based/amd_gpu.py b/amd-gpu/2.4/local/lib/python3/cmk_addons/plugins/amd_gpu/agent_based/amd_gpu.py
new file mode 100644
index 0000000..309289a
--- /dev/null
+++ b/amd-gpu/2.4/local/lib/python3/cmk_addons/plugins/amd_gpu/agent_based/amd_gpu.py
@@ -0,0 +1,104 @@
+# Copyright 2026 Spearhead Systems SRL
+
+import json
+from cmk.agent_based.v2 import (
+    Result,
+    Service,
+    Metric,
+    State,
+    CheckPlugin,
+    check_levels,
+)
+
+
+def discovery_amd_gpu(section):
+    """Discover one service whose item is the first field of the first section line."""
+    # An empty section (or empty first line) must not crash discovery with an IndexError.
+    if section and section[0]:
+        yield Service(item=section[0][0])
+
+
+def get_levels(alert_levels, total):
+    """Scale fixed (warn, crit) percent levels to absolute values of total; else None."""
+    if alert_levels is None or alert_levels[0] != "fixed":
+        return None
+    warn, crit = alert_levels[1]
+    return (warn / 100 * total, crit / 100 * total)
+
+
+def check_amd_gpu(item, params, section):
+    """Yield GPU load and VRAM usage results and metrics for the GPU named by item."""
+    # Section lines, in order: GPU name, GPU load %, VRAM bytes used, VRAM bytes total.
+    if len(section) < 4 or not all(section[:4]) or item != section[0][0]:
+        return
+
+    gpu_percent = int(float(section[1][0]))
+    vram_bytes_used = int(section[2][0])
+    vram_bytes_total = int(section[3][0])
+    vram_bytes_free = max(0, vram_bytes_total - vram_bytes_used)
+
+    vram_mb_used = vram_bytes_used // 1048576
+    vram_mb_total = vram_bytes_total // 1048576
+    vram_mb_free = vram_bytes_free // 1048576
+
+    alert_gpu_percent = params.get("gpu_percent")
+    alert_vram_used_percent = params.get("vram_used_percent")
+    alert_vram_free_percent = params.get("vram_free_percent")
+
+    # Both derive from byte counts; a VRAM size of 0 gives 0% instead of ZeroDivisionError.
+    vram_used_percent = vram_bytes_used / vram_bytes_total * 100 if vram_bytes_total else 0.0
+    vram_free_percent = vram_bytes_free / vram_bytes_total * 100 if vram_bytes_total else 0.0
+
+    yield from check_levels(
+        gpu_percent,
+        levels_upper = alert_gpu_percent,
+        metric_name = "gpu_percent",
+        render_func = lambda p: f"{p:.2f}%",
+        label = "GPU Used",
+        boundaries = (0, 100),
+    )
+
+    # NOTE(review): free VRAM uses *upper* levels, matching the ruleset; confirm intent.
+    yield from check_levels(
+        vram_free_percent,
+        levels_upper = alert_vram_free_percent,
+        metric_name = "vram_free_percent",
+        render_func = lambda p: f"{vram_mb_free} MiB, {p:.2f}%",
+        label = "VRAM Free",
+        boundaries = (0, 100),
+    )
+
+    yield from check_levels(
+        vram_used_percent,
+        levels_upper = alert_vram_used_percent,
+        metric_name = "vram_used_percent",
+        render_func = lambda p: f"{vram_mb_used} MiB, {p:.2f}%",
+        label = "VRAM Used",
+        boundaries = (0, 100),
+    )
+
+    yield Result(state = State.OK, summary = f"VRAM total: {vram_mb_total} MiB")
+
+    yield Metric(
+        name = "vram_used",
+        value = vram_mb_used,
+        levels = get_levels(alert_vram_used_percent, vram_mb_total),
+        boundaries = (0, vram_mb_total)
+    )
+
+    yield Metric(
+        name = "vram_free",
+        value = vram_mb_free,
+        levels = get_levels(alert_vram_free_percent, vram_mb_total),
+        boundaries = (0, vram_mb_total)
+    )
+
+
+check_plugin_amd_gpu = CheckPlugin(
+    name = "amd_gpu",
+    check_ruleset_name = "amd_gpu",
+    service_name = "AMD GPU - %s",
+    discovery_function = discovery_amd_gpu,
+    check_function = check_amd_gpu,
+    check_default_parameters = {},
+)
diff --git a/amd-gpu/2.4/local/lib/python3/cmk_addons/plugins/amd_gpu/graphing/amd_gpu.py b/amd-gpu/2.4/local/lib/python3/cmk_addons/plugins/amd_gpu/graphing/amd_gpu.py
new file mode 100644
index 0000000..d47593c
--- /dev/null
+++ b/amd-gpu/2.4/local/lib/python3/cmk_addons/plugins/amd_gpu/graphing/amd_gpu.py
@@ -0,0 +1,72 @@
+# Copyright (C) 2026 Spearhead Systems SRL
+
+from cmk.graphing.v1 import Title
+from cmk.graphing.v1.graphs import Graph, MinimalRange
+from cmk.graphing.v1.metrics import (
+    Metric,
+    DecimalNotation,
+    Unit,
+    Color,
+)
+
+
+UNIT_PERCENT = Unit(DecimalNotation("%"))
+# The check plugin emits vram_used/vram_free in MiB (bytes // 1048576), not bytes or MB.
+UNIT_MBYTES = Unit(DecimalNotation("MiB"))
+
+
+metric_amd_gpu_gpu_percent = Metric(
+    title = Title("GPU Percent Used"),
+    name = "gpu_percent",
+    unit = UNIT_PERCENT,
+    color = Color.BLUE,
+)
+
+metric_amd_gpu_vram_free_percent = Metric(
+    title = Title("VRAM Percent Free"),
+    name = "vram_free_percent",
+    unit = UNIT_PERCENT,
+    color = Color.GREEN,
+)
+
+metric_amd_gpu_vram_used_percent = Metric(
+    title = Title("VRAM Percent Used"),
+    name = "vram_used_percent",
+    unit = UNIT_PERCENT,
+    color = Color.ORANGE,
+)
+
+metric_amd_gpu_vram_free = Metric(
+    title = Title("VRAM Free"),
+    name = "vram_free",
+    unit = UNIT_MBYTES,
+    color = Color.LIGHT_GREEN,
+)
+
+metric_amd_gpu_vram_used = Metric(
+    title = Title("VRAM Used"),
+    name = "vram_used",
+    unit = UNIT_MBYTES,
+    color = Color.LIGHT_ORANGE,
+)
+
+
+graph_amd_gpu_vram_percentages = Graph(
+    name = "gpu_cpu_vram_percentages",
+    title = Title("GPU/VRAM Percentages"),
+    simple_lines = (
+        "gpu_percent",
+        "vram_free_percent",
+        "vram_used_percent",
+    ),
+    minimal_range = MinimalRange(0, 100),
+)
+
+graph_amd_gpu_vram_mbytes = Graph(
+    name = "gpu_vram_mbytes",
+    title = Title("VRAM MiB"),
+    simple_lines = (
+        "vram_free",
+        "vram_used",
+    ),
+)
diff --git a/amd-gpu/2.4/local/lib/python3/cmk_addons/plugins/amd_gpu/rulesets/amd_gpu.py b/amd-gpu/2.4/local/lib/python3/cmk_addons/plugins/amd_gpu/rulesets/amd_gpu.py
new file mode 100644
index 0000000..38af832
--- /dev/null
+++ b/amd-gpu/2.4/local/lib/python3/cmk_addons/plugins/amd_gpu/rulesets/amd_gpu.py
@@ -0,0 +1,76 @@
+# Copyright 2026 Spearhead Systems SRL
+
+from cmk.rulesets.v1.form_specs import (
+    Dictionary,
+    DictElement,
+    Float,
+    DefaultValue,
+    LevelDirection,
+    SimpleLevels,
+)
+from cmk.rulesets.v1.rule_specs import (
+    CheckParameters,
+    HostAndItemCondition,
+    Topic,
+    Title,
+    Help,
+)
+
+
+def _valuespec_amd_gpu():
+    """Build the form: optional upper levels for GPU load and VRAM free/used percentages."""
+    return Dictionary(
+        title = Title("GPU utilization"),
+        help_text = Help(
+            "These metrics are queried directly from the AMD GPU. "
+            "Upper levels can be specified for individual metrics."
+        ),
+        elements = {
+            "gpu_percent": DictElement(
+                parameter_form = SimpleLevels(
+                    title = Title("GPU Used"),
+                    help_text = Help(
+                        "If usage of total GPU compute goes above these "
+                        "percentages, issue alerts."
+                    ),
+                    level_direction = LevelDirection.UPPER,
+                    form_spec_template = Float(title = Title("%")),
+                    prefill_fixed_levels = DefaultValue(value=(90.0, 100.0))
+                ),
+            ),
+            "vram_free_percent": DictElement(
+                parameter_form = SimpleLevels(
+                    title = Title("VRAM Free"),
+                    help_text = Help(
+                        "If free VRAM goes above these percentages, "
+                        "issue alerts."
+                    ),
+                    level_direction = LevelDirection.UPPER,
+                    form_spec_template = Float(title = Title("%")),
+                    prefill_fixed_levels = DefaultValue(value=(70.0, 90.0))
+                ),
+            ),
+            "vram_used_percent": DictElement(
+                parameter_form = SimpleLevels(
+                    title = Title("VRAM Used"),
+                    help_text = Help(
+                        "If used VRAM goes above these percentages, "
+                        "issue alerts."
+                    ),
+                    level_direction = LevelDirection.UPPER,
+                    form_spec_template = Float(title = Title("%")),
+                    prefill_fixed_levels = DefaultValue(value=(70.0, 90.0))
+                ),
+            ),
+        },
+    )
+
+rule_spec_amd_gpu = CheckParameters(
+    title = Title("AMD GPU Metrics"),
+    name = "amd_gpu",
+    topic = Topic.PERIPHERALS,
+    parameter_form = _valuespec_amd_gpu,
+    condition = HostAndItemCondition(
+        item_title = Title("GPU")
+    ),
+)
diff --git a/amd-gpu/2.4/local/share/check_mk/agents/custom/amd_gpu/lib/plugins/amd_gpu.ps1 b/amd-gpu/2.4/local/share/check_mk/agents/custom/amd_gpu/lib/plugins/amd_gpu.ps1
new file mode 100644
index 0000000..acad1f5
--- /dev/null
+++ b/amd-gpu/2.4/local/share/check_mk/agents/custom/amd_gpu/lib/plugins/amd_gpu.ps1
@@ -0,0 +1,20 @@
+# Copyright 2026 Spearhead Systems SRL
+#
+# This goes in C:\ProgramData\checkmk\agent\plugins. It should be added automatically by
+# baking a new MSI after setting "Agent Rules" > "Deploy Custom Files With Agent" with
+# "Deploy Custom Files With Agent" including "amd_gpu".
+
+foreach ($Item in Get-ChildItem "HKLM:\SYSTEM\CurrentControlSet\Control\Class\{4d36e968-e325-11ce-bfc1-08002be10318}" -Name -Include 000*) {
+    $Name = Get-ItemPropertyValue "HKLM:\SYSTEM\CurrentControlSet\Control\Class\{4d36e968-e325-11ce-bfc1-08002be10318}\$Item" "DriverDesc"
+    if ($Name -match 'Radeon') {
+        $GpuBytesTotal = Get-ItemPropertyValue "HKLM:\SYSTEM\CurrentControlSet\Control\Class\{4d36e968-e325-11ce-bfc1-08002be10318}\$Item" "HardwareInformation.qwMemorySize"
+        $GpuRawName = Get-ItemPropertyValue "HKLM:\SYSTEM\CurrentControlSet\Control\Class\{4d36e968-e325-11ce-bfc1-08002be10318}\$Item" "HardwareInformation.AdapterString"
+        break
+    }
+}
+if ($null -eq $GpuRawName) { exit }  # No Radeon adapter matched: emit no section instead of failing in GetString.
+$GpuName = [System.Text.Encoding]::Unicode.GetString($GpuRawName)
+$GpuPercent = (((Get-Counter "\GPU Engine(*)\Utilization Percentage" ).CounterSamples).CookedValue | measure -sum).sum
+$GpuBytesUsed = (((Get-Counter "\GPU Process Memory(*)\Dedicated Usage").CounterSamples).CookedValue | measure -sum).sum
+
+Write-Output "<<<amd_gpu>>>", $GpuName, $GpuPercent, $GpuBytesUsed, $GpuBytesTotal