Add AMD GPU plugin.
This commit is contained in:
parent
8f9970a40c
commit
5ba99c1c15
BIN
check_mk-amd-gpu/amd-gpu-0.1.0.mkp
Executable file
BIN
check_mk-amd-gpu/amd-gpu-0.1.0.mkp
Executable file
Binary file not shown.
@ -0,0 +1,106 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# Copyright 2024 Spearhead Systems SRL
|
||||||
|
|
||||||
|
from cmk.base.plugins.agent_based.agent_based_api.v1 import (
|
||||||
|
register,
|
||||||
|
Service,
|
||||||
|
Result,
|
||||||
|
Metric,
|
||||||
|
State,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def discovery_amd_gpu(section):
    """Discover one service for the GPU reported in the agent section.

    The first row of ``section`` is read as the GPU name and becomes the
    service item.

    Args:
        section: Parsed agent section; a list of rows, each a list of strings.

    Yields:
        A single ``Service`` whose item is the GPU name, or nothing when the
        section carries no name row.
    """
    # An empty section (or an empty first row) means the agent reported no
    # GPU; discover nothing instead of raising IndexError.
    if not section or not section[0]:
        return

    yield Service(item=section[0][0])
|
||||||
|
|
||||||
|
|
||||||
|
def check_state(alert_percentages, measured_percent):
    """Translate a measured percentage into a monitoring state.

    Args:
        alert_percentages: ``(warn, crit)`` pair of percentages, or a falsy
            value when no levels are configured.
        measured_percent: The measured value to compare against the levels.

    Returns:
        ``State.CRIT`` when the measurement reaches the crit level,
        ``State.WARN`` when it reaches the warn level, otherwise ``State.OK``.
    """
    # Without configured levels there is nothing to compare against.
    if not alert_percentages:
        return State.OK

    # The crit threshold is tested first so it wins over warn.
    if measured_percent >= alert_percentages[1]:
        return State.CRIT

    if measured_percent >= alert_percentages[0]:
        return State.WARN

    return State.OK
|
||||||
|
|
||||||
|
|
||||||
|
def get_levels(alert_levels, total=None):
    """Return metric levels, optionally converted from percent to absolute.

    Args:
        alert_levels: ``(warn, crit)`` pair of percentages, or ``None`` when
            no levels are configured.
        total: Value corresponding to 100 %. When given, the percentages are
            scaled to absolute values of ``total``; when ``None`` the
            percentages are returned unchanged.

    Returns:
        ``None`` if ``alert_levels`` is ``None``; otherwise a ``(warn, crit)``
        pair, either as passed in or scaled against ``total``.
    """
    if alert_levels is None:
        return None

    # Compare with ``is None`` so that a legitimate total of 0 is still scaled.
    if total is None:
        return alert_levels

    return (alert_levels[0] / 100 * total, alert_levels[1] / 100 * total)
|
||||||
|
|
||||||
|
|
||||||
|
def check_amd_gpu(item, params, section):
    """Check GPU load and VRAM usage of the AMD GPU named ``item``.

    The section is read positionally: row 0 is the GPU name, row 1 the GPU
    utilization in percent, row 2 the VRAM bytes in use and row 3 the total
    VRAM bytes.

    Args:
        item: GPU name the service was discovered with.
        params: Rule parameters; the optional keys ``gpu_percent``,
            ``vram_used_percent`` and ``vram_free_percent`` each hold a
            ``(warn, crit)`` percentage pair.
        section: Parsed agent section; a list of rows, each a list of strings.

    Yields:
        ``Result`` objects for GPU load and free/used/total VRAM, followed by
        the ``gpu_percent``, ``vram_used`` and ``vram_free`` metrics. Nothing
        is yielded when the section is incomplete or describes another GPU.
    """
    # All four rows are required; bail out on truncated agent output instead
    # of crashing with IndexError.
    if len(section) < 4 or not all(section[:4]):
        return

    if item != section[0][0]:
        return

    bytes_per_mib = 1048576

    # The utilization is parsed via float() first, so fractional values are
    # accepted and truncated to an integer.
    gpu_percent = int(float(section[1][0]))
    vram_bytes_used = int(section[2][0])
    vram_bytes_total = int(section[3][0])
    # Clamp at zero in case the reported usage exceeds the reported total.
    vram_bytes_free = max(0, vram_bytes_total - vram_bytes_used)

    vram_mb_used = vram_bytes_used // bytes_per_mib
    vram_mb_total = vram_bytes_total // bytes_per_mib
    vram_mb_free = vram_bytes_free // bytes_per_mib

    alert_gpu_percent = params.get("gpu_percent")
    alert_vram_used_percent = params.get("vram_used_percent")
    alert_vram_free_percent = params.get("vram_free_percent")

    if vram_bytes_total > 0:
        vram_used_percent = vram_bytes_used / vram_bytes_total * 100
        vram_free_percent = 100 - vram_used_percent
    else:
        # With no (or zero) total VRAM the percentages are undefined; avoid
        # ZeroDivisionError and skip the percentage-based level checks.
        vram_used_percent = 0.0
        vram_free_percent = 0.0
        alert_vram_used_percent = None
        alert_vram_free_percent = None

    yield Result(
        state=check_state(alert_gpu_percent, gpu_percent),
        summary=f"GPU: {gpu_percent}%",
    )

    yield Result(
        state=check_state(alert_vram_free_percent, vram_free_percent),
        summary=f"VRAM free: {vram_mb_free} MiB",
    )

    yield Result(
        state=check_state(alert_vram_used_percent, vram_used_percent),
        summary=f"VRAM used: {vram_mb_used} MiB",
    )

    yield Result(
        state=State.OK,
        summary=f"VRAM total: {vram_mb_total} MiB",
    )

    yield Metric(
        name="gpu_percent",
        value=gpu_percent,
        levels=get_levels(alert_gpu_percent),
        boundaries=(0, 100),
    )

    # VRAM metrics are reported in MiB, so the percentage levels are scaled
    # against the total VRAM in MiB.
    yield Metric(
        name="vram_used",
        value=vram_mb_used,
        levels=get_levels(alert_vram_used_percent, vram_mb_total),
        boundaries=(0, vram_mb_total),
    )

    yield Metric(
        name="vram_free",
        value=vram_mb_free,
        levels=get_levels(alert_vram_free_percent, vram_mb_total),
        boundaries=(0, vram_mb_total),
    )
|
||||||
|
|
||||||
|
# Register the AMD GPU check plugin. The ruleset name is the same string the
# WATO rulespec uses as its check group name ("amd_gpu"), and no default
# parameters are set, so levels apply only when a rule configures them.
register.check_plugin(
    name="amd_gpu",
    service_name="AMD GPU - %s",
    discovery_function=discovery_amd_gpu,
    check_function=check_amd_gpu,
    check_ruleset_name="amd_gpu",
    check_default_parameters={},
)
|
@ -0,0 +1,85 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# Copyright 2024 Spearhead Systems SRL
|
||||||
|
|
||||||
|
from cmk.gui.i18n import _
|
||||||
|
from cmk.gui.plugins.wato.utils import (
|
||||||
|
CheckParameterRulespecWithItem,
|
||||||
|
rulespec_registry,
|
||||||
|
RulespecGroupCheckParametersHardware,
|
||||||
|
)
|
||||||
|
from cmk.gui.valuespec import Dictionary, Percentage, Tuple
|
||||||
|
|
||||||
|
|
||||||
|
def _percentage_levels(title, help_text, warn_default, crit_default):
    """Build a ``Tuple`` of two ``Percentage`` fields for warn/crit levels.

    Args:
        title: Already-translated title of the level pair.
        help_text: Already-translated help text of the level pair.
        warn_default: Default value of the "warn" percentage.
        crit_default: Default value of the "crit" percentage.

    Returns:
        A ``Tuple`` valuespec holding the warn and crit ``Percentage`` fields.
    """
    return Tuple(
        title=title,
        help=help_text,
        elements=[
            # Field titles are wrapped in _() so all three level pairs are
            # localized consistently.
            Percentage(title=_("Warn if above"), default_value=warn_default),
            Percentage(title=_("Crit if above"), default_value=crit_default),
        ],
    )


def _parameter_valuespec_amd_gpu():
    """Return the parameter valuespec for the ``amd_gpu`` check.

    Returns:
        A ``Dictionary`` with the optional keys ``gpu_percent``,
        ``vram_free_percent`` and ``vram_used_percent``, each a pair of
        warn/crit percentages.
    """
    return Dictionary(
        title=_("GPU utilization"),
        help=_(
            "These metrics are queried directly from the AMD GPU. "
            "Upper and lower levels can be specified for individual metrics."
        ),
        elements=[
            (
                "gpu_percent",
                _percentage_levels(
                    _("GPU Used"),
                    _("If usage of total GPU compute goes above these percentages, issue alerts."),
                    90,
                    100,
                ),
            ),
            (
                "vram_free_percent",
                _percentage_levels(
                    _("VRAM Free"),
                    _("If free VRAM goes above these percentages, issue alerts."),
                    70,
                    90,
                ),
            ),
            (
                "vram_used_percent",
                _percentage_levels(
                    _("VRAM Used"),
                    _("If used VRAM goes above these percentages, issue alerts."),
                    70,
                    90,
                ),
            ),
        ],
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Register the rule for the "amd_gpu" check group in the hardware check
# parameters group. The rule is item-based and its parameters are a dict.
rulespec_registry.register(
    CheckParameterRulespecWithItem(
        title=lambda: _("AMD GPU Metrics"),
        group=RulespecGroupCheckParametersHardware,
        check_group_name="amd_gpu",
        match_type="dict",
        parameter_valuespec=_parameter_valuespec_amd_gpu,
    )
)
|
@ -0,0 +1,20 @@
|
|||||||
|
# Copyright 2024 Spearhead Systems SRL
|
||||||
|
#
|
||||||
|
# This goes in C:\ProgramData\checkmk\agent\plugins. It should be added automatically by
|
||||||
|
# baking a new MSI after setting "Agent Rules" > "Deploy Custom Files With Agent" with
|
||||||
|
# "Deploy Custom Files With Agent" including "amd_gpu".
|
||||||
|
|
||||||
|
# Registry key of the display adapter device class; each adapter instance is a
# numbered subkey (0000, 0001, ...).
$ClassKey = "HKLM:\SYSTEM\CurrentControlSet\Control\Class\{4d36e968-e325-11ce-bfc1-08002be10318}"

# Initialise explicitly so a machine without a Radeon adapter is detectable.
$GpuRawName = $null
$GpuBytesTotal = $null

# Use the first adapter whose driver description mentions "Radeon".
foreach ($Item in Get-ChildItem $ClassKey -Name -Include 000*) {
    $Name = Get-ItemPropertyValue "$ClassKey\$Item" "DriverDesc"
    if ($Name -match 'Radeon') {
        $GpuBytesTotal = Get-ItemPropertyValue "$ClassKey\$Item" "HardwareInformation.qwMemorySize"
        $GpuRawName = Get-ItemPropertyValue "$ClassKey\$Item" "HardwareInformation.AdapterString"
        break
    }
}

# No Radeon adapter found: emit no section at all instead of failing on a null
# name and printing an incomplete section.
if ($null -eq $GpuRawName) {
    return
}

# The adapter string is decoded as UTF-16 when stored as binary data; any other
# value type is used as a plain string.
if ($GpuRawName -is [byte[]]) {
    $GpuName = [System.Text.Encoding]::Unicode.GetString($GpuRawName)
} else {
    $GpuName = [string]$GpuRawName
}

# Sum the samples of all GPU engine instances and all per-process dedicated
# memory instances into one value each.
$GpuPercent = (((Get-Counter "\GPU Engine(*)\Utilization Percentage" ).CounterSamples).CookedValue | measure -sum).sum
$GpuBytesUsed = (((Get-Counter "\GPU Process Memory(*)\Dedicated Usage").CounterSamples).CookedValue | measure -sum).sum

# One value per line: name, utilization percent, VRAM bytes used, VRAM bytes total.
Write-Output "<<<amd_gpu:sep(0)>>>", $GpuName, $GpuPercent, $GpuBytesUsed, $GpuBytesTotal
|
Loading…
Reference in New Issue
Block a user