Fix some metric checks.

This commit is contained in:
Marsell Kukuljevic 2024-08-24 22:49:22 +02:00
parent 88da961bd1
commit 29436f795d
3 changed files with 226 additions and 38 deletions

View File

@ -6,6 +6,22 @@ from datetime import datetime, timezone
from cmk.base.plugins.agent_based.agent_based_api.v1 import Metric, register, Result, Service, State
def check_state_below(alert_percentages, measured_percent):
    """Return the monitoring state for a value that alerts when it is too low.

    alert_percentages: a (warn, crit) pair of lower thresholds, or a falsy
        value (e.g. None) when no thresholds are configured.
    measured_percent: the measured value to compare against the thresholds.

    Returns State.CRIT when the value is below the crit threshold,
    State.WARN when it is below the warn threshold, and State.OK otherwise
    (including when no thresholds are configured).
    """
    if not alert_percentages:
        return State.OK

    warn, crit = alert_percentages[0], alert_percentages[1]

    # The earlier `threshold <= measured` test fired when the value was at or
    # ABOVE the threshold, so a healthy 99% availability against (98, 90) was
    # reported as CRIT. A "below" check must alert when the value is lower
    # than the threshold; strictly-below mirrors how Checkmk treats lower
    # levels.
    if measured_percent < crit:
        return State.CRIT
    if measured_percent < warn:
        return State.WARN
    return State.OK
def check_state_above(alert_percentages, measured_percent):
    """Return the monitoring state for a value that alerts when it is too high.

    alert_percentages: a (warn, crit) pair of upper thresholds, or a falsy
        value (e.g. None) when no thresholds are configured.
    measured_percent: the measured value to compare against the thresholds.

    Returns State.CRIT when the value has reached the crit threshold,
    State.WARN when it has reached the warn threshold, and State.OK otherwise
    (including when no thresholds are configured).
    """
    if not alert_percentages:
        return State.OK

    warn, crit = alert_percentages[0], alert_percentages[1]

    # The earlier `threshold >= measured` test fired when the value was at or
    # BELOW the threshold, so e.g. 50% used capacity against (80, 98) was
    # reported as CRIT. An "above" check must alert once the value reaches
    # the threshold.
    if measured_percent >= crit:
        return State.CRIT
    if measured_percent >= warn:
        return State.WARN
    return State.OK
# Convert JSON entries into dictionaries indexed by name. We're assuming here # Convert JSON entries into dictionaries indexed by name. We're assuming here
# that the name is unique across AZs and resource groups. If not, add the # that the name is unique across AZs and resource groups. If not, add the
# 'location' and 'resource_group' fields in each object to the name. # 'location' and 'resource_group' fields in each object to the name.
@ -27,26 +43,110 @@ def discover(section):
yield Service(item=name) yield Service(item=name)
# Given a specific metric, look it up in the parsed output, and produce # Given a specific keyvault metric, look it up in the parsed output, and produce
# results on that service based upon the metric's range. # results on that service based upon the metric's range.
def check_keyvault(item, params, section):
    """Yield results and metrics for a single Key Vault service.

    item: name of the vault; used as the key into ``section``.
    params: rule parameters. The optional keys "availability", "capacity"
        and "latency" are passed on as thresholds to the check_state_*
        helpers and as ``levels`` of the corresponding Metric.
    section: parsed agent output, indexed by name.

    Yields nothing when the vault is not present in ``section``. For each of
    availability, capacity and latency that was reported, yields a Result
    carrying the evaluated state followed by the Metric; hits and results are
    emitted as plain metrics without thresholds.
    """
    vault = section.get(item)
    if vault is None:
        return

    metrics = vault["metrics"]

    availability = metrics.get("Availability")
    capacity = metrics.get("SaturationShoebox")
    latency = metrics.get("ServiceApiLatency")
    hits = metrics.get("ServiceApiHit")
    results = metrics.get("ServiceApiResult")

    alert_availability_percent = params.get("availability")
    alert_capacity_percent = params.get("capacity")
    alert_latency_milliseconds = params.get("latency")

    # Compare against None instead of relying on truthiness: a reading of 0
    # (e.g. 0% availability) is a real measurement and must not be skipped.
    # The state computed by the check_state_* helpers is now reported through
    # a Result; previously it was computed and thrown away.
    if availability is not None:
        yield Result(
            state=check_state_below(alert_availability_percent, availability),
            summary="Availability: %s%%" % availability,
        )
        yield Metric(
            name="availability",
            value=availability,
            levels=alert_availability_percent,
            boundaries=(0, 100),
        )

    if capacity is not None:
        yield Result(
            state=check_state_above(alert_capacity_percent, capacity),
            summary="Capacity used: %s%%" % capacity,
        )
        yield Metric(
            name="capacity",
            value=capacity,
            levels=alert_capacity_percent,
            boundaries=(0, 100),
        )

    if latency is not None:
        # NOTE(review): unit assumed to be milliseconds, based on the
        # parameter name — confirm against the agent output.
        yield Result(
            state=check_state_above(alert_latency_milliseconds, latency),
            summary="Latency: %s ms" % latency,
        )
        yield Metric(
            name="latency",
            value=latency,
            levels=alert_latency_milliseconds,
            boundaries=(0, None),
        )

    if hits is not None:
        yield Metric(
            name="hits",
            value=hits,
            boundaries=(0, None),
        )

    if results is not None:
        yield Metric(
            name="results",
            value=results,
            boundaries=(0, None),
        )
# Given a specific firewall metric, look it up in the parsed output, and produce
# results on that service based upon the metric's range.
def check_firewall(item, params, section):
    """Yield results and metrics for a single Azure Firewall service.

    item: name of the firewall; used as the key into ``section``.
    params: rule parameters. The optional keys "availability" and "latency"
        are passed on as thresholds to the check_state_* helpers and as
        ``levels`` of the corresponding Metric.
    section: parsed agent output, indexed by name.

    Yields nothing when the firewall is not present in ``section``. For
    availability and latency, yields a Result carrying the evaluated state
    followed by the Metric; throughput is emitted as a plain metric.
    """
    firewall = section.get(item)
    if firewall is None:
        return

    # Was `vault["metrics"]`, a name that does not exist in this function and
    # raised NameError on every call.
    metrics = firewall["metrics"]

    availability = metrics.get("FirewallHealth")
    throughput = metrics.get("Throughput")
    latency = metrics.get("FirewallLatencyPng")

    alert_availability_percent = params.get("availability")
    alert_latency_milliseconds = params.get("latency")

    # Compare against None instead of relying on truthiness: a reading of 0
    # (e.g. 0% health) is a real measurement and must not be skipped. The
    # state computed by the check_state_* helpers is reported via a Result
    # rather than being discarded.
    if availability is not None:
        yield Result(
            state=check_state_below(alert_availability_percent, availability),
            summary="Availability: %s%%" % availability,
        )
        yield Metric(
            name="availability",
            value=availability,
            levels=alert_availability_percent,
            boundaries=(0, 100),
        )

    if latency is not None:
        # NOTE(review): unit assumed to be milliseconds, based on the
        # parameter name — confirm against the agent output.
        yield Result(
            state=check_state_above(alert_latency_milliseconds, latency),
            summary="Latency: %s ms" % latency,
        )
        yield Metric(
            name="latency",
            value=latency,
            levels=alert_latency_milliseconds,
            boundaries=(0, None),
        )

    if throughput is not None:
        yield Metric(
            name="throughput",
            value=throughput,  # was misspelled `thoughput` (NameError)
            boundaries=(0, None),
        )
def check_defender(item, params, section):
yield Result(state=state, summary="Expires in %d days" % remaining_days) yield Result(state=state, summary="Expires in %d days" % remaining_days)
@ -75,7 +175,7 @@ register.check_plugin(
name="azure_firewall_metrics", name="azure_firewall_metrics",
service_name="Azure Firewall Metric %s", service_name="Azure Firewall Metric %s",
check_function=check_keyvault, check_function=check_firewall,
check_default_parameters={}, check_default_parameters={},
check_ruleset_name="azure_firewall_metrics", check_ruleset_name="azure_firewall_metrics",
@ -91,7 +191,7 @@ register.check_plugin(
name="azure_defender_alerts", name="azure_defender_alerts",
service_name="Azure Defender Alert %s", service_name="Azure Defender Alert %s",
check_function=check_keyvault, check_function=check_defender,
check_default_parameters={}, check_default_parameters={},
check_ruleset_name="azure_defender_alerts", check_ruleset_name="azure_defender_alerts",

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# Copyright (C) 2024 Spearhead Systems SRL # Copyright (C) 2024 Spearhead Systems SRL
from urllib import request, parse from urllib import request, parse, error
from datetime import datetime, timezone, timedelta from datetime import datetime, timezone, timedelta
import json import json
import sys import sys
@ -26,6 +26,18 @@ REGION_RE = re.compile('/locations/(.+?)/')
RESOURCE_GROUP_RE = re.compile('/resourceGroups/(.+?)/') RESOURCE_GROUP_RE = re.compile('/resourceGroups/(.+?)/')
# https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/request-limits-and-throttling
def get_url(req, default):
    """Fetch ``req`` and return the raw response body.

    req: anything accepted by ``urllib.request.urlopen`` (a URL string or a
        ``Request`` object).
    default: value returned in place of the body when the server answers
        with HTTP 429 (request throttled).

    Raises urllib.error.HTTPError for every HTTP error status other than 429.
    """
    try:
        # The context manager closes the response once the body is read;
        # previously the response object was left open.
        with request.urlopen(req) as res:
            return res.read()
    except error.HTTPError as e:
        if e.code == 429:
            return default
        # Bare raise re-raises the active exception with its traceback intact.
        raise
def get_token(tenant, username, password): def get_token(tenant, username, password):
data = parse.urlencode({ data = parse.urlencode({
'username': username, 'username': username,
@ -40,9 +52,12 @@ def get_token(tenant, username, password):
req = request.Request(f'https://login.microsoftonline.com/{tenant}/oauth2/v2.0/token', req = request.Request(f'https://login.microsoftonline.com/{tenant}/oauth2/v2.0/token',
data=str.encode(data)) data=str.encode(data))
res = request.urlopen(req)
token_data = json.loads(res.read()) res = get_url(req, None)
if res is None:
return
token_data = json.loads(res)
token = token_data['access_token'] token = token_data['access_token']
return token return token
@ -50,8 +65,8 @@ def get_token(tenant, username, password):
def get_json(token, path, version='2023-07-01'):
    """Query the Azure management API and return the response's 'value' entry.

    token: bearer token sent in the Authorization header.
    path: request path appended to https://management.azure.com; may already
        contain a query string.
    version: value of the ``api-version`` query parameter.

    Returns an empty list when the request was throttled (get_url handed back
    the fallback instead of a response body).
    """
    # Append api-version with '&' when the path already has a query string.
    separator = '&' if '?' in path else '?'
    url = f"https://management.azure.com{path}{separator}api-version={version}"
    req = request.Request(url, headers={'Authorization': f'Bearer {token}'})

    # The previous fallback "[]" parsed to a list, so the subsequent
    # data['value'] lookup raised TypeError whenever a request was throttled.
    res = get_url(req, None)
    if res is None:
        return []

    data = json.loads(res)
    return data['value']
@ -157,4 +172,3 @@ for subscription in list_subscriptions(token):
'resource_group': re.search(RESOURCE_GROUP_RE, vault['id'])[1], 'resource_group': re.search(RESOURCE_GROUP_RE, vault['id'])[1],
'metrics': metrics_to_lookup(metrics), 'metrics': metrics_to_lookup(metrics),
}) })

View File

@ -56,30 +56,105 @@ def _valuespec_special_agents_azure_discovery():
def _valuespec_special_agents_azure_keyvault_metric_check():
    """Build the rule valuespec for the Key Vault metric thresholds.

    Returns a Dictionary with the keys "availability", "capacity" and
    "latency", each holding a two-element (warn, crit) Tuple.
    """
    availability_levels = Tuple(
        title=_("Availability"),
        help=_("If drops below these percentages over the past minute, issue alert"),
        elements=[
            Percentage(title=_("Warn if below"), default_value=98),
            Percentage(title=_("Crit if below"), default_value=90),
        ],
    )

    capacity_levels = Tuple(
        title=_("Capacity used"),
        help=_("If goes above these percentages over the past minute, issue alert"),
        elements=[
            Percentage(title=_("Warn if above"), default_value=80),
            Percentage(title=_("Crit if above"), default_value=98),
        ],
    )

    latency_levels = Tuple(
        title=_("Request latency"),
        help=_("If goes above the average milliseconds over the past minute, issue alert"),
        elements=[
            Integer(title=_("Warn if above"), default_value=100, minvalue=0),
            Integer(title=_("Crit if above"), default_value=2000, minvalue=0),
        ],
    )

    return Dictionary(
        title=_("Azure Key Vault Metric Checks"),
        elements=[
            ("availability", availability_levels),
            ("capacity", capacity_levels),
            ("latency", latency_levels),
        ],
    )
def _valuespec_special_agents_azure_firewall_metric_check():
    """Build the rule valuespec for the Firewall metric thresholds.

    Returns a Dictionary with the keys "availability" and "latency", each
    holding a two-element (warn, crit) Tuple.
    """
    availability_levels = Tuple(
        title=_("Availability"),
        help=_("If drops below these percentages over the past minute, issue alert"),
        elements=[
            Percentage(title=_("Warn if below"), default_value=98),
            Percentage(title=_("Crit if below"), default_value=90),
        ],
    )

    latency_levels = Tuple(
        title=_("Request latency"),
        help=_("If goes above the average milliseconds over the past minute, issue alert"),
        elements=[
            Integer(title=_("Warn if above"), default_value=100, minvalue=0),
            Integer(title=_("Crit if above"), default_value=2000, minvalue=0),
        ],
    )

    return Dictionary(
        title=_("Azure Firewall Metric Checks"),
        elements=[
            ("availability", availability_levels),
            ("latency", latency_levels),
        ],
    )
rulespec_registry.register( rulespec_registry.register(
HostRulespec( HostRulespec(
@ -122,4 +197,3 @@ rulespec_registry.register(
parameter_valuespec=_valuespec_special_agents_azure_keyvault_metric_check, parameter_valuespec=_valuespec_special_agents_azure_keyvault_metric_check,
) )
) )