Fix some metric checks.

This commit is contained in:
Marsell Kukuljevic 2024-08-24 22:49:22 +02:00
parent 88da961bd1
commit 29436f795d
3 changed files with 226 additions and 38 deletions

View File

@ -6,6 +6,22 @@ from datetime import datetime, timezone
from cmk.base.plugins.agent_based.agent_based_api.v1 import Metric, register, Result, Service, State
def check_state_below(alert_percentages, measured_percent):
    """Return the monitoring state for a metric that alerts when it drops.

    alert_percentages: a (warn, crit) pair of lower thresholds, or a falsy
        value (e.g. None) when no thresholds are configured.
    measured_percent: the measured value to compare against the thresholds.

    Returns State.CRIT when the value is below the crit threshold, State.WARN
    when it is below the warn threshold, and State.OK otherwise (including
    when no thresholds are configured).
    """
    if alert_percentages:
        # The thresholds are lower bounds: the *measurement* has to fall
        # beneath them to alert. Crit is tested first because it is the more
        # severe (lower) of the two.
        if measured_percent < alert_percentages[1]:
            return State.CRIT
        if measured_percent < alert_percentages[0]:
            return State.WARN

    return State.OK
def check_state_above(alert_percentages, measured_percent):
    """Return the monitoring state for a metric that alerts when it rises.

    alert_percentages: a (warn, crit) pair of upper thresholds, or a falsy
        value (e.g. None) when no thresholds are configured. Despite the
        name, callers in this file also pass millisecond thresholds.
    measured_percent: the measured value to compare against the thresholds.

    Returns State.CRIT when the value has reached the crit threshold,
    State.WARN when it has reached the warn threshold, and State.OK otherwise
    (including when no thresholds are configured).
    """
    if alert_percentages:
        # The thresholds are upper bounds: the *measurement* has to reach or
        # exceed them to alert. Crit is tested first because it is the more
        # severe (higher) of the two.
        if measured_percent >= alert_percentages[1]:
            return State.CRIT
        if measured_percent >= alert_percentages[0]:
            return State.WARN

    return State.OK
# Convert JSON entries into dictionaries indexed by name. We're assuming here
# that the name is unique across AZs and resource groups. If not, add the
# 'location' and 'resource_group' fields in each object to the name.
@ -27,26 +43,110 @@ def discover(section):
yield Service(item=name)
# Given a specific keyvault metric, look it up in the parsed output, and produce
# results on that service based upon the metric's range.
def check_keyvault(item, params, section):
    """Yield Results and Metrics for the Key Vault named by ``item``.

    item: name of the vault; used as the key into ``section``.
    params: rule parameters; the optional keys "availability", "capacity" and
        "latency" each hold a (warn, crit) pair.
    section: parsed agent output, mapping vault name to an object that has a
        "metrics" mapping.

    Yields nothing when the vault is absent from ``section``. Otherwise, for
    every metric present, yields a Result carrying the evaluated state
    followed by the matching Metric.
    """
    vault = section.get(item)
    if vault is None:
        return

    metrics = vault["metrics"]

    availability = metrics.get("Availability")
    capacity = metrics.get("SaturationShoebox")
    latency = metrics.get("ServiceApiLatency")
    hits = metrics.get("ServiceApiHit")
    results = metrics.get("ServiceApiResult")

    alert_availability_percent = params.get("availability")
    alert_capacity_percent = params.get("capacity")
    alert_latency_milliseconds = params.get("latency")

    # Metrics are tested against None rather than for truthiness: a reading of
    # 0 (for instance 0% availability) is a real measurement and must still be
    # evaluated and reported.
    if availability is not None:
        yield Result(
            state=check_state_below(alert_availability_percent, availability),
            summary=f"Availability: {availability}%",
        )
        yield Metric(
            name="availability",
            value=availability,
            levels=alert_availability_percent,
            boundaries=(0, 100),
        )

    if capacity is not None:
        yield Result(
            state=check_state_above(alert_capacity_percent, capacity),
            summary=f"Capacity used: {capacity}%",
        )
        yield Metric(
            name="capacity",
            value=capacity,
            levels=alert_capacity_percent,
            boundaries=(0, 100),
        )

    if latency is not None:
        yield Result(
            state=check_state_above(alert_latency_milliseconds, latency),
            summary=f"Latency: {latency} ms",
        )
        yield Metric(
            name="latency",
            value=latency,
            levels=alert_latency_milliseconds,
            boundaries=(0, None),
        )

    # The remaining metrics have no thresholds; they are reported as OK.
    if hits is not None:
        yield Result(state=State.OK, summary=f"Hits: {hits}")
        yield Metric(
            name="hits",
            value=hits,
            boundaries=(0, None),
        )

    if results is not None:
        yield Result(state=State.OK, summary=f"Results: {results}")
        yield Metric(
            name="results",
            value=results,
            boundaries=(0, None),
        )
# Given a specific firewall metric, look it up in the parsed output, and produce
# results on that service based upon the metric's range.
def check_firewall(item, params, section):
    """Yield Results and Metrics for the firewall named by ``item``.

    item: name of the firewall; used as the key into ``section``.
    params: rule parameters; the optional keys "availability" and "latency"
        each hold a (warn, crit) pair.
    section: parsed agent output, mapping firewall name to an object that has
        a "metrics" mapping.

    Yields nothing when the firewall is absent from ``section``. Otherwise,
    for every metric present, yields a Result carrying the evaluated state
    followed by the matching Metric.
    """
    firewall = section.get(item)
    if firewall is None:
        return

    metrics = firewall["metrics"]

    availability = metrics.get("FirewallHealth")
    throughput = metrics.get("Throughput")
    latency = metrics.get("FirewallLatencyPng")

    alert_availability_percent = params.get("availability")
    alert_latency_milliseconds = params.get("latency")

    # Metrics are tested against None rather than for truthiness: a reading of
    # 0 (for instance 0% health) is a real measurement and must still be
    # evaluated and reported.
    if availability is not None:
        yield Result(
            state=check_state_below(alert_availability_percent, availability),
            summary=f"Availability: {availability}%",
        )
        yield Metric(
            name="availability",
            value=availability,
            levels=alert_availability_percent,
            boundaries=(0, 100),
        )

    if latency is not None:
        yield Result(
            state=check_state_above(alert_latency_milliseconds, latency),
            summary=f"Latency: {latency} ms",
        )
        yield Metric(
            name="latency",
            value=latency,
            levels=alert_latency_milliseconds,
            boundaries=(0, None),
        )

    # Throughput has no thresholds; it is reported as OK.
    if throughput is not None:
        yield Result(state=State.OK, summary=f"Throughput: {throughput}")
        yield Metric(
            name="throughput",
            value=throughput,
            boundaries=(0, None),
        )
# Check function registered for the "azure_defender_alerts" plugin.
# NOTE(review): neither `state` nor `remaining_days` is assigned anywhere in
# this function as visible here, so iterating the generator will presumably
# raise NameError unless they exist as module globals -- TODO confirm. The
# yield looks like a leftover from the earlier certificate-expiry check and
# still needs a real Defender alert evaluation.
def check_defender(item, params, section):
    yield Result(state=state, summary="Expires in %d days" % remaining_days)
@ -75,7 +175,7 @@ register.check_plugin(
name="azure_firewall_metrics",
service_name="Azure Firewall Metric %s",
check_function=check_keyvault,
check_function=check_firewall,
check_default_parameters={},
check_ruleset_name="azure_firewall_metrics",
@ -91,7 +191,7 @@ register.check_plugin(
name="azure_defender_alerts",
service_name="Azure Defender Alert %s",
check_function=check_keyvault,
check_function=check_defender,
check_default_parameters={},
check_ruleset_name="azure_defender_alerts",

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python3
# Copyright (C) 2024 Spearhead Systems SRL
from urllib import request, parse
from urllib import request, parse, error
from datetime import datetime, timezone, timedelta
import json
import sys
@ -26,6 +26,18 @@ REGION_RE = re.compile('/locations/(.+?)/')
RESOURCE_GROUP_RE = re.compile('/resourceGroups/(.+?)/')
# https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/request-limits-and-throttling
def get_url(req, default):
    """Fetch ``req`` and return the raw response body.

    req: a URL string or ``urllib.request.Request`` to open.
    default: value returned instead of the body when the server answers with
        HTTP 429 (throttled).

    Raises ``urllib.error.HTTPError`` for any other HTTP error status.
    """
    try:
        # The context manager closes the response (and its connection) even
        # if reading the body fails.
        with request.urlopen(req) as res:
            return res.read()
    except error.HTTPError as e:
        # Being throttled is an expected condition: hand the caller its
        # fallback rather than failing the whole run.
        if e.code == 429:
            return default
        raise
def get_token(tenant, username, password):
data = parse.urlencode({
'username': username,
@ -40,9 +52,12 @@ def get_token(tenant, username, password):
req = request.Request(f'https://login.microsoftonline.com/{tenant}/oauth2/v2.0/token',
data=str.encode(data))
res = request.urlopen(req)
token_data = json.loads(res.read())
res = get_url(req, None)
if res is None:
return
token_data = json.loads(res)
token = token_data['access_token']
return token
@ -50,8 +65,8 @@ def get_token(tenant, username, password):
def get_json(token, path, version='2023-07-01'):
    """Query the Azure management API and return the response's 'value' entry.

    token: bearer token sent in the Authorization header.
    path: resource path appended to https://management.azure.com; it may
        already contain a query string.
    version: value of the api-version query parameter.

    Returns the decoded 'value' member of the JSON response, or an empty list
    when the request was throttled (HTTP 429).
    """
    # Append api-version with '&' when the path already has a query string.
    separator = '&' if '?' in path else '?'
    url = f"https://management.azure.com{path}{separator}api-version={version}"
    req = request.Request(url, headers={'Authorization': f'Bearer {token}'})

    # The throttling fallback must have the same shape as a real response
    # (an object with a 'value' member); a bare "[]" would make the lookup
    # below fail with TypeError.
    res = get_url(req, '{"value": []}')
    data = json.loads(res)
    return data['value']
@ -157,4 +172,3 @@ for subscription in list_subscriptions(token):
'resource_group': re.search(RESOURCE_GROUP_RE, vault['id'])[1],
'metrics': metrics_to_lookup(metrics),
})

View File

@ -56,30 +56,105 @@ def _valuespec_special_agents_azure_discovery():
def _valuespec_special_agents_azure_keyvault_metric_check():
    """Build the rule valuespec for the Azure Key Vault metric checks.

    Returns a Dictionary with three entries, each a two-element
    (warn, crit) Tuple: "availability" and "capacity" as percentages, and
    "latency" as integer milliseconds.
    """
    return Dictionary(
        title=_("Azure Key Vault Metric Checks"),
        elements=[
            (
                "availability",
                Tuple(
                    title=_("Availability"),
                    help=_("If drops below these percentages over the past minute, issue alert"),
                    elements=[
                        Percentage(
                            title=_("Warn if below"),
                            default_value=98
                        ),
                        Percentage(
                            title=_("Crit if below"),
                            default_value=90
                        )
                    ]
                )
            ),
            (
                "capacity",
                Tuple(
                    title=_("Capacity used"),
                    help=_("If goes above these percentages over the past minute, issue alert"),
                    elements=[
                        Percentage(
                            title=_("Warn if above"),
                            default_value=80
                        ),
                        Percentage(
                            title=_("Crit if above"),
                            default_value=98
                        )
                    ]
                )
            ),
            (
                "latency",
                Tuple(
                    title=_("Request latency"),
                    help=_("If goes above the average milliseconds over the past minute, issue alert"),
                    elements=[
                        Integer(
                            title=_("Warn if above"),
                            default_value=100,
                            minvalue=0,
                        ),
                        Integer(
                            title=_("Crit if above"),
                            default_value=2000,
                            minvalue=0,
                        )
                    ]
                )
            ),
        ],
    )
def _valuespec_special_agents_azure_firewall_metric_check():
    """Build the rule valuespec for the Azure Firewall metric checks.

    Returns a Dictionary with two entries, each a two-element (warn, crit)
    Tuple: "availability" as percentages and "latency" as integer
    milliseconds.
    """
    return Dictionary(
        title=_("Azure Firewall Metric Checks"),
        elements=[
            (
                "availability",
                Tuple(
                    title=_("Availability"),
                    help=_("If drops below these percentages over the past minute, issue alert"),
                    elements=[
                        Percentage(
                            title=_("Warn if below"),
                            default_value=98
                        ),
                        Percentage(
                            title=_("Crit if below"),
                            default_value=90
                        )
                    ]
                )
            ),
            (
                "latency",
                Tuple(
                    title=_("Request latency"),
                    help=_("If goes above the average milliseconds over the past minute, issue alert"),
                    elements=[
                        Integer(
                            title=_("Warn if above"),
                            default_value=100,
                            minvalue=0,
                        ),
                        Integer(
                            title=_("Crit if above"),
                            default_value=2000,
                            minvalue=0,
                        )
                    ]
                )
            ),
        ],
    )
rulespec_registry.register(
HostRulespec(
@ -122,4 +197,3 @@ rulespec_registry.register(
parameter_valuespec=_valuespec_special_agents_azure_keyvault_metric_check,
)
)