diff --git a/check_mk-azure/local/lib/check_mk/base/plugins/agent_based/azure.py b/check_mk-azure/local/lib/check_mk/base/plugins/agent_based/azure.py index 616983c..9ab4b0d 100644 --- a/check_mk-azure/local/lib/check_mk/base/plugins/agent_based/azure.py +++ b/check_mk-azure/local/lib/check_mk/base/plugins/agent_based/azure.py @@ -6,6 +6,22 @@ from datetime import datetime, timezone from cmk.base.plugins.agent_based.agent_based_api.v1 import register, Result, Service, State +def check_state_below(alert_percentages, measured_percent): + if alert_percentages: + if alert_percentages[1] <= measured_percent: + return State.CRIT + elif alert_percentages[0] <= measured_percent: + return State.WARN + return State.OK + +def check_state_above(alert_percentages, measured_percent): + if alert_percentages: + if alert_percentages[1] >= measured_percent: + return State.CRIT + elif alert_percentages[0] >= measured_percent: + return State.WARN + return State.OK + # Convert JSON entries into dictionaries indexed by name. We're assuming here # that the name is unique across AZs and resource groups. If not, add the # 'location' and 'resource_group' fields in each object to the name. @@ -27,26 +43,110 @@ def discover(section): yield Service(item=name) -# Given a specific metric, look it up in the parsed output, and produce +# Given a specific keyvault metric, look it up in the parsed output, and produce # results on that service based upon the metric's range. def check_keyvault(item, params, section): - warn_days = params.get("warn_days") - crit_days = params.get("crit_days") - - cert = section.get(item) - if cert is None: + vault = section.get(item) + if vault is None: return - expires = datetime.fromisoformat(cert["attributes"]["expires"]) - now = datetime.now(timezone.utc) - remaining_days = (expires - now).days + metrics = vault["metrics"] - state = State.OK - if crit_days is not None and remaining_days < crit_days: - state = State.CRIT - elif warn_days is not None and remaining_days < warn_days: - state = State.WARN + availability = metrics.get("Availability") + capacity = metrics.get("SaturationShoebox") + latency = metrics.get("ServiceApiLatency") + hits = metrics.get("ServiceApiHit") + results = metrics.get("ServiceApiResult") + alert_availability_percent = params.get("availability") + alert_capacity_percent = params.get("capacity") + alert_latency_milliseconds = params.get("latency") + + if availability: + check_state_below(alert_availability_percent, availability) + yield Metric( + name="availability", + value=availability, + levels=alert_availability_percent, + boundaries=(0, 100) + ) + + if capacity: + check_state_above(alert_capacity_percent, capacity) + yield Metric( + name="capacity", + value=capacity, + levels=alert_capacity_percent, + boundaries=(0, 100) + ) + + if latency: + check_state_above(alert_latency_milliseconds, latency) + yield Metric( + name="latency", + value=latency, + levels=alert_latency_milliseconds, + boundaries=(0, None) + ) + + if hits: + yield Metric( + name="hits", + value=hits, + boundaries=(0, None) + ) + + if results: + yield Metric( + name="results", + value=results, + boundaries=(0, None) + ) + + +# Given a specific firewall metric, look it up in the parsed output, and produce +# results on that service based upon the metric's range. +def check_firewall(item, params, section): + firewall = section.get(item) + if firewall is None: + return + + metrics = vault["metrics"] + + availability = metrics.get("FirewallHealth") + throughput = metrics.get("Throughput") + latency = metrics.get("FirewallLatencyPng") + + alert_availability_percent = params.get("availability") + alert_latency_milliseconds = params.get("latency") + + if availability: + check_state_below(alert_availability_percent, availability) + yield Metric( + name="availability", + value=availability, + levels=alert_availability_percent, + boundaries=(0, 100) + ) + + if latency: + check_state_above(alert_latency_milliseconds, latency) + yield Metric( + name="latency", + value=latency, + levels=alert_latency_milliseconds, + boundaries=(0, None) + ) + + if throughput: + yield Metric( + name="throughput", + value=thoughput, + boundaries=(0, None) + ) + + +def check_defender(item, params, section): yield Result(state=state, summary="Expires in %d days" % remaining_days) @@ -75,7 +175,7 @@ register.check_plugin( name="azure_firewall_metrics", service_name="Azure Firewall Metric %s", - check_function=check_keyvault, + check_function=check_firewall, check_default_parameters={}, check_ruleset_name="azure_firewall_metrics", @@ -91,7 +191,7 @@ register.check_plugin( name="azure_defender_alerts", service_name="Azure Defender Alert %s", - check_function=check_keyvault, + check_function=check_defender, check_default_parameters={}, check_ruleset_name="azure_defender_alerts", diff --git a/check_mk-azure/local/share/check_mk/agents/special/agent_azure b/check_mk-azure/local/share/check_mk/agents/special/agent_azure index 7fe82f7..ec7ab2a 100755 --- a/check_mk-azure/local/share/check_mk/agents/special/agent_azure +++ b/check_mk-azure/local/share/check_mk/agents/special/agent_azure @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # Copyright (C) 2024 Spearhead Systems SRL -from urllib import request, parse +from urllib import request, parse, error from datetime import datetime, timezone, timedelta import json import sys @@ -26,6 +26,18 @@ REGION_RE = re.compile('/locations/(.+?)/') RESOURCE_GROUP_RE = re.compile('/resourceGroups/(.+?)/') +# https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/request-limits-and-throttling +def get_url(req, default): + try: + res = request.urlopen(req) + return res.read() + except error.HTTPError as e: + if e.code == 429: + return default + else: + raise e + + def get_token(tenant, username, password): data = parse.urlencode({ 'username': username, @@ -40,9 +52,12 @@ def get_token(tenant, username, password): req = request.Request(f'https://login.microsoftonline.com/{tenant}/oauth2/v2.0/token', data=str.encode(data)) - res = request.urlopen(req) - token_data = json.loads(res.read()) + res = get_url(req, None) + if res is None: + return + + token_data = json.loads(res) token = token_data['access_token'] return token @@ -50,8 +65,8 @@ def get_token(tenant, username, password): def get_json(token, path, version='2023-07-01'): url = f"https://management.azure.com{path}{'?' in path and '&' or '?'}api-version={version}" req = request.Request(url, headers={'Authorization': f'Bearer {token}'}) - res = request.urlopen(req) - data = json.loads(res.read()) + res = get_url(req, "[]") + data = json.loads(res) return data['value'] @@ -157,4 +172,3 @@ for subscription in list_subscriptions(token): 'resource_group': re.search(RESOURCE_GROUP_RE, vault['id'])[1], 'metrics': metrics_to_lookup(metrics), }) - diff --git a/check_mk-azure/local/share/check_mk/web/plugins/wato/azure.py b/check_mk-azure/local/share/check_mk/web/plugins/wato/azure.py index ef1f3e2..527dff1 100644 --- a/check_mk-azure/local/share/check_mk/web/plugins/wato/azure.py +++ b/check_mk-azure/local/share/check_mk/web/plugins/wato/azure.py @@ -56,30 +56,105 @@ def _valuespec_special_agents_azure_discovery(): def _valuespec_special_agents_azure_keyvault_metric_check(): return Dictionary( title=_("Azure Key Vault Metric Checks"), - optional_keys=["warn_percent", "crit_percent"], elements=[ ( - "warn_percent", - Integer( - minvalue=0, - default_value=98, - title=_("Warn when percentage falls below this threshold"), - ), + "availability", + Tuple( + title=_("Availability"), + help=_("If drops below these percentages over the past minute, issue alert"), + elements=[ + Percentage( + title=_("Warn if below"), + default_value=98 + ), + Percentage( + title=_("Crit if below"), + default_value=90 + ) + ] + ) ), ( - "crit_percent", - Integer( - minvalue=0, - default_value=90, - title=_("Warn when percentage falls below this threshold"), - ), + "capacity", + Tuple( + title=_("Capacity used"), + help=_("If goes above these percentages over the past minute, issue alert"), + elements=[ + Percentage( + title=_("Warn if above"), + default_value=80 + ), + Percentage( + title=_("Crit if above"), + default_value=98 + ) + ] + ) + ), + ( + "latency", + Tuple( + title=_("Request latency"), + help=_("If goes above the average milliseconds over the past minute, issue alert"), + elements=[ + Integer( + title=_("Warn if above"), + default_value=100, + minvalue=0, + ), + Integer( + title=_("Crit if above"), + default_value=2000, + minvalue=0, + ) + ] + ) ), ], ) def _valuespec_special_agents_azure_firewall_metric_check(): - return _valuespec_special_agents_azure_keyvault_metric_check() - + return Dictionary( + title=_("Azure Firewall Metric Checks"), + elements=[ + ( + "availability", + Tuple( + title=_("Availability"), + help=_("If drops below these percentages over the past minute, issue alert"), + elements=[ + Percentage( + title=_("Warn if below"), + default_value=98 + ), + Percentage( + title=_("Crit if below"), + default_value=90 + ) + ] + ) + ), + ( + "latency", + Tuple( + title=_("Request latency"), + help=_("If goes above the average milliseconds over the past minute, issue alert"), + elements=[ + Integer( + title=_("Warn if above"), + default_value=100, + minvalue=0, + ), + Integer( + title=_("Crit if above"), + default_value=2000, + minvalue=0, + ) + ] + ) + ), + ], + ) rulespec_registry.register( HostRulespec( @@ -122,4 +197,3 @@ rulespec_registry.register( parameter_valuespec=_valuespec_special_agents_azure_keyvault_metric_check, ) ) -