#!/usr/bin/env python3
# Copyright (C) 2026 Spearhead Systems SRL
#
# Special agent script, added by this patch as
# vspc_backup_checks/2.3/local/lib/python3/cmk_addons/plugins/vspc_backup_checks/libexec/agent_vspc_backup_checks
# (new file, mode 100755).

import argparse
import http.client
import json
import ssl
from collections import defaultdict
from datetime import datetime, timezone


OK = 0
WARN = 1
CRIT = 2

SECONDS_PER_DAY = 24 * 60 * 60


# GET HTTP with Bearer auth. Returns data structure parsed from JSON.
#
# Since results in VSPC are paginated, we go through all pages and return an
# array of all results.
#
# We reuse the HTTP connection to reduce overhead.
def get_paginated_json_url(host, port, path, token, insecure):
    """Fetch every page of a paginated VSPC collection and return all items.

    Args:
        host: Hostname or IP address of the VSPC server.
        port: TCP port of the VSPC API.
        path: Request path of the collection, without a query string (the
            ``offset`` query parameter is appended here).
        token: API token, sent as ``Authorization: Bearer <token>``.
        insecure: When true, skip TLS certificate and hostname verification.

    Returns:
        A list holding the concatenated ``data`` entries of all pages.

    Raises:
        RuntimeError: If a page is answered with a status other than 200.
    """
    ctx = None
    if insecure:
        ctx = ssl.SSLContext(protocol=ssl.PROTOCOL_TLS_CLIENT)
        # check_hostname has to be switched off before verify_mode may be
        # set to CERT_NONE.
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE

    conn = http.client.HTTPSConnection(host, port=port, timeout=10, context=ctx)
    headers = {"Authorization": f"Bearer {token}"}

    results = []
    offset = 0

    # try/finally so the socket is released even when a request fails or a
    # page cannot be parsed.
    try:
        while True:
            conn.request("GET", f"{path}?offset={offset}", headers=headers)
            response = conn.getresponse()
            # Always drain the body; the connection can only be reused once
            # the previous response has been read completely.
            body = response.read()

            if response.status != 200:
                raise RuntimeError(
                    f"Status code for {path} was {response.status}"
                )

            page = json.loads(body)
            results.extend(page["data"])

            paging = page["meta"]["pagingInfo"]
            count = paging["count"]
            offset += count

            # An empty page means there is nothing left to fetch; without
            # this guard a page reporting count == 0 while offset < total
            # would make us request the same offset forever.
            if count <= 0 or offset >= paging["total"]:
                break
    finally:
        conn.close()

    return results


# Parse the command-line arguments. We have several options, but hostname is
# always required. Print out help to console if we get no args.
def parse_arguments(argv=None):
    """Parse the special agent's command line.

    Args:
        argv: Optional sequence of argument strings, without the program
            name. Defaults to ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` with the attributes ``hostname``,
        ``token``, ``port``, ``insecure`` and ``demo``.

    Raises:
        SystemExit: With status 2 after printing the full help to stderr
            when no arguments were given at all; argparse itself also exits
            on invalid or missing arguments.
    """
    # Local import: only needed here, to detect an empty command line.
    import sys

    if argv is None:
        argv = sys.argv[1:]

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "hostname", help="Hostname or IP of target VSPC"
    )
    parser.add_argument(
        "-t", "--token", required=True, help="API token"
    )
    parser.add_argument(
        "-p", "--port", default=1280, type=int, help="TCP port (default: 1280)"
    )
    parser.add_argument(
        "-k", "--insecure", default=False, help="Skip certificate verification",
        action="store_true"
    )
    parser.add_argument(
        "-d", "--demo", default=False, help="Show an example output",
        action="store_true"
    )

    if not argv:
        # Called with no arguments at all: show the complete help instead of
        # only argparse's terse "arguments are required" usage error. Help
        # goes to stderr so stdout stays free of non-agent output.
        parser.print_help(sys.stderr)
        parser.exit(2)

    return parser.parse_args(argv)


# Every server must have one management agent; if there's no management agent,
# the server effectively doesn't exist for backups. Every server *probably*
# has one backup agent, since it's what does the backups. If the backup agent
# is present, then it must have a management agent.
#
# Every backup agent can perform multiple jobs.
#
# Therefore we looked up all management agents, all backup agents, and all
# jobs. Then we walk from every management agent to the backup agent (if
# present), then match all jobs against that backup agent. We ignore jobs
# that aren't enabled.
#
# But it's more complicated than that: jobs tell us the last time they ran, not
# the last time they *successfully* ran, which is important for when we WARN or
# CRIT. To get this information we need info from
# protectedWorkloads/computersManagedByConsole (here in the "managed" argument),
# which contain the last restore point. That last restore point tells us when
# the last successful backup was.
#
# Cumulatively, we check statuses, the scheduled job type, error messages, and
# how long since the last run and successful run. We return a table of the
# format:
#
# {
#     <host>: [
#         {
#             "status": 0/1/2,
#             "message": "..."
#         },
#         ...
#     ],
#     ...
# }
def process(mAgents, bAgents, jobs, managed):
    """Evaluate all agents and jobs and collect status rows per host.

    Args:
        mAgents: Management agent records; ``tag``, ``hostName``,
            ``instanceUid`` and ``status`` are read.
        bAgents: Backup agent records; ``managementAgentUid``,
            ``instanceUid``, ``status`` and ``totalJobsCount`` are read.
        jobs: Backup job records; ``isEnabled``, ``backupAgentUid``,
            ``instanceUid``, ``lastEndTime``, ``scheduleType`` and
            ``failureMessage`` are read. Disabled jobs are ignored.
        managed: Protected-workload records; ``backupAgentUid`` and
            ``latestRestorePointDate`` are read.

    Returns:
        A ``defaultdict(list)`` mapping the host name (the management
        agent's ``tag``, or its ``hostName`` when the tag is empty) to a
        list of ``{"status": OK/WARN/CRIT, "message": str}`` rows. Hosts
        whose backup agent has no enabled job in ``jobs`` get no entry.
    """
    # One reference time for the whole run keeps all reported ages consistent.
    now = datetime.now(timezone.utc)

    backup_agent_by_mgmt = {agent["managementAgentUid"]: agent for agent in bAgents}

    jobs_by_backup_agent = defaultdict(list)
    for job in jobs:
        if job["isEnabled"]:
            jobs_by_backup_agent[job["backupAgentUid"]].append(job)

    managed_by_backup_agent = {entry["backupAgentUid"]: entry for entry in managed}

    results = defaultdict(list)
    for mAgent in mAgents:
        host = mAgent["tag"] or mAgent["hostName"]
        mAgentId = mAgent["instanceUid"]

        mStatus = mAgent["status"]
        if mStatus != "Healthy":
            results[host].append(
                _row(WARN, f"Managment agent {mAgentId} is: {mStatus}.")
            )
            continue

        bAgent = backup_agent_by_mgmt.get(mAgentId)
        if not bAgent:
            results[host].append(
                _row(WARN, "Host appears to have no backup agent.")
            )
            continue

        bAgentId = bAgent["instanceUid"]

        bStatus = bAgent["status"]
        if bStatus != "Active":
            results[host].append(
                _row(WARN, f"Backup agent {bAgentId} is {bStatus}.")
            )
            continue

        if bAgent["totalJobsCount"] == 0:
            results[host].append(
                _row(WARN, f"Backup agent {bAgentId} has no jobs.")
            )
            continue

        mEntry = managed_by_backup_agent.get(bAgentId)

        # results[host] is only touched when there is something to report,
        # so that hosts without enabled jobs do not show up as empty entries.
        for job in jobs_by_backup_agent.get(bAgentId, ()):
            rows = _check_job(bAgentId, job, mEntry, now)
            if rows:
                results[host].extend(rows)

    return results


# Age thresholds (in days) for daily jobs. We use 1.2 & 2.2 rather than 1 & 2
# to give wiggle room for jobs to complete if they take longer than expected.
_WARN_AFTER_DAYS = 1.2
_CRIT_AFTER_DAYS = 2.2


# Build one result row in the format returned by process().
def _row(status, message):
    return {"status": status, "message": message}


# Return the age of an ISO 8601 timestamp in (fractional) days relative to
# `now`, or None when there is no timestamp.
def _days_since(timestamp, now):
    if not timestamp:
        return None

    moment = datetime.fromisoformat(timestamp)
    if moment.tzinfo is None:
        # NOTE(review): assumes timestamps without a UTC offset are UTC --
        # confirm against the VSPC API. Subtracting a naive datetime from
        # the aware `now` would otherwise raise TypeError.
        moment = moment.replace(tzinfo=timezone.utc)

    # total_seconds() covers the whole interval. timedelta.seconds is only
    # the sub-day remainder (0..86399) and would make every age < 1 day.
    return (now - moment).total_seconds() / SECONDS_PER_DAY


# Map an age in days to OK / WARN / CRIT using the daily-job thresholds.
def _age_status(days):
    if days > _CRIT_AFTER_DAYS:
        return CRIT
    if days > _WARN_AFTER_DAYS:
        return WARN
    return OK


# Evaluate a single enabled job of backup agent `bAgentId` and return its
# result rows. `mEntry` is the protected-workload record of the backup agent
# (or None); its latest restore point is reported as the last successful run.
def _check_job(bAgentId, job, mEntry, now):
    jobId = job["instanceUid"]
    sched = job["scheduleType"]
    daysSinceLastRun = _days_since(job["lastEndTime"], now)

    if sched == "NotScheduled":
        return [_row(WARN, f"Backup agent {bAgentId} job {jobId} is not scheduled.")]

    failureMessage = job["failureMessage"]
    if failureMessage:
        if daysSinceLastRun is None:
            # A failed job without an end time cannot be aged, so it is
            # reported as WARN instead of crashing on a None comparison.
            return [_row(WARN, f"Backup agent {bAgentId} job {jobId} failed: {failureMessage}")]
        status = CRIT if daysSinceLastRun > _CRIT_AFTER_DAYS else WARN
        return [_row(
            status,
            f"Backup agent {bAgentId} job {jobId} failed {daysSinceLastRun:.1f} days ago: {failureMessage}",
        )]

    if sched != "Daily":
        return [_row(
            WARN,
            f"Backup agent {bAgentId} job {jobId} has scheduleType unknown to CheckMK plugin: {sched}.",
        )]

    # Compare against None explicitly: an age of 0.0 days is a valid value.
    if daysSinceLastRun is None:
        return [_row(OK, f"Backup agent {bAgentId} is healthy; no backups yet.")]

    rows = []

    status = _age_status(daysSinceLastRun)
    if status == OK:
        rows.append(_row(
            OK,
            f"Backup agent {bAgentId} job {jobId} is healthy; last backup ran {daysSinceLastRun:.1f} days ago.",
        ))
    else:
        rows.append(_row(
            status,
            f"Backup agent {bAgentId} job {jobId} is late! Last backup ran {daysSinceLastRun:.1f} days ago.",
        ))

    if not mEntry:
        return rows

    daysSinceLastSuccess = _days_since(mEntry["latestRestorePointDate"], now)
    if daysSinceLastSuccess is None:
        return rows

    status = _age_status(daysSinceLastSuccess)
    if status == OK:
        rows.append(_row(
            OK,
            f"Job {jobId} last successfully ran {daysSinceLastSuccess:.1f} days ago.",
        ))
    else:
        rows.append(_row(
            status,
            f"Job {jobId} last SUCCESSFULLY ran {daysSinceLastSuccess:.1f} days ago!",
        ))

    return rows


# Print a fixed example of the agent output. Each host is wrapped in
# piggyback markers (<<<<host>>>> ... <<<<>>>>) and carries one <<<local>>>
# section. The host names are placeholders.
def print_demo():
    print("""
<<<<demo-host-1>>>>
<<<local>>>
1 "Veeam Backup" - Host appears to have no backup agent.
<<<<>>>>
<<<<demo-host-2>>>>
<<<local>>>
1 "Veeam Backup" - Managment agent e4ade74b-4c5d-4204-a35c-68ccf2c73d16 is: Inaccessible.
<<<<>>>>
<<<<demo-host-3>>>>
<<<local>>>
0 "Veeam Backup" - Backup agent c1e3d991-1fc7-409c-b9ee-8e25abeb1774 job d7fd4b02-a80c-6e9b-a046-75c8031768a8 is healthy; last backup ran 0.9 days ago. | Job d7fd4b02-a80c-6e9b-a046-75c8031768a8 last successfully ran 0.0 days ago. | Backup agent c1e3d991-1fc7-409c-b9ee-8e25abeb1774 job 42bba0eb-0d7c-66e7-aa6e-fb2fcfb63f67 is healthy; last backup ran 0.0 days ago. | Job 42bba0eb-0d7c-66e7-aa6e-fb2fcfb63f67 last successfully ran 0.0 days ago.
<<<<>>>>
<<<<demo-host-4>>>>
<<<local>>>
0 "Veeam Backup" - Backup agent 3279b5ec-e65e-cd44-b749-a8e2ee0b634d job beebdcd2-2624-60dc-ae88-bb105ac75d3d is healthy; last backup ran 0.8 days ago. | Job beebdcd2-2624-60dc-ae88-bb105ac75d3d last successfully ran 0.8 days ago.
<<<<>>>>
<<<<demo-host-5>>>>
<<<local>>>
2 "Veeam Backup" - Backup agent 795349ba-038a-5580-b4c2-5ab361d41f8f job 63270e37-9ff4-6491-b062-b95a09af82b7 failed 2.1 days ago: Failed to start a backup job. server=oatpp/1.2.5 / code=401 / description=Unauthorized / message=Unauthorized /
<<<<>>>>
    """.strip())


# Print out all our results in a format that CheckMK understands: per host a
# piggyback block containing a single local-check line whose state is the
# worst state of all rows.
def print_out(results):
    for host, rows in results.items():
        print(f"<<<<{host}>>>>")
        print("<<<local>>>")

        max_status = max([OK] + [row["status"] for row in rows])
        # A local check must stay on one line, so line breaks inside
        # messages (including Windows-style CRLF) are flattened.
        messages = [
            row["message"].replace("\r\n", "\n").replace("\n", " / ")
            for row in rows
        ]
        summary = " | ".join(messages)

        print(f'{max_status} "Veeam Backup" - {summary}')
        print("<<<<>>>>")


# Check the status of all management agents, backup agents, and backup jobs.
# Print results.
def main(argv=None):
    """Query the VSPC API and print the check results as agent output.

    Args:
        argv: Accepted for signature compatibility; it is not forwarded,
            the command line is obtained through ``parse_arguments()``.

    Returns:
        None. In demo mode only the fixed demo output is printed.
    """
    args = parse_arguments()

    if args.demo:
        print_demo()
        return None

    # Order matters: it matches the positional parameters of process().
    endpoints = (
        '/api/v3/infrastructure/managementAgents',
        '/api/v3/infrastructure/backupAgents',
        '/api/v3/infrastructure/backupAgents/jobs',
        '/api/v3/protectedWorkloads/computersManagedByConsole',
    )
    mAgents, bAgents, jobs, managed = (
        get_paginated_json_url(args.hostname, args.port, path, args.token, args.insecure)
        for path in endpoints
    )

    results = process(mAgents, bAgents, jobs, managed)
    print_out(results)
    return None


if __name__ == "__main__":
    main()


# ---------------------------------------------------------------------------
# Next file in this patch (new file, mode 100644):
# vspc_backup_checks/2.3/local/lib/python3/cmk_addons/plugins/vspc_backup_checks/rulesets/vspc_backup_checks.py
# ---------------------------------------------------------------------------
#!/usr/bin/env python3
# Copyright (C) 2026 Spearhead Systems SRL

from cmk.rulesets.v1.form_specs import Dictionary, DictElement, String, Integer, Password, BooleanChoice, DefaultValue
from cmk.rulesets.v1.rule_specs import SpecialAgent, Topic, Title, Help
from cmk.rulesets.v1.form_specs.validators import LengthInRange, NumberInRange


# Build the rule form for the special agent: target host, port, API token
# and the two switches ("insecure", "demo"). All elements are required; the
# keys are the ones read by the server-side call in the next file.
def _formspec_vspc():
    return Dictionary(
        title=Title("VSPC Server Configuration"),
        elements={
            "instance": DictElement(
                required=True,
                parameter_form=String(
                    title=Title("Hostname / IP"),
                    help_text=Help("Host or IP of VSPC host for queries"),
                    custom_validate=(LengthInRange(min_value=1),),
                ),
            ),
            "port": DictElement(
                required=True,
                parameter_form=Integer(
                    title=Title("Port"),
                    help_text=Help("Port of VSPC host for query"),
                    prefill=DefaultValue(1280),
                    custom_validate=(NumberInRange(min_value=1, max_value=65535),),
                ),
            ),
            "token": DictElement(
                required=True,
                parameter_form=Password(
                    title=Title("API Token"),
                    help_text=Help("API token used for authentication by VSPC."),
                    custom_validate=(LengthInRange(min_value=1),),
                ),
            ),
            "insecure": DictElement(
                required=True,
                parameter_form=BooleanChoice(
                    title=Title("Insecure"),
                    help_text=Help("Ignore unverified HTTPS certificate warnings when contacting VSPC"),
                    prefill=DefaultValue(False),
                ),
            ),
            "demo": DictElement(
                required=True,
                parameter_form=BooleanChoice(
                    title=Title("Demo"),
                    help_text=Help("Puts agent into demo mode, returning fixed demo data regardless of VSPC results"),
                    prefill=DefaultValue(False),
                ),
            ),
        },
    )


rule_spec_agent_config_vspc_backup_checks = SpecialAgent(
    topic=Topic.NETWORKING,
    name="vspc_backup_checks",
    title=Title("VSPC Backup Checks"),
    parameter_form=_formspec_vspc,
)


# ---------------------------------------------------------------------------
# Next file in this patch (new file, mode 100644):
# vspc_backup_checks/2.3/local/lib/python3/cmk_addons/plugins/vspc_backup_checks/server_side_calls/agent_vspc_backup_checks.py
# ---------------------------------------------------------------------------
#!/usr/bin/env python3
# Copyright (C) 2026 Spearhead Systems SRL

from cmk.server_side_calls.v1 import noop_parser, SpecialAgentConfig, SpecialAgentCommand


# Translate the rule parameters into the command line of the special agent:
#   [--insecure] [--demo] --port <port> --token <token> <instance>
# host_config is not used.
def _agent_arguments(params, host_config):
    token = params["token"]
    # isinstance() instead of comparing type objects, so str subclasses are
    # treated as plain strings as well. Anything else must offer .unsafe()
    # (presumably the Checkmk secret object -- verify against the API docs).
    if not isinstance(token, str):
        # NOTE(review): this puts the token on the agent's command line --
        # confirm that this exposure is acceptable.
        token = token.unsafe()

    args = []

    if params["insecure"]:
        args.append("--insecure")
    if params["demo"]:
        args.append("--demo")

    args.extend(["--port", str(params["port"])])
    args.extend(["--token", token])
    args.append(params["instance"])

    yield SpecialAgentCommand(command_arguments=args)


special_agent_vspc_backup_checks = SpecialAgentConfig(
    name="vspc_backup_checks",
    parameter_parser=noop_parser,
    commands_function=_agent_arguments,
)

# The patch additionally adds the binary package
# vspc_backup_checks/2.3/vspc_backup_checks-0.2.0.mkp (new file, mode 100755).