diff --git a/vspc_backup_checks/2.3/local/lib/python3/cmk_addons/plugins/vspc_backup_checks/libexec/agent_vspc_backup_checks b/vspc_backup_checks/2.3/local/lib/python3/cmk_addons/plugins/vspc_backup_checks/libexec/agent_vspc_backup_checks index dd96a85..ba21922 100755 --- a/vspc_backup_checks/2.3/local/lib/python3/cmk_addons/plugins/vspc_backup_checks/libexec/agent_vspc_backup_checks +++ b/vspc_backup_checks/2.3/local/lib/python3/cmk_addons/plugins/vspc_backup_checks/libexec/agent_vspc_backup_checks @@ -99,9 +99,13 @@ def parse_arguments(): # But it's more complicated than that: jobs tell us the last time they ran, not # the last time they *successfully* ran, which is important for when we WARN or # CRIT. To get this information we need info from -# protectedWorkloads/computersManagedByConsole (here in the "managed" argument), -# which contain the last restore point. That last restore point tells us when -# the last successful backup was. +# protectedWorkloads/computersManagedByConsole/restorePoints (here in the +# "restores" argument), which contains all the recent restore points. From that +# we can determine when the last successful backup was. +# +# Note that protectedWorkloads/computersManagedByConsole returns bad data on +# failed backup jobs. We *must* explicitly get data from the restore endpoint +# and use the most recent restore point. # # Cumulatively, we check statuses, the scheduled job type, error messages, and # how long since the last run and successful run. We return a table of the @@ -117,7 +121,7 @@ def parse_arguments(): # ], # ... 
# } -def process(mAgents, bAgents, jobs, managed): +def process(mAgents, bAgents, jobs, restores): mToB = {} for agent in bAgents: mToB[agent["managementAgentUid"]] = agent @@ -127,9 +131,20 @@ def process(mAgents, bAgents, jobs, managed): if job["isEnabled"]: bToJ[job["backupAgentUid"]].append(job) - bToM = {} - for m in managed: - bToM[m["backupAgentUid"]] = m + # The list of restores must ultimately be filtered to the most recent + # restores across both the backup agent ID and job ID. This is because a + # VM's backup agent (the backup agent ID) can run several different kinds + # of jobs (job ID). For example, both a home directory backup and a DB + # directory backup. So we want the most recent successful restore point + # for every (backup agent ID, job ID) pair. + bToR = defaultdict(dict) + for r in restores: + bId = r["backupAgentUid"] + jId = r["jobUid"] + + most_recent = bToR[bId].get(jId) + if not most_recent or most_recent["creationDate"] < r["creationDate"]: + bToR[bId][jId] = r results = defaultdict(list) for mAgent in mAgents: @@ -169,7 +184,7 @@ def process(mAgents, bAgents, jobs, managed): }) continue - mEntry = bToM.get(bAgentId) + rEntries = bToR.get(bAgentId) jobs = bToJ.get(bAgentId) if not jobs: @@ -231,13 +246,11 @@ def process(mAgents, bAgents, jobs, managed): "message": f"Backup agent {bAgentId} job {jobId} is healthy; last backup ran {daysSinceLastRun:.1f} days ago." 
}) - if not mEntry: - continue - - lastRestorePoint = mEntry["latestRestorePointDate"] - if not lastRestorePoint: + rEntry = rEntries.get(jobId) + if not rEntry: continue + lastRestorePoint = rEntry["creationDate"] timeSinceLastSuccess = datetime.now(timezone.utc) - datetime.fromisoformat(lastRestorePoint) daysSinceLastSuccess = timeSinceLastSuccess.seconds / SECONDS_PER_DAY @@ -309,12 +322,12 @@ def main(argv=None): if args.demo: return print_demo() - mAgents = get_paginated_json_url(args.hostname, args.port, '/api/v3/infrastructure/managementAgents', args.token, args.insecure) - bAgents = get_paginated_json_url(args.hostname, args.port, '/api/v3/infrastructure/backupAgents', args.token, args.insecure) - jobs = get_paginated_json_url(args.hostname, args.port, '/api/v3/infrastructure/backupAgents/jobs', args.token, args.insecure) - managed = get_paginated_json_url(args.hostname, args.port, '/api/v3/protectedWorkloads/computersManagedByConsole', args.token, args.insecure) + mAgents = get_paginated_json_url(args.hostname, args.port, '/api/v3/infrastructure/managementAgents', args.token, args.insecure) + bAgents = get_paginated_json_url(args.hostname, args.port, '/api/v3/infrastructure/backupAgents', args.token, args.insecure) + jobs = get_paginated_json_url(args.hostname, args.port, '/api/v3/infrastructure/backupAgents/jobs', args.token, args.insecure) + restores = get_paginated_json_url(args.hostname, args.port, '/api/v3/protectedWorkloads/computersManagedByConsole/restorePoints', args.token, args.insecure) - results = process(mAgents, bAgents, jobs, managed) + results = process(mAgents, bAgents, jobs, restores) print_out(results)