#!/usr/bin/env python3
# Copyright (C) 2026 Spearhead Systems SRL

import http.client, argparse, json, ssl
from datetime import datetime, timezone
from collections import defaultdict


# Status codes attached to every result row; print_out() emits the highest one
# per host as the leading number of the check line.
OK, WARN, CRIT = range(3)

# Number of seconds in a day, used to convert elapsed seconds into days.
SECONDS_PER_DAY = 86_400


# GET HTTP with Bearer auth. Returns data structure parsed from JSON.
#
# Since results in VSPC are paginated, we go through all pages and return a
# list of all results. Every page must be a JSON object with a "data" list and
# a "meta" object whose "pagingInfo" carries "total" and "count"; the next
# page is requested with "?offset=N", where N is the sum of the counts so far.
#
# We reuse the HTTP connection across pages to reduce overhead, and we always
# close it, even when a request or the JSON parsing fails.
#
# Arguments:
#   host, port - where the API listens
#   path       - API path without a query string (we append "?offset=N")
#   token      - sent as "Authorization: Bearer <token>"
#   insecure   - if true, skip TLS certificate and hostname verification
#
# Raises Exception if any page comes back with a status other than 200.
def get_paginated_json_url(host, port, path, token, insecure):
    ctx = None
    if insecure:
        ctx = ssl.SSLContext(protocol=ssl.PROTOCOL_TLS_CLIENT)
        # check_hostname must be switched off before verify_mode may be set
        # to CERT_NONE.
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE

    conn = http.client.HTTPSConnection(host, port=port, timeout=10, context=ctx)
    headers = { "Authorization": f"Bearer {token}" }

    results = []
    offset = 0

    try:
        while True:
            conn.request("GET", f"{path}?offset={offset}", headers=headers)
            response = conn.getresponse()

            if response.status != 200:
                raise Exception(f"Status code for {path} was {response.status}")

            # The body must be read completely before the connection can be
            # reused for the next page.
            page = json.loads(response.read())

            paging = page["meta"]["pagingInfo"]
            results.extend(page["data"])

            count = paging["count"]
            offset += count

            # A page with no entries makes no progress; stop instead of
            # requesting the same offset forever.
            if count <= 0 or offset >= paging["total"]:
                break
    finally:
        conn.close()

    return results


# Parse the command-line arguments. We have several options, but hostname and
# --token are always required (even together with --demo); if either is
# missing, argparse prints a usage message and exits.
#
# argv is the list of arguments to parse, without the program name. When it is
# None (the default) argparse falls back to sys.argv[1:].
#
# Returns the argparse namespace with: hostname, token, port (int, default
# 1280), insecure (bool) and demo (bool).
def parse_arguments(argv=None):
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "hostname", help="Hostname or IP of target VSPC"
    )
    parser.add_argument(
        "-t", "--token", required=True, help="API token"
    )
    parser.add_argument(
        "-p", "--port", default=1280, type=int, help="TCP port (default: 1280)"
    )
    parser.add_argument(
        "-k", "--insecure", default=False, help="Skip certificate verification",
        action="store_true"
    )
    parser.add_argument(
        "-d", "--demo", default=False, help="Show an example output",
        action="store_true"
    )

    return parser.parse_args(argv)


# Return how many days (as a float) have passed since the given ISO 8601
# timestamp, or None if the timestamp is empty or missing.
#
# We use total_seconds() rather than the timedelta's "seconds" attribute: the
# latter only holds the part below one day, so anything older than a day would
# wrap around and look recent.
#
# A trailing "Z" is rewritten to "+00:00" since older Pythons' fromisoformat()
# rejects it.
#
# NOTE(review): a timestamp without any UTC offset is assumed to be UTC here
# (subtracting it from an aware datetime would otherwise raise) -- confirm
# against what VSPC actually returns.
def _days_since(timestamp):
    if not timestamp:
        return None

    if timestamp.endswith("Z"):
        timestamp = timestamp[:-1] + "+00:00"

    then = datetime.fromisoformat(timestamp)
    if then.tzinfo is None:
        then = then.replace(tzinfo=timezone.utc)

    elapsed = datetime.now(timezone.utc) - then
    return elapsed.total_seconds() / SECONDS_PER_DAY


# Map an age in days to a status. We use 1.2 & 2.2 rather than 1 & 2 to give
# wiggle room for jobs to complete if they take longer than expected.
def _status_for_age(days):
    if days > 2.2:
        return CRIT
    if days > 1.2:
        return WARN
    return OK


# Every server must have one management agent; if there's no management agent,
# the server effectively doesn't exist for backups. Every server *probably*
# has one backup agent, since it's what does the backups. If the backup agent
# is present, then it must have a management agent.
#
# Every backup agent can perform multiple jobs.
#
# Therefore we look up all management agents, all backup agents, and all
# jobs. Then we walk from every management agent to the backup agent (if
# present), then match all jobs against that backup agent. We ignore jobs
# that aren't enabled; a host whose backup agent has only disabled jobs gets
# no entry in the result at all.
#
# But it's more complicated than that: jobs tell us the last time they ran, not
# the last time they *successfully* ran, which is important for when we WARN or
# CRIT. To get this information we need info from
# protectedWorkloads/computersManagedByConsole (here in the "managed" argument),
# which contain the last restore point. That last restore point tells us when
# the last successful backup was.
#
# Arguments (all lists of dicts, as returned by the API):
#   mAgents - needs "tag", "hostName", "instanceUid", "status"
#   bAgents - needs "managementAgentUid", "instanceUid", "status",
#             "totalJobsCount"
#   jobs    - needs "isEnabled", "backupAgentUid", "instanceUid",
#             "lastEndTime", "scheduleType", "failureMessage"
#   managed - needs "backupAgentUid", "latestRestorePointDate"
#
# The host name is the management agent's "tag" if set, else its "hostName".
#
# Cumulatively, we check statuses, the scheduled job type, error messages, and
# how long since the last run and successful run. We return a table of the
# format:
#
# {
#     <hostname>: [
#         {
#             "status": 0/1/2,
#             "message": "..."
#         },
#         ...
#     ],
#     ...
# }
def process(mAgents, bAgents, jobs, managed):
    # Lookup tables: management agent -> backup agent, backup agent -> enabled
    # jobs, backup agent -> managed-computer entry.
    mToB = {agent["managementAgentUid"]: agent for agent in bAgents}

    bToJ = defaultdict(list)
    for job in jobs:
        if job["isEnabled"]:
            bToJ[job["backupAgentUid"]].append(job)

    bToM = {m["backupAgentUid"]: m for m in managed}

    results = defaultdict(list)
    for mAgent in mAgents:
        host = mAgent["tag"] or mAgent["hostName"]
        mAgentId = mAgent["instanceUid"]

        mStatus = mAgent["status"]
        if mStatus != "Healthy":
            results[host].append({
                "status":  WARN,
                "message": f"Managment agent {mAgentId} is: {mStatus}."
            })
            continue

        bAgent = mToB.get(mAgentId)
        if not bAgent:
            results[host].append({
                "status":  WARN,
                "message": "Host appears to have no backup agent."
            })
            continue

        bAgentId = bAgent["instanceUid"]

        bStatus = bAgent["status"]
        if bStatus != "Active":
            results[host].append({
                "status":  WARN,
                "message": f"Backup agent {bAgentId} is {bStatus}."
            })
            continue

        if bAgent["totalJobsCount"] == 0:
            results[host].append({
                "status":  WARN,
                "message": f"Backup agent {bAgentId} has no jobs."
            })
            continue

        mEntry = bToM.get(bAgentId)

        for job in bToJ.get(bAgentId, []):
            jobId = job["instanceUid"]
            sched = job["scheduleType"]

            # None means the job has never finished a run.
            daysSinceLastRun = _days_since(job["lastEndTime"])

            if sched == "NotScheduled":
                results[host].append({
                    "status":  WARN,
                    "message": f"Backup agent {bAgentId} job {jobId} is not scheduled."
                })
                continue

            failureMessage = job["failureMessage"]
            if failureMessage:
                if daysSinceLastRun is None:
                    # A failure without a recorded end time: we can't tell how
                    # old it is, so report it without an age.
                    results[host].append({
                        "status":  WARN,
                        "message": f"Backup agent {bAgentId} job {jobId} failed: {failureMessage}"
                    })
                else:
                    # We use 2.2 here to give wiggle room for jobs to complete
                    # if they take longer than expected.
                    results[host].append({
                        "status":  CRIT if daysSinceLastRun > 2.2 else WARN,
                        "message": f"Backup agent {bAgentId} job {jobId} failed {daysSinceLastRun:.1f} days ago: {failureMessage}"
                    })
                continue

            if sched != "Daily":
                results[host].append({
                    "status":  WARN,
                    "message": f"Backup agent {bAgentId} job {jobId} has scheduleType unknown to CheckMK plugin: {sched}."
                })
                continue

            # Compare against None explicitly: an age of exactly 0.0 days is a
            # perfectly valid (very recent) run.
            if daysSinceLastRun is None:
                results[host].append({
                    "status":  OK,
                    "message": f"Backup agent {bAgentId} is healthy; no backups yet."
                })
                continue

            runStatus = _status_for_age(daysSinceLastRun)
            if runStatus == OK:
                runMessage = f"Backup agent {bAgentId} job {jobId} is healthy; last backup ran {daysSinceLastRun:.1f} days ago."
            else:
                runMessage = f"Backup agent {bAgentId} job {jobId} is late! Last backup ran {daysSinceLastRun:.1f} days ago."
            results[host].append({ "status": runStatus, "message": runMessage })

            # The last run tells us nothing about success; the latest restore
            # point (if we have one) does.
            if not mEntry:
                continue

            daysSinceLastSuccess = _days_since(mEntry["latestRestorePointDate"])
            if daysSinceLastSuccess is None:
                continue

            successStatus = _status_for_age(daysSinceLastSuccess)
            if successStatus == OK:
                successMessage = f"Job {jobId} last successfully ran {daysSinceLastSuccess:.1f} days ago."
            else:
                successMessage = f"Job {jobId} last SUCCESSFULLY ran {daysSinceLastSuccess:.1f} days ago!"
            results[host].append({ "status": successStatus, "message": successMessage })

    return results


# Print a canned example of the plugin's output: one four-line block per host
# (host header, "<<<local>>>", the check line, "<<<>>>"), in the same layout
# that print_out() produces.
def print_demo():
    # (host, check line) pairs, in the order they are printed.
    samples = (
        (
            "newveeam",
            '1 "Veeam Backup" - Host appears to have no backup agent.',
        ),
        (
            "retrip-nova",
            '1 "Veeam Backup" - Managment agent e4ade74b-4c5d-4204-a35c-68ccf2c73d16 is: Inaccessible.',
        ),
        (
            "guacamole.foo.bar",
            '0 "Veeam Backup" - Backup agent c1e3d991-1fc7-409c-b9ee-8e25abeb1774 job d7fd4b02-a80c-6e9b-a046-75c8031768a8 is healthy; last backup ran 0.9 days ago. | Job d7fd4b02-a80c-6e9b-a046-75c8031768a8 last successfully ran 0.0 days ago. | Backup agent c1e3d991-1fc7-409c-b9ee-8e25abeb1774 job 42bba0eb-0d7c-66e7-aa6e-fb2fcfb63f67 is healthy; last backup ran 0.0 days ago. | Job 42bba0eb-0d7c-66e7-aa6e-fb2fcfb63f67 last successfully ran 0.0 days ago.',
        ),
        (
            "nc.foo.bar",
            '0 "Veeam Backup" - Backup agent 3279b5ec-e65e-cd44-b749-a8e2ee0b634d job beebdcd2-2624-60dc-ae88-bb105ac75d3d is healthy; last backup ran 0.8 days ago. | Job beebdcd2-2624-60dc-ae88-bb105ac75d3d last successfully ran 0.8 days ago.',
        ),
        (
            "Catalina",
            '2 "Veeam Backup" - Backup agent 795349ba-038a-5580-b4c2-5ab361d41f8f job 63270e37-9ff4-6491-b062-b95a09af82b7 failed 2.1 days ago: Failed to start a backup job. server=oatpp/1.2.5 / code=401 / description=Unauthorized / message=Unauthorized /',
        ),
    )

    blocks = []
    for host, check_line in samples:
        blocks.append(f"<<<<{host}>>>>\n<<<local>>>\n{check_line}\n<<<>>>")

    # One print for the whole text, so only a single trailing newline is added.
    print("\n".join(blocks))


# Print out all our results in a format that CheckMK understands: per host a
# "<<<<host>>>>" header, a "<<<local>>>" line, one check line, and "<<<>>>".
#
# The check line carries the worst (highest) status among the host's rows,
# never lower than OK, followed by all row messages joined with " | ".
# Newlines inside a message are turned into " / " to keep it on one line.
#
# NOTE(review): CheckMK's piggyback terminator is, as far as I know, written
# with four brackets ("<<<<>>>>"); we print three -- confirm this is intended.
def print_out(results):
    for host in results:
        print(f"<<<<{host}>>>>")
        print("<<<local>>>")

        rows = results[host]
        worst = max([OK, *(row["status"] for row in rows)])
        details = " | ".join(row["message"].replace("\n", " / ") for row in rows)

        print(f'{worst} "Veeam Backup" - {details}')
        print("<<<>>>")

# Entry point: fetch all management agents, backup agents, backup jobs and
# managed computers from the VSPC given on the command line, evaluate them,
# and print the results. With --demo we only print the example output and
# contact nothing.
#
# The argv parameter is accepted but not used; arguments are always read by
# parse_arguments().
def main(argv=None):
    args = parse_arguments()

    if args.demo:
        return print_demo()

    # All four queries share the same target and credentials; only the API
    # path differs.
    def fetch(path):
        return get_paginated_json_url(
            args.hostname, args.port, path, args.token, args.insecure
        )

    mAgents = fetch('/api/v3/infrastructure/managementAgents')
    bAgents = fetch('/api/v3/infrastructure/backupAgents')
    jobs    = fetch('/api/v3/infrastructure/backupAgents/jobs')
    managed = fetch('/api/v3/protectedWorkloads/computersManagedByConsole')

    print_out(process(mAgents, bAgents, jobs, managed))


# Run the check only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
