diff --git a/check_mk-wedge/local/lib/check_mk/base/plugins/agent_based/triton_wedge.py b/check_mk-wedge/local/lib/check_mk/base/plugins/agent_based/triton_wedge.py new file mode 100644 index 0000000..d5c966e --- /dev/null +++ b/check_mk-wedge/local/lib/check_mk/base/plugins/agent_based/triton_wedge.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +# +# Parses and checks external VM IPs. + +import json +from cmk.base.plugins.agent_based.agent_based_api.v1 import register, Result, Service, State + + +def parse_triton_wedge(string_table): + lookup = {} + + for row in string_table: + nic = json.loads(row[0]) + cn_name = nic["cn"] + vms_in_cn = lookup.setdefault(cn_name, []) + vms_in_cn.append(nic) + + return lookup + + +register.agent_section( + name="triton_wedge", + parse_function=parse_triton_wedge +) + + +def discover_triton_wedge(section): + for cn_name, vms in sorted(section.items()): + yield Service(item=cn_name, parameters={"name": cn_name}) + + +def check_triton_wedge(item, params, section): + cn_name = params["name"] + vms = section.get(cn_name) + + if vms is None: + yield Result(state=State.WARN, summary="Not appearing in NAPI") + return + + wedged_vms = [] + + for vm in vms: + if vm["wedged"]: + wedged_vms.append(vm) + + if len(wedged_vms) == 0: + yield Result(state=State.OK, summary="No wedge detected") + elif len(wedged_vms) == 1: + vm = wedged_vms[0] + summary = "Potential wedge detected for VM %s (%s)" % (vm["vm"], vm["ip"]) + yield Result(state=State.WARN, summary=summary) + else: + lst = ", ".join(map(lambda vm: "VM %s (%s)" % (vm["vm"], vm["ip"]), wedged_vms)) + yield Result(state=State.CRIT, summary=f"Likely wedged detected for {lst}") + + +register.check_plugin( + name="triton_wedge", + service_name="Triton Wedge CN %s", + discovery_function=discover_triton_wedge, + check_function=check_triton_wedge, + check_default_parameters={}, + check_ruleset_name="triton_wedge", +) diff --git a/check_mk-wedge/local/share/check_mk/agents/special/agent_triton_wedge b/check_mk-wedge/local/share/check_mk/agents/special/agent_triton_wedge new file mode 100755 index 0000000..3d7cbb2 --- /dev/null +++ b/check_mk-wedge/local/share/check_mk/agents/special/agent_triton_wedge @@ -0,0 +1,165 @@ +#!/usr/bin/env python3 + +# The range of ephemeral local ports we use when attempting to probe remote +# IPs. In the past, wedged ports appeared with a stride of 8; to be safe, we use +# a stride of 128. +PORT_RANGE_START = 57000 +PORT_RANGE_END = 57128 +CONNECT_RETRIES = 3 +CHECK_REMOTE_PORTS = [443, 80] +CONCURRENT_SCANS = 200 + + +import urllib.request, sys, argparse, asyncio, json, socket, errno + + +def get_url(url): + request = urllib.request.Request(url) + with urllib.request.urlopen(request) as conn: + data = conn.read() + return data + + +# Fetch and parse details about active zone NICs on the external network. +def query_napi(addr): + url = 'http://%s/nics?nic_tag=external&belongs_to_type=zone&state=running' % addr + try: + json_data = get_url(url) + nics = json.loads(json_data) + return nics + except urllib.error.HTTPError as e: + sys.stderr.write("NAPI error: %s\n" % e) + + +# asyncio provides some nice connection methods, but none of them allow us to +# use SO_REUSEPORT. This flag is critical since we're repeatedly using the +# same range of local ports to port map remote IPs. So we have to resort to +# this low-level socket hackery to enable SO_REUSEPORT. +async def async_connect(src, dest): + loop = asyncio.get_event_loop() + + sd = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sd.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1) + sd.bind(src) + sd.setblocking(False) + + connected = False + + # We try to connect() several times, in case a packet got lost. + for attempt in range(CONNECT_RETRIES): + try: + future = loop.sock_connect(sd, dest) + await asyncio.wait_for(future, timeout=0.1) + connected = True + break + except ConnectionRefusedError: + # ECONNREFUSED (we received a RST after sending an ACK). If we + # receive this there's no point retrying. + break + except (TimeoutError, asyncio.TimeoutError): + # Usually you'd wait for the TCP stack to make its own retries, but + # we know our target IPs are in a nearby rack, so we don't want to + # wait that long. Ergo we do our own trying, with a fast timeout. + # If we hit here, a packet might have been lost, so try again. + pass + except OSError as e: + if e.errno == errno.EHOSTUNREACH: + # If there is no route, no point retrying either. + break + else: + raise + + sd.close() + return connected + + +# Check for a wedge on a NIC. We detect a wedge by doing the following: +# +# Us (local IP, local port) -----> Them (remote IP, remote port) +# +# 1. Find an open remote port (to speed things up we check ports 443 and 80) +# 2. Repeatedly connect() to the remote port while incrementing our local port +# 3. If we find a local port that fails to connect, this may be a wedge +# +async def check_for_wedge(nic, semaphore): + local_ip = "0.0.0.0" + remote_ip = nic["ip"] + + can_connect = False + result = { + "cn": nic["cn_uuid"], + "vm": nic["belongs_to_uuid"], + "ip": nic["ip"], + "wedged": False + } + + async with semaphore: + # To speed things up, we only check ports 443 and 80, which are the + # most common ports on the Internet. + for remote_port in CHECK_REMOTE_PORTS: + if can_connect: + break + + for local_port in range(PORT_RANGE_START, PORT_RANGE_END): + src = (local_ip, local_port) + dest = (remote_ip, remote_port) + connected = await async_connect(src, dest) + + if can_connect and not connected: + result["wedged"] = True + return result + elif connected: + can_connect = True + + return result + + +# Given an array of nics, scan the ports on each nic's IP address, checking if +# any appear to be wedged. +async def scan(nics): + sem = asyncio.Semaphore(CONCURRENT_SCANS) + tasks = map(lambda nic: asyncio.create_task(check_for_wedge(nic, sem)), nics) + done, pending = await asyncio.wait(tasks) + return map(lambda f: f.result(), done) + + +# Print out all our results in a format that CheckMK understands. Most of our +# output are in JSON rows. +def print_out(scan_results, agent_name): + sys.stdout.write(f"<<<{agent_name}:sep(0)>>>\n") + scan_results = list(scan_results) + scan_results.sort(key=lambda d: d["cn"] + d["vm"] + d["ip"]) + for entry in scan_results: + sys.stdout.write("%s\n" % json.dumps(entry)) + + +# Parse the command-line arguments, specifically for hostname. Print out help +# to console if we get no args. +def parse_arguments(argv): + parser = argparse.ArgumentParser() + parser.add_argument( + "hostname", metavar="HOSTNAME", help="Hostname of NAPI to query." + ) + return parser.parse_args(argv) + + +# Parse args, contact NAPI, query external IPs for VMs, and then print results +def main(argv=None): + if argv is None: + argv = sys.argv[1:] + + args = parse_arguments(argv) + nics = query_napi(args.hostname) + + # Sort the IPs so that (tend) to scan them in relative order. This is to + # increase the time between scans to the same IP due to consecutive agent + # executions, otherwise there's a higher chance we bump into TIME_WAIT. + #nics.sort(key=lambda d: d["ip"]) + + scan_results = asyncio.run(scan(nics)) + print_out(scan_results, "triton_wedge") + + +if __name__ == "__main__": + sys.exit(main()) + diff --git a/check_mk-wedge/local/share/check_mk/checks/agent_triton_wedge b/check_mk-wedge/local/share/check_mk/checks/agent_triton_wedge new file mode 100644 index 0000000..2cfe785 --- /dev/null +++ b/check_mk-wedge/local/share/check_mk/checks/agent_triton_wedge @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 + + +def agent_triton_wedge(params, hostname, ipaddress): + return [params["instance"]] + +special_agent_info["triton_wedge"] = agent_triton_wedge diff --git a/check_mk-wedge/local/share/check_mk/web/plugins/wato/triton_wedge.py b/check_mk-wedge/local/share/check_mk/web/plugins/wato/triton_wedge.py new file mode 100644 index 0000000..f734e27 --- /dev/null +++ b/check_mk-wedge/local/share/check_mk/web/plugins/wato/triton_wedge.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +# +# GUI config page for triton_wedge. + +from cmk.gui.i18n import _ +from cmk.gui.plugins.wato.utils import ( + rulespec_registry, + HostRulespec, + RulespecGroupCheckParametersHardware +) +from cmk.gui.watolib.rulespecs import Rulespec +from cmk.gui.valuespec import ( + Dictionary, + Hostname, +) + + +def _valuespec_special_agents_triton_wedge(): + return Dictionary( + title=_("Triton Wedge Detection"), + help=_(""), + elements=[ + ( + "instance", + Hostname( + title=_("Hostname"), + help=_("Hostname or IP of NAPI to query"), + allow_empty=False, + ), + ), + ], + ) + + +rulespec_registry.register( + HostRulespec( + factory_default=Rulespec.FACTORY_DEFAULT_UNUSED, + name="special_agents:triton_wedge", + group=RulespecGroupCheckParametersHardware, + valuespec=_valuespec_special_agents_triton_wedge, + ) +) diff --git a/check_mk-wedge/triton_wedge-0.1.0.mkp b/check_mk-wedge/triton_wedge-0.1.0.mkp new file mode 100755 index 0000000..e6bd128 Binary files /dev/null and b/check_mk-wedge/triton_wedge-0.1.0.mkp differ