diff --git a/wedge/local/lib/python3/cmk_addons/plugins/triton_wedge/agent_based/triton_wedge.py b/wedge/local/lib/python3/cmk_addons/plugins/triton_wedge/agent_based/triton_wedge.py index 74c9c49..d838a35 100644 --- a/wedge/local/lib/python3/cmk_addons/plugins/triton_wedge/agent_based/triton_wedge.py +++ b/wedge/local/lib/python3/cmk_addons/plugins/triton_wedge/agent_based/triton_wedge.py @@ -7,15 +7,10 @@ from cmk.agent_based.v2 import Result, Service, State, CheckPlugin, AgentSection def parse_triton_wedge(string_table): - lookup = {} - + vms = [] for row in string_table: - nic = json.loads(row[0]) - cn_name = nic["cn"] - vms_in_cn = lookup.setdefault(cn_name, []) - vms_in_cn.append(nic) - - return lookup + vms.append(json.loads(row[0])) + return vms agent_section_triton_wedge = AgentSection( @@ -25,18 +20,12 @@ agent_section_triton_wedge = AgentSection( def discover_triton_wedge(section): - for cn_name, vms in sorted(section.items()): - yield Service(item=cn_name, parameters={"name": cn_name}) + if section: + yield Service(item="1") def check_triton_wedge(item, params, section): - cn_name = params["name"] - vms = section.get(cn_name) - - if vms is None: - yield Result(state=State.WARN, summary="Not appearing in NAPI") - return - + vms = section wedged_vms = [] for vm in vms: @@ -44,7 +33,8 @@ def check_triton_wedge(item, params, section): wedged_vms.append(vm) if len(wedged_vms) == 0: - yield Result(state=State.OK, summary="No wedge detected") + summary = f"No wedge detected ({len(vms)} VM external NIC(s) checked)" + yield Result(state=State.OK, summary=summary) elif len(wedged_vms) == 1: vm = wedged_vms[0] summary = "Potential wedge detected for VM %s (%s)" % (vm["vm"], vm["ip"]) @@ -56,7 +46,7 @@ def check_triton_wedge(item, params, section): check_plugin_triton_wedge = CheckPlugin( name="triton_wedge", - service_name="Triton Wedge CN %s", + service_name="Triton Wedge Detector (%s)", discovery_function=discover_triton_wedge, check_function=check_triton_wedge, 
check_default_parameters={}, diff --git a/wedge/local/lib/python3/cmk_addons/plugins/triton_wedge/libexec/agent_triton_wedge b/wedge/local/lib/python3/cmk_addons/plugins/triton_wedge/libexec/agent_triton_wedge index 5da84b6..371b799 100755 --- a/wedge/local/lib/python3/cmk_addons/plugins/triton_wedge/libexec/agent_triton_wedge +++ b/wedge/local/lib/python3/cmk_addons/plugins/triton_wedge/libexec/agent_triton_wedge @@ -2,16 +2,69 @@ # The range of ephemeral local ports we use when attempting to probe remote # IPs. In the past, wedged ports appeared with a stride of 8; to be safe, we use -# a stride of 128. -PORT_RANGE_START = 57000 -PORT_RANGE_END = 57128 +# a stride of 16. So we select a subrange of 16 ports somewhere within the +# PORT_RANGE_START/END to scan; this reduces the chance of exhausted ports when +# a human uses this tool (we have a tendency to run commands faster than +# the ephemeral port timeout). +PORT_SUBRANGE_SIZE = 16 +PORT_RANGE_START = 50000 +PORT_RANGE_END = 65504 +# How often and for how long to attempt to connect to a VM CONNECT_RETRIES = 3 +CONNECT_TIMEOUT = 1 # seconds +# Remote ports, in order, to attempt to connect to. More ports means higher +# chance of being able to test for a wedge, but also takes more time. CHECK_REMOTE_PORTS = [443, 80] +# How many VMs we'll be portmapping concurrently. CONCURRENT_SCANS = 200 NAPI_TIMEOUT = 10 # seconds +# This is a hackish lookup to quickly convert CN UUIDs to host names. +# Ops wants host names, and wants them quick. Adding the lookup here is +# the fastest way to do it, although ideally it'd be put in a rule instead. 
+HOSTNAME_LOOKUP = { + "00000000-0000-0000-0000-ac1f6b41905a": "ac-1f-6b-27-81-40", + "44454c4c-3000-104a-804a-b3c04f465632": "e4-43-4b-b7-ad-a4", + "44454c4c-3000-104b-8039-b3c04f465632": "e4-43-4b-b7-ad-e0", + "44454c4c-3000-1056-8044-b4c04f465632": "e4-43-4b-b7-b0-38", + "44454c4c-3300-1051-8030-b4c04f525032": "e4-43-4b-bd-94-4c", + "44454c4c-3600-1038-804b-b2c04f445a32": "e4-43-4b-86-30-30", + "44454c4c-4200-1031-8033-c4c04f594d32": "24-6e-96-5e-a9-c8", + "44454c4c-4400-1053-8054-b4c04f474c32": "80-18-44-e5-20-38", + "44454c4c-4400-1053-8058-b4c04f474c32": "80-18-44-e5-1f-b4", + "44454c4c-4400-1054-8030-b4c04f484c32": "80-18-44-e5-35-80", + "44454c4c-4400-1054-8052-b4c04f474c32": "80-18-44-e5-24-bc", + "44454c4c-4400-1058-8042-c3c04f513033": "24-6e-96-5e-b3-9c", + "44454c4c-4400-1059-8042-c3c04f513033": "24-6e-96-63-f7-9c", + "44454c4c-4400-105a-8037-c3c04f513033": "24-6e-96-2e-fa-54", + "44454c4c-4600-1030-8057-c2c04f485032": "24-6e-96-0d-9c-98", + "44454c4c-4800-1038-8048-b2c04f435a32": "e4-43-4b-86-72-c8", + "44454c4c-4800-1038-8048-b4c04f435a32": "e4-43-4b-86-72-d0", + "44454c4c-4800-1038-8048-b5c04f435a32": "e4-43-4b-86-6c-8c", + "44454c4c-4800-1038-8048-b6c04f435a32": "e4-43-4b-86-6c-08", + "44454c4c-4800-1038-8048-b7c04f435a32": "e4-43-4b-86-73-18", + "44454c4c-4800-1038-8048-b8c04f435a32": "e4-43-4b-86-72-ec", + "44454c4c-4800-1038-8048-b9c04f435a32": "e4-43-4b-86-73-00", + "44454c4c-4800-1038-8048-c2c04f435a32": "e4-43-4b-86-73-04", + "44454c4c-4a00-1048-8033-b7c04f513033": "24-6e-96-2f-22-28", + "44454c4c-4b00-1056-8057-83965dec755b": "80-18-44-e5-d2-58-backup", + "44454c4c-4b00-1056-8057-b7c04f314c32": "headnode", + "44454c4c-4c00-104a-805a-b7c04f314c32": "80-18-44-e5-cf-84", + "44454c4c-4c00-104b-8058-b7c04f314c32": "80-18-44-e5-d2-50", + "44454c4c-4c00-104c-8051-b7c04f314c32": "80-18-44-e5-cf-4c", + "44454c4c-4c00-104c-8057-b7c04f314c32": "80-18-44-e5-ce-24", + "44454c4c-4c00-104d-8051-b7c04f314c32": "80-18-44-e5-ce-6c", + 
"44454c4c-4c00-104d-8058-b7c04f314c32": "80-18-44-e5-d2-6c", + "44454c4c-4c00-104e-8052-b7c04f314c32": "80-18-44-e5-d0-8c", + "44454c4c-4c00-104e-8056-b7c04f314c32": "80-18-44-e5-d0-1c", + "44454c4c-4c00-104e-8058-b7c04f314c32": "80-18-44-e5-cd-1c", + "44454c4c-5000-1053-8036-c3c04f445032": "24-6e-96-39-6c-5c", +} + + import urllib.request, sys, argparse, asyncio, json, socket, errno +import random def get_url(url, timeout=None): @@ -50,7 +103,7 @@ async def async_connect(src, dest): for attempt in range(CONNECT_RETRIES): try: future = loop.sock_connect(sd, dest) - await asyncio.wait_for(future, timeout=0.1) + await asyncio.wait_for(future, timeout=CONNECT_TIMEOUT) connected = True break except ConnectionRefusedError: @@ -74,6 +127,20 @@ async def async_connect(src, dest): return connected +# Return the start (inclusive) and end (exclusive) of the ephemeral local port +# subrange which is used to connect to a remote server. +def calculate_local_port_range(): + # Pick one subrange, aligned to the subrange size so subranges never overlap; + # randrange() excludes num_ranges, so we never go past PORT_RANGE_END. + num_ports = PORT_RANGE_END - PORT_RANGE_START + num_ranges = num_ports // PORT_SUBRANGE_SIZE + + range_start = random.randrange(num_ranges) * PORT_SUBRANGE_SIZE + PORT_RANGE_START + range_end = range_start + PORT_SUBRANGE_SIZE + + return range_start, range_end + + # Check for a wedge on a NIC.
We detect a wedge by doing the following: # # Us (local IP, local port) -----> Them (remote IP, remote port) @@ -87,24 +154,41 @@ async def check_for_wedge(nic, semaphore): remote_ip = nic["ip"] can_connect = False + + cn = nic["cn_uuid"] + # convert server UUID to hostname if we know the hostname + cn_hostname = HOSTNAME_LOOKUP.get(cn) + if cn_hostname: + cn = cn_hostname + result = { - "cn": nic["cn_uuid"], + "cn": cn, "vm": nic["belongs_to_uuid"], "ip": nic["ip"], "wedged": False } async with semaphore: + local_start_port, local_end_port = calculate_local_port_range() + # To speed things up, we only check ports 443 and 80, which are the # most common ports on the Internet. for remote_port in CHECK_REMOTE_PORTS: if can_connect: break - for local_port in range(PORT_RANGE_START, PORT_RANGE_END): + for local_port in range(local_start_port, local_end_port): src = (local_ip, local_port) dest = (remote_ip, remote_port) - connected = await async_connect(src, dest) + + try: + connected = await async_connect(src, dest) + except OSError as e: + if e.errno == errno.EADDRINUSE: + result["wedged"] = None + return result + else: + raise if can_connect and not connected: result["wedged"] = True @@ -127,11 +211,24 @@ async def scan(nics): # Print out all our results in a format that CheckMK understands. Most of our # output are in JSON rows. def print_out(scan_results, agent_name): - sys.stdout.write(f"<<<{agent_name}:sep(0)>>>\n") scan_results = list(scan_results) scan_results.sort(key=lambda d: d["cn"] + d["vm"] + d["ip"]) + + last_cn = None + for entry in scan_results: - sys.stdout.write("%s\n" % json.dumps(entry)) + curr_cn = entry["cn"] + + if curr_cn != last_cn: + last_cn = curr_cn + sys.stdout.write(f"<<<<{curr_cn}>>>>\n") + sys.stdout.write(f"<<<{agent_name}:sep(0)>>>\n") + + sys.stdout.write("%s\n" % json.dumps({ + "vm": entry["vm"], + "ip": entry["ip"], + "wedged": entry["wedged"], + })) # Parse the command-line arguments, specifically for hostname. 
Print out help @@ -163,4 +260,3 @@ def main(argv=None): if __name__ == "__main__": sys.exit(main()) - diff --git a/wedge/triton_wedge-0.2.0.mkp b/wedge/triton_wedge-0.2.0.mkp deleted file mode 100755 index 9ecd718..0000000 Binary files a/wedge/triton_wedge-0.2.0.mkp and /dev/null differ diff --git a/wedge/triton_wedge-0.3.0.mkp b/wedge/triton_wedge-0.3.0.mkp new file mode 100755 index 0000000..d6b10e2 Binary files /dev/null and b/wedge/triton_wedge-0.3.0.mkp differ