diff --git a/wedge/local/lib/python3/cmk_addons/plugins/triton_wedge/agent_based/triton_wedge.py b/wedge/local/lib/python3/cmk_addons/plugins/triton_wedge/agent_based/triton_wedge.py index d838a35..0a2c593 100644 --- a/wedge/local/lib/python3/cmk_addons/plugins/triton_wedge/agent_based/triton_wedge.py +++ b/wedge/local/lib/python3/cmk_addons/plugins/triton_wedge/agent_based/triton_wedge.py @@ -28,8 +28,12 @@ def check_triton_wedge(item, params, section): vms = section wedged_vms = [] + if len(vms) == 1 and vms[0].get("error"): + yield Result(state=State.UNKNOWN, summary=vms[0]["error"]) + return + for vm in vms: - if vm["wedged"]: + if vm["wedged"] == "probably": wedged_vms.append(vm) if len(wedged_vms) == 0: diff --git a/wedge/local/lib/python3/cmk_addons/plugins/triton_wedge/libexec/agent_triton_wedge b/wedge/local/lib/python3/cmk_addons/plugins/triton_wedge/libexec/agent_triton_wedge index 371b799..0c1a7b9 100755 --- a/wedge/local/lib/python3/cmk_addons/plugins/triton_wedge/libexec/agent_triton_wedge +++ b/wedge/local/lib/python3/cmk_addons/plugins/triton_wedge/libexec/agent_triton_wedge @@ -11,13 +11,14 @@ PORT_RANGE_START = 50000 PORT_RANGE_END = 65504 # How often and for how long to attempt to connect to a VM CONNECT_RETRIES = 3 -CONNECT_TIMEOUT = 1 # seconds +CONNECT_TIMEOUT = 0.5 # seconds # Remote ports, in order, to attempt to connect to. More ports means higher # chance of being able to test for a wedge, but also takes more time. CHECK_REMOTE_PORTS = [443, 80] # How many VMs we'll be portmapping concurrently. CONCURRENT_SCANS = 200 NAPI_TIMEOUT = 10 # seconds +AGENT_NAME = "triton_wedge" # This is a hackish lookup to quickly convert CN UUIDs to host names. @@ -81,8 +82,8 @@ def query_napi(addr): json_data = get_url(url, NAPI_TIMEOUT) nics = json.loads(json_data) return nics - except urllib.error.HTTPError as e: - sys.stderr.write("NAPI error: %s\n" % e) + except (urllib.error.HTTPError, urllib.error.URLError): + return None # asyncio provides some nice connection methods, but none of them allow us to @@ -148,7 +149,6 @@ def calculate_local_port_range(): # 1. Find an open remote port (to speed things up we check ports 443 and 80) # 2. Repeatedly connect() to the remote port while incrementing our local port # 3. If we find a local port that fails to connect, this may be a wedge -# async def check_for_wedge(nic, semaphore): local_ip = "0.0.0.0" remote_ip = nic["ip"] @@ -165,7 +165,7 @@ async def check_for_wedge(nic, semaphore): "cn": cn, "vm": nic["belongs_to_uuid"], "ip": nic["ip"], - "wedged": False + "wedged": "no" } async with semaphore: @@ -185,13 +185,13 @@ async def check_for_wedge(nic, semaphore): connected = await async_connect(src, dest) except OSError as e: if e.errno == errno.EADDRINUSE: - result["wedged"] = None + result["wedged"] = "temporary local port collision" return result else: raise if can_connect and not connected: - result["wedged"] = True + result["wedged"] = "probably" return result elif connected: can_connect = True @@ -208,9 +208,20 @@ async def scan(nics): return map(lambda f: f.result(), done) +# This is only used when we cannot contact NAPI. Ops wants to know when this +# happens, but if we cannot contact NAPI then we also don't know about any CNs. +# We use information in HOSTNAME_LOOKUP to still be able to report about +# a lack of contact to NAPI, even when we cannot contact NAPI. +def print_out_napi_err(): + for host in HOSTNAME_LOOKUP.values(): + sys.stdout.write(f"<<<<{host}>>>>\n") + sys.stdout.write(f"<<<{AGENT_NAME}:sep(0)>>>\n") + sys.stdout.write('{"error": "Cannot contact NAPI"}\n') + + # Print out all our results in a format that CheckMK understands. Most of our # output are in JSON rows. -def print_out(scan_results, agent_name): +def print_out(scan_results): scan_results = list(scan_results) scan_results.sort(key=lambda d: d["cn"] + d["vm"] + d["ip"]) @@ -222,7 +233,7 @@ def print_out(scan_results, agent_name): if curr_cn != last_cn: last_cn = curr_cn sys.stdout.write(f"<<<<{curr_cn}>>>>\n") - sys.stdout.write(f"<<<{agent_name}:sep(0)>>>\n") + sys.stdout.write(f"<<<{AGENT_NAME}:sep(0)>>>\n") sys.stdout.write("%s\n" % json.dumps({ "vm": entry["vm"], @@ -249,13 +260,13 @@ def main(argv=None): args = parse_arguments(argv) nics = query_napi(args.hostname) - # Sort the IPs so that (tend) to scan them in relative order. This is to - # increase the time between scans to the same IP due to consecutive agent - # executions, otherwise there's a higher chance we bump into TIME_WAIT. - #nics.sort(key=lambda d: d["ip"]) + if nics == None: + print_out_napi_err() + # We must return 0, or CheckMK won't check the output we're returning + sys.exit(0) scan_results = asyncio.run(scan(nics)) - print_out(scan_results, "triton_wedge") + print_out(scan_results) if __name__ == "__main__": diff --git a/wedge/triton_wedge-0.3.0.mkp b/wedge/triton_wedge-0.3.0.mkp deleted file mode 100755 index d6b10e2..0000000 Binary files a/wedge/triton_wedge-0.3.0.mkp and /dev/null differ diff --git a/wedge/triton_wedge-0.3.1.mkp b/wedge/triton_wedge-0.3.1.mkp new file mode 100755 index 0000000..a9dbfb1 Binary files /dev/null and b/wedge/triton_wedge-0.3.1.mkp differ