Reduced the connect() timeout to speed up the scan, and percolated NAPI connection errors up to CheckMK's GUI.
This commit is contained in:
parent
ebe9b389ce
commit
ffafe3c4f6
@ -28,8 +28,12 @@ def check_triton_wedge(item, params, section):
|
||||
vms = section
|
||||
wedged_vms = []
|
||||
|
||||
if len(vms) == 1 and vms[0].get("error"):
|
||||
yield Result(state=State.UNKNOWN, summary=vms[0]["error"])
|
||||
return
|
||||
|
||||
for vm in vms:
|
||||
if vm["wedged"]:
|
||||
if vm["wedged"] == "probably":
|
||||
wedged_vms.append(vm)
|
||||
|
||||
if len(wedged_vms) == 0:
|
||||
|
@ -11,13 +11,14 @@ PORT_RANGE_START = 50000
|
||||
PORT_RANGE_END = 65504
|
||||
# How often and for how long to attempt to connect to a VM
|
||||
CONNECT_RETRIES = 3
|
||||
CONNECT_TIMEOUT = 1 # seconds
|
||||
CONNECT_TIMEOUT = 0.5 # seconds
|
||||
# Remote ports, in order, to attempt to connect to. More ports means higher
|
||||
# chance of being able to test for a wedge, but also takes more time.
|
||||
CHECK_REMOTE_PORTS = [443, 80]
|
||||
# How many VMs we'll be portmapping concurrently.
|
||||
CONCURRENT_SCANS = 200
|
||||
NAPI_TIMEOUT = 10 # seconds
|
||||
AGENT_NAME = "triton_wedge"
|
||||
|
||||
|
||||
# This is a hackish lookup to quickly convert CN UUIDs to host names.
|
||||
@ -81,8 +82,8 @@ def query_napi(addr):
|
||||
json_data = get_url(url, NAPI_TIMEOUT)
|
||||
nics = json.loads(json_data)
|
||||
return nics
|
||||
except urllib.error.HTTPError as e:
|
||||
sys.stderr.write("NAPI error: %s\n" % e)
|
||||
except (urllib.error.HTTPError, urllib.error.URLError):
|
||||
return None
|
||||
|
||||
|
||||
# asyncio provides some nice connection methods, but none of them allow us to
|
||||
@ -148,7 +149,6 @@ def calculate_local_port_range():
|
||||
# 1. Find an open remote port (to speed things up we check ports 443 and 80)
|
||||
# 2. Repeatedly connect() to the remote port while incrementing our local port
|
||||
# 3. If we find a local port that fails to connect, this may be a wedge
|
||||
#
|
||||
async def check_for_wedge(nic, semaphore):
|
||||
local_ip = "0.0.0.0"
|
||||
remote_ip = nic["ip"]
|
||||
@ -165,7 +165,7 @@ async def check_for_wedge(nic, semaphore):
|
||||
"cn": cn,
|
||||
"vm": nic["belongs_to_uuid"],
|
||||
"ip": nic["ip"],
|
||||
"wedged": False
|
||||
"wedged": "no"
|
||||
}
|
||||
|
||||
async with semaphore:
|
||||
@ -185,13 +185,13 @@ async def check_for_wedge(nic, semaphore):
|
||||
connected = await async_connect(src, dest)
|
||||
except OSError as e:
|
||||
if e.errno == errno.EADDRINUSE:
|
||||
result["wedged"] = None
|
||||
result["wedged"] = "temporary local port collision"
|
||||
return result
|
||||
else:
|
||||
raise
|
||||
|
||||
if can_connect and not connected:
|
||||
result["wedged"] = True
|
||||
result["wedged"] = "probably"
|
||||
return result
|
||||
elif connected:
|
||||
can_connect = True
|
||||
@ -208,9 +208,20 @@ async def scan(nics):
|
||||
return map(lambda f: f.result(), done)
|
||||
|
||||
|
||||
# This is only used when we cannot contact NAPI. Ops wants to know when this
|
||||
# happens, but if we cannot contact NAPI then we also don't know about any CNs.
|
||||
# We use information in HOSTNAME_LOOKUP to still be able to report about
|
||||
# a lack of contact to NAPI, even when we cannot contact NAPI.
|
||||
def print_out_napi_err():
    """Write a "Cannot contact NAPI" error row for every known host.

    For each host name stored as a value in HOSTNAME_LOOKUP, three lines
    are written to stdout: a ``<<<<host>>>>`` header, the
    ``<<<AGENT_NAME:sep(0)>>>`` section header, and one JSON row holding
    the error message. Nothing is returned.
    """
    for hostname in HOSTNAME_LOOKUP.values():
        # One batch of lines per host; each host gets its own headers.
        sys.stdout.writelines((
            f"<<<<{hostname}>>>>\n",
            f"<<<{AGENT_NAME}:sep(0)>>>\n",
            '{"error": "Cannot contact NAPI"}\n',
        ))
|
||||
|
||||
|
||||
# Print out all our results in a format that CheckMK understands. Most of our
|
||||
# output are in JSON rows.
|
||||
def print_out(scan_results, agent_name):
|
||||
def print_out(scan_results):
|
||||
scan_results = list(scan_results)
|
||||
scan_results.sort(key=lambda d: d["cn"] + d["vm"] + d["ip"])
|
||||
|
||||
@ -222,7 +233,7 @@ def print_out(scan_results, agent_name):
|
||||
if curr_cn != last_cn:
|
||||
last_cn = curr_cn
|
||||
sys.stdout.write(f"<<<<{curr_cn}>>>>\n")
|
||||
sys.stdout.write(f"<<<{agent_name}:sep(0)>>>\n")
|
||||
sys.stdout.write(f"<<<{AGENT_NAME}:sep(0)>>>\n")
|
||||
|
||||
sys.stdout.write("%s\n" % json.dumps({
|
||||
"vm": entry["vm"],
|
||||
@ -249,13 +260,13 @@ def main(argv=None):
|
||||
args = parse_arguments(argv)
|
||||
nics = query_napi(args.hostname)
|
||||
|
||||
# Sort the IPs so that we tend to scan them in relative order. This is to
|
||||
# increase the time between scans to the same IP due to consecutive agent
|
||||
# executions, otherwise there's a higher chance we bump into TIME_WAIT.
|
||||
#nics.sort(key=lambda d: d["ip"])
|
||||
if nics == None:
|
||||
print_out_napi_err()
|
||||
# We must return 0, or CheckMK won't check the output we're returning
|
||||
sys.exit(0)
|
||||
|
||||
scan_results = asyncio.run(scan(nics))
|
||||
print_out(scan_results, "triton_wedge")
|
||||
print_out(scan_results)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
Binary file not shown.
BIN
wedge/triton_wedge-0.3.1.mkp
Executable file
BIN
wedge/triton_wedge-0.3.1.mkp
Executable file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user