Improvements as requested by George:
* Switch the wedge agent from reporting CN UUIDs to CN hostnames, and use CheckMK's piggyback mechanism to send wedge status to the correct Host in CheckMK. * Improve how local ports are selected during connect() attempts, so that conflicts on subsequent runs are (much) less likely while previously used ephemeral ports are still waiting to expire. * Increase the connect() timeout from 100ms to 1000ms, to cope better with potentially slow/overloaded VMs.
This commit is contained in:
parent
0358e8b2e8
commit
9e0e13a636
@ -7,15 +7,10 @@ from cmk.agent_based.v2 import Result, Service, State, CheckPlugin, AgentSection
|
|||||||
|
|
||||||
|
|
||||||
def parse_triton_wedge(string_table):
|
def parse_triton_wedge(string_table):
|
||||||
lookup = {}
|
vms = []
|
||||||
|
|
||||||
for row in string_table:
|
for row in string_table:
|
||||||
nic = json.loads(row[0])
|
vms.append(json.loads(row[0]))
|
||||||
cn_name = nic["cn"]
|
return vms
|
||||||
vms_in_cn = lookup.setdefault(cn_name, [])
|
|
||||||
vms_in_cn.append(nic)
|
|
||||||
|
|
||||||
return lookup
|
|
||||||
|
|
||||||
|
|
||||||
agent_section_triton_wedge = AgentSection(
|
agent_section_triton_wedge = AgentSection(
|
||||||
@ -25,18 +20,12 @@ agent_section_triton_wedge = AgentSection(
|
|||||||
|
|
||||||
|
|
||||||
def discover_triton_wedge(section):
|
def discover_triton_wedge(section):
|
||||||
for cn_name, vms in sorted(section.items()):
|
if section:
|
||||||
yield Service(item=cn_name, parameters={"name": cn_name})
|
yield Service(item="1")
|
||||||
|
|
||||||
|
|
||||||
def check_triton_wedge(item, params, section):
|
def check_triton_wedge(item, params, section):
|
||||||
cn_name = params["name"]
|
vms = section
|
||||||
vms = section.get(cn_name)
|
|
||||||
|
|
||||||
if vms is None:
|
|
||||||
yield Result(state=State.WARN, summary="Not appearing in NAPI")
|
|
||||||
return
|
|
||||||
|
|
||||||
wedged_vms = []
|
wedged_vms = []
|
||||||
|
|
||||||
for vm in vms:
|
for vm in vms:
|
||||||
@ -44,7 +33,8 @@ def check_triton_wedge(item, params, section):
|
|||||||
wedged_vms.append(vm)
|
wedged_vms.append(vm)
|
||||||
|
|
||||||
if len(wedged_vms) == 0:
|
if len(wedged_vms) == 0:
|
||||||
yield Result(state=State.OK, summary="No wedge detected")
|
summary = f"No wedge detected ({len(vms)} VM external NIC(s) checked)"
|
||||||
|
yield Result(state=State.OK, summary=summary)
|
||||||
elif len(wedged_vms) == 1:
|
elif len(wedged_vms) == 1:
|
||||||
vm = wedged_vms[0]
|
vm = wedged_vms[0]
|
||||||
summary = "Potential wedge detected for VM %s (%s)" % (vm["vm"], vm["ip"])
|
summary = "Potential wedge detected for VM %s (%s)" % (vm["vm"], vm["ip"])
|
||||||
@ -56,7 +46,7 @@ def check_triton_wedge(item, params, section):
|
|||||||
|
|
||||||
check_plugin_triton_wedge = CheckPlugin(
|
check_plugin_triton_wedge = CheckPlugin(
|
||||||
name="triton_wedge",
|
name="triton_wedge",
|
||||||
service_name="Triton Wedge CN %s",
|
service_name="Triton Wedge Detector (%s)",
|
||||||
discovery_function=discover_triton_wedge,
|
discovery_function=discover_triton_wedge,
|
||||||
check_function=check_triton_wedge,
|
check_function=check_triton_wedge,
|
||||||
check_default_parameters={},
|
check_default_parameters={},
|
||||||
|
@ -2,16 +2,69 @@
|
|||||||
|
|
||||||
# The range of ephemeral local ports we use when attempting to probe remote
|
# The range of ephemeral local ports we use when attempting to probe remote
|
||||||
# IPs. In the past, wedged ports appeared with a stride of 8; to be safe, we use
|
# IPs. In the past, wedged ports appeared with a stride of 8; to be safe, we use
|
||||||
# a stride of 128.
|
# a stride of 16. So we select a subrange of 16 ports somewhere within the
|
||||||
PORT_RANGE_START = 57000
|
# PORT_RANGE_START/END to scan; this reduces the chance of exhausted ports when
|
||||||
PORT_RANGE_END = 57128
|
# a human uses this tool (we have a tendency to run commands faster than
|
||||||
|
# the ephemeral port timeout).
|
||||||
|
PORT_SUBRANGE_SIZE = 16
|
||||||
|
PORT_RANGE_START = 50000
|
||||||
|
PORT_RANGE_END = 65504
|
||||||
|
# How often and for how long to attempt to connect to a VM
|
||||||
CONNECT_RETRIES = 3
|
CONNECT_RETRIES = 3
|
||||||
|
CONNECT_TIMEOUT = 1 # seconds
|
||||||
|
# Remote ports, in order, to attempt to connect to. More ports means higher
|
||||||
|
# chance of being able to test for a wedge, but also takes more time.
|
||||||
CHECK_REMOTE_PORTS = [443, 80]
|
CHECK_REMOTE_PORTS = [443, 80]
|
||||||
|
# How many VMs we'll be portmapping concurrently.
|
||||||
CONCURRENT_SCANS = 200
|
CONCURRENT_SCANS = 200
|
||||||
NAPI_TIMEOUT = 10 # seconds
|
NAPI_TIMEOUT = 10 # seconds
|
||||||
|
|
||||||
|
|
||||||
|
# This is a hackish lookup to quickly convert CN UUIDs to host names.
|
||||||
|
# Ops wants host names, and wants them quick. Adding the lookup here is
|
||||||
|
# the fastest way to do it, although ideally it'd be put in a rule instead.
|
||||||
|
HOSTNAME_LOOKUP = {
|
||||||
|
"00000000-0000-0000-0000-ac1f6b41905a": "ac-1f-6b-27-81-40",
|
||||||
|
"44454c4c-3000-104a-804a-b3c04f465632": "e4-43-4b-b7-ad-a4",
|
||||||
|
"44454c4c-3000-104b-8039-b3c04f465632": "e4-43-4b-b7-ad-e0",
|
||||||
|
"44454c4c-3000-1056-8044-b4c04f465632": "e4-43-4b-b7-b0-38",
|
||||||
|
"44454c4c-3300-1051-8030-b4c04f525032": "e4-43-4b-bd-94-4c",
|
||||||
|
"44454c4c-3600-1038-804b-b2c04f445a32": "e4-43-4b-86-30-30",
|
||||||
|
"44454c4c-4200-1031-8033-c4c04f594d32": "24-6e-96-5e-a9-c8",
|
||||||
|
"44454c4c-4400-1053-8054-b4c04f474c32": "80-18-44-e5-20-38",
|
||||||
|
"44454c4c-4400-1053-8058-b4c04f474c32": "80-18-44-e5-1f-b4",
|
||||||
|
"44454c4c-4400-1054-8030-b4c04f484c32": "80-18-44-e5-35-80",
|
||||||
|
"44454c4c-4400-1054-8052-b4c04f474c32": "80-18-44-e5-24-bc",
|
||||||
|
"44454c4c-4400-1058-8042-c3c04f513033": "24-6e-96-5e-b3-9c",
|
||||||
|
"44454c4c-4400-1059-8042-c3c04f513033": "24-6e-96-63-f7-9c",
|
||||||
|
"44454c4c-4400-105a-8037-c3c04f513033": "24-6e-96-2e-fa-54",
|
||||||
|
"44454c4c-4600-1030-8057-c2c04f485032": "24-6e-96-0d-9c-98",
|
||||||
|
"44454c4c-4800-1038-8048-b2c04f435a32": "e4-43-4b-86-72-c8",
|
||||||
|
"44454c4c-4800-1038-8048-b4c04f435a32": "e4-43-4b-86-72-d0",
|
||||||
|
"44454c4c-4800-1038-8048-b5c04f435a32": "e4-43-4b-86-6c-8c",
|
||||||
|
"44454c4c-4800-1038-8048-b6c04f435a32": "e4-43-4b-86-6c-08",
|
||||||
|
"44454c4c-4800-1038-8048-b7c04f435a32": "e4-43-4b-86-73-18",
|
||||||
|
"44454c4c-4800-1038-8048-b8c04f435a32": "e4-43-4b-86-72-ec",
|
||||||
|
"44454c4c-4800-1038-8048-b9c04f435a32": "e4-43-4b-86-73-00",
|
||||||
|
"44454c4c-4800-1038-8048-c2c04f435a32": "e4-43-4b-86-73-04",
|
||||||
|
"44454c4c-4a00-1048-8033-b7c04f513033": "24-6e-96-2f-22-28",
|
||||||
|
"44454c4c-4b00-1056-8057-83965dec755b": "80-18-44-e5-d2-58-backup",
|
||||||
|
"44454c4c-4b00-1056-8057-b7c04f314c32": "headnode",
|
||||||
|
"44454c4c-4c00-104a-805a-b7c04f314c32": "80-18-44-e5-cf-84",
|
||||||
|
"44454c4c-4c00-104b-8058-b7c04f314c32": "80-18-44-e5-d2-50",
|
||||||
|
"44454c4c-4c00-104c-8051-b7c04f314c32": "80-18-44-e5-cf-4c",
|
||||||
|
"44454c4c-4c00-104c-8057-b7c04f314c32": "80-18-44-e5-ce-24",
|
||||||
|
"44454c4c-4c00-104d-8051-b7c04f314c32": "80-18-44-e5-ce-6c",
|
||||||
|
"44454c4c-4c00-104d-8058-b7c04f314c32": "80-18-44-e5-d2-6c",
|
||||||
|
"44454c4c-4c00-104e-8052-b7c04f314c32": "80-18-44-e5-d0-8c",
|
||||||
|
"44454c4c-4c00-104e-8056-b7c04f314c32": "80-18-44-e5-d0-1c",
|
||||||
|
"44454c4c-4c00-104e-8058-b7c04f314c32": "80-18-44-e5-cd-1c",
|
||||||
|
"44454c4c-5000-1053-8036-c3c04f445032": "24-6e-96-39-6c-5c",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
import urllib.request, sys, argparse, asyncio, json, socket, errno
|
import urllib.request, sys, argparse, asyncio, json, socket, errno
|
||||||
|
import random
|
||||||
|
|
||||||
|
|
||||||
def get_url(url, timeout=None):
|
def get_url(url, timeout=None):
|
||||||
@ -50,7 +103,7 @@ async def async_connect(src, dest):
|
|||||||
for attempt in range(CONNECT_RETRIES):
|
for attempt in range(CONNECT_RETRIES):
|
||||||
try:
|
try:
|
||||||
future = loop.sock_connect(sd, dest)
|
future = loop.sock_connect(sd, dest)
|
||||||
await asyncio.wait_for(future, timeout=0.1)
|
await asyncio.wait_for(future, timeout=CONNECT_TIMEOUT)
|
||||||
connected = True
|
connected = True
|
||||||
break
|
break
|
||||||
except ConnectionRefusedError:
|
except ConnectionRefusedError:
|
||||||
@ -74,6 +127,20 @@ async def async_connect(src, dest):
|
|||||||
return connected
|
return connected
|
||||||
|
|
||||||
|
|
||||||
|
# Return a pair of numbers to use as the start and end ephemeral ports which
|
||||||
|
# are used to connect to a remote server.
|
||||||
|
def calculate_local_port_range():
    """Pick a random subrange of local ports to connect from.

    The span PORT_RANGE_START..PORT_RANGE_END is divided into consecutive,
    non-overlapping slots of PORT_SUBRANGE_SIZE ports each, and one slot is
    chosen at random.

    Returns:
        A ``(range_start, range_end)`` tuple; ``range_end`` is exclusive,
        matching its use as the upper bound of ``range()`` by the caller.
    """
    num_ports = PORT_RANGE_END - PORT_RANGE_START
    # Guard against a configured range smaller than one subrange, so that
    # randrange() always has at least one slot to pick from.
    num_ranges = max(1, num_ports // PORT_SUBRANGE_SIZE)

    # randrange() excludes its upper bound. The previous randint(0, num_ranges)
    # was inclusive, so it could select a slot starting at PORT_RANGE_END,
    # i.e. entirely outside the configured range.
    slot = random.randrange(num_ranges)
    range_start = PORT_RANGE_START + slot * PORT_SUBRANGE_SIZE
    # Only matters for the degenerate single-slot case above: never run past
    # the configured end of the range.
    range_end = min(range_start + PORT_SUBRANGE_SIZE, PORT_RANGE_END)

    return range_start, range_end
|
||||||
|
|
||||||
|
|
||||||
# Check for a wedge on a NIC. We detect a wedge by doing the following:
|
# Check for a wedge on a NIC. We detect a wedge by doing the following:
|
||||||
#
|
#
|
||||||
# Us (local IP, local port) -----> Them (remote IP, remote port)
|
# Us (local IP, local port) -----> Them (remote IP, remote port)
|
||||||
@ -87,24 +154,41 @@ async def check_for_wedge(nic, semaphore):
|
|||||||
remote_ip = nic["ip"]
|
remote_ip = nic["ip"]
|
||||||
|
|
||||||
can_connect = False
|
can_connect = False
|
||||||
|
|
||||||
|
cn = nic["cn_uuid"]
|
||||||
|
# convert server UUID to hostname if we know the hostname
|
||||||
|
cn_hostname = HOSTNAME_LOOKUP.get(cn)
|
||||||
|
if cn_hostname:
|
||||||
|
cn = cn_hostname
|
||||||
|
|
||||||
result = {
|
result = {
|
||||||
"cn": nic["cn_uuid"],
|
"cn": cn,
|
||||||
"vm": nic["belongs_to_uuid"],
|
"vm": nic["belongs_to_uuid"],
|
||||||
"ip": nic["ip"],
|
"ip": nic["ip"],
|
||||||
"wedged": False
|
"wedged": False
|
||||||
}
|
}
|
||||||
|
|
||||||
async with semaphore:
|
async with semaphore:
|
||||||
|
local_start_port, local_end_port = calculate_local_port_range()
|
||||||
|
|
||||||
# To speed things up, we only check ports 443 and 80, which are the
|
# To speed things up, we only check ports 443 and 80, which are the
|
||||||
# most common ports on the Internet.
|
# most common ports on the Internet.
|
||||||
for remote_port in CHECK_REMOTE_PORTS:
|
for remote_port in CHECK_REMOTE_PORTS:
|
||||||
if can_connect:
|
if can_connect:
|
||||||
break
|
break
|
||||||
|
|
||||||
for local_port in range(PORT_RANGE_START, PORT_RANGE_END):
|
for local_port in range(local_start_port, local_end_port):
|
||||||
src = (local_ip, local_port)
|
src = (local_ip, local_port)
|
||||||
dest = (remote_ip, remote_port)
|
dest = (remote_ip, remote_port)
|
||||||
|
|
||||||
|
try:
|
||||||
connected = await async_connect(src, dest)
|
connected = await async_connect(src, dest)
|
||||||
|
except OSError as e:
|
||||||
|
if e.errno == errno.EADDRINUSE:
|
||||||
|
result["wedged"] = None
|
||||||
|
return result
|
||||||
|
else:
|
||||||
|
raise
|
||||||
|
|
||||||
if can_connect and not connected:
|
if can_connect and not connected:
|
||||||
result["wedged"] = True
|
result["wedged"] = True
|
||||||
@ -127,11 +211,24 @@ async def scan(nics):
|
|||||||
# Print out all our results in a format that CheckMK understands. Most of our
|
# Print out all our results in a format that CheckMK understands. Most of our
|
||||||
# output are in JSON rows.
|
# output are in JSON rows.
|
||||||
def print_out(scan_results, agent_name):
|
def print_out(scan_results, agent_name):
|
||||||
sys.stdout.write(f"<<<{agent_name}:sep(0)>>>\n")
|
|
||||||
scan_results = list(scan_results)
|
scan_results = list(scan_results)
|
||||||
scan_results.sort(key=lambda d: d["cn"] + d["vm"] + d["ip"])
|
scan_results.sort(key=lambda d: d["cn"] + d["vm"] + d["ip"])
|
||||||
|
|
||||||
|
last_cn = None
|
||||||
|
|
||||||
for entry in scan_results:
|
for entry in scan_results:
|
||||||
sys.stdout.write("%s\n" % json.dumps(entry))
|
curr_cn = entry["cn"]
|
||||||
|
|
||||||
|
if curr_cn != last_cn:
|
||||||
|
last_cn = curr_cn
|
||||||
|
sys.stdout.write(f"<<<<{curr_cn}>>>>\n")
|
||||||
|
sys.stdout.write(f"<<<{agent_name}:sep(0)>>>\n")
|
||||||
|
|
||||||
|
sys.stdout.write("%s\n" % json.dumps({
|
||||||
|
"vm": entry["vm"],
|
||||||
|
"ip": entry["ip"],
|
||||||
|
"wedged": entry["wedged"],
|
||||||
|
}))
|
||||||
|
|
||||||
|
|
||||||
# Parse the command-line arguments, specifically for hostname. Print out help
|
# Parse the command-line arguments, specifically for hostname. Print out help
|
||||||
@ -163,4 +260,3 @@ def main(argv=None):
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
sys.exit(main())
|
sys.exit(main())
|
||||||
|
|
||||||
|
Binary file not shown.
BIN
wedge/triton_wedge-0.3.0.mkp
Executable file
BIN
wedge/triton_wedge-0.3.0.mkp
Executable file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user