Improvements as requested by George:

* Switch wedge agent from reporting CN UUIDs to CN hostnames, and use CheckMK's piggyback mechanism to send wedge status to the correct Host in CheckMK.
* Improve how local ports are selected during connect() attempts, so that conflicts on subsequent runs are (much) less likely, given how long ephemeral ports take to expire.
* Increase the connect() timeout from 100ms to 1000ms, to cope better with potentially slow/overloaded VMs.
This commit is contained in:
Marsell Kukuljevic 2025-05-20 16:56:19 +02:00
parent 0358e8b2e8
commit 9e0e13a636
4 changed files with 115 additions and 29 deletions

View File

@ -7,15 +7,10 @@ from cmk.agent_based.v2 import Result, Service, State, CheckPlugin, AgentSection
def parse_triton_wedge(string_table):
lookup = {}
vms = []
for row in string_table:
nic = json.loads(row[0])
cn_name = nic["cn"]
vms_in_cn = lookup.setdefault(cn_name, [])
vms_in_cn.append(nic)
return lookup
vms.append(json.loads(row[0]))
return vms
agent_section_triton_wedge = AgentSection(
@ -25,18 +20,12 @@ agent_section_triton_wedge = AgentSection(
def discover_triton_wedge(section):
for cn_name, vms in sorted(section.items()):
yield Service(item=cn_name, parameters={"name": cn_name})
if section:
yield Service(item="1")
def check_triton_wedge(item, params, section):
cn_name = params["name"]
vms = section.get(cn_name)
if vms is None:
yield Result(state=State.WARN, summary="Not appearing in NAPI")
return
vms = section
wedged_vms = []
for vm in vms:
@ -44,7 +33,8 @@ def check_triton_wedge(item, params, section):
wedged_vms.append(vm)
if len(wedged_vms) == 0:
yield Result(state=State.OK, summary="No wedge detected")
summary = f"No wedge detected ({len(vms)} VM external NIC(s) checked)"
yield Result(state=State.OK, summary=summary)
elif len(wedged_vms) == 1:
vm = wedged_vms[0]
summary = "Potential wedge detected for VM %s (%s)" % (vm["vm"], vm["ip"])
@ -56,7 +46,7 @@ def check_triton_wedge(item, params, section):
check_plugin_triton_wedge = CheckPlugin(
name="triton_wedge",
service_name="Triton Wedge CN %s",
service_name="Triton Wedge Detector (%s)",
discovery_function=discover_triton_wedge,
check_function=check_triton_wedge,
check_default_parameters={},

View File

@ -2,16 +2,69 @@
# The range of ephemeral local ports we use when attempting to probe remote
# IPs. In the past, wedged ports appeared with a stride of 8; to be safe, we use
# a stride of 128.
PORT_RANGE_START = 57000
PORT_RANGE_END = 57128
# a stride of 16. So we select a subrange of 16 ports somewhere within the
# PORT_RANGE_START/END to scan; this reduces the chance of exhausted ports when
# a human uses this tool (we have a tendency to run commands faster than
# the ephemeral port timeout).
PORT_SUBRANGE_SIZE = 16
PORT_RANGE_START = 50000
PORT_RANGE_END = 65504
# How often and for how long to attempt to connect to a VM
CONNECT_RETRIES = 3
CONNECT_TIMEOUT = 1 # seconds
# Remote ports, in order, to attempt to connect to. More ports means higher
# chance of being able to test for a wedge, but also takes more time.
CHECK_REMOTE_PORTS = [443, 80]
# How many VMs we'll be portmapping concurrently.
CONCURRENT_SCANS = 200
NAPI_TIMEOUT = 10 # seconds
# This is a hackish lookup to quickly convert CN UUIDs to host names.
# Ops wants host names, and wants them quick. Adding the lookup here is
# the fastest way to do it, although ideally it'd be put in a rule instead.
HOSTNAME_LOOKUP = {
"00000000-0000-0000-0000-ac1f6b41905a": "ac-1f-6b-27-81-40",
"44454c4c-3000-104a-804a-b3c04f465632": "e4-43-4b-b7-ad-a4",
"44454c4c-3000-104b-8039-b3c04f465632": "e4-43-4b-b7-ad-e0",
"44454c4c-3000-1056-8044-b4c04f465632": "e4-43-4b-b7-b0-38",
"44454c4c-3300-1051-8030-b4c04f525032": "e4-43-4b-bd-94-4c",
"44454c4c-3600-1038-804b-b2c04f445a32": "e4-43-4b-86-30-30",
"44454c4c-4200-1031-8033-c4c04f594d32": "24-6e-96-5e-a9-c8",
"44454c4c-4400-1053-8054-b4c04f474c32": "80-18-44-e5-20-38",
"44454c4c-4400-1053-8058-b4c04f474c32": "80-18-44-e5-1f-b4",
"44454c4c-4400-1054-8030-b4c04f484c32": "80-18-44-e5-35-80",
"44454c4c-4400-1054-8052-b4c04f474c32": "80-18-44-e5-24-bc",
"44454c4c-4400-1058-8042-c3c04f513033": "24-6e-96-5e-b3-9c",
"44454c4c-4400-1059-8042-c3c04f513033": "24-6e-96-63-f7-9c",
"44454c4c-4400-105a-8037-c3c04f513033": "24-6e-96-2e-fa-54",
"44454c4c-4600-1030-8057-c2c04f485032": "24-6e-96-0d-9c-98",
"44454c4c-4800-1038-8048-b2c04f435a32": "e4-43-4b-86-72-c8",
"44454c4c-4800-1038-8048-b4c04f435a32": "e4-43-4b-86-72-d0",
"44454c4c-4800-1038-8048-b5c04f435a32": "e4-43-4b-86-6c-8c",
"44454c4c-4800-1038-8048-b6c04f435a32": "e4-43-4b-86-6c-08",
"44454c4c-4800-1038-8048-b7c04f435a32": "e4-43-4b-86-73-18",
"44454c4c-4800-1038-8048-b8c04f435a32": "e4-43-4b-86-72-ec",
"44454c4c-4800-1038-8048-b9c04f435a32": "e4-43-4b-86-73-00",
"44454c4c-4800-1038-8048-c2c04f435a32": "e4-43-4b-86-73-04",
"44454c4c-4a00-1048-8033-b7c04f513033": "24-6e-96-2f-22-28",
"44454c4c-4b00-1056-8057-83965dec755b": "80-18-44-e5-d2-58-backup",
"44454c4c-4b00-1056-8057-b7c04f314c32": "headnode",
"44454c4c-4c00-104a-805a-b7c04f314c32": "80-18-44-e5-cf-84",
"44454c4c-4c00-104b-8058-b7c04f314c32": "80-18-44-e5-d2-50",
"44454c4c-4c00-104c-8051-b7c04f314c32": "80-18-44-e5-cf-4c",
"44454c4c-4c00-104c-8057-b7c04f314c32": "80-18-44-e5-ce-24",
"44454c4c-4c00-104d-8051-b7c04f314c32": "80-18-44-e5-ce-6c",
"44454c4c-4c00-104d-8058-b7c04f314c32": "80-18-44-e5-d2-6c",
"44454c4c-4c00-104e-8052-b7c04f314c32": "80-18-44-e5-d0-8c",
"44454c4c-4c00-104e-8056-b7c04f314c32": "80-18-44-e5-d0-1c",
"44454c4c-4c00-104e-8058-b7c04f314c32": "80-18-44-e5-cd-1c",
"44454c4c-5000-1053-8036-c3c04f445032": "24-6e-96-39-6c-5c",
}
import urllib.request, sys, argparse, asyncio, json, socket, errno
import random
def get_url(url, timeout=None):
@ -50,7 +103,7 @@ async def async_connect(src, dest):
for attempt in range(CONNECT_RETRIES):
try:
future = loop.sock_connect(sd, dest)
await asyncio.wait_for(future, timeout=0.1)
await asyncio.wait_for(future, timeout=CONNECT_TIMEOUT)
connected = True
break
except ConnectionRefusedError:
@ -74,6 +127,20 @@ async def async_connect(src, dest):
return connected
# Return a (start, end) pair of local ephemeral port numbers to bind to when
# connecting to a remote server. The pair is half-open: the caller iterates
# range(start, end), so `end` itself is never bound.
def calculate_local_port_range():
    # Split PORT_RANGE_START..PORT_RANGE_END into consecutive, non-overlapping
    # subranges of PORT_SUBRANGE_SIZE ports and pick one at random. Aligning
    # on the subrange size means two runs either pick the same subrange or
    # entirely disjoint ones.
    num_ports = PORT_RANGE_END - PORT_RANGE_START
    num_ranges = num_ports // PORT_SUBRANGE_SIZE

    # random.randrange() excludes its upper bound, unlike random.randint(),
    # so the chosen subrange always ends at or before PORT_RANGE_END. If the
    # configured range is smaller than one subrange, fall back to the single
    # subrange starting at PORT_RANGE_START.
    range_index = random.randrange(max(num_ranges, 1))

    range_start = PORT_RANGE_START + range_index * PORT_SUBRANGE_SIZE
    range_end = range_start + PORT_SUBRANGE_SIZE
    return range_start, range_end
# Check for a wedge on a NIC. We detect a wedge by doing the following:
#
# Us (local IP, local port) -----> Them (remote IP, remote port)
@ -87,24 +154,41 @@ async def check_for_wedge(nic, semaphore):
remote_ip = nic["ip"]
can_connect = False
cn = nic["cn_uuid"]
# convert server UUID to hostname if we know the hostname
cn_hostname = HOSTNAME_LOOKUP.get(cn)
if cn_hostname:
cn = cn_hostname
result = {
"cn": nic["cn_uuid"],
"cn": cn,
"vm": nic["belongs_to_uuid"],
"ip": nic["ip"],
"wedged": False
}
async with semaphore:
local_start_port, local_end_port = calculate_local_port_range()
# To speed things up, we only check ports 443 and 80, which are the
# most common ports on the Internet.
for remote_port in CHECK_REMOTE_PORTS:
if can_connect:
break
for local_port in range(PORT_RANGE_START, PORT_RANGE_END):
for local_port in range(local_start_port, local_end_port):
src = (local_ip, local_port)
dest = (remote_ip, remote_port)
try:
connected = await async_connect(src, dest)
except OSError as e:
if e.errno == errno.EADDRINUSE:
result["wedged"] = None
return result
else:
raise
if can_connect and not connected:
result["wedged"] = True
@ -127,11 +211,24 @@ async def scan(nics):
# Print out all our results in a format that CheckMK understands. Most of our
# output is in JSON rows.
def print_out(scan_results, agent_name):
sys.stdout.write(f"<<<{agent_name}:sep(0)>>>\n")
scan_results = list(scan_results)
scan_results.sort(key=lambda d: d["cn"] + d["vm"] + d["ip"])
last_cn = None
for entry in scan_results:
sys.stdout.write("%s\n" % json.dumps(entry))
curr_cn = entry["cn"]
if curr_cn != last_cn:
last_cn = curr_cn
sys.stdout.write(f"<<<<{curr_cn}>>>>\n")
sys.stdout.write(f"<<<{agent_name}:sep(0)>>>\n")
sys.stdout.write("%s\n" % json.dumps({
"vm": entry["vm"],
"ip": entry["ip"],
"wedged": entry["wedged"],
}))
# Parse the command-line arguments, specifically for hostname. Print out help
@ -163,4 +260,3 @@ def main(argv=None):
if __name__ == "__main__":
sys.exit(main())

Binary file not shown.

BIN
wedge/triton_wedge-0.3.0.mkp Executable file

Binary file not shown.