Compare commits

..

9 Commits

5 changed files with 0 additions and 375 deletions

View File

@ -1,57 +0,0 @@
#!/usr/bin/env python3
#
# Parses and checks external VM IPs.
import json
from cmk.agent_based.v2 import Result, Service, State, CheckPlugin, AgentSection
def parse_triton_wedge(string_table):
    """Decode the agent's JSON rows into a list of Python values.

    Each row of ``string_table`` is expected to carry one complete JSON
    document in its first column.

    Args:
        string_table: Rows of the ``triton_wedge`` agent section.

    Returns:
        A list with one decoded JSON value per non-empty row, in row order.

    Raises:
        json.JSONDecodeError: If a row does not contain valid JSON.
    """
    # Empty rows are skipped rather than failing with IndexError on row[0].
    return [json.loads(row[0]) for row in string_table if row]
# Section definition binding the "triton_wedge" section name to its parser.
agent_section_triton_wedge = AgentSection(
    name="triton_wedge",
    parse_function=parse_triton_wedge,
)
def discover_triton_wedge(section):
    """Yield a single service with item "1" when the section is non-empty."""
    if not section:
        return
    yield Service(item="1")
def check_triton_wedge(item, params, section):
    """Report whether any scanned VM NIC looks wedged.

    Args:
        item: Service item; not used by the check.
        params: Check parameters; not used by the check.
        section: List of dicts, one per decoded agent row.

    Yields:
        Exactly one ``Result``: UNKNOWN when the section is a single row
        carrying an ``"error"`` message, OK when no row is marked
        ``"probably"`` wedged, WARN for exactly one such row and CRIT for
        two or more.
    """
    # A lone row with an "error" entry means no scan data is available.
    if len(section) == 1 and section[0].get("error"):
        yield Result(state=State.UNKNOWN, summary=section[0]["error"])
        return

    # .get() keeps rows without a "wedged" key (e.g. an error row mixed in
    # with regular rows) from raising KeyError.
    wedged_vms = [vm for vm in section if vm.get("wedged") == "probably"]

    if not wedged_vms:
        summary = f"No wedge detected ({len(section)} VM external NIC(s) checked)"
        yield Result(state=State.OK, summary=summary)
        return

    described = ", ".join(f"VM {vm['vm']} ({vm['ip']})" for vm in wedged_vms)
    if len(wedged_vms) == 1:
        # A single failing VM is only treated as a potential wedge.
        yield Result(
            state=State.WARN,
            summary=f"Potential wedge detected for {described}",
        )
    else:
        yield Result(
            state=State.CRIT,
            summary=f"Likely wedge detected for {described}",
        )
# Check plugin definition wiring up the discovery and check functions; the
# "%s" in the service name is the placeholder for the service item.
check_plugin_triton_wedge = CheckPlugin(
    name="triton_wedge",
    service_name="Triton Wedge Detector (%s)",
    discovery_function=discover_triton_wedge,
    check_function=check_triton_wedge,
    check_default_parameters={},
)

View File

@ -1,273 +0,0 @@
#!/usr/bin/env python3
# Local (ephemeral) source ports used when probing remote IPs. In the past,
# wedged ports appeared with a stride of 8; to be safe, we use a stride of 16.
# Each scan therefore picks one subrange of PORT_SUBRANGE_SIZE ports somewhere
# within PORT_RANGE_START/PORT_RANGE_END; this reduces the chance of exhausting
# ports when a human runs this tool repeatedly (we have a tendency to run
# commands faster than the ephemeral port timeout).
PORT_SUBRANGE_SIZE = 16
PORT_RANGE_START = 50000
PORT_RANGE_END = 65504
# How many times, and for how long per attempt, to try connecting to a VM.
CONNECT_RETRIES = 3
CONNECT_TIMEOUT = 0.5 # seconds
# Remote ports, in order, to attempt to connect to. More ports means a higher
# chance of being able to test for a wedge, but also takes more time.
CHECK_REMOTE_PORTS = [443, 80]
# How many VMs we'll be portmapping concurrently.
CONCURRENT_SCANS = 200
# Timeout handed to the HTTP request that queries NAPI.
NAPI_TIMEOUT = 10 # seconds
# Section name used in the CheckMK section headers this agent prints.
AGENT_NAME = "triton_wedge"
# Quick-and-dirty table translating CN (server) UUIDs into host names.
# Ops wants host names in the output, and wanted them fast; hard-coding the
# table here was the quickest route, although a rule would be the proper
# home for it. UUIDs missing from this table are reported as-is.
HOSTNAME_LOOKUP = {
    "00000000-0000-0000-0000-ac1f6b41905a": "ac-1f-6b-27-81-40",
    "44454c4c-3000-104a-804a-b3c04f465632": "e4-43-4b-b7-ad-a4",
    "44454c4c-3000-104b-8039-b3c04f465632": "e4-43-4b-b7-ad-e0",
    "44454c4c-3000-1056-8044-b4c04f465632": "e4-43-4b-b7-b0-38",
    "44454c4c-3300-1051-8030-b4c04f525032": "e4-43-4b-bd-94-4c",
    "44454c4c-3600-1038-804b-b2c04f445a32": "e4-43-4b-86-30-30",
    "44454c4c-4200-1031-8033-c4c04f594d32": "24-6e-96-5e-a9-c8",
    "44454c4c-4400-1053-8054-b4c04f474c32": "80-18-44-e5-20-38",
    "44454c4c-4400-1053-8058-b4c04f474c32": "80-18-44-e5-1f-b4",
    "44454c4c-4400-1054-8030-b4c04f484c32": "80-18-44-e5-35-80",
    "44454c4c-4400-1054-8052-b4c04f474c32": "80-18-44-e5-24-bc",
    "44454c4c-4400-1058-8042-c3c04f513033": "24-6e-96-5e-b3-9c",
    "44454c4c-4400-1059-8042-c3c04f513033": "24-6e-96-63-f7-9c",
    "44454c4c-4400-105a-8037-c3c04f513033": "24-6e-96-2e-fa-54",
    "44454c4c-4600-1030-8057-c2c04f485032": "24-6e-96-0d-9c-98",
    "44454c4c-4800-1038-8048-b2c04f435a32": "e4-43-4b-86-72-c8",
    "44454c4c-4800-1038-8048-b4c04f435a32": "e4-43-4b-86-72-d0",
    "44454c4c-4800-1038-8048-b5c04f435a32": "e4-43-4b-86-6c-8c",
    "44454c4c-4800-1038-8048-b6c04f435a32": "e4-43-4b-86-6c-08",
    "44454c4c-4800-1038-8048-b7c04f435a32": "e4-43-4b-86-73-18",
    "44454c4c-4800-1038-8048-b8c04f435a32": "e4-43-4b-86-72-ec",
    "44454c4c-4800-1038-8048-b9c04f435a32": "e4-43-4b-86-73-00",
    "44454c4c-4800-1038-8048-c2c04f435a32": "e4-43-4b-86-73-04",
    "44454c4c-4a00-1048-8033-b7c04f513033": "24-6e-96-2f-22-28",
    "44454c4c-4b00-1056-8057-83965dec755b": "80-18-44-e5-d2-58-backup",
    "44454c4c-4b00-1056-8057-b7c04f314c32": "headnode",
    "44454c4c-4c00-104a-805a-b7c04f314c32": "80-18-44-e5-cf-84",
    "44454c4c-4c00-104b-8058-b7c04f314c32": "80-18-44-e5-d2-50",
    "44454c4c-4c00-104c-8051-b7c04f314c32": "80-18-44-e5-cf-4c",
    "44454c4c-4c00-104c-8057-b7c04f314c32": "80-18-44-e5-ce-24",
    "44454c4c-4c00-104d-8051-b7c04f314c32": "80-18-44-e5-ce-6c",
    "44454c4c-4c00-104d-8058-b7c04f314c32": "80-18-44-e5-d2-6c",
    "44454c4c-4c00-104e-8052-b7c04f314c32": "80-18-44-e5-d0-8c",
    "44454c4c-4c00-104e-8056-b7c04f314c32": "80-18-44-e5-d0-1c",
    "44454c4c-4c00-104e-8058-b7c04f314c32": "80-18-44-e5-cd-1c",
    "44454c4c-5000-1053-8036-c3c04f445032": "24-6e-96-39-6c-5c",
}
import urllib.request, sys, argparse, asyncio, json, socket, errno
import random
def get_url(url, timeout=None):
    """Open ``url`` and return everything read from the response.

    ``timeout`` is passed straight through to ``urllib.request.urlopen``.
    """
    with urllib.request.urlopen(
        urllib.request.Request(url), timeout=timeout
    ) as response:
        return response.read()
def query_napi(addr):
    """Fetch and parse details about active zone NICs on the external network.

    Args:
        addr: Host part of the NAPI URL to query.

    Returns:
        The decoded JSON response, or None when NAPI could not be queried or
        its response could not be decoded.
    """
    url = f"http://{addr}/nics?nic_tag=external&belongs_to_type=zone&state=running"
    try:
        return json.loads(get_url(url, NAPI_TIMEOUT))
    except (OSError, ValueError):
        # OSError covers urllib's URLError/HTTPError as well as socket
        # timeouts and resets raised while the body is being read; ValueError
        # covers undecodable or malformed JSON. All of them mean we have no
        # usable NAPI data, which callers detect through the None result.
        return None
async def async_connect(src, dest):
    """Try to open a TCP connection from local address ``src`` to ``dest``.

    asyncio provides some nice connection methods, but none of them allow us
    to use SO_REUSEPORT. This flag is critical since we're repeatedly using
    the same range of local ports to port map remote IPs, so we resort to
    low-level socket handling to enable it.

    Args:
        src: ``(ip, port)`` tuple the local end is bound to.
        dest: ``(ip, port)`` tuple to connect to.

    Returns:
        True if a connection was established; False if it was refused, the
        host was unreachable, or every attempt timed out.

    Raises:
        OSError: If bind() fails, or connect() fails for any reason other
            than a refusal, a timeout or EHOSTUNREACH.
    """
    loop = asyncio.get_running_loop()
    # The context manager closes the socket on every exit path, including
    # when bind() or connect() raises (previously the socket leaked there).
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sd:
        sd.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
        sd.bind(src)
        sd.setblocking(False)
        # We try to connect() several times, in case a packet got lost.
        for _attempt in range(CONNECT_RETRIES):
            try:
                await asyncio.wait_for(
                    loop.sock_connect(sd, dest), timeout=CONNECT_TIMEOUT
                )
                return True
            except ConnectionRefusedError:
                # ECONNREFUSED (the peer answered our SYN with a RST). If we
                # receive this there's no point retrying.
                return False
            except (TimeoutError, asyncio.TimeoutError):
                # Usually you'd wait for the TCP stack to make its own
                # retries, but we know our target IPs are in a nearby rack,
                # so we don't want to wait that long. Ergo we do our own
                # retrying, with a fast timeout. If we hit here, a packet
                # might have been lost, so try again.
                continue
            except OSError as e:
                if e.errno == errno.EHOSTUNREACH:
                    # If there is no route, no point retrying either.
                    return False
                raise
    return False
def calculate_local_port_range():
    """Pick a random subrange of local ports to connect from.

    The span PORT_RANGE_START..PORT_RANGE_END is divided into consecutive
    chunks of PORT_SUBRANGE_SIZE ports and one whole chunk is chosen, so two
    picks are either identical or disjoint, never partially overlapping.

    Returns:
        A ``(range_start, range_end)`` pair; ``range_end`` is exclusive.
    """
    num_ranges = (PORT_RANGE_END - PORT_RANGE_START) // PORT_SUBRANGE_SIZE
    # randrange() excludes its upper bound. The former randint(0, num_ranges)
    # was inclusive and could pick a chunk starting at PORT_RANGE_END, i.e.
    # entirely outside the configured range. max() keeps a degenerate
    # configuration (range smaller than one chunk) from raising.
    chunk_index = random.randrange(max(1, num_ranges))
    range_start = PORT_RANGE_START + chunk_index * PORT_SUBRANGE_SIZE
    return range_start, range_start + PORT_SUBRANGE_SIZE
async def check_for_wedge(nic, semaphore):
    """Probe one NIC's IP for a wedge and describe the outcome.

    A wedge is detected by doing the following:

        Us (local IP, local port) -----> Them (remote IP, remote port)

    1. Find an open remote port (to speed things up only the ports listed in
       CHECK_REMOTE_PORTS are tried).
    2. Repeatedly connect() to the remote port while incrementing our local
       port.
    3. If a local port fails to connect after an earlier one succeeded, this
       may be a wedge.

    Args:
        nic: Dict providing at least "ip", "cn_uuid" and "belongs_to_uuid".
        semaphore: Async context manager held for the duration of the scan.

    Returns:
        Dict with the keys "cn", "vm", "ip" and "wedged", where "wedged" is
        "no", "probably" or "temporary local port collision".
    """
    remote_ip = nic["ip"]
    server = nic["cn_uuid"]
    verdict = {
        # Prefer the host name when the server UUID is a known one.
        "cn": HOSTNAME_LOOKUP.get(server) or server,
        "vm": nic["belongs_to_uuid"],
        "ip": nic["ip"],
        "wedged": "no",
    }
    reachable = False

    async with semaphore:
        first_port, end_port = calculate_local_port_range()
        for remote_port in CHECK_REMOTE_PORTS:
            # One remote port that answers is enough; don't try the others.
            if reachable:
                break
            target = (remote_ip, remote_port)
            for local_port in range(first_port, end_port):
                try:
                    connected = await async_connect(("0.0.0.0", local_port), target)
                except OSError as err:
                    if err.errno != errno.EADDRINUSE:
                        raise
                    verdict["wedged"] = "temporary local port collision"
                    return verdict
                if connected:
                    reachable = True
                elif reachable:
                    # An earlier local port got through but this one did not.
                    verdict["wedged"] = "probably"
                    return verdict
    return verdict
async def scan(nics):
    """Scan the ports on each NIC's IP address, checking for wedges.

    Args:
        nics: Iterable of NIC dicts, each passed to ``check_for_wedge``.

    Returns:
        A list with one ``check_for_wedge`` result per NIC, in the order the
        NICs were given. Empty when there are no NICs.

    Raises:
        Whatever exception a failed ``check_for_wedge`` task raised.
    """
    nics = list(nics)
    # asyncio.wait() refuses an empty set of tasks, so bail out early.
    if not nics:
        return []
    sem = asyncio.Semaphore(CONCURRENT_SCANS)
    # A real list (not a lazy map) so the tasks can be iterated again below.
    tasks = [asyncio.create_task(check_for_wedge(nic, sem)) for nic in nics]
    await asyncio.wait(tasks)
    # Every task has finished by now; reading results in task order keeps
    # the output order deterministic.
    return [task.result() for task in tasks]
def print_out_napi_err():
    """Print a "Cannot contact NAPI" error row for every known host.

    This is only used when we cannot contact NAPI. Ops wants to know when
    that happens, but without NAPI we don't know about any CNs either, so the
    host names in HOSTNAME_LOOKUP are used to have something to report on.
    """
    section_header = f"<<<{AGENT_NAME}:sep(0)>>>\n"
    error_row = '{"error": "Cannot contact NAPI"}\n'
    emit = sys.stdout.write
    for hostname in HOSTNAME_LOOKUP.values():
        emit(f"<<<<{hostname}>>>>\n")
        emit(section_header)
        emit(error_row)
def print_out(scan_results):
    """Print all scan results in a format that CheckMK understands.

    Results are sorted by CN, VM and IP. Whenever the CN changes, a
    ``<<<<cn>>>>`` header and the agent's section header are written; each
    result then follows as one JSON row.

    Args:
        scan_results: Iterable of dicts with the keys "cn", "vm", "ip" and
            "wedged". The iterable itself is not modified.
    """
    # A tuple key compares field by field. The former concatenated string key
    # could interleave the entries of two CNs when one CN name is a prefix of
    # the other, emitting the same CN header more than once.
    ordered = sorted(scan_results, key=lambda d: (d["cn"], d["vm"], d["ip"]))
    last_cn = None
    for entry in ordered:
        curr_cn = entry["cn"]
        if curr_cn != last_cn:
            last_cn = curr_cn
            sys.stdout.write(f"<<<<{curr_cn}>>>>\n")
            sys.stdout.write(f"<<<{AGENT_NAME}:sep(0)>>>\n")
        row = {
            "vm": entry["vm"],
            "ip": entry["ip"],
            "wedged": entry["wedged"],
        }
        sys.stdout.write("%s\n" % json.dumps(row))
def parse_arguments(argv):
    """Parse the command line, which consists of one positional HOSTNAME.

    Args:
        argv: List of argument strings, without the program name.

    Returns:
        The argparse namespace; the NAPI host is available as ``.hostname``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "hostname",
        metavar="HOSTNAME",
        help="Hostname of NAPI to query.",
    )
    namespace = cli.parse_args(argv)
    return namespace
def main(argv=None):
    """Parse args, contact NAPI, scan the external IPs of VMs, print results.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 0 after printing the NAPI error output when
            NAPI could not be queried.
    """
    if argv is None:
        argv = sys.argv[1:]
    args = parse_arguments(argv)
    nics = query_napi(args.hostname)
    if nics is None:
        print_out_napi_err()
        # We must exit with 0, or CheckMK won't check the output we're
        # returning.
        sys.exit(0)
    # asyncio.wait() cannot be given an empty set of tasks, so don't start a
    # scan when NAPI reports no NICs at all; there is nothing to print then.
    scan_results = asyncio.run(scan(nics)) if nics else []
    print_out(scan_results)
# Script entry point: main()'s return value becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())

View File

@ -1,30 +0,0 @@
#!/usr/bin/env python3
#
# GUI config page for triton_wedge.
from cmk.rulesets.v1.form_specs.validators import LengthInRange
from cmk.rulesets.v1.form_specs import Dictionary, DictElement, String
from cmk.rulesets.v1.rule_specs import SpecialAgent, Topic, Title, Help
def _formspec():
    """Build the rule form: a dictionary with one required "instance" string."""
    # Non-empty text field holding the NAPI address.
    hostname_field = String(
        title=Title("Hostname"),
        help_text=Help("Hostname or IP of NAPI to query"),
        custom_validate=(LengthInRange(min_value=1),),
    )
    return Dictionary(
        title=Title("Triton Wedge Detection"),
        elements={
            "instance": DictElement(
                parameter_form=hostname_field,
                required=True,
            ),
        },
    )
# Special agent rule definition; its parameter form is built by _formspec.
rule_spec_agent_config_triton_wedge = SpecialAgent(
    topic=Topic.NETWORKING,
    name="triton_wedge",
    title=Title("Triton Wedge Detector"),
    parameter_form=_formspec,
)

View File

@ -1,15 +0,0 @@
#!/usr/bin/env python3
from cmk.server_side_calls.v1 import noop_parser, SpecialAgentConfig, SpecialAgentCommand
def _agent_arguments(params, host_config):
    """Yield one agent command whose only argument is ``params["instance"]``.

    ``host_config`` is not used.
    """
    napi_host = params["instance"]
    yield SpecialAgentCommand(command_arguments=[napi_host])
# Special agent configuration: parameters are passed through unparsed and
# turned into command-line arguments by _agent_arguments.
special_agent_triton_wedge = SpecialAgentConfig(
    name="triton_wedge",
    parameter_parser=noop_parser,
    commands_function=_agent_arguments,
)

Binary file not shown.