Add Triton Wedge detector.
This commit is contained in:
parent
1b4fafb15e
commit
0833ae7a16
@ -0,0 +1,64 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# Parses and checks external VM IPs.
|
||||||
|
|
||||||
|
import json
|
||||||
|
from cmk.base.plugins.agent_based.agent_based_api.v1 import register, Result, Service, State
|
||||||
|
|
||||||
|
|
||||||
|
def parse_triton_wedge(string_table):
|
||||||
|
lookup = {}
|
||||||
|
|
||||||
|
for row in string_table:
|
||||||
|
nic = json.loads(row[0])
|
||||||
|
cn_name = nic["cn"]
|
||||||
|
vms_in_cn = lookup.setdefault(cn_name, [])
|
||||||
|
vms_in_cn.append(nic)
|
||||||
|
|
||||||
|
return lookup
|
||||||
|
|
||||||
|
|
||||||
|
register.agent_section(
|
||||||
|
name="triton_wedge",
|
||||||
|
parse_function=parse_triton_wedge
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def discover_triton_wedge(section):
|
||||||
|
for cn_name, vms in sorted(section.items()):
|
||||||
|
yield Service(item=cn_name, parameters={"name": cn_name})
|
||||||
|
|
||||||
|
|
||||||
|
def check_triton_wedge(item, params, section):
|
||||||
|
cn_name = params["name"]
|
||||||
|
vms = section.get(cn_name)
|
||||||
|
|
||||||
|
if vms is None:
|
||||||
|
yield Result(state=State.WARN, summary="Not appearing in NAPI")
|
||||||
|
return
|
||||||
|
|
||||||
|
wedged_vms = []
|
||||||
|
|
||||||
|
for vm in vms:
|
||||||
|
if vm["wedged"]:
|
||||||
|
wedged_vms.append(vm)
|
||||||
|
|
||||||
|
if len(wedged_vms) == 0:
|
||||||
|
yield Result(state=State.OK, summary="No wedge detected")
|
||||||
|
elif len(wedged_vms) == 1:
|
||||||
|
vm = wedged_vms[0]
|
||||||
|
summary = "Potential wedge detected for VM %s (%s)" % (vm["vm"], vm["ip"])
|
||||||
|
yield Result(state=State.WARN, summary=summary)
|
||||||
|
else:
|
||||||
|
lst = ", ".join(map(lambda vm: "VM %s (%s)" % (vm["vm"], vm["ip"]), wedged_vms))
|
||||||
|
yield Result(state=State.CRIT, summary=f"Likely wedged detected for {lst}")
|
||||||
|
|
||||||
|
|
||||||
|
register.check_plugin(
|
||||||
|
name="triton_wedge",
|
||||||
|
service_name="Triton Wedge CN %s",
|
||||||
|
discovery_function=discover_triton_wedge,
|
||||||
|
check_function=check_triton_wedge,
|
||||||
|
check_default_parameters={},
|
||||||
|
check_ruleset_name="triton_wedge",
|
||||||
|
)
|
165
check_mk-wedge/local/share/check_mk/agents/special/agent_triton_wedge
Executable file
165
check_mk-wedge/local/share/check_mk/agents/special/agent_triton_wedge
Executable file
@ -0,0 +1,165 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
# The range of ephemeral local ports we use when attempting to probe remote
|
||||||
|
# IPs. In the past, wedged ports appeared with a stride of 8; to be safe, we use
|
||||||
|
# a stride of 128.
|
||||||
|
PORT_RANGE_START = 57000
|
||||||
|
PORT_RANGE_END = 57128
|
||||||
|
CONNECT_RETRIES = 3
|
||||||
|
CHECK_REMOTE_PORTS = [443, 80]
|
||||||
|
CONCURRENT_SCANS = 200
|
||||||
|
|
||||||
|
|
||||||
|
import urllib.request, sys, argparse, asyncio, json, socket, errno
|
||||||
|
|
||||||
|
|
||||||
|
def get_url(url):
|
||||||
|
request = urllib.request.Request(url)
|
||||||
|
with urllib.request.urlopen(request) as conn:
|
||||||
|
data = conn.read()
|
||||||
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
# Fetch and parse details about active zone NICs on the external network.
|
||||||
|
def query_napi(addr):
|
||||||
|
url = 'http://%s/nics?nic_tag=external&belongs_to_type=zone&state=running' % addr
|
||||||
|
try:
|
||||||
|
json_data = get_url(url)
|
||||||
|
nics = json.loads(json_data)
|
||||||
|
return nics
|
||||||
|
except urllib.error.HTTPError as e:
|
||||||
|
sys.stderr.write("NAPI error: %s\n" % e)
|
||||||
|
|
||||||
|
|
||||||
|
# asyncio provides some nice connection methods, but none of them allow us to
|
||||||
|
# use SO_REUSEPORT. This flag is critical since we're repeatedly using the
|
||||||
|
# same range of local ports to port map remote IPs. So we have to resort to
|
||||||
|
# this low-level socket hackery to enable SO_REUSEPORT.
|
||||||
|
async def async_connect(src, dest):
|
||||||
|
loop = asyncio.get_event_loop()
|
||||||
|
|
||||||
|
sd = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||||
|
sd.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
|
||||||
|
sd.bind(src)
|
||||||
|
sd.setblocking(False)
|
||||||
|
|
||||||
|
connected = False
|
||||||
|
|
||||||
|
# We try to connect() several times, in case a packet got lost.
|
||||||
|
for attempt in range(CONNECT_RETRIES):
|
||||||
|
try:
|
||||||
|
future = loop.sock_connect(sd, dest)
|
||||||
|
await asyncio.wait_for(future, timeout=0.1)
|
||||||
|
connected = True
|
||||||
|
break
|
||||||
|
except ConnectionRefusedError:
|
||||||
|
# ECONNREFUSED (we received a RST after sending an ACK). If we
|
||||||
|
# receive this there's no point retrying.
|
||||||
|
break
|
||||||
|
except (TimeoutError, asyncio.TimeoutError):
|
||||||
|
# Usually you'd wait for the TCP stack to make its own retries, but
|
||||||
|
# we know our target IPs are in a nearby rack, so we don't want to
|
||||||
|
# wait that long. Ergo we do our own trying, with a fast timeout.
|
||||||
|
# If we hit here, a packet might have been lost, so try again.
|
||||||
|
pass
|
||||||
|
except OSError as e:
|
||||||
|
if e.errno == errno.EHOSTUNREACH:
|
||||||
|
# If there is no route, no point retrying either.
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
raise
|
||||||
|
|
||||||
|
sd.close()
|
||||||
|
return connected
|
||||||
|
|
||||||
|
|
||||||
|
# Check for a wedge on a NIC. We detect a wedge by doing the following:
|
||||||
|
#
|
||||||
|
# Us (local IP, local port) -----> Them (remote IP, remote port)
|
||||||
|
#
|
||||||
|
# 1. Find an open remote port (to speed things up we check ports 443 and 80)
|
||||||
|
# 2. Repeatedly connect() to the remote port while incrementing our local port
|
||||||
|
# 3. If we find a local port that fails to connect, this may be a wedge
|
||||||
|
#
|
||||||
|
async def check_for_wedge(nic, semaphore):
|
||||||
|
local_ip = "0.0.0.0"
|
||||||
|
remote_ip = nic["ip"]
|
||||||
|
|
||||||
|
can_connect = False
|
||||||
|
result = {
|
||||||
|
"cn": nic["cn_uuid"],
|
||||||
|
"vm": nic["belongs_to_uuid"],
|
||||||
|
"ip": nic["ip"],
|
||||||
|
"wedged": False
|
||||||
|
}
|
||||||
|
|
||||||
|
async with semaphore:
|
||||||
|
# To speed things up, we only check ports 443 and 80, which are the
|
||||||
|
# most common ports on the Internet.
|
||||||
|
for remote_port in CHECK_REMOTE_PORTS:
|
||||||
|
if can_connect:
|
||||||
|
break
|
||||||
|
|
||||||
|
for local_port in range(PORT_RANGE_START, PORT_RANGE_END):
|
||||||
|
src = (local_ip, local_port)
|
||||||
|
dest = (remote_ip, remote_port)
|
||||||
|
connected = await async_connect(src, dest)
|
||||||
|
|
||||||
|
if can_connect and not connected:
|
||||||
|
result["wedged"] = True
|
||||||
|
return result
|
||||||
|
elif connected:
|
||||||
|
can_connect = True
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
# Given an array of nics, scan the ports on each nic's IP address, checking if
|
||||||
|
# any appear to be wedged.
|
||||||
|
async def scan(nics):
|
||||||
|
sem = asyncio.Semaphore(CONCURRENT_SCANS)
|
||||||
|
tasks = map(lambda nic: asyncio.create_task(check_for_wedge(nic, sem)), nics)
|
||||||
|
done, pending = await asyncio.wait(tasks)
|
||||||
|
return map(lambda f: f.result(), done)
|
||||||
|
|
||||||
|
|
||||||
|
# Print out all our results in a format that CheckMK understands. Most of our
|
||||||
|
# output are in JSON rows.
|
||||||
|
def print_out(scan_results, agent_name):
|
||||||
|
sys.stdout.write(f"<<<{agent_name}:sep(0)>>>\n")
|
||||||
|
scan_results = list(scan_results)
|
||||||
|
scan_results.sort(key=lambda d: d["cn"] + d["vm"] + d["ip"])
|
||||||
|
for entry in scan_results:
|
||||||
|
sys.stdout.write("%s\n" % json.dumps(entry))
|
||||||
|
|
||||||
|
|
||||||
|
# Parse the command-line arguments, specifically for hostname. Print out help
|
||||||
|
# to console if we get no args.
|
||||||
|
def parse_arguments(argv):
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument(
|
||||||
|
"hostname", metavar="HOSTNAME", help="Hostname of NAPI to query."
|
||||||
|
)
|
||||||
|
return parser.parse_args(argv)
|
||||||
|
|
||||||
|
|
||||||
|
# Parse args, contact NAPI, query external IPs for VMs, and then print results
|
||||||
|
def main(argv=None):
|
||||||
|
if argv is None:
|
||||||
|
argv = sys.argv[1:]
|
||||||
|
|
||||||
|
args = parse_arguments(argv)
|
||||||
|
nics = query_napi(args.hostname)
|
||||||
|
|
||||||
|
# Sort the IPs so that (tend) to scan them in relative order. This is to
|
||||||
|
# increase the time between scans to the same IP due to consecutive agent
|
||||||
|
# executions, otherwise there's a higher chance we bump into TIME_WAIT.
|
||||||
|
#nics.sort(key=lambda d: d["ip"])
|
||||||
|
|
||||||
|
scan_results = asyncio.run(scan(nics))
|
||||||
|
print_out(scan_results, "triton_wedge")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
|
|
@ -0,0 +1,7 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
|
||||||
|
def agent_triton_wedge(params, hostname, ipaddress):
|
||||||
|
return [params["instance"]]
|
||||||
|
|
||||||
|
special_agent_info["triton_wedge"] = agent_triton_wedge
|
@ -0,0 +1,42 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# GUI config page for triton_wedge.
|
||||||
|
|
||||||
|
from cmk.gui.i18n import _
|
||||||
|
from cmk.gui.plugins.wato.utils import (
|
||||||
|
rulespec_registry,
|
||||||
|
HostRulespec,
|
||||||
|
RulespecGroupCheckParametersHardware
|
||||||
|
)
|
||||||
|
from cmk.gui.watolib.rulespecs import Rulespec
|
||||||
|
from cmk.gui.valuespec import (
|
||||||
|
Dictionary,
|
||||||
|
Hostname,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _valuespec_special_agents_triton_wedge():
|
||||||
|
return Dictionary(
|
||||||
|
title=_("Triton Wedge Detection"),
|
||||||
|
help=_(""),
|
||||||
|
elements=[
|
||||||
|
(
|
||||||
|
"instance",
|
||||||
|
Hostname(
|
||||||
|
title=_("Hostname"),
|
||||||
|
help=_("Hostname or IP of NAPI to query"),
|
||||||
|
allow_empty=False,
|
||||||
|
),
|
||||||
|
),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
rulespec_registry.register(
|
||||||
|
HostRulespec(
|
||||||
|
factory_default=Rulespec.FACTORY_DEFAULT_UNUSED,
|
||||||
|
name="special_agents:triton_wedge",
|
||||||
|
group=RulespecGroupCheckParametersHardware,
|
||||||
|
valuespec=_valuespec_special_agents_triton_wedge,
|
||||||
|
)
|
||||||
|
)
|
BIN
check_mk-wedge/triton_wedge-0.1.0.mkp
Executable file
BIN
check_mk-wedge/triton_wedge-0.1.0.mkp
Executable file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user