From 0833ae7a16b3c74147805dbbdd4c3d6657b2ef6c Mon Sep 17 00:00:00 2001 From: Marsell Kukuljevic Date: Tue, 1 Apr 2025 20:07:06 +0200 Subject: [PATCH] Add Triton Wedge detector. --- .../base/plugins/agent_based/triton_wedge.py | 64 +++++++ .../agents/special/agent_triton_wedge | 165 ++++++++++++++++++ .../share/check_mk/checks/agent_triton_wedge | 7 + .../check_mk/web/plugins/wato/triton_wedge.py | 42 +++++ check_mk-wedge/triton_wedge-0.1.0.mkp | Bin 0 -> 4234 bytes 5 files changed, 278 insertions(+) create mode 100644 check_mk-wedge/local/lib/check_mk/base/plugins/agent_based/triton_wedge.py create mode 100755 check_mk-wedge/local/share/check_mk/agents/special/agent_triton_wedge create mode 100644 check_mk-wedge/local/share/check_mk/checks/agent_triton_wedge create mode 100644 check_mk-wedge/local/share/check_mk/web/plugins/wato/triton_wedge.py create mode 100755 check_mk-wedge/triton_wedge-0.1.0.mkp diff --git a/check_mk-wedge/local/lib/check_mk/base/plugins/agent_based/triton_wedge.py b/check_mk-wedge/local/lib/check_mk/base/plugins/agent_based/triton_wedge.py new file mode 100644 index 0000000..d5c966e --- /dev/null +++ b/check_mk-wedge/local/lib/check_mk/base/plugins/agent_based/triton_wedge.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +# +# Parses and checks external VM IPs. + +import json +from cmk.base.plugins.agent_based.agent_based_api.v1 import register, Result, Service, State + + +def parse_triton_wedge(string_table): + lookup = {} + + for row in string_table: + nic = json.loads(row[0]) + cn_name = nic["cn"] + vms_in_cn = lookup.setdefault(cn_name, []) + vms_in_cn.append(nic) + + return lookup + + +register.agent_section( + name="triton_wedge", + parse_function=parse_triton_wedge +) + + +def discover_triton_wedge(section): + for cn_name, vms in sorted(section.items()): + yield Service(item=cn_name, parameters={"name": cn_name}) + + +def check_triton_wedge(item, params, section): + cn_name = params["name"] + vms = section.get(cn_name) + + if vms is None: + yield Result(state=State.WARN, summary="Not appearing in NAPI") + return + + wedged_vms = [] + + for vm in vms: + if vm["wedged"]: + wedged_vms.append(vm) + + if len(wedged_vms) == 0: + yield Result(state=State.OK, summary="No wedge detected") + elif len(wedged_vms) == 1: + vm = wedged_vms[0] + summary = "Potential wedge detected for VM %s (%s)" % (vm["vm"], vm["ip"]) + yield Result(state=State.WARN, summary=summary) + else: + lst = ", ".join(map(lambda vm: "VM %s (%s)" % (vm["vm"], vm["ip"]), wedged_vms)) + yield Result(state=State.CRIT, summary=f"Likely wedged detected for {lst}") + + +register.check_plugin( + name="triton_wedge", + service_name="Triton Wedge CN %s", + discovery_function=discover_triton_wedge, + check_function=check_triton_wedge, + check_default_parameters={}, + check_ruleset_name="triton_wedge", +) diff --git a/check_mk-wedge/local/share/check_mk/agents/special/agent_triton_wedge b/check_mk-wedge/local/share/check_mk/agents/special/agent_triton_wedge new file mode 100755 index 0000000..3d7cbb2 --- /dev/null +++ b/check_mk-wedge/local/share/check_mk/agents/special/agent_triton_wedge @@ -0,0 +1,165 @@ +#!/usr/bin/env python3 + +# The range of ephemeral local ports we use when attempting to probe remote +# IPs. In the past, wedged ports appeared with a stride of 8; to be safe, we use +# a stride of 128. +PORT_RANGE_START = 57000 +PORT_RANGE_END = 57128 +CONNECT_RETRIES = 3 +CHECK_REMOTE_PORTS = [443, 80] +CONCURRENT_SCANS = 200 + + +import urllib.request, sys, argparse, asyncio, json, socket, errno + + +def get_url(url): + request = urllib.request.Request(url) + with urllib.request.urlopen(request) as conn: + data = conn.read() + return data + + +# Fetch and parse details about active zone NICs on the external network. +def query_napi(addr): + url = 'http://%s/nics?nic_tag=external&belongs_to_type=zone&state=running' % addr + try: + json_data = get_url(url) + nics = json.loads(json_data) + return nics + except urllib.error.HTTPError as e: + sys.stderr.write("NAPI error: %s\n" % e) + + +# asyncio provides some nice connection methods, but none of them allow us to +# use SO_REUSEPORT. This flag is critical since we're repeatedly using the +# same range of local ports to port map remote IPs. So we have to resort to +# this low-level socket hackery to enable SO_REUSEPORT. +async def async_connect(src, dest): + loop = asyncio.get_event_loop() + + sd = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sd.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1) + sd.bind(src) + sd.setblocking(False) + + connected = False + + # We try to connect() several times, in case a packet got lost. + for attempt in range(CONNECT_RETRIES): + try: + future = loop.sock_connect(sd, dest) + await asyncio.wait_for(future, timeout=0.1) + connected = True + break + except ConnectionRefusedError: + # ECONNREFUSED (we received a RST after sending an ACK). If we + # receive this there's no point retrying. + break + except (TimeoutError, asyncio.TimeoutError): + # Usually you'd wait for the TCP stack to make its own retries, but + # we know our target IPs are in a nearby rack, so we don't want to + # wait that long. Ergo we do our own trying, with a fast timeout. + # If we hit here, a packet might have been lost, so try again. + pass + except OSError as e: + if e.errno == errno.EHOSTUNREACH: + # If there is no route, no point retrying either. + break + else: + raise + + sd.close() + return connected + + +# Check for a wedge on a NIC. We detect a wedge by doing the following: +# +# Us (local IP, local port) -----> Them (remote IP, remote port) +# +# 1. Find an open remote port (to speed things up we check ports 443 and 80) +# 2. Repeatedly connect() to the remote port while incrementing our local port +# 3. If we find a local port that fails to connect, this may be a wedge +# +async def check_for_wedge(nic, semaphore): + local_ip = "0.0.0.0" + remote_ip = nic["ip"] + + can_connect = False + result = { + "cn": nic["cn_uuid"], + "vm": nic["belongs_to_uuid"], + "ip": nic["ip"], + "wedged": False + } + + async with semaphore: + # To speed things up, we only check ports 443 and 80, which are the + # most common ports on the Internet. + for remote_port in CHECK_REMOTE_PORTS: + if can_connect: + break + + for local_port in range(PORT_RANGE_START, PORT_RANGE_END): + src = (local_ip, local_port) + dest = (remote_ip, remote_port) + connected = await async_connect(src, dest) + + if can_connect and not connected: + result["wedged"] = True + return result + elif connected: + can_connect = True + + return result + + +# Given an array of nics, scan the ports on each nic's IP address, checking if +# any appear to be wedged. +async def scan(nics): + sem = asyncio.Semaphore(CONCURRENT_SCANS) + tasks = map(lambda nic: asyncio.create_task(check_for_wedge(nic, sem)), nics) + done, pending = await asyncio.wait(tasks) + return map(lambda f: f.result(), done) + + +# Print out all our results in a format that CheckMK understands. Most of our +# output are in JSON rows. +def print_out(scan_results, agent_name): + sys.stdout.write(f"<<<{agent_name}:sep(0)>>>\n") + scan_results = list(scan_results) + scan_results.sort(key=lambda d: d["cn"] + d["vm"] + d["ip"]) + for entry in scan_results: + sys.stdout.write("%s\n" % json.dumps(entry)) + + +# Parse the command-line arguments, specifically for hostname. Print out help +# to console if we get no args. +def parse_arguments(argv): + parser = argparse.ArgumentParser() + parser.add_argument( + "hostname", metavar="HOSTNAME", help="Hostname of NAPI to query." + ) + return parser.parse_args(argv) + + +# Parse args, contact NAPI, query external IPs for VMs, and then print results +def main(argv=None): + if argv is None: + argv = sys.argv[1:] + + args = parse_arguments(argv) + nics = query_napi(args.hostname) + + # Sort the IPs so that (tend) to scan them in relative order. This is to + # increase the time between scans to the same IP due to consecutive agent + # executions, otherwise there's a higher chance we bump into TIME_WAIT. + #nics.sort(key=lambda d: d["ip"]) + + scan_results = asyncio.run(scan(nics)) + print_out(scan_results, "triton_wedge") + + +if __name__ == "__main__": + sys.exit(main()) + diff --git a/check_mk-wedge/local/share/check_mk/checks/agent_triton_wedge b/check_mk-wedge/local/share/check_mk/checks/agent_triton_wedge new file mode 100644 index 0000000..2cfe785 --- /dev/null +++ b/check_mk-wedge/local/share/check_mk/checks/agent_triton_wedge @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 + + +def agent_triton_wedge(params, hostname, ipaddress): + return [params["instance"]] + +special_agent_info["triton_wedge"] = agent_triton_wedge diff --git a/check_mk-wedge/local/share/check_mk/web/plugins/wato/triton_wedge.py b/check_mk-wedge/local/share/check_mk/web/plugins/wato/triton_wedge.py new file mode 100644 index 0000000..f734e27 --- /dev/null +++ b/check_mk-wedge/local/share/check_mk/web/plugins/wato/triton_wedge.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +# +# GUI config page for triton_wedge. + +from cmk.gui.i18n import _ +from cmk.gui.plugins.wato.utils import ( + rulespec_registry, + HostRulespec, + RulespecGroupCheckParametersHardware +) +from cmk.gui.watolib.rulespecs import Rulespec +from cmk.gui.valuespec import ( + Dictionary, + Hostname, +) + + +def _valuespec_special_agents_triton_wedge(): + return Dictionary( + title=_("Triton Wedge Detection"), + help=_(""), + elements=[ + ( + "instance", + Hostname( + title=_("Hostname"), + help=_("Hostname or IP of NAPI to query"), + allow_empty=False, + ), + ), + ], + ) + + +rulespec_registry.register( + HostRulespec( + factory_default=Rulespec.FACTORY_DEFAULT_UNUSED, + name="special_agents:triton_wedge", + group=RulespecGroupCheckParametersHardware, + valuespec=_valuespec_special_agents_triton_wedge, + ) +) diff --git a/check_mk-wedge/triton_wedge-0.1.0.mkp b/check_mk-wedge/triton_wedge-0.1.0.mkp new file mode 100755 index 0000000000000000000000000000000000000000..e6bd1288a0f3e361b62428174958bb732d34d004 GIT binary patch literal 4234 zcmaLZS2P;{pvG~rY8Is^HQOp`wJ5b>6}4$?ic&Fa6C+JTi`vv4wQ8@RLPhLVD>O#! zRRl3Y?U8%$!#(GobMLuN-#OpY|MABYLq$R*fCrHhcWsECYt23w%*_*X@spPtkmz06 zUtNl}OpfTLtp$Ow-Q9u#3Q~u0MTdXPI#mUBDuLfr1ubH6Guc#P*;F4rk#KUzr%sHn(wAFzywY2_?idLW> ziX7r)p;-jXr6y(uVp0!Qu6OOzC@n^ zy{nyXF60Or^KOF=ht0y?XmWn*)eJFKv=@JvKps%ud7HMLm)8H6bvj}R@d?eAY3`V> z_ArR0yxw*DC#1;?iqi=DXG~~P&}uZjeQVDyl7A)U7EZ?`i!O`&=jv&}flnqy*-52& zbm`O*@CIN6HgOSN#X!Z8Xf4@w9=JEjB3@Mb;*7lV1Djo`_1yA^O?Q{ho?EO zMQ-Su8iw}>5+D5Mn4?cVunT11v=BAx?`{jmDA+$FI&l|OkAG^+t!qO|U0mW~_Jyuz zeT=Dp3SU3RfvS%l`l_zA1gPP9;3izJkebHihZ>7D`3ASRx1p+uwJODyziFXY$FUXR?9QA2 z0uDQDaVZ>7AyYQwK{=ytw)Jaa$9GRE#oPwX=@W(8r9(#q3x?MU;ix<7W#+lJbg-hu z=~xs*f|WXgGC+1}3U?ErgeIjv6kOd3m~V@gj3E0pjx>}v+muP)b!{HW$U^#=g<_)o zLd@>r96m{}1WmT3?I1lWe(Y-;U+cyh*{dfQP39SDl@{UA_o*Vzuk0BM6)R}Zc1>B1$si|D^HeEv!>;$t-@c`BFQ853N6glG z>m84o8)-1(%G5I#*Hv1r(}_Jqu$07}JsfWa4|}GBy9e(RCa%)7y#E!gg{WTbYv}AW z%s$Z>gm_keoiWEvUf-1KCvG6v*)mVvdE~0eNW+wQ8L7zsuYpqf*6*OWt|!hs^N9G| zTzUiC)ux*rG3xM?gUE(~!YOjL2s}ACIdm9(hT!Y9x1L&?KwC2L9HJEH;3TeFNPnr? zAvLo*RHX|7ysJI)6?{Un=HY!?JTc$?C*v*(Wje$icS;AkBD^|kgQ@=}`ApqrH1g%_wpkBex1rMPo!QBnvZ=tgIys>l4 zbcfC+a0zj4iAWhrwId{daTMiGI!ydbAie4Ro*V5!{-1AlV`3~QGb)*mnb+y5&hIns ziWtFB@`EnN0=#iO9azBn{mXiVq5i-6pPI6Htt4x{`dzdv1QOx%u+&wZ@kV;mZaF>Z z!KlZAwZZ1%zpL%dFEZJA2Al4~`lB5!+Z10bPh+=8%YL^sx3y$hE6+G8p>u{2%gz1- zZyy4!!iAq=9@yMJ!h$2Rdrd548r3bwmRE}XH94PclVfYYLNY6Ol)vG^PI)TkRLA;9 zr_scw51XprL0vmxDh6+^l#Ek#X$3PXKl=Tq?_^+C+cBefHjFrQ^dsw@)a2hi{SjL| zS7V*RlYppYj~uG4Kc`%YVWRJZDkmb9FtQT;k%lps`lTOZ*>tR&Z}X@l&37CTEs_}p ziMRM-7a%#HSn>L0BLp$F*v>4SExOkUF%(;8?3gVJ@$!GHlH#}X>Kpm(Z(K+&3L};H zZvV7LMMXtT+gg;g$F&Cu>4eGV<$viROnpAZts*D&9{hcOHf(jBdZp6u&>bYA-w&Ai zj{NqRG@C$y!ZE9|Ea=d1#YT!WoM0mG1SQUpc{R4Obe~zuGa? z42JUh$S}Hp#`akk=$diVHEjoep%!@hRLJsY9oxSWO^E;sR%U8m&04|G{odi-5+xg$EZ_o3^H2)G z6e$!e0d^9&@}iw=^sF7fW+|~04Odfqn7?22SY4rR&oc)a@l$;!D?Ewhm9PY~is^|O zpYkY=DnU?nheeqh`djo4?U!G{*@aC%Tr-ru$JZ)UGH4dZh$&F&i2DK@1neqb4PS4o zNJ>oFhdCS-3-e&Rw*8io+iX75Q;w_(^Jrj;`P3*YsW!01OP=)>*eQqQEe-r zm*AZu++$-&iwW!YlzKY53-WlweD{&@6Xjd($u;g=3#2p-6IfAJQXTUrEE~&vUf5ZX zo81@7J<~oeib{Vc)e$tKF=q=a!@{RpFmHC1ol0RqBUCX&3_LeK*flBnyhLG2eEwJe z&dimh`Pg}Hzkt6`#d8X}21D`Bi?EIm(@u4m;~sZpNf;g^F7{Y}o5dTz|BF;kEUO`C zOF20BXUEdbz5OQY42SzB8SOT)!UDIvEhx!$`vYiT+Omxus3iTnn2vuL4jW-Z*&jSK za0nho;EK<)cls|L`NhnrFuMpq(jhSCsGH9^X>tJH4p;k1Qf}&aiU!~i^L@zm&P(~D z4t_mxgT-SQ7iY7(<%o@mBe{|wH22hr>yz^sEjVhctRTiY-h>CXP8e^r!PqmT`645_)P_2_3_ zmw6=jKyf1<$xM;{@=Uap<~4<{p@Zqw?++3Q*JT6F`|Cl=O>XG|_7QGYmR7#IMk+%^ z+7XsvdR9smzwC|(oTAP=;vv=+lJ1Hplvd)`4qw#H@ri-nEZ&eT8nk*j_~r7ogs>4g z(!+J8c&oUFqPVR0E<_JzRK;x(<$ZY>hITf@p5iDl9;0b>jn(x`k}G?Ab}dyu{kQPG zk^W?{rTz*M5n`3zd-$YFC(f0WK5TVhU|yTj!c++!G-X+hZqe(-86|t0*3IwhzArNA zyWHG3&oyZ-uh%T_oq!fMmXzWmEHS?Q&gYW4J9omF4PJY>nV+swl7zDU$%yl?w}FR0 zQso=IN|zbFk*jwTF|%bv^faR9XCbu^y0`Qp>Cc!S#ZaSMviD=k%v@Je4VE<=2{p zg*Qetll57n8;9Bk9GdE$<9Z_W$t}=3e3&Ol?!(%%-R&zZ`L}|cgN{1|1RO+mpcVRQU~swQt_m(|(tPN{5^@T@Rx(e(BgyUdF7 z1cLURN_N9XO0(@{>Oig?mHzam>m(bxeDw|e6v+D~S03;LFsYMI_{X$RK&E8F#5gJl zou&cudlp3ztY<>50*QPjtXc!S9sSzN`0}Y*el&e2XT#L$)Vx;^13`Rl*^=wL>|Ig` z@I8!bd$UkNux>yav>TC@eWnKd{97!>8-S*!b3>o5B3_zzx>2>yht>c%a|HqSVCz?h z*q+PXE1wUn08c|saqxFS+h%Jk70g8}SJ=^t6T&wy1^pNQoZMk&t9do3xz0I+I}5hi zSn2if21Zp$PXiJEI?kJ{mSkZu$J9;>nU(&@$z9V0|-Y z!HxB9dS_AU&!C8okc!z1%Dln3Xa-YS-@53_t*yq7AC*2sE2Ih;SM z`_;0px(zh|7h!S0xGYXyz2)gAnGGntNVb8-0aA`Z{1cn(6K6<5b!o#Xmvd@XBg)Lw z4-<5fjWzCqS1o?z`W!mRCMxu?u>I11NN4gGpuxaU;?aM>(y26~KQe0-!U~84VcO?`1N?bwbg~}4IcIGcbKR@oTNE?x!(tE(J=qoAn t?sJ;L4J)cO@v|F_n!y4TuCD(>apkPMKJ{0ARgU}^vW literal 0 HcmV?d00001