Compare commits

..

2 Commits

6 changed files with 30 additions and 223 deletions

View File

@ -1,208 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2019 tribe29 GmbH - License: GNU General Public License v2
# This file is part of Checkmk (https://checkmk.com). It is subject to the terms and
# conditions defined in the file COPYING, which is part of this source code package.
from cmk.gui.i18n import _
from cmk.gui.plugins.wato.utils import (
CheckParameterRulespecWithItem,
rulespec_registry,
RulespecGroupCheckParametersNetworking,
)
from cmk.gui.valuespec import Dictionary, DropdownChoice, Integer, TextInput, Tuple
def _item_spec_cisco_ip_sla():
    """Return the item input field of the Cisco IP SLA ruleset.

    The item is a free-text field titled "RTT row index of the service";
    an empty value is accepted.
    """
    item_title = _("RTT row index of the service")
    return TextInput(title=item_title, allow_empty=True)
def _yes_no_choice_cisco_ip_sla(title):
    """Return a yes/no dropdown with the given title, defaulting to "no"."""
    return DropdownChoice(
        title=title,
        choices=[
            ("yes", _("yes")),
            ("no", _("no")),
        ],
        default_value="no",
    )


def _levels_cisco_ip_sla(title, unit, default_warn, default_crit, help_text=None):
    """Return a (warning, critical) tuple of positive integers.

    ``help_text`` is only forwarded to the Tuple when it is given, so tuples
    without help are constructed exactly as if no help argument existed.
    """
    tuple_kwargs = {"title": title}
    if help_text is not None:
        tuple_kwargs["help"] = help_text
    return Tuple(
        elements=[
            Integer(
                title=_("Warning at"),
                unit=unit,
                minvalue=1,
                default_value=default_warn,
            ),
            Integer(
                title=_("Critical at"),
                unit=unit,
                minvalue=1,
                default_value=default_crit,
            ),
        ],
        **tuple_kwargs,
    )


def _rtt_unit_help_cisco_ip_sla():
    """Return the shared help text explaining the ambiguous "ms/us" unit."""
    return _(
        "Depending on the precision the unit can be "
        "either milliseconds or microseconds."
    )


def _parameter_valuespec_cisco_ip_sla():
    """Return the parameter dictionary of the Cisco IP SLA ruleset.

    The dictionary keys and the stored choice values are part of the
    persisted rule format and are therefore kept exactly as they were
    (including historic misspellings such as "occured" and "treshold").
    Only the texts shown to the user are spelled correctly.
    """
    return Dictionary(
        elements=[
            (
                "rtt_type",
                DropdownChoice(
                    title=_("RTT type"),
                    choices=[
                        ("echo", _("echo")),
                        ("path echo", _("path echo")),
                        ("file IO", _("file IO")),
                        ("UDP echo", _("UDP echo")),
                        ("TCP connect", _("TCP connect")),
                        ("HTTP", _("HTTP")),
                        ("DNS", _("DNS")),
                        ("jitter", _("jitter")),
                        ("DLSw", _("DLSw")),
                        ("DHCP", _("DHCP")),
                        ("FTP", _("FTP")),
                        ("VoIP", _("VoIP")),
                        ("RTP", _("RTP")),
                        ("LSP group", _("LSP group")),
                        ("ICMP jitter", _("ICMP jitter")),
                        ("LSP ping", _("LSP ping")),
                        ("LSP trace", _("LSP trace")),
                        ("ethernet ping", _("ethernet ping")),
                        ("ethernet jitter", _("ethernet jitter")),
                        ("LSP ping pseudowire", _("LSP ping pseudowire")),
                    ],
                    default_value="echo",
                ),
            ),
            (
                "threshold",
                Integer(
                    title=_("Threshold"),
                    help=_rtt_unit_help_cisco_ip_sla(),
                    unit=_("ms/us"),
                    minvalue=1,
                    default_value=5000,
                ),
            ),
            (
                "state",
                DropdownChoice(
                    title=_("State"),
                    choices=[
                        ("active", _("active")),
                        ("inactive", _("inactive")),
                        ("reset", _("reset")),
                        ("orderly stop", _("orderly stop")),
                        ("immediate stop", _("immediate stop")),
                        ("pending", _("pending")),
                        ("restart", _("restart")),
                    ],
                    default_value="active",
                ),
            ),
            # The misspelled keys below are kept on purpose: existing rules
            # are stored under exactly these names.
            (
                "connection_lost_occured",
                _yes_no_choice_cisco_ip_sla(_("Connection lost occurred")),
            ),
            (
                "timeout_occured",
                _yes_no_choice_cisco_ip_sla(_("Timeout occurred")),
            ),
            (
                "completion_time_over_treshold_occured",
                _yes_no_choice_cisco_ip_sla(
                    _("Completion time over threshold occurred")
                ),
            ),
            (
                "latest_rtt_completion_time",
                _levels_cisco_ip_sla(
                    _("Latest RTT completion time"),
                    _("ms/us"),
                    100,
                    200,
                    help_text=_rtt_unit_help_cisco_ip_sla(),
                ),
            ),
            (
                "latest_rtt_state",
                DropdownChoice(
                    title=_("Latest RTT state"),
                    choices=[
                        ("ok", _("OK")),
                        ("disconnected", _("disconnected")),
                        # Stored value keeps its historic spelling; only the
                        # displayed label is corrected.
                        ("over treshold", _("over threshold")),
                        ("timeout", _("timeout")),
                        ("other", _("other")),
                    ],
                    default_value="ok",
                ),
            ),
            (
                "packets_lost_src->dest",
                _levels_cisco_ip_sla(
                    _("Packets lost src->dest"), _("packets"), 100, 1000
                ),
            ),
            (
                "packets_lost_dest->src",
                _levels_cisco_ip_sla(
                    _("Packets lost dest->src"), _("packets"), 100, 1000
                ),
            ),
        ],
    )
# Register the "Cisco IP SLA" check parameter ruleset (with item) in the
# networking group, wiring in the item and parameter valuespec factories
# defined above.
rulespec_registry.register(
    CheckParameterRulespecWithItem(
        title=lambda: _("Cisco IP SLA"),
        group=RulespecGroupCheckParametersNetworking,
        check_group_name="cisco_ip_sla",
        match_type="dict",
        item_spec=_item_spec_cisco_ip_sla,
        parameter_valuespec=_parameter_valuespec_cisco_ip_sla,
    )
)

View File

@ -28,8 +28,12 @@ def check_triton_wedge(item, params, section):
vms = section
wedged_vms = []
if len(vms) == 1 and vms[0].get("error"):
yield Result(state=State.UNKNOWN, summary=vms[0]["error"])
return
for vm in vms:
if vm["wedged"]:
if vm["wedged"] == "probably":
wedged_vms.append(vm)
if len(wedged_vms) == 0:

View File

@ -11,13 +11,14 @@ PORT_RANGE_START = 50000
PORT_RANGE_END = 65504
# How often and for how long to attempt to connect to a VM
CONNECT_RETRIES = 3
CONNECT_TIMEOUT = 1 # seconds
CONNECT_TIMEOUT = 0.5 # seconds
# Remote ports, in order, to attempt to connect to. More ports means higher
# chance of being able to test for a wedge, but also takes more time.
CHECK_REMOTE_PORTS = [443, 80]
# How many VMs we'll be portmapping concurrently.
CONCURRENT_SCANS = 200
NAPI_TIMEOUT = 10 # seconds
AGENT_NAME = "triton_wedge"
# This is a hackish lookup to quickly convert CN UUIDs to host names.
@ -81,8 +82,8 @@ def query_napi(addr):
json_data = get_url(url, NAPI_TIMEOUT)
nics = json.loads(json_data)
return nics
except urllib.error.HTTPError as e:
sys.stderr.write("NAPI error: %s\n" % e)
except (urllib.error.HTTPError, urllib.error.URLError):
return None
# asyncio provides some nice connection methods, but none of them allow us to
@ -148,7 +149,6 @@ def calculate_local_port_range():
# 1. Find an open remote port (to speed things up we check ports 443 and 80)
# 2. Repeatedly connect() to the remote port while incrementing our local port
# 3. If we find a local port that fails to connect, this may be a wedge
#
async def check_for_wedge(nic, semaphore):
local_ip = "0.0.0.0"
remote_ip = nic["ip"]
@ -165,7 +165,7 @@ async def check_for_wedge(nic, semaphore):
"cn": cn,
"vm": nic["belongs_to_uuid"],
"ip": nic["ip"],
"wedged": False
"wedged": "no"
}
async with semaphore:
@ -185,13 +185,13 @@ async def check_for_wedge(nic, semaphore):
connected = await async_connect(src, dest)
except OSError as e:
if e.errno == errno.EADDRINUSE:
result["wedged"] = None
result["wedged"] = "temporary local port collision"
return result
else:
raise
if can_connect and not connected:
result["wedged"] = True
result["wedged"] = "probably"
return result
elif connected:
can_connect = True
@ -208,9 +208,20 @@ async def scan(nics):
return map(lambda f: f.result(), done)
# This is only used when we cannot contact NAPI. Ops wants to know when this
# happens, but if we cannot contact NAPI then we also don't know about any CNs.
# We use information in HOSTNAME_LOOKUP to still be able to report about
# a lack of contact to NAPI, even when we cannot contact NAPI.
def print_out_napi_err():
    """Report "Cannot contact NAPI" for every host in HOSTNAME_LOOKUP.

    For each host name three lines are written to stdout: a
    ``<<<<host>>>>`` header, the ``<<<AGENT_NAME:sep(0)>>>`` section header
    and a single JSON row carrying the error message.
    """
    section_header = f"<<<{AGENT_NAME}:sep(0)>>>\n"
    error_row = '{"error": "Cannot contact NAPI"}\n'
    emit = sys.stdout.write
    for cn_hostname in HOSTNAME_LOOKUP.values():
        emit(f"<<<<{cn_hostname}>>>>\n")
        emit(section_header)
        emit(error_row)
# Print out all our results in a format that CheckMK understands. Most of our
# output is in JSON rows.
def print_out(scan_results, agent_name):
def print_out(scan_results):
scan_results = list(scan_results)
scan_results.sort(key=lambda d: d["cn"] + d["vm"] + d["ip"])
@ -222,7 +233,7 @@ def print_out(scan_results, agent_name):
if curr_cn != last_cn:
last_cn = curr_cn
sys.stdout.write(f"<<<<{curr_cn}>>>>\n")
sys.stdout.write(f"<<<{agent_name}:sep(0)>>>\n")
sys.stdout.write(f"<<<{AGENT_NAME}:sep(0)>>>\n")
sys.stdout.write("%s\n" % json.dumps({
"vm": entry["vm"],
@ -249,13 +260,13 @@ def main(argv=None):
args = parse_arguments(argv)
nics = query_napi(args.hostname)
# Sort the IPs so that (tend) to scan them in relative order. This is to
# increase the time between scans to the same IP due to consecutive agent
# executions, otherwise there's a higher chance we bump into TIME_WAIT.
#nics.sort(key=lambda d: d["ip"])
if nics == None:
print_out_napi_err()
# We must return 0, or CheckMK won't check the output we're returning
sys.exit(0)
scan_results = asyncio.run(scan(nics))
print_out(scan_results, "triton_wedge")
print_out(scan_results)
if __name__ == "__main__":

Binary file not shown.

BIN
wedge/triton_wedge-0.3.1.mkp Executable file

Binary file not shown.