Compare commits
No commits in common. "ffafe3c4f6c37bf502422e2a5e08898047a6d3c0" and "9e0e13a636041c36b5196e070a368ca5ddafac8a" have entirely different histories.
ffafe3c4f6
...
9e0e13a636
BIN
check_mk-cisco_ip_sla/2.2/cisco_ip_sla-1.0.1.mkp
Normal file
BIN
check_mk-cisco_ip_sla/2.2/cisco_ip_sla-1.0.1.mkp
Normal file
Binary file not shown.
@ -0,0 +1,208 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Copyright (C) 2019 tribe29 GmbH - License: GNU General Public License v2
|
||||||
|
# This file is part of Checkmk (https://checkmk.com). It is subject to the terms and
|
||||||
|
# conditions defined in the file COPYING, which is part of this source code package.
|
||||||
|
|
||||||
|
from cmk.gui.i18n import _
|
||||||
|
from cmk.gui.plugins.wato.utils import (
|
||||||
|
CheckParameterRulespecWithItem,
|
||||||
|
rulespec_registry,
|
||||||
|
RulespecGroupCheckParametersNetworking,
|
||||||
|
)
|
||||||
|
from cmk.gui.valuespec import Dictionary, DropdownChoice, Integer, TextInput, Tuple
|
||||||
|
|
||||||
|
|
||||||
|
def _item_spec_cisco_ip_sla():
    """Build the input field used to select the item of a Cisco IP SLA rule.

    The item is entered as free text under the title
    "RTT row index of the service"; an empty value is accepted.
    """
    item_field = TextInput(
        allow_empty=True,
        title=_("RTT row index of the service"),
    )
    return item_field
|
||||||
|
|
||||||
|
|
||||||
|
def _occurred_choice(title):
    """Return a yes/no dropdown with the given title, defaulting to "no"."""
    return DropdownChoice(
        title=title,
        choices=[
            ("yes", _("yes")),
            ("no", _("no")),
        ],
        default_value="no",
    )


def _warn_crit_levels(title, unit, warn_default, crit_default, **tuple_kwargs):
    """Return a (warning, critical) pair of integer levels, each at least 1.

    Extra keyword arguments (for example ``help``) are passed on unchanged to
    the enclosing Tuple.
    """
    return Tuple(
        title=title,
        elements=[
            Integer(
                title=_("Warning at"),
                unit=unit,
                minvalue=1,
                default_value=warn_default,
            ),
            Integer(
                title=_("Critical at"),
                unit=unit,
                minvalue=1,
                default_value=crit_default,
            ),
        ],
        **tuple_kwargs,
    )


def _parameter_valuespec_cisco_ip_sla():
    """Build the parameter dictionary of the "Cisco IP SLA" rule set.

    Returns a Dictionary whose elements describe the expected RTT type, the
    threshold, the expected state, three yes/no "occurred" flags, warning and
    critical levels for the latest RTT completion time, the expected latest
    RTT state and warning/critical levels for lost packets in each direction.

    Only the displayed titles, labels and help texts had their spelling
    corrected. The dictionary keys and the stored choice values keep their
    original spelling (e.g. "timeout_occured", "over treshold").
    NOTE(review): presumably the check plug-in and already saved rules refer
    to these keys/values, so they must not be renamed here -- confirm.
    """
    # The same explanation applies to both time based settings.
    precision_help = _(
        "Depending on the precision the unit can be "
        "either milliseconds or microseconds."
    )

    return Dictionary(
        elements=[
            (
                "rtt_type",
                DropdownChoice(
                    title=_("RTT type"),
                    choices=[
                        ("echo", _("echo")),
                        ("path echo", _("path echo")),
                        ("file IO", _("file IO")),
                        ("UDP echo", _("UDP echo")),
                        ("TCP connect", _("TCP connect")),
                        ("HTTP", _("HTTP")),
                        ("DNS", _("DNS")),
                        ("jitter", _("jitter")),
                        ("DLSw", _("DLSw")),
                        ("DHCP", _("DHCP")),
                        ("FTP", _("FTP")),
                        ("VoIP", _("VoIP")),
                        ("RTP", _("RTP")),
                        ("LSP group", _("LSP group")),
                        ("ICMP jitter", _("ICMP jitter")),
                        ("LSP ping", _("LSP ping")),
                        ("LSP trace", _("LSP trace")),
                        ("ethernet ping", _("ethernet ping")),
                        ("ethernet jitter", _("ethernet jitter")),
                        ("LSP ping pseudowire", _("LSP ping pseudowire")),
                    ],
                    default_value="echo",
                ),
            ),
            (
                "threshold",
                Integer(
                    title=_("Threshold"),
                    help=precision_help,
                    unit=_("ms/us"),
                    minvalue=1,
                    default_value=5000,
                ),
            ),
            (
                "state",
                DropdownChoice(
                    title=_("State"),
                    choices=[
                        ("active", _("active")),
                        ("inactive", _("inactive")),
                        ("reset", _("reset")),
                        ("orderly stop", _("orderly stop")),
                        ("immediate stop", _("immediate stop")),
                        ("pending", _("pending")),
                        ("restart", _("restart")),
                    ],
                    default_value="active",
                ),
            ),
            # Keys below keep their historical spelling; only titles are fixed.
            (
                "connection_lost_occured",
                _occurred_choice(_("Connection lost occurred")),
            ),
            (
                "timeout_occured",
                _occurred_choice(_("Timeout occurred")),
            ),
            (
                "completion_time_over_treshold_occured",
                _occurred_choice(_("Completion time over threshold occurred")),
            ),
            (
                "latest_rtt_completion_time",
                _warn_crit_levels(
                    _("Latest RTT completion time"),
                    _("ms/us"),
                    100,
                    200,
                    help=precision_help,
                ),
            ),
            (
                "latest_rtt_state",
                DropdownChoice(
                    title=_("Latest RTT state"),
                    choices=[
                        ("ok", _("OK")),
                        ("disconnected", _("disconnected")),
                        # Stored value keeps the old spelling, the label does not.
                        ("over treshold", _("over threshold")),
                        ("timeout", _("timeout")),
                        ("other", _("other")),
                    ],
                    default_value="ok",
                ),
            ),
            (
                "packets_lost_src->dest",
                _warn_crit_levels(
                    _("Packets lost src->dest"),
                    _("packets"),
                    100,
                    1000,
                ),
            ),
            (
                "packets_lost_dest->src",
                _warn_crit_levels(
                    _("Packets lost dest->src"),
                    _("packets"),
                    100,
                    1000,
                ),
            ),
        ],
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Register the "Cisco IP SLA" check parameter rule set (rule with item) in the
# networking group, wiring in the item and parameter valuespec factories.
rulespec_registry.register(
    CheckParameterRulespecWithItem(
        title=lambda: _("Cisco IP SLA"),
        group=RulespecGroupCheckParametersNetworking,
        check_group_name="cisco_ip_sla",
        match_type="dict",
        item_spec=_item_spec_cisco_ip_sla,
        parameter_valuespec=_parameter_valuespec_cisco_ip_sla,
    )
)
|
@ -28,12 +28,8 @@ def check_triton_wedge(item, params, section):
|
|||||||
vms = section
|
vms = section
|
||||||
wedged_vms = []
|
wedged_vms = []
|
||||||
|
|
||||||
if len(vms) == 1 and vms[0].get("error"):
|
|
||||||
yield Result(state=State.UNKNOWN, summary=vms[0]["error"])
|
|
||||||
return
|
|
||||||
|
|
||||||
for vm in vms:
|
for vm in vms:
|
||||||
if vm["wedged"] == "probably":
|
if vm["wedged"]:
|
||||||
wedged_vms.append(vm)
|
wedged_vms.append(vm)
|
||||||
|
|
||||||
if len(wedged_vms) == 0:
|
if len(wedged_vms) == 0:
|
||||||
|
@ -11,14 +11,13 @@ PORT_RANGE_START = 50000
|
|||||||
PORT_RANGE_END = 65504
|
PORT_RANGE_END = 65504
|
||||||
# How often and for how long to attempt to connect to a VM
|
# How often and for how long to attempt to connect to a VM
|
||||||
CONNECT_RETRIES = 3
|
CONNECT_RETRIES = 3
|
||||||
CONNECT_TIMEOUT = 0.5 # seconds
|
CONNECT_TIMEOUT = 1 # seconds
|
||||||
# Remote ports, in order, to attempt to connect to. More ports means higher
|
# Remote ports, in order, to attempt to connect to. More ports means higher
|
||||||
# chance of being able to test for a wedge, but also takes more time.
|
# chance of being able to test for a wedge, but also takes more time.
|
||||||
CHECK_REMOTE_PORTS = [443, 80]
|
CHECK_REMOTE_PORTS = [443, 80]
|
||||||
# How many VMs we'll be portmapping concurrently.
|
# How many VMs we'll be portmapping concurrently.
|
||||||
CONCURRENT_SCANS = 200
|
CONCURRENT_SCANS = 200
|
||||||
NAPI_TIMEOUT = 10 # seconds
|
NAPI_TIMEOUT = 10 # seconds
|
||||||
AGENT_NAME = "triton_wedge"
|
|
||||||
|
|
||||||
|
|
||||||
# This is a hackish lookup to quickly convert CN UUIDs to host names.
|
# This is a hackish lookup to quickly convert CN UUIDs to host names.
|
||||||
@ -82,8 +81,8 @@ def query_napi(addr):
|
|||||||
json_data = get_url(url, NAPI_TIMEOUT)
|
json_data = get_url(url, NAPI_TIMEOUT)
|
||||||
nics = json.loads(json_data)
|
nics = json.loads(json_data)
|
||||||
return nics
|
return nics
|
||||||
except (urllib.error.HTTPError, urllib.error.URLError):
|
except urllib.error.HTTPError as e:
|
||||||
return None
|
sys.stderr.write("NAPI error: %s\n" % e)
|
||||||
|
|
||||||
|
|
||||||
# asyncio provides some nice connection methods, but none of them allow us to
|
# asyncio provides some nice connection methods, but none of them allow us to
|
||||||
@ -149,6 +148,7 @@ def calculate_local_port_range():
|
|||||||
# 1. Find an open remote port (to speed things up we check ports 443 and 80)
|
# 1. Find an open remote port (to speed things up we check ports 443 and 80)
|
||||||
# 2. Repeatedly connect() to the remote port while incrementing our local port
|
# 2. Repeatedly connect() to the remote port while incrementing our local port
|
||||||
# 3. If we find a local port that fails to connect, this may be a wedge
|
# 3. If we find a local port that fails to connect, this may be a wedge
|
||||||
|
#
|
||||||
async def check_for_wedge(nic, semaphore):
|
async def check_for_wedge(nic, semaphore):
|
||||||
local_ip = "0.0.0.0"
|
local_ip = "0.0.0.0"
|
||||||
remote_ip = nic["ip"]
|
remote_ip = nic["ip"]
|
||||||
@ -165,7 +165,7 @@ async def check_for_wedge(nic, semaphore):
|
|||||||
"cn": cn,
|
"cn": cn,
|
||||||
"vm": nic["belongs_to_uuid"],
|
"vm": nic["belongs_to_uuid"],
|
||||||
"ip": nic["ip"],
|
"ip": nic["ip"],
|
||||||
"wedged": "no"
|
"wedged": False
|
||||||
}
|
}
|
||||||
|
|
||||||
async with semaphore:
|
async with semaphore:
|
||||||
@ -185,13 +185,13 @@ async def check_for_wedge(nic, semaphore):
|
|||||||
connected = await async_connect(src, dest)
|
connected = await async_connect(src, dest)
|
||||||
except OSError as e:
|
except OSError as e:
|
||||||
if e.errno == errno.EADDRINUSE:
|
if e.errno == errno.EADDRINUSE:
|
||||||
result["wedged"] = "temporary local port collision"
|
result["wedged"] = None
|
||||||
return result
|
return result
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
|
|
||||||
if can_connect and not connected:
|
if can_connect and not connected:
|
||||||
result["wedged"] = "probably"
|
result["wedged"] = True
|
||||||
return result
|
return result
|
||||||
elif connected:
|
elif connected:
|
||||||
can_connect = True
|
can_connect = True
|
||||||
@ -208,20 +208,9 @@ async def scan(nics):
|
|||||||
return map(lambda f: f.result(), done)
|
return map(lambda f: f.result(), done)
|
||||||
|
|
||||||
|
|
||||||
# This is only used when we cannot contact NAPI. Ops wants to know when this
|
|
||||||
# happens, but if we cannot contact NAPI then we also don't know about any CNs.
|
|
||||||
# We use information in HOSTNAME_LOOKUP to still be able to report about
|
|
||||||
# a lack of contact to NAPI, even when we cannot contact NAPI.
|
|
||||||
def print_out_napi_err():
    """Write a "Cannot contact NAPI" error row for every known host.

    For each host name in HOSTNAME_LOOKUP this writes three lines to stdout:
    a ``<<<<host>>>>`` header, the ``<<<AGENT_NAME:sep(0)>>>`` section header
    and one JSON row carrying the error message.

    NOTE(review): the indentation of the original is not visible in this
    view; all three writes are assumed to happen once per host -- confirm.
    """
    error_row = '{"error": "Cannot contact NAPI"}\n'
    for hostname in HOSTNAME_LOOKUP.values():
        sys.stdout.writelines(
            (
                f"<<<<{hostname}>>>>\n",
                f"<<<{AGENT_NAME}:sep(0)>>>\n",
                error_row,
            )
        )
|
|
||||||
|
|
||||||
|
|
||||||
# Print out all our results in a format that CheckMK understands. Most of our
|
# Print out all our results in a format that CheckMK understands. Most of our
|
||||||
# output are in JSON rows.
|
# output are in JSON rows.
|
||||||
def print_out(scan_results):
|
def print_out(scan_results, agent_name):
|
||||||
scan_results = list(scan_results)
|
scan_results = list(scan_results)
|
||||||
scan_results.sort(key=lambda d: d["cn"] + d["vm"] + d["ip"])
|
scan_results.sort(key=lambda d: d["cn"] + d["vm"] + d["ip"])
|
||||||
|
|
||||||
@ -233,7 +222,7 @@ def print_out(scan_results):
|
|||||||
if curr_cn != last_cn:
|
if curr_cn != last_cn:
|
||||||
last_cn = curr_cn
|
last_cn = curr_cn
|
||||||
sys.stdout.write(f"<<<<{curr_cn}>>>>\n")
|
sys.stdout.write(f"<<<<{curr_cn}>>>>\n")
|
||||||
sys.stdout.write(f"<<<{AGENT_NAME}:sep(0)>>>\n")
|
sys.stdout.write(f"<<<{agent_name}:sep(0)>>>\n")
|
||||||
|
|
||||||
sys.stdout.write("%s\n" % json.dumps({
|
sys.stdout.write("%s\n" % json.dumps({
|
||||||
"vm": entry["vm"],
|
"vm": entry["vm"],
|
||||||
@ -260,13 +249,13 @@ def main(argv=None):
|
|||||||
args = parse_arguments(argv)
|
args = parse_arguments(argv)
|
||||||
nics = query_napi(args.hostname)
|
nics = query_napi(args.hostname)
|
||||||
|
|
||||||
if nics == None:
|
# Sort the IPs so that we tend to scan them in relative order. This is to
|
||||||
print_out_napi_err()
|
# increase the time between scans to the same IP due to consecutive agent
|
||||||
# We must return 0, or CheckMK won't check the output we're returning
|
# executions, otherwise there's a higher chance we bump into TIME_WAIT.
|
||||||
sys.exit(0)
|
#nics.sort(key=lambda d: d["ip"])
|
||||||
|
|
||||||
scan_results = asyncio.run(scan(nics))
|
scan_results = asyncio.run(scan(nics))
|
||||||
print_out(scan_results)
|
print_out(scan_results, "triton_wedge")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
BIN
wedge/triton_wedge-0.3.0.mkp
Executable file
BIN
wedge/triton_wedge-0.3.0.mkp
Executable file
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user