Allow Graylog metrics plugin to use its own custom EWMA windows.
This commit is contained in:
parent
8cd662e2bc
commit
0c882e643c
Binary file not shown.
BIN
graylog-metrics/2.4/graylog_input_metrics-1.1.0.mkp
Executable file
BIN
graylog-metrics/2.4/graylog_input_metrics-1.1.0.mkp
Executable file
Binary file not shown.
@ -15,6 +15,8 @@
|
|||||||
# "rs_m15_rate": 163.80530659055356}}
|
# "rs_m15_rate": 163.80530659055356}}
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
import time
|
||||||
|
import math
|
||||||
from cmk.agent_based.v2 import (
|
from cmk.agent_based.v2 import (
|
||||||
Result,
|
Result,
|
||||||
Service,
|
Service,
|
||||||
@ -23,11 +25,10 @@ from cmk.agent_based.v2 import (
|
|||||||
CheckPlugin,
|
CheckPlugin,
|
||||||
AgentSection,
|
AgentSection,
|
||||||
render,
|
render,
|
||||||
|
get_value_store,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def parse_graylog_input_metrics(section):
|
def parse_graylog_input_metrics(section):
|
||||||
if not section:
|
if not section:
|
||||||
return {}
|
return {}
|
||||||
@ -48,15 +49,91 @@ def render_msgs(num_msgs):
|
|||||||
|
|
||||||
|
|
||||||
check_configs = [
|
check_configs = [
|
||||||
("im_m1_rate", render_msgs, "Incoming messages/sec 1m"),
|
("im", "small", render_msgs, "Incoming msgs/sec small window"),
|
||||||
("im_m5_rate", render_msgs, "5m"),
|
("im", "medium", render_msgs, "Incoming msgs/sec medium window"),
|
||||||
("im_m15_rate", render_msgs, "15m"),
|
("im", "large", render_msgs, "Incoming msgs/sec large window"),
|
||||||
("rs_m1_rate", render.bytes, "Incoming bytes/sec 1m"),
|
("rs", "small", render.bytes, "Incoming bytes/sec small window"),
|
||||||
("rs_m5_rate", render.bytes, "5m"),
|
("rs", "medium", render.bytes, "Incoming bytes/sec medium window"),
|
||||||
("rs_m15_rate", render.bytes, "15m"),
|
("rs", "large", render.bytes, "Incoming bytes/sec large window"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
# Since Graylog only provides 1, 5 and 15 minute windows, when we're attempting
|
||||||
|
# to (semi-)reconstruct raw values we choose the best Graylog window to use.
|
||||||
|
# It's a balance between the delay and dilution in each window. We're trying
|
||||||
|
# to pick the Graylog window with the strongest signal.
|
||||||
|
def determine_metric_input(store, window_length):
    """Pick which native Graylog rate window to use as input for a user window.

    Args:
        store: Value store of the running check. Accepted so existing callers
            keep working; it is not consulted when choosing the window.
        window_length: Window length configured by the user, in minutes.

    Returns:
        1, 5 or 15 -- the length in minutes of the Graylog window whose rate
        should be read (i.e. the ``m1``, ``m5`` or ``m15`` rate).
    """
    # NOTE: an earlier draft also derived a "barrier" from the time elapsed
    # since the previous run, but it compared seconds against minutes and its
    # result was never used. Only the configured window drives the choice, so
    # the same Graylog window is picked on every run for a given rule.
    if window_length < 2.5:
        return 1
    if window_length < 10:
        return 5
    return 15
|
||||||
|
|
||||||
|
|
||||||
|
# We take two window lengths: the window length the user specified in the
|
||||||
|
# check rule (which can be an arbitrary length), and Graylog's native window
|
||||||
|
# (which is 1, 5 or 15 minutes). We then attempt to pull out the most recent
|
||||||
|
# value by effectively reversing the exponentially-weighted moving average
|
||||||
|
# (EWMA) window math on the Graylog window, and then creating our own EWMA
|
||||||
|
# value on our own window size.
|
||||||
|
#
|
||||||
|
# The rest of the logic is there for the various gotchas that come with these
|
||||||
|
# two transforms.
|
||||||
|
def calculate_ewma(store, window_name, window_length, graylog_window_length, newest_value):
    """Re-window a Graylog EWMA rate onto a user-defined EWMA window.

    The newest Graylog reading is first "un-averaged" to estimate the raw rate
    since the previous run, and that estimate is then folded into our own
    exponentially-weighted moving average.

    Args:
        store: Mutable mapping that persists between check runs. This function
            reads and writes the keys ``{window_name}_timestamp``,
            ``{window_name}_ewma`` and ``{window_name}_graylog``, and also
            writes the shared ``timestamp`` key.
        window_name: Prefix that keeps the state of one window apart from the
            state of every other window kept in the same store.
        window_length: Length of our own window, in minutes.
        graylog_window_length: Length of the Graylog window that produced
            ``newest_value``, in minutes.
        newest_value: Latest rate reported by Graylog for that window.

    Returns:
        The updated EWMA (never negative). On the first run, or when the
        previous state is incomplete, ``newest_value`` is returned unchanged.
        When no time has passed since the previous sample, the previous EWMA
        is returned unchanged.
    """
    now = time.time()

    timestamp_key = f"{window_name}_timestamp"
    ewma_key = f"{window_name}_ewma"
    graylog_key = f"{window_name}_graylog"

    previous_timestamp = store.get(timestamp_key)
    previous_ewma = store.get(ewma_key)
    previous_graylog = store.get(graylog_key)

    # We need to store both the time of this sample and the prior EWMA values
    # for Graylog and for our own window so the math can work -- we're adding
    # new values to a moving *average* after all.
    #
    # The timestamp is kept per window: several windows are evaluated within
    # the same check run, and with a single shared key every window after the
    # first would see a time delta of (almost) zero, which blows up the
    # division below.
    store[timestamp_key] = now
    # The shared key is still refreshed for any other reader of the store.
    store["timestamp"] = now
    store[graylog_key] = newest_value

    if previous_ewma is None:
        # Provide a seed on first run to speed up convergence.
        store[ewma_key] = newest_value
        return newest_value

    if previous_timestamp is None or previous_graylog is None:
        return newest_value

    # Our window and Graylog's window are both expressed in minutes, not
    # seconds, so the delta is converted to minutes too.
    time_delta = (now - previous_timestamp) / 60.0

    # No time has passed (or the clock stepped backwards): there is nothing
    # new to fold in, and the reverse transform would divide by zero.
    if time_delta <= 0:
        return previous_ewma if previous_ewma >= 0 else 0

    # Reverse Graylog's EWMA.
    raw_alpha = math.exp(-time_delta / graylog_window_length)
    raw_weight = 1 - raw_alpha
    if raw_weight <= 0:
        # The delta is too small to be represented; treat it as "no time passed".
        return previous_ewma if previous_ewma >= 0 else 0
    raw_value = (newest_value - raw_alpha * previous_graylog) / raw_weight

    # The reverse transform magnifies noise, which can push the estimate
    # below zero; a negative rate is meaningless, so clamp it.
    if raw_value < 0:
        raw_value = 0

    # Create our own EWMA.
    ewma_alpha = math.exp(-time_delta / window_length)
    ewma = ewma_alpha * previous_ewma + (1 - ewma_alpha) * raw_value

    store[ewma_key] = ewma

    # Another clamp, although probably not necessary.
    return ewma if ewma >= 0 else 0
|
||||||
|
|
||||||
|
|
||||||
def check_graylog_input_metrics(item, params, section):
|
def check_graylog_input_metrics(item, params, section):
|
||||||
item_id = item.split()[-1][1:-1]
|
item_id = item.split()[-1][1:-1]
|
||||||
input_info = section.get(item_id)
|
input_info = section.get(item_id)
|
||||||
@ -75,16 +152,35 @@ def check_graylog_input_metrics(item, params, section):
|
|||||||
if input_info["input_port"]:
|
if input_info["input_port"]:
|
||||||
yield Result(state=State.OK, summary="Port: %s" % input_info["input_port"])
|
yield Result(state=State.OK, summary="Port: %s" % input_info["input_port"])
|
||||||
|
|
||||||
for metric_name, render_func, label in check_configs:
|
store = get_value_store()
|
||||||
value = input_info.get(metric_name)
|
|
||||||
|
for prefix, window_name, render_func, label in check_configs:
|
||||||
|
metric_name = f"{prefix}_{window_name}_rate"
|
||||||
|
|
||||||
|
config = params.get(metric_name, {})
|
||||||
|
window_length = config.get("window")
|
||||||
|
levels_upper = config.get("upper")
|
||||||
|
levels_lower = config.get("lower")
|
||||||
|
|
||||||
|
if window_length is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
graylog_window_length = determine_metric_input(store, window_length)
|
||||||
|
metric_input = f"{prefix}_m{graylog_window_length}_rate"
|
||||||
|
|
||||||
|
value = input_info.get(metric_input)
|
||||||
if value is None:
|
if value is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
levels_upper = params.get(metric_name, {}).get("upper")
|
# Since Graylog natively gives us 1m, 5m, 15m, there's no need to
|
||||||
levels_lower = params.get(metric_name, {}).get("lower")
|
# crunch math for them; we just pass those values through.
|
||||||
|
if window_length in [1, 5, 15]:
|
||||||
|
ewma_value = value
|
||||||
|
else:
|
||||||
|
ewma_value = calculate_ewma(store, metric_name, window_length, graylog_window_length, value)
|
||||||
|
|
||||||
yield from check_levels(
|
yield from check_levels(
|
||||||
value,
|
ewma_value,
|
||||||
levels_upper = levels_upper,
|
levels_upper = levels_upper,
|
||||||
levels_lower = levels_lower,
|
levels_lower = levels_lower,
|
||||||
metric_name = metric_name,
|
metric_name = metric_name,
|
||||||
|
|||||||
@ -7,40 +7,40 @@ UNIT_BYTES = metrics.Unit(metrics.IECNotation("bytes/sec"))
|
|||||||
UNIT_MSGS = metrics.Unit(metrics.IECNotation("msgs/sec"))
|
UNIT_MSGS = metrics.Unit(metrics.IECNotation("msgs/sec"))
|
||||||
|
|
||||||
|
|
||||||
metric_graylog_input_metrics_im_m1_rate = metrics.Metric(
|
metric_graylog_input_metrics_im_small_rate = metrics.Metric(
|
||||||
title = Title("Incoming messages/sec (1 min)"),
|
title = Title("Incoming messages/sec (small window)"),
|
||||||
name = "im_m1_rate",
|
name = "im_small_rate",
|
||||||
unit = UNIT_BYTES,
|
unit = UNIT_BYTES,
|
||||||
color = metrics.Color.LIGHT_GREEN,
|
color = metrics.Color.LIGHT_GREEN,
|
||||||
)
|
)
|
||||||
metric_graylog_input_metrics_im_m5_rate = metrics.Metric(
|
metric_graylog_input_metrics_im_medium_rate = metrics.Metric(
|
||||||
title = Title("Incoming messages/sec (5 min)"),
|
title = Title("Incoming messages/sec (medium window)"),
|
||||||
name = "im_m5_rate",
|
name = "im_medium_rate",
|
||||||
unit = UNIT_BYTES,
|
unit = UNIT_BYTES,
|
||||||
color = metrics.Color.GREEN,
|
color = metrics.Color.GREEN,
|
||||||
)
|
)
|
||||||
metric_graylog_input_metrics_im_m15_rate = metrics.Metric(
|
metric_graylog_input_metrics_im_large_rate = metrics.Metric(
|
||||||
title = Title("Incoming messages/sec (15 min)"),
|
title = Title("Incoming messages/sec (large window)"),
|
||||||
name = "im_m15_rate",
|
name = "im_large_rate",
|
||||||
unit = UNIT_BYTES,
|
unit = UNIT_BYTES,
|
||||||
color = metrics.Color.DARK_GREEN,
|
color = metrics.Color.DARK_GREEN,
|
||||||
)
|
)
|
||||||
|
|
||||||
metric_graylog_input_metrics_rs_m1_rate = metrics.Metric(
|
metric_graylog_input_metrics_rs_small_rate = metrics.Metric(
|
||||||
title = Title("Incoming bytes/sec (1 min)"),
|
title = Title("Incoming bytes/sec (small window)"),
|
||||||
name = "rs_m1_rate",
|
name = "rs_small_rate",
|
||||||
unit = UNIT_MSGS,
|
unit = UNIT_MSGS,
|
||||||
color = metrics.Color.LIGHT_BLUE,
|
color = metrics.Color.LIGHT_BLUE,
|
||||||
)
|
)
|
||||||
metric_graylog_input_metrics_rs_m5_rate = metrics.Metric(
|
metric_graylog_input_metrics_rs_medium_rate = metrics.Metric(
|
||||||
title = Title("Incoming bytes/sec (5 min)"),
|
title = Title("Incoming bytes/sec (medium window)"),
|
||||||
name = "rs_m5_rate",
|
name = "rs_medium_rate",
|
||||||
unit = UNIT_MSGS,
|
unit = UNIT_MSGS,
|
||||||
color = metrics.Color.BLUE,
|
color = metrics.Color.BLUE,
|
||||||
)
|
)
|
||||||
metric_graylog_input_metrics_rs_m15_rate = metrics.Metric(
|
metric_graylog_input_metrics_rs_large_rate = metrics.Metric(
|
||||||
title = Title("Incoming bytes/sec (15 min)"),
|
title = Title("Incoming bytes/sec (large window)"),
|
||||||
name = "rs_m15_rate",
|
name = "rs_large_rate",
|
||||||
unit = UNIT_MSGS,
|
unit = UNIT_MSGS,
|
||||||
color = metrics.Color.DARK_BLUE,
|
color = metrics.Color.DARK_BLUE,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -1,9 +1,11 @@
|
|||||||
# Copyright (C) 2026 Spearhead Systems SRL
|
# Copyright (C) 2026 Spearhead Systems SRL
|
||||||
|
|
||||||
|
from cmk.rulesets.v1.form_specs.validators import NumberInRange
|
||||||
from cmk.rulesets.v1.form_specs import (
|
from cmk.rulesets.v1.form_specs import (
|
||||||
Dictionary,
|
Dictionary,
|
||||||
DictElement,
|
DictElement,
|
||||||
Float,
|
Float,
|
||||||
|
Integer,
|
||||||
DefaultValue,
|
DefaultValue,
|
||||||
LevelDirection,
|
LevelDirection,
|
||||||
SimpleLevels,
|
SimpleLevels,
|
||||||
@ -29,7 +31,7 @@ titles = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def _rate(metric, minutes, level):
|
def _rate(metric, window, level):
|
||||||
unit_name = titles[metric]
|
unit_name = titles[metric]
|
||||||
def_value = default_levels[level]
|
def_value = default_levels[level]
|
||||||
|
|
||||||
@ -38,7 +40,7 @@ def _rate(metric, minutes, level):
|
|||||||
title = Title(f"{level.value.capitalize()} level"),
|
title = Title(f"{level.value.capitalize()} level"),
|
||||||
level_direction = level,
|
level_direction = level,
|
||||||
form_spec_template = Float(
|
form_spec_template = Float(
|
||||||
title = Title(f"{unit_name}/{minutes}min"),
|
title = Title(f"{unit_name} {window} time window"),
|
||||||
),
|
),
|
||||||
prefill_fixed_levels = DefaultValue(value=(def_value, def_value))
|
prefill_fixed_levels = DefaultValue(value=(def_value, def_value))
|
||||||
),
|
),
|
||||||
@ -49,13 +51,25 @@ def _parameter_valuespec_graylog_input_metrics():
|
|||||||
elements = {}
|
elements = {}
|
||||||
|
|
||||||
for metric in ["im", "rs"]:
|
for metric in ["im", "rs"]:
|
||||||
for minutes in [1, 5, 15]:
|
for window, default in [("small", 1), ("medium", 5), ("large", 15)]:
|
||||||
elements[f"{metric}_m{minutes}_rate"] = DictElement(
|
elements[f"{metric}_{window}_rate"] = DictElement(
|
||||||
parameter_form = Dictionary(
|
parameter_form = Dictionary(
|
||||||
title = Title(f"Incoming {titles[metric]} for past {minutes} minute(s)"),
|
title = Title(f"Incoming {titles[metric]} over {window} time window"),
|
||||||
elements = {
|
elements = {
|
||||||
"upper": _rate(metric, minutes, LevelDirection.UPPER),
|
"window": DictElement(
|
||||||
"lower": _rate(metric, minutes, LevelDirection.LOWER)
|
required = True,
|
||||||
|
parameter_form = Integer(
|
||||||
|
title = Title("Minutes"),
|
||||||
|
help_text = Help(
|
||||||
|
"Set how many minutes this Range Weighted Moving Average "
|
||||||
|
"window should use."
|
||||||
|
),
|
||||||
|
prefill = DefaultValue(default),
|
||||||
|
custom_validate = (NumberInRange(min_value=1),),
|
||||||
|
),
|
||||||
|
),
|
||||||
|
"upper": _rate(metric, window, LevelDirection.UPPER),
|
||||||
|
"lower": _rate(metric, window, LevelDirection.LOWER),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user