Files
zulip/scripts/nagios/check-rabbitmq-queue
Tim Abbott bbc1484253 check-rabbitmq-queue: Adjust thresholds for paging.
Ultimately, this isn't an effective way to monitor this queue; we want
time-based monitoring, not count-based monitoring.  Doing that
properly will likely involve modifying the queue processor to write
something about its status.

But until we add the monitoring we want, it makes sense to leave this
active with low limits.
2019-10-13 22:39:52 -07:00

81 lines
2.5 KiB
Python
Executable File

#!/usr/bin/env python3
import re
import time
import os
import subprocess
# Backlog sizes (number of queued events) above which a queue is reported.
# The *_DEFAULT values apply to every queue; the WARN_THRESHOLD and
# CRIT_THRESHOLD tables make it possible to give specific queues a
# higher or lower limit than the default.
WARN_THRESHOLD_DEFAULT = 10
CRIT_THRESHOLD_DEFAULT = 50

WARN_THRESHOLD = dict(
    missedmessage_emails=WARN_THRESHOLD_DEFAULT,
    # The user_activity worker has high throughput and uses a
    # LoopQueueProcessingWorker, so it's normal to have a moderate
    # backlog.
    user_activity=1000,
)

CRIT_THRESHOLD = dict(
    missedmessage_emails=CRIT_THRESHOLD_DEFAULT,
    # A backlog of hundreds of events for user_activity likely
    # indicates an outage of the processor.
    user_activity=5000,
)

# Numeric status code -> label printed in the script's output line.
states = dict(enumerate(("OK", "WARNING", "CRITICAL", "UNKNOWN")))
# Matches one "<queue name><TAB><message count>" line of the
# `rabbitmqctl list_queues` output; lines not in that form are ignored.
pattern = re.compile(r'(\w+)\t(\d+)')

# Print the hint *before* invoking rabbitmqctl: check_output() raises when
# the command fails, so a hint printed afterwards would never be shown in
# that case.  This is advisory only; the command is still attempted.
if 'USER' in os.environ and os.environ['USER'] not in ['root', 'rabbitmq']:
    print("This script must be run as the root or rabbitmq user")

output = subprocess.check_output(['/usr/sbin/rabbitmqctl', 'list_queues'], universal_newlines=True)

status = 0        # worst status seen across all queues (key into `states`)
max_count = 0     # largest backlog seen across all queues
warn_queues = []  # names of queues over their warning or critical limit

for line in output.split("\n"):
    line = line.strip()
    m = pattern.match(line)
    if not m:
        continue

    queue = m.group(1)
    count = int(m.group(2))

    # Queues without a specific entry fall back to the default limits.
    this_status = 0
    if count > CRIT_THRESHOLD.get(queue, CRIT_THRESHOLD_DEFAULT):
        this_status = 2
        warn_queues.append(queue)
    elif count > WARN_THRESHOLD.get(queue, WARN_THRESHOLD_DEFAULT):
        this_status = 1
        warn_queues.append(queue)

    status = max(status, this_status)
    max_count = max(max_count, count)

warn_about = ", ".join(warn_queues)
now = int(time.time())
now_struct = time.gmtime(now)

# While we are sending digest emails, at 1800 hrs (UTC) each weekday, the mail
# queues can get backed up; don't alert on those. Additionally, certain workers
# (slow_queries and digest_emails) have a polling algorithm that means it's
# normal for them to accumulate items.
#
# Note that the day of the week is not checked here, and that an empty
# warn_queues also satisfies the subset test, so during this window the
# digest message is printed even when no queue is elevated.
only_digest_queues = set(warn_queues) <= {"digest_emails", "slow_queries"}
in_digest_window = now_struct.tm_hour == 18 and now_struct.tm_min < 25
if only_digest_queues and in_digest_window:
    status = 0
    print("%s|%s|%s|processing digests, not alerting on elevated mail queues" % (
        now, status, states[status]))
    # SystemExit rather than the `exit` helper, which is injected by the
    # `site` module and is not guaranteed to exist in every interpreter run.
    raise SystemExit(0)

if status > 0:
    print("%s|%s|%s|max count %s, queues affected: %s" % (
        now, status, states[status], max_count, warn_about))
else:
    print("%s|%s|%s|queues normal, max count %s" % (now, status, states[status], max_count))