Mirror of https://github.com/zulip/zulip.git (synced 2025-11-04 14:03:30 +00:00)
check-rabbitmq-queue: Add a simple algorithm to analyze queue stats.
This new algorithm is designed to avoid paging from monitoring when a queue merely exhibits bursty behavior.
committed by Tim Abbott
parent 0e57975643
commit 122d0bca83
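For orientation, the sketch below outlines the kind of checker logic these tests exercise. The function name, status constants, stats-dict keys, and three of the threshold dicts are taken from the test file that follows; CRITICAL_SECONDS_TO_CLEAR_NORMAL, the parameter names, the default threshold values, and the message wording are assumptions made for illustration, not the contents of scripts/lib/check_rabbitmq_queue.py.

import time
from typing import Any, Dict

# Status codes in the usual Nagios ordering (names match the test imports).
OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3

# Per-queue overrides; the tests patch entries into these dicts by queue name.
# CRITICAL_SECONDS_TO_CLEAR_NORMAL is not referenced by the tests and is an
# assumed counterpart added here for symmetry.
MAX_SECONDS_TO_CLEAR_NORMAL: Dict[str, int] = {}
CRITICAL_SECONDS_TO_CLEAR_NORMAL: Dict[str, int] = {}
MAX_SECONDS_TO_CLEAR_FOR_BURSTS: Dict[str, int] = {}
CRITICAL_SECONDS_TO_CLEAR_FOR_BURSTS: Dict[str, int] = {}


def analyze_queue_stats(queue_name: str, stats: Dict[str, Any],
                        current_queue_count: int) -> Dict[str, Any]:
    # Simplified sketch reconstructed from the test assertions; the default
    # thresholds below are placeholders, not the project's real values.
    now = time.time()
    if not stats:
        return dict(status=UNKNOWN, name=queue_name, message='no stats available')
    if now - stats['update_time'] > 300 and current_queue_count > 0:
        # The worker hasn't refreshed its stats in over 5 minutes even though
        # rabbitmqctl reports events waiting.
        return dict(status=CRITICAL, name=queue_name,
                    message='queue appears to be stuck')

    average_consume_time = stats['recent_average_consume_time']
    if average_consume_time is None:
        # A burst just started; no productivity data has been recorded yet.
        return dict(status=OK, name=queue_name, message='')

    expected_time_to_clear = stats['current_queue_size'] * average_consume_time
    seconds_since_emptied = now - stats['queue_last_emptied_timestamp']
    if seconds_since_emptied < max(300, CRITICAL_SECONDS_TO_CLEAR_FOR_BURSTS.get(queue_name, 300)):
        # The queue was empty recently, so treat the backlog as a burst and
        # give it a (possibly per-queue) generous window to drain.
        if expected_time_to_clear > CRITICAL_SECONDS_TO_CLEAR_FOR_BURSTS.get(queue_name, 300):
            return dict(status=CRITICAL, name=queue_name,
                        message='clearing the burst will take too long')
        if expected_time_to_clear > MAX_SECONDS_TO_CLEAR_FOR_BURSTS.get(queue_name, 120):
            return dict(status=WARNING, name=queue_name,
                        message='clearing the burst will take too long')
        return dict(status=OK, name=queue_name, message='')

    # Steady-state backlog: hold the queue to much tighter limits.
    if expected_time_to_clear > CRITICAL_SECONDS_TO_CLEAR_NORMAL.get(queue_name, 60):
        return dict(status=CRITICAL, name=queue_name,
                    message='clearing the backlog will take too long')
    if expected_time_to_clear > MAX_SECONDS_TO_CLEAR_NORMAL.get(queue_name, 30):
        return dict(status=WARNING, name=queue_name,
                    message='clearing the backlog will take too long')
    return dict(status=OK, name=queue_name, message='')

The important design split, and the point of the commit message, is that a queue that was empty very recently is judged against the burst thresholds, while a backlog that has persisted past that window falls back to the much stricter steady-state thresholds, so routine bursts (digest emails, for example) do not page.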
tools/tests/test_check_rabbitmq_queue.py (new file, 120 lines added)
@@ -0,0 +1,120 @@
import mock
from unittest import TestCase

from scripts.lib.check_rabbitmq_queue import (
    analyze_queue_stats,
    OK,
    WARNING,
    CRITICAL,
    UNKNOWN,
)
import time


class AnalyzeQueueStatsTests(TestCase):
    def test_no_stats_available(self) -> None:
        result = analyze_queue_stats('name', {}, 0)
        self.assertEqual(result['status'], UNKNOWN)

    def test_queue_stuck(self) -> None:
        """Last update > 5 minutes ago and there are events in the queue.

        In theory, we could be having bad luck with a race where in
        the last (event_handling_time * 50) a burst was added, but it's
        unlikely and shouldn't fail 2 in a row for Nagios anyway.
        """
        result = analyze_queue_stats('name', {'update_time': time.time() - 301}, 100)
        self.assertEqual(result['status'], CRITICAL)
        self.assertIn('queue appears to be stuck', result['message'])

    def test_queue_just_started(self) -> None:
        """
        We just started processing a burst of events, and haven't processed enough
        to log productivity statistics yet.
        """
        result = analyze_queue_stats('name', {'update_time': time.time(),
                                              'current_queue_size': 10000,
                                              'recent_average_consume_time': None}, 10000)
        self.assertEqual(result['status'], OK)

    def test_queue_normal(self) -> None:
        """10000 events and each takes a second => it'll take a long time to empty."""
        result = analyze_queue_stats('name', {'update_time': time.time(),
                                              'current_queue_size': 10000,
                                              'queue_last_emptied_timestamp': time.time() - 10000,
                                              'recent_average_consume_time': 1}, 10000)
        self.assertEqual(result['status'], CRITICAL)
        self.assertIn('clearing the backlog', result['message'])

        # If we're doing 10K/sec, it's OK.
        result = analyze_queue_stats('name', {'update_time': time.time(),
                                              'current_queue_size': 10000,
                                              'queue_last_emptied_timestamp': time.time() - 10000,
                                              'recent_average_consume_time': 0.0001}, 10000)
        self.assertEqual(result['status'], OK)

        # Verify logic around whether it'll take MAX_SECONDS_TO_CLEAR_NORMAL to clear queue.
        with mock.patch.dict('scripts.lib.check_rabbitmq_queue.MAX_SECONDS_TO_CLEAR_NORMAL',
                             {'name': 10}):
            result = analyze_queue_stats('name', {'update_time': time.time(),
                                                  'current_queue_size': 11,
                                                  'queue_last_emptied_timestamp': time.time() - 10000,
                                                  'recent_average_consume_time': 1}, 11)
            self.assertEqual(result['status'], WARNING)
            self.assertIn('clearing the backlog', result['message'])

            result = analyze_queue_stats('name', {'update_time': time.time(),
                                                  'current_queue_size': 9,
                                                  'queue_last_emptied_timestamp': time.time() - 10000,
                                                  'recent_average_consume_time': 1}, 9)
            self.assertEqual(result['status'], OK)

    def test_queue_burst(self) -> None:
        """Test logic for just after a large number of events were added
        to an empty queue. Happens routinely for digest emails, for example."""
        result = analyze_queue_stats('name', {'update_time': time.time(),
                                              'current_queue_size': 10000,
                                              'queue_last_emptied_timestamp': time.time() - 1,
                                              'recent_average_consume_time': 1}, 10000)
        self.assertEqual(result['status'], CRITICAL)
        self.assertIn('clearing the burst', result['message'])

        # Verify logic around MAX_SECONDS_TO_CLEAR_FOR_BURSTS.
        with mock.patch.dict('scripts.lib.check_rabbitmq_queue.MAX_SECONDS_TO_CLEAR_FOR_BURSTS',
                             {'name': 10}):
            result = analyze_queue_stats('name', {'update_time': time.time(),
                                                  'current_queue_size': 11,
                                                  'queue_last_emptied_timestamp': time.time() - 1,
                                                  'recent_average_consume_time': 1}, 11)
            self.assertEqual(result['status'], WARNING)
            self.assertIn('clearing the burst', result['message'])

            result = analyze_queue_stats('name', {'update_time': time.time(),
                                                  'current_queue_size': 9,
                                                  'queue_last_emptied_timestamp': time.time() - 1,
                                                  'recent_average_consume_time': 1}, 9)
            self.assertEqual(result['status'], OK)

    def test_queue_burst_long_time_to_clear_allowed(self) -> None:
        """
        For a queue that is allowed > 300s to clear a burst of events,
        we need to verify that the checker will not stop categorizing this as a burst
        while the worker is still processing the events, within the allowed time limit.
        """
        start_time = time.time()
        with mock.patch.dict('scripts.lib.check_rabbitmq_queue.CRITICAL_SECONDS_TO_CLEAR_FOR_BURSTS',
                             {'name': 600}), \
                mock.patch.dict('scripts.lib.check_rabbitmq_queue.MAX_SECONDS_TO_CLEAR_FOR_BURSTS',
                                {'name': 600}):
            with mock.patch('time.time', return_value=start_time + 599):
                result = analyze_queue_stats('name', {'update_time': time.time(),
                                                      'current_queue_size': 599,
                                                      'queue_last_emptied_timestamp': start_time,
                                                      'recent_average_consume_time': 1}, 599)
                self.assertEqual(result['status'], OK)

            with mock.patch('time.time', return_value=start_time + 601):
                result = analyze_queue_stats('name', {'update_time': time.time(),
                                                      'current_queue_size': 599,
                                                      'queue_last_emptied_timestamp': start_time,
                                                      'recent_average_consume_time': 1}, 599)
                self.assertEqual(result['status'], CRITICAL)