queue: Eliminate useless "burst" concept in monitoring.

The reason higher expected_time_to_clear_backlog were allowed for queues
during "bursts" was, in simpler terms, because those queues to which
this happens, intrinsically have a higher acceptable "time until cleared"
for new events. E.g. digests_email, where it's completely fine to take a
long time to send them out after putting in the queue. And that's
already configurable without a normal/burst distinction.
Thanks to this we can remove a bunch of overly complicated, and
ultimately useless, logic.
This commit is contained in:
Mateusz Mandera
2020-09-20 13:35:35 +02:00
committed by Tim Abbott
parent 810514dd9d
commit cd9b194d88
2 changed files with 13 additions and 91 deletions

View File

@@ -42,8 +42,8 @@ class AnalyzeQueueStatsTests(TestCase):
'recent_average_consume_time': 0.0001}, 10000)
self.assertEqual(result['status'], OK)
# Verify logic around whether it'll take MAX_SECONDS_TO_CLEAR_NORMAL to clear queue.
with mock.patch.dict('scripts.lib.check_rabbitmq_queue.MAX_SECONDS_TO_CLEAR_NORMAL',
# Verify logic around whether it'll take MAX_SECONDS_TO_CLEAR to clear queue.
with mock.patch.dict('scripts.lib.check_rabbitmq_queue.MAX_SECONDS_TO_CLEAR',
{'name': 10}):
result = analyze_queue_stats('name', {'update_time': time.time(),
'current_queue_size': 11,
@@ -57,54 +57,3 @@ class AnalyzeQueueStatsTests(TestCase):
'queue_last_emptied_timestamp': time.time() - 10000,
'recent_average_consume_time': 1}, 9)
self.assertEqual(result['status'], OK)
def test_queue_burst(self) -> None:
"""Test logic for just after a large number of events were added
to an empty queue. Happens routinely for digest emails, for example."""
result = analyze_queue_stats('name', {'update_time': time.time(),
'current_queue_size': 10000,
'queue_last_emptied_timestamp': time.time() - 1,
'recent_average_consume_time': 1}, 10000)
self.assertEqual(result['status'], CRITICAL)
self.assertIn('clearing the burst', result['message'])
# verify logic around MAX_SECONDS_TO_CLEAR_FOR_BURSTS.
with mock.patch.dict('scripts.lib.check_rabbitmq_queue.MAX_SECONDS_TO_CLEAR_FOR_BURSTS',
{'name': 10}):
result = analyze_queue_stats('name', {'update_time': time.time(),
'current_queue_size': 11,
'queue_last_emptied_timestamp': time.time() - 1,
'recent_average_consume_time': 1}, 11)
self.assertEqual(result['status'], WARNING)
self.assertIn('clearing the burst', result['message'])
result = analyze_queue_stats('name', {'update_time': time.time(),
'current_queue_size': 9,
'queue_last_emptied_timestamp': time.time() - 1,
'recent_average_consume_time': 1}, 9)
self.assertEqual(result['status'], OK)
def test_queue_burst_long_time_to_clear_allowed(self) -> None:
"""
For a queue that is allowed > 300s to clear a burst of events,
we need to verify that the checker will not stop categorizing this as a burst
while the worker is still processing the events, within the allowed time limit.
"""
start_time = time.time()
with mock.patch.dict('scripts.lib.check_rabbitmq_queue.CRITICAL_SECONDS_TO_CLEAR_FOR_BURSTS',
{'name': 600}), \
mock.patch.dict('scripts.lib.check_rabbitmq_queue.MAX_SECONDS_TO_CLEAR_FOR_BURSTS',
{'name': 600}):
with mock.patch('time.time', return_value=start_time + 599):
result = analyze_queue_stats('name', {'update_time': time.time(),
'current_queue_size': 599,
'queue_last_emptied_timestamp': start_time,
'recent_average_consume_time': 1}, 599)
self.assertEqual(result['status'], OK)
with mock.patch('time.time', return_value=start_time + 601):
result = analyze_queue_stats('name', {'update_time': time.time(),
'current_queue_size': 599,
'queue_last_emptied_timestamp': start_time,
'recent_average_consume_time': 1}, 599)
self.assertEqual(result['status'], CRITICAL)