queue: Eliminate useless "burst" concept in monitoring.

The reason higher expected_time_to_clear_backlog were allowed for queues during "bursts" was, in simpler terms, because those queues to which this happens, intrinsically have a higher acceptable "time until cleared" for new events. E.g. digests_email, where it's completely fine to take a long time to send them out after putting in the queue. And that's already configurable without a normal/burst distinction. Thanks to this we can remove a bunch of overly complicated, and ultimately useless, logic.
2025-11-03 13:33:24 +00:00 · 2020-09-20 13:35:35 +02:00
parent 810514dd9d
commit cd9b194d88
2 changed files with 13 additions and 91 deletions
--- a/scripts/lib/check_rabbitmq_queue.py
+++ b/scripts/lib/check_rabbitmq_queue.py
@@ -38,23 +38,15 @@ states = {
    3: "UNKNOWN",
 }

-MAX_SECONDS_TO_CLEAR_FOR_BURSTS: DefaultDict[str, int] = defaultdict(
-    lambda: 120,
-    digest_emails=600,
-)
-MAX_SECONDS_TO_CLEAR_NORMAL: DefaultDict[str, int] = defaultdict(
+MAX_SECONDS_TO_CLEAR: DefaultDict[str, int] = defaultdict(
    lambda: 30,
    digest_emails=1200,
    missedmessage_mobile_notifications=120,
 )
-CRITICAL_SECONDS_TO_CLEAR_FOR_BURSTS: DefaultDict[str, int] = defaultdict(
-    lambda: 240,
-    digest_emails=1200,
-)
-CRITICAL_SECONDS_TO_CLEAR_NORMAL: DefaultDict[str, int] = defaultdict(
+CRITICAL_SECONDS_TO_CLEAR: DefaultDict[str, int] = defaultdict(
    lambda: 60,
    missedmessage_mobile_notifications=180,
-    digest_emails=600,
+    digest_emails=1800,
 )

 def analyze_queue_stats(queue_name: str, stats: Dict[str, Any],
@@ -96,34 +88,15 @@ def analyze_queue_stats(queue_name: str, stats: Dict[str, Any],
                    message='')

    expected_time_to_clear_backlog = current_size * average_consume_time
-    time_since_emptied = now - stats['queue_last_emptied_timestamp']
-    if time_since_emptied > max(300, CRITICAL_SECONDS_TO_CLEAR_FOR_BURSTS[queue_name]):
-        # We need the max() expression in case the rules for the queue
-        # permit longer processing times than 300s - to prevent
-        # incorrectly throwing an error by changing the classification
-        # of the the backlog from "burst" to "not burst" after 300s,
-        # while the worker is still processing it and staying below
-        # the CRITICAL threshold.
-        if expected_time_to_clear_backlog > MAX_SECONDS_TO_CLEAR_NORMAL[queue_name]:
-            if expected_time_to_clear_backlog > CRITICAL_SECONDS_TO_CLEAR_NORMAL[queue_name]:
-                status = CRITICAL
-            else:
-                status = WARNING
+    if expected_time_to_clear_backlog > MAX_SECONDS_TO_CLEAR[queue_name]:
+        if expected_time_to_clear_backlog > CRITICAL_SECONDS_TO_CLEAR[queue_name]:
+            status = CRITICAL
+        else:
+            status = WARNING

-            return dict(status=status,
-                        name=queue_name,
-                        message=f'clearing the backlog will take too long: {expected_time_to_clear_backlog}s, size: {current_size}')
-    else:
-        # We slept recently, so treat this as a burst.
-        if expected_time_to_clear_backlog > MAX_SECONDS_TO_CLEAR_FOR_BURSTS[queue_name]:
-            if expected_time_to_clear_backlog > CRITICAL_SECONDS_TO_CLEAR_FOR_BURSTS[queue_name]:
-                status = CRITICAL
-            else:
-                status = WARNING
-
-            return dict(status=status,
-                        name=queue_name,
-                        message=f'clearing the burst will take too long: {expected_time_to_clear_backlog}s, size: {current_size}')
+        return dict(status=status,
+                    name=queue_name,
+                    message=f'clearing the backlog will take too long: {expected_time_to_clear_backlog}s, size: {current_size}')

    return dict(status=OK,
                name=queue_name,