mirror of
https://github.com/zulip/zulip.git
synced 2025-11-03 13:33:24 +00:00
queue: Eliminate useless "burst" concept in monitoring.
The reason higher expected_time_to_clear_backlog values were allowed for queues during "bursts" was, in simpler terms, that the queues to which this happens intrinsically have a higher acceptable "time until cleared" for new events. E.g. digest_emails, where it's completely fine to take a long time to send them out after putting them in the queue. And that's already configurable without a normal/burst distinction. Thanks to this, we can remove a bunch of overly complicated, and ultimately useless, logic.
This commit is contained in:
committed by
Tim Abbott
parent
810514dd9d
commit
cd9b194d88
@@ -38,23 +38,15 @@ states = {
|
||||
3: "UNKNOWN",
|
||||
}
|
||||
|
||||
MAX_SECONDS_TO_CLEAR_FOR_BURSTS: DefaultDict[str, int] = defaultdict(
|
||||
lambda: 120,
|
||||
digest_emails=600,
|
||||
)
|
||||
MAX_SECONDS_TO_CLEAR_NORMAL: DefaultDict[str, int] = defaultdict(
|
||||
MAX_SECONDS_TO_CLEAR: DefaultDict[str, int] = defaultdict(
|
||||
lambda: 30,
|
||||
digest_emails=1200,
|
||||
missedmessage_mobile_notifications=120,
|
||||
)
|
||||
CRITICAL_SECONDS_TO_CLEAR_FOR_BURSTS: DefaultDict[str, int] = defaultdict(
|
||||
lambda: 240,
|
||||
digest_emails=1200,
|
||||
)
|
||||
CRITICAL_SECONDS_TO_CLEAR_NORMAL: DefaultDict[str, int] = defaultdict(
|
||||
CRITICAL_SECONDS_TO_CLEAR: DefaultDict[str, int] = defaultdict(
|
||||
lambda: 60,
|
||||
missedmessage_mobile_notifications=180,
|
||||
digest_emails=600,
|
||||
digest_emails=1800,
|
||||
)
|
||||
|
||||
def analyze_queue_stats(queue_name: str, stats: Dict[str, Any],
|
||||
@@ -96,34 +88,15 @@ def analyze_queue_stats(queue_name: str, stats: Dict[str, Any],
|
||||
message='')
|
||||
|
||||
expected_time_to_clear_backlog = current_size * average_consume_time
|
||||
time_since_emptied = now - stats['queue_last_emptied_timestamp']
|
||||
if time_since_emptied > max(300, CRITICAL_SECONDS_TO_CLEAR_FOR_BURSTS[queue_name]):
|
||||
# We need the max() expression in case the rules for the queue
|
||||
# permit longer processing times than 300s - to prevent
|
||||
# incorrectly throwing an error by changing the classification
|
||||
# of the backlog from "burst" to "not burst" after 300s,
|
||||
# while the worker is still processing it and staying below
|
||||
# the CRITICAL threshold.
|
||||
if expected_time_to_clear_backlog > MAX_SECONDS_TO_CLEAR_NORMAL[queue_name]:
|
||||
if expected_time_to_clear_backlog > CRITICAL_SECONDS_TO_CLEAR_NORMAL[queue_name]:
|
||||
status = CRITICAL
|
||||
else:
|
||||
status = WARNING
|
||||
if expected_time_to_clear_backlog > MAX_SECONDS_TO_CLEAR[queue_name]:
|
||||
if expected_time_to_clear_backlog > CRITICAL_SECONDS_TO_CLEAR[queue_name]:
|
||||
status = CRITICAL
|
||||
else:
|
||||
status = WARNING
|
||||
|
||||
return dict(status=status,
|
||||
name=queue_name,
|
||||
message=f'clearing the backlog will take too long: {expected_time_to_clear_backlog}s, size: {current_size}')
|
||||
else:
|
||||
# We slept recently, so treat this as a burst.
|
||||
if expected_time_to_clear_backlog > MAX_SECONDS_TO_CLEAR_FOR_BURSTS[queue_name]:
|
||||
if expected_time_to_clear_backlog > CRITICAL_SECONDS_TO_CLEAR_FOR_BURSTS[queue_name]:
|
||||
status = CRITICAL
|
||||
else:
|
||||
status = WARNING
|
||||
|
||||
return dict(status=status,
|
||||
name=queue_name,
|
||||
message=f'clearing the burst will take too long: {expected_time_to_clear_backlog}s, size: {current_size}')
|
||||
return dict(status=status,
|
||||
name=queue_name,
|
||||
message=f'clearing the backlog will take too long: {expected_time_to_clear_backlog}s, size: {current_size}')
|
||||
|
||||
return dict(status=OK,
|
||||
name=queue_name,
|
||||
|
||||
Reference in New Issue
Block a user