mirror of
				https://github.com/zulip/zulip.git
				synced 2025-11-03 21:43:21 +00:00 
			
		
		
		
	Fix excessive CPU usage by rabbitmq-numconsumers Nagios checks.
The previous model for these Nagios checks was kinda crazy -- every minute, we'd run a full `rabbitmqctl list_consumers` for each of the dozen+ consumers that we have, and then run the exact same parsing logic for each one to determine whether the target queue has a running consumer, so that we could write out a state file for it. Because `rabbitmqctl list_consumers` takes a small but nonzero amount of resources, on systems where CPU is very limited (e.g. t2-style AWS instances), this minor CPU wastage could be problematic. Now we just run `rabbitmqctl list_consumers` once per minute, and output all the state files from that single command. Further TODO items on this front include removing the hardcoded list of queues.
This commit is contained in:
		@@ -2,15 +2,4 @@ SHELL=/bin/bash
 | 
				
			|||||||
PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
 | 
					PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
 | 
				
			||||||
USER=root
 | 
					USER=root
 | 
				
			||||||
 | 
					
 | 
				
			||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file notify_tornado
 | 
					* * * * * root /home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers
 | 
				
			||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file user_activity
 | 
					 | 
				
			||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file user_activity_interval
 | 
					 | 
				
			||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file user_presence
 | 
					 | 
				
			||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file invites
 | 
					 | 
				
			||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file signups
 | 
					 | 
				
			||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file message_sender
 | 
					 | 
				
			||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file feedback_messages
 | 
					 | 
				
			||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file error_reports
 | 
					 | 
				
			||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file digest_emails
 | 
					 | 
				
			||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file email_mirror
 | 
					 | 
				
			||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file missedmessage_mobile_notifications
 | 
					 | 
				
			||||||
 
 | 
				
			|||||||
@@ -22,10 +22,6 @@ if 'USER' in os.environ and not os.environ['USER'] in ['root', 'rabbitmq']:
 | 
				
			|||||||
usage = """Usage: check-rabbitmq-consumers --queue=[queue-name] --min-threshold=[min-threshold]"""
 | 
					usage = """Usage: check-rabbitmq-consumers --queue=[queue-name] --min-threshold=[min-threshold]"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
parser = optparse.OptionParser(usage=usage)
 | 
					parser = optparse.OptionParser(usage=usage)
 | 
				
			||||||
parser.add_option('--queue',
 | 
					 | 
				
			||||||
                  dest='queue_name',
 | 
					 | 
				
			||||||
                  default="notify_tornado",
 | 
					 | 
				
			||||||
                  action='store')
 | 
					 | 
				
			||||||
parser.add_option('--min-threshold',
 | 
					parser.add_option('--min-threshold',
 | 
				
			||||||
                  dest='min_count',
 | 
					                  dest='min_count',
 | 
				
			||||||
                  type="int",
 | 
					                  type="int",
 | 
				
			||||||
@@ -39,19 +35,41 @@ output = subprocess.check_output(['/usr/sbin/rabbitmqctl', 'list_consumers'],
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
consumers = defaultdict(int) # type: Dict[str, int]
 | 
					consumers = defaultdict(int) # type: Dict[str, int]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					worker_queues = {'error_reports',
 | 
				
			||||||
 | 
					                 'user_presence',
 | 
				
			||||||
 | 
					                 'digest_emails',
 | 
				
			||||||
 | 
					                 'slow_queries',
 | 
				
			||||||
 | 
					                 'missedmessage_mobile_notifications',
 | 
				
			||||||
 | 
					                 'feedback_messages',
 | 
				
			||||||
 | 
					                 'signups',
 | 
				
			||||||
 | 
					                 'notify_tornado',
 | 
				
			||||||
 | 
					                 'message_sender',
 | 
				
			||||||
 | 
					                 'missedmessage_emails',
 | 
				
			||||||
 | 
					                 'email_mirror',
 | 
				
			||||||
 | 
					                 'user_activity_interval',
 | 
				
			||||||
 | 
					                 'invites',
 | 
				
			||||||
 | 
					                 'user_activity'}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					for queue_name in worker_queues:
 | 
				
			||||||
 | 
					    consumers[queue_name] = 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for line in output.split('\n'):
 | 
					for line in output.split('\n'):
 | 
				
			||||||
    parts = line.split('\t')
 | 
					    parts = line.split('\t')
 | 
				
			||||||
    if len(parts) and parts[0] == options.queue_name:
 | 
					    if len(parts) >= 2:
 | 
				
			||||||
        consumers[parts[0]] += 1
 | 
					        consumers[parts[0]] += 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
now = int(time.time())
 | 
					now = int(time.time())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if consumers[options.queue_name] < options.min_count:
 | 
					for queue_name in consumers.keys():
 | 
				
			||||||
 | 
					    state_file_path = "/var/lib/nagios_state/check-rabbitmq-consumers-" + queue_name
 | 
				
			||||||
 | 
					    state_file_tmp = state_file_path + "-tmp"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if consumers[queue_name] < options.min_count:
 | 
				
			||||||
        status = 2
 | 
					        status = 2
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        status = 0
 | 
					        status = 0
 | 
				
			||||||
 | 
					    with open(state_file_tmp, "w") as f:
 | 
				
			||||||
print("%s|%s|%s|queue %s has %s consumers, needs %s" % (
 | 
					        f.write("%s|%s|%s|queue %s has %s consumers, needs %s\n" % (
 | 
				
			||||||
            now, status, states[status], options.queue_name,
 | 
					            now, status, states[status], queue_name,
 | 
				
			||||||
            consumers[options.queue_name], options.min_count))
 | 
					            consumers[queue_name], options.min_count))
 | 
				
			||||||
 | 
					    subprocess.check_call(["mv", state_file_tmp, state_file_path])
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,16 +0,0 @@
 | 
				
			|||||||
#!/usr/bin/env bash
 | 
					 | 
				
			||||||
set -e
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
queue=$1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
if [ -z "$queue" ]; then
 | 
					 | 
				
			||||||
    echo "Usage: $0 <queue-name>"
 | 
					 | 
				
			||||||
    exit 2
 | 
					 | 
				
			||||||
fi
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
ZULIP_DIR=/home/zulip/deployments/current
 | 
					 | 
				
			||||||
STATE_DIR=/var/lib/nagios_state
 | 
					 | 
				
			||||||
STATE_FILE=$STATE_DIR/check-rabbitmq-consumers-$queue
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
"$ZULIP_DIR/scripts/nagios/check-rabbitmq-consumers" "--queue=$queue" &> "${STATE_FILE}-tmp";
 | 
					 | 
				
			||||||
mv "${STATE_FILE}-tmp" "$STATE_FILE"
 | 
					 | 
				
			||||||
@@ -81,14 +81,9 @@ if supervisorctl status | grep -vq RUNNING || supervisorctl status | sed 's/^.*u
 | 
				
			|||||||
fi
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
echo; echo "Now running RabbitMQ consumer Nagios tests"; echo
 | 
					echo; echo "Now running RabbitMQ consumer Nagios tests"; echo
 | 
				
			||||||
 | 
					# First run the check that usually runs in cron and populates the state files
 | 
				
			||||||
 | 
					/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers
 | 
				
			||||||
for consumer in notify_tornado user_activity user_activity_interval user_presence invites signups message_sender feedback_messages error_reports digest_emails email_mirror missedmessage_mobile_notifications; do
 | 
					for consumer in notify_tornado user_activity user_activity_interval user_presence invites signups message_sender feedback_messages error_reports digest_emails email_mirror missedmessage_mobile_notifications; do
 | 
				
			||||||
    if ! /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file "$consumer"; then
 | 
					 | 
				
			||||||
        # Temporary section while we're debugging why this fails nondeterministically in CI
 | 
					 | 
				
			||||||
        STATE_DIR=/var/lib/nagios_state
 | 
					 | 
				
			||||||
        ls "$STATE_DIR"
 | 
					 | 
				
			||||||
        cat "$STATE_DIR"/*
 | 
					 | 
				
			||||||
        /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file "$consumer"
 | 
					 | 
				
			||||||
    fi
 | 
					 | 
				
			||||||
    if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers "$consumer"; then
 | 
					    if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers "$consumer"; then
 | 
				
			||||||
        set +x
 | 
					        set +x
 | 
				
			||||||
        echo
 | 
					        echo
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user