From ddc9c53e1b78da4374ba51a0e6907b61c1ebbab4 Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Tue, 3 Sep 2013 18:03:51 -0400 Subject: [PATCH] nagios: Set max_check_attempts to 3 for rabbitmq consumers. This works around the fact that we seem to have a mysterious extra checkup 40s after an error first occurs with these checks, which always fails because the data is updated by a cron job that runs every minute. (imported from commit e7fe9c85e8399115443269287e695b140b4443ff) --- .../zulip/files/nagios3/conf.d/services.cfg | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/servers/puppet/modules/zulip/files/nagios3/conf.d/services.cfg b/servers/puppet/modules/zulip/files/nagios3/conf.d/services.cfg index c040037547..aad5c71df9 100644 --- a/servers/puppet/modules/zulip/files/nagios3/conf.d/services.cfg +++ b/servers/puppet/modules/zulip/files/nagios3/conf.d/services.cfg @@ -256,6 +256,9 @@ define service { use generic-service service_description Check rabbitmq queue sizes check_command check_rabbitmq_queues!22 + # Workaround weird checks 40s after first error causing alerts + # from a single failure because cron hasn't run again yet + max_check_attempts 3 hostgroup_name frontends contact_groups page_admins } @@ -264,6 +267,9 @@ define service { use generic-service service_description Check rabbitmq notify_tornado consumers check_command check_rabbitmq_tornado_consumers!22 + # Workaround weird checks 40s after first error causing alerts + # from a single failure because cron hasn't run again yet + max_check_attempts 3 hostgroup_name frontends contact_groups page_admins } @@ -272,6 +278,9 @@ define service { use generic-service service_description Check rabbitmq useractivity consumers check_command check_rabbitmq_useractivity_consumers!22 + # Workaround weird checks 40s after first error causing alerts + # from a single failure because cron hasn't run again yet + max_check_attempts 3 hostgroup_name frontends contact_groups admins } @@ -280,6 +289,9 @@ 
define service { use generic-service service_description Check rabbitmq invites consumers check_command check_rabbitmq_invites_consumers!22 + # Workaround weird checks 40s after first error causing alerts + # from a single failure because cron hasn't run again yet + max_check_attempts 3 hostgroup_name frontends contact_groups admins } @@ -288,6 +300,9 @@ define service { use generic-service service_description Check rabbitmq signups consumers check_command check_rabbitmq_signups_consumers!22 + # Workaround weird checks 40s after first error causing alerts + # from a single failure because cron hasn't run again yet + max_check_attempts 3 hostgroup_name frontends contact_groups admins }