Mirror of https://github.com/zulip/zulip.git
queue: Suppress error mail from brief rabbitmq downtimes.
Details in comment. Together with a few previous commits, this should completely eliminate sending error mail to admins when the RabbitMQ server is simply restarted and comes back up normally.
@@ -183,6 +183,7 @@ class TornadoQueueClient(SimpleQueueClient):
             # TornadoConnection can process heartbeats, so enable them.
             rabbitmq_heartbeat=None)
         self._on_open_cbs = []  # type: List[Callable[[], None]]
+        self._connection_failure_count = 0
 
     def _connect(self) -> None:
         self.log.info("Beginning TornadoQueueClient connection")
@@ -202,21 +203,40 @@ class TornadoQueueClient(SimpleQueueClient):
 
     CONNECTION_RETRY_SECS = 2
 
+    # When the RabbitMQ server is restarted, it's normal for it to
+    # take a few seconds to come back; we'll retry a few times and all
+    # will be well. So for the first few failures, we report only at
+    # "warning" level, avoiding an email to the server admin.
+    #
+    # A loss of an existing connection starts a retry loop just like a
+    # failed connection attempt, so it counts as the first failure.
+    #
+    # On an unloaded test system, a RabbitMQ restart takes about 6s,
+    # potentially causing 4 failures. We add some headroom above that.
+    CONNECTION_FAILURES_BEFORE_NOTIFY = 10
+
     def _on_connection_open_error(self, connection: pika.connection.Connection,
                                   message: Optional[str]=None) -> None:
+        self._connection_failure_count += 1
         retry_secs = self.CONNECTION_RETRY_SECS
-        self.log.critical("TornadoQueueClient couldn't connect to RabbitMQ, retrying in %d secs..."
-                          % (retry_secs,))
+        message = ("TornadoQueueClient couldn't connect to RabbitMQ, retrying in %d secs..."
+                   % (retry_secs,))
+        if self._connection_failure_count > self.CONNECTION_FAILURES_BEFORE_NOTIFY:
+            self.log.critical(message)
+        else:
+            self.log.warning(message)
         ioloop.IOLoop.instance().call_later(retry_secs, self._reconnect)
 
     def _on_connection_closed(self, connection: pika.connection.Connection,
                               reply_code: int, reply_text: str) -> None:
+        self._connection_failure_count = 1
         retry_secs = self.CONNECTION_RETRY_SECS
         self.log.warning("TornadoQueueClient lost connection to RabbitMQ, reconnecting in %d secs..."
                          % (retry_secs,))
         ioloop.IOLoop.instance().call_later(retry_secs, self._reconnect)
 
     def _on_open(self, connection: pika.connection.Connection) -> None:
+        self._connection_failure_count = 0
         try:
             self.connection.channel(
                 on_open_callback = self._on_channel_open)
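
The escalation logic in the diff is easy to lift out in isolation. Below is a minimal, standalone sketch of the same idea, not Zulip's actual code: count consecutive connection failures, log at "warning" until a threshold is crossed, then log at "critical" (the level that, under a logging setup which mails errors to admins, would trigger a notification). The names ReconnectReporter and FAILURES_BEFORE_NOTIFY are illustrative, not from the repository.

import logging

logger = logging.getLogger(__name__)

# Hypothetical threshold, mirroring CONNECTION_FAILURES_BEFORE_NOTIFY in the diff above.
FAILURES_BEFORE_NOTIFY = 10

class ReconnectReporter:
    """Track consecutive connection failures and choose a log level.

    Illustrative sketch only; names and structure are not from zulip/zulip.
    """

    def __init__(self) -> None:
        self.failure_count = 0

    def record_failure(self, message: str) -> None:
        self.failure_count += 1
        if self.failure_count > FAILURES_BEFORE_NOTIFY:
            # Persistent outage: log at a level that typically notifies admins.
            logger.critical(message)
        else:
            # Brief blip (e.g. a RabbitMQ restart): keep it at "warning".
            logger.warning(message)

    def record_success(self) -> None:
        # A successful (re)connection ends the failure streak.
        self.failure_count = 0

A caller would invoke record_failure() from its connection-error and connection-closed handlers and record_success() once a connection opens, so only outages that outlast roughly FAILURES_BEFORE_NOTIFY retry intervals escalate to an admin-visible error.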