Mirror of https://github.com/zulip/zulip.git
queue: Suppress error mail from brief rabbitmq downtimes.
Details in comment. Together with a few previous commits, this should completely eliminate sending error mail to admins when the RabbitMQ server is simply restarted and comes back up normally.
@@ -183,6 +183,7 @@ class TornadoQueueClient(SimpleQueueClient):
             # TornadoConnection can process heartbeats, so enable them.
             rabbitmq_heartbeat=None)
         self._on_open_cbs = []  # type: List[Callable[[], None]]
+        self._connection_failure_count = 0
 
     def _connect(self) -> None:
         self.log.info("Beginning TornadoQueueClient connection")
@@ -202,21 +203,40 @@ class TornadoQueueClient(SimpleQueueClient):
 
     CONNECTION_RETRY_SECS = 2
 
+    # When the RabbitMQ server is restarted, it's normal for it to
+    # take a few seconds to come back; we'll retry a few times and all
+    # will be well. So for the first few failures, we report only at
+    # "warning" level, avoiding an email to the server admin.
+    #
+    # A loss of an existing connection starts a retry loop just like a
+    # failed connection attempt, so it counts as the first failure.
+    #
+    # On an unloaded test system, a RabbitMQ restart takes about 6s,
+    # potentially causing 4 failures. We add some headroom above that.
+    CONNECTION_FAILURES_BEFORE_NOTIFY = 10
+
     def _on_connection_open_error(self, connection: pika.connection.Connection,
                                   message: Optional[str]=None) -> None:
+        self._connection_failure_count += 1
         retry_secs = self.CONNECTION_RETRY_SECS
-        self.log.critical("TornadoQueueClient couldn't connect to RabbitMQ, retrying in %d secs..."
-                          % (retry_secs,))
+        message = ("TornadoQueueClient couldn't connect to RabbitMQ, retrying in %d secs..."
+                   % (retry_secs,))
+        if self._connection_failure_count > self.CONNECTION_FAILURES_BEFORE_NOTIFY:
+            self.log.critical(message)
+        else:
+            self.log.warning(message)
         ioloop.IOLoop.instance().call_later(retry_secs, self._reconnect)
 
     def _on_connection_closed(self, connection: pika.connection.Connection,
                               reply_code: int, reply_text: str) -> None:
+        self._connection_failure_count = 1
         retry_secs = self.CONNECTION_RETRY_SECS
         self.log.warning("TornadoQueueClient lost connection to RabbitMQ, reconnecting in %d secs..."
                          % (retry_secs,))
         ioloop.IOLoop.instance().call_later(retry_secs, self._reconnect)
 
     def _on_open(self, connection: pika.connection.Connection) -> None:
+        self._connection_failure_count = 0
         try:
             self.connection.channel(
                 on_open_callback = self._on_channel_open)
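
The escalation logic in the diff is easy to lift out in isolation. Below is a minimal, standalone sketch of the same idea, not Zulip's actual code: count consecutive connection failures, log at "warning" until a threshold is crossed, then log at "critical" (the level that, under a logging setup which mails errors to admins, would trigger a notification). The names ReconnectReporter and FAILURES_BEFORE_NOTIFY are illustrative, not from the repository.

import logging

logger = logging.getLogger(__name__)

# Hypothetical threshold, mirroring CONNECTION_FAILURES_BEFORE_NOTIFY in the diff above.
FAILURES_BEFORE_NOTIFY = 10

class ReconnectReporter:
    """Track consecutive connection failures and choose a log level.

    Illustrative sketch only; names and structure are not from zulip/zulip.
    """

    def __init__(self) -> None:
        self.failure_count = 0

    def record_failure(self, message: str) -> None:
        self.failure_count += 1
        if self.failure_count > FAILURES_BEFORE_NOTIFY:
            # Persistent outage: log at a level that typically notifies admins.
            logger.critical(message)
        else:
            # Brief blip (e.g. a RabbitMQ restart): keep it at "warning".
            logger.warning(message)

    def record_success(self) -> None:
        # A successful (re)connection ends the failure streak.
        self.failure_count = 0

A caller would invoke record_failure() from its connection-error and connection-closed handlers and record_success() once a connection opens, so only outages that outlast roughly FAILURES_BEFORE_NOTIFY retry intervals escalate to an admin-visible error.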