Mirror of https://github.com/zulip/zulip.git
errors: Fix infinite recursion during memcached outages.
We have a very useful piece of code, _RateLimitFilter, which is designed to avoid sending us a billion error emails in the event that a Zulip production server is down in a way that throws the same exception a lot. The code uses memcached to ensure we send each traceback roughly once per Zulip server per 10 minutes (or, if memcached is unavailable, at most once per process per 10 minutes, since we use memcached to coordinate between processes).

However, if memcached is down, there is a logging.error call internal to the Django/memcached setup that happens inside the cache.set() call, and that call isn't caught by the `except Exception` block around it. Because this handler is configured to run for logging.error in addition to logging.exception, the result is infinite recursion, eventually ending in `Fatal Python error: Cannot recover from stack overflow.`.

We fix this using a thread-local variable to detect whether we are being called recursively.

This change should prevent some nasty failure modes we've had in the past, where memcached being down resulted in infinite recursion (consuming extra resources in our error notifications code and, most importantly, preventing the error notifications from being sent at all).

Fixes #12595.
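For readers skimming the diff below, here is a minimal sketch of the thread-local recursion-guard pattern the commit applies; the names (_guard, handle_error, risky_cache_probe) are illustrative assumptions, not Zulip's actual identifiers:

import threading

# Illustrative recursion guard: a thread-local flag marks "we are
# already inside the handler on this thread", so a re-entrant call
# can bail out instead of recursing.
_guard = threading.local()

def handle_error() -> None:
    if getattr(_guard, "active", False):
        # Re-entered from within our own handler (e.g. the cache
        # client logged an error while we were probing it); fall
        # back to local-only handling instead of recursing.
        return
    _guard.active = True
    try:
        risky_cache_probe()  # may log errors that re-enter handle_error()
    finally:
        _guard.active = False

def risky_cache_probe() -> None:
    """Stand-in for the memcached access that can itself log errors."""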
committed by Tim Abbott
parent 177eb2e02e
commit a7ec1a089c
@@ -6,49 +6,96 @@ from django.utils.timezone import utc as timezone_utc
 import hashlib
 import logging
 import re
+import threading
 import traceback
-from typing import Optional
+from typing import Optional, Tuple
 from datetime import datetime, timedelta
 from django.conf import settings
+from django.core.cache import cache
 from logging import Logger

-# Adapted http://djangosnippets.org/snippets/2242/ by user s29 (October 25, 2010)
-
 class _RateLimitFilter:
+    """This class is designed to rate-limit Django error reporting
+    notifications so that it won't send thousands of emails if the
+    database or cache is completely down. It uses a remote shared
+    cache (shared by all Django processes) for its default behavior
+    (so that the deduplication is global, not per-process), and a
+    local in-process cache for when it can't access the remote cache.
+
+    This is critical code because it is called every time
+    `logging.error` or `logging.exception` (or an exception) happens
+    in the codebase.
+
+    Adapted from https://djangosnippets.org/snippets/2242/.
+    """
     last_error = datetime.min.replace(tzinfo=timezone_utc)
+    # This thread-local variable is used to detect recursive
+    # exceptions during exception handling (primarily intended for
+    # when accessing the shared cache throws an exception).
+    handling_exception = threading.local()
+    should_reset_handling_exception = False
+
+    def can_use_remote_cache(self) -> Tuple[bool, bool]:
+        if getattr(self.handling_exception, 'value', False):
+            # If we're processing an exception that occurred
+            # while handling an exception, this almost
+            # certainly was because interacting with the
+            # remote cache is failing (e.g. because the cache
+            # is down). Fall back to tracking duplicate
+            # exceptions in memory without the remote shared cache.
+            return False, False
+
+        # Now we test if the remote cache is accessible.
+        #
+        # This code path can only be reached if we are not potentially
+        # handling a recursive exception, so here we set
+        # self.handling_exception (in case the cache access we're
+        # about to do triggers a `logging.error` or exception that
+        # might recurse into this filter class), and actually record
+        # that this is the main exception handler thread.
+        try:
+            self.handling_exception.value = True
+            cache.set('RLF_TEST_KEY', 1, 1)
+            return cache.get('RLF_TEST_KEY') == 1, True
+        except Exception:
+            return False, True

     def filter(self, record: logging.LogRecord) -> bool:
-        from django.conf import settings
-        from django.core.cache import cache
-
-        # Track duplicate errors
-        duplicate = False
-        rate = getattr(settings, '%s_LIMIT' % (self.__class__.__name__.upper(),),
-                       600)  # seconds
-
-        if rate > 0:
-            # Test if the cache works
-            try:
-                cache.set('RLF_TEST_KEY', 1, 1)
-                use_cache = cache.get('RLF_TEST_KEY') == 1
-            except Exception:
-                use_cache = False
-
-            if use_cache:
-                if record.exc_info is not None:
-                    tb = '\n'.join(traceback.format_exception(*record.exc_info))
-                else:
-                    tb = str(record)
-                key = self.__class__.__name__.upper() + hashlib.sha1(tb.encode()).hexdigest()
-                duplicate = cache.get(key) == 1
-                if not duplicate:
-                    cache.set(key, 1, rate)
-            else:
-                min_date = timezone_now() - timedelta(seconds=rate)
-                duplicate = (self.last_error >= min_date)
-                if not duplicate:
-                    self.last_error = timezone_now()
-
-        return not duplicate
+        # When the original filter() call finishes executing, it's
+        # going to change handling_exception.value to False. The
+        # local variable below tracks whether the *current*,
+        # potentially recursive, filter() call is allowed to touch
+        # that value (only the original will find this to be True
+        # at the end of its execution).
+        should_reset_handling_exception = False
+        try:
+            # Track duplicate errors
+            duplicate = False
+            rate = getattr(settings, '%s_LIMIT' % (self.__class__.__name__.upper(),),
+                           600)  # seconds
+
+            if rate > 0:
+                (use_cache, should_reset_handling_exception) = self.can_use_remote_cache()
+                if use_cache:
+                    if record.exc_info is not None:
+                        tb = '\n'.join(traceback.format_exception(*record.exc_info))
+                    else:
+                        tb = str(record)
+                    key = self.__class__.__name__.upper() + hashlib.sha1(tb.encode()).hexdigest()
+                    duplicate = cache.get(key) == 1
+                    if not duplicate:
+                        cache.set(key, 1, rate)
+                else:
+                    min_date = timezone_now() - timedelta(seconds=rate)
+                    duplicate = (self.last_error >= min_date)
+                    if not duplicate:
+                        self.last_error = timezone_now()
+
+            return not duplicate
+        finally:
+            if should_reset_handling_exception:
+                self.handling_exception.value = False

 class ZulipLimiter(_RateLimitFilter):
     pass
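For context on where this filter runs: filters like ZulipLimiter are attached to Django's error-mail handler through the standard LOGGING dictConfig. A sketch under assumed names (the handler and logger entries below are illustrative; Zulip's real settings differ in detail):

# Sketch of wiring a rate-limit filter into Django's LOGGING setting.
# The 'zerver.lib.logging_util.ZulipLimiter' path is real; the
# handler/logger layout below is an assumption for illustration.
LOGGING = {
    'version': 1,
    'filters': {
        'ZulipLimiter': {
            '()': 'zerver.lib.logging_util.ZulipLimiter',
        },
    },
    'handlers': {
        'mail_admins': {
            'level': 'ERROR',
            'class': 'django.utils.log.AdminEmailHandler',
            # Every logging.error()/logging.exception() reaching this
            # handler passes through ZulipLimiter.filter() first.
            'filters': ['ZulipLimiter'],
        },
    },
    'loggers': {
        'django': {'handlers': ['mail_admins'], 'level': 'ERROR'},
    },
}

With this wiring, filter() returning False suppresses the duplicate error mail, while returning True lets it through.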
@@ -208,3 +208,14 @@ class ErrorFiltersTest(TestCase):
                          "api_key=******&stream=******")
         self.assertEqual(clean_data_from_query_parameters("api_key=abcdz&stream=foo&topic=bar"),
                          "api_key=******&stream=******&topic=******")
+
+class RateLimitFilterTest(ZulipTestCase):
+    def test_recursive_filter_handling(self) -> None:
+        def mocked_cache_get(key: str) -> int:
+            logging.error("Log an error to trigger recursive filter() calls in _RateLimitFilter.")
+            raise Exception
+
+        with patch("zerver.lib.logging_util.cache.get", side_effect=mocked_cache_get) as m:
+            logging.error("Log an error to trigger initial _RateLimitFilter.filter() call.")
+            # cache.get should have only been called once, by the original filter() call:
+            m.assert_called_once()
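The mocked test above exercises the guard end-to-end through the logging stack. A more direct, hypothetical unit-style check of the same invariant (not part of this commit) could assert the short-circuit behavior of can_use_remote_cache():

# Hypothetical direct check (not in the commit): simulate a recursive
# call by pre-setting the thread-local flag, then confirm the filter
# refuses to touch the remote cache, returning (False, False).
from zerver.lib.logging_util import _RateLimitFilter

def test_guard_short_circuits() -> None:
    f = _RateLimitFilter()
    f.handling_exception.value = True
    try:
        assert f.can_use_remote_cache() == (False, False)
    finally:
        f.handling_exception.value = False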