From 0744961afe6f866de1b1dbeed4524c3b82faee39 Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Sun, 24 Sep 2017 09:43:30 -0700 Subject: [PATCH] docs: Add more comprehensive documentation on error reporting. Fixes #2232. --- docs/logging.md | 155 ++++++++++++++++++++++++++++++++++--- static/js/blueslip.js | 2 + zerver/lib/error_notify.py | 1 + zerver/lib/logging_util.py | 1 + zerver/logging_handlers.py | 1 + zerver/views/report.py | 2 + 6 files changed, 152 insertions(+), 10 deletions(-) diff --git a/docs/logging.md b/docs/logging.md index 2bab392032..238b4c1cd8 100644 --- a/docs/logging.md +++ b/docs/logging.md @@ -1,10 +1,71 @@ -# Logging and Performance Debugging +# Logging and Error reporting -It's good to have the terminal running `run-dev.py` up as you work since error -messages including tracebacks along with every backend request will be printed -there. +Having a good system for logging error reporting is essential to +making a large project like Zulip successful. Without reliable error +reporting, one has to rely solely on bug reports from users in order +to produce a working product. -The messages will look similar to: +Our goal as a project is to have zero known 500 errors on the backend +and zero known JavaScript exceptions on the frontend. While there +will always be new bugs being introduced, that goal is impossible +without an efficient and effective error reporting framework. + +We expect to in the future integrate a service like [Sentry][sentry] +to make it easier for very large installations like zulipchat.com to +manage their exceptions and ensure they are all tracked down, but our +default email-based system is great for small installations. + +## Backend error reporting + +The [Django][django-errors] framework provides much of the +infrastructure needed by our error reporting system: + +* The ability to send emails to the server's administrators with any + 500 errors, using the `mail_admins` function. We enhance these data + with extra details (like what user was involved in the error) in + `zerver/logging_handlers.py`, and then send them to the + administrator in `zerver/lib/error_notify.py` (which also supports + sending Zulips to a stream about production errors). +* The ability to rate-limit certain errors to avoid sending hundreds + of emails in an outage (see `_RateLimitFilter` in + `zerver/lib/logging_util.py`) +* A nice framework for filtering passwords and other important user + data from the exception details, which we use in + `zerver/filters.py`. +* Middleware for handling `JsonableError`, our system for allowing + code anywhere in Django to report an API-facing `json_error` from + anywhere in a view code path. + +Since 500 errors in any Zulip server are usually a problem the server +administrator should investigate and/or report upstream, we have this +email reporting system configured to report errors by default. + +### Backend logging + +[Django's logging system][django-logging] uses the standard +[Python logging infrastructure][python-logging]. We have configured +them so that `logging.exception` and `logging.error` get emailed to +the server maintainer, while `logging.warning` will just appear in +`/var/log/zulip/errors.log`. Lower log levels just appear in the main +server log (as well as in the log for corresponding process, be it +`django.log` for the main Django processes or the appropriate +`events_*` log file for a queue worker). + +#### Backend logging format + +The main Zulip server log contains a line for each backend request. +It also contains warnings, errors, and the full tracebacks for any +Python exceptions. In production, it goes to +`/var/log/zulip/server.log`; in development, it goes to the terminal +where you run `run-dev.py`. + +In development, it's good to keep an eye on the `run-dev.py` console +as you work on backend changes, since it's a great way to notice bugs +you just introduced. + +In production, one usually wants to look at `errors.log` for errors +since the main server log can be very verbose, but the main server log +can be extremely valuable for investigating performance problems. ``` 2016-05-20 14:50:22,056 INFO 127.0.0.1 GET 302 528ms (db: 1ms/1q) (+start: 123ms) / (unauth via ?) @@ -17,8 +78,82 @@ The messages will look similar to: 2016-05-20 14:50:26,959 INFO 127.0.0.1 GET 200 588ms (db: 26ms/21q) / [1463769771:0] (cordelia@zulip.com via website) ``` -The format of this output is: timestamp, loglevel, IP, HTTP Method, HTTP status -code, time to process, (optional perf data details, e.g. database time/queries, -memcached time/queries, Django process startup time, markdown processing time, -etc.), URL, and "email via client" showing user account involved (if logged in) -and the type of client they used ("web", "Android", etc.). +The format of this output is: +* Timestamp +* Log level +* IP address +* HTTP Method +* HTTP status code +* Time to process +* (Optional perf data details, e.g. database time/queries, memcached +time/queries, Django process startup time, markdown processing time, +etc.) +* Endpoint/URL from zproject/urls.py +* "email via client" showing user account involved (if logged in) and +the type of client they used ("web", "Android", etc.). + +The performance data details are particularly useful for investigating +performance problems, since one can see at a glance whether a slow +request was caused by delays in the database, in the markdown +processor, in memcached, or in other Python code. + +One useful thing to note, however, is that the database time is only +the time spent connecting to and receiving a response from the +database. Especially when response are large, there can often be a +great deal of Python processing overhead to marshall the data from the +database into Django objects that is not accounted for in these +numbers. + +## Blueslip frontend error reporting + +We have a custom library, called `blueslip` (named after the form used +at MIT to report problems with the facilities), that takes care of +reporting JavaScript errors. In production, this means emailing the +server administrators (though the setting controlling this, +`BROWSER_ERROR_REPORTING`, is disabled by default, since most problems +are unlikely to be addressable by a system administrator, and it's +very hard to make JavaScript errors not at least somewhat spammy due +to the variety of browser versions and sets of extensions that someone +might use). In development, this means displaying a highly visible +overlay over the message view area, to make exceptions in testing a +new feature hard to miss. + +* Blueslip is implemented in `static/js/blueslip.js`. +* In order to capture essentially any error occuring in the browser, +blueslip does the following: + * Wraps every function passed into `$.ready()`, i.e., every + on-webapp-startup method used by Zulip. + * Wraps every jQuery AJAX request handler used by Zulip. + * Wraps every function passed into `$.on()`, i.e. all event + handlers declared in Zulip. + * Declares a default browser exception handler. + * Has methods for being manually triggered by Zulip JavaScript code + for warnings and assertion failures. +* Blueslip keeps a log of all the notices it has received during a + browser session, and includes them in reports to the server, so that + one can see cases where exceptions chained together. You can print + this log from the browser console using `blueslip.get_log()`. + +Blueslip supports several error levels: +* `blueslip.fatal`: For fatal errors that cannot be easily recovered + from. We try to avoid using it, since it kills the current JS + thread, rather than returning execution to the caller. Unhandled + exceptions in our JS code are treated like `blueslip.fatal`. +* `blueslip.error`: For logging of events that are definitely caused + by a bug and thus sufficiently important to be reported, but where + we can handle the error without creating major user-facing problems + (e.g. an exception when handling a presence update). +* `blueslip.warn`: For logging of events that are a problem but not + important enough to send an email about in production. They are, + however, highlighted in the JS console in development. +* `blueslip.log` (and `blueslip.info`): Logged to the JS console in + development and also in the blueslip log in production. Useful for + data that might help discern what state the browser was in during an + error (e.g. whether the user was in a narrow). +* `blueslip.debug`: Similar to `blueslip.log`, but are not printed to + the JS console in development. + +[django-errors]: https://docs.djangoproject.com/en/1.11/howto/error-reporting/ +[python-logging]: https://docs.python.org/3/library/logging.html +[django-logging]: https://docs.djangoproject.com/en/1.11/topics/logging/ +[sentry]: https://sentry.io diff --git a/static/js/blueslip.js b/static/js/blueslip.js index a128fa8536..87bd4f4fd3 100644 --- a/static/js/blueslip.js +++ b/static/js/blueslip.js @@ -1,3 +1,5 @@ +// System documented in https://zulip.readthedocs.io/en/latest/logging.html + // This must be included before the first call to $(document).ready // in order to be able to report exceptions that occur during their // execution. diff --git a/zerver/lib/error_notify.py b/zerver/lib/error_notify.py index 3081122725..be313904b5 100644 --- a/zerver/lib/error_notify.py +++ b/zerver/lib/error_notify.py @@ -1,3 +1,4 @@ +# System documented in https://zulip.readthedocs.io/en/latest/logging.html import logging diff --git a/zerver/lib/logging_util.py b/zerver/lib/logging_util.py index 8a71ab1110..206b816d0a 100644 --- a/zerver/lib/logging_util.py +++ b/zerver/lib/logging_util.py @@ -1,3 +1,4 @@ +# System documented in https://zulip.readthedocs.io/en/latest/logging.html from django.utils.timezone import now as timezone_now from django.utils.timezone import utc as timezone_utc diff --git a/zerver/logging_handlers.py b/zerver/logging_handlers.py index 2bce3fd803..3d737e4d5d 100644 --- a/zerver/logging_handlers.py +++ b/zerver/logging_handlers.py @@ -1,3 +1,4 @@ +# System documented in https://zulip.readthedocs.io/en/latest/logging.html from django.conf import settings from typing import Any, Dict, Optional diff --git a/zerver/views/report.py b/zerver/views/report.py index 3aa9944a0c..009576d7ff 100644 --- a/zerver/views/report.py +++ b/zerver/views/report.py @@ -1,3 +1,5 @@ +# System documented in https://zulip.readthedocs.io/en/latest/logging.html + from typing import Any, Dict, Optional, Text from django.conf import settings