From b15cb27fcc9ea1f4d4bb0da9d9ee9f2643c6f67b Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Fri, 18 Dec 2020 18:05:20 -0800 Subject: [PATCH] docs: Add a document explaining email/push notifications. With various fixes by Mateusz Mandera. --- docs/subsystems/events-system.md | 3 + docs/subsystems/index.rst | 1 + docs/subsystems/notifications.md | 160 ++++++++++++++++++++++++++++ docs/subsystems/sending-messages.md | 20 ++-- zerver/lib/email_notifications.py | 2 + zerver/lib/push_notifications.py | 2 + zerver/tornado/event_queue.py | 8 +- 7 files changed, 185 insertions(+), 11 deletions(-) create mode 100644 docs/subsystems/notifications.md diff --git a/docs/subsystems/events-system.md b/docs/subsystems/events-system.md index d0a7928729..4669232207 100644 --- a/docs/subsystems/events-system.md +++ b/docs/subsystems/events-system.md @@ -161,6 +161,9 @@ Server (which stores queues in memory) were to crash and lose its data, clients would recover, just as if they had lost Internet access briefly (there is some DoS risk to manage, though). +Note that the garbage-collection system has hooks that are important +for the implementation of [notifications](../subsystems/notifications.md). + (The Event Queue Server is designed to save any event queues to disk and reload them when the server is restarted, and catches exceptions carefully, so such incidents are very rare, but it's nice to have a diff --git a/docs/subsystems/index.rst b/docs/subsystems/index.rst index f04cf5bdb1..47ed924e06 100644 --- a/docs/subsystems/index.rst +++ b/docs/subsystems/index.rst @@ -10,6 +10,7 @@ Subsystems documentation html-css events-system sending-messages + notifications queuing custom-apps pointer diff --git a/docs/subsystems/notifications.md b/docs/subsystems/notifications.md new file mode 100644 index 0000000000..3b126b3b65 --- /dev/null +++ b/docs/subsystems/notifications.md @@ -0,0 +1,160 @@ +# Notifications in Zulip + +This is a design document aiming to provide context for developers +working on Zulip's email notifications and mobile push notifications +code paths. We recommend first becoming familiar with [sending +messages](../subsystems/sending-messages.md); this document expands on +the details of the email/mobile push notifications code path. + +## Important corner cases + +Here we name a few corner cases worth understanding in designing this +sort of notifications system: + +* The **Idle Desktop Problem**: We don't want the presence of a + desktop computer at the office to eat all notifications because the + user has an "online" client that they may not have used in 3 days. +* The **Hard Disconnect Problem**: A client can lose its connection to + the Internet (or be suspended, or whatever) at any time, and this + happens routinely. We want to ensure that races where a user closes + their laptop shortly after a notifiable message is sent does not + result in the user never receiving a notification about a message + (due to the system thinking that client received it). + +## The mobile/email notifications flow + +As a reminder, the relevant part of the flow for sending messages is +as follows: +* `do_send_messages` is the synchronous message-sending code path, + and passing the following data in its `send_event` call: + * Data about the message's content (E.g. mentions, wildcard + mentions, and alert words) and encodes it into the `UserMessage` + table's `flags` structure, which is in turn passed into + `send_event` for each user receiving the message. + * Data about user configuration relevant to the message, such as + `push_notify_user_ids` and `stream_notify_user_ids`, are included + alongside `flags` in the per-user data structure. + * The `presence_idle_user_ids` set, containing the subset of + recipient users who are mentioned, are PM recipients, have alert + words, or otherwise would normally get a notification, but have not + interacted with a Zulip client in the last few minutes. (Users who + have generally will not receive a notification unless the + `enable_online_push_notifications` flag is enabled). This data + structure ignores users for whom the message is not notifiable, + which is important to avoid this being thousands of `user_ids` for + messages to large streams with few currently active users. +* The Tornado [event queue system](../subsystems/events-system.md) + processes that data, as well as data about each user's active event + queues, to (1) push an event to each queue needing that message and + (2) for notifiable messages, pushing an event onto the + `missedmessage_mobile_notifications` and/or `missedmessage_emails` + queues. This important message-processing logic has notable extra + logic not present when processing normal events, both for details + like splicing `flags` to customize event payloads per-user, as well. + * The Tornado system determines whether the user is "offline/idle". + Zulip's email notifications are designed to not fire when the user + is actively using Zulip to avoid spam, and this is where those + checks are implemented. + * Users in `presence_idle_user_ids` are always considered idle: + the variable name means "users who are idle because of + presence". This is how we solve the Idle Desktop Problem; users + with an idle desktop are treated the same as users who aren't + logged in for this check. + * However, that check does not handle the Hard Disconnect Problem: + if a user was present 1 minute before a message was sent, and then + closed their laptop, the user will not be in + `presence_idle_user_ids`, and so without an additional mechanism, + messages sent shortly after a user leaves would never trigger a + notification (!). + * We solve that problem by also notifying if + `receiver_is_off_zulip` returns `True`, which checks whether the user has any + current events system clients registered to receive `message` + events. This check is done immediately (handling soft disconnects, + where E.g. the user closes their last Zulip tab and we get the + `DELETE /events/{queue_id}` request). + * The `receiver_is_off_zulip` check is effectively repeated when + event queues are garbage-collected (in `missedmessage_hook`) by + looking for whether the queue being garbage-collectee was the only + one; this second check solves the Hard Disconnect Problem, resulting in + notifications for these hard-disconnect cases usually coming 10 + minutes late. + * The message-edit code path has parallel logic in + `maybe_enqueue_notifications_for_message_update` for triggering + notifications in cases like a mention added during message + editing. + * The business logic for all these notification decisions made + inside Tornado has extensive automated test suites; e.g. + `test_message_edit_notifications.py` covers all the cases around + editing a message to add/remove a mention. + * We may in the future want to add some sort of system for letting + users see past notifications, to help with explaining and + debugging this system, since it has so much complexity. +* Desktop notifications are the simplest; they are implemented + client-side by the web/desktop app's logic + (`static/js/notifications.js`) inspecting the `flags` fields that + were spliced into `message` events by the Tornado system, as well as + the user's notification settings. +* The queue processors for those queues make the final determination + for whether to send a notification, and do the work to generate an + email (`zerver/lib/email_notifications.py`) or mobile + (`zerver/lib/push_notifications.py`) notification. We'll detail + this process in more detail for each system below, but it's + important to know that it's normal for a message to sit in these + queues for minutes (and in the future, possibly hours). +* Both queue processor code paths do additional filtering before + sending a notification: + * Messages that have already been marked as read by the user before + the queue processor runs never trigger a notification. + * Messages that were already deleted never trigger a notification. + * The user-level settings for whether email/mobile notifications are + disabled are rechecked, as the user may have disabled one of these + settings during the queuing period. + * The **Email notifications queue processor**, `MissedMessageWorker`, + takes care to wait for 2 minutes (hopefully in the future this will be a + configuration setting) and starts a thread to batch together multiple + messages into a single email. These features are unnecessary + for mobile push notifications, because we can live-update those + details with a future notification, whereas emails cannot be readily + updated once sent. Zulip's email notifications are styled similarly + to GitHub's email notifications, with a clean, simple design that + makes replying from an email client possible (using the [incoming + email integration](../production/email-gateway.md)). + * The **Push notifications queue processor**, + `PushNotificationsWorker`, is a simple wrapper around the + `push_notifications.py` code that actually sends the + notification. This logic is somewhat complicated by having to track + the number of unread push notifications to display on the mobile + apps' badges, as well as using the [Mobile Push Notifications + Service](../production/mobile-push-notifications.md) for self-hosted + systems. + +The following important constraints are worth understanding about the +structure of the system, when thinking about changes to it: + +* **Bulk database queries** are much more efficient for checking + details from the database like "which users receiving this message + are online". +* **Thousands of users**. Zulip supports thousands of users, and we + want to avoid `send_event()` pushing large amounts of per-user data + to Tornado via RabbitMQ for scalability reasons. +* **Tornado doesn't do database queries**. Because the Tornado system + is an asynchronous event-driven framework, and our Django database + library is synchronous, database queries are very expensive. So + these queries need to be done in either `do_send_messages` or the + queue processor logic. (For example, this means `presence` data + should be checked in either `do_send_messages` or the queue + processors, not in Tornado). +* **Future configuration**. Notification settings are an area that we + expect to only expand with time, with upcoming features like + following a topic (to get notifications for messages only within + that topic in a stream). There are a lot of different workflows + possible with Zulip's threading, and it's important to make it easy + for users to setup Zulip's notification to fit as many of those + workflows as possible. +* **Message editing**. Zulip supports editing messages, and that + interacts with notifications in ways that require careful handling: + Notifications should have + the latest edited content (users often fix typos 30 seconds after + sending a message), adding a mention when editing a message should + send a notification to the newly mentioned user(s), and deleting a + message should cancel any unsent notifications for it. diff --git a/docs/subsystems/sending-messages.md b/docs/subsystems/sending-messages.md index 65e8b98880..7c50caea98 100644 --- a/docs/subsystems/sending-messages.md +++ b/docs/subsystems/sending-messages.md @@ -51,15 +51,16 @@ This section details the ways in which it is different: * There is significant custom code inside the `process_message_event` function in `zerver/tornado/event_queue.py`. This custom code has a number of purposes: - * Triggering email and mobile push notifications for any users who + * Triggering [email and mobile push + notifications](../subsystems/notifications.md) for any users who do not have active clients and have settings of the form "push notifications when offline". In order to avoid doing any real computational work inside the Tornado codebase, this logic aims to just do the check for whether a notification should be generated, and then put an event into an appropriate - [queue](../subsystems/queuing.md) to actually send the - message. See `maybe_enqueue_notifications` and related code for - this part of the logic. + [queue](../subsystems/queuing.md) to actually send the message. + See `maybe_enqueue_notifications` and related code for this part + of the logic. * Splicing user-dependent data (E.g. `flags` such as when the user was `mentioned`) into the events. * Handling the [local echo details](#local-echo). @@ -373,11 +374,12 @@ it’ll arrive in the couple hundred milliseconds one would expect if the extra 4500 inactive subscribers didn’t exist. There are a few details that require special care with this system: -* Email and mobile push notifications. We need to make sure these are - still correctly delivered to soft-deactivated users; making this - work required careful work for those code paths that assumed a - `UserMessage` row would always exist for a message that triggers a - notification to a given user. +* [Email and mobile push + notifications](../subsystems/notifications.md). We need to make + sure these are still correctly delivered to soft-deactivated users; + making this work required careful work for those code paths that + assumed a `UserMessage` row would always exist for a message that + triggers a notification to a given user. * Digest emails, which use the `UserMessage` table extensively to determine what has happened in streams the user can see. We can use the user's subscriptions to construct what messages they should have diff --git a/zerver/lib/email_notifications.py b/zerver/lib/email_notifications.py index fe89ece6b9..7396490156 100644 --- a/zerver/lib/email_notifications.py +++ b/zerver/lib/email_notifications.py @@ -1,3 +1,5 @@ +# See https://zulip.readthedocs.io/en/latest/subsystems/notifications.html + import re from collections import defaultdict from datetime import timedelta diff --git a/zerver/lib/push_notifications.py b/zerver/lib/push_notifications.py index 33f62b7d6d..a09353c995 100644 --- a/zerver/lib/push_notifications.py +++ b/zerver/lib/push_notifications.py @@ -1,3 +1,5 @@ +# See https://zulip.readthedocs.io/en/latest/subsystems/notifications.html + import base64 import logging import re diff --git a/zerver/tornado/event_queue.py b/zerver/tornado/event_queue.py index 7eef27ebcc..4823786f57 100644 --- a/zerver/tornado/event_queue.py +++ b/zerver/tornado/event_queue.py @@ -765,7 +765,7 @@ def missedmessage_hook( def receiver_is_off_zulip(user_profile_id: int) -> bool: # If a user has no message-receiving event queues, they've got no open zulip - # session so we notify them + # session so we notify them. all_client_descriptors = get_client_descriptors_for_user(user_profile_id) message_event_queues = [ client for client in all_client_descriptors if client.accepts_messages() @@ -789,7 +789,11 @@ def maybe_enqueue_notifications( ) -> Dict[str, bool]: """This function has a complete unit test suite in `test_enqueue_notifications` that should be expanded as we add - more features here.""" + more features here. + + See https://zulip.readthedocs.io/en/latest/subsystems/notifications.html + for high-level design documentation. + """ notified: Dict[str, bool] = {} if (idle or always_push_notify) and (