digest: Cache per-stream recent topics, rather than batching.

The query plan for fetching recent messages from the arbitrary set of
streams formed by the union of 30 random users' subscriptions can be
quite bad, and can descend into a sequential scan on `zerver_recipient`.
Worse, the work of pulling out a stream's recent messages is redone if
the stream appears again in the next batch of 30 users.

Instead, pull the recent messages for each stream one at a time, and
cache the results in an in-memory LRU cache.  Since digests are
enqueued in 30-user batches, but still one realm at a time, this saves
work in two ways: each single-stream query gets a faster plan, and its
result is reused across batches within the realm.
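
Concretely, this is the `functools.lru_cache` pattern applied in the
diff below.  A minimal, self-contained sketch of the idea (all names
are illustrative, and the function body stands in for the real Message
query):

    import functools
    from datetime import datetime, timedelta, timezone

    # Every argument participates in the lru_cache key, so entries made
    # with an older cutoff are simply never hit again, and they age out
    # of the 500-entry LRU on their own.
    @functools.lru_cache(maxsize=500)
    def recent_topics(realm_id: int, stream_id: int, cutoff: datetime) -> tuple:
        print(f"querying stream {stream_id}")  # stands in for the DB query
        return ()  # stands in for the list of DigestTopic objects

    now = datetime.now(timezone.utc)
    recent_topics(1, 42, now)                      # miss: runs the query
    recent_topics(1, 42, now)                      # hit: no query
    recent_topics(1, 42, now + timedelta(days=1))  # new cutoff: miss again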

This requires that we pull the stream-id to stream-name mapping for
_all_ streams in the realm at once, but that is well-indexed and
unlikely to cause performance issues -- in fact, it may be faster
than pulling a random subset of the streams in the realm.
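
As a minimal sketch of that "slim" id-to-stream mapping (SlimStream is
an illustrative stand-in for a Stream row fetched with
`.only("id", "name")`, i.e. only the columns build_message_list needs):

    from typing import Dict, List, NamedTuple

    class SlimStream(NamedTuple):  # stand-in for a two-column Stream row
        id: int
        name: str

    def slim_stream_id_map(streams: List[SlimStream]) -> Dict[int, SlimStream]:
        return {stream.id: stream for stream in streams}

    print(slim_stream_id_map([SlimStream(1, "design"), SlimStream(2, "devel")]))
    # {1: SlimStream(id=1, name='design'), 2: SlimStream(id=2, name='devel')}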
Author: Alex Vandiver
Date: 2023-08-31 18:37:08 +00:00
Committed by: Tim Abbott
Parent: ffb6c95bba
Commit: b555d3f553
2 changed files with 49 additions and 27 deletions

--- a/zerver/lib/digest.py
+++ b/zerver/lib/digest.py

@@ -1,4 +1,5 @@
 import datetime
+import functools
 import heapq
 import logging
 from collections import defaultdict
@@ -149,9 +150,12 @@ def _enqueue_emails_for_realm(realm: Realm, cutoff: datetime.datetime) -> None:
     )
 
 
+# We cache both by stream-id and cutoff, which ensures the per-stream
+# cache also does not contain data from old digests
+@functools.lru_cache(maxsize=500)
 def get_recent_topics(
     realm_id: int,
-    stream_ids: List[int],
+    stream_id: int,
     cutoff_date: datetime.datetime,
 ) -> List[DigestTopic]:
     # Gather information about topic conversations, then
@@ -164,14 +168,14 @@ def get_recent_topics(
         Message.objects.filter(
             realm_id=realm_id,
             recipient__type=Recipient.STREAM,
-            recipient__type_id__in=stream_ids,
+            recipient__type_id=stream_id,
             date_sent__gt=cutoff_date,
         )
         .order_by(
             "id",  # we will sample the first few messages
         )
         .select_related(
-            "recipient",  # we need stream_id
+            "recipient",  # build_message_list looks up recipient.type
             "sender",  # we need the sender's full name
             "sending_client",  # for Message.sent_by_human
         )
@@ -184,7 +188,7 @@ def get_recent_topics(
     digest_topic_map: Dict[TopicKey, DigestTopic] = {}
     for message in messages:
-        topic_key = (message.recipient.type_id, message.topic_name())
+        topic_key = (stream_id, message.topic_name())
 
         if topic_key not in digest_topic_map:
             digest_topic_map[topic_key] = DigestTopic(topic_key)
@@ -298,13 +302,10 @@ def get_user_stream_map(user_ids: List[int], cutoff_date: datetime.datetime) ->
     return dct
 
 
-def get_slim_stream_id_map(stream_ids: Set[int]) -> Dict[int, Stream]:
+def get_slim_stream_id_map(realm: Realm) -> Dict[int, Stream]:
     # "slim" because it only fetches the names of the stream objects,
     # suitable for passing into build_message_list.
-    streams = Stream.objects.filter(
-        id__in=stream_ids,
-    ).only("id", "name")
+    streams = get_active_streams(realm).only("id", "name")
     return {stream.id: stream for stream in streams}
@@ -320,28 +321,19 @@ def bulk_get_digest_context(
     # Convert from epoch seconds to a datetime object.
     cutoff_date = datetime.datetime.fromtimestamp(int(cutoff), tz=datetime.timezone.utc)
 
+    result: Dict[int, Dict[str, Any]] = {}
+    stream_id_map = get_slim_stream_id_map(realm)
+    recently_created_streams = get_recently_created_streams(realm, cutoff_date)
+
     user_ids = [user.id for user in users]
     user_stream_map = get_user_stream_map(user_ids, cutoff_date)
 
-    all_stream_ids = set()
-
-    result: Dict[int, Dict[str, Any]] = {}
-    for user in users:
-        stream_ids = user_stream_map[user.id]
-        all_stream_ids |= stream_ids
-
-    # Get all the recent topics for all the users.  This does the heavy
-    # lifting of making an expensive query to the Message table.  Then
-    # for each user, we filter to just the streams they care about.
-    recent_topics = get_recent_topics(realm.id, sorted(all_stream_ids), cutoff_date)
-    stream_id_map = get_slim_stream_id_map(all_stream_ids)
-    recently_created_streams = get_recently_created_streams(realm, cutoff_date)
-
     for user in users:
         stream_ids = user_stream_map[user.id]
+        recent_topics = []
+        for stream_id in stream_ids:
+            recent_topics += get_recent_topics(realm.id, stream_id, cutoff_date)
 
         hot_topics = get_hot_topics(recent_topics, stream_ids)
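
To see why the new per-stream loop saves queries across users and
batches, here is a toy, self-contained model of it (names and data are
illustrative, not Zulip's):

    import functools
    from typing import Dict, List, Set

    @functools.lru_cache(maxsize=500)
    def recent_topics(stream_id: int) -> List[str]:
        print(f"DB query for stream {stream_id}")
        return [f"topic-in-{stream_id}"]

    user_stream_map: Dict[str, Set[int]] = {
        "othello": {1, 2, 3},
        "iago": {2, 3, 4},
    }
    for user, stream_ids in user_stream_map.items():
        topics: List[str] = []
        for stream_id in sorted(stream_ids):
            topics += recent_topics(stream_id)

    # Only four "DB query" lines print, not six: streams 2 and 3 were
    # cache hits for iago.
    print(recent_topics.cache_info())  # hits=2, misses=4, currsize=4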

--- a/zerver/tests/test_digest.py
+++ b/zerver/tests/test_digest.py

@@ -18,6 +18,7 @@ from zerver.lib.digest import (
     enqueue_emails,
     gather_new_streams,
     get_hot_topics,
+    get_recent_topics,
     get_recently_created_streams,
     get_user_stream_map,
 )
@@ -65,7 +66,10 @@ class TestDigestEmailMessages(ZulipTestCase):
         # This code is run when we call `confirmation.models.create_confirmation_link`.
         # To trigger this, we call the one_click_unsubscribe_link function below.
         one_click_unsubscribe_link(othello, "digest")
-        with self.assert_database_query_count(8):
+
+        # Clear the LRU cache on the stream topics
+        get_recent_topics.cache_clear()
+        with self.assert_database_query_count(10):
             bulk_handle_digest_email([othello.id], cutoff)
 
         self.assertEqual(mock_send_future_email.call_count, 1)
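
The cache_clear() calls matter for test isolation: an lru_cache lives
for the whole test process, so the asserted query counts would
otherwise depend on which tests ran first.  A minimal illustration of
the functools API the tests rely on:

    import functools

    @functools.lru_cache(maxsize=500)
    def square(x: int) -> int:
        return x * x

    square(2)
    square(2)
    print(square.cache_info())  # CacheInfo(hits=1, misses=1, maxsize=500, currsize=1)
    square.cache_clear()
    print(square.cache_info())  # CacheInfo(hits=0, misses=0, maxsize=500, currsize=0)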
@@ -82,6 +86,30 @@ class TestDigestEmailMessages(ZulipTestCase):
         self.assertIn("some content", teaser_messages[0]["content"][0]["plain"])
         self.assertIn(teaser_messages[0]["sender"], expected_participants)
 
+        # If we run another batch, we reuse the topic queries; there
+        # are 3 reused streams and one new one, for a net of two fewer
+        # queries than before.
+        iago = self.example_user("iago")
+        with self.assert_database_query_count(8):
+            bulk_handle_digest_email([iago.id], cutoff)
+        self.assertEqual(get_recent_topics.cache_info().hits, 3)
+        self.assertEqual(get_recent_topics.cache_info().currsize, 4)
+
+        # Two users in the same batch, with only one new stream from
+        # the above.
+        cordelia = self.example_user("cordelia")
+        prospero = self.example_user("prospero")
+        with self.assert_database_query_count(9):
+            bulk_handle_digest_email([cordelia.id, prospero.id], cutoff)
+        self.assertEqual(get_recent_topics.cache_info().hits, 7)
+        self.assertEqual(get_recent_topics.cache_info().currsize, 5)
+
+        # If we use a different cutoff, it makes new cache entries.
+        with self.assert_database_query_count(12):
+            bulk_handle_digest_email([cordelia.id, prospero.id], cutoff + 1)
+        self.assertEqual(get_recent_topics.cache_info().hits, 8)
+        self.assertEqual(get_recent_topics.cache_info().currsize, 9)
+
     def test_bulk_handle_digest_email_skips_deactivated_users(self) -> None:
         """
         A user id may be added to the queue before the user is deactivated.  In such a case,
@@ -141,7 +169,8 @@ class TestDigestEmailMessages(ZulipTestCase):
         # This code is run when we call `confirmation.models.create_confirmation_link`.
         # To trigger this, we call the one_click_unsubscribe_link function below.
         one_click_unsubscribe_link(polonius, "digest")
-        with self.assert_database_query_count(8):
+        get_recent_topics.cache_clear()
+        with self.assert_database_query_count(9):
             bulk_handle_digest_email([polonius.id], cutoff)
 
         self.assertEqual(mock_send_future_email.call_count, 1)
@@ -202,7 +231,8 @@ class TestDigestEmailMessages(ZulipTestCase):
         with mock.patch("zerver.lib.digest.send_future_email") as mock_send_future_email:
             digest_user_ids = [user.id for user in digest_users]
 
-            with self.assert_database_query_count(11):
+            get_recent_topics.cache_clear()
+            with self.assert_database_query_count(14):
                 with self.assert_memcached_count(0):
                     bulk_handle_digest_email(digest_user_ids, cutoff)