digest: Cache per-stream recent topics, rather than batching.

The query plan for fetching recent messages from the arbitrary set of
streams formed by the union of 30 random users' subscriptions can be
quite bad, and can descend into a sequential scan on `zerver_recipient`.
Worse, the work of pulling out a stream's recent messages is redone
whenever that stream reappears in the next batch of 30 users.

Instead, pull the recent messages for each stream one at a time, and
cache the results in an in-memory cache.  Since digests are enqueued
in 30-user batches but still one realm at a time, this saves work in
two ways: the per-stream queries have faster plans, and their results
are reused across batches.
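
Concretely, the in-memory cache is just `functools.lru_cache` applied
to the per-stream fetch.  A minimal sketch of the pattern, using the
signature from the diff below (the query body is elided; DigestTopic
is the module's existing type, forward-referenced here):

    import datetime
    import functools
    from typing import List

    @functools.lru_cache(maxsize=500)
    def get_recent_topics(
        realm_id: int,
        stream_id: int,
        cutoff_date: datetime.datetime,
    ) -> List["DigestTopic"]:
        # All three arguments form the cache key, so entries computed
        # for an older digest cutoff can never be served for a new one.
        ...

Because the cutoff participates in the key, a later digest run with a
fresh cutoff misses the cache instead of returning stale topics.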

This requires that we pull the stream-id to stream-name mapping for
_all_ streams in the realm at once, but that query is well-indexed and
unlikely to cause performance issues -- in fact, it may be faster
than pulling an arbitrary subset of the streams in the realm.
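
A minimal sketch of that realm-wide lookup, mirroring the new
`get_slim_stream_id_map` in the diff (`get_active_streams` is the
existing helper it calls):

    # Fetch only id and name for every active stream in the realm --
    # one indexed query -- then build a dict for O(1) lookups by id.
    streams = get_active_streams(realm).only("id", "name")
    stream_id_map = {stream.id: stream for stream in streams}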
Author: Alex Vandiver
Date: 2023-08-31 18:37:08 +00:00
Committed by: Tim Abbott
Parent: ffb6c95bba
Commit: b555d3f553
2 changed files with 49 additions and 27 deletions

zerver/lib/digest.py

@@ -1,4 +1,5 @@
 import datetime
+import functools
 import heapq
 import logging
 from collections import defaultdict
@@ -149,9 +150,12 @@ def _enqueue_emails_for_realm(realm: Realm, cutoff: datetime.datetime) -> None:
     )
 
 
+# We cache both by stream-id and cutoff, which ensures the per-stream
+# cache also does not contain data from old digests
+@functools.lru_cache(maxsize=500)
 def get_recent_topics(
     realm_id: int,
-    stream_ids: List[int],
+    stream_id: int,
     cutoff_date: datetime.datetime,
 ) -> List[DigestTopic]:
     # Gather information about topic conversations, then
@@ -164,14 +168,14 @@ def get_recent_topics(
         Message.objects.filter(
             realm_id=realm_id,
             recipient__type=Recipient.STREAM,
-            recipient__type_id__in=stream_ids,
+            recipient__type_id=stream_id,
             date_sent__gt=cutoff_date,
         )
         .order_by(
             "id",  # we will sample the first few messages
         )
         .select_related(
-            "recipient",  # we need stream_id
+            "recipient",  # build_message_list looks up recipient.type
             "sender",  # we need the sender's full name
             "sending_client",  # for Message.sent_by_human
         )
@@ -184,7 +188,7 @@ def get_recent_topics(
     digest_topic_map: Dict[TopicKey, DigestTopic] = {}
     for message in messages:
-        topic_key = (message.recipient.type_id, message.topic_name())
+        topic_key = (stream_id, message.topic_name())
         if topic_key not in digest_topic_map:
             digest_topic_map[topic_key] = DigestTopic(topic_key)
@@ -298,13 +302,10 @@ def get_user_stream_map(user_ids: List[int], cutoff_date: datetime.datetime) ->
     return dct
 
 
-def get_slim_stream_id_map(stream_ids: Set[int]) -> Dict[int, Stream]:
+def get_slim_stream_id_map(realm: Realm) -> Dict[int, Stream]:
     # "slim" because it only fetches the names of the stream objects,
     # suitable for passing into build_message_list.
-    streams = Stream.objects.filter(
-        id__in=stream_ids,
-    ).only("id", "name")
+    streams = get_active_streams(realm).only("id", "name")
     return {stream.id: stream for stream in streams}
@@ -320,28 +321,19 @@ def bulk_get_digest_context(
     # Convert from epoch seconds to a datetime object.
     cutoff_date = datetime.datetime.fromtimestamp(int(cutoff), tz=datetime.timezone.utc)
 
-    result: Dict[int, Dict[str, Any]] = {}
+    stream_id_map = get_slim_stream_id_map(realm)
+    recently_created_streams = get_recently_created_streams(realm, cutoff_date)
 
     user_ids = [user.id for user in users]
     user_stream_map = get_user_stream_map(user_ids, cutoff_date)
 
-    all_stream_ids = set()
-
+    result: Dict[int, Dict[str, Any]] = {}
     for user in users:
         stream_ids = user_stream_map[user.id]
-        all_stream_ids |= stream_ids
-
-    # Get all the recent topics for all the users.  This does the heavy
-    # lifting of making an expensive query to the Message table.  Then
-    # for each user, we filter to just the streams they care about.
-    recent_topics = get_recent_topics(realm.id, sorted(all_stream_ids), cutoff_date)
-
-    stream_id_map = get_slim_stream_id_map(all_stream_ids)
-
-    recently_created_streams = get_recently_created_streams(realm, cutoff_date)
-
-    for user in users:
-        stream_ids = user_stream_map[user.id]
+        recent_topics = []
+        for stream_id in stream_ids:
+            recent_topics += get_recent_topics(realm.id, stream_id, cutoff_date)
 
         hot_topics = get_hot_topics(recent_topics, stream_ids)

zerver/tests/test_digest.py

@@ -18,6 +18,7 @@ from zerver.lib.digest import (
     enqueue_emails,
     gather_new_streams,
     get_hot_topics,
+    get_recent_topics,
     get_recently_created_streams,
     get_user_stream_map,
 )
@@ -65,7 +66,10 @@ class TestDigestEmailMessages(ZulipTestCase):
         # This code is run when we call `confirmation.models.create_confirmation_link`.
         # To trigger this, we call the one_click_unsubscribe_link function below.
         one_click_unsubscribe_link(othello, "digest")
-        with self.assert_database_query_count(8):
+
+        # Clear the LRU cache on the stream topics
+        get_recent_topics.cache_clear()
+        with self.assert_database_query_count(10):
             bulk_handle_digest_email([othello.id], cutoff)
 
         self.assertEqual(mock_send_future_email.call_count, 1)
@@ -82,6 +86,30 @@ class TestDigestEmailMessages(ZulipTestCase):
self.assertIn("some content", teaser_messages[0]["content"][0]["plain"]) self.assertIn("some content", teaser_messages[0]["content"][0]["plain"])
self.assertIn(teaser_messages[0]["sender"], expected_participants) self.assertIn(teaser_messages[0]["sender"], expected_participants)
# If we run another batch, we reuse the topic queries; there
# are 3 reused streams and one new one, for a net of two fewer
# than before.
iago = self.example_user("iago")
with self.assert_database_query_count(8):
bulk_handle_digest_email([iago.id], cutoff)
self.assertEqual(get_recent_topics.cache_info().hits, 3)
self.assertEqual(get_recent_topics.cache_info().currsize, 4)
# Two users in the same batch, with only one new stream from
# the above.
cordelia = self.example_user("cordelia")
prospero = self.example_user("prospero")
with self.assert_database_query_count(9):
bulk_handle_digest_email([cordelia.id, prospero.id], cutoff)
self.assertEqual(get_recent_topics.cache_info().hits, 7)
self.assertEqual(get_recent_topics.cache_info().currsize, 5)
# If we use a different cutoff, it makes new cache entries.
with self.assert_database_query_count(12):
bulk_handle_digest_email([cordelia.id, prospero.id], cutoff + 1)
self.assertEqual(get_recent_topics.cache_info().hits, 8)
self.assertEqual(get_recent_topics.cache_info().currsize, 9)
def test_bulk_handle_digest_email_skips_deactivated_users(self) -> None: def test_bulk_handle_digest_email_skips_deactivated_users(self) -> None:
""" """
A user id may be added to the queue before the user is deactivated. In such a case, A user id may be added to the queue before the user is deactivated. In such a case,
@@ -141,7 +169,8 @@ class TestDigestEmailMessages(ZulipTestCase):
         # This code is run when we call `confirmation.models.create_confirmation_link`.
         # To trigger this, we call the one_click_unsubscribe_link function below.
         one_click_unsubscribe_link(polonius, "digest")
-        with self.assert_database_query_count(8):
+        get_recent_topics.cache_clear()
+        with self.assert_database_query_count(9):
             bulk_handle_digest_email([polonius.id], cutoff)
 
         self.assertEqual(mock_send_future_email.call_count, 1)
@@ -202,7 +231,8 @@ class TestDigestEmailMessages(ZulipTestCase):
         with mock.patch("zerver.lib.digest.send_future_email") as mock_send_future_email:
             digest_user_ids = [user.id for user in digest_users]
 
-            with self.assert_database_query_count(11):
+            get_recent_topics.cache_clear()
+            with self.assert_database_query_count(14):
                 with self.assert_memcached_count(0):
                     bulk_handle_digest_email(digest_user_ids, cutoff)