From b4470ac8e10aba76b5ac64e4e546269982df0fff Mon Sep 17 00:00:00 2001 From: Steve Howell Date: Tue, 11 May 2021 11:55:49 +0000 Subject: [PATCH] performance: Add get_subscriptions_for_send_message. This new function optimizes how we fetch subscriptions for streams. Basically, it excludes most long-term-idle users from the query. With 8k users, of which all but 400 are long term idle, this speeds up get_recipient_info from about 150ms to 50ms. Overall this change appears to save a factor of 2-3 in the backend processing time for sending or editing a message in large, public streams in chat.zulip.org (at 18K users today). --- zerver/lib/actions.py | 16 ++++- zerver/lib/stream_subscription.py | 65 ++++++++++++++++++- zerver/lib/stream_topic.py | 8 --- zerver/tests/test_soft_deactivation.py | 89 +++++++++++++++++++++++--- zerver/tests/test_users.py | 12 ++++ 5 files changed, 169 insertions(+), 21 deletions(-) diff --git a/zerver/lib/actions.py b/zerver/lib/actions.py index fdd853b0e9..c8a2e87d0d 100644 --- a/zerver/lib/actions.py +++ b/zerver/lib/actions.py @@ -133,6 +133,7 @@ from zerver.lib.stream_subscription import ( get_stream_subscriptions_for_user, get_stream_subscriptions_for_users, get_subscribed_stream_ids_for_user, + get_subscriptions_for_send_message, get_user_ids_for_streams, num_subscribers_for_stream_id, subscriber_ids_with_stream_history_access, @@ -1422,6 +1423,8 @@ class RecipientInfoResult(TypedDict): def get_recipient_info( + *, + realm_id: int, recipient: Recipient, sender_id: int, stream_topic: Optional[StreamTopicTarget], @@ -1446,7 +1449,12 @@ def get_recipient_info( user_ids_muting_topic = stream_topic.user_ids_muting_topic() subscription_rows = ( - stream_topic.get_active_subscriptions() + get_subscriptions_for_send_message( + realm_id=realm_id, + stream_id=stream_topic.stream_id, + possible_wildcard_mention=possible_wildcard_mention, + possibly_mentioned_user_ids=possibly_mentioned_user_ids, + ) .annotate( user_profile_email_notifications=F( "user_profile__enable_stream_email_notifications" @@ -1732,6 +1740,7 @@ def build_message_send_dict( stream_topic = None info = get_recipient_info( + realm_id=realm.id, recipient=message_dict["message"].recipient, sender_id=message_dict["message"].sender_id, stream_topic=stream_topic, @@ -5691,10 +5700,12 @@ def do_update_message( changed_messages = [target_message] + realm = user_profile.realm + stream_being_edited = None if target_message.is_stream_message(): stream_id = target_message.recipient.type_id - stream_being_edited = get_stream_by_id_in_realm(stream_id, user_profile.realm) + stream_being_edited = get_stream_by_id_in_realm(stream_id, realm) event["stream_name"] = stream_being_edited.name ums = UserMessage.objects.filter(message=target_message.id) @@ -5753,6 +5764,7 @@ def do_update_message( stream_topic = None info = get_recipient_info( + realm_id=realm.id, recipient=target_message.recipient, sender_id=target_message.sender_id, stream_topic=stream_topic, diff --git a/zerver/lib/stream_subscription.py b/zerver/lib/stream_subscription.py index 7508a2b0fd..9d2b142d54 100644 --- a/zerver/lib/stream_subscription.py +++ b/zerver/lib/stream_subscription.py @@ -2,11 +2,11 @@ import itertools from collections import defaultdict from dataclasses import dataclass from operator import itemgetter -from typing import Any, Dict, List, Optional, Set +from typing import AbstractSet, Any, Dict, List, Optional, Set -from django.db.models.query import QuerySet +from django.db.models import Q, QuerySet -from zerver.models import Realm, Recipient, Stream, Subscription, UserProfile +from zerver.models import AlertWord, Realm, Recipient, Stream, Subscription, UserProfile @dataclass @@ -274,3 +274,62 @@ def subscriber_ids_with_stream_history_access(stream: Stream) -> Set[int]: stream.id, include_deactivated_users=False ).values_list("user_profile_id", flat=True) ) + + +def get_subscriptions_for_send_message( + *, + realm_id: int, + stream_id: int, + possible_wildcard_mention: bool, + possibly_mentioned_user_ids: AbstractSet[int], +) -> QuerySet: + """This function optimizes an important use case for large + streams. Open realms often have many long_term_idle users, which + can result in 10,000s of long_term_idle recipients in default + streams. do_send_messages has an optimization to avoid doing work + for long_term_idle unless message flags or notifications should be + generated. + + However, it's expensive even to fetch and process them all in + Python at all. This function returns all recipients of a stream + message that could possibly require action in the send-message + codepath. + + Basically, it returns all subscribers, excluding all long-term + idle users who it can prove will not receive a UserMessage row or + notification for the message (i.e. no alert words, mentions, or + email/push notifications are configured) and thus are not needed + for processing the message send. + + Critically, this function is called before the Markdown + processor. As a result, it returns all subscribers who have ANY + configured alert words, even if their alert words aren't present + in the message. Similarly, it returns all subscribers who match + the "possible mention" parameters. + + Downstream logic, which runs after the Markdown processor has + parsed the message, will do the precise determination. + """ + + query = get_active_subscriptions_for_stream_id( + stream_id, + include_deactivated_users=False, + ) + + if possible_wildcard_mention: + return query + + query = query.filter( + Q(user_profile__long_term_idle=False) + | Q(push_notifications=True) + | (Q(push_notifications=None) & Q(user_profile__enable_stream_push_notifications=True)) + | Q(email_notifications=True) + | (Q(email_notifications=None) & Q(user_profile__enable_stream_email_notifications=True)) + | Q(user_profile_id__in=possibly_mentioned_user_ids) + | Q( + user_profile_id__in=AlertWord.objects.filter(realm_id=realm_id).values_list( + "user_profile_id" + ) + ) + ) + return query diff --git a/zerver/lib/stream_topic.py b/zerver/lib/stream_topic.py index e94e6a0fcb..71b5418c54 100644 --- a/zerver/lib/stream_topic.py +++ b/zerver/lib/stream_topic.py @@ -1,8 +1,5 @@ from typing import Set -from django.db.models.query import QuerySet - -from zerver.lib.stream_subscription import get_active_subscriptions_for_stream_id from zerver.models import MutedTopic @@ -26,8 +23,3 @@ class StreamTopicTarget: "user_profile_id", ) return {row["user_profile_id"] for row in query} - - def get_active_subscriptions(self) -> QuerySet: - return get_active_subscriptions_for_stream_id( - self.stream_id, include_deactivated_users=True - ) diff --git a/zerver/tests/test_soft_deactivation.py b/zerver/tests/test_soft_deactivation.py index 6b6a978830..b721aafb91 100644 --- a/zerver/tests/test_soft_deactivation.py +++ b/zerver/tests/test_soft_deactivation.py @@ -1,3 +1,4 @@ +from typing import AbstractSet from unittest import mock from django.utils.timezone import now as timezone_now @@ -14,6 +15,7 @@ from zerver.lib.soft_deactivation import ( get_users_for_soft_deactivation, reactivate_user_if_soft_deactivated, ) +from zerver.lib.stream_subscription import get_subscriptions_for_send_message from zerver.lib.test_classes import ZulipTestCase from zerver.lib.test_helpers import ( get_subscription, @@ -22,6 +24,7 @@ from zerver.lib.test_helpers import ( queries_captured, ) from zerver.models import ( + AlertWord, Client, Message, RealmAuditLog, @@ -580,16 +583,21 @@ class SoftDeactivationMessageTest(ZulipTestCase): # In this test we are basically testing out the logic used out in # do_send_messages() in action.py for filtering the messages for which # UserMessage rows should be created for a soft-deactivated user. + AlertWord.objects.all().delete() + long_term_idle_user = self.example_user("hamlet") cordelia = self.example_user("cordelia") sender = self.example_user("iago") - stream_name = "Denmark" + stream_name = "Brand New Stream" topic_name = "foo" + realm_id = cordelia.realm_id self.subscribe(long_term_idle_user, stream_name) self.subscribe(cordelia, stream_name) self.subscribe(sender, stream_name) + stream_id = get_stream(stream_name, cordelia.realm).id + def send_stream_message(content: str) -> None: self.send_stream_message(sender, stream_name, content, topic_name) @@ -620,7 +628,35 @@ class SoftDeactivationMessageTest(ZulipTestCase): else: self.assertEqual(user_messages[-1].content, content) - def assert_stream_message_sent_to_idle_user(content: str) -> None: + def assert_num_possible_users( + expected_count: int, + *, + possible_wildcard_mention: bool = False, + possibly_mentioned_user_ids: AbstractSet[int] = set(), + ) -> None: + self.assertEqual( + len( + get_subscriptions_for_send_message( + realm_id=realm_id, + stream_id=stream_id, + possible_wildcard_mention=possible_wildcard_mention, + possibly_mentioned_user_ids=possibly_mentioned_user_ids, + ) + ), + expected_count, + ) + + def assert_stream_message_sent_to_idle_user( + content: str, + *, + possible_wildcard_mention: bool = False, + possibly_mentioned_user_ids: AbstractSet[int] = set(), + ) -> None: + assert_num_possible_users( + expected_count=3, + possible_wildcard_mention=possible_wildcard_mention, + possibly_mentioned_user_ids=possibly_mentioned_user_ids, + ) general_user_msg_count = len(get_user_messages(cordelia)) soft_deactivated_user_msg_count = len(get_user_messages(long_term_idle_user)) send_stream_message(content) @@ -629,7 +665,22 @@ class SoftDeactivationMessageTest(ZulipTestCase): assert_last_um_content(long_term_idle_user, content) assert_last_um_content(cordelia, content) - def assert_stream_message_not_sent_to_idle_user(content: str) -> None: + def assert_stream_message_not_sent_to_idle_user( + content: str, + *, + possibly_mentioned_user_ids: AbstractSet[int] = set(), + false_alarm_row: bool = False, + ) -> None: + if false_alarm_row: + # We will query for our idle user if he has **ANY** alert + # words, but we won't actually write a UserMessage row until + # we truly parse the message. We also get false alarms for + # messages with quoted mentions. + assert_num_possible_users( + 3, possibly_mentioned_user_ids=possibly_mentioned_user_ids + ) + else: + assert_num_possible_users(2) general_user_msg_count = len(get_user_messages(cordelia)) soft_deactivated_user_msg_count = len(get_user_messages(long_term_idle_user)) send_stream_message(content) @@ -696,7 +747,16 @@ class SoftDeactivationMessageTest(ZulipTestCase): # Test UserMessage row is created while user is deactivated if # user itself is mentioned. - assert_stream_message_sent_to_idle_user("Test @**King Hamlet** mention") + assert_stream_message_sent_to_idle_user( + "Test @**King Hamlet** mention", + possibly_mentioned_user_ids={long_term_idle_user.id}, + ) + + assert_stream_message_not_sent_to_idle_user( + "Test `@**King Hamlet**` mention", + possibly_mentioned_user_ids={long_term_idle_user.id}, + false_alarm_row=True, + ) # Test UserMessage row is not created while user is deactivated if # anyone is mentioned but the user. @@ -704,9 +764,15 @@ class SoftDeactivationMessageTest(ZulipTestCase): # Test UserMessage row is created while user is deactivated if # there is a wildcard mention such as @all or @everyone - assert_stream_message_sent_to_idle_user("Test @**all** mention") - assert_stream_message_sent_to_idle_user("Test @**everyone** mention") - assert_stream_message_sent_to_idle_user("Test @**stream** mention") + assert_stream_message_sent_to_idle_user( + "Test @**all** mention", possible_wildcard_mention=True + ) + assert_stream_message_sent_to_idle_user( + "Test @**everyone** mention", possible_wildcard_mention=True + ) + assert_stream_message_sent_to_idle_user( + "Test @**stream** mention", possible_wildcard_mention=True + ) assert_stream_message_not_sent_to_idle_user("Test @**bogus** mention") # Test UserMessage row is created while user is deactivated if there @@ -714,6 +780,13 @@ class SoftDeactivationMessageTest(ZulipTestCase): do_add_alert_words(long_term_idle_user, ["test_alert_word"]) assert_stream_message_sent_to_idle_user("Testing test_alert_word") + do_add_alert_words(cordelia, ["cordelia"]) + assert_stream_message_not_sent_to_idle_user("cordelia", false_alarm_row=True) + # Test UserMessage row is not created while user is deactivated if # message is a me message. - assert_stream_message_not_sent_to_idle_user("/me says test") + assert_stream_message_not_sent_to_idle_user("/me says test", false_alarm_row=True) + + # Sanity check after removing the alert word for Hamlet. + AlertWord.objects.filter(user_profile=long_term_idle_user).delete() + assert_stream_message_not_sent_to_idle_user("no alert words") diff --git a/zerver/tests/test_users.py b/zerver/tests/test_users.py index f76469eff7..9973824aaa 100644 --- a/zerver/tests/test_users.py +++ b/zerver/tests/test_users.py @@ -1510,6 +1510,7 @@ class RecipientInfoTest(ZulipTestCase): ) info = get_recipient_info( + realm_id=realm.id, recipient=recipient, sender_id=hamlet.id, stream_topic=stream_topic, @@ -1537,6 +1538,7 @@ class RecipientInfoTest(ZulipTestCase): hamlet.enable_stream_push_notifications = True hamlet.save() info = get_recipient_info( + realm_id=realm.id, recipient=recipient, sender_id=hamlet.id, stream_topic=stream_topic, @@ -1546,6 +1548,7 @@ class RecipientInfoTest(ZulipTestCase): self.assertEqual(info["wildcard_mention_user_ids"], set()) info = get_recipient_info( + realm_id=realm.id, recipient=recipient, sender_id=hamlet.id, stream_topic=stream_topic, @@ -1557,6 +1560,7 @@ class RecipientInfoTest(ZulipTestCase): sub.push_notifications = False sub.save() info = get_recipient_info( + realm_id=realm.id, recipient=recipient, sender_id=hamlet.id, stream_topic=stream_topic, @@ -1569,6 +1573,7 @@ class RecipientInfoTest(ZulipTestCase): sub.push_notifications = True sub.save() info = get_recipient_info( + realm_id=realm.id, recipient=recipient, sender_id=hamlet.id, stream_topic=stream_topic, @@ -1584,6 +1589,7 @@ class RecipientInfoTest(ZulipTestCase): ) info = get_recipient_info( + realm_id=realm.id, recipient=recipient, sender_id=hamlet.id, stream_topic=stream_topic, @@ -1593,6 +1599,7 @@ class RecipientInfoTest(ZulipTestCase): self.assertEqual(info["wildcard_mention_user_ids"], set()) info = get_recipient_info( + realm_id=realm.id, recipient=recipient, sender_id=hamlet.id, stream_topic=stream_topic, @@ -1608,6 +1615,7 @@ class RecipientInfoTest(ZulipTestCase): sub.save() info = get_recipient_info( + realm_id=realm.id, recipient=recipient, sender_id=hamlet.id, stream_topic=stream_topic, @@ -1623,6 +1631,7 @@ class RecipientInfoTest(ZulipTestCase): sub.save() info = get_recipient_info( + realm_id=realm.id, recipient=recipient, sender_id=hamlet.id, stream_topic=stream_topic, @@ -1642,6 +1651,7 @@ class RecipientInfoTest(ZulipTestCase): ) info = get_recipient_info( + realm_id=realm.id, recipient=recipient, sender_id=hamlet.id, stream_topic=stream_topic, @@ -1665,6 +1675,7 @@ class RecipientInfoTest(ZulipTestCase): ) info = get_recipient_info( + realm_id=realm.id, recipient=recipient, sender_id=hamlet.id, stream_topic=stream_topic, @@ -1686,6 +1697,7 @@ class RecipientInfoTest(ZulipTestCase): with self.assertRaisesRegex(ValueError, "Bad recipient type"): invalid_recipient = Recipient(type=999) # 999 is not a valid type get_recipient_info( + realm_id=realm.id, recipient=invalid_recipient, sender_id=hamlet.id, stream_topic=stream_topic,