performance: Extract subscribers/peers in bulk.

We replace get_peer_user_ids_for_stream_change
with two bulk functions to get peers and/or
subscribers.

Note that we have three codepaths that care about
peers:

    subscribing existing users:
        we need to tell peers about new subscribers
        we need to tell the subscribed user about old subscribers

    unsubscribing existing users:
        we only need to tell peers who unsubscribed

    subscribing new user:
        we only need to tell peers about the new user
        (right now we generate send_event
        calls to tell the new user about existing
        subscribers, but this is a waste
        of effort that we will fix soon)

The two bulk functions are these:

    bulk_get_subscriber_peer_info
    bulk_get_peers

They have some overlap in the implementation,
but there are some nuanced differences that are
described in the comments.

Looking up peers/subscribers in bulk leads to some
nice optimizations.

We will save some memcached traffic if you are
subscribing to multiple public streams.

We will save a query in the remove-subscriber
case if you are only dealing with private streams.
This commit is contained in:
Steve Howell
2020-10-13 10:53:23 +00:00
committed by Tim Abbott
parent 94e41c71f9
commit b4346d0276
3 changed files with 196 additions and 103 deletions

View File

@@ -1,10 +1,26 @@
import itertools
from collections import defaultdict
from dataclasses import dataclass
from operator import itemgetter
from typing import Any, Dict, List, Optional, Set, Tuple
from django.db.models.query import QuerySet
from zerver.models import Recipient, Stream, Subscription, UserProfile
from zerver.models import (
Realm,
Recipient,
Stream,
Subscription,
UserProfile,
active_non_guest_user_ids,
)
@dataclass
class SubscriberPeerInfo:
    """
    Per-stream subscriber and peer user ids, as built by
    bulk_get_subscriber_peer_info.

    Both fields are dicts keyed by stream id whose values are sets of
    user ids.
    """

    # stream id -> ids of the users subscribed to that stream
    subscribed_ids: Dict[int, Set[int]]
    # stream id -> ids of the users who need to hear about subscriber changes
    peer_ids: Dict[int, Set[int]]
def get_active_subscriptions_for_stream_id(stream_id: int) -> QuerySet:
# TODO: Change return type to QuerySet[Subscription]
return Subscription.objects.filter(
@@ -73,6 +89,98 @@ def num_subscribers_for_stream_id(stream_id: int) -> int:
user_profile__is_active=True,
).count()
def get_user_ids_for_streams(stream_ids: Set[int]) -> Dict[int, Set[int]]:
    """
    Map each stream id to the set of ids of its subscribed users.

    Rows come from get_active_subscriptions_for_stream_ids, further
    restricted to user profiles with is_active=True.  A stream with no
    matching rows gets no key in the result.

    The returned mapping is a defaultdict(set), so callers that must
    not insert keys should read it with .get().
    """
    subscription_rows = (
        get_active_subscriptions_for_stream_ids(stream_ids)
        .filter(user_profile__is_active=True)
        .values('recipient__type_id', 'user_profile_id')
        .order_by('recipient__type_id')
    )

    # 'recipient__type_id' is used as the stream id for each row.
    user_ids_by_stream: Dict[int, Set[int]] = defaultdict(set)
    for row in subscription_rows:
        user_ids_by_stream[row['recipient__type_id']].add(row['user_profile_id'])

    return user_ids_by_stream
def bulk_get_subscriber_peer_info(
    realm: Realm,
    streams: List[Stream],
) -> SubscriberPeerInfo:
    """
    Compute subscribers and peers for many streams at once.

    Glossary:

        subscribed_ids:
            The users who are actually subscribed to each stream;
            this is generally what we send to the person who is
            subscribing to the stream.

        peer_ids:
            The users who need to be told about a new subscriber.
            It's usually a superset of the subscribers.

    Both dicts in the returned SubscriberPeerInfo are keyed by stream id.
    """
    private_ids: Set[int] = set()
    public_ids: Set[int] = set()
    for stream in streams:
        if stream.invite_only:
            private_ids.add(stream.id)
        else:
            public_ids.add(stream.id)

    # One subscriber lookup covers every stream, private and public.
    subscribers_by_stream = get_user_ids_for_streams(private_ids | public_ids)

    subscribed_ids: Dict[int, Set[int]] = {}
    peer_ids: Dict[int, Set[int]] = {}

    if private_ids:
        # Private streams: peers are the subscribers plus whoever
        # realm.get_admin_users_and_bots() returns.
        admin_ids = {admin.id for admin in realm.get_admin_users_and_bots()}
        for stream_id in private_ids:
            subscribers = subscribers_by_stream.get(stream_id, set())
            subscribed_ids[stream_id] = subscribers
            peer_ids[stream_id] = subscribers | admin_ids

    if public_ids:
        # Public streams: peers are the realm's active non-guest users,
        # regardless of who is subscribed.
        non_guest_ids = active_non_guest_user_ids(realm.id)
        for stream_id in public_ids:
            subscribed_ids[stream_id] = subscribers_by_stream.get(stream_id, set())
            # Give every stream its own copy of the set.
            peer_ids[stream_id] = set(non_guest_ids)

    return SubscriberPeerInfo(subscribed_ids=subscribed_ids, peer_ids=peer_ids)
def bulk_get_peers(
    realm: Realm,
    streams: List[Stream],
) -> Dict[int, Set[int]]:
    """
    Return, for each stream id, the ids of the users who are peers.

    This is almost a subset of bulk_get_subscriber_peer_info, with the
    nuance that we don't have to query subscribers for public streams.
    (The other function tries to save a query hop.)
    """
    private_ids = {stream.id for stream in streams if stream.invite_only}
    public_ids = {stream.id for stream in streams if not stream.invite_only}

    peers_by_stream: Dict[int, Set[int]] = {}

    if private_ids:
        # Private streams: subscribers plus whoever
        # realm.get_admin_users_and_bots() returns.
        admin_ids = {admin.id for admin in realm.get_admin_users_and_bots()}
        subscribers_by_stream = get_user_ids_for_streams(private_ids)
        for stream_id in private_ids:
            subscribers = subscribers_by_stream.get(stream_id, set())
            peers_by_stream[stream_id] = subscribers | admin_ids

    if public_ids:
        # Public streams: the realm's active non-guest users; no
        # subscriber lookup is needed.
        non_guest_ids = active_non_guest_user_ids(realm.id)
        for stream_id in public_ids:
            # Give every stream its own copy of the set.
            peers_by_stream[stream_id] = set(non_guest_ids)

    return peers_by_stream
def handle_stream_notifications_compatibility(user_profile: Optional[UserProfile],
stream_dict: Dict[str, Any],