tornado: Limit the width of the user queries, when they're needed.

Tornado requests try hard to not make SQL queries -- and if they're necessary, to minimize the number of them. Specifically, both session objects and user objects are cached in memcached, and we expect that both of them will have been filled there by Django before any requests are made to Tornado. In the event that memcached is flushed, or data is otherwise evicted, we perform two database queries -- one for the session, and one for the user. However, the *width* of the latter query has grown significantly over time, as the Realm object grew more fields, and recently with the addition of role groups, which require multiple joins each. This leads to a query which is over 12k of text long, and results in 319 columns. In the event of a memcached flush, this can result in a *significant* amount of SQL traffic, as nearly every active Tornado request will make that query. We do not wish to narrow the default query for Django; we instead tag the request in the REST wrapper, and use that to use a much narrower user cache entry. That narrower cache entry is filled before the queue is created in Django; we also use it to explicitly set the log data, so the second "half" of the continued Tornado request does not need to fetch any user data either when writing its log line. Because they use different cache keys, this only affects the session-based `/json/events` endpoint, which caches by user-id; the `/api/v1/events` endpoint, which uses an API-key cache, keeps its wide user object. The former is 50% of the total request volume, whereas the latter is only 2%, so adding an additional cache for it is unnecessary complexity.
2025-10-23 04:52:12 +00:00 · 2025-02-10 20:09:09 +00:00
parent 7c20f1d3ea
commit 58bf2a7935
8 changed files with 164 additions and 21 deletions
--- a/zerver/lib/cache.py
+++ b/zerver/lib/cache.py
@@ -429,6 +429,10 @@ def user_profile_by_id_cache_key(user_profile_id: int) -> str:
    return f"user_profile_by_id:{user_profile_id}"


+def user_profile_narrow_by_id_cache_key(user_profile_id: int) -> str:
+    return f"user_profile_narrow_by_id:{user_profile_id}"
+
+
 def user_profile_by_api_key_cache_key(api_key: str) -> str:
    return f"user_profile_by_api_key:{api_key}"

@@ -513,6 +517,7 @@ def delete_user_profile_caches(user_profiles: Iterable["UserProfile"], realm_id:
    keys = []
    for user_profile in user_profiles:
        keys.append(user_profile_by_id_cache_key(user_profile.id))
+        keys.append(user_profile_narrow_by_id_cache_key(user_profile.id))
        keys += map(user_profile_by_api_key_cache_key, get_all_api_keys(user_profile))
        keys.append(user_profile_cache_key_id(user_profile.email, realm_id))
        keys.append(user_profile_delivery_email_cache_key(user_profile.delivery_email, realm_id))
--- a/zerver/lib/rest.py
+++ b/zerver/lib/rest.py
@@ -19,6 +19,7 @@ from zerver.decorator import (
 from zerver.lib.exceptions import MissingAuthenticationError
 from zerver.lib.request import RequestNotes
 from zerver.lib.response import json_method_not_allowed
+from zerver.lib.sessions import narrow_request_user

 ParamT = ParamSpec("ParamT")
 METHODS = ("GET", "HEAD", "POST", "PUT", "DELETE", "PATCH")
@@ -152,6 +153,11 @@ def rest_dispatch(request: HttpRequest, /, **kwargs: object) -> HttpResponse:
    # Security implications of this portion of the code are minimal,
    # as we should worst-case fail closed if we miscategorize a request.

+    def check_is_authenticated(request: HttpRequest) -> bool:
+        if "narrow_user_session_cache" not in view_flags:
+            return request.user.is_authenticated
+        return narrow_request_user(request).is_authenticated
+
    # for some special views (e.g. serving a file that has been
    # uploaded), we support using the same URL for web and API clients.
    if "override_api_url_scheme" in view_flags and "Authorization" in request.headers:
@@ -168,7 +174,7 @@ def rest_dispatch(request: HttpRequest, /, **kwargs: object) -> HttpResponse:
        # React Native.  See last block for rate limiting notes.
        target_function = authenticated_uploads_api_view(skip_rate_limiting=True)(target_function)
    # /json views (web client) validate with a session token (cookie)
-    elif not request.path.startswith("/api") and request.user.is_authenticated:
+    elif not request.path.startswith("/api") and check_is_authenticated(request):
        # Authenticated via sessions framework, only CSRF check needed
        auth_kwargs = {}
        if "override_api_url_scheme" in view_flags:
--- a/zerver/lib/sessions.py
+++ b/zerver/lib/sessions.py
@@ -6,13 +6,17 @@ from typing import Any, Protocol, cast

 from django.conf import settings
 from django.contrib.auth import SESSION_KEY, get_user_model
+from django.contrib.auth.models import AnonymousUser
 from django.contrib.sessions.backends.base import SessionBase
 from django.contrib.sessions.models import Session
+from django.http import HttpRequest
+from django.utils.functional import LazyObject
 from django.utils.timezone import now as timezone_now

+from zerver.lib.request import RequestNotes
 from zerver.lib.timestamp import datetime_to_timestamp, timestamp_to_datetime
 from zerver.models import Realm, UserProfile
-from zerver.models.users import get_user_profile_by_id
+from zerver.models.users import get_user_profile_by_id, get_user_profile_narrow_by_id


 class SessionEngine(Protocol):
@@ -99,3 +103,33 @@ def get_expirable_session_var(
    if delete:
        del session[var_name]
    return value
+
+
+def narrow_request_user(
+    request: HttpRequest, *, user_id: int | None = None
+) -> UserProfile | AnonymousUser:
+    # In Tornado and other performance-critical paths, we want to not
+    # load the extremely wide default UserProfile select_related.  We
+    # respect the request.user if it has been explicitly set already,
+    # and otherwise perform a cached lookup of a much narrower view of
+    # the UserProfile; this is faster than the normal UserProfile both
+    # for cache misses (1.8ms vs 15ms) and cache hits (147μs vs
+    # 387μs).  We fill the requester_for_logs to skip a session and
+    # user memcached fetch when writing the log lines.
+    if not isinstance(request.user, LazyObject):
+        return request.user  # nocoverage
+
+    if user_id is None:
+        user_id = get_session_dict_user(request.session)
+    if user_id is None:
+        return AnonymousUser()  # nocoverage
+
+    try:
+        request.user = get_user_profile_narrow_by_id(user_id)
+        RequestNotes.get_notes(
+            request
+        ).requester_for_logs = request.user.format_requester_for_logs()
+    except UserProfile.DoesNotExist:  # nocoverage
+        request.user = AnonymousUser()
+
+    return request.user
--- a/zerver/models/users.py
+++ b/zerver/models/users.py
@@ -25,6 +25,7 @@ from zerver.lib.cache import (
    user_profile_by_api_key_cache_key,
    user_profile_by_id_cache_key,
    user_profile_cache_key,
+    user_profile_narrow_by_id_cache_key,
 )
 from zerver.lib.types import ProfileData, RawUserDict
 from zerver.lib.utils import generate_api_key
@@ -955,6 +956,24 @@ def get_user_profile_by_id(user_profile_id: int) -> UserProfile:
    return base_get_user_queryset().get(id=user_profile_id)


+@cache_with_key(user_profile_narrow_by_id_cache_key, timeout=3600 * 24 * 7)
+def get_user_profile_narrow_by_id(user_profile_id: int) -> UserProfile:
+    return (
+        UserProfile.objects.select_related("realm")
+        .only(
+            "id",
+            "bot_type",
+            "is_active",
+            "rate_limits",
+            "role",
+            "recipient_id",
+            "realm__string_id",
+            "realm__deactivated",
+        )
+        .get(id=user_profile_id)
+    )
+
+
 def get_user_profile_by_email(email: str) -> UserProfile:
    """This function is intended to be used for
    manual manage.py shell work; robust code must use get_user or
--- a/zerver/tests/test_tornado.py
+++ b/zerver/tests/test_tornado.py
@@ -1,6 +1,7 @@
 import asyncio
 import socket
-from collections.abc import Awaitable, Callable
+from collections.abc import Awaitable, Callable, Iterator
+from contextlib import contextmanager
 from functools import wraps
 from typing import Any, TypeVar
 from unittest import TestResult, mock
@@ -17,7 +18,10 @@ from tornado.httpclient import AsyncHTTPClient, HTTPResponse
 from tornado.httpserver import HTTPServer
 from typing_extensions import ParamSpec, override

+from zerver.lib.cache import user_profile_narrow_by_id_cache_key
 from zerver.lib.test_classes import ZulipTestCase
+from zerver.lib.test_helpers import cache_tries_captured, queries_captured
+from zerver.models import UserProfile
 from zerver.tornado import event_queue
 from zerver.tornado.application import create_tornado_application
 from zerver.tornado.event_queue import process_event
@@ -111,6 +115,20 @@ class EventsTestCase(TornadoWebTestCase):
        queue_id = await self.create_queue()
        self.assertIn(queue_id, event_queue.clients)

+    @contextmanager
+    def mocked_events(self, user_profile: UserProfile, event: dict[str, object]) -> Iterator[None]:
+        def process_events() -> None:
+            users = [user_profile.id]
+            process_event(event, users)
+
+        def wrapped_fetch_events(**query: Any) -> dict[str, Any]:
+            ret = event_queue.fetch_events(**query)
+            asyncio.get_running_loop().call_soon(process_events)
+            return ret
+
+        with mock.patch("zerver.tornado.views.fetch_events", side_effect=wrapped_fetch_events):
+            yield
+
    @async_to_sync_decorator
    async def test_events_async(self) -> None:
        user_profile = await in_django_thread(lambda: self.example_user("hamlet"))
@@ -123,20 +141,7 @@ class EventsTestCase(TornadoWebTestCase):

        path = f"/json/events?{urlencode(data)}"

-        def process_events() -> None:
-            users = [user_profile.id]
-            event = dict(
-                type="test",
-                data="test data",
-            )
-            process_event(event, users)
-
-        def wrapped_fetch_events(**query: Any) -> dict[str, Any]:
-            ret = event_queue.fetch_events(**query)
-            asyncio.get_running_loop().call_soon(process_events)
-            return ret
-
-        with mock.patch("zerver.tornado.views.fetch_events", side_effect=wrapped_fetch_events):
+        with self.mocked_events(user_profile, {"type": "test", "data": "test data"}):
            response = await self.fetch_async("GET", path)

        self.assertEqual(response.headers["Vary"], "Accept-Language, Cookie")
@@ -148,3 +153,61 @@ class EventsTestCase(TornadoWebTestCase):
            ],
        )
        self.assertEqual(data["result"], "success")
+
+    @async_to_sync_decorator
+    async def test_events_caching(self) -> None:
+        user_profile = await in_django_thread(lambda: self.example_user("hamlet"))
+        await in_django_thread(lambda: self.login_user(user_profile))
+        event_queue_id = await self.create_queue()
+        data = {
+            "queue_id": event_queue_id,
+            "last_event_id": -1,
+        }
+
+        path = f"/json/events?{urlencode(data)}"
+
+        with (
+            self.mocked_events(user_profile, {"type": "test", "data": "test data"}),
+            cache_tries_captured() as cache_gets,
+            queries_captured() as queries,
+        ):
+            await self.fetch_async("GET", path)
+
+            # Two cache fetches -- for the user and the client.  In
+            # production, the session would also be a cache access,
+            # but tests don't use cached sessions.
+            self.assert_length(cache_gets, 2)
+            self.assertEqual(
+                cache_gets[0], ("get", user_profile_narrow_by_id_cache_key(user_profile.id), None)
+            )
+            self.assertEqual(cache_gets[1][0], "get")
+            assert isinstance(cache_gets[1][1], str)
+            self.assertTrue(cache_gets[1][1].startswith("get_client:"))
+
+            # Three database queries -- session, user, and client.
+            # The user query should remain small; it is currently 470
+            # bytes, but anything under 1k should be Fine.
+            self.assert_length(queries, 3)
+            self.assertIn("django_session", queries[0].sql)
+            self.assertIn("zerver_userprofile", queries[1].sql)
+            self.assertLessEqual(len(queries[1].sql), 1024)
+            self.assertIn("zerver_client", queries[2].sql)
+
+        # Perform the same request again, preserving the caches.  We
+        # should only see one database query -- the session.  As noted
+        # above, in production even that would be cached.
+        with (
+            self.mocked_events(user_profile, {"type": "test", "data": "test data"}),
+            cache_tries_captured() as cache_gets,
+            queries_captured(keep_cache_warm=True) as queries,
+        ):
+            await self.fetch_async("GET", path)
+            self.assert_length(cache_gets, 1)
+            self.assertEqual(
+                cache_gets[0], ("get", user_profile_narrow_by_id_cache_key(user_profile.id), None)
+            )
+            # Client is cached in-process-memory, so doesn't even see
+            # a memcached hit
+
+            self.assert_length(queries, 1)
+            self.assertIn("django_session", queries[0].sql)
--- a/zerver/tornado/django_api.py
+++ b/zerver/tornado/django_api.py
@@ -16,6 +16,7 @@ from urllib3.util import Retry
 from zerver.lib.partial import partial
 from zerver.lib.queue import queue_json_publish_rollback_unsafe
 from zerver.models import Client, Realm, UserProfile
+from zerver.models.users import get_user_profile_narrow_by_id
 from zerver.tornado.sharding import (
    get_realm_tornado_ports,
    get_tornado_url,
@@ -97,6 +98,11 @@ def request_event_queue(
    if not settings.USING_TORNADO:
        return None

+    # We make sure to pre-fill the narrow user cache, to save
+    # session-based Tornado (/json/events) from having to go to the
+    # database.
+    get_user_profile_narrow_by_id(user_profile.id)
+
    tornado_url = get_tornado_url(get_user_tornado_port(user_profile))
    req = {
        "dont_block": "true",
@@ -134,6 +140,12 @@ def get_user_events(
    if not settings.USING_TORNADO:
        return []

+    # Pre-fill the narrow user cache, to save session-based Tornado
+    # (/json/events) from having to go to the database.  This is
+    # almost certainly filled already, from above, but there is little
+    # harm in forcing it.
+    get_user_profile_narrow_by_id(user_profile.id)
+
    tornado_url = get_tornado_url(get_user_tornado_port(user_profile))
    post_data: dict[str, Any] = {
        "queue_id": queue_id,
--- a/zerver/tornado/views.py
+++ b/zerver/tornado/views.py
@@ -14,10 +14,10 @@ from zerver.lib.exceptions import JsonableError
 from zerver.lib.queue import get_queue_client
 from zerver.lib.request import RequestNotes
 from zerver.lib.response import AsynchronousResponse, json_success
+from zerver.lib.sessions import narrow_request_user
 from zerver.lib.typed_endpoint import ApiParamConfig, DocumentationStatus, typed_endpoint
 from zerver.models import UserProfile
 from zerver.models.clients import get_client
-from zerver.models.users import get_user_profile_by_id
 from zerver.tornado.descriptors import is_current_port
 from zerver.tornado.event_queue import (
    access_client_descriptor,
@@ -100,8 +100,8 @@ def cleanup_event_queue(
@internal_api_view(True)
@typed_endpoint
 def get_events_internal(request: HttpRequest, *, user_profile_id: Json[int]) -> HttpResponse:
-    user_profile = get_user_profile_by_id(user_profile_id)
-    RequestNotes.get_notes(request).requester_for_logs = user_profile.format_requester_for_logs()
+    user_profile = narrow_request_user(request, user_id=user_profile_id)
+    assert isinstance(user_profile, UserProfile)
    assert is_current_port(get_user_tornado_port(user_profile))

    process_client(request, user_profile, client_name="internal")
--- a/zproject/urls.py
+++ b/zproject/urls.py
@@ -523,7 +523,11 @@ v1_api_and_json_patterns = [
    # used to register for an event queue in tornado
    rest_path("register", POST=(events_register_backend, {"allow_anonymous_user_web"})),
    # events -> zerver.tornado.views
-    rest_path("events", GET=get_events, DELETE=cleanup_event_queue),
+    rest_path(
+        "events",
+        GET=(get_events, {"narrow_user_session_cache"}),
+        DELETE=(cleanup_event_queue, {"narrow_user_session_cache"}),
+    ),
    # Used to generate a Zoom video call URL
    rest_path("calls/zoom/create", POST=make_zoom_video_call),
    # Used to generate a BigBlueButton video call URL