tornado: Limit the width of the user queries, when they're needed.

Tornado requests try hard to not make SQL queries -- and if they're
necessary, to minimize the number of them.  Specifically, both session
objects and user objects are cached in memcached, and we expect that
both of them will have been filled there by Django before any requests
are made to Tornado.

In the event that memcached is flushed, or data is otherwise evicted,
we perform two database queries -- one for the session, and one for
the user.  However, the *width* of the latter query has grown
significantly over time, as the Realm object grew more fields, and
recently with the addition of role groups, which require multiple
joins each.  This leads to a query which is over 12k of text long, and
results in 319 columns.  In the event of a memcached flush, this can
result in a *significant* amount of SQL traffic, as nearly every
active Tornado request will make that query.

We do not wish to narrow the default query for Django; we instead tag
the request in the REST wrapper, and use that to use a much narrower
user cache entry.  That narrower cache entry is filled before the
queue is created in Django; we also use it to explicitly set the log
data, so the second "half" of the continued Tornado request does not
need to fetch any user data either when writing its log line.

Because they use different cache keys, this only affects the
session-based `/json/events` endpoint, which caches by user-id; the
`/api/v1/events` endpoint, which uses an API-key cache, keeps its wide
user object.  The former is 50% of the total request volume, whereas
the latter is only 2%, so adding an additional cache for it is
unnecessary complexity.
This commit is contained in:
Alex Vandiver
2025-02-10 20:09:09 +00:00
committed by Tim Abbott
parent 7c20f1d3ea
commit 58bf2a7935
8 changed files with 164 additions and 21 deletions

View File

@@ -429,6 +429,10 @@ def user_profile_by_id_cache_key(user_profile_id: int) -> str:
return f"user_profile_by_id:{user_profile_id}"
def user_profile_narrow_by_id_cache_key(user_profile_id: int) -> str:
return f"user_profile_narrow_by_id:{user_profile_id}"
def user_profile_by_api_key_cache_key(api_key: str) -> str:
return f"user_profile_by_api_key:{api_key}"
@@ -513,6 +517,7 @@ def delete_user_profile_caches(user_profiles: Iterable["UserProfile"], realm_id:
keys = []
for user_profile in user_profiles:
keys.append(user_profile_by_id_cache_key(user_profile.id))
keys.append(user_profile_narrow_by_id_cache_key(user_profile.id))
keys += map(user_profile_by_api_key_cache_key, get_all_api_keys(user_profile))
keys.append(user_profile_cache_key_id(user_profile.email, realm_id))
keys.append(user_profile_delivery_email_cache_key(user_profile.delivery_email, realm_id))

View File

@@ -19,6 +19,7 @@ from zerver.decorator import (
from zerver.lib.exceptions import MissingAuthenticationError
from zerver.lib.request import RequestNotes
from zerver.lib.response import json_method_not_allowed
from zerver.lib.sessions import narrow_request_user
ParamT = ParamSpec("ParamT")
METHODS = ("GET", "HEAD", "POST", "PUT", "DELETE", "PATCH")
@@ -152,6 +153,11 @@ def rest_dispatch(request: HttpRequest, /, **kwargs: object) -> HttpResponse:
# Security implications of this portion of the code are minimal,
# as we should worst-case fail closed if we miscategorize a request.
def check_is_authenticated(request: HttpRequest) -> bool:
if "narrow_user_session_cache" not in view_flags:
return request.user.is_authenticated
return narrow_request_user(request).is_authenticated
# for some special views (e.g. serving a file that has been
# uploaded), we support using the same URL for web and API clients.
if "override_api_url_scheme" in view_flags and "Authorization" in request.headers:
@@ -168,7 +174,7 @@ def rest_dispatch(request: HttpRequest, /, **kwargs: object) -> HttpResponse:
# React Native. See last block for rate limiting notes.
target_function = authenticated_uploads_api_view(skip_rate_limiting=True)(target_function)
# /json views (web client) validate with a session token (cookie)
elif not request.path.startswith("/api") and request.user.is_authenticated:
elif not request.path.startswith("/api") and check_is_authenticated(request):
# Authenticated via sessions framework, only CSRF check needed
auth_kwargs = {}
if "override_api_url_scheme" in view_flags:

View File

@@ -6,13 +6,17 @@ from typing import Any, Protocol, cast
from django.conf import settings
from django.contrib.auth import SESSION_KEY, get_user_model
from django.contrib.auth.models import AnonymousUser
from django.contrib.sessions.backends.base import SessionBase
from django.contrib.sessions.models import Session
from django.http import HttpRequest
from django.utils.functional import LazyObject
from django.utils.timezone import now as timezone_now
from zerver.lib.request import RequestNotes
from zerver.lib.timestamp import datetime_to_timestamp, timestamp_to_datetime
from zerver.models import Realm, UserProfile
from zerver.models.users import get_user_profile_by_id
from zerver.models.users import get_user_profile_by_id, get_user_profile_narrow_by_id
class SessionEngine(Protocol):
@@ -99,3 +103,33 @@ def get_expirable_session_var(
if delete:
del session[var_name]
return value
def narrow_request_user(
request: HttpRequest, *, user_id: int | None = None
) -> UserProfile | AnonymousUser:
# In Tornado and other performance-critical paths, we want to not
# load the extremely wide default UserProfile select_related. We
# respect the request.user if it has been explicitly set already,
# and otherwise perform a cached lookup of a much narrower view of
# the UserProfile; this is faster than the normal UserProfile both
# for cache misses (1.8ms vs 15ms) and cache hits (147μs vs
# 387μs). We fill the requester_for_logs to skip a session and
# user memcached fetch when writing the log lines.
if not isinstance(request.user, LazyObject):
return request.user # nocoverage
if user_id is None:
user_id = get_session_dict_user(request.session)
if user_id is None:
return AnonymousUser() # nocoverage
try:
request.user = get_user_profile_narrow_by_id(user_id)
RequestNotes.get_notes(
request
).requester_for_logs = request.user.format_requester_for_logs()
except UserProfile.DoesNotExist: # nocoverage
request.user = AnonymousUser()
return request.user

View File

@@ -25,6 +25,7 @@ from zerver.lib.cache import (
user_profile_by_api_key_cache_key,
user_profile_by_id_cache_key,
user_profile_cache_key,
user_profile_narrow_by_id_cache_key,
)
from zerver.lib.types import ProfileData, RawUserDict
from zerver.lib.utils import generate_api_key
@@ -955,6 +956,24 @@ def get_user_profile_by_id(user_profile_id: int) -> UserProfile:
return base_get_user_queryset().get(id=user_profile_id)
@cache_with_key(user_profile_narrow_by_id_cache_key, timeout=3600 * 24 * 7)
def get_user_profile_narrow_by_id(user_profile_id: int) -> UserProfile:
return (
UserProfile.objects.select_related("realm")
.only(
"id",
"bot_type",
"is_active",
"rate_limits",
"role",
"recipient_id",
"realm__string_id",
"realm__deactivated",
)
.get(id=user_profile_id)
)
def get_user_profile_by_email(email: str) -> UserProfile:
"""This function is intended to be used for
manual manage.py shell work; robust code must use get_user or

View File

@@ -1,6 +1,7 @@
import asyncio
import socket
from collections.abc import Awaitable, Callable
from collections.abc import Awaitable, Callable, Iterator
from contextlib import contextmanager
from functools import wraps
from typing import Any, TypeVar
from unittest import TestResult, mock
@@ -17,7 +18,10 @@ from tornado.httpclient import AsyncHTTPClient, HTTPResponse
from tornado.httpserver import HTTPServer
from typing_extensions import ParamSpec, override
from zerver.lib.cache import user_profile_narrow_by_id_cache_key
from zerver.lib.test_classes import ZulipTestCase
from zerver.lib.test_helpers import cache_tries_captured, queries_captured
from zerver.models import UserProfile
from zerver.tornado import event_queue
from zerver.tornado.application import create_tornado_application
from zerver.tornado.event_queue import process_event
@@ -111,6 +115,20 @@ class EventsTestCase(TornadoWebTestCase):
queue_id = await self.create_queue()
self.assertIn(queue_id, event_queue.clients)
@contextmanager
def mocked_events(self, user_profile: UserProfile, event: dict[str, object]) -> Iterator[None]:
def process_events() -> None:
users = [user_profile.id]
process_event(event, users)
def wrapped_fetch_events(**query: Any) -> dict[str, Any]:
ret = event_queue.fetch_events(**query)
asyncio.get_running_loop().call_soon(process_events)
return ret
with mock.patch("zerver.tornado.views.fetch_events", side_effect=wrapped_fetch_events):
yield
@async_to_sync_decorator
async def test_events_async(self) -> None:
user_profile = await in_django_thread(lambda: self.example_user("hamlet"))
@@ -123,20 +141,7 @@ class EventsTestCase(TornadoWebTestCase):
path = f"/json/events?{urlencode(data)}"
def process_events() -> None:
users = [user_profile.id]
event = dict(
type="test",
data="test data",
)
process_event(event, users)
def wrapped_fetch_events(**query: Any) -> dict[str, Any]:
ret = event_queue.fetch_events(**query)
asyncio.get_running_loop().call_soon(process_events)
return ret
with mock.patch("zerver.tornado.views.fetch_events", side_effect=wrapped_fetch_events):
with self.mocked_events(user_profile, {"type": "test", "data": "test data"}):
response = await self.fetch_async("GET", path)
self.assertEqual(response.headers["Vary"], "Accept-Language, Cookie")
@@ -148,3 +153,61 @@ class EventsTestCase(TornadoWebTestCase):
],
)
self.assertEqual(data["result"], "success")
@async_to_sync_decorator
async def test_events_caching(self) -> None:
user_profile = await in_django_thread(lambda: self.example_user("hamlet"))
await in_django_thread(lambda: self.login_user(user_profile))
event_queue_id = await self.create_queue()
data = {
"queue_id": event_queue_id,
"last_event_id": -1,
}
path = f"/json/events?{urlencode(data)}"
with (
self.mocked_events(user_profile, {"type": "test", "data": "test data"}),
cache_tries_captured() as cache_gets,
queries_captured() as queries,
):
await self.fetch_async("GET", path)
# Two cache fetches -- for the user and the client. In
# production, the session would also be a cache access,
# but tests don't use cached sessions.
self.assert_length(cache_gets, 2)
self.assertEqual(
cache_gets[0], ("get", user_profile_narrow_by_id_cache_key(user_profile.id), None)
)
self.assertEqual(cache_gets[1][0], "get")
assert isinstance(cache_gets[1][1], str)
self.assertTrue(cache_gets[1][1].startswith("get_client:"))
# Three database queries -- session, user, and client.
# The user query should remain small; it is currently 470
# bytes, but anything under 1k should be Fine.
self.assert_length(queries, 3)
self.assertIn("django_session", queries[0].sql)
self.assertIn("zerver_userprofile", queries[1].sql)
self.assertLessEqual(len(queries[1].sql), 1024)
self.assertIn("zerver_client", queries[2].sql)
# Perform the same request again, preserving the caches. We
# should only see one database query -- the session. As noted
# above, in production even that would be cached.
with (
self.mocked_events(user_profile, {"type": "test", "data": "test data"}),
cache_tries_captured() as cache_gets,
queries_captured(keep_cache_warm=True) as queries,
):
await self.fetch_async("GET", path)
self.assert_length(cache_gets, 1)
self.assertEqual(
cache_gets[0], ("get", user_profile_narrow_by_id_cache_key(user_profile.id), None)
)
# Client is cached in-process-memory, so doesn't even see
# a memcached hit
self.assert_length(queries, 1)
self.assertIn("django_session", queries[0].sql)

View File

@@ -16,6 +16,7 @@ from urllib3.util import Retry
from zerver.lib.partial import partial
from zerver.lib.queue import queue_json_publish_rollback_unsafe
from zerver.models import Client, Realm, UserProfile
from zerver.models.users import get_user_profile_narrow_by_id
from zerver.tornado.sharding import (
get_realm_tornado_ports,
get_tornado_url,
@@ -97,6 +98,11 @@ def request_event_queue(
if not settings.USING_TORNADO:
return None
# We make sure to pre-fill the narrow user cache, to save
# session-based Tornado (/json/events) from having to go to the
# database.
get_user_profile_narrow_by_id(user_profile.id)
tornado_url = get_tornado_url(get_user_tornado_port(user_profile))
req = {
"dont_block": "true",
@@ -134,6 +140,12 @@ def get_user_events(
if not settings.USING_TORNADO:
return []
# Pre-fill the narrow user cache, to save session-based Tornado
# (/json/events) from having to go to the database. This is
# almost certainly filled already, from above, but there is little
# harm in forcing it.
get_user_profile_narrow_by_id(user_profile.id)
tornado_url = get_tornado_url(get_user_tornado_port(user_profile))
post_data: dict[str, Any] = {
"queue_id": queue_id,

View File

@@ -14,10 +14,10 @@ from zerver.lib.exceptions import JsonableError
from zerver.lib.queue import get_queue_client
from zerver.lib.request import RequestNotes
from zerver.lib.response import AsynchronousResponse, json_success
from zerver.lib.sessions import narrow_request_user
from zerver.lib.typed_endpoint import ApiParamConfig, DocumentationStatus, typed_endpoint
from zerver.models import UserProfile
from zerver.models.clients import get_client
from zerver.models.users import get_user_profile_by_id
from zerver.tornado.descriptors import is_current_port
from zerver.tornado.event_queue import (
access_client_descriptor,
@@ -100,8 +100,8 @@ def cleanup_event_queue(
@internal_api_view(True)
@typed_endpoint
def get_events_internal(request: HttpRequest, *, user_profile_id: Json[int]) -> HttpResponse:
user_profile = get_user_profile_by_id(user_profile_id)
RequestNotes.get_notes(request).requester_for_logs = user_profile.format_requester_for_logs()
user_profile = narrow_request_user(request, user_id=user_profile_id)
assert isinstance(user_profile, UserProfile)
assert is_current_port(get_user_tornado_port(user_profile))
process_client(request, user_profile, client_name="internal")

View File

@@ -523,7 +523,11 @@ v1_api_and_json_patterns = [
# used to register for an event queue in tornado
rest_path("register", POST=(events_register_backend, {"allow_anonymous_user_web"})),
# events -> zerver.tornado.views
rest_path("events", GET=get_events, DELETE=cleanup_event_queue),
rest_path(
"events",
GET=(get_events, {"narrow_user_session_cache"}),
DELETE=(cleanup_event_queue, {"narrow_user_session_cache"}),
),
# Used to generate a Zoom video call URL
rest_path("calls/zoom/create", POST=make_zoom_video_call),
# Used to generate a BigBlueButton video call URL