From 656ca17e1489d2ed50093e80425b67b39c843ab5 Mon Sep 17 00:00:00 2001 From: Alex Vandiver Date: Wed, 31 Jul 2024 16:07:10 +0000 Subject: [PATCH] thumbnail: Resolve a race condition when rendering messages. Messages are rendered outside of a transaction, for performance reasons, and then sent inside of one. This opens thumbnailing up to a race where the thumbnails have not yet been written when the message is rendered, but the message has not been sent when thumbnailing completes, causing `rewrite_thumbnailed_images` to be a no-op and the message being left with a spinner which never resolves. Explicitly lock and use he ImageAttachment data inside the message-sending transaction, to rewrite the message content with the latest information about the existing thumbnails. Despite the thumbnailing worker taking a lock on Message rows to update them, this does not lead to deadlocks -- the INSERT of the Message rows happens in a transaction, ensuring that either the message rending blocks the thumbnailing until the Message row is created, or that the `rewrite_thumbnailed_images` and Message INSERT waits until thumbnailing is complete (and updated no Message rows). (cherry picked from commit 6f20c15ae9e56c04e1977aa989c358b1f516daeb) --- zerver/actions/message_send.py | 27 ++++++++++++- zerver/lib/markdown/__init__.py | 5 ++- zerver/lib/thumbnail.py | 28 ++++++++++---- zerver/tests/test_markdown_thumbnail.py | 50 ++++++++++++++++++++++++- zerver/worker/thumbnail.py | 2 +- 5 files changed, 100 insertions(+), 12 deletions(-) diff --git a/zerver/actions/message_send.py b/zerver/actions/message_send.py index 37148f1296..2e324e4bf0 100644 --- a/zerver/actions/message_send.py +++ b/zerver/actions/message_send.py @@ -68,6 +68,7 @@ from zerver.lib.stream_subscription import ( from zerver.lib.stream_topic import StreamTopicTarget from zerver.lib.streams import access_stream_for_send_message, ensure_stream, subscribed_to_stream from zerver.lib.string_validation import check_stream_name +from zerver.lib.thumbnail import get_user_upload_previews, rewrite_thumbnailed_images from zerver.lib.timestamp import timestamp_to_datetime from zerver.lib.topic import participants_for_topic from zerver.lib.url_preview.types import UrlEmbedData @@ -864,7 +865,31 @@ def do_send_messages( send_request.message, send_request.rendering_result.potential_attachment_path_ids ): send_request.message.has_attachment = True - send_request.message.save(update_fields=["has_attachment"]) + update_fields = ["has_attachment"] + + # Lock the ImageAttachment rows that we pulled when rendering + # outside of the transaction; they may be out of date now, and we + # need to post-process the rendered_content to swap out the + # spinner. We would ideally take a `FOR SHARE` lock here + # (https://www.postgresql.org/docs/current/explicit-locking.html#LOCKING-ROWS), + # which would block UPDATEs but not other `FOR SHARE`, but Django + # does not support this yet: (https://code.djangoproject.com/ticket/10088) + assert send_request.message.rendered_content is not None + if send_request.rendering_result.thumbnail_spinners: + previews = get_user_upload_previews( + send_request.message.realm_id, + send_request.message.content, + lock=True, + path_ids=list(send_request.rendering_result.thumbnail_spinners), + ) + new_rendered_content = rewrite_thumbnailed_images( + send_request.message.rendered_content, previews + )[0] + if new_rendered_content is not None: + send_request.message.rendered_content = new_rendered_content + update_fields.append("rendered_content") + + send_request.message.save(update_fields=update_fields) ums: list[UserMessageLite] = [] for send_request in send_message_requests: diff --git a/zerver/lib/markdown/__init__.py b/zerver/lib/markdown/__init__.py index d54886d1b9..062509ec62 100644 --- a/zerver/lib/markdown/__init__.py +++ b/zerver/lib/markdown/__init__.py @@ -118,6 +118,7 @@ class MessageRenderingResult: links_for_preview: set[str] user_ids_with_alert_words: set[int] potential_attachment_path_ids: list[str] + thumbnail_spinners: set[str] @dataclass @@ -2625,6 +2626,7 @@ def do_convert( links_for_preview=set(), user_ids_with_alert_words=set(), potential_attachment_path_ids=[], + thumbnail_spinners=set(), ) _md_engine.zulip_message = message @@ -2683,9 +2685,10 @@ def do_convert( # Post-process the result with the rendered image previews: if user_upload_previews is not None: - content_with_thumbnails = rewrite_thumbnailed_images( + content_with_thumbnails, thumbnail_spinners = rewrite_thumbnailed_images( rendering_result.rendered_content, user_upload_previews ) + rendering_result.thumbnail_spinners = thumbnail_spinners if content_with_thumbnails is not None: rendering_result.rendered_content = content_with_thumbnails diff --git a/zerver/lib/thumbnail.py b/zerver/lib/thumbnail.py index 5980f90d26..26584f3ad0 100644 --- a/zerver/lib/thumbnail.py +++ b/zerver/lib/thumbnail.py @@ -333,11 +333,22 @@ class MarkdownImageMetadata: def get_user_upload_previews( - realm_id: int, content: str + realm_id: int, + content: str, + lock: bool = False, + path_ids: list[str] | None = None, ) -> dict[str, MarkdownImageMetadata | None]: - matches = re.findall(r"/user_uploads/(\d+/[/\w.-]+)", content) + if path_ids is None: + path_ids = re.findall(r"/user_uploads/(\d+/[/\w.-]+)", content) + if not path_ids: + return {} + upload_preview_data: dict[str, MarkdownImageMetadata | None] = {} - for image_attachment in ImageAttachment.objects.filter(realm_id=realm_id, path_id__in=matches): + + image_attachments = ImageAttachment.objects.filter(realm_id=realm_id, path_id__in=path_ids) + if lock: + image_attachments = image_attachments.select_for_update() + for image_attachment in image_attachments: if image_attachment.thumbnail_metadata == []: # Image exists, and header of it parsed as a valid image, # but has not been thumbnailed yet; we will render a @@ -379,10 +390,11 @@ def rewrite_thumbnailed_images( rendered_content: str, images: dict[str, MarkdownImageMetadata | None], to_delete: set[str] | None = None, -) -> str | None: +) -> tuple[str | None, set[str]]: if not images and not to_delete: - return None + return None, set() + remaining_thumbnails = set() parsed_message = BeautifulSoup(rendered_content, "html.parser") changed = False @@ -419,7 +431,7 @@ def rewrite_thumbnailed_images( # This happens routinely when a message contained multiple # unthumbnailed images, and only one of those images just # completed thumbnailing. - pass + remaining_thumbnails.add(path_id) else: changed = True del image_tag["class"] @@ -432,6 +444,6 @@ def rewrite_thumbnailed_images( if changed: # The formatter="html5" means we do not produce self-closing tags - return parsed_message.encode(formatter="html5").decode().strip() + return parsed_message.encode(formatter="html5").decode().strip(), remaining_thumbnails else: - return None + return None, remaining_thumbnails diff --git a/zerver/tests/test_markdown_thumbnail.py b/zerver/tests/test_markdown_thumbnail.py index b68d4a3e21..59ca8e1d26 100644 --- a/zerver/tests/test_markdown_thumbnail.py +++ b/zerver/tests/test_markdown_thumbnail.py @@ -4,13 +4,22 @@ from unittest.mock import patch import pyvips from zerver.actions.message_delete import do_delete_messages +from zerver.actions.message_send import check_message, do_send_messages +from zerver.lib.addressee import Addressee from zerver.lib.camo import get_camo_url from zerver.lib.markdown import render_message_markdown from zerver.lib.test_classes import ZulipTestCase from zerver.lib.test_helpers import get_test_image_file, read_test_image_file from zerver.lib.thumbnail import ThumbnailFormat from zerver.lib.upload import upload_message_attachment -from zerver.models import ArchivedAttachment, ArchivedMessage, Attachment, ImageAttachment, Message +from zerver.models import ( + ArchivedAttachment, + ArchivedMessage, + Attachment, + Client, + ImageAttachment, + Message, +) from zerver.models.clients import get_client from zerver.worker.thumbnail import ensure_thumbnails @@ -334,3 +343,42 @@ class MarkdownThumbnailTest(ZulipTestCase): private_message_id, f'

This image is private

\n{rendered_thumb}', ) + + def test_thumbnail_race(self) -> None: + """Test what happens when thumbnailing happens between rendering and sending""" + path_id = self.upload_image("img.png") + + self.assert_length(ImageAttachment.objects.get(path_id=path_id).thumbnail_metadata, 0) + + # Render, but do not send, the message referencing the image. + # This will render as a spinner, since the thumbnail has not + # been generated yet. + send_request = check_message( + self.example_user("othello"), + Client.objects.get_or_create(name="test suite")[0], + Addressee.for_stream_name("Verona", "test"), + f"[image](/user_uploads/{path_id})", + ) + expected = ( + f'

image

\n' + f'
' + '
' + ) + self.assertEqual(send_request.message.rendered_content, expected) + + # Thumbnail the image. The message does not exist yet, so + # nothing is re-written. + ensure_thumbnails(ImageAttachment.objects.get(path_id=path_id)) + + # Send the message; this should re-check the ImageAttachment + # data, find the thumbnails, and update the rendered_content + # to no longer contain a spinner. + message_id = do_send_messages([send_request])[0].message_id + + rendered_thumb = ( + f'
' + f'
' + ) + self.assert_message_content_is( + message_id, f'

image

\n{rendered_thumb}' + ) diff --git a/zerver/worker/thumbnail.py b/zerver/worker/thumbnail.py index 9e5854e06c..98c9876606 100644 --- a/zerver/worker/thumbnail.py +++ b/zerver/worker/thumbnail.py @@ -170,7 +170,7 @@ def update_message_rendered_content( message.rendered_content, {} if image_data is None else {path_id: image_data}, {path_id} if image_data is None else set(), - ) + )[0] if rendered_content is None: # There were no updates -- for instance, if we re-run # ensure_thumbnails on an ImageAttachment we already