models: Add content_type to ImageAttachment.

This means that only ImageAttachment row needs to be fetched, and removes the need to pass around an extra parameter. This denormalization is safe, since in general Attachment rows are read-only, so we are not concerned with drift between the Attachment and ImageAttachment tables. We cannot make content_type non-null, since while the both the `content_type` column in Attachment and populating that from requests predates the ImageAttachment table, we have both backfilled ImageAttachment rows to consider, and imports may also leave files with no `content_type`. Any backfill of currently-null `content_type` values will thus need to update both tables. This change fixes a race condition when importing. ImageAttachment rows are imported before rendering Messages, which are both before importing Attachment rows; if the thumbnailing finished after the Message was imported but before Attachment rows were imported, then the re-rendering step would not know the image's content-type.
2025-10-23 04:52:12 +00:00 · 2025-01-30 15:58:25 +00:00
parent 0788942a68
commit 98362de185
7 changed files with 97 additions and 75 deletions
--- a/zerver/lib/thumbnail.py
+++ b/zerver/lib/thumbnail.py
@@ -9,14 +9,13 @@ from typing import TypeVar
 import pyvips
 from bs4 import BeautifulSoup
 from bs4.formatter import EntitySubstitution, HTMLFormatter
-from django.db.models import OuterRef, Subquery
 from django.utils.translation import gettext as _
 from typing_extensions import override

 from zerver.lib.exceptions import ErrorCode, JsonableError
 from zerver.lib.mime_types import INLINE_MIME_TYPES
 from zerver.lib.queue import queue_event_on_commit
-from zerver.models import Attachment, ImageAttachment
+from zerver.models import ImageAttachment

 DEFAULT_AVATAR_SIZE = 100
 MEDIUM_AVATAR_SIZE = 500
@@ -293,14 +292,14 @@ def resize_emoji(


 def missing_thumbnails(
-    image_attachment: ImageAttachment, original_content_type: str | None = None
+    image_attachment: ImageAttachment,
 ) -> list[ThumbnailFormat]:
    seen_thumbnails: set[StoredThumbnailFormat] = set()
    for existing_thumbnail in image_attachment.thumbnail_metadata:
        seen_thumbnails.add(StoredThumbnailFormat(**existing_thumbnail))

    potential_output_formats = list(THUMBNAIL_OUTPUT_FORMATS)
-    if original_content_type is None or original_content_type not in INLINE_MIME_TYPES:
+    if image_attachment.content_type not in INLINE_MIME_TYPES:
        if image_attachment.original_width_px >= image_attachment.original_height_px:
            additional_format = ThumbnailFormat(
                TRANSCODED_IMAGE_FORMAT.extension,
@@ -368,6 +367,7 @@ def maybe_thumbnail(
                original_height_px=height,
                frames=image.get_n_pages(),
                thumbnail_metadata=[],
+                content_type=content_type,
            )
            queue_event_on_commit("thumbnail", {"id": image_row.id})
            return image_row
@@ -415,15 +415,9 @@ def get_user_upload_previews(

    upload_preview_data: dict[str, MarkdownImageMetadata] = {}

-    image_attachments = (
-        ImageAttachment.objects.filter(realm_id=realm_id, path_id__in=path_ids)
-        .annotate(
-            original_content_type=Subquery(
-                Attachment.objects.filter(path_id=OuterRef("path_id")).values("content_type")
-            )
-        )
-        .order_by("id")
-    )
+    image_attachments = ImageAttachment.objects.filter(
+        realm_id=realm_id, path_id__in=path_ids
+    ).order_by("id")
    if lock:
        image_attachments = image_attachments.select_for_update(of=("self",))
    for image_attachment in image_attachments:
@@ -436,7 +430,7 @@ def get_user_upload_previews(
                is_animated=False,
                original_width_px=image_attachment.original_width_px,
                original_height_px=image_attachment.original_height_px,
-                original_content_type=image_attachment.original_content_type,
+                original_content_type=image_attachment.content_type,
            )

            # We re-queue the row for thumbnailing to make sure that
@@ -454,10 +448,8 @@ def get_user_upload_previews(
                is_animated=is_animated,
                original_width_px=image_attachment.original_width_px,
                original_height_px=image_attachment.original_height_px,
-                original_content_type=image_attachment.original_content_type,
-                transcoded_image=get_transcoded_format(
-                    image_attachment, image_attachment.original_content_type
-                ),
+                original_content_type=image_attachment.content_type,
+                transcoded_image=get_transcoded_format(image_attachment),
            )
    return upload_preview_data

@@ -484,7 +476,7 @@ def get_default_thumbnail_url(image_attachment: ImageAttachment) -> tuple[str, b


 def get_transcoded_format(
-    image_attachment: ImageAttachment, original_content_type: str | None
+    image_attachment: ImageAttachment,
 ) -> StoredThumbnailFormat | None:
    # Returns None if the original content-type is judged to be
    # renderable inline.  Otherwise, we return the largest thumbnail
@@ -492,7 +484,7 @@ def get_transcoded_format(
    # not in INLINE_MIME_TYPES get an extra large-resolution thumbnail
    # added to their list of formats, this is thus either None or a
    # high-resolution thumbnail.
-    if original_content_type is None or original_content_type in INLINE_MIME_TYPES:
+    if image_attachment.content_type is None or image_attachment.content_type in INLINE_MIME_TYPES:
        return None

    thumbs_by_size = sorted(
--- a/zerver/migrations/0660_add_imageattachment_content_type.py
+++ b/zerver/migrations/0660_add_imageattachment_content_type.py
@@ -0,0 +1,48 @@
+from django.db import migrations, models
+from django.db.backends.base.schema import BaseDatabaseSchemaEditor
+from django.db.migrations.state import StateApps
+from django.db.models import OuterRef, Subquery
+
+
+def populate_content_type(apps: StateApps, schema_editor: BaseDatabaseSchemaEditor) -> None:
+    ImageAttachment = apps.get_model("zerver", "ImageAttachment")
+    Attachment = apps.get_model("zerver", "Attachment")
+
+    batch_size = 1000
+    min_id = 0
+    while True:
+        # Update content_types from corresponding Attachments in bulk
+        rows_updated = ImageAttachment.objects.filter(
+            id__gt=min_id, id__lte=min_id + batch_size
+        ).update(
+            content_type=Subquery(
+                Attachment.objects.filter(path_id=OuterRef("path_id")).values("content_type")
+            ),
+        )
+
+        if not rows_updated:
+            break
+
+        min_id += batch_size
+        print(f"Processed ImageAttachments through id {min_id}")
+
+
+class Migration(migrations.Migration):
+    atomic = False
+
+    dependencies = [
+        ("zerver", "0659_remove_realm_bot_creation_policy"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="imageattachment",
+            name="content_type",
+            field=models.TextField(null=True),
+        ),
+        migrations.RunPython(
+            populate_content_type,
+            reverse_code=migrations.RunPython.noop,
+            elidable=True,
+        ),
+    ]
--- a/zerver/models/messages.py
+++ b/zerver/models/messages.py
@@ -667,6 +667,7 @@ class ArchivedUserMessage(AbstractUserMessage):
 class ImageAttachment(models.Model):
    realm = models.ForeignKey(Realm, on_delete=CASCADE)
    path_id = models.TextField(db_index=True, unique=True)
+    content_type = models.TextField(null=True)

    original_width_px = models.IntegerField()
    original_height_px = models.IntegerField()
--- a/zerver/tests/test_markdown_thumbnail.py
+++ b/zerver/tests/test_markdown_thumbnail.py
@@ -192,7 +192,7 @@ class MarkdownThumbnailTest(ZulipTestCase):

        # Re-thumbnail with a non-overlapping set of sizes
        with self.thumbnail_formats(ThumbnailFormat("jpg", 100, 75, animated=False)):
-            ensure_thumbnails(ImageAttachment.objects.get(path_id=path_id), "image/gif")
+            ensure_thumbnails(ImageAttachment.objects.get(path_id=path_id))

        # We generate a new size but leave the old ones
        self.assert_length(ImageAttachment.objects.get(path_id=path_id).thumbnail_metadata, 3)
@@ -227,7 +227,7 @@ class MarkdownThumbnailTest(ZulipTestCase):
        )

        # Complete thumbnailing the second image first -- replacing only that spinner
-        ensure_thumbnails(ImageAttachment.objects.get(path_id=second_path_id), "image/jpeg")
+        ensure_thumbnails(ImageAttachment.objects.get(path_id=second_path_id))
        self.assert_message_content_is(
            message_id,
            (
@@ -247,7 +247,7 @@ class MarkdownThumbnailTest(ZulipTestCase):
        )

        # Finish the other thumbnail
-        ensure_thumbnails(ImageAttachment.objects.get(path_id=first_path_id), "image/png")
+        ensure_thumbnails(ImageAttachment.objects.get(path_id=first_path_id))
        self.assert_message_content_is(
            message_id,
            (
@@ -283,7 +283,7 @@ class MarkdownThumbnailTest(ZulipTestCase):

        # Completing rendering after it is deleted should work, and
        # update the rendered content in the archived message
-        ensure_thumbnails(ImageAttachment.objects.get(path_id=path_id), "image/png")
+        ensure_thumbnails(ImageAttachment.objects.get(path_id=path_id))
        expected = (
            f'<p><a href="/user_uploads/{path_id}">image</a></p>\n'
            f'<div class="message_inline_image"><a href="/user_uploads/{path_id}" title="image">'
@@ -313,7 +313,7 @@ class MarkdownThumbnailTest(ZulipTestCase):
            ) as thumb_mock,
            self.assertLogs("zerver.worker.thumbnail", "ERROR") as thumbnail_logs,
        ):
-            ensure_thumbnails(ImageAttachment.objects.get(path_id=path_id), "image/png")
+            ensure_thumbnails(ImageAttachment.objects.get(path_id=path_id))
            thumb_mock.assert_called_once()
        self.assert_length(thumbnail_logs.output, 1)
        self.assertTrue(
@@ -360,7 +360,7 @@ class MarkdownThumbnailTest(ZulipTestCase):
                ThumbnailFormat("webp", 200, 150, animated=False),
            ),
        ):
-            ensure_thumbnails(ImageAttachment.objects.get(path_id=path_id), "image/png")
+            ensure_thumbnails(ImageAttachment.objects.get(path_id=path_id))

        # Called once per format
        self.assertEqual(thumb_mock.call_count, 2)
@@ -410,7 +410,7 @@ class MarkdownThumbnailTest(ZulipTestCase):

        # Thumbnail the image.  The message does not exist yet, so
        # nothing is re-written.
-        ensure_thumbnails(ImageAttachment.objects.get(path_id=path_id), "image/png")
+        ensure_thumbnails(ImageAttachment.objects.get(path_id=path_id))

        # Send the message; this should re-check the ImageAttachment
        # data, find the thumbnails, and update the rendered_content
--- a/zerver/tests/test_thumbnail.py
+++ b/zerver/tests/test_thumbnail.py
@@ -368,11 +368,11 @@ class TestStoreThumbnail(ZulipTestCase):
            self.assertEqual(thumbnailed_image.get_n_pages(), 2)

        with self.thumbnail_formats(ThumbnailFormat("webp", 100, 75, animated=True)):
-            self.assertEqual(ensure_thumbnails(image_attachment, "image/webp"), 0)
+            self.assertEqual(ensure_thumbnails(image_attachment), 0)
        self.assert_length(image_attachment.thumbnail_metadata, 1)

        with self.thumbnail_formats(ThumbnailFormat("webp", 150, 100, opts="Q=90", animated=False)):
-            self.assertEqual(ensure_thumbnails(image_attachment, "image/webp"), 1)
+            self.assertEqual(ensure_thumbnails(image_attachment), 1)
        self.assert_length(image_attachment.thumbnail_metadata, 2)

        bigger_thumbnail = StoredThumbnailFormat(**image_attachment.thumbnail_metadata[1])
@@ -594,12 +594,13 @@ class TestStoreThumbnail(ZulipTestCase):
            original_width_px=128,
            frames=1,
            thumbnail_metadata=[],
+            content_type="image/gif",
        )
        with self.thumbnail_formats(ThumbnailFormat("webp", 100, 75, animated=False)):
-            self.assert_length(missing_thumbnails(image_attachment, "image/gif"), 1)
+            self.assert_length(missing_thumbnails(image_attachment), 1)

            with self.assertLogs("zerver.worker.thumbnail", level="ERROR") as error_log:
-                self.assertEqual(ensure_thumbnails(image_attachment, "image/gif"), 0)
+                self.assertEqual(ensure_thumbnails(image_attachment), 0)

        libvips_version = (pyvips.version(0), pyvips.version(1))
        # This error message changed
@@ -619,25 +620,24 @@ class TestStoreThumbnail(ZulipTestCase):
            original_height_px=100,
            frames=1,
            thumbnail_metadata=[],
+            content_type="image/png",
        )
        with self.thumbnail_formats():
-            self.assertEqual(missing_thumbnails(image_attachment, "image/png"), [])
+            self.assertEqual(missing_thumbnails(image_attachment), [])

        still_webp = ThumbnailFormat("webp", 100, 75, animated=False, opts="Q=90")
        with self.thumbnail_formats(still_webp):
-            self.assertEqual(missing_thumbnails(image_attachment, "image/png"), [still_webp])
+            self.assertEqual(missing_thumbnails(image_attachment), [still_webp])

        anim_webp = ThumbnailFormat("webp", 100, 75, animated=True, opts="Q=90")
        with self.thumbnail_formats(still_webp, anim_webp):
            # It's not animated, so the animated format doesn't appear at all
-            self.assertEqual(missing_thumbnails(image_attachment, "image/png"), [still_webp])
+            self.assertEqual(missing_thumbnails(image_attachment), [still_webp])

        still_jpeg = ThumbnailFormat("jpeg", 100, 75, animated=False, opts="Q=90")
        with self.thumbnail_formats(still_webp, anim_webp, still_jpeg):
            # But other still formats do
-            self.assertEqual(
-                missing_thumbnails(image_attachment, "image/png"), [still_webp, still_jpeg]
-            )
+            self.assertEqual(missing_thumbnails(image_attachment), [still_webp, still_jpeg])

        # If we have a rendered 150x100.webp, then we're not missing it
        rendered_still_webp = StoredThumbnailFormat(
@@ -652,14 +652,12 @@ class TestStoreThumbnail(ZulipTestCase):
        )
        image_attachment.thumbnail_metadata = [asdict(rendered_still_webp)]
        with self.thumbnail_formats(still_webp, anim_webp, still_jpeg):
-            self.assertEqual(missing_thumbnails(image_attachment, "image/png"), [still_jpeg])
+            self.assertEqual(missing_thumbnails(image_attachment), [still_jpeg])

        # If we have the still, and it's animated, we do still need the animated
        image_attachment.frames = 10
        with self.thumbnail_formats(still_webp, anim_webp, still_jpeg):
-            self.assertEqual(
-                missing_thumbnails(image_attachment, "image/png"), [anim_webp, still_jpeg]
-            )
+            self.assertEqual(missing_thumbnails(image_attachment), [anim_webp, still_jpeg])

    def test_transcoded_format(self) -> None:
        image_attachment = ImageAttachment(
@@ -668,34 +666,29 @@ class TestStoreThumbnail(ZulipTestCase):
            original_height_px=100,
            frames=1,
            thumbnail_metadata=[],
+            content_type="image/tiff",
        )
        still_webp = ThumbnailFormat("webp", 100, 75, animated=False, opts="Q=90")
        with self.thumbnail_formats(still_webp):
            # We add a high-resolution transcoded format if the image isn't in INLINE_MIME_TYPES:
            transcoded = ThumbnailFormat("webp", 4032, 3024, animated=False)
-            self.assertEqual(
-                missing_thumbnails(image_attachment, "image/tiff"), [still_webp, transcoded]
-            )
+            self.assertEqual(missing_thumbnails(image_attachment), [still_webp, transcoded])

            # We flip to being portrait if the image is higher than it is wide
            transcoded = ThumbnailFormat("webp", 3024, 4032, animated=False)
            image_attachment.original_height_px = 300
-            self.assertEqual(
-                missing_thumbnails(image_attachment, "image/tiff"), [still_webp, transcoded]
-            )
+            self.assertEqual(missing_thumbnails(image_attachment), [still_webp, transcoded])

            # The format is not animated, even if the original was
            image_attachment.original_height_px = 100
            image_attachment.frames = 10
            transcoded = ThumbnailFormat("webp", 4032, 3024, animated=False)
-            self.assertEqual(
-                missing_thumbnails(image_attachment, "image/tiff"), [still_webp, transcoded]
-            )
+            self.assertEqual(missing_thumbnails(image_attachment), [still_webp, transcoded])

            # We do not store on the image_attachment if we generated
            # a transcoded version; it just picks the largest format
            # if one is called for.
-            self.assertEqual(get_transcoded_format(image_attachment, "image/tiff"), None)
+            self.assertEqual(get_transcoded_format(image_attachment), None)
            image_attachment.thumbnail_metadata = [
                asdict(
                    StoredThumbnailFormat(
@@ -734,12 +727,14 @@ class TestStoreThumbnail(ZulipTestCase):
                    )
                ),
            ]
-            self.assertEqual(get_transcoded_format(image_attachment, "image/png"), None)
-            self.assertEqual(get_transcoded_format(image_attachment, None), None)
            self.assertEqual(
-                get_transcoded_format(image_attachment, "image/tiff"),
+                get_transcoded_format(image_attachment),
                ThumbnailFormat("webp", 4032, 3024, animated=False),
            )
+            image_attachment.content_type = "image/png"
+            self.assertEqual(get_transcoded_format(image_attachment), None)
+            image_attachment.content_type = None
+            self.assertEqual(get_transcoded_format(image_attachment), None)

    def test_maybe_thumbnail_from_stream(self) -> None:
        # If we put the file in place directly (e.g. simulating a
--- a/zerver/views/upload.py
+++ b/zerver/views/upload.py
@@ -342,7 +342,6 @@ def serve_file(
            with transaction.atomic(savepoint=False):
                ensure_thumbnails(
                    ImageAttachment.objects.select_for_update().get(id=image_attachment.id),
-                    attachment.content_type,
                )

        # Update the path that we are fetching to be the thumbnail
--- a/zerver/worker/thumbnail.py
+++ b/zerver/worker/thumbnail.py
@@ -6,7 +6,6 @@ from typing import Any

 import pyvips
 from django.db import transaction
-from django.db.models import OuterRef, Subquery
 from typing_extensions import override

 from zerver.actions.message_edit import do_update_embedded_data
@@ -22,7 +21,7 @@ from zerver.lib.thumbnail import (
    rewrite_thumbnailed_images,
 )
 from zerver.lib.upload import save_attachment_contents, upload_backend
-from zerver.models import ArchivedMessage, Attachment, ImageAttachment, Message
+from zerver.models import ArchivedMessage, ImageAttachment, Message
 from zerver.worker.base import QueueProcessingWorker, assign_queue

 logger = logging.getLogger(__name__)
@@ -40,21 +39,11 @@ class ThumbnailWorker(QueueProcessingWorker):
                # directly to a thumbnail URL we have not made yet.
                # This may mean that we may generate 0 thumbnail
                # images once we get the lock.
-                row = (
-                    ImageAttachment.objects.select_for_update(of=("self",))
-                    .annotate(
-                        original_content_type=Subquery(
-                            Attachment.objects.filter(path_id=OuterRef("path_id")).values(
-                                "content_type"
-                            )
-                        )
-                    )
-                    .get(id=event["id"])
-                )
+                row = ImageAttachment.objects.select_for_update(of=("self",)).get(id=event["id"])
            except ImageAttachment.DoesNotExist:  # nocoverage
                logger.info("ImageAttachment row %d missing", event["id"])
                return
-            uploaded_thumbnails = ensure_thumbnails(row, row.original_content_type)
+            uploaded_thumbnails = ensure_thumbnails(row)
        end = time.time()
        logger.info(
            "Processed %d thumbnails (%dms)",
@@ -63,10 +52,8 @@ class ThumbnailWorker(QueueProcessingWorker):
        )


-def ensure_thumbnails(
-    image_attachment: ImageAttachment, original_content_type: str | None = None
-) -> int:
-    needed_thumbnails = missing_thumbnails(image_attachment, original_content_type)
+def ensure_thumbnails(image_attachment: ImageAttachment) -> int:
+    needed_thumbnails = missing_thumbnails(image_attachment)

    if not needed_thumbnails:
        return 0
@@ -174,8 +161,8 @@ def ensure_thumbnails(
            is_animated=is_animated,
            original_width_px=image_attachment.original_width_px,
            original_height_px=image_attachment.original_height_px,
-            original_content_type=original_content_type,
-            transcoded_image=get_transcoded_format(image_attachment, original_content_type),
+            original_content_type=image_attachment.content_type,
+            transcoded_image=get_transcoded_format(image_attachment),
        ),
    )
    return written_images