models: Add content_type to ImageAttachment.

This means that only ImageAttachment row needs to be fetched, and
removes the need to pass around an extra parameter.  This
denormalization is safe, since in general Attachment rows are
read-only, so we are not concerned with drift between the Attachment
and ImageAttachment tables.

We cannot make content_type non-null, since while the both the
`content_type` column in Attachment and populating that from requests
predates the ImageAttachment table, we have both backfilled
ImageAttachment rows to consider, and imports may also leave files
with no `content_type`.  Any backfill of currently-null `content_type`
values will thus need to update both tables.

This change fixes a race condition when importing. ImageAttachment
rows are imported before rendering Messages, which are both before
importing Attachment rows; if the thumbnailing finished after the
Message was imported but before Attachment rows were imported, then
the re-rendering step would not know the image's content-type.
This commit is contained in:
Alex Vandiver
2025-01-30 15:58:25 +00:00
committed by Tim Abbott
parent 0788942a68
commit 98362de185
7 changed files with 97 additions and 75 deletions

View File

@@ -9,14 +9,13 @@ from typing import TypeVar
import pyvips
from bs4 import BeautifulSoup
from bs4.formatter import EntitySubstitution, HTMLFormatter
from django.db.models import OuterRef, Subquery
from django.utils.translation import gettext as _
from typing_extensions import override
from zerver.lib.exceptions import ErrorCode, JsonableError
from zerver.lib.mime_types import INLINE_MIME_TYPES
from zerver.lib.queue import queue_event_on_commit
from zerver.models import Attachment, ImageAttachment
from zerver.models import ImageAttachment
DEFAULT_AVATAR_SIZE = 100
MEDIUM_AVATAR_SIZE = 500
@@ -293,14 +292,14 @@ def resize_emoji(
def missing_thumbnails(
image_attachment: ImageAttachment, original_content_type: str | None = None
image_attachment: ImageAttachment,
) -> list[ThumbnailFormat]:
seen_thumbnails: set[StoredThumbnailFormat] = set()
for existing_thumbnail in image_attachment.thumbnail_metadata:
seen_thumbnails.add(StoredThumbnailFormat(**existing_thumbnail))
potential_output_formats = list(THUMBNAIL_OUTPUT_FORMATS)
if original_content_type is None or original_content_type not in INLINE_MIME_TYPES:
if image_attachment.content_type not in INLINE_MIME_TYPES:
if image_attachment.original_width_px >= image_attachment.original_height_px:
additional_format = ThumbnailFormat(
TRANSCODED_IMAGE_FORMAT.extension,
@@ -368,6 +367,7 @@ def maybe_thumbnail(
original_height_px=height,
frames=image.get_n_pages(),
thumbnail_metadata=[],
content_type=content_type,
)
queue_event_on_commit("thumbnail", {"id": image_row.id})
return image_row
@@ -415,15 +415,9 @@ def get_user_upload_previews(
upload_preview_data: dict[str, MarkdownImageMetadata] = {}
image_attachments = (
ImageAttachment.objects.filter(realm_id=realm_id, path_id__in=path_ids)
.annotate(
original_content_type=Subquery(
Attachment.objects.filter(path_id=OuterRef("path_id")).values("content_type")
)
)
.order_by("id")
)
image_attachments = ImageAttachment.objects.filter(
realm_id=realm_id, path_id__in=path_ids
).order_by("id")
if lock:
image_attachments = image_attachments.select_for_update(of=("self",))
for image_attachment in image_attachments:
@@ -436,7 +430,7 @@ def get_user_upload_previews(
is_animated=False,
original_width_px=image_attachment.original_width_px,
original_height_px=image_attachment.original_height_px,
original_content_type=image_attachment.original_content_type,
original_content_type=image_attachment.content_type,
)
# We re-queue the row for thumbnailing to make sure that
@@ -454,10 +448,8 @@ def get_user_upload_previews(
is_animated=is_animated,
original_width_px=image_attachment.original_width_px,
original_height_px=image_attachment.original_height_px,
original_content_type=image_attachment.original_content_type,
transcoded_image=get_transcoded_format(
image_attachment, image_attachment.original_content_type
),
original_content_type=image_attachment.content_type,
transcoded_image=get_transcoded_format(image_attachment),
)
return upload_preview_data
@@ -484,7 +476,7 @@ def get_default_thumbnail_url(image_attachment: ImageAttachment) -> tuple[str, b
def get_transcoded_format(
image_attachment: ImageAttachment, original_content_type: str | None
image_attachment: ImageAttachment,
) -> StoredThumbnailFormat | None:
# Returns None if the original content-type is judged to be
# renderable inline. Otherwise, we return the largest thumbnail
@@ -492,7 +484,7 @@ def get_transcoded_format(
# not in INLINE_MIME_TYPES get an extra large-resolution thumbnail
# added to their list of formats, this is thus either None or a
# high-resolution thumbnail.
if original_content_type is None or original_content_type in INLINE_MIME_TYPES:
if image_attachment.content_type is None or image_attachment.content_type in INLINE_MIME_TYPES:
return None
thumbs_by_size = sorted(

View File

@@ -0,0 +1,48 @@
from django.db import migrations, models
from django.db.backends.base.schema import BaseDatabaseSchemaEditor
from django.db.migrations.state import StateApps
from django.db.models import OuterRef, Subquery
def populate_content_type(apps: StateApps, schema_editor: BaseDatabaseSchemaEditor) -> None:
ImageAttachment = apps.get_model("zerver", "ImageAttachment")
Attachment = apps.get_model("zerver", "Attachment")
batch_size = 1000
min_id = 0
while True:
# Update content_types from corresponding Attachments in bulk
rows_updated = ImageAttachment.objects.filter(
id__gt=min_id, id__lte=min_id + batch_size
).update(
content_type=Subquery(
Attachment.objects.filter(path_id=OuterRef("path_id")).values("content_type")
),
)
if not rows_updated:
break
min_id += batch_size
print(f"Processed ImageAttachments through id {min_id}")
class Migration(migrations.Migration):
atomic = False
dependencies = [
("zerver", "0659_remove_realm_bot_creation_policy"),
]
operations = [
migrations.AddField(
model_name="imageattachment",
name="content_type",
field=models.TextField(null=True),
),
migrations.RunPython(
populate_content_type,
reverse_code=migrations.RunPython.noop,
elidable=True,
),
]

View File

@@ -667,6 +667,7 @@ class ArchivedUserMessage(AbstractUserMessage):
class ImageAttachment(models.Model):
realm = models.ForeignKey(Realm, on_delete=CASCADE)
path_id = models.TextField(db_index=True, unique=True)
content_type = models.TextField(null=True)
original_width_px = models.IntegerField()
original_height_px = models.IntegerField()

View File

@@ -192,7 +192,7 @@ class MarkdownThumbnailTest(ZulipTestCase):
# Re-thumbnail with a non-overlapping set of sizes
with self.thumbnail_formats(ThumbnailFormat("jpg", 100, 75, animated=False)):
ensure_thumbnails(ImageAttachment.objects.get(path_id=path_id), "image/gif")
ensure_thumbnails(ImageAttachment.objects.get(path_id=path_id))
# We generate a new size but leave the old ones
self.assert_length(ImageAttachment.objects.get(path_id=path_id).thumbnail_metadata, 3)
@@ -227,7 +227,7 @@ class MarkdownThumbnailTest(ZulipTestCase):
)
# Complete thumbnailing the second image first -- replacing only that spinner
ensure_thumbnails(ImageAttachment.objects.get(path_id=second_path_id), "image/jpeg")
ensure_thumbnails(ImageAttachment.objects.get(path_id=second_path_id))
self.assert_message_content_is(
message_id,
(
@@ -247,7 +247,7 @@ class MarkdownThumbnailTest(ZulipTestCase):
)
# Finish the other thumbnail
ensure_thumbnails(ImageAttachment.objects.get(path_id=first_path_id), "image/png")
ensure_thumbnails(ImageAttachment.objects.get(path_id=first_path_id))
self.assert_message_content_is(
message_id,
(
@@ -283,7 +283,7 @@ class MarkdownThumbnailTest(ZulipTestCase):
# Completing rendering after it is deleted should work, and
# update the rendered content in the archived message
ensure_thumbnails(ImageAttachment.objects.get(path_id=path_id), "image/png")
ensure_thumbnails(ImageAttachment.objects.get(path_id=path_id))
expected = (
f'<p><a href="/user_uploads/{path_id}">image</a></p>\n'
f'<div class="message_inline_image"><a href="/user_uploads/{path_id}" title="image">'
@@ -313,7 +313,7 @@ class MarkdownThumbnailTest(ZulipTestCase):
) as thumb_mock,
self.assertLogs("zerver.worker.thumbnail", "ERROR") as thumbnail_logs,
):
ensure_thumbnails(ImageAttachment.objects.get(path_id=path_id), "image/png")
ensure_thumbnails(ImageAttachment.objects.get(path_id=path_id))
thumb_mock.assert_called_once()
self.assert_length(thumbnail_logs.output, 1)
self.assertTrue(
@@ -360,7 +360,7 @@ class MarkdownThumbnailTest(ZulipTestCase):
ThumbnailFormat("webp", 200, 150, animated=False),
),
):
ensure_thumbnails(ImageAttachment.objects.get(path_id=path_id), "image/png")
ensure_thumbnails(ImageAttachment.objects.get(path_id=path_id))
# Called once per format
self.assertEqual(thumb_mock.call_count, 2)
@@ -410,7 +410,7 @@ class MarkdownThumbnailTest(ZulipTestCase):
# Thumbnail the image. The message does not exist yet, so
# nothing is re-written.
ensure_thumbnails(ImageAttachment.objects.get(path_id=path_id), "image/png")
ensure_thumbnails(ImageAttachment.objects.get(path_id=path_id))
# Send the message; this should re-check the ImageAttachment
# data, find the thumbnails, and update the rendered_content

View File

@@ -368,11 +368,11 @@ class TestStoreThumbnail(ZulipTestCase):
self.assertEqual(thumbnailed_image.get_n_pages(), 2)
with self.thumbnail_formats(ThumbnailFormat("webp", 100, 75, animated=True)):
self.assertEqual(ensure_thumbnails(image_attachment, "image/webp"), 0)
self.assertEqual(ensure_thumbnails(image_attachment), 0)
self.assert_length(image_attachment.thumbnail_metadata, 1)
with self.thumbnail_formats(ThumbnailFormat("webp", 150, 100, opts="Q=90", animated=False)):
self.assertEqual(ensure_thumbnails(image_attachment, "image/webp"), 1)
self.assertEqual(ensure_thumbnails(image_attachment), 1)
self.assert_length(image_attachment.thumbnail_metadata, 2)
bigger_thumbnail = StoredThumbnailFormat(**image_attachment.thumbnail_metadata[1])
@@ -594,12 +594,13 @@ class TestStoreThumbnail(ZulipTestCase):
original_width_px=128,
frames=1,
thumbnail_metadata=[],
content_type="image/gif",
)
with self.thumbnail_formats(ThumbnailFormat("webp", 100, 75, animated=False)):
self.assert_length(missing_thumbnails(image_attachment, "image/gif"), 1)
self.assert_length(missing_thumbnails(image_attachment), 1)
with self.assertLogs("zerver.worker.thumbnail", level="ERROR") as error_log:
self.assertEqual(ensure_thumbnails(image_attachment, "image/gif"), 0)
self.assertEqual(ensure_thumbnails(image_attachment), 0)
libvips_version = (pyvips.version(0), pyvips.version(1))
# This error message changed
@@ -619,25 +620,24 @@ class TestStoreThumbnail(ZulipTestCase):
original_height_px=100,
frames=1,
thumbnail_metadata=[],
content_type="image/png",
)
with self.thumbnail_formats():
self.assertEqual(missing_thumbnails(image_attachment, "image/png"), [])
self.assertEqual(missing_thumbnails(image_attachment), [])
still_webp = ThumbnailFormat("webp", 100, 75, animated=False, opts="Q=90")
with self.thumbnail_formats(still_webp):
self.assertEqual(missing_thumbnails(image_attachment, "image/png"), [still_webp])
self.assertEqual(missing_thumbnails(image_attachment), [still_webp])
anim_webp = ThumbnailFormat("webp", 100, 75, animated=True, opts="Q=90")
with self.thumbnail_formats(still_webp, anim_webp):
# It's not animated, so the animated format doesn't appear at all
self.assertEqual(missing_thumbnails(image_attachment, "image/png"), [still_webp])
self.assertEqual(missing_thumbnails(image_attachment), [still_webp])
still_jpeg = ThumbnailFormat("jpeg", 100, 75, animated=False, opts="Q=90")
with self.thumbnail_formats(still_webp, anim_webp, still_jpeg):
# But other still formats do
self.assertEqual(
missing_thumbnails(image_attachment, "image/png"), [still_webp, still_jpeg]
)
self.assertEqual(missing_thumbnails(image_attachment), [still_webp, still_jpeg])
# If we have a rendered 150x100.webp, then we're not missing it
rendered_still_webp = StoredThumbnailFormat(
@@ -652,14 +652,12 @@ class TestStoreThumbnail(ZulipTestCase):
)
image_attachment.thumbnail_metadata = [asdict(rendered_still_webp)]
with self.thumbnail_formats(still_webp, anim_webp, still_jpeg):
self.assertEqual(missing_thumbnails(image_attachment, "image/png"), [still_jpeg])
self.assertEqual(missing_thumbnails(image_attachment), [still_jpeg])
# If we have the still, and it's animated, we do still need the animated
image_attachment.frames = 10
with self.thumbnail_formats(still_webp, anim_webp, still_jpeg):
self.assertEqual(
missing_thumbnails(image_attachment, "image/png"), [anim_webp, still_jpeg]
)
self.assertEqual(missing_thumbnails(image_attachment), [anim_webp, still_jpeg])
def test_transcoded_format(self) -> None:
image_attachment = ImageAttachment(
@@ -668,34 +666,29 @@ class TestStoreThumbnail(ZulipTestCase):
original_height_px=100,
frames=1,
thumbnail_metadata=[],
content_type="image/tiff",
)
still_webp = ThumbnailFormat("webp", 100, 75, animated=False, opts="Q=90")
with self.thumbnail_formats(still_webp):
# We add a high-resolution transcoded format if the image isn't in INLINE_MIME_TYPES:
transcoded = ThumbnailFormat("webp", 4032, 3024, animated=False)
self.assertEqual(
missing_thumbnails(image_attachment, "image/tiff"), [still_webp, transcoded]
)
self.assertEqual(missing_thumbnails(image_attachment), [still_webp, transcoded])
# We flip to being portrait if the image is higher than it is wide
transcoded = ThumbnailFormat("webp", 3024, 4032, animated=False)
image_attachment.original_height_px = 300
self.assertEqual(
missing_thumbnails(image_attachment, "image/tiff"), [still_webp, transcoded]
)
self.assertEqual(missing_thumbnails(image_attachment), [still_webp, transcoded])
# The format is not animated, even if the original was
image_attachment.original_height_px = 100
image_attachment.frames = 10
transcoded = ThumbnailFormat("webp", 4032, 3024, animated=False)
self.assertEqual(
missing_thumbnails(image_attachment, "image/tiff"), [still_webp, transcoded]
)
self.assertEqual(missing_thumbnails(image_attachment), [still_webp, transcoded])
# We do not store on the image_attachment if we generated
# a transcoded version; it just picks the largest format
# if one is called for.
self.assertEqual(get_transcoded_format(image_attachment, "image/tiff"), None)
self.assertEqual(get_transcoded_format(image_attachment), None)
image_attachment.thumbnail_metadata = [
asdict(
StoredThumbnailFormat(
@@ -734,12 +727,14 @@ class TestStoreThumbnail(ZulipTestCase):
)
),
]
self.assertEqual(get_transcoded_format(image_attachment, "image/png"), None)
self.assertEqual(get_transcoded_format(image_attachment, None), None)
self.assertEqual(
get_transcoded_format(image_attachment, "image/tiff"),
get_transcoded_format(image_attachment),
ThumbnailFormat("webp", 4032, 3024, animated=False),
)
image_attachment.content_type = "image/png"
self.assertEqual(get_transcoded_format(image_attachment), None)
image_attachment.content_type = None
self.assertEqual(get_transcoded_format(image_attachment), None)
def test_maybe_thumbnail_from_stream(self) -> None:
# If we put the file in place directly (e.g. simulating a

View File

@@ -342,7 +342,6 @@ def serve_file(
with transaction.atomic(savepoint=False):
ensure_thumbnails(
ImageAttachment.objects.select_for_update().get(id=image_attachment.id),
attachment.content_type,
)
# Update the path that we are fetching to be the thumbnail

View File

@@ -6,7 +6,6 @@ from typing import Any
import pyvips
from django.db import transaction
from django.db.models import OuterRef, Subquery
from typing_extensions import override
from zerver.actions.message_edit import do_update_embedded_data
@@ -22,7 +21,7 @@ from zerver.lib.thumbnail import (
rewrite_thumbnailed_images,
)
from zerver.lib.upload import save_attachment_contents, upload_backend
from zerver.models import ArchivedMessage, Attachment, ImageAttachment, Message
from zerver.models import ArchivedMessage, ImageAttachment, Message
from zerver.worker.base import QueueProcessingWorker, assign_queue
logger = logging.getLogger(__name__)
@@ -40,21 +39,11 @@ class ThumbnailWorker(QueueProcessingWorker):
# directly to a thumbnail URL we have not made yet.
# This may mean that we may generate 0 thumbnail
# images once we get the lock.
row = (
ImageAttachment.objects.select_for_update(of=("self",))
.annotate(
original_content_type=Subquery(
Attachment.objects.filter(path_id=OuterRef("path_id")).values(
"content_type"
)
)
)
.get(id=event["id"])
)
row = ImageAttachment.objects.select_for_update(of=("self",)).get(id=event["id"])
except ImageAttachment.DoesNotExist: # nocoverage
logger.info("ImageAttachment row %d missing", event["id"])
return
uploaded_thumbnails = ensure_thumbnails(row, row.original_content_type)
uploaded_thumbnails = ensure_thumbnails(row)
end = time.time()
logger.info(
"Processed %d thumbnails (%dms)",
@@ -63,10 +52,8 @@ class ThumbnailWorker(QueueProcessingWorker):
)
def ensure_thumbnails(
image_attachment: ImageAttachment, original_content_type: str | None = None
) -> int:
needed_thumbnails = missing_thumbnails(image_attachment, original_content_type)
def ensure_thumbnails(image_attachment: ImageAttachment) -> int:
needed_thumbnails = missing_thumbnails(image_attachment)
if not needed_thumbnails:
return 0
@@ -174,8 +161,8 @@ def ensure_thumbnails(
is_animated=is_animated,
original_width_px=image_attachment.original_width_px,
original_height_px=image_attachment.original_height_px,
original_content_type=original_content_type,
transcoded_image=get_transcoded_format(image_attachment, original_content_type),
original_content_type=image_attachment.content_type,
transcoded_image=get_transcoded_format(image_attachment),
),
)
return written_images