uploads: Split out S3 and local file backends into separate files.

The uploads file is large, and conceptually the S3 and local-file backends are separable.
2025-10-31 20:13:46 +00:00 · 2022-12-14 20:51:37 +00:00
parent 497abc2e48
commit 7c0d414aff
22 changed files with 1293 additions and 1256 deletions
--- a/zerver/lib/upload/s3.py
+++ b/zerver/lib/upload/s3.py
@@ -0,0 +1,485 @@
+import logging
+import os
+import secrets
+import urllib
+from mimetypes import guess_type
+from typing import IO, Any, Callable, Optional
+
+import boto3
+import botocore
+from boto3.session import Session
+from botocore.client import Config
+from django.conf import settings
+from mypy_boto3_s3.client import S3Client
+from mypy_boto3_s3.service_resource import Bucket, Object
+
+from zerver.lib.avatar_hash import user_avatar_path
+from zerver.lib.upload.base import (
+    INLINE_MIME_TYPES,
+    MEDIUM_AVATAR_SIZE,
+    ZulipUploadBackend,
+    create_attachment,
+    resize_avatar,
+    resize_emoji,
+    resize_logo,
+    sanitize_name,
+)
+from zerver.models import Realm, RealmEmoji, UserProfile
+
+# Duration that the signed upload URLs that we redirect to when
+# accessing uploaded files are available for clients to fetch before
+# they expire.
+SIGNED_UPLOAD_URL_DURATION = 60
+
+# Performance note:
+#
+# For writing files to S3, the file could either be stored in RAM
+# (if it is less than 2.5MiB or so) or an actual temporary file on disk.
+#
+# Because we set FILE_UPLOAD_MAX_MEMORY_SIZE to 0, only the latter case
+# should occur in practice.
+#
+# This is great, because passing the pseudofile object that Django gives
+# you to boto would be a pain.
+
+# To come up with a s3 key we randomly generate a "directory". The
+# "file name" is the original filename provided by the user run
+# through a sanitization function.
+
+
+# https://github.com/boto/botocore/issues/2644 means that the IMDS
+# request _always_ pulls from the environment.  Monkey-patch the
+# `should_bypass_proxies` function if we need to skip them, based
+# on S3_SKIP_PROXY.
+if settings.S3_SKIP_PROXY is True:  # nocoverage
+    botocore.utils.should_bypass_proxies = lambda url: True
+
+
+def get_bucket(bucket_name: str, session: Optional[Session] = None) -> Bucket:
+    if session is None:
+        session = Session(settings.S3_KEY, settings.S3_SECRET_KEY)
+    bucket = session.resource(
+        "s3", region_name=settings.S3_REGION, endpoint_url=settings.S3_ENDPOINT_URL
+    ).Bucket(bucket_name)
+    return bucket
+
+
+def upload_image_to_s3(
+    bucket: Bucket,
+    file_name: str,
+    content_type: Optional[str],
+    user_profile: UserProfile,
+    contents: bytes,
+) -> None:
+    key = bucket.Object(file_name)
+    metadata = {
+        "user_profile_id": str(user_profile.id),
+        "realm_id": str(user_profile.realm_id),
+    }
+
+    content_disposition = ""
+    if content_type is None:
+        content_type = ""
+    if content_type not in INLINE_MIME_TYPES:
+        content_disposition = "attachment"
+
+    key.put(
+        Body=contents,
+        Metadata=metadata,
+        ContentType=content_type,
+        ContentDisposition=content_disposition,
+    )
+
+
+def get_signed_upload_url(path: str, download: bool = False) -> str:
+    client = boto3.client(
+        "s3",
+        aws_access_key_id=settings.S3_KEY,
+        aws_secret_access_key=settings.S3_SECRET_KEY,
+        region_name=settings.S3_REGION,
+        endpoint_url=settings.S3_ENDPOINT_URL,
+    )
+    params = {
+        "Bucket": settings.S3_AUTH_UPLOADS_BUCKET,
+        "Key": path,
+    }
+    if download:
+        params["ResponseContentDisposition"] = "attachment"
+
+    return client.generate_presigned_url(
+        ClientMethod="get_object",
+        Params=params,
+        ExpiresIn=SIGNED_UPLOAD_URL_DURATION,
+        HttpMethod="GET",
+    )
+
+
+class S3UploadBackend(ZulipUploadBackend):
+    def __init__(self) -> None:
+        self.session = Session(settings.S3_KEY, settings.S3_SECRET_KEY)
+        self.avatar_bucket = get_bucket(settings.S3_AVATAR_BUCKET, self.session)
+        self.uploads_bucket = get_bucket(settings.S3_AUTH_UPLOADS_BUCKET, self.session)
+
+        self._boto_client: Optional[S3Client] = None
+        self.public_upload_url_base = self.construct_public_upload_url_base()
+
+    def construct_public_upload_url_base(self) -> str:
+        # Return the pattern for public URL for a key in the S3 Avatar bucket.
+        # For Amazon S3 itself, this will return the following:
+        #     f"https://{self.avatar_bucket.name}.{network_location}/{key}"
+        #
+        # However, we need this function to properly handle S3 style
+        # file upload backends that Zulip supports, which can have a
+        # different URL format. Configuring no signature and providing
+        # no access key makes `generate_presigned_url` just return the
+        # normal public URL for a key.
+        #
+        # It unfortunately takes 2ms per query to call
+        # generate_presigned_url, even with our cached boto
+        # client. Since we need to potentially compute hundreds of
+        # avatar URLs in single `GET /messages` request, we instead
+        # back-compute the URL pattern here.
+
+        DUMMY_KEY = "dummy_key_ignored"
+        foo_url = self.get_boto_client().generate_presigned_url(
+            ClientMethod="get_object",
+            Params={
+                "Bucket": self.avatar_bucket.name,
+                "Key": DUMMY_KEY,
+            },
+            ExpiresIn=0,
+        )
+        split_url = urllib.parse.urlsplit(foo_url)
+        assert split_url.path.endswith(f"/{DUMMY_KEY}")
+
+        return urllib.parse.urlunsplit(
+            (split_url.scheme, split_url.netloc, split_url.path[: -len(DUMMY_KEY)], "", "")
+        )
+
+    def get_public_upload_url(
+        self,
+        key: str,
+    ) -> str:
+        assert not key.startswith("/")
+        return urllib.parse.urljoin(self.public_upload_url_base, key)
+
+    def get_boto_client(self) -> S3Client:
+        """
+        Creating the client takes a long time so we need to cache it.
+        """
+        if self._boto_client is None:
+            config = Config(signature_version=botocore.UNSIGNED)
+            self._boto_client = self.session.client(
+                "s3",
+                region_name=settings.S3_REGION,
+                endpoint_url=settings.S3_ENDPOINT_URL,
+                config=config,
+            )
+        return self._boto_client
+
+    def delete_file_from_s3(self, path_id: str, bucket: Bucket) -> bool:
+        key = bucket.Object(path_id)
+
+        try:
+            key.load()
+        except botocore.exceptions.ClientError:
+            file_name = path_id.split("/")[-1]
+            logging.warning(
+                "%s does not exist. Its entry in the database will be removed.", file_name
+            )
+            return False
+        key.delete()
+        return True
+
+    def get_public_upload_root_url(self) -> str:
+        return self.public_upload_url_base
+
+    def generate_message_upload_path(self, realm_id: str, uploaded_file_name: str) -> str:
+        return "/".join(
+            [
+                realm_id,
+                secrets.token_urlsafe(18),
+                sanitize_name(uploaded_file_name),
+            ]
+        )
+
+    def upload_message_file(
+        self,
+        uploaded_file_name: str,
+        uploaded_file_size: int,
+        content_type: Optional[str],
+        file_data: bytes,
+        user_profile: UserProfile,
+        target_realm: Optional[Realm] = None,
+    ) -> str:
+        if target_realm is None:
+            target_realm = user_profile.realm
+        s3_file_name = self.generate_message_upload_path(str(target_realm.id), uploaded_file_name)
+        url = f"/user_uploads/{s3_file_name}"
+
+        upload_image_to_s3(
+            self.uploads_bucket,
+            s3_file_name,
+            content_type,
+            user_profile,
+            file_data,
+        )
+
+        create_attachment(
+            uploaded_file_name, s3_file_name, user_profile, target_realm, uploaded_file_size
+        )
+        return url
+
+    def delete_message_image(self, path_id: str) -> bool:
+        return self.delete_file_from_s3(path_id, self.uploads_bucket)
+
+    def write_avatar_images(
+        self,
+        s3_file_name: str,
+        target_user_profile: UserProfile,
+        image_data: bytes,
+        content_type: Optional[str],
+    ) -> None:
+        upload_image_to_s3(
+            self.avatar_bucket,
+            s3_file_name + ".original",
+            content_type,
+            target_user_profile,
+            image_data,
+        )
+
+        # custom 500px wide version
+        resized_medium = resize_avatar(image_data, MEDIUM_AVATAR_SIZE)
+        upload_image_to_s3(
+            self.avatar_bucket,
+            s3_file_name + "-medium.png",
+            "image/png",
+            target_user_profile,
+            resized_medium,
+        )
+
+        resized_data = resize_avatar(image_data)
+        upload_image_to_s3(
+            self.avatar_bucket,
+            s3_file_name,
+            "image/png",
+            target_user_profile,
+            resized_data,
+        )
+        # See avatar_url in avatar.py for URL.  (That code also handles the case
+        # that users use gravatar.)
+
+    def upload_avatar_image(
+        self,
+        user_file: IO[bytes],
+        acting_user_profile: UserProfile,
+        target_user_profile: UserProfile,
+        content_type: Optional[str] = None,
+    ) -> None:
+        if content_type is None:
+            content_type = guess_type(user_file.name)[0]
+        s3_file_name = user_avatar_path(target_user_profile)
+
+        image_data = user_file.read()
+        self.write_avatar_images(s3_file_name, target_user_profile, image_data, content_type)
+
+    def delete_avatar_image(self, user: UserProfile) -> None:
+        path_id = user_avatar_path(user)
+
+        self.delete_file_from_s3(path_id + ".original", self.avatar_bucket)
+        self.delete_file_from_s3(path_id + "-medium.png", self.avatar_bucket)
+        self.delete_file_from_s3(path_id, self.avatar_bucket)
+
+    def get_avatar_key(self, file_name: str) -> Object:
+        key = self.avatar_bucket.Object(file_name)
+        return key
+
+    def copy_avatar(self, source_profile: UserProfile, target_profile: UserProfile) -> None:
+        s3_source_file_name = user_avatar_path(source_profile)
+        s3_target_file_name = user_avatar_path(target_profile)
+
+        key = self.get_avatar_key(s3_source_file_name + ".original")
+        image_data = key.get()["Body"].read()
+        content_type = key.content_type
+
+        self.write_avatar_images(s3_target_file_name, target_profile, image_data, content_type)
+
+    def get_avatar_url(self, hash_key: str, medium: bool = False) -> str:
+        medium_suffix = "-medium.png" if medium else ""
+        return self.get_public_upload_url(f"{hash_key}{medium_suffix}")
+
+    def get_export_tarball_url(self, realm: Realm, export_path: str) -> str:
+        # export_path has a leading /
+        return self.get_public_upload_url(export_path[1:])
+
+    def realm_avatar_and_logo_path(self, realm: Realm) -> str:
+        return os.path.join(str(realm.id), "realm")
+
+    def upload_realm_icon_image(self, icon_file: IO[bytes], user_profile: UserProfile) -> None:
+        content_type = guess_type(icon_file.name)[0]
+        s3_file_name = os.path.join(self.realm_avatar_and_logo_path(user_profile.realm), "icon")
+
+        image_data = icon_file.read()
+        upload_image_to_s3(
+            self.avatar_bucket,
+            s3_file_name + ".original",
+            content_type,
+            user_profile,
+            image_data,
+        )
+
+        resized_data = resize_avatar(image_data)
+        upload_image_to_s3(
+            self.avatar_bucket,
+            s3_file_name + ".png",
+            "image/png",
+            user_profile,
+            resized_data,
+        )
+        # See avatar_url in avatar.py for URL.  (That code also handles the case
+        # that users use gravatar.)
+
+    def get_realm_icon_url(self, realm_id: int, version: int) -> str:
+        public_url = self.get_public_upload_url(f"{realm_id}/realm/icon.png")
+        return public_url + f"?version={version}"
+
+    def upload_realm_logo_image(
+        self, logo_file: IO[bytes], user_profile: UserProfile, night: bool
+    ) -> None:
+        content_type = guess_type(logo_file.name)[0]
+        if night:
+            basename = "night_logo"
+        else:
+            basename = "logo"
+        s3_file_name = os.path.join(self.realm_avatar_and_logo_path(user_profile.realm), basename)
+
+        image_data = logo_file.read()
+        upload_image_to_s3(
+            self.avatar_bucket,
+            s3_file_name + ".original",
+            content_type,
+            user_profile,
+            image_data,
+        )
+
+        resized_data = resize_logo(image_data)
+        upload_image_to_s3(
+            self.avatar_bucket,
+            s3_file_name + ".png",
+            "image/png",
+            user_profile,
+            resized_data,
+        )
+        # See avatar_url in avatar.py for URL.  (That code also handles the case
+        # that users use gravatar.)
+
+    def get_realm_logo_url(self, realm_id: int, version: int, night: bool) -> str:
+        if not night:
+            file_name = "logo.png"
+        else:
+            file_name = "night_logo.png"
+        public_url = self.get_public_upload_url(f"{realm_id}/realm/{file_name}")
+        return public_url + f"?version={version}"
+
+    def ensure_avatar_image(self, user_profile: UserProfile, is_medium: bool = False) -> None:
+        # BUG: The else case should be user_avatar_path(user_profile) + ".png".
+        # See #12852 for details on this bug and how to migrate it.
+        file_extension = "-medium.png" if is_medium else ""
+        file_path = user_avatar_path(user_profile)
+        s3_file_name = file_path
+
+        key = self.avatar_bucket.Object(file_path + ".original")
+        image_data = key.get()["Body"].read()
+
+        if is_medium:
+            resized_avatar = resize_avatar(image_data, MEDIUM_AVATAR_SIZE)
+        else:
+            resized_avatar = resize_avatar(image_data)
+        upload_image_to_s3(
+            self.avatar_bucket,
+            s3_file_name + file_extension,
+            "image/png",
+            user_profile,
+            resized_avatar,
+        )
+
+    def upload_emoji_image(
+        self, emoji_file: IO[bytes], emoji_file_name: str, user_profile: UserProfile
+    ) -> bool:
+        content_type = guess_type(emoji_file_name)[0]
+        emoji_path = RealmEmoji.PATH_ID_TEMPLATE.format(
+            realm_id=user_profile.realm_id,
+            emoji_file_name=emoji_file_name,
+        )
+
+        image_data = emoji_file.read()
+        upload_image_to_s3(
+            self.avatar_bucket,
+            ".".join((emoji_path, "original")),
+            content_type,
+            user_profile,
+            image_data,
+        )
+
+        resized_image_data, is_animated, still_image_data = resize_emoji(image_data)
+        upload_image_to_s3(
+            self.avatar_bucket,
+            emoji_path,
+            content_type,
+            user_profile,
+            resized_image_data,
+        )
+        if is_animated:
+            still_path = RealmEmoji.STILL_PATH_ID_TEMPLATE.format(
+                realm_id=user_profile.realm_id,
+                emoji_filename_without_extension=os.path.splitext(emoji_file_name)[0],
+            )
+            assert still_image_data is not None
+            upload_image_to_s3(
+                self.avatar_bucket,
+                still_path,
+                "image/png",
+                user_profile,
+                still_image_data,
+            )
+
+        return is_animated
+
+    def get_emoji_url(self, emoji_file_name: str, realm_id: int, still: bool = False) -> str:
+        if still:
+            emoji_path = RealmEmoji.STILL_PATH_ID_TEMPLATE.format(
+                realm_id=realm_id,
+                emoji_filename_without_extension=os.path.splitext(emoji_file_name)[0],
+            )
+            return self.get_public_upload_url(emoji_path)
+        else:
+            emoji_path = RealmEmoji.PATH_ID_TEMPLATE.format(
+                realm_id=realm_id, emoji_file_name=emoji_file_name
+            )
+            return self.get_public_upload_url(emoji_path)
+
+    def upload_export_tarball(
+        self,
+        realm: Optional[Realm],
+        tarball_path: str,
+        percent_callback: Optional[Callable[[Any], None]] = None,
+    ) -> str:
+        # We use the avatar bucket, because it's world-readable.
+        key = self.avatar_bucket.Object(
+            os.path.join("exports", secrets.token_hex(16), os.path.basename(tarball_path))
+        )
+
+        if percent_callback is None:
+            key.upload_file(Filename=tarball_path)
+        else:
+            key.upload_file(Filename=tarball_path, Callback=percent_callback)
+
+        public_url = self.get_public_upload_url(key.key)
+        return public_url
+
+    def delete_export_tarball(self, export_path: str) -> Optional[str]:
+        assert export_path.startswith("/")
+        path_id = export_path[1:]
+        if self.delete_file_from_s3(path_id, self.avatar_bucket):
+            return export_path
+        return None