delete_realm: Optimize attachment cleanup by batching.

This commit is contained in:
Alex Vandiver
2023-02-28 03:44:29 +00:00
committed by Tim Abbott
parent cdda4bc089
commit c9d1755a12
7 changed files with 128 additions and 11 deletions

View File

@@ -16,7 +16,7 @@ from zerver.actions.user_settings import do_delete_avatar_image
from zerver.lib.message import parse_message_time_limit_setting, update_first_visible_message_id
from zerver.lib.send_email import FromAddress, send_email_to_admins
from zerver.lib.sessions import delete_user_sessions
from zerver.lib.upload import delete_message_attachment
from zerver.lib.upload import delete_message_attachments
from zerver.lib.user_counts import realm_user_count_by_role
from zerver.models import (
ArchivedAttachment,
@@ -334,14 +334,22 @@ def do_add_deactivated_redirect(realm: Realm, redirect_url: str) -> None:
realm.save(update_fields=["deactivated_redirect"])
def do_delete_all_realm_attachments(realm: Realm) -> None:
def do_delete_all_realm_attachments(realm: Realm, *, batch_size: int = 1000) -> None:
# Delete attachment files from the storage backend, so that we
# don't leave them dangling.
for obj_class in Attachment, ArchivedAttachment:
for path_id in obj_class.objects.filter(realm_id=realm.id).values_list(
"path_id", flat=True
):
delete_message_attachment(path_id)
last_id = 0
while True:
to_delete = (
obj_class.objects.filter(realm_id=realm.id, id__gt=last_id) # type: ignore[misc] # Does not recognize shared 'id' PK column
.order_by("id")
.values_list("id", "path_id")[:batch_size]
)
if len(to_delete) > 0:
delete_message_attachments([row[1] for row in to_delete])
last_id = to_delete[len(to_delete) - 1][0]
if len(to_delete) < batch_size:
break
obj_class.objects.filter(realm=realm).delete()

View File

@@ -2,7 +2,7 @@ import io
import logging
import urllib
from mimetypes import guess_type
from typing import IO, Any, Callable, Optional, Tuple
from typing import IO, Any, Callable, List, Optional, Tuple
from urllib.parse import urljoin
from django.conf import settings
@@ -110,6 +110,10 @@ def delete_message_attachment(path_id: str) -> bool:
return upload_backend.delete_message_attachment(path_id)
def delete_message_attachments(path_ids: List[str]) -> None:
return upload_backend.delete_message_attachments(path_ids)
# Avatar image uploads

View File

@@ -2,7 +2,7 @@ import io
import os
import re
import unicodedata
from typing import IO, Any, Callable, Optional, Tuple
from typing import IO, Any, Callable, List, Optional, Tuple
from django.utils.translation import gettext as _
from markupsafe import Markup
@@ -206,6 +206,10 @@ class ZulipUploadBackend:
def delete_message_attachment(self, path_id: str) -> bool:
raise NotImplementedError
def delete_message_attachments(self, path_ids: List[str]) -> None:
for path_id in path_ids:
self.delete_message_attachment(path_id)
# Avatar image uploads
def get_avatar_url(self, hash_key: str, medium: bool = False) -> str:
raise NotImplementedError

View File

@@ -3,7 +3,7 @@ import os
import secrets
import urllib
from mimetypes import guess_type
from typing import IO, Any, Callable, Optional
from typing import IO, Any, Callable, List, Optional
import boto3
import botocore
@@ -233,6 +233,11 @@ class S3UploadBackend(ZulipUploadBackend):
def delete_message_attachment(self, path_id: str) -> bool:
return self.delete_file_from_s3(path_id, self.uploads_bucket)
def delete_message_attachments(self, path_ids: List[str]) -> None:
self.uploads_bucket.delete_objects(
Delete={"Objects": [{"Key": path_id} for path_id in path_ids]}
)
def write_avatar_images(
self,
s3_file_name: str,

View File

@@ -16,6 +16,7 @@ from zerver.actions.realm_settings import (
do_change_realm_org_type,
do_change_realm_plan_type,
do_deactivate_realm,
do_delete_all_realm_attachments,
do_reactivate_realm,
do_scrub_realm,
do_send_realm_reactivation_email,
@@ -27,7 +28,7 @@ from zerver.lib.realm_description import get_realm_rendered_description, get_rea
from zerver.lib.send_email import send_future_email
from zerver.lib.streams import create_stream_if_needed
from zerver.lib.test_classes import ZulipTestCase
from zerver.lib.upload import upload_message_attachment
from zerver.lib.upload import delete_message_attachments, upload_message_attachment
from zerver.models import (
Attachment,
CustomProfileField,
@@ -1417,6 +1418,42 @@ class RealmAPITest(ZulipTestCase):
class ScrubRealmTest(ZulipTestCase):
def test_do_delete_all_realm_attachments(self) -> None:
realm = get_realm("zulip")
hamlet = self.example_user("hamlet")
Attachment.objects.filter(realm=realm).delete()
assert settings.LOCAL_UPLOADS_DIR is not None
assert settings.LOCAL_FILES_DIR is not None
path_ids = []
for n in range(1, 4):
content = f"content{n}".encode()
url = upload_message_attachment(
f"dummy{n}.txt", len(content), "text/plain", content, hamlet
)
base = "/user_uploads/"
self.assertEqual(base, url[: len(base)])
path_id = re.sub("/user_uploads/", "", url)
self.assertTrue(os.path.isfile(os.path.join(settings.LOCAL_FILES_DIR, path_id)))
path_ids.append(path_id)
with mock.patch(
"zerver.actions.realm_settings.delete_message_attachments",
side_effect=delete_message_attachments,
) as p:
do_delete_all_realm_attachments(realm, batch_size=2)
self.assertEqual(p.call_count, 2)
p.assert_has_calls(
[
mock.call([path_ids[0], path_ids[1]]),
mock.call([path_ids[2]]),
]
)
self.assertEqual(Attachment.objects.filter(realm=realm).count(), 0)
for file_path in path_ids:
self.assertFalse(os.path.isfile(os.path.join(settings.LOCAL_FILES_DIR, path_id)))
def test_scrub_realm(self) -> None:
zulip = get_realm("zulip")
lear = get_realm("lear")

View File

@@ -19,6 +19,7 @@ from zerver.lib.test_helpers import (
from zerver.lib.upload import (
delete_export_tarball,
delete_message_attachment,
delete_message_attachments,
upload_emoji_image,
upload_export_tarball,
upload_message_attachment,
@@ -86,7 +87,35 @@ class LocalStorageTest(UploadSerializeMixin, ZulipTestCase):
response_dict = self.assert_json_success(result)
path_id = re.sub("/user_uploads/", "", response_dict["uri"])
assert settings.LOCAL_FILES_DIR is not None
file_path = os.path.join(settings.LOCAL_FILES_DIR, path_id)
self.assertTrue(os.path.isfile(file_path))
self.assertTrue(delete_message_attachment(path_id))
self.assertFalse(os.path.isfile(file_path))
def test_delete_message_attachments(self) -> None:
assert settings.LOCAL_UPLOADS_DIR is not None
assert settings.LOCAL_FILES_DIR is not None
user_profile = self.example_user("hamlet")
path_ids = []
for n in range(1, 1005):
uri = upload_message_attachment(
"dummy.txt", len(b"zulip!"), "text/plain", b"zulip!", user_profile
)
base = "/user_uploads/"
self.assertEqual(base, uri[: len(base)])
path_id = re.sub("/user_uploads/", "", uri)
path_ids.append(path_id)
file_path = os.path.join(settings.LOCAL_FILES_DIR, path_id)
self.assertTrue(os.path.isfile(file_path))
delete_message_attachments(path_ids)
for path_id in path_ids:
file_path = os.path.join(settings.LOCAL_FILES_DIR, path_id)
self.assertFalse(os.path.isfile(file_path))
def test_avatar_url(self) -> None:
self.login("hamlet")

View File

@@ -3,6 +3,7 @@ import os
import re
import urllib
from io import StringIO
from unittest.mock import patch
import botocore.exceptions
from django.conf import settings
@@ -22,6 +23,7 @@ from zerver.lib.test_helpers import (
from zerver.lib.upload import (
delete_export_tarball,
delete_message_attachment,
delete_message_attachments,
upload_export_tarball,
upload_message_attachment,
)
@@ -31,6 +33,7 @@ from zerver.lib.upload.base import (
MEDIUM_AVATAR_SIZE,
resize_avatar,
)
from zerver.lib.upload.s3 import S3UploadBackend
from zerver.models import (
Attachment,
RealmEmoji,
@@ -96,7 +99,7 @@ class S3Test(ZulipTestCase):
@use_s3_backend
def test_delete_message_attachment(self) -> None:
create_s3_buckets(settings.S3_AUTH_UPLOADS_BUCKET)
bucket = create_s3_buckets(settings.S3_AUTH_UPLOADS_BUCKET)[0]
user_profile = self.example_user("hamlet")
uri = upload_message_attachment(
@@ -104,10 +107,37 @@ class S3Test(ZulipTestCase):
)
path_id = re.sub("/user_uploads/", "", uri)
self.assertIsNotNone(bucket.Object(path_id).get())
self.assertTrue(delete_message_attachment(path_id))
with self.assertRaises(botocore.exceptions.ClientError):
bucket.Object(path_id).load()
@use_s3_backend
def test_delete_message_attachments(self) -> None:
bucket = create_s3_buckets(settings.S3_AUTH_UPLOADS_BUCKET)[0]
user_profile = self.example_user("hamlet")
path_ids = []
for n in range(1, 5):
uri = upload_message_attachment(
"dummy.txt", len(b"zulip!"), "text/plain", b"zulip!", user_profile
)
path_id = re.sub("/user_uploads/", "", uri)
self.assertIsNotNone(bucket.Object(path_id).get())
path_ids.append(path_id)
with patch.object(S3UploadBackend, "delete_message_attachment") as single_delete:
delete_message_attachments(path_ids)
single_delete.assert_not_called()
for path_id in path_ids:
with self.assertRaises(botocore.exceptions.ClientError):
bucket.Object(path_id).load()
@use_s3_backend
def test_delete_message_attachment_when_file_doesnt_exist(self) -> None:
bucket = create_s3_buckets(settings.S3_AUTH_UPLOADS_BUCKET)[0]
with self.assertRaises(botocore.exceptions.ClientError):
bucket.Object("non-existent-file").load()
with self.assertLogs(level="WARNING") as warn_log:
self.assertEqual(False, delete_message_attachment("non-existent-file"))
self.assertEqual(