import realm: Use multiple processes for resizing avatar images.

Resizing avatars in parallel should significantly improve performance
when importing large open source realms from Slack.

Fixes #11009.
This commit is contained in:
Vishnu Ks
2019-01-26 01:10:49 +05:30
committed by Tim Abbott
parent 33a322baa2
commit bec875a9af
2 changed files with 29 additions and 8 deletions

View File

@@ -26,6 +26,7 @@ from zerver.lib.bugdown import version as bugdown_version
from zerver.lib.upload import random_name, sanitize_name, \
guess_type, BadImageError
from zerver.lib.utils import generate_api_key, process_list_in_batches
from zerver.lib.parallel import run_parallel
from zerver.models import UserProfile, Realm, Client, Huddle, Stream, \
UserMessage, Subscription, Message, RealmEmoji, \
RealmDomain, Recipient, get_user_profile_by_id, \
@@ -532,7 +533,7 @@ def bulk_import_client(data: TableData, model: Any, table: TableName) -> None:
client = Client.objects.create(name=item['name'])
update_id_map(table='client', old_id=item['id'], new_id=client.id)
def import_uploads(import_dir: Path, processing_avatars: bool=False,
def import_uploads(import_dir: Path, processes: int, processing_avatars: bool=False,
processing_emojis: bool=False) -> None:
if processing_avatars and processing_emojis:
raise AssertionError("Cannot import avatars and emojis at the same time!")
@@ -640,7 +641,8 @@ def import_uploads(import_dir: Path, processing_avatars: bool=False,
# avatar. TODO: This implementation is hacky, both in that it
# does get_user_profile_by_id for each user, and in that it
# might be better to require the export to just have these.
for record in records:
def process_avatars(record: Dict[Any, Any]) -> int:
if record['s3_path'].endswith('.original'):
user_profile = get_user_profile_by_id(record['user_profile_id'])
if settings.LOCAL_UPLOADS_DIR is not None:
@@ -662,6 +664,16 @@ def import_uploads(import_dir: Path, processing_avatars: bool=False,
user_profile.id))
# Delete the record of the avatar to avoid 404s.
do_change_avatar_fields(user_profile, UserProfile.AVATAR_FROM_GRAVATAR)
return 0
if processes == 1:
for record in records:
process_avatars(record)
else:
connection.close()
output = []
for (status, job) in run_parallel(process_avatars, records, processes):
output.append(job)
# Importing data suffers from a difficult ordering problem because of
# models that reference each other circularly. Here is a correct order.
@@ -681,7 +693,7 @@ def import_uploads(import_dir: Path, processing_avatars: bool=False,
# Because the Python object => JSON conversion process is not fully
# faithful, we have to use a set of fixers (e.g. on DateTime objects
# and Foreign Keys) to do the import correctly.
def do_import_realm(import_dir: Path, subdomain: str) -> Realm:
def do_import_realm(import_dir: Path, subdomain: str, processes: int=1) -> Realm:
logging.info("Importing realm dump %s" % (import_dir,))
if not os.path.exists(import_dir):
raise Exception("Missing import directory!")
@@ -922,14 +934,14 @@ def do_import_realm(import_dir: Path, subdomain: str) -> Realm:
bulk_import_model(data, CustomProfileFieldValue)
# Import uploaded files and avatars
import_uploads(os.path.join(import_dir, "avatars"), processing_avatars=True)
import_uploads(os.path.join(import_dir, "uploads"))
import_uploads(os.path.join(import_dir, "avatars"), processes, processing_avatars=True)
import_uploads(os.path.join(import_dir, "uploads"), processes)
# We need this check because the emoji files are only present in data
# produced by the Slack importer.
# For a Zulip export, the emoji directory doesn't exist.
if os.path.exists(os.path.join(import_dir, "emoji")):
import_uploads(os.path.join(import_dir, "emoji"), processing_emojis=True)
import_uploads(os.path.join(import_dir, "emoji"), processes, processing_emojis=True)
sender_map = {
user['id']: user

View File

@@ -7,7 +7,7 @@ from typing import Any
from django.conf import settings
from django.core.management import call_command
from django.core.management.base import BaseCommand, CommandParser
from django.core.management.base import BaseCommand, CommandParser, CommandError
from zerver.lib.import_realm import do_import_realm, do_import_system_bots
from zerver.forms import check_subdomain_available
@@ -37,6 +37,11 @@ import a database dump from one or more JSON files."""
parser.add_argument('export_paths', nargs='+',
metavar='<export path>',
help="list of export directories to import")
parser.add_argument('--processes',
dest='processes',
action="store",
default=6,
help='Number of processes to use for uploading Avatars to S3 in parallel')
parser.formatter_class = argparse.RawTextHelpFormatter
def do_destroy_and_rebuild_database(self, db_name: str) -> None:
@@ -44,6 +49,10 @@ import a database dump from one or more JSON files."""
subprocess.check_call([os.path.join(settings.DEPLOY_ROOT, "scripts/setup/flush-memcached")])
def handle(self, *args: Any, **options: Any) -> None:
num_processes = int(options['processes'])
if num_processes < 1:
raise CommandError('You must have at least one process.')
subdomain = options['subdomain']
if options["destroy_rebuild_database"]:
@@ -68,6 +77,6 @@ import a database dump from one or more JSON files."""
for path in paths:
print("Processing dump: %s ..." % (path,))
realm = do_import_realm(path, subdomain)
realm = do_import_realm(path, subdomain, num_processes)
print("Checking the system bots.")
do_import_system_bots(realm)