	import realm: Use processes for resizing avatar images.
This should significantly improve the data import performance when importing large open source realms from Slack. Fixes #11009.
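The diff below wraps the per-record avatar work in a process_avatars() callback and, when more than one process is requested, fans the records out across worker processes with Zulip's run_parallel helper instead of a plain for loop; connection.close() is called first so the forked workers do not share the parent's open database connection. As a rough standalone sketch of the same fan-out pattern (using the standard-library multiprocessing.Pool rather than zerver.lib.parallel.run_parallel, with a hypothetical resize_avatar() standing in for the real per-record resizing):

from multiprocessing import Pool
from typing import Any, Dict, List

def resize_avatar(record: Dict[str, Any]) -> int:
    # Placeholder for the expensive per-record work (reading the ".original"
    # image, generating the resized versions, and writing them back out).
    print("resizing %s" % (record["s3_path"],))
    return 0

def process_avatar_records(records: List[Dict[str, Any]], processes: int) -> None:
    if processes == 1:
        # Single-process path, equivalent to the old "for record in records" loop.
        for record in records:
            resize_avatar(record)
    else:
        # Multi-process path: each worker handles a share of the records.
        with Pool(processes) as pool:
            pool.map(resize_avatar, records)

if __name__ == "__main__":
    process_avatar_records([{"s3_path": "avatars/100.original"},
                            {"s3_path": "avatars/101.original"}], processes=2)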
@@ -26,6 +26,7 @@ from zerver.lib.bugdown import version as bugdown_version
 from zerver.lib.upload import random_name, sanitize_name, \
     guess_type, BadImageError
 from zerver.lib.utils import generate_api_key, process_list_in_batches
+from zerver.lib.parallel import run_parallel
 from zerver.models import UserProfile, Realm, Client, Huddle, Stream, \
     UserMessage, Subscription, Message, RealmEmoji, \
     RealmDomain, Recipient, get_user_profile_by_id, \
@@ -532,7 +533,7 @@ def bulk_import_client(data: TableData, model: Any, table: TableName) -> None:
             client = Client.objects.create(name=item['name'])
         update_id_map(table='client', old_id=item['id'], new_id=client.id)

-def import_uploads(import_dir: Path, processing_avatars: bool=False,
+def import_uploads(import_dir: Path, processes: int, processing_avatars: bool=False,
                    processing_emojis: bool=False) -> None:
     if processing_avatars and processing_emojis:
         raise AssertionError("Cannot import avatars and emojis at the same time!")
@@ -640,7 +641,8 @@ def import_uploads(import_dir: Path, processing_avatars: bool=False,
         # avatar.  TODO: This implementation is hacky, both in that it
         # does get_user_profile_by_id for each user, and in that it
         # might be better to require the export to just have these.
-        for record in records:
+
+        def process_avatars(record: Dict[Any, Any]) -> int:
             if record['s3_path'].endswith('.original'):
                 user_profile = get_user_profile_by_id(record['user_profile_id'])
                 if settings.LOCAL_UPLOADS_DIR is not None:
@@ -662,6 +664,16 @@ def import_uploads(import_dir: Path, processing_avatars: bool=False,
                         user_profile.id))
                     # Delete the record of the avatar to avoid 404s.
                     do_change_avatar_fields(user_profile, UserProfile.AVATAR_FROM_GRAVATAR)
+            return 0
+
+        if processes == 1:
+            for record in records:
+                process_avatars(record)
+        else:
+            connection.close()
+            output = []
+            for (status, job) in run_parallel(process_avatars, records, processes):
+                output.append(job)

 # Importing data suffers from a difficult ordering problem because of
 # models that reference each other circularly.  Here is a correct order.
@@ -681,7 +693,7 @@ def import_uploads(import_dir: Path, processing_avatars: bool=False,
 # Because the Python object => JSON conversion process is not fully
 # faithful, we have to use a set of fixers (e.g. on DateTime objects
 # and Foreign Keys) to do the import correctly.
-def do_import_realm(import_dir: Path, subdomain: str) -> Realm:
+def do_import_realm(import_dir: Path, subdomain: str, processes: int=1) -> Realm:
     logging.info("Importing realm dump %s" % (import_dir,))
     if not os.path.exists(import_dir):
         raise Exception("Missing import directory!")
@@ -922,14 +934,14 @@ def do_import_realm(import_dir: Path, subdomain: str) -> Realm:
     bulk_import_model(data, CustomProfileFieldValue)

     # Import uploaded files and avatars
-    import_uploads(os.path.join(import_dir, "avatars"), processing_avatars=True)
-    import_uploads(os.path.join(import_dir, "uploads"))
+    import_uploads(os.path.join(import_dir, "avatars"), processes, processing_avatars=True)
+    import_uploads(os.path.join(import_dir, "uploads"), processes)

     # We need to have this check as the emoji files are only present in the data
     # importer from slack
     # For Zulip export, this doesn't exist
     if os.path.exists(os.path.join(import_dir, "emoji")):
-        import_uploads(os.path.join(import_dir, "emoji"), processing_emojis=True)
+        import_uploads(os.path.join(import_dir, "emoji"), processes, processing_emojis=True)

     sender_map = {
         user['id']: user
@@ -7,7 +7,7 @@ from typing import Any

 from django.conf import settings
 from django.core.management import call_command
-from django.core.management.base import BaseCommand, CommandParser
+from django.core.management.base import BaseCommand, CommandParser, CommandError

 from zerver.lib.import_realm import do_import_realm, do_import_system_bots
 from zerver.forms import check_subdomain_available
@@ -37,6 +37,11 @@ import a database dump from one or more JSON files."""
         parser.add_argument('export_paths', nargs='+',
                             metavar='<export path>',
                             help="list of export directories to import")
+        parser.add_argument('--processes',
+                            dest='processes',
+                            action="store",
+                            default=6,
+                            help='Number of processes to use for uploading Avatars to S3 in parallel')
         parser.formatter_class = argparse.RawTextHelpFormatter

     def do_destroy_and_rebuild_database(self, db_name: str) -> None:
@@ -44,6 +49,10 @@ import a database dump from one or more JSON files."""
         subprocess.check_call([os.path.join(settings.DEPLOY_ROOT, "scripts/setup/flush-memcached")])

     def handle(self, *args: Any, **options: Any) -> None:
+        num_processes = int(options['processes'])
+        if num_processes < 1:
+            raise CommandError('You must have at least one process.')
+
         subdomain = options['subdomain']

         if options["destroy_rebuild_database"]:
@@ -68,6 +77,6 @@ import a database dump from one or more JSON files."""

         for path in paths:
             print("Processing dump: %s ..." % (path,))
-            realm = do_import_realm(path, subdomain)
+            realm = do_import_realm(path, subdomain, num_processes)
             print("Checking the system bots.")
             do_import_system_bots(realm)
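Taken together, the new --processes option (default 6) is validated in handle() and threaded through do_import_realm() into import_uploads(), so the avatar-resizing step of an import can run across several worker processes. A minimal usage sketch of the new keyword argument, assuming Django settings are already configured; the export path and subdomain here are placeholders, not values from this commit:

from zerver.lib.import_realm import do_import_realm

# Import a converted export, resizing avatars across 8 worker processes.
realm = do_import_realm("/tmp/converted_slack_data", "mycompany", processes=8)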