diff --git a/zerver/data_import/hipchat.py b/zerver/data_import/hipchat.py index 6d5bec0b19..5283a16b1d 100755 --- a/zerver/data_import/hipchat.py +++ b/zerver/data_import/hipchat.py @@ -679,6 +679,8 @@ def do_convert_data(input_tar_file: str, output_dir: str) -> None: # we process everything else, since we may introduce # mirror users when processing messages. realm['zerver_userprofile'] = user_handler.get_all_users() + realm['sort_by_date'] = True + create_converted_data_files(realm, output_dir, '/realm.json') logging.info('Start importing avatar data') diff --git a/zerver/lib/import_realm.py b/zerver/lib/import_realm.py index d5e804eca8..dac46fb7a8 100644 --- a/zerver/lib/import_realm.py +++ b/zerver/lib/import_realm.py @@ -655,6 +655,8 @@ def do_import_realm(import_dir: Path, subdomain: str) -> Realm: with open(realm_data_filename) as f: data = ujson.load(f) + sort_by_date = data.get('sort_by_date', False) + bulk_import_client(data, Client, 'zerver_client') # We don't import the Stream model yet, since it depends on Realm, @@ -714,7 +716,7 @@ def do_import_realm(import_dir: Path, subdomain: str) -> Realm: data['zerver_userprofile'].sort(key=lambda r: r['id']) # To remap foreign key for UserProfile.last_active_message_id - update_message_foreign_keys(import_dir) + update_message_foreign_keys(import_dir=import_dir, sort_by_date=sort_by_date) fix_datetime_fields(data, 'zerver_userprofile') update_model_ids(UserProfile, data, 'user_profile') @@ -915,7 +917,45 @@ def create_users(realm: Realm, name_list: Iterable[Tuple[str, str]], user_set.add((email, full_name, short_name, True)) bulk_create_users(realm, user_set, bot_type) -def update_message_foreign_keys(import_dir: Path) -> None: +def update_message_foreign_keys(import_dir: Path, + sort_by_date: bool) -> None: + old_id_list = get_incoming_message_ids( + import_dir=import_dir, + sort_by_date=sort_by_date, + ) + + count = len(old_id_list) + + new_id_list = allocate_ids(model_class=Message, count=count) + + for old_id, new_id in zip(old_id_list, new_id_list): + update_id_map( + table='message', + old_id=old_id, + new_id=new_id, + ) + + # We don't touch user_message keys here; that happens later when + # we're actually read the files a second time to get actual data. + +def get_incoming_message_ids(import_dir: Path, + sort_by_date: bool) -> List[int]: + ''' + This function reads in our entire collection of message + ids, which can be millions of integers for some installations. + And then we sort the list. This is necessary to ensure + that the sort order of incoming ids matches the sort order + of pub_date, which isn't always guaranteed by our + utilities that convert third party chat data. We also + need to move our ids to a new range if we're dealing + with a server that has data for other realms. + ''' + + if sort_by_date: + tups = list() # type: List[Tuple[int, int]] + else: + message_ids = [] # type: List[int] + dump_file_id = 1 while True: message_filename = os.path.join(import_dir, "messages-%06d.json" % (dump_file_id,)) @@ -925,9 +965,37 @@ def update_message_foreign_keys(import_dir: Path) -> None: with open(message_filename) as f: data = ujson.load(f) - update_model_ids(Message, data, 'message') + # Aggressively free up memory. + del data['zerver_usermessage'] + + for row in data['zerver_message']: + # We truncate pub_date to int to theoretically + # save memory and speed up the sort. For + # Zulip-to-Zulip imports, the + # message_id will generally be a good tiebreaker. + # If we occasionally mis-order the ids for two + # messages from the same second, it's not the + # end of the world, as it's likely those messages + # arrived to the original server in somewhat + # arbitrary order. + + message_id = row['id'] + + if sort_by_date: + pub_date = int(row['pub_date']) + tup = (pub_date, message_id) + tups.append(tup) + else: + message_ids.append(message_id) + dump_file_id += 1 + if sort_by_date: + tups.sort() + message_ids = [tup[1] for tup in tups] + + return message_ids + def import_message_data(import_dir: Path) -> None: dump_file_id = 1 while True: @@ -946,7 +1014,18 @@ def import_message_data(import_dir: Path) -> None: # Parser to update message content with the updated attachment urls fix_upload_links(data, 'zerver_message') - re_map_foreign_keys(data, 'zerver_message', 'id', related_table='message', id_field=True) + # We already create mappings for zerver_message ids + # in update_message_foreign_keys(), so here we simply + # apply them. + message_id_map = id_maps['message'] + for row in data['zerver_message']: + row['id'] = message_id_map[row['id']] + + for row in data['zerver_usermessage']: + assert(row['message'] in message_id_map) + + # A LOT HAPPENS HERE. + # This is where we actually import the message data. bulk_import_model(data, Message) fix_message_rendered_content(data, 'zerver_message') diff --git a/zerver/tests/fixtures/import_fixtures/messages-000001.json b/zerver/tests/fixtures/import_fixtures/messages-000001.json new file mode 100644 index 0000000000..e1d738a7bd --- /dev/null +++ b/zerver/tests/fixtures/import_fixtures/messages-000001.json @@ -0,0 +1,14 @@ +{ + "zerver_message":[ + { + "pub_date":1409000103, + "id":555 + }, + { + "pub_date":1409000101, + "id":888 + } + ], + "zerver_usermessage":[ + ] +} diff --git a/zerver/tests/fixtures/import_fixtures/messages-000002.json b/zerver/tests/fixtures/import_fixtures/messages-000002.json new file mode 100644 index 0000000000..fdeaf01952 --- /dev/null +++ b/zerver/tests/fixtures/import_fixtures/messages-000002.json @@ -0,0 +1,10 @@ +{ + "zerver_message":[ + { + "pub_date":1409000102, + "id":999 + } + ], + "zerver_usermessage":[ + ] +} diff --git a/zerver/tests/test_import_export.py b/zerver/tests/test_import_export.py index 588d4e78e8..26d2f215fd 100644 --- a/zerver/tests/test_import_export.py +++ b/zerver/tests/test_import_export.py @@ -21,6 +21,7 @@ from zerver.lib.export import ( ) from zerver.lib.import_realm import ( do_import_realm, + get_incoming_message_ids, ) from zerver.lib.avatar_hash import ( user_avatar_path, @@ -805,6 +806,22 @@ class ImportExportTest(ZulipTestCase): image_data = original_image_key.get_contents_as_string() self.assertEqual(image_data, test_image_data) + def test_get_incoming_message_ids(self) -> None: + import_dir = os.path.join(settings.DEPLOY_ROOT, "zerver", "tests", "fixtures", "import_fixtures") + message_ids = get_incoming_message_ids( + import_dir=import_dir, + sort_by_date=True, + ) + + self.assertEqual(message_ids, [888, 999, 555]) + + message_ids = get_incoming_message_ids( + import_dir=import_dir, + sort_by_date=False, + ) + + self.assertEqual(message_ids, [555, 888, 999]) + def test_plan_type(self) -> None: realm = get_realm('zulip') realm.plan_type = Realm.PREMIUM