import: Use pub_date to sort message ids.

When we create new ids for message rows, we
now assign the new ids in the order of the
rows' pub_date values, so that id order
matches chronological order.

This can take a sizable chunk of memory, since
we read every message's id and pub_date into
memory in order to sort them.

This feature only gets turned on if you
set sort_by_date to True in realm.json.
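
In outline, the remapping works like the standalone sketch
below. (build_message_id_map and next_free_id are illustrative
stand-ins here, not the allocate_ids()/update_id_map() helpers
that appear in the diff.)

from typing import Dict, List, Tuple

def build_message_id_map(rows: List[Dict[str, int]],
                         next_free_id: int) -> Dict[int, int]:
    # Decorate each old id with its pub_date, sort, and hand out
    # new sequential ids in that order; the old id acts as a
    # tiebreaker for messages sent in the same second.
    tups = sorted((row['pub_date'], row['id']) for row in rows)
    return {old_id: next_free_id + i
            for i, (_, old_id) in enumerate(tups)}

rows = [{'id': 555, 'pub_date': 1409000103},
        {'id': 888, 'pub_date': 1409000101}]
print(build_message_id_map(rows, next_free_id=1000))
# {888: 1000, 555: 1001}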
Author:    Steve Howell
Committer: showell
Date:      2018-10-16 10:34:47 +00:00
Commit:    bd9e4ef0c8 (parent: d1ff903534)
5 changed files with 126 additions and 4 deletions


@@ -679,6 +679,8 @@ def do_convert_data(input_tar_file: str, output_dir: str) -> None:
     # we process everything else, since we may introduce
     # mirror users when processing messages.
     realm['zerver_userprofile'] = user_handler.get_all_users()
+    realm['sort_by_date'] = True

     create_converted_data_files(realm, output_dir, '/realm.json')

     logging.info('Start importing avatar data')
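
The flag then round-trips through the converted realm.json roughly
as follows. This is a hypothetical sketch using the stdlib json
module in place of ujson, with a throwaway /tmp path:

import json

realm = {'sort_by_date': True}  # what the converter writes
with open('/tmp/realm.json', 'w') as f:
    json.dump(realm, f)

with open('/tmp/realm.json') as f:
    data = json.load(f)

# Exports that predate the flag fall back to the old behavior.
sort_by_date = data.get('sort_by_date', False)
assert sort_by_date is True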


@@ -655,6 +655,8 @@ def do_import_realm(import_dir: Path, subdomain: str) -> Realm:
     with open(realm_data_filename) as f:
         data = ujson.load(f)

+    sort_by_date = data.get('sort_by_date', False)
+
     bulk_import_client(data, Client, 'zerver_client')

     # We don't import the Stream model yet, since it depends on Realm,
@@ -714,7 +716,7 @@ def do_import_realm(import_dir: Path, subdomain: str) -> Realm:
     data['zerver_userprofile'].sort(key=lambda r: r['id'])

     # To remap foreign key for UserProfile.last_active_message_id
-    update_message_foreign_keys(import_dir)
+    update_message_foreign_keys(import_dir=import_dir, sort_by_date=sort_by_date)

     fix_datetime_fields(data, 'zerver_userprofile')
     update_model_ids(UserProfile, data, 'user_profile')
@@ -915,7 +917,45 @@ def create_users(realm: Realm, name_list: Iterable[Tuple[str, str]],
         user_set.add((email, full_name, short_name, True))
     bulk_create_users(realm, user_set, bot_type)

-def update_message_foreign_keys(import_dir: Path) -> None:
+def update_message_foreign_keys(import_dir: Path,
+                                sort_by_date: bool) -> None:
+    old_id_list = get_incoming_message_ids(
+        import_dir=import_dir,
+        sort_by_date=sort_by_date,
+    )
+    count = len(old_id_list)
+    new_id_list = allocate_ids(model_class=Message, count=count)
+    for old_id, new_id in zip(old_id_list, new_id_list):
+        update_id_map(
+            table='message',
+            old_id=old_id,
+            new_id=new_id,
+        )
+
+    # We don't touch user_message keys here; that happens later,
+    # when we actually read the files a second time to get the
+    # actual data.
+
+def get_incoming_message_ids(import_dir: Path,
+                             sort_by_date: bool) -> List[int]:
+    '''
+    This function reads in our entire collection of message
+    ids, which can be millions of integers for some installations.
+    If sort_by_date is True, we then sort the list by pub_date.
+    This is necessary to ensure that the sort order of the
+    incoming ids matches the sort order of pub_date, which isn't
+    always guaranteed by our utilities that convert third-party
+    chat data.  We also need to move our ids to a new range if
+    we're dealing with a server that has data for other realms.
+    '''
+    if sort_by_date:
+        tups = list()  # type: List[Tuple[int, int]]
+    else:
+        message_ids = []  # type: List[int]
+
     dump_file_id = 1
     while True:
         message_filename = os.path.join(import_dir, "messages-%06d.json" % (dump_file_id,))
@@ -925,9 +965,37 @@ def update_message_foreign_keys(import_dir: Path) -> None:
         with open(message_filename) as f:
             data = ujson.load(f)

-        update_model_ids(Message, data, 'message')
+        # Aggressively free up memory.
+        del data['zerver_usermessage']
+
+        for row in data['zerver_message']:
+            # We truncate pub_date to int to theoretically save
+            # memory and speed up the sort.  For Zulip-to-Zulip
+            # imports, the message_id will generally be a good
+            # tiebreaker.  If we occasionally mis-order the ids
+            # for two messages from the same second, it's not the
+            # end of the world, as it's likely those messages
+            # arrived at the original server in somewhat
+            # arbitrary order.
+            message_id = row['id']
+
+            if sort_by_date:
+                pub_date = int(row['pub_date'])
+                tup = (pub_date, message_id)
+                tups.append(tup)
+            else:
+                message_ids.append(message_id)
+
         dump_file_id += 1

+    if sort_by_date:
+        tups.sort()
+        message_ids = [tup[1] for tup in tups]
+
+    return message_ids
+
 def import_message_data(import_dir: Path) -> None:
     dump_file_id = 1
     while True:
@@ -946,7 +1014,18 @@ def import_message_data(import_dir: Path) -> None:
         # Parser to update message content with the updated attachment urls
         fix_upload_links(data, 'zerver_message')

-        re_map_foreign_keys(data, 'zerver_message', 'id', related_table='message', id_field=True)
+        # We already create mappings for zerver_message ids
+        # in update_message_foreign_keys(), so here we simply
+        # apply them.
+        message_id_map = id_maps['message']
+        for row in data['zerver_message']:
+            row['id'] = message_id_map[row['id']]
+
+        for row in data['zerver_usermessage']:
+            assert(row['message'] in message_id_map)

         # A LOT HAPPENS HERE.
         # This is where we actually import the message data.
         bulk_import_model(data, Message)

         fix_message_rendered_content(data, 'zerver_message')
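
To see the two code paths end to end, here is a self-contained
sketch that mimics get_incoming_message_ids(), with in-memory
dicts standing in for the messages-NNNNNN.json dump files; the
rows match the fixtures added below:

from typing import List, Tuple

DUMP_FILES = [
    {'zerver_message': [{'pub_date': 1409000103, 'id': 555},
                        {'pub_date': 1409000101, 'id': 888}]},
    {'zerver_message': [{'pub_date': 1409000102, 'id': 999}]},
]

def incoming_message_ids(sort_by_date: bool) -> List[int]:
    if sort_by_date:
        tups = []  # type: List[Tuple[int, int]]
    else:
        message_ids = []  # type: List[int]

    for data in DUMP_FILES:
        for row in data['zerver_message']:
            # int(pub_date) keeps the sort keys compact; the
            # message id breaks ties within the same second.
            if sort_by_date:
                tups.append((int(row['pub_date']), row['id']))
            else:
                message_ids.append(row['id'])

    if sort_by_date:
        tups.sort()
        message_ids = [tup[1] for tup in tups]
    return message_ids

print(incoming_message_ids(sort_by_date=True))   # [888, 999, 555]
print(incoming_message_ids(sort_by_date=False))  # [555, 888, 999]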


@@ -0,0 +1,14 @@
+{
+    "zerver_message":[
+        {
+            "pub_date":1409000103,
+            "id":555
+        },
+        {
+            "pub_date":1409000101,
+            "id":888
+        }
+    ],
+    "zerver_usermessage":[
+    ]
+}


@@ -0,0 +1,10 @@
+{
+    "zerver_message":[
+        {
+            "pub_date":1409000102,
+            "id":999
+        }
+    ],
+    "zerver_usermessage":[
+    ]
+}


@@ -21,6 +21,7 @@ from zerver.lib.export import (
 )
 from zerver.lib.import_realm import (
     do_import_realm,
+    get_incoming_message_ids,
 )
 from zerver.lib.avatar_hash import (
     user_avatar_path,

@@ -805,6 +806,22 @@ class ImportExportTest(ZulipTestCase):
         image_data = original_image_key.get_contents_as_string()
         self.assertEqual(image_data, test_image_data)

+    def test_get_incoming_message_ids(self) -> None:
+        import_dir = os.path.join(settings.DEPLOY_ROOT, "zerver", "tests", "fixtures", "import_fixtures")
+
+        message_ids = get_incoming_message_ids(
+            import_dir=import_dir,
+            sort_by_date=True,
+        )
+        self.assertEqual(message_ids, [888, 999, 555])
+
+        message_ids = get_incoming_message_ids(
+            import_dir=import_dir,
+            sort_by_date=False,
+        )
+        self.assertEqual(message_ids, [555, 888, 999])
+
     def test_plan_type(self) -> None:
         realm = get_realm('zulip')
         realm.plan_type = Realm.PREMIUM