mirror of
https://github.com/zulip/zulip.git
synced 2025-11-09 08:26:11 +00:00
import: Use pub_date to sort message ids.
When we create new ids for message rows, we now sort the new ids by their corresponding pub_date values in the rows. This takes a sizable chunk of memory. This feature only gets turned on if you set sort_by_date to True in realm.json.
This commit is contained in:
@@ -679,6 +679,8 @@ def do_convert_data(input_tar_file: str, output_dir: str) -> None:
|
|||||||
# we process everything else, since we may introduce
|
# we process everything else, since we may introduce
|
||||||
# mirror users when processing messages.
|
# mirror users when processing messages.
|
||||||
realm['zerver_userprofile'] = user_handler.get_all_users()
|
realm['zerver_userprofile'] = user_handler.get_all_users()
|
||||||
|
realm['sort_by_date'] = True
|
||||||
|
|
||||||
create_converted_data_files(realm, output_dir, '/realm.json')
|
create_converted_data_files(realm, output_dir, '/realm.json')
|
||||||
|
|
||||||
logging.info('Start importing avatar data')
|
logging.info('Start importing avatar data')
|
||||||
|
|||||||
@@ -655,6 +655,8 @@ def do_import_realm(import_dir: Path, subdomain: str) -> Realm:
|
|||||||
with open(realm_data_filename) as f:
|
with open(realm_data_filename) as f:
|
||||||
data = ujson.load(f)
|
data = ujson.load(f)
|
||||||
|
|
||||||
|
sort_by_date = data.get('sort_by_date', False)
|
||||||
|
|
||||||
bulk_import_client(data, Client, 'zerver_client')
|
bulk_import_client(data, Client, 'zerver_client')
|
||||||
|
|
||||||
# We don't import the Stream model yet, since it depends on Realm,
|
# We don't import the Stream model yet, since it depends on Realm,
|
||||||
@@ -714,7 +716,7 @@ def do_import_realm(import_dir: Path, subdomain: str) -> Realm:
|
|||||||
data['zerver_userprofile'].sort(key=lambda r: r['id'])
|
data['zerver_userprofile'].sort(key=lambda r: r['id'])
|
||||||
|
|
||||||
# To remap foreign key for UserProfile.last_active_message_id
|
# To remap foreign key for UserProfile.last_active_message_id
|
||||||
update_message_foreign_keys(import_dir)
|
update_message_foreign_keys(import_dir=import_dir, sort_by_date=sort_by_date)
|
||||||
|
|
||||||
fix_datetime_fields(data, 'zerver_userprofile')
|
fix_datetime_fields(data, 'zerver_userprofile')
|
||||||
update_model_ids(UserProfile, data, 'user_profile')
|
update_model_ids(UserProfile, data, 'user_profile')
|
||||||
@@ -915,7 +917,45 @@ def create_users(realm: Realm, name_list: Iterable[Tuple[str, str]],
|
|||||||
user_set.add((email, full_name, short_name, True))
|
user_set.add((email, full_name, short_name, True))
|
||||||
bulk_create_users(realm, user_set, bot_type)
|
bulk_create_users(realm, user_set, bot_type)
|
||||||
|
|
||||||
def update_message_foreign_keys(import_dir: Path) -> None:
|
def update_message_foreign_keys(import_dir: Path,
|
||||||
|
sort_by_date: bool) -> None:
|
||||||
|
old_id_list = get_incoming_message_ids(
|
||||||
|
import_dir=import_dir,
|
||||||
|
sort_by_date=sort_by_date,
|
||||||
|
)
|
||||||
|
|
||||||
|
count = len(old_id_list)
|
||||||
|
|
||||||
|
new_id_list = allocate_ids(model_class=Message, count=count)
|
||||||
|
|
||||||
|
for old_id, new_id in zip(old_id_list, new_id_list):
|
||||||
|
update_id_map(
|
||||||
|
table='message',
|
||||||
|
old_id=old_id,
|
||||||
|
new_id=new_id,
|
||||||
|
)
|
||||||
|
|
||||||
|
# We don't touch user_message keys here; that happens later when
|
||||||
|
# we actually read the files a second time to get actual data.
|
||||||
|
|
||||||
|
def get_incoming_message_ids(import_dir: Path,
|
||||||
|
sort_by_date: bool) -> List[int]:
|
||||||
|
'''
|
||||||
|
This function reads in our entire collection of message
|
||||||
|
ids, which can be millions of integers for some installations.
|
||||||
|
And then we sort the list. This is necessary to ensure
|
||||||
|
that the sort order of incoming ids matches the sort order
|
||||||
|
of pub_date, which isn't always guaranteed by our
|
||||||
|
utilities that convert third party chat data. We also
|
||||||
|
need to move our ids to a new range if we're dealing
|
||||||
|
with a server that has data for other realms.
|
||||||
|
'''
|
||||||
|
|
||||||
|
if sort_by_date:
|
||||||
|
tups = list() # type: List[Tuple[int, int]]
|
||||||
|
else:
|
||||||
|
message_ids = [] # type: List[int]
|
||||||
|
|
||||||
dump_file_id = 1
|
dump_file_id = 1
|
||||||
while True:
|
while True:
|
||||||
message_filename = os.path.join(import_dir, "messages-%06d.json" % (dump_file_id,))
|
message_filename = os.path.join(import_dir, "messages-%06d.json" % (dump_file_id,))
|
||||||
@@ -925,9 +965,37 @@ def update_message_foreign_keys(import_dir: Path) -> None:
|
|||||||
with open(message_filename) as f:
|
with open(message_filename) as f:
|
||||||
data = ujson.load(f)
|
data = ujson.load(f)
|
||||||
|
|
||||||
update_model_ids(Message, data, 'message')
|
# Aggressively free up memory.
|
||||||
|
del data['zerver_usermessage']
|
||||||
|
|
||||||
|
for row in data['zerver_message']:
|
||||||
|
# We truncate pub_date to int to theoretically
|
||||||
|
# save memory and speed up the sort. For
|
||||||
|
# Zulip-to-Zulip imports, the
|
||||||
|
# message_id will generally be a good tiebreaker.
|
||||||
|
# If we occasionally mis-order the ids for two
|
||||||
|
# messages from the same second, it's not the
|
||||||
|
# end of the world, as it's likely those messages
|
||||||
|
# arrived to the original server in somewhat
|
||||||
|
# arbitrary order.
|
||||||
|
|
||||||
|
message_id = row['id']
|
||||||
|
|
||||||
|
if sort_by_date:
|
||||||
|
pub_date = int(row['pub_date'])
|
||||||
|
tup = (pub_date, message_id)
|
||||||
|
tups.append(tup)
|
||||||
|
else:
|
||||||
|
message_ids.append(message_id)
|
||||||
|
|
||||||
dump_file_id += 1
|
dump_file_id += 1
|
||||||
|
|
||||||
|
if sort_by_date:
|
||||||
|
tups.sort()
|
||||||
|
message_ids = [tup[1] for tup in tups]
|
||||||
|
|
||||||
|
return message_ids
|
||||||
|
|
||||||
def import_message_data(import_dir: Path) -> None:
|
def import_message_data(import_dir: Path) -> None:
|
||||||
dump_file_id = 1
|
dump_file_id = 1
|
||||||
while True:
|
while True:
|
||||||
@@ -946,7 +1014,18 @@ def import_message_data(import_dir: Path) -> None:
|
|||||||
# Parser to update message content with the updated attachment urls
|
# Parser to update message content with the updated attachment urls
|
||||||
fix_upload_links(data, 'zerver_message')
|
fix_upload_links(data, 'zerver_message')
|
||||||
|
|
||||||
re_map_foreign_keys(data, 'zerver_message', 'id', related_table='message', id_field=True)
|
# We already create mappings for zerver_message ids
|
||||||
|
# in update_message_foreign_keys(), so here we simply
|
||||||
|
# apply them.
|
||||||
|
message_id_map = id_maps['message']
|
||||||
|
for row in data['zerver_message']:
|
||||||
|
row['id'] = message_id_map[row['id']]
|
||||||
|
|
||||||
|
for row in data['zerver_usermessage']:
|
||||||
|
assert(row['message'] in message_id_map)
|
||||||
|
|
||||||
|
# A LOT HAPPENS HERE.
|
||||||
|
# This is where we actually import the message data.
|
||||||
bulk_import_model(data, Message)
|
bulk_import_model(data, Message)
|
||||||
|
|
||||||
fix_message_rendered_content(data, 'zerver_message')
|
fix_message_rendered_content(data, 'zerver_message')
|
||||||
|
|||||||
14
zerver/tests/fixtures/import_fixtures/messages-000001.json
vendored
Normal file
14
zerver/tests/fixtures/import_fixtures/messages-000001.json
vendored
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
{
|
||||||
|
"zerver_message":[
|
||||||
|
{
|
||||||
|
"pub_date":1409000103,
|
||||||
|
"id":555
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"pub_date":1409000101,
|
||||||
|
"id":888
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"zerver_usermessage":[
|
||||||
|
]
|
||||||
|
}
|
||||||
10
zerver/tests/fixtures/import_fixtures/messages-000002.json
vendored
Normal file
10
zerver/tests/fixtures/import_fixtures/messages-000002.json
vendored
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
{
|
||||||
|
"zerver_message":[
|
||||||
|
{
|
||||||
|
"pub_date":1409000102,
|
||||||
|
"id":999
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"zerver_usermessage":[
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -21,6 +21,7 @@ from zerver.lib.export import (
|
|||||||
)
|
)
|
||||||
from zerver.lib.import_realm import (
|
from zerver.lib.import_realm import (
|
||||||
do_import_realm,
|
do_import_realm,
|
||||||
|
get_incoming_message_ids,
|
||||||
)
|
)
|
||||||
from zerver.lib.avatar_hash import (
|
from zerver.lib.avatar_hash import (
|
||||||
user_avatar_path,
|
user_avatar_path,
|
||||||
@@ -805,6 +806,22 @@ class ImportExportTest(ZulipTestCase):
|
|||||||
image_data = original_image_key.get_contents_as_string()
|
image_data = original_image_key.get_contents_as_string()
|
||||||
self.assertEqual(image_data, test_image_data)
|
self.assertEqual(image_data, test_image_data)
|
||||||
|
|
||||||
|
def test_get_incoming_message_ids(self) -> None:
|
||||||
|
import_dir = os.path.join(settings.DEPLOY_ROOT, "zerver", "tests", "fixtures", "import_fixtures")
|
||||||
|
message_ids = get_incoming_message_ids(
|
||||||
|
import_dir=import_dir,
|
||||||
|
sort_by_date=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(message_ids, [888, 999, 555])
|
||||||
|
|
||||||
|
message_ids = get_incoming_message_ids(
|
||||||
|
import_dir=import_dir,
|
||||||
|
sort_by_date=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(message_ids, [555, 888, 999])
|
||||||
|
|
||||||
def test_plan_type(self) -> None:
|
def test_plan_type(self) -> None:
|
||||||
realm = get_realm('zulip')
|
realm = get_realm('zulip')
|
||||||
realm.plan_type = Realm.PREMIUM
|
realm.plan_type = Realm.PREMIUM
|
||||||
|
|||||||
Reference in New Issue
Block a user