user export: Be more selective about exported messages.

This commit is contained in:
Steve Howell
2021-12-09 20:31:46 +00:00
committed by Tim Abbott
parent fa654fd7a0
commit 6a5c407b05
2 changed files with 61 additions and 8 deletions

View File

@@ -1944,8 +1944,12 @@ def do_export_user(user_profile: UserProfile, output_dir: Path) -> None:
export_file = os.path.join(output_dir, "user.json")
write_table_data(output_file=export_file, data=response)
reaction_message_ids: Set[int] = {row["message"] for row in response["zerver_reaction"]}
logging.info("Exporting messages")
export_messages_single_user(user_profile, output_dir)
export_messages_single_user(
user_profile, output_dir=output_dir, reaction_message_ids=reaction_message_ids
)
logging.info("Exporting images")
export_uploads_and_avatars(user_profile.realm, user=user_profile, output_dir=output_dir)
@@ -2090,18 +2094,27 @@ def chunkify(lst: List[int], chunk_size: int) -> List[List[int]]:
def export_messages_single_user(
user_profile: UserProfile, output_dir: Path, chunk_size: int = MESSAGE_BATCH_CHUNK_SIZE
user_profile: UserProfile, *, output_dir: Path, reaction_message_ids: Set[int]
) -> None:
# Do a slim query to find all message ids that pertain to us.
# TODO: be more selective about which message ids we export.
all_message_ids = get_id_list_gently_from_database(
base_query=UserMessage.objects.filter(user_profile=user_profile),
id_field="message_id",
messages_from_me = Message.objects.filter(sender=user_profile)
my_subscriptions = Subscription.objects.filter(
user_profile=user_profile, recipient__type__in=[Recipient.PERSONAL, Recipient.HUDDLE]
)
my_recipient_ids = [sub.recipient_id for sub in my_subscriptions]
messages_to_me = Message.objects.filter(recipient_id__in=my_recipient_ids)
# Find all message ids that pertain to us.
all_message_ids: Set[int] = set()
for query in [messages_from_me, messages_to_me]:
all_message_ids |= set(get_id_list_gently_from_database(base_query=query, id_field="id"))
all_message_ids |= reaction_message_ids
dump_file_id = 1
for message_id_chunk in chunkify(all_message_ids, chunk_size):
for message_id_chunk in chunkify(sorted(list(all_message_ids)), MESSAGE_BATCH_CHUNK_SIZE):
fat_query = (
UserMessage.objects.select_related("message", "message__sending_client")
.filter(user_profile=user_profile, message_id__in=message_id_chunk)

View File

@@ -32,6 +32,7 @@ from zerver.lib.test_helpers import (
create_s3_buckets,
get_test_image_file,
most_recent_message,
most_recent_usermessage,
use_s3_backend,
)
from zerver.lib.topic_mutes import add_topic_mute
@@ -637,6 +638,10 @@ class ImportExportTest(ZulipTestCase):
def test_export_single_user(self) -> None:
hamlet = self.example_user("hamlet")
cordelia = self.example_user("cordelia")
othello = self.example_user("othello")
polonius = self.example_user("polonius")
self.subscribe(cordelia, "Denmark")
smile_message_id = self.send_stream_message(hamlet, "Denmark", "SMILE!")
@@ -650,6 +655,26 @@ class ImportExportTest(ZulipTestCase):
reaction = Reaction.objects.order_by("id").last()
assert reaction
# Send a message that Cordelia should not have in the export.
self.send_stream_message(othello, "Denmark", "bogus")
hi_stream_message_id = self.send_stream_message(cordelia, "Denmark", "hi stream")
assert most_recent_usermessage(cordelia).message_id == hi_stream_message_id
# Try to fool the export again: send private and huddle messages that do not involve Cordelia.
self.send_personal_message(othello, hamlet)
self.send_huddle_message(othello, [hamlet, polonius])
hi_hamlet_message_id = self.send_personal_message(cordelia, hamlet, "hi hamlet")
hi_peeps_message_id = self.send_huddle_message(cordelia, [hamlet, othello], "hi peeps")
bye_peeps_message_id = self.send_huddle_message(othello, [cordelia, hamlet], "bye peeps")
bye_hamlet_message_id = self.send_personal_message(cordelia, hamlet, "bye hamlet")
hi_myself_message_id = self.send_personal_message(cordelia, cordelia, "hi myself")
bye_stream_message_id = self.send_stream_message(cordelia, "Denmark", "bye stream")
output_dir = self._make_output_dir()
cordelia = self.example_user("cordelia")
@@ -682,6 +707,21 @@ class ImportExportTest(ZulipTestCase):
exported_messages_recipient = self.get_set(messages["zerver_message"], "recipient")
self.assertIn(list(exported_messages_recipient)[0], exported_recipient_id)
excerpt = [(rec["id"], rec["content"]) for rec in messages["zerver_message"][-8:]]
self.assertEqual(
excerpt,
[
(smile_message_id, "SMILE!"),
(hi_stream_message_id, "hi stream"),
(hi_hamlet_message_id, "hi hamlet"),
(hi_peeps_message_id, "hi peeps"),
(bye_peeps_message_id, "bye peeps"),
(bye_hamlet_message_id, "bye hamlet"),
(hi_myself_message_id, "hi myself"),
(bye_stream_message_id, "bye stream"),
],
)
(exported_reaction,) = user["zerver_reaction"]
self.assertEqual(
exported_reaction,