Files
zulip/zerver/tests/test_retention.py
Mateusz Mandera 0e9fa4f028 retention: Support stream-based retention policies.
We change the archiving scheme to allow having stream based retention
policies. In the first step of the archiving process, we loop over
streams and archive their expired messages and related objects.
Then we separately archive all expired personal and huddle messages and
related objects. As the last step, we scan for redundant attachments
which can now be deleted.
To achieve this, we have to rewrite a significant portion of the
retention code and rework some of the database queries.
For the sake of simplicity, we neither archive nor delete cross-realm
messages, except cross-realm stream messages – in their case they can
be processed in the same manner as ordinary stream messages.
In the query for archiving personal and huddle messages we simply
exclude those sent by cross-realm bots.
We change the tests to adapt to these modifications.
2019-06-06 11:17:42 -07:00

551 lines
23 KiB
Python

# -*- coding: utf-8 -*-
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional
from django.utils.timezone import now as timezone_now
from zerver.lib.actions import internal_send_private_message, do_add_submessage
from zerver.lib.test_classes import ZulipTestCase
from zerver.lib.upload import create_attachment
from zerver.models import (Message, Realm, UserProfile, Stream, ArchivedUserMessage, SubMessage,
ArchivedMessage, Attachment, ArchivedAttachment, UserMessage,
Reaction, ArchivedReaction, ArchivedSubMessage,
get_realm, get_user_profile_by_email, get_system_bot)
from zerver.lib.retention import (
archive_messages,
move_messages_to_archive
)
# Class with helper functions useful for testing archiving of reactions:
from zerver.tests.test_reactions import EmojiReactionBase
ZULIP_REALM_DAYS = 30
MIT_REALM_DAYS = 100
class RetentionTestingBase(ZulipTestCase):
"""
Test receiving expired messages retention tool.
"""
def setUp(self) -> None:
self.zulip_realm = get_realm('zulip')
self.mit_realm = get_realm('zephyr')
self._set_realm_message_retention_value(self.zulip_realm, ZULIP_REALM_DAYS)
self._set_realm_message_retention_value(self.mit_realm, MIT_REALM_DAYS)
# Set publication date of all existing messages to "now", so that we have full
# control over what's expired and what isn't.
Message.objects.all().update(pub_date=timezone_now())
def _set_realm_message_retention_value(self, realm: Realm, retention_period: Optional[int]) -> None:
realm.message_retention_days = retention_period
realm.save()
for stream in Stream.objects.filter(realm=realm):
if not retention_period or not stream.message_retention_days:
stream.message_retention_days = retention_period
stream.save()
def _change_messages_pub_date(self, msgs_ids: List[int], pub_date: datetime) -> None:
Message.objects.filter(id__in=msgs_ids).update(pub_date=pub_date)
def _make_mit_messages(self, message_quantity: int, pub_date: datetime) -> Any:
# send messages from mit.edu realm and change messages pub date
sender = self.mit_user('espuser')
recipient = self.mit_user('starnine')
msg_ids = [self.send_personal_message(sender.email, recipient.email,
sender_realm='zephyr')
for i in range(message_quantity)]
self._change_messages_pub_date(msg_ids, pub_date)
return msg_ids
def _send_cross_realm_message(self) -> int:
# Send message from bot to users from different realm.
bot_email = 'notification-bot@zulip.com'
get_user_profile_by_email(bot_email)
mit_user = UserProfile.objects.filter(realm=self.mit_realm).first()
msg_id = internal_send_private_message(
realm=mit_user.realm,
sender=get_system_bot(bot_email),
recipient_user=mit_user,
content='test message',
)
assert msg_id is not None
return msg_id
def _make_expired_zulip_messages(self, message_quantity: int) -> List[int]:
msg_ids = list(Message.objects.order_by('id').filter(
sender__realm=self.zulip_realm).values_list('id', flat=True)[3:3 + message_quantity])
self._change_messages_pub_date(
msg_ids,
timezone_now() - timedelta(ZULIP_REALM_DAYS+1)
)
return msg_ids
def _get_usermessage_ids(self, message_ids: List[int]) -> List[int]:
return list(UserMessage.objects.filter(message_id__in=message_ids).values_list('id', flat=True))
def _verify_archive_data(self, expected_message_ids: List[int],
expected_usermessage_ids: List[int]) -> None:
self.assertEqual(
set(ArchivedMessage.objects.values_list('id', flat=True)),
set(expected_message_ids)
)
self.assertEqual(
set(ArchivedUserMessage.objects.values_list('id', flat=True)),
set(expected_usermessage_ids)
)
def _send_messages_with_attachments(self) -> Dict[str, int]:
user_profile = self.example_user("hamlet")
sender_email = user_profile.email
sample_size = 10
dummy_files = [
('zulip.txt', '1/31/4CBjtTLYZhk66pZrF8hnYGwc/zulip.txt', sample_size),
('temp_file.py', '1/31/4CBjtTLYZhk66pZrF8hnYGwc/temp_file.py', sample_size),
('abc.py', '1/31/4CBjtTLYZhk66pZrF8hnYGwc/abc.py', sample_size)
]
for file_name, path_id, size in dummy_files:
create_attachment(file_name, path_id, user_profile, size)
self.subscribe(user_profile, "Denmark")
body = "Some files here ...[zulip.txt](http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/zulip.txt)" + \
"http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/temp_file.py.... Some more...." + \
"http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/abc.py"
expired_message_id = self.send_stream_message(sender_email, "Denmark", body)
actual_message_id = self.send_stream_message(sender_email, "Denmark", body)
other_message_id = self.send_stream_message("othello@zulip.com", "Denmark", body)
self._change_messages_pub_date([expired_message_id], timezone_now() - timedelta(days=MIT_REALM_DAYS + 1))
return {'expired_message_id': expired_message_id, 'actual_message_id': actual_message_id,
'other_user_message_id': other_message_id}
class TestArchivingGeneral(RetentionTestingBase):
def test_no_expired_messages(self) -> None:
archive_messages()
self.assertEqual(ArchivedUserMessage.objects.count(), 0)
self.assertEqual(ArchivedMessage.objects.count(), 0)
def test_expired_messages_in_each_realm(self) -> None:
"""General test for archiving expired messages properly with
multiple realms involved"""
# Make some expired messages in MIT:
expired_mit_msg_ids = self._make_mit_messages(
5,
timezone_now() - timedelta(days=MIT_REALM_DAYS+1)
)
# Make some non-expired messages in MIT:
self._make_mit_messages(4, timezone_now() - timedelta(days=MIT_REALM_DAYS-1))
# Change some Zulip messages to be expired:
expired_zulip_msg_ids = list(Message.objects.order_by('id').filter(
sender__realm=self.zulip_realm).values_list('id', flat=True)[3:10])
self._change_messages_pub_date(
expired_zulip_msg_ids,
timezone_now() - timedelta(ZULIP_REALM_DAYS+1)
)
expired_msg_ids = expired_mit_msg_ids + expired_zulip_msg_ids
expired_usermsg_ids = self._get_usermessage_ids(expired_msg_ids)
archive_messages()
self._verify_archive_data(expired_msg_ids, expired_usermsg_ids)
def test_expired_messages_in_one_realm(self) -> None:
"""Test with a retention policy set for only the MIT realm"""
self._set_realm_message_retention_value(self.zulip_realm, None)
# Make some expired messages in MIT:
expired_mit_msg_ids = self._make_mit_messages(
5,
timezone_now() - timedelta(days=MIT_REALM_DAYS+1)
)
# Make some non-expired messages in MIT:
self._make_mit_messages(4, timezone_now() - timedelta(days=MIT_REALM_DAYS-1))
# Change some Zulip messages pub_date, but the realm has no retention policy,
# so they shouldn't get archived
zulip_msg_ids = list(Message.objects.order_by('id').filter(
sender__realm=self.zulip_realm).values_list('id', flat=True)[3:10])
self._change_messages_pub_date(
zulip_msg_ids,
timezone_now() - timedelta(ZULIP_REALM_DAYS+1)
)
# Only MIT has a retention policy:
expired_msg_ids = expired_mit_msg_ids
expired_usermsg_ids = self._get_usermessage_ids(expired_msg_ids)
archive_messages()
self._verify_archive_data(expired_msg_ids, expired_usermsg_ids)
self._set_realm_message_retention_value(self.zulip_realm, ZULIP_REALM_DAYS)
def test_cross_realm_messages_not_archived(self) -> None:
"""Check that cross-realm messages don't get archived or deleted."""
msg_id = self._send_cross_realm_message()
# Make the message expired on both realms:
self._change_messages_pub_date([msg_id], timezone_now() - timedelta(MIT_REALM_DAYS+1))
archive_messages()
self.assertEqual(ArchivedMessage.objects.count(), 0)
self.assertEqual(ArchivedUserMessage.objects.count(), 0)
self.assertEqual(UserMessage.objects.filter(message_id=msg_id).count(), 2)
self.assertTrue(Message.objects.filter(id=msg_id).exists())
def test_archive_message_tool(self) -> None:
"""End-to-end test of the archiving tool, directly calling
archive_messages."""
# Make some expired messages in MIT:
expired_mit_msg_ids = self._make_mit_messages(
5,
timezone_now() - timedelta(days=MIT_REALM_DAYS+1)
)
# Make some non-expired messages in MIT:
self._make_mit_messages(4, timezone_now() - timedelta(days=MIT_REALM_DAYS-1))
# Change some Zulip messages to be expired:
expired_zulip_msg_ids = self._make_expired_zulip_messages(7)
expired_crossrealm_msg_id = self._send_cross_realm_message()
# Make the message expired on both realms:
self._change_messages_pub_date(
[expired_crossrealm_msg_id],
timezone_now() - timedelta(MIT_REALM_DAYS+1)
)
expired_msg_ids = expired_mit_msg_ids + expired_zulip_msg_ids
expired_usermsg_ids = self._get_usermessage_ids(expired_msg_ids)
archive_messages()
# Make sure we archived what neeeded:
self._verify_archive_data(expired_msg_ids, expired_usermsg_ids)
# Check that the archived messages were deleted:
self.assertEqual(UserMessage.objects.filter(id__in=expired_usermsg_ids).count(), 0)
self.assertEqual(Message.objects.filter(id__in=expired_msg_ids).count(), 0)
def test_archiving_attachments(self) -> None:
"""End-to-end test for the logic for archiving attachments. This test
is hard to read without first reading _send_messages_with_attachments"""
msgs_ids = self._send_messages_with_attachments()
# First, confirm deleting the oldest message
# (`expired_message_id`) creates ArchivedAttachment objects
# and associates that message ID with them, but does not
# delete the Attachment object.
archive_messages()
self.assertEqual(ArchivedAttachment.objects.count(), 3)
self.assertEqual(
list(ArchivedAttachment.objects.distinct('messages__id').values_list('messages__id',
flat=True)),
[msgs_ids['expired_message_id']]
)
self.assertEqual(Attachment.objects.count(), 3)
# Now make `actual_message_id` expired too. We still don't
# delete the Attachment objects.
self._change_messages_pub_date([msgs_ids['actual_message_id']],
timezone_now() - timedelta(days=MIT_REALM_DAYS + 1))
archive_messages()
self.assertEqual(Attachment.objects.count(), 3)
# Finally, make the last message mentioning those attachments
# expired. We should now delete the Attachment objects and
# each ArchivedAttachment object should list all 3 messages.
self._change_messages_pub_date([msgs_ids['other_user_message_id']],
timezone_now() - timedelta(days=MIT_REALM_DAYS + 1))
archive_messages()
self.assertEqual(Attachment.objects.count(), 0)
self.assertEqual(ArchivedAttachment.objects.count(), 3)
self.assertEqual(
list(ArchivedAttachment.objects.distinct('messages__id').order_by('messages__id').values_list(
'messages__id', flat=True)),
sorted(msgs_ids.values())
)
class TestArchivingSubMessages(RetentionTestingBase):
def test_archiving_submessages(self) -> None:
expired_msg_ids = self._make_expired_zulip_messages(2)
cordelia = self.example_user('cordelia')
hamlet = self.example_user('hamlet')
do_add_submessage(
realm=self.zulip_realm,
sender_id=cordelia.id,
message_id=expired_msg_ids[0],
msg_type='whatever',
content='{"name": "alice", "salary": 20}'
)
do_add_submessage(
realm=self.zulip_realm,
sender_id=hamlet.id,
message_id=expired_msg_ids[0],
msg_type='whatever',
content='{"name": "john", "salary": 30}'
)
do_add_submessage(
realm=self.zulip_realm,
sender_id=cordelia.id,
message_id=expired_msg_ids[1],
msg_type='whatever',
content='{"name": "jack", "salary": 10}'
)
submessage_ids = list(
SubMessage.objects.filter(message_id__in=expired_msg_ids).values_list('id', flat=True)
)
self.assertEqual(len(submessage_ids), 3)
self.assertEqual(SubMessage.objects.filter(id__in=submessage_ids).count(), 3)
archive_messages()
self.assertEqual(SubMessage.objects.filter(id__in=submessage_ids).count(), 0)
self.assertEqual(
set(ArchivedSubMessage.objects.filter(id__in=submessage_ids).values_list('id', flat=True)),
set(submessage_ids)
)
class TestArchivingReactions(RetentionTestingBase, EmojiReactionBase):
def test_archiving_reactions(self) -> None:
expired_msg_ids = self._make_expired_zulip_messages(2)
self.post_zulip_reaction(expired_msg_ids[0], 'hamlet')
self.post_zulip_reaction(expired_msg_ids[0], 'cordelia')
self.post_zulip_reaction(expired_msg_ids[1], 'hamlet')
reaction_ids = list(
Reaction.objects.filter(message_id__in=expired_msg_ids).values_list('id', flat=True)
)
self.assertEqual(len(reaction_ids), 3)
self.assertEqual(Reaction.objects.filter(id__in=reaction_ids).count(), 3)
archive_messages()
self.assertEqual(Reaction.objects.filter(id__in=reaction_ids).count(), 0)
self.assertEqual(
set(ArchivedReaction.objects.filter(id__in=reaction_ids).values_list('id', flat=True)),
set(reaction_ids)
)
class MoveMessageToArchiveBase(ZulipTestCase):
def setUp(self) -> None:
self.sender = 'hamlet@zulip.com'
self.recipient = 'cordelia@zulip.com'
def _create_attachments(self) -> None:
sample_size = 10
dummy_files = [
('zulip.txt', '1/31/4CBjtTLYZhk66pZrF8hnYGwc/zulip.txt', sample_size),
('temp_file.py', '1/31/4CBjtTLYZhk66pZrF8hnYGwc/temp_file.py', sample_size),
('abc.py', '1/31/4CBjtTLYZhk66pZrF8hnYGwc/abc.py', sample_size),
('hello.txt', '1/31/4CBjtTLYZhk66pZrF8hnYGwc/hello.txt', sample_size),
('new.py', '1/31/4CBjtTLYZhk66pZrF8hnYGwc/new.py', sample_size)
]
user_profile = self.example_user('hamlet')
for file_name, path_id, size in dummy_files:
create_attachment(file_name, path_id, user_profile, size)
def _get_usermsg_ids(self, msg_ids: List[int]) -> List[int]:
return list(UserMessage.objects.filter(
message_id__in=msg_ids).order_by('id').values_list('id', flat=True))
def _assert_archive_empty(self) -> None:
self.assertFalse(ArchivedUserMessage.objects.exists())
self.assertFalse(ArchivedMessage.objects.exists())
self.assertFalse(ArchivedAttachment.objects.exists())
def _check_messages_after_archiving(self, msg_ids: List[int], usermsg_ids: List[int]) -> None:
self.assertEqual(
set(ArchivedMessage.objects.filter(id__in=msg_ids).values_list('id', flat=True)),
set(msg_ids)
)
self.assertEqual(
set(ArchivedUserMessage.objects.filter(id__in=usermsg_ids).values_list('id', flat=True)),
set(usermsg_ids)
)
self.assertFalse(Message.objects.filter(id__in=msg_ids).exists())
self.assertFalse(UserMessage.objects.filter(id__in=usermsg_ids).exists())
class MoveMessageToArchiveGeneral(MoveMessageToArchiveBase):
def test_personal_messages_archiving(self) -> None:
msg_ids = [self.send_personal_message(self.sender, self.recipient)
for i in range(0, 3)]
usermsg_ids = self._get_usermsg_ids(msg_ids)
self._assert_archive_empty()
move_messages_to_archive(message_ids=msg_ids)
self._check_messages_after_archiving(msg_ids, usermsg_ids)
def test_stream_messages_archiving(self) -> None:
msg_ids = [self.send_stream_message(self.sender, "Verona")
for i in range(0, 3)]
usermsg_ids = self._get_usermsg_ids(msg_ids)
self._assert_archive_empty()
move_messages_to_archive(message_ids=msg_ids)
self._check_messages_after_archiving(msg_ids, usermsg_ids)
def test_archiving_messages_second_time(self) -> None:
msg_ids = [self.send_stream_message(self.sender, "Verona")
for i in range(0, 3)]
usermsg_ids = self._get_usermsg_ids(msg_ids)
self._assert_archive_empty()
move_messages_to_archive(message_ids=msg_ids)
self._check_messages_after_archiving(msg_ids, usermsg_ids)
with self.assertRaises(Message.DoesNotExist):
move_messages_to_archive(message_ids=msg_ids)
def test_archiving_messages_with_attachment(self) -> None:
self._create_attachments()
body1 = """Some files here ...[zulip.txt](
http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/zulip.txt)
http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/temp_file.py ....
Some more.... http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/abc.py
"""
body2 = """Some files here
http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/zulip.txt ...
http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/hello.txt ....
http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/new.py ....
"""
msg_ids = [
self.send_personal_message(self.sender, self.recipient, body1),
self.send_personal_message(self.sender, self.recipient, body2)
]
attachment_id_to_message_ids = {} # type: Dict[int, List[int]]
attachment_ids = list(
Attachment.objects.filter(messages__id__in=msg_ids).values_list("id", flat=True)
)
for attachment_id in attachment_ids:
attachment_id_to_message_ids[attachment_id] = list(
Message.objects.filter(attachment__id=attachment_id).values_list("id", flat=True)
)
usermsg_ids = self._get_usermsg_ids(msg_ids)
self._assert_archive_empty()
move_messages_to_archive(message_ids=msg_ids)
self._check_messages_after_archiving(msg_ids, usermsg_ids)
self.assertFalse(Attachment.objects.exists())
archived_attachment_ids = list(
ArchivedAttachment.objects.filter(messages__id__in=msg_ids).values_list("id", flat=True)
)
self.assertEqual(set(attachment_ids), set(archived_attachment_ids))
for attachment_id in archived_attachment_ids:
self.assertEqual(
set(attachment_id_to_message_ids[attachment_id]),
set(ArchivedMessage.objects.filter(
archivedattachment__id=attachment_id).values_list("id", flat=True))
)
def test_archiving_message_with_shared_attachment(self) -> None:
# Make sure that attachments still in use in other messages don't get deleted:
self._create_attachments()
body = """Some files here ...[zulip.txt](
http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/zulip.txt)
http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/temp_file.py ....
Some more.... http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/abc.py ...
http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/new.py ....
http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/hello.txt ....
"""
msg_id = self.send_personal_message(self.sender, self.recipient, body)
# Simulate a reply with the same contents.
reply_msg_id = self.send_personal_message(
from_email=self.recipient,
to_email=self.sender,
content=body,
)
usermsg_ids = self._get_usermsg_ids([msg_id])
attachment_ids = list(
Attachment.objects.filter(messages__id=msg_id).values_list("id", flat=True)
)
self._assert_archive_empty()
move_messages_to_archive(message_ids=[msg_id])
self._check_messages_after_archiving([msg_id], usermsg_ids)
self.assertEqual(Attachment.objects.count(), 5)
self.assertEqual(
set(ArchivedAttachment.objects.filter(messages__id=msg_id).values_list("id", flat=True)),
set(attachment_ids)
)
move_messages_to_archive(message_ids=[reply_msg_id])
# Now the attachment should have been deleted:
self.assertEqual(Attachment.objects.count(), 0)
class MoveMessageToArchiveWithSubMessages(MoveMessageToArchiveBase):
def test_archiving_message_with_submessages(self) -> None:
msg_id = self.send_stream_message(self.sender, "Verona")
cordelia = self.example_user('cordelia')
hamlet = self.example_user('hamlet')
do_add_submessage(
realm=get_realm('zulip'),
sender_id=cordelia.id,
message_id=msg_id,
msg_type='whatever',
content='{"name": "alice", "salary": 20}'
)
do_add_submessage(
realm=get_realm('zulip'),
sender_id=hamlet.id,
message_id=msg_id,
msg_type='whatever',
content='{"name": "john", "salary": 30}'
)
submessage_ids = list(
SubMessage.objects.filter(message_id=msg_id).values_list('id', flat=True)
)
self.assertEqual(SubMessage.objects.filter(id__in=submessage_ids).count(), 2)
move_messages_to_archive(message_ids=[msg_id])
self.assertEqual(
set(ArchivedSubMessage.objects.filter(message_id=msg_id).values_list("id", flat=True)),
set(submessage_ids)
)
self.assertEqual(SubMessage.objects.filter(id__in=submessage_ids).count(), 0)
class MoveMessageToArchiveWithReactions(MoveMessageToArchiveBase, EmojiReactionBase):
def test_archiving_message_with_reactions(self) -> None:
msg_id = self.send_stream_message(self.sender, "Verona")
self.post_zulip_reaction(msg_id, 'hamlet')
self.post_zulip_reaction(msg_id, 'cordelia')
reaction_ids = list(
Reaction.objects.filter(message_id=msg_id).values_list('id', flat=True)
)
self.assertEqual(Reaction.objects.filter(id__in=reaction_ids).count(), 2)
move_messages_to_archive(message_ids=[msg_id])
self.assertEqual(
set(ArchivedReaction.objects.filter(message_id=msg_id).values_list("id", flat=True)),
set(reaction_ids)
)
self.assertEqual(Reaction.objects.filter(id__in=reaction_ids).count(), 0)