retention: Use new ArchiveTransaction model.

We add a new model, ArchiveTransaction, to tie archived objects together
in a coherent way, according to the batches in which they are archived.
This enables making a better system for restoring from archive, and it
seems just more sensible to tie the archived objects in this way, rather
the somewhat vague setting of archive_timestamp to each object using
timezone_now().
This commit is contained in:
Mateusz Mandera
2019-06-18 19:54:09 +02:00
committed by Tim Abbott
parent e8d49330f2
commit a2cce62c1c
4 changed files with 90 additions and 24 deletions

View File

@@ -88,6 +88,7 @@ ALL_ZULIP_TABLES = {
'zerver_attachment_messages', 'zerver_attachment_messages',
'zerver_archivedreaction', 'zerver_archivedreaction',
'zerver_archivedsubmessage', 'zerver_archivedsubmessage',
'zerver_archivetransaction',
'zerver_botconfigdata', 'zerver_botconfigdata',
'zerver_botstoragedata', 'zerver_botstoragedata',
'zerver_client', 'zerver_client',
@@ -176,6 +177,7 @@ NON_EXPORTED_TABLES = {
'zerver_archivedattachment_messages', 'zerver_archivedattachment_messages',
'zerver_archivedreaction', 'zerver_archivedreaction',
'zerver_archivedsubmessage', 'zerver_archivedsubmessage',
'zerver_archivetransaction',
# Social auth tables are not needed post-export, since we don't # Social auth tables are not needed post-export, since we don't
# use any of this state outside of a direct authentication flow. # use any of this state outside of a direct authentication flow.

View File

@@ -9,7 +9,7 @@ from django.utils.timezone import now as timezone_now
from zerver.lib.logging_util import log_to_file from zerver.lib.logging_util import log_to_file
from zerver.models import (Message, UserMessage, ArchivedMessage, ArchivedUserMessage, Realm, from zerver.models import (Message, UserMessage, ArchivedMessage, ArchivedUserMessage, Realm,
Attachment, ArchivedAttachment, Reaction, ArchivedReaction, Attachment, ArchivedAttachment, Reaction, ArchivedReaction,
SubMessage, ArchivedSubMessage, Recipient, Stream, SubMessage, ArchivedSubMessage, Recipient, Stream, ArchiveTransaction,
get_stream_recipients, get_user_including_cross_realm) get_stream_recipients, get_user_including_cross_realm)
from typing import Any, Dict, Iterator, List from typing import Any, Dict, Iterator, List
@@ -51,7 +51,6 @@ def move_rows(src_model: Any, raw_query: str, returning_id: bool=False,
sql_args = { sql_args = {
'src_fields': ','.join(src_fields), 'src_fields': ','.join(src_fields),
'dst_fields': ','.join(dst_fields), 'dst_fields': ','.join(dst_fields),
'archive_timestamp': timezone_now()
} }
sql_args.update(kwargs) sql_args.update(kwargs)
with connection.cursor() as cursor: with connection.cursor() as cursor:
@@ -96,8 +95,8 @@ def move_expired_messages_to_archive_by_recipient(recipient: Recipient,
# Important: This function is a generator, and you need to iterate # Important: This function is a generator, and you need to iterate
# through the Iterator it returns to execute the queries. # through the Iterator it returns to execute the queries.
query = """ query = """
INSERT INTO zerver_archivedmessage ({dst_fields}, archive_timestamp) INSERT INTO zerver_archivedmessage ({dst_fields})
SELECT {src_fields}, '{archive_timestamp}' SELECT {src_fields}
FROM zerver_message FROM zerver_message
LEFT JOIN zerver_archivedmessage ON zerver_archivedmessage.id = zerver_message.id LEFT JOIN zerver_archivedmessage ON zerver_archivedmessage.id = zerver_message.id
WHERE zerver_message.recipient_id = {recipient_id} WHERE zerver_message.recipient_id = {recipient_id}
@@ -126,8 +125,8 @@ def move_expired_personal_and_huddle_messages_to_archive(realm: Realm,
# TODO: Remove the "zerver_userprofile.id NOT IN {cross_realm_bot_ids}" clause # TODO: Remove the "zerver_userprofile.id NOT IN {cross_realm_bot_ids}" clause
# once https://github.com/zulip/zulip/issues/11015 is solved. # once https://github.com/zulip/zulip/issues/11015 is solved.
query = """ query = """
INSERT INTO zerver_archivedmessage ({dst_fields}, archive_timestamp) INSERT INTO zerver_archivedmessage ({dst_fields})
SELECT {src_fields}, '{archive_timestamp}' SELECT {src_fields}
FROM zerver_message FROM zerver_message
INNER JOIN zerver_recipient ON zerver_recipient.id = zerver_message.recipient_id INNER JOIN zerver_recipient ON zerver_recipient.id = zerver_message.recipient_id
INNER JOIN zerver_userprofile ON zerver_userprofile.id = zerver_message.sender_id INNER JOIN zerver_userprofile ON zerver_userprofile.id = zerver_message.sender_id
@@ -153,8 +152,8 @@ def move_to_archive_and_delete_models_with_message_key(msg_ids: List[int]) -> No
for model in models_with_message_key: for model in models_with_message_key:
query = """ query = """
WITH archived_data AS ( WITH archived_data AS (
INSERT INTO {archive_table_name} ({dst_fields}, archive_timestamp) INSERT INTO {archive_table_name} ({dst_fields})
SELECT {src_fields}, '{archive_timestamp}' SELECT {src_fields}
FROM {table_name} FROM {table_name}
LEFT JOIN {archive_table_name} ON {archive_table_name}.id = {table_name}.id LEFT JOIN {archive_table_name} ON {archive_table_name}.id = {table_name}.id
WHERE {table_name}.message_id IN {message_ids} WHERE {table_name}.message_id IN {message_ids}
@@ -172,8 +171,8 @@ def move_attachments_to_archive(msg_ids: List[int]) -> None:
assert len(msg_ids) > 0 assert len(msg_ids) > 0
query = """ query = """
INSERT INTO zerver_archivedattachment ({dst_fields}, archive_timestamp) INSERT INTO zerver_archivedattachment ({dst_fields})
SELECT {src_fields}, '{archive_timestamp}' SELECT {src_fields}
FROM zerver_attachment FROM zerver_attachment
INNER JOIN zerver_attachment_messages INNER JOIN zerver_attachment_messages
ON zerver_attachment_messages.attachment_id = zerver_attachment.id ON zerver_attachment_messages.attachment_id = zerver_attachment.id
@@ -226,7 +225,7 @@ def move_related_objects_to_archive(msg_ids: List[int]) -> None:
move_attachments_to_archive(msg_ids) move_attachments_to_archive(msg_ids)
move_attachment_messages_to_archive(msg_ids) move_attachment_messages_to_archive(msg_ids)
def run_archiving_in_chunks(message_id_chunks: Iterator[List[int]]) -> int: def run_archiving_in_chunks(message_id_chunks: Iterator[List[int]], realm: Realm) -> int:
# This function is carefully designed to achieve our # This function is carefully designed to achieve our
# transactionality goals: A batch of messages is either fully # transactionality goals: A batch of messages is either fully
# archived-and-deleted or not transactionally. # archived-and-deleted or not transactionally.
@@ -244,6 +243,11 @@ def run_archiving_in_chunks(message_id_chunks: Iterator[List[int]]) -> int:
except StopIteration: except StopIteration:
break break
archive_transaction = ArchiveTransaction.objects.create(
type=ArchiveTransaction.RETENTION_POLICY_BASED, realm=realm
)
ArchivedMessage.objects.filter(id__in=chunk).update(archive_transaction=archive_transaction)
move_related_objects_to_archive(chunk) move_related_objects_to_archive(chunk)
delete_messages(chunk) delete_messages(chunk)
message_count += len(chunk) message_count += len(chunk)
@@ -251,16 +255,16 @@ def run_archiving_in_chunks(message_id_chunks: Iterator[List[int]]) -> int:
return message_count return message_count
def archive_messages_by_recipient(recipient: Recipient, message_retention_days: int, def archive_messages_by_recipient(recipient: Recipient, message_retention_days: int,
chunk_size: int=MESSAGE_BATCH_SIZE) -> int: realm: Realm, chunk_size: int=MESSAGE_BATCH_SIZE) -> int:
message_id_chunks = move_expired_messages_to_archive_by_recipient(recipient, message_retention_days, message_id_chunks = move_expired_messages_to_archive_by_recipient(recipient, message_retention_days,
chunk_size) chunk_size)
return run_archiving_in_chunks(message_id_chunks) return run_archiving_in_chunks(message_id_chunks, realm)
def archive_personal_and_huddle_messages(realm: Realm, chunk_size: int=MESSAGE_BATCH_SIZE) -> None: def archive_personal_and_huddle_messages(realm: Realm, chunk_size: int=MESSAGE_BATCH_SIZE) -> None:
logger.info("Archiving personal and huddle messages for realm " + realm.string_id) logger.info("Archiving personal and huddle messages for realm " + realm.string_id)
message_id_chunks = move_expired_personal_and_huddle_messages_to_archive(realm, chunk_size) message_id_chunks = move_expired_personal_and_huddle_messages_to_archive(realm, chunk_size)
message_count = run_archiving_in_chunks(message_id_chunks) message_count = run_archiving_in_chunks(message_id_chunks, realm)
logger.info("Done. Archived {} messages".format(message_count)) logger.info("Done. Archived {} messages".format(message_count))
@@ -284,7 +288,7 @@ def archive_stream_messages(realm: Realm, chunk_size: int=MESSAGE_BATCH_SIZE) ->
message_count = 0 message_count = 0
for recipient in recipients: for recipient in recipients:
message_count += archive_messages_by_recipient( message_count += archive_messages_by_recipient(
recipient, retention_policy_dict[recipient.type_id], chunk_size recipient, retention_policy_dict[recipient.type_id], realm, chunk_size
) )
logger.info("Done. Archived {} messages.".format(message_count)) logger.info("Done. Archived {} messages.".format(message_count))
@@ -306,7 +310,9 @@ def move_messages_to_archive(message_ids: List[int]) -> None:
if not messages: if not messages:
raise Message.DoesNotExist raise Message.DoesNotExist
ArchivedMessage.objects.bulk_create([ArchivedMessage(**message) for message in messages]) archive_transaction = ArchiveTransaction.objects.create(type=ArchiveTransaction.MANUAL)
ArchivedMessage.objects.bulk_create([ArchivedMessage(archive_transaction=archive_transaction,
**message) for message in messages])
move_related_objects_to_archive(message_ids) move_related_objects_to_archive(message_ids)
# Remove data from main tables # Remove data from main tables

View File

@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.11.20 on 2019-06-23 21:20
from __future__ import unicode_literals
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
class Migration(migrations.Migration):
dependencies = [
('zerver', '0230_rename_to_enable_stream_audible_notifications'),
]
operations = [
migrations.CreateModel(
name='ArchiveTransaction',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('timestamp', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('restored', models.BooleanField(db_index=True, default=False)),
('type', models.PositiveSmallIntegerField(db_index=True)),
('realm', models.ForeignKey(null=True, on_delete=django.db.models.deletion.CASCADE, to='zerver.Realm')),
],
),
migrations.RemoveField(
model_name='archivedattachment',
name='archive_timestamp',
),
migrations.RemoveField(
model_name='archivedmessage',
name='archive_timestamp',
),
migrations.RemoveField(
model_name='archivedreaction',
name='archive_timestamp',
),
migrations.RemoveField(
model_name='archivedsubmessage',
name='archive_timestamp',
),
migrations.RemoveField(
model_name='archivedusermessage',
name='archive_timestamp',
),
migrations.AddField(
model_name='archivedmessage',
name='archive_transaction',
field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.CASCADE, to='zerver.ArchiveTransaction'),
),
]

View File

@@ -1519,14 +1519,26 @@ class AbstractMessage(models.Model):
return "<%s: %s / %s / %s>" % (self.__class__.__name__, display_recipient, return "<%s: %s / %s / %s>" % (self.__class__.__name__, display_recipient,
self.subject, self.sender) self.subject, self.sender)
class ArchiveTransaction(models.Model):
timestamp = models.DateTimeField(default=timezone_now, db_index=True) # type: datetime.datetime
# Marks if the data archived in this transaction has been restored:
restored = models.BooleanField(default=False, db_index=True) # type: bool
type = models.PositiveSmallIntegerField(db_index=True) # type: int
# Valid types:
RETENTION_POLICY_BASED = 1 # Archiving was executed due to automated retention policies
MANUAL = 2 # Archiving was run manually, via move_messages_to_archive function
# ForeignKey to the realm with which objects archived in this transaction are associated.
# If type is set to MANUAL, this should be null.
realm = models.ForeignKey(Realm, null=True, on_delete=CASCADE) # type: Optional[Realm]
class ArchivedMessage(AbstractMessage): class ArchivedMessage(AbstractMessage):
"""Used as a temporary holding place for deleted messages before they """Used as a temporary holding place for deleted messages before they
are permanently deleted. This is an important part of a robust are permanently deleted. This is an important part of a robust
'message retention' feature. 'message retention' feature.
""" """
archive_timestamp = models.DateTimeField(default=timezone_now, db_index=True) # type: datetime.datetime archive_transaction = models.ForeignKey(ArchiveTransaction, on_delete=CASCADE, null=True) # type: Optional[ArchiveTransaction]
class Message(AbstractMessage): class Message(AbstractMessage):
@@ -1671,7 +1683,6 @@ class SubMessage(AbstractSubMessage):
class ArchivedSubMessage(AbstractSubMessage): class ArchivedSubMessage(AbstractSubMessage):
message = models.ForeignKey(ArchivedMessage, on_delete=CASCADE) # type: ArchivedMessage message = models.ForeignKey(ArchivedMessage, on_delete=CASCADE) # type: ArchivedMessage
archive_timestamp = models.DateTimeField(default=timezone_now, db_index=True) # type: datetime.datetime
post_save.connect(flush_submessage, sender=SubMessage) post_save.connect(flush_submessage, sender=SubMessage)
@@ -1730,7 +1741,6 @@ class Reaction(AbstractReaction):
class ArchivedReaction(AbstractReaction): class ArchivedReaction(AbstractReaction):
message = models.ForeignKey(ArchivedMessage, on_delete=CASCADE) # type: ArchivedMessage message = models.ForeignKey(ArchivedMessage, on_delete=CASCADE) # type: ArchivedMessage
archive_timestamp = models.DateTimeField(default=timezone_now, db_index=True) # type: datetime.datetime
# Whenever a message is sent, for each user subscribed to the # Whenever a message is sent, for each user subscribed to the
# corresponding Recipient object, we add a row to the UserMessage # corresponding Recipient object, we add a row to the UserMessage
@@ -1859,8 +1869,6 @@ class ArchivedUserMessage(AbstractUserMessage):
a robust 'message retention' feature. a robust 'message retention' feature.
""" """
message = models.ForeignKey(ArchivedMessage, on_delete=CASCADE) # type: Message message = models.ForeignKey(ArchivedMessage, on_delete=CASCADE) # type: Message
archive_timestamp = models.DateTimeField(default=timezone_now, db_index=True) # type: datetime.datetime
class AbstractAttachment(models.Model): class AbstractAttachment(models.Model):
file_name = models.TextField(db_index=True) # type: str file_name = models.TextField(db_index=True) # type: str
@@ -1895,10 +1903,8 @@ class ArchivedAttachment(AbstractAttachment):
before they are permanently deleted. This is an important part of before they are permanently deleted. This is an important part of
a robust 'message retention' feature. a robust 'message retention' feature.
""" """
archive_timestamp = models.DateTimeField(default=timezone_now, db_index=True) # type: datetime.datetime
messages = models.ManyToManyField(ArchivedMessage) # type: Manager messages = models.ManyToManyField(ArchivedMessage) # type: Manager
class Attachment(AbstractAttachment): class Attachment(AbstractAttachment):
messages = models.ManyToManyField(Message) # type: Manager messages = models.ManyToManyField(Message) # type: Manager