retention: Fix OOM issues when deleting large numbers of transactions.
For unknown reasons, deleting 10,000s of ArchiveTransaction objects results in rapidly growing memory usage in the Django process running the job, eventually leading to an OOM kill. I don't understand why Django behaves that way; I would have expected the failure mode to instead be a serious load problem on the database server, but perhaps the way Django's internal deletion logic cascades the deletes to many millions of ArchiveMessage rows and other ForeignKey objects requires tracking a lot of data in memory.

The solution is the same in any case: batch the deletions so that a reasonable number of them execute at once. Deleting a single ArchiveTransaction at a time would likely result in a huge number of database queries in a loop, which performs very poorly.

So we balance by batching deletions in groups of 100 ArchiveTransactions. Testing this in production, I saw no spike of memory usage materially beyond that of a normal Django process, and each bulk-deletion transaction takes several seconds to process (meaning that per-transaction overhead is negligible).
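The same pattern generalizes beyond this commit. As a minimal sketch of the technique (the helper name delete_in_batches and its signature are illustrative assumptions, not part of Zulip's codebase):

    # Hypothetical helper illustrating the batching pattern described above;
    # not part of this commit.
    from typing import List

    from django.db.models import QuerySet


    def delete_in_batches(queryset: QuerySet, batch_size: int = 100) -> int:
        # Materialize only the primary keys, so the Python process holds a
        # flat list of ints rather than full model instances.
        ids: List[int] = list(queryset.values_list("id", flat=True))
        deleted = 0
        while ids:
            block, ids = ids[:batch_size], ids[batch_size:]
            # Each .delete() call cascades to dependent rows
            # (on_delete=CASCADE) in its own bounded query, keeping the
            # per-call memory footprint small.
            queryset.model.objects.filter(id__in=block).delete()
            deleted += len(block)
        return deleted

The batch size trades memory (too large reproduces the OOM) against query count (too small devolves into one query per object); 100 is the value chosen empirically here.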
@@ -20,6 +20,7 @@ logger = logging.getLogger('zulip.retention')
 log_to_file(logger, settings.RETENTION_LOG_PATH)
 
 MESSAGE_BATCH_SIZE = 1000
+TRANSACTION_DELETION_BATCH_SIZE = 100
 
 models_with_message_key: List[Dict[str, Any]] = [
     {
@@ -522,9 +523,14 @@ def restore_all_data_from_archive(restore_manual_transactions: bool=True) -> Non
 def clean_archived_data() -> None:
     logger.info("Cleaning old archive data.")
     check_date = timezone_now() - timedelta(days=settings.ARCHIVED_DATA_VACUUMING_DELAY_DAYS)
-    # Appropriate archived objects will get deleted through the on_delete=CASCADE property:
-    transactions = ArchiveTransaction.objects.filter(timestamp__lt=check_date)
-    count = transactions.count()
-    transactions.delete()
+    # Associated archived objects will get deleted through the on_delete=CASCADE property:
+    count = 0
+    transaction_ids = list(ArchiveTransaction.objects.filter(
+        timestamp__lt=check_date).values_list("id", flat=True))
+    while len(transaction_ids) > 0:
+        transaction_block = transaction_ids[0:TRANSACTION_DELETION_BATCH_SIZE]
+        transaction_ids = transaction_ids[TRANSACTION_DELETION_BATCH_SIZE:]
+        ArchiveTransaction.objects.filter(id__in=transaction_block).delete()
+        count += len(transaction_block)
 
     logger.info("Deleted %s old ArchiveTransactions.", count)