mirror of
https://github.com/zulip/zulip.git
synced 2025-11-21 06:58:31 +00:00
This avoids risk of OOM issues on servers with relatively limited RAM and millions of messages of history; apparently, fetching all messages ordered by ID could be quite memory-intensive even with an iterator usage model. Fortunately, we have other migrations that already follow this pattern of iterating over messages, so it's easy to borrow existing code to make this migration run reasonably.
94 lines
3.5 KiB
Python
94 lines
3.5 KiB
Python
# -*- coding: utf-8 -*-
|
|
# Generated by Django 1.11.24 on 2019-10-07 05:25
|
|
from __future__ import unicode_literals
|
|
|
|
from django.db import migrations
|
|
from django.db.backends.postgresql_psycopg2.schema import DatabaseSchemaEditor
|
|
from django.db.migrations.state import StateApps
|
|
|
|
import lxml
|
|
import time
|
|
# Step, in message IDs, between the lower and upper bound of each ID
# window that fix_has_link hands to process_batch.
BATCH_SIZE = 1000
|
|
|
|
def process_batch(apps: StateApps, id_start: int, id_end: int, last_id: int) -> None:
    """Recompute has_link, has_image and has_attachment for one ID range.

    Every Message whose id lies in [id_start, id_end] (both inclusive) is
    visited in ascending id order.  A row is only written back when at
    least one of the three flags differs from the recomputed value.

    Args:
        apps: Historical app registry used to look up the Message model.
        id_start: Lowest message id of the batch (inclusive).
        id_end: Highest message id of the batch (inclusive).
        last_id: Highest message id overall; only used in progress output.
    """
    # `import lxml` on its own does not load the `lxml.html` submodule, so
    # import it explicitly here rather than relying on some other module
    # having imported it first.
    import lxml.html

    Message = apps.get_model('zerver', 'Message')
    batch = Message.objects.filter(id__gte=id_start, id__lte=id_end).order_by("id")
    for message in batch:
        if not message.rendered_content:
            # There have been bugs in the past that made it possible
            # for a message to have "" as its rendered_content (and the
            # value may also be None if the message was never rendered
            # -- TODO confirm against the model); we need to skip those
            # because lxml won't process them.
            #
            # They should safely already have the correct state
            # has_link=has_image=has_attachment=False.
            continue

        if message.id % 1000 == 0:
            print("Processed %s / %s" % (message.id, last_id))

        # Because we maintain the Attachment table, this should be as
        # simple as just checking if there's any Attachment
        # objects associated with this message.
        has_attachment = message.attachment_set.exists()

        # For has_link and has_image, we need to parse the messages.
        # Links are simple -- look for a link in the message.
        lxml_obj = lxml.html.fromstring(message.rendered_content)
        has_link = bool(lxml_obj.xpath("//a"))

        # has_image refers to inline image previews, so we just check
        # for the relevant CSS class.
        has_image = bool(lxml_obj.find_class("message_inline_image"))

        if (message.has_link == has_link and
                message.has_attachment == has_attachment and
                message.has_image == has_image):
            # No need to spend time with the database if there aren't changes.
            continue

        message.has_image = has_image
        message.has_link = has_link
        message.has_attachment = has_attachment
        message.save(update_fields=['has_link', 'has_attachment', 'has_image'])
|
|
|
|
def fix_has_link(apps: StateApps, schema_editor: DatabaseSchemaEditor) -> None:
    """Walk the whole Message table in ID windows and fix the has_* flags.

    The table is processed in consecutive, non-overlapping inclusive ID
    ranges via process_batch, instead of in a single query over all
    messages, with a short sleep after each full window.

    Args:
        apps: Historical app registry used to look up the Message model.
        schema_editor: Unused; part of the RunPython callable signature.
    """
    Message = apps.get_model('zerver', 'Message')
    if not Message.objects.exists():
        # Nothing to do, and Message.objects.latest() will crash.
        return

    # This migration logic assumes that either the server is not
    # running, or that it's being run after the logic to correct how
    # `has_link` and friends are set for new messages has been
    # deployed.
    last_id = Message.objects.latest("id").id

    id_range_lower_bound = 0
    id_range_upper_bound = 0 + BATCH_SIZE
    while id_range_upper_bound <= last_id:
        process_batch(apps, id_range_lower_bound, id_range_upper_bound, last_id)

        # Both bounds are inclusive, so the next window starts one past
        # the previous upper bound.
        id_range_lower_bound = id_range_upper_bound + 1
        id_range_upper_bound = id_range_lower_bound + BATCH_SIZE
        # Brief pause between windows (presumably to limit database load).
        time.sleep(0.1)

    # Final, partial window.  The comparison must be `>=`: when last_id
    # is exactly the next lower bound (e.g. last_id == BATCH_SIZE + 1),
    # the newest message has not been covered by any window yet.
    if last_id >= id_range_lower_bound:
        process_batch(apps, id_range_lower_bound, last_id, last_id)
|
|
|
|
class Migration(migrations.Migration):
    # Data-only migration: recomputes the has_link / has_image /
    # has_attachment flags on existing messages via fix_has_link.

    # Not wrapped in one transaction; fix_has_link works through the
    # table batch by batch.
    atomic = False

    dependencies = [
        ('zerver', '0256_userprofile_stream_set_recipient_column_values'),
    ]

    # Reversing is a no-op: there is nothing to restore.
    operations = [
        migrations.RunPython(fix_has_link, reverse_code=migrations.RunPython.noop),
    ]
|