Files
zulip/zerver/migrations/0257_fix_has_link_attribute.py
Tim Abbott 7bde70bb52 migrations: Batch fix_has_link_attribute migration.
This avoids risk of OOM issues on servers with relatively limited RAM
and millions of messages of history; apparently, fetching all messages
ordered by ID could be quite memory-intensive even with an iterator
usage model.

Fortunately, we have other migrations that already follow this pattern
of iterating over messages, so it's easy to borrow existing code to
make this migration run reasonably.
2019-12-12 15:29:49 -08:00

94 lines
3.5 KiB
Python

# -*- coding: utf-8 -*-
# Generated by Django 1.11.24 on 2019-10-07 05:25
from __future__ import unicode_literals
from django.db import migrations
from django.db.backends.postgresql_psycopg2.schema import DatabaseSchemaEditor
from django.db.migrations.state import StateApps
import lxml
import time
BATCH_SIZE = 1000
def process_batch(apps: StateApps, id_start: int, id_end: int, last_id: int) -> None:
    """Recompute has_link/has_image/has_attachment for one range of messages.

    Every Message whose id lies in the inclusive range [id_start, id_end]
    is visited in ascending id order.  The three flags are recomputed from
    the message's rendered content and its attachment relation, and the
    row is saved only when at least one stored flag differs.

    Args:
        apps: Migration-state app registry used to look up the Message model.
        id_start: Lowest message id in the batch (inclusive).
        id_end: Highest message id in the batch (inclusive).
        last_id: Highest message id overall; used only in progress output.
    """
    # A bare `import lxml` does not load the `lxml.html` submodule, so
    # import it explicitly here instead of relying on some other module
    # having imported it first.
    import lxml.html

    Message = apps.get_model('zerver', 'Message')
    for message in Message.objects.filter(id__gte=id_start, id__lte=id_end).order_by("id"):
        if message.rendered_content == "":
            # There have been bugs in the past that made it possible
            # for a message to have "" as its rendered_content; we
            # need to skip those because lxml won't process them.
            #
            # They should safely already have the correct state
            # has_link=has_image=has_attachment=False.
            continue

        if message.id % 1000 == 0:
            print("Processed %s / %s" % (message.id, last_id))

        # Because we maintain the Attachment table, this should be as
        # simple as just checking if there are any Attachment objects
        # associated with this message.
        has_attachment = message.attachment_set.exists()

        # For has_link and has_image, we need to parse the messages.
        lxml_obj = lxml.html.fromstring(message.rendered_content)

        # Links are simple -- look for any <a> element in the message.
        has_link = bool(lxml_obj.xpath("//a"))

        # has_image refers to inline image previews, so we just check
        # for the relevant CSS class.
        has_image = bool(lxml_obj.find_class("message_inline_image"))

        if (message.has_link == has_link and
                message.has_attachment == has_attachment and
                message.has_image == has_image):
            # No need to spend time with the database if there aren't changes.
            continue

        message.has_image = has_image
        message.has_link = has_link
        message.has_attachment = has_attachment
        message.save(update_fields=['has_link', 'has_attachment', 'has_image'])
def fix_has_link(apps: StateApps, schema_editor: DatabaseSchemaEditor) -> None:
    """Walk the whole Message table in id-ordered batches, fixing link flags.

    The id space from 0 up to the current highest message id is split into
    consecutive inclusive ranges, each handed to process_batch(), with a
    short sleep between batches.

    Args:
        apps: Migration-state app registry used to look up the Message model.
        schema_editor: Unused; present because RunPython passes it.
    """
    Message = apps.get_model('zerver', 'Message')
    if not Message.objects.exists():
        # Nothing to do, and Message.objects.latest() will crash.
        return

    # This migration logic assumes that either the server is not
    # running, or that it's being run after the logic to correct how
    # `has_link` and friends are set for new messages has been
    # deployed.
    last_id = Message.objects.latest("id").id

    id_range_lower_bound = 0
    id_range_upper_bound = 0 + BATCH_SIZE
    while id_range_upper_bound <= last_id:
        process_batch(apps, id_range_lower_bound, id_range_upper_bound, last_id)
        id_range_lower_bound = id_range_upper_bound + 1
        id_range_upper_bound = id_range_lower_bound + BATCH_SIZE
        # Brief pause between batches.
        time.sleep(0.1)

    # Process whatever remains after the last full batch.  The comparison
    # must be inclusive: when last_id equals the lower bound (for example
    # last_id == 1001 with BATCH_SIZE == 1000) the remaining range holds
    # exactly one message, which a strict `>` would silently skip.
    if last_id >= id_range_lower_bound:
        process_batch(apps, id_range_lower_bound, last_id, last_id)
class Migration(migrations.Migration):
    """Data migration that re-derives the has_link family of Message flags."""

    # Opt out of wrapping the whole migration in a single transaction.
    atomic = False

    dependencies = [('zerver', '0256_userprofile_stream_set_recipient_column_values')]

    # Forward: run fix_has_link.  Reverse: nothing to undo.
    operations = [
        migrations.RunPython(
            code=fix_has_link,
            reverse_code=migrations.RunPython.noop,
        ),
    ]