zulip/zerver/migrations/0460_backfill_realmauditlog_extradata_to_json_field.py

# Generated by Django 4.0.7 on 2022-09-30 20:30

import ast

import orjson
from django.db import migrations, transaction
from django.db.backends.base.schema import BaseDatabaseSchemaEditor
from django.db.migrations.state import StateApps
from django.db.models import F, JSONField, Model
from django.db.models.functions import Cast, JSONObject

# This migration is mostly the same as
# backfill_remote_realmauditlog_extradata_to_json_field in zilencer.

OLD_VALUE = "1"
NEW_VALUE = "2"
USER_FULL_NAME_CHANGED = 124
REALM_DISCOUNT_CHANGED = 209

BATCH_SIZE = 5000

DISCOUNT_DATA_TEMPLATE = """Audit log entry {id} with event type REALM_DISCOUNT_CHANGED is skipped.
The data consistency needs to be manually checked.
Discount data to remove after the upcoming JSONField migration:
{data_to_remove}
Discount data to keep after the upcoming JSONField migration:
{data_to_keep}
"""

OVERWRITE_TEMPLATE = """Audit log entry with id {id} has had its extra_data_json inconsistently overwritten.
The old value is:
{old_value}
The new value is:
{new_value}
"""


@transaction.atomic
def do_bulk_backfill_extra_data(
audit_log_model: type[Model], id_lower_bound: int, id_upper_bound: int
) -> None:
    # First handle the special case for audit logs with the
    # USER_FULL_NAME_CHANGED event, which stores the full name not as
    # str(dict()) but as a plain str. Note that we only update the entries
    # where extra_data_json has the default value, because we do not want to
    # overwrite existing audit log entries with a NEW_VALUE of None for
    # extra_data_json. We do not need to skip existing entries for other
    # parts of the backfill, because double-write is implemented so that the
    # backfilled value will still be consistent.
audit_log_model._default_manager.filter(
event_type=USER_FULL_NAME_CHANGED,
id__range=(id_lower_bound, id_upper_bound),
extra_data_json={},
        # extra_data used to keep track of the old name. As a result, we know
        # nothing about what NEW_VALUE would be, especially if the name has
        # been changed multiple times. extra_data_json is a JSONObject whose
        # OLD_VALUE and NEW_VALUE are mapped from the value of the extra_data
        # field (which is just an old full name string) and None, respectively.
# Documentation for JSONObject:
# https://docs.djangoproject.com/en/5.0/ref/models/database-functions/#jsonobject
).update(extra_data_json=JSONObject(**{OLD_VALUE: "extra_data", NEW_VALUE: None}))
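
    # Illustrative example (hypothetical values): a USER_FULL_NAME_CHANGED
    # row with extra_data="Old Full Name" and extra_data_json={} ends up
    # with extra_data_json={"1": "Old Full Name", "2": None} after this
    # update.
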
inconsistent_extra_data_json: list[tuple[int, str, object, object]] = []
    # A dict converted with str() will start with an open brace followed by a
    # single quote, as opposed to a JSON-encoded value, which will use a
    # _double_ quote. We use this to filter out entries with such malformed
    # extra_data, to be handled later. This should only update rows whose
    # extra_data was populated with orjson.dumps.
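    # For example, str({'key': 1}) produces "{'key': 1}" (single quotes),
    # while orjson.dumps({"key": 1}).decode() produces '{"key":1}'
    # (double quotes).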
    # The first query below checks for entries whose extra_data_json would be
    # overwritten by the migration with a value inconsistent with its
    # previous value.
inconsistent_extra_data_json.extend(
audit_log_model._default_manager.filter(
extra_data__isnull=False, id__range=(id_lower_bound, id_upper_bound)
)
.annotate(new_extra_data_json=Cast("extra_data", output_field=JSONField()))
.exclude(extra_data__startswith="{'")
.exclude(event_type=USER_FULL_NAME_CHANGED)
.exclude(extra_data_json={})
.exclude(extra_data_json=F("new_extra_data_json"))
.values_list("id", "extra_data", "extra_data_json", "new_extra_data_json")
)
(
audit_log_model._default_manager.filter(
extra_data__isnull=False,
id__range=(id_lower_bound, id_upper_bound),
extra_data_json__inconsistent_old_extra_data__isnull=True,
)
.exclude(extra_data__startswith="{'")
.exclude(event_type=USER_FULL_NAME_CHANGED)
.update(extra_data_json=Cast("extra_data", output_field=JSONField()))
)
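    # At this point, rows whose extra_data was already JSON-encoded (written
    # with orjson.dumps, so double-quoted) have been cast directly into
    # extra_data_json; e.g. extra_data='{"key": 1}' becomes
    # extra_data_json={"key": 1}.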
python_valued_audit_log_entries = audit_log_model._default_manager.filter(
extra_data__startswith="{'",
id__range=(id_lower_bound, id_upper_bound),
extra_data_json__inconsistent_old_extra_data__isnull=True,
)
for audit_log_entry in python_valued_audit_log_entries:
        # extra_data for entries that store dicts stringified with
        # builtins.str() is converted back with ast.literal_eval for safety
        # and efficiency. str()'d extra_data with the REALM_DISCOUNT_CHANGED
        # event type is not handled by this migration. We expect all such
        # entries to have been manually converted beforehand; otherwise an
        # error will occur during the migration, because ast.literal_eval
        # does not allow the evaluation of Decimal.
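        # For example, ast.literal_eval("{'key': 1}") returns {'key': 1},
        # whereas ast.literal_eval("Decimal('25')") raises ValueError,
        # because Decimal(...) is a function call rather than a literal.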
old_value = audit_log_entry.extra_data_json # type: ignore[attr-defined] # The migration cannot depend on zerver.models, which contains the real type of the RealmAuditLog model, so it cannot be properly typed.
if audit_log_entry.event_type == REALM_DISCOUNT_CHANGED: # type: ignore[attr-defined] # Explained above.
print(
DISCOUNT_DATA_TEMPLATE.format(
id=audit_log_entry.id, # type: ignore[attr-defined] # Explained above.
data_to_remove=audit_log_entry.extra_data, # type: ignore[attr-defined] # Explained above.
data_to_keep=old_value,
)
)
continue
new_value = ast.literal_eval(audit_log_entry.extra_data) # type: ignore[attr-defined] # Explained above.
if old_value not in ({}, new_value):
inconsistent_extra_data_json.append(
(audit_log_entry.id, audit_log_entry.extra_data, old_value, new_value) # type: ignore[attr-defined] # Explained above.
)
audit_log_entry.extra_data_json = new_value # type: ignore[attr-defined] # Explained above.
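    # The for loop above has evaluated the queryset, so bulk_update operates
    # on the cached, modified instances rather than refetching the rows.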
audit_log_model._default_manager.bulk_update(
python_valued_audit_log_entries, fields=["extra_data_json"]
)
if inconsistent_extra_data_json:
audit_log_entries = []
for (
audit_log_entry_id,
old_extra_data,
old_extra_data_json,
new_extra_data_json,
) in inconsistent_extra_data_json:
audit_log_entry = audit_log_model._default_manager.get(id=audit_log_entry_id)
assert isinstance(old_extra_data_json, dict)
if "inconsistent_old_extra_data" in old_extra_data_json:
# Skip entries that have been backfilled and detected as
# anomalies before.
continue
assert isinstance(new_extra_data_json, dict)
audit_log_entry.extra_data_json = { # type: ignore[attr-defined] # Explained above.
**new_extra_data_json,
"inconsistent_old_extra_data": old_extra_data,
"inconsistent_old_extra_data_json": old_extra_data_json,
}
audit_log_entries.append(audit_log_entry)
print(
OVERWRITE_TEMPLATE.format(
id=audit_log_entry_id,
old_value=orjson.dumps(old_extra_data_json).decode(),
new_value=orjson.dumps(new_extra_data_json).decode(),
)
)
audit_log_model._default_manager.bulk_update(audit_log_entries, fields=["extra_data_json"])


def backfill_extra_data(apps: StateApps, schema_editor: BaseDatabaseSchemaEditor) -> None:
audit_log_model = apps.get_model("zerver", "RealmAuditLog")
if not audit_log_model.objects.filter(extra_data__isnull=False).exists():
return
audit_log_entries = audit_log_model.objects.filter(extra_data__isnull=False)
id_lower_bound = audit_log_entries.earliest("id").id
id_upper_bound = audit_log_entries.latest("id").id
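    # Each iteration covers an inclusive ID range of up to BATCH_SIZE + 1
    # rows; for example (hypothetical bounds), with id_lower_bound=1 the
    # batches are [1, 5001], [5002, 10002], and so on.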
while id_lower_bound <= id_upper_bound:
do_bulk_backfill_extra_data(
audit_log_model, id_lower_bound, min(id_lower_bound + BATCH_SIZE, id_upper_bound)
)
id_lower_bound += BATCH_SIZE + 1


class Migration(migrations.Migration):
atomic = False
dependencies = [
("zerver", "0459_remove_invalid_characters_from_user_group_name"),
]
operations = [
migrations.RunPython(
backfill_extra_data, reverse_code=migrations.RunPython.noop, elidable=True
),
]