zulip/zerver/migrations/0460_backfill_realmauditlog_extradata_to_json_field.py

# Generated by Django 4.0.7 on 2022-09-30 20:30

import ast

import orjson
from django.db import migrations, transaction
from django.db.backends.base.schema import BaseDatabaseSchemaEditor
from django.db.migrations.state import StateApps
from django.db.models import F, JSONField, Model
from django.db.models.functions import Cast, JSONObject

# This migration is mostly the same as
# backfill_remote_realmauditlog_extradata_to_json_field in zilencer.

OLD_VALUE = "1"
NEW_VALUE = "2"
USER_FULL_NAME_CHANGED = 124
REALM_DISCOUNT_CHANGED = 209

BATCH_SIZE = 5000

DISCOUNT_DATA_TEMPLATE = """Audit log entry {id} with event type REALM_DISCOUNT_CHANGED is skipped.
The data consistency needs to be manually checked.
Discount data to remove after the upcoming JSONField migration:
{data_to_remove}
Discount data to keep after the upcoming JSONField migration:
{data_to_keep}
"""

OVERWRITE_TEMPLATE = """Audit log entry with id {id} has had its extra_data_json inconsistently overwritten.
The old value is:
{old_value}
The new value is:
{new_value}
"""


@transaction.atomic
def do_bulk_backfill_extra_data(
audit_log_model: type[Model], id_lower_bound: int, id_upper_bound: int
) -> None:
    # First handle the special case for audit logs with the
    # USER_FULL_NAME_CHANGED event, which stores the full name not as
    # str(dict()) but as a plain str. Note that we only update the entries
    # where extra_data_json has the default value, because we do not want to
    # overwrite existing audit log entries with a NEW_VALUE of None for
    # extra_data_json. We do not need to skip existing entries for other
    # parts of the backfill, because double-write is implemented so that the
    # backfilled value will still be consistent.
audit_log_model._default_manager.filter(
event_type=USER_FULL_NAME_CHANGED,
id__range=(id_lower_bound, id_upper_bound),
extra_data_json={},
        # extra_data used to keep track of the old name. As a result, we know
        # nothing about what NEW_VALUE would be, especially if the name has
        # been changed multiple times. extra_data_json is a JSONObject whose
        # OLD_VALUE and NEW_VALUE are mapped from the value of the extra_data
        # field (which is just an old full name string) and None, respectively.
# Documentation for JSONObject:
# https://docs.djangoproject.com/en/5.0/ref/models/database-functions/#jsonobject
).update(extra_data_json=JSONObject(**{OLD_VALUE: "extra_data", NEW_VALUE: None}))
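
    # Illustrative example (hypothetical values): a USER_FULL_NAME_CHANGED
    # row with extra_data="Old Full Name" and extra_data_json={} ends up
    # with extra_data_json={"1": "Old Full Name", "2": None} after this
    # update.
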
inconsistent_extra_data_json: list[tuple[int, str, object, object]] = []
    # A dict converted with str() will start with an open brace followed by a
    # single quote, as opposed to a JSON-encoded value, which will use a
    # _double_ quote. We use this to filter out entries with such malformed
    # extra_data, to be handled later. This should only update rows whose
    # extra_data was populated with orjson.dumps.
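    # For example, str({'key': 1}) produces "{'key': 1}" (single quotes),
    # while orjson.dumps({"key": 1}).decode() produces '{"key":1}'
    # (double quotes).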
    # The first query below checks for entries whose extra_data_json would be
    # overwritten by the migration with a value inconsistent with its
    # previous value.
inconsistent_extra_data_json.extend(
audit_log_model._default_manager.filter(
extra_data__isnull=False, id__range=(id_lower_bound, id_upper_bound)
)
.annotate(new_extra_data_json=Cast("extra_data", output_field=JSONField()))
.exclude(extra_data__startswith="{'")
.exclude(event_type=USER_FULL_NAME_CHANGED)
.exclude(extra_data_json={})
.exclude(extra_data_json=F("new_extra_data_json"))
.values_list("id", "extra_data", "extra_data_json", "new_extra_data_json")
)
(
audit_log_model._default_manager.filter(
extra_data__isnull=False,
id__range=(id_lower_bound, id_upper_bound),
extra_data_json__inconsistent_old_extra_data__isnull=True,
)
.exclude(extra_data__startswith="{'")
.exclude(event_type=USER_FULL_NAME_CHANGED)
.update(extra_data_json=Cast("extra_data", output_field=JSONField()))
)
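    # At this point, rows whose extra_data was already JSON-encoded (written
    # with orjson.dumps, so double-quoted) have been cast directly into
    # extra_data_json; e.g. extra_data='{"key": 1}' becomes
    # extra_data_json={"key": 1}.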
python_valued_audit_log_entries = audit_log_model._default_manager.filter(
extra_data__startswith="{'",
id__range=(id_lower_bound, id_upper_bound),
extra_data_json__inconsistent_old_extra_data__isnull=True,
)
for audit_log_entry in python_valued_audit_log_entries:
        # extra_data for entries that store dicts stringified with
        # builtins.str() is converted back with ast.literal_eval for safety
        # and efficiency. str()'d extra_data with the REALM_DISCOUNT_CHANGED
        # event type is not handled by this migration. We expect all such
        # entries to have been manually converted beforehand; otherwise an
        # error will occur during the migration, because ast.literal_eval
        # does not allow the evaluation of Decimal.
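        # For example, ast.literal_eval("{'key': 1}") returns {'key': 1},
        # whereas ast.literal_eval("Decimal('25')") raises ValueError,
        # because Decimal(...) is a function call rather than a literal.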
old_value = audit_log_entry.extra_data_json # type: ignore[attr-defined] # The migration cannot depend on zerver.models, which contains the real type of the RealmAuditLog model, so it cannot be properly typed.
if audit_log_entry.event_type == REALM_DISCOUNT_CHANGED: # type: ignore[attr-defined] # Explained above.
print(
DISCOUNT_DATA_TEMPLATE.format(
id=audit_log_entry.id, # type: ignore[attr-defined] # Explained above.
data_to_remove=audit_log_entry.extra_data, # type: ignore[attr-defined] # Explained above.
data_to_keep=old_value,
)
)
continue
new_value = ast.literal_eval(audit_log_entry.extra_data) # type: ignore[attr-defined] # Explained above.
if old_value not in ({}, new_value):
inconsistent_extra_data_json.append(
(audit_log_entry.id, audit_log_entry.extra_data, old_value, new_value) # type: ignore[attr-defined] # Explained above.
)
audit_log_entry.extra_data_json = new_value # type: ignore[attr-defined] # Explained above.
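    # The for loop above has evaluated the queryset, so bulk_update operates
    # on the cached, modified instances rather than refetching the rows.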
audit_log_model._default_manager.bulk_update(
python_valued_audit_log_entries, fields=["extra_data_json"]
)
if inconsistent_extra_data_json:
audit_log_entries = []
for (
audit_log_entry_id,
old_extra_data,
old_extra_data_json,
new_extra_data_json,
) in inconsistent_extra_data_json:
audit_log_entry = audit_log_model._default_manager.get(id=audit_log_entry_id)
assert isinstance(old_extra_data_json, dict)
if "inconsistent_old_extra_data" in old_extra_data_json:
# Skip entries that have been backfilled and detected as
# anomalies before.
continue
assert isinstance(new_extra_data_json, dict)
audit_log_entry.extra_data_json = { # type: ignore[attr-defined] # Explained above.
**new_extra_data_json,
"inconsistent_old_extra_data": old_extra_data,
"inconsistent_old_extra_data_json": old_extra_data_json,
}
audit_log_entries.append(audit_log_entry)
print(
OVERWRITE_TEMPLATE.format(
id=audit_log_entry_id,
old_value=orjson.dumps(old_extra_data_json).decode(),
new_value=orjson.dumps(new_extra_data_json).decode(),
)
)
audit_log_model._default_manager.bulk_update(audit_log_entries, fields=["extra_data_json"])


def backfill_extra_data(apps: StateApps, schema_editor: BaseDatabaseSchemaEditor) -> None:
audit_log_model = apps.get_model("zerver", "RealmAuditLog")
if not audit_log_model.objects.filter(extra_data__isnull=False).exists():
return
audit_log_entries = audit_log_model.objects.filter(extra_data__isnull=False)
id_lower_bound = audit_log_entries.earliest("id").id
id_upper_bound = audit_log_entries.latest("id").id
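    # Each iteration covers an inclusive ID range of up to BATCH_SIZE + 1
    # rows; for example (hypothetical bounds), with id_lower_bound=1 the
    # batches are [1, 5001], [5002, 10002], and so on.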
while id_lower_bound <= id_upper_bound:
do_bulk_backfill_extra_data(
audit_log_model, id_lower_bound, min(id_lower_bound + BATCH_SIZE, id_upper_bound)
)
id_lower_bound += BATCH_SIZE + 1


class Migration(migrations.Migration):
atomic = False
dependencies = [
("zerver", "0459_remove_invalid_characters_from_user_group_name"),
]
operations = [
migrations.RunPython(
backfill_extra_data, reverse_code=migrations.RunPython.noop, elidable=True
),
]