# Generated by Django 4.0.7 on 2022-09-30 20:30

import ast

import orjson
from django.db import migrations, transaction
from django.db.backends.base.schema import BaseDatabaseSchemaEditor
from django.db.migrations.state import StateApps
from django.db.models import F, JSONField, Model
from django.db.models.functions import Cast, JSONObject

# This migration is mostly the same as
# backfill_remote_realmauditlog_extradata_to_json_field in zilencer.

OLD_VALUE = "1"
NEW_VALUE = "2"
USER_FULL_NAME_CHANGED = 124
REALM_DISCOUNT_CHANGED = 209

BATCH_SIZE = 5000

DISCOUNT_DATA_TEMPLATE = """Audit log entry {id} with event type REALM_DISCOUNT_CHANGED is skipped.
The data consistency needs to be manually checked.
Discount data to remove after the upcoming JSONField migration:
{data_to_remove}
Discount data to keep after the upcoming JSONField migration:
{data_to_keep}
"""

OVERWRITE_TEMPLATE = """Audit log entry with id {id} has had its extra_data_json inconsistently overwritten.
The old value is:
{old_value}
The new value is:
{new_value}
"""


@transaction.atomic
def do_bulk_backfill_extra_data(
    audit_log_model: type[Model], id_lower_bound: int, id_upper_bound: int
) -> None:
    # First handle the special case for audit logs with the
    # USER_FULL_NAME_CHANGED event, which stores the full name not as
    # str(dict()) but as a plain str. Note that we only update the entries
    # where extra_data_json has the default value, because we do not want to
    # override existing audit log entries with a NEW_VALUE of None for
    # extra_data_json. We do not need to skip existing entries for other parts
    # of the backfill, because we have double-write implemented, so the
    # backfilled value will still be consistent.
    audit_log_model._default_manager.filter(
        event_type=USER_FULL_NAME_CHANGED,
        id__range=(id_lower_bound, id_upper_bound),
        extra_data_json={},
        # extra_data used to keep track of the old name. As a result, we know
        # nothing about what NEW_VALUE would be, especially if the name has
        # been changed multiple times. extra_data_json is a JSONObject whose
        # OLD_VALUE and NEW_VALUE are mapped from the value of the extra_data
        # field (which is just an old full name string) and None, respectively.
        # Documentation for JSONObject:
        # https://docs.djangoproject.com/en/5.0/ref/models/database-functions/#jsonobject
    ).update(extra_data_json=JSONObject(**{OLD_VALUE: "extra_data", NEW_VALUE: None}))

    inconsistent_extra_data_json: list[tuple[int, str, object, object]] = []
    # A dict converted with str() will start with an opening curly brace
    # followed by a single quote, as opposed to a JSON-encoded value, which
    # will use a _double_ quote. We use this to filter out those entries with
    # malformed extra_data to be handled later. This should only update rows
    # with extra_data populated by orjson.dumps.
    # The first query below checks for entries that would have extra_data_json
    # overwritten by the migration with a value inconsistent with its previous
    # value.
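    # A rough illustration of the quoting heuristic above (assumed example
    # values, not real audit log rows):
    #     str({'key': 1})                   == "{'key': 1}"   # starts with "{'"
    #     orjson.dumps({"key": 1}).decode() == '{"key":1}'    # starts with '{"'
    # so the extra_data__startswith="{'" filters below separate str()-dumped
    # dicts from orjson-encoded JSON.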
    inconsistent_extra_data_json.extend(
        audit_log_model._default_manager.filter(
            extra_data__isnull=False, id__range=(id_lower_bound, id_upper_bound)
        )
        .annotate(new_extra_data_json=Cast("extra_data", output_field=JSONField()))
        .exclude(extra_data__startswith="{'")
        .exclude(event_type=USER_FULL_NAME_CHANGED)
        .exclude(extra_data_json={})
        .exclude(extra_data_json=F("new_extra_data_json"))
        .values_list("id", "extra_data", "extra_data_json", "new_extra_data_json")
    )
    (
        audit_log_model._default_manager.filter(
            extra_data__isnull=False,
            id__range=(id_lower_bound, id_upper_bound),
            extra_data_json__inconsistent_old_extra_data__isnull=True,
        )
        .exclude(extra_data__startswith="{'")
        .exclude(event_type=USER_FULL_NAME_CHANGED)
        .update(extra_data_json=Cast("extra_data", output_field=JSONField()))
    )

    python_valued_audit_log_entries = audit_log_model._default_manager.filter(
        extra_data__startswith="{'",
        id__range=(id_lower_bound, id_upper_bound),
        extra_data_json__inconsistent_old_extra_data__isnull=True,
    )
    for audit_log_entry in python_valued_audit_log_entries:
        # extra_data for entries that store a dict stringified with
        # builtins.str() is converted back with ast.literal_eval for safety
        # and efficiency. str()'d extra_data with the REALM_DISCOUNT_CHANGED
        # event type is not handled by this migration. We expect that all such
        # entries are manually converted beforehand, or an error will occur
        # during the migration, because ast.literal_eval does not allow the
        # evaluation of Decimal.
        old_value = audit_log_entry.extra_data_json  # type: ignore[attr-defined] # The migration cannot depend on zerver.models, which contains the real type of the RealmAuditLog model, so it cannot be properly typed.
        if audit_log_entry.event_type == REALM_DISCOUNT_CHANGED:  # type: ignore[attr-defined] # Explained above.
            print(
                DISCOUNT_DATA_TEMPLATE.format(
                    id=audit_log_entry.id,  # type: ignore[attr-defined] # Explained above.
                    data_to_remove=audit_log_entry.extra_data,  # type: ignore[attr-defined] # Explained above.
                    data_to_keep=old_value,
                )
            )
            continue
        new_value = ast.literal_eval(audit_log_entry.extra_data)  # type: ignore[attr-defined] # Explained above.
        if old_value not in ({}, new_value):
            inconsistent_extra_data_json.append(
                (audit_log_entry.id, audit_log_entry.extra_data, old_value, new_value)  # type: ignore[attr-defined] # Explained above.
            )
        audit_log_entry.extra_data_json = new_value  # type: ignore[attr-defined] # Explained above.
    audit_log_model._default_manager.bulk_update(
        python_valued_audit_log_entries, fields=["extra_data_json"]
    )

    if inconsistent_extra_data_json:
        audit_log_entries = []
        for (
            audit_log_entry_id,
            old_extra_data,
            old_extra_data_json,
            new_extra_data_json,
        ) in inconsistent_extra_data_json:
            audit_log_entry = audit_log_model._default_manager.get(id=audit_log_entry_id)
            assert isinstance(old_extra_data_json, dict)
            if "inconsistent_old_extra_data" in old_extra_data_json:
                # Skip entries that have been backfilled and detected as
                # anomalies before.
                continue
            assert isinstance(new_extra_data_json, dict)
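            # Illustrative merge semantics (assumed values, not real rows): if
            # the freshly computed new_extra_data_json is {"1": "new"} while
            # the entry previously held {"1": "old"}, the row ends up as
            #     {"1": "new",
            #      "inconsistent_old_extra_data": <raw extra_data string>,
            #      "inconsistent_old_extra_data_json": {"1": "old"}}
            # i.e. the new value wins while both old forms are kept for manual
            # review; the "inconsistent_old_extra_data" key also marks the row
            # as already handled for later runs (see the filters above).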
            audit_log_entry.extra_data_json = {  # type: ignore[attr-defined] # Explained above.
                **new_extra_data_json,
                "inconsistent_old_extra_data": old_extra_data,
                "inconsistent_old_extra_data_json": old_extra_data_json,
            }
            audit_log_entries.append(audit_log_entry)
            print(
                OVERWRITE_TEMPLATE.format(
                    id=audit_log_entry_id,
                    old_value=orjson.dumps(old_extra_data_json).decode(),
                    new_value=orjson.dumps(new_extra_data_json).decode(),
                )
            )
        audit_log_model._default_manager.bulk_update(audit_log_entries, fields=["extra_data_json"])


def backfill_extra_data(apps: StateApps, schema_editor: BaseDatabaseSchemaEditor) -> None:
    audit_log_model = apps.get_model("zerver", "RealmAuditLog")
    if not audit_log_model.objects.filter(extra_data__isnull=False).exists():
        return

    audit_log_entries = audit_log_model.objects.filter(extra_data__isnull=False)
    id_lower_bound = audit_log_entries.earliest("id").id
    id_upper_bound = audit_log_entries.latest("id").id
    while id_lower_bound <= id_upper_bound:
        do_bulk_backfill_extra_data(
            audit_log_model, id_lower_bound, min(id_lower_bound + BATCH_SIZE, id_upper_bound)
        )
        id_lower_bound += BATCH_SIZE + 1


class Migration(migrations.Migration):
    atomic = False

    dependencies = [
        ("zerver", "0459_remove_invalid_characters_from_user_group_name"),
    ]

    operations = [
        migrations.RunPython(
            backfill_extra_data, reverse_code=migrations.RunPython.noop, elidable=True
        ),
    ]
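
# A sanity check of the batching arithmetic in backfill_extra_data
# (illustrative only, assuming id_lower_bound starts at 1): the first call to
# do_bulk_backfill_extra_data covers ids 1..5001 (id__range is inclusive on
# both ends), the next lower bound becomes 5002, and so on, so consecutive
# batches neither overlap nor leave gaps. Because the migration sets
# atomic = False while do_bulk_backfill_extra_data is wrapped in
# @transaction.atomic, each batch commits on its own, keeping transactions
# small on deployments with large RealmAuditLog tables.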