migrations: Backfill extra_data_json for audit log entries.

This migration is reasonably complex because of various anomalies in existing
data.

Note that there are cases where extra_data does not contain proper JSON, but
rather a Python dict stringified with str(), which uses single quotes. We
therefore need "ast.literal_eval" to cover those entries.

There is also a special case for "event_type == USER_FULL_NAME_CHANGED",
where extra_data is a plain str. This event_type is only used for
RealmAuditLog, so the zilencer migration script does not need to handle
it.
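
A rough sketch of how such a plain-string extra_data can be turned into a JSON
object; this is only a plausible shape, not the exact code from the zerver
migration:

```python
# Hypothetical sketch: wrap the plain old-full-name string under the OLD_VALUE
# key so the backfilled extra_data_json is still a JSON object.
OLD_VALUE = "1"


def wrap_full_name_extra_data(extra_data: str) -> dict:
    return {OLD_VALUE: extra_data}


assert wrap_full_name_extra_data("Old Full Name") == {"1": "Old Full Name"}
```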

The migration does not handle "event_type == REALM_DISCOUNT_CHANGED", because
its extra_data contains Decimal values and ast.literal_eval only allows Python
literals. We expect the admin to manually populate extra_data_json with the
JSON-encoded extra_data beforehand.
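
For example (with made-up values), a stringified payload containing Decimal is
a constructor call rather than a literal, so ast.literal_eval refuses it:

```python
import ast

# Hypothetical discount-changed payload stringified with str(); Decimal('25')
# is a call, not a literal, so ast.literal_eval raises ValueError.
discount_extra_data = "{'old_discount': Decimal('25'), 'new_discount': Decimal('50')}"

try:
    ast.literal_eval(discount_extra_data)
except ValueError:
    print("Cannot backfill automatically; convert this entry to JSON by hand.")
```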

The backfilling migration is chunked into batches to reduce the potential time
it blocks other queries.

The zilencer migration is mostly the same as the one for zerver, except that
the backfill helper is wrapped in a factory (so it can be applied to both
RemoteRealmAuditLog and RemoteZulipServerAuditLog) and handling for event
types that never occur in these models is removed.

**Logging and error recovery**

We print a warning when the extra_data_json field of an entry would be
overwritten by a value inconsistent with the one derived from extra_data.
Usually this only happens when extra_data was corrupted before this migration.
To prevent data loss, the possibly corrupted data is backed up in
extra_data_json under the keys "inconsistent_old_extra_data" and
"inconsistent_old_extra_data_json". Handling such inconsistent data requires
extra round trips to the database, but these entries are expected to be
infrequent.
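
Concretely, an inconsistent entry ends up with roughly this shape after the
backfill (all values here are made up):

```python
# The newly derived data stays at the top level, while the conflicting old
# values are preserved under the two backup keys instead of being discarded.
backfilled_extra_data_json = {
    "1": "value derived from extra_data",
    "inconsistent_old_extra_data": "{'1': 'previously stored, possibly corrupted value'}",
    "inconsistent_old_extra_data_json": {"1": "previously stored, possibly corrupted value"},
}
```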

This also prints messages when audit log entries contain Decimal values,
indicating that such entries are not backfilled. Note that entries with
Decimal values do not get the "inconsistent_old_extra_data_*" keys in the
JSONField, because they are never overwritten.
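
A hedged sketch of how such entries might be detected and reported; the helper
name, lookup, and message are illustrative rather than the migration's actual
code:

```python
# Hypothetical helper: report (but do not rewrite) entries whose stringified
# extra_data contains a Decimal, since ast.literal_eval cannot parse them.
def report_decimal_entries(audit_log_model, id_lower_bound: int, id_upper_bound: int) -> None:
    decimal_entries = audit_log_model.objects.filter(
        extra_data__contains="Decimal(",
        id__range=(id_lower_bound, id_upper_bound),
    )
    for entry in decimal_entries:
        print(f"Audit log entry with id {entry.id} contains a Decimal and is not backfilled.")
```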

Audit log entries whose extra_data_json is already marked as inconsistent are
skipped by the migration: once we have discovered anomalies in a previous run,
there is no need to overwrite them again and nest the backup keys we added
inside themselves.

**Testing**

We create a migration test case that relies on the fact that bulk_create does
not call our modified save method.

We extend ZulipTestCase to support verifying console output at the test-case
level. The implementation is crude, but the use case should be rare enough
that it does not need to be elaborate.
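
The Django behavior the test leans on is that bulk_create() inserts rows
without calling the model's save(); a minimal sketch with a hypothetical model
(not the real audit log model or its real save() logic):

```python
import ast

from django.db import models


class FakeAuditLog(models.Model):
    extra_data = models.TextField(null=True)
    extra_data_json = models.JSONField(default=dict)

    class Meta:
        app_label = "zilencer"  # assumption, just to make the sketch a valid model

    def save(self, *args, **kwargs):
        # Stand-in for the modified save(): keep extra_data_json in sync.
        self.extra_data_json = ast.literal_eval(self.extra_data or "{}")
        super().save(*args, **kwargs)


# bulk_create() issues a plain INSERT and never calls save(), so these rows
# keep the legacy extra_data format and exercise the backfill migration.
FakeAuditLog.objects.bulk_create(
    [FakeAuditLog(extra_data=str({"1": "old name", "2": "new name"}))]
)
```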

Signed-off-by: Zixuan James Li <p359101898@gmail.com>
Author: Zixuan Li
Date: 2023-07-15 12:43:23 -04:00
Committed by: GitHub
Parent: b29ec4d62e
Commit: a0cf624eaa
4 changed files with 570 additions and 78 deletions


@@ -0,0 +1,148 @@
# Generated by Django 4.0.7 on 2022-09-30 20:30
import ast
from typing import Callable, List, Tuple, Type

import orjson
from django.db import migrations, transaction
from django.db.backends.base.schema import BaseDatabaseSchemaEditor
from django.db.migrations.state import StateApps
from django.db.models import F, JSONField, Model
from django.db.models.functions import Cast

# This migration is mostly the same as
# backfill_realmauditlog_extradata_to_json_field in zerver.

OLD_VALUE = "1"
NEW_VALUE = "2"
USER_FULL_NAME_CHANGED = 124
REALM_DISCOUNT_CHANGED = 209
BATCH_SIZE = 5000

OVERWRITE_TEMPLATE = """Audit log entry with id {id} has extra_data_json been inconsistently overwritten.
The old value is:
{old_value}
The new value is:
{new_value}
"""

@transaction.atomic
def do_bulk_backfill_extra_data(
    audit_log_model: Type[Model], id_lower_bound: int, id_upper_bound: int
) -> None:
    inconsistent_extra_data_json: List[Tuple[int, str, object, object]] = []
    # A dict converted with str() will start with an open bracket followed by a
    # single quote, as opposed to a JSON-encoded value, which will use a
    # _double_ quote. We use this to filter out those entries with malformed
    # extra_data to be handled later. This should only update rows with
    # extra_data populated with orjson.dumps.

    # The first query below checks for entries that would have extra_data_json
    # being overwritten by the migration with a value inconsistent with its
    # previous value.
    inconsistent_extra_data_json.extend(
        audit_log_model.objects.filter(
            extra_data__isnull=False, id__range=(id_lower_bound, id_upper_bound)
        )
        .annotate(new_extra_data_json=Cast("extra_data", output_field=JSONField()))
        .exclude(extra_data__startswith="{'")
        .exclude(extra_data_json={})
        .exclude(extra_data_json=F("new_extra_data_json"))
        .values_list("id", "extra_data", "extra_data_json", "new_extra_data_json")
    )
    (
        audit_log_model.objects.filter(
            extra_data__isnull=False,
            id__range=(id_lower_bound, id_upper_bound),
            extra_data_json__inconsistent_old_extra_data__isnull=True,
        )
        .exclude(extra_data__startswith="{'")
        .update(extra_data_json=Cast("extra_data", output_field=JSONField()))
    )

    python_valued_audit_log_entries = audit_log_model.objects.filter(
        extra_data__startswith="{'",
        id__range=(id_lower_bound, id_upper_bound),
        extra_data_json__inconsistent_old_extra_data__isnull=True,
    )
    for audit_log_entry in python_valued_audit_log_entries:
        # extra_data for entries that store a dict stringified with builtins.str()
        # is converted back with ast.literal_eval for safety and efficiency.
        old_value = audit_log_entry.extra_data_json  # type: ignore[attr-defined] # The migration cannot depend on zerver.models, which contains the real type of the RealmAuditLog model, so it cannot be properly typed.
        new_value = ast.literal_eval(audit_log_entry.extra_data)  # type: ignore[attr-defined] # Explained above.
        if old_value != {} and old_value != new_value:
            inconsistent_extra_data_json.append(
                (audit_log_entry.id, audit_log_entry.extra_data, old_value, new_value)  # type: ignore[attr-defined] # Explained above.
            )
        audit_log_entry.extra_data_json = new_value  # type: ignore[attr-defined] # Explained above.
    audit_log_model.objects.bulk_update(
        python_valued_audit_log_entries, fields=["extra_data_json"]
    )

    if inconsistent_extra_data_json:
        audit_log_entries = []
        for (
            audit_log_entry_id,
            old_extra_data,
            old_extra_data_json,
            new_extra_data_json,
        ) in inconsistent_extra_data_json:
            audit_log_entry = audit_log_model.objects.get(id=audit_log_entry_id)
            assert isinstance(old_extra_data_json, dict)
            if "inconsistent_old_extra_data" in old_extra_data_json:
                # Skip entries that have been backfilled and detected as
                # anomalies before.
                continue
            assert isinstance(new_extra_data_json, dict)
            audit_log_entry.extra_data_json = {  # type: ignore[attr-defined] # Explained above.
                **new_extra_data_json,
                "inconsistent_old_extra_data": old_extra_data,
                "inconsistent_old_extra_data_json": old_extra_data_json,
            }
            audit_log_entries.append(audit_log_entry)
            print(
                OVERWRITE_TEMPLATE.format(
                    id=audit_log_entry_id,
                    old_value=orjson.dumps(old_extra_data_json).decode(),
                    new_value=orjson.dumps(new_extra_data_json).decode(),
                )
            )
        audit_log_model.objects.bulk_update(audit_log_entries, fields=["extra_data_json"])


def backfill_extra_data(model_name: str) -> Callable[[StateApps, BaseDatabaseSchemaEditor], None]:
    def inner(apps: StateApps, schema_editor: BaseDatabaseSchemaEditor) -> None:
        audit_log_model = apps.get_model("zilencer", model_name)
        if not audit_log_model.objects.filter(extra_data__isnull=False).exists():
            return

        audit_log_entries = audit_log_model.objects.filter(extra_data__isnull=False)
        id_lower_bound = audit_log_entries.earliest("id").id
        id_upper_bound = audit_log_entries.latest("id").id
        while id_lower_bound <= id_upper_bound:
            do_bulk_backfill_extra_data(
                audit_log_model, id_lower_bound, min(id_lower_bound + BATCH_SIZE, id_upper_bound)
            )
            id_lower_bound += BATCH_SIZE + 1

    return inner


class Migration(migrations.Migration):
    atomic = False

    dependencies = [
        ("zilencer", "0026_auditlog_models_extra_data_json"),
    ]

    operations = [
        migrations.RunPython(
            backfill_extra_data("RemoteRealmAuditLog"),
            reverse_code=migrations.RunPython.noop,
            elidable=True,
        ),
        migrations.RunPython(
            backfill_extra_data("RemoteZulipServerAuditLog"),
            reverse_code=migrations.RunPython.noop,
            elidable=True,
        ),
    ]