diff --git a/zerver/lib/test_classes.py b/zerver/lib/test_classes.py
index e19d23bf43..bdc38f2d0f 100644
--- a/zerver/lib/test_classes.py
+++ b/zerver/lib/test_classes.py
@@ -141,6 +141,10 @@ class UploadSerializeMixin(SerializeMixin):
 class ZulipTestCaseMixin(SimpleTestCase):
     # Ensure that the test system just shows us diffs
     maxDiff: Optional[int] = None
+    # When set, this bypasses the BAN_CONSOLE_OUTPUT check for the test case
+    # and asserts that any extra console output produced by the test exactly
+    # matches this value.
+    expected_console_output: Optional[str] = None
 
     def setUp(self) -> None:
         super().setUp()
@@ -170,7 +174,7 @@ class ZulipTestCaseMixin(SimpleTestCase):
         self.mock_initialize.stop()
 
     def run(self, result: Optional[TestResult] = None) -> Optional[TestResult]:  # nocoverage
-        if not settings.BAN_CONSOLE_OUTPUT:
+        if not settings.BAN_CONSOLE_OUTPUT and self.expected_console_output is None:
             return super().run(result)
         extra_output_finder = ExtraConsoleOutputFinder()
         with tee_stderr_and_find_extra_console_output(
@@ -180,6 +184,11 @@ class ZulipTestCaseMixin(SimpleTestCase):
             if extra_output_finder.full_extra_output and (
                 test_result is None or test_result.wasSuccessful()
             ):
+                extra_output = extra_output_finder.full_extra_output.decode(errors="replace")
+                if self.expected_console_output is not None:
+                    self.assertEqual(extra_output, self.expected_console_output)
+                    return test_result
+
                 exception_message = f"""
 ---- UNEXPECTED CONSOLE OUTPUT DETECTED ----
 
@@ -196,7 +205,7 @@ You should be able to quickly reproduce this failure with:
 ./tools/test-backend --ban-console-output {self.id()}
 
 Output:
-{extra_output_finder.full_extra_output.decode(errors="replace")}
+{extra_output}
 --------------------------------------------
 """
                 raise ExtraConsoleOutputInTestError(exception_message)
diff --git a/zerver/migrations/0460_backfill_realmauditlog_extradata_to_json_field.py b/zerver/migrations/0460_backfill_realmauditlog_extradata_to_json_field.py
new file mode 100644
index 0000000000..8bcc35c0d1
--- /dev/null
+++ b/zerver/migrations/0460_backfill_realmauditlog_extradata_to_json_field.py
@@ -0,0 +1,181 @@
+# Generated by Django 4.0.7 on 2022-09-30 20:30
+
+import ast
+from typing import List, Tuple, Type
+
+import orjson
+from django.db import migrations, transaction
+from django.db.backends.base.schema import BaseDatabaseSchemaEditor
+from django.db.migrations.state import StateApps
+from django.db.models import F, JSONField, Model
+from django.db.models.functions import Cast, JSONObject
+
+# This migration is mostly the same as
+# backfill_remote_realmauditlog_extradata_to_json_field in zilencer.
+
+OLD_VALUE = "1"
+NEW_VALUE = "2"
+USER_FULL_NAME_CHANGED = 124
+REALM_DISCOUNT_CHANGED = 209
+BATCH_SIZE = 5000
+
+DISCOUNT_DATA_TEMPLATE = """Audit log entry {id} with event type REALM_DISCOUNT_CHANGED is skipped.
+The data consistency needs to be manually checked.
+    Discount data to remove after the upcoming JSONField migration:
+{data_to_remove}
+    Discount data to keep after the upcoming JSONField migration:
+{data_to_keep}
+"""
+
+OVERWRITE_TEMPLATE = """Audit log entry with id {id} has its extra_data_json inconsistently overwritten.
+    The old value is:
+{old_value}
+    The new value is:
+{new_value}
+"""
+
+
+@transaction.atomic
+def do_bulk_backfill_extra_data(
+    audit_log_model: Type[Model], id_lower_bound: int, id_upper_bound: int
+) -> None:
+    # First handle the special case for audit logs with the
+    # USER_FULL_NAME_CHANGED event, which stores the full name not as
+    # str(dict()) but as a plain str.
+    # Note that we only update the entries where extra_data_json has the
+    # default value, because we do not want to override existing audit log
+    # entries with a NEW_VALUE of None for extra_data_json. We do not need to
+    # skip existing entries for other parts of the backfill, because
+    # double-write is implemented, so the backfilled value will still be
+    # consistent.
+    audit_log_model.objects.filter(
+        event_type=USER_FULL_NAME_CHANGED,
+        id__range=(id_lower_bound, id_upper_bound),
+        extra_data_json={},
+        # extra_data used to keep track of the old name. As a result, we know
+        # nothing about what NEW_VALUE would be, especially if the name has
+        # been changed multiple times. extra_data_json is a JSONObject whose
+        # OLD_VALUE and NEW_VALUE are mapped from the value of the extra_data
+        # field (which is just an old full name string) and None, respectively.
+        # Documentation for JSONObject:
+        # https://docs.djangoproject.com/en/4.2/ref/models/database-functions/#jsonobject
+    ).update(extra_data_json=JSONObject(**{OLD_VALUE: "extra_data", NEW_VALUE: None}))
+
+    inconsistent_extra_data_json: List[Tuple[int, str, object, object]] = []
+    # A dict converted with str() will start with an opening curly brace
+    # followed by a _single_ quote, as opposed to a JSON-encoded value, which
+    # will use a _double_ quote. We use this to filter out those entries with
+    # malformed extra_data to be handled later. This should only update rows
+    # with extra_data populated by orjson.dumps.
+
+    # The first query below checks for entries that would have extra_data_json
+    # overwritten by the migration with a value inconsistent with its previous
+    # value.
+    inconsistent_extra_data_json.extend(
+        audit_log_model.objects.filter(
+            extra_data__isnull=False, id__range=(id_lower_bound, id_upper_bound)
+        )
+        .annotate(new_extra_data_json=Cast("extra_data", output_field=JSONField()))
+        .exclude(extra_data__startswith="{'")
+        .exclude(event_type=USER_FULL_NAME_CHANGED)
+        .exclude(extra_data_json={})
+        .exclude(extra_data_json=F("new_extra_data_json"))
+        .values_list("id", "extra_data", "extra_data_json", "new_extra_data_json")
+    )
+    (
+        audit_log_model.objects.filter(
+            extra_data__isnull=False,
+            id__range=(id_lower_bound, id_upper_bound),
+            extra_data_json__inconsistent_old_extra_data__isnull=True,
+        )
+        .exclude(extra_data__startswith="{'")
+        .exclude(event_type=USER_FULL_NAME_CHANGED)
+        .update(extra_data_json=Cast("extra_data", output_field=JSONField()))
+    )
+
+    python_valued_audit_log_entries = audit_log_model.objects.filter(
+        extra_data__startswith="{'",
+        id__range=(id_lower_bound, id_upper_bound),
+        extra_data_json__inconsistent_old_extra_data__isnull=True,
+    )
+    for audit_log_entry in python_valued_audit_log_entries:
+        # extra_data for entries that store dicts stringified with
+        # builtins.str() is converted back with ast.literal_eval for safety
+        # and efficiency. str()'d extra_data with the REALM_DISCOUNT_CHANGED
+        # event type is not handled by this migration. We expect that all such
+        # entries are manually converted beforehand, or an error will occur
+        # during the migration, because ast.literal_eval does not allow the
+        # evaluation of Decimal.
+        old_value = audit_log_entry.extra_data_json  # type: ignore[attr-defined] # The migration cannot depend on zerver.models, which contains the real type of the RealmAuditLog model, so it cannot be properly typed.
+        if audit_log_entry.event_type == REALM_DISCOUNT_CHANGED:  # type: ignore[attr-defined] # Explained above.
+            print(
+                DISCOUNT_DATA_TEMPLATE.format(
+                    id=audit_log_entry.id,  # type: ignore[attr-defined] # Explained above.
+                    data_to_remove=audit_log_entry.extra_data,  # type: ignore[attr-defined] # Explained above.
+                    data_to_keep=old_value,
+                )
+            )
+            continue
+        new_value = ast.literal_eval(audit_log_entry.extra_data)  # type: ignore[attr-defined] # Explained above.
+        if old_value != {} and old_value != new_value:
+            inconsistent_extra_data_json.append(
+                (audit_log_entry.id, audit_log_entry.extra_data, old_value, new_value)  # type: ignore[attr-defined] # Explained above.
+            )
+        audit_log_entry.extra_data_json = new_value  # type: ignore[attr-defined] # Explained above.
+    audit_log_model.objects.bulk_update(
+        python_valued_audit_log_entries, fields=["extra_data_json"]
+    )
+
+    if inconsistent_extra_data_json:
+        audit_log_entries = []
+        for (
+            audit_log_entry_id,
+            old_extra_data,
+            old_extra_data_json,
+            new_extra_data_json,
+        ) in inconsistent_extra_data_json:
+            audit_log_entry = audit_log_model.objects.get(id=audit_log_entry_id)
+            assert isinstance(old_extra_data_json, dict)
+            if "inconsistent_old_extra_data" in old_extra_data_json:
+                # Skip entries that have been backfilled and detected as
+                # anomalies before.
+                continue
+            assert isinstance(new_extra_data_json, dict)
+            audit_log_entry.extra_data_json = {  # type: ignore[attr-defined] # Explained above.
+                **new_extra_data_json,
+                "inconsistent_old_extra_data": old_extra_data,
+                "inconsistent_old_extra_data_json": old_extra_data_json,
+            }
+            audit_log_entries.append(audit_log_entry)
+            print(
+                OVERWRITE_TEMPLATE.format(
+                    id=audit_log_entry_id,
+                    old_value=orjson.dumps(old_extra_data_json).decode(),
+                    new_value=orjson.dumps(new_extra_data_json).decode(),
+                )
+            )
+        audit_log_model.objects.bulk_update(audit_log_entries, fields=["extra_data_json"])
+
+
+def backfill_extra_data(apps: StateApps, schema_editor: BaseDatabaseSchemaEditor) -> None:
+    audit_log_model = apps.get_model("zerver", "RealmAuditLog")
+    if not audit_log_model.objects.filter(extra_data__isnull=False).exists():
+        return
+
+    audit_log_entries = audit_log_model.objects.filter(extra_data__isnull=False)
+    id_lower_bound = audit_log_entries.earliest("id").id
+    id_upper_bound = audit_log_entries.latest("id").id
+    while id_lower_bound <= id_upper_bound:
+        do_bulk_backfill_extra_data(
+            audit_log_model, id_lower_bound, min(id_lower_bound + BATCH_SIZE, id_upper_bound)
+        )
+        id_lower_bound += BATCH_SIZE + 1
+
+
+class Migration(migrations.Migration):
+    atomic = False
+
+    dependencies = [
+        ("zerver", "0459_remove_invalid_characters_from_user_group_name"),
+    ]
+
+    operations = [
+        migrations.RunPython(
+            backfill_extra_data, reverse_code=migrations.RunPython.noop, elidable=True
+        ),
+    ]
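The encoding distinction that the backfill above relies on is easy to check in isolation. Below is a minimal standalone sketch (not part of the patch; the `legacy` and `modern` names are illustrative) of why the `extra_data__startswith="{'"` filter separates the two formats, and why `ast.literal_eval` rejects the skipped `REALM_DISCOUNT_CHANGED` rows:

```python
import ast
from decimal import Decimal

import orjson

# builtins.str() renders a dict with single quotes, so it starts with "{'",
# while orjson.dumps produces JSON, which always uses double quotes.
legacy = str({"key": "value"})  # "{'key': 'value'}"
modern = orjson.dumps({"key": "value"}).decode()  # '{"key":"value"}'
assert legacy.startswith("{'") and not modern.startswith("{'")

# Legacy rows round-trip safely through ast.literal_eval (never eval()).
assert ast.literal_eval(legacy) == orjson.loads(modern) == {"key": "value"}

# Decimal is not a Python literal, so REALM_DISCOUNT_CHANGED-style values
# fail to parse; the migration prints and skips those rows for manual review.
try:
    ast.literal_eval(str({"old_discount": Decimal("25.0000")}))
except ValueError:
    pass  # "malformed node or string" is expected here
```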
diff --git a/zerver/tests/test_migrations.py b/zerver/tests/test_migrations.py
index bab4c46fe0..dfaece8824 100644
--- a/zerver/tests/test_migrations.py
+++ b/zerver/tests/test_migrations.py
@@ -4,8 +4,12 @@
 # You can also read
 # https://www.caktusgroup.com/blog/2016/02/02/writing-unit-tests-django-migrations/
 # to get a tutorial on the framework that inspired this feature.
+from decimal import Decimal
+from typing import Optional
+
+import orjson
 from django.db.migrations.state import StateApps
+from django.utils.timezone import now as timezone_now
 
 from zerver.lib.test_classes import MigrationsTestCase
 from zerver.lib.test_helpers import use_db_models
@@ -25,89 +29,239 @@ from zerver.lib.test_helpers import use_db_models
 # As a result, we generally mark these tests as skipped once they have
 # been tested for a migration being merged.
 
+USER_ACTIVATED = 102
+USER_FULL_NAME_CHANGED = 124
+REALM_DISCOUNT_CHANGED = 209
+OLD_VALUE = "1"
+NEW_VALUE = "2"
 
-class LinkifierURLFormatString(MigrationsTestCase):
-    migrate_from = "0440_realmfilter_url_template"
-    migrate_to = "0441_backfill_realmfilter_url_template"
+
+class RealmAuditLogExtraData(MigrationsTestCase):
+    migrate_from = "0459_remove_invalid_characters_from_user_group_name"
+    migrate_to = "0460_backfill_realmauditlog_extradata_to_json_field"
+
+    full_name_change_log_id: Optional[int] = None
+    valid_json_log_id: Optional[int] = None
+    str_json_log_id: Optional[int] = None
+    # BATCH_SIZE is defined as 5000 in
+    # backfill_realmauditlog_extradata_to_json_field; DATA_SIZE is made larger
+    # than that, which is later used to test that batching works properly.
+    DATA_SIZE = 10005
+    expected_console_output = """Audit log entry 50003 with event type REALM_DISCOUNT_CHANGED is skipped.
+The data consistency needs to be manually checked.
+    Discount data to remove after the upcoming JSONField migration:
+{'old_discount': Decimal('25.0000'), 'new_discount': Decimal('50')}
+    Discount data to keep after the upcoming JSONField migration:
+{}
+
+Audit log entry 50004 with event type REALM_DISCOUNT_CHANGED is skipped.
+The data consistency needs to be manually checked.
+    Discount data to remove after the upcoming JSONField migration:
+{'old_discount': Decimal('25.0000'), 'new_discount': Decimal('50')}
+    Discount data to keep after the upcoming JSONField migration:
+{'new_discount': '50', 'old_discount': '25.0000'}
+
+Audit log entry with id 50001 has its extra_data_json inconsistently overwritten.
+    The old value is:
+{"corrupted":"foo"}
+    The new value is:
+{"key":"value"}
+
+Audit log entry with id 50002 has its extra_data_json inconsistently overwritten.
+    The old value is:
+{"corrupted":"bar"}
+    The new value is:
+{"key":"value"}
+
+"""
 
     @use_db_models
     def setUpBeforeMigration(self, apps: StateApps) -> None:
-        RealmFilter = apps.get_model("zerver", "RealmFilter")
+        Realm = apps.get_model("zerver", "Realm")
+        RealmAuditLog = apps.get_model("zerver", "RealmAuditLog")
+        event_time = timezone_now()
+        realm = Realm.objects.get(string_id="zulip")
 
-        iago = self.example_user("iago")
+        full_name_change_log = RealmAuditLog(
+            realm=realm,
+            event_type=USER_FULL_NAME_CHANGED,
+            event_time=event_time,
+            extra_data="foo",
+        )
 
-        urls = [
-            "http://example.com/",
-            "https://example.com/",
-            "https://user:password@example.com/",
-            "https://example.com/@user/thing",
-            "https://example.com/!path",
-            "https://example.com/foo.bar",
-            "https://example.com/foo[bar]",
-            "https://example.com/{foo}",
-            "https://example.com/{foo}{bars}",
-            "https://example.com/{foo}/and/{bar}",
-            "https://example.com/?foo={foo}",
-            "https://example.com/%ab",
-            "https://example.com/%ba",
-            "https://example.com/%21",
-            "https://example.com/words%20with%20spaces",
-            "https://example.com/back%20to%20{back}",
-            "https://example.com/encoded%2fwith%2fletters",
-            "https://example.com/encoded%2Fwith%2Fupper%2Fcase%2Fletters",
-            "https://example.com/%%",
-            "https://example.com/%%(",
-            "https://example.com/%%()",
-            "https://example.com/%%(foo",
-            "https://example.com/%%(foo)",
-            "https://example.com/%%(foo)s",
-            "https://example.com/%(foo)s",
-            "https://example.com/%(foo)s%(bar)s",
-        ]
-        self.linkifier_ids = []
+        new_full_name_change_log = RealmAuditLog(
+            realm=realm,
+            event_type=USER_FULL_NAME_CHANGED,
+            event_time=event_time,
+            extra_data="foo",
+            extra_data_json={OLD_VALUE: "foo", NEW_VALUE: "bar"},
+        )
 
-        for index, url in enumerate(urls):
-            self.linkifier_ids.append(
-                RealmFilter.objects.create(
-                    realm=iago.realm,
-                    pattern=f"dummy{index}",
-                    url_format_string=url,
-                ).id
+        valid_json_log = RealmAuditLog(
+            realm=realm,
+            event_type=USER_ACTIVATED,
+            event_time=event_time,
+            extra_data=orjson.dumps({"key": "value"}).decode(),
+        )
+
+        str_json_log = RealmAuditLog(
+            realm=realm,
+            event_type=USER_ACTIVATED,
+            event_time=event_time,
+            extra_data=str({"key": "value"}),
+        )
+
+        self.backfilled_inconsistent_log_id = RealmAuditLog.objects.create(
+            realm=realm,
+            event_type=USER_ACTIVATED,
+            event_time=event_time,
+            extra_data=orjson.dumps({"key": "baz"}).decode(),
+            extra_data_json={
+                "key": "baz",
+                "inconsistent_old_extra_data": orjson.dumps({"key": "baz"}).decode(),
+                "inconsistent_old_extra_data_json": {"key": "value corrupted"},
+            },
+        ).id
+
+        # The following audit log entries have preset ids because the expected
+        # console output, which is defined before the test case runs, hardcodes
+        # them.
+        inconsistent_json_log = RealmAuditLog(
+            id=50001,
+            realm=realm,
+            event_type=USER_ACTIVATED,
+            event_time=event_time,
+            extra_data=orjson.dumps({"key": "value"}).decode(),
+            extra_data_json={"corrupted": "foo"},
+        )
+
+        inconsistent_str_json_log = RealmAuditLog(
+            id=50002,
+            realm=realm,
+            event_type=USER_ACTIVATED,
+            event_time=event_time,
+            extra_data=str({"key": "value"}),
+            extra_data_json={"corrupted": "bar"},
+        )
+
+        self.old_decimal_log_id = RealmAuditLog.objects.create(
+            id=50003,
+            realm=realm,
+            event_type=REALM_DISCOUNT_CHANGED,
+            event_time=event_time,
+            extra_data=str({"old_discount": Decimal("25.0000"), "new_discount": Decimal("50")}),
+        ).id
+
+        self.new_decimal_log_id = RealmAuditLog.objects.create(
+            id=50004,
+            realm=realm,
+            event_type=REALM_DISCOUNT_CHANGED,
+            event_time=event_time,
+            extra_data=str({"old_discount": Decimal("25.0000"), "new_discount": Decimal("50")}),
+            extra_data_json={"old_discount": Decimal("25.0000"), "new_discount": Decimal("50")},
+        ).id
+
+        RealmAuditLog.objects.bulk_create(
+            [
+                full_name_change_log,
+                new_full_name_change_log,
+                valid_json_log,
+                str_json_log,
+                inconsistent_json_log,
+                inconsistent_str_json_log,
+            ]
+        )
+        self.full_name_change_log_id = full_name_change_log.id
+        self.new_full_name_change_log_id = new_full_name_change_log.id
+        self.valid_json_log_id = valid_json_log.id
+        self.str_json_log_id = str_json_log.id
+
+        other_logs = []
+        for i in range(self.DATA_SIZE):
+            other_logs.append(
+                RealmAuditLog(
+                    realm=realm,
+                    event_type=USER_ACTIVATED,
+                    event_time=event_time,
+                    extra_data=orjson.dumps({"data": i}).decode(),
+                )
             )
-
-    def test_converted_url_templates(self) -> None:
-        RealmFilter = self.apps.get_model("zerver", "RealmFilter")
-
-        expected_urls = [
-            "http://example.com/",
-            "https://example.com/",
-            "https://user:password@example.com/",
-            "https://example.com/@user/thing",
-            "https://example.com/!path",
-            "https://example.com/foo.bar",
-            "https://example.com/foo[bar]",
-            "https://example.com/%7Bfoo%7D",
-            "https://example.com/%7Bfoo%7D%7Bbars%7D",
-            "https://example.com/%7Bfoo%7D/and/%7Bbar%7D",
-            "https://example.com/?foo=%7Bfoo%7D",
-            "https://example.com/%ab",
-            "https://example.com/%ba",
-            "https://example.com/%21",
-            "https://example.com/words%20with%20spaces",
-            "https://example.com/back%20to%20%7Bback%7D",
-            "https://example.com/encoded%2fwith%2fletters",
-            "https://example.com/encoded%2Fwith%2Fupper%2Fcase%2Fletters",
-            "https://example.com/%",
-            "https://example.com/%(",
-            "https://example.com/%()",
-            "https://example.com/%(foo",
-            "https://example.com/%(foo)",
-            "https://example.com/%(foo)s",
-            "https://example.com/{foo}",
-            "https://example.com/{foo}{bar}",
+        self.other_logs_id = [
+            audit_log.id for audit_log in RealmAuditLog.objects.bulk_create(other_logs)
         ]
-        for linkifier_id, expected in zip(self.linkifier_ids, expected_urls):
-            linkifier = RealmFilter.objects.filter(id=linkifier_id).first()
-            self.assertIsNotNone(linkifier)
-            self.assertEqual(linkifier.url_template, expected)
+        # No new audit log entry should have extra_data_json populated as of
+        # now, except for the entries created with non-default values.
+        self.assert_length(
+            RealmAuditLog.objects.filter(
+                event_time__gte=event_time,
+            ).exclude(
+                extra_data_json={},
+            ),
+            5,
+        )
+
+    def test_realmauditlog_extra_data_to_json(self) -> None:
+        RealmAuditLog = self.apps.get_model("zerver", "RealmAuditLog")
+
+        self.assertIsNotNone(self.full_name_change_log_id)
+        self.assertIsNotNone(self.valid_json_log_id)
+        self.assertIsNotNone(self.str_json_log_id)
+
+        full_name_change_log = RealmAuditLog.objects.filter(id=self.full_name_change_log_id).first()
+        new_full_name_change_log = RealmAuditLog.objects.filter(
+            id=self.new_full_name_change_log_id
+        ).first()
+        valid_json_log = RealmAuditLog.objects.filter(id=self.valid_json_log_id).first()
+        str_json_log = RealmAuditLog.objects.filter(id=self.str_json_log_id).first()
+
+        self.assertIsNotNone(full_name_change_log)
+        self.assertEqual(full_name_change_log.extra_data_json, {"1": "foo", "2": None})
+
+        self.assertIsNotNone(new_full_name_change_log)
+        self.assertEqual(new_full_name_change_log.extra_data_json, {"1": "foo", "2": "bar"})
+
+        self.assertIsNotNone(valid_json_log)
+        self.assertEqual(valid_json_log.extra_data_json, {"key": "value"})
+
+        self.assertIsNotNone(str_json_log)
+        self.assertEqual(str_json_log.extra_data_json, {"key": "value"})
+
+        other_logs = RealmAuditLog.objects.filter(id__in=self.other_logs_id).order_by("id")
+        self.assertIsNotNone(other_logs)
+        self.assert_length(other_logs, self.DATA_SIZE)
+        for index, audit_log in enumerate(other_logs):
+            self.assertEqual(audit_log.extra_data_json, {"data": index})
+
+        inconsistent_json_log = RealmAuditLog.objects.get(
+            extra_data_json__inconsistent_old_extra_data=orjson.dumps({"key": "value"}).decode()
+        )
+        self.assertIsNotNone(inconsistent_json_log)
+        self.assertEqual(inconsistent_json_log.id, 50001)
+        self.assertEqual(
+            inconsistent_json_log.extra_data_json["inconsistent_old_extra_data_json"],
+            {"corrupted": "foo"},
+        )
+
+        inconsistent_str_json_log = RealmAuditLog.objects.get(
+            extra_data_json__inconsistent_old_extra_data=str({"key": "value"})
+        )
+        self.assertIsNotNone(inconsistent_str_json_log)
+        self.assertEqual(inconsistent_str_json_log.id, 50002)
+        self.assertEqual(
+            inconsistent_str_json_log.extra_data_json["inconsistent_old_extra_data_json"],
+            {"corrupted": "bar"},
+        )
+
+        backfilled_inconsistent_log = RealmAuditLog.objects.get(
+            id=self.backfilled_inconsistent_log_id
+        )
+        self.assertIsNotNone(backfilled_inconsistent_log)
+        self.assertEqual(
+            backfilled_inconsistent_log.extra_data_json,
+            {
+                "key": "baz",
+                "inconsistent_old_extra_data": orjson.dumps({"key": "baz"}).decode(),
+                "inconsistent_old_extra_data_json": {"key": "value corrupted"},
+            },
+        )
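Since the test above sizes DATA_SIZE to span multiple batches, the id-range arithmetic shared by both backfill functions is worth seeing in isolation. A minimal sketch (the `batch_ranges` generator below is illustrative, not code from the patch); each inclusive `id__range` covers `BATCH_SIZE + 1` ids, so the next batch starts at the id just past the previous upper bound:

```python
from typing import Iterator, Tuple

BATCH_SIZE = 5000  # same constant as in the migrations


def batch_ranges(id_lower_bound: int, id_upper_bound: int) -> Iterator[Tuple[int, int]]:
    # Mirrors the while loop in backfill_extra_data: inclusive ranges,
    # advancing past the previous batch's upper bound on each iteration.
    while id_lower_bound <= id_upper_bound:
        yield id_lower_bound, min(id_lower_bound + BATCH_SIZE, id_upper_bound)
        id_lower_bound += BATCH_SIZE + 1


# With ids 1..10005 (the test's DATA_SIZE), three batches are produced:
assert list(batch_ranges(1, 10005)) == [(1, 5001), (5002, 10002), (10003, 10005)]
```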
diff --git a/zilencer/migrations/0027_backfill_remote_realmauditlog_extradata_to_json_field.py b/zilencer/migrations/0027_backfill_remote_realmauditlog_extradata_to_json_field.py
new file mode 100644
index 0000000000..71e449e381
--- /dev/null
+++ b/zilencer/migrations/0027_backfill_remote_realmauditlog_extradata_to_json_field.py
@@ -0,0 +1,148 @@
+# Generated by Django 4.0.7 on 2022-09-30 20:30
+
+import ast
+from typing import Callable, List, Tuple, Type
+
+import orjson
+from django.db import migrations, transaction
+from django.db.backends.base.schema import BaseDatabaseSchemaEditor
+from django.db.migrations.state import StateApps
+from django.db.models import F, JSONField, Model
+from django.db.models.functions import Cast
+
+# This migration is mostly the same as
+# backfill_realmauditlog_extradata_to_json_field in zerver.
+
+OLD_VALUE = "1"
+NEW_VALUE = "2"
+USER_FULL_NAME_CHANGED = 124
+REALM_DISCOUNT_CHANGED = 209
+BATCH_SIZE = 5000
+
+OVERWRITE_TEMPLATE = """Audit log entry with id {id} has its extra_data_json inconsistently overwritten.
+    The old value is:
+{old_value}
+    The new value is:
+{new_value}
+"""
+
+
+@transaction.atomic
+def do_bulk_backfill_extra_data(
+    audit_log_model: Type[Model], id_lower_bound: int, id_upper_bound: int
+) -> None:
+    inconsistent_extra_data_json: List[Tuple[int, str, object, object]] = []
+    # A dict converted with str() will start with an opening curly brace
+    # followed by a _single_ quote, as opposed to a JSON-encoded value, which
+    # will use a _double_ quote. We use this to filter out those entries with
+    # malformed extra_data to be handled later. This should only update rows
+    # with extra_data populated by orjson.dumps.
+
+    # The first query below checks for entries that would have extra_data_json
+    # overwritten by the migration with a value inconsistent with its previous
+    # value.
+    inconsistent_extra_data_json.extend(
+        audit_log_model.objects.filter(
+            extra_data__isnull=False, id__range=(id_lower_bound, id_upper_bound)
+        )
+        .annotate(new_extra_data_json=Cast("extra_data", output_field=JSONField()))
+        .exclude(extra_data__startswith="{'")
+        .exclude(extra_data_json={})
+        .exclude(extra_data_json=F("new_extra_data_json"))
+        .values_list("id", "extra_data", "extra_data_json", "new_extra_data_json")
+    )
+    (
+        audit_log_model.objects.filter(
+            extra_data__isnull=False,
+            id__range=(id_lower_bound, id_upper_bound),
+            extra_data_json__inconsistent_old_extra_data__isnull=True,
+        )
+        .exclude(extra_data__startswith="{'")
+        .update(extra_data_json=Cast("extra_data", output_field=JSONField()))
+    )
+
+    python_valued_audit_log_entries = audit_log_model.objects.filter(
+        extra_data__startswith="{'",
+        id__range=(id_lower_bound, id_upper_bound),
+        extra_data_json__inconsistent_old_extra_data__isnull=True,
+    )
+    for audit_log_entry in python_valued_audit_log_entries:
+        # extra_data for entries that store dicts stringified with
+        # builtins.str() is converted back with ast.literal_eval for safety
+        # and efficiency.
+        old_value = audit_log_entry.extra_data_json  # type: ignore[attr-defined] # The migration cannot depend on zerver.models, which contains the real type of the RealmAuditLog model, so it cannot be properly typed.
+        new_value = ast.literal_eval(audit_log_entry.extra_data)  # type: ignore[attr-defined] # Explained above.
+        if old_value != {} and old_value != new_value:
+            inconsistent_extra_data_json.append(
+                (audit_log_entry.id, audit_log_entry.extra_data, old_value, new_value)  # type: ignore[attr-defined] # Explained above.
+            )
+        audit_log_entry.extra_data_json = new_value  # type: ignore[attr-defined] # Explained above.
+    audit_log_model.objects.bulk_update(
+        python_valued_audit_log_entries, fields=["extra_data_json"]
+    )
+
+    if inconsistent_extra_data_json:
+        audit_log_entries = []
+        for (
+            audit_log_entry_id,
+            old_extra_data,
+            old_extra_data_json,
+            new_extra_data_json,
+        ) in inconsistent_extra_data_json:
+            audit_log_entry = audit_log_model.objects.get(id=audit_log_entry_id)
+            assert isinstance(old_extra_data_json, dict)
+            if "inconsistent_old_extra_data" in old_extra_data_json:
+                # Skip entries that have been backfilled and detected as
+                # anomalies before.
+                continue
+            assert isinstance(new_extra_data_json, dict)
+            audit_log_entry.extra_data_json = {  # type: ignore[attr-defined] # Explained above.
+                **new_extra_data_json,
+                "inconsistent_old_extra_data": old_extra_data,
+                "inconsistent_old_extra_data_json": old_extra_data_json,
+            }
+            audit_log_entries.append(audit_log_entry)
+            print(
+                OVERWRITE_TEMPLATE.format(
+                    id=audit_log_entry_id,
+                    old_value=orjson.dumps(old_extra_data_json).decode(),
+                    new_value=orjson.dumps(new_extra_data_json).decode(),
+                )
+            )
+        audit_log_model.objects.bulk_update(audit_log_entries, fields=["extra_data_json"])
+
+
+def backfill_extra_data(model_name: str) -> Callable[[StateApps, BaseDatabaseSchemaEditor], None]:
+    def inner(apps: StateApps, schema_editor: BaseDatabaseSchemaEditor) -> None:
+        audit_log_model = apps.get_model("zilencer", model_name)
+        if not audit_log_model.objects.filter(extra_data__isnull=False).exists():
+            return
+
+        audit_log_entries = audit_log_model.objects.filter(extra_data__isnull=False)
+        id_lower_bound = audit_log_entries.earliest("id").id
+        id_upper_bound = audit_log_entries.latest("id").id
+        # The while loop covers every id up to and including id_upper_bound,
+        # matching the zerver version of this function; no extra trailing call
+        # is needed afterwards, since id_lower_bound has moved past
+        # id_upper_bound by then.
+        while id_lower_bound <= id_upper_bound:
+            do_bulk_backfill_extra_data(
+                audit_log_model, id_lower_bound, min(id_lower_bound + BATCH_SIZE, id_upper_bound)
+            )
+            id_lower_bound += BATCH_SIZE + 1
+
+    return inner
+
+
+class Migration(migrations.Migration):
+    atomic = False
+
+    dependencies = [
+        ("zilencer", "0026_auditlog_models_extra_data_json"),
+    ]
+
+    operations = [
+        migrations.RunPython(
+            backfill_extra_data("RemoteRealmAuditLog"),
+            reverse_code=migrations.RunPython.noop,
+            elidable=True,
+        ),
+        migrations.RunPython(
+            backfill_extra_data("RemoteZulipServerAuditLog"),
+            reverse_code=migrations.RunPython.noop,
+            elidable=True,
+        ),
+    ]
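Finally, a hedged sketch of how a test opts into the new `expected_console_output` hook added to `ZulipTestCaseMixin` above; the `RealmAuditLogExtraData` migration test in this patch uses the same mechanism. The test class and its output string below are hypothetical; only the attribute itself comes from the patch:

```python
import sys
from typing import Optional

from zerver.lib.test_classes import ZulipTestCase


class NoisyExampleTest(ZulipTestCase):  # hypothetical test, for illustration
    # When set, run() tees console output even if BAN_CONSOLE_OUTPUT is off,
    # and asserts that whatever extra output the test produced matches this
    # string exactly.
    expected_console_output: Optional[str] = "expected warning\n"

    def test_noisy_code(self) -> None:
        # Deliberately produce exactly the output declared above; mismatched
        # extra output would fail the assertEqual in run().
        print("expected warning", file=sys.stderr)
```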