Files
zulip/analytics/migrations/0015_clear_duplicate_counts.py
Anders Kaseorg 6e4c3e41dc python: Normalize quotes with Black.
Signed-off-by: Anders Kaseorg <anders@zulip.com>
2021-02-12 13:11:19 -08:00

66 lines
2.7 KiB
Python

from django.db import migrations
from django.db.backends.postgresql.schema import DatabaseSchemaEditor
from django.db.migrations.state import StateApps
from django.db.models import Count, Sum
def clear_duplicate_counts(apps: StateApps, schema_editor: DatabaseSchemaEditor) -> None:
"""This is a preparatory migration for our Analytics tables.
The backstory is that Django's unique_together indexes do not properly
handle the subgroup=None corner case (allowing duplicate rows that have a
subgroup of None), which meant that in race conditions, rather than updating
an existing row for the property/(realm, stream, user)/time with subgroup=None, Django would
create a duplicate row.
In the next migration, we'll add a proper constraint to fix this bug, but
we need to fix any existing problematic rows before we can add that constraint.
We fix this in an appropriate fashion for each type of CountStat object; mainly
this means deleting the extra rows, but for LoggingCountStat objects, we need to
additionally combine the sums.
"""
count_tables = dict(
realm=apps.get_model("analytics", "RealmCount"),
user=apps.get_model("analytics", "UserCount"),
stream=apps.get_model("analytics", "StreamCount"),
installation=apps.get_model("analytics", "InstallationCount"),
)
for name, count_table in count_tables.items():
value = [name, "property", "end_time"]
if name == "installation":
value = ["property", "end_time"]
counts = (
count_table.objects.filter(subgroup=None)
.values(*value)
.annotate(Count("id"), Sum("value"))
.filter(id__count__gt=1)
)
for count in counts:
count.pop("id__count")
total_value = count.pop("value__sum")
duplicate_counts = list(count_table.objects.filter(**count))
first_count = duplicate_counts[0]
if count["property"] in ["invites_sent::day", "active_users_log:is_bot:day"]:
# For LoggingCountStat objects, the right fix is to combine the totals;
# for other CountStat objects, we expect the duplicates to have the same value.
# And so all we need to do is delete them.
first_count.value = total_value
first_count.save()
to_cleanup = duplicate_counts[1:]
for duplicate_count in to_cleanup:
duplicate_count.delete()
class Migration(migrations.Migration):
dependencies = [
("analytics", "0014_remove_fillstate_last_modified"),
]
operations = [
migrations.RunPython(clear_duplicate_counts, reverse_code=migrations.RunPython.noop),
]