analytics: Remove interval field from *Count tables.

Includes a database migration. The interval field was originally there to
facilitate time aggregation (e.g. aggregate_hour_to_day), but we now do such
aggregations in views code or in the frontend.
This commit is contained in:
Rishi Gupta
2017-01-16 13:05:51 -08:00
committed by Tim Abbott
parent a8f2ebb443
commit 68fcb4152f
5 changed files with 79 additions and 37 deletions

View File

@@ -109,8 +109,8 @@ def do_fill_count_stat_at_hour(stat, end_time):
else: # stat.interval == CountStat.GAUGE else: # stat.interval == CountStat.GAUGE
start_time = MIN_TIME start_time = MIN_TIME
do_pull_from_zerver(stat, start_time, end_time, stat.interval) do_pull_from_zerver(stat, start_time, end_time)
do_aggregate_to_summary_table(stat, end_time, stat.interval) do_aggregate_to_summary_table(stat, end_time)
def do_delete_count_stat_at_hour(stat, end_time): def do_delete_count_stat_at_hour(stat, end_time):
# type: (CountStat, datetime) -> None # type: (CountStat, datetime) -> None
@@ -127,8 +127,8 @@ def do_drop_all_analytics_tables():
InstallationCount.objects.all().delete() InstallationCount.objects.all().delete()
FillState.objects.all().delete() FillState.objects.all().delete()
def do_aggregate_to_summary_table(stat, end_time, interval): def do_aggregate_to_summary_table(stat, end_time):
# type: (CountStat, datetime, str) -> None # type: (CountStat, datetime) -> None
cursor = connection.cursor() cursor = connection.cursor()
# Aggregate into RealmCount # Aggregate into RealmCount
@@ -136,23 +136,21 @@ def do_aggregate_to_summary_table(stat, end_time, interval):
if analytics_table in (UserCount, StreamCount): if analytics_table in (UserCount, StreamCount):
realmcount_query = """ realmcount_query = """
INSERT INTO analytics_realmcount INSERT INTO analytics_realmcount
(realm_id, value, property, subgroup, end_time, interval) (realm_id, value, property, subgroup, end_time)
SELECT SELECT
zerver_realm.id, COALESCE(sum(%(analytics_table)s.value), 0), '%(property)s', zerver_realm.id, COALESCE(sum(%(analytics_table)s.value), 0), '%(property)s',
%(analytics_table)s.subgroup, %%(end_time)s, '%(interval)s' %(analytics_table)s.subgroup, %%(end_time)s
FROM zerver_realm FROM zerver_realm
JOIN %(analytics_table)s JOIN %(analytics_table)s
ON ON
( (
%(analytics_table)s.realm_id = zerver_realm.id AND %(analytics_table)s.realm_id = zerver_realm.id AND
%(analytics_table)s.property = '%(property)s' AND %(analytics_table)s.property = '%(property)s' AND
%(analytics_table)s.end_time = %%(end_time)s AND %(analytics_table)s.end_time = %%(end_time)s
%(analytics_table)s.interval = '%(interval)s'
) )
GROUP BY zerver_realm.id, %(analytics_table)s.subgroup GROUP BY zerver_realm.id, %(analytics_table)s.subgroup
""" % {'analytics_table': analytics_table._meta.db_table, """ % {'analytics_table': analytics_table._meta.db_table,
'property': stat.property, 'property': stat.property}
'interval': interval}
start = time.time() start = time.time()
cursor.execute(realmcount_query, {'end_time': end_time}) cursor.execute(realmcount_query, {'end_time': end_time})
end = time.time() end = time.time()
@@ -161,18 +159,16 @@ def do_aggregate_to_summary_table(stat, end_time, interval):
# Aggregate into InstallationCount # Aggregate into InstallationCount
installationcount_query = """ installationcount_query = """
INSERT INTO analytics_installationcount INSERT INTO analytics_installationcount
(value, property, subgroup, end_time, interval) (value, property, subgroup, end_time)
SELECT SELECT
sum(value), '%(property)s', analytics_realmcount.subgroup, %%(end_time)s, '%(interval)s' sum(value), '%(property)s', analytics_realmcount.subgroup, %%(end_time)s
FROM analytics_realmcount FROM analytics_realmcount
WHERE WHERE
( (
property = '%(property)s' AND property = '%(property)s' AND
end_time = %%(end_time)s AND end_time = %%(end_time)s
interval = '%(interval)s'
) GROUP BY analytics_realmcount.subgroup ) GROUP BY analytics_realmcount.subgroup
""" % {'property': stat.property, """ % {'property': stat.property}
'interval': interval}
start = time.time() start = time.time()
cursor.execute(installationcount_query, {'end_time': end_time}) cursor.execute(installationcount_query, {'end_time': end_time})
end = time.time() end = time.time()
@@ -180,8 +176,8 @@ def do_aggregate_to_summary_table(stat, end_time, interval):
cursor.close() cursor.close()
# This is the only method that hits the prod databases directly. # This is the only method that hits the prod databases directly.
def do_pull_from_zerver(stat, start_time, end_time, interval): def do_pull_from_zerver(stat, start_time, end_time):
# type: (CountStat, datetime, datetime, str) -> None # type: (CountStat, datetime, datetime) -> None
zerver_table = stat.zerver_count_query.zerver_table._meta.db_table # type: ignore zerver_table = stat.zerver_count_query.zerver_table._meta.db_table # type: ignore
join_args = ' '.join('AND %s.%s = %s' % (zerver_table, key, value) join_args = ' '.join('AND %s.%s = %s' % (zerver_table, key, value)
for key, value in stat.filter_args.items()) for key, value in stat.filter_args.items())
@@ -197,7 +193,6 @@ def do_pull_from_zerver(stat, start_time, end_time, interval):
# the string formatting prior so that cursor.execute runs it as sql # the string formatting prior so that cursor.execute runs it as sql
query_ = stat.zerver_count_query.query % {'zerver_table': zerver_table, query_ = stat.zerver_count_query.query % {'zerver_table': zerver_table,
'property': stat.property, 'property': stat.property,
'interval': interval,
'join_args': join_args, 'join_args': join_args,
'subgroup': subgroup, 'subgroup': subgroup,
'group_by_clause': group_by_clause} 'group_by_clause': group_by_clause}
@@ -210,9 +205,9 @@ def do_pull_from_zerver(stat, start_time, end_time, interval):
count_user_by_realm_query = """ count_user_by_realm_query = """
INSERT INTO analytics_realmcount INSERT INTO analytics_realmcount
(realm_id, value, property, subgroup, end_time, interval) (realm_id, value, property, subgroup, end_time)
SELECT SELECT
zerver_realm.id, count(%(zerver_table)s),'%(property)s', %(subgroup)s, %%(time_end)s, '%(interval)s' zerver_realm.id, count(%(zerver_table)s),'%(property)s', %(subgroup)s, %%(time_end)s
FROM zerver_realm FROM zerver_realm
JOIN zerver_userprofile JOIN zerver_userprofile
ON ON
@@ -231,9 +226,9 @@ zerver_count_user_by_realm = ZerverCountQuery(UserProfile, RealmCount, count_use
# currently .sender_id is only Message specific thing # currently .sender_id is only Message specific thing
count_message_by_user_query = """ count_message_by_user_query = """
INSERT INTO analytics_usercount INSERT INTO analytics_usercount
(user_id, realm_id, value, property, subgroup, end_time, interval) (user_id, realm_id, value, property, subgroup, end_time)
SELECT SELECT
zerver_userprofile.id, zerver_userprofile.realm_id, count(*), '%(property)s', %(subgroup)s, %%(time_end)s, '%(interval)s' zerver_userprofile.id, zerver_userprofile.realm_id, count(*), '%(property)s', %(subgroup)s, %%(time_end)s
FROM zerver_userprofile FROM zerver_userprofile
JOIN zerver_message JOIN zerver_message
ON ON
@@ -252,9 +247,9 @@ zerver_count_message_by_user = ZerverCountQuery(Message, UserCount, count_messag
# Currently unused and untested # Currently unused and untested
count_stream_by_realm_query = """ count_stream_by_realm_query = """
INSERT INTO analytics_realmcount INSERT INTO analytics_realmcount
(realm_id, value, property, subgroup, end_time, interval) (realm_id, value, property, subgroup, end_time)
SELECT SELECT
zerver_realm.id, count(*), '%(property)s', %(subgroup)s, %%(time_end)s, '%(interval)s' zerver_realm.id, count(*), '%(property)s', %(subgroup)s, %%(time_end)s
FROM zerver_realm FROM zerver_realm
JOIN zerver_stream JOIN zerver_stream
ON ON
@@ -276,8 +271,8 @@ zerver_count_stream_by_realm = ZerverCountQuery(Stream, RealmCount, count_stream
# it uses 'message_type' from the subquery to fill in the subgroup column. # it uses 'message_type' from the subquery to fill in the subgroup column.
count_message_type_by_user_query = """ count_message_type_by_user_query = """
INSERT INTO analytics_usercount INSERT INTO analytics_usercount
(realm_id, user_id, value, property, subgroup, end_time, interval) (realm_id, user_id, value, property, subgroup, end_time)
SELECT realm_id, id, SUM(count) AS value, '%(property)s', message_type, %%(time_end)s, '%(interval)s' SELECT realm_id, id, SUM(count) AS value, '%(property)s', message_type, %%(time_end)s
FROM FROM
( (
SELECT zerver_userprofile.realm_id, zerver_userprofile.id, count(*), SELECT zerver_userprofile.realm_id, zerver_userprofile.id, count(*),
@@ -314,9 +309,9 @@ zerver_count_message_type_by_user = ZerverCountQuery(Message, UserCount, count_m
# the UserProfile table, consider writing a new query for efficiency. # the UserProfile table, consider writing a new query for efficiency.
count_message_by_stream_query = """ count_message_by_stream_query = """
INSERT INTO analytics_streamcount INSERT INTO analytics_streamcount
(stream_id, realm_id, value, property, subgroup, end_time, interval) (stream_id, realm_id, value, property, subgroup, end_time)
SELECT SELECT
zerver_stream.id, zerver_stream.realm_id, count(*), '%(property)s', %(subgroup)s, %%(time_end)s, '%(interval)s' zerver_stream.id, zerver_stream.realm_id, count(*), '%(property)s', %(subgroup)s, %%(time_end)s
FROM zerver_stream FROM zerver_stream
JOIN zerver_recipient JOIN zerver_recipient
ON ON

View File

@@ -64,7 +64,7 @@ class Command(BaseCommand):
for subgroup, values in fixture_data.items(): for subgroup, values in fixture_data.items():
table.objects.bulk_create([ table.objects.bulk_create([
table(property=stat.property, subgroup=subgroup, end_time=end_time, table(property=stat.property, subgroup=subgroup, end_time=end_time,
interval=stat.interval, value=value, **id_args) value=value, **id_args)
for end_time, value in zip(end_times, values) if value != 0]) for end_time, value in zip(end_times, values) if value != 0])
stat = COUNT_STATS['active_users:is_bot:day'] stat = COUNT_STATS['active_users:is_bot:day']

View File

@@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.10.4 on 2017-01-16 20:50
from __future__ import unicode_literals
from django.conf import settings
from django.db import migrations
class Migration(migrations.Migration):
    """Drop the ``interval`` field from the analytics *Count models.

    For each of ``installationcount``, ``realmcount``, ``streamcount`` and
    ``usercount``, ``unique_together`` is first redefined without
    ``interval`` and then the ``interval`` field itself is removed.
    """

    dependencies = [('analytics', '0006_add_subgroup_to_unique_constraints')]

    # Within each pair the uniqueness constraint is altered before the field
    # is removed, so the constraint no longer names ``interval`` by the time
    # the field is dropped.
    operations = [
        # InstallationCount
        migrations.AlterUniqueTogether(
            name='installationcount',
            unique_together={('property', 'subgroup', 'end_time')},
        ),
        migrations.RemoveField(model_name='installationcount', name='interval'),

        # RealmCount
        migrations.AlterUniqueTogether(
            name='realmcount',
            unique_together={('realm', 'property', 'subgroup', 'end_time')},
        ),
        migrations.RemoveField(model_name='realmcount', name='interval'),

        # StreamCount
        migrations.AlterUniqueTogether(
            name='streamcount',
            unique_together={('stream', 'property', 'subgroup', 'end_time')},
        ),
        migrations.RemoveField(model_name='streamcount', name='interval'),

        # UserCount
        migrations.AlterUniqueTogether(
            name='usercount',
            unique_together={('user', 'property', 'subgroup', 'end_time')},
        ),
        migrations.RemoveField(model_name='usercount', name='interval'),
    ]

View File

@@ -46,7 +46,6 @@ class BaseCount(ModelReprMixin, models.Model):
property = models.CharField(max_length=32) # type: Text property = models.CharField(max_length=32) # type: Text
subgroup = models.CharField(max_length=16, null=True) # type: Text subgroup = models.CharField(max_length=16, null=True) # type: Text
end_time = models.DateTimeField() # type: datetime.datetime end_time = models.DateTimeField() # type: datetime.datetime
interval = models.CharField(max_length=8) # type: Text
value = models.BigIntegerField() # type: int value = models.BigIntegerField() # type: int
anomaly = models.ForeignKey(Anomaly, null=True) # type: Optional[Anomaly] anomaly = models.ForeignKey(Anomaly, null=True) # type: Optional[Anomaly]
@@ -66,7 +65,7 @@ class BaseCount(ModelReprMixin, models.Model):
class InstallationCount(BaseCount): class InstallationCount(BaseCount):
class Meta(object): class Meta(object):
unique_together = ("property", "subgroup", "end_time", "interval") unique_together = ("property", "subgroup", "end_time")
@staticmethod @staticmethod
def extended_id(): def extended_id():
@@ -86,7 +85,7 @@ class RealmCount(BaseCount):
realm = models.ForeignKey(Realm) realm = models.ForeignKey(Realm)
class Meta(object): class Meta(object):
unique_together = ("realm", "property", "subgroup", "end_time", "interval") unique_together = ("realm", "property", "subgroup", "end_time")
@staticmethod @staticmethod
def extended_id(): def extended_id():
@@ -107,7 +106,7 @@ class UserCount(BaseCount):
realm = models.ForeignKey(Realm) realm = models.ForeignKey(Realm)
class Meta(object): class Meta(object):
unique_together = ("user", "property", "subgroup", "end_time", "interval") unique_together = ("user", "property", "subgroup", "end_time")
@staticmethod @staticmethod
def extended_id(): def extended_id():
@@ -128,7 +127,7 @@ class StreamCount(BaseCount):
realm = models.ForeignKey(Realm) realm = models.ForeignKey(Realm)
class Meta(object): class Meta(object):
unique_together = ("stream", "property", "subgroup", "end_time", "interval") unique_together = ("stream", "property", "subgroup", "end_time")
@staticmethod @staticmethod
def extended_id(): def extended_id():

View File

@@ -149,8 +149,8 @@ class AnalyticsTestCase(TestCase):
class TestProcessCountStat(AnalyticsTestCase): class TestProcessCountStat(AnalyticsTestCase):
def make_dummy_count_stat(self, current_time): def make_dummy_count_stat(self, current_time):
# type: (datetime) -> CountStat # type: (datetime) -> CountStat
dummy_query = """INSERT INTO analytics_realmcount (realm_id, property, end_time, interval, value) dummy_query = """INSERT INTO analytics_realmcount (realm_id, property, end_time, value)
VALUES (1, 'test stat', '%(end_time)s','hour', 22)""" % {'end_time': current_time} VALUES (1, 'test stat', '%(end_time)s', 22)""" % {'end_time': current_time}
count_stat = CountStat('test stat', ZerverCountQuery(Recipient, UserCount, dummy_query), count_stat = CountStat('test stat', ZerverCountQuery(Recipient, UserCount, dummy_query),
{}, None, CountStat.HOUR, False) {}, None, CountStat.HOUR, False)
return count_stat return count_stat