# Patch summary
# -------------
# Adds an optional "subgroup" dimension to the analytics count tables.
#
#   * analytics/models.py + migration 0004: add a nullable
#     CharField(max_length=16) column `subgroup` to BaseCount and to the
#     installationcount / realmcount / streamcount / usercount tables.
#   * analytics/lib/counts.py:
#       - CountStat takes a new positional argument `group_by`, placed
#         before `frequency`.  It is either None or a (model class, column
#         name) pair.
#       - do_pull_from_zerver turns `group_by` into the SQL expression
#         "<db_table>.<column>" (or the literal NULL when group_by is None)
#         and substitutes it into each ZerverCountQuery template as
#         %(subgroup)s, plus an extra ", <expr>" GROUP BY term as
#         %(group_by_clause)s.
#       - do_aggregate_to_summary_table carries `subgroup` through the
#         RealmCount and InstallationCount roll-ups and groups by it.
#       - COUNT_STATS replaces active_humans / active_bots with
#         active_users_by_is_bot and adds messages_sent_by_is_bot.
#   * analytics/tests/test_counts.py: existing CountStat(...) calls pass
#     None for the new `group_by` argument.
#
# Review fixes folded into this version of the patch
# --------------------------------------------------
#   * `group_by` is annotated as Optional[Tuple[Type[models.Model], str]]:
#     callers pass the model class (e.g. UserProfile) and the code reads
#     group_by[0]._meta.db_table, so the element is a class, not an instance.
#   * `Dict` is added to the `typing` import, since the __init__ type
#     comment references it.
#
# Review notes
# ------------
#   * NOTE(review): `group_by` was inserted in the middle of CountStat's
#     positional parameters, so any caller outside this patch that still
#     passes (property, query, filter_args, frequency, is_gauge) will break.
#   * NOTE(review): if the *Count models declare a unique_together that
#     omits `subgroup` (their Meta is not visible in this patch), writing
#     one row per subgroup for the same realm/property/end_time/interval
#     would violate it -- confirm against analytics/models.py.
#   * Indentation and blank context lines were reconstructed from the hunk
#     line counts; adjust whitespace if a hunk fails to apply.  The `index`
#     blob hashes are those of the originally submitted patch.

diff --git a/analytics/lib/counts.py b/analytics/lib/counts.py
index fff495fe7f..5b42ac1078 100644
--- a/analytics/lib/counts.py
+++ b/analytics/lib/counts.py
@@ -8,7 +8,7 @@ from analytics.models import InstallationCount, RealmCount, \
 from zerver.models import Realm, UserProfile, Message, Stream, models
 from zerver.lib.timestamp import floor_to_day
 
-from typing import Any, Optional, Type
+from typing import Any, Dict, Optional, Type, Tuple
 from six import text_type
 
 import logging
@@ -36,12 +36,13 @@ class CountStat(object):
     # Allowed intervals are HOUR, DAY, and, GAUGE
     GAUGE = 'gauge'
 
-    def __init__(self, property, zerver_count_query, filter_args, frequency, is_gauge):
-        # type: (text_type, ZerverCountQuery, Dict[str, bool], str, bool) -> None
+    def __init__(self, property, zerver_count_query, filter_args, group_by, frequency, is_gauge):
+        # type: (text_type, ZerverCountQuery, Dict[str, bool], Optional[Tuple[Type[models.Model], str]], str, bool) -> None
         self.property = property
         self.zerver_count_query = zerver_count_query
         # might have to do something different for bitfields
         self.filter_args = filter_args
+        self.group_by = group_by
         if frequency not in self.FREQUENCIES:
             raise ValueError("Unknown frequency: %s" % (frequency,))
         self.frequency = frequency
@@ -120,9 +121,10 @@ def do_aggregate_to_summary_table(stat, end_time, interval):
     if analytics_table in (UserCount, StreamCount):
         realmcount_query = """
             INSERT INTO analytics_realmcount
-                (realm_id, value, property, end_time, interval)
+                (realm_id, value, property, subgroup, end_time, interval)
             SELECT
-                zerver_realm.id, COALESCE(sum(%(analytics_table)s.value), 0), '%(property)s', %%(end_time)s, '%(interval)s'
+                zerver_realm.id, COALESCE(sum(%(analytics_table)s.value), 0), '%(property)s',
+                %(analytics_table)s.subgroup, %%(end_time)s, '%(interval)s'
             FROM zerver_realm
             LEFT JOIN %(analytics_table)s
             ON
@@ -132,11 +134,10 @@ def do_aggregate_to_summary_table(stat, end_time, interval):
                 %(analytics_table)s.end_time = %%(end_time)s AND
                 %(analytics_table)s.interval = '%(interval)s'
             )
-            GROUP BY zerver_realm.id
+            GROUP BY zerver_realm.id, %(analytics_table)s.subgroup
         """ % {'analytics_table' : analytics_table._meta.db_table,
                'property' : stat.property,
                'interval' : interval}
-
         start = time.time()
         cursor.execute(realmcount_query, {'end_time': end_time})
         end = time.time()
@@ -145,19 +146,18 @@ def do_aggregate_to_summary_table(stat, end_time, interval):
     # Aggregate into InstallationCount
     installationcount_query = """
         INSERT INTO analytics_installationcount
-            (value, property, end_time, interval)
+            (value, property, subgroup, end_time, interval)
         SELECT
-            COALESCE(sum(value), 0), '%(property)s', %%(end_time)s, '%(interval)s'
+            COALESCE(sum(value), 0), '%(property)s', analytics_realmcount.subgroup, %%(end_time)s, '%(interval)s'
         FROM analytics_realmcount
         WHERE
         (
             property = '%(property)s' AND
             end_time = %%(end_time)s AND
             interval = '%(interval)s'
-        )
+        ) GROUP BY analytics_realmcount.subgroup
     """ % {'property': stat.property,
            'interval': interval}
-
     start = time.time()
     cursor.execute(installationcount_query, {'end_time': end_time})
     end = time.time()
@@ -170,13 +170,22 @@ def do_pull_from_zerver(stat, start_time, end_time, interval):
     zerver_table = stat.zerver_count_query.zerver_table._meta.db_table # type: ignore
     join_args = ' '.join('AND %s.%s = %s' % (zerver_table, key, value) \
                          for key, value in stat.filter_args.items())
+    if stat.group_by is None:
+        subgroup = 'NULL'
+        group_by_clause = ''
+    else:
+        subgroup = '%s.%s' % (stat.group_by[0]._meta.db_table, stat.group_by[1])
+        group_by_clause = ', ' + subgroup
+
     # We do string replacement here because passing join_args as a param
     # may result in problems when running cursor.execute; we do
     # the string formatting prior so that cursor.execute runs it as sql
     query_ = stat.zerver_count_query.query % {'zerver_table' : zerver_table,
                                               'property' : stat.property,
                                               'interval' : interval,
-                                              'join_args' : join_args}
+                                              'join_args' : join_args,
+                                              'subgroup': subgroup,
+                                              'group_by_clause': group_by_clause}
     cursor = connection.cursor()
     start = time.time()
     cursor.execute(query_, {'time_start': start_time, 'time_end': end_time})
@@ -186,9 +195,9 @@ def do_pull_from_zerver(stat, start_time, end_time, interval):
 
 count_user_by_realm_query = """
     INSERT INTO analytics_realmcount
-        (realm_id, value, property, end_time, interval)
+        (realm_id, value, property, subgroup, end_time, interval)
     SELECT
-        zerver_realm.id, count(%(zerver_table)s),'%(property)s', %%(time_end)s, '%(interval)s'
+        zerver_realm.id, count(%(zerver_table)s),'%(property)s', %(subgroup)s, %%(time_end)s, '%(interval)s'
     FROM zerver_realm
     LEFT JOIN zerver_userprofile
     ON
@@ -200,16 +209,16 @@ count_user_by_realm_query = """
     )
     WHERE
         zerver_realm.date_created < %%(time_end)s
-    GROUP BY zerver_realm.id
+    GROUP BY zerver_realm.id %(group_by_clause)s
 """
 zerver_count_user_by_realm = ZerverCountQuery(UserProfile, RealmCount, count_user_by_realm_query)
 
 # currently .sender_id is only Message specific thing
 count_message_by_user_query = """
     INSERT INTO analytics_usercount
-        (user_id, realm_id, value, property, end_time, interval)
+        (user_id, realm_id, value, property, subgroup, end_time, interval)
     SELECT
-        zerver_userprofile.id, zerver_userprofile.realm_id, count(*), '%(property)s', %%(time_end)s, '%(interval)s'
+        zerver_userprofile.id, zerver_userprofile.realm_id, count(*), '%(property)s', %(subgroup)s, %%(time_end)s, '%(interval)s'
     FROM zerver_userprofile
     JOIN zerver_message
     ON
@@ -221,15 +230,15 @@ count_message_by_user_query = """
     )
     WHERE
         zerver_userprofile.date_joined < %%(time_end)s
-    GROUP BY zerver_userprofile.id
+    GROUP BY zerver_userprofile.id %(group_by_clause)s
 """
 zerver_count_message_by_user = ZerverCountQuery(Message, UserCount, count_message_by_user_query)
 
 count_message_by_stream_query = """
     INSERT INTO analytics_streamcount
-        (stream_id, realm_id, value, property, end_time, interval)
+        (stream_id, realm_id, value, property, subgroup, end_time, interval)
     SELECT
-        zerver_stream.id, zerver_stream.realm_id, count(*), '%(property)s', %%(time_end)s, '%(interval)s'
+        zerver_stream.id, zerver_stream.realm_id, count(*), '%(property)s', %(subgroup)s, %%(time_end)s, '%(interval)s'
     FROM zerver_stream
     INNER JOIN zerver_recipient
     ON
@@ -246,15 +255,15 @@ count_message_by_stream_query = """
         zerver_stream.date_created < %%(time_end)s
         %(join_args)s
     )
-    GROUP BY zerver_stream.id
+    GROUP BY zerver_stream.id %(group_by_clause)s
 """
 zerver_count_message_by_stream = ZerverCountQuery(Message, StreamCount, count_message_by_stream_query)
 
 count_stream_by_realm_query = """
     INSERT INTO analytics_realmcount
-        (realm_id, value, property, end_time, interval)
+        (realm_id, value, property, subgroup, end_time, interval)
     SELECT
-        zerver_realm.id, count(*), '%(property)s', %%(time_end)s, '%(interval)s'
+        zerver_realm.id, count(*), '%(property)s', %(subgroup)s, %%(time_end)s, '%(interval)s'
     FROM zerver_realm
     LEFT JOIN zerver_stream
     ON
@@ -266,13 +275,14 @@ count_stream_by_realm_query = """
     )
     WHERE
         zerver_realm.date_created < %%(time_end)s
-    GROUP BY zerver_realm.id
+    GROUP BY zerver_realm.id %(group_by_clause)s
 """
 zerver_count_stream_by_realm = ZerverCountQuery(Stream, RealmCount, count_stream_by_realm_query)
 
 COUNT_STATS = {
-    'active_humans': CountStat('active_humans', zerver_count_user_by_realm,
-                               {'is_bot': False, 'is_active': True}, CountStat.DAY, True),
-    'active_bots': CountStat('active_bots', zerver_count_user_by_realm,
-                             {'is_bot': True, 'is_active': True}, CountStat.DAY, True),
-    'messages_sent': CountStat('messages_sent', zerver_count_message_by_user, {}, CountStat.HOUR, False)}
+    'active_users_by_is_bot': CountStat('active_users_by_is_bot', zerver_count_user_by_realm,
+                                        {'is_active': True}, (UserProfile, 'is_bot'), CountStat.DAY, True),
+    'messages_sent': CountStat('messages_sent', zerver_count_message_by_user, {}, None,
+                               CountStat.HOUR, False),
+    'messages_sent_by_is_bot': CountStat('messages_sent_by_is_bot', zerver_count_message_by_user, {}, (UserProfile, 'is_bot'),
+                                         CountStat.DAY, False)}
diff --git a/analytics/migrations/0004_add_subgroup.py b/analytics/migrations/0004_add_subgroup.py
new file mode 100644
index 0000000000..ce2f8531b0
--- /dev/null
+++ b/analytics/migrations/0004_add_subgroup.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('analytics', '0003_fillstate'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='installationcount',
+            name='subgroup',
+            field=models.CharField(max_length=16, null=True),
+        ),
+        migrations.AddField(
+            model_name='realmcount',
+            name='subgroup',
+            field=models.CharField(max_length=16, null=True),
+        ),
+        migrations.AddField(
+            model_name='streamcount',
+            name='subgroup',
+            field=models.CharField(max_length=16, null=True),
+        ),
+        migrations.AddField(
+            model_name='usercount',
+            name='subgroup',
+            field=models.CharField(max_length=16, null=True),
+        ),
+    ]
diff --git a/analytics/models.py b/analytics/models.py
index d6e6e007ab..0e184b043f 100644
--- a/analytics/models.py
+++ b/analytics/models.py
@@ -52,6 +52,7 @@ class BaseCount(ModelReprMixin, models.Model):
     # the order of the columns in the migration to make sure they
     # match how you'd like the table to be arranged.
     property = models.CharField(max_length=40) # type: text_type
+    subgroup = models.CharField(max_length=16, null=True) # type: text_type
     end_time = models.DateTimeField() # type: datetime.datetime
     interval = models.CharField(max_length=20) # type: text_type
     value = models.BigIntegerField() # type: int
diff --git a/analytics/tests/test_counts.py b/analytics/tests/test_counts.py
index 25f2a5430d..4b06b26266 100644
--- a/analytics/tests/test_counts.py
+++ b/analytics/tests/test_counts.py
@@ -25,7 +25,7 @@ class AnalyticsTestCase(TestCase):
     TIME_LAST_HOUR = TIME_ZERO - HOUR
 
     count_stat = CountStat('test stat', ZerverCountQuery(Recipient, UserCount, 'select 0'),
-                           {}, CountStat.HOUR, False)
+                           {}, None, CountStat.HOUR, False)
 
     def setUp(self):
         # type: () -> None
@@ -90,7 +90,7 @@ class TestUpdateAnalyticsCounts(AnalyticsTestCase):
         # might change if we refactor count_query
 
         stat = CountStat('test_stat_write', zerver_count_stream_by_realm,
-                         {'invite_only': False}, CountStat.HOUR, False)
+                         {'invite_only': False}, None, CountStat.HOUR, False)
 
         # add some stuff to zerver_*
         self.create_stream(name='stream1')
@@ -105,7 +105,7 @@ class TestUpdateAnalyticsCounts(AnalyticsTestCase):
 
     def test_update_analytics_tables(self):
         # type: () -> None
-        stat = CountStat('test_messages_sent', zerver_count_message_by_user, {}, CountStat.HOUR, False)
+        stat = CountStat('test_messages_sent', zerver_count_message_by_user, {}, None, CountStat.HOUR, False)
 
         user1 = self.create_user('email1')
         user2 = self.create_user('email2')
@@ -167,7 +167,7 @@ class TestProcessCountStat(AnalyticsTestCase):
     # test users added in last hour
     def test_add_new_users(self):
         # type: () -> None
-        stat = CountStat('add_new_user_test', zerver_count_user_by_realm, {}, CountStat.HOUR, False)
+        stat = CountStat('add_new_user_test', zerver_count_user_by_realm, {}, None, CountStat.HOUR, False)
 
         # add new users to realm in last hour
         self.create_user('email1')
@@ -185,7 +185,7 @@ class TestProcessCountStat(AnalyticsTestCase):
     def test_count_before_realm_creation(self):
         # type: () -> None
         stat = CountStat('test_active_humans', zerver_count_user_by_realm,
-                         {'is_bot': False, 'is_active': True}, CountStat.HOUR, False)
+                         {'is_bot': False, 'is_active': True}, None, CountStat.HOUR, False)
 
         realm = Realm.objects.create(domain='domain', name='name', date_created=self.TIME_ZERO)
         self.create_user('email', realm=realm)
@@ -198,7 +198,7 @@ class TestProcessCountStat(AnalyticsTestCase):
         # type: () -> None
         # test that rows with empty counts are returned if realm exists
         stat = CountStat('test_active_humans', zerver_count_user_by_realm,
-                         {'is_bot': False, 'is_active': True}, CountStat.HOUR, False)
+                         {'is_bot': False, 'is_active': True}, None, CountStat.HOUR, False)
 
         do_fill_count_stat_at_hour(stat, self.TIME_ZERO)
         self.assertCountEquals(RealmCount, 'test_active_humans', 0)
@@ -216,7 +216,7 @@ class TestAggregates(AnalyticsTestCase):
 class TestXByYQueries(AnalyticsTestCase):
     def test_message_to_stream_aggregation(self):
         # type: () -> None
-        stat = CountStat('test_messages_to_stream', zerver_count_message_by_stream, {}, CountStat.HOUR, False)
+        stat = CountStat('test_messages_to_stream', zerver_count_message_by_stream, {}, None, CountStat.HOUR, False)
 
         # write some messages
         user = self.create_user('email')
@@ -236,12 +236,11 @@ class TestCountStats(AnalyticsTestCase):
     def test_human_and_bot_count_by_realm(self):
         # type: () -> None
         stats = [
-            CountStat('test_active_humans', zerver_count_user_by_realm, {'is_bot': False, 'is_active': True},
+            CountStat('test_active_humans', zerver_count_user_by_realm, {'is_bot': False, 'is_active': True}, None,
                       CountStat.HOUR, False),
-            CountStat('test_active_bots', zerver_count_user_by_realm, {'is_bot': True, 'is_active': True},
+            CountStat('test_active_bots', zerver_count_user_by_realm, {'is_bot': True, 'is_active': True}, None,
                       CountStat.HOUR, False)]
 
-        # TODO these dates should probably be explicit, since the default args for the commands are timezone.now() dependent.
         self.create_user('email1-bot', is_bot=True)
         self.create_user('email2-bot', is_bot=True)
         self.create_user('email3-human', is_bot=False)