mirror of
https://github.com/zulip/zulip.git
synced 2025-11-04 14:03:30 +00:00
analytics: Add subgroup column to analytics tables.
This is a major change to the analytics schema, and is the first step in a
number of refactorings and performance improvements. For instance, it allows:
* Grouping sets of similar CountStats in the *Count tables. For instance,
active{_humans,_bots} will now have the same property, but have different
subgroup values.
* Combining queries that differ only in their value for one filter clause, so
that we make fewer passes through the zerver tables. For instance, instead
of running a query for each of messages_sent_to_public_streams and
messages_sent_to_private_streams, we can now run a single query with a
group by on Stream.invite_only, and store the group by value in the
subgroup column.
This commit is contained in:
@@ -8,7 +8,7 @@ from analytics.models import InstallationCount, RealmCount, \
|
||||
from zerver.models import Realm, UserProfile, Message, Stream, models
|
||||
from zerver.lib.timestamp import floor_to_day
|
||||
|
||||
from typing import Any, Optional, Type
|
||||
from typing import Any, Optional, Type, Tuple
|
||||
from six import text_type
|
||||
|
||||
import logging
|
||||
@@ -36,12 +36,13 @@ class CountStat(object):
|
||||
# Allowed intervals are HOUR, DAY, and GAUGE
|
||||
GAUGE = 'gauge'
|
||||
|
||||
def __init__(self, property, zerver_count_query, filter_args, frequency, is_gauge):
|
||||
# type: (text_type, ZerverCountQuery, Dict[str, bool], str, bool) -> None
|
||||
def __init__(self, property, zerver_count_query, filter_args, group_by, frequency, is_gauge):
|
||||
# type: (text_type, ZerverCountQuery, Dict[str, bool], Optional[Tuple[models.Model, str]], str, bool) -> None
|
||||
self.property = property
|
||||
self.zerver_count_query = zerver_count_query
|
||||
# might have to do something different for bitfields
|
||||
self.filter_args = filter_args
|
||||
self.group_by = group_by
|
||||
if frequency not in self.FREQUENCIES:
|
||||
raise ValueError("Unknown frequency: %s" % (frequency,))
|
||||
self.frequency = frequency
|
||||
@@ -120,9 +121,10 @@ def do_aggregate_to_summary_table(stat, end_time, interval):
|
||||
if analytics_table in (UserCount, StreamCount):
|
||||
realmcount_query = """
|
||||
INSERT INTO analytics_realmcount
|
||||
(realm_id, value, property, end_time, interval)
|
||||
(realm_id, value, property, subgroup, end_time, interval)
|
||||
SELECT
|
||||
zerver_realm.id, COALESCE(sum(%(analytics_table)s.value), 0), '%(property)s', %%(end_time)s, '%(interval)s'
|
||||
zerver_realm.id, COALESCE(sum(%(analytics_table)s.value), 0), '%(property)s',
|
||||
%(analytics_table)s.subgroup, %%(end_time)s, '%(interval)s'
|
||||
FROM zerver_realm
|
||||
LEFT JOIN %(analytics_table)s
|
||||
ON
|
||||
@@ -132,11 +134,10 @@ def do_aggregate_to_summary_table(stat, end_time, interval):
|
||||
%(analytics_table)s.end_time = %%(end_time)s AND
|
||||
%(analytics_table)s.interval = '%(interval)s'
|
||||
)
|
||||
GROUP BY zerver_realm.id
|
||||
GROUP BY zerver_realm.id, %(analytics_table)s.subgroup
|
||||
""" % {'analytics_table' : analytics_table._meta.db_table,
|
||||
'property' : stat.property,
|
||||
'interval' : interval}
|
||||
|
||||
start = time.time()
|
||||
cursor.execute(realmcount_query, {'end_time': end_time})
|
||||
end = time.time()
|
||||
@@ -145,19 +146,18 @@ def do_aggregate_to_summary_table(stat, end_time, interval):
|
||||
# Aggregate into InstallationCount
|
||||
installationcount_query = """
|
||||
INSERT INTO analytics_installationcount
|
||||
(value, property, end_time, interval)
|
||||
(value, property, subgroup, end_time, interval)
|
||||
SELECT
|
||||
COALESCE(sum(value), 0), '%(property)s', %%(end_time)s, '%(interval)s'
|
||||
COALESCE(sum(value), 0), '%(property)s', analytics_realmcount.subgroup, %%(end_time)s, '%(interval)s'
|
||||
FROM analytics_realmcount
|
||||
WHERE
|
||||
(
|
||||
property = '%(property)s' AND
|
||||
end_time = %%(end_time)s AND
|
||||
interval = '%(interval)s'
|
||||
)
|
||||
) GROUP BY analytics_realmcount.subgroup
|
||||
""" % {'property': stat.property,
|
||||
'interval': interval}
|
||||
|
||||
start = time.time()
|
||||
cursor.execute(installationcount_query, {'end_time': end_time})
|
||||
end = time.time()
|
||||
@@ -170,13 +170,22 @@ def do_pull_from_zerver(stat, start_time, end_time, interval):
|
||||
zerver_table = stat.zerver_count_query.zerver_table._meta.db_table # type: ignore
|
||||
join_args = ' '.join('AND %s.%s = %s' % (zerver_table, key, value) \
|
||||
for key, value in stat.filter_args.items())
|
||||
if stat.group_by is None:
|
||||
subgroup = 'NULL'
|
||||
group_by_clause = ''
|
||||
else:
|
||||
subgroup = '%s.%s' % (stat.group_by[0]._meta.db_table, stat.group_by[1])
|
||||
group_by_clause = ', ' + subgroup
|
||||
|
||||
# We do string replacement here because passing join_args as a param
|
||||
# may result in problems when running cursor.execute; we do
|
||||
# the string formatting prior so that cursor.execute runs it as sql
|
||||
query_ = stat.zerver_count_query.query % {'zerver_table' : zerver_table,
|
||||
'property' : stat.property,
|
||||
'interval' : interval,
|
||||
'join_args' : join_args}
|
||||
'join_args' : join_args,
|
||||
'subgroup': subgroup,
|
||||
'group_by_clause': group_by_clause}
|
||||
cursor = connection.cursor()
|
||||
start = time.time()
|
||||
cursor.execute(query_, {'time_start': start_time, 'time_end': end_time})
|
||||
@@ -186,9 +195,9 @@ def do_pull_from_zerver(stat, start_time, end_time, interval):
|
||||
|
||||
count_user_by_realm_query = """
|
||||
INSERT INTO analytics_realmcount
|
||||
(realm_id, value, property, end_time, interval)
|
||||
(realm_id, value, property, subgroup, end_time, interval)
|
||||
SELECT
|
||||
zerver_realm.id, count(%(zerver_table)s),'%(property)s', %%(time_end)s, '%(interval)s'
|
||||
zerver_realm.id, count(%(zerver_table)s),'%(property)s', %(subgroup)s, %%(time_end)s, '%(interval)s'
|
||||
FROM zerver_realm
|
||||
LEFT JOIN zerver_userprofile
|
||||
ON
|
||||
@@ -200,16 +209,16 @@ count_user_by_realm_query = """
|
||||
)
|
||||
WHERE
|
||||
zerver_realm.date_created < %%(time_end)s
|
||||
GROUP BY zerver_realm.id
|
||||
GROUP BY zerver_realm.id %(group_by_clause)s
|
||||
"""
|
||||
zerver_count_user_by_realm = ZerverCountQuery(UserProfile, RealmCount, count_user_by_realm_query)
|
||||
|
||||
# currently, .sender_id is the only Message-specific thing
|
||||
count_message_by_user_query = """
|
||||
INSERT INTO analytics_usercount
|
||||
(user_id, realm_id, value, property, end_time, interval)
|
||||
(user_id, realm_id, value, property, subgroup, end_time, interval)
|
||||
SELECT
|
||||
zerver_userprofile.id, zerver_userprofile.realm_id, count(*), '%(property)s', %%(time_end)s, '%(interval)s'
|
||||
zerver_userprofile.id, zerver_userprofile.realm_id, count(*), '%(property)s', %(subgroup)s, %%(time_end)s, '%(interval)s'
|
||||
FROM zerver_userprofile
|
||||
JOIN zerver_message
|
||||
ON
|
||||
@@ -221,15 +230,15 @@ count_message_by_user_query = """
|
||||
)
|
||||
WHERE
|
||||
zerver_userprofile.date_joined < %%(time_end)s
|
||||
GROUP BY zerver_userprofile.id
|
||||
GROUP BY zerver_userprofile.id %(group_by_clause)s
|
||||
"""
|
||||
zerver_count_message_by_user = ZerverCountQuery(Message, UserCount, count_message_by_user_query)
|
||||
|
||||
count_message_by_stream_query = """
|
||||
INSERT INTO analytics_streamcount
|
||||
(stream_id, realm_id, value, property, end_time, interval)
|
||||
(stream_id, realm_id, value, property, subgroup, end_time, interval)
|
||||
SELECT
|
||||
zerver_stream.id, zerver_stream.realm_id, count(*), '%(property)s', %%(time_end)s, '%(interval)s'
|
||||
zerver_stream.id, zerver_stream.realm_id, count(*), '%(property)s', %(subgroup)s, %%(time_end)s, '%(interval)s'
|
||||
FROM zerver_stream
|
||||
INNER JOIN zerver_recipient
|
||||
ON
|
||||
@@ -246,15 +255,15 @@ count_message_by_stream_query = """
|
||||
zerver_stream.date_created < %%(time_end)s
|
||||
%(join_args)s
|
||||
)
|
||||
GROUP BY zerver_stream.id
|
||||
GROUP BY zerver_stream.id %(group_by_clause)s
|
||||
"""
|
||||
zerver_count_message_by_stream = ZerverCountQuery(Message, StreamCount, count_message_by_stream_query)
|
||||
|
||||
count_stream_by_realm_query = """
|
||||
INSERT INTO analytics_realmcount
|
||||
(realm_id, value, property, end_time, interval)
|
||||
(realm_id, value, property, subgroup, end_time, interval)
|
||||
SELECT
|
||||
zerver_realm.id, count(*), '%(property)s', %%(time_end)s, '%(interval)s'
|
||||
zerver_realm.id, count(*), '%(property)s', %(subgroup)s, %%(time_end)s, '%(interval)s'
|
||||
FROM zerver_realm
|
||||
LEFT JOIN zerver_stream
|
||||
ON
|
||||
@@ -266,13 +275,14 @@ count_stream_by_realm_query = """
|
||||
)
|
||||
WHERE
|
||||
zerver_realm.date_created < %%(time_end)s
|
||||
GROUP BY zerver_realm.id
|
||||
GROUP BY zerver_realm.id %(group_by_clause)s
|
||||
"""
|
||||
zerver_count_stream_by_realm = ZerverCountQuery(Stream, RealmCount, count_stream_by_realm_query)
|
||||
|
||||
COUNT_STATS = {
|
||||
'active_humans': CountStat('active_humans', zerver_count_user_by_realm,
|
||||
{'is_bot': False, 'is_active': True}, CountStat.DAY, True),
|
||||
'active_bots': CountStat('active_bots', zerver_count_user_by_realm,
|
||||
{'is_bot': True, 'is_active': True}, CountStat.DAY, True),
|
||||
'messages_sent': CountStat('messages_sent', zerver_count_message_by_user, {}, CountStat.HOUR, False)}
|
||||
'active_users_by_is_bot': CountStat('active_users_by_is_bot', zerver_count_user_by_realm,
|
||||
{'is_active': True}, (UserProfile, 'is_bot'), CountStat.DAY, True),
|
||||
'messages_sent': CountStat('messages_sent', zerver_count_message_by_user, {}, None,
|
||||
CountStat.HOUR, False),
|
||||
'messages_sent_by_is_bot': CountStat('messages_sent_by_is_bot', zerver_count_message_by_user, {}, (UserProfile, 'is_bot'),
|
||||
CountStat.DAY, False)}
|
||||
|
||||
34
analytics/migrations/0004_add_subgroup.py
Normal file
34
analytics/migrations/0004_add_subgroup.py
Normal file
@@ -0,0 +1,34 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Add a nullable ``subgroup`` column to each analytics count table.

    The same ``CharField(max_length=16, null=True)`` is added to the
    installationcount, realmcount, streamcount and usercount models.
    """

    dependencies = [('analytics', '0003_fillstate')]

    # One AddField per count table. A generator (rather than a list
    # comprehension) keeps the loop variable out of the class namespace
    # on Python 2, and each iteration builds its own field instance.
    operations = list(
        migrations.AddField(
            model_name=count_table,
            name='subgroup',
            field=models.CharField(max_length=16, null=True),
        )
        for count_table in (
            'installationcount',
            'realmcount',
            'streamcount',
            'usercount',
        )
    )
|
||||
@@ -52,6 +52,7 @@ class BaseCount(ModelReprMixin, models.Model):
|
||||
# the order of the columns in the migration to make sure they
|
||||
# match how you'd like the table to be arranged.
|
||||
property = models.CharField(max_length=40) # type: text_type
|
||||
subgroup = models.CharField(max_length=16, null=True) # type: text_type
|
||||
end_time = models.DateTimeField() # type: datetime.datetime
|
||||
interval = models.CharField(max_length=20) # type: text_type
|
||||
value = models.BigIntegerField() # type: int
|
||||
|
||||
@@ -25,7 +25,7 @@ class AnalyticsTestCase(TestCase):
|
||||
TIME_LAST_HOUR = TIME_ZERO - HOUR
|
||||
|
||||
count_stat = CountStat('test stat', ZerverCountQuery(Recipient, UserCount, 'select 0'),
|
||||
{}, CountStat.HOUR, False)
|
||||
{}, None, CountStat.HOUR, False)
|
||||
|
||||
def setUp(self):
|
||||
# type: () -> None
|
||||
@@ -90,7 +90,7 @@ class TestUpdateAnalyticsCounts(AnalyticsTestCase):
|
||||
# might change if we refactor count_query
|
||||
|
||||
stat = CountStat('test_stat_write', zerver_count_stream_by_realm,
|
||||
{'invite_only': False}, CountStat.HOUR, False)
|
||||
{'invite_only': False}, None, CountStat.HOUR, False)
|
||||
|
||||
# add some stuff to zerver_*
|
||||
self.create_stream(name='stream1')
|
||||
@@ -105,7 +105,7 @@ class TestUpdateAnalyticsCounts(AnalyticsTestCase):
|
||||
|
||||
def test_update_analytics_tables(self):
|
||||
# type: () -> None
|
||||
stat = CountStat('test_messages_sent', zerver_count_message_by_user, {}, CountStat.HOUR, False)
|
||||
stat = CountStat('test_messages_sent', zerver_count_message_by_user, {}, None, CountStat.HOUR, False)
|
||||
|
||||
user1 = self.create_user('email1')
|
||||
user2 = self.create_user('email2')
|
||||
@@ -167,7 +167,7 @@ class TestProcessCountStat(AnalyticsTestCase):
|
||||
# test users added in last hour
|
||||
def test_add_new_users(self):
|
||||
# type: () -> None
|
||||
stat = CountStat('add_new_user_test', zerver_count_user_by_realm, {}, CountStat.HOUR, False)
|
||||
stat = CountStat('add_new_user_test', zerver_count_user_by_realm, {}, None, CountStat.HOUR, False)
|
||||
|
||||
# add new users to realm in last hour
|
||||
self.create_user('email1')
|
||||
@@ -185,7 +185,7 @@ class TestProcessCountStat(AnalyticsTestCase):
|
||||
def test_count_before_realm_creation(self):
|
||||
# type: () -> None
|
||||
stat = CountStat('test_active_humans', zerver_count_user_by_realm,
|
||||
{'is_bot': False, 'is_active': True}, CountStat.HOUR, False)
|
||||
{'is_bot': False, 'is_active': True}, None, CountStat.HOUR, False)
|
||||
|
||||
realm = Realm.objects.create(domain='domain', name='name', date_created=self.TIME_ZERO)
|
||||
self.create_user('email', realm=realm)
|
||||
@@ -198,7 +198,7 @@ class TestProcessCountStat(AnalyticsTestCase):
|
||||
# type: () -> None
|
||||
# test that rows with empty counts are returned if realm exists
|
||||
stat = CountStat('test_active_humans', zerver_count_user_by_realm,
|
||||
{'is_bot': False, 'is_active': True}, CountStat.HOUR, False)
|
||||
{'is_bot': False, 'is_active': True}, None, CountStat.HOUR, False)
|
||||
do_fill_count_stat_at_hour(stat, self.TIME_ZERO)
|
||||
self.assertCountEquals(RealmCount, 'test_active_humans', 0)
|
||||
|
||||
@@ -216,7 +216,7 @@ class TestAggregates(AnalyticsTestCase):
|
||||
class TestXByYQueries(AnalyticsTestCase):
|
||||
def test_message_to_stream_aggregation(self):
|
||||
# type: () -> None
|
||||
stat = CountStat('test_messages_to_stream', zerver_count_message_by_stream, {}, CountStat.HOUR, False)
|
||||
stat = CountStat('test_messages_to_stream', zerver_count_message_by_stream, {}, None, CountStat.HOUR, False)
|
||||
|
||||
# write some messages
|
||||
user = self.create_user('email')
|
||||
@@ -236,12 +236,11 @@ class TestCountStats(AnalyticsTestCase):
|
||||
def test_human_and_bot_count_by_realm(self):
|
||||
# type: () -> None
|
||||
stats = [
|
||||
CountStat('test_active_humans', zerver_count_user_by_realm, {'is_bot': False, 'is_active': True},
|
||||
CountStat('test_active_humans', zerver_count_user_by_realm, {'is_bot': False, 'is_active': True}, None,
|
||||
CountStat.HOUR, False),
|
||||
CountStat('test_active_bots', zerver_count_user_by_realm, {'is_bot': True, 'is_active': True},
|
||||
CountStat('test_active_bots', zerver_count_user_by_realm, {'is_bot': True, 'is_active': True}, None,
|
||||
CountStat.HOUR, False)]
|
||||
|
||||
# TODO these dates should probably be explicit, since the default args for the commands are timezone.now() dependent.
|
||||
self.create_user('email1-bot', is_bot=True)
|
||||
self.create_user('email2-bot', is_bot=True)
|
||||
self.create_user('email3-human', is_bot=False)
|
||||
|
||||
Reference in New Issue
Block a user