Files
zulip/analytics/lib/counts.py
Rishi Gupta 0596c4a810 analytics: Enforce various datetime arguments are in UTC.
Sort of a hacky hammer, but
* The original design of the analytics system mistakenly attempted to play
  nicely with non-UTC datetimes.
* Timezone errors are really hard to find and debug, and don't jump out that
  easily when reading code.

I don't know of any outstanding errors, but putting a few "assert this
timezone is in UTC" around will hopefully reduce the chance that there are
any current or future timezone errors.

Note that none of these functions are called outside of the analytics code
(and tests). This commit also doesn't change any current behavior, assuming
a database where all datetimes have been being stored in UTC.
2017-10-05 11:22:06 -07:00

536 lines
23 KiB
Python

from django.conf import settings
from django.db import connection, models
from django.db.models import F
from analytics.models import InstallationCount, RealmCount, \
UserCount, StreamCount, BaseCount, FillState, Anomaly, installation_epoch, \
last_successful_fill
from zerver.models import Realm, UserProfile, Message, Stream, \
UserActivityInterval, RealmAuditLog, models
from zerver.lib.timestamp import floor_to_day, floor_to_hour, ceiling_to_day, \
ceiling_to_hour
from typing import Any, Callable, Dict, List, Optional, Text, Tuple, Type, Union
from collections import defaultdict, OrderedDict
from datetime import timedelta, datetime
from zerver.lib.logging_util import create_logger
import time
## Logging setup ##
logger = create_logger('zulip.management', settings.ANALYTICS_LOG_PATH, 'INFO')
# You can't subtract timedelta.max from a datetime, so use this instead
TIMEDELTA_MAX = timedelta(days=365*1000)
## Class definitions ##
class CountStat(object):
HOUR = 'hour'
DAY = 'day'
FREQUENCIES = frozenset([HOUR, DAY])
def __init__(self, property, data_collector, frequency, interval=None):
# type: (str, DataCollector, str, Optional[timedelta]) -> None
self.property = property
self.data_collector = data_collector
# might have to do something different for bitfields
if frequency not in self.FREQUENCIES:
raise AssertionError("Unknown frequency: %s" % (frequency,))
self.frequency = frequency
if interval is not None:
self.interval = interval
elif frequency == CountStat.HOUR:
self.interval = timedelta(hours=1)
else: # frequency == CountStat.DAY
self.interval = timedelta(days=1)
def __unicode__(self):
# type: () -> Text
return u"<CountStat: %s>" % (self.property,)
class LoggingCountStat(CountStat):
def __init__(self, property, output_table, frequency):
# type: (str, Type[BaseCount], str) -> None
CountStat.__init__(self, property, DataCollector(output_table, None), frequency)
class DependentCountStat(CountStat):
def __init__(self, property, data_collector, frequency, interval=None, dependencies=[]):
# type: (str, DataCollector, str, Optional[timedelta], List[str]) -> None
CountStat.__init__(self, property, data_collector, frequency, interval=interval)
self.dependencies = dependencies
class DataCollector(object):
def __init__(self, output_table, pull_function):
# type: (Type[BaseCount], Optional[Callable[[str, datetime, datetime], int]]) -> None
self.output_table = output_table
self.pull_function = pull_function
## CountStat-level operations ##
def process_count_stat(stat, fill_to_time):
# type: (CountStat, datetime) -> None
if stat.frequency == CountStat.HOUR:
time_increment = timedelta(hours=1)
elif stat.frequency == CountStat.DAY:
time_increment = timedelta(days=1)
else:
raise AssertionError("Unknown frequency: %s" % (stat.frequency,))
if fill_to_time.tzinfo is None:
raise ValueError("fill_to_time must be timezone aware: %s" % (fill_to_time,))
if floor_to_hour(fill_to_time) != fill_to_time:
raise ValueError("fill_to_time must be on an hour boundary: %s" % (fill_to_time,))
fill_state = FillState.objects.filter(property=stat.property).first()
if fill_state is None:
currently_filled = installation_epoch()
fill_state = FillState.objects.create(property=stat.property,
end_time=currently_filled,
state=FillState.DONE)
logger.info("INITIALIZED %s %s" % (stat.property, currently_filled))
elif fill_state.state == FillState.STARTED:
logger.info("UNDO START %s %s" % (stat.property, fill_state.end_time))
do_delete_counts_at_hour(stat, fill_state.end_time)
currently_filled = fill_state.end_time - time_increment
do_update_fill_state(fill_state, currently_filled, FillState.DONE)
logger.info("UNDO DONE %s" % (stat.property,))
elif fill_state.state == FillState.DONE:
currently_filled = fill_state.end_time
else:
raise AssertionError("Unknown value for FillState.state: %s." % (fill_state.state,))
if isinstance(stat, DependentCountStat):
for dependency in stat.dependencies:
dependency_fill_time = last_successful_fill(dependency)
if dependency_fill_time is None:
logger.warning("DependentCountStat %s run before dependency %s." %
(stat.property, dependency))
return
fill_to_time = min(fill_to_time, dependency_fill_time)
currently_filled = currently_filled + time_increment
while currently_filled <= fill_to_time:
logger.info("START %s %s" % (stat.property, currently_filled))
start = time.time()
do_update_fill_state(fill_state, currently_filled, FillState.STARTED)
do_fill_count_stat_at_hour(stat, currently_filled)
do_update_fill_state(fill_state, currently_filled, FillState.DONE)
end = time.time()
currently_filled = currently_filled + time_increment
logger.info("DONE %s (%dms)" % (stat.property, (end-start)*1000))
def do_update_fill_state(fill_state, end_time, state):
# type: (FillState, datetime, int) -> None
fill_state.end_time = end_time
fill_state.state = state
fill_state.save()
# We assume end_time is valid (e.g. is on a day or hour boundary as appropriate)
# and is timezone aware. It is the caller's responsibility to enforce this!
def do_fill_count_stat_at_hour(stat, end_time):
# type: (CountStat, datetime) -> None
start_time = end_time - stat.interval
if not isinstance(stat, LoggingCountStat):
timer = time.time()
assert(stat.data_collector.pull_function is not None)
rows_added = stat.data_collector.pull_function(stat.property, start_time, end_time)
logger.info("%s run pull_function (%dms/%sr)" %
(stat.property, (time.time()-timer)*1000, rows_added))
do_aggregate_to_summary_table(stat, end_time)
def do_delete_counts_at_hour(stat, end_time):
# type: (CountStat, datetime) -> None
if isinstance(stat, LoggingCountStat):
InstallationCount.objects.filter(property=stat.property, end_time=end_time).delete()
if stat.data_collector.output_table in [UserCount, StreamCount]:
RealmCount.objects.filter(property=stat.property, end_time=end_time).delete()
else:
UserCount.objects.filter(property=stat.property, end_time=end_time).delete()
StreamCount.objects.filter(property=stat.property, end_time=end_time).delete()
RealmCount.objects.filter(property=stat.property, end_time=end_time).delete()
InstallationCount.objects.filter(property=stat.property, end_time=end_time).delete()
def do_aggregate_to_summary_table(stat, end_time):
# type: (CountStat, datetime) -> None
cursor = connection.cursor()
# Aggregate into RealmCount
output_table = stat.data_collector.output_table
if output_table in (UserCount, StreamCount):
realmcount_query = """
INSERT INTO analytics_realmcount
(realm_id, value, property, subgroup, end_time)
SELECT
zerver_realm.id, COALESCE(sum(%(output_table)s.value), 0), '%(property)s',
%(output_table)s.subgroup, %%(end_time)s
FROM zerver_realm
JOIN %(output_table)s
ON
zerver_realm.id = %(output_table)s.realm_id
WHERE
%(output_table)s.property = '%(property)s' AND
%(output_table)s.end_time = %%(end_time)s
GROUP BY zerver_realm.id, %(output_table)s.subgroup
""" % {'output_table': output_table._meta.db_table,
'property': stat.property}
start = time.time()
cursor.execute(realmcount_query, {'end_time': end_time})
end = time.time()
logger.info("%s RealmCount aggregation (%dms/%sr)" % (stat.property, (end-start)*1000, cursor.rowcount))
# Aggregate into InstallationCount
installationcount_query = """
INSERT INTO analytics_installationcount
(value, property, subgroup, end_time)
SELECT
sum(value), '%(property)s', analytics_realmcount.subgroup, %%(end_time)s
FROM analytics_realmcount
WHERE
property = '%(property)s' AND
end_time = %%(end_time)s
GROUP BY analytics_realmcount.subgroup
""" % {'property': stat.property}
start = time.time()
cursor.execute(installationcount_query, {'end_time': end_time})
end = time.time()
logger.info("%s InstallationCount aggregation (%dms/%sr)" % (stat.property, (end-start)*1000, cursor.rowcount))
cursor.close()
## Utility functions called from outside counts.py ##
# called from zerver/lib/actions.py; should not throw any errors
def do_increment_logging_stat(zerver_object, stat, subgroup, event_time, increment=1):
# type: (Union[Realm, UserProfile, Stream], CountStat, Optional[Union[str, int, bool]], datetime, int) -> None
table = stat.data_collector.output_table
if table == RealmCount:
id_args = {'realm': zerver_object}
elif table == UserCount:
id_args = {'realm': zerver_object.realm, 'user': zerver_object}
else: # StreamCount
id_args = {'realm': zerver_object.realm, 'stream': zerver_object}
if stat.frequency == CountStat.DAY:
end_time = ceiling_to_day(event_time)
else: # CountStat.HOUR:
end_time = ceiling_to_hour(event_time)
row, created = table.objects.get_or_create(
property=stat.property, subgroup=subgroup, end_time=end_time,
defaults={'value': increment}, **id_args)
if not created:
row.value = F('value') + increment
row.save(update_fields=['value'])
def do_drop_all_analytics_tables():
# type: () -> None
UserCount.objects.all().delete()
StreamCount.objects.all().delete()
RealmCount.objects.all().delete()
InstallationCount.objects.all().delete()
FillState.objects.all().delete()
Anomaly.objects.all().delete()
def do_drop_single_stat(property):
# type: (str) -> None
UserCount.objects.filter(property=property).delete()
StreamCount.objects.filter(property=property).delete()
RealmCount.objects.filter(property=property).delete()
InstallationCount.objects.filter(property=property).delete()
FillState.objects.filter(property=property).delete()
## DataCollector-level operations ##
def do_pull_by_sql_query(property, start_time, end_time, query, group_by):
# type: (str, datetime, datetime, str, Optional[Tuple[models.Model, str]]) -> int
if group_by is None:
subgroup = 'NULL'
group_by_clause = ''
else:
subgroup = '%s.%s' % (group_by[0]._meta.db_table, group_by[1])
group_by_clause = ', ' + subgroup
# We do string replacement here because cursor.execute will reject a
# group_by_clause given as a param.
# We pass in the datetimes as params to cursor.execute so that we don't have to
# think about how to convert python datetimes to SQL datetimes.
query_ = query % {'property': property, 'subgroup': subgroup,
'group_by_clause': group_by_clause}
cursor = connection.cursor()
cursor.execute(query_, {'time_start': start_time, 'time_end': end_time})
rowcount = cursor.rowcount
cursor.close()
return rowcount
def sql_data_collector(output_table, query, group_by):
# type: (Type[BaseCount], str, Optional[Tuple[models.Model, str]]) -> DataCollector
def pull_function(property, start_time, end_time):
# type: (str, datetime, datetime) -> int
return do_pull_by_sql_query(property, start_time, end_time, query, group_by)
return DataCollector(output_table, pull_function)
def do_pull_minutes_active(property, start_time, end_time):
# type: (str, datetime, datetime) -> int
user_activity_intervals = UserActivityInterval.objects.filter(
end__gt=start_time, start__lt=end_time
).select_related(
'user_profile'
).values_list(
'user_profile_id', 'user_profile__realm_id', 'start', 'end')
seconds_active = defaultdict(float) # type: Dict[Tuple[int, int], float]
for user_id, realm_id, interval_start, interval_end in user_activity_intervals:
start = max(start_time, interval_start)
end = min(end_time, interval_end)
seconds_active[(user_id, realm_id)] += (end - start).total_seconds()
rows = [UserCount(user_id=ids[0], realm_id=ids[1], property=property,
end_time=end_time, value=int(seconds // 60))
for ids, seconds in seconds_active.items() if seconds >= 60]
UserCount.objects.bulk_create(rows)
return len(rows)
count_message_by_user_query = """
INSERT INTO analytics_usercount
(user_id, realm_id, value, property, subgroup, end_time)
SELECT
zerver_userprofile.id, zerver_userprofile.realm_id, count(*), '%(property)s', %(subgroup)s, %%(time_end)s
FROM zerver_userprofile
JOIN zerver_message
ON
zerver_userprofile.id = zerver_message.sender_id
WHERE
zerver_userprofile.date_joined < %%(time_end)s AND
zerver_message.pub_date >= %%(time_start)s AND
zerver_message.pub_date < %%(time_end)s
GROUP BY zerver_userprofile.id %(group_by_clause)s
"""
# Note: ignores the group_by / group_by_clause.
count_message_type_by_user_query = """
INSERT INTO analytics_usercount
(realm_id, user_id, value, property, subgroup, end_time)
SELECT realm_id, id, SUM(count) AS value, '%(property)s', message_type, %%(time_end)s
FROM
(
SELECT zerver_userprofile.realm_id, zerver_userprofile.id, count(*),
CASE WHEN
zerver_recipient.type = 1 THEN 'private_message'
WHEN
zerver_recipient.type = 3 THEN 'huddle_message'
WHEN
zerver_stream.invite_only = TRUE THEN 'private_stream'
ELSE 'public_stream'
END
message_type
FROM zerver_userprofile
JOIN zerver_message
ON
zerver_userprofile.id = zerver_message.sender_id AND
zerver_message.pub_date >= %%(time_start)s AND
zerver_message.pub_date < %%(time_end)s
JOIN zerver_recipient
ON
zerver_message.recipient_id = zerver_recipient.id
LEFT JOIN zerver_stream
ON
zerver_recipient.type_id = zerver_stream.id
GROUP BY zerver_userprofile.realm_id, zerver_userprofile.id, zerver_recipient.type, zerver_stream.invite_only
) AS subquery
GROUP BY realm_id, id, message_type
"""
# This query joins to the UserProfile table since all current queries that
# use this also subgroup on UserProfile.is_bot. If in the future there is a
# stat that counts messages by stream and doesn't need the UserProfile
# table, consider writing a new query for efficiency.
count_message_by_stream_query = """
INSERT INTO analytics_streamcount
(stream_id, realm_id, value, property, subgroup, end_time)
SELECT
zerver_stream.id, zerver_stream.realm_id, count(*), '%(property)s', %(subgroup)s, %%(time_end)s
FROM zerver_stream
JOIN zerver_recipient
ON
zerver_stream.id = zerver_recipient.type_id
JOIN zerver_message
ON
zerver_recipient.id = zerver_message.recipient_id
JOIN zerver_userprofile
ON
zerver_message.sender_id = zerver_userprofile.id
WHERE
zerver_stream.date_created < %%(time_end)s AND
zerver_recipient.type = 2 AND
zerver_message.pub_date >= %%(time_start)s AND
zerver_message.pub_date < %%(time_end)s
GROUP BY zerver_stream.id %(group_by_clause)s
"""
# Hardcodes the query needed by active_users:is_bot:day, since that is
# currently the only stat that uses this.
count_user_by_realm_query = """
INSERT INTO analytics_realmcount
(realm_id, value, property, subgroup, end_time)
SELECT
zerver_realm.id, count(*),'%(property)s', %(subgroup)s, %%(time_end)s
FROM zerver_realm
JOIN zerver_userprofile
ON
zerver_realm.id = zerver_userprofile.realm_id
WHERE
zerver_realm.date_created < %%(time_end)s AND
zerver_userprofile.date_joined >= %%(time_start)s AND
zerver_userprofile.date_joined < %%(time_end)s AND
zerver_userprofile.is_active = TRUE
GROUP BY zerver_realm.id %(group_by_clause)s
"""
# Currently hardcodes the query needed for active_users_audit:is_bot:day.
# Assumes that a user cannot have two RealmAuditLog entries with the same event_time and
# event_type in ['user_created', 'user_deactivated', etc].
# In particular, it's important to ensure that migrations don't cause that to happen.
check_realmauditlog_by_user_query = """
INSERT INTO analytics_usercount
(user_id, realm_id, value, property, subgroup, end_time)
SELECT
ral1.modified_user_id, ral1.realm_id, 1, '%(property)s', %(subgroup)s, %%(time_end)s
FROM zerver_realmauditlog ral1
JOIN (
SELECT modified_user_id, max(event_time) AS max_event_time
FROM zerver_realmauditlog
WHERE
event_type in ('user_created', 'user_deactivated', 'user_activated', 'user_reactivated') AND
event_time < %%(time_end)s
GROUP BY modified_user_id
) ral2
ON
ral1.event_time = max_event_time AND
ral1.modified_user_id = ral2.modified_user_id
JOIN zerver_userprofile
ON
ral1.modified_user_id = zerver_userprofile.id
WHERE
ral1.event_type in ('user_created', 'user_activated', 'user_reactivated')
"""
check_useractivityinterval_by_user_query = """
INSERT INTO analytics_usercount
(user_id, realm_id, value, property, subgroup, end_time)
SELECT
zerver_userprofile.id, zerver_userprofile.realm_id, 1, '%(property)s', %(subgroup)s, %%(time_end)s
FROM zerver_userprofile
JOIN zerver_useractivityinterval
ON
zerver_userprofile.id = zerver_useractivityinterval.user_profile_id
WHERE
zerver_useractivityinterval.end >= %%(time_start)s AND
zerver_useractivityinterval.start < %%(time_end)s
GROUP BY zerver_userprofile.id %(group_by_clause)s
"""
count_realm_active_humans_query = """
INSERT INTO analytics_realmcount
(realm_id, value, property, subgroup, end_time)
SELECT
usercount1.realm_id, count(*), '%(property)s', NULL, %%(time_end)s
FROM (
SELECT realm_id, user_id
FROM analytics_usercount
WHERE
property = 'active_users_audit:is_bot:day' AND
subgroup = 'false' AND
end_time = %%(time_end)s
) usercount1
JOIN (
SELECT realm_id, user_id
FROM analytics_usercount
WHERE
property = '15day_actives::day' AND
end_time = %%(time_end)s
) usercount2
ON
usercount1.user_id = usercount2.user_id
GROUP BY usercount1.realm_id
"""
# Currently unused and untested
count_stream_by_realm_query = """
INSERT INTO analytics_realmcount
(realm_id, value, property, subgroup, end_time)
SELECT
zerver_realm.id, count(*), '%(property)s', %(subgroup)s, %%(time_end)s
FROM zerver_realm
JOIN zerver_stream
ON
zerver_realm.id = zerver_stream.realm_id AND
WHERE
zerver_realm.date_created < %%(time_end)s AND
zerver_stream.date_created >= %%(time_start)s AND
zerver_stream.date_created < %%(time_end)s
GROUP BY zerver_realm.id %(group_by_clause)s
"""
## CountStat declarations ##
count_stats_ = [
# Messages Sent stats
# Stats that count the number of messages sent in various ways.
# These are also the set of stats that read from the Message table.
CountStat('messages_sent:is_bot:hour',
sql_data_collector(UserCount, count_message_by_user_query, (UserProfile, 'is_bot')),
CountStat.HOUR),
CountStat('messages_sent:message_type:day',
sql_data_collector(UserCount, count_message_type_by_user_query, None), CountStat.DAY),
CountStat('messages_sent:client:day',
sql_data_collector(UserCount, count_message_by_user_query, (Message, 'sending_client_id')),
CountStat.DAY),
CountStat('messages_in_stream:is_bot:day',
sql_data_collector(StreamCount, count_message_by_stream_query, (UserProfile, 'is_bot')),
CountStat.DAY),
# Number of Users stats
# Stats that count the number of active users in the UserProfile.is_active sense.
# 'active_users_audit:is_bot:day' is the canonical record of which users were
# active on which days (in the UserProfile.is_active sense).
# Important that this stay a daily stat, so that 'realm_active_humans::day' works as expected.
CountStat('active_users_audit:is_bot:day',
sql_data_collector(UserCount, check_realmauditlog_by_user_query, (UserProfile, 'is_bot')),
CountStat.DAY),
# Sanity check on 'active_users_audit:is_bot:day', and a archetype for future LoggingCountStats.
# In RealmCount, 'active_users_audit:is_bot:day' should be the partial
# sum sequence of 'active_users_log:is_bot:day', for any realm that
# started after the latter stat was introduced.
LoggingCountStat('active_users_log:is_bot:day', RealmCount, CountStat.DAY),
# Another sanity check on 'active_users_audit:is_bot:day'. Is only an
# approximation, e.g. if a user is deactivated between the end of the
# day and when this stat is run, they won't be counted. However, is the
# simplest of the three to inspect by hand.
CountStat('active_users:is_bot:day',
sql_data_collector(RealmCount, count_user_by_realm_query, (UserProfile, 'is_bot')),
CountStat.DAY, interval=TIMEDELTA_MAX),
# User Activity stats
# Stats that measure user activity in the UserActivityInterval sense.
CountStat('15day_actives::day',
sql_data_collector(UserCount, check_useractivityinterval_by_user_query, None),
CountStat.DAY, interval=timedelta(days=15)-UserActivityInterval.MIN_INTERVAL_LENGTH),
CountStat('minutes_active::day', DataCollector(UserCount, do_pull_minutes_active), CountStat.DAY),
# Dependent stats
# Must come after their dependencies.
# Canonical account of the number of active humans in a realm on each day.
DependentCountStat('realm_active_humans::day',
sql_data_collector(RealmCount, count_realm_active_humans_query, None),
CountStat.DAY,
dependencies=['active_users_audit:is_bot:day', '15day_actives::day'])
]
COUNT_STATS = OrderedDict([(stat.property, stat) for stat in count_stats_])