Add tool to query our usage stats as of a given date.

This contains the various fixes that needed to be made in order to get
accurate statistics.

Most notably, the active_users_between function in the previous
version of zerver/lib/statistics.py was broken for end dates in the
past, because it used the UserActivity table to get its data -- so in
fact it really was querying "users last active between".

This commit isn't super clean, but I figure we're probably better off
having our latest code for historical usage data in git so it doesn't
bitrot and anyone can improve on it.

(imported from commit 24ff2f24a22e5bdc004ea8043d8da12deb97ff2f)
This commit is contained in:
Tim Abbott
2013-10-18 11:40:16 -04:00
parent 6ccf19bed6
commit 3b7bf691e7
2 changed files with 79 additions and 19 deletions

View File

@@ -0,0 +1,24 @@
from __future__ import absolute_import
import datetime
import pytz
from optparse import make_option
from django.core.management.base import BaseCommand
from zerver.lib.statistics import activity_averages_during_day
class Command(BaseCommand):
help = "Generate statistics on user activity for a given day."
option_list = BaseCommand.option_list + \
(make_option('--date', default=None, action='store',
help="Day to query in format 2013-12-05. Default is yesterday"),)
def handle(self, *args, **options):
if options["date"] is None:
date = datetime.datetime.now() - datetime.timedelta(days=1)
else:
date = datetime.datetime.strptime(options["date"], "%Y-%m-%d")
print "Activity data for", date
print activity_averages_during_day(date)
print "Please note that the total registered user count is a total for today"

View File

@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from zerver.models import UserProfile, UserActivity, UserActivityInterval
from zerver.models import UserProfile, UserActivity, UserActivityInterval, Message
from django.utils.timezone import utc
@@ -19,22 +19,34 @@ def median(data):
after = size//2
return (data[before] + data[after]) / 2.0
def active_users_to_measure():
# Base Message queryset shared by the users_who_sent_* helpers: every message
# except those whose sending client name contains "mirror" or "API".
users_who_sent_query = (
    Message.objects.select_related("sender")
    .exclude(sending_client__name__contains="mirror")
    .exclude(sending_client__name__contains="API")
)
def active_users():
    """Return the UserProfile queryset we count towards various statistics.

    Only accounts with is_bot=False and is_active=True are included, and
    users whose realm domain appears in ``exclude_realms`` are dropped.
    """
    exclude_realms = ["zulip.com", "customer29.invalid", "customer3.invalid",
                      "ios_appreview.zulip.com", "wdaher.com", "customer30.invalid"]
    return UserProfile.objects.filter(is_bot=False, is_active=True) \
                              .exclude(realm__domain__in=exclude_realms) \
                              .select_related()
# Return a set of users who have done some activity in the given timespan--that is,
# we have a UserActivity row for them. This counts pointer moves, flag updates, etc.
def users_active_between(begin, end):
activities = UserActivity.objects.filter(last_visit__gt=begin, last_visit__lt=end)
active = set([a.user_profile for a in activities])
def users_who_sent_between(begin, end):
    """Return the set of sender ids with a message in users_who_sent_query
    whose pub_date is strictly after ``begin`` and strictly before ``end``.
    """
    in_window = users_who_sent_query.filter(pub_date__gt=begin, pub_date__lt=end)
    sender_ids = set()
    for row in in_window.values("sender__id"):
        sender_ids.add(row["sender__id"])
    return sender_ids
interesting_users = set(active_users_to_measure())
return active.intersection(interesting_users)
def users_who_sent_ever():
    """Return the set of sender ids appearing anywhere in users_who_sent_query."""
    sender_ids = set()
    for row in users_who_sent_query.values("sender__id"):
        sender_ids.add(row["sender__id"])
    return sender_ids
def active_users_to_measure():
    """Return the list of active_users() whose id is in users_who_sent_ever()."""
    ever_sent = users_who_sent_ever()
    measured = []
    for profile in active_users():
        if profile.id in ever_sent:
            measured.append(profile)
    return measured
def active_users_who_sent_between(begin, end):
    """Return the list of active_users() whose id is in
    users_who_sent_between(begin, end).
    """
    window_senders = users_who_sent_between(begin, end)
    matched = []
    for profile in active_users():
        if profile.id not in window_senders:
            continue
        matched.append(profile)
    return matched
# Return the amount of Zulip usage for this user between the two
# given dates
@@ -51,7 +63,7 @@ def seconds_usage_between(user_profile, begin, end):
def seconds_active_during_day(day):
begin_day = day.replace(hour=0, minute=0, second=0, microsecond=0, tzinfo=utc)
end_day = day.replace(hour=23, minute=59, second=59, microsecond=0, tzinfo=utc)
active_users = users_active_between(begin_day, end_day)
active_users = active_users_to_measure()
# Exclude Friday CUSTOMER4 activity numbers
if day.weekday() == 4:
@@ -59,25 +71,47 @@ def seconds_active_during_day(day):
return [seconds_usage_between(user, begin_day, end_day).total_seconds() for user in active_users]
def calculate_stats(data):
def users_active_nosend_during_day(day):
    """Return the measured users who were active on ``day`` but sent nothing.

    A user counts as active when at least one UserActivityInterval row for
    them overlaps the window from 00:00:00 to 23:59:59 of ``day`` (tzinfo
    set to utc). Users whose id is in users_who_sent_between() for that
    window are excluded. The result is a list of the objects returned by
    active_users_to_measure().
    """
    begin_day = day.replace(hour=0, minute=0, second=0, microsecond=0, tzinfo=utc)
    end_day = day.replace(hour=23, minute=59, second=59, microsecond=0, tzinfo=utc)
    active_users = active_users_to_measure()
    today_senders = users_who_sent_between(begin_day, end_day)

    def had_activity(user_profile):
        # An interval overlaps the day when it ends on/after the day's start
        # and starts on/before the day's end. exists() asks the database
        # only whether such a row is present instead of loading all of them.
        return UserActivityInterval.objects.filter(user_profile=user_profile,
                                                   end__gte=begin_day,
                                                   start__lte=end_day).exists()

    # Check the sender set first so no interval query is issued for users
    # who would be dropped anyway.
    return [u for u in active_users
            if u.id not in today_senders and had_activity(u)]
def calculate_stats(data, all_users):
    """Summarise a list of per-user seconds-active values.

    data: numbers of seconds, one entry per measured user.
    all_users: sized collection of the users being measured; only its
        length is reported.

    Returns {"# data points": 0} when ``data`` is empty. Otherwise returns a
    dict with the keys 'active users', 'total users', 'mean', 'median' and
    '# data points', where 'mean' and 'median' are str(timedelta) values.
    """
    if len(data) == 0:
        return {"# data points": 0}

    # Only entries above one second count as an active user.
    active_data = [x for x in data if x > 1]
    active_user_count = len(active_data)

    if active_user_count == 0:
        # Nobody crossed the one-second threshold: report zero durations
        # rather than dividing by zero or taking the median of nothing.
        mean_data = 0
        median_data = 0
    else:
        # The numerator deliberately keeps every entry of data (including
        # the sub-threshold ones), divided by the active-user count.
        mean_data = sum(data) / active_user_count
        median_data = median(active_data)

    return {'active users': active_user_count,
            'total users': len(all_users),
            'mean': str(timedelta(seconds=mean_data)),
            'median': str(timedelta(seconds=median_data)),
            '# data points': len(data)}
def activity_averages_during_day(day):
    """Return the calculate_stats() info dict for activity on ``day``.

    The statistics are computed over seconds_active_during_day(day), and
    the current active_users_to_measure() result is passed as ``all_users``
    for the total user count.
    """
    users_to_measure = active_users_to_measure()
    seconds_active = seconds_active_during_day(day)
    return calculate_stats(seconds_active, all_users=users_to_measure)
# Returns an info dict {mean: , median} with engagement numbers for all users according
# to active_users_to_measure. This will ignore weekends, and ignore users.customer4.invalid
# on Fridays
def activity_averages_between(begin, end, by_day=True):
seconds_active = {}
users_to_measure = active_users_to_measure()
for i in range((end - begin).days):
day = begin + timedelta(days=i)
@@ -88,6 +122,8 @@ def activity_averages_between(begin, end, by_day=True):
seconds_active[day] = seconds_active_during_day(day)
if by_day:
return dict((day, calculate_stats(values)) for day, values in seconds_active.iteritems())
return dict((day, calculate_stats(values, all_users=users_to_measure))
for day, values in seconds_active.iteritems())
else:
return calculate_stats(list(chain.from_iterable(seconds_active.values())))
return calculate_stats(list(chain.from_iterable(seconds_active.values())),
all_users=users_to_measure)