analytics: Replace CountStat.is_gauge with interval.

Groundwork for allowing stats like "Monthly Active Users".

CountStat.interval is no longer as clean a value as before, so it has been
removed from views.get_chart_data. It wasn't being used by the frontend anyway.

Removing interval from logger calls in counts.py is not a big loss since we
now include the frequency (which is typically also the interval) in
CountStat.property.
Author: Rishi Gupta
Date: 2017-03-15 21:08:36 -07:00
Committed by: Tim Abbott
Commit: 9b661ca91f (parent d6c5c672d3)
7 changed files with 32 additions and 46 deletions
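For orientation, here is a minimal standalone sketch (an editor's illustration, not code from this commit) of the new interval semantics; resolve_interval and fill_window are hypothetical helper names that mirror the logic in CountStat.__init__ and do_fill_count_stat_at_hour in the diff below:

```python
from datetime import datetime, timedelta, timezone

# Stands in for the old is_gauge behavior: a window that predates every event.
TIMEDELTA_MAX = timedelta(days=365 * 1000)

def resolve_interval(frequency, interval=None):
    # Mirrors CountStat.__init__: interval defaults to the frequency's length.
    if interval is not None:
        return interval
    return timedelta(hours=1) if frequency == 'hour' else timedelta(days=1)

def fill_window(end_time, frequency, interval=None):
    # Mirrors do_fill_count_stat_at_hour: count events in [start_time, end_time).
    return end_time - resolve_interval(frequency, interval), end_time

end = datetime(2017, 3, 16, tzinfo=timezone.utc)
print(fill_window(end, 'day'))                               # ordinary daily stat
print(fill_window(end, 'day', interval=TIMEDELTA_MAX))       # old is_gauge=True reading
print(fill_window(end, 'day', interval=timedelta(days=30)))  # e.g. "Monthly Active Users"
```

Under this sketch, a hypothetical monthly-active-users stat is just a daily-frequency CountStat constructed with interval=timedelta(days=30), rather than a new kind of gauge.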


@@ -29,18 +29,16 @@ logger = logging.getLogger("zulip.management")
logger.setLevel(logging.INFO)
logger.addHandler(file_handler)
# First post office in Boston
MIN_TIME = datetime(1639, 1, 1, 0, 0, 0, tzinfo=timezone.utc)
# You can't subtract timedelta.max from a datetime, so use this instead
TIMEDELTA_MAX = timedelta(days=365*1000)
class CountStat(object):
HOUR = 'hour'
DAY = 'day'
FREQUENCIES = frozenset([HOUR, DAY])
# Allowed intervals are HOUR, DAY, and GAUGE
GAUGE = 'gauge'
def __init__(self, property, zerver_count_query, filter_args, group_by, frequency, is_gauge):
# type: (str, ZerverCountQuery, Dict[str, bool], Optional[Tuple[models.Model, str]], str, bool) -> None
def __init__(self, property, zerver_count_query, filter_args, group_by, frequency, interval=None):
# type: (str, ZerverCountQuery, Dict[str, bool], Optional[Tuple[models.Model, str]], str, Optional[timedelta]) -> None
self.property = property
self.zerver_count_query = zerver_count_query
# might have to do something different for bitfields
@@ -49,7 +47,12 @@ class CountStat(object):
if frequency not in self.FREQUENCIES:
raise AssertionError("Unknown frequency: %s" % (frequency,))
self.frequency = frequency
self.interval = self.GAUGE if is_gauge else frequency
if interval is not None:
self.interval = interval
elif frequency == CountStat.HOUR:
self.interval = timedelta(hours=1)
else: # frequency == CountStat.DAY
self.interval = timedelta(days=1)
self.is_logging = False
self.custom_pull_function = None # type: Optional[Callable[[CountStat, datetime, datetime], None]]
@@ -60,15 +63,15 @@ class CountStat(object):
class LoggingCountStat(CountStat):
def __init__(self, property, analytics_table, frequency):
# type: (str, Type[BaseCount], str) -> None
CountStat.__init__(self, property, ZerverCountQuery(None, analytics_table, None), {}, None,
frequency, False)
CountStat.__init__(self, property, ZerverCountQuery(None, analytics_table, None), {},
None, frequency)
self.is_logging = True
class CustomPullCountStat(CountStat):
def __init__(self, property, analytics_table, frequency, custom_pull_function):
# type: (str, Type[BaseCount], str, Callable[[CountStat, datetime, datetime], None]) -> None
CountStat.__init__(self, property, ZerverCountQuery(None, analytics_table, None), {}, None,
frequency, False)
CountStat.__init__(self, property, ZerverCountQuery(None, analytics_table, None), {},
None, frequency)
self.custom_pull_function = custom_pull_function
class ZerverCountQuery(object):
@@ -106,14 +109,14 @@ def process_count_stat(stat, fill_to_time):
currently_filled = currently_filled + timedelta(hours = 1)
while currently_filled <= fill_to_time:
logger.info("START %s %s %s" % (stat.property, stat.interval, currently_filled))
logger.info("START %s %s" % (stat.property, currently_filled))
start = time.time()
do_update_fill_state(fill_state, currently_filled, FillState.STARTED)
do_fill_count_stat_at_hour(stat, currently_filled)
do_update_fill_state(fill_state, currently_filled, FillState.DONE)
end = time.time()
currently_filled = currently_filled + timedelta(hours = 1)
logger.info("DONE %s %s (%dms)" % (stat.property, stat.interval, (end-start)*1000))
logger.info("DONE %s (%dms)" % (stat.property, (end-start)*1000))
# We assume end_time is on an hour boundary, and is timezone aware.
# It is the caller's responsibility to enforce this!
@@ -122,13 +125,7 @@ def do_fill_count_stat_at_hour(stat, end_time):
if stat.frequency == CountStat.DAY and (end_time != floor_to_day(end_time)):
return
if stat.interval == CountStat.HOUR:
start_time = end_time - timedelta(hours = 1)
elif stat.interval == CountStat.DAY:
start_time = end_time - timedelta(days = 1)
else: # stat.interval == CountStat.GAUGE
start_time = MIN_TIME
start_time = end_time - stat.interval
if stat.custom_pull_function is not None:
stat.custom_pull_function(stat, start_time, end_time)
elif not stat.is_logging:
@@ -404,15 +401,15 @@ def do_pull_minutes_active(stat, start_time, end_time):
count_stats_ = [
CountStat('active_users:is_bot:day', zerver_count_user_by_realm, {'is_active': True},
(UserProfile, 'is_bot'), CountStat.DAY, True),
(UserProfile, 'is_bot'), CountStat.DAY, interval=TIMEDELTA_MAX),
CountStat('messages_sent:is_bot:hour', zerver_count_message_by_user, {},
(UserProfile, 'is_bot'), CountStat.HOUR, False),
(UserProfile, 'is_bot'), CountStat.HOUR),
CountStat('messages_sent:message_type:day', zerver_count_message_type_by_user, {},
None, CountStat.DAY, False),
None, CountStat.DAY),
CountStat('messages_sent:client:day', zerver_count_message_by_user, {},
(Message, 'sending_client_id'), CountStat.DAY, False),
(Message, 'sending_client_id'), CountStat.DAY),
CountStat('messages_in_stream:is_bot:day', zerver_count_message_by_stream, {},
(UserProfile, 'is_bot'), CountStat.DAY, False),
(UserProfile, 'is_bot'), CountStat.DAY),
LoggingCountStat('active_users_log:is_bot:day', RealmCount, CountStat.DAY),
CustomPullCountStat('minutes_active::day', UserCount, CountStat.DAY, do_pull_minutes_active)
]


@@ -14,7 +14,7 @@ from six.moves import range, zip
def generate_time_series_data(days=100, business_hours_base=10, non_business_hours_base=10,
growth=1, autocorrelation=0, spikiness=1, holiday_rate=0,
frequency=CountStat.DAY, is_gauge=False, random_seed=26):
frequency=CountStat.DAY, partial_sum=False, random_seed=26):
# type: (int, float, float, float, float, float, float, str, bool, int) -> List[int]
"""
Generate semi-realistic looking time series data for testing analytics graphs.
@@ -32,7 +32,7 @@ def generate_time_series_data(days=100, business_hours_base=10, non_business_hou
the variance.
holiday_rate -- Fraction of days randomly set to 0, largely for testing how we handle 0s.
frequency -- Should be CountStat.HOUR or CountStat.DAY.
is_gauge -- If True, return partial sum of the series.
partial_sum -- If True, return partial sum of the series.
random_seed -- Seed for random number generator.
"""
if frequency == CountStat.HOUR:
@@ -64,7 +64,7 @@ def generate_time_series_data(days=100, business_hours_base=10, non_business_hou
values = [0 if holiday else int(v + sqrt(v)*noise_scalar*spikiness)
for v, noise_scalar, holiday in zip(values_no_noise, noise_scalars, holidays)]
if is_gauge:
if partial_sum:
for i in range(1, length):
values[i] = values[i-1] + values[i]
return [max(v, 0) for v in values]
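As a standalone illustration (not part of the diff), the renamed partial_sum flag simply turns the generated series into a running total, which is the shape the old is_gauge data had:

```python
# Hypothetical values such as generate_time_series_data might produce.
values = [3, 0, 5, 2, 4]

# The partial-sum loop from the hunk above, applied to that list.
for i in range(1, len(values)):
    values[i] = values[i - 1] + values[i]

print(values)  # [3, 3, 8, 10, 14] -- cumulative, e.g. total users over time
```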


@@ -44,7 +44,7 @@ class Command(BaseCommand):
days=self.DAYS_OF_DATA, business_hours_base=business_hours_base,
non_business_hours_base=non_business_hours_base, growth=growth,
autocorrelation=autocorrelation, spikiness=spikiness, holiday_rate=holiday_rate,
frequency=stat.frequency, is_gauge=(stat.interval == CountStat.GAUGE),
frequency=stat.frequency, partial_sum=(stat.interval > timedelta(days=1000)),
random_seed=self.random_seed)
def handle(self, *args, **options):


@@ -157,7 +157,7 @@ class TestProcessCountStat(AnalyticsTestCase):
dummy_query = """INSERT INTO analytics_realmcount (realm_id, property, end_time, value)
VALUES (1, 'test stat', '%(end_time)s', 22)""" % {'end_time': current_time}
stat = CountStat('test stat', ZerverCountQuery(Recipient, UserCount, dummy_query),
{}, None, CountStat.HOUR, False)
{}, None, CountStat.HOUR)
return stat
def assertFillStateEquals(self, end_time, state=FillState.DONE, property=None):


@@ -77,7 +77,6 @@ class TestGetChartData(ZulipTestCase):
'msg': '',
'end_times': [datetime_to_timestamp(dt) for dt in self.end_times_day],
'frequency': CountStat.DAY,
'interval': CountStat.GAUGE,
'realm': {'bot': self.data(100), 'human': self.data(101)},
'display_order': None,
'result': 'success',
@@ -95,7 +94,6 @@ class TestGetChartData(ZulipTestCase):
'msg': '',
'end_times': [datetime_to_timestamp(dt) for dt in self.end_times_hour],
'frequency': CountStat.HOUR,
'interval': CountStat.HOUR,
'realm': {'bot': self.data(100), 'human': self.data(101)},
'user': {'bot': self.data(0), 'human': self.data(200)},
'display_order': None,
@@ -115,7 +113,6 @@ class TestGetChartData(ZulipTestCase):
'msg': '',
'end_times': [datetime_to_timestamp(dt) for dt in self.end_times_day],
'frequency': CountStat.DAY,
'interval': CountStat.DAY,
'realm': {'Public streams': self.data(100), 'Private streams': self.data(0),
'Private messages': self.data(101), 'Group private messages': self.data(0)},
'user': {'Public streams': self.data(200), 'Private streams': self.data(201),
@@ -141,7 +138,6 @@ class TestGetChartData(ZulipTestCase):
'msg': '',
'end_times': [datetime_to_timestamp(dt) for dt in self.end_times_day],
'frequency': CountStat.DAY,
'interval': CountStat.DAY,
'realm': {'client 4': self.data(100), 'client 3': self.data(101),
'client 2': self.data(102)},
'user': {'client 3': self.data(200), 'client 1': self.data(201)},


@@ -101,7 +101,7 @@ def get_chart_data(request, user_profile, chart_name=REQ(),
raise JsonableError(_("No analytics data available. Please contact your server administrator."))
end_times = time_range(start, end, stat.frequency, min_length)
data = {'end_times': end_times, 'frequency': stat.frequency, 'interval': stat.interval}
data = {'end_times': end_times, 'frequency': stat.frequency}
for table in tables:
if table == RealmCount:
data['realm'] = get_time_series_by_subgroup(


@@ -102,15 +102,9 @@ realm.
subgroup. E.g. (UserProfile, is_bot).
- frequency: How often to run the CountStat. Either 'hour' or
'day'. E.g. 'day'.
- interval: Either 'hour', 'day', or 'gauge'. If 'hour' or 'day', we're
interested in events that happen in the hour or day preceding the
end_time. If gauge, we're interested in the state of the system at
end_time. Example: 'gauge'. (If 'hour', our example CountStat would
instead be measuring the number of currently active users who joined in
the last hour).
Note that one should be careful about making new gauge CountStats; see
[Performance Strategy](#performance-strategy) below.
- interval: A timedelta that restricts events to the following time interval:
[end_time - interval, end_time). Example: TIMEDELTA_MAX. We're interested
in currently active users that joined any time since the start of time.
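To make the documented window concrete, a short hedged example (an editor's illustration, not text from the analytics docs) of [end_time - interval, end_time) in the TIMEDELTA_MAX case:

```python
from datetime import datetime, timedelta, timezone

TIMEDELTA_MAX = timedelta(days=365 * 1000)  # as defined in the counts.py hunk above

end_time = datetime(2017, 3, 16, tzinfo=timezone.utc)
start_time = end_time - TIMEDELTA_MAX  # roughly the year 1017

# Every user who joined in [start_time, end_time) is counted, i.e. all of them,
# which reproduces the old "gauge" reading of the system's state at end_time.
print(start_time, end_time)
```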
## The FillState table
@@ -160,8 +154,7 @@ efficient:
- Not storing rows when the value is 0. An hourly user stat would otherwise
collect 24 * 365 * roughly .5MB per db row = 4GB of data per user per
year, most of whose values are 0. A related note is to be cautious about
adding gauge queries, since gauge measurements are typically non-zero
rather than being typically zero.
adding queries that are typically non-0 instead of being typically 0.
## Backend Testing