mirror of
https://github.com/zulip/zulip.git
synced 2025-11-02 21:13:36 +00:00
analytics: Replace CountStat.is_gauge with interval.
Groundwork for allowing stats like "Monthly Active Users". CountStat.interval is no longer as clean a value as before (it is now a timedelta rather than a short string), so this commit removes it from views.get_chart_data; it wasn't being used by the frontend anyway. Removing interval from the logger calls in counts.py is not a big loss, since we now include the frequency (which is typically also the interval) in CountStat.property.
This commit is contained in:
@@ -29,18 +29,16 @@ logger = logging.getLogger("zulip.management")
|
||||
logger.setLevel(logging.INFO)
|
||||
logger.addHandler(file_handler)
|
||||
|
||||
# First post office in Boston
|
||||
MIN_TIME = datetime(1639, 1, 1, 0, 0, 0, tzinfo=timezone.utc)
|
||||
# You can't subtract timedelta.max from a datetime, so use this instead
|
||||
TIMEDELTA_MAX = timedelta(days=365*1000)
|
||||
|
||||
class CountStat(object):
|
||||
HOUR = 'hour'
|
||||
DAY = 'day'
|
||||
FREQUENCIES = frozenset([HOUR, DAY])
|
||||
# Allowed intervals are HOUR, DAY, and GAUGE
|
||||
GAUGE = 'gauge'
|
||||
|
||||
def __init__(self, property, zerver_count_query, filter_args, group_by, frequency, is_gauge):
|
||||
# type: (str, ZerverCountQuery, Dict[str, bool], Optional[Tuple[models.Model, str]], str, bool) -> None
|
||||
def __init__(self, property, zerver_count_query, filter_args, group_by, frequency, interval=None):
|
||||
# type: (str, ZerverCountQuery, Dict[str, bool], Optional[Tuple[models.Model, str]], str, Optional[timedelta]) -> None
|
||||
self.property = property
|
||||
self.zerver_count_query = zerver_count_query
|
||||
# might have to do something different for bitfields
|
||||
@@ -49,7 +47,12 @@ class CountStat(object):
|
||||
if frequency not in self.FREQUENCIES:
|
||||
raise AssertionError("Unknown frequency: %s" % (frequency,))
|
||||
self.frequency = frequency
|
||||
self.interval = self.GAUGE if is_gauge else frequency
|
||||
if interval is not None:
|
||||
self.interval = interval
|
||||
elif frequency == CountStat.HOUR:
|
||||
self.interval = timedelta(hours=1)
|
||||
else: # frequency == CountStat.DAY
|
||||
self.interval = timedelta(days=1)
|
||||
self.is_logging = False
|
||||
self.custom_pull_function = None # type: Optional[Callable[[CountStat, datetime, datetime], None]]
|
||||
|
||||
@@ -60,15 +63,15 @@ class CountStat(object):
|
||||
class LoggingCountStat(CountStat):
|
||||
def __init__(self, property, analytics_table, frequency):
|
||||
# type: (str, Type[BaseCount], str) -> None
|
||||
CountStat.__init__(self, property, ZerverCountQuery(None, analytics_table, None), {}, None,
|
||||
frequency, False)
|
||||
CountStat.__init__(self, property, ZerverCountQuery(None, analytics_table, None), {},
|
||||
None, frequency)
|
||||
self.is_logging = True
|
||||
|
||||
class CustomPullCountStat(CountStat):
|
||||
def __init__(self, property, analytics_table, frequency, custom_pull_function):
|
||||
# type: (str, Type[BaseCount], str, Callable[[CountStat, datetime, datetime], None]) -> None
|
||||
CountStat.__init__(self, property, ZerverCountQuery(None, analytics_table, None), {}, None,
|
||||
frequency, False)
|
||||
CountStat.__init__(self, property, ZerverCountQuery(None, analytics_table, None), {},
|
||||
None, frequency)
|
||||
self.custom_pull_function = custom_pull_function
|
||||
|
||||
class ZerverCountQuery(object):
|
||||
@@ -106,14 +109,14 @@ def process_count_stat(stat, fill_to_time):
|
||||
|
||||
currently_filled = currently_filled + timedelta(hours = 1)
|
||||
while currently_filled <= fill_to_time:
|
||||
logger.info("START %s %s %s" % (stat.property, stat.interval, currently_filled))
|
||||
logger.info("START %s %s" % (stat.property, currently_filled))
|
||||
start = time.time()
|
||||
do_update_fill_state(fill_state, currently_filled, FillState.STARTED)
|
||||
do_fill_count_stat_at_hour(stat, currently_filled)
|
||||
do_update_fill_state(fill_state, currently_filled, FillState.DONE)
|
||||
end = time.time()
|
||||
currently_filled = currently_filled + timedelta(hours = 1)
|
||||
logger.info("DONE %s %s (%dms)" % (stat.property, stat.interval, (end-start)*1000))
|
||||
logger.info("DONE %s (%dms)" % (stat.property, (end-start)*1000))
|
||||
|
||||
# We assume end_time is on an hour boundary, and is timezone aware.
|
||||
# It is the caller's responsibility to enforce this!
|
||||
@@ -122,13 +125,7 @@ def do_fill_count_stat_at_hour(stat, end_time):
|
||||
if stat.frequency == CountStat.DAY and (end_time != floor_to_day(end_time)):
|
||||
return
|
||||
|
||||
if stat.interval == CountStat.HOUR:
|
||||
start_time = end_time - timedelta(hours = 1)
|
||||
elif stat.interval == CountStat.DAY:
|
||||
start_time = end_time - timedelta(days = 1)
|
||||
else: # stat.interval == CountStat.GAUGE
|
||||
start_time = MIN_TIME
|
||||
|
||||
start_time = end_time - stat.interval
|
||||
if stat.custom_pull_function is not None:
|
||||
stat.custom_pull_function(stat, start_time, end_time)
|
||||
elif not stat.is_logging:
|
||||
@@ -404,15 +401,15 @@ def do_pull_minutes_active(stat, start_time, end_time):
|
||||
|
||||
count_stats_ = [
|
||||
CountStat('active_users:is_bot:day', zerver_count_user_by_realm, {'is_active': True},
|
||||
(UserProfile, 'is_bot'), CountStat.DAY, True),
|
||||
(UserProfile, 'is_bot'), CountStat.DAY, interval=TIMEDELTA_MAX),
|
||||
CountStat('messages_sent:is_bot:hour', zerver_count_message_by_user, {},
|
||||
(UserProfile, 'is_bot'), CountStat.HOUR, False),
|
||||
(UserProfile, 'is_bot'), CountStat.HOUR),
|
||||
CountStat('messages_sent:message_type:day', zerver_count_message_type_by_user, {},
|
||||
None, CountStat.DAY, False),
|
||||
None, CountStat.DAY),
|
||||
CountStat('messages_sent:client:day', zerver_count_message_by_user, {},
|
||||
(Message, 'sending_client_id'), CountStat.DAY, False),
|
||||
(Message, 'sending_client_id'), CountStat.DAY),
|
||||
CountStat('messages_in_stream:is_bot:day', zerver_count_message_by_stream, {},
|
||||
(UserProfile, 'is_bot'), CountStat.DAY, False),
|
||||
(UserProfile, 'is_bot'), CountStat.DAY),
|
||||
LoggingCountStat('active_users_log:is_bot:day', RealmCount, CountStat.DAY),
|
||||
CustomPullCountStat('minutes_active::day', UserCount, CountStat.DAY, do_pull_minutes_active)
|
||||
]
|
||||
|
||||
@@ -14,7 +14,7 @@ from six.moves import range, zip
|
||||
|
||||
def generate_time_series_data(days=100, business_hours_base=10, non_business_hours_base=10,
|
||||
growth=1, autocorrelation=0, spikiness=1, holiday_rate=0,
|
||||
frequency=CountStat.DAY, is_gauge=False, random_seed=26):
|
||||
frequency=CountStat.DAY, partial_sum=False, random_seed=26):
|
||||
# type: (int, float, float, float, float, float, float, str, bool, int) -> List[int]
|
||||
"""
|
||||
Generate semi-realistic looking time series data for testing analytics graphs.
|
||||
@@ -32,7 +32,7 @@ def generate_time_series_data(days=100, business_hours_base=10, non_business_hou
|
||||
the variance.
|
||||
holiday_rate -- Fraction of days randomly set to 0, largely for testing how we handle 0s.
|
||||
frequency -- Should be CountStat.HOUR or CountStat.DAY.
|
||||
is_gauge -- If True, return partial sum of the series.
|
||||
partial_sum -- If True, return partial sum of the series.
|
||||
random_seed -- Seed for random number generator.
|
||||
"""
|
||||
if frequency == CountStat.HOUR:
|
||||
@@ -64,7 +64,7 @@ def generate_time_series_data(days=100, business_hours_base=10, non_business_hou
|
||||
|
||||
values = [0 if holiday else int(v + sqrt(v)*noise_scalar*spikiness)
|
||||
for v, noise_scalar, holiday in zip(values_no_noise, noise_scalars, holidays)]
|
||||
if is_gauge:
|
||||
if partial_sum:
|
||||
for i in range(1, length):
|
||||
values[i] = values[i-1] + values[i]
|
||||
return [max(v, 0) for v in values]
|
||||
|
||||
@@ -44,7 +44,7 @@ class Command(BaseCommand):
|
||||
days=self.DAYS_OF_DATA, business_hours_base=business_hours_base,
|
||||
non_business_hours_base=non_business_hours_base, growth=growth,
|
||||
autocorrelation=autocorrelation, spikiness=spikiness, holiday_rate=holiday_rate,
|
||||
frequency=stat.frequency, is_gauge=(stat.interval == CountStat.GAUGE),
|
||||
frequency=stat.frequency, partial_sum=(stat.interval > timedelta(days=1000)),
|
||||
random_seed=self.random_seed)
|
||||
|
||||
def handle(self, *args, **options):
|
||||
|
||||
@@ -157,7 +157,7 @@ class TestProcessCountStat(AnalyticsTestCase):
|
||||
dummy_query = """INSERT INTO analytics_realmcount (realm_id, property, end_time, value)
|
||||
VALUES (1, 'test stat', '%(end_time)s', 22)""" % {'end_time': current_time}
|
||||
stat = CountStat('test stat', ZerverCountQuery(Recipient, UserCount, dummy_query),
|
||||
{}, None, CountStat.HOUR, False)
|
||||
{}, None, CountStat.HOUR)
|
||||
return stat
|
||||
|
||||
def assertFillStateEquals(self, end_time, state=FillState.DONE, property=None):
|
||||
|
||||
@@ -77,7 +77,6 @@ class TestGetChartData(ZulipTestCase):
|
||||
'msg': '',
|
||||
'end_times': [datetime_to_timestamp(dt) for dt in self.end_times_day],
|
||||
'frequency': CountStat.DAY,
|
||||
'interval': CountStat.GAUGE,
|
||||
'realm': {'bot': self.data(100), 'human': self.data(101)},
|
||||
'display_order': None,
|
||||
'result': 'success',
|
||||
@@ -95,7 +94,6 @@ class TestGetChartData(ZulipTestCase):
|
||||
'msg': '',
|
||||
'end_times': [datetime_to_timestamp(dt) for dt in self.end_times_hour],
|
||||
'frequency': CountStat.HOUR,
|
||||
'interval': CountStat.HOUR,
|
||||
'realm': {'bot': self.data(100), 'human': self.data(101)},
|
||||
'user': {'bot': self.data(0), 'human': self.data(200)},
|
||||
'display_order': None,
|
||||
@@ -115,7 +113,6 @@ class TestGetChartData(ZulipTestCase):
|
||||
'msg': '',
|
||||
'end_times': [datetime_to_timestamp(dt) for dt in self.end_times_day],
|
||||
'frequency': CountStat.DAY,
|
||||
'interval': CountStat.DAY,
|
||||
'realm': {'Public streams': self.data(100), 'Private streams': self.data(0),
|
||||
'Private messages': self.data(101), 'Group private messages': self.data(0)},
|
||||
'user': {'Public streams': self.data(200), 'Private streams': self.data(201),
|
||||
@@ -141,7 +138,6 @@ class TestGetChartData(ZulipTestCase):
|
||||
'msg': '',
|
||||
'end_times': [datetime_to_timestamp(dt) for dt in self.end_times_day],
|
||||
'frequency': CountStat.DAY,
|
||||
'interval': CountStat.DAY,
|
||||
'realm': {'client 4': self.data(100), 'client 3': self.data(101),
|
||||
'client 2': self.data(102)},
|
||||
'user': {'client 3': self.data(200), 'client 1': self.data(201)},
|
||||
|
||||
@@ -101,7 +101,7 @@ def get_chart_data(request, user_profile, chart_name=REQ(),
|
||||
raise JsonableError(_("No analytics data available. Please contact your server administrator."))
|
||||
|
||||
end_times = time_range(start, end, stat.frequency, min_length)
|
||||
data = {'end_times': end_times, 'frequency': stat.frequency, 'interval': stat.interval}
|
||||
data = {'end_times': end_times, 'frequency': stat.frequency}
|
||||
for table in tables:
|
||||
if table == RealmCount:
|
||||
data['realm'] = get_time_series_by_subgroup(
|
||||
|
||||
@@ -102,15 +102,9 @@ realm.
|
||||
subgroup. E.g. (UserProfile, is_bot).
|
||||
- frequency: How often to run the CountStat. Either 'hour' or
|
||||
'day'. E.g. 'day'.
|
||||
- interval: Either 'hour', 'day', or 'gauge'. If 'hour' or 'day', we're
|
||||
interested in events that happen in the hour or day preceding the
|
||||
end_time. If gauge, we're interested in the state of the system at
|
||||
end_time. Example: 'gauge'. (If 'hour', our example CountStat would
|
||||
instead be measuring the number of currently active users who joined in
|
||||
the last hour).
|
||||
|
||||
Note that one should be careful about making new gauge CountStats; see
|
||||
[Performance Strategy](#performance-strategy) below.
|
||||
- interval: A timedelta that restricts events to the following time interval:
|
||||
[end_time - interval, end_time). Example: TIMEDELTA_MAX. We're interested
|
||||
in currently active users that joined any time since the start of time.
|
||||
|
||||
## The FillState table
|
||||
|
||||
@@ -160,8 +154,7 @@ efficient:
|
||||
- Not storing rows when the value is 0. An hourly user stat would otherwise
|
||||
collect 24 * 365 * roughly .5MB per db row = 4GB of data per user per
|
||||
year, most of whose values are 0. A related note is to be cautious about
|
||||
adding gauge queries, since gauge measurements are typically non-zero
|
||||
rather than being typically zero.
|
||||
adding queries whose values are typically non-zero rather than typically zero.
|
||||
|
||||
## Backend Testing
|
||||
|
||||
|
||||
Reference in New Issue
Block a user