counts: Add function compute_max_monthly_messages for remote servers.

This calculates the largest number of messages sent within a single
month over the last 3 months. The query is targeted at the specific
use case in this function: finding the count for one specific server.
For calculating this in bulk for a large number of remote servers, an
adapted bulk query will be needed, rather than running this one in a
loop, which would likely be very inefficient.
This commit is contained in:
Mateusz Mandera
2023-10-18 13:18:12 +02:00
committed by Tim Abbott
parent 617d2d509c
commit 3cafdbdc1e
5 changed files with 209 additions and 2 deletions

View File

@@ -23,6 +23,7 @@ from zerver.models import (
get_org_type_display_name, get_org_type_display_name,
get_realm, get_realm,
) )
from zilencer.lib.remote_counts import MissingDataError
if TYPE_CHECKING: if TYPE_CHECKING:
from django.test.client import _MonkeyPatchedWSGIResponse as TestHttpResponse from django.test.client import _MonkeyPatchedWSGIResponse as TestHttpResponse
@@ -63,8 +64,20 @@ class TestRemoteServerSupportEndpoint(ZulipTestCase):
result, result,
) )
with mock.patch("analytics.views.support.compute_max_monthly_messages", return_value=1000):
result = self.client_get("/activity/remote/support", {"q": "zulip-1.example.com"}) result = self.client_get("/activity/remote/support", {"q": "zulip-1.example.com"})
self.assert_in_success_response(["<h3>zulip-1.example.com</h3>"], result) self.assert_in_success_response(["<h3>zulip-1.example.com</h3>"], result)
self.assert_in_success_response(["<b>Max monthly messages</b>: 1000"], result)
self.assert_not_in_success_response(["<h3>zulip-2.example.com</h3>"], result)
with mock.patch(
"analytics.views.support.compute_max_monthly_messages", side_effect=MissingDataError
):
result = self.client_get("/activity/remote/support", {"q": "zulip-1.example.com"})
self.assert_in_success_response(["<h3>zulip-1.example.com</h3>"], result)
self.assert_in_success_response(
["<b>Max monthly messages</b>: Recent data missing"], result
)
self.assert_not_in_success_response(["<h3>zulip-2.example.com</h3>"], result) self.assert_not_in_success_response(["<h3>zulip-2.example.com</h3>"], result)
result = self.client_get("/activity/remote/support", {"q": "example.com"}) result = self.client_get("/activity/remote/support", {"q": "example.com"})

View File

@@ -3,7 +3,7 @@ from contextlib import suppress
from dataclasses import dataclass from dataclasses import dataclass
from datetime import timedelta from datetime import timedelta
from decimal import Decimal from decimal import Decimal
from typing import Any, Dict, Iterable, List, Optional from typing import Any, Dict, Iterable, List, Optional, Union
from urllib.parse import urlencode from urllib.parse import urlencode
from django.conf import settings from django.conf import settings
@@ -47,8 +47,10 @@ from zerver.models import (
get_user_profile_by_id, get_user_profile_by_id,
) )
from zerver.views.invite import get_invitee_emails_set from zerver.views.invite import get_invitee_emails_set
from zilencer.lib.remote_counts import MissingDataError
if settings.ZILENCER_ENABLED: if settings.ZILENCER_ENABLED:
from zilencer.lib.remote_counts import compute_max_monthly_messages
from zilencer.models import RemoteZulipServer from zilencer.models import RemoteZulipServer
if settings.BILLING_ENABLED: if settings.BILLING_ENABLED:
@@ -444,10 +446,20 @@ def remote_servers_support(
remote_servers = get_remote_servers_for_support( remote_servers = get_remote_servers_for_support(
email_to_search=email_to_search, hostname_to_search=hostname_to_search email_to_search=email_to_search, hostname_to_search=hostname_to_search
) )
remote_server_to_max_monthly_messages: Dict[int, Union[int, str]] = dict()
for remote_server in remote_servers:
try:
remote_server_to_max_monthly_messages[remote_server.id] = compute_max_monthly_messages(
remote_server
)
except MissingDataError:
remote_server_to_max_monthly_messages[remote_server.id] = "Recent data missing"
return render( return render(
request, request,
"analytics/remote_server_support.html", "analytics/remote_server_support.html",
context=dict( context=dict(
remote_servers=remote_servers, remote_servers=remote_servers,
remote_server_to_max_monthly_messages=remote_server_to_max_monthly_messages,
), ),
) )

View File

@@ -0,0 +1,104 @@
import datetime
import time_machine
from django.utils.timezone import now as timezone_now
from typing_extensions import override
from zerver.lib.test_classes import ZulipTestCase
from zilencer.lib.remote_counts import MissingDataError, compute_max_monthly_messages
from zilencer.models import RemoteInstallationCount, RemoteZulipServer
class RemoteCountTest(ZulipTestCase):
    """Tests for the helpers in zilencer.lib.remote_counts."""

    @override
    def setUp(self) -> None:
        # Register the remote server that the test below computes counts for.
        self.server_uuid = "6cde5f7a-1f7e-4978-9716-49f69ebfc9fe"
        server = RemoteZulipServer(
            uuid=self.server_uuid,
            api_key="magic_secret_api_key",
            hostname="demo.example.com",
            last_updated=timezone_now(),
        )
        server.save()
        self.server = server
        super().setUp()

    def test_compute_max_monthly_messages(self) -> None:
        now = timezone_now()
        # All simulated rows are dated relative to now_offset (now + 1 hour)
        # rather than now itself, so that no row lands exactly on the edge
        # between two 30-day windows. Otherwise it would be ambiguous whether
        # an entry with end_time = now - 30 days belongs to the "last 30 days"
        # window or to the 30 days before it.
        now_offset = now + datetime.timedelta(hours=1)

        def days_before_offset(days: int) -> datetime.datetime:
            return now_offset - datetime.timedelta(days=days)

        # With no analytics data at all, the function must refuse to answer.
        with self.assertRaises(MissingDataError):
            compute_max_monthly_messages(self.server)

        # compute_max_monthly_messages uses the presence of this property as a
        # proxy for "the server submitted useful analytics data"; a server
        # without such a row raises MissingDataError, as shown above. See the
        # function's implementation for details.
        RemoteInstallationCount.objects.create(
            server=self.server,
            remote_id=1,
            property="active_users_audit:is_bot:day",
            value=5,
            end_time=days_before_offset(4),
        )

        # Having no message rows is the same as having message rows with value
        # 0 (zeros are never actually recorded), so the result is simply 0.
        self.assertEqual(compute_max_monthly_messages(self.server), 0)

        # Each entry is (start of the 30-day window, in days before
        # now_offset; value recorded for every day of that window).
        simulated_windows = [
            # Last 30 days.
            (0, 10),
            # The 30 days before that: the peak of the last 3 months, with
            # 900 messages in total.
            (30, 30),
            # A further 30 days back: the last month that is considered.
            (60, 20),
            # More than 90 days ago: must be ignored. It carries the highest
            # values of all, to prove that it really is ignored.
            (90, 100),
        ]
        for window_start, daily_value in simulated_windows:
            RemoteInstallationCount.objects.bulk_create(
                [
                    RemoteInstallationCount(
                        server=self.server,
                        remote_id=1,
                        property="messages_sent:message_type:day",
                        value=daily_value,
                        end_time=days_before_offset(window_start + day),
                    )
                    for day in range(1, 31)
                ]
            )

        with time_machine.travel(now, tick=False):
            self.assertEqual(compute_max_monthly_messages(self.server), 900)

View File

@@ -25,6 +25,7 @@
<h3>{{ remote_server.hostname }}</h3> <h3>{{ remote_server.hostname }}</h3>
<b>Contact email</b>: {{ remote_server.contact_email }}<br /> <b>Contact email</b>: {{ remote_server.contact_email }}<br />
<b>Last updated</b>: {{ remote_server.last_updated|timesince }} ago<br /> <b>Last updated</b>: {{ remote_server.last_updated|timesince }} ago<br />
<b>Max monthly messages</b>: {{ remote_server_to_max_monthly_messages[remote_server.id] }}<br />
</div> </div>
{% endfor %} {% endfor %}
</div> </div>

View File

@@ -0,0 +1,77 @@
import datetime
from django.db import connection
from django.utils.timezone import now as timezone_now
from psycopg2.sql import SQL, Literal
from zilencer.models import RemoteInstallationCount, RemoteZulipServer
class MissingDataError(Exception):
    """Raised by compute_max_monthly_messages when the remote server has no
    qualifying "active_users_audit:is_bot:day" analytics row."""
def compute_max_monthly_messages(remote_server: RemoteZulipServer) -> int:
    """Return the highest monthly message total of the last three months.

    The 'messages_sent:message_type:day' rows of ``remote_server`` from the
    90 days before the database's current timestamp are grouped into three
    consecutive 30-day windows. The values in each window are summed, and
    the largest sum is returned. If there are no such rows, 0 is returned.

    Raises:
        MissingDataError: if the server has no
            'active_users_audit:is_bot:day' row matching the filter below.
    """
    # Ideally we would check for data on the property we actually care about
    # ('messages_sent:message_type:day'). However, the analytics tables have
    # implicit zeros, so for that property missing data is indistinguishable
    # from days without messages. `active_users_audit` is never zero for an
    # initialized server, so we look for that instead.
    #
    # NOTE(review): this matches audit rows whose end_time is at least 3 days
    # old, not rows from the last 3 days -- confirm that this is the intended
    # meaning of "current data".
    cutoff = timezone_now() - datetime.timedelta(days=3)
    audit_rows = RemoteInstallationCount.objects.filter(
        server=remote_server,
        property="active_users_audit:is_bot:day",
        end_time__lte=cutoff,
    )
    if not audit_rows.exists():
        raise MissingDataError

    max_monthly_sql = SQL(
        """
WITH server_message_stats_daily AS -- Up to 4 rows per day for different subgroups
(
SELECT
r.end_time,
r.value AS message_count
FROM
zilencer_remoteinstallationcount r
WHERE
r.property = 'messages_sent:message_type:day'
AND end_time >= CURRENT_TIMESTAMP(0) - INTERVAL '90 days'
AND r.server_id = {server_id}
),
server_message_stats_monthly AS (
SELECT
CASE
WHEN current_timestamp(0) - end_time <= INTERVAL '30 days' THEN 0
WHEN current_timestamp(0) - end_time <= INTERVAL '60 days' THEN 1
WHEN current_timestamp(0) - end_time <= INTERVAL '90 days' THEN 2
END AS billing_month,
SUM(message_count) AS message_count
FROM
server_message_stats_daily
GROUP BY
1
),
server_max_monthly_messages AS (
SELECT
MAX(message_count) AS message_count
FROM
server_message_stats_monthly
WHERE
billing_month IS NOT NULL
)
SELECT
-- Return zeros, rather than nulls,
-- for reporting servers with zero messages.
COALESCE(server_max_monthly_messages.message_count, 0) AS message_count
FROM
server_max_monthly_messages;
"""
    ).format(server_id=Literal(remote_server.id))

    with connection.cursor() as cursor:
        cursor.execute(max_monthly_sql)
        # The final SELECT produces a single row with a single column.
        row = cursor.fetchone()
        max_messages = row[0]
    return int(max_messages)