analytics: Store realm disk space used as a CountStat.

Fixes #29632.

The issue description explains this well:

We currently recalculate `currently_used_upload_space_bytes` every file
upload, by dint of calling `flush_used_upload_space_cache`  on
save/delete, and then immediately calling
`user_profile.realm.currently_used_upload_space_bytes()` in
`notify_attachment_update`.  Since this walks the Attachments table,
recalculating this can take seconds in large realms.

Switch this to using a CountStat, so we don't need to walk significant
chunks of the Attachment table when we upload an attachment.  This will
also give us a historical daily graph of usage.
This commit is contained in:
Mateusz Mandera
2024-04-19 03:06:53 +02:00
committed by Tim Abbott
parent 4c4a443002
commit 9406bfbc0a
4 changed files with 124 additions and 5 deletions

View File

@@ -485,6 +485,40 @@ def sql_data_collector(
return DataCollector(output_table, pull_function)
def count_upload_space_used_by_realm_query(realm: Optional[Realm]) -> QueryFn:
if realm is None:
realm_clause: Composable = SQL("")
else:
realm_clause = SQL("zerver_attachment.realm_id = {} AND").format(Literal(realm.id))
# Note: This query currently has to go through the entire table,
# summing all the sizes of attachments for every realm. This can be improved
# by having a query which looks at the latest CountStat for each realm,
# and sums it with only the new attachments.
# There'd be additional complexity added by the fact that attachments can
# also be deleted. Partially this can be accounted for by subtracting
# ArchivedAttachment sizes, but there's still the issue of attachments
# which can be directly deleted via the API.
return lambda kwargs: SQL(
"""
INSERT INTO analytics_realmcount (realm_id, property, end_time, value)
SELECT
zerver_attachment.realm_id,
%(property)s,
%(time_end)s,
COALESCE(SUM(zerver_attachment.size), 0)
FROM
zerver_attachment
WHERE
{realm_clause}
zerver_attachment.create_time < %(time_end)s
GROUP BY
zerver_attachment.realm_id
"""
).format(**kwargs, realm_clause=realm_clause)
def do_pull_minutes_active(
property: str, start_time: datetime, end_time: datetime, realm: Optional[Realm] = None
) -> int:
@@ -863,6 +897,11 @@ def get_count_stats(realm: Optional[Realm] = None) -> Dict[str, CountStat]:
CountStat.DAY,
interval=TIMEDELTA_MAX,
),
CountStat(
"upload_quota_used_bytes::day",
sql_data_collector(RealmCount, count_upload_space_used_by_realm_query(realm), None),
CountStat.DAY,
),
# Messages read stats. messages_read::hour is the total
# number of messages read, whereas
# messages_read_interactions::hour tries to count the total