analytics: Improve escaping correctness with psycopg2.sql.

Signed-off-by: Anders Kaseorg <anders@zulip.com>
This commit is contained in:
Anders Kaseorg
2020-06-09 01:46:28 -07:00
committed by Tim Abbott
parent db197d0abd
commit 5839fdf963
3 changed files with 175 additions and 129 deletions

View File

@@ -8,6 +8,7 @@ from typing import Callable, Dict, List, \
from django.conf import settings
from django.db import connection
from django.db.models import F
from psycopg2.sql import Composable, Identifier, Literal, SQL
from analytics.models import BaseCount, \
FillState, InstallationCount, RealmCount, StreamCount, \
@@ -164,31 +165,35 @@ def do_aggregate_to_summary_table(stat: CountStat, end_time: datetime,
# Aggregate into RealmCount
output_table = stat.data_collector.output_table
if realm is not None:
realm_clause = "AND zerver_realm.id = %s" % (realm.id,)
realm_clause = SQL("AND zerver_realm.id = {}").format(Literal(realm.id))
else:
realm_clause = ""
realm_clause = SQL("")
if output_table in (UserCount, StreamCount):
realmcount_query = """
realmcount_query = SQL("""
INSERT INTO analytics_realmcount
(realm_id, value, property, subgroup, end_time)
SELECT
zerver_realm.id, COALESCE(sum(%(output_table)s.value), 0), '%(property)s',
%(output_table)s.subgroup, %%(end_time)s
zerver_realm.id, COALESCE(sum({output_table}.value), 0), %(property)s,
{output_table}.subgroup, %(end_time)s
FROM zerver_realm
JOIN %(output_table)s
JOIN {output_table}
ON
zerver_realm.id = %(output_table)s.realm_id
zerver_realm.id = {output_table}.realm_id
WHERE
%(output_table)s.property = '%(property)s' AND
%(output_table)s.end_time = %%(end_time)s
%(realm_clause)s
GROUP BY zerver_realm.id, %(output_table)s.subgroup
""" % {'output_table': output_table._meta.db_table,
'property': stat.property,
'realm_clause': realm_clause}
{output_table}.property = %(property)s AND
{output_table}.end_time = %(end_time)s
{realm_clause}
GROUP BY zerver_realm.id, {output_table}.subgroup
""").format(
output_table=Identifier(output_table._meta.db_table),
realm_clause=realm_clause,
)
start = time.time()
cursor.execute(realmcount_query, {'end_time': end_time})
cursor.execute(realmcount_query, {
'property': stat.property,
'end_time': end_time,
})
end = time.time()
logger.info(
"%s RealmCount aggregation (%dms/%sr)",
@@ -201,19 +206,22 @@ def do_aggregate_to_summary_table(stat: CountStat, end_time: datetime,
#
# TODO: Add support for updating installation data after
# changing an individual realm's values.
installationcount_query = """
installationcount_query = SQL("""
INSERT INTO analytics_installationcount
(value, property, subgroup, end_time)
SELECT
sum(value), '%(property)s', analytics_realmcount.subgroup, %%(end_time)s
sum(value), %(property)s, analytics_realmcount.subgroup, %(end_time)s
FROM analytics_realmcount
WHERE
property = '%(property)s' AND
end_time = %%(end_time)s
property = %(property)s AND
end_time = %(end_time)s
GROUP BY analytics_realmcount.subgroup
""" % {'property': stat.property}
""")
start = time.time()
cursor.execute(installationcount_query, {'end_time': end_time})
cursor.execute(installationcount_query, {
'property': stat.property,
'end_time': end_time,
})
end = time.time()
logger.info(
"%s InstallationCount aggregation (%dms/%sr)",
@@ -264,29 +272,45 @@ def do_drop_single_stat(property: str) -> None:
## DataCollector-level operations ##
def do_pull_by_sql_query(property: str, start_time: datetime, end_time: datetime, query: str,
group_by: Optional[Tuple[models.Model, str]]) -> int:
QueryFn = Callable[[Dict[str, Composable]], Composable]
def do_pull_by_sql_query(
property: str,
start_time: datetime,
end_time: datetime,
query: QueryFn,
group_by: Optional[Tuple[models.Model, str]],
) -> int:
if group_by is None:
subgroup = 'NULL'
group_by_clause = ''
subgroup = SQL('NULL')
group_by_clause = SQL('')
else:
subgroup = '%s.%s' % (group_by[0]._meta.db_table, group_by[1])
group_by_clause = ', ' + subgroup
subgroup = Identifier(group_by[0]._meta.db_table, group_by[1])
group_by_clause = SQL(', {}').format(subgroup)
# We do string replacement here because cursor.execute will reject a
# group_by_clause given as a param.
# We pass in the datetimes as params to cursor.execute so that we don't have to
# think about how to convert python datetimes to SQL datetimes.
query_ = query % {'property': property, 'subgroup': subgroup,
'group_by_clause': group_by_clause}
query_ = query({
'subgroup': subgroup,
'group_by_clause': group_by_clause,
})
cursor = connection.cursor()
cursor.execute(query_, {'time_start': start_time, 'time_end': end_time})
cursor.execute(query_, {
'property': property,
'time_start': start_time,
'time_end': end_time,
})
rowcount = cursor.rowcount
cursor.close()
return rowcount
def sql_data_collector(output_table: Type[BaseCount], query: str,
group_by: Optional[Tuple[models.Model, str]]) -> DataCollector:
def sql_data_collector(
output_table: Type[BaseCount],
query: QueryFn,
group_by: Optional[Tuple[models.Model, str]],
) -> DataCollector:
def pull_function(property: str, start_time: datetime, end_time: datetime,
realm: Optional[Realm] = None) -> int:
# The pull function type needs to accept a Realm argument
@@ -320,39 +344,39 @@ def do_pull_minutes_active(property: str, start_time: datetime, end_time: dateti
UserCount.objects.bulk_create(rows)
return len(rows)
def count_message_by_user_query(realm: Optional[Realm]) -> str:
def count_message_by_user_query(realm: Optional[Realm]) -> QueryFn:
if realm is None:
realm_clause = ""
realm_clause = SQL("")
else:
realm_clause = "zerver_userprofile.realm_id = %s AND" % (realm.id,)
return """
realm_clause = SQL("zerver_userprofile.realm_id = {} AND").format(Literal(realm.id))
return lambda kwargs: SQL("""
INSERT INTO analytics_usercount
(user_id, realm_id, value, property, subgroup, end_time)
SELECT
zerver_userprofile.id, zerver_userprofile.realm_id, count(*),
'%(property)s', %(subgroup)s, %%(time_end)s
%(property)s, {subgroup}, %(time_end)s
FROM zerver_userprofile
JOIN zerver_message
ON
zerver_userprofile.id = zerver_message.sender_id
WHERE
zerver_userprofile.date_joined < %%(time_end)s AND
zerver_message.date_sent >= %%(time_start)s AND
zerver_userprofile.date_joined < %(time_end)s AND
zerver_message.date_sent >= %(time_start)s AND
{realm_clause}
zerver_message.date_sent < %%(time_end)s
GROUP BY zerver_userprofile.id %(group_by_clause)s
""".format(realm_clause=realm_clause)
zerver_message.date_sent < %(time_end)s
GROUP BY zerver_userprofile.id {group_by_clause}
""").format(**kwargs, realm_clause=realm_clause)
# Note: ignores the group_by / group_by_clause.
def count_message_type_by_user_query(realm: Optional[Realm]) -> str:
def count_message_type_by_user_query(realm: Optional[Realm]) -> QueryFn:
if realm is None:
realm_clause = ""
realm_clause = SQL("")
else:
realm_clause = "zerver_userprofile.realm_id = %s AND" % (realm.id,)
return """
realm_clause = SQL("zerver_userprofile.realm_id = {} AND").format(Literal(realm.id))
return lambda kwargs: SQL("""
INSERT INTO analytics_usercount
(realm_id, user_id, value, property, subgroup, end_time)
SELECT realm_id, id, SUM(count) AS value, '%(property)s', message_type, %%(time_end)s
SELECT realm_id, id, SUM(count) AS value, %(property)s, message_type, %(time_end)s
FROM
(
SELECT zerver_userprofile.realm_id, zerver_userprofile.id, count(*),
@@ -370,9 +394,9 @@ def count_message_type_by_user_query(realm: Optional[Realm]) -> str:
JOIN zerver_message
ON
zerver_userprofile.id = zerver_message.sender_id AND
zerver_message.date_sent >= %%(time_start)s AND
zerver_message.date_sent >= %(time_start)s AND
{realm_clause}
zerver_message.date_sent < %%(time_end)s
zerver_message.date_sent < %(time_end)s
JOIN zerver_recipient
ON
zerver_message.recipient_id = zerver_recipient.id
@@ -384,22 +408,22 @@ def count_message_type_by_user_query(realm: Optional[Realm]) -> str:
zerver_recipient.type, zerver_stream.invite_only
) AS subquery
GROUP BY realm_id, id, message_type
""".format(realm_clause=realm_clause)
""").format(**kwargs, realm_clause=realm_clause)
# This query joins to the UserProfile table since all current queries that
# use this also subgroup on UserProfile.is_bot. If in the future there is a
# stat that counts messages by stream and doesn't need the UserProfile
# table, consider writing a new query for efficiency.
def count_message_by_stream_query(realm: Optional[Realm]) -> str:
def count_message_by_stream_query(realm: Optional[Realm]) -> QueryFn:
if realm is None:
realm_clause = ""
realm_clause = SQL("")
else:
realm_clause = "zerver_stream.realm_id = %s AND" % (realm.id,)
return """
realm_clause = SQL("zerver_stream.realm_id = {} AND").format(Literal(realm.id))
return lambda kwargs: SQL("""
INSERT INTO analytics_streamcount
(stream_id, realm_id, value, property, subgroup, end_time)
SELECT
zerver_stream.id, zerver_stream.realm_id, count(*), '%(property)s', %(subgroup)s, %%(time_end)s
zerver_stream.id, zerver_stream.realm_id, count(*), %(property)s, {subgroup}, %(time_end)s
FROM zerver_stream
JOIN zerver_recipient
ON
@@ -411,53 +435,53 @@ def count_message_by_stream_query(realm: Optional[Realm]) -> str:
ON
zerver_message.sender_id = zerver_userprofile.id
WHERE
zerver_stream.date_created < %%(time_end)s AND
zerver_stream.date_created < %(time_end)s AND
zerver_recipient.type = 2 AND
zerver_message.date_sent >= %%(time_start)s AND
zerver_message.date_sent >= %(time_start)s AND
{realm_clause}
zerver_message.date_sent < %%(time_end)s
GROUP BY zerver_stream.id %(group_by_clause)s
""".format(realm_clause=realm_clause)
zerver_message.date_sent < %(time_end)s
GROUP BY zerver_stream.id {group_by_clause}
""").format(**kwargs, realm_clause=realm_clause)
# Hardcodes the query needed by active_users:is_bot:day, since that is
# currently the only stat that uses this.
def count_user_by_realm_query(realm: Optional[Realm]) -> str:
def count_user_by_realm_query(realm: Optional[Realm]) -> QueryFn:
if realm is None:
realm_clause = ""
realm_clause = SQL("")
else:
realm_clause = "zerver_userprofile.realm_id = %s AND" % (realm.id,)
return """
realm_clause = SQL("zerver_userprofile.realm_id = {} AND").format(Literal(realm.id))
return lambda kwargs: SQL("""
INSERT INTO analytics_realmcount
(realm_id, value, property, subgroup, end_time)
SELECT
zerver_realm.id, count(*),'%(property)s', %(subgroup)s, %%(time_end)s
zerver_realm.id, count(*), %(property)s, {subgroup}, %(time_end)s
FROM zerver_realm
JOIN zerver_userprofile
ON
zerver_realm.id = zerver_userprofile.realm_id
WHERE
zerver_realm.date_created < %%(time_end)s AND
zerver_userprofile.date_joined >= %%(time_start)s AND
zerver_userprofile.date_joined < %%(time_end)s AND
zerver_realm.date_created < %(time_end)s AND
zerver_userprofile.date_joined >= %(time_start)s AND
zerver_userprofile.date_joined < %(time_end)s AND
{realm_clause}
zerver_userprofile.is_active = TRUE
GROUP BY zerver_realm.id %(group_by_clause)s
""".format(realm_clause=realm_clause)
GROUP BY zerver_realm.id {group_by_clause}
""").format(**kwargs, realm_clause=realm_clause)
# Currently hardcodes the query needed for active_users_audit:is_bot:day.
# Assumes that a user cannot have two RealmAuditLog entries with the same event_time and
# event_type in [RealmAuditLog.USER_CREATED, USER_DEACTIVATED, etc].
# In particular, it's important to ensure that migrations don't cause that to happen.
def check_realmauditlog_by_user_query(realm: Optional[Realm]) -> str:
def check_realmauditlog_by_user_query(realm: Optional[Realm]) -> QueryFn:
if realm is None:
realm_clause = ""
realm_clause = SQL("")
else:
realm_clause = "realm_id = %s AND" % (realm.id,)
return """
realm_clause = SQL("realm_id = {} AND").format(Literal(realm.id))
return lambda kwargs: SQL("""
INSERT INTO analytics_usercount
(user_id, realm_id, value, property, subgroup, end_time)
SELECT
ral1.modified_user_id, ral1.realm_id, 1, '%(property)s', %(subgroup)s, %%(time_end)s
ral1.modified_user_id, ral1.realm_id, 1, %(property)s, {subgroup}, %(time_end)s
FROM zerver_realmauditlog ral1
JOIN (
SELECT modified_user_id, max(event_time) AS max_event_time
@@ -465,7 +489,7 @@ def check_realmauditlog_by_user_query(realm: Optional[Realm]) -> str:
WHERE
event_type in ({user_created}, {user_activated}, {user_deactivated}, {user_reactivated}) AND
{realm_clause}
event_time < %%(time_end)s
event_time < %(time_end)s
GROUP BY modified_user_id
) ral2
ON
@@ -476,43 +500,46 @@ def check_realmauditlog_by_user_query(realm: Optional[Realm]) -> str:
ral1.modified_user_id = zerver_userprofile.id
WHERE
ral1.event_type in ({user_created}, {user_activated}, {user_reactivated})
""".format(user_created=RealmAuditLog.USER_CREATED,
user_activated=RealmAuditLog.USER_ACTIVATED,
user_deactivated=RealmAuditLog.USER_DEACTIVATED,
user_reactivated=RealmAuditLog.USER_REACTIVATED,
realm_clause=realm_clause)
""").format(
**kwargs,
user_created=Literal(RealmAuditLog.USER_CREATED),
user_activated=Literal(RealmAuditLog.USER_ACTIVATED),
user_deactivated=Literal(RealmAuditLog.USER_DEACTIVATED),
user_reactivated=Literal(RealmAuditLog.USER_REACTIVATED),
realm_clause=realm_clause,
)
def check_useractivityinterval_by_user_query(realm: Optional[Realm]) -> str:
def check_useractivityinterval_by_user_query(realm: Optional[Realm]) -> QueryFn:
if realm is None:
realm_clause = ""
realm_clause = SQL("")
else:
realm_clause = "zerver_userprofile.realm_id = %s AND" % (realm.id,)
return """
realm_clause = SQL("zerver_userprofile.realm_id = {} AND").format(Literal(realm.id))
return lambda kwargs: SQL("""
INSERT INTO analytics_usercount
(user_id, realm_id, value, property, subgroup, end_time)
SELECT
zerver_userprofile.id, zerver_userprofile.realm_id, 1, '%(property)s', %(subgroup)s, %%(time_end)s
zerver_userprofile.id, zerver_userprofile.realm_id, 1, %(property)s, {subgroup}, %(time_end)s
FROM zerver_userprofile
JOIN zerver_useractivityinterval
ON
zerver_userprofile.id = zerver_useractivityinterval.user_profile_id
WHERE
zerver_useractivityinterval.end >= %%(time_start)s AND
zerver_useractivityinterval.end >= %(time_start)s AND
{realm_clause}
zerver_useractivityinterval.start < %%(time_end)s
GROUP BY zerver_userprofile.id %(group_by_clause)s
""".format(realm_clause=realm_clause)
zerver_useractivityinterval.start < %(time_end)s
GROUP BY zerver_userprofile.id {group_by_clause}
""").format(**kwargs, realm_clause=realm_clause)
def count_realm_active_humans_query(realm: Optional[Realm]) -> str:
def count_realm_active_humans_query(realm: Optional[Realm]) -> QueryFn:
if realm is None:
realm_clause = ""
realm_clause = SQL("")
else:
realm_clause = "realm_id = %s AND" % (realm.id,)
return """
realm_clause = SQL("realm_id = {} AND").format(Literal(realm.id))
return lambda kwargs: SQL("""
INSERT INTO analytics_realmcount
(realm_id, value, property, subgroup, end_time)
SELECT
usercount1.realm_id, count(*), '%(property)s', NULL, %%(time_end)s
usercount1.realm_id, count(*), %(property)s, NULL, %(time_end)s
FROM (
SELECT realm_id, user_id
FROM analytics_usercount
@@ -520,7 +547,7 @@ def count_realm_active_humans_query(realm: Optional[Realm]) -> str:
property = 'active_users_audit:is_bot:day' AND
subgroup = 'false' AND
{realm_clause}
end_time = %%(time_end)s
end_time = %(time_end)s
) usercount1
JOIN (
SELECT realm_id, user_id
@@ -528,29 +555,29 @@ def count_realm_active_humans_query(realm: Optional[Realm]) -> str:
WHERE
property = '15day_actives::day' AND
{realm_clause}
end_time = %%(time_end)s
end_time = %(time_end)s
) usercount2
ON
usercount1.user_id = usercount2.user_id
GROUP BY usercount1.realm_id
""".format(realm_clause=realm_clause)
""").format(**kwargs, realm_clause=realm_clause)
# Currently unused and untested
count_stream_by_realm_query = """
count_stream_by_realm_query = lambda kwargs: SQL("""
INSERT INTO analytics_realmcount
(realm_id, value, property, subgroup, end_time)
SELECT
zerver_realm.id, count(*), '%(property)s', %(subgroup)s, %%(time_end)s
zerver_realm.id, count(*), %(property)s, {subgroup}, %(time_end)s
FROM zerver_realm
JOIN zerver_stream
ON
zerver_realm.id = zerver_stream.realm_id AND
WHERE
zerver_realm.date_created < %%(time_end)s AND
zerver_stream.date_created >= %%(time_start)s AND
zerver_stream.date_created < %%(time_end)s
GROUP BY zerver_realm.id %(group_by_clause)s
"""
zerver_realm.date_created < %(time_end)s AND
zerver_stream.date_created >= %(time_start)s AND
zerver_stream.date_created < %(time_end)s
GROUP BY zerver_realm.id {group_by_clause}
""").format(**kwargs)
def get_count_stats(realm: Optional[Realm]=None) -> Dict[str, CountStat]:
## CountStat declarations ##