# This is the main code for the `./manage.py export` data export tool.
# User docs: https://zulip.readthedocs.io/en/latest/production/export-and-import.html
#
# Most developers will interact with this primarily when they add a
# new table to the schema, in which case they likely need to (1) add
# it to the lists in `ALL_ZULIP_TABLES` and similar data structures and
# (2) if it doesn't belong in NON_EXPORTED_TABLES, add a Config object for
# it to get_realm_config.
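# As an illustrative sketch only (the table and model names here are
# hypothetical, not part of the real schema), a simple per-realm table
# would typically be registered in get_realm_config with something like:
#
#     Config(
#         table="zerver_mynewtable",
#         model=MyNewTable,
#         normal_parent=realm_config,
#         include_rows="realm_id__in",
#     )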
import glob
import hashlib
import logging
import os
import random
import secrets
import shutil
import subprocess
import tempfile
from collections.abc import Callable, Iterable, Mapping
from contextlib import suppress
from datetime import datetime
from email.headerregistry import Address
from functools import cache
from typing import TYPE_CHECKING, Any, Optional, TypeAlias, TypedDict, cast
from urllib.parse import urlsplit

import orjson
from django.apps import apps
from django.conf import settings
from django.db import connection
from django.db.models import Exists, Model, OuterRef, Q
from django.forms.models import model_to_dict
from django.utils.timezone import is_naive as timezone_is_naive
from django.utils.timezone import now as timezone_now
from psycopg2 import sql

import zerver.lib.upload
from analytics.models import RealmCount, StreamCount, UserCount
from scripts.lib.zulip_tools import overwrite_symlink
from version import ZULIP_VERSION
from zerver.lib.avatar_hash import user_avatar_base_path_from_ids
from zerver.lib.migration_status import MigrationStatusJson, parse_migration_status
from zerver.lib.pysa import mark_sanitized
from zerver.lib.stream_color import STREAM_ASSIGNMENT_COLORS
from zerver.lib.timestamp import datetime_to_timestamp
from zerver.lib.upload.s3 import get_bucket
from zerver.lib.utils import get_fk_field_name
from zerver.models import (
    AlertWord,
    Attachment,
    BotConfigData,
    BotStorageData,
    ChannelFolder,
    Client,
    CustomProfileField,
    CustomProfileFieldValue,
    DefaultStream,
    DirectMessageGroup,
    GroupGroupMembership,
    Message,
    MutedUser,
    NamedUserGroup,
    NavigationView,
    OnboardingStep,
    OnboardingUserMessage,
    Reaction,
    Realm,
    RealmAuditLog,
    RealmAuthenticationMethod,
    RealmDomain,
    RealmEmoji,
    RealmExport,
    RealmFilter,
    RealmPlayground,
    RealmUserDefault,
    Recipient,
    ScheduledMessage,
    Service,
    Stream,
    Subscription,
    UserActivity,
    UserActivityInterval,
    UserGroup,
    UserGroupMembership,
    UserMessage,
    UserPresence,
    UserProfile,
    UserStatus,
    UserTopic,
)
from zerver.models.presence import PresenceSequence
from zerver.models.realm_audit_logs import AuditLogEventType
from zerver.models.realms import get_fake_email_domain, get_realm
from zerver.models.saved_snippets import SavedSnippet
from zerver.models.users import ExternalAuthID, get_system_bot, get_user_profile_by_id

if TYPE_CHECKING:
    from mypy_boto3_s3.service_resource import Object

# Custom mypy types follow:
Record: TypeAlias = dict[str, Any]
TableName = str
TableData: TypeAlias = dict[TableName, list[Record]]
Field = str
Path = str
Context: TypeAlias = dict[str, Any]
FilterArgs: TypeAlias = dict[str, Any]
IdSource: TypeAlias = tuple[TableName, Field]
SourceFilter: TypeAlias = Callable[[Record], bool]

CustomFetch: TypeAlias = Callable[[TableData, Context], None]
CustomReturnIds: TypeAlias = Callable[[TableData], set[int]]
CustomProcessResults: TypeAlias = Callable[[list[Record], Context], list[Record]]


class MessagePartial(TypedDict):
    zerver_message: list[Record]
    zerver_userprofile_ids: list[int]
    realm_id: int


MESSAGE_BATCH_CHUNK_SIZE = 1000

ALL_ZULIP_TABLES = {
    "analytics_fillstate",
    "analytics_installationcount",
    "analytics_realmcount",
    "analytics_streamcount",
    "analytics_usercount",
    "otp_static_staticdevice",
    "otp_static_statictoken",
    "otp_totp_totpdevice",
    "social_auth_association",
    "social_auth_code",
    "social_auth_nonce",
    "social_auth_partial",
    "social_auth_usersocialauth",
    "two_factor_phonedevice",
    "zerver_alertword",
    "zerver_archivedattachment",
    "zerver_archivedattachment_messages",
    "zerver_archivedmessage",
    "zerver_archivedusermessage",
    "zerver_attachment",
    "zerver_attachment_messages",
    "zerver_attachment_scheduled_messages",
    "zerver_archivedreaction",
    "zerver_archivedsubmessage",
    "zerver_archivetransaction",
    "zerver_botconfigdata",
    "zerver_botstoragedata",
    "zerver_channelemailaddress",
    "zerver_channelfolder",
    "zerver_client",
    "zerver_customprofilefield",
    "zerver_customprofilefieldvalue",
    "zerver_defaultstream",
    "zerver_defaultstreamgroup",
    "zerver_defaultstreamgroup_streams",
    "zerver_draft",
    "zerver_emailchangestatus",
    "zerver_externalauthid",
    "zerver_groupgroupmembership",
    "zerver_huddle",
    "zerver_imageattachment",
    "zerver_message",
    "zerver_missedmessageemailaddress",
    "zerver_multiuseinvite",
    "zerver_multiuseinvite_streams",
    "zerver_multiuseinvite_groups",
    "zerver_namedusergroup",
    "zerver_navigationview",
    "zerver_onboardingstep",
    "zerver_onboardingusermessage",
    "zerver_preregistrationrealm",
    "zerver_preregistrationuser",
    "zerver_preregistrationuser_streams",
    "zerver_preregistrationuser_groups",
    "zerver_presencesequence",
    "zerver_pushdevice",
    "zerver_pushdevicetoken",
    "zerver_reaction",
    "zerver_realm",
    "zerver_realmauditlog",
    "zerver_realmauthenticationmethod",
    "zerver_realmcreationstatus",
    "zerver_realmdomain",
    "zerver_realmemoji",
    "zerver_realmexport",
    "zerver_realmfilter",
    "zerver_realmplayground",
    "zerver_realmreactivationstatus",
    "zerver_realmuserdefault",
    "zerver_recipient",
    "zerver_savedsnippet",
    "zerver_scheduledemail",
    "zerver_scheduledemail_users",
    "zerver_scheduledmessage",
    "zerver_scheduledmessagenotificationemail",
    "zerver_service",
    "zerver_stream",
    "zerver_submessage",
    "zerver_subscription",
    "zerver_useractivity",
    "zerver_useractivityinterval",
    "zerver_usergroup",
    "zerver_usergroupmembership",
    "zerver_usermessage",
    "zerver_userpresence",
    "zerver_userprofile",
    "zerver_userprofile_groups",
    "zerver_userprofile_user_permissions",
    "zerver_userstatus",
    "zerver_usertopic",
    "zerver_muteduser",
}

# This set contains those database tables that we expect to not be
# included in the export.  This tool does validation to ensure that
# every table in the database is either exported or listed here, to
# ensure we never accidentally fail to export a table.
NON_EXPORTED_TABLES = {
    # These invitation/confirmation flow tables don't make sense to
    # export, since invitation links will be broken by the server URL
    # change anyway:
    "zerver_emailchangestatus",
    "zerver_multiuseinvite",
    "zerver_multiuseinvite_streams",
    "zerver_multiuseinvite_groups",
    "zerver_preregistrationrealm",
    "zerver_preregistrationuser",
    "zerver_preregistrationuser_streams",
    "zerver_preregistrationuser_groups",
    "zerver_realmreactivationstatus",
    "zerver_realmcreationstatus",
    # Missed message addresses are low value to export since
    # missed-message email addresses include the server's hostname and
    # expire after a few days.
    "zerver_missedmessageemailaddress",
    # Scheduled message notification email data is for internal use by the server.
    "zerver_scheduledmessagenotificationemail",
    # When switching servers, clients will need to log in again and
    # re-register for push notifications anyway.
    "zerver_pushdevice",
    "zerver_pushdevicetoken",
    # We don't use these generated Django tables
    "zerver_userprofile_groups",
    "zerver_userprofile_user_permissions",
    # These are used for scheduling future activity; they could make
    # sense to export, but are relatively low value.
    "zerver_scheduledemail",
    "zerver_scheduledemail_users",
    # These tables are related to a user's 2FA authentication
    # configuration, which will need to be set up again on the new
    # server.
    "two_factor_phonedevice",
    "otp_static_staticdevice",
    "otp_static_statictoken",
    "otp_totp_totpdevice",
    # These archive tables should not be exported (they are to support
    # restoring content accidentally deleted due to software bugs in
    # the retention policy feature)
    "zerver_archivedmessage",
    "zerver_archivedusermessage",
    "zerver_archivedattachment",
    "zerver_archivedattachment_messages",
    "zerver_archivedreaction",
    "zerver_archivedsubmessage",
    "zerver_archivetransaction",
    # Social auth tables are not needed post-export, since we don't
    # use any of this state outside of a direct authentication flow.
    "social_auth_association",
    "social_auth_code",
    "social_auth_nonce",
    "social_auth_partial",
    "social_auth_usersocialauth",
    # We will likely never want to migrate this table, since it's a
    # total of all the realmcount values on the server.  Might need to
    # recompute it after a fillstate import.
    "analytics_installationcount",
    # Fillstate will require some cleverness to do the right partial export.
    "analytics_fillstate",
    # These are for unfinished features; we'll want to add them to the
    # export before they reach full production status.
    "zerver_defaultstreamgroup",
    "zerver_defaultstreamgroup_streams",
    "zerver_submessage",
    # Drafts don't need to be exported, as they are meant to be ephemeral.
    "zerver_draft",
    # The importer cannot trust ImageAttachment objects anyway and needs to check
    # and process images for thumbnailing on its own.
    "zerver_imageattachment",
    # ChannelEmailAddress entries are low value to export since
    # channel email addresses include the server's hostname.
    "zerver_channelemailaddress",
    # For any tables listed below here, it's a bug that they are not present in the export.
}

IMPLICIT_TABLES = {
    # ManyToMany relationships are exported implicitly as part of the
    # parent table's rows, and re-created when importing the parent table.
    "zerver_attachment_messages",
    "zerver_attachment_scheduled_messages",
}

ATTACHMENT_TABLES = {
    "zerver_attachment",
}

MESSAGE_TABLES = {
    # message tables get special treatment, because they're by far our
    # largest tables and need to be paginated.
    "zerver_message",
    "zerver_usermessage",
    # zerver_reaction belongs here, since it's added late because it
    # has a foreign key into the Message table.
    "zerver_reaction",
}
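# In practice, this pagination means message data is written out in
# numbered chunks of MESSAGE_BATCH_CHUNK_SIZE rows (e.g. files like
# messages-000001.json), rather than in a single realm.json file.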

# These get their own file as analytics data can be quite large and
# would otherwise make realm.json unpleasant to manually inspect
ANALYTICS_TABLES = {
    "analytics_realmcount",
    "analytics_streamcount",
    "analytics_usercount",
}

# This data structure lists all the Django DateTimeField fields in the
# data model.  These are converted to floats during the export process
# via floatify_datetime_fields, and back during the import process.
#
# TODO: This data structure could likely eventually be replaced by
# inspecting the corresponding Django models
DATE_FIELDS: dict[TableName, list[Field]] = {
    "analytics_installationcount": ["end_time"],
    "analytics_realmcount": ["end_time"],
    "analytics_streamcount": ["end_time"],
    "analytics_usercount": ["end_time"],
    "zerver_attachment": ["create_time"],
    "zerver_channelfolder": ["date_created"],
    "zerver_externalauthid": ["date_created"],
    "zerver_message": ["last_edit_time", "date_sent"],
    "zerver_muteduser": ["date_muted"],
    "zerver_realmauditlog": ["event_time"],
    "zerver_realm": ["date_created"],
    "zerver_realmexport": [
        "date_requested",
        "date_started",
        "date_succeeded",
        "date_failed",
        "date_deleted",
    ],
    "zerver_savedsnippet": ["date_created"],
    "zerver_scheduledmessage": ["scheduled_timestamp", "request_timestamp"],
    "zerver_stream": ["date_created"],
    "zerver_namedusergroup": ["date_created"],
    "zerver_useractivityinterval": ["start", "end"],
    "zerver_useractivity": ["last_visit"],
    "zerver_onboardingstep": ["timestamp"],
    "zerver_userpresence": ["last_active_time", "last_connected_time"],
    "zerver_userprofile": ["date_joined", "last_login", "last_reminder"],
    "zerver_userprofile_mirrordummy": ["date_joined", "last_login", "last_reminder"],
    "zerver_userstatus": ["timestamp"],
    "zerver_usertopic": ["last_updated"],
}
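# For example (illustrative values): an aware datetime such as
# datetime(2024, 1, 1, tzinfo=timezone.utc) is exported as the UNIX
# timestamp float 1704067200.0, and converted back to a datetime on import.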


def sanity_check_output(data: TableData) -> None:
    # First, we verify that the export tool has a declared
    # configuration for every table declared in the `models` modules.
    target_models = [
        *apps.get_app_config("analytics").get_models(include_auto_created=True),
        *apps.get_app_config("django_otp").get_models(include_auto_created=True),
        *apps.get_app_config("otp_static").get_models(include_auto_created=True),
        *apps.get_app_config("otp_totp").get_models(include_auto_created=True),
        *apps.get_app_config("phonenumber").get_models(include_auto_created=True),
        *apps.get_app_config("social_django").get_models(include_auto_created=True),
        *apps.get_app_config("two_factor").get_models(include_auto_created=True),
        *apps.get_app_config("zerver").get_models(include_auto_created=True),
    ]
    all_tables_db = {model._meta.db_table for model in target_models}

    # These assertion statements will fire when we add a new database
    # table that is not included in Zulip's data exports.  Generally,
    # you can add your new table to `ALL_ZULIP_TABLES` and
    # `NON_EXPORTED_TABLES` during early work on a new feature so that
    # CI passes.
    #
    # We'll want to make sure we handle it for exports before
    # releasing the new feature, but doing so correctly requires some
    # expertise on this export system.
    error_message = f"""
    It appears you've added a new database table, but haven't yet
    registered it in ALL_ZULIP_TABLES and the related declarations
    in {__file__} for what to include in data exports.
    """

    assert all_tables_db == ALL_ZULIP_TABLES, error_message
    assert NON_EXPORTED_TABLES.issubset(ALL_ZULIP_TABLES), error_message
    assert IMPLICIT_TABLES.issubset(ALL_ZULIP_TABLES), error_message
    assert ATTACHMENT_TABLES.issubset(ALL_ZULIP_TABLES), error_message
    assert ANALYTICS_TABLES.issubset(ALL_ZULIP_TABLES), error_message

    tables = set(ALL_ZULIP_TABLES)
    tables -= NON_EXPORTED_TABLES
    tables -= IMPLICIT_TABLES
    tables -= MESSAGE_TABLES
    tables -= ATTACHMENT_TABLES
    tables -= ANALYTICS_TABLES

    for table in tables:
        if table not in data:
            logging.warning("??? NO DATA EXPORTED FOR TABLE %s!!!", table)


def write_data_to_file(output_file: Path, data: Any) -> None:
    """
    IMPORTANT: You generally don't want to call this directly.

    Instead use one of the higher level helpers:

        write_table_data
        write_records_json_file

    The one place we call this directly is for message partials.
    """
    with open(output_file, "wb") as f:
        # Because we don't pass a default handler, OPT_PASSTHROUGH_DATETIME
        # actually causes orjson to raise a TypeError on datetime objects. This
        # is what we want, because it helps us check that we correctly
        # post-processed them to serialize to UNIX timestamps rather than ISO
        # 8601 strings for historical reasons.
        f.write(orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_PASSTHROUGH_DATETIME))
    logging.info("Finished writing %s", output_file)


def write_table_data(output_file: str, data: dict[str, Any]) -> None:
    # We sort by ids mostly so that humans can quickly do diffs
    # on two export jobs to see what changed (either due to new
    # data arriving or new code being deployed).
    for value in data.values():
        if isinstance(value, list):
            value.sort(key=lambda row: row["id"])

    assert output_file.endswith(".json")

    write_data_to_file(output_file, data)
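# Typical usage (illustrative sketch): after populating a TableData response,
# a caller writes it out with something like
#     write_table_data(os.path.join(output_dir, "realm.json"), response)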


def write_records_json_file(output_dir: str, records: list[dict[str, Any]]) -> None:
    # We want a somewhat deterministic sorting order here. All of our
    # versions of records.json include a "path" field in each element,
    # even though there's some variation among avatars/emoji/realm_icons/uploads
    # in other fields that get written.
    #
    # The sorting order of paths isn't entirely sensible to humans,
    # because they include ids and even some random numbers,
    # but if you export the same realm twice, you should get identical results.
    records.sort(key=lambda record: record["path"])

    output_file = os.path.join(output_dir, "records.json")
    with open(output_file, "wb") as f:
        # For legacy reasons we allow datetime objects here, unlike
        # write_data_to_file.
        f.write(orjson.dumps(records, option=orjson.OPT_INDENT_2))
    logging.info("Finished writing %s", output_file)


def make_raw(query: Any, exclude: list[Field] | None = None) -> list[Record]:
    """
    Takes a Django query and returns a JSONable list
    of dictionaries corresponding to the database rows.
    """
    rows = []
    for instance in query:
        data = model_to_dict(instance, exclude=exclude)
        # In Django 1.11.5, model_to_dict evaluates the QuerySet of a
        # many-to-many field to give us a list of instances. We require
        # a list of primary keys, so we get the primary keys from the
        # instances below.
        for field in instance._meta.many_to_many:
            if exclude is not None and field.name in exclude:
                continue
            value = data[field.name]
            data[field.name] = [row.id for row in value]

        rows.append(data)

    return rows
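# For example (illustrative): make_raw(Stream.objects.filter(realm_id=realm.id))
# returns a list of plain dicts like {"id": 1, "name": "general", ...}, with any
# many-to-many fields flattened to lists of primary keys as described above.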


def floatify_datetime_fields(data: TableData, table: TableName) -> None:
    for item in data[table]:
        for field in DATE_FIELDS[table]:
            dt = item[field]
            if dt is None:
                continue
            assert isinstance(dt, datetime)
            assert not timezone_is_naive(dt)
            item[field] = dt.timestamp()


class Config:
    """A Config object configures a single table for exporting (and,
    maybe, some day importing as well).  This configuration defines
    what process needs to be followed to correctly extract the set of
    objects to export.

    You should never mutate Config objects as part of the export;
    instead use the data to determine how you populate other
    data structures.

    There are parent/children relationships between Config objects.
    The parent should be instantiated first.  The child will
    append itself to the parent's list of children.
    """

    def __init__(
        self,
        table: str | None = None,
        model: Any | None = None,
        normal_parent: Optional["Config"] = None,
        virtual_parent: Optional["Config"] = None,
        filter_args: FilterArgs | None = None,
        custom_fetch: CustomFetch | None = None,
        custom_tables: list[TableName] | None = None,
        custom_return_ids: CustomReturnIds | None = None,
        custom_process_results: CustomProcessResults | None = None,
        concat_and_destroy: list[TableName] | None = None,
        id_source: IdSource | None = None,
        source_filter: SourceFilter | None = None,
        include_rows: Field | None = None,
        is_seeded: bool = False,
        exclude: list[Field] | None = None,
        limit_to_consenting_users: bool | None = None,
        collect_client_ids: bool = False,
    ) -> None:
        assert table or custom_tables
        self.table = table
        self.model = model
        self.normal_parent = normal_parent
        self.virtual_parent = virtual_parent
        self.filter_args = filter_args
        self.include_rows = include_rows
        self.is_seeded = is_seeded
        self.exclude = exclude
        self.custom_fetch = custom_fetch
        self.custom_tables = custom_tables
        self.custom_return_ids = custom_return_ids
        self.custom_process_results = custom_process_results
        self.concat_and_destroy = concat_and_destroy
        self.id_source = id_source
        self.source_filter = source_filter
        self.limit_to_consenting_users = limit_to_consenting_users
        self.collect_client_ids = collect_client_ids
        self.children: list[Config] = []

        if self.include_rows:
            assert self.include_rows.endswith("_id__in")

        if self.custom_fetch:
            # enforce a naming convention
            assert self.custom_fetch.__name__.startswith("custom_fetch_")
            if self.normal_parent is not None:
                raise AssertionError(
                    """
                    If you have a custom fetcher, then specify
                    your parent as a virtual_parent.
                    """
                )

            if self.collect_client_ids:
                raise AssertionError(
                    """
                    If you're using custom_fetch with collect_client_ids, you need to
                    extend the related logic to handle how to collect Client ids with your
                    custom fetcher.
                    """
                )

        if normal_parent is not None:
            self.parent: Config | None = normal_parent
        else:
            self.parent = None

        if virtual_parent is not None and normal_parent is not None:
            raise AssertionError(
                """
                If you specify a normal_parent, please
                do not create a virtual_parent.
                """
            )

        if normal_parent is not None:
            normal_parent.children.append(self)
        elif virtual_parent is not None:
            virtual_parent.children.append(self)
        elif is_seeded is None:
            raise AssertionError(
                """
                You must specify a parent if you are
                not using is_seeded.
                """
            )

        if (
            (parent := normal_parent or virtual_parent) is not None
            and parent.table == "zerver_userprofile"
            and limit_to_consenting_users is None
        ):
            raise AssertionError(
                """
                Config having UserProfile as a parent must pass limit_to_consenting_users
                explicitly.
                """
            )

        if self.id_source is not None:
            if self.virtual_parent is None:
                raise AssertionError(
                    """
                    You must specify a virtual_parent if you are
                    using id_source."""
                )
            if self.id_source[0] != self.virtual_parent.table:
                raise AssertionError(
                    f"""
                    Configuration error.  To populate {self.table}, you
                    want data from {self.id_source[0]}, but that differs from
                    the table name of your virtual parent ({self.virtual_parent.table}),
                    which suggests you may not have set up
                    the ordering correctly.  You may simply
                    need to assign a virtual_parent, or there
                    may be deeper issues going on."""
                )

        if self.limit_to_consenting_users:
            # Combining these makes no sense. limit_to_consenting_users is used to restrict queries
            # for Configs which use include_rows="user_profile_id__in" to only pass user ids of users
            # who have consented to private data export.
            # If a Config defines its own custom_fetch, then it is fully responsible for doing its own
            # queries - so it doesn't integrate with limit_to_consenting_users.
            assert not self.custom_fetch

            assert include_rows in ["user_profile_id__in", "user_id__in", "bot_profile_id__in"]
            assert normal_parent is not None and normal_parent.table == "zerver_userprofile"

    def return_ids(self, response: TableData) -> set[int]:
        if self.custom_return_ids is not None:
            return self.custom_return_ids(response)
        else:
            assert self.table is not None
            return {row["id"] for row in response[self.table]}
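# To illustrate the parent/child mechanism with the Blog/Article metaphor used
# in export_from_config below (hypothetical tables, not part of the real schema):
#
#     blog_config = Config(table="blog", model=Blog, is_seeded=True)
#     Config(
#         table="article",
#         model=Article,
#         normal_parent=blog_config,
#         include_rows="blog_id__in",
#     )
#
# Exporting from blog_config first records the seeded blog row, then fetches
# every article whose blog_id is among the exported blog ids.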


def export_from_config(
    response: TableData,
    config: Config,
    seed_object: Any | None = None,
    context: Context | None = None,
) -> None:
    table = config.table
    parent = config.parent
    model = config.model

    if context is None:
        context = {}

    if config.custom_tables:
        exported_tables = config.custom_tables
    else:
        assert table is not None, """
            You must specify config.custom_tables if you
            are not specifying config.table"""
        exported_tables = [table]

    for t in exported_tables:
        logging.info("Exporting via export_from_config:  %s", t)

    rows = None
    if config.is_seeded:
        rows = [seed_object]

    elif config.custom_fetch:
        config.custom_fetch(
            response,
            context,
        )
        if config.custom_tables:
            for t in config.custom_tables:
                if t not in response:
                    raise AssertionError(f"Custom fetch failed to populate {t}")

    elif config.concat_and_destroy:
        # When we concat_and_destroy, we are working with
        # temporary "tables" that are lists of records that
        # should already be ready to export.
        data: list[Record] = []
        for t in config.concat_and_destroy:
            data += response[t]
            del response[t]
            logging.info("Deleted temporary %s", t)
        assert table is not None
        response[table] = data

    elif config.normal_parent:
        # In this mode, our current model is figuratively Article,
        # and normal_parent is figuratively Blog, and
        # now we just need to get all the articles
        # contained by the blogs.
        model = config.model
        assert parent is not None
        assert parent.table is not None
        assert config.include_rows is not None
        parent_ids = parent.return_ids(response)
        filter_params: dict[str, object] = {config.include_rows: parent_ids}

        if config.filter_args is not None:
            filter_params.update(config.filter_args)
        if config.limit_to_consenting_users:
            if "realm" in context:
                realm = context["realm"]
                export_type = context["export_type"]
                assert isinstance(realm, Realm)
                if export_type == RealmExport.EXPORT_PUBLIC:
                    # In a public export, no private data is exported, so
                    # no users are considered consenting.
                    consenting_user_ids: set[int] | None = set()
                elif export_type == RealmExport.EXPORT_FULL_WITH_CONSENT:
                    consenting_user_ids = context["exportable_user_ids"]
                else:
                    assert export_type == RealmExport.EXPORT_FULL_WITHOUT_CONSENT
                    # In a full export without consent, the concept is
                    # meaningless: all private data is exported without
                    # consulting consenting_user_ids.  We set this to None so
                    # that any code in this flow which (incorrectly) tries to
                    # access it fails explicitly.
                    consenting_user_ids = None
            else:
                # Single user export. This should not really be relevant, because
                # limit_to_consenting_users is unlikely to be used in Configs in that codepath,
                # as we should be exporting only a single user's data anyway; but it's still
                # useful to have this case written correctly for robustness.
                assert "user" in context
                assert isinstance(context["user"], UserProfile)
                export_type = None
                consenting_user_ids = {context["user"].id}

            user_profile_id_in_key = config.include_rows

            # Sanity check.
            assert user_profile_id_in_key in [
                "user_profile_id__in",
                "user_id__in",
                "bot_profile_id__in",
            ]

            user_profile_id_in = filter_params[user_profile_id_in_key]
            assert isinstance(user_profile_id_in, set)

            if export_type != RealmExport.EXPORT_FULL_WITHOUT_CONSENT:
                assert consenting_user_ids is not None
                filter_params[user_profile_id_in_key] = consenting_user_ids.intersection(
                    user_profile_id_in
                )

        assert model is not None
        try:
            query = model.objects.filter(**filter_params)
        except Exception:
            print(
                f"""
                Something about your Config seems to make it difficult
                to construct a query.

                table: {table}
                parent: {parent.table}

                filter_params: {filter_params}
                """
            )
            raise

        rows = list(query)

    elif config.id_source:
        # In this mode, we are the figurative Blog, and we now
        # need to look at the current response to get all the
        # blog ids from the Article rows we fetched previously.
        model = config.model
        assert model is not None
        # This will be a tuple of the form ('zerver_article', 'blog').
        (child_table, field) = config.id_source
        child_rows = response[child_table]
        if config.source_filter:
            child_rows = [r for r in child_rows if config.source_filter(r)]
        lookup_ids = [r[field] for r in child_rows]
        filter_params = dict(id__in=lookup_ids)
        if config.filter_args:
            filter_params.update(config.filter_args)
        query = model.objects.filter(**filter_params)
        rows = list(query)

    if rows is not None:
        assert table is not None  # Hint for mypy
        response[table] = make_raw(rows, exclude=config.exclude)
        if config.collect_client_ids and "collected_client_ids_set" in context:
            model = cast(type[Model], model)
            assert issubclass(model, Model)
            client_id_field_name = get_fk_field_name(model, Client)
            assert client_id_field_name is not None
            context["collected_client_ids_set"].update(
                {row[client_id_field_name] for row in response[table]}
            )

    # Post-process rows
    custom_process_results = config.custom_process_results
    for t in exported_tables:
        if custom_process_results is not None:
            # The config might specify a function to do final processing
            # of the exported data for the tables - e.g. to strip out private data.
            response[t] = custom_process_results(response[t], context)
        if t in DATE_FIELDS:
            floatify_datetime_fields(response, t)

    # Now walk our children.  It's extremely important to respect
    # the order of children here.
    for child_config in config.children:
        export_from_config(
            response=response,
            config=child_config,
            context=context,
        )
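# Typical entry point (illustrative sketch; the context keys shown match those
# consumed above, but the exact call sites live elsewhere in this module):
#
#     response: TableData = {}
#     export_from_config(
#         response=response,
#         config=get_realm_config(),
#         seed_object=realm,
#         context=dict(
#             realm=realm,
#             export_type=export_type,
#             exportable_user_ids=exportable_user_ids,
#         ),
#     )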


def get_realm_config() -> Config:
    # This function generates the main Config object that defines how
    # to do a full-realm export of a single realm from a Zulip server.

    realm_config = Config(
        table="zerver_realm",
        is_seeded=True,
    )

    Config(
        table="zerver_realmauthenticationmethod",
        model=RealmAuthenticationMethod,
        normal_parent=realm_config,
        include_rows="realm_id__in",
    )

    Config(
        table="zerver_presencesequence",
        model=PresenceSequence,
        normal_parent=realm_config,
        include_rows="realm_id__in",
    )

    Config(
        custom_tables=["zerver_scheduledmessage"],
        virtual_parent=realm_config,
        custom_fetch=custom_fetch_scheduled_messages,
    )

    Config(
        table="zerver_defaultstream",
        model=DefaultStream,
        normal_parent=realm_config,
        include_rows="realm_id__in",
    )

    Config(
        table="zerver_customprofilefield",
        model=CustomProfileField,
        normal_parent=realm_config,
        include_rows="realm_id__in",
    )

    Config(
        table="zerver_realmauditlog",
        virtual_parent=realm_config,
        custom_fetch=custom_fetch_realm_audit_logs_for_realm,
    )

    Config(
        table="zerver_realmemoji",
        model=RealmEmoji,
        normal_parent=realm_config,
        include_rows="realm_id__in",
    )

    Config(
        table="zerver_realmdomain",
        model=RealmDomain,
        normal_parent=realm_config,
        include_rows="realm_id__in",
    )

    Config(
        table="zerver_realmexport",
        model=RealmExport,
        normal_parent=realm_config,
        include_rows="realm_id__in",
    )

    Config(
        table="zerver_realmfilter",
        model=RealmFilter,
        normal_parent=realm_config,
        include_rows="realm_id__in",
    )

    Config(
        table="zerver_realmplayground",
        model=RealmPlayground,
        normal_parent=realm_config,
        include_rows="realm_id__in",
    )

    Config(
        table="zerver_realmuserdefault",
        model=RealmUserDefault,
        normal_parent=realm_config,
        include_rows="realm_id__in",
    )

    Config(
        table="zerver_onboardingusermessage",
        model=OnboardingUserMessage,
        virtual_parent=realm_config,
        custom_fetch=custom_fetch_onboarding_usermessage,
    )

    user_profile_config = Config(
        custom_tables=[
            "zerver_userprofile",
            "zerver_userprofile_mirrordummy",
        ],
        # When child tables want to fetch the list of ids of objects exported from
        # the parent table, they should get ids from both zerver_userprofile and
        # zerver_userprofile_mirrordummy:
        custom_return_ids=lambda table_data: {
            row["id"] for row in table_data["zerver_userprofile"]
        }.union({row["id"] for row in table_data["zerver_userprofile_mirrordummy"]}),
        # set table for children who treat us as normal parent
        table="zerver_userprofile",
        virtual_parent=realm_config,
        custom_fetch=custom_fetch_user_profile,
    )

    user_groups_config = Config(
        table="zerver_usergroup",
        model=UserGroup,
        normal_parent=realm_config,
        include_rows="realm_id__in",
        exclude=["direct_members", "direct_subgroups"],
    )

    Config(
        table="zerver_namedusergroup",
        model=NamedUserGroup,
        normal_parent=realm_config,
        include_rows="realm_for_sharding_id__in",
        exclude=["realm", "direct_members", "direct_subgroups"],
    )

    Config(
        table="zerver_usergroupmembership",
        model=UserGroupMembership,
        normal_parent=user_groups_config,
        include_rows="user_group_id__in",
    )

    Config(
        table="zerver_groupgroupmembership",
        model=GroupGroupMembership,
        normal_parent=user_groups_config,
        include_rows="supergroup_id__in",
    )

    Config(
        custom_tables=[
            "zerver_userprofile_crossrealm",
        ],
        virtual_parent=user_profile_config,
        custom_fetch=custom_fetch_user_profile_cross_realm,
        # This is just a Config for exporting cross-realm bots;
        # the concept of limit_to_consenting_users is not applicable here.
        limit_to_consenting_users=False,
    )

    Config(
        table="zerver_service",
        model=Service,
        normal_parent=user_profile_config,
        include_rows="user_profile_id__in",
        limit_to_consenting_users=True,
    )

    Config(
        table="zerver_botstoragedata",
        model=BotStorageData,
        normal_parent=user_profile_config,
        include_rows="bot_profile_id__in",
        limit_to_consenting_users=True,
    )

    Config(
        table="zerver_botconfigdata",
        model=BotConfigData,
        normal_parent=user_profile_config,
        include_rows="bot_profile_id__in",
        limit_to_consenting_users=True,
    )

    # Some of these tables are intermediate "tables" that we
    # create only for the export.  Think of them as similar to views.

    user_subscription_config = Config(
        table="_user_subscription",
        model=Subscription,
        normal_parent=user_profile_config,
        filter_args={"recipient__type": Recipient.PERSONAL},
        include_rows="user_profile_id__in",
        # This is merely for fetching Subscriptions to users' own PERSONAL Recipient.
        # It is just "glue" data for internal data model consistency purposes
        # with no user-specific information.
        limit_to_consenting_users=False,
    )

    Config(
        table="_user_recipient",
        model=Recipient,
        virtual_parent=user_subscription_config,
        id_source=("_user_subscription", "recipient"),
    )

    stream_config = Config(
        table="zerver_stream",
        model=Stream,
        normal_parent=realm_config,
        include_rows="realm_id__in",
    )

    stream_recipient_config = Config(
        table="_stream_recipient",
        model=Recipient,
        normal_parent=stream_config,
        include_rows="type_id__in",
        filter_args={"type": Recipient.STREAM},
    )

    Config(
        table="_stream_subscription",
        model=Subscription,
        normal_parent=stream_recipient_config,
        include_rows="recipient_id__in",
    )

    Config(
        custom_tables=[
            "_huddle_recipient",
            "_huddle_subscription",
            "zerver_huddle",
        ],
        virtual_parent=user_profile_config,
        custom_fetch=custom_fetch_direct_message_groups,
        # It is the custom_fetch function that must handle consent logic if applicable.
        # limit_to_consenting_users can't be used here.
        limit_to_consenting_users=False,
    )

    # Now build permanent tables from our temp tables.
    Config(
        table="zerver_recipient",
        virtual_parent=realm_config,
        concat_and_destroy=[
            "_user_recipient",
            "_stream_recipient",
            "_huddle_recipient",
        ],
    )

    Config(
        table="zerver_subscription",
        virtual_parent=realm_config,
        concat_and_destroy=[
            "_user_subscription",
            "_stream_subscription",
            "_huddle_subscription",
        ],
        custom_process_results=custom_process_subscription_in_realm_config,
    )

    Config(
        table="zerver_channelfolder",
        model=ChannelFolder,
        normal_parent=realm_config,
        include_rows="realm_id__in",
    )

    add_user_profile_child_configs(user_profile_config)

    return realm_config


def custom_process_subscription_in_realm_config(
    subscriptions: list[Record], context: Context
) -> list[Record]:
    export_type = context["export_type"]
    if export_type == RealmExport.EXPORT_FULL_WITHOUT_CONSENT:
        return subscriptions

    exportable_user_ids_from_context = context["exportable_user_ids"]
    if export_type == RealmExport.EXPORT_FULL_WITH_CONSENT:
        assert exportable_user_ids_from_context is not None
        consented_user_ids = exportable_user_ids_from_context
    else:
        assert export_type == RealmExport.EXPORT_PUBLIC
        assert exportable_user_ids_from_context is None
        consented_user_ids = set()

    def scrub_subscription_if_needed(subscription: Record) -> Record:
        if subscription["user_profile"] in consented_user_ids:
            return subscription
        # We create a replacement Subscription, setting only the essential fields,
        # while allowing all the other ones to fall back to the defaults
        # defined in the model.
        scrubbed_subscription = Subscription(
            id=subscription["id"],
            user_profile_id=subscription["user_profile"],
            recipient_id=subscription["recipient"],
            active=subscription["active"],
            is_user_active=subscription["is_user_active"],
            # Letting the color be the default color for every stream would create a visually
            # jarring experience. Instead, we can pick colors randomly for a normal-feeling
            # experience, without leaking any information about the user's preferences.
            color=random.choice(STREAM_ASSIGNMENT_COLORS),
        )
        subscription_dict = model_to_dict(scrubbed_subscription)
        return subscription_dict

    processed_rows = map(scrub_subscription_if_needed, subscriptions)
    return list(processed_rows)


def add_user_profile_child_configs(user_profile_config: Config) -> None:
    """
    We add tables here that are keyed by user, and for which
    we fetch rows using the same scheme whether we are
    exporting a realm or a single user.

    For any table where there is nuance between how you
    fetch for realms vs. single users, it's best to just
    keep things simple and have each caller maintain its
    own slightly different 4/5 line Config (while still
    possibly calling common code deeper in the stack).

    As of now, we do NOT include bot tables like Service.
    """

    Config(
        table="zerver_alertword",
        model=AlertWord,
        normal_parent=user_profile_config,
        include_rows="user_profile_id__in",
        limit_to_consenting_users=True,
    )

    Config(
        table="zerver_customprofilefieldvalue",
        model=CustomProfileFieldValue,
        normal_parent=user_profile_config,
        include_rows="user_profile_id__in",
        # Values of a user's custom profile fields are public.
        limit_to_consenting_users=False,
    )

    Config(
        table="zerver_muteduser",
        model=MutedUser,
        normal_parent=user_profile_config,
        include_rows="user_profile_id__in",
        limit_to_consenting_users=True,
    )

    Config(
        table="zerver_navigationview",
        model=NavigationView,
        normal_parent=user_profile_config,
        include_rows="user_id__in",
        limit_to_consenting_users=True,
    )

    Config(
        table="zerver_onboardingstep",
        model=OnboardingStep,
        normal_parent=user_profile_config,
        include_rows="user_id__in",
        limit_to_consenting_users=True,
    )

    Config(
        table="zerver_savedsnippet",
        model=SavedSnippet,
        normal_parent=user_profile_config,
        include_rows="user_profile_id__in",
        limit_to_consenting_users=True,
    )

    Config(
        table="zerver_useractivity",
        model=UserActivity,
        normal_parent=user_profile_config,
        include_rows="user_profile_id__in",
        limit_to_consenting_users=True,
        collect_client_ids=True,
    )

    Config(
        table="zerver_useractivityinterval",
        model=UserActivityInterval,
        normal_parent=user_profile_config,
        include_rows="user_profile_id__in",
        # Note that only exporting UserActivityInterval data for consenting
        # users means it will be impossible to re-compute certain analytics
        # CountStat statistics from the raw data. This is an acceptable downside
        # of this class of data export.
        limit_to_consenting_users=True,
    )

    Config(
        table="zerver_userpresence",
        model=UserPresence,
        normal_parent=user_profile_config,
        include_rows="user_profile_id__in",
        # Presence data is public.
        limit_to_consenting_users=False,
    )

    Config(
        table="zerver_userstatus",
        model=UserStatus,
        normal_parent=user_profile_config,
        include_rows="user_profile_id__in",
        limit_to_consenting_users=True,
        collect_client_ids=True,
    )

    Config(
        table="zerver_usertopic",
        model=UserTopic,
        normal_parent=user_profile_config,
        include_rows="user_profile_id__in",
        limit_to_consenting_users=True,
    )

    Config(
        table="zerver_externalauthid",
        model=ExternalAuthID,
        normal_parent=user_profile_config,
        include_rows="user_id__in",
        limit_to_consenting_users=True,
    )


# We exclude these fields for the following reasons:
# * api_key is a secret.
# * password is a secret.
# * uuid is unlikely to be useful if the domain changes.
EXCLUDED_USER_PROFILE_FIELDS = ["api_key", "password", "uuid"]


def get_randomized_exported_user_dummy_email_address(realm: Realm) -> str:
    random_token = secrets.token_hex(16)
    return Address(
        username=f"exported-user-{random_token}", domain=get_fake_email_domain(realm.host)
    ).addr_spec
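# Example output shape (illustrative; the domain comes from
# get_fake_email_domain): "exported-user-<32 hex chars>@<fake email domain>".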
| 
 | |
| 
 | |
| def custom_fetch_user_profile(response: TableData, context: Context) -> None:
 | |
|     realm = context["realm"]
 | |
|     export_type = context["export_type"]
 | |
|     exportable_user_ids = context["exportable_user_ids"]
 | |
|     if export_type != RealmExport.EXPORT_FULL_WITH_CONSENT:
 | |
|         # exportable_user_ids should only be passed for consent exports.
 | |
|         assert exportable_user_ids is None
 | |
| 
 | |
|     if export_type == RealmExport.EXPORT_PUBLIC:
 | |
|         # In a public export, none of the users are considered "exportable",
 | |
|         # as we're not exporting anybody's private data.
 | |
|         # The only difference between PUBLIC and a theoretical EXPORT_FULL_WITH_CONSENT
 | |
|         # where 0 users are consenting is that in a PUBLIC export we won't turn users
 | |
|         # into mirrr dummy users. A public export is meant to provide useful accounts
 | |
|         # for everybody after importing; just with all private data removed.
 | |
|         # The only exception to that will be users with email visibility set to "nobody",
 | |
|         # as they can't be functional accounts without a real delivery email - which can't
 | |
|         # be exported.
 | |
|         exportable_user_ids = set()
 | |
| 
 | |
|     query = UserProfile.objects.filter(realm_id=realm.id).exclude(
 | |
|         # These were, in some early versions of Zulip, inserted into
 | |
|         # the first realm that was created.  In those cases, rather
 | |
|         # than include them here, we will include them in the
 | |
|         # crossrealm user list, below.
 | |
|         email__in=settings.CROSS_REALM_BOT_EMAILS,
 | |
|     )
 | |
|     exclude = EXCLUDED_USER_PROFILE_FIELDS
 | |
|     rows = make_raw(list(query), exclude=exclude)
 | |
| 
 | |
|     normal_rows: list[Record] = []
 | |
|     dummy_rows: list[Record] = []
 | |
| 
 | |
|     realm_user_default = RealmUserDefault.objects.get(realm=realm)
 | |
|     for row in rows:
 | |
|         if exportable_user_ids is not None:
 | |
|             if row["id"] in exportable_user_ids:
 | |
|                 pass
 | |
|             else:
 | |
|                 # In a consent export, non-exportable users should be turned into mirror dummies, with the
 | |
|                 # notable exception of users who were already deactivated. Mirror dummies can sign up with the
 | |
|                 # matching email address to reactivate their account. However, deactivated users are
 | |
|                 # specifically meant to be prevented from re-entering the organization with the deactivated
 | |
|                 # account. In order to maintain that restriction through the export->import cycle, we need to
 | |
|                 # keep deactivated accounts as just deactivated - without flipping is_mirror_dummy=True.
 | |
|                 if export_type == RealmExport.EXPORT_FULL_WITH_CONSENT and row["is_active"]:
 | |
|                     row["is_mirror_dummy"] = True
 | |
|                     row["is_active"] = False
 | |
| 
 | |
|                 if row["email_address_visibility"] == UserProfile.EMAIL_ADDRESS_VISIBILITY_NOBODY:
 | |
|                     # The user chose not to make their email address visible even to the realm administrators.
 | |
|                     # Generate a dummy email address for them so that this preference can't be bypassed
 | |
|                     # through the export feature.
 | |
|                     row["delivery_email"] = get_randomized_exported_user_dummy_email_address(realm)
 | |
|                     if export_type == RealmExport.EXPORT_PUBLIC:
 | |
|                         # In a public export, this account obviously becomes unusable due to not having
 | |
|                         # a functional delivery_email.
 | |
|                         row["is_mirror_dummy"] = True
 | |
|                         row["is_active"] = False
 | |
| 
 | |
|                 for settings_name in RealmUserDefault.property_types:
 | |
|                     if settings_name == "email_address_visibility":
 | |
|                         # We should respect users' preference for whether to show their email
 | |
|                         # address to others across the export->import cycle.
 | |
|                         continue
 | |
| 
 | |
|                     value = getattr(realm_user_default, settings_name)
 | |
|                     row[settings_name] = value
 | |
| 
 | |
|         if row["is_mirror_dummy"]:
 | |
|             dummy_rows.append(row)
 | |
|         else:
 | |
|             normal_rows.append(row)
 | |
| 
 | |
|     response["zerver_userprofile"] = normal_rows
 | |
|     response["zerver_userprofile_mirrordummy"] = dummy_rows
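
# To summarize how the loop above classifies rows in a consent export:
#
#   consenting user                  -> exported unchanged (normal_rows)
#   non-consenting, active user      -> turned into a mirror dummy
#                                       (is_mirror_dummy=True, is_active=False)
#   non-consenting, deactivated user -> left deactivated, not a mirror dummy
#   email visibility "nobody"        -> delivery_email replaced with a random
#                                       dummy address
#
# Non-consenting users also have their personal settings reset to the
# realm-level defaults (except email_address_visibility).  Rows with
# is_mirror_dummy=True end up in zerver_userprofile_mirrordummy; all others
# in zerver_userprofile.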


def custom_fetch_user_profile_cross_realm(response: TableData, context: Context) -> None:
    realm = context["realm"]
    response["zerver_userprofile_crossrealm"] = []

    bot_name_to_default_email = {
        "NOTIFICATION_BOT": "notification-bot@zulip.com",
        "EMAIL_GATEWAY_BOT": "emailgateway@zulip.com",
        "WELCOME_BOT": "welcome-bot@zulip.com",
    }

    if realm.string_id == settings.SYSTEM_BOT_REALM:
        return

    internal_realm = get_realm(settings.SYSTEM_BOT_REALM)
    for bot in settings.INTERNAL_BOTS:
        bot_name = bot["var_name"]
        if bot_name not in bot_name_to_default_email:
            continue

        bot_email = bot["email_template"] % (settings.INTERNAL_BOT_DOMAIN,)
        bot_default_email = bot_name_to_default_email[bot_name]
        bot_user_id = get_system_bot(bot_email, internal_realm.id).id

        recipient_id = Recipient.objects.get(type_id=bot_user_id, type=Recipient.PERSONAL).id
        response["zerver_userprofile_crossrealm"].append(
            dict(
                email=bot_default_email,
                id=bot_user_id,
                recipient_id=recipient_id,
            )
        )


def fetch_attachment_data(
    response: TableData, realm_id: int, message_ids: set[int], scheduled_message_ids: set[int]
) -> list[Attachment]:
    attachments = list(
        Attachment.objects.filter(
            Q(messages__in=message_ids) | Q(scheduled_messages__in=scheduled_message_ids),
            realm_id=realm_id,
        ).distinct()
    )
    response["zerver_attachment"] = make_raw(attachments)
    floatify_datetime_fields(response, "zerver_attachment")

    """
    We usually export most messages for the realm, but not
    quite ALL messages for the realm.  So, we need to
    clean up our attachment data to have correct
    values for response['zerver_attachment'][<n>]['messages'].

    Same reasoning applies to scheduled_messages.
    """
    for row in response["zerver_attachment"]:
        filtered_message_ids = set(row["messages"]).intersection(message_ids)
        row["messages"] = sorted(filtered_message_ids)

        filtered_scheduled_message_ids = set(row["scheduled_messages"]).intersection(
            scheduled_message_ids
        )
        row["scheduled_messages"] = sorted(filtered_scheduled_message_ids)

    return attachments
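
# For example, an attachment row whose database value is messages=[1, 5, 9]
# is exported with messages=[1, 9] if only messages 1 and 9 made it into
# this export.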


def custom_fetch_realm_audit_logs_for_user(response: TableData, context: Context) -> None:
    """To be expansive, we include audit log entries for events that
    either modified the target user or where the target user modified
    something (e.g. if they changed the settings for a stream).
    """
    user = context["user"]
    query = RealmAuditLog.objects.filter(Q(modified_user_id=user.id) | Q(acting_user_id=user.id))
    rows = make_raw(list(query))
    response["zerver_realmauditlog"] = rows


def fetch_reaction_data(response: TableData, message_ids: set[int]) -> None:
    query = Reaction.objects.filter(message_id__in=list(message_ids))
    response["zerver_reaction"] = make_raw(list(query))


def fetch_client_data(response: TableData, client_ids: set[int]) -> None:
    query = Client.objects.filter(id__in=list(client_ids))
    response["zerver_client"] = make_raw(list(query))


def custom_fetch_direct_message_groups(response: TableData, context: Context) -> None:
    realm = context["realm"]
    export_type = context["export_type"]
    exportable_user_ids_from_context = context["exportable_user_ids"]

    if export_type == RealmExport.EXPORT_FULL_WITH_CONSENT:
        assert exportable_user_ids_from_context is not None
        consented_user_ids = exportable_user_ids_from_context
    elif export_type == RealmExport.EXPORT_FULL_WITHOUT_CONSENT:
        assert exportable_user_ids_from_context is None
    else:
        assert export_type == RealmExport.EXPORT_PUBLIC
        consented_user_ids = set()

    user_profile_ids = {
        r["id"] for r in response["zerver_userprofile"] + response["zerver_userprofile_mirrordummy"]
    }

    recipient_filter = Q()
    if export_type != RealmExport.EXPORT_FULL_WITHOUT_CONSENT:
        # First we find the set of recipient ids of DirectMessageGroups which can be exported.
        # A DirectMessageGroup can be exported only if at least one of its users is consenting
        # to the export of private data.
        # We can find this set by gathering all the Subscriptions of consenting users to
        # DirectMessageGroups and collecting the set of recipient_ids from those Subscriptions.
        exportable_direct_message_group_recipient_ids = set(
            Subscription.objects.filter(
                recipient__type=Recipient.DIRECT_MESSAGE_GROUP, user_profile__in=consented_user_ids
            )
            .distinct("recipient_id")
            .values_list("recipient_id", flat=True)
        )
        recipient_filter = Q(recipient_id__in=exportable_direct_message_group_recipient_ids)

    # Now we fetch all the Subscription objects to the exportable DirectMessageGroups in the realm.
    realm_direct_message_group_subs = (
        Subscription.objects.select_related("recipient")
        .filter(
            recipient__type=Recipient.DIRECT_MESSAGE_GROUP,
            user_profile__in=user_profile_ids,
        )
        .filter(recipient_filter)
    )
    realm_direct_message_group_recipient_ids = {
        sub.recipient_id for sub in realm_direct_message_group_subs
    }

    # Mark all direct message groups that contain a cross-realm user.
    unsafe_direct_message_group_recipient_ids = set()
    for sub in Subscription.objects.select_related("user_profile").filter(
        recipient__in=realm_direct_message_group_recipient_ids
    ):
        if sub.user_profile.realm_id != realm.id:
            # In almost every case the other realm will be zulip.com.
            unsafe_direct_message_group_recipient_ids.add(sub.recipient_id)

    # Now filter down to just those direct message groups that are
    # entirely within the realm.
    #
    # This is important for ensuring that the User objects needed
    # to import it on the other end exist (since we're only
    # exporting the users from this realm), at the cost of losing
    # some of these cross-realm messages.
    direct_message_group_subs = [
        sub
        for sub in realm_direct_message_group_subs
        if sub.recipient_id not in unsafe_direct_message_group_recipient_ids
    ]
    direct_message_group_recipient_ids = {sub.recipient_id for sub in direct_message_group_subs}
    direct_message_group_ids = {sub.recipient.type_id for sub in direct_message_group_subs}

    direct_message_group_subscription_dicts = make_raw(direct_message_group_subs)
    direct_message_group_recipients = make_raw(
        Recipient.objects.filter(id__in=direct_message_group_recipient_ids)
    )

    response["_huddle_recipient"] = direct_message_group_recipients
    response["_huddle_subscription"] = direct_message_group_subscription_dicts
    response["zerver_huddle"] = make_raw(
        DirectMessageGroup.objects.filter(id__in=direct_message_group_ids)
    )
    if export_type == RealmExport.EXPORT_PUBLIC and any(
        response[t] for t in ["_huddle_recipient", "_huddle_subscription", "zerver_huddle"]
    ):
        raise AssertionError(
            "Public export should not result in exporting any data in _huddle tables"
        )


def custom_fetch_scheduled_messages(response: TableData, context: Context) -> None:
    """
    Simple custom fetch function to fetch only the ScheduledMessage objects that we're allowed to.
    """
    realm = context["realm"]
    exportable_scheduled_message_ids = context["exportable_scheduled_message_ids"]

    query = ScheduledMessage.objects.filter(realm=realm, id__in=exportable_scheduled_message_ids)
    rows = make_raw(list(query))

    response["zerver_scheduledmessage"] = rows


PRESERVED_AUDIT_LOG_EVENT_TYPES = [
    AuditLogEventType.SUBSCRIPTION_CREATED,
    AuditLogEventType.SUBSCRIPTION_ACTIVATED,
    AuditLogEventType.SUBSCRIPTION_DEACTIVATED,
]
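
# These subscription events are preserved even when the modified user is not
# consenting, presumably because channel membership history is needed for
# imported channels to behave correctly; see
# custom_fetch_realm_audit_logs_for_realm below for how this list is applied.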


def custom_fetch_realm_audit_logs_for_realm(response: TableData, context: Context) -> None:
    """
    Simple custom fetch function to fix up .acting_user for some RealmAuditLog objects
    and limit what objects are fetched when doing an export with consent.

    Certain RealmAuditLog objects have an acting_user that is in a different .realm, due to
    the possibility of server administrators (typically with the .is_staff permission) taking
    certain actions to modify UserProfiles or Realms, which will set the .acting_user to
    the administrator's UserProfile, which can be in a different realm. Such an acting_user
    cannot be imported during organization import on another server, so we need to just set it
    to None.
    """
    realm = context["realm"]
    export_type = context["export_type"]
    exportable_user_ids_from_context = context["exportable_user_ids"]
    if export_type == RealmExport.EXPORT_FULL_WITHOUT_CONSENT:
        assert exportable_user_ids_from_context is None
    elif export_type == RealmExport.EXPORT_FULL_WITH_CONSENT:
        assert exportable_user_ids_from_context is not None
        consenting_user_ids = exportable_user_ids_from_context
    else:
        assert export_type == RealmExport.EXPORT_PUBLIC
        assert exportable_user_ids_from_context is None
        consenting_user_ids = set()

    query = RealmAuditLog.objects.filter(realm=realm).select_related("acting_user")
    realmauditlog_objects = list(query)
    for realmauditlog in realmauditlog_objects:
        if realmauditlog.acting_user is not None and realmauditlog.acting_user.realm_id != realm.id:
            realmauditlog.acting_user = None

    # We want to drop all RealmAuditLog objects where modified_user is not a consenting
    # user, except those of event_type in PRESERVED_AUDIT_LOG_EVENT_TYPES.
    realmauditlog_objects_for_export = []
    for realmauditlog in realmauditlog_objects:
        if (
            export_type == RealmExport.EXPORT_FULL_WITHOUT_CONSENT
            or (realmauditlog.event_type in PRESERVED_AUDIT_LOG_EVENT_TYPES)
            or (realmauditlog.modified_user_id is None)
            or (realmauditlog.modified_user_id in consenting_user_ids)
        ):
            realmauditlog_objects_for_export.append(realmauditlog)

    rows = make_raw(realmauditlog_objects_for_export)

    response["zerver_realmauditlog"] = rows


def custom_fetch_onboarding_usermessage(response: TableData, context: Context) -> None:
    realm = context["realm"]
    response["zerver_onboardingusermessage"] = []

    onboarding_usermessage_query = OnboardingUserMessage.objects.filter(realm=realm)
    for onboarding_usermessage in onboarding_usermessage_query:
        onboarding_usermessage_obj = model_to_dict(onboarding_usermessage)
        onboarding_usermessage_obj["flags_mask"] = onboarding_usermessage.flags.mask
        del onboarding_usermessage_obj["flags"]
        response["zerver_onboardingusermessage"].append(onboarding_usermessage_obj)


def fetch_usermessages(
    realm: Realm,
    message_ids: set[int],
    user_profile_ids: set[int],
    message_filename: Path,
    export_full_with_consent: bool,
    consented_user_ids: set[int] | None = None,
) -> list[Record]:
    # UserMessage export security rule: You can export UserMessages
    # for the messages you exported for the users in your realm.
    user_message_query = UserMessage.objects.filter(
        user_profile__realm=realm, message_id__in=message_ids
    )
    if export_full_with_consent:
        assert consented_user_ids is not None
        user_profile_ids = consented_user_ids & user_profile_ids
    user_message_chunk = []
    for user_message in user_message_query:
        if user_message.user_profile_id not in user_profile_ids:
            continue
        user_message_obj = model_to_dict(user_message)
        user_message_obj["flags_mask"] = user_message.flags.mask
        del user_message_obj["flags"]
        user_message_chunk.append(user_message_obj)
    logging.info("Fetched UserMessages for %s", message_filename)
    return user_message_chunk
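
# Note that flags are exported as a raw integer bitmask (flags_mask) rather
# than as symbolic names; e.g., with the current bit assignments (read=1,
# mentioned=8), a read message that mentions the user exports as
# flags_mask=9.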


def export_usermessages_batch(
    input_path: Path,
    output_path: Path,
    export_full_with_consent: bool,
    consented_user_ids: set[int] | None = None,
) -> None:
    """As part of the system for doing parallel exports, this runs on one
    batch of Message objects and adds the corresponding UserMessage
    objects. (This is called by the export_usermessage_batch
    management command.)

    See write_message_partials for more context."""
    assert input_path.endswith((".partial", ".locked"))
    assert output_path.endswith(".json")

    with open(input_path, "rb") as input_file:
        input_data: MessagePartial = orjson.loads(input_file.read())

    message_ids = {item["id"] for item in input_data["zerver_message"]}
    user_profile_ids = set(input_data["zerver_userprofile_ids"])
    realm = Realm.objects.get(id=input_data["realm_id"])
    zerver_usermessage_data = fetch_usermessages(
        realm,
        message_ids,
        user_profile_ids,
        output_path,
        export_full_with_consent,
        consented_user_ids=consented_user_ids,
    )

    output_data: TableData = dict(
        zerver_message=input_data["zerver_message"],
        zerver_usermessage=zerver_usermessage_data,
    )
    write_table_data(output_path, output_data)
    os.unlink(input_path)


def export_partial_message_files(
    realm: Realm,
    response: TableData,
    export_type: int,
    collected_client_ids: set[int],
    exportable_user_ids: set[int] | None,
    chunk_size: int = MESSAGE_BATCH_CHUNK_SIZE,
    output_dir: Path | None = None,
) -> set[int]:
    if output_dir is None:
        output_dir = tempfile.mkdtemp(prefix="zulip-export")

    def get_ids(records: Iterable[Mapping[str, Any]]) -> set[int]:
        return {x["id"] for x in records}

    # Basic security rule: You can export everything either...
    #   - sent by someone in your exportable_user_ids
    #        OR
    #   - received by someone in your exportable_user_ids (which
    #     equates to a recipient object we are exporting)
    #
    # TODO: In theory, you should be able to export messages in
    # cross-realm direct message threads; currently, this only
    # exports cross-realm messages received by your realm that
    # were sent by Zulip system bots (e.g. emailgateway,
    # notification-bot).

    # Here, "we" and "us" refers to the inner circle of users who
    # were specified as being allowed to be exported.  "Them"
    # refers to other users.
    user_ids_for_us = get_ids(
        response["zerver_userprofile"],
    )
    ids_of_our_possible_senders = get_ids(
        response["zerver_userprofile"]
        + response["zerver_userprofile_mirrordummy"]
        + response["zerver_userprofile_crossrealm"]
    )

    consented_user_ids: set[int] = set()
    if export_type == RealmExport.EXPORT_FULL_WITH_CONSENT:
        assert exportable_user_ids is not None
        consented_user_ids = exportable_user_ids

    if export_type == RealmExport.EXPORT_PUBLIC:
        recipient_streams = Stream.objects.filter(realm=realm, invite_only=False)
        recipient_ids = Recipient.objects.filter(
            type=Recipient.STREAM, type_id__in=recipient_streams
        ).values_list("id", flat=True)
        recipient_ids_for_us = get_ids(response["zerver_recipient"]) & set(recipient_ids)
    elif export_type == RealmExport.EXPORT_FULL_WITH_CONSENT:
        public_streams = Stream.objects.filter(realm=realm, invite_only=False)
        public_stream_recipient_ids = Recipient.objects.filter(
            type=Recipient.STREAM, type_id__in=public_streams
        ).values_list("id", flat=True)

        streams_with_protected_history_recipient_ids = Stream.objects.filter(
            realm=realm, history_public_to_subscribers=False
        ).values_list("recipient_id", flat=True)

        consented_recipient_ids = Subscription.objects.filter(
            user_profile_id__in=consented_user_ids
        ).values_list("recipient_id", flat=True)

        recipient_ids_set = set(public_stream_recipient_ids) | (
            set(consented_recipient_ids) - set(streams_with_protected_history_recipient_ids)
        )
        recipient_ids_for_us = get_ids(response["zerver_recipient"]) & recipient_ids_set
    else:
        recipient_ids_for_us = get_ids(response["zerver_recipient"])
        # For a full export, we have implicit consent for all users in the export.
        consented_user_ids = user_ids_for_us

    if export_type == RealmExport.EXPORT_PUBLIC:
        messages_we_received = Message.objects.filter(
            # Uses index: zerver_message_realm_sender_recipient
            realm_id=realm.id,
            sender__in=ids_of_our_possible_senders,
            recipient__in=recipient_ids_for_us,
        )

        # For the public stream export, we only need the messages those streams received.
        message_queries = [
            messages_we_received,
        ]
    else:
        message_queries = []

        # We capture most messages here: Messages that were sent by
        # anyone in the export and received by any of the users who we
        # have consent to export.
        messages_we_received = Message.objects.filter(
            # Uses index: zerver_message_realm_sender_recipient
            realm_id=realm.id,
            sender__in=ids_of_our_possible_senders,
            recipient__in=recipient_ids_for_us,
        )
        message_queries.append(messages_we_received)

        if export_type == RealmExport.EXPORT_FULL_WITH_CONSENT:
            # Export with member consent requires some careful handling to make sure
            # we only include messages that a consenting user can access.
            has_usermessage_expression = Exists(
                UserMessage.objects.filter(
                    user_profile_id__in=consented_user_ids, message_id=OuterRef("id")
                )
            )
            messages_we_received_in_protected_history_streams = Message.objects.alias(
                has_usermessage=has_usermessage_expression
            ).filter(
                # Uses index: zerver_message_realm_sender_recipient
                realm_id=realm.id,
                sender__in=ids_of_our_possible_senders,
                recipient_id__in=(
                    set(consented_recipient_ids) & set(streams_with_protected_history_recipient_ids)
                ),
                has_usermessage=True,
            )

            message_queries.append(messages_we_received_in_protected_history_streams)

        # The above query is missing some messages that consenting
        # users have access to, namely, direct messages sent by one
        # of the users in our export to another user (since the only
        # subscriber to a Recipient object for Recipient.PERSONAL is
        # the recipient, not the sender). The `consented_user_ids`
        # list has precisely those users whose Recipient.PERSONAL
        # recipient ID was already present in recipient_ids_for_us
        # above.
        ids_of_non_exported_possible_recipients = ids_of_our_possible_senders - consented_user_ids

        recipients_for_them = Recipient.objects.filter(
            type=Recipient.PERSONAL, type_id__in=ids_of_non_exported_possible_recipients
        ).values("id")
        recipient_ids_for_them = get_ids(recipients_for_them)

        messages_we_sent_to_them = Message.objects.filter(
            # Uses index: zerver_message_realm_sender_recipient
            realm_id=realm.id,
            sender__in=consented_user_ids,
            recipient__in=recipient_ids_for_them,
        )

        message_queries.append(messages_we_sent_to_them)

    all_message_ids: set[int] = set()

    for message_query in message_queries:
        message_ids = set(get_id_list_gently_from_database(base_query=message_query, id_field="id"))

        # We expect our queries to be disjoint, although this assertion
        # isn't strictly necessary if you don't mind a little bit of
        # overhead.
        assert len(message_ids.intersection(all_message_ids)) == 0

        all_message_ids |= message_ids

    message_id_chunks = chunkify(sorted(all_message_ids), chunk_size=chunk_size)

    write_message_partials(
        realm=realm,
        message_id_chunks=message_id_chunks,
        output_dir=output_dir,
        user_profile_ids=user_ids_for_us,
        collected_client_ids=collected_client_ids,
    )

    return all_message_ids


def write_message_partials(
    *,
    realm: Realm,
    message_id_chunks: list[list[int]],
    output_dir: Path,
    user_profile_ids: set[int],
    collected_client_ids: set[int],
) -> None:
    dump_file_id = 1

    for message_id_chunk in message_id_chunks:
        # Uses index: zerver_message_pkey
        actual_query = Message.objects.filter(id__in=message_id_chunk).order_by("id")
        message_chunk = make_raw(actual_query)

        for row in message_chunk:
            collected_client_ids.add(row["sending_client"])

        # Figure out the name of our shard file.
        message_filename = os.path.join(output_dir, f"messages-{dump_file_id:06}.json")
        message_filename += ".partial"
        logging.info("Fetched messages for %s", message_filename)

        # Clean up our messages.
        table_data: TableData = {}
        table_data["zerver_message"] = message_chunk
        floatify_datetime_fields(table_data, "zerver_message")

        # Build up our output for the .partial file, which needs
        # a list of user_profile_ids to search for (as well as
        # the realm id).
        output: MessagePartial = dict(
            zerver_message=table_data["zerver_message"],
            zerver_userprofile_ids=list(user_profile_ids),
            realm_id=realm.id,
        )

        # And write the data.
        write_data_to_file(message_filename, output)
        dump_file_id += 1
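
# The shard files written above are numbered sequentially
# (messages-000001.json.partial, messages-000002.json.partial, ...); the
# parallel export_usermessage_batch processes later turn each one into the
# corresponding messages-NNNNNN.json, adding the zerver_usermessage data and
# deleting the partial file (see export_usermessages_batch above).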


def export_uploads_and_avatars(
    realm: Realm,
    *,
    attachments: list[Attachment] | None = None,
    user: UserProfile | None,
    output_dir: Path,
) -> None:
    uploads_output_dir = os.path.join(output_dir, "uploads")
    avatars_output_dir = os.path.join(output_dir, "avatars")
    realm_icons_output_dir = os.path.join(output_dir, "realm_icons")
    emoji_output_dir = os.path.join(output_dir, "emoji")

    for dir_path in (
        uploads_output_dir,
        avatars_output_dir,
        emoji_output_dir,
    ):
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    # Avoid creating realm_icons_output_dir for single user exports
    if user is None and not os.path.exists(realm_icons_output_dir):
        os.makedirs(realm_icons_output_dir)

    if user is None:
        handle_system_bots = True
        users = list(UserProfile.objects.filter(realm=realm))
        assert attachments is not None
        realm_emojis = list(RealmEmoji.objects.filter(realm_id=realm.id))
    else:
        handle_system_bots = False
        users = [user]
        attachments = list(Attachment.objects.filter(owner_id=user.id))
        realm_emojis = list(RealmEmoji.objects.filter(author_id=user.id))

    if settings.LOCAL_UPLOADS_DIR:
        assert settings.LOCAL_FILES_DIR
        assert settings.LOCAL_AVATARS_DIR
        # Small installations and developers will usually just store files locally.
        export_uploads_from_local(
            realm,
            local_dir=settings.LOCAL_FILES_DIR,
            output_dir=uploads_output_dir,
            attachments=attachments,
        )
        export_avatars_from_local(
            realm,
            local_dir=settings.LOCAL_AVATARS_DIR,
            output_dir=avatars_output_dir,
            users=users,
            handle_system_bots=handle_system_bots,
        )
        export_emoji_from_local(
            realm,
            local_dir=settings.LOCAL_AVATARS_DIR,
            output_dir=emoji_output_dir,
            realm_emojis=realm_emojis,
        )

        if user is None:
            export_realm_icons(
                realm,
                local_dir=settings.LOCAL_AVATARS_DIR,
                output_dir=realm_icons_output_dir,
            )
    else:
        user_ids = {user.id for user in users}

        # Some bigger installations will have their data stored on S3.

        path_ids = {attachment.path_id for attachment in attachments}

        export_files_from_s3(
            realm,
            handle_system_bots=handle_system_bots,
            flavor="upload",
            bucket_name=settings.S3_AUTH_UPLOADS_BUCKET,
            object_prefix=f"{realm.id}/",
            output_dir=uploads_output_dir,
            user_ids=user_ids,
            valid_hashes=path_ids,
        )

        avatar_hash_values = set()
        for avatar_user in users:
            avatar_path = user_avatar_base_path_from_ids(
                avatar_user.id, avatar_user.avatar_version, realm.id
            )
            avatar_hash_values.add(avatar_path)
            avatar_hash_values.add(avatar_path + ".original")

        export_files_from_s3(
            realm,
            handle_system_bots=handle_system_bots,
            flavor="avatar",
            bucket_name=settings.S3_AVATAR_BUCKET,
            object_prefix=f"{realm.id}/",
            output_dir=avatars_output_dir,
            user_ids=user_ids,
            valid_hashes=avatar_hash_values,
        )

        emoji_paths = set()
        for realm_emoji in realm_emojis:
            emoji_path = get_emoji_path(realm_emoji)
            emoji_paths.add(emoji_path)
            emoji_paths.add(emoji_path + ".original")

        export_files_from_s3(
            realm,
            handle_system_bots=handle_system_bots,
            flavor="emoji",
            bucket_name=settings.S3_AVATAR_BUCKET,
            object_prefix=f"{realm.id}/emoji/images/",
            output_dir=emoji_output_dir,
            user_ids=user_ids,
            valid_hashes=emoji_paths,
        )

        if user is None:
            export_files_from_s3(
                realm,
                handle_system_bots=handle_system_bots,
                flavor="realm_icon_or_logo",
                bucket_name=settings.S3_AVATAR_BUCKET,
                object_prefix=f"{realm.id}/realm/",
                output_dir=realm_icons_output_dir,
                user_ids=user_ids,
                valid_hashes=None,
            )


def _get_exported_s3_record(
    bucket_name: str, key: "Object", processing_emoji: bool
) -> dict[str, Any]:
    # Helper function for export_files_from_s3
    record: dict[str, Any] = dict(
        s3_path=key.key,
        bucket=bucket_name,
        size=key.content_length,
        last_modified=key.last_modified,
        content_type=key.content_type,
        md5=key.e_tag,
    )
    record.update(key.metadata)

    if processing_emoji:
        file_name = os.path.basename(key.key)
        # Both the main emoji file and the .original version should have the same
        # file_name value in the record, as they reference the same emoji.
        file_name = file_name.removesuffix(".original")
        record["file_name"] = file_name

    if "user_profile_id" in record:
        user_profile = get_user_profile_by_id(int(record["user_profile_id"]))
        record["user_profile_email"] = user_profile.email

        # Fix the record ids
        record["user_profile_id"] = int(record["user_profile_id"])

        # A few early avatars don't have 'realm_id' on the object; fix their metadata
        if "realm_id" not in record:
            record["realm_id"] = user_profile.realm_id
    else:
        # There are some rare cases in which 'user_profile_id' may not be present
        # in S3 metadata. E.g., exporting an organization which was created
        # initially from a local export won't have the "user_profile_id" metadata
        # set for realm_icons and realm_logos.
        pass

    if "realm_id" in record:
        record["realm_id"] = int(record["realm_id"])
    else:
        raise Exception("Missing realm_id")

    if "avatar_version" in record:
        record["avatar_version"] = int(record["avatar_version"])

    return record
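
# The resulting record is roughly of this shape (values illustrative):
#
#     {
#         "s3_path": "2/ab/0123abcd/screenshot.png",
#         "bucket": "example-uploads-bucket",
#         "size": 12345,
#         "last_modified": datetime(2024, 1, 1, ...),
#         "content_type": "image/png",
#         "md5": "...",
#         "realm_id": 2,
#         "user_profile_id": 42,
#         "user_profile_email": "user@example.com",
#     }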


def _save_s3_object_to_file(
    key: "Object",
    output_dir: str,
    processing_uploads: bool,
) -> None:
    # Helper function for export_files_from_s3
    if not processing_uploads:
        filename = os.path.join(output_dir, key.key)
    else:
        fields = key.key.split("/")
        if len(fields) != 3:
            raise AssertionError(f"Suspicious key with invalid format {key.key}")
        filename = os.path.join(output_dir, key.key)

    if "../" in filename:
        raise AssertionError(f"Suspicious file with invalid format {filename}")

    # Use 'mark_sanitized' to cause Pysa to ignore the flow of user controlled
    # data into the filesystem sink, because we've already prevented directory
    # traversal with our assertion above.
    dirname = mark_sanitized(os.path.dirname(filename))

    if not os.path.exists(dirname):
        os.makedirs(dirname)
    key.download_file(Filename=filename)


def export_files_from_s3(
    realm: Realm,
    handle_system_bots: bool,
    flavor: str,
    bucket_name: str,
    object_prefix: str,
    output_dir: Path,
    user_ids: set[int],
    valid_hashes: set[str] | None,
) -> None:
    processing_uploads = flavor == "upload"
    processing_emoji = flavor == "emoji"

    bucket = get_bucket(bucket_name)
    records = []

    logging.info("Downloading %s files from %s", flavor, bucket_name)

    email_gateway_bot: UserProfile | None = None

    if handle_system_bots and settings.EMAIL_GATEWAY_BOT is not None:
        internal_realm = get_realm(settings.SYSTEM_BOT_REALM)
        email_gateway_bot = get_system_bot(settings.EMAIL_GATEWAY_BOT, internal_realm.id)
        user_ids.add(email_gateway_bot.id)

    count = 0
    for bkey in bucket.objects.filter(Prefix=object_prefix):
        if valid_hashes is not None and bkey.Object().key not in valid_hashes:
            continue

        key = bucket.Object(bkey.key)

        """
        For very old realms we may not have proper metadata. If you really need
        an export to bypass these checks, flip the following flag.
        """
        checking_metadata = True
        if checking_metadata:
            if "realm_id" not in key.metadata:
                raise AssertionError(f"Missing realm_id in key metadata: {key.metadata}")

            if "user_profile_id" not in key.metadata:
                raise AssertionError(f"Missing user_profile_id in key metadata: {key.metadata}")

            if int(key.metadata["user_profile_id"]) not in user_ids:
                continue

            # This can happen if an email address has moved realms
            if key.metadata["realm_id"] != str(realm.id):
                if email_gateway_bot is None or key.metadata["user_profile_id"] != str(
                    email_gateway_bot.id
                ):
                    raise AssertionError(
                        f"Key metadata problem: {key.key} / {key.metadata} / {realm.id}"
                    )
                # Email gateway bot sends messages, potentially including attachments, cross-realm.
                print(f"File uploaded by email gateway bot: {key.key} / {key.metadata}")

        record = _get_exported_s3_record(bucket_name, key, processing_emoji)

        record["path"] = key.key
        _save_s3_object_to_file(key, output_dir, processing_uploads)

        records.append(record)
        count += 1

        if count % 100 == 0:
            logging.info("Finished %s", count)

    write_records_json_file(output_dir, records)


def export_uploads_from_local(
    realm: Realm, local_dir: Path, output_dir: Path, attachments: list[Attachment]
) -> None:
    records = []
    for count, attachment in enumerate(attachments, 1):
        # Use 'mark_sanitized' to work around a false positive caused by Pysa
        # thinking that 'realm' (and thus 'attachment' and 'attachment.path_id')
        # are user controlled.
        path_id = mark_sanitized(attachment.path_id)

        local_path = os.path.join(local_dir, path_id)
        output_path = os.path.join(output_dir, path_id)

        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        shutil.copy2(local_path, output_path)
        stat = os.stat(local_path)
        record = dict(
            realm_id=attachment.realm_id,
            user_profile_id=attachment.owner.id,
            user_profile_email=attachment.owner.email,
            s3_path=path_id,
            path=path_id,
            size=stat.st_size,
            last_modified=stat.st_mtime,
            content_type=None,
        )
        records.append(record)

        if count % 100 == 0:
            logging.info("Finished %s", count)

    write_records_json_file(output_dir, records)


def export_avatars_from_local(
    realm: Realm,
    local_dir: Path,
    output_dir: Path,
    users: list[UserProfile],
    handle_system_bots: bool,
) -> None:
    count = 0
    records = []

    if handle_system_bots:
        internal_realm = get_realm(settings.SYSTEM_BOT_REALM)
        users += [
            get_system_bot(settings.NOTIFICATION_BOT, internal_realm.id),
            get_system_bot(settings.EMAIL_GATEWAY_BOT, internal_realm.id),
            get_system_bot(settings.WELCOME_BOT, internal_realm.id),
        ]

    for user in users:
        if user.avatar_source == UserProfile.AVATAR_FROM_GRAVATAR:
            continue

        avatar_path = user_avatar_base_path_from_ids(user.id, user.avatar_version, realm.id)
        wildcard = os.path.join(local_dir, avatar_path + ".*")

        for local_path in glob.glob(wildcard):
            logging.info(
                "Copying avatar file for user %s from %s",
                user.email,
                local_path,
            )
            fn = os.path.relpath(local_path, local_dir)
            output_path = os.path.join(output_dir, fn)
            os.makedirs(str(os.path.dirname(output_path)), exist_ok=True)
            shutil.copy2(str(local_path), str(output_path))
            stat = os.stat(local_path)
            record = dict(
                realm_id=realm.id,
                user_profile_id=user.id,
                user_profile_email=user.email,
                avatar_version=user.avatar_version,
                s3_path=fn,
                path=fn,
                size=stat.st_size,
                last_modified=stat.st_mtime,
                content_type=None,
            )
            records.append(record)

            count += 1

            if count % 100 == 0:
                logging.info("Finished %s", count)

    write_records_json_file(output_dir, records)


def export_realm_icons(realm: Realm, local_dir: Path, output_dir: Path) -> None:
    records = []
    dir_relative_path = zerver.lib.upload.upload_backend.realm_avatar_and_logo_path(realm)
    icons_wildcard = os.path.join(local_dir, dir_relative_path, "*")
    for icon_absolute_path in glob.glob(icons_wildcard):
        icon_file_name = os.path.basename(icon_absolute_path)
        icon_relative_path = os.path.join(str(realm.id), icon_file_name)
        output_path = os.path.join(output_dir, icon_relative_path)
        os.makedirs(str(os.path.dirname(output_path)), exist_ok=True)
        shutil.copy2(str(icon_absolute_path), str(output_path))
        record = dict(realm_id=realm.id, path=icon_relative_path, s3_path=icon_relative_path)
        records.append(record)

    write_records_json_file(output_dir, records)


def get_emoji_path(realm_emoji: RealmEmoji) -> str:
    return RealmEmoji.PATH_ID_TEMPLATE.format(
        realm_id=realm_emoji.realm_id,
        emoji_file_name=realm_emoji.file_name,
    )


def export_emoji_from_local(
    realm: Realm, local_dir: Path, output_dir: Path, realm_emojis: list[RealmEmoji]
) -> None:
    records = []

    realm_emoji_helper_tuples: list[tuple[RealmEmoji, str]] = []
    for realm_emoji in realm_emojis:
        realm_emoji_path = get_emoji_path(realm_emoji)

        # Use 'mark_sanitized' to work around a false positive caused by Pysa
        # thinking that 'realm' (and thus 'realm_emoji' and 'realm_emoji_path')
        # are user controlled.
        realm_emoji_path = mark_sanitized(realm_emoji_path)

        realm_emoji_path_original = realm_emoji_path + ".original"

        realm_emoji_helper_tuples.append((realm_emoji, realm_emoji_path))
        realm_emoji_helper_tuples.append((realm_emoji, realm_emoji_path_original))

    for count, realm_emoji_helper_tuple in enumerate(realm_emoji_helper_tuples, 1):
        realm_emoji_object, emoji_path = realm_emoji_helper_tuple

        local_path = os.path.join(local_dir, emoji_path)
        output_path = os.path.join(output_dir, emoji_path)

        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        shutil.copy2(local_path, output_path)
        # Realm emoji author is optional.
        author = realm_emoji_object.author
        author_id = None
        if author:
            author_id = author.id
        record = dict(
            realm_id=realm.id,
            author=author_id,
            path=emoji_path,
            s3_path=emoji_path,
            file_name=realm_emoji_object.file_name,
            name=realm_emoji_object.name,
            deactivated=realm_emoji_object.deactivated,
        )
        records.append(record)

        if count % 100 == 0:
            logging.info("Finished %s", count)

    write_records_json_file(output_dir, records)


def do_write_stats_file_for_realm_export(output_dir: Path) -> dict[str, int | dict[str, int]]:
    stats_file = os.path.join(output_dir, "stats.json")
    realm_file = os.path.join(output_dir, "realm.json")
    attachment_file = os.path.join(output_dir, "attachment.json")
    analytics_file = os.path.join(output_dir, "analytics.json")
    message_files = glob.glob(os.path.join(output_dir, "messages-*.json"))
    filenames = sorted([analytics_file, attachment_file, *message_files, realm_file])

    logging.info("Writing stats file: %s\n", stats_file)

    stats: dict[str, int | dict[str, int]] = {}
    for filename in filenames:
        name = os.path.splitext(os.path.basename(filename))[0]
        with open(filename, "rb") as json_file:
            data = orjson.loads(json_file.read())
        stats[name] = {k: len(data[k]) for k in sorted(data)}

    for category in ["avatars", "uploads", "emoji", "realm_icons"]:
        filename = os.path.join(output_dir, category, "records.json")
        with open(filename, "rb") as json_file:
            data = orjson.loads(json_file.read())
        stats[f"{category}_records"] = len(data)

    with open(stats_file, "wb") as f:
        f.write(orjson.dumps(stats, option=orjson.OPT_INDENT_2))

    return stats
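
# stats.json ends up mapping each data file to its per-table row counts, plus
# flat counts for the records.json files; roughly (values illustrative):
#
#     {
#         "attachment": {"zerver_attachment": 13},
#         "messages-000001": {"zerver_message": 1000, "zerver_usermessage": 4567},
#         "realm": {"zerver_realm": 1, "zerver_stream": 12, ...},
#         "avatars_records": 5,
#         "emoji_records": 2,
#         "realm_icons_records": 0,
#         "uploads_records": 13,
#     }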


def get_exportable_scheduled_message_ids(
    realm: Realm, export_type: int, exportable_user_ids: set[int] | None
) -> set[int]:
    """
    Scheduled messages are private to the sender, so which ones we export depends on the
    public/consent/full export mode.
    """

    if export_type == RealmExport.EXPORT_PUBLIC:
        return set()

    if export_type == RealmExport.EXPORT_FULL_WITH_CONSENT:
        assert exportable_user_ids is not None
        sender_ids = exportable_user_ids
        return set(
            ScheduledMessage.objects.filter(sender_id__in=sender_ids, realm=realm).values_list(
                "id", flat=True
            )
        )

    return set(ScheduledMessage.objects.filter(realm=realm).values_list("id", flat=True))


def do_export_realm(
    realm: Realm,
    output_dir: Path,
    threads: int,
    export_type: int,
    exportable_user_ids: set[int] | None = None,
    export_as_active: bool | None = None,
) -> tuple[str, dict[str, int | dict[str, int]]]:
    response: TableData = {}
    if exportable_user_ids is not None:
        # We only use this arg for consent exports. Any other usage
        # indicates a bug.
        assert export_type == RealmExport.EXPORT_FULL_WITH_CONSENT

    # We need at least one thread running to export
    # UserMessage rows.  The management command should
    # enforce this for us.
    if not settings.TEST_SUITE:
        assert threads >= 1

    realm_config = get_realm_config()

    create_soft_link(source=output_dir, in_progress=True)

    exportable_scheduled_message_ids = get_exportable_scheduled_message_ids(
        realm, export_type, exportable_user_ids
    )
    collected_client_ids = set(
        ScheduledMessage.objects.filter(id__in=exportable_scheduled_message_ids)
        .order_by("sending_client_id")
        .distinct("sending_client_id")
        .values_list("sending_client_id", flat=True)
    )

    logging.info("Exporting data from get_realm_config()...")
    export_from_config(
        response=response,
        config=realm_config,
        seed_object=realm,
        context=dict(
            realm=realm,
            export_type=export_type,
            exportable_user_ids=exportable_user_ids,
            exportable_scheduled_message_ids=exportable_scheduled_message_ids,
            collected_client_ids_set=collected_client_ids,
        ),
    )
    logging.info("...DONE with get_realm_config() data")

    sanity_check_output(response)

    # We (sort of) export zerver_message rows here.  We write
    # them to .partial files that are subsequently fleshed out
    # by parallel processes to add in zerver_usermessage data.
    # This is for performance reasons, of course.  Some installations
    # have millions of messages.
    logging.info("Exporting .partial message files")
    message_ids = export_partial_message_files(
        realm,
        response,
        export_type=export_type,
        exportable_user_ids=exportable_user_ids,
        output_dir=output_dir,
        collected_client_ids=collected_client_ids,
    )
    logging.info("%d messages were exported", len(message_ids))

    # zerver_reaction
    zerver_reaction: TableData = {}
    fetch_reaction_data(response=zerver_reaction, message_ids=message_ids)
    response.update(zerver_reaction)

    zerver_client: TableData = {}
    fetch_client_data(response=zerver_client, client_ids=collected_client_ids)
    response.update(zerver_client)

    # Override the "deactivated" flag on the realm
    if export_as_active is not None:
        response["zerver_realm"][0]["deactivated"] = not export_as_active

    response["import_source"] = "zulip"  # type: ignore[assignment]  # this is an extra info field, not TableData

    # Write realm data
    export_file = os.path.join(output_dir, "realm.json")
    write_table_data(output_file=export_file, data=response)

    # Write analytics data
    export_analytics_tables(realm=realm, output_dir=output_dir)

    # zerver_attachment
    attachments = export_attachment_table(
        realm=realm,
        output_dir=output_dir,
        message_ids=message_ids,
        scheduled_message_ids=exportable_scheduled_message_ids,
    )

    logging.info("Exporting uploaded files and avatars")
    export_uploads_and_avatars(realm, attachments=attachments, user=None, output_dir=output_dir)

    # Start parallel jobs to export the UserMessage objects.
    launch_user_message_subprocesses(
        threads=threads,
        output_dir=output_dir,
        export_full_with_consent=export_type == RealmExport.EXPORT_FULL_WITH_CONSENT,
        exportable_user_ids=exportable_user_ids,
    )

    do_common_export_processes(output_dir)

    logging.info("Finished exporting %s", realm.string_id)
    create_soft_link(source=output_dir, in_progress=False)

    stats = do_write_stats_file_for_realm_export(output_dir)

    logging.info("Compressing tarball...")
    tarball_path = output_dir.rstrip("/") + ".tar.gz"
    subprocess.check_call(
        [
            "tar",
            f"-czf{tarball_path}",
            f"-C{os.path.dirname(output_dir)}",
            os.path.basename(output_dir),
        ]
    )
    return tarball_path, stats
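
# Sketch of a typical invocation, roughly as done by the realm export
# management command (argument values illustrative):
#
#     tarball_path, stats = do_export_realm(
#         realm,
#         output_dir="/tmp/zulip-export-ab12cd",
#         threads=6,
#         export_type=RealmExport.EXPORT_PUBLIC,
#     )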
 | |
| 
 | |
| 
 | |
| def export_attachment_table(
 | |
|     realm: Realm, output_dir: Path, message_ids: set[int], scheduled_message_ids: set[int]
 | |
| ) -> list[Attachment]:
 | |
|     response: TableData = {}
 | |
|     attachments = fetch_attachment_data(
 | |
|         response=response,
 | |
|         realm_id=realm.id,
 | |
|         message_ids=message_ids,
 | |
|         scheduled_message_ids=scheduled_message_ids,
 | |
|     )
 | |
|     output_file = os.path.join(output_dir, "attachment.json")
 | |
|     write_table_data(output_file=output_file, data=response)
 | |
|     return attachments
 | |
| 
 | |
| 
 | |
def create_soft_link(source: Path, in_progress: bool = True) -> None:
    is_done = not in_progress
    if settings.DEVELOPMENT:
        in_progress_link = os.path.join(settings.DEPLOY_ROOT, "var", "export-in-progress")
        done_link = os.path.join(settings.DEPLOY_ROOT, "var", "export-most-recent")
    else:
        in_progress_link = "/home/zulip/export-in-progress"
        done_link = "/home/zulip/export-most-recent"

    if in_progress:
        new_target = in_progress_link
    else:
        with suppress(FileNotFoundError):
            os.remove(in_progress_link)
        new_target = done_link

    overwrite_symlink(source, new_target)
    if is_done:
        logging.info("See %s for output files", new_target)


def launch_user_message_subprocesses(
    threads: int,
    output_dir: Path,
    export_full_with_consent: bool,
    exportable_user_ids: set[int] | None,
) -> None:
    logging.info("Launching %d PARALLEL subprocesses to export UserMessage rows", threads)
    pids = {}

    if export_full_with_consent:
        assert exportable_user_ids is not None
        consented_user_ids_filepath = os.path.join(output_dir, "consented_user_ids.json")
        with open(consented_user_ids_filepath, "wb") as f:
            f.write(orjson.dumps(list(exportable_user_ids)))
        logging.info("Created consented_user_ids.json file.")

    for shard_id in range(threads):
        arguments = [
            os.path.join(settings.DEPLOY_ROOT, "manage.py"),
            "export_usermessage_batch",
            f"--path={output_dir}",
            f"--thread={shard_id}",
        ]
        if export_full_with_consent:
            arguments.append("--export-full-with-consent")

        process = subprocess.Popen(arguments)
        pids[process.pid] = shard_id

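    # Reap the children as they exit: os.wait() blocks until some child
    # terminates and returns its (pid, status), so this loop completes only
    # once every shard has finished.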
    while pids:
        pid, status = os.wait()
        shard = pids.pop(pid)
        print(f"Shard {shard} finished, status {status}")


def do_export_user(user_profile: UserProfile, output_dir: Path) -> None:
    response: TableData = {}

    export_single_user(user_profile, response)
    export_file = os.path.join(output_dir, "user.json")
    write_table_data(output_file=export_file, data=response)

    reaction_message_ids: set[int] = {row["message"] for row in response["zerver_reaction"]}

    logging.info("Exporting messages")
    export_messages_single_user(
        user_profile, output_dir=output_dir, reaction_message_ids=reaction_message_ids
    )

    logging.info("Exporting images")
    export_uploads_and_avatars(user_profile.realm, user=user_profile, output_dir=output_dir)


def export_single_user(user_profile: UserProfile, response: TableData) -> None:
    config = get_single_user_config()
    export_from_config(
        response=response,
        config=config,
        seed_object=user_profile,
        context=dict(user=user_profile),
    )


def get_single_user_config() -> Config:
    # This function defines the limited configuration for what data to
    # export when exporting all data that a single Zulip user has
    # access to in an organization.

    # zerver_userprofile
    user_profile_config = Config(
        table="zerver_userprofile",
        is_seeded=True,
        exclude=EXCLUDED_USER_PROFILE_FIELDS,
    )

    # zerver_subscription
    subscription_config = Config(
        table="zerver_subscription",
        model=Subscription,
        normal_parent=user_profile_config,
        include_rows="user_profile_id__in",
        # Exports with consent are not relevant in the context of exporting
        # a single user.
        limit_to_consenting_users=False,
    )

    # zerver_recipient
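    # A hedged note on the Config machinery: virtual_parent + id_source
    # means this table is not filtered by a foreign key on the parent row;
    # instead, the ids to fetch are collected from the "recipient" column of
    # the zerver_subscription rows exported above.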
    recipient_config = Config(
        table="zerver_recipient",
        model=Recipient,
        virtual_parent=subscription_config,
        id_source=("zerver_subscription", "recipient"),
    )

    # zerver_stream
    #
    # TODO: We currently export the existence of private streams, but
    # not their message history, in the "export with partial member
    # consent" code path.  This is consistent with our documented policy,
    # since that data is available to the organization administrator
    # who initiated the export, but it is unnecessary and potentially
    # confusing; it'd be better to just skip those streams from the
    # export (which would require more complex export logic for the
    # subscription/recipient/stream tables to exclude private streams
    # with no consenting subscribers).
    Config(
        table="zerver_stream",
        model=Stream,
        virtual_parent=recipient_config,
        id_source=("zerver_recipient", "type_id"),
        source_filter=lambda r: r["type"] == Recipient.STREAM,
    )

    Config(
        table="analytics_usercount",
        model=UserCount,
        normal_parent=user_profile_config,
        include_rows="user_id__in",
        limit_to_consenting_users=False,
    )

    Config(
        table="zerver_realmauditlog",
        model=RealmAuditLog,
        virtual_parent=user_profile_config,
        # See the docstring for why we use a custom fetch here.
        custom_fetch=custom_fetch_realm_audit_logs_for_user,
        limit_to_consenting_users=False,
    )

    Config(
        table="zerver_reaction",
        model=Reaction,
        normal_parent=user_profile_config,
        include_rows="user_profile_id__in",
        limit_to_consenting_users=False,
    )

    add_user_profile_child_configs(user_profile_config)

    return user_profile_config


def get_id_list_gently_from_database(*, base_query: Any, id_field: str) -> list[int]:
    """
    Use this function if you need a HUGE number of ids from
    the database, and you don't mind a few extra trips.  Particularly
    for exports, we don't really care about a little extra time
    to finish the export--the much bigger concern is that we don't
    want to overload our database all at once, nor do we want to
    keep a whole bunch of Django objects around in memory.

    So our general process is to call this function first, and then
    we call chunkify to break our ids into small chunks for "fat query"
    batches.

    Even if you are not working at huge scale, this function can
    also be used for the convenience of its API.
    """
    min_id = -1
    all_ids = []
    batch_size = 10000  # we are just getting ints

    assert id_field == "id" or id_field.endswith("_id")

    while True:
        filter_args = {f"{id_field}__gt": min_id}
        new_ids = list(
            base_query.values_list(id_field, flat=True)
            .filter(**filter_args)
            .order_by(id_field)[:batch_size]
        )
        if len(new_ids) == 0:
            break
        all_ids += new_ids
        min_id = new_ids[-1]

    return all_ids


def chunkify(lst: list[int], chunk_size: int) -> list[list[int]]:
    # chunkify([1,2,3,4,5], 2) == [[1,2], [3,4], [5]]
    result = []
    i = 0
    while True:
        chunk = lst[i : i + chunk_size]
        if len(chunk) == 0:
            break
        else:
            result.append(chunk)
            i += chunk_size

    return result
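

# Usage sketch for the two helpers above (hedged; this mirrors what
# export_messages_single_user does below): gather bare ids with keyset
# pagination first, then run one bounded "fat" query per chunk.
#
#     message_ids = get_id_list_gently_from_database(
#         base_query=Message.objects.filter(realm_id=realm_id), id_field="id"
#     )
#     for chunk in chunkify(message_ids, chunk_size=1000):
#         rows = Message.objects.filter(id__in=chunk)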


def export_messages_single_user(
    user_profile: UserProfile, *, output_dir: Path, reaction_message_ids: set[int]
) -> None:
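    # Memoized with functools.cache: many messages share a recipient_id, so
    # this avoids recomputing (and re-querying) the human-readable recipient
    # name for every message in the export.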
    @cache
    def get_recipient(recipient_id: int) -> str:
        recipient = Recipient.objects.get(id=recipient_id)

        if recipient.type == Recipient.STREAM:
            stream = Stream.objects.values("name").get(id=recipient.type_id)
            return stream["name"]

        user_names = (
            UserProfile.objects.filter(
                subscription__recipient_id=recipient.id,
            )
            .order_by("full_name")
            .values_list("full_name", flat=True)
        )

        return ", ".join(user_names)

    messages_from_me = Message.objects.filter(
        # Uses index: zerver_message_realm_sender_recipient (prefix)
        realm_id=user_profile.realm_id,
        sender=user_profile,
    )

    my_subscriptions = Subscription.objects.filter(
        user_profile=user_profile,
        recipient__type__in=[Recipient.PERSONAL, Recipient.DIRECT_MESSAGE_GROUP],
    )
    my_recipient_ids = [sub.recipient_id for sub in my_subscriptions]
    messages_to_me = Message.objects.filter(
        # Uses index: zerver_message_realm_recipient_id (prefix)
        realm_id=user_profile.realm_id,
        recipient_id__in=my_recipient_ids,
    )

    # Find all message ids that pertain to us.
    all_message_ids: set[int] = set()

    for query in [messages_from_me, messages_to_me]:
        all_message_ids |= set(get_id_list_gently_from_database(base_query=query, id_field="id"))

    all_message_ids |= reaction_message_ids

    dump_file_id = 1
    for message_id_chunk in chunkify(sorted(all_message_ids), MESSAGE_BATCH_CHUNK_SIZE):
        fat_query = (
            UserMessage.objects.select_related("message", "message__sending_client")
            .filter(user_profile=user_profile, message_id__in=message_id_chunk)
            .order_by("message_id")
        )

        user_message_chunk = list(fat_query)

        message_chunk = []
        for user_message in user_message_chunk:
            item = model_to_dict(user_message.message)
            item["flags"] = user_message.flags_list()
            item["flags_mask"] = user_message.flags.mask
            # Add a few nice, human-readable details
            item["sending_client_name"] = user_message.message.sending_client.name
            item["recipient_name"] = get_recipient(user_message.message.recipient_id)
            message_chunk.append(item)

        message_filename = os.path.join(output_dir, f"messages-{dump_file_id:06}.json")
        logging.info("Fetched messages for %s", message_filename)

        output = {"zerver_message": message_chunk}
        floatify_datetime_fields(output, "zerver_message")

        write_table_data(message_filename, output)
        dump_file_id += 1


def export_analytics_tables(realm: Realm, output_dir: Path) -> None:
    response: TableData = {}

    logging.info("Fetching analytics table data")
    config = get_analytics_config()
    export_from_config(
        response=response,
        config=config,
        seed_object=realm,
    )

    # The seeding logic results in a duplicate zerver_realm object
    # being included in the analytics data.  We don't want it, as that
    # data is already in `realm.json`, so we just delete it here
    # before writing to disk.
    del response["zerver_realm"]

    export_file = os.path.join(output_dir, "analytics.json")
    write_table_data(output_file=export_file, data=response)


def get_analytics_config() -> Config:
    # This function defines, via the Config system, what data to export
    # into the analytics.json file in a full-realm export.

    analytics_config = Config(
        table="zerver_realm",
        is_seeded=True,
    )

    Config(
        table="analytics_realmcount",
        model=RealmCount,
        normal_parent=analytics_config,
        include_rows="realm_id__in",
    )

    Config(
        table="analytics_usercount",
        model=UserCount,
        normal_parent=analytics_config,
        include_rows="realm_id__in",
    )

    Config(
        table="analytics_streamcount",
        model=StreamCount,
        normal_parent=analytics_config,
        include_rows="realm_id__in",
    )

    return analytics_config


def get_consented_user_ids(realm: Realm) -> set[int]:
    # A UserProfile is consenting to private data export if any of the following hold:
    # 1) It is a human account that has enabled allow_private_data_export.
    # 2) It is a bot account with allow_private_data_export toggled on.
    # 3) It is a bot whose owner satisfies (1).
    # 4) It is a mirror dummy. This is a special case that requires some
    #    explanation. There are two cases where an account will be a mirror dummy:
    #    a) It comes from a 3rd party export (e.g. from Slack) - in some cases,
    #       certain limited accounts are turned into Zulip mirror dummy accounts.
    #       For such an account, the admins already have access to all the original data,
    #       so we can freely consider the user as consenting and export everything.
    #    b) It was imported from another Zulip export in which it was a
    #       non-consenting user, so only the user's public data was exported
    #       and then imported. Therefore, again, we can consider the user as
    #       consenting and export everything - all this data is public by
    #       construction.

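    # The consenting_humans CTE collects case (1); the outer query's four OR
    # branches then cover cases (1) through (4), in that order.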
    query = sql.SQL("""
        WITH consenting_humans AS (
            SELECT id
            FROM zerver_userprofile
            WHERE allow_private_data_export
              AND NOT is_bot
              AND realm_id = {realm_id}
        )
        SELECT id
        FROM zerver_userprofile
        WHERE
            (id IN (SELECT id FROM consenting_humans))
            OR (allow_private_data_export AND is_bot AND realm_id = {realm_id})
            OR (
                bot_owner_id IN (SELECT id FROM consenting_humans)
                AND is_bot
                AND realm_id = {realm_id}
            )
            OR (is_mirror_dummy AND realm_id = {realm_id})
    """).format(realm_id=sql.Literal(realm.id))

    with connection.cursor() as cursor:
        cursor.execute(query)
        rows = cursor.fetchall()
    return {row[0] for row in rows}


def export_realm_wrapper(
    export_row: RealmExport,
    output_dir: str,
    threads: int,
    upload: bool,
    percent_callback: Callable[[Any], None] | None = None,
    export_as_active: bool | None = None,
) -> str | None:
    try:
        export_row.status = RealmExport.STARTED
        export_row.date_started = timezone_now()
        export_row.save(update_fields=["status", "date_started"])

        exportable_user_ids = None
        if export_row.type == RealmExport.EXPORT_FULL_WITH_CONSENT:
            exportable_user_ids = get_consented_user_ids(export_row.realm)

        tarball_path, stats = do_export_realm(
            realm=export_row.realm,
            output_dir=output_dir,
            threads=threads,
            export_type=export_row.type,
            export_as_active=export_as_active,
            exportable_user_ids=exportable_user_ids,
        )

        RealmAuditLog.objects.create(
            acting_user=export_row.acting_user,
            realm=export_row.realm,
            event_type=AuditLogEventType.REALM_EXPORTED,
            event_time=timezone_now(),
            extra_data={"realm_export_id": export_row.id},
        )

        shutil.rmtree(output_dir)
        print(f"Tarball written to {tarball_path}")

        print("Calculating SHA-256 checksum of tarball...")

        # TODO: We can directly use 'hashlib.file_digest'. (New in Python 3.11)
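        # That replacement would look roughly like (a sketch, not yet
        # exercised here):
        #     with open(tarball_path, "rb") as f:
        #         sha256_hash = hashlib.file_digest(f, "sha256")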
        sha256_hash = hashlib.sha256()
        with open(tarball_path, "rb") as f:
            buf = bytearray(2**18)
            view = memoryview(buf)
            while True:
                size = f.readinto(buf)
                if size == 0:
                    break
                sha256_hash.update(view[:size])

        export_row.sha256sum_hex = sha256_hash.hexdigest()
        export_row.tarball_size_bytes = os.path.getsize(tarball_path)
        export_row.status = RealmExport.SUCCEEDED
        export_row.date_succeeded = timezone_now()
        export_row.stats = stats

        print(f"SHA-256 checksum is {export_row.sha256sum_hex}")

        if not upload:
            export_row.save(
                update_fields=[
                    "sha256sum_hex",
                    "tarball_size_bytes",
                    "status",
                    "date_succeeded",
                    "stats",
                ]
            )
            return None

        print("Uploading export tarball...")
        public_url = zerver.lib.upload.upload_backend.upload_export_tarball(
            export_row.realm, tarball_path, percent_callback=percent_callback
        )
        print(f"\nUploaded to {public_url}")

        # Update the export_path field now that the export is complete.
        export_row.export_path = urlsplit(public_url).path
        export_row.save(
            update_fields=[
                "sha256sum_hex",
                "tarball_size_bytes",
                "status",
                "date_succeeded",
                "stats",
                "export_path",
            ]
        )

        os.remove(tarball_path)
        print(f"Successfully deleted the tarball at {tarball_path}")
        return public_url
    except Exception:
        export_row.status = RealmExport.FAILED
        export_row.date_failed = timezone_now()
        export_row.save(update_fields=["status", "date_failed"])
        raise


def get_realm_exports_serialized(realm: Realm) -> list[dict[str, Any]]:
    # Exclude exports made via shell ('acting_user=None'), since they
    # aren't supported in the current API format.
    #
    # TODO: We should return those via the API as well, with an
    # appropriate way to express who issued them; this requires an
    # API change.
    all_exports = RealmExport.objects.filter(realm=realm).exclude(acting_user=None)
    exports_dict = {}
    for export in all_exports:
        export_url = None
        export_path = export.export_path
        pending = export.status in [RealmExport.REQUESTED, RealmExport.STARTED]

        if export.status == RealmExport.SUCCEEDED:
            assert export_path is not None
            export_url = zerver.lib.upload.upload_backend.get_export_tarball_url(realm, export_path)

        deleted_timestamp = (
            datetime_to_timestamp(export.date_deleted) if export.date_deleted else None
        )
        failed_timestamp = datetime_to_timestamp(export.date_failed) if export.date_failed else None
        acting_user = export.acting_user
        assert acting_user is not None
        exports_dict[export.id] = dict(
            id=export.id,
            export_time=datetime_to_timestamp(export.date_requested),
            acting_user_id=acting_user.id,
            export_url=export_url,
            deleted_timestamp=deleted_timestamp,
            failed_timestamp=failed_timestamp,
            pending=pending,
            export_type=export.type,
        )
    return sorted(exports_dict.values(), key=lambda export_dict: export_dict["id"])


def export_migration_status(output_dir: str) -> None:
    migration_status_json = MigrationStatusJson(
        migrations_by_app=parse_migration_status(),
        zulip_version=ZULIP_VERSION,
    )
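    # The resulting migration_status.json is shaped roughly like:
    #     {"migrations_by_app": {...}, "zulip_version": "<ZULIP_VERSION>"}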
    output_file = os.path.join(output_dir, "migration_status.json")
    with open(output_file, "wb") as f:
        f.write(orjson.dumps(migration_status_json, option=orjson.OPT_INDENT_2))


def do_common_export_processes(output_dir: str) -> None:
    # Performs common task(s) necessary for preparing Zulip data exports.
    # This function is typically shared with migration tools in the
    # `zerver/data_import` directory.

    logging.info("Exporting migration status")
    export_migration_status(output_dir)


def check_export_with_consent_is_usable(realm: Realm) -> bool:
    # Users without consent enabled will end up deactivated in the exported
    # data. An organization without a consenting Owner would therefore not be
    # functional after export->import. That's most likely not what the user
    # wants, so we check for such a case.
    consented_user_ids = get_consented_user_ids(realm)
    return UserProfile.objects.filter(
        id__in=consented_user_ids, role=UserProfile.ROLE_REALM_OWNER, realm=realm
    ).exists()


def check_public_export_is_usable(realm: Realm) -> bool:
    # Since users with email visibility set to NOBODY won't have their real
    # emails exported, a public export could result in a lack of functional
    # Owner accounts. We make sure that at least one Owner can have their
    # real email address exported.
    return UserProfile.objects.filter(
        role=UserProfile.ROLE_REALM_OWNER,
        email_address_visibility__in=[
            UserProfile.EMAIL_ADDRESS_VISIBILITY_EVERYONE,
            UserProfile.EMAIL_ADDRESS_VISIBILITY_MEMBERS,
            UserProfile.EMAIL_ADDRESS_VISIBILITY_MODERATORS,
            UserProfile.EMAIL_ADDRESS_VISIBILITY_ADMINS,
        ],
        realm=realm,
    ).exists()