mirror of
https://github.com/zulip/zulip.git
synced 2025-10-23 04:52:12 +00:00
1878 lines
71 KiB
Python
1878 lines
71 KiB
Python
import itertools
|
||
import logging
|
||
import os
|
||
import posixpath
|
||
import random
|
||
import re
|
||
import secrets
|
||
import shutil
|
||
import time
|
||
import zipfile
|
||
from collections import defaultdict
|
||
from collections.abc import Iterator
|
||
from datetime import datetime, timezone
|
||
from email.errors import HeaderDefect
|
||
from email.headerregistry import Address
|
||
from typing import Any, TypeAlias
|
||
from urllib.parse import SplitResult, urlsplit
|
||
|
||
import orjson
|
||
import requests
|
||
from django.conf import settings
|
||
from django.forms.models import model_to_dict
|
||
from django.utils.timezone import now as timezone_now
|
||
|
||
from zerver.data_import.import_util import (
|
||
ZerverFieldsT,
|
||
build_attachment,
|
||
build_avatar,
|
||
build_defaultstream,
|
||
build_direct_message_group,
|
||
build_message,
|
||
build_realm,
|
||
build_recipient,
|
||
build_stream,
|
||
build_subscription,
|
||
build_usermessages,
|
||
build_zerver_realm,
|
||
create_converted_data_files,
|
||
long_term_idle_helper,
|
||
make_subscriber_map,
|
||
process_avatars,
|
||
process_emojis,
|
||
process_uploads,
|
||
validate_user_emails_for_import,
|
||
)
|
||
from zerver.data_import.sequencer import NEXT_ID
|
||
from zerver.data_import.slack_message_conversion import (
|
||
convert_to_zulip_markdown,
|
||
get_user_full_name,
|
||
process_slack_block_and_attachment,
|
||
)
|
||
from zerver.lib.emoji import codepoint_to_name, get_emoji_file_name
|
||
from zerver.lib.exceptions import SlackImportInvalidFileError
|
||
from zerver.lib.export import MESSAGE_BATCH_CHUNK_SIZE, do_common_export_processes
|
||
from zerver.lib.message import truncate_content
|
||
from zerver.lib.mime_types import guess_type
|
||
from zerver.lib.storage import static_path
|
||
from zerver.lib.thumbnail import THUMBNAIL_ACCEPT_IMAGE_TYPES, resize_realm_icon
|
||
from zerver.lib.upload import sanitize_name
|
||
from zerver.models import (
|
||
CustomProfileField,
|
||
CustomProfileFieldValue,
|
||
Reaction,
|
||
Realm,
|
||
RealmEmoji,
|
||
Recipient,
|
||
UserProfile,
|
||
)
|
||
from zerver.models.constants import MAX_TOPIC_NAME_LENGTH
|
||
|
||
SlackToZulipUserIDT: TypeAlias = dict[str, int]
|
||
AddedChannelsT: TypeAlias = dict[str, tuple[str, int]]
|
||
AddedMPIMsT: TypeAlias = dict[str, tuple[str, int]]
|
||
DMMembersT: TypeAlias = dict[str, tuple[str, str]]
|
||
SlackToZulipRecipientT: TypeAlias = dict[str, int]
|
||
|
||
# We can look up unicode codepoints for Slack emoji using iamcal emoji
|
||
# data. https://emojipedia.org/slack/, documents Slack's emoji names
|
||
# are derived from https://github.com/iamcal/emoji-data; this seems
|
||
# likely to remain true since Cal is a Slack's cofounder.
|
||
emoji_data_file_path = static_path("generated/emoji/emoji-datasource-google-emoji.json")
|
||
with open(emoji_data_file_path, "rb") as emoji_data_file:
|
||
emoji_data = orjson.loads(emoji_data_file.read())
|
||
|
||
|
||
def get_emoji_code(emoji_dict: dict[str, Any]) -> str:
|
||
# This function is identical with the function with the same name at
|
||
# tools/setup/emoji/emoji_setup_utils.py.
|
||
# This function is unlikely to be changed, unless iamcal changes their data
|
||
# structure.
|
||
emoji_code = emoji_dict.get("non_qualified") or emoji_dict["unified"]
|
||
return emoji_code.lower()
|
||
|
||
|
||
# Build the translation dict from Slack emoji name to codepoint.
|
||
slack_emoji_name_to_codepoint: dict[str, str] = {}
|
||
for emoji_dict in emoji_data:
|
||
short_name = emoji_dict["short_name"]
|
||
emoji_code = get_emoji_code(emoji_dict)
|
||
slack_emoji_name_to_codepoint[short_name] = emoji_code
|
||
for sn in emoji_dict["short_names"]:
|
||
if sn != short_name:
|
||
slack_emoji_name_to_codepoint[sn] = emoji_code
|
||
|
||
|
||
class SlackBotEmail:
|
||
duplicate_email_count: dict[str, int] = {}
|
||
# Mapping of `bot_id` to final email assigned to the bot.
|
||
assigned_email: dict[str, str] = {}
|
||
|
||
@classmethod
|
||
def get_email(cls, user_profile: ZerverFieldsT, domain_name: str) -> str:
|
||
slack_bot_id = user_profile["bot_id"]
|
||
if slack_bot_id in cls.assigned_email:
|
||
return cls.assigned_email[slack_bot_id]
|
||
|
||
if "real_name_normalized" in user_profile:
|
||
slack_bot_name = user_profile["real_name_normalized"]
|
||
elif "first_name" in user_profile:
|
||
slack_bot_name = user_profile["first_name"]
|
||
else:
|
||
raise AssertionError("Could not identify bot type")
|
||
|
||
email = Address(
|
||
username=slack_bot_name.replace("Bot", "").replace(" ", "").lower() + "-bot",
|
||
domain=domain_name,
|
||
).addr_spec
|
||
# The address formed above may not be a valid email format - e.g. containing
|
||
# non-ASCII characters in the local part, if the slack_bot_name contains them.
|
||
# Only Address(addr_spec=...) triggers the necessary validation.
|
||
# Thus we call it here, and if issues are detected, we fall back to forming the
|
||
# email address in a safer way - using the bot id string.
|
||
try:
|
||
Address(addr_spec=email)
|
||
except HeaderDefect:
|
||
email = Address(
|
||
username=slack_bot_id + "-bot",
|
||
domain=domain_name,
|
||
).addr_spec
|
||
|
||
if email in cls.duplicate_email_count:
|
||
cls.duplicate_email_count[email] += 1
|
||
address = Address(addr_spec=email)
|
||
email_username = address.username + "-" + str(cls.duplicate_email_count[email])
|
||
email = Address(username=email_username, domain=address.domain).addr_spec
|
||
else:
|
||
cls.duplicate_email_count[email] = 1
|
||
|
||
cls.assigned_email[slack_bot_id] = email
|
||
return email
|
||
|
||
|
||
def rm_tree(path: str) -> None:
|
||
if os.path.exists(path):
|
||
shutil.rmtree(path)
|
||
|
||
|
||
def slack_workspace_to_realm(
|
||
domain_name: str,
|
||
realm_id: int,
|
||
user_list: list[ZerverFieldsT],
|
||
realm_subdomain: str,
|
||
slack_data_dir: str,
|
||
custom_emoji_list: ZerverFieldsT,
|
||
) -> tuple[
|
||
ZerverFieldsT,
|
||
SlackToZulipUserIDT,
|
||
SlackToZulipRecipientT,
|
||
AddedChannelsT,
|
||
AddedMPIMsT,
|
||
DMMembersT,
|
||
list[ZerverFieldsT],
|
||
ZerverFieldsT,
|
||
]:
|
||
"""
|
||
Returns:
|
||
1. realm, converted realm data
|
||
2. slack_user_id_to_zulip_user_id, which is a dictionary to map from Slack user id to Zulip user id
|
||
3. slack_recipient_name_to_zulip_recipient_id, which is a dictionary to map from Slack recipient
|
||
name(channel names, mpim names, usernames, etc) to Zulip recipient id
|
||
4. added_channels, which is a dictionary to map from channel name to channel id, Zulip stream_id
|
||
5. added_mpims, which is a dictionary to map from MPIM name to MPIM id, Zulip direct_message_group_id
|
||
6. dm_members, which is a dictionary to map from DM id to tuple of DM participants.
|
||
7. avatars, which is list to map avatars to Zulip avatar records.json
|
||
8. emoji_url_map, which is maps emoji name to its Slack URL
|
||
"""
|
||
NOW = float(timezone_now().timestamp())
|
||
|
||
zerver_realm: list[ZerverFieldsT] = build_zerver_realm(realm_id, realm_subdomain, NOW, "Slack")
|
||
realm = build_realm(zerver_realm, realm_id, domain_name, import_source="slack")
|
||
|
||
(
|
||
zerver_userprofile,
|
||
avatars,
|
||
slack_user_id_to_zulip_user_id,
|
||
zerver_customprofilefield,
|
||
zerver_customprofilefield_value,
|
||
) = users_to_zerver_userprofile(slack_data_dir, user_list, realm_id, int(NOW), domain_name)
|
||
(
|
||
realm,
|
||
added_channels,
|
||
added_mpims,
|
||
dm_members,
|
||
slack_recipient_name_to_zulip_recipient_id,
|
||
) = channels_to_zerver_stream(
|
||
slack_data_dir, realm_id, realm, slack_user_id_to_zulip_user_id, zerver_userprofile
|
||
)
|
||
|
||
zerver_realmemoji, emoji_url_map = build_realmemoji(custom_emoji_list, realm_id)
|
||
realm["zerver_realmemoji"] = zerver_realmemoji
|
||
|
||
# See https://zulip.com/help/set-default-channels-for-new-users
|
||
# for documentation on zerver_defaultstream
|
||
realm["zerver_userprofile"] = zerver_userprofile
|
||
|
||
realm["zerver_customprofilefield"] = zerver_customprofilefield
|
||
realm["zerver_customprofilefieldvalue"] = zerver_customprofilefield_value
|
||
|
||
return (
|
||
realm,
|
||
slack_user_id_to_zulip_user_id,
|
||
slack_recipient_name_to_zulip_recipient_id,
|
||
added_channels,
|
||
added_mpims,
|
||
dm_members,
|
||
avatars,
|
||
emoji_url_map,
|
||
)
|
||
|
||
|
||
def build_realmemoji(
|
||
custom_emoji_list: ZerverFieldsT, realm_id: int
|
||
) -> tuple[list[ZerverFieldsT], ZerverFieldsT]:
|
||
zerver_realmemoji = []
|
||
emoji_url_map = {}
|
||
emoji_id = 0
|
||
for emoji_name, url in custom_emoji_list.items():
|
||
split_url = urlsplit(url)
|
||
if split_url.hostname == "emoji.slack-edge.com":
|
||
# Some of the emojis we get from the API have invalid links
|
||
# this is to prevent errors related to them
|
||
content_type = guess_type(posixpath.basename(split_url.path))[0]
|
||
assert content_type is not None
|
||
realmemoji = RealmEmoji(
|
||
name=emoji_name,
|
||
id=emoji_id,
|
||
file_name=get_emoji_file_name(content_type, emoji_id),
|
||
deactivated=False,
|
||
)
|
||
|
||
realmemoji_dict = model_to_dict(realmemoji, exclude=["realm", "author"])
|
||
realmemoji_dict["author"] = None
|
||
realmemoji_dict["realm"] = realm_id
|
||
|
||
emoji_url_map[emoji_name] = url
|
||
zerver_realmemoji.append(realmemoji_dict)
|
||
emoji_id += 1
|
||
return zerver_realmemoji, emoji_url_map
|
||
|
||
|
||
def users_to_zerver_userprofile(
|
||
slack_data_dir: str, users: list[ZerverFieldsT], realm_id: int, timestamp: Any, domain_name: str
|
||
) -> tuple[
|
||
list[ZerverFieldsT],
|
||
list[ZerverFieldsT],
|
||
SlackToZulipUserIDT,
|
||
list[ZerverFieldsT],
|
||
list[ZerverFieldsT],
|
||
]:
|
||
"""
|
||
Returns:
|
||
1. zerver_userprofile, which is a list of user profile
|
||
2. avatar_list, which is list to map avatars to Zulip avatar records.json
|
||
3. slack_user_id_to_zulip_user_id, which is a dictionary to map from Slack user ID to Zulip
|
||
user id
|
||
4. zerver_customprofilefield, which is a list of all custom profile fields
|
||
5. zerver_customprofilefield_values, which is a list of user profile fields
|
||
"""
|
||
logging.info("######### IMPORTING USERS STARTED #########\n")
|
||
zerver_userprofile = []
|
||
zerver_customprofilefield: list[ZerverFieldsT] = []
|
||
zerver_customprofilefield_values: list[ZerverFieldsT] = []
|
||
avatar_list: list[ZerverFieldsT] = []
|
||
slack_user_id_to_zulip_user_id = {}
|
||
|
||
# The user data we get from the Slack API does not contain custom profile data
|
||
# Hence we get it from the Slack zip file
|
||
slack_data_file_user_list = get_data_file(slack_data_dir + "/users.json")
|
||
|
||
slack_user_id_to_custom_profile_fields: ZerverFieldsT = {}
|
||
slack_custom_field_name_to_zulip_custom_field_id: ZerverFieldsT = {}
|
||
|
||
for user in slack_data_file_user_list:
|
||
process_slack_custom_fields(user, slack_user_id_to_custom_profile_fields)
|
||
|
||
# We have only one primary owner in Slack, see link
|
||
# https://get.slack.help/hc/en-us/articles/201912948-Owners-and-Administrators
|
||
# This is to import the primary owner first from all the users
|
||
user_id_count = custom_profile_field_value_id_count = custom_profile_field_id_count = 0
|
||
primary_owner_id = user_id_count
|
||
user_id_count += 1
|
||
|
||
found_emails: dict[str, int] = {}
|
||
for user in users:
|
||
slack_user_id = user["id"]
|
||
|
||
if user.get("is_primary_owner", False):
|
||
user_id = primary_owner_id
|
||
else:
|
||
user_id = user_id_count
|
||
|
||
email = get_user_email(user, domain_name)
|
||
if email.lower() in found_emails:
|
||
slack_user_id_to_zulip_user_id[slack_user_id] = found_emails[email.lower()]
|
||
logging.info("%s: %s MERGED", slack_user_id, email)
|
||
continue
|
||
found_emails[email.lower()] = user_id
|
||
|
||
# ref: https://zulip.com/help/change-your-profile-picture
|
||
avatar_source, avatar_url = build_avatar_url(slack_user_id, user)
|
||
if avatar_source == UserProfile.AVATAR_FROM_USER:
|
||
build_avatar(user_id, realm_id, email, avatar_url, timestamp, avatar_list)
|
||
role = UserProfile.ROLE_MEMBER
|
||
if get_owner(user):
|
||
role = UserProfile.ROLE_REALM_OWNER
|
||
elif get_admin(user):
|
||
role = UserProfile.ROLE_REALM_ADMINISTRATOR
|
||
if get_guest(user):
|
||
role = UserProfile.ROLE_GUEST
|
||
timezone = get_user_timezone(user)
|
||
|
||
if slack_user_id in slack_user_id_to_custom_profile_fields:
|
||
(
|
||
slack_custom_field_name_to_zulip_custom_field_id,
|
||
custom_profile_field_id_count,
|
||
) = build_customprofile_field(
|
||
zerver_customprofilefield,
|
||
slack_user_id_to_custom_profile_fields[slack_user_id],
|
||
custom_profile_field_id_count,
|
||
realm_id,
|
||
slack_custom_field_name_to_zulip_custom_field_id,
|
||
)
|
||
custom_profile_field_value_id_count = build_customprofilefields_values(
|
||
slack_custom_field_name_to_zulip_custom_field_id,
|
||
slack_user_id_to_custom_profile_fields[slack_user_id],
|
||
user_id,
|
||
custom_profile_field_value_id_count,
|
||
zerver_customprofilefield_values,
|
||
)
|
||
|
||
if is_slackbot(user):
|
||
is_bot = True
|
||
else:
|
||
is_bot = user.get("is_bot", False)
|
||
if is_bot:
|
||
bot_type = 1
|
||
else:
|
||
bot_type = None
|
||
|
||
userprofile = UserProfile(
|
||
full_name=get_user_full_name(user),
|
||
is_active=not user.get("deleted", False) and not user["is_mirror_dummy"],
|
||
is_mirror_dummy=user["is_mirror_dummy"],
|
||
id=user_id,
|
||
email=email,
|
||
delivery_email=email,
|
||
avatar_source=avatar_source,
|
||
is_bot=is_bot,
|
||
role=role,
|
||
bot_type=bot_type,
|
||
date_joined=timestamp,
|
||
timezone=timezone,
|
||
last_login=timestamp,
|
||
)
|
||
userprofile_dict = model_to_dict(userprofile)
|
||
# Set realm id separately as the corresponding realm is not yet a Realm model instance
|
||
userprofile_dict["realm"] = realm_id
|
||
|
||
zerver_userprofile.append(userprofile_dict)
|
||
slack_user_id_to_zulip_user_id[slack_user_id] = user_id
|
||
if not user.get("is_primary_owner", False):
|
||
user_id_count += 1
|
||
|
||
logging.info("%s: %s -> %s", slack_user_id, user["name"], userprofile_dict["email"])
|
||
|
||
validate_user_emails_for_import(list(found_emails))
|
||
process_customprofilefields(zerver_customprofilefield, zerver_customprofilefield_values)
|
||
logging.info("######### IMPORTING USERS FINISHED #########\n")
|
||
return (
|
||
zerver_userprofile,
|
||
avatar_list,
|
||
slack_user_id_to_zulip_user_id,
|
||
zerver_customprofilefield,
|
||
zerver_customprofilefield_values,
|
||
)
|
||
|
||
|
||
def build_customprofile_field(
|
||
customprofile_field: list[ZerverFieldsT],
|
||
fields: ZerverFieldsT,
|
||
custom_profile_field_id: int,
|
||
realm_id: int,
|
||
slack_custom_field_name_to_zulip_custom_field_id: ZerverFieldsT,
|
||
) -> tuple[ZerverFieldsT, int]:
|
||
# The name of the custom profile field is not provided in the Slack data
|
||
# Hash keys of the fields are provided
|
||
# Reference: https://api.slack.com/methods/users.profile.set
|
||
for field in fields:
|
||
if field not in slack_custom_field_name_to_zulip_custom_field_id:
|
||
slack_custom_fields = ["phone", "skype"]
|
||
if field in slack_custom_fields:
|
||
field_name = field
|
||
else:
|
||
field_name = f"Slack custom field {custom_profile_field_id + 1}"
|
||
customprofilefield = CustomProfileField(
|
||
id=custom_profile_field_id,
|
||
name=field_name,
|
||
field_type=1, # For now this is defaulted to 'SHORT_TEXT'
|
||
# Processing is done in the function 'process_customprofilefields'
|
||
)
|
||
|
||
customprofilefield_dict = model_to_dict(customprofilefield, exclude=["realm"])
|
||
customprofilefield_dict["realm"] = realm_id
|
||
|
||
slack_custom_field_name_to_zulip_custom_field_id[field] = custom_profile_field_id
|
||
custom_profile_field_id += 1
|
||
customprofile_field.append(customprofilefield_dict)
|
||
return slack_custom_field_name_to_zulip_custom_field_id, custom_profile_field_id
|
||
|
||
|
||
def process_slack_custom_fields(
|
||
user: ZerverFieldsT, slack_user_id_to_custom_profile_fields: ZerverFieldsT
|
||
) -> None:
|
||
slack_user_id_to_custom_profile_fields[user["id"]] = {}
|
||
if user["profile"].get("fields"):
|
||
slack_user_id_to_custom_profile_fields[user["id"]] = user["profile"]["fields"]
|
||
|
||
slack_custom_fields = ["phone", "skype"]
|
||
for field in slack_custom_fields:
|
||
if field in user["profile"]:
|
||
slack_user_id_to_custom_profile_fields[user["id"]][field] = {
|
||
"value": user["profile"][field]
|
||
}
|
||
|
||
|
||
def build_customprofilefields_values(
|
||
slack_custom_field_name_to_zulip_custom_field_id: ZerverFieldsT,
|
||
fields: ZerverFieldsT,
|
||
user_id: int,
|
||
custom_field_id: int,
|
||
custom_field_values: list[ZerverFieldsT],
|
||
) -> int:
|
||
for field, value in fields.items():
|
||
if value["value"] == "":
|
||
continue
|
||
custom_field_value = CustomProfileFieldValue(id=custom_field_id, value=value["value"])
|
||
|
||
custom_field_value_dict = model_to_dict(
|
||
custom_field_value, exclude=["user_profile", "field"]
|
||
)
|
||
custom_field_value_dict["user_profile"] = user_id
|
||
custom_field_value_dict["field"] = slack_custom_field_name_to_zulip_custom_field_id[field]
|
||
|
||
custom_field_values.append(custom_field_value_dict)
|
||
custom_field_id += 1
|
||
return custom_field_id
|
||
|
||
|
||
def process_customprofilefields(
|
||
customprofilefield: list[ZerverFieldsT], customprofilefield_value: list[ZerverFieldsT]
|
||
) -> None:
|
||
for field in customprofilefield:
|
||
for field_value in customprofilefield_value:
|
||
if field_value["field"] == field["id"] and len(field_value["value"]) > 50:
|
||
field["field_type"] = 2 # corresponding to Long text
|
||
break
|
||
|
||
|
||
def is_slackbot(user: ZerverFieldsT) -> bool:
|
||
return get_user_full_name(user).lower() == "slackbot"
|
||
|
||
|
||
def get_user_email(user: ZerverFieldsT, domain_name: str) -> str:
|
||
if "email" in user["profile"]:
|
||
return user["profile"]["email"]
|
||
if user["is_mirror_dummy"]:
|
||
return Address(username=user["name"], domain=f"{user['team_domain']}.slack.com").addr_spec
|
||
if "bot_id" in user["profile"]:
|
||
return SlackBotEmail.get_email(user["profile"], domain_name)
|
||
if is_slackbot(user):
|
||
return Address(username="imported-slackbot-bot", domain=domain_name).addr_spec
|
||
raise AssertionError(f"Could not find email address for Slack user {user}")
|
||
|
||
|
||
def build_avatar_url(slack_user_id: str, user: ZerverFieldsT) -> tuple[str, str]:
|
||
avatar_url: str = ""
|
||
avatar_source = UserProfile.AVATAR_FROM_GRAVATAR
|
||
if user["profile"].get("avatar_hash"):
|
||
# Process avatar image for a typical Slack user.
|
||
team_id = user["team_id"]
|
||
avatar_hash = user["profile"]["avatar_hash"]
|
||
avatar_url = f"https://ca.slack-edge.com/{team_id}-{slack_user_id}-{avatar_hash}"
|
||
avatar_source = UserProfile.AVATAR_FROM_USER
|
||
elif user.get("is_integration_bot") and "image_72" in user["profile"]:
|
||
# Unlike other Slack user types, Slacks integration bot avatar URL ends with
|
||
# a file type extension (.png, in this case).
|
||
# e.g https://avatars.slack-edge.com/2024-05-01/7218497908_deb94eac4c_512.png
|
||
avatar_url = user["profile"]["image_72"]
|
||
content_type = guess_type(avatar_url)[0]
|
||
if content_type not in THUMBNAIL_ACCEPT_IMAGE_TYPES:
|
||
logging.info(
|
||
"Unsupported avatar type (%s) for user -> %s\n", content_type, user.get("name")
|
||
)
|
||
avatar_source = UserProfile.AVATAR_FROM_GRAVATAR
|
||
else:
|
||
avatar_source = UserProfile.AVATAR_FROM_USER
|
||
else:
|
||
logging.info("Failed to process avatar for user -> %s\n", user.get("name"))
|
||
return avatar_source, avatar_url
|
||
|
||
|
||
def get_owner(user: ZerverFieldsT) -> bool:
|
||
owner = user.get("is_owner", False)
|
||
primary_owner = user.get("is_primary_owner", False)
|
||
|
||
return primary_owner or owner
|
||
|
||
|
||
def get_admin(user: ZerverFieldsT) -> bool:
|
||
admin = user.get("is_admin", False)
|
||
return admin
|
||
|
||
|
||
def get_guest(user: ZerverFieldsT) -> bool:
|
||
restricted_user = user.get("is_restricted", False)
|
||
ultra_restricted_user = user.get("is_ultra_restricted", False)
|
||
|
||
# Slack's Single channel and multi channel guests both have
|
||
# is_restricted set to True. So assuming Slack doesn't change their
|
||
# data model, it would also be correct to just check whether
|
||
# is_restricted is set to True.
|
||
return restricted_user or ultra_restricted_user
|
||
|
||
|
||
def get_user_timezone(user: ZerverFieldsT) -> str:
|
||
_default_timezone = "America/New_York"
|
||
timezone = user.get("tz", _default_timezone)
|
||
if timezone is None or "/" not in timezone:
|
||
timezone = _default_timezone
|
||
return timezone
|
||
|
||
|
||
SLACK_DEFAULT_ANNOUNCEMENTS_CHANNEL_NAME = "general"
|
||
|
||
|
||
def channels_to_zerver_stream(
|
||
slack_data_dir: str,
|
||
realm_id: int,
|
||
realm: dict[str, Any],
|
||
slack_user_id_to_zulip_user_id: SlackToZulipUserIDT,
|
||
zerver_userprofile: list[ZerverFieldsT],
|
||
) -> tuple[
|
||
dict[str, list[ZerverFieldsT]], AddedChannelsT, AddedMPIMsT, DMMembersT, SlackToZulipRecipientT
|
||
]:
|
||
"""
|
||
Returns:
|
||
1. realm, converted realm data
|
||
2. added_channels, which is a dictionary to map from channel name to channel id, Zulip stream_id
|
||
3. added_mpims, which is a dictionary to map from MPIM(multiparty IM) name to MPIM id, Zulip
|
||
direct_message_group_id
|
||
4. dm_members, which is a dictionary to map from DM id to tuple of DM participants.
|
||
5. slack_recipient_name_to_zulip_recipient_id, which is a dictionary to map from Slack recipient
|
||
name(channel names, mpim names, usernames etc) to Zulip recipient_id
|
||
"""
|
||
logging.info("######### IMPORTING CHANNELS STARTED #########\n")
|
||
|
||
zerver_realm = realm["zerver_realm"]
|
||
|
||
added_channels = {}
|
||
added_mpims = {}
|
||
dm_members = {}
|
||
slack_recipient_name_to_zulip_recipient_id = {}
|
||
|
||
realm["zerver_stream"] = []
|
||
realm["zerver_huddle"] = []
|
||
realm["zerver_subscription"] = []
|
||
realm["zerver_recipient"] = []
|
||
realm["zerver_defaultstream"] = []
|
||
|
||
subscription_id_count = recipient_id_count = 0
|
||
stream_id_count = defaultstream_id = 0
|
||
direct_message_group_id_count = 0
|
||
|
||
def process_channels(channels: list[dict[str, Any]], invite_only: bool = False) -> None:
|
||
nonlocal stream_id_count, recipient_id_count, defaultstream_id, subscription_id_count
|
||
|
||
for channel in channels:
|
||
# map Slack's topic and purpose content into Zulip's stream description.
|
||
# WARN This mapping is lossy since the topic.creator, topic.last_set,
|
||
# purpose.creator, purpose.last_set fields are not preserved.
|
||
description = channel["purpose"]["value"]
|
||
stream_id = stream_id_count
|
||
recipient_id = recipient_id_count
|
||
|
||
stream = build_stream(
|
||
float(channel["created"]),
|
||
realm_id,
|
||
channel["name"],
|
||
description,
|
||
stream_id,
|
||
channel["is_archived"],
|
||
invite_only,
|
||
)
|
||
realm["zerver_stream"].append(stream)
|
||
|
||
slack_default_channels = ["general", "random"]
|
||
if channel["name"] in slack_default_channels and not stream["deactivated"]:
|
||
defaultstream = build_defaultstream(realm_id, stream_id, defaultstream_id)
|
||
realm["zerver_defaultstream"].append(defaultstream)
|
||
defaultstream_id += 1
|
||
|
||
added_channels[stream["name"]] = (channel["id"], stream_id)
|
||
|
||
recipient = build_recipient(stream_id, recipient_id, Recipient.STREAM)
|
||
realm["zerver_recipient"].append(recipient)
|
||
slack_recipient_name_to_zulip_recipient_id[stream["name"]] = recipient_id
|
||
|
||
subscription_id_count = get_subscription(
|
||
channel["members"],
|
||
realm["zerver_subscription"],
|
||
recipient_id,
|
||
slack_user_id_to_zulip_user_id,
|
||
subscription_id_count,
|
||
)
|
||
|
||
stream_id_count += 1
|
||
recipient_id_count += 1
|
||
logging.info("%s -> created", channel["name"])
|
||
|
||
if channel["name"] == SLACK_DEFAULT_ANNOUNCEMENTS_CHANNEL_NAME:
|
||
zerver_realm[0]["new_stream_announcements_stream"] = stream["id"]
|
||
zerver_realm[0]["zulip_update_announcements_stream"] = stream["id"]
|
||
logging.info(
|
||
"Using the channel %s as default announcements channel.", channel["name"]
|
||
)
|
||
|
||
# TODO map Slack's pins to Zulip's stars
|
||
# There is the security model that Slack's pins are known to the team owner
|
||
# as evident from where it is stored at (channels)
|
||
# "pins": [
|
||
# {
|
||
# "id": "1444755381.000003",
|
||
# "type": "C",
|
||
# "user": "U061A5N1G",
|
||
# "owner": "U061A5N1G",
|
||
# "created": "1444755463"
|
||
# }
|
||
# ],
|
||
|
||
public_channels = get_data_file(slack_data_dir + "/channels.json")
|
||
process_channels(public_channels)
|
||
|
||
try:
|
||
private_channels = get_data_file(slack_data_dir + "/groups.json")
|
||
except FileNotFoundError:
|
||
private_channels = []
|
||
process_channels(private_channels, True)
|
||
|
||
# mpim is the Slack equivalent of direct message group.
|
||
def process_mpims(mpims: list[dict[str, Any]]) -> None:
|
||
nonlocal direct_message_group_id_count, recipient_id_count, subscription_id_count
|
||
|
||
for mpim in mpims:
|
||
direct_message_group = build_direct_message_group(
|
||
direct_message_group_id_count, len(mpim["members"])
|
||
)
|
||
realm["zerver_huddle"].append(direct_message_group)
|
||
|
||
added_mpims[mpim["name"]] = (mpim["id"], direct_message_group_id_count)
|
||
|
||
recipient = build_recipient(
|
||
direct_message_group_id_count, recipient_id_count, Recipient.DIRECT_MESSAGE_GROUP
|
||
)
|
||
realm["zerver_recipient"].append(recipient)
|
||
slack_recipient_name_to_zulip_recipient_id[mpim["name"]] = recipient_id_count
|
||
|
||
subscription_id_count = get_subscription(
|
||
mpim["members"],
|
||
realm["zerver_subscription"],
|
||
recipient_id_count,
|
||
slack_user_id_to_zulip_user_id,
|
||
subscription_id_count,
|
||
)
|
||
|
||
direct_message_group_id_count += 1
|
||
recipient_id_count += 1
|
||
logging.info("%s -> created", mpim["name"])
|
||
|
||
try:
|
||
mpims = get_data_file(slack_data_dir + "/mpims.json")
|
||
except FileNotFoundError:
|
||
mpims = []
|
||
process_mpims(mpims)
|
||
|
||
# This may have duplicated zulip user_ids, since we merge multiple
|
||
# Slack same-email shared-channel users into one Zulip dummy user
|
||
zulip_user_to_recipient: dict[int, int] = {}
|
||
for slack_user_id, zulip_user_id in slack_user_id_to_zulip_user_id.items():
|
||
if zulip_user_id in zulip_user_to_recipient:
|
||
slack_recipient_name_to_zulip_recipient_id[slack_user_id] = zulip_user_to_recipient[
|
||
zulip_user_id
|
||
]
|
||
continue
|
||
recipient = build_recipient(zulip_user_id, recipient_id_count, Recipient.PERSONAL)
|
||
slack_recipient_name_to_zulip_recipient_id[slack_user_id] = recipient_id_count
|
||
zulip_user_to_recipient[zulip_user_id] = recipient_id_count
|
||
sub = build_subscription(recipient_id_count, zulip_user_id, subscription_id_count)
|
||
realm["zerver_recipient"].append(recipient)
|
||
realm["zerver_subscription"].append(sub)
|
||
recipient_id_count += 1
|
||
subscription_id_count += 1
|
||
|
||
def process_dms(dms: list[dict[str, Any]]) -> None:
|
||
for dm in dms:
|
||
user_a = dm["members"][0]
|
||
user_b = dm["members"][1]
|
||
dm_members[dm["id"]] = (user_a, user_b)
|
||
|
||
try:
|
||
dms = get_data_file(slack_data_dir + "/dms.json")
|
||
except FileNotFoundError:
|
||
dms = []
|
||
process_dms(dms)
|
||
|
||
logging.info("######### IMPORTING STREAMS FINISHED #########\n")
|
||
return (
|
||
realm,
|
||
added_channels,
|
||
added_mpims,
|
||
dm_members,
|
||
slack_recipient_name_to_zulip_recipient_id,
|
||
)
|
||
|
||
|
||
def get_subscription(
|
||
channel_members: list[str],
|
||
zerver_subscription: list[ZerverFieldsT],
|
||
recipient_id: int,
|
||
slack_user_id_to_zulip_user_id: SlackToZulipUserIDT,
|
||
subscription_id: int,
|
||
) -> int:
|
||
processed_zulip_user_ids = set()
|
||
for slack_user_id in channel_members:
|
||
zulip_user_id = slack_user_id_to_zulip_user_id[slack_user_id]
|
||
if zulip_user_id in processed_zulip_user_ids:
|
||
# Multiple slack user ids can map to the same Zulip user id,
|
||
# due to merging of accounts which share the same email address.
|
||
# We don't want to create duplicate subscriptions for a user,
|
||
# so if we've already seen this zulip_user_id, we skip ahead.
|
||
continue
|
||
|
||
processed_zulip_user_ids.add(zulip_user_id)
|
||
sub = build_subscription(recipient_id, zulip_user_id, subscription_id)
|
||
zerver_subscription.append(sub)
|
||
subscription_id += 1
|
||
return subscription_id
|
||
|
||
|
||
def process_long_term_idle_users(
|
||
slack_data_dir: str,
|
||
users: list[ZerverFieldsT],
|
||
slack_user_id_to_zulip_user_id: SlackToZulipUserIDT,
|
||
added_channels: AddedChannelsT,
|
||
added_mpims: AddedMPIMsT,
|
||
dm_members: DMMembersT,
|
||
zerver_userprofile: list[ZerverFieldsT],
|
||
) -> set[int]:
|
||
return long_term_idle_helper(
|
||
get_messages_iterator(slack_data_dir, added_channels, added_mpims, dm_members),
|
||
get_message_sending_user,
|
||
get_timestamp_from_message,
|
||
lambda id: slack_user_id_to_zulip_user_id[id],
|
||
iter(user["id"] for user in users),
|
||
zerver_userprofile,
|
||
)
|
||
|
||
|
||
def convert_slack_workspace_messages(
|
||
slack_data_dir: str,
|
||
users: list[ZerverFieldsT],
|
||
realm_id: int,
|
||
slack_user_id_to_zulip_user_id: SlackToZulipUserIDT,
|
||
slack_recipient_name_to_zulip_recipient_id: SlackToZulipRecipientT,
|
||
added_channels: AddedChannelsT,
|
||
added_mpims: AddedMPIMsT,
|
||
dm_members: DMMembersT,
|
||
realm: ZerverFieldsT,
|
||
zerver_userprofile: list[ZerverFieldsT],
|
||
zerver_realmemoji: list[ZerverFieldsT],
|
||
domain_name: str,
|
||
output_dir: str,
|
||
convert_slack_threads: bool,
|
||
chunk_size: int = MESSAGE_BATCH_CHUNK_SIZE,
|
||
) -> tuple[list[ZerverFieldsT], list[ZerverFieldsT], list[ZerverFieldsT]]:
|
||
"""
|
||
Returns:
|
||
1. reactions, which is a list of the reactions
|
||
2. uploads, which is a list of uploads to be mapped in uploads records.json
|
||
3. attachment, which is a list of the attachments
|
||
"""
|
||
|
||
long_term_idle = process_long_term_idle_users(
|
||
slack_data_dir,
|
||
users,
|
||
slack_user_id_to_zulip_user_id,
|
||
added_channels,
|
||
added_mpims,
|
||
dm_members,
|
||
zerver_userprofile,
|
||
)
|
||
|
||
all_messages = get_messages_iterator(slack_data_dir, added_channels, added_mpims, dm_members)
|
||
logging.info("######### IMPORTING MESSAGES STARTED #########\n")
|
||
|
||
total_reactions: list[ZerverFieldsT] = []
|
||
total_attachments: list[ZerverFieldsT] = []
|
||
total_uploads: list[ZerverFieldsT] = []
|
||
|
||
dump_file_id = 1
|
||
|
||
subscriber_map = make_subscriber_map(
|
||
zerver_subscription=realm["zerver_subscription"],
|
||
)
|
||
|
||
while message_data := list(itertools.islice(all_messages, chunk_size)):
|
||
(
|
||
zerver_message,
|
||
zerver_usermessage,
|
||
attachment,
|
||
uploads,
|
||
reactions,
|
||
) = channel_message_to_zerver_message(
|
||
realm_id,
|
||
users,
|
||
slack_user_id_to_zulip_user_id,
|
||
slack_recipient_name_to_zulip_recipient_id,
|
||
message_data,
|
||
zerver_realmemoji,
|
||
subscriber_map,
|
||
added_channels,
|
||
dm_members,
|
||
domain_name,
|
||
long_term_idle,
|
||
convert_slack_threads,
|
||
)
|
||
|
||
message_json = dict(zerver_message=zerver_message, zerver_usermessage=zerver_usermessage)
|
||
|
||
message_file = f"/messages-{dump_file_id:06}.json"
|
||
logging.info("Writing messages to %s\n", output_dir + message_file)
|
||
create_converted_data_files(message_json, output_dir, message_file)
|
||
|
||
total_reactions += reactions
|
||
total_attachments += attachment
|
||
total_uploads += uploads
|
||
|
||
dump_file_id += 1
|
||
|
||
logging.info("######### IMPORTING MESSAGES FINISHED #########\n")
|
||
return total_reactions, total_uploads, total_attachments
|
||
|
||
|
||
def get_messages_iterator(
|
||
slack_data_dir: str,
|
||
added_channels: dict[str, Any],
|
||
added_mpims: AddedMPIMsT,
|
||
dm_members: DMMembersT,
|
||
) -> Iterator[ZerverFieldsT]:
|
||
"""This function is an iterator that returns all the messages across
|
||
all Slack channels, in order by timestamp. It's important to
|
||
not read all the messages into memory at once, because for
|
||
large imports that can OOM kill."""
|
||
|
||
dir_names = [*added_channels, *added_mpims, *dm_members]
|
||
all_json_names: dict[str, list[str]] = defaultdict(list)
|
||
for dir_name in dir_names:
|
||
dir_path = os.path.join(slack_data_dir, dir_name)
|
||
json_names = os.listdir(dir_path)
|
||
for json_name in json_names:
|
||
if json_name.endswith(".json"):
|
||
all_json_names[json_name].append(dir_path)
|
||
|
||
# Sort json_name by date
|
||
for json_name in sorted(all_json_names.keys()):
|
||
messages_for_one_day: list[ZerverFieldsT] = []
|
||
for dir_path in all_json_names[json_name]:
|
||
message_dir = os.path.join(dir_path, json_name)
|
||
dir_name = os.path.basename(dir_path)
|
||
messages = []
|
||
for message in get_data_file(message_dir):
|
||
if message.get("user") == "U00":
|
||
# Skip messages involving the "U00" user,
|
||
# which is apparently used in some channel rename
|
||
# messages. It's likely just the result of some
|
||
# bug in Slack's export system. Arguably we could
|
||
# change this to point to slackbot instead, but
|
||
# skipping those messages is simpler.
|
||
continue
|
||
if message.get("mimetype") == "application/vnd.slack-docs":
|
||
# This is a Slack "Post" which is HTML-formatted,
|
||
# and we don't have a clean way to import at the
|
||
# moment. We skip them on import.
|
||
continue
|
||
if dir_name in added_channels:
|
||
message["channel_name"] = dir_name
|
||
elif dir_name in added_mpims:
|
||
message["mpim_name"] = dir_name
|
||
elif dir_name in dm_members:
|
||
message["pm_name"] = dir_name
|
||
messages.append(message)
|
||
messages_for_one_day += messages
|
||
|
||
# we sort the messages according to the timestamp to show messages with
|
||
# the proper date order
|
||
yield from sorted(messages_for_one_day, key=get_timestamp_from_message)
|
||
|
||
|
||
# This is cached globally so that thread parent lookup works across multiple calls to
|
||
# channel_message_to_zerver_message, and across multiple message JSON files (e.g.
|
||
# for responses posted on a date after the thread root was created).
|
||
# The keys for this map are thread_ts values (timestamps) - as that's what appears to
|
||
# be the most sensible "thread identifier" for our purposes; Slack doesn't provide
|
||
# a thread ID.
|
||
thread_parent_map: dict[str, str] = {}
|
||
|
||
|
||
def get_parent_user_id_from_thread_message(thread_message: ZerverFieldsT, subtype: str) -> str:
|
||
"""
|
||
This retrieves the user id of the sender of the original thread
|
||
message.
|
||
"""
|
||
|
||
# Some messages posted by bots don't have a user key, but only a bot_id (namely, ones with
|
||
# subtype bot_message). For those, use bot_id as fallback when the user field doesn't exist.
|
||
try:
|
||
if subtype == "thread_broadcast":
|
||
try:
|
||
return thread_message["root"]["user"]
|
||
except KeyError:
|
||
return thread_message["root"]["bot_id"]
|
||
elif thread_message["thread_ts"] == thread_message["ts"]:
|
||
# This is the original thread message. We're following the logic recommended
|
||
# in Slack's documentation here:
|
||
# https://docs.slack.dev/messaging/retrieving-messages/#finding_threads
|
||
# - Identify parent messages by comparing the thread_ts and ts values. If they are equal,
|
||
# the message is a parent message.
|
||
# - Threaded replies are also identified by comparing the thread_ts and ts values.
|
||
# If they are different, the message is a reply.
|
||
try:
|
||
ret = thread_message["user"]
|
||
except KeyError:
|
||
ret = thread_message["bot_id"]
|
||
# Cache the thread parent's user/bot ID for later use. This will allow us to determine
|
||
# the parent user id for thread replies.
|
||
thread_parent_map[thread_message["thread_ts"]] = ret
|
||
return ret
|
||
else:
|
||
try:
|
||
return thread_message["parent_user_id"]
|
||
except KeyError:
|
||
return thread_message["bot_id"]
|
||
except KeyError:
|
||
# If Slack doesn't specify the parent user/bot ID in this message, use the cached one.
|
||
#
|
||
# TODO: Our caching strategy works under the assumption that we visit thread messages
|
||
# in the order of oldest-to-newest - so that we see the thread's parent message before
|
||
# thread replies. If messages are unsorted, we might process a
|
||
# reply before its parent, resulting in KeyError because the parent’s user ID hasn’t been cached yet.
|
||
return thread_parent_map[thread_message["thread_ts"]]
|
||
|
||
|
||
def get_zulip_thread_topic_name(
|
||
message_content: str, thread_ts: datetime, thread_counter: dict[str, int]
|
||
) -> str:
|
||
"""
|
||
The topic name format is date + message snippet + counter.
|
||
|
||
e.g "2024-05-22 Hello this is a long message that will be c… (1)"
|
||
"""
|
||
thread_date = thread_ts.strftime(r"%Y-%m-%d")
|
||
|
||
# Truncate
|
||
truncated_zulip_topic_name = truncate_content(
|
||
f"{thread_date} {message_content}".strip(), MAX_TOPIC_NAME_LENGTH, "…"
|
||
)
|
||
collision = thread_counter[truncated_zulip_topic_name]
|
||
thread_counter[truncated_zulip_topic_name] += 1
|
||
count = (f" ({collision + 1})") if collision > 0 else ""
|
||
|
||
# Important: The count is at the end, after …, so we need to
|
||
# subtract its length when doing truncation.
|
||
final_topic_name = (
|
||
truncate_content(
|
||
f"{thread_date} {message_content}".strip(), MAX_TOPIC_NAME_LENGTH - len(f"{count}"), "…"
|
||
)
|
||
+ f"{count}"
|
||
)
|
||
return final_topic_name
|
||
|
||
|
||
def channel_message_to_zerver_message(
|
||
realm_id: int,
|
||
users: list[ZerverFieldsT],
|
||
slack_user_id_to_zulip_user_id: SlackToZulipUserIDT,
|
||
slack_recipient_name_to_zulip_recipient_id: SlackToZulipRecipientT,
|
||
all_messages: list[ZerverFieldsT],
|
||
zerver_realmemoji: list[ZerverFieldsT],
|
||
subscriber_map: dict[int, set[int]],
|
||
added_channels: AddedChannelsT,
|
||
dm_members: DMMembersT,
|
||
domain_name: str,
|
||
long_term_idle: set[int],
|
||
convert_slack_threads: bool,
|
||
) -> tuple[
|
||
list[ZerverFieldsT],
|
||
list[ZerverFieldsT],
|
||
list[ZerverFieldsT],
|
||
list[ZerverFieldsT],
|
||
list[ZerverFieldsT],
|
||
]:
|
||
"""
|
||
Returns:
|
||
1. zerver_message, which is a list of the messages
|
||
2. zerver_usermessage, which is a list of the usermessages
|
||
3. zerver_attachment, which is a list of the attachments
|
||
4. uploads_list, which is a list of uploads to be mapped in uploads records.json
|
||
5. reaction_list, which is a list of all user reactions
|
||
"""
|
||
zerver_message = []
|
||
zerver_usermessage: list[ZerverFieldsT] = []
|
||
uploads_list: list[ZerverFieldsT] = []
|
||
zerver_attachment: list[ZerverFieldsT] = []
|
||
reaction_list: list[ZerverFieldsT] = []
|
||
|
||
total_user_messages = 0
|
||
total_skipped_user_messages = 0
|
||
thread_counter: dict[str, int] = defaultdict(int)
|
||
thread_map: dict[str, str] = {}
|
||
for message in all_messages:
|
||
slack_user_id = get_message_sending_user(message)
|
||
if not slack_user_id:
|
||
# Ignore messages without slack_user_id
|
||
# These are Sometimes produced by Slack
|
||
continue
|
||
|
||
subtype = message.get("subtype", False)
|
||
if subtype in [
|
||
# Zulip doesn't have a pinned_item concept
|
||
"pinned_item",
|
||
"unpinned_item",
|
||
# Slack's channel join/leave notices are spammy
|
||
"channel_join",
|
||
"channel_leave",
|
||
"channel_name",
|
||
]:
|
||
continue
|
||
|
||
formatted_block = process_slack_block_and_attachment(message)
|
||
|
||
# Leave it as is if formatted_block is an empty string, it's likely
|
||
# one of the unhandled_types.
|
||
if formatted_block != "":
|
||
# For most cases, the value of message["text"] will be just an
|
||
# empty string.
|
||
message["text"] = formatted_block
|
||
|
||
try:
|
||
content, mentioned_user_ids, has_link = convert_to_zulip_markdown(
|
||
message["text"], users, added_channels, slack_user_id_to_zulip_user_id
|
||
)
|
||
except Exception:
|
||
print("Slack message unexpectedly missing text representation:")
|
||
print(orjson.dumps(message, option=orjson.OPT_INDENT_2).decode())
|
||
continue
|
||
rendered_content = None
|
||
|
||
if "channel_name" in message:
|
||
is_direct_message_type = False
|
||
recipient_id = slack_recipient_name_to_zulip_recipient_id[message["channel_name"]]
|
||
elif "mpim_name" in message:
|
||
is_direct_message_type = True
|
||
recipient_id = slack_recipient_name_to_zulip_recipient_id[message["mpim_name"]]
|
||
elif "pm_name" in message:
|
||
is_direct_message_type = True
|
||
sender = get_message_sending_user(message)
|
||
members = dm_members[message["pm_name"]]
|
||
if sender == members[0]:
|
||
recipient_id = slack_recipient_name_to_zulip_recipient_id[members[1]]
|
||
sender_recipient_id = slack_recipient_name_to_zulip_recipient_id[members[0]]
|
||
else:
|
||
recipient_id = slack_recipient_name_to_zulip_recipient_id[members[0]]
|
||
sender_recipient_id = slack_recipient_name_to_zulip_recipient_id[members[1]]
|
||
|
||
message_id = NEXT_ID("message")
|
||
|
||
if "reactions" in message:
|
||
build_reactions(
|
||
reaction_list,
|
||
message["reactions"],
|
||
slack_user_id_to_zulip_user_id,
|
||
message_id,
|
||
zerver_realmemoji,
|
||
)
|
||
|
||
# Process different subtypes of slack messages
|
||
|
||
# Subtypes which have only the action in the message should
|
||
# be rendered with '/me' in the content initially
|
||
# For example "sh_room_created" has the message 'started a call'
|
||
# which should be displayed as '/me started a call'
|
||
if subtype in ["bot_add", "sh_room_created", "me_message"]:
|
||
content = f"/me {content}"
|
||
if subtype == "file_comment":
|
||
# The file_comment message type only indicates the
|
||
# responsible user in a subfield.
|
||
message["user"] = message["comment"]["user"]
|
||
|
||
file_info = process_message_files(
|
||
message=message,
|
||
domain_name=domain_name,
|
||
realm_id=realm_id,
|
||
message_id=message_id,
|
||
slack_user_id=slack_user_id,
|
||
users=users,
|
||
slack_user_id_to_zulip_user_id=slack_user_id_to_zulip_user_id,
|
||
zerver_attachment=zerver_attachment,
|
||
uploads_list=uploads_list,
|
||
)
|
||
|
||
content = "\n".join([part for part in [content, file_info["content"]] if part != ""])
|
||
has_link = has_link or file_info["has_link"]
|
||
|
||
has_attachment = file_info["has_attachment"]
|
||
has_image = file_info["has_image"]
|
||
|
||
# Slack's unthreaded messages go into a single topic, while
|
||
# threads each generate a unique topic labeled by the date,
|
||
# a snippet of the original message and a counter if there
|
||
# are any thread with the same topic name
|
||
topic_name = "imported from Slack"
|
||
if convert_slack_threads and not is_direct_message_type and "thread_ts" in message:
|
||
thread_ts = datetime.fromtimestamp(float(message["thread_ts"]), tz=timezone.utc)
|
||
thread_ts_str = thread_ts.strftime(r"%Y/%m/%d %H:%M:%S")
|
||
parent_user_id = get_parent_user_id_from_thread_message(message, subtype)
|
||
thread_key = f"{thread_ts_str}-{parent_user_id}"
|
||
|
||
if thread_key in thread_map:
|
||
topic_name = thread_map[thread_key]
|
||
else:
|
||
topic_name = get_zulip_thread_topic_name(content, thread_ts, thread_counter)
|
||
thread_map[thread_key] = topic_name
|
||
|
||
if is_direct_message_type:
|
||
topic_name = ""
|
||
|
||
zulip_message = build_message(
|
||
topic_name=topic_name,
|
||
date_sent=get_timestamp_from_message(message),
|
||
message_id=message_id,
|
||
content=content,
|
||
rendered_content=rendered_content,
|
||
user_id=slack_user_id_to_zulip_user_id[slack_user_id],
|
||
recipient_id=recipient_id,
|
||
realm_id=realm_id,
|
||
is_channel_message=not is_direct_message_type,
|
||
has_image=has_image,
|
||
has_link=has_link,
|
||
has_attachment=has_attachment,
|
||
is_direct_message_type=is_direct_message_type,
|
||
)
|
||
zerver_message.append(zulip_message)
|
||
|
||
(num_created, num_skipped) = build_usermessages(
|
||
zerver_usermessage=zerver_usermessage,
|
||
subscriber_map=subscriber_map,
|
||
recipient_id=recipient_id,
|
||
mentioned_user_ids=mentioned_user_ids,
|
||
message_id=message_id,
|
||
is_private=is_direct_message_type,
|
||
long_term_idle=long_term_idle,
|
||
)
|
||
total_user_messages += num_created
|
||
total_skipped_user_messages += num_skipped
|
||
|
||
if "pm_name" in message and recipient_id != sender_recipient_id:
|
||
(num_created, num_skipped) = build_usermessages(
|
||
zerver_usermessage=zerver_usermessage,
|
||
subscriber_map=subscriber_map,
|
||
recipient_id=sender_recipient_id,
|
||
mentioned_user_ids=mentioned_user_ids,
|
||
message_id=message_id,
|
||
is_private=is_direct_message_type,
|
||
long_term_idle=long_term_idle,
|
||
)
|
||
total_user_messages += num_created
|
||
total_skipped_user_messages += num_skipped
|
||
|
||
logging.debug(
|
||
"Created %s UserMessages; deferred %s due to long-term idle",
|
||
total_user_messages,
|
||
total_skipped_user_messages,
|
||
)
|
||
return zerver_message, zerver_usermessage, zerver_attachment, uploads_list, reaction_list
|
||
|
||
|
||
def process_message_files(
|
||
message: ZerverFieldsT,
|
||
domain_name: str,
|
||
realm_id: int,
|
||
message_id: int,
|
||
slack_user_id: str,
|
||
users: list[ZerverFieldsT],
|
||
slack_user_id_to_zulip_user_id: SlackToZulipUserIDT,
|
||
zerver_attachment: list[ZerverFieldsT],
|
||
uploads_list: list[ZerverFieldsT],
|
||
) -> dict[str, Any]:
|
||
has_attachment = False
|
||
has_image = False
|
||
has_link = False
|
||
|
||
files = message.get("files", [])
|
||
|
||
subtype = message.get("subtype")
|
||
|
||
if subtype == "file_share":
|
||
# In Slack messages, uploads can either have the subtype as 'file_share' or
|
||
# have the upload information in 'files' keyword
|
||
files = [message["file"]]
|
||
|
||
markdown_links = []
|
||
|
||
for fileinfo in files:
|
||
if fileinfo.get("mode", "") in ["tombstone", "hidden_by_limit"]:
|
||
# Slack sometimes includes tombstone mode files with no
|
||
# real data on the actual file (presumably in cases where
|
||
# the file was deleted). hidden_by_limit mode is for files
|
||
# that are hidden because of 10k cap in free plan.
|
||
continue
|
||
|
||
if fileinfo.get("file_access", "") in ["access_denied", "file_not_found"]:
|
||
# Slack sometimes includes file stubs for files it declares
|
||
# inaccessible and does not further reference.
|
||
continue
|
||
|
||
url = fileinfo["url_private"]
|
||
split_url = urlsplit(url)
|
||
|
||
if split_url.hostname == "files.slack.com":
|
||
# For attachments with Slack download link
|
||
has_attachment = True
|
||
has_link = True
|
||
has_image = "image" in fileinfo["mimetype"]
|
||
|
||
file_user = [
|
||
iterate_user for iterate_user in users if message["user"] == iterate_user["id"]
|
||
]
|
||
file_user_email = get_user_email(file_user[0], domain_name)
|
||
|
||
s3_path, content_for_link = get_attachment_path_and_content(fileinfo, realm_id)
|
||
markdown_links.append(content_for_link)
|
||
|
||
build_uploads(
|
||
slack_user_id_to_zulip_user_id[slack_user_id],
|
||
realm_id,
|
||
file_user_email,
|
||
fileinfo,
|
||
s3_path,
|
||
uploads_list,
|
||
)
|
||
|
||
build_attachment(
|
||
realm_id,
|
||
{message_id},
|
||
slack_user_id_to_zulip_user_id[slack_user_id],
|
||
fileinfo,
|
||
s3_path,
|
||
zerver_attachment,
|
||
)
|
||
else:
|
||
# For attachments with link not from Slack
|
||
# Example: Google drive integration
|
||
has_link = True
|
||
if "title" in fileinfo:
|
||
file_name = fileinfo["title"]
|
||
else:
|
||
file_name = fileinfo["name"]
|
||
markdown_links.append("[{}]({})".format(file_name, fileinfo["url_private"]))
|
||
|
||
content = "\n".join(markdown_links)
|
||
|
||
return dict(
|
||
content=content,
|
||
has_attachment=has_attachment,
|
||
has_image=has_image,
|
||
has_link=has_link,
|
||
)
|
||
|
||
|
||
def get_attachment_path_and_content(fileinfo: ZerverFieldsT, realm_id: int) -> tuple[str, str]:
|
||
# Should be kept in sync with its equivalent in zerver/lib/uploads in the function
|
||
# 'upload_message_attachment'
|
||
s3_path = "/".join(
|
||
[
|
||
str(realm_id),
|
||
format(random.randint(0, 255), "x"),
|
||
secrets.token_urlsafe(18),
|
||
sanitize_name(fileinfo["name"]),
|
||
]
|
||
)
|
||
attachment_path = f"/user_uploads/{s3_path}"
|
||
content = "[{}]({})".format(fileinfo["title"], attachment_path)
|
||
|
||
return s3_path, content
|
||
|
||
|
||
def build_reactions(
|
||
reaction_list: list[ZerverFieldsT],
|
||
reactions: list[ZerverFieldsT],
|
||
slack_user_id_to_zulip_user_id: SlackToZulipUserIDT,
|
||
message_id: int,
|
||
zerver_realmemoji: list[ZerverFieldsT],
|
||
) -> None:
|
||
realmemoji = {}
|
||
for realm_emoji in zerver_realmemoji:
|
||
realmemoji[realm_emoji["name"]] = realm_emoji["id"]
|
||
|
||
# Slack's data exports use encode skin tone variants on emoji
|
||
# reactions like this: `clap::skin-tone-2`. For now, we only
|
||
# use the name of the base emoji, since Zulip's emoji
|
||
# reactions system doesn't yet support skin tone modifiers.
|
||
# We need to merge and dedup reactions, as someone may have
|
||
# reacted to `clap::skin-tone-1` and `clap::skin-tone-2`, etc.
|
||
merged_reactions = defaultdict(set)
|
||
for slack_reaction in reactions:
|
||
emoji_name = slack_reaction["name"].split("::", maxsplit=1)[0]
|
||
merged_reactions[emoji_name].update(slack_reaction["users"])
|
||
reactions = [{"name": k, "users": v, "count": len(v)} for k, v in merged_reactions.items()]
|
||
|
||
processed_reactions: set[tuple[int, int, str, str]] = set()
|
||
# For the Unicode emoji codes, we use equivalent of
|
||
# function 'get_emoji_data' in 'zerver/lib/emoji' here
|
||
for slack_reaction in reactions:
|
||
emoji_name = slack_reaction["name"]
|
||
if emoji_name in slack_emoji_name_to_codepoint:
|
||
emoji_code = slack_emoji_name_to_codepoint[emoji_name]
|
||
try:
|
||
zulip_emoji_name = codepoint_to_name[emoji_code]
|
||
except KeyError:
|
||
print(f"WARN: Emoji found in iamcal but not Zulip: {emoji_name}")
|
||
continue
|
||
# Convert Slack emoji name to Zulip emoji name.
|
||
emoji_name = zulip_emoji_name
|
||
reaction_type = Reaction.UNICODE_EMOJI
|
||
elif emoji_name in realmemoji:
|
||
emoji_code = realmemoji[emoji_name]
|
||
reaction_type = Reaction.REALM_EMOJI
|
||
else:
|
||
print(f"WARN: Emoji not found in iamcal: {emoji_name}")
|
||
continue
|
||
|
||
for slack_user_id in slack_reaction["users"]:
|
||
if slack_user_id not in slack_user_id_to_zulip_user_id:
|
||
# Deleted users still have reaction references but no profile, so we skip
|
||
continue
|
||
|
||
reaction_id = NEXT_ID("reaction")
|
||
reaction = Reaction(
|
||
id=reaction_id,
|
||
emoji_code=emoji_code,
|
||
emoji_name=emoji_name,
|
||
reaction_type=reaction_type,
|
||
)
|
||
|
||
reaction_dict = model_to_dict(reaction, exclude=["message", "user_profile"])
|
||
reaction_dict["message"] = message_id
|
||
zulip_user_id = slack_user_id_to_zulip_user_id[slack_user_id]
|
||
reaction_dict["user_profile"] = zulip_user_id
|
||
|
||
reaction_tuple = (zulip_user_id, message_id, reaction_type, emoji_code)
|
||
if reaction_tuple in processed_reactions:
|
||
# Due to possible merging of Slack accounts into a single Zulip account,
|
||
# we need to ensure reactions don't get duplicated, violating the unique
|
||
# constraint on the (user_profile_id, message_id, reaction_type, emoji_code)
|
||
# index.
|
||
continue
|
||
processed_reactions.add(reaction_tuple)
|
||
|
||
reaction_list.append(reaction_dict)
|
||
|
||
|
||
def build_uploads(
|
||
user_id: int,
|
||
realm_id: int,
|
||
email: str,
|
||
fileinfo: ZerverFieldsT,
|
||
s3_path: str,
|
||
uploads_list: list[ZerverFieldsT],
|
||
) -> None:
|
||
upload = dict(
|
||
path=fileinfo["url_private"], # Save Slack's URL here, which is used later while processing
|
||
realm_id=realm_id,
|
||
content_type=None,
|
||
user_profile_id=user_id,
|
||
last_modified=fileinfo["timestamp"],
|
||
user_profile_email=email,
|
||
s3_path=s3_path,
|
||
size=fileinfo["size"],
|
||
)
|
||
uploads_list.append(upload)
|
||
|
||
|
||
def get_message_sending_user(message: ZerverFieldsT) -> str | None:
|
||
if "user" in message:
|
||
return message["user"]
|
||
if message.get("file"):
|
||
return message["file"].get("user")
|
||
if message.get("bot_id"):
|
||
return message.get("bot_id")
|
||
return None
|
||
|
||
|
||
def get_timestamp_from_message(message: ZerverFieldsT) -> float:
|
||
return float(message["ts"])
|
||
|
||
|
||
def is_integration_bot_message(message: ZerverFieldsT) -> bool:
|
||
return message.get("subtype") == "bot_message" and "user" not in message and "bot_id" in message
|
||
|
||
|
||
def convert_bot_info_to_slack_user(bot_info: dict[str, Any]) -> ZerverFieldsT:
|
||
# We use "image_72," an icon-sized 72x72 pixel image, for the Slack integration
|
||
# bots avatar because it is the best available option. As a consequence, this
|
||
# will make the avatar appear blurry in places where a medium-sized avatar
|
||
# (500x500px) is expected, such as in the user profile menu.
|
||
|
||
bot_user = {
|
||
"id": bot_info["id"],
|
||
"name": bot_info["name"],
|
||
"deleted": bot_info["deleted"],
|
||
"is_mirror_dummy": False,
|
||
"real_name": bot_info["name"],
|
||
"is_integration_bot": True,
|
||
"profile": {
|
||
"bot_id": bot_info["id"],
|
||
"first_name": bot_info["name"],
|
||
},
|
||
}
|
||
if "image_72" in bot_info["icons"]:
|
||
# Otherwise, gravatar will be used.
|
||
bot_user["profile"]["image_72"] = bot_info["icons"]["image_72"]
|
||
|
||
return bot_user
|
||
|
||
|
||
def make_deleted_placeholder(bot_id: str) -> ZerverFieldsT:
|
||
name = f"Deleted Slack Bot {bot_id}"
|
||
bot_user = {
|
||
"id": bot_id,
|
||
"name": name,
|
||
"deleted": True,
|
||
"is_mirror_dummy": False,
|
||
"real_name": name,
|
||
"is_integration_bot": True,
|
||
"profile": {
|
||
# Intentionally skip image_72. Gravatar should be used.
|
||
"bot_id": bot_id,
|
||
"first_name": name,
|
||
},
|
||
}
|
||
return bot_user
|
||
|
||
|
||
def fetch_shared_channel_users(
|
||
user_list: list[ZerverFieldsT], slack_data_dir: str, token: str
|
||
) -> None:
|
||
normal_user_ids = set()
|
||
mirror_dummy_user_ids = set()
|
||
added_channels = {}
|
||
integration_bot_users: list[str] = []
|
||
team_id_to_domain: dict[str, str] = {}
|
||
for user in user_list:
|
||
user["is_mirror_dummy"] = False
|
||
normal_user_ids.add(user["id"])
|
||
|
||
public_channels = get_data_file(slack_data_dir + "/channels.json")
|
||
try:
|
||
private_channels = get_data_file(slack_data_dir + "/groups.json")
|
||
except FileNotFoundError:
|
||
private_channels = []
|
||
try:
|
||
direct_message_groups = get_data_file(slack_data_dir + "/mpims.json")
|
||
except FileNotFoundError:
|
||
direct_message_groups = []
|
||
for channel in public_channels + private_channels + direct_message_groups:
|
||
added_channels[channel["name"]] = True
|
||
for user_id in channel["members"]:
|
||
if user_id not in normal_user_ids:
|
||
mirror_dummy_user_ids.add(user_id)
|
||
if os.path.exists(slack_data_dir + "/dms.json"):
|
||
dms = get_data_file(slack_data_dir + "/dms.json")
|
||
for dm_data in dms:
|
||
for user_id in dm_data["members"]:
|
||
if user_id not in normal_user_ids:
|
||
mirror_dummy_user_ids.add(user_id)
|
||
|
||
all_messages = get_messages_iterator(slack_data_dir, added_channels, {}, {})
|
||
for message in all_messages:
|
||
if is_integration_bot_message(message):
|
||
# This message is likely from an integration bot. Since Slack's integration
|
||
# bots doesn't have user profiles, we need to artificially create users for
|
||
# them to convert their messages.
|
||
bot_id = message["bot_id"]
|
||
if bot_id in integration_bot_users:
|
||
continue
|
||
try:
|
||
bot_info = get_slack_api_data(
|
||
"https://slack.com/api/bots.info", "bot", token=token, bot=bot_id
|
||
)
|
||
except SlackBotNotFoundError:
|
||
logging.info("Bot %s not found, creating a deleted placeholder", bot_id)
|
||
bot_user = make_deleted_placeholder(bot_id)
|
||
else:
|
||
bot_user = convert_bot_info_to_slack_user(bot_info)
|
||
|
||
user_list.append(bot_user)
|
||
integration_bot_users.append(bot_id)
|
||
else:
|
||
user_id = get_message_sending_user(message)
|
||
if user_id is None or user_id in normal_user_ids:
|
||
continue
|
||
mirror_dummy_user_ids.add(user_id)
|
||
|
||
# Fetch data on the mirror_dummy_user_ids from the Slack API (it's
|
||
# not included in the data export file).
|
||
for user_id in mirror_dummy_user_ids:
|
||
user = get_slack_api_data(
|
||
"https://slack.com/api/users.info", "user", token=token, user=user_id
|
||
)
|
||
team_id = user["team_id"]
|
||
if team_id not in team_id_to_domain:
|
||
team = get_slack_api_data(
|
||
"https://slack.com/api/team.info", "team", token=token, team=team_id
|
||
)
|
||
team_id_to_domain[team_id] = team["domain"]
|
||
user["team_domain"] = team_id_to_domain[team_id]
|
||
user["is_mirror_dummy"] = True
|
||
user_list.append(user)
|
||
|
||
|
||
def fetch_team_icons(
|
||
zerver_realm: dict[str, Any], team_info_dict: dict[str, Any], output_dir: str
|
||
) -> list[dict[str, Any]]:
|
||
records = []
|
||
|
||
team_icons_dict = team_info_dict["icon"]
|
||
if team_icons_dict.get("image_default", False):
|
||
return []
|
||
|
||
icon_url = (
|
||
team_icons_dict.get("image_original", None)
|
||
or team_icons_dict.get("image_230", None)
|
||
or team_icons_dict.get("image_132", None)
|
||
or team_icons_dict.get("image_102", None)
|
||
)
|
||
if icon_url is None:
|
||
return []
|
||
|
||
response = requests.get(icon_url, stream=True)
|
||
response_raw = response.raw
|
||
|
||
realm_id = zerver_realm["id"]
|
||
os.makedirs(os.path.join(output_dir, str(realm_id)), exist_ok=True)
|
||
|
||
original_icon_output_path = os.path.join(output_dir, str(realm_id), "icon.original")
|
||
with open(original_icon_output_path, "wb") as output_file:
|
||
shutil.copyfileobj(response_raw, output_file)
|
||
records.append(
|
||
{
|
||
"realm_id": realm_id,
|
||
"path": os.path.join(str(realm_id), "icon.original"),
|
||
"s3_path": os.path.join(str(realm_id), "icon.original"),
|
||
"content_type": response.headers["Content-Type"],
|
||
}
|
||
)
|
||
|
||
resized_icon_output_path = os.path.join(output_dir, str(realm_id), "icon.png")
|
||
with (
|
||
open(resized_icon_output_path, "wb") as output_file,
|
||
open(original_icon_output_path, "rb") as original_file,
|
||
):
|
||
resized_data = resize_realm_icon(original_file.read())
|
||
output_file.write(resized_data)
|
||
records.append(
|
||
{
|
||
"realm_id": realm_id,
|
||
"path": os.path.join(str(realm_id), "icon.png"),
|
||
"s3_path": os.path.join(str(realm_id), "icon.png"),
|
||
"content_type": "image/png",
|
||
}
|
||
)
|
||
|
||
zerver_realm["icon_source"] = Realm.ICON_UPLOADED
|
||
|
||
return records
|
||
|
||
|
||
def do_convert_zipfile(
|
||
original_path: str,
|
||
output_dir: str,
|
||
token: str,
|
||
threads: int = 6,
|
||
convert_slack_threads: bool = False,
|
||
) -> None:
|
||
assert original_path.endswith(".zip")
|
||
slack_data_dir = original_path.removesuffix(".zip")
|
||
try:
|
||
os.makedirs(slack_data_dir, exist_ok=True)
|
||
|
||
with zipfile.ZipFile(original_path) as zipObj:
|
||
total_size = 0
|
||
for fileinfo in zipObj.infolist():
|
||
# Slack's export doesn't set the UTF-8 flag on each
|
||
# filename entry, despite encoding them as such, so
|
||
# zipfile mojibake's the output. Explicitly re-interpret
|
||
# it as UTF-8 misdecoded as cp437, the default.
|
||
fileinfo.flag_bits |= 0x800
|
||
fileinfo.filename = fileinfo.filename.encode("cp437").decode("utf-8")
|
||
zipObj.NameToInfo[fileinfo.filename] = fileinfo
|
||
|
||
# The only files we expect to find in a Slack export are .json files:
|
||
# something.json
|
||
# channelname/
|
||
# channelname/2024-01-02.json
|
||
#
|
||
# Canvases may also appear in exports, either in their own
|
||
# top-level directories, or as `canvas_in_the_conversation.json`
|
||
# files in channel directories. We do not parse these currently.
|
||
if not re.match(r"[^/]+(\.json|/([^/]+\.json)?)$", fileinfo.filename):
|
||
raise SlackImportInvalidFileError(
|
||
"Uploaded zip file is not a valid Slack export."
|
||
)
|
||
|
||
# file_size is the uncompressed size of the file
|
||
total_size += fileinfo.file_size
|
||
|
||
# Based on historical Slack exports, anything that is more
|
||
# than a 10x size magnification is suspect, particularly
|
||
# if it results in over 1GB.
|
||
if total_size > 1024 * 1024 * 1024 and total_size > 10 * os.path.getsize(original_path):
|
||
raise SlackImportInvalidFileError("Uploaded zip file is not a valid Slack export.")
|
||
|
||
zipObj.extractall(slack_data_dir)
|
||
|
||
do_convert_directory(slack_data_dir, output_dir, token, threads, convert_slack_threads)
|
||
finally:
|
||
# Always clean up the uncompressed directory
|
||
rm_tree(slack_data_dir)
|
||
|
||
|
||
SLACK_IMPORT_TOKEN_SCOPES = {"emoji:read", "users:read", "users:read.email", "team:read"}
|
||
|
||
|
||
def do_convert_directory(
|
||
slack_data_dir: str,
|
||
output_dir: str,
|
||
token: str,
|
||
threads: int = 6,
|
||
convert_slack_threads: bool = False,
|
||
) -> None:
|
||
check_token_access(token, SLACK_IMPORT_TOKEN_SCOPES)
|
||
|
||
os.makedirs(output_dir, exist_ok=True)
|
||
if os.listdir(output_dir):
|
||
raise Exception("Output directory should be empty!")
|
||
|
||
if not os.path.isfile(os.path.join(slack_data_dir, "channels.json")):
|
||
raise ValueError("Import does not have the layout we expect from a Slack export!")
|
||
|
||
# We get the user data from the legacy token method of Slack API, which is depreciated
|
||
# but we use it as the user email data is provided only in this method.
|
||
# Fetching from this endpoint requires using pagination, as only a subset
|
||
# of the users might be returned in any single request.
|
||
# We use the limit value of 200, as that's suggested in Slack's documentation for this
|
||
# endpoint.
|
||
user_list = get_slack_api_data(
|
||
"https://slack.com/api/users.list", "members", token=token, pagination_limit=200
|
||
)
|
||
fetch_shared_channel_users(user_list, slack_data_dir, token)
|
||
|
||
custom_emoji_list = get_slack_api_data("https://slack.com/api/emoji.list", "emoji", token=token)
|
||
|
||
# Subdomain is set by the user while running the import command
|
||
realm_subdomain = ""
|
||
realm_id = 0
|
||
domain_name = SplitResult("", settings.EXTERNAL_HOST, "", "", "").hostname
|
||
assert isinstance(domain_name, str)
|
||
|
||
(
|
||
realm,
|
||
slack_user_id_to_zulip_user_id,
|
||
slack_recipient_name_to_zulip_recipient_id,
|
||
added_channels,
|
||
added_mpims,
|
||
dm_members,
|
||
avatar_list,
|
||
emoji_url_map,
|
||
) = slack_workspace_to_realm(
|
||
domain_name, realm_id, user_list, realm_subdomain, slack_data_dir, custom_emoji_list
|
||
)
|
||
|
||
reactions, uploads_list, zerver_attachment = convert_slack_workspace_messages(
|
||
slack_data_dir,
|
||
user_list,
|
||
realm_id,
|
||
slack_user_id_to_zulip_user_id,
|
||
slack_recipient_name_to_zulip_recipient_id,
|
||
added_channels,
|
||
added_mpims,
|
||
dm_members,
|
||
realm,
|
||
realm["zerver_userprofile"],
|
||
realm["zerver_realmemoji"],
|
||
domain_name,
|
||
output_dir,
|
||
convert_slack_threads,
|
||
)
|
||
|
||
# Move zerver_reactions to realm.json file
|
||
realm["zerver_reaction"] = reactions
|
||
|
||
emoji_folder = os.path.join(output_dir, "emoji")
|
||
os.makedirs(emoji_folder, exist_ok=True)
|
||
emoji_records = process_emojis(realm["zerver_realmemoji"], emoji_folder, emoji_url_map, threads)
|
||
|
||
avatar_folder = os.path.join(output_dir, "avatars")
|
||
avatar_realm_folder = os.path.join(avatar_folder, str(realm_id))
|
||
os.makedirs(avatar_realm_folder, exist_ok=True)
|
||
avatar_records = process_avatars(
|
||
avatar_list, avatar_folder, realm_id, threads, size_url_suffix="-512"
|
||
)
|
||
|
||
uploads_folder = os.path.join(output_dir, "uploads")
|
||
os.makedirs(os.path.join(uploads_folder, str(realm_id)), exist_ok=True)
|
||
uploads_records = process_uploads(uploads_list, uploads_folder, threads)
|
||
attachment = {"zerver_attachment": zerver_attachment}
|
||
|
||
team_info_dict = get_slack_api_data("https://slack.com/api/team.info", "team", token=token)
|
||
realm_icons_folder = os.path.join(output_dir, "realm_icons")
|
||
realm_icon_records = fetch_team_icons(
|
||
realm["zerver_realm"][0], team_info_dict, realm_icons_folder
|
||
)
|
||
|
||
create_converted_data_files(realm, output_dir, "/realm.json")
|
||
create_converted_data_files(emoji_records, output_dir, "/emoji/records.json")
|
||
create_converted_data_files(avatar_records, output_dir, "/avatars/records.json")
|
||
create_converted_data_files(uploads_records, output_dir, "/uploads/records.json")
|
||
create_converted_data_files(attachment, output_dir, "/attachment.json")
|
||
create_converted_data_files(realm_icon_records, output_dir, "/realm_icons/records.json")
|
||
do_common_export_processes(output_dir)
|
||
|
||
logging.info("######### DATA CONVERSION FINISHED #########\n")
|
||
logging.info("Zulip data dump created at %s", output_dir)
|
||
|
||
|
||
def get_data_file(path: str) -> Any:
|
||
with open(path, "rb") as fp:
|
||
data = orjson.loads(fp.read())
|
||
return data
|
||
|
||
|
||
def check_token_access(token: str, required_scopes: set[str]) -> None:
|
||
if token.startswith("xoxp-"):
|
||
logging.info("This is a Slack user token, which grants all rights the user has!")
|
||
elif token.startswith("xoxb-"):
|
||
data = requests.get(
|
||
"https://slack.com/api/api.test", headers={"Authorization": f"Bearer {token}"}
|
||
)
|
||
if data.status_code != 200:
|
||
raise ValueError(
|
||
f"Failed to fetch data (HTTP status {data.status_code}) for Slack token: {token}"
|
||
)
|
||
if not data.json()["ok"]:
|
||
error = data.json()["error"]
|
||
if error != "missing_scope":
|
||
logging.error("Slack token is invalid: %s", error)
|
||
raise ValueError(f"Invalid token: {token}")
|
||
has_scopes = set(data.headers.get("x-oauth-scopes", "").split(","))
|
||
missing_scopes = required_scopes - has_scopes
|
||
if missing_scopes:
|
||
raise ValueError(
|
||
f"Slack token is missing the following required scopes: {sorted(missing_scopes)}"
|
||
)
|
||
else:
|
||
raise Exception("Invalid token. Valid tokens start with xoxb-.")
|
||
|
||
|
||
def get_slack_api_data(
|
||
slack_api_url: str,
|
||
get_param: str,
|
||
*,
|
||
pagination_limit: int | None = None,
|
||
raise_if_rate_limited: bool = False,
|
||
**kwargs: Any,
|
||
) -> Any:
|
||
if not kwargs.get("token"):
|
||
raise AssertionError("Slack token missing in kwargs")
|
||
|
||
token = kwargs.pop("token")
|
||
accumulated_result = []
|
||
cursor: str | None = None
|
||
while True:
|
||
if pagination_limit is not None:
|
||
# If we're fetching with pagination, this might take a while, so we want reasonable logging to show
|
||
# progress and what's being fetched.
|
||
logging.info(
|
||
"Fetching page from %s with cursor: %s and limit: %s",
|
||
slack_api_url,
|
||
cursor,
|
||
pagination_limit,
|
||
)
|
||
|
||
params: dict[str, int | str] = {"limit": pagination_limit} if pagination_limit else {}
|
||
if cursor:
|
||
params["cursor"] = cursor
|
||
params.update(kwargs)
|
||
|
||
response = requests.get(
|
||
slack_api_url, headers={"Authorization": f"Bearer {token}"}, params=params
|
||
)
|
||
|
||
if response.status_code == 429:
|
||
if raise_if_rate_limited:
|
||
raise Exception("Exceeded Slack rate limits.")
|
||
retry_after = int(response.headers.get("retry-after", 1))
|
||
logging.info("Rate limit exceeded. Retrying in %s seconds...", retry_after)
|
||
time.sleep(retry_after)
|
||
continue
|
||
|
||
if response.status_code != requests.codes.ok:
|
||
logging.info("HTTP error: %s, Response: %s", response.status_code, response.text)
|
||
raise Exception("HTTP error accessing the Slack API.")
|
||
|
||
result = response.json()
|
||
if not result["ok"]:
|
||
if result["error"] == "bot_not_found":
|
||
raise SlackBotNotFoundError
|
||
|
||
raise Exception("Error accessing Slack API: {}".format(result["error"]))
|
||
|
||
result_data = result[get_param]
|
||
|
||
if pagination_limit is None:
|
||
# We're not using pagination, so we don't want to loop and should just return the result.
|
||
return result_data
|
||
|
||
accumulated_result.extend(result_data)
|
||
if not result.get("response_metadata", {}).get("next_cursor"):
|
||
# Everything has been fetched.
|
||
break
|
||
|
||
cursor = result["response_metadata"]["next_cursor"]
|
||
|
||
return accumulated_result
|
||
|
||
|
||
class SlackBotNotFoundError(Exception):
|
||
pass
|