slack_import: Improve topic name for Slack threads.

This updates the topic name format for Slack threads to include a
snippet of the original message. Currently the format looks like this;
"{date} Slack thread {n}", which provides little to no context about the
thread.

Currently we also use the `thread_ts_str` key to keep track of Slack
threads we're converting, this key format makes it so that it has a
small chance of combining threads with the same timestamp under one
topic. This commit updates it to use a new key format, `thread_key`.

Fixes #27661.
This commit is contained in:
PieterCK
2025-01-02 19:50:31 +07:00
committed by Tim Abbott
parent 4210ccc5db
commit 9de1b47ffa
3 changed files with 221 additions and 34 deletions

View File

@@ -165,8 +165,8 @@ in mind about the import process:
| Multi Channel Guest | Guest |
| Channel creator | none |
- Slack threads are imported as topics with names like "2023-05-30
Slack thread 1".
- Slack threads are imported as topics with names that include snippets of the
original message, such as "2023-05-30 Hi, can anyone reply if you're o…".
- Message edit history and `@user joined #channel_name` messages are not imported.

View File

@@ -51,6 +51,7 @@ from zerver.data_import.slack_message_conversion import (
from zerver.lib.emoji import codepoint_to_name, get_emoji_file_name
from zerver.lib.exceptions import SlackImportInvalidFileError
from zerver.lib.export import MESSAGE_BATCH_CHUNK_SIZE, do_common_export_processes
from zerver.lib.message import truncate_content
from zerver.lib.mime_types import guess_type
from zerver.lib.storage import static_path
from zerver.lib.thumbnail import THUMBNAIL_ACCEPT_IMAGE_TYPES, resize_realm_icon
@@ -64,6 +65,7 @@ from zerver.models import (
Recipient,
UserProfile,
)
from zerver.models.constants import MAX_TOPIC_NAME_LENGTH
SlackToZulipUserIDT: TypeAlias = dict[str, int]
AddedChannelsT: TypeAlias = dict[str, tuple[str, int]]
@@ -898,6 +900,49 @@ def get_messages_iterator(
yield from sorted(messages_for_one_day, key=get_timestamp_from_message)
def get_parent_user_id_from_thread_message(thread_message: ZerverFieldsT, subtype: str) -> str:
"""
This retrieves the user id of the sender of the original thread
message.
"""
if subtype == "thread_broadcast":
return thread_message["root"]["user"]
elif thread_message["thread_ts"] == thread_message["ts"]:
# This is the original thread message
return thread_message["user"]
else:
return thread_message["parent_user_id"]
def get_zulip_thread_topic_name(
message_content: str, thread_ts: datetime, thread_counter: dict[str, int]
) -> str:
"""
The topic name format is date + message snippet + counter.
e.g "2024-05-22 Hello this is a long message that will be c… (1)"
"""
thread_date = thread_ts.strftime(r"%Y-%m-%d")
# Truncate
truncated_zulip_topic_name = truncate_content(
f"{thread_date} {message_content}".strip(), MAX_TOPIC_NAME_LENGTH, ""
)
collision = thread_counter[truncated_zulip_topic_name]
thread_counter[truncated_zulip_topic_name] += 1
count = (f" ({collision + 1})") if collision > 0 else ""
# Important: The count is at the end, after …, so we need to
# subtract its length when doing truncation.
final_topic_name = (
truncate_content(
f"{thread_date} {message_content}".strip(), MAX_TOPIC_NAME_LENGTH - len(f"{count}"), ""
)
+ f"{count}"
)
return final_topic_name
def channel_message_to_zerver_message(
realm_id: int,
users: list[ZerverFieldsT],
@@ -1034,22 +1079,21 @@ def channel_message_to_zerver_message(
has_image = file_info["has_image"]
# Slack's unthreaded messages go into a single topic, while
# threads each generate a unique topic labeled by the date and
# a counter among topics on that day.
# threads each generate a unique topic labeled by the date,
# a snippet of the original message and a counter if there
# are any thread with the same topic name
topic_name = "imported from Slack"
if convert_slack_threads and not is_direct_message_type and "thread_ts" in message:
thread_ts = datetime.fromtimestamp(float(message["thread_ts"]), tz=timezone.utc)
thread_ts_str = thread_ts.strftime(r"%Y/%m/%d %H:%M:%S")
# The topic name is "2015-08-18 Slack thread 2", where the counter at the end is to disambiguate
# threads with the same date.
if thread_ts_str in thread_map:
topic_name = thread_map[thread_ts_str]
parent_user_id = get_parent_user_id_from_thread_message(message, subtype)
thread_key = f"{thread_ts_str}-{parent_user_id}"
if thread_key in thread_map:
topic_name = thread_map[thread_key]
else:
thread_date = thread_ts.strftime(r"%Y-%m-%d")
thread_counter[thread_date] += 1
count = thread_counter[thread_date]
topic_name = f"{thread_date} Slack thread {count}"
thread_map[thread_ts_str] = topic_name
topic_name = get_zulip_thread_topic_name(content, thread_ts, thread_counter)
thread_map[thread_key] = topic_name
if is_direct_message_type:
topic_name = ""

View File

@@ -1359,7 +1359,13 @@ class SlackImporter(ZulipTestCase):
user_data = [
{"id": "U066MTL5U", "name": "john doe", "deleted": False, "real_name": "John"},
{"id": "U061A5N1G", "name": "jane doe", "deleted": False, "real_name": "Jane"},
{"id": "U061A1R2R", "name": "jon", "deleted": False, "real_name": "Jon"},
{
"id": "U061A1R2R",
"name": "jon",
"deleted": False,
"real_name": "Jon",
"profile": {"email": "jon@example.com"},
},
]
slack_user_id_to_zulip_user_id = {"U066MTL5U": 5, "U061A5N1G": 24, "U061A1R2R": 43}
@@ -1380,10 +1386,10 @@ class SlackImporter(ZulipTestCase):
"channel_name": "random",
},
{
"text": "random",
"text": "message body text",
"user": "U061A5N1G",
"ts": "1439868294.000006",
# Thread!
"ts": "1434139102.000002",
# Start of thread 1!
"thread_ts": "1434139102.000002",
"channel_name": "random",
},
@@ -1391,23 +1397,106 @@ class SlackImporter(ZulipTestCase):
"text": "random",
"user": "U061A5N1G",
"ts": "1439868294.000007",
# A reply to thread 1
"parent_user_id": "U061A5N1G",
"thread_ts": "1434139102.000002",
"channel_name": "random",
},
{
"text": "random",
"text": "random message but it's too long for the thread topic name",
"user": "U061A5N1G",
"ts": "1439868294.000008",
# A different Thread!
# Start of thread 2!
"thread_ts": "1439868294.000008",
"channel_name": "random",
},
{
"text": "random",
"text": "replying to the second thread :)",
"user": "U061A1R2R",
"ts": "1439869294.000008",
# A reply to thread 2
"parent_user_id": "U061A5N1G",
"thread_ts": "1439868294.000008",
"channel_name": "random",
},
{
"text": "message body text",
"user": "U061A5N1G",
"ts": "1439868295.000008",
# Another different Thread!
"thread_ts": "1439868295.000008",
"ts": "1434139200.000002",
# Start of thread 3!
"thread_ts": "1434139200.000002",
"channel_name": "random",
},
{
"text": "The first reply to the third thread",
"user": "U061A1R2R",
"ts": "1439869295.000008",
# A reply to thread 3!
"parent_user_id": "U061A5N1G",
"thread_ts": "1434139200.000002",
"channel_name": "random",
},
{
"text": "<@U061A1R2R> please reply to this message",
"user": "U061A5N1G",
"ts": "1437139200.000002",
# Start of thread 4!
"thread_ts": "1437139200.000002",
"channel_name": "random",
},
{
"text": "Yes?",
"user": "U061A1R2R",
"ts": "1440869295.000008",
# A reply to thread 4!
"parent_user_id": "U061A5N1G",
"thread_ts": "1434139200.000002",
"channel_name": "random",
},
{
"text": "Look!",
"user": "U061A1R2R",
"ts": "1537139200.000002",
# Start of thread 5!
"thread_ts": "1537139200.000002",
"has_image": True,
"channel_name": "random",
"files": [
{
"url_private": "https://files.slack.com/apple.png",
"title": "Apple",
"name": "apple.png",
"mimetype": "image/png",
"timestamp": 9999,
"created": 8888,
"size": 3000000,
}
],
},
{
"text": "Delicious",
"user": "U061A5N1G",
"ts": "1637139200.000002",
# A reply to thread 5!
"parent_user_id": "U061A1R2R",
"thread_ts": "1537139200.000002",
"channel_name": "random",
},
{
"text": "*foo* _bar_ ~baz~ [qux](https://chat.zulip.org)",
"user": "U061A1R2R",
"ts": "1547139200.000002",
# Start of thread 6!
"thread_ts": "1547139200.000002",
"channel_name": "random",
},
{
"text": "Delicious",
"user": "U061A5N1G",
"ts": "1637139200.000002",
# A reply to thread 6!
"parent_user_id": "U061A1R2R",
"thread_ts": "1547139200.000002",
"channel_name": "random",
},
]
@@ -1445,26 +1534,80 @@ class SlackImporter(ZulipTestCase):
# functioning already tested in helper function
self.assertEqual(zerver_usermessage, [])
# subtype: channel_join is filtered
self.assert_length(zerver_message, 5)
self.assert_length(zerver_message, 13)
self.assertEqual(uploads, [])
self.assertEqual(attachment, [])
self.assert_length(uploads, 1)
self.assert_length(attachment, 1)
# Message conversion already tested in tests.test_slack_message_conversion
self.assertEqual(zerver_message[0]["content"], "@**Jane**: hey!")
self.assertEqual(zerver_message[0]["has_link"], False)
self.assertEqual(zerver_message[1]["content"], "random")
self.assertEqual(zerver_message[1][EXPORT_TOPIC_NAME], "2015-06-12 Slack thread 1")
self.assertEqual(zerver_message[2][EXPORT_TOPIC_NAME], "2015-06-12 Slack thread 1")
# A new thread with a different date from 2015-06-12, starts the counter from 1.
self.assertEqual(zerver_message[3][EXPORT_TOPIC_NAME], "2015-08-18 Slack thread 1")
# A new thread with a different timestamp, but the same date as 2015-08-18, starts the
# counter from 2.
self.assertEqual(zerver_message[4][EXPORT_TOPIC_NAME], "2015-08-18 Slack thread 2")
self.assertEqual(
zerver_message[1]["recipient"], slack_recipient_name_to_zulip_recipient_id["random"]
)
### THREAD 1 CONVERSATION ###
# Test thread topic name contains message snippet
expected_thread_1_message_1_content = "message body text"
expected_thread_1_topic_name = "2015-06-12 message body text"
self.assertEqual(zerver_message[1]["content"], expected_thread_1_message_1_content)
self.assertEqual(zerver_message[1][EXPORT_TOPIC_NAME], expected_thread_1_topic_name)
# Thread reply is in the correct thread topic
self.assertEqual(zerver_message[2]["content"], "random")
self.assertEqual(zerver_message[2][EXPORT_TOPIC_NAME], expected_thread_1_topic_name)
### THREAD 2 CONVERSATION ###
# Test thread topic name cut off
expected_thread_2_message_1_content = (
"random message but it's too long for the thread topic name"
)
expected_thread_2_topic_name = (
"2015-08-18 random message but it's too long for the thread …"
)
self.assertEqual(zerver_message[3]["content"], expected_thread_2_message_1_content)
self.assertEqual(zerver_message[3][EXPORT_TOPIC_NAME], expected_thread_2_topic_name)
# Record that truncation should use the full maximum topic length.
self.assert_length(zerver_message[3][EXPORT_TOPIC_NAME], 60)
expected_thread_2_reply_1_message = "replying to the second thread :)"
self.assertEqual(zerver_message[4]["content"], expected_thread_2_reply_1_message)
self.assertEqual(zerver_message[4][EXPORT_TOPIC_NAME], expected_thread_2_topic_name)
### THREAD 3 CONVERSATION ###
# Test thread topic name collision
expected_thread_3_message_1_content = "message body text"
expected_thread_3_topic_name = "2015-06-12 message body text (2)"
self.assertEqual(zerver_message[5]["content"], expected_thread_3_message_1_content)
self.assertEqual(zerver_message[5][EXPORT_TOPIC_NAME], expected_thread_3_topic_name)
### THREAD 4 CONVERSATION ###
# Test mention syntax in thread topic name
expected_thread_4_message_1_content = "@**Jon** please reply to this message"
expected_thread_4_topic_name = "2015-07-17 @**Jon** please reply to this message"
self.assertEqual(zerver_message[7]["content"], expected_thread_4_message_1_content)
self.assertEqual(zerver_message[7][EXPORT_TOPIC_NAME], expected_thread_4_topic_name)
### THREAD 5 CONVERSATION ###
# Test file link in thread topic name
expected_thread_4_message_1_content = "Look!\n[Apple](/user_uploads/"
expected_thread_4_topic_name = "2018-09-16 Look!\n[Apple](/user_uploads/"
self.assertTrue(
zerver_message[9]["content"].startswith(expected_thread_4_message_1_content)
)
self.assertTrue(
zerver_message[9][EXPORT_TOPIC_NAME].startswith(expected_thread_4_topic_name)
)
### THREAD 6 CONVERSATION ###
# Test various formatting syntaxes in thread topic name
expected_thread_4_message_1_content = "**foo** *bar* ~~baz~~ [qux](https://chat.zulip.org)"
expected_thread_4_topic_name = (
"2019-01-10 **foo** *bar* ~~baz~~ [qux](https://chat.zulip.o…"
)
self.assertEqual(zerver_message[11]["content"], expected_thread_4_message_1_content)
self.assertEqual(zerver_message[11][EXPORT_TOPIC_NAME], expected_thread_4_topic_name)
@mock.patch("zerver.data_import.slack.build_usermessages", return_value=(2, 4))
def test_channel_message_to_zerver_message_with_integration_bots(
self, mock_build_usermessage: mock.Mock