From 946e4096fcc6d78c42c8cfe5c484d3a6d4322009 Mon Sep 17 00:00:00 2001 From: PieterCK Date: Fri, 18 Apr 2025 14:54:22 +0700 Subject: [PATCH] slack_regex: Refactor how the capture groups are defined. This updates the patterns to use a more unicode-aware pattern for their first and last capture groups. The new patterns have the same behaviours, but they're expressed in a more coherent way. For example, the existing patterns lists what characters to look for, skipping ceratin characters it don't want to match (e.g, closing brackets & quote are skipped). The new pattern narrows down what it looks for (whitespace, punctuation, symbols) and explicitly list what it don't want to match (closing quote and bracket, etc). This also refactors the `convert_markdown_syntax` to use the `regex` module instead of the `re` module because the `regex` module has full unicode support. --- .../data_import/slack_message_conversion.py | 50 ++++++++++++++++--- .../fixtures/slack_message_conversion.json | 5 ++ 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/zerver/data_import/slack_message_conversion.py b/zerver/data_import/slack_message_conversion.py index 1922559aa9..4d97dec016 100644 --- a/zerver/data_import/slack_message_conversion.py +++ b/zerver/data_import/slack_message_conversion.py @@ -2,6 +2,8 @@ import re from itertools import zip_longest from typing import Any, Literal, TypeAlias, TypedDict, cast +import regex + from zerver.lib.types import Validator from zerver.lib.validator import ( WildValue, @@ -48,25 +50,57 @@ SLACK_USERMENTION_REGEX = r""" # Hence, ~stri~ke doesn't format the word in Slack, but ~~stri~~ke # formats the word in Zulip SLACK_STRIKETHROUGH_REGEX = r""" - (^|[ -(]|[+-/]|\*|\_|[:-?]|\{|\[|\||\^) # Start after specified characters + ( + # Capture punctuation (\p{P}), white space (\p{Zs}), + # symbols (\p{S}) or newline. + # Skip ~ to not reformat the same string twice + # Skip @ and \ + # Skip closing brackets & closing quote (\p{Pf}\p{Pe}) + (?![~`@\\\p{Pf}\p{Pe}]) + [\p{P}\p{Zs}\p{S}]|^ + ) (\~) # followed by a ~ ([ -)+-}—]*)([ -}]+) # any character except ~ (\~) # followed by a ~ - ($|[ -']|[+-/]|[:-?]|\*|\_|\}|\)|\]|\||\^) # ends with specified characters + ( + # Capture punctuation, white space, symbols or end of + # line. + # Skip ~ to not reformat the same string twice + # Skip @ and \ + # Skip opening brackets & opening quote (\p{Pi}\p{Ps}) + (?![~`@\\\p{Pi}\p{Ps}]) + [\p{P}\p{Zs}\p{S}]|$ + ) """ SLACK_ITALIC_REGEX = r""" - (^|[ -*]|[+-/]|[:-?]|\{|\[|\||\^|~) + # Same as `SLACK_STRIKETHROUGH_REGEX`s. The difference + # being, this skips _ instead of ~ + ( + (?![_`@\\\p{Pf}\p{Pe}]) + [\p{P}\p{Zs}\p{S}]|^ + ) (\_) ([ -^`~—]*)([ -^`-~]+) # any character except _ (\_) - ($|[ -']|[+-/]|[:-?]|\}|\)|\]|\*|\||\^|~) + ( + (?![_`@\\\p{Pi}\p{Ps}]) + [\p{P}\p{Zs}\p{S}]|$ + ) """ SLACK_BOLD_REGEX = r""" - (^|[ -(]|[+-/]|[:-?]|\{|\[|\_|\||\^|~) + # Same as `SLACK_STRIKETHROUGH_REGEX`s. The difference + # being, this skips * instead of ~ + ( + (?![*`@\\\p{Pf}\p{Pe}]) + [\p{P}\p{Zs}\p{S}]|^ + ) (\*) ([ -)+-~—]*)([ -)+-~]+) # any character except * (\*) - ($|[ -']|[+-/]|[:-?]|\}|\)|\]|\_|\||\^|~) + ( + (?![*`@\\\p{Pi}\p{Ps}]) + [\p{P}\p{Zs}\p{S}]|$ + ) """ @@ -131,14 +165,14 @@ def convert_mailto_format(text: str) -> tuple[str, bool]: # Map italic, bold and strikethrough Markdown -def convert_markdown_syntax(text: str, regex: str, zulip_keyword: str) -> str: +def convert_markdown_syntax(text: str, pattern: str, zulip_keyword: str) -> str: """ Returns: 1. For strikethrough formatting: This maps Slack's '~strike~' to Zulip's '~~strike~~' 2. For bold formatting: This maps Slack's '*bold*' to Zulip's '**bold**' 3. For italic formatting: This maps Slack's '_italic_' to Zulip's '*italic*' """ - for match in re.finditer(regex, text, re.VERBOSE): + for match in regex.finditer(pattern, text, re.VERBOSE): converted_token = ( match.group(1) + zulip_keyword diff --git a/zerver/tests/fixtures/slack_message_conversion.json b/zerver/tests/fixtures/slack_message_conversion.json index 7725743998..7e6c0dd3e2 100644 --- a/zerver/tests/fixtures/slack_message_conversion.json +++ b/zerver/tests/fixtures/slack_message_conversion.json @@ -84,6 +84,11 @@ "name": "italic_and_strike_conversion", "input": "_~italic~_ and ~_strike_~", "conversion_output": "*~~italic~~* and ~~*strike*~~" + }, + { + "name": "unicode_quotes", + "input": "«~strike~» 「*bold*」 ❰_italic_❱", + "conversion_output": "«~~strike~~» 「**bold**」 ❰*italic*❱" } ] }