slack_regex: Refactor how the capture groups are defined.

This updates the patterns to use a more unicode-aware pattern for their first and last capture groups. The new patterns have the same behaviours, but they're expressed in a more coherent way. For example, the existing patterns lists what characters to look for, skipping ceratin characters it don't want to match (e.g, closing brackets & quote are skipped). The new pattern narrows down what it looks for (whitespace, punctuation, symbols) and explicitly list what it don't want to match (closing quote and bracket, etc). This also refactors the `convert_markdown_syntax` to use the `regex` module instead of the `re` module because the `regex` module has full unicode support.
2025-11-02 04:53:36 +00:00 · 2025-04-18 14:54:22 +07:00
parent f33c5b9c1e
commit 946e4096fc
2 changed files with 47 additions and 8 deletions
--- a/zerver/data_import/slack_message_conversion.py
+++ b/zerver/data_import/slack_message_conversion.py
@@ -2,6 +2,8 @@ import re
 from itertools import zip_longest
 from typing import Any, Literal, TypeAlias, TypedDict, cast

+import regex
+
 from zerver.lib.types import Validator
 from zerver.lib.validator import (
    WildValue,
@@ -48,25 +50,57 @@ SLACK_USERMENTION_REGEX = r"""
 # Hence, ~stri~ke doesn't format the word in Slack, but ~~stri~~ke
 # formats the word in Zulip
 SLACK_STRIKETHROUGH_REGEX = r"""
-                             (^|[ -(]|[+-/]|\*|\_|[:-?]|\{|\[|\||\^)     # Start after specified characters
+                             (
+                                # Capture punctuation (\p{P}), white space (\p{Zs}),
+                                # symbols (\p{S}) or newline.
+                                # Skip ~ to not reformat the same string twice
+                                # Skip @ and \
+                                # Skip closing brackets & closing quote (\p{Pf}\p{Pe})
+                                (?![~`@\\\p{Pf}\p{Pe}])
+                                [\p{P}\p{Zs}\p{S}]|^
+                             )
                             (\~)                                  # followed by a ~
                                 ([ -)+-}—]*)([ -}]+)              # any character except ~
                             (\~)                                  # followed by a ~
-                             ($|[ -']|[+-/]|[:-?]|\*|\_|\}|\)|\]|\||\^)  # ends with specified characters
+                             (
+                                # Capture punctuation, white space, symbols or end of
+                                # line.
+                                # Skip ~ to not reformat the same string twice
+                                # Skip @ and \
+                                # Skip opening brackets & opening quote (\p{Pi}\p{Ps})
+                                (?![~`@\\\p{Pi}\p{Ps}])
+                                [\p{P}\p{Zs}\p{S}]|$
+                             )
                             """
 SLACK_ITALIC_REGEX = r"""
-                      (^|[ -*]|[+-/]|[:-?]|\{|\[|\||\^|~)
+                      # Same as `SLACK_STRIKETHROUGH_REGEX`s. The difference
+                      # being, this skips _ instead of ~
+                      (
+                        (?![_`@\\\p{Pf}\p{Pe}])
+                        [\p{P}\p{Zs}\p{S}]|^
+                      )
                      (\_)
                          ([ -^`~—]*)([ -^`-~]+)                  # any character except _
                      (\_)
-                      ($|[ -']|[+-/]|[:-?]|\}|\)|\]|\*|\||\^|~)
+                      (
+                        (?![_`@\\\p{Pi}\p{Ps}])
+                        [\p{P}\p{Zs}\p{S}]|$
+                      )
                      """
 SLACK_BOLD_REGEX = r"""
-                    (^|[ -(]|[+-/]|[:-?]|\{|\[|\_|\||\^|~)
+                    # Same as `SLACK_STRIKETHROUGH_REGEX`s. The difference
+                    # being, this skips * instead of ~
+                    (
+                        (?![*`@\\\p{Pf}\p{Pe}])
+                        [\p{P}\p{Zs}\p{S}]|^
+                    )
                    (\*)
                        ([ -)+-~—]*)([ -)+-~]+)                   # any character except *
                    (\*)
-                    ($|[ -']|[+-/]|[:-?]|\}|\)|\]|\_|\||\^|~)
+                    (
+                        (?![*`@\\\p{Pi}\p{Ps}])
+                        [\p{P}\p{Zs}\p{S}]|$
+                    )
                    """


@@ -131,14 +165,14 @@ def convert_mailto_format(text: str) -> tuple[str, bool]:


 # Map italic, bold and strikethrough Markdown
-def convert_markdown_syntax(text: str, regex: str, zulip_keyword: str) -> str:
+def convert_markdown_syntax(text: str, pattern: str, zulip_keyword: str) -> str:
    """
    Returns:
    1. For strikethrough formatting: This maps Slack's '~strike~' to Zulip's '~~strike~~'
    2. For bold formatting: This maps Slack's '*bold*' to Zulip's '**bold**'
    3. For italic formatting: This maps Slack's '_italic_' to Zulip's '*italic*'
    """
-    for match in re.finditer(regex, text, re.VERBOSE):
+    for match in regex.finditer(pattern, text, re.VERBOSE):
        converted_token = (
            match.group(1)
            + zulip_keyword
--- a/zerver/tests/fixtures/slack_message_conversion.json
+++ b/zerver/tests/fixtures/slack_message_conversion.json
@@ -84,6 +84,11 @@
      "name": "italic_and_strike_conversion",
      "input": "_~italic~_ and ~_strike_~",
      "conversion_output": "*~~italic~~* and ~~*strike*~~"
+    },
+    {
+      "name": "unicode_quotes",
+      "input": "«~strike~» 「*bold*」 ❰_italic_❱",
+      "conversion_output": "«~~strike~~» 「**bold**」 ❰*italic*❱"
    }
  ]
 }