slack_regex: Refactor how the capture groups are defined.

This updates the patterns to use a more unicode-aware pattern for their
first and last capture groups. The new patterns have the same
behaviours, but they're expressed in a more coherent way.

For example, the existing patterns lists what characters to look for,
skipping ceratin characters it don't want to match (e.g, closing
brackets & quote are skipped). The new pattern narrows down what it
looks for (whitespace, punctuation, symbols) and explicitly list what it
don't want to match (closing quote and bracket, etc).

This also refactors the `convert_markdown_syntax` to use the `regex`
module instead of the `re` module because the `regex` module has full
unicode support.
This commit is contained in:
PieterCK
2025-04-18 14:54:22 +07:00
committed by Tim Abbott
parent f33c5b9c1e
commit 946e4096fc
2 changed files with 47 additions and 8 deletions

View File

@@ -2,6 +2,8 @@ import re
from itertools import zip_longest
from typing import Any, Literal, TypeAlias, TypedDict, cast
import regex
from zerver.lib.types import Validator
from zerver.lib.validator import (
WildValue,
@@ -48,25 +50,57 @@ SLACK_USERMENTION_REGEX = r"""
# Hence, ~stri~ke doesn't format the word in Slack, but ~~stri~~ke
# formats the word in Zulip
SLACK_STRIKETHROUGH_REGEX = r"""
(^|[ -(]|[+-/]|\*|\_|[:-?]|\{|\[|\||\^) # Start after specified characters
(
# Capture punctuation (\p{P}), white space (\p{Zs}),
# symbols (\p{S}) or newline.
# Skip ~ to not reformat the same string twice
# Skip @ and \
# Skip closing brackets & closing quote (\p{Pf}\p{Pe})
(?![~`@\\\p{Pf}\p{Pe}])
[\p{P}\p{Zs}\p{S}]|^
)
(\~) # followed by a ~
([ -)+-}—]*)([ -}]+) # any character except ~
(\~) # followed by a ~
($|[ -']|[+-/]|[:-?]|\*|\_|\}|\)|\]|\||\^) # ends with specified characters
(
# Capture punctuation, white space, symbols or end of
# line.
# Skip ~ to not reformat the same string twice
# Skip @ and \
# Skip opening brackets & opening quote (\p{Pi}\p{Ps})
(?![~`@\\\p{Pi}\p{Ps}])
[\p{P}\p{Zs}\p{S}]|$
)
"""
SLACK_ITALIC_REGEX = r"""
(^|[ -*]|[+-/]|[:-?]|\{|\[|\||\^|~)
# Same as `SLACK_STRIKETHROUGH_REGEX`s. The difference
# being, this skips _ instead of ~
(
(?![_`@\\\p{Pf}\p{Pe}])
[\p{P}\p{Zs}\p{S}]|^
)
(\_)
([ -^`~—]*)([ -^`-~]+) # any character except _
(\_)
($|[ -']|[+-/]|[:-?]|\}|\)|\]|\*|\||\^|~)
(
(?![_`@\\\p{Pi}\p{Ps}])
[\p{P}\p{Zs}\p{S}]|$
)
"""
SLACK_BOLD_REGEX = r"""
(^|[ -(]|[+-/]|[:-?]|\{|\[|\_|\||\^|~)
# Same as `SLACK_STRIKETHROUGH_REGEX`s. The difference
# being, this skips * instead of ~
(
(?![*`@\\\p{Pf}\p{Pe}])
[\p{P}\p{Zs}\p{S}]|^
)
(\*)
([ -)+-~—]*)([ -)+-~]+) # any character except *
(\*)
($|[ -']|[+-/]|[:-?]|\}|\)|\]|\_|\||\^|~)
(
(?![*`@\\\p{Pi}\p{Ps}])
[\p{P}\p{Zs}\p{S}]|$
)
"""
@@ -131,14 +165,14 @@ def convert_mailto_format(text: str) -> tuple[str, bool]:
# Map italic, bold and strikethrough Markdown
def convert_markdown_syntax(text: str, regex: str, zulip_keyword: str) -> str:
def convert_markdown_syntax(text: str, pattern: str, zulip_keyword: str) -> str:
"""
Returns:
1. For strikethrough formatting: This maps Slack's '~strike~' to Zulip's '~~strike~~'
2. For bold formatting: This maps Slack's '*bold*' to Zulip's '**bold**'
3. For italic formatting: This maps Slack's '_italic_' to Zulip's '*italic*'
"""
for match in re.finditer(regex, text, re.VERBOSE):
for match in regex.finditer(pattern, text, re.VERBOSE):
converted_token = (
match.group(1)
+ zulip_keyword

View File

@@ -84,6 +84,11 @@
"name": "italic_and_strike_conversion",
"input": "_~italic~_ and ~_strike_~",
"conversion_output": "*~~italic~~* and ~~*strike*~~"
},
{
"name": "unicode_quotes",
"input": "«~strike~» 「*bold*」 ❰_italic_❱",
"conversion_output": "«~~strike~~» 「**bold**」 ❰*italic*❱"
}
]
}