mirror of
https://github.com/zulip/zulip.git
synced 2025-11-01 20:44:04 +00:00
slack_regex: Refactor how the capture groups are defined.
This updates the patterns to use a more unicode-aware pattern for their first and last capture groups. The new patterns have the same behaviours, but they're expressed in a more coherent way. For example, the existing patterns lists what characters to look for, skipping ceratin characters it don't want to match (e.g, closing brackets & quote are skipped). The new pattern narrows down what it looks for (whitespace, punctuation, symbols) and explicitly list what it don't want to match (closing quote and bracket, etc). This also refactors the `convert_markdown_syntax` to use the `regex` module instead of the `re` module because the `regex` module has full unicode support.
This commit is contained in:
@@ -2,6 +2,8 @@ import re
|
||||
from itertools import zip_longest
|
||||
from typing import Any, Literal, TypeAlias, TypedDict, cast
|
||||
|
||||
import regex
|
||||
|
||||
from zerver.lib.types import Validator
|
||||
from zerver.lib.validator import (
|
||||
WildValue,
|
||||
@@ -48,25 +50,57 @@ SLACK_USERMENTION_REGEX = r"""
|
||||
# Hence, ~stri~ke doesn't format the word in Slack, but ~~stri~~ke
|
||||
# formats the word in Zulip
|
||||
SLACK_STRIKETHROUGH_REGEX = r"""
|
||||
(^|[ -(]|[+-/]|\*|\_|[:-?]|\{|\[|\||\^) # Start after specified characters
|
||||
(
|
||||
# Capture punctuation (\p{P}), white space (\p{Zs}),
|
||||
# symbols (\p{S}) or newline.
|
||||
# Skip ~ to not reformat the same string twice
|
||||
# Skip @ and \
|
||||
# Skip closing brackets & closing quote (\p{Pf}\p{Pe})
|
||||
(?![~`@\\\p{Pf}\p{Pe}])
|
||||
[\p{P}\p{Zs}\p{S}]|^
|
||||
)
|
||||
(\~) # followed by a ~
|
||||
([ -)+-}—]*)([ -}]+) # any character except ~
|
||||
(\~) # followed by a ~
|
||||
($|[ -']|[+-/]|[:-?]|\*|\_|\}|\)|\]|\||\^) # ends with specified characters
|
||||
(
|
||||
# Capture punctuation, white space, symbols or end of
|
||||
# line.
|
||||
# Skip ~ to not reformat the same string twice
|
||||
# Skip @ and \
|
||||
# Skip opening brackets & opening quote (\p{Pi}\p{Ps})
|
||||
(?![~`@\\\p{Pi}\p{Ps}])
|
||||
[\p{P}\p{Zs}\p{S}]|$
|
||||
)
|
||||
"""
|
||||
SLACK_ITALIC_REGEX = r"""
|
||||
(^|[ -*]|[+-/]|[:-?]|\{|\[|\||\^|~)
|
||||
# Same as `SLACK_STRIKETHROUGH_REGEX`s. The difference
|
||||
# being, this skips _ instead of ~
|
||||
(
|
||||
(?![_`@\\\p{Pf}\p{Pe}])
|
||||
[\p{P}\p{Zs}\p{S}]|^
|
||||
)
|
||||
(\_)
|
||||
([ -^`~—]*)([ -^`-~]+) # any character except _
|
||||
(\_)
|
||||
($|[ -']|[+-/]|[:-?]|\}|\)|\]|\*|\||\^|~)
|
||||
(
|
||||
(?![_`@\\\p{Pi}\p{Ps}])
|
||||
[\p{P}\p{Zs}\p{S}]|$
|
||||
)
|
||||
"""
|
||||
SLACK_BOLD_REGEX = r"""
|
||||
(^|[ -(]|[+-/]|[:-?]|\{|\[|\_|\||\^|~)
|
||||
# Same as `SLACK_STRIKETHROUGH_REGEX`s. The difference
|
||||
# being, this skips * instead of ~
|
||||
(
|
||||
(?![*`@\\\p{Pf}\p{Pe}])
|
||||
[\p{P}\p{Zs}\p{S}]|^
|
||||
)
|
||||
(\*)
|
||||
([ -)+-~—]*)([ -)+-~]+) # any character except *
|
||||
(\*)
|
||||
($|[ -']|[+-/]|[:-?]|\}|\)|\]|\_|\||\^|~)
|
||||
(
|
||||
(?![*`@\\\p{Pi}\p{Ps}])
|
||||
[\p{P}\p{Zs}\p{S}]|$
|
||||
)
|
||||
"""
|
||||
|
||||
|
||||
@@ -131,14 +165,14 @@ def convert_mailto_format(text: str) -> tuple[str, bool]:
|
||||
|
||||
|
||||
# Map italic, bold and strikethrough Markdown
|
||||
def convert_markdown_syntax(text: str, regex: str, zulip_keyword: str) -> str:
|
||||
def convert_markdown_syntax(text: str, pattern: str, zulip_keyword: str) -> str:
|
||||
"""
|
||||
Returns:
|
||||
1. For strikethrough formatting: This maps Slack's '~strike~' to Zulip's '~~strike~~'
|
||||
2. For bold formatting: This maps Slack's '*bold*' to Zulip's '**bold**'
|
||||
3. For italic formatting: This maps Slack's '_italic_' to Zulip's '*italic*'
|
||||
"""
|
||||
for match in re.finditer(regex, text, re.VERBOSE):
|
||||
for match in regex.finditer(pattern, text, re.VERBOSE):
|
||||
converted_token = (
|
||||
match.group(1)
|
||||
+ zulip_keyword
|
||||
|
||||
@@ -84,6 +84,11 @@
|
||||
"name": "italic_and_strike_conversion",
|
||||
"input": "_~italic~_ and ~_strike_~",
|
||||
"conversion_output": "*~~italic~~* and ~~*strike*~~"
|
||||
},
|
||||
{
|
||||
"name": "unicode_quotes",
|
||||
"input": "«~strike~» 「*bold*」 ❰_italic_❱",
|
||||
"conversion_output": "«~~strike~~» 「**bold**」 ❰*italic*❱"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user