bugdown: Process word boundaries properly in realm_filters.

Earlier, our realm filters did not render for languages that do not use spaces (e.g., Japanese), because we checked for the presence of an actual space character. This commit replaces that logic with a more involved scheme for detecting word boundaries. We also convert RealmFilterPattern to subclass InlineProcessor and make use of the no-op feature in py-markdown 3.0.1, which lets us tell py-markdown that our pattern did not find a match even though its initial regex matched. Fixes #9883.
Committed by Tim Abbott
parent ad071ced47
commit ff90c0101c
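
The "no-op feature" referred to in the commit message is the InlineProcessor.handleMatch convention of returning None, None, None to decline a match; py-markdown then continues as if the pattern's regex had never matched at that position. Below is a minimal, self-contained sketch of that convention (EvenTicketProcessor, the example URL, and the registration priority are invented for illustration; this is not Zulip code):

    import xml.etree.ElementTree as etree

    import markdown
    from markdown.extensions import Extension
    from markdown.inlinepatterns import InlineProcessor

    class EvenTicketProcessor(InlineProcessor):
        """Linkify #N only when N is even, to demonstrate declining a match."""

        def handleMatch(self, m, data):
            if int(m.group(1)) % 2 != 0:
                # Tell py-markdown that the pattern did not really match here,
                # even though its regex matched.
                return None, None, None
            el = etree.Element('a')
            el.set('href', 'https://example.com/ticket/%s' % m.group(1))
            el.text = m.group(0)
            return el, m.start(0), m.end(0)

    class EvenTicketExtension(Extension):
        def extendMarkdown(self, md):
            md.inlinePatterns.register(EvenTicketProcessor(r'#(\d+)', md), 'even_ticket', 75)

    print(markdown.markdown('see #12 and #13', extensions=[EvenTicketExtension()]))
    # Expected output: <p>see <a href="https://example.com/ticket/12">#12</a> and #13</p>
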
@@ -10,6 +10,7 @@ import logging
 import traceback
 import urllib
 import re
+import regex
 import os
 import html
 import platform
@@ -1486,14 +1487,14 @@ def get_link_re() -> str:
     return normal_compile(LINK_RE)
 
 def prepare_realm_pattern(source: str) -> str:
-    """ Augment a realm filter so it only matches after start-of-string,
-    whitespace, or opening delimiters, won't match if there are word
-    characters directly after, and saves what was matched as "name". """
-    return r"""(?<![^\s'"\(,:<])(?P<name>""" + source + r')(?!\w)'
+    """ Augment a realm filter to liberally match all occurrences of the filter,
+    along with the preceding and following characters, for further analysis in
+    the realm filter pattern, and save what was matched as "name". """
+    return r"""(?P<total>.?(?P<wrap>(?P<name>""" + source + r')).?)'
 
 # Given a regular expression pattern, linkifies groups that match it
 # using the provided format string to construct the URL.
-class RealmFilterPattern(markdown.inlinepatterns.Pattern):
+class RealmFilterPattern(markdown.inlinepatterns.InlineProcessor):
     """ Applied a given realm filter to the input """
 
     def __init__(self, source_pattern: str,
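
To make the new group structure concrete, here is a small illustration of what the liberal pattern captures (the realm filter #(?P<id>[0-9]+) and the sample message are made up for this example):

    import re

    # Hypothetical realm filter; real filters come from the realm's settings.
    source = r'#(?P<id>[0-9]+)'
    liberal = r"""(?P<total>.?(?P<wrap>(?P<name>""" + source + r')).?)'

    m = re.search(liberal, 'hello#123world')
    print(m.group('total'), m.group('wrap'), m.group('name'), m.group('id'))
    # o#123w #123 #123 123

The single characters of surrounding context captured in "total" are what handleMatch later inspects to decide whether a real word boundary is present.
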
@@ -1501,13 +1502,40 @@ class RealmFilterPattern(markdown.inlinepatterns.Pattern):
                  markdown_instance: Optional[markdown.Markdown]=None) -> None:
         self.pattern = prepare_realm_pattern(source_pattern)
         self.format_string = format_string
-        markdown.inlinepatterns.Pattern.__init__(self, self.pattern, markdown_instance)
+        # To properly convert realm patterns in languages that do not use spaces
+        # as separators, we have to apply a somewhat convoluted approach. The third
+        # party module `regex` has better Unicode support than `re`. Also, we need
+        # to keep two regular expressions because of how word boundaries are computed.
+        #
+        # For example, consider the message: 'hello#123world'
+        # For pattern '#123', computed word boundaries are: 'hello{\b}#123world'
+        # and our pattern's beginning matches even when it
+        # shouldn't. A simple hack is to convert the pattern
+        # to 'a#123a' ('a' is a valid 'word' character).
+        # Now, we get no word boundaries as follows: 'helloa#123world'
+        # and we can safely reject this message.
+        # Conversely, in languages like Japanese that do
+        # not use spaces, a similar message would become: 'チケットは{\b}a#123a{\b}です'
+        # and we can convert this message.
 
-    def handleMatch(self, m: Match[str]) -> Union[Element, str]:
+        # Regex: - (should have nothing but the listed symbols before the match)
+        #        - (should have a word boundary on the left, next to the first 'a')
+        #        - (should have a word boundary on the right, next to the second 'a')
+        word_boundary_pattern = r"""(?<![^\w\s'"\(,:<])(\b)a{}a(\b)""".format(source_pattern)
+        flags = regex.WORD | regex.DOTALL | regex.UNICODE
+        self.word_boundary_pattern = regex.compile(word_boundary_pattern, flags=flags)
+        super().__init__(self.pattern, markdown_instance)
+
+    def handleMatch(self, m: Match[str], data: str) -> Tuple[Union[Element, str, None],
+                                                             Union[int, None],
+                                                             Union[int, None]]:
+        string_new = m.group('total').replace(m.group('wrap'), 'a' + m.group('wrap') + 'a')
+        if not self.word_boundary_pattern.search(string_new):
+            return None, None, None
         db_data = self.markdown.zulip_db_data
         return url_to_a(db_data,
                         self.format_string % m.groupdict(),
-                        m.group("name"))
+                        m.group("name")), m.start('name'), m.end('name')
 
 class UserMentionPattern(markdown.inlinepatterns.Pattern):
     def handleMatch(self, m: Match[str]) -> Optional[Element]:
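
To see why the wrap-in-'a' trick from the comment above gives the right answer, here is a minimal, standalone sketch of the same check (the source pattern #[0-9]+ and the accepts() helper are made up for the example; only the regular expression and flags mirror the code above):

    import regex  # third-party module with Unicode-aware word boundaries

    source_pattern = r'#[0-9]+'  # hypothetical realm filter
    word_boundary_pattern = regex.compile(
        r"""(?<![^\w\s'"\(,:<])(\b)a{}a(\b)""".format(source_pattern),
        flags=regex.WORD | regex.DOTALL | regex.UNICODE)

    def accepts(total: str, wrap: str) -> bool:
        # Mirror of handleMatch: wrap the matched filter in 'a's and require
        # word boundaries on both sides of the wrapped text.
        string_new = total.replace(wrap, 'a' + wrap + 'a')
        return word_boundary_pattern.search(string_new) is not None

    print(accepts('o#123w', '#123'))    # False -- 'hello#123world' is rejected
    print(accepts('は#123で', '#123'))  # True  -- 'チケットは#123です' is linkified

When the check fails, handleMatch returns None, None, None, which is the py-markdown no-op described earlier, so the message is left untouched.
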
@@ -755,6 +755,25 @@ class BugdownTest(ZulipTestCase):
 
         self.assertEqual(converted, '<p><a href="https://trac.zulip.net/ticket/ZUL-123" target="_blank" title="https://trac.zulip.net/ticket/ZUL-123">#ZUL-123</a> was fixed and code was deployed to production, also <a href="https://trac.zulip.net/ticket/zul-321" target="_blank" title="https://trac.zulip.net/ticket/zul-321">#zul-321</a> was deployed to staging</p>')
 
+        def was_converted(content: str) -> bool:
+            converted = bugdown.convert(content, message_realm=realm, message=msg)
+            return 'trac.zulip.net' in converted
+
+        self.assertTrue(was_converted('Hello #123 World'))
+        self.assertTrue(not was_converted('Hello #123World'))
+        self.assertTrue(not was_converted('Hello#123 World'))
+        self.assertTrue(not was_converted('Hello#123World'))
+        self.assertTrue(was_converted('チケットは#123です'))
+        self.assertTrue(was_converted('チケットは #123です'))
+        self.assertTrue(was_converted('チケットは#123 です'))
+        self.assertTrue(was_converted('チケットは #123 です'))
+        self.assertTrue(was_converted('(#123)'))
+        self.assertTrue(was_converted('#123>'))
+        self.assertTrue(was_converted('"#123"'))
+        self.assertTrue(was_converted('#123@'))
+        self.assertTrue(not was_converted(')#123('))
+        self.assertTrue(not was_converted('##123'))
+
     def test_maybe_update_markdown_engines(self) -> None:
         realm = get_realm('zulip')
         url_format_string = r"https://trac.zulip.net/ticket/%(id)s"