bugdown: Process word boundaries properly in realm_filters.

Earlier, our realm filters didn't render for languages that do not use spaces (e.g., Japanese), since we checked for the presence of an actual space character. This commit replaces that logic with a more complex scheme for detecting word boundaries. Also, we convert RealmFilterPattern to subclass InlineProcessor and make use of the new no-op feature in py-markdown 3.0.1, which lets us tell py-markdown that our pattern didn't find a match even though the initial regex matched. Fixes #9883.
2025-10-23 16:14:02 +00:00 · 2019-01-23 19:13:05 +00:00
parent ad071ced47
commit ff90c0101c
2 changed files with 55 additions and 8 deletions
--- a/zerver/lib/bugdown/__init__.py
+++ b/zerver/lib/bugdown/__init__.py
@@ -10,6 +10,7 @@ import logging
 import traceback
 import urllib
 import re
+import regex
 import os
 import html
 import platform
@@ -1486,14 +1487,14 @@ def get_link_re() -> str:
    return normal_compile(LINK_RE)

 def prepare_realm_pattern(source: str) -> str:
-    """ Augment a realm filter so it only matches after start-of-string,
-    whitespace, or opening delimiters, won't match if there are word
-    characters directly after, and saves what was matched as "name". """
-    return r"""(?<![^\s'"\(,:<])(?P<name>""" + source + r')(?!\w)'
+    """ Augment a realm filter to liberally match all occurrences of the filter,
+    along with the preceding and following characters for further analysis in
+    the realm filter pattern, and save what was matched as "name". """
+    return r"""(?P<total>.?(?P<wrap>(?P<name>""" + source + r')).?)'

 # Given a regular expression pattern, linkifies groups that match it
 # using the provided format string to construct the URL.
-class RealmFilterPattern(markdown.inlinepatterns.Pattern):
+class RealmFilterPattern(markdown.inlinepatterns.InlineProcessor):
    """ Applied a given realm filter to the input """

    def __init__(self, source_pattern: str,
@@ -1501,13 +1502,40 @@ class RealmFilterPattern(markdown.inlinepatterns.Pattern):
                 markdown_instance: Optional[markdown.Markdown]=None) -> None:
        self.pattern = prepare_realm_pattern(source_pattern)
        self.format_string = format_string
-        markdown.inlinepatterns.Pattern.__init__(self, self.pattern, markdown_instance)
+        # To properly convert realm patterns in languages that do not use spaces
+        # as separators, we have to apply a somewhat convoluted approach. The third
+        # party module `regex` has better unicode support than `re`. Also, we need
+        # to keep two regular expressions because of how word boundaries are computed.
+        #
+        # For example, consider the message:                'hello#123world'
+        # For pattern '#123', computed word boundaries are: 'hello{\b}#123world'
+        # and our pattern's beginning matches even when it
+        # shouldn't. A simple hack is to convert the pattern
+        # to 'a#123a' ('a' is a valid 'word' character).
+        # Now, we get no word boundaries as follows:        'helloa#123aworld'
+        # and we can safely reject this message.
+        # Conversely, in languages like Japanese that do
+        # not use spaces, a similar message would become:   'チケットは{\b}a#123a{\b}です'
+        # and we can convert this message.

-    def handleMatch(self, m: Match[str]) -> Union[Element, str]:
+        # Regex: - (should have nothing but listed symbols before)
+        #        - (should have word boundary on left with 'a')
+        #        - (should have word boundary on right with the second 'a')
+        word_boundary_pattern = r"""(?<![^\w\s'"\(,:<])(\b)a{}a(\b)""".format(source_pattern)
+        flags = regex.WORD | regex.DOTALL | regex.UNICODE
+        self.word_boundary_pattern = regex.compile(word_boundary_pattern, flags=flags)
+        super().__init__(self.pattern, markdown_instance)
+
+    def handleMatch(self, m: Match[str], data: str) -> Tuple[Union[Element, str, None],
+                                                             Union[int, None],
+                                                             Union[int, None]]:
+        string_new = m.group('total').replace(m.group('wrap'), 'a' + m.group('wrap') + 'a')
+        if not self.word_boundary_pattern.search(string_new):
+            return None, None, None
        db_data = self.markdown.zulip_db_data
        return url_to_a(db_data,
                        self.format_string % m.groupdict(),
-                        m.group("name"))
+                        m.group("name")), m.start('name'), m.end('name')

 class UserMentionPattern(markdown.inlinepatterns.Pattern):
    def handleMatch(self, m: Match[str]) -> Optional[Element]:
--- a/zerver/tests/test_bugdown.py
+++ b/zerver/tests/test_bugdown.py
@@ -755,6 +755,25 @@ class BugdownTest(ZulipTestCase):

        self.assertEqual(converted, '<p><a href="https://trac.zulip.net/ticket/ZUL-123" target="_blank" title="https://trac.zulip.net/ticket/ZUL-123">#ZUL-123</a> was fixed and code was deployed to production, also <a href="https://trac.zulip.net/ticket/zul-321" target="_blank" title="https://trac.zulip.net/ticket/zul-321">#zul-321</a> was deployed to staging</p>')

+        def was_converted(content: str) -> bool:
+            converted = bugdown.convert(content, message_realm=realm, message=msg)
+            return 'trac.zulip.net' in converted
+
+        self.assertTrue(was_converted('Hello #123 World'))
+        self.assertTrue(not was_converted('Hello #123World'))
+        self.assertTrue(not was_converted('Hello#123 World'))
+        self.assertTrue(not was_converted('Hello#123World'))
+        self.assertTrue(was_converted('チケットは#123です'))
+        self.assertTrue(was_converted('チケットは #123です'))
+        self.assertTrue(was_converted('チケットは#123 です'))
+        self.assertTrue(was_converted('チケットは #123 です'))
+        self.assertTrue(was_converted('(#123)'))
+        self.assertTrue(was_converted('#123>'))
+        self.assertTrue(was_converted('"#123"'))
+        self.assertTrue(was_converted('#123@'))
+        self.assertTrue(not was_converted(')#123('))
+        self.assertTrue(not was_converted('##123'))
+
    def test_maybe_update_markdown_engines(self) -> None:
        realm = get_realm('zulip')
        url_format_string = r"https://trac.zulip.net/ticket/%(id)s"