Revert "bugdown: Process word boundaries properly in realm_filters."

This reverts commit ff90c0101c but keeps the test cases it added, for reference. The change was reverted because it was not a clean solution and it also introduced other realm filter bugs involving dashes (etc.).
2025-10-23 04:52:12 +00:00 · 2019-03-07 22:11:54 +05:30
parent faaf84bb01
commit df68a3e963
2 changed files with 14 additions and 39 deletions
--- a/zerver/lib/bugdown/init.py
+++ b/zerver/lib/bugdown/init.py
@@ -10,7 +10,6 @@ import logging
 import traceback
 import urllib
 import re
-import regex
 import os
 import html
 import time
@@ -1485,14 +1484,14 @@ def get_link_re() -> str:
    return normal_compile(LINK_RE)

 def prepare_realm_pattern(source: str) -> str:
-    """ Augment a realm filter to liberally match all occurences of the filter,
-    along with the preceeding and proceeding characters for further analysis in
-    the realm filter pattern and saves what was matched as "name". """
-    return r"""(?P<total>.?(?P<wrap>(?P<name>""" + source + r')).?)'
+    """ Augment a realm filter so it only matches after start-of-string,
+    whitespace, or opening delimiters, won't match if there are word
+    characters directly after, and saves what was matched as "name". """
+    return r"""(?<![^\s'"\(,:<])(?P<name>""" + source + r')(?!\w)'

 # Given a regular expression pattern, linkifies groups that match it
 # using the provided format string to construct the URL.
-class RealmFilterPattern(markdown.inlinepatterns.InlineProcessor):
+class RealmFilterPattern(markdown.inlinepatterns.Pattern):
    """ Applied a given realm filter to the input """

    def __init__(self, source_pattern: str,
@@ -1500,40 +1499,13 @@ class RealmFilterPattern(markdown.inlinepatterns.InlineProcessor):
                 markdown_instance: Optional[markdown.Markdown]=None) -> None:
        self.pattern = prepare_realm_pattern(source_pattern)
        self.format_string = format_string
-        # To properly convert realm patterns in languages that do not use spaces
-        # as separators, we have to apply a somewhat convulated approach. The third
-        # party module `regex` has better unicode support than `re`. Also, we need
-        # to keep two regular expressions because of how word boundaries are computed.
-        #
-        # For example, consider the message:                'hello#123world'
-        # For pattern '#123', computed word boundaries are: 'hello{\b}#123world'
-        # and our pattern's beginning matches even when it
-        # shouldn't. A simple hack is to convert the pattern
-        # to 'a#123a' ('a' is a valid 'word' character).
-        # Now, we get no word boundaries as follows:        'helloa#123world'
-        # and we can safely reject this message.
-        # Conversely, in languages like Japanese that do
-        # not use spaces, a similar message would become:   'チケットは{\b}a#123a{\b}です'
-        # and we can convert this message.
+        markdown.inlinepatterns.Pattern.__init__(self, self.pattern, markdown_instance)

-        # Regex: - (should have nothing but listed symbols before)
-        #        - (should have word boundary on left with 'a')
-        #        - (should have word boundary on right with the second 'a')
-        word_boundary_pattern = r"""(?<![^\w\s'"\(,:<])(\b)a{}a(\b)""".format(source_pattern)
-        flags = regex.WORD | regex.DOTALL | regex.UNICODE
-        self.word_boundary_pattern = regex.compile(word_boundary_pattern, flags=flags)
-        super().__init__(self.pattern, markdown_instance)
-
-    def handleMatch(self, m: Match[str], data: str) -> Tuple[Union[Element, str, None],
-                                                             Union[int, None],
-                                                             Union[int, None]]:
-        string_new = m.group('total').replace(m.group('wrap'), 'a' + m.group('wrap') + 'a')
-        if not self.word_boundary_pattern.search(string_new):
-            return None, None, None
+    def handleMatch(self, m: Match[str]) -> Union[Element, str]:
        db_data = self.markdown.zulip_db_data
        return url_to_a(db_data,
                        self.format_string % m.groupdict(),
-                        m.group("name")), m.start('name'), m.end('name')
+                        m.group("name"))

 class UserMentionPattern(markdown.inlinepatterns.Pattern):
    def handleMatch(self, m: Match[str]) -> Optional[Element]:
--- a/zerver/tests/test_bugdown.py
+++ b/zerver/tests/test_bugdown.py
@@ -794,9 +794,12 @@ class BugdownTest(ZulipTestCase):
        self.assertTrue(not was_converted('Hello #123World'))
        self.assertTrue(not was_converted('Hello#123 World'))
        self.assertTrue(not was_converted('Hello#123World'))
-        self.assertTrue(was_converted('チケットは#123です'))
-        self.assertTrue(was_converted('チケットは #123です'))
-        self.assertTrue(was_converted('チケットは#123 です'))
+        # Ideally, these should be converted, but bugdown doesn't yet
+        # correctly detect word boundaries in languages that don't
+        # separate words with whitespace.
+        self.assertTrue(not was_converted('チケットは#123です'))
+        self.assertTrue(not was_converted('チケットは #123です'))
+        self.assertTrue(not was_converted('チケットは#123 です'))
        self.assertTrue(was_converted('チケットは #123 です'))
        self.assertTrue(was_converted('(#123)'))
        self.assertTrue(was_converted('#123>'))