bugdown: Process word boundaries properly in realm_filters.

Earlier, our realm filters didn't render for languages that do not
use spaces (e.g., Japanese) since we used to check for the presence
of an actual space character. This commit replaces that logic with
a complex scheme to detect word boundaries.

Also, we convert the RealmFilterPattern to subclass InlineProcessor
and make use of the new no-op feature in py-markdown 3.0.1 where we
can tell py-markdown that our pattern didn't find a match despite
the initial regex getting matched.

Fixes #9883.
This commit is contained in:
Rohitt Vashishtha
2019-01-23 19:13:05 +00:00
committed by Tim Abbott
parent ad071ced47
commit ff90c0101c
2 changed files with 55 additions and 8 deletions

View File

@@ -10,6 +10,7 @@ import logging
import traceback
import urllib
import re
import regex
import os
import html
import platform
@@ -1486,14 +1487,14 @@ def get_link_re() -> str:
return normal_compile(LINK_RE)
def prepare_realm_pattern(source: str) -> str:
""" Augment a realm filter so it only matches after start-of-string,
whitespace, or opening delimiters, won't match if there are word
characters directly after, and saves what was matched as "name". """
return r"""(?<![^\s'"\(,:<])(?P<name>""" + source + r')(?!\w)'
""" Augment a realm filter to liberally match all occurrences of the filter,
along with the preceding and following characters for further analysis in
the realm filter pattern, and save what was matched as "name". """
# "total" spans the filter match plus at most one character on each side;
# "wrap" and "name" both capture the filter match itself.
return r"""(?P<total>.?(?P<wrap>(?P<name>""" + source + r')).?)'
# Given a regular expression pattern, linkifies groups that match it
# using the provided format string to construct the URL.
class RealmFilterPattern(markdown.inlinepatterns.Pattern):
class RealmFilterPattern(markdown.inlinepatterns.InlineProcessor):
""" Applies a given realm filter to the input """
def __init__(self, source_pattern: str,
@@ -1501,13 +1502,40 @@ class RealmFilterPattern(markdown.inlinepatterns.Pattern):
markdown_instance: Optional[markdown.Markdown]=None) -> None:
self.pattern = prepare_realm_pattern(source_pattern)
self.format_string = format_string
markdown.inlinepatterns.Pattern.__init__(self, self.pattern, markdown_instance)
# To properly convert realm patterns in languages that do not use spaces
# as separators, we have to apply a somewhat convoluted approach. The
# third-party module `regex` has better Unicode support than `re`. Also, we need
# to keep two regular expressions because of how word boundaries are computed.
#
# For example, consider the message: 'hello#123world'
# For pattern '#123', computed word boundaries are: 'hello{\b}#123world'
# and our pattern's beginning matches even when it
# shouldn't. A simple hack is to convert the pattern
# to 'a#123a' ('a' is a valid 'word' character).
# Now, we get no word boundaries as follows: 'helloa#123aworld'
# and we can safely reject this message.
# Conversely, in languages like Japanese that do
# not use spaces, a similar message would become: 'チケットは{\b}a#123a{\b}です'
# and we can convert this message.
def handleMatch(self, m: Match[str]) -> Union[Element, str]:
# Regex: - (should have nothing but listed symbols before)
# - (should have word boundary on left with 'a')
# - (should have word boundary on right with the second 'a')
word_boundary_pattern = r"""(?<![^\w\s'"\(,:<])(\b)a{}a(\b)""".format(source_pattern)
flags = regex.WORD | regex.DOTALL | regex.UNICODE
self.word_boundary_pattern = regex.compile(word_boundary_pattern, flags=flags)
super().__init__(self.pattern, markdown_instance)
def handleMatch(self, m: Match[str], data: str) -> Tuple[Union[Element, str, None],
Union[int, None],
Union[int, None]]:
string_new = m.group('total').replace(m.group('wrap'), 'a' + m.group('wrap') + 'a')
if not self.word_boundary_pattern.search(string_new):
return None, None, None
db_data = self.markdown.zulip_db_data
return url_to_a(db_data,
self.format_string % m.groupdict(),
m.group("name"))
m.group("name")), m.start('name'), m.end('name')
class UserMentionPattern(markdown.inlinepatterns.Pattern):
def handleMatch(self, m: Match[str]) -> Optional[Element]:

View File

@@ -755,6 +755,25 @@ class BugdownTest(ZulipTestCase):
self.assertEqual(converted, '<p><a href="https://trac.zulip.net/ticket/ZUL-123" target="_blank" title="https://trac.zulip.net/ticket/ZUL-123">#ZUL-123</a> was fixed and code was deployed to production, also <a href="https://trac.zulip.net/ticket/zul-321" target="_blank" title="https://trac.zulip.net/ticket/zul-321">#zul-321</a> was deployed to staging</p>')
def was_converted(content: str) -> bool:
    """Return True if the rendered output for `content` contains 'trac.zulip.net'."""
    return 'trac.zulip.net' in bugdown.convert(content, message_realm=realm, message=msg)
self.assertTrue(was_converted('Hello #123 World'))
self.assertTrue(not was_converted('Hello #123World'))
self.assertTrue(not was_converted('Hello#123 World'))
self.assertTrue(not was_converted('Hello#123World'))
self.assertTrue(was_converted('チケットは#123です'))
self.assertTrue(was_converted('チケットは #123です'))
self.assertTrue(was_converted('チケットは#123 です'))
self.assertTrue(was_converted('チケットは #123 です'))
self.assertTrue(was_converted('(#123)'))
self.assertTrue(was_converted('#123>'))
self.assertTrue(was_converted('"#123"'))
self.assertTrue(was_converted('#123@'))
self.assertTrue(not was_converted(')#123('))
self.assertTrue(not was_converted('##123'))
def test_maybe_update_markdown_engines(self) -> None:
realm = get_realm('zulip')
url_format_string = r"https://trac.zulip.net/ticket/%(id)s"