bugdown: Process word boundaries properly in realm_filters.

Earlier, our realm filters didn't render for languages that do not
use spaces (e.g., Japanese) since we used to check for the presence
of an actual space character. This commit replaces that logic with
a complex scheme to detect word boundaries.

Also, we convert the RealmFilterPattern to subclass InlineProcessor
and make use of the new no-op feature in py-markdown 3.0.1 where we
can tell py-markdown that our pattern didn't find a match despite
the initial regex getting matched.

Fixes #9883.
This commit is contained in:
Rohitt Vashishtha
2019-01-23 19:13:05 +00:00
committed by Tim Abbott
parent ad071ced47
commit ff90c0101c
2 changed files with 55 additions and 8 deletions

View File

@@ -10,6 +10,7 @@ import logging
import traceback import traceback
import urllib import urllib
import re import re
import regex
import os import os
import html import html
import platform import platform
@@ -1486,14 +1487,14 @@ def get_link_re() -> str:
return normal_compile(LINK_RE) return normal_compile(LINK_RE)
def prepare_realm_pattern(source: str) -> str: def prepare_realm_pattern(source: str) -> str:
""" Augment a realm filter so it only matches after start-of-string, """ Augment a realm filter to liberally match all occurrences of the filter,
whitespace, or opening delimiters, won't match if there are word along with the preceding and following characters for further analysis in
characters directly after, and saves what was matched as "name". """ the realm filter pattern and saves what was matched as "name". """
return r"""(?<![^\s'"\(,:<])(?P<name>""" + source + r')(?!\w)' return r"""(?P<total>.?(?P<wrap>(?P<name>""" + source + r')).?)'
# Given a regular expression pattern, linkifies groups that match it # Given a regular expression pattern, linkifies groups that match it
# using the provided format string to construct the URL. # using the provided format string to construct the URL.
class RealmFilterPattern(markdown.inlinepatterns.Pattern): class RealmFilterPattern(markdown.inlinepatterns.InlineProcessor):
""" Applied a given realm filter to the input """ """ Applied a given realm filter to the input """
def __init__(self, source_pattern: str, def __init__(self, source_pattern: str,
@@ -1501,13 +1502,40 @@ class RealmFilterPattern(markdown.inlinepatterns.Pattern):
markdown_instance: Optional[markdown.Markdown]=None) -> None: markdown_instance: Optional[markdown.Markdown]=None) -> None:
self.pattern = prepare_realm_pattern(source_pattern) self.pattern = prepare_realm_pattern(source_pattern)
self.format_string = format_string self.format_string = format_string
markdown.inlinepatterns.Pattern.__init__(self, self.pattern, markdown_instance) # To properly convert realm patterns in languages that do not use spaces
# as separators, we have to apply a somewhat convoluted approach. The third
# party module `regex` has better unicode support than `re`. Also, we need
# to keep two regular expressions because of how word boundaries are computed.
#
# For example, consider the message: 'hello#123world'
# For pattern '#123', computed word boundaries are: 'hello{\b}#123world'
# and our pattern's beginning matches even when it
# shouldn't. A simple hack is to convert the pattern
# to 'a#123a' ('a' is a valid 'word' character).
# Now, we get no word boundaries as follows: 'helloa#123world'
# and we can safely reject this message.
# Conversely, in languages like Japanese that do
# not use spaces, a similar message would become: 'チケットは{\b}a#123a{\b}です'
# and we can convert this message.
def handleMatch(self, m: Match[str]) -> Union[Element, str]: # Regex: - (should have nothing but listed symbols before)
# - (should have word boundary on left with 'a')
# - (should have word boundary on right with the second 'a')
word_boundary_pattern = r"""(?<![^\w\s'"\(,:<])(\b)a{}a(\b)""".format(source_pattern)
flags = regex.WORD | regex.DOTALL | regex.UNICODE
self.word_boundary_pattern = regex.compile(word_boundary_pattern, flags=flags)
super().__init__(self.pattern, markdown_instance)
def handleMatch(self, m: Match[str], data: str) -> Tuple[Union[Element, str, None],
Union[int, None],
Union[int, None]]:
string_new = m.group('total').replace(m.group('wrap'), 'a' + m.group('wrap') + 'a')
if not self.word_boundary_pattern.search(string_new):
return None, None, None
db_data = self.markdown.zulip_db_data db_data = self.markdown.zulip_db_data
return url_to_a(db_data, return url_to_a(db_data,
self.format_string % m.groupdict(), self.format_string % m.groupdict(),
m.group("name")) m.group("name")), m.start('name'), m.end('name')
class UserMentionPattern(markdown.inlinepatterns.Pattern): class UserMentionPattern(markdown.inlinepatterns.Pattern):
def handleMatch(self, m: Match[str]) -> Optional[Element]: def handleMatch(self, m: Match[str]) -> Optional[Element]:

View File

@@ -755,6 +755,25 @@ class BugdownTest(ZulipTestCase):
self.assertEqual(converted, '<p><a href="https://trac.zulip.net/ticket/ZUL-123" target="_blank" title="https://trac.zulip.net/ticket/ZUL-123">#ZUL-123</a> was fixed and code was deployed to production, also <a href="https://trac.zulip.net/ticket/zul-321" target="_blank" title="https://trac.zulip.net/ticket/zul-321">#zul-321</a> was deployed to staging</p>') self.assertEqual(converted, '<p><a href="https://trac.zulip.net/ticket/ZUL-123" target="_blank" title="https://trac.zulip.net/ticket/ZUL-123">#ZUL-123</a> was fixed and code was deployed to production, also <a href="https://trac.zulip.net/ticket/zul-321" target="_blank" title="https://trac.zulip.net/ticket/zul-321">#zul-321</a> was deployed to staging</p>')
def was_converted(content: str) -> bool:
converted = bugdown.convert(content, message_realm=realm, message=msg)
return 'trac.zulip.net' in converted
self.assertTrue(was_converted('Hello #123 World'))
self.assertTrue(not was_converted('Hello #123World'))
self.assertTrue(not was_converted('Hello#123 World'))
self.assertTrue(not was_converted('Hello#123World'))
self.assertTrue(was_converted('チケットは#123です'))
self.assertTrue(was_converted('チケットは #123です'))
self.assertTrue(was_converted('チケットは#123 です'))
self.assertTrue(was_converted('チケットは #123 です'))
self.assertTrue(was_converted('(#123)'))
self.assertTrue(was_converted('#123>'))
self.assertTrue(was_converted('"#123"'))
self.assertTrue(was_converted('#123@'))
self.assertTrue(not was_converted(')#123('))
self.assertTrue(not was_converted('##123'))
def test_maybe_update_markdown_engines(self) -> None: def test_maybe_update_markdown_engines(self) -> None:
realm = get_realm('zulip') realm = get_realm('zulip')
url_format_string = r"https://trac.zulip.net/ticket/%(id)s" url_format_string = r"https://trac.zulip.net/ticket/%(id)s"