bugdown: Restructure Bugdown to extend Markdown instead of being an extension.

Since we are building our parser from scratch now:

1. We have control over which processor goes at what priority number.
   Thus, we have also shifted the deprecated `.add()` calls to use the
   new `.register()` calls with explicit priorities, but maintaining
   the original order that the old method generated.

2. We do not have to remove the processors added by py-markdown that
   we do not use in Zulip; we explicitly add only the processors we
   do require.

3. We can cluster the building of each type of parser in one place,
   and in the order they need to be in, so that when we register them,
   there is no need to sort the list. This also makes for a huge
   improvement in the readability of the code, as all the components
   of each type are registered in the same function.

These are significant performance improvements, because we save on
calls to `str.startswith` in `.add()`, all the resources taken to
generate the default to-be-removed processors and the time taken to
sort the list of processors.

Following are the profiling results for the changes made. Here, we
build 10 engines one after the other and note the time taken to build
each of them. The 1st pass represents the state after this commit, and
the 2nd pass represents the state after some regex modifications made by
Steve Howell in the commits that follow. All times are in microseconds.

| nth Engine | Old Time | 1st Pass | 2nd Pass |
| ---------- | -------- | -------- | -------- |
|          1 |  92117.0 |  81775.0 |  76710.0 |
|          2 |   1254.0 |    558.0 |    341.0 |
|          3 |   1170.0 |    472.0 |    305.0 |
|          4 |   1155.0 |    519.0 |    301.0 |
|          5 |   1170.0 |    546.0 |    326.0 |
|          6 |   1271.0 |    609.0 |    416.0 |
|          7 |   1125.0 |    459.0 |    299.0 |
|          8 |   1146.0 |    476.0 |    390.0 |
|          9 |   1274.0 |    446.0 |    301.0 |
|         10 |   1135.0 |    451.0 |    297.0 |
This commit is contained in:
Rohitt Vashishtha
2019-01-20 08:10:58 +00:00
committed by Tim Abbott
parent 9f2c52c86e
commit 434094e599
2 changed files with 114 additions and 149 deletions

View File

@@ -570,9 +570,7 @@ class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor):
TWITTER_MAX_TO_PREVIEW = 3 TWITTER_MAX_TO_PREVIEW = 3
INLINE_PREVIEW_LIMIT_PER_MESSAGE = 5 INLINE_PREVIEW_LIMIT_PER_MESSAGE = 5
def __init__(self, md: markdown.Markdown, bugdown: 'Bugdown') -> None: def __init__(self, md: markdown.Markdown) -> None:
# Passing in bugdown for access to config to check if realm is zulip.com
self.bugdown = bugdown
markdown.treeprocessors.Treeprocessor.__init__(self, md) markdown.treeprocessors.Treeprocessor.__init__(self, md)
def get_actual_image_url(self, url: str) -> str: def get_actual_image_url(self, url: str) -> str:
@@ -1681,7 +1679,7 @@ def get_sub_registry(r: markdown.util.Registry, keys: List[str]) -> markdown.uti
DEFAULT_BUGDOWN_KEY = -1 DEFAULT_BUGDOWN_KEY = -1
ZEPHYR_MIRROR_BUGDOWN_KEY = -2 ZEPHYR_MIRROR_BUGDOWN_KEY = -2
class Bugdown(markdown.Extension): class Bugdown(markdown.Markdown):
def __init__(self, *args: Any, **kwargs: Union[bool, int, List[Any]]) -> None: def __init__(self, *args: Any, **kwargs: Union[bool, int, List[Any]]) -> None:
# define default configs # define default configs
self.config = { self.config = {
@@ -1693,157 +1691,135 @@ class Bugdown(markdown.Extension):
} }
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self.set_output_format('html')
def extendMarkdown(self, md: markdown.Markdown, md_globals: Dict[str, Any]) -> None: def build_parser(self) -> markdown.Markdown:
del md.preprocessors['reference'] # Build the parser using selected default features from py-markdown.
# The complete list of all available processors can be found in the
# super().build_parser() function.
#
# Note: for any py-markdown updates, manually check if we want any
# of the new features added upstream or not; they wouldn't get
# included by default.
self.preprocessors = self.build_preprocessors()
self.parser = self.build_block_parser()
self.inlinePatterns = self.build_inlinepatterns()
self.treeprocessors = self.build_treeprocessors()
self.postprocessors = self.build_postprocessors()
self.handle_zephyr_mirror()
return self
if self.getConfig('code_block_processor_disabled'): def build_preprocessors(self) -> markdown.util.Registry:
del md.parser.blockprocessors['code'] preprocessors = markdown.util.Registry()
preprocessors.register(AutoNumberOListPreprocessor(self), 'auto_number_olist', 40)
preprocessors.register(BugdownUListPreprocessor(self), 'hanging_ulists', 35)
preprocessors.register(markdown.preprocessors.NormalizeWhitespace(self), 'normalize_whitespace', 30)
preprocessors.register(fenced_code.FencedBlockPreprocessor(self), 'fenced_code_block', 25)
preprocessors.register(AlertWordsNotificationProcessor(self), 'custom_text_notifications', 20)
return preprocessors
for k in ('image_link', 'image_reference', 'automail', def build_block_parser(self) -> markdown.util.Registry:
'autolink', 'link', 'reference', 'short_reference', parser = markdown.blockprocessors.BlockParser(self)
'escape', 'strong_em', 'emphasis', 'emphasis2', parser.blockprocessors.register(markdown.blockprocessors.EmptyBlockProcessor(parser), 'empty', 85)
'linebreak', 'strong', 'backtick', 'em_strong', if not self.getConfig('code_block_processor_disabled'):
'strong2'): parser.blockprocessors.register(markdown.blockprocessors.CodeBlockProcessor(parser), 'code', 80)
md.inlinePatterns.deregister(k) # We get priority 75 from 'table' extension
parser.blockprocessors.register(markdown.blockprocessors.HRProcessor(parser), 'hr', 70)
try: parser.blockprocessors.register(UListProcessor(parser), 'ulist', 65)
# linebreak2 was removed upstream in version 3.2.1, so parser.blockprocessors.register(ListIndentProcessor(parser), 'indent', 60)
# don't throw an error if it is not there parser.blockprocessors.register(BlockQuoteProcessor(parser), 'quote', 55)
del md.inlinePatterns['linebreak2'] parser.blockprocessors.register(markdown.blockprocessors.ParagraphProcessor(parser), 'paragraph', 50)
except Exception: return parser
pass
# Having the extension operations split into a bunch of
# smaller functions both helps with organization and
# simplifies profiling of the markdown engine build time.
self.extend_alert_words(md)
self.extend_text_formatting(md)
self.extend_block_formatting(md)
self.extend_avatars(md)
self.extend_modal_links(md)
self.extend_mentions(md)
self.extend_stream_links(md)
self.extend_emojis(md)
self.extend_misc(md)
def extend_alert_words(self, md: markdown.Markdown) -> None:
md.preprocessors.add("custom_text_notifications", AlertWordsNotificationProcessor(md), "_end")
def extend_text_formatting(self, md: markdown.Markdown) -> None:
# Inline code block without whitespace stripping
md.inlinePatterns.add(
"backtick",
BacktickPattern(r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\3(?!`))'),
"_begin")
md.inlinePatterns.add(
'strong_em',
markdown.inlinepatterns.DoubleTagPattern(
r'(\*\*\*)(?!\s+)([^\*^\n]+)(?<!\s)\*\*\*', 'strong,em'),
'>backtick')
# Custom bold syntax: **foo** but not __foo__
md.inlinePatterns.add('strong',
markdown.inlinepatterns.SimpleTagPattern(r'(\*\*)([^\n]+?)\2', 'strong'),
'>not_strong')
def build_inlinepatterns(self) -> markdown.util.Registry:
# Declare regexes for clean single line calls to .register().
NOT_STRONG_RE = markdown.inlinepatterns.NOT_STRONG_RE
# Custom strikethrough syntax: ~~foo~~ # Custom strikethrough syntax: ~~foo~~
md.inlinePatterns.add('del', DEL_RE = r'(?<!~)(\~\~)([^~\n]+?)(\~\~)(?!~)'
markdown.inlinepatterns.SimpleTagPattern( # Custom bold syntax: **foo** but not __foo__
r'(?<!~)(\~\~)([^~\n]+?)(\~\~)(?!~)', 'del'), '>strong')
# str inside ** must start and end with a word character # str inside ** must start and end with a word character
# it need for things like "const char *x = (char *)y" # it need for things like "const char *x = (char *)y"
md.inlinePatterns.add( EMPHASIS_RE = r'(\*)(?!\s+)([^\*^\n]+)(?<!\s)\*'
'emphasis', ENTITY_RE = markdown.inlinepatterns.ENTITY_RE
markdown.inlinepatterns.SimpleTagPattern(r'(\*)(?!\s+)([^\*^\n]+)(?<!\s)\*', 'em'), STRONG_EM_RE = r'(\*\*\*)(?!\s+)([^\*^\n]+)(?<!\s)\*\*\*'
'>strong') # Inline code block without whitespace stripping
BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\3(?!`))'
def extend_block_formatting(self, md: markdown.Markdown) -> None: # Add Inline Patterns
for k in ('hashheader', 'setextheader', 'olist', 'ulist', 'indent', 'quote'): reg = markdown.util.Registry()
del md.parser.blockprocessors[k] reg.register(BacktickPattern(BACKTICK_RE), 'backtick', 105)
reg.register(markdown.inlinepatterns.DoubleTagPattern(STRONG_EM_RE, 'strong,em'), 'strong_em', 100)
md.parser.blockprocessors.add('ulist', UListProcessor(md.parser), '>hr') reg.register(UserMentionPattern(mention.find_mentions, self), 'usermention', 95)
md.parser.blockprocessors.add('indent', ListIndentProcessor(md.parser), '<ulist') reg.register(Tex(r'\B(?<!\$)\$\$(?P<body>[^\n_$](\\\$|[^$\n])*)\$\$(?!\$)\B'), 'tex', 90)
md.parser.blockprocessors.add('quote', BlockQuoteProcessor(md.parser), '<ulist') reg.register(StreamPattern(verbose_compile(STREAM_LINK_REGEX), self), 'stream', 85)
reg.register(Avatar(AVATAR_REGEX, self), 'avatar', 80)
def extend_avatars(self, md: markdown.Markdown) -> None: reg.register(ModalLink(r'!modal_link\((?P<relative_url>[^)]*), (?P<text>[^)]*)\)'), 'modal_link', 75)
# Note that !gravatar syntax should be deprecated long term. # Note that !gravatar syntax should be deprecated long term.
md.inlinePatterns.add('avatar', Avatar(AVATAR_REGEX, md), '>backtick') reg.register(Avatar(GRAVATAR_REGEX, self), 'gravatar', 70)
md.inlinePatterns.add('gravatar', Avatar(GRAVATAR_REGEX, md), '>backtick') reg.register(UserGroupMentionPattern(mention.user_group_mentions, self), 'usergroupmention', 65)
reg.register(AtomicLinkPattern(get_link_re(), self), 'link', 60)
def extend_modal_links(self, md: markdown.Markdown) -> None: reg.register(AutoLink(get_web_link_regex(), self), 'autolink', 55)
md.inlinePatterns.add( # Reserve priority 45-54 for Realm Filters
'modal_link', reg = self.register_realm_filters(reg)
ModalLink(r'!modal_link\((?P<relative_url>[^)]*), (?P<text>[^)]*)\)'), reg.register(markdown.inlinepatterns.HtmlInlineProcessor(ENTITY_RE, self), 'entity', 40)
'>avatar') reg.register(markdown.inlinepatterns.SimpleTagPattern(r'(\*\*)([^\n]+?)\2', 'strong'), 'strong', 35)
reg.register(markdown.inlinepatterns.SimpleTagPattern(EMPHASIS_RE, 'em'), 'emphasis', 30)
def extend_mentions(self, md: markdown.Markdown) -> None: reg.register(markdown.inlinepatterns.SimpleTagPattern(DEL_RE, 'del'), 'del', 25)
md.inlinePatterns.add('usermention', UserMentionPattern(mention.find_mentions, md), '>backtick') reg.register(markdown.inlinepatterns.SimpleTextInlineProcessor(NOT_STRONG_RE), 'not_strong', 20)
md.inlinePatterns.add('usergroupmention', reg.register(Emoji(EMOJI_REGEX, self), 'emoji', 15)
UserGroupMentionPattern(mention.user_group_mentions, md), reg.register(EmoticonTranslation(emoticon_regex, self), 'translate_emoticons', 10)
'>backtick') # We get priority 5 from 'nl2br' extension
reg.register(UnicodeEmoji(unicode_emoji_regex), 'unicodeemoji', 0)
def extend_stream_links(self, md: markdown.Markdown) -> None: return reg
md.inlinePatterns.add('stream', StreamPattern(verbose_compile(STREAM_LINK_REGEX), md), '>backtick')
def extend_emojis(self, md: markdown.Markdown) -> None:
md.inlinePatterns.add(
'tex',
Tex(r'\B(?<!\$)\$\$(?P<body>[^\n_$](\\\$|[^$\n])*)\$\$(?!\$)\B'),
'>backtick')
md.inlinePatterns.add('emoji', Emoji(EMOJI_REGEX, md), '<nl')
md.inlinePatterns.add('translate_emoticons', EmoticonTranslation(emoticon_regex, md), '>emoji')
md.inlinePatterns.add('unicodeemoji', UnicodeEmoji(unicode_emoji_regex), '_end')
def extend_misc(self, md: markdown.Markdown) -> None:
md.inlinePatterns.add('link', AtomicLinkPattern(get_link_re(), md), '>avatar')
def register_realm_filters(self, inlinePatterns: markdown.util.Registry) -> markdown.util.Registry:
for (pattern, format_string, id) in self.getConfig("realm_filters"): for (pattern, format_string, id) in self.getConfig("realm_filters"):
md.inlinePatterns.add('realm_filters/%s' % (pattern,), inlinePatterns.register(RealmFilterPattern(pattern, format_string, self),
RealmFilterPattern(pattern, format_string, md), '>link') 'realm_filters/%s' % (pattern), 45)
return inlinePatterns
md.inlinePatterns.add('autolink', AutoLink(get_web_link_regex(), md), '>link')
md.preprocessors.add('hanging_ulists',
BugdownUListPreprocessor(md),
"_begin")
md.preprocessors.add('auto_number_olist',
AutoNumberOListPreprocessor(md),
"_begin")
md.treeprocessors.add("inline_interesting_links", InlineInterestingLinkProcessor(md, self), "_end")
def build_treeprocessors(self) -> markdown.util.Registry:
treeprocessors = markdown.util.Registry()
# We get priority 30 from 'hilite' extension
treeprocessors.register(markdown.treeprocessors.InlineProcessor(self), 'inline', 25)
treeprocessors.register(markdown.treeprocessors.PrettifyTreeprocessor(self), 'prettify', 20)
treeprocessors.register(InlineInterestingLinkProcessor(self), 'inline_interesting_links', 15)
if settings.CAMO_URI: if settings.CAMO_URI:
md.treeprocessors.add("rewrite_to_https", InlineHttpsProcessor(md), "_end") treeprocessors.register(InlineHttpsProcessor(self), 'rewrite_to_https', 10)
return treeprocessors
def build_postprocessors(self) -> markdown.util.Registry:
postprocessors = markdown.util.Registry()
postprocessors.register(markdown.postprocessors.RawHtmlPostprocessor(self), 'raw_html', 20)
postprocessors.register(markdown.postprocessors.AndSubstitutePostprocessor(), 'amp_substitute', 15)
postprocessors.register(markdown.postprocessors.UnescapePostprocessor(), 'unescape', 10)
return postprocessors
def getConfig(self, key: str, default: str='') -> Any:
""" Return a setting for the given key or an empty string. """
if key in self.config:
return self.config[key][0]
else:
return default
def handle_zephyr_mirror(self) -> None:
if self.getConfig("realm") == ZEPHYR_MIRROR_BUGDOWN_KEY: if self.getConfig("realm") == ZEPHYR_MIRROR_BUGDOWN_KEY:
# Disable almost all inline patterns for zephyr mirror # Disable almost all inline patterns for zephyr mirror
# users' traffic that is mirrored. Note that # users' traffic that is mirrored. Note that
# inline_interesting_links is a treeprocessor and thus is # inline_interesting_links is a treeprocessor and thus is
# not removed # not removed
md.inlinePatterns = get_sub_registry(md.inlinePatterns, ['autolink']) self.inlinePatterns = get_sub_registry(self.inlinePatterns, ['autolink'])
md.treeprocessors = get_sub_registry(md.treeprocessors, self.treeprocessors = get_sub_registry(self.treeprocessors, ['inline_interesting_links',
['inline_interesting_links',
'rewrite_to_https']) 'rewrite_to_https'])
# insert new 'inline' processor because we have changed md.inlinePatterns # insert new 'inline' processor because we have changed self.inlinePatterns
# but InlineProcessor copies md as self.md in __init__. # but InlineProcessor copies md as self.md in __init__.
md.treeprocessors.add('inline', self.treeprocessors.register(markdown.treeprocessors.InlineProcessor(self), 'inline', 25)
markdown.treeprocessors.InlineProcessor(md), self.preprocessors = get_sub_registry(self.preprocessors, ['custom_text_notifications'])
'>inline_interesting_links') self.parser.blockprocessors = get_sub_registry(self.parser.blockprocessors, ['paragraph'])
md.preprocessors = get_sub_registry(md.preprocessors, ['custom_text_notifications'])
md.parser.blockprocessors = get_sub_registry(md.parser.blockprocessors, ['paragraph'])
md_engines = {} # type: Dict[Tuple[int, bool], markdown.Markdown] md_engines = {} # type: Dict[Tuple[int, bool], markdown.Markdown]
realm_filter_data = {} # type: Dict[int, List[Tuple[str, str, int]]] realm_filter_data = {} # type: Dict[int, List[Tuple[str, str, int]]]
class EscapeHtml(markdown.Extension):
def extendMarkdown(self, md: markdown.Markdown, md_globals: Dict[str, Any]) -> None:
del md.preprocessors['html_block']
del md.inlinePatterns['html']
def make_md_engine(realm_filters_key: int, email_gateway: bool) -> None: def make_md_engine(realm_filters_key: int, email_gateway: bool) -> None:
md_engine_key = (realm_filters_key, email_gateway) md_engine_key = (realm_filters_key, email_gateway)
if md_engine_key in md_engines: if md_engine_key in md_engines:
@@ -1859,8 +1835,10 @@ def make_md_engine(realm_filters_key: int, email_gateway: bool) -> None:
def build_engine(realm_filters: List[Tuple[str, str, int]], def build_engine(realm_filters: List[Tuple[str, str, int]],
realm_filters_key: int, realm_filters_key: int,
email_gateway: bool) -> markdown.Markdown: email_gateway: bool) -> markdown.Markdown:
engine = markdown.Markdown( engine = Bugdown(
output_format = 'html', realm_filters=realm_filters,
realm=realm_filters_key,
code_block_processor_disabled=email_gateway,
extensions = [ extensions = [
nl2br.makeExtension(), nl2br.makeExtension(),
tables.makeExtension(), tables.makeExtension(),
@@ -1868,11 +1846,7 @@ def build_engine(realm_filters: List[Tuple[str, str, int]],
linenums=False, linenums=False,
guess_lang=False guess_lang=False
), ),
fenced_code.makeExtension(), ])
EscapeHtml(),
Bugdown(realm_filters=realm_filters,
realm=realm_filters_key,
code_block_processor_disabled=email_gateway)])
return engine return engine
def topic_links(realm_filters_key: int, topic_name: str) -> List[str]: def topic_links(realm_filters_key: int, topic_name: str) -> List[str]:

View File

@@ -113,16 +113,7 @@ class FencedCodeExtension(markdown.Extension):
def extendMarkdown(self, md: markdown.Markdown, md_globals: Dict[str, Any]) -> None: def extendMarkdown(self, md: markdown.Markdown, md_globals: Dict[str, Any]) -> None:
""" Add FencedBlockPreprocessor to the Markdown instance. """ """ Add FencedBlockPreprocessor to the Markdown instance. """
md.registerExtension(self) md.registerExtension(self)
md.preprocessors.register(FencedBlockPreprocessor(md), 'fenced_code_block', 25)
# Newer versions of Python-Markdown (starting at 2.3?) have
# a normalize_whitespace preprocessor that needs to go first.
position = ('>normalize_whitespace'
if 'normalize_whitespace' in md.preprocessors
else '_begin')
md.preprocessors.add('fenced_code_block',
FencedBlockPreprocessor(md),
position)
class BaseHandler: class BaseHandler: