bugdown: Restructure Bugdown to extend Markdown from being an extension.

Since we are building our parser from scratch now:

1. We have control over which processor goes at what priority number. Thus, we have also shifted the deprecated `.add()` calls to use the new `.register()` calls with explicit priorities, while maintaining the original order that the old method generated.
2. We do not have to remove the processors added by py-markdown that we do not use in Zulip; we explicitly add only the processors we do require.
3. We can cluster the building of each type of parser in one place, and in the order they need to be in, so that when we register them there is no need to sort the list. This also makes for a huge improvement in the readability of the code, as all the components of each type are registered in the same function.

These are significant performance improvements, because we save on calls to `str.startswith` in `.add()`, all the resources taken to generate the default to-be-removed processors, and the time taken to sort the list of processors.

Following are the profiling results for the changes made. Here, we build 10 engines one after the other and note the time taken to build each of them. 1st pass represents the state after this commit and 2nd pass represents the state after some regex modifications in the commits that follow by Steve Howell. All times are in microseconds.

| nth Engine | Old Time | 1st Pass | 2nd Pass |
| ---------- | -------- | -------- | -------- |
| 1          | 92117.0  | 81775.0  | 76710.0  |
| 2          | 1254.0   | 558.0    | 341.0    |
| 3          | 1170.0   | 472.0    | 305.0    |
| 4          | 1155.0   | 519.0    | 301.0    |
| 5          | 1170.0   | 546.0    | 326.0    |
| 6          | 1271.0   | 609.0    | 416.0    |
| 7          | 1125.0   | 459.0    | 299.0    |
| 8          | 1146.0   | 476.0    | 390.0    |
| 9          | 1274.0   | 446.0    | 301.0    |
| 10         | 1135.0   | 451.0    | 297.0    |
2025-11-08 07:52:19 +00:00 · 2019-01-20 08:10:58 +00:00
parent 9f2c52c86e
commit 434094e599
2 changed files with 114 additions and 149 deletions
--- a/zerver/lib/bugdown/init.py
+++ b/zerver/lib/bugdown/init.py
@@ -570,9 +570,7 @@ class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor):
    TWITTER_MAX_TO_PREVIEW = 3
    INLINE_PREVIEW_LIMIT_PER_MESSAGE = 5
-    def __init__(self, md: markdown.Markdown, bugdown: 'Bugdown') -> None:
+    def __init__(self, md: markdown.Markdown) -> None:
        # Passing in bugdown for access to config to check if realm is zulip.com
        self.bugdown = bugdown
        markdown.treeprocessors.Treeprocessor.__init__(self, md)
    def get_actual_image_url(self, url: str) -> str:
@@ -1681,7 +1679,7 @@ def get_sub_registry(r: markdown.util.Registry, keys: List[str]) -> markdown.uti
 DEFAULT_BUGDOWN_KEY = -1
 ZEPHYR_MIRROR_BUGDOWN_KEY = -2
-class Bugdown(markdown.Extension):
+class Bugdown(markdown.Markdown):
    def __init__(self, *args: Any, **kwargs: Union[bool, int, List[Any]]) -> None:
        # define default configs
        self.config = {
@@ -1693,157 +1691,135 @@ class Bugdown(markdown.Extension):
        }
        super().__init__(*args, **kwargs)
        self.set_output_format('html')
-    def extendMarkdown(self, md: markdown.Markdown, md_globals: Dict[str, Any]) -> None:
+    def build_parser(self) -> markdown.Markdown:
-        del md.preprocessors['reference']
+        # Build the parser using selected default features from py-markdown.
        # The complete list of all available processors can be found in the
        # super().build_parser() function.
        #
        # Note: for any py-markdown updates, manually check if we want any
        # of the new features added upstream or not; they wouldn't get
        # included by default.
        self.preprocessors = self.build_preprocessors()
        self.parser = self.build_block_parser()
        self.inlinePatterns = self.build_inlinepatterns()
        self.treeprocessors = self.build_treeprocessors()
        self.postprocessors = self.build_postprocessors()
        self.handle_zephyr_mirror()
        return self
-        if self.getConfig('code_block_processor_disabled'):
+    def build_preprocessors(self) -> markdown.util.Registry:
-            del md.parser.blockprocessors['code']
+        preprocessors = markdown.util.Registry()
        preprocessors.register(AutoNumberOListPreprocessor(self), 'auto_number_olist', 40)
        preprocessors.register(BugdownUListPreprocessor(self), 'hanging_ulists', 35)
        preprocessors.register(markdown.preprocessors.NormalizeWhitespace(self), 'normalize_whitespace', 30)
        preprocessors.register(fenced_code.FencedBlockPreprocessor(self), 'fenced_code_block', 25)
        preprocessors.register(AlertWordsNotificationProcessor(self), 'custom_text_notifications', 20)
        return preprocessors
-        for k in ('image_link', 'image_reference', 'automail',
+    def build_block_parser(self) -> markdown.util.Registry:
-                  'autolink', 'link', 'reference', 'short_reference',
+        parser = markdown.blockprocessors.BlockParser(self)
-                  'escape', 'strong_em', 'emphasis', 'emphasis2',
+        parser.blockprocessors.register(markdown.blockprocessors.EmptyBlockProcessor(parser), 'empty', 85)
-                  'linebreak', 'strong', 'backtick', 'em_strong',
+        if not self.getConfig('code_block_processor_disabled'):
-                  'strong2'):
+            parser.blockprocessors.register(markdown.blockprocessors.CodeBlockProcessor(parser), 'code', 80)
-            md.inlinePatterns.deregister(k)
+        # We get priority 75 from 'table' extension
-
+        parser.blockprocessors.register(markdown.blockprocessors.HRProcessor(parser), 'hr', 70)
-        try:
+        parser.blockprocessors.register(UListProcessor(parser), 'ulist', 65)
-            # linebreak2 was removed upstream in version 3.2.1, so
+        parser.blockprocessors.register(ListIndentProcessor(parser), 'indent', 60)
-            # don't throw an error if it is not there
+        parser.blockprocessors.register(BlockQuoteProcessor(parser), 'quote', 55)
-            del md.inlinePatterns['linebreak2']
+        parser.blockprocessors.register(markdown.blockprocessors.ParagraphProcessor(parser), 'paragraph', 50)
-        except Exception:
+        return parser
            pass
        # Having the extension operations split into a bunch of
        # smaller functions both helps with organization and
        # simplifies profiling of the markdown engine build time.
        self.extend_alert_words(md)
        self.extend_text_formatting(md)
        self.extend_block_formatting(md)
        self.extend_avatars(md)
        self.extend_modal_links(md)
        self.extend_mentions(md)
        self.extend_stream_links(md)
        self.extend_emojis(md)
        self.extend_misc(md)
    def extend_alert_words(self, md: markdown.Markdown) -> None:
        md.preprocessors.add("custom_text_notifications", AlertWordsNotificationProcessor(md), "_end")
    def extend_text_formatting(self, md: markdown.Markdown) -> None:
        # Inline code block without whitespace stripping
        md.inlinePatterns.add(
            "backtick",
            BacktickPattern(r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\3(?!`))'),
            "_begin")
        md.inlinePatterns.add(
            'strong_em',
            markdown.inlinepatterns.DoubleTagPattern(
                r'(\*\*\*)(?!\s+)([^\*^\n]+)(?<!\s)\*\*\*', 'strong,em'),
            '>backtick')
        # Custom bold syntax: **foo** but not __foo__
        md.inlinePatterns.add('strong',
                              markdown.inlinepatterns.SimpleTagPattern(r'(\*\*)([^\n]+?)\2', 'strong'),
                              '>not_strong')
    def build_inlinepatterns(self) -> markdown.util.Registry:
        # Declare regexes for clean single line calls to .register().
        NOT_STRONG_RE = markdown.inlinepatterns.NOT_STRONG_RE
        # Custom strikethrough syntax: ~~foo~~
-        md.inlinePatterns.add('del',
+        DEL_RE = r'(?<!~)(\~\~)([^~\n]+?)(\~\~)(?!~)'
-                              markdown.inlinepatterns.SimpleTagPattern(
+        # Custom bold syntax: **foo** but not __foo__
                                  r'(?<!~)(\~\~)([^~\n]+?)(\~\~)(?!~)', 'del'), '>strong')
        # str inside ** must start and end with a word character
        # it need for things like "const char *x = (char *)y"
-        md.inlinePatterns.add(
+        EMPHASIS_RE = r'(\*)(?!\s+)([^\*^\n]+)(?<!\s)\*'
-            'emphasis',
+        ENTITY_RE = markdown.inlinepatterns.ENTITY_RE
-            markdown.inlinepatterns.SimpleTagPattern(r'(\*)(?!\s+)([^\*^\n]+)(?<!\s)\*', 'em'),
+        STRONG_EM_RE = r'(\*\*\*)(?!\s+)([^\*^\n]+)(?<!\s)\*\*\*'
-            '>strong')
+        # Inline code block without whitespace stripping
        BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\3(?!`))'
-    def extend_block_formatting(self, md: markdown.Markdown) -> None:
+        # Add Inline Patterns
-        for k in ('hashheader', 'setextheader', 'olist', 'ulist', 'indent', 'quote'):
+        reg = markdown.util.Registry()
-            del md.parser.blockprocessors[k]
+        reg.register(BacktickPattern(BACKTICK_RE), 'backtick', 105)
-
+        reg.register(markdown.inlinepatterns.DoubleTagPattern(STRONG_EM_RE, 'strong,em'), 'strong_em', 100)
-        md.parser.blockprocessors.add('ulist', UListProcessor(md.parser), '>hr')
+        reg.register(UserMentionPattern(mention.find_mentions, self), 'usermention', 95)
-        md.parser.blockprocessors.add('indent', ListIndentProcessor(md.parser), '<ulist')
+        reg.register(Tex(r'\B(?<!\$)\$\$(?P<body>[^\n_$](\\\$|[^$\n])*)\$\$(?!\$)\B'), 'tex', 90)
-        md.parser.blockprocessors.add('quote', BlockQuoteProcessor(md.parser), '<ulist')
+        reg.register(StreamPattern(verbose_compile(STREAM_LINK_REGEX), self), 'stream', 85)
-
+        reg.register(Avatar(AVATAR_REGEX, self), 'avatar', 80)
-    def extend_avatars(self, md: markdown.Markdown) -> None:
+        reg.register(ModalLink(r'!modal_link\((?P<relative_url>[^)]*), (?P<text>[^)]*)\)'), 'modal_link', 75)
        # Note that !gravatar syntax should be deprecated long term.
-        md.inlinePatterns.add('avatar', Avatar(AVATAR_REGEX, md), '>backtick')
+        reg.register(Avatar(GRAVATAR_REGEX, self), 'gravatar', 70)
-        md.inlinePatterns.add('gravatar', Avatar(GRAVATAR_REGEX, md), '>backtick')
+        reg.register(UserGroupMentionPattern(mention.user_group_mentions, self), 'usergroupmention', 65)
-
+        reg.register(AtomicLinkPattern(get_link_re(), self), 'link', 60)
-    def extend_modal_links(self, md: markdown.Markdown) -> None:
+        reg.register(AutoLink(get_web_link_regex(), self), 'autolink', 55)
-        md.inlinePatterns.add(
+        # Reserve priority 45-54 for Realm Filters
-            'modal_link',
+        reg = self.register_realm_filters(reg)
-            ModalLink(r'!modal_link\((?P<relative_url>[^)]*), (?P<text>[^)]*)\)'),
+        reg.register(markdown.inlinepatterns.HtmlInlineProcessor(ENTITY_RE, self), 'entity', 40)
-            '>avatar')
+        reg.register(markdown.inlinepatterns.SimpleTagPattern(r'(\*\*)([^\n]+?)\2', 'strong'), 'strong', 35)
-
+        reg.register(markdown.inlinepatterns.SimpleTagPattern(EMPHASIS_RE, 'em'), 'emphasis', 30)
-    def extend_mentions(self, md: markdown.Markdown) -> None:
+        reg.register(markdown.inlinepatterns.SimpleTagPattern(DEL_RE, 'del'), 'del', 25)
-        md.inlinePatterns.add('usermention', UserMentionPattern(mention.find_mentions, md), '>backtick')
+        reg.register(markdown.inlinepatterns.SimpleTextInlineProcessor(NOT_STRONG_RE), 'not_strong', 20)
-        md.inlinePatterns.add('usergroupmention',
+        reg.register(Emoji(EMOJI_REGEX, self), 'emoji', 15)
-                              UserGroupMentionPattern(mention.user_group_mentions, md),
+        reg.register(EmoticonTranslation(emoticon_regex, self), 'translate_emoticons', 10)
-                              '>backtick')
+        # We get priority 5 from 'nl2br' extension
-
+        reg.register(UnicodeEmoji(unicode_emoji_regex), 'unicodeemoji', 0)
-    def extend_stream_links(self, md: markdown.Markdown) -> None:
+        return reg
        md.inlinePatterns.add('stream', StreamPattern(verbose_compile(STREAM_LINK_REGEX), md), '>backtick')
    def extend_emojis(self, md: markdown.Markdown) -> None:
        md.inlinePatterns.add(
            'tex',
            Tex(r'\B(?<!\$)\$\$(?P<body>[^\n_$](\\\$|[^$\n])*)\$\$(?!\$)\B'),
            '>backtick')
        md.inlinePatterns.add('emoji', Emoji(EMOJI_REGEX, md), '<nl')
        md.inlinePatterns.add('translate_emoticons', EmoticonTranslation(emoticon_regex, md), '>emoji')
        md.inlinePatterns.add('unicodeemoji', UnicodeEmoji(unicode_emoji_regex), '_end')
    def extend_misc(self, md: markdown.Markdown) -> None:
        md.inlinePatterns.add('link', AtomicLinkPattern(get_link_re(), md), '>avatar')
    def register_realm_filters(self, inlinePatterns: markdown.util.Registry) -> markdown.util.Registry:
        for (pattern, format_string, id) in self.getConfig("realm_filters"):
-            md.inlinePatterns.add('realm_filters/%s' % (pattern,),
+            inlinePatterns.register(RealmFilterPattern(pattern, format_string, self),
-                                  RealmFilterPattern(pattern, format_string, md), '>link')
+                                    'realm_filters/%s' % (pattern), 45)
-
+        return inlinePatterns
        md.inlinePatterns.add('autolink', AutoLink(get_web_link_regex(), md), '>link')
        md.preprocessors.add('hanging_ulists',
                             BugdownUListPreprocessor(md),
                             "_begin")
        md.preprocessors.add('auto_number_olist',
                             AutoNumberOListPreprocessor(md),
                             "_begin")
        md.treeprocessors.add("inline_interesting_links", InlineInterestingLinkProcessor(md, self), "_end")
    def build_treeprocessors(self) -> markdown.util.Registry:
        treeprocessors = markdown.util.Registry()
        # We get priority 30 from 'hilite' extension
        treeprocessors.register(markdown.treeprocessors.InlineProcessor(self), 'inline', 25)
        treeprocessors.register(markdown.treeprocessors.PrettifyTreeprocessor(self), 'prettify', 20)
        treeprocessors.register(InlineInterestingLinkProcessor(self), 'inline_interesting_links', 15)
        if settings.CAMO_URI:
-            md.treeprocessors.add("rewrite_to_https", InlineHttpsProcessor(md), "_end")
+            treeprocessors.register(InlineHttpsProcessor(self), 'rewrite_to_https', 10)
        return treeprocessors
    def build_postprocessors(self) -> markdown.util.Registry:
        postprocessors = markdown.util.Registry()
        postprocessors.register(markdown.postprocessors.RawHtmlPostprocessor(self), 'raw_html', 20)
        postprocessors.register(markdown.postprocessors.AndSubstitutePostprocessor(), 'amp_substitute', 15)
        postprocessors.register(markdown.postprocessors.UnescapePostprocessor(), 'unescape', 10)
        return postprocessors
    def getConfig(self, key: str, default: str='') -> Any:
        """ Return a setting for the given key or an empty string. """
        if key in self.config:
            return self.config[key][0]
        else:
            return default
    def handle_zephyr_mirror(self) -> None:
        if self.getConfig("realm") == ZEPHYR_MIRROR_BUGDOWN_KEY:
            # Disable almost all inline patterns for zephyr mirror
            # users' traffic that is mirrored.  Note that
            # inline_interesting_links is a treeprocessor and thus is
            # not removed
-            md.inlinePatterns = get_sub_registry(md.inlinePatterns, ['autolink'])
+            self.inlinePatterns = get_sub_registry(self.inlinePatterns, ['autolink'])
-            md.treeprocessors = get_sub_registry(md.treeprocessors,
+            self.treeprocessors = get_sub_registry(self.treeprocessors, ['inline_interesting_links',
                                                 ['inline_interesting_links',
                                                                         'rewrite_to_https'])
-            # insert new 'inline' processor because we have changed md.inlinePatterns
+            # insert new 'inline' processor because we have changed self.inlinePatterns
            # but InlineProcessor copies md as self.md in __init__.
-            md.treeprocessors.add('inline',
+            self.treeprocessors.register(markdown.treeprocessors.InlineProcessor(self), 'inline', 25)
-                                  markdown.treeprocessors.InlineProcessor(md),
+            self.preprocessors = get_sub_registry(self.preprocessors, ['custom_text_notifications'])
-                                  '>inline_interesting_links')
+            self.parser.blockprocessors = get_sub_registry(self.parser.blockprocessors, ['paragraph'])
            md.preprocessors = get_sub_registry(md.preprocessors, ['custom_text_notifications'])
            md.parser.blockprocessors = get_sub_registry(md.parser.blockprocessors, ['paragraph'])
 md_engines = {}  # type: Dict[Tuple[int, bool], markdown.Markdown]
 realm_filter_data = {}  # type: Dict[int, List[Tuple[str, str, int]]]
 class EscapeHtml(markdown.Extension):
    def extendMarkdown(self, md: markdown.Markdown, md_globals: Dict[str, Any]) -> None:
        del md.preprocessors['html_block']
        del md.inlinePatterns['html']
 def make_md_engine(realm_filters_key: int, email_gateway: bool) -> None:
    md_engine_key = (realm_filters_key, email_gateway)
    if md_engine_key in md_engines:
@@ -1859,8 +1835,10 @@ def make_md_engine(realm_filters_key: int, email_gateway: bool) -> None:
 def build_engine(realm_filters: List[Tuple[str, str, int]],
                 realm_filters_key: int,
                 email_gateway: bool) -> markdown.Markdown:
-    engine = markdown.Markdown(
+    engine = Bugdown(
-        output_format = 'html',
+        realm_filters=realm_filters,
        realm=realm_filters_key,
        code_block_processor_disabled=email_gateway,
        extensions = [
            nl2br.makeExtension(),
            tables.makeExtension(),
@@ -1868,11 +1846,7 @@ def build_engine(realm_filters: List[Tuple[str, str, int]],
                linenums=False,
                guess_lang=False
            ),
-            fenced_code.makeExtension(),
+        ])
            EscapeHtml(),
            Bugdown(realm_filters=realm_filters,
                    realm=realm_filters_key,
                    code_block_processor_disabled=email_gateway)])
    return engine
 def topic_links(realm_filters_key: int, topic_name: str) -> List[str]:
--- a/zerver/lib/bugdown/fenced_code.py
+++ b/zerver/lib/bugdown/fenced_code.py
@@ -113,16 +113,7 @@ class FencedCodeExtension(markdown.Extension):
    def extendMarkdown(self, md: markdown.Markdown, md_globals: Dict[str, Any]) -> None:
        """ Add FencedBlockPreprocessor to the Markdown instance. """
        md.registerExtension(self)
-
+        md.preprocessors.register(FencedBlockPreprocessor(md), 'fenced_code_block', 25)
        # Newer versions of Python-Markdown (starting at 2.3?) have
        # a normalize_whitespace preprocessor that needs to go first.
        position = ('>normalize_whitespace'
                    if 'normalize_whitespace' in md.preprocessors
                    else '_begin')
        md.preprocessors.add('fenced_code_block',
                             FencedBlockPreprocessor(md),
                             position)
 class BaseHandler: