linkifiers: Match JS implementation to server implementation.

Since the server-side implementation no longer uses look-ahead or (more importantly) look-behind, it is possible to exactly implement in Javascript. This removes a common class which would prevent local echo. This requires reworking the topic linking algorithm, to march the server's as well. The tests and behaviour are adjusted in so doing -- previously, the JS implementation would have linked `#foo` with a `foo` regex on the linkifier, but the server implementation would not have.
2025-11-04 05:53:43 +00:00 · 2023-11-14 20:26:31 +00:00
parent 70b20e9d2b
commit 091e2f177b
7 changed files with 68 additions and 79 deletions
--- a/web/src/linkifiers.ts
+++ b/web/src/linkifiers.ts
@@ -42,7 +42,7 @@ function python_to_js_linkifier(
        current_group += 1;
    }
    // Convert any python in-regex flags to RegExp flags
-    let js_flags = "g";
+    let js_flags = "gu";
    const inline_flag_re = /\(\?([Limsux]+)\)/;
    match = inline_flag_re.exec(pattern);

@@ -59,15 +59,11 @@ function python_to_js_linkifier(

        pattern = pattern.replace(inline_flag_re, "");
    }
-    // Ideally we should have been checking that linkifiers
-    // begin with certain characters but since there is no
-    // support for negative lookbehind in javascript, we check
-    // for this condition in `contains_backend_only_syntax()`
-    // function. If the condition is satisfied then the message
-    // is rendered locally, otherwise, we return false there and
-    // message is rendered on the backend which has proper support
-    // for negative lookbehind.
-    pattern = pattern + /(?!\w)/.source;
+    // This boundary-matching must be kept in sync with prepare_linkifier_pattern in
+    // zerver.lib.markdown.  It does not use look-ahead or look-behind, because re2
+    // does not support either.
+    pattern =
+        /(^|\s|\u0085|\p{Z}|['"(,:<])/u.source + "(" + pattern + ")" + /($|[^\p{L}\p{N}])/u.source;
    let final_regex = null;
    try {
        final_regex = new RegExp(pattern, js_flags);
--- a/web/src/markdown.js
+++ b/web/src/markdown.js
@@ -70,31 +70,6 @@ export function translate_emoticons_to_names({src, get_emoticon_translations}) {
    return translated;
 }

-function contains_problematic_linkifier({content, get_linkifier_map}) {
-    // If a linkifier doesn't start with some specified characters
-    // then don't render it locally. It is workaround for the fact that
-    // javascript regex doesn't support lookbehind.
-    for (const re of get_linkifier_map().keys()) {
-        const pattern = /[^\s"'(,:<]/.source + re.source + /(?!\w)/.source;
-        const regex = new RegExp(pattern);
-        if (regex.test(content)) {
-            return true;
-        }
-    }
-
-    return false;
-}
-
-function content_contains_backend_only_syntax({content, get_linkifier_map}) {
-    // Try to guess whether or not a message contains syntax that only the
-    // backend Markdown processor can correctly handle.
-    // If it doesn't, we can immediately render it client-side for local echo.
-    return (
-        contains_preview_link(content) ||
-        contains_problematic_linkifier({content, get_linkifier_map})
-    );
-}
-
 function parse_with_options({raw_content, helper_config, options}) {
    // Given the raw markdown content of a message (raw_content)
    // we return the HTML content (content) and flags.
@@ -317,13 +292,27 @@ export function get_topic_links({topic, get_linkifier_map}) {
    let precedence = 0;

    for (const [pattern, {url_template, group_number_to_name}] of get_linkifier_map().entries()) {
-        let match;
-        while ((match = pattern.exec(topic)) !== null) {
-            const matched_groups = match.slice(1);
+        // Strip off the "g" modifier
+        const non_global_pattern = new RegExp(pattern.source, pattern.flags.replace("g", ""));
+        let pos = 0;
+        while (pos < topic.length) {
+            const match = non_global_pattern.exec(topic.slice(pos));
+            if (match === null) {
+                break;
+            }
+            // Advance position to the start of the body of the match, after the index
+            // offset and any leading whitespace (captured in match[1])
+            pos += match.index + match[1].length;
+
+            // match[0] is the entire match, which may have leading or trailing boundary
+            // characters. match[1] is the leading characters, match[2] is the main body of the
+            // match, and user-provided groups start in match[3].  Slice those user-provided groups
+            // off.
+            const user_groups = match.slice(3, -1);
            let i = 0;
            const template_context = {};
-            while (i < matched_groups.length) {
-                const matched_group = matched_groups[i];
+            while (i < user_groups.length) {
+                const matched_group = user_groups[i];
                const current_group = i + 1;
                template_context[group_number_to_name[current_group]] = matched_group;
                i += 1;
@@ -331,7 +320,13 @@ export function get_topic_links({topic, get_linkifier_map}) {
            const link_url = url_template.expand(template_context);
            // We store the starting index as well, to sort the order of occurrence of the links
            // in the topic, similar to the logic implemented in zerver/lib/markdown/__init__.py
-            links.push({url: link_url, text: match[0], index: match.index, precedence});
+            links.push({url: link_url, text: match[2], index: pos, precedence});
+
+            // Adjust the start point of the match for the next iteration -- we already advanced
+            // to the start of the user match, so now only advance by match[2], so that we don't
+            // include the length of any trailing characters. This means that patterns can overlap
+            // their whitespace.
+            pos += match[2].length;
        }
        precedence += 1;
    }
@@ -448,10 +443,10 @@ function handleEmoji({emoji_name, get_realm_emoji_url, get_emoji_codepoint}) {
 function handleLinkifier({pattern, matches, get_linkifier_map}) {
    const {url_template, group_number_to_name} = get_linkifier_map().get(pattern);

+    const user_groups = matches.slice(1);
    let current_group = 1;
    const template_context = {};
-
-    for (const match of matches) {
+    for (const match of user_groups) {
        template_context[group_number_to_name[current_group]] = match;
        current_group += 1;
    }
@@ -688,10 +683,7 @@ export function add_topic_links(message) {
 }

 export function contains_backend_only_syntax(content) {
-    return content_contains_backend_only_syntax({
-        content,
-        get_linkifier_map: web_app_helpers.get_linkifier_map,
-    });
+    return contains_preview_link(content);
 }

 export function parse_non_message(raw_content) {
--- a/web/tests/linkifiers.test.js
+++ b/web/tests/linkifiers.test.js
@@ -30,7 +30,10 @@ run_test("python_to_js_linkifier", () => {
        },
    ]);
    let actual_value = get_linkifier_regexes();
-    let expected_value = [/\/aa\/g(?!\w)/gim, /\/aa\/g(?!\w)/g];
+    let expected_value = [
+        /(^|\s|\u0085|\p{Z}|['"(,:<])(\/aa\/g)($|[^\p{L}\p{N}])/gimu,
+        /(^|\s|\u0085|\p{Z}|['"(,:<])(\/aa\/g)($|[^\p{L}\p{N}])/gu,
+    ];
    assert.deepEqual(actual_value, expected_value);
    // Test case with multiple replacements.
    linkifiers.update_linkifier_rules([
@@ -41,7 +44,7 @@ run_test("python_to_js_linkifier", () => {
        },
    ]);
    actual_value = get_linkifier_regexes();
-    expected_value = [/#cf(\d+)([A-Z][\dA-Z]*)(?!\w)/g];
+    expected_value = [/(^|\s|\u0085|\p{Z}|['"(,:<])(#cf(\d+)([A-Z][\dA-Z]*))($|[^\p{L}\p{N}])/gu];
    assert.deepEqual(actual_value, expected_value);
    // Test incorrect syntax.
    blueslip.expect("error", "python_to_js_linkifier failure!");
--- a/web/tests/markdown.test.js
+++ b/web/tests/markdown.test.js
@@ -456,9 +456,6 @@ test("marked", () => {
            expected:
                '<blockquote>\n<p>User group mention in quote: <span class="user-group-mention silent" data-user-group-id="2">Backend</span></p>\n</blockquote>\n<blockquote>\n<p>Another user group mention in quote: <span class="user-group-mention silent" data-user-group-id="1">hamletcharacters</span></p>\n</blockquote>',
        },
-        // Test only those linkifiers which don't return True for
-        // `contains_backend_only_syntax()`. Those which return True
-        // are tested separately.
        {
            input: "This is a linkifier #1234 with text after it",
            expected:
@@ -479,6 +476,14 @@ test("marked", () => {
            expected:
                '<p>This is a linkifier with <a href="https://zone_45.zulip.net/ticket/123" title="https://zone_45.zulip.net/ticket/123">ZGROUP_123:45</a> groups</p>',
        },
+        {
+            input: "Here is the PR-#123.",
+            expected: `<p>Here is the PR-<a href="https://trac.example.com/ticket/123" title="https://trac.example.com/ticket/123">#123</a>.</p>`,
+        },
+        {
+            input: "Function abc() was introduced in (PR)#123.",
+            expected: `<p>Function abc() was introduced in (PR)<a href="https://trac.example.com/ticket/123" title="https://trac.example.com/ticket/123">#123</a>.</p>`,
+        },
        {input: "Test *italic*", expected: "<p>Test <em>italic</em></p>"},
        {
            input: "T\n#**Denmark**",
@@ -845,16 +850,6 @@ test("message_flags", () => {
    assert.equal(message.flags.includes("mentioned"), false);
 });

-test("backend_only_linkifiers", () => {
-    const backend_only_linkifiers = [
-        "Here is the PR-#123.",
-        "Function abc() was introduced in (PR)#123.",
-    ];
-    for (const content of backend_only_linkifiers) {
-        assert.equal(markdown.contains_backend_only_syntax(content), true);
-    }
-});
-
 test("translate_emoticons_to_names", () => {
    const get_emoticon_translations = emoji.get_emoticon_translations;

--- a/web/tests/markdown_parse.test.js
+++ b/web/tests/markdown_parse.test.js
@@ -114,7 +114,7 @@ function get_realm_emoji_url(emoji_name) {
    return realm_emoji_map.get(emoji_name);
 }

-const regex = /#foo(\d+)(?!\w)/g;
+const regex = /(^|\s|\u0085|\p{Z}|['"(,:<])(#foo(\d+))($|[^\p{L}\p{N}])/gu;
 const linkifier_map = new Map();
 linkifier_map.set(regex, {
    url_template: url_template_lib.parse("http://foo.com/{id}"),
@@ -287,30 +287,30 @@ run_test("topic links repeated", () => {

 run_test("topic links overlapping", () => {
    linkifiers.initialize([
-        {pattern: "[a-z]+(?P<id>1\\d+) #[a-z]+", url_template: "http://a.com/{id}"},
+        {pattern: "[a-z]+(?P<id>1\\d+) :[a-z]+", url_template: "http://a.com/{id}"},
        {pattern: "[a-z]+(?P<id>1\\d+)", url_template: "http://b.com/{id}"},
-        {pattern: ".+#(?P<id>[a-z]+)", url_template: "http://wildcard.com/{id}"},
-        {pattern: "#(?P<id>[a-z]+)", url_template: "http://c.com/{id}"},
+        {pattern: ".+:(?P<id>[a-z]+)", url_template: "http://wildcard.com/{id}"},
+        {pattern: ":(?P<id>[a-z]+)", url_template: "http://c.com/{id}"},
    ]);
    // b.com's pattern should be matched while it overlaps with c.com's.
-    assert_topic_links("#foo100", [
+    assert_topic_links(":foo100", [
        {
            text: "foo100",
            url: "http://b.com/100",
        },
    ]);
    // a.com's pattern should be matched while it overlaps with b.com's, wildcard.com's and c.com's.
-    assert_topic_links("#asd123 #asd", [
+    assert_topic_links(":asd123 :asd", [
        {
-            text: "asd123 #asd",
+            text: "asd123 :asd",
            url: "http://a.com/123",
        },
    ]);
    // a.com's pattern do not match, wildcard.com's and b.com's patterns should match
    // and the links are ordered by the matched index.
-    assert_topic_links("/#asd #foo100", [
+    assert_topic_links("/:asd :foo100", [
        {
-            text: "/#asd",
+            text: "/:asd",
            url: "http://wildcard.com/asd",
        },
        {
@@ -318,27 +318,27 @@ run_test("topic links overlapping", () => {
            url: "http://b.com/100",
        },
    ]);
-    assert_topic_links("foo.anything/#asd", [
+    assert_topic_links("foo.anything/:asd", [
        {
-            text: "foo.anything/#asd",
+            text: "foo.anything/:asd",
            url: "http://wildcard.com/asd",
        },
    ]);

    // While the raw URL "http://foo.com/foo100" appears before b.com's match "foo100",
    // we prioritize the linkifier match first.
-    assert_topic_links("http://foo.com/foo100", [
+    assert_topic_links("http://foo.com/:foo100", [
        {
            text: "foo100",
            url: "http://b.com/100",
        },
    ]);

-    // Here the raw URL "https://foo.com/#asd" appears after wildcard.com's match "something https://foo.com/#asd".
+    // Here the raw URL "https://foo.com/:asd" appears after wildcard.com's match "something https://foo.com/:asd".
    // The latter is prioritized and the raw URL does not get included.
-    assert_topic_links("something https://foo.com/#asd", [
+    assert_topic_links("something https://foo.com/:asd", [
        {
-            text: "something https://foo.com/#asd",
+            text: "something https://foo.com/:asd",
            url: "http://wildcard.com/asd",
        },
    ]);
--- a/web/third/marked/lib/marked.js
+++ b/web/third/marked/lib/marked.js
@@ -650,10 +650,10 @@ InlineLexer.prototype.output = function(src) {
    regexes.forEach(function (regex) {
      var ret = self.inlineReplacement(regex, src, function(regex, groups, match) {
        // Insert the created URL
-        href = self.linkifier(regex, groups, match);
+        href = self.linkifier(regex, groups.slice(1, -1), match);
        if (href !== undefined) {
          href = escape(href);
-          return self.renderer.link(href, href, match);
+          return groups[0] + self.renderer.link(href, href, groups[1]) + groups.slice(-1)[0]
        } else {
          return match;
        }
--- a/zerver/lib/markdown/init.py
+++ b/zerver/lib/markdown/init.py
@@ -1820,6 +1820,9 @@ def prepare_linkifier_pattern(source: str) -> str:
    # We use an extended definition of 'whitespace' which is
    # equivalent to \p{White_Space} -- since \s in re2 only matches
    # ASCII spaces, and re2 does not support \p{White_Space}.
+    #
+    # This implementation should be kept in sync with the one in
+    # web/src/linkifiers.js
    regex = rf"""
        (?P<{BEFORE_CAPTURE_GROUP}>
            ^  |