diff --git a/web/src/linkifiers.ts b/web/src/linkifiers.ts index 26d3e029c6..6a055f0025 100644 --- a/web/src/linkifiers.ts +++ b/web/src/linkifiers.ts @@ -42,7 +42,7 @@ function python_to_js_linkifier( current_group += 1; } // Convert any python in-regex flags to RegExp flags - let js_flags = "g"; + let js_flags = "gu"; const inline_flag_re = /\(\?([Limsux]+)\)/; match = inline_flag_re.exec(pattern); @@ -59,15 +59,11 @@ function python_to_js_linkifier( pattern = pattern.replace(inline_flag_re, ""); } - // Ideally we should have been checking that linkifiers - // begin with certain characters but since there is no - // support for negative lookbehind in javascript, we check - // for this condition in `contains_backend_only_syntax()` - // function. If the condition is satisfied then the message - // is rendered locally, otherwise, we return false there and - // message is rendered on the backend which has proper support - // for negative lookbehind. - pattern = pattern + /(?!\w)/.source; + // This boundary-matching must be kept in sync with prepare_linkifier_pattern in + // zerver.lib.markdown. It does not use look-ahead or look-behind, because re2 + // does not support either. + pattern = + /(^|\s|\u0085|\p{Z}|['"(,:<])/u.source + "(" + pattern + ")" + /($|[^\p{L}\p{N}])/u.source; let final_regex = null; try { final_regex = new RegExp(pattern, js_flags); diff --git a/web/src/markdown.js b/web/src/markdown.js index c89e5de21a..84bee8b57b 100644 --- a/web/src/markdown.js +++ b/web/src/markdown.js @@ -70,31 +70,6 @@ export function translate_emoticons_to_names({src, get_emoticon_translations}) { return translated; } -function contains_problematic_linkifier({content, get_linkifier_map}) { - // If a linkifier doesn't start with some specified characters - // then don't render it locally. It is workaround for the fact that - // javascript regex doesn't support lookbehind. 
- for (const re of get_linkifier_map().keys()) { - const pattern = /[^\s"'(,:<]/.source + re.source + /(?!\w)/.source; - const regex = new RegExp(pattern); - if (regex.test(content)) { - return true; - } - } - - return false; -} - -function content_contains_backend_only_syntax({content, get_linkifier_map}) { - // Try to guess whether or not a message contains syntax that only the - // backend Markdown processor can correctly handle. - // If it doesn't, we can immediately render it client-side for local echo. - return ( - contains_preview_link(content) || - contains_problematic_linkifier({content, get_linkifier_map}) - ); -} - function parse_with_options({raw_content, helper_config, options}) { // Given the raw markdown content of a message (raw_content) // we return the HTML content (content) and flags. @@ -317,13 +292,27 @@ export function get_topic_links({topic, get_linkifier_map}) { let precedence = 0; for (const [pattern, {url_template, group_number_to_name}] of get_linkifier_map().entries()) { - let match; - while ((match = pattern.exec(topic)) !== null) { - const matched_groups = match.slice(1); + // Strip off the "g" modifier + const non_global_pattern = new RegExp(pattern.source, pattern.flags.replace("g", "")); + let pos = 0; + while (pos < topic.length) { + const match = non_global_pattern.exec(topic.slice(pos)); + if (match === null) { + break; + } + // Advance position to the start of the body of the match, after the index + // offset and any leading whitespace (captured in match[1]) + pos += match.index + match[1].length; + + // match[0] is the entire match, which may have leading or trailing boundary + // characters. match[1] is the leading characters, match[2] is the main body of the + // match, and user-provided groups start in match[3]. Slice those user-provided groups + // off. 
+ const user_groups = match.slice(3, -1); let i = 0; const template_context = {}; - while (i < matched_groups.length) { - const matched_group = matched_groups[i]; + while (i < user_groups.length) { + const matched_group = user_groups[i]; const current_group = i + 1; template_context[group_number_to_name[current_group]] = matched_group; i += 1; @@ -331,7 +320,13 @@ export function get_topic_links({topic, get_linkifier_map}) { const link_url = url_template.expand(template_context); // We store the starting index as well, to sort the order of occurrence of the links // in the topic, similar to the logic implemented in zerver/lib/markdown/__init__.py - links.push({url: link_url, text: match[0], index: match.index, precedence}); + links.push({url: link_url, text: match[2], index: pos, precedence}); + + // Adjust the start point of the match for the next iteration -- we already advanced + // to the start of the user match, so now only advance by match[2], so that we don't + // include the length of any trailing characters. This means that patterns can overlap + // their whitespace. 
+ pos += match[2].length; } precedence += 1; } @@ -448,10 +443,10 @@ function handleEmoji({emoji_name, get_realm_emoji_url, get_emoji_codepoint}) { function handleLinkifier({pattern, matches, get_linkifier_map}) { const {url_template, group_number_to_name} = get_linkifier_map().get(pattern); + const user_groups = matches.slice(1); let current_group = 1; const template_context = {}; - - for (const match of matches) { + for (const match of user_groups) { template_context[group_number_to_name[current_group]] = match; current_group += 1; } @@ -688,10 +683,7 @@ export function add_topic_links(message) { } export function contains_backend_only_syntax(content) { - return content_contains_backend_only_syntax({ - content, - get_linkifier_map: web_app_helpers.get_linkifier_map, - }); + return contains_preview_link(content); } export function parse_non_message(raw_content) { diff --git a/web/tests/linkifiers.test.js b/web/tests/linkifiers.test.js index fcb56d4e8b..c09b3234ca 100644 --- a/web/tests/linkifiers.test.js +++ b/web/tests/linkifiers.test.js @@ -30,7 +30,10 @@ run_test("python_to_js_linkifier", () => { }, ]); let actual_value = get_linkifier_regexes(); - let expected_value = [/\/aa\/g(?!\w)/gim, /\/aa\/g(?!\w)/g]; + let expected_value = [ + /(^|\s|\u0085|\p{Z}|['"(,:<])(\/aa\/g)($|[^\p{L}\p{N}])/gimu, + /(^|\s|\u0085|\p{Z}|['"(,:<])(\/aa\/g)($|[^\p{L}\p{N}])/gu, + ]; assert.deepEqual(actual_value, expected_value); // Test case with multiple replacements. linkifiers.update_linkifier_rules([ @@ -41,7 +44,7 @@ run_test("python_to_js_linkifier", () => { }, ]); actual_value = get_linkifier_regexes(); - expected_value = [/#cf(\d+)([A-Z][\dA-Z]*)(?!\w)/g]; + expected_value = [/(^|\s|\u0085|\p{Z}|['"(,:<])(#cf(\d+)([A-Z][\dA-Z]*))($|[^\p{L}\p{N}])/gu]; assert.deepEqual(actual_value, expected_value); // Test incorrect syntax. 
blueslip.expect("error", "python_to_js_linkifier failure!"); diff --git a/web/tests/markdown.test.js b/web/tests/markdown.test.js index 50513127ae..23b548de17 100644 --- a/web/tests/markdown.test.js +++ b/web/tests/markdown.test.js @@ -456,9 +456,6 @@ test("marked", () => { expected: '
\n

User group mention in quote: Backend

\n
\n
\n

Another user group mention in quote: hamletcharacters

\n
', }, - // Test only those linkifiers which don't return True for - // `contains_backend_only_syntax()`. Those which return True - // are tested separately. { input: "This is a linkifier #1234 with text after it", expected: @@ -479,6 +476,14 @@ test("marked", () => { expected: '

This is a linkifier with ZGROUP_123:45 groups

', }, + { + input: "Here is the PR-#123.", + expected: `

Here is the PR-#123.

`, + }, + { + input: "Function abc() was introduced in (PR)#123.", + expected: `

Function abc() was introduced in (PR)#123.

`, + }, {input: "Test *italic*", expected: "

Test italic

"}, { input: "T\n#**Denmark**", @@ -845,16 +850,6 @@ test("message_flags", () => { assert.equal(message.flags.includes("mentioned"), false); }); -test("backend_only_linkifiers", () => { - const backend_only_linkifiers = [ - "Here is the PR-#123.", - "Function abc() was introduced in (PR)#123.", - ]; - for (const content of backend_only_linkifiers) { - assert.equal(markdown.contains_backend_only_syntax(content), true); - } -}); - test("translate_emoticons_to_names", () => { const get_emoticon_translations = emoji.get_emoticon_translations; diff --git a/web/tests/markdown_parse.test.js b/web/tests/markdown_parse.test.js index 1e53500078..6b150443f6 100644 --- a/web/tests/markdown_parse.test.js +++ b/web/tests/markdown_parse.test.js @@ -114,7 +114,7 @@ function get_realm_emoji_url(emoji_name) { return realm_emoji_map.get(emoji_name); } -const regex = /#foo(\d+)(?!\w)/g; +const regex = /(^|\s|\u0085|\p{Z}|['"(,:<])(#foo(\d+))($|[^\p{L}\p{N}])/gu; const linkifier_map = new Map(); linkifier_map.set(regex, { url_template: url_template_lib.parse("http://foo.com/{id}"), @@ -287,30 +287,30 @@ run_test("topic links repeated", () => { run_test("topic links overlapping", () => { linkifiers.initialize([ - {pattern: "[a-z]+(?P1\\d+) #[a-z]+", url_template: "http://a.com/{id}"}, + {pattern: "[a-z]+(?P1\\d+) :[a-z]+", url_template: "http://a.com/{id}"}, {pattern: "[a-z]+(?P1\\d+)", url_template: "http://b.com/{id}"}, - {pattern: ".+#(?P[a-z]+)", url_template: "http://wildcard.com/{id}"}, - {pattern: "#(?P[a-z]+)", url_template: "http://c.com/{id}"}, + {pattern: ".+:(?P[a-z]+)", url_template: "http://wildcard.com/{id}"}, + {pattern: ":(?P[a-z]+)", url_template: "http://c.com/{id}"}, ]); // b.com's pattern should be matched while it overlaps with c.com's. - assert_topic_links("#foo100", [ + assert_topic_links(":foo100", [ { text: "foo100", url: "http://b.com/100", }, ]); // a.com's pattern should be matched while it overlaps with b.com's, wildcard.com's and c.com's. 
- assert_topic_links("#asd123 #asd", [ + assert_topic_links(":asd123 :asd", [ { - text: "asd123 #asd", + text: "asd123 :asd", url: "http://a.com/123", }, ]); // a.com's pattern do not match, wildcard.com's and b.com's patterns should match // and the links are ordered by the matched index. - assert_topic_links("/#asd #foo100", [ + assert_topic_links("/:asd :foo100", [ { - text: "/#asd", + text: "/:asd", url: "http://wildcard.com/asd", }, { @@ -318,27 +318,27 @@ run_test("topic links overlapping", () => { url: "http://b.com/100", }, ]); - assert_topic_links("foo.anything/#asd", [ + assert_topic_links("foo.anything/:asd", [ { - text: "foo.anything/#asd", + text: "foo.anything/:asd", url: "http://wildcard.com/asd", }, ]); // While the raw URL "http://foo.com/foo100" appears before b.com's match "foo100", // we prioritize the linkifier match first. - assert_topic_links("http://foo.com/foo100", [ + assert_topic_links("http://foo.com/:foo100", [ { text: "foo100", url: "http://b.com/100", }, ]); - // Here the raw URL "https://foo.com/#asd" appears after wildcard.com's match "something https://foo.com/#asd". + // Here the raw URL "https://foo.com/:asd" appears after wildcard.com's match "something https://foo.com/:asd". // The latter is prioritized and the raw URL does not get included. 
- assert_topic_links("something https://foo.com/#asd", [ { - text: "something https://foo.com/#asd", + text: "something https://foo.com/:asd", url: "http://wildcard.com/asd", }, ]); diff --git a/web/third/marked/lib/marked.js b/web/third/marked/lib/marked.js index 80d725a262..a20e8eed43 100644 --- a/web/third/marked/lib/marked.js +++ b/web/third/marked/lib/marked.js @@ -650,10 +650,10 @@ InlineLexer.prototype.output = function(src) { regexes.forEach(function (regex) { var ret = self.inlineReplacement(regex, src, function(regex, groups, match) { // Insert the created URL - href = self.linkifier(regex, groups, match); + href = self.linkifier(regex, groups.slice(1, -1), match); if (href !== undefined) { href = escape(href); - return self.renderer.link(href, href, match); + return groups[0] + self.renderer.link(href, href, groups[1]) + groups.slice(-1)[0] } else { return match; } diff --git a/zerver/lib/markdown/__init__.py b/zerver/lib/markdown/__init__.py index a4c0ce4c6c..f002c73c13 100644 --- a/zerver/lib/markdown/__init__.py +++ b/zerver/lib/markdown/__init__.py @@ -1820,6 +1820,9 @@ def prepare_linkifier_pattern(source: str) -> str: # We use an extended definition of 'whitespace' which is # equivalent to \p{White_Space} -- since \s in re2 only matches # ASCII spaces, and re2 does not support \p{White_Space}. + # + # This implementation should be kept in sync with the one in + # web/src/linkifiers.ts regex = rf""" (?P<{BEFORE_CAPTURE_GROUP}> ^ |