From c549dea9acc9a0cabaf69c1fba7831b5ddbc9396 Mon Sep 17 00:00:00 2001 From: Harshit Bansal Date: Mon, 15 May 2017 14:54:01 +0530 Subject: [PATCH] bugdown: Fix the regex used for unicode emoji matching. The regex we were using didn't cover all the unicode blocks to which our emojis belong. This commit fixes the regex to include all the unicode blocks and also updates the corresponding JS regex in marked.js. Fixes: #3460. --- static/third/marked/lib/marked.js | 10 ++++-- zerver/fixtures/bugdown-data.json | 18 ++++++++++ zerver/lib/bugdown/__init__.py | 55 ++++++++++++++++++++++++++----- 3 files changed, 72 insertions(+), 11 deletions(-) diff --git a/static/third/marked/lib/marked.js b/static/third/marked/lib/marked.js index 9b954508f5..00bfbc5303 100644 --- a/static/third/marked/lib/marked.js +++ b/static/third/marked/lib/marked.js @@ -537,7 +537,10 @@ inline.breaks = merge({}, inline.gfm, { inline.zulip = merge({}, inline.breaks, { emoji: /^:([A-Za-z0-9_\-\+]+?):/, - unicodeemoji: /^(\ud83c[\udf00-\udfff]|\ud83d[\udc00-\ude4f]|\ud83d[\ude80-\udeff]|[\u2600-\u26FF]|[\u2700-\u27BF])/, + unicodeemoji: RegExp('^(\ud83c[\udd00-\udfff]|\ud83d[\udc00-\ude4f]|' + + '\ud83d[\ude80-\udeff]|\ud83e[\udd00-\uddff]|' + + '[\u2000-\u206F]|[\u2300-\u27BF]|[\u2B00-\u2BFF]|' + + '[\u3000-\u303F]|[\u3200-\u32FF])'), usermention: /^(@(?:\*\*([^\*]+)\*\*|(\w+)))/m, // Match multi-word string between @** ** or match any one-word stream: /^#\*\*([^\*]+)\*\*/m, avatar: /^!avatar\(([^)]+)\)/, @@ -545,7 +548,10 @@ inline.zulip = merge({}, inline.breaks, { tex: /^(\$\$([^ _$](\\\$|[^$])*)(?! 
)\$\$)\B/, realm_filters: [], text: replace(inline.breaks.text) - ('|', '|(\ud83c[\udf00-\udfff]|\ud83d[\udc00-\ude4f]|\ud83d[\ude80-\udeff]|[\u2600-\u26FF]|[\u2700-\u27BF])|') + ('|', '|(\ud83c[\udd00-\udfff]|\ud83d[\udc00-\ude4f]|' + + '\ud83d[\ude80-\udeff]|\ud83e[\udd00-\uddff]|' + + '[\u2000-\u206F]|[\u2300-\u27BF]|[\u2B00-\u2BFF]|' + + '[\u3000-\u303F]|[\u3200-\u32FF])|') (']|', '#@:]|') () }); diff --git a/zerver/fixtures/bugdown-data.json b/zerver/fixtures/bugdown-data.json index 7b92e259b0..ba7180cc10 100644 --- a/zerver/fixtures/bugdown-data.json +++ b/zerver/fixtures/bugdown-data.json @@ -288,12 +288,30 @@ "expected_output":"

\"\ud83d\udca9\" word \"\ud83d\udca9\"<\/p>", "bugdown_matches_marked": true }, + { + "name": "miscellaneous_symbols_and_pictographs", + "input": "Merry Christmas!!\ud83c\udf84", + "expected_output":"

Merry Christmas!!\"\ud83c\udf84\"<\/p>", + "bugdown_matches_marked": true + }, { "name": "miscellaneous_and_dingbats_emoji", "input": "\u2693\u2797", "expected_output":"

\"\u2693\"\"\u2797\"<\/p>", "bugdown_matches_marked": true }, + { + "name": "supplemental_symbols_and_pictographs", + "input": "I am a robot \ud83e\udd16.", + "expected_output":"

I am a robot \"\ud83e\udd16\".<\/p>", + "bugdown_matches_marked": true + }, + { + "name": "miscellaneous_symbols_and_arrows", + "input": "Black upward arrow \u2b06", + "expected_output":"

Black upward arrow \"\u2b06\"<\/p>", + "bugdown_matches_marked": true + }, { "name": "unicode_emoji_without_space", "input": "Extra\ud83d\udc7dTerrestrial", diff --git a/zerver/lib/bugdown/__init__.py b/zerver/lib/bugdown/__init__.py index c6f60a181e..ce273f3797 100644 --- a/zerver/lib/bugdown/__init__.py +++ b/zerver/lib/bugdown/__init__.py @@ -1223,15 +1223,52 @@ class Bugdown(markdown.Extension): md.inlinePatterns.add('stream', StreamPattern(stream_group), '>backtick') md.inlinePatterns.add('tex', Tex(r'\B\$\$(?P[^ _$](\\\$|[^$])*)(?! )\$\$\B'), '>backtick') md.inlinePatterns.add('emoji', Emoji(r'(?P:[\w\-\+]+:)'), '_end') - md.inlinePatterns.add('unicodeemoji', UnicodeEmoji( - u'(?P[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF\u2600-\u26FF\u2700-\u27BF])'), - '_end') - # The equalent JS regex is \ud83c[\udf00-\udfff]|\ud83d[\udc00-\ude4f]|\ud83d[\ude80-\udeff]| - # [\u2600-\u26FF]|[\u2700-\u27BF]. See below comments for explanation. The JS regex is used - # by marked.js for frontend unicode emoji processing. 
- # The JS regex \ud83c[\udf00-\udfff]|\ud83d[\udc00-\ude4f] represents U0001F300-\U0001F64F - # The JS regex \ud83d[\ude80-\udeff] represents \U0001F680-\U0001F6FF - # Similiarly [\u2600-\u26FF]|[\u2700-\u27BF] represents \u2600-\u26FF\u2700-\u27BF + + # All of our emojis (non ZWJ sequences) belong to one of these unicode blocks: + # \U0001f100-\U0001f1ff - Enclosed Alphanumeric Supplement + # \U0001f200-\U0001f2ff - Enclosed Ideographic Supplement + # \U0001f300-\U0001f5ff - Miscellaneous Symbols and Pictographs + # \U0001f600-\U0001f64f - Emoticons (Emoji) + # \U0001f680-\U0001f6ff - Transport and Map Symbols + # \U0001f900-\U0001f9ff - Supplemental Symbols and Pictographs + # \u2000-\u206f - General Punctuation + # \u2300-\u23ff - Miscellaneous Technical + # \u2400-\u243f - Control Pictures + # \u2440-\u245f - Optical Character Recognition + # \u2460-\u24ff - Enclosed Alphanumerics + # \u2500-\u257f - Box Drawing + # \u2580-\u259f - Block Elements + # \u25a0-\u25ff - Geometric Shapes + # \u2600-\u26ff - Miscellaneous Symbols + # \u2700-\u27bf - Dingbats + # \u2900-\u297f - Supplemental Arrows-B + # \u2b00-\u2bff - Miscellaneous Symbols and Arrows + # \u3000-\u303f - CJK Symbols and Punctuation + # \u3200-\u32ff - Enclosed CJK Letters and Months + unicode_emoji_regex = u'(?P['\ + u'\U0001F100-\U0001F64F' \ + u'\U0001F680-\U0001F6FF' \ + u'\U0001F900-\U0001F9FF' \ + u'\u2000-\u206F' \ + u'\u2300-\u27BF' \ + u'\u2900-\u297F' \ + u'\u2B00-\u2BFF' \ + u'\u3000-\u303F' \ + u'\u3200-\u32FF' \ + u'])' + md.inlinePatterns.add('unicodeemoji', UnicodeEmoji(unicode_emoji_regex), '_end') + # The equivalent JS regex is \ud83c[\udd00-\udfff]|\ud83d[\udc00-\ude4f]|\ud83d[\ude80-\udeff]| + # \ud83e[\udd00-\uddff]|[\u2000-\u206f]|[\u2300-\u27bf]|[\u2b00-\u2bff]|[\u3000-\u303f]| + # [\u3200-\u32ff]. See below comments for explanation. The JS regex is used by marked.js for + # frontend unicode emoji processing. 
+ # The JS regex \ud83c[\udd00-\udfff]|\ud83d[\udc00-\ude4f] represents \U0001f100-\U0001f64f + # The JS regex \ud83d[\ude80-\udeff] represents \U0001f680-\U0001f6ff + # The JS regex \ud83e[\udd00-\uddff] represents \U0001f900-\U0001f9ff + # The JS regex [\u2000-\u206f] represents \u2000-\u206f + # The JS regex [\u2300-\u27bf] represents \u2300-\u27bf + # Similarly other JS regexes can be mapped to the respective unicode blocks. + # For more information, please refer to the following article: + # http://crocodillon.com/blog/parsing-emoji-unicode-in-javascript md.inlinePatterns.add('link', AtomicLinkPattern(markdown.inlinepatterns.LINK_RE, md), '>avatar')