bugdown: Fix the regex used for unicode emoji matching.

The regex we were using didn't cover all the unicode blocks
to which our emojis belong. This commit fixes the regex to
include all the unicode blocks and also updates the
corresponding JS regex in marked.js.

Fixes: #3460.
This commit is contained in:
Harshit Bansal
2017-05-15 14:54:01 +05:30
committed by Tim Abbott
parent f8824ea623
commit c549dea9ac
3 changed files with 72 additions and 11 deletions

View File

@@ -537,7 +537,10 @@ inline.breaks = merge({}, inline.gfm, {
inline.zulip = merge({}, inline.breaks, {
emoji: /^:([A-Za-z0-9_\-\+]+?):/,
unicodeemoji: /^(\ud83c[\udf00-\udfff]|\ud83d[\udc00-\ude4f]|\ud83d[\ude80-\udeff]|[\u2600-\u26FF]|[\u2700-\u27BF])/,
unicodeemoji: RegExp('^(\ud83c[\udd00-\udfff]|\ud83d[\udc00-\ude4f]|' +
'\ud83d[\ude80-\udeff]|\ud83e[\udd00-\uddff]|' +
'[\u2000-\u206F]|[\u2300-\u27BF]|[\u2B00-\u2BFF]|' +
'[\u3000-\u303F]|[\u3200-\u32FF])'),
usermention: /^(@(?:\*\*([^\*]+)\*\*|(\w+)))/m, // Match multi-word string between @** ** or match any one-word
stream: /^#\*\*([^\*]+)\*\*/m,
avatar: /^!avatar\(([^)]+)\)/,
@@ -545,7 +548,10 @@ inline.zulip = merge({}, inline.breaks, {
tex: /^(\$\$([^ _$](\\\$|[^$])*)(?! )\$\$)\B/,
realm_filters: [],
text: replace(inline.breaks.text)
('|', '|(\ud83c[\udf00-\udfff]|\ud83d[\udc00-\ude4f]|\ud83d[\ude80-\udeff]|[\u2600-\u26FF]|[\u2700-\u27BF])|')
('|', '|(\ud83c[\udd00-\udfff]|\ud83d[\udc00-\ude4f]|' +
'\ud83d[\ude80-\udeff]|\ud83e[\udd00-\uddff]|' +
'[\u2000-\u206F]|[\u2300-\u27BF]|[\u2B00-\u2BFF]|' +
'[\u3000-\u303F]|[\u3200-\u32FF])|')
(']|', '#@:]|')
()
});

View File

@@ -288,12 +288,30 @@
"expected_output":"<p><img alt=\"\ud83d\udca9\" class=\"emoji\" src=\"\/static\/generated\/emoji\/images\/emoji\/unicode\/1f4a9.png\" title=\"\ud83d\udca9\"> word <img alt=\"\ud83d\udca9\" class=\"emoji\" src=\"\/static\/generated\/emoji\/images\/emoji\/unicode\/1f4a9.png\" title=\"\ud83d\udca9\"><\/p>",
"bugdown_matches_marked": true
},
{
"name": "miscellaneous_symbols_and_pictographs",
"input": "Merry Christmas!!\ud83c\udf84",
"expected_output":"<p>Merry Christmas!!<img alt=\"\ud83c\udf84\" class=\"emoji\" src=\"\/static\/generated\/emoji\/images\/emoji\/unicode\/1f384.png\" title=\"\ud83c\udf84\"><\/p>",
"bugdown_matches_marked": true
},
{
"name": "miscellaneous_and_dingbats_emoji",
"input": "\u2693\u2797",
"expected_output":"<p><img alt=\"\u2693\" class=\"emoji\" src=\"\/static\/generated\/emoji\/images\/emoji\/unicode\/2693.png\" title=\"\u2693\"><img alt=\"\u2797\" class=\"emoji\" src=\"\/static\/generated\/emoji\/images\/emoji\/unicode\/2797.png\" title=\"\u2797\"><\/p>",
"bugdown_matches_marked": true
},
{
"name": "supplemental_symbols_and_pictographs",
"input": "I am a robot \ud83e\udd16.",
"expected_output":"<p>I am a robot <img alt=\"\ud83e\udd16\" class=\"emoji\" src=\"\/static\/generated\/emoji\/images\/emoji\/unicode\/1f916.png\" title=\"\ud83e\udd16\">.<\/p>",
"bugdown_matches_marked": true
},
{
"name": "miscellaneous_symbols_and_arrows",
"input": "Black upward arrow \u2b06",
"expected_output":"<p>Black upward arrow <img alt=\"\u2b06\" class=\"emoji\" src=\"\/static\/generated\/emoji\/images\/emoji\/unicode\/2b06.png\" title=\"\u2b06\"><\/p>",
"bugdown_matches_marked": true
},
{
"name": "unicode_emoji_without_space",
"input": "Extra\ud83d\udc7dTerrestrial",

View File

@@ -1223,15 +1223,52 @@ class Bugdown(markdown.Extension):
md.inlinePatterns.add('stream', StreamPattern(stream_group), '>backtick')
md.inlinePatterns.add('tex', Tex(r'\B\$\$(?P<body>[^ _$](\\\$|[^$])*)(?! )\$\$\B'), '>backtick')
md.inlinePatterns.add('emoji', Emoji(r'(?P<syntax>:[\w\-\+]+:)'), '_end')
md.inlinePatterns.add('unicodeemoji', UnicodeEmoji(
u'(?P<syntax>[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF\u2600-\u26FF\u2700-\u27BF])'),
'_end')
# The equalent JS regex is \ud83c[\udf00-\udfff]|\ud83d[\udc00-\ude4f]|\ud83d[\ude80-\udeff]|
# [\u2600-\u26FF]|[\u2700-\u27BF]. See below comments for explanation. The JS regex is used
# by marked.js for frontend unicode emoji processing.
# The JS regex \ud83c[\udf00-\udfff]|\ud83d[\udc00-\ude4f] represents U0001F300-\U0001F64F
# The JS regex \ud83d[\ude80-\udeff] represents \U0001F680-\U0001F6FF
# Similiarly [\u2600-\u26FF]|[\u2700-\u27BF] represents \u2600-\u26FF\u2700-\u27BF
# All of our emojis (non-ZWJ sequences) belong to one of these unicode blocks:
# \U0001f100-\U0001f1ff - Enclosed Alphanumeric Supplement
# \U0001f200-\U0001f2ff - Enclosed Ideographic Supplement
# \U0001f300-\U0001f5ff - Miscellaneous Symbols and Pictographs
# \U0001f600-\U0001f64f - Emoticons (Emoji)
# \U0001f680-\U0001f6ff - Transport and Map Symbols
# \U0001f900-\U0001f9ff - Supplemental Symbols and Pictographs
# \u2000-\u206f - General Punctuation
# \u2300-\u23ff - Miscellaneous Technical
# \u2400-\u243f - Control Pictures
# \u2440-\u245f - Optical Character Recognition
# \u2460-\u24ff - Enclosed Alphanumerics
# \u2500-\u257f - Box Drawing
# \u2580-\u259f - Block Elements
# \u25a0-\u25ff - Geometric Shapes
# \u2600-\u26ff - Miscellaneous Symbols
# \u2700-\u27bf - Dingbats
# \u2900-\u297f - Supplemental Arrows-B
# \u2b00-\u2bff - Miscellaneous Symbols and Arrows
# \u3000-\u303f - CJK Symbols and Punctuation
# \u3200-\u32ff - Enclosed CJK Letters and Months
unicode_emoji_regex = u'(?P<syntax>['\
u'\U0001F100-\U0001F64F' \
u'\U0001F680-\U0001F6FF' \
u'\U0001F900-\U0001F9FF' \
u'\u2000-\u206F' \
u'\u2300-\u27BF' \
u'\u2900-\u297F' \
u'\u2B00-\u2BFF' \
u'\u3000-\u303F' \
u'\u3200-\u32FF' \
u'])'
md.inlinePatterns.add('unicodeemoji', UnicodeEmoji(unicode_emoji_regex), '_end')
# The equivalent JS regex is \ud83c[\udd00-\udfff]|\ud83d[\udc00-\ude4f]|\ud83d[\ude80-\udeff]|
# \ud83e[\udd00-\uddff]|[\u2000-\u206f]|[\u2300-\u27bf]|[\u2b00-\u2bff]|[\u3000-\u303f]|
# [\u3200-\u32ff]. See below comments for explanation. The JS regex is used by marked.js for
# frontend unicode emoji processing.
# NOTE: the JS regex does not include [\u2900-\u297f] (Supplemental Arrows-B), which the
# Python regex above does cover.
# The JS regex \ud83c[\udd00-\udfff]|\ud83d[\udc00-\ude4f] represents \U0001f100-\U0001f64f
# The JS regex \ud83d[\ude80-\udeff] represents \U0001f680-\U0001f6ff
# The JS regex \ud83e[\udd00-\uddff] represents \U0001f900-\U0001f9ff
# The JS regex [\u2000-\u206f] represents \u2000-\u206f
# The JS regex [\u2300-\u27bf] represents \u2300-\u27bf
# Similarly other JS regexes can be mapped to the respective unicode blocks.
# For more information, please refer to the following article:
# http://crocodillon.com/blog/parsing-emoji-unicode-in-javascript
md.inlinePatterns.add('link', AtomicLinkPattern(markdown.inlinepatterns.LINK_RE, md), '>avatar')