From 234eb7a72357646a91c58dced7a4857834fdbb1b Mon Sep 17 00:00:00 2001
From: Tim Abbott <tabbott@zulipchat.com>
Date: Sun, 29 Jan 2017 12:33:44 -0800
Subject: [PATCH] emoji: Add a bunch of documentation in comments for emoji
 rules.

---
 tools/setup/emoji/build_emoji          |  3 ++
 tools/setup/emoji/emoji_setup_utils.py | 45 ++++++++++++++++++++++----
 2 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/tools/setup/emoji/build_emoji b/tools/setup/emoji/build_emoji
index 1832c09150..0fcd7bdea6 100755
--- a/tools/setup/emoji/build_emoji
+++ b/tools/setup/emoji/build_emoji
@@ -1,4 +1,7 @@
 #!/usr/bin/env python
+#
+# See docs/emoji.md for a high-level explanation of how this system
+# works.
 from __future__ import print_function
 import os
 import glob
diff --git a/tools/setup/emoji/emoji_setup_utils.py b/tools/setup/emoji/emoji_setup_utils.py
index ca9754f2d1..6abae3e3a0 100644
--- a/tools/setup/emoji/emoji_setup_utils.py
+++ b/tools/setup/emoji/emoji_setup_utils.py
@@ -1,4 +1,12 @@
 from __future__ import absolute_import
+# This tool contains all of the rules that we use to decide which of
+# the various emoji names in emoji-map.json we should actually use in
+# autocomplete and emoji pickers.  We can't use all of them, because
+# otherwise there will be a ton of duplicates alphabetically next to
+# each other, which is confusing and looks bad (e.g. `angry` and
+# `angry_face` or `ab` and `ab_button` will always sort next to each
+# other, and you really want to just pick one).  See docs/emoji.md for
+# details on how this system works.
 
 from collections import defaultdict
 from itertools import permutations, chain
@@ -7,8 +15,10 @@ import ujson
 from six.moves import range, zip
 from typing import Text
 
-# the corresponding code point will be set to exactly these names as a final pass,
-# overriding any other rules
+# the corresponding code point will be set to exactly these names as a
+# final pass, overriding any other rules.  This is useful for cases
+# where the two names are very different, and users might reasonably type
+# either name and be surprised when they can't find the relevant emoji.
 whitelisted_names = [
     ['date', 'calendar'], ['shirt', 'tshirt'], ['cupid', 'heart_with_arrow'],
     ['tada', 'party_popper'], ['parking', 'p_button'], ['car', 'automobile'],
@@ -48,6 +58,11 @@ whitelisted_names = [
     # ['ocean', 'water_wave'], wave is so common that we want it to point only to :wave:
 ]
 
+# We blacklist certain names in cases where the algorithms below would
+# choose incorrectly which one to keep.  For example, with `football`,
+# by default, our algorithm would pick just `football`, but given
+# that :rugby_football: also exists, we want to keep
+# :american_football: instead.  So we just remove the shorter names here.
 blacklisted_names = frozenset([
     # would be chosen by words_supersets or superstrings
     'football', # american_football
@@ -135,8 +150,12 @@ def ideographless(names):
         return []
     return [name for name, has_ideograph in zip(names, has_ideographs) if not has_ideograph]
 
-# subsumed by longer, but still useful for breaking up a hand review of the
-# blacklist decisions
+# In the absence of a good reason not to, we prefer :angry: over
+# :angry_face:, since it's shorter and communicates the same idea.
+#
+# This rule is subsumed by the longer rule, but still useful for
+# breaking up a hand review of the whitelist/blacklist decisions,
+# since these cases are much more clear than the "longer" ones.
 def word_superset(names):
     # type: (List[str]) -> List[str]
     bags_of_words = [frozenset(name.split('_')) for name in names]
@@ -146,8 +165,12 @@ def word_superset(names):
             bad_names.add(names[j])
     return list(bad_names)
 
-# subsumed by longer, but still useful for breaking up a hand review of the
-# blacklist decisions
+# We prefer :dog: over :dog2: if they both point to the same unicode
+# character.
+#
+# This rule is subsumed by the longer rule, but still useful for
+# breaking up a hand review of the whitelist/blacklist decisions,
+# since these cases are much more clear than the "longer" ones.
 def superstring(names):
     # type: (List[str]) -> List[str]
     bad_names = set()
@@ -156,13 +179,21 @@ def superstring(names):
             bad_names.add(name2)
     return list(bad_names)
 
+# The shorter one is usually a better name.
 def longer(names):
     # type: (List[str]) -> List[str]
     lengths = [len(name) for name in names]
     min_length = min(lengths)
     return [name for name, length in zip(names, lengths) if length > min_length]
 
-
+# A lot of emoji that have a color in their name aren't actually the
+# right color, which is super confusing.  A big part of the reason is
+# that "black" and "white" actually mean filled-in and not-filled-in
+# to the Unicode committee, which is a poor choice but explains why
+# something with "black" in its name might be any solid color.  Users
+# want the emoji to have reasonable names, though, so we have to
+# correct the names with "black" or "white" in them.
+#
 # Ones found after a few minutes of inspection, and after all the other filters
 # have been applied. Probably others remaining.
 miscolored_names = frozenset(['eight_pointed_black_star', 'large_blue_diamond',