Skip to content

Instantly share code, notes, and snippets.

@msenol86
Last active July 21, 2022 12:33
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save msenol86/44082269be46aa446ccda9d02202e523 to your computer and use it in GitHub Desktop.
Save msenol86/44082269be46aa446ccda9d02202e523 to your computer and use it in GitHub Desktop.
import urllib.request
import re
# File name under which the downloaded emoji test data is stored and re-read.
EMOJI_TEST_FILENAME = "emoji-test.txt"
# Location of the test data; note the path is pinned to the 14.0 release.
EMOJI_DATA_URL = "https://unicode.org/Public/emoji/14.0/emoji-test.txt"
def download_latest_emoji_test_data():
    """Download the emoji test data and store it locally.

    Fetches ``EMOJI_DATA_URL`` and writes the raw response bytes to
    ``EMOJI_TEST_FILENAME``, replacing any existing file of that name.

    :return: None
    :raises urllib.error.URLError: propagated from ``urlopen`` when the
        request fails
    :raises OSError: propagated from ``open``/``write`` when the file cannot
        be written
    """
    # Use the response as a context manager so the underlying connection is
    # closed even if reading fails.
    with urllib.request.urlopen(EMOJI_DATA_URL) as response:
        emoji_test_file = response.read()
    # The file is only opened after the download succeeded, so a failed
    # request never truncates an existing copy.
    with open(EMOJI_TEST_FILENAME, "wb") as tmp_file:
        tmp_file.write(emoji_test_file)
def convert_unicode_chars_2(p_string_in_unicode):
    r"""Turn whitespace-separated hex code points into the string they encode.

    Example: ``'1F469 200D 1F469 200D 1F467 200D 1F466'`` becomes
    ``'\U0001f469\u200d\U0001f469\u200d\U0001f467\u200d\U0001f466'``.

    :param p_string_in_unicode: hexadecimal code points separated by
        whitespace
    :return: the string built from the corresponding characters; an empty or
        whitespace-only input yields ``''``
    :raises ValueError: if a token is not valid hexadecimal or lies outside
        the range accepted by ``chr``
    """
    # split() without an argument tolerates repeated, leading and trailing
    # whitespace; split(" ") would produce empty tokens that int() rejects.
    return "".join(
        chr(int(code_point, 16)) for code_point in p_string_in_unicode.split()
    )
def get_normalize_short_names(p_string_in_unicode):
    """
    Build an identifier-like short name from an emoji description.

    Examples:
        family: woman, woman, girl, girl -> family_woman_woman_girl_girl
        UP! button -> UP_button
        Japanese “free of charge” button -> Japanese_free_of_charge_button
        flag: Cocos (Keeling) Islands -> flag_Cocos_Keeling_Islands
        three o’clock -> three_oclock
        rescue worker’s helmet -> rescue_workers_helmet
        flag: São Tomé & Príncipe -> flag_São_Tomé_Príncipe

    :param p_string_in_unicode: description such as
        men holding hands: dark skin tone, medium-dark skin tone
    :return: normalized name such as
        men_holding_hands_dark_skin_tone_medium-dark_skin_tone
    """
    kept_punctuation = (" ", "-")
    # Keep letters, digits, spaces and dashes; everything else is dropped.
    kept_chars = []
    for symbol in p_string_in_unicode:
        if symbol in kept_punctuation or symbol.isalnum():
            kept_chars.append(symbol)
    cleaned = "".join(kept_chars)
    # Dropping punctuation can leave runs of spaces; squeeze each run to one.
    single_spaced = re.sub(r" +", r" ", cleaned)
    # Every remaining space becomes an underscore.
    return single_spaced.replace(" ", "_")
def load_emoji_lookup():
    """Build a lookup table from emoji to normalized short name.

    Reads ``EMOJI_TEST_FILENAME`` and keeps every row whose status is
    ``fully-qualified`` or ``minimally-qualified``. For each such row the
    text after the first ``#`` is parsed as ``<emoji> [E<version>] <name>``;
    the emoji becomes the key and the name, passed through
    ``get_normalize_short_names``, becomes the value.

    If the file does not exist it is downloaded with
    ``download_latest_emoji_test_data`` and ``None`` is returned so the
    caller can re-run.

    :return: dict mapping emoji string to normalized short name, or ``None``
        when the data file had to be downloaded first
    """
    # Only the file read can raise FileNotFoundError, so keep the try small.
    try:
        with open(EMOJI_TEST_FILENAME, "r", encoding="utf8") as unicode_data:
            unicode_data_rows = unicode_data.readlines()
    except FileNotFoundError:
        print(EMOJI_TEST_FILENAME + " file not found. Downloading it ...")
        download_latest_emoji_test_data()
        print("File downloaded. Re-run the script")
        return None

    # The status column is space-padded, so any amount of whitespace may sit
    # between the status word and the '#' that starts the comment.
    status_pattern = re.compile(r";\s*(?:minimally|fully)-qualified\s*#")
    # Newer data files put an "E<major>.<minor>" token before the name.
    version_pattern = re.compile(r"^E\d+\.\d+\s+")

    quailifed_emojis = [
        a_line for a_line in unicode_data_rows if status_pattern.search(a_line)
    ]
    print("Count of quailifed_emojis: " + str(len(quailifed_emojis)))

    tmp_dict = {}
    for an_emoji_row in quailifed_emojis:
        # Cut at the FIRST '#' only: the keycap-# row contains '#' inside its
        # own comment and would otherwise be parsed as an empty emoji.
        comment = an_emoji_row.partition("#")[2]
        tokens = comment.split(None, 1)
        if not tokens:
            continue
        emoji_in_unicode = tokens[0]
        emoji_shortname = tokens[1].strip() if len(tokens) > 1 else ""
        # Drop the version token so it does not leak into the short name.
        emoji_shortname = version_pattern.sub("", emoji_shortname, count=1)
        tmp_dict[emoji_in_unicode] = get_normalize_short_names(emoji_shortname)
    return tmp_dict
def is_contains_emoji(p_string_in_unicode):
    """
    Report whether any character of the text falls in one of the code point
    ranges this script treats as emoji.

    This is a plain range test on each character rather than a search in an
    emoji lookup dictionary, so it only answers yes or no and never returns
    the emojis it found.

    :param p_string_in_unicode: text to scan; a falsy value (``None`` or an
        empty string) gives ``False``
    :return: ``True`` as soon as one character is inside a range, otherwise
        ``False``
    """
    if not p_string_in_unicode:
        return False

    # Inclusive (low, high) code point bounds, in the order they are tested.
    emoji_code_ranges = (
        (ord(u'\U0001F300'), ord(u"\U0001FAF6")),  # 127744 .. 129782
        (126980, 127569),
        (169, 174),
        (8205, 12953),
    )
    for symbol in p_string_in_unicode:
        code_point = ord(symbol)
        if any(low <= code_point <= high for low, high in emoji_code_ranges):
            return True
    return False
def test_emoji_range_with(emoji_dict : dict) :
    """
    Collect the entries of the lookup table that the range check misses.

    :param emoji_dict: mapping of emoji string to short name
    :return: set of ``(emoji, short_name)`` tuples for which
        ``is_contains_emoji(emoji)`` is false; empty when every key is
        detected
    """
    every_entry = set(emoji_dict.items())
    # Detected entries are a subset of all entries, so removing them gives
    # the same result as the symmetric difference of the two sets.
    return {
        entry for entry in every_entry if not is_contains_emoji(entry[0])
    }
# Script body: load the lookup table and verify that the range check detects
# every emoji in it. load_emoji_lookup() returns None when the data file had
# to be downloaded first, in which case there is nothing to verify yet.
emoji_dict = load_emoji_lookup()
if emoji_dict is not None:
    # Entries are (emoji, short_name) tuples; ones with an empty emoji key
    # are ignored.
    missing_emojis_list = [p_key for p_key in test_emoji_range_with(emoji_dict) if p_key[0] != ""]
    if missing_emojis_list:
        # Reuse the list computed above instead of running the check again.
        print("List of emojis which are not in the range: " + str(missing_emojis_list))
    else:
        print("Range values are correct and detects all emojis")
@msenol86
Copy link
Author

msenol86 commented Jul 14, 2020

Works with Python 3. No extra packages required. Supports Emoji 14.0.

@tejasvi
Copy link

tejasvi commented Jun 12, 2021

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment