# msenol86/emojirange.py

## emojirange.py
import urllib.request
import re

# Local file name the emoji test data is written to and read back from.
EMOJI_TEST_FILENAME = "emoji-test.txt"
# Download location of the emoji test data; the path pins Emoji version 14.0.
EMOJI_DATA_URL = "https://unicode.org/Public/emoji/14.0/emoji-test.txt"

def download_latest_emoji_test_data():
    """
    Download the emoji test data from EMOJI_DATA_URL and store it as EMOJI_TEST_FILENAME.

    The payload is written unchanged in binary mode, replacing any existing
    file of that name.
    :return: None
    """
    # The context manager closes the HTTP response even when read() raises;
    # previously the response object was never closed explicitly.
    with urllib.request.urlopen(EMOJI_DATA_URL) as response:
        emoji_test_file = response.read()
    # The target is opened only after the download succeeded, so a failed
    # request never truncates a previously downloaded file.
    with open(EMOJI_TEST_FILENAME, "wb") as tmp_file:
        tmp_file.write(emoji_test_file)

def convert_unicode_chars_2(p_string_in_unicode):
    r"""
    Turn a whitespace-separated list of hexadecimal code points into the text they encode.

    :param p_string_in_unicode: e.g. '1F469 200D 1F469 200D 1F467 200D 1F466'
    :return: decoded text, e.g. '\U0001f469\u200d\U0001f469\u200d\U0001f467\u200d\U0001f466';
             an empty or whitespace-only input gives ''
    :raises ValueError: if a token is not a hexadecimal number
    """
    # split() without an argument ignores leading/trailing blanks and runs of
    # blanks; split(" ") produced empty tokens there and int("", 16) raised.
    return "".join(chr(int(code_point, 16)) for code_point in p_string_in_unicode.split())

def get_normalize_short_names(p_string_in_unicode):
    """
    Build an identifier-like short name from an emoji description.

    Alphanumeric characters and dashes are kept, every other character except
    the space is dropped, and each run of spaces ends up as one underscore.
    family: woman, woman, girl, girl -> family_woman_woman_girl_girl
    UP! button -> UP_button
    Japanese “free of charge” button -> Japanese_free_of_charge_button
    flag: Cocos (Keeling) Islands -> flag_Cocos_Keeling_Islands
    three o’clock -> three_oclock
    rescue worker’s helmet -> rescue_workers_helmet
    flag: São Tomé & Príncipe -> flag_São_Tomé_Príncipe
    :param p_string_in_unicode: input format men holding hands: dark skin tone, medium-dark skin tone
    :return: output format men_holding_hands_dark_skin_tone_medium-dark_skin_tone
    """
    # keep only alphanumerics, spaces and dashes
    kept_chars = (ch for ch in p_string_in_unicode if ch.isalnum() or ch == " " or ch == "-")
    # dropped punctuation can leave several spaces in a row; squeeze them first
    single_spaced = re.sub(r" +", r" ", "".join(kept_chars))
    # every remaining space becomes an underscore
    return single_spaced.replace(" ", "_")

# Rows of the test data whose status column marks a qualified emoji sequence.
_QUALIFIED_ROW = re.compile(r'(minimally-qualified #|fully-qualified     #)')
# Emoji-version token (e.g. "E13.1") that precedes the name in a row comment.
_EMOJI_VERSION = re.compile(r'E\d+\.\d+')


def load_emoji_lookup():
    """
    Build a lookup from emoji sequence to normalized short name out of EMOJI_TEST_FILENAME.

    Only rows marked fully-qualified or minimally-qualified are used. The text
    after the first '#' of such a row is read as '<emoji> [E<version>] <name>'.
    When the file does not exist it is downloaded and None is returned, so the
    script has to be run again to obtain the lookup.
    :return: dict mapping emoji string -> short name, or None right after a download
    """
    # Only the file access can raise FileNotFoundError, so only it is guarded.
    try:
        with open(EMOJI_TEST_FILENAME, "r", encoding="utf8") as unicode_data:
            unicode_data_rows = unicode_data.readlines()
    except FileNotFoundError:
        print(EMOJI_TEST_FILENAME + " file not found. Downloading it ...")
        download_latest_emoji_test_data()
        print("File downloaded. Re-run the script")
        return None

    quailifed_emojis = [a_line for a_line in unicode_data_rows if _QUALIFIED_ROW.search(a_line)]
    print("Count of quailifed_emojis: " + str(len(quailifed_emojis)))

    tmp_dict = dict()
    for an_emoji_row in quailifed_emojis:
        # Cut at the FIRST '#' only: a row whose comment itself contains '#'
        # (e.g. the keycap '#' entry) otherwise ends up with an empty emoji key.
        tokens = an_emoji_row.split("#", 1)[1].split()
        if not tokens:
            # nothing after the '#': no emoji to register
            continue
        emoji_in_unicode = tokens[0]
        name_tokens = tokens[1:]
        # Drop the version token so it does not leak into the short name.
        if name_tokens and _EMOJI_VERSION.fullmatch(name_tokens[0]):
            name_tokens = name_tokens[1:]
        tmp_dict[emoji_in_unicode] = get_normalize_short_names(" ".join(name_tokens))
    return tmp_dict


def is_contains_emoji(p_string_in_unicode):
    """
    Report whether the text has at least one character inside the emoji code point spans.

    Each character is compared against four fixed, inclusive spans instead of
    being looked up in an emoji dictionary; scanning stops at the first hit.
    The answer is only yes/no - the matching characters are not returned.
    Any code point inside a span counts, whether or not it is an emoji.
    :param p_string_in_unicode: text to scan; a falsy value (None, '') gives False
    :return: True if some character falls in one of the spans, otherwise False
    """
    if not p_string_in_unicode:
        return False
    emoji_spans = (
        (ord(u'\U0001F300'), ord(u"\U0001FAF6")),  # 127744 - 129782
        (126980, 127569),
        (169, 174),
        (8205, 12953),
    )
    return any(
        low <= ord(a_char) <= high
        for a_char in p_string_in_unicode
        for low, high in emoji_spans
    )


def test_emoji_range_with(emoji_dict : dict) :
    """
    Collect the entries of emoji_dict whose key is not recognised by is_contains_emoji.

    :param emoji_dict: dictionary whose keys are the strings to check
    :return: set of (key, value) pairs for which is_contains_emoji(key) is falsy;
             empty when every key is recognised
    """
    every_item = set(emoji_dict.items())
    # The recognised items are a subset of all items, so filtering them out
    # gives the same set as the symmetric difference of both sets.
    return {item for item in every_item if not is_contains_emoji(item[0])}

# Script part: load the lookup and report every emoji the range check misses.
emoji_dict = load_emoji_lookup()
if emoji_dict is not None:
    # Entries with an empty key are left out of the report.
    # The list is computed once and reused for the message; before, the whole
    # range test ran a second time just to print the same list.
    missing_emojis_list = [p_key for p_key in test_emoji_range_with(emoji_dict) if p_key[0] != ""]
    if missing_emojis_list:
        print("List of emojis which are not in the range: " + str(missing_emojis_list))
    else:
        print("Range values are correct and detects all emojis")
	import urllib.request
	import re

	# Local file name the emoji test data is written to and read back from.
	EMOJI_TEST_FILENAME = "emoji-test.txt"
	# Download location of the emoji test data; the path pins Emoji version 14.0.
	EMOJI_DATA_URL = "https://unicode.org/Public/emoji/14.0/emoji-test.txt"

	def download_latest_emoji_test_data():
	    """
	    Download the emoji test data from EMOJI_DATA_URL and store it as EMOJI_TEST_FILENAME.

	    The payload is written unchanged in binary mode, replacing any existing
	    file of that name.
	    :return: None
	    """
	    # The context manager closes the HTTP response even when read() raises.
	    with urllib.request.urlopen(EMOJI_DATA_URL) as response:
	        emoji_test_file = response.read()
	    # The target is opened only after the download succeeded, so a failed
	    # request never truncates a previously downloaded file.
	    with open(EMOJI_TEST_FILENAME, "wb") as tmp_file:
	        tmp_file.write(emoji_test_file)

	def convert_unicode_chars_2(p_string_in_unicode):
	    r"""
	    Turn a whitespace-separated list of hexadecimal code points into the text they encode.

	    :param p_string_in_unicode: e.g. '1F469 200D 1F469 200D 1F467 200D 1F466'
	    :return: decoded text, e.g. '\U0001f469\u200d\U0001f469\u200d\U0001f467\u200d\U0001f466';
	             an empty or whitespace-only input gives ''
	    :raises ValueError: if a token is not a hexadecimal number
	    """
	    # split() without an argument ignores leading/trailing blanks and runs of
	    # blanks; split(" ") produced empty tokens there and int("", 16) raised.
	    return "".join(chr(int(code_point, 16)) for code_point in p_string_in_unicode.split())

	def get_normalize_short_names(p_string_in_unicode):
	    """
	    Build an identifier-like short name from an emoji description.

	    Alphanumeric characters and dashes are kept, every other character except
	    the space is dropped, and each run of spaces ends up as one underscore.
	    family: woman, woman, girl, girl -> family_woman_woman_girl_girl
	    UP! button -> UP_button
	    Japanese “free of charge” button -> Japanese_free_of_charge_button
	    flag: Cocos (Keeling) Islands -> flag_Cocos_Keeling_Islands
	    three o’clock -> three_oclock
	    rescue worker’s helmet -> rescue_workers_helmet
	    flag: São Tomé & Príncipe -> flag_São_Tomé_Príncipe
	    :param p_string_in_unicode: input format men holding hands: dark skin tone, medium-dark skin tone
	    :return: output format men_holding_hands_dark_skin_tone_medium-dark_skin_tone
	    """
	    # remove all non alphanumeric chars except space and dash
	    temp1 = "".join([c for c in p_string_in_unicode if c.isalnum() or c in [" ", "-"]])
	    # replace multiple spaces with single space
	    temp2 = re.sub(r" +", r" ", temp1)
	    # replace all spaces with underscore
	    temp3 = re.sub(r" ", r"_", temp2)
	    return temp3

	# Rows of the test data whose status column marks a qualified emoji sequence.
	# (The pattern had an escaped '|' and collapsed padding, so it matched no row.)
	_QUALIFIED_ROW = re.compile(r'(minimally-qualified #|fully-qualified     #)')
	# Emoji-version token (e.g. "E13.1") that precedes the name in a row comment.
	_EMOJI_VERSION = re.compile(r'E\d+\.\d+')


	def load_emoji_lookup():
	    """
	    Build a lookup from emoji sequence to normalized short name out of EMOJI_TEST_FILENAME.

	    Only rows marked fully-qualified or minimally-qualified are used. The text
	    after the first '#' of such a row is read as '<emoji> [E<version>] <name>'.
	    When the file does not exist it is downloaded and None is returned, so the
	    script has to be run again to obtain the lookup.
	    :return: dict mapping emoji string -> short name, or None right after a download
	    """
	    # Only the file access can raise FileNotFoundError, so only it is guarded.
	    try:
	        with open(EMOJI_TEST_FILENAME, "r", encoding="utf8") as unicode_data:
	            unicode_data_rows = unicode_data.readlines()
	    except FileNotFoundError:
	        print(EMOJI_TEST_FILENAME + " file not found. Downloading it ...")
	        download_latest_emoji_test_data()
	        print("File downloaded. Re-run the script")
	        return None

	    quailifed_emojis = [a_line for a_line in unicode_data_rows if _QUALIFIED_ROW.search(a_line)]
	    print("Count of quailifed_emojis: " + str(len(quailifed_emojis)))

	    tmp_dict = dict()
	    for an_emoji_row in quailifed_emojis:
	        # Cut at the FIRST '#' only: a row whose comment itself contains '#'
	        # (e.g. the keycap '#' entry) otherwise ends up with an empty emoji key.
	        tokens = an_emoji_row.split("#", 1)[1].split()
	        if not tokens:
	            # nothing after the '#': no emoji to register
	            continue
	        emoji_in_unicode = tokens[0]
	        name_tokens = tokens[1:]
	        # Drop the version token so it does not leak into the short name.
	        if name_tokens and _EMOJI_VERSION.fullmatch(name_tokens[0]):
	            name_tokens = name_tokens[1:]
	        tmp_dict[emoji_in_unicode] = get_normalize_short_names(" ".join(name_tokens))
	    return tmp_dict


	def is_contains_emoji(p_string_in_unicode):
	    """
	    Report whether the text has at least one character inside the emoji code point spans.

	    Each character is compared against four fixed, inclusive spans instead of
	    being looked up in an emoji dictionary; scanning stops at the first hit.
	    The answer is only yes/no - the matching characters are not returned.
	    Any code point inside a span counts, whether or not it is an emoji.
	    :param p_string_in_unicode: text to scan; a falsy value (None, '') gives False
	    :return: True if some character falls in one of the spans, otherwise False
	    """
	    range_min = ord(u'\U0001F300') # 127744
	    range_max = ord(u"\U0001FAF6") # 129782
	    range_min_2 = 126980
	    range_max_2 = 127569
	    range_min_3 = 169
	    range_max_3 = 174
	    range_min_4 = 8205
	    range_max_4 = 12953
	    if p_string_in_unicode:
	        for a_char in p_string_in_unicode:
	            char_code = ord(a_char)
	            if range_min <= char_code <= range_max:
	                return True
	            elif range_min_2 <= char_code <= range_max_2:
	                return True
	            elif range_min_3 <= char_code <= range_max_3:
	                return True
	            elif range_min_4 <= char_code <= range_max_4:
	                return True
	        return False
	    else:
	        return False


	def test_emoji_range_with(emoji_dict : dict) :
	    """
	    Collect the entries of emoji_dict whose key is not recognised by is_contains_emoji.

	    :param emoji_dict: dictionary whose keys are the strings to check
	    :return: set of (key, value) pairs for which is_contains_emoji(key) is falsy;
	             empty when every key is recognised
	    """
	    passed_items = dict()
	    for a_key, a_value in emoji_dict.items():
	        if is_contains_emoji(a_key):
	            passed_items[a_key] = a_value
	    set1 = set(emoji_dict.items())
	    set2 = set(passed_items.items())
	    # passed items are a subset of all items, so this leaves the failing ones
	    return set1 ^ set2

	# Script part: load the lookup and report every emoji the range check misses.
	emoji_dict = load_emoji_lookup()
	if emoji_dict is not None:
	    # Entries with an empty key are left out of the report.
	    # The list is computed once and reused for the message.
	    missing_emojis_list = [p_key for p_key in test_emoji_range_with(emoji_dict) if p_key[0] != ""]
	    if missing_emojis_list:
	        print("List of emojis which are not in the range: " + str(missing_emojis_list))
	    else:
	        print("Range values are correct and detects all emojis")