Skip to content

Instantly share code, notes, and snippets.

@alexott
Created May 19, 2019 17:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alexott/8d07abb61775bf56efea9f054d4bca18 to your computer and use it in GitHub Desktop.
Save alexott/8d07abb61775bf56efea9f054d4bca18 to your computer and use it in GitHub Desktop.
test ♈ up ☝️ light skin ☝🏻 Zimbabwe 🇿🇼 England 🏴 keycap0 0️⃣end 0
"""Module to work with emojis in text"""
import pickle
# TODO: add the function that will load all emojis, with their names, etc
def add_emoji(emojis, ch1, ch2=''):
    """Register an emoji in the lookup table *emojis* (modified in place).

    *ch1* is the first character of the emoji and *ch2* is the (possibly
    empty) string of characters that follow it.

    The table maps a first character either to ``True`` (only the
    single-character emoji is known) or to a dict whose keys are the known
    continuations.  In that dict the ``''`` key means that the first
    character is also a complete emoji on its own.
    """
    existing = emojis.get(ch1)
    if isinstance(existing, dict):
        # Continuations are already tracked for ch1: add one more
        # (ch2 == '' marks the bare character as an emoji too).
        existing[ch2] = True
    elif existing:
        # ch1 was known only as a single-character emoji.  Upgrade the entry
        # to a dict, and store it back into the table, so that both the bare
        # character and the new sequence are remembered.
        if ch2:
            emojis[ch1] = {ch2: True, '': True}
    elif ch2:
        emojis[ch1] = {ch2: True}
    else:
        emojis[ch1] = True
# emoji-all.txt is the concatenation of the files emoji-zwj-sequences.txt,
# emoji-data.txt, and emoji-sequences.txt downloaded from
# https://unicode.org/Public/emoji/12.0/, with the entries for #, numbers, and
# the copyright/trademark/registered-mark signs removed manually.
# TODO: make a list of exclusions, and load all files without manual editing
def load_emojis(fname="emoji-all.txt"):
    """Load emoji definitions from a Unicode emoji data file.

    Each data line has the form ``<code points> ; ...`` where the code points
    are hexadecimal and are either a single value, an inclusive range
    ``XXXX..YYYY``, or a whitespace-separated sequence.  Lines that start
    with ``#`` or contain no ``;`` are ignored.

    Returns a dictionary where the key is the first character of an emoji and
    the value is either ``True`` (the emoji consists of that one character
    only) or a dictionary keyed by the remaining characters of each known
    sequence.  In that dictionary the ``''`` key means the first character is
    also an emoji on its own.
    """
    emojis = {}
    # The Unicode data files are UTF-8 and carry literal emoji in their
    # comments, so do not rely on the locale's default encoding.
    with open(fname, "r", encoding="utf-8") as f:
        for line in f:
            if line.startswith('#'):
                continue
            fields, sep, _ = line.partition(';')
            if not sep:
                continue
            fields = fields.strip()
            if not fields:
                continue
            first, sep, last = fields.partition('..')
            if sep:
                # Range of single-character emojis, both ends inclusive.
                for code in range(int(first, 16), int(last, 16) + 1):
                    add_emoji(emojis, chr(code))
                continue
            # One or more code points; split() tolerates repeated whitespace.
            head, *tail = (chr(int(code, 16)) for code in fields.split())
            add_emoji(emojis, head, ''.join(tail))
    return emojis
def generate_pickle(pickle_file="emojis.pickle", emoji_file="emoji-all.txt"):
    """Parse *emoji_file* with load_emojis and pickle the table to *pickle_file*."""
    # Parse first, so the output file is only opened once the table exists.
    table = load_emojis(emoji_file)
    with open(pickle_file, "wb") as out:
        pickle.dump(table, out)
def load_pickle(pickle_file="emojis.pickle"):
    """Load the emoji table previously written by generate_pickle.

    Returns the unpickled object, or an empty dict when the file cannot be
    opened or does not contain a valid pickle (a message is printed in both
    cases).
    """
    emojis = {}
    try:
        with open(pickle_file, "rb") as f:
            # NOTE: unpickling can execute arbitrary code; only load pickle
            # files that come from a trusted source.
            emojis = pickle.load(f)
    except (pickle.PickleError, EOFError) as ex:
        # EOFError is what pickle raises for an empty or truncated file.
        print('Pickling error: {}'.format(ex))
    except IOError:
        print('Cannot open ' + pickle_file)
    return emojis
def strip_emojis(emojis, txt):
    """Return *txt* with every emoji found in the table *emojis* removed.

    *emojis* maps the first character of an emoji either to ``True`` (a
    single-character emoji) or to a dict keyed by the possible continuations
    of that character, where the ``''`` key marks the character as an emoji
    on its own.

    When several continuations match at the same position, only the longest
    one is consumed.  A first character whose entry is a dict is kept in the
    output when no continuation matches and it is not an emoji by itself.
    """
    kept = []
    pos = 0
    txt_len = len(txt)
    while pos < txt_len:
        c = txt[pos]
        if c not in emojis:
            kept.append(c)
            pos += 1
            continue
        entry = emojis[c]
        if isinstance(entry, dict):
            # Length of the longest known continuation that follows c here;
            # startswith with an offset avoids copying the rest of the text.
            longest = max(
                (len(k) for k in entry if k and txt.startswith(k, pos + 1)),
                default=0,
            )
            if longest:
                pos += longest
            elif '' not in entry:
                # c only ever starts longer sequences, so it is plain text.
                kept.append(c)
        pos += 1
    return ''.join(kept)
# Test:
# with open('emoji-test.txt', encoding='utf8') as f:
# emoji_test = f.read().strip()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment