Skip to content

Instantly share code, notes, and snippets.

@Mylloon
Forked from slowkow/remove-emoji.py
Last active October 15, 2022 10:35
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Mylloon/e63f90e27b7e933779794cf6e39b758b to your computer and use it in GitHub Desktop.
Save Mylloon/e63f90e27b7e933779794cf6e39b758b to your computer and use it in GitHub Desktop.
Remove all traces of emoji from a text file.
from requests import get
_EMOJI_DATA_URL = "https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt"

# Entries of emoji-data.txt that are not really emojis. They are identified by
# the description that follows the parenthesised sample at the end of a line.
_NOT_REALLY_EMOJIS = frozenset({
    "number sign",
    "asterisk",
    "digit zero..digit nine",
    "copyright",
    "registered",
    "double exclamation mark",
    "exclamation question mark",
    "trade mark",
    "information",
})


def _parse_emoji_data(text: str) -> list:
    """Return the sorted, de-duplicated (first, last) code point pairs in *text*.

    A single code point is returned as a pair whose two members are equal.
    Raises ValueError if a data line does not start with hexadecimal code points.
    """
    ranges = set()
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):  # blank line or comment line
            continue
        # The description is whatever follows the first ")"; partition() keeps
        # descriptions that contain parentheses whole and yields "" (never an
        # IndexError) when the line has no sample at all.
        description = line.partition(")")[2].strip()
        if description in _NOT_REALLY_EMOJIS:
            continue
        first, _, last = line.split(";")[0].strip().partition("..")
        if not first:
            continue
        # A set of pairs keeps two ranges that share the same first code point.
        ranges.add((int(first, 16), int(last or first, 16)))
    return sorted(ranges)


def _escape_code_point(code_point: int) -> str:
    """Return the 8-digit ``\\U`` regex escape for *code_point*."""
    return f"\\U{code_point:08X}"


def emojis() -> str:
    """Build a regular expression matching runs of emoji code points.

    Downloads the Unicode ``emoji-data.txt`` file, collects every code point
    and code point range it lists, except the few entries that are not really
    emojis (digits, the copyright sign, ...), and assembles them into a single
    character class followed by ``+``.

    Returns:
        The pattern, as a string usable with the ``re`` module.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the downloaded data is malformed or lists no code point.
    """
    # A timeout avoids hanging forever on a stalled connection.
    page = get(_EMOJI_DATA_URL, timeout=30)
    page.raise_for_status()

    ranges = _parse_emoji_data(page.text)
    if not ranges:
        # An empty character class is not a valid pattern; fail loudly instead.
        raise ValueError("no emoji code point found in emoji-data.txt")

    members = [
        _escape_code_point(first)
        if first == last
        else f"{_escape_code_point(first)}-{_escape_code_point(last)}"
        for first, last in ranges
    ]
    # Members are concatenated without any separator: inside a character class
    # a "|" is a literal, so using it would make the pattern match pipes too.
    return f"[{''.join(members)}]+"
# Quick manual check (to be removed): strip the emoji from a sample string
# and show the text before and after.
from re import sub, UNICODE

string = "hello 😂"
without_emojis = sub(emojis(), '', string, flags=UNICODE)
print(f"String: {string}\nWithout Emojis: {without_emojis}")
@pskd73
Copy link

pskd73 commented Mar 24, 2022

Even this misses the → emoticon.

@Mylloon
Copy link
Author

Mylloon commented Mar 24, 2022

→ isn't an emoji
If you want to keep only letters, numbers and spaces, simply use this regex: ' *?[^\w\s]+'
Example:

from re import sub

# Replace every run of characters that are neither word characters nor
# whitespace (together with any spaces right before it) by a single space.
string = "hello → 😂"
regex = r" *?[^\w\s]+"
cleaned = sub(regex, ' ', string)
print(f"String: {string}\nWithout Emojis: {cleaned}")

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment