Skip to content

Instantly share code, notes, and snippets.

@jinyu121
Created October 26, 2022 12:42
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jinyu121/3140ade03fff8f555c703a20f403e870 to your computer and use it in GitHub Desktop.
Save jinyu121/3140ade03fff8f555c703a20f403e870 to your computer and use it in GitHub Desktop.
Emoji Crawler

Emoji Crawler

This script parses the full emoji list published by the Unicode Consortium and saves each vendor's emoji image as a separate file.

import base64
import mimetypes
from pathlib import Path
import httpx
import magic
from bs4 import BeautifulSoup
from tqdm import tqdm
# Chart page that is downloaded (and cached locally) before parsing.
PAGE_URL = "https://unicode.org/emoji/charts-14.0/full-emoji-list.html"

# Vendor names, one output sub-directory each.  The order matters: the main
# loop pairs these positionally with the icon cells of every table row.
EMOJI_TYPES = [
    "Apple",
    "Google",
    "Facebook",
    "Windows",
    "Twitter",
    "JoyPixels",
    "Samsung",
    "GMail",
    "SoftBank",
    "DoCoMo",
    "KDDI",
]

# Root directory for the page cache and the per-vendor image folders.
BASE_PATH = Path("emoji")
def data2bytes(text: str) -> "tuple[str, bytes]":
    """Decode a base64 ``data:`` URI into a file extension and its raw bytes.

    The part before the first comma (the URI header) is ignored; the file
    type is sniffed from the decoded content with ``magic`` rather than
    trusted from the header.

    Args:
        text: A data URI of the form ``data:<mime>;base64,<payload>``.

    Returns:
        A ``(extension, data)`` tuple. ``extension`` always starts with a
        dot and is never ``None``; ``data`` is the decoded payload.

    Raises:
        ValueError: If ``text`` contains no ``,`` separating header and payload.
    """
    # Split on the first comma only, so an unexpected extra comma cannot
    # break the unpacking.
    _header, separator, payload = text.partition(",")
    if not separator:
        raise ValueError(f"not a data URI (missing ',' separator): {text[:40]!r}")

    data = base64.b64decode(payload)
    mime = magic.from_buffer(data, mime=True)
    filetype = mimetypes.guess_extension(mime)
    if filetype is None:
        # guess_extension() returns None for MIME types it does not know;
        # fall back to the MIME subtype so callers can always build a name.
        filetype = "." + mime.rpartition("/")[2]
    return filetype, data
# If you do not want to install `python-magic` and libmagic, this code also **works**
#def data2bytes(text: str) -> (str, bytes):
# text, data = text.split(",")
# data = base64.b64decode(data)
# filetype = text.split(";")[0].split(":")[1].split("/")[1]
# return "." + filetype, data
def _load_page() -> bytes:
    """Return the chart HTML as bytes, downloading and caching it if needed."""
    page_cache = BASE_PATH / "page_cache.html"
    if page_cache.is_file():
        # Read raw bytes: no leaked file handle and no locale-dependent
        # decoding; the HTML parser works out the encoding itself.
        return page_cache.read_bytes()

    response = httpx.get(PAGE_URL)
    # Fail loudly instead of caching an error page as if it were the chart.
    response.raise_for_status()
    html_doc = response.content
    page_cache.write_bytes(html_doc)
    return html_doc


def main() -> None:
    """Extract every embedded emoji image from the chart into vendor folders.

    Files are written as ``BASE_PATH / <vendor> / <code><ext>``, where
    ``code`` is the text of the row's second cell.
    """
    mimetypes.init()
    # Creating the vendor folders also creates BASE_PATH for the page cache.
    for vendor in EMOJI_TYPES:
        (BASE_PATH / vendor).mkdir(parents=True, exist_ok=True)

    soup = BeautifulSoup(_load_page(), 'html.parser')
    for row in tqdm(soup.find_all('tr')):
        cells = row.find_all("td")
        if len(cells) <= 1:
            continue
        if cells[0].text.strip() == "№":
            continue

        code = cells[1].text.strip()
        icons = cells[3:]
        if len(icons) < len(EMOJI_TYPES):
            # Fewer icon cells than vendors: the first cell's image is used
            # as a shared sample and stored under every vendor.
            # NOTE(review): presumably these are rows with merged cells -
            # confirm against the page markup.
            sample = icons[0].img if icons else None
            if sample is None:
                continue
            ext, img = data2bytes(sample["src"])
            for vendor in EMOJI_TYPES:
                (BASE_PATH / vendor / (code + ext)).write_bytes(img)
        else:
            # zip() stops at the shorter sequence, so any cells beyond the
            # vendor list are ignored.
            for vendor, icon in zip(EMOJI_TYPES, icons):
                # "—" marks a missing image; also skip any cell without an
                # <img> tag rather than crashing on it.
                if icon.text.strip() == "—" or icon.img is None:
                    continue
                ext, img = data2bytes(icon.img["src"])
                (BASE_PATH / vendor / (code + ext)).write_bytes(img)
    print("Done")


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment