Skip to content

Instantly share code, notes, and snippets.

@joke2k
Last active June 26, 2018 18:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joke2k/90096fd6875a158a931024f13dff8b83 to your computer and use it in GitHub Desktop.
Save joke2k/90096fd6875a158a931024f13dff8b83 to your computer and use it in GitHub Desktop.
Script to extract emojis from unicode.org and create a Faker provider. [requires: requests-html]
import sys
from pprint import pformat
from requests_html import HTMLSession
# Scrapes the Unicode emoji chart and prints, on stdout, the source code of a
# provider module whose ``emojis`` attribute maps
# main category -> category -> [code, short name, [keywords]].
# A summary count (and any warnings) go to stderr so stdout stays pure code.

url = 'https://unicode.org/emoji/charts/emoji-list.html'

# Marker the chart puts in the short-name cell of newly added emoji.
NEW_EMOJI_MARK = '⊛'
# Note the chart appends to the keywords of emojified pre-existing characters.
EMOJIFICATION_NOTE = '➯ emojification of pre-existing character'
# Cells unpacked from a data row: index, code, sample, short name, keywords.
EXPECTED_CELLS = 5

# The class body must be indented, otherwise the generated module does not
# compile. Continuation lines of the pformat() output sit inside brackets, so
# their own indentation does not matter.
PROVIDER_TEMPLATE = """
from __future__ import unicode_literals
from .. import BaseProvider
class Provider(BaseProvider):
    # Source: {}
    # Scraper: https://gist.github.com/joke2k/90096fd6875a158a931024f13dff8b83
    emojis = {}
"""


def _warn(message):
    """Write a warning line to stderr."""
    sys.stderr.write("warning: {}\n".format(message))


def main_category_key(title):
    """Return the cleaned key for a main-category title.

    The ' & ' separators are replaced by '-' and the result is lower-cased,
    e.g. 'Smileys & Emotion' -> 'smileys-emotion'.
    """
    return "-".join(title.split(' & ')).lower()


def parse_emoji_row(cells):
    """Turn the cell texts of one data row into a result entry.

    :param cells: sequence of cell texts in the order
        index, code, sample, short name, keywords.
    :return: ``[code, short_name, keywords]`` where ``keywords`` is a list
        that excludes the short name itself and any empty item, or ``None``
        when the row does not have exactly ``EXPECTED_CELLS`` cells.
    """
    if len(cells) != EXPECTED_CELLS:
        return None
    _index, code, _sample, short_name, keywords = cells
    # remove symbol for new emoji
    short_name = short_name.replace(NEW_EMOJI_MARK, '').strip()
    # remove keyword for "emojification"
    keywords = keywords.replace(EMOJIFICATION_NOTE, '')
    # Split on the bare '|' and strip each item: splitting on ' | ' leaves a
    # dangling '|' once the trailing note is removed, and yields [''] for an
    # empty cell.
    cleaned = [keyword.strip() for keyword in keywords.split('|')]
    return [
        code,
        short_name,
        [keyword for keyword in cleaned if keyword and keyword != short_name],
    ]


def build_emoji_table(html):
    """Walk every table row of the page and group the emoji by category.

    :param html: parsed page exposing ``find('tr')``; each row exposes
        ``find('td')`` / ``find('th')`` and each cell ``text`` and ``attrs``.
    :return: ``(results, count)`` where ``results`` maps
        main category -> category -> list of entries, and ``count`` is the
        number of entries stored.
    """
    results = {}
    count = 0
    main_category = category = None
    for row in html.find('tr'):
        cells = row.find('td')
        if not cells:
            # header row
            headers = row.find('th')
            if len(headers) == 1:
                header = headers[0]
                # A header cell without a class attribute is simply ignored.
                classes = header.attrs.get('class', ())
                if 'bighead' in classes:
                    # main category (cleaned)
                    main_category = main_category_key(header.text)
                    results[main_category] = {}
                    # Do not carry the previous category into this one.
                    category = None
                elif 'mediumhead' in classes:
                    # category
                    if main_category is None:
                        _warn("category {!r} precedes any main category; "
                              "skipped".format(header.text))
                    else:
                        category = header.text
                        results[main_category][category] = []
            continue
        entry = parse_emoji_row([cell.text for cell in cells])
        if entry is None:
            _warn("row with {} cells instead of {}; skipped".format(
                len(cells), EXPECTED_CELLS))
            continue
        if main_category is None or category is None:
            _warn("emoji {!r} has no category header; skipped".format(
                entry[1]))
            continue
        results[main_category][category].append(entry)
        count += 1
    return results, count


def render_provider(source_url, results):
    """Return the source code of the provider module for ``results``."""
    return PROVIDER_TEMPLATE.format(source_url, pformat(results))


def main():
    """Fetch the chart, report the emoji count and print the provider."""
    session = HTMLSession()
    response = session.get(url)
    # Fail loudly rather than parse an HTTP error page as if it were the chart.
    response.raise_for_status()
    results, count = build_emoji_table(response.html)
    sys.stderr.write("#emojis: {}\n".format(count))
    print(render_provider(url, results))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment