Skip to content

Instantly share code, notes, and snippets.

@shello
Last active June 16, 2016 13:40
Show Gist options
  • Save shello/8a2889807c112688c576 to your computer and use it in GitHub Desktop.
Save shello/8a2889807c112688c576 to your computer and use it in GitHub Desktop.
Wiki Emoji

Emoji in Wikipedia (EN):

Emoji missing from Wikipedia (EN):

Last updated on 2015-03-04 00:42:15.004541

# coding: utf-8
from unicodedata import name as unicode_name
import urllib.request, urllib.parse
import json
import time
from datetime import datetime
# Inclusive (first_char, last_char) ranges covering the emoji code points
# introduced up to Unicode 7.0 (U+1F300..U+1F5FF area, with the gaps of
# unassigned code points skipped).
EMOJI_RANGES_UNICODE_7 = [
    ('\U0001F300', '\U0001F32C'),
    ('\U0001F330', '\U0001F37D'),
    ('\U0001F380', '\U0001F3CE'),
    ('\U0001F3D4', '\U0001F3F7'),
    ('\U0001F400', '\U0001F4FE'),
    ('\U0001F500', '\U0001F54A'),
    ('\U0001F550', '\U0001F579'),
    ('\U0001F57B', '\U0001F5A3'),
    ('\U0001F5A5', '\U0001F5FF')
]
# Inclusive (first_char, last_char) ranges covering the emoji code points
# of Unicode 6.x only (a subset of the 7.0 table above, with more gaps).
EMOJI_RANGES_UNICODE_6 = [
    ('\U0001F300', '\U0001F320'),
    ('\U0001F330', '\U0001F335'),
    ('\U0001F337', '\U0001F37C'),
    ('\U0001F380', '\U0001F393'),
    ('\U0001F3A0', '\U0001F3C4'),
    ('\U0001F3C6', '\U0001F3CA'),
    ('\U0001F3E0', '\U0001F3F0'),
    ('\U0001F400', '\U0001F43E'),
    ('\U0001F440', '\U0001F440'),
    ('\U0001F442', '\U0001F4F7'),
    ('\U0001F4F9', '\U0001F4FC'),
    ('\U0001F500', '\U0001F53C'),
    ('\U0001F540', '\U0001F543'),
    ('\U0001F550', '\U0001F567'),
    ('\U0001F5FB', '\U0001F5FF')
]
# Which Unicode version's emoji ranges this run queries for.
EMOJI_RANGES = EMOJI_RANGES_UNICODE_6

# English Wikipedia API endpoint and the fixed query parameters sent with
# every request.  'redirects': True asks the API to resolve redirects and
# report them in the response.
WIKIPEDIA_ENDPOINT = 'http://en.wikipedia.org/w/api.php'
WIKIPEDIA_PARAMS = {
    'format': 'json',
    'action': 'query',
    'prop': 'info',
    'redirects': True
}
# Name of the query parameter that carries the page titles.
WIKIPEDIA_TITLE_PARAM = 'titles'
# Titles are sent in bunches of at most this many per request.
# NOTE(review): 50 matches the API's documented per-request title limit for
# regular clients — confirm against current MediaWiki API docs.
WIKIPEDIA_TITLE_MAX = 50
# Separator between multiple titles in a single 'titles' parameter.
WIKIPEDIA_TITLE_SEP = '|'
WIKIPEDIA_REQUEST_WAIT = 1 # Seconds between requests, to be polite to the API
class MultipleRanges:
    '''Join multiple integer ranges into one iterable sequence.

    Ranges are added with inclusive endpoints via add_range() and are
    iterated in insertion order.  Iteration state lives on the instance,
    so only one iteration can be active at a time; calling iter() again
    restarts from the beginning.
    '''

    def __init__(self):
        # List of (first, last + 1) tuples, ready to splat into range().
        self.ranges = []
        self.iter_ranges = None
        self.curr_range = None

    def add_range(self, range_first, range_last):
        '''Add the inclusive range [range_first, range_last].'''
        self.ranges.append((range_first, range_last + 1))
        return True

    def __iter__(self):
        self.iter_ranges = iter(self.ranges)
        # Start with an already-exhausted sub-range so an instance with no
        # ranges iterates zero elements.  (The original called
        # next(self.iter_ranges) here, raising StopIteration out of
        # __iter__ and crashing a plain `for` loop on an empty instance.)
        self.curr_range = iter(())
        return self

    def __next__(self):
        element = next(self.curr_range, None)
        while element is None:
            # When all ranges are consumed, next() raises StopIteration,
            # which correctly terminates the iteration.  The loop also
            # skips any empty sub-ranges instead of crashing on them.
            self.curr_range = iter(range(*next(self.iter_ranges)))
            element = next(self.curr_range, None)
        return element
class CharMultipleRanges(MultipleRanges):
    '''MultipleRanges specialised for characters.

    Ranges are added as pairs of characters and iteration yields
    characters; internally everything is stored as integer code points.
    '''

    def add_range(self, range_first, range_last):
        '''Add an inclusive character range, converted to code points.'''
        first_cp = ord(range_first)
        last_cp = ord(range_last)
        return super().add_range(first_cp, last_cp)

    def __next__(self):
        '''Yield the next code point, converted back to a character.'''
        code_point = super().__next__()
        return chr(code_point)
class Bunches:
    '''Split an iterable of strings into separator-joined bunches.

    Each bunch joins at most bunch_max consecutive items with `separator`.
    '''

    def __init__(self, iterable, bunch_max, separator=','):
        self.iterable = iter(iterable)
        self.bunch_max = bunch_max
        self.separator = separator
        self.finished = False

    def __iter__(self):
        self.finished = False
        return self

    def __next__(self):
        if self.finished:
            raise StopIteration
        bunch = []
        try:
            while len(bunch) < self.bunch_max:
                bunch.append(next(self.iterable))
        except StopIteration:
            self.finished = True
            # Bug fix: when the iterable's length is an exact multiple of
            # bunch_max, the source exhausts with an empty bunch; the
            # original returned '' here, yielding a spurious trailing
            # bunch (and, downstream, a Wikipedia request with no titles).
            if not bunch:
                raise
        return self.separator.join(bunch)
# Build the iterable of all emoji characters from the configured
# code-point ranges.
emoji = CharMultipleRanges()
for range_start, range_end in EMOJI_RANGES:
    emoji.add_range(range_start, range_end)

# Maps each emoji character (page title) to its redirect target title,
# or None when the page is missing from Wikipedia.
wikipedia_redirects = {}
def build_wikipedia_url(title_group):
    '''Return the Wikipedia API query URL for a group of page titles.

    `title_group` is a pre-joined, separator-delimited string of titles;
    the separator is kept unescaped in the query string via `safe`.
    '''
    query_params = dict(WIKIPEDIA_PARAMS)
    query_params[WIKIPEDIA_TITLE_PARAM] = title_group
    encoded = urllib.parse.urlencode(query_params, safe=WIKIPEDIA_TITLE_SEP)
    return '{}?{}'.format(WIKIPEDIA_ENDPOINT, encoded)
def parse_wikipedia_response(response):
    '''Parse a Wikipedia API JSON response into wikipedia_redirects.

    Each redirect is recorded as source title -> target title.  Pages the
    API reports as missing (negative page ids in the 'pages' map) are
    recorded with the value None.  Returns the number of redirects found
    in this response.
    '''
    query = json.loads(response)['query']
    redirects = query.get('redirects', [])
    for redirect in redirects:
        wikipedia_redirects[redirect['from']] = redirect['to']
    # In the API's 'pages' map, a negative page id marks a missing page.
    for page_id, page in query.get('pages', {}).items():
        if int(page_id) < 0:
            wikipedia_redirects[page['title']] = None
    # Bug fix: the original computed len(0) -> TypeError whenever the
    # response contained no 'redirects' key; return the count (0 included).
    return len(redirects)
def do_wikipedia_requests():
    '''Query the Wikipedia API for every emoji character, in bunches.

    Sends one request per bunch of titles, feeds each successful (HTTP
    200) response to parse_wikipedia_response(), and sleeps between
    requests to be polite to the API.
    '''
    title_groups = Bunches(emoji, WIKIPEDIA_TITLE_MAX, WIKIPEDIA_TITLE_SEP)
    for title_group in title_groups:
        url = build_wikipedia_url(title_group)
        # Context manager ensures the HTTP connection is closed even if
        # parsing raises.
        with urllib.request.urlopen(url) as response:
            # Bug fix: the original used `status is 200`, an identity
            # comparison that only works because CPython caches small
            # ints; `==` is the correct, guaranteed comparison.
            if response.status == 200:
                parsed = parse_wikipedia_response(response.read().decode())
                print("Parsed {} redirects in this bunch.".format(parsed), flush=True)
        time.sleep(WIKIPEDIA_REQUEST_WAIT)
# Gotta catch 'em all
do_wikipedia_requests()
# Timestamp taken right after all requests completed; shown in the report.
wikipedia_requests_time = datetime.now()

# Make a list of missing articles (scratch snippet kept from the notebook):
#[(k, unicode_name(k).title()) for k, v in filter(lambda e: not e[1], wikipedia_redirects.items())]

# Markdown templates: md_line renders one entry per emoji; redirects_line
# is appended to the entry when the emoji's page redirects to an article.
md_line = '* [{}](https://en.wikipedia.org/wiki/{}?redirect=no "{}"){}'
redirects_line = ' redirects to **[{}](https://en.wikipedia.org/wiki/{})**'
def get_unicode_code_name(char):
    '''Return a "U+NNNN UNICODE CHAR NAME" formatted string for `char`.

    The code point is zero-padded to at least four hex digits, per the
    standard U+ notation.  (The original applied no padding, producing
    e.g. "U+41" instead of "U+0041" for 'A'; emoji code points have five
    digits and are unaffected.)
    '''
    return 'U+{:04X} {}'.format(ord(char), unicode_name(char))
def wiki_markdown():
    '''Render the collected redirect data as a Markdown report.

    Splits the emoji into those whose Wikipedia page exists (value in
    wikipedia_redirects is a redirect target) and those whose page is
    missing (value is None), then formats both lists plus the fetch
    timestamp into a single Markdown document.
    '''
    found_lines = []
    missing_lines = []
    for emoji_char in emoji:
        target = wikipedia_redirects[emoji_char]
        quoted_char = urllib.parse.quote(emoji_char)
        code_name = get_unicode_code_name(emoji_char)
        if target is not None:
            suffix = redirects_line.format(target, quoted_char)
        else:
            suffix = ''
        entry = md_line.format(emoji_char, quoted_char, code_name, suffix)
        if target is not None:
            found_lines.append(entry)
        else:
            missing_lines.append(entry)
    return '''# Emoji in Wikipedia (EN):
{}
# Emoji missing from Wikipedia (EN):
{}
Last updated on {}'''.format('\n'.join(found_lines), '\n'.join(missing_lines), wikipedia_requests_time)
print(wiki_markdown())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment