@doggan
Last active July 28, 2016 23:06
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
- Parse the raw html dump from http://emojitracker.com, extracting a list of
the most frequently used emoji unicode points.
- Ideally, this script could parse the HTML directly, but since they use
AJAX to render the contents, it's easier to just copy the DOM (in chrome)
to a file and parse the file.
- Scrape/download the emojis from http://apps.timwhitlock.info/emoji/tables/unicode#emoji-modal
using the emoji unicode points, and download them to a local file using the
unicode point as the filename.
- Protip:
- For resizing the images after download, try this:
mogrify -path images_out/ -resize 32x32 ./images/*.png
"""
import argparse
import os
import urllib.request

from bs4 import BeautifulSoup

def parse_args():
    parser = argparse.ArgumentParser(
        description='scrape emoji textures')
    parser.add_argument(
        'source', help='the input raw HTML dump (from http://emojitracker.com)')
    parser.add_argument(
        'output', help='the output path to write the resultant images')
    return parser.parse_args()

def parse_html(contents):
    print("### Parsing HTML...")

    with open(contents, 'r') as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    # Extract all the codes from the rankings section, preserving ranking
    # order. Links are in the form:
    #   <a href="/details/1F52B" title="PISTOL" data-id="1F52B">
    codes = []
    rankings = soup.find("section", {"id": "rankings"})
    for link in rankings.find_all('a'):
        data = link.get('data-id')
        if data:
            codes.append(data.lower())

    print("### %s codes found..." % len(codes))
    return codes
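
# Sketch of what parse_html() yields for the markup shape shown above
# (assuming the link sits inside the rankings <section>):
#   <a href="/details/1F52B" title="PISTOL" data-id="1F52B">  ->  "1f52b"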

def main():
    args = parse_args()
    source = args.source
    output = args.output

    # Prepare the output directory.
    if not os.path.exists(output):
        os.makedirs(output)

    codes = parse_html(source)

    # Example URL:
    #   http://apps.timwhitlock.info/static/images/emoji/emoji-apple/1f601.png
    base_url = "http://apps.timwhitlock.info/static/images/emoji/emoji-apple/"

    # Max # of results (by ranking) to download.
    COUNT = 500

    print("### Attempting to download %s files..." % COUNT)

    done_count = 0
    for code in codes:
        filename = code + '.png'
        url = base_url + filename
        outpath = os.path.join(output, filename)

        # Only download if the file hasn't been downloaded yet.
        if not os.path.isfile(outpath):
            urllib.request.urlretrieve(url, outpath)

        done_count += 1
        if done_count == COUNT:
            break

    print("### Finished writing %s files." % done_count)


if __name__ == "__main__":
    main()
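
# A minimal Python alternative to the mogrify protip in the docstring,
# assuming the Pillow package is installed (pip install Pillow); the
# directory names below are examples only:
#
#   from PIL import Image
#   import glob
#   import os
#
#   for path in glob.glob('./images/*.png'):
#       img = Image.open(path).resize((32, 32))
#       img.save(os.path.join('images_out', os.path.basename(path)))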