# JohnEmhoff/demo.py

## demo.py
import sys
import json
import urllib

import requests


def reddit(sub):
    """
    Fetch the JSON listing of a subreddit and return it as parsed data.

    sub is the subreddit name; it is interpolated as-is into the URL path.
    """
    response = requests.get('http://reddit.com/r/%s.json' % sub)
    listing = json.loads(response.text)
    return listing


def reddit_urls(js):
    """
    Collect the 'url' field of every child entry in a reddit json blob,
    in the order the children appear
    """
    found = []
    for child in js['data']['children']:
        found.append(child['data']['url'])
    return found


def embedly_extract(url, key):
    """
    Ask the embedly extract API about url and return the parsed json
    response. Any non-200 reply (e.g., HTTP 404) is reported on stderr
    and yields None instead.
    """
    endpoint = 'http://api.embed.ly/1/extract?url=%s&key=%s' % (
        urllib.quote(url), key)
    response = requests.get(endpoint)
    if response.status_code == 200:
        return json.loads(response.text)
    # anything other than 200 is treated as a failure for this URL
    sys.stderr.write('Failed to process URL %s :(\n' % url)
    return None


def sum_counts(item_lists):
    """
    Fold a list of item lists into one dict mapping item name -> total.

    Items are entity dicts such as {'name': 'Dennis', 'count': 10} or
    keyword dicts such as {'name': 'sports', 'score': 33}. An item with
    neither 'count' nor 'score' contributes 0.
    """
    totals = {}
    for items in item_lists:
        for entry in items:
            label = entry['name']
            # entities carry 'count', keywords carry 'score'; prefer
            # 'count' whenever the key is present
            if 'count' in entry:
                amount = entry['count']
            else:
                amount = entry.get('score', 0)
            totals[label] = totals.get(label, 0) + amount
    return totals


def top_n(item_dict, n):
    """
    Return the n highest-scoring entries of item_dict as a list of
    (name, score) tuples, ordered from highest score to lowest. The list
    is shorter than n when item_dict has fewer than n entries.
    """
    # items() instead of the Python 2-only iteritems(): it yields the same
    # pairs, and also works on Python 3 dicts and on any mapping that only
    # provides items()
    by_count = sorted(item_dict.items(), key=lambda x: x[1], reverse=True)
    return by_count[:n]


def main(subreddit, key):
    """
    Print the ten highest-scoring entities found across the links of the
    given subreddit, one "name score" pair per line. key is the embedly
    API key handed to embedly_extract.
    """
    listing = reddit(subreddit)
    extracted = []
    for link in reddit_urls(listing):
        extracted.append(embedly_extract(link, key))
    # change 'entities' to 'keywords' to check out the keyword extraction!
    # falsy results (such as None) are skipped
    per_page = [blob['entities'] for blob in extracted if blob]
    ranked = top_n(sum_counts(per_page), 10)
    lines = ['%s %s' % (pair[0], pair[1]) for pair in ranked]
    print('\n'.join(lines))


if __name__ == '__main__':
    # argv[0] is the script itself, so both required arguments are present
    # only when argv holds at least three entries
    if len(sys.argv) <= 2:
        sys.stderr.write('Usage: demo.py <subreddit> <embedly-key>\n')
    else:
        # argv[1:3] is exactly (subreddit, embedly key) in this branch
        main(*sys.argv[1:3])
	import sys
	import json
	import urllib

	import requests


	def reddit(sub):
		"""
		Retrieve the json representation of the given subreddit.

		sub is the subreddit name, interpolated as-is into the URL path.
		Returns the parsed json body of the response.
		"""
		url = 'http://reddit.com/r/%s.json' % sub
		resp = requests.get(url)
		return json.loads(resp.text)


	def reddit_urls(js):
		"""
		Return all the URLs in the given json blob of reddit data.

		Reads the 'url' field of each entry under js['data']['children']
		and returns them as a list in the same order.
		"""
		return [x['data']['url'] for x in js['data']['children']]


	def embedly_extract(url, key):
		"""
		Call the embedly API and return the json blob. Returns None if there
		is an error processing the URL (e.g., HTTP 404)

		url is percent-quoted before being placed in the query string; key
		is inserted unchanged.
		"""
		quoted = urllib.quote(url)
		api = 'http://api.embed.ly/1/extract?url=%s&key=%s' % (quoted, key)
		resp = requests.get(api)
		if resp.status_code != 200:
			# report the failure on stderr and signal it with None
			sys.stderr.write('Failed to process URL %s :(\n' % url)
			return None
		return json.loads(resp.text)


	def sum_counts(item_lists):
		"""
		Takes a list of lists of items and mashes them all together into a dict
		mapping item_name -> item_score. The input items are either entity or
		keyword lists, which look like {'name': 'Dennis', 'count': 10} or
		{'name': 'sports', 'score': 33}, respectively.

		An item with neither 'count' nor 'score' contributes 0.
		"""
		ret = {}
		for item_list in item_lists:
			for item in item_list:
				# get either score or count so we can work with both
				# keywords and entities
				name, count = item['name'], item.get('count', item.get('score', 0))
				ret[name] = ret.get(name, 0) + count
		return ret


	def top_n(item_dict, n):
		"""
		Return the n highest-scoring entries of item_dict as a list of
		(name, score) tuples, ordered from highest score to lowest.
		"""
		# items() instead of the Python 2-only iteritems(): same pairs, and
		# it also works on Python 3 dicts
		by_count = sorted(item_dict.items(), key=lambda x: x[1],
			reverse=True)
		return by_count[:n]


	def main(subreddit, key):
		"""
		Print the ten highest-scoring entities found across the links of
		the given subreddit, one "name score" pair per line. key is the
		embedly API key passed to embedly_extract.
		"""
		urls = reddit_urls(reddit(subreddit))
		embeds = [embedly_extract(url, key) for url in urls]
		# change 'entities' to 'keywords' to check out the keyword extraction!
		# falsy results (such as None) are skipped
		items = [x['entities'] for x in embeds if x]
		all_counts = sum_counts(items)
		top = top_n(all_counts, 10)
		# single parenthesized argument: same output under the Python 2
		# print statement, and valid as a Python 3 function call
		print('\n'.join('%s %s' % (x[0], x[1]) for x in top))


	if __name__ == '__main__':
		# both the subreddit name and the embedly key are required
		if len(sys.argv) < 3:
			sys.stderr.write('Usage: demo.py <subreddit> <embedly-key>\n')
		else:
			main(sys.argv[1], sys.argv[2])