edsu/unshorten_tweet_urls.py

## unshorten_tweet_urls.py
#!/usr/bin/env python

"""
Feed this program line-oriented JSON tweet data (as received from the API)
on STDIN and get unshortened URLs mentioned in the tweets on STDOUT.

This module will look up multiple URLs at once using the multiprocessing
library. Change CONCURRENCY to use more or fewer processes; it defaults to 10.
"""

# Number of worker processes in the pool that performs the URL lookups.
CONCURRENCY = 10

import json
import requests
import fileinput
import multiprocessing

# Cache used by unshorten(): maps a URL that was looked up to the URL it
# resolved to (or to itself when the lookup did not succeed).
# NOTE: unshorten() runs inside multiprocessing.Pool workers, so every worker
# process fills its own copy of this dict; entries are not shared between
# workers.
seen = {}

def unshorten(url, timeout=30):
    """Resolve *url* by requesting it and return the URL the request ended on.

    Results are memoized in the module-level ``seen`` dict, so a URL that
    was already looked up in this process is answered without a request.

    url     -- the (possibly shortened) URL to resolve.
    timeout -- seconds to wait on the server before giving up; passed
               straight to ``requests.get``.

    Returns the final response's URL when its status code is 200. On any
    other status, or when the request fails, returns *url* unchanged.
    Lookup errors are swallowed on purpose: this is a best-effort helper.
    """
    if url in seen:
        return seen[url]

    new_url = url

    try:
        # Without a timeout a single unresponsive host would block this
        # worker indefinitely.
        r = requests.get(url, timeout=timeout)
        if r.status_code == 200:
            new_url = r.url
    except Exception:
        # Best effort: fall back to the original URL. Unlike a bare
        # ``except:`` this still lets KeyboardInterrupt and SystemExit
        # propagate, so the program can be stopped.
        pass

    # Failures are cached as well, so a dead link is only tried once.
    seen[url] = new_url

    return new_url

def urls():
    """Yield the ``expanded_url`` of every URL entity found in the input.

    Input is read with ``fileinput.input()``, i.e. from the files named on
    the command line or from STDIN when none are given. Each non-blank line
    must be one JSON object.

    Blank lines are skipped, and objects without ``entities`` or without
    ``entities["urls"]`` contribute nothing, so captures that mix tweets
    with other messages do not abort the run. A non-blank line that is not
    valid JSON still raises ``ValueError`` from ``json.loads``.
    """
    for line in fileinput.input():
        if not line.strip():
            # Nothing to parse; json.loads would raise on an empty string.
            continue
        tweet = json.loads(line)
        # ``or`` also covers keys that are present but set to null.
        entities = tweet.get("entities") or {}
        for url in entities.get("urls") or []:
            yield url["expanded_url"]

if __name__ == "__main__":
    # Fan the lookups out over CONCURRENCY worker processes.
    pool = multiprocessing.Pool(processes=CONCURRENCY)
    # imap_unordered hands back each result as soon as it is ready, so the
    # output order is not tied to the input order.
    for url in pool.imap_unordered(unshorten, urls()):
        # The parenthesized single-argument form prints the same thing on
        # Python 2 and Python 3; the bare ``print url`` statement is a
        # SyntaxError on Python 3.
        print(url)
    # No more work will be submitted; wait for the workers to exit so the
    # pool is shut down cleanly before the program ends.
    pool.close()
    pool.join()
	#!/usr/bin/env python

	"""
	Feed this program line-oriented JSON tweet data (as received from the API)
	on STDIN and get unshortened URLs mentioned in the tweets on STDOUT.

	This module will look up multiple URLs at once using the multiprocessing
	library. Change CONCURRENCY to use more or fewer processes; it defaults to 10.
	"""

	# Number of worker processes in the pool that performs the URL lookups.
	CONCURRENCY = 10

	import json
	import requests
	import fileinput
	import multiprocessing

	# Cache used by unshorten(): maps a URL that was looked up to the URL it
	# resolved to (or to itself when the lookup did not succeed).
	# NOTE: unshorten() runs inside multiprocessing.Pool workers, so every
	# worker process fills its own copy of this dict; entries are not shared
	# between workers.
	seen = {}

	def unshorten(url, timeout=30):
	    """Resolve *url* by requesting it and return the URL the request ended on.

	    Results are memoized in the module-level ``seen`` dict, so a URL that
	    was already looked up in this process is answered without a request.

	    url     -- the (possibly shortened) URL to resolve.
	    timeout -- seconds to wait on the server before giving up; passed
	               straight to ``requests.get``.

	    Returns the final response's URL when its status code is 200. On any
	    other status, or when the request fails, returns *url* unchanged.
	    Lookup errors are swallowed on purpose: this is a best-effort helper.
	    """
	    if url in seen:
	        return seen[url]

	    new_url = url

	    try:
	        # Without a timeout a single unresponsive host would block this
	        # worker indefinitely.
	        r = requests.get(url, timeout=timeout)
	        if r.status_code == 200:
	            new_url = r.url
	    except Exception:
	        # Best effort: fall back to the original URL. Unlike a bare
	        # ``except:`` this still lets KeyboardInterrupt and SystemExit
	        # propagate, so the program can be stopped.
	        pass

	    # Failures are cached as well, so a dead link is only tried once.
	    seen[url] = new_url

	    return new_url

	def urls():
	    """Yield the ``expanded_url`` of every URL entity found in the input.

	    Input is read with ``fileinput.input()``, i.e. from the files named on
	    the command line or from STDIN when none are given. Each non-blank
	    line must be one JSON object.

	    Blank lines are skipped, and objects without ``entities`` or without
	    ``entities["urls"]`` contribute nothing, so captures that mix tweets
	    with other messages do not abort the run. A non-blank line that is
	    not valid JSON still raises ``ValueError`` from ``json.loads``.
	    """
	    for line in fileinput.input():
	        if not line.strip():
	            # Nothing to parse; json.loads would raise on an empty string.
	            continue
	        tweet = json.loads(line)
	        # ``or`` also covers keys that are present but set to null.
	        entities = tweet.get("entities") or {}
	        for url in entities.get("urls") or []:
	            yield url["expanded_url"]

	if __name__ == "__main__":
	    # Fan the lookups out over CONCURRENCY worker processes.
	    pool = multiprocessing.Pool(processes=CONCURRENCY)
	    # imap_unordered hands back each result as soon as it is ready, so
	    # the output order is not tied to the input order.
	    for url in pool.imap_unordered(unshorten, urls()):
	        # The parenthesized single-argument form prints the same thing on
	        # Python 2 and Python 3; the bare ``print url`` statement is a
	        # SyntaxError on Python 3.
	        print(url)
	    # No more work will be submitted; wait for the workers to exit so the
	    # pool is shut down cleanly before the program ends.
	    pool.close()
	    pool.join()