Created
December 20, 2014 22:42
-
-
Save jmduke/6281ef0e8c7ef8fa466e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging
import sys
import time
from Queue import Queue, Empty
from threading import Thread, active_count

import pytumblr
# Tumblr API client.  All four OAuth credential slots (consumer key/secret,
# token/secret) must be replaced with real values before running.
client = pytumblr.TumblrRestClient(
    'FILLINYOUROWNDAMNVALUES',
    'FILLINYOUROWNDAMNVALUES',
    'FILLINYOUROWNDAMNVALUES',
    'FILLINYOUROWNDAMNVALUES'
)

# Thread-safe queue of usernames still waiting to be crawled.
crawlable_names = Queue()
# Usernames already discovered; shared (unlocked) across all worker threads.
crawled_names = set()
# Global shutdown flag: workers poll this so Ctrl+C can stop the script.
running_app = True
def worker():
    """Consume usernames from the shared queue until told to stop.

    Runs in a worker thread.  Loops while the global ``running_app`` flag
    is set and fewer than 5000 names have been collected.

    Bug fix: the original called the blocking ``get()``, so when the queue
    was empty the loop condition (and therefore the Ctrl+C shutdown flag
    this comment promises to honor) was never re-checked.  Poll with a
    timeout instead, and balance ``task_done()`` in ``finally`` so the
    queue's unfinished-task count stays correct even if processing raises.
    """
    # This allows us to intercept Ctrl+Cs and actually shut down the script.
    while running_app and len(crawled_names) < 5000:
        try:
            username = crawlable_names.get(timeout=1)
        except Empty:
            # Queue drained for now; loop around to re-check the flag.
            continue
        try:
            process_username(username)
        finally:
            crawlable_names.task_done()
def process_username(username):
    """Fetch one user's posts and enqueue any newly-seen reblog sources.

    Called concurrently from many worker threads; ``crawled_names`` and
    ``crawlable_names`` are shared with them.  Usernames whose posts
    cannot be fetched are logged and skipped.
    """
    logging.warning("Processing {} / {}".format(username, len(crawled_names)))
    # Grab posts from tumblr.
    response = client.posts(username)
    # Sometimes the usernames are bad; let's just ignore them.
    if 'posts' not in response:
        logging.warning("Posts not found for name: {}".format(username))
        return
    for post in response['posts']:
        # Only grab reblogs.
        if 'source_url' not in post:
            continue
        url = post['source_url']
        # Don't go circular.
        if username in url:
            continue
        # Sources can be external, so ignore those.
        if 'tumblr' not in url or "www.tumblr.com" in url:
            continue
        new_name = _extract_username(url)
        if new_name is None:
            logging.warning("Can't find username in url: {}".format(url))
            continue
        if new_name not in crawled_names:
            # Add it to crawled_names immediately to make sure other threads
            # don't try it.  (Fix: the original put() first and add()ed
            # second, leaving a window where another thread could enqueue
            # the same name twice.)
            crawled_names.add(new_name)
            crawlable_names.put(new_name)
            logging.info("Found new username: {}".format(new_name))
    logging.info("Finished processing {}. {} usernames left; {} usernames found.".format(username, crawlable_names.qsize(), len(crawled_names)))


def _extract_username(url):
    """Extract the blog name from a "http://{username}.tumblr.com/..." URL.

    Returns the username, or None when the URL doesn't have that shape.
    (Regex is probably a better way to do this.)
    """
    try:
        new_name = url.split(".tumblr.com")[0]
        # Strip the scheme; test "https" first because "http" is its prefix.
        if "https" in new_name:
            new_name = new_name.split("https://")[1]
        elif "http" in new_name:
            new_name = new_name.split("http://")[1]
        if "www" == new_name[0:3]:
            new_name = new_name.split(".")[1]
        return new_name
    except IndexError:
        # A split() didn't yield the expected pieces — malformed URL.
        # (Fix: was a bare except:, which would also hide real bugs.)
        return None
def spin_threads():
    """Start the pool of 50 crawler threads running ``worker``.

    Bug fix: the original set ``thread.daemon = (thread == 0)`` — comparing
    a Thread object to an int, which is always False — so no thread was
    ever a daemon and a worker blocked in the queue could keep the process
    alive after Ctrl+C.  All workers are daemonized so the interpreter can
    exit once the main thread finishes.
    """
    thread_count = 50
    for _ in range(thread_count):
        thread = Thread(target=worker)
        thread.daemon = True
        thread.start()
if __name__ == "__main__":
    try:
        # Seed the crawl with a single starting blog.
        crawlable_names.put("jmduke")
        spin_threads()
        # NOTE(review): active_count() includes the main thread itself, so
        # this condition is effectively always True — the loop really runs
        # until the 5000-name cap is reached (or Ctrl+C).
        while active_count() > 0 and len(crawled_names) < 5000:
            time.sleep(0.1)
            # The main thread also pitches in as a worker; worker() only
            # returns once the cap is hit or running_app is cleared.
            worker()
    except KeyboardInterrupt:
        # Tell all worker loops to wind down, then dump what we found.
        running_app = False
        with open("tumblr_usernames.csv", "w") as outfile:
            # Python 2: encode() produces a byte string for the text-mode
            # file — presumably to survive non-ASCII blog names.
            outfile.write("\n".join(crawled_names).encode('utf-8'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment