Created
December 20, 2014 22:42
-
-
Save jmduke/6281ef0e8c7ef8fa466e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging
import sys
import time
from Queue import Queue, Empty
from threading import Thread, active_count

import pytumblr
# Tumblr API client.  All four OAuth credential slots (consumer key/secret,
# token/secret) must be replaced with real values before running.
client = pytumblr.TumblrRestClient(
    'FILLINYOUROWNDAMNVALUES',
    'FILLINYOUROWNDAMNVALUES',
    'FILLINYOUROWNDAMNVALUES',
    'FILLINYOUROWNDAMNVALUES'
)

# Thread-safe queue of usernames still waiting to be crawled.
crawlable_names = Queue()
# Usernames already discovered; shared (unlocked) across all worker threads.
crawled_names = set()
# Global shutdown flag: workers poll this so Ctrl+C can stop the script.
running_app = True
def worker():
    """Consume usernames from the shared queue until told to stop.

    Runs in a worker thread.  Loops while the global ``running_app`` flag
    is set and fewer than 5000 names have been collected.

    Bug fix: the original called the blocking ``get()``, so when the queue
    was empty the loop condition (and therefore the Ctrl+C shutdown flag
    this comment promises to honor) was never re-checked.  Poll with a
    timeout instead, and balance ``task_done()`` in ``finally`` so the
    queue's unfinished-task count stays correct even if processing raises.
    """
    # This allows us to intercept Ctrl+Cs and actually shut down the script.
    while running_app and len(crawled_names) < 5000:
        try:
            username = crawlable_names.get(timeout=1)
        except Empty:
            # Queue drained for now; loop around to re-check the flag.
            continue
        try:
            process_username(username)
        finally:
            crawlable_names.task_done()
def process_username(username):
    """Fetch one user's posts and enqueue any newly-seen reblog sources.

    Called concurrently from many worker threads; ``crawled_names`` and
    ``crawlable_names`` are shared with them.  Usernames whose posts
    cannot be fetched are logged and skipped.
    """
    logging.warning("Processing {} / {}".format(username, len(crawled_names)))
    # Grab posts from tumblr.
    response = client.posts(username)
    # Sometimes the usernames are bad; let's just ignore them.
    if 'posts' not in response:
        logging.warning("Posts not found for name: {}".format(username))
        return
    for post in response['posts']:
        # Only grab reblogs.
        if 'source_url' not in post:
            continue
        url = post['source_url']
        # Don't go circular.
        if username in url:
            continue
        # Sources can be external, so ignore those.
        if 'tumblr' not in url or "www.tumblr.com" in url:
            continue
        new_name = _extract_username(url)
        if new_name is None:
            logging.warning("Can't find username in url: {}".format(url))
            continue
        if new_name not in crawled_names:
            # Add it to crawled_names immediately to make sure other threads
            # don't try it.  (Fix: the original put() first and add()ed
            # second, leaving a window where another thread could enqueue
            # the same name twice.)
            crawled_names.add(new_name)
            crawlable_names.put(new_name)
            logging.info("Found new username: {}".format(new_name))
    logging.info("Finished processing {}. {} usernames left; {} usernames found.".format(username, crawlable_names.qsize(), len(crawled_names)))


def _extract_username(url):
    """Extract the blog name from a "http://{username}.tumblr.com/..." URL.

    Returns the username, or None when the URL doesn't have that shape.
    (Regex is probably a better way to do this.)
    """
    try:
        new_name = url.split(".tumblr.com")[0]
        # Strip the scheme; test "https" first because "http" is its prefix.
        if "https" in new_name:
            new_name = new_name.split("https://")[1]
        elif "http" in new_name:
            new_name = new_name.split("http://")[1]
        if "www" == new_name[0:3]:
            new_name = new_name.split(".")[1]
        return new_name
    except IndexError:
        # A split() didn't yield the expected pieces — malformed URL.
        # (Fix: was a bare except:, which would also hide real bugs.)
        return None
def spin_threads():
    """Start the pool of 50 crawler threads running ``worker``.

    Bug fix: the original set ``thread.daemon = (thread == 0)`` — comparing
    a Thread object to an int, which is always False — so no thread was
    ever a daemon and a worker blocked in the queue could keep the process
    alive after Ctrl+C.  All workers are daemonized so the interpreter can
    exit once the main thread finishes.
    """
    thread_count = 50
    for _ in range(thread_count):
        thread = Thread(target=worker)
        thread.daemon = True
        thread.start()
if __name__ == "__main__":
    try:
        # Seed the crawl with a single starting blog.
        crawlable_names.put("jmduke")
        spin_threads()
        # NOTE(review): active_count() includes the main thread itself, so
        # this condition is effectively always True — the loop really runs
        # until the 5000-name cap is reached (or Ctrl+C).
        while active_count() > 0 and len(crawled_names) < 5000:
            time.sleep(0.1)
            # The main thread also pitches in as a worker; worker() only
            # returns once the cap is hit or running_app is cleared.
            worker()
    except KeyboardInterrupt:
        # Tell all worker loops to wind down, then dump what we found.
        running_app = False
        with open("tumblr_usernames.csv", "w") as outfile:
            # Python 2: encode() produces a byte string for the text-mode
            # file — presumably to survive non-ASCII blog names.
            outfile.write("\n".join(crawled_names).encode('utf-8'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment