Skip to content

Instantly share code, notes, and snippets.

@jmduke
Created December 20, 2014 22:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jmduke/6281ef0e8c7ef8fa466e to your computer and use it in GitHub Desktop.
Save jmduke/6281ef0e8c7ef8fa466e to your computer and use it in GitHub Desktop.
from Queue import Queue
from threading import Thread, active_count
import pytumblr
import sys
import time
import logging
# Shared Tumblr API client; process_username() issues every request through it.
# Replace the four placeholders with your own credentials before running.
# NOTE(review): presumably consumer key, consumer secret, OAuth token and
# OAuth secret, in that order -- confirm against the pytumblr documentation.
client = pytumblr.TumblrRestClient(
    'FILLINYOUROWNDAMNVALUES',
    'FILLINYOUROWNDAMNVALUES',
    'FILLINYOUROWNDAMNVALUES',
    'FILLINYOUROWNDAMNVALUES'
)
# Work queue of usernames that still have to be crawled.
crawlable_names = Queue()
# Every username discovered so far (added when it is queued); this set is
# what gets written to the output file at the end.
crawled_names = set()
# Set to False by the Ctrl+C handler so that the worker loops stop.
running_app = True
def worker():
    """Crawl usernames from the shared queue until shutdown or the name cap.

    Runs until ``running_app`` is cleared or 5000 usernames have been
    collected. Each username taken from ``crawlable_names`` is handed to
    ``process_username`` and then marked done on the queue.
    """
    # Function-scope import: the file header only imports Queue itself.
    from Queue import Empty

    poll_seconds = 1

    # This allows us to intercept Ctrl+Cs and actually shut down the script.
    while running_app and len(crawled_names) < 5000:
        try:
            # A get() without a timeout would block forever on an empty
            # queue and never re-evaluate the loop condition, so the worker
            # could neither notice a shutdown request nor the cap.
            username = crawlable_names.get(timeout=poll_seconds)
        except Empty:
            continue
        try:
            process_username(username)
        finally:
            # Keep the queue's unfinished-task count accurate even when
            # processing raises; the exception itself still propagates.
            crawlable_names.task_done()
def _extract_username(url):
    """Return the blog name from a "http(s)://{name}.tumblr.com/..." URL.

    Returns None when the URL has no "{name}.tumblr.com" host to read a
    name from (for example a URL that merely mentions tumblr in its path).
    """
    marker = ".tumblr.com"
    if marker not in url:
        return None
    host = url.split(marker)[0]
    # Strip the scheme only when it really is a prefix; a substring test
    # would misfire on blog names that happen to contain "http".
    for scheme in ("https://", "http://"):
        if host.startswith(scheme):
            host = host[len(scheme):]
            break
    # The blog name is the label directly in front of ".tumblr.com". Taking
    # the last label also drops a leading "www." without mangling blogs
    # whose own name starts with "www".
    name = host.rsplit(".", 1)[-1]
    # A slash means the marker was found in a path or query string rather
    # than in the host, so there is no blog name here.
    if not name or "/" in name:
        return None
    return name


def process_username(username):
    """Fetch a blog's posts and queue the blogs it reblogged from.

    Requests the posts of ``username`` through the shared ``client``. For
    every post that carries a ``source_url`` pointing at another
    ``*.tumblr.com`` blog, the source blog's name is added to
    ``crawled_names`` and put on ``crawlable_names`` unless it has been
    seen before. Returns None.
    """
    logging.warning("Processing {} / {}".format(username, len(crawled_names)))

    # Grab posts from tumblr.
    response = client.posts(username)

    # Sometimes the usernames are bad; let's just ignore them.
    if 'posts' not in response:
        logging.warning("Posts not found for name: {}".format(username))
        return

    for post in response['posts']:
        # Only grab reblogs.
        if 'source_url' not in post:
            continue
        url = post['source_url']

        # Don't go circular.
        if username in url:
            continue

        # Sources can be external, so ignore those.
        if 'tumblr' not in url or "www.tumblr.com" in url:
            continue

        new_name = _extract_username(url)
        if new_name is None:
            logging.warning("Can't find username in url: {}".format(url))
            continue

        if new_name not in crawled_names:
            # Record the name before queueing it, so other threads are less
            # likely to queue the same name again.
            crawled_names.add(new_name)
            crawlable_names.put(new_name)
            logging.info("Found new username: {}".format(new_name))

    logging.info("Finished processing {}. {} usernames left; {} usernames found.".format(username, crawlable_names.qsize(), len(crawled_names)))
def spin_threads():
    """Start the pool of 50 crawler threads, each running ``worker``."""
    pool_size = 50
    for _ in range(pool_size):
        crawler = Thread(target=worker)
        # Workers are explicitly non-daemon. This used to be written as a
        # comparison of the Thread object with 0, which is always False.
        # NOTE(review): if daemon threads were intended, change this
        # deliberately -- non-daemon workers keep the process alive until
        # they return.
        crawler.daemon = False
        crawler.start()
if __name__ == "__main__":
    # Script entry point: seed the queue, start the workers, wait for the
    # crawl to finish (or for Ctrl+C), then dump every collected username.
    try:
        # Blog the crawl starts from.
        crawlable_names.put("jmduke")
        spin_threads()
        # Wait in short sleeps, presumably so that the main thread stays
        # responsive to KeyboardInterrupt.
        # NOTE(review): active_count() also counts the main thread, so the
        # `> 0` test looks always true and only the 5000-name cap ends this
        # loop -- confirm.
        while active_count() > 0 and len(crawled_names) < 5000:
            time.sleep(0.1)
        # NOTE(review): the nesting of this call was lost with the file's
        # indentation; it is placed after the wait loop here. At this point
        # the cap has been reached, so worker()'s own loop condition is
        # already false -- confirm this matches the intended layout.
        worker()
    except KeyboardInterrupt:
        # Ask the worker loops to stop.
        running_app = False
    # Write one username per line, after a normal finish as well as after
    # Ctrl+C.
    with open("tumblr_usernames.csv", "w") as outfile:
        outfile.write("\n".join(crawled_names).encode('utf-8'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment