@Skarlett
Last active June 11, 2018 08:21
Extensive, configurable Tumblr scraper (Python)
# tumblr_scraper_lib.py
import json
import time

import requests


def range_package(*args, **kwargs):
    """Yield consecutive (start, stop) pairs from a range(), e.g.
    list(range_package(0, 150, 50)) -> [(0, 50), (50, 100), (100, 150)].
    """
    last_var = None
    for i in range(*args, **kwargs):
        # 'if last_var and i' would silently drop any pair starting at 0
        if last_var is not None:
            yield last_var, i
        last_var = i


class TumblrScraperDummy:
    MAX_CHUNK_SIZE = 50       # the v1 read API serves at most 50 posts per request
    HTTP = requests.Session   # session factory; override to customise transport
    SLEEP_TIME = 0.2          # polite delay between paged requests

    def __init__(self, name):
        self.name = name
        self.session = self.HTTP()

    @property
    def url(self):
        return "http://{}.tumblr.com/api/read/json".format(self.name)

    def best_size_media(self, jpkg, lookfor='photo-url-'):
        """Return the variant of a photo post with the largest width suffix."""
        last = 0
        for k in jpkg:
            if k.startswith(lookfor):
                size = int(k.split('-')[-1])
                if size > last:
                    last = size
        return jpkg[lookfor + str(last)]

    def get_json(self, start=0, num=50):
        args = {
            'start': start,
            'num': num
        }
        r = self.session.get(self.url, params=args)
        # The endpoint returns JavaScript of the form
        # 'var tumblr_api_read = {...};' -- slice off the 22-character
        # prefix and the trailing semicolon to get plain JSON.
        text = r.text.strip()[22:-1]
        return json.loads(text)

    def scrape(self):
        results = set()
        jpkg = self.get_json()
        total_posts = int(jpkg['posts-total'])  # may arrive as a string
        for post in jpkg['posts']:
            item = self.gather(post)
            if item is not None:
                results.add(item)
        # Page through the rest; overshoot by one chunk so the final
        # partial page is not dropped, and pass a count (stop - start)
        # rather than the stop index as 'num'.
        for start, stop in range_package(self.MAX_CHUNK_SIZE,
                                         total_posts + self.MAX_CHUNK_SIZE,
                                         self.MAX_CHUNK_SIZE):
            time.sleep(self.SLEEP_TIME)
            for post in self.get_json(start, stop - start)['posts']:
                item = self.gather(post)
                if item is not None:
                    results.add(item)
        return results

    def gather(self, post):
        # Override in a subclass. Must return a hashable value (or None
        # to skip the post), since scrape() collects results into a set.
        return post.get('url')
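For reference, a photo post from the v1 read API looks roughly like the sketch below; the field values are illustrative, not captured output. best_size_media() scans the 'photo-url-*' keys and returns the variant with the largest width suffix.

# Illustrative photo post shape (values made up):
sample_post = {
    'type': 'photo',
    'url': 'http://example.tumblr.com/post/123456789',
    'photo-url-1280': 'http://media.tumblr.com/tumblr_abc_1280.jpg',
    'photo-url-500': 'http://media.tumblr.com/tumblr_abc_500.jpg',
    'photo-url-250': 'http://media.tumblr.com/tumblr_abc_250.jpg',
    'photo-url-75': 'http://media.tumblr.com/tumblr_abc_75sq.jpg',
}

scraper = TumblrScraperDummy('example')
assert scraper.best_size_media(sample_post).endswith('_1280.jpg')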
# Example consumer of tumblr_scraper_lib, routed over Tor.
import requests

import tumblr_scraper_lib


class ProxyPortal(requests.Session):
    # Let's make an easy way of using Tor. 'socks5h' (rather than
    # 'socks5') resolves DNS through the proxy as well; requires
    # PySocks (pip install requests[socks]).
    PROXY = {
        'http': 'socks5h://127.0.0.1:9050',
        'https': 'socks5h://127.0.0.1:9050'
    }

    def get(self, url, **kwargs):
        kwargs.pop('proxies', None)  # always force our Tor proxies
        return super().get(url, proxies=self.PROXY, **kwargs)
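
# A quick sanity check that requests really exit via Tor -- a sketch,
# assuming the check.torproject.org JSON endpoint (not part of this gist):
#
#     portal = ProxyPortal()
#     print(portal.get('https://check.torproject.org/api/ip').json())
#
# This should report {"IsTor": true, ...} while the Tor daemon is up.
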
class Tumblr(tumblr_scraper_lib.TumblrScraperDummy):
    # Now all HTTP GET requests go over the Tor network. (I'd hate to get IP banned.)
    HTTP = ProxyPortal

    def gather(self, post):
        # We can also override gather() to specify exactly what we want:
        # photo posts only, at the best available resolution.
        if post['type'].lower() == 'photo':
            return self.best_size_media(post)


if __name__ == '__main__':
    # Pretty cool, huh?
    results = []
    for name in ['my', 'giant', 'list', 'of', 'user', 'names']:
        user = Tumblr(name)
        results.append(user.scrape())
    print(results)
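scrape() only collects URLs. A minimal follow-up sketch for downloading them through the same Tor-routed session; the 'downloads' directory and the filename scheme here are assumptions for illustration, not part of the gist.

import os

def download_all(urls, dest='downloads'):
    # Fetch each gathered photo URL over Tor and write it to disk,
    # naming files after the last path segment of the URL (assumed scheme).
    os.makedirs(dest, exist_ok=True)
    session = ProxyPortal()
    for url in urls:
        r = session.get(url)
        if r.ok:
            filename = os.path.join(dest, url.rsplit('/', 1)[-1])
            with open(filename, 'wb') as fh:
                fh.write(r.content)

# Usage, continuing from the __main__ block above:
#     for user_results in results:
#         download_all(user_results)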