@msm595
Created January 1, 2014 17:49
Download every liked image on Tumblr to a folder. Requires Python 3, requests, and BeautifulSoup (bs4).
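Typical command-line usage, based on the flags defined in the script's argparse section (the email address and folder name below are placeholders; the target folder must already exist, and anything left out, such as the password, is prompted for interactively):

    python3 downtumblrlikes.py -e you@example.com -f likes -t 8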
# Either input info here, use command-line arguments, or be asked in-program.
email = ""
password = ""
folder = "" #remember to use / for directories, not \
threads = 8
########################################
##### downtumblrlikes.py v20140101 #####
########################################
###### DO NOT EDIT BELOW THIS LINE #####
########################################
import os, requests, time
from bs4 import BeautifulSoup as bs
from concurrent.futures import ThreadPoolExecutor
def downtumblrlikes(email, password, folder, threads):
    s = requests.Session()

    ## get form_key to login properly
    print('Getting form_key...')
    r = s.get('https://www.tumblr.com/login')
    formKey = r.text.split('<input type="hidden" name="form_key" value="')[1]
    formKey = formKey.split('"')[0]

    data = {'user[email]': email, 'user[password]': password, 'user[tos]': 1,
            'context': 'login', 'version': 'STANDARD', 'follow': '',
            'http_referer': 'https://www.tumblr.com/login', 'form_key': formKey}

    ## login
    print('Logging in...')
    r = s.post('https://www.tumblr.com/login', data=data)
    if 'logged_in' not in s.cookies or s.cookies['logged_in'] != '1':
        print('Error logging in.')
        return
    else:
        print('Successfully logged in.')

    ## drop the credentials once the session cookie is set
    data = {}
    email = ''
    password = ''

    ## get total pages
    r = s.get('http://www.tumblr.com/likes/page/1')
    totalPages = int(bs(r.text, 'html.parser').select('a.likes div')[0]['data-count']) // 10 + 1

    ald = set(os.listdir(folder))  # already downloaded files
    totalBytes = 0
    sTime = time.time()

    def parsePage(page):
        """Return the set of full-size image URLs found on one likes page."""
        imgs = []
        r = s.get('http://www.tumblr.com/likes/page/%d' % page, headers={
            'X-Requested-With': 'XMLHttpRequest'})
        soup = bs(r.text, 'html.parser')

        ## get pics from page (either from script or html)
        posts = soup.find_all("div", class_=lambda x: x in ['is_photo', 'is_photoset'])
        for p in posts:
            script = p('script')
            if not script:
                url = p.select('div.post_media img')[0]['src']
                imgs.append(url)
            else:
                script = script[0].text
                urls = script.split('high_res: \'')
                if len(urls) == 2:  # 1 url for photo
                    urls = [urls[1].split('\'')[0]]
                else:  # many urls for photoset
                    urls = (u.split('"')[0].replace('\\/', '/') for u
                            in script.split('high_res":"')[1:])
                imgs.extend(urls)
        return set(imgs)

    def downloadImage(i):
        """Download one image into folder; return its size, -2 if it already exists, -1 on error."""
        filename = i.split('/')[-1]
        if filename in ald:
            return -2  # file already exists

        r = s.get(i, stream=True)
        if r.status_code == 200:
            size = 0
            with open(folder + '/' + filename, 'wb') as file:
                for chunk in r.iter_content(1024 * 4):
                    file.write(chunk)
                size = os.fstat(file.fileno()).st_size
            return size  # file successfully downloaded
        else:
            return -1

    imgs = set()
    downloaded = set()
    existing = set()
    error = set()

    print("Grabbing img urls from %d pages:" % totalPages)
    with ThreadPoolExecutor(max_workers=threads) as executor:
        for page, i in zip(range(1, totalPages + 1),
                           executor.map(parsePage, range(1, totalPages + 1))):
            imgs |= i
            print('\tParsed page %d' % page)

    ## download images
    print('Downloading %d images:' % len(imgs))
    with ThreadPoolExecutor(max_workers=threads) as executor:
        for img, result in zip(imgs, executor.map(downloadImage, imgs)):
            name = img.split('/')[-1]
            if result == -2:
                print('\tAlready exists: %s' % name)
                existing.add(img)
            elif result == -1:
                print('\tError downloading: %s' % name)
                error.add(img)
            else:
                print('\tDownloaded: %s' % name)
                totalBytes += result
                downloaded.add(img)

    eTime = time.time()
    print('Done.')
    print('Downloaded %d new images.' % len(downloaded))
    print('Already had %d images.' % len(existing))
    print('Error downloading %d images: \n\t%s' % (len(error), "\n\t".join(error)))
    print('Downloaded %0.2fMB in %0.2fs (%0.2fMb/s)' %
          ((totalBytes / 2**20), (eTime - sTime), 8 * (totalBytes / 2**20) / (eTime - sTime)))

if __name__ == "__main__":
    import argparse, getpass

    parser = argparse.ArgumentParser(description='Download tumblr likes.',
                                     prog='downtumblrlikes.py')
    parser.add_argument('-e', '--email', metavar='email')
    parser.add_argument('-p', '--password', metavar='pass')
    parser.add_argument('-f', '--folder', metavar='folder')
    parser.add_argument('-t', '--threads', type=int, metavar='threads')
    parser.add_argument('-v', '--version', action='version', version='%(prog)s 20140101')
    args = parser.parse_args()

    email = args.email if args.email is not None else email
    password = args.password if args.password is not None else password
    folder = args.folder if args.folder is not None else folder
    threads = args.threads if args.threads is not None else threads

    if len(email) == 0:
        email = input('Email: ')
    if len(password) == 0:
        password = getpass.getpass('Password: ')
    if len(folder) == 0:
        folder = input('Folder: ')
    if threads < 1:
        threads = int(input('Threads: '))

    downtumblrlikes(email, password, folder, threads)
cafwf commented Apr 21, 2015

wtf is this?
i want to download my likes not this bunch of crap.

BrutalSimplicity commented Jul 25, 2016

Rude! Anyway, I've just made a script for this, but it uses the consumer secret and key to communicate with the tumblr API. However, this seems like it would work for a much more general case. I imagine it was a PITA to get all the scraping and automating login just right.

Nice job. I'll post mine later so you can see how I went about it, if you're interested at all.
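For reference, a rough sketch of the API-based approach described above, assuming the pytumblr client and OAuth credentials registered at https://www.tumblr.com/oauth/apps (all four key strings below are placeholders, and the actual downloading is left to requests, as in the gist):

import pytumblr

# Placeholder credentials -- substitute your own consumer key/secret and OAuth token pair.
client = pytumblr.TumblrRestClient(
    'CONSUMER_KEY', 'CONSUMER_SECRET', 'OAUTH_TOKEN', 'OAUTH_TOKEN_SECRET')

offset = 0
while True:
    resp = client.likes(limit=20, offset=offset)  # liked posts, 20 per request
    posts = resp.get('liked_posts', [])
    if not posts:
        break
    for post in posts:
        # photo posts carry a 'photos' list; each entry has an 'original_size' url
        for photo in post.get('photos', []):
            print(photo['original_size']['url'])  # feed these urls to the downloader
    offset += len(posts)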
