Skip to content

Instantly share code, notes, and snippets.

@lbenedix
Last active August 29, 2015 14:05
Show Gist options
  • Save lbenedix/1504161ec400d028f0c3 to your computer and use it in GitHub Desktop.
Save lbenedix/1504161ec400d028f0c3 to your computer and use it in GitHub Desktop.
import requests, multiprocessing, os
from bs4 import BeautifulSoup
# Template for the blog's paginated listing; '{}' is filled in with the
# page number (the crawl loop below starts at page 1).
url = 'http://beesandbombs.tumblr.com/page/{}'
def download_file( url, path='f:/beesandbombs', timeout=30 ):
    """Download ``url`` into directory ``path`` and return the local file name.

    The local name is ``path + '/' + <last path segment of url>``. If a file
    with that name already exists it is assumed to be a finished download and
    is returned without touching the network.

    The body is streamed into a temporary ``.part`` file and renamed into
    place only after the transfer succeeded, so an interrupted run never
    leaves a truncated file that a later run would mistake for a complete one.

    Args:
        url: Address of the file to fetch.
        path: Target directory; created if it does not exist.
        timeout: Seconds passed to ``requests.get`` as its connect/read
            timeout, so a stalled server cannot hang the worker forever.

    Returns:
        The path of the downloaded (or already present) file.

    Raises:
        requests.RequestException: On connection problems, timeouts or a
            non-2xx HTTP status (an error page is never saved as an image).
        OSError: If the target directory or file cannot be written.
    """
    local_filename = path + '/' + url.split( '/' )[-1]
    if os.path.isfile( local_filename ):
        return local_filename

    if path:
        # exist_ok avoids a race when several pool workers start at once.
        os.makedirs( path, exist_ok=True )

    # The pid suffix keeps concurrent worker processes from sharing a
    # temporary file if the same name is ever fetched twice in parallel.
    partial_filename = '{}.{}.part'.format( local_filename, os.getpid() )

    r = requests.get( url, stream=True, timeout=timeout )
    try:
        r.raise_for_status()
        print( '{} --> {}'.format( url, local_filename ) )
        with open( partial_filename, 'wb' ) as f:
            for chunk in r.iter_content( chunk_size=1024 ):
                if chunk:  # filter out keep-alive new chunks
                    f.write( chunk )
        # Atomic rename: the final name only ever refers to a complete file.
        os.replace( partial_filename, local_filename )
    finally:
        # Release the streamed connection and drop any half-written data.
        r.close()
        if os.path.exists( partial_filename ):
            os.remove( partial_filename )
    return local_filename
# Script entry point. The __main__ guard is also what keeps multiprocessing
# worker processes from re-running the crawl when they import this module.
if __name__ == '__main__':
    # Phase 1: walk the paginated listing and collect every GIF address.
    # A set is used so an image that appears on several pages is fetched once.
    img_set = set()
    p = 0
    while True:
        p += 1
        page_url = url.format( p )
        print( page_url )
        # The timeout keeps a stalled server from hanging the crawl forever.
        resp = requests.get( page_url, timeout=30 )
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup( resp.text, 'html.parser' )
        for img in soup.find_all( 'img' ):
            # Some <img> tags carry no src attribute; skip them instead of
            # failing with a KeyError.
            src = img.get( 'src' )
            if src and src.endswith( 'gif' ) and 'tumblr' in src:
                img_set.add( src )
        # Heuristic stop condition: a response shorter than 20000 characters
        # is presumably a page past the last post (NOTE(review): confirm the
        # threshold still matches the blog's current theme).
        if len( resp.text ) < 20000:
            break

    # Phase 2: download all collected images in parallel. The work is
    # network-bound, hence more workers than CPU cores.
    pool_size = multiprocessing.cpu_count() * 2
    pool = multiprocessing.Pool( processes=pool_size )
    try:
        pool.map( download_file, img_set )
    finally:
        # Always shut the pool down, even if a download raised.
        pool.close()
        pool.join()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment