Skip to content

Instantly share code, notes, and snippets.

@nrubin
Created October 2, 2014 23:13
Show Gist options
  • Save nrubin/7d9656b3b11c5b3ab383 to your computer and use it in GitHub Desktop.
Save nrubin/7d9656b3b11c5b3ab383 to your computer and use it in GitHub Desktop.
Downloads a bunch of hipster images from some website. Resumable. Can throttle if you want, but not enabled right now.
#let's download some hipster images
import requests
from bs4 import BeautifulSoup
import numpy as np
import time
import os
def throttle(mean_ms=200.0, std_ms=80.0):
    """Sleep for a random, normally distributed number of milliseconds.

    The delay is drawn from a normal distribution and redrawn from the
    *same* distribution until it is non-negative, then announced on stdout
    and slept.

    Args:
        mean_ms: Mean of the delay distribution, in milliseconds.
        std_ms: Standard deviation of the delay distribution, in
            milliseconds.
    """
    if std_ms <= 0:
        # With no spread every draw equals the mean, so redrawing a negative
        # value would loop forever; clamp to zero instead.
        t = max(float(mean_ms), 0.0)
    else:
        t = std_ms * np.random.randn() + mean_ms
        while t < 0:
            # time.sleep() rejects negative durations, so draw again.
            t = std_ms * np.random.randn() + mean_ms
    print("throttling for %f ms" % (t))
    time.sleep(t / 1000.0)
def get(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the server before giving up; handed to
            ``requests.get``. Without it a stalled connection would block
            the whole crawl indefinitely.

    Returns:
        ``response.text`` when the status code is 200, otherwise ``None``.

    Exceptions raised by ``requests`` (including timeouts) are not caught
    here and propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.text
def anchors_with_links(tag):
    """Return True when *tag* is an ``<a>`` element whose href mentions "photos".

    Intended as a filter callable for ``soup.find_all``.

    Args:
        tag: Element exposing ``.name`` and dict-style attribute access.

    Returns:
        bool: True only for anchors whose ``href`` contains "photos".
    """
    # Anchors without an href (e.g. <a name="top">) must be rejected, not
    # raise KeyError, so fall back to an empty string.
    return (tag.name == "a") and ("photos" in tag.get("href", ""))
def index_photo(photo_links, photo_uri):
    """Record the (photo id, absolute URL) pair for one photo link.

    The id is the path segment that directly follows "/photos/", so
    "/photos/<id>/download" yields "<id>". Links that do not contain
    "/photos/" or have nothing after it are ignored rather than producing
    a mangled id.

    Args:
        photo_links: Set collecting ``(photo_id, full_uri)`` tuples;
            mutated in place.
        photo_uri: Site-relative href of the photo link.
    """
    marker = "/photos/"
    marker_at = photo_uri.find(marker)
    if marker_at == -1:
        return
    # Take the segment up to the next "/" instead of trimming a fixed
    # number of trailing characters.
    photo_id = photo_uri[marker_at + len(marker):].split("/", 1)[0]
    if not photo_id:
        return
    full_uri = "https://unsplash.com" + photo_uri
    photo_links.add((photo_id, full_uri))
def parse(photo_links, text):
    """Collect every photo link found in an HTML page.

    Args:
        photo_links: Set handed to ``index_photo`` for each matching
            anchor; mutated in place.
        text: HTML source of the page.
    """
    # Name the parser explicitly: otherwise bs4 picks whichever parser is
    # installed, so results can differ between machines, and it emits a
    # warning.
    soup = BeautifulSoup(text, "html.parser")
    for anchor in soup.find_all(anchors_with_links):
        index_photo(photo_links, anchor['href'])
def file_exists(filename):
    """Tell whether *filename* names an existing regular file."""
    is_regular_file = os.path.isfile(filename)
    return is_regular_file
def download_photo(photo_tuple, path):
    """Download one photo to disk unless it is already there.

    Args:
        photo_tuple: ``(photo_id, photo_url)`` pair; the file is saved as
            ``<photo_id>.jpg``.
        path: Destination directory. An empty value selects "./Images/",
            which is what the existing caller relies on.
    """
    filename = photo_tuple[0] + ".jpg"
    photo_url = photo_tuple[1]
    directory = path or "./Images/"
    target = os.path.join(directory, filename)
    if file_exists(target):
        print("%s already downloaded, skipping..." % (filename))
        return
    if not os.path.isdir(directory):
        os.makedirs(directory)
    r = requests.get(photo_url, stream=True, timeout=30)
    try:
        if r.status_code != 200:
            print("could not download %s (HTTP %d)" % (filename, r.status_code))
            return
        # Stream into a temporary name and rename at the end, so an
        # interrupted run never leaves a truncated .jpg that a later run
        # would skip as "already downloaded".
        partial = target + ".part"
        with open(partial, 'wb') as f:
            for chunk in r.iter_content(1024):
                f.write(chunk)
        os.rename(partial, target)
    finally:
        # Release the streamed connection even on early return or error.
        r.close()
def mark_page_done(num):
    """Create the empty marker file that records page *num* as finished.

    The marker is "./Images/.page-<num>". An existing marker is left
    untouched.

    Args:
        num: Integer page number.
    """
    directory = "./Images/"
    # The directory may not exist yet (e.g. a page that yielded no photos
    # on a fresh run), and open() would fail without it.
    if not os.path.isdir(directory):
        os.makedirs(directory)
    marker = os.path.join(directory, ".page-%d" % (num))
    # Append mode creates the file when missing without truncating it.
    with open(marker, 'a'):
        pass
def page_done(num):
    """Report whether page *num* already has its completion marker.

    Args:
        num: Integer page number.

    Returns:
        bool: True when "./Images/.page-<num>" exists as a regular file.
    """
    return os.path.isfile(os.path.join("./Images/", ".page-%d" % (num)))


def get_all_photos(start_page=1, stop_page=1000):
    """Crawl the photo grid page by page and download every photo found.

    Pages that already have a completion marker are skipped without being
    fetched, which is what makes an interrupted run resumable. The crawl
    stops at the first page that cannot be fetched.

    Args:
        start_page: First page number to visit.
        stop_page: Page number at which to stop (exclusive).
    """
    for ind in range(start_page, stop_page):
        if page_done(ind):
            # Finished in an earlier run: move on to the next page rather
            # than re-downloading it or ending the crawl.
            continue
        url = "http://unsplash.com/grid?_=1412274378042&page=%d" % (ind)
        print("getting url %d" % (ind))
        text = get(url)
        if text is None:
            # Non-200 response: report the page we stopped at and quit.
            print(ind)
            return
        photo_links = set()
        parse(photo_links, text)
        total = len(photo_links)
        for progress, photo in enumerate(photo_links, 1):
            download_photo(photo, "")
            print("downloaded photo %d of %d" % (progress, total))
        mark_page_done(ind)
if __name__ == "__main__":
    # Start the crawl only when executed as a script, not on import.
    get_all_photos()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment