Modified a script I found in /r/learnpython
"""
Un-scrape
Downloads images from the ~NEW~ section on Unsplash.com and stores them for later use
Created by Roulx
Modified by midelh
"""
# I'm on Python 3 on this PC, so this import lets me use print() as a
# function while the script still runs under Python 2
from __future__ import print_function
import json  # json over csv, personal preference
import os
import os.path as path  # I like to make a shortcut to this
import sys  # for sys.stderr
import time  # take it slow when scraping someone else's bandwidth

# External PyPI dependencies
import bs4
import requests


def main():
    """Main runtime."""
    # Document functions using docstrings.
    # I avoid globals, because they lead to side effects.
    # Constants should be upper-cased.
    # Changed to be universal for Mac OS X, Linux and Windows.
    SAVE_DIRECTORY = path.join(path.expanduser('~'), 'Desktop', 'Unslash')
    URL = 'https://unsplash.com'
    try:
        create_save_directory(SAVE_DIRECTORY)
        check_log_file(SAVE_DIRECTORY)
        res = fetch(URL + '/new')
        links = find_low_res_links(make_soup(res.text))
        hi_res = generate_hi_res_links(URL, links)
        download_images(SAVE_DIRECTORY, hi_res)
        print("Success!")
    except Exception as e:  # Broad catch; finer-grained handling still needed
        print("There was an unhandled error.\n", e, file=sys.stderr)


def create_save_directory(directory):
    """Check to see if the save folder exists; if not, make it."""
    print('Setting up save folder.', file=sys.stderr)
    # If this fails, let the program fail;
    # we can handle its error higher up.
    os.makedirs(directory, mode=0o755, exist_ok=True)
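    # NB (my note, not in the original): exist_ok= is Python 3.2+, so despite
    # the print_function import above, this line makes the script effectively
    # Python-3-only. On Python 2 you would catch OSError around os.makedirs().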


def check_log_file(directory, log_file='log.json'):
    """Make sure the log file exists, seeding it with empty JSON if not."""
    log = path.join(directory, log_file)
    try:
        fp = open(log, 'r')
    except IOError:
        # No log yet: create one holding an empty JSON object
        fp = open(log, 'w')
        fp.write('{}')
        fp.flush()
    finally:
        fp.close()


def fetch(url):
    """Fetch a URL via requests and return its response."""
    res = requests.get(url)
    res.raise_for_status()  # Turn 4xx/5xx responses into exceptions
    return res
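
# A defensive tweak worth considering (my addition, not in the original
# script): requests.get() uses no timeout by default, so a stalled server can
# hang the scraper forever. requests.get(url, timeout=10) would raise
# requests.exceptions.Timeout after 10 seconds instead.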


def make_soup(html):
    """Create a BeautifulSoup object from HTML text."""
    return bs4.BeautifulSoup(html, 'html.parser')
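
# Note: 'html.parser' ships with the standard library; if lxml happens to be
# installed, bs4.BeautifulSoup(html, 'lxml') parses the same markup faster.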


def find_low_res_links(soup):
    """Get the low-resolution link for each image on the front page."""
    return [image_link.get("href")
            for image_link in soup.findAll("a", class_="photo__image-container")]


def generate_hi_res_links(base_url, links):
    """Get the hi-res link for each image."""
    hi_res = []
    for link in links:
        image_url = base_url + link
        res = fetch(image_url)
        soup = make_soup(res.text)
        for img in soup.findAll('img', class_='single-photo__fake-image'):
            # Grab every src, minus its query string
            src = img.get('src').split('?')[0]
            print("Found", src)
            hi_res.append(src)
        print('Giving their servers a 5 second break.', file=sys.stderr)
        time.sleep(5)
    return hi_res
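
# Splitting on '?' above drops the sizing parameters from each image URL, so
# the bare link should point at the full-size original. (An assumption about
# how the site serves images; keep the parameters for a fixed width instead.)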


def download_images(save_directory, hi_res_links):
    """Download each hi-res photo using its link."""
    print("Starting download of the high res images.")
    # Why I prefer json or pickle: the log loads straight back into a dict
    with open(path.join(save_directory, "log.json"), "r") as json_input:
        log = json.load(json_input)
    # Now we have a dictionary. Its keys are hashed, so the membership
    # test against it below is a quick lookup.
    try:
        data = log['data']
    except KeyError:
        data = log['data'] = {}
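    # An equivalent one-liner for the try/except above (my note, not in the
    # original): data = log.setdefault('data', {})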
    for link in hi_res_links:
        print(link, file=sys.stderr)
        filename = link.split('/')[-1]
        short_name = filename[6:]
        if short_name in data:
            print("Image already downloaded! Skipping.", file=sys.stderr)
        else:
            status = save_image(link, path.join(save_directory, filename + '.jpg'))
            data[short_name] = status  # Log the status codes
            print("{} => {}".format(short_name, status))
            print("Starting the next download in 5 seconds.")
            time.sleep(5)  # Give their bandwidth a break
    with open(path.join(save_directory, "log.json"), "w") as json_output:
        json.dump(log, json_output, indent=2)


def save_image(link, dest_file):
    """Stream the image at `link` to `dest_file`; return the status code."""
    res = requests.get(link, stream=True)
    if res.status_code == 200:
        with open(dest_file, 'wb') as f:
            # Write in 8 KiB chunks instead of the 128-byte iterator default
            for chunk in res.iter_content(chunk_size=8192):
                f.write(chunk)
    return res.status_code


if __name__ == '__main__':
    main()
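
# To run (only the script's filename below is my guess; the pip package names
# are the real ones):
#   pip install requests beautifulsoup4
#   python unscrape.py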