Created
March 19, 2016 05:35
-
-
Save brandonhesse/f49bcf44ee0c3d905e56 to your computer and use it in GitHub Desktop.
Modified a script I found in /r/learnpython
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Un-scrape | |
Downloads images from the ~NEW~ section on Unslash.com and stores them for later use | |
Created by Roulx | |
Modified by midelh | |
""" | |
# I'm in python3 on this PC, | |
# so this should allow me to use my print and you can still run this in py2 | |
from __future__ import print_function | |
import json #json over csv, personal preference | |
import os | |
import os.path as path # I like to make a shortcut to this | |
import sys # for sys.stderr | |
import time # Take it slow when scraping other's bandwidth | |
# External Pypy dependencies | |
import bs4 | |
import requests | |
def main():
    """Main runtime"""
    # Constants stay local and upper-cased -- no globals, so no side effects.
    # The save path is assembled portably for macOS, Linux and Windows.
    SAVE_DIRECTORY = path.join(path.expanduser('~'), 'Desktop', 'Unslash')
    URL = 'https://unsplash.com'
    try:
        create_save_directory(SAVE_DIRECTORY)
        check_log_file(SAVE_DIRECTORY)
        page = fetch(URL + '/new')
        low_res_links = find_low_res_links(make_soup(page.text))
        hi_res_links = generate_hi_res_links(URL, low_res_links)
        download_images(SAVE_DIRECTORY, hi_res_links)
        print("Success!")
    except Exception as err:  # Better error handling still needed
        print("There was an error not handled properly.\n", err, file=sys.stderr)
def create_save_directory(directory):
    """Check to see if save folder exists. If not, make it.

    Creates all intermediate directories as needed; succeeds silently if the
    directory already exists. Any other OS error propagates to the caller.
    """
    print('Setting up save folder.', file=sys.stderr)
    # If this fails, you should let the program fail -- the error is handled
    # higher up.  NOTE: the file advertises py2 compatibility (see the
    # __future__ import at the top), but makedirs(exist_ok=...) is Python 3
    # only; emulate exist_ok so the function runs on both.
    try:
        os.makedirs(directory, mode=0o755)
    except OSError:
        # Re-raise unless the directory already exists.
        if not path.isdir(directory):
            raise
def check_log_file(directory, log_file='log.json'):
    """Makes sure the logfile exists and is a proper json file.

    If the file cannot be opened for reading, it is (re)created containing an
    empty JSON object so later json.load calls succeed.  An existing file is
    left untouched.
    """
    log = path.join(directory, log_file)
    # EAFP: try to open for reading; seed the file only on failure.  The
    # original opened handles manually and, if the 'w' open itself raised,
    # `fp` was unbound and the finally-block crashed with NameError.
    # Context managers close the handle even when a write fails.
    try:
        with open(log, 'r'):
            pass
    except IOError:
        with open(log, 'w') as fp:
            fp.write('{}')
def fetch(url):
    """GET *url* via requests and return the response.

    Raises requests.HTTPError for 4xx/5xx status codes.
    """
    response = requests.get(url)
    response.raise_for_status()
    return response
def make_soup(html):
    """Parse *html* text into a BeautifulSoup tree (stdlib html.parser)."""
    soup = bs4.BeautifulSoup(html, 'html.parser')
    return soup
def find_low_res_links(soup):
    """Get the low resolution representation of each image off the front page"""
    anchors = soup.findAll("a", class_="photo__image-container")
    return [anchor.get("href") for anchor in anchors]
def generate_hi_res_links(base_url, links):
    """Get hi-res links for each image.

    Fetches each photo page under *base_url*, scrapes every
    'single-photo__fake-image' <img>, and strips the query string so the bare
    hi-res URL remains.  Sleeps 5 seconds between page fetches to be polite
    to the server's bandwidth.
    """
    hi_res = []
    for page_link in links:
        res = fetch(base_url + page_link)
        soup = make_soup(res.text)
        # grab all the src!  (The original reused the outer loop variable
        # `link` for the scraped URL, which worked but obscured the flow.)
        for img in soup.findAll('img', class_='single-photo__fake-image'):
            src = img.get('src').split('?')[0]
            print("Found", src)
            hi_res.append(src)
        print('Giving their servers a 5 second break.', file=sys.stderr)
        time.sleep(5)
    return hi_res
def download_images(save_directory, hi_res_links):
    """Download each hi-res photo using the link.

    Reads log.json from *save_directory* (a dict whose 'data' key maps a
    short image name to the HTTP status of its download attempt), skips
    images already recorded, downloads the rest via save_image, and writes
    the updated log back out.  Sleeps 5 seconds after each real download.
    """
    print("Starting download of the high res images.")
    # Why I prefer json or pickle: the log round-trips as a plain dict.
    log_path = path.join(save_directory, "log.json")
    with open(log_path, "r") as json_input:
        log = json.load(json_input)
    # setdefault replaces the original try/except KeyError dance; either way
    # `data` is a dict, so membership checks below are quick lookups.
    data = log.setdefault('data', {})
    for link in hi_res_links:
        print(link, file=sys.stderr)
        filename = link.split('/')[-1]
        # Drop the first 6 chars of the filename to form the log key
        # (presumably a fixed "photo-" style prefix -- TODO confirm).
        short_name = filename[6:]
        if short_name in data:
            print("Image already in downloaded! Skipping.", file=sys.stderr)
        else:
            status = save_image(link, path.join(save_directory, filename + '.jpg'))
            data[short_name] = status  # Lets log the status codes
            print("{} => {}".format(short_name, status))
            print("Starting the next download in 5 seconds.")
            time.sleep(5)  # Give their bandwidth a break
    with open(log_path, "w") as json_output:
        json.dump(log, json_output, indent=2)
def save_image(link, dest_file):
    """Stream the image at *link* into *dest_file*; return the HTTP status.

    The file is written only when the server answers 200.  The response is
    closed in all cases so the streamed connection is not leaked (with
    stream=True, requests keeps the connection open until the body is
    consumed or the response is closed).
    """
    res = requests.get(link, stream=True)
    try:
        if res.status_code == 200:
            with open(dest_file, 'wb') as f:
                # Iterating the response yields the body in small chunks.
                for chunk in res:
                    f.write(chunk)
        return res.status_code
    finally:
        res.close()
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment