Created
March 19, 2016 05:35
-
-
Save brandonhesse/f49bcf44ee0c3d905e56 to your computer and use it in GitHub Desktop.
Modified a script I found in /r/learnpython
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Un-scrape | |
Downloads images from the ~NEW~ section on Unslash.com and stores them for later use | |
Created by Roulx | |
Modified by midelh | |
""" | |
# I'm in python3 on this PC, | |
# so this should allow me to use my print and you can still run this in py2 | |
from __future__ import print_function | |
import json #json over csv, personal preference | |
import os | |
import os.path as path # I like to make a shortcut to this | |
import sys # for sys.stderr | |
import time # Take it slow when scraping other's bandwidth | |
# External Pypy dependencies | |
import bs4 | |
import requests | |
def main():
    """Main runtime"""
    # Constants stay local and upper-cased -- no globals, so no side effects.
    # The save path is assembled portably for macOS, Linux and Windows.
    SAVE_DIRECTORY = path.join(path.expanduser('~'), 'Desktop', 'Unslash')
    URL = 'https://unsplash.com'
    try:
        create_save_directory(SAVE_DIRECTORY)
        check_log_file(SAVE_DIRECTORY)
        page = fetch(URL + '/new')
        low_res_links = find_low_res_links(make_soup(page.text))
        hi_res_links = generate_hi_res_links(URL, low_res_links)
        download_images(SAVE_DIRECTORY, hi_res_links)
        print("Success!")
    except Exception as err:  # Better error handling still needed
        print("There was an error not handled properly.\n", err, file=sys.stderr)
def create_save_directory(directory):
    """Check to see if save folder exists. If not, make it.

    Creates all intermediate directories as needed; succeeds silently if the
    directory already exists. Any other OS error propagates to the caller.
    """
    print('Setting up save folder.', file=sys.stderr)
    # If this fails, you should let the program fail -- the error is handled
    # higher up.  NOTE: the file advertises py2 compatibility (see the
    # __future__ import at the top), but makedirs(exist_ok=...) is Python 3
    # only; emulate exist_ok so the function runs on both.
    try:
        os.makedirs(directory, mode=0o755)
    except OSError:
        # Re-raise unless the directory already exists.
        if not path.isdir(directory):
            raise
def check_log_file(directory, log_file='log.json'):
    """Makes sure the logfile exists and is a proper json file.

    If the file cannot be opened for reading, it is (re)created containing an
    empty JSON object so later json.load calls succeed.  An existing file is
    left untouched.
    """
    log = path.join(directory, log_file)
    # EAFP: try to open for reading; seed the file only on failure.  The
    # original opened handles manually and, if the 'w' open itself raised,
    # `fp` was unbound and the finally-block crashed with NameError.
    # Context managers close the handle even when a write fails.
    try:
        with open(log, 'r'):
            pass
    except IOError:
        with open(log, 'w') as fp:
            fp.write('{}')
def fetch(url):
    """GET *url* via requests and return the response.

    Raises requests.HTTPError for 4xx/5xx status codes.
    """
    response = requests.get(url)
    response.raise_for_status()
    return response
def make_soup(html):
    """Parse *html* text into a BeautifulSoup tree (stdlib html.parser)."""
    soup = bs4.BeautifulSoup(html, 'html.parser')
    return soup
def find_low_res_links(soup):
    """Get the low resolution representation of each image off the front page"""
    anchors = soup.findAll("a", class_="photo__image-container")
    return [anchor.get("href") for anchor in anchors]
def generate_hi_res_links(base_url, links):
    """Get hi-res links for each image.

    Fetches each photo page under *base_url*, scrapes every
    'single-photo__fake-image' <img>, and strips the query string so the bare
    hi-res URL remains.  Sleeps 5 seconds between page fetches to be polite
    to the server's bandwidth.
    """
    hi_res = []
    for page_link in links:
        res = fetch(base_url + page_link)
        soup = make_soup(res.text)
        # grab all the src!  (The original reused the outer loop variable
        # `link` for the scraped URL, which worked but obscured the flow.)
        for img in soup.findAll('img', class_='single-photo__fake-image'):
            src = img.get('src').split('?')[0]
            print("Found", src)
            hi_res.append(src)
        print('Giving their servers a 5 second break.', file=sys.stderr)
        time.sleep(5)
    return hi_res
def download_images(save_directory, hi_res_links):
    """Download each hi-res photo using the link.

    Reads log.json from *save_directory* (a dict whose 'data' key maps a
    short image name to the HTTP status of its download attempt), skips
    images already recorded, downloads the rest via save_image, and writes
    the updated log back out.  Sleeps 5 seconds after each real download.
    """
    print("Starting download of the high res images.")
    # Why I prefer json or pickle: the log round-trips as a plain dict.
    log_path = path.join(save_directory, "log.json")
    with open(log_path, "r") as json_input:
        log = json.load(json_input)
    # setdefault replaces the original try/except KeyError dance; either way
    # `data` is a dict, so membership checks below are quick lookups.
    data = log.setdefault('data', {})
    for link in hi_res_links:
        print(link, file=sys.stderr)
        filename = link.split('/')[-1]
        # Drop the first 6 chars of the filename to form the log key
        # (presumably a fixed "photo-" style prefix -- TODO confirm).
        short_name = filename[6:]
        if short_name in data:
            print("Image already in downloaded! Skipping.", file=sys.stderr)
        else:
            status = save_image(link, path.join(save_directory, filename + '.jpg'))
            data[short_name] = status  # Lets log the status codes
            print("{} => {}".format(short_name, status))
            print("Starting the next download in 5 seconds.")
            time.sleep(5)  # Give their bandwidth a break
    with open(log_path, "w") as json_output:
        json.dump(log, json_output, indent=2)
def save_image(link, dest_file):
    """Stream the image at *link* into *dest_file*; return the HTTP status.

    The file is written only when the server answers 200.  The response is
    closed in all cases so the streamed connection is not leaked (with
    stream=True, requests keeps the connection open until the body is
    consumed or the response is closed).
    """
    res = requests.get(link, stream=True)
    try:
        if res.status_code == 200:
            with open(dest_file, 'wb') as f:
                # Iterating the response yields the body in small chunks.
                for chunk in res:
                    f.write(chunk)
        return res.status_code
    finally:
        res.close()
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment