Unsplash Image Downloader Script
Gist by @treece, created July 17, 2014
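# Walks the Unsplash Tumblr blog (unsplash.tumblr.com) page by page, parses each
# page's RSS feed, pulls the bit.ly link out of every post description, resolves
# it to the full image URL, and downloads any image not already present in the
# local "unsplash" directory.
# Written for Python 2; feedparser and httplib2 are third-party dependencies.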
import feedparser  # third-party package
import time
import re
import httplib
import urlparse
import httplib2    # third-party package
import urllib
import urllib2
import sys
import os
from math import fabs
failed_counter = 0
last_page = ''
def getContentLocation(link):
    # httplib2 follows the redirect chain; 'content-location' holds the final URL.
    h = httplib2.Http()
    h.follow_all_redirects = True
    resp = h.request(link, "GET")[0]
    contentLocation = resp['content-location']
    # Drop the last four characters so the caller can append the feed path itself.
    contentLocation = contentLocation[:-4]
    return contentLocation
def unshorten_url(url):
    # Issue HEAD requests and follow 3xx redirects recursively until the
    # final (unshortened) URL is reached.
    parsed = urlparse.urlparse(url)
    h = httplib.HTTPConnection(parsed.netloc)
    resource = parsed.path
    if parsed.query != "":
        resource += "?" + parsed.query
    h.request('HEAD', resource)
    response = h.getresponse()
    if response.status / 100 == 3 and response.getheader('Location'):
        return unshorten_url(response.getheader('Location'))
    else:
        return url
def report(count, blockSize, totalSize):
    # urlretrieve progress hook: rewrite the current line with the percentage done.
    percent = int(count * blockSize * 100 / totalSize)
    sys.stdout.write("\r%d%%" % percent + ' Complete - ')
    sys.stdout.flush()
def save_file(siteName, url):
    # Download the image into the siteName directory, skipping files that already exist.
    filename = os.path.basename(url)
    if not os.path.isfile(siteName + '/' + filename):
        sys.stdout.write('\rDownloading Image ' + url + '...\n')
        urllib.urlretrieve(url, siteName + '/' + filename, reporthook=report)
        sys.stdout.write("\rDownload complete, saved as %s" % (filename) + '\n\n')
        sys.stdout.flush()
    else:
        print("Image already exists in directory!")
def make_dir(d):
    if not os.path.exists(d):
        os.makedirs(d)
def start():
    global pageNum, failed_counter, last_page
    siteName = "unsplash"
    print("Starting page number?")
    pageStart = int(raw_input('> '))
    if pageStart == 0:
        pageStart = 1
    pageNum = pageStart
    print("Ending page number?")
    pageEnd = int(raw_input('> '))
    try:
        for page in range(pageStart, pageEnd + 1):
            make_dir(siteName)
            # Page 1 lives at the blog root; later pages are under /page/<n>.
            if pageNum == 1:
                link = 'http://' + siteName + '.tumblr.com'
                linkFinal = getContentLocation(link) + 'rss'
            else:
                link = ('http://' + siteName + '.tumblr.com/page/' + str(pageNum) +
                        '/rss')
                linkFinal = getContentLocation(link) + '/rss'
            print('\n---- Downloading images on page ' + str(pageNum) + ' ----\n')
            d = feedparser.parse(linkFinal)
            for post in d.entries:
                # Each post description embeds a bit.ly link to the full-size image.
                myString = post.description
                match = re.search(r'http://bit.ly[\'"]?([^\'" >]+)', myString)
                if match:
                    matched = match.group(0)
                    imageUrl = unshorten_url(matched)
                    save_file(siteName, imageUrl)
            pageNum += 1
    except Exception as e:
        print('Something went wrong: %s\n' % e)

start()
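The script targets Python 2 (raw_input, httplib, urlparse); run it with a Python 2 interpreter after installing feedparser and httplib2, then enter the starting and ending page numbers when prompted. Downloads land in an unsplash directory created next to the script. For reference, here is a minimal Python 3 sketch of the same redirect-resolution and download steps using only the standard library; the function names are illustrative, not part of the gist, and the RSS-crawling loop is omitted.

import os
import urllib.request

def resolve_final_url(url):
    # Hypothetical helper: send a HEAD request; urlopen follows redirects
    # automatically, and geturl() reports the URL after the final redirect.
    req = urllib.request.Request(url, method='HEAD')
    with urllib.request.urlopen(req) as resp:
        return resp.geturl()

def download_image(dest_dir, url):
    # Hypothetical helper: save the image under dest_dir, skipping existing files.
    os.makedirs(dest_dir, exist_ok=True)
    target = os.path.join(dest_dir, os.path.basename(url))
    if not os.path.isfile(target):
        urllib.request.urlretrieve(url, target)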