Easily scrape wallpapers from InterfaceLift
import os, re, urllib2

# Where to download the wallpapers from
base_url = 'https://interfacelift.com'
listing_url = base_url + '/wallpaper/downloads/date/wide_16:9/2880x1620/'

# Where to save the wallpapers
output_directory = 'wallpapers'

# How many wallpapers to download (0 = all)
max_downloads = 0

# Regex to find the download links (non-greedy so the URL stops at the first closing quote)
# Example link:
#   <a href="/wallpaper/7yz4ma1/03767_rushingin_2880x1620.jpg">
#     <img src="/img_NEW/button_download.png" alt="" title="" align="center" width="77" height="27">
#   </a>
link_regex = re.compile(r'<a href="(?P<url>.+?)"><img.+?src="/img_NEW/button_download.png')

# Regex to find the total number of pages
# Example html:
#   <p style="">You are on <b style="">page 1</b> of <b style="">78</b></p>
pages_regex = re.compile(r'You are on <b.+?>page [0-9]+</b> of <b.+?>(?P<pages>[0-9]+)')
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

page = 1
max_pages = -1
downloaded = 0

while True:
    print 'Loading first page...' if max_pages == -1 else 'Loading page {} of {}...'.format(page, max_pages)

    url = '{}/index{}.html'.format(listing_url, page)

    # InterfaceLift returns 403 if no referer specified
    headers = {
        'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
        'Referer': url
    }

    try:
        req = urllib2.Request(url, None, headers)
        f = urllib2.urlopen(req)
    except IOError, e:
        print 'Failed to open {}'.format(url)
        print e
        continue

    html = f.read()
    links = link_regex.finditer(html)

    # Download all the wallpapers on the page
    for wall in links:
        # Stop early once the download limit has been reached
        if max_downloads != 0 and downloaded >= max_downloads:
            break

        dl_url = base_url + wall.group('url')
        req = urllib2.Request(dl_url, None, headers)
        out_file = os.path.join(output_directory, wall.group('url').split('/')[-1])

        try:
            res = urllib2.urlopen(req)
            with open(out_file, 'wb') as wall_file:
                wall_file.write(res.read())
        except IOError:
            print 'Failed to download {}'.format(dl_url)
            # Clean up any partially-written file
            if os.path.exists(out_file):
                os.remove(out_file)
            continue

        downloaded += 1
        print 'Saved {} ({} downloaded so far)'.format(out_file, downloaded)

    page += 1
    # Get the max number of pages, if we haven't yet
    if max_pages == -1:
        pages_match = pages_regex.search(html)
        if pages_match is None:
            print 'Max pages not found!'
            break
        # group() returns a string; convert it so the page comparison below works
        max_pages = int(pages_match.group('pages'))

    # Check for break conditions
    if max_downloads != 0 and downloaded >= max_downloads:
        print 'Complete!'
        break
    if page > max_pages:
        print 'Out of pages!'
        break
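
The two regexes do all of the scraping work, so it can be worth smoke-testing them against the sample markup quoted in the comments before kicking off a full crawl. A minimal check in the same Python 2 style as the gist (the sample strings below are just the comment examples joined onto single lines):

import re

link_regex = re.compile(r'<a href="(?P<url>.+?)"><img.+?src="/img_NEW/button_download.png')
pages_regex = re.compile(r'You are on <b.+?>page [0-9]+</b> of <b.+?>(?P<pages>[0-9]+)')

# Sample markup taken from the comments in the script above
link_html = ('<a href="/wallpaper/7yz4ma1/03767_rushingin_2880x1620.jpg">'
             '<img src="/img_NEW/button_download.png" alt="" title="" align="center" width="77" height="27"></a>')
pages_html = '<p style="">You are on <b style="">page 1</b> of <b style="">78</b></p>'

print link_regex.search(link_html).group('url')      # /wallpaper/7yz4ma1/03767_rushingin_2880x1620.jpg
print pages_regex.search(pages_html).group('pages')  # 78

Note that since `.` does not match newlines by default, the link regex only matches when the <a> and <img> tags appear on one line; the script implicitly assumes that is how InterfaceLift's listing pages serve them, and the multi-line layout in the comments is just for readability.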