Easily scrape wallpapers from InterfaceLift
import os, re, urllib2

# Where to download the wallpapers from
base_url = 'https://interfacelift.com'
listing_url = base_url + '/wallpaper/downloads/date/wide_16:9/2880x1620/'

# Where to save the wallpapers
output_directory = 'wallpapers'

# How many wallpapers to download (0 = all)
max_downloads = 0

# Regex to find the download links (non-greedy so the URL stops at the first closing quote)
# Example link:
#   <a href="/wallpaper/7yz4ma1/03767_rushingin_2880x1620.jpg">
#     <img src="/img_NEW/button_download.png" alt="" title="" align="center" width="77" height="27">
#   </a>
link_regex = re.compile(r'<a href="(?P<url>.+?)"><img.+?src="/img_NEW/button_download.png')

# Regex to find the total number of pages
# Example html:
#   <p style="">You are on <b style="">page 1</b> of <b style="">78</b></p>
pages_regex = re.compile(r'You are on <b.+?>page [0-9]+</b> of <b.+?>(?P<pages>[0-9]+)')
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

page = 1
max_pages = -1
downloaded = 0

while True:
    print 'Loading first page...' if max_pages == -1 else 'Loading page {} of {}...'.format(page, max_pages)

    url = '{}/index{}.html'.format(listing_url, page)

    # InterfaceLift returns 403 if no referer specified
    headers = {
        'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
        'Referer': url
    }

    try:
        req = urllib2.Request(url, None, headers)
        f = urllib2.urlopen(req)
    except IOError, e:
        print 'Failed to open {}'.format(url)
        print e
        continue

    html = f.read()
    links = link_regex.finditer(html)

    # Download all the wallpapers on the page
    for wall in links:
        # Stop early once the download limit has been reached
        if max_downloads != 0 and downloaded >= max_downloads:
            break

        dl_url = base_url + wall.group('url')
        req = urllib2.Request(dl_url, None, headers)
        out_file = os.path.join(output_directory, wall.group('url').split('/')[-1])

        try:
            res = urllib2.urlopen(req)
            with open(out_file, 'wb') as wall_file:
                wall_file.write(res.read())
        except IOError:
            print 'Failed to download {}'.format(dl_url)
            # Clean up any partially-written file
            if os.path.exists(out_file):
                os.remove(out_file)
            continue

        downloaded += 1
        print 'Saved {} ({} downloaded so far)'.format(out_file, downloaded)

    page += 1
    # Get the max number of pages, if we haven't yet
    if max_pages == -1:
        pages_match = pages_regex.search(html)
        if pages_match is None:
            print 'Max pages not found!'
            break
        # group() returns a string; convert it so the page comparison below works
        max_pages = int(pages_match.group('pages'))

    # Check for break conditions
    if max_downloads != 0 and downloaded >= max_downloads:
        print 'Complete!'
        break
    if page > max_pages:
        print 'Out of pages!'
        break
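
The two regexes do all of the scraping work, so it can be worth smoke-testing them against the sample markup quoted in the comments before kicking off a full crawl. A minimal check in the same Python 2 style as the gist (the sample strings below are just the comment examples joined onto single lines):

import re

link_regex = re.compile(r'<a href="(?P<url>.+?)"><img.+?src="/img_NEW/button_download.png')
pages_regex = re.compile(r'You are on <b.+?>page [0-9]+</b> of <b.+?>(?P<pages>[0-9]+)')

# Sample markup taken from the comments in the script above
link_html = ('<a href="/wallpaper/7yz4ma1/03767_rushingin_2880x1620.jpg">'
             '<img src="/img_NEW/button_download.png" alt="" title="" align="center" width="77" height="27"></a>')
pages_html = '<p style="">You are on <b style="">page 1</b> of <b style="">78</b></p>'

print link_regex.search(link_html).group('url')      # /wallpaper/7yz4ma1/03767_rushingin_2880x1620.jpg
print pages_regex.search(pages_html).group('pages')  # 78

Note that since `.` does not match newlines by default, the link regex only matches when the <a> and <img> tags appear on one line; the script implicitly assumes that is how InterfaceLift's listing pages serve them, and the multi-line layout in the comments is just for readability.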