@daviesjamie
Created November 21, 2014 22:49
Easily scrape wallpapers from Interface Lift

import os, re, urllib2

# Where to download the wallpapers from
base_url = 'https://interfacelift.com'
listing_url = base_url + '/wallpaper/downloads/date/wide_16:9/2880x1620/'

# Where to save the wallpapers
output_directory = 'wallpapers'

# How many wallpapers to download (0 = all)
max_downloads = 0

# Regex to find the download links
# Example link:
# <a href="/wallpaper/7yz4ma1/03767_rushingin_2880x1620.jpg">
#     <img src="/img_NEW/button_download.png" alt="" title="" align="center" width="77" height="27">
# </a>
link_regex = re.compile(r'<a href="(?P<url>.+)"><img.+?src="/img_NEW/button_download.png')

# Regex to find the total number of pages
# Example html:
# <p style="">You are on <b style="">page 1</b> of <b style="">78</b></p>
pages_regex = re.compile(r'You are on <b.+?>page [0-9]+</b> of <b.+?>(?P<pages>[0-9]+)')
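
# Doctest-style sanity check of the two regexes against sample markup like the
# comments above (hypothetical one-line strings, not fetched from the site):
# >>> sample = '<a href="/wallpaper/7yz4ma1/03767_rushingin_2880x1620.jpg"><img src="/img_NEW/button_download.png">'
# >>> link_regex.search(sample).group('url')
# '/wallpaper/7yz4ma1/03767_rushingin_2880x1620.jpg'
# >>> pages_regex.search('You are on <b style="">page 1</b> of <b style="">78</b>').group('pages')
# '78'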

if not os.path.exists(output_directory):
    os.makedirs(output_directory)

page = 1
max_pages = -1
downloaded = 0

while True:
    print 'Loading first page...' if max_pages == -1 else 'Loading page {} of {}...'.format(page, max_pages)

    url = '{}/index{}.html'.format(listing_url, page)

    # InterfaceLift returns 403 if no Referer is specified
    headers = {
        'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
        'Referer': url
    }

    try:
        req = urllib2.Request(url, None, headers)
        f = urllib2.urlopen(req)
    except IOError, e:
        print 'Failed to open {}'.format(url)
        print e
        continue  # retry the same page

    html = f.read()
    links = link_regex.finditer(html)

    # Download all the wallpapers on the page
    for wall in links:
        # Stop mid-page once the download limit is reached
        if max_downloads != 0 and downloaded >= max_downloads:
            break

        dl_url = base_url + wall.group('url')
        req = urllib2.Request(dl_url, None, headers)
        out_file = os.path.join(output_directory, wall.group('url').split('/')[-1])

        with open(out_file, 'wb') as f:
            try:
                res = urllib2.urlopen(req)
                f.write(res.read())
            except:
                print 'Fail!'
                # Remove the empty/partial file and skip to the next wallpaper
                try:
                    os.remove(out_file)
                except:
                    pass
                continue

        downloaded += 1
        print downloaded

    page += 1

    # Get the total number of pages, if we haven't yet
    if max_pages == -1:
        pages_match = pages_regex.search(html)
        if pages_match is None:
            print 'Max pages not found!'
            break
        # group('pages') is a string; cast to int so the page comparison below works
        max_pages = int(pages_match.group('pages'))

    # Check for break conditions
    if max_downloads != 0 and downloaded >= max_downloads:
        print 'Complete!'
        break

    if page > max_pages:
        print 'Out of pages!'
        break
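
Note: the script above targets Python 2 (urllib2, print statements). Below is a minimal, untested sketch of the same request pattern under Python 3's urllib.request; the listing URL and headers simply mirror the values used above, and InterfaceLift's markup or URL scheme may have changed since 2014.

# Python 3 sketch: fetch one listing page with the same Referer workaround
import urllib.request

url = 'https://interfacelift.com/wallpaper/downloads/date/wide_16:9/2880x1620/index1.html'
headers = {
    # InterfaceLift returns 403 if no Referer is specified
    'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Referer': url,
}

req = urllib.request.Request(url, None, headers)
with urllib.request.urlopen(req) as resp:
    html = resp.read().decode('utf-8', 'replace')

print('Fetched {} bytes of HTML'.format(len(html)))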