Skip to content

Instantly share code, notes, and snippets.

@cnDelbert
Last active August 29, 2015 14:14
Show Gist options
  • Save cnDelbert/939889801265fa5b9ad5 to your computer and use it in GitHub Desktop.
Save cnDelbert/939889801265fa5b9ad5 to your computer and use it in GitHub Desktop.
Download images from SimpleDesktops.com .
# -*- coding: utf-8 -*-
__author__ = 'Delbert'
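# init_config() accepts two optional parameters:
# `path` is the directory in which the images are saved (default: ./simple/).
# `image_id` is the image ID to start downloading from; IDs are then walked downwards to 1. When it is 0 (the default), the newest image's ID is looked up on the site.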
from bs4 import BeautifulSoup
import requests
import os
def init_config(path='', image_id=0):
    """Build the download configuration.

    Args:
        path: Directory in which to save the images. A relative path is
            normalised to start with ``./``; an absolute path is kept as
            given. A trailing ``/`` is appended when missing. An empty
            value selects ``./simple/``.
        image_id: ID of the first image to download; returned unchanged.

    Returns:
        A ``(dir_to_save_images, image_id_to_start)`` tuple.
    """
    if not path:
        return './simple/', image_id
    # Only relative paths get the './' prefix: prefixing an absolute path
    # would silently redirect it below the current working directory.
    if not os.path.isabs(path) and not path.startswith('./'):
        path = './' + path
    if not path.endswith('/'):
        path += '/'
    return path, image_id
def _open_log(log_path, header=''):
    """Open *log_path* for appending, writing *header* first if the file is new."""
    is_new = not os.path.isfile(log_path)
    log_file = open(log_path, 'at', encoding='utf-8')
    if is_new and header:
        log_file.write(header)
    return log_file


def _fetch_with_retry(url, timeout=10):
    """GET *url*, retrying once after a timeout.

    Returns the response, or None when both attempts timed out. Other
    request errors (e.g. connection failures) propagate to the caller.
    """
    # requests raises its own Timeout class; the builtin is kept for
    # compatibility with what the code caught before.
    timeout_errors = (requests.exceptions.Timeout, TimeoutError)
    try:
        return requests.get(url, timeout=timeout)
    except timeout_errors:
        print("A timeout retry...")
    try:
        return requests.get(url, timeout=timeout)
    except timeout_errors:
        return None


def download_image(path, image_id=0):
    """Download images with IDs from *image_id* down to 1 into *path*.

    Every successful download is recorded in ``down.log`` (image ID and
    final URL); HTTP errors and repeated timeouts are recorded in
    ``error.log``. Both logs live in *path* and are appended to when they
    already exist.

    Args:
        path: Existing directory in which images and logs are stored.
        image_id: First ID to download. Nothing is downloaded when it is
            0 or negative, but the log files are still created.
    """
    base_url = "http://simpledesktops.com/download/?desktop="
    # The context managers guarantee both logs are closed even when a
    # download raises half-way through the loop.
    with _open_log(os.path.join(path, 'down.log'),
                   'Image ID\t Image URLs\n') as down_log_file, \
            _open_log(os.path.join(path, 'error.log')) as error_log_file:
        while image_id > 0:
            full_url = base_url + str(image_id)
            resp = _fetch_with_retry(full_url)
            if resp is None:
                error_log_file.write(full_url + ' Error timeout\n')
                print(full_url + ' Error timeout')
                image_id -= 1
                continue
            if resp.status_code != requests.codes.ok:
                error_log_file.write(full_url + ' Error ' + str(resp.status_code) + '\n')
                print(full_url + ' Error ' + str(resp.status_code))
                image_id -= 1
                continue
            # The file name is the last segment of the URL the request
            # ended up at (resp.url), not of the URL that was requested.
            full_url = resp.url
            image_name = full_url.split('/')[-1]
            print("Downloading {file}...".format(file=image_name))
            with open(os.path.join(path, image_name), 'wb') as image:
                image.write(resp.content)
            down_log_file.write('{id}\t{url}\n'.format(id=image_id, url=full_url))
            image_id -= 1
        print('Download Complete.')
def check_directory(path):
    """Ensure that the directory *path* exists, creating it when necessary.

    Missing parent directories are created as well, so nested paths such
    as ``./a/b/`` work. Calling this for an existing directory is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        OSError: If the directory cannot be created, e.g. because *path*
            already exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(path, exist_ok=True)
def _find_download_id(hrefs, prefix='/download/?desktop='):
    """Return the numeric ID of the first download link in *hrefs*, or 0 if none."""
    for href in hrefs:
        # Anchors without an href attribute yield None; skip them.
        if not href or not href.startswith(prefix):
            continue
        # Slice the prefix off; str.strip() would remove a *set of
        # characters* from both ends rather than this exact prefix.
        candidate = href[len(prefix):]
        if candidate.isdecimal():
            return int(candidate)
    return 0


def parse_detail_page(path, url):
    """Read an image's detail page and start downloading from its image ID.

    The page is fetched with browser-like headers, its first
    ``/download/?desktop=<id>`` link is located, and ``download_image`` is
    called with that ID. When no such link is found the ID is 0, so
    ``download_image`` downloads nothing.

    Args:
        path: Directory in which images and logs are stored.
        url: Site-relative URL of the detail page (appended to
            ``http://simpledesktops.com``).
    """
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-US,en;q=0.5",
        "Connection": "keep-alive",
        "Cookie": "__utma=36407714.1844768811.1422784067.1422784067.1422784067.1; "
                  "__utmb=36407714.1.10.1422784067; __utmc=36407714; "
                  "__utmz=36407714.1422784067.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)",
        "Host": "simpledesktops.com",
        "Referer": "http://simpledesktops.com/browse/",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:35.0) Gecko/20100101 Firefox/35.0",
    }
    full_url = "http://simpledesktops.com" + url
    # A timeout keeps an unresponsive server from hanging the script.
    resp = requests.get(full_url, headers=headers, timeout=10)
    if resp.status_code != requests.codes.ok:
        # Same message layout as the error reporting in download_image.
        print(full_url + ' Error ' + str(resp.status_code))
        return
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(resp.text, 'html.parser')
    image_id = _find_download_id(a.get('href') for a in soup.find_all('a'))
    download_image(path, image_id)
def parse_homepage(path, image_id):
    """Entry point of the download: start from *image_id* or from the newest image.

    The target directory is created first. With a non-zero *image_id* the
    download starts directly at that ID. Otherwise the browse page is
    fetched, the first ``div`` with class ``desktop`` is taken as the
    newest image, and its detail page is handed to ``parse_detail_page``.

    Args:
        path: Directory in which images and logs are stored.
        image_id: ID to start from; 0 means "look up the newest image".
    """
    check_directory(path)
    if image_id:
        download_image(path, image_id)
        return

    homepage_url = 'http://simpledesktops.com/browse/'
    # A timeout keeps an unresponsive server from hanging the script.
    resp = requests.get(homepage_url, timeout=10)
    if resp.status_code != requests.codes.ok:
        print(homepage_url + ' Error ' + str(resp.status_code))
        return

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    homepage_code = BeautifulSoup(resp.text, 'html.parser')
    latest_image_div = homepage_code.find('div', {'class': 'desktop'})
    # Search inside the tag directly; re-parsing str(tag) is unnecessary.
    details_link = None
    if latest_image_div is not None:
        details_link = latest_image_div.find('a', href=True)
    if details_link is None:
        # Report unexpected markup instead of failing with an IndexError.
        print(homepage_url + ' Error no desktop link found')
        return
    parse_detail_page(path, details_link['href'])
def main():
    """Run the downloader with the default configuration.

    To customise the run, give init_config() a target directory and/or a
    starting image ID.
    """
    config = init_config()
    # config is the (directory, start ID) pair expected by parse_homepage.
    parse_homepage(*config)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment