Skip to content

Instantly share code, notes, and snippets.

@airbob
Created October 10, 2019 11:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save airbob/0fcd80e3617da96188405f71d0d26f97 to your computer and use it in GitHub Desktop.
Save airbob/0fcd80e3617da96188405f71d0d26f97 to your computer and use it in GitHub Desktop.
A Python script that crawls all project images from jasoncharleshill.com; I use them as my Mac wallpaper.
# -*- coding: utf-8 -*-
import os
import re
import urllib
import urllib.request

from bs4 import BeautifulSoup as Soup
# Captures the scheme+host portion of an https URL, e.g. "https://example.com".
URL_REG = re.compile(r'(https://[^/\\]+)', re.I)
# Captures (quote char, src value) from an <img ... src="..."> tag.
# Fixed: the original pattern used [^\1]*?, but \1 inside a character class is
# an octal escape, not a backreference — it matched "any char except \x01",
# not "anything up to the opening quote". (.*?)\1 lazily matches up to the
# same quote character that opened the attribute.
IMG_REG = re.compile(r'<img[^>]*?src=([\'"])(.*?)\1', re.I)
## query url and filter image resources and download them to dir folder
## query url and filter image resources and download them to dir folder
def download(dir, url):
    """Fetch the page at *url*, extract all <img> src URLs, and save each
    image into directory *dir* (created if missing).

    Invalid URLs are reported and skipped; per-image download failures are
    reported but do not abort the remaining downloads.
    """
    m = URL_REG.match(url)
    if not m:
        print('[Error]Invalid URL: ', url)
        return
    host = m.group(1)
    # Create the target directory once (the original checked twice).
    if not os.path.isdir(dir):
        os.mkdir(dir)
    # retrieve image urls; decode since urlopen().read() returns bytes in py3
    html = urllib.request.urlopen(url).read().decode('utf-8', 'replace')
    imgs = [item[1] for item in IMG_REG.findall(html)]
    # Resolve each src: absolute https URLs pass through, host-relative paths
    # ("/x") are joined to the host, page-relative paths to the page URL.
    f = lambda path: path if path.startswith('https://') else \
        host + path if path.startswith('/') else url + '/' + path
    imgs = list(set(map(f, imgs)))
    print('[Info]Find %d images.' % len(imgs))
    # download images
    for idx, img in enumerate(imgs):
        name = img.split('/')[-1]
        path = os.path.join(dir, name)
        try:
            print('[Info]Download(%d): %s' % (idx + 1, img))
            urllib.request.urlretrieve(img, path)
        except (OSError, ValueError):
            # URLError subclasses OSError; ValueError covers malformed URLs.
            # Best-effort: report and continue with the next image.
            print("[Error]Can't download(%d): %s" % (idx + 1, img))
## get all projects from this domain
## get all projects from this domain
def getAllProjects(url):
    """Return a list of absolute project URLs linked from the page at *url*.

    Keeps site-relative links ("/something"), excluding the known
    non-project pages, and prefixes each with *url* (the domain root).
    """
    s = urllib.request.urlopen(url).read()
    html = Soup(s, 'html.parser')
    links = [a['href'] for a in html.find_all('a')]
    # Known non-project pages on the site.
    skip = ('/about', '/film', '/contact')
    # Deduplicate while filtering to site-relative project links.
    projects = {link for link in links
                if link.startswith('/') and len(link) > 1 and link not in skip}
    # Fixed: the original referenced an undefined name `questHTTP` here
    # (NameError) and also shadowed the `url` parameter inside the loop;
    # build the absolute URLs from the parameter directly instead.
    return [url + project for project in projects]
# Where downloaded images are stored, and the site to crawl.
savePath = 'image'
domainUrl = "https://www.jasoncharleshill.com"

# Guard the crawl so importing this module performs no network I/O.
if __name__ == '__main__':
    urls = getAllProjects(domainUrl)
    for url in urls:
        download(savePath, url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment