@TheGU
Last active August 29, 2015 14:27
A script to crawl a page and download all the files it links to a local disk (http://pattapongj.com/2015/08/11/python-crawler-and-download/)
# -*- coding: utf-8 -*-
import requests
import re
import urlparse
import urllib
import os
from bs4 import BeautifulSoup
base_url = "http://downloads.khinsider.com/game-soundtracks/album/patapon-2"
crawl_link_string = "Download"
download_link_string = "Click here to download"
download_local_path = r"Z:\Patapon"  # raw string so the backslash in the Windows path is not treated as an escape
# @url : url of target page
# @a_string : a string in [<a>a_string</a>] to filter link
# return : list of link url
def getdownload(url, a_string):
    # try to open url and retry if connection error
    try:
        req = requests.get(url)
    except requests.exceptions.ConnectionError as e:
        print e, "... Retry"
        return getdownload(url, a_string)
    # check response status
    if req.status_code != 200:
        # return an empty list so callers can still iterate over the result
        return []
    # get page content
    soup = BeautifulSoup(req.text, 'html.parser')
    # capture all download links
    linklist = [l.get('href') for l in soup.find_all("a", string=a_string)]
    print "Get {} download links".format(len(linklist))
    # return all download links
    return linklist
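
# Note: urlparse is imported above but never used. If a crawled page ever returns
# relative hrefs, they could be resolved against the page URL before downloading,
# e.g. (hypothetical) inside getdownload:
#   linklist = [urlparse.urljoin(url, l.get('href')) for l in soup.find_all("a", string=a_string)]
# The khinsider album pages targeted here appear to serve absolute links, so the
# script leaves them untouched.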
# get a page then save download link to specific path
# @url : url of target page
# @a_crawl_string : a string in [<a>a_crawl_string</a>] to filter link for crawl to next page
# @a_download_string : a string in [<a>a_download_string</a>] to filter link for download
# @path : path on local disk to save to
# return : None
def crawl(url, a_crawl_string, a_download_string, path):
    req = requests.get(url)
    if req.status_code != 200:
        return
    # fetch list of pages to crawl
    soup = BeautifulSoup(req.text, 'html.parser')
    link_list = [l.get('href') for l in soup.find_all("a", string=a_crawl_string)]
    # loop over the pages
    for link in link_list:
        print "#### Link {}".format(link)
        # get download links from the target download page
        download_list = getdownload(link, a_download_string)
        # download every link found on the download page
        for d in download_list:
            file_name = os.path.join(path, d.split('/')[-1])
            print "Download {} to {}".format(d, file_name),
            urllib.urlretrieve(d, file_name)
            print "... Done"
crawl(base_url, crawl_link_string, download_link_string, download_local_path)
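
The script above is Python 2 (print statements, urlparse, urllib.urlretrieve). A minimal sketch of the same flow under Python 3, assuming the khinsider page layout is unchanged, would only need the imports and print calls swapped; the requests and BeautifulSoup calls stay the same. The helper name get_links below is mine, not from the original gist.

# -*- coding: utf-8 -*-
# Python 3 sketch of the same crawl/download flow (assumes the same page layout).
import os
import urllib.request

import requests
from bs4 import BeautifulSoup

def get_links(url, a_string):
    # fetch a page and return the hrefs of all <a> tags whose text matches a_string
    req = requests.get(url)
    if req.status_code != 200:
        return []
    soup = BeautifulSoup(req.text, 'html.parser')
    return [l.get('href') for l in soup.find_all("a", string=a_string)]

def crawl(url, a_crawl_string, a_download_string, path):
    for link in get_links(url, a_crawl_string):
        for d in get_links(link, a_download_string):
            file_name = os.path.join(path, d.split('/')[-1])
            print("Download {} to {}".format(d, file_name))
            # urlretrieve moved to urllib.request in Python 3
            urllib.request.urlretrieve(d, file_name)

It would be invoked the same way as the original: crawl(base_url, crawl_link_string, download_link_string, download_local_path).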