@TheGU
Last active August 29, 2015 14:27
A script to crawl a page and download all the files it links to a local disk (http://pattapongj.com/2015/08/11/python-crawler-and-download/)
# -*- coding: utf-8 -*-
import requests
import re
import urlparse
import urllib
import os
from bs4 import BeautifulSoup
base_url = "http://downloads.khinsider.com/game-soundtracks/album/patapon-2"
crawl_link_string = "Download"
download_link_string = "Click here to download"
download_local_path = r"Z:\Patapon"  # raw string so the backslash in the Windows path is not treated as an escape
# @url : url of target page
# @a_string : a string in [<a>a_string</a>] to filter link
# return : list of link url
def getdownload(url, a_string):
    # try to open url and retry if connection error
    try:
        req = requests.get(url)
    except requests.exceptions.ConnectionError as e:
        print e, "... Retry"
        return getdownload(url, a_string)
    # check response status
    if req.status_code != 200:
        # return an empty list so callers can still iterate over the result
        return []
    # get page content
    soup = BeautifulSoup(req.text, 'html.parser')
    # capture all download links
    linklist = [l.get('href') for l in soup.find_all("a", string=a_string)]
    print "Get {} download links".format(len(linklist))
    # return all download links
    return linklist
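
# Note: urlparse is imported above but never used. If a crawled page ever returns
# relative hrefs, they could be resolved against the page URL before downloading,
# e.g. (hypothetical) inside getdownload:
#   linklist = [urlparse.urljoin(url, l.get('href')) for l in soup.find_all("a", string=a_string)]
# The khinsider album pages targeted here appear to serve absolute links, so the
# script leaves them untouched.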
# get a page then save download link to specific path
# @url : url of target page
# @a_crawl_string : a string in [<a>a_crawl_string</a>] to filter link for crawl to next page
# @a_download_string : a string in [<a>a_download_string</a>] to filter link for download
# @path : path on local disk to save to
# return : None
def crawl(url, a_crawl_string, a_download_string, path):
    req = requests.get(url)
    if req.status_code != 200:
        return
    # fetch list of pages to crawl
    soup = BeautifulSoup(req.text, 'html.parser')
    link_list = [l.get('href') for l in soup.find_all("a", string=a_crawl_string)]
    # loop over the pages
    for link in link_list:
        print "#### Link {}".format(link)
        # get download links from the target download page
        download_list = getdownload(link, a_download_string)
        # download every link found on the download page
        for d in download_list:
            file_name = os.path.join(path, d.split('/')[-1])
            print "Download {} to {}".format(d, file_name),
            urllib.urlretrieve(d, file_name)
            print "... Done"
crawl(base_url, crawl_link_string, download_link_string, download_local_path)
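
The script above is Python 2 (print statements, urlparse, urllib.urlretrieve). A minimal sketch of the same flow under Python 3, assuming the khinsider page layout is unchanged, would only need the imports and print calls swapped; the requests and BeautifulSoup calls stay the same. The helper name get_links below is mine, not from the original gist.

# -*- coding: utf-8 -*-
# Python 3 sketch of the same crawl/download flow (assumes the same page layout).
import os
import urllib.request

import requests
from bs4 import BeautifulSoup

def get_links(url, a_string):
    # fetch a page and return the hrefs of all <a> tags whose text matches a_string
    req = requests.get(url)
    if req.status_code != 200:
        return []
    soup = BeautifulSoup(req.text, 'html.parser')
    return [l.get('href') for l in soup.find_all("a", string=a_string)]

def crawl(url, a_crawl_string, a_download_string, path):
    for link in get_links(url, a_crawl_string):
        for d in get_links(link, a_download_string):
            file_name = os.path.join(path, d.split('/')[-1])
            print("Download {} to {}".format(d, file_name))
            # urlretrieve moved to urllib.request in Python 3
            urllib.request.urlretrieve(d, file_name)

It would be invoked the same way as the original: crawl(base_url, crawl_link_string, download_link_string, download_local_path).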