Skip to content

Instantly share code, notes, and snippets.

@yszheda
Created December 24, 2018 10:08
Show Gist options
  • Save yszheda/148d0b51b013fa6ce3226ada9c935410 to your computer and use it in GitHub Desktop.
Save yszheda/148d0b51b013fa6ce3226ada9c935410 to your computer and use it in GitHub Desktop.
Download all files with the given extensions from a given URL
import urllib2
from bs4 import BeautifulSoup
import requests
import urlparse
import os
# Download every link on a page whose URL ends with one of these extensions.
extensions = ('.pdf', '.jpg', '.png')

url = raw_input('Input url:')
dst_dir = raw_input('Input download dir:')
if not os.path.exists(dst_dir):
    os.makedirs(dst_dir)

# Fetch the page once with requests; site.url reflects any redirects,
# so relative hrefs are resolved against the final URL.
site = requests.get(url)
soup = BeautifulSoup(site.content, 'lxml')

for link in soup.find_all('a'):
    href = link.get('href')
    if href is None:
        # <a> tags without an href (named anchors) would crash urljoin.
        continue
    abs_url = urlparse.urljoin(site.url, href)
    print(abs_url)
    if not abs_url.endswith(extensions):
        continue
    # Build the local name from the URL *path* basename: a raw href may
    # contain directories, a scheme, or a query string, and joining it
    # directly onto dst_dir would point outside the download directory
    # or at nonexistent subdirectories.
    filename = os.path.basename(urlparse.urlparse(abs_url).path)
    if not filename:
        continue
    name = os.path.join(dst_dir, filename)
    print(name)
    # Download with requests as well (the original urllib2 handle was
    # never closed); stream the bytes straight into the file.
    with open(name, 'wb') as f:
        f.write(requests.get(abs_url).content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment