Skip to content

Instantly share code, notes, and snippets.

@yszheda
Created December 24, 2018 10:08
Show Gist options
  • Save yszheda/148d0b51b013fa6ce3226ada9c935410 to your computer and use it in GitHub Desktop.
Save yszheda/148d0b51b013fa6ce3226ada9c935410 to your computer and use it in GitHub Desktop.
Download all files with the given extensions from a given URL
import urllib2
from bs4 import BeautifulSoup
import requests
import urlparse
import os
# Download every link on a page whose URL ends with one of these extensions.
extensions = ('.pdf', '.jpg', '.png')

url = raw_input('Input url:')
dst_dir = raw_input('Input download dir:')
if not os.path.exists(dst_dir):
    os.makedirs(dst_dir)

# Fetch the page once with requests; site.url reflects any redirects,
# so relative hrefs are resolved against the final URL.
site = requests.get(url)
soup = BeautifulSoup(site.content, 'lxml')

for link in soup.find_all('a'):
    href = link.get('href')
    if href is None:
        # <a> tags without an href (named anchors) would crash urljoin.
        continue
    abs_url = urlparse.urljoin(site.url, href)
    print(abs_url)
    if not abs_url.endswith(extensions):
        continue
    # Build the local name from the URL *path* basename: a raw href may
    # contain directories, a scheme, or a query string, and joining it
    # directly onto dst_dir would point outside the download directory
    # or at nonexistent subdirectories.
    filename = os.path.basename(urlparse.urlparse(abs_url).path)
    if not filename:
        continue
    name = os.path.join(dst_dir, filename)
    print(name)
    # Download with requests as well (the original urllib2 handle was
    # never closed); stream the bytes straight into the file.
    with open(name, 'wb') as f:
        f.write(requests.get(abs_url).content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment