Skip to content

Instantly share code, notes, and snippets.

@wanghuibin0
Last active March 19, 2021 07:31
Show Gist options
  • Save wanghuibin0/42824b32c6da5aefdfe7ee6d2d0bdf84 to your computer and use it in GitHub Desktop.
Save wanghuibin0/42824b32c6da5aefdfe7ee6d2d0bdf84 to your computer and use it in GitHub Desktop.
This is a Python script that downloads every PDF or PowerPoint (.ppt/.pptx) file linked from a user-specified URL.
# Run this script with two command line arguments:
# 1. the source url
# 2. the destination folder
import os
import sys
from pprint import pprint
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def get_html(url):
    """Fetch *url* and return the response body as text.

    The fetch is best-effort: any failure is reported on stdout instead of
    being raised, and an empty string is returned so the caller always
    receives a string it can hand to an HTML parser.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body, or ``''`` when the request failed.
    """
    try:
        # A timeout keeps an unresponsive server from blocking the script
        # indefinitely; requests waits forever when none is given.
        return requests.get(url, timeout=30).text
    except Exception as e:
        print('Web requests url error: {}\nlink: {}'.format(e, url))
        # Previously control fell through to ``return html`` with ``html``
        # never assigned, which raised UnboundLocalError.
        return ''
class WebDownloader(object):
    """Gather the links on a web page and download the documents among them.

    Typical use is ``parse_html()`` followed by ``download()``.

    Attributes set by ``__init__``:
        url: Address of the page whose ``<a>`` tags are scanned.
        dest_folder: Directory that downloaded files are written into.
        links: Set of absolute URLs collected by ``parse_html``.
    """

    # Link suffixes that ``download`` fetches. Each one includes the dot so
    # that only real file extensions match.
    DOWNLOAD_SUFFIXES = ('.pdf', '.ppt', '.pptx')

    def __init__(self, base_url, dest_folder):
        """Remember the page URL and destination folder; start with no links."""
        self.url = base_url
        self.dest_folder = dest_folder
        self.links = set()

    def parse_html(self, verbose=False):
        """Fetch ``self.url`` and add every anchor target to ``self.links``.

        Relative ``href`` values are resolved against the page URL with
        ``urljoin``, so root-relative (``/a.pdf``) and parent-relative
        (``../a.pdf``) links become correct absolute URLs.

        Args:
            verbose: When true, print each raw ``href`` as it is found.
        """
        page_url = str(self.url)
        print(page_url)
        html = get_html(page_url)
        soup = BeautifulSoup(html, 'html.parser')
        # ``href=True`` restricts the search to anchors that carry the attribute.
        for anchor in soup.find_all('a', href=True):
            href = str(anchor.get('href'))
            self.links.add(urljoin(page_url, href))
            if verbose:
                print(href)

    def download(self):
        """Download every collected link that ends in a known document suffix.

        Files are saved in ``self.dest_folder`` under the last path segment
        of their URL. A file that already exists there is skipped. Download
        failures, including HTTP error statuses, are printed and do not stop
        the remaining downloads.
        """
        for link in self.links:
            link = str(link)
            if not link.endswith(self.DOWNLOAD_SUFFIXES):
                continue
            file_name = link.split('/')[-1]
            full_name = os.path.join(self.dest_folder, file_name)
            if os.path.exists(full_name):
                print('File ' + full_name + ' exists. Skip it!')
                continue
            try:
                print('Downloading file ' + link)
                r = requests.get(link)
                # Without this check a 404/500 error page would be saved as
                # the document and then skipped as "exists" on later runs.
                r.raise_for_status()
                with open(full_name, 'wb') as f:
                    f.write(r.content)
            except Exception as e:
                print('Downloading error: {}\nlink: {}'.format(e, link))
def _main():
    """Command-line entry point: collect links from a page, then download them.

    Reads two positional arguments from ``sys.argv``: the source URL and the
    destination folder. When either is missing, a usage message is printed
    and the process exits with status 2 so that calling shells and scripts
    can detect the failure.
    """
    argv = sys.argv
    if len(argv) < 3:
        print('argv < 3, please input the url and dest folder')
        # A bare sys.exit() reports success (status 0); use the conventional
        # status for command-line usage errors instead.
        sys.exit(2)
    # sys.argv entries are already strings, so no conversion is needed.
    url, dest_folder = argv[1], argv[2]
    wd = WebDownloader(url, dest_folder)
    wd.parse_html()
    pprint(wd.links)
    wd.download()
    print('Mission accomplished! Have a nice day~')
# Run the downloader only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    _main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment