Last active
March 19, 2021 07:31
-
-
Save wanghuibin0/42824b32c6da5aefdfe7ee6d2d0bdf84 to your computer and use it in GitHub Desktop.
This is a python script to download all pdf or ppt files from a user-specified url
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run this script with two command line arguments:
#   1. the source url
#   2. the destination folder
import os
import sys
from pprint import pprint
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
def get_html(url, timeout=30):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; without
            a timeout a stalled server would block the script forever.

    Returns:
        The decoded response body, or an empty string when the request
        fails, so the caller can still hand the result to an HTML parser.
    """
    try:
        return requests.get(url, timeout=timeout).text
    except Exception as e:
        # Best-effort: report the failure and return an empty page rather
        # than falling through to a return of a never-assigned local.
        print('Web requests url error: {}\nlink: {}'.format(e, url))
        return ''
class WebDownloader(object):
    """Collect the links on one web page and download its document files.

    Instance attributes (set by ``__init__``):
        url: Page whose ``<a>`` tags are scanned.
        dest_folder: Directory that receives the downloaded files.
        links: Set of absolute link URLs filled in by ``parse_html``.
    """

    # Lower-case file extensions treated as downloadable documents.
    DOC_SUFFIXES = ('.pdf', '.ppt', '.pptx')

    # Seconds to wait on a single file request before giving up.
    DOWNLOAD_TIMEOUT = 60

    def __init__(self, base_url, dest_folder):
        """Remember the page to scan and the folder to save files into."""
        self.url = base_url
        self.dest_folder = dest_folder
        self.links = set()

    def parse_html(self, verbose=False):
        """Fetch ``self.url`` and store every anchor target in ``self.links``.

        Relative hrefs are resolved against the page URL, so ``/abs/path``,
        ``../up`` and bare file names all become absolute URLs.

        Args:
            verbose: When true, print each raw ``href`` as it is found.
        """
        page_url = str(self.url)
        # Show the directory that relative links are resolved against.
        print(urljoin(page_url, '.'))
        html = get_html(self.url)
        soup = BeautifulSoup(html, 'html.parser')
        # href=True restricts the search to anchors that carry an href.
        for anchor in soup.find_all('a', href=True):
            href = str(anchor.get('href'))
            self.links.add(urljoin(page_url, href))
            if verbose:
                print(href)

    def download(self):
        """Download every collected link that points at a document file.

        A link qualifies when the path part of its URL (query string and
        fragment ignored) ends with one of ``DOC_SUFFIXES``, compared
        case-insensitively. Files already present in the destination
        folder are skipped. A failure on one link is reported and does not
        stop the remaining downloads.
        """
        if self.dest_folder:
            # Create the target directory up front so the writes below
            # do not all fail on a folder that does not exist yet.
            os.makedirs(self.dest_folder, exist_ok=True)
        for link in self.links:
            link = str(link)
            url_path = urlparse(link).path
            if not url_path.lower().endswith(self.DOC_SUFFIXES):
                continue
            # Name the local file after the last path segment only, so a
            # query string never leaks into the file name.
            file_name = url_path.rsplit('/', 1)[-1]
            full_name = os.path.join(self.dest_folder, file_name)
            if os.path.exists(full_name):
                print('File ' + full_name + ' exists. Skip it!')
                continue
            try:
                print('Downloading file ' + link)
                r = requests.get(link, timeout=self.DOWNLOAD_TIMEOUT)
                # Do not save an HTTP error page under a document name.
                r.raise_for_status()
                with open(full_name, 'wb') as f:
                    f.write(r.content)
            except Exception as e:
                print('Downloading error: {}\nlink: {}'.format(e, link))
def _main():
    """Command-line entry point: ``<script> <url> <dest_folder>``.

    Reads the source URL and the destination folder from ``sys.argv``,
    collects the links on the page, prints them, and downloads the
    document files among them.

    Raises:
        SystemExit: With status 1 when fewer than two arguments are given.
    """
    argv = sys.argv
    if len(argv) < 3:
        print('argv < 3, please input the url and dest folder')
        # Exit non-zero so shells and calling scripts can detect the
        # usage error; a bare sys.exit() would report success.
        sys.exit(1)
    url = str(argv[1])
    dest_folder = str(argv[2])
    wd = WebDownloader(url, dest_folder)
    wd.parse_html()
    pprint(wd.links)
    wd.download()
    print('Mission accomplished! Have a nice day~')
# Start the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    _main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment