#!/usr/bin/env python3
"""
Download all the PDFs linked on a given webpage.

Revised from the original author elssar's gist
(https://gist.github.com/elssar/5160757) to scrape the PDF links in the
Piazza resources tab. The original script is for websites that don't need
a login, but you have to log in to Piazza to scrape its contents, so I
revised the request to post authentication info through a session and
ported the program to Python 3.

Usage -
    python grab_pdfs.py url <path/to/directory>

    url is required
    path is optional; it needs to be absolute
    will save in the current directory if no path is given
    will save in the current directory if the given path does not exist

Requires -
    requests >= 1.0.4
    beautifulsoup4 >= 4.0.0

Download and install using
    pip install requests
    pip install beautifulsoup4
"""

__author__ = 'elssar <elssar@altrawcode.com>'
__license__ = 'MIT'
__version__ = '1.0.0'
import requests
from bs4 import BeautifulSoup as soup
from os import path, getcwd
from sys import argv, exit
from urllib.parse import urljoin
def get_page(base_url):
    # Placeholder credentials; replace with your own Piazza login.
    payload = {'email': 'xxx', 'password': 'yyy'}
    with requests.Session() as s:
        p = s.post('https://piazza.com', data=payload)
        # Print the returned HTML (or check something more intelligent)
        # to see whether the login succeeded.
        # print(p.text)
        # An authorised request made with the logged-in session.
        # Replace the placeholder URL with your Piazza class/resources page.
        r = s.get('https://piazza.com/xxxx')
        if r.status_code == 200:
            return r.text
        raise Exception('Error {0}'.format(r.status_code))
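# A login sanity check (a sketch, not part of the original script): Piazza's
# logged-in pages include a logout link, so searching the returned HTML for
# one is a cheap way to confirm the POST above actually authenticated. The
# marker string is an assumption about Piazza's markup and may need adjusting.
def is_logged_in(html):
    return 'logout' in html.lower()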
def get_all_links(html):
    # Pass an explicit parser to avoid BeautifulSoup's "no parser
    # specified" warning.
    bs = soup(html, 'html.parser')
    return bs.find_all('a')
def get_pdf(base_url, base_dir):
    # Placeholder credentials; replace with your own Piazza login.
    payload = {'email': 'xxx', 'password': 'yyy'}
    with requests.Session() as s:
        p = s.post('https://piazza.com', data=payload)
        # Print the returned HTML (or check something more intelligent)
        # to see whether the login succeeded.
        # print(p.text)
        # An authorised request made with the logged-in session.
        # Replace the placeholder URL with your Piazza class/resources page.
        r = s.get('https://piazza.com/xxx')
        if r.status_code != 200:
            raise Exception('Error {0}'.format(r.status_code))
        html = r.text
        links = get_all_links(html)
        if len(links) == 0:
            raise Exception('No links found on the webpage')
        n_pdfs = 0
        for link in links:
            href = link.get('href', '')
            if href.endswith('.pdf'):
                n_pdfs += 1
                # Download through the same session so the request stays
                # authenticated.
                content = s.get(urljoin(base_url, href))
                if content.status_code == 200 and content.headers['content-type'] == 'application/pdf':
                    with open(path.join(base_dir, link.text + '.pdf'), 'wb') as pdf:
                        pdf.write(content.content)
        if n_pdfs == 0:
            raise Exception('No pdfs found on the page')
        print('{0} pdfs downloaded and saved in {1}'.format(n_pdfs, base_dir))
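# A filename-sanitising helper (a sketch, not part of the original script):
# link text can contain slashes or other characters that are illegal in file
# names, which would make the open() call above fail. If you hit such links,
# swap safe_name(link.text) in for link.text when building the output path.
def safe_name(text):
    keep = ' ._-'
    return ''.join(c if c.isalnum() or c in keep else '_' for c in text).strip()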
if __name__ == '__main__':
    if len(argv) not in (2, 3):
        print('Error! Invalid arguments')
        print(__doc__)
        exit(-1)
    url = argv[1]
    arg = argv[2] if len(argv) == 3 else ''
    # Fall back to the current directory when no valid directory is given.
    base_dir = arg if path.isdir(arg) else getcwd()
    try:
        get_pdf(base_url=url, base_dir=base_dir)
    except Exception as e:
        print(e)
        exit(-1)
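Example invocation (a sketch: the class URL and save directory below are
placeholders, and the 'xxx'/'yyy' credential and URL placeholders inside the
script must be replaced first):

    python grab_pdfs.py https://piazza.com/xxxx /home/user/piazza_pdfs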