@msaroufim
Created May 16, 2015 20:26
"""
A simple command line utility that takes in a list of target url's
and downloads all pdf links contained on the page
Potentially look into using scrapy to build a full fledged crawler instead
"""
import requests
import sys
import re
import subprocess
from bs4 import BeautifulSoup
import urlparse
def process(target_url):
page = requests.get(target_url)
print "Succesfully crawled %s" %(target_url)
print "Searching for pdf links"
soup = BeautifulSoup(page.text)
#print soup
links = soup.findAll(href=re.compile("\.pdf$"))
for link in links:
#if pdf is locally linked
if not link.get('href').startswith("http"):
link = urlparse.urljoin(target_url,link.get('href'))
#if pdf link is absolute
else:
link = link.get('href')
print "Downloading: %s" %(link)
#Create pdf files
subprocess.call(["curl", "-O",link],shell=False)
print "Created: %s" %(link)
def main():
if len(sys.argv) == 1:
print "Usage python get-all-pdf-on-page.py target_url_1 target_url_2 ..."
sys.exit(-1)
for target_url in sys.argv[1:]:
process(target_url)
if __name__ == "__main__":
main()
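
Since the script shells out to curl only to save each file, here is a minimal sketch of doing the download with requests alone. This is an illustrative alternative, not part of the original gist; the helper name download_pdf and the 8 KB chunk size are arbitrary choices.

import os
import urlparse

import requests


def download_pdf(link):
    # derive a local filename from the URL path, roughly what curl -O does
    filename = os.path.basename(urlparse.urlsplit(link).path)
    response = requests.get(link, stream=True)
    with open(filename, "wb") as f:
        # stream the body in chunks so large PDFs never sit fully in memory
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
    return filename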
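
The docstring floats Scrapy as a fuller-fledged alternative. Below is a minimal sketch of that idea, assuming a reasonably recent Scrapy with its built-in FilesPipeline enabled in ITEM_PIPELINES and FILES_STORE pointing at a download directory; the spider name, the comma-separated start_urls argument, and the plain-dict items are illustrative choices only.

import scrapy


class PdfSpider(scrapy.Spider):
    # spider name is an arbitrary illustrative choice
    name = "pdf_links"

    def __init__(self, start_urls="", *args, **kwargs):
        super(PdfSpider, self).__init__(*args, **kwargs)
        # accept a comma-separated list of target URLs on the command line
        self.start_urls = [u for u in start_urls.split(",") if u]

    def parse(self, response):
        # yield an item for every anchor whose href ends in .pdf;
        # response.urljoin resolves relative links against the page URL
        for href in response.css('a::attr(href)').extract():
            if href.lower().endswith(".pdf"):
                yield {"file_urls": [response.urljoin(href)]}

With the files pipeline configured in the project settings, this could be run with something like scrapy runspider pdf_spider.py -a start_urls=http://example.com/papers and the pipeline would fetch every URL listed under file_urls.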