|
''' |
|
Gist to accompany blog article: |
|
http://reachtim.com/PDF-Testing.html |
|
''' |
|
|
|
from PyPDF2 import PdfFileReader |
|
import requests |
|
import sys |
|
import urllib |
|
|
|
def check_ftp(url):
    """Check that a URL opened through the standard library returns content.

    Args:
        url: URL to open with the standard library's ``urlopen``.

    Returns:
        A ``(result, reason)`` tuple:

        * ``(True, 'okay')`` when the resource opened and yielded data,
        * ``(False, 'Empty Page')`` when it opened but yielded no data,
        * ``(False, <exception>)`` when opening it raised ``IOError``.
    """
    # Resolve urlopen locally so the function runs on both Python 2 and
    # Python 3 regardless of what the file-level ``import urllib`` provides.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2
    from contextlib import closing

    try:
        response = urlopen(url)
    except IOError as e:
        return False, e

    # closing() releases the connection even when read() raises.
    with closing(response):
        if response.read():
            return True, 'okay'
    return False, 'Empty Page'
|
|
|
def check_url(url, auth=None, timeout=6):
    """Fetch *url* and report whether it points at a usable page.

    URLs starting with ``ftp://`` are delegated to ``check_ftp``; everything
    else is fetched with ``requests.get``.

    Args:
        url: The URL to check.
        auth: Optional value passed through as the ``auth`` argument of
            ``requests.get``.
        timeout: Timeout passed to ``requests.get``. Defaults to 6, the
            value that was previously hard-coded.

    Returns:
        A ``(result, reason)`` tuple:

        * ``(status_code, reason_phrase)`` for a successful, non-empty
          response,
        * ``(False, 'Empty Page')`` for a successful response with no text,
        * ``(False, <exception>)`` when the request failed or the server
          answered with a 4xx/5xx status.
    """
    if url.startswith('ftp://'):
        return check_ftp(url)

    headers = {'User-Agent': 'Mozilla/5.0', 'Accept': '*/*'}
    try:
        response = requests.get(url, timeout=timeout, auth=auth,
                                headers=headers)
        # requests.get() does not raise for 4xx/5xx on its own; without this
        # call a 404 page with a body would be reported as a good link.
        response.raise_for_status()
    except requests.RequestException as e:
        # RequestException is the base of ConnectionError, HTTPError, Timeout
        # and MissingSchema, and also covers InvalidSchema / InvalidURL so a
        # non-HTTP URI is reported as a bad URL instead of aborting the run.
        return False, e

    if response.text:
        return response.status_code, response.reason
    return False, 'Empty Page'
|
|
|
def check_pdf(pdf):
    """Collect and validate the link annotations of a PDF.

    Walks every page of *pdf*, gathering internal link targets (an action's
    ``/D`` entry or an annotation's ``/Dest`` entry) and external URLs (an
    action's ``/URI`` entry). Each URL is checked with ``check_url`` as it
    is found.

    Args:
        pdf: Object exposing ``pages`` and ``namedDestinations``; the
            script passes a ``PdfFileReader``.

    Returns:
        A tuple ``(anchors, links, superfluous_anchors, badlinks, urls,
        badurls)`` where ``anchors`` are the keys of
        ``pdf.namedDestinations``, ``superfluous_anchors`` are anchors no
        link points to, ``badlinks`` are link targets that are not anchors,
        and ``badurls`` is a list of ``{'url': ..., 'reason': ...}`` dicts
        for URLs whose check returned a falsy result.
    """
    links, urls, badurls = [], [], []

    for page in pdf.pages:
        page_dict = page.getObject()
        # Resolve every annotation reference up front, before any URL check.
        annotations = [ref.getObject() for ref in page_dict.get('/Annots', [])]
        for annot in annotations:
            if '/A' not in annot:
                # No action: the annotation may carry its target directly.
                if '/Dest' in annot:
                    links.append(annot['/Dest'])
                continue

            target = annot['/A'].get('/D')
            uri = annot['/A'].get('/URI')
            if target:
                # An internal destination takes precedence over a URI.
                links.append(target)
                continue
            if not uri:
                continue

            urls.append(uri)
            ok, why = check_url(uri)
            if not ok:
                badurls.append({'url': uri, 'reason': '%r' % why})

    anchors = pdf.namedDestinations.keys()
    superfluous_anchors = [name for name in anchors if name not in links]
    badlinks = [target for target in links if target not in anchors]
    return anchors, links, superfluous_anchors, badlinks, urls, badurls
|
|
|
def _format_items(items):
    """Join *items* with ``', '``, converting each one to text first."""
    # Link targets come straight from the PDF and may not be plain strings
    # (presumably explicit destination arrays -- confirm against PyPDF2);
    # a bare str.join would raise TypeError on those.
    return ', '.join('%s' % (item,) for item in items)


def main(argv=None):
    """Check the PDF named on the command line and print a link report.

    Args:
        argv: Argument list without the program name. Defaults to
            ``sys.argv[1:]``. Only the first argument is used.

    Returns:
        Process exit status: 0 after printing the report, 2 when no file
        name was supplied.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        # Report the usage instead of failing with an IndexError.
        sys.stderr.write('usage: %s FILE.pdf\n' % sys.argv[0])
        return 2

    fname = args[0]
    print('Checking %s' % fname)
    pdf = PdfFileReader(fname)
    anchors, links, superfluous_anchors, badlinks, urls, badurls = check_pdf(pdf)

    sections = (
        ('urls: ', urls),
        ('anchors: ', anchors),
        ('superfluous_anchors: ', superfluous_anchors),
        ('links: ', links),
        ('bad links: ', badlinks),
    )
    for label, items in sections:
        # Single-argument print() behaves the same on Python 2 and 3; the
        # explicit ' ' reproduces the separator that ``print a, b`` inserted.
        print(label + ' ' + _format_items(items))
        print('')

    print('bad urls:')
    for item in badurls:
        for key, value in item.items():
            print(' {}: {}'.format(key, value))
    return 0


if __name__ == '__main__':
    sys.exit(main())
It's a great resource, and to be honest, conversations like these remind me why I adore the IT community. Everyone is always willing to lend a hand and share priceless advice. Speaking of advice, I recently had a little experience with PDFs of my own. My phone was inundated with paperwork as I was leaving for a client presentation—proposals, contracts, you name it. Finding the correct PDF at the right moment became a mini-nightmare amid the confusion. Subsequently, I discovered something that gave me hope for the future. This post http://www.techguide.com.au/news/internet-news/adding-pdf-to-apple-wallet-a-travelers-guide-for-safe-pdf-document-storage/ about adding PDFs to Apple Wallet caught my attention; although it seems straightforward, it was enlightening. Take a look at TechGuide for yourself. For me, it's proven invaluable, particularly in circumstances where I need to quickly access my documents without having to go through emails or folders.