brechtm/PDF-Testing.md

## PDF-Testing.md

      
    Raw
  

              PDF-Testing.md
            
          
    PDF Testing Gist

These two files, pdf_linkchecker.py and pdf_fontchecker.py, are code examples that accompany a blog article: http://reachtim.com/articles/PDF-Testing.html
See the article for details on how to test your PDFs for broken internal and external links and for unembedded fonts.

  
## pdf_fontchecker.py
'''
Gist to accompany blog article:
http://reachtim.com/articles/PDF-Testing.html
'''

from PyPDF2 import PdfFileReader
from pprint import pprint
import sys

fontkeys = set(['/FontFile', '/FontFile2', '/FontFile3'])


def walk(obj, fnt, emb, _seen=None):
    '''Recursively collect font names from a tree of PDF objects.

    obj   -- mapping (or list) to inspect, e.g. a page's /Resources
    fnt   -- set, updated in place with every /BaseFont value found
    emb   -- set, updated in place with the /FontName of every mapping
             that also carries one of the keys listed in fontkeys
    _seen -- internal record of containers already visited; callers
             should leave it unset

    Returns the same (fnt, emb) sets that were passed in.
    '''
    if _seen is None:
        _seen = {}
    # Reference cycles would otherwise recurse without end.  The visited
    # object is stored alongside its id() so that the id cannot be reused
    # by another object while the walk is still running.
    if id(obj) in _seen:
        return fnt, emb
    _seen[id(obj)] = obj

    if hasattr(obj, 'keys'):
        if '/BaseFont' in obj:
            fnt.add(obj['/BaseFont'])
        elif '/FontName' in obj and fontkeys.intersection(set(obj)):
            emb.add(obj['/FontName'])
        children = [obj[k] for k in obj]
    else:
        # A list: its items are the children (e.g. a /DescendantFonts array).
        children = obj

    for child in children:
        # Array items may be unresolved references; resolve them when the
        # object offers getObject(), as the PyPDF2 objects used here do.
        resolve = getattr(child, 'getObject', None)
        if resolve is not None:
            child = resolve()
        if hasattr(child, 'keys') or isinstance(child, (list, tuple)):
            walk(child, fnt, emb, _seen)

    return fnt, emb

if __name__ == '__main__':
    # Script entry point: list every font referenced from the page
    # resources of the PDF named on the command line, then list those for
    # which no embedded font file was found.
    pdf = PdfFileReader(sys.argv[1])
    fonts, embedded = set(), set()

    for page in pdf.pages:
        resources = page.getObject()['/Resources']
        page_fonts, page_embedded = walk(resources, fonts, embedded)
        fonts = fonts | page_fonts
        embedded = embedded | page_embedded

    # A single parenthesised argument prints exactly what the bare print
    # statement would.
    print('Font List')
    pprint(sorted(fonts))

    unembedded = fonts - embedded
    if unembedded:
        print('\nUnembedded Fonts')
        pprint(unembedded)

## pdf_linkchecker.py
'''
Gist to accompany blog article:
http://reachtim.com/PDF-Testing.html
'''

from PyPDF2 import PdfFileReader
import requests
import sys
import urllib

def check_ftp(url):
    '''Check that an ftp:// URL can be opened and yields some content.

    Returns a (result, reason) pair:
      (True, 'okay')         -- the URL opened and returned data
      (False, 'Empty Page')  -- the URL opened but returned no data
      (False, <IOError>)     -- opening the URL raised IOError
    '''
    try:
        response = urllib.urlopen(url)
    except IOError as e:
        return False, e

    # Always release the connection, even if reading fails part-way.
    try:
        has_content = bool(response.read())
    finally:
        response.close()

    if has_content:
        return True, 'okay'
    return False, 'Empty Page'

def check_url(url, auth=None):
    '''Fetch a URL and report whether it looks alive.

    url  -- the address to test; ftp:// URLs are delegated to check_ftp
    auth -- optional credentials, passed straight through to requests.get

    Returns a (result, reason) pair.  result is falsy for a bad URL:
      (status_code, reason_phrase)  -- response below 400 with a body
      (False, '<code> <phrase>')    -- the server answered with 4xx/5xx
      (False, 'Empty Page')         -- the response body was empty
      (False, <exception>)          -- requests raised while fetching
    '''
    if url.startswith('ftp://'):
        return check_ftp(url)

    headers = {'User-Agent': 'Mozilla/5.0', 'Accept': '*/*'}
    try:
        response = requests.get(url, timeout=6, auth=auth, headers=headers)
    except requests.RequestException as e:
        # RequestException is the base class of every requests error, so
        # unsupported schemes, invalid URLs and redirect loops are reported
        # as bad URLs instead of aborting the whole run.
        return False, e

    # requests.get() does not raise for error statuses, and a status code
    # such as 404 is truthy, so error responses must be rejected explicitly.
    if response.status_code >= 400:
        return False, '%s %s' % (response.status_code, response.reason)
    if not response.text:
        return False, 'Empty Page'
    return response.status_code, response.reason

def check_pdf(pdf):
    '''Collect and check the link annotations of every page in a PDF.

    pdf -- reader object exposing .pages and .namedDestinations

    Returns a 6-tuple:
      anchors             -- keys of pdf.namedDestinations
      links               -- internal link targets (/A /D or /Dest values)
      superfluous_anchors -- anchors that no link points to
      badlinks            -- link targets that are not among the anchors
      urls                -- every /URI value found, in document order
      badurls             -- one {'url': ..., 'reason': ...} dict for each
                             occurrence of a URL that check_url rejected
    '''
    links = []
    urls = []
    badurls = []
    checked = {}  # url -> (result, reason); each distinct URL is fetched once

    for page in pdf.pages:
        page_obj = page.getObject()
        annots = page_obj.get('/Annots', [])
        # dict.get() hands back the stored value as-is; if /Annots is held
        # as a reference rather than the array itself, resolve it before
        # iterating.
        if hasattr(annots, 'getObject'):
            annots = annots.getObject()

        for ref in annots:
            annot = ref.getObject()
            if '/A' in annot:
                action = annot['/A']
                dst = action.get('/D')
                url = action.get('/URI')
                if dst:
                    links.append(dst)
                elif url:
                    urls.append(url)
                    if url not in checked:
                        checked[url] = check_url(url)
                    result, reason = checked[url]
                    if not result:
                        badurls.append({'url': url, 'reason': '%r' % reason})
            elif '/Dest' in annot:
                links.append(annot['/Dest'])

    anchors = pdf.namedDestinations.keys()
    superfluous_anchors = [x for x in anchors if x not in links]
    badlinks = [x for x in links if x not in anchors]
    return anchors, links, superfluous_anchors, badlinks, urls, badurls

if __name__ == '__main__':
    # Script entry point: run check_pdf on the file named on the command
    # line and print each part of its result as a labelled section.
    fname = sys.argv[1]
    print('Checking %s' % fname)
    report = check_pdf(PdfFileReader(fname))
    anchors, links, superfluous_anchors, badlinks, urls, badurls = report

    sections = (
        ('urls: ', urls),
        ('anchors: ', anchors),
        ('superfluous_anchors: ', superfluous_anchors),
        ('links: ', links),
        ('bad links: ', badlinks),
    )
    for label, values in sections:
        # '%s %s' supplies the single space that a print statement inserts
        # between two comma-separated items.
        print('%s %s' % (label, ', '.join(values)))
        print('')

    print('bad urls:')
    for item in badurls:
        for key, value in item.items():
            print('  {}: {}'.format(key, value))
	'''
	Gist to accompany blog article:
	http://reachtim.com/articles/PDF-Testing.html
	'''

	from PyPDF2 import PdfFileReader
	from pprint import pprint
	import sys

	fontkeys = set(['/FontFile', '/FontFile2', '/FontFile3'])


	def walk(obj, fnt, emb, _seen=None):
	    '''Recursively collect font names from a tree of PDF objects.

	    obj   -- mapping (or list) to inspect, e.g. a page's /Resources
	    fnt   -- set, updated in place with every /BaseFont value found
	    emb   -- set, updated in place with the /FontName of every mapping
	             that also carries one of the keys listed in fontkeys
	    _seen -- internal record of containers already visited; callers
	             should leave it unset

	    Returns the same (fnt, emb) sets that were passed in.
	    '''
	    if _seen is None:
	        _seen = {}
	    # Reference cycles would otherwise recurse without end.  The visited
	    # object is stored alongside its id() so that the id cannot be reused
	    # by another object while the walk is still running.
	    if id(obj) in _seen:
	        return fnt, emb
	    _seen[id(obj)] = obj

	    if hasattr(obj, 'keys'):
	        if '/BaseFont' in obj:
	            fnt.add(obj['/BaseFont'])
	        elif '/FontName' in obj and fontkeys.intersection(set(obj)):
	            emb.add(obj['/FontName'])
	        children = [obj[k] for k in obj]
	    else:
	        # A list: its items are the children (e.g. a /DescendantFonts array).
	        children = obj

	    for child in children:
	        # Array items may be unresolved references; resolve them when the
	        # object offers getObject(), as the PyPDF2 objects used here do.
	        resolve = getattr(child, 'getObject', None)
	        if resolve is not None:
	            child = resolve()
	        if hasattr(child, 'keys') or isinstance(child, (list, tuple)):
	            walk(child, fnt, emb, _seen)

	    return fnt, emb

	if __name__ == '__main__':
	    # Script entry point: list every font referenced from the page
	    # resources of the PDF named on the command line, then list those for
	    # which no embedded font file was found.
	    fname = sys.argv[1]
	    pdf = PdfFileReader(fname)
	    fonts = set()
	    embedded = set()

	    for page in pdf.pages:
	        obj = page.getObject()
	        f, e = walk(obj['/Resources'], fonts, embedded)
	        fonts = fonts.union(f)
	        embedded = embedded.union(e)

	    unembedded = fonts - embedded
	    # A single parenthesised argument prints exactly what the bare print
	    # statement would.
	    print('Font List')
	    pprint(sorted(fonts))

	    if unembedded:
	        print('\nUnembedded Fonts')
	        pprint(unembedded)
	'''
	Gist to accompany blog article:
	http://reachtim.com/PDF-Testing.html
	'''

	from PyPDF2 import PdfFileReader
	import requests
	import sys
	import urllib

	def check_ftp(url):
	    '''Check that an ftp:// URL can be opened and yields some content.

	    Returns a (result, reason) pair:
	      (True, 'okay')         -- the URL opened and returned data
	      (False, 'Empty Page')  -- the URL opened but returned no data
	      (False, <IOError>)     -- opening the URL raised IOError
	    '''
	    try:
	        response = urllib.urlopen(url)
	    except IOError as e:
	        return False, e

	    # Always release the connection, even if reading fails part-way.
	    try:
	        has_content = bool(response.read())
	    finally:
	        response.close()

	    if has_content:
	        return True, 'okay'
	    return False, 'Empty Page'

	def check_url(url, auth=None):
	    '''Fetch a URL and report whether it looks alive.

	    url  -- the address to test; ftp:// URLs are delegated to check_ftp
	    auth -- optional credentials, passed straight through to requests.get

	    Returns a (result, reason) pair.  result is falsy for a bad URL:
	      (status_code, reason_phrase)  -- response below 400 with a body
	      (False, '<code> <phrase>')    -- the server answered with 4xx/5xx
	      (False, 'Empty Page')         -- the response body was empty
	      (False, <exception>)          -- requests raised while fetching
	    '''
	    if url.startswith('ftp://'):
	        return check_ftp(url)

	    # '*/*' accepts any media type; a bare '/' is not a valid Accept value.
	    headers = {'User-Agent': 'Mozilla/5.0', 'Accept': '*/*'}
	    try:
	        response = requests.get(url, timeout=6, auth=auth, headers=headers)
	    except requests.RequestException as e:
	        # RequestException is the base class of every requests error, so
	        # unsupported schemes, invalid URLs and redirect loops are reported
	        # as bad URLs instead of aborting the whole run.
	        return False, e

	    # requests.get() does not raise for error statuses, and a status code
	    # such as 404 is truthy, so error responses must be rejected explicitly.
	    if response.status_code >= 400:
	        return False, '%s %s' % (response.status_code, response.reason)
	    if not response.text:
	        return False, 'Empty Page'
	    return response.status_code, response.reason

	def check_pdf(pdf):
	    '''Collect and check the link annotations of every page in a PDF.

	    pdf -- reader object exposing .pages and .namedDestinations

	    Returns a 6-tuple:
	      anchors             -- keys of pdf.namedDestinations
	      links               -- internal link targets (/A /D or /Dest values)
	      superfluous_anchors -- anchors that no link points to
	      badlinks            -- link targets that are not among the anchors
	      urls                -- every /URI value found, in document order
	      badurls             -- one {'url': ..., 'reason': ...} dict for each
	                             occurrence of a URL that check_url rejected
	    '''
	    links = []
	    urls = []
	    badurls = []
	    checked = {}  # url -> (result, reason); each distinct URL is fetched once

	    for page in pdf.pages:
	        page_obj = page.getObject()
	        annots = page_obj.get('/Annots', [])
	        # dict.get() hands back the stored value as-is; if /Annots is held
	        # as a reference rather than the array itself, resolve it before
	        # iterating.
	        if hasattr(annots, 'getObject'):
	            annots = annots.getObject()

	        for ref in annots:
	            annot = ref.getObject()
	            if '/A' in annot:
	                action = annot['/A']
	                dst = action.get('/D')
	                url = action.get('/URI')
	                if dst:
	                    links.append(dst)
	                elif url:
	                    urls.append(url)
	                    if url not in checked:
	                        checked[url] = check_url(url)
	                    result, reason = checked[url]
	                    if not result:
	                        badurls.append({'url': url, 'reason': '%r' % reason})
	            elif '/Dest' in annot:
	                links.append(annot['/Dest'])

	    anchors = pdf.namedDestinations.keys()
	    superfluous_anchors = [x for x in anchors if x not in links]
	    badlinks = [x for x in links if x not in anchors]
	    return anchors, links, superfluous_anchors, badlinks, urls, badurls

	if __name__ == '__main__':
	    # Script entry point: run check_pdf on the file named on the command
	    # line and print each part of its result as a labelled section.
	    fname = sys.argv[1]
	    print('Checking %s' % fname)
	    pdf = PdfFileReader(fname)
	    anchors, links, superfluous_anchors, badlinks, urls, badurls = check_pdf(pdf)

	    sections = (
	        ('urls: ', urls),
	        ('anchors: ', anchors),
	        ('superfluous_anchors: ', superfluous_anchors),
	        ('links: ', links),
	        ('bad links: ', badlinks),
	    )
	    for label, values in sections:
	        # '%s %s' supplies the single space that a print statement inserts
	        # between two comma-separated items.
	        print('%s %s' % (label, ', '.join(values)))
	        print('')

	    print('bad urls:')
	    for item in badurls:
	        for key, value in item.items():
	            print(' {}: {}'.format(key, value))