Skip to content

Instantly share code, notes, and snippets.

@serv-inc
Created December 11, 2018 06:13
Show Gist options
  • Save serv-inc/0405594483a4115233f47ab19cfbf3b2 to your computer and use it in GitHub Desktop.
Save serv-inc/0405594483a4115233f47ab19cfbf3b2 to your computer and use it in GitHub Desktop.
'''loads pdf file in sys.argv[1], extracts URLs, tries to load each URL'''
import urllib
import sys
import PyPDF2
# credits to stackoverflow.com/questions/27744210
def extract_urls(filename):
    '''Yield the target of every URI link annotation found in a PDF.

    filename -- path of the PDF file to scan.

    Walks each page's '/Annots' array and yields the '/URI' value stored
    in an annotation's '/A' (action) dictionary.  Pages without
    annotations, and annotations without an action or without a URI,
    are skipped.

    This is a generator: the file is opened on the first iteration and
    closed once the generator is exhausted or closed.

    NOTE(review): this uses the legacy PyPDF2 names (PdfFileReader,
    getNumPages, getPage, getObject).  Newer PyPDF2 releases spell these
    PdfReader, pages and get_object -- confirm against the installed
    version.
    '''
    key = '/Annots'
    uri = '/URI'
    ank = '/A'
    # The reader keeps pulling objects from this stream while we iterate,
    # so the file has to stay open for the lifetime of the generator.
    with open(filename, 'rb') as pdf_file:
        pdf = PyPDF2.PdfFileReader(pdf_file)
        for page_number in range(pdf.getNumPages()):
            page_object = pdf.getPage(page_number).getObject()
            # `in` replaces dict.has_key(), which does not exist on Python 3.
            if key not in page_object:
                continue
            for annotation in page_object[key]:
                entry = annotation.getObject()
                # Only annotations carrying an action dictionary can hold a
                # URI; indexing '/A' unconditionally would raise KeyError.
                if ank in entry and uri in entry[ank]:
                    yield entry[ank][uri]
def check_http_url(url):
    '''Try to open url once and release the connection again.

    url -- the URL to load.

    Returns None.  Any error raised by urlopen (unreachable host, HTTP
    error status, malformed URL, ...) propagates to the caller.
    '''
    # urllib.urlopen only exists on Python 2; on Python 3 the function
    # lives in urllib.request.  Import locally so either interpreter works
    # without touching the file-level imports.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen
    # Close the response explicitly instead of leaking the socket.
    urlopen(url).close()
if __name__ == "__main__":
    # Script entry point: check every URL found in the PDF named on the
    # command line.  Exit with a usage message instead of an IndexError
    # traceback when the argument is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: %s FILE.pdf' % sys.argv[0])
    for url in extract_urls(sys.argv[1]):
        check_http_url(url)
@jsbien
Copy link

jsbien commented May 8, 2019

Running `python3 check_pdf_urls.py file.pdf` gives me:
File "check_pdf_urls.py", line 21, in extract_urls
if pageObject.has_key(key):
AttributeError: 'PageObject' object has no attribute 'has_key'

@jsbien
Copy link

jsbien commented Apr 9, 2023

File "/home/jsbien/Downloads/check_pdf_urls/check_pdf_urls.py", line 12
'''extracts all urls from filename'''
IndentationError: expected an indented block

@wvwhome
Copy link

wvwhome commented Apr 9, 2023

Using PyPDF2 version 3.0.1 and Python 3.10.4 on Windows

'''loads pdf file in sys.argv[1], extracts URLs, tries to load each URL'''

# Announce which script file is running.
print('\nstart', __file__)

# Request headers used when checking URLs: a browser-like User-Agent string.
my_header_agent = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) '
        'Gecko/20100101 Firefox/80.0'
    ),
}

def extract_urls(filename):
    '''Yield the target of every URI link annotation found in a PDF.

    filename -- path of the PDF file to scan.

    Walks each page's '/Annots' array and yields the '/URI' value stored
    in an annotation's '/A' (action) dictionary.  Pages without
    annotations, and annotations without an action or without a URI,
    are skipped.

    This is a generator: the file is opened on the first iteration and
    closed once the generator is exhausted or closed.
    '''
    key = '/Annots'
    uri = '/URI'
    ank = '/A'

    # The reader keeps pulling objects from this stream while we iterate,
    # so the file has to stay open for the lifetime of the generator.
    with open(filename, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        for page in reader.pages:
            page_object = page.get_object()
            if key not in page_object:
                continue
            for annotation in page_object[key]:
                entry = annotation.get_object()
                # Only annotations carrying an action dictionary can hold a
                # URI; indexing '/A' unconditionally would raise KeyError.
                if ank in entry and uri in entry[ank]:
                    yield entry[ank][uri]

def check_http_url(url):
    '''Send one HEAD request to url and print what came back.

    url -- the URL to check.

    Prints '  ok' for status 200, the status code and redirect target for
    redirect statuses, and the status code plus response headers for
    anything else.  Errors raised while making the request are printed,
    not propagated, so one bad link does not stop the whole run.
    Returns None.
    '''
    print('\ncheck url:', url)

    # Cache-Control is a request header.  Passing it through `params=`
    # would append "?Cache-Control=no-cache" to the URL's query string and
    # so check a different URL than the one found in the PDF.
    request_headers = dict(my_header_agent)
    request_headers['Cache-Control'] = 'no-cache'

    try:
        response = requests.head(url, timeout=60, headers=request_headers)
    except Exception as exc_log:
        # Deliberately broad: this is the per-URL boundary of a best-effort
        # link check, and the failure is reported rather than swallowed.
        print('exc_log:', exc_log)
        return

    status_code_int = response.status_code
    if status_code_int == 200:
        print('  ok')
        return

    print('    status_code:', status_code_int)
    if status_code_int in (301, 302, 303, 307, 308):
        # requests.head() does not follow redirects by default, so the
        # target is read from the Location header of this same response
        # (no second request needed).  .get() avoids a KeyError when the
        # server omits the header.
        new_location = response.headers.get('Location')
        print('    new_location:', new_location)
    else:
        print('    header_2:\n', response.headers)

if __name__ == "__main__":
    # Script entry point: check every URL found in the PDF named on the
    # command line.  Exit with a usage message instead of an IndexError
    # traceback when the argument is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: {} FILE.pdf'.format(sys.argv[0]))
    for url in extract_urls(sys.argv[1]):
        check_http_url(url)


# Announce that the script file has run to completion.
print('\nfinished', __file__)

@jsbien
Copy link

jsbien commented Apr 9, 2023

On Debian I am having problems installing PyPDF2, so I am giving up for now.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment