Parse URLs from a PPTX file. Creates tab-delimited links.txt output with page numbers and HTTP status codes.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Originally authored by Josh Wright (https://github.com/joswr1ght/pptxsanity)
# With code by Eric Jang ericjang2004@gmail.com
# Updated by Dustin Lee
# https://gist.github.com/992db55dfa9c9dc6a7738f9c3006045b
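#
# Usage example (the filename is illustrative; the script takes a single PPTX
# file as its argument and writes links.txt to the current directory):
#
#   python pptxsanity.py slides.pptx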
TIMEOUT=6   # URL request timeout in seconds
MAXRETRY=4  # Maximum retries per URL request
MAXREDIR=4  # Maximum redirects to follow when checking a 301/302 target
from pptx import Presentation
import sys
import re
import os
import shutil
import glob
import tempfile
import urllib3
try:
    import urllib3.contrib.pyopenssl
    urllib3.contrib.pyopenssl.inject_into_urllib3()
except ImportError:
    pass
import signal
from zipfile import ZipFile
from xml.dom.minidom import parse
import platform
import ssl
from functools import wraps
def sslwrap(func):
    @wraps(func)
    def bar(*args, **kw):
        kw['ssl_version'] = ssl.PROTOCOL_TLSv1
        return func(*args, **kw)
    return bar
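# Note: sslwrap() is never called below; it appears to be kept for the common
# TLSv1 monkey-patch workaround, e.g. (illustrative, not part of this script):
#   ssl.wrap_socket = sslwrap(ssl.wrap_socket)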
# Remove unwanted trailing characters from the end of a URL.
# This is a recursive function.
def striptrailingchar(s):
    # The valid URL charset is A-Za-z0-9-._~:/?#[]@!$&'()*+,;= plus % followed by hex digits,
    # but there is no clean way to separate URLs from the cruft in the XML content, so also
    # strip .),;'? -- note that this applies only to the end of the URL (making ? OK to remove)
    if s[-1] not in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_~:#[]@!$&(*+=/":
        s = striptrailingchar(s[0:-1])
    elif s[-5:] == "&quot":
        s = striptrailingchar(s[0:-5])
    return s
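# Illustrative examples (URLs are made up):
#   striptrailingchar("http://www.example.com/page).") -> "http://www.example.com/page"
#   striptrailingchar("http://www.example.com&quot")   -> "http://www.example.com"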
# Parse the given root node recursively (root is intended to be the paragraph element <a:p>).
# If we encounter a line-break element <a:br>, add a newline to the global paragraphtext;
# if we encounter a node of type TEXT_NODE, append its value to paragraphtext.
paragraphtext = ""
def parse_node(root):
    global paragraphtext
    if root.childNodes:
        for node in root.childNodes:
            if node.nodeType == node.TEXT_NODE:
                paragraphtext += node.nodeValue.encode('ascii', 'ignore')
            if node.nodeType == node.ELEMENT_NODE:
                if node.tagName == 'a:br':
                    paragraphtext += "\n"
                parse_node(node)
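# Illustrative example: for a notes paragraph such as
#   <a:p><a:r><a:t>See http://www.example.com</a:t></a:r></a:p>
# parse_node() leaves paragraphtext == "See http://www.example.com"; an <a:br/>
# child element would contribute a "\n" instead.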
def parseslidenotes(pptxfile):
    global paragraphtext
    urls = {}
    tmpd = tempfile.mkdtemp()
    ZipFile(pptxfile).extractall(path=tmpd, pwd=None)
    path = tmpd + os.sep + 'ppt' + os.sep + 'notesSlides' + os.sep
    # Parse each XML notes file from the notes folder
    for infile in glob.glob(os.path.join(path, '*.xml')):
        # Get the slide number from the notes file name
        slideNumber = re.match(r".*notesSlide(\d+)\.xml", infile).group(1)
        # Parse the slide notes, adding a newline for each break marker and removing XML markup
        dom = parse(infile)
        paragraphs = dom.getElementsByTagName('a:p')
        for paragraph in paragraphs:
            paragraphtext = ""
            parse_node(paragraph)
            # Parse URL content from the notes text for the current paragraph
            urlmatches = re.findall(urlmatchre, paragraphtext)
            for match in urlmatches:  # Each match is a tuple of capture groups
                for urlmatch in match:
                    if urlmatch != '':
                        url = striptrailingchar(urlmatch)
                        if url not in urls:
                            urls[url] = []
                        urls[url].append(int(slideNumber))
    # De-duplicate and sort the page numbers for each URL
    urls = {url: sorted(set(pages)) for url, pages in urls.items()}
    # Remove the temporary files created when unzipping the PPTX
    shutil.rmtree(tmpd)
    return urls
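# Both parseslidenotes() above and parseslidetext() below return a dict mapping each
# URL to the sorted, de-duplicated list of slide numbers it appears on; illustrative
# (made-up) example:
#   {'http://www.example.com': [3, 7], 'www.example.org': [12]}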
# Parse the text on the slides using the python-pptx module, return URLs
def parseslidetext(prs):
    urls = {}
    slidenum = 0
    for slide in prs.slides:
        slidenum += 1
        text_runs = []
        for shape in slide.shapes:
            try:
                if not shape.has_text_frame:
                    continue
            except AttributeError:
                sys.stderr.write("Error: Please upgrade your version of python-pptx: pip uninstall python-pptx ; pip install python-pptx\n")
                sys.exit(-1)
            # Join the runs in this shape, since a URL can be split across several runs
            singletextrun = ""
            for paragraph in shape.text_frame.paragraphs:
                for run in paragraph.runs:
                    singletextrun += run.text
            text_runs.append(singletextrun)
        for text in text_runs:
            if text is None:
                continue
            try:
                m = re.match(urlmatchre, text)
            except (IndexError, TypeError):
                continue
            if m is not None:
                url = striptrailingchar(m.groups()[0])
                if url not in urls:
                    urls[url] = []
                urls[url].append(int(slidenum))
    # De-duplicate and sort the page numbers for each URL
    urls = {url: sorted(set(pages)) for url, pages in urls.items()}
    return urls
def signal_exit(sig, frame):
    sys.exit(0)
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print "Validate URLs in the notes and slides of a PowerPoint PPTX file (version 1.2)."
        print "Creates a 'links.txt' file with each URL, its page number(s), and the HTTP status code."
        print "Usage: pptxsanity.py [pptx file]"
        sys.exit(1)
    elif sys.argv[1].endswith('ppt'):
        print "PowerPoint PPTX files only, please."
        print "Usage: pptxsanity.py [pptx file]"
        sys.exit(1)

    signal.signal(signal.SIGINT, signal_exit)

    # Disable the urllib3 InsecureRequestWarning
    try:
        urllib3.disable_warnings()
    except AttributeError:
        sys.stdout.write("You need to upgrade your version of the urllib3 library to the latest available.\n")
        sys.stdout.write("Try running the following command to upgrade urllib3: sudo pip install urllib3 --upgrade\n")
        sys.exit(1)

    try:
        prs = Presentation(sys.argv[1])
    except Exception:
        sys.stderr.write("Invalid PPTX file: " + sys.argv[1] + "\n")
        sys.exit(-1)
    # A previous version used an extremely comprehensive URL regex, but it was too aggressive
    # for what we want here (it matched arp:remote in "ettercap -TqM arp:remote // //"),
    # so use something simpler.
    urlmatchre = re.compile(r'((https?://[^\s<>"]+|www\.[^\s<>"]+))', re.DOTALL)
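    # The simpler pattern matches e.g. "https://www.example.com/page" and "www.example.com".
    # Note the doubled parentheses create two capture groups, so re.findall() returns
    # 2-tuples -- hence the nested loop over match tuples in parseslidenotes() above.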
    privateaddr = re.compile(r'(\S+127\.)|(\S+192\.168\.)|(\S+10\.)|(\S+172\.1[6-9]\.)|(\S+172\.2[0-9]\.)|(\S+172\.3[0-1]\.)|(\S+::1)')
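    # privateaddr matches RFC 1918 ranges, loopback, and IPv6 ::1, e.g. (illustrative)
    # "http://10.0.0.1/", "https://192.168.1.5/admin", "http://[::1]/"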
    # Set SKIP300=1 in the environment to stop treating 301/302 redirects specially
    # (renamed from SKIP200, but rarely used in practice)
    SKIP300 = int(os.getenv('SKIP300', 0))
    with open('links.txt', 'w') as links:
        links.write("URL\tPage Number(s)\tHTTP Status Code\tHTTP Action\r\n")
        urls = {}
        urls.update(parseslidetext(prs))
        urls.update(parseslidenotes(sys.argv[1]))
        # TODO: a URL that appears both on slides and in notes keeps only the notes page
        # numbers, since dict.update() overwrites; merge the two page lists instead.
        for url, pages in urls.iteritems():
            pagenum = ', '.join(str(x) for x in pages)  # Turn the sorted ints back into strings
            if ',' in pagenum:
                pageline = "Pages "
            else:
                pageline = "Page "
            # OS X Bus Error workaround (#22)
            if platform.system() == "Darwin":
                if "whois.net" in url or "isecpartners" in url:
                    print "Skipping URL for OS X bug workaround (%s)" % url
                    continue
            url = url.encode('ascii', 'ignore')
            # Add a default URI scheme for www.anything URLs
            if url[0:3] == "www":
                url = "http://" + url
            # Some authors include URLs in the form http://www.josh.net.[1], http://www.josh.net[1].
            # or http://www.josh.net[1]; remove the footnote and any leading or trailing dot.
            footnote = re.compile(r"(\.\[\d+\]|\[\d+\]\.|\[\d+\])")
            if re.search(footnote, url):
                url = re.sub(footnote, "", url)
            # Remove a trailing period
            if url[-1] == ".":
                url = url[:-1]
            # Skip private IP addresses and localhost
            if re.match(privateaddr, url):
                continue
            if '//localhost' in url:
                continue
            # Uncomment this debug line to print each URL before testing its status, to identify
            # sites causing the "Bus Error" fault on OS X
            #print "DEBUG: %s" % url
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:35.0) Gecko/20100101 Firefox/35.0'}
            retries = urllib3.Retry(redirect=False, total=MAXRETRY, connect=0, read=0)
            http = urllib3.PoolManager(timeout=TIMEOUT, retries=retries)
            try:
                req = http.urlopen('HEAD', url, headers=headers, redirect=False)
                code = req.status
            except Exception, e:
                links.write(url + "\t" + pageline + pagenum + "\t" + str(e) + "\tError\r\n")
                continue  # No response code to test; move on to the next URL
            # Some non-compliant web servers return 404 or 405 "Method Not Allowed" for HEAD
            # requests (e.g. microsoft.com). If we get a 404 or 405, retry with GET and report
            # the corresponding response code.
            if code == 404 or code == 405:
                try:
                    req = http.request('GET', url, headers=headers, redirect=False)
                    code = req.status
                    if code == 404:
                        links.write(url + "\t" + pageline + pagenum + "\t" + str(code) + "\tNot Found\r\n")
                    elif code == 405:
                        links.write(url + "\t" + pageline + pagenum + "\t" + str(code) + "\tMethod Not Allowed\r\n")
                    else:
                        links.write(url + "\t" + pageline + pagenum + "\t" + str(code) + "\t(Previous 404/405)\r\n")
                except Exception, e:
                    links.write(url + "\t" + pageline + pagenum + "\t" + str(e) + "\tError\r\n")
            elif SKIP300 == 0 and (code == 301 or code == 302):
                links.write(url + "\t" + pageline + pagenum + "\t" + str(code) + "\tRedirect\r\n")
                # Follow the redirect chain and report where it lands
                req = http.urlopen('HEAD', url, headers=headers, retries=urllib3.Retry(redirect=MAXREDIR))
                code = req.status
                redirecturl = req.geturl()
                if code == 200:
                    links.write(redirecturl + "\t" + pageline + pagenum + "\t" + str(code) + "\tSuccess (Redirected)\r\n")
            elif code == 200:
                links.write(url + "\t" + pageline + pagenum + "\t" + str(code) + "\tSuccess\r\n")
            else:
                links.write(url + "\t" + pageline + pagenum + "\t" + str(code) + "\tError\r\n")
    if os.name == 'nt':
        raw_input("Press Enter to exit.")
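# Example links.txt contents (tab-separated; URLs and codes are illustrative):
#   URL                          Page Number(s)   HTTP Status Code   HTTP Action
#   http://www.example.com       Page 3           200                Success
#   https://www.example.org/old  Pages 5, 9       301                Redirect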