Skip to content

Instantly share code, notes, and snippets.

Last active June 9, 2016 14:35
Show Gist options
  • Save scottstamp/e585f7245caadc5dc0c8 to your computer and use it in GitHub Desktop.
Save scottstamp/e585f7245caadc5dc0c8 to your computer and use it in GitHub Desktop.
Utility for checking anchor links for the Docker documentation
"""Check anchor links on a documentation site served at ``root``.

Prints a ``Missing - (<page>): #<anchor>`` line for every collected anchor
name that has no matching element id.
"""
__author__ = "Scott Stamp <>"
from HTMLParser import HTMLParser
from urlparse import urljoin
from sys import setrecursionlimit
import re
import requests
# Base URL of the site to check: it is the first page fetched and the base
# against which every href is resolved with urljoin().
root = 'http://localhost:8000'
class DataHolder:
    """Hold one value under a configurable attribute name.

    Calling the instance stores the value and returns it, which allows
    "assign and test" inside an expression::

        holder = DataHolder(attr_name='match')
        if holder(re.search(pattern, text)):
            print(holder.match.group(1))
    """

    def __init__(self, value=None, attr_name='value'):
        """Create the holder.

        value     -- initial value to store (default None).
        attr_name -- name of the instance attribute that carries the value.
        """
        self._attr_name = attr_name
        # Store the initial value right away; otherwise `value` is silently
        # ignored and get() raises AttributeError until the first set().
        self.set(value)

    def __call__(self, value):
        """Store `value` and return it (shorthand for set())."""
        return self.set(value)

    def set(self, value):
        """Store `value` under the configured attribute and return it."""
        setattr(self, self._attr_name, value)
        return value

    def get(self):
        """Return the currently stored value."""
        return getattr(self, self._attr_name)
class Parser(HTMLParser):
    """HTML parser that records link fragments and element ids while crawling.

    One Parser is created per page; `origin` is the URL of the page being
    parsed.  For every start tag it:

    * records the fragment (the text after ``#``) of each matching href in
      ``anchors[origin]``;
    * fetches and parses each linked page that has not been crawled yet;
    * records the tag's ``id`` attribute, if any, in ``ids``.
    """

    # The containers below are class attributes on purpose: every Parser
    # instance created during the crawl accumulates into the same objects, and
    # the caller reads the combined result from any instance.
    ids = set()        # every `id` attribute value seen on any page
    crawled = set()    # URLs already fetched, so no page is parsed twice
    anchors = {}       # origin page URL -> set of fragment names linked there
    pages = set()      # not used by this class; kept for compatibility
    save_match = DataHolder(attr_name='match')

    def __init__(self, origin):
        """Create a parser for the page located at URL `origin`."""
        # The base class must be initialised or feed() fails on missing
        # parser state.  Called explicitly (not via super()) because
        # Python 2's HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        self.origin = origin

    def handle_starttag(self, tag, attrs):
        """Collect the fragment and id of a start tag and follow its link.

        tag   -- tag name (unused; every tag type is inspected).
        attrs -- list of (name, value) pairs as supplied by HTMLParser.
        """
        attrs = dict(attrs)

        # A value-less attribute (`<a href>`) yields None; skip it instead of
        # handing None to re.match.
        href = attrs.get('href')
        # Only hrefs that start with the site root, start with "/", or are a
        # non-empty "#fragment" are considered.
        if href and re.match(r'^{0}|\/|\#[\S]{{1,}}'.format(root), href):
            if self.save_match(re.search(r'.*\#(.*?)$', href)):
                anchor_name = self.save_match.match.group(1)
                # An empty fragment ("page#") names no element; skip it.
                if anchor_name:
                    self.anchors.setdefault(self.origin, set()).add(
                        anchor_name)

            url = urljoin(root, href)
            # Pure "#fragment" links point back into the current page, so
            # there is nothing new to fetch for them.
            if url not in self.crawled and not re.match(r'^\#', href):
                # Mark before fetching so pages that link to each other do
                # not recurse forever.
                self.crawled.add(url)
                Parser(url).feed(requests.get(url).content)

        if 'id' in attrs:
            self.ids.add(attrs['id'])
# NOTE(review): setrecursionlimit is imported at the top of the file but was
# never called.  Raise the limit in case parsing nests deeply while following
# links; confirm the value is adequate for the site being checked.
setrecursionlimit(10000)

r = requests.get(root)
parser = Parser(root)
# Parse the start page; without this call nothing is collected and the
# report below is always empty.
parser.feed(r.content)

# Report every linked fragment that has no element id anywhere in the collected
# set.  Sorting only makes the output order reproducible.
for anchor in sorted(parser.anchors):
    for anchor_name in sorted(parser.anchors[anchor]):
        if anchor_name not in parser.ids:
            print('Missing - ({0}): #{1}'.format(
                anchor.replace(root, ''), anchor_name))
Copy link

lsloan commented Jun 9, 2016

Interesting enhancement to DataHolder. AFAICT, the original is here:

I was going to ask about the changes, but since you "don't even know how the hell this works", I don't want to put you on the spot. 😉

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment