Created
May 4, 2022 16:35
-
-
Save alanbernstein/2dcb49a275e1e87cc4ae6f1cdeafdaa3 to your computer and use it in GitHub Desktop.
markdown link validator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ipython | |
from glob import glob | |
import os | |
import re | |
# from urlparse import urlparse | |
from urllib.parse import urlparse | |
from ipdb import set_trace as debug | |
from pprint import pprint as pp | |
""" | |
Documentation site generators/frameworks that we have used do not have the best support | |
for link validation. That is, they may correctly identify broken links when the
URL does not exist, but they don't even try to check whether the URL *fragment* exists. | |
This is a major usability issue, because when doc sections get moved around between pages, | |
or the section headings get renamed, the anchor links / fragments get renamed as well, | |
resulting in broken internal links on the site, which is quite unprofessional. | |
It is often tricky to keep this in mind while in the middle of "refactoring" docs, so | |
instead of trying to remember, I wrote this script to scan the entire docs repo for | |
issues with URLs and a few other minor things. | |
This was originally written for Hugo, then modified to work for Docusaurus. The general | |
approach is probably applicable to Jekyll, but probably needs to be updated to account | |
for the directory structure. | |
""" | |
# TODO use an md parser
# TODO handle relative paths robustly
# TODO: check for external->internal links
# TODO: check if all files are discoverable via nav sidebar
# TODO make sure links are local-absolute and end in slash
#   end in slash: That would avoid a redirect (giving us referrers in analytics and lower latency) and I think the anchors get wiped during the redirect. This could be done with a clever sed line.
#   local-absolute (/docs/introduction instead of ../../docs/introduction): for consistency, and it would probably be easier to find and change later

# Root of the docs content tree. expanduser('~') instead of os.getenv('HOME')
# so this doesn't raise TypeError when $HOME is unset (e.g. some CI shells).
content_dir = os.path.expanduser('~') + '/src/molecula-docs/docs/'
scan_dirs = [
    content_dir
]
# File keys (tuples of path components) to skip when reporting problems.
ignore_files = [
    ('docs', 'style-guide'),  # contains informative examples of broken links
]
file_pattern = '*.md'
# finds '[X](Y)', where X excludes ']', Y excludes ')', and X, Y are captured groups.
# Raw string avoids invalid-escape SyntaxWarnings on modern Python.
markdown_link_regex = r'\[([^\]]*)\]\(([^)]*)\)'
def main():
    """Scan the docs tree for links and anchors, then report problems."""
    links, anchors = scan_files(scan_dirs, file_pattern)
    sidebars = load_sidebars()  # placeholder; see find_unreachable_pages below
    find_problems(links, anchors)
    # find_unreachable_pages(pages, sidebars)
def load_sidebars():
    """Placeholder: sidebar-config loading is not implemented yet."""
    return []
def find_links_to_duplicates(file_data):
    """TODO: flag links whose target anchor has duplicate definitions."""
    # check if the anchor that a link links to has duplicate lines...
    pass
def internal_missing_slash(link):
    """Return True when a link's path lacks a trailing slash.

    Trailing slashes avoid a redirect (preserving referrers in analytics and
    lowering latency), and anchors may get wiped during the redirect.

    Skips images, mailto links, and same-page fragments.
    """
    if link['is_image']:
        # don't care about images
        return False
    if link['url'].startswith('mailto'):
        # don't care about mail links
        return False
    parsed = urlparse(link['url'])
    if parsed.path == '' and parsed.fragment:
        # same-page fragment, not really a link
        return False
    elif not parsed.path.endswith('/'):
        return True
    # bugfix: previously fell off the end here, returning implicit None;
    # None is falsy so callers behaved the same, but explicit is better.
    return False
def internal_unmatched(link, anchors):
    """Return True when an internal anchor link has no matching known anchor.

    NOTE: not perfect, because the scan doesn't cover everything.
    TODO: check that the target page itself exists (how?)
    """
    if not link['is_anchor'] or link['netloc'] != '':
        return False
    return link['anchor_key'] not in anchors
def find_problems(links, anchors):
    """Print a report of suspicious links, grouped by file, then a summary."""
    print('checking links in:')
    for scan_dir in scan_dirs:
        print(' %s' % scan_dir)
    print('bad links to other sections are false positives.\n')

    def describe(error_type, link):
        # One report line per problem, in the shared format.
        return '%s (%d) : %s -- %s' % (error_type, link['line'], link['anchor_key'], link['raw'])

    error_count = 0
    problem_files_count = 0
    for file_key, file_links in links.items():
        if file_key in ignore_files:
            continue
        errors = []
        for link in file_links:
            if link['scheme'] in ('mailto', 'tel'):
                # ignore these
                continue
            is_external = link['url'].startswith('http')
            if not is_external and '.md' in link['raw']:
                # internal links should not carry the .md extension
                errors.append(describe('.md', link))
            if not is_external and not link['url'].startswith(('#', '/', 'localhost')):
                errors.append(describe('no leading slash', link))
            if internal_unmatched(link, anchors):
                errors.append(describe('bad link', link))
            # if internal_missing_slash(link):
            #     errors.append('missing slash: %s -- %s' % (link['anchor_key'], link['raw']))
        if errors:
            problem_files_count += 1
            print('/'.join(file_key))
            for err in errors:
                error_count += 1
                print(' %s' % err)
    print('found %d errors across %d files' % (error_count, problem_files_count))
def print_anchors(anchors, filt_func=None):
    """Print every anchor accepted by filt_func (all anchors by default)."""
    keep = filt_func or true_filter
    for anchor in anchors.values():
        if keep(anchor):
            print(' %d:%s' % (anchor['line'], anchor['raw']))
def print_links(links, filt_func=None):
    """Print links grouped by file, filtered by filt_func (keep all by default)."""
    keep = filt_func or true_filter
    for file_key, file_links in links.items():
        print(file_key)
        for link in file_links:
            if keep(link):
                print(' %d:%s' % (link['line'], link['raw']))
def ref_filter(link):
    """Keep only Hugo-style '{{< ref >}}' links."""
    return link['is_ref']
def anchor_filter(link):
    """Keep only links that carry a '#fragment'."""
    return link['is_anchor']
def true_filter(_item):
    """Default filter: accept everything."""
    return True
def get_file_data(fname, file_key):
    """Extract markdown links and anchors from a single file.

    fname: path to the markdown file to scan.
    file_key: tuple of path components identifying the file (no '.md' suffix).

    Returns (links, anchors):
      links   - list of dicts, one per markdown link found, with parsed fields.
      anchors - dict mapping anchor_key tuples (file_key + slug) to anchor info;
                repeated slugs on the same page get a 'duplicate_lines' list.
    """
    with open(fname) as f:
        lines = f.read().strip().split('\n')
    code_fence = False
    links = []
    anchors = {}
    # print(fname)
    for n, line in enumerate(lines, 1):
        # find anchors (any element with an `id` attribute in html
        if line.startswith('```'):
            # toggle on every fence marker; content inside fences is skipped
            code_fence = not code_fence
        if code_fence:
            continue
        slugs = []
        if line.startswith('#'):
            # section header automatically gets an `id`
            slugs = [slugify(line)]
        elif 'id="' in line:
            # other things can have `id`s manually added (like in the pilosa.com glossary)
            slugs = re.findall('id="([^"]*)"', line)
        for slug in slugs:
            anchor_key = tuple(list(file_key) + [slug])
            # debug()
            if anchor_key in anchors:
                # same slug seen before on this page: record the extra line
                # numbers rather than overwriting the first occurrence
                if 'duplicate_lines' in anchors[anchor_key]:
                    anchors[anchor_key]['duplicate_lines'].append(n)
                else:
                    anchors[anchor_key]['duplicate_lines'] = [n]
                continue
            anchors[anchor_key] = {
                'raw': line,
                'file': fname,
                'line': n,
                'slug': slug,
            }
            # print(anchor_key)
        # find links
        matches = re.findall(markdown_link_regex, line)
        for match in matches:
            url = match[1]
            if ' ' in url:
                # markdown allows '[text](url title)'; split off the title
                parts = url.split(' ')
                url = parts[0]
                title = ' '.join(parts[1:])
            else:
                title = ''
            parsed = urlparse(url)
            is_ref = '{{< ref ' in url  # leftover Hugo ref shortcode
            if is_ref:
                print('!!!!!!!!!!! ref link found: %s:%d' % (fname, n))
            is_relative = parsed.netloc == ''
            anchor = parsed.fragment
            is_anchor = '#' in url
            is_pilosa = 'pilosa.com' in url or is_relative
            is_image = '.gif' in url.lower() or '.jpg' in url.lower() or '.png' in url.lower() or '.svg' in url.lower()
            anchor_key = None
            # only internal non-image links get an anchor_key for matching
            if not is_image and not parsed.netloc:
                if not parsed.path:
                    # same-page fragment: target page is this file
                    page_key = file_key
                else:
                    # TODO un-hack
                    parsed_path = parsed.path.replace('docs/latest', 'docs')
                    # this "aliases" docs/latest to docs, so i dont have to deal with the multiple
                    # docs directories
                    path_parts = [p for p in parsed_path.split('/') if p not in ['..', '']]
                    if len(path_parts) == 1:
                        # bare page name: assume it lives in this file's top dir
                        path_parts = [file_key[0]] + path_parts
                    page_key = tuple(path_parts)
                anchor_key = tuple(list(page_key) + [anchor])
            # print(' %s %s' % (right_pad(url, 40), anchor_key))
            links.append({
                'raw': '[%s](%s)' % match,
                'file': fname,
                'line': n,
                'scheme': parsed.scheme,
                'netloc': parsed.netloc,
                'text': match[0],
                'title': title,
                'url': url,
                'is_ref': is_ref,
                'is_relative': is_relative,
                'is_anchor': is_anchor,
                'is_pilosa': is_pilosa,
                'is_image': is_image,
                'anchor': anchor,
                'anchor_key': anchor_key,
            })
    return links, anchors
def right_pad(s, n):
    """Pad s with spaces on the right to width n; longer strings pass through."""
    return s.ljust(n)
def slugify(text):
    """Convert a markdown heading line into its auto-generated anchor id.

    Lowercases, strips '#' markers, turns spaces/underscores into hyphens,
    and drops any character outside [0-9a-z-].
    """
    # if it's a link, remove the url and link syntax, leaving only the display text
    # (raw strings: non-raw '\[' etc. trigger invalid-escape SyntaxWarnings)
    m = re.search(r'\[(.*)\]\((.*)\)', text)
    if m:
        text = '# ' + m.group(1)
    text = text.lower().replace('#', '').strip()
    text = re.sub('[ _]', '-', text)
    text = re.sub(r'[^0-9a-z\-]', '', text)
    return text
def scan_files(scan_dirs, file_pattern):
    """Recursively scan directories for matching files; collect links/anchors.

    scan_dirs: list of directory paths to search (recursively).
    file_pattern: glob pattern for filenames, e.g. '*.md'.

    Returns two dicts:
      links = {  # grouped by file
          file_key: [  # file_key is a tuple of path elements, like ('explanations', 'architecture')
              link_data,
              ...
          ],
      }
      anchors = {  # all in one dict for easy search
          anchor_key: anchor_data,
          ...
      }
    """
    links = {}
    anchors = {}
    print('scanning files...')
    for scan_dir in scan_dirs:  # renamed from 'dir' (shadowed the builtin)
        files = glob(scan_dir + '/**/' + file_pattern, recursive=True)
        for file in files:
            # Build the file key from the path relative to the scan dir.
            file_key = tuple(file[len(scan_dir):].split('/'))
            # Strip the '.md' extension from the key, because links shouldn't
            # be using it. Only strip a true suffix: the previous
            # .replace('.md', '') also mangled names like 'a.md.bak.md'.
            file_key = tuple(part[:-3] if part.endswith('.md') else part for part in file_key)
            # TODO: fix filekey, return files list, with filename, markdown id, path key
            # use those to determine if file is present in sidebar
            print(' %s %s' % (file, file_key))
            l, a = get_file_data(file, file_key)
            links[file_key] = l
            anchors.update(a)
    return links, anchors
# Guard the entry point so importing this module doesn't kick off a scan.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment