Created
May 4, 2022 16:35
-
-
Save alanbernstein/2dcb49a275e1e87cc4ae6f1cdeafdaa3 to your computer and use it in GitHub Desktop.
markdown link validator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ipython | |
from glob import glob | |
import os | |
import re | |
# from urlparse import urlparse | |
from urllib.parse import urlparse | |
from ipdb import set_trace as debug | |
from pprint import pprint as pp | |
""" | |
Documentation site generators/frameworks that we have used do not have the best support | |
for link validation. That is, they may correctly identify broken links when the
URL does not exist, but they don't even try to check whether the URL *fragment* exists. | |
This is a major usability issue, because when doc sections get moved around between pages, | |
or the section headings get renamed, the anchor links / fragments get renamed as well, | |
resulting in broken internal links on the site, which is quite unprofessional. | |
It is often tricky to keep this in mind while in the middle of "refactoring" docs, so | |
instead of trying to remember, I wrote this script to scan the entire docs repo for | |
issues with URLs and a few other minor things. | |
This was originally written for Hugo, then modified to work for Docusaurus. The general | |
approach is probably applicable to Jekyll, but probably needs to be updated to account | |
for the directory structure. | |
""" | |
# TODO use an md parser
# TODO handle relative paths robustly
# TODO: check for external->internal links
# TODO: check if all files are discoverable via nav sidebar
# TODO make sure links are local-absolute and end in slash
#   end in slash: That would avoid a redirect (giving us referrers in analytics and lower latency) and I think the anchors get wiped during the redirect. This could be done with a clever sed line.
#   local-absolute (/docs/introduction instead of ../../docs/introduction): for consistency, and it would probably be easier to find and change later

# Root of the docs content tree. expanduser('~') instead of os.getenv('HOME')
# so this doesn't raise TypeError when $HOME is unset (e.g. some CI shells).
content_dir = os.path.expanduser('~') + '/src/molecula-docs/docs/'
scan_dirs = [
    content_dir
]
# File keys (tuples of path components) to skip when reporting problems.
ignore_files = [
    ('docs', 'style-guide'),  # contains informative examples of broken links
]
file_pattern = '*.md'
# finds '[X](Y)', where X excludes ']', Y excludes ')', and X, Y are captured groups.
# Raw string avoids invalid-escape SyntaxWarnings on modern Python.
markdown_link_regex = r'\[([^\]]*)\]\(([^)]*)\)'
def main():
    """Scan the docs tree for links and anchors, then report problems."""
    links, anchors = scan_files(scan_dirs, file_pattern)
    sidebars = load_sidebars()  # placeholder; see find_unreachable_pages below
    find_problems(links, anchors)
    # find_unreachable_pages(pages, sidebars)
def load_sidebars():
    """Placeholder: sidebar-config loading is not implemented yet."""
    return []
def find_links_to_duplicates(file_data):
    """TODO: flag links whose target anchor has duplicate definitions."""
    # check if the anchor that a link links to has duplicate lines...
    pass
def internal_missing_slash(link):
    """Return True when a link's path lacks a trailing slash.

    Trailing slashes avoid a redirect (preserving referrers in analytics and
    lowering latency), and anchors may get wiped during the redirect.

    Skips images, mailto links, and same-page fragments.
    """
    if link['is_image']:
        # don't care about images
        return False
    if link['url'].startswith('mailto'):
        # don't care about mail links
        return False
    parsed = urlparse(link['url'])
    if parsed.path == '' and parsed.fragment:
        # same-page fragment, not really a link
        return False
    elif not parsed.path.endswith('/'):
        return True
    # bugfix: previously fell off the end here, returning implicit None;
    # None is falsy so callers behaved the same, but explicit is better.
    return False
def internal_unmatched(link, anchors):
    """Return True when an internal anchor link has no matching known anchor.

    NOTE: not perfect, because the scan doesn't cover everything.
    TODO: check that the target page itself exists (how?)
    """
    if not link['is_anchor'] or link['netloc'] != '':
        return False
    return link['anchor_key'] not in anchors
def find_problems(links, anchors):
    """Print a report of suspicious links, grouped by file, then a summary."""
    print('checking links in:')
    for scan_dir in scan_dirs:
        print(' %s' % scan_dir)
    print('bad links to other sections are false positives.\n')

    def describe(error_type, link):
        # One report line per problem, in the shared format.
        return '%s (%d) : %s -- %s' % (error_type, link['line'], link['anchor_key'], link['raw'])

    error_count = 0
    problem_files_count = 0
    for file_key, file_links in links.items():
        if file_key in ignore_files:
            continue
        errors = []
        for link in file_links:
            if link['scheme'] in ('mailto', 'tel'):
                # ignore these
                continue
            is_external = link['url'].startswith('http')
            if not is_external and '.md' in link['raw']:
                # internal links should not carry the .md extension
                errors.append(describe('.md', link))
            if not is_external and not link['url'].startswith(('#', '/', 'localhost')):
                errors.append(describe('no leading slash', link))
            if internal_unmatched(link, anchors):
                errors.append(describe('bad link', link))
            # if internal_missing_slash(link):
            #     errors.append('missing slash: %s -- %s' % (link['anchor_key'], link['raw']))
        if errors:
            problem_files_count += 1
            print('/'.join(file_key))
            for err in errors:
                error_count += 1
                print(' %s' % err)
    print('found %d errors across %d files' % (error_count, problem_files_count))
def print_anchors(anchors, filt_func=None):
    """Print every anchor accepted by filt_func (all anchors by default)."""
    keep = filt_func or true_filter
    for anchor in anchors.values():
        if keep(anchor):
            print(' %d:%s' % (anchor['line'], anchor['raw']))
def print_links(links, filt_func=None):
    """Print links grouped by file, filtered by filt_func (keep all by default)."""
    keep = filt_func or true_filter
    for file_key, file_links in links.items():
        print(file_key)
        for link in file_links:
            if keep(link):
                print(' %d:%s' % (link['line'], link['raw']))
def ref_filter(link):
    """Keep only Hugo-style '{{< ref >}}' links."""
    return link['is_ref']
def anchor_filter(link):
    """Keep only links that carry a '#fragment'."""
    return link['is_anchor']
def true_filter(_item):
    """Default filter: accept everything."""
    return True
def get_file_data(fname, file_key):
    """Extract markdown links and anchors from a single file.

    fname: path to the markdown file to scan.
    file_key: tuple of path components identifying the file (no '.md' suffix).

    Returns (links, anchors):
      links   - list of dicts, one per markdown link found, with parsed fields.
      anchors - dict mapping anchor_key tuples (file_key + slug) to anchor info;
                repeated slugs on the same page get a 'duplicate_lines' list.
    """
    with open(fname) as f:
        lines = f.read().strip().split('\n')
    code_fence = False
    links = []
    anchors = {}
    # print(fname)
    for n, line in enumerate(lines, 1):
        # find anchors (any element with an `id` attribute in html
        if line.startswith('```'):
            # toggle on every fence marker; content inside fences is skipped
            code_fence = not code_fence
        if code_fence:
            continue
        slugs = []
        if line.startswith('#'):
            # section header automatically gets an `id`
            slugs = [slugify(line)]
        elif 'id="' in line:
            # other things can have `id`s manually added (like in the pilosa.com glossary)
            slugs = re.findall('id="([^"]*)"', line)
        for slug in slugs:
            anchor_key = tuple(list(file_key) + [slug])
            # debug()
            if anchor_key in anchors:
                # same slug seen before on this page: record the extra line
                # numbers rather than overwriting the first occurrence
                if 'duplicate_lines' in anchors[anchor_key]:
                    anchors[anchor_key]['duplicate_lines'].append(n)
                else:
                    anchors[anchor_key]['duplicate_lines'] = [n]
                continue
            anchors[anchor_key] = {
                'raw': line,
                'file': fname,
                'line': n,
                'slug': slug,
            }
            # print(anchor_key)
        # find links
        matches = re.findall(markdown_link_regex, line)
        for match in matches:
            url = match[1]
            if ' ' in url:
                # markdown allows '[text](url title)'; split off the title
                parts = url.split(' ')
                url = parts[0]
                title = ' '.join(parts[1:])
            else:
                title = ''
            parsed = urlparse(url)
            is_ref = '{{< ref ' in url  # leftover Hugo ref shortcode
            if is_ref:
                print('!!!!!!!!!!! ref link found: %s:%d' % (fname, n))
            is_relative = parsed.netloc == ''
            anchor = parsed.fragment
            is_anchor = '#' in url
            is_pilosa = 'pilosa.com' in url or is_relative
            is_image = '.gif' in url.lower() or '.jpg' in url.lower() or '.png' in url.lower() or '.svg' in url.lower()
            anchor_key = None
            # only internal non-image links get an anchor_key for matching
            if not is_image and not parsed.netloc:
                if not parsed.path:
                    # same-page fragment: target page is this file
                    page_key = file_key
                else:
                    # TODO un-hack
                    parsed_path = parsed.path.replace('docs/latest', 'docs')
                    # this "aliases" docs/latest to docs, so i dont have to deal with the multiple
                    # docs directories
                    path_parts = [p for p in parsed_path.split('/') if p not in ['..', '']]
                    if len(path_parts) == 1:
                        # bare page name: assume it lives in this file's top dir
                        path_parts = [file_key[0]] + path_parts
                    page_key = tuple(path_parts)
                anchor_key = tuple(list(page_key) + [anchor])
            # print(' %s %s' % (right_pad(url, 40), anchor_key))
            links.append({
                'raw': '[%s](%s)' % match,
                'file': fname,
                'line': n,
                'scheme': parsed.scheme,
                'netloc': parsed.netloc,
                'text': match[0],
                'title': title,
                'url': url,
                'is_ref': is_ref,
                'is_relative': is_relative,
                'is_anchor': is_anchor,
                'is_pilosa': is_pilosa,
                'is_image': is_image,
                'anchor': anchor,
                'anchor_key': anchor_key,
            })
    return links, anchors
def right_pad(s, n):
    """Pad s with spaces on the right to width n; longer strings pass through."""
    return s.ljust(n)
def slugify(text):
    """Convert a markdown heading line into its auto-generated anchor id.

    Lowercases, strips '#' markers, turns spaces/underscores into hyphens,
    and drops any character outside [0-9a-z-].
    """
    # if it's a link, remove the url and link syntax, leaving only the display text
    # (raw strings: non-raw '\[' etc. trigger invalid-escape SyntaxWarnings)
    m = re.search(r'\[(.*)\]\((.*)\)', text)
    if m:
        text = '# ' + m.group(1)
    text = text.lower().replace('#', '').strip()
    text = re.sub('[ _]', '-', text)
    text = re.sub(r'[^0-9a-z\-]', '', text)
    return text
def scan_files(scan_dirs, file_pattern):
    """Recursively scan directories for matching files; collect links/anchors.

    scan_dirs: list of directory paths to search (recursively).
    file_pattern: glob pattern for filenames, e.g. '*.md'.

    Returns two dicts:
      links = {  # grouped by file
          file_key: [  # file_key is a tuple of path elements, like ('explanations', 'architecture')
              link_data,
              ...
          ],
      }
      anchors = {  # all in one dict for easy search
          anchor_key: anchor_data,
          ...
      }
    """
    links = {}
    anchors = {}
    print('scanning files...')
    for scan_dir in scan_dirs:  # renamed from 'dir' (shadowed the builtin)
        files = glob(scan_dir + '/**/' + file_pattern, recursive=True)
        for file in files:
            # Build the file key from the path relative to the scan dir.
            file_key = tuple(file[len(scan_dir):].split('/'))
            # Strip the '.md' extension from the key, because links shouldn't
            # be using it. Only strip a true suffix: the previous
            # .replace('.md', '') also mangled names like 'a.md.bak.md'.
            file_key = tuple(part[:-3] if part.endswith('.md') else part for part in file_key)
            # TODO: fix filekey, return files list, with filename, markdown id, path key
            # use those to determine if file is present in sidebar
            print(' %s %s' % (file, file_key))
            l, a = get_file_data(file, file_key)
            links[file_key] = l
            anchors.update(a)
    return links, anchors
# Guard the entry point so importing this module doesn't kick off a scan.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment