Created
May 28, 2018 10:14
-
-
Save lukpueh/1aa22e8970dd986b6de15ca5225b5f19 to your computer and use it in GitHub Desktop.
Basic HTML tag opening-closing-pair validator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
<Program Name> | |
count_html_elems.py | |
<Author> | |
Lukas Puehringer <luk.puehringer@gmail.com> | |
<Started> | |
May, 2018 | |
<Purpose> | |
Basic HTML tag opening-closing-pair validator. | |
Usage: | |
``` | |
python count_html_elems.py <path/to/html/file> [<path/to/html/file> ...] | |
``` | |
""" | |
import re | |
import argparse | |
# Matches an opening or closing tag; group 1 captures the tag name.
REG_PATTERN = r"<\/?(\w+)[^>]*?>"

# Values stored under the "type_" key of a tag info dict.
OPENING = 1
CLOSING = 0

# Singleton tags never have a closing counterpart and are ignored.
# TODO: There are more
_SINGLETON_TAG_NAMES = frozenset([
    "area", "base", "br", "col", "command", "embed", "hr", "img", "input",
    "keygen", "link", "meta", "param", "source", "track", "wbr"])


def get_tag_list(path):
    """Return list of dicts with HTML tag info found in passed HTML file,
    excluding HTML singleton elements (e.g. img, input, ...).

    The file is searched line by line with REG_PATTERN, so a tag whose
    `<...>` spans more than one line is not found.

    Each returned dict has the keys:
      "lineno": 1-based number of the line the tag was found on
      "tag":    tag text exactly as it appears in the file
      "name":   tag name, lowercased (HTML tag names are case-insensitive,
                so `<DIV>` pairs with `</div>` and `<BR>` is a singleton)
      "type_":  CLOSING if the tag starts with "</", otherwise OPENING
    """
    # Compile once instead of going through the pattern for every line
    find_tags = re.compile(REG_PATTERN).finditer

    tags_list = []
    with open(path) as html_fp:
        # Start counting at 1 so that line numbers match what editors show
        for lineno, line in enumerate(html_fp, 1):
            for match in find_tags(line):
                tag = match.group(0)
                tag_name = match.group(1).lower()

                # Skip some singleton tags
                if tag_name in _SINGLETON_TAG_NAMES:
                    continue

                tags_list.append({
                    "lineno": lineno,
                    "tag": tag,
                    "name": tag_name,
                    "type_": CLOSING if tag.startswith("</") else OPENING,
                })

    return tags_list
def check_pairs_recursive(tags_list):
    """Check that the opening and closing tags in the passed list of HTML tag
    info dicts pair up correctly.

    Every closing tag must directly follow (ignoring pairs that were already
    matched) an opening tag with the same name. The check is done in a single
    pass with a stack rather than by recursion, so deeply nested or very long
    documents cannot exhaust the recursion limit. The function name is kept
    for backward compatibility.

    Arguments:
      tags_list: list of dicts with the keys "type_" (OPENING or CLOSING),
                 "name", "tag" and "lineno". The list is not modified.

    Raises:
      AssertionError: if a closing tag has no matching opening tag, or if
                      opening tags remain unclosed at the end.

    Returns:
      None
    """
    # Tags seen so far that still wait for their closing counterpart
    pending = []

    for tag_info in tags_list:
        if tag_info["type_"] != CLOSING:
            pending.append(tag_info)
            continue

        # A closing tag with nothing before it cannot have a partner. Report
        # it without referring to a (non-existent) previous tag.
        if not pending:
            raise AssertionError(
                "Missing opening tag for closing tag '{tag}' (lineno:{lineno})"
                " there is no preceding unmatched tag".format(
                    tag=tag_info["tag"], lineno=tag_info["lineno"]))

        # The closest unmatched preceding tag must be an opening tag with the
        # same name
        prev_info = pending[-1]
        if (prev_info["type_"] != OPENING or
                prev_info["name"] != tag_info["name"]):
            raise AssertionError(
                ("Missing opening tag for closing tag '{tag}' (lineno:{lineno})"
                 " previous tag is '{prev_tag}' (lineno:{prev_lineno})").format(
                    tag=tag_info["tag"], lineno=tag_info["lineno"],
                    prev_tag=prev_info["tag"],
                    prev_lineno=prev_info["lineno"]))

        # Valid pair found, drop the opening tag
        pending.pop()

    # Raised explicitly (instead of using `assert`) so that the check is not
    # stripped when running with `python -O`
    if pending:
        raise AssertionError("Remaining tags are " + str(pending))
def check_tags_in_html_files(paths):
    """Iterate over passed file paths and check each file for opening and closing
    HTML tag pairs, printing one result line per file.

    Arguments:
      paths: iterable of paths to HTML files.

    A file whose tags do not pair up is reported with the AssertionError
    message raised by the check; processing then continues with the next
    path.
    """
    for path in paths:
        tags_list = get_tag_list(path)

        try:
            check_pairs_recursive(tags_list)

        except AssertionError as e:
            # Parenthesised single-argument form prints the same text under
            # Python 2 and is valid syntax under Python 3
            print("Error in '{0}':\n\t {1}".format(path, e))

        else:
            print("Successfully checked '{0}' non-singleton HTML tags in '{1}'."
                  .format(len(tags_list), path))
def main():
    """Command line entry point.

    Reads one or more HTML file paths from the command line arguments and
    hands them to `check_tags_in_html_files`.
    """
    description = ("Check if every opening tag has a corresponding closing tag"
                   " in a given HTML file.")
    arg_parser = argparse.ArgumentParser(description=description)

    # At least one path is required, any number may follow
    arg_parser.add_argument(
        "path", nargs="+", metavar="<path/to/html/file> [...]")

    parsed = arg_parser.parse_args()
    check_tags_in_html_files(parsed.path)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment