Skip to content

Instantly share code, notes, and snippets.

@lukpueh
Created May 28, 2018 10:14
Show Gist options
  • Save lukpueh/1aa22e8970dd986b6de15ca5225b5f19 to your computer and use it in GitHub Desktop.
Save lukpueh/1aa22e8970dd986b6de15ca5225b5f19 to your computer and use it in GitHub Desktop.
Basic HTML tag opening-closing-pair validator
#!/usr/bin/env python
"""
<Program Name>
count_html_elems.py
<Author>
Lukas Puehringer <luk.puehringer@gmail.com>
<Started>
May, 2018
<Purpose>
Basic HTML tag opening-closing-pair validator.
Usage:
```
python count_html_elems.py <path/to/html/file> [<path/to/html/file> ...]
```
"""
import re
import argparse
# Regular expression matching one opening or closing HTML tag. Capture group 1
# holds the bare tag name, e.g. "div" for both "<div class='x'>" and "</div>".
REG_PATTERN = r"<\/?(\w+)[^>]*?>"

# Markers stored under the "type_" key of the tag info dicts built by
# get_tag_list and inspected by check_pairs_recursive.
CLOSING = 0
OPENING = 1
# HTML singleton (void) elements, i.e. elements that never have a closing tag.
# Kept at module level so the set is built once instead of once per match.
# TODO: There are more
_SINGLETON_TAG_NAMES = frozenset([
    "area", "base", "br", "col", "command", "embed", "hr", "img", "input",
    "keygen", "link", "meta", "param", "source", "track", "wbr",
])


def get_tag_list(path):
    """Return list of dicts with HTML tag info found in passed HTML file,
    excluding HTML singleton elements (e.g. img, input, ...).

    The whole file is scanned at once (not line by line), so a tag whose
    attributes are wrapped over several lines is still recognized. Tag names
    are lower-cased before they are compared or stored because HTML tag names
    are case-insensitive ("<DIV>" pairs with "</div>", "<BR>" is a singleton).

    Arguments:
        path: Path to the HTML file to scan.

    Returns:
        A list of dicts in document order, each with the keys
            "lineno": 1-based number of the line on which the tag starts,
            "tag":    the tag text exactly as found in the file,
            "name":   the lower-cased tag name,
            "type_":  OPENING or CLOSING.

    Raises:
        Whatever open() raises if the file cannot be opened or read.
    """
    with open(path) as html_fp:
        content = html_fp.read()

    tags_list = []
    lineno = 1
    scanned_to = 0

    for match in re.finditer(REG_PATTERN, content):
        # Keep a running line number by counting only the newlines between the
        # previous match and this one, so the whole scan stays linear.
        lineno += content.count("\n", scanned_to, match.start())
        scanned_to = match.start()

        tag = match.group(0)
        tag_name = match.group(1).lower()

        # Skip singleton tags, they have no closing counterpart to pair with
        if tag_name in _SINGLETON_TAG_NAMES:
            continue

        tags_list.append({
            "lineno": lineno,
            "tag": tag,
            "name": tag_name,
            "type_": CLOSING if tag.startswith("</") else OPENING,
        })

    return tags_list
def check_pairs_recursive(tags_list):
    """Check that the passed HTML tag info dicts form properly nested
    opening/closing tag pairs.

    The name is kept for backward compatibility; the check itself is now a
    single pass with a stack of still-unclosed opening tags. This finds the
    same first error as repeatedly removing the first adjacent valid pair, but
    cannot hit the interpreter's recursion limit on large files and does not
    copy the list for every pair.

    Arguments:
        tags_list: List of dicts with the keys "type_" (OPENING or CLOSING),
            "name", "tag" and "lineno", in document order. The list is not
            modified.

    Returns:
        None

    Raises:
        AssertionError: If a closing tag does not match the most recent
            unclosed opening tag, or if opening tags remain unclosed at the
            end. Raised explicitly (not via an assert statement) so the check
            still works when Python runs with -O.
    """
    open_tags = []

    for tag in tags_list:
        if tag["type_"] != CLOSING:
            open_tags.append(tag)
            continue

        # A closing tag with nothing left to close. Reported separately
        # because there is no previous tag to name in the message.
        if not open_tags:
            raise AssertionError(
                ("Missing opening tag for closing tag '{tag}' (lineno:{lineno})"
                 " no unclosed opening tag precedes it").format(
                     tag=tag["tag"], lineno=tag["lineno"]))

        # A closing tag must close the most recently opened, still unclosed tag
        prev_tag = open_tags.pop()
        if prev_tag["name"] != tag["name"]:
            raise AssertionError(
                ("Missing opening tag for closing tag '{tag}' (lineno:{lineno})"
                 " previous tag is '{prev_tag}' (lineno:{prev_lineno})").format(
                     tag=tag["tag"], lineno=tag["lineno"],
                     prev_tag=prev_tag["tag"],
                     prev_lineno=prev_tag["lineno"]))

    # Whatever is still on the stack was opened but never closed
    if open_tags:
        raise AssertionError("Remaining tags are " + str(open_tags))
def check_tags_in_html_files(paths):
    """Iterate over passed file paths and check each file for opening and closing
    HTML tag pairs, printing one result line per file.

    Arguments:
        paths: Iterable of paths to the HTML files to check.

    Returns:
        None

    Side Effects:
        Prints an error message for each file in which check_pairs_recursive
        raises an AssertionError, and a success message (including the number
        of non-singleton tags found) for every other file.
    """
    for path in paths:
        tags_list = get_tag_list(path)

        try:
            check_pairs_recursive(tags_list)

        except AssertionError as e:
            # Parenthesized single-argument form: valid on Python 3 and
            # prints the same text under the Python 2 print statement.
            print("Error in '{0}':\n\t {1}".format(path, e))

        else:
            print("Successfully checked '{0}' non-singleton HTML tags in '{1}'."
                  .format(len(tags_list), path))
def main():
    """Command line entry point: read one or more HTML file paths from the
    program arguments and hand them to check_tags_in_html_files.
    """
    description = ("Check if every opening tag has a corresponding closing tag"
                   " in a given HTML file.")
    arg_parser = argparse.ArgumentParser(description=description)

    # At least one path is required, any number of further paths may follow
    arg_parser.add_argument(
        "path", nargs="+", metavar="<path/to/html/file> [...]")

    html_paths = arg_parser.parse_args().path
    check_tags_in_html_files(html_paths)


# Only run the command line interface when executed as a script
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment