Skip to content

Instantly share code, notes, and snippets.

@adifahmi
Last active May 18, 2018 12:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save adifahmi/0db3811720acea2f7ce4fdf4687a6c24 to your computer and use it in GitHub Desktop.
Save adifahmi/0db3811720acea2f7ce4fdf4687a6c24 to your computer and use it in GitHub Desktop.
Python HTML tag checker: it checks whether tags are properly closed. Inspired by https://github.com/ryanpcmcquen/unclosedTagFinder
import re
import argparse
import urllib.parse
import urllib.request
# All patterns are raw strings so that regex escapes are never interpreted
# (or warned about) as Python string escapes.

# Any "<...>" token except those starting with "<!" (comments, doctype).
# The runtime value is unchanged from the previous non-raw literal.
htmlRegex = r'<[^\!][^>]*>'

# Start of a tag whose name is NOT a void element.  Void elements never take
# a closing tag, so callers can use a match here to keep only tags that must
# be balanced.  The trailing \b stops a void name from matching as a mere
# prefix of a longer name (e.g. "col" inside "colgroup").
voidElementsRegex = (
    r'</?(?!(?:area|base|br|col|embed|hr|img|input|keygen|link|menuitem|meta'
    r'|param|source|track|wbr)\b)'
)

# "<" followed by anything other than "/" marks an opening tag.
openingTagRegex = r'<[^/]'
# "</" marks a closing tag.
closingTagRegex = r'</'

# Command-line interface: the HTML to check is passed as a single string.
parser = argparse.ArgumentParser(
    description='Check whether the tags in an HTML string are properly closed.',
)
parser.add_argument(
    '-i', '--input',
    required=True,  # without it there is nothing to check
    help='HTML string to check, e.g. "<div>Hello<b>World</b></div>"',
)
def get_tag_list(html):
    """Return every substring of ``html`` matched by ``htmlRegex``.

    The pattern is compiled with the ``re.I`` and ``re.M`` flags and the
    matches are returned as a list, in the order they occur in ``html``.
    """
    pattern = re.compile(htmlRegex, flags=re.I | re.M)
    return pattern.findall(html)
def get_opening_tag_list(tag_list):
    """Return the entries of ``tag_list`` that match ``openingTagRegex``.

    The relative order of the kept entries is preserved and ``tag_list``
    itself is not modified.
    """
    return [tag for tag in tag_list if re.match(openingTagRegex, tag)]
def get_closing_tag_list(tag_list):
    """Return the entries of ``tag_list`` that match ``closingTagRegex``.

    The relative order of the kept entries is preserved and ``tag_list``
    itself is not modified.
    """
    return [tag for tag in tag_list if re.match(closingTagRegex, tag)]
def clean_html(raw_html):
    """Return ``raw_html`` with every run of non-word characters removed.

    Only characters matched by the regex class ``\\w`` survive, so a tag such
    as ``</b>`` becomes ``b``.  Note that attribute text is kept as well:
    ``<a href="x">`` becomes ``ahrefx``.
    """
    non_word_run = re.compile(r'\W+')
    return non_word_run.sub('', raw_html)
def clean_list(the_list):
    """Clean every entry of ``the_list`` in place and return that same list.

    Each element is replaced by the result of ``clean_html`` applied to it,
    so the caller's list object is mutated.
    """
    # map() is lazy, so each element is read, cleaned and written back one
    # position at a time.
    for position, cleaned in enumerate(map(clean_html, the_list)):
        the_list[position] = cleaned
    return the_list
def is_match_count(opening_tag_list, closing_tag_list):
    """Return True when both lists contain the same number of tags.

    This is only a quick count comparison; it does not look at which tags
    the lists contain.
    """
    return len(opening_tag_list) == len(closing_tag_list)
def is_mismatch(opening_tag_list, closing_tag_list):
    """Check that each opening tag is closed by a tag of the same name.

    The opening names are compared with the closing names in reverse order,
    e.g. ``<div><b>`` must be closed by ``</b></div>``.

    NOTE: despite its name, this returns ``True`` when the tags MATCH and
    ``False`` when they are mismatched; existing callers test the result
    with ``is False``, so that contract is kept.

    Limitation: because the two lists carry no information about how opening
    and closing tags were interleaved, sibling elements such as
    ``<div><b></b><i></i></div>`` are reported as mismatched.

    Args:
        opening_tag_list: opening tags (e.g. ``'<div class="x">'``) or bare
            tag names, in document order.  Not modified.
        closing_tag_list: closing tags (e.g. ``'</div>'``) or bare tag
            names, in document order.  Not modified.

    Returns:
        bool: ``True`` if the names line up, ``False`` otherwise.
    """
    def tag_name(tag):
        # Keep only the element name so attributes do not take part in the
        # comparison; HTML tag names are case-insensitive, hence lower().
        found = re.match(r'</?\s*([^\s/>]+)', tag)
        if found:
            return found.group(1).lower()
        # Not tag-shaped (e.g. an already cleaned name): strip non-word
        # characters, as the previous implementation did.
        return re.sub(r'\W+', '', tag).lower()

    # Build new lists instead of rewriting the caller's lists in place.
    opening_names = [tag_name(tag) for tag in opening_tag_list]
    closing_names = [tag_name(tag) for tag in closing_tag_list]

    # Closing tags appear in the reverse order of their opening tags.
    return opening_names == closing_names[::-1]
def _tag_name(tag):
    """Return the lower-cased element name of a raw tag such as ``</Div >``."""
    found = re.match(r'</?\s*([^\s/>]+)', tag)
    return found.group(1).lower() if found else tag.lower()


def _is_properly_nested(tag_list):
    """Return True when every closing tag closes the most recently opened tag."""
    open_names = []
    for tag in tag_list:
        name = _tag_name(tag)
        if tag.startswith('</'):
            # A closer must match the innermost element that is still open.
            if not open_names or open_names.pop() != name:
                return False
        else:
            open_names.append(name)
    # Anything left on the stack was opened but never closed.
    return not open_names


def is_tag_completed():
    """Check the HTML given on the command line and print the verdict.

    The HTML string is read from the ``-i``/``--input`` argument of the
    module-level ``parser``.  Void elements (``<br>``, ``<img>``, ...) are
    ignored because they never take a closing tag.  The remaining tags must
    be equal in number (opening vs. closing) and properly nested.

    Prints ``TRUE`` on success; on failure prints either
    ``MISMATCHED TAGS COUNT`` or ``MISMATCHED TAGS`` followed by ``FALSE``.

    Returns:
        bool: ``True`` if all tags are properly closed, ``False`` otherwise.

    Exits with a usage error (via ``parser.error``) when no input was given.
    """
    args = parser.parse_args()
    html = args.input
    if html is None:
        # Without this guard re.findall() would raise a bare TypeError.
        parser.error('the -i/--input argument is required')

    # Keep only tags that need balancing: voidElementsRegex matches tags
    # whose name is NOT a void element.
    tag_list = [
        tag for tag in get_tag_list(html)
        if re.match(voidElementsRegex, tag, flags=re.I)
    ]
    opening_tag_list = get_opening_tag_list(tag_list)
    closing_tag_list = get_closing_tag_list(tag_list)

    if not is_match_count(opening_tag_list, closing_tag_list):
        print("MISMATCHED TAGS COUNT")
        print("FALSE")
        return False

    # Walk the tags in document order with a stack, so sibling elements
    # such as <div><b></b><i></i></div> are accepted.
    if not _is_properly_nested(tag_list):
        print("MISMATCHED TAGS")
        print("FALSE")
        return False

    print("TRUE")
    return True
# Run the check only when this file is executed as a script, not on import.
if __name__ == '__main__':
    is_tag_completed()
@adifahmi
Copy link
Author

adifahmi commented May 17, 2018

How to use (python 3):

python parser.py -i '<div>Hello<b>World</b></div>'

TRUE

python parser.py -i '<div>Hello<b>World</div></b>'

MISMATCHED TAGS
FALSE

python parser.py -i '<div>Hello<b>World</div>'

MISMATCHED TAGS COUNT
FALSE

@lokoroi
Copy link

lokoroi commented May 18, 2018

uwaaaaww

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment