Skip to content

Instantly share code, notes, and snippets.

@JamesJinPark
Last active August 29, 2015 14:06
Show Gist options
  • Save JamesJinPark/9def706729b796d9db11 to your computer and use it in GitHub Desktop.
Save JamesJinPark/9def706729b796d9db11 to your computer and use it in GitHub Desktop.
tidyHTML
#CIT 591
#Assignment 4 - HTML Tidy
#by James Park and Yue Chen
import Tkinter
import tkFileDialog
import os.path
import shutil
import random
import sys
import re
def create_test_file(file_name):
    """Create an empty ``unittest.html`` in the directory of this script.

    The ``file_name`` argument is ignored: the target path is always derived
    from the location of this module. An existing file at that path is
    truncated. Returns None.
    """
    script_directory = os.path.dirname(os.path.realpath(__file__))
    target_path = script_directory + os.sep + "unittest.html"
    # Opening in 'w+' mode creates (or empties) the file; nothing is written.
    with open(target_path, 'w+'):
        pass
def delete_file(file_name):
    """Remove the file at ``file_name`` from disk.

    Any error raised by the underlying OS call (for example when the path
    does not exist) propagates to the caller.
    """
    # os.unlink is the same operation as os.remove under its POSIX name.
    os.unlink(file_name)
def file_exists(file_name):
    """Tell whether ``file_name`` refers to an existing path.

    Returns True for any existing path (file or directory), else False.
    """
    found = os.path.exists(file_name)
    return found
def ask_user_for_file():
    """Prompt the user to pick the HTML file that should be tidied.

    Prints a short instruction and then shows a file-open dialog.

    Returns:
        The value returned by ``tkFileDialog.askopenfilename()``.
    """
    # A Tk root has to exist for the dialog; hide it so only the dialog shows.
    Tkinter.Tk().withdraw()
    # Single-argument parenthesised form prints the same text on Python 2 and 3.
    print('Please select the HTML file that you want to fix.')
    in_path = tkFileDialog.askopenfilename()
    return in_path
def create_backup(in_path):
    """Copy the input file to a ``.bak`` file next to it.

    The backup path is ``in_path + '.bak'``, so the copy is placed in the
    same directory as the original regardless of the current working
    directory. An existing backup with that name is overwritten.

    Args:
        in_path: path of the file to back up.

    Returns:
        The path of the backup file that was written.
    """
    backup_file = in_path + '.bak'
    shutil.copyfile(in_path, backup_file)
    return backup_file
def generate_random_file_name(in_path):
    """Build a randomly named ``.html`` path in the directory of ``in_path``.

    The name is a random positive integer followed by ``.html``. The
    function only builds the path; it neither creates the file nor checks
    whether a file with that name already exists.

    Args:
        in_path: path whose directory part should hold the new file.

    Returns:
        The generated path. When ``in_path`` has no directory part the
        result is a bare file name (relative to the working directory).
    """
    directory = os.path.dirname(in_path)
    # sys.maxint exists only on Python 2; fall back to sys.maxsize elsewhere.
    upper_bound = getattr(sys, 'maxint', sys.maxsize)
    random_number = random.randint(1, upper_bound)
    # os.path.join avoids producing a root-level path ("/123.html") when the
    # directory part is empty.
    return os.path.join(directory, str(random_number) + '.html')
def read_entire_input_file(in_path):
    """Read the whole file at ``in_path`` and return it as one string.

    Args:
        in_path: path of the file to read (opened in text mode).

    Returns:
        The complete file content.
    """
    # The with-block closes the handle even if reading raises.
    with open(in_path, 'r') as input_file:
        return input_file.read()
def read_input_file_line_by_line(in_path):
    """Read the file at ``in_path`` and return its lines as a list.

    Args:
        in_path: path of the file to read (opened in text mode).

    Returns:
        A list with one string per line; line endings are kept.
    """
    # The with-block closes the handle even if reading raises.
    with open(in_path, 'r') as input_file:
        return input_file.readlines()
def merge_list(list_of_lines):
    """Flatten one level of nesting.

    Every element of ``list_of_lines`` is iterated and its items are
    collected, in order, into a single list. Passing a list of strings
    therefore yields a list of individual characters.
    """
    return [element for group in list_of_lines for element in group]
def compare_variable_with_string(content, string):
    """Compare a value with a string (mainly for testing purposes).

    Args:
        content: the value to check.
        string: the string it is expected to equal.

    Returns:
        True when ``content == string``, otherwise False.
    """
    # The comparison result has to be returned; evaluating it alone yields None.
    return content == string
def create_output_file(output_shell, content):
    """Write ``content`` to the file at ``output_shell``.

    The file is created if needed and truncated if it already exists.

    Args:
        output_shell: path of the file to write.
        content: the text to store in it.
    """
    # The with-block closes the handle even if writing raises.
    with open(output_shell, 'w+') as output_file:
        output_file.write(content)
# Matches a start tag (<name ...>) or an end tag (</name ...>). Group 1 is the
# tag name up to the first whitespace or '>', including the leading '/' of an
# end tag. Compiled once; the raw string keeps '\s' a regex escape.
_TAG_PATTERN = re.compile(r'<([a-zA-Z][^>\s]*|/[a-zA-Z][^>\s]*)[^>]*>')


def find_next_tag(start_position, content):
    """Find the first tag in ``content`` at or after ``start_position``.

    Args:
        start_position: index at which the search starts.
        content: the text to search.

    Returns:
        A tuple ``(tag, tag_position, next_start_position)``: the tag name
        (with a leading '/' for end tags), the index of the tag's '<', and
        the index just past its '>'. When no tag is found the tuple is
        ``(None, len(content), len(content))``.
    """
    match = _TAG_PATTERN.search(content, start_position)
    if match is None:
        return None, len(content), len(content)
    tag_start, tag_end = match.span(0)
    return match.group(1), tag_start, tag_end
def lower_case_tags(tag_position, tag, content):
    """Return ``content`` with the name of one tag lower-cased.

    ``tag_position`` is the index of the tag's '<' and ``tag`` is the name
    found right after it (including the '/' of an end tag). Only those
    ``len(tag)`` characters are replaced.
    """
    name_start = tag_position + 1  # step over the '<'
    name_end = name_start + len(tag)
    return ''.join((content[:name_start], tag.lower(), content[name_end:]))
def tag_type(tag):
    """Classify a tag name: True for a start tag, False for an end tag.

    A name counts as an end tag exactly when it begins with '/'.
    """
    return not tag.startswith('/')
def tag_match(start_tag, end_tag):
    """Tell whether ``end_tag`` closes ``start_tag``.

    The first character of ``end_tag`` (its '/') is dropped and the rest is
    compared, case-sensitively, with ``start_tag``.
    """
    closed_name = end_tag[1:]
    return start_tag == closed_name
def convert_to_end_tag(start_tag):
    """Build the closing tag text for a start tag name.

    The name is lower-cased and wrapped as ``</name>``.
    """
    return ''.join(('</', start_tag.lower(), '>'))
def insert_missing_tag(start_tag, start_position, end_position, content):
    """Insert the closing tag for ``start_tag`` at ``start_position``.

    Returns a tuple ``(start_position, end_position, content)`` in which both
    positions have been advanced by the length of the inserted text, so they
    keep pointing at the same characters as before the insertion.
    """
    # Same text that convert_to_end_tag() produces: lower-cased "</name>".
    closing_text = '</' + start_tag.lower() + '>'
    shift = len(closing_text)
    updated = ''.join((content[:start_position], closing_text, content[start_position:]))
    return start_position + shift, end_position + shift, updated
def delete_extra_end_tag(tag_position, next_start_position, content):
    """Cut the span ``[tag_position, next_start_position)`` out of ``content``.

    Returns ``(tag_position, tag_position, content)``: after the removal the
    next search has to resume where the deleted tag used to start.
    """
    trimmed = ''.join((content[:tag_position], content[next_start_position:]))
    return tag_position, tag_position, trimmed
def convert_to_start_tag(end_tag):
    """Return the start tag name for an end tag name.

    This simply drops the first character (the '/') of ``end_tag``.
    """
    name_without_slash = end_tag[1:]
    return name_without_slash
def fix_missing_tags(content):
    """Lower-case all tag names and repair unbalanced tags.

    The text is scanned tag by tag while a stack of open start tags is kept:

    * every start tag is pushed (no tag is treated as self-closing);
    * an end tag with no matching open start tag is deleted;
    * an end tag that matches an open tag further down the stack causes
      closing tags to be inserted, right before it, for every tag opened
      after the matching one;
    * tags still open at the end of the text are closed there.

    Args:
        content: the HTML text to repair.

    Returns:
        The repaired text.
    """
    next_start_position = 0
    tag_position = 0
    stack = []  # names of the start tags that are still open, innermost last
    while True:
        tag, tag_position, next_start_position = find_next_tag(next_start_position, content)
        if tag is None:
            break
        content = lower_case_tags(tag_position, tag, content)
        # Work with the lower-cased name from here on, so that tags written in
        # different case (<B> ... </b>) are recognised as a matching pair.
        tag = tag.lower()
        if tag_type(tag):
            stack.append(tag)
            continue
        if convert_to_start_tag(tag) not in stack:
            # Stray end tag: nothing open that it could close.
            tag_position, next_start_position, content = delete_extra_end_tag(
                tag_position, next_start_position, content)
            continue
        while stack:
            pop_tag = stack.pop()
            if tag_match(pop_tag, tag):
                break
            # pop_tag was opened inside the element being closed but never
            # closed itself; close it just before the current end tag.
            tag_position, next_start_position, content = insert_missing_tag(
                pop_tag, tag_position, next_start_position, content)
    # After the scan tag_position is len(content), so anything still open is
    # closed at the very end, innermost first.
    while stack:
        tag_position, next_start_position, content = insert_missing_tag(
            stack.pop(), tag_position, next_start_position, content)
    return content
def check_if_line_has_both_start_and_end_tags_and_no_nested_start_tag(line):
    """Check whether a line holds a start tag closed by its own end tag.

    The tags of ``line`` are scanned with a stack of open start tags. The
    result is True as soon as an end tag matches a popped start tag and that
    pop leaves the stack empty.

    Args:
        line: one line of HTML text.

    Returns:
        True if such a pair is found, otherwise False.
    """
    next_start_position = 0
    stack = []
    while True:
        tag, tag_position, next_start_position = find_next_tag(next_start_position, line)
        if tag is None:
            # Ran out of tags without finding a qualifying pair. Return an
            # explicit False so callers comparing with False behave as meant.
            return False
        if tag_type(tag):
            stack.append(tag)
            continue
        if convert_to_start_tag(tag) not in stack:
            continue
        # Unwind the stack; only a match that empties it counts.
        while stack:
            pop_tag = stack.pop()
            if tag_match(pop_tag, tag) and not stack:
                return True
def fix_indentation(content):
    """Unfinished placeholder for an indentation pass.

    Walks every item of every entry in ``content`` and runs the line check
    on the entry, but takes no action on the result. ``content`` is left
    untouched and the function returns None.
    """
    for line in content:
        for _ in line:
            if check_if_line_has_both_start_and_end_tags_and_no_nested_start_tag(line) == False:
                # Notes left for the intended implementation:
                # - this branch selects lines that do not have both a start
                #   tag and an end tag; if a line has both, there is a nested
                #   start tag;
                # - if there is a newline between a start tag and its matching
                #   end tag, indent by 2 spaces;
                # - if a start tag has no newline in front of it, add one;
                # - when the end tag is found, add a newline.
                pass
def insert_lines_for_special_tags(content):
    """Put selected tags on a line of their own.

    A newline is inserted before every ``<pre>`` start tag and before the
    first occurrence of each of ``<head>``, ``<body>`` and ``<h>``.

    Args:
        content: the HTML text.

    Returns:
        The text with the extra newlines inserted.
    """
    search_strings = ('<head>', '<body>', '<h>')
    content = content.replace("<pre>", "\n<pre>")
    for word in search_strings:
        special_tag_location = content.find(word)
        if special_tag_location == -1:
            # Tag not present. Using -1 as a slice index would otherwise
            # insert a newline just before the last character of the text.
            continue
        content = content[:special_tag_location] + '\n' + content[special_tag_location:]
    return content
def find_pre_start_tags(line):
    """Tell whether ``line`` contains the exact text ``<pre>``.

    Returns True if it does, otherwise False.
    """
    return '<pre>' in line
def find_pre_end_tags(line):
    """Tell whether ``line`` contains the exact text ``</pre>``.

    Returns True if it does, otherwise False.
    """
    return '</pre>' in line
def insert_space_in_long_line(line):
    """Break a long line into pieces by turning spaces into newlines.

    While the remaining text is longer than 80 characters, the last space
    within its first 79 characters is replaced by a newline. If no space is
    found in that window, the rest of the text is left as it is.

    Args:
        line: the text to wrap.

    Returns:
        The wrapped text; a line of 80 characters or fewer is returned
        unchanged.
    """
    pieces = []
    start = 0
    # Iterating over offsets (instead of recursing on slices) keeps very long
    # lines from exceeding the recursion limit.
    while len(line) - start > 80:
        space_index = line.rfind(" ", start, start + 79)
        if space_index == -1:
            break
        pieces.append(line[start:space_index])
        start = space_index + 1  # the space itself is replaced by the newline
    pieces.append(line[start:])
    return '\n'.join(pieces)
def fix_length(content):
    """Wrap lines longer than 80 characters, leaving ``<pre>`` blocks alone.

    The text is split on newlines. Lines that contain ``<pre>`` or
    ``</pre>``, and all lines between such a start and end tag, are kept
    unchanged; every other line goes through ``insert_space_in_long_line``.

    Args:
        content: the text to process, as a single string.

    Returns:
        A list of strings, one per input line (a wrapped line is a single
        entry that contains embedded newlines).
    """
    new_content = []
    inside_pre = False
    for current_line in content.split('\n'):
        has_pre_start = find_pre_start_tags(current_line)
        has_pre_end = find_pre_end_tags(current_line)
        if inside_pre or has_pre_start or has_pre_end:
            new_content.append(current_line)
        else:
            new_content.append(insert_space_in_long_line(current_line))
        if has_pre_start or has_pre_end:
            # Still inside a block only if the last <pre> on this line comes
            # after the last </pre> (rfind gives -1 for a missing tag).
            inside_pre = current_line.rfind('<pre>') > current_line.rfind('</pre>')
    return new_content
def rename_file(old_file_name, new_file_name):
    """Rename ``old_file_name`` to ``new_file_name``.

    Both paths are used exactly as given, so the file is found no matter
    what the current working directory is.

    Args:
        old_file_name: path of the existing file.
        new_file_name: path the file should get.

    Returns:
        The result of ``os.rename`` (None).
    """
    return os.rename(old_file_name, new_file_name)
def is_tag_start_tag_or_end_tag(s):
    """Return True for a start tag and False for an end tag.

    ``s`` is the complete tag text including '<' and '>'. The first
    character after '<' that is not whitespace decides: a '/' means end tag.
    Whitespace is skipped to allow for spaces between '<' and the '/' or the
    tag name.
    """
    first_character = s[1:].lstrip()[0]
    return first_character != '/'
def read_all_tags(s):
    """Collect every ``<...>`` span found in ``s``.

    Returns a list of regular-expression match objects, in order of
    appearance; each match covers the shortest text from a '<' to the next
    '>' with at least one character in between.
    """
    return list(re.finditer('<.+?>', s))
def process_indentation(text):
    """Re-indent every line by two spaces per level of tag nesting.

    The input is expected to have lower-case, properly matched and nested
    tags, each already on the appropriate line.

    For every line:
    1. surrounding whitespace is stripped;
    2. the tags on the line are collected;
    3. the line is written at the current nesting level, or one level less
       when the line begins with an end tag;
    4. the level is then raised by one per start tag and lowered by one per
       end tag found on the line.

    Args:
        text: the whole document as one string.

    Returns:
        The re-indented document; every line, including the last one, ends
        with a newline.
    """
    depth = 0
    indented_lines = []
    for line in text.split('\n'):
        line = line.strip()
        tags_list = read_all_tags(line)
        level = depth
        if tags_list:
            first_tag = tags_list[0]
            # A line that begins with an end tag closes the current level, so
            # it is placed one level further out.
            if first_tag.start() == 0 and not is_tag_start_tag_or_end_tag(first_tag.group()):
                level = depth - 1
        # A negative level multiplies to an empty indent.
        indented_lines.append(' ' * level * 2 + line + '\n')
        for tag in tags_list:
            if is_tag_start_tag_or_end_tag(tag.group()):
                depth += 1
            else:
                depth -= 1
    return ''.join(indented_lines)
def delete_blank_line(line):
    """Remove all blank lines from a block of text.

    Lines that are empty or consist only of whitespace are dropped.

    Args:
        line: the text to clean, possibly spanning many lines.

    Returns:
        The remaining lines, each terminated by a newline.
    """
    kept = [row + '\n' for row in line.split('\n') if row.strip()]
    return ''.join(kept)
def main():
    """Tidy an HTML file chosen by the user, replacing it in place.

    Steps: ask for the file, write a backup copy, repair and lower-case the
    tags, re-indent, wrap long lines, drop blank lines, write the result to
    a temporary randomly named file and finally put that file in place of
    the original.
    """
    in_path = ask_user_for_file()
    if not in_path:
        # The file dialog yields an empty value when it is cancelled.
        print('No file selected.')
        return
    create_backup(in_path)
    output_shell = generate_random_file_name(in_path)
    content = read_input_file_line_by_line(in_path)
    content_to_be_fixed = ''.join(merge_list(content))
    new_content_to_be_fixed = insert_lines_for_special_tags(content_to_be_fixed)
    fixed_tags_content = fix_missing_tags(new_content_to_be_fixed)
    indented_content = process_indentation(fixed_tags_content)
    jumbled_list = fix_length(indented_content)
    not_final_content = '\n'.join(jumbled_list)
    final_content = delete_blank_line(not_final_content)
    create_output_file(output_shell, final_content)
    # Replace the original using full paths, so the outcome does not depend
    # on the current working directory. The original is removed first so the
    # rename never has an existing file as its destination.
    delete_file(in_path)
    os.rename(output_shell, in_path)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment