Last active
August 29, 2015 14:06
-
-
Save JamesJinPark/9def706729b796d9db11 to your computer and use it in GitHub Desktop.
tidyHTML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CIT 591
# Assignment 4 - HTML Tidy
# by James Park and Yue Chen
import Tkinter | |
import tkFileDialog | |
import os.path | |
import shutil | |
import random | |
import sys | |
import re | |
def create_test_file(file_name):
    """Create an empty ``unittest.html`` file next to this script.

    The ``file_name`` argument is accepted but not used: the target path is
    always the directory containing this module plus ``unittest.html``.
    An existing file at that path is truncated.

    Returns:
        None.
    """
    script_directory = os.path.dirname(os.path.realpath(__file__))
    target_path = script_directory + os.sep + "unittest.html"
    # Opening in 'w+' creates (or truncates) the file; nothing is written.
    handle = open(target_path, 'w+')
    handle.close()
    return
def delete_file(file_name):
    """Remove the file at ``file_name`` from disk.

    Raises:
        OSError: if the path does not exist or cannot be removed.
    """
    # os.unlink is the same operation as os.remove under its POSIX name.
    os.unlink(file_name)
def file_exists(file_name):
    """Report whether ``file_name`` refers to an existing path.

    Returns:
        True if the path exists, False otherwise.
    """
    path_is_present = os.path.exists(file_name)
    return path_is_present
def ask_user_for_file():
    """Prompt the user to pick the HTML file to fix via a file-open dialog.

    Returns:
        The path chosen in the dialog. The value is empty when the user
        cancels the dialog, so callers should check it before use.
    """
    root = Tkinter.Tk()
    root.withdraw()  # Hide the empty root window; only the dialog should show.
    # Parenthesised single-argument form prints the same text on Python 2 and 3.
    print('Please select the HTML file that you want to fix.')
    in_path = tkFileDialog.askopenfilename()
    root.destroy()  # Release the hidden root window once the dialog is done.
    return in_path
def create_backup(in_path):
    """Copy the input file to ``<in_path>.bak`` in the same directory.

    The backup is written beside the original file rather than into the
    current working directory, so it can be found regardless of where the
    script was started from.

    Args:
        in_path: path of the file to back up.

    Returns:
        The path of the backup file that was written.

    Raises:
        IOError/OSError: if the input cannot be read or the backup cannot
            be written.
    """
    backup_file = in_path + '.bak'
    shutil.copyfile(in_path, backup_file)
    return backup_file
def generate_random_file_name(in_path):
    """Build a random ``<number>.html`` path in the same directory as ``in_path``.

    Args:
        in_path: path whose directory part is reused for the new name.

    Returns:
        A path made of ``in_path``'s directory and a random positive integer
        followed by ``.html``. When ``in_path`` has no directory part the
        result is a bare file name (relative to the working directory).
    """
    # sys.maxsize exists on both Python 2.6+ and Python 3; sys.maxint does not.
    random_number = random.randint(1, sys.maxsize)
    # os.path.join avoids producing "/<number>.html" (filesystem root) when
    # the directory part is empty.
    return os.path.join(os.path.dirname(in_path), str(random_number) + '.html')
def read_entire_input_file(in_path):
    """Read the whole file at ``in_path`` and return its content as one string.

    Raises:
        IOError/OSError: if the file cannot be opened or read.
    """
    # The context manager closes the handle even if read() raises.
    with open(in_path, 'r') as input_file:
        return input_file.read()
def read_input_file_line_by_line(in_path):
    """Read the file at ``in_path`` and return its lines as a list.

    Each list element keeps its trailing newline, as ``readlines()`` returns it.

    Raises:
        IOError/OSError: if the file cannot be opened or read.
    """
    # The context manager closes the handle even if readlines() raises.
    with open(in_path, 'r') as input_file:
        return input_file.readlines()
def merge_list(list_of_lines):
    """Flatten one level of nesting: return the elements of each inner iterable.

    Args:
        list_of_lines: an iterable of iterables (e.g. a list of lists, or a
            list of strings, in which case the result is a list of characters).

    Returns:
        A new list with every inner element, in order.
    """
    return [element for inner in list_of_lines for element in inner]
def compare_variable_with_string(content, string):
    """Compare a value with a string (mainly for testing purposes).

    Returns:
        True if ``content == string``, False otherwise.
    """
    # Return the comparison result; evaluating it without returning always
    # yields None to the caller.
    return content == string
def create_output_file(output_shell, content):
    """Write ``content`` to the file at ``output_shell``, replacing any old content.

    Args:
        output_shell: path of the output file to create or truncate.
        content: string to write.

    Raises:
        IOError/OSError: if the file cannot be opened or written.
    """
    # The context manager closes the handle even if write() raises.
    with open(output_shell, 'w') as output_file:
        output_file.write(content)
# Matches a start tag (<name ...>) or an end tag (</name ...>). Group 1 is the
# tag name, including the leading '/' for end tags. Compiled once and written
# as a raw string so that '\s' is not treated as a string escape.
_TAG_PATTERN = re.compile(r'<([a-zA-Z][^>\s]*|/[a-zA-Z][^>\s]*)[^>]*>')


def find_next_tag(start_position, content):
    """Find the next start or end tag in ``content`` at or after ``start_position``.

    Args:
        start_position: index in ``content`` where the search begins.
        content: the text to search.

    Returns:
        A tuple ``(tag, tag_position, next_start_position)``:
        ``tag`` is the tag name as written (with a leading '/' for end tags),
        ``tag_position`` is the index of the tag's '<', and
        ``next_start_position`` is the index just past the tag's '>'.
        When no tag is found the tuple is ``(None, len(content), len(content))``.
    """
    match = _TAG_PATTERN.search(content, start_position)
    if match is None:
        return None, len(content), len(content)
    tag_start, tag_end = match.span(0)
    return match.group(1), tag_start, tag_end
def lower_case_tags(tag_position, tag, content):
    """Return ``content`` with the tag name at ``tag_position`` lower-cased.

    Args:
        tag_position: index of the tag's '<' in ``content``.
        tag: the tag name as it appears right after the '<'.
        content: the text containing the tag.

    Returns:
        A new string in which the ``len(tag)`` characters following the '<'
        are replaced by ``tag.lower()``.
    """
    name_start = tag_position + 1  # Skip over the '<'.
    name_end = name_start + len(tag)
    return ''.join((content[:name_start], tag.lower(), content[name_end:]))
def tag_type(tag):
    """Tell start tags from end tags.

    Returns:
        True when ``tag`` does not begin with '/', i.e. it is a start tag;
        False when it begins with '/', i.e. it is an end tag.
    """
    is_end_tag = tag.startswith('/')
    return not is_end_tag
def tag_match(start_tag, end_tag):
    """Check whether ``end_tag`` closes ``start_tag``.

    The first character of ``end_tag`` (its '/') is dropped and the rest is
    compared exactly with ``start_tag``.

    Returns:
        True if the names are equal, False otherwise.
    """
    closing_name = end_tag[1:]
    return start_tag == closing_name
def convert_to_end_tag(start_tag):
    """Build the lower-case closing tag text for a start tag name.

    Returns:
        The string ``</name>`` where ``name`` is ``start_tag`` lower-cased.
    """
    return '</{0}>'.format(start_tag.lower())
def insert_missing_tag(start_tag, start_position, end_position, content):
    """Insert the closing tag for ``start_tag`` into ``content``.

    Args:
        start_tag: name of the tag that needs closing.
        start_position: index in ``content`` where the closing tag is inserted.
        end_position: a second index to be shifted by the inserted length.
        content: the text being repaired.

    Returns:
        A tuple ``(start_position, end_position, content)`` where both
        positions have been advanced by the length of the inserted closing
        tag and ``content`` is the new text.
    """
    closing_tag = convert_to_end_tag(start_tag)
    shift = len(closing_tag)
    before, after = content[:start_position], content[start_position:]
    return start_position + shift, end_position + shift, before + closing_tag + after
def delete_extra_end_tag(tag_position, next_start_position, content):
    """Remove the text between ``tag_position`` and ``next_start_position``.

    Args:
        tag_position: index of the first character to drop.
        next_start_position: index of the first character to keep afterwards.
        content: the text being repaired.

    Returns:
        A tuple ``(tag_position, tag_position, content)``: both positions
        point at the spot where the removed text used to start.
    """
    kept_head = content[:tag_position]
    kept_tail = content[next_start_position:]
    return tag_position, tag_position, kept_head + kept_tail
def convert_to_start_tag(end_tag):
    """Return the tag name of an end tag by dropping its first character ('/')."""
    name_without_slash = end_tag[1:]
    return name_without_slash
def fix_missing_tags(content):
    """Lower-case every tag name and repair unbalanced tags in ``content``.

    The text is scanned tag by tag while a stack of open tag names is kept:

    * a start tag is pushed on the stack;
    * an end tag with no matching open tag on the stack is deleted;
    * an end tag that matches an open tag deeper in the stack causes closing
      tags to be inserted for every tag opened after it;
    * tags still open at the end of the text are closed at the very end.

    Args:
        content: the HTML text to repair.

    Returns:
        The repaired text.
    """
    next_start_position = 0
    tag_position = 0
    stack = []
    while True:
        tag, tag_position, next_start_position = find_next_tag(next_start_position, content)
        if tag is None:
            break
        content = lower_case_tags(tag_position, tag, content)
        # Compare names in lower case as well: the text now holds the
        # lower-cased name, and "<P>" must be closed by "</p>".
        tag = tag.lower()
        if tag_type(tag):
            stack.append(tag)
            continue
        if convert_to_start_tag(tag) not in stack:
            # Stray end tag: nothing is open under this name, so drop it.
            tag_position, next_start_position, content = delete_extra_end_tag(
                tag_position, next_start_position, content)
            continue
        # Close everything that was opened after the matching start tag.
        while stack:
            pop_tag = stack.pop()
            if tag_match(pop_tag, tag):
                break
            tag_position, next_start_position, content = insert_missing_tag(
                pop_tag, tag_position, next_start_position, content)
    # After the scan tag_position is len(content), so the remaining closing
    # tags are appended at the end, innermost first.
    while stack:
        tag_position, next_start_position, content = insert_missing_tag(
            stack.pop(), tag_position, next_start_position, content)
    return content
def check_if_line_has_both_start_and_end_tags_and_no_nested_start_tag(line):
    """Check whether ``line`` holds a start tag and its end tag with nothing left open.

    Tags are scanned left to right and start tags are kept on a stack. When
    an end tag is met whose name is on the stack, the stack is popped; the
    line qualifies if the matching start tag is found and the stack is empty
    at that moment.

    Returns:
        True if such a pair is found, False otherwise.
    """
    next_start_position = 0
    stack = []
    while True:
        tag, _, next_start_position = find_next_tag(next_start_position, line)
        if tag is None:
            break
        if tag_type(tag):
            stack.append(tag)
            continue
        if convert_to_start_tag(tag) not in stack:
            continue
        while stack:
            pop_tag = stack.pop()
            if tag_match(pop_tag, tag) and not stack:
                return True
    # Explicit False so callers can compare the result with False.
    return False
def fix_indentation(content):
    """Placeholder for indentation fixing; it currently changes nothing.

    Each line of ``content`` is checked once for a start tag with its end tag
    on the same line, but no action is taken on the result yet.

    Args:
        content: a sequence of lines.

    Returns:
        None.
    """
    for line in content:
        if not check_if_line_has_both_start_and_end_tags_and_no_nested_start_tag(line):
            # TODO: not implemented. Notes from the original design:
            # These are the lines that do not have both a start tag and an
            # end tag on the same line.
            # - If there is a newline between a start tag and its matching
            #   end tag, indent by 2 spaces.
            # - If a start tag does not have a newline in front of it, add one.
            # - When the end tag is found, add a newline.
            pass
def insert_lines_for_special_tags(content):
    """Put selected tags at the start of their own line.

    A newline is inserted before every ``<pre>`` and before the first
    occurrence of each of ``<head>``, ``<body>`` and ``<h>``.

    Args:
        content: the HTML text.

    Returns:
        The text with the newlines inserted. Tags that do not occur are
        left alone.
    """
    search_strings = ('<head>', '<body>', '<h>')
    content = content.replace("<pre>", "\n<pre>")
    for word in search_strings:
        special_tag_location = content.find(word)
        if special_tag_location == -1:
            # Tag absent. Slicing with -1 would insert the newline before
            # the last character of the text instead.
            continue
        content = content[:special_tag_location] + '\n' + content[special_tag_location:]
    return content
def find_pre_start_tags(line):
    """Report whether ``line`` contains the text ``<pre>``.

    Returns:
        True if ``<pre>`` occurs anywhere in ``line``, False otherwise.
    """
    return '<pre>' in line
def find_pre_end_tags(line):
    """Report whether ``line`` contains the text ``</pre>``.

    Returns:
        True if ``</pre>`` occurs anywhere in ``line``, False otherwise.
    """
    return '</pre>' in line
def insert_space_in_long_line(line):
    """Break a line longer than 80 characters into shorter lines at spaces.

    While the remaining text is longer than 80 characters, the last space
    within its first 79 characters is replaced by a newline. If no space is
    available there, the rest of the text is left as it is.

    Args:
        line: a single line of text (no embedded newlines expected).

    Returns:
        The text with newlines substituted for the chosen spaces.
    """
    pieces = []
    start = 0
    # Iterative rather than recursive so that very long lines cannot hit
    # the interpreter's recursion limit.
    while len(line) - start > 80:
        space_index = line.rfind(" ", start, start + 79)
        if space_index == -1:
            break
        pieces.append(line[start:space_index])
        start = space_index + 1
    pieces.append(line[start:])
    return '\n'.join(pieces)
def fix_length(content):
    """Wrap lines longer than 80 characters, leaving ``<pre>`` blocks untouched.

    Lines that contain ``<pre>`` or ``</pre>``, and every line between an
    opening ``<pre>`` and its ``</pre>``, are copied unchanged. All other
    lines are passed through ``insert_space_in_long_line``.

    Args:
        content: the whole text as one string.

    Returns:
        A list with one entry per input line. A wrapped entry may itself
        contain newline characters.
    """
    new_content = []
    inside_pre = False
    for current_line in content.split('\n'):
        has_pre_tag = find_pre_start_tags(current_line) or find_pre_end_tags(current_line)
        if inside_pre or has_pre_tag:
            new_content.append(current_line)
        else:
            new_content.append(insert_space_in_long_line(current_line))
        if has_pre_tag:
            # Whichever of <pre> / </pre> comes last on the line decides
            # whether the following lines are still preformatted.
            inside_pre = current_line.rfind('<pre>') > current_line.rfind('</pre>')
    return new_content
def rename_file(old_file_name, new_file_name):
    """Rename ``old_file_name`` to ``new_file_name``.

    The source path is used exactly as given, so the rename does not depend
    on the current working directory being the file's directory.

    Args:
        old_file_name: path of the existing file.
        new_file_name: path the file should have afterwards.

    Returns:
        None (the result of ``os.rename``).

    Raises:
        OSError: if the source does not exist or the rename fails.
    """
    return os.rename(old_file_name, new_file_name)
def is_tag_start_tag_or_end_tag(s):
    """Tell whether the tag text ``s`` is a start tag.

    ``s`` is the full tag text including its '<' and '>'. The '<' is dropped
    and surrounding whitespace is stripped, so spaces between '<' and the
    '/' or tag name are tolerated.

    Returns:
        True if the first remaining character is not '/', False if it is.
    """
    after_bracket = s[1:].strip()
    first_character = after_bracket[0]
    return first_character != '/'
def read_all_tags(s):
    """Collect every ``<...>`` occurrence in ``s``.

    Returns:
        A list of regular-expression match objects, one per tag, in the
        order the tags appear in ``s``.
    """
    return list(re.finditer('<.+?>', s))
def process_indentation(text):
    """Re-indent ``text`` by two spaces per open tag.

    The input is expected to have lower-case, well matched and properly
    nested tags. Processing goes line by line:

    1. The line is stripped of leading and trailing whitespace.
    2. The tags on the line are collected. Declarations, comments and
       processing instructions (``<!...>``, ``<?...>``) are ignored because
       they never have a closing tag and must not change the depth.
    3. The line is written at the current depth, except that a line which
       begins with an end tag is written one level shallower.
    4. The depth is increased for every start tag on the line and decreased
       for every end tag.

    Args:
        text: the whole document as one string.

    Returns:
        The re-indented document; every line ends with a newline.
    """
    depth = 0
    output_lines = []
    for line in text.split('\n'):
        line = line.strip()
        tags_list = [tag for tag in read_all_tags(line)
                     if tag.group()[1:2] not in ('!', '?')]
        indent_level = depth
        if tags_list:
            first_tag = tags_list[0]
            starts_with_end_tag = (
                first_tag.start() == 0
                and not is_tag_start_tag_or_end_tag(first_tag.group()))
            if starts_with_end_tag:
                indent_level = depth - 1
        # A negative level multiplies to an empty string, i.e. no indent.
        output_lines.append(' ' * indent_level * 2 + line + '\n')
        for tag in tags_list:
            if is_tag_start_tag_or_end_tag(tag.group()):
                depth += 1
            else:
                depth -= 1
    return ''.join(output_lines)
def delete_blank_line(line):
    """Remove all blank lines from the given text.

    A line counts as blank when it is empty or holds only whitespace, so
    blank lines that were indented are removed as well.

    Args:
        line: the whole text as one string (may contain many lines).

    Returns:
        The remaining lines, each followed by a newline.
    """
    kept_lines = [text_line for text_line in line.split('\n') if text_line.strip()]
    return ''.join(text_line + '\n' for text_line in kept_lines)
def main():
    """Run the tidy pipeline on a file chosen by the user.

    Steps: ask for the file, back it up, fix special-tag line breaks, repair
    and lower-case tags, re-indent, wrap long lines, drop blank lines, write
    the result to a temporary file and finally put it in place of the
    original file.
    """
    in_path = ask_user_for_file()
    if not in_path:
        # The file dialog yields an empty value when it is cancelled.
        print('No file selected; nothing to do.')
        return
    create_backup(in_path)
    output_shell = generate_random_file_name(in_path)
    content_to_be_fixed = read_entire_input_file(in_path)
    new_content_to_be_fixed = insert_lines_for_special_tags(content_to_be_fixed)
    fixed_tags_content = fix_missing_tags(new_content_to_be_fixed)
    indented_content = process_indentation(fixed_tags_content)
    jumbled_list = fix_length(indented_content)
    not_final_content = '\n'.join(jumbled_list)
    final_content = delete_blank_line(not_final_content)
    create_output_file(output_shell, final_content)
    # Use the full paths for the swap so it works regardless of the current
    # working directory. The original is removed first because os.rename
    # refuses to overwrite an existing file on some platforms.
    delete_file(in_path)
    os.rename(output_shell, in_path)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment