JamesJinPark/Assignment 4(1).py

## Assignment 4(1).py
#CIT 591
#Assignment 4 - HTML Tidy
#by James Park and Yue Chen

import Tkinter
import tkFileDialog
import os.path
import shutil
import random
import sys
import re

def create_test_file(file_name):
    """Create (or empty) ``unittest.html`` in the directory of this script.

    The ``file_name`` argument is accepted but not used: the target path is
    always rebuilt from the location of this source file.

    Returns None.
    """
    script_directory = os.path.dirname(os.path.realpath(__file__))
    target_path = script_directory + os.sep + "unittest.html"
    # Opening in 'w+' mode is all that is needed to create/truncate the file.
    with open(target_path, 'w+'):
        pass

def delete_file(file_name):
    """Remove the file at ``file_name`` from disk.

    Raises OSError if the path cannot be removed (for example when it does
    not exist). Returns None.
    """
    # os.unlink is the same operation as os.remove under its POSIX name.
    os.unlink(file_name)

def file_exists(file_name):
    """Return True when ``file_name`` refers to an existing path, else False."""
    path_is_present = os.path.exists(file_name)
    return path_is_present

def ask_user_for_file():
    """Ask the user to pick the HTML file to tidy via a file-open dialog.

    Returns whatever ``tkFileDialog.askopenfilename()`` returns, i.e. the
    selected path. NOTE(review): presumably an empty value when the dialog
    is cancelled - confirm and make sure callers handle it.
    """
    # Tk always creates a root window; hide it so only the dialog is shown.
    Tkinter.Tk().withdraw()
    # A parenthesised single-argument print behaves the same on Python 2 and 3.
    print('Please select the HTML file that you want to fix.')
    return tkFileDialog.askopenfilename()

def create_backup(in_path):
    """Copy the input file to ``<in_path>.bak`` and return the backup path.

    The backup is placed next to the original file. Building the name from
    the bare file name would drop the copy into the current working
    directory, which only coincides with the file's directory by accident.

    Raises IOError/OSError (from ``shutil.copyfile``) if the copy fails.
    """
    backup_file = in_path + '.bak'
    shutil.copyfile(in_path, backup_file)
    return backup_file

def generate_random_file_name(in_path):
    """Return a random ``<number>.html`` path in the same directory as ``in_path``.

    The file is not created and the name is not checked for collisions.
    """
    directory = os.path.dirname(in_path)
    # sys.maxsize exists on both Python 2 and 3 (sys.maxint is Python 2 only).
    random_number = random.randint(1, sys.maxsize)
    # os.path.join copes with an empty directory part; gluing os.sep on by
    # hand would turn 'page.html' into a path in the filesystem root.
    return os.path.join(directory, str(random_number) + '.html')

def read_entire_input_file(in_path):
    """Return the whole content of the file at ``in_path`` as one string.

    Raises IOError/OSError if the file cannot be opened or read.
    """
    # The with-block closes the file even when read() raises.
    with open(in_path, 'r') as input_file:
        return input_file.read()

def read_input_file_line_by_line(in_path):
    """Return the file at ``in_path`` as a list of lines.

    Each element keeps its trailing newline, exactly as ``readlines()``
    yields it. Raises IOError/OSError if the file cannot be opened or read.
    """
    # The with-block closes the file even when readlines() raises.
    with open(in_path, 'r') as input_file:
        return input_file.readlines()

def merge_list(list_of_lines):
    """Flatten one level of nesting and return the elements as a new list.

    Every item of ``list_of_lines`` is iterated, so a list of strings is
    flattened into a list of single characters.
    """
    flattened = []
    for group in list_of_lines:
        # extend() walks the group and appends each element in order.
        flattened.extend(group)
    return flattened

def compare_variable_with_string(content, string):
    """Return True when ``content`` equals ``string``, else False.

    Mainly intended for testing. The result of the comparison is returned
    to the caller rather than being evaluated and thrown away.
    """
    return content == string

def create_output_file(output_shell, content):
    """Write ``content`` to the file at ``output_shell``.

    The file is created if missing and truncated if it already exists.
    Raises IOError/OSError if the file cannot be opened or written.
    Returns None.
    """
    # The with-block closes (and flushes) the file even when write() raises.
    with open(output_shell, 'w+') as output_file:
        output_file.write(content)

# Compiled once at import time because find_next_tag() runs for every tag in
# the document. Group 1 captures the tag name, including the leading '/' of
# an end tag; attributes up to the closing '>' are matched but not captured.
# Constructs that do not start with a letter or '/' (comments, <!DOCTYPE>)
# are deliberately not matched.
_TAG_PATTERN = re.compile(r'<([a-zA-Z][^>\s]*|/[a-zA-Z][^>\s]*)[^>]*>')


def find_next_tag(start_position, content):
    """Find the first tag in ``content`` at or after ``start_position``.

    Returns a tuple ``(tag, tag_position, next_start_position)``:

    * ``tag`` - the tag name as written in the text (``'p'``, ``'/DIV'``),
      or None when no further tag exists;
    * ``tag_position`` - index of the tag's ``<``;
    * ``next_start_position`` - index just past the tag's ``>``.

    When no tag is found both positions are ``len(content)``.
    """
    match = _TAG_PATTERN.search(content, start_position)
    if match is None:
        end_of_content = len(content)
        return None, end_of_content, end_of_content
    tag_position, next_start_position = match.span(0)
    return match.group(1), tag_position, next_start_position

def lower_case_tags(tag_position, tag, content):
    """Return ``content`` with the name of one tag converted to lower case.

    ``tag_position`` is the index of the tag's ``<`` and ``tag`` is the name
    that follows it (including the ``/`` of an end tag). Only those
    ``len(tag)`` characters are replaced.
    """
    name_start = tag_position + 1  # skip the '<'
    name_end = name_start + len(tag)
    return ''.join((content[:name_start], tag.lower(), content[name_end:]))

def tag_type(tag):
    """Return True for a start tag name and False for an end tag name.

    An end tag name is one that begins with ``/``.
    """
    is_end_tag = tag.startswith('/')
    return not is_end_tag

def tag_match(start_tag, end_tag):
    """Return True when ``end_tag`` (e.g. ``'/p'``) closes ``start_tag`` (``'p'``).

    The comparison is exact, so it is case-sensitive.
    """
    name_being_closed = end_tag[1:]  # drop the leading '/'
    return start_tag == name_being_closed

def convert_to_end_tag(start_tag):
    """Return the lower-cased closing tag text for a start tag name.

    For example ``'P'`` becomes ``'</p>'``.
    """
    return ''.join(('</', start_tag.lower(), '>'))

def insert_missing_tag(start_tag, start_position, end_position, content):
    """Insert the closing tag for ``start_tag`` at ``start_position``.

    Returns ``(start_position, end_position, content)`` where both positions
    have been moved forward by the length of the inserted text, so they keep
    pointing at the same characters as before the insertion.
    """
    closing_tag = convert_to_end_tag(start_tag)
    shift = len(closing_tag)
    patched = ''.join((content[:start_position], closing_tag, content[start_position:]))
    return start_position + shift, end_position + shift, patched

def delete_extra_end_tag(tag_position, next_start_position, content):
    """Cut the text between ``tag_position`` and ``next_start_position``.

    Returns ``(tag_position, tag_position, content)``: after the removal the
    search has to resume where the deleted tag used to start, so that index
    is returned for both positions.
    """
    before_tag = content[:tag_position]
    after_tag = content[next_start_position:]
    resume_at = tag_position
    return resume_at, resume_at, before_tag + after_tag

def convert_to_start_tag(end_tag):
    """Return the start tag name for an end tag name.

    The first character (the ``/``) is dropped: ``'/p'`` becomes ``'p'``.
    """
    without_slash = end_tag[1:]
    return without_slash

def fix_missing_tags(content):
    """Lower-case every tag name and repair unbalanced tags in ``content``.

    The text is scanned tag by tag while a stack of open start tags is kept:

    * a start tag is pushed on the stack;
    * an end tag with no matching open start tag is deleted;
    * an end tag that matches an open start tag closes it, and every tag
      opened after that start tag gets its missing end tag inserted just in
      front of the end tag;
    * tags still open when the text ends are closed at the very end.

    Returns the repaired text.
    """
    next_start_position = 0
    tag_position = 0
    open_tags = []
    while True:
        tag, tag_position, next_start_position = find_next_tag(next_start_position, content)
        if tag is None:
            break
        content = lower_case_tags(tag_position, tag, content)
        # Match on the lower-cased name as well. Otherwise '<P>' and '</p>'
        # are not recognised as a pair and the end tag is treated as stray.
        tag = tag.lower()
        if tag_type(tag):
            open_tags.append(tag)
            continue
        if convert_to_start_tag(tag) not in open_tags:
            # Stray end tag: nothing is open that it could close.
            tag_position, next_start_position, content = delete_extra_end_tag(
                tag_position, next_start_position, content)
            continue
        # Unwind to the matching start tag, closing everything opened after it.
        while open_tags:
            pop_tag = open_tags.pop()
            if tag_match(pop_tag, tag):
                break
            tag_position, next_start_position, content = insert_missing_tag(
                pop_tag, tag_position, next_start_position, content)
    # find_next_tag() left tag_position at len(content), so the remaining
    # open tags are closed at the end of the text, innermost first.
    while open_tags:
        tag_position, next_start_position, content = insert_missing_tag(
            open_tags.pop(), tag_position, next_start_position, content)
    return content

def check_if_line_has_both_start_and_end_tags_and_no_nested_start_tag(line):
    """Report whether ``line`` holds a start tag closed by its end tag with nothing left open.

    Start tags are pushed on a stack. When an end tag is met whose name is
    on the stack, the stack is emptied; True is returned if the popped tag
    that matches the end tag was the last one on the stack.

    Returns True in that case and False otherwise, so the result can be
    compared against either boolean.
    """
    next_start_position = 0
    stack = []
    while True:
        tag, _, next_start_position = find_next_tag(next_start_position, line)
        if tag is None:
            break
        if tag_type(tag):
            stack.append(tag)
        else:
            if convert_to_start_tag(tag) not in stack:
                continue
            while len(stack) > 0:
                pop_tag = stack.pop()
                if tag_match(pop_tag, tag) and not stack:
                    return True
    # No qualifying start/end pair was found: answer with an explicit False
    # instead of falling off the end with None.
    return False


def fix_indentation(content):
    """Unfinished stub intended to fix the indentation of the input.

    ``content`` is indexed like a sequence of lines. The loops below only
    evaluate the tag check; the body of the ``if`` consists solely of string
    literals describing the planned algorithm, so nothing is modified and
    the function returns None. ``main()`` does not call this function.
    """
    for i in range(0, len(content)):
        # NOTE(review): the inner loop repeats the same check once per
        # character of the line without using ``j``.
        for j in range(0, len(content[i])):
            if check_if_line_has_both_start_and_end_tags_and_no_nested_start_tag(content[i]) == False:
                # The two string literals below are design notes kept as
                # no-op expression statements; they are the whole branch body.
                """This brings back a list of lines that do not have both a start tag and end tag.  If the line
                    has a start and end tag, that means that there is a nested start tag."""

                """ This is list of lines that do not have both start and end tags in the same line.
                    If there is /n between start tag and matching end tag, then indent 2 spaces.
                    If start tag does not have a \n in front of it, then add one.
                    When end tag is found, add \n.

                """
    pass

def insert_lines_for_special_tags(content):
    """Put selected tags at the start of a new line.

    A newline is inserted in front of every ``<pre>`` and in front of the
    first occurrence of each of ``<head>``, ``<body>`` and ``<h>``. Tags that
    do not occur leave the text untouched. The search is case-sensitive.

    Returns the modified text.
    """
    search_strings = ('<head>', '<body>', '<h>')
    content = content.replace("<pre>", "\n<pre>")
    for word in search_strings:
        special_tag_location = content.find(word)
        if special_tag_location == -1:
            # find() reports "not present" as -1. Used as a slice index that
            # would split the text just before its final character.
            continue
        content = content[:special_tag_location] + '\n' + content[special_tag_location:]
    return content

def find_pre_start_tags(line):
    """Return True when ``line`` contains the text ``<pre>``, else False."""
    return '<pre>' in line

def find_pre_end_tags(line):
    """Return True when ``line`` contains the text ``</pre>``, else False."""
    return '</pre>' in line

def insert_space_in_long_line(line, max_length=80):
    """Wrap ``line`` by replacing spaces with newlines.

    While the remaining text is longer than ``max_length`` characters, the
    last space found before index ``max_length - 1`` is replaced by a
    newline. If no such space exists the rest of the text is left as it is,
    so a single overlong word is never broken.

    The loop is iterative on purpose: a recursive version needs one stack
    frame per produced line and fails with a RecursionError on very long
    single-line inputs.

    Returns the wrapped text as one string.
    """
    pieces = []
    remaining = line
    while len(remaining) > max_length:
        space_index = remaining.rfind(" ", 0, max_length - 1)
        if space_index == -1:
            break
        pieces.append(remaining[:space_index])
        # Skip the space itself; the newline added by join() takes its place.
        remaining = remaining[space_index + 1:]
    pieces.append(remaining)
    return '\n'.join(pieces)

def fix_length(content):
    """Wrap overlong lines of ``content``, leaving preformatted text alone.

    ``content`` is split on newlines. Lines outside a ``<pre>`` block are
    passed through ``insert_space_in_long_line``. Lines that contain
    ``<pre>`` or ``</pre>``, and all lines between an opening ``<pre>`` and
    its ``</pre>``, are kept unchanged because their layout is significant.

    Returns a list of strings, one per input line; a wrapped line is a
    single element that contains embedded newlines.
    """
    new_content = []
    inside_pre = False
    for current_line in content.split('\n'):
        last_open = current_line.rfind('<pre>')
        last_close = current_line.rfind('</pre>')
        touches_pre = last_open >= 0 or last_close >= 0
        if inside_pre or touches_pre:
            new_content.append(current_line)
            if touches_pre:
                # Whichever tag comes last on the line decides whether the
                # following lines are still inside the block.
                inside_pre = last_open > last_close
            continue
        new_content.append(insert_space_in_long_line(current_line))
    return new_content

def rename_file(old_file_name, new_file_name):
    """Rename ``old_file_name`` to ``new_file_name``.

    The source path is used exactly as given. Stripping it down to its base
    name would make the rename look for the file in the current working
    directory instead of where it actually is.

    Raises OSError if the rename fails. Returns None.
    """
    return os.rename(old_file_name, new_file_name)

def is_tag_start_tag_or_end_tag(s):
    """Return True when ``s`` is a start tag and False when it is an end tag.

    ``s`` is the full tag text including ``<`` and ``>``. The first
    character after ``<`` that is not whitespace decides: ``/`` marks an end
    tag, anything else a start tag.
    """
    first_character = s[1:].lstrip()[0]
    return first_character != '/'

def read_all_tags(s):
    """Return the regex match objects of every ``<...>`` span found in ``s``.

    Matching is non-greedy, so each match ends at the first ``>`` after its
    ``<``. The matches are returned in order of appearance.
    """
    tag_pattern = re.compile('<.+?>')
    return list(tag_pattern.finditer(s))

def process_indentation(text):
    """Re-indent ``text`` by two spaces per level of tag nesting.

    The input is expected to have lower-cased, properly matched and nested
    tags, with each tag already on the line where it belongs.

    1. Split the text into lines and strip the whitespace around each line.
    2. Collect all tags of the line.
    3. Indent the line by the current nesting depth. A line that *begins*
       with an end tag is indented one level less, so the end tag lines up
       with its start tag.
    4. Update the depth: +1 for every start tag on the line, -1 for every
       end tag.

    Returns the re-indented text; every line, including the last one, is
    terminated by a newline.
    """
    indented_lines = []
    depth = 0
    for line in text.split('\n'):
        line = line.strip()
        tags_list = read_all_tags(line)
        level = depth
        if tags_list:
            first_tag = tags_list[0]
            starts_with_end_tag = (
                first_tag.start() == 0
                and not is_tag_start_tag_or_end_tag(first_tag.group()))
            if starts_with_end_tag:
                level = depth - 1
        # A negative level simply yields no indentation.
        indented_lines.append(' ' * (level * 2) + line + '\n')
        for tag in tags_list:
            if is_tag_start_tag_or_end_tag(tag.group()):
                depth += 1
            else:
                depth -= 1
    return ''.join(indented_lines)

def delete_blank_line(line):
    """Return the text in ``line`` with all blank lines removed.

    A line counts as blank when it is empty or holds nothing but
    whitespace. Every kept line is terminated by a newline in the result.
    """
    kept_lines = [row for row in line.split('\n') if row.strip()]
    return ''.join(row + '\n' for row in kept_lines)

def main():
    """Tidy the HTML file chosen by the user, in place.

    Steps: ask for the file, back it up, normalise line breaks around
    special tags, repair and lower-case the tags, re-indent, wrap long
    lines, drop blank lines, write the result to a temporary file and
    finally move that file over the original.
    """
    in_path = ask_user_for_file()
    if not in_path:
        # NOTE(review): assumes a cancelled dialog yields an empty value -
        # confirm. Without a file there is nothing to tidy.
        return
    create_backup(in_path)
    output_shell = generate_random_file_name(in_path)
    content_to_be_fixed = read_entire_input_file(in_path)
    new_content_to_be_fixed = insert_lines_for_special_tags(content_to_be_fixed)
    fixed_tags_content = fix_missing_tags(new_content_to_be_fixed)
    indented_content = process_indentation(fixed_tags_content)
    jumbled_list = fix_length(indented_content)
    not_final_content = '\n'.join(jumbled_list)
    final_content = delete_blank_line(not_final_content)
    create_output_file(output_shell, final_content)
    # Replace the original only after the tidy output is safely on disk, and
    # use the full paths so the result does not depend on the current
    # working directory.
    shutil.move(output_shell, in_path)

# Run the tidy workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
	#CIT 591
	#Assignment 4 - HTML Tidy
	#by James Park and Yue Chen

	import Tkinter
	import tkFileDialog
	import os.path
	import shutil
	import random
	import sys
	import re

	def create_test_file(file_name):
	"""This function creates a test file."""
	file_name = os.path.dirname(os.path.realpath(__file__))
	file_name += os.sep + "unittest.html"
	f = open(file_name, 'w+')
	f.close()
	return

	def delete_file(file_name):
	"""This function deletes a file."""
	os.remove(file_name)

	def file_exists(file_name):
	"""This function verifies whether a file exists."""
	return os.path.exists(file_name)

	def ask_user_for_file():
	"""This function asks user for a file."""
	Tkinter.Tk().withdraw() # Close the root window
	print 'Please select the HMTL file that you want to fix.'
	in_path = tkFileDialog.askopenfilename()
	return in_path

	def create_backup(in_path):
	"""This function copies the input file and creates a backup file."""
	bad_file_location = os.path.basename(in_path)
	backup_file = bad_file_location + '.bak'
	shutil.copyfile(in_path, backup_file)
	return

	def generate_random_file_name(in_path):
	"""This function generates a random file name."""
	random_number = random.randint(1, sys.maxint)
	output_shell = os.path.dirname(in_path)
	output_shell += os.sep + str(random_number) + '.html'
	return output_shell

	def read_entire_input_file(in_path):
	"""This function reads the entire input file and stores the entire content."""
	input_file = open(in_path, 'r')
	whole_content_as_string = input_file.read()
	input_file.close()
	return whole_content_as_string

	def read_input_file_line_by_line(in_path):
	"""This function reads the input file line by line."""
	input_file = open(in_path, 'r')
	list_of_content_lines = input_file.readlines()
	input_file.close()
	return list_of_content_lines

	def merge_list(list_of_lines):
	"""This function takes a list of lists and flattens it to the just the elements."""
	merged_list = []
	for i in list_of_lines:
	for j in i:
	merged_list.append(j)
	return merged_list

	def compare_variable_with_string(content, string):
	"""This function compares a variable with a string. (Mainly for testing purposes)"""
	content == string

	def create_output_file(output_shell, content):
	"""This function creates the output file that we will put fixed HTML code into."""
	f = open(output_shell, 'w+')
	f.write(content)
	f.close()

	def find_next_tag(start_position, content):
	"""This functions finds start tags and next position to search the content."""
	p = re.compile('<([a-zA-Z][^>\s]\|/[a-zA-Z][^>\s])[^>]*>')
	m = p.search(content, start_position)
	if m is None:
	return None, len(content), len(content)
	return m.group(1), m.span(0)[0], m.span(0)[1]

	def lower_case_tags(tag_position, tag, content):
	"""This function lower cases tags."""
	tag_position += 1
	end_position = tag_position + len(tag)
	content = content[:tag_position] + tag.lower() + content[end_position:]
	return content

	def tag_type(tag):
	"""This function checks whether a tag is a start tag or end tag."""
	return tag[0:1] != '/'

	def tag_match(start_tag, end_tag):
	"""This function is matching the start tag with an end tag."""
	return end_tag[1:] == start_tag

	def convert_to_end_tag(start_tag):
	"""This converts a tag to an end_tag."""
	return '</' + start_tag.lower() + '>'

	def insert_missing_tag(start_tag, start_position, end_position, content):
	"""This function inserts a missing end tag where end tags are missing."""
	end_tag = convert_to_end_tag(start_tag)
	content = content[:start_position] + end_tag + content[start_position:]
	return start_position + len(end_tag), end_position + len(end_tag), content

	def delete_extra_end_tag(tag_position, next_start_position, content):
	"""This function deletes extra end tags."""
	content = content[:tag_position] + content[next_start_position:]
	return tag_position, tag_position, content

	def convert_to_start_tag(end_tag):
	"""This function converts an end tag to a start tag."""
	return end_tag[1:]

	def fix_missing_tags(content):
	"""This function compiles smaller functions and goes through an input file and fixes the tagging and lower cases all tags."""
	next_start_position = 0
	tag_position = 0
	stack = []
	while True:
	tag, tag_position, next_start_position = find_next_tag(next_start_position, content)
	if tag is None:
	break
	content = lower_case_tags(tag_position, tag, content)
	if tag_type(tag):
	stack.append(tag)
	else:
	if convert_to_start_tag(tag) not in stack:
	tag_position, next_start_position, content = delete_extra_end_tag(tag_position, next_start_position, content)
	continue
	while len(stack) > 0:
	pop_tag = stack.pop()
	if tag_match(pop_tag, tag):
	break
	tag_position, next_start_position, content = insert_missing_tag(pop_tag, tag_position, next_start_position, content)
	while len(stack) > 0:
	tag_position, next_start_position, content = insert_missing_tag(stack.pop(), tag_position, next_start_position, content)
	return content

	def check_if_line_has_both_start_and_end_tags_and_no_nested_start_tag(line):
	"""This function checks whether a line has both a start tag and an end tag, and no nested start tag. """
	next_start_position = 0
	tag_position = 0
	stack = []
	while True:
	tag, tag_position, next_start_position = find_next_tag(next_start_position, line)
	if tag is None:
	break
	if tag_type(tag):
	stack.append(tag)
	else:
	if convert_to_start_tag(tag) not in stack:
	continue
	while len(stack) > 0:
	pop_tag = stack.pop()
	if tag_match(pop_tag, tag) and not stack:
	return True


	def fix_indentation(content):
	"""This function fixes all the identations in the input file."""
	for i in range(0, len(content)):
	for j in range(0, len(content[i])):
	if check_if_line_has_both_start_and_end_tags_and_no_nested_start_tag(content[i]) == False:
	"""This brings back a list of lines that do not have both a start tag and end tag. If the line
	has a start and end tag, that means that there is a nested start tag."""

	""" This is list of lines that do not have both start and end tags in the same line.
	If there is /n between start tag and matching end tag, then indent 2 spaces.
	If start tag does not have a \n in front of it, then add one.
	When end tag is found, add \n.

	"""
	pass

	def insert_lines_for_special_tags(content):
	"""This function inserts lines for tags such as <head> and <body>. Also, places <pre> start tags in a new line."""
	search_strings = ('<head>', '<body>', '<h>')
	content = content.replace("<pre>", "\n<pre>")
	for word in search_strings:
	special_tag_location = content.find(word)
	content = content[:special_tag_location] + '\n' + content[special_tag_location:]
	return content

	def find_pre_start_tags(line):
	""""This function finds pre start tags."""
	found_line = line.find('<pre>')
	if found_line >= 0:
	return True
	return False

	def find_pre_end_tags(line):
	"""This function finds pre end tags."""
	found_line = line.find('</pre>')
	if found_line >= 0:
	return True
	return False

	def insert_space_in_long_line(line):
	"""This function inserts spaces in lines inputted and will recursively do so."""
	if len(line) <= 80:
	return line
	space_index = line.rfind(" ", 0, 79)
	if space_index == -1:
	return line
	new_line = line[:space_index] + '\n' + insert_space_in_long_line(line[space_index + 1:])
	return new_line

	def fix_length(content):
	"""This function finds lines greater than 80 characters and splits them into two separate sentences."""
	content = content.split('\n')
	new_content = []
	for i in range(0, len(content)):
	current_line = content[i]
	if find_pre_start_tags(current_line):
	new_content.append(current_line)
	continue
	if find_pre_end_tags(current_line):
	new_content.append(current_line)
	continue
	new_content.append(insert_space_in_long_line(current_line))
	return new_content

	def rename_file(old_file_name, new_file_name):
	"""This function renames the new output file with the original file name."""
	name_to_delete = os.path.basename(old_file_name)
	return os.rename(name_to_delete, new_file_name)

	def is_tag_start_tag_or_end_tag(s):
	"""The input is the part between '<' and '>' of each tag('<' and '>' included)
	check if the char after '<' is '/'
	the strip() is in case for possible spaces between '<' and ('/' or the first letter of the start tag name)
	"""
	return s[1:].strip()[0] != '/'

	def read_all_tags(s):
	"""
	find out all tags in the original file and store them in a list
	"""

	tags_list = []
	p = re.compile('<.+?>')
	for tag in p.finditer(s):
	tags_list.append(tag)
	return tags_list

	def process_indentation(text):
	"""The input string is the file in which all tags are in lowercase, well matched and nested,
	each tag is in the approriate line
	we hope to output the file in which all lines are well-indented

	1. split the file into different lines and store them into 'lst'
	2. the variable 'output' refers to the whole file after processing
	define last_end_location, to record where am I in the current line(original file),
	while processing the input and store the result into output
	3. for each line, delete all the spaces in the line
	4. read all tags in current line and store them in tags_list
	5. if tags_list is
	"""
	n = 0
	lst = text.split('\n')
	i = 2
	output = ''
	last_end_location = 0
	for line in lst:
	line = line.strip()
	tags_list = read_all_tags(line)
	if len(tags_list) > 0:
	tag = tags_list[0]
	if is_tag_start_tag_or_end_tag(tag.group()):
	output += ' ' * n * 2 + line + '\n'
	elif is_tag_start_tag_or_end_tag(tag.group()) == False and tag.start() != 0:
	output += ' ' * n * 2 + line + '\n'
	else:
	output += ' ' * (n - 1) * 2 + line + '\n'
	else:
	output += ' ' * n * 2 + line + '\n'
	for tag in tags_list:
	tag_name = tag.group()
	if is_tag_start_tag_or_end_tag(tag_name):
	n += 1
	else:
	n -= 1
	return output

	def delete_blank_line(line):
	"""Delete all blank lines in the input file."""
	lst = line.split('\n')
	s = ''
	for line in lst:
	if line != '':
	s += line + '\n'
	return s

	def main():
	in_path = ask_user_for_file()
	backup_file = create_backup(in_path)
	output_shell = generate_random_file_name(in_path)
	content = read_input_file_line_by_line(in_path)
	merged_list = merge_list(content)
	content_to_be_fixed = ''.join(merged_list)
	new_content_to_be_fixed = insert_lines_for_special_tags(content_to_be_fixed)
	fixed_tags_content = fix_missing_tags(new_content_to_be_fixed)
	indented_content = process_indentation(fixed_tags_content)
	jumbled_list = fix_length(indented_content)
	not_final_content = '\n'.join(jumbled_list)
	final_content = delete_blank_line(not_final_content)
	create_output_file(output_shell, final_content)
	new_file_name = os.path.basename(in_path)
	delete_file(new_file_name)
	new_file_name = str(new_file_name)
	rename_file(output_shell, new_file_name)

	if __name__ == "__main__":
	main()