amitrani6/cleaning_the_text.py

## cleaning_the_text.py
# A function to open the raw text
def open_file(file_path):
    """Read a text file and return its contents on a single line.

    Parameters:
        file_path: Path of the file to open in text read mode.

    Returns:
        The file's full contents with every newline character replaced by a
        single space.
    """
    with open(file_path, 'r') as handle:
        contents = handle.read()

    # Swap each line break for a space so the text becomes one long line
    return contents.replace('\n', ' ')


# A function that takes the raw script text as a string, removes special characters and
# (optionally) stop words, and returns the script text as a list where each element represents a word
def cleaned_episode(raw_text, stop_words = False):
    """Turn raw script text into a list of lowercase words.

    Text enclosed in parentheses or square brackets is dropped, the
    characters ``* , # - . ? ! '`` and newlines are stripped, and the result
    is lowercased and split on single spaces. Empty tokens and tokens ending
    in a colon (speaker indicators such as ``jerry:``) are discarded.

    Parameters:
        raw_text: The script text as a single string.
        stop_words: Optional collection of words to exclude. Tokens are
            compared after lowercasing, so entries should be lowercase. Any
            falsy value (the default ``False``) disables stop-word removal.

    Returns:
        A list of the remaining words, in their original order.
    """
    # Imported locally so the function does not rely on a module-level
    # ``import re`` being present.
    import re

    # Delete all text between and including brackets and parentheses.
    # The pattern is a raw string so the regex escapes are not parsed as
    # (invalid) string escape sequences.
    without_notes = re.sub(r"[\(\[].*?[\)\]]", "", raw_text)

    # Strip the special characters in a single pass and lowercase once,
    # instead of re-lowercasing the whole text for every symbol.
    strip_table = str.maketrans("", "", "*,#-.?!''\n")
    normalized = without_notes.translate(strip_table).lower()

    # Build a new list rather than calling remove() on the list being
    # iterated: removing during iteration skips the element that follows each
    # removal, which left stray blanks and speaker tags in the result.
    words = [
        token
        for token in normalized.split(" ")
        if token and not token.endswith(":")
    ]

    # Removes any stop words passed in by the caller
    if stop_words:
        words = [token for token in words if token not in stop_words]

    return words
	# A function to open the raw text
	def open_file(file_path):
	    """Read a text file and return its contents on a single line.

	    Parameters:
	        file_path: Path of the file to open in text read mode.

	    Returns:
	        The file's full contents with every newline character replaced by
	        a single space.
	    """
	    # NOTE(review): this repeats the open_file definition that appears
	    # earlier in this file; the body indentation had been flattened and is
	    # restored here.
	    with open(file_path, 'r') as handle:
	        contents = handle.read()

	    # Swap each line break for a space so the text becomes one long line
	    return contents.replace('\n', ' ')


	# A function that takes the raw script text as a string, removes special characters and
	# (optionally) stop words, and returns the script text as a list where each element represents a word
	def cleaned_episode(raw_text, stop_words = False):
	    """Turn raw script text into a list of lowercase words.

	    Text enclosed in parentheses or square brackets is dropped, the
	    characters ``* , # - . ? ! '`` and newlines are stripped, and the
	    result is lowercased and split on single spaces. Empty tokens and
	    tokens ending in a colon (speaker indicators such as ``jerry:``) are
	    discarded.

	    Parameters:
	        raw_text: The script text as a single string.
	        stop_words: Optional collection of words to exclude. Tokens are
	            compared after lowercasing, so entries should be lowercase.
	            Any falsy value (the default ``False``) disables stop-word
	            removal.

	    Returns:
	        A list of the remaining words, in their original order.
	    """
	    # NOTE(review): this repeats the cleaned_episode definition that
	    # appears earlier in this file; the body indentation had been flattened
	    # and is restored here.

	    # Imported locally so the function does not rely on a module-level
	    # ``import re`` being present.
	    import re

	    # Delete all text between and including brackets and parentheses.
	    # The pattern is a raw string so the regex escapes are not parsed as
	    # (invalid) string escape sequences.
	    without_notes = re.sub(r"[\(\[].*?[\)\]]", "", raw_text)

	    # Strip the special characters in a single pass and lowercase once,
	    # instead of re-lowercasing the whole text for every symbol.
	    strip_table = str.maketrans("", "", "*,#-.?!''\n")
	    normalized = without_notes.translate(strip_table).lower()

	    # Build a new list rather than calling remove() on the list being
	    # iterated: removing during iteration skips the element that follows
	    # each removal, leaving stray blanks and speaker tags behind.
	    words = [
	        token
	        for token in normalized.split(" ")
	        if token and not token.endswith(":")
	    ]

	    # Removes any stop words passed in by the caller
	    if stop_words:
	        words = [token for token in words if token not in stop_words]

	    return words