Created
October 4, 2019 01:53
-
-
Save amitrani6/49035ea3b961dbbeab8e9681cff0da47 to your computer and use it in GitHub Desktop.
A cleaning function for NLP
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def open_file(file_path, encoding=None):
    """Read a text file and return its contents as a single-line string.

    Every newline character in the file is replaced by one space, so the
    returned string contains no line breaks.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding passed through to ``open``. The default,
            ``None``, keeps the platform's default encoding; pass e.g.
            ``"utf-8"`` to decode the file the same way on every platform.

    Returns:
        The file's text with each ``'\\n'`` replaced by ``' '``.
    """
    with open(file_path, 'r', encoding=encoding) as handle:
        return handle.read().replace('\n', ' ')


def cleaned_episode(raw_text, stop_words = False):
    """Clean a raw script string and return it as a list of lowercase words.

    Processing steps, in order:

    1. Delete every span enclosed in parentheses or square brackets,
       including the delimiters themselves.
    2. Delete the characters ``* , # - . ? ! '`` and newlines, then
       lowercase the text. Newlines are deleted, not turned into spaces,
       so words separated only by a newline are joined together.
    3. Split the text on single space characters.
    4. Drop empty tokens and tokens ending in ``':'`` (speaker labels such
       as ``"jerry:"``).
    5. If ``stop_words`` is given, drop every token found in it.

    Args:
        raw_text: The script text to clean.
        stop_words: Collection of words to exclude. Any falsy value (the
            default ``False``, ``None``, an empty list) disables stop-word
            filtering. Tokens are compared after step 2, so entries that
            contain uppercase letters or apostrophes will never match.

    Returns:
        A list of cleaned, lowercase word strings.
    """
    # Imported locally so the function works even when the surrounding file
    # does not import ``re`` at module level.
    import re

    # Non-greedy match: each note ends at the first closing delimiter.
    without_notes = re.sub(r"[\(\[].*?[\)\]]", "", raw_text)

    # One pass removes all unwanted characters; lowercase once afterwards.
    strip_table = str.maketrans("", "", "*,#-.?!'\n")
    normalized = without_notes.translate(strip_table).lower()

    # Build a new list instead of calling .remove() while iterating: mutating
    # the list mid-loop skips the element after each removal, leaving blank
    # tokens and speaker labels behind.
    words = [
        token
        for token in normalized.split(" ")
        if token and not token.endswith(":")
    ]

    if stop_words:
        # Set lookup keeps the filter linear in the number of words.
        excluded = set(stop_words)
        words = [token for token in words if token not in excluded]

    return words
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment