Skip to content

Instantly share code, notes, and snippets.

@amitrani6
Created October 4, 2019 01:53
Show Gist options
  • Save amitrani6/49035ea3b961dbbeab8e9681cff0da47 to your computer and use it in GitHub Desktop.
Save amitrani6/49035ea3b961dbbeab8e9681cff0da47 to your computer and use it in GitHub Desktop.
A cleaning function for NLP
def open_file(file_path, encoding=None):
    """Read a text file and return its contents as one line of text.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding handed to ``open``. The default ``None``
            keeps the platform's default encoding, so existing callers
            are unaffected; pass e.g. ``"utf-8"`` when the text contains
            non-ASCII characters and must decode the same way everywhere.

    Returns:
        The file's text with every newline character replaced by a space.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        # Newlines become spaces so words on adjacent lines stay separated.
        return file.read().replace('\n', ' ')
def cleaned_episode(raw_text, stop_words = False):
    """Turn the raw text of a script into a list of cleaned, lower-case words.

    Processing steps, in order:
      1. Delete everything between (and including) parentheses or square
         brackets.
      2. Delete the special characters ``* , # - . ? ! '`` and newlines,
         then lower-case the text.
      3. Split on single spaces and drop empty elements as well as any
         word ending in a colon (character-speaking indicators such as
         ``"jerry:"``).
      4. Optionally drop stop words.

    Args:
        raw_text: The script text as a single string.
        stop_words: Iterable of words to remove, or a falsy value (the
            default) to skip stop-word removal. Words are compared after
            lower-casing, so the stop words should be lower-case too.

    Returns:
        A list where each element is one remaining word of the script.
    """
    # Imported at function scope because nothing else in this file
    # imports ``re``.
    import re

    # Non-greedy match so each bracketed note is removed on its own rather
    # than everything between the first opener and the last closer.
    raw_text_no_notes = re.sub(r"[\(\[].*?[\)\]]", "", raw_text)

    # Remove all special characters in a single pass and lower-case once.
    special_characters = str.maketrans('', '', "*,#-.?!'\n")
    raw_text_no_notes = raw_text_no_notes.translate(special_characters).lower()

    # Build a new list instead of calling remove() on the list being
    # iterated: removing during iteration skips the element that follows
    # each removal, which left blanks and speaker tags in the result.
    cleaned_text = [
        word
        for word in raw_text_no_notes.split(" ")
        if word and not word.endswith(':')
    ]

    # Removes any stop words passed in; a set gives O(1) membership tests.
    if stop_words:
        stop_word_set = set(stop_words)
        cleaned_text = [word for word in cleaned_text if word not in stop_word_set]

    return cleaned_text
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment