Created
August 10, 2021 04:00
-
-
Save LauraLangdon/633c1ce4213956562c1cfa452b5551df to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def clean_text(corpus, input_string: str, *, stop_words=None) -> list:
    """
    Clean text data and add to corpus

    The input is split on runs of non-word characters. Each token then has
    punctuation, numeric characters and spaces removed and is lower-cased.
    Tokens that end up empty, or that are stop words, are dropped. Stop
    words are matched against the cleaned, lower-cased token so that
    capitalised occurrences such as "The" are filtered too.

    NOTE(review): this assumes the stop words are stored in lower case --
    confirm against the definition of ``stop_words_list``.

    :param corpus: list of all words in the data; unseen words are appended
        to it in place, in order of first appearance
    :param input_string: string of words to be added to the corpus
    :param stop_words: optional iterable of words to drop; when omitted the
        module-level ``stop_words_list`` is used
    :return: output_string_as_list: cleaned list of words from input string
    """
    if stop_words is None:
        stop_words = stop_words_list
    excluded = set(stop_words)
    # Set mirror of corpus so membership tests do not rescan the list for
    # every word; kept in sync with corpus in the loop below.
    known_words = set(corpus)

    output_string_as_list = []
    for token in re.split(r'\W+', input_string):
        # \W+ leaves digits and underscores inside tokens; strip them here.
        word = ''.join(
            char for char in token
            if not (char in string.punctuation or char.isnumeric() or char == ' ')
        ).lower()
        # Filter after normalising so "The" and "the1" are caught as "the".
        if not word or word in excluded:
            continue
        if word not in known_words:
            known_words.add(word)
            corpus.append(word)
        output_string_as_list.append(word)
    return output_string_as_list
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment