Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
def clean_text(corpus, input_string: str) -> list:
    """
    Clean text data and add to corpus

    The input is split on runs of non-word characters. Each token then has
    punctuation, numeric characters and spaces removed and is lower-cased.
    Tokens that end up empty, or that are stop words, are dropped. Every
    remaining word is appended to the returned list (duplicates included)
    and is also appended to ``corpus`` if it is not already present there.

    :param corpus: list of all words in the data; mutated in place
    :param input_string: string of words to be added to the corpus
    :return: output_string_as_list: cleaned list of words from input string
    """
    output_string_as_list = []
    for token in re.split(r'\W+', input_string):
        # Single pass over the token instead of one str.replace() per
        # offending character; the surviving characters are the same.
        stripped = ''.join(
            char for char in token
            if not (char in string.punctuation
                    or char.isnumeric()
                    or char == ' ')
        )
        if not stripped:
            # re.split yields '' at the edges of the input, and a token may
            # consist solely of digits/underscores.
            continue
        word = stripped.lower()
        # Test for stop words AFTER cleaning as well as before: checking only
        # the raw token lets "The" or "the1" through when the stop list holds
        # "the". The raw token is still checked so that everything filtered
        # previously is filtered now.
        # NOTE(review): assumes stop_words_list holds lower-case entries, as
        # is typical - confirm against its definition.
        if (token in stop_words_list
                or stripped in stop_words_list
                or word in stop_words_list):
            continue
        if word not in corpus:
            corpus.append(word)
        output_string_as_list.append(word)
    return output_string_as_list
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment