This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def remove_non_ascii(words):
    """Return a new list with every non-ASCII character stripped from each word.

    Each word is NFKD-normalised first, so accented letters decompose into a
    base letter plus combining marks; the ASCII encode step then drops
    whatever cannot be represented. A word made up solely of such characters
    becomes an empty string and is kept in the result.
    """
    def _ascii_only(text):
        decomposed = unicodedata.normalize('NFKD', text)
        raw = decomposed.encode('ascii', 'ignore')
        return raw.decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]
# To LowerCase | |
def to_lowercase(words):
    """Return a new list holding the lower-cased form of each word."""
    return [token.lower() for token in words]
# Remove Punctuation | |
def remove_punctuation(words):
    """Strip punctuation from each word and drop words that end up empty.

    A character is removed when it is neither a regex word character
    (``\\w``, which includes the underscore) nor whitespace.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    # Tokens that consisted only of punctuation are now '' and are discarded.
    return [token for token in stripped if token != '']
# Replace Numbers with Textual Representations | |
def replace_numbers(words):
    """Return a new list in which all-digit words are spelled out.

    A word is converted with ``inflect``'s ``number_to_words`` when
    ``str.isdigit()`` is true for it; every other word is passed through
    unchanged.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]
# Remove Stopwords | |
def remove_stopwords(words):
    """Return a new list containing only the words that are not English stop words.

    The stop-word list comes from ``stopwords.words('english')``. It is
    fetched a single time and held in a frozenset, so each membership test is
    a hash lookup rather than a fresh call plus a linear scan per word.

    Matching is exact and case-sensitive; the words are expected to be
    hashable (plain strings).
    """
    # Loop-invariant: resolve the stop-word list once, not once per word.
    english_stopwords = frozenset(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]
# Combine all functions into Normalize() function | |
def normalize(words):
    """Run the word list through every cleaning step and return the result.

    The steps are applied in a fixed order: non-ASCII removal, lower-casing,
    punctuation removal, number spelling, then stop-word removal. Each step
    receives the output of the previous one.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words
# Download the NLTK 'stopwords' resource before the pipeline runs;
# remove_stopwords() looks up stopwords.words('english').
nltk.download('stopwords')

# flat_sent_token is defined outside this excerpt.
sents = normalize(flat_sent_token)
print(
    "Length of sentences list: ",
    len(sents),
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment