Skip to content

Instantly share code, notes, and snippets.

@amankharwal
Created January 31, 2021 07:41
Embed
What would you like to do?
def remove_non_ascii(words):
    """Strip non-ASCII characters from every token in ``words``.

    Each token is NFKD-normalized first, so characters with a compatibility
    decomposition (accented letters, ligatures, superscripts) are split into
    their base characters; encoding to ASCII with ``'ignore'`` then drops
    anything that still has no ASCII form.

    Args:
        words: Iterable of ``str`` tokens.

    Returns:
        A new list with one string per input token. A token made up entirely
        of non-ASCII characters becomes ``''`` and is kept in the list.
    """
    return [
        unicodedata.normalize('NFKD', word)
        .encode('ascii', 'ignore')
        # The bytes are pure ASCII at this point, so a plain ASCII decode
        # can never fail and needs no error handler.
        .decode('ascii')
        for word in words
    ]
# To LowerCase
def to_lowercase(words):
    """Return a new list with every token in ``words`` lower-cased.

    Args:
        words: Iterable of ``str`` tokens.

    Returns:
        A list of the same length holding ``word.lower()`` for each token.
    """
    return [word.lower() for word in words]
# Remove Punctuation
def remove_punctuation(words):
    """Remove punctuation from each token and drop tokens left empty.

    Every character that is neither a word character (``\\w``) nor
    whitespace (``\\s``) is deleted. Underscores count as word characters
    and are therefore kept.

    Args:
        words: Iterable of ``str`` tokens.

    Returns:
        A new list of the cleaned tokens, excluding any that became ``''``.
    """
    # Compile once so the pattern lookup is not repeated for every token.
    non_word = re.compile(r'[^\w\s]')
    cleaned = (non_word.sub('', word) for word in words)
    return [word for word in cleaned if word]
# Replace Numbers with Textual Representations
def replace_numbers(words):
    """Replace all-digit tokens with their spelled-out form.

    Tokens for which ``str.isdigit()`` is true are passed to
    ``inflect``'s ``number_to_words``; every other token is copied through
    unchanged.

    Args:
        words: Iterable of ``str`` tokens.

    Returns:
        A new list of the same length as ``words``.
    """
    engine = None
    converted = []
    for word in words:
        if word.isdigit():
            # Build the inflect engine lazily: input without numeric tokens
            # never pays for its construction.
            if engine is None:
                engine = inflect.engine()
            word = engine.number_to_words(word)
        converted.append(word)
    return converted
# Remove Stopwords
def remove_stopwords(words):
    """Drop tokens that appear in NLTK's English stop-word list.

    The comparison is an exact string match against
    ``stopwords.words('english')``; no case folding is done here.

    Args:
        words: Iterable of ``str`` tokens.

    Returns:
        A new list holding the tokens that are not stop words, in their
        original order.
    """
    tokens = list(words)
    if not tokens:
        # Nothing to filter, so the corpus is not touched at all.
        return []
    # Load the stop-word list once and use a set: the original re-read the
    # corpus and scanned it linearly for every single token.
    english_stopwords = frozenset(stopwords.words('english'))
    return [word for word in tokens if word not in english_stopwords]
# Combine all of the steps above into a single normalize() function
def normalize(words):
    """Run the complete token-cleaning pipeline over ``words``.

    The steps are applied in this order, each one receiving the previous
    step's output: ``remove_non_ascii``, ``to_lowercase``,
    ``remove_punctuation``, ``replace_numbers``, ``remove_stopwords``.

    Args:
        words: Iterable of ``str`` tokens.

    Returns:
        The list produced by the final step.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words
# Download the NLTK 'stopwords' resource before normalize() runs;
# remove_stopwords() looks words up in stopwords.words('english').
nltk.download('stopwords')
# Run the whole cleaning pipeline.
# NOTE(review): flat_sent_token is not defined in this snippet - presumably a
# flat list of string tokens built earlier; confirm against the caller.
sents = normalize(flat_sent_token)
# Reports len() of whatever normalize() returned.
# NOTE(review): the label says "sentences", but the pipeline works per token,
# so this looks like a token count - confirm.
print("Length of sentences list: ", len(sents))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment