Last active August 9, 2023 01:44
import re
import string
import unidecode
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import gensim.downloader as api
from pycontractions import Contractions
from word2number import w2n
Text Normalization for NLP
-removes extra whitespace within text
-converts unicode to ascii
-converts to lowercase
-removes leading or trailing whitespace
-expands contractions
-tokenizes sentences and words
-removes punctuation
-lemmatizes words
-removes stopwords
class TextNormalizer:
    """Normalize raw text into a list of cleaned sentence strings.

    The pipeline, in the order it is applied by ``normalize_text``:
    collapse runs of spaces, transliterate with ``unidecode``, lowercase,
    expand contractions, split into sentences, and then for every sentence
    delete punctuation, strip surrounding whitespace, tokenize into words,
    drop English stop words and lemmatize what remains.
    """

    def __init__(self):
        # pycontractions is constructed with a gensim keyed-vector model;
        # api.load fetches it by name through gensim's downloader.
        self.cont = Contractions(kv_model=api.load("glove-twitter-25"))
        self.lemmatizer = WordNetLemmatizer()
        # Translation table that deletes every character in
        # string.punctuation when passed to str.translate.
        self.punctuation_table = str.maketrans('', '', string.punctuation)
        # A set, so the per-word membership test below is O(1).
        self.stop_words = set(stopwords.words('english'))

    def normalize_text(self, text):
        """Return the normalized sentences found in ``text``.

        Args:
            text: The raw input string.

        Returns:
            A list with one string per sentence detected by
            ``sent_tokenize``. Each string is the sentence's remaining
            words (after stop-word removal and lemmatization) joined by
            single spaces. A sentence whose words are all removed yields
            an empty string.
        """
        normalized_sentences = []

        # Collapse runs of spaces only; other whitespace is left as-is.
        text = re.sub(' +', ' ', text)
        text = unidecode.unidecode(text)
        text = text.lower()

        # expand_texts takes a batch of texts; only one is passed in, so
        # only the first result is used. Keep the unexpanded text if
        # nothing comes back.
        expanded_contractions = list(
            self.cont.expand_texts([text], precise=True)
        )
        if expanded_contractions:
            text = expanded_contractions[0]

        for sentence in sent_tokenize(text):
            # remove punctuation
            sentence = sentence.translate(self.punctuation_table)
            # strip leading/trailing whitespace
            sentence = sentence.strip()
            words = word_tokenize(sentence)
            # Stop words are filtered on the surface form, before
            # lemmatization, matching the original order of operations.
            filtered = [
                self.lemmatizer.lemmatize(word)
                for word in words
                if word not in self.stop_words
            ]
            # Fix: the joined sentence was previously computed and then
            # discarded, so this method always returned an empty list.
            normalized_sentences.append(' '.join(filtered))

        return normalized_sentences
I was unable to run your code explaining Text Normalization due to a dependency error on Windows 10: pycontractions depends on language-check, which is now abandoned and in a broken state.

Did you test this under Windows or Linux? And what version of nltk and python did you use for this?

lvngd commented Jul 31, 2021

Hi, I was probably using Python 3.6 or 3.7, not sure which version of NLTK - on macOS and Linux. But this error isn't coming from NLTK. You could just omit the part that uses pycontractions (comment out lines 8, 9, 29, 39-41) - that's a separate package and sounds like there are some issues installing with Windows. Unfortunately I'm not familiar with Python on Windows, so you might need to find an alternative if you want to expand contractions.

