import re
import string
import unidecode
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import gensim.downloader as api
from pycontractions import Contractions
from word2number import w2n


"""
Text Normalization for NLP
-removes extra whitespace within text
-converts unicode to ascii
-converts to lowercase
-removes leading or trailing whitespace
-expands contractions
-tokenizes sentences and words
-removes punctuation
-lemmatizes words
-removes stopwords
"""


class TextNormalizer:
    def __init__(self):
        self.cont = Contractions(kv_model=api.load("glove-twitter-25"))
        self.lemmatizer = WordNetLemmatizer()
        self.punctuation_table = str.maketrans('', '', string.punctuation)
        self.stop_words = set(stopwords.words('english'))

    def normalize_text(self, text):
        normalized_sentences = []
        text = re.sub(' +', ' ', text)
        text = unidecode.unidecode(text)
        text = text.lower()
        expanded_contractions = list(self.cont.expand_texts([text], precise=True))
        if expanded_contractions:
            text = expanded_contractions[0]
        sentences = sent_tokenize(text)
        for sentence in sentences:
            #remove punctuation
            sentence = sentence.translate(self.punctuation_table)
            #strip leading/trailing whitespace
            sentence = sentence.strip()
            words = word_tokenize(sentence)
            #lemmatize and remove stopwords
            filtered = [self.lemmatizer.lemmatize(word) for word in words if word not in self.stop_words]
            new_sentence = ' '.join(filtered)
            normalized_sentences.append(new_sentence)
        return normalized_sentences
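For anyone who wants to try the class, here is a minimal usage sketch. It assumes the NLTK data packages punkt, wordnet, and stopwords have already been fetched with nltk.download(), and that pycontractions installed cleanly; the sample text and expected output are illustrative only.

# Minimal usage sketch: instantiate the class above and normalize a short
# passage. Loading the glove-twitter-25 model in __init__ can take a while
# on the first run because gensim downloads it.
normalizer = TextNormalizer()

sample = "She isn't going to the   parks. The dogs were barking loudly."
print(normalizer.normalize_text(sample))
# roughly: ['going park', 'dog barking loudly'] -- exact tokens depend on the
# NLTK data and stopword list in use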
I was unable to run your code explaining Text Normalization due to a dependency error on Windows 10: pycontractions depends on language-check, which is now abandoned and in a broken state.
Did you test this under Windows or Linux? And what version of nltk and python did you use for this?
Hi, I was probably using Python 3.6 or 3.7, and I'm not sure which version of NLTK, on macOS and Linux. But this error isn't coming from NLTK. You could just omit the part that uses pycontractions (comment out lines 8, 9, 29, and 39-41); that's a separate package, and it sounds like there are some issues installing it on Windows. Unfortunately I'm not familiar with Python on Windows, so you might need to find an alternative if you want to expand contractions.
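If pycontractions won't install, one rough stand-in is a plain lookup-based expander. The sketch below uses a small hypothetical table of mappings, and unlike pycontractions it cannot use word embeddings to disambiguate forms like "he's" (he is vs. he has); it is only meant to keep the rest of the pipeline runnable on Windows.

import re

# Hypothetical minimal alternative to pycontractions: a fixed lookup table.
CONTRACTION_MAP = {
    "isn't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "don't": "do not",
    "won't": "will not",
    "i'm": "i am",
    "it's": "it is",
    "they've": "they have",
}

def expand_contractions(text):
    # match any key in the table, case-insensitively, on word boundaries
    pattern = re.compile(r"\b(" + "|".join(map(re.escape, CONTRACTION_MAP)) + r")\b",
                         flags=re.IGNORECASE)
    return pattern.sub(lambda m: CONTRACTION_MAP[m.group(0).lower()], text)

# e.g. expand_contractions("they've said it can't be done")
# -> "they have said it cannot be done"

In normalize_text, the self.cont.expand_texts(...) call could then be swapped for a plain expand_contractions(text), and the gensim and pycontractions imports dropped.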