Last active
August 9, 2023 01:44
-
-
Save lvngd/3695aac64461de2cfb9d50bb11d5fbb3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import string | |
import unidecode | |
from nltk.tokenize import sent_tokenize | |
from nltk.tokenize import word_tokenize | |
from nltk.stem import WordNetLemmatizer | |
from nltk.corpus import stopwords | |
import gensim.downloader as api | |
from pycontractions import Contractions | |
from word2number import w2n | |
""" | |
Text Normalization for NLP | |
-removes extra whitespace within text | |
-converts unicode to ascii | |
-converts to lowercase | |
-removes leading or trailing whitespace | |
-expands contractions | |
-tokenizes sentences and words | |
-removes punctuation | |
-lemmatizes words | |
-removes stopwords | |
""" | |
class TextNormalizer:
    """Turn raw text into a list of cleaned, lemmatized sentences.

    The instance holds the helpers used by ``normalize_text``: a
    contraction expander, a WordNet lemmatizer, a translation table that
    deletes every character in ``string.punctuation``, and the set of
    English stop words.
    """

    def __init__(self):
        """Create the contraction expander, lemmatizer, punctuation table and stop-word set."""
        # The contraction expander is backed by the "glove-twitter-25"
        # vectors fetched through gensim's downloader API.
        self.cont = Contractions(kv_model=api.load("glove-twitter-25"))
        self.lemmatizer = WordNetLemmatizer()
        self.punctuation_table = str.maketrans('', '', string.punctuation)
        self.stop_words = set(stopwords.words('english'))

    def _normalize_sentence(self, sentence):
        """Return one sentence without punctuation or stop words, lemmatized."""
        # Punctuation is deleted first, then surrounding whitespace is trimmed.
        bare = sentence.translate(self.punctuation_table).strip()
        # Stop words are dropped before lemmatizing, so the membership test
        # sees each token exactly as the tokenizer produced it.
        lemmas = [
            self.lemmatizer.lemmatize(token)
            for token in word_tokenize(bare)
            if token not in self.stop_words
        ]
        return ' '.join(lemmas)

    def normalize_text(self, text):
        """Normalize ``text`` and return it as a list of sentence strings.

        Steps, in order: collapse runs of spaces, transliterate to ASCII,
        lowercase, expand contractions, split into sentences, then clean
        each sentence (punctuation removal, trimming, stop-word removal,
        lemmatization).

        Args:
            text: The raw input string.

        Returns:
            A list with one space-joined string per tokenized sentence. An
            entry is the empty string when none of its tokens survive
            filtering.
        """
        # Only runs of the space character are collapsed at this stage.
        prepared = unidecode.unidecode(re.sub(' +', ' ', text)).lower()

        # A single text goes in; if nothing comes back, the unexpanded
        # text is kept as-is.
        expansions = list(self.cont.expand_texts([prepared], precise=True))
        prepared = expansions[0] if expansions else prepared

        return [self._normalize_sentence(chunk) for chunk in sent_tokenize(prepared)]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, I was probably using Python 3.6 or 3.7, and I'm not sure which version of NLTK - on macOS and Linux. But this error isn't coming from NLTK. You could just omit the part that uses pycontractions (comment out lines 8, 9, 29, 39-41) - that's a separate package, and it sounds like there are some issues installing it on Windows. Unfortunately I'm not familiar with Python on Windows, so you might need to find an alternative if you want to expand contractions.