Skip to content

Instantly share code, notes, and snippets.

@Muhammad4hmed
Last active December 21, 2020 09:00
Show Gist options
  • Save Muhammad4hmed/51de260ebbcdfc02efc83af8c4abb651 to your computer and use it in GitHub Desktop.
Save Muhammad4hmed/51de260ebbcdfc02efc83af8c4abb651 to your computer and use it in GitHub Desktop.
Basic Pre Processing NLP
import re, string, unicodedata
import nltk
import contractions
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
def clean_text(s):
    """
    Strip punctuation from a string and normalise its whitespace.

    :param s: string
    :return: cleaned string, with every character from
        ``string.punctuation`` removed and tokens separated by exactly
        one space (no leading or trailing whitespace)
    """
    # Remove punctuation FIRST. If it were removed after the whitespace
    # clean-up, a standalone punctuation token such as the dash in
    # "a - b" would leave two adjacent spaces behind.
    s = s.translate(str.maketrans('', '', string.punctuation))
    # split() with no arguments splits on any run of whitespace (spaces,
    # tabs, newlines, ...); joining the tokens with a single space removes
    # all kinds of weird spacing:
    # "hi.   how \t are  you" becomes "hi how are you"
    # you can add more cleaning here if you want
    return " ".join(s.split())
def replace_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded
def remove_URL(sample):
    """Remove URLs from a sample string.

    A URL is any run of non-whitespace characters that starts with
    ``http://`` or ``https://``, or with ``www.`` at a word boundary.
    Matching is case-insensitive. Ordinary words that merely begin with
    "http" (e.g. "https", "httpd") are left untouched.

    :param sample: string to clean
    :return: ``sample`` with every URL replaced by the empty string
        (surrounding whitespace is not collapsed)
    """
    # Require the "://" separator so plain words starting with "http" survive;
    # \b before "www." keeps words such as "awww.cute" from matching.
    return re.sub(r"https?://\S+|\bwww\.\S+", "", sample, flags=re.IGNORECASE)
def remove_non_ascii(words):
    """Remove non-ASCII characters from each token.

    :param words: either a raw string, which is split on whitespace into
        tokens first, or an iterable of already-tokenized words
    :return: list of tokens with accented characters reduced to their ASCII
        base letters and all other non-ASCII characters dropped. A token made
        up solely of non-ASCII characters comes back as an empty string, so
        the output always has one entry per input token.
    """
    if isinstance(words, str):
        words = words.split()
    # NFKD decomposition separates base letters from their combining marks;
    # encoding to ASCII with errors='ignore' then drops the marks and any
    # other character that has no ASCII form.
    return [
        unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('ascii')
        for word in words
    ]
def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words.

    :param words: iterable of string tokens
    :return: new list with ``str.lower()`` applied to every token, in order
    """
    return [word.lower() for word in words]
def remove_punctuation(words):
    """Remove punctuation from list of tokenized words.

    Every character that is neither a word character (``\\w``: letters,
    digits, underscore) nor whitespace is deleted from each token. Tokens
    that end up empty are dropped from the result.

    :param words: iterable of string tokens
    :return: list of the cleaned, non-empty tokens in their original order
    """
    # Compile once instead of handing the pattern to re.sub for every token.
    not_word_or_space = re.compile(r'[^\w\s]')
    stripped = (not_word_or_space.sub('', word) for word in words)
    return [word for word in stripped if word != '']
def replace_numbers(words):
    """Spell out every all-digit token in a list of tokenized words.

    Tokens for which ``str.isdigit()`` is true are replaced by the result of
    ``inflect``'s ``number_to_words``; every other token is passed through
    unchanged.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]
def remove_stopwords(words):
    """Remove stop words from list of tokenized words.

    :param words: iterable of string tokens
    :return: list of the tokens that are not in NLTK's English stop-word
        list, in their original order. The comparison is an exact string
        match.
    """
    # Fetch the stop-word list once and keep it in a set. Previously
    # stopwords.words('english') was re-evaluated and scanned linearly for
    # every single token.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]
def stem_words(words):
    """Return the Lancaster stem of each token in a list of tokenized words."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]
def lemmatize_verbs(words):
    """Lemmatize each token in a list of tokenized words as a verb.

    Every token is passed to ``WordNetLemmatizer.lemmatize`` with ``pos='v'``.
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]
def normalize(words):
    """Run the normalisation steps over ``words`` and return the result.

    The steps are applied in this order, each one receiving the previous
    step's output: remove_non_ascii, to_lowercase, remove_punctuation,
    replace_numbers, remove_stopwords.
    """
    steps = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
    )
    for step in steps:
        words = step(words)
    return words
def preprocess(sample):
    """Clean one raw text sample and return the output of ``normalize``.

    URLs are removed first, then contractions are replaced, and the
    resulting string is handed to ``normalize``.
    """
    # The nltk.word_tokenize step is disabled, so normalize() receives the
    # cleaned string itself rather than a token list.
    return normalize(replace_contractions(remove_URL(sample)))
def remove(text):
    """Flatten a token list (or any value) into a plain string.

    Square brackets, commas and single quotes are stripped from the text.

    :param text: a list of tokens, or any other object
    :return: for a list, the tokens joined by single spaces with the
        characters above removed from each token; for anything else,
        ``str(text)`` with those characters removed
    """
    unwanted = str.maketrans('', '', "[],'")
    if isinstance(text, list):
        # Join the tokens directly instead of going through str(list):
        # the list repr wraps a token containing an apostrophe in double
        # quotes, which would otherwise be left in the output.
        return ' '.join(str(token).translate(unwanted) for token in text)
    # One translate pass replaces the four chained .replace() calls.
    return str(text).translate(unwanted)
# Run every review through preprocess() and then flatten the result to a
# plain string with remove(). `train` and `test` must already exist with a
# 'review' column (presumably pandas DataFrames loaded elsewhere; confirm).
train['review'] = train['review'].apply(preprocess).apply(remove)
test['review'] = test['review'].apply(preprocess).apply(remove)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment