# Muhammad4hmed/Basic-Pre-Processing-NLP.py

## Basic-Pre-Processing-NLP.py
import re, string, unicodedata
import nltk
import contractions
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

def clean_text(s):
    """
        Collapse whitespace and strip ASCII punctuation from a string.

        :param s: string
        :return: cleaned string
    """
    # Splitting on any whitespace and re-joining with one space collapses
    # tabs, newlines and repeated blanks and trims both ends, e.g.
    # "hi.   how \t are  you" -> "hi. how are you".
    collapsed = " ".join(s.split())

    # Character class holding every symbol of string.punctuation; each
    # match is deleted, nothing is put in its place (so "a - b" ends up
    # with two consecutive spaces).
    punctuation_class = f'[{re.escape(string.punctuation)}]'
    cleaned = re.sub(punctuation_class, '', collapsed)

    # Further cleaning steps can be added here before returning.
    return cleaned

def replace_contractions(text: str) -> str:
    """Replace contractions in string of text.

    Thin wrapper: hands ``text`` to ``contractions.fix`` and returns
    whatever that call produces.

    :param text: input string (``preprocess`` passes the URL-stripped sample)
    :return: the result of ``contractions.fix(text)``
    """
    return contractions.fix(text)

def remove_URL(sample):
    """Remove URLs from a sample string.

    A URL is taken to be a run of non-whitespace characters that starts
    with ``http://`` or ``https://`` (anywhere), or with ``www.`` at a
    word boundary.  Matching is case-insensitive.  Ordinary words that
    merely begin with "http" (for example "httpd") are kept.

    :param sample: input string
    :return: ``sample`` with every URL deleted; the whitespace around a
        removed URL is left as it was
    """
    # Require the "://" separator so that plain words starting with
    # "http" are not swallowed; scheme-less "www." links are covered by
    # the second alternative.
    return re.sub(r"https?://\S+|\bwww\.\S+", "", sample, flags=re.IGNORECASE)

def remove_non_ascii(words):
    """Remove non-ASCII characters from a string or a list of tokenized words.

    Each token is NFKD-normalised, so accented letters decompose into a
    base letter plus combining marks; everything that still cannot be
    encoded as ASCII is then dropped.

    :param words: either a string, which is split on whitespace, or an
        iterable of string tokens, which is used as given
    :return: list with one ASCII-only string per input token; a token
        made up solely of non-ASCII characters becomes ``''`` and is
        kept in place
    """
    # A bare string is tokenised here; anything else is assumed to be
    # tokenised already, matching the other helpers in this file.
    tokens = words.split() if isinstance(words, str) else words
    return [
        unicodedata.normalize('NFKD', token).encode('ascii', 'ignore').decode('ascii')
        for token in tokens
    ]

def to_lowercase(words):
    """Lower-case every token.

    :param words: iterable of string tokens
    :return: new list with ``str.lower()`` applied to each token, in order
    """
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Strip punctuation from each token and discard tokens left empty.

    Every character that is neither a word character (``\\w``, which
    includes the underscore) nor whitespace is deleted.

    :param words: iterable of string tokens
    :return: list of the stripped tokens, without those that became ``''``
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def replace_numbers(words):
    """Replace all integer tokens with their textual representation.

    A token counts as an integer when ``str.isdigit()`` is true for it;
    such tokens are passed to ``inflect``'s ``number_to_words``.  All
    other tokens are copied through unchanged.

    :param words: iterable of string tokens
    :return: list with one entry per input token
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Remove stop words from list of tokenized words.

    Tokens are compared verbatim against NLTK's English stop-word list,
    so the check is case-sensitive (``normalize`` lower-cases tokens
    before calling this function).

    :param words: iterable of string tokens
    :return: list of the tokens that are not in the stop-word list
    """
    # Fetch the list once and keep it as a set: the original looked it up
    # again for every single token and scanned it linearly.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def stem_words(words):
    """Stem every token with NLTK's ``LancasterStemmer``.

    :param words: iterable of string tokens
    :return: list of stems, one per input token, in order
    """
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Lemmatize every token as a verb with NLTK's ``WordNetLemmatizer``.

    Each token is passed to ``lemmatize`` with ``pos='v'``.

    :param words: iterable of string tokens
    :return: list of lemmas, one per input token, in order
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

def normalize(words):
    """Run the token clean-up stages in their fixed order.

    The input goes through ``remove_non_ascii``, ``to_lowercase``,
    ``remove_punctuation``, ``replace_numbers`` and ``remove_stopwords``;
    each stage receives the previous stage's output.

    :param words: the text to normalise; it is handed unchanged to
        ``remove_non_ascii``, the first stage
    :return: whatever the last stage, ``remove_stopwords``, returns
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
    )
    for stage in pipeline:
        words = stage(words)
    return words

def preprocess(sample):
    """Clean one raw text sample.

    Strips URLs, expands contractions and then runs ``normalize`` on the
    result.

    :param sample: raw text string
    :return: the value returned by ``normalize``
    """
    without_urls = remove_URL(sample)
    expanded = replace_contractions(without_urls)
    # There is no separate tokenisation step here (the nltk.word_tokenize
    # call was disabled); the string goes straight into normalize().
    return normalize(expanded)

def remove(text):
    """Return ``str(text)`` without brackets, commas and single quotes.

    Used below to turn the printed form of a token list, such as
    ``"['good', 'film']"``, back into ``"good film"``.

    :param text: any object; it is converted with ``str()`` first
    :return: the string with every ``[``, ``]``, ``,`` and ``'`` deleted
    """
    flattened = str(text)
    for unwanted in ('[', ']', ',', '\''):
        flattened = flattened.replace(unwanted, '')
    return flattened

# Clean the 'review' column of both data sets: preprocess() runs on each
# review, then remove() flattens the printed form of its result back into
# a plain string.
# NOTE(review): `train` and `test` are not defined in this file --
# presumably pandas DataFrames created earlier (e.g. in a notebook); confirm.
train['review'] = train['review'].apply(preprocess).apply(remove)
test['review'] = test['review'].apply(preprocess).apply(remove)
	import re, string, unicodedata
	import nltk
	import contractions
	import inflect
	from nltk import word_tokenize, sent_tokenize
	from nltk.corpus import stopwords
	from nltk.stem import LancasterStemmer, WordNetLemmatizer

	def clean_text(s):
		"""
		Collapse whitespace and strip ASCII punctuation from a string.

		:param s: string
		:return: cleaned string
		"""
		# Splitting on any whitespace and re-joining with one space collapses
		# tabs, newlines and repeated blanks and trims both ends, e.g.
		# "hi.   how \t are  you" -> "hi. how are you".
		collapsed = " ".join(s.split())

		# Delete every character of string.punctuation; nothing is put in
		# its place (so "a - b" ends up with two consecutive spaces).
		punctuation_class = f'[{re.escape(string.punctuation)}]'
		cleaned = re.sub(punctuation_class, '', collapsed)

		# Further cleaning steps can be added here before returning.
		return cleaned

	def replace_contractions(text):
		"""Replace contractions in string of text.

		Thin wrapper: hands ``text`` to ``contractions.fix`` and returns
		whatever that call produces.

		:param text: input string
		:return: the result of ``contractions.fix(text)``
		"""
		return contractions.fix(text)

	def remove_URL(sample):
		"""Remove URLs from a sample string.

		A URL is taken to be a run of non-whitespace characters that starts
		with ``http://`` or ``https://`` (anywhere), or with ``www.`` at a
		word boundary.  Matching is case-insensitive.  Ordinary words that
		merely begin with "http" (for example "httpd") are kept.

		:param sample: input string
		:return: ``sample`` with every URL deleted; the whitespace around a
			removed URL is left as it was
		"""
		# Require the "://" separator so that plain words starting with
		# "http" are not swallowed; scheme-less "www." links are covered by
		# the second alternative.
		return re.sub(r"https?://\S+|\bwww\.\S+", "", sample, flags=re.IGNORECASE)

	def remove_non_ascii(words):
		"""Remove non-ASCII characters from a string or a list of tokenized words.

		Each token is NFKD-normalised, so accented letters decompose into a
		base letter plus combining marks; everything that still cannot be
		encoded as ASCII is then dropped.

		:param words: either a string, which is split on whitespace, or an
			iterable of string tokens, which is used as given
		:return: list with one ASCII-only string per input token; a token
			made up solely of non-ASCII characters becomes ``''`` and is
			kept in place
		"""
		# A bare string is tokenised here; anything else is assumed to be
		# tokenised already.
		tokens = words.split() if isinstance(words, str) else words
		return [
			unicodedata.normalize('NFKD', token).encode('ascii', 'ignore').decode('ascii')
			for token in tokens
		]

	def to_lowercase(words):
		"""Lower-case every token.

		:param words: iterable of string tokens
		:return: new list with ``str.lower()`` applied to each token, in order
		"""
		return [token.lower() for token in words]

	def remove_punctuation(words):
		"""Strip punctuation from each token and discard tokens left empty.

		Every character that is neither a word character (``\\w``, which
		includes the underscore) nor whitespace is deleted.

		:param words: iterable of string tokens
		:return: list of the stripped tokens, without those that became ``''``
		"""
		stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
		return [token for token in stripped if token != '']

	def replace_numbers(words):
		"""Replace all integer tokens with their textual representation.

		A token counts as an integer when ``str.isdigit()`` is true for it;
		such tokens are passed to ``inflect``'s ``number_to_words``.  All
		other tokens are copied through unchanged.

		:param words: iterable of string tokens
		:return: list with one entry per input token
		"""
		engine = inflect.engine()
		return [
			engine.number_to_words(token) if token.isdigit() else token
			for token in words
		]

	def remove_stopwords(words):
		"""Remove stop words from list of tokenized words.

		Tokens are compared verbatim against NLTK's English stop-word list,
		so the check is case-sensitive.

		:param words: iterable of string tokens
		:return: list of the tokens that are not in the stop-word list
		"""
		# Fetch the list once and keep it as a set instead of looking it up
		# again, and scanning it linearly, for every single token.
		english_stopwords = set(stopwords.words('english'))
		return [word for word in words if word not in english_stopwords]

	def stem_words(words):
		"""Stem every token with NLTK's ``LancasterStemmer``.

		:param words: iterable of string tokens
		:return: list of stems, one per input token, in order
		"""
		lancaster = LancasterStemmer()
		return [lancaster.stem(token) for token in words]

	def lemmatize_verbs(words):
		"""Lemmatize every token as a verb with NLTK's ``WordNetLemmatizer``.

		Each token is passed to ``lemmatize`` with ``pos='v'``.

		:param words: iterable of string tokens
		:return: list of lemmas, one per input token, in order
		"""
		wordnet = WordNetLemmatizer()
		return [wordnet.lemmatize(token, pos='v') for token in words]

	def normalize(words):
		"""Run the token clean-up stages in their fixed order.

		The input goes through ``remove_non_ascii``, ``to_lowercase``,
		``remove_punctuation``, ``replace_numbers`` and ``remove_stopwords``;
		each stage receives the previous stage's output.

		:param words: the text to normalise; it is handed unchanged to
			``remove_non_ascii``, the first stage
		:return: whatever the last stage, ``remove_stopwords``, returns
		"""
		words = remove_non_ascii(words)
		words = to_lowercase(words)
		words = remove_punctuation(words)
		words = replace_numbers(words)
		words = remove_stopwords(words)
		return words

	def preprocess(sample):
		"""Clean one raw text sample.

		Strips URLs, expands contractions and then runs ``normalize`` on the
		result.

		:param sample: raw text string
		:return: the value returned by ``normalize``
		"""
		sample = remove_URL(sample)
		sample = replace_contractions(sample)
		# There is no separate tokenisation step here (the nltk.word_tokenize
		# call was disabled); the string goes straight into normalize().
		return normalize(sample)

	def remove(text):
		"""Return ``str(text)`` without brackets, commas and single quotes.

		Turns the printed form of a token list, such as
		``"['good', 'film']"``, back into ``"good film"``.

		:param text: any object; it is converted with ``str()`` first
		:return: the string with every ``[``, ``]``, ``,`` and ``'`` deleted
		"""
		return str(text).replace('[','').replace(']','').replace(',','').replace('\'','')

	# Clean the 'review' column of both data sets: preprocess() runs on each
	# review, then remove() flattens the printed form of its result back into
	# a plain string.
	# NOTE(review): `train` and `test` are not defined in this file --
	# presumably pandas DataFrames created earlier; confirm.
	train['review'] = train['review'].apply(preprocess).apply(remove)
	test['review'] = test['review'].apply(preprocess).apply(remove)