NLP Tutorials - Part 1: Beginner's guide to Text Pre-processing
import re
import nltk
import string
import unicodedata

# For Tokenizer
nltk.download('punkt')
# For Lemmatizer
nltk.download('wordnet')
# Multilingual WordNet data (may be required by the lemmatizer on newer NLTK releases)
nltk.download('omw-1.4')
# For Stopwords
nltk.download('stopwords')

# Importing NLTK preprocessing methods
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
# Removing Accented Characters
def remove_accented_chars(text):
    # NFKD splits accented letters into base letter + combining mark; the ASCII encode drops the marks
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text
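# Quick check (illustrative): accented characters are folded to plain ASCII
print(remove_accented_chars('café résumé'))  # -> cafe resume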
# Removing Special Characters
def remove_special_characters(text):
    # Removing everything apart from alphanumeric chars and whitespace
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Removing numbers as well, leaving only letters and whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text
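# Quick check (illustrative): symbols and digits are stripped, letters and spaces survive
print(remove_special_characters('Hello @world #2024!'))  # -> 'Hello world ' (leftover spaces are dropped later by the tokenizer)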
# Converting to Lower Case
def to_lower(text):
    return text.lower().strip()
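# Quick check (illustrative): lowercases and trims surrounding whitespace
print(to_lower('  MiXeD CaSe  '))  # -> mixed case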
# Removing Punctuation
def remove_p(text):
    # Strip ASCII punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Strip typographic quotes and ellipses that string.punctuation misses
    text = re.sub(r'[‘’“”…]', '', text)
    # Drop newlines
    text = re.sub(r'\n', '', text)
    return text
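# Quick check (illustrative): ASCII and typographic punctuation are both removed
print(remove_p('Hello, world! “Quoted…”'))  # -> Hello world Quoted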
# Tokenization
def tokenization(text):
    tokens = nltk.word_tokenize(text)
    return tokens
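# Quick check (illustrative): word_tokenize splits text into word tokens
print(tokenization('natural language processing'))  # -> ['natural', 'language', 'processing']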
# Stopword Removal
# A set gives O(1) membership tests; the word list itself is unchanged
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(tokens):
    filtered_tokens = [token for token in tokens if token not in STOPWORDS]
    return filtered_tokens
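# Quick check (illustrative): common function words are dropped; matching is
# case-sensitive, so this assumes the pipeline has already lowercased the tokens
print(remove_stopwords(['this', 'is', 'a', 'simple', 'test']))  # -> ['simple', 'test']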
# Stemming
ps = PorterStemmer()
def stem(words):
    stemmed_tokens = [ps.stem(word) for word in words]
    return stemmed_tokens
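# Quick check (illustrative): Porter stemming chops suffixes, sometimes leaving non-words
print(stem(['running', 'studies', 'connection']))  # -> ['run', 'studi', 'connect']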
# Lemmatization
lemmatizer = WordNetLemmatizer()
def lemmatize(words):
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in words]
    return lemmatized_tokens
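# Quick check (illustrative): lemmatization returns dictionary words; the default
# POS is noun, so verbs like 'running' pass through unchanged unless pos='v' is given
print(lemmatize(['studies', 'mice', 'running']))  # -> ['study', 'mouse', 'running']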
# Function which calls the above functions as a single pipeline
def text_preprocess(text):
    text = remove_accented_chars(text)
    text = remove_special_characters(text)
    text = to_lower(text)
    text = remove_p(text)
    tokens = tokenization(text)
    tokens = remove_stopwords(tokens)
    # tokens = stem(tokens)  # optional: stem instead of lemmatizing
    tokens = lemmatize(tokens)
    return ' '.join(tokens)
sentence = "Type your sentence to be processed"
text_preprocess(sentence)
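# Illustrative end-to-end run (exact output can vary slightly across NLTK versions):
# text_preprocess("Cats are running faster than the dogs!")
# -> 'cat running faster dog'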