Skip to content

Instantly share code, notes, and snippets.

Last active March 18, 2024 09:51
Show Gist options
  • Save pranavraikote/9e66a03a8f77f317009c9904481e50e1 to your computer and use it in GitHub Desktop.
Save pranavraikote/9e66a03a8f77f317009c9904481e50e1 to your computer and use it in GitHub Desktop.
NLP Tutorials - Part 1: Beginner's guide to Text Pre-processing
import re
import string
import unicodedata

import nltk
# For Tokenizer: nltk.download('punkt')
# For Lemmatizer: nltk.download('wordnet')
# For Stopwords: nltk.download('stopwords')
# Importing nltk preprocessing methods
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
# Removing Accented Characters
def remove_accented_chars(text):
    """Return *text* with accents stripped and non-ASCII characters dropped.

    The string is NFKD-normalized, encoded to ASCII with unencodable
    characters ignored, and decoded back to ``str``.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')
# Removing Special Characters
def remove_special_characters(text, remove_digits=True):
    """Return *text* with special characters removed.

    Letters (a-z, A-Z) and whitespace are always kept.

    Args:
        text: The string to clean.
        remove_digits: When true (the default), digits are removed as well,
            so only letters and whitespace remain. When false, digits 0-9
            are kept.

    Returns:
        The cleaned string.
    """
    # One pass is enough: the letters-only pattern already strips digits,
    # so a second alphanumeric pass could never match anything.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)
# Converting to Lower Case
def to_lower(text):
    """Return *text* lower-cased, with leading and trailing whitespace removed."""
    lowered = text.lower()
    return lowered.strip()
# Removing Punctuation
# Deletion table covering ASCII punctuation plus typographic quotes
# (U+2018, U+2019, U+201C, U+201D) and the ellipsis character (U+2026).
_PUNCTUATION_TABLE = str.maketrans(
    '', '', string.punctuation + '\u2018\u2019\u201c\u201d\u2026'
)


def remove_p(text):
    """Return *text* without punctuation and with newlines turned into spaces.

    Removes every character in ``string.punctuation`` as well as curly
    single/double quotes and the ellipsis character.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = text.translate(_PUNCTUATION_TABLE)
    # Replace rather than delete newlines so the last word of one line is
    # not fused with the first word of the next.
    return text.replace('\n', ' ')
# Tokenization
def tokenization(text):
    """Return the tokens produced by ``nltk.word_tokenize`` for *text*."""
    return nltk.word_tokenize(text)
# Stopword Removal
STOPWORDS = stopwords.words('english')


def remove_stopwords(tokens):
    """Return the tokens that are not in ``STOPWORDS``, in original order.

    Matching is an exact comparison against the entries of ``STOPWORDS``;
    no case folding is done here.

    Args:
        tokens: An iterable of string tokens.

    Returns:
        A list of the tokens that are not stopwords.
    """
    # Build the set once per call: membership tests become O(1) instead of
    # scanning the whole list for every token, and any entries added to the
    # STOPWORDS list before the call are still honoured.
    stopword_set = set(STOPWORDS)
    return [token for token in tokens if token not in stopword_set]
# Stemming
ps = PorterStemmer()


def stem(words):
    """Return a list holding the ``PorterStemmer`` stem of each word in *words*."""
    return list(map(ps.stem, words))
# Lemmatization
lemmatizer = WordNetLemmatizer()


def lemmatize(words):
    """Return a list holding the ``WordNetLemmatizer`` lemma of each word in *words*."""
    return list(map(lemmatizer.lemmatize, words))
# Function which will call above functions in a pipeline
def text_preprocess(text):
    """Run the whole pre-processing pipeline on *text*.

    String stages, in order: ``remove_accented_chars``,
    ``remove_special_characters``, ``to_lower``, ``remove_p``.
    Token stages, in order: ``tokenization``, ``remove_stopwords``,
    ``lemmatize``. Stemming (``stem``) is not applied.

    Returns:
        The surviving tokens joined by single spaces.
    """
    string_stages = (
        remove_accented_chars,
        remove_special_characters,
        to_lower,
        remove_p,
    )
    for clean in string_stages:
        text = clean(text)

    tokens = tokenization(text)
    for transform in (remove_stopwords, lemmatize):
        tokens = transform(tokens)
    return ' '.join(tokens)
# Placeholder input; replace the string with the sentence to pre-process.
sentence = "Type your sentence to be processed"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment