"""A Python script to preprocess text (remove URLs, lowercase, tokenize, etc.).

Forked from MrEliptik/text_preprocessing.py.
"""
import re
import unicodedata

import contractions
import inflect
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
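
# Third-party dependencies (assumed already installed): contractions, inflect, nltk.
#   pip install contractions inflect nltk
# NLTK corpora needed at runtime (download once):
#   nltk.download('punkt')      # tokenizer models for nltk.word_tokenize
#   nltk.download('stopwords')  # stop word list
#   nltk.download('wordnet')    # lemmatizer data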

def remove_html(text):
    """Remove leading HTML paragraph markup from a string"""
    return re.sub(r'^<p.*</p>', '', text)

def replace_contractions(text):
    """Replace contractions in a string of text"""
    return contractions.fix(text)

def remove_URL(sample):
    """Remove URLs from a sample string"""
    return re.sub(r"http\S+", "", sample)

def remove_non_ascii(words):
    """Remove non-ASCII characters from a list of tokenized words"""
    return [unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            for word in words]

def to_lowercase(words):
    """Convert all characters to lowercase in a list of tokenized words"""
    return [word.lower() for word in words]

def remove_punctuation(words):
    """Remove punctuation from a list of tokenized words"""
    # Punctuation-only tokens become empty strings; normalize() drops them later
    return [re.sub(r'[^\w\s]', '', word) for word in words]

def replace_numbers(words):
    """Replace all integer occurrences in a list of tokenized words with their textual representation"""
    p = inflect.engine()
    return [p.number_to_words(word) if word.isdigit() else word for word in words]

def remove_stopwords(words):
    """Remove stop words from a list of tokenized words"""
    # Build the set once instead of re-reading the corpus for every word
    stop_words = set(stopwords.words('english'))
    return [word for word in words if word not in stop_words]

def stem_words(words):
    """Stem words in a list of tokenized words"""
    stemmer = LancasterStemmer()
    return [stemmer.stem(word) for word in words]

def lemmatize_verbs(words):
    """Lemmatize verbs in a list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word, pos='v') for word in words]

def normalize(words):
    """Apply the full normalization pipeline to a list of tokenized words"""
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    words = remove_stopwords(words)
    # Drop empty tokens left behind by punctuation removal
    return [word for word in words if word]

def preprocess(sample):
    """Clean a raw text sample and return the normalized tokens joined by spaces"""
    # sample = remove_html(sample)  # optional: strip HTML paragraph markup first
    sample = remove_URL(sample)
    sample = replace_contractions(sample)
    # Tokenize
    words = nltk.word_tokenize(sample)
    # Normalize
    words = normalize(words)
    return ' '.join(words)
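
# Minimal usage sketch (the sample sentence and expected output below are
# illustrative, not from the original gist):
if __name__ == '__main__':
    text = "Check out https://example.com, it's got 2 great tutorials!"
    print(preprocess(text))
    # Roughly: "check got two great tutorials"
    # (URL stripped, "it's" expanded and then removed as stop words,
    #  punctuation dropped, "2" spelled out)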