Skip to content

Instantly share code, notes, and snippets.

@vrjkmr
Last active August 28, 2018 03:00
Show Gist options
  • Save vrjkmr/192ac53752c737d057cdde50e369bafd to your computer and use it in GitHub Desktop.
Save vrjkmr/192ac53752c737d057cdde50e369bafd to your computer and use it in GitHub Desktop.
Code snippet to preprocess and clean text.
# -*- coding: utf-8 -*-
# imports
import re
from nltk.corpus import stopwords as sw
# stopwords : set of English stop words read from NLTK's stopwords corpus.
# Built once, when this module is imported; storing it as a set makes the
# per-word `in` test in TextCleaner.remove_stopwords a hash lookup.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available
# locally (e.g. via nltk.download('stopwords')) - confirm for the target env.
stopwords = set(sw.words('english'))
# TextCleaner : cleans text
class TextCleaner:
    """Clean raw text through a fixed sequence of string rewrites.

    Each step (punctuation isolation, repeated-character shortening,
    contraction expansion, token filtering, stop-word removal) is available
    as its own method; ``clean_text`` chains them in order.
    """

    # Characters that ``isolate_characters`` rewrites on its own. A marker
    # that is one of these is only meaningful *after* isolation has run.
    _HANDLED_BY_ISOLATION = frozenset("#.!?-_,:()[]{}+$%")

    # Default substrings that cause a whitespace-delimited token to be dropped.
    # Kept as a tuple so the default argument can never be mutated by a caller.
    _UNNECESSARY_CHARS = (
        '@', '&', '=', '/', '~', '¿', '#', '.', '?', '!', '-', '_', ',', ':',
        '(', ')', '[', ']', '{', '}', 'http', '\\',
    )

    # Punctuation that gets a space on both sides.
    _PAD_BOTH_SIDES = re.compile(r"[.!?\-_,:()\[\]{}$]")
    # Characters that are replaced by a single space (i.e. removed).
    _REPLACE_WITH_SPACE = re.compile(r"[+%]")
    # Three or more consecutive copies of the same character.
    _REPEATED_RUN = re.compile(r"(.)\1{2,}")

    # (contraction, expansion) pairs. ORDER MATTERS: the specific forms
    # ("can't", "let's") must be rewritten before the generic suffixes
    # ("n't", "'s") that would otherwise match inside them.
    _CONTRACTIONS = (
        ("can't", "can not"),
        ("won't", "will not"),
        ("shan't", "shall not"),
        ("n't", " not"),
        ("let's", "let us"),
        ("'s", " is"),
        ("i'm", "i am"),
        ("'re", " are"),
        ("'d", " would"),
        ("'ll", " will"),
        ("y'all", "you all"),
        ("eatin'", "eating"),
        ("doin'", "doing"),
    )

    def __init__(self, stopwords=stopwords):
        """Store the stop-word collection used by ``remove_stopwords``.

        Args:
            stopwords: container of words to drop; anything supporting ``in``
                works. Defaults to the module-level ``stopwords`` value.
        """
        self.stopwords = stopwords

    def isolate_characters(self, text):
        """Separate punctuation from the words it touches.

        ``#`` gets a trailing space only, ``. ! ? - _ , : ( ) [ ] { } $`` get a
        space on both sides, and ``+`` / ``%`` are replaced by a space.

        Args:
            text: string to process.

        Returns:
            The rewritten string; whitespace is not collapsed here.
        """
        # The three rewrites touch disjoint characters and only ever insert
        # spaces, so they are independent of each other's order.
        text = text.replace("#", "# ")
        text = self._PAD_BOTH_SIDES.sub(r" \g<0> ", text)
        return self._REPLACE_WITH_SPACE.sub(" ", text)

    def shorten_duplicates(self, text):
        """Collapse runs of three or more identical characters down to two.

        The rule applies to every character except a newline, so digits and
        spaces are shortened as well ("loooove" -> "loove", "1000" -> "100").

        Args:
            text: string to process.

        Returns:
            The string with long runs shortened.
        """
        return self._REPEATED_RUN.sub(r"\1\1", text)

    def isolate_contractions(self, text):
        """Expand a fixed list of English contractions.

        Matching is case-sensitive (lower-case forms only) and is plain
        substring replacement with no word-boundary check. Every remaining
        ``'s`` becomes `` is``, possessives included.

        Args:
            text: string to process, expected to be lower-cased already.

        Returns:
            The string with the known contractions expanded.
        """
        for contraction, expansion in self._CONTRACTIONS:
            text = text.replace(contraction, expansion)
        return text

    def remove_words_with_chars(self, text, chars=_UNNECESSARY_CHARS):
        """Drop every whitespace-delimited token containing any marker.

        Args:
            text: string to process.
            chars: iterable of substrings (re-iterated for every token, so it
                must not be a one-shot iterator); a token is dropped as soon
                as it contains one of them.

        Returns:
            The surviving tokens joined by single spaces.
        """
        return ' '.join(
            word for word in text.split()
            if not any(marker in word for marker in chars)
        )

    def remove_stopwords(self, text):
        """Drop every whitespace-delimited token found in ``self.stopwords``.

        Args:
            text: string to process.

        Returns:
            The surviving tokens joined by single spaces.
        """
        return ' '.join(word for word in text.split() if word not in self.stopwords)

    def clean_text(self, text, remove_stopwords=False,
                   unnecessary_chars=_UNNECESSARY_CHARS):
        """Run the full cleaning pipeline on ``text``.

        Steps, in order: lower-case; drop tokens containing a marker that
        punctuation isolation does not handle itself (``@``, ``/``, ``http``,
        ...); isolate punctuation; shorten repeated characters; expand
        contractions; drop tokens containing any marker; optionally drop
        stop words.

        Args:
            text: string to clean.
            remove_stopwords: when true, also remove ``self.stopwords``.
            unnecessary_chars: iterable of substrings marking tokens to drop.

        Returns:
            The cleaned text as single-space-separated tokens.
        """
        markers = tuple(unnecessary_chars)
        text = text.lower()

        # URLs, e-mail addresses and similar tokens must be removed while they
        # are still in one piece. Once punctuation isolation has split them on
        # '.', ':', '-' or '_', their fragments no longer contain a marker and
        # would leak into the output ("http://www.example.com" -> "example com").
        whole_token_markers = [
            marker for marker in markers
            if marker not in self._HANDLED_BY_ISOLATION
        ]
        if whole_token_markers:
            text = self.remove_words_with_chars(text, whole_token_markers)

        text = self.isolate_characters(text)
        text = self.shorten_duplicates(text)
        text = self.isolate_contractions(text)
        text = self.remove_words_with_chars(text, markers)
        if remove_stopwords:
            text = self.remove_stopwords(text)
        return text
# sample usage
# These statements are not behind a `__main__` guard, so they run whenever the
# module is executed or imported: a cleaner is built with the default
# stop-word set and the cleaned sentence is printed.
t_cleaner = TextCleaner()
text = "how are y'all doin' today? I loooove me some ice-cream, let's get some!"
# remove_stopwords defaults to False, so stop words are kept; this prints:
# how are you all doing today i loove me some ice cream let us get some
print(t_cleaner.clean_text(text))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment