vrjkmr/text_cleaner.py

## text_cleaner.py
# -*- coding: utf-8 -*-

# imports
import re
from nltk.corpus import stopwords as sw

# stopwords : set of the English stopword strings returned by the NLTK corpus reader
stopwords = {*sw.words('english')}

# TextCleaner : cleans text
class TextCleaner:
    """Clean raw text through a fixed sequence of string rewrites.

    Every step is a method that takes a string and returns a new string;
    ``clean_text`` chains the steps in a fixed order.
    """

    # Substrings that make remove_words_with_chars drop a word. Shared as the
    # default by remove_words_with_chars and clean_text; a tuple so the
    # default cannot be mutated between calls.
    DEFAULT_UNNECESSARY_CHARS = (
        '@', '&', '=', '/', '~', '¿', '#', '.', '?', '!', '-', '_', ',', ':',
        '(', ')', '[', ']', '{', '}', 'http', '\\',
    )

    # Per-character rewrites for isolate_characters. Each replacement holds
    # only the matched character and spaces, so one translate() pass gives
    # the same result as substituting the characters one after another.
    _ISOLATION_TABLE = str.maketrans({
        '#': '# ',
        '.': ' . ',
        '!': ' ! ',
        '?': ' ? ',
        '-': ' - ',
        '_': ' _ ',
        ',': ' , ',
        ':': ' : ',
        '(': ' ( ',
        ')': ' ) ',
        '[': ' [ ',
        ']': ' ] ',
        '{': ' { ',
        '}': ' } ',
        '+': ' ',
        '$': ' $ ',
        '%': ' ',
    })

    # (contraction, expansion) pairs, applied in this order. Order matters:
    # the specific forms ("can't", "let's") must run before the generic
    # suffixes ("n't", "'s") that would otherwise rewrite them differently.
    _CONTRACTIONS = (
        ("can't", "can not"),
        ("won't", "will not"),
        ("shan't", "shall not"),
        ("n't", " not"),
        ("let's", "let us"),
        ("'s", " is"),
        ("i'm", "i am"),
        ("'re", " are"),
        ("'d", " would"),
        ("'ll", " will"),
        ("y'all", "you all"),
        ("eatin'", "eating"),
        ("doin'", "doing"),
    )

    # A character followed by two or more repeats of itself. '.' does not
    # match a newline, so runs of newlines are left untouched.
    _REPEATED_RUN = re.compile(r"(.)\1{2,}")

    def __init__(self, stopwords=stopwords):
        """Store the collection of words that remove_stopwords filters out.

        Args:
            stopwords: container of strings supporting ``in``; defaults to
                the module-level ``stopwords``. It is stored as given, not
                copied.
        """
        self.stopwords = stopwords

    def isolate_characters(self, text):
        """Pad punctuation with spaces so it splits off as separate words.

        ``. ! ? - _ , : ( ) [ ] { } $`` get a space on both sides, ``#`` gets
        a space after it, and ``+`` and ``%`` are replaced by a single space.

        Args:
            text: string to rewrite.

        Returns:
            The rewritten string.
        """
        return text.translate(self._ISOLATION_TABLE)

    def shorten_duplicates(self, text):
        """Collapse every run of three or more identical characters to two.

        Args:
            text: string to rewrite.

        Returns:
            The rewritten string, e.g. ``"loooove"`` becomes ``"loove"``.
        """
        return self._REPEATED_RUN.sub(r"\1\1", text)

    def isolate_contractions(self, text):
        """Expand a fixed list of English contractions.

        Matching is case-sensitive plain substring replacement with no word
        boundaries; ``clean_text`` lower-cases the text before calling this.

        Args:
            text: string to rewrite.

        Returns:
            The string with every listed contraction expanded.
        """
        for contraction, expansion in self._CONTRACTIONS:
            text = text.replace(contraction, expansion)
        return text

    def remove_words_with_chars(self, text, chars=DEFAULT_UNNECESSARY_CHARS):
        """Drop every whitespace-delimited word containing a listed substring.

        Args:
            text: string to filter.
            chars: iterable of substrings; a word is dropped when any of
                them occurs in it.

        Returns:
            The surviving words joined by single spaces.
        """
        return ' '.join(w for w in text.split() if not any(s in w for s in chars))

    def remove_stopwords(self, text):
        """Drop every whitespace-delimited word found in ``self.stopwords``.

        Args:
            text: string to filter.

        Returns:
            The surviving words joined by single spaces.
        """
        return ' '.join(w for w in text.split() if w not in self.stopwords)

    def clean_text(self, text, remove_stopwords=False,
                   unnecessary_chars=DEFAULT_UNNECESSARY_CHARS):
        """Run the full cleaning pipeline on ``text``.

        Steps, in order: lower-case, isolate punctuation, shorten repeated
        characters, expand contractions, drop words containing any of
        ``unnecessary_chars``, and optionally drop stopwords.

        Args:
            text: string to clean.
            remove_stopwords: when true, also filter out ``self.stopwords``.
            unnecessary_chars: substrings passed to remove_words_with_chars.

        Returns:
            The cleaned string, words separated by single spaces.
        """
        text = text.lower()
        # NOTE(review): punctuation is isolated before the word filter runs,
        # so a URL is already split on '.', ':' etc. by then and only the
        # fragments that still contain a listed substring are dropped
        # ("https://www.google.com" leaves "google com") -- confirm intent.
        text = self.isolate_characters(text)
        text = self.shorten_duplicates(text)
        text = self.isolate_contractions(text)
        text = self.remove_words_with_chars(text, unnecessary_chars)
        if remove_stopwords:
            text = self.remove_stopwords(text)
        return text

# sample usage: clean one sentence with the default settings and print it
text = "how are y'all doin' today? I loooove me some ice-cream, let's get some!"
t_cleaner = TextCleaner()
print(t_cleaner.clean_text(text))
	# -*- coding: utf-8 -*-

	# imports
	import re
	from nltk.corpus import stopwords as sw

	# stopwords : set of the English stopword strings returned by the NLTK corpus reader
	stopwords = {*sw.words('english')}

	# TextCleaner : cleans text
	class TextCleaner:
	    """Clean raw text through a fixed sequence of string rewrites.

	    Every step is a method that takes a string and returns a new string;
	    ``clean_text`` chains the steps in a fixed order.
	    """

	    # Substrings that make remove_words_with_chars drop a word. Shared as the
	    # default by remove_words_with_chars and clean_text; a tuple so the
	    # default cannot be mutated between calls.
	    DEFAULT_UNNECESSARY_CHARS = (
	        '@', '&', '=', '/', '~', '¿', '#', '.', '?', '!', '-', '_', ',', ':',
	        '(', ')', '[', ']', '{', '}', 'http', '\\',
	    )

	    # Per-character rewrites for isolate_characters. Each replacement holds
	    # only the matched character and spaces, so one translate() pass gives
	    # the same result as substituting the characters one after another.
	    _ISOLATION_TABLE = str.maketrans({
	        '#': '# ',
	        '.': ' . ',
	        '!': ' ! ',
	        '?': ' ? ',
	        '-': ' - ',
	        '_': ' _ ',
	        ',': ' , ',
	        ':': ' : ',
	        '(': ' ( ',
	        ')': ' ) ',
	        '[': ' [ ',
	        ']': ' ] ',
	        '{': ' { ',
	        '}': ' } ',
	        '+': ' ',
	        '$': ' $ ',
	        '%': ' ',
	    })

	    # (contraction, expansion) pairs, applied in this order. Order matters:
	    # the specific forms ("can't", "let's") must run before the generic
	    # suffixes ("n't", "'s") that would otherwise rewrite them differently.
	    _CONTRACTIONS = (
	        ("can't", "can not"),
	        ("won't", "will not"),
	        ("shan't", "shall not"),
	        ("n't", " not"),
	        ("let's", "let us"),
	        ("'s", " is"),
	        ("i'm", "i am"),
	        ("'re", " are"),
	        ("'d", " would"),
	        ("'ll", " will"),
	        ("y'all", "you all"),
	        ("eatin'", "eating"),
	        ("doin'", "doing"),
	    )

	    # A character followed by two or more repeats of itself. '.' does not
	    # match a newline, so runs of newlines are left untouched.
	    _REPEATED_RUN = re.compile(r"(.)\1{2,}")

	    def __init__(self, stopwords=stopwords):
	        """Store the collection of words that remove_stopwords filters out.

	        Args:
	            stopwords: container of strings supporting ``in``; defaults to
	                the module-level ``stopwords``. It is stored as given, not
	                copied.
	        """
	        self.stopwords = stopwords

	    def isolate_characters(self, text):
	        """Pad punctuation with spaces so it splits off as separate words.

	        ``. ! ? - _ , : ( ) [ ] { } $`` get a space on both sides, ``#`` gets
	        a space after it, and ``+`` and ``%`` are replaced by a single space.

	        Args:
	            text: string to rewrite.

	        Returns:
	            The rewritten string.
	        """
	        return text.translate(self._ISOLATION_TABLE)

	    def shorten_duplicates(self, text):
	        """Collapse every run of three or more identical characters to two.

	        Args:
	            text: string to rewrite.

	        Returns:
	            The rewritten string, e.g. ``"loooove"`` becomes ``"loove"``.
	        """
	        return self._REPEATED_RUN.sub(r"\1\1", text)

	    def isolate_contractions(self, text):
	        """Expand a fixed list of English contractions.

	        Matching is case-sensitive plain substring replacement with no word
	        boundaries; ``clean_text`` lower-cases the text before calling this.

	        Args:
	            text: string to rewrite.

	        Returns:
	            The string with every listed contraction expanded.
	        """
	        for contraction, expansion in self._CONTRACTIONS:
	            text = text.replace(contraction, expansion)
	        return text

	    def remove_words_with_chars(self, text, chars=DEFAULT_UNNECESSARY_CHARS):
	        """Drop every whitespace-delimited word containing a listed substring.

	        Args:
	            text: string to filter.
	            chars: iterable of substrings; a word is dropped when any of
	                them occurs in it.

	        Returns:
	            The surviving words joined by single spaces.
	        """
	        return ' '.join(w for w in text.split() if not any(s in w for s in chars))

	    def remove_stopwords(self, text):
	        """Drop every whitespace-delimited word found in ``self.stopwords``.

	        Args:
	            text: string to filter.

	        Returns:
	            The surviving words joined by single spaces.
	        """
	        return ' '.join(w for w in text.split() if w not in self.stopwords)

	    def clean_text(self, text, remove_stopwords=False,
	                   unnecessary_chars=DEFAULT_UNNECESSARY_CHARS):
	        """Run the full cleaning pipeline on ``text``.

	        Steps, in order: lower-case, isolate punctuation, shorten repeated
	        characters, expand contractions, drop words containing any of
	        ``unnecessary_chars``, and optionally drop stopwords.

	        Args:
	            text: string to clean.
	            remove_stopwords: when true, also filter out ``self.stopwords``.
	            unnecessary_chars: substrings passed to remove_words_with_chars.

	        Returns:
	            The cleaned string, words separated by single spaces.
	        """
	        text = text.lower()
	        # NOTE(review): punctuation is isolated before the word filter runs,
	        # so a URL is already split on '.', ':' etc. by then and only the
	        # fragments that still contain a listed substring are dropped
	        # ("https://www.google.com" leaves "google com") -- confirm intent.
	        text = self.isolate_characters(text)
	        text = self.shorten_duplicates(text)
	        text = self.isolate_contractions(text)
	        text = self.remove_words_with_chars(text, unnecessary_chars)
	        if remove_stopwords:
	            text = self.remove_stopwords(text)
	        return text

	# sample usage: clean one sentence with the default settings and print it
	text = "how are y'all doin' today? I loooove me some ice-cream, let's get some!"
	t_cleaner = TextCleaner()
	print(t_cleaner.clean_text(text))