Last active
August 28, 2018 03:00
-
-
Save vrjkmr/192ac53752c737d057cdde50e369bafd to your computer and use it in GitHub Desktop.
Code snippet to preprocess and clean text.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-

# imports
import re

from nltk.corpus import stopwords as sw

# stopwords : set of English stopwords taken from NLTK's stopword corpus.
# It is the default filter list handed to TextCleaner. Loading it requires
# the NLTK "stopwords" corpus to be installed locally
# (e.g. via nltk.download('stopwords')).
stopwords = set(sw.words('english'))
# TextCleaner : cleans text
class TextCleaner:
    """Normalize raw text with a fixed pipeline of regex/string clean-ups.

    The steps are exposed as separate methods so they can be used on their
    own; ``clean_text`` chains them in the intended order.
    """

    # Substrings that cause a whole whitespace-separated token to be dropped.
    # Kept as one immutable tuple so both method signatures share a single,
    # non-mutable default.
    UNNECESSARY_CHARS = (
        '@', '&', '=', '/', '~', '¿', '#', '.', '?', '!', '-', '_', ',',
        ':', '(', ')', '[', ']', '{', '}', 'http', '\\',
    )

    # Punctuation that gets a space on both sides.
    _PAD_BOTH_SIDES = re.compile(r"[.!?\-_,:()\[\]{}$]")
    # Characters that are replaced by a single space.
    _REPLACE_WITH_SPACE = re.compile(r"[+%]")
    # Any character (except newline) repeated three or more times in a row.
    _REPEATED_CHAR = re.compile(r"(.)\1{2,}")

    # Literal contraction -> expansion pairs. Order matters: the specific
    # forms ("can't", "let's") must run before the generic suffixes
    # ("n't", "'s") that would otherwise rewrite them differently.
    _CONTRACTIONS = (
        ("can't", "can not"),
        ("won't", "will not"),
        ("shan't", "shall not"),
        ("n't", " not"),
        ("let's", "let us"),
        ("'s", " is"),
        ("i'm", "i am"),
        ("'re", " are"),
        ("'d", " would"),
        ("'ll", " will"),
        ("y'all", "you all"),
        ("eatin'", "eating"),
        ("doin'", "doing"),
    )

    def __init__(self, stopwords=stopwords):
        """Store the collection of words that ``remove_stopwords`` filters out.

        stopwords : a list/set of strings (defaults to the module-level set).
        """
        self.stopwords = stopwords

    def isolate_characters(self, text):
        """Separate punctuation from neighbouring words.

        '#' gets a trailing space, '+' and '%' are replaced by a space, and
        the remaining punctuation marks are padded with a space on each side.
        """
        text = text.replace("#", "# ")
        text = self._PAD_BOTH_SIDES.sub(r" \g<0> ", text)
        return self._REPLACE_WITH_SPACE.sub(" ", text)

    def shorten_duplicates(self, text):
        """Collapse runs of 3+ identical characters down to exactly two.

        Applies to every character except newline, so digits and spaces are
        shortened too (e.g. "loooove" -> "loove", "1000" -> "100").
        """
        return self._REPEATED_CHAR.sub(r"\1\1", text)

    def isolate_contractions(self, text):
        """Expand contractions (expects lower-cased text).

        Matching is plain substring replacement, so every "'s" becomes " is",
        including possessives.
        """
        for contraction, expansion in self._CONTRACTIONS:
            text = text.replace(contraction, expansion)
        return text

    def remove_words_with_chars(self, text, chars=UNNECESSARY_CHARS):
        """Drop every whitespace-separated token containing any item of ``chars``.

        The surviving tokens are re-joined with single spaces.
        """
        return ' '.join(
            word for word in text.split()
            if not any(fragment in word for fragment in chars)
        )

    def remove_stopwords(self, text):
        """Drop every whitespace-separated token found in ``self.stopwords``."""
        return ' '.join(
            word for word in text.split() if word not in self.stopwords
        )

    def clean_text(self, text, remove_stopwords=False,
                   unnecessary_chars=UNNECESSARY_CHARS):
        """Run the full cleaning pipeline on ``text`` and return the result.

        Steps, in order: lower-case, isolate punctuation, shorten repeated
        characters, expand contractions, drop tokens containing any of
        ``unnecessary_chars`` and, only when ``remove_stopwords`` is true,
        drop stopwords.
        """
        text = text.lower()
        text = self.isolate_characters(text)
        text = self.shorten_duplicates(text)
        text = self.isolate_contractions(text)
        text = self.remove_words_with_chars(text, unnecessary_chars)
        if remove_stopwords:
            text = self.remove_stopwords(text)
        return text
# sample usage
t_cleaner = TextCleaner()
text = "how are y'all doin' today? I loooove me some ice-cream, let's get some!"
# prints: how are you all doing today i loove me some ice cream let us get some
print(t_cleaner.clean_text(text))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment