Created
September 17, 2019 00:10
-
-
Save TATABOX42/85fd5b299cac4cd5543848f45cac96e8 to your computer and use it in GitHub Desktop.
Built this module in Python 3 to process and clean strings. It removes emojis, URLs, digits, punctuation, and words shorter than a configurable minimum length, and can also count word occurrences and stem tokens.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from string import digits, punctuation | |
from nltk.stem import SnowballStemmer | |
from nltk.tokenize import word_tokenize | |
class ProcessTextMethods:
    """Helpers for cleaning, occurrence-counting and stemming English text."""

    def __init__(self):
        # Snowball stemmer used by stem_input_string.
        self.stemmer = SnowballStemmer("english")
        # Pre-compiled pattern matching runs of emoji / pictograph codepoints.
        self.emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"  # dingbats
            u"\U000024C2-\U0001F251"  # enclosed characters (broad range)
            "]+",
            flags=re.UNICODE,
        )
        # nlp_clean_text drops words shorter than this many characters.
        self.minimum_word_size_allowed = 3

    def nlp_clean_text(self, input_string):
        """Lower-case *input_string* and strip noise for NLP processing.

        Removes curly apostrophes/accents, http(s):// and www. URLs,
        emojis, digits and punctuation, then drops every word shorter
        than ``self.minimum_word_size_allowed``.

        Example:
            ' we Should hodl 3? an$D hodler 2 moar 34'
            -> 'should hodl hodler moar'
        """
        cleaned = input_string.lower()
        # Strip stray apostrophe / acute-accent characters.
        cleaned = cleaned.replace('’', '').replace('´', '')
        # Remove URLs. Raw strings avoid invalid-escape warnings; the
        # character class [$-_@.&+] is a deliberate ASCII range (kept
        # exactly as in the original / common URL-matching recipe).
        cleaned = re.sub(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            '', cleaned)
        # BUGFIX: the dot after 'www' was previously unescaped, so the
        # pattern also matched e.g. 'wwwx...' for any character after 'www'.
        cleaned = re.sub(
            r'www\.(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            '', cleaned)
        # Remove emojis, then map every digit and punctuation char to a space.
        cleaned = self.emoji_pattern.sub('', cleaned)
        cleaned = cleaned.translate({ord(ch): ' ' for ch in digits})
        cleaned = cleaned.translate({ord(ch): ' ' for ch in punctuation})
        # Keep only words long enough to carry meaning.
        return ' '.join(
            word for word in cleaned.split()
            if len(word) >= self.minimum_word_size_allowed
        )

    def word_ocurrence_counter(self, key, input_string_clean):
        """Count non-overlapping literal occurrences of *key*.

        Returns 0 when *input_string_clean* is None or empty.
        BUGFIX: *key* was previously passed to re.finditer unescaped, so a
        key containing regex metacharacters (e.g. 'c++') raised re.error;
        it is now counted as a plain substring, which matches the literal
        behavior the original exhibited for ordinary word keys.

        Example: key = 'hodl'
        """
        if input_string_clean is None or input_string_clean == '':
            return 0
        return input_string_clean.count(key)

    def stem_input_string(self, input_string):
        """Return *input_string* with every NLTK token Snowball-stemmed."""
        tokens = word_tokenize(input_string)
        return ' '.join(self.stemmer.stem(token) for token in tokens)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment