Skip to content

Instantly share code, notes, and snippets.

@TATABOX42
Created September 17, 2019 00:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TATABOX42/85fd5b299cac4cd5543848f45cac96e8 to your computer and use it in GitHub Desktop.
Save TATABOX42/85fd5b299cac4cd5543848f45cac96e8 to your computer and use it in GitHub Desktop.
Built this module in Python 3 to process and clean strings. It removes emojis, words that are smaller than a certain length, etc.
import re
from string import digits, punctuation
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
class ProcessTextMethods:
    """Utility methods for cleaning and normalizing free-form text.

    Strips emojis, URLs, digits, punctuation, and words shorter than a
    configurable minimum length; also provides substring counting and
    Snowball stemming (English).
    """

    def __init__(self):
        # Snowball ("Porter2") stemmer for English.
        self.stemmer = SnowballStemmer("english")
        # Matches runs of characters in common emoji/pictograph ranges.
        self.emoji_pattern = re.compile(
            "["
            "\U0001F600-\U0001F64F"  # emoticons
            "\U0001F300-\U0001F5FF"  # symbols & pictographs
            "\U0001F680-\U0001F6FF"  # transport & map symbols
            "\U0001F1E0-\U0001F1FF"  # flags (iOS)
            "\U00002702-\U000027B0"
            "\U000024C2-\U0001F251"
            "]+",
            flags=re.UNICODE,
        )
        # URL patterns, compiled once here instead of on every call.
        # Raw strings fix the invalid '\(' / '\)' escapes of the original.
        # NOTE(review): '[$-_@.&+]' contains the RANGE '$'..'_' — kept
        # byte-identical for backward compatibility, but likely matches
        # more characters than the author intended; confirm before narrowing.
        url_body = (
            r"(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]"
            r"|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
        )
        self.http_url_pattern = re.compile(r"http[s]?://" + url_body)
        # Bug fix: the original 'www.' used an unescaped '.', so it matched
        # "www" followed by ANY character; the dot is now literal.
        self.www_url_pattern = re.compile(r"www\." + url_body)
        # One translation table mapping every digit and punctuation
        # character to a space (single pass instead of two).
        self.noise_table = {ord(ch): " " for ch in digits + punctuation}
        # Words strictly shorter than this are dropped by nlp_clean_text.
        self.minimum_word_size_allowed = 3

    def nlp_clean_text(self, input_string):
        """Lower-case *input_string* and strip noise.

        Removes curly apostrophes/acute accents, URLs, emojis, digits,
        punctuation, and any word shorter than
        ``self.minimum_word_size_allowed``; returns the cleaned string.

        Example:
            ' we Should hodl 3? an$D hodler 2 moar 34'
            -> 'should hodl hodler moar'
        """
        text = input_string.lower()
        # Drop typographic apostrophe (U+2019) and acute accent (U+00B4).
        text = text.replace("\u2019", "").replace("\u00b4", "")
        # Remove http(s):// and www. URLs.
        text = self.http_url_pattern.sub("", text)
        text = self.www_url_pattern.sub("", text)
        # Remove emojis.
        text = self.emoji_pattern.sub("", text)
        # Replace digits and punctuation with spaces in a single pass.
        text = text.translate(self.noise_table)
        # Keep only words meeting the minimum length; split() also
        # collapses the runs of spaces introduced above.
        kept = [
            word
            for word in text.split()
            if len(word) >= self.minimum_word_size_allowed
        ]
        return " ".join(kept)

    def word_ocurrence_counter(self, key, input_string_clean):
        """Count non-overlapping matches of *key* in *input_string_clean*.

        *key* is treated as a regular expression (as in the original);
        returns 0 when the input is None or empty.

        Example: key = 'hodl'
        """
        if not input_string_clean:
            return 0
        # sum() over the iterator avoids building a throwaway list.
        return sum(1 for _ in re.finditer(key, input_string_clean))

    def stem_input_string(self, input_string):
        """Tokenize *input_string* and return its space-joined word stems."""
        stems = [self.stemmer.stem(word) for word in word_tokenize(input_string)]
        return " ".join(stems)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment