Skip to content

Instantly share code, notes, and snippets.

@AmirAbaskohi
Last active December 1, 2021 08:13
Show Gist options
  • Save AmirAbaskohi/52d4b7290c0c76b52d2b8978f6c91559 to your computer and use it in GitHub Desktop.
Save AmirAbaskohi/52d4b7290c0c76b52d2b8978f6c91559 to your computer and use it in GitHub Desktop.
BERT
def NormalizeWithPOS(text):
    """Lemmatize each token according to its POS tag, stem it, and rejoin.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``.
    Tags starting with J, V, N or R are lemmatized as adjective, verb, noun
    or adverb respectively; tokens with any other tag skip lemmatization.
    Every token is then passed through the Porter stemmer.

    Returns the normalized tokens joined by single spaces.
    """
    # First letter of the tagger's tag -> WordNet POS code for the lemmatizer.
    wordnet_pos_by_prefix = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

    tagged_tokens = pos_tag(word_tokenize(text))
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    normalized = []
    for token, tag in tagged_tokens:
        wordnet_pos = wordnet_pos_by_prefix.get(tag[:1])
        if wordnet_pos is None:
            lemma = token
        else:
            lemma = lemmatizer.lemmatize(token, pos=wordnet_pos)
        # NOTE(review): stemming is applied to every token, lemmatized or
        # not — confirm this matches the intended nesting.
        normalized.append(stemmer.stem(lemma))
    return ' '.join(normalized)
def cleanText(text):
    """Clean a raw text string for the embedding named by ``embedding``.

    Reads the module-level name ``embedding`` to choose the cleaning mode:

    * ``'BERT'``: keeps digits, hyphens, apostrophes and sentence
      punctuation; collapses runs of ``!`` / ``?`` to one character and
      drops empty parentheses.
    * ``'WORD2VEC_NO_STOP'``: like the default mode, and additionally
      removes English stop words (negations such as "not" are kept).
    * anything else: removes digits, hyphens, apostrophes and all
      punctuation, then drops one-character tokens.

    In every mode HTML-like tags are removed, a handful of contractions
    are expanded, the text is lower-cased, and runs of spaces are
    collapsed to one space.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # Compare by value: ``is`` on string literals only works by accident of
    # interning and silently picks the wrong branch for equal-but-distinct
    # string objects.
    is_bert = embedding == 'BERT'

    # Drop HTML-like tags first so their contents never reach later steps.
    text = re.sub(r'<.*?>', ' ', text)

    # Expand contractions; the specific forms must run before generic "n't".
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"'ve", " have", text)
    text = re.sub(r"'ll", " will", text)
    text = re.sub(r"'re", " are", text)

    if not is_bert:
        text = re.sub(r"[0-9]+", ' ', text)
        # NOTE(review): hyphen replacement is assumed to belong to the
        # non-BERT branch only — confirm against the original layout.
        text = re.sub(r"-", ' ', text)

    text = text.strip().lower()

    if embedding == 'WORD2VEC_NO_STOP':
        # Remove stop words, but keep negations/intensifiers.
        default_stop_words = set(stopwords.words('english'))
        default_stop_words.difference_update({'no', 'not', 'nor', 'too', 'any'})
        stop_words = default_stop_words.union({"'m", "n't", "'d", "'re", "'s",
                                               'would', 'must', "'ve", "'ll", 'may'})
        word_list = word_tokenize(text)
        text = ' '.join(w for w in word_list if w not in stop_words)

    if not is_bert:
        # Remove whatever apostrophes the contraction expansion left behind.
        text = re.sub(r"'", ' ', text)

    # Replace punctuation with spaces.
    if is_bert:
        # Keep ! ? . , / ( ) : ; ' so sentence boundaries stay detectable.
        filters = '"#$%&*+<=>@[\\]^_`{|}~\t\n'
        text = re.sub(r'\!+', '!', text)
        text = re.sub(r'\?+', '?', text)
    else:
        filters = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    text = text.translate(str.maketrans({ch: ' ' for ch in filters}))

    if is_bert:
        # Parentheses left empty by the earlier removals carry no content.
        text = re.sub(r'\( *\)', ' ', text)
    else:
        # Drop single-character tokens.
        text = ' '.join(w for w in text.split() if len(w) > 1)

    # Replace multiple spaces with one space.
    return re.sub(' +', ' ', text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment