Last active
December 1, 2021 08:13
-
-
Save AmirAbaskohi/52d4b7290c0c76b52d2b8978f6c91559 to your computer and use it in GitHub Desktop.
BERT
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def NormalizeWithPOS(text):
    """Lemmatize every token according to its POS tag, then stem it.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``.
    Tokens whose tag starts with J, V, N or R are lemmatized with the WordNet
    POS codes 'a', 'v', 'n' and 'r' respectively; any other token is left
    unlemmatized. Every resulting word is then run through the Porter stemmer.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized tokens joined by single spaces.
    """
    # First letter of the tag returned by pos_tag -> WordNet POS code.
    wordnet_pos_by_prefix = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    normalized = []
    for token, tag in pos_tag(word_tokenize(text)):
        wordnet_pos = wordnet_pos_by_prefix.get(tag[:1])
        if wordnet_pos is None:
            # No lemmatization rule for this tag: keep the surface form.
            lemma = token
        else:
            lemma = lemmatizer.lemmatize(token, pos=wordnet_pos)
        normalized.append(stemmer.stem(lemma))

    return ' '.join(normalized)
def cleanText(text, mode=None):
    """Clean a raw review string for the selected embedding pipeline.

    Steps, in order: strip HTML-like tags, expand a few contractions,
    replace hyphens with spaces, trim and lowercase, then remove or keep
    punctuation depending on the embedding mode:

    * ``'BERT'``: digits and the characters ``! ? . , / ( ) : ; '`` are kept;
      runs of ``!`` or ``?`` are collapsed to one; empty parentheses are
      removed.
    * any other mode: digits, apostrophes and all punctuation are replaced
      with spaces and single-character tokens are dropped.
    * ``'WORD2VEC_NO_STOP'``: additionally removes English stop words
      (except 'no', 'not', 'nor', 'too', 'any') after tokenizing.

    Args:
        text: The raw string to clean.
        mode: Optional embedding mode. When ``None`` (the default) the
            module-level ``embedding`` setting is used, as before.

    Returns:
        The cleaned string with runs of spaces collapsed to one space.
    """
    # Fall back to the module-level setting so existing callers are unaffected.
    if mode is None:
        mode = embedding
    # Compare by value: ``is`` on string literals tests identity and only
    # works by accident of interning.
    is_bert = mode == 'BERT'

    # Strip HTML-like tags.
    text = re.sub(r'<.*?>', ' ', text)

    # Expand contractions. These run before lowercasing, so only the
    # lowercase spellings are matched.
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"'ve", " have", text)
    text = re.sub(r"'ll", " will", text)
    text = re.sub(r"'re", " are", text)

    if not is_bert:
        text = re.sub(r"[0-9]+", ' ', text)
    # NOTE(review): the original indentation was lost; the hyphen replacement
    # is applied in every mode here. For non-BERT modes the result is the same
    # either way, because '-' is also in the punctuation filter below.
    text = re.sub(r"-", ' ', text)

    text = text.strip().lower()

    if mode == 'WORD2VEC_NO_STOP':
        # Remove stop words, keeping the negations plus 'too' and 'any'.
        stop_words = set(stopwords.words('english'))
        stop_words.difference_update({'no', 'not', 'nor', 'too', 'any'})
        stop_words.update({"'m", "n't", "'d", "'re", "'s",
                           'would', 'must', "'ve", "'ll", 'may'})
        text = ' '.join(w for w in word_tokenize(text) if w not in stop_words)

    if not is_bert:
        # Remove other contractions.
        text = re.sub(r"'", ' ', text)

    # Replace punctuation with spaces.
    if is_bert:
        # Keep ! ? . (and , / ( ) : ; ') for end-of-sentence detection.
        filters = '"#$%&*+<=>@[\\]^_`{|}~\t\n'
        text = re.sub(r'\!+', '!', text)
        text = re.sub(r'\?+', '?', text)
    else:
        filters = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    text = text.translate(str.maketrans(dict.fromkeys(filters, ' ')))

    if is_bert:
        # Drop parentheses that are empty or contain only spaces.
        text = re.sub(r'\( *\)', ' ', text)
    else:
        # Drop single-character tokens left over from the cleanup.
        text = ' '.join(w for w in text.split() if len(w) > 1)

    # Replace multiple spaces with one space.
    return re.sub(' +', ' ', text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment