# AmirAbaskohi/BERT_2.py

## BERT_2.py
def NormalizeWithPOS(text):
    """Lemmatize, then stem, each token of ``text`` guided by its POS tag.

    ``text`` is split with ``word_tokenize`` and tagged with ``pos_tag``.
    A token whose tag starts with J, V, N or R is lemmatized with ``pos``
    'a', 'v', 'n' or 'r' respectively; any other token skips lemmatization.
    Every token is then run through the ``PorterStemmer`` and the results
    are returned joined by single spaces.
    """
    tokens = word_tokenize(text)
    lemmatize = WordNetLemmatizer().lemmatize
    stem = PorterStemmer().stem
    # Leading tag letter -> ``pos`` argument handed to the lemmatizer.
    pos_by_initial = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

    def normalize(token, tag):
        pos = pos_by_initial.get(tag[:1])
        lemma = token if pos is None else lemmatize(token, pos=pos)
        return stem(lemma)

    return ' '.join(normalize(token, tag) for token, tag in pos_tag(tokens))

 def cleanText(text):
     """Clean ``text`` according to the module-level ``embedding`` setting.

     Steps, in order: replace ``<...>`` tags with a space, expand a few
     English contractions, strip and lowercase, optionally drop stop words
     (only when ``embedding == 'WORD2VEC_NO_STOP'``), replace punctuation
     with spaces and collapse runs of spaces.

     When ``embedding == 'BERT'`` digits, hyphens, apostrophes and the
     characters ``! ? . , / ( ) : ;`` are kept; repeated ``!`` / ``?`` are
     squeezed to a single character and empty parentheses are removed.
     For any other value, digits, all listed punctuation and
     single-character tokens are removed.

     Args:
         text: The raw string to clean.

     Returns:
         The cleaned string.
     """
     # Compare by value: ``is`` against a string literal tests object
     # identity and is not guaranteed to match an equal string.
     mode = embedding
     is_bert = mode == 'BERT'

     text = re.sub(r'<.*?>', ' ', text)
     # Order matters: the irregular forms must be expanded before "n't".
     for pattern, replacement in (
         (r"won't", "will not"),
         (r"can't", "can not"),
         (r"n't", " not"),
         (r"'ve", " have"),
         (r"'ll", " will"),
         (r"'re", " are"),
     ):
         text = re.sub(pattern, replacement, text)

     if not is_bert:
         text = re.sub(r"[0-9]+", ' ', text)
         text = re.sub(r"-", ' ', text)
     text = text.strip().lower()

     if mode == 'WORD2VEC_NO_STOP':
         # Remove stop words, but keep the negations/intensifiers below.
         stop_words = set(stopwords.words('english'))
         stop_words.difference_update({'no', 'not', 'nor', 'too', 'any'})
         stop_words.update({"'m", "n't", "'d", "'re", "'s",
                            'would', 'must', "'ve", "'ll", 'may'})
         text = ' '.join(w for w in word_tokenize(text) if w not in stop_words)

     if is_bert:
         # Keep ! ? . , / ( ) : ; ' so sentence boundaries stay detectable.
         filters = '"#$%&*+<=>@[\\]^_`{|}~\t\n'
         text = re.sub(r'\!+', '!', text)
         text = re.sub(r'\?+', '?', text)
     else:
         # Remove the apostrophes left over from other contractions.
         text = re.sub(r"'", ' ', text)
         filters = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

     # Replace every filtered character with a space.
     text = text.translate(str.maketrans(dict.fromkeys(filters, ' ')))

     if is_bert:
         text = re.sub(r'\( *\)', ' ', text)
     else:
         text = ' '.join(w for w in text.split() if len(w) > 1)

     # Collapse multiple spaces into one.
     return re.sub(' +', ' ', text)
	def NormalizeWithPOS(text):
	    """Lemmatize, then stem, each token of ``text`` guided by its POS tag.

	    ``text`` is split with ``word_tokenize`` and tagged with ``pos_tag``.
	    A token whose tag starts with J, V, N or R is lemmatized with ``pos``
	    'a', 'v', 'n' or 'r' respectively; any other token skips
	    lemmatization. Every token is then run through the ``PorterStemmer``
	    and the results are returned joined by single spaces.
	    """
	    word_list = word_tokenize(text)
	    lemmatizer = WordNetLemmatizer()
	    stemmer = PorterStemmer()
	    # Leading tag letter -> ``pos`` argument handed to the lemmatizer.
	    pos_by_initial = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}
	    rev = []
	    for word, tag in pos_tag(word_list):
	        pos = pos_by_initial.get(tag[:1])
	        if pos is not None:
	            word = lemmatizer.lemmatize(word, pos=pos)
	        rev.append(stemmer.stem(word))
	    return ' '.join(rev)

	def cleanText(text):
	    """Clean ``text`` according to the module-level ``embedding`` setting.

	    Steps, in order: replace ``<...>`` tags with a space, expand a few
	    English contractions, strip and lowercase, optionally drop stop words
	    (only when ``embedding == 'WORD2VEC_NO_STOP'``), replace punctuation
	    with spaces and collapse runs of spaces.

	    When ``embedding == 'BERT'`` digits, hyphens, apostrophes and the
	    characters ``! ? . , / ( ) : ;`` are kept; repeated ``!`` / ``?`` are
	    squeezed to a single character and empty parentheses are removed.
	    For any other value, digits, all listed punctuation and
	    single-character tokens are removed.

	    Args:
	        text: The raw string to clean.

	    Returns:
	        The cleaned string.
	    """
	    # Compare by value: ``is`` against a string literal tests object
	    # identity and is not guaranteed to match an equal string.
	    mode = embedding
	    is_bert = mode == 'BERT'

	    text = re.sub(r'<.*?>', ' ', text)
	    # Order matters: the irregular forms must be expanded before "n't".
	    text = re.sub(r"won't", "will not", text)
	    text = re.sub(r"can't", "can not", text)
	    text = re.sub(r"n't", " not", text)
	    text = re.sub(r"'ve", " have", text)
	    text = re.sub(r"'ll", " will", text)
	    text = re.sub(r"'re", " are", text)

	    if not is_bert:
	        text = re.sub(r"[0-9]+", ' ', text)
	        text = re.sub(r"-", ' ', text)
	    text = text.strip().lower()

	    if mode == 'WORD2VEC_NO_STOP':
	        # Remove stop words, but keep the negations/intensifiers below.
	        default_stop_words = set(stopwords.words('english'))
	        default_stop_words.difference_update({'no', 'not', 'nor', 'too', 'any'})
	        stop_words = default_stop_words.union({"'m", "n't", "'d", "'re", "'s",
	                                               'would', 'must', "'ve", "'ll", 'may'})
	        word_list = word_tokenize(text)
	        filtered_list = [w for w in word_list if w not in stop_words]
	        text = ' '.join(filtered_list)

	    if not is_bert:
	        # Remove the apostrophes left over from other contractions.
	        text = re.sub(r"'", ' ', text)

	    # Replace punctuation with spaces.
	    if is_bert:
	        # Keep ! ? . , / ( ) : ; ' so sentence boundaries stay detectable.
	        filters = '"#$%&*+<=>@[\\]^_`{|}~\t\n'
	        text = re.sub(r'\!+', '!', text)
	        text = re.sub(r'\?+', '?', text)
	    else:
	        filters = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
	    translate_map = str.maketrans({ch: " " for ch in filters})
	    text = text.translate(translate_map)

	    if is_bert:
	        text = re.sub(r'\( *\)', ' ', text)
	    else:
	        text = ' '.join([w for w in text.split() if len(w) > 1])

	    # Collapse multiple spaces into one.
	    text = re.sub(' +', ' ', text)
	    return text