arimitramaiti/dominos_preprocessing

## dominos_preprocessing

def pre_processing(dataset, my_stop_list):
    """Clean the review text, then add sentiment classes and emotion labels.

    The ``review`` column is contraction-expanded, tokenized, lower-cased,
    stripped of punctuation and stop words, POS-tagged and lemmatized. The
    lemmatized text is scored with TextBlob, Afinn and SentiWordNet, each
    score is bucketed into a class label, and ``te.get_emotion`` is used to
    list the emotions detected in every review.

    Args:
        dataset: pandas DataFrame with at least the columns ``review``,
            ``UniversalMessageId``, ``Sentiment`` and ``review_date``.
            The intermediate ``review_*`` and ``sentiment_score_*`` columns
            are written onto this frame in place.
        my_stop_list: iterable of extra stop words, removed together with
            NLTK's English stop words. Tokens are lower-cased before the
            comparison, so entries should be lower case.

    Returns:
        A new DataFrame, indexed like ``dataset``, with the columns
        ``UniversalMessageId``, ``Sentiment``, ``review_date``,
        ``review_lemmatized``, ``review_str``, the three
        ``sentiment_score_*`` columns (``"0"`` for a negative score, ``"1"``
        for zero, ``"2"`` for a positive score), ``word_count``,
        ``review_length``, ``Index_col`` (row position), ``Emotions``
        (comma-separated labels, or ``'None'`` when no emotion scored) and
        ``Max_score``.
    """
    ## Expand contractions word by word, then rebuild one string per review
    dataset['review_modified'] = dataset['review'].apply(
        lambda x: [contractions.fix(word) for word in x.split()])
    dataset['review_modified_unlist'] = [
        ' '.join(map(str, words)) for words in dataset['review_modified']]

    ## Default plus custom stop words
    stop_set = set(nltk.corpus.stopwords.words('english'))
    stop_set.update(my_stop_list)

    def clean_tokens(text):
        ## Tokenize, lower-case, then drop punctuation and stop words.
        ## The punctuation check is a substring test against
        ## string.punctuation, exactly as before.
        lowered = (token.lower() for token in nltk.word_tokenize(text))
        return [token for token in lowered
                if token not in string.punctuation and token not in stop_set]

    dataset['review_mod_tokens'] = dataset['review_modified_unlist'].apply(clean_tokens)
    ##extract parts of speech tags
    dataset['review_tags'] = dataset['review_mod_tokens'].apply(nltk.tag.pos_tag)

    ## First letter of the POS tag -> WordNet POS; anything else is a noun
    pos_by_initial = {'J': wordnet.ADJ, 'V': wordnet.VERB, 'R': wordnet.ADV}

    def get_wordnet_pos(tag):
        return pos_by_initial.get(tag[:1], wordnet.NOUN)

    dataset['review_wordnet'] = dataset['review_tags'].apply(
        lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])
    ##use lemmatizer on parts of speech tags
    lm = WordNetLemmatizer()
    dataset['review_lemmatized'] = dataset['review_wordnet'].apply(
        lambda x: [lm.lemmatize(word, tag) for word, tag in x])
    dataset['review_str'] = [
        ' '.join(map(str, words)) for words in dataset['review_lemmatized']]

    ## Build the Afinn scorer once instead of once per review
    afinn = Afinn()

    def score_sentiwordnet(doc):
        ## Sum, over tokens that have a WordNet synset, the polarity of the
        ## first synset weighted by how non-objective it is.
        synsets = (wordnet.synsets(w) for w in nltk.word_tokenize(doc) if w is not None)
        names = (s[0].name() for s in synsets if s is not None and len(s) > 0)
        senti = (swn.senti_synset(name) for name in names)
        return sum((1 - w.obj_score()) * (w.pos_score() - w.neg_score()) for w in senti)

    def ifformula(score):
        ## Bucket a raw score into a class label
        if score < 0:
            return "0"
        elif score == 0:
            return "1"
        else:
            return "2"

    ##extract sentiment scores and convert each one to its class label
    dataset['sentiment_score_textblob'] = dataset['review_str'].apply(
        lambda x: ifformula(TextBlob(x).sentiment.polarity))
    dataset['sentiment_score_afinn'] = dataset['review_str'].apply(
        lambda x: ifformula(afinn.score(x)))
    dataset['sentiment_score_sentinet'] = dataset['review_str'].apply(
        lambda x: ifformula(score_sentiwordnet(x)))

    ## .copy() so the columns added below land on an independent frame
    ## rather than on a slice of the input
    dataset = dataset[['UniversalMessageId', 'Sentiment', 'review_date', 'review_lemmatized',
                       'review_str', 'sentiment_score_textblob', 'sentiment_score_sentinet',
                       'sentiment_score_afinn']].copy()

    ## Count the tokens themselves; an empty review has zero words
    dataset['word_count'] = dataset['review_lemmatized'].apply(len)
    dataset['review_length'] = dataset['review_str'].astype(str).apply(len)

    ###Process to extract text2emotion for every row
    store = []
    ## Iterate by position so a non-default index on `dataset` is harmless
    for rx, text in enumerate(dataset['review_str']):
        result = pd.DataFrame(list(te.get_emotion(text).items()), columns=['Emotion', 'Score'])
        result = result.sort_values(by=['Score'], ascending=False).reset_index(drop=True)
        ## When every score is 0 the ratio is NaN; treat that as 0%
        result['cum_percent'] = (100 * (result['Score'].cumsum() / result['Score'].sum())).fillna(0)

        ## Keep emotions, strongest first, up to and including the one at
        ## which the cumulative share reaches 100% (small float tolerance)
        picked = []
        for emotion, cum_percent in zip(result['Emotion'], result['cum_percent']):
            picked.append(emotion)
            if abs(cum_percent - 100.0) <= 1e-9:
                break

        store.append([rx, picked, result['cum_percent'].max()])

    ## Share the index with `dataset` so the index-based merge lines up
    result_emotions = pd.DataFrame(store, columns=['Index_col', 'Emotions', 'Max_score'],
                                   index=dataset.index)
    result_emotions['Emotions'] = [','.join(map(str, l)) for l in result_emotions['Emotions']]
    result_emotions['Emotions'] = np.where(result_emotions['Max_score'] == 0, 'None',
                                           result_emotions['Emotions'])

    dataset = pd.merge(dataset, result_emotions, how="left", left_index=True, right_index=True)

    return dataset

	def pre_processing(dataset, my_stop_list):
	    """Clean the review text, then add sentiment classes and emotion labels.

	    The ``review`` column is contraction-expanded, tokenized, lower-cased,
	    stripped of punctuation and stop words, POS-tagged and lemmatized. The
	    lemmatized text is scored with TextBlob, Afinn and SentiWordNet, each
	    score is bucketed into a class label, and ``te.get_emotion`` is used to
	    list the emotions detected in every review.

	    Args:
	        dataset: pandas DataFrame with at least the columns ``review``,
	            ``UniversalMessageId``, ``Sentiment`` and ``review_date``.
	            The intermediate ``review_*`` and ``sentiment_score_*`` columns
	            are written onto this frame in place.
	        my_stop_list: iterable of extra stop words, removed together with
	            NLTK's English stop words. Tokens are lower-cased before the
	            comparison, so entries should be lower case.

	    Returns:
	        A new DataFrame, indexed like ``dataset``, with the columns
	        ``UniversalMessageId``, ``Sentiment``, ``review_date``,
	        ``review_lemmatized``, ``review_str``, the three
	        ``sentiment_score_*`` columns (``"0"`` for a negative score, ``"1"``
	        for zero, ``"2"`` for a positive score), ``word_count``,
	        ``review_length``, ``Index_col`` (row position), ``Emotions``
	        (comma-separated labels, or ``'None'`` when no emotion scored) and
	        ``Max_score``.
	    """
	    ## Expand contractions word by word, then rebuild one string per review
	    dataset['review_modified'] = dataset['review'].apply(
	        lambda x: [contractions.fix(word) for word in x.split()])
	    dataset['review_modified_unlist'] = [
	        ' '.join(map(str, words)) for words in dataset['review_modified']]

	    ## Default plus custom stop words
	    stop_set = set(nltk.corpus.stopwords.words('english'))
	    stop_set.update(my_stop_list)

	    def clean_tokens(text):
	        ## Tokenize, lower-case, then drop punctuation and stop words.
	        ## The punctuation check is a substring test against
	        ## string.punctuation, exactly as before.
	        lowered = (token.lower() for token in nltk.word_tokenize(text))
	        return [token for token in lowered
	                if token not in string.punctuation and token not in stop_set]

	    dataset['review_mod_tokens'] = dataset['review_modified_unlist'].apply(clean_tokens)
	    ##extract parts of speech tags
	    dataset['review_tags'] = dataset['review_mod_tokens'].apply(nltk.tag.pos_tag)

	    ## First letter of the POS tag -> WordNet POS; anything else is a noun
	    pos_by_initial = {'J': wordnet.ADJ, 'V': wordnet.VERB, 'R': wordnet.ADV}

	    def get_wordnet_pos(tag):
	        return pos_by_initial.get(tag[:1], wordnet.NOUN)

	    dataset['review_wordnet'] = dataset['review_tags'].apply(
	        lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])
	    ##use lemmatizer on parts of speech tags
	    lm = WordNetLemmatizer()
	    dataset['review_lemmatized'] = dataset['review_wordnet'].apply(
	        lambda x: [lm.lemmatize(word, tag) for word, tag in x])
	    dataset['review_str'] = [
	        ' '.join(map(str, words)) for words in dataset['review_lemmatized']]

	    ## Build the Afinn scorer once instead of once per review
	    afinn = Afinn()

	    def score_sentiwordnet(doc):
	        ## Sum, over tokens that have a WordNet synset, the polarity of the
	        ## first synset weighted by how non-objective it is.
	        synsets = (wordnet.synsets(w) for w in nltk.word_tokenize(doc) if w is not None)
	        names = (s[0].name() for s in synsets if s is not None and len(s) > 0)
	        senti = (swn.senti_synset(name) for name in names)
	        return sum((1 - w.obj_score()) * (w.pos_score() - w.neg_score()) for w in senti)

	    def ifformula(score):
	        ## Bucket a raw score into a class label
	        if score < 0:
	            return "0"
	        elif score == 0:
	            return "1"
	        else:
	            return "2"

	    ##extract sentiment scores and convert each one to its class label
	    dataset['sentiment_score_textblob'] = dataset['review_str'].apply(
	        lambda x: ifformula(TextBlob(x).sentiment.polarity))
	    dataset['sentiment_score_afinn'] = dataset['review_str'].apply(
	        lambda x: ifformula(afinn.score(x)))
	    dataset['sentiment_score_sentinet'] = dataset['review_str'].apply(
	        lambda x: ifformula(score_sentiwordnet(x)))

	    ## .copy() so the columns added below land on an independent frame
	    ## rather than on a slice of the input
	    dataset = dataset[['UniversalMessageId', 'Sentiment', 'review_date', 'review_lemmatized',
	                       'review_str', 'sentiment_score_textblob', 'sentiment_score_sentinet',
	                       'sentiment_score_afinn']].copy()

	    ## Count the tokens themselves; an empty review has zero words
	    dataset['word_count'] = dataset['review_lemmatized'].apply(len)
	    dataset['review_length'] = dataset['review_str'].astype(str).apply(len)

	    ###Process to extract text2emotion for every row
	    store = []
	    ## Iterate by position so a non-default index on `dataset` is harmless
	    for rx, text in enumerate(dataset['review_str']):
	        result = pd.DataFrame(list(te.get_emotion(text).items()), columns=['Emotion', 'Score'])
	        result = result.sort_values(by=['Score'], ascending=False).reset_index(drop=True)
	        ## When every score is 0 the ratio is NaN; treat that as 0%
	        result['cum_percent'] = (100 * (result['Score'].cumsum() / result['Score'].sum())).fillna(0)

	        ## Keep emotions, strongest first, up to and including the one at
	        ## which the cumulative share reaches 100% (small float tolerance)
	        picked = []
	        for emotion, cum_percent in zip(result['Emotion'], result['cum_percent']):
	            picked.append(emotion)
	            if abs(cum_percent - 100.0) <= 1e-9:
	                break

	        store.append([rx, picked, result['cum_percent'].max()])

	    ## Share the index with `dataset` so the index-based merge lines up
	    result_emotions = pd.DataFrame(store, columns=['Index_col', 'Emotions', 'Max_score'],
	                                   index=dataset.index)
	    result_emotions['Emotions'] = [','.join(map(str, l)) for l in result_emotions['Emotions']]
	    result_emotions['Emotions'] = np.where(result_emotions['Max_score'] == 0, 'None',
	                                           result_emotions['Emotions'])

	    dataset = pd.merge(dataset, result_emotions, how="left", left_index=True, right_index=True)

	    return dataset