This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Removing extra spaces: collapse every run of consecutive space characters
# in the cleaned review text into a single space. Only the literal space
# character is matched (tabs and newlines are left alone), exactly as before.
df['cleaned'] = df['cleaned'].apply(lambda text: re.sub(r' +', ' ', text))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Preview five cleaned reviews (positions 35 through 39), numbered from 1.
# .iloc makes the positional nature of the slice explicit.
for index, text in enumerate(df['cleaned'].iloc[35:40], start=1):
    print('Review %d:\n' % index, text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Importing spacy
import spacy

# Loading model. The dependency parser and the named-entity recognizer are
# disabled because only lemmas and stop-word flags are read below.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Lemmatization with stopwords removal: each review becomes the
# space-joined lemmas of its non-stop-word tokens.
# nlp.pipe() feeds the texts through the pipeline in batches, which yields
# the same Doc objects as calling nlp(text) once per row but with far less
# per-call overhead. The resulting list is assigned positionally, so row
# order is preserved.
df['lemmatized'] = [
    ' '.join(token.lemma_ for token in doc if not token.is_stop)
    for doc in nlp.pipe(df['cleaned'])
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build one document per product: all lemmatized reviews sharing the same
# 'name' are concatenated with single spaces. The result is indexed by
# product name and has a single 'lemmatized' column.
df_grouped = df[['name', 'lemmatized']].groupby(by='name').agg(' '.join)
# Notebook-style preview of the first rows.
df_grouped.head()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Creating Document Term Matrix
from sklearn.feature_extraction.text import CountVectorizer

# Word-level bag-of-words counts over the per-product documents.
cv = CountVectorizer(analyzer='word')
data = cv.fit_transform(df_grouped['lemmatized'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Rows are products (same index as df_grouped), columns are vocabulary terms.
df_dtm = pd.DataFrame(
    data.toarray(),
    columns=feature_names,
    index=df_grouped.index,
)
# Notebook-style preview of the first three products.
df_dtm.head(3)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Importing wordcloud for plotting word clouds and textwrap for wrapping longer text
from wordcloud import WordCloud
from textwrap import wrap


# Function for generating word clouds
def generate_wordcloud(data, title):
    """Draw a word cloud for *data* on a new matplotlib figure.

    Args:
        data: Mapping of word -> frequency, passed straight to
            ``WordCloud.generate_from_frequencies``.
        title: Text shown above the cloud. It is converted with ``str()``
            and wrapped so that long titles do not run off the figure.

    The figure is created but not explicitly shown; the caller (or the
    notebook front end) is responsible for displaying it.
    """
    wc = WordCloud(
        width=400,
        height=330,
        max_words=150,
        colormap="Dark2",
    ).generate_from_frequencies(data)

    plt.figure(figsize=(10, 8))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    # The title argument was previously accepted but never drawn, and the
    # textwrap import above was unused. Wrap at 60 characters per line.
    plt.title('\n'.join(wrap(str(title), 60)))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from textblob import TextBlob

# Score every review with TextBlob's sentiment polarity and store it in a
# new 'polarity' column.
# NOTE(review): the text being scored is the lemmatized version with stop
# words already removed; if the stop-word list contains negations, the
# polarity of negated phrases may be skewed -- confirm this is intended
# rather than scoring df['cleaned'].
df['polarity'] = df['lemmatized'].apply(lambda text: TextBlob(text).sentiment.polarity)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
print("3 Random Reviews with Highest Polarity:")
# sort_values() keeps the original row labels, so .index holds *labels*.
# They must be looked up with .loc; the previous .iloc lookup treated them
# as positions, which is only correct when df has a default 0..n-1 index.
highest_polarity_labels = df['polarity'].sort_values(ascending=False).head(3).index
for index, review in enumerate(df.loc[highest_polarity_labels, 'reviews.text'], start=1):
    print('Review {}:\n'.format(index), review)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
print("3 Random Reviews with Lowest Polarity:")
# sort_values() keeps the original row labels, so .index holds *labels*.
# They must be looked up with .loc; the previous .iloc lookup treated them
# as positions, which is only correct when df has a default 0..n-1 index.
lowest_polarity_labels = df['polarity'].sort_values(ascending=True).head(3).index
for index, review in enumerate(df.loc[lowest_polarity_labels, 'reviews.text'], start=1):
    print('Review {}:\n'.format(index), review)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Mean review polarity per product, sorted ascending, as a one-column
# DataFrame ('polarity') indexed by product name.
product_polarity_sorted = (
    df.groupby('name')['polarity']
    .mean()
    .sort_values(ascending=True)
    .to_frame()
)

# Horizontal bar chart with one bar per product, in the sorted order.
plt.figure(figsize=(16, 8))
plt.xlabel('Polarity')
plt.ylabel('Products')
plt.title('Polarity of Different Amazon Product Reviews')
polarity_graph = plt.barh(
    np.arange(len(product_polarity_sorted.index)),
    product_polarity_sorted['polarity'],
    color='purple',
)
# Writing product names on bar | |
for bar,product in zip(polarity_graph,product_polarity_sorted.index): |