ABHISHEK SHARMA abhishek-shrm

## eta_on_text_data-12.py
# Collapse every run of consecutive spaces in the cleaned text into one space
df['cleaned'] = df['cleaned'].map(lambda review: re.sub(' +', ' ', review))

## eta_on_text_data-13.py
# Print the cleaned reviews in slice 35:40, numbering them from 1
for position, text in enumerate(df['cleaned'][35:40], start=1):
  print('Review %d:\n' % position, text)

## eta_on_text_data-14.py
# spaCy provides the tokenizer and lemmatizer used below
import spacy

# Load the small English pipeline with the parser and NER components disabled
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# For each cleaned review, keep the lemma of every token that is not a
# stopword and join the lemmas back into one space-separated string
df['lemmatized'] = df['cleaned'].map(
    lambda review: ' '.join(tok.lemma_ for tok in nlp(review) if not tok.is_stop)
)

## eta_on_text_data-15.py
# One row per product name: all of its lemmatized reviews joined with spaces
df_grouped = (
    df.groupby('name')[['lemmatized']]
      .agg(lambda reviews: ' '.join(reviews))
)
df_grouped.head()

## eta_on_text_data-16.py
# Creating Document Term Matrix
# (the import must be a single statement; split over two lines it is a SyntaxError)
from sklearn.feature_extraction.text import CountVectorizer

# Count word occurrences in the per-product lemmatized text
cv = CountVectorizer(analyzer='word')
data = cv.fit_transform(df_grouped['lemmatized'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when present and fall back on older releases
if hasattr(cv, 'get_feature_names_out'):
  feature_names = cv.get_feature_names_out()
else:
  feature_names = cv.get_feature_names()

# Rows follow df_grouped's index, columns are the vocabulary terms
df_dtm = pd.DataFrame(data.toarray(), columns=feature_names)
df_dtm.index = df_grouped.index
df_dtm.head(3)

## eta_on_text_data-17.py
# Importing wordcloud for plotting word clouds and textwrap for wrapping longer text
from wordcloud import WordCloud
from textwrap import wrap

# Function for generating word clouds
def generate_wordcloud(data,title):
  """Draw a word cloud for ``data`` on a new figure headed by ``title``.

  Args:
    data: word frequencies, passed unchanged to
      ``WordCloud.generate_from_frequencies``.
    title: text shown above the cloud; long titles are wrapped onto
      several lines so they do not overflow the figure.
  """
  wc = WordCloud(width=400, height=330, max_words=150,colormap="Dark2").generate_from_frequencies(data)
  plt.figure(figsize=(10,8))
  plt.imshow(wc, interpolation='bilinear')
  plt.axis("off")
  # The title argument was previously ignored; wrap it (60 characters per
  # line is a presentational choice) and draw it above the cloud
  plt.title('\n'.join(wrap(title, 60)))

## eta_on_text_data-18.py
from textblob import TextBlob

# Store TextBlob's sentiment polarity of each lemmatized review in a new column
df['polarity'] = df['lemmatized'].map(lambda text: TextBlob(text).sentiment.polarity)

## eta_on_text_data-19.py
print("3 Random Reviews with Highest Polarity:")
# Index labels of the three rows with the largest polarity
top_labels = df['polarity'].sort_values(ascending=False)[:3].index
# .index holds row labels, so select with label-based .loc; positional .iloc
# only agrees with it when df still has an untouched 0..n-1 index
for index,review in enumerate(df.loc[top_labels, 'reviews.text']):
  print('Review {}:\n'.format(index+1),review)

## eta_on_text_data-20.py
print("3 Random Reviews with Lowest Polarity:")
# Index labels of the three rows with the smallest polarity
bottom_labels = df['polarity'].sort_values(ascending=True)[:3].index
# .index holds row labels, so select with label-based .loc; positional .iloc
# only agrees with it when df still has an untouched 0..n-1 index
for index,review in enumerate(df.loc[bottom_labels, 'reviews.text']):
  print('Review {}:\n'.format(index+1),review)

## eta_on_text_data-21.py
# Mean polarity per product name, sorted from lowest to highest
product_polarity_sorted = (
    df.groupby('name')['polarity']
      .mean()
      .sort_values(ascending=True)
      .to_frame()
)

# Horizontal bar chart with one bar per product, in the sorted order
plt.figure(figsize=(16, 8))
bar_positions = np.arange(len(product_polarity_sorted))
polarity_graph = plt.barh(bar_positions, product_polarity_sorted['polarity'], color='purple')
plt.title('Polarity of Different Amazon Product Reviews')
plt.xlabel('Polarity')
plt.ylabel('Products')

# Writing product names on bar
for bar,product in zip(polarity_graph,product_polarity_sorted.index):
	# Removing extra spaces
	df['cleaned']=df['cleaned'].apply(lambda x: re.sub(' +',' ',x))
	for index,text in enumerate(df['cleaned'][35:40]):
	print('Review %d:\n'%(index+1),text)
	# Importing spacy
	import spacy

	# Loading model
	nlp = spacy.load('en_core_web_sm',disable=['parser', 'ner'])

	# Lemmatization with stopwords removal
	df['lemmatized']=df['cleaned'].apply(lambda x: ' '.join([token.lemma_ for token in list(nlp(x)) if (token.is_stop==False)]))
	df_grouped=df[['name','lemmatized']].groupby(by='name').agg(lambda x:' '.join(x))
	df_grouped.head()
	# Creating Document Term Matrix
	from sklearn.feature_extraction.text
	import CountVectorizer
	cv=CountVectorizer(analyzer='word')
	data=cv.fit_transform(df_grouped['lemmatized'])
	df_dtm = pd.DataFrame(data.toarray(), columns=cv.get_feature_names())
	df_dtm.index=df_grouped.index
	df_dtm.head(3)
	# Importing wordcloud for plotting word clouds and textwrap for wrapping longer text
	from wordcloud import WordCloud
	from textwrap import wrap

	# Function for generating word clouds
	def generate_wordcloud(data,title):
	wc = WordCloud(width=400, height=330, max_words=150,colormap="Dark2").generate_from_frequencies(data)
	plt.figure(figsize=(10,8))
	plt.imshow(wc, interpolation='bilinear')
	plt.axis("off")
	from textblob import TextBlob
	df['polarity']=df['lemmatized'].apply(lambda x:TextBlob(x).sentiment.polarity)
	print("3 Random Reviews with Highest Polarity:")
	for index,review in enumerate(df.iloc[df['polarity'].sort_values(ascending=False)[:3].index]['reviews.text']):
	print('Review {}:\n'.format(index+1),review)
	print("3 Random Reviews with Lowest Polarity:")
	for index,review in enumerate(df.iloc[df['polarity'].sort_values(ascending=True)[:3].index]['reviews.text']):
	print('Review {}:\n'.format(index+1),review)
	product_polarity_sorted=pd.DataFrame(df.groupby('name')['polarity'].mean().sort_values(ascending=True))

	plt.figure(figsize=(16,8))
	plt.xlabel('Polarity')
	plt.ylabel('Products')
	plt.title('Polarity of Different Amazon Product Reviews')
	polarity_graph=plt.barh(np.arange(len(product_polarity_sorted.index)),product_polarity_sorted['polarity'],color='purple',)

	# Writing product names on bar
	for bar,product in zip(polarity_graph,product_polarity_sorted.index):