# Evaluating 4 Indian English newspapers (issue of 10 May 2020) on:
##  Vocabulary: number of unique words, in total and per page
##  Factual presentation: count of numeric figures used
##  Sentiment analysis: subjectivity of the text
##  Graphic content / images (the extracted image files need preprocessing)
## Visualising each of the above
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import re
import pickle
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
with open('corpus.pkl', 'rb') as file:
    corpus = pickle.load(file)
newspapers = ['The Hindu','Times Of India','Indian Express','Hindustan Times']
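# corpus.pkl is a pre-built artifact (how it was scraped and cleaned is not shown here);
# it is assumed to hold one full-text string per newspaper, in the same order as `newspapers`.
assert len(corpus) == len(newspapers)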
cv = CountVectorizer(stop_words = 'english',ngram_range = (1,1) )
docTermMatrix = cv.fit_transform(corpus).toarray()
data_dtm = pd.DataFrame(docTermMatrix,columns = cv.get_feature_names())
data_dtm.index = pd.Index(newspapers)
data_dtm = data_dtm.transpose()
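# Quick peek at the document-term matrix: one row per term, one column per newspaper
print(data_dtm.shape)
print(data_dtm.head())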
# Checking out top 30 words for all newspapers
top_dict = {}
for c in data_dtm.columns:
    top = data_dtm[c].sort_values(ascending=False).head(30)
    top_dict[c] = list(zip(top.index, top.values))
# Collecting the top words from every newspaper to see which occur across all of them
words = []
for newspaper in data_dtm.columns:
    top = [word for (word, count) in top_dict[newspaper]]
    for t in top:
        words.append(t)
from collections import Counter
print(Counter(words).most_common())
# Adding words that appear in all 4 newspapers' top-30 lists to the stop-word list
new_stop_words = [word for (word,count) in Counter(words).most_common() if count > 3]
stop_words = text.ENGLISH_STOP_WORDS.union(new_stop_words)
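# Inspect which words ended up in the extended stop-word list (contents depend on the corpus)
print(new_stop_words)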
cv = CountVectorizer(stop_words = stop_words,ngram_range = (1,1) )
docTermMatrix = cv.fit_transform(corpus).toarray()
data_stop = pd.DataFrame(docTermMatrix,columns = cv.get_feature_names())
data_stop.index = pd.Index(newspapers)
# Visualising top words as word clouds
from wordcloud import WordCloud
wc = WordCloud(stopwords = stop_words, max_words=200, background_color = 'white', colormap = 'Dark2', max_font_size= 150, random_state=0)
plt.rcParams['figure.figsize'] = [16,6]
for i, newspaper in enumerate(data_dtm.columns):
    top = data_dtm[newspaper].sort_values(ascending=False).head(100)
    listOfWords = [word for word in top.index]
    wc.generate(' '.join(listOfWords))
    plt.subplot(3, 4, i+1)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(newspaper)
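# Note: generate() above treats each of the 100 words as equally frequent; to weight by the
# actual counts one could call wc.generate_from_frequencies(top.to_dict()) instead (a sketch,
# not what the original does). An explicit show() keeps this consistent with the bar charts
# below when the script is run outside a notebook.
plt.tight_layout()
plt.show()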
# Getting unique words / Vocabulary
unique_list = []
for newspaper in data_dtm.columns:
    uniques = data_dtm[newspaper].to_numpy().nonzero()[0].size
    unique_list.append(uniques)
unique_words = pd.DataFrame(list(zip(newspapers,unique_list)),columns = ['newspaper','unique_word'])
#unique_words= unique_words.sort_values('unique_word',ascending = False)
# Number of pages per newspaper, counted manually
NoOfPages = [ ['The Hindu',22], ['Times Of India',18], ['Indian Express',18],["Hindustan Times",16] ]
NoOfPages = pd.DataFrame(NoOfPages, columns = ['Newspaper','PageCount'])
NoOfPages = NoOfPages.transpose()
# Unique words per page
WPP = []
for i, j in enumerate(NoOfPages):
    WPP.append(int(unique_words.unique_word[i] / NoOfPages[i].PageCount))
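# Unique-word counts normalised by page count, in the same order as `newspapers`
print(list(zip(newspapers, WPP)))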
# Plotting Total Words
X = np.arange(4)
plt.barh(X, unique_words.unique_word , align= 'center', alpha = 0.5)
plt.yticks(X,newspapers)
plt.xlabel("Unique Words")
plt.title('Total Unique Words')
plt.show()
# Plotting Words per Page
plt.barh(X, WPP , align= 'center', alpha = 0.5)
plt.yticks(X,newspapers)
plt.xlabel('Unique words per page')
plt.title('Words per page')
plt.show()
# Plotting count of numeric figures used per newspaper
with open('stats.pkl', 'rb') as file:
    stats = pickle.load(file)
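# stats.pkl is also a pre-built artifact; it is assumed to hold, per newspaper, the list of
# numeric figures found in its text. A minimal sketch of how such lists could be extracted
# from the corpus with the `re` module imported above (an assumption, not the original step):
stats_from_corpus = [re.findall(r'\d+(?:,\d+)*(?:\.\d+)?', doc) for doc in corpus]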
statsLen = [len(li) for li in stats ]
barlist = plt.barh(X, statsLen , align= 'center', alpha = 0.5)
barlist[0].set_color('0.4')
barlist[1].set_color('r')
barlist[2].set_color('b')
barlist[3].set_color('g')
plt.yticks(X,newspapers)
plt.xlabel('Count of numeric figures')
plt.title('Numeric Figures used')
plt.show()
# Plotting Sentiment Analysis
from textblob import TextBlob
sentiment = []
for i in np.arange(4):
    sentiment.append(TextBlob(corpus[i]).subjectivity)
plt.scatter(X,sentiment,linewidths=5)
plt.xticks(X,newspapers)
plt.ylabel("<--Facts-----------------Opininios-->")
plt.title("Subjectivity Graph")
plt.show()
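# TextBlob also exposes a polarity score (-1 = negative .. +1 = positive); computing it the
# same way is an optional extra here, not part of the original analysis above.
polarity = [TextBlob(doc).polarity for doc in corpus]
print(list(zip(newspapers, polarity)))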
# Calculating and Plotting Images Count
imagesCount = []
BasePath = os.path.join(os.getcwd(), "NLP_ExtractImages")
paths = [os.path.join(BasePath, sub) for sub in ("TH", "TOI", "IE", "HT")]
for path in paths:
    counter = 0
    for entry in os.scandir(path):
        size = entry.stat().st_size
        # Count only files larger than 5 KB as significant images (skips icons and artifacts)
        if size > 5000:
            counter += 1
    imagesCount.append(counter)
barlist = plt.bar(X, imagesCount , align= 'center', alpha = 0.5)
barlist[0].set_color('0.4')
barlist[1].set_color('r')
barlist[2].set_color('b')
barlist[3].set_color('g')
plt.xticks(X,newspapers)
plt.ylabel('No of Significant Images')
plt.title('No of Significant Images')
plt.show()