# Evaluating 4 Indian English newspapers (issue of 10 May 2020) on:
##  Vocabulary: number of unique words, in total and per page
##  Factual presentation: count of numeric figures used
##  Sentiment analysis: subjectivity of the text
##  Graphic content / images (the extracted image files need preprocessing)
## Visualising each of the above
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import re
import pickle
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
with open('corpus.pkl', 'rb') as file:
    corpus = pickle.load(file)
newspapers = ['The Hindu','Times Of India','Indian Express','Hindustan Times']
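# corpus.pkl is a pre-built artifact (how it was scraped and cleaned is not shown here);
# it is assumed to hold one full-text string per newspaper, in the same order as `newspapers`.
assert len(corpus) == len(newspapers)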
cv = CountVectorizer(stop_words = 'english',ngram_range = (1,1) )
docTermMatrix = cv.fit_transform(corpus).toarray()
data_dtm = pd.DataFrame(docTermMatrix,columns = cv.get_feature_names())
data_dtm.index = pd.Index(newspapers)
data_dtm = data_dtm.transpose()
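# Quick peek at the document-term matrix: one row per term, one column per newspaper
print(data_dtm.shape)
print(data_dtm.head())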
# Checking out top 30 words for all newspapers
top_dict = {}
for c in data_dtm.columns:
    top = data_dtm[c].sort_values(ascending=False).head(30)
    top_dict[c] = list(zip(top.index, top.values))
# Collecting the top words from every newspaper to see which occur across all of them
words = []
for newspaper in data_dtm.columns:
    top = [word for (word, count) in top_dict[newspaper]]
    for t in top:
        words.append(t)
from collections import Counter
print(Counter(words).most_common())
# Adding words that appear in all 4 newspapers' top-30 lists to the stop-word list
new_stop_words = [word for (word,count) in Counter(words).most_common() if count > 3]
stop_words = text.ENGLISH_STOP_WORDS.union(new_stop_words)
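# Inspect which words ended up in the extended stop-word list (contents depend on the corpus)
print(new_stop_words)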
cv = CountVectorizer(stop_words = stop_words,ngram_range = (1,1) )
docTermMatrix = cv.fit_transform(corpus).toarray()
data_stop = pd.DataFrame(docTermMatrix,columns = cv.get_feature_names())
data_stop.index = pd.Index(newspapers)
# Visualising top words as word clouds
from wordcloud import WordCloud
wc = WordCloud(stopwords = stop_words, max_words=200, background_color = 'white', colormap = 'Dark2', max_font_size= 150, random_state=0)
plt.rcParams['figure.figsize'] = [16,6]
for i, newspaper in enumerate(data_dtm.columns):
    top = data_dtm[newspaper].sort_values(ascending=False).head(100)
    listOfWords = [word for word in top.index]
    wc.generate(' '.join(listOfWords))
    plt.subplot(3, 4, i+1)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(newspaper)
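# Note: generate() above treats each of the 100 words as equally frequent; to weight by the
# actual counts one could call wc.generate_from_frequencies(top.to_dict()) instead (a sketch,
# not what the original does). An explicit show() keeps this consistent with the bar charts
# below when the script is run outside a notebook.
plt.tight_layout()
plt.show()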
# Getting unique words / Vocabulary
unique_list = []
for newspaper in data_dtm.columns:
    uniques = data_dtm[newspaper].to_numpy().nonzero()[0].size
    unique_list.append(uniques)
unique_words = pd.DataFrame(list(zip(newspapers,unique_list)),columns = ['newspaper','unique_word'])
#unique_words= unique_words.sort_values('unique_word',ascending = False)
# Number of pages per newspaper, counted manually
NoOfPages = [ ['The Hindu',22], ['Times Of India',18], ['Indian Express',18],["Hindustan Times",16] ]
NoOfPages = pd.DataFrame(NoOfPages, columns = ['Newspaper','PageCount'])
NoOfPages = NoOfPages.transpose()
# Unique words per page
WPP = []
for i, j in enumerate(NoOfPages):
    WPP.append(int(unique_words.unique_word[i] / NoOfPages[i].PageCount))
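# Unique-word counts normalised by page count, in the same order as `newspapers`
print(list(zip(newspapers, WPP)))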
# Plotting Total Words
X = np.arange(4)
plt.barh(X, unique_words.unique_word , align= 'center', alpha = 0.5)
plt.yticks(X,newspapers)
plt.xlabel("Unique Words")
plt.title('Total Unique Words')
plt.show()
# Plotting Words per Page
plt.barh(X, WPP , align= 'center', alpha = 0.5)
plt.yticks(X,newspapers)
plt.xlabel('Unique words per page')
plt.title('Words per page')
plt.show()
# Plotting count of numeric figures used per newspaper
with open('stats.pkl', 'rb') as file:
    stats = pickle.load(file)
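# stats.pkl is also a pre-built artifact; it is assumed to hold, per newspaper, the list of
# numeric figures found in its text. A minimal sketch of how such lists could be extracted
# from the corpus with the `re` module imported above (an assumption, not the original step):
stats_from_corpus = [re.findall(r'\d+(?:,\d+)*(?:\.\d+)?', doc) for doc in corpus]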
statsLen = [len(li) for li in stats ]
barlist = plt.barh(X, statsLen , align= 'center', alpha = 0.5)
barlist[0].set_color('0.4')
barlist[1].set_color('r')
barlist[2].set_color('b')
barlist[3].set_color('g')
plt.yticks(X,newspapers)
plt.xlabel('Count of numeric figures')
plt.title('Numeric Figures used')
plt.show()
# Plotting Sentiment Analysis
from textblob import TextBlob
sentiment = []
for i in np.arange(4):
    sentiment.append(TextBlob(corpus[i]).subjectivity)
plt.scatter(X,sentiment,linewidths=5)
plt.xticks(X,newspapers)
plt.ylabel("<--Facts-----------------Opininios-->")
plt.title("Subjectivity Graph")
plt.show()
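# TextBlob also exposes a polarity score (-1 = negative .. +1 = positive); computing it the
# same way is an optional extra here, not part of the original analysis above.
polarity = [TextBlob(doc).polarity for doc in corpus]
print(list(zip(newspapers, polarity)))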
# Calculating and Plotting Images Count
imagesCount = []
BasePath = os.path.join(os.getcwd(), "NLP_ExtractImages")
paths = [os.path.join(BasePath, sub) for sub in ("TH", "TOI", "IE", "HT")]
for path in paths:
    counter = 0
    for entry in os.scandir(path):
        size = entry.stat().st_size
        # Count only files larger than 5 KB as significant images (skips icons and artifacts)
        if size > 5000:
            counter += 1
    imagesCount.append(counter)
barlist = plt.bar(X, imagesCount , align= 'center', alpha = 0.5)
barlist[0].set_color('0.4')
barlist[1].set_color('r')
barlist[2].set_color('b')
barlist[3].set_color('g')
plt.xticks(X,newspapers)
plt.ylabel('No of Significant Images')
plt.title('No of Significant Images')
plt.show()