Skip to content

Instantly share code, notes, and snippets.

@getgimphed
Last active May 18, 2020 20:34
Show Gist options
  • Save getgimphed/d91aeb3c81dde77832264f924b6d80d7 to your computer and use it in GitHub Desktop.
Save getgimphed/d91aeb3c81dde77832264f924b6d80d7 to your computer and use it in GitHub Desktop.
# Importing the libraries
import os
import pickle
import re

import docx2txt
# Newspaper codes -- TH: The Hindu, TOI: Times of India, IE: Indian Express,
# HT: Hindustan Times.
# Extract the text of each paper's .docx file with docx2txt, processing the
# files in the order TH, TOI, IE, HT.
THtext, TOItext, IEtext, HTtext = map(
    docx2txt.process,
    ("TH.docx", "TOI.docx", "IE.docx", "HT.docx"),
)
# Pulling the images into per-newspaper folders under ./NLP_ExtractImages.
# docx2txt.process() writes the embedded images into the given directory as a
# side effect and returns the document text, so the *images names below hold
# text rather than image data.
# The target folders are created up front because docx2txt writes the image
# files directly into img_dir, and os.path.join keeps the paths portable
# instead of hard-coding Windows "\\" separators.
image_root = os.path.join(os.getcwd(), "NLP_ExtractImages")
for paper in ("TH", "TOI", "IE", "HT"):
    os.makedirs(os.path.join(image_root, paper), exist_ok=True)
THimages = docx2txt.process("TH.docx", os.path.join(image_root, "TH"))
TOIimages = docx2txt.process("TOI.docx", os.path.join(image_root, "TOI"))
IEimages = docx2txt.process("IE.docx", os.path.join(image_root, "IE"))
HTimages = docx2txt.process("HT.docx", os.path.join(image_root, "HT"))
# NLTK setup for the text-cleaning steps further down
import nltk
# Download the 'stopwords' resource so stopwords.words('english') can be used below
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# RegEx to collect every number (integer or decimal) printed in each paper.
# A raw string is used because "\d" in a normal string literal is an invalid
# escape sequence. The fractional part is matched only when digits follow the
# dot, so a sentence-ending number such as "in 2020." yields "2020" rather
# than "2020.".
regex = re.compile(r'\d+(?:\.\d+)?')
THStats = regex.findall(THtext)
TOIStats = regex.findall(TOItext)
IEStats = regex.findall(IEtext)
HTStats = regex.findall(HTtext)
# One list of number strings per newspaper, in the order TH, TOI, IE, HT.
Stats = [THStats, TOIStats, IEStats, HTStats]
# Persist the per-newspaper number lists. The with-block closes the file even
# if pickling raises, which the manual open()/close() pair did not guarantee.
with open("stats.pkl", "wb") as file:
    pickle.dump(Stats, file)
def _clean_text(text, stemmer, stop_words):
    """Return *text* reduced to a space-separated string of word stems.

    The text is lower-cased, every non-letter character is replaced by a
    space, and the result is split into words. Words consisting of a single
    distinct character (e.g. "a", "aa") and words found in *stop_words* are
    dropped; the remaining words are stemmed with *stemmer*.

    Args:
        text: Raw document text.
        stemmer: Object exposing ``stem(word)`` (here a ``PorterStemmer``).
        stop_words: Container of lower-case words to discard.

    Returns:
        The cleaned words joined by single spaces.
    """
    words = re.sub('[^a-zA-Z]', ' ', text.lower()).split()
    return ' '.join(
        stemmer.stem(word)
        for word in words
        if len(set(word)) != 1 and word not in stop_words
    )


ps = PorterStemmer()
# Build the stop-word set once: previously it was rebuilt from the NLTK corpus
# for every single word inside each comprehension.
english_stopwords = set(stopwords.words('english'))
THtext = _clean_text(THtext, ps, english_stopwords)
TOItext = _clean_text(TOItext, ps, english_stopwords)
IEtext = _clean_text(IEtext, ps, english_stopwords)
HTtext = _clean_text(HTtext, ps, english_stopwords)
# One cleaned document per newspaper, in the order TH, TOI, IE, HT.
corpus = [THtext, TOItext, IEtext, HTtext]
# Persist the cleaned corpus. The with-block closes the file even if pickling
# raises, which the manual open()/close() pair did not guarantee.
with open("corpus.pkl", "wb") as file:
    pickle.dump(corpus, file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment