@yabyzq
Created April 4, 2018 07:48
document prediction
import PyPDF2
import textract  # needed for the OCR fallback below

def read_pdfs(pdf_file_name):
    with open(pdf_file_name, 'rb') as f:
        pdf = PyPDF2.PdfFileReader(f)
        num_pages = pdf.numPages
        count = 0
        text = ""
        while count < num_pages:
            pageObj = pdf.getPage(count)
            count += 1
            text += pageObj.extractText()
    # If PyPDF2 returned no text the PDF is likely a scanned image,
    # so fall back to OCR via textract/tesseract (returns bytes, hence decode).
    if text == "":
        text = textract.process(pdf_file_name, method='tesseract', language='eng').decode('utf-8')
    return text
doc2 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/train_20180630_Comparative Profit and Loss.pdf')
doc3 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/train_Corporate Family Tree 15112017.pdf')
doc4 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/train_Financial Statements - FY16 Jamclan Trust.pdf')
doc5 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/train_Financial Statements - Pre Assessment 15112017.pdf')
doc6 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/train_Professional_Indemnity_Insurance_09022018035943.pdf')
doc_complete = [doc2, doc3, doc4, doc5, doc6]
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
# nltk.download('stopwords') and nltk.download('wordnet') are required on first run
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized
doc_clean = [clean(doc).split() for doc in doc_complete]
# Importing Gensim
from gensim import corpora
import gensim
# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)
# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
Lda = gensim.models.ldamodel.LdaModel
# Running and training the LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)
print(ldamodel.print_topics(num_topics=5, num_words=5))
#ldamodel.print_topics()
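# As a quick sanity check (not part of the original snippet), the trained LDA
# model can also report the dominant topic of each cleaned document via
# gensim's get_document_topics; a minimal sketch:
for i, bow in enumerate(doc_term_matrix):
    topics = ldamodel.get_document_topics(bow)
    dominant = max(topics, key=lambda t: t[1])
    print("doc {}: topic {} (p={:.2f})".format(i, dominant[0], dominant[1]))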
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import numpy as np
doc_target = [0, 1, 0, 0, 2]  # 0 - financial, 1 - family, 2 - insurance
#adding test data
doc1 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/test_20170630_Consolidated Financials.pdf')
doc9 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/test_Corporate_Family_Tree_09022018035933.pdf')
doc7 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/test_Financial_Statements__Pre_Assessment_09022018035933.pdf')
doc8 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/test_Professional Indemnity Insurance.pdf')
doc_test = [doc1, doc9, doc7, doc8]
doc_test_target =[0,1,0,2]
from sklearn.linear_model import SGDClassifier #SVM
text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words='english')),
                         ('tfidf', TfidfTransformer()),
                         # note: newer scikit-learn releases use max_iter instead of n_iter
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42))])
_ = text_clf_svm.fit(doc_complete, doc_target)
predicted_svm = text_clf_svm.predict(doc_complete)
print(np.mean(predicted_svm == doc_target))
print(predicted_svm)
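# The block above scores the SVM pipeline on its own training documents; a
# minimal sketch (assuming doc_test/doc_test_target defined above are the
# intended hold-out set) of the same check on the unseen test PDFs:
predicted_svm_test = text_clf_svm.predict(doc_test)
print(np.mean(predicted_svm_test == doc_test_target))
print(predicted_svm_test)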
import nltk
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: [stemmer.stem(w) for w in analyzer(doc)]
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')
text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect),
                             ('tfidf', TfidfTransformer()),
                             ('mnb', MultinomialNB(fit_prior=False))])
text_mnb_stemmed = text_mnb_stemmed.fit(doc_complete, doc_target)
predicted_mnb_stemmed = text_mnb_stemmed.predict(doc_test)
print(np.mean(predicted_mnb_stemmed == doc_test_target))
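# Optional, not in the original gist: a per-class breakdown of the stemmed
# Naive Bayes predictions on the test documents, using scikit-learn's
# classification_report (class ids follow the 0/1/2 labels defined above).
from sklearn.metrics import classification_report
print(classification_report(doc_test_target, predicted_mnb_stemmed))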
import pandas as pd
df = pd.DataFrame(data={'doc': doc_complete, 'target': doc_target})
category_id_df = df[['doc', 'target']].drop_duplicates().sort_values('target')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['target', 'doc']].values)
df
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=1, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(df.doc).toarray()
labels = df.target
features.shape
from sklearn.feature_selection import chi2
N = 2
for Product, category_id in sorted(category_to_id.items()):
    features_chi2 = chi2(features, labels == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]  # use get_feature_names_out() on newer scikit-learn
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("\n# '{}':".format(Product[0:50]))
    print(" . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print(" . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))