@yabyzq
Created April 4, 2018 07:48
document prediction
import PyPDF2
import textract  # needed for the OCR fallback below

def read_pdfs(pdf_file_name):
    with open(pdf_file_name, 'rb') as f:
        pdf = PyPDF2.PdfFileReader(f)
        num_pages = pdf.numPages
        count = 0
        text = ""
        while count < num_pages:
            pageObj = pdf.getPage(count)
            count += 1
            text += pageObj.extractText()
    # If PyPDF2 returned no text the PDF is likely a scanned image,
    # so fall back to OCR via textract/tesseract (returns bytes, hence decode).
    if text == "":
        text = textract.process(pdf_file_name, method='tesseract', language='eng').decode('utf-8')
    return text
doc2 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/train_20180630_Comparative Profit and Loss.pdf')
doc3 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/train_Corporate Family Tree 15112017.pdf')
doc4 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/train_Financial Statements - FY16 Jamclan Trust.pdf')
doc5 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/train_Financial Statements - Pre Assessment 15112017.pdf')
doc6 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/train_Professional_Indemnity_Insurance_09022018035943.pdf')
doc_complete = [doc2, doc3, doc4, doc5, doc6]
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
# nltk.download('stopwords') and nltk.download('wordnet') are required on first run
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized
doc_clean = [clean(doc).split() for doc in doc_complete]
# Importing Gensim
from gensim import corpora
import gensim
# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)
# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
Lda = gensim.models.ldamodel.LdaModel
# Running and training the LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)
print(ldamodel.print_topics(num_topics=5, num_words=5))
#ldamodel.print_topics()
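# As a quick sanity check (not part of the original snippet), the trained LDA
# model can also report the dominant topic of each cleaned document via
# gensim's get_document_topics; a minimal sketch:
for i, bow in enumerate(doc_term_matrix):
    topics = ldamodel.get_document_topics(bow)
    dominant = max(topics, key=lambda t: t[1])
    print("doc {}: topic {} (p={:.2f})".format(i, dominant[0], dominant[1]))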
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import numpy as np
doc_target = [0, 1, 0, 0, 2]  # 0 - financial, 1 - family, 2 - insurance
#adding test data
doc1 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/test_20170630_Consolidated Financials.pdf')
doc9 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/test_Corporate_Family_Tree_09022018035933.pdf')
doc7 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/test_Financial_Statements__Pre_Assessment_09022018035933.pdf')
doc8 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/test_Professional Indemnity Insurance.pdf')
doc_test = [doc1, doc9, doc7, doc8]
doc_test_target =[0,1,0,2]
from sklearn.linear_model import SGDClassifier #SVM
text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words='english')),
                         ('tfidf', TfidfTransformer()),
                         # note: newer scikit-learn releases use max_iter instead of n_iter
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42))])
_ = text_clf_svm.fit(doc_complete, doc_target)
predicted_svm = text_clf_svm.predict(doc_complete)
print(np.mean(predicted_svm == doc_target))
print(predicted_svm)
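# The block above scores the SVM pipeline on its own training documents; a
# minimal sketch (assuming doc_test/doc_test_target defined above are the
# intended hold-out set) of the same check on the unseen test PDFs:
predicted_svm_test = text_clf_svm.predict(doc_test)
print(np.mean(predicted_svm_test == doc_test_target))
print(predicted_svm_test)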
import nltk
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: [stemmer.stem(w) for w in analyzer(doc)]
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')
text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect),
                             ('tfidf', TfidfTransformer()),
                             ('mnb', MultinomialNB(fit_prior=False))])
text_mnb_stemmed = text_mnb_stemmed.fit(doc_complete, doc_target)
predicted_mnb_stemmed = text_mnb_stemmed.predict(doc_test)
print(np.mean(predicted_mnb_stemmed == doc_test_target))
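# Optional, not in the original gist: a per-class breakdown of the stemmed
# Naive Bayes predictions on the test documents, using scikit-learn's
# classification_report (class ids follow the 0/1/2 labels defined above).
from sklearn.metrics import classification_report
print(classification_report(doc_test_target, predicted_mnb_stemmed))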
import pandas as pd
df = pd.DataFrame(data={'doc': doc_complete, 'target': doc_target})
category_id_df = df[['doc', 'target']].drop_duplicates().sort_values('target')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['target', 'doc']].values)
df
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=1, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(df.doc).toarray()
labels = df.target
features.shape
from sklearn.feature_selection import chi2
N = 2
for Product, category_id in sorted(category_to_id.items()):
    features_chi2 = chi2(features, labels == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]  # use get_feature_names_out() on newer scikit-learn
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("\n# '{}':".format(Product[0:50]))
    print(" . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print(" . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))