-
-
Save CaptainAshis/d813b46fe69bb753b49c2a41057bb042 to your computer and use it in GitHub Desktop.
Python code- Abzooba
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
import matplotlib.pyplot as plt | |
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, plot_precision_recall_curve, plot_roc_curve | |
import logging | |
import pickle | |
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV | |
from sklearn.neighbors import KNeighborsClassifier | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.svm import SVC | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.model_selection import train_test_split | |
from statsmodels.stats.outliers_influence import variance_inflation_factor | |
import statsmodels.api as sm | |
import seaborn as sns | |
import pandas as pd | |
import numpy as np | |
import os | |
# Import-time side effect: switch the working directory to './abzooba'
# (resolved against the directory the interpreter was started from). Every
# relative path used later in this module -- the 'data.log' log file and the
# pickled models, for example -- is resolved against the working directory.
os.chdir('./abzooba')
class Classification_Model():
    """Pipeline for training classifiers that predict the 'Adherence' label.

    The methods chain into each other: ``train_test_split`` calls
    ``data_preprocessing``, which calls ``drop_columns``, which calls
    ``read_data``.  ``algorithm`` supplies a hyper-parameter search space and
    ``model_train`` runs the search, then prints, pickles and plots the
    outcome.

    Attributes set along the way:
        clf_and_params: dict mapping estimator instances to search spaces.
        train_data, test_data: the frames most recently loaded by
            ``read_data``.
        X_train, X_test, y_train, y_test: set by ``train_test_split``.
        current_clf_name, results, Y_pred, Y_pred_prob: set by
            ``model_train``.
    """

    def __init__(self):
        """Configure file logging ('data.log') and create empty state."""
        logging.basicConfig(
            level=logging.DEBUG,
            filename='data.log',
            filemode='w',
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            datefmt='%d/%m/%Y %H:%M:%S')
        print("Classifciation model initialized")
        print()
        self.clf_and_params = dict()
        self.train_data = pd.DataFrame()
        self.test_data = pd.DataFrame()

    def read_data(self, train_data, test_data):
        """Load the training and test CSVs into ``self.train_data``/``self.test_data``.

        Args:
            train_data: path or file-like object accepted by ``pd.read_csv``.
            test_data: path or file-like object accepted by ``pd.read_csv``.

        Returns:
            Tuple ``(train_frame, test_frame)``.

        Raises:
            Exception: whatever ``pd.read_csv`` raises, after it is logged.
        """
        try:
            self.train_data = pd.read_csv(train_data)
            self.test_data = pd.read_csv(test_data)
        except Exception:
            logging.exception('Exception occured while reading data')
            raise
        print(f"Column names are {self.train_data.columns.tolist()} \n")
        print(f"# of Columns are {len(self.train_data.columns)}")
        n_train_cols = len(self.train_data.columns)
        n_test_cols = len(self.test_data.columns)
        # NOTE(review): the previous check asserted train + 1 == test, but
        # drop_columns removes 'Adherence' from the training frame only, which
        # suggests the training file carries the extra column.  Confirm
        # against the real CSVs.  A mismatch stays non-fatal, as before.
        if n_train_cols != n_test_cols + 1:
            logging.warning(
                'Unexpected column counts: train=%d, test=%d '
                '(expected train to have exactly one more column)',
                n_train_cols, n_test_cols)
        return self.train_data, self.test_data

    def exploratory_data(self, train_data, test_data):
        """Print dtypes, missing-value counts and ``describe()`` of the training data.

        Best effort: any failure is logged with its traceback and the method
        returns ``None`` without raising.
        """
        try:
            self.train_data, self.test_data = self.read_data(
                train_data, test_data)
            overview = pd.DataFrame(self.train_data.dtypes)
            overview['Missing_values'] = self.train_data.isna().sum().values
            overview.rename(columns={0: 'data types'}, inplace=True)
            print("=" * 60)
            print(overview)
            print("=" * 60)
            print(
                " ========================== Data Stats ======================================")
            print(self.train_data.describe())
            print(
                "==============================================================================")
        except Exception:
            logging.exception('Exception occured in exploratory data format')

    def drop_columns(self, train_data, test_data):
        """Load the data and drop the identifier/target columns.

        'patient_id' and 'Adherence' are removed from the training frame and
        'patient_id' from the test frame.  The drops are done in place on the
        frames returned by ``read_data``, so ``self.train_data`` and
        ``self.test_data`` are affected as well.

        Returns:
            Tuple ``(train_features, test_features, train_copy)`` where
            ``train_copy`` is a copy of the training frame taken before the
            drop (it still contains 'Adherence').

        Raises:
            Exception: any loading or column error, after it is logged.
        """
        try:
            train_data, test_data = self.read_data(train_data, test_data)
            train_data1 = train_data.copy()
            train_data.drop(['patient_id', 'Adherence'], axis=1, inplace=True)
            test_data.drop(['patient_id'], axis=1, inplace=True)
            print(
                f"Number of columns in training data after dropping columns {len(train_data.columns)}")
            return train_data, test_data, train_data1
        except Exception:
            logging.exception('Exception occured in dropping columns method')
            raise

    def visualisation(self, train_data, test_data):
        """Draw one bar chart of 'Adherence' counts per selected column.

        The selected columns are those at positions 2..8 of the training
        frame, minus the second one of that slice.
        """
        train_data, test_data = self.read_data(train_data, test_data)
        list_of_cols = train_data.columns.tolist()[2:9]
        list_of_cols.pop(1)
        for names in list_of_cols:
            print()
            counts = (
                train_data.groupby([names, 'Adherence'])['patient_id']
                .count()
                .reset_index()
                .rename(columns={'patient_id': 'counts'}))
            sns.catplot(
                y='counts',
                x='Adherence',
                hue=names,
                data=counts,
                kind='bar')
            plt.title(f"Distribution of Adherence wrt to {names}")

    def data_preprocessing(self, train_data, test_data):
        """One-hot encode the training features and keep those with VIF < 5.

        Returns:
            Tuple ``(features, train_copy)``: ``features`` is an independent
            copy of the dummy-encoded columns whose variance inflation factor
            is below 5; ``train_copy`` is the untouched training frame from
            ``drop_columns``.

        Raises:
            Exception: any failure, after it is logged.
        """
        try:
            train_data, test_data, train_data1 = self.drop_columns(
                train_data, test_data)
            df_train1 = pd.get_dummies(train_data, drop_first=True)
            print(df_train1.columns)
            correln_matrix = df_train1.corr()
            print(correln_matrix)
            # Calculating VIF; the design matrix is extracted once instead of
            # once per column.
            design = df_train1.values
            vif = pd.DataFrame()
            vif["variables"] = df_train1.columns
            vif["VIF"] = [
                variance_inflation_factor(design, position)
                for position in range(design.shape[1])]
            valid_variables = vif.loc[vif['VIF'] < 5, 'variables'].tolist()
        except Exception:
            logging.exception('Exception occured in data_preprocessing')
            raise
        # .copy() so callers can modify the result without touching (or
        # getting chained-assignment warnings about) df_train1.
        return df_train1[valid_variables].copy(), train_data1

    @staticmethod
    def _encode_adherence(labels):
        """Map an 'Adherence' Series to integers: 'No' -> 0, anything else -> 1."""
        return (labels != 'No').astype(int)

    def train_test_split(self, train_data, test_data, test_size=0.2,
                         random_state=None):
        """Preprocess the data and split it into train/validation parts.

        Args:
            train_data: training CSV path (forwarded to ``data_preprocessing``).
            test_data: test CSV path (forwarded to ``data_preprocessing``).
            test_size: fraction held out for validation (default 0.2).
            random_state: optional seed for a reproducible split.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``; the same values are
            stored on ``self``.

        Raises:
            Exception: any failure, after it is logged.
        """
        try:
            features, train_data1 = self.data_preprocessing(
                train_data, test_data)
            # The encoded label is kept OUT of the feature frame; including
            # it there would leak the answer into every model.
            target = self._encode_adherence(
                train_data1['Adherence']).rename('Adherence_new')
            # Inside a method the bare name resolves to the module-level
            # train_test_split function imported by this file, not to this
            # method.
            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
                features, target, test_size=test_size,
                random_state=random_state)
        except Exception:
            logging.exception('Exception occured in train_test_split')
            raise
        return self.X_train, self.X_test, self.y_train, self.y_test

    @staticmethod
    def _param_grids():
        """Return the hyper-parameter search spaces keyed by estimator class name."""
        # NOTE(review): 'l1' is not supported by every LogisticRegression
        # solver, and 'auto' for max_features is version dependent -- check
        # against the installed scikit-learn.
        return {
            'KNeighborsClassifier': {
                'n_neighbors': [5, 7, 9, 11, 13, 15],
                'leaf_size': [1, 2, 3, 5],
                'weights': ['uniform', 'distance'],
            },
            'LogisticRegression': {
                'penalty': ['l1', 'l2'],
                'C': np.logspace(0, 4, 10),
            },
            'SVC': [
                {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
                {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                 'kernel': ['rbf']},
            ],
            'DecisionTreeClassifier': {
                'max_features': ['auto', 'sqrt', 'log2'],
                'min_samples_split': list(range(2, 16)),
                'min_samples_leaf': [1],
                'random_state': [123],
            },
            'RandomForestClassifier': {
                'max_depth': [10, 15, 20],
                'n_estimators': [2, 4],
                'max_features': ['sqrt', 'auto'],
            },
        }

    def algorithm(self, clf):
        """Return ``clf`` together with the search space for its class.

        The search space is looked up by ``type(clf).__name__`` and recorded
        in ``self.clf_and_params``.  When the class has no registered search
        space, a fresh ``RandomForestClassifier`` and its search space are
        returned instead (this is what the method historically returned for
        every input) and a warning is logged.

        Args:
            clf: an estimator instance.

        Returns:
            Tuple ``(estimator, params)``.
        """
        grids = self._param_grids()
        clf_name = type(clf).__name__
        params = grids.get(clf_name)
        if params is None:
            logging.warning(
                'No parameter grid registered for %s; '
                'falling back to RandomForestClassifier', clf_name)
            clf = RandomForestClassifier()
            params = grids['RandomForestClassifier']
        self.clf_and_params[clf] = params
        return clf, params

    def model_train(self, clf, params):
        """Run a 5-fold randomized search for ``clf`` and report the outcome.

        Requires ``train_test_split`` to have been called first, because it
        reads ``self.X_train``, ``self.X_test``, ``self.y_train`` and
        ``self.y_test``.  After fitting, the training accuracy is printed,
        the fitted search object is pickled and evaluation output is shown.

        Args:
            clf: estimator instance to tune.
            params: search space passed to ``RandomizedSearchCV``.

        Raises:
            Exception: any fitting/prediction failure, after it is logged.
        """
        try:
            self.results = {}
            self.current_clf_name = clf.__class__.__name__
            grid_search_clf = RandomizedSearchCV(clf, params, cv=5)
            grid_search_clf.fit(self.X_train, self.y_train)
            self.Y_pred = grid_search_clf.predict(self.X_test)
            # Not every estimator exposes predict_proba; store None rather
            # than aborting the whole run when it is unavailable.
            if hasattr(grid_search_clf, 'predict_proba'):
                self.Y_pred_prob = grid_search_clf.predict_proba(self.X_test)
            else:
                self.Y_pred_prob = None
            clf_train_acc = round(
                grid_search_clf.score(self.X_train, self.y_train) * 100, 2)
            print(
                self.current_clf_name,
                " trained and used for prediction on test data...")
            self.results[self.current_clf_name] = clf_train_acc
            self.show_result()
            self.save_model(grid_search_clf)
            self.output_stats(grid_search_clf)
        except Exception:
            logging.exception('Exception occured in model train method')
            raise

    def show_result(self):
        """Print the training accuracy stored in ``self.results`` (best effort)."""
        try:
            for clf_name, train_acc in self.results.items():
                print(f"{clf_name} train accuracy is {train_acc:.3f}")
        except Exception:
            logging.exception('Exception occured in show result method')

    def save_model(self, grid_search_clf):
        """Pickle ``grid_search_clf`` to '<classifier name>_finalized_model.pickle'.

        Best effort: a failure is logged with its traceback and not raised.
        """
        try:
            # save the model to disk
            filename = f'{self.current_clf_name}_finalized_model.pickle'
            with open(filename, 'wb') as handle:
                pickle.dump(grid_search_clf, handle)
            print(f"Pickling completed for {filename}")
        except Exception:
            logging.exception('Exception occured in save model method')

    def output_stats(self, grid_search_clf):
        """Print the confusion matrix and classification report, then plot.

        Uses ``self.X_test``, ``self.y_test`` and ``self.Y_pred``.  Best
        effort: a failure is logged with its traceback and not raised.
        """
        try:
            print("=== Confusion Matrix ===")
            print(confusion_matrix(self.y_test, self.Y_pred))
            print('\n')
            print("=== Classification Report ===")
            print(classification_report(self.y_test, self.Y_pred))
            print()
            plot_confusion_matrix(grid_search_clf, self.X_test, self.y_test)
            plt.show()
            plot_precision_recall_curve(
                grid_search_clf, self.X_test, self.y_test)
            plot_roc_curve(grid_search_clf, self.X_test, self.y_test)
            # Display the precision-recall and ROC figures as well.
            plt.show()
        except Exception:
            logging.exception('Exception occured in output_stats method')
# Driver: read the dataset locations from the config file, split the data,
# then tune and evaluate a random forest.
# NOTE(review): the module already changed into './abzooba' right after the
# imports, so this relative path resolves to './abzooba/abz' from the launch
# directory -- confirm that is the intended location.
os.chdir('./abz')
filename = 'external.config'
# Read inside a context manager so the file handle is closed promptly.
with open(filename) as config_file:
    contents = config_file.read()
# SECURITY: eval() executes whatever Python the config file contains. Only
# run this against a trusted file; if the config is a plain literal,
# ast.literal_eval would be a safer replacement.
config = eval(contents)
train_data = config['train_data']
test_data = config['test_data']
cm = Classification_Model()
X_train, X_test, y_train, y_test = cm.train_test_split(train_data, test_data)
clf, params = cm.algorithm(RandomForestClassifier())
cm.model_train(clf, params)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment