# Kaggle Helper Scripts
import random
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import kendalltau
from sklearn import preprocessing, ensemble
from sklearn.metrics import confusion_matrix
# todo: change module name
from tqdm import tqdm
from multiprocessing import Pool
def _apply_df(args):
    df, func, num, kwargs = args
    return num, df.apply(func, **kwargs)

def _apply_series(args):
    df, func, num, kwargs = args
    return num, df.apply(func)

def apply_by_multiprocessing(df, func, **kwargs):
    workers = kwargs.pop('workers')
    chunks = kwargs.pop('chunks')
    is_series = kwargs.pop('is_series')
    with Pool(workers) as p:
        apply_lst = [(d, func, i, kwargs) for i, d in enumerate(np.array_split(df, chunks))]
        if is_series:
            result = list(tqdm(p.imap(_apply_series, apply_lst), total=len(apply_lst)))
        else:
            result = list(tqdm(p.imap(_apply_df, apply_lst), total=len(apply_lst)))
    result = sorted(result, key=lambda x: x[0])
    return pd.concat([i[1] for i in result], sort=False)
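
# A minimal usage sketch for apply_by_multiprocessing (the toy DataFrame and
# column name below are illustrative, not part of the original gist). The
# __main__ guard matters because multiprocessing re-imports this module under
# spawn-based start methods (Windows/macOS).
if __name__ == '__main__':
    demo_df = pd.DataFrame({'text': ['a', 'bb', 'ccc', 'dddd'] * 1000})
    text_lengths = apply_by_multiprocessing(demo_df['text'], len,
                                            workers=2, chunks=4, is_series=True)
    print(text_lengths.head())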
def _apply_df_groupby(args):
    group, func, name, kwargs = args
    return name, func(group, **kwargs)

def multiprocessing_groupby(groupby, func, **kwargs):
    workers = kwargs.pop('workers')
    with Pool(workers) as p:
        apply_lst = [(group, func, name, kwargs) for name, group in groupby]
        result = list(tqdm(p.imap(_apply_df_groupby, apply_lst), total=len(apply_lst)))
    result = sorted(result, key=lambda x: x[0])
    return pd.concat([i[1] for i in result], sort=False)
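
# A minimal usage sketch for multiprocessing_groupby (toy data; the aggregation
# function is an assumption chosen for illustration). Each group is processed in
# a separate worker and the results are concatenated in group-key order.
# The helper is defined at module level so worker processes can pickle it.
def _group_sum(group):
    return group[['val']].sum()

if __name__ == '__main__':
    demo_df = pd.DataFrame({'key': list('abab'), 'val': [1, 2, 3, 4]})
    sums = multiprocessing_groupby(demo_df.groupby('key'), _group_sum, workers=2)
    print(sums)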
# From Kaggle's Avito Comp
def reduce_mem_usage(df):
    """Iterate through all the columns of a dataframe and modify the data type
    to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
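
# A quick usage sketch for reduce_mem_usage on a toy frame (column names are
# illustrative). Caveat: the float16 downcast can silently lose precision, so
# inspect the resulting dtypes before trusting them on id- or money-like columns.
if __name__ == '__main__':
    demo_df = pd.DataFrame({'small_int': [1, 2, 3], 'big_float': [0.1, 0.2, 0.3]})
    demo_df = reduce_mem_usage(demo_df)
    print(demo_df.dtypes)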
# load csv file at path
def load_dataframe(csv_path):
    return pd.read_csv(csv_path)  # returns a DataFrame

# show basic info about the csv
def show_basic_info(dataframe):
    print("data basic info:")
    print("size: {0}".format(dataframe.size))
    print("columns: {0}".format(dataframe.columns))

# convert a column to numeric data
def convert_to_numeric(column, drop_missing_data=True):
    numeric = pd.to_numeric(column, errors='coerce')
    if drop_missing_data:
        numeric = numeric.dropna()
    return numeric
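
# Example of convert_to_numeric on a messy column (values are illustrative):
# non-numeric entries become NaN via errors='coerce' and are then dropped.
if __name__ == '__main__':
    messy = pd.Series(['1', '2', 'oops', '4'])
    print(convert_to_numeric(messy))  # -> 1.0, 2.0, 4.0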
# select columns from a pandas DataFrame
def get_columns(dataframe, columns_names_arr):
    return dataframe[columns_names_arr]
# plot a basic hex graph for a pair of numerical columns
def plot_hex_graph(numerical_data1, numerical_data2):
    with sns.axes_style("white"):
        hex_plt = sns.jointplot(x=numerical_data1, y=numerical_data2,
                                kind="hex", gridsize=24, space=0, color="r")
    print("--> plotting data on hex graph....")
    plt.show()

# numerical data needs to be a pandas.Series
def plot_histogram(numerical_data):
    print("plotting histogram for {0}".format(numerical_data.name))
    try:
        sns.distplot(numerical_data)
        plt.show()
    except Exception:
        print("failed to plot histogram")
# summarize one categorical column (csv column)
# column needs to be a pandas Categorical
def summarize_categorical_data(data_name, column):
    print("________________________")
    print("{0}:".format(data_name))
    description = column.describe()
    # DataFrame.sort() was removed from pandas; sort_values is the replacement
    counts_description = description.sort_values(by='counts', ascending=False)
    print(counts_description)

# summarize all categorical columns (multiple csv columns)
# categorical_data needs to be a DataFrame of the categorical columns
def summarize_all_categorical_data(categorical_data):
    print("--> categorical data summaries:")
    for i in range(len(categorical_data.columns)):
        column_name = categorical_data.columns[i]
        column_data = categorical_data[column_name]
        converted_to_categorical = pd.Categorical(column_data)
        summarize_categorical_data(column_name, converted_to_categorical)
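
# Sketch: summarizing the categorical columns of a toy frame (column names and
# values are illustrative, not from the original gist).
if __name__ == '__main__':
    demo_df = pd.DataFrame({'color': ['red', 'blue', 'red'], 'size': ['S', 'M', 'S']})
    summarize_all_categorical_data(demo_df)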
# get names of categorical and numerical data columns
def divide_data_to_categorical_and_numerical(data):
    numerical_column_names = []
    categorical_column_names = []
    for column_name in data.columns:
        column = data[column_name]  # column type is pandas.core.series.Series
        few = 10
        few_elements = column.values[:few]  # first ten elements
        # heuristic: treat the column as categorical if every sampled element is a string
        is_categorical = all(isinstance(x, str) for x in few_elements)
        # print("column: {0}. Categorical {1}".format(column_name, is_categorical))
        if is_categorical:
            categorical_column_names.append(column_name)
        else:
            numerical_column_names.append(column_name)
    return categorical_column_names, numerical_column_names
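
# Sketch: splitting a mixed frame into categorical vs. numerical column names
# (toy data; the heuristic above only samples the first few values per column).
if __name__ == '__main__':
    demo_df = pd.DataFrame({'name': ['a', 'b', 'c'], 'score': [1.0, 2.0, 3.0]})
    cat_cols, num_cols = divide_data_to_categorical_and_numerical(demo_df)
    print('categorical:', cat_cols, 'numerical:', num_cols)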
# create bivariate distributions for numerical data
def plot_multiple_bivariate_distributions_grid(dataframe):
    sns.pairplot(dataframe)
#########################################################################################
# keras visuals
# (this snippet assumes `model` is a trained Keras model and `activations`
#  holds the per-layer outputs for one input image)
layer_names = []
for layer in model.layers[:-1]:
    layer_names.append(layer.name)
images_per_row = 16
for layer_name, layer_activation in zip(layer_names, activations):
    if layer_name.startswith('conv'):
        n_features = layer_activation.shape[-1]
        size = layer_activation.shape[1]
        n_cols = n_features // images_per_row
        display_grid = np.zeros((size * n_cols, images_per_row * size))
        for col in range(n_cols):
            for row in range(images_per_row):
                channel_image = layer_activation[0, :, :, col * images_per_row + row]
                # normalize the channel into a displayable 0-255 range
                channel_image -= channel_image.mean()
                channel_image /= channel_image.std()
                channel_image *= 64
                channel_image += 128
                channel_image = np.clip(channel_image, 0, 255).astype('uint8')
                display_grid[col * size : (col + 1) * size,
                             row * size : (row + 1) * size] = channel_image
        scale = 1. / size
        plt.figure(figsize=(scale * display_grid.shape[1],
                            scale * display_grid.shape[0]))
        plt.title(layer_name)
        plt.grid(False)
        plt.imshow(display_grid, aspect='auto', cmap='viridis')
########################################################################################
## https://www.kaggle.com/adityaecdrid/mnist-with-keras-for-beginners-99457/
# Look at the confusion matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    # normalize before plotting so the image and the annotations agree
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Predict the values from the validation dataset
Y_pred = model.predict(X_val)
# Convert prediction probabilities to class indices
Y_pred_classes = np.argmax(Y_pred, axis=1)
# Convert one-hot validation labels back to class indices
Y_true = np.argmax(Y_val, axis=1)
# compute the confusion matrix
confusion_mtx = confusion_matrix(Y_true, Y_pred_classes)
# plot the confusion matrix
plot_confusion_matrix(confusion_mtx, classes=range(10))
########################################################################################################################
# `h` is the History object returned by model.fit
print(h.history.keys())
# "Accuracy" (newer Keras versions log 'accuracy'/'val_accuracy' instead of 'acc'/'val_acc')
plt.plot(h.history['acc'])
plt.plot(h.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
# "Loss"
plt.plot(h.history['loss'])
plt.plot(h.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
# "Learning Rate" (only present if an LR-scheduling callback ran during training)
plt.plot(h.history['lr'])
plt.title('Learning Rate')
plt.show()
###########################################################################################################
# Errors are the cases where predicted and true labels differ
errors = (Y_pred_classes - Y_true != 0)
Y_pred_classes_errors = Y_pred_classes[errors]
Y_pred_errors = Y_pred[errors]
Y_true_errors = Y_true[errors]
X_val_errors = X_val[errors]

def display_errors(errors_index, img_errors, pred_errors, obs_errors):
    """ This function shows 6 images with their predicted and real labels"""
    n = 0
    nrows = 2
    ncols = 3
    fig, ax = plt.subplots(nrows, ncols, sharex=True, sharey=True)
    for row in range(nrows):
        for col in range(ncols):
            error = errors_index[n]
            ax[row, col].imshow((img_errors[error]).reshape((28, 28)))
            ax[row, col].set_title("Predicted label :{}\nTrue label :{}".format(
                pred_errors[error], obs_errors[error]))
            n += 1

# Probabilities of the wrongly predicted numbers
Y_pred_errors_prob = np.max(Y_pred_errors, axis=1)
# Predicted probabilities of the true labels in the error set
true_prob_errors = np.diagonal(np.take(Y_pred_errors, Y_true_errors, axis=1))
# Difference between the probability of the predicted label and the true label
delta_pred_true_errors = Y_pred_errors_prob - true_prob_errors
# Indices sorted by that probability gap (most confident mistakes last)
sorted_delta_errors = np.argsort(delta_pred_true_errors)
# Top 6 errors
most_important_errors = sorted_delta_errors[-6:]
# Show the top 6 errors
display_errors(most_important_errors, X_val_errors, Y_pred_classes_errors, Y_true_errors)
############################################################################################################
'''
K.function creates theano/tensorflow tensor functions, which are later used
to get the output from the symbolic graph given the input.
K.learning_phase() is required as an input because many Keras layers, like
Dropout/BatchNormalization, depend on it to change behavior between training
and test time.
Adapted from the Keras
[docs](https://keras.io/getting-started/faq/#how-can-i-obtain-the-output-of-an-intermediate-layer)
and this [answer](https://stackoverflow.com/questions/41711190/keras-how-to-get-the-output-of-each-layer)
on Stack Overflow, wrapped up as a reusable function.
'''
from keras import backend as K

def layer_to_visualize(layer):
    inputs = [K.learning_phase()] + model.inputs
    _convout1_f = K.function(inputs, [layer.output])
    def convout1_f(X):
        # The [0] disables the training-phase flag
        return _convout1_f([0] + [X])
    convolutions = convout1_f(img_to_visualize)
    convolutions = np.squeeze(convolutions)
    print('Shape of conv:', convolutions.shape)
    n = convolutions.shape[0]
    n = int(np.ceil(np.sqrt(n)))
    # Visualization of each filter of the layer
    fig = plt.figure(figsize=(12, 12))
    for i in range(len(convolutions)):
        ax = fig.add_subplot(n, n, i + 1)
        ax.imshow(convolutions[i], cmap='gray')

# Specify the layer you want to visualize
layer_to_visualize(convo1)
# As convo2 is the output of a MaxPool2D layer,
# we can see that the image has blurred since
# the resolution has been reduced
layer_to_visualize(convo2)
#################################################################################