Kaggle Helper Scripts
import seaborn as sns
from sklearn import preprocessing, ensemble
from scipy.stats import kendalltau
import pandas as pd
import random
#todo change module name
from tqdm import tqdm
import numpy as np
import pandas as pd
from multiprocessing import Pool
def _apply_df(args):
df, func, num, kwargs = args
return num, df.apply(func, **kwargs)
def _apply_series(args):
df, func, num, kwargs = args
return num, df.apply(func)
def apply_by_multiprocessing(df, func, **kwargs):
    """Run ``apply(func)`` over ``df`` in parallel and reassemble the result.

    ``df`` is split into ``chunks`` pieces with ``np.array_split``; each piece
    is processed in a pool of ``workers`` processes and the per-chunk results
    are concatenated back together in chunk order.

    Keyword arguments consumed here (removed before forwarding):
        workers: number of worker processes (required).
        chunks: number of pieces to split ``df`` into (required).
        is_series: when true, chunks are handled by ``_apply_series``, which
            calls ``chunk.apply(func)`` and ignores the remaining kwargs.
            Otherwise chunks are handled by ``_apply_df``, which forwards the
            remaining kwargs to ``chunk.apply``.  Defaults to ``False``.

    Returns the concatenation of the per-chunk results.

    Raises KeyError if ``workers`` or ``chunks`` is not supplied.
    """
    workers = kwargs.pop('workers')
    chunks = kwargs.pop('chunks')
    is_series = kwargs.pop('is_series', False)

    # Pick exactly one worker function.  Previously the Series result was
    # always overwritten by a second, DataFrame-style run over every chunk.
    worker = _apply_series if is_series else _apply_df

    apply_lst = [(d, func, i, kwargs) for i, d in enumerate(np.array_split(df, chunks))]
    with Pool(workers) as p:
        result = list(tqdm(p.imap(worker, apply_lst), total=len(apply_lst)))

    # Each result carries its chunk number; sorting on it guarantees the
    # output follows the original row order.
    result = sorted(result, key=lambda x: x[0])
    return pd.concat([i[1] for i in result], sort=False)
def _apply_df_groupby(args):
group, func, name, kwargs = args
return name, func(group, **kwargs)
def multiprocessing_groupby(groupby, func, **kwargs):
    """Call ``func`` on every group of ``groupby`` in parallel.

    ``groupby`` is iterated as ``(name, group)`` pairs.  Each group is handed
    to ``_apply_df_groupby`` in a pool of worker processes, and the results
    are concatenated after being sorted by group name.

    Keyword arguments:
        workers: number of worker processes (required, removed before
            forwarding).
        Any other keyword argument is forwarded to ``func``.

    Returns the concatenation of ``func(group, **kwargs)`` over all groups.

    Raises KeyError if ``workers`` is not supplied.
    """
    # The parameter list is rebuilt from the names the body reads (`func`,
    # `kwargs`); the original signature line was cut off after `groupby,`.
    workers = kwargs.pop('workers')
    apply_lst = [(group, func, name, kwargs) for name, group in groupby]
    with Pool(workers) as p:
        result = list(tqdm(p.imap(_apply_df_groupby, apply_lst), total=len(apply_lst)))
    result = sorted(result, key=lambda x: x[0])
    return pd.concat([i[1] for i in result], sort=False)
# From Kaggle's Avito Comp
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    For every column:
      * dtype name starting with ``int``: cast to the first of int8, int16,
        int32, int64 whose range strictly contains the column's min and max
        (a column touching a type's exact bound moves to the next wider type;
        if none matches the column is left unchanged);
      * any other non-object dtype: cast to float16 if the values fit,
        else float32, else float64;
      * object dtype: cast to ``category``.

    Note that float16/float32 keep fewer significant digits than float64, so
    the float downcast trades precision for memory.

    The memory footprint before and after is printed.  The same DataFrame
    object is modified and returned.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # Restored branch: without it the float casts below ran for
                # integer columns too and float64 was applied unconditionally.
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Avoid ZeroDivisionError when the frame reports no memory at all.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df
#load csv file at path
def load_dataframe(csv_path):
    """Read the CSV at ``csv_path`` with ``pd.read_csv`` and return the DataFrame."""
    dataframe = pd.read_csv(csv_path)
    return dataframe
#show info about csv
def show_basic_info(dataframe):
    """Print a short summary of ``dataframe``: a header, its ``size`` and its ``columns``."""
    print("data basic info:")
    size_line = "size: {0}".format(dataframe.size)
    print(size_line)
    columns_line = "columns: {0}".format(dataframe.columns)
    print(columns_line)
#convert to numeric data
def convert_to_numeric(column, drop_missing_data=True):
    """Convert ``column`` to numeric values.

    Values that cannot be parsed become NaN (``errors='coerce'``).

    Args:
        column: the data to convert (anything ``pd.to_numeric`` accepts).
        drop_missing_data: when true (the default), entries that are NaN after
            conversion are removed; when false they are kept so the result
            stays aligned with the input.

    Returns the converted data.
    """
    numeric = pd.to_numeric(column, errors='coerce')
    # Honour the flag: previously missing values were dropped unconditionally.
    if drop_missing_data:
        numeric = numeric.dropna()
    return numeric
#get columns for pandas dataframe
def get_columns(dataframe, columns_names_arr):
    """Return ``dataframe`` indexed by ``columns_names_arr`` (the named columns)."""
    selected = dataframe[columns_names_arr]
    return selected
#plot basic hex graph for pair of numerical data
def plot_hex_graph(numerical_data1, numerical_data2):
    """Draw a seaborn joint plot of two numerical inputs using hexagonal bins.

    The plot is created under the "white" axes style with ``kind="hex"``,
    ``gridsize=24``, ``space=0`` and red colouring; a progress message is
    printed afterwards.  Nothing is returned.
    """
    white_style = sns.axes_style("white")
    with white_style:
        sns.jointplot(
            x=numerical_data1,
            y=numerical_data2,
            kind="hex",
            gridsize=24,
            space=0,
            color="r",
        )
    print("--> plotting data on hex graph....")
#numerical data needs to be a pandas.Series
# NOTE(review): this function is incomplete in the visible source. The
# `.format(` call on the first print is never closed, and no statement that
# actually draws a histogram is shown. The second print reports a failure, so
# presumably it belonged to an error-handling branch -- confirm against the
# original file and restore the missing lines before using this function.
def plot_histogram(numerical_data):
print("plotting histogram for {0}".format(
print("failed to plot histogram")
#summarize one categorical data (csv column)
#column needs to be pandas series
# NOTE(review): `description` is not defined anywhere in the visible code, and
# neither `data_name` nor `column` is used, so lines appear to be missing from
# this function. The one remaining statement sorts `description` by 'counts'
# in descending order and stores the result; nothing is printed or returned.
def summarize_categorical_data(data_name, column):
counts_description=description.sort(['counts'], ascending=False)
#summarize all categorical data (multiple csv columns)
#categorical_data needs to be array of pandas series
# NOTE(review): the loop below has no body in the visible source, so it is not
# possible to tell from here what is done for each column -- restore the
# missing lines before using this function.
def summarize_all_categorical_data(categorical_data):
print("--> categorical data summaries:")
# Iterates over the positional indices of `categorical_data.columns`.
for i in range(len(categorical_data.columns)):
#get names of categorical and numerical data columns
def divide_data_to_categorical_and_numerical(data, few=10):
    """Split the column names of ``data`` into categorical and numerical.

    A column is treated as categorical when every one of its first ``few``
    values is a string; every other column is treated as numerical.  Columns
    with fewer than ``few`` rows are judged on the rows they have, and an
    empty column counts as numerical.

    Args:
        data: a DataFrame-like object exposing ``columns`` and ``data[name]``.
        few: how many leading values of each column to inspect (default 10).

    Returns:
        ``(categorical_column_names, numerical_column_names)``, two lists in
        the order the columns appear in ``data``.
    """
    categorical_column_names = []
    numerical_column_names = []
    for column_name in data.columns:
        column = data[column_name]  # column type is "pandas.core.series.Series"
        few_elements = column.values[:few]  # the first `few` elements
        # Compare against what was actually sampled: requiring exactly `few`
        # strings meant short columns could never be classified as categorical.
        is_categorical = len(few_elements) > 0 and all(
            isinstance(x, str) for x in few_elements
        )
        if is_categorical:
            categorical_column_names.append(column_name)
        else:
            numerical_column_names.append(column_name)
    return categorical_column_names, numerical_column_names
#create bivariate distributions for numerical data
# NOTE(review): no body for this function is present in the visible source; the
# statements that follow belong to an unrelated Keras visualisation script.
# Restore the missing body before calling this function.
def plot_multiple_bivariate_distributions_grid(dataframe):
#keras visuals
# Draws, for every layer whose name starts with 'conv', one figure containing
# all of that layer's channels tiled into a grid `images_per_row` wide.
# NOTE(review): `model`, `activations` and `plt` are not defined in the visible
# part of this file and must exist before this section runs.

# Names of every layer except the last, paired below with `activations`.
layer_names = []
for layer in model.layers[:-1]:
    # Restored loop body: without it `layer_names` stayed empty and the
    # plotting loop below never executed.
    layer_names.append(layer.name)

images_per_row = 16

for layer_name, layer_activation in zip(layer_names, activations):
    if not layer_name.startswith('conv'):
        continue

    # assumes layer_activation is (batch, height, width, channels) with square
    # feature maps -- TODO confirm
    n_features = layer_activation.shape[-1]
    size = layer_activation.shape[1]

    # Channels that do not fill a complete row are not displayed.
    n_cols = n_features // images_per_row
    display_grid = np.zeros((size * n_cols, images_per_row * size))

    for col in range(n_cols):
        for row in range(images_per_row):
            # Copy so the normalisation below does not modify `activations`.
            channel_image = layer_activation[0, :, :, col * images_per_row + row].copy()
            # Standardise, then map to a visible 0-255 range centred on 128.
            channel_image -= channel_image.mean()
            channel_std = channel_image.std()
            if channel_std > 0:
                # A constant channel has zero std; skip the division rather
                # than filling the tile with NaN.
                channel_image /= channel_std
            channel_image *= 64
            channel_image += 128
            channel_image = np.clip(channel_image, 0, 255).astype('uint8')
            display_grid[col * size : (col + 1) * size,
                         row * size : (row + 1) * size] = channel_image

    # One inch per tile in each direction.
    scale = 1. / size
    plt.figure(figsize=(scale * display_grid.shape[1],
                        scale * display_grid.shape[0]))
    plt.imshow(display_grid, aspect='auto', cmap='viridis')
# Look at confusion matrix
# NOTE(review): this definition is incomplete in the visible source. The
# parameter list is never closed, and the body reads `normalize` and `cmap`,
# which are not declared in the visible part of the signature. The two prose
# lines after `title=` look like docstring text whose quotes were lost.
# Restore the signature and docstring delimiters from the original file.
def plot_confusion_matrix(cm, classes,
title='Confusion matrix',
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
# Render the matrix as an image.
plt.imshow(cm, interpolation='nearest', cmap=cmap)
# One tick per class on both axes, labelled with the class names.
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
if normalize:
# Divide every row by its row sum.
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# Cells above half of the largest value get white text, the rest black.
thresh = cm.max() / 2.
# Write each cell's value at its (column, row) position.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, cm[i, j],
color="white" if cm[i, j] > thresh else "black")
plt.ylabel('True label')
plt.xlabel('Predicted label')
# Run the model on the validation inputs
# NOTE(review): `model`, `X_val`, `Y_val` and `confusion_matrix` are not
# defined in the visible part of this file.
Y_pred = model.predict(X_val)
# Reduce each row of the predictions to the index of its largest value,
# i.e. the predicted class
Y_pred_classes = np.argmax(Y_pred, axis = 1)
# Reduce each row of Y_val to the index of its largest value
# (presumably Y_val is one-hot encoded, so this recovers the true class -- confirm)
Y_true = np.argmax(Y_val, axis = 1)
# compute the confusion matrix
confusion_mtx = confusion_matrix(Y_true, Y_pred_classes)
# plot the confusion matrix, labelling the classes 0-9
plot_confusion_matrix(confusion_mtx, classes = range(10))
# NOTE(review): only the title and legend calls are visible in this section;
# the calls that would draw the curves being labelled are not shown here.
# "Accuracy"
plt.title('model accuracy')
plt.legend(['train', 'validation'], loc='upper left')
# "Loss"
plt.title('model loss')
plt.legend(['train', 'validation'], loc='upper left')
#"Learning Rate"
plt.title('Learning Rate')
# Boolean mask that is True wherever the predicted class differs from the true class
errors = (Y_pred_classes - Y_true != 0)
# Keep only the misclassified entries of each array by indexing with the mask
Y_pred_classes_errors = Y_pred_classes[errors]
Y_pred_errors = Y_pred[errors]
Y_true_errors = Y_true[errors]
X_val_errors = X_val[errors]
def display_errors(errors_index, img_errors, pred_errors, obs_errors):
    """Create a 2x3 grid of subplots titled with predicted and real labels.

    Args:
        errors_index: indices into ``pred_errors`` / ``obs_errors``; at most
            the first six are used, one per subplot in row-major order.
        img_errors: the images belonging to the errors.
            NOTE(review): nothing in the visible code draws these images --
            the plotting call appears to be missing; confirm and restore it.
        pred_errors: predicted labels, indexed by the values of ``errors_index``.
        obs_errors: true labels, indexed by the values of ``errors_index``.

    If fewer than six indices are supplied the remaining subplots are left
    untitled instead of raising IndexError.
    """
    nrows = 2
    ncols = 3
    fig, ax = plt.subplots(nrows, ncols, sharex=True, sharey=True)
    # `ax.flat` walks the grid row by row; zip stops at the shorter input.
    for axis, error in zip(ax.flat, errors_index):
        axis.set_title("Predicted label :{}\nTrue label :{}".format(pred_errors[error], obs_errors[error]))
# Largest predicted value in each misclassified row (the value of the wrongly chosen class)
Y_pred_errors_prob = np.max(Y_pred_errors,axis = 1)
# Predicted value of the true class for each misclassified row.
# Row i is read at column Y_true_errors[i] directly; the previous
# np.diagonal(np.take(...)) form built an n x n intermediate to obtain the
# same n values.
true_prob_errors = np.asarray(Y_pred_errors)[np.arange(len(Y_true_errors)), Y_true_errors]
# Difference between the probability of the predicted label and the true label
delta_pred_true_errors = Y_pred_errors_prob - true_prob_errors
# Indices that sort the differences in ascending order
sorted_dela_errors = np.argsort(delta_pred_true_errors)
# Top 6 errors (the six largest differences)
most_important_errors = sorted_dela_errors[-6:]
# Show the top 6 errors
display_errors(most_important_errors, X_val_errors, Y_pred_classes_errors, Y_true_errors)
# K.function creates Theano/TensorFlow tensor functions, which are later used
# to get the output from the symbolic graph given the input.
# K.learning_phase() is required as an input because many Keras layers, such as
# Dropout and BatchNormalization, depend on it to change behavior between
# training and test time.
# Written with help from Keras resources and an answer (original link lost) on
# Stack Overflow, then converted into a function.
def layer_to_visualize(layer):
    """Plot the output of ``layer`` for ``img_to_visualize`` as a grid of images.

    A backend function mapping the learning-phase flag plus the model inputs
    to ``layer.output`` is built and evaluated on ``img_to_visualize`` with the
    flag set to 0.  The squeezed result's shape is printed, and each entry
    along its first axis is drawn in grayscale on its own subplot of a square
    grid.

    Relies on the module-level names ``K``, ``model``, ``img_to_visualize``
    and ``plt``.  Nothing is returned.
    """
    backend_inputs = [K.learning_phase()] + model.inputs
    layer_output_fn = K.function(backend_inputs, [layer.output])

    # The leading 0 is the learning-phase flag: it disables training behaviour.
    raw_output = layer_output_fn([0] + [img_to_visualize])
    convolutions = np.squeeze(raw_output)

    print ('Shape of conv:', convolutions.shape)

    # Smallest square grid that has room for every entry.
    grid_side = int(np.ceil(np.sqrt(convolutions.shape[0])))

    # Visualization of each filter of the layer
    fig = plt.figure(figsize=(12,12))
    for position, feature_map in enumerate(convolutions, start=1):
        subplot = fig.add_subplot(grid_side, grid_side, position)
        subplot.imshow(feature_map, cmap='gray')
# Specify the layer to want to visualize
# As convout2 is the result of a MaxPool2D layer
# We can see that the image has blurred since
# the resolution has reduced
