# Kaggle Helper Scripts
import random
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import kendalltau
from sklearn import preprocessing, ensemble
from sklearn.metrics import confusion_matrix
# todo: change module name
from tqdm import tqdm
from multiprocessing import Pool
def _apply_df(args):
    df, func, num, kwargs = args
    return num, df.apply(func, **kwargs)

def _apply_series(args):
    df, func, num, kwargs = args
    return num, df.apply(func)

def apply_by_multiprocessing(df, func, **kwargs):
    workers = kwargs.pop('workers')
    chunks = kwargs.pop('chunks')
    is_series = kwargs.pop('is_series')
    with Pool(workers) as p:
        apply_lst = [(d, func, i, kwargs) for i, d in enumerate(np.array_split(df, chunks))]
        if is_series:
            result = list(tqdm(p.imap(_apply_series, apply_lst), total=len(apply_lst)))
        else:
            result = list(tqdm(p.imap(_apply_df, apply_lst), total=len(apply_lst)))
    result = sorted(result, key=lambda x: x[0])
    return pd.concat([i[1] for i in result], sort=False)
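
# A minimal usage sketch for apply_by_multiprocessing (the toy DataFrame and
# column name below are illustrative, not part of the original gist). The
# __main__ guard matters because multiprocessing re-imports this module under
# spawn-based start methods (Windows/macOS).
if __name__ == '__main__':
    demo_df = pd.DataFrame({'text': ['a', 'bb', 'ccc', 'dddd'] * 1000})
    text_lengths = apply_by_multiprocessing(demo_df['text'], len,
                                            workers=2, chunks=4, is_series=True)
    print(text_lengths.head())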
def _apply_df_groupby(args):
    group, func, name, kwargs = args
    return name, func(group, **kwargs)

def multiprocessing_groupby(groupby, func, **kwargs):
    workers = kwargs.pop('workers')
    with Pool(workers) as p:
        apply_lst = [(group, func, name, kwargs) for name, group in groupby]
        result = list(tqdm(p.imap(_apply_df_groupby, apply_lst), total=len(apply_lst)))
    result = sorted(result, key=lambda x: x[0])
    return pd.concat([i[1] for i in result], sort=False)
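
# A minimal usage sketch for multiprocessing_groupby (toy data; the aggregation
# function is an assumption chosen for illustration). Each group is processed in
# a separate worker and the results are concatenated in group-key order.
# The helper is defined at module level so worker processes can pickle it.
def _group_sum(group):
    return group[['val']].sum()

if __name__ == '__main__':
    demo_df = pd.DataFrame({'key': list('abab'), 'val': [1, 2, 3, 4]})
    sums = multiprocessing_groupby(demo_df.groupby('key'), _group_sum, workers=2)
    print(sums)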
# From Kaggle's Avito Comp
def reduce_mem_usage(df):
    """Iterate through all the columns of a dataframe and modify the data type
    to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
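
# A quick usage sketch for reduce_mem_usage on a toy frame (column names are
# illustrative). Caveat: the float16 downcast can silently lose precision, so
# inspect the resulting dtypes before trusting them on id- or money-like columns.
if __name__ == '__main__':
    demo_df = pd.DataFrame({'small_int': [1, 2, 3], 'big_float': [0.1, 0.2, 0.3]})
    demo_df = reduce_mem_usage(demo_df)
    print(demo_df.dtypes)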
# load csv file at path
def load_dataframe(csv_path):
    return pd.read_csv(csv_path)  # returns a DataFrame

# show basic info about the csv
def show_basic_info(dataframe):
    print("data basic info:")
    print("size: {0}".format(dataframe.size))
    print("columns: {0}".format(dataframe.columns))

# convert a column to numeric data
def convert_to_numeric(column, drop_missing_data=True):
    numeric = pd.to_numeric(column, errors='coerce')
    if drop_missing_data:
        numeric = numeric.dropna()
    return numeric
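
# Example of convert_to_numeric on a messy column (values are illustrative):
# non-numeric entries become NaN via errors='coerce' and are then dropped.
if __name__ == '__main__':
    messy = pd.Series(['1', '2', 'oops', '4'])
    print(convert_to_numeric(messy))  # -> 1.0, 2.0, 4.0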
# select columns from a pandas DataFrame
def get_columns(dataframe, columns_names_arr):
    return dataframe[columns_names_arr]
# plot a basic hex graph for a pair of numerical columns
def plot_hex_graph(numerical_data1, numerical_data2):
    with sns.axes_style("white"):
        hex_plt = sns.jointplot(x=numerical_data1, y=numerical_data2,
                                kind="hex", gridsize=24, space=0, color="r")
    print("--> plotting data on hex graph....")
    plt.show()

# numerical data needs to be a pandas.Series
def plot_histogram(numerical_data):
    print("plotting histogram for {0}".format(numerical_data.name))
    try:
        sns.distplot(numerical_data)
        plt.show()
    except Exception:
        print("failed to plot histogram")
# summarize one categorical column (csv column)
# column needs to be a pandas Categorical
def summarize_categorical_data(data_name, column):
    print("________________________")
    print("{0}:".format(data_name))
    description = column.describe()
    # DataFrame.sort() was removed from pandas; sort_values is the replacement
    counts_description = description.sort_values(by='counts', ascending=False)
    print(counts_description)

# summarize all categorical columns (multiple csv columns)
# categorical_data needs to be a DataFrame of the categorical columns
def summarize_all_categorical_data(categorical_data):
    print("--> categorical data summaries:")
    for i in range(len(categorical_data.columns)):
        column_name = categorical_data.columns[i]
        column_data = categorical_data[column_name]
        converted_to_categorical = pd.Categorical(column_data)
        summarize_categorical_data(column_name, converted_to_categorical)
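
# Sketch: summarizing the categorical columns of a toy frame (column names and
# values are illustrative, not from the original gist).
if __name__ == '__main__':
    demo_df = pd.DataFrame({'color': ['red', 'blue', 'red'], 'size': ['S', 'M', 'S']})
    summarize_all_categorical_data(demo_df)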
# get names of categorical and numerical data columns
def divide_data_to_categorical_and_numerical(data):
    numerical_column_names = []
    categorical_column_names = []
    for column_name in data.columns:
        column = data[column_name]  # column type is pandas.core.series.Series
        few = 10
        few_elements = column.values[:few]  # first ten elements
        # heuristic: treat the column as categorical if every sampled element is a string
        is_categorical = all(isinstance(x, str) for x in few_elements)
        # print("column: {0}. Categorical {1}".format(column_name, is_categorical))
        if is_categorical:
            categorical_column_names.append(column_name)
        else:
            numerical_column_names.append(column_name)
    return categorical_column_names, numerical_column_names
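
# Sketch: splitting a mixed frame into categorical vs. numerical column names
# (toy data; the heuristic above only samples the first few values per column).
if __name__ == '__main__':
    demo_df = pd.DataFrame({'name': ['a', 'b', 'c'], 'score': [1.0, 2.0, 3.0]})
    cat_cols, num_cols = divide_data_to_categorical_and_numerical(demo_df)
    print('categorical:', cat_cols, 'numerical:', num_cols)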
# create bivariate distributions for numerical data
def plot_multiple_bivariate_distributions_grid(dataframe):
    sns.pairplot(dataframe)
#########################################################################################
# keras visuals
# (this snippet assumes `model` is a trained Keras model and `activations`
#  holds the per-layer outputs for one input image)
layer_names = []
for layer in model.layers[:-1]:
    layer_names.append(layer.name)
images_per_row = 16
for layer_name, layer_activation in zip(layer_names, activations):
    if layer_name.startswith('conv'):
        n_features = layer_activation.shape[-1]
        size = layer_activation.shape[1]
        n_cols = n_features // images_per_row
        display_grid = np.zeros((size * n_cols, images_per_row * size))
        for col in range(n_cols):
            for row in range(images_per_row):
                channel_image = layer_activation[0, :, :, col * images_per_row + row]
                # normalize the channel into a displayable 0-255 range
                channel_image -= channel_image.mean()
                channel_image /= channel_image.std()
                channel_image *= 64
                channel_image += 128
                channel_image = np.clip(channel_image, 0, 255).astype('uint8')
                display_grid[col * size : (col + 1) * size,
                             row * size : (row + 1) * size] = channel_image
        scale = 1. / size
        plt.figure(figsize=(scale * display_grid.shape[1],
                            scale * display_grid.shape[0]))
        plt.title(layer_name)
        plt.grid(False)
        plt.imshow(display_grid, aspect='auto', cmap='viridis')
########################################################################################
## https://www.kaggle.com/adityaecdrid/mnist-with-keras-for-beginners-99457/
# Look at the confusion matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    # normalize before plotting so the image and the annotations agree
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Predict the values from the validation dataset
Y_pred = model.predict(X_val)
# Convert prediction probabilities to class indices
Y_pred_classes = np.argmax(Y_pred, axis=1)
# Convert one-hot validation labels back to class indices
Y_true = np.argmax(Y_val, axis=1)
# compute the confusion matrix
confusion_mtx = confusion_matrix(Y_true, Y_pred_classes)
# plot the confusion matrix
plot_confusion_matrix(confusion_mtx, classes=range(10))
########################################################################################################################
# `h` is the History object returned by model.fit
print(h.history.keys())
# "Accuracy" (newer Keras versions log 'accuracy'/'val_accuracy' instead of 'acc'/'val_acc')
plt.plot(h.history['acc'])
plt.plot(h.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
# "Loss"
plt.plot(h.history['loss'])
plt.plot(h.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
# "Learning Rate" (only present if an LR-scheduling callback ran during training)
plt.plot(h.history['lr'])
plt.title('Learning Rate')
plt.show()
###########################################################################################################
# Errors are the cases where predicted and true labels differ
errors = (Y_pred_classes - Y_true != 0)
Y_pred_classes_errors = Y_pred_classes[errors]
Y_pred_errors = Y_pred[errors]
Y_true_errors = Y_true[errors]
X_val_errors = X_val[errors]

def display_errors(errors_index, img_errors, pred_errors, obs_errors):
    """ This function shows 6 images with their predicted and real labels"""
    n = 0
    nrows = 2
    ncols = 3
    fig, ax = plt.subplots(nrows, ncols, sharex=True, sharey=True)
    for row in range(nrows):
        for col in range(ncols):
            error = errors_index[n]
            ax[row, col].imshow((img_errors[error]).reshape((28, 28)))
            ax[row, col].set_title("Predicted label :{}\nTrue label :{}".format(
                pred_errors[error], obs_errors[error]))
            n += 1

# Probabilities of the wrongly predicted numbers
Y_pred_errors_prob = np.max(Y_pred_errors, axis=1)
# Predicted probabilities of the true labels in the error set
true_prob_errors = np.diagonal(np.take(Y_pred_errors, Y_true_errors, axis=1))
# Difference between the probability of the predicted label and the true label
delta_pred_true_errors = Y_pred_errors_prob - true_prob_errors
# Indices sorted by that probability gap (most confident mistakes last)
sorted_delta_errors = np.argsort(delta_pred_true_errors)
# Top 6 errors
most_important_errors = sorted_delta_errors[-6:]
# Show the top 6 errors
display_errors(most_important_errors, X_val_errors, Y_pred_classes_errors, Y_true_errors)
############################################################################################################
'''
K.function creates theano/tensorflow tensor functions, which are later used
to get the output from the symbolic graph given the input.
K.learning_phase() is required as an input because many Keras layers, like
Dropout/BatchNormalization, depend on it to change behavior between training
and test time.
Adapted from the Keras
[docs](https://keras.io/getting-started/faq/#how-can-i-obtain-the-output-of-an-intermediate-layer)
and this [answer](https://stackoverflow.com/questions/41711190/keras-how-to-get-the-output-of-each-layer)
on Stack Overflow, wrapped up as a reusable function.
'''
from keras import backend as K

def layer_to_visualize(layer):
    inputs = [K.learning_phase()] + model.inputs
    _convout1_f = K.function(inputs, [layer.output])
    def convout1_f(X):
        # The [0] disables the training-phase flag
        return _convout1_f([0] + [X])
    convolutions = convout1_f(img_to_visualize)
    convolutions = np.squeeze(convolutions)
    print('Shape of conv:', convolutions.shape)
    n = convolutions.shape[0]
    n = int(np.ceil(np.sqrt(n)))
    # Visualization of each filter of the layer
    fig = plt.figure(figsize=(12, 12))
    for i in range(len(convolutions)):
        ax = fig.add_subplot(n, n, i + 1)
        ax.imshow(convolutions[i], cmap='gray')

# Specify the layer you want to visualize
layer_to_visualize(convo1)
# As convo2 is the output of a MaxPool2D layer,
# we can see that the image has blurred since
# the resolution has been reduced
layer_to_visualize(convo2)
#################################################################################