This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def apply_log1p_transformation(dataframe, column):
    """Add a log1p-transformed copy of a column and return it.

    Computes ``np.log1p`` of ``dataframe[column]`` and stores the result in
    ``dataframe`` under the name ``"log_" + column``. The input frame is
    modified in place: the new column is added to it (or overwritten if a
    column with that name already exists).

    Args:
        dataframe: pandas DataFrame containing ``column``.
        column: Name of the column to transform, as a string.

    Returns:
        The newly stored ``"log_" + column`` column as a pandas Series.
    """
    log_column = "log_" + column
    dataframe[log_column] = np.log1p(dataframe[column])
    return dataframe[log_column]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def make_list_of_K(K, dataframe): | |
'''inputs: K as integer and dataframe | |
apply k-means clustering to dataframe | |
and make a list of inertia values against 1 to K | |
return the inertia values list | |
''' | |
cluster_values = list(range(1, K+1)) | |
inertia_values=[] | |
for c in cluster_values: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def encode_column(column):
    """Encode a single value as a binary flag.

    The argument is compared directly against 0 in a plain conditional, so
    it has to be a value for which ``column > 0`` yields one boolean.

    Args:
        column: The value to encode.

    Returns:
        1 if ``column`` is greater than 0, otherwise 0.
    """
    # A single conditional guarantees a 0/1 result. With two separate
    # ``if`` tests, a value for which both ``> 0`` and ``<= 0`` are false
    # (e.g. NaN) would fall through and return None.
    return 1 if column > 0 else 0
def aggregate_by_ordered_quantity(dataframe, column_list): | |
'''this function: | |
1. aggregates a given dataframe by column list, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
np.random.seed(42) | |
import plotly.offline as pyo | |
import plotly.graph_objs as go | |
# create data: two arrays of 100 random integers each, drawn from 1..100
# (the upper bound 101 passed to randint is exclusive)
x = np.random.randint(1, 101, 100)
y = np.random.randint(1, 101, 100)
# data object to be used in figure object |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# source https://towardsdatascience.com/debugging-jupyter-notebooks-will-boost-your-productivity-a33387f4fa62 | |
import contextlib | |
with contextlib.redirect_stdout(None): | |
import pixiedust | |
# New Cell | |
def find_max (values): | |
max = 0 | |
import pdb; pdb.set_trace() | |
for val in values: | |
if val > max: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# data manipulation libraries | |
import pandas as pd | |
import numpy as np | |
pd.set_option('display.max_columns', 60) | |
# data visualization libraries | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
from IPython.core.pylabtools import figsize |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def missing_values_table(df): | |
# Total missing values | |
mis_val = df.isnull().sum() | |
# Percentage of missing values | |
mis_val_percent = 100 * df.isnull().sum() / len(df) | |
# Make a table with the results | |
mis_val_table = pd.concat([mis_val, mis_val_percent], | |
axis=1) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def CleanData(df, drop_columns, target_name): | |
# this function drops not used features | |
# and duplicate rows | |
# and rows with an empty target (points)
# returns cleaned df | |
interim_df = df.drop(columns=drop_columns) | |
interim_df_2 = (interim_df | |
.drop_duplicates(ignore_index=True)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def plot_histogram(df, column, b=None): | |
# function to print histogram
# with mean and median | |
# using distplot | |
# set the histogram, mean and median | |
g = sns.distplot(df[column], kde=False, bins=b) | |
plt.axvline(x=df[column].mean(), | |
linewidth=3, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def plot_distribution(df, target, column_values, column_name): | |
# function to print distribution of a continuous variable
# for categorical data | |
for value in column_values: | |
subset = df[df[column_name] == value] | |
g = sns.kdeplot(subset[target], | |
label=value, | |
linewidth=3) |