Skip to content

Instantly share code, notes, and snippets.

View cereniyim's full-sized avatar

Ceren cereniyim

View GitHub Profile
@cereniyim
cereniyim / logarithm_transformation.py
Created February 14, 2020 11:54
Natural Logarithm+1 Transformation Function
def apply_log1p_transformation(dataframe, column):
    """Add a log1p-transformed copy of a column and return it.

    Computes ``np.log1p`` (natural logarithm of ``1 + x``) of
    ``dataframe[column]`` and stores the result in ``dataframe`` under the
    label ``"log_<column>"``.

    Args:
        dataframe: pandas DataFrame holding ``column``. It is modified in
            place: the ``log_``-prefixed column is added (or overwritten).
        column: label of the column to transform. Usually a string; other
            label types are formatted into the new column name.

    Returns:
        The newly stored ``log_``-prefixed column as a pandas Series.
    """
    # Build the label with an f-string rather than "log_" + column so that
    # non-string labels do not raise TypeError; string labels are unchanged.
    log_column = f"log_{column}"
    dataframe[log_column] = np.log1p(dataframe[column])
    return dataframe[log_column]
@cereniyim
cereniyim / make_list_of_K.py
Last active February 14, 2020 14:39
Make a list of given K for the k-means clustering applied to a given dataframe
def make_list_of_K(K, dataframe):
'''inputs: K as integer and dataframe
apply k-means clustering to dataframe
and make a list of inertia values against 1 to K
return the inertia values list
'''
cluster_values = list(range(1, K+1))
inertia_values=[]
for c in cluster_values:
def encode_column(column):
    """Binary-encode a single numeric value.

    Args:
        column: the value to encode. It is compared directly against 0, so
            it is expected to be a scalar rather than a whole column.

    Returns:
        1 if the value is strictly positive, otherwise 0.
    """
    # One comparison with an explicit fallback: values that are neither
    # > 0 nor <= 0 (e.g. NaN) used to fall through and return None.
    return 1 if column > 0 else 0
def aggregate_by_ordered_quantity(dataframe, column_list):
'''this function:
1. aggregates a given dataframe by column list,
@cereniyim
cereniyim / plotly_logic_explanation.py
Created April 6, 2020 09:36
explanation of how plotly works
import numpy as np
np.random.seed(42)
import plotly.offline as pyo
import plotly.graph_objs as go
# create data: x and y are each 100 random integers drawn from 1..100
x = np.random.randint(low=1, high=101, size=100)
y = np.random.randint(low=1, high=101, size=100)
# data object to be used in figure object
@cereniyim
cereniyim / jupyterlab_debugging.py
Created April 6, 2020 10:23
Debugging enabled in jupyter notebooks
# source https://towardsdatascience.com/debugging-jupyter-notebooks-will-boost-your-productivity-a33387f4fa62
import contextlib
with contextlib.redirect_stdout(None):
import pixiedust
# New Cell
def find_max (values):
max = 0
import pdb; pdb.set_trace()
for val in values:
if val > max:
@cereniyim
cereniyim / wine_rating_predictor_imports.py
Created April 29, 2020 11:00
libraries for wine rating predictor part one
# data manipulation libraries
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 60)
# data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.pylabtools import figsize
@cereniyim
cereniyim / missing_value_function.py
Last active April 29, 2020 11:19
missing value function
def missing_values_table(df):
# Total missing values
mis_val = df.isnull().sum()
# Percentage of missing values
mis_val_percent = 100 * df.isnull().sum() / len(df)
# Make a table with the results
mis_val_table = pd.concat([mis_val, mis_val_percent],
axis=1)
@cereniyim
cereniyim / clean_data.py
Created April 29, 2020 11:06
clean data function
def CleanData(df, drop_columns, target_name):
# this function drops not used features
# and duplicate rows
# and empty rows of target (points)
# returns cleaned df
interim_df = df.drop(columns=drop_columns)
interim_df_2 = (interim_df
.drop_duplicates(ignore_index=True))
@cereniyim
cereniyim / plot_histogram.py
Created April 29, 2020 11:14
plot histogram with mean and median
def plot_histogram(df, column, b=None):
# function to print histogram
# with mean and median
# using distplot
# set the histogram, mean and median
g = sns.distplot(df[column], kde=False, bins=b)
plt.axvline(x=df[column].mean(),
linewidth=3,
@cereniyim
cereniyim / plot_distribution.py
Created April 29, 2020 11:17
plot distribution of a variable per a categorical variable
def plot_distribution(df, target, column_values, column_name):
# function to print distribution of a continuous variable
# for categorical data
for value in column_values:
subset = df[df[column_name] == value]
g = sns.kdeplot(subset[target],
label=value,
linewidth=3)