Skip to content

Instantly share code, notes, and snippets.

View cereniyim's full-sized avatar

Ceren cereniyim

View GitHub Profile
cereniyim /
Created February 14, 2020 11:54
Natural Logarithm+1 Transformation Function
def apply_log1p_transformation(dataframe, column):
    '''Add a log1p-transformed copy of a column and return it.

    Applies ``np.log1p`` to ``dataframe[column]`` and stores the result
    in ``dataframe`` under the name ``"log_" + column``.

    Args:
        dataframe: dataframe holding ``column``; it is modified in place
            by the added ``log_<column>`` column.
        column: name of the column to transform, as a string.

    Returns:
        The newly stored ``log_<column>`` pandas series.
    '''
    log_column = "log_" + column
    dataframe[log_column] = np.log1p(dataframe[column])
    return dataframe[log_column]
cereniyim /
Last active February 14, 2020 14:39
Make a list of k-means inertia values for 1 to K clusters on a given dataframe
def make_list_of_K(K, dataframe):
'''inputs: K as integer and dataframe
apply k-means clustering to dataframe
and make a list of inertia values against 1 to K
return the inertia values list
cluster_values = list(range(1, K+1))
for c in cluster_values:
def encode_column(column):
    '''Encode a numeric value as a 0/1 flag.

    NOTE(review): the comparisons need a single truth value, so this is
    presumably applied element-wise to a column -- confirm against the
    caller.

    Args:
        column: value to encode.

    Returns:
        1 if ``column > 0``, 0 if ``column <= 0``, and None when neither
        comparison holds (e.g. NaN).
    '''
    if column > 0:
        return 1
    if column <= 0:
        return 0
    # Neither comparison is true (e.g. NaN): keep the implicit None result
    # of the original two-branch form, but make it explicit.
    return None
def aggregate_by_ordered_quantity(dataframe, column_list):
'''this function:
1. aggregates a given dataframe by column list,
cereniyim /
Created April 6, 2020 09:36
explanation of how plotly works
import numpy as np
import plotly.offline as pyo
import plotly.graph_objs as go
# create data: two arrays of 100 random integers each,
# drawn from 1 up to (but not including) 101
x = np.random.randint(1, 101, 100)
y = np.random.randint(1, 101, 100)
# data object to be used in figure object
cereniyim /
Created April 6, 2020 10:23
Debugging enabled in jupyter notebooks
# source
import contextlib
with contextlib.redirect_stdout(None):
import pixiedust
# New Cell
def find_max (values):
max = 0
import pdb; pdb.set_trace()
for val in values:
if val > max:
cereniyim /
Created April 29, 2020 11:00
libraries for wine rating predictor part one
# data manipulation libraries
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 60)
# data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.pylabtools import figsize
cereniyim /
Last active April 29, 2020 11:19
missing value function
def missing_values_table(df):
# Total missing values
mis_val = df.isnull().sum()
# Percentage of missing values
mis_val_percent = 100 * df.isnull().sum() / len(df)
# Make a table with the results
mis_val_table = pd.concat([mis_val, mis_val_percent],
cereniyim /
Created April 29, 2020 11:06
clean data function
def CleanData(df, drop_columns, target_name):
# this function drops not used features
# and duplicate rows
# and empty rows of target (points)
# returns cleaned df
interim_df = df.drop(columns=drop_columns)
interim_df_2 = (interim_df
cereniyim /
Created April 29, 2020 11:14
plot histogram with mean and median
def plot_histogram(df, column, b=None):
# function to print histogram
# with mean and median
# using distplot
# set the histogram, mean and median
g = sns.distplot(df[column], kde=False, bins=b)
cereniyim /
Created April 29, 2020 11:17
plot distribution of a variable per categorical variable
def plot_distribution(df, target, column_values, column_name):
# function to print distribution of a continuous variable
# for categorical data
for value in column_values:
subset = df[df[column_name] == value]
g = sns.kdeplot(subset[target],