Skip to content

Instantly share code, notes, and snippets.

View cereniyim's full-sized avatar

Ceren cereniyim

View GitHub Profile
@cereniyim
cereniyim / extract_year_from_title.py
Created April 29, 2020 11:54
year extraction function from feature
def extract_year_from_title(title):
# function to find the year in the given list
# if not found assigns zero as year
# ASSUMPTION: There are no NA values
# in the title feature
int_list = []
now = datetime.datetime.now()
for item in title:
@cereniyim
cereniyim / search_keywards_in_feature.py
Created April 29, 2020 11:51
search for a given keyword in a feature
def extract_features_from_description(df,
column_name,
new_feature_name,
extract_words):
# function to extract features from the column_name
# searches column_name feature for a given list
# ASSUMPTION: There are no NA values
# in the description feature
check_regex = (r'\b(?:{})\b'
.format('|'
@cereniyim
cereniyim / plot_distribution.py
Created April 29, 2020 11:17
plot distribution of a variable per categorical variable
def plot_distribution(df, target, column_values, column_name):
# function to plot distribution of a continuous variable
# for categorical data
for value in column_values:
subset = df[df[column_name] == value]
g = sns.kdeplot(subset[target],
label=value,
linewidth=3)
@cereniyim
cereniyim / plot_histogram.py
Created April 29, 2020 11:14
plot histogram with mean and median
def plot_histogram(df, column, b=None):
# function to plot histogram
# with mean and median
# using distplot
# set the histogram, mean and median
g = sns.distplot(df[column], kde=False, bins=b)
plt.axvline(x=df[column].mean(),
linewidth=3,
@cereniyim
cereniyim / clean_data.py
Created April 29, 2020 11:06
clean data function
def CleanData(df, drop_columns, target_name):
# this function drops unused features
# and duplicate rows
# and rows with an empty target (points)
# returns cleaned df
interim_df = df.drop(columns=drop_columns)
interim_df_2 = (interim_df
.drop_duplicates(ignore_index=True))
@cereniyim
cereniyim / missing_value_function.py
Last active April 29, 2020 11:19
missing value function
def missing_values_table(df):
# Total missing values
mis_val = df.isnull().sum()
# Percentage of missing values
mis_val_percent = 100 * df.isnull().sum() / len(df)
# Make a table with the results
mis_val_table = pd.concat([mis_val, mis_val_percent],
axis=1)
@cereniyim
cereniyim / wine_rating_predictor_imports.py
Created April 29, 2020 11:00
libraries for wine rating predictor part one
# data manipulation libraries
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 60)
# data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.pylabtools import figsize
@cereniyim
cereniyim / jupyterlab_debugging.py
Created April 6, 2020 10:23
Debugging enabled in jupyter notebooks
# source https://towardsdatascience.com/debugging-jupyter-notebooks-will-boost-your-productivity-a33387f4fa62
import contextlib
with contextlib.redirect_stdout(None):
import pixiedust
# New Cell
def find_max (values):
max = 0
import pdb; pdb.set_trace()
for val in values:
if val > max:
@cereniyim
cereniyim / plotly_logic_explanation.py
Created April 6, 2020 09:36
explanation of how plotly works
import numpy as np
np.random.seed(42)
import plotly.offline as pyo
import plotly.graph_objs as go
# create data: two separate draws of 100 random integers each,
# taken from 1..100 (randint's upper bound, 101, is exclusive);
# the seed set above makes both draws reproducible, and x is
# drawn before y, so swapping these lines would change the values
x = np.random.randint(1, 101, 100)
y = np.random.randint(1, 101, 100)
# data object to be used in figure object
def encode_column(column):
    """Binary-encode a single numeric value by its sign.

    Returns 1 when ``column`` is strictly positive and 0 when it is
    zero or negative. If neither comparison holds (for example, NaN),
    the result is ``None``.
    """
    if column > 0:
        return 1
    # Second comparison kept explicit so that incomparable values
    # such as NaN still fall through to None.
    return 0 if column <= 0 else None
def aggregate_by_ordered_quantity(dataframe, column_list):
'''this function:
1. aggregates a given dataframe by column list,