Skip to content

Instantly share code, notes, and snippets.

View cereniyim's full-sized avatar

Ceren cereniyim

View GitHub Profile
@cereniyim
cereniyim / convert_target_to_array.py
Created May 21, 2020 17:35
function to convert target dataframe to array
def convert_target_to_array(target):
    """Convert the target dataframe to a flat one-dimensional array.

    Args:
        target: Array-like holding the target values (for example a
            single-column dataframe); anything ``np.array`` accepts.

    Returns:
        A 1-D ``numpy.ndarray`` with every element of ``target``.
    """
    # reshape((-1,)) lets NumPy infer the length, so an (n, 1) frame
    # becomes an array of shape (n,).
    return np.array(target).reshape((-1,))
@cereniyim
cereniyim / convert_features_to_array.py
Created May 21, 2020 17:34
function to convert features dataframe to array
def convert_features_to_array(features):
    """Convert the features dataframe to a two-dimensional array.

    Args:
        features: Dataframe of feature columns; it must support ``len()``
            and expose a ``columns`` attribute.

    Returns:
        A ``numpy.ndarray`` of shape ``(len(features),
        len(features.columns))`` holding the dataframe's values.
    """
    num_rows = len(features)
    num_cols = len(features.columns)
    # The explicit reshape pins the result to (rows, cols).
    features_array = np.array(features).reshape((num_rows, num_cols))
    # The visible original built this array but never handed it back.
    return features_array
@cereniyim
cereniyim / wine_rating_predictor_imports2.py
Created May 21, 2020 17:32
libraries used in WineRatingPredictor-2 notebook
# manipulation libraries
import pandas as pd
import numpy as np
# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# to display visuals in the notebook
@cereniyim
cereniyim / impute_missing_values.py
Created April 29, 2020 14:15
impute missing values function
def ImputeMissingValues(train_df, test_df):
# Impute missing values in the train and test dataframes.
# NOTE(review): only the opening of this function is visible in this
# excerpt; the remaining imputation steps and the return value are not
# shown, so they are not described here.
#
# separate non-NA cols
# (the column list is built from train_df only and then applied to both
# frames; the test matches "is_" anywhere in the name, not just as a prefix)
is_features = [col for col in train_df.columns
if col.find("is_") != -1]
interim_train_1 = train_df[is_features]
interim_test_1 = test_df[is_features]
# impute taster_name NA with 0 as "Unknown"
# (delegates to ImputeWithConstant, called here with its default cols)
constant_impute = ImputeWithConstant(train_df,
test_df)
@cereniyim
cereniyim / most_frequent_imputer.py
Created April 29, 2020 13:45
most frequent imputer function
def ImputeWithMostFrequent(train_df, test_df,
cols=["country", "province",
"region_1", "variety"]):
# function to impute country, province, region_1, variety
# columns with the most_frequent value of each feature
# most_frequent imputer is fitted on train dataset
# transformation done on the train and test set
# NOTE(review): the default for cols is a list object shared between
# calls; that is only safe while nothing mutates it -- confirm.
# NOTE(review): this excerpt ends after the column selection below; the
# imputer construction, fit/transform and return are not visible here.
#
# keep only the requested columns; the local names are rebound, the
# caller's frames are not modified by these two lines
train_df = pd.DataFrame(train_df[cols])
test_df = pd.DataFrame(test_df[cols])
@cereniyim
cereniyim / median_imputer.py
Created April 29, 2020 13:40
median imputer function
def ImputeWithMedian(train_df, test_df, cols=["price", "year"]):
# function to impute price and year
# columns with the median value of each
# median imputer is fitted on train dataset
# transformation done on the train and test set
# NOTE(review): the default for cols is a list object shared between
# calls; that is only safe while nothing mutates it -- confirm.
# NOTE(review): this excerpt ends right after the imputer is created; the
# fit/transform calls and the return are not visible here.
#
# keep only the requested columns (local names are rebound)
train_df = pd.DataFrame(train_df[cols])
test_df = pd.DataFrame(test_df[cols])
# imputer configured with the "median" strategy
median_imputer = SimpleImputer(strategy="median")
@cereniyim
cereniyim / constant_imputer.py
Created April 29, 2020 13:38
constant imputer function
def ImputeWithConstant(train_df, test_df, cols=["taster_name"]):
# function to impute taster_name
# with 0 stands for "Unknown value"
# NOTE(review): the default for cols is a list object shared between
# calls; that is only safe while nothing mutates it -- confirm.
# NOTE(review): this excerpt ends after the fit call; the transform of
# the train/test frames and the return are not visible here.
#
# keep only the requested columns (local names are rebound)
train_df = pd.DataFrame(train_df[cols])
test_df = pd.DataFrame(test_df[cols])
# imputer configured to fill with the constant 0
constant_imputer = SimpleImputer(strategy="constant", fill_value=0)
# the imputer is fitted on the train subset only
constant_imputer.fit(train_df)
@cereniyim
cereniyim / ordinal_encoder.py
Created April 29, 2020 13:06
ordinal encoder function
def EncodeCategoricalData(train_df,
test_df,
cols=ordinal_encode_cols):
# function to create ordinal encoder object and
# assigns -1 to the unseen labels of the test set
# returns ordinal encoded train and test datasets
# respectively
# NOTE(review): the default for cols is the module-level
# ordinal_encode_cols, evaluated once when this def statement runs.
# NOTE(review): the encoder below is given ordinal_encode_cols directly,
# so a caller-supplied cols appears to be ignored in the visible code --
# confirm whether cols=cols was intended.
# NOTE(review): this excerpt ends inside the OrdinalEncoder(...) call;
# its remaining arguments, the fit/transform steps and the return are
# not visible here.
ordinal_encoder = OrdinalEncoder(cols=ordinal_encode_cols,
return_df=True,
handle_unknown="value",
@cereniyim
cereniyim / extract_features.py
Last active April 29, 2020 13:47
extract features function
def ExtractFeatures(df):
# Derive extra feature columns from the description and title columns.
# adds is_red, is_white, is_rose,
# is_sparkling, is_dry, is_sweet
# (presumably one per entry of desc_extracting_dict, which is defined
# elsewhere -- verify against that dict)
for key, value in desc_extracting_dict.items():
# NOTE(review): every iteration passes the original df, not the previous
# interim_df; unless the helper adds its column to df in place, only the
# last entry's result would survive in interim_df -- confirm.
interim_df = extract_features_from_description(df, "description", key, value)
# adds year
# NOTE(review): this excerpt ends in the middle of the expression below,
# which starts from the string accessor of the title column; the rest of
# the chain, the later steps and the return are not visible here.
interim_df["title_numlist"] = (interim_df
.title
.str
@cereniyim
cereniyim / extract_blend_from_variety.py
Created April 29, 2020 11:59
extract blend from variety function
def extract_blend_from_variety(variety):
    """Flag whether a variety name denotes a blend.

    A variety counts as a blend when its name contains a hyphen or the
    word "Blend", except for "Xarel-lo", which is special-cased as a
    non-blend despite its hyphen.

    ASSUMPTION: there are no NA values in the variety feature, so
    ``variety`` is always a string.

    Args:
        variety: Variety name as a string.

    Returns:
        1 if the variety is a blend, otherwise 0.
    """
    # Special-cased first so its hyphen does not trigger the blend rule.
    if variety == "Xarel-lo":
        return 0
    if "-" in variety or "Blend" in variety:
        return 1
    # Explicit 0 for everything else; the visible original fell through
    # here and returned None.
    return 0