This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def convert_target_to_array(target):
    """Convert the target data into a flat, one-dimensional NumPy array.

    Args:
        target: Target values in any form ``np.array`` accepts, e.g. a
            single-column DataFrame, a Series or a (nested) list.

    Returns:
        A 1-D ``numpy.ndarray`` containing every element of ``target``.
    """
    # reshape(-1) flattens whatever shape np.array produces, so an (n, 1)
    # DataFrame becomes a plain length-n vector.
    return np.array(target).reshape(-1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def convert_features_to_array(features):
    """Convert the feature DataFrame into a two-dimensional NumPy array.

    Args:
        features: DataFrame-like object that supports ``len()`` and exposes
            a ``.columns`` attribute.

    Returns:
        A ``numpy.ndarray`` of shape
        ``(len(features), len(features.columns))``.
    """
    num_rows = len(features)
    num_cols = len(features.columns)
    # The explicit reshape pins the result to (rows, columns) and raises if
    # the converted data does not have exactly that many elements.
    features_array = np.array(features).reshape((num_rows, num_cols))
    # The visible code built the array but never handed it back; return it,
    # matching convert_target_to_array.
    return features_array
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# manipulation libraries | |
import pandas as pd | |
import numpy as np | |
# visualization libraries | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
%matplotlib inline | |
# to display visuals in the notebook |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def ImputeMissingValues(train_df, test_df): | |
# Top-level imputation step for the train and test frames.
# NOTE(review): only the opening statements of this function are visible
# in this excerpt; its remaining steps and return value cannot be
# determined from here.
#
# Separate the non-NA columns: every column of train_df whose name
# contains "is_" anywhere (find() != -1, so not only as a prefix).
is_features = [col for col in train_df.columns | |
if col.find("is_") != -1] | |
# The column list is derived from train_df only and then applied to both
# frames, so test_df must contain the same "is_" columns.
interim_train_1 = train_df[is_features] | |
interim_test_1 = test_df[is_features] | |
# Impute taster_name NA with 0, which stands for "Unknown".
# ImputeWithConstant is called with its default `cols`.
constant_impute = ImputeWithConstant(train_df, | |
test_df) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def ImputeWithMostFrequent(train_df, test_df, | |
cols=["country", "province", | |
"region_1", "variety"]): | |
# Intended contract, per the original author's notes: impute the country,
# province, region_1 and variety columns with the most frequent value of
# each feature, fitting the imputer on the train set and transforming
# both the train and the test set.
# NOTE(review): the imputer code itself is not visible in this excerpt;
# only the column selection below is.
# NOTE(review): `cols` defaults to a list literal, which is shared across
# calls; that is safe only as long as the function never mutates it.
#
# Restrict both frames to the columns being imputed. The parameters are
# rebound to new DataFrames here.
train_df = pd.DataFrame(train_df[cols]) | |
test_df = pd.DataFrame(test_df[cols]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def ImputeWithMedian(train_df, test_df, cols=["price", "year"]): | |
# Intended contract, per the original author's notes: impute the price
# and year columns with the median value of each, fitting the imputer on
# the train set and transforming both the train and the test set.
# NOTE(review): the fit/transform steps and the return value are not
# visible in this excerpt.
# NOTE(review): `cols` defaults to a list literal, which is shared across
# calls; that is safe only as long as the function never mutates it.
#
# Restrict both frames to the columns being imputed.
train_df = pd.DataFrame(train_df[cols]) | |
test_df = pd.DataFrame(test_df[cols]) | |
# SimpleImputer is not imported in the visible part of this file;
# presumably sklearn.impute.SimpleImputer. TODO confirm.
median_imputer = SimpleImputer(strategy="median") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def ImputeWithConstant(train_df, test_df, cols=["taster_name"]): | |
# Impute taster_name with the constant 0, which stands for
# "Unknown value".
# NOTE(review): the transform step and the return value are not visible
# in this excerpt; the code shown stops after fitting the imputer.
# NOTE(review): `cols` defaults to a list literal, which is shared across
# calls; that is safe only as long as the function never mutates it.
#
# Restrict both frames to the columns being imputed.
train_df = pd.DataFrame(train_df[cols]) | |
test_df = pd.DataFrame(test_df[cols]) | |
# SimpleImputer is not imported in the visible part of this file;
# presumably sklearn.impute.SimpleImputer. TODO confirm.
constant_imputer = SimpleImputer(strategy="constant", fill_value=0) | |
# Fitted on the train frame only.
constant_imputer.fit(train_df) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def EncodeCategoricalData(train_df, | |
test_df, | |
cols=ordinal_encode_cols): | |
# Function to create an ordinal encoder object. Per the original
# author's notes it assigns -1 to the unseen labels of the test set and
# returns the ordinal-encoded train and test datasets, respectively.
# NOTE(review): the rest of the constructor call, the fit/transform steps
# and the return statement are not visible in this excerpt.
# NOTE(review): the default for `cols` is evaluated when the function is
# defined, so the module-level `ordinal_encode_cols` must already exist
# at that point.
# NOTE(review): the `cols` parameter is not used below; the encoder is
# given the module-level `ordinal_encode_cols` directly, so a caller's
# `cols` argument has no effect on this call. Likely meant `cols=cols`.
# OrdinalEncoder is not imported in the visible part of this file; the
# `cols` / `return_df` keywords suggest category_encoders. TODO confirm.
ordinal_encoder = OrdinalEncoder(cols=ordinal_encode_cols, | |
return_df=True, | |
handle_unknown="value", |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def ExtractFeatures(df): | |
# Feature-extraction step for a frame with "description" and "title"
# columns.
# Per the original note this adds is_red, is_white, is_rose,
# is_sparkling, is_dry and is_sweet (the note listed is_rose twice). The
# actual names come from desc_extracting_dict, which is defined outside
# this excerpt.
# NOTE(review): every iteration passes the original `df` and rebinds
# `interim_df`, so the features only accumulate if
# extract_features_from_description modifies `df` in place; verify
# against that helper. If desc_extracting_dict is empty, `interim_df` is
# never assigned before it is used below.
for key, value in desc_extracting_dict.items(): | |
interim_df = extract_features_from_description(df, "description", key, value) | |
# Adds year: starts by building a "title_numlist" column from the title
# strings. The expression continues beyond this excerpt.
interim_df["title_numlist"] = (interim_df | |
.title | |
.str |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def extract_blend_from_variety(variety):
    """Flag whether a grape variety name denotes a blend.

    A variety counts as a blend when its name contains a hyphen or the
    word "Blend". "Xarel-lo" is the one exception.

    ASSUMPTION: there are no NA values in the variety feature; ``variety``
    must be a string.

    Args:
        variety: Variety name.

    Returns:
        1 if the variety is a blend, otherwise 0.
    """
    # "Xarel-lo" contains a hyphen but is explicitly not counted as a blend.
    if variety == "Xarel-lo":
        return 0
    if "-" in variety or "Blend" in variety:
        return 1
    # Explicit 0 for everything else, so callers always get an int flag
    # rather than an implicit None.
    return 0