Srishti Saha srishtis

  • Mu Sigma
  • Bangalore
@srishtis
srishtis / SQL_map.sql
Last active November 7, 2018 06:46
map control parts to test parts
SELECT *
FROM
(
SELECT TEST_DEVICE, DEVICE_OS_TEST, USAGE_TEST,
CONTROL_DEVICE, DEVICE_OS_CTRL, USAGE_CTRL,
RANK () OVER (PARTITION BY TEST_DEVICE ORDER BY EUCLIDEAN_DEV_5 ) AS DEV_RANK_5,
RANK () OVER (PARTITION BY TEST_DEVICE ORDER BY EUCLIDEAN_DEV_3 ) AS DEV_RANK_3,
RANK () OVER (PARTITION BY TEST_DEVICE ORDER BY EUCLIDEAN_DEV_1 ) AS DEV_RANK_1
FROM EUCLIDEAN_TABLE
) RANKED -- most SQL dialects require an alias on the derived table
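With the ranks computed, the closest control part per test part falls out of a simple filter; a minimal sketch, assuming the 5-variable distance drives the final mapping:

SELECT TEST_DEVICE, CONTROL_DEVICE
FROM
(
SELECT TEST_DEVICE, CONTROL_DEVICE,
RANK () OVER (PARTITION BY TEST_DEVICE ORDER BY EUCLIDEAN_DEV_5 ) AS DEV_RANK_5
FROM EUCLIDEAN_TABLE
) RANKED
WHERE DEV_RANK_5 = 1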
@srishtis
srishtis / SQL_euclidean.sql
Last active November 7, 2018 06:40
compute the combination of Euclidean distances
CREATE TABLE EUCLIDEAN_TABLE AS
SELECT DEVICE_ID_TEST AS TEST_DEVICE, DEVICE_OS_TEST, USAGE_TEST, DEVICE_ID_CTRL AS CONTROL_DEVICE, DEVICE_OS_CTRL, USAGE_CTRL
, SQRT (
(POWER (RETAIL_PRICE,2))+
(POWER (RAM,2))+
(POWER (AGE,2))+
(POWER (STORAGE,2))+
(POWER (SCREEN_SIZE,2))
) AS EUCLIDEAN_DEV_5
-- the preview truncates below; EUCLIDEAN_DEV_3 and EUCLIDEAN_DEV_1 (referenced in SQL_map.sql above)
-- presumably repeat this pattern over subsets of the metric differences, e.g.:
, SQRT (
(POWER (RETAIL_PRICE,2))+
(POWER (RAM,2))+
(POWER (AGE,2))
) AS EUCLIDEAN_DEV_3
, SQRT (POWER (RETAIL_PRICE,2)) AS EUCLIDEAN_DEV_1
FROM JOINED_DIFFS -- hypothetical table holding the output of SQL_join.sql
@srishtis
srishtis / SQL_join.sql
Last active November 7, 2018 06:35
Join test and control groups and compute metric differences
SELECT A.DEVICE_ID_TEST, A.DEVICE_OS_TEST, A.USAGE_TEST, B.DEVICE_ID_CTRL, B.DEVICE_OS_CTRL, B.USAGE_CTRL,
(A.CURR_RETAIL_PRC_TEST)-(B.CURR_RETAIL_PRC_CTRL) AS RETAIL_PRICE,
(A.DEVICE_RAM_TEST)-(B.DEVICE_RAM_CTRL) AS RAM,
(A.AGE_TEST)-(B.AGE_CONTROL) AS AGE,
(A.DEVICE_STORAGE_TEST)-(B.DEVICE_STORAGE_CTRL) AS STORAGE,
(A.SCREEN_INCH_TEST)-(B.SCREEN_INCH_CTRL) AS SCREEN_SIZE
FROM
(
SELECT DEVICE_ID AS DEVICE_ID_TEST
, DEVICE_DESC AS DEVICE_DESC_TEST
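, CURR_RETAIL_PRC AS CURR_RETAIL_PRC_TEST
-- ... remaining test columns aliased as in SQL_select.sql below (preview truncated) ...
FROM TABLE_NAME
) A
CROSS JOIN -- join type assumed; a cross join pairs every test part with every control part
(
-- the control-side derived table is not shown in the preview; it presumably mirrors
-- the test selection with _CTRL aliases
SELECT DEVICE_ID AS DEVICE_ID_CTRL
, CURR_RETAIL_PRC AS CURR_RETAIL_PRC_CTRL
-- ... remaining control columns ...
FROM TABLE_NAME
) B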
@srishtis
srishtis / SQL_select.sql
Last active November 7, 2018 06:27
Select test parts from the base table.
SELECT DEVICE_ID
, DEVICE_DESC
, CURR_RETAIL_PRC AS CURR_RETAIL_PRC_TEST
, DEVICE_RAM AS DEVICE_RAM_TEST
, AGE AS AGE_TEST
, DEVICE_STORAGE AS DEVICE_STORAGE_TEST
, SCREEN_INCH AS SCREEN_INCH_TEST
, DEVICE_OS AS DEVICE_OS_TEST
, USAGE AS USAGE_TEST
FROM TABLE_NAME
@srishtis
srishtis / hpp_normalize.py
Created October 9, 2018 14:18
Normalize hpp dataset
# normalize the dataset (note: sklearn's normalize scales each sample/row to unit norm,
# rather than mapping each variable to the 0-1 range)
from sklearn import preprocessing
normalized_total_df = preprocessing.normalize(total_df)
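If column-wise 0-to-1 scaling is the actual intent, MinMaxScaler is the matching tool; a hedged sketch, assuming total_df is entirely numeric at this point:

from sklearn import preprocessing
import pandas as pd

# MinMaxScaler maps each column to [0, 1] via (x - min) / (max - min)
scaler = preprocessing.MinMaxScaler()
normalized_total_df = pd.DataFrame(scaler.fit_transform(total_df), columns=total_df.columns)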
@srishtis
srishtis / hpp_ohe_le.py
Created October 9, 2018 14:12
One-hot encoding and label encoding features in Kaggle HPP
# create a list of ordinal variables
ordinal_variables = ['HeatingQC','KitchenQual','FireplaceQu','GarageQual','PoolQC',
                     'ExterQual','BsmtQual','Fence','BsmtCond','GarageCond',
                     'ExterCond','OverallCond','OverallQual','TotalHomeQual']
# label-encode each ordinal column
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
for c in ordinal_variables:
    le.fit(total_df[c])
    total_df[c] = le.transform(total_df[c])
# create a list of categorical columns for one hot encoding
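The preview stops at the one-hot step; a minimal completion sketch, with the column list assumed since the actual one is cut off:

import pandas as pd

# hypothetical nominal columns -- the real list is truncated in the gist
categorical_variables = ['MSZoning', 'Neighborhood', 'HouseStyle']
# one-hot encode the nominal columns into 0/1 indicator features
total_df = pd.get_dummies(total_df, columns=categorical_variables)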
@srishtis
srishtis / hpp_feature_engg_1.py
Created October 9, 2018 13:51
Creating features for hpp Kaggle
# Let us start with the variables having the highest correlation with the target variable:
# OverallQual, GrLivArea, GarageCars and TotalBsmtSF
# Since OverallQual is among the variables most correlated with the response,
# we can create a quadratic term that might enter the regression equation
import numpy as np

total_df["OverallQual_2"] = total_df["OverallQual"].astype(int) ** 2
# also creating a cubic term
total_df["OverallQual_3"] = total_df["OverallQual"].astype(int) ** 3
# and a square-root transformation
total_df["OverallQual_sqrt"] = np.sqrt(total_df["OverallQual"].astype(int))
@srishtis
srishtis / hpp_missing_values_3.py
Created October 9, 2018 11:45
HPP missing value treatment part 3
# function to min-max scale a column to the 0-1 range
def norm_minmax(col):
    return (col - col.min()) / (col.max() - col.min())

# By business definition, LotFrontage is the linear feet of street connected to the house property
# Intuitively it should be highly correlated with variables like LotArea
# It should also depend on LotShape and LotConfig
# Let us build a simple linear regressor to impute the missing values
# convert categoricals to dummies
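The preview ends at the dummy-conversion step; a hedged sketch of the imputation the comments describe, with the predictor set assumed:

import pandas as pd
from sklearn.linear_model import LinearRegression

# hypothetical predictor set -- the actual columns are cut off in the gist
predictors = pd.get_dummies(total_df[['LotArea', 'LotShape', 'LotConfig']])
known = total_df['LotFrontage'].notnull()

# fit on rows where LotFrontage is known, then predict the missing rows
reg = LinearRegression().fit(predictors[known], total_df.loc[known, 'LotFrontage'])
total_df.loc[~known, 'LotFrontage'] = reg.predict(predictors[~known])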
@srishtis
srishtis / hpp_missing_values_2.py
Created October 9, 2018 11:32
hpp imputing missing values part 2
# Let us first focus on the columns with the lower null percentages (except LotFrontage)
# Let us see the distribution of data across these fields
# first up: Utilities
total_df.groupby(['Utilities']).size() # only one 'NoSeWa' value and 2 nulls
train_df.groupby(['Utilities']).size() # the train data contains the 'NoSeWa', i.e. the test data has none
# the 2 null values come from the test data
## intuitively this column will not play a significant role in our model prediction
# for now let us populate the nulls with the most frequent value 'AllPub' -- we can drop the column later
total_df['Utilities'] = total_df['Utilities'].fillna(total_df['Utilities'].mode()[0])
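If the column is indeed dropped later as suggested, a one-liner suffices:

# drop the near-constant Utilities column before modelling (optional, per the note above)
total_df = total_df.drop(columns=['Utilities'])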
@srishtis
srishtis / hpp_missing_values_1.py
Created October 9, 2018 11:25
Imputing missing values part 1
# columns where attributes like Pool, Fence etc. are marked NaN indicate the absence of these features
attributes_with_na = ['PoolQC','MiscFeature','Alley','Fence','MasVnrType','FireplaceQu',
                      'GarageQual','GarageCond','GarageFinish','GarageType',
                      'BsmtExposure','BsmtCond','BsmtQual','BsmtFinType1','BsmtFinType2']
# replace NaN with 'None' in these columns
for col in attributes_with_na:
    total_df[col].fillna('None', inplace=True)
# an NA in the masonry veneer columns indicates no masonry veneer, so MasVnrArea is replaced with 0
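The fill implied by that last comment, as a short sketch:

# no masonry veneer -> zero veneer area
total_df['MasVnrArea'] = total_df['MasVnrArea'].fillna(0)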