frankchen0130/pd_pipes.py

## pd_pipes.py
def pipe_basic_fillna(df=combined):
    local_ntrain = ntrain
    pre_combined=df.copy()

    #print("The input train dimension:\t", pre_combined[0:ntrain].shape)
    #print("The input test dimension:\t", pre_combined[ntrain:].drop("SalePrice",axis=1).shape)

    num_cols = pre_combined.drop(["Id","SalePrice"],axis=1).select_dtypes(include=[np.number]).columns
    cat_cols = pre_combined.select_dtypes(include=[np.object]).columns


    pre_combined[num_cols]= pre_combined[num_cols].fillna(pre_combined[num_cols].median())
    # Median is my favoraite fillna mode, which can eliminate the skew impact.
    pre_combined[cat_cols]= pre_combined[cat_cols].fillna("NA")
    pre_combined= pd.concat([pre_combined[["Id","SalePrice"]],pre_combined[cat_cols],pre_combined[num_cols]],axis=1)
    return pre_combined
def pipe_drop4cols(pre_combined=pipe_fillna_ascat()):
    cols_drop =['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
    # the 4 features(columns was identified earlier which missing data >50%)
    #pre_combined, ntrain = customize_fillna_extract_outliersdrop()
    #pre_combined = customize_fillna_extract_outliersdrop()
    pre_combined = pre_combined.drop(cols_drop,axis=1)

    return pre_combined
def pipe_drop_cols(pre_combined = pipe_fillna_ascat()):
    pre_combied = pre_combined.drop(['Street', 'Utilities', 'Condition2', 'RoofMatl', 'Heating'], axis = 1)

    return pre_combined
def pipe_fillna_ascat(df=combined):
    from datetime import datetime
    local_ntrain = train.shape[0]

    pre_combined=df.copy()


    #convert quality feature to category type
    #def feature group with same categories value
    cols_Subclass = ["MSSubClass"]
    cols_Zone = ["MSZoning"]
    cols_Overall =["OverallQual","OverallCond"]

    cols_Qual = ["BsmtCond","BsmtQual",\
                 "ExterQual","ExterCond",\
                 "FireplaceQu","GarageQual","GarageCond",\
                 "HeatingQC","KitchenQual",
                "PoolQC"]
    cols_BsmtFinType = ["BsmtFinType1","BsmtFinType2"]
    cols_access = ["Alley","Street"]
    cols_condition = ["Condition1","Condition2"]
    cols_fence =["Fence"]
    cols_exposure = ["BsmtExposure"]
    cols_miscfeat = ["MiscFeature"]
    cols_exter = ["Exterior1st","Exterior2nd"]
    cols_MasVnr =["MasVnrType"]
    cols_GarageType = ["GarageType"]
    cols_GarageFinish =["GarageFinish"]
    cols_Functional = ["Functional"]
    cols_Util =["Utilities"]
    cols_SaleType = ["SaleType"]
    cols_Electrical = ["Electrical"]


    #define the map of categories valus group
    cat_Subclass = ["20",#1-STORY 1946 & NEWER ALL STYLES
                    "30",#1-STORY 1945 & OLDER
                    "40",#1-STORY W/FINISHED ATTIC ALL AGES
                    "45",#1-1/2 STORY - UNFINISHED ALL AGES
                    "50",#1-1/2 STORY FINISHED ALL AGES
                    "60",#2-STORY 1946 & NEWER
                    "70",#2-STORY 1945 & OLDER
                    "75",#2-1/2 STORY ALL AGES
                    "80",#SPLIT OR MULTI-LEVEL
                    "85",#SPLIT FOYER
                    "90",#DUPLEX - ALL STYLES AND AGES
                    "120",#1-STORY PUD (Planned Unit Development) - 1946 & NEWER
                    "150",#1-1/2 STORY PUD - ALL AGES
                    "160",#2-STORY PUD - 1946 & NEWER
                    "180",#PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
                    "190",#2 FAMILY CONVERSION - ALL STYLES AND AGES
                   ]
    cat_Zone = ["A",#Agriculture
                "C (all)",#Commercial #the train/test value is different than the data_description file.
                "FV",#Floating Village Residential
                "I",#Industrial
                "RH",#Residential High Density
                "RL",#Residential Low Density
                "RP",#Residential Low Density Park
                "RM",#Residential Medium Density
               ]
    cat_Overall = ["10","9","8","7","6","5","4","3","2","1"]
    cat_Qual = ["Ex","Gd","TA","Fa","Po","NA"]
    cat_BsmtFinType = ["GLQ","ALQ","BLQ","Rec","LwQ","Unf","NA"]
    cat_access = ["Grvl","Pave","NA"]
    cat_conditions= ["Artery","Feedr","Norm","RRNn","RRAn","PosN","PosA","RRNe","RRAe"]
    cat_fence = ["GdPrv",#Good Privacy
                "MnPrv",#Minimum Privacy
                "GdWo",#Good Wood
                "MnWw",#Minimum Wood/Wire
                "NA",#No Fence
                ]
    cat_exposure = ["Gd", #Good Exposure
                    "Av", #Average Exposure (split levels or foyers typically score average or above)
                    "Mn", #Mimimum Exposure
                    "No", #No Exposure
                    "NA", #No Basement
                   ]
    cat_miscfeat = ["Elev",#Elevator
                    "Gar2",#2nd Garage (if not described in garage section)
                    "Othr",#Other
                    "Shed",#Shed (over 100 SF)
                    "TenC",#Tennis Court
                    "NA",#None
                   ]
    cat_exter =["AsbShng",#Asbestos Shingles
                "AsphShn",#Asphalt Shingles
                "BrkComm",#Brick Common Brk Cmn BrkComm
                "BrkFace",#Brick Face
                "CBlock",#Cinder Block
                "CementBd",#Cement Board #CementBd was the  data_description value
                "HdBoard",#Hard Board
                "ImStucc",#Imitation Stucco
                "MetalSd",#Metal Siding
                "Other",#Other
                "Plywood",#Plywood
                "PreCast",#PreCast,#
                "Stone",#Stone
                "Stucco",#Stucco
                "VinylSd",#Vinyl Siding
                "Wd Sdng",#Wood Siding
                "WdShing",#Wood Shingles #Wd Shng WdShing
                    ]
    cat_MasVnr =["BrkCmn",#Brick Common
                "BrkFace",#Brick Face
                "CBlock",#Cinder Block
                "None",#None
                "Stone",#Stone
                ]
    cat_GarageType =["2Types",#More than one type of garage
                    "Attchd",#Attached to home
                    "Basment",#Basement Garage
                    "BuiltIn",#Built-In (Garage part of house - typically has room above garage)
                    "CarPort",#Car Port
                    "Detchd",#Detached from home
                    "NA",#No Garage
                    ]
    cat_GarageFinish =["Fin",#Finished
                        "RFn",#Rough Finished,#
                        "Unf",#Unfinished
                        "NA",#No Garage
                      ]
    cat_Functional = ["Typ",#Typical Functionality
                    "Min1",#Minor Deductions 1
                    "Min2",#Minor Deductions 2
                    "Mod",#Moderate Deductions
                    "Maj1",#Major Deductions 1
                    "Maj2",#Major Deductions 2
                    "Sev",#Severely Damaged
                    "Sal",#Salvage only
                    ]
    cat_Util =["AllPub",#All public Utilities (E,G,W,& S)
               "NoSewr",#Electricity, Gas, and Water (Septic Tank)
                "NoSeWa",#Electricity and Gas Only
                "ELO",#Electricity only,#
                ]
    cat_SaleType =["WD",#Warranty Deed - Conventional
                   "CWD",#Warranty Deed - Cash
                    "VWD",#Warranty Deed - VA Loan
                    "New",#Home just constructed and sold
                    "COD",#Court Officer Deed/Estate
                    "Con",#Contract 15% Down payment regular terms
                    "ConLw",#Contract Low Down payment and low interest
                    "ConLI",#Contract Low Interest
                    "ConLD",#Contract Low Down
                    "Oth",#Other
                    ]
    cat_Electrical = ["SBrkr",#Standard Circuit Breakers & Romex
                        "FuseA",#Fuse Box over 60 AMP and all Romex wiring (Average),#
                        "FuseF",#60 AMP Fuse Box and mostly Romex wiring (Fair)
                        "FuseP",#60 AMP Fuse Box and mostly knob & tube wiring (poor)
                        "Mix",#Mixed
                        ]


    ###########################################################################
    #define the collection of group features &categories value by diction type
    Dict_category={"Qual":[cols_Qual,cat_Qual,"NA","Ordinal"],
                   "Overall":[cols_Overall,cat_Overall,"5","Ordinal"], # It is integer already. no need overwork
                   "BsmtFinType":[cols_BsmtFinType,cat_BsmtFinType,"NA","Ordinal"],
                   "Access":[cols_access,cat_access,"NA","Ordinal"],
                    "Fence":[cols_fence,cat_fence,"NA","Ordinal"],
                   "Exposure":[cols_exposure,cat_exposure,"NA","v"],
                   "GarageFinish":[cols_GarageFinish,cat_GarageFinish,"NA","Ordinal"],
                   "Functional":[cols_Functional,cat_Functional,"Typ","Ordinal"], #fill na with lowest quality
                   "Utility":[cols_Util,cat_Util,"ELO","Ordinal"], # fillNA with lowest quality

                   "Subclass":[cols_Subclass,cat_Subclass,"NA","Nominal"],
                   "Zone":[cols_Zone,cat_Zone,"RL","Nominal"], #RL is most popular zone value. "C(all) is the study result"
                   "Cond":[cols_condition,cat_conditions,"Norm","Nominal"],
                   "MiscFeature":[cols_miscfeat,cat_miscfeat,"NA","Nominal"],
                   "Exter":[cols_exter,cat_exter,"Other","Nominal"],
                   "MasVnr":[cols_MasVnr,cat_MasVnr,"None","Nominal"],
                   "GarageType":[cols_GarageType,cat_GarageType,"NA","Nominal"],
                   "SaleType":[cols_SaleType, cat_SaleType,"WD","Nominal"],
                   "Electrical":[cols_Electrical,cat_Electrical,"SBrkr","Nominal"],
                   }


    #Change input feature type to string, especailly to below integer type
    pre_combined[cols_Overall] = pre_combined[cols_Overall].astype(str)
    pre_combined[cols_Subclass] = pre_combined[cols_Subclass].astype(str)


    #fix the raw data mistyping
    exter_map = {"Brk Cmn":"BrkComm",
                 "CmentBd":"CementBd",
                 "CemntBd":"CementBd",
                 "Wd Shng":"WdShing" }
    pre_combined[cols_exter]=pre_combined[cols_exter].replace(exter_map)

    for v in Dict_category.values():
        cols_cat = v[0]
        cat_order =v[1]
        cat_fillnavalue=v[2]
        pre_combined[cols_cat]=pre_combined[cols_cat].fillna(cat_fillnavalue)
        #if not isOrdinal:
        if v[3] =="Nominal":
            for col in cols_cat:
                pre_combined[col]=pre_combined[col].astype('category',ordered =True,categories=cat_order)
        elif v[3]=="Ordinal":
            for col in cols_cat:
                pre_combined[col]=pre_combined[col].astype('category',ordered =True,categories=cat_order).cat.codes
                pre_combined[col] = pre_combined[col].astype(np.number)


    #pre_combined[cols_Overall] = pre_combined[cols_Overall].fillna(pre_combined[cols_Overall].median())

    #Lotfrontage fill mssing value

    pre_combined["LotFrontage"] = pre_combined.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))


    #fill missing value to Garage related features
    #Assuming no garage for thos missing value
    pre_combined["GarageCars"] =pre_combined["GarageCars"].fillna(0).astype(int)
    pre_combined["GarageArea"] =pre_combined["GarageArea"].fillna(0).astype(int)

    #fill missing value to Basement related features
    pre_combined[["BsmtFinSF1","BsmtFinSF2","BsmtUnfSF"]]= pre_combined[["BsmtFinSF1","BsmtFinSF2","BsmtUnfSF"]].fillna(0)
    pre_combined["TotalBsmtSF"]= pre_combined["BsmtFinSF1"] + pre_combined["BsmtFinSF2"]+pre_combined["BsmtUnfSF"]

    cols_Bsmt_Bath = ["BsmtHalfBath","BsmtFullBath"]
    pre_combined[cols_Bsmt_Bath] =pre_combined[cols_Bsmt_Bath].fillna(0) #assuming mean

    pre_combined["MasVnrArea"] = pre_combined["MasVnrArea"].fillna(0) #filled per study

    #solve Year related feature missing value
    #cols_time = ["YearBuilt","YearRemodAdd","GarageYrBlt","MoSold","YrSold"]
    pre_combined["GarageYrBlt"] = pre_combined["GarageYrBlt"].fillna(pre_combined["YearBuilt"]) #use building year for garage even no garage.

    return pre_combined
def pipe_outliersdrop(pre_combined=pipe_fillna_ascat(),ratio =0.001):
    # note, it could done by statsmodel as well. it will explored in future
    ratio =1-ratio
    ntrain = pre_combined["SalePrice"].notnull().sum()

    Y_train = pre_combined["SalePrice"][:ntrain]

    num_cols = pre_combined.select_dtypes(include=[np.number]).columns
    out_df = pre_combined[0:ntrain][num_cols]

    top5 = np.abs(out_df.corrwith(Y_train)).sort_values(ascending=False)[:5]

    #eda_plot(df=pre_combined[:ntrain],cols=top5.index)
    limit = out_df["GrLivArea"].quantile(ratio)

    # limit use to remove the outliers
    dropindex = out_df[out_df["GrLivArea"]>limit].index
    dropped_pre_combined =pre_combined.drop(dropindex)
    #*****************************
    dropped_Y_train = Y_train.drop(dropindex)
    #*****************************
    print("\n\n*****Drop outlier based on ratio > {0:.3f} quantile :".format(ratio))
    #print("New shape of collected data",dropped_pre_combined.shape)
    return dropped_pre_combined
def cat_col_compress(col, threshold=0.005):
    #copy the code from stackoverflow
    # removes the bind
    dummy_col=col.copy()

    # what is the ratio of a dummy in whole column
    count = pd.value_counts(dummy_col) / len(dummy_col)

    # cond whether the ratios is higher than the threshold
    mask = dummy_col.isin(count[count > threshold].index)

    # replace the ones which ratio is lower than the threshold by a special name
    dummy_col[~mask] = "dum_others"

    return dummy_col

def pipe_log_getdummies(pre_combined = pipe_fillna_ascat(),skew_ratio=0.75,cat_ratio=0):
    from scipy.stats import skew
    skew_limit =skew_ratio #I got this limit from Kaggle directly. Someone use 1 , someone use 0.75. I just use 0.75 by random and has no detail study yet
    cat_threshold = cat_ratio


    num_cols = pre_combined.select_dtypes(include=[np.number]).columns
    cat_cols = pre_combined.select_dtypes(include=[np.object]).columns

    #log transform skewed numeric features:
    skewed_Series = np.abs(pre_combined[num_cols].skew()) #compute skewness
    skewed_cols = skewed_Series[skewed_Series > skew_limit].index.values

    pre_combined[skewed_cols] = np.log1p(pre_combined[skewed_cols])
    skewed_Series = abs(pre_combined.skew()) #compute skewness
    skewed_cols = skewed_Series[skewed_Series > skew_limit].index.tolist()

    for col in cat_cols:
        pre_combined[col]=cat_col_compress(pre_combined[col],threshold=cat_threshold) # threshold set to zero as it get high core for all estimatior  except ridge based

    pre_combined= pd.get_dummies(pre_combined,drop_first=True)

    return pre_combined
def pipe_drop_dummycols(pre_combined=pipe_log_getdummies()):
    cols = ["MSSubClass_160","MSZoning_C (all)"]
    pre_combined=pre_combined.drop(cols,axis=1)

    return pre_combined
def pipe_export(pre_output,name):

    if (pre_output is None) :
        print("None input! Expect pre_combined dataframe name as parameter")
        return
    elif pre_output.drop("SalePrice",axis=1).isnull().sum().sum()>0:
        print("Dataframe still missing value! pls check again")
        return
    elif type(name) is not str:
        print("Expect preparing option name to generate output file")
        print("The out file name will be [Preparing_Output_<name>_20171029.h5] ")
        return
    else:
        from datetime import datetime
        savetime=datetime.now().strftime("%m-%d-%H_%M")
        directory_name = "./prepare/"
        filename = directory_name + name +"_"+ savetime +".h5"
        local_ntrain = pre_output.SalePrice.notnull().sum()
        pre_train = pre_output[0:local_ntrain]
        pre_test =pre_output[local_ntrain:].drop("SalePrice",axis=1)

        pre_train.to_hdf(filename,"pre_train")
        pre_test.to_hdf(filename,"pre_test")
        #print("\n***Exported*** :{0}".format(filename))
        #print("\ttrain set size :\t",local_ntrain)
        #print("\tpre_train shape:\t", pre_train.shape)
        #print("\tpre_test  shape:\t", pre_test.shape)
        return pre_output
def pipe_r2test(df):
    import statsmodels.api as sm
    import warnings
    warnings.filterwarnings('ignore')
    print("****Testing*****")
    train_df = df

    ntrain = train_df["SalePrice"].notnull().sum()
    train = train_df[:ntrain]
    X_train = train.drop(["Id","SalePrice"],axis =1)
    Y_train = train["SalePrice"]

    from lightgbm import LGBMRegressor,LGBMClassifier

    from sklearn.model_selection import cross_val_score
    LGB =LGBMRegressor()
    nCV=3
    score = cross_val_score(LGB,X_train,Y_train,cv=nCV,scoring="r2")
    print("R2 Scoring by lightGBM = {0:.4f}".format(score.mean()))


    #print(pd.concat([X_train,Y_train],axis=1).head())
    result = sm.OLS(Y_train, X_train).fit()


    result_str= str(result.summary())
    results1 = result_str.split("\n")[:10]

    for result in results1:
        print(result)
    print('*'*20)
    return df
def pipe_extract( pre_combined = pipe_fillna_ascat()):

    #extract 3 age feature(building, Garage, and remodel)
    pre_combined["BldAge"] = pre_combined["YrSold"] - pre_combined["YearBuilt"]
    pre_combined["GarageAge"] = pre_combined["YrSold"] - pre_combined["GarageYrBlt"]
    pre_combined["RemodelAge"] = pre_combined["YrSold"] - pre_combined["YearRemodAdd"]
    SoldYM_df = pd.DataFrame({"year":pre_combined.YrSold,"month":pre_combined.MoSold.astype("int").astype("object"),"day":1})
    SoldYM_df = pd.to_datetime(SoldYM_df,format='%Y%m%d',unit="D")
    pre_combined["SoldYM"]=SoldYM_df.apply(lambda x: x.toordinal())

    #extract total space features
    # three options for calculating the total Square feet. (Garage & basement has very high sknew, remove them )
    #pre_combined["TotalSQF"] = pre_combined["GarageArea"] + pre_combined['TotalBsmtSF']  +pre_combined["GrLivArea"]+pre_combined['1stFlrSF'] + pre_combined['2ndFlrSF']
    #pre_combined["TotalSQF"] = pre_combined['TotalBsmtSF'] + pre_combined['1stFlrSF'] + pre_combined['2ndFlrSF']

    pre_combined["TotalSQF"] = pre_combined['1stFlrSF'] + pre_combined['2ndFlrSF'] +pre_combined["GrLivArea"]
    return pre_combined
def pipe_drop_dummycols(pre_combined=pipe_log_getdummies()):
    cols = ["MSSubClass_160","MSZoning_C (all)"]
    pre_combined=pre_combined.drop(cols,axis=1)

    return pre_combined
	def pipe_basic_fillna(df=combined):
	local_ntrain = ntrain
	pre_combined=df.copy()

	#print("The input train dimension:\t", pre_combined[0:ntrain].shape)
	#print("The input test dimension:\t", pre_combined[ntrain:].drop("SalePrice",axis=1).shape)

	num_cols = pre_combined.drop(["Id","SalePrice"],axis=1).select_dtypes(include=[np.number]).columns
	cat_cols = pre_combined.select_dtypes(include=[np.object]).columns


	pre_combined[num_cols]= pre_combined[num_cols].fillna(pre_combined[num_cols].median())
	# Median is my favoraite fillna mode, which can eliminate the skew impact.
	pre_combined[cat_cols]= pre_combined[cat_cols].fillna("NA")
	pre_combined= pd.concat([pre_combined[["Id","SalePrice"]],pre_combined[cat_cols],pre_combined[num_cols]],axis=1)
	return pre_combined
	def pipe_drop4cols(pre_combined=pipe_fillna_ascat()):
	cols_drop =['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
	# the 4 features(columns was identified earlier which missing data >50%)
	#pre_combined, ntrain = customize_fillna_extract_outliersdrop()
	#pre_combined = customize_fillna_extract_outliersdrop()
	pre_combined = pre_combined.drop(cols_drop,axis=1)

	return pre_combined
	def pipe_drop_cols(pre_combined = pipe_fillna_ascat()):
	pre_combied = pre_combined.drop(['Street', 'Utilities', 'Condition2', 'RoofMatl', 'Heating'], axis = 1)

	return pre_combined
	def pipe_fillna_ascat(df=combined):
	from datetime import datetime
	local_ntrain = train.shape[0]

	pre_combined=df.copy()


	#convert quality feature to category type
	#def feature group with same categories value
	cols_Subclass = ["MSSubClass"]
	cols_Zone = ["MSZoning"]
	cols_Overall =["OverallQual","OverallCond"]

	cols_Qual = ["BsmtCond","BsmtQual",\
	"ExterQual","ExterCond",\
	"FireplaceQu","GarageQual","GarageCond",\
	"HeatingQC","KitchenQual",
	"PoolQC"]
	cols_BsmtFinType = ["BsmtFinType1","BsmtFinType2"]
	cols_access = ["Alley","Street"]
	cols_condition = ["Condition1","Condition2"]
	cols_fence =["Fence"]
	cols_exposure = ["BsmtExposure"]
	cols_miscfeat = ["MiscFeature"]
	cols_exter = ["Exterior1st","Exterior2nd"]
	cols_MasVnr =["MasVnrType"]
	cols_GarageType = ["GarageType"]
	cols_GarageFinish =["GarageFinish"]
	cols_Functional = ["Functional"]
	cols_Util =["Utilities"]
	cols_SaleType = ["SaleType"]
	cols_Electrical = ["Electrical"]


	#define the map of categories valus group
	cat_Subclass = ["20",#1-STORY 1946 & NEWER ALL STYLES
	"30",#1-STORY 1945 & OLDER
	"40",#1-STORY W/FINISHED ATTIC ALL AGES
	"45",#1-1/2 STORY - UNFINISHED ALL AGES
	"50",#1-1/2 STORY FINISHED ALL AGES
	"60",#2-STORY 1946 & NEWER
	"70",#2-STORY 1945 & OLDER
	"75",#2-1/2 STORY ALL AGES
	"80",#SPLIT OR MULTI-LEVEL
	"85",#SPLIT FOYER
	"90",#DUPLEX - ALL STYLES AND AGES
	"120",#1-STORY PUD (Planned Unit Development) - 1946 & NEWER
	"150",#1-1/2 STORY PUD - ALL AGES
	"160",#2-STORY PUD - 1946 & NEWER
	"180",#PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
	"190",#2 FAMILY CONVERSION - ALL STYLES AND AGES
	]
	cat_Zone = ["A",#Agriculture
	"C (all)",#Commercial #the train/test value is different than the data_description file.
	"FV",#Floating Village Residential
	"I",#Industrial
	"RH",#Residential High Density
	"RL",#Residential Low Density
	"RP",#Residential Low Density Park
	"RM",#Residential Medium Density
	]
	cat_Overall = ["10","9","8","7","6","5","4","3","2","1"]
	cat_Qual = ["Ex","Gd","TA","Fa","Po","NA"]
	cat_BsmtFinType = ["GLQ","ALQ","BLQ","Rec","LwQ","Unf","NA"]
	cat_access = ["Grvl","Pave","NA"]
	cat_conditions= ["Artery","Feedr","Norm","RRNn","RRAn","PosN","PosA","RRNe","RRAe"]
	cat_fence = ["GdPrv",#Good Privacy
	"MnPrv",#Minimum Privacy
	"GdWo",#Good Wood
	"MnWw",#Minimum Wood/Wire
	"NA",#No Fence
	]
	cat_exposure = ["Gd", #Good Exposure
	"Av", #Average Exposure (split levels or foyers typically score average or above)
	"Mn", #Mimimum Exposure
	"No", #No Exposure
	"NA", #No Basement
	]
	cat_miscfeat = ["Elev",#Elevator
	"Gar2",#2nd Garage (if not described in garage section)
	"Othr",#Other
	"Shed",#Shed (over 100 SF)
	"TenC",#Tennis Court
	"NA",#None
	]
	cat_exter =["AsbShng",#Asbestos Shingles
	"AsphShn",#Asphalt Shingles
	"BrkComm",#Brick Common Brk Cmn BrkComm
	"BrkFace",#Brick Face
	"CBlock",#Cinder Block
	"CementBd",#Cement Board #CementBd was the data_description value
	"HdBoard",#Hard Board
	"ImStucc",#Imitation Stucco
	"MetalSd",#Metal Siding
	"Other",#Other
	"Plywood",#Plywood
	"PreCast",#PreCast,#
	"Stone",#Stone
	"Stucco",#Stucco
	"VinylSd",#Vinyl Siding
	"Wd Sdng",#Wood Siding
	"WdShing",#Wood Shingles #Wd Shng WdShing
	]
	cat_MasVnr =["BrkCmn",#Brick Common
	"BrkFace",#Brick Face
	"CBlock",#Cinder Block
	"None",#None
	"Stone",#Stone
	]
	cat_GarageType =["2Types",#More than one type of garage
	"Attchd",#Attached to home
	"Basment",#Basement Garage
	"BuiltIn",#Built-In (Garage part of house - typically has room above garage)
	"CarPort",#Car Port
	"Detchd",#Detached from home
	"NA",#No Garage
	]
	cat_GarageFinish =["Fin",#Finished
	"RFn",#Rough Finished,#
	"Unf",#Unfinished
	"NA",#No Garage
	]
	cat_Functional = ["Typ",#Typical Functionality
	"Min1",#Minor Deductions 1
	"Min2",#Minor Deductions 2
	"Mod",#Moderate Deductions
	"Maj1",#Major Deductions 1
	"Maj2",#Major Deductions 2
	"Sev",#Severely Damaged
	"Sal",#Salvage only
	]
	cat_Util =["AllPub",#All public Utilities (E,G,W,& S)
	"NoSewr",#Electricity, Gas, and Water (Septic Tank)
	"NoSeWa",#Electricity and Gas Only
	"ELO",#Electricity only,#
	]
	cat_SaleType =["WD",#Warranty Deed - Conventional
	"CWD",#Warranty Deed - Cash
	"VWD",#Warranty Deed - VA Loan
	"New",#Home just constructed and sold
	"COD",#Court Officer Deed/Estate
	"Con",#Contract 15% Down payment regular terms
	"ConLw",#Contract Low Down payment and low interest
	"ConLI",#Contract Low Interest
	"ConLD",#Contract Low Down
	"Oth",#Other
	]
	cat_Electrical = ["SBrkr",#Standard Circuit Breakers & Romex
	"FuseA",#Fuse Box over 60 AMP and all Romex wiring (Average),#
	"FuseF",#60 AMP Fuse Box and mostly Romex wiring (Fair)
	"FuseP",#60 AMP Fuse Box and mostly knob & tube wiring (poor)
	"Mix",#Mixed
	]


	###########################################################################
	#define the collection of group features &categories value by diction type
	Dict_category={"Qual":[cols_Qual,cat_Qual,"NA","Ordinal"],
	"Overall":[cols_Overall,cat_Overall,"5","Ordinal"], # It is integer already. no need overwork
	"BsmtFinType":[cols_BsmtFinType,cat_BsmtFinType,"NA","Ordinal"],
	"Access":[cols_access,cat_access,"NA","Ordinal"],
	"Fence":[cols_fence,cat_fence,"NA","Ordinal"],
	"Exposure":[cols_exposure,cat_exposure,"NA","v"],
	"GarageFinish":[cols_GarageFinish,cat_GarageFinish,"NA","Ordinal"],
	"Functional":[cols_Functional,cat_Functional,"Typ","Ordinal"], #fill na with lowest quality
	"Utility":[cols_Util,cat_Util,"ELO","Ordinal"], # fillNA with lowest quality

	"Subclass":[cols_Subclass,cat_Subclass,"NA","Nominal"],
	"Zone":[cols_Zone,cat_Zone,"RL","Nominal"], #RL is most popular zone value. "C(all) is the study result"
	"Cond":[cols_condition,cat_conditions,"Norm","Nominal"],
	"MiscFeature":[cols_miscfeat,cat_miscfeat,"NA","Nominal"],
	"Exter":[cols_exter,cat_exter,"Other","Nominal"],
	"MasVnr":[cols_MasVnr,cat_MasVnr,"None","Nominal"],
	"GarageType":[cols_GarageType,cat_GarageType,"NA","Nominal"],
	"SaleType":[cols_SaleType, cat_SaleType,"WD","Nominal"],
	"Electrical":[cols_Electrical,cat_Electrical,"SBrkr","Nominal"],
	}


	#Change input feature type to string, especailly to below integer type
	pre_combined[cols_Overall] = pre_combined[cols_Overall].astype(str)
	pre_combined[cols_Subclass] = pre_combined[cols_Subclass].astype(str)


	#fix the raw data mistyping
	exter_map = {"Brk Cmn":"BrkComm",
	"CmentBd":"CementBd",
	"CemntBd":"CementBd",
	"Wd Shng":"WdShing" }
	pre_combined[cols_exter]=pre_combined[cols_exter].replace(exter_map)

	for v in Dict_category.values():
	cols_cat = v[0]
	cat_order =v[1]
	cat_fillnavalue=v[2]
	pre_combined[cols_cat]=pre_combined[cols_cat].fillna(cat_fillnavalue)
	#if not isOrdinal:
	if v[3] =="Nominal":
	for col in cols_cat:
	pre_combined[col]=pre_combined[col].astype('category',ordered =True,categories=cat_order)
	elif v[3]=="Ordinal":
	for col in cols_cat:
	pre_combined[col]=pre_combined[col].astype('category',ordered =True,categories=cat_order).cat.codes
	pre_combined[col] = pre_combined[col].astype(np.number)



	#pre_combined[cols_Overall] = pre_combined[cols_Overall].fillna(pre_combined[cols_Overall].median())

	#Lotfrontage fill mssing value

	pre_combined["LotFrontage"] = pre_combined.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))


	#fill missing value to Garage related features
	#Assuming no garage for thos missing value
	pre_combined["GarageCars"] =pre_combined["GarageCars"].fillna(0).astype(int)
	pre_combined["GarageArea"] =pre_combined["GarageArea"].fillna(0).astype(int)

	#fill missing value to Basement related features
	pre_combined[["BsmtFinSF1","BsmtFinSF2","BsmtUnfSF"]]= pre_combined[["BsmtFinSF1","BsmtFinSF2","BsmtUnfSF"]].fillna(0)
	pre_combined["TotalBsmtSF"]= pre_combined["BsmtFinSF1"] + pre_combined["BsmtFinSF2"]+pre_combined["BsmtUnfSF"]

	cols_Bsmt_Bath = ["BsmtHalfBath","BsmtFullBath"]
	pre_combined[cols_Bsmt_Bath] =pre_combined[cols_Bsmt_Bath].fillna(0) #assuming mean

	pre_combined["MasVnrArea"] = pre_combined["MasVnrArea"].fillna(0) #filled per study

	#solve Year related feature missing value
	#cols_time = ["YearBuilt","YearRemodAdd","GarageYrBlt","MoSold","YrSold"]
	pre_combined["GarageYrBlt"] = pre_combined["GarageYrBlt"].fillna(pre_combined["YearBuilt"]) #use building year for garage even no garage.

	return pre_combined
	def pipe_outliersdrop(pre_combined=pipe_fillna_ascat(),ratio =0.001):
	# note, it could done by statsmodel as well. it will explored in future
	ratio =1-ratio
	ntrain = pre_combined["SalePrice"].notnull().sum()

	Y_train = pre_combined["SalePrice"][:ntrain]

	num_cols = pre_combined.select_dtypes(include=[np.number]).columns
	out_df = pre_combined[0:ntrain][num_cols]

	top5 = np.abs(out_df.corrwith(Y_train)).sort_values(ascending=False)[:5]

	#eda_plot(df=pre_combined[:ntrain],cols=top5.index)
	limit = out_df["GrLivArea"].quantile(ratio)

	# limit use to remove the outliers
	dropindex = out_df[out_df["GrLivArea"]>limit].index
	dropped_pre_combined =pre_combined.drop(dropindex)
	#*****************************
	dropped_Y_train = Y_train.drop(dropindex)
	#*****************************
	print("\n\n*****Drop outlier based on ratio > {0:.3f} quantile :".format(ratio))
	#print("New shape of collected data",dropped_pre_combined.shape)
	return dropped_pre_combined
	def cat_col_compress(col, threshold=0.005):
	#copy the code from stackoverflow
	# removes the bind
	dummy_col=col.copy()

	# what is the ratio of a dummy in whole column
	count = pd.value_counts(dummy_col) / len(dummy_col)

	# cond whether the ratios is higher than the threshold
	mask = dummy_col.isin(count[count > threshold].index)

	# replace the ones which ratio is lower than the threshold by a special name
	dummy_col[~mask] = "dum_others"

	return dummy_col

	def pipe_log_getdummies(pre_combined = pipe_fillna_ascat(),skew_ratio=0.75,cat_ratio=0):
	from scipy.stats import skew
	skew_limit =skew_ratio #I got this limit from Kaggle directly. Someone use 1 , someone use 0.75. I just use 0.75 by random and has no detail study yet
	cat_threshold = cat_ratio


	num_cols = pre_combined.select_dtypes(include=[np.number]).columns
	cat_cols = pre_combined.select_dtypes(include=[np.object]).columns

	#log transform skewed numeric features:
	skewed_Series = np.abs(pre_combined[num_cols].skew()) #compute skewness
	skewed_cols = skewed_Series[skewed_Series > skew_limit].index.values

	pre_combined[skewed_cols] = np.log1p(pre_combined[skewed_cols])
	skewed_Series = abs(pre_combined.skew()) #compute skewness
	skewed_cols = skewed_Series[skewed_Series > skew_limit].index.tolist()

	for col in cat_cols:
	pre_combined[col]=cat_col_compress(pre_combined[col],threshold=cat_threshold) # threshold set to zero as it get high core for all estimatior except ridge based

	pre_combined= pd.get_dummies(pre_combined,drop_first=True)

	return pre_combined
	def pipe_drop_dummycols(pre_combined=pipe_log_getdummies()):
	cols = ["MSSubClass_160","MSZoning_C (all)"]
	pre_combined=pre_combined.drop(cols,axis=1)

	return pre_combined
	def pipe_export(pre_output,name):

	if (pre_output is None) :
	print("None input! Expect pre_combined dataframe name as parameter")
	return
	elif pre_output.drop("SalePrice",axis=1).isnull().sum().sum()>0:
	print("Dataframe still missing value! pls check again")
	return
	elif type(name) is not str:
	print("Expect preparing option name to generate output file")
	print("The out file name will be [Preparing_Output_<name>_20171029.h5] ")
	return
	else:
	from datetime import datetime
	savetime=datetime.now().strftime("%m-%d-%H_%M")
	directory_name = "./prepare/"
	filename = directory_name + name +"_"+ savetime +".h5"
	local_ntrain = pre_output.SalePrice.notnull().sum()
	pre_train = pre_output[0:local_ntrain]
	pre_test =pre_output[local_ntrain:].drop("SalePrice",axis=1)

	pre_train.to_hdf(filename,"pre_train")
	pre_test.to_hdf(filename,"pre_test")
	#print("\n*Exported* :{0}".format(filename))
	#print("\ttrain set size :\t",local_ntrain)
	#print("\tpre_train shape:\t", pre_train.shape)
	#print("\tpre_test shape:\t", pre_test.shape)
	return pre_output
	def pipe_r2test(df):
	import statsmodels.api as sm
	import warnings
	warnings.filterwarnings('ignore')
	print("**Testing***")
	train_df = df

	ntrain = train_df["SalePrice"].notnull().sum()
	train = train_df[:ntrain]
	X_train = train.drop(["Id","SalePrice"],axis =1)
	Y_train = train["SalePrice"]

	from lightgbm import LGBMRegressor,LGBMClassifier

	from sklearn.model_selection import cross_val_score
	LGB =LGBMRegressor()
	nCV=3
	score = cross_val_score(LGB,X_train,Y_train,cv=nCV,scoring="r2")
	print("R2 Scoring by lightGBM = {0:.4f}".format(score.mean()))


	#print(pd.concat([X_train,Y_train],axis=1).head())
	result = sm.OLS(Y_train, X_train).fit()


	result_str= str(result.summary())
	results1 = result_str.split("\n")[:10]

	for result in results1:
	print(result)
	print(''20)
	return df
	def pipe_extract( pre_combined = pipe_fillna_ascat()):

	#extract 3 age feature(building, Garage, and remodel)
	pre_combined["BldAge"] = pre_combined["YrSold"] - pre_combined["YearBuilt"]
	pre_combined["GarageAge"] = pre_combined["YrSold"] - pre_combined["GarageYrBlt"]
	pre_combined["RemodelAge"] = pre_combined["YrSold"] - pre_combined["YearRemodAdd"]
	SoldYM_df = pd.DataFrame({"year":pre_combined.YrSold,"month":pre_combined.MoSold.astype("int").astype("object"),"day":1})
	SoldYM_df = pd.to_datetime(SoldYM_df,format='%Y%m%d',unit="D")
	pre_combined["SoldYM"]=SoldYM_df.apply(lambda x: x.toordinal())

	#extract total space features
	# three options for calculating the total Square feet. (Garage & basement has very high sknew, remove them )
	#pre_combined["TotalSQF"] = pre_combined["GarageArea"] + pre_combined['TotalBsmtSF'] +pre_combined["GrLivArea"]+pre_combined['1stFlrSF'] + pre_combined['2ndFlrSF']
	#pre_combined["TotalSQF"] = pre_combined['TotalBsmtSF'] + pre_combined['1stFlrSF'] + pre_combined['2ndFlrSF']

	pre_combined["TotalSQF"] = pre_combined['1stFlrSF'] + pre_combined['2ndFlrSF'] +pre_combined["GrLivArea"]
	return pre_combined
	def pipe_drop_dummycols(pre_combined=pipe_log_getdummies()):
	cols = ["MSSubClass_160","MSZoning_C (all)"]
	pre_combined=pre_combined.drop(cols,axis=1)

	return pre_combined