Created
January 17, 2018 06:05
-
-
Save frankchen0130/5950eaa4d98ea4f93deed707b027b517 to your computer and use it in GitHub Desktop.
pandas pipe examples
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def pipe_basic_fillna(df=combined): | |
local_ntrain = ntrain | |
pre_combined=df.copy() | |
#print("The input train dimension:\t", pre_combined[0:ntrain].shape) | |
#print("The input test dimension:\t", pre_combined[ntrain:].drop("SalePrice",axis=1).shape) | |
num_cols = pre_combined.drop(["Id","SalePrice"],axis=1).select_dtypes(include=[np.number]).columns | |
cat_cols = pre_combined.select_dtypes(include=[np.object]).columns | |
pre_combined[num_cols]= pre_combined[num_cols].fillna(pre_combined[num_cols].median()) | |
# Median is my favoraite fillna mode, which can eliminate the skew impact. | |
pre_combined[cat_cols]= pre_combined[cat_cols].fillna("NA") | |
pre_combined= pd.concat([pre_combined[["Id","SalePrice"]],pre_combined[cat_cols],pre_combined[num_cols]],axis=1) | |
return pre_combined | |
def pipe_drop4cols(pre_combined=pipe_fillna_ascat()): | |
cols_drop =['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'] | |
# the 4 features(columns was identified earlier which missing data >50%) | |
#pre_combined, ntrain = customize_fillna_extract_outliersdrop() | |
#pre_combined = customize_fillna_extract_outliersdrop() | |
pre_combined = pre_combined.drop(cols_drop,axis=1) | |
return pre_combined | |
def pipe_drop_cols(pre_combined = pipe_fillna_ascat()): | |
pre_combied = pre_combined.drop(['Street', 'Utilities', 'Condition2', 'RoofMatl', 'Heating'], axis = 1) | |
return pre_combined | |
def pipe_fillna_ascat(df=combined): | |
from datetime import datetime | |
local_ntrain = train.shape[0] | |
pre_combined=df.copy() | |
#convert quality feature to category type | |
#def feature group with same categories value | |
cols_Subclass = ["MSSubClass"] | |
cols_Zone = ["MSZoning"] | |
cols_Overall =["OverallQual","OverallCond"] | |
cols_Qual = ["BsmtCond","BsmtQual",\ | |
"ExterQual","ExterCond",\ | |
"FireplaceQu","GarageQual","GarageCond",\ | |
"HeatingQC","KitchenQual", | |
"PoolQC"] | |
cols_BsmtFinType = ["BsmtFinType1","BsmtFinType2"] | |
cols_access = ["Alley","Street"] | |
cols_condition = ["Condition1","Condition2"] | |
cols_fence =["Fence"] | |
cols_exposure = ["BsmtExposure"] | |
cols_miscfeat = ["MiscFeature"] | |
cols_exter = ["Exterior1st","Exterior2nd"] | |
cols_MasVnr =["MasVnrType"] | |
cols_GarageType = ["GarageType"] | |
cols_GarageFinish =["GarageFinish"] | |
cols_Functional = ["Functional"] | |
cols_Util =["Utilities"] | |
cols_SaleType = ["SaleType"] | |
cols_Electrical = ["Electrical"] | |
#define the map of categories valus group | |
cat_Subclass = ["20",#1-STORY 1946 & NEWER ALL STYLES | |
"30",#1-STORY 1945 & OLDER | |
"40",#1-STORY W/FINISHED ATTIC ALL AGES | |
"45",#1-1/2 STORY - UNFINISHED ALL AGES | |
"50",#1-1/2 STORY FINISHED ALL AGES | |
"60",#2-STORY 1946 & NEWER | |
"70",#2-STORY 1945 & OLDER | |
"75",#2-1/2 STORY ALL AGES | |
"80",#SPLIT OR MULTI-LEVEL | |
"85",#SPLIT FOYER | |
"90",#DUPLEX - ALL STYLES AND AGES | |
"120",#1-STORY PUD (Planned Unit Development) - 1946 & NEWER | |
"150",#1-1/2 STORY PUD - ALL AGES | |
"160",#2-STORY PUD - 1946 & NEWER | |
"180",#PUD - MULTILEVEL - INCL SPLIT LEV/FOYER | |
"190",#2 FAMILY CONVERSION - ALL STYLES AND AGES | |
] | |
cat_Zone = ["A",#Agriculture | |
"C (all)",#Commercial #the train/test value is different than the data_description file. | |
"FV",#Floating Village Residential | |
"I",#Industrial | |
"RH",#Residential High Density | |
"RL",#Residential Low Density | |
"RP",#Residential Low Density Park | |
"RM",#Residential Medium Density | |
] | |
cat_Overall = ["10","9","8","7","6","5","4","3","2","1"] | |
cat_Qual = ["Ex","Gd","TA","Fa","Po","NA"] | |
cat_BsmtFinType = ["GLQ","ALQ","BLQ","Rec","LwQ","Unf","NA"] | |
cat_access = ["Grvl","Pave","NA"] | |
cat_conditions= ["Artery","Feedr","Norm","RRNn","RRAn","PosN","PosA","RRNe","RRAe"] | |
cat_fence = ["GdPrv",#Good Privacy | |
"MnPrv",#Minimum Privacy | |
"GdWo",#Good Wood | |
"MnWw",#Minimum Wood/Wire | |
"NA",#No Fence | |
] | |
cat_exposure = ["Gd", #Good Exposure | |
"Av", #Average Exposure (split levels or foyers typically score average or above) | |
"Mn", #Mimimum Exposure | |
"No", #No Exposure | |
"NA", #No Basement | |
] | |
cat_miscfeat = ["Elev",#Elevator | |
"Gar2",#2nd Garage (if not described in garage section) | |
"Othr",#Other | |
"Shed",#Shed (over 100 SF) | |
"TenC",#Tennis Court | |
"NA",#None | |
] | |
cat_exter =["AsbShng",#Asbestos Shingles | |
"AsphShn",#Asphalt Shingles | |
"BrkComm",#Brick Common Brk Cmn BrkComm | |
"BrkFace",#Brick Face | |
"CBlock",#Cinder Block | |
"CementBd",#Cement Board #CementBd was the data_description value | |
"HdBoard",#Hard Board | |
"ImStucc",#Imitation Stucco | |
"MetalSd",#Metal Siding | |
"Other",#Other | |
"Plywood",#Plywood | |
"PreCast",#PreCast,# | |
"Stone",#Stone | |
"Stucco",#Stucco | |
"VinylSd",#Vinyl Siding | |
"Wd Sdng",#Wood Siding | |
"WdShing",#Wood Shingles #Wd Shng WdShing | |
] | |
cat_MasVnr =["BrkCmn",#Brick Common | |
"BrkFace",#Brick Face | |
"CBlock",#Cinder Block | |
"None",#None | |
"Stone",#Stone | |
] | |
cat_GarageType =["2Types",#More than one type of garage | |
"Attchd",#Attached to home | |
"Basment",#Basement Garage | |
"BuiltIn",#Built-In (Garage part of house - typically has room above garage) | |
"CarPort",#Car Port | |
"Detchd",#Detached from home | |
"NA",#No Garage | |
] | |
cat_GarageFinish =["Fin",#Finished | |
"RFn",#Rough Finished,# | |
"Unf",#Unfinished | |
"NA",#No Garage | |
] | |
cat_Functional = ["Typ",#Typical Functionality | |
"Min1",#Minor Deductions 1 | |
"Min2",#Minor Deductions 2 | |
"Mod",#Moderate Deductions | |
"Maj1",#Major Deductions 1 | |
"Maj2",#Major Deductions 2 | |
"Sev",#Severely Damaged | |
"Sal",#Salvage only | |
] | |
cat_Util =["AllPub",#All public Utilities (E,G,W,& S) | |
"NoSewr",#Electricity, Gas, and Water (Septic Tank) | |
"NoSeWa",#Electricity and Gas Only | |
"ELO",#Electricity only,# | |
] | |
cat_SaleType =["WD",#Warranty Deed - Conventional | |
"CWD",#Warranty Deed - Cash | |
"VWD",#Warranty Deed - VA Loan | |
"New",#Home just constructed and sold | |
"COD",#Court Officer Deed/Estate | |
"Con",#Contract 15% Down payment regular terms | |
"ConLw",#Contract Low Down payment and low interest | |
"ConLI",#Contract Low Interest | |
"ConLD",#Contract Low Down | |
"Oth",#Other | |
] | |
cat_Electrical = ["SBrkr",#Standard Circuit Breakers & Romex | |
"FuseA",#Fuse Box over 60 AMP and all Romex wiring (Average),# | |
"FuseF",#60 AMP Fuse Box and mostly Romex wiring (Fair) | |
"FuseP",#60 AMP Fuse Box and mostly knob & tube wiring (poor) | |
"Mix",#Mixed | |
] | |
########################################################################### | |
#define the collection of group features &categories value by diction type | |
Dict_category={"Qual":[cols_Qual,cat_Qual,"NA","Ordinal"], | |
"Overall":[cols_Overall,cat_Overall,"5","Ordinal"], # It is integer already. no need overwork | |
"BsmtFinType":[cols_BsmtFinType,cat_BsmtFinType,"NA","Ordinal"], | |
"Access":[cols_access,cat_access,"NA","Ordinal"], | |
"Fence":[cols_fence,cat_fence,"NA","Ordinal"], | |
"Exposure":[cols_exposure,cat_exposure,"NA","v"], | |
"GarageFinish":[cols_GarageFinish,cat_GarageFinish,"NA","Ordinal"], | |
"Functional":[cols_Functional,cat_Functional,"Typ","Ordinal"], #fill na with lowest quality | |
"Utility":[cols_Util,cat_Util,"ELO","Ordinal"], # fillNA with lowest quality | |
"Subclass":[cols_Subclass,cat_Subclass,"NA","Nominal"], | |
"Zone":[cols_Zone,cat_Zone,"RL","Nominal"], #RL is most popular zone value. "C(all) is the study result" | |
"Cond":[cols_condition,cat_conditions,"Norm","Nominal"], | |
"MiscFeature":[cols_miscfeat,cat_miscfeat,"NA","Nominal"], | |
"Exter":[cols_exter,cat_exter,"Other","Nominal"], | |
"MasVnr":[cols_MasVnr,cat_MasVnr,"None","Nominal"], | |
"GarageType":[cols_GarageType,cat_GarageType,"NA","Nominal"], | |
"SaleType":[cols_SaleType, cat_SaleType,"WD","Nominal"], | |
"Electrical":[cols_Electrical,cat_Electrical,"SBrkr","Nominal"], | |
} | |
#Change input feature type to string, especailly to below integer type | |
pre_combined[cols_Overall] = pre_combined[cols_Overall].astype(str) | |
pre_combined[cols_Subclass] = pre_combined[cols_Subclass].astype(str) | |
#fix the raw data mistyping | |
exter_map = {"Brk Cmn":"BrkComm", | |
"CmentBd":"CementBd", | |
"CemntBd":"CementBd", | |
"Wd Shng":"WdShing" } | |
pre_combined[cols_exter]=pre_combined[cols_exter].replace(exter_map) | |
for v in Dict_category.values(): | |
cols_cat = v[0] | |
cat_order =v[1] | |
cat_fillnavalue=v[2] | |
pre_combined[cols_cat]=pre_combined[cols_cat].fillna(cat_fillnavalue) | |
#if not isOrdinal: | |
if v[3] =="Nominal": | |
for col in cols_cat: | |
pre_combined[col]=pre_combined[col].astype('category',ordered =True,categories=cat_order) | |
elif v[3]=="Ordinal": | |
for col in cols_cat: | |
pre_combined[col]=pre_combined[col].astype('category',ordered =True,categories=cat_order).cat.codes | |
pre_combined[col] = pre_combined[col].astype(np.number) | |
#pre_combined[cols_Overall] = pre_combined[cols_Overall].fillna(pre_combined[cols_Overall].median()) | |
#Lotfrontage fill mssing value | |
pre_combined["LotFrontage"] = pre_combined.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median())) | |
#fill missing value to Garage related features | |
#Assuming no garage for thos missing value | |
pre_combined["GarageCars"] =pre_combined["GarageCars"].fillna(0).astype(int) | |
pre_combined["GarageArea"] =pre_combined["GarageArea"].fillna(0).astype(int) | |
#fill missing value to Basement related features | |
pre_combined[["BsmtFinSF1","BsmtFinSF2","BsmtUnfSF"]]= pre_combined[["BsmtFinSF1","BsmtFinSF2","BsmtUnfSF"]].fillna(0) | |
pre_combined["TotalBsmtSF"]= pre_combined["BsmtFinSF1"] + pre_combined["BsmtFinSF2"]+pre_combined["BsmtUnfSF"] | |
cols_Bsmt_Bath = ["BsmtHalfBath","BsmtFullBath"] | |
pre_combined[cols_Bsmt_Bath] =pre_combined[cols_Bsmt_Bath].fillna(0) #assuming mean | |
pre_combined["MasVnrArea"] = pre_combined["MasVnrArea"].fillna(0) #filled per study | |
#solve Year related feature missing value | |
#cols_time = ["YearBuilt","YearRemodAdd","GarageYrBlt","MoSold","YrSold"] | |
pre_combined["GarageYrBlt"] = pre_combined["GarageYrBlt"].fillna(pre_combined["YearBuilt"]) #use building year for garage even no garage. | |
return pre_combined | |
def pipe_outliersdrop(pre_combined=pipe_fillna_ascat(),ratio =0.001): | |
# note, it could done by statsmodel as well. it will explored in future | |
ratio =1-ratio | |
ntrain = pre_combined["SalePrice"].notnull().sum() | |
Y_train = pre_combined["SalePrice"][:ntrain] | |
num_cols = pre_combined.select_dtypes(include=[np.number]).columns | |
out_df = pre_combined[0:ntrain][num_cols] | |
top5 = np.abs(out_df.corrwith(Y_train)).sort_values(ascending=False)[:5] | |
#eda_plot(df=pre_combined[:ntrain],cols=top5.index) | |
limit = out_df["GrLivArea"].quantile(ratio) | |
# limit use to remove the outliers | |
dropindex = out_df[out_df["GrLivArea"]>limit].index | |
dropped_pre_combined =pre_combined.drop(dropindex) | |
#***************************** | |
dropped_Y_train = Y_train.drop(dropindex) | |
#***************************** | |
print("\n\n*****Drop outlier based on ratio > {0:.3f} quantile :".format(ratio)) | |
#print("New shape of collected data",dropped_pre_combined.shape) | |
return dropped_pre_combined | |
def cat_col_compress(col, threshold=0.005): | |
#copy the code from stackoverflow | |
# removes the bind | |
dummy_col=col.copy() | |
# what is the ratio of a dummy in whole column | |
count = pd.value_counts(dummy_col) / len(dummy_col) | |
# cond whether the ratios is higher than the threshold | |
mask = dummy_col.isin(count[count > threshold].index) | |
# replace the ones which ratio is lower than the threshold by a special name | |
dummy_col[~mask] = "dum_others" | |
return dummy_col | |
def pipe_log_getdummies(pre_combined = pipe_fillna_ascat(),skew_ratio=0.75,cat_ratio=0): | |
from scipy.stats import skew | |
skew_limit =skew_ratio #I got this limit from Kaggle directly. Someone use 1 , someone use 0.75. I just use 0.75 by random and has no detail study yet | |
cat_threshold = cat_ratio | |
num_cols = pre_combined.select_dtypes(include=[np.number]).columns | |
cat_cols = pre_combined.select_dtypes(include=[np.object]).columns | |
#log transform skewed numeric features: | |
skewed_Series = np.abs(pre_combined[num_cols].skew()) #compute skewness | |
skewed_cols = skewed_Series[skewed_Series > skew_limit].index.values | |
pre_combined[skewed_cols] = np.log1p(pre_combined[skewed_cols]) | |
skewed_Series = abs(pre_combined.skew()) #compute skewness | |
skewed_cols = skewed_Series[skewed_Series > skew_limit].index.tolist() | |
for col in cat_cols: | |
pre_combined[col]=cat_col_compress(pre_combined[col],threshold=cat_threshold) # threshold set to zero as it get high core for all estimatior except ridge based | |
pre_combined= pd.get_dummies(pre_combined,drop_first=True) | |
return pre_combined | |
def pipe_drop_dummycols(pre_combined=pipe_log_getdummies()): | |
cols = ["MSSubClass_160","MSZoning_C (all)"] | |
pre_combined=pre_combined.drop(cols,axis=1) | |
return pre_combined | |
def pipe_export(pre_output,name): | |
if (pre_output is None) : | |
print("None input! Expect pre_combined dataframe name as parameter") | |
return | |
elif pre_output.drop("SalePrice",axis=1).isnull().sum().sum()>0: | |
print("Dataframe still missing value! pls check again") | |
return | |
elif type(name) is not str: | |
print("Expect preparing option name to generate output file") | |
print("The out file name will be [Preparing_Output_<name>_20171029.h5] ") | |
return | |
else: | |
from datetime import datetime | |
savetime=datetime.now().strftime("%m-%d-%H_%M") | |
directory_name = "./prepare/" | |
filename = directory_name + name +"_"+ savetime +".h5" | |
local_ntrain = pre_output.SalePrice.notnull().sum() | |
pre_train = pre_output[0:local_ntrain] | |
pre_test =pre_output[local_ntrain:].drop("SalePrice",axis=1) | |
pre_train.to_hdf(filename,"pre_train") | |
pre_test.to_hdf(filename,"pre_test") | |
#print("\n***Exported*** :{0}".format(filename)) | |
#print("\ttrain set size :\t",local_ntrain) | |
#print("\tpre_train shape:\t", pre_train.shape) | |
#print("\tpre_test shape:\t", pre_test.shape) | |
return pre_output | |
def pipe_r2test(df): | |
import statsmodels.api as sm | |
import warnings | |
warnings.filterwarnings('ignore') | |
print("****Testing*****") | |
train_df = df | |
ntrain = train_df["SalePrice"].notnull().sum() | |
train = train_df[:ntrain] | |
X_train = train.drop(["Id","SalePrice"],axis =1) | |
Y_train = train["SalePrice"] | |
from lightgbm import LGBMRegressor,LGBMClassifier | |
from sklearn.model_selection import cross_val_score | |
LGB =LGBMRegressor() | |
nCV=3 | |
score = cross_val_score(LGB,X_train,Y_train,cv=nCV,scoring="r2") | |
print("R2 Scoring by lightGBM = {0:.4f}".format(score.mean())) | |
#print(pd.concat([X_train,Y_train],axis=1).head()) | |
result = sm.OLS(Y_train, X_train).fit() | |
result_str= str(result.summary()) | |
results1 = result_str.split("\n")[:10] | |
for result in results1: | |
print(result) | |
print('*'*20) | |
return df | |
def pipe_extract( pre_combined = pipe_fillna_ascat()): | |
#extract 3 age feature(building, Garage, and remodel) | |
pre_combined["BldAge"] = pre_combined["YrSold"] - pre_combined["YearBuilt"] | |
pre_combined["GarageAge"] = pre_combined["YrSold"] - pre_combined["GarageYrBlt"] | |
pre_combined["RemodelAge"] = pre_combined["YrSold"] - pre_combined["YearRemodAdd"] | |
SoldYM_df = pd.DataFrame({"year":pre_combined.YrSold,"month":pre_combined.MoSold.astype("int").astype("object"),"day":1}) | |
SoldYM_df = pd.to_datetime(SoldYM_df,format='%Y%m%d',unit="D") | |
pre_combined["SoldYM"]=SoldYM_df.apply(lambda x: x.toordinal()) | |
#extract total space features | |
# three options for calculating the total Square feet. (Garage & basement has very high sknew, remove them ) | |
#pre_combined["TotalSQF"] = pre_combined["GarageArea"] + pre_combined['TotalBsmtSF'] +pre_combined["GrLivArea"]+pre_combined['1stFlrSF'] + pre_combined['2ndFlrSF'] | |
#pre_combined["TotalSQF"] = pre_combined['TotalBsmtSF'] + pre_combined['1stFlrSF'] + pre_combined['2ndFlrSF'] | |
pre_combined["TotalSQF"] = pre_combined['1stFlrSF'] + pre_combined['2ndFlrSF'] +pre_combined["GrLivArea"] | |
return pre_combined | |
def pipe_drop_dummycols(pre_combined=pipe_log_getdummies()): | |
cols = ["MSSubClass_160","MSZoning_C (all)"] | |
pre_combined=pre_combined.drop(cols,axis=1) | |
return pre_combined |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment