Skip to content

Instantly share code, notes, and snippets.

@frankchen0130
Created January 17, 2018 06:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save frankchen0130/5950eaa4d98ea4f93deed707b027b517 to your computer and use it in GitHub Desktop.
Save frankchen0130/5950eaa4d98ea4f93deed707b027b517 to your computer and use it in GitHub Desktop.
pandas pipe examples
def pipe_basic_fillna(df=combined):
local_ntrain = ntrain
pre_combined=df.copy()
#print("The input train dimension:\t", pre_combined[0:ntrain].shape)
#print("The input test dimension:\t", pre_combined[ntrain:].drop("SalePrice",axis=1).shape)
num_cols = pre_combined.drop(["Id","SalePrice"],axis=1).select_dtypes(include=[np.number]).columns
cat_cols = pre_combined.select_dtypes(include=[np.object]).columns
pre_combined[num_cols]= pre_combined[num_cols].fillna(pre_combined[num_cols].median())
# Median is my favoraite fillna mode, which can eliminate the skew impact.
pre_combined[cat_cols]= pre_combined[cat_cols].fillna("NA")
pre_combined= pd.concat([pre_combined[["Id","SalePrice"]],pre_combined[cat_cols],pre_combined[num_cols]],axis=1)
return pre_combined
def pipe_drop4cols(pre_combined=pipe_fillna_ascat()):
cols_drop =['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
# the 4 features(columns was identified earlier which missing data >50%)
#pre_combined, ntrain = customize_fillna_extract_outliersdrop()
#pre_combined = customize_fillna_extract_outliersdrop()
pre_combined = pre_combined.drop(cols_drop,axis=1)
return pre_combined
def pipe_drop_cols(pre_combined = pipe_fillna_ascat()):
pre_combied = pre_combined.drop(['Street', 'Utilities', 'Condition2', 'RoofMatl', 'Heating'], axis = 1)
return pre_combined
def pipe_fillna_ascat(df=combined):
from datetime import datetime
local_ntrain = train.shape[0]
pre_combined=df.copy()
#convert quality feature to category type
#def feature group with same categories value
cols_Subclass = ["MSSubClass"]
cols_Zone = ["MSZoning"]
cols_Overall =["OverallQual","OverallCond"]
cols_Qual = ["BsmtCond","BsmtQual",\
"ExterQual","ExterCond",\
"FireplaceQu","GarageQual","GarageCond",\
"HeatingQC","KitchenQual",
"PoolQC"]
cols_BsmtFinType = ["BsmtFinType1","BsmtFinType2"]
cols_access = ["Alley","Street"]
cols_condition = ["Condition1","Condition2"]
cols_fence =["Fence"]
cols_exposure = ["BsmtExposure"]
cols_miscfeat = ["MiscFeature"]
cols_exter = ["Exterior1st","Exterior2nd"]
cols_MasVnr =["MasVnrType"]
cols_GarageType = ["GarageType"]
cols_GarageFinish =["GarageFinish"]
cols_Functional = ["Functional"]
cols_Util =["Utilities"]
cols_SaleType = ["SaleType"]
cols_Electrical = ["Electrical"]
#define the map of categories valus group
cat_Subclass = ["20",#1-STORY 1946 & NEWER ALL STYLES
"30",#1-STORY 1945 & OLDER
"40",#1-STORY W/FINISHED ATTIC ALL AGES
"45",#1-1/2 STORY - UNFINISHED ALL AGES
"50",#1-1/2 STORY FINISHED ALL AGES
"60",#2-STORY 1946 & NEWER
"70",#2-STORY 1945 & OLDER
"75",#2-1/2 STORY ALL AGES
"80",#SPLIT OR MULTI-LEVEL
"85",#SPLIT FOYER
"90",#DUPLEX - ALL STYLES AND AGES
"120",#1-STORY PUD (Planned Unit Development) - 1946 & NEWER
"150",#1-1/2 STORY PUD - ALL AGES
"160",#2-STORY PUD - 1946 & NEWER
"180",#PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
"190",#2 FAMILY CONVERSION - ALL STYLES AND AGES
]
cat_Zone = ["A",#Agriculture
"C (all)",#Commercial #the train/test value is different than the data_description file.
"FV",#Floating Village Residential
"I",#Industrial
"RH",#Residential High Density
"RL",#Residential Low Density
"RP",#Residential Low Density Park
"RM",#Residential Medium Density
]
cat_Overall = ["10","9","8","7","6","5","4","3","2","1"]
cat_Qual = ["Ex","Gd","TA","Fa","Po","NA"]
cat_BsmtFinType = ["GLQ","ALQ","BLQ","Rec","LwQ","Unf","NA"]
cat_access = ["Grvl","Pave","NA"]
cat_conditions= ["Artery","Feedr","Norm","RRNn","RRAn","PosN","PosA","RRNe","RRAe"]
cat_fence = ["GdPrv",#Good Privacy
"MnPrv",#Minimum Privacy
"GdWo",#Good Wood
"MnWw",#Minimum Wood/Wire
"NA",#No Fence
]
cat_exposure = ["Gd", #Good Exposure
"Av", #Average Exposure (split levels or foyers typically score average or above)
"Mn", #Mimimum Exposure
"No", #No Exposure
"NA", #No Basement
]
cat_miscfeat = ["Elev",#Elevator
"Gar2",#2nd Garage (if not described in garage section)
"Othr",#Other
"Shed",#Shed (over 100 SF)
"TenC",#Tennis Court
"NA",#None
]
cat_exter =["AsbShng",#Asbestos Shingles
"AsphShn",#Asphalt Shingles
"BrkComm",#Brick Common Brk Cmn BrkComm
"BrkFace",#Brick Face
"CBlock",#Cinder Block
"CementBd",#Cement Board #CementBd was the data_description value
"HdBoard",#Hard Board
"ImStucc",#Imitation Stucco
"MetalSd",#Metal Siding
"Other",#Other
"Plywood",#Plywood
"PreCast",#PreCast,#
"Stone",#Stone
"Stucco",#Stucco
"VinylSd",#Vinyl Siding
"Wd Sdng",#Wood Siding
"WdShing",#Wood Shingles #Wd Shng WdShing
]
cat_MasVnr =["BrkCmn",#Brick Common
"BrkFace",#Brick Face
"CBlock",#Cinder Block
"None",#None
"Stone",#Stone
]
cat_GarageType =["2Types",#More than one type of garage
"Attchd",#Attached to home
"Basment",#Basement Garage
"BuiltIn",#Built-In (Garage part of house - typically has room above garage)
"CarPort",#Car Port
"Detchd",#Detached from home
"NA",#No Garage
]
cat_GarageFinish =["Fin",#Finished
"RFn",#Rough Finished,#
"Unf",#Unfinished
"NA",#No Garage
]
cat_Functional = ["Typ",#Typical Functionality
"Min1",#Minor Deductions 1
"Min2",#Minor Deductions 2
"Mod",#Moderate Deductions
"Maj1",#Major Deductions 1
"Maj2",#Major Deductions 2
"Sev",#Severely Damaged
"Sal",#Salvage only
]
cat_Util =["AllPub",#All public Utilities (E,G,W,& S)
"NoSewr",#Electricity, Gas, and Water (Septic Tank)
"NoSeWa",#Electricity and Gas Only
"ELO",#Electricity only,#
]
cat_SaleType =["WD",#Warranty Deed - Conventional
"CWD",#Warranty Deed - Cash
"VWD",#Warranty Deed - VA Loan
"New",#Home just constructed and sold
"COD",#Court Officer Deed/Estate
"Con",#Contract 15% Down payment regular terms
"ConLw",#Contract Low Down payment and low interest
"ConLI",#Contract Low Interest
"ConLD",#Contract Low Down
"Oth",#Other
]
cat_Electrical = ["SBrkr",#Standard Circuit Breakers & Romex
"FuseA",#Fuse Box over 60 AMP and all Romex wiring (Average),#
"FuseF",#60 AMP Fuse Box and mostly Romex wiring (Fair)
"FuseP",#60 AMP Fuse Box and mostly knob & tube wiring (poor)
"Mix",#Mixed
]
###########################################################################
#define the collection of group features &categories value by diction type
Dict_category={"Qual":[cols_Qual,cat_Qual,"NA","Ordinal"],
"Overall":[cols_Overall,cat_Overall,"5","Ordinal"], # It is integer already. no need overwork
"BsmtFinType":[cols_BsmtFinType,cat_BsmtFinType,"NA","Ordinal"],
"Access":[cols_access,cat_access,"NA","Ordinal"],
"Fence":[cols_fence,cat_fence,"NA","Ordinal"],
"Exposure":[cols_exposure,cat_exposure,"NA","v"],
"GarageFinish":[cols_GarageFinish,cat_GarageFinish,"NA","Ordinal"],
"Functional":[cols_Functional,cat_Functional,"Typ","Ordinal"], #fill na with lowest quality
"Utility":[cols_Util,cat_Util,"ELO","Ordinal"], # fillNA with lowest quality
"Subclass":[cols_Subclass,cat_Subclass,"NA","Nominal"],
"Zone":[cols_Zone,cat_Zone,"RL","Nominal"], #RL is most popular zone value. "C(all) is the study result"
"Cond":[cols_condition,cat_conditions,"Norm","Nominal"],
"MiscFeature":[cols_miscfeat,cat_miscfeat,"NA","Nominal"],
"Exter":[cols_exter,cat_exter,"Other","Nominal"],
"MasVnr":[cols_MasVnr,cat_MasVnr,"None","Nominal"],
"GarageType":[cols_GarageType,cat_GarageType,"NA","Nominal"],
"SaleType":[cols_SaleType, cat_SaleType,"WD","Nominal"],
"Electrical":[cols_Electrical,cat_Electrical,"SBrkr","Nominal"],
}
#Change input feature type to string, especailly to below integer type
pre_combined[cols_Overall] = pre_combined[cols_Overall].astype(str)
pre_combined[cols_Subclass] = pre_combined[cols_Subclass].astype(str)
#fix the raw data mistyping
exter_map = {"Brk Cmn":"BrkComm",
"CmentBd":"CementBd",
"CemntBd":"CementBd",
"Wd Shng":"WdShing" }
pre_combined[cols_exter]=pre_combined[cols_exter].replace(exter_map)
for v in Dict_category.values():
cols_cat = v[0]
cat_order =v[1]
cat_fillnavalue=v[2]
pre_combined[cols_cat]=pre_combined[cols_cat].fillna(cat_fillnavalue)
#if not isOrdinal:
if v[3] =="Nominal":
for col in cols_cat:
pre_combined[col]=pre_combined[col].astype('category',ordered =True,categories=cat_order)
elif v[3]=="Ordinal":
for col in cols_cat:
pre_combined[col]=pre_combined[col].astype('category',ordered =True,categories=cat_order).cat.codes
pre_combined[col] = pre_combined[col].astype(np.number)
#pre_combined[cols_Overall] = pre_combined[cols_Overall].fillna(pre_combined[cols_Overall].median())
#Lotfrontage fill mssing value
pre_combined["LotFrontage"] = pre_combined.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))
#fill missing value to Garage related features
#Assuming no garage for thos missing value
pre_combined["GarageCars"] =pre_combined["GarageCars"].fillna(0).astype(int)
pre_combined["GarageArea"] =pre_combined["GarageArea"].fillna(0).astype(int)
#fill missing value to Basement related features
pre_combined[["BsmtFinSF1","BsmtFinSF2","BsmtUnfSF"]]= pre_combined[["BsmtFinSF1","BsmtFinSF2","BsmtUnfSF"]].fillna(0)
pre_combined["TotalBsmtSF"]= pre_combined["BsmtFinSF1"] + pre_combined["BsmtFinSF2"]+pre_combined["BsmtUnfSF"]
cols_Bsmt_Bath = ["BsmtHalfBath","BsmtFullBath"]
pre_combined[cols_Bsmt_Bath] =pre_combined[cols_Bsmt_Bath].fillna(0) #assuming mean
pre_combined["MasVnrArea"] = pre_combined["MasVnrArea"].fillna(0) #filled per study
#solve Year related feature missing value
#cols_time = ["YearBuilt","YearRemodAdd","GarageYrBlt","MoSold","YrSold"]
pre_combined["GarageYrBlt"] = pre_combined["GarageYrBlt"].fillna(pre_combined["YearBuilt"]) #use building year for garage even no garage.
return pre_combined
def pipe_outliersdrop(pre_combined=pipe_fillna_ascat(),ratio =0.001):
# note, it could done by statsmodel as well. it will explored in future
ratio =1-ratio
ntrain = pre_combined["SalePrice"].notnull().sum()
Y_train = pre_combined["SalePrice"][:ntrain]
num_cols = pre_combined.select_dtypes(include=[np.number]).columns
out_df = pre_combined[0:ntrain][num_cols]
top5 = np.abs(out_df.corrwith(Y_train)).sort_values(ascending=False)[:5]
#eda_plot(df=pre_combined[:ntrain],cols=top5.index)
limit = out_df["GrLivArea"].quantile(ratio)
# limit use to remove the outliers
dropindex = out_df[out_df["GrLivArea"]>limit].index
dropped_pre_combined =pre_combined.drop(dropindex)
#*****************************
dropped_Y_train = Y_train.drop(dropindex)
#*****************************
print("\n\n*****Drop outlier based on ratio > {0:.3f} quantile :".format(ratio))
#print("New shape of collected data",dropped_pre_combined.shape)
return dropped_pre_combined
def cat_col_compress(col, threshold=0.005):
#copy the code from stackoverflow
# removes the bind
dummy_col=col.copy()
# what is the ratio of a dummy in whole column
count = pd.value_counts(dummy_col) / len(dummy_col)
# cond whether the ratios is higher than the threshold
mask = dummy_col.isin(count[count > threshold].index)
# replace the ones which ratio is lower than the threshold by a special name
dummy_col[~mask] = "dum_others"
return dummy_col
def pipe_log_getdummies(pre_combined = pipe_fillna_ascat(),skew_ratio=0.75,cat_ratio=0):
from scipy.stats import skew
skew_limit =skew_ratio #I got this limit from Kaggle directly. Someone use 1 , someone use 0.75. I just use 0.75 by random and has no detail study yet
cat_threshold = cat_ratio
num_cols = pre_combined.select_dtypes(include=[np.number]).columns
cat_cols = pre_combined.select_dtypes(include=[np.object]).columns
#log transform skewed numeric features:
skewed_Series = np.abs(pre_combined[num_cols].skew()) #compute skewness
skewed_cols = skewed_Series[skewed_Series > skew_limit].index.values
pre_combined[skewed_cols] = np.log1p(pre_combined[skewed_cols])
skewed_Series = abs(pre_combined.skew()) #compute skewness
skewed_cols = skewed_Series[skewed_Series > skew_limit].index.tolist()
for col in cat_cols:
pre_combined[col]=cat_col_compress(pre_combined[col],threshold=cat_threshold) # threshold set to zero as it get high core for all estimatior except ridge based
pre_combined= pd.get_dummies(pre_combined,drop_first=True)
return pre_combined
def pipe_drop_dummycols(pre_combined=pipe_log_getdummies()):
cols = ["MSSubClass_160","MSZoning_C (all)"]
pre_combined=pre_combined.drop(cols,axis=1)
return pre_combined
def pipe_export(pre_output,name):
if (pre_output is None) :
print("None input! Expect pre_combined dataframe name as parameter")
return
elif pre_output.drop("SalePrice",axis=1).isnull().sum().sum()>0:
print("Dataframe still missing value! pls check again")
return
elif type(name) is not str:
print("Expect preparing option name to generate output file")
print("The out file name will be [Preparing_Output_<name>_20171029.h5] ")
return
else:
from datetime import datetime
savetime=datetime.now().strftime("%m-%d-%H_%M")
directory_name = "./prepare/"
filename = directory_name + name +"_"+ savetime +".h5"
local_ntrain = pre_output.SalePrice.notnull().sum()
pre_train = pre_output[0:local_ntrain]
pre_test =pre_output[local_ntrain:].drop("SalePrice",axis=1)
pre_train.to_hdf(filename,"pre_train")
pre_test.to_hdf(filename,"pre_test")
#print("\n***Exported*** :{0}".format(filename))
#print("\ttrain set size :\t",local_ntrain)
#print("\tpre_train shape:\t", pre_train.shape)
#print("\tpre_test shape:\t", pre_test.shape)
return pre_output
def pipe_r2test(df):
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')
print("****Testing*****")
train_df = df
ntrain = train_df["SalePrice"].notnull().sum()
train = train_df[:ntrain]
X_train = train.drop(["Id","SalePrice"],axis =1)
Y_train = train["SalePrice"]
from lightgbm import LGBMRegressor,LGBMClassifier
from sklearn.model_selection import cross_val_score
LGB =LGBMRegressor()
nCV=3
score = cross_val_score(LGB,X_train,Y_train,cv=nCV,scoring="r2")
print("R2 Scoring by lightGBM = {0:.4f}".format(score.mean()))
#print(pd.concat([X_train,Y_train],axis=1).head())
result = sm.OLS(Y_train, X_train).fit()
result_str= str(result.summary())
results1 = result_str.split("\n")[:10]
for result in results1:
print(result)
print('*'*20)
return df
def pipe_extract( pre_combined = pipe_fillna_ascat()):
#extract 3 age feature(building, Garage, and remodel)
pre_combined["BldAge"] = pre_combined["YrSold"] - pre_combined["YearBuilt"]
pre_combined["GarageAge"] = pre_combined["YrSold"] - pre_combined["GarageYrBlt"]
pre_combined["RemodelAge"] = pre_combined["YrSold"] - pre_combined["YearRemodAdd"]
SoldYM_df = pd.DataFrame({"year":pre_combined.YrSold,"month":pre_combined.MoSold.astype("int").astype("object"),"day":1})
SoldYM_df = pd.to_datetime(SoldYM_df,format='%Y%m%d',unit="D")
pre_combined["SoldYM"]=SoldYM_df.apply(lambda x: x.toordinal())
#extract total space features
# three options for calculating the total Square feet. (Garage & basement has very high sknew, remove them )
#pre_combined["TotalSQF"] = pre_combined["GarageArea"] + pre_combined['TotalBsmtSF'] +pre_combined["GrLivArea"]+pre_combined['1stFlrSF'] + pre_combined['2ndFlrSF']
#pre_combined["TotalSQF"] = pre_combined['TotalBsmtSF'] + pre_combined['1stFlrSF'] + pre_combined['2ndFlrSF']
pre_combined["TotalSQF"] = pre_combined['1stFlrSF'] + pre_combined['2ndFlrSF'] +pre_combined["GrLivArea"]
return pre_combined
def pipe_drop_dummycols(pre_combined=pipe_log_getdummies()):
cols = ["MSSubClass_160","MSZoning_C (all)"]
pre_combined=pre_combined.drop(cols,axis=1)
return pre_combined
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment