This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.ensemble import RandomForestClassifier

# Shuffle the dataset before splitting it into train/test parts.
# NOTE(review): assumes dfOHE has comfortably more than 45000 rows — confirm upstream.
dfRFC = dfOHE.sample(frac=1)
dfRFC_trn = dfRFC[0:45000]   # training set
dfRFC_tst = dfRFC[45000:]    # testing set

RFC = RandomForestClassifier(n_estimators=20,  # number of trees in the "forest" ensemble
                             max_depth=25)     # maximum depth of each tree
RFC.fit(dfRFC_trn[predictors].values, dfRFC_trn['status_group_enc'].values)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Find columns with null values and record each with the Python type of its
# first non-null entry (used by later passes to decide how to impute).
nullcols = []
for col in dfX.columns:
    nbnull = (dfX[col].isnull() * 1).sum()  # number of nulls in this column
    if nbnull > 0:
        # Type of the first non-null value in the column.
        tp = type(dfX[dfX[col].notnull()][col].iat[0])
        nullcols.append([col, tp])
        print(col, nbnull, tp)  # fixed: was `t`, an undefined name (NameError)
# Output
# Output |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Recode nullable boolean columns as explicit string categories so that the
# null marker can live alongside the true/false values.
for col, tp in nullcols:
    if tp == bool:
        dfX.loc[dfX[col] == True, col] = 'TRUE'
        dfX.loc[dfX[col] == False, col] = 'FALSE'
        dfX.loc[dfX[col].isnull(), col] = 'MISSING'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Replace nulls in string-typed columns with an explicit 'MISSING' category.
for col, tp in nullcols:
    if tp == str:
        dfX.loc[dfX[col].isnull(), col] = 'MISSING'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Collapse rare category values (fewer than 40 occurrences) into a single
# per-column '<col>_rare' bucket to keep later one-hot encoding manageable.
for col in categories:
    cs = dfX[col].value_counts(normalize=False, sort=True, ascending=False)
    rare_values = [k for k in cs.keys() if cs[k] < 40]  # threshold = 40 occurrences
    if len(rare_values) > 0:
        print('Trim values : ', col, len(rare_values))
        dfX.loc[dfX[col].isin(rare_values), col] = col + '_rare'
# Output :
# Trim values :  funder 1730
# Trim values :  installer 1982
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Keep only rows whose coordinates are plausible for Tanzania
# (latitude < -0.5, longitude > 25, gps_height > 0).
bound_df = dfX[(dfX['latitude'] < -0.5) & (dfX['longitude'] > 25) & (dfX['gps_height'] > 0)]
# Mean of the geographical data in each basin bucket.
# Fixed: column selection on a GroupBy must use a list ([['a', 'b']]);
# the bare multi-label form ['a', 'b'] was deprecated and removed in pandas.
mean_geo_df = bound_df.groupby(['basin'])[['latitude', 'longitude', 'gps_height']].mean()
# Sanity check: every basin must have at least one plausible row,
# otherwise lookups by basin below would raise KeyError.
assert mean_geo_df.shape[0] == len(dfX['basin'].unique())
# Out[31]: mean_geo_df
#                  latitude   longitude   gps_height
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Before overwriting, keep track of suspect rows with new binary flag columns.
dfX['gps_height_bad'] = (dfX['gps_height'] <= 0) * 1
geos.append('gps_height_bad')
dfX['longitude_bad'] = (dfX['longitude'] < 25) * 1
geos.append('longitude_bad')
dfX['latitude_bad'] = (dfX['latitude'] > -0.5) * 1
geos.append('latitude_bad')
# Example of query via index=basin : mean_geo_df.at['Lake Victoria','latitude']
# Replace non-positive gps_height with the mean height of the row's basin.
# Fixed: the original applied the lambda to EVERY row of 'basin' and let
# index alignment discard the unneeded values; restricting to the masked
# rows gives the same result with less work.
bad_height = dfX['gps_height'] <= 0
dfX.loc[bad_height, 'gps_height'] = dfX.loc[bad_height, 'basin'].map(
    lambda x: mean_geo_df.at[x, 'gps_height'])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Map construction_year onto its quartile rank, scaled to [0, 1]:
# qcut with labels=False yields bucket indices 0..nbQs-1, so dividing
# by (nbQs - 1) normalizes them to the unit interval.
nbQs = 4  # quartiles
dfX['construction_year_quantile'] = pd.qcut(dfX['construction_year'], nbQs, labels=False) / (nbQs - 1.0)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# select columns that have "few" unique values | |
cramer_cols = [col for col in df.columns.values if (len(df[col].unique())<250)] | |
for col in cramer_cols: | |
try: | |
cm = pd.crosstab(df[col], df['status_group']).values # contingency table | |
cv1 = cramers_corrected_stat(cm) | |
if (cv1>=0.20): | |
print(col, int(cv1*100)) | |
except: |