from sklearn.ensemble import RandomForestClassifier

dfRFC = dfOHE.sample(frac=1)  # shuffle the dataset before splitting it in 2 parts
dfRFC_trn = dfRFC[0:45000]  # training set
dfRFC_tst = dfRFC[45000:]   # testing set

RFC = RandomForestClassifier(n_estimators=20,  # number of trees in the "forest" ensemble
                             max_depth=25)     # maximum depth of each tree
RFC.fit(dfRFC_trn[predictors].values, dfRFC_trn['status_group_enc'].values)
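The snippet stops after the fit; a natural follow-up, assuming the same predictors list and status_group_enc target as above, is to score the held-out rows (a sketch, not from the original gist):

# Sketch: mean accuracy of the fitted forest on the held-out split
accuracy = RFC.score(dfRFC_tst[predictors].values, dfRFC_tst['status_group_enc'].values)
print('held-out accuracy :', accuracy)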
for col in numerical_columns:  # check frequency of the most common values
    cs = dfX[col].value_counts(normalize=True, sort=True, ascending=False)
    for k in cs.keys()[0:5]:
        print(col, k, int(cs[k]*1000)/10.)  # share of rows, in percent with 1 decimal
# Output :
# amount_tsh 0.0 70.0   # 70% of the records are 0
# amount_tsh 500.0 5.2
# amount_tsh 50.0 4.1
# amount_tsh 1000.0 2.5
# Find columns with Null values
nullcols = []
for col in dfX.columns:
    nbnull = (dfX[col].isnull()*1).sum()
    if (nbnull > 0):
        tp = type(dfX[dfX[col].notnull()][col].iat[0])  # type of the first non-null value
        nullcols.append([col, tp])
        print(col, nbnull, tp)
for col, tp in nullcols:
    if (tp == bool):
        dfX.loc[dfX[col]==True, col] = 'TRUE'
        dfX.loc[dfX[col]==False, col] = 'FALSE'
        dfX.loc[dfX[col].isnull(), col] = 'MISSING'
# Handling of Null values for categorical and boolean columns in Pandas dataframes
for col, tp in nullcols:
    if (tp == str):
        dfX.loc[dfX[col].isnull(), col] = 'MISSING'
for col in categories:
    cs = dfX[col].value_counts(normalize=False, sort=True, ascending=False)
    rare_values = [k for k in cs.keys() if cs[k]<40]  # threshold = 40 occurrences
    if len(rare_values)>0:
        print('Trim values : ', col, len(rare_values))
        dfX.loc[dfX[col].isin(rare_values), col] = col+'_rare'
# Output :
# Trim values : funder 1730
# Trim values : installer 1982
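The classifier at the top consumes a frame named dfOHE, which presumably comes from one-hot encoding these trimmed categorical columns; a minimal sketch with pd.get_dummies (the encoding step itself is not in the excerpt):

# Sketch (assumption, not from the original gist): build the dfOHE frame
# used by the random forest by one-hot encoding the trimmed categoricals.
import pandas as pd

dfOHE = pd.get_dummies(dfX, columns=categories, prefix=categories)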
# bounds of min/max latitude/longitude/height for Tanzania
bound_df = dfX[(dfX['latitude']<-0.5)&(dfX['longitude']>25)&(dfX['gps_height']>0)]
# mean of the geographical data in each bucket
mean_geo_df = bound_df.groupby('basin')[['latitude','longitude','gps_height']].mean()
assert(mean_geo_df.shape[0] == len(dfX['basin'].unique()))
# mean_geo_df has one row per basin, with columns: latitude, longitude, gps_height
# Before overwriting, keep track of suspect rows with new binary columns
dfX['gps_height_bad'] = (dfX['gps_height']<=0)*1
geos.append('gps_height_bad')
dfX['longitude_bad'] = (dfX['longitude']<25)*1
geos.append('longitude_bad')
dfX['latitude_bad'] = (dfX['latitude']>-0.5)*1
geos.append('latitude_bad')
# Example of query via index=basin : mean_geo_df.at['Lake Victoria','latitude']
dfX.loc[dfX['gps_height']<=0, 'gps_height'] = dfX['basin'].apply(lambda x : mean_geo_df.at[x,'gps_height'])
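Only the gps_height overwrite is shown; latitude and longitude can presumably be repaired the same way from the per-basin means (a sketch following the same pattern, not in the original):

# Sketch (assumption): mirror the gps_height overwrite for the coordinates
dfX.loc[dfX['longitude']<25, 'longitude'] = dfX['basin'].apply(lambda x : mean_geo_df.at[x,'longitude'])
dfX.loc[dfX['latitude']>-0.5, 'latitude'] = dfX['basin'].apply(lambda x : mean_geo_df.at[x,'latitude'])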
nbQs = 4 # quartiles
dfX['construction_year_quantile'] = pd.qcut(dfX['construction_year'], nbQs, labels=False)/(nbQs-1.0)
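For intuition: qcut with labels=False returns integer bucket ids 0..nbQs-1, so dividing by nbQs-1 rescales the feature to [0, 1]. A quick self-contained check on synthetic years (not the competition data):

import pandas as pd

years = pd.Series([1960, 1975, 1988, 1995, 2002, 2008, 2011, 2013])
q = pd.qcut(years, 4, labels=False) / 3.0
print(sorted(q.unique()))  # [0.0, 0.333..., 0.666..., 1.0]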
# select columns that have "few" unique values
cramer_cols = [col for col in df.columns.values if (len(df[col].unique())<250)]

for col in cramer_cols:
    try:
        cm = pd.crosstab(df[col], df['status_group']).values  # contingency table
        cv1 = cramers_corrected_stat(cm)
        if (cv1>=0.20):
            print(col, int(cv1*100))
    except:
        pass  # skip columns where the statistic cannot be computed
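The loop relies on a cramers_corrected_stat helper that the excerpt does not define; presumably it is the bias-corrected Cramér's V (Bergsma 2013), commonly implemented as:

# Sketch (assumption): bias-corrected Cramér's V for a contingency table
import numpy as np
import scipy.stats as ss

def cramers_corrected_stat(confusion_matrix):
    chi2 = ss.chi2_contingency(confusion_matrix)[0]  # chi-square statistic
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))    # bias-corrected phi^2
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))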