Skip to content

Instantly share code, notes, and snippets.

View pump_null_dates.py
# Flag rows whose construction_year is the sentinel value 0 so the model can
# still learn from "missingness" after the value itself has been imputed.
dfX['construction_year_missing'] = dfX['construction_year'].eq(0).astype(int)
dates.append('construction_year_missing')  # register among date-related fields
# Impute missing years with the mean of the known (non-zero) years;
# the median or the oldest observed year would be alternative strategies.
mean_year = dfX.loc[dfX['construction_year'] > 0, 'construction_year'].mean()
dfX.loc[dfX['construction_year'] == 0, 'construction_year'] = int(mean_year)
View pump_parsing_dates.py
# Derive numeric year and month features from the date_recorded string,
# which has the "YYYY-MM-DD" layout (only year and month are extracted here).
from dateutil import parser

for part, pos in (('year', 0), ('month', 1)):
    col = 'date_recorded_' + part
    dfX[col] = dfX['date_recorded'].apply(lambda s, p=pos: int(s.split('-')[p]))
    dates.append(col)
# WARNING: probably not useful for this dataset
View pump_null_zero_geographicals_1.py
# Keep only rows with plausible Tanzanian coordinates; latitude/longitude near
# zero and gps_height <= 0 act as sentinel "missing" values in this dataset.
bound_df = dfX[(dfX['latitude'] < -0.5) & (dfX['longitude'] > 25) & (dfX['gps_height'] > 0)]
# Mean geographical position per basin, used to impute the missing coordinates.
# FIX: select the sub-DataFrame with a list inside the brackets — the old
# groupby(...)['a','b','c'] form was deprecated and removed in modern pandas.
mean_geo_df = bound_df.groupby(['basin'])[['latitude', 'longitude', 'gps_height']].mean()
# Every basin must appear in the bounded subset, otherwise some rows
# could not be imputed from their basin's mean.
assert mean_geo_df.shape[0] == len(dfX['basin'].unique())
# Out[31]: mean_geo_df
# latitude longitude gps_height
View pump_import_data.py
# Load the Pump-it-Up training data: features and labels are shipped as two
# separate CSV files sharing the same row order.
import pandas as pd

TRAIN_VALUES = 'PUMP_training_set_values.csv'
TRAIN_LABELS = 'PUMP_training_set_labels.csv'
dfX = pd.read_csv(TRAIN_VALUES)  # predictive variables
dfY = pd.read_csv(TRAIN_LABELS)  # target variable
View pump_null_listing.py
# List the columns containing null values, keeping each column's name together
# with the Python type of its first non-null entry (used later to pick an
# imputation strategy per type).
nullcols = []
for col in dfX.columns:
    nbnull = (dfX[col].isnull() * 1).sum()  # count of nulls in this column
    if nbnull > 0:
        # type of the first non-null value in the column
        tp = type(dfX[dfX[col].notnull()][col].iat[0])
        nullcols.append([col, tp])
        # FIX: the original printed undefined name `t` (NameError); it is `tp`.
        print(col, nbnull, tp)
# Output
View pump_null_boolean.py
# Recode nullable boolean columns as three-valued string categories
# ('TRUE' / 'FALSE' / 'MISSING') so they can be one-hot encoded later.
for col, tp in nullcols:
    if tp != bool:
        continue
    dfX.loc[dfX[col] == True, col] = 'TRUE'
    dfX.loc[dfX[col] == False, col] = 'FALSE'
    dfX.loc[dfX[col].isnull(), col] = 'MISSING'
View pump_ohe.py
# One-hot encode every categorical column, accumulating the result in dfOHE.
# FIX: the original computed `one_hot` but never joined it back, so the
# encoded columns were silently discarded and the categories simply dropped.
dfOHE = None
for col in categories:  # encode 1 category at a time
    one_hot = pd.get_dummies(df[col], prefix=col)
    # start from df on the first pass, then keep extending dfOHE
    base = df if dfOHE is None else dfOHE
    # drop the raw column now that it is encoded, and append its dummies
    dfOHE = base.drop(col, axis=1).join(one_hot)
View pump_logistic.py
from sklearn.linear_model import LogisticRegression

# Shuffle once so the head/tail split below behaves as a random split.
dfLR = dfOHE.sample(frac=1)
dfLR_trn = dfLR[:45000]  # training set
dfLR_tst = dfLR[45000:]  # testing set
# 'ovr': fit one binary classifier per class against the rest of the classes.
LR = LogisticRegression(multi_class='ovr')
LR.fit(dfLR_trn[predictors].values, dfLR_trn['status_group_enc'].values)
# model accuracy score between 0% and 100%
View pump_randomforest.py
from sklearn.ensemble import RandomForestClassifier

# Same shuffled head/tail split as used for the logistic-regression model.
dfRFC = dfOHE.sample(frac=1)
dfRFC_trn = dfRFC[:45000]  # training set
dfRFC_tst = dfRFC[45000:]  # testing set
# Ensemble of 20 trees, each grown to a maximum depth of 25.
RFC = RandomForestClassifier(n_estimators=20, max_depth=25)
RFC.fit(dfRFC_trn[predictors].values, dfRFC_trn['status_group_enc'].values)
View pump_hidden_null_values.py
# Sentinel-value hunt: for each numeric column, print the five most frequent
# values with their share (percent, one decimal), exposing "hidden nulls"
# such as the 0.0 placeholder.
for col in numerical_columns:
    freq = dfX[col].value_counts(normalize=True, sort=True, ascending=False)
    for val in freq.keys()[:5]:
        pct = int(freq[val] * 1000) / 10.  # truncate to one decimal place
        print(col, val, pct)
# Output :
# amount_tsh 0.0 70.0 # 70% of the record are 0
# amount_tsh 500.0 5.2
# amount_tsh 50.0 4.1
# amount_tsh 1000.0 2.5