# Find columns with Null values
nullcols = []
for col in dfX.columns:
    nbnull = dfX[col].isnull().sum()
    if nbnull > 0:
        tp = type(dfX[dfX[col].notnull()][col].iat[0])  # type of first non-null value
        nullcols.append([col, tp])
        print(col, nbnull, tp)
# Fill Null values of boolean columns; booleans are re-encoded as strings
# so that 'MISSING' can act as a third category
for col, tp in nullcols:
    if tp == bool:
        dfX.loc[dfX[col]==True, col] = 'TRUE'
        dfX.loc[dfX[col]==False, col] = 'FALSE'
        dfX.loc[dfX[col].isnull(), col] = 'MISSING'

import pandas as pd
dfX = pd.read_csv('PUMP_training_set_values.csv') # predictive variables
dfY = pd.read_csv('PUMP_training_set_labels.csv') # target variable
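For modelling, the two frames are typically joined into one; a minimal sketch, assuming both CSVs share the competition's 'id' key column:

# hypothetical join step, not part of the original gist
df = dfX.merge(dfY, on='id')  # one row per pump: features plus target label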
# keep rows whose latitude/longitude/gps_height fall within plausible bounds for Tanzania
bound_df = dfX[(dfX['latitude']<-0.5) & (dfX['longitude']>25) & (dfX['gps_height']>0)]
# mean of the geographical data in each basin bucket
mean_geo_df = bound_df.groupby('basin')[['latitude', 'longitude', 'gps_height']].mean()
assert mean_geo_df.shape[0] == len(dfX['basin'].unique())
# Output: mean_geo_df, one row per basin with columns latitude, longitude, gps_height
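The likely follow-up (not shown in the gist) is to impute the implausible coordinates from these per-basin means; a minimal sketch, assuming mean_geo_df keeps 'basin' as its index:

# hypothetical imputation step: replace out-of-bound geo values by their basin mean
bad = ~((dfX['latitude'] < -0.5) & (dfX['longitude'] > 25) & (dfX['gps_height'] > 0))
for col in ['latitude', 'longitude', 'gps_height']:
    dfX.loc[bad, col] = dfX.loc[bad, 'basin'].map(mean_geo_df[col])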
# Split date_recorded stamp (string) into year, month, day of month, day of week features
from dateutil import parser
dates = []  # list of date related fields
dfX['date_recorded_year'] = dfX['date_recorded'].apply(lambda x: int(x.split('-')[0]))
dates.append('date_recorded_year')
dfX['date_recorded_month'] = dfX['date_recorded'].apply(lambda x: int(x.split('-')[1]))
dates.append('date_recorded_month')
# day and weekday lines reconstructed from the comment above
dfX['date_recorded_day'] = dfX['date_recorded'].apply(lambda x: int(x.split('-')[2]))
dates.append('date_recorded_day')
dfX['date_recorded_weekday'] = dfX['date_recorded'].apply(lambda x: parser.parse(x).weekday())
dates.append('date_recorded_weekday')
# WARNING : probably not useful for this dataset
# before filling the nulls, keep track of them with an indicator column
dfX['construction_year_missing'] = (dfX['construction_year']==0)*1
dates.append('construction_year_missing')
# to fill the missing construction years one can use the mean, the median or the oldest year
mean_year = dfX[dfX['construction_year']>0]['construction_year'].mean()
dfX.loc[dfX['construction_year']==0, 'construction_year'] = int(mean_year)
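The two alternatives named in the comment would read, as a sketch (not in the original gist):

median_year = dfX[dfX['construction_year'] > 0]['construction_year'].median()
oldest_year = dfX[dfX['construction_year'] > 0]['construction_year'].min()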
mzaradzki / pump_null_categorical.py
Last active July 3, 2017 10:20
Handling of Null values for categorical and boolean columns in Pandas dataframes
for col, tp in nullcols:
    if tp == str:
        dfX.loc[dfX[col].isnull(), col] = 'MISSING'
# Search for pairs of variables that are very similar
def show_similars(cols, threshold=0.90):
    for i1, col1 in enumerate(cols):
        for i2, col2 in enumerate(cols):
            if i1 < i2:
                cm12 = pd.crosstab(dfX[col1], dfX[col2]).values  # contingency table
                cv12 = cramers_corrected_stat(cm12)  # Cramer V statistic
                if cv12 > threshold:
                    print((col1, col2), int(cv12*100))
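cramers_corrected_stat is not defined in this snippet; a minimal sketch of the usual bias-corrected Cramer's V statistic, assuming scipy is available:

import numpy as np
from scipy.stats import chi2_contingency

def cramers_corrected_stat(confusion_matrix):
    # bias-corrected Cramer's V for a contingency table given as a numpy array
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0.0, phi2 - (k - 1) * (r - 1) / (n - 1))
    rcorr = r - (r - 1) ** 2 / (n - 1)
    kcorr = k - (k - 1) ** 2 / (n - 1)
    return np.sqrt(phi2corr / min(kcorr - 1, rcorr - 1))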
# Group rare values of each categorical column into a single '<col>_rare' bucket
for col in categories:  # categories : list of categorical column names, built earlier
    cs = dfX[col].value_counts(normalize=False, sort=True, ascending=False)
    rare_values = [k for k in cs.keys() if cs[k] < 40]  # threshold = 40 occurrences
    if len(rare_values) > 0:
        print('Trim values : ', col, len(rare_values))
        dfX.loc[dfX[col].isin(rare_values), col] = col + '_rare'
# Output :
# Trim values : funder 1730
# Trim values : installer 1982
mzaradzki / docSearcher_snippet1.js
Created May 29, 2017 16:16
AWS Lambda function to call the CloudSearch API with the JavaScript SDK
exports.handler = (event, context, callback) => {
    var csd = new AWS.CloudSearchDomain({
        endpoint: CS_NAME + '.' + SERVICES_REGION + '.cloudsearch.amazonaws.com',
        apiVersion: '2013-01-01'
    });
    var params = {
        query: event.query,
        sort: '_score desc',
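//  the gist is cut off here; a typical continuation (an assumption, not the
//  original code) would close the params object and call the SDK's search action:
//          size: 10
//      };
//      csd.search(params, function(err, data) {
//          if (err) callback(err);
//          else callback(null, data.hits);
//      });
//  };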