# Find columns with Null values
nullcols = []
for col in dfX.columns:
    nbnull = dfX[col].isnull().sum()
    if nbnull > 0:
        tp = type(dfX[dfX[col].notnull()][col].iat[0])  # type of first non-null value
        nullcols.append([col, tp])
        print(col, nbnull, tp)
# Fill Null values of boolean columns; booleans are re-encoded as strings
# so that 'MISSING' can act as a third category
for col, tp in nullcols:
    if tp == bool:
        dfX.loc[dfX[col]==True, col] = 'TRUE'
        dfX.loc[dfX[col]==False, col] = 'FALSE'
        dfX.loc[dfX[col].isnull(), col] = 'MISSING'

import pandas as pd
dfX = pd.read_csv('PUMP_training_set_values.csv') # predictive variables
dfY = pd.read_csv('PUMP_training_set_labels.csv') # target variable
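For modelling, the two frames are typically joined into one; a minimal sketch, assuming both CSVs share the competition's 'id' key column:

# hypothetical join step, not part of the original gist
df = dfX.merge(dfY, on='id')  # one row per pump: features plus target label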
# keep rows whose latitude/longitude/gps_height fall within plausible bounds for Tanzania
bound_df = dfX[(dfX['latitude']<-0.5) & (dfX['longitude']>25) & (dfX['gps_height']>0)]
# mean of the geographical data in each basin bucket
mean_geo_df = bound_df.groupby('basin')[['latitude', 'longitude', 'gps_height']].mean()
assert mean_geo_df.shape[0] == len(dfX['basin'].unique())
# Output: mean_geo_df, one row per basin with columns latitude, longitude, gps_height
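The likely follow-up (not shown in the gist) is to impute the implausible coordinates from these per-basin means; a minimal sketch, assuming mean_geo_df keeps 'basin' as its index:

# hypothetical imputation step: replace out-of-bound geo values by their basin mean
bad = ~((dfX['latitude'] < -0.5) & (dfX['longitude'] > 25) & (dfX['gps_height'] > 0))
for col in ['latitude', 'longitude', 'gps_height']:
    dfX.loc[bad, col] = dfX.loc[bad, 'basin'].map(mean_geo_df[col])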
# Split date_recorded stamp (string) into year, month, day of month, day of week features
from dateutil import parser
dates = []  # list of date related fields
dfX['date_recorded_year'] = dfX['date_recorded'].apply(lambda x: int(x.split('-')[0]))
dates.append('date_recorded_year')
dfX['date_recorded_month'] = dfX['date_recorded'].apply(lambda x: int(x.split('-')[1]))
dates.append('date_recorded_month')
# day and weekday lines reconstructed from the comment above
dfX['date_recorded_day'] = dfX['date_recorded'].apply(lambda x: int(x.split('-')[2]))
dates.append('date_recorded_day')
dfX['date_recorded_weekday'] = dfX['date_recorded'].apply(lambda x: parser.parse(x).weekday())
dates.append('date_recorded_weekday')
# WARNING : probably not useful for this dataset
# before filling the nulls, keep track of them with an indicator column
dfX['construction_year_missing'] = (dfX['construction_year']==0)*1
dates.append('construction_year_missing')
# to fill the missing construction years one can use the mean, the median or the oldest year
mean_year = dfX[dfX['construction_year']>0]['construction_year'].mean()
dfX.loc[dfX['construction_year']==0, 'construction_year'] = int(mean_year)
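The two alternatives named in the comment would read, as a sketch (not in the original gist):

median_year = dfX[dfX['construction_year'] > 0]['construction_year'].median()
oldest_year = dfX[dfX['construction_year'] > 0]['construction_year'].min()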
mzaradzki / pump_null_categorical.py
Last active July 3, 2017 10:20
Handling of Null values for categorical and boolean columns in Pandas dataframes
for col, tp in nullcols:
    if tp == str:
        dfX.loc[dfX[col].isnull(), col] = 'MISSING'
# Search for pairs of variables that are very similar
def show_similars(cols, threshold=0.90):
    for i1, col1 in enumerate(cols):
        for i2, col2 in enumerate(cols):
            if i1 < i2:
                cm12 = pd.crosstab(dfX[col1], dfX[col2]).values  # contingency table
                cv12 = cramers_corrected_stat(cm12)  # Cramer V statistic
                if cv12 > threshold:
                    print((col1, col2), int(cv12*100))
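cramers_corrected_stat is not defined in this snippet; a minimal sketch of the usual bias-corrected Cramer's V statistic, assuming scipy is available:

import numpy as np
from scipy.stats import chi2_contingency

def cramers_corrected_stat(confusion_matrix):
    # bias-corrected Cramer's V for a contingency table given as a numpy array
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0.0, phi2 - (k - 1) * (r - 1) / (n - 1))
    rcorr = r - (r - 1) ** 2 / (n - 1)
    kcorr = k - (k - 1) ** 2 / (n - 1)
    return np.sqrt(phi2corr / min(kcorr - 1, rcorr - 1))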
# Group rare values of each categorical column into a single '<col>_rare' bucket
for col in categories:  # categories : list of categorical column names, built earlier
    cs = dfX[col].value_counts(normalize=False, sort=True, ascending=False)
    rare_values = [k for k in cs.keys() if cs[k] < 40]  # threshold = 40 occurrences
    if len(rare_values) > 0:
        print('Trim values : ', col, len(rare_values))
        dfX.loc[dfX[col].isin(rare_values), col] = col + '_rare'
# Output :
# Trim values : funder 1730
# Trim values : installer 1982
mzaradzki / docSearcher_snippet1.js
Created May 29, 2017 16:16
AWS Lambda function to call the CloudSearch API with the JavaScript SDK
exports.handler = (event, context, callback) => {
    var csd = new AWS.CloudSearchDomain({
        endpoint: CS_NAME + '.' + SERVICES_REGION + '.cloudsearch.amazonaws.com',
        apiVersion: '2013-01-01'
    });
    var params = {
        query: event.query,
        sort: '_score desc',
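//  the gist is cut off here; a typical continuation (an assumption, not the
//  original code) would close the params object and call the SDK's search action:
//          size: 10
//      };
//      csd.search(params, function(err, data) {
//          if (err) callback(err);
//          else callback(null, data.hits);
//      });
//  };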