from time import localtime, mktime

MAX_RUN_MINUTES = 120

def cron_killer():
    def __run_minutes(proc):
        # elapsed wall-clock minutes since the process was created
        t_start = localtime(proc.create_time())
        t_now = localtime()
        return (mktime(t_now) - mktime(t_start)) / 60.
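    # Hypothetical completion -- the body of cron_killer is missing from the
    # gist. A sketch under that assumption: scan python processes with psutil
    # and kill any that has been running longer than MAX_RUN_MINUTES.
    from psutil import process_iter
    for proc in process_iter():
        if proc.name() == 'python' and __run_minutes(proc) > MAX_RUN_MINUTES:
            proc.kill()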
from psutil import virtual_memory
from functools import wraps
MIN_VM_SHARE = 0.10
MAX_CRON_PROCESSES = 5
def cron_control(func=None):
    @wraps(func)
    def wrapped(*args, **kwargs):
        from psutil import process_iter

        def __get_cron_processes():
            # python jobs launched from the command line, excluding root
            # processes and notebook (ipykernel) kernels
            processes = [proc for proc in process_iter() if ('python' == proc.name())]
            processes = [proc for proc in processes if ('python' in proc.cmdline())]
            processes = [proc for proc in processes if (proc.username() != 'root')]
            processes = [proc for proc in processes if not ('ipykernel' in proc.cmdline())]
            return processes
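        # Hypothetical completion -- the rest of wrapped() is missing from the
        # gist. Presumably MIN_VM_SHARE and MAX_CRON_PROCESSES gate the job:
        # only run func when at least 10% of virtual memory is free and no more
        # than MAX_CRON_PROCESSES cron jobs are already running.
        vm = virtual_memory()
        if (vm.available / vm.total >= MIN_VM_SHARE
                and len(__get_cron_processes()) <= MAX_CRON_PROCESSES):
            return func(*args, **kwargs)
    return wrapped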
# select columns that have "few" unique values
cramer_cols = [col for col in df.columns.values if (len(df[col].unique()) < 250)]
for col in cramer_cols:
    try:
        cm = pd.crosstab(df[col], df['status_group']).values  # contingency table
        cv1 = cramers_corrected_stat(cm)
        if (cv1 >= 0.20):
            print(col, int(cv1 * 100))
    except Exception:
        pass  # skip columns for which the statistic cannot be computed
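# cramers_corrected_stat is not defined in the gist; define it before running
# the loop above. A standard implementation of the bias-corrected Cramér's V
# (Bergsma 2013) over a contingency table:
import numpy as np
import scipy.stats as ss

def cramers_corrected_stat(confusion_matrix):
    # chi-squared statistic of the contingency table
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    # bias corrections for phi^2 and for the table dimensions
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))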
nbQs = 4 # quartiles
dfX['construction_year_quantile'] = pd.qcut(dfX['construction_year'], nbQs, labels=False)/(nbQs-1.0)
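# For illustration (hypothetical years): qcut with labels=False returns the
# quantile-bin index 0..nbQs-1, so dividing by (nbQs - 1) rescales it to [0, 1].
demo = pd.qcut(pd.Series([1960, 1980, 2000, 2010]), nbQs, labels=False) / (nbQs - 1.0)
# demo -> 0.0, 1/3, 2/3, 1.0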
# Before overwriting keep track of suspect rows with new binary columns
dfX['gps_height_bad'] = (dfX['gps_height']<=0)*1
geos.append('gps_height_bad')
dfX['longitude_bad'] = (dfX['longitude']<25)*1
geos.append('longitude_bad')
dfX['latitude_bad'] = (dfX['latitude']>-0.5)*1
geos.append('latitude_bad')
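# mean_geo_df is not built anywhere in this gist; presumably it is a per-basin
# table of mean geo values computed from the rows not flagged as bad. A sketch
# under that assumption:
good = (dfX['gps_height_bad'] + dfX['longitude_bad'] + dfX['latitude_bad']) == 0
mean_geo_df = dfX[good].groupby('basin')[['gps_height', 'longitude', 'latitude']].mean()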
# Example of a query by basin index: mean_geo_df.at['Lake Victoria','latitude']
dfX.loc[dfX['gps_height']<=0, 'gps_height'] = dfX['basin'].apply(lambda x : mean_geo_df.at[x,'gps_height'])
for col in numerical_columns:  # check the frequency of the most common values
    cs = dfX[col].value_counts(normalize=True, sort=True, ascending=False)
    for k in cs.keys()[0:5]:
        print(col, k, int(cs[k] * 1000) / 10.)
# Output :
# amount_tsh 0.0 70.0 # 70% of the records are 0
# amount_tsh 500.0 5.2
# amount_tsh 50.0 4.1
# amount_tsh 1000.0 2.5
from sklearn.ensemble import RandomForestClassifier
dfRFC = dfOHE.sample(frac=1) # shuffle the dataset before splitting it in 2 parts
dfRFC_trn = dfRFC[0:45000] # training set
dfRFC_tst = dfRFC[45000:] # testing set
RFC = RandomForestClassifier(n_estimators=20,  # number of trees in the "forest" ensemble
                             max_depth=25)     # maximum depth of each tree
RFC.fit(dfRFC_trn[predictors].values, dfRFC_trn['status_group_enc'].values)
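# Held-out accuracy check (not in the original gist; a sketch assuming the
# same predictors and encoded target columns):
from sklearn.metrics import accuracy_score
pred = RFC.predict(dfRFC_tst[predictors].values)
print(accuracy_score(dfRFC_tst['status_group_enc'].values, pred))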
from sklearn.linear_model import LogisticRegression
dfLR = dfOHE.sample(frac=1) # shuffle the dataset before splitting it in 2 parts
dfLR_trn = dfLR[0:45000] # training set
dfLR_tst = dfLR[45000:] # testing set
LR = LogisticRegression(multi_class='ovr') # ovr = one (class) versus rest (of classes)
LR.fit(dfLR_trn[predictors].values, dfLR_trn['status_group_enc'].values)
# model accuracy score between 0% and 100%
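# The scoring code itself is missing from the gist; a minimal sketch using
# sklearn's built-in mean-accuracy scorer on the held-out rows:
score = LR.score(dfLR_tst[predictors].values, dfLR_tst['status_group_enc'].values)
print(int(score * 100), '%')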
dfOHE = None
for col in categories:  # encode 1 category at a time
    one_hot = pd.get_dummies(df[col], prefix=col)
    # drop the original column as it is now encoded
    if dfOHE is None:
        dfOHE = df.drop(col, axis=1)
    else:
        dfOHE = dfOHE.drop(col, axis=1)
    # append the dummy columns (this join is implied but missing in the gist)
    dfOHE = dfOHE.join(one_hot)
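# Note: the loop above matches pandas' built-in multi-column form, which
# one-hot encodes every listed column in one call (the default prefix is the
# column name, as in the loop):
dfOHE = pd.get_dummies(df, columns=categories)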