Skip to content

Instantly share code, notes, and snippets.

# Flag suspect geographic readings with 0/1 indicator columns before the raw
# values get overwritten, and register every new indicator column in `geos`.
suspect_masks = (
    ('gps_height', dfX['gps_height'] <= 0),
    ('longitude', dfX['longitude'] < 25),
    ('latitude', dfX['latitude'] > -0.5),
)
for source_col, is_suspect in suspect_masks:
    flag_col = source_col + '_bad'
    # Multiplying the boolean mask by 1 turns it into integer 0/1 values.
    dfX[flag_col] = is_suspect * 1
    geos.append(flag_col)
# Example of a lookup through the index (= basin):
#   mean_geo_df.at['Lake Victoria', 'latitude']
# Look up, for every row, the gps_height stored in mean_geo_df for the row's
# basin, then use it to overwrite the rows whose own gps_height is <= 0.
basin_gps_height = dfX['basin'].apply(
    lambda basin: mean_geo_df.at[basin, 'gps_height']
)
dfX.loc[dfX['gps_height'] <= 0, 'gps_height'] = basin_gps_height
nbQs = 4  # number of quantile bins (4 -> quartiles)
# Bin construction_year into nbQs quantile bins (integer codes 0..nbQs-1),
# then rescale the codes to the [0, 1] range.
construction_year_bin = pd.qcut(dfX['construction_year'], nbQs, labels=False)
dfX['construction_year_quantile'] = construction_year_bin / (nbQs - 1.0)
# select columns that have "few" unique values
# (fewer than 250 distinct values, as counted by len(df[col].unique()))
cramer_cols = [col for col in df.columns.values if (len(df[col].unique())<250)]
# For every selected column, cross-tabulate it against the 'status_group'
# column and print the columns whose statistic reaches at least 0.20.
for col in cramer_cols:
try:
cm = pd.crosstab(df[col], df['status_group']).values # contingency table
# cramers_corrected_stat is defined outside this excerpt; presumably a
# bias-corrected Cramér's V computed from the contingency table -- TODO confirm
cv1 = cramers_corrected_stat(cm)
if (cv1>=0.20):
# The statistic is printed as an integer percentage (truncated by int()).
print(col, int(cv1*100))
# NOTE(review): the handler body is not visible in this excerpt, and the
# indentation of the whole snippet was lost. A bare `except:` also catches
# KeyboardInterrupt and SystemExit; consider narrowing it once the intended
# handling is known.
except:
from psutil import process_iter
def __get_cron_processes():
    """Return the running ``python`` processes that pass the cron filters.

    A process is kept when all of the following hold:

    * its process name is exactly ``'python'``;
    * ``'python'`` is one of its command-line arguments;
    * ``'ipykernel'`` is not one of its command-line arguments;
    * its owner is not ``'root'``.

    Processes that terminate while they are being inspected are skipped.

    Returns:
        list: the matching process objects, in ``process_iter()`` order.
    """
    # Imported locally so the block does not depend on a file-level import.
    from psutil import NoSuchProcess

    def _passes_filters(proc):
        """Tell whether *proc* satisfies every filter listed above."""
        try:
            if proc.name() != 'python':
                return False
            # Fetch the argument list once; it is checked twice below.
            arguments = proc.cmdline()
            if 'python' not in arguments or 'ipykernel' in arguments:
                return False
            # Compare by value: `is 'root'` tested object identity, which is
            # not a reliable string comparison, so root-owned processes were
            # not excluded as intended.
            return proc.username() != 'root'
        except NoSuchProcess:
            # The process exited between enumeration and inspection.
            return False

    return [proc for proc in process_iter() if _passes_filters(proc)]
from psutil import virtual_memory
from functools import wraps
# Module-level thresholds; the code that reads them is not visible in this excerpt.
# NOTE(review): presumably the minimum share of virtual memory that must
# remain available (0.10 = 10%) -- confirm against usage.
MIN_VM_SHARE = 0.10
# NOTE(review): presumably the maximum number of cron processes allowed to
# run at the same time -- confirm against usage.
MAX_CRON_PROCESSES = 5
def cron_control(func=None):
@wraps(func)
def wrapped(*args, **kwargs):
from time import localtime, mktime
MAX_RUN_MINUTES = 120
def cron_killer():
def __run_minutes(proc):
t_start = localtime(proc.create_time())
t_now = localtime()
return (mktime(t_now) - mktime(t_start)) / 60.