This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from time import localtime, mktime | |
MAX_RUN_MINUTES = 120 | |
def cron_killer(): | |
def __run_minutes(proc):
    """Return how many minutes have elapsed since ``proc`` was created.

    Args:
        proc: Object whose ``create_time()`` returns the start time in
            seconds since the epoch (presumably a ``psutil.Process`` --
            confirm against the caller).

    Returns:
        float: Elapsed wall-clock time in minutes.
    """
    # Local import: the file only pulls ``localtime`` and ``mktime`` from time.
    from time import time

    # Both operands are already epoch seconds, so subtract them directly
    # instead of converting each to a struct_time and back (which also
    # dropped the fractional seconds).
    return (time() - proc.create_time()) / 60.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from psutil import virtual_memory | |
from functools import wraps | |
MIN_VM_SHARE = 0.10 | |
MAX_CRON_PROCESSES = 5 | |
def cron_control(func=None): | |
@wraps(func) | |
def wrapped(*args, **kwargs): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from psutil import process_iter | |
def __get_cron_processes():
    """Return the running processes that pass the cron-candidate filters.

    A process yielded by ``process_iter()`` is kept only when all of these
    hold:

    * its name is exactly ``'python'``;
    * ``'python'`` is one of its command-line arguments;
    * its owner is not ``'root'``;
    * ``'ipykernel'`` is not one of its command-line arguments.

    Returns:
        list: The process objects that satisfy every filter, in the order
        ``process_iter()`` yielded them.
    """
    # Local import keeps the new name in scope without touching the
    # file-level ``from psutil import process_iter`` line.
    from psutil import NoSuchProcess

    processes = []
    for proc in process_iter():
        try:
            if proc.name() != 'python':
                continue
            # NOTE(review): cmdline() is a list of arguments, so both
            # membership tests below match whole arguments only (e.g. an
            # interpreter started as '/usr/bin/python' is not matched) --
            # confirm this is intended.
            cmdline = proc.cmdline()
            if 'python' not in cmdline:
                continue
            # Compare by value: the previous ``is 'root'`` identity test
            # was practically never true for a freshly built username
            # string, so root-owned processes were not being excluded.
            if proc.username() == 'root':
                continue
            if 'ipykernel' in cmdline:
                continue
        except NoSuchProcess:
            # The process exited between enumeration and inspection.
            continue
        processes.append(proc)
    return processes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Keep only the columns with "few" distinct values (fewer than 250,
# missing values counted as a value of their own).
cramer_cols = [
    name
    for name in df.columns.values
    if df[name].nunique(dropna=False) < 250
]
for col in cramer_cols: | |
try: | |
cm = pd.crosstab(df[col], df['status_group']).values # contingency table | |
cv1 = cramers_corrected_stat(cm) | |
if (cv1>=0.20): | |
print(col, int(cv1*100)) | |
except: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
nbQs = 4  # number of quantile bins (4 = quartiles)
# Bin construction_year into nbQs quantile bins (integer codes, since
# labels=False), then divide by the highest code so the result spans 0.0-1.0.
dfX['construction_year_quantile'] = pd.qcut(
    dfX['construction_year'],
    q=nbQs,
    labels=False,
).div(nbQs - 1.0)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Record which rows hold suspect geo values as 0/1 indicator columns
# before those values are overwritten, and register each new column in geos.
dfX['gps_height_bad'] = dfX['gps_height'].le(0) * 1
geos.append('gps_height_bad')
dfX['longitude_bad'] = dfX['longitude'].lt(25) * 1
geos.append('longitude_bad')
dfX['latitude_bad'] = dfX['latitude'].gt(-0.5) * 1
geos.append('latitude_bad')

# Overwrite each suspect gps_height with the gps_height stored for the row's
# basin in mean_geo_df, which is indexed by basin.
# Example of a lookup by basin: mean_geo_df.at['Lake Victoria','latitude']
dfX.loc[dfX['gps_height'].le(0), 'gps_height'] = dfX['basin'].apply(
    lambda basin: mean_geo_df.at[basin, 'gps_height']
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.ensemble import RandomForestClassifier | |
# Shuffle every row, then split by position: the first 45000 rows train the
# model and the remaining rows are held out for testing.
dfRFC = dfOHE.sample(frac=1)
dfRFC_trn = dfRFC.iloc[:45000]  # training set
dfRFC_tst = dfRFC.iloc[45000:]  # testing set

RFC = RandomForestClassifier(
    n_estimators=20,  # number of trees in the "forest" ensemble
    max_depth=25,     # maximum depth of each tree
)
RFC.fit(
    dfRFC_trn[predictors].values,
    dfRFC_trn['status_group_enc'].values,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.linear_model import LogisticRegression | |
# Shuffle every row, then split by position: the first 45000 rows train the
# model and the remaining rows are held out for testing.
dfLR = dfOHE.sample(frac=1)
dfLR_trn = dfLR.iloc[:45000]  # training set
dfLR_tst = dfLR.iloc[45000:]  # testing set

# 'ovr' = one (class) versus rest (of the classes)
LR = LogisticRegression(multi_class='ovr')
LR.fit(
    dfLR_trn[predictors].values,
    dfLR_trn['status_group_enc'].values,
)
# model accuracy score between 0% and 100% |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
dfOHE = None | |
for col in categories: # encode 1 category at a time | |
one_hot = pd.get_dummies(df[col], prefix=col) | |
# drop column as it is now encoded | |
if dfOHE is None: | |
dfOHE = df.drop(col, axis=1) | |
else: | |
dfOHE = dfOHE.drop(col, axis=1) |
NewerOlder