mzaradzki / docSearcher_snippet1.js
Created May 29, 2017 16:16
AWS Lambda function to call the CloudSearch API with the JavaScript SDK
var AWS = require('aws-sdk'); // the SDK is preloaded in the Lambda runtime
exports.handler = (event, context, callback) => {
    // CS_NAME and SERVICES_REGION are constants defined elsewhere in the gist
    var csd = new AWS.CloudSearchDomain({
        endpoint: CS_NAME + '.' + SERVICES_REGION + '.cloudsearch.amazonaws.com',
        apiVersion: '2013-01-01'
    });
    var params = {
        query: event.query,
        sort: '_score desc',
        size: 10 // assumed completion; the original snippet is truncated here
    };
    // send the search request and return the hits to the caller
    csd.search(params, (err, data) => callback(err, err ? null : data.hits));
};
mzaradzki / docIndexer_snippet1.js
Created May 29, 2017 09:13
AWS Lambda function to index new S3 files in CloudSearch
exports.handler = (event, context, callback) => {
    // WARNING:
    // this snippet assumes event.Records[0].eventName == 'ObjectCreated:Put',
    // but the full code handles both 'ObjectCreated:Put' and 'ObjectRemoved:Delete'
    var filename = event.Records[0].s3.object.key;
    var bucketname = event.Records[0].s3.bucket.name;
    var params = {
        Bucket: bucketname, // assumed completion: the truncated snippet likely
        Key: filename       // fetches the new object before indexing it
    };
    // the full gist goes on to read the object and push it to CloudSearch
};
from functools import wraps
from time import localtime, mktime

from psutil import process_iter, virtual_memory

MIN_VM_SHARE = 0.10      # skip a run when less than 10% of RAM is available
MAX_CRON_PROCESSES = 5   # skip a run when too many python cron jobs are active
MAX_RUN_MINUTES = 120    # kill jobs that have run for more than 2 hours

def cron_control(func=None):
    # decorator: run func only when the host has spare memory and few cron jobs
    @wraps(func)
    def wrapped(*args, **kwargs):
        # assumed completion: the original snippet is truncated after this point
        vm = virtual_memory()
        if (float(vm.available) / vm.total) < MIN_VM_SHARE:
            return None
        if len(__get_cron_processes()) >= MAX_CRON_PROCESSES:
            return None
        return func(*args, **kwargs)
    return wrapped

def cron_killer():
    def __run_minutes(proc):
        # elapsed wall-clock time of proc, in minutes
        t_start = localtime(proc.create_time())
        t_now = localtime()
        return (mktime(t_now) - mktime(t_start)) / 60.
    # assumed completion: reap the jobs that exceed the time budget
    for proc in __get_cron_processes():
        if __run_minutes(proc) > MAX_RUN_MINUTES:
            proc.kill()

def __get_cron_processes():
    # python cron processes, excluding root-owned ones and jupyter kernels
    processes = [proc for proc in process_iter() if proc.name() == 'python']
    processes = [proc for proc in processes if 'python' in proc.cmdline()]
    processes = [proc for proc in processes if proc.username() != 'root']  # was: is 'root', an identity check that never matched
    processes = [proc for proc in processes if not any('ipykernel' in arg for arg in proc.cmdline())]
    return processes
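As a usage sketch (the job name and body here are hypothetical), a scheduled task is wrapped with the decorator so that cron can launch it blindly while cron_control decides whether it actually runs:

@cron_control
def nightly_job():
    print('doing the actual work ...')  # hypothetical job body

if __name__ == '__main__':
    nightly_job()  # silently skipped when the host is overloaded
    cron_killer()  # reap jobs that ran past MAX_RUN_MINUTES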
# search for pairs of variables that are very similar
def show_similars(cols, threshold=0.90):
    for i1, col1 in enumerate(cols):
        for i2, col2 in enumerate(cols):
            if i1 < i2:
                cm12 = pd.crosstab(dfX[col1], dfX[col2]).values  # contingency table
                cv12 = cramers_corrected_stat(cm12)  # Cramer's V statistic
                if cv12 > threshold:
                    print((col1, col2), int(cv12 * 100))
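The helper cramers_corrected_stat is called by show_similars above (and by the column scan below) but is not shown in the preview; a minimal sketch, assuming the standard bias-corrected Cramér's V (Bergsma's correction), would be:

import numpy as np
from scipy.stats import chi2_contingency

def cramers_corrected_stat(confusion_matrix):
    # bias-corrected Cramer's V of a contingency table (Bergsma 2013)
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0.0, phi2 - (k - 1.0) * (r - 1.0) / (n - 1.0))
    rcorr = r - (r - 1.0) ** 2 / (n - 1.0)
    kcorr = k - (k - 1.0) ** 2 / (n - 1.0)
    return np.sqrt(phi2corr / min(kcorr - 1.0, rcorr - 1.0))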
# select columns that have "few" unique values
cramer_cols = [col for col in df.columns.values if len(df[col].unique()) < 250]
for col in cramer_cols:
    try:
        cm = pd.crosstab(df[col], df['status_group']).values  # contingency table
        cv1 = cramers_corrected_stat(cm)
        if cv1 >= 0.20:
            print(col, int(cv1 * 100))
    except Exception:
        pass  # assumed handler: the snippet is truncated at the bare except
nbQs = 4  # quartiles
# quantile-bin construction_year, then rescale the bin codes to [0, 1]
dfX['construction_year_quantile'] = pd.qcut(dfX['construction_year'], nbQs, labels=False) / (nbQs - 1.0)
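As a quick illustration on made-up data, the rescaled quantile codes land on an even grid between 0 and 1:

import pandas as pd
years = pd.Series([1960, 1975, 1988, 1995, 2004, 2010])  # hypothetical values
print(pd.qcut(years, 4, labels=False) / 3.0)  # bin codes 0..3 become 0.0, 1/3, 2/3, 1.0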
# before overwriting, keep track of suspect rows with new binary columns
# (geos is assumed to be a list of geographical feature names built earlier)
dfX['gps_height_bad'] = (dfX['gps_height'] <= 0) * 1
geos.append('gps_height_bad')
dfX['longitude_bad'] = (dfX['longitude'] < 25) * 1
geos.append('longitude_bad')
dfX['latitude_bad'] = (dfX['latitude'] > -0.5) * 1
geos.append('latitude_bad')
# bounds of plausible latitude/longitude/height values for Tanzania
bound_df = dfX[(dfX['latitude'] < -0.5) & (dfX['longitude'] > 25) & (dfX['gps_height'] > 0)]
# mean of the geographical data in each basin bucket
mean_geo_df = bound_df.groupby('basin')[['latitude', 'longitude', 'gps_height']].mean()
assert mean_geo_df.shape[0] == len(dfX['basin'].unique())
# example of a query via index=basin : mean_geo_df.at['Lake Victoria', 'latitude']
# replace suspect gps_height values by the mean of their basin
dfX.loc[dfX['gps_height'] <= 0, 'gps_height'] = dfX['basin'].apply(lambda x: mean_geo_df.at[x, 'gps_height'])
# mean_geo_df is indexed by basin, with columns latitude, longitude and gps_height
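The longitude_bad and latitude_bad flags suggest the truncated code repairs those two columns the same way; the analogous statements, assumed rather than taken from the gist, would be:

# assumed continuation, mirroring the gps_height repair above
dfX.loc[dfX['longitude'] < 25, 'longitude'] = dfX['basin'].apply(lambda x: mean_geo_df.at[x, 'longitude'])
dfX.loc[dfX['latitude'] > -0.5, 'latitude'] = dfX['basin'].apply(lambda x: mean_geo_df.at[x, 'latitude'])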