Created
January 3, 2014 00:21
-
-
Save aficionado/8230033 to your computer and use it in GitHub Desktop.
Simple test to detect covariate shift
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Simple test to detect covariate shift.
#
# Requires:
#   BIGML_AUTH set up with your BIGML_USERNAME and API_KEY
#   curl: http://curl.haxx.se/
#   jq: http://stedolan.github.io/jq/

BIGML_DOMAIN=bigml.io # Set it up to your own BigML VPC domain

# Values of a resource's .status.code that wait_resource stops polling on.
FINISHED=5
ERROR=-1

# Polling: WAIT is the number of seconds slept between two status checks and
# TOTAL_WAIT is the maximum number of such sleeps (it is compared against a
# poll counter, not against elapsed seconds).
WAIT=5
TOTAL_WAIT=600

# Sent verbatim as the "seed" of every model and evaluation request.
SEED=SEED

# Sample rates sent when the labelled training / production datasets are
# created, and when the model and its out-of-bag evaluation are created.
TRAINING_SAMPLE_RATE=0.8
PRODUCTION_SAMPLE_RATE=0.8
EVALUATION_SAMPLE_RATE=0.8

# Name of the label field added to both datasets; it becomes the objective
# field of the model.
OBJECTIVE_FIELD_NAME=Origin

TRAINING_DATA=https://gist.github.com/aficionado/7743748/raw/3f1f1d5bd09c296e099344a80539103e4fa90756/titanic_train.csv
PRODUCTION_DATA=https://gist.github.com/aficionado/7743752/raw/db2f5b38bc290b4defc68fe0865f23c16e1e6b7f/titanic_test.csv
NAME=Titanic

# Number of model/evaluation rounds whose phi is averaged.
MAX_TRIALS=10

# Default filters and exclusions: keep every row and every field.
TRAINING_FILTER=true
PRODUCTION_FILTER=true
EXCLUDED_FIELDS='[]'

# The two filters below override the defaults above and induce a covariate
# shift. Comment them out to compare the unfiltered datasets instead.
TRAINING_FILTER='(= (f Sex) male)'
PRODUCTION_FILTER='(!= (f Sex) male)'

# Uncomment to exclude discriminative fields
#EXCLUDED_FIELDS=[\"000001\"]
function wait_resource {
  # Waits for a resource to finish, as resource creation is asynchronous.
  #
  # Globals (read): BIGML_DOMAIN, BIGML_AUTH, FINISHED, ERROR, WAIT,
  #                 TOTAL_WAIT
  # Arguments:      $1 - resource id, e.g. "source/<id>"
  # Returns:        0 once the resource's .status.code equals FINISHED.
  # Exits:          1 (terminating the whole script) when the status equals
  #                 ERROR, or when the resource is still not finished after
  #                 TOTAL_WAIT sleeps of WAIT seconds each.
  local id=$1
  local polls=0
  local status

  while :; do
    status=$(curl -s "https://$BIGML_DOMAIN/$id?$BIGML_AUTH" \
      | jq ".status.code")

    # Only compare numerically when the status really is an integer: an
    # empty or "null" answer (failed request, unexpected payload) would make
    # the numeric tests error out and be mistaken for success. Such answers
    # are simply retried until the poll limit is hit.
    if [[ "$status" =~ ^-?[0-9]+$ ]]; then
      if [ "$status" -eq "$ERROR" ]; then
        echo "Detected a failure waiting for $id" >&2
        exit 1
      fi
      if [ "$status" -eq "$FINISHED" ]; then
        return 0
      fi
    fi

    # Giving up must be an error: continuing with an unfinished resource
    # only makes the following requests fail in less obvious ways.
    if [ "$polls" -ge "$TOTAL_WAIT" ]; then
      echo "Timed out waiting for $id (last status: ${status:-unknown})" >&2
      exit 1
    fi

    sleep "$WAIT"
    polls=$((polls + 1))
  done
}
# Create training and production sources
TRAINING_SOURCE=$(curl -s "https://$BIGML_DOMAIN/source?$BIGML_AUTH" \
  -X POST -H "content-type: application/json" \
  -d '{"remote": "'"$TRAINING_DATA"'", "name": "'"$NAME"' Training"}' \
  | jq -r ".resource")
PRODUCTION_SOURCE=$(curl -s "https://$BIGML_DOMAIN/source?$BIGML_AUTH" \
  -X POST -H "content-type: application/json" \
  -d '{"remote": "'"$PRODUCTION_DATA"'", "name": "'"$NAME"' Production"}' \
  | jq -r ".resource")

# Create training and production datasets
wait_resource "$TRAINING_SOURCE"
TRAINING_DATASET=$(curl -s "https://$BIGML_DOMAIN/dataset?$BIGML_AUTH" \
  -X POST -H "content-type: application/json" \
  -d '{"source": "'"$TRAINING_SOURCE"'"}' \
  | jq -r ".resource")
wait_resource "$PRODUCTION_SOURCE"
PRODUCTION_DATASET=$(curl -s "https://$BIGML_DOMAIN/dataset?$BIGML_AUTH" \
  -X POST -H "content-type: application/json" \
  -d '{"source": "'"$PRODUCTION_SOURCE"'"}' \
  | jq -r ".resource")
wait_resource "$TRAINING_DATASET"
wait_resource "$PRODUCTION_DATASET"

# Run MAX_TRIALS rounds. Each round builds a model that tries to tell
# training rows from production rows, evaluates it on the out-of-bag part of
# the data and prints the evaluation's average phi; the mean over all rounds
# is printed at the end.
TRIALS=0
PHI_SUM=0
while [ "$TRIALS" -lt "$MAX_TRIALS" ]; do
  TRIALS=$((TRIALS + 1))

  # Filter training and production datasets and label them with a new field
  LABELED_TRAINING_DATASET=$(curl -s \
    "https://$BIGML_DOMAIN/dataset?$BIGML_AUTH" \
    -X POST -H "content-type: application/json" \
    -d '{"origin_dataset": "'"$TRAINING_DATASET"'",
         "lisp_filter": "'"$TRAINING_FILTER"'",
         "new_fields": [{"field": "Training",
                         "name": "'"$OBJECTIVE_FIELD_NAME"'"}],
         "sample_rate": '"$TRAINING_SAMPLE_RATE"'}' \
    | jq -r ".resource")
  LABELED_PRODUCTION_DATASET=$(curl -s \
    "https://$BIGML_DOMAIN/dataset?$BIGML_AUTH" \
    -X POST -H "content-type: application/json" \
    -d '{"origin_dataset": "'"$PRODUCTION_DATASET"'",
         "lisp_filter": "'"$PRODUCTION_FILTER"'",
         "new_fields": [{"field": "Production",
                         "name": "'"$OBJECTIVE_FIELD_NAME"'"}],
         "sample_rate": '"$PRODUCTION_SAMPLE_RATE"'}' \
    | jq -r ".resource")
  wait_resource "$LABELED_TRAINING_DATASET"
  wait_resource "$LABELED_PRODUCTION_DATASET"

  # Compute sample rate sizes to make sure that the input dataset for the
  # model is balanced
  TRAINING_INSTANCES=$(curl -s \
    "https://$BIGML_DOMAIN/$LABELED_TRAINING_DATASET?$BIGML_AUTH" \
    | jq -r ".rows")
  PRODUCTION_INSTANCES=$(curl -s \
    "https://$BIGML_DOMAIN/$LABELED_PRODUCTION_DATASET?$BIGML_AUTH" \
    | jq -r ".rows")

  # Both counts must be positive integers: anything else would break the
  # numeric comparison below or hand bc a division by zero.
  if ! [[ "$TRAINING_INSTANCES" =~ ^[0-9]+$ ]] ||
     ! [[ "$PRODUCTION_INSTANCES" =~ ^[0-9]+$ ]] ||
     [ "$TRAINING_INSTANCES" -eq 0 ] ||
     [ "$PRODUCTION_INSTANCES" -eq 0 ]; then
    echo "Cannot balance datasets: training rows=$TRAINING_INSTANCES," \
      "production rows=$PRODUCTION_INSTANCES" >&2
    exit 1
  fi

  # The balancing rates live in their own per-trial variables. Writing them
  # back into TRAINING_SAMPLE_RATE / PRODUCTION_SAMPLE_RATE would make every
  # later trial create its labelled datasets with the previous trial's
  # balancing rate instead of the configured one.
  if [ "$TRAINING_INSTANCES" -gt "$PRODUCTION_INSTANCES" ]; then
    SAMPLE_RATE=$(echo "$PRODUCTION_INSTANCES/$TRAINING_INSTANCES" | bc -l)
    TRAINING_BALANCE_RATE=$(printf '%.4f\n' "$SAMPLE_RATE")
    PRODUCTION_BALANCE_RATE=1
  else
    SAMPLE_RATE=$(echo "$TRAINING_INSTANCES/$PRODUCTION_INSTANCES" | bc -l)
    TRAINING_BALANCE_RATE=1
    PRODUCTION_BALANCE_RATE=$(printf '%.4f\n' "$SAMPLE_RATE")
  fi

  # The target of the new model will be the label (Training / Production)
  OBJECTIVE_FIELD=$(curl -s \
    "https://$BIGML_DOMAIN/$LABELED_TRAINING_DATASET?$BIGML_AUTH;prefix=$OBJECTIVE_FIELD_NAME" \
    | jq -r ".fields | keys[0]")

  # Create a model using just a sample of the data
  MODEL=$(curl -s "https://$BIGML_DOMAIN/model?$BIGML_AUTH" \
    -X POST -H "content-type: application/json" \
    -d '{"datasets": ["'"$LABELED_TRAINING_DATASET"'",
                      "'"$LABELED_PRODUCTION_DATASET"'"],
         "sample_rates": {"'"$LABELED_TRAINING_DATASET"'":
                            '"$TRAINING_BALANCE_RATE"',
                          "'"$LABELED_PRODUCTION_DATASET"'":
                            '"$PRODUCTION_BALANCE_RATE"'},
         "objective_field": "'"$OBJECTIVE_FIELD"'",
         "sample_rate": '"$EVALUATION_SAMPLE_RATE"',
         "seed": "'"$SEED"'",
         "name": "'"$NAME"' - Covariate Shift?",
         "excluded_fields": '"$EXCLUDED_FIELDS"'}' \
    | jq -r ".resource")
  wait_resource "$MODEL"

  # Create an evaluation using the other part of the data (out_of_bag=true)
  EVALUATION=$(curl -s "https://$BIGML_DOMAIN/evaluation?$BIGML_AUTH" \
    -X POST -H "content-type: application/json" \
    -d '{"datasets": ["'"$LABELED_TRAINING_DATASET"'",
                      "'"$LABELED_PRODUCTION_DATASET"'"],
         "sample_rates": {"'"$LABELED_TRAINING_DATASET"'":
                            '"$TRAINING_BALANCE_RATE"',
                          "'"$LABELED_PRODUCTION_DATASET"'":
                            '"$PRODUCTION_BALANCE_RATE"'},
         "sample_rate": '"$EVALUATION_SAMPLE_RATE"',
         "seed": "'"$SEED"'",
         "out_of_bag": true,
         "model": "'"$MODEL"'",
         "name": "'"$NAME"' - Covariate Shift?"}' \
    | jq -r ".resource")
  wait_resource "$EVALUATION"

  PHI=$(curl -s "https://$BIGML_DOMAIN/$EVALUATION?$BIGML_AUTH" \
    | jq -r ".result.model.average_phi")
  # A missing phi would silently corrupt the running sum, so stop instead.
  if [ -z "$PHI" ] || [ "$PHI" = "null" ]; then
    echo "No average_phi found in $EVALUATION" >&2
    exit 1
  fi
  PHI_SUM=$(echo "$PHI_SUM + $PHI" | bc -l)
  printf '%.4f\n' "$PHI"
done

AVG_PHI=$(echo "$PHI_SUM / $TRIALS" | bc -l)
printf 'AVG_PHI: %.4f\n' "$AVG_PHI"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment