Skip to content

Instantly share code, notes, and snippets.

@aficionado
Created January 3, 2014 00:21
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aficionado/8230033 to your computer and use it in GitHub Desktop.
Save aficionado/8230033 to your computer and use it in GitHub Desktop.
Simple test to detect covariate shift
#!/bin/bash
# Simple test to detect covariate shift.
# Requires:
# BIGML_AUTH set up with your BIGML_USERNAME and API_KEY
# curl: http://curl.haxx.se/
# jq: http://stedolan.github.io/jq/
# bc: https://www.gnu.org/software/bc/
# ---- API endpoint ----------------------------------------------------------
BIGML_DOMAIN='bigml.io' # Set it up to your own BigML VPC domain

# ---- Polling ---------------------------------------------------------------
# FINISHED and ERROR are the values wait_resource compares ".status.code"
# against; WAIT is handed to sleep between polls and TOTAL_WAIT is the upper
# bound of wait_resource's polling loop.
FINISHED='5'
ERROR='-1'
WAIT='5'
TOTAL_WAIT='600'

# ---- Experiment ------------------------------------------------------------
NAME='Titanic'
MAX_TRIALS='10'
SEED='SEED'
OBJECTIVE_FIELD_NAME='Origin'
TRAINING_DATA='https://gist.github.com/aficionado/7743748/raw/3f1f1d5bd09c296e099344a80539103e4fa90756/titanic_train.csv'
PRODUCTION_DATA='https://gist.github.com/aficionado/7743752/raw/db2f5b38bc290b4defc68fe0865f23c16e1e6b7f/titanic_test.csv'

# ---- Sampling --------------------------------------------------------------
TRAINING_SAMPLE_RATE='0.8'
PRODUCTION_SAMPLE_RATE='0.8'
EVALUATION_SAMPLE_RATE='0.8'

# ---- Filters and excluded fields (neutral defaults) ------------------------
TRAINING_FILTER='true'
PRODUCTION_FILTER='true'
EXCLUDED_FIELDS='[]'

# These two assignments override the neutral filters above and induce a
# covariate shift (training keeps Sex = male, production keeps the rest).
# They are active as shipped; comment both out to compare unfiltered data.
TRAINING_FILTER='(= (f Sex) male)'
PRODUCTION_FILTER='(!= (f Sex) male)'

# Uncomment to exclude discriminative fields
#EXCLUDED_FIELDS='["000001"]'
function wait_resource {
  # Waits for a resource to finish, as its creation is asynchronous.
  #
  # Globals (read): BIGML_DOMAIN, BIGML_AUTH, FINISHED, ERROR, WAIT,
  #                 TOTAL_WAIT (WAIT and TOTAL_WAIT in whole seconds)
  # Arguments:      $1 - resource id as returned in ".resource"
  # Returns:        0 once ".status.code" equals FINISHED,
  #                 1 when TOTAL_WAIT seconds elapsed without finishing.
  # Exits the script with status 1 when the id is missing or the resource
  # reports the ERROR status.
  local id=$1
  local waited=0
  local status

  # jq -r prints "null" when the creation response had no ".resource" key;
  # polling such an id can never succeed, so fail right away.
  if [ -z "$id" ] || [ "$id" = "null" ]; then
    echo "Detected a failure: no resource id to wait for" >&2
    exit 1
  fi

  while :; do
    status=$(curl -s "https://$BIGML_DOMAIN/$id?$BIGML_AUTH" \
      | jq -r ".status.code")
    # String comparisons: an empty or non-numeric status (network hiccup,
    # unexpected payload) simply counts as "not finished yet" instead of
    # making the test command fail.
    if [ "$status" = "$FINISHED" ]; then
      return 0
    fi
    if [ "$status" = "$ERROR" ]; then
      echo "Detected a failure waiting for $id" >&2
      exit 1
    fi
    if [ "$waited" -ge "$TOTAL_WAIT" ]; then
      echo "Timed out after ${waited}s waiting for $id" >&2
      return 1
    fi
    sleep "$WAIT"
    # Track elapsed seconds rather than the number of polls, so TOTAL_WAIT
    # really bounds the total wait. Advance by at least one so that WAIT=0
    # cannot loop forever.
    waited=$((waited + (WAIT > 0 ? WAIT : 1)))
  done
}
# Create the training and production sources, then one dataset per source.
# Creation requests are answered before the resource is ready, so every id is
# validated and then polled with wait_resource before it is used.
function require_resource {
  # Aborts the script when a creation request yielded no resource id.
  # Arguments: $1 - value jq extracted from ".resource"
  #            $2 - label used in the error message
  # jq -r prints "null" for a missing key and nothing at all when curl
  # produced no output.
  if [ -z "$1" ] || [ "$1" = "null" ]; then
    echo "Failed to create $2" >&2
    exit 1
  fi
}

# Create training and production sources
TRAINING_SOURCE=$(curl -s "https://$BIGML_DOMAIN/source?$BIGML_AUTH" \
  -X POST -H "content-type: application/json" \
  -d '{"remote": "'"$TRAINING_DATA"'", "name": "'"$NAME"' Training"}' \
  | jq -r ".resource")
require_resource "$TRAINING_SOURCE" "training source"
PRODUCTION_SOURCE=$(curl -s "https://$BIGML_DOMAIN/source?$BIGML_AUTH" \
  -X POST -H "content-type: application/json" \
  -d '{"remote": "'"$PRODUCTION_DATA"'", "name": "'"$NAME"' Production"}' \
  | jq -r ".resource")
require_resource "$PRODUCTION_SOURCE" "production source"

# Create training and production datasets
wait_resource "$TRAINING_SOURCE" || exit 1
TRAINING_DATASET=$(curl -s "https://$BIGML_DOMAIN/dataset?$BIGML_AUTH" \
  -X POST -H "content-type: application/json" \
  -d '{"source": "'"$TRAINING_SOURCE"'"}' \
  | jq -r ".resource")
require_resource "$TRAINING_DATASET" "training dataset"
wait_resource "$PRODUCTION_SOURCE" || exit 1
PRODUCTION_DATASET=$(curl -s "https://$BIGML_DOMAIN/dataset?$BIGML_AUTH" \
  -X POST -H "content-type: application/json" \
  -d '{"source": "'"$PRODUCTION_SOURCE"'"}' \
  | jq -r ".resource")
require_resource "$PRODUCTION_DATASET" "production dataset"
wait_resource "$TRAINING_DATASET" || exit 1
wait_resource "$PRODUCTION_DATASET" || exit 1
# Run MAX_TRIALS trials. Each trial labels fresh samples of the training and
# production datasets, trains a model to predict that label, evaluates it on
# the out-of-bag rows and prints the evaluation's average phi. The mean phi
# over all trials is printed at the end.
TRIALS=0
AVG_PHI=0
while [ "$TRIALS" -lt "$MAX_TRIALS" ]; do
  TRIALS=$((TRIALS + 1))

  # Filter training and production datasets and label them with a new field.
  # TRAINING_SAMPLE_RATE / PRODUCTION_SAMPLE_RATE are only read in this loop,
  # never reassigned, so every trial samples with the configured rates.
  LABELED_TRAINING_DATASET=$(curl -s \
    "https://$BIGML_DOMAIN/dataset?$BIGML_AUTH" \
    -X POST -H "content-type: application/json" \
    -d '{"origin_dataset": "'"$TRAINING_DATASET"'",
         "lisp_filter": "'"$TRAINING_FILTER"'",
         "new_fields": [{"field": "Training",
                         "name": "'"$OBJECTIVE_FIELD_NAME"'"}],
         "sample_rate": '"$TRAINING_SAMPLE_RATE"'}' \
    | jq -r ".resource")
  LABELED_PRODUCTION_DATASET=$(curl -s \
    "https://$BIGML_DOMAIN/dataset?$BIGML_AUTH" \
    -X POST -H "content-type: application/json" \
    -d '{"origin_dataset": "'"$PRODUCTION_DATASET"'",
         "lisp_filter": "'"$PRODUCTION_FILTER"'",
         "new_fields": [{"field": "Production",
                         "name": "'"$OBJECTIVE_FIELD_NAME"'"}],
         "sample_rate": '"$PRODUCTION_SAMPLE_RATE"'}' \
    | jq -r ".resource")
  wait_resource "$LABELED_TRAINING_DATASET" || exit 1
  wait_resource "$LABELED_PRODUCTION_DATASET" || exit 1

  # Compute sample rate sizes to make sure that the input dataset for the
  # model is balanced
  TRAINING_INSTANCES=$(curl -s \
    "https://$BIGML_DOMAIN/$LABELED_TRAINING_DATASET?$BIGML_AUTH" \
    | jq -r ".rows")
  PRODUCTION_INSTANCES=$(curl -s \
    "https://$BIGML_DOMAIN/$LABELED_PRODUCTION_DATASET?$BIGML_AUTH" \
    | jq -r ".rows")
  # Both counts must be positive integers: anything else ("null", empty, 0)
  # would break the comparison below or make bc divide by zero.
  for INSTANCES in "$TRAINING_INSTANCES" "$PRODUCTION_INSTANCES"; do
    case "$INSTANCES" in
      '' | 0 | *[!0-9]*)
        echo "Unusable row count '$INSTANCES' in trial $TRIALS" >&2
        exit 1
        ;;
    esac
  done
  # The balancing rates live in their own variables; storing them in
  # TRAINING_SAMPLE_RATE / PRODUCTION_SAMPLE_RATE would change how the next
  # trial samples its labeled datasets.
  if [ "$TRAINING_INSTANCES" -gt "$PRODUCTION_INSTANCES" ]; then
    SAMPLE_RATE=$(echo "$PRODUCTION_INSTANCES/$TRAINING_INSTANCES" | bc -l)
    TRAINING_BALANCE_RATE=$(printf '%.4f\n' "$SAMPLE_RATE")
    PRODUCTION_BALANCE_RATE=1
  else
    SAMPLE_RATE=$(echo "$TRAINING_INSTANCES/$PRODUCTION_INSTANCES" | bc -l)
    TRAINING_BALANCE_RATE=1
    PRODUCTION_BALANCE_RATE=$(printf '%.4f\n' "$SAMPLE_RATE")
  fi

  # The target of the new model will be the label (Training / Production)
  OBJECTIVE_FIELD=$(curl -s \
    "https://$BIGML_DOMAIN/$LABELED_TRAINING_DATASET?$BIGML_AUTH;prefix=$OBJECTIVE_FIELD_NAME" \
    | jq -r ".fields | keys[0]")

  # Create a model using just a sample of the data
  MODEL=$(curl -s "https://$BIGML_DOMAIN/model?$BIGML_AUTH" \
    -X POST -H "content-type: application/json" \
    -d '{"datasets": ["'"$LABELED_TRAINING_DATASET"'",
                      "'"$LABELED_PRODUCTION_DATASET"'"],
         "sample_rates": {"'"$LABELED_TRAINING_DATASET"'":
                            '"$TRAINING_BALANCE_RATE"',
                          "'"$LABELED_PRODUCTION_DATASET"'":
                            '"$PRODUCTION_BALANCE_RATE"'},
         "objective_field": "'"$OBJECTIVE_FIELD"'",
         "sample_rate": '"$EVALUATION_SAMPLE_RATE"',
         "seed": "'"$SEED"'",
         "name": "'"$NAME"' - Covariate Shift?",
         "excluded_fields": '"$EXCLUDED_FIELDS"'}' \
    | jq -r ".resource")
  wait_resource "$MODEL" || exit 1

  # Create an evaluation using the other part of the data (out_of_bag=true)
  EVALUATION=$(curl -s "https://$BIGML_DOMAIN/evaluation?$BIGML_AUTH" \
    -X POST -H "content-type: application/json" \
    -d '{"datasets": ["'"$LABELED_TRAINING_DATASET"'",
                      "'"$LABELED_PRODUCTION_DATASET"'"],
         "sample_rates": {"'"$LABELED_TRAINING_DATASET"'":
                            '"$TRAINING_BALANCE_RATE"',
                          "'"$LABELED_PRODUCTION_DATASET"'":
                            '"$PRODUCTION_BALANCE_RATE"'},
         "sample_rate": '"$EVALUATION_SAMPLE_RATE"',
         "seed": "'"$SEED"'",
         "out_of_bag": true,
         "model": "'"$MODEL"'",
         "name": "'"$NAME"' - Covariate Shift?"}' \
    | jq -r ".resource")
  wait_resource "$EVALUATION" || exit 1

  PHI=$(curl -s "https://$BIGML_DOMAIN/$EVALUATION?$BIGML_AUTH" \
    | jq -r ".result.model.average_phi")
  # jq -r prints "null" (or nothing) when the evaluation carries no result;
  # feeding that to bc would corrupt the running sum.
  case "$PHI" in
    '' | null)
      echo "No average_phi available for $EVALUATION" >&2
      exit 1
      ;;
  esac
  AVG_PHI=$(echo "$AVG_PHI + $PHI" | bc -l)
  printf '%.4f\n' "$PHI"
done
# With MAX_TRIALS=0 no trial ran; skip the division so bc is not asked to
# divide by zero and AVG_PHI keeps its initial 0.
if [ "$TRIALS" -gt 0 ]; then
  AVG_PHI=$(echo "$AVG_PHI / $TRIALS" | bc -l)
fi
printf 'AVG_PHI: %.4f\n' "$AVG_PHI"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment