Created
July 12, 2012 00:18
-
-
Save beaucronin/3094654 to your computer and use it in GitHub Desktop.
Veritable code to analyze heights and weights with different units
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import veritable | |
import csv | |
import matplotlib.pyplot as plt | |
# Load the csv and read into a Veritable dataset using inches and pounds.
# Columns are consumed positionally: gender, height, weight. The height and
# weight cells are parsed as floats; gender is kept as the raw string.
print('Reading data from file')
with open('heights_weights_genders.csv') as fd:
    rd = csv.reader(fd)
    # Skip the header. The next() builtin works on both Python 2.6+ and
    # Python 3, unlike the Python 2-only rd.next() method.
    next(rd)
    data_inches_pounds = [
        {
            # Zero-padded, five-digit row id derived from the row position.
            '_id': 'R_{:05}'.format(i),
            'gender': line[0],
            'height_in': float(line[1]),
            'weight_lb': float(line[2]),
        }
        for i, line in enumerate(rd)
    ]

# Column types handed to the analysis for the inches/pounds table.
schema_inches_pounds = {
    'gender': {'type': 'categorical'},
    'height_in': {'type': 'real'},
    'weight_lb': {'type': 'real'},
}
# Convert to a second dataset using meters and grams.
# Row ids and genders are copied unchanged so the two datasets line up
# row for row; only the numeric columns are rescaled.
print('Creating second dataset')

# Conversion factors, kept at the exact values the script has always used
# so the derived dataset is unchanged.
METERS_PER_INCH = .0254
GRAMS_PER_POUND = 453.6

data_meters_grams = [
    {
        '_id': row['_id'],
        'gender': row['gender'],
        'height_m': row['height_in'] * METERS_PER_INCH,
        'weight_g': row['weight_lb'] * GRAMS_PER_POUND,
    }
    for row in data_inches_pounds
]

# Column types handed to the analysis for the meters/grams table.
schema_meters_grams = {
    'gender': {'type': 'categorical'},
    'height_m': {'type': 'real'},
    'weight_g': {'type': 'real'},
}
# Analyze both datasets: for each one, create a table (force=True is passed
# to create_table), upload its rows, and create an analysis from its schema.
api = veritable.connect()

print('Uploading data')
table_inches_pounds = api.create_table('inches_pounds', force=True)
table_inches_pounds.batch_upload_rows(data_inches_pounds)
analysis_inches_pounds = table_inches_pounds.create_analysis(
    schema_inches_pounds, 'inches_pounds')

table_meters_grams = api.create_table('meters_grams', force=True)
table_meters_grams.batch_upload_rows(data_meters_grams)
analysis_meters_grams = table_meters_grams.create_analysis(
    schema_meters_grams, 'meters_grams')

# Both analyses are created before either wait() call.
# NOTE(review): presumably create_analysis returns immediately so the two
# analyses run concurrently -- confirm against the Veritable client docs.
print('Analyzing')
analysis_inches_pounds.wait()
analysis_meters_grams.wait()
# Make some predictions that demonstrate that Veritable can handle variables
# of widely different scales
print('Making predictions')

# Q1: what gender are tall people? (6' 6", or almost 2 meters)
# The same question is posed to each analysis in its own units; 'gender' is
# set to None to mark it as the value to predict.
query_1a = {'height_in': 78., 'gender': None}
query_1b = {'height_m': 1.98, 'gender': None}

# credible_values() is evaluated once per prediction and the result is
# reused for both the printout and the plot.
preds_1a = analysis_inches_pounds.predict(query_1a)
gender_values_1a = preds_1a.credible_values('gender')
print(gender_values_1a)

preds_1b = analysis_meters_grams.predict(query_1b)
gender_values_1b = preds_1b.credible_values('gender')
print(gender_values_1b)

# Side-by-side bars of the 'Male' entry from each analysis.
plt.figure()
plt.bar(
    [1, 2],
    [gender_values_1a['Male'], gender_values_1b['Male']],
    # The tick positions (1.4, 2.4) and x-limits below assume bars that
    # start at x=1 and x=2; newer matplotlib centres bars by default, so
    # request edge alignment explicitly.
    align='edge')
plt.xlim([.8, 3.0])
plt.ylim([0., 1.])
plt.ylabel('Prob( Gender = Male )')
plt.xticks([1.4, 2.4], ['Inches / Pounds', 'Meters / Grams'])
plt.savefig('genders.pdf')
# Q2: what are the credible weights for a short woman?
# Gender and height are fixed; the weight column is set to None to mark it
# as the value to predict, once per unit system.
query_2a = {'gender': 'Female', 'height_in': 60., 'weight_lb': None}
query_2b = {'gender': 'Female', 'height_m': 1.52, 'weight_g': None}

# NOTE(review): the second predict() argument (1000) is presumably the number
# of samples drawn for the predictive distribution -- confirm with the
# Veritable client docs.
preds_2a = analysis_inches_pounds.predict(query_2a, 1000)
# A single formatted string prints the same text on Python 2 and 3;
# parenthesising the two-argument print statement would print a tuple on 2.
print('pounds: {}'.format(preds_2a.credible_values('weight_lb')))
preds_2b = analysis_meters_grams.predict(query_2b, 1000)
print('grams: {}'.format(preds_2b.credible_values('weight_g')))

# Two stacked histograms of the predicted weights, one per unit system.
# density=True replaces the normed=True keyword, which current matplotlib
# releases no longer accept. The y ticks are hidden in both panels.
plt.figure()

plt.subplot(2, 1, 1)
plt.hist(
    [p['weight_lb'] for p in preds_2a.distribution],
    bins=20, density=True)
plt.xlabel('weight (pounds)')
plt.ylabel('Frequency')
plt.yticks([])

plt.subplot(2, 1, 2)
plt.hist(
    [p['weight_g'] for p in preds_2b.distribution],
    bins=20, density=True)
plt.xlabel('weight (grams)')
plt.ylabel('Frequency')
plt.yticks([])

plt.savefig('weights.pdf')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment