Skip to content

Instantly share code, notes, and snippets.

@beaucronin
Created July 12, 2012 00:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save beaucronin/3094654 to your computer and use it in GitHub Desktop.
Save beaucronin/3094654 to your computer and use it in GitHub Desktop.
Veritable code to analyze heights and weights with different units
import veritable
import csv
import matplotlib.pyplot as plt
# Load the csv and read into a Veritable dataset using inches and pounds.
# Each data row is read positionally: column 0 -> gender, column 1 -> height
# (inches), column 2 -> weight (pounds).
print('Reading data from file')
with open('heights_weights_genders.csv') as fd:
    rd = csv.reader(fd)
    # Skip the header. The builtin next() works on Python 2.6+ and 3, and the
    # default argument keeps an empty file from raising StopIteration.
    next(rd, None)
    data_inches_pounds = [
        {
            # Synthetic, zero-padded row identifier: R_00000, R_00001, ...
            '_id': 'R_{:05}'.format(i),
            'gender': line[0],
            'height_in': float(line[1]),
            'weight_lb': float(line[2]),
        }
        for i, line in enumerate(rd)
    ]

# Column types passed to create_analysis() for the inches/pounds dataset.
schema_inches_pounds = {
    'gender': {'type': 'categorical'},
    'height_in': {'type': 'real'},
    'weight_lb': {'type': 'real'},
}
# Convert to a second dataset using meters and grams.
# Row ids and genders are carried over unchanged; only the two numeric
# columns are rescaled and renamed.
print('Creating second dataset')
data_meters_grams = []
for record in data_inches_pounds:
    converted = {'_id': record['_id'], 'gender': record['gender']}
    converted['height_m'] = record['height_in'] * .0254  # inches -> meters
    converted['weight_g'] = record['weight_lb'] * 453.6  # pounds -> grams
    data_meters_grams.append(converted)

# Column types passed to create_analysis() for the meters/grams dataset.
schema_meters_grams = dict(
    gender={'type': 'categorical'},
    height_m={'type': 'real'},
    weight_g={'type': 'real'},
)
# Analyze both datasets
def _upload_and_analyze(client, name, rows, schema):
    """Create table `name`, upload `rows`, and start an analysis on it.

    The analysis is given the same name as the table. Returns the
    ``(table, analysis)`` pair.
    """
    # NOTE(review): force=True presumably replaces an existing table with the
    # same name -- confirm against the Veritable client documentation.
    table = client.create_table(name, force=True)
    table.batch_upload_rows(rows)
    return table, table.create_analysis(schema, name)


api = veritable.connect()

print('Uploading data')
table_inches_pounds, analysis_inches_pounds = _upload_and_analyze(
    api, 'inches_pounds', data_inches_pounds, schema_inches_pounds)
table_meters_grams, analysis_meters_grams = _upload_and_analyze(
    api, 'meters_grams', data_meters_grams, schema_meters_grams)

# Both analyses are started before either is waited on.
print('Analyzing')
analysis_inches_pounds.wait()
analysis_meters_grams.wait()
# Make some predictions that demonstrate that Veritable can handle variables
# of widely different scales
print('Making predictions')

# Q1: what gender are tall people? (6' 6", or almost 2 meters)
# A None value marks the column to be predicted; the other key is the
# fixed condition.
query_1a = {'height_in': 78., 'gender': None}
query_1b = {'height_m': 1.98, 'gender': None}

preds_1a = analysis_inches_pounds.predict(query_1a)
gender_1a = preds_1a.credible_values('gender')
print(gender_1a)
preds_1b = analysis_meters_grams.predict(query_1b)
gender_1b = preds_1b.credible_values('gender')
print(gender_1b)

# One bar per dataset showing the value reported for 'Male'.
# NOTE(review): assumes credible_values('gender') is a mapping that contains
# a 'Male' key for these queries -- confirm against the Veritable client.
plt.figure()
plt.bar(
    [1, 2],
    [gender_1a['Male'], gender_1b['Male']],
    # The tick positions below (1.4, 2.4) are the centres of 0.8-wide bars
    # whose LEFT edges sit at 1 and 2. matplotlib >= 2.0 centres bars on x by
    # default, so request edge alignment explicitly to keep the labels under
    # the bars on every version.
    align='edge')
plt.xlim([.8, 3.0])
plt.ylim([0., 1.])
plt.ylabel('Prob( Gender = Male )')
plt.xticks([1.4, 2.4], ['Inches / Pounds', 'Meters / Grams'])
plt.savefig('genders.pdf')
# Q2: what are the credible weights for a short woman?
# A None value marks the column to be predicted; the other keys are the
# fixed conditions.
query_2a = {'gender': 'Female', 'height_in': 60., 'weight_lb': None}
query_2b = {'gender': 'Female', 'height_m': 1.52, 'weight_g': None}

# NOTE(review): the second positional argument (1000) is presumably the
# number of predictive samples to draw -- confirm against the Veritable client.
preds_2a = analysis_inches_pounds.predict(query_2a, 1000)
print('pounds: ' + str(preds_2a.credible_values('weight_lb')))
preds_2b = analysis_meters_grams.predict(query_2b, 1000)
print('grams: ' + str(preds_2b.credible_values('weight_g')))

# Pull the predicted weight out of every entry of each distribution.
weights_lb = [p['weight_lb'] for p in preds_2a.distribution]
weights_g = [p['weight_g'] for p in preds_2b.distribution]

# Two stacked, normalized histograms: pounds on top, grams below.
# `density=True` replaces the `normed=True` keyword, which was deprecated in
# matplotlib 2.1 and removed in 3.1.
plt.figure()
plt.subplot(2, 1, 1)
plt.hist(weights_lb, bins=20, density=True)
plt.xlabel('weight (pounds)')
plt.ylabel('Frequency')
plt.yticks([])  # hide the y ticks on this panel
plt.subplot(2, 1, 2)
plt.hist(weights_g, bins=20, density=True)
plt.xlabel('weight (grams)')
plt.ylabel('Frequency')
plt.yticks([])
plt.savefig('weights.pdf')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment