beaucronin/gist:3094654

## gistfile1.py
import veritable
import csv
import matplotlib.pyplot as plt

# Demo script: build the same height/weight/gender dataset in two unit systems
# (inches/pounds and meters/grams), run a Veritable analysis on each, and
# compare the predictions to show that the scale of the variables does not
# matter. Written so that it runs unchanged on Python 2.7 and Python 3.

# Load the csv and read into a Veritable dataset using inches and pounds.
# Columns are read positionally: 0 -> gender, 1 -> height, 2 -> weight.
print('Reading data from file')
data_inches_pounds = []
with open('heights_weights_genders.csv') as fd:
    rd = csv.reader(fd)
    next(rd)  # skip the header (builtin next() works on Python 2.6+ and 3)
    for i, line in enumerate(rd):
        data_inches_pounds.append({
            # Zero-padded row id derived from the row position, e.g. R_00042
            '_id': 'R_{:05}'.format(i),
            'gender': line[0],
            'height_in': float(line[1]),
            'weight_lb': float(line[2])
        })
schema_inches_pounds = {
    'gender': { 'type': 'categorical' },
    'height_in': { 'type': 'real' },
    'weight_lb': { 'type': 'real' }
}

# Convert to a second dataset using meters and grams. Row ids and genders are
# carried over unchanged so the two datasets describe the same rows.
print('Creating second dataset')
data_meters_grams = [{
        '_id': row['_id'],
        'gender': row['gender'],
        'height_m': row['height_in'] * .0254,  # meters per inch
        'weight_g': row['weight_lb'] * 453.6   # grams per pound (rounded)
    } for row in data_inches_pounds]
schema_meters_grams = {
    'gender': { 'type': 'categorical' },
    'height_m': { 'type': 'real' },
    'weight_g': { 'type': 'real' }
}

# Analyze both datasets
api = veritable.connect()

print('Uploading data')
table_inches_pounds = api.create_table('inches_pounds', force=True)
table_inches_pounds.batch_upload_rows(data_inches_pounds)
analysis_inches_pounds = table_inches_pounds.create_analysis(
    schema_inches_pounds, 'inches_pounds')

table_meters_grams = api.create_table('meters_grams', force=True)
table_meters_grams.batch_upload_rows(data_meters_grams)
analysis_meters_grams = table_meters_grams.create_analysis(
    schema_meters_grams, 'meters_grams')

# Both analyses are created before either wait() call is made.
print('Analyzing')
analysis_inches_pounds.wait()
analysis_meters_grams.wait()

# Make some predictions that demonstrate that Veritable can handle variables
# of widely different scales
print('Making predictions')

# Q1: what gender are tall people? (6' 6", or almost 2 meters)
# A value of None marks the field to be predicted.
query_1a = { 'height_in': 78., 'gender': None }
query_1b = { 'height_m': 1.98, 'gender': None }
preds_1a = analysis_inches_pounds.predict(query_1a)
print(preds_1a.credible_values('gender'))
preds_1b = analysis_meters_grams.predict(query_1b)
print(preds_1b.credible_values('gender'))

plt.figure()
# align='edge' is explicit because matplotlib 2.0 changed the default bar
# alignment to 'center'; the tick positions below (1.4, 2.4) assume bars that
# start at x=1 and x=2 with the default width of 0.8.
plt.bar(
    [1, 2],
    [preds_1a.credible_values('gender')['Male'], preds_1b.credible_values('gender')['Male']],
    align='edge')
plt.xlim([.8, 3.0])
plt.ylim([0., 1.])
plt.ylabel('Prob( Gender = Male )')
plt.xticks([1.4, 2.4] , ['Inches / Pounds', 'Meters / Grams'])
plt.savefig('genders.pdf')

# Q2: what are the credible weights for a short woman?
query_2a = { 'gender': 'Female', 'height_in': 60., 'weight_lb': None }
query_2b = { 'gender': 'Female', 'height_m': 1.52, 'weight_g': None }
preds_2a = analysis_inches_pounds.predict(query_2a, 1000)
# str.format keeps the output identical on Python 2 and 3 (a parenthesised
# multi-argument print would show a tuple on Python 2).
print('pounds: {}'.format(preds_2a.credible_values('weight_lb')))
preds_2b = analysis_meters_grams.predict(query_2b, 1000)
print('grams: {}'.format(preds_2b.credible_values('weight_g')))

plt.figure()
plt.subplot(2,1,1)
# density=True replaces normed=True, which was deprecated in matplotlib 2.1
# and removed in 3.1.
plt.hist([p['weight_lb'] for p in preds_2a.distribution], bins=20, density=True)
plt.xlabel('weight (pounds)')
plt.ylabel('Frequency')
plt.yticks([])
plt.subplot(2,1,2)
plt.hist([p['weight_g'] for p in preds_2b.distribution], bins=20, density=True)
plt.xlabel('weight (grams)')
plt.ylabel('Frequency')
plt.yticks([])
plt.savefig('weights.pdf')
	import veritable
	import csv
	import matplotlib.pyplot as plt

	# Demo script: build the same height/weight/gender dataset in two unit
	# systems (inches/pounds and meters/grams), run a Veritable analysis on
	# each, and compare the predictions to show that the scale of the variables
	# does not matter. Block indentation has been restored, and the code is
	# written so that it runs on Python 2.7 and Python 3.

	# Load the csv and read into a Veritable dataset using inches and pounds.
	# Columns are read positionally: 0 -> gender, 1 -> height, 2 -> weight.
	print('Reading data from file')
	data_inches_pounds = []
	with open('heights_weights_genders.csv') as fd:
	    rd = csv.reader(fd)
	    next(rd)  # skip the header (builtin next() works on Python 2.6+ and 3)
	    for i, line in enumerate(rd):
	        data_inches_pounds.append({
	            # Zero-padded row id derived from the row position, e.g. R_00042
	            '_id': 'R_{:05}'.format(i),
	            'gender': line[0],
	            'height_in': float(line[1]),
	            'weight_lb': float(line[2])
	        })
	schema_inches_pounds = {
	    'gender': { 'type': 'categorical' },
	    'height_in': { 'type': 'real' },
	    'weight_lb': { 'type': 'real' }
	}

	# Convert to a second dataset using meters and grams. Row ids and genders
	# are carried over unchanged so the two datasets describe the same rows.
	print('Creating second dataset')
	data_meters_grams = [{
	        '_id': row['_id'],
	        'gender': row['gender'],
	        'height_m': row['height_in'] * .0254,  # meters per inch
	        'weight_g': row['weight_lb'] * 453.6   # grams per pound (rounded)
	    } for row in data_inches_pounds]
	schema_meters_grams = {
	    'gender': { 'type': 'categorical' },
	    'height_m': { 'type': 'real' },
	    'weight_g': { 'type': 'real' }
	}

	# Analyze both datasets
	api = veritable.connect()

	print('Uploading data')
	table_inches_pounds = api.create_table('inches_pounds', force=True)
	table_inches_pounds.batch_upload_rows(data_inches_pounds)
	analysis_inches_pounds = table_inches_pounds.create_analysis(
	    schema_inches_pounds, 'inches_pounds')

	table_meters_grams = api.create_table('meters_grams', force=True)
	table_meters_grams.batch_upload_rows(data_meters_grams)
	analysis_meters_grams = table_meters_grams.create_analysis(
	    schema_meters_grams, 'meters_grams')

	# Both analyses are created before either wait() call is made.
	print('Analyzing')
	analysis_inches_pounds.wait()
	analysis_meters_grams.wait()

	# Make some predictions that demonstrate that Veritable can handle variables
	# of widely different scales
	print('Making predictions')

	# Q1: what gender are tall people? (6' 6", or almost 2 meters)
	# A value of None marks the field to be predicted.
	query_1a = { 'height_in': 78., 'gender': None }
	query_1b = { 'height_m': 1.98, 'gender': None }
	preds_1a = analysis_inches_pounds.predict(query_1a)
	print(preds_1a.credible_values('gender'))
	preds_1b = analysis_meters_grams.predict(query_1b)
	print(preds_1b.credible_values('gender'))

	plt.figure()
	# align='edge' is explicit because matplotlib 2.0 changed the default bar
	# alignment to 'center'; the tick positions below (1.4, 2.4) assume bars
	# that start at x=1 and x=2 with the default width of 0.8.
	plt.bar(
	    [1, 2],
	    [preds_1a.credible_values('gender')['Male'], preds_1b.credible_values('gender')['Male']],
	    align='edge')
	plt.xlim([.8, 3.0])
	plt.ylim([0., 1.])
	plt.ylabel('Prob( Gender = Male )')
	plt.xticks([1.4, 2.4] , ['Inches / Pounds', 'Meters / Grams'])
	plt.savefig('genders.pdf')

	# Q2: what are the credible weights for a short woman?
	query_2a = { 'gender': 'Female', 'height_in': 60., 'weight_lb': None }
	query_2b = { 'gender': 'Female', 'height_m': 1.52, 'weight_g': None }
	preds_2a = analysis_inches_pounds.predict(query_2a, 1000)
	# str.format keeps the output identical on Python 2 and 3 (a parenthesised
	# multi-argument print would show a tuple on Python 2).
	print('pounds: {}'.format(preds_2a.credible_values('weight_lb')))
	preds_2b = analysis_meters_grams.predict(query_2b, 1000)
	print('grams: {}'.format(preds_2b.credible_values('weight_g')))

	plt.figure()
	plt.subplot(2,1,1)
	# density=True replaces normed=True, which was deprecated in matplotlib 2.1
	# and removed in 3.1.
	plt.hist([p['weight_lb'] for p in preds_2a.distribution], bins=20, density=True)
	plt.xlabel('weight (pounds)')
	plt.ylabel('Frequency')
	plt.yticks([])
	plt.subplot(2,1,2)
	plt.hist([p['weight_g'] for p in preds_2b.distribution], bins=20, density=True)
	plt.xlabel('weight (grams)')
	plt.ylabel('Frequency')
	plt.yticks([])
	plt.savefig('weights.pdf')