tingletech/README.md

## README.md

      
    Raw
  

              README.md
            
          
    User Survey Analysis

http://code.google.com/p/json4lib/source/browse/#hg%2Fsurvey
http://stats.stackexchange.com/questions/39319/bayesian-user-survey-with-a-credible-interval
https://gist.github.com/3896501
http://en.wikipedia.org/wiki/Monte_Carlo_method
http://www.resample.com/
http://tingletech.tumblr.com/tagged/pop-up-survey
OS X install notes

use virtualenv
brew is telling me "10.7.5 Xcode (4.4.1) is outdated Please install Xcode 4.5." but I can't be bothered
pip install numpy
brew install freetype
export CPPFLAGS="-I/usr/X11/include/freetype2 -I/usr/X11/include -I/usr/local/opt/freetype/include"
export LDFLAGS="-L/usr/X11/lib -lfreetype -lz -lbz2"
pip install matplotlib
License

Copyright © 2012, Regents of the University of California
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
Neither the name of the University of California nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

  
## figure.png

      
    Raw
  

              figure.png
            
          
## figure10percent.png

      
    Raw
  

              figure10percent.png
            
          
## survey_analysis.py
#!/usr/bin/env python

"""
plot user survey results and estimate bayesian credible interval using monte carlo method

uses 2.5% and 97.5% percentiles

data is hard coded for one survey right now

"""

import sys
from numpy.random.mtrand import dirichlet
from numpy import array, sum, percentile
from pprint import pprint
import numpy as np

def main(argv=None):
    # need to refactor the data out
    categories = [
        'k-12 teacher or librarian',
        'k-12 student',
        'college or graduate student',
        'faculty or academic researcher',
        'archivist or librarian',
        'genealogist or family researcher',
        'other'
    ]
    observations_oac = [ 65,  71, 532, 307, 369, 234, 584]
    observations_cs  = [133, 280, 445,  82,  55,  68, 220]

    # calculate the statistics
    estimates_oac, error_bars_oac = monte_carlo(observations_oac)
    estimates_cs, error_bars_cs = monte_carlo(observations_cs)

    # http://matplotlib.org/examples/api/barchart_demo.html
    # inner function we will need for labels
    def autolabel(rects):
        # attach some text labels
        for rect in rects:
            height = rect.get_height()
            height_format = "{:.0%}".format(height)
            plt.text(rect.get_x()+rect.get_width()/2., height + 0.025, height_format,
                    ha='center', va='bottom')

    # okay, now we are ready to get our plot on
    import matplotlib
    # The backend must be set BEFORE importing matplotlib.pyplot
    # matplotlib.use('Agg') # Generates image.
    import matplotlib.pyplot as plt

    # x axis index
    x = np.arange(1, len(categories)+1)

    plt.figure(1)
    plt.subplot(211)
    plt.title("Answer that best describes yourself: OAC N=" + str(sum(observations_oac)))
    plt.xticks(x, [label.replace(" ","\n") for label in categories ])
    plt.ylim((0,0.5))
    rects1 = plt.bar(x, estimates_oac, 0.8, yerr=error_bars_oac, facecolor='none', align='center', ecolor="k" )
    autolabel(rects1)

    plt.subplot(212)
    plt.title("Answer that best describes yourself: Calisphere N=" + str(sum(observations_cs)))
    plt.xticks(x, [label.replace(" ","\n") for label in categories ])
    plt.ylim((0,0.5))
    rects2 = plt.bar(x, estimates_cs, 0.8, yerr=error_bars_oac, facecolor='none', align='center', ecolor="k" )
    autolabel(rects2)

    plt.tight_layout()
    plt.savefig("figure.png", dpi = (100))
    # http://workshops.arl.arizona.edu/python1/python_modules/matplotlib_example.py
    return 0

def monte_carlo(observations):
    """ takes a standard array of observations and returns a 2 dimensional numpy array of simulation results
    """
    ITERATIONS = 1000
    results = []
    for n in xrange(ITERATIONS):
        results.append( dirichlet([x+1 for x in observations]) )
    # convert results to a numby array for better indexing/sums
    results = array(results)
    # summarize the results
    estimates = sum(results, axis=0) / ITERATIONS
    # "You need 2*N, not N*2 arrays here."
    lows = []
    highs = []
    # calculate credible interval
    for k in xrange(len(estimates)):
        q025 = percentile(results[:,k],  2.5)
        q975 = percentile(results[:,k], 97.5)
        # http://www.mailinglistarchive.com/html/matplotlib-users@lists.sourceforge.net/2009-09/msg00181.html
        lows.append( estimates[k] - q025)
        highs.append( q975 - estimates[k])
    return estimates, [ lows, highs ]

if __name__ == '__main__':
    status = main()
    sys.exit(status)
	#!/usr/bin/env python

	"""
	plot user survey results and estimate bayesian credible interval using monte carlo method

	uses 2.5% and 97.5% percentiles

	data is hard coded for one survey right now

	"""

	import sys
	from numpy.random.mtrand import dirichlet
	from numpy import array, sum, percentile
	from pprint import pprint
	import numpy as np

	def main(argv=None):
	# need to refactor the data out
	categories = [
	'k-12 teacher or librarian',
	'k-12 student',
	'college or graduate student',
	'faculty or academic researcher',
	'archivist or librarian',
	'genealogist or family researcher',
	'other'
	]
	observations_oac = [ 65, 71, 532, 307, 369, 234, 584]
	observations_cs = [133, 280, 445, 82, 55, 68, 220]

	# calculate the statistics
	estimates_oac, error_bars_oac = monte_carlo(observations_oac)
	estimates_cs, error_bars_cs = monte_carlo(observations_cs)

	# http://matplotlib.org/examples/api/barchart_demo.html
	# inner function we will need for labels
	def autolabel(rects):
	# attach some text labels
	for rect in rects:
	height = rect.get_height()
	height_format = "{:.0%}".format(height)
	plt.text(rect.get_x()+rect.get_width()/2., height + 0.025, height_format,
	ha='center', va='bottom')

	# okay, now we are ready to get our plot on
	import matplotlib
	# The backend must be set BEFORE importing matplotlib.pyplot
	# matplotlib.use('Agg') # Generates image.
	import matplotlib.pyplot as plt

	# x axis index
	x = np.arange(1, len(categories)+1)

	plt.figure(1)
	plt.subplot(211)
	plt.title("Answer that best describes yourself: OAC N=" + str(sum(observations_oac)))
	plt.xticks(x, [label.replace(" ","\n") for label in categories ])
	plt.ylim((0,0.5))
	rects1 = plt.bar(x, estimates_oac, 0.8, yerr=error_bars_oac, facecolor='none', align='center', ecolor="k" )
	autolabel(rects1)

	plt.subplot(212)
	plt.title("Answer that best describes yourself: Calisphere N=" + str(sum(observations_cs)))
	plt.xticks(x, [label.replace(" ","\n") for label in categories ])
	plt.ylim((0,0.5))
	rects2 = plt.bar(x, estimates_cs, 0.8, yerr=error_bars_oac, facecolor='none', align='center', ecolor="k" )
	autolabel(rects2)

	plt.tight_layout()
	plt.savefig("figure.png", dpi = (100))
	# http://workshops.arl.arizona.edu/python1/python_modules/matplotlib_example.py
	return 0

	def monte_carlo(observations):
	""" takes a standard array of observations and returns a 2 dimensional numpy array of simulation results
	"""
	ITERATIONS = 1000
	results = []
	for n in xrange(ITERATIONS):
	results.append( dirichlet([x+1 for x in observations]) )
	# convert results to a numby array for better indexing/sums
	results = array(results)
	# summarize the results
	estimates = sum(results, axis=0) / ITERATIONS
	# "You need 2N, not N2 arrays here."
	lows = []
	highs = []
	# calculate credible interval
	for k in xrange(len(estimates)):
	q025 = percentile(results[:,k], 2.5)
	q975 = percentile(results[:,k], 97.5)
	# http://www.mailinglistarchive.com/html/matplotlib-users@lists.sourceforge.net/2009-09/msg00181.html
	lows.append( estimates[k] - q025)
	highs.append( q975 - estimates[k])
	return estimates, [ lows, highs ]

	if __name__ == '__main__':
	status = main()
	sys.exit(status)