# dbiswa4/em.py

## em.py
# Candy2.py
# by ___________
# based on skeleton code by D. Crandall, 11/2016
#
# A candy manufacturer makes 5 different types of candy bags, each of which
# is filled with lime and cherry candies but in different proportions.
#
# We've bought 100 bags chosen at random. For each bag, we've then opened
# it, randomly drawn 100 candies, and recorded the flavor of each one.
#
# We want to estimate (1) what's the actual percentage of cherry candies
# in each of the 5 bag types, and (2) what's the actual bag type of each
# of our 100 bags?

import random
import numpy
import math
from copy import deepcopy

#####
# You shouldn't have to modify this part

# These are the *actual* values of C0 ... C4 we're trying to estimate.
# Shh... They're a secret! :) This is what we're trying to estimate.
# Ground truth: the number of bag types and the cherry fraction of each type.
bagtype_count = 5
actual_cs = (0.2, 0.3, 0.7, 0.9, 1.0)

# Pick the hidden bag type of every purchased bag uniformly at random.
bag_count = 100
actual_bagtypes = [random.randrange(bagtype_count) for _ in range(bag_count)]

# Draw candy_count candies from every bag. Each Bernoulli draw of 1 is
# recorded as a cherry ("C") and each 0 as a lime ("L"), giving one list of
# flavors per bag.
candy_count = 100
observations = [
    ["C" if draw else "L"
     for draw in numpy.random.binomial(1, actual_cs[bagtype], candy_count)]
    for bagtype in actual_bagtypes
]

######
# This is the part you'll want to edit

# Placeholders for the answers: one cherry-fraction estimate (C0 ... C4) per
# bag type, and one bag-type label per purchased bag. Both start at zero.
estimated_cs = [0 for _ in range(bagtype_count)]
estimated_bagtypes = [0 for _ in range(bag_count)]

# Here's pseudocode for what you should implement:
#
#
# Run EM multiple times:
#
#   Randomly initialize estimated_cs
#   Until estimated probabilities converge:
#
#      # E-step
#      For each sampled bag:
#          Calculate probability of the data given each model, i.e. of the observed candies in this bag assuming each of the 5 bagtypes
#          Put the highest-probability bagtype into estimated_bagtypes[bag]
#
#      # M-step
#      For each bagtype:
#          Estimate probability c_i for this bagtype i using the bags currently assigned to this bagtype in estimated_bagtypes
#          Update estimated_cs[bagtype]
#
#   Calculate probability (or log-probability) of the data given the final values of estimated_cs
#
# Select the model with the highest probability of the data given estimated_cs

# Starting state for EM.
# NOTE(review): j and count are not read anywhere in the visible script;
# they are kept only so the module's names stay unchanged.
j=0
# Estimates from the previous iteration; all zeros before the first one, so
# the first convergence check compares the random start against zero.
previous_estimated_cs = [0] * bagtype_count
# One random initial cherry fraction per bag type. The vector is sized from
# bagtype_count (not a literal 5) so it stays in step with the other
# per-bag-type lists.
estimated_cs = numpy.random.uniform(0.0, 1.0, bagtype_count).tolist()

count = 0
def convergeIsNotTrue():
    """Report whether the EM estimates are still changing.

    Reads the module-level ``estimated_cs`` and ``previous_estimated_cs``,
    prints both together with the largest element-wise absolute difference
    between them, and returns True while that difference exceeds 0.005
    (i.e. while another iteration is needed), False otherwise.
    """
    print("convergeIsNotTrue ()")
    print("previous_estimated_cs : ", previous_estimated_cs)
    print("estimated_cs : ", estimated_cs)
    deltas = [
        abs(current - previous)
        for current, previous in zip(estimated_cs, previous_estimated_cs)
    ]
    improvement = max(deltas)
    print ("improvement:         ", improvement)
    return bool(improvement > 0.005)
# estimated_cs.append((numpy.random.uniform(0.0,1.0,5).tolist()))

# Hard-assignment EM loop: label every bag with its closest bag type (E-step),
# then re-estimate each type's cherry fraction from the bags carrying that
# label (M-step), repeating until convergeIsNotTrue() reports that the
# estimates have stopped moving.
while convergeIsNotTrue():
    # Number of cherries observed in each bag; reused by both steps.
    cherry_counts = [candies.count('C') for candies in observations]

    # E-step: assign each bag to the type whose estimated cherry fraction is
    # nearest to the fraction observed in that bag (a simple stand-in for
    # picking the highest-likelihood type).
    for ind, observation in enumerate(observations):
        # Normalize by the candies drawn from *this* bag, not by the number
        # of bags.
        cherry_prob = float(cherry_counts[ind]) / float(len(observation))
        # min() returns the first of several equally close types.
        estimated_bagtypes[ind] = min(
            range(len(estimated_cs)),
            key=lambda bagtype: abs(estimated_cs[bagtype] - cherry_prob),
        )

    print("estimated_bagtypes : ", estimated_bagtypes)
    # Remember the estimates this iteration started from so the next
    # convergence check can measure how far the M-step moved them.
    previous_estimated_cs = deepcopy(estimated_cs)
    print("In while:")
    print("estimated_cs : ", estimated_cs)
    print("previous_estimated_cs : ", previous_estimated_cs)

    # M-step: a type's new estimate is the share of cherries among all the
    # candies of the bags currently assigned to it.
    for bag in range(bagtype_count):
        print("bag : ", bag)
        assigned = [pos for pos, label in enumerate(estimated_bagtypes) if label == bag]
        cherries = sum(cherry_counts[pos] for pos in assigned)
        candies_seen = sum(len(observations[pos]) for pos in assigned)
        if candies_seen:
            estimated_cs[bag] = float(cherries) / float(candies_seen)
        # Otherwise no bag was assigned to this type in this iteration: keep
        # its previous estimate instead of dividing by zero.
        #print("estimated_cs : ", estimated_cs)


######
# You shouldn't have to modify this part -- it just spits out the results.

# Sort the estimated probabilities so they coincide with the actual ones
#
# Pair every estimate with its original bag-type index and order the pairs by
# estimate (ties fall back to the index).
sorted_cs = sorted(zip(estimated_cs, range(len(estimated_cs))))
estimated_cs = [estimate for estimate, _ in sorted_cs]

# index_remapper[old] is the position of bag type `old` in the sorted order,
# so relabelling through it lines the estimated labels up with actual_cs.
index_remapper = [0] * bagtype_count
for i in range(bagtype_count):
    _, original_index = sorted_cs[i]
    index_remapper[original_index] = i
estimated_bagtypes = [index_remapper[label] for label in estimated_bagtypes]

print ("Actual C's:         ", actual_cs)
print ("Estimated C's:      ", estimated_cs)
print ("Actual bagtypes:    ", actual_bagtypes)
print ("Estimated bagtypes: ", estimated_bagtypes)
# Count the bags whose remapped label matches the hidden true type.
print ("Correctly estimated bags: ", sum(actual_bagtypes[pos] == label for pos, label in enumerate(estimated_bagtypes)))
	# Candy2.py
	# by ___________
	# based on skeleton code by D. Crandall, 11/2016
	#
	# A candy manufacturer makes 5 different types of candy bags, each of which
	# is filled with lime and cherry candies but in different proportions.
	#
	# We've bought 100 bags chosen at random. For each bag, we've then opened
	# it, randomly drawn 100 candies, and recorded the flavor of each one.
	#
	# We want to estimate (1) what's the actual percentage of cherry candies
	# in each of the 5 bag types, and (2) what's the actual bag type of each
	# of our 100 bags?

	import random
	import numpy
	import math
	from copy import deepcopy

	#####
	# You shouldn't have to modify this part

	# These are the actual values of C0 ... C4 we're trying to estimate.
	# Shh... They're a secret! :) This is what we're trying to estimate.
	# Ground truth: the number of bag types and the cherry fraction of each type.
	bagtype_count = 5
	actual_cs = (0.2, 0.3, 0.7, 0.9, 1.0)

	# Pick the hidden bag type of every purchased bag uniformly at random.
	bag_count = 100
	actual_bagtypes = [random.randrange(bagtype_count) for _ in range(bag_count)]

	# Draw candy_count candies from every bag. Each Bernoulli draw of 1 is
	# recorded as a cherry ("C") and each 0 as a lime ("L"), giving one list of
	# flavors per bag.
	candy_count = 100
	observations = [
	    ["C" if draw else "L"
	     for draw in numpy.random.binomial(1, actual_cs[bagtype], candy_count)]
	    for bagtype in actual_bagtypes
	]

	######
	# This is the part you'll want to edit

	# Placeholders for the answers: one cherry-fraction estimate (C0 ... C4) per
	# bag type, and one bag-type label per purchased bag. Both start at zero.
	estimated_cs = [0 for _ in range(bagtype_count)]
	estimated_bagtypes = [0 for _ in range(bag_count)]

	# Here's pseudocode for what you should implement:
	#
	#
	# Run EM multiple times:
	#
	#   Randomly initialize estimated_cs
	#   Until estimated probabilities converge:
	#
	#      # E-step
	#      For each sampled bag:
	#          Calculate probability of the data given each model, i.e. of the observed candies in this bag assuming each of the 5 bagtypes
	#          Put the highest-probability bagtype into estimated_bagtypes[bag]
	#
	#      # M-step
	#      For each bagtype:
	#          Estimate probability c_i for this bagtype i using the bags currently assigned to this bagtype in estimated_bagtypes
	#          Update estimated_cs[bagtype]
	#
	#   Calculate probability (or log-probability) of the data given the final values of estimated_cs
	#
	# Select the model with the highest probability of the data given estimated_cs

	# Starting state for EM.
	# NOTE(review): j and count are not read anywhere in the visible script;
	# they are kept only so the module's names stay unchanged.
	j=0
	# Estimates from the previous iteration; all zeros before the first one, so
	# the first convergence check compares the random start against zero.
	previous_estimated_cs = [0] * bagtype_count
	# One random initial cherry fraction per bag type. The vector is sized from
	# bagtype_count (not a literal 5) so it stays in step with the other
	# per-bag-type lists.
	estimated_cs = numpy.random.uniform(0.0, 1.0, bagtype_count).tolist()

	count = 0
	def convergeIsNotTrue():
	    """Report whether the EM estimates are still changing.

	    Reads the module-level ``estimated_cs`` and ``previous_estimated_cs``,
	    prints both together with the largest element-wise absolute difference
	    between them, and returns True while that difference exceeds 0.005
	    (i.e. while another iteration is needed), False otherwise.
	    """
	    print("convergeIsNotTrue ()")
	    print("previous_estimated_cs : ", previous_estimated_cs)
	    print("estimated_cs : ", estimated_cs)
	    deltas = [
	        abs(current - previous)
	        for current, previous in zip(estimated_cs, previous_estimated_cs)
	    ]
	    improvement = max(deltas)
	    print ("improvement: ", improvement)
	    return bool(improvement > 0.005)
	# estimated_cs.append((numpy.random.uniform(0.0,1.0,5).tolist()))

	# Hard-assignment EM loop: label every bag with its closest bag type (E-step),
	# then re-estimate each type's cherry fraction from the bags carrying that
	# label (M-step), repeating until convergeIsNotTrue() reports that the
	# estimates have stopped moving.
	while convergeIsNotTrue():
	    # Number of cherries observed in each bag; reused by both steps.
	    cherry_counts = [candies.count('C') for candies in observations]

	    # E-step: assign each bag to the type whose estimated cherry fraction is
	    # nearest to the fraction observed in that bag (a simple stand-in for
	    # picking the highest-likelihood type).
	    for ind, observation in enumerate(observations):
	        # Normalize by the candies drawn from *this* bag, not by the number
	        # of bags.
	        cherry_prob = float(cherry_counts[ind]) / float(len(observation))
	        # min() returns the first of several equally close types.
	        estimated_bagtypes[ind] = min(
	            range(len(estimated_cs)),
	            key=lambda bagtype: abs(estimated_cs[bagtype] - cherry_prob),
	        )

	    print("estimated_bagtypes : ", estimated_bagtypes)
	    # Remember the estimates this iteration started from so the next
	    # convergence check can measure how far the M-step moved them.
	    previous_estimated_cs = deepcopy(estimated_cs)
	    print("In while:")
	    print("estimated_cs : ", estimated_cs)
	    print("previous_estimated_cs : ", previous_estimated_cs)

	    # M-step: a type's new estimate is the share of cherries among all the
	    # candies of the bags currently assigned to it.
	    for bag in range(bagtype_count):
	        print("bag : ", bag)
	        assigned = [pos for pos, label in enumerate(estimated_bagtypes) if label == bag]
	        cherries = sum(cherry_counts[pos] for pos in assigned)
	        candies_seen = sum(len(observations[pos]) for pos in assigned)
	        if candies_seen:
	            estimated_cs[bag] = float(cherries) / float(candies_seen)
	        # Otherwise no bag was assigned to this type in this iteration: keep
	        # its previous estimate instead of dividing by zero.
	#print("estimated_cs : ", estimated_cs)


	######
	# You shouldn't have to modify this part -- it just spits out the results.

	# Sort the estimated probabilities so they coincide with the actual ones
	#
	# Pair every estimate with its original bag-type index and order the pairs by
	# estimate (ties fall back to the index).
	sorted_cs = sorted((e, i) for i, e in enumerate(estimated_cs))
	estimated_cs = [v[0] for v in sorted_cs]

	# index_remapper[old] is the position of bag type `old` in the sorted order,
	# so relabelling through it lines the estimated labels up with actual_cs.
	index_remapper = [0] * bagtype_count
	for i in range(0, bagtype_count):
	    index_remapper[sorted_cs[i][1]] = i
	estimated_bagtypes = [index_remapper[bagtype] for bagtype in estimated_bagtypes]

	print ("Actual C's: ", actual_cs)
	print ("Estimated C's: ", estimated_cs)
	print ("Actual bagtypes: ", actual_bagtypes)
	print ("Estimated bagtypes: ", estimated_bagtypes)
	# Count the bags whose remapped label matches the hidden true type.
	print ("Correctly estimated bags: ", sum( [ actual_bagtypes[i] == estimated_bagtypes[i] for i in range(0, len(estimated_bagtypes) ) ] ))