Skip to content

Instantly share code, notes, and snippets.

@piyushbansal
Created April 5, 2016 09:39
Show Gist options
  • Save piyushbansal/76399cee043f2c9ff3d65638e25c8806 to your computer and use it in GitHub Desktop.
Save piyushbansal/76399cee043f2c9ff3d65638e25c8806 to your computer and use it in GitHub Desktop.
# There are basically two functions in this gist. The first function
# get_covariance_matrix is basically a helper function.
# The other one get_mutual_information_based_best_sample is the one that is used as sampling
# strategy.
def get_covariance_matrix(matrix_a, matrix_b):
"""Takes two matrices (2 dimensional arrays) and returns covariance matrix."""
cov_matrix = []
for i,elements_a in enumerate(matrix_a):
this_row = []
for j,elements_b in enumerate(matrix_b):
this_row.append(numpy.dot(elements_a, elements_b))
cov_matrix.append(this_row)
return numpy.matrix(cov_matrix)
def get_mutual_information_based_best_sample(X, known_votes, possibilities):
"""Mutual Information based sampling strategy."""
# X is a 2 dimensional array, with each element being a 100 dimensional document vector (array).
X = X.toarray()
current_vectors, non_current_vectors = [], []
#Picking up document numbers which haven;t been sampled before.
possibilities = set([x[0] for x in possibilities])
#current_vectors now has collection of document vectors of
#documents which have been sampled, and non_current_vectors
#has collection of document vectors of documents which are
#still not sampled for crowdsourcing.
for i, sample in enumerate(known_votes):
if i not in possibilities and len(sample) > 0:
current_vectors.append(X[i])
else:
non_current_vectors.append(X[i])
mutual_information = []
covariance_matrix = get_covariance_matrix(X, X)
#Begin argmaxing as outlined in Krause's paper.
for document_idx in possibilities:
#Since the kernal is linear, sigma_square_y just refers to K(Y,Y)
#where K is dot-product.
sigma_square_y = covariance_matrix.item((document_idx, document_idx))
#K(Y,A) where A refers to set of document vectors which have been sampled a.k.a current_vectors
sigma_y_a = get_covariance_matrix([X[document_idx]], current_vectors)
#K(A,A)
sigma_a_a = get_covariance_matrix(current_vectors, current_vectors)
#K(A,A)^(-1)
inv_sigma_a_a = numpy.matrix(numpy.linalg.inv(sigma_a_a))
#K(A,Y)
sigma_a_y = get_covariance_matrix(current_vectors, [X[document_idx]])
numerator = sigma_square_y - ((sigma_y_a * inv_sigma_a_a) * sigma_a_y)
#A_bar refers to set of non_current_vectors also excluding
#current document_idx (or also referred to as Y here)
a_bar = numpy.array(filter(lambda x: repr(x)!= repr(X[document_idx]), non_current_vectors))
#K(Y,A_bar)
sigma_y_a_bar = get_covariance_matrix([X[document_idx]], a_bar)
#K(A_bar,A_bar)
sigma_a_bar_a_bar = get_covariance_matrix(a_bar, a_bar)
#K(A_bar,A_bar)^(-1)
inv_sigma_a_bar_a_bar = numpy.linalg.inv(sigma_a_bar_a_bar)
#K(A_bar,Y)
sigma_a_bar_y = get_covariance_matrix(a_bar, [X[document_idx]])
denominator = sigma_square_y - ((sigma_y_a_bar * inv_sigma_a_bar_a_bar) * sigma_a_bar_y)
mutual_information.append((document_idx, numerator.item(0)/denominator.item(0)))
#Pick the best document_idx having argmax mutual_information
sorted_mutual_information = sorted(mutual_information, key=lambda x: x[1], reverse=True)
return sorted_mutual_information[0][0]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment