Created
April 5, 2016 09:39
-
-
Save piyushbansal/76399cee043f2c9ff3d65638e25c8806 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# There are basically two functions in this gist. The first function | |
# get_covariance_matrix is basically a helper function. | |
# The other one get_mutual_information_based_best_sample is the one that is used as sampling | |
# strategy. | |
def get_covariance_matrix(matrix_a, matrix_b): | |
"""Takes two matrices (2 dimensional arrays) and returns covariance matrix.""" | |
cov_matrix = [] | |
for i,elements_a in enumerate(matrix_a): | |
this_row = [] | |
for j,elements_b in enumerate(matrix_b): | |
this_row.append(numpy.dot(elements_a, elements_b)) | |
cov_matrix.append(this_row) | |
return numpy.matrix(cov_matrix) | |
def get_mutual_information_based_best_sample(X, known_votes, possibilities): | |
"""Mutual Information based sampling strategy.""" | |
# X is a 2 dimensional array, with each element being a 100 dimensional document vector (array). | |
X = X.toarray() | |
current_vectors, non_current_vectors = [], [] | |
#Picking up document numbers which haven;t been sampled before. | |
possibilities = set([x[0] for x in possibilities]) | |
#current_vectors now has collection of document vectors of | |
#documents which have been sampled, and non_current_vectors | |
#has collection of document vectors of documents which are | |
#still not sampled for crowdsourcing. | |
for i, sample in enumerate(known_votes): | |
if i not in possibilities and len(sample) > 0: | |
current_vectors.append(X[i]) | |
else: | |
non_current_vectors.append(X[i]) | |
mutual_information = [] | |
covariance_matrix = get_covariance_matrix(X, X) | |
#Begin argmaxing as outlined in Krause's paper. | |
for document_idx in possibilities: | |
#Since the kernal is linear, sigma_square_y just refers to K(Y,Y) | |
#where K is dot-product. | |
sigma_square_y = covariance_matrix.item((document_idx, document_idx)) | |
#K(Y,A) where A refers to set of document vectors which have been sampled a.k.a current_vectors | |
sigma_y_a = get_covariance_matrix([X[document_idx]], current_vectors) | |
#K(A,A) | |
sigma_a_a = get_covariance_matrix(current_vectors, current_vectors) | |
#K(A,A)^(-1) | |
inv_sigma_a_a = numpy.matrix(numpy.linalg.inv(sigma_a_a)) | |
#K(A,Y) | |
sigma_a_y = get_covariance_matrix(current_vectors, [X[document_idx]]) | |
numerator = sigma_square_y - ((sigma_y_a * inv_sigma_a_a) * sigma_a_y) | |
#A_bar refers to set of non_current_vectors also excluding | |
#current document_idx (or also referred to as Y here) | |
a_bar = numpy.array(filter(lambda x: repr(x)!= repr(X[document_idx]), non_current_vectors)) | |
#K(Y,A_bar) | |
sigma_y_a_bar = get_covariance_matrix([X[document_idx]], a_bar) | |
#K(A_bar,A_bar) | |
sigma_a_bar_a_bar = get_covariance_matrix(a_bar, a_bar) | |
#K(A_bar,A_bar)^(-1) | |
inv_sigma_a_bar_a_bar = numpy.linalg.inv(sigma_a_bar_a_bar) | |
#K(A_bar,Y) | |
sigma_a_bar_y = get_covariance_matrix(a_bar, [X[document_idx]]) | |
denominator = sigma_square_y - ((sigma_y_a_bar * inv_sigma_a_bar_a_bar) * sigma_a_bar_y) | |
mutual_information.append((document_idx, numerator.item(0)/denominator.item(0))) | |
#Pick the best document_idx having argmax mutual_information | |
sorted_mutual_information = sorted(mutual_information, key=lambda x: x[1], reverse=True) | |
return sorted_mutual_information[0][0] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment