Last active
November 10, 2015 15:33
-
-
Save gravesmedical/58a6b665b553c1294b56 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
#### Read in the data from u1.base and format it appropriately.
# Path of the tab-separated training file that the script body hands to get_data().
FILE_NAME = "u1.base"
def get_data(file_name):
    """Read a text file and return its lines.

    Args:
        file_name: Path of the file to read.

    Returns:
        A list with one string per line, each still carrying its line
        terminator (exactly what ``readlines()`` yields).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle deterministically; the previous
    # bare ``open(...).readlines()`` left that to the garbage collector.
    with open(file_name, 'r') as handle:
        return handle.readlines()
def Transpose(matrix):
    """Return the transpose of a row-major list-of-lists matrix.

    Args:
        matrix: A sequence of rows. The number of columns is taken from the
            first row; longer rows are truncated to that width and a shorter
            row raises ``IndexError``.

    Returns:
        A new list of ``ncols`` lists, each of length ``nrows``, where
        ``result[j][i] == matrix[i][j]``. An empty ``matrix`` yields ``[]``.
    """
    # Guard the empty case: there is no first row to measure the width from.
    if not matrix:
        return []
    ncols = len(matrix[0])
    return [[row[col] for row in matrix] for col in range(ncols)]
def format_training_set(data):
    """Parse raw tab-separated lines into rows of integers.

    Each line is stripped of surrounding whitespace and split on tabs; the
    last field is discarded and the remaining fields are converted to ``int``.
    (The dropped field is presumably the timestamp column of the MovieLens
    ``u1.base`` file -- TODO confirm against the data set.)

    Args:
        data: An iterable of text lines, e.g. the result of ``get_data``.

    Returns:
        A list of lists of ints, one inner list per non-blank input line.

    Raises:
        ValueError: If a kept field is not a valid integer literal.
    """
    rows = []
    for line in data:
        stripped = line.strip()
        # A blank line (typically a trailing newline at end of file) would
        # otherwise become an empty row and break the later transpose.
        if not stripped:
            continue
        fields = stripped.split("\t")
        rows.append([int(field) for field in fields[:-1]])
    return rows
def _similarity_matrix(transposed_data): | |
Users = transposed_data[0] | |
numberOfUsers = len(list(set(Users))) #### Find out how many users there are in the training set. | |
SimilarityMatrix = [[0 for j in range(numberOfUsers + 1)] for i in range(numberOfUsers + 1)] | |
#### Initially the similarity matrix is really an identity matrix. | |
#### However, the first row and first column are just 0's. But the rest of the matrix is an identity matrix. | |
for i in range(1, len(SimilarityMatrix)): | |
for j in range(1, len(SimilarityMatrix)): | |
if i == j: | |
SimilarityMatrix[i][j] = 1 | |
return SimilarityMatrix | |
def _average_ratings(Items,Ratings): | |
def FindTheAverage(List): | |
return sum(List)/len(List) | |
SetOfItems = set(Items) | |
AverageRatingsOfItems = dict( ) | |
for item in SetOfItems: | |
AverageRatingsOfItems[item] = [ ] | |
for i in range(len(Items)): | |
AverageRatingsOfItems[Items[i]].append(Ratings[i]) | |
for item in AverageRatingsOfItems: | |
AverageRatingsOfItems[item] = FindTheAverage(AverageRatingsOfItems[item]) | |
return AverageRatingsOfItems | |
def _pq_ratings(TrainingSet,Users,Items,Ratings): | |
PQ_Ratings = dict( ) | |
#### The dictionary below gives the rating that some user has for some item. | |
for n in range(len(TrainingSet)): | |
PQ_Ratings[(Users[n], Items[n])] = Ratings[n] | |
return PQ_Ratings | |
#@profile | |
# Compute a similarity score for the index pair (i, j) from module-level state.
#
# Reads the globals IJ_Pairs, PQ_Ratings and AverageRatingsOfItems. For every
# key stored under IJ_Pairs[(i, j)] it looks up the rating in PQ_Ratings,
# subtracts the average stored for the key's second element, and adds that
# difference to sum1 when the key's first element equals i, otherwise to sum2.
# Returns numeratorProduct / (sqrt(denominatorProduct1) * sqrt(denominatorProduct2)).
#
# NOTE(review): the indentation of this listing was lost, so it cannot be told
# from here whether the three "+=" product updates run once per pair or once
# after the loop -- confirm against the original file before editing.
# NOTE(review): all three accumulators start at 1, not 0. With real-valued
# differences the denominator therefore cannot reach zero, and a pair with no
# entries in IJ_Pairs yields 1.0. Presumably unintended for a correlation
# measure -- verify.
def Correlation(i, j): | |
sum1 = 0 | |
sum2 = 0 | |
numeratorProduct = 1 | |
denominatorProduct1 = 1 | |
denominatorProduct2 = 1 | |
# Missing (i, j) entries fall back to an empty list, so the loop is skipped.
pairs = IJ_Pairs.get((i,j),[]) | |
for pair in pairs: | |
_i = pair[0] | |
_j = pair[1] | |
pq_rating = PQ_Ratings[pair] | |
avg_rating = AverageRatingsOfItems[_j] | |
# Deviation of this rating from the average stored for _j.
diff = pq_rating - avg_rating | |
if _i == i: | |
sum1+= diff | |
else: | |
sum2+= diff | |
numeratorProduct+= sum1*sum2 | |
denominatorProduct1+= sum1**2 | |
denominatorProduct2+= sum2**2 | |
denom = (math.sqrt(denominatorProduct1)*math.sqrt(denominatorProduct2)) | |
result = numeratorProduct/denom | |
return result | |
#@profile | |
# Build the dictionary that the script stores as IJ_Pairs and Correlation() reads.
#
# Reads the globals SimilarityMatrix (only its length) and PQ_Keys. An empty
# list is created under every key (i, j) with 1 <= i < j < len(SimilarityMatrix);
# entries of PQ_Keys are then appended to the list stored under that same entry
# when the checks below pass. Returns the resulting dict.
#
# NOTE(review): the indentation of this listing was lost, so the nesting of the
# "for pair in PQ_Keys" loop and of the has_i / has_j / has_combo tests cannot
# be determined from here -- confirm against the original file.
# NOTE(review): PQ_Keys are the keys of PQ_Ratings, which _pq_ratings builds as
# (user, item) tuples, yet they are looked up in ij_pairs, whose keys are
# (i, j) index pairs. This looks like a mismatch -- verify the intent.
def pq_to_dict(): | |
total = len(SimilarityMatrix) | |
i_pairs = {} | |
j_pairs = {} | |
ij_pairs = {} | |
for i in range(1, total): | |
for j in range(i + 1, total): | |
# i_pairs / j_pairs keep only the last (i, j) written per index; below they
# are used purely as "is this index present" checks.
i_pairs[i] = (i,j) | |
j_pairs[j] = (i,j) | |
ij_pairs[(i,j)] = [] | |
for pair in PQ_Keys: | |
target_key = pair[0] | |
has_i = i_pairs.get(target_key,False) | |
has_j = j_pairs.get(target_key,False) | |
# An existing but still-empty list compares unequal to False, so
# "has_combo != False" effectively means "pair is a key of ij_pairs".
has_combo = ij_pairs.get(pair,False) | |
if has_i != False: | |
if has_combo != False: | |
ij_pairs[pair].append(pair) | |
if has_j != False: | |
if has_combo != False: | |
ij_pairs[pair].append(pair) | |
return ij_pairs | |
def calculate(SimilarityMatrix):
    """Fill the strict upper triangle of ``SimilarityMatrix`` in place.

    For every index pair ``1 <= i < j < len(SimilarityMatrix)`` the entry
    ``SimilarityMatrix[i][j]`` is overwritten with ``Correlation(i, j)``.
    Row 0, column 0, the diagonal and the lower triangle are left untouched.

    Args:
        SimilarityMatrix: Square list-of-lists matrix to update.

    Returns:
        None; the matrix is modified in place.
    """
    size = len(SimilarityMatrix)
    for row in range(1, size):
        for col in range(row + 1, size):
            SimilarityMatrix[row][col] = Correlation(row, col)
    return None
# ---- Script body: load the training data and fill the similarity matrix ----
# Every name bound here is a module global. Correlation() and pq_to_dict()
# read several of them directly (SimilarityMatrix, AverageRatingsOfItems,
# PQ_Ratings, PQ_Keys, IJ_Pairs), so all must exist before calculate() runs.
data = get_data(FILE_NAME)
TrainingSet = format_training_set(data)
transposed = Transpose(TrainingSet)

# Column views of the training data: user ids, item ids and ratings.
Users, Items, Ratings = transposed[0], transposed[1], transposed[2]

SimilarityMatrix = _similarity_matrix(transposed)
AverageRatingsOfItems = _average_ratings(Items, Ratings)
PQ_Ratings = _pq_ratings(TrainingSet, Users, Items, Ratings)
PQ_Keys = PQ_Ratings.keys()
IJ_Pairs = pq_to_dict()

# Overwrites the upper triangle of SimilarityMatrix in place.
calculate(SimilarityMatrix)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment