Skip to content

Instantly share code, notes, and snippets.

@gravesmedical
Last active November 10, 2015 15:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gravesmedical/58a6b665b553c1294b56 to your computer and use it in GitHub Desktop.
Save gravesmedical/58a6b665b553c1294b56 to your computer and use it in GitHub Desktop.
import math
#### Read in the data from u1.base and format it appropriately.
#### FILE_NAME is the path handed to get_data() by the script statements at the
#### bottom of this file; the lines are later split on tab characters.
FILE_NAME = 'u1.base'
def get_data(file_name):
    """Read a text file and return its lines.

    Args:
        file_name: Path of the file to open in text read mode.

    Returns:
        A list with one string per line, exactly as ``readlines()`` yields
        them (line terminators are kept).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle as soon as reading finishes (or
    # fails); the previous version never closed the file it opened.
    with open(file_name, 'r') as handle:
        return handle.readlines()
def Transpose(matrix):
    """Return the transpose of a list-of-lists matrix.

    Args:
        matrix: Sequence of rows. The number of columns is taken from the
            first row; extra trailing elements in later rows are ignored.

    Returns:
        A new list of lists in which element ``[j][i]`` equals
        ``matrix[i][j]``. An empty ``matrix`` yields an empty list.

    Raises:
        IndexError: If a later row is shorter than the first row.
    """
    # Guard the empty case: there is no first row to take a width from, and
    # the previous version raised IndexError here.
    if not matrix:
        return []
    ncols = len(matrix[0])
    return [[row[j] for row in matrix] for j in range(ncols)]
def format_training_set(data):
    """Parse raw tab-separated lines into rows of integers.

    Each line is stripped of surrounding whitespace and split on tab
    characters; every field except the last is converted with ``int``.
    The last field is discarded (presumably the timestamp column of the
    ratings file -- verify against the data set description).

    Blank or whitespace-only lines are skipped. Previously they produced an
    empty row, which made the result ragged and broke the transpose step.

    Args:
        data: Iterable of text lines, e.g. the output of ``readlines()``.

    Returns:
        A list of lists of ints, one inner list per non-blank input line.

    Raises:
        ValueError: If a kept field is not a valid integer literal.
    """
    rows = []
    for line in data:
        stripped = line.strip()
        if not stripped:
            # Nothing to parse (e.g. a trailing empty line at end of file).
            continue
        fields = stripped.split("\t")
        rows.append([int(field) for field in fields[:-1]])
    return rows
def _similarity_matrix(transposed_data):
Users = transposed_data[0]
numberOfUsers = len(list(set(Users))) #### Find out how many users there are in the training set.
SimilarityMatrix = [[0 for j in range(numberOfUsers + 1)] for i in range(numberOfUsers + 1)]
#### Initially the similarity matrix is really an identity matrix.
#### However, the first row and first column are just 0's. But the rest of the matrix is an identity matrix.
for i in range(1, len(SimilarityMatrix)):
for j in range(1, len(SimilarityMatrix)):
if i == j:
SimilarityMatrix[i][j] = 1
return SimilarityMatrix
def _average_ratings(Items,Ratings):
def FindTheAverage(List):
return sum(List)/len(List)
SetOfItems = set(Items)
AverageRatingsOfItems = dict( )
for item in SetOfItems:
AverageRatingsOfItems[item] = [ ]
for i in range(len(Items)):
AverageRatingsOfItems[Items[i]].append(Ratings[i])
for item in AverageRatingsOfItems:
AverageRatingsOfItems[item] = FindTheAverage(AverageRatingsOfItems[item])
return AverageRatingsOfItems
def _pq_ratings(TrainingSet,Users,Items,Ratings):
PQ_Ratings = dict( )
#### The dictionary below gives the rating that some user has for some item.
for n in range(len(TrainingSet)):
PQ_Ratings[(Users[n], Items[n])] = Ratings[n]
return PQ_Ratings
#@profile
def Correlation(i, j):
    """Compute a similarity score for the index pair ``(i, j)``.

    Reads three module-level tables: ``IJ_Pairs`` (list of keys collected for
    ``(i, j)``), ``PQ_Ratings`` (rating stored under each key) and
    ``AverageRatingsOfItems`` (mean rating per item).

    NOTE(review): this file was recovered without indentation. The nesting
    used here -- the three accumulator updates inside the ``for`` loop --
    is an assumption; confirm against the original source.
    NOTE(review): the three accumulators start at 1 rather than 0, so a pair
    with no entries in ``IJ_Pairs`` returns 1.0 and the denominator can never
    be zero. Whether that seeding is intentional is not visible here.

    Args:
        i: First index of the pair; compared with the first element of each
            collected key (presumably a user id -- verify against caller).
        j: Second index of the pair; only used to look up ``IJ_Pairs``.

    Returns:
        ``numeratorProduct`` divided by the product of the square roots of
        the two denominator accumulators.
    """
    sum1 = 0
    sum2 = 0
    numeratorProduct = 1
    denominatorProduct1 = 1
    denominatorProduct2 = 1
    # Missing pairs fall back to an empty list, so the loop is skipped.
    pairs = IJ_Pairs.get((i,j),[])
    for pair in pairs:
        _i = pair[0]
        _j = pair[1]
        pq_rating = PQ_Ratings[pair]
        # The second element of the key is used as the item for the average.
        avg_rating = AverageRatingsOfItems[_j]
        # Deviation of this rating from the item's mean rating.
        diff = pq_rating - avg_rating
        # Deviations whose key starts with ``i`` go to sum1, all others to sum2.
        if _i == i:
            sum1+= diff
        else:
            sum2+= diff
        # These use the running sums so far, not the per-entry deviation.
        numeratorProduct+= sum1*sum2
        denominatorProduct1+= sum1**2
        denominatorProduct2+= sum2**2
    denom = (math.sqrt(denominatorProduct1)*math.sqrt(denominatorProduct2))
    result = numeratorProduct/denom
    return result
#@profile
def pq_to_dict():
    """Build the ``(i, j) -> list of keys`` table that ``Correlation`` reads.

    Reads the module-level ``SimilarityMatrix`` (only its length) and
    ``PQ_Keys`` (the keys to distribute).

    NOTE(review): this file was recovered without indentation. The nesting
    used here -- the ``PQ_Keys`` loop after (not inside) the index loops, and
    the two outer ``if`` statements as siblings -- is an assumption; confirm
    against the original source.
    NOTE(review): ``ij_pairs`` is keyed by ``(i, j)`` index pairs, while the
    script builds ``PQ_Keys`` from ``PQ_Ratings``, whose keys are
    ``(Users[n], Items[n])`` tuples. The lookup ``ij_pairs.get(pair, ...)``
    therefore only hits when such a tuple happens to equal an ``(i, j)``
    pair -- confirm this is intended.

    Returns:
        A dict with one entry for every ``(i, j)`` where
        ``1 <= i < j < len(SimilarityMatrix)``. Each value is a list holding
        the keys from ``PQ_Keys`` that were appended to it (possibly empty).
    """
    total = len(SimilarityMatrix)
    i_pairs = {}
    j_pairs = {}
    ij_pairs = {}
    for i in range(1, total):
        for j in range(i + 1, total):
            # Later (i, j) values overwrite earlier ones; below, these two
            # dicts are only tested for whether a key is present.
            i_pairs[i] = (i,j)
            j_pairs[j] = (i,j)
            ij_pairs[(i,j)] = []
    for pair in PQ_Keys:
        target_key = pair[0]
        has_i = i_pairs.get(target_key,False)
        has_j = j_pairs.get(target_key,False)
        # For a present key this is a list; even an empty list compares
        # unequal to False, so ``!= False`` acts as a membership test.
        has_combo = ij_pairs.get(pair,False)
        if has_i != False:
            if has_combo != False:
                ij_pairs[pair].append(pair)
        # Not an ``elif``: a key whose first element is in both i_pairs and
        # j_pairs is appended a second time here.
        if has_j != False:
            if has_combo != False:
                ij_pairs[pair].append(pair)
    return ij_pairs
def calculate(SimilarityMatrix):
    """Fill the strict upper triangle of ``SimilarityMatrix`` in place.

    For every ``row < col`` with both indices starting at 1, the cell
    ``SimilarityMatrix[row][col]`` is overwritten with
    ``Correlation(row, col)``. Row 0, column 0, the diagonal and the lower
    triangle are left untouched.

    Args:
        SimilarityMatrix: Square list-of-lists to update.

    Returns:
        None; the matrix passed in is mutated.
    """
    size = len(SimilarityMatrix)
    #### Lazily enumerate (row, col) in the same row-major order as two
    #### nested loops would.
    upper_triangle = (
        (row, col)
        for row in range(1, size)
        for col in range(row + 1, size)
    )
    for row, col in upper_triangle:
        SimilarityMatrix[row][col] = Correlation(row, col)
    return None
#### Driver: runs at import time. The names bound here are module globals;
#### Correlation() and pq_to_dict() read several of them directly, so the
#### order of these statements matters.
#### Raw text lines of the training file.
data = get_data(FILE_NAME)
#### One list of ints per line, with the last tab-separated field dropped.
TrainingSet = format_training_set(data)
#### Column-major view of the training set.
transposed = Transpose(TrainingSet)
SimilarityMatrix = _similarity_matrix(transposed)
#### The first three columns are treated as users, items and ratings.
Users = transposed[0]
Items = transposed[1]
Ratings = transposed[2]
AverageRatingsOfItems = _average_ratings(Items,Ratings)
PQ_Ratings = _pq_ratings(TrainingSet,Users,Items,Ratings)
PQ_Keys = PQ_Ratings.keys()
#### IJ_Pairs must exist before calculate() runs, because Correlation()
#### looks it up as a global. pq_to_dict() in turn needs SimilarityMatrix
#### and PQ_Keys to be defined already.
IJ_Pairs = pq_to_dict()
#### Updates SimilarityMatrix in place; nothing is returned or printed.
calculate(SimilarityMatrix)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment