Skip to content

Instantly share code, notes, and snippets.

@dylanfareed
Created May 27, 2010 15:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dylanfareed/415917 to your computer and use it in GitHub Desktop.
Save dylanfareed/415917 to your computer and use it in GitHub Desktop.
# ----------------------------------------------------------------------
#
# Ruby adaptations of the Python code found in Toby Segaran's
# Programming Collective Intelligence book.
#
# http://www.romej.com/archives/590/programming-collective-intelligence-with-ruby
# steven.romej @ gmail (4 may 08)
# ----------------------------------------------------------------------
class Recommendations
# -------------------------------------------------
# Euclidean distance
# -------------------------------------------------
# Returns a distance-based similarity score for person1 and person2
def sim_distance( prefs , person1 , person2 )
# Get the list of shared_items
si = {}
for item in prefs[person1].keys
if prefs[person2].include? item
si[item] = 1
end
end
# if they have no ratings in common, return 0
return 0 if si.length == 0
squares = []
for item in prefs[person1].keys
if prefs[person2].include? item
squares << (prefs[person1][item] - prefs[person2][item]) ** 2
end
end
sum_of_squares = squares.inject { |sum,value| sum += value }
return 1/(1 + sum_of_squares)
end
# -------------------------------------------------
# Pearson score
# -------------------------------------------------
# Returns the Pearson correlation coefficient for p1 and p2
def sim_pearson( prefs, p1, p2)
# Get the list of mutually rated items
si = {}
for item in prefs[p1].keys
si[item] = 1 if prefs[p2].include? item
end
# Find the number of elements
n = si.length
# If there are no ratings in common, return 0
return 0 if n == 0
# Add up all the preferences
sum1 = si.keys.inject(0) { |sum,value| sum += prefs[p1][value] }
sum2 = si.keys.inject(0) { |sum,value| sum += prefs[p2][value] }
# Sum up the squares
sum1Sq = si.keys.inject(0) { |sum,value| sum += prefs[p1][value] ** 2 }
sum2Sq = si.keys.inject(0) { |sum,value| sum += prefs[p2][value] ** 2 }
# Sum up the products
pSum = si.keys.inject(0) { |sum,value| sum += (prefs[p1][value] * prefs[p2][value])}
# Calculate the Pearson score
num = pSum - (sum1*sum2/n)
den = Math.sqrt((sum1Sq - (sum1 ** 2)/n) * (sum2Sq - (sum2 ** 2)/n))
return 0 if den == 0
r = num / den
end
# Ranking the critics
# TODO lacks the score-function-as-parameter aspect of original
def topMatches( prefs, person, n=5, scorefunc = :sim_pearson )
scores = []
for other in prefs.keys
if scorefunc == :sim_pearson
scores << [ sim_pearson(prefs,person,other), other] if other != person
else
scores << [ sim_distance(prefs,person,other), other] if other != person
end
end
return scores.sort.reverse.slice(0,n)
end
# Gets recommendations for a person by using a weighted average
# of every other user's rankings
# TODO just uses sim_pearson and not a function as parameter
def getRecommendations(prefs, person, scorefunc = :sim_pearson )
totals = {}
simSums = {}
for other in prefs.keys
# don't compare me to myself
next if other == person
if scorefunc == :sim_pearson
sim = sim_pearson( prefs, person, other)
else
sim = sim_distance( prefs, person, other)
end
# ignore scores of zero or lower
next if sim <= 0
for item in prefs[other].keys
# only score movies I haven't seen yet
if !prefs[person].include? item or prefs[person][item] == 0
# similarity * score
totals.default = 0
totals[item] += prefs[other][item] * sim
# sum of similarities
simSums.default = 0
simSums[item] += sim
end
end
end
# Create a normalized list
rankings = []
totals.each do |item,total|
rankings << [total/simSums[item], item]
end
# Return the sorted list
return rankings.sort.reverse
end
def transformPrefs( prefs )
result = {}
for person in prefs.keys
for item in prefs[person].keys
result[item] = {} if result[item] == nil
# Flip item and person
result[item][person] = prefs[person][item]
end
end
return result
end
def calculateSimilarItems( prefs, n = 10 )
# Create a dictionary of items showing which other items they are most similar to
result = {}
# Invert the preference matrix to be item-centric
itemPrefs = transformPrefs(prefs)
c = 0
for item in itemPrefs.keys
# Status updates for large datasets
c += 1
puts "#{c}/#{itemPrefs.length}" if c % 100 == 0
# Find the most similar items to this one
scores = topMatches(itemPrefs, item, n, :sim_distance)
result[item] = scores
end
return result
end
def getRecommendedItems( prefs, itemMatch, user)
userRatings = prefs[user]
scores = {}
totalSim = {}
# Loop over items rated by this user
userRatings.each do |item,rating|
itemMatch[item].each do |similarity,item2|
# Ignore if this user has already rated this item
next if userRatings.include? item2
# Weighted sum of rating times similarity
scores[item2] = 0 if scores[item2] == nil
scores[item2] += similarity * rating
# Sum of all the similarities
totalSim[item2] = 0 if totalSim[item2] == nil
totalSim[item2] += similarity
end
end
# Divide each total score by total weighting to get an average
rankings = []
scores.each do |item,score|
rankings << [score/totalSim[item], item]
end
return rankings.sort.reverse
end
def loadMovieLens( path = "ml-data" )
movies = {}
File.open(path + "/u.item") do |file|
while !file.eof?
(id,title) = file.readline.split("|")[0,2]
movies[id] = title
end
end
prefs = {}
File.open(path + "/u.data") do |file|
while !file.eof?
(user,movieid,rating,ts) = file.readline.split("\t")
prefs[user] = {} if prefs[user] == nil
prefs[user][movies[movieid]] = rating.to_f
end
end
return prefs
end
end
# ----------------------------------------------------------------------
# A simple class that implements the necessary pydelicious functions
# Sleeps 1 second after each request to prevent 503 errors
# ----------------------------------------------------------------------
require 'net/http'
require 'rexml/document'
require 'digest/md5'
module Delicious
# Get a list of popular urls (title and link)
def get_popular( tag = "" )
popular = []
url = "http://del.icio.us/rss/popular/#{tag}"
response = Net::HTTP.get_response(URI.parse(url)).body
doc = REXML::Document.new(response)
doc.elements.each("//item") do |item|
popular << { "title" => item.elements["title"].text , "href" => item.elements["link"].text }
end
sleep 1
return popular
end
# Get a list of users that posted the url
def get_urlposts( url )
urlposts = []
urlcode = Digest::MD5.hexdigest(url)
url = "http://feeds.delicious.com/rss/url/#{urlcode}"
response = Net::HTTP.get_response(URI.parse(url)).body
doc = REXML::Document.new(response)
doc.elements.each("//item") do |item|
urlposts << { "user" => item.elements["dc:creator"].text }
end
sleep 1
return urlposts
end
# Get a list of urls by username
def get_userposts( user )
posts = []
url = "http://feeds.delicious.com/rss/#{user}"
response = Net::HTTP.get_response(URI.parse(url)).body
doc = REXML::Document.new(response)
doc.elements.each("//item") do |item|
posts << { "href" => item.elements["link"].text }
end
sleep 1
return posts
end
end
# ----------------------------------------------------------------------
# A del.icio.us link recommendation engine
# ----------------------------------------------------------------------
class DeliciousRec
include Delicious
# returns a dictionary of users, each pointing to empty hash
def initializeUserDict( tag, count = 1 )
user_dict = {}
# get the top 'count' popular posts
for p1 in get_popular(tag)[0,count]
# find all users who posted this
for p2 in get_urlposts(p1["href"])
user = p2["user"]
user_dict[user] = {}
end
end
return user_dict
end
def fillItems( user_dict )
all_items = {}
# Find links posted by all users
for user in user_dict.keys
for attempt in 1..3
begin
puts "fetching for #{user}"
posts = get_userposts(user)
puts "fetched posts for #{user}"
sleep 2
break
rescue
puts "Failed user #{user}, retrying"
sleep 4
end
end
for post in posts
url = post["href"]
user_dict[user][url] = 1
all_items[url] = 1
end
end
# Fill in missing items with 0
for ratings in user_dict.values
for item in all_items.keys
ratings[item] = 0 if !ratings.include? item
end
end
end
end
class App
def run
# A dictionary of movie critics and their ratings of a small
# set of movies
critics = {'Lisa Rose'=> {'Lady in the Water'=> 2.5, 'Snakes on a Plane'=> 3.5,
'Just My Luck'=> 3.0, 'Superman Returns'=> 3.5, 'You, Me and Dupree'=> 2.5,
'The Night Listener'=> 3.0},
'Gene Seymour'=> {'Lady in the Water'=> 3.0, 'Snakes on a Plane'=> 3.5,
'Just My Luck'=> 1.5, 'Superman Returns'=> 5.0, 'The Night Listener'=> 3.0,
'You, Me and Dupree'=> 3.5},
'Michael Phillips'=> {'Lady in the Water'=> 2.5, 'Snakes on a Plane'=> 3.0,
'Superman Returns'=> 3.5, 'The Night Listener'=> 4.0},
'Claudia Puig'=> {'Snakes on a Plane'=> 3.5, 'Just My Luck'=> 3.0,
'The Night Listener'=> 4.5, 'Superman Returns'=> 4.0,
'You, Me and Dupree'=> 2.5},
'Mick LaSalle'=> {'Lady in the Water'=> 3.0, 'Snakes on a Plane'=> 4.0,
'Just My Luck'=> 2.0, 'Superman Returns'=> 3.0, 'The Night Listener'=> 3.0,
'You, Me and Dupree'=> 2.0},
'Jack Matthews'=> {'Lady in the Water'=> 3.0, 'Snakes on a Plane'=> 4.0,
'The Night Listener'=> 3.0, 'Superman Returns'=> 5.0, 'You, Me and Dupree'=> 3.5},
'Toby'=> {'Snakes on a Plane'=>4.5,'You, Me and Dupree'=>1.0,'Superman Returns'=>4.0}
}
recommendations = Recommendations.new
# Test the Euclidean score code
puts "The Euclidean distance score is #{recommendations.sim_distance( critics , "Lisa Rose", "Gene Seymour")}"
# Test the Pearson score
puts "The Pearson score is #{recommendations.sim_pearson( critics , "Lisa Rose" , "Gene Seymour" )}"
# Test the topMatches
puts recommendations.topMatches(critics,"Toby", 3)
# Try getting recommendations
puts recommendations.getRecommendations(critics,"Toby")
# Transform the preferences, get recommendation
movies = recommendations.transformPrefs( critics )
puts recommendations.topMatches( movies , "Superman Returns")
itemsim = recommendations.calculateSimilarItems(critics)
puts itemsim
puts recommendations.getRecommendedItems(critics, itemsim, "Toby")
# -------------------------------------------------------------------
# del.icio.us examples
delicious = DeliciousRec.new
delusers = delicious.initializeUserDict("programming")
delicious.fillItems(delusers)
# pick a user at random
user = delusers.keys[rand(delusers.length - 1)]
puts "user is #{user}"
puts recommendations.topMatches(delusers,user)
puts recommendations.getRecommendations(delusers,user)[0,10]
url = recommendations.getRecommendations(delusers,user)[0][1]
puts "the url is #{url}"
puts recommendations.topMatches(recommendations.transformPrefs(delusers),url)
# --------------------------------------------------------------------
# Movie Lens examples
##prefs = recommendations.loadMovieLens()
#puts prefs["87"]
#puts recommendations.getRecommendations(prefs, "87")[0,30]
##itemsim = recommendations.calculateSimilarItems(prefs, n=50)
##puts recommendations.getRecommendedItems(prefs, itemsim, "87")[0,30]
end
end
app = App.new
app.run
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment