dylanfareed/gist:415917

## gistfile1.txt
# ----------------------------------------------------------------------
#
# Ruby adaptations of the Python code found in Toby Segaran's
# Programming Collective Intelligence book.
#
# http://www.romej.com/archives/590/programming-collective-intelligence-with-ruby
# steven.romej @ gmail (4 may 08)
# ----------------------------------------------------------------------


class Recommendations
  # -------------------------------------------------
  # Euclidean distance
  # -------------------------------------------------

  # Returns a distance-based similarity score for person1 and person2
  def sim_distance( prefs , person1 , person2 )
    # Get the list of shared_items
    si = {}
    for item in prefs[person1].keys
      if prefs[person2].include? item
        si[item] = 1
      end
    end

    # if they have no ratings in common, return 0
    return 0 if si.length == 0

    squares = []
    for item in prefs[person1].keys
      if prefs[person2].include? item
        squares << (prefs[person1][item] - prefs[person2][item]) ** 2
      end
    end

    sum_of_squares = squares.inject { |sum,value| sum += value }
    return 1/(1 + sum_of_squares)
  end

  # -------------------------------------------------
  # Pearson score
  # -------------------------------------------------

  # Returns the Pearson correlation coefficient for p1 and p2
  def sim_pearson( prefs, p1, p2)
    # Get the list of mutually rated items
    si = {}
    for item in prefs[p1].keys
      si[item] = 1 if prefs[p2].include? item
    end

    # Find the number of elements
    n = si.length
    # If there are no ratings in common, return 0
    return 0 if n == 0

    # Add up all the preferences
    sum1 = si.keys.inject(0) { |sum,value| sum += prefs[p1][value] }
    sum2 = si.keys.inject(0) { |sum,value| sum += prefs[p2][value] }

    # Sum up the squares
    sum1Sq = si.keys.inject(0) { |sum,value| sum += prefs[p1][value] ** 2 }
    sum2Sq = si.keys.inject(0) { |sum,value| sum += prefs[p2][value] ** 2 }

    # Sum up the products
    pSum = si.keys.inject(0) { |sum,value| sum += (prefs[p1][value] * prefs[p2][value])}

    # Calculate the Pearson score
    num = pSum - (sum1*sum2/n)
    den = Math.sqrt((sum1Sq - (sum1 ** 2)/n) * (sum2Sq - (sum2 ** 2)/n))

    return 0 if den == 0
    r = num / den
  end

  # Ranking the critics
  # TODO lacks the score-function-as-parameter aspect of original
  def topMatches( prefs, person, n=5, scorefunc = :sim_pearson )
    scores = []
    for other in prefs.keys
      if scorefunc == :sim_pearson
        scores << [ sim_pearson(prefs,person,other), other] if other != person
      else
        scores << [ sim_distance(prefs,person,other), other] if other != person
      end
    end
    return scores.sort.reverse.slice(0,n)
  end

  # Gets recommendations for a person by using a weighted average
  # of every other user's rankings
  # TODO just uses sim_pearson and not a function as parameter
  def getRecommendations(prefs, person, scorefunc = :sim_pearson )
    totals = {}
    simSums = {}
    for other in prefs.keys
      # don't compare me to myself
      next if other == person

      if scorefunc == :sim_pearson
        sim = sim_pearson( prefs, person, other)
      else
        sim = sim_distance( prefs, person, other)
      end

      # ignore scores of zero or lower
      next if sim <= 0

      for item in prefs[other].keys
        # only score movies I haven't seen yet
        if !prefs[person].include? item or prefs[person][item] == 0
          # similarity * score
          totals.default = 0
          totals[item] += prefs[other][item] * sim
          # sum of similarities
          simSums.default = 0
          simSums[item] += sim
        end
      end
    end

    # Create a normalized list
    rankings = []
    totals.each do |item,total|
      rankings << [total/simSums[item], item]
    end

    # Return the sorted list
    return rankings.sort.reverse
  end


  def transformPrefs( prefs )
    result = {}
    for person in prefs.keys
      for item in prefs[person].keys
        result[item] = {} if result[item] == nil
        # Flip item and person
        result[item][person] = prefs[person][item]
      end
    end
    return result
  end

  def calculateSimilarItems( prefs, n = 10 )
    # Create a dictionary of items showing which other items they are most similar to
    result = {}

    # Invert the preference matrix to be item-centric
    itemPrefs = transformPrefs(prefs)

    c = 0
    for item in itemPrefs.keys
      # Status updates for large datasets
      c += 1
      puts "#{c}/#{itemPrefs.length}" if c % 100 == 0
      # Find the most similar items to this one
      scores = topMatches(itemPrefs, item, n, :sim_distance)
      result[item] = scores
    end
    return result
  end


  def getRecommendedItems( prefs, itemMatch, user)
    userRatings = prefs[user]
    scores = {}
    totalSim = {}

    # Loop over items rated by this user
    userRatings.each do |item,rating|
      itemMatch[item].each do |similarity,item2|
        # Ignore if this user has already rated this item
        next if userRatings.include? item2

        # Weighted sum of rating times similarity
        scores[item2] = 0 if scores[item2] == nil
        scores[item2] += similarity * rating

        # Sum of all the similarities
        totalSim[item2] = 0 if totalSim[item2] == nil
        totalSim[item2] += similarity
      end
    end

    # Divide each total score by total weighting to get an average
    rankings = []
    scores.each do |item,score|
      rankings << [score/totalSim[item], item]
    end

    return rankings.sort.reverse
  end


  def loadMovieLens( path = "ml-data" )
    movies = {}
    File.open(path + "/u.item") do |file|
      while !file.eof?
        (id,title) = file.readline.split("|")[0,2]
        movies[id] = title
      end
    end

    prefs = {}
    File.open(path + "/u.data") do |file|
      while !file.eof?
        (user,movieid,rating,ts) = file.readline.split("\t")
        prefs[user] = {} if prefs[user] == nil
        prefs[user][movies[movieid]] = rating.to_f
      end
    end
    return prefs
  end

end


# ----------------------------------------------------------------------
# A simple class that implements the necessary pydelicious functions
# Sleeps 1 second after each request to prevent 503 errors
# ----------------------------------------------------------------------
require 'net/http'
require 'rexml/document'
require 'digest/md5'

module Delicious

  # Get a list of popular urls (title and link)
  def get_popular( tag = "" )
    popular = []
    url = "http://del.icio.us/rss/popular/#{tag}"

    response = Net::HTTP.get_response(URI.parse(url)).body
    doc = REXML::Document.new(response)

    doc.elements.each("//item") do |item|
      popular << { "title" => item.elements["title"].text , "href" => item.elements["link"].text }
    end
    sleep 1
    return popular
  end

  # Get a list of users that posted the url
  def get_urlposts( url )
    urlposts = []
    urlcode = Digest::MD5.hexdigest(url)
    url = "http://feeds.delicious.com/rss/url/#{urlcode}"

    response = Net::HTTP.get_response(URI.parse(url)).body
    doc = REXML::Document.new(response)

    doc.elements.each("//item") do |item|
      urlposts << { "user" => item.elements["dc:creator"].text }
    end

    sleep 1
    return urlposts
  end

  # Get a list of urls by username
  def get_userposts( user )
    posts = []
    url = "http://feeds.delicious.com/rss/#{user}"

    response = Net::HTTP.get_response(URI.parse(url)).body
    doc = REXML::Document.new(response)

    doc.elements.each("//item") do |item|
      posts << { "href" => item.elements["link"].text }
    end

    sleep 1
    return posts
  end
end

# ----------------------------------------------------------------------
# A del.icio.us link recommendation engine
# ----------------------------------------------------------------------
class DeliciousRec
  include Delicious

  # returns a dictionary of users, each pointing to empty hash
  def initializeUserDict( tag, count = 1 )
    user_dict = {}
    # get the top 'count' popular posts
    for p1 in get_popular(tag)[0,count]
      # find all users who posted this
      for p2 in get_urlposts(p1["href"])
        user = p2["user"]
        user_dict[user] = {}
      end
    end
    return user_dict
  end

  def fillItems( user_dict )
    all_items = {}
    # Find links posted by all users
    for user in user_dict.keys
      for attempt in 1..3
        begin
          puts "fetching for #{user}"
          posts = get_userposts(user)
          puts "fetched posts for #{user}"
          sleep 2
          break
        rescue
          puts "Failed user #{user}, retrying"
          sleep 4
        end
      end

      for post in posts
        url = post["href"]
        user_dict[user][url] = 1
        all_items[url] = 1
      end
    end

    # Fill in missing items with 0
    for ratings in user_dict.values
      for item in all_items.keys
        ratings[item] = 0 if !ratings.include? item
      end
    end
  end
end


class App

  def run
    # A dictionary of movie critics and their ratings of a small
    # set of movies
    critics = {'Lisa Rose'=> {'Lady in the Water'=> 2.5, 'Snakes on a Plane'=> 3.5,
 'Just My Luck'=> 3.0, 'Superman Returns'=> 3.5, 'You, Me and Dupree'=> 2.5,
 'The Night Listener'=> 3.0},
 'Gene Seymour'=> {'Lady in the Water'=> 3.0, 'Snakes on a Plane'=> 3.5,
 'Just My Luck'=> 1.5, 'Superman Returns'=> 5.0, 'The Night Listener'=> 3.0,
 'You, Me and Dupree'=> 3.5},
 'Michael Phillips'=> {'Lady in the Water'=> 2.5, 'Snakes on a Plane'=> 3.0,
 'Superman Returns'=> 3.5, 'The Night Listener'=> 4.0},
 'Claudia Puig'=> {'Snakes on a Plane'=> 3.5, 'Just My Luck'=> 3.0,
 'The Night Listener'=> 4.5, 'Superman Returns'=> 4.0,
 'You, Me and Dupree'=> 2.5},
 'Mick LaSalle'=> {'Lady in the Water'=> 3.0, 'Snakes on a Plane'=> 4.0,
 'Just My Luck'=> 2.0, 'Superman Returns'=> 3.0, 'The Night Listener'=> 3.0,
 'You, Me and Dupree'=> 2.0},
 'Jack Matthews'=> {'Lady in the Water'=> 3.0, 'Snakes on a Plane'=> 4.0,
 'The Night Listener'=> 3.0, 'Superman Returns'=> 5.0, 'You, Me and Dupree'=> 3.5},
 'Toby'=> {'Snakes on a Plane'=>4.5,'You, Me and Dupree'=>1.0,'Superman Returns'=>4.0}
    }


    recommendations = Recommendations.new

    # Test the Euclidean score code
    puts "The Euclidean distance score is #{recommendations.sim_distance( critics , "Lisa Rose", "Gene Seymour")}"
    # Test the Pearson score
    puts "The Pearson score is #{recommendations.sim_pearson( critics , "Lisa Rose" , "Gene Seymour" )}"
    # Test the topMatches
    puts recommendations.topMatches(critics,"Toby", 3)
    # Try getting recommendations
    puts recommendations.getRecommendations(critics,"Toby")
    # Transform the preferences, get recommendation
    movies = recommendations.transformPrefs( critics )
    puts recommendations.topMatches( movies , "Superman Returns")

    itemsim = recommendations.calculateSimilarItems(critics)
    puts itemsim
    puts recommendations.getRecommendedItems(critics, itemsim, "Toby")

    # -------------------------------------------------------------------
    # del.icio.us examples
    delicious = DeliciousRec.new
    delusers = delicious.initializeUserDict("programming")
    delicious.fillItems(delusers)
    # pick a user at random
    user = delusers.keys[rand(delusers.length - 1)]
    puts "user is #{user}"
    puts recommendations.topMatches(delusers,user)
    puts recommendations.getRecommendations(delusers,user)[0,10]
    url = recommendations.getRecommendations(delusers,user)[0][1]
    puts "the url is #{url}"
    puts recommendations.topMatches(recommendations.transformPrefs(delusers),url)

    # --------------------------------------------------------------------
    # Movie Lens examples
    ##prefs = recommendations.loadMovieLens()
    #puts prefs["87"]
    #puts recommendations.getRecommendations(prefs, "87")[0,30]
    ##itemsim = recommendations.calculateSimilarItems(prefs, n=50)
    ##puts recommendations.getRecommendedItems(prefs, itemsim, "87")[0,30]
  end
end

app = App.new
app.run
	# ----------------------------------------------------------------------
	#
	# Ruby adaptations of the Python code found in Toby Segaran's
	# Programming Collective Intelligence book.
	#
	# http://www.romej.com/archives/590/programming-collective-intelligence-with-ruby
	# steven.romej @ gmail (4 may 08)
	# ----------------------------------------------------------------------


	class Recommendations
	# -------------------------------------------------
	# Euclidean distance
	# -------------------------------------------------

	# Returns a distance-based similarity score for person1 and person2
	def sim_distance( prefs , person1 , person2 )
	# Get the list of shared_items
	si = {}
	for item in prefs[person1].keys
	if prefs[person2].include? item
	si[item] = 1
	end
	end

	# if they have no ratings in common, return 0
	return 0 if si.length == 0

	squares = []
	for item in prefs[person1].keys
	if prefs[person2].include? item
	squares << (prefs[person1][item] - prefs[person2][item]) ** 2
	end
	end

	sum_of_squares = squares.inject { \|sum,value\| sum += value }
	return 1/(1 + sum_of_squares)
	end

	# -------------------------------------------------
	# Pearson score
	# -------------------------------------------------

	# Returns the Pearson correlation coefficient for p1 and p2
	def sim_pearson( prefs, p1, p2)
	# Get the list of mutually rated items
	si = {}
	for item in prefs[p1].keys
	si[item] = 1 if prefs[p2].include? item
	end

	# Find the number of elements
	n = si.length
	# If there are no ratings in common, return 0
	return 0 if n == 0

	# Add up all the preferences
	sum1 = si.keys.inject(0) { \|sum,value\| sum += prefs[p1][value] }
	sum2 = si.keys.inject(0) { \|sum,value\| sum += prefs[p2][value] }

	# Sum up the squares
	sum1Sq = si.keys.inject(0) { \|sum,value\| sum += prefs[p1][value] ** 2 }
	sum2Sq = si.keys.inject(0) { \|sum,value\| sum += prefs[p2][value] ** 2 }

	# Sum up the products
	pSum = si.keys.inject(0) { \|sum,value\| sum += (prefs[p1][value] * prefs[p2][value])}

	# Calculate the Pearson score
	num = pSum - (sum1*sum2/n)
	den = Math.sqrt((sum1Sq - (sum1 ** 2)/n) * (sum2Sq - (sum2 ** 2)/n))

	return 0 if den == 0
	r = num / den
	end

	# Ranking the critics
	# TODO lacks the score-function-as-parameter aspect of original
	def topMatches( prefs, person, n=5, scorefunc = :sim_pearson )
	scores = []
	for other in prefs.keys
	if scorefunc == :sim_pearson
	scores << [ sim_pearson(prefs,person,other), other] if other != person
	else
	scores << [ sim_distance(prefs,person,other), other] if other != person
	end
	end
	return scores.sort.reverse.slice(0,n)
	end

	# Gets recommendations for a person by using a weighted average
	# of every other user's rankings
	# TODO just uses sim_pearson and not a function as parameter
	def getRecommendations(prefs, person, scorefunc = :sim_pearson )
	totals = {}
	simSums = {}
	for other in prefs.keys
	# don't compare me to myself
	next if other == person

	if scorefunc == :sim_pearson
	sim = sim_pearson( prefs, person, other)
	else
	sim = sim_distance( prefs, person, other)
	end

	# ignore scores of zero or lower
	next if sim <= 0

	for item in prefs[other].keys
	# only score movies I haven't seen yet
	if !prefs[person].include? item or prefs[person][item] == 0
	# similarity * score
	totals.default = 0
	totals[item] += prefs[other][item] * sim
	# sum of similarities
	simSums.default = 0
	simSums[item] += sim
	end
	end
	end

	# Create a normalized list
	rankings = []
	totals.each do \|item,total\|
	rankings << [total/simSums[item], item]
	end

	# Return the sorted list
	return rankings.sort.reverse
	end


	def transformPrefs( prefs )
	result = {}
	for person in prefs.keys
	for item in prefs[person].keys
	result[item] = {} if result[item] == nil
	# Flip item and person
	result[item][person] = prefs[person][item]
	end
	end
	return result
	end

	def calculateSimilarItems( prefs, n = 10 )
	# Create a dictionary of items showing which other items they are most similar to
	result = {}

	# Invert the preference matrix to be item-centric
	itemPrefs = transformPrefs(prefs)

	c = 0
	for item in itemPrefs.keys
	# Status updates for large datasets
	c += 1
	puts "#{c}/#{itemPrefs.length}" if c % 100 == 0
	# Find the most similar items to this one
	scores = topMatches(itemPrefs, item, n, :sim_distance)
	result[item] = scores
	end
	return result
	end


	def getRecommendedItems( prefs, itemMatch, user)
	userRatings = prefs[user]
	scores = {}
	totalSim = {}

	# Loop over items rated by this user
	userRatings.each do \|item,rating\|
	itemMatch[item].each do \|similarity,item2\|
	# Ignore if this user has already rated this item
	next if userRatings.include? item2

	# Weighted sum of rating times similarity
	scores[item2] = 0 if scores[item2] == nil
	scores[item2] += similarity * rating

	# Sum of all the similarities
	totalSim[item2] = 0 if totalSim[item2] == nil
	totalSim[item2] += similarity
	end
	end

	# Divide each total score by total weighting to get an average
	rankings = []
	scores.each do \|item,score\|
	rankings << [score/totalSim[item], item]
	end

	return rankings.sort.reverse
	end


	def loadMovieLens( path = "ml-data" )
	movies = {}
	File.open(path + "/u.item") do \|file\|
	while !file.eof?
	(id,title) = file.readline.split("\|")[0,2]
	movies[id] = title
	end
	end

	prefs = {}
	File.open(path + "/u.data") do \|file\|
	while !file.eof?
	(user,movieid,rating,ts) = file.readline.split("\t")
	prefs[user] = {} if prefs[user] == nil
	prefs[user][movies[movieid]] = rating.to_f
	end
	end
	return prefs
	end

	end


	# ----------------------------------------------------------------------
	# A simple class that implements the necessary pydelicious functions
	# Sleeps 1 second after each request to prevent 503 errors
	# ----------------------------------------------------------------------
	require 'net/http'
	require 'rexml/document'
	require 'digest/md5'

	module Delicious

	# Get a list of popular urls (title and link)
	def get_popular( tag = "" )
	popular = []
	url = "http://del.icio.us/rss/popular/#{tag}"

	response = Net::HTTP.get_response(URI.parse(url)).body
	doc = REXML::Document.new(response)

	doc.elements.each("//item") do \|item\|
	popular << { "title" => item.elements["title"].text , "href" => item.elements["link"].text }
	end
	sleep 1
	return popular
	end

	# Get a list of users that posted the url
	def get_urlposts( url )
	urlposts = []
	urlcode = Digest::MD5.hexdigest(url)
	url = "http://feeds.delicious.com/rss/url/#{urlcode}"

	response = Net::HTTP.get_response(URI.parse(url)).body
	doc = REXML::Document.new(response)

	doc.elements.each("//item") do \|item\|
	urlposts << { "user" => item.elements["dc:creator"].text }
	end

	sleep 1
	return urlposts
	end

	# Get a list of urls by username
	def get_userposts( user )
	posts = []
	url = "http://feeds.delicious.com/rss/#{user}"

	response = Net::HTTP.get_response(URI.parse(url)).body
	doc = REXML::Document.new(response)

	doc.elements.each("//item") do \|item\|
	posts << { "href" => item.elements["link"].text }
	end

	sleep 1
	return posts
	end
	end

	# ----------------------------------------------------------------------
	# A del.icio.us link recommendation engine
	# ----------------------------------------------------------------------
	class DeliciousRec
	include Delicious

	# returns a dictionary of users, each pointing to empty hash
	def initializeUserDict( tag, count = 1 )
	user_dict = {}
	# get the top 'count' popular posts
	for p1 in get_popular(tag)[0,count]
	# find all users who posted this
	for p2 in get_urlposts(p1["href"])
	user = p2["user"]
	user_dict[user] = {}
	end
	end
	return user_dict
	end

	def fillItems( user_dict )
	all_items = {}
	# Find links posted by all users
	for user in user_dict.keys
	for attempt in 1..3
	begin
	puts "fetching for #{user}"
	posts = get_userposts(user)
	puts "fetched posts for #{user}"
	sleep 2
	break
	rescue
	puts "Failed user #{user}, retrying"
	sleep 4
	end
	end

	for post in posts
	url = post["href"]
	user_dict[user][url] = 1
	all_items[url] = 1
	end
	end

	# Fill in missing items with 0
	for ratings in user_dict.values
	for item in all_items.keys
	ratings[item] = 0 if !ratings.include? item
	end
	end
	end
	end


	class App

	def run
	# A dictionary of movie critics and their ratings of a small
	# set of movies
	critics = {'Lisa Rose'=> {'Lady in the Water'=> 2.5, 'Snakes on a Plane'=> 3.5,
	'Just My Luck'=> 3.0, 'Superman Returns'=> 3.5, 'You, Me and Dupree'=> 2.5,
	'The Night Listener'=> 3.0},
	'Gene Seymour'=> {'Lady in the Water'=> 3.0, 'Snakes on a Plane'=> 3.5,
	'Just My Luck'=> 1.5, 'Superman Returns'=> 5.0, 'The Night Listener'=> 3.0,
	'You, Me and Dupree'=> 3.5},
	'Michael Phillips'=> {'Lady in the Water'=> 2.5, 'Snakes on a Plane'=> 3.0,
	'Superman Returns'=> 3.5, 'The Night Listener'=> 4.0},
	'Claudia Puig'=> {'Snakes on a Plane'=> 3.5, 'Just My Luck'=> 3.0,
	'The Night Listener'=> 4.5, 'Superman Returns'=> 4.0,
	'You, Me and Dupree'=> 2.5},
	'Mick LaSalle'=> {'Lady in the Water'=> 3.0, 'Snakes on a Plane'=> 4.0,
	'Just My Luck'=> 2.0, 'Superman Returns'=> 3.0, 'The Night Listener'=> 3.0,
	'You, Me and Dupree'=> 2.0},
	'Jack Matthews'=> {'Lady in the Water'=> 3.0, 'Snakes on a Plane'=> 4.0,
	'The Night Listener'=> 3.0, 'Superman Returns'=> 5.0, 'You, Me and Dupree'=> 3.5},
	'Toby'=> {'Snakes on a Plane'=>4.5,'You, Me and Dupree'=>1.0,'Superman Returns'=>4.0}
	}


	recommendations = Recommendations.new

	# Test the Euclidean score code
	puts "The Euclidean distance score is #{recommendations.sim_distance( critics , "Lisa Rose", "Gene Seymour")}"
	# Test the Pearson score
	puts "The Pearson score is #{recommendations.sim_pearson( critics , "Lisa Rose" , "Gene Seymour" )}"
	# Test the topMatches
	puts recommendations.topMatches(critics,"Toby", 3)
	# Try getting recommendations
	puts recommendations.getRecommendations(critics,"Toby")
	# Transform the preferences, get recommendation
	movies = recommendations.transformPrefs( critics )
	puts recommendations.topMatches( movies , "Superman Returns")

	itemsim = recommendations.calculateSimilarItems(critics)
	puts itemsim
	puts recommendations.getRecommendedItems(critics, itemsim, "Toby")

	# -------------------------------------------------------------------
	# del.icio.us examples
	delicious = DeliciousRec.new
	delusers = delicious.initializeUserDict("programming")
	delicious.fillItems(delusers)
	# pick a user at random
	user = delusers.keys[rand(delusers.length - 1)]
	puts "user is #{user}"
	puts recommendations.topMatches(delusers,user)
	puts recommendations.getRecommendations(delusers,user)[0,10]
	url = recommendations.getRecommendations(delusers,user)[0][1]
	puts "the url is #{url}"
	puts recommendations.topMatches(recommendations.transformPrefs(delusers),url)

	# --------------------------------------------------------------------
	# Movie Lens examples
	##prefs = recommendations.loadMovieLens()
	#puts prefs["87"]
	#puts recommendations.getRecommendations(prefs, "87")[0,30]
	##itemsim = recommendations.calculateSimilarItems(prefs, n=50)
	##puts recommendations.getRecommendedItems(prefs, itemsim, "87")[0,30]
	end
	end

	app = App.new
	app.run