Created
July 25, 2010 01:39
-
-
Save satojkovic/489178 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"Lisa Rose" : { | |
"Lady in the Water" : 2.5, | |
"Snakes on a Plane" : 3.5, | |
"Just My Luck" : 3.0, | |
"Superman Returns" : 3.5, | |
"You, Me and Dupree" : 2.5, | |
"The Night Listener" : 3.0 | |
}, | |
"Gene Seymour" : { | |
"Lady in the Water" : 3.0, | |
"Snakes on a Plane" : 3.5, | |
"Just My Luck" : 1.5, | |
"Superman Returns" : 5.0, | |
"The Night Listener" : 3.0, | |
"You, Me and Dupree" : 3.5 | |
}, | |
"Michael Phillips" : { | |
"Lady in the Water" : 2.5, | |
"Snakes on a Plane" : 3.0, | |
"Superman Returns" : 3.5, | |
"The Night Listener" : 4.0 | |
}, | |
"Claudia Puig" : { | |
"Snakes on a Plane" : 3.5, | |
"Just My Luck" : 3.0, | |
"The Night Listener" : 4.5, | |
"Superman Returns" : 4.0, | |
"You, Me and Dupree" : 2.5 | |
}, | |
"Mick LaSalle" : { | |
"Lady in the Water" : 3.0, | |
"Snakes on a Plane" : 4.0, | |
"Just My Luck" : 2.0, | |
"Superman Returns" : 3.0, | |
"The Night Listener" : 3.0, | |
"You, Me and Dupree" : 2.0 | |
}, | |
"Jack Matthews" : { | |
"Lady in the Water" : 3.0, | |
"Snakes on a Plane" : 4.0, | |
"The Night Listener" : 3.0, | |
"Superman Returns" : 5.0, | |
"You, Me and Dupree" : 3.5 | |
}, | |
"Toby" : { | |
"Snakes on a Plane" : 4.5, | |
"Superman Returns" : 4.0, | |
"You, Me and Dupree" : 1.0 | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/perl | |
use strict; | |
use warnings; | |
use JSON; | |
use LWP::UserAgent; | |
use URI::Escape; | |
use utf8; | |
use Encode; | |
use Digest::MD5 qw(md5 md5_hex md5_base64); | |
use MyRecommendations; | |
# Popular bookmarkを取得する関数 | |
sub get_popular { | |
my ($tag, $count) = @_; | |
# Popular bookmarksをJSON形式で取得 from Delicious | |
my $tagd = uri_escape(encode('utf-8', $tag)); # URLエンコード | |
my $base = "http://feeds.delicious.com/v2/json/popular"; | |
my $url = sprintf("%s/%s?count=%d", $base, $tagd, $count); | |
my $ua = LWP::UserAgent->new; | |
my $req = HTTP::Request->new('GET', $url); | |
my $res = $ua->request($req); # ハッシュを要素に持つ配列が得られる | |
unless( $res->is_success ) { | |
die "Request failed : $url"; | |
} | |
else { | |
my $r = decode_json(decode_utf8($res->content)); | |
return $r; # 配列のリファレンス | |
} | |
} | |
# Recent bookmark for a URLを取得する関数 | |
sub get_urlposts { | |
my $urls = shift; | |
# Recent Bookmars for a URL | |
my $base = "http://feeds.delicious.com/v2/json/url"; | |
my $count = @{$urls}; | |
my $url; | |
my $ua = LWP::UserAgent->new; | |
my $req; | |
my $res; | |
my %rb = (); | |
for( my $i=0; $i<$count; $i++) { | |
# Request | |
$url = sprintf("%s/%s", $base, md5_hex($urls->[$i]->{'u'})); | |
$req = HTTP::Request->new('GET', $url); | |
$res = $ua->request($req); # ハッシュを要素に持つ配列が得られる | |
unless( $res->is_success ) { | |
die "Request failed : $url"; | |
} | |
else { | |
my $r = decode_json(decode_utf8($res->content)); | |
$rb{$urls->[$i]->{'u'}} = $r; | |
} | |
} | |
return \%rb; | |
} | |
# Popular bookmarkをブックマークしたユーザリスト(user dict)を作成する関数 | |
sub initializeUserDict { | |
my ($tag, $count) = @_; | |
# Popular Bookmarksを取得 | |
my $pb = get_popular($tag, $count); | |
# Recent bookmark for a URLを取得 | |
my $rb = get_urlposts(\%{$pb}); | |
# ユーザをキーにしたハッシュ(user dict)を作成 | |
my %ud = (); | |
foreach my $key (keys %{$rb}) { | |
my $loop = @{$rb->{$key}}; | |
for(my $i=0; $i<$loop; $i++) { | |
$ud{$rb->{$key}->[$i]->{'a'}} = (); | |
} | |
} | |
return \%ud; | |
} | |
# user dictに評価値を埋める関数 | |
sub fillItems { | |
my $ud = shift; | |
# bookmark for specific user | |
my $base = "http://feeds.delicious.com/v2/json"; | |
my $url; | |
my $ua = LWP::UserAgent->new; | |
my $req; | |
my $res; | |
my $r; | |
my $count; | |
my %all_items = (); | |
foreach my $user (keys %{ $ud }) { | |
$url = sprintf("%s/%s", $base, $user); | |
$req = HTTP::Request->new('GET', $url); | |
$res = $ua->request($req); | |
unless( $res->is_success ) { | |
warn "Request Failed : " . $url . "\n"; | |
} | |
else { | |
$r = decode_json(decode_utf8($res->content)); | |
} | |
# userがブックマークしているurlは1を埋める | |
$count = @{$r}; | |
for(my $i=0; $i<$count; $i++) { | |
$ud->{$user}->{$r->[$i]->{'u'}} = 1.0; | |
$all_items{$r->[$i]->{'u'}} = 1; | |
} | |
} | |
# userがブックマークしていないurlは0を埋める | |
foreach my $user (keys %{$ud}) { | |
foreach my $key (keys %all_items) { | |
# keyが存在しなければ新たに追加 | |
if( not exists $ud->{$user}->{$key} ) { | |
$ud->{$user}->{$key} = 0.0; | |
} | |
} | |
} | |
} | |
## | |
## メイン | |
## | |
my $tag = 'perl'; | |
my $count = 5; | |
my $udict = initializeUserDict($tag, $count); | |
fillItems($udict); | |
# データセット表示(一部) | |
my $c1 = 0; | |
my $c2 = 0; | |
foreach my $user (keys %{$udict}) { | |
print "$user : \n"; | |
foreach my $key (keys %{ $udict->{$user} }) { | |
if( $udict->{$user}->{$key} == 1.0 and $c1 < 2 ) { | |
print "\t" . $key . ":" . $udict->{$user}->{$key} . "\n"; | |
$c1++; | |
} | |
elsif( $udict->{$user}->{$key} == 0.0 and $c2 < 3 ) { | |
print "\t" . $key . ":" . $udict->{$user}->{$key} . "\n"; | |
$c2++; | |
} | |
} | |
last; | |
} | |
my @users = keys(%{$udict}); | |
my $unum = @users; | |
my $num = int(rand($unum)); | |
my $n = 5; | |
# Similar User | |
print "Similar user for $users[$num] :\n"; | |
my @match_users = topMatches($udict, $users[$num], $n, \&sim_pearson); | |
for (my $i=0; $i<$n; $i++) { | |
foreach my $key (keys %{ $match_users[$i] }) { | |
print "\t" . "$key\t : $match_users[$i]{$key}\n"; | |
} | |
} | |
# リンクを推薦 | |
print "Recommendations for $users[$num] :\n"; | |
my @recommendations = getRecommendations($udict, $users[$num], \&sim_pearson); | |
for(my $i=0; $i<$n; $i++) { | |
foreach my $key (keys %{ $recommendations[$i] }) { | |
print "\t" . uri_unescape($key) . "\t : $recommendations[$i]{$key}\n"; | |
} | |
} | |
# my $i = 0; | |
# foreach my $key (keys %{ $udict->{$user} }) { | |
# print $i . " : " . $key . ":" . $udict->{$user}->{$key} . "\n"; | |
# $i++; | |
# } | |
# foreach my $key (keys %$udict) { | |
# print $key . "\n"; | |
# } | |
#my $r = sim_distance($udict, 'HSN', 'ShelteringTheSky'); | |
#my $r = sim_pearson($udict, 'hayajo', 'kanisan'); | |
#print $r . "\n"; | |
# my $user = 'ftnk'; | |
#my $r = sim_pearson($udict, 'zjape', 'yosshi'); | |
# foreach my $key (keys %{$udict}) { | |
# print $key . "\n"; | |
# } | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/perl | |
use strict; | |
use warnings; | |
use JSON; | |
# person1とperson2のユークリッド距離を基にした類似性スコアを返す | |
sub sim_distance { | |
my ($prefs, $person1, $person2) = @_; | |
# 二人とも評価しているアイテムのリストを得る | |
my %si = (); | |
foreach my $key (keys %{ $prefs->{$person1} } ) { | |
if ($prefs->{$person2}->{$key}) { | |
$si{$key} = 1; | |
} | |
} | |
# 両者ともに評価しているものが一つもなければ0を返す | |
my $n = keys %si; | |
if ($n == 0) { return 0; } | |
# すべての差の平方を足し合わせる | |
my $sum_of_squares; | |
foreach my $key (keys %{ $prefs->{$person1} } ) { | |
if ($prefs->{$person2}->{$key}) { | |
$sum_of_squares += ( ($prefs->{$person1}->{$key} - $prefs->{$person2}->{$key}) ** 2 ); | |
} | |
} | |
return (1 / (1 + $sum_of_squares)); | |
} | |
# person1とperson2のピアソン相関係数を返す | |
sub sim_pearson { | |
my ($prefs, $person1, $person2) = @_; | |
# 二人共評価しているアイテムのリストを得る | |
my %si = (); | |
foreach my $item (keys %{ $prefs->{$person1} }) { | |
if ( $prefs->{$person2}->{$item} ) { | |
$si{$item} = 1; | |
} | |
} | |
# 共に評価しているものが一つもなければ0を返す | |
my $n = keys %si; | |
if ($n == 0) { return 0; } | |
# person1とperson2の評価の平均を求める | |
my $p1ave = 0.0; | |
my $p2ave = 0.0; | |
foreach my $item (keys %si) { | |
$p1ave += $prefs->{$person1}->{$item}; | |
$p2ave += $prefs->{$person2}->{$item}; | |
} | |
$p1ave = $p1ave / $n; | |
$p2ave = $p2ave / $n; | |
# 偏差の二乗和、偏差積の和を求める | |
my $Sp1p1 = 0.0; | |
my $Sp2p2 = 0.0; | |
my $Sp1p2 = 0.0; | |
foreach my $item (keys %si) { | |
$Sp1p1 += (($prefs->{$person1}->{$item} - $p1ave)**2); | |
$Sp2p2 += (($prefs->{$person2}->{$item} - $p2ave)**2); | |
$Sp1p2 += (($prefs->{$person1}->{$item} - $p1ave) * | |
($prefs->{$person2}->{$item} - $p2ave)); | |
} | |
# 相関係数を求める | |
my $r = $Sp1p2 / sqrt( $Sp1p1 * $Sp2p2 ); | |
} | |
# personに最もマッチするトップn人を返す | |
sub topMatches { | |
my ($prefs, $person, $n, $similarity) = @_; | |
# person以外のユーザとのスコア算出 | |
my %scores = (); | |
foreach my $user (keys %$prefs) { | |
if ($user ne $person) { | |
$scores{$user} = &$similarity($prefs, $person, $user); | |
} | |
} | |
# スコアが高い順にソートする | |
my @sortedscores = (); | |
foreach my $user (sort {$scores{$b} <=> $scores{$a}} keys %scores) { | |
my $h = { | |
$user => $scores{$user} | |
}; | |
push(@sortedscores, $h); | |
} | |
# トップn人を返す | |
my @result = (); | |
for (my $i=0; $i<$n; $i++) { | |
push(@result, $sortedscores[$i]); | |
} | |
return \@result; | |
} | |
# personへ推薦するアイテムを返す | |
sub getRecommendations { | |
my ($prefs, $person, $similarity) = @_; | |
# ユーザ間の類似性スコアとアイテムの評価点から、対象ユーザが評価していないアイテムの評価値を計算 | |
# 評価値を全ユーザの類似性スコアを合計した値で割った重み付き平均を計算し、値の高い順にソートしてランキングを作成 | |
my %totals = (); | |
my %simSums = (); | |
foreach my $user (keys %$prefs) { | |
next if ($user eq $person); | |
# ユーザ間の類似性スコア算出 | |
my $score; | |
$score = &$similarity($prefs, $person, $user); | |
# 0以下のスコアは無視する | |
next if ($score <= 0); | |
# 対象ユーザが評価していないアイテムについて評価値を計算 | |
foreach my $item (keys %{ $prefs->{$user} }) { | |
if ((not exists $prefs->{$person}->{$item}) || ($prefs->{$person}->{$item} == 0)) { | |
# 評価値 | |
$totals{$item} += $prefs->{$user}->{$item} * $score; | |
# 類似性スコアの合計値 | |
$simSums{$item} += $score; | |
} | |
} | |
} | |
# 重み付き平均の算出 | |
my %li = (); | |
foreach my $item (keys %totals) { | |
$li{$item} = $totals{$item} / $simSums{$item}; | |
} | |
# 値の高い順にソートする | |
my @rankings = (); | |
foreach my $item (sort {$li{$b} <=> $li{$a}} keys %li) { | |
my $h = { | |
$item => $li{$item} | |
}; | |
push(@rankings, $h); | |
} | |
return \@rankings; | |
} | |
# データのitemとpersonを入れ替える | |
sub transformPrefs { | |
my ($prefs) = @_; | |
my %result = (); | |
foreach my $person (keys %{$prefs}) { | |
foreach my $item (keys %{ $prefs->{$person} }) { | |
$result{$item}{$person} = $prefs->{$person}->{$item}; | |
} | |
} | |
return %result; | |
} | |
# JSON形式をperlデータに変換 | |
sub conv_json2perl { | |
my $json_file = shift; | |
# ファイルのオープン | |
my $json_text; | |
open(my $fh, "<", $json_file) or die "$!"; | |
{ | |
# ファイル全体を読み込む | |
local $/ = undef; | |
$json_text = <$fh>; | |
} | |
close $fh; | |
# decode_json | |
my $perl_data = decode_json($json_text); | |
# ハッシュのリファレンスを返す | |
return $perl_data; | |
} | |
# 各アイテムに似ているアイテムのリストを作成する | |
sub calculateSimilarItems { | |
my ($prefs, $n, $similarity) = @_; | |
# ユーザベースのデータをアイテムベースに入れ替える | |
my %itemPrefs = transformPrefs($prefs); | |
# リスト作成 | |
my %result = (); | |
my $c = 0; | |
foreach my $item (keys %itemPrefs) { | |
# 巨大なデータセットのときはステータス表示 | |
$c += 1; | |
if( ($c % 100) == 0 ) { | |
printf("%d / %d\n", $c, scalar(keys %itemPrefs)); | |
} | |
# アイテムに似ているアイテムを探す | |
my $scores = topMatches(\%itemPrefs, $item, $n, $similarity); | |
$result{$item} = $scores; | |
} | |
return \%result; | |
} | |
# アイテムベースのレコメンドを行う | |
sub getRecommendedItems { | |
my ($prefs, $itemMatch, $user) = @_; | |
# あるユーザの評価したアイテムと点数へのリファレンス | |
my $userRatings = $prefs->{$user}; | |
my %scores = (); | |
my %totalSim = (); | |
# $userに評価されたアイテムをループする | |
foreach my $item (keys %{ $userRatings }) { | |
# そのアイテムの点数 | |
my $rating = $userRatings->{$item}; | |
# アイテム間類似度データの中でこのアイテムに似ているアイテムについて処理 | |
# これはcalculateSimilarItemsで作られたデータでハッシュの配列のハッシュ | |
my $cnt = @{$itemMatch->{$item}}; | |
for( my $i=0; $i<$cnt; $i++) { | |
foreach my $item2 (keys %{ $itemMatch->{$item}->[$i] }) { | |
my $similarity = $itemMatch->{$item}->[$i]{$item2}; | |
# このアイテムに対してユーザが既に評価を行っていたら無視する | |
if( exists $userRatings->{$item2} ) { last; } | |
# 点数と類似度を掛けあわせたものの合計で重み付けする | |
$scores{$item2} += ($similarity * $rating); | |
# すべての類似度の合計 | |
$totalSim{$item2} += $similarity; | |
} | |
} | |
} | |
# 類似度の合計で正規化 | |
my %normScores = (); | |
foreach my $item (keys %scores) { | |
$normScores{$item} = $scores{$item} / $totalSim{$item}; | |
} | |
# 降順に並べたランキングを返す | |
my @rankings = (); | |
foreach my $key (sort {$normScores{$b} <=> $normScores{$a}} keys %normScores) { | |
my $h = { | |
$key => $normScores{$key} | |
}; | |
push(@rankings, $h); | |
} | |
return \@rankings; | |
} | |
# MovieLensのデータセットを読み込む関数 | |
sub loadMovieLens { | |
my $data = "u.item"; | |
my $data2 = "u.data"; | |
# 映画のタイトルを得る | |
my %movies = (); | |
open(my $fh, "<", $data) or die "$!"; | |
while(<$fh>) { | |
chomp; | |
my @fields = split /\|/, $_; | |
my $movieid = $fields[0]; | |
my $title = $fields[1]; | |
$movies{$movieid} = $title; # 最初がmovieidで次がタイトル | |
} | |
close $fh; | |
# ユーザ毎の映画への評価値を得る | |
my %prefs = (); | |
open($fh, "<", $data2) or die "$!"; | |
while(<$fh>) { | |
chomp; | |
# user|movieid|rating|ts | |
my @fields = split /\s+/, $_; | |
my $user = $fields[0]; | |
my $movieid = $fields[1]; | |
my $rating = $fields[2]; | |
$prefs{$user}{$movies{$movieid}} = $rating; | |
} | |
close $fh; | |
return \%prefs; | |
} | |
# | |
# 推薦を行うメイン | |
# | |
my $u = '87'; | |
my $cnt = 30; | |
my $t1 = (times)[0]; | |
my $prefs = loadMovieLens(); # データのロード | |
my $t2 = (times)[0]; | |
my $load_time = $t2 - $t1; | |
$t1 = (times)[0]; | |
my $urankings = getRecommendations($prefs, $u, \&sim_distance); | |
$t2 = (times)[0]; | |
my $urank_time = $t2 - $t1; | |
print "User-based Recommendations:\n"; | |
for(my $i=0; $i<$cnt; $i++) { | |
foreach my $item (keys %{ $urankings->[$i] }) { | |
print "\t" . $item . " " . $urankings->[$i]->{$item} . "\n"; | |
} | |
} | |
$t1 = (times)[0]; | |
my $itemSim = calculateSimilarItems($prefs, $cnt, \&sim_distance); | |
$t2 = (times)[0]; | |
my $conv_time = $t2 - $t1; | |
$t1 = (times)[0]; | |
my $irankings = getRecommendedItems($prefs, $itemSim, $u); | |
$t2 = (times)[0]; | |
my $irank_time = $t2 - $t1; | |
print "Item-based Recommendations:\n"; | |
for(my $i=0; $i<$cnt; $i++) { | |
foreach my $item (keys %{ $irankings->[$i] }) { | |
print "\t" . $item . " " . $irankings->[$i]->{$item} . "\n"; | |
} | |
} | |
print "Processed time:\n"; | |
print "\t" . "load time = " . $load_time . "\n"; | |
print "\t" . "urank time = " . $urank_time . "\n"; | |
print "\t" . "conv time = " . $conv_time . "\n"; | |
print "\t" . "irank time = " . $irank_time . "\n"; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment