use movielens;
select max(movieid) from ratings;
> 3952
-- list popular items
WITH t as (
select
movieid as itemid,
count(1) as cnt
from
ratings
group by
movieid
),
popular as (
select
itemid,
cnt,
row_number() OVER (order by cnt desc) as rownum
from
t
order by
cnt desc
limit 10
)
select map_values(to_ordered_map(rownum, itemid))
from popular;
> [2858,260,1196,1210,480,2028,589,2571,1270,593]
-- 訓練データ
drop table ranking_train;
create table ranking_train
as
select
userid,
-- bits_collect(movieid) as itemids
collect_list(movieid) as itemids
-- sort_array(collect_set(movieid)) as itemids
from
training t
where
rating >= 3
group by
userid
;
-- 検証データ
drop table ranking_test;
create table ranking_test
as
select
userid,
itemids,
size(itemids) as num_pos_items
from (
select
userid,
collect_list(movieid) as itemids
from
testing t1
where
rating >= 3
group by
userid
) t2
;
-- 学習
drop table bprmf_model;
CREATE TABLE bprmf_model
as
WITH t as (
select
bpr_sampling(userid, itemids, "-pair_sampling -sampling_rate 3.0 -max_itemid 3952")
-- bpr_sampling(userid, itemids, "-sampling_rate 5.0")
-- bpr_sampling(userid, itemids, "-sampling_rate 5.0 -pair_sampling")
as (userid, pos_item, neg_item)
from
ranking_train
)
select
train_bprmf(userid, pos_item, neg_item, "-factor 8 -iterations 50 -boldDriver")
-- train_bprmf(userid, pos_item, neg_item, "-iterations 50 -boldDriver")
-- train_bprmf(userid, pos_item, neg_item, "-iterations 100 -reg 0.1")
-- train_bprmf(userid, pos_item, neg_item, "-iterations 100 -boldDriver")
from
t
;
drop table bprmf_model;
CREATE TABLE bprmf_model
as
WITH t as (
select
t1.userid,
t2.pos_item,
t2.neg_item
from
ranking_train t1
LATERAL VIEW item_pairs_sampling(t1.itemids, 3952, "-bitset_input") t2 as pos_item, neg_item
CLUSTER BY
rand(1)
)
select
-- train_bprmf(userid, pos_item, neg_item, "-iterations 100 -reg 0.1")
train_bprmf(userid, pos_item, neg_item, "-iterations 100 -boldDriver")
from
t
;
-- テストデータ
-- top-kのアイテム推薦
set hive.mapjoin.optimized.hashtable=false;
set mapreduce.framework.name=yarn;
set hive.execution.engine=mr;
set mapreduce.framework.name=yarn-tez;
set hive.execution.engine=tez;
drop table bpr_rec;
create table bpr_rec
as
WITH input as (
select
t1.userid,
t1.num_pos_items,
t2.itemids as excludeItems
-- bits_or(
-- -- top-10 popular items
-- to_bits(array(2858,260,1196,1210,480,2028,589,2571,1270,593)),
-- -- exclude items used in training
-- t2.itemids
-- ) as excludeItems
FROM
ranking_test t1
LEFT OUTER JOIN ranking_train t2 ON (t1.userid = t2.userid)
),
test as (
select
t1.userid,
t3.itemid,
t1.num_pos_items
from
input t1
-- LATERAL VIEW populate_not_in(array(), 3952, "-bitset_input") t3 as itemid
LATERAL VIEW populate_not_in(t1.excludeItems, 3952) t3 as itemid
-- LATERAL VIEW populate_not_in(t1.excludeItems, 3952, "-bitset_input") t3 as itemid
),
predict as (
select
t2.userid,
t2.itemid,
-- bprmf_predict(t2.Pu, p2.Qi) as score, -- no bias
bprmf_predict(t2.Pu, p2.Qi, p2.Bi) as score,
t2.num_pos_items
from (
select
t1.userid,
t1.itemid,
p1.Pu,
t1.num_pos_items
from
test t1 LEFT OUTER JOIN bprmf_model p1
ON (t1.userid = p1.idx)
) t2
LEFT OUTER JOIN bprmf_model p2
ON (t2.itemid = p2.idx)
),
recommend as (
select
each_top_k(10, userid, score, userid, itemid)
-- each_top_k(num_pos_items, userid, score, userid, itemid)
as (rank, score, userid, itemid)
from
predict
)
select
userid,
map_values(to_ordered_map(rank, itemid)) as rec_items
-- スコアが同じもとで問題
-- map_values(to_ordered_map(score, itemid, true)) as rec_items
from
recommend
group by
userid
;
select
r.userid,
array_intersect(r.rec_items, itemids)
from
bpr_rec r
JOIN ranking_test t ON (r.userid = t.userid)
where
r.userid <= 10;
1 [588]
2 []
3 [480,1270,1291]
4 []
5 [2959]
6 [920]
7 [1573]
8 [16,2268,527]
9 [480,1617,318]
10 [1136,594,1028,1270,1097,1210,1291,2396]
select * from bpr_rec where userid <= 10;
1 [3408,2396,3176,318,2858,2908,3148,527,2762,593]
2 [3252,2067,3256,454,2728,1674,1608,3753,3408,1233]
3 [733,1527,2628,260,110,377,1196,589,1721,356]
4 [608,260,1196,110,858,2028,1198,1210,527,593]
5 [778,3129,1089,1719,2599,994,3852,3481,2692,1594]
6 [597,7,1307,2151,2302,356,1393,1888,539,2000]
7 [1580,589,480,2571,457,2028,2628,2396,377,260]
8 [3252,319,1488,1704,778,175,36,293,1682,1672]
9 [3176,3408,2858,593,608,1358,2599,527,318,2762]
10 [2987,1028,260,1197,2012,2406,1270,2080,3175,1210]
select userid, itemids from ranking_test where userid <= 10;
1 [1246,1961,938,588,2762,150,2340]
2 [1225,3578,3468,1385,1244,3471,1246,2571,1372,3451,1293,1544,1537,1527,235,1265,163,3105,1084,1247,3257,2490,1917,1253]
3 [2858,2081,1291,590,1197,1270,3534,3868,2470,3671,480]
4 [1954]
5 [2013,1897,2291,1243,2571,2323,1529,1192,1921,1500,3409,1909,34,1923,1429,2390,1513,176,2959,1732,2289,913,1213,3260,224,1635,1580,2987,1684]
6 [1441,2469,2100,3534,1101,368,17,3682,920,3685]
7 [3256,3107,1573,1221]
8 [1059,1621,3500,39,24,1653,562,42,3260,1730,1488,1678,3256,345,3259,1711,476,3265,3267,1027,1660,2268,16,1411,3418,2571,163,741,1573,527,282,4]
9 [1617,480,377,3178,1089,1210,3948,1921,529,994,590,1148,2355,318,1923,1233,3270,597,2762,3408,1060,1721]
10 [2003,1023,3035,1210,1566,2012,2091,3198,1294,1339,953,1028,2291,208,2497,2770,3264,1682,2795,2657,2047,3097,858,1287,351,3688,186,2396,2096,3194,1097,1252,594,1394,784,1304,3309,150,116,2336,648,943,333,1345,2496,3255,3868,1025,1129,2135,1254,2021,2108,1136,1270,3247,2002,1135,1721,1948,1291,926,904,2968,1513,1586,3668,592,3095,367,1253,924,920,539]