myui/bprmf.md

## bprmf.md

      
    Raw
  

              bprmf.md
            
          
    use movielens;

select max(movieid) from ratings;
> 3952

-- list popular items
WITH t as (
  select 
    movieid as itemid,
    count(1) as cnt
  from
    ratings
  group by
    movieid
),
popular as (
  select 
    itemid, 
    cnt,
    row_number() OVER (order by cnt desc) as rownum
  from
    t
  order by
    cnt desc
  limit 10
)
select map_values(to_ordered_map(rownum, itemid))
from popular;

> [2858,260,1196,1210,480,2028,589,2571,1270,593]

-- 訓練データ
drop table ranking_train;
create table ranking_train
as
select 
  userid,
  -- bits_collect(movieid) as itemids
  collect_list(movieid) as itemids
  -- sort_array(collect_set(movieid)) as itemids
from
  training t
where 
  rating >= 3
group by
  userid
;

-- 検証データ

drop table ranking_test;
create table ranking_test
as
select
  userid,
  itemids,
  size(itemids) as num_pos_items
from (
  select 
    userid,
    collect_list(movieid) as itemids
  from
    testing t1
  where 
    rating >= 3
  group by
    userid
) t2
;

-- 学習
drop table bprmf_model;
CREATE TABLE bprmf_model
as
WITH t as (
  select
    bpr_sampling(userid, itemids, "-pair_sampling -sampling_rate 3.0 -max_itemid 3952")
--    bpr_sampling(userid, itemids, "-sampling_rate 5.0")
--    bpr_sampling(userid, itemids, "-sampling_rate 5.0 -pair_sampling")
      as (userid, pos_item, neg_item)
  from
    ranking_train
)
select
  train_bprmf(userid, pos_item, neg_item, "-factor 8 -iterations 50 -boldDriver")
--  train_bprmf(userid, pos_item, neg_item, "-iterations 50 -boldDriver")
--  train_bprmf(userid, pos_item, neg_item, "-iterations 100 -reg 0.1")
--  train_bprmf(userid, pos_item, neg_item, "-iterations 100 -boldDriver")
from
  t
;


drop table bprmf_model;
CREATE TABLE bprmf_model
as
WITH t as (
select
  t1.userid,
  t2.pos_item,
  t2.neg_item
from
  ranking_train t1
  LATERAL VIEW item_pairs_sampling(t1.itemids, 3952, "-bitset_input") t2 as pos_item, neg_item
CLUSTER BY
  rand(1)
)
select
--  train_bprmf(userid, pos_item, neg_item, "-iterations 100 -reg 0.1")
  train_bprmf(userid, pos_item, neg_item, "-iterations 100 -boldDriver")
from
  t
;

-- テストデータ

-- top-kのアイテム推薦

set hive.mapjoin.optimized.hashtable=false;

set mapreduce.framework.name=yarn;
set hive.execution.engine=mr;

set mapreduce.framework.name=yarn-tez;
set hive.execution.engine=tez;

drop table bpr_rec;
create table bpr_rec
as
WITH input as (
select
  t1.userid,
  t1.num_pos_items,
  t2.itemids as excludeItems
--  bits_or(
--    -- top-10 popular items
--    to_bits(array(2858,260,1196,1210,480,2028,589,2571,1270,593)),    
--    -- exclude items used in training
--    t2.itemids
--  ) as excludeItems
FROM  
  ranking_test t1 
  LEFT OUTER JOIN ranking_train t2 ON (t1.userid = t2.userid)
),
test as (
select 
  t1.userid,
  t3.itemid,
  t1.num_pos_items
from 
  input t1 
--  LATERAL VIEW populate_not_in(array(), 3952, "-bitset_input") t3 as itemid
  LATERAL VIEW populate_not_in(t1.excludeItems, 3952) t3 as itemid
--  LATERAL VIEW populate_not_in(t1.excludeItems, 3952, "-bitset_input") t3 as itemid
),
predict as (
    select
      t2.userid,
      t2.itemid,
      -- bprmf_predict(t2.Pu, p2.Qi) as score, -- no bias
      bprmf_predict(t2.Pu, p2.Qi, p2.Bi) as score,
      t2.num_pos_items
    from (
      select
        t1.userid, 
        t1.itemid,
        p1.Pu,
        t1.num_pos_items
      from
        test t1 LEFT OUTER JOIN bprmf_model p1
        ON (t1.userid = p1.idx) 
    ) t2 
    LEFT OUTER JOIN bprmf_model p2
    ON (t2.itemid = p2.idx)
),
recommend as (
  select
  each_top_k(10, userid, score, userid, itemid) 
--    each_top_k(num_pos_items, userid, score, userid, itemid) 
      as (rank, score, userid, itemid)
  from
    predict
)
select 
  userid,
  map_values(to_ordered_map(rank, itemid)) as rec_items
  -- スコアが同じもとで問題
  -- map_values(to_ordered_map(score, itemid, true)) as rec_items
from
  recommend
group by
  userid
;

select
  r.userid,
  array_intersect(r.rec_items, itemids)
from
  bpr_rec r
  JOIN ranking_test t ON (r.userid = t.userid)
where
  r.userid <= 10;
1       [588]
2       []
3       [480,1270,1291]
4       []
5       [2959]
6       [920]
7       [1573]
8       [16,2268,527]
9       [480,1617,318]
10      [1136,594,1028,1270,1097,1210,1291,2396]

select * from bpr_rec where userid <= 10;
1       [3408,2396,3176,318,2858,2908,3148,527,2762,593]
2       [3252,2067,3256,454,2728,1674,1608,3753,3408,1233]
3       [733,1527,2628,260,110,377,1196,589,1721,356]
4       [608,260,1196,110,858,2028,1198,1210,527,593]
5       [778,3129,1089,1719,2599,994,3852,3481,2692,1594]
6       [597,7,1307,2151,2302,356,1393,1888,539,2000]
7       [1580,589,480,2571,457,2028,2628,2396,377,260]
8       [3252,319,1488,1704,778,175,36,293,1682,1672]
9       [3176,3408,2858,593,608,1358,2599,527,318,2762]
10      [2987,1028,260,1197,2012,2406,1270,2080,3175,1210]

select userid, itemids from ranking_test where userid <= 10;
1       [1246,1961,938,588,2762,150,2340]
2       [1225,3578,3468,1385,1244,3471,1246,2571,1372,3451,1293,1544,1537,1527,235,1265,163,3105,1084,1247,3257,2490,1917,1253]
3       [2858,2081,1291,590,1197,1270,3534,3868,2470,3671,480]
4       [1954]
5       [2013,1897,2291,1243,2571,2323,1529,1192,1921,1500,3409,1909,34,1923,1429,2390,1513,176,2959,1732,2289,913,1213,3260,224,1635,1580,2987,1684]
6       [1441,2469,2100,3534,1101,368,17,3682,920,3685]
7       [3256,3107,1573,1221]
8       [1059,1621,3500,39,24,1653,562,42,3260,1730,1488,1678,3256,345,3259,1711,476,3265,3267,1027,1660,2268,16,1411,3418,2571,163,741,1573,527,282,4]
9       [1617,480,377,3178,1089,1210,3948,1921,529,994,590,1148,2355,318,1923,1233,3270,597,2762,3408,1060,1721]
10      [2003,1023,3035,1210,1566,2012,2091,3198,1294,1339,953,1028,2291,208,2497,2770,3264,1682,2795,2657,2047,3097,858,1287,351,3688,186,2396,2096,3194,1097,1252,594,1394,784,1304,3309,150,116,2336,648,943,333,1345,2496,3255,3868,1025,1129,2135,1254,2021,2108,1136,1270,3247,2002,1135,1721,1948,1291,926,904,2968,1513,1586,3668,592,3095,367,1253,924,920,539]