Skip to content

Instantly share code, notes, and snippets.

@myui
Last active August 29, 2015 14:14
Show Gist options
  • Save myui/79e06f47562bc8fd1928 to your computer and use it in GitHub Desktop.
Save myui/79e06f47562bc8fd1928 to your computer and use it in GitHub Desktop.
HivemallでMatrix Factorization ref: http://qiita.com/myui/items/dccb4f58799f080e24ab
R \approx P^{T}Q
R^{'}_{ui} = \mu + B_{u} + B_{i} + \overrightarrow{P_{u}}^{T}\overrightarrow{Q_{i}}
select
t2.actual,
mf_predict(t2.Pu, p2.Qi, t2.Bu, p2.Bi, ${mu}) as predicted
from (
select
t1.userid,
t1.movieid,
t1.rating as actual,
p1.Pu,
p1.Bu
from
testing t1 LEFT OUTER JOIN sgd_model p1
ON (t1.userid = p1.idx)
) t2
LEFT OUTER JOIN sgd_model p2
ON (t2.movieid = p2.idx);
select
mae(predicted, actual) as mae,
rmse(predicted, actual) as rmse
from (
select
t2.actual,
mf_predict(t2.Pu, p2.Qi, t2.Bu, p2.Bi, ${mu}) as predicted
from (
select
t1.userid,
t1.movieid,
t1.rating as actual,
p1.Pu,
p1.Bu
from
testing t1 LEFT OUTER JOIN sgd_model p1
ON (t1.userid = p1.idx)
) t2
LEFT OUTER JOIN sgd_model p2
ON (t2.movieid = p2.idx)
) t;
set hivevar:userid=1;
set hivevar:topk=5;
select
t1.movieid,
mf_predict(t2.Pu, t1.Qi, t2.Bu, t1.Bi, ${mu}) as predicted
from (
select
idx as movieid,
Qi,
Bi
from
sgd_model p
where
p.idx NOT IN
(select movieid from training where userid=${userid})
) t1 CROSS JOIN (
select
Pu,
Bu
from
sgd_model
where
idx = ${userid}
) t2
order by
predicted DESC
limit ${topk};
select
mae(predicted, actual) as mae,
rmse(predicted, actual) as rmse
from (
select
t2.actual,
mf_predict(t2.Pu, p2.Qi, t2.Bu, p2.Bi, ${mu}) as predicted
from (
select
t1.userid,
t1.movieid,
t1.rating as actual,
p1.Pu,
p1.Bu
from
testing t1 LEFT OUTER JOIN sgd_model p1
ON (t1.userid = p1.idx)
) t2
LEFT OUTER JOIN sgd_model p2
ON (t2.movieid = p2.idx)
) t;
\mathrm{MAE} = \frac{1}{n}\sum_{i=1}^n \left| f_i-y_i\right| =\frac{1}{n}\sum_{i=1}^n \left| e_i \right|
set hivevar:kfold=10;
set hivevar:seed=31;
-- Adding group id (gid) to each training instance
drop table ratings_groupded;
create table ratings_groupded
as
select
rand_gid2(${kfold}, ${seed}) gid, -- generates group id ranging from 1 to 10
userid,
movieid,
rating
from
ratings
cluster by gid, rand(${seed});
\mathrm{MAE} = \frac{1}{n}\sum_{i=1}^n \left| f_i-y_i\right| =\frac{1}{n}\sum_{i=1}^n \left| e_i \right|
set hivevar:kfold=10;
set hivevar:seed=31;
-- Adding group id (gid) to each training instance
drop table ratings_groupded;
create table ratings_groupded
as
select
rand_gid2(${kfold}, ${seed}) gid, -- generates group id ranging from 1 to 10
userid,
movieid,
rating
from
ratings
cluster by gid, rand(${seed});
-- latent factors
set hivevar:factor=10;
-- maximum number of iterations
set hivevar:iters=50;
-- regularization parameter
set hivevar:lambda=0.05;
-- learning rate
set hivevar:eta=0.005;
-- conversion rate (if changes between iterations became less or equals to ${cv_rate}, the training will stop)
set hivevar:cv_rate=0.001;
select avg(rating) from ratings;
-- mean rating value (Optional but recommended to set ${mu})
set hivevar:mu=3.581564453029317;
min_{P,Q,B} \sum_{(u,i) \in R}{(R_{ui}-R^{'}_{ui})^{2} + \lambda ({||B_{u}||}^{2} + {||B_{i}||}^{2} + {||\overrightarrow{P_{u}}||}^{2} + {||\overrightarrow{Q_{i}}||}^{2}})
create database movielens;
use movielens;
CREATE EXTERNAL TABLE ratings (
userid INT,
movieid INT,
rating INT,
tstamp STRING
) ROW FORMAT DELIMITED
FIELDS TERMINATED BY '#'
STORED AS TEXTFILE
LOCATION '/dataset/movielens/ratings';
SET hivevar:seed=31;
CREATE TABLE ratings2
as
select
rand(${seed}) as rnd,
userid,
movieid,
rating
from
ratings;
CREATE TABLE training
as
select * from ratings2
order by rnd DESC
limit 800000;
CREATE TABLE testing
as
select * from ratings2
order by rnd ASC
limit 200209;
sed 's/::/#/g' ratings.dat | hadoop fs -put - /dataset/movielens/ratings/ratings.t
SET hivevar:seed=31;
CREATE TABLE ratings2
as
select
rand(${seed}) as rnd,
userid,
movieid,
rating
from
ratings;
CREATE TABLE training
as
select * from ratings2
order by rnd DESC
limit 800000;
CREATE TABLE testing
as
select * from ratings2
order by rnd ASC
limit 200209;
-- 1) mean rating
set hivevar:mu=3.593565;
-- 2) number of factors
set hivevar:factor=10;
-- 3) maximum number of training iterations
set hivevar:iters=50;
-- 1) mean rating
set hivevar:mu=3.593565;
-- 2) number of factors
set hivevar:factor=10;
-- 3) maximum number of training iterations
set hivevar:iters=50;
create table sgd_model
as
select
idx,
array_avg(u_rank) as Pu,
array_avg(m_rank) as Qi,
avg(u_bias) as Bu,
avg(m_bias) as Bi
from (
select
train_mf_sgd(userid, movieid, rating, "-factor ${factor} -mu ${mu} -iter ${iters}") as (idx, u_rank, m_rank, u_bias, m_bias)
from
training
) t
group by idx;
select avg(rating) from training;
create table sgd_model
as
select
idx,
array_avg(u_rank) as Pu,
array_avg(m_rank) as Qi,
avg(u_bias) as Bu,
avg(m_bias) as Bi
from (
select
train_mf_sgd(userid, movieid, rating, "-factor ${factor} -mu ${mu} -iter ${iters}") as (idx, u_rank, m_rank, u_bias, m_bias)
from
training
) t
group by idx;
create table sgd_model
as
select
idx,
array_avg(u_rank) as Pu,
array_avg(m_rank) as Qi,
avg(u_bias) as Bu,
avg(m_bias) as Bi
from (
select
train_mf_sgd(userid, movieid, rating, "-factor ${factor} -mu ${mu} -iter ${iters}") as (idx, u_rank, m_rank, u_bias, m_bias)
from
training
) t
group by idx;
hive> desc sgd_model;
idx int
pu array<float>
qi array<float>
bu double
bi double
hive> desc sgd_model;
idx int
pu array<float>
qi array<float>
bu double
bi double
select
mae(predicted, actual) as mae,
rmse(predicted, actual) as rmse
from (
select
t2.actual,
mf_predict(t2.Pu, p2.Qi, t2.Bu, p2.Bi, ${mu}) as predicted
from (
select
t1.userid,
t1.movieid,
t1.rating as actual,
p1.Pu,
p1.Bu
from
testing t1 LEFT OUTER JOIN sgd_model p1
ON (t1.userid = p1.idx)
) t2
LEFT OUTER JOIN sgd_model p2
ON (t2.movieid = p2.idx)
) t;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment