Skip to content

Instantly share code, notes, and snippets.

@myui
Last active June 29, 2016 13:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save myui/8123042 to your computer and use it in GitHub Desktop.
Save myui/8123042 to your computer and use it in GitHub Desktop.
Hive/Hivemallを利用した広告クリックスルー率(CTR)の推定 ref: http://qiita.com/myui/items/f726ca3dcc48410abe45
-- Defines the training2 view: per-row click / no-click counts plus hashed feature columns.
-- NOTE: the `...` lines are elisions in this excerpt, not valid HiveQL.
create or replace view training2 as
select
rowid,
clicks,
-- impressions that did not result in a click
(impression - clicks) as noclick,
-- each column is prefixed with its own "N_" tag before hashing, so equal raw
-- values in different columns are hashed from different strings
-- (mhash is presumably Hivemall's feature-hashing UDF -- TODO confirm)
mhash(concat("1_", displayurl)) as displayurl,
mhash(concat("2_", adid)) as adid,
...
-- constant -1 emitted as the bias column (presumably the bias feature id -- TODO confirm)
-1 as bias
from (
...
-- Pipes every row of training2 through the external script kddconv.awk and
-- overwrites training_rcfile with the script output, read back as
-- (rowid, label, features).
-- NOTE: the `ROW FORMAT DELIMITED ..` clauses are abbreviated in this excerpt.
INSERT OVERWRITE TABLE training_rcfile
select transform(*)
ROW FORMAT DELIMITED ..
-- rows are streamed to gawk running kddconv.awk
using 'gawk -f kddconv.awk'
as (rowid BIGINT, label FLOAT, features ARRAY<INT>)
ROW FORMAT DELIMITED ..
from training2
-- distributes/sorts on three seeded rand() buckets in 0..99;
-- presumably done to shuffle the training rows randomly -- TODO confirm
CLUSTER BY CAST(rand(47) * 100 as INT), CAST(rand(49) * 100 as INT), CAST(rand(50) * 100 as INT);
# kddconv.awk
# Expands one aggregated input record into one output line per observation.
#
# Input fields : $1 = rowid, $2 = positive count, $3 = negative count,
#                $4..$NF = feature values
# Output lines : rowid <TAB> label <TAB> comma-joined features
#                label 1.0 is printed `positive` times, label 0.0 `negative` times.
#
# The statements live inside a main action block because awk only accepts
# control statements such as `for` inside a `{ ... }` action; a bare statement
# list at top level is not a runnable awk program.
{
    rowid = $1;
    positive = $2;
    negative = $3;

    # Join fields 4..NF into a single comma-separated feature list.
    features = $4;
    for (i = 5; i <= NF; i++) {
        features = features "," $i;
    }

    # One positive example per counted click.
    for (i = 0; i < positive; i++) {
        print rowid "\t1.0\t" features
    }

    # One negative example per counted non-click.
    for (i = 0; i < negative; i++) {
        print rowid "\t0.0\t" features
    }
}
-- Value substituted into the "-total_steps" option passed to logress below.
set hivevar:total_steps=5000000;

-- Builds lr_model from training_rcfile: the inner query emits (feature, weight)
-- pairs produced by logress, and the outer query averages the weights per feature.
-- The statement is terminated with ';' so it does not run into the next
-- statement of the script.
create table lr_model
as
select
    feature,
    avg(weight) as weight -- bagging
from
    (select
        logress(features,label, "-total_steps ${total_steps}") as (feature,weight)
    from
        training_rcfile
    ) t -- multiple map-only weak learners are executed
group by feature; -- rows are shuffled to reducers by feature value
-- Flattens testing2 into one row per (rowid, feature) by exploding the
-- features collection, so each feature can be joined individually.
-- The statement is terminated with ';' so it does not run into the next
-- statement of the script.
create table testing_exploded as
select
    rowid,
    feature
from
    testing2
    LATERAL VIEW explode(features) t AS feature;
-- Computes a click probability per test row: sums the weights of the row's
-- features and applies sigmoid to the total.
-- Weights are read from lr_model, the table created by the training statement
-- in this script (no table named `model` is created anywhere in it).
-- The LEFT OUTER JOIN keeps features that have no weight in lr_model; their
-- NULL weight is skipped by sum().
SELECT
    t.rowid,
    sigmoid(sum(m.weight)) as prob
FROM
    testing_exploded t
    LEFT OUTER JOIN lr_model m
        ON (t.feature = m.feature)
GROUP BY
    t.rowid;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment