Skip to content

Instantly share code, notes, and snippets.

@actsasflinn
Created October 9, 2015 04:03
Show Gist options
  • Save actsasflinn/7da7e8cada227fa6f960 to your computer and use it in GitHub Desktop.
Save actsasflinn/7da7e8cada227fa6f960 to your computer and use it in GitHub Desktop.
create table cars_rowid as
select
regexp_replace(reflect('java.util.UUID','randomUUID'), '-', '') as rowid,
*
from flinn.cars;
create or replace view cars_train as
select
rowid,
mhash(concat("1:", buying)) as buying,
mhash(concat("2:", maintenance)) as maintenance,
mhash(concat("3:", doors)) as doors,
mhash(concat("4:", persons)) as persons,
mhash(concat("5:", lug_boot)) as lug_boot,
mhash(concat("6:", safety)) as safety,
rating,
-1 as bias
from cars_rowid;
create or replace view cars_testing as
select
t.rowid,
t2.label,
array(buying, maintenance, doors, persons, lug_boot, safety, bias) as features
from (
select * from cars_train
) t
left join (
select
rowid,
CAST(IF(rating != "unacc", 1.0, 0.0) AS float) AS label
from cars_rowid
) t2 on t.rowid = t2.rowid;
SELECT
CAST(feature as int) as feature,
CAST(AVG(weight) as float) as weight
FROM (
SELECT
logress(features, label,'-total_steps 5') as (feature, weight)
FROM
cars_testing
) t
GROUP BY feature
SELECT
logress(features, label,'-total_steps 5') as (feature, weight)
FROM
cars_testing
drop table testing_exploded;
create table testing_exploded as
select
rowid,
feature
from
cars_testing
LATERAL VIEW explode(features) t AS feature;
set hivevar:total_steps=5000000;
drop table lr_model;
create table lr_model
as
select
feature,
cast(avg(weight) as float) as weight
from
(select
logress(features, label, "-total_steps ${total_steps}") as (feature,weight)
from
cars_testing
) t
group by feature;
drop table lr_predict;
create table lr_predict
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
LINES TERMINATED BY "\n"
STORED AS TEXTFILE
as
select
t.rowid,
sigmoid(sum(m.weight)) as prob
from
testing_exploded t LEFT OUTER JOIN
lr_model m ON (t.feature = m.feature)
group by
t.rowid
order by
rowid ASC;
drop table results;
create table results as select cars_rowid.*, lr_predict.prob from lr_predict left join cars_rowid on lr_predict.rowid = cars_rowid.rowid;
set hivevar:xtimes=3;
create or replace view training_x3
as
select
*
from (
select
amplify(${xtimes}, *) as (rowid, label, features)
from
cars_testing
) t
CLUSTER BY rand();
create table lr_model_x3
as
select
feature,
cast(avg(weight) as float) as weight
from
(select
logress(features,label) as (feature,weight)
from
training_x3
) t
group by feature;
drop table lr_predict_x3;
create table lr_predict_x3
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
LINES TERMINATED BY "\n"
STORED AS TEXTFILE
as
select
t.rowid,
sigmoid(sum(m.weight)) as prob
from
testing_exploded t LEFT OUTER JOIN
lr_model_x3 m ON (t.feature = m.feature)
group by
t.rowid
order by
rowid ASC;
drop table results_x3;
create table results_x3 as select cars_rowid.*, lr_predict_x3.prob from lr_predict_x3 left join cars_rowid on lr_predict_x3.rowid = cars_rowid.rowid;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment