Skip to content

Instantly share code, notes, and snippets.

@puyokw
Created March 10, 2016 07:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save puyokw/4c61e8db419aaa72a81c to your computer and use it in GitHub Desktop.
Save puyokw/4c61e8db419aaa72a81c to your computer and use it in GitHub Desktop.
td_intern criteo
INSERT OVERWRITE TABLE fm_test
select
t2.rowid,
concat_array(
array_remove(array(
if(t2.c1 is null, null, cast( mhash(concat("c1:", t2.c1)) as string) ),
if(t2.c2 is null, null, cast( mhash(concat("c2:", t2.c2)) as string) ),
if(t2.c3 is null, null, cast( mhash(concat("c3:", t2.c3)) as string) ),
if(t2.c4 is null, null, cast( mhash(concat("c4:", t2.c4)) as string) ),
if(t2.c5 is null, null, cast( mhash(concat("c5:", t2.c5)) as string) ),
if(t2.c6 is null, null, cast( mhash(concat("c6:", t2.c6)) as string) ),
if(t2.c7 is null, null, cast( mhash(concat("c7:", t2.c7)) as string) ),
if(t2.c8 is null, null, cast( mhash(concat("c8:", t2.c8)) as string) ),
if(t2.c9 is null, null, cast( mhash(concat("c9:", t2.c9)) as string) ),
if(t2.c10 is null, null, cast( mhash(concat("c10:", t2.c10)) as string) ),
if(t2.c11 is null, null, cast( mhash(concat("c11:", t2.c11)) as string) ),
if(t2.c12 is null, null, cast( mhash(concat("c12:", t2.c12)) as string) ),
if(t2.c13 is null, null, cast( mhash(concat("c13:", t2.c13)) as string) ),
if(t2.c14 is null, null, cast( mhash(concat("c14:", t2.c14)) as string) ),
if(t2.c15 is null, null, cast( mhash(concat("c15:", t2.c15)) as string) ),
if(t2.c16 is null, null, cast( mhash(concat("c16:", t2.c16)) as string) ),
if(t2.c17 is null, null, cast( mhash(concat("c17:", t2.c17)) as string) ),
if(t2.c18 is null, null, cast( mhash(concat("c18:", t2.c18)) as string) ),
if(t2.c19 is null, null, cast( mhash(concat("c19:", t2.c19)) as string) ),
if(t2.c20 is null, null, cast( mhash(concat("c20:", t2.c20)) as string) ),
if(t2.c21 is null, null, cast( mhash(concat("c21:", t2.c21)) as string) ),
if(t2.c22 is null, null, cast( mhash(concat("c22:", t2.c22)) as string) ),
if(t2.c23 is null, null, cast( mhash(concat("c23:", t2.c23)) as string) ),
if(t2.c24 is null, null, cast( mhash(concat("c24:", t2.c24)) as string) ),
if(t2.c25 is null, null, cast( mhash(concat("c25:", t2.c25)) as string) ),
if(t2.c26 is null, null, cast( mhash(concat("c26:", t2.c26)) as string) )
), null),
t1.features
) as features
from
test_ordered t2
LEFT OUTER JOIN test_quantative t1 ON (t2.rowid=t1.rowid);
INSERT OVERWRITE TABLE fm_train
select
t2.rowid,
concat_array(
array_remove(array(
if(t2.c1 is null, null, cast( mhash(concat("c1:", t2.c1)) as string) ),
if(t2.c2 is null, null, cast( mhash(concat("c2:", t2.c2)) as string) ),
if(t2.c3 is null, null, cast( mhash(concat("c3:", t2.c3)) as string) ),
if(t2.c4 is null, null, cast( mhash(concat("c4:", t2.c4)) as string) ),
if(t2.c5 is null, null, cast( mhash(concat("c5:", t2.c5)) as string) ),
if(t2.c6 is null, null, cast( mhash(concat("c6:", t2.c6)) as string) ),
if(t2.c7 is null, null, cast( mhash(concat("c7:", t2.c7)) as string) ),
if(t2.c8 is null, null, cast( mhash(concat("c8:", t2.c8)) as string) ),
if(t2.c9 is null, null, cast( mhash(concat("c9:", t2.c9)) as string) ),
if(t2.c10 is null, null, cast( mhash(concat("c10:", t2.c10)) as string) ),
if(t2.c11 is null, null, cast( mhash(concat("c11:", t2.c11)) as string) ),
if(t2.c12 is null, null, cast( mhash(concat("c12:", t2.c12)) as string) ),
if(t2.c13 is null, null, cast( mhash(concat("c13:", t2.c13)) as string) ),
if(t2.c14 is null, null, cast( mhash(concat("c14:", t2.c14)) as string) ),
if(t2.c15 is null, null, cast( mhash(concat("c15:", t2.c15)) as string) ),
if(t2.c16 is null, null, cast( mhash(concat("c16:", t2.c16)) as string) ),
if(t2.c17 is null, null, cast( mhash(concat("c17:", t2.c17)) as string) ),
if(t2.c18 is null, null, cast( mhash(concat("c18:", t2.c18)) as string) ),
if(t2.c19 is null, null, cast( mhash(concat("c19:", t2.c19)) as string) ),
if(t2.c20 is null, null, cast( mhash(concat("c20:", t2.c20)) as string) ),
if(t2.c21 is null, null, cast( mhash(concat("c21:", t2.c21)) as string) ),
if(t2.c22 is null, null, cast( mhash(concat("c22:", t2.c22)) as string) ),
if(t2.c23 is null, null, cast( mhash(concat("c23:", t2.c23)) as string) ),
if(t2.c24 is null, null, cast( mhash(concat("c24:", t2.c24)) as string) ),
if(t2.c25 is null, null, cast( mhash(concat("c25:", t2.c25)) as string) ),
if(t2.c26 is null, null, cast( mhash(concat("c26:", t2.c26)) as string) )
), null),
t1.features
) as features,
t2.label
from
train_ordered t2
LEFT OUTER JOIN train_quantative t1 ON (t2.rowid=t1.rowid);
WITH test_nonzero as (
select
t2.rowid as rowid,
IF( t2.l1 IS NULL, 0, t2.l1) as l1,
IF( t2.l2 IS NULL, 0, t2.l2) as l2,
IF( t2.l3 IS NULL, 0, t2.l3) as l3,
IF( t2.l4 IS NULL, 0, t2.l4) as l4,
IF( t2.l5 IS NULL, 0, t2.l5) as l5,
IF( t2.l6 IS NULL, 0, t2.l6) as l6,
IF( t2.l7 IS NULL, 0, t2.l7) as l7,
IF( t2.l8 IS NULL, 0, t2.l8) as l8,
IF( t2.l9 IS NULL, 0, t2.l9) as l9,
IF( t2.l10 IS NULL, 0, t2.l10) as l10,
IF( t2.l11 IS NULL, 0, t2.l11) as l11,
IF( t2.l12 IS NULL, 0, t2.l12) as l12,
IF( t2.l13 IS NULL, 0, t2.l13) as l13
from
test_ordered t2
), quantative as (
select rowid, 16777217 + 1 as feature, l1 as value from test_nonzero
union all
select rowid, 16777217 + 2 as feature, l2 as value from test_nonzero
union all
select rowid, 16777217 + 3 as feature, l3 as value from test_nonzero
union all
select rowid, 16777217 + 4 as feature, l4 as value from test_nonzero
union all
select rowid, 16777217 + 5 as feature, l5 as value from test_nonzero
union all
select rowid, 16777217 + 6 as feature, l6 as value from test_nonzero
union all
select rowid, 16777217 + 7 as feature, l7 as value from test_nonzero
union all
select rowid, 16777217 + 8 as feature, l8 as value from test_nonzero
union all
select rowid, 16777217 + 9 as feature, l9 as value from test_nonzero
union all
select rowid, 16777217 + 10 as feature, l10 as value from test_nonzero
union all
select rowid, 16777217 + 11 as feature, l11 as value from test_nonzero
union all
select rowid, 16777217 + 12 as feature, l12 as value from test_nonzero
union all
select rowid, 16777217 + 13 as feature, l13 as value from test_nonzero
),
quantative_stats as (
select
feature,
avg(value) as mean, stddev_pop(value) as stddev,
min(value) as min, max(value) as max
from
quantative
group by
feature
),quantative_norm as (
select
t1.rowid,
collect_list(
feature(
t1.feature, rescale(t1.value, t2.min, t2.max)
)
) as features
from
quantative t1
JOIN quantative_stats t2 ON (t1.feature = t2.feature)
group by
t1.rowid
)INSERT OVERWRITE TABLE test_quantative
SELECT
rowid, features
FROM
quantative_norm t1;
WITH train_nonzero as (
select
t2.rowid as rowid,
IF( t2.l1 IS NULL, 0, t2.l1) as l1,
IF( t2.l2 IS NULL, 0, t2.l2) as l2,
IF( t2.l3 IS NULL, 0, t2.l3) as l3,
IF( t2.l4 IS NULL, 0, t2.l4) as l4,
IF( t2.l5 IS NULL, 0, t2.l5) as l5,
IF( t2.l6 IS NULL, 0, t2.l6) as l6,
IF( t2.l7 IS NULL, 0, t2.l7) as l7,
IF( t2.l8 IS NULL, 0, t2.l8) as l8,
IF( t2.l9 IS NULL, 0, t2.l9) as l9,
IF( t2.l10 IS NULL, 0, t2.l10) as l10,
IF( t2.l11 IS NULL, 0, t2.l11) as l11,
IF( t2.l12 IS NULL, 0, t2.l12) as l12,
IF( t2.l13 IS NULL, 0, t2.l13) as l13
from
train_ordered t2
), quantative as (
select rowid, 16777217 + 1 as feature, l1 as value from train_nonzero
union all
select rowid, 16777217 + 2 as feature, l2 as value from train_nonzero
union all
select rowid, 16777217 + 3 as feature, l3 as value from train_nonzero
union all
select rowid, 16777217 + 4 as feature, l4 as value from train_nonzero
union all
select rowid, 16777217 + 5 as feature, l5 as value from train_nonzero
union all
select rowid, 16777217 + 6 as feature, l6 as value from train_nonzero
union all
select rowid, 16777217 + 7 as feature, l7 as value from train_nonzero
union all
select rowid, 16777217 + 8 as feature, l8 as value from train_nonzero
union all
select rowid, 16777217 + 9 as feature, l9 as value from train_nonzero
union all
select rowid, 16777217 + 10 as feature, l10 as value from train_nonzero
union all
select rowid, 16777217 + 11 as feature, l11 as value from train_nonzero
union all
select rowid, 16777217 + 12 as feature, l12 as value from train_nonzero
union all
select rowid, 16777217 + 13 as feature, l13 as value from train_nonzero
),
quantative_stats as (
select
feature,
avg(value) as mean, stddev_pop(value) as stddev,
min(value) as min, max(value) as max
from
quantative
group by
feature
),quantative_norm as (
select
t1.rowid,
collect_list(
feature(
t1.feature, rescale(t1.value, t2.min, t2.max)
)
) as features
from
quantative t1
JOIN quantative_stats t2 ON (t1.feature = t2.feature)
group by
t1.rowid
)INSERT OVERWRITE TABLE train_quantative
SELECT
rowid, features
FROM
quantative_norm t1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment