Skip to content

Instantly share code, notes, and snippets.

@diana-shealy
Created April 7, 2016 21:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save diana-shealy/16cc759fe13d0ee0ca800ce3d3488986 to your computer and use it in GitHub Desktop.
Save diana-shealy/16cc759fe13d0ee0ca800ce3d3488986 to your computer and use it in GitHub Desktop.
Create the Training Feature Vector
WITH train_ordered as (
select * from training2
order by rowid asc
),
train_quantified as (
select
t0.rowid,
t0.sales,
t2.*,
t0.competitiondistance
from
train_ordered t0
-- indexing non-number columns
LATERAL VIEW quantify(true,
stateholiday, store, promo, dayofweek, schoolholiday, promo2sinceweek, competitionopensinceyear, assortment, promo2sinceyear, promointerval, promo2, storetype, competitionopensincemonth, year, month, day) t2
as stateholiday, store, promo, dayofweek, schoolholiday, promo2sinceweek, competitionopensinceyear, assortment, promo2sinceyear, promointerval, promo2, storetype, competitionopensincemonth, year, month, day
)
INSERT OVERWRITE TABLE training3
SELECT
t1.rowid,
ARRAY( -- padding zero for missing values
t1.stateholiday, t1.store, t1.promo, t1.dayofweek, t1.schoolholiday,
IF(t1.promo2sinceweek IS NULL, 0, t1.promo2sinceweek),
IF(t1.competitionopensinceyear IS NULL, 0, t1.competitionopensinceyear),
t1.assortment,
IF(t1.promo2sinceyear IS NULL, 0, t1.promo2sinceyear),
IF(t1.promointerval IS NULL, 0, t1.promointerval),
t1.promo2, t1.storetype,
IF(t1.competitionopensincemonth IS NULL, 0, t1.competitionopensincemonth),
t1.year, t1.month, t1.day,
IF(t1.competitiondistance IS NULL, 0, t1.competitiondistance)
) AS features,
LN(1 + t1.sales) AS label, -- log scale conversion
t1.sales
FROM
train_quantified t1
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment