@patmcdonough
Created November 27, 2012 06:01
Generate data to test a range query alternative
create table dimension_rollup_periods
(period_id string, time_id string, begin_time timestamp, end_time timestamp)
STORED AS TEXTFILE;
/* Run the make_periods.sh script (below) to generate the period files */
alter table dimension_rollup_periods set serdeproperties ('field.delim'=',');
LOAD DATA LOCAL INPATH 'periods/2012' OVERWRITE INTO TABLE dimension_rollup_periods;
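/* make_periods.sh generates 2009-2012; the remaining years can be appended the same way,
   e.g.: LOAD DATA LOCAL INPATH 'periods/2011' INTO TABLE dimension_rollup_periods; */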
/* Create a users file (users.csv) using http://www.generatedata.com/#generator */
/* Make an events file (events.csv) with a few different event types (one per row) */
create table users (user_id string);
create table events (event_type string);
load data local inpath 'users.csv' into table users;
load data local inpath 'events.csv' into table events;
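/* For illustration only -- hypothetical sample rows, not from the original gist:
   events.csv holds one event type per row (e.g. click, pageview, share);
   users.csv holds one generated user_id per row (e.g. 8a2f01, 8a2f02, ...). */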
/* Make some random timestamps; run this several times, since it becomes the driver for all the test data */
/* FYI, workarounds for HIVE-3676 may be necessary (e.g. use UNION ALL) */
/* Note that this data is evenly distributed, so use sampling when pulling from it */
create table random_timestamps as
select from_unixtime(unix_timestamp(begin_time)+floor(rand()*3599) )
as ts from dimension_rollup_periods;
insert into table random_timestamps
select from_unixtime(unix_timestamp(begin_time)+floor(rand()*3599) )
from dimension_rollup_periods;
/* workaround version due to HIVE-3676: union the new rows with the existing
   contents and overwrite, instead of INSERT INTO */
insert overwrite table random_timestamps
select * from
(
select from_unixtime(unix_timestamp(begin_time)+floor(rand()*3599) ) as ts
from dimension_rollup_periods
union all
select ts from random_timestamps
) unionresults;
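/* fact_event_engagements is never created in this gist. A minimal sketch, assuming
   its columns mirror fact_event_engagements_partitioned below (the STORED AS clause
   is a guess): */
create table if not exists fact_event_engagements
(user_id string, occured_at timestamp, event_type string)
STORED AS TEXTFILE;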
insert into table fact_event_engagements
select a.user_id, r.ts as occured_at, a.event_type
from random_timestamps TABLESAMPLE(BUCKET 7 OUT OF 10 ON rand()) r
join (select * from events inner join users) a;
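/* Note: both joins above are unconditioned (cross joins), so the result size is
   (sampled timestamps) x (event types) x (users). */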
/* workaround version, again self-unioning and overwriting due to HIVE-3676 */
insert overwrite table fact_event_engagements
select * from
(
select a.user_id, r.ts as occured_at, a.event_type
from random_timestamps TABLESAMPLE(BUCKET 7 OUT OF 10 ON rand()) r
join
(
select *
from events
inner join users
) a
union all
select *
from fact_event_engagements
) unionresults;
create table fact_event_engagements_partitioned (user_id string, occured_at timestamp, event_type string)
partitioned by (occured_at_date string) stored as SEQUENCEFILE;
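/* Settings for the INSERTs below: nonstrict mode lets every partition value be
   determined at runtime, the max-partition knobs raise the default limits, and
   the remaining SETs enable block-level Snappy compression of the SequenceFile
   output. */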
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=2000;
set hive.exec.max.dynamic.partitions.pernode=10000;
SET hive.exec.compress.output=true;
SET mapred.output.compression.type=BLOCK;
SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
from fact_event_engagements
insert overwrite table fact_event_engagements_partitioned partition (occured_at_date)
select user_id, occured_at, event_type, to_date(occured_at) as occured_at_date
distribute by year(occured_at), month(occured_at);
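/* row_test is also undefined above. A hedged sketch, assuming it is a row-format
   (TEXTFILE) twin of the partitioned table, used to compare storage formats: */
create table if not exists row_test
(user_id string, occured_at timestamp, event_type string)
partitioned by (occured_at_date string)
STORED AS TEXTFILE;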
from fact_event_engagements_partitioned
insert overwrite table row_test partition (occured_at_date)
select user_id, occured_at, event_type, to_date(occured_at) as occured_at_date
distribute by year(occured_at), month(occured_at);
#!/bin/bash
# make_periods.sh: generate one comma-delimited file of hourly periods per month
function make_files {
  for m in {01..12}; do
    for d in {01..28}; do
      for h in {00..23}; do
        PERIOD_ID=$1-$m-$d
        TIME_ID=$h
        BEGIN_TIME="$1-$m-$d $h:00:00"
        END_TIME="$1-$m-$d $h:59:59"
        # UTC_BEGIN_TIME=$(date --date="$1-$m-$d $h:00:00" +%s);
        # UTC_END_TIME=$(date --date="$1-$m-$d $h:59:59" +%s);
        # no spaces after the commas, to match the table's field.delim=','
        echo "$PERIOD_ID,$TIME_ID,$BEGIN_TIME,$END_TIME" >> periods/$1/$m.dat
      done
    done
  done
}
mkdir -p periods
for y in {2009..2012}; do
  mkdir -p periods/$y
  make_files $y &
done
wait # let the background jobs finish before loading the files into Hive
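/* Test query: the range-join approach. Each event is matched to its hourly period
   via the begin_time/end_time range predicates; MAPJOIN keeps the small period
   dimension in memory, and TABLESAMPLE trims the fact table. */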
SELECT /*+ MAPJOIN(dimension_rollup_periods) */ drp1.period_id, drp1.time_id, COUNT(DISTINCT engagements.user_id) AS engaged_users
FROM dimension_rollup_periods drp1
JOIN fact_event_engagements TABLESAMPLE(BUCKET 3 OUT OF 10 ON rand()) engagements
WHERE drp1.period_id = '2012-09-11'
AND engagements.occured_at >= drp1.begin_time
AND engagements.occured_at < drp1.end_time
AND drp1.time_id >= 03
AND drp1.time_id <= 13
GROUP BY drp1.period_id, drp1.time_id;
/* We can accomplish the same by simply grouping by partitioned dates */
select to_date(engagements.occured_at) as period_id,
hour(engagements.occured_at) as time_id,
COUNT(DISTINCT engagements.user_id) AS engaged_users
from fact_event_engagements engagements
WHERE to_date(engagements.occured_at) = '2012-09-11'
AND hour(engagements.occured_at) >= 03
AND hour(engagements.occured_at) <= 13
GROUP BY to_date(engagements.occured_at),
hour(engagements.occured_at)
order by period_id, time_id
LIMIT 1000000;