@patmcdonough
Created November 27, 2012 06:01
Generate data to test a range query alternative
create table dimension_rollup_periods
(period_id string, time_id string, begin_time timestamp, end_time timestamp)
STORED AS TEXTFILE;
/* Run the make_periods.sh script (below) to generate the period files */
alter table dimension_rollup_periods set serdeproperties ('field.delim'=',');
LOAD DATA LOCAL INPATH 'periods/2012' OVERWRITE INTO TABLE dimension_rollup_periods;
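/* make_periods.sh generates 2009-2012; the remaining years can be appended the same way,
   e.g.: LOAD DATA LOCAL INPATH 'periods/2011' INTO TABLE dimension_rollup_periods; */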
/* Create a users file (users.csv) using http://www.generatedata.com/#generator */
/* Make an events file (events.csv) with a few different event types (one per row) */
create table users (user_id string);
create table events (event_type string);
load data local inpath 'users.csv' into table users;
load data local inpath 'events.csv' into table events;
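/* For illustration only -- hypothetical sample rows, not from the original gist:
   events.csv holds one event type per row (e.g. click, pageview, share);
   users.csv holds one generated user_id per row (e.g. 8a2f01, 8a2f02, ...). */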
/* Make some random timestamps; run this several times, since it becomes the driver for all the test data */
/* FYI, workarounds for HIVE-3676 may be necessary (e.g. use UNION ALL) */
/* Note that this data is evenly distributed, so use sampling when pulling from it */
create table random_timestamps as
select from_unixtime(unix_timestamp(begin_time)+floor(rand()*3599) )
as ts from dimension_rollup_periods;
insert into table random_timestamps
select from_unixtime(unix_timestamp(begin_time)+floor(rand()*3599) )
from dimension_rollup_periods;
/* workaround version due to HIVE-3676: union the new rows with the existing
   contents and overwrite, instead of INSERT INTO */
insert overwrite table random_timestamps
select * from
(
select from_unixtime(unix_timestamp(begin_time)+floor(rand()*3599) ) as ts
from dimension_rollup_periods
union all
select ts from random_timestamps
) unionresults;
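/* fact_event_engagements is never created in this gist. A minimal sketch, assuming
   its columns mirror fact_event_engagements_partitioned below (the STORED AS clause
   is a guess): */
create table if not exists fact_event_engagements
(user_id string, occured_at timestamp, event_type string)
STORED AS TEXTFILE;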
insert into table fact_event_engagements
select a.user_id, r.ts as occured_at, a.event_type
from random_timestamps TABLESAMPLE(BUCKET 7 OUT OF 10 ON rand()) r
join (select * from events inner join users) a;
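/* Note: both joins above are unconditioned (cross joins), so the result size is
   (sampled timestamps) x (event types) x (users). */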
/* workaround version, again self-unioning and overwriting due to HIVE-3676 */
insert overwrite table fact_event_engagements
select * from
(
select a.user_id, r.ts as occured_at, a.event_type
from random_timestamps TABLESAMPLE(BUCKET 7 OUT OF 10 ON rand()) r
join
(
select *
from events
inner join users
) a
union all
select *
from fact_event_engagements
) unionresults;
create table fact_event_engagements_partitioned (user_id string, occured_at timestamp, event_type string)
partitioned by (occured_at_date string) stored as SEQUENCEFILE;
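/* Settings for the INSERTs below: nonstrict mode lets every partition value be
   determined at runtime, the max-partition knobs raise the default limits, and
   the remaining SETs enable block-level Snappy compression of the SequenceFile
   output. */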
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=2000;
set hive.exec.max.dynamic.partitions.pernode=10000;
SET hive.exec.compress.output=true;
SET mapred.output.compression.type=BLOCK;
SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
from fact_event_engagements
insert overwrite table fact_event_engagements_partitioned partition (occured_at_date)
select user_id, occured_at, event_type, to_date(occured_at) as occured_at_date
distribute by year(occured_at), month(occured_at);
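/* row_test is also undefined above. A hedged sketch, assuming it is a row-format
   (TEXTFILE) twin of the partitioned table, used to compare storage formats: */
create table if not exists row_test
(user_id string, occured_at timestamp, event_type string)
partitioned by (occured_at_date string)
STORED AS TEXTFILE;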
from fact_event_engagements_partitioned
insert overwrite table row_test partition (occured_at_date)
select user_id, occured_at, event_type, to_date(occured_at) as occured_at_date
distribute by year(occured_at), month(occured_at);
#!/bin/bash
# make_periods.sh: generate one comma-delimited file of hourly periods per month
function make_files {
  for m in {01..12}; do
    for d in {01..28}; do
      for h in {00..23}; do
        PERIOD_ID=$1-$m-$d
        TIME_ID=$h
        BEGIN_TIME="$1-$m-$d $h:00:00"
        END_TIME="$1-$m-$d $h:59:59"
        # UTC_BEGIN_TIME=$(date --date="$1-$m-$d $h:00:00" +%s);
        # UTC_END_TIME=$(date --date="$1-$m-$d $h:59:59" +%s);
        # no spaces after the commas, to match the table's field.delim=','
        echo "$PERIOD_ID,$TIME_ID,$BEGIN_TIME,$END_TIME" >> periods/$1/$m.dat
      done
    done
  done
}
mkdir -p periods
for y in {2009..2012}; do
  mkdir -p periods/$y
  make_files $y &
done
wait # let the background jobs finish before loading the files into Hive
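/* Test query: the range-join approach. Each event is matched to its hourly period
   via the begin_time/end_time range predicates; MAPJOIN keeps the small period
   dimension in memory, and TABLESAMPLE trims the fact table. */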
SELECT /*+ MAPJOIN(dimension_rollup_periods) */ drp1.period_id, drp1.time_id, COUNT(DISTINCT engagements.user_id) AS engaged_users
FROM dimension_rollup_periods drp1
JOIN fact_event_engagements TABLESAMPLE(BUCKET 3 OUT OF 10 ON rand()) engagements
WHERE drp1.period_id = '2012-09-11'
AND engagements.occured_at >= drp1.begin_time
AND engagements.occured_at < drp1.end_time
AND drp1.time_id >= 03
AND drp1.time_id <= 13
GROUP BY drp1.period_id, drp1.time_id;
/* We can accomplish the same by simply grouping by partitioned dates */
select to_date(engagements.occured_at) as period_id,
hour(engagements.occured_at) as time_id,
COUNT(DISTINCT engagements.user_id) AS engaged_users
from fact_event_engagements engagements
WHERE to_date(engagements.occured_at) = '2012-09-11'
AND hour(engagements.occured_at) >= 03
AND hour(engagements.occured_at) <= 13
GROUP BY to_date(engagements.occured_at),
hour(engagements.occured_at)
order by period_id, time_id
LIMIT 1000000;