Skip to content

Instantly share code, notes, and snippets.

@NitinKumar94
Created May 5, 2016 06:30
Show Gist options
  • Save NitinKumar94/fbca5d56caa6c150eaa4c8528a63252c to your computer and use it in GitHub Desktop.
Save NitinKumar94/fbca5d56caa6c150eaa4c8528a63252c to your computer and use it in GitHub Desktop.
Aggregation query for benchmarking
------------Query for basic aggregation for performance benchmarking---------------
set hive.tez.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
set hive.tez.container.size=${hiveconf:CONTAINER_SIZE};
set hive.tez.cpu.vcores=${hiveconf:VCORES};
--set tez.grouping.split-count=15;
use benchmarking;
--explain
create table ${hiveconf:TARGET_TABLE_NAME} as
select
b.subscriber_id as subscriber_id,
hour(b.time_stamp) as hour_of_day,
max(offnet_upload_volume) as max_offnet_upload,
max(offnet_download_volume) as max_offnet_download,
min(onnet_upload_volume) as min_onnet_upload,
min(onnet_download_volume) as min_onnet_download,
count(*) as session_count,
sum(b.upload_data_volume) as total_upload,
sum(b.download_data_volume) as total_download,
sum(b.upload_data_volume) + sum(download_data_volume) as total_data_volume,
sum(b.value) as total_value,
avg(b.upload_data_volume) as avg_upload_volume,
avg(b.download_data_volume) as avg_download_volume,
avg(b.value) as avg_value,
sum(b.download_data_volume)/sum(b.upload_data_volume) as usage_ratio,
sum(b.upload_data_volume)/(sum(b.upload_data_volume) + sum(b.download_data_volume)) * 100.0 as upload_data_percentage,
sum(b.download_data_volume)/(sum(b.upload_data_volume) + sum(b.download_data_volume)) * 100.0 as download_data_percentage,
sum(b.offnet_upload_volume) as offnet_upload_volume,
sum(b.offnet_download_volume) as offnet_download_volume,
sum(b.offnet_value) as offnet_value,
sum(b.onnet_upload_volume) as onnet_upload_volume,
sum(b.onnet_download_volume) as onnet_download_volume,
sum(b.onnet_value) as onnet_value,
sum(b.offline_upload_volume) as offline_upload_volume,
sum(b.offline_download_volume) as offline_download_volume,
sum(b.offline_value) as offline_value,
variance(b.upload_data_volume) as upload_variance,
variance(b.download_data_volume) as download_variance,
variance(b.value) as value_variance
from
(
select
subscriber_id,
upload_data_volume,
download_data_volume,
value,
time_stamp,
cell_site,
latitude,
longitude,
circle,
rate_group,
disconnect_reason,
if(rate_group = 'offnet',upload_data_volume,0) as offnet_upload_volume,
if(rate_group = 'offnet',download_data_volume,0) as offnet_download_volume,
if(rate_group = 'offnet',value,0) as offnet_value,
if(rate_group = 'onnet',upload_data_volume,0) as onnet_upload_volume,
if(rate_group = 'onnet',download_data_volume,0) as onnet_download_volume,
if(rate_group = 'onnet',value,0) as onnet_value,
if(rate_group = 'offline',upload_data_volume,0) as offline_upload_volume,
if(rate_group = 'offline',download_data_volume,0) as offline_download_volume,
if(rate_group = 'offline',value,0) as offline_value
from
${hiveconf:SOURCE_TABLE_NAME}
) b
group by
b.subscriber_id, hour(b.time_stamp);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment