Last active
January 31, 2018 01:25
-
-
Save abajwa-hw/2e49079e5d89692b9eace82d0c25c4ab to your computer and use it in GitHub Desktop.
Generate TPC-DS dataset for Hive
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Generate a TPC-DS dataset for Hive with hive-testbench, then compute column
# statistics for every generated table.
#
# To run - export any variables then execute below:
#   curl -sSL https://gist.github.com/abajwa-hw/2e49079e5d89692b9eace82d0c25c4ab/raw | sudo -E sh
#
# Optional environment variables:
#   tpcds_size_gb  size of the dataset in GB (default 2, minimum 2)
#   java_home      JDK location exported as JAVA_HOME (default /usr/java/default)
#
# Written for plain POSIX sh because the documented invocation pipes it to `sh`.

#sudo -u hdfs -s
#cd /home/hdfs

# Print a message to stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#set java home and size of dataset (in GB). Min is 2
export tpcds_size_gb="${tpcds_size_gb:-2}"
export java_home="${java_home:-/usr/java/default}"

# Validate the size up front: it is interpolated into a database name below,
# and a bad value should fail before anything is downloaded or built.
case "${tpcds_size_gb}" in
  '' | *[!0-9]*) die "tpcds_size_gb must be a whole number of GB (got '${tpcds_size_gb}')" ;;
esac
[ "${tpcds_size_gb}" -ge 2 ] || die "tpcds_size_gb must be at least 2 (got '${tpcds_size_gb}')"

sudo yum install -y gcc wget zip

# Create an HDFS home directory for root; -p keeps a re-run from failing when
# the directory already exists.
sudo -u hdfs hdfs dfs -mkdir -p /user/root
sudo -u hdfs hdfs dfs -chown root /user/root

# -O pins the output name so a re-run does not leave hive14.zip.1 behind;
# unzip -o overwrites without prompting, since a prompt would read from stdin
# and stdin is the script itself when it is piped into sh.
wget -O hive14.zip https://github.com/hortonworks/hive-testbench/archive/hive14.zip \
  || die "failed to download hive-testbench"
unzip -o hive14.zip || die "failed to unpack hive14.zip"

# Replace the G1 collector flag with the parallel collector in the load
# settings (the untouched file is kept alongside with a .bak suffix).
sed -i.bak 's#UseG1GC#UseParallelGC#g' hive-testbench-hive14/settings/load-partitioned.sql

export JAVA_HOME="${java_home}"
export PATH="$JAVA_HOME/bin:$PATH"

cd hive-testbench-hive14/ || die "cannot cd into hive-testbench-hive14/"
sudo ./tpcds-build.sh || die "tpcds-build.sh failed"

#nohup ./tpcds-setup.sh ${tpcds_size_gb} >generate-tpcds-${tpcds_size_gb}.log 2>&1 &
./tpcds-setup.sh "${tpcds_size_gb}" || die "tpcds-setup.sh failed"

#run statistics using regular HS2
# The JDBC URL points at port 2181 on this host and uses ZooKeeper service
# discovery with the hiveserver2 namespace.
url="jdbc:hive2://$(hostname -f):2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2"

# Build one script: select the generated database, then one ANALYZE statement
# per table, in the same order as the table list below.
analyze_sql="
use tpcds_bin_partitioned_orc_${tpcds_size_gb};"
for table in \
  call_center \
  catalog_page \
  catalog_returns \
  catalog_sales \
  customer \
  customer_address \
  customer_demographics \
  date_dim \
  household_demographics \
  income_band \
  inventory \
  item \
  promotion \
  reason \
  ship_mode \
  store \
  store_returns \
  store_sales \
  time_dim \
  warehouse \
  web_page \
  web_returns \
  web_sales \
  web_site; do
  analyze_sql="${analyze_sql}
analyze table ${table} compute statistics for columns;"
done

beeline -u "$url" -e "${analyze_sql}
"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment