Skip to content

Instantly share code, notes, and snippets.

@abajwa-hw
Last active January 31, 2018 01:25
Show Gist options
  • Save abajwa-hw/2e49079e5d89692b9eace82d0c25c4ab to your computer and use it in GitHub Desktop.
Save abajwa-hw/2e49079e5d89692b9eace82d0c25c4ab to your computer and use it in GitHub Desktop.
Generate TPC-DS dataset for Hive
# To run: export any variables you want to override (tpcds_size_gb, java_home), then execute the command below:
#curl -sSL https://gist.github.com/abajwa-hw/2e49079e5d89692b9eace82d0c25c4ab/raw | sudo -E sh
#sudo -u hdfs -s
#cd /home/hdfs
# Set the Java home and the size of the dataset in GB (the minimum is 2).
# Tunables: dataset size in GB and the JDK location. Either one can be
# overridden by exporting it before this script runs; otherwise the defaults
# below are used.
export tpcds_size_gb="${tpcds_size_gb:-2}"
export java_home="${java_home:-/usr/java/default}"

# Packages needed by the steps below. unzip is listed explicitly because the
# archive is unpacked with it and the zip package does not provide it.
sudo yum install -y gcc wget zip unzip

# Create /user/root in HDFS and hand it to root. -p makes the step a no-op
# when the directory already exists, so the script can be re-run.
sudo -u hdfs hdfs dfs -mkdir -p /user/root
sudo -u hdfs hdfs dfs -chown root /user/root

# Download and unpack the hive14 branch of hive-testbench.
#  * wget -O always writes to hive14.zip instead of creating hive14.zip.1 on a
#    re-run (which would leave the stale archive to be unpacked).
#  * unzip -o overwrites without prompting; a prompt would read its answer
#    from stdin, which is this script itself when it is piped into sh.
wget -O hive14.zip https://github.com/hortonworks/hive-testbench/archive/hive14.zip || {
  echo "failed to download hive14.zip" >&2
  exit 1
}
unzip -o hive14.zip || {
  echo "failed to unpack hive14.zip" >&2
  exit 1
}

# Replace every occurrence of UseG1GC with UseParallelGC in the partitioned
# load settings; the unmodified file is kept alongside with a .bak suffix.
sed -i.bak "s#UseG1GC#UseParallelGC#g" hive-testbench-hive14/settings/load-partitioned.sql

# Expose the chosen JDK to the scripts run below. The values are quoted so a
# strict POSIX sh does not word-split them.
export JAVA_HOME="${java_home}"
export PATH="$JAVA_HOME/bin:$PATH"

# The build and setup scripts are invoked relative to the test bench
# directory, so stop here if it cannot be entered.
cd hive-testbench-hive14/ || {
  echo "cannot cd to hive-testbench-hive14/" >&2
  exit 1
}

# Run the test bench's build script as root; abort if it fails because the
# setup step that follows is run from the same tree.
sudo ./tpcds-build.sh || {
  echo "tpcds-build.sh failed" >&2
  exit 1
}

# Alternative: run the setup in the background and capture its output in a log.
#nohup ./tpcds-setup.sh ${tpcds_size_gb} >generate-tpcds-${tpcds_size_gb}.log 2>&1 &
# Run the test bench's setup script, passing the requested size in GB.
./tpcds-setup.sh "${tpcds_size_gb}"
# Compute column statistics for every table of the generated database, using
# the regular HiveServer2 instance.
#
# The JDBC URL points at port 2181 on this host's fully-qualified name and
# uses ZooKeeper service discovery under the "hiveserver2" namespace.
url="jdbc:hive2://$(hostname -f):2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2"

# Tables to analyze, in the order the statements are issued.
tables="call_center catalog_page catalog_returns catalog_sales customer
customer_address customer_demographics date_dim household_demographics
income_band inventory item promotion reason ship_mode store store_returns
store_sales time_dim warehouse web_page web_returns web_sales web_site"

# Build the SQL script: select the database named after the dataset size, then
# add one ANALYZE statement per table, each on its own line.
sql="
use tpcds_bin_partitioned_orc_${tpcds_size_gb};
"
# $tables is intentionally unquoted so it splits into one word per table name.
for table in $tables; do
  sql="${sql}analyze table ${table} compute statistics for columns;
"
done

# Quote both arguments so each reaches beeline as a single word.
beeline -u "$url" -e "$sql"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment