Skip to content

Instantly share code, notes, and snippets.

@cmerrick
Last active February 18, 2021 07:11
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cmerrick/81c2a8b245df4723d031 to your computer and use it in GitHub Desktop.
Save cmerrick/81c2a8b245df4723d031 to your computer and use it in GitHub Desktop.
Better Know a Database - Redshift Load Compression
#!/bin/bash
# OS: Ubuntu 14.01
# Generate data - produces about 100GB on disk, takes a while
DBGEN_DIR=tpch-dbgen
SCALES=(1 10 100)
ORIGIN=`pwd`
S3_DIR="s3://your-bucket-here"
TABLES=(customer lineitem nation orders part partsupp region supplier)
for SCALE in "${SCALES[@]}"
do
cd $ORIGIN
mkdir -p scale-$SCALE && cd scale-$SCALE
git clone https://github.com/electrum/tpch-dbgen.git
cd $DBGEN_DIR && make && ./dbgen -f -v -s $SCALE
chmod 644 *.tbl*
lzop -f *.tbl
gzip -kf *.tbl
bzip2 -kf *.tbl
for TABLE in ${TABLES[@]};
do
mv ${TABLE}.tbl.gz gz-${TABLE}.tbl.gz
mv ${TABLE}.tbl.lzo lzo-${TABLE}.tbl.lzo
mv ${TABLE}.tbl.bz2 bz2-${TABLE}.tbl.bz2
done
aws s3 cp . "${S3_DIR}/scale-$SCALE-csv" --recursive --exclude "*" --include "*.tbl" --include "*.tbl*"
done
#!/bin/bash
# OS: Ubuntu 14.01
# Requires: aws, jq and psql in PATH
CLUSTER_NAME=$1
CLUSTER_PASS=$2
SUBNET_GROUP_NAME=$3
VPC_SEC_GROUP_ID=$4
S3_DIR=$5
COMPRESSION=$6
if [ $COMPRESSION = "gz" ]
then
echo "Using 'gz' compression"
FILE_PREFIX="gz-"
REDSHIFT_FORMAT="gzip"
elif [ $COMPRESSION = "lzo" ]
then
echo "Using 'lzo' compression"
FILE_PREFIX="lzo-"
REDSHIFT_FORMAT="lzop"
elif [ $COMPRESSION = "bz2" ]
then
echo "Using 'bz2' compression"
FILE_PREFIX="bz2-"
REDSHIFT_FORMAT="bzip2"
else
echo "Compression not 'gz', 'bz2' or 'lzo', defaulting to no compression"
FILE_PREFIX=""
REDSHIFT_FORMAT=""
fi
AWS_KEY="FAKE"
AWS_SECRET="FAKE"
RESPONSE=$(aws redshift create-cluster \
--db-name benchmark \
--cluster-type single-node \
--node-type dc1.large \
--master-username root \
--master-user-password $CLUSTER_PASS \
--cluster-subnet-group-name $SUBNET_GROUP_NAME \
--vpc-security-group-ids $VPC_SEC_GROUP_ID \
--publicly-accessible \
--cluster-identifier $CLUSTER_NAME)
STATUS='creating'
while [ "$STATUS" == 'creating' ]
do
sleep 10
STATUS=$(aws redshift describe-clusters --cluster-identifier $CLUSTER_NAME | jq -r '.Clusters | .[0] | .ClusterStatus')
done
DB_HOST=$(aws redshift describe-clusters --cluster-identifier $CLUSTER_NAME | jq -r '.Clusters | .[0] | .Endpoint | .Address')
export PGPASSWORD=$CLUSTER_PASS
psql postgresql://root@$DB_HOST:5439/benchmark -f tables.create.sql
TABLES=(nation region part supplier partsupp customer orders lineitem)
for TABLE in ${TABLES[@]}
do
echo "COPYing $TABLE"
time psql postgresql://root@$DB_HOST:5439/benchmark -c "copy ${TABLE} from '$S3_DIR/${FILE_PREFIX}${TABLE}.tbl' credentials 'aws_access_key_id=$AWS_KEY;aws_secret_access_key=$AWS_SECRET' delimiter '|' $REDSHIFT_FORMAT"
done
echo "COPY Finished"
psql postgresql://root@$DB_HOST:5439/benchmark -A -F , -X -t -c "SELECT datediff(ms, starttime, endtime), substring(querytxt, 0, 24) FROM STL_QUERY WHERE querytxt LIKE 'copy % from %' LIMIT 10;"
echo ""
CREATE TABLE NATION ( N_NATIONKEY INTEGER NOT NULL,
N_NAME CHAR(25) NOT NULL,
N_REGIONKEY INTEGER NOT NULL,
N_COMMENT VARCHAR(152));
CREATE TABLE REGION ( R_REGIONKEY INTEGER NOT NULL,
R_NAME CHAR(25) NOT NULL,
R_COMMENT VARCHAR(152));
CREATE TABLE PART ( P_PARTKEY INTEGER NOT NULL,
P_NAME VARCHAR(55) NOT NULL,
P_MFGR CHAR(25) NOT NULL,
P_BRAND CHAR(10) NOT NULL,
P_TYPE VARCHAR(25) NOT NULL,
P_SIZE INTEGER NOT NULL,
P_CONTAINER CHAR(10) NOT NULL,
P_RETAILPRICE DECIMAL(15,2) NOT NULL,
P_COMMENT VARCHAR(23) NOT NULL );
CREATE TABLE SUPPLIER ( S_SUPPKEY INTEGER NOT NULL,
S_NAME CHAR(25) NOT NULL,
S_ADDRESS VARCHAR(40) NOT NULL,
S_NATIONKEY INTEGER NOT NULL,
S_PHONE CHAR(15) NOT NULL,
S_ACCTBAL DECIMAL(15,2) NOT NULL,
S_COMMENT VARCHAR(101) NOT NULL);
CREATE TABLE PARTSUPP ( PS_PARTKEY INTEGER NOT NULL,
PS_SUPPKEY INTEGER NOT NULL,
PS_AVAILQTY INTEGER NOT NULL,
PS_SUPPLYCOST DECIMAL(15,2) NOT NULL,
PS_COMMENT VARCHAR(199) NOT NULL );
CREATE TABLE CUSTOMER ( C_CUSTKEY INTEGER NOT NULL,
C_NAME VARCHAR(25) NOT NULL,
C_ADDRESS VARCHAR(40) NOT NULL,
C_NATIONKEY INTEGER NOT NULL,
C_PHONE CHAR(15) NOT NULL,
C_ACCTBAL DECIMAL(15,2) NOT NULL,
C_MKTSEGMENT CHAR(10) NOT NULL,
C_COMMENT VARCHAR(117) NOT NULL);
CREATE TABLE ORDERS ( O_ORDERKEY INTEGER NOT NULL,
O_CUSTKEY INTEGER NOT NULL,
O_ORDERSTATUS CHAR(1) NOT NULL,
O_TOTALPRICE DECIMAL(15,2) NOT NULL,
O_ORDERDATE DATE NOT NULL,
O_ORDERPRIORITY CHAR(15) NOT NULL,
O_CLERK CHAR(15) NOT NULL,
O_SHIPPRIORITY INTEGER NOT NULL,
O_COMMENT VARCHAR(79) NOT NULL);
CREATE TABLE LINEITEM ( L_ORDERKEY INTEGER NOT NULL,
L_PARTKEY INTEGER NOT NULL,
L_SUPPKEY INTEGER NOT NULL,
L_LINENUMBER INTEGER NOT NULL,
L_QUANTITY DECIMAL(15,2) NOT NULL,
L_EXTENDEDPRICE DECIMAL(15,2) NOT NULL,
L_DISCOUNT DECIMAL(15,2) NOT NULL,
L_TAX DECIMAL(15,2) NOT NULL,
L_RETURNFLAG CHAR(1) NOT NULL,
L_LINESTATUS CHAR(1) NOT NULL,
L_SHIPDATE DATE NOT NULL,
L_COMMITDATE DATE NOT NULL,
L_RECEIPTDATE DATE NOT NULL,
L_SHIPINSTRUCT CHAR(25) NOT NULL,
L_SHIPMODE CHAR(10) NOT NULL,
L_COMMENT VARCHAR(44) NOT NULL);
@stabgan
Copy link

stabgan commented Feb 18, 2021

Interesting

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment