Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Better Know a Database - Redshift Load Data Formats
#!/bin/sh
# OS: Ubuntu 14.01
# Generate data - produces about 200GB on disk, takes a while
DATADIR=tpch-dbgen
SCALE=1000
git clone https://github.com/electrum/tpch-dbgen.git
cd $DATADIR && make && ./dbgen -f -v -C 16 -S 1 -s $SCALE && cd -
# Convert CSV to JSON
awk -f lineitem-tojson.awk $DATADIR/lineitem.tbl.1 > $DATADIR/lineitem.1.json
# Convert JSON to avro
wget http://apache.mirrors.pair.com/avro/avro-1.7.7/java/avro-tools-1.7.7.jar -O avro-tools.jar
java -jar avro-tools.jar fromjson --schema-file lineitem.avsc $DATADIR/lineitem.1.json > $DATADIR/lineitem.1.avro
echo "CSV, JSON and avro files created locally. Use \`aws s3 cp ...\` to upload them to s3"
#!/bin/bash
# OS: Ubuntu 14.01
# Requires: aws jq and psql in PATH
CLUSTER_NAME=$1
CLUSTER_PASS=$2
SUBNET_GROUP_NAME=$3
VPC_SEC_GROUP_ID=$4
FORMAT=$5 # "AVRO 'auto'", "JSON 'auto'", or "delimiter '|'"
S3_FILENAME=$6
AWS_KEY="FAKE"
AWS_SECRET="FAKE"
RESPONSE=$(aws redshift create-cluster \
--db-name benchmark \
--cluster-type single-node \
--node-type dc1.large \
--master-username root \
--master-user-password $CLUSTER_PASS \
--cluster-subnet-group-name $SUBNET_GROUP_NAME \
--vpc-security-group-ids $VPC_SEC_GROUP_ID \
--publicly-accessible \
--cluster-identifier $CLUSTER_NAME)
STATUS='creating'
while [ "$STATUS" == 'creating' ]
do
sleep 3
STATUS=$(aws redshift describe-clusters --cluster-identifier $CLUSTER_NAME | jq -r '.Clusters | .[0] | .ClusterStatus')
done
DB_HOST=$(aws redshift describe-clusters --cluster-identifier $CLUSTER_NAME | jq -r '.Clusters | .[0] | .Endpoint | .Address')
export PGPASSWORD=$CLUSTER_PASS
psql postgresql://root@$DB_HOST:5439/benchmark -f lineitem.create.sql
echo "COPYing Data..."
time psql postgresql://root@$DB_HOST:5439/benchmark -c "copy lineitem from '$S3_FILENAME' credentials 'aws_access_key_id=$AWS_KEY;aws_secret_access_key=$AWS_SECRET' FORMAT AS $FORMAT;"
echo "COPY Finished"
BEGIN {FS = "|"}
{print "{" \
"\"l_orderkey\": " $1 ", " \
"\"l_partkey\": " $2 ", " \
"\"l_suppkey\": " $3 ", " \
"\"l_linenumber\": " $4 ", " \
"\"l_quantity\": " $5 ", " \
"\"l_extendedprice\": " $6 ", " \
"\"l_discount\": " $7 ", " \
"\"l_tax\": " $8 ", " \
"\"l_returnflag\": \"" $9 "\", " \
"\"l_linestatus\": \"" $10 "\", " \
"\"l_shipdate\": \"" $11 "\", " \
"\"l_commitdate\": \"" $12 "\", " \
"\"l_receiptdate\": \"" $13 "\", " \
"\"l_shipinstruct\": \"" $14 "\", " \
"\"l_shipmode\": \"" $15 "\", " \
"\"l_comment\": \"" $16 "\"" \
"}"
}
{"namespace": "rjm.db-benchmark",
"type": "record",
"name": "lineitem",
"fields": [
{"name": "l_orderkey", "type": "int"},
{"name": "l_partkey", "type": "int"},
{"name": "l_suppkey", "type": "int"},
{"name": "l_linenumber", "type": "int"},
{"name": "l_quantity", "type": "float"},
{"name": "l_extendedprice", "type": "float"},
{"name": "l_discount", "type": "float"},
{"name": "l_tax", "type": "float"},
{"name": "l_returnflag", "type": "string"},
{"name": "l_linestatus", "type": "string"},
{"name": "l_shipdate", "type": "string"},
{"name": "l_commitdate", "type": "string"},
{"name": "l_receiptdate", "type": "string"},
{"name": "l_shipinstruct", "type": "string"},
{"name": "l_shipmode", "type": "string"},
{"name": "l_comment", "type": "string"}
]
}
CREATE TABLE lineitem (
l_orderkey INTEGER NOT NULL, L_PARTKEY INTEGER NOT NULL,
l_suppkey INTEGER NOT NULL,
l_linenumber INTEGER NOT NULL,
l_quantity DECIMAL(15,2) NOT NULL,
l_extendedprice DECIMAL(15,2) NOT NULL,
l_discount DECIMAL(15,2) NOT NULL,
l_tax DECIMAL(15,2) NOT NULL,
l_returnflag CHAR(1) NOT NULL,
l_linestatus CHAR(1) NOT NULL,
l_shipdate DATE NOT NULL,
l_commitdate DATE NOT NULL,
l_receiptdate DATE NOT NULL,
l_shipinstruct CHAR(25) NOT NULL,
l_shipmode CHAR(10) NOT NULL,
l_comment VARCHAR(44) NOT NULL
);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment