@leewc
Created July 9, 2017 22:18
Scripts used to copy data from monthly DynamoDB tables into one table, performing a simple transformation of Joda DateTime strings to Unix epoch timestamps. See https://leewc.com/articles/how-to-export-between-dynamodb-amazon-emr/
-----------------------------------------------------------------------------------------------------------
-- Script to migrate data from monthly tables into one table.
-- filename: dynamoTodynamo.hive
-- author: leewc
-- Instructions:
-- Run: hive -hiveconf domain="Devo" -hiveconf region="NA" -hiveconf yyyymm=201604 -f dynamoTodynamo.hive
-- Variables passed from bash are accessible via hiveconf.
-- Variables are case-sensitive, i.e. test != Test
------------------------------------------------------------------------------------------------------------
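-- Each "-hiveconf name=value" passed on the command line is read in this script as ${hiveconf:name},
-- e.g. -hiveconf yyyymm=201604 is substituted wherever ${hiveconf:yyyymm} appears below.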
-- start with a clean slate.
DROP TABLE IF EXISTS src_table_db;
DROP TABLE IF EXISTS dst_table_db;
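-- These are EXTERNAL tables, so dropping them removes only the Hive metadata;
-- the underlying DynamoDB tables and their data are untouched.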
-- Set the time to live (the offset added to the creation time to produce the expiration timestamp) in seconds; one year in this example.
SET hivevar:ttl=31536000;
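-- 365 days * 24 hours * 60 minutes * 60 seconds = 31,536,000 seconds.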
-- Set read and write throughput consumption as a fraction of each table's provisioned capacity.
SET dynamodb.throughput.write.percent=1.0;
SET dynamodb.throughput.read.percent=0.5;
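-- e.g. read.percent=0.5 lets the job consume up to half of the source table's provisioned read
-- capacity; the EMR DynamoDB connector documents a valid range of 0.1 to 1.5 for both settings.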
!echo "Read Write Capacity Percentages set.";
!echo "USING DOMAIN: ${hiveconf:domain} REGION: ${hiveconf:region} ";
!echo;
-- note that in column mapping it's left side to right side: [ HIVE_COLUMN -> DDB_COLUMN ]
CREATE EXTERNAL TABLE src_table_db (CustomerID string, DateTime string, URLClicked string, OrderID string, ProductID string)
STORED BY 'org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler'
TBLPROPERTIES ("dynamodb.table.name" = "CustomerTracker_{hiveconf:domain}_${hiveconf:region}_${hiveconf:yyyymm}",
"dynamodb.column.mapping" = "CustomerID:CustomerID,DateTime:DateTime,URLClicked:URLClicked,OrderID:OrderID,ProductID:ProductID");
-- In the above table properties you can also map Hive columns to differently named DynamoDB attributes,
-- but in this example there's no need to; an illustrative sketch follows.
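-- Illustrative only (the attribute name "customer_id" is hypothetical): if the DynamoDB attribute
-- were named "customer_id" while the Hive column stays CustomerID, the mapping entry would read:
--   "dynamodb.column.mapping" = "CustomerID:customer_id,..."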
!echo "CREATED SOURCE TABLE.";
!echo;
CREATE EXTERNAL TABLE dst_table_db (CustomerID string, CreationDateTime bigint, ExpirationTimestamp bigint, OrderID string)
STORED BY 'org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler'
TBLPROPERTIES ("dynamodb.table.name" = "${hiveconf:domain}_${hiveconf:region}_AnniversaryPromo_2017",
"dynamodb.column.mapping" = "CustomerID:CustomerID,CreationDateTime:CreationDateTime,ExpirationTimestamp:ExpirationTimestamp,OrderID:OrderID");
!echo "CREATED DESTINATION TABLE. Describing all tables.. ";
!echo;
DESCRIBE src_table_db;
!echo;
DESCRIBE dst_table_db;
!echo "Starting the copy ... ";
!echo "Using a Time to Live value of ${hivevar:ttl} for the expiration date.";
!echo;
-- Parse the source DateTime from a Joda DateTime string into epoch seconds for CreationDateTime, and do the same with the TTL added for ExpirationTimestamp.
-- We are effectively copying the data while transforming it on the fly so that CreationDateTime and ExpirationTimestamp land in the destination in the right format; this is what enables TTL.
-- We lose some accuracy when converting to epoch, since the stored format is Joda time and the built-in Hive functions do not support it directly.
-- We could get around the formatting issue by loading a custom Joda time UDF, but it's more trouble than it's worth for this use case. See https://github.com/lithiumtech/yoda-time
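-- A sketch of the accuracy loss (assumes an ISO-8601-style Joda string such as "2016-04-05T09:12:33.000-07:00"):
-- unix_timestamp(str, 'yyyy-MM-dd') parses only the leading date portion and ignores the remainder,
-- so the time of day is truncated to midnight in the cluster's local time zone. Uncomment to sanity-check:
-- SELECT unix_timestamp('2016-04-05T09:12:33.000-07:00', 'yyyy-MM-dd');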
INSERT OVERWRITE TABLE dst_table_db
SELECT CustomerID,
unix_timestamp(DateTime, 'yyyy-MM-dd') as CreationDateTime,
unix_timestamp(DateTime, 'yyyy-MM-dd') + ${hivevar:ttl} as ExpirationTimestamp,
OrderID
FROM src_table_db
WHERE length(OrderID) > 0;
-- Optional: add WHERE length(CustomerID) > 0 to guard against inserting a null value into the primary key
-- of the destination table. I don't need to do this here because CustomerID is the primary key of the source
-- table, so it cannot be null in the source data to begin with.
-- You can also add other filters, e.g. on URLClicked, to copy only the rows you want; a sketch follows.
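-- Illustrative only (the URL pattern is hypothetical): copy only rows whose click URL mentions a promo page.
-- WHERE length(OrderID) > 0 AND URLClicked LIKE '%promo%';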
!echo "INSERT COMPLETE.";
!echo "Backfill complete.";
#!/bin/bash
# Filename: startDynamoDBTableExport.sh
# Author: leewc
# This script copies the specific columns in the monthly CustomerTracker tables to a new yearly table for anniversary promotions.
# It is used in conjunction with the dynamoTodynamo.hive script.
# Note: You may need to grant execute permissions to run this script: [ chmod +x startDynamoDBTableExport.sh ]
# You will have to explicitly specify the domain, region, and months in this file. This is by design, to avoid
# copying the wrong data between regions.
months=(201705 201704 201703 201702 201701 201612 201611 201610 201609 201608 201607 201606 201605)
domain="Devo"
region="NA"
_start() {
    for month in "${months[@]}"
    do
        echo "Beginning Hive copy for CustomerTracker_${domain}_${region}_${month}."
        hive -hiveconf domain="${domain}" -hiveconf region="${region}" -hiveconf yyyymm="${month}" -f dynamoTodynamo.hive
    done
}
echo "This will run dynamoTodynamo Hive script for " ${domain} " and " ${region}
echo "With the following months: "
echo ""
echo ${months[*]}
echo ""
while true; do
    read -p "Are you sure this is what you want (Y/N)? " yn
    case $yn in
        [Yy] ) _start; break;;
        [Nn] ) echo "Aborting operation."; exit;;
        * ) echo "Please answer yes (Y) or no (N).";;
    esac
done