Copy a MySQL table to BigQuery. If you need to copy all tables, use the wrapper loop given at the end. The script exits with error code 3 if blob columns are found. The exported files are first copied to Google Cloud Storage before being imported into BigQuery.
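A rough usage sketch (mydb, customers and my_dataset are placeholder names; the two scripts below are assumed to be saved as mysql2bq.sh and wrapper.sh):
# Copy a single table
./mysql2bq.sh mydb customers my_dataset
# Copy every table in the schema using the wrapper loop
./wrapper.sh my_dataset mydb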
#!/usr/bin/env bash
# Install the Google Cloud utilities: gsutil and the bq command-line tool
wget http://commondatastorage.googleapis.com/pub/gsutil.tar.gz
tar xfz gsutil.tar.gz -C "$HOME"
# Add gsutil to PATH for this session; append the same export line to ~/.bashrc to make it permanent
export PATH=${PATH}:$HOME/gsutil
cd "$HOME/gsutil"
python setup.py install
# Configure gsutil credentials, then install and initialise the bq tool
gsutil config
sudo easy_install bigquery
bq init
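A quick optional sanity check that both tools ended up on the PATH, assuming the install above succeeded:
gsutil version
bq version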
#!/bin/bash
# mysql2bq.sh: dump one MySQL table and load it into BigQuery
set -x
# Cleanup the mess we've made (globs must stay outside the quotes so they expand)
function cleanup_files() {
rm -f "${TABLE_NAME}".*
rm -f "${file_prefix}"*
}
TABLE_SCHEMA=$1
TABLE_NAME=$2
BIGQUERY_DATASET=$3
# Customize these for your organization
BUCKET_PREFIX="eq"
GZIP_ENABLED="false"
mytime=$(date '+%y%m%d%H%M')
#hostname=$(hostname | tr '[:upper:]' '[:lower:]')
file_prefix="${TABLE_NAME}${mytime}${TABLE_SCHEMA}"
bucket_name="${BUCKET_PREFIX}-${file_prefix}"
splitat="4000000000"
bulkfiles=200
maxbad=300
if [ $# -ne 3 ]; then
echo 'ERROR, must provide three arguments: ./mysql2bq.sh TABLE_SCHEMA TABLE_NAME BIGQUERY_DATASET'
exit 1
fi
# Check if the table has blob data
cat > blob_query.txt << heredoc
select sum(IF((DATA_TYPE LIKE '%blob%'),1, 0)) from INFORMATION_SCHEMA.columns where TABLE_SCHEMA = '$TABLE_SCHEMA' AND TABLE_NAME = '$TABLE_NAME'
heredoc
blob_count=$(mysql -Bs < blob_query.txt)
if [[ "$blob_count" -gt 0 ]]; then
echo "blob column found in table $TABLE_NAME"
exit 3
fi
if ! gsutil mb "gs://$bucket_name"; then
cleanup_files
echo "bucket $bucket_name could not be created in cloud"
exit 4
fi
# create JSON schema from mysql table structure
cat > json_query.txt << heredoc
select CONCAT('{"name": "', COLUMN_NAME, '","type":"', IF(DATA_TYPE like "%date%", "DATE",IF(DATA_TYPE like "%int%", "INTEGER",IF(DATA_TYPE = "decimal","FLOAT","STRING"))) , '"},') as json from information_schema.columns where TABLE_SCHEMA = '$TABLE_SCHEMA' AND TABLE_NAME = '$TABLE_NAME';
heredoc
echo '[' > "$TABLE_NAME.json"
mysql -Bs < json_query.txt | sed '$s/,$//' >> "$TABLE_NAME.json"
mysql "$TABLE_SCHEMA" -Bse"show create table $TABLE_NAME\\G" > "$TABLE_NAME.sql"
echo ', {"name": "hostname","type":"STRING"} ]' >> "$TABLE_NAME.json"
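# As an illustration: for a hypothetical table with columns id INT, created_at DATE and name VARCHAR,
# the generated $TABLE_NAME.json would look roughly like:
# [
# {"name": "id","type":"INTEGER"},
# {"name": "created_at","type":"DATE"},
# {"name": "name","type":"STRING"}
# , {"name": "hostname","type":"STRING"} ]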
# copy json and create table data to cloud
gsutil cp "$TABLE_NAME.json" "gs://$bucket_name/"
gsutil cp "$TABLE_NAME.sql" "gs://$bucket_name/"
# Dump the data, strip carriage returns, quotes and NULLs, and append the schema name as an extra column
time mysql --quick "$TABLE_SCHEMA" -Bse"select * from $TABLE_NAME" \
| sed -e "s/\(\\r\|\"\|NULL\)//g;s/$/\\t$TABLE_SCHEMA/" > "$TABLE_NAME.txt"
# split files with prefix
time split -C "$splitat" "$TABLE_NAME.txt" "$file_prefix"
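# split names the chunks ${file_prefix}aa, ${file_prefix}ab, ... and -C keeps each chunk
# below $splitat bytes without splitting a line across two files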
# -m parallelises the upload; -J additionally applies gzip transport encoding to the uploads
GSUTIL_CP_ARGS=(-m cp)
if [[ "$GZIP_ENABLED" == "true" ]]; then
GSUTIL_CP_ARGS=(-m cp -J)
fi
if ! gsutil "${GSUTIL_CP_ARGS[@]}" "$file_prefix"* "gs://$bucket_name/"; then
cleanup_files
echo "$file_prefix chunks could not be copied to cloud"
exit 3
fi
##
## Import to BigQuery
##
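# Load the uploaded chunks in batches of up to $bulkfiles files, passed to bq as one comma-separated URI list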
for mylist in $(gsutil ls "gs://$bucket_name/*" | xargs -n"$bulkfiles" | tr ' ' ','); do
echo "$mylist"
##
## Check if we have a DATE column, if we do then we can use partitioning
##
BQ_ARGS="--nosync -F \t --job_id=$TABLE_NAME --max_bad_records=$maxbad"
if grep -q "DATE" "$TABLE_NAME.json"; then
FIELD=$(grep "DATE" "$TABLE_NAME.json" | tail -1 | cut -d'"' -f4)
BQ_ARGS+=" --time_partitioning_type=DAY --time_partitioning_field=${FIELD}"
fi
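# (If a DATE column was found, e.g. a hypothetical created_at, BQ_ARGS now also contains
# --time_partitioning_type=DAY --time_partitioning_field=created_at)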
if ! bq load $BQ_ARGS "${BIGQUERY_DATASET}.${TABLE_NAME}" "$mylist" "${TABLE_NAME}.json"; then
echo "bq load failed for $mylist, check that the files exist in cloud storage"
cleanup_files
exit 2
fi
done
#!/bin/bash
# wrapper.sh: copy every table in a MySQL schema to BigQuery by calling mysql2bq.sh in a loop
BIGQUERY_DATASET=$1
TABLE_SCHEMA=$2
LOG_TO_FILE="false"
if [[ $# -ne 2 ]]; then
echo 'ERROR, must provide two arguments: ./wrapper.sh BIGQUERY_DATASET TABLE_SCHEMA'
exit 1
fi
# Create the dataset if it does not already exist (ignore the error if it does)
bq mk "$BIGQUERY_DATASET" || true
# mysqlshow has a structure with a header at the top, we strip this out with a tail
# and then use awk to get the contents of the table cell instead of the borders
for tbl_name in $(mysqlshow "$TABLE_SCHEMA" | tail -n +5 | awk '{print $2}'); do
#sh -xv mysql2bq.sh $TABLE_SCHEMA $tbl_name > script_succ.txt 2> script_err.txt
if [[ "$LOG_TO_FILE" == "true" ]]; then
./mysql2bq.sh "$TABLE_SCHEMA" "$tbl_name" "$BIGQUERY_DATASET" > script_success.txt 2> script_error.txt
else
./mysql2bq.sh "$TABLE_SCHEMA" "$tbl_name" "$BIGQUERY_DATASET"
fi
done