Copy a MySQL table to BigQuery. If you need to copy all tables, use the wrapper loop given at the end. The script exits with error code 3 if blob columns are found. The exported files are first copied to Google Cloud Storage before being imported into BigQuery.
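A rough usage sketch (mydb, customers and my_dataset are placeholder names; the two scripts below are assumed to be saved as mysql2bq.sh and wrapper.sh):
# Copy a single table
./mysql2bq.sh mydb customers my_dataset
# Copy every table in the schema using the wrapper loop
./wrapper.sh my_dataset mydb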
#!/usr/bin/env bash
# Install the Google Cloud utilities: gsutil and the bq command-line tool
wget http://commondatastorage.googleapis.com/pub/gsutil.tar.gz
tar xfz gsutil.tar.gz -C "$HOME"
# Add gsutil to PATH for this session; append the same export line to ~/.bashrc to make it permanent
export PATH=${PATH}:$HOME/gsutil
cd "$HOME/gsutil"
python setup.py install
# Configure gsutil credentials, then install and initialise the bq tool
gsutil config
sudo easy_install bigquery
bq init
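A quick optional sanity check that both tools ended up on the PATH, assuming the install above succeeded:
gsutil version
bq version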
#!/bin/bash
# mysql2bq.sh: dump one MySQL table and load it into BigQuery
set -x
# Cleanup the mess we've made (globs must stay outside the quotes so they expand)
function cleanup_files() {
rm -f "${TABLE_NAME}".*
rm -f "${file_prefix}"*
}
TABLE_SCHEMA=$1
TABLE_NAME=$2
BIGQUERY_DATASET=$3
# Customize these for your organization
BUCKET_PREFIX="eq"
GZIP_ENABLED="false"
mytime=$(date '+%y%m%d%H%M')
#hostname=$(hostname | tr '[:upper:]' '[:lower:]')
file_prefix="${TABLE_NAME}${mytime}${TABLE_SCHEMA}"
bucket_name="${BUCKET_PREFIX}-${file_prefix}"
splitat="4000000000"
bulkfiles=200
maxbad=300
if [ $# -ne 3 ]; then
echo 'ERROR, must provide three arguments: ./mysql2bq.sh TABLE_SCHEMA TABLE_NAME BIGQUERY_DATASET'
exit 1
fi
# Check if the table has blob data
cat > blob_query.txt << heredoc
select sum(IF((DATA_TYPE LIKE '%blob%'),1, 0)) from INFORMATION_SCHEMA.columns where TABLE_SCHEMA = '$TABLE_SCHEMA' AND TABLE_NAME = '$TABLE_NAME'
heredoc
blob_count=$(mysql -Bs < blob_query.txt)
if [[ "$blob_count" -gt 0 ]]; then
echo "blob column found in table $TABLE_NAME"
exit 3
fi
if ! gsutil mb "gs://$bucket_name"; then
cleanup_files
echo "bucket $bucket_name could not be created in cloud"
exit 4
fi
# create JSON schema from mysql table structure
cat > json_query.txt << heredoc
select CONCAT('{"name": "', COLUMN_NAME, '","type":"', IF(DATA_TYPE like "%date%", "DATE",IF(DATA_TYPE like "%int%", "INTEGER",IF(DATA_TYPE = "decimal","FLOAT","STRING"))) , '"},') as json from information_schema.columns where TABLE_SCHEMA = '$TABLE_SCHEMA' AND TABLE_NAME = '$TABLE_NAME';
heredoc
echo '[' > "$TABLE_NAME.json"
mysql -Bs < json_query.txt | sed '$s/,$//' >> "$TABLE_NAME.json"
mysql "$TABLE_SCHEMA" -Bse"show create table $TABLE_NAME\\G" > "$TABLE_NAME.sql"
echo ', {"name": "hostname","type":"STRING"} ]' >> "$TABLE_NAME.json"
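# As an illustration: for a hypothetical table with columns id INT, created_at DATE and name VARCHAR,
# the generated $TABLE_NAME.json would look roughly like:
# [
# {"name": "id","type":"INTEGER"},
# {"name": "created_at","type":"DATE"},
# {"name": "name","type":"STRING"}
# , {"name": "hostname","type":"STRING"} ]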
# copy json and create table data to cloud
gsutil cp "$TABLE_NAME.json" "gs://$bucket_name/"
gsutil cp "$TABLE_NAME.sql" "gs://$bucket_name/"
# Dump the data, strip carriage returns, quotes and NULLs, and append the schema name as an extra column
time mysql --quick "$TABLE_SCHEMA" -Bse"select * from $TABLE_NAME" \
| sed -e "s/\(\\r\|\"\|NULL\)//g;s/$/\\t$TABLE_SCHEMA/" > "$TABLE_NAME.txt"
# split files with prefix
time split -C "$splitat" "$TABLE_NAME.txt" "$file_prefix"
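# split names the chunks ${file_prefix}aa, ${file_prefix}ab, ... and -C keeps each chunk
# below $splitat bytes without splitting a line across two files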
# -m parallelises the upload; -J additionally applies gzip transport encoding to the uploads
GSUTIL_CP_ARGS=(-m cp)
if [[ "$GZIP_ENABLED" == "true" ]]; then
GSUTIL_CP_ARGS=(-m cp -J)
fi
if ! gsutil "${GSUTIL_CP_ARGS[@]}" "$file_prefix"* "gs://$bucket_name/"; then
cleanup_files
echo "$file_prefix chunks could not be copied to cloud"
exit 3
fi
##
## Import to BigQuery
##
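# Load the uploaded chunks in batches of up to $bulkfiles files, passed to bq as one comma-separated URI list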
for mylist in $(gsutil ls "gs://$bucket_name/*" | xargs -n"$bulkfiles" | tr ' ' ','); do
echo "$mylist"
##
## Check if we have a DATE column, if we do then we can use partitioning
##
BQ_ARGS="--nosync -F \t --job_id=$TABLE_NAME --max_bad_records=$maxbad"
if grep -q "DATE" "$TABLE_NAME.json"; then
FIELD=$(grep "DATE" "$TABLE_NAME.json" | tail -1 | cut -d'"' -f4)
BQ_ARGS+=" --time_partitioning_type=DAY --time_partitioning_field=${FIELD}"
fi
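# (If a DATE column was found, e.g. a hypothetical created_at, BQ_ARGS now also contains
# --time_partitioning_type=DAY --time_partitioning_field=created_at)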
if ! bq load $BQ_ARGS "${BIGQUERY_DATASET}.${TABLE_NAME}" "$mylist" "${TABLE_NAME}.json"; then
echo "bq load failed for $mylist, check that the files exist in cloud storage"
cleanup_files
exit 2
fi
done
#!/bin/bash
# wrapper.sh: copy every table in a MySQL schema to BigQuery by calling mysql2bq.sh in a loop
BIGQUERY_DATASET=$1
TABLE_SCHEMA=$2
LOG_TO_FILE="false"
if [[ $# -ne 2 ]]; then
echo 'ERROR, must provide two arguments: ./wrapper.sh BIGQUERY_DATASET TABLE_SCHEMA'
exit 1
fi
# Create the dataset if it does not already exist (ignore the error if it does)
bq mk "$BIGQUERY_DATASET" || true
# mysqlshow has a structure with a header at the top, we strip this out with a tail
# and then use awk to get the contents of the table cell instead of the borders
for tbl_name in $(mysqlshow "$TABLE_SCHEMA" | tail -n +5 | awk '{print $2}'); do
#sh -xv mysql2bq.sh $TABLE_SCHEMA $tbl_name > script_succ.txt 2> script_err.txt
if [[ "$LOG_TO_FILE" == "true" ]]; then
./mysql2bq.sh "$TABLE_SCHEMA" "$tbl_name" "$BIGQUERY_DATASET" > script_success.txt 2> script_error.txt
else
./mysql2bq.sh "$TABLE_SCHEMA" "$tbl_name" "$BIGQUERY_DATASET"
fi
done