Copy a MySQL table to BigQuery. If you need to copy all tables, use the loop given at the end. The script exits with error code 3 if blob or text columns are found. The tab-delimited data files are first copied to Google Cloud Storage before being imported into BigQuery.
#!/bin/sh
MYSQL_USER=$1
MYSQL_PASSWORD=$2
MYSQL_HOST=$3
TABLE_SCHEMA=$4
TABLE_NAME=$5
command -v gsutil > /dev/null 2>&1 || {
  echo "gsutil not available! Please install google-cloud-sdk: brew cask install google-cloud-sdk"
  exit 1
}
command -v bq > /dev/null 2>&1 || {
  echo "bq not available! Please install google-cloud-sdk: brew cask install google-cloud-sdk"
  exit 1
}
file_prefix="${TABLE_SCHEMA}_${TABLE_NAME}"
bucket_name="${TABLE_SCHEMA}_mysql"
splitat="4000000000"
bulkfiles=200
maxbad=300
mysqlargs="--user=$MYSQL_USER --password=$MYSQL_PASSWORD --host=$MYSQL_HOST"
# make sure all five arguments (credentials, host, schema and table) are supplied
if [ $# -ne 5 ]; then
  echo "Usage: mysql_to_bigquery.sh <mysql user> <mysql password> <mysql host> <schema> <table>"
  exit 1
fi
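# example invocation (hypothetical credentials, host, schema and table names):
#   ./mysql_to_bigquery.sh dbuser secret mysql.example.com mydb customers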
# make sure the table does not have blob or text columns
cat > blob_query.txt << heredoc
select count(*) from INFORMATION_SCHEMA.columns where TABLE_SCHEMA = '$TABLE_SCHEMA' AND TABLE_NAME = '$TABLE_NAME' AND (DATA_TYPE LIKE '%blob%' OR DATA_TYPE LIKE '%text%')
heredoc
mycount=$(mysql $mysqlargs -Bs < blob_query.txt)
if [ "$mycount" -ne 0 ]; then
  echo "blob or text column found in table $TABLE_NAME"
  exit 3
fi
# create google cloud bucket
gsutil mb gs://$bucket_name > /tmp/gsutil_mb 2>&1
if [ $? -ne 0 ]; then
  grep 'already exists' /tmp/gsutil_mb
  if [ $? -ne 0 ]; then
    echo "bucket $bucket_name could not be created in cloud"
    exit 4
  fi
fi
# create JSON schema from the mysql table structure
cat > json_query.txt << heredoc
select CONCAT('{"name": "', COLUMN_NAME, '","type":"', IF(DATA_TYPE like "%int%", "INTEGER",IF(DATA_TYPE = "decimal","FLOAT","STRING")) , '"},') as json from information_schema.columns where TABLE_SCHEMA = '$TABLE_SCHEMA' AND TABLE_NAME = '$TABLE_NAME';
heredoc
echo '[' > $TABLE_NAME.json
mysql $mysqlargs -Bs < json_query.txt | sed '$s/,$//' >> $TABLE_NAME.json
mysql $mysqlargs $TABLE_SCHEMA -Bse"show create table $TABLE_NAME\G" > $TABLE_NAME.sql
echo ']' >> $TABLE_NAME.json
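# for a hypothetical table with columns (id int, name varchar, price decimal),
# the generated $TABLE_NAME.json would look roughly like:
# [
# {"name": "id","type":"INTEGER"},
# {"name": "name","type":"STRING"},
# {"name": "price","type":"FLOAT"}
# ]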
# copy json schema to cloud
gsutil cp $TABLE_NAME.json gs://$bucket_name/
# dump data
mysql $mysqlargs $TABLE_SCHEMA -Bse"select * from $TABLE_NAME" > $TABLE_NAME.txt1
# strip carriage returns
tr -d "\r" < $TABLE_NAME.txt1 > $TABLE_NAME.txt
# append the schema name as an extra trailing column
sed -i "s/$/\t$TABLE_SCHEMA/" $TABLE_NAME.txt
# split into chunks of at most $splitat bytes, breaking only at line boundaries
# so that no row is split across two chunks
split -C $splitat $TABLE_NAME.txt $file_prefix
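# split names the chunks ${file_prefix}aa, ${file_prefix}ab, ... which is the
# pattern the upload loop and the gsutil ls below rely on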
# loop and upload files to google cloud
for file in ${file_prefix}*
do
  # big query does not seem to like double quotes and NULL
  sed -i -e 's/\"//g' $file
  sed -i -e 's/NULL//g' $file
  gzip $file
  # copy to google cloud
  gsutil cp $file.gz gs://$bucket_name/
  if [ $? -ne 0 ]; then
    echo "$file could not be copied to cloud"
    exit 3
  fi
  rm -f $file.gz
done
# import data to big query
for mylist in $(gsutil ls gs://$bucket_name/${file_prefix}*.gz | xargs -n$bulkfiles | tr ' ' ',')
do
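  # $mylist is one comma-separated batch of at most $bulkfiles gs:// URIs, e.g.
  # (hypothetical): gs://mydb_mysql/mydb_customersaa.gz,gs://mydb_mysql/mydb_customersab.gz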
  echo $mylist
  # the dataset is created on the first pass; later passes fail harmlessly because it already exists
  bq mk $bucket_name
  bq load --nosync -F '\t' --max_bad_records=$maxbad $bucket_name.$TABLE_NAME $mylist $TABLE_NAME.json
  if [ $? -ne 0 ]; then
    echo "bq load failed for $mylist"
    exit 2
  fi
done
rm -f $TABLE_NAME.json $TABLE_NAME.sql $TABLE_NAME.txt $TABLE_NAME.txt1 blob_query.txt json_query.txt
exit 0
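The description mentions a loop for copying all tables; it is not shown above, so here is a minimal sketch, assuming the script above is saved as mysql_to_bigquery.sh and that the credentials, host and schema below are placeholders:

#!/bin/sh
# hand every base table in the schema to the single-table script above
MYSQL_USER=dbuser
MYSQL_PASSWORD=secret
MYSQL_HOST=mysql.example.com
TABLE_SCHEMA=mydb
for TABLE_NAME in $(mysql --user=$MYSQL_USER --password=$MYSQL_PASSWORD --host=$MYSQL_HOST -Bse "select TABLE_NAME from information_schema.tables where TABLE_SCHEMA = '$TABLE_SCHEMA' and TABLE_TYPE = 'BASE TABLE'")
do
  ./mysql_to_bigquery.sh $MYSQL_USER $MYSQL_PASSWORD $MYSQL_HOST $TABLE_SCHEMA $TABLE_NAME
done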