@csaroff
Forked from shantanuo/mysql_to_big_query.sh
Created September 25, 2020 09:37
Copy a MySQL table to BigQuery. If you need to copy all tables, use the loop given at the end. The script exits with error code 3 if blob or text columns are found. The tab-delimited dump files are first copied to Google Cloud Storage before being imported into BigQuery.
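For reference, a minimal invocation might look like this (a sketch only: it assumes the script below is saved as mysql_to_big_query.sh, that gsutil and bq are already configured, and that drupaldb and node are placeholder database and table names):
# copy one table: first argument is the database (schema) name, second is the table name
sh mysql_to_big_query.sh drupaldb node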
#!/bin/sh
TABLE_SCHEMA=$1
TABLE_NAME=$2
mytime=`date '+%y%m%d%H%M'`
hostname=`hostname | tr 'A-Z' 'a-z'`
file_prefix="trimax$TABLE_NAME$mytime$TABLE_SCHEMA"
bucket_name=$file_prefix
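# splitat: maximum bytes per split dump file; bulkfiles: gzipped files per bq load batch; maxbad: bad records tolerated by bq load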
splitat="4000000000"
bulkfiles=200
maxbad=300
# make sure schema and table names are supplied
if [ $# -ne 2 ];then
echo "DB and table name required"
exit 1
fi
# make sure the table does not have blob or text columns
cat > blob_query.txt << heredoc
select sum(IF((DATA_TYPE LIKE '%blob%' OR DATA_TYPE LIKE '%text%'),1, 0)) from INFORMATION_SCHEMA.columns where TABLE_SCHEMA = '$TABLE_SCHEMA' AND TABLE_NAME = '$TABLE_NAME'
heredoc
mycount=`mysql -Bs < blob_query.txt`
if [ $mycount -ne 0 ];then
echo "blob or text column found in table $TABLE_NAME"
exit 3
fi
# create google cloud bucket
gsutil mb gs://$bucket_name
if [ $? -ne 0 ];then
echo "bucket $bucket_name could not be created in cloud"
exit 4
fi
# create JSON schema from the mysql table structure (int columns map to INTEGER, decimal to FLOAT, everything else to STRING)
cat > json_query.txt << heredoc
select CONCAT('{"name": "', COLUMN_NAME, '","type":"', IF(DATA_TYPE like "%int%", "INTEGER",IF(DATA_TYPE = "decimal","FLOAT","STRING")) , '"},') as json from information_schema.columns where TABLE_SCHEMA = '$TABLE_SCHEMA' AND TABLE_NAME = '$TABLE_NAME';
heredoc
echo '[' > $TABLE_NAME.json
mysql -Bs < json_query.txt | sed '$s/,$//' >> $TABLE_NAME.json
mysql $TABLE_SCHEMA -Bse"show create table $TABLE_NAME\G" > $TABLE_NAME.sql
echo ', {"name": "hostname","type":"STRING"} ]' >> $TABLE_NAME.json
# copy json and create table data to cloud
gsutil cp $TABLE_NAME.json gs://$bucket_name/
gsutil cp $TABLE_NAME.sql gs://$bucket_name/
# dump data
time mysql $TABLE_SCHEMA -Bse"select * from $TABLE_NAME" > $TABLE_NAME.txt1
tr -d "\r" < $TABLE_NAME.txt1 > $TABLE_NAME.txt
sed -i "s/$/\t$TABLE_SCHEMA/" $TABLE_NAME.txt
sed -i 's/(Ctrl-v)(Ctrl-m)//g' $TABLE_NAME.txt
# split files with prefix
time split -C $splitat $TABLE_NAME.txt $file_prefix
# loop and upload files to google cloud
for file in `ls $file_prefix*`
do
# big query does not seem to like double quotes and NULL
time sed -i -e 's/\"//g' -e's/NULL//g' $file
time gzip $file
# copy to google cloud
time gsutil cp $file.gz gs://$bucket_name/
if [ $? -ne 0 ];then
echo "$file could not be copied to cloud"
exit 3
fi
rm -f $file.gz
done
# import data to big query
for mylist in `gsutil ls gs://$bucket_name/*.gz | xargs -n$bulkfiles | tr ' ' ','`
do
echo $mylist
mytime=`date '+%b%d%y'`
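# dataset is named after the current date; the error from bq mk is ignored if the dataset already exists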
time bq mk $mytime
# let bq generate a unique job id for each batch; tolerate up to $maxbad bad records per load
time bq load --nosync -F '\t' --max_bad_records=$maxbad $mytime.$TABLE_NAME $mylist $TABLE_NAME.json
if [ $? -ne 0 ];then
echo "bq load failed for $mylist, check that the files exist in cloud"
exit 2
fi
#sleep 35
done
rm -f $TABLE_NAME.json $TABLE_NAME.sql $TABLE_NAME.txt
exit
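The loop mentioned in the description above, for copying every table in a schema; it assumes the main script has been saved as myscript.sh.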
#!/bin/sh
TABLE_SCHEMA='drupaldb'
# "show tables" lists only the table names, without the header and border lines that mysqlshow prints
for tbl_name in `mysql $TABLE_SCHEMA -Bse 'show tables'`
do
# append so that logs from earlier tables are not overwritten
sh -xv myscript.sh $TABLE_SCHEMA $tbl_name >> script_succ.txt 2>> script_err.txt
done
# install google utilities
wget http://commondatastorage.googleapis.com/pub/gsutil.tar.gz
tar xfz gsutil.tar.gz -C $HOME
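# add the line below to ~/.bashrc so that gsutil is on the PATH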
vi ~/.bashrc
export PATH=${PATH}:$HOME/gsutil
cd gsutil
python setup.py install
gsutil config
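# install the BigQuery command line tool as root and initialise it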
sudo sh
easy_install bigquery
bq init
# use wget to download this script
wget https://gist.github.com/raw/4466298/8f842e248db27c336f8726116943afaf17d29ffb/mysql_to_big_query.sh
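A quick sanity check after installation (a sketch; both commands simply print the installed tool versions):
gsutil version
bq version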