Last active
August 4, 2023 07:13
-
-
Save mikesparr/ebea5a7a490a0a7668ba87b07db31338 to your computer and use it in GitHub Desktop.
Experiment using Datastream to implement CDC between Cloud SQL (MySQL) database and Google Cloud Storage (GCS) bucket
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
##################################################################### | |
# REFERENCES | |
# - https://cloud.google.com/vpc/docs/create-modify-vpc-networks | |
# - https://cloud.google.com/sql/docs/mysql/configure-private-services-access | |
# - https://cloud.google.com/sql/docs/mysql/configure-private-ip | |
# - https://cloud.google.com/iam/docs/service-accounts-create | |
# - https://cloud.google.com/compute/docs/instances/startup-scripts/linux | |
# - https://cloud.google.com/storage/docs/discover-object-storage-gcloud | |
# - https://cloud.google.com/datastream/docs/quickstart-replication-to-bigquery | |
# - https://cloud.google.com/datastream/docs/private-connectivity | |
# - https://cloud.google.com/sdk/gcloud/reference/datastream/streams/create | |
# - https://github.com/datacharmer/test_db | |
##################################################################### | |
export PROJECT_ID=$(gcloud config get-value project) | |
export PROJECT_USER=$(gcloud config get-value core/account) # set current user | |
export PROJECT_NUMBER=$(gcloud projects describe $PROJECT_ID --format="value(projectNumber)") | |
export IDNS=${PROJECT_ID}.svc.id.goog # workflow identity domain | |
export GCP_REGION="us-central1" # CHANGEME (OPT) | |
export GCP_ZONE="us-central1-a" # CHANGEME (OPT) | |
export NETWORK_NAME="default" | |
# enable apis | |
gcloud services enable compute.googleapis.com \ | |
servicenetworking.googleapis.com \ | |
container.googleapis.com \ | |
storage.googleapis.com \ | |
sqladmin.googleapis.com \ | |
datastream.googleapis.com \ | |
iap.googleapis.com | |
# configure gcloud sdk | |
gcloud config set compute/region $GCP_REGION | |
gcloud config set compute/zone $GCP_ZONE | |
############################################################# | |
# ORGANIZATION | |
# assumptions | |
# - custom service account for proxy vm to limit scope | |
############################################################# | |
export SVC_ACCT_PROXY_NAME="datastream-proxy-sa" | |
export SVC_ACCT_PROXY_EMAIL="$SVC_ACCT_PROXY_NAME@$PROJECT_ID.iam.gserviceaccount.com" | |
# create service account | |
gcloud iam service-accounts create $SVC_ACCT_PROXY_NAME \ | |
--description="Datastream proxy SA" \ | |
--display-name="$SVC_ACCT_PROXY_NAME" | |
# create iam roles for svc acct (could restrict these further if desired for least privilege) | |
gcloud projects add-iam-policy-binding $PROJECT_ID \ | |
--member="serviceAccount:$SVC_ACCT_PROXY_EMAIL" \ | |
--role="roles/compute.instanceAdmin.v1" | |
gcloud projects add-iam-policy-binding $PROJECT_ID \ | |
--member="serviceAccount:$SVC_ACCT_PROXY_EMAIL" \ | |
--role="roles/iam.serviceAccountUser" | |
# allow IAP access for testing (better to add serviceAccountUser at SA level) | |
gcloud projects add-iam-policy-binding $PROJECT_ID \ | |
--member="user:$PROJECT_USER" \ | |
--role="roles/iap.tunnelResourceAccessor" | |
gcloud projects add-iam-policy-binding $PROJECT_ID \ | |
--member="user:$PROJECT_USER" \ | |
--role="roles/compute.instanceAdmin.v1" | |
gcloud projects add-iam-policy-binding $PROJECT_ID \ | |
--member="user:$PROJECT_USER" \ | |
--role="roles/iam.serviceAccountUser" | |
############################################################# | |
# NETWORKING | |
# - network 1: cloud sql (private tenant) | |
# - reserved range: 10.10.0.0/16 | |
# - datastream range: 10.15.0.0/29 | |
# - compute subnet: 10.20.0.0/16 | |
############################################################# | |
export NETWORK_NAME="test-network-10-10-0-0" | |
export PRIVATE_RANGE_NAME="managed-services-$NETWORK_NAME" | |
export PRIVATE_RANGE_ADDRESS="10.10.0.0" | |
export SUBNET_COMPUTE_NAME="compute-10-20-0-0" | |
export SUBNET_COMPUTE_RANGE="10.20.0.0/16" | |
export CLOUD_ROUTER_NAME="router-1" | |
export CLOUD_ROUTER_ASN="64520" | |
export NAT_GW_NAME="internet-gw" | |
export DS_PRIVATE_CONN_RANGE="10.15.0.0/29" | |
# create network | |
gcloud compute networks create $NETWORK_NAME \ | |
--subnet-mode=custom \ | |
--bgp-routing-mode=global | |
# allocate private access range | |
gcloud compute addresses create $PRIVATE_RANGE_NAME \ | |
--global \ | |
--purpose=VPC_PEERING \ | |
--addresses=$PRIVATE_RANGE_ADDRESS \ | |
--prefix-length=16 \ | |
--network="projects/$PROJECT_ID/global/networks/$NETWORK_NAME" | |
# create private connection | |
gcloud services vpc-peerings connect \ | |
--service=servicenetworking.googleapis.com \ | |
--ranges=$PRIVATE_RANGE_NAME \ | |
--network=$NETWORK_NAME \ | |
--project=$PROJECT_ID | |
# create subnet range for compute instances | |
gcloud compute networks subnets create $SUBNET_COMPUTE_NAME \ | |
--region=$GCP_REGION \ | |
--network=$NETWORK_NAME \ | |
--range=$SUBNET_COMPUTE_RANGE | |
# create cloud router and nat gateway | |
gcloud compute routers create $CLOUD_ROUTER_NAME \ | |
--network $NETWORK_NAME \ | |
--asn $CLOUD_ROUTER_ASN \ | |
--region $GCP_REGION | |
gcloud compute routers nats create $NAT_GW_NAME \ | |
--router=$CLOUD_ROUTER_NAME \ | |
--region=$GCP_REGION \ | |
--auto-allocate-nat-external-ips \ | |
--nat-all-subnet-ip-ranges \ | |
--enable-logging | |
# enable IAP tunnel access for testing | |
gcloud compute firewall-rules create allow-ssh-ingress-from-iap \ | |
--network=$NETWORK_NAME \ | |
--project=$PROJECT_ID \ | |
--direction=INGRESS \ | |
--action=allow \ | |
--rules=tcp:22 \ | |
--source-ranges=35.235.240.0/20 | |
# enable private connectivity traffic | |
gcloud compute firewall-rules create allow-private-connectivity \ | |
--direction=INGRESS \ | |
--priority=1000 \ | |
--network=$NETWORK_NAME \ | |
--project=$PROJECT_ID \ | |
--action=ALLOW \ | |
--rules=TCP \ | |
--source-ranges="$PRIVATE_RANGE_ADDRESS/16",$SUBNET_COMPUTE_RANGE,$DS_PRIVATE_CONN_RANGE | |
############################################################# | |
# DATABASE (MYSQL) | |
############################################################# | |
export DB_INSTANCE_NAME="mysqldb1" | |
export DB_VERSION="MYSQL_8_0" | |
export DB_CPU="2" | |
export DB_MEMORY="7680MB" | |
export DB_LOG_RETENTION_DAYS="7" | |
export DB_USER="datastream" | |
export DB_PASS=$(openssl rand -base64 32) # random | |
# create private cloud sql instance | |
gcloud beta sql instances create $DB_INSTANCE_NAME \ | |
--project=$PROJECT_ID \ | |
--network=projects/$PROJECT_ID/global/networks/$NETWORK_NAME \ | |
--no-assign-ip \ | |
--enable-bin-log \ | |
--retained-transaction-log-days=$DB_LOG_RETENTION_DAYS \ | |
--allocated-ip-range-name=$PRIVATE_RANGE_NAME \ | |
--enable-google-private-path \ | |
--database-version=$DB_VERSION \ | |
--cpu=$DB_CPU \ | |
--memory=$DB_MEMORY \ | |
--region=$GCP_REGION | |
# lock down mysql default (root) user [manually input at prompt] | |
gcloud sql users set-password root \ | |
--host=% \ | |
--instance=$DB_INSTANCE_NAME \ | |
--prompt-for-password \ | |
--project=$PROJECT_ID | |
# create user for datastream | |
gcloud sql users create $DB_USER \ | |
--host=% \ | |
--instance=$DB_INSTANCE_NAME \ | |
--password=$DB_PASS | |
# fetch database IP, etc. | |
export DB_HOST=$(gcloud beta sql instances describe $DB_INSTANCE_NAME --format="value(ipAddresses.ipAddress)" --project $PROJECT_ID) | |
export DB_PROFILE=$(gcloud beta sql instances describe $DB_INSTANCE_NAME --format="value(connectionName)" --project $PROJECT_ID) | |
export DB_PORT="3306" # standard for mysql | |
############################################################# | |
# DATASTREAM NETWORK PROXY (GCE INSTANCE) | |
############################################################# | |
export INSTANCE_TEMPLATE_NAME="datastream-proxy-debian" | |
export INSTANCE_MACHINE_TYPE="e2-standard-2" | |
export INSTANCE_IMAGE_FAMILY="debian-10" | |
export INSTANCE_IMAGE_PROJECT="debian-cloud" | |
export INSTANCE_BOOT_DISK_SIZE="250GB" | |
export INSTANCE_STARTUP_SCRIPT_NAME="startup.sh" | |
export PROXY_GROUP_NAME="ds-proxy" | |
# create startup script to configure proxy | |
cat > $INSTANCE_STARTUP_SCRIPT_NAME << EOF | |
#! /bin/bash | |
export DB_ADDR=$DB_HOST | |
export DB_PORT=$DB_PORT | |
export ETH_NAME=\$(ip -o link show | awk -F': ' '{print \$2}' | grep -v lo) | |
export LOCAL_IP_ADDR=\$(ip -4 addr show \$ETH_NAME | grep -Po 'inet \K[\d.]+') | |
echo 1 > /proc/sys/net/ipv4/ip_forward | |
iptables -t nat -A PREROUTING -p tcp -m tcp --dport \$DB_PORT -j DNAT \ | |
--to-destination \$DB_ADDR:\$DB_PORT | |
iptables -t nat -A POSTROUTING -j SNAT --to-source \$LOCAL_IP_ADDR | |
EOF | |
# create compute instance template | |
gcloud compute instance-templates create $INSTANCE_TEMPLATE_NAME \ | |
--machine-type=$INSTANCE_MACHINE_TYPE \ | |
--image-family=$INSTANCE_IMAGE_FAMILY \ | |
--image-project=$INSTANCE_IMAGE_PROJECT \ | |
--boot-disk-size=$INSTANCE_BOOT_DISK_SIZE \ | |
--no-address \ | |
--no-scopes \ | |
--service-account=$SVC_ACCT_PROXY_EMAIL \ | |
--network=$NETWORK_NAME \ | |
--subnet=$SUBNET_COMPUTE_NAME \ | |
--metadata-from-file=startup-script=$(PWD)/$INSTANCE_STARTUP_SCRIPT_NAME | |
# create instance group for proxy for resilience | |
gcloud compute instance-groups managed create $PROXY_GROUP_NAME \ | |
--region=$GCP_REGION \ | |
--template=$INSTANCE_TEMPLATE_NAME \ | |
--instance-redistribution-type=none \ | |
--size=0 | |
# create instance with stateful name for proxy to retain DS conn | |
# (private preview for stateful IP may be desirable in future when GA) | |
gcloud compute instance-groups managed create-instance $PROXY_GROUP_NAME \ | |
--instance=$PROXY_GROUP_NAME-1 \ | |
--region=$GCP_REGION | |
export PROXY_ZONE=$(gcloud compute instances list --filter="name=$PROXY_GROUP_NAME-1" --format="value(zone)") | |
export PROXY_IP=$(gcloud compute instances list --filter="name=$PROXY_GROUP_NAME-1" --format="value(networkInterfaces[].networkIP)") | |
# test startup script ran and instance proxies connection to database | |
gcloud compute ssh $PROXY_GROUP_NAME-1 \ | |
--zone $PROXY_ZONE -- "sudo journalctl -u google-startup-scripts.service" | |
gcloud compute ssh $PROXY_GROUP_NAME-1 \ | |
--zone $PROXY_ZONE -- "sudo apt update && sudo apt install -y netcat && sudo apt install -y default-mysql-client" | |
gcloud compute ssh $PROXY_GROUP_NAME-1 \ | |
--zone $PROXY_ZONE -- "nc -zv $DB_HOST $DB_PORT" | |
############################################################# | |
# STORAGE (CDC DESTINATION) | |
############################################################# | |
export BUCKET_NAME="mike-test-mysql-cdc" | |
# create storage bucket | |
gcloud storage buckets create gs://$BUCKET_NAME --uniform-bucket-level-access | |
############################################################# | |
# DATASTREAM | |
############################################################# | |
export DS_PRIVATE_CONN_NAME="datastream-connectivity" | |
export DS_CONN_PROFILE_MYSQL="ds-mysql-profile" | |
export DS_CONN_PROFILE_GCS="ds-gcs-profile" | |
export DS_SOURCE_CONFIG_FILE="source-config.json" | |
export DS_DEST_CONFIG_FILE="dest-config.json" | |
export DS_STREAM_NAME="employees-stream" | |
# create private connection range | |
gcloud datastream private-connections create $DS_PRIVATE_CONN_NAME \ | |
--location=$GCP_REGION \ | |
--display-name=$DS_PRIVATE_CONN_NAME \ | |
--vpc-name=$NETWORK_NAME \ | |
--subnet=$DS_PRIVATE_CONN_RANGE | |
# create connection profile (mysql) | |
gcloud datastream connection-profiles create $DS_CONN_PROFILE_MYSQL \ | |
--location=$GCP_REGION \ | |
--type=mysql \ | |
--mysql-password=$DB_PASS \ | |
--mysql-username=$DB_USER \ | |
--display-name=$DS_CONN_PROFILE_MYSQL \ | |
--mysql-hostname=$PROXY_IP \ | |
--mysql-port=$DB_PORT \ | |
--private-connection=$DS_PRIVATE_CONN_NAME | |
# create connection profile (gcs) | |
gcloud datastream connection-profiles create $DS_CONN_PROFILE_GCS \ | |
--location=$GCP_REGION \ | |
--type=google-cloud-storage \ | |
--bucket=$BUCKET_NAME \ | |
--root-path=/datastream \ | |
--display-name=$DS_CONN_PROFILE_GCS | |
# create source config file | |
cat > $DS_SOURCE_CONFIG_FILE << EOF | |
{ | |
"excludeObjects": {}, | |
"includeObjects": { | |
"mysqlDatabases": [ | |
{ | |
"database":"employees" | |
} | |
] | |
} | |
} | |
EOF | |
# create dest config file | |
cat > $DS_DEST_CONFIG_FILE << EOF | |
{ | |
"path": "employees", | |
"fileRotationMb":5, | |
"fileRotationInterval":"15s", | |
"avroFileFormat": {} | |
} | |
EOF | |
# create stream | |
gcloud datastream streams create $DS_STREAM_NAME \ | |
--location=$GCP_REGION \ | |
--display-name=$DS_STREAM_NAME \ | |
--source=$DS_CONN_PROFILE_MYSQL \ | |
--mysql-source-config=$DS_SOURCE_CONFIG_FILE \ | |
--destination=$DS_CONN_PROFILE_GCS \ | |
--gcs-destination-config=$DS_DEST_CONFIG_FILE \ | |
--backfill-all | |
# TODO: start stream (currently click start in Console) | |
############################################################# | |
# TEST DATA LOADING | |
############################################################# | |
# load database via proxy host (test purpose only) | |
gcloud compute ssh $PROXY_GROUP_NAME-1 \ | |
--zone $PROXY_ZONE -- "sudo apt install -y default-mysql-client && sudo apt install -y git" | |
# clone same data repo on remote machine | |
gcloud compute ssh $PROXY_GROUP_NAME-1 \ | |
--zone $PROXY_ZONE -- "git clone https://github.com/datacharmer/test_db.git" | |
# cd to sample data repo and load in database | |
gcloud compute ssh $PROXY_GROUP_NAME-1 \ | |
--zone $PROXY_ZONE -- "cd test_db && mysql -h $DB_HOST -u root -p < employees.sql" | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Datastream CDC with MySQL and Cloud Storage
Recently a customer reported a unique configuration using GKE pods as the proxy for a private Datastream connection to MySQL, so I thought I would first set up a baseline experiment and ensure no similar errors with Compute Engine (recommended) for the proxy.
As I explored documentation I discovered that current Google documentation instructs users to create Datastream streams using point-and-click in the console and I couldn't find any instructions using the CLI
gcloud
SDK, so I decided to illustrate how this could be set up via scripting instead of "click ops".Experiment 1
Results (success, no errors)
Stream created
Private connection success
Connection profiles success
3.9 million records processed
Tables and data synched
Synched data stored in GCS buckets