polleyg / list_gcp_projects.sh
Created February 18, 2022 04:55
Lists GCP projects
gcloud projects list --format="csv(name,projectId,projectNumber)" | grep "<something>"
polleyg / cloud_run_instances.sh
Created December 13, 2021 01:17
List all Cloud Run services and the max instances each is set to
gcloud run services list --project="<here>" --format=json | jq -r '.[] | [.metadata.name,.spec.template.metadata.annotations."autoscaling.knative.dev/maxScale"] | @tsv'
polleyg / bq_load_tpc.sh
Created January 14, 2020 03:55
Load ~1TB (~4.5B rows) from GCS into BigQuery (24 tables)
#!/bin/bash
# Load into BQ using the Fivetran public bucket with TPC data (approx 1TB and 4B rows).
# It creates 24 tables in BigQuery. Plug in your details to run. If "LOAD_SYNC" is changed
# to 'true', then each load job (24 of them) will be fired off synchronously and polled
# for completion before firing off the next one. Leaving it 'false' will fire off all 24 load
# jobs to execute in parallel. You can monitor the status of the jobs from the BigQuery UI or
# via the bq command line tool. It takes 6m-8m to load all 24 tables.
#
# more info: https://fivetran.com/blog/warehouse-benchmark && https://github.com/fivetran/benchmark
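The gist preview above only captures the header comments. A minimal sketch of the loop they describe might look like the following — the project, dataset, bucket path, table list, and --autodetect flag are all placeholders/assumptions, not the original script body:
#!/bin/bash
# Sketch only, not the original gist. PROJECT, DATASET, the table list,
# and the bucket path below are assumptions to be filled in.
PROJECT="<your-project>"
DATASET="<your-dataset>"
LOAD_SYNC=false  # 'true' = poll each job to completion before starting the next

for table in customer orders lineitem; do  # ...the real script loads 24 tables
  SRC="gs://<fivetran-benchmark-bucket>/${table}/*"
  if [ "$LOAD_SYNC" = "true" ]; then
    # bq's default synchronous mode blocks until the load job finishes
    bq --project_id="$PROJECT" load --autodetect "$DATASET.$table" "$SRC"
  else
    # --nosync submits the job and returns immediately, so all jobs run in parallel
    bq --project_id="$PROJECT" --nosync load --autodetect "$DATASET.$table" "$SRC"
  fi
done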
polleyg / SpannerToBigQuery.java
Created May 31, 2019 14:17
A trivial Cloud Dataflow pipeline that reads from Spanner and writes to BigQuery
package org.polleyg;
import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.cloud.spanner.Struct;
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.spanner.SpannerIO;
# Cloud Scheduler job that runs a Cloud Build trigger every minute
gcloud beta scheduler jobs create http foobar \
  --schedule="* * * * *" \
  --uri="https://cloudbuild.googleapis.com/v1/projects/cloud-scheduler-test-240911/triggers/f6553fe1-bfb7-4a39-b61f-2e47ada51a8d:run" \
  --message-body='{"branchName": "master"}' \
  --oauth-service-account-email=cloud-scheduler-test-240911@appspot.gserviceaccount.com
# Copy every table in dataset "foo" to dataset "foobarred", 4 copies in parallel
bq ls --format=csv foo | awk -F, 'NR>1{print $1}' | xargs -P 4 -I {} bq cp foo.{} foobarred.{}
export SOURCE_DATASET=$1  # project1:dataset
export DEST_PREFIX=$2     # project2:dataset2.any_prefix_
# Copy every table in the source dataset, appending the table name to the prefix
for f in $(bq ls "$SOURCE_DATASET" | grep TABLE | awk '{print $1}')
do
  CP_COMMAND="bq cp $SOURCE_DATASET.$f $DEST_PREFIX$f"
  echo "$CP_COMMAND"
  $CP_COMMAND
done
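An invocation might look like the following — the script name and dataset names are hypothetical, since the original gist header was lost:
# Hypothetical usage: copies every table from project1:dataset into
# project2:dataset2, prefixing each copy's name with "copy_".
./copy_dataset.sh project1:dataset project2:dataset2.copy_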
polleyg / beam_sql_all.java
Last active March 23, 2020 08:58
all the java
public class BeamSQLMagic {
public static final String HEADER = "year,month,day,wikimedia_project,language,title,views";
public static final Schema SCHEMA = Schema.builder()
.addStringField("lang")
.addInt32Field("views")
.build();
public static void main(String[] args) {
PipelineOptionsFactory.register(DataflowPipelineOptions.class);
DataflowPipelineOptions options = PipelineOptionsFactory
        .fromArgs(args)
        .withValidation()
        .as(DataflowPipelineOptions.class);
polleyg / beam_sql_part_4.java
Created January 23, 2019 09:12
beam_sql_part_4
[..]
.apply("transform_to_string", ParDo.of(new RowToString()))
.apply("write_to_gcs", TextIO.write().to("gs://batch-pipeline-sql/output/output.csv").withoutSharding());
[..]
//ParDo for Row (SQL) -> String
public static class RowToString extends DoFn<Row, String> {
@ProcessElement
public void processElement(ProcessContext c) {
// join the Row's values into one CSV line (needs java.util.stream.Collectors)
String line = c.element().getValues()
        .stream()
        .map(Object::toString)
        .collect(Collectors.joining(","));
c.output(line);
}
}
polleyg / beam_sql_part_3.java
Created January 23, 2019 06:42
beam_sql_part_3
[..]
.apply("transform_sql", SqlTransform.query(
"SELECT lang, SUM(views) as sum_views " +
"FROM PCOLLECTION GROUP BY lang")
)
[..]