Atomx Druid Dataproc scripts
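Three files: a build script that repacks our Druid 0.11.0 build with the MySQL metadata-storage and Google extensions plus the GCS connector and ships the tar to the instances, the core-site.xml used on the cluster, and a helper that prints the gcloud command for creating the Dataproc cluster.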
#!/bin/bash
DRUID="0.11.0"
HADOOP="2.8.2"
CONNECTOR="1.6.1-hadoop2"
# Our build of druid is at ../druid-src
if [ ! -f ../druid-src/distribution/target/druid-${DRUID}-bin.tar.gz ]; then
  echo "../druid-src/distribution/target/druid-${DRUID}-bin.tar.gz doesn't exist"
  exit 1
fi
# Build a druid tar with all our configs and additional jars.
rm -rf druid-0.*
cp ../druid-src/distribution/target/druid-${DRUID}-bin.tar.gz .
tar -xzf druid-${DRUID}-bin.tar.gz
# Untar the mysql connector to the extensions dir.
rm -f mysql-metadata-storage-*.tar.gz
cp ../druid-src/distribution/target/mysql-metadata-storage-*.tar.gz .
tar -xzf mysql-metadata-storage-*.tar.gz
mv mysql-metadata-storage druid-${DRUID}/extensions/
mkdir druid-${DRUID}/extensions/druid-google-extensions
cp ../druid-src/extensions-contrib/google-extensions/target/druid-google-extensions-${DRUID}.jar druid-${DRUID}/extensions/druid-google-extensions/
# Put the gcs connector where it can be found.
if [ ! -f gcs-connector-${CONNECTOR}.jar ]; then
  wget https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-${CONNECTOR}.jar
fi
cp gcs-connector-${CONNECTOR}.jar druid-${DRUID}/lib/
cp gcs-connector-${CONNECTOR}.jar druid-${DRUID}/extensions/druid-hdfs-storage/
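# (The copy into lib/ puts the connector on Druid's own classpath; the copy
# into the druid-hdfs-storage extension is, presumably, what lets that
# extension use gs:// paths for deep storage.)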
rm -r druid-${DRUID}/conf
rm -r druid-${DRUID}/conf-quickstart/
# Use our own config instead of the Druid default ones
cp -r config/ druid-${DRUID}/config
rm druid-${DRUID}-bin.tar.gz
tar -czf druid-${DRUID}-bin.tar.gz druid-${DRUID}
# Distribute the druid tars to all instances
#gsutil cp druid-${DRUID}-bin.tar.gz gs://atomx/druid-${DRUID}-bin.tar.gz
cp druid-${DRUID}-bin.tar.gz ~/
scp -o UserKnownHostsFile=/dev/null -o CheckHostIP=no -o StrictHostKeyChecking=no druid-${DRUID}-bin.tar.gz druid-eu-historical-1:
#...
scp -o UserKnownHostsFile=/dev/null -o CheckHostIP=no -o StrictHostKeyChecking=no druid-${DRUID}-bin.tar.gz druid-indexer-1:
#...
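# The per-host scp lines above could also be a loop; the full host list is
# elided in this gist, so the names here are only illustrative:
#   for host in druid-eu-historical-1 druid-indexer-1; do
#     scp -o UserKnownHostsFile=/dev/null -o CheckHostIP=no \
#       -o StrictHostKeyChecking=no druid-${DRUID}-bin.tar.gz "${host}:"
#   done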
# Create a Dataproc initialization action that every cluster node runs at
# boot. (The heredoc delimiter is unquoted, so ${DRUID} and ${HADOOP}
# expand now, at build time.)
mkdir -p initialization/jars/
cat > initialization/initialization.sh << stop
#!/bin/bash
mkdir -p /tmp/initialization
gsutil -m rsync -r -d gs://hadoop-eu-atomx/${DRUID}-${HADOOP}/ /tmp/initialization/
rm /usr/lib/hadoop-mapreduce/jackson-*-2.*.jar
cp /tmp/initialization/jars/* /usr/lib/hadoop-mapreduce/
exit 0
stop
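# The jar swap in the initialization action exists because Druid and the
# Dataproc image bundle different Jackson 2.x versions; presumably the
# mismatch was breaking Druid's batch-indexing MapReduce jobs, so every
# node gets Druid's Jackson jars instead. Stage them next to the script: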
rm -rf initialization/jars/*.jar
cp druid-${DRUID}/lib/jackson-*-2.*.jar initialization/jars/
gsutil -m rsync -d -r initialization/ gs://hadoop-eu-atomx/${DRUID}-${HADOOP}/
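# On each instance the tar is presumably unpacked and Druid started from the
# custom config/ tree packed above; roughly (paths and the service name are
# illustrative):
#   tar -xzf druid-0.11.0-bin.tar.gz
#   cd druid-0.11.0
#   java -cp "config/_common:config/historical:lib/*" io.druid.cli.Main server historical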
<?xml version="1.0" ?><?xml-stylesheet type="text/xsl" href="configuration.xsl"?><!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
--><!-- Put site-specific property overrides in this file. --><configuration>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/hadoop/tmp</value>
    <description>A base for other temporary directories.</description>
  </property>
  <property>
    <name>fs.default.name</name>
    <value>hdfs://hadoop-eu-1-m:8020</value>
    <description>
      Deprecated alias for fs.defaultFS, still read by older tools such as
      FsShell.
    </description>
  </property>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://hadoop-eu-1-m:8020</value>
    <description>
      The name of the default file system. A URI whose scheme and authority
      determine the FileSystem implementation. The URI's scheme determines
      the config property (fs.SCHEME.impl) naming the FileSystem
      implementation class. The URI's authority is used to determine the
      host, port, etc. for a filesystem.
    </description>
  </property>
  <property>
    <name>fs.gs.working.dir</name>
    <value>/</value>
    <description>
      The directory inside the default bucket that relative gs: URIs
      resolve against.
    </description>
  </property>
  <property>
    <name>fs.gs.system.bucket</name>
    <value>hadoop-eu-atomx</value>
    <description>
      GCS bucket to use as a default bucket if fs.default.name is not a
      gs: URI.
    </description>
  </property>
  <property>
    <name>fs.gs.metadata.cache.directory</name>
    <value>/tmp/hadoop_gcs_connector_metadata_cache</value>
    <description>
      Only used if fs.gs.metadata.cache.type is FILESYSTEM_BACKED, specifies
      the local path to use as the base path for storing mirrored GCS
      metadata. Must be an absolute path, must be a directory, and must be
      fully readable/writable/executable by any user running processes which
      use the GCS connector.
    </description>
  </property>
  <property>
    <name>fs.gs.impl</name>
    <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem</value>
    <description>The FileSystem for gs: (GCS) URIs.</description>
  </property>
  <property>
    <name>fs.gs.project.id</name>
    <value>magnetic-icon-88807</value>
    <description>
      Google Cloud Project ID with access to configured GCS buckets.
    </description>
  </property>
  <property>
    <name>fs.gs.metadata.cache.enable</name>
    <value>true</value>
    <description>
      If true, a DirectoryListCache will be used to supplement "list"
      requests to GCS to fill in any missing items caused by eventual
      list consistency, intercepting create/delete/copy calls to create
      cache entries. The concrete type is determined with
      fs.gs.metadata.cache.type.
    </description>
  </property>
  <property>
    <name>fs.gs.implicit.dir.infer.enable</name>
    <value>true</value>
    <description>
      If set, we create and return in-memory directory objects on the fly
      when no backing object exists, but we know there are files with the
      same prefix.
    </description>
  </property>
  <property>
    <name>fs.AbstractFileSystem.gs.impl</name>
    <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS</value>
    <description>The AbstractFileSystem for gs: (GCS) URIs.</description>
  </property>
  <property>
    <name>fs.gs.metadata.cache.type</name>
    <value>FILESYSTEM_BACKED</value>
    <description>
      Specifies which implementation of DirectoryListCache to use for
      supplementing GCS API "list" requests. Supported implementations:
      IN_MEMORY: Enforces immediate consistency within the same Java
      process. FILESYSTEM_BACKED: Enforces consistency across all
      cooperating processes pointed at the same local mirror directory,
      which may be an NFS directory for massively-distributed coordination.
    </description>
  </property>
</configuration>
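The core-site.xml above points Hadoop at the Dataproc master (hadoop-eu-1-m:8020) and registers the GCS connector. Druid's batch-indexing tasks only see these settings if the file is on Druid's classpath as well; presumably it lives in the config/ tree the build script packs into the tar, along the lines of (path illustrative):
  cp core-site.xml config/_common/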
#!/bin/bash
# The leading "echo" makes this a dry run: the gcloud command is printed
# for review rather than executed. Drop the echo to actually create the
# cluster.
echo gcloud dataproc clusters create hadoop-eu-1 \
--bucket=hadoop-eu-atomx \
--image-version=1.2 \
--initialization-actions="gs://hadoop-eu-atomx/0.11.0-2.8.2/initialization.sh" \
--initialization-action-timeout 15m \
--master-boot-disk-size-gb 10 \
--master-machine-type n1-standard-16 \
--num-master-local-ssds 0 \
--worker-machine-type n1-standard-16 \
--worker-boot-disk-size-gb 10 \
--num-worker-local-ssds 1 \
--num-workers 6 \
--num-preemptible-workers 0 \
--zone europe-west1-c
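# Note: a Dataproc cluster named hadoop-eu-1 gets a master host named
# hadoop-eu-1-m, which is exactly what fs.defaultFS points at in the
# core-site.xml above; rename the cluster and that setting must follow.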