Atomx Druid Dataproc scripts
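Three files: a build script that repacks our Druid 0.11.0 build with the MySQL metadata-storage and Google extensions plus the GCS connector and ships the tar to the instances, the core-site.xml used on the cluster, and a helper that prints the gcloud command for creating the Dataproc cluster.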
#!/bin/bash
DRUID="0.11.0"
HADOOP="2.8.2"
CONNECTOR="1.6.1-hadoop2"
# Our build of druid is at ../druid-src
if [ ! -f ../druid-src/distribution/target/druid-${DRUID}-bin.tar.gz ]; then
  echo "../druid-src/distribution/target/druid-${DRUID}-bin.tar.gz doesn't exist"
  exit 1
fi
# Build a druid tar with all our configs and additional jars.
rm -rf druid-0.*
cp ../druid-src/distribution/target/druid-${DRUID}-bin.tar.gz .
tar -xzf druid-${DRUID}-bin.tar.gz
# Untar the mysql connector to the extensions dir.
rm -f mysql-metadata-storage-*.tar.gz
cp ../druid-src/distribution/target/mysql-metadata-storage-*.tar.gz .
tar -xzf mysql-metadata-storage-*.tar.gz
mv mysql-metadata-storage druid-${DRUID}/extensions/
mkdir druid-${DRUID}/extensions/druid-google-extensions
cp ../druid-src/extensions-contrib/google-extensions/target/druid-google-extensions-${DRUID}.jar druid-${DRUID}/extensions/druid-google-extensions/
# Put the gcs connector where it can be found.
if [ ! -f gcs-connector-${CONNECTOR}.jar ]; then
  wget https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-${CONNECTOR}.jar
fi
cp gcs-connector-${CONNECTOR}.jar druid-${DRUID}/lib/
cp gcs-connector-${CONNECTOR}.jar druid-${DRUID}/extensions/druid-hdfs-storage/
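# (The copy into lib/ puts the connector on Druid's own classpath; the copy
# into the druid-hdfs-storage extension is, presumably, what lets that
# extension use gs:// paths for deep storage.)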
rm -r druid-${DRUID}/conf
rm -r druid-${DRUID}/conf-quickstart/
# Use our own config instead of the Druid default ones
cp -r config/ druid-${DRUID}/config
rm druid-${DRUID}-bin.tar.gz
tar -czf druid-${DRUID}-bin.tar.gz druid-${DRUID}
# Distribute the druid tars to all instances
#gsutil cp druid-${DRUID}-bin.tar.gz gs://atomx/druid-${DRUID}-bin.tar.gz
cp druid-${DRUID}-bin.tar.gz ~/
scp -o UserKnownHostsFile=/dev/null -o CheckHostIP=no -o StrictHostKeyChecking=no druid-${DRUID}-bin.tar.gz druid-eu-historical-1:
#...
scp -o UserKnownHostsFile=/dev/null -o CheckHostIP=no -o StrictHostKeyChecking=no druid-${DRUID}-bin.tar.gz druid-indexer-1:
#...
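# The per-host scp lines above could also be a loop; the full host list is
# elided in this gist, so the names here are only illustrative:
#   for host in druid-eu-historical-1 druid-indexer-1; do
#     scp -o UserKnownHostsFile=/dev/null -o CheckHostIP=no \
#       -o StrictHostKeyChecking=no druid-${DRUID}-bin.tar.gz "${host}:"
#   done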
# Create a Dataproc initialization action that every cluster node runs at
# boot. (The heredoc delimiter is unquoted, so ${DRUID} and ${HADOOP}
# expand now, at build time.)
mkdir -p initialization/jars/
cat > initialization/initialization.sh << stop
#!/bin/bash
mkdir -p /tmp/initialization
gsutil -m rsync -r -d gs://hadoop-eu-atomx/${DRUID}-${HADOOP}/ /tmp/initialization/
rm /usr/lib/hadoop-mapreduce/jackson-*-2.*.jar
cp /tmp/initialization/jars/* /usr/lib/hadoop-mapreduce/
exit 0
stop
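# The jar swap in the initialization action exists because Druid and the
# Dataproc image bundle different Jackson 2.x versions; presumably the
# mismatch was breaking Druid's batch-indexing MapReduce jobs, so every
# node gets Druid's Jackson jars instead. Stage them next to the script: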
rm -rf initialization/jars/*.jar
cp druid-${DRUID}/lib/jackson-*-2.*.jar initialization/jars/
gsutil -m rsync -d -r initialization/ gs://hadoop-eu-atomx/${DRUID}-${HADOOP}/
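# On each instance the tar is presumably unpacked and Druid started from the
# custom config/ tree packed above; roughly (paths and the service name are
# illustrative):
#   tar -xzf druid-0.11.0-bin.tar.gz
#   cd druid-0.11.0
#   java -cp "config/_common:config/historical:lib/*" io.druid.cli.Main server historical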
<?xml version="1.0" ?><?xml-stylesheet type="text/xsl" href="configuration.xsl"?><!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
--><!-- Put site-specific property overrides in this file. --><configuration>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/hadoop/tmp</value>
    <description>A base for other temporary directories.</description>
  </property>
  <property>
    <name>fs.default.name</name>
    <value>hdfs://hadoop-eu-1-m:8020</value>
    <description>
      Deprecated alias for fs.defaultFS, still read by older tools such as
      FsShell.
    </description>
  </property>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://hadoop-eu-1-m:8020</value>
    <description>
      The name of the default file system. A URI whose scheme and authority
      determine the FileSystem implementation. The URI's scheme determines
      the config property (fs.SCHEME.impl) naming the FileSystem
      implementation class. The URI's authority is used to determine the
      host, port, etc. for a filesystem.
    </description>
  </property>
  <property>
    <name>fs.gs.working.dir</name>
    <value>/</value>
    <description>
      The directory inside the default bucket that relative gs: URIs
      resolve against.
    </description>
  </property>
  <property>
    <name>fs.gs.system.bucket</name>
    <value>hadoop-eu-atomx</value>
    <description>
      GCS bucket to use as a default bucket if fs.default.name is not a
      gs: URI.
    </description>
  </property>
  <property>
    <name>fs.gs.metadata.cache.directory</name>
    <value>/tmp/hadoop_gcs_connector_metadata_cache</value>
    <description>
      Only used if fs.gs.metadata.cache.type is FILESYSTEM_BACKED, specifies
      the local path to use as the base path for storing mirrored GCS
      metadata. Must be an absolute path, must be a directory, and must be
      fully readable/writable/executable by any user running processes which
      use the GCS connector.
    </description>
  </property>
  <property>
    <name>fs.gs.impl</name>
    <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem</value>
    <description>The FileSystem for gs: (GCS) URIs.</description>
  </property>
  <property>
    <name>fs.gs.project.id</name>
    <value>magnetic-icon-88807</value>
    <description>
      Google Cloud Project ID with access to configured GCS buckets.
    </description>
  </property>
  <property>
    <name>fs.gs.metadata.cache.enable</name>
    <value>true</value>
    <description>
      If true, a DirectoryListCache will be used to supplement "list"
      requests to GCS to fill in any missing items caused by eventual
      list consistency, intercepting create/delete/copy calls to create
      cache entries. The concrete type is determined with
      fs.gs.metadata.cache.type.
    </description>
  </property>
  <property>
    <name>fs.gs.implicit.dir.infer.enable</name>
    <value>true</value>
    <description>
      If set, we create and return in-memory directory objects on the fly
      when no backing object exists, but we know there are files with the
      same prefix.
    </description>
  </property>
  <property>
    <name>fs.AbstractFileSystem.gs.impl</name>
    <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS</value>
    <description>The AbstractFileSystem for gs: (GCS) URIs.</description>
  </property>
  <property>
    <name>fs.gs.metadata.cache.type</name>
    <value>FILESYSTEM_BACKED</value>
    <description>
      Specifies which implementation of DirectoryListCache to use for
      supplementing GCS API "list" requests. Supported implementations:
      IN_MEMORY: Enforces immediate consistency within the same Java
      process. FILESYSTEM_BACKED: Enforces consistency across all
      cooperating processes pointed at the same local mirror directory,
      which may be an NFS directory for massively-distributed coordination.
    </description>
  </property>
</configuration>
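The core-site.xml above points Hadoop at the Dataproc master (hadoop-eu-1-m:8020) and registers the GCS connector. Druid's batch-indexing tasks only see these settings if the file is on Druid's classpath as well; presumably it lives in the config/ tree the build script packs into the tar, along the lines of (path illustrative):
  cp core-site.xml config/_common/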
#!/bin/bash
# The leading "echo" makes this a dry run: the gcloud command is printed
# for review rather than executed. Drop the echo to actually create the
# cluster.
echo gcloud dataproc clusters create hadoop-eu-1 \
--bucket=hadoop-eu-atomx \
--image-version=1.2 \
--initialization-actions="gs://hadoop-eu-atomx/0.11.0-2.8.2/initialization.sh" \
--initialization-action-timeout 15m \
--master-boot-disk-size-gb 10 \
--master-machine-type n1-standard-16 \
--num-master-local-ssds 0 \
--worker-machine-type n1-standard-16 \
--worker-boot-disk-size-gb 10 \
--num-worker-local-ssds 1 \
--num-workers 6 \
--num-preemptible-workers 0 \
--zone europe-west1-c
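# Note: a Dataproc cluster named hadoop-eu-1 gets a master host named
# hadoop-eu-1-m, which is exactly what fs.defaultFS points at in the
# core-site.xml above; rename the cluster and that setting must follow.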