Skip to content

Instantly share code, notes, and snippets.

@jongwook
Created June 24, 2016 20:31
Show Gist options
  • Save jongwook/79bfd00f34f5f525ac9e40ab5b2a3808 to your computer and use it in GitHub Desktop.
Scripts to build Spark according to the CDH version
#!/usr/bin/env bash
#
# Jenkins build script: builds Apache Spark against a specific CDH release.
# Build parameters (normally injected by Jenkins):
TAG=v2.0.0-rc1     # Spark tag to build; empty means "tip of $BRANCH"
SNAPSHOT=false     # forced to true below when building a branch head
BRANCH=branch-2.0  # branch used when no TAG is given
CDH=5.7.1          # CDH release whose dependency versions we build against
DEPLOY=false       # when true, also mvn install/deploy the artifacts

# Abort on any failed command, unset variable, or failed pipeline stage.
set -euo pipefail

# The jenkins script below: work in a throwaway workspace.
WORKSPACE=/tmp/spark-build
rm -rf -- "$WORKSPACE"
mkdir -p -- "$WORKSPACE"
cd "$WORKSPACE" || exit 1

# Prepend JAVA_HOME/bin only when JAVA_HOME is set (avoids a stray "/bin"
# entry, and avoids tripping set -u when it is undefined).
export PATH=${JAVA_HOME:+$JAVA_HOME/bin:}$PATH

# proxy settings for kakao-internal hosts
# NOTE(review): hostname lookup is best-effort so a failing `hostname -f`
# does not kill the build under set -e.
host=$(hostname -f 2>/dev/null || true)
if [[ "$host" == *"iwilab"* || "$host" == *"akao"* ]]; then
  export http_proxy=http://proxy.iwilab.com:8082
  export https_proxy=http://proxy.iwilab.com:8082
  export no_proxy="localhost,127.0.0.1,.iwilab.com"
fi
# (Re-)initialise a git checkout of apache/spark in the current directory.
git init .
git config remote.origin.url https://github.com/apache/spark
# Quote the refspec so the shell never glob-expands the '*' characters.
git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*"
git fetch --tags
# Clear the working tree; ./* protects against filenames starting with '-'.
rm -rf ./*

if [[ -z "$TAG" ]]; then
  # No explicit tag: build the tip of $BRANCH and mark it as a snapshot.
  git checkout "$BRANCH"
  git reset --hard
  git pull
  TAG=$(git describe --tags)
  echo "TAG=$TAG"
  SNAPSHOT=true
else
  # Explicit tag requested: hard-reset to it, then normalise the tag name.
  git reset "$TAG" --hard
  TAG=$(git describe --tags)
fi
# Derive the Maven version string,
# e.g. TAG=v2.0.0-rc1 + CDH=5.7.1 -> 2.0.0-rc1-cdh5.7.1[-SNAPSHOT]
VERSION=${TAG/v/}-cdh$CDH
echo "SNAPSHOT=$SNAPSHOT"
if [[ "$SNAPSHOT" == "true" ]]; then
  VERSION=$VERSION-SNAPSHOT
fi
echo "VERSION=$VERSION"

# Collect every pom.xml one or two directory levels deep, plus the root pom.
# $subdirs is intentionally unquoted below: it is a space-separated list.
subdirs=$(echo */)
poms=$(for dir in $subdirs; do
  ls "${dir}"pom.xml 2> /dev/null || true
  ls "${dir}"*/pom.xml 2> /dev/null || true
done)
poms="pom.xml $poms"
# Write patch.py, a helper that rewrites one pom.xml in place so the build
# uses the CDH parent pom / dependency versions and the internal Nexus and
# Cloudera repositories.  The script body is a single-quoted shell string,
# so it is emitted literally EXCEPT at the `version.text = "'$CDH'"` line,
# where the quoting is briefly closed so the shell interpolates $CDH into
# the generated Python source.
# NOTE(review): the Python code below appears to have lost its indentation
# (likely mangled in copy/paste) -- as written it is not valid Python, and
# the original indentation must be restored before running.  TODO confirm
# against the original gist.
echo '#! /usr/bin/env python
# -*- coding: utf-8 -*-
import sys
from lxml import etree
nexus = """
<distributionManagement>
<repository>
<id>dk-aa-release</id>
<name>DaumKakao AA Releases</name>
<url>http://maven.daumcorp.com/content/repositories/dk-aa-release/</url>
</repository>
<snapshotRepository>
<id>dk-aa-snapshots</id>
<name>Daumkakao AA Snapshots</name>
<url>http://maven.daumcorp.com/content/repositories/dk-aa-snapshots/</url>
</snapshotRepository>
</distributionManagement>
"""
cloudera = """
<repository>
<id>cloudera</id>
<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
</repository>
"""
filename = sys.argv[1]
tag = sys.argv[2]
with open(filename) as input:
pom = etree.XML(input.read().encode("utf-8"))
def childtags(xml, tag):
return [el for el in xml.iterchildren() if type(el.tag) is str and el.tag.split("}")[-1] == tag]
def childtag(xml, tag):
try:
return childtags(xml, tag)[0]
except:
return None
# update package versions
version = childtag(pom, "version")
if version is not None:
version.text = tag
# update the parent pom version
parent = childtag(pom, "parent")
if parent is not None:
group = childtag(parent, "groupId")
artifact = childtag(parent, "artifactId")
version = childtag(parent, "version")
if group.text == "org.apache" and artifact.text == "apache":
group.text = "com.cloudera.cdh"
artifact.text = "cdh-root"
version.text = "'$CDH'"
elif group.text == "org.apache.spark" and artifact.text == "spark-parent_2.11":
version.text = tag
if artifact.text == "cdh-root":
for elem in childtags(pom, "distributionManagement"):
pom.remove(elem)
pom.append(etree.XML(nexus))
repos = childtag(pom, "repositories")
repos.append(etree.XML(cloudera))
# remove test-only dependencies
dependencies = childtag(pom, "dependencies")
for dependency in childtags(dependencies, "dependency"):
scope = childtag(dependency, "scope")
if scope is not None and scope.text == "test":
dependencies.remove(dependency)
group = childtag(dependency, "groupId")
artifact = childtag(dependency, "artifactId")
if artifact.text == "hive-beeline":
dependencies.append(etree.XML("""
<dependency>
<groupId>jline</groupId>
<artifactId>jline</artifactId>
<version>2.12</version>
</dependency>
"""))
# remove shadeTestJar configuration
build = childtag(pom, "build")
if build is not None:
plugins = childtag(build, "plugins")
if plugins is not None:
for plugin in childtags(plugins, "plugin"):
configuration = childtag(plugin, "configuration")
if configuration is not None:
shade = childtag(configuration, "shadeTestJar")
if shade is not None:
configuration.remove(shade)
# edit properties to follow CDH versions
properties = childtag(pom, "properties")
if properties is not None:
values = {
"slf4j.version": "${cdh.slf4j.version}",
"hadoop.version": "${cdh.hadoop.version}",
"protobuf.version": "${cdh.protobuf.version}",
"hbase.version": "${cdh.hbase.version}",
"hbase.artifact": "hbase-server",
"flume.version": "${cdh.flume-ng.version}",
"zookeeper.version": "${cdh.zookeeper.version}",
"hive.group": "org.spark-project.hive",
"hive.version": "1.2.1.spark2",
"hive.version.short": "1.2.1",
"jline.version": "${scala.version}",
"derby.version": "10.11.1.1",
"parquet.version": "1.8.1",
"avro.version": "${cdh.avro.version}",
"avro.mapred.classifier": "hadoop2",
"jets3t.version": "${cdh.jets3t.version}",
"codehaus.jackson.version": "${cdh.jackson.version}",
"fasterxml.jackson.version": "2.6.5",
"snappy.version": "${cdh.hadoop-snappy.version}"
}
for name, value in values.items():
property = childtag(properties, name)
if property is not None:
property.text = value
# fix datanucleus versions
if filename == "pom.xml":
management = childtag(pom, "dependencyManagement")
dependencies = childtag(management, "dependencies")
dependencies.append(etree.XML("""
<dependency>
<groupId>org.datanucleus</groupId>
<artifactId>datanucleus-rdbms</artifactId>
<version>3.2.9</version>
</dependency>
"""))
dependencies.append(etree.XML("""
<dependency>
<groupId>org.datanucleus</groupId>
<artifactId>datanucleus-api-jdo</artifactId>
<version>3.2.6</version>
</dependency>
"""))
# remove scalastyle
build = childtag(pom, "build")
if build is not None:
plugins = childtag(build, "plugins")
if plugins is not None:
for plugin in childtags(plugins, "plugin"):
artifact = childtag(plugin, "artifactId")
if artifact is not None and artifact.text == "scalastyle-maven-plugin":
plugins.remove(plugin)
with open(filename, "w") as output:
output.write(etree.tostring(pom).decode("utf-8"))
output.write("\n")
' > patch.py
# Make the generated helper directly executable for the loop below.
chmod +x patch.py
# Rewrite every discovered pom.xml to the CDH-based version.
# $poms is intentionally unquoted: it is a whitespace-separated list of paths.
for pom in $poms; do
  echo "Patching $pom..."
  ./patch.py "$pom" "$VERSION"
done
# Prefer GNU sed when available (e.g. on macOS, where the stock BSD sed has
# incompatible -i semantics); `command -v` is the portable, silent check.
if command -v gsed >/dev/null 2>&1; then
  SED=gsed
else
  SED=sed
fi

# Disable Spark's app-name shortening in the UI: rewrite the declaration to
# `val shortAppName = appName //`, commenting out the rest of the line.
$SED -i "s/val shortAppName/val shortAppName = appName \/\//g" core/src/main/scala/org/apache/spark/ui/UIUtils.scala
# Default runtime configuration files, bundled into each distribution tarball
# by the repack loop at the end of this script.  Quoted heredoc delimiters
# keep the contents literal (no shell expansion).
cat > conf/spark-defaults.conf <<'EOF'

# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.
spark.master yarn
spark.eventLog.enabled true
spark.eventLog.dir hdfs:///user/spark/applicationHistory
spark.driver.memory 4g
spark.executor.memory 4g
spark.executor.instances 4
spark.executor.extraJavaOptions -XX:+PrintGCDetails
#spark.yarn.historyServer.address http://raccon015.kr2.iwilab.com:18088

EOF

# No leading blank line here: a shebang is only meaningful on the very first
# line of the file (the original emitted it on line 2).
cat > conf/spark-env.sh <<'EOF'
#!/usr/bin/env bash
# This file is sourced when running various Spark programs.
# Copy it as spark-env.sh and edit that to configure Spark for your site.
export HADOOP_CONF_DIR=/etc/hive/conf

EOF

# The original assigned log4j.rootCategory twice (INFO, then WARN at the
# bottom); only the last assignment takes effect, so declare WARN once.
cat > conf/log4j.properties <<'EOF'

# Set everything to be logged to the console
log4j.rootCategory=WARN, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
# Settings to quiet third party logs that are too verbose
log4j.logger.org.eclipse.jetty=WARN
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO

EOF
# Map the CDH release line to its bundled Hadoop version and Maven profile.
if [[ "$CDH" == 5.3.* ]]; then
  HADOOP=2.5.0-cdh$CDH
  HADOOP_PROFILE=hadoop-2.4
else
  HADOOP=2.6.0-cdh$CDH
  HADOOP_PROFILE=hadoop-2.6
fi

# MaxPermSize is ignored on Java 8+ but harmless; needed for Java 7 builds.
export MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m"

./dev/make-distribution.sh --name 2.11 --tgz -Psparkr -Phive -Pyarn "-Dhadoop.version=$HADOOP" "-P$HADOOP_PROFILE" -DskipTests -Dmaven.test.skip=true -Dscala-2.11

if [[ "$DEPLOY" == "true" ]]; then
  # Activate the selected profile with -P, matching the distribution build
  # above.  The original hard-coded -Phadoop-2.4 and passed the profile as
  # -D$HADOOP_PROFILE, which merely defines a system property instead of
  # activating the profile.
  build/mvn install -Phive -Pyarn "-Dhadoop.version=$HADOOP" "-P$HADOOP_PROFILE" -Dmaven.test.skip=true -Dscala-2.11
  build/mvn deploy -Phive -Pyarn "-Dhadoop.version=$HADOOP" "-P$HADOOP_PROFILE" -Dmaven.test.skip=true -Dscala-2.11
fi
# Append the default configuration files to each distribution tarball:
# .tgz -> .tar (gunzip), tar -r the conf files in, re-gzip, and publish the
# result one directory above the workspace under the original name.
for tgz in *.tgz; do
  [[ -e "$tgz" ]] || continue   # glob matched nothing: no tarballs to repack
  name=${tgz%.tgz}              # strip the suffix, e.g. spark-...-bin-2.11
  tar=$name.tar
  gunzip "$tgz"                 # produces $name.tar
  mkdir -p "$name/conf"
  cp conf/spark-defaults.conf conf/spark-env.sh "$name/conf/"
  tar -rf "$tar" "$name/conf/spark-defaults.conf" "$name/conf/spark-env.sh"
  gzip "$tar"
  mv "$tar.gz" "../$tgz"
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment