#!/bin/bash
echo "please ensure that this script is running on a spark-master node"
sudo apt update
sudo apt install -y openjdk-11-jdk
sudo apt install -y scala
echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list
sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823
sudo apt-get update
sudo apt-get install -y sbt
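# Optional sanity check before building: Spark 2.4.x prebuilt binaries target
# Scala 2.12, so scalac should report 2.12.x here.
java -version
scalac -version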
mkdir -p apps
cd apps
mkdir -p count
cd count
cat << EOF > build.sbt
name := "Counter"
version := "1.0"
scalaVersion := "2.12.8"
libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.1"
EOF
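# A note on the build definition above: sbt's %% operator appends the Scala
# binary version, so the dependency resolves to spark-sql_2.12:2.4.1. The
# Spark version declared here should match the one running on the cluster.
# Optional: resolve the dependency now to fail fast on typos (needs network).
# sbt update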
mkdir -p src/main/scala
cat << 'EOF' > src/main/scala/count.scala
import org.apache.spark.sql.SparkSession
object Counter {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("Counter")
      .config("spark.speculation", "false")
      .config("spark.hadoop.fs.s3a.path.style.access", "true")
      .config("spark.hadoop.fs.s3a.connection.maximum", 500)
      .getOrCreate()
    // S3A options can also be set after the session is built. MinIO needs
    // path-style access because buckets are not exposed as DNS subdomains.
    spark.sparkContext.hadoopConfiguration.set("fs.s3a.endpoint", "http://minio:9000")
    spark.sparkContext.hadoopConfiguration.set("fs.s3a.access.key", "minio")
    spark.sparkContext.hadoopConfiguration.set("fs.s3a.secret.key", "minio123")
    spark.sparkContext.hadoopConfiguration.set("fs.s3a.path.style.access", "true")
    // textFile returns a Dataset[String] with one element per line of the CSV
    val textFile = spark.read.textFile("s3a://data-set/nyc_taxi_data.csv")
    val counts = textFile.count()
    println(s"Total taxi rides: $counts")
    spark.stop()
  }
}
EOF
echo "please change the address of minio in src/main/scala/count.scala to point to the correct instance of minio in your datacenter and recompile using 'sbt package'"
# Compile the program
sbt package
export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64/
export HADOOP_HOME=${HOME}/hadoop-3.1.2
export PATH=${HADOOP_HOME}/bin:$PATH
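# This Spark build ships "without hadoop", so Spark's classpath must be filled
# in from the local Hadoop installation; `hadoop classpath` prints those jars.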
export SPARK_DIST_CLASSPATH=$(hadoop classpath)
export SPARK_HOME=${HOME}/spark-2.4.1-bin-without-hadoop
export PATH=${SPARK_HOME}/bin:$PATH
spark-submit --deploy-mode=client --master spark://$(hostname):7077 --packages org.apache.hadoop:hadoop-aws:3.1.2 target/scala-2.12/counter_2.12-1.0.jar
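# The job expects the dataset to already be present in MinIO. A sketch of
# seeding it with the MinIO client (mc); the alias name and local file path
# are assumptions, the bucket/object match the s3a:// URL used above:
# mc config host add myminio http://minio:9000 minio minio123
# mc mb myminio/data-set
# mc cp nyc_taxi_data.csv myminio/data-set/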