Last active November 16, 2018 18:21
Spark read write test
package com.byond.sparkperformance
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SaveMode, SparkSession}
/**
 * Spark job that reads a text file, enlarges the data set by unioning it with
 * itself, appends "1" to every line and writes the result back out as text.
 *
 * Expected arguments: `<file to read> <multiply factor>`. When the argument list
 * does not consist of exactly two elements, the built-in defaults are used.
 * The output is written next to the input, to `<file>.transformed`, replacing any
 * previous output at that path.
 */
object ReadWriteTest {

  /** Input used when no (or malformed) command-line arguments are supplied. */
  private val DefaultFile = "target/classes/sample.txt"

  // NOTE(review): the original fallback branch only produced the file name, which
  // cannot be destructured into (String, Int). A factor of 1 is assumed here — confirm.
  private val DefaultFactor = 1

  /**
   * Entry point.
   *
   * @param args `Array(file, factor)`; `factor` must parse as an `Int`
   *             (a non-numeric value fails fast with `NumberFormatException`).
   */
  def main(args: Array[String]): Unit = {
    val (file: String, factor: Int) = args match {
      case Array(f, fc) => (f, fc.toInt)
      case _            => (DefaultFile, DefaultFactor)
    }

    // No master is set here so that the value passed to spark-submit is used.
    // NOTE(review): the session construction was missing; the app name is a placeholder — confirm.
    val spark = SparkSession
      .builder()
      .appName("Spark read write test")
      .getOrCreate()
    import spark.implicits._

    try {
      // NOTE(review): the read expression was missing. `textFile` yields a Dataset[String],
      // which is what the string-appending `map` and the `.text` writer below require — confirm.
      val df = spark.read.textFile(file)

      val count = df.count()
      println(s"before muliply count: $count")

      val multiplied = multiply(df, factor).map(s => s + "1")
      println(s"after multiply count: ${multiplied.count}")

      // coalesce(1) funnels everything into a single partition, i.e. a single output part file.
      multiplied.coalesce(1).write.mode(SaveMode.Overwrite).text(file + ".transformed")
    } finally {
      // Release the session even when the read, transform or write fails.
      spark.stop()
    }
  }

  /**
   * Unions `ds` with itself `times` times.
   *
   * The fold starts from `ds` and adds one more copy per step, so the result
   * holds `times + 1` copies of the original rows; for `times <= 0` the range is
   * empty and `ds` is returned unchanged.
   *
   * @param ds    data set to replicate
   * @param times number of additional copies to union onto `ds`
   * @tparam T    element type of the data set
   * @return the enlarged data set
   */
  def multiply[T](ds: Dataset[T], times: Int): Dataset[T] =
    (1 to times).foldLeft(ds)((acc, _) => acc.union(ds))
}
# Usage template: submits com.byond.sparkperformance.ReadWriteTest from rw.jar
# to the given Spark master in cluster deploy mode.
# Replace <file to read> and <multiply factor> with real values before running;
# left as-is, the shell would treat the angle brackets as redirections.
../spark-2.3.2-bin-hadoop2.7/bin/spark-submit \
--deploy-mode cluster \
--master spark://integrity-hadoop-client-spark-master:6066 \
--class com.byond.sparkperformance.ReadWriteTest \
/opt/spark/jars/rw.jar <file to read> <multiply factor>
