Skip to content

Instantly share code, notes, and snippets.

@soujiro32167
Last active November 16, 2018 18:21
Show Gist options
  • Save soujiro32167/ce092ffc03f10ca35c745711be6ed911 to your computer and use it in GitHub Desktop.
Save soujiro32167/ce092ffc03f10ca35c745711be6ed911 to your computer and use it in GitHub Desktop.
Spark read write test
package com.byond.sparkperformance
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SaveMode, SparkSession}
/** Spark job that reads a text file, replicates its rows, appends "1" to every line and
  * writes the result as text to `<file>.transformed`, overwriting any previous output.
  */
object ReadWriteTest {

  /** Input file used when the job is not started with exactly two arguments. */
  private val DefaultFile = "target/classes/sample.txt"

  /** Multiply factor paired with [[DefaultFile]]; a factor of 1 leaves the row count unchanged. */
  private val DefaultFactor = 1

  /** Entry point.
    *
    * @param args `Array(file, factor)`: the text file to read and the integer multiply factor.
    *             Any other argument count falls back to [[DefaultFile]] and [[DefaultFactor]].
    * @throws NumberFormatException if the factor argument is not a valid integer
    */
  def main(args: Array[String]): Unit = {
    // Both branches must yield a (String, Int) pair: a bare String in the fallback would
    // fail the tuple destructuring with a MatchError at runtime.
    val (file, factor) = args match {
      case Array(f, fc) => (f, fc.toInt)
      case _ => (DefaultFile, DefaultFactor)
    }

    val spark = SparkSession
      .builder
      .getOrCreate()
    import spark.implicits._

    try {
      val df = spark.read.textFile(file)
      val count = df.count()
      println(s"before muliply count: $count")

      val multiplied = multiply(df, factor).map(s => s + "1")
      println(s"after multiply count: ${multiplied.count}")

      // coalesce(1) reduces the result to one partition before it is written out.
      multiplied.coalesce(1).write.mode(SaveMode.Overwrite).text(file + ".transformed")
    } finally {
      // Release the session even when reading, counting or writing fails.
      spark.stop()
    }
  }

  /** Replicates a dataset so that it contains `times` copies of its rows.
    *
    * @param ds    dataset to replicate
    * @param times total number of copies wanted; values of 1 or less return `ds` unchanged
    * @tparam T    element type of the dataset
    * @return `ds` unioned with itself until it holds `times` copies of the original rows
    */
  def multiply[T](ds: Dataset[T], times: Int): Dataset[T] = {
    // `ds` already counts as the first copy, so only `times - 1` unions are required.
    (1 until times).foldLeft(ds)((acc, _) => acc.union(ds))
  }
}
# Submit the job whose main class is com.byond.sparkperformance.ReadWriteTest, packaged in
# /opt/spark/jars/rw.jar, in cluster deploy mode to the Spark master given by --master.
# Replace the two trailing placeholders with real values before running: the text file to
# read, then the integer multiply factor (the main method parses them in that order).
../spark-2.3.2-bin-hadoop2.7/bin/spark-submit \
--deploy-mode cluster \
--master spark://integrity-hadoop-client-spark-master:6066 \
--class com.byond.sparkperformance.ReadWriteTest \
/opt/spark/jars/rw.jar <file to read> <multiply factor>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment