Skip to content

Instantly share code, notes, and snippets.

@liancheng
Last active August 29, 2015 14:06
Show Gist options
  • Save liancheng/fe70a148de82e77bd2c8 to your computer and use it in GitHub Desktop.
Save liancheng/fe70a148de82e77bd2c8 to your computer and use it in GitHub Desktop.
Micro benchmark code for Spark PR #2337
package org.apache.spark.examples.sql.hive
import org.apache.spark.sql.hive.LocalHiveContext
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Prepares the Hive table used by the table-scan micro benchmark.
 *
 * Drops any existing `scan_csv` table, recreates it with six numeric columns
 * (`b`, `s`, `i`, `l`, `d`, `f`) backed by a comma-delimited `LazySimpleSerDe`,
 * and loads the local file named by the first command line argument into it.
 *
 * Usage: `HiveTableScanPrepare <path-to-local-csv-file>`
 */
object HiveTableScanPrepare extends App {
  // Fail fast with a usage hint rather than hitting an ArrayIndexOutOfBoundsException on
  // `args(0)` after the SparkContext has been started and the old table already dropped.
  if (args.isEmpty) {
    System.err.println("Usage: HiveTableScanPrepare <path-to-local-csv-file>")
    sys.exit(1)
  }

  val sparkContext = new SparkContext(
    new SparkConf()
      .setMaster("local")
      .setAppName(getClass.getSimpleName.stripSuffix("$")))

  // Always stop the SparkContext, even when one of the SQL statements below throws.
  try {
    val hiveContext = new LocalHiveContext(sparkContext)
    import hiveContext._

    sql("DROP TABLE IF EXISTS scan_csv")

    sql("""CREATE TABLE scan_csv (b TINYINT, s SMALLINT, i INT, l BIGINT, d DOUBLE, f FLOAT)
          | ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
          | WITH SERDEPROPERTIES ('field.delim'=',')
        """.stripMargin)

    // NOTE(review): the path is interpolated verbatim into the statement; a path containing a
    // double quote would break it. Acceptable for a local benchmark, but do not reuse with
    // untrusted input.
    sql(s"""LOAD DATA LOCAL INPATH "${args(0)}" INTO TABLE scan_csv""")
  } finally {
    sparkContext.stop()
  }
}
/**
 * Micro benchmark that times two queries against the cached `scan_csv` Hive table.
 *
 * The table is marked for caching, then two queries are timed in order:
 *  1. `SELECT COUNT(*)` -- reported as the "build" duration (presumably the first scan is what
 *     materializes the in-memory cache; confirm against the Spark SQL version in use).
 *  2. A projection of all six columns -- reported as the "scan" duration.
 *
 * Prints a single line `Benchmark result: <build ms>\t<scan ms>`.
 */
object HiveTableScanBenchmark extends App {
  /**
   * Evaluates `f` exactly once and returns the elapsed time in milliseconds.
   *
   * Uses `System.nanoTime()` because it is monotonic; `System.currentTimeMillis()` follows the
   * wall clock and can jump (e.g. NTP adjustments), skewing the measured duration.
   *
   * @param f the computation to time; its result is discarded
   * @return elapsed time in milliseconds
   */
  def benchmark(f: => Any): Long = {
    val begin = System.nanoTime()
    f
    (System.nanoTime() - begin) / 1000000L
  }

  val sparkContext = new SparkContext(
    new SparkConf()
      .setMaster("local")
      .setAppName(getClass.getSimpleName.stripSuffix("$")))

  // Always stop the SparkContext, even when a query fails (e.g. the table was never prepared).
  try {
    val hiveContext = new LocalHiveContext(sparkContext)
    import hiveContext._

    cacheTable("scan_csv")

    val buildDuration = benchmark(sql("SELECT COUNT(*) FROM scan_csv").count())
    val scanDuration = benchmark(sql("SELECT b ,s, i, l, d, f FROM scan_csv").count())

    println(s"Benchmark result: $buildDuration\t$scanDuration")
  } finally {
    sparkContext.stop()
  }
}
#!/usr/bin/env scala
// Writes 10,000,000 comma-separated rows to standard output, one per line.
// Each row has six fields: the row index modulo Byte.MaxValue, the row index modulo
// Short.MaxValue, and then the raw row index repeated four times.
for (row <- 0 until 10000000) {
  val byteField = row % Byte.MaxValue
  val shortField = row % Short.MaxValue
  println(s"$byteField,$shortField,$row,$row,$row,$row")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment