Skip to content

Instantly share code, notes, and snippets.

@tgruben
Created August 15, 2019 17:41
Show Gist options
  • Save tgruben/dbf995c81fea41ebf57f9a53df8a323e to your computer and use it in GitHub Desktop.
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, rand, randn}
/**
 * Minimal Spark job: loads a NYC yellow-taxi trip CSV, shows the `id` column
 * alongside two generated random columns, and prints the row count, schema,
 * and partition count.
 *
 * Usage: SimpleApp [inputPath]
 * If no path is given, the original hard-coded path is used.
 */
object SimpleApp {
  def main(args: Array[String]): Unit = {
    // Quiet Spark's verbose INFO logging before the session spins up.
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)

    // Input path is overridable from the command line; default preserves the
    // original behavior.
    val logFile =
      if (args.nonEmpty) args(0)
      else "/mnt/disks/data1/yellow_tripdata_2015-01.csv"

    val spark = SparkSession.builder.appName("Simple Application").getOrCreate()
    try {
      val df = spark.read.format("csv").option("header", "true").load(logFile)

      // Dataset.select has no overload mixing String and Column arguments, so
      // wrap "id" in col(...) to make every argument a Column. rand/randn are
      // seeded for reproducible output across runs.
      df.select(
        col("id"),
        rand(seed = 10).alias("uniform"),
        randn(seed = 27).alias("normal")
      ).show()

      val rowCount = df.count()
      df.printSchema()
      println("============================================")
      println(s"Lines: $rowCount")
      // println (not print) so the closing separator starts on its own line.
      println(s"partitions: ${df.rdd.getNumPartitions}")
      println("============================================")
    } finally {
      // Always release the session, even if the job above throws.
      spark.stop()
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment