dimastatz/benchmark.scala

## benchmark.scala
// Spark micro-benchmark script. Every stage ends in an action so that it runs
// as its own job and can be timed separately.
// Expects `spark`, `compressedFilesPath`, `numOfPartitions` and `targetPath`
// to be provided by the surrounding session.

// Stage 1: load the space-delimited input, skipping lines that start with '#',
// redistribute the rows over `numOfPartitions` partitions and mark the result
// for in-memory caching.
val df = spark
  .read
  .option("delimiter", " ")
  .option("comment", "#")
  .csv(compressedFilesPath)
  .repartition(numPartitions = numOfPartitions)
  .persist(StorageLevel.MEMORY_ONLY)

// read & unzip
// persist() is lazy; this action performs the actual read and fills the cache
// that the following stages start from.
df.count()

// Stage 2: hash column _c2 with SHA-256.
// A bare count() never reads the derived column, so the optimizer prunes the
// projection and the hash would not be computed at all. Counting the non-null
// values of the new column forces it to be evaluated for every row.
val hashedDf = df.withColumn("hashed", sha2(col("_c2"), 256))
hashedDf.agg("hashed" -> "count").collect()

// Stage 3: convert the unix timestamp in _c0 to a formatted time string.
// Aggregating over "ts" keeps the conversion from being pruned (see stage 2).
val unixTimeDf = df.withColumn("ts", from_unixtime(col("_c0")))
unixTimeDf.agg("ts" -> "count").collect()

// Stage 4: run a regex over the long column _c9; group index 0 returns the
// whole "<digits>-<digits>" match. The result column is named "ts" as well.
// Aggregating over it keeps the regex from being pruned (see stage 2).
val regexDf = df.withColumn("ts", regexp_extract(col("_c9"), "(\\d+)-(\\d+)", 0))
regexDf.agg("ts" -> "count").collect()

// Stage 5: write the loaded frame as parquet, replacing any previous output
// at `targetPath`.
df.write
  .mode(SaveMode.Overwrite)
  .parquet(targetPath)
	// Spark micro-benchmark script. Every stage ends in an action so that it runs
	// as its own job and can be timed separately.
	// Expects `spark`, `compressedFilesPath`, `numOfPartitions` and `targetPath`
	// to be provided by the surrounding session.

	// Stage 1: load the space-delimited input, skipping lines that start with '#',
	// redistribute the rows over `numOfPartitions` partitions and mark the result
	// for in-memory caching.
	val df = spark
		.read
		.option("delimiter", " ")
		.option("comment", "#")
		.csv(compressedFilesPath)
		.repartition(numPartitions = numOfPartitions)
		.persist(StorageLevel.MEMORY_ONLY)

	// read & unzip
	// persist() is lazy; this action performs the actual read and fills the cache
	// that the following stages start from.
	df.count()

	// Stage 2: hash column _c2 with SHA-256.
	// A bare count() never reads the derived column, so the optimizer prunes the
	// projection and the hash would not be computed at all. Counting the non-null
	// values of the new column forces it to be evaluated for every row.
	val hashedDf = df.withColumn("hashed", sha2(col("_c2"), 256))
	hashedDf.agg("hashed" -> "count").collect()

	// Stage 3: convert the unix timestamp in _c0 to a formatted time string.
	// Aggregating over "ts" keeps the conversion from being pruned (see stage 2).
	val unixTimeDf = df.withColumn("ts", from_unixtime(col("_c0")))
	unixTimeDf.agg("ts" -> "count").collect()

	// Stage 4: run a regex over the long column _c9; group index 0 returns the
	// whole "<digits>-<digits>" match. The result column is named "ts" as well.
	// Aggregating over it keeps the regex from being pruned (see stage 2).
	val regexDf = df.withColumn("ts", regexp_extract(col("_c9"), "(\\d+)-(\\d+)", 0))
	regexDf.agg("ts" -> "count").collect()

	// Stage 5: write the loaded frame as parquet, replacing any previous output
	// at `targetPath`.
	df.write
		.mode(SaveMode.Overwrite)
		.parquet(targetPath)