Skip to content

Instantly share code, notes, and snippets.

@ruloweb
Last active July 22, 2019 13:03
Show Gist options
  • Save ruloweb/b95c663675cf0c52755edba808bfc5c6 to your computer and use it in GitHub Desktop.
Scala test
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, regexp_replace, substring_index, udf}
/** Evaluates `block` once, printing the elapsed wall-clock time in milliseconds.
  *
  * Uses `System.nanoTime` instead of `currentTimeMillis`: nanoTime is
  * monotonic and unaffected by system-clock adjustments, which makes it
  * the correct primitive for measuring elapsed intervals.
  *
  * @param block the expression to time (by-name, evaluated exactly once)
  * @return the value produced by `block`
  */
def time[R](block: => R): R = {
  val t0 = System.nanoTime()
  val result = block // forces evaluation of the by-name argument
  val t1 = System.nanoTime()
  println("Elapsed time: " + ((t1 - t0) / 1000000) + "ms")
  result
}
// Local single-JVM SparkSession used by this benchmark script.
val spark: SparkSession = {
  val builder = SparkSession.builder()
  builder.master("local").getOrCreate()
}
import spark.implicits._
// Test DataFrame: 100k rows in one string column, alternating between a
// "www."-prefixed host and a host carrying a ":443" port suffix.
// (tabulate index i corresponds to the original 1-based x = i + 1, so the
// parity test is flipped to keep the generated rows identical.)
val df = Seq
  .tabulate(100000)(i => if (i % 2 == 1) "www.domain.com" else "domain.com:443")
  .toDF("domain_name")
// Test 1: built-in Spark SQL expressions. Strip a leading "www." and drop
// everything from the first ':' onward, in a single derived column.
// (The original chained two withColumn calls on the same column name; the
// second simply overwrote the first, so one nested expression is equivalent.)
val df1 = df.withColumn(
  "domain_name2",
  substring_index(regexp_replace(col("domain_name"), "^www\\.", ""), ":", 1)
)
// Test 2.
// Normalizes a host string: removes an optional leading "www." prefix and
// truncates at the first ':' (port separator) if present.
// e.g. "www.domain.com" -> "domain.com", "domain.com:443" -> "domain.com"
val func = (s: String) =>
  s.stripPrefix("www.").takeWhile(_ != ':')
// Test 2: wrap the plain Scala function in a Spark UDF and apply it per row.
// UDFs are opaque to Catalyst, so this path skips expression optimization.
val udfWWW = udf(func)
val df2 = df
.withColumn("domain_name2", udfWWW(col("domain_name")))
// Run both counts under the timer to compare the two approaches.
// Test 1 (built-in expressions) previously measured ~2695ms.
time(df1.count)
// Test 2 (Scala UDF) previously measured ~834ms.
time(df2.count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment