Skip to content

Instantly share code, notes, and snippets.

fun main(args: Array<String>) {
// Configure log4j from the properties file under the project's resources directory.
PropertyConfigurator.configure("src/main/resources/log4j.properties")
// Sets the Hadoop home directory; presumably the location of winutils on Windows — confirm.
System.setProperty("hadoop.home.dir", "c:/winutil/")
// Load the Dremio JDBC driver class before the JDBC source is read.
Class.forName("com.dremio.jdbc.Driver")

// Local Spark session that uses all available cores ("local[*]").
val sessionBuilder = SparkSession.builder()
    .appName("Dremio-Spark example")
    .master("local[*]")
val spark = sessionBuilder.getOrCreate()

// The table identifier is a raw string so its embedded double quotes need no escaping.
val logsTable = """"@admin"."logs""""
val jdbcReader = spark.read().format("jdbc")
val configuredReader = jdbcReader
    .option("url", "jdbc:dremio:direct=127.0.0.1:31010")
    .option("dbtable", logsTable)
    .option("user", "admin")
    .option("password", "admin123qwe")
val dfRaw = configuredReader.load()
/**
 * Projects raw log rows into the columns used by the reports that follow.
 *
 * Splits `request` on a single space into a temporary `_tmp` array column, then selects:
 * - `host`, unchanged;
 * - `time`: `request_time` parsed with the pattern `dd/MMM/yyyy:HH:mm:ss` and cast to timestamp;
 * - `verb` and `resource`: the first and second elements of the split `request`;
 * - `status`, cast to short;
 * - `bytes`, unchanged.
 *
 * @param df dataset providing the `host`, `request`, `request_time`, `status` and `bytes` columns
 * @return a new dataset containing only the columns listed above
 */
fun castDf(df: Dataset<Row>): Dataset<Row> =
    df.withColumn("_tmp", split(col("request"), " "))
        .select(
            col("host"),
            unix_timestamp(col("request_time"), "dd/MMM/yyyy:HH:mm:ss")
                .cast("timestamp")
                .alias("time"),
            col("_tmp").getItem(0).alias("verb"),
            col("_tmp").getItem(1).alias("resource"),
            col("status").cast("short"),
            col("bytes")
        ) // closes select(...); the original left this call unterminated
// Convert the raw rows once; every report below reads from this dataset.
val df = castDf(dfRaw)
df.printSchema()

// Summary of the bytes column.
val bytesSummary = df.describe("bytes")
bytesSummary.show()

// Row count per status, ordered by status.
val rowsPerStatus = df.groupBy("status").count()
rowsPerStatus.sort("status").show()

// Hosts with more than 700 rows, largest count first.
val rowsPerHost = df.groupBy("host").count()
val busyHosts = rowsPerHost.filter(col("count").gt(700))
busyHosts.sort(col("count").desc()).show()

// Resources among rows whose status equals 404, largest count first; show at most 10 rows.
val notFoundRows = df.filter(col("status").equalTo(404))
val notFoundPerResource = notFoundRows.groupBy("resource").count()
notFoundPerResource.sort(desc("count")).show(10)
// Count the distinct hosts. The original ran this action and discarded the result;
// print it so the value is visible like the other reports, which call show().
val distinctHostCount = df.select("host")
    .distinct()
    .count()
println("Distinct hosts: $distinctHostCount")