Develop a Spark standalone application (using IntelliJ IDEA) with Spark MLlib and LogisticRegression to classify emails.
Think about the command-line interface and what parameters you'd like to accept for the various use cases.
TIP: Use scopt for command-line argument parsing.
libraryDependencies += "org.apache.spark" %% "spark-mllib" % "2.1.1"
Find the record with the maximum value (in a column) per category.
// Sample data: (id, token, level, category) — two rows in category 0, one in category 1.
val rows = Seq(
  (0, "hello", 100, 0),
  (1, "world", 200, 0),
  (2, "witaj swiecie", 150, 1)
)
val records = rows.toDF("id", "token", "level", "category")
scala> records.show
+---+-------------+-----+--------+
| id| token|level|category|
+---+-------------+-----+--------+
| 0| hello| 100| 0|
| 1| world| 200| 0|
| 2|witaj swiecie| 150| 1|
+---+-------------+-----+--------+
// Compute the per-category maximum, then join back to `records` and keep only
// the rows whose level equals that maximum.
val maxPerCategory = records.groupBy("category").agg(max("level") as "max_level")
val q = maxPerCategory
  .join(records, "category")
  .where($"max_level" === $"level")
// Solution 1: vararg select — pass the first original column name, splat the rest.
q.select(records.columns.head, records.columns.tail: _*).show
// Solution 2: lift every original column name to a Column and splat the whole list.
q.select(records.columns.map(col): _*).show
// Solution 3: window aggregation — computes the per-category max without a join.
scala> import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.expressions.Window
scala> val categories = Window.partitionBy("category")
categories: org.apache.spark.sql.expressions.WindowSpec = org.apache.spark.sql.expressions.WindowSpec@5e8d66d9
scala> records.select($"*", max("level") over categories as "max_level").filter($"max_level" === $"level").show
+---+-------------+-----+--------+---------+
| id| token|level|category|max_level|
+---+-------------+-----+--------+---------+
| 2|witaj swiecie| 150| 1| 150|
| 1| world| 200| 0| 200|
+---+-------------+-----+--------+---------+