Skip to content

Instantly share code, notes, and snippets.

@fsarradin
Created April 2, 2021 10:21
Show Gist options
  • Save fsarradin/196a532e6377b19c7663746e649d95e1 to your computer and use it in GitHub Desktop.
Save fsarradin/196a532e6377b19c7663746e649d95e1 to your computer and use it in GitHub Desktop.
Text exploration with Spark Core
package io.univalence.aboutfp
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
object TextMain {

  /**
   * Text exploration with Spark Core: tokenizes a text file, keeps "long"
   * words (>= 10 characters, lower-cased), counts occurrences, and prints
   * the 100 most frequent words with their counts.
   *
   * @param args optional overrides, kept backward-compatible with the
   *             original hard-coded script:
   *             args(0) = input file path, args(1) = min partition count.
   */
  def main(args: Array[String]): Unit = {
    // Defaults preserved so running with no arguments behaves as before.
    val inputPath     = args.lift(0).getOrElse("/Users/fsarradin/src/aboutfp/data/meaulnes.txt")
    val minPartitions = args.lift(1).map(_.toInt).getOrElse(4)

    val spark = SparkSession.builder()
      .master("local[*]")
      .appName(getClass.getSimpleName)
      .config("spark.eventLog.enabled", true)
      .config("spark.eventLog.dir", "/tmp/spark-events")
      .getOrCreate()

    try {
      val rdd: RDD[String] = spark.sparkContext.textFile(inputPath, minPartitions)
      println(rdd.getNumPartitions)

      val result =
        rdd
          .map(_.trim)
          .filter(_.nonEmpty)
          // Split on whitespace and common punctuation; Array flatMaps
          // directly, no intermediate List needed.
          .flatMap(_.split("[\\s,.;:?!]+"))
          .map(_.toLowerCase)
          .filter(_.length >= 10)
          // (word, 1) + reduceByKey aggregates map-side before the shuffle.
          // The original keyBy/groupByKey/mapValues(_.size) shuffled every
          // single occurrence of each word — the well-known groupByKey
          // anti-pattern for simple counting.
          .map(word => (word, 1))
          .reduceByKey(_ + _)
          .sortBy { case (_, count) => -count }

      result
        .take(100)
        .foreach(println)
    } finally {
      // Guarantee the session is stopped even if the job throws.
      spark.close()
    }
  }
}
@fsarradin
Copy link
Author

Dependency

"org.apache.spark" %% "spark-sql" % "3.1.1"

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment