Skip to content

Instantly share code, notes, and snippets.

@fernandor777
Created July 26, 2019 19:30
Show Gist options
  • Save fernandor777/5756c807fd290c84bb0b09e2e4275a28 to your computer and use it in GitHub Desktop.
Save fernandor777/5756c807fd290c84bb0b09e2e4275a28 to your computer and use it in GitHub Desktop.
spark quick start

build.sbt:

// Project coordinates.
name := "MovieSimilarities1M"

version := "1.0"

organization := "com.sundogsoftware"

// Scala version used to compile the project.
scalaVersion := "2.12.8"

// `%%` makes sbt append the Scala binary version to the artifact name,
// so the spark-core build is selected according to scalaVersion above.
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.4.3"

RatingsCounter.scala:

package com.sundogsoftware.spark

import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.log4j._

/** Count up how many of each star rating exists in the MovieLens 100K data set.
  *
  * Reads the tab-separated ratings file, tallies the occurrences of each
  * rating value, and prints the `(rating, count)` pairs sorted by rating.
  */
object RatingsCounter {

  /** Our main function where the action happens.
    *
    * @param args command-line arguments (not used)
    */
  def main(args: Array[String]): Unit = {

    // Set the log level to only print errors
    Logger.getLogger("org").setLevel(Level.ERROR)

    // Create a SparkContext using every core of the local machine, named RatingsCounter
    val sc = new SparkContext("local[*]", "RatingsCounter")

    // Run the job inside try/finally so the context is always shut down,
    // even if reading or parsing the input fails.
    try {
      // Load up each line of the ratings data into an RDD
      val lines = sc.textFile("../ml-100k/u.data")

      // Split each line by tabs and extract the third field.
      // (The file format is userID, movieID, rating, timestamp)
      val ratings = lines.map(line => line.split("\t")(2))

      // Count up how many times each value (rating) occurs; the result is a
      // local Map on the driver, not an RDD.
      val results = ratings.countByValue()

      // Sort the resulting map of (rating, count) tuples by rating
      val sortedResults = results.toSeq.sortBy(_._1)

      // Print each result on its own line.
      sortedResults.foreach(println)
    } finally {
      // Release the resources held by the SparkContext.
      sc.stop()
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment