Skip to content

Instantly share code, notes, and snippets.

@tlapusan
Created December 16, 2015 16:28
Show Gist options
  • Save tlapusan/f898934d1f13da0ee0d2 to your computer and use it in GitHub Desktop.
Save tlapusan/f898934d1f13da0ee0d2 to your computer and use it in GitHub Desktop.
// Load the ml-1m movies.dat file as an RDD of raw text lines.
// `sc` is not defined in this script; presumably it is the SparkContext that
// spark-shell provides.
// Linux location of the data set:
val movies = sc.textFile("file:///home/ubuntu/Downloads/ml-1m/movies.dat")
// Windows location of the data set (data directory: C:\Users\tudor\ml-1m).
// In the shell this line re-binds `movies`, so run only the variant that
// matches your machine.
val movies = sc.textFile("C://Users//tudor//ml-1m//movies.dat")
// Peek at the first five raw lines.
movies.take(5)
// Fields inside a line are delimited by "::"; split every line into its fields.
val separated = movies.map(_.split("::"))
// Peek at the first five split records.
separated.take(5)
// One movie record: numeric id, title, and every genre attached to it.
case class Movies(id: Int, name: String, genres: Array[String])
// Build two instances by hand to try the constructor out. Genres are split on
// "|", which is escaped because String.split takes a regular expression.
Movies(id = 1, name = "ToyStory", genres = "Comedy".split("\\|"))
Movies(id = "1".toInt, name = "ToyStory", genres = "Comedy".split("\\|"))
// Fetch a one-element sample of the split records...
val test = separated.take(1)
// ...then re-bind `test` to the single record inside that sample.
val test = separated.take(1)(0)
// Build a Movies value from that one record: field 0 is the id, field 1 the
// title, and field 2 the "|"-separated genres.
Movies(id = test(0).toInt, name = test(1), genres = test(2).split("\\|"))
// Apply the same conversion to every record of the RDD.
val movies = separated.map { fields =>
  val id = fields(0).toInt
  val name = fields(1)
  val genres = fields(2).split("\\|")
  Movies(id, name, genres)
}
// Expand each record into (id, name, genre) tuples, one per genre.
// Because this uses `map`, every input record yields a whole Array of tuples,
// so the result is an RDD of arrays rather than an RDD of individual tuples.
val longMovies = separated.map { fields =>
  val genres = fields(2).split("\\|")
  val id = fields(0).toInt
  val name = fields(1)
  genres.map(genre => (id, name, genre))
}
// A single (movie, genre) pairing: a movie with several genres is represented
// by several Movie values.
case class Movie(id: Int, name: String, genre: String)
// `flatMap` flattens the per-record arrays, giving one Movie element per
// (movie, genre) combination.
val longMovies = separated.flatMap { fields =>
  val genres = fields(2).split("\\|")
  val id = fields(0).toInt
  val name = fields(1)
  genres.map(genre => Movie(id, name, genre))
}
// Count the (movie, genre) rows whose genre is "Comedy".
// `longMovies` holds Movie instances here, and a case class exposes its fields
// by name; tuple accessors such as `_3` are not members of Movie.
longMovies.filter(item => item.genre == "Comedy").count()
// Load the ml-1m ratings.dat file as raw text lines. The same file is bound
// twice: once as `ratings` and once as `ratingsData`.
val ratings = sc.textFile("file:///home/ubuntu/Downloads/ml-1m/ratings.dat")
val ratingsData = sc.textFile("file:///home/ubuntu/Downloads/ml-1m/ratings.dat")
// Line format of ratings.dat: UserID::MovieID::Rating::Timestamp
// One rating event: who rated, which movie, the score, and when.
case class Rating(userID: Long, movieID: Long, rating: Int, timestamp: Long)
// Parse every "::"-delimited line of `ratingsData` into a Rating.
// Field positions: 0 = user id, 1 = movie id, 2 = rating, 3 = timestamp.
val ratings = ratingsData.map { record =>
  val fields = record.split("::")
  Rating(
    userID = fields(0).toLong,
    movieID = fields(1).toLong,
    rating = fields(2).toInt,
    timestamp = fields(3).toLong
  )
}
// Line format of users.dat: UserID::Gender::Age::Occupation::Zip-code
val usersData = sc.textFile("file:///home/ubuntu/Downloads/ml-1m/users.dat")
// One user record. Note the field order of the case class (Occupation before
// Age) differs from the column order of the file (Age before Occupation).
case class User(userId:Int,gender:String,Occupation:Int,Age:Int,zipCode:String)
// Parse every "::"-delimited line of `usersData` into a User.
val users = usersData.map(line => {
  val arr = line.split("::")
  val userId = arr(0).toInt
  val gender = arr(1)
  // Read the columns in file order: age is the third field, occupation the
  // fourth. Reading them the other way round stores each value in the wrong
  // User field.
  val age = arr(2).toInt
  val occupation = arr(3).toInt
  val zipCode = arr(4)
  // Named arguments make the file-order -> case-class-order mapping explicit.
  User(userId, gender, Occupation = occupation, Age = age, zipCode = zipCode)
})
// Lookup table from a numeric age code to a human-readable age bracket
// (presumably the codes stored in the Age column of users.dat -- confirm
// against the data set's README).
val ages = Map(
  1 -> "Under 18",
  18 -> "18-24",
  25 -> "25-34",
  35 -> "35-44",
  45 -> "45-49",
  50 -> "50-55",
  56 -> "56+"
)
// Example lookup; codes missing from the table fall back to "None".
ages.getOrElse(1, "None")
// Create the SQL context on top of the existing `sc`.
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
// These implicits supply the RDD -> DataFrame conversion (`toDF`) used below.
import sqlContext.implicits._
// Turn each case-class RDD into a DataFrame.
val moviesDF = longMovies.toDF()
val ratingsDF = ratings.toDF()
val usersDF = users.toDF()
// Show the schema of the ratings DataFrame.
ratingsDF.printSchema()
// Count the ratings whose score is below 5.
val belowFive = ratingsDF("rating") < 5
ratingsDF.filter(belowFive).count()
// Register both DataFrames under table names so they can be queried through
// sqlContext.sql.
ratingsDF.registerTempTable("ratings")
usersDF.registerTempTable("users")
// Display a single row of the users table.
val oneUserQuery = "select * from users limit 1"
sqlContext.sql(oneUserQuery).show()
// Join ratings with users on the user id, keeping gender, occupation and
// rating for ten rows.
val joinQuery = "select gender,Occupation,rating from ratings t1 inner join users t2 on t1.userID = t2.userId limit 10 "
val joined = sqlContext.sql(joinQuery)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment