Created
December 16, 2015 16:28
-
-
Save tlapusan/f898934d1f13da0ee0d2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Read movies.dat as an RDD of raw text lines. This is a spark-shell session, so a
// later `val` with the same name simply replaces the earlier one.
// Linux location of the data set:
val movies = sc.textFile("file:///home/ubuntu/Downloads/ml-1m/movies.dat")
// Windows location of the data set: C:\Users\tudor\ml-1m
val movies = sc.textFile("C://Users//tudor//ml-1m//movies.dat")
// Peek at the first raw lines.
movies.take(5)
// Break every line into its "::"-delimited fields.
val separated = movies.map(line => line.split("::"))
separated.take(5)
// One movie line: numeric id, title, and all of its genres.
case class Movies(id: Int, name: String, genres: Array[String])

// Hand-built samples. The genre separator '|' is escaped because String.split
// takes a regular expression.
Movies(1, "ToyStory", "Comedy".split("\\|"))
Movies("1".toInt, "ToyStory", "Comedy".split("\\|"))

// Take the first split line: first as a one-element array, then unwrapped to its fields.
val test = separated.take(1)
val test = separated.take(1)(0)
Movies(test(0).toInt, test(1), test(2).split("\\|"))

// Turn every split line into a Movies record.
val movies = separated.map { fields =>
  Movies(fields(0).toInt, fields(1), fields(2).split("\\|"))
}
// Builds (id, name, genre) triples per movie. Because this uses `map`, each input line
// still yields a single element: an array holding all of that movie's triples.
val longMovies = separated.map { fields =>
  val genreList = fields(2).split("\\|")
  val movieId = fields(0).toInt
  val title = fields(1)
  genreList.map(genre => (movieId, title, genre))
}
// One record per (movie, genre) pair.
case class Movie(id: Int, name: String, genre: String)

// flatMap flattens the per-line arrays, so the RDD holds one Movie per genre of each movie.
val longMovies = separated.flatMap { line =>
  val genres = line(2).split("\\|")
  val id = line(0).toInt
  val name = line(1)
  genres.map(genre => Movie(id, name, genre))
}

// Count the records whose genre is "Comedy". Movie is a case class, not a tuple, so the
// genre is read through its named field instead of a positional `_3` accessor.
longMovies.filter(movie => movie.genre == "Comedy").count()
// Raw text lines of ratings.dat. The first `ratings` binding is replaced further down
// by the parsed RDD (spark-shell allows redefining a `val`).
val ratings = sc.textFile("file:///home/ubuntu/Downloads/ml-1m/ratings.dat")
val ratingsData = sc.textFile("file:///home/ubuntu/Downloads/ml-1m/ratings.dat")

// Line layout: UserID::MovieID::Rating::Timestamp
case class Rating(userID: Long, movieID: Long, rating: Int, timestamp: Long)

// Parse every "::"-delimited line into a Rating, fields taken in the layout order above.
val ratings = ratingsData.map { line =>
  val arr = line.split("::")
  val userId = arr(0).toLong
  val movieId = arr(1).toLong
  val rating = arr(2).toInt
  val timestamp = arr(3).toLong
  Rating(userId, movieId, rating, timestamp)
}
// Line layout: UserID::Gender::Age::Occupation::Zip-code
val usersData = sc.textFile("file:///home/ubuntu/Downloads/ml-1m/users.dat")

// Field names (including the capitalised Occupation / Age) are kept as-is because they
// become the DataFrame column names that later SQL queries refer to.
case class User(userId: Int, gender: String, Occupation: Int, Age: Int, zipCode: String)

// Parse every "::"-delimited line into a User.
val users = usersData.map { line =>
  val arr = line.split("::")
  val userId = arr(0).toInt
  val gender = arr(1)
  // Per the layout above, Age is the third field and Occupation the fourth.
  val age = arr(2).toInt
  val occupation = arr(3).toInt
  val zipCode = arr(4)
  // Constructor order is (userId, gender, Occupation, Age, zipCode).
  User(userId, gender, occupation, age, zipCode)
}
// Maps an age code to the label of the age bracket it stands for.
val ages = Map(
  1 -> "Under 18",
  18 -> "18-24",
  25 -> "25-34",
  35 -> "35-44",
  45 -> "45-49",
  50 -> "50-55",
  56 -> "56+"
)
// Look up a code, falling back to "None" when the code is not in the map.
ages.getOrElse(1, "None")
// SQLContext built on top of the shell's SparkContext.
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
// Brings the implicit RDD-to-DataFrame conversion (`toDF`) into scope.
import sqlContext.implicits._

// Convert the parsed RDDs into DataFrames.
val usersDF = users.toDF()
val ratingsDF = ratings.toDF()
val moviesDF = longMovies.toDF()

ratingsDF.printSchema()
// Number of ratings strictly below 5.
ratingsDF.filter(ratingsDF.col("rating") < 5).count()

// Register the DataFrames under table names so they can be queried with SQL.
ratingsDF.registerTempTable("ratings")
usersDF.registerTempTable("users")
sqlContext.sql("select * from users limit 1").show()

// Join each rating with the user who gave it.
val joined = sqlContext.sql(
  "select gender,Occupation,rating from ratings t1 inner join users t2 on t1.userID = t2.userId limit 10 "
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment