Skip to content

Instantly share code, notes, and snippets.

@NoahMarconi
Last active September 20, 2016 18:19
Show Gist options
  • Save NoahMarconi/536fec49e01be401c698d6f45d959450 to your computer and use it in GitHub Desktop.
Save NoahMarconi/536fec49e01be401c698d6f45d959450 to your computer and use it in GitHub Desktop.
Graphframe from JSON
import org.apache.spark.sql.functions._
// Build the vertex table: one row per distinct user profile that appears on
// either side of a follow relationship in wattDF.

// Unprefixed attribute names; these become the vertex table's column names.
val newColNames = List(
  "age",
  "gender",
  "has_atleast_three_reading_lists",
  "has_profile_picture",
  "id",
  "is_author",
  "latitude",
  "longitude",
  "platform",
  "show_age",
  "show_name",
  "user_language"
)

// Picks the `<role>_`-prefixed copy of every attribute out of wattDF and
// renames the resulting columns (positionally) to the unprefixed names.
def profileColumns(role: String) =
  wattDF
    .selectExpr(newColNames.map(attr => s"${role}_$attr"): _*)
    .toDF(newColNames: _*)

val followerDF = profileColumns("follower")
val followeeDF = profileColumns("followee")

// Stack both sides and drop rows that are identical in every column.
// Rows sharing an id but differing in any other attribute are all kept.
val nodes = followeeDF.union(followerDF).distinct()
// Build the edge table: one "follows" edge per row of wattDF, pointing from
// the follower (src) to the followee (dst).
val edgeColNames = List("src", "dst", "relationship")
val edges = wattDF
  .select(col("follower_id"), col("followee_id"), lit("follows"))
  // Positional rename: follower_id -> src, followee_id -> dst, literal -> relationship.
  .toDF(edgeColNames: _*)
// Persist both tables as Parquet to the shared location.
// Both writes overwrite any existing output so the script can be re-run.
// Previously only the nodes write did; the edges write used the default mode,
// which fails when the target path already exists.
// NOTE(review): the paths are relative (no leading "/") — confirm they resolve
// to the intended mount point in the target environment.
nodes.write
  .format("parquet")
  .mode(SaveMode.Overwrite)
  .save("mnt/hackondata/wattpadNodes.parquet")
edges.write
  .format("parquet")
  .mode(SaveMode.Overwrite)
  .save("mnt/hackondata/wattpadEdges.parquet")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment