Created
September 21, 2016 16:01
-
-
Save tae-jun/138f595228aa83e89387b5d39d33b315 to your computer and use it in GitHub Desktop.
제플린 걸음마 서울시립대학교 데이터마이닝 활용사례 제플린 노트북 통계 추출 코드
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%sql
-- Paragraph count per language; the ROLLUP grand-total row (NULL lang) is labelled 'Total'.
SELECT COALESCE(lang, 'Total') AS language, COUNT(*) AS cnt
FROM lang
GROUP BY lang WITH ROLLUP
ORDER BY cnt DESC
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.spark.sql.functions._ | |
// Change ZEPPELIN_NOTEBOOK_PATH!
// wholeTextFiles yields (path, content) pairs; only the JSON content of each note is kept.
val jsonRdd = sc.wholeTextFiles("<ZEPPELIN_NOTEBOOK_PATH>/*/note.json").map(_._2)
// For Spark 1.X
// val notes = sqlContext.read.json(jsonRdd)
val notes = spark.read.json(jsonRdd)
// Filter tutorial notes.
// The note title is looked up by column name instead of by position: the position of a
// field in the inferred schema shifts as soon as any note carries an extra top-level
// field, and a positional getString would then read the wrong column.
// NOTE(review): assumes the title lives in the top-level "name" field of note.json
// (the column the previous positional lookup, index 4, resolved to) - confirm.
// Notes without a name are kept rather than failing on a null value.
val myNotes = notes.filter(col("name").isNull || !col("name").contains("Tutorial"))
println(s"#Notes = ${myNotes.count}")
// Build the paragraph DataFrame: one row per element of each note's `paragraphs`
// array (EXPLODE names the resulting column `col`), keeping only rows that have text.
val paragraphs = {
  val exploded = myNotes.selectExpr("EXPLODE(paragraphs)")
  exploded.where("col.text IS NOT NULL")
}
println(s"#Paragraphs = ${paragraphs.count()}")
// UDF which extracts the language (interpreter directive) from a paragraph's text.
//   - null or whitespace-only text                      => "empty"
//   - text beginning with "%", an optional whitespace
//     character and word characters                     => that directive, whitespace removed
//                                                          (e.g. "% sql" => "%sql")
//   - text beginning with "%" but no directive name     => "unknown"
//   - anything else                                     => "%spark"
val getLang = udf { s: String =>
  // Option(...) guards against a null text. Trimming makes the "%" check consistent
  // with the emptiness check, so a directive preceded by blanks is still recognised.
  // NOTE(review): presumably Zeppelin itself ignores leading whitespace - confirm.
  val text = Option(s).fold("")(_.trim)
  if (text.isEmpty) "empty"
  else if (text.startsWith("%")) {
    // findPrefixOf only matches at the start of the text, so a "%word" occurring later
    // in the paragraph body is never reported as the directive.
    // The regex is built inside the closure so the UDF references nothing outside itself.
    "%\\s?[\\w]+".r.findPrefixOf(text)
      .map(_.replaceAll("\\s", "")) // drop whatever whitespace character `\s?` matched
      .getOrElse("unknown")
  } else "%spark"
}
// For Spark 1.X, register the table this way instead:
// paragraphs.withColumn("lang", getLang($"col.text"))
//   .registerTempTable("lang")
// Add the extracted language as a `lang` column and expose the result as the
// temporary view "lang".
val paragraphsWithLang = paragraphs.withColumn("lang", getLang($"col.text"))
paragraphsWithLang.createOrReplaceTempView("lang")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment