Last active
August 29, 2015 14:22
-
-
Save AKB428/b0ca9e7eabd45d2eca4c to your computer and use it in GitHub Desktop.
Spark + kuromoji + D3.js で 簡単に「NHKつぶやきビッグデータ」を作る ref: http://qiita.com/AKB428/items/12118d1d28b9e067296a
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "json" | |
# Builds one first-level node of the output tree.
#
# The node is named +title+ and owns exactly one leaf, which repeats the
# title as its name and carries +value+ under the 'size' key.
#
# title - name used for both the node and its single leaf
# value - stored unchanged as the leaf's 'size'
#
# Returns a Hash with the keys 'name' and 'children'.
def build_children(title, value)
  leaf = { 'name' => title, 'size' => value }
  { 'name' => title, 'children' => [leaf] }
end
# Serialises a word => value map as a JSON tree rooted at a node named 'flare'.
#
# Every entry of +csv_data_map+ becomes one child of the root, built by
# build_children, in the map's iteration order.
#
# csv_data_map - Hash mapping a node title to its value
#
# Returns the JSON document as a String.
def build_data(csv_data_map)
  nodes = csv_data_map.map { |title, value| build_children(title, value) }
  JSON.dump({ 'name' => 'flare', 'children' => nodes })
end
# Entry point: reads the CSV file named by the first command-line argument
# (one "<word>,<value>" record per line) and prints the JSON tree built by
# build_data to standard output.
abort "usage: ruby #{$PROGRAM_NAME} <ranking.csv>" if ARGV.empty?

csv_data_map = {}
# File.foreach always treats its argument as a file path (Kernel#open would run
# a command for an argument starting with "|") and closes the file when the
# iteration ends.
File.foreach(ARGV[0]) do |line|
  # Strip the line terminator so it does not become part of the value field.
  record = line.chomp.split(',')
  # A blank line holds no record; skipping it avoids emitting a nil-named node.
  next if record.empty?

  csv_data_map[record[0]] = record[1]
end
puts build_data(csv_data_map)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
魔法科高校の劣等生,魔法科高校の劣等生,魔法科高校の劣等生,カスタム名詞 | |
セーラームーン,セーラームーン,セーラームーン,カスタム名詞 | |
七つの大罪,七つの大罪,七つの大罪,カスタム名詞 | |
ハイキュー,ハイキュー,ハイキュー,カスタム名詞 | |
ラブライブ,ラブライブ,ラブライブ,カスタム名詞 | |
ばらかもん,ばらかもん,ばらかもん,カスタム名詞 | |
東京喰種,東京喰種,東京喰種,カスタム名詞 | |
月刊少女野崎くん,月刊少女野崎くん,月刊少女野崎くん,カスタム名詞 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
矢澤にこ | 11200 | |
---|---|---|
歩道 | 10183 | |
レール | 1212 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "flare", | |
"children": [ | |
{ | |
"name": "矢澤にこ", | |
"children": [ | |
{ | |
"name": "矢澤にこ", | |
"size": "11200" | |
} | |
] | |
}, | |
{ | |
"name": "10183", | |
"children": [ | |
{ | |
"name": "10183", | |
"size": "70" | |
} | |
] | |
}, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.regex.{Matcher, Pattern} | |
import org.apache.spark.{SparkConf, SparkContext} | |
import org.atilika.kuromoji.{Token, Tokenizer} | |
import java.io.PrintWriter | |
/**
 * Spark job that counts the nouns found in a text file of tweets and reports the most
 * frequent ones.
 *
 * Created by AKB428
 *
 * Command-line arguments:
 *  1. path of the input text file, one tweet per line (required)
 *  2. path of the kuromoji user dictionary (optional, default `./dictionary/blank.txt`)
 *  3. number of ranking entries to output (optional, default 10)
 *
 * The ranking is printed to standard output as `word count` lines and also written to
 * `data.csv` in the working directory as `word,count` lines.
 */
object inazumaTwitter {

  /** Finds at least one hiragana character (U+3040 to U+309F). */
  private val hiraganaPattern: Pattern = Pattern.compile("[\\u3040-\\u309F]+")

  /** Matches a surface form made up only of ASCII letters or only of ASCII digits. */
  private val alphaNumericPattern: Pattern = Pattern.compile("^[a-zA-Z]+$|^[0-9]+$")

  /** Matches a URL: the scheme followed by the run of printable ASCII characters after it. */
  private val urlRegex: String = "https?://[\\x21-\\x7E]+"

  /** Matches the fullwidth letter "w" (U+FF57). */
  private val fullWidthWRegex: String = "\\uff57"

  def main(args: Array[String]): Unit = {
    if (args.isEmpty) {
      System.err.println("usage: inazumaTwitter <inputPath> [userDictionaryPath] [rankingSize]")
      sys.exit(1)
    }

    val dictFilePath = if (args.length >= 2) args(1) else "./dictionary/blank.txt"
    // ">= 3" so that the ranking size is still honoured when extra arguments are passed.
    val printRankingNum = if (args.length >= 3) args(2).toInt else 10

    val conf = new SparkConf().setAppName("Inazuma Application")
    conf.setMaster("local[*]")
    val sc = new SparkContext(conf)
    try {
      val input = sc.textFile(args(0)) // hdfs://

      // Morphological analysis with kuromoji: one output element per extracted noun.
      val words = input.flatMap(line => extractNouns(line, dictFilePath))

      // Count each word and order the result by descending count.
      val ranking = words
        .map(word => (word, 1))
        .reduceByKey(_ + _)
        .sortBy(_._2, ascending = false)

      // Fetch the top entries once and reuse them, so only one Spark job is run.
      val top = ranking.take(printRankingNum)

      top.foreach { case (word, count) => println(s"$word $count") }

      // Save the same entries as a CSV file.
      val out = new PrintWriter("data.csv")
      try {
        top.foreach { case (word, count) => out.println(s"$word,$count") }
      } finally {
        out.close()
      }
    } finally {
      sc.stop()
    }
  }

  /**
   * Extracts the nouns of one input line.
   *
   * Lines that contain no hiragana at all yield nothing and are not tokenized.
   * URLs and fullwidth "w" characters are removed before tokenizing.
   *
   * @param line         one line of the input file
   * @param dictFilePath user dictionary path passed on to [[CustomTwitterTokenizer]]
   * @return surface forms of the accepted tokens, in token order
   */
  private def extractNouns(line: String, dictFilePath: String): Seq[String] = {
    if (!hiraganaPattern.matcher(line).find()) {
      Seq.empty
    } else {
      val text = line.replaceAll(urlRegex, "").replaceAll(fullWidthWRegex, "")
      val tokens = CustomTwitterTokenizer.tokenize(text, dictFilePath)
      (0 until tokens.size())
        .map(index => tokens.get(index))
        .filter(isRankedNoun)
        .map(_.getSurfaceForm)
    }
  }

  /**
   * Decides whether a token takes part in the ranking.
   *
   * A token is accepted when its surface form has at least two characters, is not purely
   * ASCII letters or purely ASCII digits, and it is either a noun ("名詞") whose second
   * feature is "一般" or "固有名詞", or its part of speech is "カスタム名詞".
   */
  private def isRankedNoun(token: Token): Boolean = {
    val surface = token.getSurfaceForm
    if (surface.length < 2 || alphaNumericPattern.matcher(surface).find()) {
      false
    } else {
      val features = token.getAllFeaturesArray()
      val generalOrProperNoun =
        features(0) == "名詞" && (features(1) == "一般" || features(1) == "固有名詞")
      generalOrProperNoun || token.getPartOfSpeech == "カスタム名詞"
    }
  }
}
/**
 * Helper that tokenizes text with kuromoji using a user dictionary.
 */
object CustomTwitterTokenizer {

  /**
   * Tokenizes `text` with a kuromoji tokenizer in SEARCH mode.
   *
   * A new tokenizer is built from `dictPath` on every call.
   *
   * @param text     text to split into tokens
   * @param dictPath path given to the tokenizer builder as the user dictionary
   * @return the tokens returned by the tokenizer
   */
  def tokenize(text: String, dictPath: String): java.util.List[Token] = {
    val searchModeBuilder = Tokenizer.builder().mode(Tokenizer.Mode.SEARCH)
    val dictionaryBuilder = searchModeBuilder.userDictionary(dictPath)
    val tokenizer = dictionaryBuilder.build()
    tokenizer.tokenize(text)
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Create the stream and authenticate it with the credentials held by twitterModel.
TwitterStream twitterStream = new TwitterStreamFactory().getInstance();
twitterStream.setOAuthConsumer(twitterModel.getConsumerKey(), twitterModel.getConsumerSecret());
AccessToken accessToken =
        new AccessToken(twitterModel.getAccessToken(), twitterModel.getAccessToken_secret());
twitterStream.setOAuthAccessToken(accessToken);

// The MyStatusAdapter class processes Twitter's Status objects.
MyStatusAdapter statusAdapter = new MyStatusAdapter(applicationConfParser, bufferedWriter);
twitterStream.addListener(statusAdapter);

// Application.searchKeyword is split on commas into the individual track keywords.
ArrayList<String> track =
        new ArrayList<String>(Arrays.asList(Application.searchKeyword.split(",")));
String[] trackArray = track.toArray(new String[track.size()]);

// Up to 400 keywords, 5000 follows and 25 locations can be specified.
FilterQuery filterQuery = new FilterQuery(0, null, trackArray);
twitterStream.filter(filterQuery);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Cleansing filter for tweet text.
#
# The class body runs as soon as the class is defined: it opens the file named
# by the first command-line argument and writes every line to standard output
# with each occurrence of 全員, ふぁぼ, ファボ, 定期 or 相互 replaced by a
# single space.
class TwitterDataCleansing
  open(ARGV[0]) do |file|
    file.each_line do |line|
      puts line.gsub(/(全員|ふぁぼ|ファボ|定期|相互)/, ' ')
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment