Skip to content

Instantly share code, notes, and snippets.

@AKB428
Last active August 29, 2015 14:22
Show Gist options
  • Save AKB428/b0ca9e7eabd45d2eca4c to your computer and use it in GitHub Desktop.
Save AKB428/b0ca9e7eabd45d2eca4c to your computer and use it in GitHub Desktop.
Spark + kuromoji + D3.js で 簡単に「NHKつぶやきビッグデータ」を作る ref: http://qiita.com/AKB428/items/12118d1d28b9e067296a
require "json"
# Builds one node for the output tree: a hash named +title+ whose 'children'
# array holds a single leaf carrying the same name and +value+ as its 'size'.
def build_children(title, value)
  leaf = { 'name' => title, 'size' => value }
  { 'name' => title, 'children' => [leaf] }
end
# Turns a name => value map into a JSON string: a root node named 'flare'
# whose children are the nodes produced by build_children for each entry.
def build_data(csv_data_map)
  nodes = csv_data_map.map { |name, value| build_children(name, value) }
  JSON.dump({ 'name' => 'flare', 'children' => nodes })
end
# Reads the "name,value" CSV file named by the first command-line argument
# and prints the JSON produced by build_data on standard output.
csv_data_map = {}
# File.foreach is used instead of Kernel#open so that an argument starting
# with "|" is treated as a file name and never executed as a command.
File.foreach(ARGV[0]) do |line|
  # Drop the line terminator; otherwise it becomes part of the value string.
  record = line.chomp.split(',')
  # A blank line carries no name/value pair, so it must not create an entry.
  next if record.empty?
  csv_data_map[record[0]] = record[1]
end
puts build_data(csv_data_map)
魔法科高校の劣等生,魔法科高校の劣等生,魔法科高校の劣等生,カスタム名詞
セーラームーン,セーラームーン,セーラームーン,カスタム名詞
七つの大罪,七つの大罪,七つの大罪,カスタム名詞
ハイキュー,ハイキュー,ハイキュー,カスタム名詞
ラブライブ,ラブライブ,ラブライブ,カスタム名詞
ばらかもん,ばらかもん,ばらかもん,カスタム名詞
東京喰種,東京喰種,東京喰種,カスタム名詞
月刊少女野崎くん,月刊少女野崎くん,月刊少女野崎くん,カスタム名詞
矢澤にこ 11200
歩道 10183
レール 1212
{
"name": "flare",
"children": [
{
"name": "矢澤にこ",
"children": [
{
"name": "矢澤にこ",
"size": "11200"
}
]
},
{
"name": "10183",
"children": [
{
"name": "10183",
"size": "70"
}
]
},
import java.util.regex.{Matcher, Pattern}
import org.apache.spark.{SparkConf, SparkContext}
import org.atilika.kuromoji.{Token, Tokenizer}
import java.io.PrintWriter
/**
 * Created by AKB428
 *
 * Spark job that extracts nouns from a text file with kuromoji, counts them,
 * and reports the most frequent ones.
 *
 * Command-line arguments:
 *  - args(0): input path handed to `SparkContext.textFile` (required)
 *  - args(1): path of the kuromoji user dictionary (default `./dictionary/blank.txt`)
 *  - args(2): number of ranking entries to output (default 10)
 *
 * The top entries are printed to standard output as "word count" and written
 * to `data.csv` in the working directory as "word,count".
 */
object inazumaTwitter {

  /** Finds at least one hiragana character (code points 3040-309F). */
  private val HiraganaPattern: Pattern = Pattern.compile("[\\u3040-\\u309F]+")

  /** Matches a token that consists only of ASCII letters or only of ASCII digits. */
  private val AlphaOrDigitsPattern: Pattern = Pattern.compile("^[a-zA-Z]+$|^[0-9]+$")

  def main(args: Array[String]): Unit = {
    if (args.isEmpty) {
      System.err.println("usage: inazumaTwitter <input> [userDictionary] [rankingSize]")
      sys.exit(1)
    }
    val dictFilePath = if (args.length >= 2) args(1) else "./dictionary/blank.txt"
    // ">= 3" so the ranking size is still honoured when extra arguments are passed.
    val printRankingNum = if (args.length >= 3) args(2).toInt else 10

    val conf = new SparkConf().setAppName("Inazuma Application")
    conf.setMaster("local[*]")
    val sc = new SparkContext(conf)
    try {
      val input = sc.textFile(args(0)) // hdfs://

      // Morphological analysis with kuromoji: one output element per extracted noun.
      val words = input.flatMap(line => extractNouns(line, dictFilePath))

      // Count, sort by descending count and fetch the top entries exactly once;
      // calling take() twice would run the whole pipeline twice.
      val ranking = words
        .map(word => (word, 1))
        .reduceByKey(_ + _)
        .sortBy(_._2, ascending = false)
        .take(printRankingNum)

      ranking.foreach { case (word, count) => println(s"$word $count") }

      // Save the same ranking as CSV; close the writer even if writing fails.
      val out = new PrintWriter("data.csv")
      try {
        ranking.foreach { case (word, count) => out.println(s"$word,$count") }
      } finally {
        out.close()
      }
    } finally {
      sc.stop()
    }
  }

  /**
   * Extracts the nouns of interest from one input line.
   *
   * Lines without any hiragana yield nothing. Otherwise URLs and fullwidth "w"
   * characters are removed, the rest is tokenized, and the surface form of every
   * token is kept when it is at least two characters long, is not purely ASCII
   * letters or purely digits, and passes [[isTargetNoun]].
   */
  private def extractNouns(line: String, dictFilePath: String): Seq[String] = {
    // Checked first so that lines which will be discarded are never tokenized.
    if (!HiraganaPattern.matcher(line).find()) {
      Seq.empty
    } else {
      // Remove each URL up to the next whitespace, then the fullwidth "w" (code point FF57).
      val text = line.replaceAll("https?://\\S+", "").replaceAll("\\uff57", "")
      val tokens: java.util.List[Token] = CustomTwitterTokenizer.tokenize(text, dictFilePath)
      for {
        index <- 0 until tokens.size()
        token = tokens.get(index)
        surface = token.getSurfaceForm()
        if surface.length >= 2
        if !AlphaOrDigitsPattern.matcher(surface).find()
        if isTargetNoun(token)
      } yield surface
    }
  }

  /**
   * True when the token's first feature is "名詞" and its second feature is
   * "一般" or "固有名詞", or when its part of speech is "カスタム名詞".
   */
  private def isTargetNoun(token: Token): Boolean = {
    val features = token.getAllFeaturesArray()
    val isPlainNoun =
      features(0) == "名詞" && (features(1) == "一般" || features(1) == "固有名詞")
    isPlainNoun || token.getPartOfSpeech == "カスタム名詞"
  }
}
/** Helper that tokenizes text with a kuromoji tokenizer using a user dictionary. */
object CustomTwitterTokenizer {

  /**
   * Builds a tokenizer in SEARCH mode with the user dictionary at `dictPath`
   * and returns the tokens it produces for `text`.
   *
   * A new tokenizer is built on every call.
   */
  def tokenize(text: String, dictPath: String): java.util.List[Token] = {
    val configured = Tokenizer.builder().mode(Tokenizer.Mode.SEARCH).userDictionary(dictPath)
    val tokenizer = configured.build()
    tokenizer.tokenize(text)
  }
}
// Create the stream and configure its OAuth consumer key/secret and access
// token/secret from the values held by twitterModel.
TwitterStream twitterStream = new TwitterStreamFactory().getInstance();
twitterStream.setOAuthConsumer(twitterModel.getConsumerKey(),
twitterModel.getConsumerSecret());
twitterStream.setOAuthAccessToken(new AccessToken(twitterModel
.getAccessToken(), twitterModel.getAccessToken_secret()));
// The MyStatusAdapter class processes Twitter's Status objects
twitterStream.addListener(new MyStatusAdapter(applicationConfParser, bufferedWriter));
// Split the comma-separated Application.searchKeyword into the array of
// keywords to track.
ArrayList<String> track = new ArrayList<String>();
track.addAll(Arrays.asList(Application.searchKeyword.split(",")));
String[] trackArray = track.toArray(new String[track.size()]);
// Up to 400 keywords, 5000 follows and 25 locations can be specified
// NOTE(review): arguments are presumably (count, follow, track) - confirm
// against the twitter4j FilterQuery constructor.
twitterStream.filter(new FilterQuery(0, null, trackArray));
# Prints the file named by the first command-line argument with every
# occurrence of 全員, ふぁぼ, ファボ, 定期 or 相互 replaced by a single space.
#
# NOTE: the filtering runs while the class body is evaluated, i.e. as soon as
# this file is loaded; the class itself defines no methods.
class TwitterDataCleansing
  # File.foreach is used instead of Kernel#open so that an argument starting
  # with "|" is treated as a file name and never executed as a command.
  File.foreach(ARGV[0]) do |line|
    puts line.gsub(/(全員|ふぁぼ|ファボ|定期|相互)/, ' ')
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment