Created
April 4, 2011 14:47
-
-
Save kzk/901754 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package net.kzk9; | |
import java.io.*; | |
import java.util.*; | |
import org.apache.hadoop.conf.*; | |
import org.apache.hadoop.hbase.*; | |
import org.apache.hadoop.hbase.client.*; | |
import org.apache.hadoop.hbase.filter.*; | |
import org.apache.hadoop.hbase.io.*; | |
import org.apache.hadoop.hbase.util.*; | |
import org.apache.hadoop.hbase.mapreduce.*; | |
import org.apache.hadoop.mapreduce.*; | |
import org.apache.hadoop.mapreduce.lib.output.*; | |
import org.apache.hadoop.util.*; | |
import org.json.simple.*; | |
import org.json.simple.parser.*; | |
public class HBaseTwitterJSONParse { | |
/** | |
* hBaseのテーブルを入力とするMapper | |
* MapReduce Counter APIを使用して行数をカウント | |
*/ | |
static class RowCounterMapper extends TableMapper<ImmutableBytesWritable, Result> { | |
public static enum Counters { VALID_ROWS, INVALID_ROWS } | |
JSONParser parser; | |
@Override | |
protected void setup(Mapper.Context context) { | |
// JSONパーサーを初期化 | |
parser = new JSONParser(); | |
} | |
// HBaseテーブルの1行がTwitterのJSON形式のデータ | |
@Override | |
public void map(ImmutableBytesWritable row, Result values, Context context) throws IOException { | |
for (KeyValue value: values.list()) { | |
String json_str = Bytes.toString(value.getValue()); | |
try { | |
Map tweet = (Map)parser.parse(new StringReader(json_str)); | |
String id_str = (String)tweet.get("id_str"); | |
if (id_str == null) | |
throw new IOException("id_str not found"); | |
String text = (String)tweet.get("text"); | |
if (id_str == null) | |
throw new IOException("text not found"); | |
Map user = (Map)tweet.get("user"); | |
if (user == null) | |
throw new IOException("user not found"); | |
String screen_name = (String)user.get("screen_name"); | |
if (id_str == null) | |
throw new IOException("screen_name not found"); | |
// increment valid rows | |
context.getCounter(Counters.VALID_ROWS).increment(1); | |
} catch (Throwable e) { | |
// ignore format & parse error | |
context.getCounter(Counters.INVALID_ROWS).increment(1); | |
} | |
} | |
} | |
} | |
public static void main(String[] args) throws Exception { | |
// 設定情報の読み込み | |
Configuration conf = HBaseConfiguration.create(); | |
conf.addResource("/etc/hbase/conf/hbase-default.xml"); | |
conf.addResource("/etc/hbase/conf/hbase-site.xml"); | |
conf.set("hbase.client.scanner.caching", "300"); | |
// conf.set("mapred.job.tracker", "local"); | |
// 引数のパース | |
new GenericOptionsParser(conf, args); | |
// ジョブの作成 | |
String tableName = "twitter"; | |
Job job = new Job(conf, "hBaseTableRowCounter_" + tableName); | |
job.setJarByClass(HBaseTwitterJSONParse.class); | |
// Reducerは使用しない。Counterで行数を数える。 | |
job.setOutputFormatClass(NullOutputFormat.class); | |
job.setNumReduceTasks(0); | |
// Scan条件の指定 | |
Scan scan = new Scan(); | |
// 最初のデータのみを取得 | |
scan.setFilter(new FirstKeyOnlyFilter()); | |
// data:jsonのみを取得 | |
scan = scan.addColumn(Bytes.toBytes("data"), Bytes.toBytes("json")); | |
// 補助関数を利用したMapperの初期化 | |
TableMapReduceUtil.initTableMapperJob(tableName, // テーブル名 | |
scan, // Mapperに渡す前に使用するScan | |
RowCounterMapper.class, // Mapperクラス | |
ImmutableBytesWritable.class, // MapperのKeyの型 | |
Result.class, // MapperのValueの型 | |
job); | |
// ジョブの実行 | |
System.exit(job.waitForCompletion(true) ? 0 : 1); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment