Skip to content

Instantly share code, notes, and snippets.

@VJ310
VJ310 / sentenceDetector.pig
Created April 24, 2013 06:23
Sentence Boundary Detection using Pig + Java UDF + OpenNLP
--Sentence boundary detection over review text: Pig script that calls a Java UDF.
--Register the OpenNLP jars and the UDF jar (paths are given relative, with no directory).
register 'opennlp-tools-1.5.1-incubating.jar';
register 'opennlp-maxent-3.0.1-incubating.jar';
--NOTE(review): presumably this jar provides com.Sentiment.udfSentence -- confirm.
register 'SentimentUDF-1.0-SNAPSHOT.jar';
--Alias the UDF class so the script can refer to it as getSentences.
define getSentences com.Sentiment.udfSentence();
--load reviews from json file
--The schema is declared inline: votes is a nested tuple of three int counters; stars is an int; the remaining fields are chararray.
raw_review = LOAD 'review.json' USING JsonLoader('votes:(funny:int,useful:int,cool:int),user_id:chararray,review_id:chararray,stars:int,date:chararray,text:chararray,type:chararray,business_id:chararray');
--separate sentences from given review text using java UDF
@VJ310
VJ310 / business.pig
Last active December 14, 2015 15:09
Load json data to Hbase using Pig + Elephant-Bird
--Register the jars this script depends on, using absolute paths into local Pig 0.11.0 and HBase 0.94.5 installs.
--json-simple and elephant-bird-pig are registered for the Elephant-Bird JsonLoader used below.
register '/usr/local/pig-0.11.0/lib/json-simple-1.1.jar';
register '/usr/local/pig-0.11.0/lib/elephant-bird-pig-3.0.7.jar';
--NOTE(review): zookeeper and protobuf are presumably needed by an HBase store step not visible in this excerpt -- confirm.
register '/usr/local/hbase-0.94.5/lib/zookeeper-3.4.5.jar';
register '/usr/local/hbase-0.94.5/lib/protobuf-java-2.4.0a.jar';
--Test json data
--{"business_id": "businessid1", "full_address": "full address", "schools": ["school1","school2"], "open": true, "categories":["category1", "category2"], "photo_url": "http://photourl.com/photo.gif", "city": "city", "review_count": 2, "name": "name", "neighborhoods": ["neighborhood1","neighborhood2"], "url": "http://url.com/xyz", "longitude": -80.488823999999994, "state": "CA", "stars": 4.0, "latitude": 43.449645199999999, "type": "xyz"}
--Load business.json through Elephant-Bird's JsonLoader; each record is exposed as a single field named json, typed as an untyped map.
--NOTE(review): presumably the input holds one JSON object per line, shaped like the sample above -- confirm.
raw_data = load 'business.json' using com.twitter.elephantbird.pig.load.JsonLoader() as (json: map[]);