Created
March 1, 2016 21:10
-
-
Save ramannanda9/7a4ba0cc12d3ca15e434 to your computer and use it in GitHub Desktop.
PIG script for searching words
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Counts how often each word from a filter list occurs in a file of tweets.
-- Every filter word is reported, including those that never occur (count 0).

-- One tweet per input line, read as a single chararray field.
-- NOTE(review): the default loader (PigStorage) splits fields on tabs, so only the
-- text before the first tab of each line ends up in tweet_text -- confirm the input
-- contains no tabs, or load it with TextLoader() instead.
input_records = LOAD 'hdfs://quickstart.cloudera:8020/user/cloudera/pig_word_count/input.txt' AS (tweet_text:chararray);

-- One word to search for per line.
filter_words = LOAD 'hdfs://quickstart.cloudera:8020/user/cloudera/pig_word_count/filter_words.txt' AS (filter_word:chararray);

-- Null filter words (e.g. from blank lines) can never match anything; drop them
-- up front so they do not show up as a null group in the result.
valid_filter_words = FILTER filter_words BY filter_word IS NOT NULL;

-- Replace every run of characters that are neither letters nor whitespace with a
-- single space, so punctuation and digits cannot stick to a word.
cleaned_records = FOREACH input_records GENERATE REPLACE(tweet_text,'([^a-zA-Z\\s]+)',' ') AS tweet_text;

-- Lower-case each tweet and emit one row per token.
tokenize_words = FOREACH cleaned_records GENERATE FLATTEN(TOKENIZE(LOWER(tweet_text))) AS word;

-- LEFT OUTER keeps every filter word: matching tokens are attached to it, and a
-- filter word without any match survives as a single row whose word is null.
-- Tokens that match no filter word are dropped by the join itself, so no
-- post-join FILTER (and no regex 'matches' on the filter word) is needed.
-- The tokens are already lower-case, so only the filter word needs LOWER here.
filtered_data = JOIN valid_filter_words BY LOWER(filter_word) LEFT OUTER, tokenize_words BY word;

-- Group by the filter word exactly as it was written in the filter file.
grouped_words = GROUP filtered_data BY filter_word;

-- COUNT ignores nulls, so a filter word that never occurred is reported with 0.
word_cnt = FOREACH grouped_words GENERATE group AS matched_word, COUNT(filtered_data.tokenize_words::word) AS occurrences;

-- Print the word counts.
DUMP word_cnt;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment