-- ramannanda9/word_count.pig

-- word_count.pig
-- Count, case-insensitively, how often each word from filter_words.txt occurs
-- in the tweets of input.txt. Filter words that never occur are still
-- reported, with a count of 0.

-- Each input line is read as a single chararray field.
input_records = LOAD 'hdfs://quickstart.cloudera:8020/user/cloudera/pig_word_count/input.txt' AS (tweet_text:chararray);
filter_words = LOAD 'hdfs://quickstart.cloudera:8020/user/cloudera/pig_word_count/filter_words.txt' AS (filter_word:chararray);

-- Replace every run of characters that are neither ASCII letters nor
-- whitespace with a single space, then lower-case and emit one row per token.
input_records = FOREACH input_records GENERATE REPLACE(tweet_text, '([^a-zA-Z\\s]+)', ' ') AS tweet_text;
tokenize_words = FOREACH input_records GENERATE FLATTEN(TOKENIZE(LOWER(tweet_text))) AS word;

-- A LEFT OUTER join keeps every filter word (paired with a null word when
-- nothing matches) but never emits tokens that have no filter word, so the
-- unmatched tokens no longer have to be produced by the join and then
-- filtered away again.
cleanedData = JOIN filter_words BY LOWER(filter_word) LEFT OUTER, tokenize_words BY LOWER(word);

-- Rows whose filter_word is null can never match a token; drop them so that
-- no null group shows up in the result.
filtered_data = FILTER cleanedData BY filter_word IS NOT NULL;

-- COUNT skips null values, so a filter word without any matching token
-- (word is null) ends up with a count of 0.
grouped_words = GROUP filtered_data BY filter_word;
word_cnt = FOREACH grouped_words GENERATE group AS matched_word, COUNT(filtered_data.tokenize_words::word);

-- Print the word counts.
DUMP word_cnt;
	-- NOTE(review): these statements repeat the word-count pipeline that
	-- already appears earlier in this file; running the file as-is executes the
	-- pipeline and dumps the result a second time. Confirm whether this copy is
	-- intentional.
	--
	-- Count, case-insensitively, how often each word from filter_words.txt
	-- occurs in the tweets of input.txt. Filter words that never occur are
	-- still reported, with a count of 0.

	-- Each input line is read as a single chararray field.
	input_records = LOAD 'hdfs://quickstart.cloudera:8020/user/cloudera/pig_word_count/input.txt' AS (tweet_text:chararray);
	filter_words = LOAD 'hdfs://quickstart.cloudera:8020/user/cloudera/pig_word_count/filter_words.txt' AS (filter_word:chararray);

	-- Replace every run of characters that are neither ASCII letters nor
	-- whitespace with a single space, then lower-case and emit one row per token.
	input_records = FOREACH input_records GENERATE REPLACE(tweet_text, '([^a-zA-Z\\s]+)', ' ') AS tweet_text;
	tokenize_words = FOREACH input_records GENERATE FLATTEN(TOKENIZE(LOWER(tweet_text))) AS word;

	-- A LEFT OUTER join keeps every filter word (paired with a null word when
	-- nothing matches) but never emits tokens that have no filter word, so the
	-- unmatched tokens no longer have to be produced by the join and then
	-- filtered away again.
	cleanedData = JOIN filter_words BY LOWER(filter_word) LEFT OUTER, tokenize_words BY LOWER(word);

	-- Rows whose filter_word is null can never match a token; drop them so that
	-- no null group shows up in the result.
	filtered_data = FILTER cleanedData BY filter_word IS NOT NULL;

	-- COUNT skips null values, so a filter word without any matching token
	-- (word is null) ends up with a count of 0.
	grouped_words = GROUP filtered_data BY filter_word;
	word_cnt = FOREACH grouped_words GENERATE group AS matched_word, COUNT(filtered_data.tokenize_words::word);

	-- Print the word counts.
	DUMP word_cnt;