joshy/alice-word-count

## alice-word-count
-- Word-frequency count: lists every word longer than three characters in the
-- book together with how often it occurs, most frequent first.

-- Read the whole book, one line of text per record
input_lines = LOAD 'alice-im-wunderland.txt' using TextLoader() as (line:chararray);

-- Split each line into a bag of tokens, then flatten the bag
-- so that every token ends up on its own row
words = foreach input_lines generate flatten(TOKENIZE(line)) AS word;

-- Keep only tokens made up entirely of letters, digits or underscores
-- (this drops empty tokens and tokens that still carry punctuation).
-- MATCHES uses Java regular expressions, where \w covers only ASCII
-- [a-zA-Z_0-9] by default; with that pattern every word of the German text
-- containing an umlaut or sharp s would be discarded. \p{L} and \p{N} match
-- letters and digits of any script instead.
filtered_words = FILTER words by word MATCHES '[\\p{L}\\p{N}_]+';

-- Drop short words: keep only those with more than three characters
filtered_long_words = FILTER filtered_words by SIZE(word) > 3;


-- Create one group per distinct word
word_groups = GROUP filtered_long_words BY word;

-- Count the entries in each group
word_count = FOREACH word_groups generate COUNT(filtered_long_words) as count, group as word;

-- Sort the word counts, highest number first
word_count_sorted = ORDER word_count by count DESC;

--dump filtered_words;
dump word_count_sorted;
	-- Word-frequency count: lists every word longer than three characters in the
	-- book together with how often it occurs, most frequent first.

	-- Read the whole book, one line of text per record
	input_lines = LOAD 'alice-im-wunderland.txt' using TextLoader() as (line:chararray);

	-- Split each line into a bag of tokens, then flatten the bag
	-- so that every token ends up on its own row
	words = foreach input_lines generate flatten(TOKENIZE(line)) AS word;

	-- Keep only tokens made up entirely of letters, digits or underscores
	-- (this drops empty tokens and tokens that still carry punctuation).
	-- MATCHES uses Java regular expressions, where \w covers only ASCII
	-- [a-zA-Z_0-9] by default; with that pattern every word of the German text
	-- containing an umlaut or sharp s would be discarded. \p{L} and \p{N} match
	-- letters and digits of any script instead.
	filtered_words = FILTER words by word MATCHES '[\\p{L}\\p{N}_]+';

	-- Drop short words: keep only those with more than three characters
	filtered_long_words = FILTER filtered_words by SIZE(word) > 3;


	-- Create one group per distinct word
	word_groups = GROUP filtered_long_words BY word;

	-- Count the entries in each group
	word_count = FOREACH word_groups generate COUNT(filtered_long_words) as count, group as word;

	-- Sort the word counts, highest number first
	word_count_sorted = ORDER word_count by count DESC;

	--dump filtered_words;
	dump word_count_sorted;