sangheestyle/mapper.py

## mapper.py
#!/usr/bin/env python
import sys
import string

# Punctuation characters that are stripped from every input line.
exclude = set(string.punctuation)


def map_words(lines):
    """Yield each normalized word found in *lines*, in input order.

    A line is normalized by dropping every punctuation character (see
    ``exclude``) and every digit, lowercasing what is left, and splitting
    the result on runs of whitespace.  Lines that contain no letters
    simply contribute no words.

    Args:
        lines: iterable of text lines, with or without trailing newlines.

    Yields:
        One lowercase word at a time.
    """
    for line in lines:
        # A single pass removes punctuation and digits together; split()
        # already ignores leading/trailing whitespace, so no strip() needed.
        kept = ''.join(
            ch for ch in line if ch not in exclude and not ch.isdigit()
        )
        for word in kept.lower().split():
            yield word


def main(stdin=None, stdout=None):
    """Write one ``word<TAB>1`` record per word read from *stdin*.

    Both streams default to the process's standard streams; they are
    parameters only so the script can be driven without real pipes.
    """
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    for word in map_words(stdin):
        stdout.write('%s\t%s\n' % (word, 1))


# Guarded so importing this module does not consume standard input.
if __name__ == '__main__':
    main()

## reducer.py
#!/usr/bin/env python

from operator import itemgetter
import sys

def reduce_counts(lines):
    """Sum the counts of consecutive records that share the same word.

    Each record is expected to look like ``word<TAB>count``.  Records are
    merged only while they are adjacent, so the input must arrive grouped
    by word; this works because Hadoop sorts map output by key (here: the
    word) before it is passed to the reducer.

    Records without a tab separator or with a non-integer count are
    skipped and never disturb the running total of the current word.
    Empty input produces no output at all.

    Args:
        lines: iterable of text lines, with or without trailing newlines.

    Yields:
        ``(word, total)`` tuples, one per run of identical words.
    """
    current_word = None
    current_count = 0

    for line in lines:
        # partition() never raises, unlike unpacking the result of split(),
        # so blank or tab-less lines can be discarded instead of crashing.
        word, sep, count = line.strip().partition('\t')
        if not sep:
            continue

        try:
            count = int(count)
        except ValueError:
            # count was not a number: ignore this record entirely
            continue

        if word == current_word:
            current_count += count
            continue

        # A new word starts; flush the finished one first (if any).
        if current_word is not None:
            yield current_word, current_count
        current_word = word
        current_count = count

    # Flush the last run.  Testing against None (rather than against the
    # most recently parsed word) keeps this correct when the input is
    # empty or when the final record was discarded as malformed.
    if current_word is not None:
        yield current_word, current_count


def main(stdin=None, stdout=None):
    """Read mapper records from *stdin*, write ``word<TAB>total`` lines.

    Both streams default to the process's standard streams; they are
    parameters only so the script can be driven without real pipes.
    """
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    for word, total in reduce_counts(stdin):
        stdout.write('%s\t%s\n' % (word, total))


# Guarded so importing this module does not consume standard input.
if __name__ == '__main__':
    main()

## word_counter.pig
-- Word count: read text lines, split them into tokens, reduce each token
-- to a lowercase alphabetic word, and store the per-word totals.
-- Load each input line as a single chararray field; $INPUT is a parameter.
a = LOAD '$INPUT' AS (foo:chararray);
-- Split every line on the space character into a bag of one-field tuples.
b1 = FOREACH a GENERATE TOKENIZE(foo, ' ')
     AS tokens: {t:(word: chararray)};
-- Lowercase each token and keep the run of a-z letters captured by the
-- regex group, then flatten so each word becomes its own row.
-- NOTE(review): REGEX_EXTRACT_ALL presumably must match the whole token and
-- returns null for tokens without any a-z letter -- confirm how such nulls
-- flow through the GROUP and COUNT below.
b2 = FOREACH b1 {
     cleaned = FOREACH tokens GENERATE
               FLATTEN(REGEX_EXTRACT_ALL(LOWER(word),'.*?([a-z]+).*?'))
               AS word ;
     GENERATE FLATTEN(cleaned);
}
-- Bring identical words together.
c = GROUP b2 BY word;
-- One row per word, count first: (counts, word).
d = FOREACH c GENERATE COUNT(b2) AS counts, group AS word;
-- Highest counts first.
e = ORDER d BY counts DESC;
-- $OUTPUT is a parameter naming the destination.
STORE e INTO '$OUTPUT';
	#!/usr/bin/env python
import sys
import string

# Punctuation characters that are stripped from every input line.
exclude = set(string.punctuation)


def map_words(lines):
    """Yield each normalized word found in *lines*, in input order.

    A line is normalized by dropping every punctuation character (see
    ``exclude``) and every digit, lowercasing what is left, and splitting
    the result on runs of whitespace.  Lines that contain no letters
    simply contribute no words.

    Args:
        lines: iterable of text lines, with or without trailing newlines.

    Yields:
        One lowercase word at a time.
    """
    for line in lines:
        # A single pass removes punctuation and digits together; split()
        # already ignores leading/trailing whitespace, so no strip() needed.
        kept = ''.join(
            ch for ch in line if ch not in exclude and not ch.isdigit()
        )
        for word in kept.lower().split():
            yield word


def main(stdin=None, stdout=None):
    """Write one ``word<TAB>1`` record per word read from *stdin*.

    Both streams default to the process's standard streams; they are
    parameters only so the script can be driven without real pipes.
    """
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    for word in map_words(stdin):
        stdout.write('%s\t%s\n' % (word, 1))


# Guarded so importing this module does not consume standard input.
if __name__ == '__main__':
    main()
	#!/usr/bin/env python

from operator import itemgetter
import sys


def reduce_counts(lines):
    """Sum the counts of consecutive records that share the same word.

    Each record is expected to look like ``word<TAB>count``.  Records are
    merged only while they are adjacent, so the input must arrive grouped
    by word; this works because Hadoop sorts map output by key (here: the
    word) before it is passed to the reducer.

    Records without a tab separator or with a non-integer count are
    skipped and never disturb the running total of the current word.
    Empty input produces no output at all.

    Args:
        lines: iterable of text lines, with or without trailing newlines.

    Yields:
        ``(word, total)`` tuples, one per run of identical words.
    """
    current_word = None
    current_count = 0

    for line in lines:
        # partition() never raises, unlike unpacking the result of split(),
        # so blank or tab-less lines can be discarded instead of crashing.
        word, sep, count = line.strip().partition('\t')
        if not sep:
            continue

        try:
            count = int(count)
        except ValueError:
            # count was not a number: ignore this record entirely
            continue

        if word == current_word:
            current_count += count
            continue

        # A new word starts; flush the finished one first (if any).
        if current_word is not None:
            yield current_word, current_count
        current_word = word
        current_count = count

    # Flush the last run.  Testing against None (rather than against the
    # most recently parsed word) keeps this correct when the input is
    # empty or when the final record was discarded as malformed.
    if current_word is not None:
        yield current_word, current_count


def main(stdin=None, stdout=None):
    """Read mapper records from *stdin*, write ``word<TAB>total`` lines.

    Both streams default to the process's standard streams; they are
    parameters only so the script can be driven without real pipes.
    """
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    for word, total in reduce_counts(stdin):
        stdout.write('%s\t%s\n' % (word, total))


# Guarded so importing this module does not consume standard input.
if __name__ == '__main__':
    main()
	-- Word count: read text lines, split them into tokens, reduce each token
	-- to a lowercase alphabetic word, and store the per-word totals.
	-- Load each input line as a single chararray field; $INPUT is a parameter.
	a = LOAD '$INPUT' AS (foo:chararray);
	-- Split every line on the space character into a bag of one-field tuples.
	b1 = FOREACH a GENERATE TOKENIZE(foo, ' ')
	     AS tokens: {t:(word: chararray)};
	-- Lowercase each token and keep the run of a-z letters captured by the
	-- regex group.  The surrounding '.*?' parts are lazy and unbounded so the
	-- group starts at the token's first a-z letter no matter how many other
	-- characters come before or after it.
	b2 = FOREACH b1 {
	     cleaned = FOREACH tokens GENERATE
	               FLATTEN(REGEX_EXTRACT_ALL(LOWER(word),'.*?([a-z]+).*?'))
	               AS word ;
	     GENERATE FLATTEN(cleaned);
	}
	-- Bring identical words together.
	c = GROUP b2 BY word;
	-- One row per word, count first: (counts, word).
	d = FOREACH c GENERATE COUNT(b2) AS counts, group AS word;
	-- Highest counts first.
	e = ORDER d BY counts DESC;
	-- $OUTPUT is a parameter naming the destination.
	STORE e INTO '$OUTPUT';