Last active
March 6, 2016 12:05
POS Tags visualization MapReduce job
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = 'renienj' | |
#!/usr/bin/env python | |
import sys | |
def read_input(file):
    """Lazily yield the whitespace-separated tokens of each line in *file*.

    *file* is any iterable of text lines (the mapper passes ``sys.stdin``).
    Each yielded item is the list produced by ``str.split()`` for one line,
    so a blank line yields an empty list.
    """
    for raw_line in file:
        # No-argument split() collapses runs of whitespace and drops the
        # trailing newline.
        tokens = raw_line.split()
        yield tokens
# Emit a count of 1 for each POS tag
def pos_tag_count(data):
    """Print one tab-separated ``<tag>``/``1`` record per tagged token.

    *data* is an iterable of token lists, where each token is expected to
    look like ``word_TAG``.  The tag is taken to be everything after the
    LAST underscore, so words that themselves contain underscores keep
    their tag.  Tokens without an underscore, or with an empty tag, are
    skipped rather than raising ``ValueError`` and aborting the whole job.
    """
    for words in data:
        for word in words:
            # rpartition splits on the last '_' and never raises; ``sep`` is
            # empty when the token has no underscore at all.
            _, sep, pos = word.rpartition('_')
            if not sep or not pos:
                continue
            print('%s\t%d' % (pos, 1))
# Emit a count of 1 for each sentence-level POS tag pattern
def pos_tag_pattern_count(data):
    """Print one tab-separated ``<tag pattern>``/``1`` record per line.

    *data* is an iterable of token lists, where each token is expected to
    look like ``word_TAG``.  The pattern for a line is its tags joined by
    single spaces.  The tag is everything after the LAST underscore;
    tokens with no underscore or an empty tag are left out of the pattern
    instead of raising ``ValueError``.  Lines that produce no tags (for
    example blank lines) emit nothing, so no empty-key record is written.
    """
    for words in data:
        pattern = []
        for word in words:
            # ``sep`` is empty when the token contains no underscore.
            _, sep, pos = word.rpartition('_')
            if sep and pos:
                pattern.append(pos)
        if pattern:
            print('%s\t%d' % (" ".join(pattern), 1))
# Word and word-pattern frequency
def word_count(data):
    """Print one tab-separated ``<word>``/``1`` record per tagged token.

    *data* is an iterable of token lists, where each token is expected to
    look like ``word_TAG``.  The word is everything before the LAST
    underscore, so words that themselves contain underscores are kept
    whole.  Tokens without an underscore, or with an empty word part, are
    skipped rather than raising ``ValueError`` and aborting the whole job.
    """
    for words in data:
        for word in words:
            # ``sep`` is empty when the token contains no underscore.
            value, sep, _ = word.rpartition('_')
            if not sep or not value:
                continue
            print('%s\t%d' % (value, 1))
# Emit a count of 1 for each sentence-level word pattern
def word_pattern_count(data):
    """Print one tab-separated ``<word pattern>``/``1`` record per line.

    *data* is an iterable of token lists, where each token is expected to
    look like ``word_TAG``.  The pattern for a line is its words joined by
    single spaces.  The word is everything before the LAST underscore;
    tokens with no underscore or an empty word part are left out of the
    pattern instead of raising ``ValueError``.  Lines that produce no
    words (for example blank lines) emit nothing, so no empty-key record
    is written.
    """
    for words in data:
        pattern = []
        for word in words:
            # ``sep`` is empty when the token contains no underscore.
            value, sep, _ = word.rpartition('_')
            if sep and value:
                pattern.append(value)
        if pattern:
            print('%s\t%d' % (" ".join(pattern), 1))
def main(argv):
    """Run the mapper in the mode named by ``argv[1]`` over standard input.

    Supported modes are ``tag-pattern``, ``tag``, ``word-pattern`` and
    ``word``.  A missing or unrecognised mode writes a usage line to
    standard error and exits with status 2, instead of raising
    ``IndexError`` or silently producing no output.
    """
    usage = "usage: %s {tag-pattern|tag|word-pattern|word}\n" % (
        argv[0] if argv else "mapper.py")
    if len(argv) < 2:
        sys.stderr.write(usage)
        sys.exit(2)

    emitters = {
        "tag-pattern": pos_tag_pattern_count,
        "tag": pos_tag_count,
        "word-pattern": word_pattern_count,
        "word": word_count,
    }
    emit = emitters.get(argv[1])
    if emit is None:
        sys.stderr.write(usage)
        sys.exit(2)

    emit(read_input(sys.stdin))


if __name__ == "__main__":
    main(sys.argv)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = 'renienj' | |
from operator import itemgetter | |
import sys | |
def reduce_counts(lines):
    """Yield ``(key, total)`` for each run of equal keys in *lines*.

    Each line is expected to be ``<key>`` TAB ``<integer count>``.  Lines
    are stripped of surrounding whitespace first; lines without a tab or
    with a non-integer count are silently discarded.  Totals are only
    correct when equal keys are adjacent, which holds because Hadoop sorts
    map output by key before it is passed to the reducer.

    Nothing is yielded for empty input, and a discarded final line does
    not cause the last accumulated key to be lost.
    """
    current_key = None
    current_total = 0

    for line in lines:
        # remove leading and trailing whitespace, then split on the first tab
        key, sep, raw_count = line.strip().partition('\t')
        if not sep:
            # no tab: not a key/count record, discard it
            continue

        # convert count (currently a string) to int
        try:
            count = int(raw_count)
        except ValueError:
            # count was not a number, so silently ignore/discard this line
            continue

        if key == current_key:
            current_total += count
        else:
            if current_key is not None:
                yield current_key, current_total
            current_key = key
            current_total = count

    # do not forget to output the last key, if any line was accepted
    if current_key is not None:
        yield current_key, current_total


if __name__ == "__main__":
    # input comes from STDIN; write one "<key>\t<total>" line per key
    for key, total in reduce_counts(sys.stdin):
        print('%s\t%s' % (key, total))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Blog post : POS Tags Visualization data processing code
Python Hadoop Map Reduce files