Created
January 5, 2015 02:38
-
-
Save rjurney/2f350b2cbed9862b692b to your computer and use it in GitHub Desktop.
A join using Python/MrJob
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Adapted for MrJob from Joe Stein's example at:
# http://allthingshadoop.com/2011/12/16/simple-hadoop-streaming-tutorial-using-joins-and-keys-with-python/
import sys, os, re | |
from mrjob.job import MRJob | |
class MRJoin(MRJob):
    """Reduce-side inner join of country records with person records.

    Input lines are pipe-delimited and come in two shapes:

    * ``countryName|country2digit`` (exactly two fields): country data
    * ``personName|personType|country2digit`` (three or more fields):
      person data

    The mapper keys both kinds of record by the two-letter country code and
    tags them ``'A'`` (country) or ``'B'`` (person). The reducer pairs the
    country record with every other record sharing its key.
    """

    # Have MrJob deliver each key's values to the reducer in sorted order.
    # The 'A'-tagged country record therefore arrives before any 'B'-tagged
    # person record, which is what lets the reducer treat the first value as
    # the country.
    SORT_VALUES = True

    def mapper(self, _, line):
        """Tag one input line and key it by its country code.

        Yields ``(country2digit, ['A', countryName])`` for country lines and
        ``(country2digit, ['B', personName, personType])`` for person lines.
        Lines with fewer than two fields (e.g. blank lines) cannot be keyed,
        so they are counted and skipped instead of raising IndexError.
        """
        fields = line.rstrip("\n").split("|")
        if len(fields) == 2:  # country data
            country_name, country_code = fields
            yield country_code, ['A', country_name]
        elif len(fields) >= 3:  # person data (extra trailing fields ignored)
            person_name, person_type, country_code = fields[:3]
            yield country_code, ['B', person_name, person_type]
        else:  # malformed line: no country code to join on
            self.increment_counter('MRJoin', 'malformed_lines', 1)

    def reducer(self, key, values):
        """Emit ``(key, [country, record])`` for each record joined to a country.

        ``values`` is consumed lazily; only the leading country record is
        held in memory. Keys without a country record produce no output.
        """
        values = iter(values)
        country = next(values, None)
        # A join hit requires the first (sorted) value to be the 'A'-tagged
        # country record. If it is missing, the key only has person records
        # and must not be joined against one of them.
        if country is None or country[0] != 'A':
            return
        for value in values:
            yield key, [country, value]
if __name__ == '__main__':
    # Script entry point: hand control to MrJob's command-line runner.
    MRJoin.run()
Hi, I have a question. Since there is a shuffle and sort step, how are we sure that country is value[0]?
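values[0] is the record tagged with the country symbol 'A', which is the country value being referred to, I believe. Because SORT_VALUES = True makes MrJob sort each key's values before the reducer sees them, the 'A' (country) record arrives ahead of the 'B' (person) records.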
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, I have a question. Since there is a shuffle and sort step, how are we sure that country is value[0]?