tuttlem/mapper.py

## mapper.py
#!/usr/bin/env python

import sys
import itertools
from math import sqrt

def read_input(file):
    """Parse comma-separated integers from an iterable of text lines.

    Args:
        file: any iterable of lines, e.g. ``sys.stdin`` or a list of strings.

    Yields:
        One list of ints per input line, in input order. Empty fields
        (a blank line, a trailing comma) contribute no values, so a blank
        line yields an empty list instead of raising.

    Raises:
        ValueError: if a non-empty field is not a valid integer.
    """
    for line in file:
        # strip() removes the line terminator ('\n' or '\r\n') as well as any
        # padding around the commas, so empty fields can be detected reliably
        fields = (field.strip() for field in line.split(','))

        # a real list (not a lazy map object) behaves the same on Python 2 and 3
        yield [int(field) for field in fields if field]

def prime_candidate_seq():
    """Generate an endless, increasing sequence of trial divisors.

    The single-digit primes 2, 3, 5 and 7 come first. After that every
    number ending in 1, 3, 7 or 9 is produced (11, 13, 17, 19, 21, ...).
    These are only candidates: composites such as 21 are included.
    """
    # the seed primes below ten
    for seed in (2, 3, 5, 7):
        yield seed

    # a prime above nine can only end in one of these digits
    last_digits = (1, 3, 7, 9)

    base = 10
    while True:
        for digit in last_digits:
            yield base + digit
        base += 10

def test_prime(n):
    """Return True when the integer ``n`` is prime, False otherwise.

    Uses trial division by the values of ``prime_candidate_seq()``.
    Numbers below 2 (0, 1 and all negatives) are never prime.

    Args:
        n: the integer to test.

    Returns:
        bool: True if no candidate divisor up to and including the square
        root of ``n`` divides it evenly.
    """
    if n < 2:
        return False

    # A composite number always has a divisor d with d * d <= n. The bound
    # must be inclusive, otherwise squares of primes (9, 25, 49, ...) slip
    # through. Comparing d * d against n keeps the check in exact integer
    # arithmetic instead of relying on a floating-point square root.
    divisors = itertools.takewhile(lambda d: d * d <= n, prime_candidate_seq())

    return not any(n % d == 0 for d in divisors)

def main():
    """Mapper entry point: print every prime found on standard input.

    Each stdin line is parsed by ``read_input`` into a row of integers;
    every value for which ``test_prime`` is true is written to stdout on
    its own line, in input order.
    """
    for row in read_input(sys.stdin):
        for candidate in row:
            if test_prime(candidate):
                # print(x) with a single argument behaves identically on
                # Python 2 and Python 3; the bare print statement is a
                # SyntaxError on Python 3
                print(candidate)


if __name__ == "__main__":
    main()

## reducer.py
#!/usr/bin/env python

import sys

def main():
    """Identity reducer: echo every line of standard input to stdout.

    The newline is removed from each line before printing, and ``print``
    adds exactly one back, so every output record ends in a single newline
    even when the last input line had none.
    """
    # input comes from STDIN (standard input)
    for line in sys.stdin:
        # print(x) with a single argument behaves identically on Python 2
        # and Python 3; the bare print statement is a SyntaxError on Python 3
        print(line.replace('\n', ''))


if __name__ == "__main__":
    main()

## submit.sh
#!/bin/bash

# Submits the mapper/reducer pair as a Hadoop Streaming job.

# candidates.txt needs to be uploaded to HDFS first. It'll hold all of the potential primes for processing.

# testing the mapper and reducer locally can be done with bash
#   cat candidates.txt | ./mapper.py | sort -k1,1 | ./reducer.py

# fail with a clear message instead of attempting to run "/bin/hadoop"
# when the installation prefix has not been exported
: "${HADOOP_PREFIX:?HADOOP_PREFIX must point at the Hadoop installation}"

# expansions are quoted so paths containing spaces survive word splitting
"$HADOOP_PREFIX/bin/hadoop" jar \
  "$HADOOP_PREFIX/share/hadoop/tools/lib/hadoop-streaming-2.7.0.jar" \
  -mapper "$(pwd)/mapper.py" \
  -reducer "$(pwd)/reducer.py" \
  -input /user/root/candidates.txt \
  -output /user/root/candidates-out
	#!/usr/bin/env python

	import sys
	import itertools
	from math import sqrt

	def read_input(file):
	    """Parse comma-separated integers from an iterable of text lines.

	    Args:
	        file: any iterable of lines, e.g. ``sys.stdin`` or a list of strings.

	    Yields:
	        One list of ints per input line, in input order. Empty fields
	        (a blank line, a trailing comma) contribute no values, so a blank
	        line yields an empty list instead of raising.

	    Raises:
	        ValueError: if a non-empty field is not a valid integer.
	    """
	    for line in file:
	        # strip() removes the line terminator ('\n' or '\r\n') as well as
	        # any padding around the commas, so empty fields can be detected
	        fields = (field.strip() for field in line.split(','))

	        # a real list (not a lazy map object) behaves the same on
	        # Python 2 and Python 3
	        yield [int(field) for field in fields if field]

	def prime_candidate_seq():
	    """Generate an endless, increasing sequence of trial divisors.

	    The single-digit primes 2, 3, 5 and 7 come first. After that every
	    number ending in 1, 3, 7 or 9 is produced (11, 13, 17, 19, 21, ...).
	    These are only candidates: composites such as 21 are included.
	    """
	    # these numbers are the only valid endings for primes above 9
	    suffixes = [1, 3, 7, 9]

	    # send out the seed primes to test
	    for seed in (2, 3, 5, 7):
	        yield seed

	    # the rest are just going to be generated guesses to test
	    decade = 10
	    while True:
	        for suffix in suffixes:
	            yield decade + suffix
	        decade += 10

	def test_prime(n):
	    """Return True when the integer ``n`` is prime, False otherwise.

	    Uses trial division by the values of ``prime_candidate_seq()``.
	    Numbers below 2 (0, 1 and all negatives) are never prime.

	    Args:
	        n: the integer to test.

	    Returns:
	        bool: True if no candidate divisor up to and including the
	        square root of ``n`` divides it evenly.
	    """
	    if n < 2:
	        return False

	    # A composite number always has a divisor d with d * d <= n. The
	    # bound must be inclusive, otherwise squares of primes (9, 25, ...)
	    # slip through. Integer comparison avoids floating-point sqrt.
	    divisors = itertools.takewhile(lambda d: d * d <= n, prime_candidate_seq())

	    return not any(n % d == 0 for d in divisors)

	def main():
	    """Mapper entry point: print every prime found on standard input.

	    Each stdin line is parsed by ``read_input`` into a row of integers;
	    every value for which ``test_prime`` is true is written to stdout on
	    its own line, in input order.
	    """
	    for row in read_input(sys.stdin):
	        for candidate in row:
	            if test_prime(candidate):
	                # print(x) with a single argument behaves identically
	                # on Python 2 and Python 3
	                print(candidate)

	if __name__ == "__main__":
	    main()
	#!/bin/bash

	# Submits the mapper/reducer pair as a Hadoop Streaming job.

	# candidates.txt needs to be uploaded to HDFS first. It'll hold all of the potential primes for processing.

	# testing the mapper and reducer locally can be done with bash
	#   cat candidates.txt | ./mapper.py | sort -k1,1 | ./reducer.py

	# fail with a clear message instead of attempting to run "/bin/hadoop"
	# when the installation prefix has not been exported
	: "${HADOOP_PREFIX:?HADOOP_PREFIX must point at the Hadoop installation}"

	# expansions are quoted so paths containing spaces survive word splitting
	"$HADOOP_PREFIX/bin/hadoop" jar \
	  "$HADOOP_PREFIX/share/hadoop/tools/lib/hadoop-streaming-2.7.0.jar" \
	  -mapper "$(pwd)/mapper.py" \
	  -reducer "$(pwd)/reducer.py" \
	  -input /user/root/candidates.txt \
	  -output /user/root/candidates-out