Skip to content

Instantly share code, notes, and snippets.

@tuttlem
Created November 23, 2015 10:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tuttlem/a9cbd1cbf39ca8636955 to your computer and use it in GitHub Desktop.
Save tuttlem/a9cbd1cbf39ca8636955 to your computer and use it in GitHub Desktop.
Prime evaluation with Hadoop Streaming
#!/usr/bin/env python
import sys
import itertools
from math import sqrt
def read_input(file):
    """Parse comma-separated integers from an iterable of text lines.

    Args:
        file: any iterable yielding text lines (for example ``sys.stdin``).

    Yields:
        One list of ints per input line, in the order the numbers appear.
        Whitespace around each number (including the trailing newline) is
        ignored and empty fields are skipped, so a blank line or a trailing
        comma produces fewer values rather than an error.

    Raises:
        ValueError: if a non-empty field is not a valid integer literal.
    """
    for line in file:
        # strip() removes the line terminator as well as stray padding.
        fields = (field.strip() for field in line.split(','))
        # Build a concrete list so callers can iterate the row repeatedly.
        yield [int(field) for field in fields if field]
def prime_candidate_seq():
    """Generate an endless, increasing stream of prime candidates.

    Yields the single-digit primes 2, 3, 5 and 7 first, then every integer
    from 11 upwards whose last decimal digit is 1, 3, 7 or 9. The later
    values are only guesses: composites such as 21 or 27 are included.
    """
    # the seed primes below ten
    for seed in (2, 3, 5, 7):
        yield seed
    # from here on, walk decade by decade and emit only the endings a
    # prime above 9 can have
    for tens in itertools.count(10, 10):
        for last_digit in (1, 3, 7, 9):
            yield tens + last_digit
def test_prime(n):
    """Return True if the integer *n* is prime, using trial division.

    Divisors are drawn from prime_candidate_seq() and tried for as long as
    their square does not exceed *n*.

    Args:
        n: the integer to test.

    Returns:
        True when no candidate divisor divides *n* evenly; False otherwise.
        Values below 2 (including negatives) are never prime.
    """
    if n < 2:
        return False
    # A composite n always has a divisor d with d * d <= n. The bound must be
    # inclusive, otherwise squares of primes (4, 9, 25, ...) slip through.
    # Comparing d * d against n keeps the test in exact integer arithmetic.
    divisors = itertools.takewhile(lambda d: d * d <= n, prime_candidate_seq())
    return not any(n % d == 0 for d in divisors)
def main():
    """Mapper entry point: print every prime found on standard input.

    Reads rows of integers from stdin via read_input() and writes each value
    accepted by test_prime() on its own line to stdout.
    """
    for row in read_input(sys.stdin):
        for p in row:
            if test_prime(p):
                # Parenthesised form works on both Python 2 and Python 3.
                print(p)


if __name__ == "__main__":
    main()
#!/usr/bin/env python
import sys
def main():
    """Reducer entry point: copy each stdin line to stdout unchanged.

    The newline is removed from each line before printing because print
    appends its own.
    """
    # input comes from STDIN (standard input)
    for line in sys.stdin:
        # Parenthesised form works on both Python 2 and Python 3.
        print(line.replace('\n', ''))


if __name__ == "__main__":
    main()
#!/bin/bash
# candidates.txt needs to be uploaded to HDFS first. It'll hold all of the potential primes for processing.
# testing the mapper and reducer locally can be done with bash
# cat candidates.txt | ./mapper.py | sort -k1,1 | ./reducer.py
# Abort with a clear message if HADOOP_PREFIX is unset or empty; otherwise the
# paths below would silently expand to /bin/hadoop and /share/hadoop/...
: "${HADOOP_PREFIX:?HADOOP_PREFIX must point at the Hadoop installation}"

# Launch the streaming job. Expansions are quoted so that an installation or
# working directory containing spaces is not split into separate arguments.
"$HADOOP_PREFIX/bin/hadoop" jar \
    "$HADOOP_PREFIX/share/hadoop/tools/lib/hadoop-streaming-2.7.0.jar" \
    -mapper "$(pwd)/mapper.py" \
    -reducer "$(pwd)/reducer.py" \
    -input /user/root/candidates.txt \
    -output /user/root/candidates-out
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment