Created
October 10, 2014 21:13
-
-
Save danlmarmot/29a9b7831ea8e5905b7c to your computer and use it in GitHub Desktop.
Simple PySpark 1.1 standalone program
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
'''
Small demo of how to get a standalone Python script working in Spark 1.1 through the regular Python interpreter,
rather than using bin/pyspark <filename.py>.
Useful for interactive debugging and avoids messing with PYTHONPATH, I used this script to verify my PyCharm IDE
was correctly configured.
Note that Spark is installed at ~/bin/spark/current
Run this with either python simple_pyspark.py, or just ./sample_pyspark.py
'''
import os
import sys

# Make the Spark Python bindings (and the bundled py4j bridge) importable
# without requiring PYTHONPATH to be set before launching the interpreter.
SPARK_HOME = os.path.join(os.environ["HOME"], "bin/spark/current/")
sys.path.append(os.path.join(SPARK_HOME, "python"))
sys.path.append(os.path.join(SPARK_HOME, "python/lib/py4j-0.8.2.1-src.zip"))

# Uncomment to examine Python paths
# from pprint import pprint as pp
# print("Paths are:")
# pp(sys.path)

from pyspark import SparkContext

read_me = os.path.join(SPARK_HOME, "README.md")

# "local" runs Spark in-process with a single worker thread -- convenient for
# stepping through the job in an IDE debugger.
sc = SparkContext("local", "Read Me")
read_me_data = sc.textFile(read_me).cache()

# Count lines containing the letters 'a' and 'b' respectively.
numAs = read_me_data.filter(lambda s: 'a' in s).count()
numBs = read_me_data.filter(lambda s: 'b' in s).count()

# Single-argument print-as-function form works under both Python 2 and Python 3.
print("Lines with a: %i, lines with b: %i" % (numAs, numBs))

# A couple of assertions against the known counts for Spark 1.1's README.md.
# BUG FIX: the original used `is`, which tests object *identity*, not value
# equality. It only happened to pass because CPython caches small ints; `==`
# is the correct comparison for integer values.
assert numAs == 83
assert numBs == 38
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment