PySpark boilerplate
import os
import sys
# If the cluster needs a specific Python build, point PySpark at it before creating the context:
# os.environ["PYSPARK_PYTHON"] = "/usr/local/shared/python/3.6.3/bin/python3"
# os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/local/shared/python/3.6.3/bin/python3"

# Requires: pip3 install pyspark
from pyspark import SparkContext

sc = SparkContext.getOrCreate()  # reuse an existing SparkContext if one is already running
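
# An equivalent, more modern entry point (Spark 2.x+), sketched here in case SparkSession
# is preferred; the appName "boilerplate" is an arbitrary choice:
#   from pyspark.sql import SparkSession
#   spark = SparkSession.builder.appName("boilerplate").getOrCreate()
#   sc = spark.sparkContext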
# Peek at executor memory via the underlying JVM context (sc._jsc is a private handle; debugging only)
print(sc._jsc.sc().getExecutorMemoryStatus())
print(sc)
print("Ready to go!")
filename = sys.argv[1]  # path to a tab-separated input file, e.g. lines of the form "key\t42"
data = sc.textFile(filename)
# data.take(15)  # uncomment to inspect the first few lines
data = data.map(lambda x: x.split("\t"))  # each x is one line of the input, split into fields
# Alternative pattern for distributing a driver-side Python collection instead of a text file
# (hamming_comp and inputs_str are not defined in this snippet):
# rdd = sc.parallelize(data)
# rdd.map(lambda x: hamming_comp(inputs_str, x, 2)).collect()
def some_function(x):
    # placeholder transformation; replace with the real per-record logic
    return x

# Rebuild each record as a tab-separated line: the key, the transformed key, and the count as an int
data = data.map(lambda x: "\t".join([x[0], str(some_function(x[0])), str(int(x[1]))]))
# saveAsTextFile writes a directory of part-* files and fails if the target already exists
data.saveAsTextFile("processed_" + filename)
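
# How this might be run, assuming the script is saved as process.py (a hypothetical name)
# and data.tsv is a two-column tab-separated file:
#   spark-submit process.py data.tsv
# or, since pyspark was pip-installed, plain Python should also work:
#   python3 process.py data.tsv
# Output lands in a directory named processed_data.tsv containing part-* files.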