@alex-pat
Created March 16, 2018 15:19
spark lab
#!/usr/bin/env python3
# https://spark.apache.org/docs/latest/rdd-programming-guide.html#resilient-distributed-datasets-rdds
import sys
from pyspark import SparkContext
# from pyspark.sql import SparkSession
n = {'asd': 0}
def inc(x):
    global n
    n['asd'] += len(x)
def main(logfile):
    # conf = (SparkConf()
    #         .setAppName("Lab")
    #         .setMaster("local"))
    # sc = SparkContext(conf=conf)
    sc = SparkContext("local", "Lab")
    rdd = sc.textFile(logfile).cache()
    # rdd.map(lambda x: print(type(x)))

    # foreach runs inc inside executor processes, so it mutates a serialized
    # copy of n shipped with the closure; the driver-side dict below still
    # prints {'asd': 0}. See "Understanding closures" in the RDD guide linked above.
    rdd.foreach(inc)
    global n
    print(n)

    # Classic word count; the final count() is the number of distinct words.
    counts = rdd.flatMap(lambda x: x.split(" ")) \
                .map(lambda x: (x, 1)) \
                .reduceByKey(lambda a, b: a + b) \
                .count()
    print("counts", counts, "(%s)" % repr(counts))
    print(type(counts))
    print(dir(counts))
    # print(dir(rdd))
    sc.stop()

if __name__ == '__main__':
    main("../sprk/spark/data.txt" if len(sys.argv) != 2 else sys.argv[1])