Skip to content

Instantly share code, notes, and snippets.

@zeroflag
Last active November 2, 2021 12:35
Show Gist options
  • Save zeroflag/a80b59b486a3f36049f272dc91d2b718 to your computer and use it in GitHub Desktop.
Save zeroflag/a80b59b486a3f36049f272dc91d2b718 to your computer and use it in GitHub Desktop.
import sys
from operator import add
from pyspark.sql import SparkSession
import time
if __name__ == "__main__":
    # Repeated word-count job: each iteration reads one of six text objects
    # from S3, counts occurrences of every space-separated token, and prints
    # the totals. The script takes no command-line arguments; the input
    # locations are hard-coded below.
    print("Creating session")
    spark = (
        SparkSession.builder
        .appName("PythonWordCount")
        .getOrCreate()
    )
    try:
        # Author's timing note for the iteration count: 1500 = 0.5h, 3500 = 1 hour.
        for i in range(3700):
            print("Reading lines")
            # `i % 6` cycles through the inputs 1m0.txt .. 1m5.txt.
            path = 's3a://amagyar-bucket/1m' + str(i % 6) + '.txt'
            # Keep only the first field of each row, i.e. the line text.
            lines = spark.read.text(path).rdd.map(lambda r: r[0])

            print("Counting")
            counts = (
                lines.flatMap(lambda x: x.split(' '))
                .map(lambda x: (x, 1))
                .reduceByKey(add)
            )
            # collect() is the action that actually runs the job and brings
            # the (word, count) pairs back to the driver.
            output = counts.collect()

            print("Results")
            for (word, count) in output:
                print("%s: %i" % (word, count))
    finally:
        # Always release the session, even when an iteration fails part-way
        # through the run; previously a failure skipped spark.stop().
        spark.stop()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment