Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
from pyspark.sql import SparkSession
from operator import add
import re
# Anything that is not an ASCII letter is stripped from a token before counting.
NON_LETTERS = re.compile(r"[^a-zA-Z]+")


def extract_words(line):
    """Return the countable words found in one line of text.

    The line is split on whitespace; every token has its non-letter
    characters removed and is upper-cased, so that "wood," and "Wood"
    both become "WOOD". Tokens that end up with one letter or fewer
    (including tokens that were pure punctuation) are discarded.

    Args:
        line: A single line of input text.

    Returns:
        A list of cleaned, upper-cased words, in their original order.
    """
    cleaned = (NON_LETTERS.sub("", token).upper() for token in line.split())
    return [word for word in cleaned if len(word) > 1]


def main():
    """Count each distinct word in the sample poem and print the totals.

    Reads the text file with Spark, tokenises every line with
    ``extract_words``, sums the occurrences per word, and prints the
    ``WORD = count`` pairs sorted alphabetically by word.
    """
    print("Okay Google.")
    spark = SparkSession.builder.appName("CountUniqueWords").getOrCreate()
    try:
        # spark.read.text yields one Row per line; column 0 holds the text.
        lines = spark.read.text("/sampledata/road-not-taken.txt").rdd.map(
            lambda row: row[0]
        )
        counts = (
            lines.flatMap(extract_words)
            .map(lambda word: (word, 1))
            .reduceByKey(add)
            .sortByKey()
        )
        for word, count in counts.collect():
            print("%s = %i" % (word, count))
    finally:
        # Always release the Spark session, even if the job fails.
        spark.stop()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.