Skip to content

Instantly share code, notes, and snippets.

@afonsoaugusto
Created June 25, 2019 21:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save afonsoaugusto/516e995c064b50fb920f946a68b1629f to your computer and use it in GitHub Desktop.
Save afonsoaugusto/516e995c064b50fb920f946a68b1629f to your computer and use it in GitHub Desktop.
from pyspark.sql import SparkSession
from operator import add
import re
# Matches every run of characters that are not ASCII letters.
_NON_LETTERS = re.compile(r"[^a-zA-Z]+")


def extract_words(line):
    """Return the countable words found in one line of text.

    The line is split on whitespace, every non-letter character is removed
    from each token, tokens left with fewer than two letters are dropped,
    and the surviving words are upper-cased.

    Args:
        line: A single line of text.

    Returns:
        A list of upper-case words, in the order they appear in ``line``.
    """
    # The cleaned token itself must be kept (not merely tested for
    # truthiness); otherwise "wood," and "wood" become different keys.
    cleaned = (_NON_LETTERS.sub("", token) for token in line.split())
    return [word.upper() for word in cleaned if len(word) > 1]


def main():
    """Count word occurrences in /sampledata/road-not-taken.txt and print them.

    Each distinct word is printed as ``WORD = count``, sorted by word.
    """
    print("Okay Google.")
    spark = SparkSession \
        .builder \
        .appName("CountUniqueWords") \
        .getOrCreate()
    try:
        # spark.read.text yields one single-column Row per line; take the text.
        lines = spark.read.text("/sampledata/road-not-taken.txt") \
            .rdd \
            .map(lambda row: row[0])
        counts = lines.flatMap(extract_words) \
            .map(lambda word: (word, 1)) \
            .reduceByKey(add) \
            .sortByKey()
        for word, count in counts.collect():
            print("%s = %i" % (word, count))
    finally:
        # Stop the session even when a Spark action above raises.
        spark.stop()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment