Skip to content

Instantly share code, notes, and snippets.

@drorata
Created August 18, 2015 10:31
Show Gist options
  • Save drorata/ce48a28d7ddc1a16a3b4 to your computer and use it in GitHub Desktop.
Save drorata/ce48a28d7ddc1a16a3b4 to your computer and use it in GitHub Desktop.
Sample word count over a local file using Spark, with the results ordered by descending frequency.
"""Word count of a local text file with Spark, sorted by descending count.

Fixes versus the original gist:
  * ``re.split(r'\W*', ...)`` -> ``r'\W+'``: the ``*`` pattern can match the
    empty string, and on Python 3.7+ ``re.split`` honors zero-width matches,
    splitting between every character and destroying the word count.
  * Python 2 ``print`` statements and the tuple-unpacking lambda
    ``lambda (a, b): ...`` (removed in Python 3, PEP 3113) replaced with
    Python 3 compatible equivalents.
  * Spark driver wrapped in ``main()`` behind a ``__main__`` guard so
    importing this module no longer starts a SparkContext.
"""
import re

# One or more non-word characters; compiled once, used for every line.
_WORD_RE = re.compile(r'\W+')


def split_words(line):
    """Return the non-empty words of *line*, split on runs of non-word chars.

    Trailing whitespace is stripped first (mirrors the original's
    ``line.rstrip()``); empty fragments produced by leading/trailing
    separators are filtered out (mirrors the original's ``!= ''`` filter).
    """
    return [word for word in _WORD_RE.split(line.rstrip()) if word]


def main():
    """Run the word count over ``lorem.txt`` and save/print the results."""
    # Imported here (not at module top) so split_words stays usable/testable
    # on machines without a Spark installation.
    from pyspark import SparkContext

    print("-----------------===========================-----------------")
    print("-----------------==========Starting=========-----------------")
    print("-----------------===========================-----------------")
    sc = SparkContext(appName="simple app")
    print("-----------------===========================-----------------")
    print("-----------------==========Loaded file======-----------------")
    print("-----------------===========================-----------------")
    text_file = sc.textFile("lorem.txt")
    print("-----------------===========================-----------------")
    print("-----------------========== Process ======-----------------")
    print("-----------------===========================-----------------")
    counts = (
        text_file.flatMap(split_words)
        .map(lambda word: (word, 1))
        .reduceByKey(lambda a, b: a + b)
        # Swap to (count, word) so sortByKey orders by frequency, descending.
        .map(lambda pair: (pair[1], pair[0]))
        .sortByKey(False)
    )
    output = counts.collect()
    counts.saveAsTextFile("test.txt")
    sc.stop()
    for count, word in output:
        print(count, word)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment