# bobquest33/script_27_pyspark_wordcount_example.py

## script_27_pyspark_wordcount_example.py
# Find the count of words in a text file: report the number of lines, the
# number of characters, and a per-word frequency list sorted by count.
#
# NOTE(review): `sc` is not defined in this file; presumably it is the
# SparkContext supplied by the PySpark shell -- confirm before running this
# script any other way.
#
# Every print below passes a single pre-formatted value inside parentheses,
# so the output text is the same under the Python 2 print statement and the
# Python 3 print function.

# import regex module (used to split lines into words)
import re
# import add from operator module (used as the reducer in reduceByKey)
from operator import add

# Read a text file and create RDD lines. The RDD is consumed by several
# actions below, so mark it cached instead of re-reading the file each time.
lines = sc.textFile("wordtxt.txt").cache()

# count total no of lines
print('number of lines in file: %s' % lines.count())

# add up lengths of each line; sum() yields 0 for an empty file, whereas
# reduce() raises ValueError on an empty RDD
chars = lines.map(len).sum()
print('number of characters in file: %s' % chars)

# Get words from input file: lower-case and trim each line, then split on
# runs of non-word characters (raw string so "\W" reaches the regex engine
# untouched)
words = lines.flatMap(lambda line: re.split(r"\W+", line.lower().strip()))

# keep only non-empty words; re.split yields '' for blank lines and for
# lines that start or end with a separator
wordsfil = words.filter(lambda word: len(word) > 0)

# map phase - set count 1 per word
wordmap = wordsfil.map(lambda word: (word, 1))

# reduce phase - sum count all the words
reducedwords = wordmap.reduceByKey(add)

# Fetch the (word, count) pairs to the driver once and reuse them for both
# the row count and the listing, instead of running count() twice followed
# by take(count()).
wordcounts = reducedwords.collect()
print('word count summary list no of rows %s' % len(wordcounts))
print('word count summary list:')
# most frequent words first
print(sorted(wordcounts, key=lambda pair: pair[1], reverse=True))