Created
April 29, 2017 07:22
-
-
Save bobquest33/a55450baef5a938b1d1777ac97578131 to your computer and use it in GitHub Desktop.
100 Scripts in 30 Days challenge: Script 26,27,28 Learning PySpark - Script 27, Find Count of words in a file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Find count of words in the text file.
#
# NOTE(review): `sc` is not defined in this script; it is assumed to be an
# existing SparkContext (e.g. the one the pyspark shell provides) -- confirm
# before running this as a standalone program.
#
# Every print below passes ONE pre-formatted string inside parentheses, so
# the output is identical under Python 2's print statement and Python 3's
# print function without needing a __future__ import.

# import regex module
import re
# import add (reduce function) and itemgetter (sort key) from operator module
from operator import add, itemgetter

# Read a text file and create RDD lines
lines = sc.textFile("wordtxt.txt")

# count total no of lines
print('number of lines in file: %s' % lines.count())

# add up lengths of each line; sum() yields 0 for an empty file, whereas
# reduce(add) raises on an empty RDD
chars = lines.map(len).sum()
print('number of characters in file: %s' % chars)

# Get words from input file: lower-case each line, trim surrounding
# whitespace and split on runs of non-word characters (raw string so the
# backslash reaches the regex engine untouched)
words = lines.flatMap(lambda line: re.split(r"\W+", line.lower().strip()))

# filter out words with minimum 1 characters; re.split produces empty strings
# for empty lines and for lines that start or end with a non-word character
wordsfil = words.filter(lambda x: len(x) > 0)

# map phase - set count 1 per word
wordmap = wordsfil.map(lambda w: (w, 1))

# reduce phase - sum count of all the words
reducedwords = wordmap.reduceByKey(add)

# Bring the (word, count) pairs to the driver once and reuse the list for
# both the row count and the sorted summary, instead of running count()
# twice plus take(count()).
word_counts = reducedwords.collect()
print('word count summary list no of rows %s' % len(word_counts))
print('word count summary list:')
# most frequent words first
print(sorted(word_counts, key=itemgetter(1), reverse=True))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment