# dgadiraju/pyspark-01-rdd-wordcount.py

## pyspark-01-rdd-wordcount.py
# RDD word count: read the input file, count every space-separated token,
# and save the totals as "word,count" text lines.
# NOTE(review): `sc` is not defined in this file -- presumably the
# SparkContext that the pyspark shell provides; confirm before running
# this as a standalone script.
data = sc.textFile('/public/randomtextwriter/part-m-00000')

# Split each line on single spaces, pair every token with 1, then sum the
# ones per token.
wc = (
    data
    .flatMap(lambda text: text.split(' '))
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)

# Render each (token, total) pair as one comma-separated line and write it.
(
    wc
    .map(lambda pair: pair[0] + ',' + str(pair[1]))
    .saveAsTextFile('/user/training/core/wordcount')
)

## pyspark-02-df-wordcount.py
# DataFrame word count: read the input as a one-column DataFrame, explode
# each line into one row per space-separated token, count rows per token,
# and write the result as CSV.
# `count` must be imported alongside `split` and `explode`; without it the
# aggregation below raises NameError.
from pyspark.sql.functions import count, explode, split

# NOTE(review): `spark` is not defined in this file -- presumably the
# SparkSession that the pyspark shell provides; confirm before running
# this as a standalone script.
data = spark.read.text('/public/randomtextwriter/part-m-00000')

# The text reader exposes each line in the `value` column; split it on
# single spaces and emit one row per token under the name `words`.
wc = (
    data
    .select(explode(split(data.value, ' ')).alias('words'))
    .groupBy('words')
    .agg(count('words').alias('wc'))
)

wc.write.csv('/user/training/df/wordcount')
	# NOTE(review): this tab-indented section repeats the RDD word count from
	# the top of the file. The leading tab has no enclosing block, so it is an
	# indentation error in a plain script -- confirm whether it is a paste
	# artifact.
	# `sc` is not defined in this file -- presumably the pyspark shell's
	# SparkContext.
	data = sc.textFile('/public/randomtextwriter/part-m-00000')

	# Split each line on single spaces, pair every token with 1, then sum the
	# ones per token.
	wc = (
		data
		.flatMap(lambda text: text.split(' '))
		.map(lambda token: (token, 1))
		.reduceByKey(lambda left, right: left + right)
	)

	# Render each (token, total) pair as one comma-separated line and write it.
	(
		wc
		.map(lambda pair: pair[0] + ',' + str(pair[1]))
		.saveAsTextFile('/user/training/core/wordcount')
	)
	# NOTE(review): this tab-indented section repeats the DataFrame word count
	# that appears earlier in the file. The leading tab has no enclosing block
	# -- confirm whether it is a paste artifact.
	# `count` must be imported alongside `split` and `explode`; without it the
	# aggregation below raises NameError.
	from pyspark.sql.functions import count, explode, split

	# `spark` is not defined in this file -- presumably the pyspark shell's
	# SparkSession.
	data = spark.read.text('/public/randomtextwriter/part-m-00000')

	# The text reader exposes each line in the `value` column; split it on
	# single spaces and emit one row per token under the name `words`.
	wc = (
		data
		.select(explode(split(data.value, ' ')).alias('words'))
		.groupBy('words')
		.agg(count('words').alias('wc'))
	)

	wc.write.csv('/user/training/df/wordcount')