mkaranasou/pyspark_simple_file_read_short.py

## pyspark_simple_file_read_short.py
from pyspark import SparkConf
from pyspark.sql import SparkSession, functions as F

conf = SparkConf()
# Optional: give the driver enough memory for the size of the file being read,
# so that the job does not fail with an out-of-memory error.
# NOTE: per the Spark configuration docs, in client mode this setting only
# takes effect if it is applied before the driver JVM starts; when launching
# through spark-submit, pass --driver-memory on the command line instead.
conf.set('spark.driver.memory', '6G')

spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName('Homework-App')
    .getOrCreate()
)

# Read the file as text: the lines end up in the string column 'value'.
# (The closing quote of the path literal was missing, which made the whole
# file a syntax error.)
df = spark.read.text('full/path/to/file.txt')

# Flag every row whose text contains the phrase 'big data'; rows that do not
# match get an explicit False.
df = df.withColumn(
    'has_big_data',
    F.when(F.col('value').contains('big data'), True).otherwise(False),
)

# Count the flagged rows. The flag column is already boolean, so it is used as
# the filter condition directly, and the filter runs before the projection so
# it does not reference a column that was just dropped by select().
result = df.where(F.col('has_big_data')).select('value').count()
	from pyspark import SparkConf
	from pyspark.sql import SparkSession, functions as F

	conf = SparkConf()
	# Optional: give the driver enough memory for the size of the file being
	# read, so that the job does not fail with an out-of-memory error.
	# NOTE: per the Spark configuration docs, in client mode this setting only
	# takes effect if it is applied before the driver JVM starts; when launching
	# through spark-submit, pass --driver-memory on the command line instead.
	conf.set('spark.driver.memory', '6G')

	spark = (
		SparkSession.builder
		.config(conf=conf)
		.appName('Homework-App')
		.getOrCreate()
	)

	# Read the file as text: the lines end up in the string column 'value'.
	# (The closing quote of the path literal was missing, which was a syntax
	# error.)
	df = spark.read.text('full/path/to/file.txt')

	# Flag every row whose text contains the phrase 'big data'; rows that do
	# not match get an explicit False.
	df = df.withColumn(
		'has_big_data',
		F.when(F.col('value').contains('big data'), True).otherwise(False),
	)

	# Count the flagged rows. The flag column is already boolean, so it is used
	# as the filter condition directly, and the filter runs before the
	# projection so it does not reference a column dropped by select().
	result = df.where(F.col('has_big_data')).select('value').count()