Use PySpark to read a text file and identify a term
from pyspark import SparkConf
from pyspark.sql import SparkSession, functions as F
conf = SparkConf()
# optional, but it's good to set the amount of RAM the driver can use to a
# reasonable amount (relative to the size of the file we want to read), so that we don't hit an OOM exception
conf.set('spark.driver.memory', '6G')
# create a spark session - nothing can be done without this:
spark = SparkSession.builder \
    .config(conf=conf) \
    .appName('Homework-App') \
    .getOrCreate()
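# optional sanity check (a sketch, not part of the homework): confirm the
# driver-memory setting above actually took effect - this should print '6G'
print(spark.sparkContext.getConf().get('spark.driver.memory'))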
# spark.read.text returns a DataFrame, which is easier to manipulate and also more efficient
# you could also use spark.sparkContext.textFile(...), but that returns an RDD[String]
df = spark.read.text('full/path/to/file.txt')
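# for comparison, the RDD route hinted at above would look roughly like this
# (just a sketch; the DataFrame approach is what we actually use here):
# rdd = spark.sparkContext.textFile('full/path/to/file.txt')
# rdd.filter(lambda line: 'big data' in line).count()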
# spark is "lazy", so nothing has happened so far besides the initialization of the session
# let's call .show() to see that we've actually read the file and what it looks like
df.show()
+--------------------+
| value|
+--------------------+
| 1|
| |
| Spark |
|The Definitive Guide|
|Excerpts from the...|
|big data simple w...|
| |
|By Bill Chambers ...|
| |
|http://databricks...|
|http://databricks...|
| |
| |
|Apache Spark has ...|
|several years. Th...|
|a true reflection...|
|made itself into ...|
|is proud to share...|
|Spark: The Defini...|
|courtesy of Datab...|
+--------------------+
only showing top 20 rows
# Now let's get back to the task: identify where `big data` is used.
# For this task we add a column to the dataframe and populate it with True whenever `big data` appears in a row, and False otherwise
df = df.withColumn('has_big_data', F.when(F.col('value').contains('big data'), True).otherwise(False))
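# note: `contains` already yields a boolean column, so the when/otherwise
# wrapper above could be dropped; an equivalent one-liner would be:
# df = df.withColumn('has_big_data', F.col('value').contains('big data'))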
df.show()
+--------------------+------------+
| value|has_big_data|
+--------------------+------------+
| 1| false|
| | false|
| Spark | false|
|The Definitive Guide| false|
|Excerpts from the...| false|
|big data simple w...| true|
| | false|
|By Bill Chambers ...| false|
| | false|
|http://databricks...| false|
|http://databricks...| false|
| | false|
| | false|
|Apache Spark has ...| false|
|several years. Th...| false|
|a true reflection...| false|
|made itself into ...| false|
|is proud to share...| false|
|Spark: The Defini...| false|
|courtesy of Datab...| false|
+--------------------+------------+
only showing top 20 rows
# and the answer to the homework is to select the rows where the 'has_big_data' column is True
df.select('value').where(F.col('has_big_data')).show()
+--------------------+
| value|
+--------------------+
|big data simple w...|
|In the previous e...|
|of functions that...|
|and genomics have...|
|This part of the ...|
|When working with...|
+--------------------+
df.select('value').where(F.col('has_big_data')).count()
# 6 - the number of rows that contain this term
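# if we also wanted to catch different casings like 'Big Data' (not required
# by the homework, just a sketch), we could lower-case the text before matching:
# df.select('value').where(F.lower(F.col('value')).contains('big data')).count()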
# just to see that `big data` is indeed included in these few rows :)
df.select('value').where(F.col('has_big_data')).show(100, truncate=False)
+----------------------------------------------------------------------------------------------------------------------+
|value |
+----------------------------------------------------------------------------------------------------------------------+
|big data simple with Apache Spark. |
|In the previous example, we created a DataFrame of a range of numbers. Not exactly groundbreaking big data. In |
|of functions that you can leverage and import to help you resolve your big data problems faster. We will use the max |
|and genomics have seen a particularly large surge in opportunity for big data applications. For example, the ADAM |
|This part of the book will cover all the core ideas behind the Structured APIs and how to apply them to your big data |
|When working with big data, the second most common task you will do after filtering things is counting things. For |
+----------------------------------------------------------------------------------------------------------------------+
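# when we're done, it's good practice to release the session's resources
spark.stop()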