# Creating the Spark configuration and Spark context
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("My Dataframe")
sc = SparkContext(conf = conf)
from pyspark.sql import SparkSession # the pyspark.sql module is needed to work with DataFrames
spark = SparkSession(sc) # wrap the existing SparkContext in a SparkSession
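# Note: an equivalent pattern (not used in this gist) is to build the session directly,
# which creates the underlying SparkContext for you:
# spark = SparkSession.builder.appName("My Dataframe").getOrCreate()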
myRange = spark.range(1000).toDF("number")
# myRange is a Spark DataFrame with one column containing 1,000 rows with values from 0 to 999.
# When run on a cluster, each part of this range of numbers exists on a different executor.
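# As a quick sketch (not part of the original snippet), you can check how many
# partitions Spark split the range into by inspecting the underlying RDD:
print(myRange.rdd.getNumPartitions())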
# Let's perform a transformation:
divisBy2 = myRange.where("number % 2 = 0") # where() is an alias for filter()
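# Transformations like where() are lazy: nothing is computed until an action runs.
# A minimal sketch of triggering the job with standard actions:
divisBy2.count()  # action; returns 500, the count of even numbers in 0..999
divisBy2.show(5)  # action; prints the first 5 rows of the filtered DataFrame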