Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
PySpark DataFrame example using SparkSession.
# -*- coding: utf-8 -*-
"""
Example of Python Data Frame with SparkSession.
"""
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
# --- Configuration ------------------------------------------------------
# Application name and path of the input CSV file.
app = 'exampleDataFrameApi'
file = 'renda.csv'

# Spark settings: run locally on 8 cores, raise the driver result-size
# limit to 8 GB, and log the effective configuration at startup.
spark_settings = (
    SparkConf()
    .setAppName(app)
    .setMaster("local[8]")
    .set('spark.driver.maxResultSize', '8g')
    .set('spark.logConf', 'true')
)

# Entry point for the DataFrame API: reuses an existing session if one
# is already running, otherwise creates one with the settings above.
sparkSession = SparkSession.builder.appName(app).config(conf=spark_settings).getOrCreate()
# --- Load the CSV into a DataFrame ---------------------------------------
# Explicit schema as (column name, Spark type) pairs; all columns nullable.
_columns = [
    ('Nome completo', StringType()),
    ('Cargo Base', StringType()),
    ('Cargo em Comissao', StringType()),
    ('Remuneracao do Mes', FloatType()),
    ('Demais Elementos da Remuneracao', FloatType()),
    ('Remuneracao Bruta', FloatType()),
    ('Unidade', StringType()),
    ('Tp. Log', StringType()),
    ('Logadrouro', StringType()),
    ('Numero', StringType()),
    ('Complemento', StringType()),
    ('Jornada', StringType()),
]
schema = StructType([StructField(name, dtype, True) for name, dtype in _columns])

# Semicolon-delimited, UTF-8 encoded file with a header row.
df = sparkSession.read.csv(file, schema=schema, sep=';', encoding='utf-8', header=True)
# --- Basic DataFrame operations -------------------------------------------

# Display the first 20 rows (the default), then the first 100.
df.show()
df.show(100)

# Total number of records.
df.count()

# Print the declared schema.
df.printSchema()

# First 100 full names.
df.select('Nome completo').show(100)

# Summary statistics (count, mean, stddev, min, max) per salary column.
for salary_column in ('Remuneracao do Mes',
                      'Demais Elementos da Remuneracao',
                      'Remuneracao Bruta'):
    df.describe([salary_column]).show()

# Filtering: exact match, then a threshold count.
df.where(col('Remuneracao do Mes') == 2495).show()
df.where(col('Remuneracao do Mes') > 20000).count()

# Grouping: frequency of each distinct salary value.
df.groupBy('Remuneracao do Mes').count().show()
df.groupBy('Remuneracao Bruta').count().show()

# Sorting in descending order.
df.orderBy(desc('Remuneracao do Mes')).show()

# Grouping combined with a descending sort on the grouped column.
df.groupBy('Remuneracao do Mes').count().orderBy(desc('Remuneracao do Mes')).show()
df.groupBy('Remuneracao Bruta').count().orderBy(desc('Remuneracao Bruta')).show()

# Column removal: df2 is df without the 'Jornada' column.
df2 = df.drop('Jornada')
df2.printSchema()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.