# PySpark example using SparkSession
# -*- coding: utf-8 -*-
"""Example of using the PySpark DataFrame API with a SparkSession."""
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Job parameters: the Spark application name and the CSV file to load.
app = 'exampleDataFrameApi'
file = 'renda.csv'

# Spark configuration, applied one setting at a time (each setter mutates
# and returns the same SparkConf, so this equals a single chained call):
#   - spark.driver.maxResultSize: cap on serialized results sent to the driver
#   - spark.logConf: log the effective configuration at start-up
conf = SparkConf()
conf.setAppName(app)
conf.set('spark.driver.maxResultSize', '8g')
conf.set('spark.logConf', 'true')

# Obtain the SparkSession used for all Data Frame work below; getOrCreate()
# reuses an already-running session if there is one.
sparkSession = (
    SparkSession.builder
    .appName(app)
    .config(conf=conf)
    .getOrCreate()
)
# Load a csv into a Data Frame class.
#
# Explicit schema for the file: every column is nullable; the three
# remuneration columns are floats and everything else is read as a string.
# NOTE(review): 'Logadrouro' looks like a typo for 'Logradouro' -- it is kept
# as-is because the name is part of the Data Frame's runtime schema; confirm
# against the CSV header before renaming.
schema = StructType([
    StructField('Nome completo', StringType(), True),
    StructField('Cargo Base', StringType(), True),
    StructField('Cargo em Comissao', StringType(), True),
    StructField('Remuneracao do Mes', FloatType(), True),
    StructField('Demais Elementos da Remuneracao', FloatType(), True),
    StructField('Remuneracao Bruta', FloatType(), True),
    StructField('Unidade', StringType(), True),
    StructField('Tp. Log', StringType(), True),
    StructField('Logadrouro', StringType(), True),
    StructField('Numero', StringType(), True),
    StructField('Complemento', StringType(), True),
    StructField('Jornada', StringType(), True),
])

# Read the ';'-separated, UTF-8 encoded file; the first line is a header row.
df = sparkSession.read.csv(
    file,
    schema=schema,
    sep=';',
    encoding='utf-8',
    header=True,
)
# NOTE(review): the statements in this section were missing from the file and
# have been reconstructed from the comments that described them (and from the
# surviving "'Nome completo').show(100)" fragment) -- confirm against the
# original script.

# Show top 20 rows of df (20 is show()'s default row count)
df.show()

# Show top 100 rows of df
df.show(100)

# Count all records of df
# (the value is discarded when run as a script; wrap in print() to display it)
df.count()

# Print the schema of df
df.printSchema()

# Print top 100 names
df.select('Nome completo').show(100)
# Summary statistics (count, mean, stddev, min, max) for each pay column.
df.describe('Remuneracao do Mes').show()
df.describe('Demais Elementos da Remuneracao').show()
df.describe('Remuneracao Bruta').show()

# Row filtering: exact match on the monthly pay, then a count of the rows
# above 20000 (where() is an alias of filter()).
df.where(df['Remuneracao do Mes'] == 2495).show()
df.where(df['Remuneracao do Mes'] > 20000).count()

# Frequency of each distinct pay value.
df.groupBy('Remuneracao do Mes').count().show()
df.groupBy('Remuneracao Bruta').count().show()

# Rows ordered by monthly pay, highest first (orderBy() is an alias of sort()).
df.orderBy('Remuneracao do Mes', ascending=False).show()

# Frequency of each distinct pay value, ordered by the value, highest first.
(
    df.groupBy('Remuneracao do Mes')
    .count()
    .orderBy('Remuneracao do Mes', ascending=False)
    .show()
)
(
    df.groupBy('Remuneracao Bruta')
    .count()
    .orderBy('Remuneracao Bruta', ascending=False)
    .show()
)

# New Data Frame without the 'Jornada' column; df itself is left unchanged.
df2 = df.drop('Jornada')
