Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
PySpark DataFrame example using SparkSession.
# -*- coding: utf-8 -*-
"""
Example of Python Data Frame with SparkSession.
"""
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
# --- Configuration ------------------------------------------------------
# Application name and path of the input CSV file.
app = 'exampleDataFrameApi'
file = 'renda.csv'

# Spark settings: run locally on 8 cores, raise the driver result-size
# limit to 8 GB, and log the effective configuration at startup.
spark_settings = (
    SparkConf()
    .setAppName(app)
    .setMaster("local[8]")
    .set('spark.driver.maxResultSize', '8g')
    .set('spark.logConf', 'true')
)

# Entry point for the DataFrame API: reuses an existing session if one
# is already running, otherwise creates one with the settings above.
sparkSession = SparkSession.builder.appName(app).config(conf=spark_settings).getOrCreate()
# --- Load the CSV into a DataFrame ---------------------------------------
# Explicit schema as (column name, Spark type) pairs; all columns nullable.
_columns = [
    ('Nome completo', StringType()),
    ('Cargo Base', StringType()),
    ('Cargo em Comissao', StringType()),
    ('Remuneracao do Mes', FloatType()),
    ('Demais Elementos da Remuneracao', FloatType()),
    ('Remuneracao Bruta', FloatType()),
    ('Unidade', StringType()),
    ('Tp. Log', StringType()),
    ('Logadrouro', StringType()),
    ('Numero', StringType()),
    ('Complemento', StringType()),
    ('Jornada', StringType()),
]
schema = StructType([StructField(name, dtype, True) for name, dtype in _columns])

# Semicolon-delimited, UTF-8 encoded file with a header row.
df = sparkSession.read.csv(file, schema=schema, sep=';', encoding='utf-8', header=True)
# --- Basic DataFrame operations -------------------------------------------

# Display the first 20 rows (the default), then the first 100.
df.show()
df.show(100)

# Total number of records.
df.count()

# Print the declared schema.
df.printSchema()

# First 100 full names.
df.select('Nome completo').show(100)

# Summary statistics (count, mean, stddev, min, max) per salary column.
for salary_column in ('Remuneracao do Mes',
                      'Demais Elementos da Remuneracao',
                      'Remuneracao Bruta'):
    df.describe([salary_column]).show()

# Filtering: exact match, then a threshold count.
df.where(col('Remuneracao do Mes') == 2495).show()
df.where(col('Remuneracao do Mes') > 20000).count()

# Grouping: frequency of each distinct salary value.
df.groupBy('Remuneracao do Mes').count().show()
df.groupBy('Remuneracao Bruta').count().show()

# Sorting in descending order.
df.orderBy(desc('Remuneracao do Mes')).show()

# Grouping combined with a descending sort on the grouped column.
df.groupBy('Remuneracao do Mes').count().orderBy(desc('Remuneracao do Mes')).show()
df.groupBy('Remuneracao Bruta').count().orderBy(desc('Remuneracao Bruta')).show()

# Column removal: df2 is df without the 'Jornada' column.
df2 = df.drop('Jornada')
df2.printSchema()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.