Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@wagnerjgoncalves
Last active February 8, 2024 11:49
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save wagnerjgoncalves/7cc46ea56be1de73ce0dfff5e23770c6 to your computer and use it in GitHub Desktop.
Save wagnerjgoncalves/7cc46ea56be1de73ce0dfff5e23770c6 to your computer and use it in GitHub Desktop.
PySpark example using SparkSession
# -*- coding: utf-8 -*-
"""
Example of using the PySpark DataFrame API through a SparkSession.
"""
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Script-wide settings: the Spark application name and the input csv path
app = 'exampleDataFrameApi'
file = 'renda.csv'
# Spark configuration, built one setting at a time: local master with 8
# worker threads, an 8g cap on results returned to the driver, and the
# effective configuration logged when the context starts
conf = SparkConf()
conf.setAppName(app)
conf.setMaster("local[8]")
conf.set('spark.driver.maxResultSize', '8g')
conf.set('spark.logConf', 'true')
# Get (or create) the Spark Session used for all Data Frame work
sparkSession = (
    SparkSession.builder
    .appName(app)
    .config(conf=conf)
    .getOrCreate()
)
# Column layout of the csv as (column name, Spark type) pairs, in schema order
column_specs = [
    ('Nome completo', StringType),
    ('Cargo Base', StringType),
    ('Cargo em Comissao', StringType),
    ('Remuneracao do Mes', FloatType),
    ('Demais Elementos da Remuneracao', FloatType),
    ('Remuneracao Bruta', FloatType),
    ('Unidade', StringType),
    ('Tp. Log', StringType),
    # NOTE(review): 'Logadrouro' looks like a typo of 'Logradouro' - left
    # untouched because it is the column name used at runtime; confirm
    ('Logadrouro', StringType),
    ('Numero', StringType),
    ('Complemento', StringType),
    ('Jornada', StringType),
]
# Every column is declared nullable (third StructField argument is True)
schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in column_specs
])
# Read the ';'-separated, utf-8 encoded csv (first line is a header) into a
# Data Frame using the explicit schema above
df = sparkSession.read.csv(
    file,
    schema=schema,
    sep=';',
    encoding='utf-8',
    header=True,
)
# Show the top 20 rows of df (the default for show()), then the top 100
df.show()
df.show(100)
# Count all records of df. count() returns a number rather than printing it,
# so the result is printed explicitly to be visible when run as a script.
print(df.count())
# Print the schema of df
df.printSchema()
# Print top 100 names
df.select('Nome completo').show(100)
# Summary statistics (count, mean, stddev, min, max) for each numeric column
df.describe(['Remuneracao do Mes']).show()
df.describe(['Demais Elementos da Remuneracao']).show()
df.describe(['Remuneracao Bruta']).show()
# Filter: rows whose monthly pay equals 2495, then how many exceed 20000
df.filter(df['Remuneracao do Mes'] == 2495).show()
# As above, count() only returns the value, so print it
print(df.filter(df['Remuneracao do Mes'] > 20000).count())
# Group: number of rows per distinct pay value
df.groupBy('Remuneracao do Mes').count().show()
df.groupBy('Remuneracao Bruta').count().show()
# Sort: highest monthly pay first
df.sort(desc('Remuneracao do Mes')).show()
# Sort + Group: rows per distinct pay value, highest pay value first
df.groupBy('Remuneracao do Mes').count().sort(desc('Remuneracao do Mes')).show()
df.groupBy('Remuneracao Bruta').count().sort(desc('Remuneracao Bruta')).show()
# Drop columns: drop() returns a new Data Frame, df itself is unchanged
df2 = df.drop('Jornada')
df2.printSchema()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment