This repo demonstrates moving from pandas to Spark for big-data analysis.
Python - v3.5.2
Pandas - v0.19.2
Spark - v2.1.0
pip3 install pandas
wget https://archive.apache.org/dist/spark/spark-2.1.0/spark-2.1.0-bin-hadoop2.7.tgz
tar -zxvf spark-2.1.0-bin-hadoop2.7.tgz
sudo mkdir -p /opt
sudo mv spark-2.1.0-bin-hadoop2.7 /opt/
rm -rf spark-2.1.0-bin-hadoop2.7.tgz
# Modify IP and Memory
cd /opt/spark-2.1.0-bin-hadoop2.7/conf/
cp spark-env.sh.template spark-env.sh
cp spark-defaults.conf.template spark-defaults.conf
printf '%s\n' "SPARK_LOCAL_IP=127.0.0.1" "SPARK_MASTER_HOST=127.0.0.1" "SPARK_EXECUTOR_MEMORY=4G" "SPARK_DRIVER_MEMORY=4G" | sudo tee -a spark-env.sh
cd ../sbin
sudo ./start-master.sh
sudo ./start-slave.sh spark://127.0.0.1:7077
# Pandas
import pandas as pd
# PySpark
from pyspark.sql import SparkSession
sc = SparkSession.builder.master('spark://127.0.0.1:7077').appName("APP_NAME").getOrCreate()
# Pandas
data = pd.read_csv('path_to_file', header=0, index_col=0, delimiter=',')
# PySpark
# Index Col not present in Spark
data = sc.read.options(header=True, inferSchema=True, delimiter=',').csv("src/main/resources/sales.csv")
# Pandas
data.describe()
# PySpark
data.describe().show()
# Pandas
# First 5 rows
data.head(5)
# Last 5 rows
data.tail(5)
# PySpark - Supports first n rows
data.show(5)
#PySpark - Create a subset
sample = data.limit(1000)
# Pandas
data.to_csv('path_name', index=False, header=True)
# PySpark
# Multiple CSV Files
data.write.options(header=True, index=False).csv('path_name')
# Single CSV File
data.coalesce(1).write.options(header=True).csv("path_name")
# Spark DF to Pandas DF to CSV
data.toPandas().to_csv('path_name')