# Load Data
writeLines("=========================== Create Spark Data frame ======================")
library(nycflights13)
library(Lahman)
# copy_to() copies a local R data frame into Spark and returns a reference to the resulting Spark DataFrame
flights_tbl_spark <- copy_to(sc, nycflights13::flights, "flights", overwrite = TRUE)
iris_tbl_spark <- copy_to(sc, iris, "iris", overwrite = TRUE)
batting_tbl_spark <- copy_to(sc, Lahman::Batting, "batting", overwrite = TRUE)
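These copy_to() calls assume an open sparklyr connection named sc and that sparklyr and dplyr are attached, as in the Create Spark Session gist below. As a minimal sketch (not part of the original gist), the resulting Spark tables can then be queried with ordinary dplyr verbs:
# Sketch (assumption): aggregate departure delays per carrier in Spark,
# then collect() the small result back into an R tibble.
delay_by_carrier <- flights_tbl_spark %>%
  group_by(carrier) %>%
  summarise(mean_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(desc(mean_dep_delay)) %>%
  collect()
head(delay_by_carrier)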
@jystephan
jystephan / Create Spark Session.R
Created January 25, 2022 10:55
Create Spark Session (SparklyR)
library(stringr)
library(arrow)
library(sparklyr)
library(dplyr)
################################
# Create the Spark Session
################################
Sys.setenv(SPARK_HOME = "/opt/spark")
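The preview above ends before the connection itself is opened. A minimal sketch of that step, assuming a local master (the master URL is a placeholder, not from the original gist):
# Sketch (assumption): open the connection that the Load Data gist uses as `sc`.
config <- spark_config()
sc <- spark_connect(master = "local", spark_home = Sys.getenv("SPARK_HOME"), config = config)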
@jystephan
jystephan / SparklyR Dockerfile
Created January 25, 2022 10:53
SparklyR Dockerfile
FROM gcr.io/datamechanics/spark:3.0.2-hadoop-3.2.0-java-8-scala-2.12-python-3.8-dm13
ARG R_VERSION="4.0.4"
ARG LIBARROW_BINARY="true"
ARG RSPM_CHECKPOINT=2696074
ARG CRAN="https://packagemanager.rstudio.com/all/__linux__/focal/${RSPM_CHECKPOINT}"
USER root
RUN echo "deb http://cloud.r-project.org/bin/linux/debian buster-cran40/" >> /etc/apt/sources.list \
{
"spark.kubernetes.driver.reusePersistentVolumeClaim": "true",
"spark.kubernetes.driver.ownPersistentVolumeClaim": "true",
"spark.kubernetes.executor.volumes.persistentVolumeClaim.data.options.claimName": "OnDemand",
"spark.kubernetes.executor.volumes.persistentVolumeClaim.data.options.storageClass": "standard",
"spark.kubernetes.executor.volumes.persistentVolumeClaim.data.options.sizeLimit": "500Gi",
"spark.kubernetes.executor.volumes.persistentVolumeClaim.data.mount.path": "/var/data",
"spark.kubernetes.executor.volumes.persistentVolumeClaim.data.mount.readOnly": "false"
}
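If these persistent-volume settings are supplied from sparklyr rather than through a spark-submit config file, they could be attached to the connection config; a sketch under that assumption, using the key names from the JSON above:
# Sketch (assumption): the same Kubernetes PVC options set on a sparklyr config list.
conf <- spark_config()
conf[["spark.kubernetes.driver.reusePersistentVolumeClaim"]] <- "true"
conf[["spark.kubernetes.executor.volumes.persistentVolumeClaim.data.options.claimName"]] <- "OnDemand"
conf[["spark.kubernetes.executor.volumes.persistentVolumeClaim.data.mount.path"]] <- "/var/data"
# ...remaining keys from the JSON block, then pass `config = conf` to spark_connect().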
@jystephan
jystephan / justfile
Created October 28, 2021 16:14
PySpark Docker Tutorial - justfile
image_name := 'pyspark-example:dev'
aws_access_key_id := ''
aws_secret_access_key := ''
postgres_user := ''
postgres_password := ''
postgres_host := ''
postgres_port := ''
postgres_db_name := ''
table_name := ''
@jystephan
jystephan / main.py
Created October 28, 2021 16:12
PySpark Docker Tutorial - Main.py
import os, datetime
from databricks import koalas as ks
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.types import *
from pyspark.sql.functions import explode, split, col, sum, lit
from pyspark.sql import SparkSession
def apply_transforms(df: DataFrame) -> DataFrame:
# _c0 is a single tab-delimited string column; split it so the population value can be extracted
split_col = split(df['_c0'], '\t')
FROM gcr.io/datamechanics/spark:platform-3.1-dm14
ENV PYSPARK_MAJOR_PYTHON_VERSION=3
WORKDIR /opt/application/
RUN wget https://jdbc.postgresql.org/download/postgresql-42.2.5.jar
RUN mv postgresql-42.2.5.jar /opt/spark/jars
COPY requirements.txt .
RUN pip3 install -r requirements.txt